Coverage Report

Created: 2026-04-14 13:42

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
be/src/format/table/iceberg/schema_parser.cpp
Line
Count
Source
1
// Licensed to the Apache Software Foundation (ASF) under one
2
// or more contributor license agreements.  See the NOTICE file
3
// distributed with this work for additional information
4
// regarding copyright ownership.  The ASF licenses this file
5
// to you under the Apache License, Version 2.0 (the
6
// "License"); you may not use this file except in compliance
7
// with the License.  You may obtain a copy of the License at
8
//
9
//   http://www.apache.org/licenses/LICENSE-2.0
10
//
11
// Unless required by applicable law or agreed to in writing,
12
// software distributed under the License is distributed on an
13
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14
// KIND, either express or implied.  See the License for the
15
// specific language governing permissions and limitations
16
// under the License.
17
18
#include "format/table/iceberg/schema_parser.h"
19
20
#include <rapidjson/document.h>
21
#include <rapidjson/error/en.h>
22
23
#include <cstdint>
24
#include <optional>
25
#include <unordered_set>
26
27
#include "format/table/iceberg/schema.h"
28
#include "format/table/iceberg/types.h"
29
30
namespace doris::iceberg {
31
32
const char* SchemaParser::SCHEMA_ID = "schema-id";
33
const char* SchemaParser::IDENTIFIER_FIELD_IDS = "identifier-field-ids";
34
const char* SchemaParser::TYPE = "type";
35
const char* SchemaParser::STRUCT = "struct";
36
const char* SchemaParser::LIST = "list";
37
const char* SchemaParser::MAP = "map";
38
const char* SchemaParser::FIELDS = "fields";
39
const char* SchemaParser::ELEMENT = "element";
40
const char* SchemaParser::KEY = "key";
41
const char* SchemaParser::VALUE = "value";
42
const char* SchemaParser::DOC = "doc";
43
const char* SchemaParser::NAME = "name";
44
const char* SchemaParser::ID = "id";
45
const char* SchemaParser::ELEMENT_ID = "element-id";
46
const char* SchemaParser::KEY_ID = "key-id";
47
const char* SchemaParser::VALUE_ID = "value-id";
48
const char* SchemaParser::REQUIRED = "required";
49
const char* SchemaParser::ELEMENT_REQUIRED = "element-required";
50
const char* SchemaParser::VALUE_REQUIRED = "value-required";
51
52
58
std::unique_ptr<Type> SchemaParser::_type_from_json(const rapidjson::Value& value) {
53
58
    if (value.IsString()) {
54
31
        return Types::from_primitive_string(value.GetString());
55
31
    } else if (value.IsObject()) {
56
27
        if (value.HasMember(TYPE)) {
57
27
            std::string type = value[TYPE].GetString();
58
27
            if (type == STRUCT) {
59
10
                return _struct_from_json(value);
60
17
            } else if (type == LIST) {
61
8
                return _list_from_json(value);
62
9
            } else if (type == MAP) {
63
9
                return _map_from_json(value);
64
9
            }
65
27
        }
66
27
    }
67
0
    throw doris::Exception(doris::ErrorCode::INTERNAL_ERROR, "Cannot parse type from json.");
68
58
}
69
70
10
std::unique_ptr<StructType> SchemaParser::_struct_from_json(const rapidjson::Value& value) {
71
10
    if (!value.HasMember("fields") || !value["fields"].IsArray()) {
72
0
        throw doris::Exception(doris::ErrorCode::INTERNAL_ERROR,
73
0
                               "Cannot parse struct fields from non-array.");
74
0
    }
75
76
10
    const rapidjson::Value& field_array = value["fields"];
77
10
    std::vector<NestedField> fields;
78
10
    fields.reserve(field_array.Size());
79
80
26
    for (uint32_t i = 0; i < field_array.Size(); ++i) {
81
16
        const rapidjson::Value& field = field_array[i];
82
16
        if (!field.IsObject()) {
83
0
            throw doris::Exception(doris::ErrorCode::INTERNAL_ERROR,
84
0
                                   "Cannot parse struct field from non-object.");
85
0
        }
86
87
16
        int id = field[ID].GetInt();
88
16
        std::string name = field[NAME].GetString();
89
16
        std::unique_ptr<Type> type = _type_from_json(field[TYPE]);
90
91
16
        std::optional<std::string> doc = std::nullopt;
92
16
        if (field.HasMember(DOC)) {
93
1
            doc = field[DOC].GetString();
94
1
        }
95
96
16
        bool is_required = field[REQUIRED].GetBool();
97
98
16
        fields.emplace_back(!is_required, id, name, std::move(type), doc);
99
16
    }
100
101
10
    return std::make_unique<StructType>(std::move(fields));
102
10
}
103
104
8
std::unique_ptr<ListType> SchemaParser::_list_from_json(const rapidjson::Value& value) {
105
8
    int element_id = value[ELEMENT_ID].GetInt();
106
8
    std::unique_ptr<Type> element_type = _type_from_json(value[ELEMENT]);
107
8
    bool is_required = value[ELEMENT_REQUIRED].GetBool();
108
109
8
    if (is_required) {
110
6
        return ListType::of_required(element_id, std::move(element_type));
111
6
    } else {
112
2
        return ListType::of_optional(element_id, std::move(element_type));
113
2
    }
114
8
}
115
116
9
std::unique_ptr<MapType> SchemaParser::_map_from_json(const rapidjson::Value& value) {
117
9
    int key_id = value[KEY_ID].GetInt();
118
9
    std::unique_ptr<Type> key_type = _type_from_json(value[KEY]);
119
120
9
    int value_id = value[VALUE_ID].GetInt();
121
9
    std::unique_ptr<Type> value_type = _type_from_json(value[VALUE]);
122
123
9
    bool is_required = value[VALUE_REQUIRED].GetBool();
124
125
9
    if (is_required) {
126
6
        return MapType::of_required(key_id, value_id, std::move(key_type), std::move(value_type));
127
6
    } else {
128
3
        return MapType::of_optional(key_id, value_id, std::move(key_type), std::move(value_type));
129
3
    }
130
9
}
131
132
9
std::unique_ptr<Schema> SchemaParser::from_json(const std::string& json) {
133
9
    rapidjson::Document doc;
134
9
    doc.Parse(json.c_str());
135
136
9
    if (doc.HasParseError()) {
137
0
        throw doris::Exception(doris::ErrorCode::INTERNAL_ERROR, "Failed to parse JSON: {}.",
138
0
                               std::string(GetParseError_En(doc.GetParseError())));
139
0
    }
140
9
    std::unique_ptr<Type> type = _type_from_json(doc);
141
9
    return std::make_unique<Schema>(type->as_nested_type()->as_struct_type()->move_fields());
142
9
}
143
144
std::unordered_set<int> SchemaParser::_get_integer_set(const char* key,
145
0
                                                       const rapidjson::Value& value) {
146
0
    std::unordered_set<int> integer_set;
147
148
0
    if (value.HasMember(key) && value[key].IsArray()) {
149
0
        const rapidjson::Value& arr = value[key];
150
0
        for (uint32_t i = 0; i < arr.Size(); i++) {
151
0
            if (arr[i].IsInt()) {
152
0
                integer_set.insert(arr[i].GetInt());
153
0
            } else {
154
0
                throw doris::Exception(doris::ErrorCode::INTERNAL_ERROR,
155
0
                                       "Unexpected non-integer element in the array.");
156
0
            }
157
0
        }
158
0
    }
159
0
    return integer_set;
160
0
}
161
162
} // namespace doris::iceberg