be/src/format/table/iceberg/schema_parser.cpp
Line | Count | Source |
1 | | // Licensed to the Apache Software Foundation (ASF) under one |
2 | | // or more contributor license agreements. See the NOTICE file |
3 | | // distributed with this work for additional information |
4 | | // regarding copyright ownership. The ASF licenses this file |
5 | | // to you under the Apache License, Version 2.0 (the |
6 | | // "License"); you may not use this file except in compliance |
7 | | // with the License. You may obtain a copy of the License at |
8 | | // |
9 | | // http://www.apache.org/licenses/LICENSE-2.0 |
10 | | // |
11 | | // Unless required by applicable law or agreed to in writing, |
12 | | // software distributed under the License is distributed on an |
13 | | // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY |
14 | | // KIND, either express or implied. See the License for the |
15 | | // specific language governing permissions and limitations |
16 | | // under the License. |
17 | | |
18 | | #include "format/table/iceberg/schema_parser.h" |
19 | | |
20 | | #include <rapidjson/document.h> |
21 | | #include <rapidjson/error/en.h> |
22 | | |
23 | | #include <cstdint> |
24 | | #include <optional> |
25 | | #include <unordered_set> |
26 | | |
27 | | #include "format/table/iceberg/schema.h" |
28 | | #include "format/table/iceberg/types.h" |
29 | | |
30 | | namespace doris::iceberg { |
31 | | #include "common/compile_check_begin.h" |
32 | | |
33 | | const char* SchemaParser::SCHEMA_ID = "schema-id"; |
34 | | const char* SchemaParser::IDENTIFIER_FIELD_IDS = "identifier-field-ids"; |
35 | | const char* SchemaParser::TYPE = "type"; |
36 | | const char* SchemaParser::STRUCT = "struct"; |
37 | | const char* SchemaParser::LIST = "list"; |
38 | | const char* SchemaParser::MAP = "map"; |
39 | | const char* SchemaParser::FIELDS = "fields"; |
40 | | const char* SchemaParser::ELEMENT = "element"; |
41 | | const char* SchemaParser::KEY = "key"; |
42 | | const char* SchemaParser::VALUE = "value"; |
43 | | const char* SchemaParser::DOC = "doc"; |
44 | | const char* SchemaParser::NAME = "name"; |
45 | | const char* SchemaParser::ID = "id"; |
46 | | const char* SchemaParser::ELEMENT_ID = "element-id"; |
47 | | const char* SchemaParser::KEY_ID = "key-id"; |
48 | | const char* SchemaParser::VALUE_ID = "value-id"; |
49 | | const char* SchemaParser::REQUIRED = "required"; |
50 | | const char* SchemaParser::ELEMENT_REQUIRED = "element-required"; |
51 | | const char* SchemaParser::VALUE_REQUIRED = "value-required"; |
52 | | |
53 | 47 | std::unique_ptr<Type> SchemaParser::_type_from_json(const rapidjson::Value& value) { |
54 | 47 | if (value.IsString()) { |
55 | 24 | return Types::from_primitive_string(value.GetString()); |
56 | 24 | } else if (value.IsObject()) { |
57 | 23 | if (value.HasMember(TYPE)) { |
58 | 23 | std::string type = value[TYPE].GetString(); |
59 | 23 | if (type == STRUCT) { |
60 | 6 | return _struct_from_json(value); |
61 | 17 | } else if (type == LIST) { |
62 | 8 | return _list_from_json(value); |
63 | 9 | } else if (type == MAP) { |
64 | 9 | return _map_from_json(value); |
65 | 9 | } |
66 | 23 | } |
67 | 23 | } |
68 | 0 | throw doris::Exception(doris::ErrorCode::INTERNAL_ERROR, "Cannot parse type from json."); |
69 | 47 | } |
70 | | |
71 | 6 | std::unique_ptr<StructType> SchemaParser::_struct_from_json(const rapidjson::Value& value) { |
72 | 6 | if (!value.HasMember("fields") || !value["fields"].IsArray()) { |
73 | 0 | throw doris::Exception(doris::ErrorCode::INTERNAL_ERROR, |
74 | 0 | "Cannot parse struct fields from non-array."); |
75 | 0 | } |
76 | | |
77 | 6 | const rapidjson::Value& field_array = value["fields"]; |
78 | 6 | std::vector<NestedField> fields; |
79 | 6 | fields.reserve(field_array.Size()); |
80 | | |
81 | 15 | for (uint32_t i = 0; i < field_array.Size(); ++i) { |
82 | 9 | const rapidjson::Value& field = field_array[i]; |
83 | 9 | if (!field.IsObject()) { |
84 | 0 | throw doris::Exception(doris::ErrorCode::INTERNAL_ERROR, |
85 | 0 | "Cannot parse struct field from non-object."); |
86 | 0 | } |
87 | | |
88 | 9 | int id = field[ID].GetInt(); |
89 | 9 | std::string name = field[NAME].GetString(); |
90 | 9 | std::unique_ptr<Type> type = _type_from_json(field[TYPE]); |
91 | | |
92 | 9 | std::optional<std::string> doc = std::nullopt; |
93 | 9 | if (field.HasMember(DOC)) { |
94 | 1 | doc = field[DOC].GetString(); |
95 | 1 | } |
96 | | |
97 | 9 | bool is_required = field[REQUIRED].GetBool(); |
98 | | |
99 | 9 | fields.emplace_back(!is_required, id, name, std::move(type), doc); |
100 | 9 | } |
101 | | |
102 | 6 | return std::make_unique<StructType>(std::move(fields)); |
103 | 6 | } |
104 | | |
105 | 8 | std::unique_ptr<ListType> SchemaParser::_list_from_json(const rapidjson::Value& value) { |
106 | 8 | int element_id = value[ELEMENT_ID].GetInt(); |
107 | 8 | std::unique_ptr<Type> element_type = _type_from_json(value[ELEMENT]); |
108 | 8 | bool is_required = value[ELEMENT_REQUIRED].GetBool(); |
109 | | |
110 | 8 | if (is_required) { |
111 | 6 | return ListType::of_required(element_id, std::move(element_type)); |
112 | 6 | } else { |
113 | 2 | return ListType::of_optional(element_id, std::move(element_type)); |
114 | 2 | } |
115 | 8 | } |
116 | | |
117 | 9 | std::unique_ptr<MapType> SchemaParser::_map_from_json(const rapidjson::Value& value) { |
118 | 9 | int key_id = value[KEY_ID].GetInt(); |
119 | 9 | std::unique_ptr<Type> key_type = _type_from_json(value[KEY]); |
120 | | |
121 | 9 | int value_id = value[VALUE_ID].GetInt(); |
122 | 9 | std::unique_ptr<Type> value_type = _type_from_json(value[VALUE]); |
123 | | |
124 | 9 | bool is_required = value[VALUE_REQUIRED].GetBool(); |
125 | | |
126 | 9 | if (is_required) { |
127 | 6 | return MapType::of_required(key_id, value_id, std::move(key_type), std::move(value_type)); |
128 | 6 | } else { |
129 | 3 | return MapType::of_optional(key_id, value_id, std::move(key_type), std::move(value_type)); |
130 | 3 | } |
131 | 9 | } |
132 | | |
133 | 5 | std::unique_ptr<Schema> SchemaParser::from_json(const std::string& json) { |
134 | 5 | rapidjson::Document doc; |
135 | 5 | doc.Parse(json.c_str()); |
136 | | |
137 | 5 | if (doc.HasParseError()) { |
138 | 0 | throw doris::Exception(doris::ErrorCode::INTERNAL_ERROR, "Failed to parse JSON: {}.", |
139 | 0 | std::string(GetParseError_En(doc.GetParseError()))); |
140 | 0 | } |
141 | 5 | std::unique_ptr<Type> type = _type_from_json(doc); |
142 | 5 | return std::make_unique<Schema>(type->as_nested_type()->as_struct_type()->move_fields()); |
143 | 5 | } |
144 | | |
145 | | std::unordered_set<int> SchemaParser::_get_integer_set(const char* key, |
146 | 0 | const rapidjson::Value& value) { |
147 | 0 | std::unordered_set<int> integer_set; |
148 | |
|
149 | 0 | if (value.HasMember(key) && value[key].IsArray()) { |
150 | 0 | const rapidjson::Value& arr = value[key]; |
151 | 0 | for (uint32_t i = 0; i < arr.Size(); i++) { |
152 | 0 | if (arr[i].IsInt()) { |
153 | 0 | integer_set.insert(arr[i].GetInt()); |
154 | 0 | } else { |
155 | 0 | throw doris::Exception(doris::ErrorCode::INTERNAL_ERROR, |
156 | 0 | "Unexpected non-integer element in the array."); |
157 | 0 | } |
158 | 0 | } |
159 | 0 | } |
160 | 0 | return integer_set; |
161 | 0 | } |
162 | | |
163 | | #include "common/compile_check_end.h" |
164 | | } // namespace doris::iceberg |