/root/doris/be/src/exprs/json_functions.cpp
Line | Count | Source (jump to first uncovered line) |
1 | | // Licensed to the Apache Software Foundation (ASF) under one |
2 | | // or more contributor license agreements. See the NOTICE file |
3 | | // distributed with this work for additional information |
4 | | // regarding copyright ownership. The ASF licenses this file |
5 | | // to you under the Apache License, Version 2.0 (the |
6 | | // "License"); you may not use this file except in compliance |
7 | | // with the License. You may obtain a copy of the License at |
8 | | // |
9 | | // http://www.apache.org/licenses/LICENSE-2.0 |
10 | | // |
11 | | // Unless required by applicable law or agreed to in writing, |
12 | | // software distributed under the License is distributed on an |
13 | | // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY |
14 | | // KIND, either express or implied. See the License for the |
15 | | // specific language governing permissions and limitations |
16 | | // under the License. |
17 | | |
#include "exprs/json_functions.h"

#include <rapidjson/allocators.h>
#include <rapidjson/document.h>
#include <rapidjson/encodings.h>
#include <rapidjson/rapidjson.h>
#include <rapidjson/stringbuffer.h>
#include <rapidjson/writer.h>
#include <re2/re2.h>
#include <simdjson/error.h>
#include <simdjson/simdjson.h> // IWYU pragma: keep
#include <stdlib.h>

#include <boost/iterator/iterator_facade.hpp>
#include <boost/token_functions.hpp>
#include <boost/tokenizer.hpp>
#include <charconv>
#include <sstream>
#include <string>
#include <system_error>
#include <vector>

#include "common/compiler_util.h" // IWYU pragma: keep
#include "common/exception.h"
#include "common/logging.h"
41 | | |
42 | | namespace doris { |
43 | | |
44 | | // static const re2::RE2 JSON_PATTERN("^([a-zA-Z0-9_\\-\\:\\s#\\|\\.]*)(?:\\[([0-9]+)\\])?"); |
45 | | // Matches one json path segment: a key that cannot contain ", [ or ], optionally followed by [N] or [*]. |
46 | | static const re2::RE2 JSON_PATTERN("^([^\\\"\\[\\]]*)(?:\\[([0-9]+|\\*)\\])?"); |
47 | | |
48 | | rapidjson::Value* JsonFunctions::match_value(const std::vector<JsonPath>& parsed_paths, |
49 | | rapidjson::Value* document, |
50 | | rapidjson::Document::AllocatorType& mem_allocator, |
51 | 8 | bool is_insert_null) { |
52 | 8 | rapidjson::Value* root = document; |
53 | 8 | rapidjson::Value* array_obj = nullptr; |
54 | 21 | for (int i = 1; i < parsed_paths.size(); i++) { |
55 | 13 | VLOG_TRACE << "parsed_paths: " << parsed_paths[i].debug_string(); |
56 | | |
57 | 13 | if (root == nullptr || root->IsNull()) { |
58 | 0 | return nullptr; |
59 | 0 | } |
60 | | |
61 | 13 | if (UNLIKELY(!parsed_paths[i].is_valid)) { |
62 | 0 | return nullptr; |
63 | 0 | } |
64 | | |
65 | 13 | const std::string& col = parsed_paths[i].key; |
66 | 13 | int index = parsed_paths[i].idx; |
67 | 13 | if (LIKELY(!col.empty())) { |
68 | 9 | if (root->IsArray()) { |
69 | 5 | array_obj = static_cast<rapidjson::Value*>( |
70 | 5 | mem_allocator.Malloc(sizeof(rapidjson::Value))); |
71 | 5 | array_obj->SetArray(); |
72 | 5 | bool is_null = true; |
73 | | |
74 | | // if array ,loop the array,find out all Objects,then find the results from the objects |
75 | 16 | for (int j = 0; j < root->Size(); j++) { |
76 | 11 | rapidjson::Value* json_elem = &((*root)[j]); |
77 | | |
78 | 11 | if (json_elem->IsArray() || json_elem->IsNull()) { |
79 | 0 | continue; |
80 | 11 | } else { |
81 | 11 | if (!json_elem->IsObject()) { |
82 | 0 | continue; |
83 | 0 | } |
84 | 11 | if (!json_elem->HasMember(col.c_str())) { |
85 | 1 | if (is_insert_null) { // not found item, then insert a null object. |
86 | 1 | is_null = false; |
87 | 1 | rapidjson::Value nullObject(rapidjson::kNullType); |
88 | 1 | array_obj->PushBack(nullObject, mem_allocator); |
89 | 1 | } |
90 | 1 | continue; |
91 | 1 | } |
92 | 10 | rapidjson::Value* obj = &((*json_elem)[col.c_str()]); |
93 | 10 | if (obj->IsArray()) { |
94 | 0 | is_null = false; |
95 | 0 | for (int k = 0; k < obj->Size(); k++) { |
96 | 0 | array_obj->PushBack((*obj)[k], mem_allocator); |
97 | 0 | } |
98 | 10 | } else if (!obj->IsNull()) { |
99 | 10 | is_null = false; |
100 | 10 | array_obj->PushBack(*obj, mem_allocator); |
101 | 10 | } |
102 | 10 | } |
103 | 11 | } |
104 | | |
105 | 5 | root = is_null ? &(array_obj->SetNull()) : array_obj; |
106 | 5 | } else if (root->IsObject()) { |
107 | 4 | if (!root->HasMember(col.c_str())) { |
108 | 0 | return nullptr; |
109 | 4 | } else { |
110 | 4 | root = &((*root)[col.c_str()]); |
111 | 4 | } |
112 | 4 | } else { |
113 | | // root is not a nested type, return nullptr |
114 | 0 | return nullptr; |
115 | 0 | } |
116 | 9 | } |
117 | | |
118 | 13 | if (UNLIKELY(index != -1)) { |
119 | | // judge the rapidjson:Value, which base the top's result, |
120 | | // if not array return nullptr;else get the index value from the array |
121 | 4 | if (root->IsArray()) { |
122 | 4 | if (root->IsNull()) { |
123 | 0 | return nullptr; |
124 | 4 | } else if (index == -2) { |
125 | | // [*] |
126 | 4 | array_obj = static_cast<rapidjson::Value*>( |
127 | 4 | mem_allocator.Malloc(sizeof(rapidjson::Value))); |
128 | 4 | array_obj->SetArray(); |
129 | | |
130 | 13 | for (int j = 0; j < root->Size(); j++) { |
131 | 9 | rapidjson::Value v; |
132 | 9 | v.CopyFrom((*root)[j], mem_allocator); |
133 | 9 | array_obj->PushBack(v, mem_allocator); |
134 | 9 | } |
135 | 4 | root = array_obj; |
136 | 4 | } else if (index >= root->Size()) { |
137 | 0 | return nullptr; |
138 | 0 | } else { |
139 | 0 | root = &((*root)[index]); |
140 | 0 | } |
141 | 4 | } else { |
142 | 0 | return nullptr; |
143 | 0 | } |
144 | 4 | } |
145 | 13 | } |
146 | 8 | return root; |
147 | 8 | } |
148 | | |
149 | | rapidjson::Value* JsonFunctions::get_json_array_from_parsed_json( |
150 | | const std::string& json_path, rapidjson::Value* document, |
151 | 9 | rapidjson::Document::AllocatorType& mem_allocator, bool* wrap_explicitly) { |
152 | 9 | std::vector<JsonPath> vec; |
153 | 9 | parse_json_paths(json_path, &vec); |
154 | 9 | return get_json_array_from_parsed_json(vec, document, mem_allocator, wrap_explicitly); |
155 | 9 | } |
156 | | |
157 | | rapidjson::Value* JsonFunctions::get_json_array_from_parsed_json( |
158 | | const std::vector<JsonPath>& parsed_paths, rapidjson::Value* document, |
159 | 9 | rapidjson::Document::AllocatorType& mem_allocator, bool* wrap_explicitly) { |
160 | 9 | *wrap_explicitly = false; |
161 | 9 | if (!parsed_paths[0].is_valid) { |
162 | 0 | return nullptr; |
163 | 0 | } |
164 | | |
165 | 9 | if (parsed_paths.size() == 1) { |
166 | | // the json path is "$", just return entire document |
167 | | // wrapper an array |
168 | 1 | rapidjson::Value* array_obj = nullptr; |
169 | 1 | array_obj = static_cast<rapidjson::Value*>(mem_allocator.Malloc(sizeof(rapidjson::Value))); |
170 | 1 | array_obj->SetArray(); |
171 | 1 | array_obj->PushBack(*document, mem_allocator); |
172 | 1 | return array_obj; |
173 | 1 | } |
174 | | |
175 | 8 | rapidjson::Value* root = match_value(parsed_paths, document, mem_allocator, true); |
176 | 8 | if (root == nullptr || root == document) { // not found |
177 | 0 | return nullptr; |
178 | 8 | } else if (!root->IsArray() && wrap_explicitly) { |
179 | 1 | rapidjson::Value* array_obj = nullptr; |
180 | 1 | array_obj = static_cast<rapidjson::Value*>(mem_allocator.Malloc(sizeof(rapidjson::Value))); |
181 | 1 | array_obj->SetArray(); |
182 | 1 | rapidjson::Value copy; |
183 | 1 | copy.CopyFrom(*root, mem_allocator); |
184 | 1 | array_obj->PushBack(std::move(copy), mem_allocator); |
185 | | // set `wrap_explicitly` to true, so that the caller knows that this Array is wrapped actively. |
186 | 1 | *wrap_explicitly = true; |
187 | 1 | return array_obj; |
188 | 1 | } |
189 | 7 | return root; |
190 | 8 | } |
191 | | |
192 | | rapidjson::Value* JsonFunctions::get_json_object_from_parsed_json( |
193 | | const std::vector<JsonPath>& parsed_paths, rapidjson::Value* document, |
194 | 0 | rapidjson::Document::AllocatorType& mem_allocator) { |
195 | 0 | if (!parsed_paths[0].is_valid) { |
196 | 0 | return nullptr; |
197 | 0 | } |
198 | | |
199 | 0 | if (parsed_paths.size() == 1) { |
200 | | // the json path is "$", just return entire document |
201 | 0 | return document; |
202 | 0 | } |
203 | | |
204 | 0 | rapidjson::Value* root = match_value(parsed_paths, document, mem_allocator, true); |
205 | 0 | if (root == nullptr || root == document) { // not found |
206 | 0 | return nullptr; |
207 | 0 | } |
208 | 0 | return root; |
209 | 0 | } |
210 | | |
211 | | void JsonFunctions::parse_json_paths(const std::string& path_string, |
212 | 9 | std::vector<JsonPath>* parsed_paths) { |
213 | | // split path by ".", and escape quota by "\" |
214 | | // eg: |
215 | | // '$.text#abc.xyz' -> [$, text#abc, xyz] |
216 | | // '$."text.abc".xyz' -> [$, text.abc, xyz] |
217 | | // '$."text.abc"[1].xyz' -> [$, text.abc[1], xyz] |
218 | 9 | try { |
219 | 9 | boost::tokenizer<boost::escaped_list_separator<char>> tok( |
220 | 9 | path_string, boost::escaped_list_separator<char>("\\", ".", "\"")); |
221 | 9 | std::vector<std::string> paths(tok.begin(), tok.end()); |
222 | 9 | get_parsed_paths(paths, parsed_paths); |
223 | 9 | } catch (const boost::escaped_list_error& err) { |
224 | 0 | throw doris::Exception(ErrorCode::INVALID_JSON_PATH, "meet error {}", err.what()); |
225 | 0 | } |
226 | 9 | } |
227 | | |
228 | | void JsonFunctions::get_parsed_paths(const std::vector<std::string>& path_exprs, |
229 | 9 | std::vector<JsonPath>* parsed_paths) { |
230 | 9 | if (path_exprs.empty()) { |
231 | 0 | return; |
232 | 0 | } |
233 | | |
234 | 9 | if (path_exprs[0] != "$") { |
235 | 0 | parsed_paths->emplace_back("", -1, false); |
236 | 9 | } else { |
237 | 9 | parsed_paths->emplace_back("$", -1, true); |
238 | 9 | } |
239 | | |
240 | 22 | for (int i = 1; i < path_exprs.size(); i++) { |
241 | 13 | std::string col; |
242 | 13 | std::string index; |
243 | 13 | if (UNLIKELY(!RE2::FullMatch(path_exprs[i], JSON_PATTERN, &col, &index))) { |
244 | 0 | parsed_paths->emplace_back("", -1, false); |
245 | 13 | } else { |
246 | 13 | int idx = -1; |
247 | 13 | if (!index.empty()) { |
248 | 4 | if (index == "*") { |
249 | 4 | idx = -2; |
250 | 4 | } else { |
251 | 0 | idx = atoi(index.c_str()); |
252 | 0 | } |
253 | 4 | } |
254 | 13 | parsed_paths->emplace_back(std::move(col), idx, true); |
255 | 13 | } |
256 | 13 | } |
257 | 9 | } |
258 | | |
259 | | Status JsonFunctions::extract_from_object(simdjson::ondemand::object& obj, |
260 | | const std::vector<JsonPath>& jsonpath, |
261 | 0 | simdjson::ondemand::value* value) noexcept { |
262 | | // Return DataQualityError when it's a malformed json. |
263 | | // Otherwise the path was not found, due to |
264 | | // 1. array out of bound |
265 | | // 2. not exist such field in object |
266 | | // 3. the input type is not object but could be null or other types and lead to simdjson::INCORRECT_TYPE |
267 | 0 | #define HANDLE_SIMDJSON_ERROR(err, msg) \ |
268 | 0 | do { \ |
269 | 0 | const simdjson::error_code& _err = err; \ |
270 | 0 | const std::string& _msg = msg; \ |
271 | 0 | if (UNLIKELY(_err)) { \ |
272 | 0 | if (_err == simdjson::NO_SUCH_FIELD || _err == simdjson::INDEX_OUT_OF_BOUNDS || \ |
273 | 0 | _err == simdjson::INCORRECT_TYPE) { \ |
274 | 0 | return Status::NotFound<false>( \ |
275 | 0 | fmt::format("Not found target filed, err: {}, msg: {}", \ |
276 | 0 | simdjson::error_message(_err), _msg)); \ |
277 | 0 | } \ |
278 | 0 | return Status::DataQualityError( \ |
279 | 0 | fmt::format("err: {}, msg: {}", simdjson::error_message(_err), _msg)); \ |
280 | 0 | } \ |
281 | 0 | } while (false); |
282 | |
|
283 | 0 | if (jsonpath.size() <= 1) { |
284 | | // The first elem of json path should be '$'. |
285 | | // A valid json path's size is >= 2. |
286 | 0 | return Status::DataQualityError("empty json path"); |
287 | 0 | } |
288 | | |
289 | 0 | simdjson::ondemand::value tvalue; |
290 | | |
291 | | // Skip the first $. |
292 | 0 | for (int i = 1; i < jsonpath.size(); i++) { |
293 | 0 | if (UNLIKELY(!jsonpath[i].is_valid)) { |
294 | 0 | return Status::DataQualityError(fmt::format("invalid json path: {}", jsonpath[i].key)); |
295 | 0 | } |
296 | | |
297 | 0 | const std::string& col = jsonpath[i].key; |
298 | 0 | int index = jsonpath[i].idx; |
299 | | |
300 | | // Since the simdjson::ondemand::object cannot be converted to simdjson::ondemand::value, |
301 | | // we have to do some special treatment for the second elem of json path. |
302 | | // If the key is not found in json object, simdjson::NO_SUCH_FIELD would be returned. |
303 | 0 | if (i == 1) { |
304 | 0 | HANDLE_SIMDJSON_ERROR(obj.find_field_unordered(col).get(tvalue), |
305 | 0 | fmt::format("unable to find field: {}", col)); |
306 | 0 | } else { |
307 | 0 | HANDLE_SIMDJSON_ERROR(tvalue.find_field_unordered(col).get(tvalue), |
308 | 0 | fmt::format("unable to find field: {}", col)); |
309 | 0 | } |
310 | | |
311 | | // TODO support [*] which idex == -2 |
312 | 0 | if (index != -1) { |
313 | | // try to access tvalue as array. |
314 | | // If the index is beyond the length of array, simdjson::INDEX_OUT_OF_BOUNDS would be returned. |
315 | 0 | simdjson::ondemand::array arr; |
316 | 0 | HANDLE_SIMDJSON_ERROR(tvalue.get_array().get(arr), |
317 | 0 | fmt::format("failed to access field as array, field: {}", col)); |
318 | |
|
319 | 0 | HANDLE_SIMDJSON_ERROR( |
320 | 0 | arr.at(index).get(tvalue), |
321 | 0 | fmt::format("failed to access array field: {}, index: {}", col, index)); |
322 | 0 | } |
323 | 0 | } |
324 | | |
325 | 0 | std::swap(*value, tvalue); |
326 | |
|
327 | 0 | return Status::OK(); |
328 | 0 | } |
329 | | |
330 | 0 | std::string JsonFunctions::print_json_value(const rapidjson::Value& value) { |
331 | 0 | rapidjson::StringBuffer buffer; |
332 | 0 | buffer.Clear(); |
333 | 0 | rapidjson::Writer<rapidjson::StringBuffer> writer(buffer); |
334 | 0 | value.Accept(writer); |
335 | 0 | return std::string(buffer.GetString()); |
336 | 0 | } |
337 | | |
338 | | void JsonFunctions::merge_objects(rapidjson::Value& dst_object, rapidjson::Value& src_object, |
339 | 0 | rapidjson::Document::AllocatorType& allocator) { |
340 | 0 | if (!src_object.IsObject()) { |
341 | 0 | return; |
342 | 0 | } |
343 | 0 | VLOG_DEBUG << "merge from src: " << print_json_value(src_object) |
344 | 0 | << ", to: " << print_json_value(dst_object); |
345 | 0 | for (auto src_it = src_object.MemberBegin(); src_it != src_object.MemberEnd(); ++src_it) { |
346 | 0 | auto dst_it = dst_object.FindMember(src_it->name); |
347 | 0 | if (dst_it != dst_object.MemberEnd()) { |
348 | 0 | if (src_it->value.IsObject() && dst_it->value.IsObject()) { |
349 | 0 | merge_objects(dst_it->value, src_it->value, allocator); |
350 | 0 | } else { |
351 | 0 | if (dst_it->value.IsNull()) { |
352 | 0 | dst_it->value = src_it->value; |
353 | 0 | } |
354 | 0 | } |
355 | 0 | } else { |
356 | 0 | dst_object.AddMember(src_it->name, src_it->value, allocator); |
357 | 0 | } |
358 | 0 | } |
359 | 0 | } |
360 | | |
361 | | // root path "$." |
362 | 0 | bool JsonFunctions::is_root_path(const std::vector<JsonPath>& json_path) { |
363 | 0 | return json_path.size() == 2 && json_path[0].key == "$" && json_path[1].key.empty(); |
364 | 0 | } |
365 | | |
366 | | } // namespace doris |