/root/doris/be/src/exprs/json_functions.cpp
Line | Count | Source (jump to first uncovered line) |
1 | | // Licensed to the Apache Software Foundation (ASF) under one |
2 | | // or more contributor license agreements. See the NOTICE file |
3 | | // distributed with this work for additional information |
4 | | // regarding copyright ownership. The ASF licenses this file |
5 | | // to you under the Apache License, Version 2.0 (the |
6 | | // "License"); you may not use this file except in compliance |
7 | | // with the License. You may obtain a copy of the License at |
8 | | // |
9 | | // http://www.apache.org/licenses/LICENSE-2.0 |
10 | | // |
11 | | // Unless required by applicable law or agreed to in writing, |
12 | | // software distributed under the License is distributed on an |
13 | | // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY |
14 | | // KIND, either express or implied. See the License for the |
15 | | // specific language governing permissions and limitations |
16 | | // under the License. |
17 | | |
18 | | #include "exprs/json_functions.h" |
19 | | |
20 | | #include <rapidjson/allocators.h> |
21 | | #include <rapidjson/document.h> |
22 | | #include <rapidjson/encodings.h> |
23 | | #include <rapidjson/rapidjson.h> |
24 | | #include <re2/re2.h> |
25 | | #include <simdjson/error.h> |
26 | | #include <simdjson/simdjson.h> // IWYU pragma: keep |
27 | | #include <stdlib.h> |
28 | | |
29 | | #include <boost/iterator/iterator_facade.hpp> |
30 | | #include <boost/token_functions.hpp> |
31 | | #include <boost/tokenizer.hpp> |
32 | | #include <sstream> |
33 | | #include <string> |
34 | | #include <vector> |
35 | | |
36 | | // IWYU pragma: no_include <opentelemetry/common/threadlocal.h> |
37 | | #include "common/compiler_util.h" // IWYU pragma: keep |
38 | | #include "common/exception.h" |
39 | | #include "common/logging.h" |
40 | | |
41 | | namespace doris { |
42 | | |
// Earlier, stricter pattern kept for reference (explicit whitelist of key characters,
// numeric index only):
// static const re2::RE2 JSON_PATTERN("^([a-zA-Z0-9_\\-\\:\\s#\\|\\.]*)(?:\\[([0-9]+)\\])?");
//
// Pattern for a single json path segment such as `key`, `key[3]` or `key[*]`.
//   group 1: the key -- any (possibly empty) run of characters other than `"`, `[` and `]`
//   group 2: optional subscript between `[` and `]` -- decimal digits or `*`
static const re2::RE2 JSON_PATTERN("^([^\\\"\\[\\]]*)(?:\\[([0-9]+|\\*)\\])?");
46 | | |
47 | | rapidjson::Value* JsonFunctions::match_value(const std::vector<JsonPath>& parsed_paths, |
48 | | rapidjson::Value* document, |
49 | | rapidjson::Document::AllocatorType& mem_allocator, |
50 | 8 | bool is_insert_null) { |
51 | 8 | rapidjson::Value* root = document; |
52 | 8 | rapidjson::Value* array_obj = nullptr; |
53 | 21 | for (int i = 1; i < parsed_paths.size(); i++) { |
54 | 13 | VLOG_TRACE << "parsed_paths: " << parsed_paths[i].debug_string(); |
55 | | |
56 | 13 | if (root == nullptr || root->IsNull()) { |
57 | 0 | return nullptr; |
58 | 0 | } |
59 | | |
60 | 13 | if (UNLIKELY(!parsed_paths[i].is_valid)) { |
61 | 0 | return nullptr; |
62 | 0 | } |
63 | | |
64 | 13 | const std::string& col = parsed_paths[i].key; |
65 | 13 | int index = parsed_paths[i].idx; |
66 | 13 | if (LIKELY(!col.empty())) { |
67 | 9 | if (root->IsArray()) { |
68 | 5 | array_obj = static_cast<rapidjson::Value*>( |
69 | 5 | mem_allocator.Malloc(sizeof(rapidjson::Value))); |
70 | 5 | array_obj->SetArray(); |
71 | 5 | bool is_null = true; |
72 | | |
73 | | // if array ,loop the array,find out all Objects,then find the results from the objects |
74 | 16 | for (int j = 0; j < root->Size(); j++) { |
75 | 11 | rapidjson::Value* json_elem = &((*root)[j]); |
76 | | |
77 | 11 | if (json_elem->IsArray() || json_elem->IsNull()) { |
78 | 0 | continue; |
79 | 11 | } else { |
80 | 11 | if (!json_elem->IsObject()) { |
81 | 0 | continue; |
82 | 0 | } |
83 | 11 | if (!json_elem->HasMember(col.c_str())) { |
84 | 1 | if (is_insert_null) { // not found item, then insert a null object. |
85 | 1 | is_null = false; |
86 | 1 | rapidjson::Value nullObject(rapidjson::kNullType); |
87 | 1 | array_obj->PushBack(nullObject, mem_allocator); |
88 | 1 | } |
89 | 1 | continue; |
90 | 1 | } |
91 | 10 | rapidjson::Value* obj = &((*json_elem)[col.c_str()]); |
92 | 10 | if (obj->IsArray()) { |
93 | 0 | is_null = false; |
94 | 0 | for (int k = 0; k < obj->Size(); k++) { |
95 | 0 | array_obj->PushBack((*obj)[k], mem_allocator); |
96 | 0 | } |
97 | 10 | } else if (!obj->IsNull()) { |
98 | 10 | is_null = false; |
99 | 10 | array_obj->PushBack(*obj, mem_allocator); |
100 | 10 | } |
101 | 10 | } |
102 | 11 | } |
103 | | |
104 | 5 | root = is_null ? &(array_obj->SetNull()) : array_obj; |
105 | 5 | } else if (root->IsObject()) { |
106 | 4 | if (!root->HasMember(col.c_str())) { |
107 | 0 | return nullptr; |
108 | 4 | } else { |
109 | 4 | root = &((*root)[col.c_str()]); |
110 | 4 | } |
111 | 4 | } else { |
112 | | // root is not a nested type, return nullptr |
113 | 0 | return nullptr; |
114 | 0 | } |
115 | 9 | } |
116 | | |
117 | 13 | if (UNLIKELY(index != -1)) { |
118 | | // judge the rapidjson:Value, which base the top's result, |
119 | | // if not array return nullptr;else get the index value from the array |
120 | 4 | if (root->IsArray()) { |
121 | 4 | if (root->IsNull()) { |
122 | 0 | return nullptr; |
123 | 4 | } else if (index == -2) { |
124 | | // [*] |
125 | 4 | array_obj = static_cast<rapidjson::Value*>( |
126 | 4 | mem_allocator.Malloc(sizeof(rapidjson::Value))); |
127 | 4 | array_obj->SetArray(); |
128 | | |
129 | 13 | for (int j = 0; j < root->Size(); j++) { |
130 | 9 | rapidjson::Value v; |
131 | 9 | v.CopyFrom((*root)[j], mem_allocator); |
132 | 9 | array_obj->PushBack(v, mem_allocator); |
133 | 9 | } |
134 | 4 | root = array_obj; |
135 | 4 | } else if (index >= root->Size()) { |
136 | 0 | return nullptr; |
137 | 0 | } else { |
138 | 0 | root = &((*root)[index]); |
139 | 0 | } |
140 | 4 | } else { |
141 | 0 | return nullptr; |
142 | 0 | } |
143 | 4 | } |
144 | 13 | } |
145 | 8 | return root; |
146 | 8 | } |
147 | | |
148 | | rapidjson::Value* JsonFunctions::get_json_array_from_parsed_json( |
149 | | const std::string& json_path, rapidjson::Value* document, |
150 | 9 | rapidjson::Document::AllocatorType& mem_allocator, bool* wrap_explicitly) { |
151 | 9 | std::vector<JsonPath> vec; |
152 | 9 | parse_json_paths(json_path, &vec); |
153 | 9 | return get_json_array_from_parsed_json(vec, document, mem_allocator, wrap_explicitly); |
154 | 9 | } |
155 | | |
156 | | rapidjson::Value* JsonFunctions::get_json_array_from_parsed_json( |
157 | | const std::vector<JsonPath>& parsed_paths, rapidjson::Value* document, |
158 | 9 | rapidjson::Document::AllocatorType& mem_allocator, bool* wrap_explicitly) { |
159 | 9 | *wrap_explicitly = false; |
160 | 9 | if (!parsed_paths[0].is_valid) { |
161 | 0 | return nullptr; |
162 | 0 | } |
163 | | |
164 | 9 | if (parsed_paths.size() == 1) { |
165 | | // the json path is "$", just return entire document |
166 | | // wrapper an array |
167 | 1 | rapidjson::Value* array_obj = nullptr; |
168 | 1 | array_obj = static_cast<rapidjson::Value*>(mem_allocator.Malloc(sizeof(rapidjson::Value))); |
169 | 1 | array_obj->SetArray(); |
170 | 1 | array_obj->PushBack(*document, mem_allocator); |
171 | 1 | return array_obj; |
172 | 1 | } |
173 | | |
174 | 8 | rapidjson::Value* root = match_value(parsed_paths, document, mem_allocator, true); |
175 | 8 | if (root == nullptr || root == document) { // not found |
176 | 0 | return nullptr; |
177 | 8 | } else if (!root->IsArray() && wrap_explicitly) { |
178 | 1 | rapidjson::Value* array_obj = nullptr; |
179 | 1 | array_obj = static_cast<rapidjson::Value*>(mem_allocator.Malloc(sizeof(rapidjson::Value))); |
180 | 1 | array_obj->SetArray(); |
181 | 1 | rapidjson::Value copy; |
182 | 1 | copy.CopyFrom(*root, mem_allocator); |
183 | 1 | array_obj->PushBack(std::move(copy), mem_allocator); |
184 | | // set `wrap_explicitly` to true, so that the caller knows that this Array is wrapped actively. |
185 | 1 | *wrap_explicitly = true; |
186 | 1 | return array_obj; |
187 | 1 | } |
188 | 7 | return root; |
189 | 8 | } |
190 | | |
191 | | rapidjson::Value* JsonFunctions::get_json_object_from_parsed_json( |
192 | | const std::vector<JsonPath>& parsed_paths, rapidjson::Value* document, |
193 | 0 | rapidjson::Document::AllocatorType& mem_allocator) { |
194 | 0 | if (!parsed_paths[0].is_valid) { |
195 | 0 | return nullptr; |
196 | 0 | } |
197 | | |
198 | 0 | if (parsed_paths.size() == 1) { |
199 | | // the json path is "$", just return entire document |
200 | 0 | return document; |
201 | 0 | } |
202 | | |
203 | 0 | rapidjson::Value* root = match_value(parsed_paths, document, mem_allocator, true); |
204 | 0 | if (root == nullptr || root == document) { // not found |
205 | 0 | return nullptr; |
206 | 0 | } |
207 | 0 | return root; |
208 | 0 | } |
209 | | |
210 | | void JsonFunctions::parse_json_paths(const std::string& path_string, |
211 | 9 | std::vector<JsonPath>* parsed_paths) { |
212 | | // split path by ".", and escape quota by "\" |
213 | | // eg: |
214 | | // '$.text#abc.xyz' -> [$, text#abc, xyz] |
215 | | // '$."text.abc".xyz' -> [$, text.abc, xyz] |
216 | | // '$."text.abc"[1].xyz' -> [$, text.abc[1], xyz] |
217 | 9 | try { |
218 | 9 | boost::tokenizer<boost::escaped_list_separator<char>> tok( |
219 | 9 | path_string, boost::escaped_list_separator<char>("\\", ".", "\"")); |
220 | 9 | std::vector<std::string> paths(tok.begin(), tok.end()); |
221 | 9 | get_parsed_paths(paths, parsed_paths); |
222 | 9 | } catch (const boost::escaped_list_error& err) { |
223 | 0 | throw doris::Exception(ErrorCode::INVALID_JSON_PATH, "meet error {}", err.what()); |
224 | 0 | } |
225 | 9 | } |
226 | | |
227 | | void JsonFunctions::get_parsed_paths(const std::vector<std::string>& path_exprs, |
228 | 9 | std::vector<JsonPath>* parsed_paths) { |
229 | 9 | if (path_exprs.empty()) { |
230 | 0 | return; |
231 | 0 | } |
232 | | |
233 | 9 | if (path_exprs[0] != "$") { |
234 | 0 | parsed_paths->emplace_back("", -1, false); |
235 | 9 | } else { |
236 | 9 | parsed_paths->emplace_back("$", -1, true); |
237 | 9 | } |
238 | | |
239 | 22 | for (int i = 1; i < path_exprs.size(); i++) { |
240 | 13 | std::string col; |
241 | 13 | std::string index; |
242 | 13 | if (UNLIKELY(!RE2::FullMatch(path_exprs[i], JSON_PATTERN, &col, &index))) { |
243 | 0 | parsed_paths->emplace_back("", -1, false); |
244 | 13 | } else { |
245 | 13 | int idx = -1; |
246 | 13 | if (!index.empty()) { |
247 | 4 | if (index == "*") { |
248 | 4 | idx = -2; |
249 | 4 | } else { |
250 | 0 | idx = atoi(index.c_str()); |
251 | 0 | } |
252 | 4 | } |
253 | 13 | parsed_paths->emplace_back(std::move(col), idx, true); |
254 | 13 | } |
255 | 13 | } |
256 | 9 | } |
257 | | |
258 | | Status JsonFunctions::extract_from_object(simdjson::ondemand::object& obj, |
259 | | const std::vector<JsonPath>& jsonpath, |
260 | 0 | simdjson::ondemand::value* value) noexcept { |
261 | | // Return DataQualityError when it's a malformed json. |
262 | | // Otherwise the path was not found, due to |
263 | | // 1. array out of bound |
264 | | // 2. not exist such field in object |
265 | | // 3. the input type is not object but could be null or other types and lead to simdjson::INCORRECT_TYPE |
266 | 0 | #define HANDLE_SIMDJSON_ERROR(err, msg) \ |
267 | 0 | do { \ |
268 | 0 | const simdjson::error_code& _err = err; \ |
269 | 0 | const std::string& _msg = msg; \ |
270 | 0 | if (UNLIKELY(_err)) { \ |
271 | 0 | if (_err == simdjson::NO_SUCH_FIELD || _err == simdjson::INDEX_OUT_OF_BOUNDS || \ |
272 | 0 | _err == simdjson::INCORRECT_TYPE) { \ |
273 | 0 | return Status::NotFound<false>( \ |
274 | 0 | fmt::format("Not found target filed, err: {}, msg: {}", \ |
275 | 0 | simdjson::error_message(_err), _msg)); \ |
276 | 0 | } \ |
277 | 0 | return Status::DataQualityError( \ |
278 | 0 | fmt::format("err: {}, msg: {}", simdjson::error_message(_err), _msg)); \ |
279 | 0 | } \ |
280 | 0 | } while (false); |
281 | |
|
282 | 0 | if (jsonpath.size() <= 1) { |
283 | | // The first elem of json path should be '$'. |
284 | | // A valid json path's size is >= 2. |
285 | 0 | return Status::DataQualityError("empty json path"); |
286 | 0 | } |
287 | | |
288 | 0 | simdjson::ondemand::value tvalue; |
289 | | |
290 | | // Skip the first $. |
291 | 0 | for (int i = 1; i < jsonpath.size(); i++) { |
292 | 0 | if (UNLIKELY(!jsonpath[i].is_valid)) { |
293 | 0 | return Status::DataQualityError(fmt::format("invalid json path: {}", jsonpath[i].key)); |
294 | 0 | } |
295 | | |
296 | 0 | const std::string& col = jsonpath[i].key; |
297 | 0 | int index = jsonpath[i].idx; |
298 | | |
299 | | // Since the simdjson::ondemand::object cannot be converted to simdjson::ondemand::value, |
300 | | // we have to do some special treatment for the second elem of json path. |
301 | | // If the key is not found in json object, simdjson::NO_SUCH_FIELD would be returned. |
302 | 0 | if (i == 1) { |
303 | 0 | HANDLE_SIMDJSON_ERROR(obj.find_field_unordered(col).get(tvalue), |
304 | 0 | fmt::format("unable to find field: {}", col)); |
305 | 0 | } else { |
306 | 0 | HANDLE_SIMDJSON_ERROR(tvalue.find_field_unordered(col).get(tvalue), |
307 | 0 | fmt::format("unable to find field: {}", col)); |
308 | 0 | } |
309 | | |
310 | | // TODO support [*] which idex == -2 |
311 | 0 | if (index != -1) { |
312 | | // try to access tvalue as array. |
313 | | // If the index is beyond the length of array, simdjson::INDEX_OUT_OF_BOUNDS would be returned. |
314 | 0 | simdjson::ondemand::array arr; |
315 | 0 | HANDLE_SIMDJSON_ERROR(tvalue.get_array().get(arr), |
316 | 0 | fmt::format("failed to access field as array, field: {}", col)); |
317 | |
|
318 | 0 | HANDLE_SIMDJSON_ERROR( |
319 | 0 | arr.at(index).get(tvalue), |
320 | 0 | fmt::format("failed to access array field: {}, index: {}", col, index)); |
321 | 0 | } |
322 | 0 | } |
323 | | |
324 | 0 | std::swap(*value, tvalue); |
325 | |
|
326 | 0 | return Status::OK(); |
327 | 0 | } |
328 | | |
329 | | } // namespace doris |