be/src/exprs/function/function_json.cpp
Line | Count | Source |
1 | | // Licensed to the Apache Software Foundation (ASF) under one |
2 | | // or more contributor license agreements. See the NOTICE file |
3 | | // distributed with this work for additional information |
4 | | // regarding copyright ownership. The ASF licenses this file |
5 | | // to you under the Apache License, Version 2.0 (the |
6 | | // "License"); you may not use this file except in compliance |
7 | | // with the License. You may obtain a copy of the License at |
8 | | // |
9 | | // http://www.apache.org/licenses/LICENSE-2.0 |
10 | | // |
11 | | // Unless required by applicable law or agreed to in writing, |
12 | | // software distributed under the License is distributed on an |
13 | | // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY |
14 | | // KIND, either express or implied. See the License for the |
15 | | // specific language governing permissions and limitations |
16 | | // under the License. |
17 | | |
18 | | #include <glog/logging.h> |
19 | | #include <rapidjson/allocators.h> |
20 | | #include <rapidjson/document.h> |
21 | | #include <rapidjson/encodings.h> |
22 | | #include <rapidjson/pointer.h> |
23 | | #include <rapidjson/rapidjson.h> |
24 | | #include <rapidjson/stringbuffer.h> |
25 | | #include <rapidjson/writer.h> |
26 | | #include <re2/re2.h> |
27 | | #include <stdint.h> |
28 | | #include <stdlib.h> |
29 | | #include <string.h> |
30 | | |
31 | | #include <algorithm> |
32 | | #include <boost/iterator/iterator_facade.hpp> |
33 | | #include <boost/token_functions.hpp> |
34 | | #include <boost/tokenizer.hpp> |
35 | | #include <memory> |
36 | | #include <string> |
37 | | #include <string_view> |
38 | | #include <type_traits> |
39 | | #include <utility> |
40 | | #include <vector> |
41 | | |
42 | | #include "common/cast_set.h" |
43 | | #include "common/compiler_util.h" // IWYU pragma: keep |
44 | | #include "common/status.h" |
45 | | #include "core/assert_cast.h" |
46 | | #include "core/block/block.h" |
47 | | #include "core/block/column_numbers.h" |
48 | | #include "core/block/column_with_type_and_name.h" |
49 | | #include "core/column/column.h" |
50 | | #include "core/column/column_nullable.h" |
51 | | #include "core/column/column_string.h" |
52 | | #include "core/column/column_vector.h" |
53 | | #include "core/data_type/data_type.h" |
54 | | #include "core/data_type/data_type_nullable.h" |
55 | | #include "core/data_type/data_type_number.h" |
56 | | #include "core/data_type/data_type_string.h" |
57 | | #include "core/string_ref.h" |
58 | | #include "core/types.h" |
59 | | #include "core/value/jsonb_value.h" |
60 | | #include "exec/common/stringop_substring.h" |
61 | | #include "exec/common/template_helpers.hpp" |
62 | | #include "exprs/aggregate/aggregate_function.h" |
63 | | #include "exprs/function/function.h" |
64 | | #include "exprs/function/function_totype.h" |
65 | | #include "exprs/function/simple_function_factory.h" |
66 | | #include "exprs/json_functions.h" |
67 | | #include "util/io_helper.h" |
68 | | #include "util/string_parser.hpp" |
69 | | #include "util/string_util.h" |
70 | | |
71 | | namespace doris { |
72 | | class FunctionContext; |
73 | | } // namespace doris |
74 | | |
75 | | namespace doris { |
// Matches one JSON path segment: an optional key made of any characters other
// than '"', '[' and ']' (capture 1), optionally followed by a bracketed
// subscript that is either a run of decimal digits or '*' (capture 2).
static const re2::RE2 JSON_PATTERN("^([^\\\"\\[\\]]*)(?:\\[([0-9]+|\\*)\\])?");
77 | | |
/// Splits `var` on the delimiter `p` and appends every piece to `res`.
///
/// Each piece is constructed in place from a (pointer, length) pair that
/// refers into `var`, so `T` must be constructible from `(const char*, size_t)`
/// and, when `T` is a non-owning view, `var` must outlive `res`.
///
/// Splitting rules:
///   - an empty input appends nothing;
///   - leading or consecutive delimiters produce empty pieces;
///   - a single trailing delimiter does not produce a trailing empty piece.
template <typename T, typename U>
void char_split(std::vector<T>& res, const U& var, char p) {
    const size_t end = var.length();
    size_t start = 0;
    while (start < end) {
        size_t pos = start;
        // Test the bound first so that var[pos] is never evaluated at pos == end,
        // which would read past the end of a string_view-like input.
        while (pos < end && var[pos] != p) {
            ++pos;
        }
        res.emplace_back(&var[start], pos - start);
        // Skip the delimiter (or step past the end, which terminates the loop).
        start = pos + 1;
    }
}
92 | | |
93 | | // T = std::vector<std::string> |
94 | | // TODO: update RE2 to support std::vector<std::string_view> |
95 | | template <typename T> |
96 | | void get_parsed_paths(const T& path_exprs, std::vector<JsonPath>* parsed_paths) { |
97 | | if (path_exprs.empty()) { |
98 | | return; |
99 | | } |
100 | | |
101 | | if (path_exprs[0] != "$") { |
102 | | parsed_paths->emplace_back("", -1, false); |
103 | | } else { |
104 | | parsed_paths->emplace_back("$", -1, true); |
105 | | } |
106 | | |
107 | | for (int i = 1; i < path_exprs.size(); i++) { |
108 | | std::string col; |
109 | | std::string index; |
110 | | if (UNLIKELY(!RE2::FullMatch(path_exprs[i], JSON_PATTERN, &col, &index))) { |
111 | | parsed_paths->emplace_back("", -1, false); |
112 | | } else { |
113 | | int idx = -1; |
114 | | if (!index.empty()) { |
115 | | if (index == "*") { |
116 | | idx = -2; |
117 | | } else { |
118 | | idx = atoi(index.c_str()); |
119 | | } |
120 | | } |
121 | | parsed_paths->emplace_back(col, idx, true); |
122 | | } |
123 | | } |
124 | | } |
125 | | |
126 | | rapidjson::Value* NO_SANITIZE_UNDEFINED |
127 | | match_value(const std::vector<JsonPath>& parsed_paths, rapidjson::Value* document, |
128 | 0 | rapidjson::Document::AllocatorType& mem_allocator, bool is_insert_null = false) { |
129 | 0 | rapidjson::Value* root = document; |
130 | 0 | rapidjson::Value* array_obj = nullptr; |
131 | 0 | for (int i = 1; i < parsed_paths.size(); i++) { |
132 | 0 | if (root == nullptr || root->IsNull()) { |
133 | 0 | return nullptr; |
134 | 0 | } |
135 | | |
136 | 0 | if (UNLIKELY(!parsed_paths[i].is_valid)) { |
137 | 0 | return nullptr; |
138 | 0 | } |
139 | | |
140 | 0 | const std::string& col = parsed_paths[i].key; |
141 | 0 | int index = parsed_paths[i].idx; |
142 | 0 | if (LIKELY(!col.empty())) { |
143 | 0 | if (root->IsObject()) { |
144 | 0 | if (!root->HasMember(col.c_str())) { |
145 | 0 | return nullptr; |
146 | 0 | } else { |
147 | 0 | root = &((*root)[col.c_str()]); |
148 | 0 | } |
149 | 0 | } else { |
150 | | // root is not a nested type, return NULL |
151 | 0 | return nullptr; |
152 | 0 | } |
153 | 0 | } |
154 | | |
155 | 0 | if (UNLIKELY(index != -1)) { |
156 | | // judge the rapidjson:Value, which base the top's result, |
157 | | // if not array return NULL;else get the index value from the array |
158 | 0 | if (root->IsArray()) { |
159 | 0 | if (root->IsNull()) { |
160 | 0 | return nullptr; |
161 | 0 | } else if (index == -2) { |
162 | | // [*] |
163 | 0 | array_obj = static_cast<rapidjson::Value*>( |
164 | 0 | mem_allocator.Malloc(sizeof(rapidjson::Value))); |
165 | 0 | array_obj->SetArray(); |
166 | |
|
167 | 0 | for (int j = 0; j < root->Size(); j++) { |
168 | 0 | rapidjson::Value v; |
169 | 0 | v.CopyFrom((*root)[j], mem_allocator); |
170 | 0 | array_obj->PushBack(v, mem_allocator); |
171 | 0 | } |
172 | 0 | root = array_obj; |
173 | 0 | } else if (index >= root->Size()) { |
174 | 0 | return nullptr; |
175 | 0 | } else { |
176 | 0 | root = &((*root)[index]); |
177 | 0 | } |
178 | 0 | } else { |
179 | 0 | return nullptr; |
180 | 0 | } |
181 | 0 | } |
182 | 0 | } |
183 | 0 | return root; |
184 | 0 | } |
185 | | |
186 | | template <JsonFunctionType fntype> |
187 | | rapidjson::Value* get_json_object(std::string_view json_string, std::string_view path_string, |
188 | | rapidjson::Document* document) { |
189 | | std::vector<JsonPath>* parsed_paths; |
190 | | std::vector<JsonPath> tmp_parsed_paths; |
191 | | |
192 | | //Cannot use '\' as the last character, return NULL |
193 | | if (path_string.back() == '\\') { |
194 | | return nullptr; |
195 | | } |
196 | | |
197 | | std::string fixed_string; |
198 | | if (path_string.size() >= 2 && path_string[0] == '$' && path_string[1] != '.') { |
199 | | // Boost tokenizer requires explicit "." after "$" to correctly extract JSON path tokens. |
200 | | // Without this, expressions like "$[0].key" cannot be properly split. |
201 | | // This commit ensures a "." is automatically added after "$" to maintain consistent token parsing behavior. |
202 | | fixed_string = "$."; |
203 | | fixed_string += path_string.substr(1); |
204 | | path_string = fixed_string; |
205 | | } |
206 | | |
207 | | try { |
208 | | #ifdef USE_LIBCPP |
209 | | std::string s(path_string); |
210 | | auto tok = get_json_token(s); |
211 | | #else |
212 | | auto tok = get_json_token(path_string); |
213 | | #endif |
214 | | std::vector<std::string> paths(tok.begin(), tok.end()); |
215 | | get_parsed_paths(paths, &tmp_parsed_paths); |
216 | | if (tmp_parsed_paths.empty()) { |
217 | | return document; |
218 | | } |
219 | | } catch (boost::escaped_list_error&) { |
220 | | // meet unknown escape sequence, example '$.name\k' |
221 | | return nullptr; |
222 | | } |
223 | | |
224 | | parsed_paths = &tmp_parsed_paths; |
225 | | |
226 | | if (!(*parsed_paths)[0].is_valid) { |
227 | | return nullptr; |
228 | | } |
229 | | |
230 | | if (UNLIKELY((*parsed_paths).size() == 1)) { |
231 | | if (fntype == JSON_FUN_STRING) { |
232 | | document->SetString(json_string.data(), |
233 | | cast_set<rapidjson::SizeType>(json_string.size()), |
234 | | document->GetAllocator()); |
235 | | } else { |
236 | | return document; |
237 | | } |
238 | | } |
239 | | |
240 | | document->Parse(json_string.data(), json_string.size()); |
241 | | if (UNLIKELY(document->HasParseError())) { |
242 | | // VLOG_CRITICAL << "Error at offset " << document->GetErrorOffset() << ": " |
243 | | // << GetParseError_En(document->GetParseError()); |
244 | | return nullptr; |
245 | | } |
246 | | |
247 | | return match_value(*parsed_paths, document, document->GetAllocator()); |
248 | | } |
249 | | |
250 | | template <int flag> |
251 | | struct JsonParser { |
252 | | //string |
253 | | static void update_value(StringParser::ParseResult& result, rapidjson::Value& value, |
254 | | StringRef data, rapidjson::Document::AllocatorType& allocator) { |
255 | | value.SetString(data.data, cast_set<rapidjson::SizeType>(data.size), allocator); |
256 | | } |
257 | | }; |
258 | | |
259 | | template <> |
260 | | struct JsonParser<'0'> { |
261 | | // null |
262 | | static void update_value(StringParser::ParseResult& result, rapidjson::Value& value, |
263 | 0 | StringRef data, rapidjson::Document::AllocatorType& allocator) { |
264 | 0 | value.SetNull(); |
265 | 0 | } |
266 | | }; |
267 | | |
268 | | template <> |
269 | | struct JsonParser<'1'> { |
270 | | // bool |
271 | | static void update_value(StringParser::ParseResult& result, rapidjson::Value& value, |
272 | 0 | StringRef data, rapidjson::Document::AllocatorType& allocator) { |
273 | 0 | DCHECK(data.size == 1 || strncmp(data.data, "true", 4) == 0 || |
274 | 0 | strncmp(data.data, "false", 5) == 0); |
275 | 0 | value.SetBool(*data.data == '1' || *data.data == 't'); |
276 | 0 | } |
277 | | }; |
278 | | |
279 | | template <> |
280 | | struct JsonParser<'2'> { |
281 | | // int |
282 | | static void update_value(StringParser::ParseResult& result, rapidjson::Value& value, |
283 | 0 | StringRef data, rapidjson::Document::AllocatorType& allocator) { |
284 | 0 | value.SetInt(StringParser::string_to_int<int32_t>(data.data, data.size, &result)); |
285 | 0 | } |
286 | | }; |
287 | | |
288 | | template <> |
289 | | struct JsonParser<'3'> { |
290 | | // double |
291 | | static void update_value(StringParser::ParseResult& result, rapidjson::Value& value, |
292 | 0 | StringRef data, rapidjson::Document::AllocatorType& allocator) { |
293 | 0 | value.SetDouble(StringParser::string_to_float<double>(data.data, data.size, &result)); |
294 | 0 | } |
295 | | }; |
296 | | |
297 | | template <> |
298 | | struct JsonParser<'4'> { |
299 | | // time |
300 | | static void update_value(StringParser::ParseResult& result, rapidjson::Value& value, |
301 | 0 | StringRef data, rapidjson::Document::AllocatorType& allocator) { |
302 | 0 | // remove double quotes, "xxx" -> xxx |
303 | 0 | value.SetString(data.data + 1, cast_set<rapidjson::SizeType>(data.size - 2), allocator); |
304 | 0 | } |
305 | | }; |
306 | | |
307 | | template <> |
308 | | struct JsonParser<'5'> { |
309 | | // bigint |
310 | | static void update_value(StringParser::ParseResult& result, rapidjson::Value& value, |
311 | 0 | StringRef data, rapidjson::Document::AllocatorType& allocator) { |
312 | 0 | value.SetInt64(StringParser::string_to_int<int64_t>(data.data, data.size, &result)); |
313 | 0 | } |
314 | | }; |
315 | | |
316 | | template <> |
317 | | struct JsonParser<'7'> { |
318 | | // json string |
319 | | static void update_value(StringParser::ParseResult& result, rapidjson::Value& value, |
320 | 0 | StringRef data, rapidjson::Document::AllocatorType& allocator) { |
321 | 0 | rapidjson::Document document; |
322 | 0 | const JsonbValue* json_val = JsonbDocument::createValue(data.data, data.size); |
323 | 0 | convert_jsonb_to_rapidjson(*json_val, document, allocator); |
324 | 0 | value.CopyFrom(document, allocator); |
325 | 0 | } |
326 | | }; |
327 | | |
328 | | template <int flag, typename Impl> |
329 | | struct ExecuteReducer { |
330 | | template <typename... TArgs> |
331 | | static void run(TArgs&&... args) { |
332 | | Impl::template execute_type<JsonParser<flag>>(std::forward<TArgs>(args)...); |
333 | | } |
334 | | }; |
335 | | |
336 | | struct FunctionJsonQuoteImpl { |
337 | | static constexpr auto name = "json_quote"; |
338 | | |
339 | 0 | static DataTypePtr get_return_type_impl(const DataTypes& arguments) { |
340 | 0 | if (!arguments.empty() && arguments[0] && arguments[0]->is_nullable()) { |
341 | 0 | return make_nullable(std::make_shared<DataTypeString>()); |
342 | 0 | } |
343 | 0 | return std::make_shared<DataTypeString>(); |
344 | 0 | } |
345 | | static void execute(const std::vector<const ColumnString*>& data_columns, |
346 | 0 | ColumnString& result_column, size_t input_rows_count) { |
347 | 0 | rapidjson::Document document; |
348 | 0 | rapidjson::Document::AllocatorType& allocator = document.GetAllocator(); |
349 | |
|
350 | 0 | rapidjson::Value value; |
351 | |
|
352 | 0 | rapidjson::StringBuffer buf; |
353 | |
|
354 | 0 | for (int i = 0; i < input_rows_count; i++) { |
355 | 0 | StringRef data = data_columns[0]->get_data_at(i); |
356 | 0 | value.SetString(data.data, cast_set<rapidjson::SizeType>(data.size), allocator); |
357 | |
|
358 | 0 | buf.Clear(); |
359 | 0 | rapidjson::Writer<rapidjson::StringBuffer> writer(buf); |
360 | 0 | value.Accept(writer); |
361 | 0 | result_column.insert_data(buf.GetString(), buf.GetSize()); |
362 | 0 | } |
363 | 0 | } |
364 | | }; |
365 | | |
366 | | template <typename Impl> |
367 | | class FunctionJson : public IFunction { |
368 | | public: |
369 | | static constexpr auto name = Impl::name; |
370 | | |
371 | 2 | static FunctionPtr create() { return std::make_shared<FunctionJson<Impl>>(); } |
372 | | |
373 | 0 | String get_name() const override { return name; } |
374 | | |
375 | 0 | size_t get_number_of_arguments() const override { return 0; } |
376 | | |
377 | 1 | bool is_variadic() const override { return true; } |
378 | | |
379 | 0 | DataTypePtr get_return_type_impl(const DataTypes& arguments) const override { |
380 | 0 | return Impl::get_return_type_impl(arguments); |
381 | 0 | } |
382 | | |
383 | | Status execute_impl(FunctionContext* context, Block& block, const ColumnNumbers& arguments, |
384 | 0 | uint32_t result, size_t input_rows_count) const override { |
385 | 0 | auto result_column = ColumnString::create(); |
386 | |
|
387 | 0 | std::vector<ColumnPtr> column_ptrs; // prevent converted column destruct |
388 | 0 | std::vector<const ColumnString*> data_columns; |
389 | 0 | for (int i = 0; i < arguments.size(); i++) { |
390 | 0 | column_ptrs.push_back( |
391 | 0 | block.get_by_position(arguments[i]).column->convert_to_full_column_if_const()); |
392 | 0 | data_columns.push_back(assert_cast<const ColumnString*>(column_ptrs.back().get())); |
393 | 0 | } |
394 | |
|
395 | 0 | Impl::execute(data_columns, *assert_cast<ColumnString*>(result_column.get()), |
396 | 0 | input_rows_count); |
397 | 0 | block.get_by_position(result).column = std::move(result_column); |
398 | 0 | return Status::OK(); |
399 | 0 | } |
400 | | }; |
401 | | |
402 | | template <typename Impl> |
403 | | class FunctionJsonNullable : public IFunction { |
404 | | public: |
405 | | static constexpr auto name = Impl::name; |
406 | | static FunctionPtr create() { return std::make_shared<FunctionJsonNullable<Impl>>(); } |
407 | | String get_name() const override { return name; } |
408 | | size_t get_number_of_arguments() const override { return 0; } |
409 | | bool is_variadic() const override { return true; } |
410 | | DataTypePtr get_return_type_impl(const DataTypes& arguments) const override { |
411 | | return make_nullable(std::make_shared<DataTypeString>()); |
412 | | } |
413 | | Status execute_impl(FunctionContext* context, Block& block, const ColumnNumbers& arguments, |
414 | | uint32_t result, size_t input_rows_count) const override { |
415 | | auto result_column = ColumnString::create(); |
416 | | auto null_map = ColumnUInt8::create(input_rows_count, 0); |
417 | | std::vector<const ColumnString*> data_columns; |
418 | | std::vector<bool> column_is_consts; |
419 | | for (int i = 0; i < arguments.size(); i++) { |
420 | | ColumnPtr arg_col; |
421 | | bool arg_const; |
422 | | std::tie(arg_col, arg_const) = |
423 | | unpack_if_const(block.get_by_position(arguments[i]).column); |
424 | | column_is_consts.push_back(arg_const); |
425 | | data_columns.push_back(assert_cast<const ColumnString*>(arg_col.get())); |
426 | | } |
427 | | Impl::execute(data_columns, *assert_cast<ColumnString*>(result_column.get()), |
428 | | null_map->get_data(), input_rows_count, column_is_consts); |
429 | | block.replace_by_position( |
430 | | result, ColumnNullable::create(std::move(result_column), std::move(null_map))); |
431 | | return Status::OK(); |
432 | | } |
433 | | }; |
434 | | |
435 | | class FunctionJsonValid : public IFunction { |
436 | | public: |
437 | | static constexpr auto name = "json_valid"; |
438 | 2 | static FunctionPtr create() { return std::make_shared<FunctionJsonValid>(); } |
439 | | |
440 | 1 | String get_name() const override { return name; } |
441 | | |
442 | 0 | size_t get_number_of_arguments() const override { return 1; } |
443 | | |
444 | 0 | DataTypePtr get_return_type_impl(const DataTypes& arguments) const override { |
445 | 0 | return make_nullable(std::make_shared<DataTypeInt32>()); |
446 | 0 | } |
447 | | |
448 | 0 | bool use_default_implementation_for_nulls() const override { return false; } |
449 | | |
450 | | Status execute_impl(FunctionContext* context, Block& block, const ColumnNumbers& arguments, |
451 | 0 | uint32_t result, size_t input_rows_count) const override { |
452 | 0 | const IColumn& col_from = *(block.get_by_position(arguments[0]).column); |
453 | |
|
454 | 0 | auto null_map = ColumnUInt8::create(input_rows_count, 0); |
455 | |
|
456 | 0 | const ColumnUInt8::Container* input_null_map = nullptr; |
457 | 0 | const ColumnString* col_from_string = nullptr; |
458 | 0 | if (const auto* nullable = check_and_get_column<ColumnNullable>(col_from)) { |
459 | 0 | input_null_map = &nullable->get_null_map_data(); |
460 | 0 | col_from_string = |
461 | 0 | check_and_get_column<ColumnString>(*nullable->get_nested_column_ptr()); |
462 | 0 | } else { |
463 | 0 | col_from_string = check_and_get_column<ColumnString>(col_from); |
464 | 0 | } |
465 | |
|
466 | 0 | if (!col_from_string) { |
467 | 0 | return Status::RuntimeError("Illegal column {} should be ColumnString", |
468 | 0 | col_from.get_name()); |
469 | 0 | } |
470 | | |
471 | 0 | auto col_to = ColumnInt32::create(); |
472 | 0 | auto& vec_to = col_to->get_data(); |
473 | 0 | size_t size = col_from.size(); |
474 | 0 | vec_to.resize(size); |
475 | | |
476 | | // parser can be reused for performance |
477 | |
|
478 | 0 | auto input_type = block.get_by_position(arguments[0]).type->get_primitive_type(); |
479 | |
|
480 | 0 | if (input_type == PrimitiveType::TYPE_VARCHAR || input_type == PrimitiveType::TYPE_CHAR || |
481 | 0 | input_type == PrimitiveType::TYPE_STRING) { |
482 | 0 | JsonBinaryValue jsonb_value; |
483 | 0 | for (size_t i = 0; i < input_rows_count; ++i) { |
484 | 0 | if (input_null_map && (*input_null_map)[i]) { |
485 | 0 | null_map->get_data()[i] = 1; |
486 | 0 | vec_to[i] = 0; |
487 | 0 | continue; |
488 | 0 | } |
489 | | |
490 | 0 | const auto& val = col_from_string->get_data_at(i); |
491 | 0 | if (jsonb_value.from_json_string(val.data, cast_set<unsigned int>(val.size)).ok()) { |
492 | 0 | vec_to[i] = 1; |
493 | 0 | } else { |
494 | 0 | vec_to[i] = 0; |
495 | 0 | } |
496 | 0 | } |
497 | |
|
498 | 0 | } else { |
499 | 0 | DCHECK(input_type == PrimitiveType::TYPE_JSONB); |
500 | 0 | for (size_t i = 0; i < input_rows_count; ++i) { |
501 | 0 | if (input_null_map && (*input_null_map)[i]) { |
502 | 0 | null_map->get_data()[i] = 1; |
503 | 0 | vec_to[i] = 0; |
504 | 0 | continue; |
505 | 0 | } |
506 | 0 | const auto& val = col_from_string->get_data_at(i); |
507 | 0 | if (val.size == 0) { |
508 | 0 | vec_to[i] = 0; |
509 | 0 | continue; |
510 | 0 | } |
511 | 0 | const JsonbDocument* doc = nullptr; |
512 | 0 | auto st = JsonbDocument::checkAndCreateDocument(val.data, val.size, &doc); |
513 | 0 | if (!st.ok() || !doc || !doc->getValue()) [[unlikely]] { |
514 | 0 | vec_to[i] = 0; |
515 | 0 | continue; |
516 | 0 | } |
517 | 0 | const JsonbValue* value = doc->getValue(); |
518 | 0 | if (UNLIKELY(!value)) { |
519 | 0 | vec_to[i] = 0; |
520 | 0 | continue; |
521 | 0 | } |
522 | 0 | vec_to[i] = 1; |
523 | 0 | } |
524 | 0 | } |
525 | |
|
526 | 0 | block.replace_by_position(result, |
527 | 0 | ColumnNullable::create(std::move(col_to), std::move(null_map))); |
528 | |
|
529 | 0 | return Status::OK(); |
530 | 0 | } |
531 | | }; |
532 | | class FunctionJsonUnquote : public IFunction { |
533 | | public: |
534 | | static constexpr auto name = "json_unquote"; |
535 | 2 | static FunctionPtr create() { return std::make_shared<FunctionJsonUnquote>(); } |
536 | | |
537 | 1 | String get_name() const override { return name; } |
538 | | |
539 | 0 | size_t get_number_of_arguments() const override { return 1; } |
540 | | |
541 | 0 | DataTypePtr get_return_type_impl(const DataTypes& arguments) const override { |
542 | 0 | return make_nullable(std::make_shared<DataTypeString>()); |
543 | 0 | } |
544 | | |
545 | 0 | bool use_default_implementation_for_nulls() const override { return false; } |
546 | | |
547 | | Status execute_impl(FunctionContext* context, Block& block, const ColumnNumbers& arguments, |
548 | 0 | uint32_t result, size_t input_rows_count) const override { |
549 | 0 | const IColumn& col_from = *(block.get_by_position(arguments[0]).column); |
550 | |
|
551 | 0 | auto null_map = ColumnUInt8::create(input_rows_count, 0); |
552 | |
|
553 | 0 | const ColumnString* col_from_string = check_and_get_column<ColumnString>(col_from); |
554 | 0 | if (auto* nullable = check_and_get_column<ColumnNullable>(col_from)) { |
555 | 0 | col_from_string = |
556 | 0 | check_and_get_column<ColumnString>(*nullable->get_nested_column_ptr()); |
557 | 0 | } |
558 | |
|
559 | 0 | if (!col_from_string) { |
560 | 0 | return Status::RuntimeError("Illegal column {} should be ColumnString", |
561 | 0 | col_from.get_name()); |
562 | 0 | } |
563 | | |
564 | 0 | auto col_to = ColumnString::create(); |
565 | 0 | col_to->reserve(input_rows_count); |
566 | | |
567 | | // parser can be reused for performance |
568 | 0 | rapidjson::Document document; |
569 | 0 | for (size_t i = 0; i < input_rows_count; ++i) { |
570 | 0 | if (col_from.is_null_at(i)) { |
571 | 0 | null_map->get_data()[i] = 1; |
572 | 0 | col_to->insert_data(nullptr, 0); |
573 | 0 | continue; |
574 | 0 | } |
575 | | |
576 | 0 | const auto& json_str = col_from_string->get_data_at(i); |
577 | 0 | if (json_str.size < 2 || json_str.data[0] != '"' || |
578 | 0 | json_str.data[json_str.size - 1] != '"') { |
579 | | // non-quoted string |
580 | 0 | col_to->insert_data(json_str.data, json_str.size); |
581 | 0 | } else { |
582 | 0 | document.Parse(json_str.data, json_str.size); |
583 | 0 | if (document.HasParseError() || !document.IsString()) { |
584 | 0 | return Status::RuntimeError( |
585 | 0 | fmt::format("Invalid JSON text in argument 1 to function {}: {}", name, |
586 | 0 | std::string_view(json_str.data, json_str.size))); |
587 | 0 | } |
588 | 0 | col_to->insert_data(document.GetString(), document.GetStringLength()); |
589 | 0 | } |
590 | 0 | } |
591 | | |
592 | 0 | block.replace_by_position(result, |
593 | 0 | ColumnNullable::create(std::move(col_to), std::move(null_map))); |
594 | |
|
595 | 0 | return Status::OK(); |
596 | 0 | } |
597 | | }; |
598 | | |
599 | 1 | void register_function_json(SimpleFunctionFactory& factory) { |
600 | 1 | factory.register_function<FunctionJsonUnquote>(); |
601 | | |
602 | 1 | factory.register_function<FunctionJson<FunctionJsonQuoteImpl>>(); |
603 | | |
604 | 1 | factory.register_function<FunctionJsonValid>(); |
605 | 1 | } |
606 | | |
607 | | } // namespace doris |