be/src/exprs/function/function_json.cpp
Line | Count | Source |
1 | | // Licensed to the Apache Software Foundation (ASF) under one |
2 | | // or more contributor license agreements. See the NOTICE file |
3 | | // distributed with this work for additional information |
4 | | // regarding copyright ownership. The ASF licenses this file |
5 | | // to you under the Apache License, Version 2.0 (the |
6 | | // "License"); you may not use this file except in compliance |
7 | | // with the License. You may obtain a copy of the License at |
8 | | // |
9 | | // http://www.apache.org/licenses/LICENSE-2.0 |
10 | | // |
11 | | // Unless required by applicable law or agreed to in writing, |
12 | | // software distributed under the License is distributed on an |
13 | | // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY |
14 | | // KIND, either express or implied. See the License for the |
15 | | // specific language governing permissions and limitations |
16 | | // under the License. |
17 | | |
18 | | #include <glog/logging.h> |
19 | | #include <rapidjson/allocators.h> |
20 | | #include <rapidjson/document.h> |
21 | | #include <rapidjson/encodings.h> |
22 | | #include <rapidjson/pointer.h> |
23 | | #include <rapidjson/rapidjson.h> |
24 | | #include <rapidjson/stringbuffer.h> |
25 | | #include <rapidjson/writer.h> |
26 | | #include <re2/re2.h> |
27 | | #include <stdint.h> |
28 | | #include <stdlib.h> |
29 | | #include <string.h> |
30 | | |
31 | | #include <algorithm> |
32 | | #include <boost/iterator/iterator_facade.hpp> |
33 | | #include <boost/token_functions.hpp> |
34 | | #include <boost/tokenizer.hpp> |
35 | | #include <memory> |
36 | | #include <string> |
37 | | #include <string_view> |
38 | | #include <type_traits> |
39 | | #include <utility> |
40 | | #include <vector> |
41 | | |
42 | | #include "common/cast_set.h" |
43 | | #include "common/compiler_util.h" // IWYU pragma: keep |
44 | | #include "common/status.h" |
45 | | #include "core/assert_cast.h" |
46 | | #include "core/block/block.h" |
47 | | #include "core/block/column_numbers.h" |
48 | | #include "core/block/column_with_type_and_name.h" |
49 | | #include "core/column/column.h" |
50 | | #include "core/column/column_nullable.h" |
51 | | #include "core/column/column_string.h" |
52 | | #include "core/column/column_vector.h" |
53 | | #include "core/data_type/data_type.h" |
54 | | #include "core/data_type/data_type_nullable.h" |
55 | | #include "core/data_type/data_type_number.h" |
56 | | #include "core/data_type/data_type_string.h" |
57 | | #include "core/string_ref.h" |
58 | | #include "core/types.h" |
59 | | #include "core/value/jsonb_value.h" |
60 | | #include "exec/common/stringop_substring.h" |
61 | | #include "exec/common/template_helpers.hpp" |
62 | | #include "exprs/aggregate/aggregate_function.h" |
63 | | #include "exprs/function/function.h" |
64 | | #include "exprs/function/function_totype.h" |
65 | | #include "exprs/function/simple_function_factory.h" |
66 | | #include "exprs/json_functions.h" |
67 | | #include "util/io_helper.h" |
68 | | #include "util/string_parser.hpp" |
69 | | #include "util/string_util.h" |
70 | | |
71 | | namespace doris { |
72 | | class FunctionContext; |
73 | | } // namespace doris |
74 | | |
75 | | namespace doris { |
76 | | #include "common/compile_check_begin.h" |
77 | | static const re2::RE2 JSON_PATTERN("^([^\\\"\\[\\]]*)(?:\\[([0-9]+|\\*)\\])?"); |
78 | | |
/// Splits `var` on every occurrence of the delimiter `p` and appends one
/// element per field to `res`.
///
/// Each element is constructed in place from (pointer to the first character
/// of the field, field length), so `T` must be constructible from
/// `(const char*, size_t)`.
///
/// Consecutive delimiters produce empty fields. A delimiter that is the very
/// last character does not produce a trailing empty field, and an empty input
/// produces no fields at all.
///
/// @param res  output vector; fields are appended, existing content is kept.
/// @param var  string-like input providing `length()` and `operator[]`.
/// @param p    delimiter character.
template <typename T, typename U>
void char_split(std::vector<T>& res, const U& var, char p) {
    const size_t end = var.length();
    size_t start = 0;
    while (start < end) {
        size_t pos = start;
        // The bounds check must come first: `var[end]` is not a valid element
        // for view-like inputs that are not NUL-terminated.
        while (pos < end && var[pos] != p) {
            ++pos;
        }
        res.emplace_back(&var[start], pos - start);
        // Skip the delimiter (or step past the end, which terminates the loop).
        start = pos + 1;
    }
}
93 | | |
94 | | // T = std::vector<std::string> |
95 | | // TODO: update RE2 to support std::vector<std::string_view> |
96 | | template <typename T> |
97 | | void get_parsed_paths(const T& path_exprs, std::vector<JsonPath>* parsed_paths) { |
98 | | if (path_exprs.empty()) { |
99 | | return; |
100 | | } |
101 | | |
102 | | if (path_exprs[0] != "$") { |
103 | | parsed_paths->emplace_back("", -1, false); |
104 | | } else { |
105 | | parsed_paths->emplace_back("$", -1, true); |
106 | | } |
107 | | |
108 | | for (int i = 1; i < path_exprs.size(); i++) { |
109 | | std::string col; |
110 | | std::string index; |
111 | | if (UNLIKELY(!RE2::FullMatch(path_exprs[i], JSON_PATTERN, &col, &index))) { |
112 | | parsed_paths->emplace_back("", -1, false); |
113 | | } else { |
114 | | int idx = -1; |
115 | | if (!index.empty()) { |
116 | | if (index == "*") { |
117 | | idx = -2; |
118 | | } else { |
119 | | idx = atoi(index.c_str()); |
120 | | } |
121 | | } |
122 | | parsed_paths->emplace_back(col, idx, true); |
123 | | } |
124 | | } |
125 | | } |
126 | | |
127 | | rapidjson::Value* NO_SANITIZE_UNDEFINED |
128 | | match_value(const std::vector<JsonPath>& parsed_paths, rapidjson::Value* document, |
129 | 0 | rapidjson::Document::AllocatorType& mem_allocator, bool is_insert_null = false) { |
130 | 0 | rapidjson::Value* root = document; |
131 | 0 | rapidjson::Value* array_obj = nullptr; |
132 | 0 | for (int i = 1; i < parsed_paths.size(); i++) { |
133 | 0 | if (root == nullptr || root->IsNull()) { |
134 | 0 | return nullptr; |
135 | 0 | } |
136 | | |
137 | 0 | if (UNLIKELY(!parsed_paths[i].is_valid)) { |
138 | 0 | return nullptr; |
139 | 0 | } |
140 | | |
141 | 0 | const std::string& col = parsed_paths[i].key; |
142 | 0 | int index = parsed_paths[i].idx; |
143 | 0 | if (LIKELY(!col.empty())) { |
144 | 0 | if (root->IsObject()) { |
145 | 0 | if (!root->HasMember(col.c_str())) { |
146 | 0 | return nullptr; |
147 | 0 | } else { |
148 | 0 | root = &((*root)[col.c_str()]); |
149 | 0 | } |
150 | 0 | } else { |
151 | | // root is not a nested type, return NULL |
152 | 0 | return nullptr; |
153 | 0 | } |
154 | 0 | } |
155 | | |
156 | 0 | if (UNLIKELY(index != -1)) { |
157 | | // judge the rapidjson:Value, which base the top's result, |
158 | | // if not array return NULL;else get the index value from the array |
159 | 0 | if (root->IsArray()) { |
160 | 0 | if (root->IsNull()) { |
161 | 0 | return nullptr; |
162 | 0 | } else if (index == -2) { |
163 | | // [*] |
164 | 0 | array_obj = static_cast<rapidjson::Value*>( |
165 | 0 | mem_allocator.Malloc(sizeof(rapidjson::Value))); |
166 | 0 | array_obj->SetArray(); |
167 | |
|
168 | 0 | for (int j = 0; j < root->Size(); j++) { |
169 | 0 | rapidjson::Value v; |
170 | 0 | v.CopyFrom((*root)[j], mem_allocator); |
171 | 0 | array_obj->PushBack(v, mem_allocator); |
172 | 0 | } |
173 | 0 | root = array_obj; |
174 | 0 | } else if (index >= root->Size()) { |
175 | 0 | return nullptr; |
176 | 0 | } else { |
177 | 0 | root = &((*root)[index]); |
178 | 0 | } |
179 | 0 | } else { |
180 | 0 | return nullptr; |
181 | 0 | } |
182 | 0 | } |
183 | 0 | } |
184 | 0 | return root; |
185 | 0 | } |
186 | | |
187 | | template <JsonFunctionType fntype> |
188 | | rapidjson::Value* get_json_object(std::string_view json_string, std::string_view path_string, |
189 | | rapidjson::Document* document) { |
190 | | std::vector<JsonPath>* parsed_paths; |
191 | | std::vector<JsonPath> tmp_parsed_paths; |
192 | | |
193 | | //Cannot use '\' as the last character, return NULL |
194 | | if (path_string.back() == '\\') { |
195 | | return nullptr; |
196 | | } |
197 | | |
198 | | std::string fixed_string; |
199 | | if (path_string.size() >= 2 && path_string[0] == '$' && path_string[1] != '.') { |
200 | | // Boost tokenizer requires explicit "." after "$" to correctly extract JSON path tokens. |
201 | | // Without this, expressions like "$[0].key" cannot be properly split. |
202 | | // This commit ensures a "." is automatically added after "$" to maintain consistent token parsing behavior. |
203 | | fixed_string = "$."; |
204 | | fixed_string += path_string.substr(1); |
205 | | path_string = fixed_string; |
206 | | } |
207 | | |
208 | | try { |
209 | | #ifdef USE_LIBCPP |
210 | | std::string s(path_string); |
211 | | auto tok = get_json_token(s); |
212 | | #else |
213 | | auto tok = get_json_token(path_string); |
214 | | #endif |
215 | | std::vector<std::string> paths(tok.begin(), tok.end()); |
216 | | get_parsed_paths(paths, &tmp_parsed_paths); |
217 | | if (tmp_parsed_paths.empty()) { |
218 | | return document; |
219 | | } |
220 | | } catch (boost::escaped_list_error&) { |
221 | | // meet unknown escape sequence, example '$.name\k' |
222 | | return nullptr; |
223 | | } |
224 | | |
225 | | parsed_paths = &tmp_parsed_paths; |
226 | | |
227 | | if (!(*parsed_paths)[0].is_valid) { |
228 | | return nullptr; |
229 | | } |
230 | | |
231 | | if (UNLIKELY((*parsed_paths).size() == 1)) { |
232 | | if (fntype == JSON_FUN_STRING) { |
233 | | document->SetString(json_string.data(), |
234 | | cast_set<rapidjson::SizeType>(json_string.size()), |
235 | | document->GetAllocator()); |
236 | | } else { |
237 | | return document; |
238 | | } |
239 | | } |
240 | | |
241 | | document->Parse(json_string.data(), json_string.size()); |
242 | | if (UNLIKELY(document->HasParseError())) { |
243 | | // VLOG_CRITICAL << "Error at offset " << document->GetErrorOffset() << ": " |
244 | | // << GetParseError_En(document->GetParseError()); |
245 | | return nullptr; |
246 | | } |
247 | | |
248 | | return match_value(*parsed_paths, document, document->GetAllocator()); |
249 | | } |
250 | | |
251 | | template <int flag> |
252 | | struct JsonParser { |
253 | | //string |
254 | | static void update_value(StringParser::ParseResult& result, rapidjson::Value& value, |
255 | | StringRef data, rapidjson::Document::AllocatorType& allocator) { |
256 | | value.SetString(data.data, cast_set<rapidjson::SizeType>(data.size), allocator); |
257 | | } |
258 | | }; |
259 | | |
260 | | template <> |
261 | | struct JsonParser<'0'> { |
262 | | // null |
263 | | static void update_value(StringParser::ParseResult& result, rapidjson::Value& value, |
264 | 0 | StringRef data, rapidjson::Document::AllocatorType& allocator) { |
265 | 0 | value.SetNull(); |
266 | 0 | } |
267 | | }; |
268 | | |
269 | | template <> |
270 | | struct JsonParser<'1'> { |
271 | | // bool |
272 | | static void update_value(StringParser::ParseResult& result, rapidjson::Value& value, |
273 | 0 | StringRef data, rapidjson::Document::AllocatorType& allocator) { |
274 | 0 | DCHECK(data.size == 1 || strncmp(data.data, "true", 4) == 0 || |
275 | 0 | strncmp(data.data, "false", 5) == 0); |
276 | 0 | value.SetBool(*data.data == '1' || *data.data == 't'); |
277 | 0 | } |
278 | | }; |
279 | | |
280 | | template <> |
281 | | struct JsonParser<'2'> { |
282 | | // int |
283 | | static void update_value(StringParser::ParseResult& result, rapidjson::Value& value, |
284 | 0 | StringRef data, rapidjson::Document::AllocatorType& allocator) { |
285 | 0 | value.SetInt(StringParser::string_to_int<int32_t>(data.data, data.size, &result)); |
286 | 0 | } |
287 | | }; |
288 | | |
289 | | template <> |
290 | | struct JsonParser<'3'> { |
291 | | // double |
292 | | static void update_value(StringParser::ParseResult& result, rapidjson::Value& value, |
293 | 0 | StringRef data, rapidjson::Document::AllocatorType& allocator) { |
294 | 0 | value.SetDouble(StringParser::string_to_float<double>(data.data, data.size, &result)); |
295 | 0 | } |
296 | | }; |
297 | | |
298 | | template <> |
299 | | struct JsonParser<'4'> { |
300 | | // time |
301 | | static void update_value(StringParser::ParseResult& result, rapidjson::Value& value, |
302 | 0 | StringRef data, rapidjson::Document::AllocatorType& allocator) { |
303 | 0 | // remove double quotes, "xxx" -> xxx |
304 | 0 | value.SetString(data.data + 1, cast_set<rapidjson::SizeType>(data.size - 2), allocator); |
305 | 0 | } |
306 | | }; |
307 | | |
308 | | template <> |
309 | | struct JsonParser<'5'> { |
310 | | // bigint |
311 | | static void update_value(StringParser::ParseResult& result, rapidjson::Value& value, |
312 | 0 | StringRef data, rapidjson::Document::AllocatorType& allocator) { |
313 | 0 | value.SetInt64(StringParser::string_to_int<int64_t>(data.data, data.size, &result)); |
314 | 0 | } |
315 | | }; |
316 | | |
317 | | template <> |
318 | | struct JsonParser<'7'> { |
319 | | // json string |
320 | | static void update_value(StringParser::ParseResult& result, rapidjson::Value& value, |
321 | 0 | StringRef data, rapidjson::Document::AllocatorType& allocator) { |
322 | 0 | rapidjson::Document document; |
323 | 0 | const JsonbValue* json_val = JsonbDocument::createValue(data.data, data.size); |
324 | 0 | convert_jsonb_to_rapidjson(*json_val, document, allocator); |
325 | 0 | value.CopyFrom(document, allocator); |
326 | 0 | } |
327 | | }; |
328 | | |
329 | | template <int flag, typename Impl> |
330 | | struct ExecuteReducer { |
331 | | template <typename... TArgs> |
332 | | static void run(TArgs&&... args) { |
333 | | Impl::template execute_type<JsonParser<flag>>(std::forward<TArgs>(args)...); |
334 | | } |
335 | | }; |
336 | | |
337 | | struct FunctionJsonQuoteImpl { |
338 | | static constexpr auto name = "json_quote"; |
339 | | |
340 | 0 | static DataTypePtr get_return_type_impl(const DataTypes& arguments) { |
341 | 0 | if (!arguments.empty() && arguments[0] && arguments[0]->is_nullable()) { |
342 | 0 | return make_nullable(std::make_shared<DataTypeString>()); |
343 | 0 | } |
344 | 0 | return std::make_shared<DataTypeString>(); |
345 | 0 | } |
346 | | static void execute(const std::vector<const ColumnString*>& data_columns, |
347 | 0 | ColumnString& result_column, size_t input_rows_count) { |
348 | 0 | rapidjson::Document document; |
349 | 0 | rapidjson::Document::AllocatorType& allocator = document.GetAllocator(); |
350 | |
|
351 | 0 | rapidjson::Value value; |
352 | |
|
353 | 0 | rapidjson::StringBuffer buf; |
354 | |
|
355 | 0 | for (int i = 0; i < input_rows_count; i++) { |
356 | 0 | StringRef data = data_columns[0]->get_data_at(i); |
357 | 0 | value.SetString(data.data, cast_set<rapidjson::SizeType>(data.size), allocator); |
358 | |
|
359 | 0 | buf.Clear(); |
360 | 0 | rapidjson::Writer<rapidjson::StringBuffer> writer(buf); |
361 | 0 | value.Accept(writer); |
362 | 0 | result_column.insert_data(buf.GetString(), buf.GetSize()); |
363 | 0 | } |
364 | 0 | } |
365 | | }; |
366 | | |
367 | | template <typename Impl> |
368 | | class FunctionJson : public IFunction { |
369 | | public: |
370 | | static constexpr auto name = Impl::name; |
371 | | |
372 | 8 | static FunctionPtr create() { return std::make_shared<FunctionJson<Impl>>(); } |
373 | | |
374 | 0 | String get_name() const override { return name; } |
375 | | |
376 | 0 | size_t get_number_of_arguments() const override { return 0; } |
377 | | |
378 | 1 | bool is_variadic() const override { return true; } |
379 | | |
380 | 0 | DataTypePtr get_return_type_impl(const DataTypes& arguments) const override { |
381 | 0 | return Impl::get_return_type_impl(arguments); |
382 | 0 | } |
383 | | |
384 | | Status execute_impl(FunctionContext* context, Block& block, const ColumnNumbers& arguments, |
385 | 0 | uint32_t result, size_t input_rows_count) const override { |
386 | 0 | auto result_column = ColumnString::create(); |
387 | |
|
388 | 0 | std::vector<ColumnPtr> column_ptrs; // prevent converted column destruct |
389 | 0 | std::vector<const ColumnString*> data_columns; |
390 | 0 | for (int i = 0; i < arguments.size(); i++) { |
391 | 0 | column_ptrs.push_back( |
392 | 0 | block.get_by_position(arguments[i]).column->convert_to_full_column_if_const()); |
393 | 0 | data_columns.push_back(assert_cast<const ColumnString*>(column_ptrs.back().get())); |
394 | 0 | } |
395 | |
|
396 | 0 | Impl::execute(data_columns, *assert_cast<ColumnString*>(result_column.get()), |
397 | 0 | input_rows_count); |
398 | 0 | block.get_by_position(result).column = std::move(result_column); |
399 | 0 | return Status::OK(); |
400 | 0 | } |
401 | | }; |
402 | | |
403 | | template <typename Impl> |
404 | | class FunctionJsonNullable : public IFunction { |
405 | | public: |
406 | | static constexpr auto name = Impl::name; |
407 | | static FunctionPtr create() { return std::make_shared<FunctionJsonNullable<Impl>>(); } |
408 | | String get_name() const override { return name; } |
409 | | size_t get_number_of_arguments() const override { return 0; } |
410 | | bool is_variadic() const override { return true; } |
411 | | DataTypePtr get_return_type_impl(const DataTypes& arguments) const override { |
412 | | return make_nullable(std::make_shared<DataTypeString>()); |
413 | | } |
414 | | Status execute_impl(FunctionContext* context, Block& block, const ColumnNumbers& arguments, |
415 | | uint32_t result, size_t input_rows_count) const override { |
416 | | auto result_column = ColumnString::create(); |
417 | | auto null_map = ColumnUInt8::create(input_rows_count, 0); |
418 | | std::vector<const ColumnString*> data_columns; |
419 | | std::vector<bool> column_is_consts; |
420 | | for (int i = 0; i < arguments.size(); i++) { |
421 | | ColumnPtr arg_col; |
422 | | bool arg_const; |
423 | | std::tie(arg_col, arg_const) = |
424 | | unpack_if_const(block.get_by_position(arguments[i]).column); |
425 | | column_is_consts.push_back(arg_const); |
426 | | data_columns.push_back(assert_cast<const ColumnString*>(arg_col.get())); |
427 | | } |
428 | | Impl::execute(data_columns, *assert_cast<ColumnString*>(result_column.get()), |
429 | | null_map->get_data(), input_rows_count, column_is_consts); |
430 | | block.replace_by_position( |
431 | | result, ColumnNullable::create(std::move(result_column), std::move(null_map))); |
432 | | return Status::OK(); |
433 | | } |
434 | | }; |
435 | | |
436 | | class FunctionJsonValid : public IFunction { |
437 | | public: |
438 | | static constexpr auto name = "json_valid"; |
439 | 8 | static FunctionPtr create() { return std::make_shared<FunctionJsonValid>(); } |
440 | | |
441 | 1 | String get_name() const override { return name; } |
442 | | |
443 | 0 | size_t get_number_of_arguments() const override { return 1; } |
444 | | |
445 | 0 | DataTypePtr get_return_type_impl(const DataTypes& arguments) const override { |
446 | 0 | return make_nullable(std::make_shared<DataTypeInt32>()); |
447 | 0 | } |
448 | | |
449 | 0 | bool use_default_implementation_for_nulls() const override { return false; } |
450 | | |
451 | | Status execute_impl(FunctionContext* context, Block& block, const ColumnNumbers& arguments, |
452 | 0 | uint32_t result, size_t input_rows_count) const override { |
453 | 0 | const IColumn& col_from = *(block.get_by_position(arguments[0]).column); |
454 | |
|
455 | 0 | auto null_map = ColumnUInt8::create(input_rows_count, 0); |
456 | |
|
457 | 0 | const ColumnUInt8::Container* input_null_map = nullptr; |
458 | 0 | const ColumnString* col_from_string = nullptr; |
459 | 0 | if (const auto* nullable = check_and_get_column<ColumnNullable>(col_from)) { |
460 | 0 | input_null_map = &nullable->get_null_map_data(); |
461 | 0 | col_from_string = |
462 | 0 | check_and_get_column<ColumnString>(*nullable->get_nested_column_ptr()); |
463 | 0 | } else { |
464 | 0 | col_from_string = check_and_get_column<ColumnString>(col_from); |
465 | 0 | } |
466 | |
|
467 | 0 | if (!col_from_string) { |
468 | 0 | return Status::RuntimeError("Illegal column {} should be ColumnString", |
469 | 0 | col_from.get_name()); |
470 | 0 | } |
471 | | |
472 | 0 | auto col_to = ColumnInt32::create(); |
473 | 0 | auto& vec_to = col_to->get_data(); |
474 | 0 | size_t size = col_from.size(); |
475 | 0 | vec_to.resize(size); |
476 | | |
477 | | // parser can be reused for performance |
478 | |
|
479 | 0 | auto input_type = block.get_by_position(arguments[0]).type->get_primitive_type(); |
480 | |
|
481 | 0 | if (input_type == PrimitiveType::TYPE_VARCHAR || input_type == PrimitiveType::TYPE_CHAR || |
482 | 0 | input_type == PrimitiveType::TYPE_STRING) { |
483 | 0 | JsonBinaryValue jsonb_value; |
484 | 0 | for (size_t i = 0; i < input_rows_count; ++i) { |
485 | 0 | if (input_null_map && (*input_null_map)[i]) { |
486 | 0 | null_map->get_data()[i] = 1; |
487 | 0 | vec_to[i] = 0; |
488 | 0 | continue; |
489 | 0 | } |
490 | | |
491 | 0 | const auto& val = col_from_string->get_data_at(i); |
492 | 0 | if (jsonb_value.from_json_string(val.data, cast_set<unsigned int>(val.size)).ok()) { |
493 | 0 | vec_to[i] = 1; |
494 | 0 | } else { |
495 | 0 | vec_to[i] = 0; |
496 | 0 | } |
497 | 0 | } |
498 | |
|
499 | 0 | } else { |
500 | 0 | DCHECK(input_type == PrimitiveType::TYPE_JSONB); |
501 | 0 | for (size_t i = 0; i < input_rows_count; ++i) { |
502 | 0 | if (input_null_map && (*input_null_map)[i]) { |
503 | 0 | null_map->get_data()[i] = 1; |
504 | 0 | vec_to[i] = 0; |
505 | 0 | continue; |
506 | 0 | } |
507 | 0 | const auto& val = col_from_string->get_data_at(i); |
508 | 0 | if (val.size == 0) { |
509 | 0 | vec_to[i] = 0; |
510 | 0 | continue; |
511 | 0 | } |
512 | 0 | const JsonbDocument* doc = nullptr; |
513 | 0 | auto st = JsonbDocument::checkAndCreateDocument(val.data, val.size, &doc); |
514 | 0 | if (!st.ok() || !doc || !doc->getValue()) [[unlikely]] { |
515 | 0 | vec_to[i] = 0; |
516 | 0 | continue; |
517 | 0 | } |
518 | 0 | const JsonbValue* value = doc->getValue(); |
519 | 0 | if (UNLIKELY(!value)) { |
520 | 0 | vec_to[i] = 0; |
521 | 0 | continue; |
522 | 0 | } |
523 | 0 | vec_to[i] = 1; |
524 | 0 | } |
525 | 0 | } |
526 | |
|
527 | 0 | block.replace_by_position(result, |
528 | 0 | ColumnNullable::create(std::move(col_to), std::move(null_map))); |
529 | |
|
530 | 0 | return Status::OK(); |
531 | 0 | } |
532 | | }; |
533 | | class FunctionJsonUnquote : public IFunction { |
534 | | public: |
535 | | static constexpr auto name = "json_unquote"; |
536 | 8 | static FunctionPtr create() { return std::make_shared<FunctionJsonUnquote>(); } |
537 | | |
538 | 1 | String get_name() const override { return name; } |
539 | | |
540 | 0 | size_t get_number_of_arguments() const override { return 1; } |
541 | | |
542 | 0 | DataTypePtr get_return_type_impl(const DataTypes& arguments) const override { |
543 | 0 | return make_nullable(std::make_shared<DataTypeString>()); |
544 | 0 | } |
545 | | |
546 | 0 | bool use_default_implementation_for_nulls() const override { return false; } |
547 | | |
548 | | Status execute_impl(FunctionContext* context, Block& block, const ColumnNumbers& arguments, |
549 | 0 | uint32_t result, size_t input_rows_count) const override { |
550 | 0 | const IColumn& col_from = *(block.get_by_position(arguments[0]).column); |
551 | |
|
552 | 0 | auto null_map = ColumnUInt8::create(input_rows_count, 0); |
553 | |
|
554 | 0 | const ColumnString* col_from_string = check_and_get_column<ColumnString>(col_from); |
555 | 0 | if (auto* nullable = check_and_get_column<ColumnNullable>(col_from)) { |
556 | 0 | col_from_string = |
557 | 0 | check_and_get_column<ColumnString>(*nullable->get_nested_column_ptr()); |
558 | 0 | } |
559 | |
|
560 | 0 | if (!col_from_string) { |
561 | 0 | return Status::RuntimeError("Illegal column {} should be ColumnString", |
562 | 0 | col_from.get_name()); |
563 | 0 | } |
564 | | |
565 | 0 | auto col_to = ColumnString::create(); |
566 | 0 | col_to->reserve(input_rows_count); |
567 | | |
568 | | // parser can be reused for performance |
569 | 0 | rapidjson::Document document; |
570 | 0 | for (size_t i = 0; i < input_rows_count; ++i) { |
571 | 0 | if (col_from.is_null_at(i)) { |
572 | 0 | null_map->get_data()[i] = 1; |
573 | 0 | col_to->insert_data(nullptr, 0); |
574 | 0 | continue; |
575 | 0 | } |
576 | | |
577 | 0 | const auto& json_str = col_from_string->get_data_at(i); |
578 | 0 | if (json_str.size < 2 || json_str.data[0] != '"' || |
579 | 0 | json_str.data[json_str.size - 1] != '"') { |
580 | | // non-quoted string |
581 | 0 | col_to->insert_data(json_str.data, json_str.size); |
582 | 0 | } else { |
583 | 0 | document.Parse(json_str.data, json_str.size); |
584 | 0 | if (document.HasParseError() || !document.IsString()) { |
585 | 0 | return Status::RuntimeError( |
586 | 0 | fmt::format("Invalid JSON text in argument 1 to function {}: {}", name, |
587 | 0 | std::string_view(json_str.data, json_str.size))); |
588 | 0 | } |
589 | 0 | col_to->insert_data(document.GetString(), document.GetStringLength()); |
590 | 0 | } |
591 | 0 | } |
592 | | |
593 | 0 | block.replace_by_position(result, |
594 | 0 | ColumnNullable::create(std::move(col_to), std::move(null_map))); |
595 | |
|
596 | 0 | return Status::OK(); |
597 | 0 | } |
598 | | }; |
599 | | |
600 | 7 | void register_function_json(SimpleFunctionFactory& factory) { |
601 | 7 | factory.register_function<FunctionJsonUnquote>(); |
602 | | |
603 | 7 | factory.register_function<FunctionJson<FunctionJsonQuoteImpl>>(); |
604 | | |
605 | 7 | factory.register_function<FunctionJsonValid>(); |
606 | 7 | } |
607 | | |
608 | | } // namespace doris |