Coverage Report

Created: 2026-04-24 20:42

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
be/src/exprs/function/function_json.cpp
Line
Count
Source
1
// Licensed to the Apache Software Foundation (ASF) under one
2
// or more contributor license agreements.  See the NOTICE file
3
// distributed with this work for additional information
4
// regarding copyright ownership.  The ASF licenses this file
5
// to you under the Apache License, Version 2.0 (the
6
// "License"); you may not use this file except in compliance
7
// with the License.  You may obtain a copy of the License at
8
//
9
//   http://www.apache.org/licenses/LICENSE-2.0
10
//
11
// Unless required by applicable law or agreed to in writing,
12
// software distributed under the License is distributed on an
13
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14
// KIND, either express or implied.  See the License for the
15
// specific language governing permissions and limitations
16
// under the License.
17
18
#include <glog/logging.h>
19
#include <rapidjson/allocators.h>
20
#include <rapidjson/document.h>
21
#include <rapidjson/encodings.h>
22
#include <rapidjson/pointer.h>
23
#include <rapidjson/rapidjson.h>
24
#include <rapidjson/stringbuffer.h>
25
#include <rapidjson/writer.h>
26
#include <re2/re2.h>
27
#include <stdint.h>
28
#include <stdlib.h>
29
#include <string.h>
30
31
#include <algorithm>
32
#include <boost/iterator/iterator_facade.hpp>
33
#include <boost/token_functions.hpp>
34
#include <boost/tokenizer.hpp>
35
#include <memory>
36
#include <string>
37
#include <string_view>
38
#include <type_traits>
39
#include <utility>
40
#include <vector>
41
42
#include "common/cast_set.h"
43
#include "common/compiler_util.h" // IWYU pragma: keep
44
#include "common/status.h"
45
#include "core/assert_cast.h"
46
#include "core/block/block.h"
47
#include "core/block/column_numbers.h"
48
#include "core/block/column_with_type_and_name.h"
49
#include "core/column/column.h"
50
#include "core/column/column_nullable.h"
51
#include "core/column/column_string.h"
52
#include "core/column/column_vector.h"
53
#include "core/data_type/data_type.h"
54
#include "core/data_type/data_type_nullable.h"
55
#include "core/data_type/data_type_number.h"
56
#include "core/data_type/data_type_string.h"
57
#include "core/string_ref.h"
58
#include "core/types.h"
59
#include "core/value/jsonb_value.h"
60
#include "exec/common/stringop_substring.h"
61
#include "exec/common/template_helpers.hpp"
62
#include "exprs/aggregate/aggregate_function.h"
63
#include "exprs/function/function.h"
64
#include "exprs/function/function_totype.h"
65
#include "exprs/function/simple_function_factory.h"
66
#include "exprs/json_functions.h"
67
#include "util/string_parser.hpp"
68
#include "util/string_util.h"
69
70
namespace doris {
71
class FunctionContext;
72
} // namespace doris
73
74
namespace doris {
75
// Pattern applied to a single JSON path segment.
//   group 1: the key, i.e. any run of characters other than '"', '[' and ']' (may be empty)
//   group 2: an optional subscript, either decimal digits ("[12]") or a wildcard ("[*]")
static const re2::RE2 JSON_PATTERN("^([^\\\"\\[\\]]*)(?:\\[([0-9]+|\\*)\\])?");
76
77
/// Splits `var` on every occurrence of the delimiter `p` and appends the pieces to `res`.
///
/// Each piece is constructed in place from a (pointer, length) pair, so `T` must be
/// constructible that way (e.g. std::string, std::string_view) and `U` must provide
/// `length()` and `operator[]`.
///
/// Empty pieces before a leading delimiter or between adjacent delimiters are kept.
/// A single trailing delimiter does not yield a trailing empty piece, and an empty
/// input yields nothing.
template <typename T, typename U>
void char_split(std::vector<T>& res, const U& var, char p) {
    const size_t end = var.length();
    size_t start = 0;
    while (start < end) {
        size_t pos = start;
        // Test the bound first: `var` may be a view that is not NUL-terminated,
        // so var[end] must never be read.
        while (pos < end && var[pos] != p) {
            ++pos;
        }
        res.emplace_back(&var[start], pos - start);
        // Skip the delimiter (or step past the end, which terminates the loop).
        start = pos + 1;
    }
}
91
92
// T = std::vector<std::string>
93
// TODO: update RE2 to support std::vector<std::string_view>
94
template <typename T>
95
void get_parsed_paths(const T& path_exprs, std::vector<JsonPath>* parsed_paths) {
96
    if (path_exprs.empty()) {
97
        return;
98
    }
99
100
    if (path_exprs[0] != "$") {
101
        parsed_paths->emplace_back("", -1, false);
102
    } else {
103
        parsed_paths->emplace_back("$", -1, true);
104
    }
105
106
    for (int i = 1; i < path_exprs.size(); i++) {
107
        std::string col;
108
        std::string index;
109
        if (UNLIKELY(!RE2::FullMatch(path_exprs[i], JSON_PATTERN, &col, &index))) {
110
            parsed_paths->emplace_back("", -1, false);
111
        } else {
112
            int idx = -1;
113
            if (!index.empty()) {
114
                if (index == "*") {
115
                    idx = -2;
116
                } else {
117
                    idx = atoi(index.c_str());
118
                }
119
            }
120
            parsed_paths->emplace_back(col, idx, true);
121
        }
122
    }
123
}
124
125
/// Walks `document` along `parsed_paths` and returns the value the path points at.
///
/// Entry 0 of `parsed_paths` (the root) is skipped; each later entry may name an object
/// member (`key`) and/or an array subscript (`idx`): -1 means no subscript, -2 means
/// the wildcard "[*]", any other value is a zero-based position.
///
/// @param parsed_paths   path entries, normally produced by get_parsed_paths()
/// @param document       value at which the walk starts
/// @param mem_allocator  allocator used for the array built when "[*]" is encountered
/// @param is_insert_null not used by this implementation; kept for interface compatibility
/// @return the matched value, or nullptr when an entry is invalid, the current value is
///         null, a key is missing, the value has the wrong type for the step, or a
///         subscript is out of range
rapidjson::Value* NO_SANITIZE_UNDEFINED
match_value(const std::vector<JsonPath>& parsed_paths, rapidjson::Value* document,
            rapidjson::Document::AllocatorType& mem_allocator, bool is_insert_null = false) {
    rapidjson::Value* root = document;
    for (size_t i = 1; i < parsed_paths.size(); ++i) {
        if (root == nullptr || root->IsNull()) {
            return nullptr;
        }

        const JsonPath& step = parsed_paths[i];
        if (UNLIKELY(!step.is_valid)) {
            return nullptr;
        }

        const std::string& col = step.key;
        const int index = step.idx;
        if (LIKELY(!col.empty())) {
            // Only objects have members; anything else cannot satisfy a key step.
            if (!root->IsObject()) {
                return nullptr;
            }
            // One lookup instead of HasMember() followed by operator[].
            const auto member = root->FindMember(col.c_str());
            if (member == root->MemberEnd()) {
                return nullptr;
            }
            root = &member->value;
        }

        if (UNLIKELY(index != -1)) {
            // A subscript is only meaningful on an array.
            if (!root->IsArray()) {
                return nullptr;
            }

            if (index == -2) {
                // [*]: build a copy of the whole array in allocator-owned storage.
                void* storage = mem_allocator.Malloc(sizeof(rapidjson::Value));
                if (storage == nullptr) {
                    return nullptr;
                }
                // Construct the value in place; calling SetArray() on raw memory would
                // operate on an object that was never constructed.
                auto* array_obj = new (storage) rapidjson::Value(rapidjson::kArrayType);
                for (rapidjson::SizeType j = 0; j < root->Size(); ++j) {
                    rapidjson::Value v;
                    v.CopyFrom((*root)[j], mem_allocator);
                    array_obj->PushBack(v, mem_allocator);
                }
                root = array_obj;
            } else if (index < 0 || static_cast<rapidjson::SizeType>(index) >= root->Size()) {
                return nullptr;
            } else {
                root = &((*root)[static_cast<rapidjson::SizeType>(index)]);
            }
        }
    }
    return root;
}
184
185
template <JsonFunctionType fntype>
186
rapidjson::Value* get_json_object(std::string_view json_string, std::string_view path_string,
187
                                  rapidjson::Document* document) {
188
    std::vector<JsonPath>* parsed_paths;
189
    std::vector<JsonPath> tmp_parsed_paths;
190
191
    //Cannot use '\' as the last character, return NULL
192
    if (path_string.back() == '\\') {
193
        return nullptr;
194
    }
195
196
    std::string fixed_string;
197
    if (path_string.size() >= 2 && path_string[0] == '$' && path_string[1] != '.') {
198
        // Boost tokenizer requires explicit "." after "$" to correctly extract JSON path tokens.
199
        // Without this, expressions like "$[0].key" cannot be properly split.
200
        // This commit ensures a "." is automatically added after "$" to maintain consistent token parsing behavior.
201
        fixed_string = "$.";
202
        fixed_string += path_string.substr(1);
203
        path_string = fixed_string;
204
    }
205
206
    try {
207
#ifdef USE_LIBCPP
208
        std::string s(path_string);
209
        auto tok = get_json_token(s);
210
#else
211
        auto tok = get_json_token(path_string);
212
#endif
213
        std::vector<std::string> paths(tok.begin(), tok.end());
214
        get_parsed_paths(paths, &tmp_parsed_paths);
215
        if (tmp_parsed_paths.empty()) {
216
            return document;
217
        }
218
    } catch (boost::escaped_list_error&) {
219
        // meet unknown escape sequence, example '$.name\k'
220
        return nullptr;
221
    }
222
223
    parsed_paths = &tmp_parsed_paths;
224
225
    if (!(*parsed_paths)[0].is_valid) {
226
        return nullptr;
227
    }
228
229
    if (UNLIKELY((*parsed_paths).size() == 1)) {
230
        if (fntype == JSON_FUN_STRING) {
231
            document->SetString(json_string.data(),
232
                                cast_set<rapidjson::SizeType>(json_string.size()),
233
                                document->GetAllocator());
234
        } else {
235
            return document;
236
        }
237
    }
238
239
    document->Parse(json_string.data(), json_string.size());
240
    if (UNLIKELY(document->HasParseError())) {
241
        // VLOG_CRITICAL << "Error at offset " << document->GetErrorOffset() << ": "
242
        //         << GetParseError_En(document->GetParseError());
243
        return nullptr;
244
    }
245
246
    return match_value(*parsed_paths, document, document->GetAllocator());
247
}
248
249
template <int flag>
250
struct JsonParser {
251
    //string
252
    static void update_value(StringParser::ParseResult& result, rapidjson::Value& value,
253
                             StringRef data, rapidjson::Document::AllocatorType& allocator) {
254
        value.SetString(data.data, cast_set<rapidjson::SizeType>(data.size), allocator);
255
    }
256
};
257
258
template <>
259
struct JsonParser<'0'> {
260
    // null
261
    static void update_value(StringParser::ParseResult& result, rapidjson::Value& value,
262
0
                             StringRef data, rapidjson::Document::AllocatorType& allocator) {
263
0
        value.SetNull();
264
0
    }
265
};
266
267
template <>
268
struct JsonParser<'1'> {
269
    // bool
270
    static void update_value(StringParser::ParseResult& result, rapidjson::Value& value,
271
0
                             StringRef data, rapidjson::Document::AllocatorType& allocator) {
272
0
        DCHECK(data.size == 1 || strncmp(data.data, "true", 4) == 0 ||
273
0
               strncmp(data.data, "false", 5) == 0);
274
0
        value.SetBool(*data.data == '1' || *data.data == 't');
275
0
    }
276
};
277
278
template <>
279
struct JsonParser<'2'> {
280
    // int
281
    static void update_value(StringParser::ParseResult& result, rapidjson::Value& value,
282
0
                             StringRef data, rapidjson::Document::AllocatorType& allocator) {
283
0
        value.SetInt(StringParser::string_to_int<int32_t>(data.data, data.size, &result));
284
0
    }
285
};
286
287
template <>
288
struct JsonParser<'3'> {
289
    // double
290
    static void update_value(StringParser::ParseResult& result, rapidjson::Value& value,
291
0
                             StringRef data, rapidjson::Document::AllocatorType& allocator) {
292
0
        value.SetDouble(StringParser::string_to_float<double>(data.data, data.size, &result));
293
0
    }
294
};
295
296
template <>
297
struct JsonParser<'4'> {
298
    // time
299
    static void update_value(StringParser::ParseResult& result, rapidjson::Value& value,
300
0
                             StringRef data, rapidjson::Document::AllocatorType& allocator) {
301
0
        // remove double quotes, "xxx" -> xxx
302
0
        value.SetString(data.data + 1, cast_set<rapidjson::SizeType>(data.size - 2), allocator);
303
0
    }
304
};
305
306
template <>
307
struct JsonParser<'5'> {
308
    // bigint
309
    static void update_value(StringParser::ParseResult& result, rapidjson::Value& value,
310
0
                             StringRef data, rapidjson::Document::AllocatorType& allocator) {
311
0
        value.SetInt64(StringParser::string_to_int<int64_t>(data.data, data.size, &result));
312
0
    }
313
};
314
315
template <>
316
struct JsonParser<'7'> {
317
    // json string
318
    static void update_value(StringParser::ParseResult& result, rapidjson::Value& value,
319
0
                             StringRef data, rapidjson::Document::AllocatorType& allocator) {
320
0
        rapidjson::Document document;
321
0
        const JsonbValue* json_val = JsonbDocument::createValue(data.data, data.size);
322
0
        convert_jsonb_to_rapidjson(*json_val, document, allocator);
323
0
        value.CopyFrom(document, allocator);
324
0
    }
325
};
326
327
template <int flag, typename Impl>
328
struct ExecuteReducer {
329
    template <typename... TArgs>
330
    static void run(TArgs&&... args) {
331
        Impl::template execute_type<JsonParser<flag>>(std::forward<TArgs>(args)...);
332
    }
333
};
334
335
struct FunctionJsonQuoteImpl {
336
    static constexpr auto name = "json_quote";
337
338
0
    static DataTypePtr get_return_type_impl(const DataTypes& arguments) {
339
0
        if (!arguments.empty() && arguments[0] && arguments[0]->is_nullable()) {
340
0
            return make_nullable(std::make_shared<DataTypeString>());
341
0
        }
342
0
        return std::make_shared<DataTypeString>();
343
0
    }
344
    static void execute(const std::vector<const ColumnString*>& data_columns,
345
0
                        ColumnString& result_column, size_t input_rows_count) {
346
0
        rapidjson::Document document;
347
0
        rapidjson::Document::AllocatorType& allocator = document.GetAllocator();
348
349
0
        rapidjson::Value value;
350
351
0
        rapidjson::StringBuffer buf;
352
353
0
        for (int i = 0; i < input_rows_count; i++) {
354
0
            StringRef data = data_columns[0]->get_data_at(i);
355
0
            value.SetString(data.data, cast_set<rapidjson::SizeType>(data.size), allocator);
356
357
0
            buf.Clear();
358
0
            rapidjson::Writer<rapidjson::StringBuffer> writer(buf);
359
0
            value.Accept(writer);
360
0
            result_column.insert_data(buf.GetString(), buf.GetSize());
361
0
        }
362
0
    }
363
};
364
365
template <typename Impl>
366
class FunctionJson : public IFunction {
367
public:
368
    static constexpr auto name = Impl::name;
369
370
2
    static FunctionPtr create() { return std::make_shared<FunctionJson<Impl>>(); }
371
372
0
    String get_name() const override { return name; }
373
374
0
    size_t get_number_of_arguments() const override { return 0; }
375
376
1
    bool is_variadic() const override { return true; }
377
378
0
    DataTypePtr get_return_type_impl(const DataTypes& arguments) const override {
379
0
        return Impl::get_return_type_impl(arguments);
380
0
    }
381
382
    Status execute_impl(FunctionContext* context, Block& block, const ColumnNumbers& arguments,
383
0
                        uint32_t result, size_t input_rows_count) const override {
384
0
        auto result_column = ColumnString::create();
385
386
0
        std::vector<ColumnPtr> column_ptrs; // prevent converted column destruct
387
0
        std::vector<const ColumnString*> data_columns;
388
0
        for (int i = 0; i < arguments.size(); i++) {
389
0
            column_ptrs.push_back(
390
0
                    block.get_by_position(arguments[i]).column->convert_to_full_column_if_const());
391
0
            data_columns.push_back(assert_cast<const ColumnString*>(column_ptrs.back().get()));
392
0
        }
393
394
0
        Impl::execute(data_columns, *assert_cast<ColumnString*>(result_column.get()),
395
0
                      input_rows_count);
396
0
        block.get_by_position(result).column = std::move(result_column);
397
0
        return Status::OK();
398
0
    }
399
};
400
401
class FunctionJsonValid : public IFunction {
402
public:
403
    static constexpr auto name = "json_valid";
404
2
    static FunctionPtr create() { return std::make_shared<FunctionJsonValid>(); }
405
406
1
    String get_name() const override { return name; }
407
408
0
    size_t get_number_of_arguments() const override { return 1; }
409
410
0
    DataTypePtr get_return_type_impl(const DataTypes& arguments) const override {
411
0
        return make_nullable(std::make_shared<DataTypeInt32>());
412
0
    }
413
414
0
    bool use_default_implementation_for_nulls() const override { return false; }
415
416
    Status execute_impl(FunctionContext* context, Block& block, const ColumnNumbers& arguments,
417
0
                        uint32_t result, size_t input_rows_count) const override {
418
0
        const IColumn& col_from = *(block.get_by_position(arguments[0]).column);
419
420
0
        auto null_map = ColumnUInt8::create(input_rows_count, 0);
421
422
0
        const ColumnUInt8::Container* input_null_map = nullptr;
423
0
        const ColumnString* col_from_string = nullptr;
424
0
        if (const auto* nullable = check_and_get_column<ColumnNullable>(col_from)) {
425
0
            input_null_map = &nullable->get_null_map_data();
426
0
            col_from_string =
427
0
                    check_and_get_column<ColumnString>(*nullable->get_nested_column_ptr());
428
0
        } else {
429
0
            col_from_string = check_and_get_column<ColumnString>(col_from);
430
0
        }
431
432
0
        if (!col_from_string) {
433
0
            return Status::RuntimeError("Illegal column {} should be ColumnString",
434
0
                                        col_from.get_name());
435
0
        }
436
437
0
        auto col_to = ColumnInt32::create();
438
0
        auto& vec_to = col_to->get_data();
439
0
        size_t size = col_from.size();
440
0
        vec_to.resize(size);
441
442
        // parser can be reused for performance
443
444
0
        auto input_type = block.get_by_position(arguments[0]).type->get_primitive_type();
445
446
0
        if (input_type == PrimitiveType::TYPE_VARCHAR || input_type == PrimitiveType::TYPE_CHAR ||
447
0
            input_type == PrimitiveType::TYPE_STRING) {
448
0
            JsonBinaryValue jsonb_value;
449
0
            for (size_t i = 0; i < input_rows_count; ++i) {
450
0
                if (input_null_map && (*input_null_map)[i]) {
451
0
                    null_map->get_data()[i] = 1;
452
0
                    vec_to[i] = 0;
453
0
                    continue;
454
0
                }
455
456
0
                const auto& val = col_from_string->get_data_at(i);
457
0
                if (jsonb_value.from_json_string(val.data, cast_set<unsigned int>(val.size)).ok()) {
458
0
                    vec_to[i] = 1;
459
0
                } else {
460
0
                    vec_to[i] = 0;
461
0
                }
462
0
            }
463
464
0
        } else {
465
0
            DCHECK(input_type == PrimitiveType::TYPE_JSONB);
466
0
            for (size_t i = 0; i < input_rows_count; ++i) {
467
0
                if (input_null_map && (*input_null_map)[i]) {
468
0
                    null_map->get_data()[i] = 1;
469
0
                    vec_to[i] = 0;
470
0
                    continue;
471
0
                }
472
0
                const auto& val = col_from_string->get_data_at(i);
473
0
                if (val.size == 0) {
474
0
                    vec_to[i] = 0;
475
0
                    continue;
476
0
                }
477
0
                const JsonbDocument* doc = nullptr;
478
0
                auto st = JsonbDocument::checkAndCreateDocument(val.data, val.size, &doc);
479
0
                if (!st.ok() || !doc || !doc->getValue()) [[unlikely]] {
480
0
                    vec_to[i] = 0;
481
0
                    continue;
482
0
                }
483
0
                const JsonbValue* value = doc->getValue();
484
0
                if (UNLIKELY(!value)) {
485
0
                    vec_to[i] = 0;
486
0
                    continue;
487
0
                }
488
0
                vec_to[i] = 1;
489
0
            }
490
0
        }
491
492
0
        block.replace_by_position(result,
493
0
                                  ColumnNullable::create(std::move(col_to), std::move(null_map)));
494
495
0
        return Status::OK();
496
0
    }
497
};
498
class FunctionJsonUnquote : public IFunction {
499
public:
500
    static constexpr auto name = "json_unquote";
501
2
    static FunctionPtr create() { return std::make_shared<FunctionJsonUnquote>(); }
502
503
1
    String get_name() const override { return name; }
504
505
0
    size_t get_number_of_arguments() const override { return 1; }
506
507
0
    DataTypePtr get_return_type_impl(const DataTypes& arguments) const override {
508
0
        return make_nullable(std::make_shared<DataTypeString>());
509
0
    }
510
511
0
    bool use_default_implementation_for_nulls() const override { return false; }
512
513
    Status execute_impl(FunctionContext* context, Block& block, const ColumnNumbers& arguments,
514
0
                        uint32_t result, size_t input_rows_count) const override {
515
0
        const IColumn& col_from = *(block.get_by_position(arguments[0]).column);
516
517
0
        auto null_map = ColumnUInt8::create(input_rows_count, 0);
518
519
0
        const ColumnString* col_from_string = check_and_get_column<ColumnString>(col_from);
520
0
        if (auto* nullable = check_and_get_column<ColumnNullable>(col_from)) {
521
0
            col_from_string =
522
0
                    check_and_get_column<ColumnString>(*nullable->get_nested_column_ptr());
523
0
        }
524
525
0
        if (!col_from_string) {
526
0
            return Status::RuntimeError("Illegal column {} should be ColumnString",
527
0
                                        col_from.get_name());
528
0
        }
529
530
0
        auto col_to = ColumnString::create();
531
0
        col_to->reserve(input_rows_count);
532
533
        // parser can be reused for performance
534
0
        rapidjson::Document document;
535
0
        for (size_t i = 0; i < input_rows_count; ++i) {
536
0
            if (col_from.is_null_at(i)) {
537
0
                null_map->get_data()[i] = 1;
538
0
                col_to->insert_data(nullptr, 0);
539
0
                continue;
540
0
            }
541
542
0
            const auto& json_str = col_from_string->get_data_at(i);
543
0
            if (json_str.size < 2 || json_str.data[0] != '"' ||
544
0
                json_str.data[json_str.size - 1] != '"') {
545
                // non-quoted string
546
0
                col_to->insert_data(json_str.data, json_str.size);
547
0
            } else {
548
0
                document.Parse(json_str.data, json_str.size);
549
0
                if (document.HasParseError() || !document.IsString()) {
550
0
                    return Status::RuntimeError(
551
0
                            fmt::format("Invalid JSON text in argument 1 to function {}: {}", name,
552
0
                                        std::string_view(json_str.data, json_str.size)));
553
0
                }
554
0
                col_to->insert_data(document.GetString(), document.GetStringLength());
555
0
            }
556
0
        }
557
558
0
        block.replace_by_position(result,
559
0
                                  ColumnNullable::create(std::move(col_to), std::move(null_map)));
560
561
0
        return Status::OK();
562
0
    }
563
};
564
565
1
/// Registers the JSON scalar functions defined in this file with `factory`.
void register_function_json(SimpleFunctionFactory& factory) {
    // json_unquote
    factory.register_function<FunctionJsonUnquote>();
    // json_quote
    factory.register_function<FunctionJson<FunctionJsonQuoteImpl>>();
    // json_valid
    factory.register_function<FunctionJsonValid>();
}
572
573
} // namespace doris