Coverage Report

Created: 2026-04-16 13:40

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
be/src/exprs/function/function_json.cpp
Line
Count
Source
1
// Licensed to the Apache Software Foundation (ASF) under one
2
// or more contributor license agreements.  See the NOTICE file
3
// distributed with this work for additional information
4
// regarding copyright ownership.  The ASF licenses this file
5
// to you under the Apache License, Version 2.0 (the
6
// "License"); you may not use this file except in compliance
7
// with the License.  You may obtain a copy of the License at
8
//
9
//   http://www.apache.org/licenses/LICENSE-2.0
10
//
11
// Unless required by applicable law or agreed to in writing,
12
// software distributed under the License is distributed on an
13
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14
// KIND, either express or implied.  See the License for the
15
// specific language governing permissions and limitations
16
// under the License.
17
18
#include <glog/logging.h>
19
#include <rapidjson/allocators.h>
20
#include <rapidjson/document.h>
21
#include <rapidjson/encodings.h>
22
#include <rapidjson/pointer.h>
23
#include <rapidjson/rapidjson.h>
24
#include <rapidjson/stringbuffer.h>
25
#include <rapidjson/writer.h>
26
#include <re2/re2.h>
27
#include <stdint.h>
28
#include <stdlib.h>
29
#include <string.h>
30
31
#include <algorithm>
32
#include <boost/iterator/iterator_facade.hpp>
33
#include <boost/token_functions.hpp>
34
#include <boost/tokenizer.hpp>
35
#include <memory>
36
#include <string>
37
#include <string_view>
38
#include <type_traits>
39
#include <utility>
40
#include <vector>
41
42
#include "common/cast_set.h"
43
#include "common/compiler_util.h" // IWYU pragma: keep
44
#include "common/status.h"
45
#include "core/assert_cast.h"
46
#include "core/block/block.h"
47
#include "core/block/column_numbers.h"
48
#include "core/block/column_with_type_and_name.h"
49
#include "core/column/column.h"
50
#include "core/column/column_nullable.h"
51
#include "core/column/column_string.h"
52
#include "core/column/column_vector.h"
53
#include "core/data_type/data_type.h"
54
#include "core/data_type/data_type_nullable.h"
55
#include "core/data_type/data_type_number.h"
56
#include "core/data_type/data_type_string.h"
57
#include "core/string_ref.h"
58
#include "core/types.h"
59
#include "core/value/jsonb_value.h"
60
#include "exec/common/stringop_substring.h"
61
#include "exec/common/template_helpers.hpp"
62
#include "exprs/aggregate/aggregate_function.h"
63
#include "exprs/function/function.h"
64
#include "exprs/function/function_totype.h"
65
#include "exprs/function/simple_function_factory.h"
66
#include "exprs/json_functions.h"
67
#include "util/string_parser.hpp"
68
#include "util/string_util.h"
69
70
namespace doris {
71
class FunctionContext;
72
} // namespace doris
73
74
namespace doris {
75
// Matches one JSON path segment: an optional key (any run of characters other than
// '"', '[' and ']') optionally followed by a "[N]" or "[*]" subscript.
// Capture 1 is the key, capture 2 is the subscript contents (decimal digits or "*").
static const re2::RE2 JSON_PATTERN("^([^\\\"\\[\\]]*)(?:\\[([0-9]+|\\*)\\])?");
76
77
// Splits `var` on the delimiter `p` and appends one element per token to `res`.
//
// Each token is constructed in place from (pointer to first character, length), so the
// elements of `res` refer to / copy from the storage behind `var`.
// Empty tokens between adjacent delimiters and before a leading delimiter are kept;
// a trailing delimiter does not produce a trailing empty token, and an empty `var`
// appends nothing.
template <typename T, typename U>
void char_split(std::vector<T>& res, const U& var, char p) {
    const size_t end = var.length();
    size_t start = 0;
    while (start < end) {
        size_t pos = start;
        // Check the bound before indexing so that `var[end]` is never read
        // (it is out of range for views that are not NUL-terminated).
        while (pos < end && var[pos] != p) {
            ++pos;
        }
        res.emplace_back(&var[start], pos - start);
        // Skip the delimiter (or step past the end after the last token).
        start = pos + 1;
    }
}
91
92
// T = std::vector<std::string>
93
// TODO: update RE2 to support std::vector<std::string_view>
94
template <typename T>
95
void get_parsed_paths(const T& path_exprs, std::vector<JsonPath>* parsed_paths) {
96
    if (path_exprs.empty()) {
97
        return;
98
    }
99
100
    if (path_exprs[0] != "$") {
101
        parsed_paths->emplace_back("", -1, false);
102
    } else {
103
        parsed_paths->emplace_back("$", -1, true);
104
    }
105
106
    for (int i = 1; i < path_exprs.size(); i++) {
107
        std::string col;
108
        std::string index;
109
        if (UNLIKELY(!RE2::FullMatch(path_exprs[i], JSON_PATTERN, &col, &index))) {
110
            parsed_paths->emplace_back("", -1, false);
111
        } else {
112
            int idx = -1;
113
            if (!index.empty()) {
114
                if (index == "*") {
115
                    idx = -2;
116
                } else {
117
                    idx = atoi(index.c_str());
118
                }
119
            }
120
            parsed_paths->emplace_back(col, idx, true);
121
        }
122
    }
123
}
124
125
// Walks `document` along `parsed_paths` and returns the value the path selects, or nullptr
// when the path cannot be resolved.
//
// parsed_paths[0] (the root segment) is skipped. Every later segment may name an object
// member (`key`), an array subscript (`idx`), or both: idx == -1 means "no subscript",
// idx == -2 means "[*]", any other value is used as an array position.
//
// nullptr is returned when the current value is null, a segment is marked invalid, a key is
// requested on a non-object or is missing, or a subscript is requested on a non-array or is
// out of range.
//
// For "[*]" a copy of the array is built in storage obtained from `mem_allocator`, and the
// walk continues on that copy. `is_insert_null` is not read by this function.
rapidjson::Value* NO_SANITIZE_UNDEFINED
match_value(const std::vector<JsonPath>& parsed_paths, rapidjson::Value* document,
            rapidjson::Document::AllocatorType& mem_allocator, bool is_insert_null = false) {
    rapidjson::Value* root = document;
    for (size_t i = 1; i < parsed_paths.size(); ++i) {
        if (root == nullptr || root->IsNull()) {
            return nullptr;
        }

        const JsonPath& path = parsed_paths[i];
        if (UNLIKELY(!path.is_valid)) {
            return nullptr;
        }

        const std::string& col = path.key;
        if (LIKELY(!col.empty())) {
            if (!root->IsObject()) {
                // root is not a nested type, return NULL
                return nullptr;
            }
            // Single lookup instead of HasMember() followed by operator[].
            auto member = root->FindMember(col.c_str());
            if (member == root->MemberEnd()) {
                return nullptr;
            }
            root = &member->value;
        }

        const int index = path.idx;
        if (LIKELY(index == -1)) {
            continue;
        }

        // A subscript only makes sense on an array; anything else yields NULL.
        if (!root->IsArray()) {
            return nullptr;
        }

        if (index == -2) {
            // [*]: build a copy of the whole array. The storage comes from the allocator,
            // so the Value has to be constructed in place before it is used.
            void* storage = mem_allocator.Malloc(sizeof(rapidjson::Value));
            auto* array_obj = new (storage) rapidjson::Value(rapidjson::kArrayType);
            for (rapidjson::SizeType j = 0; j < root->Size(); ++j) {
                rapidjson::Value v;
                v.CopyFrom((*root)[j], mem_allocator);
                array_obj->PushBack(v, mem_allocator);
            }
            root = array_obj;
        } else {
            const auto position = static_cast<rapidjson::SizeType>(index);
            if (position >= root->Size()) {
                return nullptr;
            }
            root = &((*root)[position]);
        }
    }
    return root;
}
184
185
template <JsonFunctionType fntype>
186
rapidjson::Value* get_json_object(std::string_view json_string, std::string_view path_string,
187
                                  rapidjson::Document* document) {
188
    std::vector<JsonPath>* parsed_paths;
189
    std::vector<JsonPath> tmp_parsed_paths;
190
191
    //Cannot use '\' as the last character, return NULL
192
    if (path_string.back() == '\\') {
193
        return nullptr;
194
    }
195
196
    std::string fixed_string;
197
    if (path_string.size() >= 2 && path_string[0] == '$' && path_string[1] != '.') {
198
        // Boost tokenizer requires explicit "." after "$" to correctly extract JSON path tokens.
199
        // Without this, expressions like "$[0].key" cannot be properly split.
200
        // This commit ensures a "." is automatically added after "$" to maintain consistent token parsing behavior.
201
        fixed_string = "$.";
202
        fixed_string += path_string.substr(1);
203
        path_string = fixed_string;
204
    }
205
206
    try {
207
#ifdef USE_LIBCPP
208
        std::string s(path_string);
209
        auto tok = get_json_token(s);
210
#else
211
        auto tok = get_json_token(path_string);
212
#endif
213
        std::vector<std::string> paths(tok.begin(), tok.end());
214
        get_parsed_paths(paths, &tmp_parsed_paths);
215
        if (tmp_parsed_paths.empty()) {
216
            return document;
217
        }
218
    } catch (boost::escaped_list_error&) {
219
        // meet unknown escape sequence, example '$.name\k'
220
        return nullptr;
221
    }
222
223
    parsed_paths = &tmp_parsed_paths;
224
225
    if (!(*parsed_paths)[0].is_valid) {
226
        return nullptr;
227
    }
228
229
    if (UNLIKELY((*parsed_paths).size() == 1)) {
230
        if (fntype == JSON_FUN_STRING) {
231
            document->SetString(json_string.data(),
232
                                cast_set<rapidjson::SizeType>(json_string.size()),
233
                                document->GetAllocator());
234
        } else {
235
            return document;
236
        }
237
    }
238
239
    document->Parse(json_string.data(), json_string.size());
240
    if (UNLIKELY(document->HasParseError())) {
241
        // VLOG_CRITICAL << "Error at offset " << document->GetErrorOffset() << ": "
242
        //         << GetParseError_En(document->GetParseError());
243
        return nullptr;
244
    }
245
246
    return match_value(*parsed_paths, document, document->GetAllocator());
247
}
248
249
template <int flag>
250
struct JsonParser {
251
    //string
252
    static void update_value(StringParser::ParseResult& result, rapidjson::Value& value,
253
                             StringRef data, rapidjson::Document::AllocatorType& allocator) {
254
        value.SetString(data.data, cast_set<rapidjson::SizeType>(data.size), allocator);
255
    }
256
};
257
258
template <>
259
struct JsonParser<'0'> {
260
    // null
261
    static void update_value(StringParser::ParseResult& result, rapidjson::Value& value,
262
0
                             StringRef data, rapidjson::Document::AllocatorType& allocator) {
263
0
        value.SetNull();
264
0
    }
265
};
266
267
template <>
268
struct JsonParser<'1'> {
269
    // bool
270
    static void update_value(StringParser::ParseResult& result, rapidjson::Value& value,
271
0
                             StringRef data, rapidjson::Document::AllocatorType& allocator) {
272
0
        DCHECK(data.size == 1 || strncmp(data.data, "true", 4) == 0 ||
273
0
               strncmp(data.data, "false", 5) == 0);
274
0
        value.SetBool(*data.data == '1' || *data.data == 't');
275
0
    }
276
};
277
278
template <>
279
struct JsonParser<'2'> {
280
    // int
281
    static void update_value(StringParser::ParseResult& result, rapidjson::Value& value,
282
0
                             StringRef data, rapidjson::Document::AllocatorType& allocator) {
283
0
        value.SetInt(StringParser::string_to_int<int32_t>(data.data, data.size, &result));
284
0
    }
285
};
286
287
template <>
288
struct JsonParser<'3'> {
289
    // double
290
    static void update_value(StringParser::ParseResult& result, rapidjson::Value& value,
291
0
                             StringRef data, rapidjson::Document::AllocatorType& allocator) {
292
0
        value.SetDouble(StringParser::string_to_float<double>(data.data, data.size, &result));
293
0
    }
294
};
295
296
template <>
297
struct JsonParser<'4'> {
298
    // time
299
    static void update_value(StringParser::ParseResult& result, rapidjson::Value& value,
300
0
                             StringRef data, rapidjson::Document::AllocatorType& allocator) {
301
0
        // remove double quotes, "xxx" -> xxx
302
0
        value.SetString(data.data + 1, cast_set<rapidjson::SizeType>(data.size - 2), allocator);
303
0
    }
304
};
305
306
template <>
307
struct JsonParser<'5'> {
308
    // bigint
309
    static void update_value(StringParser::ParseResult& result, rapidjson::Value& value,
310
0
                             StringRef data, rapidjson::Document::AllocatorType& allocator) {
311
0
        value.SetInt64(StringParser::string_to_int<int64_t>(data.data, data.size, &result));
312
0
    }
313
};
314
315
template <>
316
struct JsonParser<'7'> {
317
    // json string
318
    static void update_value(StringParser::ParseResult& result, rapidjson::Value& value,
319
0
                             StringRef data, rapidjson::Document::AllocatorType& allocator) {
320
0
        rapidjson::Document document;
321
0
        const JsonbValue* json_val = JsonbDocument::createValue(data.data, data.size);
322
0
        convert_jsonb_to_rapidjson(*json_val, document, allocator);
323
0
        value.CopyFrom(document, allocator);
324
0
    }
325
};
326
327
template <int flag, typename Impl>
328
struct ExecuteReducer {
329
    template <typename... TArgs>
330
    static void run(TArgs&&... args) {
331
        Impl::template execute_type<JsonParser<flag>>(std::forward<TArgs>(args)...);
332
    }
333
};
334
335
struct FunctionJsonQuoteImpl {
336
    static constexpr auto name = "json_quote";
337
338
7
    static DataTypePtr get_return_type_impl(const DataTypes& arguments) {
339
7
        if (!arguments.empty() && arguments[0] && arguments[0]->is_nullable()) {
340
0
            return make_nullable(std::make_shared<DataTypeString>());
341
0
        }
342
7
        return std::make_shared<DataTypeString>();
343
7
    }
344
    static void execute(const std::vector<const ColumnString*>& data_columns,
345
16
                        ColumnString& result_column, size_t input_rows_count) {
346
16
        rapidjson::Document document;
347
16
        rapidjson::Document::AllocatorType& allocator = document.GetAllocator();
348
349
16
        rapidjson::Value value;
350
351
16
        rapidjson::StringBuffer buf;
352
353
42
        for (int i = 0; i < input_rows_count; i++) {
354
26
            StringRef data = data_columns[0]->get_data_at(i);
355
26
            value.SetString(data.data, cast_set<rapidjson::SizeType>(data.size), allocator);
356
357
26
            buf.Clear();
358
26
            rapidjson::Writer<rapidjson::StringBuffer> writer(buf);
359
26
            value.Accept(writer);
360
26
            result_column.insert_data(buf.GetString(), buf.GetSize());
361
26
        }
362
16
    }
363
};
364
365
template <typename Impl>
366
class FunctionJson : public IFunction {
367
public:
368
    static constexpr auto name = Impl::name;
369
370
16
    static FunctionPtr create() { return std::make_shared<FunctionJson<Impl>>(); }
371
372
0
    String get_name() const override { return name; }
373
374
0
    size_t get_number_of_arguments() const override { return 0; }
375
376
8
    bool is_variadic() const override { return true; }
377
378
7
    DataTypePtr get_return_type_impl(const DataTypes& arguments) const override {
379
7
        return Impl::get_return_type_impl(arguments);
380
7
    }
381
382
    Status execute_impl(FunctionContext* context, Block& block, const ColumnNumbers& arguments,
383
16
                        uint32_t result, size_t input_rows_count) const override {
384
16
        auto result_column = ColumnString::create();
385
386
16
        std::vector<ColumnPtr> column_ptrs; // prevent converted column destruct
387
16
        std::vector<const ColumnString*> data_columns;
388
32
        for (int i = 0; i < arguments.size(); i++) {
389
16
            column_ptrs.push_back(
390
16
                    block.get_by_position(arguments[i]).column->convert_to_full_column_if_const());
391
16
            data_columns.push_back(assert_cast<const ColumnString*>(column_ptrs.back().get()));
392
16
        }
393
394
16
        Impl::execute(data_columns, *assert_cast<ColumnString*>(result_column.get()),
395
16
                      input_rows_count);
396
16
        block.get_by_position(result).column = std::move(result_column);
397
16
        return Status::OK();
398
16
    }
399
};
400
401
template <typename Impl>
402
class FunctionJsonNullable : public IFunction {
403
public:
404
    static constexpr auto name = Impl::name;
405
    static FunctionPtr create() { return std::make_shared<FunctionJsonNullable<Impl>>(); }
406
    String get_name() const override { return name; }
407
    size_t get_number_of_arguments() const override { return 0; }
408
    bool is_variadic() const override { return true; }
409
    DataTypePtr get_return_type_impl(const DataTypes& arguments) const override {
410
        return make_nullable(std::make_shared<DataTypeString>());
411
    }
412
    Status execute_impl(FunctionContext* context, Block& block, const ColumnNumbers& arguments,
413
                        uint32_t result, size_t input_rows_count) const override {
414
        auto result_column = ColumnString::create();
415
        auto null_map = ColumnUInt8::create(input_rows_count, 0);
416
        std::vector<const ColumnString*> data_columns;
417
        std::vector<bool> column_is_consts;
418
        for (int i = 0; i < arguments.size(); i++) {
419
            ColumnPtr arg_col;
420
            bool arg_const;
421
            std::tie(arg_col, arg_const) =
422
                    unpack_if_const(block.get_by_position(arguments[i]).column);
423
            column_is_consts.push_back(arg_const);
424
            data_columns.push_back(assert_cast<const ColumnString*>(arg_col.get()));
425
        }
426
        Impl::execute(data_columns, *assert_cast<ColumnString*>(result_column.get()),
427
                      null_map->get_data(), input_rows_count, column_is_consts);
428
        block.replace_by_position(
429
                result, ColumnNullable::create(std::move(result_column), std::move(null_map)));
430
        return Status::OK();
431
    }
432
};
433
434
class FunctionJsonValid : public IFunction {
435
public:
436
    static constexpr auto name = "json_valid";
437
25
    static FunctionPtr create() { return std::make_shared<FunctionJsonValid>(); }
438
439
1
    String get_name() const override { return name; }
440
441
16
    size_t get_number_of_arguments() const override { return 1; }
442
443
16
    DataTypePtr get_return_type_impl(const DataTypes& arguments) const override {
444
16
        return make_nullable(std::make_shared<DataTypeInt32>());
445
16
    }
446
447
68
    bool use_default_implementation_for_nulls() const override { return false; }
448
449
    Status execute_impl(FunctionContext* context, Block& block, const ColumnNumbers& arguments,
450
52
                        uint32_t result, size_t input_rows_count) const override {
451
52
        const IColumn& col_from = *(block.get_by_position(arguments[0]).column);
452
453
52
        auto null_map = ColumnUInt8::create(input_rows_count, 0);
454
455
52
        const ColumnUInt8::Container* input_null_map = nullptr;
456
52
        const ColumnString* col_from_string = nullptr;
457
52
        if (const auto* nullable = check_and_get_column<ColumnNullable>(col_from)) {
458
34
            input_null_map = &nullable->get_null_map_data();
459
34
            col_from_string =
460
34
                    check_and_get_column<ColumnString>(*nullable->get_nested_column_ptr());
461
34
        } else {
462
18
            col_from_string = check_and_get_column<ColumnString>(col_from);
463
18
        }
464
465
52
        if (!col_from_string) {
466
0
            return Status::RuntimeError("Illegal column {} should be ColumnString",
467
0
                                        col_from.get_name());
468
0
        }
469
470
52
        auto col_to = ColumnInt32::create();
471
52
        auto& vec_to = col_to->get_data();
472
52
        size_t size = col_from.size();
473
52
        vec_to.resize(size);
474
475
        // parser can be reused for performance
476
477
52
        auto input_type = block.get_by_position(arguments[0]).type->get_primitive_type();
478
479
52
        if (input_type == PrimitiveType::TYPE_VARCHAR || input_type == PrimitiveType::TYPE_CHAR ||
480
52
            input_type == PrimitiveType::TYPE_STRING) {
481
12
            JsonBinaryValue jsonb_value;
482
24
            for (size_t i = 0; i < input_rows_count; ++i) {
483
12
                if (input_null_map && (*input_null_map)[i]) {
484
4
                    null_map->get_data()[i] = 1;
485
4
                    vec_to[i] = 0;
486
4
                    continue;
487
4
                }
488
489
8
                const auto& val = col_from_string->get_data_at(i);
490
8
                if (jsonb_value.from_json_string(val.data, cast_set<unsigned int>(val.size)).ok()) {
491
4
                    vec_to[i] = 1;
492
4
                } else {
493
4
                    vec_to[i] = 0;
494
4
                }
495
8
            }
496
497
40
        } else {
498
40
            DCHECK(input_type == PrimitiveType::TYPE_JSONB);
499
143
            for (size_t i = 0; i < input_rows_count; ++i) {
500
103
                if (input_null_map && (*input_null_map)[i]) {
501
6
                    null_map->get_data()[i] = 1;
502
6
                    vec_to[i] = 0;
503
6
                    continue;
504
6
                }
505
97
                const auto& val = col_from_string->get_data_at(i);
506
97
                if (val.size == 0) {
507
0
                    vec_to[i] = 0;
508
0
                    continue;
509
0
                }
510
97
                const JsonbDocument* doc = nullptr;
511
97
                auto st = JsonbDocument::checkAndCreateDocument(val.data, val.size, &doc);
512
97
                if (!st.ok() || !doc || !doc->getValue()) [[unlikely]] {
513
0
                    vec_to[i] = 0;
514
0
                    continue;
515
0
                }
516
97
                const JsonbValue* value = doc->getValue();
517
97
                if (UNLIKELY(!value)) {
518
0
                    vec_to[i] = 0;
519
0
                    continue;
520
0
                }
521
97
                vec_to[i] = 1;
522
97
            }
523
40
        }
524
525
52
        block.replace_by_position(result,
526
52
                                  ColumnNullable::create(std::move(col_to), std::move(null_map)));
527
528
52
        return Status::OK();
529
52
    }
530
};
531
class FunctionJsonUnquote : public IFunction {
532
public:
533
    static constexpr auto name = "json_unquote";
534
21
    static FunctionPtr create() { return std::make_shared<FunctionJsonUnquote>(); }
535
536
1
    String get_name() const override { return name; }
537
538
12
    size_t get_number_of_arguments() const override { return 1; }
539
540
12
    DataTypePtr get_return_type_impl(const DataTypes& arguments) const override {
541
12
        return make_nullable(std::make_shared<DataTypeString>());
542
12
    }
543
544
24
    bool use_default_implementation_for_nulls() const override { return false; }
545
546
    Status execute_impl(FunctionContext* context, Block& block, const ColumnNumbers& arguments,
547
12
                        uint32_t result, size_t input_rows_count) const override {
548
12
        const IColumn& col_from = *(block.get_by_position(arguments[0]).column);
549
550
12
        auto null_map = ColumnUInt8::create(input_rows_count, 0);
551
552
12
        const ColumnString* col_from_string = check_and_get_column<ColumnString>(col_from);
553
12
        if (auto* nullable = check_and_get_column<ColumnNullable>(col_from)) {
554
4
            col_from_string =
555
4
                    check_and_get_column<ColumnString>(*nullable->get_nested_column_ptr());
556
4
        }
557
558
12
        if (!col_from_string) {
559
0
            return Status::RuntimeError("Illegal column {} should be ColumnString",
560
0
                                        col_from.get_name());
561
0
        }
562
563
12
        auto col_to = ColumnString::create();
564
12
        col_to->reserve(input_rows_count);
565
566
        // parser can be reused for performance
567
12
        rapidjson::Document document;
568
24
        for (size_t i = 0; i < input_rows_count; ++i) {
569
12
            if (col_from.is_null_at(i)) {
570
3
                null_map->get_data()[i] = 1;
571
3
                col_to->insert_data(nullptr, 0);
572
3
                continue;
573
3
            }
574
575
9
            const auto& json_str = col_from_string->get_data_at(i);
576
9
            if (json_str.size < 2 || json_str.data[0] != '"' ||
577
9
                json_str.data[json_str.size - 1] != '"') {
578
                // non-quoted string
579
6
                col_to->insert_data(json_str.data, json_str.size);
580
6
            } else {
581
3
                document.Parse(json_str.data, json_str.size);
582
3
                if (document.HasParseError() || !document.IsString()) {
583
0
                    return Status::RuntimeError(
584
0
                            fmt::format("Invalid JSON text in argument 1 to function {}: {}", name,
585
0
                                        std::string_view(json_str.data, json_str.size)));
586
0
                }
587
3
                col_to->insert_data(document.GetString(), document.GetStringLength());
588
3
            }
589
9
        }
590
591
12
        block.replace_by_position(result,
592
12
                                  ColumnNullable::create(std::move(col_to), std::move(null_map)));
593
594
12
        return Status::OK();
595
12
    }
596
};
597
598
8
// Registers the JSON scalar functions defined in this file with `factory`:
// json_unquote, json_quote and json_valid.
void register_function_json(SimpleFunctionFactory& factory) {
    using JsonQuoteFunction = FunctionJson<FunctionJsonQuoteImpl>;

    factory.register_function<FunctionJsonUnquote>();
    factory.register_function<JsonQuoteFunction>();
    factory.register_function<FunctionJsonValid>();
}
605
606
} // namespace doris