Coverage Report

Created: 2026-03-12 14:02

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
be/src/exprs/function/function_json.cpp
Line
Count
Source
1
// Licensed to the Apache Software Foundation (ASF) under one
2
// or more contributor license agreements.  See the NOTICE file
3
// distributed with this work for additional information
4
// regarding copyright ownership.  The ASF licenses this file
5
// to you under the Apache License, Version 2.0 (the
6
// "License"); you may not use this file except in compliance
7
// with the License.  You may obtain a copy of the License at
8
//
9
//   http://www.apache.org/licenses/LICENSE-2.0
10
//
11
// Unless required by applicable law or agreed to in writing,
12
// software distributed under the License is distributed on an
13
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14
// KIND, either express or implied.  See the License for the
15
// specific language governing permissions and limitations
16
// under the License.
17
18
#include <glog/logging.h>
19
#include <rapidjson/allocators.h>
20
#include <rapidjson/document.h>
21
#include <rapidjson/encodings.h>
22
#include <rapidjson/pointer.h>
23
#include <rapidjson/rapidjson.h>
24
#include <rapidjson/stringbuffer.h>
25
#include <rapidjson/writer.h>
26
#include <re2/re2.h>
27
#include <stdint.h>
28
#include <stdlib.h>
29
#include <string.h>
30
31
#include <algorithm>
32
#include <boost/iterator/iterator_facade.hpp>
33
#include <boost/token_functions.hpp>
34
#include <boost/tokenizer.hpp>
35
#include <memory>
36
#include <string>
37
#include <string_view>
38
#include <type_traits>
39
#include <utility>
40
#include <vector>
41
42
#include "common/cast_set.h"
43
#include "common/compiler_util.h" // IWYU pragma: keep
44
#include "common/status.h"
45
#include "core/assert_cast.h"
46
#include "core/block/block.h"
47
#include "core/block/column_numbers.h"
48
#include "core/block/column_with_type_and_name.h"
49
#include "core/column/column.h"
50
#include "core/column/column_nullable.h"
51
#include "core/column/column_string.h"
52
#include "core/column/column_vector.h"
53
#include "core/data_type/data_type.h"
54
#include "core/data_type/data_type_nullable.h"
55
#include "core/data_type/data_type_number.h"
56
#include "core/data_type/data_type_string.h"
57
#include "core/string_ref.h"
58
#include "core/types.h"
59
#include "core/value/jsonb_value.h"
60
#include "exec/common/stringop_substring.h"
61
#include "exec/common/template_helpers.hpp"
62
#include "exprs/aggregate/aggregate_function.h"
63
#include "exprs/function/function.h"
64
#include "exprs/function/function_totype.h"
65
#include "exprs/function/simple_function_factory.h"
66
#include "exprs/json_functions.h"
67
#include "util/io_helper.h"
68
#include "util/string_parser.hpp"
69
#include "util/string_util.h"
70
71
namespace doris {
72
class FunctionContext;
73
} // namespace doris
74
75
namespace doris {
76
#include "common/compile_check_begin.h"
77
static const re2::RE2 JSON_PATTERN("^([^\\\"\\[\\]]*)(?:\\[([0-9]+|\\*)\\])?");
78
79
template <typename T, typename U>
80
void char_split(std::vector<T>& res, const U& var, char p) {
81
    int start = 0;
82
    int pos = start;
83
    int end = var.length();
84
    while (pos < end) {
85
        while (var[pos] != p && pos < end) {
86
            pos++;
87
        }
88
        res.emplace_back(&var[start], pos - start);
89
        pos++;
90
        start = pos;
91
    }
92
}
93
94
// T = std::vector<std::string>
95
// TODO: update RE2 to support std::vector<std::string_view>
96
template <typename T>
97
void get_parsed_paths(const T& path_exprs, std::vector<JsonPath>* parsed_paths) {
98
    if (path_exprs.empty()) {
99
        return;
100
    }
101
102
    if (path_exprs[0] != "$") {
103
        parsed_paths->emplace_back("", -1, false);
104
    } else {
105
        parsed_paths->emplace_back("$", -1, true);
106
    }
107
108
    for (int i = 1; i < path_exprs.size(); i++) {
109
        std::string col;
110
        std::string index;
111
        if (UNLIKELY(!RE2::FullMatch(path_exprs[i], JSON_PATTERN, &col, &index))) {
112
            parsed_paths->emplace_back("", -1, false);
113
        } else {
114
            int idx = -1;
115
            if (!index.empty()) {
116
                if (index == "*") {
117
                    idx = -2;
118
                } else {
119
                    idx = atoi(index.c_str());
120
                }
121
            }
122
            parsed_paths->emplace_back(col, idx, true);
123
        }
124
    }
125
}
126
127
rapidjson::Value* NO_SANITIZE_UNDEFINED
128
match_value(const std::vector<JsonPath>& parsed_paths, rapidjson::Value* document,
129
0
            rapidjson::Document::AllocatorType& mem_allocator, bool is_insert_null = false) {
130
0
    rapidjson::Value* root = document;
131
0
    rapidjson::Value* array_obj = nullptr;
132
0
    for (int i = 1; i < parsed_paths.size(); i++) {
133
0
        if (root == nullptr || root->IsNull()) {
134
0
            return nullptr;
135
0
        }
136
137
0
        if (UNLIKELY(!parsed_paths[i].is_valid)) {
138
0
            return nullptr;
139
0
        }
140
141
0
        const std::string& col = parsed_paths[i].key;
142
0
        int index = parsed_paths[i].idx;
143
0
        if (LIKELY(!col.empty())) {
144
0
            if (root->IsObject()) {
145
0
                if (!root->HasMember(col.c_str())) {
146
0
                    return nullptr;
147
0
                } else {
148
0
                    root = &((*root)[col.c_str()]);
149
0
                }
150
0
            } else {
151
                // root is not a nested type, return NULL
152
0
                return nullptr;
153
0
            }
154
0
        }
155
156
0
        if (UNLIKELY(index != -1)) {
157
            // judge the rapidjson:Value, which base the top's result,
158
            // if not array return NULL;else get the index value from the array
159
0
            if (root->IsArray()) {
160
0
                if (root->IsNull()) {
161
0
                    return nullptr;
162
0
                } else if (index == -2) {
163
                    // [*]
164
0
                    array_obj = static_cast<rapidjson::Value*>(
165
0
                            mem_allocator.Malloc(sizeof(rapidjson::Value)));
166
0
                    array_obj->SetArray();
167
168
0
                    for (int j = 0; j < root->Size(); j++) {
169
0
                        rapidjson::Value v;
170
0
                        v.CopyFrom((*root)[j], mem_allocator);
171
0
                        array_obj->PushBack(v, mem_allocator);
172
0
                    }
173
0
                    root = array_obj;
174
0
                } else if (index >= root->Size()) {
175
0
                    return nullptr;
176
0
                } else {
177
0
                    root = &((*root)[index]);
178
0
                }
179
0
            } else {
180
0
                return nullptr;
181
0
            }
182
0
        }
183
0
    }
184
0
    return root;
185
0
}
186
187
template <JsonFunctionType fntype>
188
rapidjson::Value* get_json_object(std::string_view json_string, std::string_view path_string,
189
                                  rapidjson::Document* document) {
190
    std::vector<JsonPath>* parsed_paths;
191
    std::vector<JsonPath> tmp_parsed_paths;
192
193
    //Cannot use '\' as the last character, return NULL
194
    if (path_string.back() == '\\') {
195
        return nullptr;
196
    }
197
198
    std::string fixed_string;
199
    if (path_string.size() >= 2 && path_string[0] == '$' && path_string[1] != '.') {
200
        // Boost tokenizer requires explicit "." after "$" to correctly extract JSON path tokens.
201
        // Without this, expressions like "$[0].key" cannot be properly split.
202
        // This commit ensures a "." is automatically added after "$" to maintain consistent token parsing behavior.
203
        fixed_string = "$.";
204
        fixed_string += path_string.substr(1);
205
        path_string = fixed_string;
206
    }
207
208
    try {
209
#ifdef USE_LIBCPP
210
        std::string s(path_string);
211
        auto tok = get_json_token(s);
212
#else
213
        auto tok = get_json_token(path_string);
214
#endif
215
        std::vector<std::string> paths(tok.begin(), tok.end());
216
        get_parsed_paths(paths, &tmp_parsed_paths);
217
        if (tmp_parsed_paths.empty()) {
218
            return document;
219
        }
220
    } catch (boost::escaped_list_error&) {
221
        // meet unknown escape sequence, example '$.name\k'
222
        return nullptr;
223
    }
224
225
    parsed_paths = &tmp_parsed_paths;
226
227
    if (!(*parsed_paths)[0].is_valid) {
228
        return nullptr;
229
    }
230
231
    if (UNLIKELY((*parsed_paths).size() == 1)) {
232
        if (fntype == JSON_FUN_STRING) {
233
            document->SetString(json_string.data(),
234
                                cast_set<rapidjson::SizeType>(json_string.size()),
235
                                document->GetAllocator());
236
        } else {
237
            return document;
238
        }
239
    }
240
241
    document->Parse(json_string.data(), json_string.size());
242
    if (UNLIKELY(document->HasParseError())) {
243
        // VLOG_CRITICAL << "Error at offset " << document->GetErrorOffset() << ": "
244
        //         << GetParseError_En(document->GetParseError());
245
        return nullptr;
246
    }
247
248
    return match_value(*parsed_paths, document, document->GetAllocator());
249
}
250
251
template <int flag>
252
struct JsonParser {
253
    //string
254
    static void update_value(StringParser::ParseResult& result, rapidjson::Value& value,
255
                             StringRef data, rapidjson::Document::AllocatorType& allocator) {
256
        value.SetString(data.data, cast_set<rapidjson::SizeType>(data.size), allocator);
257
    }
258
};
259
260
template <>
261
struct JsonParser<'0'> {
262
    // null
263
    static void update_value(StringParser::ParseResult& result, rapidjson::Value& value,
264
0
                             StringRef data, rapidjson::Document::AllocatorType& allocator) {
265
0
        value.SetNull();
266
0
    }
267
};
268
269
template <>
270
struct JsonParser<'1'> {
271
    // bool
272
    static void update_value(StringParser::ParseResult& result, rapidjson::Value& value,
273
0
                             StringRef data, rapidjson::Document::AllocatorType& allocator) {
274
0
        DCHECK(data.size == 1 || strncmp(data.data, "true", 4) == 0 ||
275
0
               strncmp(data.data, "false", 5) == 0);
276
0
        value.SetBool(*data.data == '1' || *data.data == 't');
277
0
    }
278
};
279
280
template <>
281
struct JsonParser<'2'> {
282
    // int
283
    static void update_value(StringParser::ParseResult& result, rapidjson::Value& value,
284
0
                             StringRef data, rapidjson::Document::AllocatorType& allocator) {
285
0
        value.SetInt(StringParser::string_to_int<int32_t>(data.data, data.size, &result));
286
0
    }
287
};
288
289
template <>
290
struct JsonParser<'3'> {
291
    // double
292
    static void update_value(StringParser::ParseResult& result, rapidjson::Value& value,
293
0
                             StringRef data, rapidjson::Document::AllocatorType& allocator) {
294
0
        value.SetDouble(StringParser::string_to_float<double>(data.data, data.size, &result));
295
0
    }
296
};
297
298
template <>
299
struct JsonParser<'4'> {
300
    // time
301
    static void update_value(StringParser::ParseResult& result, rapidjson::Value& value,
302
0
                             StringRef data, rapidjson::Document::AllocatorType& allocator) {
303
0
        // remove double quotes, "xxx" -> xxx
304
0
        value.SetString(data.data + 1, cast_set<rapidjson::SizeType>(data.size - 2), allocator);
305
0
    }
306
};
307
308
template <>
309
struct JsonParser<'5'> {
310
    // bigint
311
    static void update_value(StringParser::ParseResult& result, rapidjson::Value& value,
312
0
                             StringRef data, rapidjson::Document::AllocatorType& allocator) {
313
0
        value.SetInt64(StringParser::string_to_int<int64_t>(data.data, data.size, &result));
314
0
    }
315
};
316
317
template <>
318
struct JsonParser<'7'> {
319
    // json string
320
    static void update_value(StringParser::ParseResult& result, rapidjson::Value& value,
321
0
                             StringRef data, rapidjson::Document::AllocatorType& allocator) {
322
0
        rapidjson::Document document;
323
0
        const JsonbValue* json_val = JsonbDocument::createValue(data.data, data.size);
324
0
        convert_jsonb_to_rapidjson(*json_val, document, allocator);
325
0
        value.CopyFrom(document, allocator);
326
0
    }
327
};
328
329
template <int flag, typename Impl>
330
struct ExecuteReducer {
331
    template <typename... TArgs>
332
    static void run(TArgs&&... args) {
333
        Impl::template execute_type<JsonParser<flag>>(std::forward<TArgs>(args)...);
334
    }
335
};
336
337
struct FunctionJsonQuoteImpl {
338
    static constexpr auto name = "json_quote";
339
340
0
    static DataTypePtr get_return_type_impl(const DataTypes& arguments) {
341
0
        if (!arguments.empty() && arguments[0] && arguments[0]->is_nullable()) {
342
0
            return make_nullable(std::make_shared<DataTypeString>());
343
0
        }
344
0
        return std::make_shared<DataTypeString>();
345
0
    }
346
    static void execute(const std::vector<const ColumnString*>& data_columns,
347
0
                        ColumnString& result_column, size_t input_rows_count) {
348
0
        rapidjson::Document document;
349
0
        rapidjson::Document::AllocatorType& allocator = document.GetAllocator();
350
351
0
        rapidjson::Value value;
352
353
0
        rapidjson::StringBuffer buf;
354
355
0
        for (int i = 0; i < input_rows_count; i++) {
356
0
            StringRef data = data_columns[0]->get_data_at(i);
357
0
            value.SetString(data.data, cast_set<rapidjson::SizeType>(data.size), allocator);
358
359
0
            buf.Clear();
360
0
            rapidjson::Writer<rapidjson::StringBuffer> writer(buf);
361
0
            value.Accept(writer);
362
0
            result_column.insert_data(buf.GetString(), buf.GetSize());
363
0
        }
364
0
    }
365
};
366
367
template <typename Impl>
368
class FunctionJson : public IFunction {
369
public:
370
    static constexpr auto name = Impl::name;
371
372
8
    static FunctionPtr create() { return std::make_shared<FunctionJson<Impl>>(); }
373
374
0
    String get_name() const override { return name; }
375
376
0
    size_t get_number_of_arguments() const override { return 0; }
377
378
1
    bool is_variadic() const override { return true; }
379
380
0
    DataTypePtr get_return_type_impl(const DataTypes& arguments) const override {
381
0
        return Impl::get_return_type_impl(arguments);
382
0
    }
383
384
    Status execute_impl(FunctionContext* context, Block& block, const ColumnNumbers& arguments,
385
0
                        uint32_t result, size_t input_rows_count) const override {
386
0
        auto result_column = ColumnString::create();
387
388
0
        std::vector<ColumnPtr> column_ptrs; // prevent converted column destruct
389
0
        std::vector<const ColumnString*> data_columns;
390
0
        for (int i = 0; i < arguments.size(); i++) {
391
0
            column_ptrs.push_back(
392
0
                    block.get_by_position(arguments[i]).column->convert_to_full_column_if_const());
393
0
            data_columns.push_back(assert_cast<const ColumnString*>(column_ptrs.back().get()));
394
0
        }
395
396
0
        Impl::execute(data_columns, *assert_cast<ColumnString*>(result_column.get()),
397
0
                      input_rows_count);
398
0
        block.get_by_position(result).column = std::move(result_column);
399
0
        return Status::OK();
400
0
    }
401
};
402
403
template <typename Impl>
404
class FunctionJsonNullable : public IFunction {
405
public:
406
    static constexpr auto name = Impl::name;
407
    static FunctionPtr create() { return std::make_shared<FunctionJsonNullable<Impl>>(); }
408
    String get_name() const override { return name; }
409
    size_t get_number_of_arguments() const override { return 0; }
410
    bool is_variadic() const override { return true; }
411
    DataTypePtr get_return_type_impl(const DataTypes& arguments) const override {
412
        return make_nullable(std::make_shared<DataTypeString>());
413
    }
414
    Status execute_impl(FunctionContext* context, Block& block, const ColumnNumbers& arguments,
415
                        uint32_t result, size_t input_rows_count) const override {
416
        auto result_column = ColumnString::create();
417
        auto null_map = ColumnUInt8::create(input_rows_count, 0);
418
        std::vector<const ColumnString*> data_columns;
419
        std::vector<bool> column_is_consts;
420
        for (int i = 0; i < arguments.size(); i++) {
421
            ColumnPtr arg_col;
422
            bool arg_const;
423
            std::tie(arg_col, arg_const) =
424
                    unpack_if_const(block.get_by_position(arguments[i]).column);
425
            column_is_consts.push_back(arg_const);
426
            data_columns.push_back(assert_cast<const ColumnString*>(arg_col.get()));
427
        }
428
        Impl::execute(data_columns, *assert_cast<ColumnString*>(result_column.get()),
429
                      null_map->get_data(), input_rows_count, column_is_consts);
430
        block.replace_by_position(
431
                result, ColumnNullable::create(std::move(result_column), std::move(null_map)));
432
        return Status::OK();
433
    }
434
};
435
436
class FunctionJsonValid : public IFunction {
437
public:
438
    static constexpr auto name = "json_valid";
439
8
    static FunctionPtr create() { return std::make_shared<FunctionJsonValid>(); }
440
441
1
    String get_name() const override { return name; }
442
443
0
    size_t get_number_of_arguments() const override { return 1; }
444
445
0
    DataTypePtr get_return_type_impl(const DataTypes& arguments) const override {
446
0
        return make_nullable(std::make_shared<DataTypeInt32>());
447
0
    }
448
449
0
    bool use_default_implementation_for_nulls() const override { return false; }
450
451
    Status execute_impl(FunctionContext* context, Block& block, const ColumnNumbers& arguments,
452
0
                        uint32_t result, size_t input_rows_count) const override {
453
0
        const IColumn& col_from = *(block.get_by_position(arguments[0]).column);
454
455
0
        auto null_map = ColumnUInt8::create(input_rows_count, 0);
456
457
0
        const ColumnUInt8::Container* input_null_map = nullptr;
458
0
        const ColumnString* col_from_string = nullptr;
459
0
        if (const auto* nullable = check_and_get_column<ColumnNullable>(col_from)) {
460
0
            input_null_map = &nullable->get_null_map_data();
461
0
            col_from_string =
462
0
                    check_and_get_column<ColumnString>(*nullable->get_nested_column_ptr());
463
0
        } else {
464
0
            col_from_string = check_and_get_column<ColumnString>(col_from);
465
0
        }
466
467
0
        if (!col_from_string) {
468
0
            return Status::RuntimeError("Illegal column {} should be ColumnString",
469
0
                                        col_from.get_name());
470
0
        }
471
472
0
        auto col_to = ColumnInt32::create();
473
0
        auto& vec_to = col_to->get_data();
474
0
        size_t size = col_from.size();
475
0
        vec_to.resize(size);
476
477
        // parser can be reused for performance
478
479
0
        auto input_type = block.get_by_position(arguments[0]).type->get_primitive_type();
480
481
0
        if (input_type == PrimitiveType::TYPE_VARCHAR || input_type == PrimitiveType::TYPE_CHAR ||
482
0
            input_type == PrimitiveType::TYPE_STRING) {
483
0
            JsonBinaryValue jsonb_value;
484
0
            for (size_t i = 0; i < input_rows_count; ++i) {
485
0
                if (input_null_map && (*input_null_map)[i]) {
486
0
                    null_map->get_data()[i] = 1;
487
0
                    vec_to[i] = 0;
488
0
                    continue;
489
0
                }
490
491
0
                const auto& val = col_from_string->get_data_at(i);
492
0
                if (jsonb_value.from_json_string(val.data, cast_set<unsigned int>(val.size)).ok()) {
493
0
                    vec_to[i] = 1;
494
0
                } else {
495
0
                    vec_to[i] = 0;
496
0
                }
497
0
            }
498
499
0
        } else {
500
0
            DCHECK(input_type == PrimitiveType::TYPE_JSONB);
501
0
            for (size_t i = 0; i < input_rows_count; ++i) {
502
0
                if (input_null_map && (*input_null_map)[i]) {
503
0
                    null_map->get_data()[i] = 1;
504
0
                    vec_to[i] = 0;
505
0
                    continue;
506
0
                }
507
0
                const auto& val = col_from_string->get_data_at(i);
508
0
                if (val.size == 0) {
509
0
                    vec_to[i] = 0;
510
0
                    continue;
511
0
                }
512
0
                const JsonbDocument* doc = nullptr;
513
0
                auto st = JsonbDocument::checkAndCreateDocument(val.data, val.size, &doc);
514
0
                if (!st.ok() || !doc || !doc->getValue()) [[unlikely]] {
515
0
                    vec_to[i] = 0;
516
0
                    continue;
517
0
                }
518
0
                const JsonbValue* value = doc->getValue();
519
0
                if (UNLIKELY(!value)) {
520
0
                    vec_to[i] = 0;
521
0
                    continue;
522
0
                }
523
0
                vec_to[i] = 1;
524
0
            }
525
0
        }
526
527
0
        block.replace_by_position(result,
528
0
                                  ColumnNullable::create(std::move(col_to), std::move(null_map)));
529
530
0
        return Status::OK();
531
0
    }
532
};
533
class FunctionJsonUnquote : public IFunction {
534
public:
535
    static constexpr auto name = "json_unquote";
536
8
    static FunctionPtr create() { return std::make_shared<FunctionJsonUnquote>(); }
537
538
1
    String get_name() const override { return name; }
539
540
0
    size_t get_number_of_arguments() const override { return 1; }
541
542
0
    DataTypePtr get_return_type_impl(const DataTypes& arguments) const override {
543
0
        return make_nullable(std::make_shared<DataTypeString>());
544
0
    }
545
546
0
    bool use_default_implementation_for_nulls() const override { return false; }
547
548
    Status execute_impl(FunctionContext* context, Block& block, const ColumnNumbers& arguments,
549
0
                        uint32_t result, size_t input_rows_count) const override {
550
0
        const IColumn& col_from = *(block.get_by_position(arguments[0]).column);
551
552
0
        auto null_map = ColumnUInt8::create(input_rows_count, 0);
553
554
0
        const ColumnString* col_from_string = check_and_get_column<ColumnString>(col_from);
555
0
        if (auto* nullable = check_and_get_column<ColumnNullable>(col_from)) {
556
0
            col_from_string =
557
0
                    check_and_get_column<ColumnString>(*nullable->get_nested_column_ptr());
558
0
        }
559
560
0
        if (!col_from_string) {
561
0
            return Status::RuntimeError("Illegal column {} should be ColumnString",
562
0
                                        col_from.get_name());
563
0
        }
564
565
0
        auto col_to = ColumnString::create();
566
0
        col_to->reserve(input_rows_count);
567
568
        // parser can be reused for performance
569
0
        rapidjson::Document document;
570
0
        for (size_t i = 0; i < input_rows_count; ++i) {
571
0
            if (col_from.is_null_at(i)) {
572
0
                null_map->get_data()[i] = 1;
573
0
                col_to->insert_data(nullptr, 0);
574
0
                continue;
575
0
            }
576
577
0
            const auto& json_str = col_from_string->get_data_at(i);
578
0
            if (json_str.size < 2 || json_str.data[0] != '"' ||
579
0
                json_str.data[json_str.size - 1] != '"') {
580
                // non-quoted string
581
0
                col_to->insert_data(json_str.data, json_str.size);
582
0
            } else {
583
0
                document.Parse(json_str.data, json_str.size);
584
0
                if (document.HasParseError() || !document.IsString()) {
585
0
                    return Status::RuntimeError(
586
0
                            fmt::format("Invalid JSON text in argument 1 to function {}: {}", name,
587
0
                                        std::string_view(json_str.data, json_str.size)));
588
0
                }
589
0
                col_to->insert_data(document.GetString(), document.GetStringLength());
590
0
            }
591
0
        }
592
593
0
        block.replace_by_position(result,
594
0
                                  ColumnNullable::create(std::move(col_to), std::move(null_map)));
595
596
0
        return Status::OK();
597
0
    }
598
};
599
600
7
void register_function_json(SimpleFunctionFactory& factory) {
601
7
    factory.register_function<FunctionJsonUnquote>();
602
603
7
    factory.register_function<FunctionJson<FunctionJsonQuoteImpl>>();
604
605
7
    factory.register_function<FunctionJsonValid>();
606
7
}
607
608
} // namespace doris