Coverage Report

Created: 2026-04-10 04:05

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
be/src/exprs/function/function_json.cpp
Line
Count
Source
1
// Licensed to the Apache Software Foundation (ASF) under one
2
// or more contributor license agreements.  See the NOTICE file
3
// distributed with this work for additional information
4
// regarding copyright ownership.  The ASF licenses this file
5
// to you under the Apache License, Version 2.0 (the
6
// "License"); you may not use this file except in compliance
7
// with the License.  You may obtain a copy of the License at
8
//
9
//   http://www.apache.org/licenses/LICENSE-2.0
10
//
11
// Unless required by applicable law or agreed to in writing,
12
// software distributed under the License is distributed on an
13
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14
// KIND, either express or implied.  See the License for the
15
// specific language governing permissions and limitations
16
// under the License.
17
18
#include <glog/logging.h>
19
#include <rapidjson/allocators.h>
20
#include <rapidjson/document.h>
21
#include <rapidjson/encodings.h>
22
#include <rapidjson/pointer.h>
23
#include <rapidjson/rapidjson.h>
24
#include <rapidjson/stringbuffer.h>
25
#include <rapidjson/writer.h>
26
#include <re2/re2.h>
27
#include <stdint.h>
28
#include <stdlib.h>
29
#include <string.h>
30
31
#include <algorithm>
32
#include <boost/iterator/iterator_facade.hpp>
33
#include <boost/token_functions.hpp>
34
#include <boost/tokenizer.hpp>
35
#include <memory>
36
#include <string>
37
#include <string_view>
38
#include <type_traits>
39
#include <utility>
40
#include <vector>
41
42
#include "common/cast_set.h"
43
#include "common/compiler_util.h" // IWYU pragma: keep
44
#include "common/status.h"
45
#include "core/assert_cast.h"
46
#include "core/block/block.h"
47
#include "core/block/column_numbers.h"
48
#include "core/block/column_with_type_and_name.h"
49
#include "core/column/column.h"
50
#include "core/column/column_nullable.h"
51
#include "core/column/column_string.h"
52
#include "core/column/column_vector.h"
53
#include "core/data_type/data_type.h"
54
#include "core/data_type/data_type_nullable.h"
55
#include "core/data_type/data_type_number.h"
56
#include "core/data_type/data_type_string.h"
57
#include "core/string_ref.h"
58
#include "core/types.h"
59
#include "core/value/jsonb_value.h"
60
#include "exec/common/stringop_substring.h"
61
#include "exec/common/template_helpers.hpp"
62
#include "exprs/aggregate/aggregate_function.h"
63
#include "exprs/function/function.h"
64
#include "exprs/function/function_totype.h"
65
#include "exprs/function/simple_function_factory.h"
66
#include "exprs/json_functions.h"
67
#include "util/io_helper.h"
68
#include "util/string_parser.hpp"
69
#include "util/string_util.h"
70
71
namespace doris {
72
class FunctionContext;
73
} // namespace doris
74
75
namespace doris {
76
// Pattern for one path segment after the leading "$".
// Capture group 1: the key, i.e. any run of characters other than '"', '[' and ']' (may be empty).
// Capture group 2 (optional): the subscript between '[' and ']', either decimal digits or "*".
static const re2::RE2 JSON_PATTERN("^([^\\\"\\[\\]]*)(?:\\[([0-9]+|\\*)\\])?");
77
78
// Splits `var` at every occurrence of the delimiter `p` and appends the pieces to `res`.
//
// Each piece is constructed in place from a (pointer, length) pair, so T has to be
// constructible from those two arguments (e.g. std::string or std::string_view).
// Empty pieces between adjacent delimiters are kept, a trailing delimiter does not
// produce a trailing empty piece, and an empty input appends nothing.
template <typename T, typename U>
void char_split(std::vector<T>& res, const U& var, char p) {
    const size_t end = var.length();
    size_t start = 0;
    while (start < end) {
        size_t pos = start;
        // Test the bound first so that var[end] is never read.
        while (pos < end && var[pos] != p) {
            ++pos;
        }
        res.emplace_back(&var[start], pos - start);
        // Skip the delimiter (or step past the end, which terminates the loop).
        start = pos + 1;
    }
}
92
93
// T = std::vector<std::string>
94
// TODO: update RE2 to support std::vector<std::string_view>
95
template <typename T>
96
void get_parsed_paths(const T& path_exprs, std::vector<JsonPath>* parsed_paths) {
97
    if (path_exprs.empty()) {
98
        return;
99
    }
100
101
    if (path_exprs[0] != "$") {
102
        parsed_paths->emplace_back("", -1, false);
103
    } else {
104
        parsed_paths->emplace_back("$", -1, true);
105
    }
106
107
    for (int i = 1; i < path_exprs.size(); i++) {
108
        std::string col;
109
        std::string index;
110
        if (UNLIKELY(!RE2::FullMatch(path_exprs[i], JSON_PATTERN, &col, &index))) {
111
            parsed_paths->emplace_back("", -1, false);
112
        } else {
113
            int idx = -1;
114
            if (!index.empty()) {
115
                if (index == "*") {
116
                    idx = -2;
117
                } else {
118
                    idx = atoi(index.c_str());
119
                }
120
            }
121
            parsed_paths->emplace_back(col, idx, true);
122
        }
123
    }
124
}
125
126
// Walks `document` along `parsed_paths` and returns the value the path designates.
//
// Entry 0 of `parsed_paths` is skipped; each later entry may name an object member
// (non-empty key), an array subscript (idx >= 0), the whole array (idx == -2, "[*]"),
// or a member followed by a subscript.
//
// Returns nullptr when the current value is null, a path entry is flagged invalid, a
// key is applied to a non-object or is missing, or a subscript is applied to a
// non-array or is out of range.
//
// For "[*]" a copy of the array is built in storage obtained from `mem_allocator`, so
// the returned pointer then refers to that copy rather than to a node of `document`.
// `is_insert_null` is not used by this function.
rapidjson::Value* NO_SANITIZE_UNDEFINED
match_value(const std::vector<JsonPath>& parsed_paths, rapidjson::Value* document,
            rapidjson::Document::AllocatorType& mem_allocator, bool is_insert_null = false) {
    rapidjson::Value* root = document;
    for (size_t i = 1; i < parsed_paths.size(); ++i) {
        if (root == nullptr || root->IsNull()) {
            return nullptr;
        }

        const JsonPath& step = parsed_paths[i];
        if (UNLIKELY(!step.is_valid)) {
            return nullptr;
        }

        const std::string& col = step.key;
        const int index = step.idx;
        if (LIKELY(!col.empty())) {
            if (!root->IsObject()) {
                // root is not a nested type, return NULL
                return nullptr;
            }
            // Single lookup instead of HasMember() followed by operator[].
            auto member = root->FindMember(col.c_str());
            if (member == root->MemberEnd()) {
                return nullptr;
            }
            root = &member->value;
        }

        if (UNLIKELY(index != -1)) {
            // A subscript only makes sense on an array.
            if (!root->IsArray()) {
                return nullptr;
            }

            if (index == -2) {
                // [*]: construct the copy in the allocator's storage before using it,
                // rather than calling SetArray() on uninitialized memory.
                void* storage = mem_allocator.Malloc(sizeof(rapidjson::Value));
                auto* array_obj = new (storage) rapidjson::Value(rapidjson::kArrayType);
                for (rapidjson::SizeType j = 0; j < root->Size(); ++j) {
                    rapidjson::Value v;
                    v.CopyFrom((*root)[j], mem_allocator);
                    array_obj->PushBack(v, mem_allocator);
                }
                root = array_obj;
            } else if (index < 0 || static_cast<rapidjson::SizeType>(index) >= root->Size()) {
                return nullptr;
            } else {
                root = &((*root)[static_cast<rapidjson::SizeType>(index)]);
            }
        }
    }
    return root;
}
185
186
template <JsonFunctionType fntype>
187
rapidjson::Value* get_json_object(std::string_view json_string, std::string_view path_string,
188
                                  rapidjson::Document* document) {
189
    std::vector<JsonPath>* parsed_paths;
190
    std::vector<JsonPath> tmp_parsed_paths;
191
192
    //Cannot use '\' as the last character, return NULL
193
    if (path_string.back() == '\\') {
194
        return nullptr;
195
    }
196
197
    std::string fixed_string;
198
    if (path_string.size() >= 2 && path_string[0] == '$' && path_string[1] != '.') {
199
        // Boost tokenizer requires explicit "." after "$" to correctly extract JSON path tokens.
200
        // Without this, expressions like "$[0].key" cannot be properly split.
201
        // This commit ensures a "." is automatically added after "$" to maintain consistent token parsing behavior.
202
        fixed_string = "$.";
203
        fixed_string += path_string.substr(1);
204
        path_string = fixed_string;
205
    }
206
207
    try {
208
#ifdef USE_LIBCPP
209
        std::string s(path_string);
210
        auto tok = get_json_token(s);
211
#else
212
        auto tok = get_json_token(path_string);
213
#endif
214
        std::vector<std::string> paths(tok.begin(), tok.end());
215
        get_parsed_paths(paths, &tmp_parsed_paths);
216
        if (tmp_parsed_paths.empty()) {
217
            return document;
218
        }
219
    } catch (boost::escaped_list_error&) {
220
        // meet unknown escape sequence, example '$.name\k'
221
        return nullptr;
222
    }
223
224
    parsed_paths = &tmp_parsed_paths;
225
226
    if (!(*parsed_paths)[0].is_valid) {
227
        return nullptr;
228
    }
229
230
    if (UNLIKELY((*parsed_paths).size() == 1)) {
231
        if (fntype == JSON_FUN_STRING) {
232
            document->SetString(json_string.data(),
233
                                cast_set<rapidjson::SizeType>(json_string.size()),
234
                                document->GetAllocator());
235
        } else {
236
            return document;
237
        }
238
    }
239
240
    document->Parse(json_string.data(), json_string.size());
241
    if (UNLIKELY(document->HasParseError())) {
242
        // VLOG_CRITICAL << "Error at offset " << document->GetErrorOffset() << ": "
243
        //         << GetParseError_En(document->GetParseError());
244
        return nullptr;
245
    }
246
247
    return match_value(*parsed_paths, document, document->GetAllocator());
248
}
249
250
template <int flag>
251
struct JsonParser {
252
    //string
253
    static void update_value(StringParser::ParseResult& result, rapidjson::Value& value,
254
                             StringRef data, rapidjson::Document::AllocatorType& allocator) {
255
        value.SetString(data.data, cast_set<rapidjson::SizeType>(data.size), allocator);
256
    }
257
};
258
259
template <>
260
struct JsonParser<'0'> {
261
    // null
262
    static void update_value(StringParser::ParseResult& result, rapidjson::Value& value,
263
0
                             StringRef data, rapidjson::Document::AllocatorType& allocator) {
264
0
        value.SetNull();
265
0
    }
266
};
267
268
template <>
269
struct JsonParser<'1'> {
270
    // bool
271
    static void update_value(StringParser::ParseResult& result, rapidjson::Value& value,
272
0
                             StringRef data, rapidjson::Document::AllocatorType& allocator) {
273
0
        DCHECK(data.size == 1 || strncmp(data.data, "true", 4) == 0 ||
274
0
               strncmp(data.data, "false", 5) == 0);
275
0
        value.SetBool(*data.data == '1' || *data.data == 't');
276
0
    }
277
};
278
279
template <>
280
struct JsonParser<'2'> {
281
    // int
282
    static void update_value(StringParser::ParseResult& result, rapidjson::Value& value,
283
0
                             StringRef data, rapidjson::Document::AllocatorType& allocator) {
284
0
        value.SetInt(StringParser::string_to_int<int32_t>(data.data, data.size, &result));
285
0
    }
286
};
287
288
template <>
289
struct JsonParser<'3'> {
290
    // double
291
    static void update_value(StringParser::ParseResult& result, rapidjson::Value& value,
292
0
                             StringRef data, rapidjson::Document::AllocatorType& allocator) {
293
0
        value.SetDouble(StringParser::string_to_float<double>(data.data, data.size, &result));
294
0
    }
295
};
296
297
template <>
298
struct JsonParser<'4'> {
299
    // time
300
    static void update_value(StringParser::ParseResult& result, rapidjson::Value& value,
301
0
                             StringRef data, rapidjson::Document::AllocatorType& allocator) {
302
0
        // remove double quotes, "xxx" -> xxx
303
0
        value.SetString(data.data + 1, cast_set<rapidjson::SizeType>(data.size - 2), allocator);
304
0
    }
305
};
306
307
template <>
308
struct JsonParser<'5'> {
309
    // bigint
310
    static void update_value(StringParser::ParseResult& result, rapidjson::Value& value,
311
0
                             StringRef data, rapidjson::Document::AllocatorType& allocator) {
312
0
        value.SetInt64(StringParser::string_to_int<int64_t>(data.data, data.size, &result));
313
0
    }
314
};
315
316
template <>
317
struct JsonParser<'7'> {
318
    // json string
319
    static void update_value(StringParser::ParseResult& result, rapidjson::Value& value,
320
0
                             StringRef data, rapidjson::Document::AllocatorType& allocator) {
321
0
        rapidjson::Document document;
322
0
        const JsonbValue* json_val = JsonbDocument::createValue(data.data, data.size);
323
0
        convert_jsonb_to_rapidjson(*json_val, document, allocator);
324
0
        value.CopyFrom(document, allocator);
325
0
    }
326
};
327
328
template <int flag, typename Impl>
329
struct ExecuteReducer {
330
    template <typename... TArgs>
331
    static void run(TArgs&&... args) {
332
        Impl::template execute_type<JsonParser<flag>>(std::forward<TArgs>(args)...);
333
    }
334
};
335
336
struct FunctionJsonQuoteImpl {
337
    static constexpr auto name = "json_quote";
338
339
0
    static DataTypePtr get_return_type_impl(const DataTypes& arguments) {
340
0
        if (!arguments.empty() && arguments[0] && arguments[0]->is_nullable()) {
341
0
            return make_nullable(std::make_shared<DataTypeString>());
342
0
        }
343
0
        return std::make_shared<DataTypeString>();
344
0
    }
345
    static void execute(const std::vector<const ColumnString*>& data_columns,
346
0
                        ColumnString& result_column, size_t input_rows_count) {
347
0
        rapidjson::Document document;
348
0
        rapidjson::Document::AllocatorType& allocator = document.GetAllocator();
349
350
0
        rapidjson::Value value;
351
352
0
        rapidjson::StringBuffer buf;
353
354
0
        for (int i = 0; i < input_rows_count; i++) {
355
0
            StringRef data = data_columns[0]->get_data_at(i);
356
0
            value.SetString(data.data, cast_set<rapidjson::SizeType>(data.size), allocator);
357
358
0
            buf.Clear();
359
0
            rapidjson::Writer<rapidjson::StringBuffer> writer(buf);
360
0
            value.Accept(writer);
361
0
            result_column.insert_data(buf.GetString(), buf.GetSize());
362
0
        }
363
0
    }
364
};
365
366
template <typename Impl>
367
class FunctionJson : public IFunction {
368
public:
369
    static constexpr auto name = Impl::name;
370
371
2
    static FunctionPtr create() { return std::make_shared<FunctionJson<Impl>>(); }
372
373
0
    String get_name() const override { return name; }
374
375
0
    size_t get_number_of_arguments() const override { return 0; }
376
377
1
    bool is_variadic() const override { return true; }
378
379
0
    DataTypePtr get_return_type_impl(const DataTypes& arguments) const override {
380
0
        return Impl::get_return_type_impl(arguments);
381
0
    }
382
383
    Status execute_impl(FunctionContext* context, Block& block, const ColumnNumbers& arguments,
384
0
                        uint32_t result, size_t input_rows_count) const override {
385
0
        auto result_column = ColumnString::create();
386
387
0
        std::vector<ColumnPtr> column_ptrs; // prevent converted column destruct
388
0
        std::vector<const ColumnString*> data_columns;
389
0
        for (int i = 0; i < arguments.size(); i++) {
390
0
            column_ptrs.push_back(
391
0
                    block.get_by_position(arguments[i]).column->convert_to_full_column_if_const());
392
0
            data_columns.push_back(assert_cast<const ColumnString*>(column_ptrs.back().get()));
393
0
        }
394
395
0
        Impl::execute(data_columns, *assert_cast<ColumnString*>(result_column.get()),
396
0
                      input_rows_count);
397
0
        block.get_by_position(result).column = std::move(result_column);
398
0
        return Status::OK();
399
0
    }
400
};
401
402
template <typename Impl>
403
class FunctionJsonNullable : public IFunction {
404
public:
405
    static constexpr auto name = Impl::name;
406
    static FunctionPtr create() { return std::make_shared<FunctionJsonNullable<Impl>>(); }
407
    String get_name() const override { return name; }
408
    size_t get_number_of_arguments() const override { return 0; }
409
    bool is_variadic() const override { return true; }
410
    DataTypePtr get_return_type_impl(const DataTypes& arguments) const override {
411
        return make_nullable(std::make_shared<DataTypeString>());
412
    }
413
    Status execute_impl(FunctionContext* context, Block& block, const ColumnNumbers& arguments,
414
                        uint32_t result, size_t input_rows_count) const override {
415
        auto result_column = ColumnString::create();
416
        auto null_map = ColumnUInt8::create(input_rows_count, 0);
417
        std::vector<const ColumnString*> data_columns;
418
        std::vector<bool> column_is_consts;
419
        for (int i = 0; i < arguments.size(); i++) {
420
            ColumnPtr arg_col;
421
            bool arg_const;
422
            std::tie(arg_col, arg_const) =
423
                    unpack_if_const(block.get_by_position(arguments[i]).column);
424
            column_is_consts.push_back(arg_const);
425
            data_columns.push_back(assert_cast<const ColumnString*>(arg_col.get()));
426
        }
427
        Impl::execute(data_columns, *assert_cast<ColumnString*>(result_column.get()),
428
                      null_map->get_data(), input_rows_count, column_is_consts);
429
        block.replace_by_position(
430
                result, ColumnNullable::create(std::move(result_column), std::move(null_map)));
431
        return Status::OK();
432
    }
433
};
434
435
class FunctionJsonValid : public IFunction {
436
public:
437
    static constexpr auto name = "json_valid";
438
2
    static FunctionPtr create() { return std::make_shared<FunctionJsonValid>(); }
439
440
1
    String get_name() const override { return name; }
441
442
0
    size_t get_number_of_arguments() const override { return 1; }
443
444
0
    DataTypePtr get_return_type_impl(const DataTypes& arguments) const override {
445
0
        return make_nullable(std::make_shared<DataTypeInt32>());
446
0
    }
447
448
0
    bool use_default_implementation_for_nulls() const override { return false; }
449
450
    Status execute_impl(FunctionContext* context, Block& block, const ColumnNumbers& arguments,
451
0
                        uint32_t result, size_t input_rows_count) const override {
452
0
        const IColumn& col_from = *(block.get_by_position(arguments[0]).column);
453
454
0
        auto null_map = ColumnUInt8::create(input_rows_count, 0);
455
456
0
        const ColumnUInt8::Container* input_null_map = nullptr;
457
0
        const ColumnString* col_from_string = nullptr;
458
0
        if (const auto* nullable = check_and_get_column<ColumnNullable>(col_from)) {
459
0
            input_null_map = &nullable->get_null_map_data();
460
0
            col_from_string =
461
0
                    check_and_get_column<ColumnString>(*nullable->get_nested_column_ptr());
462
0
        } else {
463
0
            col_from_string = check_and_get_column<ColumnString>(col_from);
464
0
        }
465
466
0
        if (!col_from_string) {
467
0
            return Status::RuntimeError("Illegal column {} should be ColumnString",
468
0
                                        col_from.get_name());
469
0
        }
470
471
0
        auto col_to = ColumnInt32::create();
472
0
        auto& vec_to = col_to->get_data();
473
0
        size_t size = col_from.size();
474
0
        vec_to.resize(size);
475
476
        // parser can be reused for performance
477
478
0
        auto input_type = block.get_by_position(arguments[0]).type->get_primitive_type();
479
480
0
        if (input_type == PrimitiveType::TYPE_VARCHAR || input_type == PrimitiveType::TYPE_CHAR ||
481
0
            input_type == PrimitiveType::TYPE_STRING) {
482
0
            JsonBinaryValue jsonb_value;
483
0
            for (size_t i = 0; i < input_rows_count; ++i) {
484
0
                if (input_null_map && (*input_null_map)[i]) {
485
0
                    null_map->get_data()[i] = 1;
486
0
                    vec_to[i] = 0;
487
0
                    continue;
488
0
                }
489
490
0
                const auto& val = col_from_string->get_data_at(i);
491
0
                if (jsonb_value.from_json_string(val.data, cast_set<unsigned int>(val.size)).ok()) {
492
0
                    vec_to[i] = 1;
493
0
                } else {
494
0
                    vec_to[i] = 0;
495
0
                }
496
0
            }
497
498
0
        } else {
499
0
            DCHECK(input_type == PrimitiveType::TYPE_JSONB);
500
0
            for (size_t i = 0; i < input_rows_count; ++i) {
501
0
                if (input_null_map && (*input_null_map)[i]) {
502
0
                    null_map->get_data()[i] = 1;
503
0
                    vec_to[i] = 0;
504
0
                    continue;
505
0
                }
506
0
                const auto& val = col_from_string->get_data_at(i);
507
0
                if (val.size == 0) {
508
0
                    vec_to[i] = 0;
509
0
                    continue;
510
0
                }
511
0
                const JsonbDocument* doc = nullptr;
512
0
                auto st = JsonbDocument::checkAndCreateDocument(val.data, val.size, &doc);
513
0
                if (!st.ok() || !doc || !doc->getValue()) [[unlikely]] {
514
0
                    vec_to[i] = 0;
515
0
                    continue;
516
0
                }
517
0
                const JsonbValue* value = doc->getValue();
518
0
                if (UNLIKELY(!value)) {
519
0
                    vec_to[i] = 0;
520
0
                    continue;
521
0
                }
522
0
                vec_to[i] = 1;
523
0
            }
524
0
        }
525
526
0
        block.replace_by_position(result,
527
0
                                  ColumnNullable::create(std::move(col_to), std::move(null_map)));
528
529
0
        return Status::OK();
530
0
    }
531
};
532
class FunctionJsonUnquote : public IFunction {
533
public:
534
    static constexpr auto name = "json_unquote";
535
2
    static FunctionPtr create() { return std::make_shared<FunctionJsonUnquote>(); }
536
537
1
    String get_name() const override { return name; }
538
539
0
    size_t get_number_of_arguments() const override { return 1; }
540
541
0
    DataTypePtr get_return_type_impl(const DataTypes& arguments) const override {
542
0
        return make_nullable(std::make_shared<DataTypeString>());
543
0
    }
544
545
0
    bool use_default_implementation_for_nulls() const override { return false; }
546
547
    Status execute_impl(FunctionContext* context, Block& block, const ColumnNumbers& arguments,
548
0
                        uint32_t result, size_t input_rows_count) const override {
549
0
        const IColumn& col_from = *(block.get_by_position(arguments[0]).column);
550
551
0
        auto null_map = ColumnUInt8::create(input_rows_count, 0);
552
553
0
        const ColumnString* col_from_string = check_and_get_column<ColumnString>(col_from);
554
0
        if (auto* nullable = check_and_get_column<ColumnNullable>(col_from)) {
555
0
            col_from_string =
556
0
                    check_and_get_column<ColumnString>(*nullable->get_nested_column_ptr());
557
0
        }
558
559
0
        if (!col_from_string) {
560
0
            return Status::RuntimeError("Illegal column {} should be ColumnString",
561
0
                                        col_from.get_name());
562
0
        }
563
564
0
        auto col_to = ColumnString::create();
565
0
        col_to->reserve(input_rows_count);
566
567
        // parser can be reused for performance
568
0
        rapidjson::Document document;
569
0
        for (size_t i = 0; i < input_rows_count; ++i) {
570
0
            if (col_from.is_null_at(i)) {
571
0
                null_map->get_data()[i] = 1;
572
0
                col_to->insert_data(nullptr, 0);
573
0
                continue;
574
0
            }
575
576
0
            const auto& json_str = col_from_string->get_data_at(i);
577
0
            if (json_str.size < 2 || json_str.data[0] != '"' ||
578
0
                json_str.data[json_str.size - 1] != '"') {
579
                // non-quoted string
580
0
                col_to->insert_data(json_str.data, json_str.size);
581
0
            } else {
582
0
                document.Parse(json_str.data, json_str.size);
583
0
                if (document.HasParseError() || !document.IsString()) {
584
0
                    return Status::RuntimeError(
585
0
                            fmt::format("Invalid JSON text in argument 1 to function {}: {}", name,
586
0
                                        std::string_view(json_str.data, json_str.size)));
587
0
                }
588
0
                col_to->insert_data(document.GetString(), document.GetStringLength());
589
0
            }
590
0
        }
591
592
0
        block.replace_by_position(result,
593
0
                                  ColumnNullable::create(std::move(col_to), std::move(null_map)));
594
595
0
        return Status::OK();
596
0
    }
597
};
598
599
1
// Registers the JSON functions defined in this file with `factory`:
// json_unquote, json_quote and json_valid.
void register_function_json(SimpleFunctionFactory& factory) {
    using JsonQuoteFunction = FunctionJson<FunctionJsonQuoteImpl>;

    factory.register_function<FunctionJsonUnquote>();
    factory.register_function<JsonQuoteFunction>();
    factory.register_function<FunctionJsonValid>();
}
606
607
} // namespace doris