Coverage Report

Created: 2026-05-18 01:25

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
be/src/util/json/json_parser.cpp
Line
Count
Source
1
// Licensed to the Apache Software Foundation (ASF) under one
2
// or more contributor license agreements.  See the NOTICE file
3
// distributed with this work for additional information
4
// regarding copyright ownership.  The ASF licenses this file
5
// to you under the Apache License, Version 2.0 (the
6
// "License"); you may not use this file except in compliance
7
// with the License.  You may obtain a copy of the License at
8
//
9
//   http://www.apache.org/licenses/LICENSE-2.0
10
//
11
// Unless required by applicable law or agreed to in writing,
12
// software distributed under the License is distributed on an
13
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14
// KIND, either express or implied.  See the License for the
15
// specific language governing permissions and limitations
16
// under the License.
17
// This file is copied from
18
// https://github.com/ClickHouse/ClickHouse/blob/master/src/Common/JSONParsers/SimdJSONParser.cpp
19
// and modified by Doris
20
21
#include "util/json/json_parser.h"
22
23
#include <fmt/format.h>
24
#include <glog/logging.h>
25
26
#include <algorithm>
27
#include <cassert>
28
#include <string_view>
29
#include <vector>
30
31
#include "common/cast_set.h"
32
// IWYU pragma: keep
33
#include "common/status.h"
34
#include "util/json/path_in_data.h"
35
#include "util/json/simd_json_parser.h"
36
37
namespace doris {
38
39
template <typename ParserImpl>
40
std::optional<ParseResult> JSONDataParser<ParserImpl>::parse(const char* begin, size_t length,
41
80.6k
                                                             const ParseConfig& config) {
42
80.6k
    Element document;
43
80.6k
    const bool preserve_raw_numbers =
44
80.6k
            !config.preserve_decimal_number_paths.empty() ||
45
80.6k
            static_cast<bool>(config.preserve_decimal_number_path_matcher);
46
80.6k
    if (!parser.parse(begin, length, document, preserve_raw_numbers)) {
47
15
        return {};
48
15
    }
49
80.6k
    ParseContext context;
50
    // deprecated_enable_flatten_nested controls nested path traversal
51
    // NestedGroup expansion is now handled at storage layer
52
80.6k
    context.deprecated_enable_flatten_nested = config.deprecated_enable_flatten_nested;
53
80.6k
    context.check_duplicate_json_path = config.check_duplicate_json_path;
54
80.6k
    context.is_top_array = document.isArray();
55
80.6k
    context.preserve_decimal_number_paths = &config.preserve_decimal_number_paths;
56
80.6k
    context.preserve_decimal_number_path_matcher = &config.preserve_decimal_number_path_matcher;
57
80.6k
    traverse(document, context);
58
80.6k
    ParseResult result;
59
80.6k
    result.values = std::move(context.values);
60
80.6k
    result.paths.reserve(context.paths.size());
61
1.35M
    for (auto&& path : context.paths) {
62
1.35M
        result.paths.emplace_back(std::move(path));
63
1.35M
    }
64
80.6k
    return result;
65
80.6k
}
66
67
template <typename ParserImpl>
68
2.08M
void JSONDataParser<ParserImpl>::traverse(const Element& element, ParseContext& ctx) {
69
    // checkStackSize();
70
2.08M
    if (element.isObject()) {
71
144k
        traverseObject(element.getObject(), ctx);
72
1.93M
    } else if (element.isArray()) {
73
        // allow nested arrays (multi-level) for NestedGroup; deeper levels are
74
        // handled by VariantNestedBuilder with a max-depth guard.
75
55.3k
        has_nested = false;
76
55.3k
        check_has_nested_object(element);
77
55.3k
        ctx.has_nested_in_flatten = has_nested && ctx.deprecated_enable_flatten_nested;
78
55.3k
        if (has_nested && !ctx.deprecated_enable_flatten_nested) {
79
            // Parse nested arrays to JsonbField
80
209
            JsonbWriter writer;
81
209
            traverseArrayAsJsonb(element.getArray(), writer);
82
209
            appendValueIfNotDuplicate(
83
209
                    ctx, ctx.builder.get_parts(),
84
209
                    Field::create_field<TYPE_JSONB>(JsonbField(writer.getOutput()->getBuffer(),
85
209
                                                               writer.getOutput()->getSize())));
86
55.1k
        } else {
87
55.1k
            traverseArray(element.getArray(), ctx);
88
55.1k
        }
89
        // we should set has_nested_in_flatten to false when traverse array finished for next array otherwise it will be true for next array
90
55.3k
        ctx.has_nested_in_flatten = false;
91
1.88M
    } else {
92
1.88M
        appendValueIfNotDuplicate(ctx, ctx.builder.get_parts(),
93
1.88M
                                  getValueAsField(element, shouldPreserveNumberAsString(ctx)));
94
1.88M
    }
95
2.08M
}
96
97
template <typename ParserImpl>
98
1.88M
bool JSONDataParser<ParserImpl>::shouldPreserveNumberAsString(const ParseContext& ctx) const {
99
1.88M
    const bool has_exact_paths = ctx.preserve_decimal_number_paths != nullptr &&
100
1.88M
                                 !ctx.preserve_decimal_number_paths->empty();
101
1.88M
    const bool has_path_matcher = ctx.preserve_decimal_number_path_matcher != nullptr &&
102
1.88M
                                  *ctx.preserve_decimal_number_path_matcher;
103
1.88M
    if (!has_exact_paths && !has_path_matcher) {
104
1.88M
        return false;
105
1.88M
    }
106
23
    PathInData::Parts path = ctx.path_prefix_for_typed_paths;
107
23
    const auto& current_parts = ctx.builder.get_parts();
108
23
    path.insert(path.end(), current_parts.begin(), current_parts.end());
109
23
    const auto current_path = PathInData(path).get_path();
110
23
    if (has_exact_paths && ctx.preserve_decimal_number_paths->find(current_path) !=
111
21
                                   ctx.preserve_decimal_number_paths->end()) {
112
18
        return true;
113
18
    }
114
5
    return has_path_matcher && (*ctx.preserve_decimal_number_path_matcher)(current_path);
115
23
}
116
117
template <typename ParserImpl>
118
void JSONDataParser<ParserImpl>::appendValueIfNotDuplicate(ParseContext& ctx,
119
                                                           const PathInData::Parts& path,
120
1.94M
                                                           Field&& value) {
121
1.94M
    if (ctx.check_duplicate_json_path) {
122
32
        PathInData path_in_data(path);
123
32
        if (!ctx.visited_path_names.emplace(path_in_data.get_path()).second) {
124
10
            return;
125
10
        }
126
32
    }
127
1.94M
    ctx.paths.push_back(path);
128
1.94M
    ctx.values.push_back(std::move(value));
129
1.94M
}
130
131
template <typename ParserImpl>
132
144k
void JSONDataParser<ParserImpl>::traverseObject(const JSONObject& object, ParseContext& ctx) {
133
144k
    ctx.paths.reserve(ctx.paths.size() + object.size());
134
144k
    ctx.values.reserve(ctx.values.size() + object.size());
135
1.42M
    auto check_key_length = [](const auto& key) {
136
1.42M
        const size_t max_key_length = cast_set<size_t>(config::variant_max_json_key_length);
137
1.42M
        if (key.size() > max_key_length) {
138
336
            throw doris::Exception(
139
336
                    doris::ErrorCode::INVALID_ARGUMENT,
140
336
                    fmt::format("Key length exceeds maximum allowed size of {} bytes.",
141
336
                                max_key_length));
142
336
        }
143
1.42M
    };
144
1.42M
    auto traverse_object_member = [&](const auto& key, const auto& value) {
145
1.42M
        check_key_length(key);
146
1.42M
        ctx.builder.append(key, false);
147
1.42M
        traverse(value, ctx);
148
1.42M
        ctx.builder.pop_back();
149
1.42M
    };
150
151
1.56M
    for (auto it = object.begin(); it != object.end(); ++it) {
152
1.42M
        const auto& [key, value] = *it;
153
1.42M
        traverse_object_member(key, value);
154
1.42M
    }
155
144k
}
156
157
template <typename ParserImpl>
158
1.12M
void JSONDataParser<ParserImpl>::check_has_nested_object(const Element& element) {
159
1.12M
    if (element.isArray()) {
160
105k
        const JSONArray& array = element.getArray();
161
1.17M
        for (auto it = array.begin(); it != array.end(); ++it) {
162
1.06M
            check_has_nested_object(*it);
163
1.06M
        }
164
105k
    }
165
1.12M
    if (element.isObject()) {
166
23.0k
        has_nested = true;
167
23.0k
    }
168
1.12M
}
169
170
template <typename ParserImpl>
171
4.31k
void JSONDataParser<ParserImpl>::traverseAsJsonb(const Element& element, JsonbWriter& writer) {
172
4.31k
    if (element.isObject()) {
173
1.49k
        traverseObjectAsJsonb(element.getObject(), writer);
174
2.82k
    } else if (element.isArray()) {
175
7
        traverseArrayAsJsonb(element.getArray(), writer);
176
2.81k
    } else {
177
2.81k
        writeValueAsJsonb(element, writer);
178
2.81k
    }
179
4.31k
}
180
181
template <typename ParserImpl>
182
void JSONDataParser<ParserImpl>::traverseObjectAsJsonb(const JSONObject& object,
183
1.49k
                                                       JsonbWriter& writer) {
184
1.49k
    writer.writeStartObject();
185
4.93k
    for (auto it = object.begin(); it != object.end(); ++it) {
186
3.64k
        const auto& [key, value] = *it;
187
3.64k
        const size_t max_key_length = cast_set<size_t>(config::variant_max_json_key_length);
188
3.64k
        if (key.size() > max_key_length) {
189
201
            throw doris::Exception(
190
201
                    doris::ErrorCode::INVALID_ARGUMENT,
191
201
                    fmt::format("Key length exceeds maximum allowed size of {} bytes.",
192
201
                                max_key_length));
193
201
        }
194
3.44k
        writer.writeKey(key.data(), cast_set<uint8_t>(key.size()));
195
3.44k
        traverseAsJsonb(value, writer);
196
3.44k
    }
197
1.28k
    writer.writeEndObject();
198
1.28k
}
199
200
template <typename ParserImpl>
201
216
void JSONDataParser<ParserImpl>::traverseArrayAsJsonb(const JSONArray& array, JsonbWriter& writer) {
202
216
    writer.writeStartArray();
203
1.08k
    for (auto it = array.begin(); it != array.end(); ++it) {
204
873
        traverseAsJsonb(*it, writer);
205
873
    }
206
216
    writer.writeEndArray();
207
216
}
208
209
// check isPrefix in PathInData::Parts. like : [{"a": {"c": {"b": 1}}}, {"a": {"c": 2.2}}], "a.c" is prefix of "a.c.b"
210
// return true if prefix is a prefix of parts
211
6
static bool is_prefix(const PathInData::Parts& prefix, const PathInData::Parts& parts) {
212
6
    if (prefix.size() >= parts.size()) {
213
5
        return false;
214
5
    }
215
2
    for (size_t i = 0; i < prefix.size(); ++i) {
216
1
        if (prefix[i].key != parts[i].key) {
217
0
            return false;
218
0
        }
219
1
    }
220
1
    return true;
221
1
}
222
223
template <typename ParserImpl>
224
55.1k
void JSONDataParser<ParserImpl>::traverseArray(const JSONArray& array, ParseContext& ctx) {
225
    /// Traverse elements of array and collect an array of fields by each path.
226
55.1k
    ParseArrayContext array_ctx;
227
55.1k
    array_ctx.has_nested_in_flatten = ctx.has_nested_in_flatten;
228
55.1k
    array_ctx.is_top_array = ctx.is_top_array;
229
55.1k
    array_ctx.check_duplicate_json_path = ctx.check_duplicate_json_path;
230
55.1k
    array_ctx.preserve_decimal_number_paths = ctx.preserve_decimal_number_paths;
231
55.1k
    array_ctx.preserve_decimal_number_path_matcher = ctx.preserve_decimal_number_path_matcher;
232
55.1k
    array_ctx.path_prefix_for_typed_paths = ctx.path_prefix_for_typed_paths;
233
55.1k
    const auto& current_parts = ctx.builder.get_parts();
234
55.1k
    array_ctx.path_prefix_for_typed_paths.insert(array_ctx.path_prefix_for_typed_paths.end(),
235
55.1k
                                                 current_parts.begin(), current_parts.end());
236
55.1k
    array_ctx.total_size = array.size();
237
633k
    for (auto it = array.begin(); it != array.end(); ++it) {
238
577k
        traverseArrayElement(*it, array_ctx);
239
577k
        ++array_ctx.current_size;
240
577k
    }
241
55.1k
    auto&& arrays_by_path = array_ctx.arrays_by_path;
242
55.1k
    if (arrays_by_path.empty()) {
243
4
        appendValueIfNotDuplicate(ctx, ctx.builder.get_parts(),
244
4
                                  Field::create_field<TYPE_ARRAY>(Array()));
245
55.1k
    } else {
246
55.1k
        ctx.paths.reserve(ctx.paths.size() + arrays_by_path.size());
247
55.1k
        ctx.values.reserve(ctx.values.size() + arrays_by_path.size());
248
113k
        for (auto it = arrays_by_path.begin(); it != arrays_by_path.end(); ++it) {
249
58.1k
            auto&& [path, path_array] = it->second;
250
            /// Merge prefix path and path of array element.
251
58.1k
            ctx.builder.append(path, true);
252
58.1k
            appendValueIfNotDuplicate(ctx, ctx.builder.get_parts(),
253
58.1k
                                      Field::create_field<TYPE_ARRAY>(std::move(path_array)));
254
58.1k
            ctx.builder.pop_back(path.size());
255
58.1k
        }
256
55.1k
    }
257
55.1k
}
258
259
template <typename ParserImpl>
260
void JSONDataParser<ParserImpl>::traverseArrayElement(const Element& element,
261
577k
                                                      ParseArrayContext& ctx) {
262
577k
    ParseContext element_ctx;
263
577k
    element_ctx.has_nested_in_flatten = ctx.has_nested_in_flatten;
264
577k
    element_ctx.is_top_array = ctx.is_top_array;
265
577k
    element_ctx.check_duplicate_json_path = ctx.check_duplicate_json_path;
266
577k
    element_ctx.preserve_decimal_number_paths = ctx.preserve_decimal_number_paths;
267
577k
    element_ctx.preserve_decimal_number_path_matcher = ctx.preserve_decimal_number_path_matcher;
268
577k
    element_ctx.path_prefix_for_typed_paths = ctx.path_prefix_for_typed_paths;
269
577k
    traverse(element, element_ctx);
270
577k
    auto& paths = element_ctx.paths;
271
577k
    auto& values = element_ctx.values;
272
273
577k
    if (element_ctx.has_nested_in_flatten && element_ctx.is_top_array) {
274
6
        checkAmbiguousStructure(ctx, paths);
275
6
    }
276
277
577k
    size_t size = paths.size();
278
577k
    size_t keys_to_update = ctx.arrays_by_path.size();
279
280
1.15M
    for (size_t i = 0; i < size; ++i) {
281
580k
        if (values[i].is_null()) {
282
11.7k
            continue;
283
11.7k
        }
284
285
569k
        UInt128 hash = PathInData::get_parts_hash(paths[i]);
286
569k
        auto found = ctx.arrays_by_path.find(hash);
287
288
569k
        if (found != ctx.arrays_by_path.end()) {
289
510k
            handleExistingPath(found->second, paths[i], values[i], ctx, keys_to_update);
290
510k
        } else {
291
58.1k
            handleNewPath(hash, paths[i], values[i], ctx);
292
58.1k
        }
293
569k
    }
294
295
    // always fill missed values to keep element-level association between keys.
296
577k
    if (keys_to_update) {
297
10.2k
        fillMissedValuesInArrays(ctx);
298
10.2k
    }
299
577k
}
300
301
// check if the structure of top_array is ambiguous like:
302
// [{"a": {"b": {"c": 1}}}, {"a": {"b": 1}}] a.b is ambiguous
303
template <typename ParserImpl>
304
void JSONDataParser<ParserImpl>::checkAmbiguousStructure(
305
6
        const ParseArrayContext& ctx, const std::vector<PathInData::Parts>& paths) {
306
6
    for (auto&& current_path : paths) {
307
8
        for (auto it = ctx.arrays_by_path.begin(); it != ctx.arrays_by_path.end(); ++it) {
308
3
            auto&& [p, _] = it->second;
309
3
            if (is_prefix(p, current_path) || is_prefix(current_path, p)) {
310
1
                throw doris::Exception(doris::ErrorCode::INVALID_ARGUMENT,
311
1
                                       "Ambiguous structure of top_array nested subcolumns: {}, {}",
312
1
                                       PathInData(p).to_jsonpath(),
313
1
                                       PathInData(current_path).to_jsonpath());
314
1
            }
315
3
        }
316
6
    }
317
6
}
318
319
template <typename ParserImpl>
320
void JSONDataParser<ParserImpl>::handleExistingPath(std::pair<PathInData::Parts, Array>& path_data,
321
                                                    const PathInData::Parts& path, Field& value,
322
                                                    ParseArrayContext& ctx,
323
510k
                                                    size_t& keys_to_update) {
324
510k
    auto& path_array = path_data.second;
325
    // keep arrays aligned for all keys (including top-level arrays).
326
510k
    assert(path_array.size() == ctx.current_size);
327
    // If current element of array is part of Nested,
328
    // collect its size or check it if the size of
329
    // the Nested has been already collected.
330
510k
    auto nested_key = getNameOfNested(path, value);
331
510k
    if (!nested_key.empty()) {
332
0
        size_t array_size = value.get<TYPE_ARRAY>().size();
333
0
        auto& current_nested_sizes = ctx.nested_sizes_by_key[nested_key];
334
0
        if (current_nested_sizes.size() == ctx.current_size) {
335
0
            current_nested_sizes.push_back(array_size);
336
0
        } else if (array_size != current_nested_sizes.back()) {
337
0
            throw doris::Exception(doris::ErrorCode::INTERNAL_ERROR,
338
0
                                   "Array sizes mismatched ({} and {})", array_size,
339
0
                                   current_nested_sizes.back());
340
0
        }
341
0
    }
342
343
510k
    path_array.push_back(std::move(value));
344
510k
    --keys_to_update;
345
510k
}
346
347
template <typename ParserImpl>
348
void JSONDataParser<ParserImpl>::handleNewPath(UInt128 hash, const PathInData::Parts& path,
349
58.1k
                                               Field& value, ParseArrayContext& ctx) {
350
58.1k
    Array path_array;
351
58.1k
    path_array.reserve(ctx.total_size);
352
353
    // always resize to keep alignment.
354
58.1k
    path_array.resize(ctx.current_size);
355
356
58.1k
    auto nested_key = getNameOfNested(path, value);
357
58.1k
    if (!nested_key.empty()) {
358
3
        size_t array_size = value.get<TYPE_ARRAY>().size();
359
3
        auto& current_nested_sizes = ctx.nested_sizes_by_key[nested_key];
360
3
        if (current_nested_sizes.empty()) {
361
2
            current_nested_sizes.resize(ctx.current_size);
362
2
        } else {
363
            // If newly added element is part of the Nested then
364
            // resize its elements to keep correct sizes of Nested arrays.
365
3
            for (size_t j = 0; j < ctx.current_size; ++j) {
366
2
                path_array[j] = Field::create_field<TYPE_ARRAY>(Array(current_nested_sizes[j]));
367
2
            }
368
1
        }
369
3
        if (current_nested_sizes.size() == ctx.current_size) {
370
2
            current_nested_sizes.push_back(array_size);
371
2
        } else if (array_size != current_nested_sizes.back()) {
372
0
            throw doris::Exception(doris::ErrorCode::INTERNAL_ERROR,
373
0
                                   "Array sizes mismatched ({} and {})", array_size,
374
0
                                   current_nested_sizes.back());
375
0
        }
376
3
    }
377
378
58.1k
    path_array.push_back(std::move(value));
379
58.1k
    auto& elem = ctx.arrays_by_path[hash];
380
58.1k
    elem.first = std::move(path);
381
58.1k
    elem.second = std::move(path_array);
382
58.1k
}
383
384
template <typename ParserImpl>
385
10.2k
void JSONDataParser<ParserImpl>::fillMissedValuesInArrays(ParseArrayContext& ctx) {
386
20.5k
    for (auto it = ctx.arrays_by_path.begin(); it != ctx.arrays_by_path.end(); ++it) {
387
10.2k
        auto& [path, path_array] = it->second;
388
10.2k
        assert(path_array.size() == ctx.current_size || path_array.size() == ctx.current_size + 1);
389
10.2k
        if (path_array.size() == ctx.current_size) {
390
10.2k
            bool inserted = tryInsertDefaultFromNested(ctx, path, path_array);
391
10.2k
            if (!inserted) {
392
10.2k
                path_array.emplace_back();
393
10.2k
            }
394
10.2k
        }
395
10.2k
    }
396
10.2k
}
397
398
template <typename ParserImpl>
399
bool JSONDataParser<ParserImpl>::tryInsertDefaultFromNested(ParseArrayContext& ctx,
400
                                                            const PathInData::Parts& path,
401
10.2k
                                                            Array& array) {
402
    /// If there is a collected size of current Nested
403
    /// then insert array of this size as a default value.
404
10.2k
    if (path.empty() || array.empty()) {
405
10.2k
        return false;
406
10.2k
    }
407
    /// Last element is not Null, because otherwise this path wouldn't exist.
408
1
    auto nested_key = getNameOfNested(path, array.back());
409
1
    if (nested_key.empty()) {
410
1
        return false;
411
1
    }
412
0
    auto mapped = ctx.nested_sizes_by_key.find(nested_key);
413
0
    if (mapped == ctx.nested_sizes_by_key.end()) {
414
0
        return false;
415
0
    }
416
0
    auto& current_nested_sizes = mapped->second;
417
0
    assert(current_nested_sizes.size() == ctx.current_size ||
418
0
           current_nested_sizes.size() == ctx.current_size + 1);
419
    /// If all keys of Nested were missed then add a zero length.
420
0
    if (current_nested_sizes.size() == ctx.current_size) {
421
0
        current_nested_sizes.push_back(0);
422
0
    }
423
0
    size_t array_size = current_nested_sizes.back();
424
0
    array.push_back(Field::create_field<TYPE_ARRAY>(Array(array_size)));
425
0
    return true;
426
0
}
427
428
template <typename ParserImpl>
429
StringRef JSONDataParser<ParserImpl>::getNameOfNested(const PathInData::Parts& path,
430
569k
                                                      const Field& value) {
431
569k
    if (value.get_type() != PrimitiveType::TYPE_ARRAY || path.empty()) {
432
569k
        return {};
433
569k
    }
434
    /// Find first key that is marked as nested,
435
    /// because we may have struct of Nested and there could be
436
    /// several arrays with the same prefix, but with independent sizes.
437
    /// Consider we have array element with type `k2 Struct(k3 Nested(...), k5 Nested(...))`
438
    /// Then subcolumns `k2.k3` and `k2.k5` may have indepented sizes and we should extract
439
    /// `k3` and `k5` keys instead of `k2`.
440
3
    for (const auto& part : path) {
441
3
        if (part.is_nested) {
442
3
            return {part.key.data(), part.key.size()};
443
3
        }
444
3
    }
445
0
    return {};
446
3
}
447
448
// Explicit instantiation of JSONDataParser for SimdJSONParser.
template class JSONDataParser<SimdJSONParser>;
449
} // namespace doris