Coverage Report

Created: 2026-03-13 05:13

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
be/src/util/json/json_parser.cpp
Line
Count
Source
1
// Licensed to the Apache Software Foundation (ASF) under one
2
// or more contributor license agreements.  See the NOTICE file
3
// distributed with this work for additional information
4
// regarding copyright ownership.  The ASF licenses this file
5
// to you under the Apache License, Version 2.0 (the
6
// "License"); you may not use this file except in compliance
7
// with the License.  You may obtain a copy of the License at
8
//
9
//   http://www.apache.org/licenses/LICENSE-2.0
10
//
11
// Unless required by applicable law or agreed to in writing,
12
// software distributed under the License is distributed on an
13
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14
// KIND, either express or implied.  See the License for the
15
// specific language governing permissions and limitations
16
// under the License.
17
// This file is copied from
18
// https://github.com/ClickHouse/ClickHouse/blob/master/src/Common/JSONParsers/SimdJSONParser.cpp
19
// and modified by Doris
20
21
#include "util/json/json_parser.h"
22
23
#include <fmt/format.h>
24
#include <glog/logging.h>
25
26
#include <algorithm>
27
#include <cassert>
28
#include <string_view>
29
30
#include "common/cast_set.h"
31
// IWYU pragma: keep
32
#include "common/status.h"
33
#include "util/json/path_in_data.h"
34
#include "util/json/simd_json_parser.h"
35
36
namespace doris {
37
#include "common/compile_check_begin.h"
38
39
template <typename ParserImpl>
40
std::optional<ParseResult> JSONDataParser<ParserImpl>::parse(const char* begin, size_t length,
41
1.35M
                                                             const ParseConfig& config) {
42
1.35M
    Element document;
43
1.35M
    if (!parser.parse(begin, length, document)) {
44
666
        return {};
45
666
    }
46
1.35M
    ParseContext context;
47
    // enable_flatten_nested controls nested path traversal
48
    // NestedGroup expansion is now handled at storage layer
49
1.35M
    context.enable_flatten_nested = config.enable_flatten_nested;
50
1.35M
    context.is_top_array = document.isArray();
51
1.35M
    traverse(document, context);
52
1.35M
    ParseResult result;
53
1.35M
    result.values = std::move(context.values);
54
1.35M
    result.paths.reserve(context.paths.size());
55
19.1M
    for (auto&& path : context.paths) {
56
19.1M
        result.paths.emplace_back(std::move(path));
57
19.1M
    }
58
1.35M
    return result;
59
1.35M
}
60
61
template <typename ParserImpl>
62
22.3M
void JSONDataParser<ParserImpl>::traverse(const Element& element, ParseContext& ctx) {
63
    // checkStackSize();
64
22.3M
    if (element.isObject()) {
65
2.03M
        traverseObject(element.getObject(), ctx);
66
20.2M
    } else if (element.isArray()) {
67
        // allow nested arrays (multi-level) for NestedGroup; deeper levels are
68
        // handled by VariantNestedBuilder with a max-depth guard.
69
683k
        has_nested = false;
70
683k
        check_has_nested_object(element);
71
683k
        ctx.has_nested_in_flatten = has_nested && ctx.enable_flatten_nested;
72
683k
        if (has_nested && !ctx.enable_flatten_nested) {
73
            // Parse nested arrays to JsonbField
74
46.9k
            JsonbWriter writer;
75
46.9k
            traverseArrayAsJsonb(element.getArray(), writer);
76
46.9k
            ctx.paths.push_back(ctx.builder.get_parts());
77
46.9k
            ctx.values.push_back(Field::create_field<TYPE_JSONB>(
78
46.9k
                    JsonbField(writer.getOutput()->getBuffer(), writer.getOutput()->getSize())));
79
636k
        } else {
80
636k
            traverseArray(element.getArray(), ctx);
81
636k
        }
82
        // we should set has_nested_in_flatten to false when traverse array finished for next array otherwise it will be true for next array
83
683k
        ctx.has_nested_in_flatten = false;
84
19.6M
    } else {
85
19.6M
        ctx.paths.push_back(ctx.builder.get_parts());
86
19.6M
        ctx.values.push_back(getValueAsField(element));
87
19.6M
    }
88
22.3M
}
89
template <typename ParserImpl>
90
2.03M
void JSONDataParser<ParserImpl>::traverseObject(const JSONObject& object, ParseContext& ctx) {
91
2.03M
    ctx.paths.reserve(ctx.paths.size() + object.size());
92
2.03M
    ctx.values.reserve(ctx.values.size() + object.size());
93
21.4M
    for (auto it = object.begin(); it != object.end(); ++it) {
94
19.3M
        const auto& [key, value] = *it;
95
19.3M
        const size_t max_key_length = cast_set<size_t>(config::variant_max_json_key_length);
96
19.3M
        if (key.size() > max_key_length) {
97
337
            throw doris::Exception(
98
337
                    doris::ErrorCode::INVALID_ARGUMENT,
99
337
                    fmt::format("Key length exceeds maximum allowed size of {} bytes.",
100
337
                                max_key_length));
101
337
        }
102
19.3M
        ctx.builder.append(key, false);
103
19.3M
        traverse(value, ctx);
104
19.3M
        ctx.builder.pop_back();
105
19.3M
    }
106
2.03M
}
107
108
template <typename ParserImpl>
109
2.76M
void JSONDataParser<ParserImpl>::check_has_nested_object(const Element& element) {
110
2.76M
    if (element.isArray()) {
111
843k
        const JSONArray& array = element.getArray();
112
2.93M
        for (auto it = array.begin(); it != array.end(); ++it) {
113
2.08M
            check_has_nested_object(*it);
114
2.08M
        }
115
843k
    }
116
2.76M
    if (element.isObject()) {
117
109k
        has_nested = true;
118
109k
    }
119
2.76M
}
120
121
template <typename ParserImpl>
122
651k
void JSONDataParser<ParserImpl>::traverseAsJsonb(const Element& element, JsonbWriter& writer) {
123
651k
    if (element.isObject()) {
124
155k
        traverseObjectAsJsonb(element.getObject(), writer);
125
495k
    } else if (element.isArray()) {
126
20
        traverseArrayAsJsonb(element.getArray(), writer);
127
495k
    } else {
128
495k
        writeValueAsJsonb(element, writer);
129
495k
    }
130
651k
}
131
132
template <typename ParserImpl>
133
void JSONDataParser<ParserImpl>::traverseObjectAsJsonb(const JSONObject& object,
134
155k
                                                       JsonbWriter& writer) {
135
155k
    writer.writeStartObject();
136
725k
    for (auto it = object.begin(); it != object.end(); ++it) {
137
569k
        const auto& [key, value] = *it;
138
569k
        const size_t max_key_length = cast_set<size_t>(config::variant_max_json_key_length);
139
569k
        if (key.size() > max_key_length) {
140
201
            throw doris::Exception(
141
201
                    doris::ErrorCode::INVALID_ARGUMENT,
142
201
                    fmt::format("Key length exceeds maximum allowed size of {} bytes.",
143
201
                                max_key_length));
144
201
        }
145
569k
        writer.writeKey(key.data(), cast_set<uint8_t>(key.size()));
146
569k
        traverseAsJsonb(value, writer);
147
569k
    }
148
155k
    writer.writeEndObject();
149
155k
}
150
151
template <typename ParserImpl>
152
46.9k
void JSONDataParser<ParserImpl>::traverseArrayAsJsonb(const JSONArray& array, JsonbWriter& writer) {
153
46.9k
    writer.writeStartArray();
154
128k
    for (auto it = array.begin(); it != array.end(); ++it) {
155
81.9k
        traverseAsJsonb(*it, writer);
156
81.9k
    }
157
46.9k
    writer.writeEndArray();
158
46.9k
}
159
160
// check isPrefix in PathInData::Parts. like : [{"a": {"c": {"b": 1}}}, {"a": {"c": 2.2}}], "a.c" is prefix of "a.c.b"
161
// return true if prefix is a prefix of parts
162
16
static bool is_prefix(const PathInData::Parts& prefix, const PathInData::Parts& parts) {
163
16
    if (prefix.size() >= parts.size()) {
164
15
        return false;
165
15
    }
166
2
    for (size_t i = 0; i < prefix.size(); ++i) {
167
1
        if (prefix[i].key != parts[i].key) {
168
0
            return false;
169
0
        }
170
1
    }
171
1
    return true;
172
1
}
173
174
template <typename ParserImpl>
175
636k
void JSONDataParser<ParserImpl>::traverseArray(const JSONArray& array, ParseContext& ctx) {
176
    /// Traverse elements of array and collect an array of fields by each path.
177
636k
    ParseArrayContext array_ctx;
178
636k
    array_ctx.has_nested_in_flatten = ctx.has_nested_in_flatten;
179
636k
    array_ctx.is_top_array = ctx.is_top_array;
180
636k
    array_ctx.total_size = array.size();
181
2.04M
    for (auto it = array.begin(); it != array.end(); ++it) {
182
1.40M
        traverseArrayElement(*it, array_ctx);
183
1.40M
        ++array_ctx.current_size;
184
1.40M
    }
185
636k
    auto&& arrays_by_path = array_ctx.arrays_by_path;
186
636k
    if (arrays_by_path.empty()) {
187
19.7k
        ctx.paths.push_back(ctx.builder.get_parts());
188
19.7k
        ctx.values.push_back(Field::create_field<TYPE_ARRAY>(Array()));
189
616k
    } else {
190
616k
        ctx.paths.reserve(ctx.paths.size() + arrays_by_path.size());
191
616k
        ctx.values.reserve(ctx.values.size() + arrays_by_path.size());
192
1.24M
        for (auto it = arrays_by_path.begin(); it != arrays_by_path.end(); ++it) {
193
630k
            auto&& [path, path_array] = it->second;
194
            /// Merge prefix path and path of array element.
195
630k
            ctx.paths.push_back(ctx.builder.append(path, true).get_parts());
196
630k
            ctx.values.push_back(Field::create_field<TYPE_ARRAY>(std::move(path_array)));
197
630k
            ctx.builder.pop_back(path.size());
198
630k
        }
199
616k
    }
200
636k
}
201
202
template <typename ParserImpl>
203
void JSONDataParser<ParserImpl>::traverseArrayElement(const Element& element,
204
1.40M
                                                      ParseArrayContext& ctx) {
205
1.40M
    ParseContext element_ctx;
206
1.40M
    element_ctx.has_nested_in_flatten = ctx.has_nested_in_flatten;
207
1.40M
    element_ctx.is_top_array = ctx.is_top_array;
208
1.40M
    traverse(element, element_ctx);
209
1.40M
    auto& [_, paths, values, flatten_nested, __, is_top_array] = element_ctx;
210
211
1.40M
    if (element_ctx.has_nested_in_flatten && is_top_array) {
212
10
        checkAmbiguousStructure(ctx, paths);
213
10
    }
214
215
1.40M
    size_t size = paths.size();
216
1.40M
    size_t keys_to_update = ctx.arrays_by_path.size();
217
218
2.82M
    for (size_t i = 0; i < size; ++i) {
219
1.41M
        if (values[i].is_null()) {
220
11.8k
            continue;
221
11.8k
        }
222
223
1.40M
        UInt128 hash = PathInData::get_parts_hash(paths[i]);
224
1.40M
        auto found = ctx.arrays_by_path.find(hash);
225
226
1.40M
        if (found != ctx.arrays_by_path.end()) {
227
776k
            handleExistingPath(found->second, paths[i], values[i], ctx, keys_to_update);
228
776k
        } else {
229
630k
            handleNewPath(hash, paths[i], values[i], ctx);
230
630k
        }
231
1.40M
    }
232
233
    // always fill missed values to keep element-level association between keys.
234
1.40M
    if (keys_to_update) {
235
10.4k
        fillMissedValuesInArrays(ctx);
236
10.4k
    }
237
1.40M
}
238
239
// check if the structure of top_array is ambiguous like:
240
// [{"a": {"b": {"c": 1}}}, {"a": {"b": 1}}] a.b is ambiguous
241
template <typename ParserImpl>
242
void JSONDataParser<ParserImpl>::checkAmbiguousStructure(
243
10
        const ParseArrayContext& ctx, const std::vector<PathInData::Parts>& paths) {
244
13
    for (auto&& current_path : paths) {
245
20
        for (auto it = ctx.arrays_by_path.begin(); it != ctx.arrays_by_path.end(); ++it) {
246
8
            auto&& [p, _] = it->second;
247
8
            if (is_prefix(p, current_path) || is_prefix(current_path, p)) {
248
1
                throw doris::Exception(doris::ErrorCode::INVALID_ARGUMENT,
249
1
                                       "Ambiguous structure of top_array nested subcolumns: {}, {}",
250
1
                                       PathInData(p).to_jsonpath(),
251
1
                                       PathInData(current_path).to_jsonpath());
252
1
            }
253
8
        }
254
13
    }
255
10
}
256
257
template <typename ParserImpl>
258
void JSONDataParser<ParserImpl>::handleExistingPath(std::pair<PathInData::Parts, Array>& path_data,
259
                                                    const PathInData::Parts& path, Field& value,
260
                                                    ParseArrayContext& ctx,
261
776k
                                                    size_t& keys_to_update) {
262
776k
    auto& path_array = path_data.second;
263
    // keep arrays aligned for all keys (including top-level arrays).
264
776k
    assert(path_array.size() == ctx.current_size);
265
    // If current element of array is part of Nested,
266
    // collect its size or check it if the size of
267
    // the Nested has been already collected.
268
776k
    auto nested_key = getNameOfNested(path, value);
269
776k
    if (!nested_key.empty()) {
270
0
        size_t array_size = value.get<TYPE_ARRAY>().size();
271
0
        auto& current_nested_sizes = ctx.nested_sizes_by_key[nested_key];
272
0
        if (current_nested_sizes.size() == ctx.current_size) {
273
0
            current_nested_sizes.push_back(array_size);
274
0
        } else if (array_size != current_nested_sizes.back()) {
275
0
            throw doris::Exception(doris::ErrorCode::INTERNAL_ERROR,
276
0
                                   "Array sizes mismatched ({} and {})", array_size,
277
0
                                   current_nested_sizes.back());
278
0
        }
279
0
    }
280
281
776k
    path_array.push_back(std::move(value));
282
776k
    --keys_to_update;
283
776k
}
284
285
template <typename ParserImpl>
286
void JSONDataParser<ParserImpl>::handleNewPath(UInt128 hash, const PathInData::Parts& path,
287
630k
                                               Field& value, ParseArrayContext& ctx) {
288
630k
    Array path_array;
289
630k
    path_array.reserve(ctx.total_size);
290
291
    // always resize to keep alignment.
292
630k
    path_array.resize(ctx.current_size);
293
294
630k
    auto nested_key = getNameOfNested(path, value);
295
630k
    if (!nested_key.empty()) {
296
3
        size_t array_size = value.get<TYPE_ARRAY>().size();
297
3
        auto& current_nested_sizes = ctx.nested_sizes_by_key[nested_key];
298
3
        if (current_nested_sizes.empty()) {
299
2
            current_nested_sizes.resize(ctx.current_size);
300
2
        } else {
301
            // If newly added element is part of the Nested then
302
            // resize its elements to keep correct sizes of Nested arrays.
303
3
            for (size_t j = 0; j < ctx.current_size; ++j) {
304
2
                path_array[j] = Field::create_field<TYPE_ARRAY>(Array(current_nested_sizes[j]));
305
2
            }
306
1
        }
307
3
        if (current_nested_sizes.size() == ctx.current_size) {
308
2
            current_nested_sizes.push_back(array_size);
309
2
        } else if (array_size != current_nested_sizes.back()) {
310
0
            throw doris::Exception(doris::ErrorCode::INTERNAL_ERROR,
311
0
                                   "Array sizes mismatched ({} and {})", array_size,
312
0
                                   current_nested_sizes.back());
313
0
        }
314
3
    }
315
316
630k
    path_array.push_back(std::move(value));
317
630k
    auto& elem = ctx.arrays_by_path[hash];
318
630k
    elem.first = std::move(path);
319
630k
    elem.second = std::move(path_array);
320
630k
}
321
322
template <typename ParserImpl>
323
10.4k
void JSONDataParser<ParserImpl>::fillMissedValuesInArrays(ParseArrayContext& ctx) {
324
21.4k
    for (auto it = ctx.arrays_by_path.begin(); it != ctx.arrays_by_path.end(); ++it) {
325
11.0k
        auto& [path, path_array] = it->second;
326
11.0k
        assert(path_array.size() == ctx.current_size || path_array.size() == ctx.current_size + 1);
327
11.0k
        if (path_array.size() == ctx.current_size) {
328
10.8k
            bool inserted = tryInsertDefaultFromNested(ctx, path, path_array);
329
10.8k
            if (!inserted) {
330
10.8k
                path_array.emplace_back();
331
10.8k
            }
332
10.8k
        }
333
11.0k
    }
334
10.4k
}
335
336
template <typename ParserImpl>
337
bool JSONDataParser<ParserImpl>::tryInsertDefaultFromNested(ParseArrayContext& ctx,
338
                                                            const PathInData::Parts& path,
339
10.8k
                                                            Array& array) {
340
    /// If there is a collected size of current Nested
341
    /// then insert array of this size as a default value.
342
10.8k
    if (path.empty() || array.empty()) {
343
10.2k
        return false;
344
10.2k
    }
345
    /// Last element is not Null, because otherwise this path wouldn't exist.
346
541
    auto nested_key = getNameOfNested(path, array.back());
347
541
    if (nested_key.empty()) {
348
541
        return false;
349
541
    }
350
0
    auto mapped = ctx.nested_sizes_by_key.find(nested_key);
351
0
    if (mapped == ctx.nested_sizes_by_key.end()) {
352
0
        return false;
353
0
    }
354
0
    auto& current_nested_sizes = mapped->second;
355
0
    assert(current_nested_sizes.size() == ctx.current_size ||
356
0
           current_nested_sizes.size() == ctx.current_size + 1);
357
    /// If all keys of Nested were missed then add a zero length.
358
0
    if (current_nested_sizes.size() == ctx.current_size) {
359
0
        current_nested_sizes.push_back(0);
360
0
    }
361
0
    size_t array_size = current_nested_sizes.back();
362
0
    array.push_back(Field::create_field<TYPE_ARRAY>(Array(array_size)));
363
0
    return true;
364
0
}
365
366
template <typename ParserImpl>
367
StringRef JSONDataParser<ParserImpl>::getNameOfNested(const PathInData::Parts& path,
368
1.40M
                                                      const Field& value) {
369
1.40M
    if (value.get_type() != PrimitiveType::TYPE_ARRAY || path.empty()) {
370
1.40M
        return {};
371
1.40M
    }
372
    /// Find first key that is marked as nested,
373
    /// because we may have tuple of Nested and there could be
374
    /// several arrays with the same prefix, but with independent sizes.
375
    /// Consider we have array element with type `k2 Tuple(k3 Nested(...), k5 Nested(...))`
376
    /// Then subcolumns `k2.k3` and `k2.k5` may have indepented sizes and we should extract
377
    /// `k3` and `k5` keys instead of `k2`.
378
242
    for (const auto& part : path) {
379
3
        if (part.is_nested) {
380
3
            return {part.key.data(), part.key.size()};
381
3
        }
382
3
    }
383
239
    return {};
384
242
}
385
386
#include "common/compile_check_end.h"
387
388
template class JSONDataParser<SimdJSONParser>;
389
} // namespace doris