Coverage Report

Created: 2026-04-15 12:22

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
be/src/util/json/json_parser.cpp
Line
Count
Source
1
// Licensed to the Apache Software Foundation (ASF) under one
2
// or more contributor license agreements.  See the NOTICE file
3
// distributed with this work for additional information
4
// regarding copyright ownership.  The ASF licenses this file
5
// to you under the Apache License, Version 2.0 (the
6
// "License"); you may not use this file except in compliance
7
// with the License.  You may obtain a copy of the License at
8
//
9
//   http://www.apache.org/licenses/LICENSE-2.0
10
//
11
// Unless required by applicable law or agreed to in writing,
12
// software distributed under the License is distributed on an
13
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14
// KIND, either express or implied.  See the License for the
15
// specific language governing permissions and limitations
16
// under the License.
17
// This file is copied from
18
// https://github.com/ClickHouse/ClickHouse/blob/master/src/Common/JSONParsers/SimdJSONParser.cpp
19
// and modified by Doris
20
21
#include "util/json/json_parser.h"
22
23
#include <fmt/format.h>
24
#include <glog/logging.h>
25
26
#include <algorithm>
27
#include <cassert>
28
#include <string_view>
29
30
#include "common/cast_set.h"
31
// IWYU pragma: keep
32
#include "common/status.h"
33
#include "util/json/path_in_data.h"
34
#include "util/json/simd_json_parser.h"
35
36
namespace doris {
37
38
template <typename ParserImpl>
39
std::optional<ParseResult> JSONDataParser<ParserImpl>::parse(const char* begin, size_t length,
40
1.43M
                                                             const ParseConfig& config) {
41
1.43M
    Element document;
42
1.43M
    if (!parser.parse(begin, length, document)) {
43
679
        return {};
44
679
    }
45
1.43M
    ParseContext context;
46
    // deprecated_enable_flatten_nested controls nested path traversal
47
    // NestedGroup expansion is now handled at storage layer
48
1.43M
    context.deprecated_enable_flatten_nested = config.deprecated_enable_flatten_nested;
49
1.43M
    context.is_top_array = document.isArray();
50
1.43M
    traverse(document, context);
51
1.43M
    ParseResult result;
52
1.43M
    result.values = std::move(context.values);
53
1.43M
    result.paths.reserve(context.paths.size());
54
20.4M
    for (auto&& path : context.paths) {
55
20.4M
        result.paths.emplace_back(std::move(path));
56
20.4M
    }
57
1.43M
    return result;
58
1.43M
}
59
60
template <typename ParserImpl>
61
24.4M
void JSONDataParser<ParserImpl>::traverse(const Element& element, ParseContext& ctx) {
62
    // checkStackSize();
63
24.4M
    if (element.isObject()) {
64
2.18M
        traverseObject(element.getObject(), ctx);
65
22.2M
    } else if (element.isArray()) {
66
        // allow nested arrays (multi-level) for NestedGroup; deeper levels are
67
        // handled by VariantNestedBuilder with a max-depth guard.
68
735k
        has_nested = false;
69
735k
        check_has_nested_object(element);
70
735k
        ctx.has_nested_in_flatten = has_nested && ctx.deprecated_enable_flatten_nested;
71
735k
        if (has_nested && !ctx.deprecated_enable_flatten_nested) {
72
            // Parse nested arrays to JsonbField
73
47.1k
            JsonbWriter writer;
74
47.1k
            traverseArrayAsJsonb(element.getArray(), writer);
75
47.1k
            ctx.paths.push_back(ctx.builder.get_parts());
76
47.1k
            ctx.values.push_back(Field::create_field<TYPE_JSONB>(
77
47.1k
                    JsonbField(writer.getOutput()->getBuffer(), writer.getOutput()->getSize())));
78
687k
        } else {
79
687k
            traverseArray(element.getArray(), ctx);
80
687k
        }
81
        // we should set has_nested_in_flatten to false when traverse array finished for next array otherwise it will be true for next array
82
735k
        ctx.has_nested_in_flatten = false;
83
21.5M
    } else {
84
21.5M
        ctx.paths.push_back(ctx.builder.get_parts());
85
21.5M
        ctx.values.push_back(getValueAsField(element));
86
21.5M
    }
87
24.4M
}
88
template <typename ParserImpl>
89
2.18M
void JSONDataParser<ParserImpl>::traverseObject(const JSONObject& object, ParseContext& ctx) {
90
2.18M
    ctx.paths.reserve(ctx.paths.size() + object.size());
91
2.18M
    ctx.values.reserve(ctx.values.size() + object.size());
92
23.1M
    for (auto it = object.begin(); it != object.end(); ++it) {
93
21.0M
        const auto& [key, value] = *it;
94
21.0M
        const size_t max_key_length = cast_set<size_t>(config::variant_max_json_key_length);
95
21.0M
        if (key.size() > max_key_length) {
96
673
            throw doris::Exception(
97
673
                    doris::ErrorCode::INVALID_ARGUMENT,
98
673
                    fmt::format("Key length exceeds maximum allowed size of {} bytes.",
99
673
                                max_key_length));
100
673
        }
101
21.0M
        ctx.builder.append(key, false);
102
21.0M
        traverse(value, ctx);
103
21.0M
        ctx.builder.pop_back();
104
21.0M
    }
105
2.18M
}
106
107
template <typename ParserImpl>
108
3.88M
void JSONDataParser<ParserImpl>::check_has_nested_object(const Element& element) {
109
3.88M
    if (element.isArray()) {
110
943k
        const JSONArray& array = element.getArray();
111
4.09M
        for (auto it = array.begin(); it != array.end(); ++it) {
112
3.14M
            check_has_nested_object(*it);
113
3.14M
        }
114
943k
    }
115
3.88M
    if (element.isObject()) {
116
132k
        has_nested = true;
117
132k
    }
118
3.88M
}
119
120
template <typename ParserImpl>
121
655k
void JSONDataParser<ParserImpl>::traverseAsJsonb(const Element& element, JsonbWriter& writer) {
122
655k
    if (element.isObject()) {
123
157k
        traverseObjectAsJsonb(element.getObject(), writer);
124
498k
    } else if (element.isArray()) {
125
27
        traverseArrayAsJsonb(element.getArray(), writer);
126
498k
    } else {
127
498k
        writeValueAsJsonb(element, writer);
128
498k
    }
129
655k
}
130
131
template <typename ParserImpl>
132
void JSONDataParser<ParserImpl>::traverseObjectAsJsonb(const JSONObject& object,
133
157k
                                                       JsonbWriter& writer) {
134
157k
    writer.writeStartObject();
135
730k
    for (auto it = object.begin(); it != object.end(); ++it) {
136
573k
        const auto& [key, value] = *it;
137
573k
        const size_t max_key_length = cast_set<size_t>(config::variant_max_json_key_length);
138
573k
        if (key.size() > max_key_length) {
139
402
            throw doris::Exception(
140
402
                    doris::ErrorCode::INVALID_ARGUMENT,
141
402
                    fmt::format("Key length exceeds maximum allowed size of {} bytes.",
142
402
                                max_key_length));
143
402
        }
144
572k
        writer.writeKey(key.data(), cast_set<uint8_t>(key.size()));
145
572k
        traverseAsJsonb(value, writer);
146
572k
    }
147
156k
    writer.writeEndObject();
148
156k
}
149
150
template <typename ParserImpl>
151
47.2k
void JSONDataParser<ParserImpl>::traverseArrayAsJsonb(const JSONArray& array, JsonbWriter& writer) {
152
47.2k
    writer.writeStartArray();
153
130k
    for (auto it = array.begin(); it != array.end(); ++it) {
154
82.8k
        traverseAsJsonb(*it, writer);
155
82.8k
    }
156
47.2k
    writer.writeEndArray();
157
47.2k
}
158
159
// check isPrefix in PathInData::Parts. like : [{"a": {"c": {"b": 1}}}, {"a": {"c": 2.2}}], "a.c" is prefix of "a.c.b"
160
// return true if prefix is a prefix of parts
161
22
static bool is_prefix(const PathInData::Parts& prefix, const PathInData::Parts& parts) {
162
22
    if (prefix.size() >= parts.size()) {
163
20
        return false;
164
20
    }
165
4
    for (size_t i = 0; i < prefix.size(); ++i) {
166
2
        if (prefix[i].key != parts[i].key) {
167
0
            return false;
168
0
        }
169
2
    }
170
2
    return true;
171
2
}
172
173
template <typename ParserImpl>
174
688k
void JSONDataParser<ParserImpl>::traverseArray(const JSONArray& array, ParseContext& ctx) {
175
    /// Traverse elements of array and collect an array of fields by each path.
176
688k
    ParseArrayContext array_ctx;
177
688k
    array_ctx.has_nested_in_flatten = ctx.has_nested_in_flatten;
178
688k
    array_ctx.is_top_array = ctx.is_top_array;
179
688k
    array_ctx.total_size = array.size();
180
2.66M
    for (auto it = array.begin(); it != array.end(); ++it) {
181
1.98M
        traverseArrayElement(*it, array_ctx);
182
1.98M
        ++array_ctx.current_size;
183
1.98M
    }
184
688k
    auto&& arrays_by_path = array_ctx.arrays_by_path;
185
688k
    if (arrays_by_path.empty()) {
186
19.7k
        ctx.paths.push_back(ctx.builder.get_parts());
187
19.7k
        ctx.values.push_back(Field::create_field<TYPE_ARRAY>(Array()));
188
668k
    } else {
189
668k
        ctx.paths.reserve(ctx.paths.size() + arrays_by_path.size());
190
668k
        ctx.values.reserve(ctx.values.size() + arrays_by_path.size());
191
1.35M
        for (auto it = arrays_by_path.begin(); it != arrays_by_path.end(); ++it) {
192
686k
            auto&& [path, path_array] = it->second;
193
            /// Merge prefix path and path of array element.
194
686k
            ctx.paths.push_back(ctx.builder.append(path, true).get_parts());
195
686k
            ctx.values.push_back(Field::create_field<TYPE_ARRAY>(std::move(path_array)));
196
686k
            ctx.builder.pop_back(path.size());
197
686k
        }
198
668k
    }
199
688k
}
200
201
template <typename ParserImpl>
202
void JSONDataParser<ParserImpl>::traverseArrayElement(const Element& element,
203
1.98M
                                                      ParseArrayContext& ctx) {
204
1.98M
    ParseContext element_ctx;
205
1.98M
    element_ctx.has_nested_in_flatten = ctx.has_nested_in_flatten;
206
1.98M
    element_ctx.is_top_array = ctx.is_top_array;
207
1.98M
    traverse(element, element_ctx);
208
1.98M
    auto& [_, paths, values, deprecated_flatten_nested, __, is_top_array] = element_ctx;
209
210
1.98M
    if (element_ctx.has_nested_in_flatten && is_top_array) {
211
16
        checkAmbiguousStructure(ctx, paths);
212
16
    }
213
214
1.98M
    size_t size = paths.size();
215
1.98M
    size_t keys_to_update = ctx.arrays_by_path.size();
216
217
3.97M
    for (size_t i = 0; i < size; ++i) {
218
1.99M
        if (values[i].is_null()) {
219
23.5k
            continue;
220
23.5k
        }
221
222
1.97M
        UInt128 hash = PathInData::get_parts_hash(paths[i]);
223
1.97M
        auto found = ctx.arrays_by_path.find(hash);
224
225
1.97M
        if (found != ctx.arrays_by_path.end()) {
226
1.28M
            handleExistingPath(found->second, paths[i], values[i], ctx, keys_to_update);
227
1.28M
        } else {
228
685k
            handleNewPath(hash, paths[i], values[i], ctx);
229
685k
        }
230
1.97M
    }
231
232
    // always fill missed values to keep element-level association between keys.
233
1.98M
    if (keys_to_update) {
234
20.6k
        fillMissedValuesInArrays(ctx);
235
20.6k
    }
236
1.98M
}
237
238
// check if the structure of top_array is ambiguous like:
239
// [{"a": {"b": {"c": 1}}}, {"a": {"b": 1}}] a.b is ambiguous
240
template <typename ParserImpl>
241
void JSONDataParser<ParserImpl>::checkAmbiguousStructure(
242
16
        const ParseArrayContext& ctx, const std::vector<PathInData::Parts>& paths) {
243
19
    for (auto&& current_path : paths) {
244
28
        for (auto it = ctx.arrays_by_path.begin(); it != ctx.arrays_by_path.end(); ++it) {
245
11
            auto&& [p, _] = it->second;
246
11
            if (is_prefix(p, current_path) || is_prefix(current_path, p)) {
247
2
                throw doris::Exception(doris::ErrorCode::INVALID_ARGUMENT,
248
2
                                       "Ambiguous structure of top_array nested subcolumns: {}, {}",
249
2
                                       PathInData(p).to_jsonpath(),
250
2
                                       PathInData(current_path).to_jsonpath());
251
2
            }
252
11
        }
253
19
    }
254
16
}
255
256
template <typename ParserImpl>
257
void JSONDataParser<ParserImpl>::handleExistingPath(std::pair<PathInData::Parts, Array>& path_data,
258
                                                    const PathInData::Parts& path, Field& value,
259
                                                    ParseArrayContext& ctx,
260
1.28M
                                                    size_t& keys_to_update) {
261
1.28M
    auto& path_array = path_data.second;
262
    // keep arrays aligned for all keys (including top-level arrays).
263
1.28M
    assert(path_array.size() == ctx.current_size);
264
    // If current element of array is part of Nested,
265
    // collect its size or check it if the size of
266
    // the Nested has been already collected.
267
1.28M
    auto nested_key = getNameOfNested(path, value);
268
1.28M
    if (!nested_key.empty()) {
269
0
        size_t array_size = value.get<TYPE_ARRAY>().size();
270
0
        auto& current_nested_sizes = ctx.nested_sizes_by_key[nested_key];
271
0
        if (current_nested_sizes.size() == ctx.current_size) {
272
0
            current_nested_sizes.push_back(array_size);
273
0
        } else if (array_size != current_nested_sizes.back()) {
274
0
            throw doris::Exception(doris::ErrorCode::INTERNAL_ERROR,
275
0
                                   "Array sizes mismatched ({} and {})", array_size,
276
0
                                   current_nested_sizes.back());
277
0
        }
278
0
    }
279
280
1.28M
    path_array.push_back(std::move(value));
281
1.28M
    --keys_to_update;
282
1.28M
}
283
284
template <typename ParserImpl>
285
void JSONDataParser<ParserImpl>::handleNewPath(UInt128 hash, const PathInData::Parts& path,
286
683k
                                               Field& value, ParseArrayContext& ctx) {
287
683k
    Array path_array;
288
683k
    path_array.reserve(ctx.total_size);
289
290
    // always resize to keep alignment.
291
683k
    path_array.resize(ctx.current_size);
292
293
683k
    auto nested_key = getNameOfNested(path, value);
294
683k
    if (!nested_key.empty()) {
295
6
        size_t array_size = value.get<TYPE_ARRAY>().size();
296
6
        auto& current_nested_sizes = ctx.nested_sizes_by_key[nested_key];
297
6
        if (current_nested_sizes.empty()) {
298
4
            current_nested_sizes.resize(ctx.current_size);
299
4
        } else {
300
            // If newly added element is part of the Nested then
301
            // resize its elements to keep correct sizes of Nested arrays.
302
6
            for (size_t j = 0; j < ctx.current_size; ++j) {
303
4
                path_array[j] = Field::create_field<TYPE_ARRAY>(Array(current_nested_sizes[j]));
304
4
            }
305
2
        }
306
6
        if (current_nested_sizes.size() == ctx.current_size) {
307
4
            current_nested_sizes.push_back(array_size);
308
4
        } else if (array_size != current_nested_sizes.back()) {
309
0
            throw doris::Exception(doris::ErrorCode::INTERNAL_ERROR,
310
0
                                   "Array sizes mismatched ({} and {})", array_size,
311
0
                                   current_nested_sizes.back());
312
0
        }
313
6
    }
314
315
683k
    path_array.push_back(std::move(value));
316
683k
    auto& elem = ctx.arrays_by_path[hash];
317
683k
    elem.first = std::move(path);
318
683k
    elem.second = std::move(path_array);
319
683k
}
320
321
template <typename ParserImpl>
322
20.6k
void JSONDataParser<ParserImpl>::fillMissedValuesInArrays(ParseArrayContext& ctx) {
323
42.0k
    for (auto it = ctx.arrays_by_path.begin(); it != ctx.arrays_by_path.end(); ++it) {
324
21.3k
        auto& [path, path_array] = it->second;
325
21.3k
        assert(path_array.size() == ctx.current_size || path_array.size() == ctx.current_size + 1);
326
21.3k
        if (path_array.size() == ctx.current_size) {
327
21.0k
            bool inserted = tryInsertDefaultFromNested(ctx, path, path_array);
328
21.0k
            if (!inserted) {
329
21.0k
                path_array.emplace_back();
330
21.0k
            }
331
21.0k
        }
332
21.3k
    }
333
20.6k
}
334
335
template <typename ParserImpl>
336
bool JSONDataParser<ParserImpl>::tryInsertDefaultFromNested(ParseArrayContext& ctx,
337
                                                            const PathInData::Parts& path,
338
21.0k
                                                            Array& array) {
339
    /// If there is a collected size of current Nested
340
    /// then insert array of this size as a default value.
341
21.0k
    if (path.empty() || array.empty()) {
342
20.5k
        return false;
343
20.5k
    }
344
    /// Last element is not Null, because otherwise this path wouldn't exist.
345
542
    auto nested_key = getNameOfNested(path, array.back());
346
542
    if (nested_key.empty()) {
347
542
        return false;
348
542
    }
349
0
    auto mapped = ctx.nested_sizes_by_key.find(nested_key);
350
0
    if (mapped == ctx.nested_sizes_by_key.end()) {
351
0
        return false;
352
0
    }
353
0
    auto& current_nested_sizes = mapped->second;
354
0
    assert(current_nested_sizes.size() == ctx.current_size ||
355
0
           current_nested_sizes.size() == ctx.current_size + 1);
356
    /// If all keys of Nested were missed then add a zero length.
357
0
    if (current_nested_sizes.size() == ctx.current_size) {
358
0
        current_nested_sizes.push_back(0);
359
0
    }
360
0
    size_t array_size = current_nested_sizes.back();
361
0
    array.push_back(Field::create_field<TYPE_ARRAY>(Array(array_size)));
362
0
    return true;
363
0
}
364
365
template <typename ParserImpl>
366
StringRef JSONDataParser<ParserImpl>::getNameOfNested(const PathInData::Parts& path,
367
1.97M
                                                      const Field& value) {
368
1.97M
    if (value.get_type() != PrimitiveType::TYPE_ARRAY || path.empty()) {
369
1.97M
        return {};
370
1.97M
    }
371
    /// Find first key that is marked as nested,
372
    /// because we may have tuple of Nested and there could be
373
    /// several arrays with the same prefix, but with independent sizes.
374
    /// Consider we have array element with type `k2 Tuple(k3 Nested(...), k5 Nested(...))`
375
    /// Then subcolumns `k2.k3` and `k2.k5` may have indepented sizes and we should extract
376
    /// `k3` and `k5` keys instead of `k2`.
377
537
    for (const auto& part : path) {
378
6
        if (part.is_nested) {
379
6
            return {part.key.data(), part.key.size()};
380
6
        }
381
6
    }
382
531
    return {};
383
537
}
384
385
template class JSONDataParser<SimdJSONParser>;
386
} // namespace doris