be/src/util/json/json_parser.cpp
Line | Count | Source |
1 | | // Licensed to the Apache Software Foundation (ASF) under one |
2 | | // or more contributor license agreements. See the NOTICE file |
3 | | // distributed with this work for additional information |
4 | | // regarding copyright ownership. The ASF licenses this file |
5 | | // to you under the Apache License, Version 2.0 (the |
6 | | // "License"); you may not use this file except in compliance |
7 | | // with the License. You may obtain a copy of the License at |
8 | | // |
9 | | // http://www.apache.org/licenses/LICENSE-2.0 |
10 | | // |
11 | | // Unless required by applicable law or agreed to in writing, |
12 | | // software distributed under the License is distributed on an |
13 | | // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY |
14 | | // KIND, either express or implied. See the License for the |
15 | | // specific language governing permissions and limitations |
16 | | // under the License. |
17 | | // This file is copied from |
18 | | // https://github.com/ClickHouse/ClickHouse/blob/master/src/Common/JSONParsers/SimdJSONParser.cpp |
19 | | // and modified by Doris |
20 | | |
21 | | #include "util/json/json_parser.h" |
22 | | |
23 | | #include <fmt/format.h> |
24 | | #include <glog/logging.h> |
25 | | |
26 | | #include <algorithm> |
27 | | #include <cassert> |
28 | | #include <string_view> |
29 | | |
30 | | #include "common/cast_set.h" |
31 | | // IWYU pragma: keep |
32 | | #include "common/status.h" |
33 | | #include "util/json/path_in_data.h" |
34 | | #include "util/json/simd_json_parser.h" |
35 | | |
36 | | namespace doris { |
37 | | |
38 | | template <typename ParserImpl> |
39 | | std::optional<ParseResult> JSONDataParser<ParserImpl>::parse(const char* begin, size_t length, |
40 | 1.35M | const ParseConfig& config) { |
41 | 1.35M | Element document; |
42 | 1.35M | if (!parser.parse(begin, length, document)) { |
43 | 666 | return {}; |
44 | 666 | } |
45 | 1.35M | ParseContext context; |
46 | | // deprecated_enable_flatten_nested controls nested path traversal |
47 | | // NestedGroup expansion is now handled at storage layer |
48 | 1.35M | context.deprecated_enable_flatten_nested = config.deprecated_enable_flatten_nested; |
49 | 1.35M | context.is_top_array = document.isArray(); |
50 | 1.35M | traverse(document, context); |
51 | 1.35M | ParseResult result; |
52 | 1.35M | result.values = std::move(context.values); |
53 | 1.35M | result.paths.reserve(context.paths.size()); |
54 | 19.1M | for (auto&& path : context.paths) { |
55 | 19.1M | result.paths.emplace_back(std::move(path)); |
56 | 19.1M | } |
57 | 1.35M | return result; |
58 | 1.35M | } |
59 | | |
// Recursively flatten `element` into (path, value) pairs collected in `ctx`.
// Objects descend key-by-key, arrays are gathered per-path, and scalars are
// recorded at the current path.
template <typename ParserImpl>
void JSONDataParser<ParserImpl>::traverse(const Element& element, ParseContext& ctx) {
    // checkStackSize();
    if (element.isObject()) {
        traverseObject(element.getObject(), ctx);
    } else if (element.isArray()) {
        // allow nested arrays (multi-level) for NestedGroup; deeper levels are
        // handled by VariantNestedBuilder with a max-depth guard.
        // `has_nested` is a member flag: reset it, then scan the array for any
        // object element before deciding how to encode the array.
        has_nested = false;
        check_has_nested_object(element);
        ctx.has_nested_in_flatten = has_nested && ctx.deprecated_enable_flatten_nested;
        if (has_nested && !ctx.deprecated_enable_flatten_nested) {
            // Parse nested arrays to JsonbField: the whole array (objects and
            // all) is serialized as a single JSONB value at the current path.
            JsonbWriter writer;
            traverseArrayAsJsonb(element.getArray(), writer);
            ctx.paths.push_back(ctx.builder.get_parts());
            ctx.values.push_back(Field::create_field<TYPE_JSONB>(
                    JsonbField(writer.getOutput()->getBuffer(), writer.getOutput()->getSize())));
        } else {
            traverseArray(element.getArray(), ctx);
        }
        // we should set has_nested_in_flatten to false when traverse array finished for next array otherwise it will be true for next array
        ctx.has_nested_in_flatten = false;
    } else {
        // Scalar leaf: record the current path and the converted field value.
        ctx.paths.push_back(ctx.builder.get_parts());
        ctx.values.push_back(getValueAsField(element));
    }
}
88 | | template <typename ParserImpl> |
89 | 2.03M | void JSONDataParser<ParserImpl>::traverseObject(const JSONObject& object, ParseContext& ctx) { |
90 | 2.03M | ctx.paths.reserve(ctx.paths.size() + object.size()); |
91 | 2.03M | ctx.values.reserve(ctx.values.size() + object.size()); |
92 | 21.6M | for (auto it = object.begin(); it != object.end(); ++it) { |
93 | 19.5M | const auto& [key, value] = *it; |
94 | 19.5M | const size_t max_key_length = cast_set<size_t>(config::variant_max_json_key_length); |
95 | 19.5M | if (key.size() > max_key_length) { |
96 | 337 | throw doris::Exception( |
97 | 337 | doris::ErrorCode::INVALID_ARGUMENT, |
98 | 337 | fmt::format("Key length exceeds maximum allowed size of {} bytes.", |
99 | 337 | max_key_length)); |
100 | 337 | } |
101 | 19.5M | ctx.builder.append(key, false); |
102 | 19.5M | traverse(value, ctx); |
103 | 19.5M | ctx.builder.pop_back(); |
104 | 19.5M | } |
105 | 2.03M | } |
106 | | |
107 | | template <typename ParserImpl> |
108 | 2.75M | void JSONDataParser<ParserImpl>::check_has_nested_object(const Element& element) { |
109 | 2.75M | if (element.isArray()) { |
110 | 839k | const JSONArray& array = element.getArray(); |
111 | 2.92M | for (auto it = array.begin(); it != array.end(); ++it) { |
112 | 2.08M | check_has_nested_object(*it); |
113 | 2.08M | } |
114 | 839k | } |
115 | 2.75M | if (element.isObject()) { |
116 | 109k | has_nested = true; |
117 | 109k | } |
118 | 2.75M | } |
119 | | |
120 | | template <typename ParserImpl> |
121 | 651k | void JSONDataParser<ParserImpl>::traverseAsJsonb(const Element& element, JsonbWriter& writer) { |
122 | 651k | if (element.isObject()) { |
123 | 155k | traverseObjectAsJsonb(element.getObject(), writer); |
124 | 495k | } else if (element.isArray()) { |
125 | 20 | traverseArrayAsJsonb(element.getArray(), writer); |
126 | 495k | } else { |
127 | 495k | writeValueAsJsonb(element, writer); |
128 | 495k | } |
129 | 651k | } |
130 | | |
131 | | template <typename ParserImpl> |
132 | | void JSONDataParser<ParserImpl>::traverseObjectAsJsonb(const JSONObject& object, |
133 | 155k | JsonbWriter& writer) { |
134 | 155k | writer.writeStartObject(); |
135 | 725k | for (auto it = object.begin(); it != object.end(); ++it) { |
136 | 569k | const auto& [key, value] = *it; |
137 | 569k | const size_t max_key_length = cast_set<size_t>(config::variant_max_json_key_length); |
138 | 569k | if (key.size() > max_key_length) { |
139 | 201 | throw doris::Exception( |
140 | 201 | doris::ErrorCode::INVALID_ARGUMENT, |
141 | 201 | fmt::format("Key length exceeds maximum allowed size of {} bytes.", |
142 | 201 | max_key_length)); |
143 | 201 | } |
144 | 569k | writer.writeKey(key.data(), cast_set<uint8_t>(key.size())); |
145 | 569k | traverseAsJsonb(value, writer); |
146 | 569k | } |
147 | 155k | writer.writeEndObject(); |
148 | 155k | } |
149 | | |
150 | | template <typename ParserImpl> |
151 | 46.9k | void JSONDataParser<ParserImpl>::traverseArrayAsJsonb(const JSONArray& array, JsonbWriter& writer) { |
152 | 46.9k | writer.writeStartArray(); |
153 | 128k | for (auto it = array.begin(); it != array.end(); ++it) { |
154 | 81.9k | traverseAsJsonb(*it, writer); |
155 | 81.9k | } |
156 | 46.9k | writer.writeEndArray(); |
157 | 46.9k | } |
158 | | |
159 | | // check isPrefix in PathInData::Parts. like : [{"a": {"c": {"b": 1}}}, {"a": {"c": 2.2}}], "a.c" is prefix of "a.c.b" |
160 | | // return true if prefix is a prefix of parts |
161 | 16 | static bool is_prefix(const PathInData::Parts& prefix, const PathInData::Parts& parts) { |
162 | 16 | if (prefix.size() >= parts.size()) { |
163 | 15 | return false; |
164 | 15 | } |
165 | 2 | for (size_t i = 0; i < prefix.size(); ++i) { |
166 | 1 | if (prefix[i].key != parts[i].key) { |
167 | 0 | return false; |
168 | 0 | } |
169 | 1 | } |
170 | 1 | return true; |
171 | 1 | } |
172 | | |
173 | | template <typename ParserImpl> |
174 | 634k | void JSONDataParser<ParserImpl>::traverseArray(const JSONArray& array, ParseContext& ctx) { |
175 | | /// Traverse elements of array and collect an array of fields by each path. |
176 | 634k | ParseArrayContext array_ctx; |
177 | 634k | array_ctx.has_nested_in_flatten = ctx.has_nested_in_flatten; |
178 | 634k | array_ctx.is_top_array = ctx.is_top_array; |
179 | 634k | array_ctx.total_size = array.size(); |
180 | 2.03M | for (auto it = array.begin(); it != array.end(); ++it) { |
181 | 1.40M | traverseArrayElement(*it, array_ctx); |
182 | 1.40M | ++array_ctx.current_size; |
183 | 1.40M | } |
184 | 634k | auto&& arrays_by_path = array_ctx.arrays_by_path; |
185 | 634k | if (arrays_by_path.empty()) { |
186 | 19.7k | ctx.paths.push_back(ctx.builder.get_parts()); |
187 | 19.7k | ctx.values.push_back(Field::create_field<TYPE_ARRAY>(Array())); |
188 | 614k | } else { |
189 | 614k | ctx.paths.reserve(ctx.paths.size() + arrays_by_path.size()); |
190 | 614k | ctx.values.reserve(ctx.values.size() + arrays_by_path.size()); |
191 | 1.24M | for (auto it = arrays_by_path.begin(); it != arrays_by_path.end(); ++it) { |
192 | 629k | auto&& [path, path_array] = it->second; |
193 | | /// Merge prefix path and path of array element. |
194 | 629k | ctx.paths.push_back(ctx.builder.append(path, true).get_parts()); |
195 | 629k | ctx.values.push_back(Field::create_field<TYPE_ARRAY>(std::move(path_array))); |
196 | 629k | ctx.builder.pop_back(path.size()); |
197 | 629k | } |
198 | 614k | } |
199 | 634k | } |
200 | | |
// Traverse one array element in its own ParseContext, then merge the
// resulting (path, value) pairs into the per-path arrays of `ctx`, keeping
// all per-path arrays aligned element-by-element.
template <typename ParserImpl>
void JSONDataParser<ParserImpl>::traverseArrayElement(const Element& element,
                                                      ParseArrayContext& ctx) {
    ParseContext element_ctx;
    element_ctx.has_nested_in_flatten = ctx.has_nested_in_flatten;
    element_ctx.is_top_array = ctx.is_top_array;
    traverse(element, element_ctx);
    // Structured binding over the ParseContext members; the names/order here
    // must match the member declaration order of ParseContext.
    auto& [_, paths, values, deprecated_flatten_nested, __, is_top_array] = element_ctx;

    if (element_ctx.has_nested_in_flatten && is_top_array) {
        // Flattened nested subcolumns of a top-level array must not be
        // prefixes of one another (ambiguous structure).
        checkAmbiguousStructure(ctx, paths);
    }

    size_t size = paths.size();
    // Number of already-known paths that have not yet received a value from
    // this element; decremented by handleExistingPath.
    size_t keys_to_update = ctx.arrays_by_path.size();

    for (size_t i = 0; i < size; ++i) {
        // Null values are skipped here; fillMissedValuesInArrays appends the
        // default for any path this element did not populate.
        if (values[i].is_null()) {
            continue;
        }

        UInt128 hash = PathInData::get_parts_hash(paths[i]);
        auto found = ctx.arrays_by_path.find(hash);

        if (found != ctx.arrays_by_path.end()) {
            handleExistingPath(found->second, paths[i], values[i], ctx, keys_to_update);
        } else {
            handleNewPath(hash, paths[i], values[i], ctx);
        }
    }

    // always fill missed values to keep element-level association between keys.
    if (keys_to_update) {
        fillMissedValuesInArrays(ctx);
    }
}
237 | | |
238 | | // check if the structure of top_array is ambiguous like: |
239 | | // [{"a": {"b": {"c": 1}}}, {"a": {"b": 1}}] a.b is ambiguous |
240 | | template <typename ParserImpl> |
241 | | void JSONDataParser<ParserImpl>::checkAmbiguousStructure( |
242 | 10 | const ParseArrayContext& ctx, const std::vector<PathInData::Parts>& paths) { |
243 | 13 | for (auto&& current_path : paths) { |
244 | 20 | for (auto it = ctx.arrays_by_path.begin(); it != ctx.arrays_by_path.end(); ++it) { |
245 | 8 | auto&& [p, _] = it->second; |
246 | 8 | if (is_prefix(p, current_path) || is_prefix(current_path, p)) { |
247 | 1 | throw doris::Exception(doris::ErrorCode::INVALID_ARGUMENT, |
248 | 1 | "Ambiguous structure of top_array nested subcolumns: {}, {}", |
249 | 1 | PathInData(p).to_jsonpath(), |
250 | 1 | PathInData(current_path).to_jsonpath()); |
251 | 1 | } |
252 | 8 | } |
253 | 13 | } |
254 | 10 | } |
255 | | |
256 | | template <typename ParserImpl> |
257 | | void JSONDataParser<ParserImpl>::handleExistingPath(std::pair<PathInData::Parts, Array>& path_data, |
258 | | const PathInData::Parts& path, Field& value, |
259 | | ParseArrayContext& ctx, |
260 | 776k | size_t& keys_to_update) { |
261 | 776k | auto& path_array = path_data.second; |
262 | | // keep arrays aligned for all keys (including top-level arrays). |
263 | 776k | assert(path_array.size() == ctx.current_size); |
264 | | // If current element of array is part of Nested, |
265 | | // collect its size or check it if the size of |
266 | | // the Nested has been already collected. |
267 | 776k | auto nested_key = getNameOfNested(path, value); |
268 | 776k | if (!nested_key.empty()) { |
269 | 0 | size_t array_size = value.get<TYPE_ARRAY>().size(); |
270 | 0 | auto& current_nested_sizes = ctx.nested_sizes_by_key[nested_key]; |
271 | 0 | if (current_nested_sizes.size() == ctx.current_size) { |
272 | 0 | current_nested_sizes.push_back(array_size); |
273 | 0 | } else if (array_size != current_nested_sizes.back()) { |
274 | 0 | throw doris::Exception(doris::ErrorCode::INTERNAL_ERROR, |
275 | 0 | "Array sizes mismatched ({} and {})", array_size, |
276 | 0 | current_nested_sizes.back()); |
277 | 0 | } |
278 | 0 | } |
279 | | |
280 | 776k | path_array.push_back(std::move(value)); |
281 | 776k | --keys_to_update; |
282 | 776k | } |
283 | | |
284 | | template <typename ParserImpl> |
285 | | void JSONDataParser<ParserImpl>::handleNewPath(UInt128 hash, const PathInData::Parts& path, |
286 | 627k | Field& value, ParseArrayContext& ctx) { |
287 | 627k | Array path_array; |
288 | 627k | path_array.reserve(ctx.total_size); |
289 | | |
290 | | // always resize to keep alignment. |
291 | 627k | path_array.resize(ctx.current_size); |
292 | | |
293 | 627k | auto nested_key = getNameOfNested(path, value); |
294 | 627k | if (!nested_key.empty()) { |
295 | 3 | size_t array_size = value.get<TYPE_ARRAY>().size(); |
296 | 3 | auto& current_nested_sizes = ctx.nested_sizes_by_key[nested_key]; |
297 | 3 | if (current_nested_sizes.empty()) { |
298 | 2 | current_nested_sizes.resize(ctx.current_size); |
299 | 2 | } else { |
300 | | // If newly added element is part of the Nested then |
301 | | // resize its elements to keep correct sizes of Nested arrays. |
302 | 3 | for (size_t j = 0; j < ctx.current_size; ++j) { |
303 | 2 | path_array[j] = Field::create_field<TYPE_ARRAY>(Array(current_nested_sizes[j])); |
304 | 2 | } |
305 | 1 | } |
306 | 3 | if (current_nested_sizes.size() == ctx.current_size) { |
307 | 2 | current_nested_sizes.push_back(array_size); |
308 | 2 | } else if (array_size != current_nested_sizes.back()) { |
309 | 0 | throw doris::Exception(doris::ErrorCode::INTERNAL_ERROR, |
310 | 0 | "Array sizes mismatched ({} and {})", array_size, |
311 | 0 | current_nested_sizes.back()); |
312 | 0 | } |
313 | 3 | } |
314 | | |
315 | 627k | path_array.push_back(std::move(value)); |
316 | 627k | auto& elem = ctx.arrays_by_path[hash]; |
317 | 627k | elem.first = std::move(path); |
318 | 627k | elem.second = std::move(path_array); |
319 | 627k | } |
320 | | |
321 | | template <typename ParserImpl> |
322 | 10.4k | void JSONDataParser<ParserImpl>::fillMissedValuesInArrays(ParseArrayContext& ctx) { |
323 | 21.4k | for (auto it = ctx.arrays_by_path.begin(); it != ctx.arrays_by_path.end(); ++it) { |
324 | 11.0k | auto& [path, path_array] = it->second; |
325 | 11.0k | assert(path_array.size() == ctx.current_size || path_array.size() == ctx.current_size + 1); |
326 | 11.0k | if (path_array.size() == ctx.current_size) { |
327 | 10.8k | bool inserted = tryInsertDefaultFromNested(ctx, path, path_array); |
328 | 10.8k | if (!inserted) { |
329 | 10.8k | path_array.emplace_back(); |
330 | 10.8k | } |
331 | 10.8k | } |
332 | 11.0k | } |
333 | 10.4k | } |
334 | | |
335 | | template <typename ParserImpl> |
336 | | bool JSONDataParser<ParserImpl>::tryInsertDefaultFromNested(ParseArrayContext& ctx, |
337 | | const PathInData::Parts& path, |
338 | 10.8k | Array& array) { |
339 | | /// If there is a collected size of current Nested |
340 | | /// then insert array of this size as a default value. |
341 | 10.8k | if (path.empty() || array.empty()) { |
342 | 10.2k | return false; |
343 | 10.2k | } |
344 | | /// Last element is not Null, because otherwise this path wouldn't exist. |
345 | 541 | auto nested_key = getNameOfNested(path, array.back()); |
346 | 541 | if (nested_key.empty()) { |
347 | 541 | return false; |
348 | 541 | } |
349 | 0 | auto mapped = ctx.nested_sizes_by_key.find(nested_key); |
350 | 0 | if (mapped == ctx.nested_sizes_by_key.end()) { |
351 | 0 | return false; |
352 | 0 | } |
353 | 0 | auto& current_nested_sizes = mapped->second; |
354 | 0 | assert(current_nested_sizes.size() == ctx.current_size || |
355 | 0 | current_nested_sizes.size() == ctx.current_size + 1); |
356 | | /// If all keys of Nested were missed then add a zero length. |
357 | 0 | if (current_nested_sizes.size() == ctx.current_size) { |
358 | 0 | current_nested_sizes.push_back(0); |
359 | 0 | } |
360 | 0 | size_t array_size = current_nested_sizes.back(); |
361 | 0 | array.push_back(Field::create_field<TYPE_ARRAY>(Array(array_size))); |
362 | 0 | return true; |
363 | 0 | } |
364 | | |
365 | | template <typename ParserImpl> |
366 | | StringRef JSONDataParser<ParserImpl>::getNameOfNested(const PathInData::Parts& path, |
367 | 1.40M | const Field& value) { |
368 | 1.40M | if (value.get_type() != PrimitiveType::TYPE_ARRAY || path.empty()) { |
369 | 1.40M | return {}; |
370 | 1.40M | } |
371 | | /// Find first key that is marked as nested, |
372 | | /// because we may have tuple of Nested and there could be |
373 | | /// several arrays with the same prefix, but with independent sizes. |
374 | | /// Consider we have array element with type `k2 Tuple(k3 Nested(...), k5 Nested(...))` |
375 | | /// Then subcolumns `k2.k3` and `k2.k5` may have indepented sizes and we should extract |
376 | | /// `k3` and `k5` keys instead of `k2`. |
377 | 893 | for (const auto& part : path) { |
378 | 3 | if (part.is_nested) { |
379 | 3 | return {part.key.data(), part.key.size()}; |
380 | 3 | } |
381 | 3 | } |
382 | 890 | return {}; |
383 | 893 | } |
384 | | |
385 | | template class JSONDataParser<SimdJSONParser>; |
386 | | } // namespace doris |