be/src/util/json/json_parser.cpp

Source
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements.  See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership.  The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License.  You may obtain a copy of the License at
//
//   http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied.  See the License for the
// specific language governing permissions and limitations
// under the License.
// This file is copied from
// https://github.com/ClickHouse/ClickHouse/blob/master/src/Common/JSONParsers/SimdJSONParser.cpp
// and modified by Doris

#include "util/json/json_parser.h"

#include <fmt/format.h>
#include <glog/logging.h>

#include <algorithm>
#include <cassert>
#include <string_view>
#include <vector>

#include "common/cast_set.h"
// IWYU pragma: keep
#include "common/status.h"
#include "util/json/path_in_data.h"
#include "util/json/simd_json_parser.h"

namespace doris {

// Parses `length` bytes starting at `begin` and flattens the resulting document
// into the parallel `paths` / `values` lists of a ParseResult.
// Returns an empty optional when the underlying parser reports failure.
template <typename ParserImpl>
std::optional<ParseResult> JSONDataParser<ParserImpl>::parse(const char* begin, size_t length,
                                                             const ParseConfig& config) {
    Element root;
    // The parser is asked to preserve raw numbers only when the caller supplied
    // exact paths or a path matcher for decimal preservation.
    const bool keep_raw_numbers = !config.preserve_decimal_number_paths.empty() ||
                                  static_cast<bool>(config.preserve_decimal_number_path_matcher);
    if (!parser.parse(begin, length, root, keep_raw_numbers)) {
        return std::nullopt;
    }

    ParseContext root_ctx;
    root_ctx.is_top_array = root.isArray();
    root_ctx.check_duplicate_json_path = config.check_duplicate_json_path;
    // deprecated_enable_flatten_nested controls nested path traversal;
    // NestedGroup expansion is now handled at storage layer.
    root_ctx.deprecated_enable_flatten_nested = config.deprecated_enable_flatten_nested;
    // The context only borrows the preservation settings from `config`.
    root_ctx.preserve_decimal_number_paths = &config.preserve_decimal_number_paths;
    root_ctx.preserve_decimal_number_path_matcher = &config.preserve_decimal_number_path_matcher;
    traverse(root, root_ctx);

    ParseResult parsed;
    parsed.paths.reserve(root_ctx.paths.size());
    for (auto& parts : root_ctx.paths) {
        parsed.paths.emplace_back(std::move(parts));
    }
    parsed.values = std::move(root_ctx.values);
    return parsed;
}

// Dispatches on the kind of `element`:
//  - object: descend into its members;
//  - array:  either encode the whole array as a single JSONB value or collect
//            per-path arrays via traverseArray;
//  - other:  append the scalar under the path currently held by ctx.builder.
template <typename ParserImpl>
void JSONDataParser<ParserImpl>::traverse(const Element& element, ParseContext& ctx) {
    // checkStackSize();
    if (element.isObject()) {
        traverseObject(element.getObject(), ctx);
        return;
    }

    if (!element.isArray()) {
        appendValueIfNotDuplicate(ctx, ctx.builder.get_parts(),
                                  getValueAsField(element, shouldPreserveNumberAsString(ctx)));
        return;
    }

    // allow nested arrays (multi-level) for NestedGroup; deeper levels are
    // handled by VariantNestedBuilder with a max-depth guard.
    has_nested = false;
    check_has_nested_object(element);
    // Snapshot the member flag: nested traverse() calls below overwrite it.
    const bool contains_object = has_nested;
    const bool flatten_nested = ctx.deprecated_enable_flatten_nested;
    ctx.has_nested_in_flatten = contains_object && flatten_nested;

    if (contains_object && !flatten_nested) {
        // Parse nested arrays to JsonbField
        JsonbWriter writer;
        traverseArrayAsJsonb(element.getArray(), writer);
        appendValueIfNotDuplicate(
                ctx, ctx.builder.get_parts(),
                Field::create_field<TYPE_JSONB>(JsonbField(writer.getOutput()->getBuffer(),
                                                           writer.getOutput()->getSize())));
    } else {
        traverseArray(element.getArray(), ctx);
    }

    // Clear the flag once this array is done; otherwise it would still be set
    // when the next array is traversed with the same context.
    ctx.has_nested_in_flatten = false;
}

// Decides whether the value at the current path should be kept in its textual
// form. The current path is ctx.path_prefix_for_typed_paths followed by the
// parts held by ctx.builder. Returns true when that path is listed in the
// exact-path set or is accepted by the configured matcher.
template <typename ParserImpl>
bool JSONDataParser<ParserImpl>::shouldPreserveNumberAsString(const ParseContext& ctx) const {
    const auto* exact_paths = ctx.preserve_decimal_number_paths;
    const auto* matcher = ctx.preserve_decimal_number_path_matcher;
    const bool use_exact_paths = exact_paths != nullptr && !exact_paths->empty();
    const bool use_matcher = matcher != nullptr && *matcher;
    // Nothing configured: skip building the path string altogether.
    if (!(use_exact_paths || use_matcher)) {
        return false;
    }

    PathInData::Parts full_parts = ctx.path_prefix_for_typed_paths;
    const auto& tail_parts = ctx.builder.get_parts();
    full_parts.insert(full_parts.end(), tail_parts.begin(), tail_parts.end());
    const auto full_path = PathInData(full_parts).get_path();

    // Exact matches are checked first, then the matcher is consulted.
    if (use_exact_paths && exact_paths->find(full_path) != exact_paths->end()) {
        return true;
    }
    return use_matcher && (*matcher)(full_path);
}

// Records `value` under `path` in the context's parallel paths/values lists.
// When duplicate checking is enabled, a path whose string form was already
// recorded is dropped silently, so the first occurrence wins.
template <typename ParserImpl>
void JSONDataParser<ParserImpl>::appendValueIfNotDuplicate(ParseContext& ctx,
                                                           const PathInData::Parts& path,
                                                           Field&& value) {
    if (ctx.check_duplicate_json_path) {
        PathInData full_path(path);
        const bool first_visit = ctx.visited_path_names.emplace(full_path.get_path()).second;
        if (!first_visit) {
            return;
        }
    }
    ctx.paths.push_back(path);
    ctx.values.push_back(std::move(value));
}

// Visits every member of `object`: the key is pushed onto ctx.builder, the
// value is traversed, and the key is popped again.
// Throws doris::Exception(INVALID_ARGUMENT) for a key longer than
// config::variant_max_json_key_length.
template <typename ParserImpl>
void JSONDataParser<ParserImpl>::traverseObject(const JSONObject& object, ParseContext& ctx) {
    const size_t member_count = object.size();
    ctx.paths.reserve(ctx.paths.size() + member_count);
    ctx.values.reserve(ctx.values.size() + member_count);

    for (const auto& [key, value] : object) {
        // The length limit is validated before the key becomes part of the path.
        const size_t max_key_length = cast_set<size_t>(config::variant_max_json_key_length);
        if (key.size() > max_key_length) {
            throw doris::Exception(
                    doris::ErrorCode::INVALID_ARGUMENT,
                    fmt::format("Key length exceeds maximum allowed size of {} bytes.",
                                max_key_length));
        }
        ctx.builder.append(key, false);
        traverse(value, ctx);
        ctx.builder.pop_back();
    }
}

// Raises the member flag `has_nested` when `element` is an object or an array
// that contains an object at any nesting depth. The flag is only ever set to
// true here and never cleared; traverse() resets it before calling.
// The scan stops at the first object found: once the flag is raised, visiting
// the remaining siblings cannot change the outcome.
template <typename ParserImpl>
void JSONDataParser<ParserImpl>::check_has_nested_object(const Element& element) {
    if (element.isObject()) {
        has_nested = true;
        return;
    }
    if (!element.isArray()) {
        return;
    }
    const JSONArray& array = element.getArray();
    for (auto it = array.begin(); it != array.end(); ++it) {
        check_has_nested_object(*it);
        if (has_nested) {
            // An object was found somewhere below; no need to look further.
            return;
        }
    }
}

// Writes `element` into `writer`, choosing the object, array or scalar writer
// according to the element's kind.
template <typename ParserImpl>
void JSONDataParser<ParserImpl>::traverseAsJsonb(const Element& element, JsonbWriter& writer) {
    if (element.isObject()) {
        traverseObjectAsJsonb(element.getObject(), writer);
        return;
    }
    if (element.isArray()) {
        traverseArrayAsJsonb(element.getArray(), writer);
        return;
    }
    writeValueAsJsonb(element, writer);
}

// Writes `object` into `writer` as a JSONB object: start marker, each key
// followed by its recursively written value, end marker.
// Throws doris::Exception(INVALID_ARGUMENT) for a key longer than
// config::variant_max_json_key_length.
template <typename ParserImpl>
void JSONDataParser<ParserImpl>::traverseObjectAsJsonb(const JSONObject& object,
                                                       JsonbWriter& writer) {
    writer.writeStartObject();
    for (const auto& [key, value] : object) {
        const size_t max_key_length = cast_set<size_t>(config::variant_max_json_key_length);
        if (key.size() > max_key_length) {
            throw doris::Exception(
                    doris::ErrorCode::INVALID_ARGUMENT,
                    fmt::format("Key length exceeds maximum allowed size of {} bytes.",
                                max_key_length));
        }
        // writeKey receives the key length as a uint8_t.
        writer.writeKey(key.data(), cast_set<uint8_t>(key.size()));
        traverseAsJsonb(value, writer);
    }
    writer.writeEndObject();
}

// Writes `array` into `writer` as a JSONB array, emitting every element in
// iteration order between the start and end markers.
template <typename ParserImpl>
void JSONDataParser<ParserImpl>::traverseArrayAsJsonb(const JSONArray& array, JsonbWriter& writer) {
    writer.writeStartArray();
    for (const auto& item : array) {
        traverseAsJsonb(item, writer);
    }
    writer.writeEndArray();
}

// Returns true when `prefix` is a strict prefix of `parts`: it has fewer parts
// and each of its keys equals the key at the same position in `parts`.
// Only the keys are compared.
// Example: for [{"a": {"c": {"b": 1}}}, {"a": {"c": 2.2}}], "a.c" is a prefix of "a.c.b".
static bool is_prefix(const PathInData::Parts& prefix, const PathInData::Parts& parts) {
    // Equal-length (or longer) sequences are never strict prefixes.
    if (prefix.size() >= parts.size()) {
        return false;
    }
    return std::equal(prefix.begin(), prefix.end(), parts.begin(),
                      [](const auto& lhs, const auto& rhs) { return lhs.key == rhs.key; });
}

// Traverses every element of `array`, gathering one Array field per distinct
// leaf path, then appends each gathered array to `ctx` under the current
// builder path extended by that leaf path. When no path collected any value,
// a single empty array is appended under the current path instead.
template <typename ParserImpl>
void JSONDataParser<ParserImpl>::traverseArray(const JSONArray& array, ParseContext& ctx) {
    // Per-array state shared by all elements; settings are inherited from ctx.
    ParseArrayContext elements_ctx;
    elements_ctx.total_size = array.size();
    elements_ctx.is_top_array = ctx.is_top_array;
    elements_ctx.has_nested_in_flatten = ctx.has_nested_in_flatten;
    elements_ctx.check_duplicate_json_path = ctx.check_duplicate_json_path;
    elements_ctx.preserve_decimal_number_paths = ctx.preserve_decimal_number_paths;
    elements_ctx.preserve_decimal_number_path_matcher = ctx.preserve_decimal_number_path_matcher;

    // Element contexts start with an empty builder, so the path accumulated so
    // far is carried along as the typed-path prefix.
    elements_ctx.path_prefix_for_typed_paths = ctx.path_prefix_for_typed_paths;
    const auto& outer_parts = ctx.builder.get_parts();
    elements_ctx.path_prefix_for_typed_paths.insert(
            elements_ctx.path_prefix_for_typed_paths.end(), outer_parts.begin(),
            outer_parts.end());

    for (const auto& item : array) {
        traverseArrayElement(item, elements_ctx);
        ++elements_ctx.current_size;
    }

    auto& collected = elements_ctx.arrays_by_path;
    if (collected.empty()) {
        appendValueIfNotDuplicate(ctx, ctx.builder.get_parts(),
                                  Field::create_field<TYPE_ARRAY>(Array()));
        return;
    }

    ctx.paths.reserve(ctx.paths.size() + collected.size());
    ctx.values.reserve(ctx.values.size() + collected.size());
    for (auto& entry : collected) {
        auto& [relative_path, column] = entry.second;
        /// Merge prefix path and path of array element.
        ctx.builder.append(relative_path, true);
        appendValueIfNotDuplicate(ctx, ctx.builder.get_parts(),
                                  Field::create_field<TYPE_ARRAY>(std::move(column)));
        ctx.builder.pop_back(relative_path.size());
    }
}

// Traverses one array element in a fresh ParseContext and merges its non-null
// leaves into the per-path arrays in `ctx`. Paths already known before this
// element that received no value from it are padded afterwards.
template <typename ParserImpl>
void JSONDataParser<ParserImpl>::traverseArrayElement(const Element& element,
                                                      ParseArrayContext& ctx) {
    ParseContext item_ctx;
    item_ctx.is_top_array = ctx.is_top_array;
    item_ctx.has_nested_in_flatten = ctx.has_nested_in_flatten;
    item_ctx.check_duplicate_json_path = ctx.check_duplicate_json_path;
    item_ctx.preserve_decimal_number_paths = ctx.preserve_decimal_number_paths;
    item_ctx.preserve_decimal_number_path_matcher = ctx.preserve_decimal_number_path_matcher;
    item_ctx.path_prefix_for_typed_paths = ctx.path_prefix_for_typed_paths;
    traverse(element, item_ctx);

    auto& item_paths = item_ctx.paths;
    auto& item_values = item_ctx.values;

    // The flag is read after traverse() because traversing an array element
    // clears it on the element context.
    if (item_ctx.has_nested_in_flatten && item_ctx.is_top_array) {
        checkAmbiguousStructure(ctx, item_paths);
    }

    // Number of paths known before this element; handleExistingPath decrements
    // it for every one of them that receives a value.
    size_t pending_keys = ctx.arrays_by_path.size();
    const size_t leaf_count = item_paths.size();

    for (size_t idx = 0; idx < leaf_count; ++idx) {
        auto& leaf_value = item_values[idx];
        if (leaf_value.is_null()) {
            continue;
        }

        const auto& leaf_path = item_paths[idx];
        UInt128 hash = PathInData::get_parts_hash(leaf_path);
        auto known = ctx.arrays_by_path.find(hash);
        if (known == ctx.arrays_by_path.end()) {
            handleNewPath(hash, leaf_path, leaf_value, ctx);
        } else {
            handleExistingPath(known->second, leaf_path, leaf_value, ctx, pending_keys);
        }
    }

    // always fill missed values to keep element-level association between keys.
    if (pending_keys != 0) {
        fillMissedValuesInArrays(ctx);
    }
}

// Rejects an element whose paths conflict with paths collected from earlier
// elements of the same array: a conflict exists when one path is a strict
// prefix of the other, e.g.
// [{"a": {"b": {"c": 1}}}, {"a": {"b": 1}}] where a.b is ambiguous.
// Throws doris::Exception(INVALID_ARGUMENT) on the first conflict found.
template <typename ParserImpl>
void JSONDataParser<ParserImpl>::checkAmbiguousStructure(
        const ParseArrayContext& ctx, const std::vector<PathInData::Parts>& paths) {
    for (const auto& candidate : paths) {
        for (const auto& entry : ctx.arrays_by_path) {
            const auto& known = entry.second.first;
            const bool conflicting = is_prefix(known, candidate) || is_prefix(candidate, known);
            if (!conflicting) {
                continue;
            }
            throw doris::Exception(doris::ErrorCode::INVALID_ARGUMENT,
                                   "Ambiguous structure of top_array nested subcolumns: {}, {}",
                                   PathInData(known).to_jsonpath(),
                                   PathInData(candidate).to_jsonpath());
        }
    }
}

// Appends `value` to the array already collected for `path` and decrements
// `keys_to_update`. When the path belongs to a Nested (see getNameOfNested),
// the array size is recorded for the current element, or compared with the
// size already recorded for it.
// Throws doris::Exception(INTERNAL_ERROR) when the sizes disagree.
template <typename ParserImpl>
void JSONDataParser<ParserImpl>::handleExistingPath(std::pair<PathInData::Parts, Array>& path_data,
                                                    const PathInData::Parts& path, Field& value,
                                                    ParseArrayContext& ctx,
                                                    size_t& keys_to_update) {
    Array& column = path_data.second;
    // keep arrays aligned for all keys (including top-level arrays).
    assert(column.size() == ctx.current_size);

    const auto nested_key = getNameOfNested(path, value);
    if (!nested_key.empty()) {
        size_t incoming_size = value.get<TYPE_ARRAY>().size();
        auto& recorded_sizes = ctx.nested_sizes_by_key[nested_key];
        if (recorded_sizes.size() != ctx.current_size) {
            // Another key of the same Nested already recorded a size for this
            // element; the new array has to match it.
            if (incoming_size != recorded_sizes.back()) {
                throw doris::Exception(doris::ErrorCode::INTERNAL_ERROR,
                                       "Array sizes mismatched ({} and {})", incoming_size,
                                       recorded_sizes.back());
            }
        } else {
            // First key of this Nested seen for the current element.
            recorded_sizes.push_back(incoming_size);
        }
    }

    column.push_back(std::move(value));
    --keys_to_update;
}

// Registers a path seen for the first time in the current array and stores
// `value` as its entry for the current element. Slots for the elements that
// came before are filled with default fields, or, when the path belongs to a
// Nested whose sizes are already known, with arrays of the recorded sizes.
// Throws doris::Exception(INTERNAL_ERROR) when the array size of `value`
// disagrees with the size already recorded for the current element.
template <typename ParserImpl>
void JSONDataParser<ParserImpl>::handleNewPath(UInt128 hash, const PathInData::Parts& path,
                                               Field& value, ParseArrayContext& ctx) {
    Array path_array;
    path_array.reserve(ctx.total_size);

    // always resize to keep alignment.
    path_array.resize(ctx.current_size);

    auto nested_key = getNameOfNested(path, value);
    if (!nested_key.empty()) {
        size_t array_size = value.get<TYPE_ARRAY>().size();
        auto& current_nested_sizes = ctx.nested_sizes_by_key[nested_key];
        if (current_nested_sizes.empty()) {
            // First key of this Nested: earlier elements get a zero size.
            current_nested_sizes.resize(ctx.current_size);
        } else {
            // If newly added element is part of the Nested then
            // resize its elements to keep correct sizes of Nested arrays.
            for (size_t j = 0; j < ctx.current_size; ++j) {
                path_array[j] = Field::create_field<TYPE_ARRAY>(Array(current_nested_sizes[j]));
            }
        }
        if (current_nested_sizes.size() == ctx.current_size) {
            current_nested_sizes.push_back(array_size);
        } else if (array_size != current_nested_sizes.back()) {
            throw doris::Exception(doris::ErrorCode::INTERNAL_ERROR,
                                   "Array sizes mismatched ({} and {})", array_size,
                                   current_nested_sizes.back());
        }
    }

    path_array.push_back(std::move(value));
    auto& elem = ctx.arrays_by_path[hash];
    // `path` is a const reference, so it can only be copied; std::move on it
    // would silently degrade to this same copy.
    elem.first = path;
    elem.second = std::move(path_array);
}

// Pads every collected array that received no value for the current element,
// so that all arrays end up with current_size + 1 entries. A Nested-aware
// default is tried first; otherwise a default-constructed field is appended.
template <typename ParserImpl>
void JSONDataParser<ParserImpl>::fillMissedValuesInArrays(ParseArrayContext& ctx) {
    for (auto& entry : ctx.arrays_by_path) {
        auto& [parts, column] = entry.second;
        assert(column.size() == ctx.current_size || column.size() == ctx.current_size + 1);
        if (column.size() != ctx.current_size) {
            // Already has its value for the current element.
            continue;
        }
        if (!tryInsertDefaultFromNested(ctx, parts, column)) {
            column.emplace_back();
        }
    }
}

/// Appends a default value for the current element to `array` when `path`
/// belongs to a Nested whose sizes are tracked: the default is an array of the
/// size recorded for the current element (zero if none was recorded yet).
/// Returns false, leaving `array` untouched, when no such Nested applies.
template <typename ParserImpl>
bool JSONDataParser<ParserImpl>::tryInsertDefaultFromNested(ParseArrayContext& ctx,
                                                            const PathInData::Parts& path,
                                                            Array& array) {
    if (path.empty() || array.empty()) {
        return false;
    }

    /// Last element is not Null, because otherwise this path wouldn't exist.
    const auto nested_key = getNameOfNested(path, array.back());
    if (nested_key.empty()) {
        return false;
    }

    auto sizes_it = ctx.nested_sizes_by_key.find(nested_key);
    if (sizes_it == ctx.nested_sizes_by_key.end()) {
        return false;
    }

    auto& recorded_sizes = sizes_it->second;
    assert(recorded_sizes.size() == ctx.current_size ||
           recorded_sizes.size() == ctx.current_size + 1);
    /// If all keys of Nested were missed then add a zero length.
    if (recorded_sizes.size() == ctx.current_size) {
        recorded_sizes.push_back(0);
    }

    size_t default_size = recorded_sizes.back();
    array.push_back(Field::create_field<TYPE_ARRAY>(Array(default_size)));
    return true;
}

/// Returns the key of the first part of `path` that is marked as nested, or an
/// empty StringRef when `value` is not an array, `path` is empty, or no part is
/// marked as nested.
template <typename ParserImpl>
StringRef JSONDataParser<ParserImpl>::getNameOfNested(const PathInData::Parts& path,
                                                      const Field& value) {
    if (path.empty() || value.get_type() != PrimitiveType::TYPE_ARRAY) {
        return {};
    }
    /// Find first key that is marked as nested,
    /// because we may have struct of Nested and there could be
    /// several arrays with the same prefix, but with independent sizes.
    /// Consider we have array element with type `k2 Struct(k3 Nested(...), k5 Nested(...))`
    /// Then subcolumns `k2.k3` and `k2.k5` may have independent sizes and we should extract
    /// `k3` and `k5` keys instead of `k2`.
    const auto first_nested = std::find_if(path.begin(), path.end(),
                                           [](const auto& part) { return part.is_nested; });
    if (first_nested == path.end()) {
        return {};
    }
    return {first_nested->key.data(), first_nested->key.size()};
}

// Explicit instantiation: the member templates above are defined in this
// translation unit, so the SimdJSONParser-backed parser is instantiated here.
template class JSONDataParser<SimdJSONParser>;
} // namespace doris

Coverage Report

Created: 2026-05-18 01:25

Line	Count	Source
1		// Licensed to the Apache Software Foundation (ASF) under one
2		// or more contributor license agreements. See the NOTICE file
3		// distributed with this work for additional information
4		// regarding copyright ownership. The ASF licenses this file
5		// to you under the Apache License, Version 2.0 (the
6		// "License"); you may not use this file except in compliance
7		// with the License. You may obtain a copy of the License at
8		//
9		// http://www.apache.org/licenses/LICENSE-2.0
10		//
11		// Unless required by applicable law or agreed to in writing,
12		// software distributed under the License is distributed on an
13		// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14		// KIND, either express or implied. See the License for the
15		// specific language governing permissions and limitations
16		// under the License.
17		// This file is copied from
18		// https://github.com/ClickHouse/ClickHouse/blob/master/src/Common/JSONParsers/SimdJSONParser.cpp
19		// and modified by Doris
20
21		#include "util/json/json_parser.h"
22
23		#include <fmt/format.h>
24		#include <glog/logging.h>
25
26		#include <algorithm>
27		#include <cassert>
28		#include <string_view>
29		#include <vector>
30
31		#include "common/cast_set.h"
32		// IWYU pragma: keep
33		#include "common/status.h"
34		#include "util/json/path_in_data.h"
35		#include "util/json/simd_json_parser.h"
36
37		namespace doris {
38
39		template <typename ParserImpl>
40		std::optional<ParseResult> JSONDataParser<ParserImpl>::parse(const char* begin, size_t length,
41	80.6k	const ParseConfig& config) {
42	80.6k	Element document;
43	80.6k	const bool preserve_raw_numbers =
44	80.6k	!config.preserve_decimal_number_paths.empty() \|\|
45	80.6k	static_cast<bool>(config.preserve_decimal_number_path_matcher);
46	80.6k	if (!parser.parse(begin, length, document, preserve_raw_numbers)) {
47	15	return {};
48	15	}
49	80.6k	ParseContext context;
50		// deprecated_enable_flatten_nested controls nested path traversal
51		// NestedGroup expansion is now handled at storage layer
52	80.6k	context.deprecated_enable_flatten_nested = config.deprecated_enable_flatten_nested;
53	80.6k	context.check_duplicate_json_path = config.check_duplicate_json_path;
54	80.6k	context.is_top_array = document.isArray();
55	80.6k	context.preserve_decimal_number_paths = &config.preserve_decimal_number_paths;
56	80.6k	context.preserve_decimal_number_path_matcher = &config.preserve_decimal_number_path_matcher;
57	80.6k	traverse(document, context);
58	80.6k	ParseResult result;
59	80.6k	result.values = std::move(context.values);
60	80.6k	result.paths.reserve(context.paths.size());
61	1.35M	for (auto&& path : context.paths) {
62	1.35M	result.paths.emplace_back(std::move(path));
63	1.35M	}
64	80.6k	return result;
65	80.6k	}
66
67		template <typename ParserImpl>
68	2.08M	void JSONDataParser<ParserImpl>::traverse(const Element& element, ParseContext& ctx) {
69		// checkStackSize();
70	2.08M	if (element.isObject()) {
71	144k	traverseObject(element.getObject(), ctx);
72	1.93M	} else if (element.isArray()) {
73		// allow nested arrays (multi-level) for NestedGroup; deeper levels are
74		// handled by VariantNestedBuilder with a max-depth guard.
75	55.3k	has_nested = false;
76	55.3k	check_has_nested_object(element);
77	55.3k	ctx.has_nested_in_flatten = has_nested && ctx.deprecated_enable_flatten_nested;
78	55.3k	if (has_nested && !ctx.deprecated_enable_flatten_nested) {
79		// Parse nested arrays to JsonbField
80	209	JsonbWriter writer;
81	209	traverseArrayAsJsonb(element.getArray(), writer);
82	209	appendValueIfNotDuplicate(
83	209	ctx, ctx.builder.get_parts(),
84	209	Field::create_field<TYPE_JSONB>(JsonbField(writer.getOutput()->getBuffer(),
85	209	writer.getOutput()->getSize())));
86	55.1k	} else {
87	55.1k	traverseArray(element.getArray(), ctx);
88	55.1k	}
89		// we should set has_nested_in_flatten to false when traverse array finished for next array otherwise it will be true for next array
90	55.3k	ctx.has_nested_in_flatten = false;
91	1.88M	} else {
92	1.88M	appendValueIfNotDuplicate(ctx, ctx.builder.get_parts(),
93	1.88M	getValueAsField(element, shouldPreserveNumberAsString(ctx)));
94	1.88M	}
95	2.08M	}
96
97		template <typename ParserImpl>
98	1.88M	bool JSONDataParser<ParserImpl>::shouldPreserveNumberAsString(const ParseContext& ctx) const {
99	1.88M	const bool has_exact_paths = ctx.preserve_decimal_number_paths != nullptr &&
100	1.88M	!ctx.preserve_decimal_number_paths->empty();
101	1.88M	const bool has_path_matcher = ctx.preserve_decimal_number_path_matcher != nullptr &&
102	1.88M	*ctx.preserve_decimal_number_path_matcher;
103	1.88M	if (!has_exact_paths && !has_path_matcher) {
104	1.88M	return false;
105	1.88M	}
106	23	PathInData::Parts path = ctx.path_prefix_for_typed_paths;
107	23	const auto& current_parts = ctx.builder.get_parts();
108	23	path.insert(path.end(), current_parts.begin(), current_parts.end());
109	23	const auto current_path = PathInData(path).get_path();
110	23	if (has_exact_paths && ctx.preserve_decimal_number_paths->find(current_path) !=
111	21	ctx.preserve_decimal_number_paths->end()) {
112	18	return true;
113	18	}
114	5	return has_path_matcher && (*ctx.preserve_decimal_number_path_matcher)(current_path);
115	23	}
116
117		template <typename ParserImpl>
118		void JSONDataParser<ParserImpl>::appendValueIfNotDuplicate(ParseContext& ctx,
119		const PathInData::Parts& path,
120	1.94M	Field&& value) {
121	1.94M	if (ctx.check_duplicate_json_path) {
122	32	PathInData path_in_data(path);
123	32	if (!ctx.visited_path_names.emplace(path_in_data.get_path()).second) {
124	10	return;
125	10	}
126	32	}
127	1.94M	ctx.paths.push_back(path);
128	1.94M	ctx.values.push_back(std::move(value));
129	1.94M	}
130
131		template <typename ParserImpl>
132	144k	void JSONDataParser<ParserImpl>::traverseObject(const JSONObject& object, ParseContext& ctx) {
133	144k	ctx.paths.reserve(ctx.paths.size() + object.size());
134	144k	ctx.values.reserve(ctx.values.size() + object.size());
135	1.42M	auto check_key_length = [](const auto& key) {
136	1.42M	const size_t max_key_length = cast_set<size_t>(config::variant_max_json_key_length);
137	1.42M	if (key.size() > max_key_length) {
138	336	throw doris::Exception(
139	336	doris::ErrorCode::INVALID_ARGUMENT,
140	336	fmt::format("Key length exceeds maximum allowed size of {} bytes.",
141	336	max_key_length));
142	336	}
143	1.42M	};
144	1.42M	auto traverse_object_member = [&](const auto& key, const auto& value) {
145	1.42M	check_key_length(key);
146	1.42M	ctx.builder.append(key, false);
147	1.42M	traverse(value, ctx);
148	1.42M	ctx.builder.pop_back();
149	1.42M	};
150
151	1.56M	for (auto it = object.begin(); it != object.end(); ++it) {
152	1.42M	const auto& [key, value] = *it;
153	1.42M	traverse_object_member(key, value);
154	1.42M	}
155	144k	}
156
157		template <typename ParserImpl>
158	1.12M	void JSONDataParser<ParserImpl>::check_has_nested_object(const Element& element) {
159	1.12M	if (element.isArray()) {
160	105k	const JSONArray& array = element.getArray();
161	1.17M	for (auto it = array.begin(); it != array.end(); ++it) {
162	1.06M	check_has_nested_object(*it);
163	1.06M	}
164	105k	}
165	1.12M	if (element.isObject()) {
166	23.0k	has_nested = true;
167	23.0k	}
168	1.12M	}
169
170		template <typename ParserImpl>
171	4.31k	void JSONDataParser<ParserImpl>::traverseAsJsonb(const Element& element, JsonbWriter& writer) {
172	4.31k	if (element.isObject()) {
173	1.49k	traverseObjectAsJsonb(element.getObject(), writer);
174	2.82k	} else if (element.isArray()) {
175	7	traverseArrayAsJsonb(element.getArray(), writer);
176	2.81k	} else {
177	2.81k	writeValueAsJsonb(element, writer);
178	2.81k	}
179	4.31k	}
180
181		template <typename ParserImpl>
182		void JSONDataParser<ParserImpl>::traverseObjectAsJsonb(const JSONObject& object,
183	1.49k	JsonbWriter& writer) {
184	1.49k	writer.writeStartObject();
185	4.93k	for (auto it = object.begin(); it != object.end(); ++it) {
186	3.64k	const auto& [key, value] = *it;
187	3.64k	const size_t max_key_length = cast_set<size_t>(config::variant_max_json_key_length);
188	3.64k	if (key.size() > max_key_length) {
189	201	throw doris::Exception(
190	201	doris::ErrorCode::INVALID_ARGUMENT,
191	201	fmt::format("Key length exceeds maximum allowed size of {} bytes.",
192	201	max_key_length));
193	201	}
194	3.44k	writer.writeKey(key.data(), cast_set<uint8_t>(key.size()));
195	3.44k	traverseAsJsonb(value, writer);
196	3.44k	}
197	1.28k	writer.writeEndObject();
198	1.28k	}
199
200		template <typename ParserImpl>
201	216	void JSONDataParser<ParserImpl>::traverseArrayAsJsonb(const JSONArray& array, JsonbWriter& writer) {
202	216	writer.writeStartArray();
203	1.08k	for (auto it = array.begin(); it != array.end(); ++it) {
204	873	traverseAsJsonb(*it, writer);
205	873	}
206	216	writer.writeEndArray();
207	216	}
208
209		// check isPrefix in PathInData::Parts. like : [{"a": {"c": {"b": 1}}}, {"a": {"c": 2.2}}], "a.c" is prefix of "a.c.b"
210		// return true if prefix is a prefix of parts
211	6	static bool is_prefix(const PathInData::Parts& prefix, const PathInData::Parts& parts) {
212	6	if (prefix.size() >= parts.size()) {
213	5	return false;
214	5	}
215	2	for (size_t i = 0; i < prefix.size(); ++i) {
216	1	if (prefix[i].key != parts[i].key) {
217	0	return false;
218	0	}
219	1	}
220	1	return true;
221	1	}
222
223		template <typename ParserImpl>
224	55.1k	void JSONDataParser<ParserImpl>::traverseArray(const JSONArray& array, ParseContext& ctx) {
225		/// Traverse elements of array and collect an array of fields by each path.
226	55.1k	ParseArrayContext array_ctx;
227	55.1k	array_ctx.has_nested_in_flatten = ctx.has_nested_in_flatten;
228	55.1k	array_ctx.is_top_array = ctx.is_top_array;
229	55.1k	array_ctx.check_duplicate_json_path = ctx.check_duplicate_json_path;
230	55.1k	array_ctx.preserve_decimal_number_paths = ctx.preserve_decimal_number_paths;
231	55.1k	array_ctx.preserve_decimal_number_path_matcher = ctx.preserve_decimal_number_path_matcher;
232	55.1k	array_ctx.path_prefix_for_typed_paths = ctx.path_prefix_for_typed_paths;
233	55.1k	const auto& current_parts = ctx.builder.get_parts();
234	55.1k	array_ctx.path_prefix_for_typed_paths.insert(array_ctx.path_prefix_for_typed_paths.end(),
235	55.1k	current_parts.begin(), current_parts.end());
236	55.1k	array_ctx.total_size = array.size();
237	633k	for (auto it = array.begin(); it != array.end(); ++it) {
238	577k	traverseArrayElement(*it, array_ctx);
239	577k	++array_ctx.current_size;
240	577k	}
241	55.1k	auto&& arrays_by_path = array_ctx.arrays_by_path;
242	55.1k	if (arrays_by_path.empty()) {
243	4	appendValueIfNotDuplicate(ctx, ctx.builder.get_parts(),
244	4	Field::create_field<TYPE_ARRAY>(Array()));
245	55.1k	} else {
246	55.1k	ctx.paths.reserve(ctx.paths.size() + arrays_by_path.size());
247	55.1k	ctx.values.reserve(ctx.values.size() + arrays_by_path.size());
248	113k	for (auto it = arrays_by_path.begin(); it != arrays_by_path.end(); ++it) {
249	58.1k	auto&& [path, path_array] = it->second;
250		/// Merge prefix path and path of array element.
251	58.1k	ctx.builder.append(path, true);
252	58.1k	appendValueIfNotDuplicate(ctx, ctx.builder.get_parts(),
253	58.1k	Field::create_field<TYPE_ARRAY>(std::move(path_array)));
254	58.1k	ctx.builder.pop_back(path.size());
255	58.1k	}
256	55.1k	}
257	55.1k	}
258
259		template <typename ParserImpl>
260		void JSONDataParser<ParserImpl>::traverseArrayElement(const Element& element,
261	577k	ParseArrayContext& ctx) {
262	577k	ParseContext element_ctx;
263	577k	element_ctx.has_nested_in_flatten = ctx.has_nested_in_flatten;
264	577k	element_ctx.is_top_array = ctx.is_top_array;
265	577k	element_ctx.check_duplicate_json_path = ctx.check_duplicate_json_path;
266	577k	element_ctx.preserve_decimal_number_paths = ctx.preserve_decimal_number_paths;
267	577k	element_ctx.preserve_decimal_number_path_matcher = ctx.preserve_decimal_number_path_matcher;
268	577k	element_ctx.path_prefix_for_typed_paths = ctx.path_prefix_for_typed_paths;
269	577k	traverse(element, element_ctx);
270	577k	auto& paths = element_ctx.paths;
271	577k	auto& values = element_ctx.values;
272
273	577k	if (element_ctx.has_nested_in_flatten && element_ctx.is_top_array) {
274	6	checkAmbiguousStructure(ctx, paths);
275	6	}
276
277	577k	size_t size = paths.size();
278	577k	size_t keys_to_update = ctx.arrays_by_path.size();
279
280	1.15M	for (size_t i = 0; i < size; ++i) {
281	580k	if (values[i].is_null()) {
282	11.7k	continue;
283	11.7k	}
284
285	569k	UInt128 hash = PathInData::get_parts_hash(paths[i]);
286	569k	auto found = ctx.arrays_by_path.find(hash);
287
288	569k	if (found != ctx.arrays_by_path.end()) {
289	510k	handleExistingPath(found->second, paths[i], values[i], ctx, keys_to_update);
290	510k	} else {
291	58.1k	handleNewPath(hash, paths[i], values[i], ctx);
292	58.1k	}
293	569k	}
294
295		// always fill missed values to keep element-level association between keys.
296	577k	if (keys_to_update) {
297	10.2k	fillMissedValuesInArrays(ctx);
298	10.2k	}
299	577k	}
300
301		// check if the structure of top_array is ambiguous like:
302		// [{"a": {"b": {"c": 1}}}, {"a": {"b": 1}}] a.b is ambiguous
303		template <typename ParserImpl>
304		void JSONDataParser<ParserImpl>::checkAmbiguousStructure(
305	6	const ParseArrayContext& ctx, const std::vector<PathInData::Parts>& paths) {
306	6	for (auto&& current_path : paths) {
307	8	for (auto it = ctx.arrays_by_path.begin(); it != ctx.arrays_by_path.end(); ++it) {
308	3	auto&& [p, _] = it->second;
309	3	if (is_prefix(p, current_path) \|\| is_prefix(current_path, p)) {
310	1	throw doris::Exception(doris::ErrorCode::INVALID_ARGUMENT,
311	1	"Ambiguous structure of top_array nested subcolumns: {}, {}",
312	1	PathInData(p).to_jsonpath(),
313	1	PathInData(current_path).to_jsonpath());
314	1	}
315	3	}
316	6	}
317	6	}
318
319		template <typename ParserImpl>
320		void JSONDataParser<ParserImpl>::handleExistingPath(std::pair<PathInData::Parts, Array>& path_data,
321		const PathInData::Parts& path, Field& value,
322		ParseArrayContext& ctx,
323	510k	size_t& keys_to_update) {
324	510k	auto& path_array = path_data.second;
325		// keep arrays aligned for all keys (including top-level arrays).
326	510k	assert(path_array.size() == ctx.current_size);
327		// If current element of array is part of Nested,
328		// collect its size or check it if the size of
329		// the Nested has been already collected.
330	510k	auto nested_key = getNameOfNested(path, value);
331	510k	if (!nested_key.empty()) {
332	0	size_t array_size = value.get<TYPE_ARRAY>().size();
333	0	auto& current_nested_sizes = ctx.nested_sizes_by_key[nested_key];
334	0	if (current_nested_sizes.size() == ctx.current_size) {
335	0	current_nested_sizes.push_back(array_size);
336	0	} else if (array_size != current_nested_sizes.back()) {
337	0	throw doris::Exception(doris::ErrorCode::INTERNAL_ERROR,
338	0	"Array sizes mismatched ({} and {})", array_size,
339	0	current_nested_sizes.back());
340	0	}
341	0	}
342
343	510k	path_array.push_back(std::move(value));
344	510k	--keys_to_update;
345	510k	}
346
347		template <typename ParserImpl>
348		void JSONDataParser<ParserImpl>::handleNewPath(UInt128 hash, const PathInData::Parts& path,
349	58.1k	Field& value, ParseArrayContext& ctx) {
350	58.1k	Array path_array;
351	58.1k	path_array.reserve(ctx.total_size);
352
353		// always resize to keep alignment.
354	58.1k	path_array.resize(ctx.current_size);
355
356	58.1k	auto nested_key = getNameOfNested(path, value);
357	58.1k	if (!nested_key.empty()) {
358	3	size_t array_size = value.get<TYPE_ARRAY>().size();
359	3	auto& current_nested_sizes = ctx.nested_sizes_by_key[nested_key];
360	3	if (current_nested_sizes.empty()) {
361	2	current_nested_sizes.resize(ctx.current_size);
362	2	} else {
363		// If newly added element is part of the Nested then
364		// resize its elements to keep correct sizes of Nested arrays.
365	3	for (size_t j = 0; j < ctx.current_size; ++j) {
366	2	path_array[j] = Field::create_field<TYPE_ARRAY>(Array(current_nested_sizes[j]));
367	2	}
368	1	}
369	3	if (current_nested_sizes.size() == ctx.current_size) {
370	2	current_nested_sizes.push_back(array_size);
371	2	} else if (array_size != current_nested_sizes.back()) {
372	0	throw doris::Exception(doris::ErrorCode::INTERNAL_ERROR,
373	0	"Array sizes mismatched ({} and {})", array_size,
374	0	current_nested_sizes.back());
375	0	}
376	3	}
377
378	58.1k	path_array.push_back(std::move(value));
379	58.1k	auto& elem = ctx.arrays_by_path[hash];
380	58.1k	elem.first = std::move(path);
381	58.1k	elem.second = std::move(path_array);
382	58.1k	}
383
384		template <typename ParserImpl>
385	10.2k	void JSONDataParser<ParserImpl>::fillMissedValuesInArrays(ParseArrayContext& ctx) {
386	20.5k	for (auto it = ctx.arrays_by_path.begin(); it != ctx.arrays_by_path.end(); ++it) {
387	10.2k	auto& [path, path_array] = it->second;
388	10.2k	assert(path_array.size() == ctx.current_size \|\| path_array.size() == ctx.current_size + 1);
389	10.2k	if (path_array.size() == ctx.current_size) {
390	10.2k	bool inserted = tryInsertDefaultFromNested(ctx, path, path_array);
391	10.2k	if (!inserted) {
392	10.2k	path_array.emplace_back();
393	10.2k	}
394	10.2k	}
395	10.2k	}
396	10.2k	}
397
398		template <typename ParserImpl>
399		bool JSONDataParser<ParserImpl>::tryInsertDefaultFromNested(ParseArrayContext& ctx,
400		const PathInData::Parts& path,
401	10.2k	Array& array) {
402		/// If there is a collected size of current Nested
403		/// then insert array of this size as a default value.
404	10.2k	if (path.empty() \|\| array.empty()) {
405	10.2k	return false;
406	10.2k	}
407		/// Last element is not Null, because otherwise this path wouldn't exist.
408	1	auto nested_key = getNameOfNested(path, array.back());
409	1	if (nested_key.empty()) {
410	1	return false;
411	1	}
412	0	auto mapped = ctx.nested_sizes_by_key.find(nested_key);
413	0	if (mapped == ctx.nested_sizes_by_key.end()) {
414	0	return false;
415	0	}
416	0	auto& current_nested_sizes = mapped->second;
417	0	assert(current_nested_sizes.size() == ctx.current_size \|\|
418	0	current_nested_sizes.size() == ctx.current_size + 1);
419		/// If all keys of Nested were missed then add a zero length.
420	0	if (current_nested_sizes.size() == ctx.current_size) {
421	0	current_nested_sizes.push_back(0);
422	0	}
423	0	size_t array_size = current_nested_sizes.back();
424	0	array.push_back(Field::create_field<TYPE_ARRAY>(Array(array_size)));
425	0	return true;
426	0	}
427
428		template <typename ParserImpl>
429		StringRef JSONDataParser<ParserImpl>::getNameOfNested(const PathInData::Parts& path,
430	569k	const Field& value) {
431	569k	if (value.get_type() != PrimitiveType::TYPE_ARRAY \|\| path.empty()) {
432	569k	return {};
433	569k	}
434		/// Find first key that is marked as nested,
435		/// because we may have struct of Nested and there could be
436		/// several arrays with the same prefix, but with independent sizes.
437		/// Consider we have array element with type `k2 Struct(k3 Nested(...), k5 Nested(...))`
438		/// Then subcolumns `k2.k3` and `k2.k5` may have indepented sizes and we should extract
439		/// `k3` and `k5` keys instead of `k2`.
440	3	for (const auto& part : path) {
441	3	if (part.is_nested) {
442	3	return {part.key.data(), part.key.size()};
443	3	}
444	3	}
445	0	return {};
446	3	}
447
448		template class JSONDataParser<SimdJSONParser>;
449		} // namespace doris