Coverage Report

Created: 2026-05-18 00:41

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
be/src/util/json/json_parser.h
Line
Count
Source
1
// Licensed to the Apache Software Foundation (ASF) under one
2
// or more contributor license agreements.  See the NOTICE file
3
// distributed with this work for additional information
4
// regarding copyright ownership.  The ASF licenses this file
5
// to you under the Apache License, Version 2.0 (the
6
// "License"); you may not use this file except in compliance
7
// with the License.  You may obtain a copy of the License at
8
//
9
//   http://www.apache.org/licenses/LICENSE-2.0
10
//
11
// Unless required by applicable law or agreed to in writing,
12
// software distributed under the License is distributed on an
13
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14
// KIND, either express or implied.  See the License for the
15
// specific language governing permissions and limitations
16
// under the License.
17
// This file is copied from
18
// https://github.com/ClickHouse/ClickHouse/blob/master/src/Common/JSONParsers/SimdJSONParser.h
19
// and modified by Doris
20
21
#pragma once
22
23
#include <parallel_hashmap/phmap.h>
24
25
#include <cstddef>
26
#include <functional>
27
#include <optional>
28
#include <string>
29
#include <string_view>
30
#include <utility>
31
#include <vector>
32
33
#include "core/column/column.h"
34
#include "core/data_type/primitive_type.h"
35
#include "core/field.h"
36
#include "core/string_ref.h"
37
#include "core/uint128.h"
38
#include "util/json/path_in_data.h"
39
#include "util/json/simd_json_parser.h"
40
#include "util/jsonb_writer.h"
41
42
namespace doris {
43
44
template <typename Element>
45
1.88M
Field getValueAsField(const Element& element, bool preserve_number_as_string = false) {
46
    // bool will convert to type FiledType::UInt64
47
1.88M
    if (element.isBool()) {
48
20.6k
        return Field::create_field<TYPE_BOOLEAN>(element.getBool());
49
20.6k
    }
50
1.86M
    if (element.isInt64()) {
51
803k
        return Field::create_field<TYPE_BIGINT>(element.getInt64());
52
803k
    }
53
    // doris only support signed integers at present
54
    // use largeint to store unsigned int64
55
1.05M
    if (element.isUInt64()) {
56
21
        return Field::create_field<TYPE_LARGEINT>(static_cast<int128_t>(element.getUInt64()));
57
21
    }
58
1.05M
    if (element.isDouble()) {
59
116k
        if (preserve_number_as_string) {
60
14
            return Field::create_field<TYPE_STRING>(String(element.getRawNumber()));
61
14
        }
62
116k
        return Field::create_field<TYPE_DOUBLE>(element.getDouble());
63
116k
    }
64
942k
    if (element.isString()) {
65
930k
        return Field::create_field<TYPE_STRING>(String(element.getString()));
66
930k
    }
67
11.9k
    if (element.isNull()) {
68
11.9k
        return {};
69
11.9k
    }
70
0
    return {};
71
11.9k
}
72
73
template <typename Element>
74
2.81k
void writeValueAsJsonb(const Element& element, JsonbWriter& writer) {
75
    // bool will convert to type FiledType::UInt64
76
2.81k
    if (element.isBool()) {
77
77
        writer.writeBool(element.getBool());
78
77
        return;
79
77
    }
80
2.74k
    if (element.isInt64()) {
81
358
        writer.writeInt64(element.getInt64());
82
358
        return;
83
358
    }
84
    // doris only support signed integers at present
85
    // use largeint to store unsigned int64
86
2.38k
    if (element.isUInt64()) {
87
0
        writer.writeInt128(static_cast<int128_t>(element.getUInt64()));
88
0
        return;
89
0
    }
90
2.38k
    if (element.isDouble()) {
91
303
        writer.writeDouble(element.getDouble());
92
303
        return;
93
303
    }
94
2.07k
    if (element.isString()) {
95
2.00k
        writer.writeStartString();
96
2.00k
        std::string_view str = element.getString();
97
2.00k
        writer.writeString(str.data(), str.size());
98
2.00k
        writer.writeEndString();
99
2.00k
        return;
100
2.00k
    }
101
77
    if (element.isNull()) {
102
77
        writer.writeNull();
103
77
        return;
104
77
    }
105
77
}
106
107
struct ParseConfig {
108
    bool deprecated_enable_flatten_nested = false;
109
    bool check_duplicate_json_path = false;
110
    enum class ParseTo {
111
        OnlySubcolumns = 0,
112
        OnlyDocValueColumn = 1,
113
    };
114
    ParseTo parse_to = ParseTo::OnlySubcolumns;
115
    phmap::flat_hash_set<std::string> preserve_decimal_number_paths;
116
    std::function<bool(std::string_view)> preserve_decimal_number_path_matcher;
117
};
118
/// Result of parsing of a document.
119
/// Contains all paths extracted from document
120
/// and values which are related to them.
121
struct ParseResult {
122
    std::vector<PathInData> paths;
123
    std::vector<Field> values;
124
};
125
template <typename ParserImpl>
126
class JSONDataParser {
127
public:
128
    using Element = typename ParserImpl::Element;
129
    using JSONObject = typename ParserImpl::Object;
130
    using JSONArray = typename ParserImpl::Array;
131
    std::optional<ParseResult> parse(const char* begin, size_t length, const ParseConfig& config);
132
133
private:
134
    struct ParseContext {
135
        PathInDataBuilder builder;
136
        std::vector<PathInData::Parts> paths;
137
        std::vector<Field> values;
138
        phmap::flat_hash_set<std::string> visited_path_names;
139
        bool deprecated_enable_flatten_nested = false;
140
        bool check_duplicate_json_path = false;
141
        bool has_nested_in_flatten = false;
142
        bool is_top_array = false;
143
        const phmap::flat_hash_set<std::string>* preserve_decimal_number_paths = nullptr;
144
        const std::function<bool(std::string_view)>* preserve_decimal_number_path_matcher = nullptr;
145
        PathInData::Parts path_prefix_for_typed_paths;
146
    };
147
    using PathPartsWithArray = std::pair<PathInData::Parts, Array>;
148
    using PathToArray = phmap::flat_hash_map<UInt128, PathPartsWithArray, UInt128TrivialHash>;
149
    using KeyToSizes = phmap::flat_hash_map<StringRef, std::vector<size_t>, StringRefHash>;
150
    struct ParseArrayContext {
151
        size_t current_size = 0;
152
        size_t total_size = 0;
153
        PathToArray arrays_by_path;
154
        KeyToSizes nested_sizes_by_key;
155
        bool has_nested_in_flatten = false;
156
        bool is_top_array = false;
157
        bool check_duplicate_json_path = false;
158
        const phmap::flat_hash_set<std::string>* preserve_decimal_number_paths = nullptr;
159
        const std::function<bool(std::string_view)>* preserve_decimal_number_path_matcher = nullptr;
160
        PathInData::Parts path_prefix_for_typed_paths;
161
    };
162
    void traverse(const Element& element, ParseContext& ctx);
163
    void traverseObject(const JSONObject& object, ParseContext& ctx);
164
    void traverseArray(const JSONArray& array, ParseContext& ctx);
165
    void appendValueIfNotDuplicate(ParseContext& ctx, const PathInData::Parts& path, Field&& value);
166
    void traverseArrayElement(const Element& element, ParseArrayContext& ctx);
167
    bool shouldPreserveNumberAsString(const ParseContext& ctx) const;
168
    void checkAmbiguousStructure(const ParseArrayContext& ctx,
169
                                 const std::vector<PathInData::Parts>& paths);
170
    void handleExistingPath(std::pair<PathInData::Parts, Array>& path_data,
171
                            const PathInData::Parts& path, Field& value, ParseArrayContext& ctx,
172
                            size_t& keys_to_update);
173
    void handleNewPath(UInt128 hash, const PathInData::Parts& path, Field& value,
174
                       ParseArrayContext& ctx);
175
    static void fillMissedValuesInArrays(ParseArrayContext& ctx);
176
    static bool tryInsertDefaultFromNested(ParseArrayContext& ctx, const PathInData::Parts& path,
177
                                           Array& array);
178
    static StringRef getNameOfNested(const PathInData::Parts& path, const Field& value);
179
180
    bool has_nested = false;
181
    void check_has_nested_object(const Element& element);
182
    void traverseAsJsonb(const Element& element, JsonbWriter& writer);
183
    void traverseObjectAsJsonb(const JSONObject& object, JsonbWriter& writer);
184
    void traverseArrayAsJsonb(const JSONArray& array, JsonbWriter& writer);
185
186
    ParserImpl parser;
187
};
188
189
} // namespace doris