Coverage Report

Created: 2026-05-14 18:32

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
be/src/util/json/json_parser.h
Line
Count
Source
1
// Licensed to the Apache Software Foundation (ASF) under one
2
// or more contributor license agreements.  See the NOTICE file
3
// distributed with this work for additional information
4
// regarding copyright ownership.  The ASF licenses this file
5
// to you under the Apache License, Version 2.0 (the
6
// "License"); you may not use this file except in compliance
7
// with the License.  You may obtain a copy of the License at
8
//
9
//   http://www.apache.org/licenses/LICENSE-2.0
10
//
11
// Unless required by applicable law or agreed to in writing,
12
// software distributed under the License is distributed on an
13
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14
// KIND, either express or implied.  See the License for the
15
// specific language governing permissions and limitations
16
// under the License.
17
// This file is copied from
18
// https://github.com/ClickHouse/ClickHouse/blob/master/src/Common/JSONParsers/SimdJSONParser.h
19
// and modified by Doris
20
21
#pragma once
22
23
#include <parallel_hashmap/phmap.h>
24
#include <stddef.h>
25
26
#include <optional>
27
#include <string>
28
#include <utility>
29
#include <vector>
30
31
#include "core/column/column.h"
32
#include "core/data_type/primitive_type.h"
33
#include "core/field.h"
34
#include "core/string_ref.h"
35
#include "core/uint128.h"
36
#include "util/json/path_in_data.h"
37
#include "util/json/simd_json_parser.h"
38
#include "util/jsonb_writer.h"
39
40
namespace doris {
41
42
template <typename Element>
43
19.7M
Field getValueAsField(const Element& element) { // Builds a Field from a scalar JSON element.
44
    // Type checks run in a fixed order: bool, int64, uint64, double, string, null. A bool becomes a TYPE_BOOLEAN field.
45
19.7M
    if (element.isBool()) {
46
282k
        return Field::create_field<TYPE_BOOLEAN>(element.getBool());
47
282k
    }
48
19.4M
    if (element.isInt64()) {
49
5.31M
        return Field::create_field<TYPE_BIGINT>(element.getInt64());
50
5.31M
    }
51
    // Doris only supports signed integers at present,
52
    // so an unsigned 64-bit value is cast to int128_t and stored as TYPE_LARGEINT.
53
14.1M
    if (element.isUInt64()) {
54
29
        return Field::create_field<TYPE_LARGEINT>(static_cast<int128_t>(element.getUInt64()));
55
29
    }
56
14.1M
    if (element.isDouble()) {
57
2.89M
        return Field::create_field<TYPE_DOUBLE>(element.getDouble());
58
2.89M
    }
59
11.2M
    if (element.isString()) {
60
11.2M
        return Field::create_field<TYPE_STRING>(String(element.getString()));
61
11.2M
    }
62
18.4E
    if (element.isNull()) {
63
108k
        return Field(); // JSON null maps to a default-constructed Field
64
108k
    }
65
18.4E
    return Field(); // no type check matched: also a default-constructed Field
66
18.4E
}
67
68
template <typename Element>
69
495k
void writeValueAsJsonb(const Element& element, JsonbWriter& writer) { // Emits a scalar JSON element through `writer`.
70
    // Type checks run in a fixed order: bool, int64, uint64, double, string, null. A bool is written with writeBool().
71
495k
    if (element.isBool()) {
72
76.8k
        writer.writeBool(element.getBool());
73
76.8k
        return;
74
76.8k
    }
75
418k
    if (element.isInt64()) {
76
4.55k
        writer.writeInt64(element.getInt64());
77
4.55k
        return;
78
4.55k
    }
79
    // Doris only supports signed integers at present,
80
    // so an unsigned 64-bit value is cast to int128_t and written with writeInt128().
81
413k
    if (element.isUInt64()) {
82
0
        writer.writeInt128(static_cast<int128_t>(element.getUInt64()));
83
0
        return;
84
0
    }
85
413k
    if (element.isDouble()) {
86
314
        writer.writeDouble(element.getDouble());
87
314
        return;
88
314
    }
89
413k
    if (element.isString()) {
90
412k
        writer.writeStartString(); // the payload is bracketed by writeStartString()/writeEndString()
91
412k
        std::string_view str = element.getString();
92
412k
        writer.writeString(str.data(), str.size());
93
412k
        writer.writeEndString();
94
412k
        return;
95
412k
    }
96
1.66k
    if (element.isNull()) {
97
1.66k
        writer.writeNull();
98
1.66k
        return;
99
1.66k
    }
100
1.63k
} // no type check matched: nothing is written to `writer`
101
102
struct ParseConfig { // Options passed to JSONDataParser::parse().
103
    bool deprecated_enable_flatten_nested = false; // off by default; NOTE(review): effect lives in the parser implementation - confirm there
104
    bool check_duplicate_json_path = false; // off by default; NOTE(review): presumably enables duplicate-path detection - confirm
105
    enum class ParseTo { // selects the output target of a parse
106
        OnlySubcolumns = 0,
107
        OnlyDocValueColumn = 1,
108
    };
109
    ParseTo parse_to = ParseTo::OnlySubcolumns; // default target is OnlySubcolumns
110
};
111
/// Result of parsing a document.
112
/// Contains all paths extracted from the document
113
/// and the values related to them.
114
struct ParseResult {
115
    std::vector<PathInData> paths;
116
    std::vector<Field> values; // NOTE(review): presumably values[i] belongs to paths[i] - confirm against parse()
117
};
118
template <typename ParserImpl>
119
class JSONDataParser { // JSON document parser templated on the backend ParserImpl; only declarations appear in this header.
120
public:
121
    using Element = typename ParserImpl::Element; // element/object/array types are taken from the backend
122
    using JSONObject = typename ParserImpl::Object;
123
    using JSONArray = typename ParserImpl::Array;
124
    std::optional<ParseResult> parse(const char* begin, size_t length, const ParseConfig& config); // input is the buffer [begin, begin + length); NOTE(review): presumably std::nullopt signals a parse failure - confirm
125
126
private:
127
    struct ParseContext { // state threaded through traverse()/traverseObject()/traverseArray()
128
        PathInDataBuilder builder;
129
        std::vector<PathInData::Parts> paths;
130
        std::vector<Field> values;
131
        phmap::flat_hash_set<std::string> visited_path_names;
132
        bool deprecated_enable_flatten_nested = false; // same name as the ParseConfig field
133
        bool check_duplicate_json_path = false; // same name as the ParseConfig field
134
        bool has_nested_in_flatten = false;
135
        bool is_top_array = false;
136
    };
137
    using PathPartsWithArray = std::pair<PathInData::Parts, Array>; // path parts paired with an Array
138
    using PathToArray = phmap::flat_hash_map<UInt128, PathPartsWithArray, UInt128TrivialHash>; // keyed by a UInt128; NOTE(review): presumably a hash of the path - confirm
139
    using KeyToSizes = phmap::flat_hash_map<StringRef, std::vector<size_t>, StringRefHash>;
140
    struct ParseArrayContext { // state threaded through the array-element helpers declared below
141
        size_t current_size = 0;
142
        size_t total_size = 0;
143
        PathToArray arrays_by_path;
144
        KeyToSizes nested_sizes_by_key;
145
        bool has_nested_in_flatten = false;
146
        bool is_top_array = false;
147
        bool check_duplicate_json_path = false;
148
    };
149
    void traverse(const Element& element, ParseContext& ctx);
150
    void traverseObject(const JSONObject& object, ParseContext& ctx);
151
    void traverseArray(const JSONArray& array, ParseContext& ctx);
152
    void appendValueIfNotDuplicate(ParseContext& ctx, const PathInData::Parts& path, Field&& value);
153
    void traverseArrayElement(const Element& element, ParseArrayContext& ctx);
154
    void checkAmbiguousStructure(const ParseArrayContext& ctx,
155
                                 const std::vector<PathInData::Parts>& paths);
156
    void handleExistingPath(std::pair<PathInData::Parts, Array>& path_data, // same type as PathPartsWithArray
157
                            const PathInData::Parts& path, Field& value, ParseArrayContext& ctx,
158
                            size_t& keys_to_update);
159
    void handleNewPath(UInt128 hash, const PathInData::Parts& path, Field& value,
160
                       ParseArrayContext& ctx);
161
    static void fillMissedValuesInArrays(ParseArrayContext& ctx);
162
    static bool tryInsertDefaultFromNested(ParseArrayContext& ctx, const PathInData::Parts& path,
163
                                           Array& array);
164
    static StringRef getNameOfNested(const PathInData::Parts& path, const Field& value);
165
166
    bool has_nested = false; // NOTE(review): presumably set by check_has_nested_object() - confirm
167
    void check_has_nested_object(const Element& element);
168
    void traverseAsJsonb(const Element& element, JsonbWriter& writer); // the *AsJsonb helpers take a JsonbWriter instead of a ParseContext
169
    void traverseObjectAsJsonb(const JSONObject& object, JsonbWriter& writer);
170
    void traverseArrayAsJsonb(const JSONArray& array, JsonbWriter& writer);
171
172
    ParserImpl parser; // backend parser instance owned by this object
173
};
174
175
} // namespace doris