be/src/util/json/json_parser.h
Line | Count | Source |
1 | | // Licensed to the Apache Software Foundation (ASF) under one |
2 | | // or more contributor license agreements. See the NOTICE file |
3 | | // distributed with this work for additional information |
4 | | // regarding copyright ownership. The ASF licenses this file |
5 | | // to you under the Apache License, Version 2.0 (the |
6 | | // "License"); you may not use this file except in compliance |
7 | | // with the License. You may obtain a copy of the License at |
8 | | // |
9 | | // http://www.apache.org/licenses/LICENSE-2.0 |
10 | | // |
11 | | // Unless required by applicable law or agreed to in writing, |
12 | | // software distributed under the License is distributed on an |
13 | | // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY |
14 | | // KIND, either express or implied. See the License for the |
15 | | // specific language governing permissions and limitations |
16 | | // under the License. |
17 | | // This file is copied from |
18 | | // https://github.com/ClickHouse/ClickHouse/blob/master/src/Common/JSONParsers/SimdJSONParser.h |
19 | | // and modified by Doris |
20 | | |
21 | | #pragma once |
22 | | |
23 | | #include <parallel_hashmap/phmap.h> |
24 | | #include <stddef.h> |
25 | | |
26 | | #include <optional> |
27 | | #include <string> |
28 | | #include <utility> |
29 | | #include <vector> |
30 | | |
31 | | #include "core/column/column.h" |
32 | | #include "core/data_type/primitive_type.h" |
33 | | #include "core/field.h" |
34 | | #include "core/string_ref.h" |
35 | | #include "core/uint128.h" |
36 | | #include "util/json/path_in_data.h" |
37 | | #include "util/json/simd_json_parser.h" |
38 | | #include "util/jsonb_writer.h" |
39 | | |
40 | | namespace doris { |
41 | | |
42 | | template <typename Element> |
43 | 19.4M | Field getValueAsField(const Element& element) { /* Builds a Field from a single JSON element according to the kind the element reports. */ |
44 | | // Kinds are tested in this order: bool, int64, uint64, double, string, null. A bool becomes a TYPE_BOOLEAN field. |
45 | 19.4M | if (element.isBool()) { |
46 | 282k | return Field::create_field<TYPE_BOOLEAN>(element.getBool()); |
47 | 282k | } |
48 | 19.2M | if (element.isInt64()) { |
49 | 5.31M | return Field::create_field<TYPE_BIGINT>(element.getInt64()); |
50 | 5.31M | } |
51 | | // Doris only supports signed integers at present, so an unsigned 64-bit value |
52 | | // is cast to int128_t and stored as a TYPE_LARGEINT field. |
53 | 13.9M | if (element.isUInt64()) { |
54 | 29 | return Field::create_field<TYPE_LARGEINT>(static_cast<int128_t>(element.getUInt64())); |
55 | 29 | } |
56 | 13.9M | if (element.isDouble()) { |
57 | 2.89M | return Field::create_field<TYPE_DOUBLE>(element.getDouble()); |
58 | 2.89M | } |
59 | 11.2M | if (element.isString()) { |
60 | 11.2M | return Field::create_field<TYPE_STRING>(String(element.getString())); |
61 | 11.2M | } |
62 | 18.4E | if (element.isNull()) { |
63 | 108k | return Field(); /* JSON null maps to a default-constructed Field */ |
64 | 108k | } |
65 | 18.4E | return Field(); /* any other element kind also yields a default-constructed Field */ |
66 | 18.4E | } |
67 | | |
68 | | template <typename Element> |
69 | 495k | void writeValueAsJsonb(const Element& element, JsonbWriter& writer) { /* Emits a single JSON element through the given JsonbWriter according to the kind the element reports. */ |
70 | | // Kinds are tested in this order: bool, int64, uint64, double, string, null. A bool is emitted with writeBool. |
71 | 495k | if (element.isBool()) { |
72 | 76.8k | writer.writeBool(element.getBool()); |
73 | 76.8k | return; |
74 | 76.8k | } |
75 | 418k | if (element.isInt64()) { |
76 | 4.55k | writer.writeInt64(element.getInt64()); |
77 | 4.55k | return; |
78 | 4.55k | } |
79 | | // Doris only supports signed integers at present, so an unsigned 64-bit value |
80 | | // is cast to int128_t and emitted with writeInt128. |
81 | 414k | if (element.isUInt64()) { |
82 | 0 | writer.writeInt128(static_cast<int128_t>(element.getUInt64())); |
83 | 0 | return; |
84 | 0 | } |
85 | 414k | if (element.isDouble()) { |
86 | 314 | writer.writeDouble(element.getDouble()); |
87 | 314 | return; |
88 | 314 | } |
89 | 413k | if (element.isString()) { |
90 | 412k | writer.writeStartString(); /* a string is emitted as a start / payload / end call sequence */ |
91 | 412k | std::string_view str = element.getString(); |
92 | 412k | writer.writeString(str.data(), str.size()); |
93 | 412k | writer.writeEndString(); |
94 | 412k | return; |
95 | 412k | } |
96 | 1.66k | if (element.isNull()) { |
97 | 1.66k | writer.writeNull(); |
98 | 1.66k | return; |
99 | 1.66k | } |
100 | 1.65k | } /* an element of any other kind reaches this point and nothing is written for it */ |
101 | | |
102 | | struct ParseConfig { /* Options accepted by JSONDataParser::parse(). */ |
103 | | bool enable_flatten_nested = false; /* off by default; NOTE(review): the effect of this flag is implemented outside this header - confirm there */ |
104 | | enum class ParseTo { /* the two output targets a parse call can be asked for */ |
105 | | OnlySubcolumns = 0, |
106 | | OnlyDocValueColumn = 1, |
107 | | }; |
108 | | ParseTo parse_to = ParseTo::OnlySubcolumns; /* OnlySubcolumns is the default target */ |
109 | | }; |
110 | | /// Result of parsing one document: |
111 | | /// the paths extracted from the document and the values related to them. |
112 | | /// NOTE(review): paths[i] presumably corresponds to values[i] - confirm in the parse() implementation. |
113 | | struct ParseResult { |
114 | | std::vector<PathInData> paths; |
115 | | std::vector<Field> values; |
116 | | }; |
117 | | template <typename ParserImpl> /* ParserImpl supplies the Element, Object and Array types aliased below */ |
118 | | class JSONDataParser { /* Declaration only: every member function here is defined outside the lines of this header. */ |
119 | | public: |
120 | | using Element = typename ParserImpl::Element; |
121 | | using JSONObject = typename ParserImpl::Object; |
122 | | using JSONArray = typename ParserImpl::Array; |
123 | | std::optional<ParseResult> parse(const char* begin, size_t length, const ParseConfig& config); /* takes a character buffer given as pointer plus length; NOTE(review): an empty optional presumably signals a failed parse - confirm */ |
124 | | |
125 | | private: |
126 | | struct ParseContext { /* state passed by reference to traverse(), traverseObject() and traverseArray() */ |
127 | | PathInDataBuilder builder; |
128 | | std::vector<PathInData::Parts> paths; |
129 | | std::vector<Field> values; |
130 | | bool enable_flatten_nested = false; /* same name and default as ParseConfig::enable_flatten_nested; presumably copied from it - confirm */ |
131 | | bool has_nested_in_flatten = false; |
132 | | bool is_top_array = false; |
133 | | }; |
134 | | using PathPartsWithArray = std::pair<PathInData::Parts, Array>; /* Array is a different type from JSONArray and is not declared in this header; presumably the Field array type - confirm */ |
135 | | using PathToArray = phmap::flat_hash_map<UInt128, PathPartsWithArray, UInt128TrivialHash>; /* keyed by UInt128; handleNewPath() receives a UInt128 named hash, presumably the same key - confirm */ |
136 | | using KeyToSizes = phmap::flat_hash_map<StringRef, std::vector<size_t>, StringRefHash>; |
137 | | struct ParseArrayContext { /* state passed by reference to traverseArrayElement() and the array helpers declared below */ |
138 | | size_t current_size = 0; |
139 | | size_t total_size = 0; |
140 | | PathToArray arrays_by_path; |
141 | | KeyToSizes nested_sizes_by_key; |
142 | | bool has_nested_in_flatten = false; |
143 | | bool is_top_array = false; |
144 | | }; |
145 | | void traverse(const Element& element, ParseContext& ctx); |
146 | | void traverseObject(const JSONObject& object, ParseContext& ctx); |
147 | | void traverseArray(const JSONArray& array, ParseContext& ctx); |
148 | | void traverseArrayElement(const Element& element, ParseArrayContext& ctx); |
149 | | void checkAmbiguousStructure(const ParseArrayContext& ctx, |
150 | | const std::vector<PathInData::Parts>& paths); |
151 | | void handleExistingPath(std::pair<PathInData::Parts, Array>& path_data, /* the parameter type is the one aliased as PathPartsWithArray */ |
152 | | const PathInData::Parts& path, Field& value, ParseArrayContext& ctx, |
153 | | size_t& keys_to_update); |
154 | | void handleNewPath(UInt128 hash, const PathInData::Parts& path, Field& value, |
155 | | ParseArrayContext& ctx); |
156 | | static void fillMissedValuesInArrays(ParseArrayContext& ctx); |
157 | | static bool tryInsertDefaultFromNested(ParseArrayContext& ctx, const PathInData::Parts& path, |
158 | | Array& array); |
159 | | static StringRef getNameOfNested(const PathInData::Parts& path, const Field& value); |
160 | | |
161 | | bool has_nested = false; /* NOTE(review): presumably updated by check_has_nested_object() - confirm in the implementation */ |
162 | | void check_has_nested_object(const Element& element); |
163 | | void traverseAsJsonb(const Element& element, JsonbWriter& writer); /* this and the next two declarations take a JsonbWriter& where traverse(), traverseObject() and traverseArray() take a ParseContext& */ |
164 | | void traverseObjectAsJsonb(const JSONObject& object, JsonbWriter& writer); |
165 | | void traverseArrayAsJsonb(const JSONArray& array, JsonbWriter& writer); |
166 | | |
167 | | ParserImpl parser; /* the ParserImpl instance held by value in this object */ |
168 | | }; |
169 | | |
170 | | } // namespace doris |