Line | Count | Source |
1 | | // Licensed to the Apache Software Foundation (ASF) under one |
2 | | // or more contributor license agreements. See the NOTICE file |
3 | | // distributed with this work for additional information |
4 | | // regarding copyright ownership. The ASF licenses this file |
5 | | // to you under the Apache License, Version 2.0 (the |
6 | | // "License"); you may not use this file except in compliance |
7 | | // with the License. You may obtain a copy of the License at |
8 | | // |
9 | | // http://www.apache.org/licenses/LICENSE-2.0 |
10 | | // |
11 | | // Unless required by applicable law or agreed to in writing, |
12 | | // software distributed under the License is distributed on an |
13 | | // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY |
14 | | // KIND, either express or implied. See the License for the |
15 | | // specific language governing permissions and limitations |
16 | | // under the License. |
17 | | // This file is copied from |
18 | | // https://github.com/ClickHouse/ClickHouse/blob/master/src/Common/JSONParsers/SimdJSONParser.h |
19 | | // and modified by Doris |
20 | | |
21 | | #pragma once |
22 | | |
23 | | #include <parallel_hashmap/phmap.h> |
24 | | |
25 | | #include <cstddef> |
26 | | #include <functional> |
27 | | #include <optional> |
28 | | #include <string> |
29 | | #include <string_view> |
30 | | #include <utility> |
31 | | #include <vector> |
32 | | |
33 | | #include "core/column/column.h" |
34 | | #include "core/data_type/primitive_type.h" |
35 | | #include "core/field.h" |
36 | | #include "core/string_ref.h" |
37 | | #include "core/uint128.h" |
38 | | #include "util/json/path_in_data.h" |
39 | | #include "util/json/simd_json_parser.h" |
40 | | #include "util/jsonb_writer.h" |
41 | | |
42 | | namespace doris { |
43 | | |
44 | | template <typename Element> |
45 | 1.88M | Field getValueAsField(const Element& element, bool preserve_number_as_string = false) { |
46 | | // bool will convert to type FiledType::UInt64 |
47 | 1.88M | if (element.isBool()) { |
48 | 20.6k | return Field::create_field<TYPE_BOOLEAN>(element.getBool()); |
49 | 20.6k | } |
50 | 1.86M | if (element.isInt64()) { |
51 | 803k | return Field::create_field<TYPE_BIGINT>(element.getInt64()); |
52 | 803k | } |
53 | | // doris only support signed integers at present |
54 | | // use largeint to store unsigned int64 |
55 | 1.05M | if (element.isUInt64()) { |
56 | 21 | return Field::create_field<TYPE_LARGEINT>(static_cast<int128_t>(element.getUInt64())); |
57 | 21 | } |
58 | 1.05M | if (element.isDouble()) { |
59 | 116k | if (preserve_number_as_string) { |
60 | 14 | return Field::create_field<TYPE_STRING>(String(element.getRawNumber())); |
61 | 14 | } |
62 | 116k | return Field::create_field<TYPE_DOUBLE>(element.getDouble()); |
63 | 116k | } |
64 | 942k | if (element.isString()) { |
65 | 930k | return Field::create_field<TYPE_STRING>(String(element.getString())); |
66 | 930k | } |
67 | 11.9k | if (element.isNull()) { |
68 | 11.9k | return {}; |
69 | 11.9k | } |
70 | 0 | return {}; |
71 | 11.9k | } |
72 | | |
73 | | template <typename Element> |
74 | 2.81k | void writeValueAsJsonb(const Element& element, JsonbWriter& writer) { |
75 | | // bool will convert to type FiledType::UInt64 |
76 | 2.81k | if (element.isBool()) { |
77 | 77 | writer.writeBool(element.getBool()); |
78 | 77 | return; |
79 | 77 | } |
80 | 2.74k | if (element.isInt64()) { |
81 | 358 | writer.writeInt64(element.getInt64()); |
82 | 358 | return; |
83 | 358 | } |
84 | | // doris only support signed integers at present |
85 | | // use largeint to store unsigned int64 |
86 | 2.38k | if (element.isUInt64()) { |
87 | 0 | writer.writeInt128(static_cast<int128_t>(element.getUInt64())); |
88 | 0 | return; |
89 | 0 | } |
90 | 2.38k | if (element.isDouble()) { |
91 | 303 | writer.writeDouble(element.getDouble()); |
92 | 303 | return; |
93 | 303 | } |
94 | 2.07k | if (element.isString()) { |
95 | 2.00k | writer.writeStartString(); |
96 | 2.00k | std::string_view str = element.getString(); |
97 | 2.00k | writer.writeString(str.data(), str.size()); |
98 | 2.00k | writer.writeEndString(); |
99 | 2.00k | return; |
100 | 2.00k | } |
101 | 77 | if (element.isNull()) { |
102 | 77 | writer.writeNull(); |
103 | 77 | return; |
104 | 77 | } |
105 | 77 | } |
106 | | |
107 | | struct ParseConfig { |
108 | | bool deprecated_enable_flatten_nested = false; |
109 | | bool check_duplicate_json_path = false; |
110 | | enum class ParseTo { |
111 | | OnlySubcolumns = 0, |
112 | | OnlyDocValueColumn = 1, |
113 | | }; |
114 | | ParseTo parse_to = ParseTo::OnlySubcolumns; |
115 | | phmap::flat_hash_set<std::string> preserve_decimal_number_paths; |
116 | | std::function<bool(std::string_view)> preserve_decimal_number_path_matcher; |
117 | | }; |
118 | | /// Result of parsing of a document. |
119 | | /// Contains all paths extracted from document |
120 | | /// and values which are related to them. |
121 | | struct ParseResult { |
122 | | std::vector<PathInData> paths; |
123 | | std::vector<Field> values; |
124 | | }; |
125 | | template <typename ParserImpl> |
126 | | class JSONDataParser { |
127 | | public: |
128 | | using Element = typename ParserImpl::Element; |
129 | | using JSONObject = typename ParserImpl::Object; |
130 | | using JSONArray = typename ParserImpl::Array; |
131 | | std::optional<ParseResult> parse(const char* begin, size_t length, const ParseConfig& config); |
132 | | |
133 | | private: |
134 | | struct ParseContext { |
135 | | PathInDataBuilder builder; |
136 | | std::vector<PathInData::Parts> paths; |
137 | | std::vector<Field> values; |
138 | | phmap::flat_hash_set<std::string> visited_path_names; |
139 | | bool deprecated_enable_flatten_nested = false; |
140 | | bool check_duplicate_json_path = false; |
141 | | bool has_nested_in_flatten = false; |
142 | | bool is_top_array = false; |
143 | | const phmap::flat_hash_set<std::string>* preserve_decimal_number_paths = nullptr; |
144 | | const std::function<bool(std::string_view)>* preserve_decimal_number_path_matcher = nullptr; |
145 | | PathInData::Parts path_prefix_for_typed_paths; |
146 | | }; |
147 | | using PathPartsWithArray = std::pair<PathInData::Parts, Array>; |
148 | | using PathToArray = phmap::flat_hash_map<UInt128, PathPartsWithArray, UInt128TrivialHash>; |
149 | | using KeyToSizes = phmap::flat_hash_map<StringRef, std::vector<size_t>, StringRefHash>; |
150 | | struct ParseArrayContext { |
151 | | size_t current_size = 0; |
152 | | size_t total_size = 0; |
153 | | PathToArray arrays_by_path; |
154 | | KeyToSizes nested_sizes_by_key; |
155 | | bool has_nested_in_flatten = false; |
156 | | bool is_top_array = false; |
157 | | bool check_duplicate_json_path = false; |
158 | | const phmap::flat_hash_set<std::string>* preserve_decimal_number_paths = nullptr; |
159 | | const std::function<bool(std::string_view)>* preserve_decimal_number_path_matcher = nullptr; |
160 | | PathInData::Parts path_prefix_for_typed_paths; |
161 | | }; |
162 | | void traverse(const Element& element, ParseContext& ctx); |
163 | | void traverseObject(const JSONObject& object, ParseContext& ctx); |
164 | | void traverseArray(const JSONArray& array, ParseContext& ctx); |
165 | | void appendValueIfNotDuplicate(ParseContext& ctx, const PathInData::Parts& path, Field&& value); |
166 | | void traverseArrayElement(const Element& element, ParseArrayContext& ctx); |
167 | | bool shouldPreserveNumberAsString(const ParseContext& ctx) const; |
168 | | void checkAmbiguousStructure(const ParseArrayContext& ctx, |
169 | | const std::vector<PathInData::Parts>& paths); |
170 | | void handleExistingPath(std::pair<PathInData::Parts, Array>& path_data, |
171 | | const PathInData::Parts& path, Field& value, ParseArrayContext& ctx, |
172 | | size_t& keys_to_update); |
173 | | void handleNewPath(UInt128 hash, const PathInData::Parts& path, Field& value, |
174 | | ParseArrayContext& ctx); |
175 | | static void fillMissedValuesInArrays(ParseArrayContext& ctx); |
176 | | static bool tryInsertDefaultFromNested(ParseArrayContext& ctx, const PathInData::Parts& path, |
177 | | Array& array); |
178 | | static StringRef getNameOfNested(const PathInData::Parts& path, const Field& value); |
179 | | |
180 | | bool has_nested = false; |
181 | | void check_has_nested_object(const Element& element); |
182 | | void traverseAsJsonb(const Element& element, JsonbWriter& writer); |
183 | | void traverseObjectAsJsonb(const JSONObject& object, JsonbWriter& writer); |
184 | | void traverseArrayAsJsonb(const JSONArray& array, JsonbWriter& writer); |
185 | | |
186 | | ParserImpl parser; |
187 | | }; |
188 | | |
189 | | } // namespace doris |