be/src/util/json/path_in_data.cpp
Line | Count | Source |
1 | | // Licensed to the Apache Software Foundation (ASF) under one |
2 | | // or more contributor license agreements. See the NOTICE file |
3 | | // distributed with this work for additional information |
4 | | // regarding copyright ownership. The ASF licenses this file |
5 | | // to you under the Apache License, Version 2.0 (the |
6 | | // "License"); you may not use this file except in compliance |
7 | | // with the License. You may obtain a copy of the License at |
8 | | // |
9 | | // http://www.apache.org/licenses/LICENSE-2.0 |
10 | | // |
11 | | // Unless required by applicable law or agreed to in writing, |
12 | | // software distributed under the License is distributed on an |
13 | | // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY |
14 | | // KIND, either express or implied. See the License for the |
15 | | // specific language governing permissions and limitations |
16 | | // under the License. |
17 | | // This file is copied from |
18 | | // https://github.com/ClickHouse/ClickHouse/blob/master/src/DataTypes/Serializations/PathInData.cpp |
19 | | // and modified by Doris |
20 | | |
21 | | #include "util/json/path_in_data.h" |
22 | | |
23 | | #include <assert.h> |
24 | | |
25 | | #include <string_view> |
26 | | |
27 | | #include "common/cast_set.h" |
28 | | #include "exec/common/sip_hash.h" |
29 | | #include "exec/common/variant_util.h" |
30 | | |
31 | | namespace doris { |
32 | | |
33 | | #include "common/compile_check_begin.h" |
34 | | |
35 | 2.65M | PathInData::PathInData(std::string_view path_, bool is_typed_) : path(path_), is_typed(is_typed_) { |
36 | 2.65M | const char* begin = path.data(); |
37 | 2.65M | const char* end = path.data() + path.size(); |
38 | 61.2M | for (const char* it = path.data(); it != end; ++it) { |
39 | 58.6M | if (*it == '.') { |
40 | 5.13M | size_t size = static_cast<size_t>(it - begin); |
41 | 5.13M | parts.emplace_back(std::string_view {begin, size}, false, 0); |
42 | 5.13M | begin = it + 1; |
43 | 5.13M | } |
44 | 58.6M | } |
45 | 2.65M | size_t size = static_cast<size_t>(end - begin); |
46 | 2.65M | parts.emplace_back(std::string_view {begin, size}, false, 0.); |
47 | 2.65M | } |
48 | | |
49 | 3 | PathInData::PathInData(std::string_view path_, const Parts& parts_, bool is_typed_) { |
50 | 3 | path = path_; |
51 | 3 | is_typed = is_typed_; |
52 | 9 | for (const auto& part : parts_) { |
53 | 9 | has_nested |= part.is_nested; |
54 | 9 | parts.emplace_back(part); |
55 | 9 | } |
56 | 3 | } |
57 | | |
58 | 19.0M | PathInData::PathInData(const Parts& parts_) { |
59 | 19.0M | build_path(parts_); |
60 | 19.0M | build_parts(parts_); |
61 | 19.0M | } |
62 | 78.6M | PathInData::PathInData(const PathInData& other) : path(other.path), is_typed(other.is_typed) { |
63 | 78.6M | build_parts(other.get_parts()); |
64 | 78.6M | } |
65 | | |
66 | 15.2k | PathInData::PathInData(const std::string& root, const std::vector<std::string>& paths) { |
67 | 15.2k | PathInDataBuilder path_builder; |
68 | 15.2k | path_builder.append(root, false); |
69 | 15.2k | for (const std::string& p : paths) { |
70 | 13.7k | path_builder.append(p, false); |
71 | 13.7k | } |
72 | 15.2k | build_path(path_builder.get_parts()); |
73 | 15.2k | build_parts(path_builder.get_parts()); |
74 | 15.2k | } |
75 | | |
76 | 1 | PathInData::PathInData(const std::vector<std::string>& paths) { |
77 | 1 | PathInDataBuilder path_builder; |
78 | 4 | for (size_t i = 0; i < paths.size(); ++i) { |
79 | 3 | path_builder.append(paths[i], false); |
80 | 3 | } |
81 | 1 | build_path(path_builder.get_parts()); |
82 | 1 | build_parts(path_builder.get_parts()); |
83 | 1 | } |
84 | | |
85 | 2.82M | PathInData& PathInData::operator=(const PathInData& other) { |
86 | 2.82M | if (this != &other) { |
87 | 2.82M | path = other.path; |
88 | 2.82M | is_typed = other.is_typed; |
89 | 2.82M | build_parts(other.parts); |
90 | 2.82M | } |
91 | 2.82M | return *this; |
92 | 2.82M | } |
93 | | |
94 | 45.1M | UInt128 PathInData::get_parts_hash(const Parts& parts_, bool is_typed_) { |
95 | 45.1M | SipHash hash; |
96 | 45.1M | hash.update(parts_.size()); |
97 | 45.1M | for (const auto& part : parts_) { |
98 | 719k | hash.update(part.key.data(), part.key.length()); |
99 | 719k | hash.update(part.is_nested); |
100 | 719k | hash.update(part.anonymous_array_level); |
101 | 719k | } |
102 | 45.1M | hash.update(is_typed_); |
103 | 45.1M | UInt128 res; |
104 | 45.1M | hash.get128(res); |
105 | 45.1M | return res; |
106 | 45.1M | } |
107 | | |
108 | 24.9M | void PathInData::build_path(const Parts& other_parts) { |
109 | 24.9M | if (other_parts.empty()) { |
110 | 1.77k | return; |
111 | 1.77k | } |
112 | 24.9M | path.clear(); |
113 | 24.9M | auto it = other_parts.begin(); |
114 | 24.9M | path += it->key; |
115 | 24.9M | ++it; |
116 | 50.0M | for (; it != other_parts.end(); ++it) { |
117 | 25.1M | path += "."; |
118 | 25.1M | path += it->key; |
119 | 25.1M | } |
120 | 24.9M | } |
121 | 106M | void PathInData::build_parts(const Parts& other_parts) { |
122 | 106M | if (other_parts.empty()) { |
123 | 64.3M | return; |
124 | 64.3M | } |
125 | 41.9M | parts.clear(); |
126 | 41.9M | parts.reserve(other_parts.size()); |
127 | 41.9M | const char* begin = path.data(); |
128 | 98.0M | for (const auto& part : other_parts) { |
129 | 98.0M | has_nested |= part.is_nested; |
130 | 98.0M | parts.emplace_back(std::string_view {begin, part.key.length()}, part.is_nested, |
131 | 98.0M | part.anonymous_array_level); |
132 | 98.0M | begin += part.key.length() + 1; |
133 | 98.0M | } |
134 | 41.9M | } |
135 | | |
136 | 3.71M | void PathInData::from_protobuf(const segment_v2::ColumnPathInfo& pb) { |
137 | 3.71M | parts.clear(); |
138 | 3.71M | path = pb.path(); |
139 | 3.71M | has_nested = false; |
140 | 3.71M | is_typed = pb.is_typed(); |
141 | 3.71M | parts.reserve(pb.path_part_infos().size()); |
142 | 3.71M | const char* begin = path.data(); |
143 | 17.8M | for (const segment_v2::ColumnPathPartInfo& part_info : pb.path_part_infos()) { |
144 | 17.8M | Part part; |
145 | 17.8M | part.is_nested = part_info.is_nested(); |
146 | 17.8M | has_nested |= part.is_nested; |
147 | 17.8M | part.anonymous_array_level = |
148 | 17.8M | cast_set<uint8_t, uint32_t, false>(part_info.anonymous_array_level()); |
149 | | // use string_view to ref data in path |
150 | 17.8M | part.key = std::string_view {begin, part_info.key().length()}; |
151 | 17.8M | parts.push_back(part); |
152 | 17.8M | begin += part.key.length() + 1; |
153 | 17.8M | } |
154 | 3.71M | } |
155 | | |
156 | 2 | std::string PathInData::to_jsonpath() const { |
157 | 2 | std::string jsonpath = "$."; |
158 | 2 | if (parts.empty()) { |
159 | 0 | return jsonpath; |
160 | 0 | } |
161 | 2 | auto it = parts.begin(); |
162 | 2 | jsonpath += it->key; |
163 | 2 | ++it; |
164 | 3 | for (; it != parts.end(); ++it) { |
165 | 1 | jsonpath += "."; |
166 | 1 | jsonpath += it->key; |
167 | 1 | } |
168 | 2 | return jsonpath; |
169 | 2 | } |
170 | | |
171 | 361k | void PathInData::to_protobuf(segment_v2::ColumnPathInfo* pb, int32_t parent_col_unique_id) const { |
172 | 361k | pb->set_path(path); |
173 | 361k | pb->set_has_nested(has_nested); |
174 | 361k | pb->set_parrent_column_unique_id(parent_col_unique_id); |
175 | 361k | pb->set_is_typed(is_typed); |
176 | | |
177 | | // set parts info |
178 | 782k | for (const Part& part : parts) { |
179 | 782k | segment_v2::ColumnPathPartInfo& part_info = *pb->add_path_part_infos(); |
180 | 782k | part_info.set_key(std::string(part.key.data(), part.key.size())); |
181 | 782k | part_info.set_is_nested(part.is_nested); |
182 | 782k | part_info.set_anonymous_array_level(part.anonymous_array_level); |
183 | 782k | } |
184 | 361k | } |
185 | | |
186 | 43.7M | size_t PathInData::Hash::operator()(const PathInData& value) const { |
187 | 43.7M | auto hash = get_parts_hash(value.parts, value.is_typed); |
188 | 43.7M | return hash.low() ^ hash.high(); |
189 | 43.7M | } |
190 | | |
191 | 5.77k | bool PathInData::need_record_stats() const { |
192 | 5.77k | return !empty() && !is_typed && !has_nested && |
193 | 5.77k | path.find(DOC_VALUE_COLUMN_PATH) == std::string::npos; |
194 | 5.77k | } |
195 | | |
196 | 5.54M | PathInData PathInData::copy_pop_front() const { |
197 | 5.54M | return copy_pop_nfront(1); |
198 | 5.54M | } |
199 | | |
200 | 63.4k | PathInData PathInData::get_nested_prefix_path() const { |
201 | 63.4k | CHECK(has_nested_part()); |
202 | 63.4k | PathInData new_path; |
203 | 63.4k | Parts new_parts; |
204 | 116k | for (const Part& part : parts) { |
205 | 116k | new_parts.push_back(part); |
206 | 116k | if (part.is_nested) { |
207 | 63.4k | break; |
208 | 63.4k | } |
209 | 116k | } |
210 | 63.4k | new_path.build_path(new_parts); |
211 | 63.4k | new_path.build_parts(new_parts); |
212 | 63.4k | new_path.is_typed = is_typed; |
213 | 63.4k | return new_path; |
214 | 63.4k | } |
215 | | |
216 | 2 | PathInData PathInData::copy_pop_back() const { |
217 | 2 | if (parts.size() <= 1) { |
218 | 1 | return {}; |
219 | 1 | } |
220 | 1 | PathInData new_path; |
221 | 1 | Parts new_parts = parts; |
222 | 1 | new_parts.pop_back(); |
223 | 1 | new_path.build_path(new_parts); |
224 | 1 | new_path.build_parts(new_parts); |
225 | 1 | new_path.is_typed = is_typed; |
226 | 1 | return new_path; |
227 | 2 | } |
228 | | |
229 | 5.75M | PathInData PathInData::copy_pop_nfront(size_t n) const { |
230 | 5.75M | if (n >= parts.size()) { |
231 | 47.1k | return {}; |
232 | 47.1k | } |
233 | 5.70M | PathInData new_path; |
234 | 5.70M | Parts new_parts; |
235 | 5.75M | if (!parts.empty()) { |
236 | 5.75M | std::copy(parts.begin() + n, parts.end(), std::back_inserter(new_parts)); |
237 | 5.75M | } |
238 | 5.70M | new_path.build_path(new_parts); |
239 | 5.70M | new_path.build_parts(new_parts); |
240 | 5.70M | new_path.is_typed = is_typed; |
241 | 5.70M | return new_path; |
242 | 5.75M | } |
243 | | |
244 | | bool PathInData::try_strip_prefix(const std::string& name, const std::string& prefix_dot, |
245 | 0 | std::string* out) { |
246 | 0 | if (!name.starts_with(prefix_dot)) { |
247 | 0 | return false; |
248 | 0 | } |
249 | 0 | *out = name.substr(prefix_dot.size()); |
250 | 0 | return !out->empty(); |
251 | 0 | } |
252 | | |
253 | 0 | PathInData PathInData::append(const PathInData& base, std::string_view suffix) { |
254 | 0 | if (suffix.empty()) { |
255 | 0 | return base; |
256 | 0 | } |
257 | 0 | if (base.empty()) { |
258 | 0 | return PathInData(suffix); |
259 | 0 | } |
260 | 0 | std::string new_path; |
261 | 0 | new_path.reserve(base.get_path().size() + 1 + suffix.size()); |
262 | 0 | new_path.append(base.get_path()); |
263 | 0 | new_path.push_back('.'); |
264 | 0 | new_path.append(suffix.data(), suffix.size()); |
265 | 0 | return PathInData(new_path); |
266 | 0 | } |
267 | | |
268 | 19.4M | PathInDataBuilder& PathInDataBuilder::append(std::string_view key, bool is_array) { |
269 | 19.4M | if (parts.empty()) { |
270 | 15.2M | current_anonymous_array_level += is_array; |
271 | 15.2M | } |
272 | 19.4M | if (!parts.empty()) { |
273 | 4.16M | parts.back().is_nested = is_array; |
274 | 4.16M | } |
275 | 19.4M | parts.emplace_back(key, false, current_anonymous_array_level); |
276 | 19.4M | current_anonymous_array_level = 0; |
277 | 19.4M | return *this; |
278 | 19.4M | } |
279 | 643k | PathInDataBuilder& PathInDataBuilder::append(const PathInData::Parts& path, bool is_array) { |
280 | 643k | if (parts.empty()) { |
281 | 172k | current_anonymous_array_level += is_array; |
282 | 172k | } |
283 | 643k | if (!path.empty()) { |
284 | 35.3k | if (!parts.empty()) { |
285 | 24.9k | parts.back().is_nested = is_array; |
286 | 24.9k | } |
287 | 35.3k | auto it = parts.insert(parts.end(), path.begin(), path.end()); |
288 | 89.0k | for (; it != parts.end(); ++it) { |
289 | 53.7k | it->anonymous_array_level += current_anonymous_array_level; |
290 | 53.7k | } |
291 | 35.3k | current_anonymous_array_level = 0; |
292 | 35.3k | } |
293 | 643k | return *this; |
294 | 643k | } |
295 | | |
296 | 19.5M | void PathInDataBuilder::pop_back() { |
297 | 19.5M | if (!parts.empty()) { |
298 | 19.5M | parts.pop_back(); |
299 | 19.5M | } |
300 | 19.5M | } |
301 | | |
302 | 630k | void PathInDataBuilder::pop_back(size_t n) { |
303 | 630k | assert(n <= parts.size()); |
304 | 630k | parts.resize(parts.size() - n); |
305 | 630k | } |
306 | | |
307 | | #include "common/compile_check_end.h" |
308 | | |
309 | | } // namespace doris |