be/src/util/json/path_in_data.cpp
Line | Count | Source |
1 | | // Licensed to the Apache Software Foundation (ASF) under one |
2 | | // or more contributor license agreements. See the NOTICE file |
3 | | // distributed with this work for additional information |
4 | | // regarding copyright ownership. The ASF licenses this file |
5 | | // to you under the Apache License, Version 2.0 (the |
6 | | // "License"); you may not use this file except in compliance |
7 | | // with the License. You may obtain a copy of the License at |
8 | | // |
9 | | // http://www.apache.org/licenses/LICENSE-2.0 |
10 | | // |
11 | | // Unless required by applicable law or agreed to in writing, |
12 | | // software distributed under the License is distributed on an |
13 | | // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY |
14 | | // KIND, either express or implied. See the License for the |
15 | | // specific language governing permissions and limitations |
16 | | // under the License. |
17 | | // This file is copied from |
18 | | // https://github.com/ClickHouse/ClickHouse/blob/master/src/DataTypes/Serializations/PathInData.cpp |
19 | | // and modified by Doris |
20 | | |
21 | | #include "util/json/path_in_data.h" |
22 | | |
23 | | #include <assert.h> |
24 | | |
25 | | #include <string_view> |
26 | | |
27 | | #include "common/cast_set.h" |
28 | | #include "exec/common/sip_hash.h" |
29 | | #include "exec/common/variant_util.h" |
30 | | |
31 | | namespace doris { |
32 | | |
33 | 3.98M | PathInData::PathInData(std::string_view path_, bool is_typed_) : path(path_), is_typed(is_typed_) { |
34 | 3.98M | const char* begin = path.data(); |
35 | 3.98M | const char* end = path.data() + path.size(); |
36 | 97.8M | for (const char* it = path.data(); it != end; ++it) { |
37 | 93.8M | if (*it == '.') { |
38 | 8.23M | size_t size = static_cast<size_t>(it - begin); |
39 | 8.23M | parts.emplace_back(std::string_view {begin, size}, false, 0); |
40 | 8.23M | begin = it + 1; |
41 | 8.23M | } |
42 | 93.8M | } |
43 | 3.98M | size_t size = static_cast<size_t>(end - begin); |
44 | 3.98M | parts.emplace_back(std::string_view {begin, size}, false, 0.); |
45 | 3.98M | } |
46 | | |
47 | 3 | PathInData::PathInData(std::string_view path_, const Parts& parts_, bool is_typed_) { |
48 | 3 | path = path_; |
49 | 3 | is_typed = is_typed_; |
50 | 9 | for (const auto& part : parts_) { |
51 | 9 | has_nested |= part.is_nested; |
52 | 9 | parts.emplace_back(part); |
53 | 9 | } |
54 | 3 | } |
55 | | |
56 | 19.0M | PathInData::PathInData(const Parts& parts_) { |
57 | 19.0M | build_path(parts_); |
58 | 19.0M | build_parts(parts_); |
59 | 19.0M | } |
60 | 130M | PathInData::PathInData(const PathInData& other) : path(other.path), is_typed(other.is_typed) { |
61 | 130M | build_parts(other.get_parts()); |
62 | 130M | } |
63 | | |
64 | 14.0k | PathInData::PathInData(const std::string& root, const std::vector<std::string>& paths) { |
65 | 14.0k | PathInDataBuilder path_builder; |
66 | 14.0k | path_builder.append(root, false); |
67 | 14.0k | for (const std::string& p : paths) { |
68 | 11.9k | path_builder.append(p, false); |
69 | 11.9k | } |
70 | 14.0k | build_path(path_builder.get_parts()); |
71 | 14.0k | build_parts(path_builder.get_parts()); |
72 | 14.0k | } |
73 | | |
74 | 1 | PathInData::PathInData(const std::vector<std::string>& paths) { |
75 | 1 | PathInDataBuilder path_builder; |
76 | 4 | for (size_t i = 0; i < paths.size(); ++i) { |
77 | 3 | path_builder.append(paths[i], false); |
78 | 3 | } |
79 | 1 | build_path(path_builder.get_parts()); |
80 | 1 | build_parts(path_builder.get_parts()); |
81 | 1 | } |
82 | | |
83 | 2.41M | PathInData& PathInData::operator=(const PathInData& other) { |
84 | 2.41M | if (this != &other) { |
85 | 2.41M | path = other.path; |
86 | 2.41M | is_typed = other.is_typed; |
87 | 2.41M | build_parts(other.parts); |
88 | 2.41M | } |
89 | 2.41M | return *this; |
90 | 2.41M | } |
91 | | |
92 | 64.2M | UInt128 PathInData::get_parts_hash(const Parts& parts_, bool is_typed_) { |
93 | 64.2M | SipHash hash; |
94 | 64.2M | hash.update(parts_.size()); |
95 | 64.2M | for (const auto& part : parts_) { |
96 | 259k | hash.update(part.key.data(), part.key.length()); |
97 | 259k | hash.update(part.is_nested); |
98 | 259k | hash.update(part.anonymous_array_level); |
99 | 259k | } |
100 | 64.2M | hash.update(is_typed_); |
101 | 64.2M | UInt128 res; |
102 | 64.2M | hash.get128(res); |
103 | 64.2M | return res; |
104 | 64.2M | } |
105 | | |
106 | 20.1M | void PathInData::build_path(const Parts& other_parts) { |
107 | 20.1M | if (other_parts.empty()) { |
108 | 1.76k | return; |
109 | 1.76k | } |
110 | 20.1M | path.clear(); |
111 | 20.1M | auto it = other_parts.begin(); |
112 | 20.1M | path += it->key; |
113 | 20.1M | ++it; |
114 | 29.6M | for (; it != other_parts.end(); ++it) { |
115 | 9.57M | path += "."; |
116 | 9.57M | path += it->key; |
117 | 9.57M | } |
118 | 20.1M | } |
119 | 153M | void PathInData::build_parts(const Parts& other_parts) { |
120 | 153M | if (other_parts.empty()) { |
121 | 121M | return; |
122 | 121M | } |
123 | 31.6M | parts.clear(); |
124 | 31.6M | parts.reserve(other_parts.size()); |
125 | 31.6M | const char* begin = path.data(); |
126 | 60.3M | for (const auto& part : other_parts) { |
127 | 60.3M | has_nested |= part.is_nested; |
128 | 60.3M | parts.emplace_back(std::string_view {begin, part.key.length()}, part.is_nested, |
129 | 60.3M | part.anonymous_array_level); |
130 | 60.3M | begin += part.key.length() + 1; |
131 | 60.3M | } |
132 | 31.6M | } |
133 | | |
134 | 496k | void PathInData::from_protobuf(const segment_v2::ColumnPathInfo& pb) { |
135 | 496k | parts.clear(); |
136 | 496k | path = pb.path(); |
137 | 496k | has_nested = false; |
138 | 496k | is_typed = pb.is_typed(); |
139 | 496k | parts.reserve(pb.path_part_infos().size()); |
140 | 496k | const char* begin = path.data(); |
141 | 1.14M | for (const segment_v2::ColumnPathPartInfo& part_info : pb.path_part_infos()) { |
142 | 1.14M | Part part; |
143 | 1.14M | part.is_nested = part_info.is_nested(); |
144 | 1.14M | has_nested |= part.is_nested; |
145 | 1.14M | part.anonymous_array_level = |
146 | 1.14M | cast_set<uint8_t, uint32_t, false>(part_info.anonymous_array_level()); |
147 | | // use string_view to ref data in path |
148 | 1.14M | part.key = std::string_view {begin, part_info.key().length()}; |
149 | 1.14M | parts.push_back(part); |
150 | 1.14M | begin += part.key.length() + 1; |
151 | 1.14M | } |
152 | 496k | } |
153 | | |
154 | 2 | std::string PathInData::to_jsonpath() const { |
155 | 2 | std::string jsonpath = "$."; |
156 | 2 | if (parts.empty()) { |
157 | 0 | return jsonpath; |
158 | 0 | } |
159 | 2 | auto it = parts.begin(); |
160 | 2 | jsonpath += it->key; |
161 | 2 | ++it; |
162 | 3 | for (; it != parts.end(); ++it) { |
163 | 1 | jsonpath += "."; |
164 | 1 | jsonpath += it->key; |
165 | 1 | } |
166 | 2 | return jsonpath; |
167 | 2 | } |
168 | | |
169 | 411k | void PathInData::to_protobuf(segment_v2::ColumnPathInfo* pb, int32_t parent_col_unique_id) const { |
170 | 411k | pb->set_path(path); |
171 | 411k | pb->set_has_nested(has_nested); |
172 | 411k | pb->set_parrent_column_unique_id(parent_col_unique_id); |
173 | 411k | pb->set_is_typed(is_typed); |
174 | | |
175 | | // set parts info |
176 | 745k | for (const Part& part : parts) { |
177 | 745k | segment_v2::ColumnPathPartInfo& part_info = *pb->add_path_part_infos(); |
178 | 745k | part_info.set_key(std::string(part.key.data(), part.key.size())); |
179 | 745k | part_info.set_is_nested(part.is_nested); |
180 | 745k | part_info.set_anonymous_array_level(part.anonymous_array_level); |
181 | 745k | } |
182 | 411k | } |
183 | | |
184 | 62.8M | size_t PathInData::Hash::operator()(const PathInData& value) const { |
185 | 62.8M | auto hash = get_parts_hash(value.parts, value.is_typed); |
186 | 62.8M | return hash.low() ^ hash.high(); |
187 | 62.8M | } |
188 | | |
189 | 5.99k | bool PathInData::need_record_stats() const { |
190 | 5.99k | return !empty() && !is_typed && !has_nested && |
191 | 5.99k | path.find(DOC_VALUE_COLUMN_PATH) == std::string::npos; |
192 | 5.99k | } |
193 | | |
194 | 648k | PathInData PathInData::copy_pop_front() const { |
195 | 648k | return copy_pop_nfront(1); |
196 | 648k | } |
197 | | |
198 | 62.9k | PathInData PathInData::get_nested_prefix_path() const { |
199 | 62.9k | CHECK(has_nested_part()); |
200 | 62.9k | PathInData new_path; |
201 | 62.9k | Parts new_parts; |
202 | 115k | for (const Part& part : parts) { |
203 | 115k | new_parts.push_back(part); |
204 | 115k | if (part.is_nested) { |
205 | 62.9k | break; |
206 | 62.9k | } |
207 | 115k | } |
208 | 62.9k | new_path.build_path(new_parts); |
209 | 62.9k | new_path.build_parts(new_parts); |
210 | 62.9k | new_path.is_typed = is_typed; |
211 | 62.9k | return new_path; |
212 | 62.9k | } |
213 | | |
214 | 2 | PathInData PathInData::copy_pop_back() const { |
215 | 2 | if (parts.size() <= 1) { |
216 | 1 | return {}; |
217 | 1 | } |
218 | 1 | PathInData new_path; |
219 | 1 | Parts new_parts = parts; |
220 | 1 | new_parts.pop_back(); |
221 | 1 | new_path.build_path(new_parts); |
222 | 1 | new_path.build_parts(new_parts); |
223 | 1 | new_path.is_typed = is_typed; |
224 | 1 | return new_path; |
225 | 2 | } |
226 | | |
227 | 867k | PathInData PathInData::copy_pop_nfront(size_t n) const { |
228 | 867k | if (n >= parts.size()) { |
229 | 62.2k | return {}; |
230 | 62.2k | } |
231 | 805k | PathInData new_path; |
232 | 805k | Parts new_parts; |
233 | 806k | if (!parts.empty()) { |
234 | 806k | std::copy(parts.begin() + n, parts.end(), std::back_inserter(new_parts)); |
235 | 806k | } |
236 | 805k | new_path.build_path(new_parts); |
237 | 805k | new_path.build_parts(new_parts); |
238 | 805k | new_path.is_typed = is_typed; |
239 | 805k | return new_path; |
240 | 867k | } |
241 | | |
242 | | bool PathInData::try_strip_prefix(const std::string& name, const std::string& prefix_dot, |
243 | 0 | std::string* out) { |
244 | 0 | if (!name.starts_with(prefix_dot)) { |
245 | 0 | return false; |
246 | 0 | } |
247 | 0 | *out = name.substr(prefix_dot.size()); |
248 | 0 | return !out->empty(); |
249 | 0 | } |
250 | | |
251 | 0 | PathInData PathInData::append(const PathInData& base, std::string_view suffix) { |
252 | 0 | if (suffix.empty()) { |
253 | 0 | return base; |
254 | 0 | } |
255 | 0 | if (base.empty()) { |
256 | 0 | return PathInData(suffix); |
257 | 0 | } |
258 | 0 | std::string new_path; |
259 | 0 | new_path.reserve(base.get_path().size() + 1 + suffix.size()); |
260 | 0 | new_path.append(base.get_path()); |
261 | 0 | new_path.push_back('.'); |
262 | 0 | new_path.append(suffix.data(), suffix.size()); |
263 | 0 | return PathInData(new_path); |
264 | 0 | } |
265 | | |
266 | 19.5M | PathInDataBuilder& PathInDataBuilder::append(std::string_view key, bool is_array) { |
267 | 19.5M | if (parts.empty()) { |
268 | 15.4M | current_anonymous_array_level += is_array; |
269 | 15.4M | } |
270 | 19.5M | if (!parts.empty()) { |
271 | 4.15M | parts.back().is_nested = is_array; |
272 | 4.15M | } |
273 | 19.5M | parts.emplace_back(key, false, current_anonymous_array_level); |
274 | 19.5M | current_anonymous_array_level = 0; |
275 | 19.5M | return *this; |
276 | 19.5M | } |
277 | 642k | PathInDataBuilder& PathInDataBuilder::append(const PathInData::Parts& path, bool is_array) { |
278 | 642k | if (parts.empty()) { |
279 | 172k | current_anonymous_array_level += is_array; |
280 | 172k | } |
281 | 642k | if (!path.empty()) { |
282 | 35.3k | if (!parts.empty()) { |
283 | 25.0k | parts.back().is_nested = is_array; |
284 | 25.0k | } |
285 | 35.3k | auto it = parts.insert(parts.end(), path.begin(), path.end()); |
286 | 89.1k | for (; it != parts.end(); ++it) { |
287 | 53.8k | it->anonymous_array_level += current_anonymous_array_level; |
288 | 53.8k | } |
289 | 35.3k | current_anonymous_array_level = 0; |
290 | 35.3k | } |
291 | 642k | return *this; |
292 | 642k | } |
293 | | |
294 | 19.6M | void PathInDataBuilder::pop_back() { |
295 | 19.6M | if (!parts.empty()) { |
296 | 19.6M | parts.pop_back(); |
297 | 19.6M | } |
298 | 19.6M | } |
299 | | |
300 | 629k | void PathInDataBuilder::pop_back(size_t n) { |
301 | 629k | assert(n <= parts.size()); |
302 | 629k | parts.resize(parts.size() - n); |
303 | 629k | } |
304 | | |
305 | | } // namespace doris |