be/src/util/json/path_in_data.h
Line | Count | Source |
1 | | // Licensed to the Apache Software Foundation (ASF) under one |
2 | | // or more contributor license agreements. See the NOTICE file |
3 | | // distributed with this work for additional information |
4 | | // regarding copyright ownership. The ASF licenses this file |
5 | | // to you under the Apache License, Version 2.0 (the |
6 | | // "License"); you may not use this file except in compliance |
7 | | // with the License. You may obtain a copy of the License at |
8 | | // |
9 | | // http://www.apache.org/licenses/LICENSE-2.0 |
10 | | // |
11 | | // Unless required by applicable law or agreed to in writing, |
12 | | // software distributed under the License is distributed on an |
13 | | // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY |
14 | | // KIND, either express or implied. See the License for the |
15 | | // specific language governing permissions and limitations |
16 | | // under the License. |
17 | | // This file is copied from |
18 | | // https://github.com/ClickHouse/ClickHouse/blob/master/src/DataTypes/Serializations/PathInData.h |
19 | | // and modified by Doris |
20 | | |
21 | | #pragma once |
22 | | |
23 | | #include <gen_cpp/segment_v2.pb.h> |
24 | | #include <stddef.h> |
25 | | |
26 | | #include <algorithm> |
27 | | #include <memory> |
28 | | #include <string> |
29 | | #include <string_view> |
30 | | #include <vector> |
31 | | |
32 | | #include "core/uint128.h" |
33 | | |
34 | | namespace doris { |
35 | | |
36 | | /// Class that represents path in document, e.g. JSON. |
37 | | class PathInData; |
38 | | using PathInDataPtr = std::shared_ptr<PathInData>; |
39 | | |
40 | | class PathInData { |
41 | | public: |
42 | | struct Part { |
43 | 1.68M | Part() = default; |
44 | | Part(std::string_view key_, bool is_nested_, UInt8 anonymous_array_level_) |
45 | 88.0M | : key(key_), is_nested(is_nested_), anonymous_array_level(anonymous_array_level_) {} |
46 | 18.3M | bool operator==(const Part& other) const { |
47 | 18.3M | return this->key == other.key && this->is_nested == other.is_nested && |
48 | 18.3M | this->anonymous_array_level == other.anonymous_array_level; |
49 | 18.3M | } |
50 | | /// Name of part of path. |
51 | | std::string_view key; |
52 | | /// If this part is Nested, i.e. element |
53 | | /// related to this key is the array of objects. |
54 | | bool is_nested = false; |
55 | | /// Number of array levels between current key and previous key. |
56 | | /// E.g. in JSON {"k1": [[[{"k2": 1, "k3": 2}]]]} |
57 | | /// "k1" is nested and has anonymous_array_level = 0. |
58 | | /// "k2" and "k3" are not nested and have anonymous_array_level = 2. |
59 | | UInt8 anonymous_array_level = 0; |
60 | | |
61 | | /// Get the total array depth for this part. |
62 | | /// Used for NestedGroup offset indexing. |
63 | | /// If is_nested (array<object>), the depth is anonymous_array_level + 1 |
64 | | /// Otherwise it's just the anonymous_array_level. |
65 | 0 | UInt8 get_array_depth() const { |
66 | 0 | return is_nested ? (anonymous_array_level + 1) : anonymous_array_level; |
67 | 0 | } |
68 | | }; |
69 | | using Parts = std::vector<Part>; |
70 | 46.2M | PathInData() = default; |
71 | | explicit PathInData(std::string_view path_, bool is_typed_ = false); |
72 | | explicit PathInData(const Parts& parts_); |
73 | | explicit PathInData(std::string_view path_, const Parts& parts_, bool is_typed_ = false); |
74 | | explicit PathInData(const std::vector<std::string>& paths); |
75 | | explicit PathInData(const std::string& root, const std::vector<std::string>& paths); |
76 | | PathInData(const PathInData& other); |
77 | | PathInData& operator=(const PathInData& other); |
78 | | static UInt128 get_parts_hash(const Parts& parts_, bool is_typed_ = false); |
79 | 18.9M | bool empty() const { return parts.empty(); } |
80 | 115M | const String& get_path() const { return path; } |
81 | | // if path is v.a.b, then relative path will return a.b |
82 | | // make sure the parts is not empty |
83 | 0 | std::string_view get_relative_path() const { |
84 | 0 | if (parts.size() <= 1) { |
85 | 0 | // return empty string |
86 | 0 | return {}; |
87 | 0 | } |
88 | 0 | return {path.begin() + parts[0].key.size() + 1, path.end()}; |
89 | 0 | } |
90 | 126M | const Parts& get_parts() const { return parts; } |
91 | 0 | bool is_nested(size_t i) const { return parts[i].is_nested; } |
92 | 44.0M | bool has_nested_part() const { return has_nested; } |
93 | 25.4M | bool operator==(const PathInData& other) const { |
94 | 25.4M | return parts == other.parts && is_typed == other.is_typed; |
95 | 25.4M | } |
96 | 106 | bool operator!=(const PathInData& other) const { return !(*this == other); } |
97 | | PathInData get_nested_prefix_path() const; |
98 | | struct Hash { |
99 | | size_t operator()(const PathInData& value) const; |
100 | | }; |
101 | | std::string to_jsonpath() const; |
102 | | PathInData copy_pop_front() const; |
103 | | PathInData copy_pop_nfront(size_t n) const; |
104 | | PathInData copy_pop_back() const; |
105 | | static bool try_strip_prefix(const std::string& name, const std::string& prefix_dot, |
106 | | std::string* out); |
107 | | static PathInData append(const PathInData& base, std::string_view suffix); |
108 | | void to_protobuf(segment_v2::ColumnPathInfo* pb, int32_t parent_col_unique_id) const; |
109 | | void from_protobuf(const segment_v2::ColumnPathInfo& pb); |
110 | | |
111 | 785k | bool get_is_typed() const { return is_typed; } |
112 | | |
113 | | bool need_record_stats() const; |
114 | | |
115 | 25.9M | bool operator<(const PathInData& rhs) const { |
116 | 25.9M | return std::lexicographical_compare( |
117 | 25.9M | parts.begin(), parts.end(), rhs.parts.begin(), rhs.parts.end(), |
118 | 90.6M | [](const auto& a, const auto& b) { return a.key < b.key; }); |
119 | 25.9M | } |
120 | | |
121 | | private: |
122 | | /// Creates full path from parts. |
123 | | void build_path(const Parts& other_parts); |
124 | | /// Creates new parts full from full path with correct string pointers. |
125 | | void build_parts(const Parts& other_parts); |
126 | | /// The full path. Parts are separated by dots. |
127 | | String path; |
128 | | /// Parts of the path. All string_view-s in parts must point to the @path. |
129 | | Parts parts; |
130 | | /// True if at least one part is nested. |
131 | | /// Cached to avoid linear complexity at 'has_nested'. |
132 | | bool has_nested = false; |
133 | | |
134 | | /// True if the path is typed, e.g. a.b: int |
135 | | bool is_typed = false; |
136 | | }; |
137 | | |
138 | | class PathInDataBuilder { |
139 | | public: |
140 | 20.2M | const PathInData::Parts& get_parts() const { return parts; } |
141 | | PathInDataBuilder& append(std::string_view key, bool is_array); |
142 | | PathInDataBuilder& append(const PathInData::Parts& path, bool is_array); |
143 | | PathInDataBuilder& append(const std::vector<std::string>& parts); |
144 | | void pop_back(); |
145 | | void pop_back(size_t n); |
146 | 12.6k | PathInData build() { return PathInData(parts); } |
147 | | |
148 | | private: |
149 | | PathInData::Parts parts; |
150 | | /// Number of array levels without key to which |
151 | | /// next non-empty key will be nested. |
152 | | /// Example: for JSON { "k1": [[{"k2": 1, "k3": 2}] } |
153 | | // `k2` and `k3` has anonymous_array_level = 1 in that case. |
154 | | size_t current_anonymous_array_level = 0; |
155 | | }; |
156 | | using PathsInData = std::vector<PathInData>; |
157 | | |
158 | | struct PathInDataRef { |
159 | | const PathInData* ref; |
160 | | struct Hash { |
161 | 47.7k | size_t operator()(const PathInDataRef& value) const { |
162 | 47.7k | return PathInData::Hash {}(*value.ref); |
163 | 47.7k | } |
164 | | }; |
165 | 47.7k | PathInDataRef(const PathInData* ptr) : ref(ptr) {} |
166 | 0 | PathInDataRef() : ref(nullptr) {} |
167 | 9.41k | bool operator==(const PathInDataRef& other) const { |
168 | 9.41k | return (this->ref != nullptr && other.ref != nullptr && *this->ref == *other.ref) || |
169 | 9.41k | (this->ref == nullptr && other.ref == nullptr); |
170 | 9.41k | } |
171 | | }; |
172 | | |
173 | | } // namespace doris |
174 | | |
175 | | template <> |
176 | | struct std::hash<doris::PathInData> { |
177 | 55.2M | size_t operator()(const doris::PathInData& value) const { |
178 | 55.2M | return doris::PathInData::Hash {}(value); |
179 | 55.2M | } |
180 | | }; |