Coverage Report

Created: 2026-03-13 09:58

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
be/src/util/json/path_in_data.cpp
Line
Count
Source
1
// Licensed to the Apache Software Foundation (ASF) under one
2
// or more contributor license agreements.  See the NOTICE file
3
// distributed with this work for additional information
4
// regarding copyright ownership.  The ASF licenses this file
5
// to you under the Apache License, Version 2.0 (the
6
// "License"); you may not use this file except in compliance
7
// with the License.  You may obtain a copy of the License at
8
//
9
//   http://www.apache.org/licenses/LICENSE-2.0
10
//
11
// Unless required by applicable law or agreed to in writing,
12
// software distributed under the License is distributed on an
13
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14
// KIND, either express or implied.  See the License for the
15
// specific language governing permissions and limitations
16
// under the License.
17
// This file is copied from
18
// https://github.com/ClickHouse/ClickHouse/blob/master/src/DataTypes/Serializations/PathInData.cpp
19
// and modified by Doris
20
21
#include "util/json/path_in_data.h"
22
23
#include <assert.h>
24
25
#include <string_view>
26
27
#include "common/cast_set.h"
28
#include "exec/common/sip_hash.h"
29
#include "exec/common/variant_util.h"
30
31
namespace doris {
32
33
#include "common/compile_check_begin.h"
34
35
2.65M
// Builds a path from a dot-separated string.
//
// `path_` is copied into the member `path`, and every '.'-delimited segment becomes one
// entry of `parts`. Each entry's key is a std::string_view into the member `path` (not into
// the caller's `path_`). All entries are created with is_nested = false and
// anonymous_array_level = 0.
//
// Empty segments (leading, trailing or consecutive dots, or an empty input) produce entries
// with an empty key. The trailing segment is always appended, so `parts` is never empty
// after this constructor.
PathInData::PathInData(std::string_view path_, bool is_typed_) : path(path_), is_typed(is_typed_) {
    const char* const end = path.data() + path.size();
    const char* segment_begin = path.data();
    for (const char* it = segment_begin; it != end; ++it) {
        if (*it != '.') {
            continue;
        }
        const size_t segment_size = static_cast<size_t>(it - segment_begin);
        parts.emplace_back(std::string_view {segment_begin, segment_size}, false, 0);
        // Skip the dot itself; the next segment starts right after it.
        segment_begin = it + 1;
    }
    // Whatever follows the last dot (possibly nothing) is the final segment.
    // The level is passed as the integer literal 0, matching the call inside the loop.
    const size_t last_size = static_cast<size_t>(end - segment_begin);
    parts.emplace_back(std::string_view {segment_begin, last_size}, false, 0);
}
48
49
3
// Builds a path from an already-joined string plus its pre-split parts.
//
// `path_` is stored as-is and every element of `parts_` is copied unchanged, so the copied
// keys keep viewing whatever storage the caller's parts referenced (they are not re-pointed
// into the member `path`). `has_nested` is set once any copied part is nested.
// NOTE(review): presumably the caller guarantees that storage outlives this object - confirm.
PathInData::PathInData(std::string_view path_, const Parts& parts_, bool is_typed_) {
    is_typed = is_typed_;
    path = path_;
    for (const auto& source_part : parts_) {
        if (source_part.is_nested) {
            has_nested = true;
        }
        parts.push_back(source_part);
    }
}
57
58
19.0M
// Builds a path from pre-split parts.
//
// The order of the two calls matters: build_path() first joins the keys with '.' into
// `path`, then build_parts() recreates `parts` with keys that view that freshly built
// `path`.
PathInData::PathInData(const Parts& parts_) {
    this->build_path(parts_);
    this->build_parts(parts_);
}
62
78.6M
// Copy constructor.
//
// `path` and `is_typed` are copied directly. `parts` is rebuilt through build_parts() so
// that the new keys view this object's own `path` instead of `other`'s.
PathInData::PathInData(const PathInData& other) : path(other.path), is_typed(other.is_typed) {
    const auto& source_parts = other.get_parts();
    build_parts(source_parts);
}
65
66
15.2k
// Builds a path whose first key is `root`, followed by every element of `paths` in order.
//
// All keys are fed to a PathInDataBuilder with is_array = false. The resulting parts are
// then joined into `path` and re-created in `parts`.
PathInData::PathInData(const std::string& root, const std::vector<std::string>& paths) {
    PathInDataBuilder builder;
    builder.append(root, false);
    for (const auto& segment : paths) {
        builder.append(segment, false);
    }
    const auto& built_parts = builder.get_parts();
    build_path(built_parts);
    build_parts(built_parts);
}
75
76
1
// Builds a path from a list of keys, one part per element of `paths`, in order.
//
// Every key is appended to a PathInDataBuilder with is_array = false. The resulting parts
// are then joined into `path` and re-created in `parts`.
PathInData::PathInData(const std::vector<std::string>& paths) {
    PathInDataBuilder builder;
    for (const auto& segment : paths) {
        builder.append(segment, false);
    }
    const auto& built_parts = builder.get_parts();
    build_path(built_parts);
    build_parts(built_parts);
}
84
85
2.82M
// Copy-assigns `other` into this object.
//
// `path` is copied first and `parts` is then rebuilt so that every key views this object's
// own `path` storage rather than `other`'s.
//
// build_parts() returns early without touching `parts` when its input is empty, and it
// only ever ORs into `has_nested`. Both are therefore reset here before rebuilding;
// otherwise assigning from a path with no parts would leave the previous parts behind
// (viewing a `path` that has just been overwritten), and `has_nested` could never go back
// to false.
PathInData& PathInData::operator=(const PathInData& other) {
    if (this == &other) {
        return *this;
    }
    path = other.path;
    is_typed = other.is_typed;
    parts.clear();
    has_nested = false;
    build_parts(other.parts);
    return *this;
}
93
94
45.1M
// Computes a 128-bit SipHash over a list of parts plus the typed flag.
//
// The hash input is, in order: the number of parts; for each part its key bytes, its
// is_nested flag and its anonymous_array_level; and finally `is_typed_`.
UInt128 PathInData::get_parts_hash(const Parts& parts_, bool is_typed_) {
    SipHash hasher;
    hasher.update(parts_.size());
    for (auto it = parts_.begin(); it != parts_.end(); ++it) {
        hasher.update(it->key.data(), it->key.length());
        hasher.update(it->is_nested);
        hasher.update(it->anonymous_array_level);
    }
    hasher.update(is_typed_);

    UInt128 digest;
    hasher.get128(digest);
    return digest;
}
107
108
24.9M
void PathInData::build_path(const Parts& other_parts) {
109
24.9M
    if (other_parts.empty()) {
110
1.77k
        return;
111
1.77k
    }
112
24.9M
    path.clear();
113
24.9M
    auto it = other_parts.begin();
114
24.9M
    path += it->key;
115
24.9M
    ++it;
116
50.0M
    for (; it != other_parts.end(); ++it) {
117
25.1M
        path += ".";
118
25.1M
        path += it->key;
119
25.1M
    }
120
24.9M
}
121
106M
void PathInData::build_parts(const Parts& other_parts) {
122
106M
    if (other_parts.empty()) {
123
64.3M
        return;
124
64.3M
    }
125
41.9M
    parts.clear();
126
41.9M
    parts.reserve(other_parts.size());
127
41.9M
    const char* begin = path.data();
128
98.0M
    for (const auto& part : other_parts) {
129
98.0M
        has_nested |= part.is_nested;
130
98.0M
        parts.emplace_back(std::string_view {begin, part.key.length()}, part.is_nested,
131
98.0M
                           part.anonymous_array_level);
132
98.0M
        begin += part.key.length() + 1;
133
98.0M
    }
134
41.9M
}
135
136
3.71M
// Replaces the contents of this object with the data stored in `pb`.
//
// `path` and `is_typed` are taken from the message. `has_nested` is recomputed from the
// per-part flags rather than read from the message. For each part info only the length of
// its key is used: the key itself is a view into the member `path`, advancing by
// "key length + 1" per part, i.e. the code expects pb.path() to hold the keys in order,
// separated by exactly one character.
void PathInData::from_protobuf(const segment_v2::ColumnPathInfo& pb) {
    parts.clear();
    path = pb.path();
    is_typed = pb.is_typed();
    has_nested = false;
    parts.reserve(pb.path_part_infos().size());

    const char* const base = path.data();
    size_t offset = 0;
    for (const segment_v2::ColumnPathPartInfo& part_info : pb.path_part_infos()) {
        Part part;
        // use string_view to ref data in path
        part.key = std::string_view {base + offset, part_info.key().length()};
        part.is_nested = part_info.is_nested();
        part.anonymous_array_level =
                cast_set<uint8_t, uint32_t, false>(part_info.anonymous_array_level());
        has_nested |= part.is_nested;
        parts.push_back(part);
        // +1 steps over the separator that follows this key in `path`.
        offset += part.key.length() + 1;
    }
}
155
156
2
std::string PathInData::to_jsonpath() const {
157
2
    std::string jsonpath = "$.";
158
2
    if (parts.empty()) {
159
0
        return jsonpath;
160
0
    }
161
2
    auto it = parts.begin();
162
2
    jsonpath += it->key;
163
2
    ++it;
164
3
    for (; it != parts.end(); ++it) {
165
1
        jsonpath += ".";
166
1
        jsonpath += it->key;
167
1
    }
168
2
    return jsonpath;
169
2
}
170
171
361k
// Writes this path into `pb`: the joined path string, the has_nested and is_typed flags,
// the given parent column unique id, and one part-info entry per element of `parts`
// (key copied into an owning std::string, is_nested, anonymous_array_level).
// Part infos are added to `pb`; the function does not clear entries already present.
void PathInData::to_protobuf(segment_v2::ColumnPathInfo* pb, int32_t parent_col_unique_id) const {
    pb->set_path(path);
    pb->set_has_nested(has_nested);
    pb->set_parrent_column_unique_id(parent_col_unique_id);
    pb->set_is_typed(is_typed);

    // set parts info
    for (auto it = parts.begin(); it != parts.end(); ++it) {
        segment_v2::ColumnPathPartInfo* part_info = pb->add_path_part_infos();
        part_info->set_key(std::string(it->key.data(), it->key.size()));
        part_info->set_is_nested(it->is_nested);
        part_info->set_anonymous_array_level(it->anonymous_array_level);
    }
}
185
186
43.7M
// Hash functor: folds the 128-bit parts hash of `value` (parts + is_typed) into a size_t
// by XOR-ing its low and high halves.
size_t PathInData::Hash::operator()(const PathInData& value) const {
    UInt128 digest = get_parts_hash(value.parts, value.is_typed);
    return digest.low() ^ digest.high();
}
190
191
5.77k
bool PathInData::need_record_stats() const {
192
5.77k
    return !empty() && !is_typed && !has_nested &&
193
5.77k
           path.find(DOC_VALUE_COLUMN_PATH) == std::string::npos;
194
5.77k
}
195
196
5.54M
// Returns a copy of this path with its first part removed; delegates to
// copy_pop_nfront() with a count of one.
PathInData PathInData::copy_pop_front() const {
    const size_t drop_count = 1;
    return copy_pop_nfront(drop_count);
}
199
200
63.4k
// Returns the prefix of this path up to and including the first part whose is_nested flag
// is set. CHECK-fails when has_nested_part() is false. The result keeps this path's
// is_typed flag.
PathInData PathInData::get_nested_prefix_path() const {
    CHECK(has_nested_part());

    Parts prefix_parts;
    for (auto it = parts.begin(); it != parts.end(); ++it) {
        prefix_parts.push_back(*it);
        if (it->is_nested) {
            // The first nested part is the last one kept.
            break;
        }
    }

    PathInData prefix;
    prefix.build_path(prefix_parts);
    prefix.build_parts(prefix_parts);
    prefix.is_typed = is_typed;
    return prefix;
}
215
216
2
// Returns a copy of this path without its last part, keeping the is_typed flag.
// A path with zero or one part yields a default-constructed PathInData.
PathInData PathInData::copy_pop_back() const {
    if (parts.size() <= 1) {
        return {};
    }
    // Everything except the last element.
    Parts remaining(parts.begin(), parts.end() - 1);

    PathInData shortened;
    shortened.build_path(remaining);
    shortened.build_parts(remaining);
    shortened.is_typed = is_typed;
    return shortened;
}
228
229
5.75M
// Returns a copy of this path with its first `n` parts removed, keeping the is_typed flag.
// When `n` is greater than or equal to the number of parts, a default-constructed
// PathInData is returned.
PathInData PathInData::copy_pop_nfront(size_t n) const {
    if (n >= parts.size()) {
        return {};
    }
    // The guard above guarantees n < parts.size(), so the range below is valid and
    // non-empty; no additional emptiness check is needed. Constructing from the range
    // sizes the container once instead of growing it element by element.
    Parts new_parts(parts.begin() + n, parts.end());

    PathInData new_path;
    new_path.build_path(new_parts);
    new_path.build_parts(new_parts);
    new_path.is_typed = is_typed;
    return new_path;
}
243
244
bool PathInData::try_strip_prefix(const std::string& name, const std::string& prefix_dot,
245
0
                                  std::string* out) {
246
0
    if (!name.starts_with(prefix_dot)) {
247
0
        return false;
248
0
    }
249
0
    *out = name.substr(prefix_dot.size());
250
0
    return !out->empty();
251
0
}
252
253
0
// Returns a path made of `base` followed by `suffix`.
//
// - Empty `suffix`: a copy of `base` is returned unchanged.
// - Empty `base`: the result is constructed from `suffix` alone.
// - Otherwise: base.get_path(), a '.', and `suffix` are concatenated and the result is
//   constructed from that string (so it is built from the joined text, not from `base`'s
//   existing parts).
PathInData PathInData::append(const PathInData& base, std::string_view suffix) {
    if (suffix.empty()) {
        return base;
    }
    if (base.empty()) {
        return PathInData(suffix);
    }
    std::string joined;
    // One allocation for "<base>.<suffix>".
    joined.reserve(base.get_path().size() + 1 + suffix.size());
    joined.append(base.get_path());
    joined += '.';
    joined.append(suffix.data(), suffix.size());
    return PathInData(joined);
}
267
268
19.4M
// Appends one key to the path being built and returns *this for chaining.
//
// `is_array` is applied to what precedes the new key: if there is no part yet it is added
// to the pending anonymous array level; otherwise it becomes the is_nested flag of the
// current last part. The new part itself is created with is_nested = false and the pending
// anonymous array level, which is then reset to 0.
PathInDataBuilder& PathInDataBuilder::append(std::string_view key, bool is_array) {
    if (parts.empty()) {
        current_anonymous_array_level += is_array;
    } else {
        parts.back().is_nested = is_array;
    }
    parts.emplace_back(key, false, current_anonymous_array_level);
    current_anonymous_array_level = 0;
    return *this;
}
279
643k
// Appends a whole sequence of parts to the path being built and returns *this.
//
// If no part exists yet, `is_array` is first added to the pending anonymous array level
// (this happens even when `path` is empty). For a non-empty `path`, the current last part
// (if any) gets is_nested = is_array, the parts of `path` are copied to the end, each
// copied part's anonymous_array_level is increased by the pending level, and the pending
// level is reset to 0. An empty `path` appends nothing and does not reset the pending
// level.
PathInDataBuilder& PathInDataBuilder::append(const PathInData::Parts& path, bool is_array) {
    if (parts.empty()) {
        current_anonymous_array_level += is_array;
    }
    if (path.empty()) {
        return *this;
    }
    if (!parts.empty()) {
        parts.back().is_nested = is_array;
    }
    // Remember where the copied parts start so only they receive the pending level.
    const size_t first_new_index = parts.size();
    parts.insert(parts.end(), path.begin(), path.end());
    for (size_t i = first_new_index; i < parts.size(); ++i) {
        parts[i].anonymous_array_level += current_anonymous_array_level;
    }
    current_anonymous_array_level = 0;
    return *this;
}
295
296
19.5M
void PathInDataBuilder::pop_back() {
297
19.5M
    if (!parts.empty()) {
298
19.5M
        parts.pop_back();
299
19.5M
    }
300
19.5M
}
301
302
630k
void PathInDataBuilder::pop_back(size_t n) {
303
630k
    assert(n <= parts.size());
304
630k
    parts.resize(parts.size() - n);
305
630k
}
306
307
#include "common/compile_check_end.h"
308
309
} // namespace doris