Coverage Report

Created: 2026-04-15 11:06

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
be/src/util/json/path_in_data.cpp
Line
Count
Source
1
// Licensed to the Apache Software Foundation (ASF) under one
2
// or more contributor license agreements.  See the NOTICE file
3
// distributed with this work for additional information
4
// regarding copyright ownership.  The ASF licenses this file
5
// to you under the Apache License, Version 2.0 (the
6
// "License"); you may not use this file except in compliance
7
// with the License.  You may obtain a copy of the License at
8
//
9
//   http://www.apache.org/licenses/LICENSE-2.0
10
//
11
// Unless required by applicable law or agreed to in writing,
12
// software distributed under the License is distributed on an
13
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14
// KIND, either express or implied.  See the License for the
15
// specific language governing permissions and limitations
16
// under the License.
17
// This file is copied from
18
// https://github.com/ClickHouse/ClickHouse/blob/master/src/DataTypes/Serializations/PathInData.cpp
19
// and modified by Doris
20
21
#include "util/json/path_in_data.h"
22
23
#include <assert.h>
24
25
#include <string_view>
26
27
#include "common/cast_set.h"
28
#include "exec/common/sip_hash.h"
29
#include "exec/common/variant_util.h"
30
31
namespace doris {
32
33
3.09M
// Builds a path from a dot-separated string.
// The text is copied into the `path` member and split on '.'; every part's key is a view
// into that member. Each part is created with is_nested = false and array level 0.
// The text after the last '.' always becomes a part, so an empty input yields one empty part.
PathInData::PathInData(std::string_view path_, bool is_typed_) : path(path_), is_typed(is_typed_) {
    const char* begin = path.data();
    const char* end = path.data() + path.size();
    for (const char* it = begin; it != end; ++it) {
        if (*it == '.') {
            const size_t len = static_cast<size_t>(it - begin);
            parts.emplace_back(std::string_view {begin, len}, false, 0);
            // Next part starts right after the separator.
            begin = it + 1;
        }
    }
    // Trailing part. Use the integer literal 0 (previously the double literal `0.`) so the
    // array level is forwarded exactly like in the loop and no float conversion is involved.
    const size_t len = static_cast<size_t>(end - begin);
    parts.emplace_back(std::string_view {begin, len}, false, 0);
}
46
47
3
// Builds a path from an already split representation.
// `path_` is stored as given and every element of `parts_` is copied unchanged, so the
// copied keys keep viewing whatever storage the caller's keys view; they are not
// re-pointed into this object's `path`. `has_nested` becomes true if any part is nested.
PathInData::PathInData(std::string_view path_, const Parts& parts_, bool is_typed_)
        : path(path_), is_typed(is_typed_) {
    for (const auto& src : parts_) {
        parts.emplace_back(src);
        has_nested |= src.is_nested;
    }
}
55
56
19.1M
// Builds a path from a list of parts.
// The dotted `path` string is assembled from the keys first; the parts are then rebuilt so
// that each key views that freshly built string.
PathInData::PathInData(const Parts& parts_) {
    // Order matters: build_parts() slices the keys out of `path`, so it must exist first.
    build_path(parts_);
    build_parts(parts_);
}
60
56.7M
// Copy constructor.
// Copies the path string and the typed flag, then rebuilds the parts from `other` so the
// new keys view this object's own `path` instead of the source object's string.
PathInData::PathInData(const PathInData& other) : path(other.path), is_typed(other.is_typed) {
    const auto& source_parts = other.get_parts();
    build_parts(source_parts);
}
63
64
21.6k
// Builds a path whose first part is `root`, followed by one part per element of `paths`.
// All parts are appended to the builder with is_array = false.
PathInData::PathInData(const std::string& root, const std::vector<std::string>& paths) {
    PathInDataBuilder builder;
    builder.append(root, false);
    for (const auto& segment : paths) {
        builder.append(segment, false);
    }
    // Build the string before the parts: the parts are views into it.
    build_path(builder.get_parts());
    build_parts(builder.get_parts());
}
73
74
1
// Builds a path with one part per element of `paths`, in order.
// All parts are appended to the builder with is_array = false.
PathInData::PathInData(const std::vector<std::string>& paths) {
    PathInDataBuilder builder;
    for (const auto& segment : paths) {
        builder.append(segment, false);
    }
    // Build the string before the parts: the parts are views into it.
    build_path(builder.get_parts());
    build_parts(builder.get_parts());
}
82
83
2.39M
// Copy assignment.
// Copies the path string and the typed flag, then rebuilds the parts so their keys view
// this object's own `path`. Self-assignment is a no-op.
PathInData& PathInData::operator=(const PathInData& other) {
    if (this != &other) {
        path = other.path;
        is_typed = other.is_typed;
        // build_parts() returns early for an empty part list and only ORs into has_nested.
        // Reset both here so that assigning from an empty or non-nested path does not leave
        // stale parts (viewing the string that was just replaced) or a stale nested flag.
        parts.clear();
        has_nested = false;
        build_parts(other.parts);
    }
    return *this;
}
91
92
38.9M
// Computes a 128-bit SipHash over a part list and the typed flag.
// Fed to the hash, in order: the number of parts; for every part its key bytes, its
// is_nested flag and its anonymous_array_level; finally `is_typed_`.
UInt128 PathInData::get_parts_hash(const Parts& parts_, bool is_typed_) {
    SipHash hasher;
    hasher.update(parts_.size());
    for (auto it = parts_.begin(); it != parts_.end(); ++it) {
        hasher.update(it->key.data(), it->key.length());
        hasher.update(it->is_nested);
        hasher.update(it->anonymous_array_level);
    }
    hasher.update(is_typed_);

    UInt128 digest;
    hasher.get128(digest);
    return digest;
}
105
106
19.9M
void PathInData::build_path(const Parts& other_parts) {
107
19.9M
    if (other_parts.empty()) {
108
1.76k
        return;
109
1.76k
    }
110
19.9M
    path.clear();
111
19.9M
    auto it = other_parts.begin();
112
19.9M
    path += it->key;
113
19.9M
    ++it;
114
29.6M
    for (; it != other_parts.end(); ++it) {
115
9.68M
        path += ".";
116
9.68M
        path += it->key;
117
9.68M
    }
118
19.9M
}
119
79.1M
void PathInData::build_parts(const Parts& other_parts) {
120
79.1M
    if (other_parts.empty()) {
121
47.0M
        return;
122
47.0M
    }
123
32.0M
    parts.clear();
124
32.0M
    parts.reserve(other_parts.size());
125
32.0M
    const char* begin = path.data();
126
63.4M
    for (const auto& part : other_parts) {
127
63.4M
        has_nested |= part.is_nested;
128
63.4M
        parts.emplace_back(std::string_view {begin, part.key.length()}, part.is_nested,
129
63.4M
                           part.anonymous_array_level);
130
63.4M
        begin += part.key.length() + 1;
131
63.4M
    }
132
32.0M
}
133
134
297k
// Replaces this path with the content of a ColumnPathInfo message.
// `path` and `is_typed` are taken from the message; one part is created per entry of
// path_part_infos(), and `has_nested` is recomputed from those parts.
// NOTE(review): keys are sliced out of `path` using only the lengths of the serialized
// keys, which assumes pb.path() is those keys joined by one separator character - confirm.
void PathInData::from_protobuf(const segment_v2::ColumnPathInfo& pb) {
    path = pb.path();
    is_typed = pb.is_typed();
    has_nested = false;
    parts.clear();
    parts.reserve(pb.path_part_infos().size());

    size_t offset = 0;
    for (const auto& info : pb.path_part_infos()) {
        Part current;
        // use string_view to ref data in path
        current.key = std::string_view {path.data() + offset, info.key().length()};
        current.is_nested = info.is_nested();
        current.anonymous_array_level =
                cast_set<uint8_t, uint32_t, false>(info.anonymous_array_level());
        has_nested |= current.is_nested;
        // Skip the key and the separator that follows it.
        offset += current.key.length() + 1;
        parts.push_back(current);
    }
}
153
154
2
std::string PathInData::to_jsonpath() const {
155
2
    std::string jsonpath = "$.";
156
2
    if (parts.empty()) {
157
0
        return jsonpath;
158
0
    }
159
2
    auto it = parts.begin();
160
2
    jsonpath += it->key;
161
2
    ++it;
162
3
    for (; it != parts.end(); ++it) {
163
1
        jsonpath += ".";
164
1
        jsonpath += it->key;
165
1
    }
166
2
    return jsonpath;
167
2
}
168
169
360k
// Writes this path into a ColumnPathInfo message.
// Sets the path string, the nested and typed flags and the given parent column unique id,
// then appends one ColumnPathPartInfo per part (key, is_nested, anonymous_array_level).
void PathInData::to_protobuf(segment_v2::ColumnPathInfo* pb, int32_t parent_col_unique_id) const {
    pb->set_path(path);
    pb->set_has_nested(has_nested);
    pb->set_parrent_column_unique_id(parent_col_unique_id);
    pb->set_is_typed(is_typed);

    // set parts info
    for (const auto& part : parts) {
        auto* info = pb->add_path_part_infos();
        // The key is a view; hand protobuf an owning copy.
        info->set_key(std::string(part.key));
        info->set_is_nested(part.is_nested);
        info->set_anonymous_array_level(part.anonymous_array_level);
    }
}
183
184
37.4M
// Hash functor: folds the 128-bit parts hash of `value` into a size_t by XOR-ing its
// low and high halves.
size_t PathInData::Hash::operator()(const PathInData& value) const {
    auto digest = get_parts_hash(value.parts, value.is_typed);
    const auto low_half = digest.low();
    const auto high_half = digest.high();
    return low_half ^ high_half;
}
188
189
4.62k
bool PathInData::need_record_stats() const {
190
4.62k
    return !empty() && !is_typed && !has_nested &&
191
4.62k
           path.find(DOC_VALUE_COLUMN_PATH) == std::string::npos;
192
4.62k
}
193
194
478k
// Returns a copy of this path without its first part; shorthand for copy_pop_nfront(1).
PathInData PathInData::copy_pop_front() const {
    constexpr size_t parts_to_drop = 1;
    return copy_pop_nfront(parts_to_drop);
}
197
198
62.7k
// Returns the leading portion of this path up to and including the first nested part.
// has_nested_part() is a CHECK-ed precondition. The result keeps this path's typed flag.
PathInData PathInData::get_nested_prefix_path() const {
    CHECK(has_nested_part());
    Parts prefix;
    for (auto it = parts.begin(); it != parts.end(); ++it) {
        prefix.push_back(*it);
        if (it->is_nested) {
            // The first nested part closes the prefix.
            break;
        }
    }
    PathInData result;
    result.build_path(prefix);
    result.build_parts(prefix);
    result.is_typed = is_typed;
    return result;
}
213
214
2
// Returns a copy of this path without its last part.
// A path with zero or one part yields a default-constructed PathInData.
// Otherwise the result keeps this path's typed flag.
PathInData PathInData::copy_pop_back() const {
    if (parts.size() <= 1) {
        return {};
    }
    Parts remaining = parts;
    remaining.pop_back();

    PathInData result;
    result.build_path(remaining);
    result.build_parts(remaining);
    result.is_typed = is_typed;
    return result;
}
226
227
727k
// Returns a copy of this path without its first `n` parts.
// If `n` is at least the number of parts, a default-constructed PathInData is returned.
// Otherwise the result keeps this path's typed flag.
PathInData PathInData::copy_pop_nfront(size_t n) const {
    if (n >= parts.size()) {
        return {};
    }
    // Past the guard n < parts.size(), so `parts` is non-empty and begin() + n is in range.
    Parts remaining;
    remaining.insert(remaining.end(), parts.begin() + n, parts.end());

    PathInData result;
    result.build_path(remaining);
    result.build_parts(remaining);
    result.is_typed = is_typed;
    return result;
}
241
242
bool PathInData::try_strip_prefix(const std::string& name, const std::string& prefix_dot,
243
0
                                  std::string* out) {
244
0
    if (!name.starts_with(prefix_dot)) {
245
0
        return false;
246
0
    }
247
0
    *out = name.substr(prefix_dot.size());
248
0
    return !out->empty();
249
0
}
250
251
0
// Returns `base` extended by `suffix`.
// An empty suffix yields a copy of `base`; an empty base yields a path parsed from `suffix`.
// Otherwise the result is parsed from "<base path>.<suffix>".
// NOTE(review): the joined string is re-parsed by the string constructor, so base's typed
// flag and per-part nested info are not forwarded here - confirm this is intended.
PathInData PathInData::append(const PathInData& base, std::string_view suffix) {
    if (suffix.empty()) {
        return base;
    }
    if (base.empty()) {
        return PathInData(suffix);
    }
    std::string joined;
    // One allocation for "<base>" + '.' + "<suffix>".
    joined.reserve(base.get_path().size() + 1 + suffix.size());
    joined += base.get_path();
    joined += '.';
    joined.append(suffix.data(), suffix.size());
    return PathInData(joined);
}
265
266
19.4M
// Appends one part with the given key.
// If no part exists yet, `is_array` is added to the pending anonymous array level;
// otherwise it becomes the is_nested flag of the current last part. The new part is
// created non-nested with the pending level, which is then reset to 0.
PathInDataBuilder& PathInDataBuilder::append(std::string_view key, bool is_array) {
    if (parts.empty()) {
        current_anonymous_array_level += is_array;
    } else {
        parts.back().is_nested = is_array;
    }
    parts.emplace_back(key, false, current_anonymous_array_level);
    current_anonymous_array_level = 0;
    return *this;
}
277
641k
// Appends all parts of `path`.
// If no part exists yet, `is_array` is first added to the pending anonymous array level.
// For a non-empty `path`: the current last part (if any) gets is_nested = is_array, the
// new parts are appended, the pending level is added to each of them, and the pending
// level is reset to 0. For an empty `path` nothing else changes (the level is not reset).
PathInDataBuilder& PathInDataBuilder::append(const PathInData::Parts& path, bool is_array) {
    if (parts.empty()) {
        current_anonymous_array_level += is_array;
    }
    if (path.empty()) {
        return *this;
    }
    if (!parts.empty()) {
        parts.back().is_nested = is_array;
    }
    // Remember where the appended range starts so only the new parts are adjusted.
    const size_t first_new = parts.size();
    parts.insert(parts.end(), path.begin(), path.end());
    for (size_t i = first_new; i < parts.size(); ++i) {
        parts[i].anonymous_array_level += current_anonymous_array_level;
    }
    current_anonymous_array_level = 0;
    return *this;
}
293
294
19.7M
void PathInDataBuilder::pop_back() {
295
19.7M
    if (!parts.empty()) {
296
19.7M
        parts.pop_back();
297
19.7M
    }
298
19.7M
}
299
300
627k
void PathInDataBuilder::pop_back(size_t n) {
301
627k
    assert(n <= parts.size());
302
627k
    parts.resize(parts.size() - n);
303
627k
}
304
305
} // namespace doris