Coverage Report

Created: 2026-06-03 03:56

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
be/src/exec/common/variant_util.cpp
Line
Count
Source
1
// Licensed to the Apache Software Foundation (ASF) under one
2
// or more contributor license agreements.  See the NOTICE file
3
// distributed with this work for additional information
4
// regarding copyright ownership.  The ASF licenses this file
5
// to you under the Apache License, Version 2.0 (the
6
// "License"); you may not use this file except in compliance
7
// with the License.  You may obtain a copy of the License at
8
//
9
//   http://www.apache.org/licenses/LICENSE-2.0
10
//
11
// Unless required by applicable law or agreed to in writing,
12
// software distributed under the License is distributed on an
13
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14
// KIND, either express or implied.  See the License for the
15
// specific language governing permissions and limitations
16
// under the License.
17
18
#include "exec/common/variant_util.h"
19
20
#include <assert.h>
21
#include <fmt/format.h>
22
#include <gen_cpp/FrontendService.h>
23
#include <gen_cpp/FrontendService_types.h>
24
#include <gen_cpp/HeartbeatService_types.h>
25
#include <gen_cpp/MasterService_types.h>
26
#include <gen_cpp/Status_types.h>
27
#include <gen_cpp/Types_types.h>
28
#include <glog/logging.h>
29
#include <rapidjson/document.h>
30
#include <rapidjson/stringbuffer.h>
31
#include <rapidjson/writer.h>
32
#include <simdjson/simdjson.h> // IWYU pragma: keep
33
#include <unicode/uchar.h>
34
35
#include <algorithm>
36
#include <cassert>
37
#include <cstddef>
38
#include <cstdint>
39
#include <cstring>
40
#include <list>
41
#include <memory>
42
#include <mutex>
43
#include <optional>
44
#include <ostream>
45
#include <ranges>
46
#include <set>
47
#include <stack>
48
#include <string>
49
#include <string_view>
50
#include <unordered_map>
51
#include <utility>
52
#include <vector>
53
54
#include "common/config.h"
55
#include "common/status.h"
56
#include "core/assert_cast.h"
57
#include "core/block/block.h"
58
#include "core/block/column_numbers.h"
59
#include "core/block/column_with_type_and_name.h"
60
#include "core/column/column.h"
61
#include "core/column/column_array.h"
62
#include "core/column/column_map.h"
63
#include "core/column/column_nullable.h"
64
#include "core/column/column_string.h"
65
#include "core/column/column_variant.h"
66
#include "core/data_type/data_type.h"
67
#include "core/data_type/data_type_array.h"
68
#include "core/data_type/data_type_factory.hpp"
69
#include "core/data_type/data_type_jsonb.h"
70
#include "core/data_type/data_type_nullable.h"
71
#include "core/data_type/data_type_string.h"
72
#include "core/data_type/data_type_variant.h"
73
#include "core/data_type/define_primitive_type.h"
74
#include "core/data_type/get_least_supertype.h"
75
#include "core/data_type/primitive_type.h"
76
#include "core/field.h"
77
#include "core/typeid_cast.h"
78
#include "core/types.h"
79
#include "exec/common/field_visitors.h"
80
#include "exec/common/sip_hash.h"
81
#include "exprs/function/function.h"
82
#include "exprs/function/simple_function_factory.h"
83
#include "exprs/function_context.h"
84
#include "exprs/json_functions.h"
85
#include "re2/re2.h"
86
#include "runtime/exec_env.h"
87
#include "runtime/runtime_state.h"
88
#include "storage/olap_common.h"
89
#include "storage/rowset/beta_rowset.h"
90
#include "storage/rowset/rowset.h"
91
#include "storage/rowset/rowset_fwd.h"
92
#include "storage/segment/segment_loader.h"
93
#include "storage/segment/variant/nested_group_path.h"
94
#include "storage/segment/variant/variant_column_reader.h"
95
#include "storage/segment/variant/variant_column_writer_impl.h"
96
#include "storage/tablet/tablet.h"
97
#include "storage/tablet/tablet_fwd.h"
98
#include "storage/tablet/tablet_schema.h"
99
#include "util/client_cache.h"
100
#include "util/defer_op.h"
101
#include "util/json/json_parser.h"
102
#include "util/json/path_in_data.h"
103
#include "util/json/simd_json_parser.h"
104
105
namespace doris::variant_util {
106
107
2.82k
// Appends `ch` to `*regex_output` so that it is matched literally by the regex engine:
// regex metacharacters get a leading backslash, every other character is copied verbatim.
inline void append_escaped_regex_char(std::string* regex_output, char ch) {
    // Characters that carry a special meaning in a regex outside a character class.
    static constexpr std::string_view kRegexMetaChars = ".^$+*?(){}|[]\\";
    const bool needs_escape = kRegexMetaChars.find(ch) != std::string_view::npos;
    if (needs_escape) {
        regex_output->push_back('\\');
    }
    regex_output->push_back(ch);
}
131
132
// Small LRU cache of compiled glob patterns; get_or_build_re2() evicts the least recently
// used pattern once the cache grows beyond kGlobRegexCacheCapacity entries.
133
// Maximum number of compiled patterns kept in g_glob_regex_cache.
constexpr size_t kGlobRegexCacheCapacity = 256;
134
135
// One cache slot: the compiled regex plus the position of its glob pattern in the LRU list,
// so a cache hit can move the pattern to the front without searching the list.
struct GlobRegexCacheEntry {
136
    std::shared_ptr<RE2> re2;
137
    std::list<std::string>::iterator lru_it;
138
};
139
140
// Held by get_or_build_re2() around every access to the two containers below.
static std::mutex g_glob_regex_cache_mutex;
141
// Glob patterns ordered from most recently used (front) to least recently used (back).
static std::list<std::string> g_glob_regex_cache_lru;
142
// Glob pattern -> compiled regex and LRU position.
static std::unordered_map<std::string, GlobRegexCacheEntry> g_glob_regex_cache;
143
144
200k
std::shared_ptr<RE2> get_or_build_re2(const std::string& glob_pattern) {
145
200k
    {
146
200k
        std::lock_guard<std::mutex> lock(g_glob_regex_cache_mutex);
147
200k
        auto it = g_glob_regex_cache.find(glob_pattern);
148
200k
        if (it != g_glob_regex_cache.end()) {
149
200k
            g_glob_regex_cache_lru.splice(g_glob_regex_cache_lru.begin(), g_glob_regex_cache_lru,
150
200k
                                          it->second.lru_it);
151
200k
            return it->second.re2;
152
200k
        }
153
200k
    }
154
75
    std::string regex_pattern;
155
75
    Status st = glob_to_regex(glob_pattern, &regex_pattern);
156
75
    if (!st.ok()) {
157
2
        return nullptr;
158
2
    }
159
73
    auto compiled = std::make_shared<RE2>(regex_pattern);
160
73
    if (!compiled->ok()) {
161
3
        return nullptr;
162
3
    }
163
70
    {
164
70
        std::lock_guard<std::mutex> lock(g_glob_regex_cache_mutex);
165
70
        auto it = g_glob_regex_cache.find(glob_pattern);
166
70
        if (it != g_glob_regex_cache.end()) {
167
0
            g_glob_regex_cache_lru.splice(g_glob_regex_cache_lru.begin(), g_glob_regex_cache_lru,
168
0
                                          it->second.lru_it);
169
0
            return it->second.re2;
170
0
        }
171
70
        g_glob_regex_cache_lru.push_front(glob_pattern);
172
70
        g_glob_regex_cache.emplace(glob_pattern,
173
70
                                   GlobRegexCacheEntry {compiled, g_glob_regex_cache_lru.begin()});
174
70
        if (g_glob_regex_cache.size() > kGlobRegexCacheCapacity) {
175
0
            const std::string& evict_key = g_glob_regex_cache_lru.back();
176
0
            g_glob_regex_cache.erase(evict_key);
177
0
            g_glob_regex_cache_lru.pop_back();
178
0
        }
179
70
    }
180
0
    return compiled;
181
70
}
182
183
// Convert a restricted glob pattern into a regex.
184
// Supported: '*', '?', '[...]', '\\' escape. Others are treated as literals.
185
308
Status glob_to_regex(const std::string& glob_pattern, std::string* regex_pattern) {
186
308
    regex_pattern->clear();
187
308
    regex_pattern->append("^");
188
308
    bool is_escaped = false;
189
308
    size_t pattern_length = glob_pattern.size();
190
3.25k
    for (size_t index = 0; index < pattern_length; ++index) {
191
2.95k
        char current_char = glob_pattern[index];
192
2.95k
        if (is_escaped) {
193
10
            append_escaped_regex_char(regex_pattern, current_char);
194
10
            is_escaped = false;
195
10
            continue;
196
10
        }
197
2.94k
        if (current_char == '\\') {
198
14
            is_escaped = true;
199
14
            continue;
200
14
        }
201
2.92k
        if (current_char == '*') {
202
69
            regex_pattern->append(".*");
203
69
            continue;
204
69
        }
205
2.85k
        if (current_char == '?') {
206
15
            regex_pattern->append(".");
207
15
            continue;
208
15
        }
209
2.84k
        if (current_char == '[') {
210
33
            size_t class_index = index + 1;
211
33
            bool class_closed = false;
212
33
            bool is_class_escaped = false;
213
33
            std::string class_buffer;
214
33
            if (class_index < pattern_length &&
215
33
                (glob_pattern[class_index] == '!' || glob_pattern[class_index] == '^')) {
216
9
                class_buffer.push_back('^');
217
9
                ++class_index;
218
9
            }
219
99
            for (; class_index < pattern_length; ++class_index) {
220
95
                char class_char = glob_pattern[class_index];
221
95
                if (is_class_escaped) {
222
10
                    class_buffer.push_back(class_char);
223
10
                    is_class_escaped = false;
224
10
                    continue;
225
10
                }
226
85
                if (class_char == '\\') {
227
10
                    is_class_escaped = true;
228
10
                    continue;
229
10
                }
230
75
                if (class_char == ']') {
231
29
                    class_closed = true;
232
29
                    break;
233
29
                }
234
46
                class_buffer.push_back(class_char);
235
46
            }
236
33
            if (!class_closed) {
237
4
                return Status::InvalidArgument("Unclosed character class in glob pattern: {}",
238
4
                                               glob_pattern);
239
4
            }
240
29
            regex_pattern->append("[");
241
29
            regex_pattern->append(class_buffer);
242
29
            regex_pattern->append("]");
243
29
            index = class_index;
244
29
            continue;
245
33
        }
246
2.81k
        append_escaped_regex_char(regex_pattern, current_char);
247
2.81k
    }
248
304
    if (is_escaped) {
249
4
        append_escaped_regex_char(regex_pattern, '\\');
250
4
    }
251
304
    regex_pattern->append("$");
252
304
    return Status::OK();
253
308
}
254
255
200k
bool glob_match_re2(const std::string& glob_pattern, const std::string& candidate_path) {
256
200k
    auto compiled = get_or_build_re2(glob_pattern);
257
200k
    if (compiled == nullptr) {
258
5
        return false;
259
5
    }
260
200k
    return RE2::FullMatch(candidate_path, *compiled);
261
200k
}
262
263
// NestedGroup's physical children and offsets are produced by NestedGroupWriteProvider, not by
264
// appending TabletSchema extracted columns here. This predicate keeps only ordinary Variant paths
265
// that are outside the NG tree, for example `v.owner` beside `v.items[*]`.
266
0
bool is_regular_path_outside_nested_group(const PathInData& path) {
267
0
    const std::string& relative_path = path.get_path();
268
0
    return !relative_path.empty() && !path.get_is_typed() && !path.has_nested_part() &&
269
0
           !segment_v2::contains_nested_group_marker(relative_path) &&
270
0
           !segment_v2::is_root_nested_group_path(relative_path) &&
271
0
           relative_path != SPARSE_COLUMN_PATH &&
272
0
           relative_path.find(DOC_VALUE_COLUMN_PATH) == std::string::npos;
273
0
}
274
275
bool should_materialize_nested_group_regular_subcolumns(
276
        const TabletColumnPtr& column,
277
668
        const std::unordered_map<int32_t, VariantExtendedInfo>& uid_to_variant_extended_info) {
278
668
    const auto info_it = uid_to_variant_extended_info.find(column->unique_id());
279
668
    return column->variant_enable_nested_group() ||
280
671
           (info_it != uid_to_variant_extended_info.end() && info_it->second.has_nested_group);
281
668
}
282
283
std::unordered_set<int32_t> collect_nested_group_compaction_root_uids(
284
        const TabletSchemaSPtr& target,
285
10.1k
        const std::unordered_map<int32_t, VariantExtendedInfo>& uid_to_variant_extended_info) {
286
10.1k
    std::unordered_set<int32_t> root_uids;
287
101k
    for (const TabletColumnPtr& column : target->columns()) {
288
101k
        if (column->is_variant_type() && should_materialize_nested_group_regular_subcolumns(
289
674
                                                 column, uid_to_variant_extended_info)) {
290
1
            root_uids.insert(column->unique_id());
291
1
        }
292
101k
    }
293
10.1k
    return root_uids;
294
10.1k
}
295
296
// Returns the subset of `extended_info.path_to_data_types` whose paths satisfy
// is_regular_path_outside_nested_group().
PathToDataTypes collect_regular_types_outside_nested_group(
        const VariantExtendedInfo& extended_info) {
    PathToDataTypes regular_path_to_data_types;
    for (const auto& [path, data_types] : extended_info.path_to_data_types) {
        if (is_regular_path_outside_nested_group(path)) {
            regular_path_to_data_types.emplace(path, data_types);
        }
    }
    return regular_path_to_data_types;
}
307
308
963
size_t get_number_of_dimensions(const IDataType& type) {
309
963
    if (const auto* type_array = typeid_cast<const DataTypeArray*>(&type)) {
310
4
        return type_array->get_number_of_dimensions();
311
4
    }
312
959
    return 0;
313
963
}
314
3
size_t get_number_of_dimensions(const IColumn& column) {
315
3
    if (const auto* column_array = check_and_get_column<ColumnArray>(column)) {
316
2
        return column_array->get_number_of_dimensions();
317
2
    }
318
1
    return 0;
319
3
}
320
321
127k
// Walks through (optionally Nullable-wrapped) nested arrays and returns the element type of
// the innermost array. When `type` contains no array at all, `type` itself is returned.
DataTypePtr get_base_type_of_array(const DataTypePtr& type) {
    // Work on raw pointers so that no shared pointers are copied while descending.
    auto strip_nullable = [](const IDataType* candidate) -> const IDataType* {
        if (const auto* nullable = typeid_cast<const DataTypeNullable*>(candidate)) {
            return nullable->get_nested_type().get();
        }
        return candidate;
    };

    const IDataType* cursor = strip_nullable(type.get());
    const DataTypeArray* innermost_array = nullptr;
    while (const auto* array_type = typeid_cast<const DataTypeArray*>(cursor)) {
        innermost_array = array_type;
        cursor = strip_nullable(array_type->get_nested_type().get());
    }

    if (innermost_array == nullptr) {
        return type;
    }
    return innermost_array->get_nested_type();
}
337
338
1.03M
// Casts `arg` to `type` and stores the resulting full (non-const) column in `*result`.
//
// - Target is Variant, source is Variant: only the nullability is adjusted.
// - Target is Variant, source is anything else: the source column becomes the root of a new
//   ColumnVariant (the source column must be nullable).
// - Otherwise the registered "CAST" function is executed. A source of INVALID_TYPE, or a CAST
//   execution failure, yields a column of default values of the target type.
//
// Returns InternalError when no CAST function is found, OK in every other case.
Status cast_column(const ColumnWithTypeAndName& arg, const DataTypePtr& type, ColumnPtr* result) {
    ColumnsWithTypeAndName cast_args {arg, {nullptr, type, type->get_name()}};

    // Do not go through the function framework for Variant targets: it would wrap the nullable
    // around the Variant instead of around the Variant's root, losing the null information.
    // correct output: Nullable(Array(int)) -> Nullable(Variant(Nullable(Array(int))))
    // incorrect output: Nullable(Array(int)) -> Nullable(Variant(Array(int)))
    if (type->get_primitive_type() == TYPE_VARIANT) {
        // Variant -> Variant: source and destination may only differ in nullability.
        if (arg.type->get_primitive_type() == TYPE_VARIANT) {
            if (type->is_nullable()) {
                *result = make_nullable(arg.column);
            } else {
                *result = remove_nullable(arg.column);
            }
            return Status::OK();
        }

        // Use the source column/type as the root column/type of a fresh Variant.
        CHECK(arg.column->is_nullable());
        auto bare_target_type = remove_nullable(type);
        const auto& variant_type = assert_cast<const DataTypeVariant&>(*bare_target_type);
        auto variant_column = ColumnVariant::create(variant_type.variant_max_subcolumns_count(),
                                                    variant_type.enable_doc_mode());

        variant_column->create_root(arg.type, std::move(*arg.column).mutate());
        ColumnPtr nullable_variant = ColumnNullable::create(
                variant_column->get_ptr(),
                assert_cast<const ColumnNullable*>(arg.column.get())->get_null_map_column_ptr());
        *result = type->is_nullable() ? nullable_variant : variant_column->get_ptr();
        return Status::OK();
    }

    auto cast_function = SimpleFunctionFactory::instance().get_function("CAST", cast_args, type);
    if (!cast_function) {
        return Status::InternalError("Not found cast function {} to {}", arg.type->get_name(),
                                     type->get_name());
    }
    Block scratch_block {cast_args};
    uint32_t result_position = cast_set<uint32_t>(scratch_block.columns());
    RuntimeState state;
    auto ctx = FunctionContext::create_context(&state, {}, {});

    if (arg.type->get_primitive_type() == INVALID_TYPE) {
        // Casting from "nothing" to any type produces default values.
        *result = type->create_column_const_with_default_value(arg.column->size())
                          ->convert_to_full_column_if_const();
        return Status::OK();
    }

    // A string column converted to jsonb is stored as a jsonb string field; the individual
    // rows of the source string column are not parsed.
    ctx->set_string_as_jsonb_string(true);
    ctx->set_jsonb_string_as_string(true);
    scratch_block.insert({nullptr, type, arg.name});
    // TODO(lihangyu): we should handle this error in strict mode
    if (!cast_function->execute(ctx.get(), scratch_block, {0}, result_position,
                                arg.column->size())) {
        LOG_EVERY_N(WARNING, 100) << fmt::format("cast from {} to {}", arg.type->get_name(),
                                                 type->get_name());
        *result = type->create_column_const_with_default_value(arg.column->size())
                          ->convert_to_full_column_if_const();
        return Status::OK();
    }
    *result = scratch_block.get_by_position(result_position)
                      .column->convert_to_full_column_if_const();
    VLOG_DEBUG << fmt::format("{} before convert {}, after convert {}", arg.name,
                              arg.column->get_name(), (*result)->get_name());
    return Status::OK();
}
401
402
void get_column_by_type(const DataTypePtr& data_type, const std::string& name, TabletColumn& column,
403
253k
                        const ExtraInfo& ext_info) {
404
253k
    column.set_name(name);
405
253k
    column.set_type(data_type->get_storage_field_type());
406
253k
    if (ext_info.unique_id >= 0) {
407
4
        column.set_unique_id(ext_info.unique_id);
408
4
    }
409
253k
    if (ext_info.parent_unique_id >= 0) {
410
125k
        column.set_parent_unique_id(ext_info.parent_unique_id);
411
125k
    }
412
253k
    if (!ext_info.path_info.empty()) {
413
125k
        column.set_path_info(ext_info.path_info);
414
125k
    }
415
253k
    if (data_type->is_nullable()) {
416
126k
        const auto& real_type = static_cast<const DataTypeNullable&>(*data_type);
417
126k
        column.set_is_nullable(true);
418
126k
        get_column_by_type(real_type.get_nested_type(), name, column, {});
419
126k
        return;
420
126k
    }
421
126k
    if (data_type->get_primitive_type() == PrimitiveType::TYPE_ARRAY) {
422
1.12k
        TabletColumn child;
423
1.12k
        get_column_by_type(assert_cast<const DataTypeArray*>(data_type.get())->get_nested_type(),
424
1.12k
                           "", child, {});
425
1.12k
        column.set_length(TabletColumn::get_field_length_by_type(TPrimitiveType::ARRAY, 0));
426
1.12k
        column.add_sub_column(child);
427
1.12k
        return;
428
1.12k
    }
429
125k
    if (data_type->get_primitive_type() == PrimitiveType::TYPE_VARIANT) {
430
0
        const auto* dt_variant = assert_cast<const DataTypeVariant*>(data_type.get());
431
0
        column.set_variant_max_subcolumns_count(dt_variant->variant_max_subcolumns_count());
432
0
        column.set_variant_enable_doc_mode(dt_variant->enable_doc_mode());
433
0
        return;
434
0
    }
435
    // size is not fixed when type is string or json
436
125k
    if (is_string_type(data_type->get_primitive_type()) ||
437
125k
        data_type->get_primitive_type() == TYPE_JSONB) {
438
26.9k
        column.set_length(INT_MAX);
439
26.9k
        return;
440
26.9k
    }
441
442
98.6k
    PrimitiveType type = data_type->get_primitive_type();
443
98.6k
    if (is_int_or_bool(type) || is_string_type(type) || is_float_or_double(type) || is_ip(type) ||
444
98.6k
        is_date_or_datetime(type) || type == PrimitiveType::TYPE_DATEV2) {
445
98.5k
        column.set_length(cast_set<int32_t>(data_type->get_size_of_value_in_memory()));
446
98.5k
        return;
447
98.5k
    }
448
105
    if (is_decimal(type)) {
449
105
        column.set_precision(data_type->get_precision());
450
105
        column.set_frac(data_type->get_scale());
451
105
        return;
452
105
    }
453
    // datetimev2 needs scale
454
18.4E
    if (type == PrimitiveType::TYPE_DATETIMEV2 || type == PrimitiveType::TYPE_TIMESTAMPTZ) {
455
17
        column.set_precision(-1);
456
17
        column.set_frac(data_type->get_scale());
457
17
        return;
458
17
    }
459
460
18.4E
    throw doris::Exception(doris::ErrorCode::INTERNAL_ERROR,
461
18.4E
                           "unexcepted data column type: {}, column name is: {}",
462
18.4E
                           data_type->get_name(), name);
463
18.4E
}
464
465
// Convenience overload: builds a fresh TabletColumn for `data_type` / `name` / `ext_info`
// and returns it by value.
TabletColumn get_column_by_type(const DataTypePtr& data_type, const std::string& name,
                                const ExtraInfo& ext_info) {
    TabletColumn built_column;
    get_column_by_type(data_type, name, built_column, ext_info);
    return built_column;
}
471
472
// check if two paths which same prefix have different structure
473
static bool has_different_structure_in_same_path(const PathInData::Parts& lhs,
474
9.04k
                                                 const PathInData::Parts& rhs) {
475
9.04k
    if (lhs.size() != rhs.size()) {
476
1
        return false; // different size means different structure
477
1
    }
478
    // Since we group by path string, lhs and rhs must have the same size and keys
479
    // We only need to check if they have different nested structure
480
36.1k
    for (size_t i = 0; i < lhs.size(); ++i) {
481
27.0k
        if (lhs[i] != rhs[i]) {
482
5
            VLOG_DEBUG << fmt::format(
483
0
                    "Check different structure: {} vs {}, lhs[i].is_nested: {}, rhs[i].is_nested: "
484
0
                    "{}",
485
0
                    lhs[i].key, rhs[i].key, lhs[i].is_nested, rhs[i].is_nested);
486
5
            return true;
487
5
        }
488
27.0k
    }
489
9.03k
    return false;
490
9.04k
}
491
492
4.81k
// Verifies that no two entries of `tuple_paths` share the same path string while
// has_different_structure_in_same_path() reports a difference between their parts.
// Returns DataQualityError for the first such pair found, OK otherwise.
Status check_variant_has_no_ambiguous_paths(const PathsInData& tuple_paths) {
    // Bucket the positions by path string so that only same-string paths are compared.
    std::unordered_map<std::string, std::vector<size_t>> positions_by_path;
    for (size_t pos = 0; pos < tuple_paths.size(); ++pos) {
        const auto& path = tuple_paths[pos];
        positions_by_path[path.get_path()].push_back(pos);
        VLOG_DEBUG << "tuple_paths[i]: " << path.get_path();
    }

    for (const auto& [path_str, positions] : positions_by_path) {
        // A bucket with a single member cannot conflict with anything.
        if (positions.size() <= 1) {
            continue;
        }
        // Compare every member with all members that precede it in the bucket.
        for (size_t later = 1; later < positions.size(); ++later) {
            const auto& later_path = tuple_paths[positions[later]];
            for (size_t earlier = 0; earlier < later; ++earlier) {
                const auto& earlier_path = tuple_paths[positions[earlier]];
                if (!has_different_structure_in_same_path(later_path.get_parts(),
                                                          earlier_path.get_parts())) {
                    continue;
                }
                return Status::DataQualityError(
                        "Ambiguous paths: {} vs {} with different nested part {} vs {}",
                        later_path.get_path(), earlier_path.get_path(),
                        later_path.has_nested_part(), earlier_path.has_nested_part());
            }
        }
    }
    return Status::OK();
}
525
526
// Computes one common type per subcolumn path and appends the resulting columns to
// `common_schema`.
//
// @param subcolumns_types       all types observed for each path (each list must be non-empty)
// @param common_schema          schema to append to; must be uniquely owned (use_count() == 1)
// @param variant_col_unique_id  unique id of the owning variant column, recorded as the parent
//                               unique id of every appended column
// @param typed_columns          typed sub columns keyed by their path without the root part;
//                               a match is used verbatim unless the path has a nested part
// @param path_set               optional output; receives every appended path when non-null
// @return always OK
//
// For each path: when the observed types disagree on the number of array dimensions the common
// type is Nullable(JSONB); otherwise it is the least supertype, made nullable if it is not
// already. The dummy column path is skipped.
Status update_least_schema_internal(const std::map<PathInData, DataTypes>& subcolumns_types,
                                    TabletSchemaSPtr& common_schema, int32_t variant_col_unique_id,
                                    const std::map<std::string, TabletColumnPtr>& typed_columns,
                                    std::set<PathInData>* path_set) {
    PathsInData tuple_paths;
    DataTypes tuple_types;
    CHECK(common_schema.use_count() == 1);
    // Get the least common type for all paths.
    for (const auto& [key, subtypes] : subcolumns_types) {
        assert(!subtypes.empty());
        if (key.get_path() == ColumnVariant::COLUMN_NAME_DUMMY) {
            continue;
        }
        const size_t first_dim = get_number_of_dimensions(*subtypes[0]);
        tuple_paths.emplace_back(key);

        bool dimensions_conflict = false;
        for (size_t i = 1; i < subtypes.size(); ++i) {
            if (first_dim != get_number_of_dimensions(*subtypes[i])) {
                tuple_types.emplace_back(make_nullable(std::make_shared<DataTypeJsonb>()));
                LOG(INFO) << fmt::format(
                        "Uncompatible types of subcolumn '{}': {} and {}, cast to JSONB",
                        key.get_path(), subtypes[0]->get_name(), subtypes[i]->get_name());
                dimensions_conflict = true;
                break;
            }
        }
        if (dimensions_conflict) {
            // The JSONB fallback type has already been recorded for this path.
            continue;
        }

        DataTypePtr common_type;
        get_least_supertype_jsonb(subtypes, &common_type);
        if (!common_type->is_nullable()) {
            common_type = make_nullable(common_type);
        }
        tuple_types.emplace_back(common_type);
    }
    CHECK_EQ(tuple_paths.size(), tuple_types.size());

    // Append all common type columns of this variant
    for (size_t i = 0; i < tuple_paths.size(); ++i) {
        const auto& path = tuple_paths[i];
        TabletColumn common_column;
        // typed path not contains root part
        const auto path_without_root = path.copy_pop_front().get_path();
        const auto typed_it = typed_columns.find(path_without_root);
        if (typed_it != typed_columns.end() && !path.has_nested_part()) {
            common_column = *typed_it->second;
            // parent unique id and path may not be init in write path
            common_column.set_parent_unique_id(variant_col_unique_id);
            common_column.set_path_info(path);
            common_column.set_name(path.get_path());
        } else {
            get_column_by_type(tuple_types[i], path.get_path(), common_column,
                               ExtraInfo {.unique_id = -1,
                                          .parent_unique_id = variant_col_unique_id,
                                          .path_info = path});
        }
        common_schema->append_column(common_column);
        if (path_set != nullptr) {
            path_set->insert(path);
        }
    }
    return Status::OK();
}
587
588
// Gathers, from every schema in `schemas`, the types of all subcolumns that belong to the
// variant column `variant_col_unique_id`, rejects ambiguous paths, and then appends one
// common-typed column per path to `common_schema` via update_least_schema_internal().
// `path_set` (optional) receives the appended paths.
Status update_least_common_schema(const std::vector<TabletSchemaSPtr>& schemas,
                                  TabletSchemaSPtr& common_schema, int32_t variant_col_unique_id,
                                  std::set<PathInData>* path_set) {
    // Typed sub columns of the variant column, keyed by name.
    std::map<std::string, TabletColumnPtr> typed_columns;
    for (const TabletColumnPtr& sub_column :
         common_schema->column_by_uid(variant_col_unique_id).get_sub_columns()) {
        typed_columns[sub_column->name()] = sub_column;
    }

    // Every type seen for each subcolumn path, across all schemas.
    std::map<PathInData, DataTypes> subcolumns_types;
    // Flat list of the same paths (duplicates kept) for the batch ambiguity check.
    std::vector<PathInData> all_paths;

    for (const TabletSchemaSPtr& schema : schemas) {
        for (const TabletColumnPtr& col : schema->columns()) {
            // Only subcolumns extracted from this particular variant are of interest.
            const bool belongs_to_variant = col->has_path_info() &&
                                            col->parent_unique_id() >= 0 &&
                                            col->parent_unique_id() == variant_col_unique_id;
            if (!belongs_to_variant) {
                continue;
            }
            subcolumns_types[*col->path_info_ptr()].emplace_back(
                    DataTypeFactory::instance().create_data_type(*col, col->is_nullable()));
            all_paths.push_back(*col->path_info_ptr());
        }
    }

    RETURN_IF_ERROR(check_variant_has_no_ambiguous_paths(all_paths));

    return update_least_schema_internal(subcolumns_types, common_schema, variant_col_unique_id,
                                        typed_columns, path_set);
}
620
621
// Keep variant subcolumn BF support aligned with FE DDL checks.
622
135k
bool is_bf_supported_by_fe_for_variant_subcolumn(FieldType type) {
623
135k
    switch (type) {
624
91
    case FieldType::OLAP_FIELD_TYPE_SMALLINT:
625
482
    case FieldType::OLAP_FIELD_TYPE_INT:
626
94.4k
    case FieldType::OLAP_FIELD_TYPE_BIGINT:
627
94.5k
    case FieldType::OLAP_FIELD_TYPE_LARGEINT:
628
94.5k
    case FieldType::OLAP_FIELD_TYPE_CHAR:
629
94.5k
    case FieldType::OLAP_FIELD_TYPE_VARCHAR:
630
121k
    case FieldType::OLAP_FIELD_TYPE_STRING:
631
121k
    case FieldType::OLAP_FIELD_TYPE_DATE:
632
121k
    case FieldType::OLAP_FIELD_TYPE_DATETIME:
633
121k
    case FieldType::OLAP_FIELD_TYPE_DATEV2:
634
122k
    case FieldType::OLAP_FIELD_TYPE_DATETIMEV2:
635
122k
    case FieldType::OLAP_FIELD_TYPE_TIMESTAMPTZ:
636
122k
    case FieldType::OLAP_FIELD_TYPE_DECIMAL:
637
122k
    case FieldType::OLAP_FIELD_TYPE_DECIMAL32:
638
122k
    case FieldType::OLAP_FIELD_TYPE_DECIMAL64:
639
122k
    case FieldType::OLAP_FIELD_TYPE_DECIMAL128I:
640
122k
    case FieldType::OLAP_FIELD_TYPE_DECIMAL256:
641
122k
    case FieldType::OLAP_FIELD_TYPE_IPV4:
642
122k
    case FieldType::OLAP_FIELD_TYPE_IPV6:
643
122k
        return true;
644
12.5k
    default:
645
12.5k
        return false;
646
135k
    }
647
135k
}
648
649
void inherit_column_attributes(const TabletColumn& source, TabletColumn& target,
650
135k
                               TabletSchemaSPtr* target_schema) {
651
135k
    if (!target.is_extracted_column()) {
652
0
        return;
653
0
    }
654
135k
    target.set_aggregation_method(source.aggregation());
655
656
    // 1. bloom filter
657
135k
    if (is_bf_supported_by_fe_for_variant_subcolumn(target.type())) {
658
122k
        target.set_is_bf_column(source.is_bf_column());
659
122k
    }
660
661
135k
    if (!target_schema) {
662
128k
        return;
663
128k
    }
664
665
    // 2. inverted index
666
7.17k
    TabletIndexes indexes_to_add;
667
7.17k
    auto source_indexes = (*target_schema)->inverted_indexs(source.unique_id());
668
    // if target is variant type, we need to inherit all indexes
669
    // because this schema is a read schema from fe
670
7.17k
    if (target.is_variant_type()) {
671
6.49k
        for (auto& index : source_indexes) {
672
282
            auto index_info = std::make_shared<TabletIndex>(*index);
673
282
            index_info->set_escaped_escaped_index_suffix_path(target.path_info_ptr()->get_path());
674
282
            indexes_to_add.emplace_back(std::move(index_info));
675
282
        }
676
6.49k
    } else {
677
676
        inherit_index(source_indexes, indexes_to_add, target);
678
676
    }
679
7.17k
    auto target_indexes = (*target_schema)
680
7.17k
                                  ->inverted_indexs(target.parent_unique_id(),
681
7.17k
                                                    target.path_info_ptr()->get_path());
682
7.42k
    if (target_indexes.empty()) {
683
7.42k
        for (auto& index_info : indexes_to_add) {
684
292
            (*target_schema)->append_index(std::move(*index_info));
685
292
        }
686
7.42k
    }
687
688
    // 3. TODO: gnragm bf index
689
7.17k
}
690
691
8.50k
// Walks every column of `schema` and, for each extracted column whose parent column is
// still present in the schema, inherits the parent's attributes (including index meta
// that the extracted column may be missing).
void inherit_column_attributes(TabletSchemaSPtr& schema) {
    for (size_t idx = 0; idx < schema->num_columns(); ++idx) {
        TabletColumn& column = schema->mutable_column(idx);
        // Skip ordinary columns, and extracted columns whose parent is missing
        // (maybe dropped).
        const bool has_live_parent = column.is_extracted_column() &&
                                     schema->field_index(column.parent_unique_id()) != -1;
        if (has_live_parent) {
            inherit_column_attributes(schema->column_by_uid(column.parent_unique_id()), column,
                                      &schema);
        }
    }
}
705
706
// Builds `output_schema` as the least common schema over `schemas`.
//
// The base is `base_schema` when given; otherwise the entry of `schemas` with the
// highest schema_version(). Extracted columns of the base are dropped, then
// update_least_common_schema() is run once per variant column of the base, and finally
// inherit_column_attributes() is applied to the result.
//
// Returns:
//   * InternalError when `base_schema` is null and `schemas` is empty (no base can be chosen);
//   * any error produced by update_least_common_schema();
//   * DataQualityError when `check_schema_size` is set and the merged schema has more
//     columns than config::variant_max_merged_tablet_schema_size;
//   * OK otherwise.
Status get_least_common_schema(const std::vector<TabletSchemaSPtr>& schemas,
                               const TabletSchemaSPtr& base_schema, TabletSchemaSPtr& output_schema,
                               bool check_schema_size) {
    std::vector<int32_t> variant_column_unique_id;
    // Construct a schema excluding the extracted columns and gather unique identifiers for variants.
    // Ensure that the output schema also excludes these extracted columns. This approach prevents
    // duplicated paths following the update_least_common_schema process.
    auto build_schema_without_extracted_columns = [&](const TabletSchemaSPtr& source_schema) {
        output_schema = std::make_shared<TabletSchema>();
        // not copy columns but only shadow copy other attributes
        output_schema->shawdow_copy_without_columns(*source_schema);
        // Get all columns without extracted columns and collect variant col unique id
        for (const TabletColumnPtr& col : source_schema->columns()) {
            if (col->is_variant_type()) {
                variant_column_unique_id.push_back(col->unique_id());
            }
            if (!col->is_extracted_column()) {
                output_schema->append_column(*col);
            }
        }
    };
    if (base_schema == nullptr) {
        // std::max_element returns end() for an empty range; dereferencing it is undefined
        // behavior, so reject that input explicitly.
        if (schemas.empty()) {
            return Status::InternalError(
                    "Cannot build least common schema: no input schemas and no base schema");
        }
        // Pick tablet schema with max schema version. The comparator takes references to
        // avoid copying a shared_ptr on every comparison.
        // Keep an owning copy: `output_schema` is reassigned inside the builder and could
        // alias an element of `schemas`.
        TabletSchemaSPtr max_version_schema =
                *std::max_element(schemas.cbegin(), schemas.cend(),
                                  [](const TabletSchemaSPtr& a, const TabletSchemaSPtr& b) {
                                      return a->schema_version() < b->schema_version();
                                  });
        CHECK(max_version_schema);
        build_schema_without_extracted_columns(max_version_schema);
    } else {
        // use input base_schema schema as base schema
        build_schema_without_extracted_columns(base_schema);
    }

    for (int32_t unique_id : variant_column_unique_id) {
        std::set<PathInData> path_set;
        RETURN_IF_ERROR(update_least_common_schema(schemas, output_schema, unique_id, &path_set));
    }

    inherit_column_attributes(output_schema);
    if (check_schema_size &&
        output_schema->columns().size() > config::variant_max_merged_tablet_schema_size) {
        return Status::DataQualityError("Reached max column size limit {}",
                                        config::variant_max_merged_tablet_schema_size);
    }

    return Status::OK();
}
755
756
// sort by paths in lexicographical order
757
4.61k
ColumnVariant::Subcolumns get_sorted_subcolumns(const ColumnVariant::Subcolumns& subcolumns) {
758
    // sort by paths in lexicographical order
759
4.61k
    ColumnVariant::Subcolumns sorted = subcolumns;
760
287k
    std::sort(sorted.begin(), sorted.end(), [](const auto& lhsItem, const auto& rhsItem) {
761
287k
        return lhsItem->path < rhsItem->path;
762
287k
    });
763
4.61k
    return sorted;
764
4.61k
}
765
766
bool has_schema_index_diff(const TabletSchema* new_schema, const TabletSchema* old_schema,
767
23.5k
                           int32_t new_col_idx, int32_t old_col_idx) {
768
23.5k
    const auto& column_new = new_schema->column(new_col_idx);
769
23.5k
    const auto& column_old = old_schema->column(old_col_idx);
770
771
23.5k
    if (column_new.is_bf_column() != column_old.is_bf_column()) {
772
94
        return true;
773
94
    }
774
775
23.4k
    auto new_schema_inverted_indexs = new_schema->inverted_indexs(column_new);
776
23.4k
    auto old_schema_inverted_indexs = old_schema->inverted_indexs(column_old);
777
778
23.4k
    if (new_schema_inverted_indexs.size() != old_schema_inverted_indexs.size()) {
779
706
        return true;
780
706
    }
781
782
23.1k
    for (size_t i = 0; i < new_schema_inverted_indexs.size(); ++i) {
783
391
        if (!new_schema_inverted_indexs[i]->is_same_except_id(old_schema_inverted_indexs[i])) {
784
20
            return true;
785
20
        }
786
391
    }
787
788
22.7k
    return false;
789
22.7k
}
790
791
3.47k
// Builds the sparse column of `variant`: a MAP column named
// "<variant lower-case name>.<SPARSE_COLUMN_PATH>" that inherits the variant's
// aggregation method and points back to it through parent_unique_id.
TabletColumn create_sparse_column(const TabletColumn& variant) {
    // The column name and its path share the same text.
    const std::string sparse_name = variant.name_lower_case() + "." + SPARSE_COLUMN_PATH;

    TabletColumn sparse_column;
    sparse_column.set_name(sparse_name);
    sparse_column.set_type(FieldType::OLAP_FIELD_TYPE_MAP);
    sparse_column.set_aggregation_method(variant.aggregation());
    sparse_column.set_path_info(PathInData {sparse_name});
    sparse_column.set_parent_unique_id(variant.unique_id());
    // set default value to "NULL" DefaultColumnIterator will call insert_many_defaults
    sparse_column.set_default_value("NULL");

    // Two STRING sub columns (presumably the map's key and value -- confirm).
    TabletColumn string_child;
    string_child.set_type(FieldType::OLAP_FIELD_TYPE_STRING);
    for (int i = 0; i < 2; ++i) {
        sparse_column.add_sub_column(string_child);
    }
    return sparse_column;
}
806
807
13.5k
// Builds one shard (bucket) of the sparse column of `variant`: a MAP column named
// "<variant lower-case name>.<SPARSE_COLUMN_PATH>.b<bucket_index>" that inherits the
// variant's aggregation method and points back to it through parent_unique_id.
TabletColumn create_sparse_shard_column(const TabletColumn& variant, int bucket_index) {
    std::string shard_name = variant.name_lower_case() + "." + SPARSE_COLUMN_PATH + ".b" +
                             std::to_string(bucket_index);

    TabletColumn shard_column;
    shard_column.set_name(shard_name);
    shard_column.set_type(FieldType::OLAP_FIELD_TYPE_MAP);
    shard_column.set_aggregation_method(variant.aggregation());
    shard_column.set_parent_unique_id(variant.unique_id());
    shard_column.set_default_value("NULL");
    // The path carries the same text as the column name.
    PathInData shard_path(shard_name);
    shard_column.set_path_info(shard_path);

    // Two STRING sub columns (presumably the map's key and value -- confirm).
    TabletColumn string_child;
    string_child.set_type(FieldType::OLAP_FIELD_TYPE_STRING);
    for (int i = 0; i < 2; ++i) {
        shard_column.add_sub_column(string_child);
    }
    return shard_column;
}
824
825
8.19k
// Builds one bucket of the doc-value column of `variant`: a MAP column named
// "<variant lower-case name>.<DOC_VALUE_COLUMN_PATH>.b<bucket_index>" that inherits the
// variant's aggregation method and points back to it through parent_unique_id.
TabletColumn create_doc_value_column(const TabletColumn& variant, int bucket_index) {
    std::string bucket_name = variant.name_lower_case() + "." + DOC_VALUE_COLUMN_PATH + ".b" +
                              std::to_string(bucket_index);

    TabletColumn doc_column;
    doc_column.set_name(bucket_name);
    doc_column.set_type(FieldType::OLAP_FIELD_TYPE_MAP);
    doc_column.set_aggregation_method(variant.aggregation());
    doc_column.set_parent_unique_id(variant.unique_id());
    doc_column.set_default_value("NULL");
    // The path carries the same text as the column name.
    doc_column.set_path_info(PathInData {bucket_name});

    // Two STRING sub columns (presumably the map's key and value -- confirm).
    TabletColumn string_child;
    string_child.set_type(FieldType::OLAP_FIELD_TYPE_STRING);
    for (int i = 0; i < 2; ++i) {
        doc_column.add_sub_column(string_child);
    }
    return doc_column;
}
842
843
234k
uint32_t variant_binary_shard_of(const StringRef& path, uint32_t bucket_num) {
844
234k
    if (bucket_num <= 1) return 0;
845
45.4k
    SipHash hash;
846
45.4k
    hash.update(path.data, path.size);
847
45.4k
    uint64_t h = hash.get64();
848
45.4k
    return static_cast<uint32_t>(h % bucket_num);
849
234k
}
850
851
// Accumulates, per variant column uid, the non-null counts of every path found in the
// segments of rowset `rs` into `uid_to_path_stats`.
//
// Only variant columns with a non-negative unique id that pass
// should_check_variant_path_stats() are considered. For each segment both the sparse
// column stats and the subcolumn stats are added up. An entry for a uid is created
// only once at least one path is seen for it.
//
// Returns the first error from loading segments, obtaining a column reader or loading
// the external meta; OK otherwise.
Status VariantCompactionUtil::aggregate_path_to_stats(
        const RowsetSharedPtr& rs,
        std::unordered_map<int32_t, PathToNoneNullValues>* uid_to_path_stats) {
    SegmentCacheHandle segment_cache;
    RETURN_IF_ERROR(SegmentLoader::instance()->load_segments(
            std::static_pointer_cast<BetaRowset>(rs), &segment_cache));

    for (const auto& column : rs->tablet_schema()->columns()) {
        const bool skip_column = !column->is_variant_type() || column->unique_id() < 0 ||
                                 !should_check_variant_path_stats(*column);
        if (skip_column) {
            continue;
        }
        const int32_t uid = column->unique_id();

        // Adds one path -> size table into the aggregate for this uid. The map entry is
        // created lazily, so a uid without any path never shows up in the output.
        const auto accumulate = [&](const auto& path_sizes) {
            for (const auto& [path, size] : path_sizes) {
                (*uid_to_path_stats)[uid][path] += size;
            }
        };

        for (const auto& segment : segment_cache.get_segments()) {
            std::shared_ptr<ColumnReader> column_reader;
            OlapReaderStatistics reader_stats;
            RETURN_IF_ERROR(segment->get_column_reader(uid, &column_reader, &reader_stats));
            if (!column_reader) {
                continue;
            }

            CHECK(column_reader->get_meta_type() == FieldType::OLAP_FIELD_TYPE_VARIANT);
            auto* variant_reader =
                    assert_cast<segment_v2::VariantColumnReader*>(column_reader.get());
            // load external meta before getting stats
            RETURN_IF_ERROR(variant_reader->load_external_meta_once());
            const auto* segment_stats = variant_reader->get_stats();
            CHECK(segment_stats);

            // agg path -> stats
            accumulate(segment_stats->sparse_column_non_null_size);
            accumulate(segment_stats->subcolumns_non_null_size);
        }
    }
    return Status::OK();
}
894
895
// Collects, per variant column uid, the extended information found in the segments of
// rowset `rs` into `uid_to_variant_extended_info`:
//   1. path -> non-null count (and the set of sparse paths), skipped for nested-group columns;
//   2. path -> data types;
//   3. typed paths;
//   4. nested paths, skipped for nested-group columns.
// An entry is created for every variant column, even when no segment contributes to it.
//
// Returns the first error from loading segments, obtaining a column reader or loading
// the external meta; OK otherwise.
Status VariantCompactionUtil::aggregate_variant_extended_info(
        const RowsetSharedPtr& rs,
        std::unordered_map<int32_t, VariantExtendedInfo>* uid_to_variant_extended_info) {
    SegmentCacheHandle segment_cache;
    RETURN_IF_ERROR(SegmentLoader::instance()->load_segments(
            std::static_pointer_cast<BetaRowset>(rs), &segment_cache));

    for (const auto& column : rs->tablet_schema()->columns()) {
        if (!column->is_variant_type()) {
            continue;
        }
        auto& info = (*uid_to_variant_extended_info)[column->unique_id()];
        const bool nested_group_enabled = column->variant_enable_nested_group();
        if (nested_group_enabled) {
            info.has_nested_group = true;
        }

        for (const auto& segment : segment_cache.get_segments()) {
            std::shared_ptr<ColumnReader> column_reader;
            OlapReaderStatistics reader_stats;
            RETURN_IF_ERROR(segment->get_column_reader(column->unique_id(), &column_reader,
                                                       &reader_stats));
            if (!column_reader) {
                continue;
            }

            CHECK(column_reader->get_meta_type() == FieldType::OLAP_FIELD_TYPE_VARIANT);
            auto* variant_reader =
                    assert_cast<segment_v2::VariantColumnReader*>(column_reader.get());
            // load external meta before getting stats
            RETURN_IF_ERROR(variant_reader->load_external_meta_once());
            const auto* segment_stats = variant_reader->get_stats();
            CHECK(segment_stats);

            // 1. agg path -> stats
            // NG roots still need type metadata for regular subpaths such as `v.owner`,
            // but their compaction schema should not be driven by flat path stats.
            if (!nested_group_enabled) {
                for (const auto& [path, size] : segment_stats->sparse_column_non_null_size) {
                    info.path_to_none_null_values[path] += size;
                    info.sparse_paths.emplace(path);
                }
                for (const auto& [path, size] : segment_stats->subcolumns_non_null_size) {
                    info.path_to_none_null_values[path] += size;
                }
            }

            // 2. agg path -> schema
            variant_reader->get_subcolumns_types(&info.path_to_data_types);

            // 3. extract typed paths
            variant_reader->get_typed_paths(&info.typed_paths);

            // 4. extract nested paths
            if (!nested_group_enabled) {
                variant_reader->get_nested_paths(&info.nested_paths);
            }
        }
    }
    return Status::OK();
}
954
955
// get the subpaths and sparse paths for the variant column
956
void VariantCompactionUtil::get_subpaths(int32_t max_subcolumns_count,
957
                                         const PathToNoneNullValues& stats,
958
285
                                         TabletSchema::PathsSetInfo& paths_set_info) {
959
285
    std::vector<std::pair<size_t, std::string_view>> paths_with_sizes;
960
285
    paths_with_sizes.reserve(stats.size());
961
2.46k
    for (const auto& [path, size] : stats) {
962
2.46k
        if (paths_set_info.typed_path_set.contains(path)) {
963
206
            continue;
964
206
        }
965
2.25k
        paths_with_sizes.emplace_back(size, path);
966
2.25k
    }
967
968
    // max_subcolumns_count is 0 means no limit
969
285
    if (max_subcolumns_count > 0 && paths_with_sizes.size() > max_subcolumns_count) {
970
142
        std::sort(paths_with_sizes.begin(), paths_with_sizes.end(), std::greater());
971
972
        // Select top N paths as subcolumns, remaining paths as sparse columns
973
1.93k
        for (const auto& [size, path] : paths_with_sizes) {
974
1.93k
            if (paths_set_info.sub_path_set.size() < max_subcolumns_count) {
975
323
                paths_set_info.sub_path_set.emplace(path);
976
1.60k
            } else {
977
1.60k
                paths_set_info.sparse_path_set.emplace(path);
978
1.60k
            }
979
1.93k
        }
980
142
        LOG(INFO) << "subpaths " << paths_set_info.sub_path_set.size() << " sparse paths "
981
142
                  << paths_set_info.sparse_path_set.size() << " variant max subcolumns count "
982
142
                  << max_subcolumns_count << " stats size " << paths_with_sizes.size();
983
143
    } else {
984
        // Apply all paths as subcolumns
985
324
        for (const auto& [_, path] : paths_with_sizes) {
986
324
            paths_set_info.sub_path_set.emplace(path);
987
324
        }
988
143
    }
989
285
}
990
991
// Sanity check run after compaction: verifies that the per-path non-null counts of the
// variant columns in `output` are consistent with those aggregated over `intputs`.
//
// The check is skipped (OK is returned) when:
//   * the output schema has no variant column;
//   * any input variant column fails should_check_variant_path_stats();
//   * any input column is an extracted column;
//   * the tablet is not DUP_KEYS (rows may be merged in other models);
//   * any input rowset has a delete predicate;
//   * any output variant column fails should_check_variant_path_stats().
// Outside BE_TEST builds, an extracted column in the output schema that is not a
// doc-value / sparse column is reported as InternalError.
//
// For each output variant column (doc-mode columns excluded):
//   * if the number of output paths exceeds variant_max_sparse_column_statistics_size(),
//     input stats may be inaccurate; only when the output has exactly one segment, every
//     path also present in the input must not have an input count larger than the output;
//   * otherwise every output path must exist in the input with exactly the same count.
// Violations are returned as InternalError.
Status VariantCompactionUtil::check_path_stats(const std::vector<RowsetSharedPtr>& intputs,
                                               RowsetSharedPtr output, BaseTabletSPtr tablet) {
    if (output->tablet_schema()->num_variant_columns() == 0) {
        return Status::OK();
    }
    for (const auto& input_rowset : intputs) {
        for (const auto& input_column : input_rowset->tablet_schema()->columns()) {
            if (input_column->is_variant_type() &&
                !should_check_variant_path_stats(*input_column)) {
                return Status::OK();
            }
        }
    }
    // check no extended schema in input rowsets
    for (const auto& input_rowset : intputs) {
        for (const auto& input_column : input_rowset->tablet_schema()->columns()) {
            if (input_column->is_extracted_column()) {
                return Status::OK();
            }
        }
    }
#ifndef BE_TEST
    // check no extended schema in output rowset
    for (const auto& output_column : output->tablet_schema()->columns()) {
        if (!output_column->is_extracted_column()) {
            continue;
        }
        const auto& name = output_column->name();
        // Doc-value and sparse columns are expected extracted columns.
        const bool is_internal_column =
                name.find("." + DOC_VALUE_COLUMN_PATH + ".") != std::string::npos ||
                name.find("." + SPARSE_COLUMN_PATH + ".") != std::string::npos ||
                name.ends_with("." + SPARSE_COLUMN_PATH);
        if (is_internal_column) {
            continue;
        }
        return Status::InternalError("Unexpected extracted column {} in output rowset",
                                     output_column->name());
    }
#endif
    // only check path stats for dup_keys since the rows may be merged in other models
    if (tablet->keys_type() != KeysType::DUP_KEYS) {
        return Status::OK();
    }
    // if there is a delete predicate in the input rowsets, we skip the path stats check
    for (auto& input_rowset : intputs) {
        if (input_rowset->rowset_meta()->has_delete_predicate()) {
            return Status::OK();
        }
    }
    for (const auto& output_column : output->tablet_schema()->columns()) {
        if (output_column->is_variant_type() && !should_check_variant_path_stats(*output_column)) {
            return Status::OK();
        }
    }

    std::unordered_map<int32_t, PathToNoneNullValues> original_uid_to_path_stats;
    for (const auto& rs : intputs) {
        RETURN_IF_ERROR(aggregate_path_to_stats(rs, &original_uid_to_path_stats));
    }
    std::unordered_map<int32_t, PathToNoneNullValues> output_uid_to_path_stats;
    RETURN_IF_ERROR(aggregate_path_to_stats(output, &output_uid_to_path_stats));

    for (const auto& [uid, stats] : output_uid_to_path_stats) {
        const auto& variant_column = output->tablet_schema()->column_by_uid(uid);
        if (variant_column.is_variant_type() && variant_column.variant_enable_doc_mode()) {
            continue;
        }
        const auto input_entry = original_uid_to_path_stats.find(uid);
        if (input_entry == original_uid_to_path_stats.end()) {
            return Status::InternalError("Path stats not found for uid {}, tablet_id {}", uid,
                                         tablet->tablet_id());
        }
        const auto& input_stats = input_entry->second;

        // In input rowsets, some rowsets may have statistics values exceeding the maximum limit,
        // which leads to inaccurate statistics
        if (stats.size() > variant_column.variant_max_sparse_column_statistics_size()) {
            // When there is only one segment, we can ensure that the size of each path in output stats is accurate
            if (output->num_segments() != 1) {
                continue;
            }
            for (const auto& [path, size] : stats) {
                const auto input_path = input_stats.find(path);
                if (input_path == input_stats.end()) {
                    continue;
                }
                if (input_path->second > size) {
                    return Status::InternalError(
                            "Path stats not smaller for uid {} with path `{}`, input size {}, "
                            "output "
                            "size {}, "
                            "tablet_id {}",
                            uid, path, input_path->second, size, tablet->tablet_id());
                }
            }
            continue;
        }

        // in this case, input stats is accurate, so we check the stats size and stats value
        for (const auto& [path, size] : stats) {
            const auto input_path = input_stats.find(path);
            if (input_path == input_stats.end()) {
                return Status::InternalError(
                        "Path stats not found for uid {}, path {}, tablet_id {}", uid, path,
                        tablet->tablet_id());
            }
            if (input_path->second != size) {
                return Status::InternalError(
                        "Path stats not match for uid {} with path `{}`, input size {}, output "
                        "size {}, "
                        "tablet_id {}",
                        uid, path, input_path->second, size, tablet->tablet_id());
            }
        }
    }

    return Status::OK();
}
1104
1105
// Appends one column per typed path of `parent_column` to `output_schema` and records
// it in `paths_set_info.typed_path_set`.
//
// Nothing is done when the parent has variant_enable_typed_paths_to_sparse() set.
// Returns InternalError as soon as generate_sub_column_info() fails for a path;
// columns appended before that point stay in `output_schema`.
Status VariantCompactionUtil::get_compaction_typed_columns(
        const TabletSchemaSPtr& target, const std::unordered_set<std::string>& typed_paths,
        const TabletColumnPtr parent_column, TabletSchemaSPtr& output_schema,
        TabletSchema::PathsSetInfo& paths_set_info) {
    if (parent_column->variant_enable_typed_paths_to_sparse()) {
        return Status::OK();
    }
    for (const auto& typed_path : typed_paths) {
        TabletSchema::SubColumnInfo column_info;
        if (!generate_sub_column_info(*target, parent_column->unique_id(), typed_path,
                                      &column_info)) {
            return Status::InternalError("Failed to generate sub column info for path {}",
                                         typed_path);
        }
        inherit_column_attributes(*parent_column, column_info.column);
        output_schema->append_column(column_info.column);
        // The column was already copied into the schema, so the info can be moved away.
        paths_set_info.typed_path_set.insert({typed_path, std::move(column_info)});
        VLOG_DEBUG << "append typed column " << typed_path;
    }
    return Status::OK();
}
1125
1126
// Appends one column per nested path of `parent_column` to `output_schema`.
//
// The column type is the least supertype (via get_least_supertype_jsonb()) of the data
// types recorded for the path in `path_to_data_types`. Indexes inherited from the
// parent's inverted indexes are stored in `paths_set_info.subcolumn_indexes` under the
// path. Returns InternalError when a nested path has no recorded data type.
Status VariantCompactionUtil::get_compaction_nested_columns(
        const std::unordered_set<PathInData, PathInData::Hash>& nested_paths,
        const PathToDataTypes& path_to_data_types, const TabletColumnPtr parent_column,
        TabletSchemaSPtr& output_schema, TabletSchema::PathsSetInfo& paths_set_info) {
    const auto& parent_indexes = output_schema->inverted_indexs(parent_column->unique_id());
    for (const auto& nested_path : nested_paths) {
        const auto types_it = path_to_data_types.find(nested_path);
        const bool has_types = types_it != path_to_data_types.end() && !types_it->second.empty();
        if (!has_types) {
            return Status::InternalError("Nested path {} has no data type",
                                         nested_path.get_path());
        }
        DataTypePtr common_type;
        get_least_supertype_jsonb(types_it->second, &common_type);

        // Column name is "<parent lower-case name>.<path>"; the path info is built from
        // the parent name followed by the parts of the nested path.
        const std::string& column_name =
                parent_column->name_lower_case() + "." + nested_path.get_path();
        PathInDataBuilder path_builder;
        auto full_path = path_builder.append(parent_column->name_lower_case(), false)
                                 .append(nested_path.get_parts(), false)
                                 .build();
        TabletColumn nested_column =
                get_column_by_type(common_type, column_name,
                                   ExtraInfo {.unique_id = -1,
                                              .parent_unique_id = parent_column->unique_id(),
                                              .path_info = full_path});
        inherit_column_attributes(*parent_column, nested_column);

        TabletIndexes nested_indexes;
        inherit_index(parent_indexes, nested_indexes, nested_column);
        paths_set_info.subcolumn_indexes.emplace(nested_path.get_path(),
                                                 std::move(nested_indexes));
        output_schema->append_column(nested_column);
        VLOG_DEBUG << "append nested column " << nested_path.get_path();
    }
    return Status::OK();
}
1158
1159
void VariantCompactionUtil::get_compaction_subcolumns_from_subpaths(
1160
        TabletSchema::PathsSetInfo& paths_set_info, const TabletColumnPtr parent_column,
1161
        const TabletSchemaSPtr& target, const PathToDataTypes& path_to_data_types,
1162
269
        const std::unordered_set<std::string>& sparse_paths, TabletSchemaSPtr& output_schema) {
1163
269
    auto& path_set = paths_set_info.sub_path_set;
1164
269
    std::vector<StringRef> sorted_subpaths(path_set.begin(), path_set.end());
1165
269
    std::sort(sorted_subpaths.begin(), sorted_subpaths.end());
1166
269
    const auto& parent_indexes = target->inverted_indexs(parent_column->unique_id());
1167
    // append subcolumns
1168
519
    for (const auto& subpath : sorted_subpaths) {
1169
519
        const std::string subpath_str = subpath.to_string();
1170
519
        if (paths_set_info.typed_path_set.contains(subpath_str)) {
1171
0
            paths_set_info.sub_path_set.erase(subpath);
1172
0
            continue;
1173
0
        }
1174
519
        auto column_name = parent_column->name_lower_case() + "." + subpath_str;
1175
519
        auto column_path = PathInData(column_name);
1176
1177
519
        const auto& find_data_types = path_to_data_types.find(PathInData(subpath));
1178
1179
        // some cases: the subcolumn type is variant
1180
        // 1. this path has no data type in segments
1181
        // 2. this path is in sparse paths
1182
        // 3. the sparse paths are too much
1183
519
        TabletSchema::SubColumnInfo sub_column_info;
1184
519
        if (parent_column->variant_enable_typed_paths_to_sparse() &&
1185
519
            generate_sub_column_info(*target, parent_column->unique_id(), subpath_str,
1186
69
                                     &sub_column_info)) {
1187
59
            inherit_column_attributes(*parent_column, sub_column_info.column);
1188
59
            output_schema->append_column(sub_column_info.column);
1189
59
            paths_set_info.subcolumn_indexes.emplace(subpath_str,
1190
59
                                                     std::move(sub_column_info.indexes));
1191
59
            VLOG_DEBUG << "append typed column " << subpath;
1192
460
        } else if (find_data_types == path_to_data_types.end() || find_data_types->second.empty() ||
1193
460
                   sparse_paths.find(subpath_str) != sparse_paths.end() ||
1194
460
                   sparse_paths.size() >=
1195
420
                           parent_column->variant_max_sparse_column_statistics_size()) {
1196
44
            TabletColumn subcolumn;
1197
44
            subcolumn.set_name(column_name);
1198
44
            subcolumn.set_type(FieldType::OLAP_FIELD_TYPE_VARIANT);
1199
44
            subcolumn.set_parent_unique_id(parent_column->unique_id());
1200
44
            subcolumn.set_path_info(column_path);
1201
44
            subcolumn.set_aggregation_method(parent_column->aggregation());
1202
44
            subcolumn.set_variant_max_subcolumns_count(
1203
44
                    parent_column->variant_max_subcolumns_count());
1204
44
            subcolumn.set_variant_enable_doc_mode(parent_column->variant_enable_doc_mode());
1205
44
            subcolumn.set_is_nullable(true);
1206
44
            output_schema->append_column(subcolumn);
1207
44
            VLOG_DEBUG << "append sub column " << subpath << " data type "
1208
0
                       << "VARIANT";
1209
44
        }
1210
        // normal case: the subcolumn type can be calculated from the data types in segments
1211
416
        else {
1212
416
            DataTypePtr data_type;
1213
416
            get_least_supertype_jsonb(find_data_types->second, &data_type);
1214
416
            TabletColumn sub_column =
1215
416
                    get_column_by_type(data_type, column_name,
1216
416
                                       ExtraInfo {.unique_id = -1,
1217
416
                                                  .parent_unique_id = parent_column->unique_id(),
1218
416
                                                  .path_info = column_path});
1219
416
            inherit_column_attributes(*parent_column, sub_column);
1220
416
            TabletIndexes sub_column_indexes;
1221
416
            inherit_index(parent_indexes, sub_column_indexes, sub_column);
1222
416
            paths_set_info.subcolumn_indexes.emplace(subpath_str, std::move(sub_column_indexes));
1223
416
            output_schema->append_column(sub_column);
1224
416
            VLOG_DEBUG << "append sub column " << subpath << " data type " << data_type->get_name();
1225
416
        }
1226
519
    }
1227
269
}
1228
1229
void VariantCompactionUtil::get_compaction_subcolumns_from_data_types(
1230
        TabletSchema::PathsSetInfo& paths_set_info, const TabletColumnPtr parent_column,
1231
        const TabletSchemaSPtr& target, const PathToDataTypes& path_to_data_types,
1232
20
        TabletSchemaSPtr& output_schema) {
1233
20
    const auto& parent_indexes = target->inverted_indexs(parent_column->unique_id());
1234
155
    for (const auto& [path, data_types] : path_to_data_types) {
1235
        // Typed paths are materialized by get_compaction_typed_columns(); this helper only
1236
        // materializes regular subcolumns inferred from rowset data types.
1237
155
        if (data_types.empty() || path.empty() || path.get_is_typed() || path.has_nested_part()) {
1238
15
            continue;
1239
15
        }
1240
140
        DataTypePtr data_type;
1241
140
        get_least_supertype_jsonb(data_types, &data_type);
1242
140
        auto column_name = parent_column->name_lower_case() + "." + path.get_path();
1243
140
        auto column_path = PathInData(column_name);
1244
140
        TabletColumn sub_column =
1245
140
                get_column_by_type(data_type, column_name,
1246
140
                                   ExtraInfo {.unique_id = -1,
1247
140
                                              .parent_unique_id = parent_column->unique_id(),
1248
140
                                              .path_info = column_path});
1249
140
        inherit_column_attributes(*parent_column, sub_column);
1250
140
        TabletIndexes sub_column_indexes;
1251
140
        inherit_index(parent_indexes, sub_column_indexes, sub_column);
1252
140
        paths_set_info.sub_path_set.emplace(path.get_path());
1253
140
        paths_set_info.subcolumn_indexes.emplace(path.get_path(), std::move(sub_column_indexes));
1254
140
        output_schema->append_column(sub_column);
1255
140
        VLOG_DEBUG << "append sub column " << path.get_path() << " data type "
1256
0
                   << data_type->get_name();
1257
140
    }
1258
20
}
1259
1260
// Build the temporary schema for compaction.
1261
// NestedGroup roots are special: the root VARIANT column owns the NG tree and the streaming NG
1262
// writer handles NG children, while regular non-NG paths beside the arrays are materialized as
1263
// ordinary extracted subcolumns. NG typed paths still use get_compaction_typed_columns(), keeping
1264
// typed-column rules out of the NG-specific regular-path filtering.
1265
Status VariantCompactionUtil::get_extended_compaction_schema(
1266
10.1k
        const std::vector<RowsetSharedPtr>& rowsets, TabletSchemaSPtr& target) {
1267
10.1k
    std::unordered_map<int32_t, VariantExtendedInfo> uid_to_variant_extended_info;
1268
10.1k
    const bool needs_variant_extended_info =
1269
100k
            std::ranges::any_of(target->columns(), [](const TabletColumnPtr& column) {
1270
100k
                return column->is_variant_type() && (should_check_variant_path_stats(*column) ||
1271
554
                                                     column->variant_enable_nested_group());
1272
100k
            });
1273
10.1k
    if (needs_variant_extended_info) {
1274
        // collect path stats from all rowsets and segments
1275
4.85k
        for (const auto& rs : rowsets) {
1276
4.85k
            RETURN_IF_ERROR(aggregate_variant_extended_info(rs, &uid_to_variant_extended_info));
1277
4.85k
        }
1278
550
    }
1279
1280
    // build the output schema
1281
10.1k
    TabletSchemaSPtr output_schema = std::make_shared<TabletSchema>();
1282
10.1k
    output_schema->shawdow_copy_without_columns(*target);
1283
10.1k
    std::unordered_map<int32_t, TabletSchema::PathsSetInfo> uid_to_paths_set_info;
1284
10.1k
    const auto ng_root_uids =
1285
10.1k
            collect_nested_group_compaction_root_uids(target, uid_to_variant_extended_info);
1286
102k
    for (const TabletColumnPtr& column : target->columns()) {
1287
102k
        if (!column->is_extracted_column()) {
1288
102k
            output_schema->append_column(*column);
1289
102k
        }
1290
102k
        if (!column->is_variant_type()) {
1291
102k
            continue;
1292
102k
        }
1293
18.4E
        VLOG_DEBUG << "column " << column->name() << " unique id " << column->unique_id();
1294
1295
526
        const auto info_it = uid_to_variant_extended_info.find(column->unique_id());
1296
526
        const VariantExtendedInfo empty_extended_info;
1297
526
        const VariantExtendedInfo& extended_info = info_it == uid_to_variant_extended_info.end()
1298
526
                                                           ? empty_extended_info
1299
526
                                                           : info_it->second;
1300
526
        auto& paths_set_info = uid_to_paths_set_info[column->unique_id()];
1301
526
        const bool use_nested_group_compaction_schema = ng_root_uids.contains(column->unique_id());
1302
1303
526
        if (use_nested_group_compaction_schema) {
1304
            // 1. append typed columns. Keep this shared with the non-NG typed helper; only the
1305
            // regular-path selection below is NG-specific.
1306
1
            RETURN_IF_ERROR(get_compaction_typed_columns(target, extended_info.typed_paths, column,
1307
1
                                                         output_schema, paths_set_info));
1308
1309
            // NG roots do not record path-count stats for ordinary Variant paths, so their regular
1310
            // non-NG subcolumns use the same data-types materialization helper as the
1311
            // all-materialized non-NG branch below.
1312
1
            auto regular_path_to_data_types =
1313
1
                    collect_regular_types_outside_nested_group(extended_info);
1314
1
            get_compaction_subcolumns_from_data_types(paths_set_info, column, target,
1315
1
                                                      regular_path_to_data_types, output_schema);
1316
1
            LOG(INFO) << "Variant column uid=" << column->unique_id()
1317
1
                      << " keeps nested-group root and materializes regular non-NG subcolumns in "
1318
1
                         "compaction schema";
1319
1
            continue;
1320
1
        }
1321
1322
525
        if (column->variant_enable_doc_mode()) {
1323
392
            const int bucket_num = std::max(1, column->variant_doc_hash_shard_count());
1324
1.17k
            for (int b = 0; b < bucket_num; ++b) {
1325
779
                TabletColumn doc_value_bucket_column = create_doc_value_column(*column, b);
1326
779
                doc_value_bucket_column.set_type(FieldType::OLAP_FIELD_TYPE_VARIANT);
1327
779
                doc_value_bucket_column.set_is_nullable(false);
1328
779
                doc_value_bucket_column.set_variant_enable_doc_mode(true);
1329
779
                output_schema->append_column(doc_value_bucket_column);
1330
779
            }
1331
392
            continue;
1332
392
        }
1333
1334
        // 1. append typed columns
1335
133
        RETURN_IF_ERROR(get_compaction_typed_columns(target, extended_info.typed_paths, column,
1336
133
                                                     output_schema, paths_set_info));
1337
1338
        // 2. append nested columns
1339
133
        RETURN_IF_ERROR(get_compaction_nested_columns(extended_info.nested_paths,
1340
133
                                                      extended_info.path_to_data_types, column,
1341
133
                                                      output_schema, paths_set_info));
1342
1343
        // 3. get the subpaths
1344
133
        get_subpaths(column->variant_max_subcolumns_count(), extended_info.path_to_none_null_values,
1345
133
                     paths_set_info);
1346
1347
        // 4. append subcolumns
1348
260
        if (column->variant_max_subcolumns_count() > 0 || !column->get_sub_columns().empty()) {
1349
260
            get_compaction_subcolumns_from_subpaths(paths_set_info, column, target,
1350
260
                                                    extended_info.path_to_data_types,
1351
260
                                                    extended_info.sparse_paths, output_schema);
1352
260
        }
1353
        // variant_max_subcolumns_count == 0 and no typed paths materialized
1354
        // it means that all subcolumns are materialized, may be from old data
1355
18.4E
        else {
1356
18.4E
            get_compaction_subcolumns_from_data_types(paths_set_info, column, target,
1357
18.4E
                                                      extended_info.path_to_data_types,
1358
18.4E
                                                      output_schema);
1359
18.4E
        }
1360
1361
        // append sparse column(s)
1362
        // If variant uses bucketized sparse columns, append one sparse bucket column per bucket.
1363
        // Otherwise, append the single sparse column.
1364
133
        int bucket_num = std::max(1, column->variant_sparse_hash_shard_count());
1365
159
        if (bucket_num > 1) {
1366
751
            for (int b = 0; b < bucket_num; ++b) {
1367
592
                TabletColumn sparse_bucket_column = create_sparse_shard_column(*column, b);
1368
592
                output_schema->append_column(sparse_bucket_column);
1369
592
            }
1370
18.4E
        } else {
1371
18.4E
            TabletColumn sparse_column = create_sparse_column(*column);
1372
18.4E
            output_schema->append_column(sparse_column);
1373
18.4E
        }
1374
133
    }
1375
1376
10.1k
    target = output_schema;
1377
    // used to merge & filter path to sparse column during reading in compaction
1378
10.1k
    target->set_path_set_info(std::move(uid_to_paths_set_info));
1379
18.4E
    VLOG_DEBUG << "dump schema " << target->dump_full_schema();
1380
10.1k
    return Status::OK();
1381
10.1k
}
1382
1383
// Calculate statistics about variant data paths from the encoded sparse column
1384
void VariantCompactionUtil::calculate_variant_stats(const IColumn& encoded_sparse_column,
1385
                                                    segment_v2::VariantStatisticsPB* stats,
1386
                                                    size_t max_sparse_column_statistics_size,
1387
1.36k
                                                    size_t row_pos, size_t num_rows) {
1388
    // Cast input column to ColumnMap type since sparse column is stored as a map
1389
1.36k
    const auto& map_column = assert_cast<const ColumnMap&>(encoded_sparse_column);
1390
1391
    // Get the keys column which contains the paths as strings
1392
1.36k
    const auto& sparse_data_paths =
1393
1.36k
            assert_cast<const ColumnString*>(map_column.get_keys_ptr().get());
1394
1.36k
    const auto& serialized_sparse_column_offsets = map_column.get_offsets();
1395
1.36k
    auto& count_map = *stats->mutable_sparse_column_non_null_size();
1396
    // Iterate through all paths in the sparse column
1397
893k
    for (size_t i = row_pos; i != row_pos + num_rows; ++i) {
1398
892k
        size_t offset = serialized_sparse_column_offsets[i - 1];
1399
892k
        size_t end = serialized_sparse_column_offsets[i];
1400
3.00M
        for (size_t j = offset; j != end; ++j) {
1401
2.11M
            auto path = sparse_data_paths->get_data_at(j);
1402
1403
2.11M
            const auto& sparse_path = path.to_string();
1404
            // If path already exists in statistics, increment its count
1405
2.11M
            if (auto it = count_map.find(sparse_path); it != count_map.end()) {
1406
2.11M
                ++it->second;
1407
2.11M
            }
1408
            // If path doesn't exist and we haven't hit the max statistics size limit,
1409
            // add it with count 1
1410
18.4E
            else if (count_map.size() < max_sparse_column_statistics_size) {
1411
1.53k
                count_map.emplace(sparse_path, 1);
1412
1.53k
            }
1413
2.11M
        }
1414
892k
    }
1415
1416
1.36k
    if (stats->sparse_column_non_null_size().size() > max_sparse_column_statistics_size) {
1417
0
        throw doris::Exception(
1418
0
                ErrorCode::INTERNAL_ERROR,
1419
0
                "Sparse column non null size: {} is greater than max statistics size: {}",
1420
0
                stats->sparse_column_non_null_size().size(), max_sparse_column_statistics_size);
1421
0
    }
1422
1.36k
}
1423
1424
/// Calculates number of dimensions in array field.
1425
/// Returns 0 for scalar fields.
1426
class FieldVisitorToNumberOfDimensions : public StaticVisitor<size_t> {
1427
public:
1428
    FieldVisitorToNumberOfDimensions() = default;
1429
    template <PrimitiveType T>
1430
22.3M
    size_t apply(const typename PrimitiveTypeTraits<T>::CppType& x) {
1431
22.3M
        if constexpr (T == TYPE_ARRAY) {
1432
1.36M
            const size_t size = x.size();
1433
1.36M
            size_t dimensions = 0;
1434
3.85M
            for (size_t i = 0; i < size; ++i) {
1435
2.48M
                size_t element_dimensions = apply_visitor(*this, x[i]);
1436
2.48M
                dimensions = std::max(dimensions, element_dimensions);
1437
2.48M
            }
1438
1.36M
            return 1 + dimensions;
1439
20.9M
        } else {
1440
20.9M
            return 0;
1441
20.9M
        }
1442
22.3M
    }
_ZN5doris12variant_util32FieldVisitorToNumberOfDimensions5applyILNS_13PrimitiveTypeE1EEEmRKNS_19PrimitiveTypeTraitsIXT_EE7CppTypeE
Line
Count
Source
1430
121k
    size_t apply(const typename PrimitiveTypeTraits<T>::CppType& x) {
1431
        if constexpr (T == TYPE_ARRAY) {
1432
            const size_t size = x.size();
1433
            size_t dimensions = 0;
1434
            for (size_t i = 0; i < size; ++i) {
1435
                size_t element_dimensions = apply_visitor(*this, x[i]);
1436
                dimensions = std::max(dimensions, element_dimensions);
1437
            }
1438
            return 1 + dimensions;
1439
121k
        } else {
1440
121k
            return 0;
1441
121k
        }
1442
121k
    }
_ZN5doris12variant_util32FieldVisitorToNumberOfDimensions5applyILNS_13PrimitiveTypeE26EEEmRKNS_19PrimitiveTypeTraitsIXT_EE7CppTypeE
Line
Count
Source
1430
479
    size_t apply(const typename PrimitiveTypeTraits<T>::CppType& x) {
1431
        if constexpr (T == TYPE_ARRAY) {
1432
            const size_t size = x.size();
1433
            size_t dimensions = 0;
1434
            for (size_t i = 0; i < size; ++i) {
1435
                size_t element_dimensions = apply_visitor(*this, x[i]);
1436
                dimensions = std::max(dimensions, element_dimensions);
1437
            }
1438
            return 1 + dimensions;
1439
479
        } else {
1440
479
            return 0;
1441
479
        }
1442
479
    }
Unexecuted instantiation: _ZN5doris12variant_util32FieldVisitorToNumberOfDimensions5applyILNS_13PrimitiveTypeE42EEEmRKNS_19PrimitiveTypeTraitsIXT_EE7CppTypeE
_ZN5doris12variant_util32FieldVisitorToNumberOfDimensions5applyILNS_13PrimitiveTypeE7EEEmRKNS_19PrimitiveTypeTraitsIXT_EE7CppTypeE
Line
Count
Source
1430
41.9k
    size_t apply(const typename PrimitiveTypeTraits<T>::CppType& x) {
1431
        if constexpr (T == TYPE_ARRAY) {
1432
            const size_t size = x.size();
1433
            size_t dimensions = 0;
1434
            for (size_t i = 0; i < size; ++i) {
1435
                size_t element_dimensions = apply_visitor(*this, x[i]);
1436
                dimensions = std::max(dimensions, element_dimensions);
1437
            }
1438
            return 1 + dimensions;
1439
41.9k
        } else {
1440
41.9k
            return 0;
1441
41.9k
        }
1442
41.9k
    }
Unexecuted instantiation: _ZN5doris12variant_util32FieldVisitorToNumberOfDimensions5applyILNS_13PrimitiveTypeE12EEEmRKNS_19PrimitiveTypeTraitsIXT_EE7CppTypeE
Unexecuted instantiation: _ZN5doris12variant_util32FieldVisitorToNumberOfDimensions5applyILNS_13PrimitiveTypeE11EEEmRKNS_19PrimitiveTypeTraitsIXT_EE7CppTypeE
_ZN5doris12variant_util32FieldVisitorToNumberOfDimensions5applyILNS_13PrimitiveTypeE25EEEmRKNS_19PrimitiveTypeTraitsIXT_EE7CppTypeE
Line
Count
Source
1430
395
    size_t apply(const typename PrimitiveTypeTraits<T>::CppType& x) {
1431
        if constexpr (T == TYPE_ARRAY) {
1432
            const size_t size = x.size();
1433
            size_t dimensions = 0;
1434
            for (size_t i = 0; i < size; ++i) {
1435
                size_t element_dimensions = apply_visitor(*this, x[i]);
1436
                dimensions = std::max(dimensions, element_dimensions);
1437
            }
1438
            return 1 + dimensions;
1439
395
        } else {
1440
395
            return 0;
1441
395
        }
1442
395
    }
_ZN5doris12variant_util32FieldVisitorToNumberOfDimensions5applyILNS_13PrimitiveTypeE2EEEmRKNS_19PrimitiveTypeTraitsIXT_EE7CppTypeE
Line
Count
Source
1430
332k
    size_t apply(const typename PrimitiveTypeTraits<T>::CppType& x) {
1431
        if constexpr (T == TYPE_ARRAY) {
1432
            const size_t size = x.size();
1433
            size_t dimensions = 0;
1434
            for (size_t i = 0; i < size; ++i) {
1435
                size_t element_dimensions = apply_visitor(*this, x[i]);
1436
                dimensions = std::max(dimensions, element_dimensions);
1437
            }
1438
            return 1 + dimensions;
1439
332k
        } else {
1440
332k
            return 0;
1441
332k
        }
1442
332k
    }
_ZN5doris12variant_util32FieldVisitorToNumberOfDimensions5applyILNS_13PrimitiveTypeE3EEEmRKNS_19PrimitiveTypeTraitsIXT_EE7CppTypeE
Line
Count
Source
1430
1.03k
    size_t apply(const typename PrimitiveTypeTraits<T>::CppType& x) {
1431
        if constexpr (T == TYPE_ARRAY) {
1432
            const size_t size = x.size();
1433
            size_t dimensions = 0;
1434
            for (size_t i = 0; i < size; ++i) {
1435
                size_t element_dimensions = apply_visitor(*this, x[i]);
1436
                dimensions = std::max(dimensions, element_dimensions);
1437
            }
1438
            return 1 + dimensions;
1439
1.03k
        } else {
1440
1.03k
            return 0;
1441
1.03k
        }
1442
1.03k
    }
_ZN5doris12variant_util32FieldVisitorToNumberOfDimensions5applyILNS_13PrimitiveTypeE4EEEmRKNS_19PrimitiveTypeTraitsIXT_EE7CppTypeE
Line
Count
Source
1430
1.02k
    size_t apply(const typename PrimitiveTypeTraits<T>::CppType& x) {
1431
        if constexpr (T == TYPE_ARRAY) {
1432
            const size_t size = x.size();
1433
            size_t dimensions = 0;
1434
            for (size_t i = 0; i < size; ++i) {
1435
                size_t element_dimensions = apply_visitor(*this, x[i]);
1436
                dimensions = std::max(dimensions, element_dimensions);
1437
            }
1438
            return 1 + dimensions;
1439
1.02k
        } else {
1440
1.02k
            return 0;
1441
1.02k
        }
1442
1.02k
    }
_ZN5doris12variant_util32FieldVisitorToNumberOfDimensions5applyILNS_13PrimitiveTypeE5EEEmRKNS_19PrimitiveTypeTraitsIXT_EE7CppTypeE
Line
Count
Source
1430
1.97k
    size_t apply(const typename PrimitiveTypeTraits<T>::CppType& x) {
1431
        if constexpr (T == TYPE_ARRAY) {
1432
            const size_t size = x.size();
1433
            size_t dimensions = 0;
1434
            for (size_t i = 0; i < size; ++i) {
1435
                size_t element_dimensions = apply_visitor(*this, x[i]);
1436
                dimensions = std::max(dimensions, element_dimensions);
1437
            }
1438
            return 1 + dimensions;
1439
1.97k
        } else {
1440
1.97k
            return 0;
1441
1.97k
        }
1442
1.97k
    }
_ZN5doris12variant_util32FieldVisitorToNumberOfDimensions5applyILNS_13PrimitiveTypeE6EEEmRKNS_19PrimitiveTypeTraitsIXT_EE7CppTypeE
Line
Count
Source
1430
5.65M
    size_t apply(const typename PrimitiveTypeTraits<T>::CppType& x) {
1431
        if constexpr (T == TYPE_ARRAY) {
1432
            const size_t size = x.size();
1433
            size_t dimensions = 0;
1434
            for (size_t i = 0; i < size; ++i) {
1435
                size_t element_dimensions = apply_visitor(*this, x[i]);
1436
                dimensions = std::max(dimensions, element_dimensions);
1437
            }
1438
            return 1 + dimensions;
1439
5.65M
        } else {
1440
5.65M
            return 0;
1441
5.65M
        }
1442
5.65M
    }
Unexecuted instantiation: _ZN5doris12variant_util32FieldVisitorToNumberOfDimensions5applyILNS_13PrimitiveTypeE38EEEmRKNS_19PrimitiveTypeTraitsIXT_EE7CppTypeE
Unexecuted instantiation: _ZN5doris12variant_util32FieldVisitorToNumberOfDimensions5applyILNS_13PrimitiveTypeE39EEEmRKNS_19PrimitiveTypeTraitsIXT_EE7CppTypeE
_ZN5doris12variant_util32FieldVisitorToNumberOfDimensions5applyILNS_13PrimitiveTypeE8EEEmRKNS_19PrimitiveTypeTraitsIXT_EE7CppTypeE
Line
Count
Source
1430
858
    size_t apply(const typename PrimitiveTypeTraits<T>::CppType& x) {
1431
        if constexpr (T == TYPE_ARRAY) {
1432
            const size_t size = x.size();
1433
            size_t dimensions = 0;
1434
            for (size_t i = 0; i < size; ++i) {
1435
                size_t element_dimensions = apply_visitor(*this, x[i]);
1436
                dimensions = std::max(dimensions, element_dimensions);
1437
            }
1438
            return 1 + dimensions;
1439
858
        } else {
1440
858
            return 0;
1441
858
        }
1442
858
    }
Unexecuted instantiation: _ZN5doris12variant_util32FieldVisitorToNumberOfDimensions5applyILNS_13PrimitiveTypeE27EEEmRKNS_19PrimitiveTypeTraitsIXT_EE7CppTypeE
_ZN5doris12variant_util32FieldVisitorToNumberOfDimensions5applyILNS_13PrimitiveTypeE9EEEmRKNS_19PrimitiveTypeTraitsIXT_EE7CppTypeE
Line
Count
Source
1430
2.96M
    size_t apply(const typename PrimitiveTypeTraits<T>::CppType& x) {
1431
        if constexpr (T == TYPE_ARRAY) {
1432
            const size_t size = x.size();
1433
            size_t dimensions = 0;
1434
            for (size_t i = 0; i < size; ++i) {
1435
                size_t element_dimensions = apply_visitor(*this, x[i]);
1436
                dimensions = std::max(dimensions, element_dimensions);
1437
            }
1438
            return 1 + dimensions;
1439
2.96M
        } else {
1440
2.96M
            return 0;
1441
2.96M
        }
1442
2.96M
    }
_ZN5doris12variant_util32FieldVisitorToNumberOfDimensions5applyILNS_13PrimitiveTypeE36EEEmRKNS_19PrimitiveTypeTraitsIXT_EE7CppTypeE
Line
Count
Source
1430
305
    size_t apply(const typename PrimitiveTypeTraits<T>::CppType& x) {
1431
        if constexpr (T == TYPE_ARRAY) {
1432
            const size_t size = x.size();
1433
            size_t dimensions = 0;
1434
            for (size_t i = 0; i < size; ++i) {
1435
                size_t element_dimensions = apply_visitor(*this, x[i]);
1436
                dimensions = std::max(dimensions, element_dimensions);
1437
            }
1438
            return 1 + dimensions;
1439
305
        } else {
1440
305
            return 0;
1441
305
        }
1442
305
    }
_ZN5doris12variant_util32FieldVisitorToNumberOfDimensions5applyILNS_13PrimitiveTypeE37EEEmRKNS_19PrimitiveTypeTraitsIXT_EE7CppTypeE
Line
Count
Source
1430
274
    size_t apply(const typename PrimitiveTypeTraits<T>::CppType& x) {
1431
        if constexpr (T == TYPE_ARRAY) {
1432
            const size_t size = x.size();
1433
            size_t dimensions = 0;
1434
            for (size_t i = 0; i < size; ++i) {
1435
                size_t element_dimensions = apply_visitor(*this, x[i]);
1436
                dimensions = std::max(dimensions, element_dimensions);
1437
            }
1438
            return 1 + dimensions;
1439
274
        } else {
1440
274
            return 0;
1441
274
        }
1442
274
    }
_ZN5doris12variant_util32FieldVisitorToNumberOfDimensions5applyILNS_13PrimitiveTypeE23EEEmRKNS_19PrimitiveTypeTraitsIXT_EE7CppTypeE
Line
Count
Source
1430
11.7M
    size_t apply(const typename PrimitiveTypeTraits<T>::CppType& x) {
1431
        if constexpr (T == TYPE_ARRAY) {
1432
            const size_t size = x.size();
1433
            size_t dimensions = 0;
1434
            for (size_t i = 0; i < size; ++i) {
1435
                size_t element_dimensions = apply_visitor(*this, x[i]);
1436
                dimensions = std::max(dimensions, element_dimensions);
1437
            }
1438
            return 1 + dimensions;
1439
11.7M
        } else {
1440
11.7M
            return 0;
1441
11.7M
        }
1442
11.7M
    }
Unexecuted instantiation: _ZN5doris12variant_util32FieldVisitorToNumberOfDimensions5applyILNS_13PrimitiveTypeE15EEEmRKNS_19PrimitiveTypeTraitsIXT_EE7CppTypeE
Unexecuted instantiation: _ZN5doris12variant_util32FieldVisitorToNumberOfDimensions5applyILNS_13PrimitiveTypeE10EEEmRKNS_19PrimitiveTypeTraitsIXT_EE7CppTypeE
Unexecuted instantiation: _ZN5doris12variant_util32FieldVisitorToNumberOfDimensions5applyILNS_13PrimitiveTypeE41EEEmRKNS_19PrimitiveTypeTraitsIXT_EE7CppTypeE
_ZN5doris12variant_util32FieldVisitorToNumberOfDimensions5applyILNS_13PrimitiveTypeE17EEEmRKNS_19PrimitiveTypeTraitsIXT_EE7CppTypeE
Line
Count
Source
1430
1.36M
    size_t apply(const typename PrimitiveTypeTraits<T>::CppType& x) {
1431
1.36M
        if constexpr (T == TYPE_ARRAY) {
1432
1.36M
            const size_t size = x.size();
1433
1.36M
            size_t dimensions = 0;
1434
3.85M
            for (size_t i = 0; i < size; ++i) {
1435
2.48M
                size_t element_dimensions = apply_visitor(*this, x[i]);
1436
2.48M
                dimensions = std::max(dimensions, element_dimensions);
1437
2.48M
            }
1438
1.36M
            return 1 + dimensions;
1439
        } else {
1440
            return 0;
1441
        }
1442
1.36M
    }
_ZN5doris12variant_util32FieldVisitorToNumberOfDimensions5applyILNS_13PrimitiveTypeE16EEEmRKNS_19PrimitiveTypeTraitsIXT_EE7CppTypeE
Line
Count
Source
1430
1
    size_t apply(const typename PrimitiveTypeTraits<T>::CppType& x) {
1431
        if constexpr (T == TYPE_ARRAY) {
1432
            const size_t size = x.size();
1433
            size_t dimensions = 0;
1434
            for (size_t i = 0; i < size; ++i) {
1435
                size_t element_dimensions = apply_visitor(*this, x[i]);
1436
                dimensions = std::max(dimensions, element_dimensions);
1437
            }
1438
            return 1 + dimensions;
1439
1
        } else {
1440
1
            return 0;
1441
1
        }
1442
1
    }
Unexecuted instantiation: _ZN5doris12variant_util32FieldVisitorToNumberOfDimensions5applyILNS_13PrimitiveTypeE18EEEmRKNS_19PrimitiveTypeTraitsIXT_EE7CppTypeE
_ZN5doris12variant_util32FieldVisitorToNumberOfDimensions5applyILNS_13PrimitiveTypeE32EEEmRKNS_19PrimitiveTypeTraitsIXT_EE7CppTypeE
Line
Count
Source
1430
1
    size_t apply(const typename PrimitiveTypeTraits<T>::CppType& x) {
1431
        if constexpr (T == TYPE_ARRAY) {
1432
            const size_t size = x.size();
1433
            size_t dimensions = 0;
1434
            for (size_t i = 0; i < size; ++i) {
1435
                size_t element_dimensions = apply_visitor(*this, x[i]);
1436
                dimensions = std::max(dimensions, element_dimensions);
1437
            }
1438
            return 1 + dimensions;
1439
1
        } else {
1440
1
            return 0;
1441
1
        }
1442
1
    }
_ZN5doris12variant_util32FieldVisitorToNumberOfDimensions5applyILNS_13PrimitiveTypeE28EEEmRKNS_19PrimitiveTypeTraitsIXT_EE7CppTypeE
Line
Count
Source
1430
755
    size_t apply(const typename PrimitiveTypeTraits<T>::CppType& x) {
1431
        if constexpr (T == TYPE_ARRAY) {
1432
            const size_t size = x.size();
1433
            size_t dimensions = 0;
1434
            for (size_t i = 0; i < size; ++i) {
1435
                size_t element_dimensions = apply_visitor(*this, x[i]);
1436
                dimensions = std::max(dimensions, element_dimensions);
1437
            }
1438
            return 1 + dimensions;
1439
755
        } else {
1440
755
            return 0;
1441
755
        }
1442
755
    }
_ZN5doris12variant_util32FieldVisitorToNumberOfDimensions5applyILNS_13PrimitiveTypeE29EEEmRKNS_19PrimitiveTypeTraitsIXT_EE7CppTypeE
Line
Count
Source
1430
695
    size_t apply(const typename PrimitiveTypeTraits<T>::CppType& x) {
1431
        if constexpr (T == TYPE_ARRAY) {
1432
            const size_t size = x.size();
1433
            size_t dimensions = 0;
1434
            for (size_t i = 0; i < size; ++i) {
1435
                size_t element_dimensions = apply_visitor(*this, x[i]);
1436
                dimensions = std::max(dimensions, element_dimensions);
1437
            }
1438
            return 1 + dimensions;
1439
695
        } else {
1440
695
            return 0;
1441
695
        }
1442
695
    }
Unexecuted instantiation: _ZN5doris12variant_util32FieldVisitorToNumberOfDimensions5applyILNS_13PrimitiveTypeE20EEEmRKNS_19PrimitiveTypeTraitsIXT_EE7CppTypeE
_ZN5doris12variant_util32FieldVisitorToNumberOfDimensions5applyILNS_13PrimitiveTypeE30EEEmRKNS_19PrimitiveTypeTraitsIXT_EE7CppTypeE
Line
Count
Source
1430
70.7k
    size_t apply(const typename PrimitiveTypeTraits<T>::CppType& x) {
1431
        if constexpr (T == TYPE_ARRAY) {
1432
            const size_t size = x.size();
1433
            size_t dimensions = 0;
1434
            for (size_t i = 0; i < size; ++i) {
1435
                size_t element_dimensions = apply_visitor(*this, x[i]);
1436
                dimensions = std::max(dimensions, element_dimensions);
1437
            }
1438
            return 1 + dimensions;
1439
70.7k
        } else {
1440
70.7k
            return 0;
1441
70.7k
        }
1442
70.7k
    }
_ZN5doris12variant_util32FieldVisitorToNumberOfDimensions5applyILNS_13PrimitiveTypeE35EEEmRKNS_19PrimitiveTypeTraitsIXT_EE7CppTypeE
Line
Count
Source
1430
557
    size_t apply(const typename PrimitiveTypeTraits<T>::CppType& x) {
1431
        if constexpr (T == TYPE_ARRAY) {
1432
            const size_t size = x.size();
1433
            size_t dimensions = 0;
1434
            for (size_t i = 0; i < size; ++i) {
1435
                size_t element_dimensions = apply_visitor(*this, x[i]);
1436
                dimensions = std::max(dimensions, element_dimensions);
1437
            }
1438
            return 1 + dimensions;
1439
557
        } else {
1440
557
            return 0;
1441
557
        }
1442
557
    }
Unexecuted instantiation: _ZN5doris12variant_util32FieldVisitorToNumberOfDimensions5applyILNS_13PrimitiveTypeE22EEEmRKNS_19PrimitiveTypeTraitsIXT_EE7CppTypeE
Unexecuted instantiation: _ZN5doris12variant_util32FieldVisitorToNumberOfDimensions5applyILNS_13PrimitiveTypeE19EEEmRKNS_19PrimitiveTypeTraitsIXT_EE7CppTypeE
Unexecuted instantiation: _ZN5doris12variant_util32FieldVisitorToNumberOfDimensions5applyILNS_13PrimitiveTypeE24EEEmRKNS_19PrimitiveTypeTraitsIXT_EE7CppTypeE
_ZN5doris12variant_util32FieldVisitorToNumberOfDimensions5applyILNS_13PrimitiveTypeE31EEEmRKNS_19PrimitiveTypeTraitsIXT_EE7CppTypeE
Line
Count
Source
1430
46.8k
    size_t apply(const typename PrimitiveTypeTraits<T>::CppType& x) {
1431
        if constexpr (T == TYPE_ARRAY) {
1432
            const size_t size = x.size();
1433
            size_t dimensions = 0;
1434
            for (size_t i = 0; i < size; ++i) {
1435
                size_t element_dimensions = apply_visitor(*this, x[i]);
1436
                dimensions = std::max(dimensions, element_dimensions);
1437
            }
1438
            return 1 + dimensions;
1439
46.8k
        } else {
1440
46.8k
            return 0;
1441
46.8k
        }
1442
46.8k
    }
1443
};
1444
1445
// Visitor that allows to get type of scalar field
1446
// but exclude fields contain complex field.This is a faster version
1447
// for FieldVisitorToScalarType which does not support complex field.
1448
class SimpleFieldVisitorToScalarType : public StaticVisitor<size_t> {
1449
public:
1450
    template <PrimitiveType T>
1451
18.7M
    size_t apply(const typename PrimitiveTypeTraits<T>::CppType& x) {
1452
18.7M
        if constexpr (T == TYPE_ARRAY) {
1453
0
            throw doris::Exception(ErrorCode::INVALID_ARGUMENT, "Array type is not supported");
1454
109k
        } else if constexpr (T == TYPE_NULL) {
1455
109k
            have_nulls = true;
1456
109k
            return 1;
1457
18.6M
        } else {
1458
18.6M
            type = T;
1459
18.6M
            return 1;
1460
18.6M
        }
1461
18.7M
    }
_ZN5doris12variant_util30SimpleFieldVisitorToScalarType5applyILNS_13PrimitiveTypeE1EEEmRKNS_19PrimitiveTypeTraitsIXT_EE7CppTypeE
Line
Count
Source
1451
109k
    size_t apply(const typename PrimitiveTypeTraits<T>::CppType& x) {
1452
        if constexpr (T == TYPE_ARRAY) {
1453
            throw doris::Exception(ErrorCode::INVALID_ARGUMENT, "Array type is not supported");
1454
109k
        } else if constexpr (T == TYPE_NULL) {
1455
109k
            have_nulls = true;
1456
109k
            return 1;
1457
        } else {
1458
            type = T;
1459
            return 1;
1460
        }
1461
109k
    }
Unexecuted instantiation: _ZN5doris12variant_util30SimpleFieldVisitorToScalarType5applyILNS_13PrimitiveTypeE26EEEmRKNS_19PrimitiveTypeTraitsIXT_EE7CppTypeE
Unexecuted instantiation: _ZN5doris12variant_util30SimpleFieldVisitorToScalarType5applyILNS_13PrimitiveTypeE42EEEmRKNS_19PrimitiveTypeTraitsIXT_EE7CppTypeE
_ZN5doris12variant_util30SimpleFieldVisitorToScalarType5applyILNS_13PrimitiveTypeE7EEEmRKNS_19PrimitiveTypeTraitsIXT_EE7CppTypeE
Line
Count
Source
1451
12.3k
    size_t apply(const typename PrimitiveTypeTraits<T>::CppType& x) {
1452
        if constexpr (T == TYPE_ARRAY) {
1453
            throw doris::Exception(ErrorCode::INVALID_ARGUMENT, "Array type is not supported");
1454
        } else if constexpr (T == TYPE_NULL) {
1455
            have_nulls = true;
1456
            return 1;
1457
12.3k
        } else {
1458
12.3k
            type = T;
1459
12.3k
            return 1;
1460
12.3k
        }
1461
12.3k
    }
Unexecuted instantiation: _ZN5doris12variant_util30SimpleFieldVisitorToScalarType5applyILNS_13PrimitiveTypeE12EEEmRKNS_19PrimitiveTypeTraitsIXT_EE7CppTypeE
Unexecuted instantiation: _ZN5doris12variant_util30SimpleFieldVisitorToScalarType5applyILNS_13PrimitiveTypeE11EEEmRKNS_19PrimitiveTypeTraitsIXT_EE7CppTypeE
Unexecuted instantiation: _ZN5doris12variant_util30SimpleFieldVisitorToScalarType5applyILNS_13PrimitiveTypeE25EEEmRKNS_19PrimitiveTypeTraitsIXT_EE7CppTypeE
_ZN5doris12variant_util30SimpleFieldVisitorToScalarType5applyILNS_13PrimitiveTypeE2EEEmRKNS_19PrimitiveTypeTraitsIXT_EE7CppTypeE
Line
Count
Source
1451
273k
    size_t apply(const typename PrimitiveTypeTraits<T>::CppType& x) {
1452
        if constexpr (T == TYPE_ARRAY) {
1453
            throw doris::Exception(ErrorCode::INVALID_ARGUMENT, "Array type is not supported");
1454
        } else if constexpr (T == TYPE_NULL) {
1455
            have_nulls = true;
1456
            return 1;
1457
273k
        } else {
1458
273k
            type = T;
1459
273k
            return 1;
1460
273k
        }
1461
273k
    }
_ZN5doris12variant_util30SimpleFieldVisitorToScalarType5applyILNS_13PrimitiveTypeE3EEEmRKNS_19PrimitiveTypeTraitsIXT_EE7CppTypeE
Line
Count
Source
1451
2
    size_t apply(const typename PrimitiveTypeTraits<T>::CppType& x) {
1452
        if constexpr (T == TYPE_ARRAY) {
1453
            throw doris::Exception(ErrorCode::INVALID_ARGUMENT, "Array type is not supported");
1454
        } else if constexpr (T == TYPE_NULL) {
1455
            have_nulls = true;
1456
            return 1;
1457
2
        } else {
1458
2
            type = T;
1459
2
            return 1;
1460
2
        }
1461
2
    }
_ZN5doris12variant_util30SimpleFieldVisitorToScalarType5applyILNS_13PrimitiveTypeE4EEEmRKNS_19PrimitiveTypeTraitsIXT_EE7CppTypeE
Line
Count
Source
1451
7
    size_t apply(const typename PrimitiveTypeTraits<T>::CppType& x) {
1452
        if constexpr (T == TYPE_ARRAY) {
1453
            throw doris::Exception(ErrorCode::INVALID_ARGUMENT, "Array type is not supported");
1454
        } else if constexpr (T == TYPE_NULL) {
1455
            have_nulls = true;
1456
            return 1;
1457
7
        } else {
1458
7
            type = T;
1459
7
            return 1;
1460
7
        }
1461
7
    }
_ZN5doris12variant_util30SimpleFieldVisitorToScalarType5applyILNS_13PrimitiveTypeE5EEEmRKNS_19PrimitiveTypeTraitsIXT_EE7CppTypeE
Line
Count
Source
1451
586
    size_t apply(const typename PrimitiveTypeTraits<T>::CppType& x) {
1452
        if constexpr (T == TYPE_ARRAY) {
1453
            throw doris::Exception(ErrorCode::INVALID_ARGUMENT, "Array type is not supported");
1454
        } else if constexpr (T == TYPE_NULL) {
1455
            have_nulls = true;
1456
            return 1;
1457
586
        } else {
1458
586
            type = T;
1459
586
            return 1;
1460
586
        }
1461
586
    }
_ZN5doris12variant_util30SimpleFieldVisitorToScalarType5applyILNS_13PrimitiveTypeE6EEEmRKNS_19PrimitiveTypeTraitsIXT_EE7CppTypeE
Line
Count
Source
1451
4.93M
    size_t apply(const typename PrimitiveTypeTraits<T>::CppType& x) {
1452
        if constexpr (T == TYPE_ARRAY) {
1453
            throw doris::Exception(ErrorCode::INVALID_ARGUMENT, "Array type is not supported");
1454
        } else if constexpr (T == TYPE_NULL) {
1455
            have_nulls = true;
1456
            return 1;
1457
4.93M
        } else {
1458
4.93M
            type = T;
1459
4.93M
            return 1;
1460
4.93M
        }
1461
4.93M
    }
Unexecuted instantiation: _ZN5doris12variant_util30SimpleFieldVisitorToScalarType5applyILNS_13PrimitiveTypeE38EEEmRKNS_19PrimitiveTypeTraitsIXT_EE7CppTypeE
Unexecuted instantiation: _ZN5doris12variant_util30SimpleFieldVisitorToScalarType5applyILNS_13PrimitiveTypeE39EEEmRKNS_19PrimitiveTypeTraitsIXT_EE7CppTypeE
_ZN5doris12variant_util30SimpleFieldVisitorToScalarType5applyILNS_13PrimitiveTypeE8EEEmRKNS_19PrimitiveTypeTraitsIXT_EE7CppTypeE
Line
Count
Source
1451
1
    size_t apply(const typename PrimitiveTypeTraits<T>::CppType& x) {
1452
        if constexpr (T == TYPE_ARRAY) {
1453
            throw doris::Exception(ErrorCode::INVALID_ARGUMENT, "Array type is not supported");
1454
        } else if constexpr (T == TYPE_NULL) {
1455
            have_nulls = true;
1456
            return 1;
1457
1
        } else {
1458
1
            type = T;
1459
1
            return 1;
1460
1
        }
1461
1
    }
Unexecuted instantiation: _ZN5doris12variant_util30SimpleFieldVisitorToScalarType5applyILNS_13PrimitiveTypeE27EEEmRKNS_19PrimitiveTypeTraitsIXT_EE7CppTypeE
_ZN5doris12variant_util30SimpleFieldVisitorToScalarType5applyILNS_13PrimitiveTypeE9EEEmRKNS_19PrimitiveTypeTraitsIXT_EE7CppTypeE
Line
Count
Source
1451
2.78M
    size_t apply(const typename PrimitiveTypeTraits<T>::CppType& x) {
1452
        if constexpr (T == TYPE_ARRAY) {
1453
            throw doris::Exception(ErrorCode::INVALID_ARGUMENT, "Array type is not supported");
1454
        } else if constexpr (T == TYPE_NULL) {
1455
            have_nulls = true;
1456
            return 1;
1457
2.78M
        } else {
1458
2.78M
            type = T;
1459
2.78M
            return 1;
1460
2.78M
        }
1461
2.78M
    }
Unexecuted instantiation: _ZN5doris12variant_util30SimpleFieldVisitorToScalarType5applyILNS_13PrimitiveTypeE36EEEmRKNS_19PrimitiveTypeTraitsIXT_EE7CppTypeE
Unexecuted instantiation: _ZN5doris12variant_util30SimpleFieldVisitorToScalarType5applyILNS_13PrimitiveTypeE37EEEmRKNS_19PrimitiveTypeTraitsIXT_EE7CppTypeE
_ZN5doris12variant_util30SimpleFieldVisitorToScalarType5applyILNS_13PrimitiveTypeE23EEEmRKNS_19PrimitiveTypeTraitsIXT_EE7CppTypeE
Line
Count
Source
1451
10.6M
    size_t apply(const typename PrimitiveTypeTraits<T>::CppType& x) {
1452
        if constexpr (T == TYPE_ARRAY) {
1453
            throw doris::Exception(ErrorCode::INVALID_ARGUMENT, "Array type is not supported");
1454
        } else if constexpr (T == TYPE_NULL) {
1455
            have_nulls = true;
1456
            return 1;
1457
10.6M
        } else {
1458
10.6M
            type = T;
1459
10.6M
            return 1;
1460
10.6M
        }
1461
10.6M
    }
Unexecuted instantiation: _ZN5doris12variant_util30SimpleFieldVisitorToScalarType5applyILNS_13PrimitiveTypeE15EEEmRKNS_19PrimitiveTypeTraitsIXT_EE7CppTypeE
Unexecuted instantiation: _ZN5doris12variant_util30SimpleFieldVisitorToScalarType5applyILNS_13PrimitiveTypeE10EEEmRKNS_19PrimitiveTypeTraitsIXT_EE7CppTypeE
Unexecuted instantiation: _ZN5doris12variant_util30SimpleFieldVisitorToScalarType5applyILNS_13PrimitiveTypeE41EEEmRKNS_19PrimitiveTypeTraitsIXT_EE7CppTypeE
Unexecuted instantiation: _ZN5doris12variant_util30SimpleFieldVisitorToScalarType5applyILNS_13PrimitiveTypeE17EEEmRKNS_19PrimitiveTypeTraitsIXT_EE7CppTypeE
Unexecuted instantiation: _ZN5doris12variant_util30SimpleFieldVisitorToScalarType5applyILNS_13PrimitiveTypeE16EEEmRKNS_19PrimitiveTypeTraitsIXT_EE7CppTypeE
Unexecuted instantiation: _ZN5doris12variant_util30SimpleFieldVisitorToScalarType5applyILNS_13PrimitiveTypeE18EEEmRKNS_19PrimitiveTypeTraitsIXT_EE7CppTypeE
Unexecuted instantiation: _ZN5doris12variant_util30SimpleFieldVisitorToScalarType5applyILNS_13PrimitiveTypeE32EEEmRKNS_19PrimitiveTypeTraitsIXT_EE7CppTypeE
Unexecuted instantiation: _ZN5doris12variant_util30SimpleFieldVisitorToScalarType5applyILNS_13PrimitiveTypeE28EEEmRKNS_19PrimitiveTypeTraitsIXT_EE7CppTypeE
Unexecuted instantiation: _ZN5doris12variant_util30SimpleFieldVisitorToScalarType5applyILNS_13PrimitiveTypeE29EEEmRKNS_19PrimitiveTypeTraitsIXT_EE7CppTypeE
Unexecuted instantiation: _ZN5doris12variant_util30SimpleFieldVisitorToScalarType5applyILNS_13PrimitiveTypeE20EEEmRKNS_19PrimitiveTypeTraitsIXT_EE7CppTypeE
Unexecuted instantiation: _ZN5doris12variant_util30SimpleFieldVisitorToScalarType5applyILNS_13PrimitiveTypeE30EEEmRKNS_19PrimitiveTypeTraitsIXT_EE7CppTypeE
Unexecuted instantiation: _ZN5doris12variant_util30SimpleFieldVisitorToScalarType5applyILNS_13PrimitiveTypeE35EEEmRKNS_19PrimitiveTypeTraitsIXT_EE7CppTypeE
Unexecuted instantiation: _ZN5doris12variant_util30SimpleFieldVisitorToScalarType5applyILNS_13PrimitiveTypeE22EEEmRKNS_19PrimitiveTypeTraitsIXT_EE7CppTypeE
Unexecuted instantiation: _ZN5doris12variant_util30SimpleFieldVisitorToScalarType5applyILNS_13PrimitiveTypeE19EEEmRKNS_19PrimitiveTypeTraitsIXT_EE7CppTypeE
Unexecuted instantiation: _ZN5doris12variant_util30SimpleFieldVisitorToScalarType5applyILNS_13PrimitiveTypeE24EEEmRKNS_19PrimitiveTypeTraitsIXT_EE7CppTypeE
_ZN5doris12variant_util30SimpleFieldVisitorToScalarType5applyILNS_13PrimitiveTypeE31EEEmRKNS_19PrimitiveTypeTraitsIXT_EE7CppTypeE
Line
Count
Source
1451
46.8k
    size_t apply(const typename PrimitiveTypeTraits<T>::CppType& x) {
1452
        if constexpr (T == TYPE_ARRAY) {
1453
            throw doris::Exception(ErrorCode::INVALID_ARGUMENT, "Array type is not supported");
1454
        } else if constexpr (T == TYPE_NULL) {
1455
            have_nulls = true;
1456
            return 1;
1457
46.8k
        } else {
1458
46.8k
            type = T;
1459
46.8k
            return 1;
1460
46.8k
        }
1461
46.8k
    }
1462
18.5M
    void get_scalar_type(PrimitiveType* data_type) const { *data_type = type; }
1463
18.5M
    bool contain_nulls() const { return have_nulls; }
1464
1465
18.5M
    bool need_convert_field() const { return false; }
1466
1467
private:
1468
    PrimitiveType type = PrimitiveType::INVALID_TYPE;
1469
    bool have_nulls = false;
1470
};
1471
1472
/// Visitor that gets the type of a scalar field
1473
/// or the least common type of the scalars in an array.
1474
/// More optimized version of FieldToDataType.
1475
class FieldVisitorToScalarType : public StaticVisitor<size_t> {
1476
public:
1477
    template <PrimitiveType T>
1478
3.59M
    size_t apply(const typename PrimitiveTypeTraits<T>::CppType& x) {
1479
3.59M
        if constexpr (T == TYPE_ARRAY) {
1480
1.37M
            size_t size = x.size();
1481
3.85M
            for (size_t i = 0; i < size; ++i) {
1482
2.48M
                apply_visitor(*this, x[i]);
1483
2.48M
            }
1484
1.37M
            return 0;
1485
1.37M
        } else if constexpr (T == TYPE_NULL) {
1486
12.3k
            have_nulls = true;
1487
12.3k
            return 0;
1488
2.21M
        } else {
1489
2.21M
            field_types.insert(T);
1490
2.21M
            type_indexes.insert(T);
1491
2.21M
            return 0;
1492
2.21M
        }
1493
3.59M
    }
_ZN5doris12variant_util24FieldVisitorToScalarType5applyILNS_13PrimitiveTypeE1EEEmRKNS_19PrimitiveTypeTraitsIXT_EE7CppTypeE
Line
Count
Source
1478
12.3k
    size_t apply(const typename PrimitiveTypeTraits<T>::CppType& x) {
1479
        if constexpr (T == TYPE_ARRAY) {
1480
            size_t size = x.size();
1481
            for (size_t i = 0; i < size; ++i) {
1482
                apply_visitor(*this, x[i]);
1483
            }
1484
            return 0;
1485
12.3k
        } else if constexpr (T == TYPE_NULL) {
1486
12.3k
            have_nulls = true;
1487
12.3k
            return 0;
1488
        } else {
1489
            field_types.insert(T);
1490
            type_indexes.insert(T);
1491
            return 0;
1492
        }
1493
12.3k
    }
_ZN5doris12variant_util24FieldVisitorToScalarType5applyILNS_13PrimitiveTypeE26EEEmRKNS_19PrimitiveTypeTraitsIXT_EE7CppTypeE
Line
Count
Source
1478
479
    size_t apply(const typename PrimitiveTypeTraits<T>::CppType& x) {
1479
        if constexpr (T == TYPE_ARRAY) {
1480
            size_t size = x.size();
1481
            for (size_t i = 0; i < size; ++i) {
1482
                apply_visitor(*this, x[i]);
1483
            }
1484
            return 0;
1485
        } else if constexpr (T == TYPE_NULL) {
1486
            have_nulls = true;
1487
            return 0;
1488
479
        } else {
1489
479
            field_types.insert(T);
1490
479
            type_indexes.insert(T);
1491
479
            return 0;
1492
479
        }
1493
479
    }
Unexecuted instantiation: _ZN5doris12variant_util24FieldVisitorToScalarType5applyILNS_13PrimitiveTypeE42EEEmRKNS_19PrimitiveTypeTraitsIXT_EE7CppTypeE
_ZN5doris12variant_util24FieldVisitorToScalarType5applyILNS_13PrimitiveTypeE7EEEmRKNS_19PrimitiveTypeTraitsIXT_EE7CppTypeE
Line
Count
Source
1478
29.6k
    size_t apply(const typename PrimitiveTypeTraits<T>::CppType& x) {
1479
        if constexpr (T == TYPE_ARRAY) {
1480
            size_t size = x.size();
1481
            for (size_t i = 0; i < size; ++i) {
1482
                apply_visitor(*this, x[i]);
1483
            }
1484
            return 0;
1485
        } else if constexpr (T == TYPE_NULL) {
1486
            have_nulls = true;
1487
            return 0;
1488
29.6k
        } else {
1489
29.6k
            field_types.insert(T);
1490
29.6k
            type_indexes.insert(T);
1491
29.6k
            return 0;
1492
29.6k
        }
1493
29.6k
    }
Unexecuted instantiation: _ZN5doris12variant_util24FieldVisitorToScalarType5applyILNS_13PrimitiveTypeE12EEEmRKNS_19PrimitiveTypeTraitsIXT_EE7CppTypeE
Unexecuted instantiation: _ZN5doris12variant_util24FieldVisitorToScalarType5applyILNS_13PrimitiveTypeE11EEEmRKNS_19PrimitiveTypeTraitsIXT_EE7CppTypeE
_ZN5doris12variant_util24FieldVisitorToScalarType5applyILNS_13PrimitiveTypeE25EEEmRKNS_19PrimitiveTypeTraitsIXT_EE7CppTypeE
Line
Count
Source
1478
395
    size_t apply(const typename PrimitiveTypeTraits<T>::CppType& x) {
1479
        if constexpr (T == TYPE_ARRAY) {
1480
            size_t size = x.size();
1481
            for (size_t i = 0; i < size; ++i) {
1482
                apply_visitor(*this, x[i]);
1483
            }
1484
            return 0;
1485
        } else if constexpr (T == TYPE_NULL) {
1486
            have_nulls = true;
1487
            return 0;
1488
395
        } else {
1489
395
            field_types.insert(T);
1490
395
            type_indexes.insert(T);
1491
395
            return 0;
1492
395
        }
1493
395
    }
_ZN5doris12variant_util24FieldVisitorToScalarType5applyILNS_13PrimitiveTypeE2EEEmRKNS_19PrimitiveTypeTraitsIXT_EE7CppTypeE
Line
Count
Source
1478
58.3k
    size_t apply(const typename PrimitiveTypeTraits<T>::CppType& x) {
1479
        if constexpr (T == TYPE_ARRAY) {
1480
            size_t size = x.size();
1481
            for (size_t i = 0; i < size; ++i) {
1482
                apply_visitor(*this, x[i]);
1483
            }
1484
            return 0;
1485
        } else if constexpr (T == TYPE_NULL) {
1486
            have_nulls = true;
1487
            return 0;
1488
58.3k
        } else {
1489
58.3k
            field_types.insert(T);
1490
58.3k
            type_indexes.insert(T);
1491
58.3k
            return 0;
1492
58.3k
        }
1493
58.3k
    }
_ZN5doris12variant_util24FieldVisitorToScalarType5applyILNS_13PrimitiveTypeE3EEEmRKNS_19PrimitiveTypeTraitsIXT_EE7CppTypeE
Line
Count
Source
1478
1.02k
    size_t apply(const typename PrimitiveTypeTraits<T>::CppType& x) {
1479
        if constexpr (T == TYPE_ARRAY) {
1480
            size_t size = x.size();
1481
            for (size_t i = 0; i < size; ++i) {
1482
                apply_visitor(*this, x[i]);
1483
            }
1484
            return 0;
1485
        } else if constexpr (T == TYPE_NULL) {
1486
            have_nulls = true;
1487
            return 0;
1488
1.02k
        } else {
1489
1.02k
            field_types.insert(T);
1490
1.02k
            type_indexes.insert(T);
1491
1.02k
            return 0;
1492
1.02k
        }
1493
1.02k
    }
_ZN5doris12variant_util24FieldVisitorToScalarType5applyILNS_13PrimitiveTypeE4EEEmRKNS_19PrimitiveTypeTraitsIXT_EE7CppTypeE
Line
Count
Source
1478
1.01k
    size_t apply(const typename PrimitiveTypeTraits<T>::CppType& x) {
1479
        if constexpr (T == TYPE_ARRAY) {
1480
            size_t size = x.size();
1481
            for (size_t i = 0; i < size; ++i) {
1482
                apply_visitor(*this, x[i]);
1483
            }
1484
            return 0;
1485
        } else if constexpr (T == TYPE_NULL) {
1486
            have_nulls = true;
1487
            return 0;
1488
1.01k
        } else {
1489
1.01k
            field_types.insert(T);
1490
1.01k
            type_indexes.insert(T);
1491
1.01k
            return 0;
1492
1.01k
        }
1493
1.01k
    }
_ZN5doris12variant_util24FieldVisitorToScalarType5applyILNS_13PrimitiveTypeE5EEEmRKNS_19PrimitiveTypeTraitsIXT_EE7CppTypeE
Line
Count
Source
1478
1.38k
    size_t apply(const typename PrimitiveTypeTraits<T>::CppType& x) {
1479
        if constexpr (T == TYPE_ARRAY) {
1480
            size_t size = x.size();
1481
            for (size_t i = 0; i < size; ++i) {
1482
                apply_visitor(*this, x[i]);
1483
            }
1484
            return 0;
1485
        } else if constexpr (T == TYPE_NULL) {
1486
            have_nulls = true;
1487
            return 0;
1488
1.38k
        } else {
1489
1.38k
            field_types.insert(T);
1490
1.38k
            type_indexes.insert(T);
1491
1.38k
            return 0;
1492
1.38k
        }
1493
1.38k
    }
_ZN5doris12variant_util24FieldVisitorToScalarType5applyILNS_13PrimitiveTypeE6EEEmRKNS_19PrimitiveTypeTraitsIXT_EE7CppTypeE
Line
Count
Source
1478
723k
    size_t apply(const typename PrimitiveTypeTraits<T>::CppType& x) {
1479
        if constexpr (T == TYPE_ARRAY) {
1480
            size_t size = x.size();
1481
            for (size_t i = 0; i < size; ++i) {
1482
                apply_visitor(*this, x[i]);
1483
            }
1484
            return 0;
1485
        } else if constexpr (T == TYPE_NULL) {
1486
            have_nulls = true;
1487
            return 0;
1488
723k
        } else {
1489
723k
            field_types.insert(T);
1490
723k
            type_indexes.insert(T);
1491
723k
            return 0;
1492
723k
        }
1493
723k
    }
Unexecuted instantiation: _ZN5doris12variant_util24FieldVisitorToScalarType5applyILNS_13PrimitiveTypeE38EEEmRKNS_19PrimitiveTypeTraitsIXT_EE7CppTypeE
Unexecuted instantiation: _ZN5doris12variant_util24FieldVisitorToScalarType5applyILNS_13PrimitiveTypeE39EEEmRKNS_19PrimitiveTypeTraitsIXT_EE7CppTypeE
_ZN5doris12variant_util24FieldVisitorToScalarType5applyILNS_13PrimitiveTypeE8EEEmRKNS_19PrimitiveTypeTraitsIXT_EE7CppTypeE
Line
Count
Source
1478
857
    size_t apply(const typename PrimitiveTypeTraits<T>::CppType& x) {
1479
        if constexpr (T == TYPE_ARRAY) {
1480
            size_t size = x.size();
1481
            for (size_t i = 0; i < size; ++i) {
1482
                apply_visitor(*this, x[i]);
1483
            }
1484
            return 0;
1485
        } else if constexpr (T == TYPE_NULL) {
1486
            have_nulls = true;
1487
            return 0;
1488
857
        } else {
1489
857
            field_types.insert(T);
1490
857
            type_indexes.insert(T);
1491
857
            return 0;
1492
857
        }
1493
857
    }
Unexecuted instantiation: _ZN5doris12variant_util24FieldVisitorToScalarType5applyILNS_13PrimitiveTypeE27EEEmRKNS_19PrimitiveTypeTraitsIXT_EE7CppTypeE
_ZN5doris12variant_util24FieldVisitorToScalarType5applyILNS_13PrimitiveTypeE9EEEmRKNS_19PrimitiveTypeTraitsIXT_EE7CppTypeE
Line
Count
Source
1478
184k
    size_t apply(const typename PrimitiveTypeTraits<T>::CppType& x) {
1479
        if constexpr (T == TYPE_ARRAY) {
1480
            size_t size = x.size();
1481
            for (size_t i = 0; i < size; ++i) {
1482
                apply_visitor(*this, x[i]);
1483
            }
1484
            return 0;
1485
        } else if constexpr (T == TYPE_NULL) {
1486
            have_nulls = true;
1487
            return 0;
1488
184k
        } else {
1489
184k
            field_types.insert(T);
1490
184k
            type_indexes.insert(T);
1491
184k
            return 0;
1492
184k
        }
1493
184k
    }
_ZN5doris12variant_util24FieldVisitorToScalarType5applyILNS_13PrimitiveTypeE36EEEmRKNS_19PrimitiveTypeTraitsIXT_EE7CppTypeE
Line
Count
Source
1478
305
    size_t apply(const typename PrimitiveTypeTraits<T>::CppType& x) {
1479
        if constexpr (T == TYPE_ARRAY) {
1480
            size_t size = x.size();
1481
            for (size_t i = 0; i < size; ++i) {
1482
                apply_visitor(*this, x[i]);
1483
            }
1484
            return 0;
1485
        } else if constexpr (T == TYPE_NULL) {
1486
            have_nulls = true;
1487
            return 0;
1488
305
        } else {
1489
305
            field_types.insert(T);
1490
305
            type_indexes.insert(T);
1491
305
            return 0;
1492
305
        }
1493
305
    }
_ZN5doris12variant_util24FieldVisitorToScalarType5applyILNS_13PrimitiveTypeE37EEEmRKNS_19PrimitiveTypeTraitsIXT_EE7CppTypeE
Line
Count
Source
1478
274
    size_t apply(const typename PrimitiveTypeTraits<T>::CppType& x) {
1479
        if constexpr (T == TYPE_ARRAY) {
1480
            size_t size = x.size();
1481
            for (size_t i = 0; i < size; ++i) {
1482
                apply_visitor(*this, x[i]);
1483
            }
1484
            return 0;
1485
        } else if constexpr (T == TYPE_NULL) {
1486
            have_nulls = true;
1487
            return 0;
1488
274
        } else {
1489
274
            field_types.insert(T);
1490
274
            type_indexes.insert(T);
1491
274
            return 0;
1492
274
        }
1493
274
    }
_ZN5doris12variant_util24FieldVisitorToScalarType5applyILNS_13PrimitiveTypeE23EEEmRKNS_19PrimitiveTypeTraitsIXT_EE7CppTypeE
Line
Count
Source
1478
1.13M
    size_t apply(const typename PrimitiveTypeTraits<T>::CppType& x) {
1479
        if constexpr (T == TYPE_ARRAY) {
1480
            size_t size = x.size();
1481
            for (size_t i = 0; i < size; ++i) {
1482
                apply_visitor(*this, x[i]);
1483
            }
1484
            return 0;
1485
        } else if constexpr (T == TYPE_NULL) {
1486
            have_nulls = true;
1487
            return 0;
1488
1.13M
        } else {
1489
1.13M
            field_types.insert(T);
1490
1.13M
            type_indexes.insert(T);
1491
1.13M
            return 0;
1492
1.13M
        }
1493
1.13M
    }
Unexecuted instantiation: _ZN5doris12variant_util24FieldVisitorToScalarType5applyILNS_13PrimitiveTypeE15EEEmRKNS_19PrimitiveTypeTraitsIXT_EE7CppTypeE
Unexecuted instantiation: _ZN5doris12variant_util24FieldVisitorToScalarType5applyILNS_13PrimitiveTypeE10EEEmRKNS_19PrimitiveTypeTraitsIXT_EE7CppTypeE
Unexecuted instantiation: _ZN5doris12variant_util24FieldVisitorToScalarType5applyILNS_13PrimitiveTypeE41EEEmRKNS_19PrimitiveTypeTraitsIXT_EE7CppTypeE
_ZN5doris12variant_util24FieldVisitorToScalarType5applyILNS_13PrimitiveTypeE17EEEmRKNS_19PrimitiveTypeTraitsIXT_EE7CppTypeE
Line
Count
Source
1478
1.37M
    size_t apply(const typename PrimitiveTypeTraits<T>::CppType& x) {
1479
1.37M
        if constexpr (T == TYPE_ARRAY) {
1480
1.37M
            size_t size = x.size();
1481
3.85M
            for (size_t i = 0; i < size; ++i) {
1482
2.48M
                apply_visitor(*this, x[i]);
1483
2.48M
            }
1484
1.37M
            return 0;
1485
        } else if constexpr (T == TYPE_NULL) {
1486
            have_nulls = true;
1487
            return 0;
1488
        } else {
1489
            field_types.insert(T);
1490
            type_indexes.insert(T);
1491
            return 0;
1492
        }
1493
1.37M
    }
_ZN5doris12variant_util24FieldVisitorToScalarType5applyILNS_13PrimitiveTypeE16EEEmRKNS_19PrimitiveTypeTraitsIXT_EE7CppTypeE
Line
Count
Source
1478
1
    size_t apply(const typename PrimitiveTypeTraits<T>::CppType& x) {
1479
        if constexpr (T == TYPE_ARRAY) {
1480
            size_t size = x.size();
1481
            for (size_t i = 0; i < size; ++i) {
1482
                apply_visitor(*this, x[i]);
1483
            }
1484
            return 0;
1485
        } else if constexpr (T == TYPE_NULL) {
1486
            have_nulls = true;
1487
            return 0;
1488
1
        } else {
1489
1
            field_types.insert(T);
1490
1
            type_indexes.insert(T);
1491
1
            return 0;
1492
1
        }
1493
1
    }
Unexecuted instantiation: _ZN5doris12variant_util24FieldVisitorToScalarType5applyILNS_13PrimitiveTypeE18EEEmRKNS_19PrimitiveTypeTraitsIXT_EE7CppTypeE
_ZN5doris12variant_util24FieldVisitorToScalarType5applyILNS_13PrimitiveTypeE32EEEmRKNS_19PrimitiveTypeTraitsIXT_EE7CppTypeE
Line
Count
Source
1478
1
    size_t apply(const typename PrimitiveTypeTraits<T>::CppType& x) {
1479
        if constexpr (T == TYPE_ARRAY) {
1480
            size_t size = x.size();
1481
            for (size_t i = 0; i < size; ++i) {
1482
                apply_visitor(*this, x[i]);
1483
            }
1484
            return 0;
1485
        } else if constexpr (T == TYPE_NULL) {
1486
            have_nulls = true;
1487
            return 0;
1488
1
        } else {
1489
1
            field_types.insert(T);
1490
1
            type_indexes.insert(T);
1491
1
            return 0;
1492
1
        }
1493
1
    }
_ZN5doris12variant_util24FieldVisitorToScalarType5applyILNS_13PrimitiveTypeE28EEEmRKNS_19PrimitiveTypeTraitsIXT_EE7CppTypeE
Line
Count
Source
1478
755
    size_t apply(const typename PrimitiveTypeTraits<T>::CppType& x) {
1479
        if constexpr (T == TYPE_ARRAY) {
1480
            size_t size = x.size();
1481
            for (size_t i = 0; i < size; ++i) {
1482
                apply_visitor(*this, x[i]);
1483
            }
1484
            return 0;
1485
        } else if constexpr (T == TYPE_NULL) {
1486
            have_nulls = true;
1487
            return 0;
1488
755
        } else {
1489
755
            field_types.insert(T);
1490
755
            type_indexes.insert(T);
1491
755
            return 0;
1492
755
        }
1493
755
    }
_ZN5doris12variant_util24FieldVisitorToScalarType5applyILNS_13PrimitiveTypeE29EEEmRKNS_19PrimitiveTypeTraitsIXT_EE7CppTypeE
Line
Count
Source
1478
695
    size_t apply(const typename PrimitiveTypeTraits<T>::CppType& x) {
1479
        if constexpr (T == TYPE_ARRAY) {
1480
            size_t size = x.size();
1481
            for (size_t i = 0; i < size; ++i) {
1482
                apply_visitor(*this, x[i]);
1483
            }
1484
            return 0;
1485
        } else if constexpr (T == TYPE_NULL) {
1486
            have_nulls = true;
1487
            return 0;
1488
695
        } else {
1489
695
            field_types.insert(T);
1490
695
            type_indexes.insert(T);
1491
695
            return 0;
1492
695
        }
1493
695
    }
Unexecuted instantiation: _ZN5doris12variant_util24FieldVisitorToScalarType5applyILNS_13PrimitiveTypeE20EEEmRKNS_19PrimitiveTypeTraitsIXT_EE7CppTypeE
_ZN5doris12variant_util24FieldVisitorToScalarType5applyILNS_13PrimitiveTypeE30EEEmRKNS_19PrimitiveTypeTraitsIXT_EE7CppTypeE
Line
Count
Source
1478
70.7k
    size_t apply(const typename PrimitiveTypeTraits<T>::CppType& x) {
1479
        if constexpr (T == TYPE_ARRAY) {
1480
            size_t size = x.size();
1481
            for (size_t i = 0; i < size; ++i) {
1482
                apply_visitor(*this, x[i]);
1483
            }
1484
            return 0;
1485
        } else if constexpr (T == TYPE_NULL) {
1486
            have_nulls = true;
1487
            return 0;
1488
70.7k
        } else {
1489
70.7k
            field_types.insert(T);
1490
70.7k
            type_indexes.insert(T);
1491
70.7k
            return 0;
1492
70.7k
        }
1493
70.7k
    }
_ZN5doris12variant_util24FieldVisitorToScalarType5applyILNS_13PrimitiveTypeE35EEEmRKNS_19PrimitiveTypeTraitsIXT_EE7CppTypeE
Line
Count
Source
1478
557
    size_t apply(const typename PrimitiveTypeTraits<T>::CppType& x) {
1479
        if constexpr (T == TYPE_ARRAY) {
1480
            size_t size = x.size();
1481
            for (size_t i = 0; i < size; ++i) {
1482
                apply_visitor(*this, x[i]);
1483
            }
1484
            return 0;
1485
        } else if constexpr (T == TYPE_NULL) {
1486
            have_nulls = true;
1487
            return 0;
1488
557
        } else {
1489
557
            field_types.insert(T);
1490
557
            type_indexes.insert(T);
1491
557
            return 0;
1492
557
        }
1493
557
    }
Unexecuted instantiation: _ZN5doris12variant_util24FieldVisitorToScalarType5applyILNS_13PrimitiveTypeE22EEEmRKNS_19PrimitiveTypeTraitsIXT_EE7CppTypeE
Unexecuted instantiation: _ZN5doris12variant_util24FieldVisitorToScalarType5applyILNS_13PrimitiveTypeE19EEEmRKNS_19PrimitiveTypeTraitsIXT_EE7CppTypeE
Unexecuted instantiation: _ZN5doris12variant_util24FieldVisitorToScalarType5applyILNS_13PrimitiveTypeE24EEEmRKNS_19PrimitiveTypeTraitsIXT_EE7CppTypeE
_ZN5doris12variant_util24FieldVisitorToScalarType5applyILNS_13PrimitiveTypeE31EEEmRKNS_19PrimitiveTypeTraitsIXT_EE7CppTypeE
Line
Count
Source
1478
26
    size_t apply(const typename PrimitiveTypeTraits<T>::CppType& x) {
1479
        if constexpr (T == TYPE_ARRAY) {
1480
            size_t size = x.size();
1481
            for (size_t i = 0; i < size; ++i) {
1482
                apply_visitor(*this, x[i]);
1483
            }
1484
            return 0;
1485
        } else if constexpr (T == TYPE_NULL) {
1486
            have_nulls = true;
1487
            return 0;
1488
26
        } else {
1489
26
            field_types.insert(T);
1490
26
            type_indexes.insert(T);
1491
26
            return 0;
1492
26
        }
1493
26
    }
1494
1.10M
    void get_scalar_type(PrimitiveType* type) const {
1495
1.10M
        if (type_indexes.size() == 1) {
1496
            // Most cases will have only one type
1497
1.01M
            *type = *type_indexes.begin();
1498
1.01M
            return;
1499
1.01M
        }
1500
90.5k
        DataTypePtr data_type;
1501
90.5k
        get_least_supertype_jsonb(type_indexes, &data_type);
1502
90.5k
        *type = data_type->get_primitive_type();
1503
90.5k
    }
1504
1.10M
    bool contain_nulls() const { return have_nulls; }
1505
1.10M
    bool need_convert_field() const { return field_types.size() > 1; }
1506
1507
private:
1508
    phmap::flat_hash_set<PrimitiveType> type_indexes;
1509
    phmap::flat_hash_set<PrimitiveType> field_types;
1510
    bool have_nulls = false;
1511
};
1512
1513
template <typename Visitor>
1514
19.7M
void get_field_info_impl(const Field& field, FieldInfo* info) {
1515
19.7M
    Visitor to_scalar_type_visitor;
1516
19.7M
    apply_visitor(to_scalar_type_visitor, field);
1517
19.7M
    PrimitiveType type_id;
1518
19.7M
    to_scalar_type_visitor.get_scalar_type(&type_id);
1519
    // array item's dimension may missmatch, eg. [1, 2, [1, 2, 3]]
1520
19.7M
    *info = {type_id, to_scalar_type_visitor.contain_nulls(),
1521
19.7M
             to_scalar_type_visitor.need_convert_field(),
1522
19.7M
             apply_visitor(FieldVisitorToNumberOfDimensions(), field)};
1523
19.7M
}
_ZN5doris12variant_util19get_field_info_implINS0_24FieldVisitorToScalarTypeEEEvRKNS_5FieldEPNS_9FieldInfoE
Line
Count
Source
1514
1.10M
void get_field_info_impl(const Field& field, FieldInfo* info) {
1515
1.10M
    Visitor to_scalar_type_visitor;
1516
1.10M
    apply_visitor(to_scalar_type_visitor, field);
1517
1.10M
    PrimitiveType type_id;
1518
1.10M
    to_scalar_type_visitor.get_scalar_type(&type_id);
1519
    // array item's dimension may missmatch, eg. [1, 2, [1, 2, 3]]
1520
1.10M
    *info = {type_id, to_scalar_type_visitor.contain_nulls(),
1521
1.10M
             to_scalar_type_visitor.need_convert_field(),
1522
1.10M
             apply_visitor(FieldVisitorToNumberOfDimensions(), field)};
1523
1.10M
}
_ZN5doris12variant_util19get_field_info_implINS0_30SimpleFieldVisitorToScalarTypeEEEvRKNS_5FieldEPNS_9FieldInfoE
Line
Count
Source
1514
18.6M
void get_field_info_impl(const Field& field, FieldInfo* info) {
1515
18.6M
    Visitor to_scalar_type_visitor;
1516
18.6M
    apply_visitor(to_scalar_type_visitor, field);
1517
18.6M
    PrimitiveType type_id;
1518
18.6M
    to_scalar_type_visitor.get_scalar_type(&type_id);
1519
    // array item's dimension may missmatch, eg. [1, 2, [1, 2, 3]]
1520
18.6M
    *info = {type_id, to_scalar_type_visitor.contain_nulls(),
1521
18.6M
             to_scalar_type_visitor.need_convert_field(),
1522
18.6M
             apply_visitor(FieldVisitorToNumberOfDimensions(), field)};
1523
18.6M
}
1524
1525
19.8M
void get_field_info(const Field& field, FieldInfo* info) {
1526
19.8M
    if (field.is_complex_field()) {
1527
1.10M
        get_field_info_impl<FieldVisitorToScalarType>(field, info);
1528
18.7M
    } else {
1529
18.7M
        get_field_info_impl<SimpleFieldVisitorToScalarType>(field, info);
1530
18.7M
    }
1531
19.8M
}
1532
1533
bool generate_sub_column_info(const TabletSchema& schema, int32_t col_unique_id,
1534
                              const std::string& path,
1535
217k
                              TabletSchema::SubColumnInfo* sub_column_info) {
1536
217k
    const auto& parent_column = schema.column_by_uid(col_unique_id);
1537
217k
    std::function<void(const TabletColumn&, TabletColumn*)> generate_result_column =
1538
217k
            [&](const TabletColumn& from_column, TabletColumn* to_column) {
1539
16.4k
                to_column->set_name(parent_column.name_lower_case() + "." + path);
1540
16.4k
                to_column->set_type(from_column.type());
1541
16.4k
                to_column->set_parent_unique_id(parent_column.unique_id());
1542
16.4k
                bool is_typed = !parent_column.variant_enable_typed_paths_to_sparse();
1543
16.4k
                to_column->set_path_info(
1544
16.4k
                        PathInData(parent_column.name_lower_case() + "." + path, is_typed));
1545
16.4k
                to_column->set_aggregation_method(parent_column.aggregation());
1546
16.4k
                to_column->set_is_nullable(true);
1547
16.4k
                to_column->set_parent_unique_id(parent_column.unique_id());
1548
16.4k
                if (from_column.is_decimal()) {
1549
16.4k
                    to_column->set_precision(from_column.precision());
1550
16.4k
                }
1551
16.4k
                to_column->set_frac(from_column.frac());
1552
1553
16.4k
                if (from_column.is_array_type()) {
1554
3.31k
                    TabletColumn nested_column;
1555
3.31k
                    generate_result_column(*from_column.get_sub_columns()[0], &nested_column);
1556
3.31k
                    to_column->add_sub_column(nested_column);
1557
3.31k
                }
1558
16.4k
            };
1559
1560
217k
    auto generate_index = [&](const std::string& pattern) {
1561
        // 1. find subcolumn's index
1562
13.1k
        if (const auto& indexes = schema.inverted_index_by_field_pattern(col_unique_id, pattern);
1563
13.1k
            !indexes.empty()) {
1564
5.52k
            for (const auto& index : indexes) {
1565
5.52k
                auto index_ptr = std::make_shared<TabletIndex>(*index);
1566
5.52k
                index_ptr->set_escaped_escaped_index_suffix_path(
1567
5.52k
                        sub_column_info->column.path_info_ptr()->get_path());
1568
5.52k
                sub_column_info->indexes.emplace_back(std::move(index_ptr));
1569
5.52k
            }
1570
5.44k
        }
1571
        // 2. find parent column's index
1572
7.70k
        else if (const auto parent_index = schema.inverted_indexs(col_unique_id);
1573
7.70k
                 !parent_index.empty()) {
1574
482
            inherit_index(parent_index, sub_column_info->indexes, sub_column_info->column);
1575
7.22k
        } else {
1576
7.22k
            sub_column_info->indexes.clear();
1577
7.22k
        }
1578
13.1k
    };
1579
1580
217k
    const auto& sub_columns = parent_column.get_sub_columns();
1581
217k
    for (const auto& sub_column : sub_columns) {
1582
206k
        const char* pattern = sub_column->name().c_str();
1583
206k
        switch (sub_column->pattern_type()) {
1584
5.85k
        case PatternTypePB::MATCH_NAME: {
1585
5.85k
            if (strcmp(pattern, path.c_str()) == 0) {
1586
1.74k
                generate_result_column(*sub_column, &sub_column_info->column);
1587
1.74k
                generate_index(sub_column->name());
1588
1.74k
                return true;
1589
1.74k
            }
1590
4.11k
            break;
1591
5.85k
        }
1592
200k
        case PatternTypePB::MATCH_NAME_GLOB: {
1593
200k
            if (glob_match_re2(pattern, path)) {
1594
11.4k
                generate_result_column(*sub_column, &sub_column_info->column);
1595
11.4k
                generate_index(sub_column->name());
1596
11.4k
                return true;
1597
11.4k
            }
1598
189k
            break;
1599
200k
        }
1600
189k
        default:
1601
0
            break;
1602
206k
        }
1603
206k
    }
1604
204k
    return false;
1605
217k
}
1606
1607
TabletSchemaSPtr VariantCompactionUtil::calculate_variant_extended_schema(
1608
1.41k
        const std::vector<RowsetSharedPtr>& rowsets, const TabletSchemaSPtr& base_schema) {
1609
1.41k
    if (rowsets.empty()) {
1610
0
        return nullptr;
1611
0
    }
1612
1613
1.41k
    std::vector<TabletSchemaSPtr> schemas;
1614
3.31k
    for (const auto& rs : rowsets) {
1615
3.31k
        if (rs->num_segments() == 0) {
1616
3.14k
            continue;
1617
3.14k
        }
1618
174
        const auto& tablet_schema = rs->tablet_schema();
1619
174
        SegmentCacheHandle segment_cache;
1620
174
        auto st = SegmentLoader::instance()->load_segments(std::static_pointer_cast<BetaRowset>(rs),
1621
174
                                                           &segment_cache);
1622
174
        if (!st.ok()) {
1623
0
            return base_schema;
1624
0
        }
1625
174
        for (const auto& segment : segment_cache.get_segments()) {
1626
174
            TabletSchemaSPtr schema = tablet_schema->copy_without_variant_extracted_columns();
1627
354
            for (const auto& column : tablet_schema->columns()) {
1628
354
                if (!column->is_variant_type()) {
1629
174
                    continue;
1630
174
                }
1631
180
                std::shared_ptr<ColumnReader> column_reader;
1632
180
                OlapReaderStatistics stats;
1633
180
                st = segment->get_column_reader(column->unique_id(), &column_reader, &stats);
1634
180
                if (!st.ok()) {
1635
0
                    LOG(WARNING) << "Failed to get column reader for column: " << column->name()
1636
0
                                 << " error: " << st.to_string();
1637
0
                    continue;
1638
0
                }
1639
180
                if (!column_reader) {
1640
0
                    continue;
1641
0
                }
1642
1643
180
                CHECK(column_reader->get_meta_type() == FieldType::OLAP_FIELD_TYPE_VARIANT);
1644
180
                auto* variant_column_reader =
1645
180
                        assert_cast<segment_v2::VariantColumnReader*>(column_reader.get());
1646
                // load external meta before getting subcolumn meta info
1647
180
                st = variant_column_reader->load_external_meta_once();
1648
180
                if (!st.ok()) {
1649
0
                    LOG(WARNING) << "Failed to load external meta for column: " << column->name()
1650
0
                                 << " error: " << st.to_string();
1651
0
                    continue;
1652
0
                }
1653
180
                const auto* subcolumn_meta_info = variant_column_reader->get_subcolumns_meta_info();
1654
513
                for (const auto& entry : *subcolumn_meta_info) {
1655
513
                    if (entry->path.empty()) {
1656
180
                        continue;
1657
180
                    }
1658
333
                    const std::string& column_name =
1659
333
                            column->name_lower_case() + "." + entry->path.get_path();
1660
333
                    const DataTypePtr& data_type = entry->data.file_column_type;
1661
333
                    PathInDataBuilder full_path_builder;
1662
333
                    auto full_path = full_path_builder.append(column->name_lower_case(), false)
1663
333
                                             .append(entry->path.get_parts(), false)
1664
333
                                             .build();
1665
333
                    TabletColumn subcolumn =
1666
333
                            get_column_by_type(data_type, column_name,
1667
333
                                               ExtraInfo {.unique_id = -1,
1668
333
                                                          .parent_unique_id = column->unique_id(),
1669
333
                                                          .path_info = full_path});
1670
333
                    schema->append_column(subcolumn);
1671
333
                }
1672
180
            }
1673
174
            schemas.emplace_back(schema);
1674
174
        }
1675
174
    }
1676
1.41k
    TabletSchemaSPtr least_common_schema;
1677
1.41k
    auto st = get_least_common_schema(schemas, base_schema, least_common_schema, false);
1678
1.41k
    if (!st.ok()) {
1679
0
        return base_schema;
1680
0
    }
1681
1.41k
    return least_common_schema;
1682
1.41k
}
1683
1684
bool inherit_index(const std::vector<const TabletIndex*>& parent_indexes,
1685
                   TabletIndexes& subcolumns_indexes, FieldType column_type,
1686
125k
                   const std::string& suffix_path, bool is_array_nested_type) {
1687
125k
    if (parent_indexes.empty()) {
1688
116k
        return false;
1689
116k
    }
1690
9.03k
    subcolumns_indexes.clear();
1691
    // bkd index or array index only need to inherit one index
1692
9.03k
    if (field_is_numeric_type(column_type) ||
1693
9.03k
        (is_array_nested_type &&
1694
6.39k
         (field_is_numeric_type(column_type) || field_is_slice_type(column_type)))) {
1695
2.66k
        auto index_ptr = std::make_shared<TabletIndex>(*parent_indexes[0]);
1696
2.66k
        index_ptr->set_escaped_escaped_index_suffix_path(suffix_path);
1697
        // no need parse for bkd index or array index
1698
2.66k
        index_ptr->remove_parser_and_analyzer();
1699
2.66k
        subcolumns_indexes.emplace_back(std::move(index_ptr));
1700
2.66k
        return true;
1701
2.66k
    }
1702
    // string type need to inherit all indexes
1703
6.36k
    else if (field_is_slice_type(column_type) && !is_array_nested_type) {
1704
6.36k
        for (const auto& index : parent_indexes) {
1705
6.36k
            auto index_ptr = std::make_shared<TabletIndex>(*index);
1706
6.36k
            index_ptr->set_escaped_escaped_index_suffix_path(suffix_path);
1707
6.36k
            subcolumns_indexes.emplace_back(std::move(index_ptr));
1708
6.36k
        }
1709
6.33k
        return true;
1710
6.33k
    }
1711
31
    return false;
1712
9.03k
}
1713
1714
bool inherit_index(const std::vector<const TabletIndex*>& parent_indexes,
1715
125k
                   TabletIndexes& subcolumns_indexes, const TabletColumn& column) {
1716
125k
    if (!column.is_extracted_column()) {
1717
3
        return false;
1718
3
    }
1719
125k
    if (column.is_array_type()) {
1720
1.03k
        if (column.get_sub_columns().empty()) {
1721
0
            return false;
1722
0
        }
1723
1.03k
        const TabletColumn* nested = column.get_sub_columns()[0].get();
1724
1.03k
        while (nested != nullptr && nested->is_array_type()) {
1725
0
            if (nested->get_sub_columns().empty()) {
1726
0
                return false;
1727
0
            }
1728
0
            nested = nested->get_sub_columns()[0].get();
1729
0
        }
1730
1.03k
        if (nested == nullptr) {
1731
0
            return false;
1732
0
        }
1733
1.03k
        return inherit_index(parent_indexes, subcolumns_indexes, nested->type(),
1734
1.03k
                             column.path_info_ptr()->get_path(), true);
1735
1.03k
    }
1736
124k
    return inherit_index(parent_indexes, subcolumns_indexes, column.type(),
1737
124k
                         column.path_info_ptr()->get_path());
1738
125k
}
1739
1740
bool inherit_index(const std::vector<const TabletIndex*>& parent_indexes,
1741
0
                   TabletIndexes& subcolumns_indexes, const ColumnMetaPB& column_pb) {
1742
0
    if (!column_pb.has_column_path_info()) {
1743
0
        return false;
1744
0
    }
1745
0
    if (column_pb.type() == (int)FieldType::OLAP_FIELD_TYPE_ARRAY) {
1746
0
        if (column_pb.children_columns_size() == 0) {
1747
0
            return false;
1748
0
        }
1749
0
        const ColumnMetaPB* nested = &column_pb.children_columns(0);
1750
0
        while (nested != nullptr && nested->type() == (int)FieldType::OLAP_FIELD_TYPE_ARRAY) {
1751
0
            if (nested->children_columns_size() == 0) {
1752
0
                return false;
1753
0
            }
1754
0
            nested = &nested->children_columns(0);
1755
0
        }
1756
0
        if (nested == nullptr) {
1757
0
            return false;
1758
0
        }
1759
0
        return inherit_index(parent_indexes, subcolumns_indexes, (FieldType)nested->type(),
1760
0
                             column_pb.column_path_info().path(), true);
1761
0
    }
1762
0
    return inherit_index(parent_indexes, subcolumns_indexes, (FieldType)column_pb.type(),
1763
0
                         column_pb.column_path_info().path());
1764
0
}
1765
1766
// ============ Implementation from parse2column.cpp ============
1767
1768
/** Pool for objects that cannot be used from different threads simultaneously.
1769
  * Allows to create an object for each thread.
1770
  * Pool has unbounded size and objects are not destroyed before destruction of pool.
1771
  *
1772
  * Use it in cases when thread local storage is not appropriate
1773
  *  (when maximum number of simultaneously used objects is less
1774
  *   than number of running/sleeping threads, that has ever used object,
1775
  *   and creation/destruction of objects is expensive).
1776
  */
1777
template <typename T>
1778
class SimpleObjectPool {
1779
protected:
1780
    /// Hold all available objects in stack.
1781
    std::mutex mutex;
1782
    std::stack<std::unique_ptr<T>> stack;
1783
    /// Specialized deleter for std::unique_ptr.
1784
    /// Returns underlying pointer back to stack thus reclaiming its ownership.
1785
    struct Deleter {
1786
        SimpleObjectPool<T>* parent;
1787
16.9k
        Deleter(SimpleObjectPool<T>* parent_ = nullptr) : parent {parent_} {} /// NOLINT
1788
16.9k
        void operator()(T* owning_ptr) const {
1789
16.9k
            std::lock_guard lock {parent->mutex};
1790
16.9k
            parent->stack.emplace(owning_ptr);
1791
16.9k
        }
1792
    };
1793
1794
public:
1795
    using Pointer = std::unique_ptr<T, Deleter>;
1796
    /// Extracts and returns a pointer from the stack if it's not empty,
1797
    ///  creates a new one by calling provided f() otherwise.
1798
    template <typename Factory>
1799
16.9k
    Pointer get(Factory&& f) {
1800
16.9k
        std::unique_lock lock(mutex);
1801
16.9k
        if (stack.empty()) {
1802
33
            return {f(), this};
1803
33
        }
1804
16.9k
        auto object = stack.top().release();
1805
16.9k
        stack.pop();
1806
16.9k
        return std::unique_ptr<T, Deleter>(object, Deleter(this));
1807
16.9k
    }
variant_util.cpp:_ZN5doris12variant_util16SimpleObjectPoolINS_14JSONDataParserINS_14SimdJSONParserEEEE3getIZNS0_21parse_json_to_variantERNS_7IColumnERKNS_9StringRefEPS4_RKNS_11ParseConfigEE3$_0EESt10unique_ptrIS4_NS5_7DeleterEEOT_
Line
Count
Source
1799
12.4k
    Pointer get(Factory&& f) {
1800
12.4k
        std::unique_lock lock(mutex);
1801
12.4k
        if (stack.empty()) {
1802
1
            return {f(), this};
1803
1
        }
1804
12.4k
        auto object = stack.top().release();
1805
12.4k
        stack.pop();
1806
12.4k
        return std::unique_ptr<T, Deleter>(object, Deleter(this));
1807
12.4k
    }
variant_util.cpp:_ZN5doris12variant_util16SimpleObjectPoolINS_14JSONDataParserINS_14SimdJSONParserEEEE3getIZNS0_21parse_json_to_variantERNS_7IColumnERKNS_9ColumnStrIjEERKNS_11ParseConfigEE3$_0EESt10unique_ptrIS4_NS5_7DeleterEEOT_
Line
Count
Source
1799
4.52k
    Pointer get(Factory&& f) {
1800
4.52k
        std::unique_lock lock(mutex);
1801
4.52k
        if (stack.empty()) {
1802
32
            return {f(), this};
1803
32
        }
1804
4.49k
        auto object = stack.top().release();
1805
4.49k
        stack.pop();
1806
4.49k
        return std::unique_ptr<T, Deleter>(object, Deleter(this));
1807
4.52k
    }
1808
    /// Like get(), but creates object using default constructor.
1809
    Pointer getDefault() {
1810
        return get([] { return new T; });
1811
    }
1812
};
1813
1814
SimpleObjectPool<JsonParser> parsers_pool;
1815
1816
using Node = typename ColumnVariant::Subcolumns::Node;
1817
1818
42.5M
static inline void append_binary_bytes(ColumnString::Chars& chars, const void* data, size_t size) {
1819
42.5M
    const auto old_size = chars.size();
1820
42.5M
    chars.resize(old_size + size);
1821
42.5M
    memcpy(chars.data() + old_size, reinterpret_cast<const char*>(data), size);
1822
42.5M
}
1823
1824
17.6M
static inline void append_binary_type(ColumnString::Chars& chars, FieldType type) {
1825
17.6M
    const uint8_t t = static_cast<uint8_t>(type);
1826
17.6M
    append_binary_bytes(chars, &t, sizeof(uint8_t));
1827
17.6M
}
1828
1829
10.8M
static inline void append_binary_sizet(ColumnString::Chars& chars, size_t v) {
1830
10.8M
    append_binary_bytes(chars, &v, sizeof(size_t));
1831
10.8M
}
1832
1833
17.6M
static void append_field_to_binary_chars(const Field& field, ColumnString::Chars& chars) {
1834
17.6M
    switch (field.get_type()) {
1835
14
    case PrimitiveType::TYPE_NULL: {
1836
14
        append_binary_type(chars, FieldType::OLAP_FIELD_TYPE_NONE);
1837
14
        return;
1838
0
    }
1839
261k
    case PrimitiveType::TYPE_BOOLEAN: {
1840
261k
        append_binary_type(chars,
1841
261k
                           TabletColumn::get_field_type_by_type(PrimitiveType::TYPE_BOOLEAN));
1842
261k
        const auto v = static_cast<UInt8>(field.get<PrimitiveType::TYPE_BOOLEAN>());
1843
261k
        append_binary_bytes(chars, &v, sizeof(UInt8));
1844
261k
        return;
1845
0
    }
1846
4.45M
    case PrimitiveType::TYPE_BIGINT: {
1847
4.45M
        append_binary_type(chars, TabletColumn::get_field_type_by_type(PrimitiveType::TYPE_BIGINT));
1848
4.45M
        const auto v = field.get<PrimitiveType::TYPE_BIGINT>();
1849
4.45M
        append_binary_bytes(chars, &v, sizeof(Int64));
1850
4.45M
        return;
1851
0
    }
1852
9
    case PrimitiveType::TYPE_LARGEINT: {
1853
9
        append_binary_type(chars,
1854
9
                           TabletColumn::get_field_type_by_type(PrimitiveType::TYPE_LARGEINT));
1855
9
        const auto v = field.get<PrimitiveType::TYPE_LARGEINT>();
1856
9
        append_binary_bytes(chars, &v, sizeof(int128_t));
1857
9
        return;
1858
0
    }
1859
2.75M
    case PrimitiveType::TYPE_DOUBLE: {
1860
2.75M
        append_binary_type(chars, TabletColumn::get_field_type_by_type(PrimitiveType::TYPE_DOUBLE));
1861
2.75M
        const auto v = field.get<PrimitiveType::TYPE_DOUBLE>();
1862
2.75M
        append_binary_bytes(chars, &v, sizeof(Float64));
1863
2.75M
        return;
1864
0
    }
1865
10.2M
    case PrimitiveType::TYPE_STRING: {
1866
10.2M
        append_binary_type(chars, FieldType::OLAP_FIELD_TYPE_STRING);
1867
10.2M
        const auto& v = field.get<PrimitiveType::TYPE_STRING>();
1868
10.2M
        append_binary_sizet(chars, v.size());
1869
10.2M
        append_binary_bytes(chars, v.data(), v.size());
1870
10.2M
        return;
1871
0
    }
1872
46.7k
    case PrimitiveType::TYPE_JSONB: {
1873
46.7k
        append_binary_type(chars, FieldType::OLAP_FIELD_TYPE_JSONB);
1874
46.7k
        const auto& v = field.get<PrimitiveType::TYPE_JSONB>();
1875
46.7k
        append_binary_sizet(chars, v.get_size());
1876
46.7k
        append_binary_bytes(chars, v.get_value(), v.get_size());
1877
46.7k
        return;
1878
0
    }
1879
528k
    case PrimitiveType::TYPE_ARRAY: {
1880
528k
        append_binary_type(chars, FieldType::OLAP_FIELD_TYPE_ARRAY);
1881
528k
        const auto& a = field.get<PrimitiveType::TYPE_ARRAY>();
1882
528k
        append_binary_sizet(chars, a.size());
1883
791k
        for (const auto& elem : a) {
1884
791k
            append_field_to_binary_chars(elem, chars);
1885
791k
        }
1886
528k
        return;
1887
0
    }
1888
0
    default:
1889
0
        throw doris::Exception(ErrorCode::INVALID_ARGUMENT, "Unsupported field type {}",
1890
0
                               field.get_type());
1891
17.6M
    }
1892
17.6M
}
1893
template <typename ParserImpl>
1894
void parse_json_to_variant_impl(IColumn& column, const char* src, size_t length,
1895
1.35M
                                JSONDataParser<ParserImpl>* parser, const ParseConfig& config) {
1896
1.35M
    auto& column_variant = assert_cast<ColumnVariant&>(column);
1897
1.35M
    std::optional<ParseResult> result;
1898
    /// Treat empty string as an empty object
1899
    /// for better CAST from String to Object.
1900
1.35M
    if (length > 0) {
1901
1.35M
        result = parser->parse(src, length, config);
1902
1.35M
    } else {
1903
3.16k
        result = ParseResult {};
1904
3.16k
    }
1905
1.35M
    if (!result) {
1906
664
        VLOG_DEBUG << "failed to parse " << std::string_view(src, length) << ", length= " << length;
1907
664
        if (config::variant_throw_exeception_on_invalid_json) {
1908
0
            throw doris::Exception(ErrorCode::INVALID_ARGUMENT, "Failed to parse object {}",
1909
0
                                   std::string_view(src, length));
1910
0
        }
1911
        // Treat as string
1912
664
        PathInData root_path;
1913
664
        Field field = Field::create_field<TYPE_STRING>(String(src, length));
1914
664
        result = ParseResult {{root_path}, {field}};
1915
664
    }
1916
1.35M
    auto& [paths, values] = *result;
1917
1.35M
    assert(paths.size() == values.size());
1918
1.35M
    size_t old_num_rows = column_variant.rows();
1919
1.35M
    if (config.deprecated_enable_flatten_nested) {
1920
        // here we should check the paths in variant and paths in result,
1921
        // if two paths which same prefix have different structure, we should throw an exception
1922
3.02k
        std::vector<PathInData> check_paths;
1923
12.0k
        for (const auto& entry : column_variant.get_subcolumns()) {
1924
12.0k
            check_paths.push_back(entry->path);
1925
12.0k
        }
1926
3.02k
        check_paths.insert(check_paths.end(), paths.begin(), paths.end());
1927
3.02k
        THROW_IF_ERROR(check_variant_has_no_ambiguous_paths(check_paths));
1928
3.02k
    }
1929
1.35M
    auto [doc_value_data_paths, doc_value_data_values] =
1930
1.35M
            column_variant.get_doc_value_data_paths_and_values();
1931
1.35M
    auto& doc_value_data_offsets = column_variant.serialized_doc_value_column_offsets();
1932
1933
1.35M
    auto flush_defaults = [](ColumnVariant::Subcolumn* subcolumn) {
1934
1.35M
        const auto num_defaults = subcolumn->cur_num_of_defaults();
1935
1.35M
        if (num_defaults > 0) {
1936
104k
            subcolumn->insert_many_defaults(num_defaults);
1937
104k
            subcolumn->reset_current_num_of_defaults();
1938
104k
        }
1939
1.35M
    };
1940
1941
1.35M
    auto is_plain_path = [](const PathInData& path) {
1942
13
        for (const auto& part : path.get_parts()) {
1943
13
            if (part.is_nested || part.anonymous_array_level != 0) {
1944
0
                return false;
1945
0
            }
1946
13
        }
1947
9
        return true;
1948
9
    };
1949
1950
1.35M
    auto get_or_create_subcolumn = [&](const PathInData& path, size_t index_hint,
1951
1.35M
                                       const FieldInfo& field_info) -> ColumnVariant::Subcolumn* {
1952
1.35M
        auto* subcolumn = column_variant.get_subcolumn(path, index_hint);
1953
1.35M
        if (subcolumn == nullptr) {
1954
2.67k
            if (path.has_nested_part()) {
1955
17
                column_variant.add_nested_subcolumn(path, field_info, old_num_rows);
1956
2.65k
            } else {
1957
2.65k
                column_variant.add_sub_column(path, old_num_rows);
1958
2.65k
            }
1959
2.67k
            subcolumn = column_variant.get_subcolumn(path, index_hint);
1960
2.67k
        }
1961
1.35M
        if (!subcolumn) {
1962
0
            throw doris::Exception(ErrorCode::INVALID_ARGUMENT, "Failed to find sub column {}",
1963
0
                                   path.get_path());
1964
0
        }
1965
1.35M
        return subcolumn;
1966
1.35M
    };
1967
1968
1.35M
    auto normalize_plain_path = [&](const PathInData& path) {
1969
1.35M
        if (!config.check_duplicate_json_path || path.empty() || !is_plain_path(path)) {
1970
1.35M
            return path;
1971
1.35M
        }
1972
9
        return PathInData(path.get_path());
1973
1.35M
    };
1974
1975
1.35M
    auto insert_into_subcolumn = [&](size_t i,
1976
1.35M
                                     bool check_size_mismatch) -> ColumnVariant::Subcolumn* {
1977
1.35M
        FieldInfo field_info;
1978
1.35M
        get_field_info(values[i], &field_info);
1979
1.35M
        if (field_info.scalar_type_id == PrimitiveType::INVALID_TYPE) {
1980
104
            return nullptr;
1981
104
        }
1982
1.35M
        auto path = normalize_plain_path(paths[i]);
1983
1.35M
        auto* subcolumn = get_or_create_subcolumn(path, i, field_info);
1984
1.35M
        flush_defaults(subcolumn);
1985
1.35M
        if (check_size_mismatch && subcolumn->size() != old_num_rows) {
1986
1
            throw doris::Exception(ErrorCode::INVALID_ARGUMENT,
1987
1
                                   "subcolumn {} size missmatched, may contains duplicated entry",
1988
1
                                   path.get_path());
1989
1
        }
1990
1.35M
        subcolumn->insert(std::move(values[i]), std::move(field_info));
1991
1.35M
        return subcolumn;
1992
1.35M
    };
1993
1994
1.35M
    switch (config.parse_to) {
1995
80.1k
    case ParseConfig::ParseTo::OnlySubcolumns:
1996
1.43M
        for (size_t i = 0; i < paths.size(); ++i) {
1997
1.35M
            insert_into_subcolumn(i, true);
1998
1.35M
        }
1999
80.1k
        break;
2000
1.27M
    case ParseConfig::ParseTo::OnlyDocValueColumn: {
2001
1.27M
        std::vector<size_t> doc_item_indexes;
2002
1.27M
        doc_item_indexes.reserve(paths.size());
2003
1.27M
        phmap::flat_hash_set<StringRef, StringRefHash> seen_paths;
2004
1.27M
        seen_paths.reserve(paths.size());
2005
2006
19.0M
        for (size_t i = 0; i < paths.size(); ++i) {
2007
17.7M
            FieldInfo field_info;
2008
17.7M
            get_field_info(values[i], &field_info);
2009
17.7M
            if (paths[i].empty()) {
2010
                // Plain non-doc VARIANT can use doc-value KV as writer-side staging. An
2011
                // invalid root entry from JSON object/array is neither a scalar root value nor
2012
                // a doc KV path, so leave this row's doc offset empty. Doc-mode and valid scalar
2013
                // roots still populate the root subcolumn below.
2014
798
                if (!column_variant.enable_doc_mode() &&
2015
798
                    field_info.scalar_type_id == PrimitiveType::INVALID_TYPE) {
2016
3
                    continue;
2017
3
                }
2018
795
                auto* subcolumn = column_variant.get_subcolumn(paths[i]);
2019
795
                DCHECK(subcolumn != nullptr);
2020
795
                flush_defaults(subcolumn);
2021
795
                subcolumn->insert(std::move(values[i]), std::move(field_info));
2022
795
                continue;
2023
798
            }
2024
17.7M
            if (field_info.scalar_type_id == PrimitiveType::INVALID_TYPE ||
2025
17.7M
                values[i].get_type() == PrimitiveType::TYPE_NULL) {
2026
116k
                continue;
2027
116k
            }
2028
17.6M
            const auto& path_str = paths[i].get_path();
2029
17.6M
            StringRef path_ref {path_str.data(), path_str.size()};
2030
17.6M
            if (UNLIKELY(!seen_paths.emplace(path_ref).second)) {
2031
2
                throw doris::Exception(ErrorCode::INVALID_ARGUMENT,
2032
2
                                       "may contains duplicated entry : {}",
2033
2
                                       std::string_view(path_str));
2034
2
            }
2035
17.6M
            doc_item_indexes.push_back(i);
2036
17.6M
        }
2037
2038
1.27M
        std::sort(doc_item_indexes.begin(), doc_item_indexes.end(),
2039
70.7M
                  [&](size_t l, size_t r) { return paths[l].get_path() < paths[r].get_path(); });
2040
15.9M
        for (const auto idx : doc_item_indexes) {
2041
15.9M
            const auto& path_str = paths[idx].get_path();
2042
15.9M
            doc_value_data_paths->insert_data(path_str.data(), path_str.size());
2043
15.9M
            auto& chars = doc_value_data_values->get_chars();
2044
15.9M
            append_field_to_binary_chars(values[idx], chars);
2045
15.9M
            doc_value_data_values->get_offsets().push_back(chars.size());
2046
15.9M
        }
2047
1.27M
    } break;
2048
1.35M
    }
2049
1.35M
    doc_value_data_offsets.push_back(doc_value_data_paths->size());
2050
    // /// Insert default values to missed subcolumns.
2051
1.35M
    const auto& subcolumns = column_variant.get_subcolumns();
2052
3.50M
    for (const auto& entry : subcolumns) {
2053
3.50M
        if (entry->data.size() == old_num_rows) {
2054
            // Handle nested paths differently from simple paths
2055
2.15M
            if (entry->path.has_nested_part()) {
2056
                // Try to insert default from nested, if failed, insert regular default
2057
0
                bool success = UNLIKELY(column_variant.try_insert_default_from_nested(entry));
2058
0
                if (!success) {
2059
0
                    entry->data.insert_default();
2060
0
                }
2061
2.15M
            } else {
2062
                // For non-nested paths, increment default counter
2063
2.15M
                entry->data.increment_default_counter();
2064
2.15M
            }
2065
2.15M
        }
2066
3.50M
    }
2067
1.35M
    column_variant.incr_num_rows();
2068
1.35M
    if (column_variant.get_sparse_column()->size() == old_num_rows) {
2069
1.35M
        column_variant.get_sparse_column_mutable().insert_default();
2070
1.35M
    }
2071
1.35M
#ifndef NDEBUG
2072
1.35M
    column_variant.check_consistency();
2073
1.35M
#endif
2074
1.35M
}
2075
2076
// exposed interfaces
2077
// Parses a single JSON document into `column`.
//
// column: destination column, forwarded unchanged to parse_json_to_variant_impl.
// json:   the raw JSON bytes (pointer + length).
// parser: optional caller-owned parser; when it is null a parser is taken from
//         `parsers_pool` for the duration of this call only.
// config: parse options forwarded to the implementation.
void parse_json_to_variant(IColumn& column, const StringRef& json, JsonParser* parser,
                           const ParseConfig& config) {
    // Fast path: the caller supplied its own parser, use it as is.
    if (parser) {
        parse_json_to_variant_impl(column, json.data, json.size, parser, config);
        return;
    }

    // No parser given: take one from the shared pool. The handle stays alive until the
    // end of this function, i.e. for the whole parse.
    auto borrowed_parser = parsers_pool.get([] { return new JsonParser(); });
    parse_json_to_variant_impl(column, json.data, json.size, borrowed_parser.get(), config);
}
2086
2087
// Parses every row of `raw_json_column` as a JSON document into `column`, then
// finalizes `column`.
//
// One parser is taken from `parsers_pool` and reused for all rows.
void parse_json_to_variant(IColumn& column, const ColumnString& raw_json_column,
                           const ParseConfig& config) {
    auto pooled_parser = parsers_pool.get([] { return new JsonParser(); });

    for (size_t row = 0; row < raw_json_column.size(); ++row) {
        const StringRef document = raw_json_column.get_data_at(row);
        parse_json_to_variant_impl(column, document.data, document.size, pooled_parser.get(),
                                   config);
    }

    // Finalize once, after all rows have been appended.
    column.finalize();
}
2096
2097
// parse the doc snapshot column to subcolumns
2098
0
// Expands the doc snapshot (doc-value) data of `column_variant` into regular subcolumns.
//
// Every path produced by materialize_docs_to_subcolumns_map is finalized and registered
// on the variant via add_sub_column; finally the variant itself is finalized.
// Throws doris::Exception(INTERNAL_ERROR) when a subcolumn cannot be added.
void materialize_docs_to_subcolumns(ColumnVariant& column_variant) {
    auto path_to_subcolumn = materialize_docs_to_subcolumns_map(column_variant);

    for (auto& [path, subcolumn] : path_to_subcolumn) {
        // The finalized column pointer and least common type are read right after this.
        subcolumn.finalize();

        if (!column_variant.add_sub_column(PathInData(path),
                                           IColumn::mutate(subcolumn.get_finalized_column_ptr()),
                                           subcolumn.get_least_common_type())) {
            throw doris::Exception(ErrorCode::INTERNAL_ERROR,
                                   "Failed to add subcolumn {}, which is from doc snapshot column",
                                   path);
        }
    }

    column_variant.finalize();
}
2114
2115
// ============ Implementation from variant_util.cpp ============
2116
2117
// Builds one Subcolumn per distinct path found in the doc-value data of `variant`.
//
// variant:               source column; its doc-value paths/values and per-row offsets are read.
// expected_unique_paths: reserve hint for the result map; when 0 the hint falls back to
//                        min(number of stored path entries, 8192).
//
// Returns a map from path to Subcolumn in which every subcolumn has exactly one entry per
// row: rows that do not contain the path are filled with defaults.
// The map keys are string_views built from the bytes returned by the path column's
// get_data_at(), so they do not own their data.
phmap::flat_hash_map<std::string_view, ColumnVariant::Subcolumn> materialize_docs_to_subcolumns_map(
        const ColumnVariant& variant, size_t expected_unique_paths) {
    constexpr size_t kInitialPathReserve = 8192;

    const auto [column_key, column_value] = variant.get_doc_value_data_paths_and_values();
    const auto& column_offsets = variant.serialized_doc_value_column_offsets();
    const size_t num_rows = column_offsets.size();

    DCHECK_EQ(num_rows, variant.size()) << "doc snapshot offsets size mismatch with variant rows";

    phmap::flat_hash_map<std::string_view, ColumnVariant::Subcolumn> subcolumns;
    size_t reserve_hint = expected_unique_paths;
    if (reserve_hint == 0) {
        reserve_hint = std::min<size_t>(column_key->size(), kInitialPathReserve);
    }
    subcolumns.reserve(reserve_hint);

    for (size_t row = 0; row < num_rows; ++row) {
        // Entries of this row live in [begin_entry, end_entry).
        // NOTE(review): for row 0 this indexes with (row - 1); it relies on the offsets
        // container accepting index -1 -- confirm against the offsets type.
        const size_t begin_entry = column_offsets[row - 1];
        const size_t end_entry = column_offsets[row];

        for (size_t entry = begin_entry; entry < end_entry; ++entry) {
            const auto& raw_key = column_key->get_data_at(entry);
            const std::string_view path(raw_key.data, raw_key.size);

            auto [slot, is_new] =
                    subcolumns.try_emplace(path, ColumnVariant::Subcolumn {0, true, false});
            auto& target = slot->second;

            // Bring the subcolumn up to `row` entries before appending this row's value:
            // a new path is back-filled for all earlier rows, an existing one only for
            // the rows it skipped.
            const size_t missing_rows = is_new ? row : row - target.size();
            if (is_new || missing_rows != 0) {
                target.insert_many_defaults(missing_rows);
            }

            target.deserialize_from_binary_column(column_value, entry);
        }
    }

    // Pad paths that were absent from the trailing rows.
    for (auto& path_and_subcolumn : subcolumns) {
        auto& target = path_and_subcolumn.second;
        if (target.size() != num_rows) {
            target.insert_many_defaults(num_rows - target.size());
        }
    }

    return subcolumns;
}
2159
2160
// Parses the scalar variant columns of `block` listed in `variant_pos`.
//
// block:       block whose columns are replaced in place.
// variant_pos: positions inside `block` of the variant columns to process.
// configs:     configs[i] is the parse configuration used for variant_pos[i].
//
// For each position the (optionally nullable) variant column is finalized. Columns that
// are not scalar variants are treated as already parsed and left untouched. Otherwise:
//   * a JSONB root is first cast to a string column;
//   * a string root is parsed as JSON into a freshly created ColumnVariant;
//   * any other root type only has its root node type adjusted.
// The result is re-wrapped with the original null map when the input was nullable and
// written back into `block`. Returns the error of cast_column if the JSONB cast fails.
Status _parse_and_materialize_variant_columns(Block& block,
                                              const std::vector<uint32_t>& variant_pos,
                                              const std::vector<ParseConfig>& configs) {
    for (size_t i = 0; i < variant_pos.size(); ++i) {
        auto column_ref = block.get_by_position(variant_pos[i]).column;
        const bool is_nullable = column_ref->is_nullable();
        MutableColumnPtr owner_column = std::move(*column_ref).mutate();

        // Split a nullable column into its null map and the nested variant column.
        ColumnPtr nullable_null_map;
        MutableColumnPtr var_column;
        if (!is_nullable) {
            var_column = std::move(owner_column);
        } else {
            const auto& nullable = assert_cast<const ColumnNullable&>(*owner_column);
            nullable_null_map = nullable.get_null_map_column_ptr();
            var_column = std::move(*nullable.get_nested_column_ptr()).mutate();
        }

        auto& var = assert_cast<ColumnVariant&>(*var_column);
        var_column->finalize();

        if (!var.is_scalar_variant()) {
            // already parsed
            continue;
        }

        VLOG_DEBUG << "parse scalar variant column: " << var.get_root_type()->get_name();

        // Obtain the non-nullable root column that holds the scalar payload.
        ColumnPtr scalar_root_column;
        if (var.get_root_type()->get_primitive_type() == TYPE_JSONB) {
            // TODO more efficient way to parse jsonb type, currently we just convert jsonb to
            // json str and parse them into variant
            RETURN_IF_ERROR(cast_column({var.get_root(), var.get_root_type(), ""},
                                        var.get_root()->is_nullable()
                                                ? make_nullable(std::make_shared<DataTypeString>())
                                                : std::make_shared<DataTypeString>(),
                                        &scalar_root_column));
            if (scalar_root_column->is_nullable()) {
                scalar_root_column = assert_cast<const ColumnNullable*>(scalar_root_column.get())
                                             ->get_nested_column_ptr();
            }
        } else {
            const auto& root = *var.get_root();
            if (root.is_nullable()) {
                scalar_root_column =
                        assert_cast<const ColumnNullable&>(root).get_nested_column_ptr();
            } else {
                scalar_root_column = var.get_root();
            }
        }

        MutableColumnPtr variant_column;
        if (scalar_root_column->is_column_string()) {
            // String root: parse each row as JSON into a new variant column.
            variant_column = ColumnVariant::create(0, var.enable_doc_mode());
            parse_json_to_variant(*variant_column.get(),
                                  assert_cast<const ColumnString&>(*scalar_root_column),
                                  configs[i]);
        } else {
            // Root maybe other types rather than string like ColumnVariant(Int32).
            // In this case, we should finalize the root and cast to JSON type
            auto expected_root_type =
                    make_nullable(std::make_shared<ColumnVariant::MostCommonType>());
            var.ensure_root_node_type(expected_root_type);
            variant_column = std::move(var_column);
        }

        // Wrap variant with nullmap if it is nullable
        ColumnPtr result = variant_column->get_ptr();
        if (is_nullable) {
            result = ColumnNullable::create(result, nullable_null_map);
        }
        block.get_by_position(variant_pos[i]).column = result;
    }
    return Status::OK();
}
2230
2231
// Public entry point taking explicit per-column parse configs.
//
// Forwards to _parse_and_materialize_variant_columns inside RETURN_IF_CATCH_EXCEPTION
// (presumably so that exceptions thrown while parsing surface as a Status -- see the
// macro definition). `variant_pos` holds positions inside `block`; configs[i] belongs to
// variant_pos[i].
Status parse_and_materialize_variant_columns(Block& block, const std::vector<uint32_t>& variant_pos,
                                             const std::vector<ParseConfig>& configs) {
    RETURN_IF_CATCH_EXCEPTION(
            { return _parse_and_materialize_variant_columns(block, variant_pos, configs); });
}
2236
2237
namespace {
2238
2239
// Chooses where the parsed JSON of a storage VARIANT column should be staged.
//
// Decision order (first match wins):
//   1. nested group enabled              -> OnlySubcolumns
//   2. doc mode enabled                  -> OnlyDocValueColumn
//   3. deprecated flatten-nested enabled -> OnlySubcolumns
//   4. otherwise                         -> OnlyDocValueColumn
ParseConfig::ParseTo select_storage_variant_parse_target(const TabletColumn& column,
                                                         const ParseConfig& config) {
    using Target = ParseConfig::ParseTo;

    // NestedGroup builds its nested storage structures from the parse-time subcolumn
    // tree, so it cannot be routed through doc-value staging.
    if (column.variant_enable_nested_group()) {
        return Target::OnlySubcolumns;
    }

    // In persistent doc mode the doc-value bucket columns belong to VariantDocWriter.
    // This stays a separate case from the plain non-doc staging optimization, even when
    // typed paths or parent indexes exist.
    if (column.variant_enable_doc_mode()) {
        return Target::OnlyDocValueColumn;
    }

    // Deprecated flatten-nested still needs parse-time subcolumns. Everything else --
    // including predefined typed paths and parent inverted indexes -- goes through regular
    // doc-value staging: typed paths are forced into the materialized set unless
    // typed-to-sparse is enabled, and materialized dynamic subcolumns inherit parent
    // indexes while sparse payloads stay unindexed. For plain dynamic non-doc VARIANT this
    // avoids eagerly creating thousands of parse-time subcolumns; the segment writer picks
    // the materialized/sparse split from the doc-value KV staging.
    return config.deprecated_enable_flatten_nested ? Target::OnlySubcolumns
                                                   : Target::OnlyDocValueColumn;
}
2265
2266
} // namespace
2267
2268
// Parses all VARIANT columns of `block` using settings derived from `tablet_schema`.
//
// block:         block whose i-th column corresponds to schema column column_pos[i].
// tablet_schema: schema used to detect variant columns and to derive their parse config.
// column_pos:    for every block column, its position in `tablet_schema`.
//
// Two parallel lists are collected for the variant columns: the position inside the
// block (used to fetch/replace the column) and the position inside the schema (used to
// read column properties). They differ whenever `column_pos` is not the identity mapping.
//
// Returns OK when there is nothing to parse, an InternalError when a collected schema
// position is not a variant column, or the status of the underlying parse.
Status parse_and_materialize_variant_columns(Block& block, const TabletSchema& tablet_schema,
                                             const std::vector<uint32_t>& column_pos) {
    std::vector<uint32_t> variant_column_pos;
    std::vector<uint32_t> variant_schema_pos;
    variant_column_pos.reserve(column_pos.size());
    variant_schema_pos.reserve(column_pos.size());
    for (size_t block_pos = 0; block_pos < column_pos.size(); ++block_pos) {
        const uint32_t schema_pos = column_pos[block_pos];
        const auto& column = tablet_schema.column(schema_pos);
        if (column.is_variant_type()) {
            // The block is addressed by block position, the schema by schema position.
            // Storing schema_pos in both lists would access the wrong block column.
            variant_column_pos.push_back(static_cast<uint32_t>(block_pos));
            variant_schema_pos.push_back(schema_pos);
        }
    }

    if (variant_column_pos.empty()) {
        return Status::OK();
    }

    std::vector<ParseConfig> configs(variant_column_pos.size());
    for (size_t i = 0; i < variant_column_pos.size(); ++i) {
        // Deprecated legacy flatten-nested switch. Distinct from variant_enable_nested_group.
        configs[i].deprecated_enable_flatten_nested =
                tablet_schema.deprecated_variant_flatten_nested();
        configs[i].check_duplicate_json_path = config::variant_enable_duplicate_json_path_check;
        const auto& column = tablet_schema.column(variant_schema_pos[i]);
        if (!column.is_variant_type()) {
            return Status::InternalError("column is not variant type, column name: {}",
                                         column.name());
        }
        // Must come after the flags above: the selection reads
        // deprecated_enable_flatten_nested from configs[i].
        configs[i].parse_to = select_storage_variant_parse_target(column, configs[i]);
    }

    RETURN_IF_ERROR(parse_and_materialize_variant_columns(block, variant_column_pos, configs));
    return Status::OK();
}
2304
2305
} // namespace doris::variant_util