Coverage Report

Created: 2026-06-12 08:09

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
be/src/exec/common/variant_util.cpp
Line
Count
Source
1
// Licensed to the Apache Software Foundation (ASF) under one
2
// or more contributor license agreements.  See the NOTICE file
3
// distributed with this work for additional information
4
// regarding copyright ownership.  The ASF licenses this file
5
// to you under the Apache License, Version 2.0 (the
6
// "License"); you may not use this file except in compliance
7
// with the License.  You may obtain a copy of the License at
8
//
9
//   http://www.apache.org/licenses/LICENSE-2.0
10
//
11
// Unless required by applicable law or agreed to in writing,
12
// software distributed under the License is distributed on an
13
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14
// KIND, either express or implied.  See the License for the
15
// specific language governing permissions and limitations
16
// under the License.
17
18
#include "exec/common/variant_util.h"
19
20
#include <fmt/format.h>
21
#include <gen_cpp/FrontendService.h>
22
#include <gen_cpp/FrontendService_types.h>
23
#include <gen_cpp/HeartbeatService_types.h>
24
#include <gen_cpp/MasterService_types.h>
25
#include <gen_cpp/Status_types.h>
26
#include <gen_cpp/Types_types.h>
27
#include <glog/logging.h>
28
#include <rapidjson/document.h>
29
#include <rapidjson/stringbuffer.h>
30
#include <rapidjson/writer.h>
31
#include <simdjson/simdjson.h> // IWYU pragma: keep
32
#include <unicode/uchar.h>
33
34
#include <algorithm>
35
#include <cassert>
36
#include <cstddef>
37
#include <cstdint>
38
#include <cstring>
39
#include <list>
40
#include <memory>
41
#include <mutex>
42
#include <optional>
43
#include <ostream>
44
#include <ranges>
45
#include <set>
46
#include <stack>
47
#include <string>
48
#include <string_view>
49
#include <unordered_map>
50
#include <utility>
51
#include <vector>
52
53
#include "common/config.h"
54
#include "common/status.h"
55
#include "core/assert_cast.h"
56
#include "core/block/block.h"
57
#include "core/block/column_numbers.h"
58
#include "core/block/column_with_type_and_name.h"
59
#include "core/column/column.h"
60
#include "core/column/column_array.h"
61
#include "core/column/column_map.h"
62
#include "core/column/column_nullable.h"
63
#include "core/column/column_string.h"
64
#include "core/column/column_variant.h"
65
#include "core/data_type/data_type.h"
66
#include "core/data_type/data_type_array.h"
67
#include "core/data_type/data_type_factory.hpp"
68
#include "core/data_type/data_type_jsonb.h"
69
#include "core/data_type/data_type_nullable.h"
70
#include "core/data_type/data_type_string.h"
71
#include "core/data_type/data_type_variant.h"
72
#include "core/data_type/define_primitive_type.h"
73
#include "core/data_type/get_least_supertype.h"
74
#include "core/data_type/primitive_type.h"
75
#include "core/field.h"
76
#include "core/typeid_cast.h"
77
#include "core/types.h"
78
#include "exec/common/field_visitors.h"
79
#include "exec/common/sip_hash.h"
80
#include "exprs/function/function.h"
81
#include "exprs/function/simple_function_factory.h"
82
#include "exprs/function_context.h"
83
#include "exprs/json_functions.h"
84
#include "re2/re2.h"
85
#include "runtime/exec_env.h"
86
#include "runtime/runtime_state.h"
87
#include "storage/olap_common.h"
88
#include "storage/rowset/beta_rowset.h"
89
#include "storage/rowset/rowset.h"
90
#include "storage/rowset/rowset_fwd.h"
91
#include "storage/segment/segment_loader.h"
92
#include "storage/segment/variant/nested_group_path.h"
93
#include "storage/segment/variant/variant_column_reader.h"
94
#include "storage/segment/variant/variant_column_writer_impl.h"
95
#include "storage/tablet/tablet.h"
96
#include "storage/tablet/tablet_fwd.h"
97
#include "storage/tablet/tablet_schema.h"
98
#include "util/client_cache.h"
99
#include "util/defer_op.h"
100
#include "util/json/json_parser.h"
101
#include "util/json/path_in_data.h"
102
#include "util/json/simd_json_parser.h"
103
104
namespace doris::variant_util {
105
106
2.82k
/// Appends `ch` to `*regex_output`. When `ch` is one of the characters
/// . ^ $ + * ? ( ) | { } [ ] \ a backslash is appended first; every other
/// character is appended unchanged.
inline void append_escaped_regex_char(std::string* regex_output, char ch) {
    static constexpr std::string_view kEscapedChars = ".^$+*?()|{}[]\\";
    if (kEscapedChars.find(ch) != std::string_view::npos) {
        regex_output->push_back('\\');
    }
    regex_output->push_back(ch);
}
130
131
// Small LRU to cap compiled glob patterns
132
constexpr size_t kGlobRegexCacheCapacity = 256;
133
134
struct GlobRegexCacheEntry {
135
    std::shared_ptr<RE2> re2;
136
    std::list<std::string>::iterator lru_it;
137
};
138
139
static std::mutex g_glob_regex_cache_mutex;
140
static std::list<std::string> g_glob_regex_cache_lru;
141
static std::unordered_map<std::string, GlobRegexCacheEntry> g_glob_regex_cache;
142
143
209k
std::shared_ptr<RE2> get_or_build_re2(const std::string& glob_pattern) {
144
209k
    {
145
209k
        std::lock_guard<std::mutex> lock(g_glob_regex_cache_mutex);
146
209k
        auto it = g_glob_regex_cache.find(glob_pattern);
147
209k
        if (it != g_glob_regex_cache.end()) {
148
209k
            g_glob_regex_cache_lru.splice(g_glob_regex_cache_lru.begin(), g_glob_regex_cache_lru,
149
209k
                                          it->second.lru_it);
150
209k
            return it->second.re2;
151
209k
        }
152
209k
    }
153
171
    std::string regex_pattern;
154
171
    Status st = glob_to_regex(glob_pattern, &regex_pattern);
155
171
    if (!st.ok()) {
156
2
        return nullptr;
157
2
    }
158
169
    auto compiled = std::make_shared<RE2>(regex_pattern);
159
169
    if (!compiled->ok()) {
160
3
        return nullptr;
161
3
    }
162
166
    {
163
166
        std::lock_guard<std::mutex> lock(g_glob_regex_cache_mutex);
164
166
        auto it = g_glob_regex_cache.find(glob_pattern);
165
166
        if (it != g_glob_regex_cache.end()) {
166
0
            g_glob_regex_cache_lru.splice(g_glob_regex_cache_lru.begin(), g_glob_regex_cache_lru,
167
0
                                          it->second.lru_it);
168
0
            return it->second.re2;
169
0
        }
170
166
        g_glob_regex_cache_lru.push_front(glob_pattern);
171
166
        g_glob_regex_cache.emplace(glob_pattern,
172
166
                                   GlobRegexCacheEntry {compiled, g_glob_regex_cache_lru.begin()});
173
166
        if (g_glob_regex_cache.size() > kGlobRegexCacheCapacity) {
174
0
            const std::string& evict_key = g_glob_regex_cache_lru.back();
175
0
            g_glob_regex_cache.erase(evict_key);
176
0
            g_glob_regex_cache_lru.pop_back();
177
0
        }
178
166
    }
179
0
    return compiled;
180
166
}
181
182
// Convert a restricted glob pattern into a regex.
183
// Supported: '*', '?', '[...]', '\\' escape. Others are treated as literals.
184
309
Status glob_to_regex(const std::string& glob_pattern, std::string* regex_pattern) {
185
309
    regex_pattern->clear();
186
309
    regex_pattern->append("^");
187
309
    bool is_escaped = false;
188
309
    size_t pattern_length = glob_pattern.size();
189
3.25k
    for (size_t index = 0; index < pattern_length; ++index) {
190
2.95k
        char current_char = glob_pattern[index];
191
2.95k
        if (is_escaped) {
192
10
            append_escaped_regex_char(regex_pattern, current_char);
193
10
            is_escaped = false;
194
10
            continue;
195
10
        }
196
2.94k
        if (current_char == '\\') {
197
14
            is_escaped = true;
198
14
            continue;
199
14
        }
200
2.93k
        if (current_char == '*') {
201
69
            regex_pattern->append(".*");
202
69
            continue;
203
69
        }
204
2.86k
        if (current_char == '?') {
205
15
            regex_pattern->append(".");
206
15
            continue;
207
15
        }
208
2.84k
        if (current_char == '[') {
209
33
            size_t class_index = index + 1;
210
33
            bool class_closed = false;
211
33
            bool is_class_escaped = false;
212
33
            std::string class_buffer;
213
33
            if (class_index < pattern_length &&
214
33
                (glob_pattern[class_index] == '!' || glob_pattern[class_index] == '^')) {
215
9
                class_buffer.push_back('^');
216
9
                ++class_index;
217
9
            }
218
99
            for (; class_index < pattern_length; ++class_index) {
219
95
                char class_char = glob_pattern[class_index];
220
95
                if (is_class_escaped) {
221
10
                    class_buffer.push_back(class_char);
222
10
                    is_class_escaped = false;
223
10
                    continue;
224
10
                }
225
85
                if (class_char == '\\') {
226
10
                    is_class_escaped = true;
227
10
                    continue;
228
10
                }
229
75
                if (class_char == ']') {
230
29
                    class_closed = true;
231
29
                    break;
232
29
                }
233
46
                class_buffer.push_back(class_char);
234
46
            }
235
33
            if (!class_closed) {
236
4
                return Status::InvalidArgument("Unclosed character class in glob pattern: {}",
237
4
                                               glob_pattern);
238
4
            }
239
29
            regex_pattern->append("[");
240
29
            regex_pattern->append(class_buffer);
241
29
            regex_pattern->append("]");
242
29
            index = class_index;
243
29
            continue;
244
33
        }
245
2.81k
        append_escaped_regex_char(regex_pattern, current_char);
246
2.81k
    }
247
305
    if (is_escaped) {
248
4
        append_escaped_regex_char(regex_pattern, '\\');
249
4
    }
250
305
    regex_pattern->append("$");
251
305
    return Status::OK();
252
309
}
253
254
209k
bool glob_match_re2(const std::string& glob_pattern, const std::string& candidate_path) {
255
209k
    auto compiled = get_or_build_re2(glob_pattern);
256
209k
    if (compiled == nullptr) {
257
5
        return false;
258
5
    }
259
209k
    return RE2::FullMatch(candidate_path, *compiled);
260
209k
}
261
262
// NestedGroup's physical children and offsets are produced by NestedGroupWriteProvider, not by
263
// appending TabletSchema extracted columns here. This predicate keeps only ordinary Variant paths
264
// that are outside the NG tree, for example `v.owner` beside `v.items[*]`.
265
0
bool is_regular_path_outside_nested_group(const PathInData& path) {
266
0
    const std::string& relative_path = path.get_path();
267
0
    return !relative_path.empty() && !path.get_is_typed() && !path.has_nested_part() &&
268
0
           !segment_v2::contains_nested_group_marker(relative_path) &&
269
0
           !segment_v2::is_root_nested_group_path(relative_path) &&
270
0
           relative_path != SPARSE_COLUMN_PATH &&
271
0
           relative_path.find(DOC_VALUE_COLUMN_PATH) == std::string::npos;
272
0
}
273
274
bool should_materialize_nested_group_regular_subcolumns(
275
        const TabletColumnPtr& column,
276
726
        const std::unordered_map<int32_t, VariantExtendedInfo>& uid_to_variant_extended_info) {
277
726
    const auto info_it = uid_to_variant_extended_info.find(column->unique_id());
278
726
    return column->variant_enable_nested_group() ||
279
728
           (info_it != uid_to_variant_extended_info.end() && info_it->second.has_nested_group);
280
726
}
281
282
std::unordered_set<int32_t> collect_nested_group_compaction_root_uids(
283
        const TabletSchemaSPtr& target,
284
11.1k
        const std::unordered_map<int32_t, VariantExtendedInfo>& uid_to_variant_extended_info) {
285
11.1k
    std::unordered_set<int32_t> root_uids;
286
110k
    for (const TabletColumnPtr& column : target->columns()) {
287
110k
        if (column->is_variant_type() && should_materialize_nested_group_regular_subcolumns(
288
726
                                                 column, uid_to_variant_extended_info)) {
289
1
            root_uids.insert(column->unique_id());
290
1
        }
291
110k
    }
292
11.1k
    return root_uids;
293
11.1k
}
294
295
/// Copies from `extended_info.path_to_data_types` only the entries whose path satisfies
/// is_regular_path_outside_nested_group().
PathToDataTypes collect_regular_types_outside_nested_group(
        const VariantExtendedInfo& extended_info) {
    PathToDataTypes kept;
    for (const auto& [path, data_types] : extended_info.path_to_data_types) {
        if (is_regular_path_outside_nested_group(path)) {
            kept.emplace(path, data_types);
        }
    }
    return kept;
}
306
307
960
size_t get_number_of_dimensions(const IDataType& type) {
308
960
    if (const auto* type_array = typeid_cast<const DataTypeArray*>(&type)) {
309
4
        return type_array->get_number_of_dimensions();
310
4
    }
311
956
    return 0;
312
960
}
313
3
size_t get_number_of_dimensions(const IColumn& column) {
314
3
    if (const auto* column_array = check_and_get_column<ColumnArray>(column)) {
315
2
        return column_array->get_number_of_dimensions();
316
2
    }
317
1
    return 0;
318
3
}
319
320
69.8k
/// Walks through (optionally Nullable-wrapped) nested Array types and returns the nested
/// type of the innermost array. When `type` contains no array at all, `type` itself is
/// returned unchanged.
DataTypePtr get_base_type_of_array(const DataTypePtr& type) {
    // Work on raw pointers so that no shared pointers are copied during the walk.
    const auto strip_nullable = [](const IDataType* candidate) -> const IDataType* {
        if (const auto* wrapped = typeid_cast<const DataTypeNullable*>(candidate)) {
            return wrapped->get_nested_type().get();
        }
        return candidate;
    };

    const DataTypeArray* innermost_array = nullptr;
    const IDataType* cursor = strip_nullable(type.get());
    for (const auto* array_type = typeid_cast<const DataTypeArray*>(cursor);
         array_type != nullptr; array_type = typeid_cast<const DataTypeArray*>(cursor)) {
        innermost_array = array_type;
        cursor = strip_nullable(array_type->get_nested_type().get());
    }

    if (innermost_array == nullptr) {
        return type;
    }
    return innermost_array->get_nested_type();
}
336
337
972k
/// Casts `arg` to `type` and stores the resulting full (non-const) column in `*result`.
///
/// - Target is Variant: handled here without the CAST function (see comment below).
/// - Source primitive type is INVALID_TYPE: the result is a column of default values.
/// - Otherwise the "CAST" function is executed; when execution fails a warning is logged
///   and the result is a column of default values. The function still returns OK.
///
/// Returns InternalError only when no CAST function can be resolved.
Status cast_column(const ColumnWithTypeAndName& arg, const DataTypePtr& type, ColumnPtr* result) {
    ColumnsWithTypeAndName arguments {arg, {nullptr, type, type->get_name()}};

    // To prevent from null info lost, we should not call function since the function framework
    // will wrap nullable to Variant instead of the root of Variant
    // correct output: Nullable(Array(int)) -> Nullable(Variant(Nullable(Array(int))))
    // incorrect output: Nullable(Array(int)) -> Nullable(Variant(Array(int)))
    const bool target_is_variant = type->get_primitive_type() == TYPE_VARIANT;
    if (target_is_variant && arg.type->get_primitive_type() == TYPE_VARIANT) {
        // Variant -> Variant: only the nullability of the destination may differ.
        *result = type->is_nullable() ? make_nullable(arg.column) : remove_nullable(arg.column);
        return Status::OK();
    }
    if (target_is_variant) {
        // Install the source column/type as the root of a fresh variant column.
        CHECK(is_column_nullable(*arg.column));
        auto to_type = remove_nullable(type);
        const auto& variant_type = assert_cast<const DataTypeVariant&>(*to_type);
        auto variant = ColumnVariant::create(variant_type.variant_max_subcolumns_count(),
                                             variant_type.enable_doc_mode());

        variant->create_root(arg.type, IColumn::mutate(arg.column));
        // The outer nullable wrapper reuses the null map of the source column.
        ColumnPtr nullable = ColumnNullable::create(
                variant->get_ptr(),
                assert_cast<const ColumnNullable*>(arg.column.get())->get_null_map_column_ptr());
        *result = type->is_nullable() ? nullable : variant->get_ptr();
        return Status::OK();
    }

    auto function = SimpleFunctionFactory::instance().get_function("CAST", arguments, type);
    if (!function) {
        return Status::InternalError("Not found cast function {} to {}", arg.type->get_name(),
                                     type->get_name());
    }
    Block tmp_block {arguments};
    uint32_t result_column = cast_set<uint32_t>(tmp_block.columns());
    RuntimeState state;
    auto ctx = FunctionContext::create_context(&state, {}, {});

    // Fallback result: as many rows as the input, all holding the type's default value.
    const auto assign_default_column = [&]() {
        *result = type->create_column_const_with_default_value(arg.column->size())
                          ->convert_to_full_column_if_const();
    };

    if (arg.type->get_primitive_type() == INVALID_TYPE) {
        // cast from nothing to any type should result in nulls
        assign_default_column();
        return Status::OK();
    }

    // We convert column string to jsonb type just add a string jsonb field to dst column
    // instead of parse each line in original string column.
    ctx->set_string_as_jsonb_string(true);
    ctx->set_jsonb_string_as_string(true);
    tmp_block.insert({nullptr, type, arg.name});
    // TODO(lihangyu): we should handle this error in strict mode
    if (!function->execute(ctx.get(), tmp_block, {0}, result_column, arg.column->size())) {
        LOG_EVERY_N(WARNING, 100) << fmt::format("cast from {} to {}", arg.type->get_name(),
                                                 type->get_name());
        assign_default_column();
        return Status::OK();
    }
    *result = tmp_block.get_by_position(result_column).column->convert_to_full_column_if_const();
    VLOG_DEBUG << fmt::format("{} before convert {}, after convert {}", arg.name,
                              arg.column->get_name(), (*result)->get_name());
    return Status::OK();
}
400
401
void get_column_by_type(const DataTypePtr& data_type, const std::string& name, TabletColumn& column,
402
140k
                        const ExtraInfo& ext_info) {
403
140k
    column.set_name(name);
404
140k
    column.set_type(data_type->get_storage_field_type());
405
140k
    if (ext_info.unique_id >= 0) {
406
4
        column.set_unique_id(ext_info.unique_id);
407
4
    }
408
140k
    if (ext_info.parent_unique_id >= 0) {
409
69.2k
        column.set_parent_unique_id(ext_info.parent_unique_id);
410
69.2k
    }
411
140k
    if (!ext_info.path_info.empty()) {
412
69.1k
        column.set_path_info(ext_info.path_info);
413
69.1k
    }
414
140k
    if (data_type->is_nullable()) {
415
70.3k
        const auto& real_type = static_cast<const DataTypeNullable&>(*data_type);
416
70.3k
        column.set_is_nullable(true);
417
70.3k
        get_column_by_type(real_type.get_nested_type(), name, column, {});
418
70.3k
        return;
419
70.3k
    }
420
70.3k
    if (data_type->get_primitive_type() == PrimitiveType::TYPE_ARRAY) {
421
1.17k
        TabletColumn child;
422
1.17k
        get_column_by_type(assert_cast<const DataTypeArray*>(data_type.get())->get_nested_type(),
423
1.17k
                           "", child, {});
424
1.17k
        column.set_length(TabletColumn::get_field_length_by_type(TPrimitiveType::ARRAY, 0));
425
1.17k
        column.add_sub_column(child);
426
1.17k
        return;
427
1.17k
    }
428
69.2k
    if (data_type->get_primitive_type() == PrimitiveType::TYPE_VARIANT) {
429
0
        const auto* dt_variant = assert_cast<const DataTypeVariant*>(data_type.get());
430
0
        column.set_variant_max_subcolumns_count(dt_variant->variant_max_subcolumns_count());
431
0
        column.set_variant_enable_doc_mode(dt_variant->enable_doc_mode());
432
0
        return;
433
0
    }
434
    // size is not fixed when type is string or json
435
69.2k
    if (is_string_type(data_type->get_primitive_type()) ||
436
69.2k
        data_type->get_primitive_type() == TYPE_JSONB) {
437
15.7k
        column.set_length(INT_MAX);
438
15.7k
        return;
439
15.7k
    }
440
441
53.4k
    PrimitiveType type = data_type->get_primitive_type();
442
53.4k
    if (is_int_or_bool(type) || is_string_type(type) || is_float_or_double(type) || is_ip(type) ||
443
53.4k
        is_date_or_datetime(type) || type == PrimitiveType::TYPE_DATEV2) {
444
53.3k
        column.set_length(cast_set<int32_t>(data_type->get_size_of_value_in_memory()));
445
53.3k
        return;
446
53.3k
    }
447
113
    if (is_decimal(type)) {
448
105
        column.set_precision(data_type->get_precision());
449
105
        column.set_frac(data_type->get_scale());
450
105
        return;
451
105
    }
452
    // datetimev2 needs scale
453
17
    if (type == PrimitiveType::TYPE_DATETIMEV2 || type == PrimitiveType::TYPE_TIMESTAMPTZ) {
454
17
        column.set_precision(-1);
455
17
        column.set_frac(data_type->get_scale());
456
17
        return;
457
17
    }
458
459
18.4E
    throw doris::Exception(doris::ErrorCode::INTERNAL_ERROR,
460
18.4E
                           "unexcepted data column type: {}, column name is: {}",
461
18.4E
                           data_type->get_name(), name);
462
8
}
463
464
/// Convenience overload: builds a fresh TabletColumn for `data_type` / `name` / `ext_info`
/// through the out-parameter overload and returns it by value.
TabletColumn get_column_by_type(const DataTypePtr& data_type, const std::string& name,
                                const ExtraInfo& ext_info) {
    TabletColumn described;
    get_column_by_type(data_type, name, described, ext_info);
    return described;
}
470
471
// check if two paths which same prefix have different structure
472
static bool has_different_structure_in_same_path(const PathInData::Parts& lhs,
473
9.04k
                                                 const PathInData::Parts& rhs) {
474
9.04k
    if (lhs.size() != rhs.size()) {
475
1
        return false; // different size means different structure
476
1
    }
477
    // Since we group by path string, lhs and rhs must have the same size and keys
478
    // We only need to check if they have different nested structure
479
36.1k
    for (size_t i = 0; i < lhs.size(); ++i) {
480
27.0k
        if (lhs[i] != rhs[i]) {
481
5
            VLOG_DEBUG << fmt::format(
482
0
                    "Check different structure: {} vs {}, lhs[i].is_nested: {}, rhs[i].is_nested: "
483
0
                    "{}",
484
0
                    lhs[i].key, rhs[i].key, lhs[i].is_nested, rhs[i].is_nested);
485
5
            return true;
486
5
        }
487
27.0k
    }
488
9.03k
    return false;
489
9.04k
}
490
491
4.75k
/// Verifies that paths sharing the same path string also share the same part structure.
/// Returns DataQualityError for the first conflicting pair found, otherwise OK.
Status check_variant_has_no_ambiguous_paths(const PathsInData& tuple_paths) {
    // Bucket the positions of the paths by path string so that only paths with an
    // identical string are compared against each other.
    std::unordered_map<std::string, std::vector<size_t>> positions_by_path;
    for (size_t position = 0; position < tuple_paths.size(); ++position) {
        const auto& path = tuple_paths[position];
        positions_by_path[path.get_path()].push_back(position);
        VLOG_DEBUG << "tuple_paths[i]: " << path.get_path();
    }

    for (const auto& [path_str, positions] : positions_by_path) {
        // Compare every entry with each entry that precedes it in the bucket; a bucket
        // with a single entry yields no pair.
        for (size_t later = 1; later < positions.size(); ++later) {
            const auto& later_path = tuple_paths[positions[later]];
            for (size_t earlier = 0; earlier < later; ++earlier) {
                const auto& earlier_path = tuple_paths[positions[earlier]];
                if (!has_different_structure_in_same_path(later_path.get_parts(),
                                                          earlier_path.get_parts())) {
                    continue;
                }
                return Status::DataQualityError(
                        "Ambiguous paths: {} vs {} with different nested part {} vs {}",
                        later_path.get_path(), earlier_path.get_path(),
                        later_path.has_nested_part(), earlier_path.has_nested_part());
            }
        }
    }
    return Status::OK();
}
524
525
/// Computes one common type per subcolumn path and appends a matching column for every
/// path to `common_schema`.
///
/// @param subcolumns_types       all types observed for each path (each list non-empty).
/// @param common_schema          schema to append to; must be uniquely owned (CHECKed).
/// @param variant_col_unique_id  unique id of the owning variant column; set as the
///                               parent unique id of every appended column.
/// @param typed_columns          predefined typed sub columns keyed by path without root.
/// @param path_set               optional; receives every appended path when non-null.
/// @return always Status::OK().
///
/// Paths whose observed types disagree on the number of array dimensions fall back to
/// Nullable(JSONB); all other paths use the least supertype, made nullable.
Status update_least_schema_internal(const std::map<PathInData, DataTypes>& subcolumns_types,
                                    TabletSchemaSPtr& common_schema, int32_t variant_col_unique_id,
                                    const std::map<std::string, TabletColumnPtr>& typed_columns,
                                    std::set<PathInData>* path_set) {
    PathsInData tuple_paths;
    DataTypes tuple_types;
    CHECK(common_schema.use_count() == 1);
    // Get the least common type for all paths.
    for (const auto& [key, subtypes] : subcolumns_types) {
        assert(!subtypes.empty());
        if (key.get_path() == ColumnVariant::COLUMN_NAME_DUMMY) {
            continue;
        }
        const size_t first_dim = get_number_of_dimensions(*subtypes[0]);
        tuple_paths.emplace_back(key);

        // Types with different array dimensions cannot be unified: store as JSONB.
        bool fell_back_to_jsonb = false;
        for (size_t i = 1; i < subtypes.size(); ++i) {
            if (first_dim != get_number_of_dimensions(*subtypes[i])) {
                tuple_types.emplace_back(make_nullable(std::make_shared<DataTypeJsonb>()));
                LOG(INFO) << fmt::format(
                        "Uncompatible types of subcolumn '{}': {} and {}, cast to JSONB",
                        key.get_path(), subtypes[0]->get_name(), subtypes[i]->get_name());
                fell_back_to_jsonb = true;
                break;
            }
        }
        if (fell_back_to_jsonb) {
            continue;
        }

        DataTypePtr common_type;
        get_least_supertype_jsonb(subtypes, &common_type);
        if (!common_type->is_nullable()) {
            common_type = make_nullable(common_type);
        }
        tuple_types.emplace_back(common_type);
    }
    CHECK_EQ(tuple_paths.size(), tuple_types.size());

    // Append all common type columns of this variant
    for (size_t i = 0; i < tuple_paths.size(); ++i) {
        const auto& path = tuple_paths[i];
        TabletColumn common_column;
        // typed path not contains root part
        const auto path_without_root = path.copy_pop_front().get_path();
        const auto typed_it = typed_columns.find(path_without_root);
        if (typed_it != typed_columns.end() && !path.has_nested_part()) {
            common_column = *typed_it->second;
            // parent unique id and path may not be init in write path
            common_column.set_parent_unique_id(variant_col_unique_id);
            common_column.set_path_info(path);
            common_column.set_name(path.get_path());
        } else {
            get_column_by_type(tuple_types[i], path.get_path(), common_column,
                               ExtraInfo {.unique_id = -1,
                                          .parent_unique_id = variant_col_unique_id,
                                          .path_info = path});
        }
        common_schema->append_column(common_column);
        if (path_set != nullptr) {
            path_set->insert(path);
        }
    }
    return Status::OK();
}
586
587
/// Gathers, from all `schemas`, the types of the subcolumns that belong to the variant
/// column `variant_col_unique_id`, rejects ambiguous paths, and then appends one column
/// per path to `common_schema` via update_least_schema_internal().
Status update_least_common_schema(const std::vector<TabletSchemaSPtr>& schemas,
                                  TabletSchemaSPtr& common_schema, int32_t variant_col_unique_id,
                                  std::set<PathInData>* path_set) {
    // Sub columns already declared on the variant column, keyed by their name.
    std::map<std::string, TabletColumnPtr> typed_columns;
    const auto& declared_sub_columns =
            common_schema->column_by_uid(variant_col_unique_id).get_sub_columns();
    for (const TabletColumnPtr& sub_column : declared_sub_columns) {
        typed_columns[sub_column->name()] = sub_column;
    }

    // Types of subcolumns by path from all tuples.
    std::map<PathInData, DataTypes> subcolumns_types;
    // Every collected path (duplicates included) for the batch conflict check.
    std::vector<PathInData> all_paths;

    for (const TabletSchemaSPtr& schema : schemas) {
        for (const TabletColumnPtr& col : schema->columns()) {
            // Keep only subcolumns of this variant.
            const bool belongs_to_variant = col->has_path_info() && col->parent_unique_id() >= 0 &&
                                            col->parent_unique_id() == variant_col_unique_id;
            if (!belongs_to_variant) {
                continue;
            }
            auto& types_for_path = subcolumns_types[*col->path_info_ptr()];
            types_for_path.emplace_back(
                    DataTypeFactory::instance().create_data_type(*col, col->is_nullable()));
            all_paths.push_back(*col->path_info_ptr());
        }
    }

    // Batch check for conflicts
    RETURN_IF_ERROR(check_variant_has_no_ambiguous_paths(all_paths));

    return update_least_schema_internal(subcolumns_types, common_schema, variant_col_unique_id,
                                        typed_columns, path_set);
}
619
620
// Keep variant subcolumn BF support aligned with FE DDL checks.
621
77.0k
bool is_bf_supported_by_fe_for_variant_subcolumn(FieldType type) {
622
77.0k
    switch (type) {
623
91
    case FieldType::OLAP_FIELD_TYPE_SMALLINT:
624
484
    case FieldType::OLAP_FIELD_TYPE_INT:
625
50.5k
    case FieldType::OLAP_FIELD_TYPE_BIGINT:
626
50.6k
    case FieldType::OLAP_FIELD_TYPE_LARGEINT:
627
50.6k
    case FieldType::OLAP_FIELD_TYPE_CHAR:
628
50.6k
    case FieldType::OLAP_FIELD_TYPE_VARCHAR:
629
66.3k
    case FieldType::OLAP_FIELD_TYPE_STRING:
630
66.3k
    case FieldType::OLAP_FIELD_TYPE_DATE:
631
66.3k
    case FieldType::OLAP_FIELD_TYPE_DATETIME:
632
66.5k
    case FieldType::OLAP_FIELD_TYPE_DATEV2:
633
66.7k
    case FieldType::OLAP_FIELD_TYPE_DATETIMEV2:
634
66.7k
    case FieldType::OLAP_FIELD_TYPE_TIMESTAMPTZ:
635
66.7k
    case FieldType::OLAP_FIELD_TYPE_DECIMAL:
636
66.8k
    case FieldType::OLAP_FIELD_TYPE_DECIMAL32:
637
66.9k
    case FieldType::OLAP_FIELD_TYPE_DECIMAL64:
638
67.2k
    case FieldType::OLAP_FIELD_TYPE_DECIMAL128I:
639
67.3k
    case FieldType::OLAP_FIELD_TYPE_DECIMAL256:
640
67.5k
    case FieldType::OLAP_FIELD_TYPE_IPV4:
641
67.6k
    case FieldType::OLAP_FIELD_TYPE_IPV6:
642
67.6k
        return true;
643
9.39k
    default:
644
9.39k
        return false;
645
77.0k
    }
646
77.0k
}
647
648
void inherit_column_attributes(const TabletColumn& source, TabletColumn& target,
649
77.0k
                               TabletSchemaSPtr* target_schema) {
650
77.0k
    if (!target.is_extracted_column()) {
651
0
        return;
652
0
    }
653
77.0k
    target.set_aggregation_method(source.aggregation());
654
655
    // 1. bloom filter
656
77.0k
    if (is_bf_supported_by_fe_for_variant_subcolumn(target.type())) {
657
67.6k
        target.set_is_bf_column(source.is_bf_column());
658
67.6k
    }
659
660
77.0k
    if (!target_schema) {
661
71.5k
        return;
662
71.5k
    }
663
664
    // 2. inverted index
665
5.48k
    TabletIndexes indexes_to_add;
666
5.48k
    auto source_indexes = (*target_schema)->inverted_indexs(source.unique_id());
667
    // if target is variant type, we need to inherit all indexes
668
    // because this schema is a read schema from fe
669
5.48k
    if (target.is_variant_type()) {
670
4.59k
        for (auto& index : source_indexes) {
671
407
            auto index_info = std::make_shared<TabletIndex>(*index);
672
407
            index_info->set_escaped_escaped_index_suffix_path(target.path_info_ptr()->get_path());
673
407
            indexes_to_add.emplace_back(std::move(index_info));
674
407
        }
675
4.59k
    } else {
676
884
        inherit_index(source_indexes, indexes_to_add, target);
677
884
    }
678
5.48k
    auto target_indexes = (*target_schema)
679
5.48k
                                  ->inverted_indexs(target.parent_unique_id(),
680
5.48k
                                                    target.path_info_ptr()->get_path());
681
5.53k
    if (target_indexes.empty()) {
682
5.53k
        for (auto& index_info : indexes_to_add) {
683
415
            (*target_schema)->append_index(std::move(*index_info));
684
415
        }
685
5.53k
    }
686
687
    // 3. TODO: gnragm bf index
688
5.48k
}
689
690
7.42k
// Walks all columns of `schema` and, for every extracted column whose parent column is
// still present in the schema, inherits the parent's attributes (adding index meta that
// the extracted column is missing).
void inherit_column_attributes(TabletSchemaSPtr& schema) {
    for (size_t pos = 0; pos < schema->num_columns(); ++pos) {
        TabletColumn& column = schema->mutable_column(pos);
        // Regular columns are skipped; so are extracted columns whose parent cannot be
        // found (the parent may have been dropped).
        const bool has_live_parent = column.is_extracted_column() &&
                                     schema->field_index(column.parent_unique_id()) != -1;
        if (!has_live_parent) {
            continue;
        }
        const TabletColumn& parent = schema->column_by_uid(column.parent_unique_id());
        inherit_column_attributes(parent, column, &schema);
    }
}
704
705
// Builds `output_schema` as the least common schema over `schemas`.
//
// The starting point is `base_schema` when it is non-null, otherwise the element of
// `schemas` with the highest schema version. Extracted columns of that starting schema
// are dropped, then update_least_common_schema() is run for every variant column and the
// extracted columns inherit their parent's attributes.
//
// Returns:
//  - InternalError if `base_schema` is null and `schemas` is empty (no schema to start from);
//  - any error reported by update_least_common_schema();
//  - DataQualityError if `check_schema_size` is set and the merged schema has more columns
//    than config::variant_max_merged_tablet_schema_size;
//  - OK otherwise.
Status get_least_common_schema(const std::vector<TabletSchemaSPtr>& schemas,
                               const TabletSchemaSPtr& base_schema, TabletSchemaSPtr& output_schema,
                               bool check_schema_size) {
    std::vector<int32_t> variant_column_unique_id;
    // Construct a schema excluding the extracted columns and gather unique identifiers for variants.
    // Ensure that the output schema also excludes these extracted columns. This approach prevents
    // duplicated paths following the update_least_common_schema process.
    auto build_schema_without_extracted_columns = [&](const TabletSchemaSPtr& source_schema) {
        output_schema = std::make_shared<TabletSchema>();
        // Columns are not copied here, only the remaining schema attributes.
        output_schema->shawdow_copy_without_columns(*source_schema);
        // Keep every non-extracted column and remember the unique id of each variant column.
        for (const TabletColumnPtr& col : source_schema->columns()) {
            if (col->is_variant_type()) {
                variant_column_unique_id.push_back(col->unique_id());
            }
            if (!col->is_extracted_column()) {
                output_schema->append_column(*col);
            }
        }
    };
    if (base_schema == nullptr) {
        // std::max_element returns end() for an empty range; dereferencing it would be
        // undefined behavior, so report the missing input instead.
        if (schemas.empty()) {
            return Status::InternalError(
                    "Can not get least common schema: base schema is null and no input schema");
        }
        // Pick tablet schema with max schema version. The comparator takes references so
        // that comparisons do not touch the shared_ptr reference counts.
        const auto max_version_it =
                std::max_element(schemas.cbegin(), schemas.cend(),
                                 [](const TabletSchemaSPtr& a, const TabletSchemaSPtr& b) {
                                     return a->schema_version() < b->schema_version();
                                 });
        // Deliberately a copy: the builder below reassigns `output_schema`, and holding our
        // own reference keeps the picked schema valid regardless of what the caller passed.
        const TabletSchemaSPtr max_version_schema = *max_version_it;
        CHECK(max_version_schema);
        build_schema_without_extracted_columns(max_version_schema);
    } else {
        // use input base_schema schema as base schema
        build_schema_without_extracted_columns(base_schema);
    }

    for (int32_t unique_id : variant_column_unique_id) {
        std::set<PathInData> path_set;
        RETURN_IF_ERROR(update_least_common_schema(schemas, output_schema, unique_id, &path_set));
    }

    inherit_column_attributes(output_schema);
    if (check_schema_size &&
        output_schema->columns().size() > config::variant_max_merged_tablet_schema_size) {
        return Status::DataQualityError("Reached max column size limit {}",
                                        config::variant_max_merged_tablet_schema_size);
    }

    return Status::OK();
}
754
755
// sort by paths in lexicographical order
756
2.06k
ColumnVariant::Subcolumns get_sorted_subcolumns(const ColumnVariant::Subcolumns& subcolumns) {
757
    // sort by paths in lexicographical order
758
2.06k
    ColumnVariant::Subcolumns sorted = subcolumns;
759
1.11M
    std::sort(sorted.begin(), sorted.end(), [](const auto& lhsItem, const auto& rhsItem) {
760
1.11M
        return lhsItem->path < rhsItem->path;
761
1.11M
    });
762
2.06k
    return sorted;
763
2.06k
}
764
765
bool has_schema_index_diff(const TabletSchema* new_schema, const TabletSchema* old_schema,
766
16.4k
                           int32_t new_col_idx, int32_t old_col_idx) {
767
16.4k
    const auto& column_new = new_schema->column(new_col_idx);
768
16.4k
    const auto& column_old = old_schema->column(old_col_idx);
769
770
16.4k
    if (column_new.is_bf_column() != column_old.is_bf_column()) {
771
94
        return true;
772
94
    }
773
774
16.3k
    auto new_schema_inverted_indexs = new_schema->inverted_indexs(column_new);
775
16.3k
    auto old_schema_inverted_indexs = old_schema->inverted_indexs(column_old);
776
777
16.3k
    if (new_schema_inverted_indexs.size() != old_schema_inverted_indexs.size()) {
778
706
        return true;
779
706
    }
780
781
16.0k
    for (size_t i = 0; i < new_schema_inverted_indexs.size(); ++i) {
782
390
        if (!new_schema_inverted_indexs[i]->is_same_except_id(old_schema_inverted_indexs[i])) {
783
19
            return true;
784
19
        }
785
390
    }
786
787
15.6k
    return false;
788
15.6k
}
789
790
2.69k
// Creates the sparse column that belongs to `variant`: a MAP column named
// "<variant lower-case name>.<SPARSE_COLUMN_PATH>" with two STRING sub columns, the
// variant's aggregation method and the variant's unique id as parent id.
TabletColumn create_sparse_column(const TabletColumn& variant) {
    const std::string sparse_name = variant.name_lower_case() + "." + SPARSE_COLUMN_PATH;

    TabletColumn sparse;
    sparse.set_name(sparse_name);
    sparse.set_type(FieldType::OLAP_FIELD_TYPE_MAP);
    sparse.set_aggregation_method(variant.aggregation());
    sparse.set_path_info(PathInData {sparse_name});
    sparse.set_parent_unique_id(variant.unique_id());
    // The default value is "NULL"; DefaultColumnIterator will call insert_many_defaults.
    sparse.set_default_value("NULL");

    // The same STRING column definition is added twice as sub column.
    TabletColumn string_child;
    string_child.set_type(FieldType::OLAP_FIELD_TYPE_STRING);
    sparse.add_sub_column(string_child);
    sparse.add_sub_column(string_child);
    return sparse;
}
805
806
18.6k
// Creates the sparse shard column of `variant` for bucket `bucket_index`: a MAP column
// named "<variant lower-case name>.<SPARSE_COLUMN_PATH>.b<bucket_index>" with two STRING
// sub columns, default value "NULL", the variant's aggregation method and the variant's
// unique id as parent id.
TabletColumn create_sparse_shard_column(const TabletColumn& variant, int bucket_index) {
    const std::string shard_name = variant.name_lower_case() + "." + SPARSE_COLUMN_PATH + ".b" +
                                   std::to_string(bucket_index);

    TabletColumn shard;
    shard.set_name(shard_name);
    shard.set_type(FieldType::OLAP_FIELD_TYPE_MAP);
    shard.set_aggregation_method(variant.aggregation());
    shard.set_parent_unique_id(variant.unique_id());
    shard.set_default_value("NULL");
    PathInData shard_path(shard_name);
    shard.set_path_info(shard_path);

    // The same STRING column definition is added twice as sub column.
    TabletColumn string_child;
    string_child.set_type(FieldType::OLAP_FIELD_TYPE_STRING);
    shard.add_sub_column(string_child);
    shard.add_sub_column(string_child);
    return shard;
}
823
824
8.69k
// Creates the doc value column of `variant` for bucket `bucket_index`: a MAP column named
// "<variant lower-case name>.<DOC_VALUE_COLUMN_PATH>.b<bucket_index>" with two STRING sub
// columns, default value "NULL", the variant's aggregation method and the variant's unique
// id as parent id.
TabletColumn create_doc_value_column(const TabletColumn& variant, int bucket_index) {
    const std::string doc_value_name = variant.name_lower_case() + "." + DOC_VALUE_COLUMN_PATH +
                                       ".b" + std::to_string(bucket_index);

    TabletColumn doc_value;
    doc_value.set_name(doc_value_name);
    doc_value.set_type(FieldType::OLAP_FIELD_TYPE_MAP);
    doc_value.set_aggregation_method(variant.aggregation());
    doc_value.set_parent_unique_id(variant.unique_id());
    doc_value.set_default_value("NULL");
    doc_value.set_path_info(PathInData {doc_value_name});

    // The same STRING column definition is added twice as sub column.
    TabletColumn string_child;
    string_child.set_type(FieldType::OLAP_FIELD_TYPE_STRING);
    doc_value.add_sub_column(string_child);
    doc_value.add_sub_column(string_child);
    return doc_value;
}
841
842
154k
uint32_t variant_binary_shard_of(const StringRef& path, uint32_t bucket_num) {
843
154k
    if (bucket_num <= 1) return 0;
844
147k
    SipHash hash;
845
147k
    hash.update(path.data, path.size);
846
147k
    uint64_t h = hash.get64();
847
147k
    return static_cast<uint32_t>(h % bucket_num);
848
154k
}
849
850
// Adds the per-path non-null counts of every segment of rowset `rs` into
// `uid_to_path_stats`, keyed by the unique id of the variant column. Both the sparse
// column stats and the subcolumn stats of a segment contribute to the same per-path sum.
// Columns that are not variants, have a negative unique id, or are excluded by
// should_check_variant_path_stats() are ignored. Returns the first error hit while loading
// segments, fetching a column reader or loading the external meta.
Status VariantCompactionUtil::aggregate_path_to_stats(
        const RowsetSharedPtr& rs,
        std::unordered_map<int32_t, PathToNoneNullValues>* uid_to_path_stats) {
    SegmentCacheHandle segment_cache;
    RETURN_IF_ERROR(SegmentLoader::instance()->load_segments(
            std::static_pointer_cast<BetaRowset>(rs), &segment_cache));

    for (const auto& column : rs->tablet_schema()->columns()) {
        const bool eligible = column->is_variant_type() && column->unique_id() >= 0 &&
                              should_check_variant_path_stats(*column);
        if (!eligible) {
            continue;
        }
        const auto uid = column->unique_id();
        for (const auto& segment : segment_cache.get_segments()) {
            std::shared_ptr<ColumnReader> column_reader;
            OlapReaderStatistics stats;
            RETURN_IF_ERROR(segment->get_column_reader(uid, &column_reader, &stats));
            if (column_reader == nullptr) {
                continue;
            }

            CHECK(column_reader->get_meta_type() == FieldType::OLAP_FIELD_TYPE_VARIANT);
            auto* variant_column_reader =
                    assert_cast<segment_v2::VariantColumnReader*>(column_reader.get());
            // The external meta has to be loaded before the stats are read.
            RETURN_IF_ERROR(variant_column_reader->load_external_meta_once());
            const auto* source_stats = variant_column_reader->get_stats();
            CHECK(source_stats);

            // Sum path -> size into the entry of this column. The entry is only created
            // once there is at least one path to add.
            const auto add_sizes = [&](const auto& path_sizes) {
                for (const auto& [path, size] : path_sizes) {
                    (*uid_to_path_stats)[uid][path] += size;
                }
            };
            add_sizes(source_stats->sparse_column_non_null_size);
            add_sizes(source_stats->subcolumns_non_null_size);
        }
    }
    return Status::OK();
}
893
894
// Collects, for every variant column of rowset `rs`, the information gathered from its
// segments into `uid_to_variant_extended_info` (keyed by column unique id): per-path
// non-null counts and sparse paths, path -> data types, typed paths and nested paths.
// For columns with nested group enabled only `has_nested_group`, the data types and the
// typed paths are filled. Returns the first error hit while loading segments, fetching a
// column reader or loading the external meta.
Status VariantCompactionUtil::aggregate_variant_extended_info(
        const RowsetSharedPtr& rs,
        std::unordered_map<int32_t, VariantExtendedInfo>* uid_to_variant_extended_info) {
    SegmentCacheHandle segment_cache;
    RETURN_IF_ERROR(SegmentLoader::instance()->load_segments(
            std::static_pointer_cast<BetaRowset>(rs), &segment_cache));

    for (const auto& column : rs->tablet_schema()->columns()) {
        if (!column->is_variant_type()) {
            continue;
        }
        // The entry is created for every variant column, even if no segment is visited.
        auto& extended_info = (*uid_to_variant_extended_info)[column->unique_id()];
        const bool nested_group_enabled = column->variant_enable_nested_group();
        if (nested_group_enabled) {
            extended_info.has_nested_group = true;
        }
        for (const auto& segment : segment_cache.get_segments()) {
            std::shared_ptr<ColumnReader> column_reader;
            OlapReaderStatistics stats;
            RETURN_IF_ERROR(
                    segment->get_column_reader(column->unique_id(), &column_reader, &stats));
            if (column_reader == nullptr) {
                continue;
            }

            CHECK(column_reader->get_meta_type() == FieldType::OLAP_FIELD_TYPE_VARIANT);
            auto* variant_column_reader =
                    assert_cast<segment_v2::VariantColumnReader*>(column_reader.get());
            // The external meta has to be loaded before the stats are read.
            RETURN_IF_ERROR(variant_column_reader->load_external_meta_once());
            const auto* source_stats = variant_column_reader->get_stats();
            CHECK(source_stats);

            // 1. path -> non-null count.
            // NG roots still need type metadata for regular subpaths such as `v.owner`,
            // but their compaction schema should not be driven by flat path stats.
            if (!nested_group_enabled) {
                for (const auto& [path, size] : source_stats->sparse_column_non_null_size) {
                    extended_info.path_to_none_null_values[path] += size;
                    extended_info.sparse_paths.emplace(path);
                }
                for (const auto& [path, size] : source_stats->subcolumns_non_null_size) {
                    extended_info.path_to_none_null_values[path] += size;
                }
            }

            // 2. path -> data types.
            variant_column_reader->get_subcolumns_types(&extended_info.path_to_data_types);

            // 3. typed paths.
            variant_column_reader->get_typed_paths(&extended_info.typed_paths);

            // 4. nested paths (skipped for nested group columns).
            if (!nested_group_enabled) {
                variant_column_reader->get_nested_paths(&extended_info.nested_paths);
            }
        }
    }
    return Status::OK();
}
953
954
// get the subpaths and sparse paths for the variant column
955
void VariantCompactionUtil::get_subpaths(int32_t max_subcolumns_count,
956
                                         const PathToNoneNullValues& stats,
957
378
                                         TabletSchema::PathsSetInfo& paths_set_info) {
958
    // max_subcolumns_count is 0 means no limit
959
378
    if (max_subcolumns_count > 0 && stats.size() > max_subcolumns_count) {
960
122
        std::vector<std::pair<size_t, std::string_view>> paths_with_sizes;
961
122
        paths_with_sizes.reserve(stats.size());
962
2.32k
        for (const auto& [path, size] : stats) {
963
2.32k
            paths_with_sizes.emplace_back(size, path);
964
2.32k
        }
965
122
        std::sort(paths_with_sizes.begin(), paths_with_sizes.end(), std::greater());
966
967
        // Select top N paths as subcolumns, remaining paths as sparse columns
968
2.32k
        for (const auto& [size, path] : paths_with_sizes) {
969
2.32k
            if (paths_set_info.sub_path_set.size() < max_subcolumns_count) {
970
377
                paths_set_info.sub_path_set.emplace(path);
971
1.94k
            } else {
972
1.94k
                paths_set_info.sparse_path_set.emplace(path);
973
1.94k
            }
974
2.32k
        }
975
122
        LOG(INFO) << "subpaths " << paths_set_info.sub_path_set.size() << " sparse paths "
976
122
                  << paths_set_info.sparse_path_set.size() << " variant max subcolumns count "
977
122
                  << max_subcolumns_count << " stats size " << paths_with_sizes.size();
978
256
    } else {
979
        // Apply all paths as subcolumns
980
886
        for (const auto& [path, _] : stats) {
981
886
            paths_set_info.sub_path_set.emplace(path);
982
886
        }
983
256
    }
984
378
}
985
986
// Verifies that the per-path non-null counts of the compaction `output` rowset are
// consistent with the counts aggregated from the input rowsets.
//
// The check is skipped (OK is returned) when the output schema has no variant column,
// when any input or output variant column is excluded by should_check_variant_path_stats(),
// when an input schema contains an extracted column, when the tablet is not DUP_KEYS, or
// when an input rowset has a delete predicate. Outside of BE_TEST builds, an extracted
// column in the output schema other than a sparse / doc value column is an InternalError.
//
// For each output variant column (doc mode columns are skipped):
//  - if the number of output paths exceeds variant_max_sparse_column_statistics_size(),
//    the input stats may be inaccurate; only when the output has exactly one segment is
//    each path known to the inputs required to be at least as large as the input count;
//  - otherwise every output path must exist in the inputs with exactly the same count.
Status VariantCompactionUtil::check_path_stats(const std::vector<RowsetSharedPtr>& intputs,
                                               RowsetSharedPtr output, BaseTabletSPtr tablet) {
    if (output->tablet_schema()->num_variant_columns() == 0) {
        return Status::OK();
    }
    // Skip if any input variant column opts out of the path stats check.
    for (const auto& input_rowset : intputs) {
        for (const auto& input_column : input_rowset->tablet_schema()->columns()) {
            if (!input_column->is_variant_type()) {
                continue;
            }
            if (!should_check_variant_path_stats(*input_column)) {
                return Status::OK();
            }
        }
    }
    // Skip if any input rowset schema is an extended schema (has extracted columns).
    for (const auto& input_rowset : intputs) {
        for (const auto& input_column : input_rowset->tablet_schema()->columns()) {
            if (input_column->is_extracted_column()) {
                return Status::OK();
            }
        }
    }
#ifndef BE_TEST
    // The output rowset must not have an extended schema either; only the sparse and doc
    // value columns are tolerated.
    for (const auto& output_column : output->tablet_schema()->columns()) {
        if (!output_column->is_extracted_column()) {
            continue;
        }
        const auto& name = output_column->name();
        const bool is_sparse_or_doc_value_column =
                name.find("." + DOC_VALUE_COLUMN_PATH + ".") != std::string::npos ||
                name.find("." + SPARSE_COLUMN_PATH + ".") != std::string::npos ||
                name.ends_with("." + SPARSE_COLUMN_PATH);
        if (!is_sparse_or_doc_value_column) {
            return Status::InternalError("Unexpected extracted column {} in output rowset",
                                         output_column->name());
        }
    }
#endif
    // Only dup_keys is checked, since rows may be merged in the other key models.
    if (tablet->keys_type() != KeysType::DUP_KEYS) {
        return Status::OK();
    }
    // A delete predicate in any input rowset makes the counts incomparable.
    for (const auto& input_rowset : intputs) {
        if (input_rowset->rowset_meta()->has_delete_predicate()) {
            return Status::OK();
        }
    }
    for (const auto& output_column : output->tablet_schema()->columns()) {
        if (!output_column->is_variant_type()) {
            continue;
        }
        if (!should_check_variant_path_stats(*output_column)) {
            return Status::OK();
        }
    }

    std::unordered_map<int32_t, PathToNoneNullValues> original_uid_to_path_stats;
    for (const auto& input_rowset : intputs) {
        RETURN_IF_ERROR(aggregate_path_to_stats(input_rowset, &original_uid_to_path_stats));
    }
    std::unordered_map<int32_t, PathToNoneNullValues> output_uid_to_path_stats;
    RETURN_IF_ERROR(aggregate_path_to_stats(output, &output_uid_to_path_stats));

    // Hold the schema for the whole comparison so column references stay valid.
    const auto output_schema = output->tablet_schema();
    for (const auto& [uid, stats] : output_uid_to_path_stats) {
        const auto& variant_column = output_schema->column_by_uid(uid);
        if (variant_column.is_variant_type() && variant_column.variant_enable_doc_mode()) {
            continue;
        }
        const auto input_entry = original_uid_to_path_stats.find(uid);
        if (input_entry == original_uid_to_path_stats.end()) {
            return Status::InternalError("Path stats not found for uid {}, tablet_id {}", uid,
                                         tablet->tablet_id());
        }
        const auto& input_stats = input_entry->second;

        // In input rowsets, some rowsets may have statistics values exceeding the maximum
        // limit, which leads to inaccurate statistics.
        const bool input_may_be_inaccurate =
                stats.size() > variant_column.variant_max_sparse_column_statistics_size();
        if (input_may_be_inaccurate) {
            // Only with a single output segment is the size of each output path accurate.
            if (output->num_segments() != 1) {
                continue;
            }
            for (const auto& [path, size] : stats) {
                const auto input_path = input_stats.find(path);
                if (input_path == input_stats.end()) {
                    continue;
                }
                if (input_path->second > size) {
                    return Status::InternalError(
                            "Path stats not smaller for uid {} with path `{}`, input size {}, "
                            "output size {}, tablet_id {}",
                            uid, path, input_path->second, size, tablet->tablet_id());
                }
            }
            continue;
        }

        // Input stats are accurate here, so both the path set and the values must match.
        for (const auto& [path, size] : stats) {
            const auto input_path = input_stats.find(path);
            if (input_path == input_stats.end()) {
                return Status::InternalError(
                        "Path stats not found for uid {}, path {}, tablet_id {}", uid, path,
                        tablet->tablet_id());
            }
            if (input_path->second != size) {
                return Status::InternalError(
                        "Path stats not match for uid {} with path `{}`, input size {}, output "
                        "size {}, tablet_id {}",
                        uid, path, input_path->second, size, tablet->tablet_id());
            }
        }
    }

    return Status::OK();
}
1099
1100
// Materializes the typed paths of a variant column for compaction. For every path in
// `typed_paths` a sub column is generated from `target`, inherits the attributes of
// `parent_column`, is appended to `output_schema` and is recorded in
// `paths_set_info.typed_path_set`. Nothing is done when the parent has
// variant_enable_typed_paths_to_sparse() set. Returns InternalError for the first path
// whose sub column info cannot be generated; columns appended before that stay appended.
Status VariantCompactionUtil::get_compaction_typed_columns(
        const TabletSchemaSPtr& target, const std::unordered_set<std::string>& typed_paths,
        const TabletColumnPtr parent_column, TabletSchemaSPtr& output_schema,
        TabletSchema::PathsSetInfo& paths_set_info) {
    if (parent_column->variant_enable_typed_paths_to_sparse()) {
        return Status::OK();
    }
    for (const auto& typed_path : typed_paths) {
        TabletSchema::SubColumnInfo sub_column_info;
        const bool generated = generate_sub_column_info(*target, parent_column->unique_id(),
                                                        typed_path, &sub_column_info);
        if (!generated) {
            return Status::InternalError("Failed to generate sub column info for path {}",
                                         typed_path);
        }
        inherit_column_attributes(*parent_column, sub_column_info.column);
        output_schema->append_column(sub_column_info.column);
        paths_set_info.typed_path_set.insert({typed_path, std::move(sub_column_info)});
        VLOG_DEBUG << "append typed column " << typed_path;
    }
    return Status::OK();
}
1120
1121
// Materializes the nested paths of a variant column for compaction. For every path in
// `nested_paths` the least common (jsonb) supertype of its data types is computed, a sub
// column "<parent lower-case name>.<path>" of that type is built, inherits the parent's
// attributes and inverted indexes, and is appended to `output_schema`; the inherited
// indexes are stored in `paths_set_info.subcolumn_indexes` under the path. Returns
// InternalError for the first path without any data type in `path_to_data_types`.
Status VariantCompactionUtil::get_compaction_nested_columns(
        const std::unordered_set<PathInData, PathInData::Hash>& nested_paths,
        const PathToDataTypes& path_to_data_types, const TabletColumnPtr parent_column,
        TabletSchemaSPtr& output_schema, TabletSchema::PathsSetInfo& paths_set_info) {
    const auto& parent_indexes = output_schema->inverted_indexs(parent_column->unique_id());
    for (const auto& nested_path : nested_paths) {
        const auto types_it = path_to_data_types.find(nested_path);
        const bool has_types = types_it != path_to_data_types.end() && !types_it->second.empty();
        if (!has_types) {
            return Status::InternalError("Nested path {} has no data type",
                                         nested_path.get_path());
        }
        DataTypePtr least_type;
        get_least_supertype_jsonb(types_it->second, &least_type);

        const std::string column_name =
                parent_column->name_lower_case() + "." + nested_path.get_path();
        // The full path is the parent name followed by the parts of the nested path.
        PathInDataBuilder path_builder;
        auto full_path = path_builder.append(parent_column->name_lower_case(), false)
                                 .append(nested_path.get_parts(), false)
                                 .build();
        TabletColumn nested_column =
                get_column_by_type(least_type, column_name,
                                   ExtraInfo {.unique_id = -1,
                                              .parent_unique_id = parent_column->unique_id(),
                                              .path_info = full_path});
        inherit_column_attributes(*parent_column, nested_column);

        TabletIndexes nested_indexes;
        inherit_index(parent_indexes, nested_indexes, nested_column);
        paths_set_info.subcolumn_indexes.emplace(nested_path.get_path(),
                                                 std::move(nested_indexes));
        output_schema->append_column(nested_column);
        VLOG_DEBUG << "append nested column " << nested_path.get_path();
    }
    return Status::OK();
}
1153
1154
void VariantCompactionUtil::get_compaction_subcolumns_from_subpaths(
1155
        TabletSchema::PathsSetInfo& paths_set_info, const TabletColumnPtr parent_column,
1156
        const TabletSchemaSPtr& target, const PathToDataTypes& path_to_data_types,
1157
366
        const std::unordered_set<std::string>& sparse_paths, TabletSchemaSPtr& output_schema) {
1158
366
    auto& path_set = paths_set_info.sub_path_set;
1159
366
    std::vector<StringRef> sorted_subpaths(path_set.begin(), path_set.end());
1160
366
    std::sort(sorted_subpaths.begin(), sorted_subpaths.end());
1161
366
    const auto& parent_indexes = target->inverted_indexs(parent_column->unique_id());
1162
    // append subcolumns
1163
1.19k
    for (const auto& subpath : sorted_subpaths) {
1164
1.19k
        auto column_name = parent_column->name_lower_case() + "." + subpath.to_string();
1165
1.19k
        auto column_path = PathInData(column_name);
1166
1167
1.19k
        const auto& find_data_types = path_to_data_types.find(PathInData(subpath));
1168
1169
        // some cases: the subcolumn type is variant
1170
        // 1. this path has no data type in segments
1171
        // 2. this path is in sparse paths
1172
        // 3. the sparse paths are too much
1173
1.19k
        TabletSchema::SubColumnInfo sub_column_info;
1174
1.19k
        if (parent_column->variant_enable_typed_paths_to_sparse() &&
1175
1.19k
            generate_sub_column_info(*target, parent_column->unique_id(), std::string(subpath),
1176
71
                                     &sub_column_info)) {
1177
61
            inherit_column_attributes(*parent_column, sub_column_info.column);
1178
61
            output_schema->append_column(sub_column_info.column);
1179
61
            paths_set_info.subcolumn_indexes.emplace(subpath, std::move(sub_column_info.indexes));
1180
61
            VLOG_DEBUG << "append typed column " << subpath;
1181
1.13k
        } else if (find_data_types == path_to_data_types.end() || find_data_types->second.empty() ||
1182
1.13k
                   sparse_paths.find(std::string(subpath)) != sparse_paths.end() ||
1183
1.13k
                   sparse_paths.size() >=
1184
1.00k
                           parent_column->variant_max_sparse_column_statistics_size()) {
1185
130
            TabletColumn subcolumn;
1186
130
            subcolumn.set_name(column_name);
1187
130
            subcolumn.set_type(FieldType::OLAP_FIELD_TYPE_VARIANT);
1188
130
            subcolumn.set_parent_unique_id(parent_column->unique_id());
1189
130
            subcolumn.set_path_info(column_path);
1190
130
            subcolumn.set_aggregation_method(parent_column->aggregation());
1191
130
            subcolumn.set_variant_max_subcolumns_count(
1192
130
                    parent_column->variant_max_subcolumns_count());
1193
130
            subcolumn.set_variant_enable_doc_mode(parent_column->variant_enable_doc_mode());
1194
130
            subcolumn.set_is_nullable(true);
1195
130
            output_schema->append_column(subcolumn);
1196
130
            VLOG_DEBUG << "append sub column " << subpath << " data type "
1197
0
                       << "VARIANT";
1198
130
        }
1199
        // normal case: the subcolumn type can be calculated from the data types in segments
1200
1.00k
        else {
1201
1.00k
            DataTypePtr data_type;
1202
1.00k
            get_least_supertype_jsonb(find_data_types->second, &data_type);
1203
1.00k
            TabletColumn sub_column =
1204
1.00k
                    get_column_by_type(data_type, column_name,
1205
1.00k
                                       ExtraInfo {.unique_id = -1,
1206
1.00k
                                                  .parent_unique_id = parent_column->unique_id(),
1207
1.00k
                                                  .path_info = column_path});
1208
1.00k
            inherit_column_attributes(*parent_column, sub_column);
1209
1.00k
            TabletIndexes sub_column_indexes;
1210
1.00k
            inherit_index(parent_indexes, sub_column_indexes, sub_column);
1211
1.00k
            paths_set_info.subcolumn_indexes.emplace(subpath, std::move(sub_column_indexes));
1212
1.00k
            output_schema->append_column(sub_column);
1213
1.00k
            VLOG_DEBUG << "append sub column " << subpath << " data type " << data_type->get_name();
1214
1.00k
        }
1215
1.19k
    }
1216
366
}
1217
1218
void VariantCompactionUtil::get_compaction_subcolumns_from_data_types(
1219
        TabletSchema::PathsSetInfo& paths_set_info, const TabletColumnPtr parent_column,
1220
        const TabletSchemaSPtr& target, const PathToDataTypes& path_to_data_types,
1221
15
        TabletSchemaSPtr& output_schema) {
1222
15
    const auto& parent_indexes = target->inverted_indexs(parent_column->unique_id());
1223
92
    for (const auto& [path, data_types] : path_to_data_types) {
1224
        // Typed paths are materialized by get_compaction_typed_columns(); this helper only
1225
        // materializes regular subcolumns inferred from rowset data types.
1226
92
        if (data_types.empty() || path.empty() || path.get_is_typed() || path.has_nested_part()) {
1227
12
            continue;
1228
12
        }
1229
80
        DataTypePtr data_type;
1230
80
        get_least_supertype_jsonb(data_types, &data_type);
1231
80
        auto column_name = parent_column->name_lower_case() + "." + path.get_path();
1232
80
        auto column_path = PathInData(column_name);
1233
80
        TabletColumn sub_column =
1234
80
                get_column_by_type(data_type, column_name,
1235
80
                                   ExtraInfo {.unique_id = -1,
1236
80
                                              .parent_unique_id = parent_column->unique_id(),
1237
80
                                              .path_info = column_path});
1238
80
        inherit_column_attributes(*parent_column, sub_column);
1239
80
        TabletIndexes sub_column_indexes;
1240
80
        inherit_index(parent_indexes, sub_column_indexes, sub_column);
1241
80
        paths_set_info.sub_path_set.emplace(path.get_path());
1242
80
        paths_set_info.subcolumn_indexes.emplace(path.get_path(), std::move(sub_column_indexes));
1243
80
        output_schema->append_column(sub_column);
1244
80
        VLOG_DEBUG << "append sub column " << path.get_path() << " data type "
1245
0
                   << data_type->get_name();
1246
80
    }
1247
15
}
1248
1249
// Build the temporary schema for compaction.
1250
// NestedGroup roots are special: the root VARIANT column owns the NG tree and the streaming NG
1251
// writer handles NG children, while regular non-NG paths beside the arrays are materialized as
1252
// ordinary extracted subcolumns. NG typed paths still use get_compaction_typed_columns(), keeping
1253
// typed-column rules out of the NG-specific regular-path filtering.
1254
Status VariantCompactionUtil::get_extended_compaction_schema(
1255
11.1k
        const std::vector<RowsetSharedPtr>& rowsets, TabletSchemaSPtr& target) {
1256
11.1k
    std::unordered_map<int32_t, VariantExtendedInfo> uid_to_variant_extended_info;
1257
11.1k
    const bool needs_variant_extended_info =
1258
109k
            std::ranges::any_of(target->columns(), [](const TabletColumnPtr& column) {
1259
109k
                return column->is_variant_type() && (should_check_variant_path_stats(*column) ||
1260
606
                                                     column->variant_enable_nested_group());
1261
109k
            });
1262
11.1k
    if (needs_variant_extended_info) {
1263
        // collect path stats from all rowsets and segments
1264
4.93k
        for (const auto& rs : rowsets) {
1265
4.93k
            RETURN_IF_ERROR(aggregate_variant_extended_info(rs, &uid_to_variant_extended_info));
1266
4.93k
        }
1267
605
    }
1268
1269
    // build the output schema
1270
11.1k
    TabletSchemaSPtr output_schema = std::make_shared<TabletSchema>();
1271
11.1k
    output_schema->shawdow_copy_without_columns(*target);
1272
11.1k
    std::unordered_map<int32_t, TabletSchema::PathsSetInfo> uid_to_paths_set_info;
1273
11.1k
    const auto ng_root_uids =
1274
11.1k
            collect_nested_group_compaction_root_uids(target, uid_to_variant_extended_info);
1275
111k
    for (const TabletColumnPtr& column : target->columns()) {
1276
111k
        if (!column->is_extracted_column()) {
1277
111k
            output_schema->append_column(*column);
1278
111k
        }
1279
111k
        if (!column->is_variant_type()) {
1280
111k
            continue;
1281
111k
        }
1282
18.4E
        VLOG_DEBUG << "column " << column->name() << " unique id " << column->unique_id();
1283
1284
623
        const auto info_it = uid_to_variant_extended_info.find(column->unique_id());
1285
623
        const VariantExtendedInfo empty_extended_info;
1286
623
        const VariantExtendedInfo& extended_info = info_it == uid_to_variant_extended_info.end()
1287
623
                                                           ? empty_extended_info
1288
623
                                                           : info_it->second;
1289
623
        auto& paths_set_info = uid_to_paths_set_info[column->unique_id()];
1290
623
        const bool use_nested_group_compaction_schema = ng_root_uids.contains(column->unique_id());
1291
1292
623
        if (use_nested_group_compaction_schema) {
1293
            // 1. append typed columns. Keep this shared with the non-NG typed helper; only the
1294
            // regular-path selection below is NG-specific.
1295
1
            RETURN_IF_ERROR(get_compaction_typed_columns(target, extended_info.typed_paths, column,
1296
1
                                                         output_schema, paths_set_info));
1297
1298
            // NG roots do not record path-count stats for ordinary Variant paths, so their regular
1299
            // non-NG subcolumns use the same data-types materialization helper as the
1300
            // all-materialized non-NG branch below.
1301
1
            auto regular_path_to_data_types =
1302
1
                    collect_regular_types_outside_nested_group(extended_info);
1303
1
            get_compaction_subcolumns_from_data_types(paths_set_info, column, target,
1304
1
                                                      regular_path_to_data_types, output_schema);
1305
1
            LOG(INFO) << "Variant column uid=" << column->unique_id()
1306
1
                      << " keeps nested-group root and materializes regular non-NG subcolumns in "
1307
1
                         "compaction schema";
1308
1
            continue;
1309
1
        }
1310
1311
622
        if (column->variant_enable_doc_mode()) {
1312
353
            const int bucket_num = std::max(1, column->variant_doc_hash_shard_count());
1313
1.04k
            for (int b = 0; b < bucket_num; ++b) {
1314
690
                TabletColumn doc_value_bucket_column = create_doc_value_column(*column, b);
1315
690
                doc_value_bucket_column.set_type(FieldType::OLAP_FIELD_TYPE_VARIANT);
1316
690
                doc_value_bucket_column.set_is_nullable(false);
1317
690
                doc_value_bucket_column.set_variant_enable_doc_mode(true);
1318
690
                output_schema->append_column(doc_value_bucket_column);
1319
690
            }
1320
353
            continue;
1321
353
        }
1322
1323
        // 1. append typed columns
1324
269
        RETURN_IF_ERROR(get_compaction_typed_columns(target, extended_info.typed_paths, column,
1325
269
                                                     output_schema, paths_set_info));
1326
1327
        // 2. append nested columns
1328
269
        RETURN_IF_ERROR(get_compaction_nested_columns(extended_info.nested_paths,
1329
269
                                                      extended_info.path_to_data_types, column,
1330
269
                                                      output_schema, paths_set_info));
1331
1332
        // 3. get the subpaths
1333
269
        get_subpaths(column->variant_max_subcolumns_count(), extended_info.path_to_none_null_values,
1334
269
                     paths_set_info);
1335
1336
        // 4. append subcolumns
1337
359
        if (column->variant_max_subcolumns_count() > 0 || !column->get_sub_columns().empty()) {
1338
359
            get_compaction_subcolumns_from_subpaths(paths_set_info, column, target,
1339
359
                                                    extended_info.path_to_data_types,
1340
359
                                                    extended_info.sparse_paths, output_schema);
1341
359
        }
1342
        // variant_max_subcolumns_count == 0 and no typed paths materialized
1343
        // this means that all subcolumns are materialized, possibly from old data
1344
18.4E
        else {
1345
18.4E
            get_compaction_subcolumns_from_data_types(paths_set_info, column, target,
1346
18.4E
                                                      extended_info.path_to_data_types,
1347
18.4E
                                                      output_schema);
1348
18.4E
        }
1349
1350
        // append sparse column(s)
1351
        // If variant uses bucketized sparse columns, append one sparse bucket column per bucket.
1352
        // Otherwise, append the single sparse column.
1353
269
        int bucket_num = std::max(1, column->variant_sparse_hash_shard_count());
1354
269
        if (bucket_num > 1) {
1355
1.17k
            for (int b = 0; b < bucket_num; ++b) {
1356
928
                TabletColumn sparse_bucket_column = create_sparse_shard_column(*column, b);
1357
928
                output_schema->append_column(sparse_bucket_column);
1358
928
            }
1359
244
        } else {
1360
25
            TabletColumn sparse_column = create_sparse_column(*column);
1361
25
            output_schema->append_column(sparse_column);
1362
25
        }
1363
269
    }
1364
1365
11.1k
    target = output_schema;
1366
    // used to merge & filter path to sparse column during reading in compaction
1367
11.1k
    target->set_path_set_info(std::move(uid_to_paths_set_info));
1368
18.4E
    VLOG_DEBUG << "dump schema " << target->dump_full_schema();
1369
11.1k
    return Status::OK();
1370
11.1k
}
1371
1372
// Calculate statistics about variant data paths from the encoded sparse column
1373
void VariantCompactionUtil::calculate_variant_stats(const IColumn& encoded_sparse_column,
1374
                                                    segment_v2::VariantStatisticsPB* stats,
1375
                                                    size_t max_sparse_column_statistics_size,
1376
1.27k
                                                    size_t row_pos, size_t num_rows) {
1377
    // Cast input column to ColumnMap type since sparse column is stored as a map
1378
1.27k
    const auto& map_column = assert_cast<const ColumnMap&>(encoded_sparse_column);
1379
1380
    // Get the keys column which contains the paths as strings
1381
1.27k
    const auto& sparse_data_paths =
1382
1.27k
            assert_cast<const ColumnString*>(map_column.get_keys_ptr().get());
1383
1.27k
    const auto& serialized_sparse_column_offsets = map_column.get_offsets();
1384
1.27k
    auto& count_map = *stats->mutable_sparse_column_non_null_size();
1385
    // Iterate through all paths in the sparse column
1386
511k
    for (size_t i = row_pos; i != row_pos + num_rows; ++i) {
1387
510k
        size_t offset = serialized_sparse_column_offsets[i - 1];
1388
510k
        size_t end = serialized_sparse_column_offsets[i];
1389
1.89M
        for (size_t j = offset; j != end; ++j) {
1390
1.37M
            auto path = sparse_data_paths->get_data_at(j);
1391
1392
1.37M
            const auto& sparse_path = path.to_string();
1393
            // If path already exists in statistics, increment its count
1394
1.37M
            if (auto it = count_map.find(sparse_path); it != count_map.end()) {
1395
1.37M
                ++it->second;
1396
1.37M
            }
1397
            // If path doesn't exist and we haven't hit the max statistics size limit,
1398
            // add it with count 1
1399
1.89k
            else if (count_map.size() < max_sparse_column_statistics_size) {
1400
1.89k
                count_map.emplace(sparse_path, 1);
1401
1.89k
            }
1402
1.37M
        }
1403
510k
    }
1404
1405
1.27k
    if (stats->sparse_column_non_null_size().size() > max_sparse_column_statistics_size) {
1406
0
        throw doris::Exception(
1407
0
                ErrorCode::INTERNAL_ERROR,
1408
0
                "Sparse column non null size: {} is greater than max statistics size: {}",
1409
0
                stats->sparse_column_non_null_size().size(), max_sparse_column_statistics_size);
1410
0
    }
1411
1.27k
}
1412
1413
/// Calculates number of dimensions in array field.
1414
/// Returns 0 for scalar fields.
1415
class FieldVisitorToNumberOfDimensions : public StaticVisitor<size_t> {
1416
public:
1417
    FieldVisitorToNumberOfDimensions() = default;
1418
    template <PrimitiveType T>
1419
24.4M
    size_t apply(const typename PrimitiveTypeTraits<T>::CppType& x) {
1420
24.4M
        if constexpr (T == TYPE_ARRAY) {
1421
2.58M
            const size_t size = x.size();
1422
2.58M
            size_t dimensions = 0;
1423
6.28M
            for (size_t i = 0; i < size; ++i) {
1424
3.69M
                size_t element_dimensions = apply_visitor(*this, x[i]);
1425
3.69M
                dimensions = std::max(dimensions, element_dimensions);
1426
3.69M
            }
1427
2.58M
            return 1 + dimensions;
1428
21.8M
        } else {
1429
21.8M
            return 0;
1430
21.8M
        }
1431
24.4M
    }
_ZN5doris12variant_util32FieldVisitorToNumberOfDimensions5applyILNS_13PrimitiveTypeE1EEEmRKNS_19PrimitiveTypeTraitsIXT_EE7CppTypeE
Line
Count
Source
1419
121k
    size_t apply(const typename PrimitiveTypeTraits<T>::CppType& x) {
1420
        if constexpr (T == TYPE_ARRAY) {
1421
            const size_t size = x.size();
1422
            size_t dimensions = 0;
1423
            for (size_t i = 0; i < size; ++i) {
1424
                size_t element_dimensions = apply_visitor(*this, x[i]);
1425
                dimensions = std::max(dimensions, element_dimensions);
1426
            }
1427
            return 1 + dimensions;
1428
121k
        } else {
1429
121k
            return 0;
1430
121k
        }
1431
121k
    }
_ZN5doris12variant_util32FieldVisitorToNumberOfDimensions5applyILNS_13PrimitiveTypeE26EEEmRKNS_19PrimitiveTypeTraitsIXT_EE7CppTypeE
Line
Count
Source
1419
511
    size_t apply(const typename PrimitiveTypeTraits<T>::CppType& x) {
1420
        if constexpr (T == TYPE_ARRAY) {
1421
            const size_t size = x.size();
1422
            size_t dimensions = 0;
1423
            for (size_t i = 0; i < size; ++i) {
1424
                size_t element_dimensions = apply_visitor(*this, x[i]);
1425
                dimensions = std::max(dimensions, element_dimensions);
1426
            }
1427
            return 1 + dimensions;
1428
511
        } else {
1429
511
            return 0;
1430
511
        }
1431
511
    }
Unexecuted instantiation: _ZN5doris12variant_util32FieldVisitorToNumberOfDimensions5applyILNS_13PrimitiveTypeE42EEEmRKNS_19PrimitiveTypeTraitsIXT_EE7CppTypeE
_ZN5doris12variant_util32FieldVisitorToNumberOfDimensions5applyILNS_13PrimitiveTypeE7EEEmRKNS_19PrimitiveTypeTraitsIXT_EE7CppTypeE
Line
Count
Source
1419
41.9k
    size_t apply(const typename PrimitiveTypeTraits<T>::CppType& x) {
1420
        if constexpr (T == TYPE_ARRAY) {
1421
            const size_t size = x.size();
1422
            size_t dimensions = 0;
1423
            for (size_t i = 0; i < size; ++i) {
1424
                size_t element_dimensions = apply_visitor(*this, x[i]);
1425
                dimensions = std::max(dimensions, element_dimensions);
1426
            }
1427
            return 1 + dimensions;
1428
41.9k
        } else {
1429
41.9k
            return 0;
1430
41.9k
        }
1431
41.9k
    }
Unexecuted instantiation: _ZN5doris12variant_util32FieldVisitorToNumberOfDimensions5applyILNS_13PrimitiveTypeE12EEEmRKNS_19PrimitiveTypeTraitsIXT_EE7CppTypeE
Unexecuted instantiation: _ZN5doris12variant_util32FieldVisitorToNumberOfDimensions5applyILNS_13PrimitiveTypeE11EEEmRKNS_19PrimitiveTypeTraitsIXT_EE7CppTypeE
_ZN5doris12variant_util32FieldVisitorToNumberOfDimensions5applyILNS_13PrimitiveTypeE25EEEmRKNS_19PrimitiveTypeTraitsIXT_EE7CppTypeE
Line
Count
Source
1419
428
    size_t apply(const typename PrimitiveTypeTraits<T>::CppType& x) {
1420
        if constexpr (T == TYPE_ARRAY) {
1421
            const size_t size = x.size();
1422
            size_t dimensions = 0;
1423
            for (size_t i = 0; i < size; ++i) {
1424
                size_t element_dimensions = apply_visitor(*this, x[i]);
1425
                dimensions = std::max(dimensions, element_dimensions);
1426
            }
1427
            return 1 + dimensions;
1428
428
        } else {
1429
428
            return 0;
1430
428
        }
1431
428
    }
_ZN5doris12variant_util32FieldVisitorToNumberOfDimensions5applyILNS_13PrimitiveTypeE2EEEmRKNS_19PrimitiveTypeTraitsIXT_EE7CppTypeE
Line
Count
Source
1419
332k
    size_t apply(const typename PrimitiveTypeTraits<T>::CppType& x) {
1420
        if constexpr (T == TYPE_ARRAY) {
1421
            const size_t size = x.size();
1422
            size_t dimensions = 0;
1423
            for (size_t i = 0; i < size; ++i) {
1424
                size_t element_dimensions = apply_visitor(*this, x[i]);
1425
                dimensions = std::max(dimensions, element_dimensions);
1426
            }
1427
            return 1 + dimensions;
1428
332k
        } else {
1429
332k
            return 0;
1430
332k
        }
1431
332k
    }
_ZN5doris12variant_util32FieldVisitorToNumberOfDimensions5applyILNS_13PrimitiveTypeE3EEEmRKNS_19PrimitiveTypeTraitsIXT_EE7CppTypeE
Line
Count
Source
1419
1.05k
    size_t apply(const typename PrimitiveTypeTraits<T>::CppType& x) {
1420
        if constexpr (T == TYPE_ARRAY) {
1421
            const size_t size = x.size();
1422
            size_t dimensions = 0;
1423
            for (size_t i = 0; i < size; ++i) {
1424
                size_t element_dimensions = apply_visitor(*this, x[i]);
1425
                dimensions = std::max(dimensions, element_dimensions);
1426
            }
1427
            return 1 + dimensions;
1428
1.05k
        } else {
1429
1.05k
            return 0;
1430
1.05k
        }
1431
1.05k
    }
_ZN5doris12variant_util32FieldVisitorToNumberOfDimensions5applyILNS_13PrimitiveTypeE4EEEmRKNS_19PrimitiveTypeTraitsIXT_EE7CppTypeE
Line
Count
Source
1419
1.04k
    size_t apply(const typename PrimitiveTypeTraits<T>::CppType& x) {
1420
        if constexpr (T == TYPE_ARRAY) {
1421
            const size_t size = x.size();
1422
            size_t dimensions = 0;
1423
            for (size_t i = 0; i < size; ++i) {
1424
                size_t element_dimensions = apply_visitor(*this, x[i]);
1425
                dimensions = std::max(dimensions, element_dimensions);
1426
            }
1427
            return 1 + dimensions;
1428
1.04k
        } else {
1429
1.04k
            return 0;
1430
1.04k
        }
1431
1.04k
    }
_ZN5doris12variant_util32FieldVisitorToNumberOfDimensions5applyILNS_13PrimitiveTypeE5EEEmRKNS_19PrimitiveTypeTraitsIXT_EE7CppTypeE
Line
Count
Source
1419
2.23k
    size_t apply(const typename PrimitiveTypeTraits<T>::CppType& x) {
1420
        if constexpr (T == TYPE_ARRAY) {
1421
            const size_t size = x.size();
1422
            size_t dimensions = 0;
1423
            for (size_t i = 0; i < size; ++i) {
1424
                size_t element_dimensions = apply_visitor(*this, x[i]);
1425
                dimensions = std::max(dimensions, element_dimensions);
1426
            }
1427
            return 1 + dimensions;
1428
2.23k
        } else {
1429
2.23k
            return 0;
1430
2.23k
        }
1431
2.23k
    }
_ZN5doris12variant_util32FieldVisitorToNumberOfDimensions5applyILNS_13PrimitiveTypeE6EEEmRKNS_19PrimitiveTypeTraitsIXT_EE7CppTypeE
Line
Count
Source
1419
6.54M
    size_t apply(const typename PrimitiveTypeTraits<T>::CppType& x) {
1420
        if constexpr (T == TYPE_ARRAY) {
1421
            const size_t size = x.size();
1422
            size_t dimensions = 0;
1423
            for (size_t i = 0; i < size; ++i) {
1424
                size_t element_dimensions = apply_visitor(*this, x[i]);
1425
                dimensions = std::max(dimensions, element_dimensions);
1426
            }
1427
            return 1 + dimensions;
1428
6.54M
        } else {
1429
6.54M
            return 0;
1430
6.54M
        }
1431
6.54M
    }
Unexecuted instantiation: _ZN5doris12variant_util32FieldVisitorToNumberOfDimensions5applyILNS_13PrimitiveTypeE38EEEmRKNS_19PrimitiveTypeTraitsIXT_EE7CppTypeE
Unexecuted instantiation: _ZN5doris12variant_util32FieldVisitorToNumberOfDimensions5applyILNS_13PrimitiveTypeE39EEEmRKNS_19PrimitiveTypeTraitsIXT_EE7CppTypeE
_ZN5doris12variant_util32FieldVisitorToNumberOfDimensions5applyILNS_13PrimitiveTypeE8EEEmRKNS_19PrimitiveTypeTraitsIXT_EE7CppTypeE
Line
Count
Source
1419
884
    size_t apply(const typename PrimitiveTypeTraits<T>::CppType& x) {
1420
        if constexpr (T == TYPE_ARRAY) {
1421
            const size_t size = x.size();
1422
            size_t dimensions = 0;
1423
            for (size_t i = 0; i < size; ++i) {
1424
                size_t element_dimensions = apply_visitor(*this, x[i]);
1425
                dimensions = std::max(dimensions, element_dimensions);
1426
            }
1427
            return 1 + dimensions;
1428
884
        } else {
1429
884
            return 0;
1430
884
        }
1431
884
    }
Unexecuted instantiation: _ZN5doris12variant_util32FieldVisitorToNumberOfDimensions5applyILNS_13PrimitiveTypeE27EEEmRKNS_19PrimitiveTypeTraitsIXT_EE7CppTypeE
_ZN5doris12variant_util32FieldVisitorToNumberOfDimensions5applyILNS_13PrimitiveTypeE9EEEmRKNS_19PrimitiveTypeTraitsIXT_EE7CppTypeE
Line
Count
Source
1419
2.96M
    size_t apply(const typename PrimitiveTypeTraits<T>::CppType& x) {
1420
        if constexpr (T == TYPE_ARRAY) {
1421
            const size_t size = x.size();
1422
            size_t dimensions = 0;
1423
            for (size_t i = 0; i < size; ++i) {
1424
                size_t element_dimensions = apply_visitor(*this, x[i]);
1425
                dimensions = std::max(dimensions, element_dimensions);
1426
            }
1427
            return 1 + dimensions;
1428
2.96M
        } else {
1429
2.96M
            return 0;
1430
2.96M
        }
1431
2.96M
    }
_ZN5doris12variant_util32FieldVisitorToNumberOfDimensions5applyILNS_13PrimitiveTypeE36EEEmRKNS_19PrimitiveTypeTraitsIXT_EE7CppTypeE
Line
Count
Source
1419
339
    size_t apply(const typename PrimitiveTypeTraits<T>::CppType& x) {
1420
        if constexpr (T == TYPE_ARRAY) {
1421
            const size_t size = x.size();
1422
            size_t dimensions = 0;
1423
            for (size_t i = 0; i < size; ++i) {
1424
                size_t element_dimensions = apply_visitor(*this, x[i]);
1425
                dimensions = std::max(dimensions, element_dimensions);
1426
            }
1427
            return 1 + dimensions;
1428
339
        } else {
1429
339
            return 0;
1430
339
        }
1431
339
    }
_ZN5doris12variant_util32FieldVisitorToNumberOfDimensions5applyILNS_13PrimitiveTypeE37EEEmRKNS_19PrimitiveTypeTraitsIXT_EE7CppTypeE
Line
Count
Source
1419
309
    size_t apply(const typename PrimitiveTypeTraits<T>::CppType& x) {
1420
        if constexpr (T == TYPE_ARRAY) {
1421
            const size_t size = x.size();
1422
            size_t dimensions = 0;
1423
            for (size_t i = 0; i < size; ++i) {
1424
                size_t element_dimensions = apply_visitor(*this, x[i]);
1425
                dimensions = std::max(dimensions, element_dimensions);
1426
            }
1427
            return 1 + dimensions;
1428
309
        } else {
1429
309
            return 0;
1430
309
        }
1431
309
    }
_ZN5doris12variant_util32FieldVisitorToNumberOfDimensions5applyILNS_13PrimitiveTypeE23EEEmRKNS_19PrimitiveTypeTraitsIXT_EE7CppTypeE
Line
Count
Source
1419
11.7M
    size_t apply(const typename PrimitiveTypeTraits<T>::CppType& x) {
1420
        if constexpr (T == TYPE_ARRAY) {
1421
            const size_t size = x.size();
1422
            size_t dimensions = 0;
1423
            for (size_t i = 0; i < size; ++i) {
1424
                size_t element_dimensions = apply_visitor(*this, x[i]);
1425
                dimensions = std::max(dimensions, element_dimensions);
1426
            }
1427
            return 1 + dimensions;
1428
11.7M
        } else {
1429
11.7M
            return 0;
1430
11.7M
        }
1431
11.7M
    }
Unexecuted instantiation: _ZN5doris12variant_util32FieldVisitorToNumberOfDimensions5applyILNS_13PrimitiveTypeE15EEEmRKNS_19PrimitiveTypeTraitsIXT_EE7CppTypeE
Unexecuted instantiation: _ZN5doris12variant_util32FieldVisitorToNumberOfDimensions5applyILNS_13PrimitiveTypeE10EEEmRKNS_19PrimitiveTypeTraitsIXT_EE7CppTypeE
Unexecuted instantiation: _ZN5doris12variant_util32FieldVisitorToNumberOfDimensions5applyILNS_13PrimitiveTypeE41EEEmRKNS_19PrimitiveTypeTraitsIXT_EE7CppTypeE
_ZN5doris12variant_util32FieldVisitorToNumberOfDimensions5applyILNS_13PrimitiveTypeE17EEEmRKNS_19PrimitiveTypeTraitsIXT_EE7CppTypeE
Line
Count
Source
1419
2.58M
    size_t apply(const typename PrimitiveTypeTraits<T>::CppType& x) {
1420
2.58M
        if constexpr (T == TYPE_ARRAY) {
1421
2.58M
            const size_t size = x.size();
1422
2.58M
            size_t dimensions = 0;
1423
6.28M
            for (size_t i = 0; i < size; ++i) {
1424
3.69M
                size_t element_dimensions = apply_visitor(*this, x[i]);
1425
3.69M
                dimensions = std::max(dimensions, element_dimensions);
1426
3.69M
            }
1427
2.58M
            return 1 + dimensions;
1428
        } else {
1429
            return 0;
1430
        }
1431
2.58M
    }
_ZN5doris12variant_util32FieldVisitorToNumberOfDimensions5applyILNS_13PrimitiveTypeE16EEEmRKNS_19PrimitiveTypeTraitsIXT_EE7CppTypeE
Line
Count
Source
1419
1
    size_t apply(const typename PrimitiveTypeTraits<T>::CppType& x) {
1420
        if constexpr (T == TYPE_ARRAY) {
1421
            const size_t size = x.size();
1422
            size_t dimensions = 0;
1423
            for (size_t i = 0; i < size; ++i) {
1424
                size_t element_dimensions = apply_visitor(*this, x[i]);
1425
                dimensions = std::max(dimensions, element_dimensions);
1426
            }
1427
            return 1 + dimensions;
1428
1
        } else {
1429
1
            return 0;
1430
1
        }
1431
1
    }
Unexecuted instantiation: _ZN5doris12variant_util32FieldVisitorToNumberOfDimensions5applyILNS_13PrimitiveTypeE18EEEmRKNS_19PrimitiveTypeTraitsIXT_EE7CppTypeE
_ZN5doris12variant_util32FieldVisitorToNumberOfDimensions5applyILNS_13PrimitiveTypeE32EEEmRKNS_19PrimitiveTypeTraitsIXT_EE7CppTypeE
Line
Count
Source
1419
1
    size_t apply(const typename PrimitiveTypeTraits<T>::CppType& x) {
1420
        if constexpr (T == TYPE_ARRAY) {
1421
            const size_t size = x.size();
1422
            size_t dimensions = 0;
1423
            for (size_t i = 0; i < size; ++i) {
1424
                size_t element_dimensions = apply_visitor(*this, x[i]);
1425
                dimensions = std::max(dimensions, element_dimensions);
1426
            }
1427
            return 1 + dimensions;
1428
1
        } else {
1429
1
            return 0;
1430
1
        }
1431
1
    }
_ZN5doris12variant_util32FieldVisitorToNumberOfDimensions5applyILNS_13PrimitiveTypeE28EEEmRKNS_19PrimitiveTypeTraitsIXT_EE7CppTypeE
Line
Count
Source
1419
783
    size_t apply(const typename PrimitiveTypeTraits<T>::CppType& x) {
1420
        if constexpr (T == TYPE_ARRAY) {
1421
            const size_t size = x.size();
1422
            size_t dimensions = 0;
1423
            for (size_t i = 0; i < size; ++i) {
1424
                size_t element_dimensions = apply_visitor(*this, x[i]);
1425
                dimensions = std::max(dimensions, element_dimensions);
1426
            }
1427
            return 1 + dimensions;
1428
783
        } else {
1429
783
            return 0;
1430
783
        }
1431
783
    }
_ZN5doris12variant_util32FieldVisitorToNumberOfDimensions5applyILNS_13PrimitiveTypeE29EEEmRKNS_19PrimitiveTypeTraitsIXT_EE7CppTypeE
Line
Count
Source
1419
724
    size_t apply(const typename PrimitiveTypeTraits<T>::CppType& x) {
1420
        if constexpr (T == TYPE_ARRAY) {
1421
            const size_t size = x.size();
1422
            size_t dimensions = 0;
1423
            for (size_t i = 0; i < size; ++i) {
1424
                size_t element_dimensions = apply_visitor(*this, x[i]);
1425
                dimensions = std::max(dimensions, element_dimensions);
1426
            }
1427
            return 1 + dimensions;
1428
724
        } else {
1429
724
            return 0;
1430
724
        }
1431
724
    }
Unexecuted instantiation: _ZN5doris12variant_util32FieldVisitorToNumberOfDimensions5applyILNS_13PrimitiveTypeE20EEEmRKNS_19PrimitiveTypeTraitsIXT_EE7CppTypeE
_ZN5doris12variant_util32FieldVisitorToNumberOfDimensions5applyILNS_13PrimitiveTypeE30EEEmRKNS_19PrimitiveTypeTraitsIXT_EE7CppTypeE
Line
Count
Source
1419
70.8k
    size_t apply(const typename PrimitiveTypeTraits<T>::CppType& x) {
1420
        if constexpr (T == TYPE_ARRAY) {
1421
            const size_t size = x.size();
1422
            size_t dimensions = 0;
1423
            for (size_t i = 0; i < size; ++i) {
1424
                size_t element_dimensions = apply_visitor(*this, x[i]);
1425
                dimensions = std::max(dimensions, element_dimensions);
1426
            }
1427
            return 1 + dimensions;
1428
70.8k
        } else {
1429
70.8k
            return 0;
1430
70.8k
        }
1431
70.8k
    }
_ZN5doris12variant_util32FieldVisitorToNumberOfDimensions5applyILNS_13PrimitiveTypeE35EEEmRKNS_19PrimitiveTypeTraitsIXT_EE7CppTypeE
Line
Count
Source
1419
588
    size_t apply(const typename PrimitiveTypeTraits<T>::CppType& x) {
1420
        if constexpr (T == TYPE_ARRAY) {
1421
            const size_t size = x.size();
1422
            size_t dimensions = 0;
1423
            for (size_t i = 0; i < size; ++i) {
1424
                size_t element_dimensions = apply_visitor(*this, x[i]);
1425
                dimensions = std::max(dimensions, element_dimensions);
1426
            }
1427
            return 1 + dimensions;
1428
588
        } else {
1429
588
            return 0;
1430
588
        }
1431
588
    }
Unexecuted instantiation: _ZN5doris12variant_util32FieldVisitorToNumberOfDimensions5applyILNS_13PrimitiveTypeE22EEEmRKNS_19PrimitiveTypeTraitsIXT_EE7CppTypeE
Unexecuted instantiation: _ZN5doris12variant_util32FieldVisitorToNumberOfDimensions5applyILNS_13PrimitiveTypeE19EEEmRKNS_19PrimitiveTypeTraitsIXT_EE7CppTypeE
Unexecuted instantiation: _ZN5doris12variant_util32FieldVisitorToNumberOfDimensions5applyILNS_13PrimitiveTypeE24EEEmRKNS_19PrimitiveTypeTraitsIXT_EE7CppTypeE
_ZN5doris12variant_util32FieldVisitorToNumberOfDimensions5applyILNS_13PrimitiveTypeE31EEEmRKNS_19PrimitiveTypeTraitsIXT_EE7CppTypeE
Line
Count
Source
1419
46.8k
    size_t apply(const typename PrimitiveTypeTraits<T>::CppType& x) {
1420
        if constexpr (T == TYPE_ARRAY) {
1421
            const size_t size = x.size();
1422
            size_t dimensions = 0;
1423
            for (size_t i = 0; i < size; ++i) {
1424
                size_t element_dimensions = apply_visitor(*this, x[i]);
1425
                dimensions = std::max(dimensions, element_dimensions);
1426
            }
1427
            return 1 + dimensions;
1428
46.8k
        } else {
1429
46.8k
            return 0;
1430
46.8k
        }
1431
46.8k
    }
1432
};
1433
1434
// Visitor that allows to get type of scalar field
1435
// but excludes fields that contain complex fields. This is a faster version
1436
// of FieldVisitorToScalarType; this one does not support complex fields (arrays throw).
1437
class SimpleFieldVisitorToScalarType : public StaticVisitor<size_t> {
1438
public:
1439
    template <PrimitiveType T>
1440
18.9M
    size_t apply(const typename PrimitiveTypeTraits<T>::CppType& x) {
1441
18.9M
        if constexpr (T == TYPE_ARRAY) {
1442
0
            throw doris::Exception(ErrorCode::INVALID_ARGUMENT, "Array type is not supported");
1443
109k
        } else if constexpr (T == TYPE_NULL) {
1444
109k
            have_nulls = true;
1445
109k
            return 1;
1446
18.7M
        } else {
1447
18.7M
            type = T;
1448
18.7M
            return 1;
1449
18.7M
        }
1450
18.9M
    }
_ZN5doris12variant_util30SimpleFieldVisitorToScalarType5applyILNS_13PrimitiveTypeE1EEEmRKNS_19PrimitiveTypeTraitsIXT_EE7CppTypeE
Line
Count
Source
1440
109k
    size_t apply(const typename PrimitiveTypeTraits<T>::CppType& x) {
1441
        if constexpr (T == TYPE_ARRAY) {
1442
            throw doris::Exception(ErrorCode::INVALID_ARGUMENT, "Array type is not supported");
1443
109k
        } else if constexpr (T == TYPE_NULL) {
1444
109k
            have_nulls = true;
1445
109k
            return 1;
1446
        } else {
1447
            type = T;
1448
            return 1;
1449
        }
1450
109k
    }
Unexecuted instantiation: _ZN5doris12variant_util30SimpleFieldVisitorToScalarType5applyILNS_13PrimitiveTypeE26EEEmRKNS_19PrimitiveTypeTraitsIXT_EE7CppTypeE
Unexecuted instantiation: _ZN5doris12variant_util30SimpleFieldVisitorToScalarType5applyILNS_13PrimitiveTypeE42EEEmRKNS_19PrimitiveTypeTraitsIXT_EE7CppTypeE
_ZN5doris12variant_util30SimpleFieldVisitorToScalarType5applyILNS_13PrimitiveTypeE7EEEmRKNS_19PrimitiveTypeTraitsIXT_EE7CppTypeE
Line
Count
Source
1440
12.3k
    size_t apply(const typename PrimitiveTypeTraits<T>::CppType& x) {
1441
        if constexpr (T == TYPE_ARRAY) {
1442
            throw doris::Exception(ErrorCode::INVALID_ARGUMENT, "Array type is not supported");
1443
        } else if constexpr (T == TYPE_NULL) {
1444
            have_nulls = true;
1445
            return 1;
1446
12.3k
        } else {
1447
12.3k
            type = T;
1448
12.3k
            return 1;
1449
12.3k
        }
1450
12.3k
    }
Unexecuted instantiation: _ZN5doris12variant_util30SimpleFieldVisitorToScalarType5applyILNS_13PrimitiveTypeE12EEEmRKNS_19PrimitiveTypeTraitsIXT_EE7CppTypeE
Unexecuted instantiation: _ZN5doris12variant_util30SimpleFieldVisitorToScalarType5applyILNS_13PrimitiveTypeE11EEEmRKNS_19PrimitiveTypeTraitsIXT_EE7CppTypeE
Unexecuted instantiation: _ZN5doris12variant_util30SimpleFieldVisitorToScalarType5applyILNS_13PrimitiveTypeE25EEEmRKNS_19PrimitiveTypeTraitsIXT_EE7CppTypeE
_ZN5doris12variant_util30SimpleFieldVisitorToScalarType5applyILNS_13PrimitiveTypeE2EEEmRKNS_19PrimitiveTypeTraitsIXT_EE7CppTypeE
Line
Count
Source
1440
273k
    size_t apply(const typename PrimitiveTypeTraits<T>::CppType& x) {
1441
        if constexpr (T == TYPE_ARRAY) {
1442
            throw doris::Exception(ErrorCode::INVALID_ARGUMENT, "Array type is not supported");
1443
        } else if constexpr (T == TYPE_NULL) {
1444
            have_nulls = true;
1445
            return 1;
1446
273k
        } else {
1447
273k
            type = T;
1448
273k
            return 1;
1449
273k
        }
1450
273k
    }
_ZN5doris12variant_util30SimpleFieldVisitorToScalarType5applyILNS_13PrimitiveTypeE3EEEmRKNS_19PrimitiveTypeTraitsIXT_EE7CppTypeE
Line
Count
Source
1440
2
    size_t apply(const typename PrimitiveTypeTraits<T>::CppType& x) {
1441
        if constexpr (T == TYPE_ARRAY) {
1442
            throw doris::Exception(ErrorCode::INVALID_ARGUMENT, "Array type is not supported");
1443
        } else if constexpr (T == TYPE_NULL) {
1444
            have_nulls = true;
1445
            return 1;
1446
2
        } else {
1447
2
            type = T;
1448
2
            return 1;
1449
2
        }
1450
2
    }
_ZN5doris12variant_util30SimpleFieldVisitorToScalarType5applyILNS_13PrimitiveTypeE4EEEmRKNS_19PrimitiveTypeTraitsIXT_EE7CppTypeE
Line
Count
Source
1440
7
    size_t apply(const typename PrimitiveTypeTraits<T>::CppType& x) {
1441
        if constexpr (T == TYPE_ARRAY) {
1442
            throw doris::Exception(ErrorCode::INVALID_ARGUMENT, "Array type is not supported");
1443
        } else if constexpr (T == TYPE_NULL) {
1444
            have_nulls = true;
1445
            return 1;
1446
7
        } else {
1447
7
            type = T;
1448
7
            return 1;
1449
7
        }
1450
7
    }
_ZN5doris12variant_util30SimpleFieldVisitorToScalarType5applyILNS_13PrimitiveTypeE5EEEmRKNS_19PrimitiveTypeTraitsIXT_EE7CppTypeE
Line
Count
Source
1440
676
    size_t apply(const typename PrimitiveTypeTraits<T>::CppType& x) {
1441
        if constexpr (T == TYPE_ARRAY) {
1442
            throw doris::Exception(ErrorCode::INVALID_ARGUMENT, "Array type is not supported");
1443
        } else if constexpr (T == TYPE_NULL) {
1444
            have_nulls = true;
1445
            return 1;
1446
676
        } else {
1447
676
            type = T;
1448
676
            return 1;
1449
676
        }
1450
676
    }
_ZN5doris12variant_util30SimpleFieldVisitorToScalarType5applyILNS_13PrimitiveTypeE6EEEmRKNS_19PrimitiveTypeTraitsIXT_EE7CppTypeE
Line
Count
Source
1440
5.06M
    size_t apply(const typename PrimitiveTypeTraits<T>::CppType& x) {
1441
        if constexpr (T == TYPE_ARRAY) {
1442
            throw doris::Exception(ErrorCode::INVALID_ARGUMENT, "Array type is not supported");
1443
        } else if constexpr (T == TYPE_NULL) {
1444
            have_nulls = true;
1445
            return 1;
1446
5.06M
        } else {
1447
5.06M
            type = T;
1448
5.06M
            return 1;
1449
5.06M
        }
1450
5.06M
    }
Unexecuted instantiation: _ZN5doris12variant_util30SimpleFieldVisitorToScalarType5applyILNS_13PrimitiveTypeE38EEEmRKNS_19PrimitiveTypeTraitsIXT_EE7CppTypeE
Unexecuted instantiation: _ZN5doris12variant_util30SimpleFieldVisitorToScalarType5applyILNS_13PrimitiveTypeE39EEEmRKNS_19PrimitiveTypeTraitsIXT_EE7CppTypeE
_ZN5doris12variant_util30SimpleFieldVisitorToScalarType5applyILNS_13PrimitiveTypeE8EEEmRKNS_19PrimitiveTypeTraitsIXT_EE7CppTypeE
Line
Count
Source
1440
1
    size_t apply(const typename PrimitiveTypeTraits<T>::CppType& x) {
1441
        if constexpr (T == TYPE_ARRAY) {
1442
            throw doris::Exception(ErrorCode::INVALID_ARGUMENT, "Array type is not supported");
1443
        } else if constexpr (T == TYPE_NULL) {
1444
            have_nulls = true;
1445
            return 1;
1446
1
        } else {
1447
1
            type = T;
1448
1
            return 1;
1449
1
        }
1450
1
    }
Unexecuted instantiation: _ZN5doris12variant_util30SimpleFieldVisitorToScalarType5applyILNS_13PrimitiveTypeE27EEEmRKNS_19PrimitiveTypeTraitsIXT_EE7CppTypeE
_ZN5doris12variant_util30SimpleFieldVisitorToScalarType5applyILNS_13PrimitiveTypeE9EEEmRKNS_19PrimitiveTypeTraitsIXT_EE7CppTypeE
Line
Count
Source
1440
2.77M
    size_t apply(const typename PrimitiveTypeTraits<T>::CppType& x) {
1441
        if constexpr (T == TYPE_ARRAY) {
1442
            throw doris::Exception(ErrorCode::INVALID_ARGUMENT, "Array type is not supported");
1443
        } else if constexpr (T == TYPE_NULL) {
1444
            have_nulls = true;
1445
            return 1;
1446
2.77M
        } else {
1447
2.77M
            type = T;
1448
2.77M
            return 1;
1449
2.77M
        }
1450
2.77M
    }
Unexecuted instantiation: _ZN5doris12variant_util30SimpleFieldVisitorToScalarType5applyILNS_13PrimitiveTypeE36EEEmRKNS_19PrimitiveTypeTraitsIXT_EE7CppTypeE
Unexecuted instantiation: _ZN5doris12variant_util30SimpleFieldVisitorToScalarType5applyILNS_13PrimitiveTypeE37EEEmRKNS_19PrimitiveTypeTraitsIXT_EE7CppTypeE
_ZN5doris12variant_util30SimpleFieldVisitorToScalarType5applyILNS_13PrimitiveTypeE23EEEmRKNS_19PrimitiveTypeTraitsIXT_EE7CppTypeE
Line
Count
Source
1440
10.6M
    size_t apply(const typename PrimitiveTypeTraits<T>::CppType& x) {
1441
        if constexpr (T == TYPE_ARRAY) {
1442
            throw doris::Exception(ErrorCode::INVALID_ARGUMENT, "Array type is not supported");
1443
        } else if constexpr (T == TYPE_NULL) {
1444
            have_nulls = true;
1445
            return 1;
1446
10.6M
        } else {
1447
10.6M
            type = T;
1448
10.6M
            return 1;
1449
10.6M
        }
1450
10.6M
    }
Unexecuted instantiation: _ZN5doris12variant_util30SimpleFieldVisitorToScalarType5applyILNS_13PrimitiveTypeE15EEEmRKNS_19PrimitiveTypeTraitsIXT_EE7CppTypeE
Unexecuted instantiation: _ZN5doris12variant_util30SimpleFieldVisitorToScalarType5applyILNS_13PrimitiveTypeE10EEEmRKNS_19PrimitiveTypeTraitsIXT_EE7CppTypeE
Unexecuted instantiation: _ZN5doris12variant_util30SimpleFieldVisitorToScalarType5applyILNS_13PrimitiveTypeE41EEEmRKNS_19PrimitiveTypeTraitsIXT_EE7CppTypeE
Unexecuted instantiation: _ZN5doris12variant_util30SimpleFieldVisitorToScalarType5applyILNS_13PrimitiveTypeE17EEEmRKNS_19PrimitiveTypeTraitsIXT_EE7CppTypeE
Unexecuted instantiation: _ZN5doris12variant_util30SimpleFieldVisitorToScalarType5applyILNS_13PrimitiveTypeE16EEEmRKNS_19PrimitiveTypeTraitsIXT_EE7CppTypeE
Unexecuted instantiation: _ZN5doris12variant_util30SimpleFieldVisitorToScalarType5applyILNS_13PrimitiveTypeE18EEEmRKNS_19PrimitiveTypeTraitsIXT_EE7CppTypeE
Unexecuted instantiation: _ZN5doris12variant_util30SimpleFieldVisitorToScalarType5applyILNS_13PrimitiveTypeE32EEEmRKNS_19PrimitiveTypeTraitsIXT_EE7CppTypeE
Unexecuted instantiation: _ZN5doris12variant_util30SimpleFieldVisitorToScalarType5applyILNS_13PrimitiveTypeE28EEEmRKNS_19PrimitiveTypeTraitsIXT_EE7CppTypeE
Unexecuted instantiation: _ZN5doris12variant_util30SimpleFieldVisitorToScalarType5applyILNS_13PrimitiveTypeE29EEEmRKNS_19PrimitiveTypeTraitsIXT_EE7CppTypeE
Unexecuted instantiation: _ZN5doris12variant_util30SimpleFieldVisitorToScalarType5applyILNS_13PrimitiveTypeE20EEEmRKNS_19PrimitiveTypeTraitsIXT_EE7CppTypeE
Unexecuted instantiation: _ZN5doris12variant_util30SimpleFieldVisitorToScalarType5applyILNS_13PrimitiveTypeE30EEEmRKNS_19PrimitiveTypeTraitsIXT_EE7CppTypeE
Unexecuted instantiation: _ZN5doris12variant_util30SimpleFieldVisitorToScalarType5applyILNS_13PrimitiveTypeE35EEEmRKNS_19PrimitiveTypeTraitsIXT_EE7CppTypeE
Unexecuted instantiation: _ZN5doris12variant_util30SimpleFieldVisitorToScalarType5applyILNS_13PrimitiveTypeE22EEEmRKNS_19PrimitiveTypeTraitsIXT_EE7CppTypeE
Unexecuted instantiation: _ZN5doris12variant_util30SimpleFieldVisitorToScalarType5applyILNS_13PrimitiveTypeE19EEEmRKNS_19PrimitiveTypeTraitsIXT_EE7CppTypeE
Unexecuted instantiation: _ZN5doris12variant_util30SimpleFieldVisitorToScalarType5applyILNS_13PrimitiveTypeE24EEEmRKNS_19PrimitiveTypeTraitsIXT_EE7CppTypeE
_ZN5doris12variant_util30SimpleFieldVisitorToScalarType5applyILNS_13PrimitiveTypeE31EEEmRKNS_19PrimitiveTypeTraitsIXT_EE7CppTypeE
Line
Count
Source
1440
46.8k
    size_t apply(const typename PrimitiveTypeTraits<T>::CppType& x) {
1441
        if constexpr (T == TYPE_ARRAY) {
1442
            throw doris::Exception(ErrorCode::INVALID_ARGUMENT, "Array type is not supported");
1443
        } else if constexpr (T == TYPE_NULL) {
1444
            have_nulls = true;
1445
            return 1;
1446
46.8k
        } else {
1447
46.8k
            type = T;
1448
46.8k
            return 1;
1449
46.8k
        }
1450
46.8k
    }
1451
18.6M
    void get_scalar_type(PrimitiveType* data_type) const { *data_type = type; }
1452
18.6M
    bool contain_nulls() const { return have_nulls; }
1453
1454
18.6M
    bool need_convert_field() const { return false; }
1455
1456
private:
1457
    PrimitiveType type = PrimitiveType::INVALID_TYPE;
1458
    bool have_nulls = false;
1459
};
1460
1461
/// Visitor that allows to get type of scalar field
1462
/// or least common type of scalars in array.
1463
/// More optimized version of FieldToDataType.
1464
class FieldVisitorToScalarType : public StaticVisitor<size_t> {
1465
public:
1466
    template <PrimitiveType T>
1467
5.58M
    size_t apply(const typename PrimitiveTypeTraits<T>::CppType& x) {
1468
5.58M
        if constexpr (T == TYPE_ARRAY) {
1469
2.58M
            size_t size = x.size();
1470
6.28M
            for (size_t i = 0; i < size; ++i) {
1471
3.69M
                apply_visitor(*this, x[i]);
1472
3.69M
            }
1473
2.58M
            return 0;
1474
2.58M
        } else if constexpr (T == TYPE_NULL) {
1475
12.2k
            have_nulls = true;
1476
12.2k
            return 0;
1477
2.98M
        } else {
1478
2.98M
            field_types.insert(T);
1479
2.98M
            type_indexes.insert(T);
1480
2.98M
            return 0;
1481
2.98M
        }
1482
5.58M
    }
_ZN5doris12variant_util24FieldVisitorToScalarType5applyILNS_13PrimitiveTypeE1EEEmRKNS_19PrimitiveTypeTraitsIXT_EE7CppTypeE
Line
Count
Source
1467
12.2k
    size_t apply(const typename PrimitiveTypeTraits<T>::CppType& x) {
1468
        if constexpr (T == TYPE_ARRAY) {
1469
            size_t size = x.size();
1470
            for (size_t i = 0; i < size; ++i) {
1471
                apply_visitor(*this, x[i]);
1472
            }
1473
            return 0;
1474
12.2k
        } else if constexpr (T == TYPE_NULL) {
1475
12.2k
            have_nulls = true;
1476
12.2k
            return 0;
1477
        } else {
1478
            field_types.insert(T);
1479
            type_indexes.insert(T);
1480
            return 0;
1481
        }
1482
12.2k
    }
_ZN5doris12variant_util24FieldVisitorToScalarType5applyILNS_13PrimitiveTypeE26EEEmRKNS_19PrimitiveTypeTraitsIXT_EE7CppTypeE
Line
Count
Source
1467
511
    size_t apply(const typename PrimitiveTypeTraits<T>::CppType& x) {
1468
        if constexpr (T == TYPE_ARRAY) {
1469
            size_t size = x.size();
1470
            for (size_t i = 0; i < size; ++i) {
1471
                apply_visitor(*this, x[i]);
1472
            }
1473
            return 0;
1474
        } else if constexpr (T == TYPE_NULL) {
1475
            have_nulls = true;
1476
            return 0;
1477
511
        } else {
1478
511
            field_types.insert(T);
1479
511
            type_indexes.insert(T);
1480
511
            return 0;
1481
511
        }
1482
511
    }
Unexecuted instantiation: _ZN5doris12variant_util24FieldVisitorToScalarType5applyILNS_13PrimitiveTypeE42EEEmRKNS_19PrimitiveTypeTraitsIXT_EE7CppTypeE
_ZN5doris12variant_util24FieldVisitorToScalarType5applyILNS_13PrimitiveTypeE7EEEmRKNS_19PrimitiveTypeTraitsIXT_EE7CppTypeE
Line
Count
Source
1467
29.6k
    size_t apply(const typename PrimitiveTypeTraits<T>::CppType& x) {
1468
        if constexpr (T == TYPE_ARRAY) {
1469
            size_t size = x.size();
1470
            for (size_t i = 0; i < size; ++i) {
1471
                apply_visitor(*this, x[i]);
1472
            }
1473
            return 0;
1474
        } else if constexpr (T == TYPE_NULL) {
1475
            have_nulls = true;
1476
            return 0;
1477
29.6k
        } else {
1478
29.6k
            field_types.insert(T);
1479
29.6k
            type_indexes.insert(T);
1480
29.6k
            return 0;
1481
29.6k
        }
1482
29.6k
    }
Unexecuted instantiation: _ZN5doris12variant_util24FieldVisitorToScalarType5applyILNS_13PrimitiveTypeE12EEEmRKNS_19PrimitiveTypeTraitsIXT_EE7CppTypeE
Unexecuted instantiation: _ZN5doris12variant_util24FieldVisitorToScalarType5applyILNS_13PrimitiveTypeE11EEEmRKNS_19PrimitiveTypeTraitsIXT_EE7CppTypeE
_ZN5doris12variant_util24FieldVisitorToScalarType5applyILNS_13PrimitiveTypeE25EEEmRKNS_19PrimitiveTypeTraitsIXT_EE7CppTypeE
Line
Count
Source
1467
428
    size_t apply(const typename PrimitiveTypeTraits<T>::CppType& x) {
1468
        if constexpr (T == TYPE_ARRAY) {
1469
            size_t size = x.size();
1470
            for (size_t i = 0; i < size; ++i) {
1471
                apply_visitor(*this, x[i]);
1472
            }
1473
            return 0;
1474
        } else if constexpr (T == TYPE_NULL) {
1475
            have_nulls = true;
1476
            return 0;
1477
428
        } else {
1478
428
            field_types.insert(T);
1479
428
            type_indexes.insert(T);
1480
428
            return 0;
1481
428
        }
1482
428
    }
_ZN5doris12variant_util24FieldVisitorToScalarType5applyILNS_13PrimitiveTypeE2EEEmRKNS_19PrimitiveTypeTraitsIXT_EE7CppTypeE
Line
Count
Source
1467
58.3k
    size_t apply(const typename PrimitiveTypeTraits<T>::CppType& x) {
1468
        if constexpr (T == TYPE_ARRAY) {
1469
            size_t size = x.size();
1470
            for (size_t i = 0; i < size; ++i) {
1471
                apply_visitor(*this, x[i]);
1472
            }
1473
            return 0;
1474
        } else if constexpr (T == TYPE_NULL) {
1475
            have_nulls = true;
1476
            return 0;
1477
58.3k
        } else {
1478
58.3k
            field_types.insert(T);
1479
58.3k
            type_indexes.insert(T);
1480
58.3k
            return 0;
1481
58.3k
        }
1482
58.3k
    }
_ZN5doris12variant_util24FieldVisitorToScalarType5applyILNS_13PrimitiveTypeE3EEEmRKNS_19PrimitiveTypeTraitsIXT_EE7CppTypeE
Line
Count
Source
1467
1.04k
    size_t apply(const typename PrimitiveTypeTraits<T>::CppType& x) {
1468
        if constexpr (T == TYPE_ARRAY) {
1469
            size_t size = x.size();
1470
            for (size_t i = 0; i < size; ++i) {
1471
                apply_visitor(*this, x[i]);
1472
            }
1473
            return 0;
1474
        } else if constexpr (T == TYPE_NULL) {
1475
            have_nulls = true;
1476
            return 0;
1477
1.04k
        } else {
1478
1.04k
            field_types.insert(T);
1479
1.04k
            type_indexes.insert(T);
1480
1.04k
            return 0;
1481
1.04k
        }
1482
1.04k
    }
_ZN5doris12variant_util24FieldVisitorToScalarType5applyILNS_13PrimitiveTypeE4EEEmRKNS_19PrimitiveTypeTraitsIXT_EE7CppTypeE
Line
Count
Source
1467
1.03k
    size_t apply(const typename PrimitiveTypeTraits<T>::CppType& x) {
1468
        if constexpr (T == TYPE_ARRAY) {
1469
            size_t size = x.size();
1470
            for (size_t i = 0; i < size; ++i) {
1471
                apply_visitor(*this, x[i]);
1472
            }
1473
            return 0;
1474
        } else if constexpr (T == TYPE_NULL) {
1475
            have_nulls = true;
1476
            return 0;
1477
1.03k
        } else {
1478
1.03k
            field_types.insert(T);
1479
1.03k
            type_indexes.insert(T);
1480
1.03k
            return 0;
1481
1.03k
        }
1482
1.03k
    }
_ZN5doris12variant_util24FieldVisitorToScalarType5applyILNS_13PrimitiveTypeE5EEEmRKNS_19PrimitiveTypeTraitsIXT_EE7CppTypeE
Line
Count
Source
1467
1.55k
    size_t apply(const typename PrimitiveTypeTraits<T>::CppType& x) {
1468
        if constexpr (T == TYPE_ARRAY) {
1469
            size_t size = x.size();
1470
            for (size_t i = 0; i < size; ++i) {
1471
                apply_visitor(*this, x[i]);
1472
            }
1473
            return 0;
1474
        } else if constexpr (T == TYPE_NULL) {
1475
            have_nulls = true;
1476
            return 0;
1477
1.55k
        } else {
1478
1.55k
            field_types.insert(T);
1479
1.55k
            type_indexes.insert(T);
1480
1.55k
            return 0;
1481
1.55k
        }
1482
1.55k
    }
_ZN5doris12variant_util24FieldVisitorToScalarType5applyILNS_13PrimitiveTypeE6EEEmRKNS_19PrimitiveTypeTraitsIXT_EE7CppTypeE
Line
Count
Source
1467
1.48M
    size_t apply(const typename PrimitiveTypeTraits<T>::CppType& x) {
1468
        if constexpr (T == TYPE_ARRAY) {
1469
            size_t size = x.size();
1470
            for (size_t i = 0; i < size; ++i) {
1471
                apply_visitor(*this, x[i]);
1472
            }
1473
            return 0;
1474
        } else if constexpr (T == TYPE_NULL) {
1475
            have_nulls = true;
1476
            return 0;
1477
1.48M
        } else {
1478
1.48M
            field_types.insert(T);
1479
1.48M
            type_indexes.insert(T);
1480
1.48M
            return 0;
1481
1.48M
        }
1482
1.48M
    }
Unexecuted instantiation: _ZN5doris12variant_util24FieldVisitorToScalarType5applyILNS_13PrimitiveTypeE38EEEmRKNS_19PrimitiveTypeTraitsIXT_EE7CppTypeE
Unexecuted instantiation: _ZN5doris12variant_util24FieldVisitorToScalarType5applyILNS_13PrimitiveTypeE39EEEmRKNS_19PrimitiveTypeTraitsIXT_EE7CppTypeE
_ZN5doris12variant_util24FieldVisitorToScalarType5applyILNS_13PrimitiveTypeE8EEEmRKNS_19PrimitiveTypeTraitsIXT_EE7CppTypeE
Line
Count
Source
1467
883
    size_t apply(const typename PrimitiveTypeTraits<T>::CppType& x) {
1468
        if constexpr (T == TYPE_ARRAY) {
1469
            size_t size = x.size();
1470
            for (size_t i = 0; i < size; ++i) {
1471
                apply_visitor(*this, x[i]);
1472
            }
1473
            return 0;
1474
        } else if constexpr (T == TYPE_NULL) {
1475
            have_nulls = true;
1476
            return 0;
1477
883
        } else {
1478
883
            field_types.insert(T);
1479
883
            type_indexes.insert(T);
1480
883
            return 0;
1481
883
        }
1482
883
    }
Unexecuted instantiation: _ZN5doris12variant_util24FieldVisitorToScalarType5applyILNS_13PrimitiveTypeE27EEEmRKNS_19PrimitiveTypeTraitsIXT_EE7CppTypeE
_ZN5doris12variant_util24FieldVisitorToScalarType5applyILNS_13PrimitiveTypeE9EEEmRKNS_19PrimitiveTypeTraitsIXT_EE7CppTypeE
Line
Count
Source
1467
187k
    size_t apply(const typename PrimitiveTypeTraits<T>::CppType& x) {
1468
        if constexpr (T == TYPE_ARRAY) {
1469
            size_t size = x.size();
1470
            for (size_t i = 0; i < size; ++i) {
1471
                apply_visitor(*this, x[i]);
1472
            }
1473
            return 0;
1474
        } else if constexpr (T == TYPE_NULL) {
1475
            have_nulls = true;
1476
            return 0;
1477
187k
        } else {
1478
187k
            field_types.insert(T);
1479
187k
            type_indexes.insert(T);
1480
187k
            return 0;
1481
187k
        }
1482
187k
    }
_ZN5doris12variant_util24FieldVisitorToScalarType5applyILNS_13PrimitiveTypeE36EEEmRKNS_19PrimitiveTypeTraitsIXT_EE7CppTypeE
Line
Count
Source
1467
339
    size_t apply(const typename PrimitiveTypeTraits<T>::CppType& x) {
1468
        if constexpr (T == TYPE_ARRAY) {
1469
            size_t size = x.size();
1470
            for (size_t i = 0; i < size; ++i) {
1471
                apply_visitor(*this, x[i]);
1472
            }
1473
            return 0;
1474
        } else if constexpr (T == TYPE_NULL) {
1475
            have_nulls = true;
1476
            return 0;
1477
339
        } else {
1478
339
            field_types.insert(T);
1479
339
            type_indexes.insert(T);
1480
339
            return 0;
1481
339
        }
1482
339
    }
_ZN5doris12variant_util24FieldVisitorToScalarType5applyILNS_13PrimitiveTypeE37EEEmRKNS_19PrimitiveTypeTraitsIXT_EE7CppTypeE
Line
Count
Source
1467
309
    size_t apply(const typename PrimitiveTypeTraits<T>::CppType& x) {
1468
        if constexpr (T == TYPE_ARRAY) {
1469
            size_t size = x.size();
1470
            for (size_t i = 0; i < size; ++i) {
1471
                apply_visitor(*this, x[i]);
1472
            }
1473
            return 0;
1474
        } else if constexpr (T == TYPE_NULL) {
1475
            have_nulls = true;
1476
            return 0;
1477
309
        } else {
1478
309
            field_types.insert(T);
1479
309
            type_indexes.insert(T);
1480
309
            return 0;
1481
309
        }
1482
309
    }
_ZN5doris12variant_util24FieldVisitorToScalarType5applyILNS_13PrimitiveTypeE23EEEmRKNS_19PrimitiveTypeTraitsIXT_EE7CppTypeE
Line
Count
Source
1467
1.14M
    size_t apply(const typename PrimitiveTypeTraits<T>::CppType& x) {
1468
        if constexpr (T == TYPE_ARRAY) {
1469
            size_t size = x.size();
1470
            for (size_t i = 0; i < size; ++i) {
1471
                apply_visitor(*this, x[i]);
1472
            }
1473
            return 0;
1474
        } else if constexpr (T == TYPE_NULL) {
1475
            have_nulls = true;
1476
            return 0;
1477
1.14M
        } else {
1478
1.14M
            field_types.insert(T);
1479
1.14M
            type_indexes.insert(T);
1480
1.14M
            return 0;
1481
1.14M
        }
1482
1.14M
    }
Unexecuted instantiation: _ZN5doris12variant_util24FieldVisitorToScalarType5applyILNS_13PrimitiveTypeE15EEEmRKNS_19PrimitiveTypeTraitsIXT_EE7CppTypeE
Unexecuted instantiation: _ZN5doris12variant_util24FieldVisitorToScalarType5applyILNS_13PrimitiveTypeE10EEEmRKNS_19PrimitiveTypeTraitsIXT_EE7CppTypeE
Unexecuted instantiation: _ZN5doris12variant_util24FieldVisitorToScalarType5applyILNS_13PrimitiveTypeE41EEEmRKNS_19PrimitiveTypeTraitsIXT_EE7CppTypeE
_ZN5doris12variant_util24FieldVisitorToScalarType5applyILNS_13PrimitiveTypeE17EEEmRKNS_19PrimitiveTypeTraitsIXT_EE7CppTypeE
Line
Count
Source
1467
2.58M
    size_t apply(const typename PrimitiveTypeTraits<T>::CppType& x) {
1468
2.58M
        if constexpr (T == TYPE_ARRAY) {
1469
2.58M
            size_t size = x.size();
1470
6.28M
            for (size_t i = 0; i < size; ++i) {
1471
3.69M
                apply_visitor(*this, x[i]);
1472
3.69M
            }
1473
2.58M
            return 0;
1474
        } else if constexpr (T == TYPE_NULL) {
1475
            have_nulls = true;
1476
            return 0;
1477
        } else {
1478
            field_types.insert(T);
1479
            type_indexes.insert(T);
1480
            return 0;
1481
        }
1482
2.58M
    }
_ZN5doris12variant_util24FieldVisitorToScalarType5applyILNS_13PrimitiveTypeE16EEEmRKNS_19PrimitiveTypeTraitsIXT_EE7CppTypeE
Line
Count
Source
1467
1
    size_t apply(const typename PrimitiveTypeTraits<T>::CppType& x) {
1468
        if constexpr (T == TYPE_ARRAY) {
1469
            size_t size = x.size();
1470
            for (size_t i = 0; i < size; ++i) {
1471
                apply_visitor(*this, x[i]);
1472
            }
1473
            return 0;
1474
        } else if constexpr (T == TYPE_NULL) {
1475
            have_nulls = true;
1476
            return 0;
1477
1
        } else {
1478
1
            field_types.insert(T);
1479
1
            type_indexes.insert(T);
1480
1
            return 0;
1481
1
        }
1482
1
    }
Unexecuted instantiation: _ZN5doris12variant_util24FieldVisitorToScalarType5applyILNS_13PrimitiveTypeE18EEEmRKNS_19PrimitiveTypeTraitsIXT_EE7CppTypeE
_ZN5doris12variant_util24FieldVisitorToScalarType5applyILNS_13PrimitiveTypeE32EEEmRKNS_19PrimitiveTypeTraitsIXT_EE7CppTypeE
Line
Count
Source
1467
1
    size_t apply(const typename PrimitiveTypeTraits<T>::CppType& x) {
1468
        if constexpr (T == TYPE_ARRAY) {
1469
            size_t size = x.size();
1470
            for (size_t i = 0; i < size; ++i) {
1471
                apply_visitor(*this, x[i]);
1472
            }
1473
            return 0;
1474
        } else if constexpr (T == TYPE_NULL) {
1475
            have_nulls = true;
1476
            return 0;
1477
1
        } else {
1478
1
            field_types.insert(T);
1479
1
            type_indexes.insert(T);
1480
1
            return 0;
1481
1
        }
1482
1
    }
_ZN5doris12variant_util24FieldVisitorToScalarType5applyILNS_13PrimitiveTypeE28EEEmRKNS_19PrimitiveTypeTraitsIXT_EE7CppTypeE
Line
Count
Source
1467
783
    size_t apply(const typename PrimitiveTypeTraits<T>::CppType& x) {
1468
        if constexpr (T == TYPE_ARRAY) {
1469
            size_t size = x.size();
1470
            for (size_t i = 0; i < size; ++i) {
1471
                apply_visitor(*this, x[i]);
1472
            }
1473
            return 0;
1474
        } else if constexpr (T == TYPE_NULL) {
1475
            have_nulls = true;
1476
            return 0;
1477
783
        } else {
1478
783
            field_types.insert(T);
1479
783
            type_indexes.insert(T);
1480
783
            return 0;
1481
783
        }
1482
783
    }
_ZN5doris12variant_util24FieldVisitorToScalarType5applyILNS_13PrimitiveTypeE29EEEmRKNS_19PrimitiveTypeTraitsIXT_EE7CppTypeE
Line
Count
Source
1467
724
    size_t apply(const typename PrimitiveTypeTraits<T>::CppType& x) {
1468
        if constexpr (T == TYPE_ARRAY) {
1469
            size_t size = x.size();
1470
            for (size_t i = 0; i < size; ++i) {
1471
                apply_visitor(*this, x[i]);
1472
            }
1473
            return 0;
1474
        } else if constexpr (T == TYPE_NULL) {
1475
            have_nulls = true;
1476
            return 0;
1477
724
        } else {
1478
724
            field_types.insert(T);
1479
724
            type_indexes.insert(T);
1480
724
            return 0;
1481
724
        }
1482
724
    }
Unexecuted instantiation: _ZN5doris12variant_util24FieldVisitorToScalarType5applyILNS_13PrimitiveTypeE20EEEmRKNS_19PrimitiveTypeTraitsIXT_EE7CppTypeE
_ZN5doris12variant_util24FieldVisitorToScalarType5applyILNS_13PrimitiveTypeE30EEEmRKNS_19PrimitiveTypeTraitsIXT_EE7CppTypeE
Line
Count
Source
1467
70.8k
    size_t apply(const typename PrimitiveTypeTraits<T>::CppType& x) {
1468
        if constexpr (T == TYPE_ARRAY) {
1469
            size_t size = x.size();
1470
            for (size_t i = 0; i < size; ++i) {
1471
                apply_visitor(*this, x[i]);
1472
            }
1473
            return 0;
1474
        } else if constexpr (T == TYPE_NULL) {
1475
            have_nulls = true;
1476
            return 0;
1477
70.8k
        } else {
1478
70.8k
            field_types.insert(T);
1479
70.8k
            type_indexes.insert(T);
1480
70.8k
            return 0;
1481
70.8k
        }
1482
70.8k
    }
_ZN5doris12variant_util24FieldVisitorToScalarType5applyILNS_13PrimitiveTypeE35EEEmRKNS_19PrimitiveTypeTraitsIXT_EE7CppTypeE
Line
Count
Source
1467
588
    size_t apply(const typename PrimitiveTypeTraits<T>::CppType& x) {
1468
        if constexpr (T == TYPE_ARRAY) {
1469
            size_t size = x.size();
1470
            for (size_t i = 0; i < size; ++i) {
1471
                apply_visitor(*this, x[i]);
1472
            }
1473
            return 0;
1474
        } else if constexpr (T == TYPE_NULL) {
1475
            have_nulls = true;
1476
            return 0;
1477
588
        } else {
1478
588
            field_types.insert(T);
1479
588
            type_indexes.insert(T);
1480
588
            return 0;
1481
588
        }
1482
588
    }
Unexecuted instantiation: _ZN5doris12variant_util24FieldVisitorToScalarType5applyILNS_13PrimitiveTypeE22EEEmRKNS_19PrimitiveTypeTraitsIXT_EE7CppTypeE
Unexecuted instantiation: _ZN5doris12variant_util24FieldVisitorToScalarType5applyILNS_13PrimitiveTypeE19EEEmRKNS_19PrimitiveTypeTraitsIXT_EE7CppTypeE
Unexecuted instantiation: _ZN5doris12variant_util24FieldVisitorToScalarType5applyILNS_13PrimitiveTypeE24EEEmRKNS_19PrimitiveTypeTraitsIXT_EE7CppTypeE
_ZN5doris12variant_util24FieldVisitorToScalarType5applyILNS_13PrimitiveTypeE31EEEmRKNS_19PrimitiveTypeTraitsIXT_EE7CppTypeE
Line
Count
Source
1467
44
    size_t apply(const typename PrimitiveTypeTraits<T>::CppType& x) {
1468
        if constexpr (T == TYPE_ARRAY) {
1469
            size_t size = x.size();
1470
            for (size_t i = 0; i < size; ++i) {
1471
                apply_visitor(*this, x[i]);
1472
            }
1473
            return 0;
1474
        } else if constexpr (T == TYPE_NULL) {
1475
            have_nulls = true;
1476
            return 0;
1477
44
        } else {
1478
44
            field_types.insert(T);
1479
44
            type_indexes.insert(T);
1480
44
            return 0;
1481
44
        }
1482
44
    }
1483
1.88M
    void get_scalar_type(PrimitiveType* type) const {
1484
1.88M
        if (type_indexes.size() == 1) {
1485
            // Most cases will have only one type
1486
1.79M
            *type = *type_indexes.begin();
1487
1.79M
            return;
1488
1.79M
        }
1489
90.6k
        DataTypePtr data_type;
1490
90.6k
        get_least_supertype_jsonb(type_indexes, &data_type);
1491
90.6k
        *type = data_type->get_primitive_type();
1492
90.6k
    }
1493
1.88M
    bool contain_nulls() const { return have_nulls; }
1494
1.88M
    bool need_convert_field() const { return field_types.size() > 1; }
1495
1496
private:
1497
    phmap::flat_hash_set<PrimitiveType> type_indexes;
1498
    phmap::flat_hash_set<PrimitiveType> field_types;
1499
    bool have_nulls = false;
1500
};
1501
1502
template <typename Visitor>
1503
20.7M
void get_field_info_impl(const Field& field, FieldInfo* info) {
1504
20.7M
    Visitor to_scalar_type_visitor;
1505
20.7M
    apply_visitor(to_scalar_type_visitor, field);
1506
20.7M
    PrimitiveType type_id;
1507
20.7M
    to_scalar_type_visitor.get_scalar_type(&type_id);
1508
    // array item's dimension may missmatch, eg. [1, 2, [1, 2, 3]]
1509
20.7M
    *info = {type_id, to_scalar_type_visitor.contain_nulls(),
1510
20.7M
             to_scalar_type_visitor.need_convert_field(),
1511
20.7M
             apply_visitor(FieldVisitorToNumberOfDimensions(), field)};
1512
20.7M
}
_ZN5doris12variant_util19get_field_info_implINS0_24FieldVisitorToScalarTypeEEEvRKNS_5FieldEPNS_9FieldInfoE
Line
Count
Source
1503
1.88M
void get_field_info_impl(const Field& field, FieldInfo* info) {
1504
1.88M
    Visitor to_scalar_type_visitor;
1505
1.88M
    apply_visitor(to_scalar_type_visitor, field);
1506
1.88M
    PrimitiveType type_id;
1507
1.88M
    to_scalar_type_visitor.get_scalar_type(&type_id);
1508
    // array item's dimension may missmatch, eg. [1, 2, [1, 2, 3]]
1509
1.88M
    *info = {type_id, to_scalar_type_visitor.contain_nulls(),
1510
1.88M
             to_scalar_type_visitor.need_convert_field(),
1511
1.88M
             apply_visitor(FieldVisitorToNumberOfDimensions(), field)};
1512
1.88M
}
_ZN5doris12variant_util19get_field_info_implINS0_30SimpleFieldVisitorToScalarTypeEEEvRKNS_5FieldEPNS_9FieldInfoE
Line
Count
Source
1503
18.8M
void get_field_info_impl(const Field& field, FieldInfo* info) {
1504
18.8M
    Visitor to_scalar_type_visitor;
1505
18.8M
    apply_visitor(to_scalar_type_visitor, field);
1506
18.8M
    PrimitiveType type_id;
1507
18.8M
    to_scalar_type_visitor.get_scalar_type(&type_id);
1508
    // array item's dimension may missmatch, eg. [1, 2, [1, 2, 3]]
1509
18.8M
    *info = {type_id, to_scalar_type_visitor.contain_nulls(),
1510
18.8M
             to_scalar_type_visitor.need_convert_field(),
1511
18.8M
             apply_visitor(FieldVisitorToNumberOfDimensions(), field)};
1512
18.8M
}
1513
1514
20.6M
void get_field_info(const Field& field, FieldInfo* info) {
1515
20.6M
    if (field.is_complex_field()) {
1516
1.88M
        get_field_info_impl<FieldVisitorToScalarType>(field, info);
1517
18.8M
    } else {
1518
18.8M
        get_field_info_impl<SimpleFieldVisitorToScalarType>(field, info);
1519
18.8M
    }
1520
20.6M
}
1521
1522
bool generate_sub_column_info(const TabletSchema& schema, int32_t col_unique_id,
1523
                              const std::string& path,
1524
252k
                              TabletSchema::SubColumnInfo* sub_column_info) {
1525
252k
    const auto& parent_column = schema.column_by_uid(col_unique_id);
1526
252k
    std::function<void(const TabletColumn&, TabletColumn*)> generate_result_column =
1527
252k
            [&](const TabletColumn& from_column, TabletColumn* to_column) {
1528
15.4k
                to_column->set_name(parent_column.name_lower_case() + "." + path);
1529
15.4k
                to_column->set_type(from_column.type());
1530
15.4k
                to_column->set_parent_unique_id(parent_column.unique_id());
1531
15.4k
                bool is_typed = !parent_column.variant_enable_typed_paths_to_sparse();
1532
15.4k
                to_column->set_path_info(
1533
15.4k
                        PathInData(parent_column.name_lower_case() + "." + path, is_typed));
1534
15.4k
                to_column->set_aggregation_method(parent_column.aggregation());
1535
15.4k
                to_column->set_is_nullable(true);
1536
15.4k
                to_column->set_parent_unique_id(parent_column.unique_id());
1537
15.4k
                if (from_column.is_decimal()) {
1538
15.3k
                    to_column->set_precision(from_column.precision());
1539
15.3k
                }
1540
15.4k
                to_column->set_frac(from_column.frac());
1541
1542
15.4k
                if (from_column.is_array_type()) {
1543
2.69k
                    TabletColumn nested_column;
1544
2.69k
                    generate_result_column(*from_column.get_sub_columns()[0], &nested_column);
1545
2.69k
                    to_column->add_sub_column(nested_column);
1546
2.69k
                }
1547
15.4k
            };
1548
1549
252k
    auto generate_index = [&](const std::string& pattern) {
1550
        // 1. find subcolumn's index
1551
12.7k
        if (const auto& indexes = schema.inverted_index_by_field_pattern(col_unique_id, pattern);
1552
12.7k
            !indexes.empty()) {
1553
4.85k
            for (const auto& index : indexes) {
1554
4.85k
                auto index_ptr = std::make_shared<TabletIndex>(*index);
1555
4.85k
                index_ptr->set_escaped_escaped_index_suffix_path(
1556
4.85k
                        sub_column_info->column.path_info_ptr()->get_path());
1557
4.85k
                sub_column_info->indexes.emplace_back(std::move(index_ptr));
1558
4.85k
            }
1559
4.79k
        }
1560
        // 2. find parent column's index
1561
7.93k
        else if (const auto parent_index = schema.inverted_indexs(col_unique_id);
1562
7.93k
                 !parent_index.empty()) {
1563
489
            inherit_index(parent_index, sub_column_info->indexes, sub_column_info->column);
1564
7.44k
        } else {
1565
7.44k
            sub_column_info->indexes.clear();
1566
7.44k
        }
1567
12.7k
    };
1568
1569
252k
    const auto& sub_columns = parent_column.get_sub_columns();
1570
252k
    for (const auto& sub_column : sub_columns) {
1571
214k
        const char* pattern = sub_column->name().c_str();
1572
214k
        switch (sub_column->pattern_type()) {
1573
5.33k
        case PatternTypePB::MATCH_NAME: {
1574
5.33k
            if (strcmp(pattern, path.c_str()) == 0) {
1575
1.05k
                generate_result_column(*sub_column, &sub_column_info->column);
1576
1.05k
                generate_index(sub_column->name());
1577
1.05k
                return true;
1578
1.05k
            }
1579
4.27k
            break;
1580
5.33k
        }
1581
209k
        case PatternTypePB::MATCH_NAME_GLOB: {
1582
209k
            if (glob_match_re2(pattern, path)) {
1583
11.6k
                generate_result_column(*sub_column, &sub_column_info->column);
1584
11.6k
                generate_index(sub_column->name());
1585
11.6k
                return true;
1586
11.6k
            }
1587
197k
            break;
1588
209k
        }
1589
197k
        default:
1590
0
            break;
1591
214k
        }
1592
214k
    }
1593
239k
    return false;
1594
252k
}
1595
1596
TabletSchemaSPtr VariantCompactionUtil::calculate_variant_extended_schema(
1597
1.41k
        const std::vector<RowsetSharedPtr>& rowsets, const TabletSchemaSPtr& base_schema) {
1598
1.41k
    if (rowsets.empty()) {
1599
0
        return nullptr;
1600
0
    }
1601
1602
1.41k
    std::vector<TabletSchemaSPtr> schemas;
1603
3.31k
    for (const auto& rs : rowsets) {
1604
3.31k
        if (rs->num_segments() == 0) {
1605
3.14k
            continue;
1606
3.14k
        }
1607
177
        const auto& tablet_schema = rs->tablet_schema();
1608
177
        SegmentCacheHandle segment_cache;
1609
177
        auto st = SegmentLoader::instance()->load_segments(std::static_pointer_cast<BetaRowset>(rs),
1610
177
                                                           &segment_cache);
1611
177
        if (!st.ok()) {
1612
0
            return base_schema;
1613
0
        }
1614
177
        for (const auto& segment : segment_cache.get_segments()) {
1615
177
            TabletSchemaSPtr schema = tablet_schema->copy_without_variant_extracted_columns();
1616
360
            for (const auto& column : tablet_schema->columns()) {
1617
360
                if (!column->is_variant_type()) {
1618
177
                    continue;
1619
177
                }
1620
183
                std::shared_ptr<ColumnReader> column_reader;
1621
183
                OlapReaderStatistics stats;
1622
183
                st = segment->get_column_reader(column->unique_id(), &column_reader, &stats);
1623
183
                if (!st.ok()) {
1624
0
                    LOG(WARNING) << "Failed to get column reader for column: " << column->name()
1625
0
                                 << " error: " << st.to_string();
1626
0
                    continue;
1627
0
                }
1628
183
                if (!column_reader) {
1629
0
                    continue;
1630
0
                }
1631
1632
183
                CHECK(column_reader->get_meta_type() == FieldType::OLAP_FIELD_TYPE_VARIANT);
1633
183
                auto* variant_column_reader =
1634
183
                        assert_cast<segment_v2::VariantColumnReader*>(column_reader.get());
1635
                // load external meta before getting subcolumn meta info
1636
183
                st = variant_column_reader->load_external_meta_once();
1637
183
                if (!st.ok()) {
1638
0
                    LOG(WARNING) << "Failed to load external meta for column: " << column->name()
1639
0
                                 << " error: " << st.to_string();
1640
0
                    continue;
1641
0
                }
1642
183
                const auto* subcolumn_meta_info = variant_column_reader->get_subcolumns_meta_info();
1643
515
                for (const auto& entry : *subcolumn_meta_info) {
1644
515
                    if (entry->path.empty()) {
1645
183
                        continue;
1646
183
                    }
1647
332
                    const std::string& column_name =
1648
332
                            column->name_lower_case() + "." + entry->path.get_path();
1649
332
                    const DataTypePtr& data_type = entry->data.file_column_type;
1650
332
                    PathInDataBuilder full_path_builder;
1651
332
                    auto full_path = full_path_builder.append(column->name_lower_case(), false)
1652
332
                                             .append(entry->path.get_parts(), false)
1653
332
                                             .build();
1654
332
                    TabletColumn subcolumn =
1655
332
                            get_column_by_type(data_type, column_name,
1656
332
                                               ExtraInfo {.unique_id = -1,
1657
332
                                                          .parent_unique_id = column->unique_id(),
1658
332
                                                          .path_info = full_path});
1659
332
                    schema->append_column(subcolumn);
1660
332
                }
1661
183
            }
1662
177
            schemas.emplace_back(schema);
1663
177
        }
1664
177
    }
1665
1.41k
    TabletSchemaSPtr least_common_schema;
1666
1.41k
    auto st = get_least_common_schema(schemas, base_schema, least_common_schema, false);
1667
1.41k
    if (!st.ok()) {
1668
0
        return base_schema;
1669
0
    }
1670
1.41k
    return least_common_schema;
1671
1.41k
}
1672
1673
bool inherit_index(const std::vector<const TabletIndex*>& parent_indexes,
1674
                   TabletIndexes& subcolumns_indexes, FieldType column_type,
1675
69.2k
                   const std::string& suffix_path, bool is_array_nested_type) {
1676
69.2k
    if (parent_indexes.empty()) {
1677
67.3k
        return false;
1678
67.3k
    }
1679
1.87k
    subcolumns_indexes.clear();
1680
    // bkd index or array index only need to inherit one index
1681
1.87k
    if (field_is_numeric_type(column_type) ||
1682
1.87k
        (is_array_nested_type &&
1683
1.40k
         (field_is_numeric_type(column_type) || field_is_slice_type(column_type)))) {
1684
1.40k
        auto index_ptr = std::make_shared<TabletIndex>(*parent_indexes[0]);
1685
1.40k
        index_ptr->set_escaped_escaped_index_suffix_path(suffix_path);
1686
        // no need parse for bkd index or array index
1687
1.40k
        index_ptr->remove_parser_and_analyzer();
1688
1.40k
        subcolumns_indexes.emplace_back(std::move(index_ptr));
1689
1.40k
        return true;
1690
1.40k
    }
1691
    // string type need to inherit all indexes
1692
471
    else if (field_is_slice_type(column_type) && !is_array_nested_type) {
1693
484
        for (const auto& index : parent_indexes) {
1694
484
            auto index_ptr = std::make_shared<TabletIndex>(*index);
1695
484
            index_ptr->set_escaped_escaped_index_suffix_path(suffix_path);
1696
484
            subcolumns_indexes.emplace_back(std::move(index_ptr));
1697
484
        }
1698
453
        return true;
1699
453
    }
1700
18
    return false;
1701
1.87k
}
1702
1703
bool inherit_index(const std::vector<const TabletIndex*>& parent_indexes,
1704
69.2k
                   TabletIndexes& subcolumns_indexes, const TabletColumn& column) {
1705
69.2k
    if (!column.is_extracted_column()) {
1706
3
        return false;
1707
3
    }
1708
69.2k
    if (column.is_array_type()) {
1709
1.08k
        if (column.get_sub_columns().empty()) {
1710
0
            return false;
1711
0
        }
1712
1.08k
        const TabletColumn* nested = column.get_sub_columns()[0].get();
1713
1.08k
        while (nested != nullptr && nested->is_array_type()) {
1714
0
            if (nested->get_sub_columns().empty()) {
1715
0
                return false;
1716
0
            }
1717
0
            nested = nested->get_sub_columns()[0].get();
1718
0
        }
1719
1.08k
        if (nested == nullptr) {
1720
0
            return false;
1721
0
        }
1722
1.08k
        return inherit_index(parent_indexes, subcolumns_indexes, nested->type(),
1723
1.08k
                             column.path_info_ptr()->get_path(), true);
1724
1.08k
    }
1725
68.1k
    return inherit_index(parent_indexes, subcolumns_indexes, column.type(),
1726
68.1k
                         column.path_info_ptr()->get_path());
1727
69.2k
}
1728
1729
bool inherit_index(const std::vector<const TabletIndex*>& parent_indexes,
1730
0
                   TabletIndexes& subcolumns_indexes, const ColumnMetaPB& column_pb) {
1731
0
    if (!column_pb.has_column_path_info()) {
1732
0
        return false;
1733
0
    }
1734
0
    if (column_pb.type() == (int)FieldType::OLAP_FIELD_TYPE_ARRAY) {
1735
0
        if (column_pb.children_columns_size() == 0) {
1736
0
            return false;
1737
0
        }
1738
0
        const ColumnMetaPB* nested = &column_pb.children_columns(0);
1739
0
        while (nested != nullptr && nested->type() == (int)FieldType::OLAP_FIELD_TYPE_ARRAY) {
1740
0
            if (nested->children_columns_size() == 0) {
1741
0
                return false;
1742
0
            }
1743
0
            nested = &nested->children_columns(0);
1744
0
        }
1745
0
        if (nested == nullptr) {
1746
0
            return false;
1747
0
        }
1748
0
        return inherit_index(parent_indexes, subcolumns_indexes, (FieldType)nested->type(),
1749
0
                             column_pb.column_path_info().path(), true);
1750
0
    }
1751
0
    return inherit_index(parent_indexes, subcolumns_indexes, (FieldType)column_pb.type(),
1752
0
                         column_pb.column_path_info().path());
1753
0
}
1754
1755
// ============ Implementation from parse2column.cpp ============
1756
1757
/** Pool for objects that cannot be used from different threads simultaneously.
1758
  * Allows to create an object for each thread.
1759
  * Pool has unbounded size and objects are not destroyed before destruction of pool.
1760
  *
1761
  * Use it in cases when thread local storage is not appropriate
1762
  *  (when maximum number of simultaneously used objects is less
1763
  *   than number of running/sleeping threads, that has ever used object,
1764
  *   and creation/destruction of objects is expensive).
1765
  */
1766
template <typename T>
1767
class SimpleObjectPool {
1768
protected:
1769
    /// Hold all available objects in stack.
1770
    std::mutex mutex;
1771
    std::stack<std::unique_ptr<T>> stack;
1772
    /// Specialized deleter for std::unique_ptr.
1773
    /// Returns underlying pointer back to stack thus reclaiming its ownership.
1774
    struct Deleter {
1775
        SimpleObjectPool<T>* parent;
1776
16.9k
        Deleter(SimpleObjectPool<T>* parent_ = nullptr) : parent {parent_} {} /// NOLINT
1777
16.9k
        void operator()(T* owning_ptr) const {
1778
16.9k
            std::lock_guard lock {parent->mutex};
1779
16.9k
            parent->stack.emplace(owning_ptr);
1780
16.9k
        }
1781
    };
1782
1783
public:
1784
    using Pointer = std::unique_ptr<T, Deleter>;
1785
    /// Extracts and returns a pointer from the stack if it's not empty,
1786
    ///  creates a new one by calling provided f() otherwise.
1787
    template <typename Factory>
1788
16.9k
    Pointer get(Factory&& f) {
1789
16.9k
        std::unique_lock lock(mutex);
1790
16.9k
        if (stack.empty()) {
1791
25
            return {f(), this};
1792
25
        }
1793
16.9k
        auto object = stack.top().release();
1794
16.9k
        stack.pop();
1795
16.9k
        return std::unique_ptr<T, Deleter>(object, Deleter(this));
1796
16.9k
    }
variant_util.cpp:_ZN5doris12variant_util16SimpleObjectPoolINS_14JSONDataParserINS_14SimdJSONParserEEEE3getIZNS0_21parse_json_to_variantERNS_7IColumnERKNS_9StringRefEPS4_RKNS_11ParseConfigEE3$_0EESt10unique_ptrIS4_NS5_7DeleterEEOT_
Line
Count
Source
1788
12.4k
    Pointer get(Factory&& f) {
1789
12.4k
        std::unique_lock lock(mutex);
1790
12.4k
        if (stack.empty()) {
1791
1
            return {f(), this};
1792
1
        }
1793
12.4k
        auto object = stack.top().release();
1794
12.4k
        stack.pop();
1795
12.4k
        return std::unique_ptr<T, Deleter>(object, Deleter(this));
1796
12.4k
    }
variant_util.cpp:_ZN5doris12variant_util16SimpleObjectPoolINS_14JSONDataParserINS_14SimdJSONParserEEEE3getIZNS0_21parse_json_to_variantERNS_7IColumnERKNS_9ColumnStrIjEERKNS_11ParseConfigEE3$_0EESt10unique_ptrIS4_NS5_7DeleterEEOT_
Line
Count
Source
1788
4.52k
    Pointer get(Factory&& f) {
1789
4.52k
        std::unique_lock lock(mutex);
1790
4.52k
        if (stack.empty()) {
1791
24
            return {f(), this};
1792
24
        }
1793
4.49k
        auto object = stack.top().release();
1794
4.49k
        stack.pop();
1795
4.49k
        return std::unique_ptr<T, Deleter>(object, Deleter(this));
1796
4.52k
    }
1797
    /// Like get(), but creates object using default constructor.
1798
    Pointer getDefault() {
1799
        return get([] { return new T; });
1800
    }
1801
};
1802
1803
SimpleObjectPool<JsonParser> parsers_pool;
1804
1805
using Node = typename ColumnVariant::Subcolumns::Node;
1806
1807
42.8M
static inline void append_binary_bytes(ColumnString::Chars& chars, const void* data, size_t size) {
1808
42.8M
    const auto old_size = chars.size();
1809
42.8M
    chars.resize(old_size + size);
1810
42.8M
    memcpy(chars.data() + old_size, reinterpret_cast<const char*>(data), size);
1811
42.8M
}
1812
1813
17.6M
static inline void append_binary_type(ColumnString::Chars& chars, FieldType type) {
1814
17.6M
    const uint8_t t = static_cast<uint8_t>(type);
1815
17.6M
    append_binary_bytes(chars, &t, sizeof(uint8_t));
1816
17.6M
}
1817
1818
10.8M
static inline void append_binary_sizet(ColumnString::Chars& chars, size_t v) {
1819
10.8M
    append_binary_bytes(chars, &v, sizeof(size_t));
1820
10.8M
}
1821
1822
17.6M
static void append_field_to_binary_chars(const Field& field, ColumnString::Chars& chars) {
1823
17.6M
    switch (field.get_type()) {
1824
14
    case PrimitiveType::TYPE_NULL: {
1825
14
        append_binary_type(chars, FieldType::OLAP_FIELD_TYPE_NONE);
1826
14
        return;
1827
0
    }
1828
261k
    case PrimitiveType::TYPE_BOOLEAN: {
1829
261k
        append_binary_type(chars,
1830
261k
                           TabletColumn::get_field_type_by_type(PrimitiveType::TYPE_BOOLEAN));
1831
261k
        const auto v = static_cast<UInt8>(field.get<PrimitiveType::TYPE_BOOLEAN>());
1832
261k
        append_binary_bytes(chars, &v, sizeof(UInt8));
1833
261k
        return;
1834
0
    }
1835
4.50M
    case PrimitiveType::TYPE_BIGINT: {
1836
4.50M
        append_binary_type(chars, TabletColumn::get_field_type_by_type(PrimitiveType::TYPE_BIGINT));
1837
4.50M
        const auto v = field.get<PrimitiveType::TYPE_BIGINT>();
1838
4.50M
        append_binary_bytes(chars, &v, sizeof(Int64));
1839
4.50M
        return;
1840
0
    }
1841
9
    case PrimitiveType::TYPE_LARGEINT: {
1842
9
        append_binary_type(chars,
1843
9
                           TabletColumn::get_field_type_by_type(PrimitiveType::TYPE_LARGEINT));
1844
9
        const auto v = field.get<PrimitiveType::TYPE_LARGEINT>();
1845
9
        append_binary_bytes(chars, &v, sizeof(int128_t));
1846
9
        return;
1847
0
    }
1848
2.74M
    case PrimitiveType::TYPE_DOUBLE: {
1849
2.74M
        append_binary_type(chars, TabletColumn::get_field_type_by_type(PrimitiveType::TYPE_DOUBLE));
1850
2.74M
        const auto v = field.get<PrimitiveType::TYPE_DOUBLE>();
1851
2.74M
        append_binary_bytes(chars, &v, sizeof(Float64));
1852
2.74M
        return;
1853
0
    }
1854
10.1M
    case PrimitiveType::TYPE_STRING: {
1855
10.1M
        append_binary_type(chars, FieldType::OLAP_FIELD_TYPE_STRING);
1856
10.1M
        const auto& v = field.get<PrimitiveType::TYPE_STRING>();
1857
10.1M
        append_binary_sizet(chars, v.size());
1858
10.1M
        append_binary_bytes(chars, v.data(), v.size());
1859
10.1M
        return;
1860
0
    }
1861
46.7k
    case PrimitiveType::TYPE_JSONB: {
1862
46.7k
        append_binary_type(chars, FieldType::OLAP_FIELD_TYPE_JSONB);
1863
46.7k
        const auto& v = field.get<PrimitiveType::TYPE_JSONB>();
1864
46.7k
        append_binary_sizet(chars, v.get_size());
1865
46.7k
        append_binary_bytes(chars, v.get_value(), v.get_size());
1866
46.7k
        return;
1867
0
    }
1868
524k
    case PrimitiveType::TYPE_ARRAY: {
1869
524k
        append_binary_type(chars, FieldType::OLAP_FIELD_TYPE_ARRAY);
1870
524k
        const auto& a = field.get<PrimitiveType::TYPE_ARRAY>();
1871
524k
        append_binary_sizet(chars, a.size());
1872
787k
        for (const auto& elem : a) {
1873
787k
            append_field_to_binary_chars(elem, chars);
1874
787k
        }
1875
524k
        return;
1876
0
    }
1877
0
    default:
1878
0
        throw doris::Exception(ErrorCode::INVALID_ARGUMENT, "Unsupported field type {}",
1879
0
                               field.get_type());
1880
17.6M
    }
1881
17.6M
}
1882
template <typename ParserImpl>
1883
void parse_json_to_variant_impl(IColumn& column, const char* src, size_t length,
1884
1.35M
                                JSONDataParser<ParserImpl>* parser, const ParseConfig& config) {
1885
1.35M
    auto& column_variant = assert_cast<ColumnVariant&>(column);
1886
1.35M
    std::optional<ParseResult> result;
1887
    /// Treat empty string as an empty object
1888
    /// for better CAST from String to Object.
1889
1.35M
    if (length > 0) {
1890
1.35M
        result = parser->parse(src, length, config);
1891
1.35M
    } else {
1892
3.99k
        result = ParseResult {};
1893
3.99k
    }
1894
1.35M
    if (!result) {
1895
664
        VLOG_DEBUG << "failed to parse " << std::string_view(src, length) << ", length= " << length;
1896
664
        if (config::variant_throw_exeception_on_invalid_json) {
1897
0
            throw doris::Exception(ErrorCode::INVALID_ARGUMENT, "Failed to parse object {}",
1898
0
                                   std::string_view(src, length));
1899
0
        }
1900
        // Treat as string
1901
664
        PathInData root_path;
1902
664
        Field field = Field::create_field<TYPE_STRING>(String(src, length));
1903
664
        result = ParseResult {{root_path}, {field}};
1904
664
    }
1905
1.35M
    auto& [paths, values] = *result;
1906
1.35M
    assert(paths.size() == values.size());
1907
1.35M
    size_t old_num_rows = column_variant.rows();
1908
1.35M
    if (config.deprecated_enable_flatten_nested) {
1909
        // here we should check the paths in variant and paths in result,
1910
        // if two paths which same prefix have different structure, we should throw an exception
1911
3.02k
        std::vector<PathInData> check_paths;
1912
12.0k
        for (const auto& entry : column_variant.get_subcolumns()) {
1913
12.0k
            check_paths.push_back(entry->path);
1914
12.0k
        }
1915
3.02k
        check_paths.insert(check_paths.end(), paths.begin(), paths.end());
1916
3.02k
        THROW_IF_ERROR(check_variant_has_no_ambiguous_paths(check_paths));
1917
3.02k
    }
1918
1.35M
    auto [doc_value_data_paths, doc_value_data_values] =
1919
1.35M
            column_variant.get_doc_value_data_paths_and_values();
1920
1.35M
    auto& doc_value_data_offsets = column_variant.serialized_doc_value_column_offsets();
1921
1922
1.41M
    auto flush_defaults = [](ColumnVariant::Subcolumn* subcolumn) {
1923
1.41M
        const auto num_defaults = subcolumn->cur_num_of_defaults();
1924
1.41M
        if (num_defaults > 0) {
1925
165k
            subcolumn->insert_many_defaults(num_defaults);
1926
165k
            subcolumn->reset_current_num_of_defaults();
1927
165k
        }
1928
1.41M
    };
1929
1930
1.35M
    auto is_plain_path = [](const PathInData& path) {
1931
13
        for (const auto& part : path.get_parts()) {
1932
13
            if (part.is_nested || part.anonymous_array_level != 0) {
1933
0
                return false;
1934
0
            }
1935
13
        }
1936
9
        return true;
1937
9
    };
1938
1939
1.35M
    auto get_or_create_subcolumn = [&](const PathInData& path, size_t index_hint,
1940
1.41M
                                       const FieldInfo& field_info) -> ColumnVariant::Subcolumn* {
1941
1.41M
        auto* subcolumn = column_variant.get_subcolumn(path, index_hint);
1942
1.41M
        if (subcolumn == nullptr) {
1943
3.67k
            if (path.has_nested_part()) {
1944
17
                column_variant.add_nested_subcolumn(path, field_info, old_num_rows);
1945
3.66k
            } else {
1946
3.66k
                column_variant.add_sub_column(path, old_num_rows);
1947
3.66k
            }
1948
3.67k
            subcolumn = column_variant.get_subcolumn(path, index_hint);
1949
3.67k
        }
1950
1.41M
        if (!subcolumn) {
1951
0
            throw doris::Exception(ErrorCode::INVALID_ARGUMENT, "Failed to find sub column {}",
1952
0
                                   path.get_path());
1953
0
        }
1954
1.41M
        return subcolumn;
1955
1.41M
    };
1956
1957
1.41M
    auto normalize_plain_path = [&](const PathInData& path) {
1958
1.41M
        if (!config.check_duplicate_json_path || path.empty() || !is_plain_path(path)) {
1959
1.41M
            return path;
1960
1.41M
        }
1961
9
        return PathInData(path.get_path());
1962
1.41M
    };
1963
1964
1.35M
    auto insert_into_subcolumn = [&](size_t i,
1965
1.41M
                                     bool check_size_mismatch) -> ColumnVariant::Subcolumn* {
1966
1.41M
        FieldInfo field_info;
1967
1.41M
        get_field_info(values[i], &field_info);
1968
1.41M
        if (field_info.scalar_type_id == PrimitiveType::INVALID_TYPE) {
1969
104
            return nullptr;
1970
104
        }
1971
1.41M
        auto path = normalize_plain_path(paths[i]);
1972
1.41M
        auto* subcolumn = get_or_create_subcolumn(path, i, field_info);
1973
1.41M
        flush_defaults(subcolumn);
1974
1.41M
        if (check_size_mismatch && subcolumn->size() != old_num_rows) {
1975
1
            throw doris::Exception(ErrorCode::INVALID_ARGUMENT,
1976
1
                                   "subcolumn {} size missmatched, may contains duplicated entry",
1977
1
                                   path.get_path());
1978
1
        }
1979
1.41M
        subcolumn->insert(std::move(values[i]), std::move(field_info));
1980
1.41M
        return subcolumn;
1981
1.41M
    };
1982
1983
1.35M
    switch (config.parse_to) {
1984
82.1k
    case ParseConfig::ParseTo::OnlySubcolumns:
1985
1.50M
        for (size_t i = 0; i < paths.size(); ++i) {
1986
1.41M
            insert_into_subcolumn(i, true);
1987
1.41M
        }
1988
82.1k
        break;
1989
1.27M
    case ParseConfig::ParseTo::OnlyDocValueColumn: {
1990
1.27M
        std::vector<size_t> doc_item_indexes;
1991
1.27M
        doc_item_indexes.reserve(paths.size());
1992
1.27M
        phmap::flat_hash_set<StringRef, StringRefHash> seen_paths;
1993
1.27M
        seen_paths.reserve(paths.size());
1994
1995
19.1M
        for (size_t i = 0; i < paths.size(); ++i) {
1996
17.8M
            FieldInfo field_info;
1997
17.8M
            get_field_info(values[i], &field_info);
1998
17.8M
            if (paths[i].empty()) {
1999
                // Plain non-doc VARIANT can use doc-value KV as writer-side staging. An
2000
                // invalid root entry from JSON object/array is neither a scalar root value nor
2001
                // a doc KV path, so leave this row's doc offset empty. Doc-mode and valid scalar
2002
                // roots still populate the root subcolumn below.
2003
798
                if (!column_variant.enable_doc_mode() &&
2004
798
                    field_info.scalar_type_id == PrimitiveType::INVALID_TYPE) {
2005
3
                    continue;
2006
3
                }
2007
795
                auto* subcolumn = column_variant.get_subcolumn(paths[i]);
2008
795
                DCHECK(subcolumn != nullptr);
2009
795
                flush_defaults(subcolumn);
2010
795
                subcolumn->insert(std::move(values[i]), std::move(field_info));
2011
795
                continue;
2012
798
            }
2013
17.8M
            if (field_info.scalar_type_id == PrimitiveType::INVALID_TYPE ||
2014
17.8M
                values[i].get_type() == PrimitiveType::TYPE_NULL) {
2015
116k
                continue;
2016
116k
            }
2017
17.7M
            const auto& path_str = paths[i].get_path();
2018
17.7M
            StringRef path_ref {path_str.data(), path_str.size()};
2019
17.7M
            if (UNLIKELY(!seen_paths.emplace(path_ref).second)) {
2020
2
                throw doris::Exception(ErrorCode::INVALID_ARGUMENT,
2021
2
                                       "may contains duplicated entry : {}",
2022
2
                                       std::string_view(path_str));
2023
2
            }
2024
17.7M
            doc_item_indexes.push_back(i);
2025
17.7M
        }
2026
2027
1.27M
        std::sort(doc_item_indexes.begin(), doc_item_indexes.end(),
2028
71.2M
                  [&](size_t l, size_t r) { return paths[l].get_path() < paths[r].get_path(); });
2029
16.1M
        for (const auto idx : doc_item_indexes) {
2030
16.1M
            const auto& path_str = paths[idx].get_path();
2031
16.1M
            doc_value_data_paths->insert_data(path_str.data(), path_str.size());
2032
16.1M
            auto& chars = doc_value_data_values->get_chars();
2033
16.1M
            append_field_to_binary_chars(values[idx], chars);
2034
16.1M
            doc_value_data_values->get_offsets().push_back(chars.size());
2035
16.1M
        }
2036
1.27M
    } break;
2037
1.35M
    }
2038
1.35M
    doc_value_data_offsets.push_back(doc_value_data_paths->size());
2039
    // /// Insert default values to missed subcolumns.
2040
1.35M
    const auto& subcolumns = column_variant.get_subcolumns();
2041
5.54M
    for (const auto& entry : subcolumns) {
2042
5.54M
        if (entry->data.size() == old_num_rows) {
2043
            // Handle nested paths differently from simple paths
2044
4.12M
            if (entry->path.has_nested_part()) {
2045
                // Try to insert default from nested, if failed, insert regular default
2046
0
                bool success = UNLIKELY(column_variant.try_insert_default_from_nested(entry));
2047
0
                if (!success) {
2048
0
                    entry->data.insert_default();
2049
0
                }
2050
4.12M
            } else {
2051
                // For non-nested paths, increment default counter
2052
4.12M
                entry->data.increment_default_counter();
2053
4.12M
            }
2054
4.12M
        }
2055
5.54M
    }
2056
1.35M
    column_variant.incr_num_rows();
2057
1.35M
    if (column_variant.get_sparse_column()->size() == old_num_rows) {
2058
1.35M
        column_variant.get_sparse_column_mutable().insert_default();
2059
1.35M
    }
2060
1.35M
#ifndef NDEBUG
2061
1.35M
    column_variant.check_consistency();
2062
1.35M
#endif
2063
1.35M
}
2064
2065
// exposed interfaces
2066
// Parses a single JSON document into `column`.
//
// @param column  destination column, forwarded unchanged to parse_json_to_variant_impl()
// @param json    raw JSON bytes (pointer + length)
// @param parser  parser to use; when null, one is taken from `parsers_pool` for this call
// @param config  parse options forwarded to the implementation
void parse_json_to_variant(IColumn& column, const StringRef& json, JsonParser* parser,
                           const ParseConfig& config) {
    // Caller supplied its own parser: use it directly and do not touch the pool.
    if (parser != nullptr) {
        parse_json_to_variant_impl(column, json.data, json.size, parser, config);
        return;
    }

    // No parser given: take one from the shared pool; the handle lives until this function returns.
    auto pooled_parser = parsers_pool.get([] { return new JsonParser(); });
    parse_json_to_variant_impl(column, json.data, json.size, pooled_parser.get(), config);
}
2075
2076
// Parses every row of `raw_json_column` into `column`, then finalizes `column`.
//
// A single parser obtained from `parsers_pool` is reused for all rows.
//
// @param column           destination column, finalized before returning
// @param raw_json_column  string column holding one JSON document per row
// @param config           parse options forwarded to parse_json_to_variant_impl()
void parse_json_to_variant(IColumn& column, const ColumnString& raw_json_column,
                           const ParseConfig& config) {
    auto pooled_parser = parsers_pool.get([] { return new JsonParser(); });
    JsonParser* const json_parser = pooled_parser.get();

    const size_t row_count = raw_json_column.size();
    for (size_t row = 0; row != row_count; ++row) {
        const StringRef document = raw_json_column.get_data_at(row);
        parse_json_to_variant_impl(column, document.data, document.size, json_parser, config);
    }

    column.finalize();
}
2085
2086
// parse the doc snapshot column to subcolumns
2087
0
void materialize_docs_to_subcolumns(ColumnVariant& column_variant) {
2088
0
    auto subcolumns = materialize_docs_to_subcolumns_map(column_variant);
2089
2090
0
    for (auto& entry : subcolumns) {
2091
0
        entry.second.finalize();
2092
0
        if (!column_variant.add_sub_column(PathInData(entry.first),
2093
0
                                           IColumn::mutate(entry.second.get_finalized_column_ptr()),
2094
0
                                           entry.second.get_least_common_type())) {
2095
0
            throw doris::Exception(ErrorCode::INTERNAL_ERROR,
2096
0
                                   "Failed to add subcolumn {}, which is from doc snapshot column",
2097
0
                                   entry.first);
2098
0
        }
2099
0
    }
2100
2101
0
    column_variant.finalize();
2102
0
}
2103
2104
// ============ Implementation from variant_util.cpp ============
2105
2106
// Expands the doc-value (path, value) entries of `variant` into one Subcolumn per distinct path.
//
// Each returned subcolumn ends up with exactly `num_rows` entries: rows in which a path does
// not occur are filled with defaults.
//
// @param variant                variant column providing the doc-value paths, values and
//                               per-row offsets
// @param expected_unique_paths  reserve hint for the result map; 0 means "derive a hint from
//                               the number of stored keys, capped at kInitialPathReserve"
// @return map from path to its materialized subcolumn. The keys are string_views built from
//         the bytes returned by the key column's get_data_at(), so they do not own their
//         storage. NOTE(review): presumably they stay valid only while `variant` is alive and
//         unmodified — confirm.
phmap::flat_hash_map<std::string_view, ColumnVariant::Subcolumn> materialize_docs_to_subcolumns_map(
        const ColumnVariant& variant, size_t expected_unique_paths) {
    constexpr size_t kInitialPathReserve = 8192;

    const auto [column_key, column_value] = variant.get_doc_value_data_paths_and_values();
    const auto& row_offsets = variant.serialized_doc_value_column_offsets();
    const size_t num_rows = row_offsets.size();

    DCHECK_EQ(num_rows, variant.size()) << "doc snapshot offsets size mismatch with variant rows";

    phmap::flat_hash_map<std::string_view, ColumnVariant::Subcolumn> result;
    size_t reserve_hint = expected_unique_paths;
    if (reserve_hint == 0) {
        reserve_hint = std::min<size_t>(column_key->size(), kInitialPathReserve);
    }
    result.reserve(reserve_hint);

    for (size_t row = 0; row < num_rows; ++row) {
        // Entries of `row` occupy [row_offsets[row - 1], row_offsets[row]).
        // NOTE(review): for row == 0 the index expression `row - 1` wraps around; this relies
        // on the offsets container yielding 0 for index -1 (presumably a left-padded array) —
        // confirm against the offsets type.
        const size_t entry_begin = row_offsets[row - 1];
        const size_t entry_end = row_offsets[row];

        for (size_t entry = entry_begin; entry < entry_end; ++entry) {
            const auto& key = column_key->get_data_at(entry);
            const std::string_view path_sv(key.data, key.size);

            auto [slot, is_new_path] =
                    result.try_emplace(path_sv, ColumnVariant::Subcolumn {0, true, false});
            auto& subcolumn = slot->second;

            // Back-fill defaults for the rows in which this path was absent so that the value
            // deserialized below lands at index `row`.
            if (is_new_path) {
                subcolumn.insert_many_defaults(row);
            } else if (const size_t filled = subcolumn.size(); filled != row) {
                subcolumn.insert_many_defaults(row - filled);
            }

            subcolumn.deserialize_from_binary_column(column_value, entry);
        }
    }

    // Pad paths that are missing from the trailing rows up to the full row count.
    for (auto& path_and_subcolumn : result) {
        auto& subcolumn = path_and_subcolumn.second;
        const size_t filled = subcolumn.size();
        if (filled != num_rows) {
            subcolumn.insert_many_defaults(num_rows - filled);
        }
    }

    return result;
}
2148
2149
// Parses the not-yet-parsed ("scalar") VARIANT columns of `block` in place.
//
// For every block position in `variant_pos` the column is unwrapped from its ColumnNullable
// (if any) and finalized. Columns that are not scalar variants are treated as already parsed
// and left untouched. For scalar variants the root column is obtained as a string column
// (JSONB roots are cast to string first) and parsed with parse_json_to_variant(); roots that
// are not strings are instead coerced to the most common root type. The result, re-wrapped
// with the original null map when the input was nullable, replaces the column in `block`.
//
// @param block        block whose columns are replaced in place
// @param variant_pos  positions inside `block` of the columns to process
// @param configs      parse options; configs[i] belongs to variant_pos[i]
// @return Status::OK() on success; the error from cast_column() if the JSONB cast fails;
//         InternalError when a column that needs parsing has no matching entry in `configs`.
Status _parse_and_materialize_variant_columns(Block& block,
                                              const std::vector<uint32_t>& variant_pos,
                                              const std::vector<ParseConfig>& configs) {
    for (size_t i = 0; i < variant_pos.size(); ++i) {
        auto column_ref = block.get_by_position(variant_pos[i]).column;
        bool is_nullable = is_column_nullable(*column_ref);
        MutableColumnPtr owner_column = IColumn::mutate(std::move(column_ref));
        ColumnPtr nullable_null_map;
        MutableColumnPtr var_column;
        if (is_nullable) {
            // Remember the null map so the parsed variant can be wrapped again below.
            const auto& nullable = assert_cast<const ColumnNullable&>(*owner_column);
            nullable_null_map = nullable.get_null_map_column_ptr();
            var_column = IColumn::mutate(nullable.get_nested_column_ptr());
        } else {
            var_column = std::move(owner_column);
        }
        auto& var = assert_cast<ColumnVariant&>(*var_column);
        var_column->finalize();

        if (!var.is_scalar_variant()) {
            // already parsed
            continue;
        }

        VLOG_DEBUG << "parse scalar variant column: " << var.get_root_type()->get_name();
        ColumnPtr scalar_root_column;
        if (var.get_root_type()->get_primitive_type() == TYPE_JSONB) {
            // TODO more efficient way to parse jsonb type, currently we just convert jsonb to
            // json str and parse them into variant
            RETURN_IF_ERROR(cast_column({var.get_root(), var.get_root_type(), ""},
                                        var.get_root()->is_nullable()
                                                ? make_nullable(std::make_shared<DataTypeString>())
                                                : std::make_shared<DataTypeString>(),
                                        &scalar_root_column));
            if (is_column_nullable(*scalar_root_column)) {
                scalar_root_column = assert_cast<const ColumnNullable*>(scalar_root_column.get())
                                             ->get_nested_column_ptr();
            }
        } else {
            const auto& root = *var.get_root();
            scalar_root_column =
                    is_column_nullable(root)
                            ? assert_cast<const ColumnNullable&>(root).get_nested_column_ptr()
                            : var.get_root();
        }

        MutableColumnPtr variant_column;
        if (scalar_root_column->is_column_string()) {
            // configs is indexed in parallel with variant_pos; refuse to read past its end
            // instead of invoking undefined behavior. The check is done lazily so callers
            // whose columns are all already parsed are not affected.
            if (i >= configs.size()) {
                return Status::InternalError(
                        "missing parse config for variant column at block position {}, "
                        "configs size: {}",
                        variant_pos[i], configs.size());
            }
            variant_column = ColumnVariant::create(0, var.enable_doc_mode());
            parse_json_to_variant(*variant_column.get(),
                                  assert_cast<const ColumnString&>(*scalar_root_column),
                                  configs[i]);
        } else {
            // Root maybe other types rather than string like ColumnVariant(Int32).
            // In this case, we should finalize the root and cast to JSON type
            auto expected_root_type =
                    make_nullable(std::make_shared<ColumnVariant::MostCommonType>());
            var.ensure_root_node_type(expected_root_type);
            variant_column = std::move(var_column);
        }

        // Wrap variant with nullmap if it is nullable
        ColumnPtr result = variant_column->get_ptr();
        if (is_nullable) {
            result = ColumnNullable::create(result, nullable_null_map);
        }
        block.get_by_position(variant_pos[i]).column = result;
    }
    return Status::OK();
}
2219
2220
// Exception-guarded entry point around _parse_and_materialize_variant_columns().
//
// The call runs inside RETURN_IF_CATCH_EXCEPTION (macro defined elsewhere), which is expected
// to turn exceptions raised while parsing into a returned Status.
//
// @param block        block whose variant columns are parsed in place
// @param variant_pos  positions inside `block` of the variant columns
// @param configs      parse options, one per entry of `variant_pos`
Status parse_and_materialize_variant_columns(Block& block, const std::vector<uint32_t>& variant_pos,
                                             const std::vector<ParseConfig>& configs) {
    RETURN_IF_CATCH_EXCEPTION({
        Status parse_status = _parse_and_materialize_variant_columns(block, variant_pos, configs);
        return parse_status;
    });
}
2225
2226
namespace {
2227
2228
ParseConfig::ParseTo select_storage_variant_parse_target(const TabletColumn& column,
2229
5.17k
                                                         const ParseConfig& config) {
2230
    // NestedGroup consumes the parse-time subcolumn tree to build nested storage structures, so it
2231
    // must not go through doc-value staging.
2232
5.17k
    if (column.variant_enable_nested_group()) {
2233
4
        return ParseConfig::ParseTo::OnlySubcolumns;
2234
4
    }
2235
2236
    // Persistent doc mode owns doc-value bucket columns in VariantDocWriter. Keep it separate from
2237
    // the plain non-doc staging optimization, even when typed paths or parent indexes exist.
2238
5.17k
    if (column.variant_enable_doc_mode()) {
2239
1.93k
        return ParseConfig::ParseTo::OnlyDocValueColumn;
2240
1.93k
    }
2241
2242
    // Deprecated flatten-nested still consumes parse-time subcolumns. Predefined typed paths and
2243
    // parent inverted indexes are handled later by regular doc-value staging: typed paths are
2244
    // forced into the materialized set unless typed-to-sparse is enabled, and materialized dynamic
2245
    // subcolumns inherit parent indexes while sparse payloads stay unindexed.
2246
3.24k
    if (config.deprecated_enable_flatten_nested) {
2247
26
        return ParseConfig::ParseTo::OnlySubcolumns;
2248
26
    }
2249
2250
    // Plain dynamic non-doc VARIANT can avoid eagerly creating thousands of parse-time subcolumns.
2251
    // The segment writer will pick the materialized/sparse split from this doc-value KV staging.
2252
    // Keep a BE switch so tests and rollouts can compare the old parse-time path with staging under
2253
    // the same writer and schema.
2254
3.21k
    switch (config::variant_storage_parse_mode) {
2255
3.30k
    case 0:
2256
3.30k
    case 2:
2257
3.30k
        return ParseConfig::ParseTo::OnlyDocValueColumn;
2258
2
    case 1:
2259
2
        return ParseConfig::ParseTo::OnlySubcolumns;
2260
0
    default:
2261
0
        CHECK(false) << "invalid variant_storage_parse_mode: "
2262
0
                     << config::variant_storage_parse_mode;
2263
0
        return ParseConfig::ParseTo::OnlyDocValueColumn;
2264
3.21k
    }
2265
3.21k
}
2266
2267
} // namespace
2268
2269
// Parses all VARIANT columns of `block`, deriving the per-column parse options from
// `tablet_schema`.
//
// @param block          block whose variant columns are parsed in place
// @param tablet_schema  schema used to detect variant columns and to build each ParseConfig
// @param column_pos     tablet-schema column index for each block column; entry `block_pos`
//                       describes the column stored at position `block_pos` of `block`
//                       (NOTE(review): inferred from the block_pos/schema_pos naming below —
//                       confirm against callers)
// @return Status::OK() when there is nothing to parse or parsing succeeded; otherwise the
//         error reported by the configs-based overload.
Status parse_and_materialize_variant_columns(Block& block, const TabletSchema& tablet_schema,
                                             const std::vector<uint32_t>& column_pos) {
    // Two parallel lists: where the variant column lives in the block, and which schema column
    // describes it. They differ whenever the block does not follow the schema's column order
    // (e.g. it carries only a subset of the columns).
    std::vector<uint32_t> variant_column_pos;
    std::vector<uint32_t> variant_schema_pos;
    variant_column_pos.reserve(column_pos.size());
    variant_schema_pos.reserve(column_pos.size());
    for (size_t block_pos = 0; block_pos < column_pos.size(); ++block_pos) {
        const uint32_t schema_pos = column_pos[block_pos];
        const auto& column = tablet_schema.column(schema_pos);
        if (column.is_variant_type()) {
            // The block must be addressed by its own position, not by the schema index.
            variant_column_pos.push_back(static_cast<uint32_t>(block_pos));
            variant_schema_pos.push_back(schema_pos);
        }
    }

    if (variant_column_pos.empty()) {
        return Status::OK();
    }

    std::vector<ParseConfig> configs(variant_column_pos.size());
    for (size_t i = 0; i < variant_column_pos.size(); ++i) {
        // Deprecated legacy flatten-nested switch. Distinct from variant_enable_nested_group.
        configs[i].deprecated_enable_flatten_nested =
                tablet_schema.deprecated_variant_flatten_nested();
        configs[i].check_duplicate_json_path = config::variant_enable_duplicate_json_path_check;
        const auto& column = tablet_schema.column(variant_schema_pos[i]);
        // Defensive: the list above only contains variant columns.
        if (!column.is_variant_type()) {
            return Status::InternalError("column is not variant type, column name: {}",
                                         column.name());
        }
        configs[i].parse_to = select_storage_variant_parse_target(column, configs[i]);
    }

    RETURN_IF_ERROR(parse_and_materialize_variant_columns(block, variant_column_pos, configs));
    return Status::OK();
}
2305
2306
} // namespace doris::variant_util