Coverage Report

Created: 2026-06-10 09:13

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
be/src/exec/common/variant_util.cpp
Line
Count
Source
1
// Licensed to the Apache Software Foundation (ASF) under one
2
// or more contributor license agreements.  See the NOTICE file
3
// distributed with this work for additional information
4
// regarding copyright ownership.  The ASF licenses this file
5
// to you under the Apache License, Version 2.0 (the
6
// "License"); you may not use this file except in compliance
7
// with the License.  You may obtain a copy of the License at
8
//
9
//   http://www.apache.org/licenses/LICENSE-2.0
10
//
11
// Unless required by applicable law or agreed to in writing,
12
// software distributed under the License is distributed on an
13
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14
// KIND, either express or implied.  See the License for the
15
// specific language governing permissions and limitations
16
// under the License.
17
18
#include "exec/common/variant_util.h"
19
20
#include <assert.h>
21
#include <fmt/format.h>
22
#include <gen_cpp/FrontendService.h>
23
#include <gen_cpp/FrontendService_types.h>
24
#include <gen_cpp/HeartbeatService_types.h>
25
#include <gen_cpp/MasterService_types.h>
26
#include <gen_cpp/Status_types.h>
27
#include <gen_cpp/Types_types.h>
28
#include <glog/logging.h>
29
#include <rapidjson/document.h>
30
#include <rapidjson/stringbuffer.h>
31
#include <rapidjson/writer.h>
32
#include <simdjson/simdjson.h> // IWYU pragma: keep
33
#include <unicode/uchar.h>
34
35
#include <algorithm>
36
#include <cassert>
37
#include <cstddef>
38
#include <cstdint>
39
#include <cstring>
40
#include <list>
41
#include <memory>
42
#include <mutex>
43
#include <optional>
44
#include <ostream>
45
#include <ranges>
46
#include <set>
47
#include <stack>
48
#include <string>
49
#include <string_view>
50
#include <unordered_map>
51
#include <utility>
52
#include <vector>
53
54
#include "common/config.h"
55
#include "common/status.h"
56
#include "core/assert_cast.h"
57
#include "core/block/block.h"
58
#include "core/block/column_numbers.h"
59
#include "core/block/column_with_type_and_name.h"
60
#include "core/column/column.h"
61
#include "core/column/column_array.h"
62
#include "core/column/column_map.h"
63
#include "core/column/column_nullable.h"
64
#include "core/column/column_string.h"
65
#include "core/column/column_variant.h"
66
#include "core/data_type/data_type.h"
67
#include "core/data_type/data_type_array.h"
68
#include "core/data_type/data_type_factory.hpp"
69
#include "core/data_type/data_type_jsonb.h"
70
#include "core/data_type/data_type_nullable.h"
71
#include "core/data_type/data_type_string.h"
72
#include "core/data_type/data_type_variant.h"
73
#include "core/data_type/define_primitive_type.h"
74
#include "core/data_type/get_least_supertype.h"
75
#include "core/data_type/primitive_type.h"
76
#include "core/field.h"
77
#include "core/typeid_cast.h"
78
#include "core/types.h"
79
#include "exec/common/field_visitors.h"
80
#include "exec/common/sip_hash.h"
81
#include "exprs/function/function.h"
82
#include "exprs/function/simple_function_factory.h"
83
#include "exprs/function_context.h"
84
#include "exprs/json_functions.h"
85
#include "re2/re2.h"
86
#include "runtime/exec_env.h"
87
#include "runtime/runtime_state.h"
88
#include "storage/olap_common.h"
89
#include "storage/rowset/beta_rowset.h"
90
#include "storage/rowset/rowset.h"
91
#include "storage/rowset/rowset_fwd.h"
92
#include "storage/segment/segment_loader.h"
93
#include "storage/segment/variant/nested_group_path.h"
94
#include "storage/segment/variant/variant_column_reader.h"
95
#include "storage/segment/variant/variant_column_writer_impl.h"
96
#include "storage/tablet/tablet.h"
97
#include "storage/tablet/tablet_fwd.h"
98
#include "storage/tablet/tablet_schema.h"
99
#include "util/client_cache.h"
100
#include "util/defer_op.h"
101
#include "util/json/json_parser.h"
102
#include "util/json/path_in_data.h"
103
#include "util/json/simd_json_parser.h"
104
#include "util/jsonb_utils.h"
105
106
namespace doris::variant_util {
107
108
2.83k
// Appends `ch` to `*regex_output` so that it matches literally in a regex.
// Regex metacharacters (. ^ $ + * ? ( ) | { } [ ] \) are prefixed with a
// backslash; every other character is appended unchanged.
inline void append_escaped_regex_char(std::string* regex_output, char ch) {
    constexpr std::string_view regex_meta_chars = ".^$+*?()|{}[]\\";
    const bool needs_escape = regex_meta_chars.find(ch) != std::string_view::npos;
    if (needs_escape) {
        regex_output->push_back('\\');
    }
    regex_output->push_back(ch);
}
132
133
// Small LRU to cap compiled glob patterns
134
constexpr size_t kGlobRegexCacheCapacity = 256;
135
136
struct GlobRegexCacheEntry {
137
    std::shared_ptr<RE2> re2;
138
    std::list<std::string>::iterator lru_it;
139
};
140
141
static std::mutex g_glob_regex_cache_mutex;
142
static std::list<std::string> g_glob_regex_cache_lru;
143
static std::unordered_map<std::string, GlobRegexCacheEntry> g_glob_regex_cache;
144
145
206k
std::shared_ptr<RE2> get_or_build_re2(const std::string& glob_pattern) {
146
206k
    {
147
206k
        std::lock_guard<std::mutex> lock(g_glob_regex_cache_mutex);
148
206k
        auto it = g_glob_regex_cache.find(glob_pattern);
149
206k
        if (it != g_glob_regex_cache.end()) {
150
206k
            g_glob_regex_cache_lru.splice(g_glob_regex_cache_lru.begin(), g_glob_regex_cache_lru,
151
206k
                                          it->second.lru_it);
152
206k
            return it->second.re2;
153
206k
        }
154
206k
    }
155
210
    std::string regex_pattern;
156
210
    Status st = glob_to_regex(glob_pattern, &regex_pattern);
157
210
    if (!st.ok()) {
158
2
        return nullptr;
159
2
    }
160
208
    auto compiled = std::make_shared<RE2>(regex_pattern);
161
208
    if (!compiled->ok()) {
162
3
        return nullptr;
163
3
    }
164
205
    {
165
205
        std::lock_guard<std::mutex> lock(g_glob_regex_cache_mutex);
166
205
        auto it = g_glob_regex_cache.find(glob_pattern);
167
205
        if (it != g_glob_regex_cache.end()) {
168
4
            g_glob_regex_cache_lru.splice(g_glob_regex_cache_lru.begin(), g_glob_regex_cache_lru,
169
4
                                          it->second.lru_it);
170
4
            return it->second.re2;
171
4
        }
172
201
        g_glob_regex_cache_lru.push_front(glob_pattern);
173
201
        g_glob_regex_cache.emplace(glob_pattern,
174
201
                                   GlobRegexCacheEntry {compiled, g_glob_regex_cache_lru.begin()});
175
201
        if (g_glob_regex_cache.size() > kGlobRegexCacheCapacity) {
176
0
            const std::string& evict_key = g_glob_regex_cache_lru.back();
177
0
            g_glob_regex_cache.erase(evict_key);
178
0
            g_glob_regex_cache_lru.pop_back();
179
0
        }
180
201
    }
181
0
    return compiled;
182
205
}
183
184
// Convert a restricted glob pattern into a regex.
185
// Supported: '*', '?', '[...]', '\\' escape. Others are treated as literals.
186
312
Status glob_to_regex(const std::string& glob_pattern, std::string* regex_pattern) {
187
312
    regex_pattern->clear();
188
312
    regex_pattern->append("^");
189
312
    bool is_escaped = false;
190
312
    size_t pattern_length = glob_pattern.size();
191
3.27k
    for (size_t index = 0; index < pattern_length; ++index) {
192
2.96k
        char current_char = glob_pattern[index];
193
2.96k
        if (is_escaped) {
194
10
            append_escaped_regex_char(regex_pattern, current_char);
195
10
            is_escaped = false;
196
10
            continue;
197
10
        }
198
2.95k
        if (current_char == '\\') {
199
14
            is_escaped = true;
200
14
            continue;
201
14
        }
202
2.94k
        if (current_char == '*') {
203
69
            regex_pattern->append(".*");
204
69
            continue;
205
69
        }
206
2.87k
        if (current_char == '?') {
207
15
            regex_pattern->append(".");
208
15
            continue;
209
15
        }
210
2.85k
        if (current_char == '[') {
211
33
            size_t class_index = index + 1;
212
33
            bool class_closed = false;
213
33
            bool is_class_escaped = false;
214
33
            std::string class_buffer;
215
33
            if (class_index < pattern_length &&
216
33
                (glob_pattern[class_index] == '!' || glob_pattern[class_index] == '^')) {
217
9
                class_buffer.push_back('^');
218
9
                ++class_index;
219
9
            }
220
99
            for (; class_index < pattern_length; ++class_index) {
221
95
                char class_char = glob_pattern[class_index];
222
95
                if (is_class_escaped) {
223
10
                    class_buffer.push_back(class_char);
224
10
                    is_class_escaped = false;
225
10
                    continue;
226
10
                }
227
85
                if (class_char == '\\') {
228
10
                    is_class_escaped = true;
229
10
                    continue;
230
10
                }
231
75
                if (class_char == ']') {
232
29
                    class_closed = true;
233
29
                    break;
234
29
                }
235
46
                class_buffer.push_back(class_char);
236
46
            }
237
33
            if (!class_closed) {
238
4
                return Status::InvalidArgument("Unclosed character class in glob pattern: {}",
239
4
                                               glob_pattern);
240
4
            }
241
29
            regex_pattern->append("[");
242
29
            regex_pattern->append(class_buffer);
243
29
            regex_pattern->append("]");
244
29
            index = class_index;
245
29
            continue;
246
33
        }
247
2.82k
        append_escaped_regex_char(regex_pattern, current_char);
248
2.82k
    }
249
308
    if (is_escaped) {
250
4
        append_escaped_regex_char(regex_pattern, '\\');
251
4
    }
252
308
    regex_pattern->append("$");
253
308
    return Status::OK();
254
312
}
255
256
206k
bool glob_match_re2(const std::string& glob_pattern, const std::string& candidate_path) {
257
206k
    auto compiled = get_or_build_re2(glob_pattern);
258
206k
    if (compiled == nullptr) {
259
5
        return false;
260
5
    }
261
206k
    return RE2::FullMatch(candidate_path, *compiled);
262
206k
}
263
264
// NestedGroup's physical children and offsets are produced by NestedGroupWriteProvider, not by
265
// appending TabletSchema extracted columns here. This predicate keeps only ordinary Variant paths
266
// that are outside the NG tree, for example `v.owner` beside `v.items[*]`.
267
0
bool is_regular_path_outside_nested_group(const PathInData& path) {
268
0
    const std::string& relative_path = path.get_path();
269
0
    return !relative_path.empty() && !path.get_is_typed() && !path.has_nested_part() &&
270
0
           !segment_v2::contains_nested_group_marker(relative_path) &&
271
0
           !segment_v2::is_root_nested_group_path(relative_path) &&
272
0
           relative_path != SPARSE_COLUMN_PATH &&
273
0
           relative_path.find(DOC_VALUE_COLUMN_PATH) == std::string::npos;
274
0
}
275
276
bool should_materialize_nested_group_regular_subcolumns(
277
        const TabletColumnPtr& column,
278
662
        const std::unordered_map<int32_t, VariantExtendedInfo>& uid_to_variant_extended_info) {
279
662
    const auto info_it = uid_to_variant_extended_info.find(column->unique_id());
280
662
    return column->variant_enable_nested_group() ||
281
666
           (info_it != uid_to_variant_extended_info.end() && info_it->second.has_nested_group);
282
662
}
283
284
std::unordered_set<int32_t> collect_nested_group_compaction_root_uids(
285
        const TabletSchemaSPtr& target,
286
10.6k
        const std::unordered_map<int32_t, VariantExtendedInfo>& uid_to_variant_extended_info) {
287
10.6k
    std::unordered_set<int32_t> root_uids;
288
106k
    for (const TabletColumnPtr& column : target->columns()) {
289
106k
        if (column->is_variant_type() && should_materialize_nested_group_regular_subcolumns(
290
666
                                                 column, uid_to_variant_extended_info)) {
291
1
            root_uids.insert(column->unique_id());
292
1
        }
293
106k
    }
294
10.6k
    return root_uids;
295
10.6k
}
296
297
// Copies from `extended_info.path_to_data_types` only the entries whose path satisfies
// is_regular_path_outside_nested_group().
PathToDataTypes collect_regular_types_outside_nested_group(
        const VariantExtendedInfo& extended_info) {
    PathToDataTypes regular_path_to_data_types;
    for (const auto& entry : extended_info.path_to_data_types) {
        if (is_regular_path_outside_nested_group(entry.first)) {
            regular_path_to_data_types.emplace(entry.first, entry.second);
        }
    }
    return regular_path_to_data_types;
}
308
309
960
size_t get_number_of_dimensions(const IDataType& type) {
310
960
    if (const auto* type_array = typeid_cast<const DataTypeArray*>(&type)) {
311
4
        return type_array->get_number_of_dimensions();
312
4
    }
313
956
    return 0;
314
960
}
315
3
size_t get_number_of_dimensions(const IColumn& column) {
316
3
    if (const auto* column_array = check_and_get_column<ColumnArray>(column)) {
317
2
        return column_array->get_number_of_dimensions();
318
2
    }
319
1
    return 0;
320
3
}
321
322
71.8k
// Walks through (possibly nullable) nested array types and returns the element type of the
// innermost array, exactly as that array stores it (a Nullable wrapper on the element is kept).
// When `type` (after removing an outer Nullable) is not an array, `type` itself is returned.
DataTypePtr get_base_type_of_array(const DataTypePtr& type) {
    /// Work on raw pointers to avoid extra copying of type pointers.
    auto strip_nullable = [](const IDataType* candidate) -> const IDataType* {
        if (const auto* nullable = typeid_cast<const DataTypeNullable*>(candidate)) {
            return nullable->get_nested_type().get();
        }
        return candidate;
    };

    const DataTypeArray* innermost_array = nullptr;
    const IDataType* cursor = strip_nullable(type.get());
    while (const auto* array_type = typeid_cast<const DataTypeArray*>(cursor)) {
        innermost_array = array_type;
        cursor = strip_nullable(array_type->get_nested_type().get());
    }
    if (innermost_array == nullptr) {
        return type;
    }
    return innermost_array->get_nested_type();
}
338
339
914k
// Casts `arg` to `type` and stores the resulting full (non-const) column in `*result`.
//  - variant -> variant: only the nullability wrapper is adjusted to match `type`.
//  - other   -> variant: the source column becomes the root of a new ColumnVariant; the
//    source column must be nullable (enforced with CHECK).
//  - otherwise the "CAST" function is executed. A source of INVALID_TYPE, or a failed
//    cast execution, yields a column of default values and still returns OK.
// Returns InternalError only when no CAST function can be found.
Status cast_column(const ColumnWithTypeAndName& arg, const DataTypePtr& type, ColumnPtr* result) {
    ColumnsWithTypeAndName arguments {arg, {nullptr, type, type->get_name()}};

    // To prevent from null info lost, we should not call function since the function framework will wrap
    // nullable to Variant instead of the root of Variant
    // correct output: Nullable(Array(int)) -> Nullable(Variant(Nullable(Array(int))))
    // incorrect output: Nullable(Array(int)) -> Nullable(Variant(Array(int)))
    if (type->get_primitive_type() == TYPE_VARIANT) {
        // Source is already a variant: only the nullable wrapper may differ from the target.
        if (arg.type->get_primitive_type() == TYPE_VARIANT) {
            *result = type->is_nullable() ? make_nullable(arg.column) : remove_nullable(arg.column);
            return Status::OK();
        }
        // Use the source column/type as the root column/type of the new variant.
        CHECK(arg.column->is_nullable());
        auto target_type = remove_nullable(type);
        const auto& variant_type = assert_cast<const DataTypeVariant&>(*target_type);
        auto variant_column = ColumnVariant::create(variant_type.variant_max_subcolumns_count(),
                                                    variant_type.enable_doc_mode());

        variant_column->create_root(arg.type, std::move(*arg.column).mutate());
        ColumnPtr nullable_variant = ColumnNullable::create(
                variant_column->get_ptr(),
                assert_cast<const ColumnNullable*>(arg.column.get())->get_null_map_column_ptr());
        *result = type->is_nullable() ? nullable_variant : variant_column->get_ptr();
        return Status::OK();
    }

    auto cast_function = SimpleFunctionFactory::instance().get_function("CAST", arguments, type);
    if (!cast_function) {
        return Status::InternalError("Not found cast function {} to {}", arg.type->get_name(),
                                     type->get_name());
    }
    Block tmp_block {arguments};
    uint32_t result_column = cast_set<uint32_t>(tmp_block.columns());
    RuntimeState state;
    auto ctx = FunctionContext::create_context(&state, {}, {});

    // Fallback result: as many rows as the source, all holding the target type's default.
    auto make_default_column = [&]() {
        return type->create_column_const_with_default_value(arg.column->size())
                ->convert_to_full_column_if_const();
    };

    if (arg.type->get_primitive_type() == INVALID_TYPE) {
        // cast from nothing to any type should result in nulls
        *result = make_default_column();
        return Status::OK();
    }

    // We convert column string to jsonb type just add a string jsonb field to dst column instead of parse
    // each line in original string column.
    ctx->set_string_as_jsonb_string(true);
    ctx->set_jsonb_string_as_string(true);
    tmp_block.insert({nullptr, type, arg.name});
    // TODO(lihangyu): we should handle this error in strict mode
    if (!cast_function->execute(ctx.get(), tmp_block, {0}, result_column, arg.column->size())) {
        LOG_EVERY_N(WARNING, 100) << fmt::format("cast from {} to {}", arg.type->get_name(),
                                                 type->get_name());
        *result = make_default_column();
        return Status::OK();
    }
    *result = tmp_block.get_by_position(result_column).column->convert_to_full_column_if_const();
    VLOG_DEBUG << fmt::format("{} before convert {}, after convert {}", arg.name,
                              arg.column->get_name(), (*result)->get_name());
    return Status::OK();
}
402
403
32
// Builds a string column holding, row by row, the JSON text of the JSONB values in `root`.
// `root` may be const and/or nullable; its data column is accessed as a ColumnString.
// Rows that are null, or whose JSONB payload is empty, become default (empty) strings.
ColumnPtr jsonb_root_to_json_string_column(const IColumn& root) {
    auto full_root = root.convert_to_full_column_if_const();

    // Unwrap Nullable, remembering the null map so null rows can be skipped.
    const IColumn* payload_column = full_root.get();
    const NullMap* null_map = nullptr;
    if (full_root->is_nullable()) {
        const auto& nullable_root = assert_cast<const ColumnNullable&>(*full_root);
        payload_column = &nullable_root.get_nested_column();
        null_map = &nullable_root.get_null_map_data();
    }

    const auto& jsonb_strings = assert_cast<const ColumnString&>(*payload_column);
    auto json_text = ColumnString::create();
    json_text->reserve(jsonb_strings.size());
    for (size_t row = 0; row < jsonb_strings.size(); ++row) {
        const bool row_is_null = null_map != nullptr && (*null_map)[row];
        if (row_is_null) {
            json_text->insert_default();
            continue;
        }

        const auto raw_jsonb = jsonb_strings.get_data_at(row);
        if (raw_jsonb.size != 0) {
            const auto rendered = JsonbToJson::jsonb_to_json_string(raw_jsonb.data, raw_jsonb.size);
            json_text->insert_data(rendered.data(), rendered.size());
        } else {
            json_text->insert_default();
        }
    }
    return json_text->get_ptr();
}
433
434
void get_column_by_type(const DataTypePtr& data_type, const std::string& name, TabletColumn& column,
435
144k
                        const ExtraInfo& ext_info) {
436
144k
    column.set_name(name);
437
144k
    column.set_type(data_type->get_storage_field_type());
438
144k
    if (ext_info.unique_id >= 0) {
439
4
        column.set_unique_id(ext_info.unique_id);
440
4
    }
441
144k
    if (ext_info.parent_unique_id >= 0) {
442
71.0k
        column.set_parent_unique_id(ext_info.parent_unique_id);
443
71.0k
    }
444
144k
    if (!ext_info.path_info.empty()) {
445
71.0k
        column.set_path_info(ext_info.path_info);
446
71.0k
    }
447
144k
    if (data_type->is_nullable()) {
448
72.2k
        const auto& real_type = static_cast<const DataTypeNullable&>(*data_type);
449
72.2k
        column.set_is_nullable(true);
450
72.2k
        get_column_by_type(real_type.get_nested_type(), name, column, {});
451
72.2k
        return;
452
72.2k
    }
453
72.2k
    if (data_type->get_primitive_type() == PrimitiveType::TYPE_ARRAY) {
454
1.19k
        TabletColumn child;
455
1.19k
        get_column_by_type(assert_cast<const DataTypeArray*>(data_type.get())->get_nested_type(),
456
1.19k
                           "", child, {});
457
1.19k
        column.set_length(TabletColumn::get_field_length_by_type(TPrimitiveType::ARRAY, 0));
458
1.19k
        column.add_sub_column(child);
459
1.19k
        return;
460
1.19k
    }
461
71.0k
    if (data_type->get_primitive_type() == PrimitiveType::TYPE_VARIANT) {
462
0
        const auto* dt_variant = assert_cast<const DataTypeVariant*>(data_type.get());
463
0
        column.set_variant_max_subcolumns_count(dt_variant->variant_max_subcolumns_count());
464
0
        column.set_variant_enable_doc_mode(dt_variant->enable_doc_mode());
465
0
        return;
466
0
    }
467
    // size is not fixed when type is string or json
468
71.0k
    if (is_string_type(data_type->get_primitive_type()) ||
469
71.0k
        data_type->get_primitive_type() == TYPE_JSONB) {
470
17.3k
        column.set_length(INT_MAX);
471
17.3k
        return;
472
17.3k
    }
473
474
53.6k
    PrimitiveType type = data_type->get_primitive_type();
475
53.6k
    if (is_int_or_bool(type) || is_string_type(type) || is_float_or_double(type) || is_ip(type) ||
476
53.6k
        is_date_or_datetime(type) || type == PrimitiveType::TYPE_DATEV2) {
477
53.6k
        column.set_length(cast_set<int32_t>(data_type->get_size_of_value_in_memory()));
478
53.6k
        return;
479
53.6k
    }
480
105
    if (is_decimal(type)) {
481
105
        column.set_precision(data_type->get_precision());
482
105
        column.set_frac(data_type->get_scale());
483
105
        return;
484
105
    }
485
    // datetimev2 needs scale
486
18.4E
    if (type == PrimitiveType::TYPE_DATETIMEV2 || type == PrimitiveType::TYPE_TIMESTAMPTZ) {
487
17
        column.set_precision(-1);
488
17
        column.set_frac(data_type->get_scale());
489
17
        return;
490
17
    }
491
492
18.4E
    throw doris::Exception(doris::ErrorCode::INTERNAL_ERROR,
493
18.4E
                           "unexcepted data column type: {}, column name is: {}",
494
18.4E
                           data_type->get_name(), name);
495
18.4E
}
496
497
// Convenience overload: builds a fresh TabletColumn via the in-place overload and returns it.
TabletColumn get_column_by_type(const DataTypePtr& data_type, const std::string& name,
                                const ExtraInfo& ext_info) {
    TabletColumn described_column;
    get_column_by_type(data_type, name, described_column, ext_info);
    return described_column;
}
503
504
// check if two paths which same prefix have different structure
505
static bool has_different_structure_in_same_path(const PathInData::Parts& lhs,
506
9.04k
                                                 const PathInData::Parts& rhs) {
507
9.04k
    if (lhs.size() != rhs.size()) {
508
1
        return false; // different size means different structure
509
1
    }
510
    // Since we group by path string, lhs and rhs must have the same size and keys
511
    // We only need to check if they have different nested structure
512
36.1k
    for (size_t i = 0; i < lhs.size(); ++i) {
513
27.0k
        if (lhs[i] != rhs[i]) {
514
5
            VLOG_DEBUG << fmt::format(
515
0
                    "Check different structure: {} vs {}, lhs[i].is_nested: {}, rhs[i].is_nested: "
516
0
                    "{}",
517
0
                    lhs[i].key, rhs[i].key, lhs[i].is_nested, rhs[i].is_nested);
518
5
            return true;
519
5
        }
520
27.0k
    }
521
9.03k
    return false;
522
9.04k
}
523
524
4.81k
// Verifies that paths sharing the same path string also share the same part structure.
// Returns DataQualityError for the first conflicting pair found, OK otherwise.
Status check_variant_has_no_ambiguous_paths(const PathsInData& tuple_paths) {
    // Bucket path indices by path string so only same-named paths are compared.
    std::unordered_map<std::string, std::vector<size_t>> indices_by_path;
    for (size_t idx = 0; idx < tuple_paths.size(); ++idx) {
        const auto& path = tuple_paths[idx];
        indices_by_path[path.get_path()].push_back(idx);
        VLOG_DEBUG << "tuple_paths[i]: " << path.get_path();
    }

    for (const auto& [path_str, members] : indices_by_path) {
        if (members.size() <= 1) {
            continue; // a single path cannot conflict with anything
        }

        // Compare every member against all members that precede it in the bucket.
        for (size_t cur = 1; cur < members.size(); ++cur) {
            const auto& current = tuple_paths[members[cur]];
            for (size_t prev = 0; prev < cur; ++prev) {
                const auto& earlier = tuple_paths[members[prev]];
                if (!has_different_structure_in_same_path(current.get_parts(),
                                                          earlier.get_parts())) {
                    continue;
                }
                return Status::DataQualityError(
                        "Ambiguous paths: {} vs {} with different nested part {} vs {}",
                        current.get_path(), earlier.get_path(), current.has_nested_part(),
                        earlier.has_nested_part());
            }
        }
    }
    return Status::OK();
}
557
558
// Computes one common column per subcolumn path and appends it to `common_schema`.
//  - `subcolumns_types`: every type observed for each path; each list must be non-empty.
//  - `common_schema`: must be solely owned by the caller (use_count() == 1, enforced by CHECK).
//  - `variant_col_unique_id`: set as the parent unique id of every appended column.
//  - `typed_columns`: predefined typed sub-columns keyed by the path without its root part;
//    a match is copied instead of deriving a type, unless the path has a nested part.
//  - `path_set`: optional; receives every path that was appended.
// For a path whose observed types disagree on the number of array dimensions the common
// type is Nullable(JSONB); otherwise it is the least supertype, made nullable.
Status update_least_schema_internal(const std::map<PathInData, DataTypes>& subcolumns_types,
                                    TabletSchemaSPtr& common_schema, int32_t variant_col_unique_id,
                                    const std::map<std::string, TabletColumnPtr>& typed_columns,
                                    std::set<PathInData>* path_set) {
    PathsInData tuple_paths;
    DataTypes tuple_types;
    CHECK(common_schema.use_count() == 1);

    // Get the least common type for all paths.
    for (const auto& [key, subtypes] : subcolumns_types) {
        assert(!subtypes.empty());
        if (key.get_path() == ColumnVariant::COLUMN_NAME_DUMMY) {
            continue;
        }
        const size_t first_dim = get_number_of_dimensions(*subtypes[0]);
        tuple_paths.emplace_back(key);

        bool dimensions_conflict = false;
        for (size_t i = 1; i < subtypes.size(); ++i) {
            if (first_dim != get_number_of_dimensions(*subtypes[i])) {
                tuple_types.emplace_back(make_nullable(std::make_shared<DataTypeJsonb>()));
                LOG(INFO) << fmt::format(
                        "Uncompatible types of subcolumn '{}': {} and {}, cast to JSONB",
                        key.get_path(), subtypes[0]->get_name(), subtypes[i]->get_name());
                dimensions_conflict = true;
                break;
            }
        }
        if (dimensions_conflict) {
            // The JSONB fallback type has already been recorded for this path.
            continue;
        }

        DataTypePtr common_type;
        get_least_supertype_jsonb(subtypes, &common_type);
        if (!common_type->is_nullable()) {
            common_type = make_nullable(common_type);
        }
        tuple_types.emplace_back(common_type);
    }
    CHECK_EQ(tuple_paths.size(), tuple_types.size());

    // Append all common type columns of this variant
    for (size_t i = 0; i < tuple_paths.size(); ++i) {
        const auto& path = tuple_paths[i];
        TabletColumn common_column;
        // typed path not contains root part
        const auto path_without_root = path.copy_pop_front().get_path();
        const auto typed_it = typed_columns.find(path_without_root);
        if (typed_it != typed_columns.end() && !path.has_nested_part()) {
            common_column = *typed_it->second;
            // parent unique id and path may not be init in write path
            common_column.set_parent_unique_id(variant_col_unique_id);
            common_column.set_path_info(path);
            common_column.set_name(path.get_path());
        } else {
            get_column_by_type(tuple_types[i], path.get_path(), common_column,
                               ExtraInfo {.unique_id = -1,
                                          .parent_unique_id = variant_col_unique_id,
                                          .path_info = path});
        }
        common_schema->append_column(common_column);
        if (path_set != nullptr) {
            path_set->insert(path);
        }
    }
    return Status::OK();
}
619
620
// Gathers, across all `schemas`, the types of every subcolumn that belongs to the variant
// column `variant_col_unique_id`, rejects ambiguous paths, and then appends one common
// column per path to `common_schema` via update_least_schema_internal().
// `path_set` is forwarded unchanged to update_least_schema_internal().
Status update_least_common_schema(const std::vector<TabletSchemaSPtr>& schemas,
                                  TabletSchemaSPtr& common_schema, int32_t variant_col_unique_id,
                                  std::set<PathInData>* path_set) {
    // Sub-columns already declared on the variant column, indexed by name.
    std::map<std::string, TabletColumnPtr> typed_columns;
    const auto& declared_sub_columns =
            common_schema->column_by_uid(variant_col_unique_id).get_sub_columns();
    for (const TabletColumnPtr& sub_column : declared_sub_columns) {
        typed_columns.insert_or_assign(sub_column->name(), sub_column);
    }

    // Types of subcolumns by path from all tuples.
    std::map<PathInData, DataTypes> subcolumns_types;
    // Every collected path (duplicates included) for the batch ambiguity check.
    std::vector<PathInData> all_paths;

    for (const TabletSchemaSPtr& schema : schemas) {
        for (const TabletColumnPtr& col : schema->columns()) {
            // Only subcolumns of this variant are of interest.
            const bool belongs_to_variant = col->has_path_info() && col->parent_unique_id() >= 0 &&
                                            col->parent_unique_id() == variant_col_unique_id;
            if (!belongs_to_variant) {
                continue;
            }
            subcolumns_types[*col->path_info_ptr()].emplace_back(
                    DataTypeFactory::instance().create_data_type(*col, col->is_nullable()));
            all_paths.push_back(*col->path_info_ptr());
        }
    }

    // Batch check for conflicts
    RETURN_IF_ERROR(check_variant_has_no_ambiguous_paths(all_paths));

    return update_least_schema_internal(subcolumns_types, common_schema, variant_col_unique_id,
                                        typed_columns, path_set);
}
652
653
// Keep variant subcolumn BF support aligned with FE DDL checks.
654
80.1k
bool is_bf_supported_by_fe_for_variant_subcolumn(FieldType type) {
655
80.1k
    switch (type) {
656
91
    case FieldType::OLAP_FIELD_TYPE_SMALLINT:
657
460
    case FieldType::OLAP_FIELD_TYPE_INT:
658
50.5k
    case FieldType::OLAP_FIELD_TYPE_BIGINT:
659
50.7k
    case FieldType::OLAP_FIELD_TYPE_LARGEINT:
660
50.7k
    case FieldType::OLAP_FIELD_TYPE_CHAR:
661
50.7k
    case FieldType::OLAP_FIELD_TYPE_VARCHAR:
662
68.0k
    case FieldType::OLAP_FIELD_TYPE_STRING:
663
68.0k
    case FieldType::OLAP_FIELD_TYPE_DATE:
664
68.0k
    case FieldType::OLAP_FIELD_TYPE_DATETIME:
665
68.2k
    case FieldType::OLAP_FIELD_TYPE_DATEV2:
666
68.4k
    case FieldType::OLAP_FIELD_TYPE_DATETIMEV2:
667
68.4k
    case FieldType::OLAP_FIELD_TYPE_TIMESTAMPTZ:
668
68.4k
    case FieldType::OLAP_FIELD_TYPE_DECIMAL:
669
68.5k
    case FieldType::OLAP_FIELD_TYPE_DECIMAL32:
670
68.6k
    case FieldType::OLAP_FIELD_TYPE_DECIMAL64:
671
68.9k
    case FieldType::OLAP_FIELD_TYPE_DECIMAL128I:
672
69.0k
    case FieldType::OLAP_FIELD_TYPE_DECIMAL256:
673
69.2k
    case FieldType::OLAP_FIELD_TYPE_IPV4:
674
69.3k
    case FieldType::OLAP_FIELD_TYPE_IPV6:
675
69.3k
        return true;
676
10.8k
    default:
677
10.8k
        return false;
678
80.1k
    }
679
80.1k
}
680
681
void inherit_column_attributes(const TabletColumn& source, TabletColumn& target,
682
80.1k
                               TabletSchemaSPtr* target_schema) {
683
80.1k
    if (!target.is_extracted_column()) {
684
0
        return;
685
0
    }
686
80.1k
    target.set_aggregation_method(source.aggregation());
687
688
    // 1. bloom filter
689
80.1k
    if (is_bf_supported_by_fe_for_variant_subcolumn(target.type())) {
690
69.3k
        target.set_is_bf_column(source.is_bf_column());
691
69.3k
    }
692
693
80.1k
    if (!target_schema) {
694
73.3k
        return;
695
73.3k
    }
696
697
    // 2. inverted index
698
6.80k
    TabletIndexes indexes_to_add;
699
6.80k
    auto source_indexes = (*target_schema)->inverted_indexs(source.unique_id());
700
    // if target is variant type, we need to inherit all indexes
701
    // because this schema is a read schema from fe
702
6.80k
    if (target.is_variant_type()) {
703
5.94k
        for (auto& index : source_indexes) {
704
403
            auto index_info = std::make_shared<TabletIndex>(*index);
705
403
            index_info->set_escaped_escaped_index_suffix_path(target.path_info_ptr()->get_path());
706
403
            indexes_to_add.emplace_back(std::move(index_info));
707
403
        }
708
5.94k
    } else {
709
867
        inherit_index(source_indexes, indexes_to_add, target);
710
867
    }
711
6.80k
    auto target_indexes = (*target_schema)
712
6.80k
                                  ->inverted_indexs(target.parent_unique_id(),
713
6.80k
                                                    target.path_info_ptr()->get_path());
714
6.87k
    if (target_indexes.empty()) {
715
6.87k
        for (auto& index_info : indexes_to_add) {
716
410
            (*target_schema)->append_index(std::move(*index_info));
717
410
        }
718
6.87k
    }
719
720
    // 3. TODO: gnragm bf index
721
6.80k
}
722
723
8.31k
// Walks every column of `schema` and lets each extracted column inherit attributes (and
// missing index meta) from its parent column. Extracted columns whose parent can no longer
// be found in the schema are left untouched.
void inherit_column_attributes(TabletSchemaSPtr& schema) {
    for (size_t col_idx = 0; col_idx < schema->num_columns(); ++col_idx) {
        TabletColumn& extracted = schema->mutable_column(col_idx);
        // Regular columns have nothing to inherit.
        const bool parent_present = extracted.is_extracted_column() &&
                                    schema->field_index(extracted.parent_unique_id()) != -1;
        if (!parent_present) {
            // either not an extracted column, or the parent column is missing (maybe dropped)
            continue;
        }
        const auto& parent = schema->column_by_uid(extracted.parent_unique_id());
        inherit_column_attributes(parent, extracted, &schema);
    }
}
737
738
// Builds `output_schema` as the least common schema of `schemas`.
//
// The base schema is `base_schema` when it is non-null, otherwise the element of `schemas`
// with the highest schema_version(). The output starts as a shadow copy of the base schema
// holding only its non-extracted columns; then, for every variant column of the base schema,
// the subcolumns found in `schemas` are merged in via update_least_common_schema(), and
// finally extracted columns inherit attributes from their parents.
//
// Returns:
//   - InternalError when `base_schema` is null and `schemas` is empty (there is nothing to
//     pick a base schema from);
//   - any error reported by update_least_common_schema();
//   - DataQualityError when `check_schema_size` is set and the merged schema has more columns
//     than config::variant_max_merged_tablet_schema_size.
Status get_least_common_schema(const std::vector<TabletSchemaSPtr>& schemas,
                               const TabletSchemaSPtr& base_schema, TabletSchemaSPtr& output_schema,
                               bool check_schema_size) {
    std::vector<int32_t> variant_column_unique_id;
    // Construct a schema excluding the extracted columns and gather unique identifiers for variants.
    // Ensure that the output schema also excludes these extracted columns. This approach prevents
    // duplicated paths following the update_least_common_schema process.
    auto build_schema_without_extracted_columns = [&](const TabletSchemaSPtr& source_schema) {
        // Build into a local first so that `source_schema` is still intact while it is read,
        // even if the caller passed the same pointer object as base and output schema.
        auto stripped = std::make_shared<TabletSchema>();
        // not copy columns but only shadow copy other attributes
        stripped->shawdow_copy_without_columns(*source_schema);
        // Get all columns without extracted columns and collect variant col unique id
        for (const TabletColumnPtr& col : source_schema->columns()) {
            if (col->is_variant_type()) {
                variant_column_unique_id.push_back(col->unique_id());
            }
            if (!col->is_extracted_column()) {
                stripped->append_column(*col);
            }
        }
        output_schema = std::move(stripped);
    };

    if (base_schema == nullptr) {
        // std::max_element returns the end iterator for an empty range; dereferencing it is
        // undefined behaviour, so report the problem instead.
        if (schemas.empty()) {
            return Status::InternalError(
                    "Can not build least common schema: no base schema and no input schemas");
        }
        // Pick tablet schema with max schema version. The comparator takes references so that
        // no shared_ptr is copied per comparison.
        const TabletSchemaSPtr max_version_schema =
                *std::max_element(schemas.cbegin(), schemas.cend(),
                                  [](const TabletSchemaSPtr& a, const TabletSchemaSPtr& b) {
                                      return a->schema_version() < b->schema_version();
                                  });
        CHECK(max_version_schema);
        build_schema_without_extracted_columns(max_version_schema);
    } else {
        // use input base_schema schema as base schema
        build_schema_without_extracted_columns(base_schema);
    }

    for (int32_t unique_id : variant_column_unique_id) {
        // Scratch set required by update_least_common_schema(); not used afterwards here.
        std::set<PathInData> path_set;
        RETURN_IF_ERROR(update_least_common_schema(schemas, output_schema, unique_id, &path_set));
    }

    inherit_column_attributes(output_schema);
    if (check_schema_size &&
        output_schema->columns().size() > config::variant_max_merged_tablet_schema_size) {
        return Status::DataQualityError("Reached max column size limit {}",
                                        config::variant_max_merged_tablet_schema_size);
    }

    return Status::OK();
}
787
788
// sort by paths in lexicographical order
789
2.05k
ColumnVariant::Subcolumns get_sorted_subcolumns(const ColumnVariant::Subcolumns& subcolumns) {
790
    // sort by paths in lexicographical order
791
2.05k
    ColumnVariant::Subcolumns sorted = subcolumns;
792
179k
    std::sort(sorted.begin(), sorted.end(), [](const auto& lhsItem, const auto& rhsItem) {
793
179k
        return lhsItem->path < rhsItem->path;
794
179k
    });
795
2.05k
    return sorted;
796
2.05k
}
797
798
bool has_schema_index_diff(const TabletSchema* new_schema, const TabletSchema* old_schema,
799
26.5k
                           int32_t new_col_idx, int32_t old_col_idx) {
800
26.5k
    const auto& column_new = new_schema->column(new_col_idx);
801
26.5k
    const auto& column_old = old_schema->column(old_col_idx);
802
803
26.5k
    if (column_new.is_bf_column() != column_old.is_bf_column()) {
804
94
        return true;
805
94
    }
806
807
26.4k
    auto new_schema_inverted_indexs = new_schema->inverted_indexs(column_new);
808
26.4k
    auto old_schema_inverted_indexs = old_schema->inverted_indexs(column_old);
809
810
26.4k
    if (new_schema_inverted_indexs.size() != old_schema_inverted_indexs.size()) {
811
705
        return true;
812
705
    }
813
814
26.1k
    for (size_t i = 0; i < new_schema_inverted_indexs.size(); ++i) {
815
391
        if (!new_schema_inverted_indexs[i]->is_same_except_id(old_schema_inverted_indexs[i])) {
816
19
            return true;
817
19
        }
818
391
    }
819
820
25.7k
    return false;
821
25.7k
}
822
823
2.67k
// Builds the sparse column description for `variant`: a MAP column named
// "<variant lower-case name>.<SPARSE_COLUMN_PATH>" with two STRING sub columns, inheriting
// the variant's aggregation method and pointing back to it through parent_unique_id.
TabletColumn create_sparse_column(const TabletColumn& variant) {
    // The column name and its path share the same text.
    const std::string sparse_name = variant.name_lower_case() + "." + SPARSE_COLUMN_PATH;

    TabletColumn sparse;
    sparse.set_name(sparse_name);
    sparse.set_type(FieldType::OLAP_FIELD_TYPE_MAP);
    sparse.set_aggregation_method(variant.aggregation());
    sparse.set_path_info(PathInData {sparse_name});
    sparse.set_parent_unique_id(variant.unique_id());
    // set default value to "NULL" DefaultColumnIterator will call insert_many_defaults
    sparse.set_default_value("NULL");

    // Two identical STRING children (presumably the map's key and value).
    TabletColumn string_child;
    string_child.set_type(FieldType::OLAP_FIELD_TYPE_STRING);
    for (int child = 0; child < 2; ++child) {
        sparse.add_sub_column(string_child);
    }
    return sparse;
}
838
839
17.8k
// Builds the description of one sparse-column shard for `variant`: a MAP column named
// "<variant lower-case name>.<SPARSE_COLUMN_PATH>.b<bucket_index>" with two STRING sub
// columns, inheriting the variant's aggregation method and pointing back to it through
// parent_unique_id. The default value is the literal "NULL".
TabletColumn create_sparse_shard_column(const TabletColumn& variant, int bucket_index) {
    std::string shard_name = variant.name_lower_case();
    shard_name += ".";
    shard_name += SPARSE_COLUMN_PATH;
    shard_name += ".b";
    shard_name += std::to_string(bucket_index);

    TabletColumn shard;
    shard.set_name(shard_name);
    shard.set_type(FieldType::OLAP_FIELD_TYPE_MAP);
    shard.set_aggregation_method(variant.aggregation());
    shard.set_parent_unique_id(variant.unique_id());
    shard.set_default_value("NULL");
    PathInData shard_path(shard_name);
    shard.set_path_info(shard_path);

    // Two identical STRING children (presumably the map's key and value).
    TabletColumn string_child;
    string_child.set_type(FieldType::OLAP_FIELD_TYPE_STRING);
    for (int child = 0; child < 2; ++child) {
        shard.add_sub_column(string_child);
    }
    return shard;
}
856
857
12.7k
// Builds the description of one doc-value bucket column for `variant`: a MAP column named
// "<variant lower-case name>.<DOC_VALUE_COLUMN_PATH>.b<bucket_index>" with two STRING sub
// columns, inheriting the variant's aggregation method and pointing back to it through
// parent_unique_id. The default value is the literal "NULL".
TabletColumn create_doc_value_column(const TabletColumn& variant, int bucket_index) {
    std::string bucket_name = variant.name_lower_case();
    bucket_name += ".";
    bucket_name += DOC_VALUE_COLUMN_PATH;
    bucket_name += ".b";
    bucket_name += std::to_string(bucket_index);

    TabletColumn doc_value;
    doc_value.set_name(bucket_name);
    doc_value.set_type(FieldType::OLAP_FIELD_TYPE_MAP);
    doc_value.set_aggregation_method(variant.aggregation());
    doc_value.set_parent_unique_id(variant.unique_id());
    doc_value.set_default_value("NULL");
    doc_value.set_path_info(PathInData {bucket_name});

    // Two identical STRING children (presumably the map's key and value).
    TabletColumn string_child;
    string_child.set_type(FieldType::OLAP_FIELD_TYPE_STRING);
    for (int child = 0; child < 2; ++child) {
        doc_value.add_sub_column(string_child);
    }
    return doc_value;
}
874
875
216k
uint32_t variant_binary_shard_of(const StringRef& path, uint32_t bucket_num) {
876
216k
    if (bucket_num <= 1) return 0;
877
204k
    SipHash hash;
878
204k
    hash.update(path.data, path.size);
879
204k
    uint64_t h = hash.get64();
880
204k
    return static_cast<uint32_t>(h % bucket_num);
881
216k
}
882
883
// Accumulates, for every variant column of rowset `rs`, the per-path non-null counts stored
// in its segments into `uid_to_path_stats` (column unique id -> path -> summed size).
//
// Both the sparse-column stats and the subcolumn stats of each segment are added. Columns
// that are not variant, have a negative unique id, or are rejected by
// should_check_variant_path_stats() are skipped, as are segments without a reader for the
// column. An entry for a column is only created once at least one path is added for it.
//
// Returns the first error raised while loading segments, obtaining a column reader or
// loading the reader's external meta.
Status VariantCompactionUtil::aggregate_path_to_stats(
        const RowsetSharedPtr& rs,
        std::unordered_map<int32_t, PathToNoneNullValues>* uid_to_path_stats) {
    SegmentCacheHandle segment_cache;
    RETURN_IF_ERROR(SegmentLoader::instance()->load_segments(
            std::static_pointer_cast<BetaRowset>(rs), &segment_cache));

    for (const auto& column : rs->tablet_schema()->columns()) {
        const bool eligible = column->is_variant_type() && column->unique_id() >= 0 &&
                              should_check_variant_path_stats(*column);
        if (!eligible) {
            continue;
        }

        // Adds every (path, size) pair of `sizes` to this column's aggregated stats.
        const auto accumulate = [&](const auto& sizes) {
            for (const auto& [path, size] : sizes) {
                (*uid_to_path_stats)[column->unique_id()][path] += size;
            }
        };

        for (const auto& segment : segment_cache.get_segments()) {
            std::shared_ptr<ColumnReader> column_reader;
            OlapReaderStatistics reader_stats;
            RETURN_IF_ERROR(segment->get_column_reader(column->unique_id(), &column_reader,
                                                       &reader_stats));
            if (column_reader == nullptr) {
                continue;
            }

            CHECK(column_reader->get_meta_type() == FieldType::OLAP_FIELD_TYPE_VARIANT);
            auto* variant_reader =
                    assert_cast<segment_v2::VariantColumnReader*>(column_reader.get());
            // load external meta before getting stats
            RETURN_IF_ERROR(variant_reader->load_external_meta_once());
            const auto* segment_stats = variant_reader->get_stats();
            CHECK(segment_stats);

            // agg path -> stats
            accumulate(segment_stats->sparse_column_non_null_size);
            accumulate(segment_stats->subcolumns_non_null_size);
        }
    }
    return Status::OK();
}
926
927
// Collects, for every variant column of rowset `rs`, the extended information stored in its
// segments into `uid_to_variant_extended_info` (keyed by column unique id):
//   1. per-path non-null counts and the set of sparse paths (skipped for columns with
//      nested groups enabled);
//   2. the data types observed per path;
//   3. the typed paths;
//   4. the nested paths (skipped for columns with nested groups enabled).
// An entry is created for every variant column, even when no segment contributes to it.
//
// Returns the first error raised while loading segments, obtaining a column reader or
// loading the reader's external meta.
Status VariantCompactionUtil::aggregate_variant_extended_info(
        const RowsetSharedPtr& rs,
        std::unordered_map<int32_t, VariantExtendedInfo>* uid_to_variant_extended_info) {
    SegmentCacheHandle segment_cache;
    RETURN_IF_ERROR(SegmentLoader::instance()->load_segments(
            std::static_pointer_cast<BetaRowset>(rs), &segment_cache));

    for (const auto& column : rs->tablet_schema()->columns()) {
        if (!column->is_variant_type()) {
            continue;
        }
        auto& info = (*uid_to_variant_extended_info)[column->unique_id()];
        const bool nested_group_enabled = column->variant_enable_nested_group();
        if (nested_group_enabled) {
            info.has_nested_group = true;
        }

        for (const auto& segment : segment_cache.get_segments()) {
            std::shared_ptr<ColumnReader> column_reader;
            OlapReaderStatistics reader_stats;
            RETURN_IF_ERROR(segment->get_column_reader(column->unique_id(), &column_reader,
                                                       &reader_stats));
            if (column_reader == nullptr) {
                continue;
            }

            CHECK(column_reader->get_meta_type() == FieldType::OLAP_FIELD_TYPE_VARIANT);
            auto* variant_reader =
                    assert_cast<segment_v2::VariantColumnReader*>(column_reader.get());
            // load external meta before getting stats
            RETURN_IF_ERROR(variant_reader->load_external_meta_once());
            const auto* segment_stats = variant_reader->get_stats();
            CHECK(segment_stats);

            // 1. agg path -> stats
            // NG roots still need type metadata for regular subpaths such as `v.owner`,
            // but their compaction schema should not be driven by flat path stats.
            if (!nested_group_enabled) {
                for (const auto& [sparse_path, count] :
                     segment_stats->sparse_column_non_null_size) {
                    info.path_to_none_null_values[sparse_path] += count;
                    info.sparse_paths.emplace(sparse_path);
                }
                for (const auto& [sub_path, count] : segment_stats->subcolumns_non_null_size) {
                    info.path_to_none_null_values[sub_path] += count;
                }
            }

            // 2. agg path -> schema
            variant_reader->get_subcolumns_types(&info.path_to_data_types);

            // 3. extract typed paths
            variant_reader->get_typed_paths(&info.typed_paths);

            // 4. extract nested paths
            if (!nested_group_enabled) {
                variant_reader->get_nested_paths(&info.nested_paths);
            }
        }
    }
    return Status::OK();
}
986
987
// get the subpaths and sparse paths for the variant column
988
void VariantCompactionUtil::get_subpaths(int32_t max_subcolumns_count,
989
                                         const PathToNoneNullValues& stats,
990
448
                                         TabletSchema::PathsSetInfo& paths_set_info) {
991
    // max_subcolumns_count is 0 means no limit
992
448
    if (max_subcolumns_count > 0 && stats.size() > max_subcolumns_count) {
993
146
        std::vector<std::pair<size_t, std::string_view>> paths_with_sizes;
994
146
        paths_with_sizes.reserve(stats.size());
995
4.44k
        for (const auto& [path, size] : stats) {
996
4.44k
            paths_with_sizes.emplace_back(size, path);
997
4.44k
        }
998
146
        std::sort(paths_with_sizes.begin(), paths_with_sizes.end(), std::greater());
999
1000
        // Select top N paths as subcolumns, remaining paths as sparse columns
1001
4.44k
        for (const auto& [size, path] : paths_with_sizes) {
1002
4.44k
            if (paths_set_info.sub_path_set.size() < max_subcolumns_count) {
1003
474
                paths_set_info.sub_path_set.emplace(path);
1004
3.97k
            } else {
1005
3.97k
                paths_set_info.sparse_path_set.emplace(path);
1006
3.97k
            }
1007
4.44k
        }
1008
146
        LOG(INFO) << "subpaths " << paths_set_info.sub_path_set.size() << " sparse paths "
1009
146
                  << paths_set_info.sparse_path_set.size() << " variant max subcolumns count "
1010
146
                  << max_subcolumns_count << " stats size " << paths_with_sizes.size();
1011
302
    } else {
1012
        // Apply all paths as subcolumns
1013
738
        for (const auto& [path, _] : stats) {
1014
738
            paths_set_info.sub_path_set.emplace(path);
1015
738
        }
1016
302
    }
1017
448
}
1018
1019
// Sanity check run after compaction: verifies that the per-path non-null statistics of the
// variant columns in `output` are consistent with those aggregated from the input rowsets.
//
// The check is skipped (Status::OK) when:
//   - the output schema has no variant column;
//   - any variant column of an input or of the output is rejected by
//     should_check_variant_path_stats();
//   - any input rowset schema contains an extracted column;
//   - the tablet is not DUP_KEYS (rows may be merged in other models);
//   - any input rowset carries a delete predicate.
//
// Returns InternalError when the output schema holds an unexpected extracted column (not
// compiled under BE_TEST), when a column or path present in the output stats is missing from
// the input stats, or when the per-path sizes are inconsistent.
Status VariantCompactionUtil::check_path_stats(const std::vector<RowsetSharedPtr>& intputs,
                                               RowsetSharedPtr output, BaseTabletSPtr tablet) {
    if (output->tablet_schema()->num_variant_columns() == 0) {
        return Status::OK();
    }
    // An input variant column that opts out of the check disables it entirely.
    for (const auto& input_rowset : intputs) {
        for (const auto& input_column : input_rowset->tablet_schema()->columns()) {
            if (!input_column->is_variant_type()) {
                continue;
            }
            if (!should_check_variant_path_stats(*input_column)) {
                return Status::OK();
            }
        }
    }
    // check no extended schema in input rowsets
    for (const auto& input_rowset : intputs) {
        for (const auto& input_column : input_rowset->tablet_schema()->columns()) {
            if (input_column->is_extracted_column()) {
                return Status::OK();
            }
        }
    }
#ifndef BE_TEST
    // check no extended schema in output rowset
    for (const auto& output_column : output->tablet_schema()->columns()) {
        if (!output_column->is_extracted_column()) {
            continue;
        }
        // Doc-value and sparse columns are the only extracted columns tolerated here.
        const auto& name = output_column->name();
        const bool is_tolerated =
                name.find("." + DOC_VALUE_COLUMN_PATH + ".") != std::string::npos ||
                name.find("." + SPARSE_COLUMN_PATH + ".") != std::string::npos ||
                name.ends_with("." + SPARSE_COLUMN_PATH);
        if (!is_tolerated) {
            return Status::InternalError("Unexpected extracted column {} in output rowset",
                                         output_column->name());
        }
    }
#endif
    // only check path stats for dup_keys since the rows may be merged in other models
    if (tablet->keys_type() != KeysType::DUP_KEYS) {
        return Status::OK();
    }
    // if there is a delete predicate in the input rowsets, we skip the path stats check
    for (auto& input_rowset : intputs) {
        if (input_rowset->rowset_meta()->has_delete_predicate()) {
            return Status::OK();
        }
    }
    for (const auto& output_column : output->tablet_schema()->columns()) {
        if (!output_column->is_variant_type()) {
            continue;
        }
        if (!should_check_variant_path_stats(*output_column)) {
            return Status::OK();
        }
    }

    // Aggregate stats on both sides: column unique id -> path -> non-null count.
    std::unordered_map<int32_t, PathToNoneNullValues> original_uid_to_path_stats;
    for (const auto& input_rowset : intputs) {
        RETURN_IF_ERROR(aggregate_path_to_stats(input_rowset, &original_uid_to_path_stats));
    }
    std::unordered_map<int32_t, PathToNoneNullValues> output_uid_to_path_stats;
    RETURN_IF_ERROR(aggregate_path_to_stats(output, &output_uid_to_path_stats));

    for (const auto& [uid, stats] : output_uid_to_path_stats) {
        const auto& output_column = output->tablet_schema()->column_by_uid(uid);
        // Variant columns in doc mode are not checked.
        if (output_column.is_variant_type() && output_column.variant_enable_doc_mode()) {
            continue;
        }

        const auto input_entry = original_uid_to_path_stats.find(uid);
        if (input_entry == original_uid_to_path_stats.end()) {
            return Status::InternalError("Path stats not found for uid {}, tablet_id {}", uid,
                                         tablet->tablet_id());
        }
        const auto& input_stats = input_entry->second;

        // In input rowsets, some rowsets may have statistics values exceeding the maximum limit,
        // which leads to inaccurate statistics
        const bool input_may_be_inaccurate =
                stats.size() > output_column.variant_max_sparse_column_statistics_size();
        if (input_may_be_inaccurate) {
            // When there is only one segment, we can ensure that the size of each path in output stats is accurate
            if (output->num_segments() != 1) {
                continue;
            }
            for (const auto& [path, size] : stats) {
                const auto input_path = input_stats.find(path);
                if (input_path == input_stats.end()) {
                    continue;
                }
                // The input count of a path must not exceed its output count.
                if (input_path->second > size) {
                    return Status::InternalError(
                            "Path stats not smaller for uid {} with path `{}`, input size {}, "
                            "output size {}, tablet_id {}",
                            uid, path, input_path->second, size, tablet->tablet_id());
                }
            }
            continue;
        }

        // in this case, input stats is accurate, so we check the stats size and stats value
        for (const auto& [path, size] : stats) {
            const auto input_path = input_stats.find(path);
            if (input_path == input_stats.end()) {
                return Status::InternalError(
                        "Path stats not found for uid {}, path {}, tablet_id {}", uid, path,
                        tablet->tablet_id());
            }
            if (input_path->second != size) {
                return Status::InternalError(
                        "Path stats not match for uid {} with path `{}`, input size {}, "
                        "output size {}, tablet_id {}",
                        uid, path, input_path->second, size, tablet->tablet_id());
            }
        }
    }

    return Status::OK();
}
1132
1133
// Appends one column per typed path of `parent_column` to `output_schema` and records the
// generated sub column info in `paths_set_info.typed_path_set`.
//
// Nothing is done when the parent column has variant_enable_typed_paths_to_sparse() set.
// Returns InternalError as soon as generate_sub_column_info() fails for a path; columns
// appended for earlier paths are kept.
Status VariantCompactionUtil::get_compaction_typed_columns(
        const TabletSchemaSPtr& target, const std::unordered_set<std::string>& typed_paths,
        const TabletColumnPtr parent_column, TabletSchemaSPtr& output_schema,
        TabletSchema::PathsSetInfo& paths_set_info) {
    if (parent_column->variant_enable_typed_paths_to_sparse()) {
        return Status::OK();
    }
    for (const auto& typed_path : typed_paths) {
        TabletSchema::SubColumnInfo typed_info;
        const bool generated = generate_sub_column_info(*target, parent_column->unique_id(),
                                                        typed_path, &typed_info);
        if (!generated) {
            return Status::InternalError("Failed to generate sub column info for path {}",
                                         typed_path);
        }
        inherit_column_attributes(*parent_column, typed_info.column);
        output_schema->append_column(typed_info.column);
        paths_set_info.typed_path_set.insert({typed_path, std::move(typed_info)});
        VLOG_DEBUG << "append typed column " << typed_path;
    }
    return Status::OK();
}
1153
1154
// Appends one column per nested path of `parent_column` to `output_schema`.
//
// For every path the column type is the least supertype (per get_least_supertype_jsonb()) of
// the data types recorded for it in `path_to_data_types`. The new column inherits the
// parent's attributes, and the indexes derived from the parent's inverted indexes are stored
// in `paths_set_info.subcolumn_indexes` under the path.
//
// Returns InternalError when a nested path has no recorded data type; columns appended for
// earlier paths are kept.
Status VariantCompactionUtil::get_compaction_nested_columns(
        const std::unordered_set<PathInData, PathInData::Hash>& nested_paths,
        const PathToDataTypes& path_to_data_types, const TabletColumnPtr parent_column,
        TabletSchemaSPtr& output_schema, TabletSchema::PathsSetInfo& paths_set_info) {
    const auto& parent_indexes = output_schema->inverted_indexs(parent_column->unique_id());
    for (const auto& nested_path : nested_paths) {
        const auto type_entry = path_to_data_types.find(nested_path);
        const bool has_types =
                type_entry != path_to_data_types.end() && !type_entry->second.empty();
        if (!has_types) {
            return Status::InternalError("Nested path {} has no data type",
                                         nested_path.get_path());
        }
        DataTypePtr common_type;
        get_least_supertype_jsonb(type_entry->second, &common_type);

        // Column name is "<parent>.<path>"; the path info is built part by part.
        const std::string column_name =
                parent_column->name_lower_case() + "." + nested_path.get_path();
        PathInDataBuilder path_builder;
        auto full_path = path_builder.append(parent_column->name_lower_case(), false)
                                 .append(nested_path.get_parts(), false)
                                 .build();
        TabletColumn nested_column =
                get_column_by_type(common_type, column_name,
                                   ExtraInfo {.unique_id = -1,
                                              .parent_unique_id = parent_column->unique_id(),
                                              .path_info = full_path});
        inherit_column_attributes(*parent_column, nested_column);

        TabletIndexes nested_indexes;
        inherit_index(parent_indexes, nested_indexes, nested_column);
        paths_set_info.subcolumn_indexes.emplace(nested_path.get_path(),
                                                 std::move(nested_indexes));
        output_schema->append_column(nested_column);
        VLOG_DEBUG << "append nested column " << nested_path.get_path();
    }
    return Status::OK();
}
1186
1187
void VariantCompactionUtil::get_compaction_subcolumns_from_subpaths(
1188
        TabletSchema::PathsSetInfo& paths_set_info, const TabletColumnPtr parent_column,
1189
        const TabletSchemaSPtr& target, const PathToDataTypes& path_to_data_types,
1190
442
        const std::unordered_set<std::string>& sparse_paths, TabletSchemaSPtr& output_schema) {
1191
442
    auto& path_set = paths_set_info.sub_path_set;
1192
442
    std::vector<StringRef> sorted_subpaths(path_set.begin(), path_set.end());
1193
442
    std::sort(sorted_subpaths.begin(), sorted_subpaths.end());
1194
442
    const auto& parent_indexes = target->inverted_indexs(parent_column->unique_id());
1195
    // append subcolumns
1196
1.15k
    for (const auto& subpath : sorted_subpaths) {
1197
1.15k
        auto column_name = parent_column->name_lower_case() + "." + subpath.to_string();
1198
1.15k
        auto column_path = PathInData(column_name);
1199
1200
1.15k
        const auto& find_data_types = path_to_data_types.find(PathInData(subpath));
1201
1202
        // some cases: the subcolumn type is variant
1203
        // 1. this path has no data type in segments
1204
        // 2. this path is in sparse paths
1205
        // 3. the sparse paths are too much
1206
1.15k
        TabletSchema::SubColumnInfo sub_column_info;
1207
1.15k
        if (parent_column->variant_enable_typed_paths_to_sparse() &&
1208
1.15k
            generate_sub_column_info(*target, parent_column->unique_id(), std::string(subpath),
1209
73
                                     &sub_column_info)) {
1210
63
            inherit_column_attributes(*parent_column, sub_column_info.column);
1211
63
            output_schema->append_column(sub_column_info.column);
1212
63
            paths_set_info.subcolumn_indexes.emplace(subpath, std::move(sub_column_info.indexes));
1213
63
            VLOG_DEBUG << "append typed column " << subpath;
1214
1.09k
        } else if (find_data_types == path_to_data_types.end() || find_data_types->second.empty() ||
1215
1.09k
                   sparse_paths.find(std::string(subpath)) != sparse_paths.end() ||
1216
1.09k
                   sparse_paths.size() >=
1217
1.05k
                           parent_column->variant_max_sparse_column_statistics_size()) {
1218
41
            TabletColumn subcolumn;
1219
41
            subcolumn.set_name(column_name);
1220
41
            subcolumn.set_type(FieldType::OLAP_FIELD_TYPE_VARIANT);
1221
41
            subcolumn.set_parent_unique_id(parent_column->unique_id());
1222
41
            subcolumn.set_path_info(column_path);
1223
41
            subcolumn.set_aggregation_method(parent_column->aggregation());
1224
41
            subcolumn.set_variant_max_subcolumns_count(
1225
41
                    parent_column->variant_max_subcolumns_count());
1226
41
            subcolumn.set_variant_enable_doc_mode(parent_column->variant_enable_doc_mode());
1227
41
            subcolumn.set_is_nullable(true);
1228
41
            output_schema->append_column(subcolumn);
1229
41
            VLOG_DEBUG << "append sub column " << subpath << " data type "
1230
0
                       << "VARIANT";
1231
41
        }
1232
        // normal case: the subcolumn type can be calculated from the data types in segments
1233
1.05k
        else {
1234
1.05k
            DataTypePtr data_type;
1235
1.05k
            get_least_supertype_jsonb(find_data_types->second, &data_type);
1236
1.05k
            TabletColumn sub_column =
1237
1.05k
                    get_column_by_type(data_type, column_name,
1238
1.05k
                                       ExtraInfo {.unique_id = -1,
1239
1.05k
                                                  .parent_unique_id = parent_column->unique_id(),
1240
1.05k
                                                  .path_info = column_path});
1241
1.05k
            inherit_column_attributes(*parent_column, sub_column);
1242
1.05k
            TabletIndexes sub_column_indexes;
1243
1.05k
            inherit_index(parent_indexes, sub_column_indexes, sub_column);
1244
1.05k
            paths_set_info.subcolumn_indexes.emplace(subpath, std::move(sub_column_indexes));
1245
1.05k
            output_schema->append_column(sub_column);
1246
18.4E
            VLOG_DEBUG << "append sub column " << subpath << " data type " << data_type->get_name();
1247
1.05k
        }
1248
1.15k
    }
1249
442
}
1250
1251
void VariantCompactionUtil::get_compaction_subcolumns_from_data_types(
1252
        TabletSchema::PathsSetInfo& paths_set_info, const TabletColumnPtr parent_column,
1253
        const TabletSchemaSPtr& target, const PathToDataTypes& path_to_data_types,
1254
12
        TabletSchemaSPtr& output_schema) {
1255
12
    const auto& parent_indexes = target->inverted_indexs(parent_column->unique_id());
1256
74
    for (const auto& [path, data_types] : path_to_data_types) {
1257
        // Typed paths are materialized by get_compaction_typed_columns(); this helper only
1258
        // materializes regular subcolumns inferred from rowset data types.
1259
74
        if (data_types.empty() || path.empty() || path.get_is_typed() || path.has_nested_part()) {
1260
10
            continue;
1261
10
        }
1262
64
        DataTypePtr data_type;
1263
64
        get_least_supertype_jsonb(data_types, &data_type);
1264
64
        auto column_name = parent_column->name_lower_case() + "." + path.get_path();
1265
64
        auto column_path = PathInData(column_name);
1266
64
        TabletColumn sub_column =
1267
64
                get_column_by_type(data_type, column_name,
1268
64
                                   ExtraInfo {.unique_id = -1,
1269
64
                                              .parent_unique_id = parent_column->unique_id(),
1270
64
                                              .path_info = column_path});
1271
64
        inherit_column_attributes(*parent_column, sub_column);
1272
64
        TabletIndexes sub_column_indexes;
1273
64
        inherit_index(parent_indexes, sub_column_indexes, sub_column);
1274
64
        paths_set_info.sub_path_set.emplace(path.get_path());
1275
64
        paths_set_info.subcolumn_indexes.emplace(path.get_path(), std::move(sub_column_indexes));
1276
64
        output_schema->append_column(sub_column);
1277
64
        VLOG_DEBUG << "append sub column " << path.get_path() << " data type "
1278
0
                   << data_type->get_name();
1279
64
    }
1280
12
}
1281
1282
// Build the temporary schema for compaction.
1283
// NestedGroup roots are special: the root VARIANT column owns the NG tree and the streaming NG
1284
// writer handles NG children, while regular non-NG paths beside the arrays are materialized as
1285
// ordinary extracted subcolumns. NG typed paths still use get_compaction_typed_columns(), keeping
1286
// typed-column rules out of the NG-specific regular-path filtering.
1287
Status VariantCompactionUtil::get_extended_compaction_schema(
1288
10.7k
        const std::vector<RowsetSharedPtr>& rowsets, TabletSchemaSPtr& target) {
1289
10.7k
    std::unordered_map<int32_t, VariantExtendedInfo> uid_to_variant_extended_info;
1290
10.7k
    const bool needs_variant_extended_info =
1291
104k
            std::ranges::any_of(target->columns(), [](const TabletColumnPtr& column) {
1292
104k
                return column->is_variant_type() && (should_check_variant_path_stats(*column) ||
1293
539
                                                     column->variant_enable_nested_group());
1294
104k
            });
1295
10.7k
    if (needs_variant_extended_info) {
1296
        // collect path stats from all rowsets and segments
1297
4.68k
        for (const auto& rs : rowsets) {
1298
4.68k
            RETURN_IF_ERROR(aggregate_variant_extended_info(rs, &uid_to_variant_extended_info));
1299
4.68k
        }
1300
539
    }
1301
1302
    // build the output schema
1303
10.7k
    TabletSchemaSPtr output_schema = std::make_shared<TabletSchema>();
1304
10.7k
    output_schema->shawdow_copy_without_columns(*target);
1305
10.7k
    std::unordered_map<int32_t, TabletSchema::PathsSetInfo> uid_to_paths_set_info;
1306
10.7k
    const auto ng_root_uids =
1307
10.7k
            collect_nested_group_compaction_root_uids(target, uid_to_variant_extended_info);
1308
107k
    for (const TabletColumnPtr& column : target->columns()) {
1309
107k
        if (!column->is_extracted_column()) {
1310
107k
            output_schema->append_column(*column);
1311
107k
        }
1312
107k
        if (!column->is_variant_type()) {
1313
107k
            continue;
1314
107k
        }
1315
18.4E
        VLOG_DEBUG << "column " << column->name() << " unique id " << column->unique_id();
1316
1317
624
        const auto info_it = uid_to_variant_extended_info.find(column->unique_id());
1318
624
        const VariantExtendedInfo empty_extended_info;
1319
624
        const VariantExtendedInfo& extended_info = info_it == uid_to_variant_extended_info.end()
1320
624
                                                           ? empty_extended_info
1321
624
                                                           : info_it->second;
1322
624
        auto& paths_set_info = uid_to_paths_set_info[column->unique_id()];
1323
624
        const bool use_nested_group_compaction_schema = ng_root_uids.contains(column->unique_id());
1324
1325
624
        if (use_nested_group_compaction_schema) {
1326
            // 1. append typed columns. Keep this shared with the non-NG typed helper; only the
1327
            // regular-path selection below is NG-specific.
1328
1
            RETURN_IF_ERROR(get_compaction_typed_columns(target, extended_info.typed_paths, column,
1329
1
                                                         output_schema, paths_set_info));
1330
1331
            // NG roots do not record path-count stats for ordinary Variant paths, so their regular
1332
            // non-NG subcolumns use the same data-types materialization helper as the
1333
            // all-materialized non-NG branch below.
1334
1
            auto regular_path_to_data_types =
1335
1
                    collect_regular_types_outside_nested_group(extended_info);
1336
1
            get_compaction_subcolumns_from_data_types(paths_set_info, column, target,
1337
1
                                                      regular_path_to_data_types, output_schema);
1338
1
            LOG(INFO) << "Variant column uid=" << column->unique_id()
1339
1
                      << " keeps nested-group root and materializes regular non-NG subcolumns in "
1340
1
                         "compaction schema";
1341
1
            continue;
1342
1
        }
1343
1344
623
        if (column->variant_enable_doc_mode()) {
1345
221
            const int bucket_num = std::max(1, column->variant_doc_hash_shard_count());
1346
852
            for (int b = 0; b < bucket_num; ++b) {
1347
631
                TabletColumn doc_value_bucket_column = create_doc_value_column(*column, b);
1348
631
                doc_value_bucket_column.set_type(FieldType::OLAP_FIELD_TYPE_VARIANT);
1349
631
                doc_value_bucket_column.set_is_nullable(false);
1350
631
                doc_value_bucket_column.set_variant_enable_doc_mode(true);
1351
631
                output_schema->append_column(doc_value_bucket_column);
1352
631
            }
1353
221
            continue;
1354
221
        }
1355
1356
        // 1. append typed columns
1357
402
        RETURN_IF_ERROR(get_compaction_typed_columns(target, extended_info.typed_paths, column,
1358
402
                                                     output_schema, paths_set_info));
1359
1360
        // 2. append nested columns
1361
402
        RETURN_IF_ERROR(get_compaction_nested_columns(extended_info.nested_paths,
1362
402
                                                      extended_info.path_to_data_types, column,
1363
402
                                                      output_schema, paths_set_info));
1364
1365
        // 3. get the subpaths
1366
402
        get_subpaths(column->variant_max_subcolumns_count(), extended_info.path_to_none_null_values,
1367
402
                     paths_set_info);
1368
1369
        // 4. append subcolumns
1370
432
        if (column->variant_max_subcolumns_count() > 0 || !column->get_sub_columns().empty()) {
1371
432
            get_compaction_subcolumns_from_subpaths(paths_set_info, column, target,
1372
432
                                                    extended_info.path_to_data_types,
1373
432
                                                    extended_info.sparse_paths, output_schema);
1374
432
        }
1375
        // variant_max_subcolumns_count == 0 and no typed paths materialized
1376
        // it means that all subcolumns are materialized, may be from old data
1377
18.4E
        else {
1378
18.4E
            get_compaction_subcolumns_from_data_types(paths_set_info, column, target,
1379
18.4E
                                                      extended_info.path_to_data_types,
1380
18.4E
                                                      output_schema);
1381
18.4E
        }
1382
1383
        // append sparse column(s)
1384
        // If variant uses bucketized sparse columns, append one sparse bucket column per bucket.
1385
        // Otherwise, append the single sparse column.
1386
402
        int bucket_num = std::max(1, column->variant_sparse_hash_shard_count());
1387
402
        if (bucket_num > 1) {
1388
1.40k
            for (int b = 0; b < bucket_num; ++b) {
1389
1.05k
                TabletColumn sparse_bucket_column = create_sparse_shard_column(*column, b);
1390
1.05k
                output_schema->append_column(sparse_bucket_column);
1391
1.05k
            }
1392
344
        } else {
1393
58
            TabletColumn sparse_column = create_sparse_column(*column);
1394
58
            output_schema->append_column(sparse_column);
1395
58
        }
1396
402
    }
1397
1398
10.7k
    target = output_schema;
1399
    // used to merge & filter path to sparse column during reading in compaction
1400
10.7k
    target->set_path_set_info(std::move(uid_to_paths_set_info));
1401
18.4E
    VLOG_DEBUG << "dump schema " << target->dump_full_schema();
1402
10.7k
    return Status::OK();
1403
10.7k
}
1404
1405
// Calculate statistics about variant data paths from the encoded sparse column
1406
void VariantCompactionUtil::calculate_variant_stats(const IColumn& encoded_sparse_column,
1407
                                                    segment_v2::VariantStatisticsPB* stats,
1408
                                                    size_t max_sparse_column_statistics_size,
1409
1.31k
                                                    size_t row_pos, size_t num_rows) {
1410
    // Cast input column to ColumnMap type since sparse column is stored as a map
1411
1.31k
    const auto& map_column = assert_cast<const ColumnMap&>(encoded_sparse_column);
1412
1413
    // Get the keys column which contains the paths as strings
1414
1.31k
    const auto& sparse_data_paths =
1415
1.31k
            assert_cast<const ColumnString*>(map_column.get_keys_ptr().get());
1416
1.31k
    const auto& serialized_sparse_column_offsets = map_column.get_offsets();
1417
1.31k
    auto& count_map = *stats->mutable_sparse_column_non_null_size();
1418
    // Iterate through all paths in the sparse column
1419
606k
    for (size_t i = row_pos; i != row_pos + num_rows; ++i) {
1420
604k
        size_t offset = serialized_sparse_column_offsets[i - 1];
1421
604k
        size_t end = serialized_sparse_column_offsets[i];
1422
1.89M
        for (size_t j = offset; j != end; ++j) {
1423
1.28M
            auto path = sparse_data_paths->get_data_at(j);
1424
1425
1.28M
            const auto& sparse_path = path.to_string();
1426
            // If path already exists in statistics, increment its count
1427
1.28M
            if (auto it = count_map.find(sparse_path); it != count_map.end()) {
1428
1.28M
                ++it->second;
1429
1.28M
            }
1430
            // If path doesn't exist and we haven't hit the max statistics size limit,
1431
            // add it with count 1
1432
3.93k
            else if (count_map.size() < max_sparse_column_statistics_size) {
1433
3.93k
                count_map.emplace(sparse_path, 1);
1434
3.93k
            }
1435
1.28M
        }
1436
604k
    }
1437
1438
1.31k
    if (stats->sparse_column_non_null_size().size() > max_sparse_column_statistics_size) {
1439
0
        throw doris::Exception(
1440
0
                ErrorCode::INTERNAL_ERROR,
1441
0
                "Sparse column non null size: {} is greater than max statistics size: {}",
1442
0
                stats->sparse_column_non_null_size().size(), max_sparse_column_statistics_size);
1443
0
    }
1444
1.31k
}
1445
1446
/// Calculates number of dimensions in array field.
1447
/// Returns 0 for scalar fields.
1448
class FieldVisitorToNumberOfDimensions : public StaticVisitor<size_t> {
1449
public:
1450
    FieldVisitorToNumberOfDimensions() = default;
1451
    template <PrimitiveType T>
1452
22.3M
    size_t apply(const typename PrimitiveTypeTraits<T>::CppType& x) {
1453
22.3M
        if constexpr (T == TYPE_ARRAY) {
1454
1.34M
            const size_t size = x.size();
1455
1.34M
            size_t dimensions = 0;
1456
3.80M
            for (size_t i = 0; i < size; ++i) {
1457
2.46M
                size_t element_dimensions = apply_visitor(*this, x[i]);
1458
2.46M
                dimensions = std::max(dimensions, element_dimensions);
1459
2.46M
            }
1460
1.34M
            return 1 + dimensions;
1461
21.0M
        } else {
1462
21.0M
            return 0;
1463
21.0M
        }
1464
22.3M
    }
_ZN5doris12variant_util32FieldVisitorToNumberOfDimensions5applyILNS_13PrimitiveTypeE1EEEmRKNS_19PrimitiveTypeTraitsIXT_EE7CppTypeE
Line
Count
Source
1452
121k
    size_t apply(const typename PrimitiveTypeTraits<T>::CppType& x) {
1453
        if constexpr (T == TYPE_ARRAY) {
1454
            const size_t size = x.size();
1455
            size_t dimensions = 0;
1456
            for (size_t i = 0; i < size; ++i) {
1457
                size_t element_dimensions = apply_visitor(*this, x[i]);
1458
                dimensions = std::max(dimensions, element_dimensions);
1459
            }
1460
            return 1 + dimensions;
1461
121k
        } else {
1462
121k
            return 0;
1463
121k
        }
1464
121k
    }
_ZN5doris12variant_util32FieldVisitorToNumberOfDimensions5applyILNS_13PrimitiveTypeE26EEEmRKNS_19PrimitiveTypeTraitsIXT_EE7CppTypeE
Line
Count
Source
1452
480
    size_t apply(const typename PrimitiveTypeTraits<T>::CppType& x) {
1453
        if constexpr (T == TYPE_ARRAY) {
1454
            const size_t size = x.size();
1455
            size_t dimensions = 0;
1456
            for (size_t i = 0; i < size; ++i) {
1457
                size_t element_dimensions = apply_visitor(*this, x[i]);
1458
                dimensions = std::max(dimensions, element_dimensions);
1459
            }
1460
            return 1 + dimensions;
1461
480
        } else {
1462
480
            return 0;
1463
480
        }
1464
480
    }
Unexecuted instantiation: _ZN5doris12variant_util32FieldVisitorToNumberOfDimensions5applyILNS_13PrimitiveTypeE42EEEmRKNS_19PrimitiveTypeTraitsIXT_EE7CppTypeE
_ZN5doris12variant_util32FieldVisitorToNumberOfDimensions5applyILNS_13PrimitiveTypeE7EEEmRKNS_19PrimitiveTypeTraitsIXT_EE7CppTypeE
Line
Count
Source
1452
41.9k
    size_t apply(const typename PrimitiveTypeTraits<T>::CppType& x) {
1453
        if constexpr (T == TYPE_ARRAY) {
1454
            const size_t size = x.size();
1455
            size_t dimensions = 0;
1456
            for (size_t i = 0; i < size; ++i) {
1457
                size_t element_dimensions = apply_visitor(*this, x[i]);
1458
                dimensions = std::max(dimensions, element_dimensions);
1459
            }
1460
            return 1 + dimensions;
1461
41.9k
        } else {
1462
41.9k
            return 0;
1463
41.9k
        }
1464
41.9k
    }
Unexecuted instantiation: _ZN5doris12variant_util32FieldVisitorToNumberOfDimensions5applyILNS_13PrimitiveTypeE12EEEmRKNS_19PrimitiveTypeTraitsIXT_EE7CppTypeE
Unexecuted instantiation: _ZN5doris12variant_util32FieldVisitorToNumberOfDimensions5applyILNS_13PrimitiveTypeE11EEEmRKNS_19PrimitiveTypeTraitsIXT_EE7CppTypeE
_ZN5doris12variant_util32FieldVisitorToNumberOfDimensions5applyILNS_13PrimitiveTypeE25EEEmRKNS_19PrimitiveTypeTraitsIXT_EE7CppTypeE
Line
Count
Source
1452
396
    size_t apply(const typename PrimitiveTypeTraits<T>::CppType& x) {
1453
        if constexpr (T == TYPE_ARRAY) {
1454
            const size_t size = x.size();
1455
            size_t dimensions = 0;
1456
            for (size_t i = 0; i < size; ++i) {
1457
                size_t element_dimensions = apply_visitor(*this, x[i]);
1458
                dimensions = std::max(dimensions, element_dimensions);
1459
            }
1460
            return 1 + dimensions;
1461
396
        } else {
1462
396
            return 0;
1463
396
        }
1464
396
    }
_ZN5doris12variant_util32FieldVisitorToNumberOfDimensions5applyILNS_13PrimitiveTypeE2EEEmRKNS_19PrimitiveTypeTraitsIXT_EE7CppTypeE
Line
Count
Source
1452
332k
    size_t apply(const typename PrimitiveTypeTraits<T>::CppType& x) {
1453
        if constexpr (T == TYPE_ARRAY) {
1454
            const size_t size = x.size();
1455
            size_t dimensions = 0;
1456
            for (size_t i = 0; i < size; ++i) {
1457
                size_t element_dimensions = apply_visitor(*this, x[i]);
1458
                dimensions = std::max(dimensions, element_dimensions);
1459
            }
1460
            return 1 + dimensions;
1461
332k
        } else {
1462
332k
            return 0;
1463
332k
        }
1464
332k
    }
_ZN5doris12variant_util32FieldVisitorToNumberOfDimensions5applyILNS_13PrimitiveTypeE3EEEmRKNS_19PrimitiveTypeTraitsIXT_EE7CppTypeE
Line
Count
Source
1452
1.03k
    size_t apply(const typename PrimitiveTypeTraits<T>::CppType& x) {
1453
        if constexpr (T == TYPE_ARRAY) {
1454
            const size_t size = x.size();
1455
            size_t dimensions = 0;
1456
            for (size_t i = 0; i < size; ++i) {
1457
                size_t element_dimensions = apply_visitor(*this, x[i]);
1458
                dimensions = std::max(dimensions, element_dimensions);
1459
            }
1460
            return 1 + dimensions;
1461
1.03k
        } else {
1462
1.03k
            return 0;
1463
1.03k
        }
1464
1.03k
    }
_ZN5doris12variant_util32FieldVisitorToNumberOfDimensions5applyILNS_13PrimitiveTypeE4EEEmRKNS_19PrimitiveTypeTraitsIXT_EE7CppTypeE
Line
Count
Source
1452
1.02k
    size_t apply(const typename PrimitiveTypeTraits<T>::CppType& x) {
1453
        if constexpr (T == TYPE_ARRAY) {
1454
            const size_t size = x.size();
1455
            size_t dimensions = 0;
1456
            for (size_t i = 0; i < size; ++i) {
1457
                size_t element_dimensions = apply_visitor(*this, x[i]);
1458
                dimensions = std::max(dimensions, element_dimensions);
1459
            }
1460
            return 1 + dimensions;
1461
1.02k
        } else {
1462
1.02k
            return 0;
1463
1.02k
        }
1464
1.02k
    }
_ZN5doris12variant_util32FieldVisitorToNumberOfDimensions5applyILNS_13PrimitiveTypeE5EEEmRKNS_19PrimitiveTypeTraitsIXT_EE7CppTypeE
Line
Count
Source
1452
2.21k
    size_t apply(const typename PrimitiveTypeTraits<T>::CppType& x) {
1453
        if constexpr (T == TYPE_ARRAY) {
1454
            const size_t size = x.size();
1455
            size_t dimensions = 0;
1456
            for (size_t i = 0; i < size; ++i) {
1457
                size_t element_dimensions = apply_visitor(*this, x[i]);
1458
                dimensions = std::max(dimensions, element_dimensions);
1459
            }
1460
            return 1 + dimensions;
1461
2.21k
        } else {
1462
2.21k
            return 0;
1463
2.21k
        }
1464
2.21k
    }
_ZN5doris12variant_util32FieldVisitorToNumberOfDimensions5applyILNS_13PrimitiveTypeE6EEEmRKNS_19PrimitiveTypeTraitsIXT_EE7CppTypeE
Line
Count
Source
1452
5.72M
    size_t apply(const typename PrimitiveTypeTraits<T>::CppType& x) {
1453
        if constexpr (T == TYPE_ARRAY) {
1454
            const size_t size = x.size();
1455
            size_t dimensions = 0;
1456
            for (size_t i = 0; i < size; ++i) {
1457
                size_t element_dimensions = apply_visitor(*this, x[i]);
1458
                dimensions = std::max(dimensions, element_dimensions);
1459
            }
1460
            return 1 + dimensions;
1461
5.72M
        } else {
1462
5.72M
            return 0;
1463
5.72M
        }
1464
5.72M
    }
Unexecuted instantiation: _ZN5doris12variant_util32FieldVisitorToNumberOfDimensions5applyILNS_13PrimitiveTypeE38EEEmRKNS_19PrimitiveTypeTraitsIXT_EE7CppTypeE
Unexecuted instantiation: _ZN5doris12variant_util32FieldVisitorToNumberOfDimensions5applyILNS_13PrimitiveTypeE39EEEmRKNS_19PrimitiveTypeTraitsIXT_EE7CppTypeE
_ZN5doris12variant_util32FieldVisitorToNumberOfDimensions5applyILNS_13PrimitiveTypeE8EEEmRKNS_19PrimitiveTypeTraitsIXT_EE7CppTypeE
Line
Count
Source
1452
859
    size_t apply(const typename PrimitiveTypeTraits<T>::CppType& x) {
1453
        if constexpr (T == TYPE_ARRAY) {
1454
            const size_t size = x.size();
1455
            size_t dimensions = 0;
1456
            for (size_t i = 0; i < size; ++i) {
1457
                size_t element_dimensions = apply_visitor(*this, x[i]);
1458
                dimensions = std::max(dimensions, element_dimensions);
1459
            }
1460
            return 1 + dimensions;
1461
859
        } else {
1462
859
            return 0;
1463
859
        }
1464
859
    }
Unexecuted instantiation: _ZN5doris12variant_util32FieldVisitorToNumberOfDimensions5applyILNS_13PrimitiveTypeE27EEEmRKNS_19PrimitiveTypeTraitsIXT_EE7CppTypeE
_ZN5doris12variant_util32FieldVisitorToNumberOfDimensions5applyILNS_13PrimitiveTypeE9EEEmRKNS_19PrimitiveTypeTraitsIXT_EE7CppTypeE
Line
Count
Source
1452
2.96M
    size_t apply(const typename PrimitiveTypeTraits<T>::CppType& x) {
1453
        if constexpr (T == TYPE_ARRAY) {
1454
            const size_t size = x.size();
1455
            size_t dimensions = 0;
1456
            for (size_t i = 0; i < size; ++i) {
1457
                size_t element_dimensions = apply_visitor(*this, x[i]);
1458
                dimensions = std::max(dimensions, element_dimensions);
1459
            }
1460
            return 1 + dimensions;
1461
2.96M
        } else {
1462
2.96M
            return 0;
1463
2.96M
        }
1464
2.96M
    }
_ZN5doris12variant_util32FieldVisitorToNumberOfDimensions5applyILNS_13PrimitiveTypeE36EEEmRKNS_19PrimitiveTypeTraitsIXT_EE7CppTypeE
Line
Count
Source
1452
306
    size_t apply(const typename PrimitiveTypeTraits<T>::CppType& x) {
1453
        if constexpr (T == TYPE_ARRAY) {
1454
            const size_t size = x.size();
1455
            size_t dimensions = 0;
1456
            for (size_t i = 0; i < size; ++i) {
1457
                size_t element_dimensions = apply_visitor(*this, x[i]);
1458
                dimensions = std::max(dimensions, element_dimensions);
1459
            }
1460
            return 1 + dimensions;
1461
306
        } else {
1462
306
            return 0;
1463
306
        }
1464
306
    }
_ZN5doris12variant_util32FieldVisitorToNumberOfDimensions5applyILNS_13PrimitiveTypeE37EEEmRKNS_19PrimitiveTypeTraitsIXT_EE7CppTypeE
Line
Count
Source
1452
275
    size_t apply(const typename PrimitiveTypeTraits<T>::CppType& x) {
1453
        if constexpr (T == TYPE_ARRAY) {
1454
            const size_t size = x.size();
1455
            size_t dimensions = 0;
1456
            for (size_t i = 0; i < size; ++i) {
1457
                size_t element_dimensions = apply_visitor(*this, x[i]);
1458
                dimensions = std::max(dimensions, element_dimensions);
1459
            }
1460
            return 1 + dimensions;
1461
275
        } else {
1462
275
            return 0;
1463
275
        }
1464
275
    }
_ZN5doris12variant_util32FieldVisitorToNumberOfDimensions5applyILNS_13PrimitiveTypeE23EEEmRKNS_19PrimitiveTypeTraitsIXT_EE7CppTypeE
Line
Count
Source
1452
11.7M
    size_t apply(const typename PrimitiveTypeTraits<T>::CppType& x) {
1453
        if constexpr (T == TYPE_ARRAY) {
1454
            const size_t size = x.size();
1455
            size_t dimensions = 0;
1456
            for (size_t i = 0; i < size; ++i) {
1457
                size_t element_dimensions = apply_visitor(*this, x[i]);
1458
                dimensions = std::max(dimensions, element_dimensions);
1459
            }
1460
            return 1 + dimensions;
1461
11.7M
        } else {
1462
11.7M
            return 0;
1463
11.7M
        }
1464
11.7M
    }
Unexecuted instantiation: _ZN5doris12variant_util32FieldVisitorToNumberOfDimensions5applyILNS_13PrimitiveTypeE15EEEmRKNS_19PrimitiveTypeTraitsIXT_EE7CppTypeE
Unexecuted instantiation: _ZN5doris12variant_util32FieldVisitorToNumberOfDimensions5applyILNS_13PrimitiveTypeE10EEEmRKNS_19PrimitiveTypeTraitsIXT_EE7CppTypeE
Unexecuted instantiation: _ZN5doris12variant_util32FieldVisitorToNumberOfDimensions5applyILNS_13PrimitiveTypeE41EEEmRKNS_19PrimitiveTypeTraitsIXT_EE7CppTypeE
_ZN5doris12variant_util32FieldVisitorToNumberOfDimensions5applyILNS_13PrimitiveTypeE17EEEmRKNS_19PrimitiveTypeTraitsIXT_EE7CppTypeE
Line
Count
Source
1452
1.34M
    size_t apply(const typename PrimitiveTypeTraits<T>::CppType& x) {
1453
1.34M
        if constexpr (T == TYPE_ARRAY) {
1454
1.34M
            const size_t size = x.size();
1455
1.34M
            size_t dimensions = 0;
1456
3.80M
            for (size_t i = 0; i < size; ++i) {
1457
2.46M
                size_t element_dimensions = apply_visitor(*this, x[i]);
1458
2.46M
                dimensions = std::max(dimensions, element_dimensions);
1459
2.46M
            }
1460
1.34M
            return 1 + dimensions;
1461
        } else {
1462
            return 0;
1463
        }
1464
1.34M
    }
_ZN5doris12variant_util32FieldVisitorToNumberOfDimensions5applyILNS_13PrimitiveTypeE16EEEmRKNS_19PrimitiveTypeTraitsIXT_EE7CppTypeE
Line
Count
Source
1452
1
    size_t apply(const typename PrimitiveTypeTraits<T>::CppType& x) {
1453
        if constexpr (T == TYPE_ARRAY) {
1454
            const size_t size = x.size();
1455
            size_t dimensions = 0;
1456
            for (size_t i = 0; i < size; ++i) {
1457
                size_t element_dimensions = apply_visitor(*this, x[i]);
1458
                dimensions = std::max(dimensions, element_dimensions);
1459
            }
1460
            return 1 + dimensions;
1461
1
        } else {
1462
1
            return 0;
1463
1
        }
1464
1
    }
Unexecuted instantiation: _ZN5doris12variant_util32FieldVisitorToNumberOfDimensions5applyILNS_13PrimitiveTypeE18EEEmRKNS_19PrimitiveTypeTraitsIXT_EE7CppTypeE
_ZN5doris12variant_util32FieldVisitorToNumberOfDimensions5applyILNS_13PrimitiveTypeE32EEEmRKNS_19PrimitiveTypeTraitsIXT_EE7CppTypeE
Line
Count
Source
1452
1
    size_t apply(const typename PrimitiveTypeTraits<T>::CppType& x) {
1453
        if constexpr (T == TYPE_ARRAY) {
1454
            const size_t size = x.size();
1455
            size_t dimensions = 0;
1456
            for (size_t i = 0; i < size; ++i) {
1457
                size_t element_dimensions = apply_visitor(*this, x[i]);
1458
                dimensions = std::max(dimensions, element_dimensions);
1459
            }
1460
            return 1 + dimensions;
1461
1
        } else {
1462
1
            return 0;
1463
1
        }
1464
1
    }
_ZN5doris12variant_util32FieldVisitorToNumberOfDimensions5applyILNS_13PrimitiveTypeE28EEEmRKNS_19PrimitiveTypeTraitsIXT_EE7CppTypeE
Line
Count
Source
1452
756
    size_t apply(const typename PrimitiveTypeTraits<T>::CppType& x) {
1453
        if constexpr (T == TYPE_ARRAY) {
1454
            const size_t size = x.size();
1455
            size_t dimensions = 0;
1456
            for (size_t i = 0; i < size; ++i) {
1457
                size_t element_dimensions = apply_visitor(*this, x[i]);
1458
                dimensions = std::max(dimensions, element_dimensions);
1459
            }
1460
            return 1 + dimensions;
1461
756
        } else {
1462
756
            return 0;
1463
756
        }
1464
756
    }
_ZN5doris12variant_util32FieldVisitorToNumberOfDimensions5applyILNS_13PrimitiveTypeE29EEEmRKNS_19PrimitiveTypeTraitsIXT_EE7CppTypeE
Line
Count
Source
1452
696
    size_t apply(const typename PrimitiveTypeTraits<T>::CppType& x) {
1453
        if constexpr (T == TYPE_ARRAY) {
1454
            const size_t size = x.size();
1455
            size_t dimensions = 0;
1456
            for (size_t i = 0; i < size; ++i) {
1457
                size_t element_dimensions = apply_visitor(*this, x[i]);
1458
                dimensions = std::max(dimensions, element_dimensions);
1459
            }
1460
            return 1 + dimensions;
1461
696
        } else {
1462
696
            return 0;
1463
696
        }
1464
696
    }
Unexecuted instantiation: _ZN5doris12variant_util32FieldVisitorToNumberOfDimensions5applyILNS_13PrimitiveTypeE20EEEmRKNS_19PrimitiveTypeTraitsIXT_EE7CppTypeE
_ZN5doris12variant_util32FieldVisitorToNumberOfDimensions5applyILNS_13PrimitiveTypeE30EEEmRKNS_19PrimitiveTypeTraitsIXT_EE7CppTypeE
Line
Count
Source
1452
60.6k
    size_t apply(const typename PrimitiveTypeTraits<T>::CppType& x) {
1453
        if constexpr (T == TYPE_ARRAY) {
1454
            const size_t size = x.size();
1455
            size_t dimensions = 0;
1456
            for (size_t i = 0; i < size; ++i) {
1457
                size_t element_dimensions = apply_visitor(*this, x[i]);
1458
                dimensions = std::max(dimensions, element_dimensions);
1459
            }
1460
            return 1 + dimensions;
1461
60.6k
        } else {
1462
60.6k
            return 0;
1463
60.6k
        }
1464
60.6k
    }
_ZN5doris12variant_util32FieldVisitorToNumberOfDimensions5applyILNS_13PrimitiveTypeE35EEEmRKNS_19PrimitiveTypeTraitsIXT_EE7CppTypeE
Line
Count
Source
1452
558
    size_t apply(const typename PrimitiveTypeTraits<T>::CppType& x) {
1453
        if constexpr (T == TYPE_ARRAY) {
1454
            const size_t size = x.size();
1455
            size_t dimensions = 0;
1456
            for (size_t i = 0; i < size; ++i) {
1457
                size_t element_dimensions = apply_visitor(*this, x[i]);
1458
                dimensions = std::max(dimensions, element_dimensions);
1459
            }
1460
            return 1 + dimensions;
1461
558
        } else {
1462
558
            return 0;
1463
558
        }
1464
558
    }
Unexecuted instantiation: _ZN5doris12variant_util32FieldVisitorToNumberOfDimensions5applyILNS_13PrimitiveTypeE22EEEmRKNS_19PrimitiveTypeTraitsIXT_EE7CppTypeE
Unexecuted instantiation: _ZN5doris12variant_util32FieldVisitorToNumberOfDimensions5applyILNS_13PrimitiveTypeE19EEEmRKNS_19PrimitiveTypeTraitsIXT_EE7CppTypeE
Unexecuted instantiation: _ZN5doris12variant_util32FieldVisitorToNumberOfDimensions5applyILNS_13PrimitiveTypeE24EEEmRKNS_19PrimitiveTypeTraitsIXT_EE7CppTypeE
_ZN5doris12variant_util32FieldVisitorToNumberOfDimensions5applyILNS_13PrimitiveTypeE31EEEmRKNS_19PrimitiveTypeTraitsIXT_EE7CppTypeE
Line
Count
Source
1452
46.9k
    size_t apply(const typename PrimitiveTypeTraits<T>::CppType& x) {
1453
        if constexpr (T == TYPE_ARRAY) {
1454
            const size_t size = x.size();
1455
            size_t dimensions = 0;
1456
            for (size_t i = 0; i < size; ++i) {
1457
                size_t element_dimensions = apply_visitor(*this, x[i]);
1458
                dimensions = std::max(dimensions, element_dimensions);
1459
            }
1460
            return 1 + dimensions;
1461
46.9k
        } else {
1462
46.9k
            return 0;
1463
46.9k
        }
1464
46.9k
    }
1465
};
1466
1467
// Visitor that allows to get type of scalar field
1468
// but exclude fields contain complex field.This is a faster version
1469
// for FieldVisitorToScalarType which does not support complex field.
1470
class SimpleFieldVisitorToScalarType : public StaticVisitor<size_t> {
1471
public:
1472
    template <PrimitiveType T>
1473
18.8M
    size_t apply(const typename PrimitiveTypeTraits<T>::CppType& x) {
1474
18.8M
        if constexpr (T == TYPE_ARRAY) {
1475
0
            throw doris::Exception(ErrorCode::INVALID_ARGUMENT, "Array type is not supported");
1476
109k
        } else if constexpr (T == TYPE_NULL) {
1477
109k
            have_nulls = true;
1478
109k
            return 1;
1479
18.7M
        } else {
1480
18.7M
            type = T;
1481
18.7M
            return 1;
1482
18.7M
        }
1483
18.8M
    }
_ZN5doris12variant_util30SimpleFieldVisitorToScalarType5applyILNS_13PrimitiveTypeE1EEEmRKNS_19PrimitiveTypeTraitsIXT_EE7CppTypeE
Line
Count
Source
1473
109k
    size_t apply(const typename PrimitiveTypeTraits<T>::CppType& x) {
1474
        if constexpr (T == TYPE_ARRAY) {
1475
            throw doris::Exception(ErrorCode::INVALID_ARGUMENT, "Array type is not supported");
1476
109k
        } else if constexpr (T == TYPE_NULL) {
1477
109k
            have_nulls = true;
1478
109k
            return 1;
1479
        } else {
1480
            type = T;
1481
            return 1;
1482
        }
1483
109k
    }
Unexecuted instantiation: _ZN5doris12variant_util30SimpleFieldVisitorToScalarType5applyILNS_13PrimitiveTypeE26EEEmRKNS_19PrimitiveTypeTraitsIXT_EE7CppTypeE
Unexecuted instantiation: _ZN5doris12variant_util30SimpleFieldVisitorToScalarType5applyILNS_13PrimitiveTypeE42EEEmRKNS_19PrimitiveTypeTraitsIXT_EE7CppTypeE
_ZN5doris12variant_util30SimpleFieldVisitorToScalarType5applyILNS_13PrimitiveTypeE7EEEmRKNS_19PrimitiveTypeTraitsIXT_EE7CppTypeE
Line
Count
Source
1473
12.3k
    size_t apply(const typename PrimitiveTypeTraits<T>::CppType& x) {
1474
        if constexpr (T == TYPE_ARRAY) {
1475
            throw doris::Exception(ErrorCode::INVALID_ARGUMENT, "Array type is not supported");
1476
        } else if constexpr (T == TYPE_NULL) {
1477
            have_nulls = true;
1478
            return 1;
1479
12.3k
        } else {
1480
12.3k
            type = T;
1481
12.3k
            return 1;
1482
12.3k
        }
1483
12.3k
    }
Unexecuted instantiation: _ZN5doris12variant_util30SimpleFieldVisitorToScalarType5applyILNS_13PrimitiveTypeE12EEEmRKNS_19PrimitiveTypeTraitsIXT_EE7CppTypeE
Unexecuted instantiation: _ZN5doris12variant_util30SimpleFieldVisitorToScalarType5applyILNS_13PrimitiveTypeE11EEEmRKNS_19PrimitiveTypeTraitsIXT_EE7CppTypeE
Unexecuted instantiation: _ZN5doris12variant_util30SimpleFieldVisitorToScalarType5applyILNS_13PrimitiveTypeE25EEEmRKNS_19PrimitiveTypeTraitsIXT_EE7CppTypeE
_ZN5doris12variant_util30SimpleFieldVisitorToScalarType5applyILNS_13PrimitiveTypeE2EEEmRKNS_19PrimitiveTypeTraitsIXT_EE7CppTypeE
Line
Count
Source
1473
273k
    size_t apply(const typename PrimitiveTypeTraits<T>::CppType& x) {
1474
        if constexpr (T == TYPE_ARRAY) {
1475
            throw doris::Exception(ErrorCode::INVALID_ARGUMENT, "Array type is not supported");
1476
        } else if constexpr (T == TYPE_NULL) {
1477
            have_nulls = true;
1478
            return 1;
1479
273k
        } else {
1480
273k
            type = T;
1481
273k
            return 1;
1482
273k
        }
1483
273k
    }
_ZN5doris12variant_util30SimpleFieldVisitorToScalarType5applyILNS_13PrimitiveTypeE3EEEmRKNS_19PrimitiveTypeTraitsIXT_EE7CppTypeE
Line
Count
Source
1473
2
    size_t apply(const typename PrimitiveTypeTraits<T>::CppType& x) {
1474
        if constexpr (T == TYPE_ARRAY) {
1475
            throw doris::Exception(ErrorCode::INVALID_ARGUMENT, "Array type is not supported");
1476
        } else if constexpr (T == TYPE_NULL) {
1477
            have_nulls = true;
1478
            return 1;
1479
2
        } else {
1480
2
            type = T;
1481
2
            return 1;
1482
2
        }
1483
2
    }
_ZN5doris12variant_util30SimpleFieldVisitorToScalarType5applyILNS_13PrimitiveTypeE4EEEmRKNS_19PrimitiveTypeTraitsIXT_EE7CppTypeE
Line
Count
Source
1473
7
    size_t apply(const typename PrimitiveTypeTraits<T>::CppType& x) {
1474
        if constexpr (T == TYPE_ARRAY) {
1475
            throw doris::Exception(ErrorCode::INVALID_ARGUMENT, "Array type is not supported");
1476
        } else if constexpr (T == TYPE_NULL) {
1477
            have_nulls = true;
1478
            return 1;
1479
7
        } else {
1480
7
            type = T;
1481
7
            return 1;
1482
7
        }
1483
7
    }
_ZN5doris12variant_util30SimpleFieldVisitorToScalarType5applyILNS_13PrimitiveTypeE5EEEmRKNS_19PrimitiveTypeTraitsIXT_EE7CppTypeE
Line
Count
Source
1473
676
    size_t apply(const typename PrimitiveTypeTraits<T>::CppType& x) {
1474
        if constexpr (T == TYPE_ARRAY) {
1475
            throw doris::Exception(ErrorCode::INVALID_ARGUMENT, "Array type is not supported");
1476
        } else if constexpr (T == TYPE_NULL) {
1477
            have_nulls = true;
1478
            return 1;
1479
676
        } else {
1480
676
            type = T;
1481
676
            return 1;
1482
676
        }
1483
676
    }
_ZN5doris12variant_util30SimpleFieldVisitorToScalarType5applyILNS_13PrimitiveTypeE6EEEmRKNS_19PrimitiveTypeTraitsIXT_EE7CppTypeE
Line
Count
Source
1473
5.05M
    size_t apply(const typename PrimitiveTypeTraits<T>::CppType& x) {
1474
        if constexpr (T == TYPE_ARRAY) {
1475
            throw doris::Exception(ErrorCode::INVALID_ARGUMENT, "Array type is not supported");
1476
        } else if constexpr (T == TYPE_NULL) {
1477
            have_nulls = true;
1478
            return 1;
1479
5.05M
        } else {
1480
5.05M
            type = T;
1481
5.05M
            return 1;
1482
5.05M
        }
1483
5.05M
    }
Unexecuted instantiation: _ZN5doris12variant_util30SimpleFieldVisitorToScalarType5applyILNS_13PrimitiveTypeE38EEEmRKNS_19PrimitiveTypeTraitsIXT_EE7CppTypeE
Unexecuted instantiation: _ZN5doris12variant_util30SimpleFieldVisitorToScalarType5applyILNS_13PrimitiveTypeE39EEEmRKNS_19PrimitiveTypeTraitsIXT_EE7CppTypeE
_ZN5doris12variant_util30SimpleFieldVisitorToScalarType5applyILNS_13PrimitiveTypeE8EEEmRKNS_19PrimitiveTypeTraitsIXT_EE7CppTypeE
Line
Count
Source
1473
1
    size_t apply(const typename PrimitiveTypeTraits<T>::CppType& x) {
1474
        if constexpr (T == TYPE_ARRAY) {
1475
            throw doris::Exception(ErrorCode::INVALID_ARGUMENT, "Array type is not supported");
1476
        } else if constexpr (T == TYPE_NULL) {
1477
            have_nulls = true;
1478
            return 1;
1479
1
        } else {
1480
1
            type = T;
1481
1
            return 1;
1482
1
        }
1483
1
    }
Unexecuted instantiation: _ZN5doris12variant_util30SimpleFieldVisitorToScalarType5applyILNS_13PrimitiveTypeE27EEEmRKNS_19PrimitiveTypeTraitsIXT_EE7CppTypeE
_ZN5doris12variant_util30SimpleFieldVisitorToScalarType5applyILNS_13PrimitiveTypeE9EEEmRKNS_19PrimitiveTypeTraitsIXT_EE7CppTypeE
Line
Count
Source
1473
2.77M
    size_t apply(const typename PrimitiveTypeTraits<T>::CppType& x) {
1474
        if constexpr (T == TYPE_ARRAY) {
1475
            throw doris::Exception(ErrorCode::INVALID_ARGUMENT, "Array type is not supported");
1476
        } else if constexpr (T == TYPE_NULL) {
1477
            have_nulls = true;
1478
            return 1;
1479
2.77M
        } else {
1480
2.77M
            type = T;
1481
2.77M
            return 1;
1482
2.77M
        }
1483
2.77M
    }
Unexecuted instantiation: _ZN5doris12variant_util30SimpleFieldVisitorToScalarType5applyILNS_13PrimitiveTypeE36EEEmRKNS_19PrimitiveTypeTraitsIXT_EE7CppTypeE
Unexecuted instantiation: _ZN5doris12variant_util30SimpleFieldVisitorToScalarType5applyILNS_13PrimitiveTypeE37EEEmRKNS_19PrimitiveTypeTraitsIXT_EE7CppTypeE
_ZN5doris12variant_util30SimpleFieldVisitorToScalarType5applyILNS_13PrimitiveTypeE23EEEmRKNS_19PrimitiveTypeTraitsIXT_EE7CppTypeE
Line
Count
Source
1473
10.5M
    size_t apply(const typename PrimitiveTypeTraits<T>::CppType& x) {
1474
        if constexpr (T == TYPE_ARRAY) {
1475
            throw doris::Exception(ErrorCode::INVALID_ARGUMENT, "Array type is not supported");
1476
        } else if constexpr (T == TYPE_NULL) {
1477
            have_nulls = true;
1478
            return 1;
1479
10.5M
        } else {
1480
10.5M
            type = T;
1481
10.5M
            return 1;
1482
10.5M
        }
1483
10.5M
    }
Unexecuted instantiation: _ZN5doris12variant_util30SimpleFieldVisitorToScalarType5applyILNS_13PrimitiveTypeE15EEEmRKNS_19PrimitiveTypeTraitsIXT_EE7CppTypeE
Unexecuted instantiation: _ZN5doris12variant_util30SimpleFieldVisitorToScalarType5applyILNS_13PrimitiveTypeE10EEEmRKNS_19PrimitiveTypeTraitsIXT_EE7CppTypeE
Unexecuted instantiation: _ZN5doris12variant_util30SimpleFieldVisitorToScalarType5applyILNS_13PrimitiveTypeE41EEEmRKNS_19PrimitiveTypeTraitsIXT_EE7CppTypeE
Unexecuted instantiation: _ZN5doris12variant_util30SimpleFieldVisitorToScalarType5applyILNS_13PrimitiveTypeE17EEEmRKNS_19PrimitiveTypeTraitsIXT_EE7CppTypeE
Unexecuted instantiation: _ZN5doris12variant_util30SimpleFieldVisitorToScalarType5applyILNS_13PrimitiveTypeE16EEEmRKNS_19PrimitiveTypeTraitsIXT_EE7CppTypeE
Unexecuted instantiation: _ZN5doris12variant_util30SimpleFieldVisitorToScalarType5applyILNS_13PrimitiveTypeE18EEEmRKNS_19PrimitiveTypeTraitsIXT_EE7CppTypeE
Unexecuted instantiation: _ZN5doris12variant_util30SimpleFieldVisitorToScalarType5applyILNS_13PrimitiveTypeE32EEEmRKNS_19PrimitiveTypeTraitsIXT_EE7CppTypeE
Unexecuted instantiation: _ZN5doris12variant_util30SimpleFieldVisitorToScalarType5applyILNS_13PrimitiveTypeE28EEEmRKNS_19PrimitiveTypeTraitsIXT_EE7CppTypeE
Unexecuted instantiation: _ZN5doris12variant_util30SimpleFieldVisitorToScalarType5applyILNS_13PrimitiveTypeE29EEEmRKNS_19PrimitiveTypeTraitsIXT_EE7CppTypeE
Unexecuted instantiation: _ZN5doris12variant_util30SimpleFieldVisitorToScalarType5applyILNS_13PrimitiveTypeE20EEEmRKNS_19PrimitiveTypeTraitsIXT_EE7CppTypeE
Unexecuted instantiation: _ZN5doris12variant_util30SimpleFieldVisitorToScalarType5applyILNS_13PrimitiveTypeE30EEEmRKNS_19PrimitiveTypeTraitsIXT_EE7CppTypeE
Unexecuted instantiation: _ZN5doris12variant_util30SimpleFieldVisitorToScalarType5applyILNS_13PrimitiveTypeE35EEEmRKNS_19PrimitiveTypeTraitsIXT_EE7CppTypeE
Unexecuted instantiation: _ZN5doris12variant_util30SimpleFieldVisitorToScalarType5applyILNS_13PrimitiveTypeE22EEEmRKNS_19PrimitiveTypeTraitsIXT_EE7CppTypeE
Unexecuted instantiation: _ZN5doris12variant_util30SimpleFieldVisitorToScalarType5applyILNS_13PrimitiveTypeE19EEEmRKNS_19PrimitiveTypeTraitsIXT_EE7CppTypeE
Unexecuted instantiation: _ZN5doris12variant_util30SimpleFieldVisitorToScalarType5applyILNS_13PrimitiveTypeE24EEEmRKNS_19PrimitiveTypeTraitsIXT_EE7CppTypeE
_ZN5doris12variant_util30SimpleFieldVisitorToScalarType5applyILNS_13PrimitiveTypeE31EEEmRKNS_19PrimitiveTypeTraitsIXT_EE7CppTypeE
Line
Count
Source
1473
46.8k
    size_t apply(const typename PrimitiveTypeTraits<T>::CppType& x) {
1474
        if constexpr (T == TYPE_ARRAY) {
1475
            throw doris::Exception(ErrorCode::INVALID_ARGUMENT, "Array type is not supported");
1476
        } else if constexpr (T == TYPE_NULL) {
1477
            have_nulls = true;
1478
            return 1;
1479
46.8k
        } else {
1480
46.8k
            type = T;
1481
46.8k
            return 1;
1482
46.8k
        }
1483
46.8k
    }
1484
18.5M
    void get_scalar_type(PrimitiveType* data_type) const { *data_type = type; }
1485
18.5M
    bool contain_nulls() const { return have_nulls; }
1486
1487
18.5M
    bool need_convert_field() const { return false; }
1488
1489
private:
1490
    PrimitiveType type = PrimitiveType::INVALID_TYPE;
1491
    bool have_nulls = false;
1492
};
1493
1494
/// Visitor that allows to get type of scalar field
1495
/// or least common type of scalars in array.
1496
/// More optimized version of FieldToDataType.
1497
class FieldVisitorToScalarType : public StaticVisitor<size_t> {
1498
public:
1499
    template <PrimitiveType T>
1500
3.54M
    size_t apply(const typename PrimitiveTypeTraits<T>::CppType& x) {
1501
3.54M
        if constexpr (T == TYPE_ARRAY) {
1502
1.34M
            size_t size = x.size();
1503
3.80M
            for (size_t i = 0; i < size; ++i) {
1504
2.46M
                apply_visitor(*this, x[i]);
1505
2.46M
            }
1506
1.34M
            return 0;
1507
1.34M
        } else if constexpr (T == TYPE_NULL) {
1508
12.2k
            have_nulls = true;
1509
12.2k
            return 0;
1510
2.18M
        } else {
1511
2.18M
            field_types.insert(T);
1512
2.18M
            type_indexes.insert(T);
1513
2.18M
            return 0;
1514
2.18M
        }
1515
3.54M
    }
_ZN5doris12variant_util24FieldVisitorToScalarType5applyILNS_13PrimitiveTypeE1EEEmRKNS_19PrimitiveTypeTraitsIXT_EE7CppTypeE
Line
Count
Source
1500
12.2k
    size_t apply(const typename PrimitiveTypeTraits<T>::CppType& x) {
1501
        if constexpr (T == TYPE_ARRAY) {
1502
            size_t size = x.size();
1503
            for (size_t i = 0; i < size; ++i) {
1504
                apply_visitor(*this, x[i]);
1505
            }
1506
            return 0;
1507
12.2k
        } else if constexpr (T == TYPE_NULL) {
1508
12.2k
            have_nulls = true;
1509
12.2k
            return 0;
1510
        } else {
1511
            field_types.insert(T);
1512
            type_indexes.insert(T);
1513
            return 0;
1514
        }
1515
12.2k
    }
_ZN5doris12variant_util24FieldVisitorToScalarType5applyILNS_13PrimitiveTypeE26EEEmRKNS_19PrimitiveTypeTraitsIXT_EE7CppTypeE
Line
Count
Source
1500
480
    size_t apply(const typename PrimitiveTypeTraits<T>::CppType& x) {
1501
        if constexpr (T == TYPE_ARRAY) {
1502
            size_t size = x.size();
1503
            for (size_t i = 0; i < size; ++i) {
1504
                apply_visitor(*this, x[i]);
1505
            }
1506
            return 0;
1507
        } else if constexpr (T == TYPE_NULL) {
1508
            have_nulls = true;
1509
            return 0;
1510
480
        } else {
1511
480
            field_types.insert(T);
1512
480
            type_indexes.insert(T);
1513
480
            return 0;
1514
480
        }
1515
480
    }
Unexecuted instantiation: _ZN5doris12variant_util24FieldVisitorToScalarType5applyILNS_13PrimitiveTypeE42EEEmRKNS_19PrimitiveTypeTraitsIXT_EE7CppTypeE
_ZN5doris12variant_util24FieldVisitorToScalarType5applyILNS_13PrimitiveTypeE7EEEmRKNS_19PrimitiveTypeTraitsIXT_EE7CppTypeE
Line
Count
Source
1500
29.6k
    size_t apply(const typename PrimitiveTypeTraits<T>::CppType& x) {
1501
        if constexpr (T == TYPE_ARRAY) {
1502
            size_t size = x.size();
1503
            for (size_t i = 0; i < size; ++i) {
1504
                apply_visitor(*this, x[i]);
1505
            }
1506
            return 0;
1507
        } else if constexpr (T == TYPE_NULL) {
1508
            have_nulls = true;
1509
            return 0;
1510
29.6k
        } else {
1511
29.6k
            field_types.insert(T);
1512
29.6k
            type_indexes.insert(T);
1513
29.6k
            return 0;
1514
29.6k
        }
1515
29.6k
    }
Unexecuted instantiation: _ZN5doris12variant_util24FieldVisitorToScalarType5applyILNS_13PrimitiveTypeE12EEEmRKNS_19PrimitiveTypeTraitsIXT_EE7CppTypeE
Unexecuted instantiation: _ZN5doris12variant_util24FieldVisitorToScalarType5applyILNS_13PrimitiveTypeE11EEEmRKNS_19PrimitiveTypeTraitsIXT_EE7CppTypeE
_ZN5doris12variant_util24FieldVisitorToScalarType5applyILNS_13PrimitiveTypeE25EEEmRKNS_19PrimitiveTypeTraitsIXT_EE7CppTypeE
Line
Count
Source
1500
396
    size_t apply(const typename PrimitiveTypeTraits<T>::CppType& x) {
1501
        if constexpr (T == TYPE_ARRAY) {
1502
            size_t size = x.size();
1503
            for (size_t i = 0; i < size; ++i) {
1504
                apply_visitor(*this, x[i]);
1505
            }
1506
            return 0;
1507
        } else if constexpr (T == TYPE_NULL) {
1508
            have_nulls = true;
1509
            return 0;
1510
396
        } else {
1511
396
            field_types.insert(T);
1512
396
            type_indexes.insert(T);
1513
396
            return 0;
1514
396
        }
1515
396
    }
_ZN5doris12variant_util24FieldVisitorToScalarType5applyILNS_13PrimitiveTypeE2EEEmRKNS_19PrimitiveTypeTraitsIXT_EE7CppTypeE
Line
Count
Source
1500
58.3k
    size_t apply(const typename PrimitiveTypeTraits<T>::CppType& x) {
1501
        if constexpr (T == TYPE_ARRAY) {
1502
            size_t size = x.size();
1503
            for (size_t i = 0; i < size; ++i) {
1504
                apply_visitor(*this, x[i]);
1505
            }
1506
            return 0;
1507
        } else if constexpr (T == TYPE_NULL) {
1508
            have_nulls = true;
1509
            return 0;
1510
58.3k
        } else {
1511
58.3k
            field_types.insert(T);
1512
58.3k
            type_indexes.insert(T);
1513
58.3k
            return 0;
1514
58.3k
        }
1515
58.3k
    }
_ZN5doris12variant_util24FieldVisitorToScalarType5applyILNS_13PrimitiveTypeE3EEEmRKNS_19PrimitiveTypeTraitsIXT_EE7CppTypeE
Line
Count
Source
1500
1.03k
    size_t apply(const typename PrimitiveTypeTraits<T>::CppType& x) {
1501
        if constexpr (T == TYPE_ARRAY) {
1502
            size_t size = x.size();
1503
            for (size_t i = 0; i < size; ++i) {
1504
                apply_visitor(*this, x[i]);
1505
            }
1506
            return 0;
1507
        } else if constexpr (T == TYPE_NULL) {
1508
            have_nulls = true;
1509
            return 0;
1510
1.03k
        } else {
1511
1.03k
            field_types.insert(T);
1512
1.03k
            type_indexes.insert(T);
1513
1.03k
            return 0;
1514
1.03k
        }
1515
1.03k
    }
_ZN5doris12variant_util24FieldVisitorToScalarType5applyILNS_13PrimitiveTypeE4EEEmRKNS_19PrimitiveTypeTraitsIXT_EE7CppTypeE
Line
Count
Source
1500
1.02k
    size_t apply(const typename PrimitiveTypeTraits<T>::CppType& x) {
1501
        if constexpr (T == TYPE_ARRAY) {
1502
            size_t size = x.size();
1503
            for (size_t i = 0; i < size; ++i) {
1504
                apply_visitor(*this, x[i]);
1505
            }
1506
            return 0;
1507
        } else if constexpr (T == TYPE_NULL) {
1508
            have_nulls = true;
1509
            return 0;
1510
1.02k
        } else {
1511
1.02k
            field_types.insert(T);
1512
1.02k
            type_indexes.insert(T);
1513
1.02k
            return 0;
1514
1.02k
        }
1515
1.02k
    }
_ZN5doris12variant_util24FieldVisitorToScalarType5applyILNS_13PrimitiveTypeE5EEEmRKNS_19PrimitiveTypeTraitsIXT_EE7CppTypeE
Line
Count
Source
1500
1.53k
    size_t apply(const typename PrimitiveTypeTraits<T>::CppType& x) {
1501
        if constexpr (T == TYPE_ARRAY) {
1502
            size_t size = x.size();
1503
            for (size_t i = 0; i < size; ++i) {
1504
                apply_visitor(*this, x[i]);
1505
            }
1506
            return 0;
1507
        } else if constexpr (T == TYPE_NULL) {
1508
            have_nulls = true;
1509
            return 0;
1510
1.53k
        } else {
1511
1.53k
            field_types.insert(T);
1512
1.53k
            type_indexes.insert(T);
1513
1.53k
            return 0;
1514
1.53k
        }
1515
1.53k
    }
_ZN5doris12variant_util24FieldVisitorToScalarType5applyILNS_13PrimitiveTypeE6EEEmRKNS_19PrimitiveTypeTraitsIXT_EE7CppTypeE
Line
Count
Source
1500
673k
    size_t apply(const typename PrimitiveTypeTraits<T>::CppType& x) {
1501
        if constexpr (T == TYPE_ARRAY) {
1502
            size_t size = x.size();
1503
            for (size_t i = 0; i < size; ++i) {
1504
                apply_visitor(*this, x[i]);
1505
            }
1506
            return 0;
1507
        } else if constexpr (T == TYPE_NULL) {
1508
            have_nulls = true;
1509
            return 0;
1510
673k
        } else {
1511
673k
            field_types.insert(T);
1512
673k
            type_indexes.insert(T);
1513
673k
            return 0;
1514
673k
        }
1515
673k
    }
Unexecuted instantiation: _ZN5doris12variant_util24FieldVisitorToScalarType5applyILNS_13PrimitiveTypeE38EEEmRKNS_19PrimitiveTypeTraitsIXT_EE7CppTypeE
Unexecuted instantiation: _ZN5doris12variant_util24FieldVisitorToScalarType5applyILNS_13PrimitiveTypeE39EEEmRKNS_19PrimitiveTypeTraitsIXT_EE7CppTypeE
_ZN5doris12variant_util24FieldVisitorToScalarType5applyILNS_13PrimitiveTypeE8EEEmRKNS_19PrimitiveTypeTraitsIXT_EE7CppTypeE
Line
Count
Source
1500
858
    size_t apply(const typename PrimitiveTypeTraits<T>::CppType& x) {
1501
        if constexpr (T == TYPE_ARRAY) {
1502
            size_t size = x.size();
1503
            for (size_t i = 0; i < size; ++i) {
1504
                apply_visitor(*this, x[i]);
1505
            }
1506
            return 0;
1507
        } else if constexpr (T == TYPE_NULL) {
1508
            have_nulls = true;
1509
            return 0;
1510
858
        } else {
1511
858
            field_types.insert(T);
1512
858
            type_indexes.insert(T);
1513
858
            return 0;
1514
858
        }
1515
858
    }
Unexecuted instantiation: _ZN5doris12variant_util24FieldVisitorToScalarType5applyILNS_13PrimitiveTypeE27EEEmRKNS_19PrimitiveTypeTraitsIXT_EE7CppTypeE
_ZN5doris12variant_util24FieldVisitorToScalarType5applyILNS_13PrimitiveTypeE9EEEmRKNS_19PrimitiveTypeTraitsIXT_EE7CppTypeE
Line
Count
Source
1500
198k
    size_t apply(const typename PrimitiveTypeTraits<T>::CppType& x) {
1501
        if constexpr (T == TYPE_ARRAY) {
1502
            size_t size = x.size();
1503
            for (size_t i = 0; i < size; ++i) {
1504
                apply_visitor(*this, x[i]);
1505
            }
1506
            return 0;
1507
        } else if constexpr (T == TYPE_NULL) {
1508
            have_nulls = true;
1509
            return 0;
1510
198k
        } else {
1511
198k
            field_types.insert(T);
1512
198k
            type_indexes.insert(T);
1513
198k
            return 0;
1514
198k
        }
1515
198k
    }
_ZN5doris12variant_util24FieldVisitorToScalarType5applyILNS_13PrimitiveTypeE36EEEmRKNS_19PrimitiveTypeTraitsIXT_EE7CppTypeE
Line
Count
Source
1500
306
    size_t apply(const typename PrimitiveTypeTraits<T>::CppType& x) {
1501
        if constexpr (T == TYPE_ARRAY) {
1502
            size_t size = x.size();
1503
            for (size_t i = 0; i < size; ++i) {
1504
                apply_visitor(*this, x[i]);
1505
            }
1506
            return 0;
1507
        } else if constexpr (T == TYPE_NULL) {
1508
            have_nulls = true;
1509
            return 0;
1510
306
        } else {
1511
306
            field_types.insert(T);
1512
306
            type_indexes.insert(T);
1513
306
            return 0;
1514
306
        }
1515
306
    }
_ZN5doris12variant_util24FieldVisitorToScalarType5applyILNS_13PrimitiveTypeE37EEEmRKNS_19PrimitiveTypeTraitsIXT_EE7CppTypeE
Line
Count
Source
1500
275
    size_t apply(const typename PrimitiveTypeTraits<T>::CppType& x) {
1501
        if constexpr (T == TYPE_ARRAY) {
1502
            size_t size = x.size();
1503
            for (size_t i = 0; i < size; ++i) {
1504
                apply_visitor(*this, x[i]);
1505
            }
1506
            return 0;
1507
        } else if constexpr (T == TYPE_NULL) {
1508
            have_nulls = true;
1509
            return 0;
1510
275
        } else {
1511
275
            field_types.insert(T);
1512
275
            type_indexes.insert(T);
1513
275
            return 0;
1514
275
        }
1515
275
    }
_ZN5doris12variant_util24FieldVisitorToScalarType5applyILNS_13PrimitiveTypeE23EEEmRKNS_19PrimitiveTypeTraitsIXT_EE7CppTypeE
Line
Count
Source
1500
1.15M
    size_t apply(const typename PrimitiveTypeTraits<T>::CppType& x) {
1501
        if constexpr (T == TYPE_ARRAY) {
1502
            size_t size = x.size();
1503
            for (size_t i = 0; i < size; ++i) {
1504
                apply_visitor(*this, x[i]);
1505
            }
1506
            return 0;
1507
        } else if constexpr (T == TYPE_NULL) {
1508
            have_nulls = true;
1509
            return 0;
1510
1.15M
        } else {
1511
1.15M
            field_types.insert(T);
1512
1.15M
            type_indexes.insert(T);
1513
1.15M
            return 0;
1514
1.15M
        }
1515
1.15M
    }
Unexecuted instantiation: _ZN5doris12variant_util24FieldVisitorToScalarType5applyILNS_13PrimitiveTypeE15EEEmRKNS_19PrimitiveTypeTraitsIXT_EE7CppTypeE
Unexecuted instantiation: _ZN5doris12variant_util24FieldVisitorToScalarType5applyILNS_13PrimitiveTypeE10EEEmRKNS_19PrimitiveTypeTraitsIXT_EE7CppTypeE
Unexecuted instantiation: _ZN5doris12variant_util24FieldVisitorToScalarType5applyILNS_13PrimitiveTypeE41EEEmRKNS_19PrimitiveTypeTraitsIXT_EE7CppTypeE
_ZN5doris12variant_util24FieldVisitorToScalarType5applyILNS_13PrimitiveTypeE17EEEmRKNS_19PrimitiveTypeTraitsIXT_EE7CppTypeE
Line
Count
Source
1500
1.34M
    size_t apply(const typename PrimitiveTypeTraits<T>::CppType& x) {
1501
1.34M
        if constexpr (T == TYPE_ARRAY) {
1502
1.34M
            size_t size = x.size();
1503
3.80M
            for (size_t i = 0; i < size; ++i) {
1504
2.46M
                apply_visitor(*this, x[i]);
1505
2.46M
            }
1506
1.34M
            return 0;
1507
        } else if constexpr (T == TYPE_NULL) {
1508
            have_nulls = true;
1509
            return 0;
1510
        } else {
1511
            field_types.insert(T);
1512
            type_indexes.insert(T);
1513
            return 0;
1514
        }
1515
1.34M
    }
_ZN5doris12variant_util24FieldVisitorToScalarType5applyILNS_13PrimitiveTypeE16EEEmRKNS_19PrimitiveTypeTraitsIXT_EE7CppTypeE
Line
Count
Source
1500
1
    size_t apply(const typename PrimitiveTypeTraits<T>::CppType& x) {
1501
        if constexpr (T == TYPE_ARRAY) {
1502
            size_t size = x.size();
1503
            for (size_t i = 0; i < size; ++i) {
1504
                apply_visitor(*this, x[i]);
1505
            }
1506
            return 0;
1507
        } else if constexpr (T == TYPE_NULL) {
1508
            have_nulls = true;
1509
            return 0;
1510
1
        } else {
1511
1
            field_types.insert(T);
1512
1
            type_indexes.insert(T);
1513
1
            return 0;
1514
1
        }
1515
1
    }
Unexecuted instantiation: _ZN5doris12variant_util24FieldVisitorToScalarType5applyILNS_13PrimitiveTypeE18EEEmRKNS_19PrimitiveTypeTraitsIXT_EE7CppTypeE
_ZN5doris12variant_util24FieldVisitorToScalarType5applyILNS_13PrimitiveTypeE32EEEmRKNS_19PrimitiveTypeTraitsIXT_EE7CppTypeE
Line
Count
Source
1500
1
    size_t apply(const typename PrimitiveTypeTraits<T>::CppType& x) {
1501
        if constexpr (T == TYPE_ARRAY) {
1502
            size_t size = x.size();
1503
            for (size_t i = 0; i < size; ++i) {
1504
                apply_visitor(*this, x[i]);
1505
            }
1506
            return 0;
1507
        } else if constexpr (T == TYPE_NULL) {
1508
            have_nulls = true;
1509
            return 0;
1510
1
        } else {
1511
1
            field_types.insert(T);
1512
1
            type_indexes.insert(T);
1513
1
            return 0;
1514
1
        }
1515
1
    }
_ZN5doris12variant_util24FieldVisitorToScalarType5applyILNS_13PrimitiveTypeE28EEEmRKNS_19PrimitiveTypeTraitsIXT_EE7CppTypeE
Line
Count
Source
1500
756
    size_t apply(const typename PrimitiveTypeTraits<T>::CppType& x) {
1501
        if constexpr (T == TYPE_ARRAY) {
1502
            size_t size = x.size();
1503
            for (size_t i = 0; i < size; ++i) {
1504
                apply_visitor(*this, x[i]);
1505
            }
1506
            return 0;
1507
        } else if constexpr (T == TYPE_NULL) {
1508
            have_nulls = true;
1509
            return 0;
1510
756
        } else {
1511
756
            field_types.insert(T);
1512
756
            type_indexes.insert(T);
1513
756
            return 0;
1514
756
        }
1515
756
    }
_ZN5doris12variant_util24FieldVisitorToScalarType5applyILNS_13PrimitiveTypeE29EEEmRKNS_19PrimitiveTypeTraitsIXT_EE7CppTypeE
Line
Count
Source
1500
696
    size_t apply(const typename PrimitiveTypeTraits<T>::CppType& x) {
1501
        if constexpr (T == TYPE_ARRAY) {
1502
            size_t size = x.size();
1503
            for (size_t i = 0; i < size; ++i) {
1504
                apply_visitor(*this, x[i]);
1505
            }
1506
            return 0;
1507
        } else if constexpr (T == TYPE_NULL) {
1508
            have_nulls = true;
1509
            return 0;
1510
696
        } else {
1511
696
            field_types.insert(T);
1512
696
            type_indexes.insert(T);
1513
696
            return 0;
1514
696
        }
1515
696
    }
Unexecuted instantiation: _ZN5doris12variant_util24FieldVisitorToScalarType5applyILNS_13PrimitiveTypeE20EEEmRKNS_19PrimitiveTypeTraitsIXT_EE7CppTypeE
_ZN5doris12variant_util24FieldVisitorToScalarType5applyILNS_13PrimitiveTypeE30EEEmRKNS_19PrimitiveTypeTraitsIXT_EE7CppTypeE
Line
Count
Source
1500
60.6k
    size_t apply(const typename PrimitiveTypeTraits<T>::CppType& x) {
1501
        if constexpr (T == TYPE_ARRAY) {
1502
            size_t size = x.size();
1503
            for (size_t i = 0; i < size; ++i) {
1504
                apply_visitor(*this, x[i]);
1505
            }
1506
            return 0;
1507
        } else if constexpr (T == TYPE_NULL) {
1508
            have_nulls = true;
1509
            return 0;
1510
60.6k
        } else {
1511
60.6k
            field_types.insert(T);
1512
60.6k
            type_indexes.insert(T);
1513
60.6k
            return 0;
1514
60.6k
        }
1515
60.6k
    }
_ZN5doris12variant_util24FieldVisitorToScalarType5applyILNS_13PrimitiveTypeE35EEEmRKNS_19PrimitiveTypeTraitsIXT_EE7CppTypeE
Line
Count
Source
1500
558
    size_t apply(const typename PrimitiveTypeTraits<T>::CppType& x) {
1501
        if constexpr (T == TYPE_ARRAY) {
1502
            size_t size = x.size();
1503
            for (size_t i = 0; i < size; ++i) {
1504
                apply_visitor(*this, x[i]);
1505
            }
1506
            return 0;
1507
        } else if constexpr (T == TYPE_NULL) {
1508
            have_nulls = true;
1509
            return 0;
1510
558
        } else {
1511
558
            field_types.insert(T);
1512
558
            type_indexes.insert(T);
1513
558
            return 0;
1514
558
        }
1515
558
    }
Unexecuted instantiation: _ZN5doris12variant_util24FieldVisitorToScalarType5applyILNS_13PrimitiveTypeE22EEEmRKNS_19PrimitiveTypeTraitsIXT_EE7CppTypeE
Unexecuted instantiation: _ZN5doris12variant_util24FieldVisitorToScalarType5applyILNS_13PrimitiveTypeE19EEEmRKNS_19PrimitiveTypeTraitsIXT_EE7CppTypeE
Unexecuted instantiation: _ZN5doris12variant_util24FieldVisitorToScalarType5applyILNS_13PrimitiveTypeE24EEEmRKNS_19PrimitiveTypeTraitsIXT_EE7CppTypeE
_ZN5doris12variant_util24FieldVisitorToScalarType5applyILNS_13PrimitiveTypeE31EEEmRKNS_19PrimitiveTypeTraitsIXT_EE7CppTypeE
Line
Count
Source
1500
44
    size_t apply(const typename PrimitiveTypeTraits<T>::CppType& x) {
1501
        if constexpr (T == TYPE_ARRAY) {
1502
            size_t size = x.size();
1503
            for (size_t i = 0; i < size; ++i) {
1504
                apply_visitor(*this, x[i]);
1505
            }
1506
            return 0;
1507
        } else if constexpr (T == TYPE_NULL) {
1508
            have_nulls = true;
1509
            return 0;
1510
44
        } else {
1511
44
            field_types.insert(T);
1512
44
            type_indexes.insert(T);
1513
44
            return 0;
1514
44
        }
1515
44
    }
1516
1.08M
    void get_scalar_type(PrimitiveType* type) const {
1517
1.08M
        if (type_indexes.size() == 1) {
1518
            // Most cases will have only one type
1519
989k
            *type = *type_indexes.begin();
1520
989k
            return;
1521
989k
        }
1522
90.6k
        DataTypePtr data_type;
1523
90.6k
        get_least_supertype_jsonb(type_indexes, &data_type);
1524
90.6k
        *type = data_type->get_primitive_type();
1525
90.6k
    }
1526
1.08M
    bool contain_nulls() const { return have_nulls; }
1527
1.08M
    bool need_convert_field() const { return field_types.size() > 1; }
1528
1529
private:
1530
    phmap::flat_hash_set<PrimitiveType> type_indexes;
1531
    phmap::flat_hash_set<PrimitiveType> field_types;
1532
    bool have_nulls = false;
1533
};
1534
1535
template <typename Visitor>
1536
19.8M
void get_field_info_impl(const Field& field, FieldInfo* info) {
1537
19.8M
    Visitor to_scalar_type_visitor;
1538
19.8M
    apply_visitor(to_scalar_type_visitor, field);
1539
19.8M
    PrimitiveType type_id;
1540
19.8M
    to_scalar_type_visitor.get_scalar_type(&type_id);
1541
    // array item's dimension may missmatch, eg. [1, 2, [1, 2, 3]]
1542
19.8M
    *info = {type_id, to_scalar_type_visitor.contain_nulls(),
1543
19.8M
             to_scalar_type_visitor.need_convert_field(),
1544
19.8M
             apply_visitor(FieldVisitorToNumberOfDimensions(), field)};
1545
19.8M
}
_ZN5doris12variant_util19get_field_info_implINS0_24FieldVisitorToScalarTypeEEEvRKNS_5FieldEPNS_9FieldInfoE
Line
Count
Source
1536
1.08M
void get_field_info_impl(const Field& field, FieldInfo* info) {
1537
1.08M
    Visitor to_scalar_type_visitor;
1538
1.08M
    apply_visitor(to_scalar_type_visitor, field);
1539
1.08M
    PrimitiveType type_id;
1540
1.08M
    to_scalar_type_visitor.get_scalar_type(&type_id);
1541
    // array item's dimension may missmatch, eg. [1, 2, [1, 2, 3]]
1542
1.08M
    *info = {type_id, to_scalar_type_visitor.contain_nulls(),
1543
1.08M
             to_scalar_type_visitor.need_convert_field(),
1544
1.08M
             apply_visitor(FieldVisitorToNumberOfDimensions(), field)};
1545
1.08M
}
_ZN5doris12variant_util19get_field_info_implINS0_30SimpleFieldVisitorToScalarTypeEEEvRKNS_5FieldEPNS_9FieldInfoE
Line
Count
Source
1536
18.7M
void get_field_info_impl(const Field& field, FieldInfo* info) {
1537
18.7M
    Visitor to_scalar_type_visitor;
1538
18.7M
    apply_visitor(to_scalar_type_visitor, field);
1539
18.7M
    PrimitiveType type_id;
1540
18.7M
    to_scalar_type_visitor.get_scalar_type(&type_id);
1541
    // array item's dimension may missmatch, eg. [1, 2, [1, 2, 3]]
1542
18.7M
    *info = {type_id, to_scalar_type_visitor.contain_nulls(),
1543
18.7M
             to_scalar_type_visitor.need_convert_field(),
1544
18.7M
             apply_visitor(FieldVisitorToNumberOfDimensions(), field)};
1545
18.7M
}
1546
1547
19.8M
void get_field_info(const Field& field, FieldInfo* info) {
1548
19.8M
    if (field.is_complex_field()) {
1549
1.08M
        get_field_info_impl<FieldVisitorToScalarType>(field, info);
1550
18.7M
    } else {
1551
18.7M
        get_field_info_impl<SimpleFieldVisitorToScalarType>(field, info);
1552
18.7M
    }
1553
19.8M
}
1554
1555
bool generate_sub_column_info(const TabletSchema& schema, int32_t col_unique_id,
1556
                              const std::string& path,
1557
136k
                              TabletSchema::SubColumnInfo* sub_column_info) {
1558
136k
    const auto& parent_column = schema.column_by_uid(col_unique_id);
1559
136k
    std::function<void(const TabletColumn&, TabletColumn*)> generate_result_column =
1560
136k
            [&](const TabletColumn& from_column, TabletColumn* to_column) {
1561
15.8k
                to_column->set_name(parent_column.name_lower_case() + "." + path);
1562
15.8k
                to_column->set_type(from_column.type());
1563
15.8k
                to_column->set_parent_unique_id(parent_column.unique_id());
1564
15.8k
                bool is_typed = !parent_column.variant_enable_typed_paths_to_sparse();
1565
15.8k
                to_column->set_path_info(
1566
15.8k
                        PathInData(parent_column.name_lower_case() + "." + path, is_typed));
1567
15.8k
                to_column->set_aggregation_method(parent_column.aggregation());
1568
15.8k
                to_column->set_is_nullable(true);
1569
15.8k
                to_column->set_parent_unique_id(parent_column.unique_id());
1570
15.8k
                if (from_column.is_decimal()) {
1571
15.7k
                    to_column->set_precision(from_column.precision());
1572
15.7k
                }
1573
15.8k
                to_column->set_frac(from_column.frac());
1574
1575
15.8k
                if (from_column.is_array_type()) {
1576
3.00k
                    TabletColumn nested_column;
1577
3.00k
                    generate_result_column(*from_column.get_sub_columns()[0], &nested_column);
1578
3.00k
                    to_column->add_sub_column(nested_column);
1579
3.00k
                }
1580
15.8k
            };
1581
1582
136k
    auto generate_index = [&](const std::string& pattern) {
1583
        // 1. find subcolumn's index
1584
12.8k
        if (const auto& indexes = schema.inverted_index_by_field_pattern(col_unique_id, pattern);
1585
12.8k
            !indexes.empty()) {
1586
5.31k
            for (const auto& index : indexes) {
1587
5.31k
                auto index_ptr = std::make_shared<TabletIndex>(*index);
1588
5.31k
                index_ptr->set_escaped_escaped_index_suffix_path(
1589
5.31k
                        sub_column_info->column.path_info_ptr()->get_path());
1590
5.31k
                sub_column_info->indexes.emplace_back(std::move(index_ptr));
1591
5.31k
            }
1592
5.25k
        }
1593
        // 2. find parent column's index
1594
7.58k
        else if (const auto parent_index = schema.inverted_indexs(col_unique_id);
1595
7.58k
                 !parent_index.empty()) {
1596
497
            inherit_index(parent_index, sub_column_info->indexes, sub_column_info->column);
1597
7.09k
        } else {
1598
7.09k
            sub_column_info->indexes.clear();
1599
7.09k
        }
1600
12.8k
    };
1601
1602
136k
    const auto& sub_columns = parent_column.get_sub_columns();
1603
212k
    for (const auto& sub_column : sub_columns) {
1604
212k
        const char* pattern = sub_column->name().c_str();
1605
212k
        switch (sub_column->pattern_type()) {
1606
5.36k
        case PatternTypePB::MATCH_NAME: {
1607
5.36k
            if (strcmp(pattern, path.c_str()) == 0) {
1608
1.45k
                generate_result_column(*sub_column, &sub_column_info->column);
1609
1.45k
                generate_index(sub_column->name());
1610
1.45k
                return true;
1611
1.45k
            }
1612
3.90k
            break;
1613
5.36k
        }
1614
206k
        case PatternTypePB::MATCH_NAME_GLOB: {
1615
206k
            if (glob_match_re2(pattern, path)) {
1616
11.3k
                generate_result_column(*sub_column, &sub_column_info->column);
1617
11.3k
                generate_index(sub_column->name());
1618
11.3k
                return true;
1619
11.3k
            }
1620
195k
            break;
1621
206k
        }
1622
195k
        default:
1623
0
            break;
1624
212k
        }
1625
212k
    }
1626
123k
    return false;
1627
136k
}
1628
1629
TabletSchemaSPtr VariantCompactionUtil::calculate_variant_extended_schema(
1630
1.41k
        const std::vector<RowsetSharedPtr>& rowsets, const TabletSchemaSPtr& base_schema) {
1631
1.41k
    if (rowsets.empty()) {
1632
0
        return nullptr;
1633
0
    }
1634
1635
1.41k
    std::vector<TabletSchemaSPtr> schemas;
1636
3.31k
    for (const auto& rs : rowsets) {
1637
3.31k
        if (rs->num_segments() == 0) {
1638
3.14k
            continue;
1639
3.14k
        }
1640
177
        const auto& tablet_schema = rs->tablet_schema();
1641
177
        SegmentCacheHandle segment_cache;
1642
177
        auto st = SegmentLoader::instance()->load_segments(std::static_pointer_cast<BetaRowset>(rs),
1643
177
                                                           &segment_cache);
1644
177
        if (!st.ok()) {
1645
0
            return base_schema;
1646
0
        }
1647
177
        for (const auto& segment : segment_cache.get_segments()) {
1648
177
            TabletSchemaSPtr schema = tablet_schema->copy_without_variant_extracted_columns();
1649
360
            for (const auto& column : tablet_schema->columns()) {
1650
360
                if (!column->is_variant_type()) {
1651
177
                    continue;
1652
177
                }
1653
183
                std::shared_ptr<ColumnReader> column_reader;
1654
183
                OlapReaderStatistics stats;
1655
183
                st = segment->get_column_reader(column->unique_id(), &column_reader, &stats);
1656
183
                if (!st.ok()) {
1657
0
                    LOG(WARNING) << "Failed to get column reader for column: " << column->name()
1658
0
                                 << " error: " << st.to_string();
1659
0
                    continue;
1660
0
                }
1661
183
                if (!column_reader) {
1662
0
                    continue;
1663
0
                }
1664
1665
183
                CHECK(column_reader->get_meta_type() == FieldType::OLAP_FIELD_TYPE_VARIANT);
1666
183
                auto* variant_column_reader =
1667
183
                        assert_cast<segment_v2::VariantColumnReader*>(column_reader.get());
1668
                // load external meta before getting subcolumn meta info
1669
183
                st = variant_column_reader->load_external_meta_once();
1670
183
                if (!st.ok()) {
1671
0
                    LOG(WARNING) << "Failed to load external meta for column: " << column->name()
1672
0
                                 << " error: " << st.to_string();
1673
0
                    continue;
1674
0
                }
1675
183
                const auto* subcolumn_meta_info = variant_column_reader->get_subcolumns_meta_info();
1676
515
                for (const auto& entry : *subcolumn_meta_info) {
1677
515
                    if (entry->path.empty()) {
1678
183
                        continue;
1679
183
                    }
1680
332
                    const std::string& column_name =
1681
332
                            column->name_lower_case() + "." + entry->path.get_path();
1682
332
                    const DataTypePtr& data_type = entry->data.file_column_type;
1683
332
                    PathInDataBuilder full_path_builder;
1684
332
                    auto full_path = full_path_builder.append(column->name_lower_case(), false)
1685
332
                                             .append(entry->path.get_parts(), false)
1686
332
                                             .build();
1687
332
                    TabletColumn subcolumn =
1688
332
                            get_column_by_type(data_type, column_name,
1689
332
                                               ExtraInfo {.unique_id = -1,
1690
332
                                                          .parent_unique_id = column->unique_id(),
1691
332
                                                          .path_info = full_path});
1692
332
                    schema->append_column(subcolumn);
1693
332
                }
1694
183
            }
1695
177
            schemas.emplace_back(schema);
1696
177
        }
1697
177
    }
1698
1.41k
    TabletSchemaSPtr least_common_schema;
1699
1.41k
    auto st = get_least_common_schema(schemas, base_schema, least_common_schema, false);
1700
1.41k
    if (!st.ok()) {
1701
0
        return base_schema;
1702
0
    }
1703
1.41k
    return least_common_schema;
1704
1.41k
}
1705
1706
bool inherit_index(const std::vector<const TabletIndex*>& parent_indexes,
1707
                   TabletIndexes& subcolumns_indexes, FieldType column_type,
1708
71.1k
                   const std::string& suffix_path, bool is_array_nested_type) {
1709
71.1k
    if (parent_indexes.empty()) {
1710
67.9k
        return false;
1711
67.9k
    }
1712
3.16k
    subcolumns_indexes.clear();
1713
    // bkd index or array index only need to inherit one index
1714
3.16k
    if (field_is_numeric_type(column_type) ||
1715
3.16k
        (is_array_nested_type &&
1716
1.67k
         (field_is_numeric_type(column_type) || field_is_slice_type(column_type)))) {
1717
1.67k
        auto index_ptr = std::make_shared<TabletIndex>(*parent_indexes[0]);
1718
1.67k
        index_ptr->set_escaped_escaped_index_suffix_path(suffix_path);
1719
        // no need parse for bkd index or array index
1720
1.67k
        index_ptr->remove_parser_and_analyzer();
1721
1.67k
        subcolumns_indexes.emplace_back(std::move(index_ptr));
1722
1.67k
        return true;
1723
1.67k
    }
1724
    // string type need to inherit all indexes
1725
1.48k
    else if (field_is_slice_type(column_type) && !is_array_nested_type) {
1726
1.51k
        for (const auto& index : parent_indexes) {
1727
1.51k
            auto index_ptr = std::make_shared<TabletIndex>(*index);
1728
1.51k
            index_ptr->set_escaped_escaped_index_suffix_path(suffix_path);
1729
1.51k
            subcolumns_indexes.emplace_back(std::move(index_ptr));
1730
1.51k
        }
1731
1.48k
        return true;
1732
1.48k
    }
1733
4
    return false;
1734
3.16k
}
1735
1736
bool inherit_index(const std::vector<const TabletIndex*>& parent_indexes,
1737
71.2k
                   TabletIndexes& subcolumns_indexes, const TabletColumn& column) {
1738
71.2k
    if (!column.is_extracted_column()) {
1739
3
        return false;
1740
3
    }
1741
71.2k
    if (column.is_array_type()) {
1742
1.10k
        if (column.get_sub_columns().empty()) {
1743
0
            return false;
1744
0
        }
1745
1.10k
        const TabletColumn* nested = column.get_sub_columns()[0].get();
1746
1.10k
        while (nested != nullptr && nested->is_array_type()) {
1747
0
            if (nested->get_sub_columns().empty()) {
1748
0
                return false;
1749
0
            }
1750
0
            nested = nested->get_sub_columns()[0].get();
1751
0
        }
1752
1.10k
        if (nested == nullptr) {
1753
0
            return false;
1754
0
        }
1755
1.10k
        return inherit_index(parent_indexes, subcolumns_indexes, nested->type(),
1756
1.10k
                             column.path_info_ptr()->get_path(), true);
1757
1.10k
    }
1758
70.1k
    return inherit_index(parent_indexes, subcolumns_indexes, column.type(),
1759
70.1k
                         column.path_info_ptr()->get_path());
1760
71.2k
}
1761
1762
bool inherit_index(const std::vector<const TabletIndex*>& parent_indexes,
1763
0
                   TabletIndexes& subcolumns_indexes, const ColumnMetaPB& column_pb) {
1764
0
    if (!column_pb.has_column_path_info()) {
1765
0
        return false;
1766
0
    }
1767
0
    if (column_pb.type() == (int)FieldType::OLAP_FIELD_TYPE_ARRAY) {
1768
0
        if (column_pb.children_columns_size() == 0) {
1769
0
            return false;
1770
0
        }
1771
0
        const ColumnMetaPB* nested = &column_pb.children_columns(0);
1772
0
        while (nested != nullptr && nested->type() == (int)FieldType::OLAP_FIELD_TYPE_ARRAY) {
1773
0
            if (nested->children_columns_size() == 0) {
1774
0
                return false;
1775
0
            }
1776
0
            nested = &nested->children_columns(0);
1777
0
        }
1778
0
        if (nested == nullptr) {
1779
0
            return false;
1780
0
        }
1781
0
        return inherit_index(parent_indexes, subcolumns_indexes, (FieldType)nested->type(),
1782
0
                             column_pb.column_path_info().path(), true);
1783
0
    }
1784
0
    return inherit_index(parent_indexes, subcolumns_indexes, (FieldType)column_pb.type(),
1785
0
                         column_pb.column_path_info().path());
1786
0
}
1787
1788
// ============ Implementation from parse2column.cpp ============
1789
1790
/** Pool for objects that cannot be used from different threads simultaneously.
1791
  * Allows to create an object for each thread.
1792
  * Pool has unbounded size and objects are not destroyed before destruction of pool.
1793
  *
1794
  * Use it in cases when thread local storage is not appropriate
1795
  *  (when maximum number of simultaneously used objects is less
1796
  *   than number of running/sleeping threads, that has ever used object,
1797
  *   and creation/destruction of objects is expensive).
1798
  */
1799
template <typename T>
1800
class SimpleObjectPool {
1801
protected:
1802
    /// Hold all available objects in stack.
1803
    std::mutex mutex;
1804
    std::stack<std::unique_ptr<T>> stack;
1805
    /// Specialized deleter for std::unique_ptr.
1806
    /// Returns underlying pointer back to stack thus reclaiming its ownership.
1807
    struct Deleter {
1808
        SimpleObjectPool<T>* parent;
1809
16.9k
        Deleter(SimpleObjectPool<T>* parent_ = nullptr) : parent {parent_} {} /// NOLINT
1810
16.9k
        void operator()(T* owning_ptr) const {
1811
16.9k
            std::lock_guard lock {parent->mutex};
1812
16.9k
            parent->stack.emplace(owning_ptr);
1813
16.9k
        }
1814
    };
1815
1816
public:
1817
    using Pointer = std::unique_ptr<T, Deleter>;
1818
    /// Extracts and returns a pointer from the stack if it's not empty,
1819
    ///  creates a new one by calling provided f() otherwise.
1820
    template <typename Factory>
1821
16.9k
    Pointer get(Factory&& f) {
1822
16.9k
        std::unique_lock lock(mutex);
1823
16.9k
        if (stack.empty()) {
1824
24
            return {f(), this};
1825
24
        }
1826
16.9k
        auto object = stack.top().release();
1827
16.9k
        stack.pop();
1828
16.9k
        return std::unique_ptr<T, Deleter>(object, Deleter(this));
1829
16.9k
    }
variant_util.cpp:_ZN5doris12variant_util16SimpleObjectPoolINS_14JSONDataParserINS_14SimdJSONParserEEEE3getIZNS0_21parse_json_to_variantERNS_7IColumnERKNS_9StringRefEPS4_RKNS_11ParseConfigEE3$_0EESt10unique_ptrIS4_NS5_7DeleterEEOT_
Line
Count
Source
1821
12.4k
    Pointer get(Factory&& f) {
1822
12.4k
        std::unique_lock lock(mutex);
1823
12.4k
        if (stack.empty()) {
1824
1
            return {f(), this};
1825
1
        }
1826
12.4k
        auto object = stack.top().release();
1827
12.4k
        stack.pop();
1828
12.4k
        return std::unique_ptr<T, Deleter>(object, Deleter(this));
1829
12.4k
    }
variant_util.cpp:_ZN5doris12variant_util16SimpleObjectPoolINS_14JSONDataParserINS_14SimdJSONParserEEEE3getIZNS0_21parse_json_to_variantERNS_7IColumnERKNS_9ColumnStrIjEERKNS_11ParseConfigEE3$_0EESt10unique_ptrIS4_NS5_7DeleterEEOT_
Line
Count
Source
1821
4.51k
    Pointer get(Factory&& f) {
1822
4.51k
        std::unique_lock lock(mutex);
1823
4.51k
        if (stack.empty()) {
1824
23
            return {f(), this};
1825
23
        }
1826
4.48k
        auto object = stack.top().release();
1827
4.48k
        stack.pop();
1828
4.48k
        return std::unique_ptr<T, Deleter>(object, Deleter(this));
1829
4.51k
    }
1830
    /// Like get(), but creates object using default constructor.
1831
    Pointer getDefault() {
1832
        return get([] { return new T; });
1833
    }
1834
};
1835
1836
SimpleObjectPool<JsonParser> parsers_pool;
1837
1838
using Node = typename ColumnVariant::Subcolumns::Node;
1839
1840
42.7M
static inline void append_binary_bytes(ColumnString::Chars& chars, const void* data, size_t size) {
1841
42.7M
    const auto old_size = chars.size();
1842
42.7M
    chars.resize(old_size + size);
1843
42.7M
    memcpy(chars.data() + old_size, reinterpret_cast<const char*>(data), size);
1844
42.7M
}
1845
1846
17.8M
static inline void append_binary_type(ColumnString::Chars& chars, FieldType type) {
1847
17.8M
    const uint8_t t = static_cast<uint8_t>(type);
1848
17.8M
    append_binary_bytes(chars, &t, sizeof(uint8_t));
1849
17.8M
}
1850
1851
10.7M
static inline void append_binary_sizet(ColumnString::Chars& chars, size_t v) {
1852
10.7M
    append_binary_bytes(chars, &v, sizeof(size_t));
1853
10.7M
}
1854
1855
17.8M
static void append_field_to_binary_chars(const Field& field, ColumnString::Chars& chars) {
1856
17.8M
    switch (field.get_type()) {
1857
14
    case PrimitiveType::TYPE_NULL: {
1858
14
        append_binary_type(chars, FieldType::OLAP_FIELD_TYPE_NONE);
1859
14
        return;
1860
0
    }
1861
261k
    case PrimitiveType::TYPE_BOOLEAN: {
1862
261k
        append_binary_type(chars,
1863
261k
                           TabletColumn::get_field_type_by_type(PrimitiveType::TYPE_BOOLEAN));
1864
261k
        const auto v = static_cast<UInt8>(field.get<PrimitiveType::TYPE_BOOLEAN>());
1865
261k
        append_binary_bytes(chars, &v, sizeof(UInt8));
1866
261k
        return;
1867
0
    }
1868
4.52M
    case PrimitiveType::TYPE_BIGINT: {
1869
4.52M
        append_binary_type(chars, TabletColumn::get_field_type_by_type(PrimitiveType::TYPE_BIGINT));
1870
4.52M
        const auto v = field.get<PrimitiveType::TYPE_BIGINT>();
1871
4.52M
        append_binary_bytes(chars, &v, sizeof(Int64));
1872
4.52M
        return;
1873
0
    }
1874
9
    case PrimitiveType::TYPE_LARGEINT: {
1875
9
        append_binary_type(chars,
1876
9
                           TabletColumn::get_field_type_by_type(PrimitiveType::TYPE_LARGEINT));
1877
9
        const auto v = field.get<PrimitiveType::TYPE_LARGEINT>();
1878
9
        append_binary_bytes(chars, &v, sizeof(int128_t));
1879
9
        return;
1880
0
    }
1881
2.75M
    case PrimitiveType::TYPE_DOUBLE: {
1882
2.75M
        append_binary_type(chars, TabletColumn::get_field_type_by_type(PrimitiveType::TYPE_DOUBLE));
1883
2.75M
        const auto v = field.get<PrimitiveType::TYPE_DOUBLE>();
1884
2.75M
        append_binary_bytes(chars, &v, sizeof(Float64));
1885
2.75M
        return;
1886
0
    }
1887
10.2M
    case PrimitiveType::TYPE_STRING: {
1888
10.2M
        append_binary_type(chars, FieldType::OLAP_FIELD_TYPE_STRING);
1889
10.2M
        const auto& v = field.get<PrimitiveType::TYPE_STRING>();
1890
10.2M
        append_binary_sizet(chars, v.size());
1891
10.2M
        append_binary_bytes(chars, v.data(), v.size());
1892
10.2M
        return;
1893
0
    }
1894
46.7k
    case PrimitiveType::TYPE_JSONB: {
1895
46.7k
        append_binary_type(chars, FieldType::OLAP_FIELD_TYPE_JSONB);
1896
46.7k
        const auto& v = field.get<PrimitiveType::TYPE_JSONB>();
1897
46.7k
        append_binary_sizet(chars, v.get_size());
1898
46.7k
        append_binary_bytes(chars, v.get_value(), v.get_size());
1899
46.7k
        return;
1900
0
    }
1901
529k
    case PrimitiveType::TYPE_ARRAY: {
1902
529k
        append_binary_type(chars, FieldType::OLAP_FIELD_TYPE_ARRAY);
1903
529k
        const auto& a = field.get<PrimitiveType::TYPE_ARRAY>();
1904
529k
        append_binary_sizet(chars, a.size());
1905
787k
        for (const auto& elem : a) {
1906
787k
            append_field_to_binary_chars(elem, chars);
1907
787k
        }
1908
529k
        return;
1909
0
    }
1910
0
    default:
1911
0
        throw doris::Exception(ErrorCode::INVALID_ARGUMENT, "Unsupported field type {}",
1912
0
                               field.get_type());
1913
17.8M
    }
1914
17.8M
}
1915
template <typename ParserImpl>
1916
void parse_json_to_variant_impl(IColumn& column, const char* src, size_t length,
1917
1.35M
                                JSONDataParser<ParserImpl>* parser, const ParseConfig& config) {
1918
1.35M
    auto& column_variant = assert_cast<ColumnVariant&>(column);
1919
1.35M
    std::optional<ParseResult> result;
1920
    /// Treat empty string as an empty object
1921
    /// for better CAST from String to Object.
1922
1.35M
    if (length > 0) {
1923
1.35M
        result = parser->parse(src, length, config);
1924
1.35M
    } else {
1925
2.33k
        result = ParseResult {};
1926
2.33k
    }
1927
1.35M
    if (!result) {
1928
663
        VLOG_DEBUG << "failed to parse " << std::string_view(src, length) << ", length= " << length;
1929
663
        if (config::variant_throw_exeception_on_invalid_json) {
1930
0
            throw doris::Exception(ErrorCode::INVALID_ARGUMENT, "Failed to parse object {}",
1931
0
                                   std::string_view(src, length));
1932
0
        }
1933
        // Treat as string
1934
663
        PathInData root_path;
1935
663
        Field field = Field::create_field<TYPE_STRING>(String(src, length));
1936
663
        result = ParseResult {{root_path}, {field}};
1937
663
    }
1938
1.35M
    auto& [paths, values] = *result;
1939
1.35M
    assert(paths.size() == values.size());
1940
1.35M
    size_t old_num_rows = column_variant.rows();
1941
1.35M
    if (config.deprecated_enable_flatten_nested) {
1942
        // here we should check the paths in variant and paths in result,
1943
        // if two paths which same prefix have different structure, we should throw an exception
1944
3.02k
        std::vector<PathInData> check_paths;
1945
12.0k
        for (const auto& entry : column_variant.get_subcolumns()) {
1946
12.0k
            check_paths.push_back(entry->path);
1947
12.0k
        }
1948
3.02k
        check_paths.insert(check_paths.end(), paths.begin(), paths.end());
1949
3.02k
        THROW_IF_ERROR(check_variant_has_no_ambiguous_paths(check_paths));
1950
3.02k
    }
1951
1.35M
    auto [doc_value_data_paths, doc_value_data_values] =
1952
1.35M
            column_variant.get_doc_value_data_paths_and_values();
1953
1.35M
    auto& doc_value_data_offsets = column_variant.serialized_doc_value_column_offsets();
1954
1955
1.41M
    auto flush_defaults = [](ColumnVariant::Subcolumn* subcolumn) {
1956
1.41M
        const auto num_defaults = subcolumn->cur_num_of_defaults();
1957
1.41M
        if (num_defaults > 0) {
1958
165k
            subcolumn->insert_many_defaults(num_defaults);
1959
165k
            subcolumn->reset_current_num_of_defaults();
1960
165k
        }
1961
1.41M
    };
1962
1963
1.35M
    auto is_plain_path = [](const PathInData& path) {
1964
13
        for (const auto& part : path.get_parts()) {
1965
13
            if (part.is_nested || part.anonymous_array_level != 0) {
1966
0
                return false;
1967
0
            }
1968
13
        }
1969
9
        return true;
1970
9
    };
1971
1972
1.35M
    auto get_or_create_subcolumn = [&](const PathInData& path, size_t index_hint,
1973
1.41M
                                       const FieldInfo& field_info) -> ColumnVariant::Subcolumn* {
1974
1.41M
        auto* subcolumn = column_variant.get_subcolumn(path, index_hint);
1975
1.41M
        if (subcolumn == nullptr) {
1976
3.68k
            if (path.has_nested_part()) {
1977
17
                column_variant.add_nested_subcolumn(path, field_info, old_num_rows);
1978
3.66k
            } else {
1979
3.66k
                column_variant.add_sub_column(path, old_num_rows);
1980
3.66k
            }
1981
3.68k
            subcolumn = column_variant.get_subcolumn(path, index_hint);
1982
3.68k
        }
1983
1.41M
        if (!subcolumn) {
1984
0
            throw doris::Exception(ErrorCode::INVALID_ARGUMENT, "Failed to find sub column {}",
1985
0
                                   path.get_path());
1986
0
        }
1987
1.41M
        return subcolumn;
1988
1.41M
    };
1989
1990
1.41M
    auto normalize_plain_path = [&](const PathInData& path) {
1991
1.41M
        if (!config.check_duplicate_json_path || path.empty() || !is_plain_path(path)) {
1992
1.41M
            return path;
1993
1.41M
        }
1994
9
        return PathInData(path.get_path());
1995
1.41M
    };
1996
1997
1.35M
    auto insert_into_subcolumn = [&](size_t i,
1998
1.41M
                                     bool check_size_mismatch) -> ColumnVariant::Subcolumn* {
1999
1.41M
        FieldInfo field_info;
2000
1.41M
        get_field_info(values[i], &field_info);
2001
1.41M
        if (field_info.scalar_type_id == PrimitiveType::INVALID_TYPE) {
2002
104
            return nullptr;
2003
104
        }
2004
1.41M
        auto path = normalize_plain_path(paths[i]);
2005
1.41M
        auto* subcolumn = get_or_create_subcolumn(path, i, field_info);
2006
1.41M
        flush_defaults(subcolumn);
2007
1.41M
        if (check_size_mismatch && subcolumn->size() != old_num_rows) {
2008
1
            throw doris::Exception(ErrorCode::INVALID_ARGUMENT,
2009
1
                                   "subcolumn {} size missmatched, may contains duplicated entry",
2010
1
                                   path.get_path());
2011
1
        }
2012
1.41M
        subcolumn->insert(std::move(values[i]), std::move(field_info));
2013
1.41M
        return subcolumn;
2014
1.41M
    };
2015
2016
1.35M
    switch (config.parse_to) {
2017
82.1k
    case ParseConfig::ParseTo::OnlySubcolumns:
2018
1.50M
        for (size_t i = 0; i < paths.size(); ++i) {
2019
1.41M
            insert_into_subcolumn(i, true);
2020
1.41M
        }
2021
82.1k
        break;
2022
1.27M
    case ParseConfig::ParseTo::OnlyDocValueColumn: {
2023
1.27M
        std::vector<size_t> doc_item_indexes;
2024
1.27M
        doc_item_indexes.reserve(paths.size());
2025
1.27M
        phmap::flat_hash_set<StringRef, StringRefHash> seen_paths;
2026
1.27M
        seen_paths.reserve(paths.size());
2027
2028
19.1M
        for (size_t i = 0; i < paths.size(); ++i) {
2029
17.8M
            FieldInfo field_info;
2030
17.8M
            get_field_info(values[i], &field_info);
2031
17.8M
            if (paths[i].empty()) {
2032
                // Plain non-doc VARIANT can use doc-value KV as writer-side staging. An
2033
                // invalid root entry from JSON object/array is neither a scalar root value nor
2034
                // a doc KV path, so leave this row's doc offset empty. Doc-mode and valid scalar
2035
                // roots still populate the root subcolumn below.
2036
798
                if (!column_variant.enable_doc_mode() &&
2037
798
                    field_info.scalar_type_id == PrimitiveType::INVALID_TYPE) {
2038
3
                    continue;
2039
3
                }
2040
795
                auto* subcolumn = column_variant.get_subcolumn(paths[i]);
2041
795
                DCHECK(subcolumn != nullptr);
2042
795
                flush_defaults(subcolumn);
2043
795
                subcolumn->insert(std::move(values[i]), std::move(field_info));
2044
795
                continue;
2045
798
            }
2046
17.8M
            if (field_info.scalar_type_id == PrimitiveType::INVALID_TYPE ||
2047
17.8M
                values[i].get_type() == PrimitiveType::TYPE_NULL) {
2048
116k
                continue;
2049
116k
            }
2050
17.7M
            const auto& path_str = paths[i].get_path();
2051
17.7M
            StringRef path_ref {path_str.data(), path_str.size()};
2052
17.7M
            if (UNLIKELY(!seen_paths.emplace(path_ref).second)) {
2053
2
                throw doris::Exception(ErrorCode::INVALID_ARGUMENT,
2054
2
                                       "may contains duplicated entry : {}",
2055
2
                                       std::string_view(path_str));
2056
2
            }
2057
17.7M
            doc_item_indexes.push_back(i);
2058
17.7M
        }
2059
2060
1.27M
        std::sort(doc_item_indexes.begin(), doc_item_indexes.end(),
2061
71.2M
                  [&](size_t l, size_t r) { return paths[l].get_path() < paths[r].get_path(); });
2062
15.9M
        for (const auto idx : doc_item_indexes) {
2063
15.9M
            const auto& path_str = paths[idx].get_path();
2064
15.9M
            doc_value_data_paths->insert_data(path_str.data(), path_str.size());
2065
15.9M
            auto& chars = doc_value_data_values->get_chars();
2066
15.9M
            append_field_to_binary_chars(values[idx], chars);
2067
15.9M
            doc_value_data_values->get_offsets().push_back(chars.size());
2068
15.9M
        }
2069
1.27M
    } break;
2070
1.35M
    }
2071
1.35M
    doc_value_data_offsets.push_back(doc_value_data_paths->size());
2072
    // /// Insert default values to missed subcolumns.
2073
1.35M
    const auto& subcolumns = column_variant.get_subcolumns();
2074
5.54M
    for (const auto& entry : subcolumns) {
2075
5.54M
        if (entry->data.size() == old_num_rows) {
2076
            // Handle nested paths differently from simple paths
2077
4.12M
            if (entry->path.has_nested_part()) {
2078
                // Try to insert default from nested, if failed, insert regular default
2079
0
                bool success = UNLIKELY(column_variant.try_insert_default_from_nested(entry));
2080
0
                if (!success) {
2081
0
                    entry->data.insert_default();
2082
0
                }
2083
4.12M
            } else {
2084
                // For non-nested paths, increment default counter
2085
4.12M
                entry->data.increment_default_counter();
2086
4.12M
            }
2087
4.12M
        }
2088
5.54M
    }
2089
1.35M
    column_variant.incr_num_rows();
2090
1.35M
    if (column_variant.get_sparse_column()->size() == old_num_rows) {
2091
1.35M
        column_variant.get_sparse_column_mutable().insert_default();
2092
1.35M
    }
2093
1.35M
#ifndef NDEBUG
2094
1.35M
    column_variant.check_consistency();
2095
1.35M
#endif
2096
1.35M
}
2097
2098
// exposed interfaces
2099
// Parses one JSON document into `column`.
//
// @param column  destination column, forwarded to parse_json_to_variant_impl.
// @param json    JSON text; its data pointer and size are forwarded unchanged.
// @param parser  optional parser to use; when null a parser is taken from `parsers_pool`.
// @param config  parse options, forwarded unchanged.
void parse_json_to_variant(IColumn& column, const StringRef& json, JsonParser* parser,
                           const ParseConfig& config) {
    // A caller-supplied parser is used directly; the pool is not touched.
    if (parser != nullptr) {
        parse_json_to_variant_impl(column, json.data, json.size, parser, config);
        return;
    }

    // No parser supplied: take one from the pool and hold it for the duration of this call.
    auto borrowed_parser = parsers_pool.get([] { return new JsonParser(); });
    parse_json_to_variant_impl(column, json.data, json.size, borrowed_parser.get(), config);
}
2108
2109
// Parses every row of `raw_json_column` into `column`, then finalizes `column`.
//
// A single parser taken from `parsers_pool` is reused for all rows.
//
// @param column           destination column; `finalize()` is called on it after the last row.
// @param raw_json_column  string column whose rows are parsed in order.
// @param config           parse options, forwarded unchanged for every row.
void parse_json_to_variant(IColumn& column, const ColumnString& raw_json_column,
                           const ParseConfig& config) {
    auto pooled_parser = parsers_pool.get([] { return new JsonParser(); });
    auto* json_parser = pooled_parser.get();

    for (size_t row = 0; row < raw_json_column.size(); ++row) {
        const StringRef document = raw_json_column.get_data_at(row);
        parse_json_to_variant_impl(column, document.data, document.size, json_parser, config);
    }

    column.finalize();
}
2118
2119
// parse the doc snapshot column to subcolumns
2120
0
void materialize_docs_to_subcolumns(ColumnVariant& column_variant) {
2121
0
    auto subcolumns = materialize_docs_to_subcolumns_map(column_variant);
2122
2123
0
    for (auto& entry : subcolumns) {
2124
0
        entry.second.finalize();
2125
0
        if (!column_variant.add_sub_column(PathInData(entry.first),
2126
0
                                           IColumn::mutate(entry.second.get_finalized_column_ptr()),
2127
0
                                           entry.second.get_least_common_type())) {
2128
0
            throw doris::Exception(ErrorCode::INTERNAL_ERROR,
2129
0
                                   "Failed to add subcolumn {}, which is from doc snapshot column",
2130
0
                                   entry.first);
2131
0
        }
2132
0
    }
2133
2134
0
    column_variant.finalize();
2135
0
}
2136
2137
// ============ Implementation from variant_util.cpp ============
2138
2139
// Builds a map from path to Subcolumn out of the variant's doc-value key/value data.
//
// For each row, the entries in [offsets[row - 1], offsets[row]) are read; each key names a path
// and the matching value is deserialized into that path's subcolumn. Rows in which a path does
// not appear are filled with defaults so that every returned subcolumn has one entry per row.
//
// @param variant                source variant column.
// @param expected_unique_paths  reserve hint for the map; when 0, the key count capped at
//                               kInitialPathReserve is used instead.
// @return map keyed by string_views built from the data returned by the key column's
//         get_data_at().
//         NOTE(review): the keys presumably stay valid only while `variant`'s key column is alive
//         and unmodified — confirm.
phmap::flat_hash_map<std::string_view, ColumnVariant::Subcolumn> materialize_docs_to_subcolumns_map(
        const ColumnVariant& variant, size_t expected_unique_paths) {
    constexpr size_t kInitialPathReserve = 8192;
    phmap::flat_hash_map<std::string_view, ColumnVariant::Subcolumn> result;

    const auto [path_column, value_column] = variant.get_doc_value_data_paths_and_values();
    const auto& row_offsets = variant.serialized_doc_value_column_offsets();
    const size_t num_rows = row_offsets.size();

    DCHECK_EQ(num_rows, variant.size()) << "doc snapshot offsets size mismatch with variant rows";

    size_t reserve_hint = expected_unique_paths;
    if (reserve_hint == 0) {
        reserve_hint = std::min<size_t>(path_column->size(), kInitialPathReserve);
    }
    result.reserve(reserve_hint);

    for (size_t row = 0; row < num_rows; ++row) {
        // NOTE(review): at row 0 this indexes with (size_t)-1; it presumably relies on the
        // offsets container accepting index -1 (padded offsets array) — confirm.
        const size_t first_entry = row_offsets[row - 1];
        const size_t last_entry = row_offsets[row];

        for (size_t entry_idx = first_entry; entry_idx < last_entry; ++entry_idx) {
            const auto& raw_key = path_column->get_data_at(entry_idx);
            const std::string_view path_view(raw_key.data, raw_key.size);

            auto [slot, is_new] =
                    result.try_emplace(path_view, ColumnVariant::Subcolumn {0, true, false});
            auto& target = slot->second;

            // Pad with defaults up to the current row: a new path needs `row` of them, an
            // existing path needs however many rows it has skipped since it last appeared.
            const size_t missing = is_new ? row : row - target.size();
            if (is_new || missing != 0) {
                target.insert_many_defaults(missing);
            }

            target.deserialize_from_binary_column(value_column, entry_idx);
        }
    }

    // Paths absent from the trailing rows still need defaults up to num_rows.
    for (auto& path_and_subcolumn : result) {
        auto& target = path_and_subcolumn.second;
        const size_t filled = target.size();
        if (filled == num_rows) {
            continue;
        }
        target.insert_many_defaults(num_rows - filled);
    }

    return result;
}
2181
2182
// Parses the scalar variant columns of `block` in place.
//
// For each position in `variant_pos` the column (unwrapped from ColumnNullable if needed) is
// finalized. Columns that are not scalar variants are treated as already parsed and left
// untouched. For scalar variants, a string root (or a JSONB root converted to JSON strings) is
// parsed into a fresh ColumnVariant using `configs[i]`; any other root type is coerced to the
// nullable most-common type. The result, re-wrapped with the original null map when the input
// was nullable, is written back to the same block position.
//
// @param block        block whose columns are replaced.
// @param variant_pos  block positions of the variant columns.
// @param configs      parse configuration per entry of `variant_pos` (indexed in parallel).
// @return Status::OK(); errors are not reported through the return value here.
Status _parse_and_materialize_variant_columns(Block& block,
                                              const std::vector<uint32_t>& variant_pos,
                                              const std::vector<ParseConfig>& configs) {
    for (size_t idx = 0; idx < variant_pos.size(); ++idx) {
        const uint32_t pos = variant_pos[idx];

        auto source_column = block.get_by_position(pos).column;
        const bool is_nullable = source_column->is_nullable();
        MutableColumnPtr owner_column = std::move(*source_column).mutate();

        // Peel off the nullable wrapper, remembering its null map for re-wrapping later.
        ColumnPtr nullable_null_map;
        MutableColumnPtr var_column;
        if (!is_nullable) {
            var_column = std::move(owner_column);
        } else {
            const auto& nullable = assert_cast<const ColumnNullable&>(*owner_column);
            nullable_null_map = nullable.get_null_map_column_ptr();
            var_column = std::move(*nullable.get_nested_column_ptr()).mutate();
        }

        auto& var = assert_cast<ColumnVariant&>(*var_column);
        var_column->finalize();

        // Anything that is not a scalar variant has already been parsed.
        if (!var.is_scalar_variant()) {
            continue;
        }

        VLOG_DEBUG << "parse scalar variant column: " << var.get_root_type()->get_name();

        // Pick the column holding the raw payload: JSONB roots are converted to JSON strings,
        // other roots are used directly (without their nullable wrapper).
        ColumnPtr scalar_root_column;
        if (var.get_root_type()->get_primitive_type() == TYPE_JSONB) {
            scalar_root_column = jsonb_root_to_json_string_column(*var.get_root());
        } else {
            const auto& root = *var.get_root();
            if (root.is_nullable()) {
                scalar_root_column =
                        assert_cast<const ColumnNullable&>(root).get_nested_column_ptr();
            } else {
                scalar_root_column = var.get_root();
            }
        }

        MutableColumnPtr variant_column;
        if (scalar_root_column->is_column_string()) {
            variant_column = ColumnVariant::create(0, var.enable_doc_mode());
            parse_json_to_variant(*variant_column,
                                  assert_cast<const ColumnString&>(*scalar_root_column),
                                  configs[idx]);
        } else {
            // The root may be a non-string type such as ColumnVariant(Int32); in that case the
            // root is coerced to the nullable most-common type instead of being parsed.
            auto expected_root_type =
                    make_nullable(std::make_shared<ColumnVariant::MostCommonType>());
            var.ensure_root_node_type(expected_root_type);
            variant_column = std::move(var_column);
        }

        // Restore the nullable wrapper around the parsed variant when the input had one.
        ColumnPtr result = variant_column->get_ptr();
        if (is_nullable) {
            result = ColumnNullable::create(result, nullable_null_map);
        }
        block.get_by_position(pos).column = result;
    }
    return Status::OK();
}
2242
2243
// Runs _parse_and_materialize_variant_columns under RETURN_IF_CATCH_EXCEPTION.
//
// NOTE(review): the macro presumably converts exceptions thrown by the wrapped call into an
// error Status returned from this function — confirm against the macro definition.
//
// @param block        block whose variant columns are parsed in place.
// @param variant_pos  block positions of the variant columns.
// @param configs      parse configuration per entry of `variant_pos`.
// @return the Status produced by the wrapped call when it completes without throwing.
Status parse_and_materialize_variant_columns(Block& block, const std::vector<uint32_t>& variant_pos,
                                             const std::vector<ParseConfig>& configs) {
    Status status;
    RETURN_IF_CATCH_EXCEPTION(
            status = _parse_and_materialize_variant_columns(block, variant_pos, configs));
    return status;
}
2248
2249
namespace {
2250
2251
ParseConfig::ParseTo select_storage_variant_parse_target(const TabletColumn& column,
2252
5.16k
                                                         const ParseConfig& config) {
2253
    // NestedGroup consumes the parse-time subcolumn tree to build nested storage structures, so it
2254
    // must not go through doc-value staging.
2255
5.16k
    if (column.variant_enable_nested_group()) {
2256
4
        return ParseConfig::ParseTo::OnlySubcolumns;
2257
4
    }
2258
2259
    // Persistent doc mode owns doc-value bucket columns in VariantDocWriter. Keep it separate from
2260
    // the plain non-doc staging optimization, even when typed paths or parent indexes exist.
2261
5.15k
    if (column.variant_enable_doc_mode()) {
2262
1.88k
        return ParseConfig::ParseTo::OnlyDocValueColumn;
2263
1.88k
    }
2264
2265
    // Deprecated flatten-nested still consumes parse-time subcolumns. Predefined typed paths and
2266
    // parent inverted indexes are handled later by regular doc-value staging: typed paths are
2267
    // forced into the materialized set unless typed-to-sparse is enabled, and materialized dynamic
2268
    // subcolumns inherit parent indexes while sparse payloads stay unindexed.
2269
3.27k
    if (config.deprecated_enable_flatten_nested) {
2270
26
        return ParseConfig::ParseTo::OnlySubcolumns;
2271
26
    }
2272
2273
    // Plain dynamic non-doc VARIANT can avoid eagerly creating thousands of parse-time subcolumns.
2274
    // The segment writer will pick the materialized/sparse split from this doc-value KV staging.
2275
    // Keep a BE switch so tests and rollouts can compare the old parse-time path with staging under
2276
    // the same writer and schema.
2277
3.24k
    switch (config::variant_storage_parse_mode) {
2278
3.34k
    case 0:
2279
3.34k
    case 2:
2280
3.34k
        return ParseConfig::ParseTo::OnlyDocValueColumn;
2281
2
    case 1:
2282
2
        return ParseConfig::ParseTo::OnlySubcolumns;
2283
0
    default:
2284
0
        CHECK(false) << "invalid variant_storage_parse_mode: "
2285
0
                     << config::variant_storage_parse_mode;
2286
0
        return ParseConfig::ParseTo::OnlyDocValueColumn;
2287
3.24k
    }
2288
3.24k
}
2289
2290
} // namespace
2291
2292
// Parses all VARIANT columns of `block`, deriving the parse configuration from `tablet_schema`.
//
// `column_pos[i]` is the tablet-schema index of the i-th column of `block`. Two parallel lists
// are built for the variant columns: their positions in the block (used to read and replace the
// column) and their positions in the schema (used to look up the column definition that drives
// the parse target).
//
// @param block          block whose variant columns are parsed in place.
// @param tablet_schema  schema supplying column definitions and the flatten-nested switch.
// @param column_pos     schema index for each block position.
// @return OK when there is nothing to parse or parsing succeeds; InternalError if a recorded
//         schema position is not a variant column; otherwise the error from the parse step.
Status parse_and_materialize_variant_columns(Block& block, const TabletSchema& tablet_schema,
                                             const std::vector<uint32_t>& column_pos) {
    std::vector<uint32_t> variant_column_pos;
    std::vector<uint32_t> variant_schema_pos;
    variant_column_pos.reserve(column_pos.size());
    variant_schema_pos.reserve(column_pos.size());
    for (size_t block_pos = 0; block_pos < column_pos.size(); ++block_pos) {
        const uint32_t schema_pos = column_pos[block_pos];
        const auto& column = tablet_schema.column(schema_pos);
        if (column.is_variant_type()) {
            // The block is addressed by block position, the schema by schema position; the two
            // only coincide when column_pos is the identity mapping.
            variant_column_pos.push_back(static_cast<uint32_t>(block_pos));
            variant_schema_pos.push_back(schema_pos);
        }
    }

    if (variant_column_pos.empty()) {
        return Status::OK();
    }

    std::vector<ParseConfig> configs(variant_column_pos.size());
    for (size_t i = 0; i < variant_column_pos.size(); ++i) {
        // Deprecated legacy flatten-nested switch. Distinct from variant_enable_nested_group.
        configs[i].deprecated_enable_flatten_nested =
                tablet_schema.deprecated_variant_flatten_nested();
        configs[i].check_duplicate_json_path = config::variant_enable_duplicate_json_path_check;
        const auto& column = tablet_schema.column(variant_schema_pos[i]);
        if (!column.is_variant_type()) {
            return Status::InternalError("column is not variant type, column name: {}",
                                         column.name());
        }
        configs[i].parse_to = select_storage_variant_parse_target(column, configs[i]);
    }

    RETURN_IF_ERROR(parse_and_materialize_variant_columns(block, variant_column_pos, configs));
    return Status::OK();
}
2328
2329
} // namespace doris::variant_util