Coverage Report

Created: 2026-05-09 04:14

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
be/src/exec/common/variant_util.cpp
Line
Count
Source
1
// Licensed to the Apache Software Foundation (ASF) under one
2
// or more contributor license agreements.  See the NOTICE file
3
// distributed with this work for additional information
4
// regarding copyright ownership.  The ASF licenses this file
5
// to you under the Apache License, Version 2.0 (the
6
// "License"); you may not use this file except in compliance
7
// with the License.  You may obtain a copy of the License at
8
//
9
//   http://www.apache.org/licenses/LICENSE-2.0
10
//
11
// Unless required by applicable law or agreed to in writing,
12
// software distributed under the License is distributed on an
13
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14
// KIND, either express or implied.  See the License for the
15
// specific language governing permissions and limitations
16
// under the License.
17
18
#include "exec/common/variant_util.h"
19
20
#include <assert.h>
21
#include <fmt/format.h>
22
#include <gen_cpp/FrontendService.h>
23
#include <gen_cpp/FrontendService_types.h>
24
#include <gen_cpp/HeartbeatService_types.h>
25
#include <gen_cpp/MasterService_types.h>
26
#include <gen_cpp/Status_types.h>
27
#include <gen_cpp/Types_types.h>
28
#include <glog/logging.h>
29
#include <rapidjson/document.h>
30
#include <rapidjson/stringbuffer.h>
31
#include <rapidjson/writer.h>
32
#include <simdjson/simdjson.h> // IWYU pragma: keep
33
#include <unicode/uchar.h>
34
35
#include <algorithm>
36
#include <cassert>
37
#include <cstddef>
38
#include <cstdint>
39
#include <cstring>
40
#include <list>
41
#include <memory>
42
#include <mutex>
43
#include <optional>
44
#include <ostream>
45
#include <ranges>
46
#include <set>
47
#include <stack>
48
#include <string>
49
#include <string_view>
50
#include <unordered_map>
51
#include <utility>
52
#include <vector>
53
54
#include "common/config.h"
55
#include "common/status.h"
56
#include "core/assert_cast.h"
57
#include "core/block/block.h"
58
#include "core/block/column_numbers.h"
59
#include "core/block/column_with_type_and_name.h"
60
#include "core/column/column.h"
61
#include "core/column/column_array.h"
62
#include "core/column/column_map.h"
63
#include "core/column/column_nullable.h"
64
#include "core/column/column_string.h"
65
#include "core/column/column_variant.h"
66
#include "core/data_type/data_type.h"
67
#include "core/data_type/data_type_array.h"
68
#include "core/data_type/data_type_factory.hpp"
69
#include "core/data_type/data_type_jsonb.h"
70
#include "core/data_type/data_type_nullable.h"
71
#include "core/data_type/data_type_string.h"
72
#include "core/data_type/data_type_variant.h"
73
#include "core/data_type/define_primitive_type.h"
74
#include "core/data_type/get_least_supertype.h"
75
#include "core/data_type/primitive_type.h"
76
#include "core/field.h"
77
#include "core/typeid_cast.h"
78
#include "core/types.h"
79
#include "exec/common/field_visitors.h"
80
#include "exec/common/sip_hash.h"
81
#include "exprs/function/function.h"
82
#include "exprs/function/simple_function_factory.h"
83
#include "exprs/function_context.h"
84
#include "exprs/json_functions.h"
85
#include "re2/re2.h"
86
#include "runtime/exec_env.h"
87
#include "runtime/runtime_state.h"
88
#include "storage/olap_common.h"
89
#include "storage/rowset/beta_rowset.h"
90
#include "storage/rowset/rowset.h"
91
#include "storage/rowset/rowset_fwd.h"
92
#include "storage/segment/segment_loader.h"
93
#include "storage/segment/variant/nested_group_path.h"
94
#include "storage/segment/variant/variant_column_reader.h"
95
#include "storage/segment/variant/variant_column_writer_impl.h"
96
#include "storage/tablet/tablet.h"
97
#include "storage/tablet/tablet_fwd.h"
98
#include "storage/tablet/tablet_schema.h"
99
#include "util/client_cache.h"
100
#include "util/defer_op.h"
101
#include "util/json/json_parser.h"
102
#include "util/json/path_in_data.h"
103
#include "util/json/simd_json_parser.h"
104
105
namespace doris::variant_util {
106
107
2.73k
// Appends `ch` to `*regex_output` as a regex literal: characters that carry a
// special meaning in regex syntax are prefixed with a backslash, every other
// character is appended unchanged.
inline void append_escaped_regex_char(std::string* regex_output, char ch) {
    // Exactly the characters that need a leading backslash.
    static constexpr std::string_view kRegexMetaChars = ".^$+*?()|{}[]\\";
    if (kRegexMetaChars.find(ch) != std::string_view::npos) {
        regex_output->push_back('\\');
    }
    regex_output->push_back(ch);
}
131
132
// Small LRU to cap compiled glob patterns.
// Maximum number of compiled patterns kept; get_or_build_re2() drops the entry
// at the back of the recency list once the map grows beyond this size.
constexpr size_t kGlobRegexCacheCapacity = 256;

// One cache slot: the compiled regex plus the position of its glob pattern in
// the recency list, so the pattern can be moved to the front on a hit.
struct GlobRegexCacheEntry {
    std::shared_ptr<RE2> re2;
    std::list<std::string>::iterator lru_it;
};

// Held by get_or_build_re2() around every access to the list and map below.
static std::mutex g_glob_regex_cache_mutex;
// Glob patterns ordered by recency: front is the most recently used.
static std::list<std::string> g_glob_regex_cache_lru;
// Glob pattern -> compiled regex and its node in g_glob_regex_cache_lru.
static std::unordered_map<std::string, GlobRegexCacheEntry> g_glob_regex_cache;
143
144
150k
std::shared_ptr<RE2> get_or_build_re2(const std::string& glob_pattern) {
145
150k
    {
146
150k
        std::lock_guard<std::mutex> lock(g_glob_regex_cache_mutex);
147
150k
        auto it = g_glob_regex_cache.find(glob_pattern);
148
150k
        if (it != g_glob_regex_cache.end()) {
149
150k
            g_glob_regex_cache_lru.splice(g_glob_regex_cache_lru.begin(), g_glob_regex_cache_lru,
150
150k
                                          it->second.lru_it);
151
150k
            return it->second.re2;
152
150k
        }
153
150k
    }
154
203
    std::string regex_pattern;
155
203
    Status st = glob_to_regex(glob_pattern, &regex_pattern);
156
203
    if (!st.ok()) {
157
2
        return nullptr;
158
2
    }
159
201
    auto compiled = std::make_shared<RE2>(regex_pattern);
160
201
    if (!compiled->ok()) {
161
3
        return nullptr;
162
3
    }
163
198
    {
164
198
        std::lock_guard<std::mutex> lock(g_glob_regex_cache_mutex);
165
198
        auto it = g_glob_regex_cache.find(glob_pattern);
166
198
        if (it != g_glob_regex_cache.end()) {
167
0
            g_glob_regex_cache_lru.splice(g_glob_regex_cache_lru.begin(), g_glob_regex_cache_lru,
168
0
                                          it->second.lru_it);
169
0
            return it->second.re2;
170
0
        }
171
198
        g_glob_regex_cache_lru.push_front(glob_pattern);
172
198
        g_glob_regex_cache.emplace(glob_pattern,
173
198
                                   GlobRegexCacheEntry {compiled, g_glob_regex_cache_lru.begin()});
174
198
        if (g_glob_regex_cache.size() > kGlobRegexCacheCapacity) {
175
0
            const std::string& evict_key = g_glob_regex_cache_lru.back();
176
0
            g_glob_regex_cache.erase(evict_key);
177
0
            g_glob_regex_cache_lru.pop_back();
178
0
        }
179
198
    }
180
0
    return compiled;
181
198
}
182
183
// Convert a restricted glob pattern into a regex.
184
// Supported: '*', '?', '[...]', '\\' escape. Others are treated as literals.
185
286
Status glob_to_regex(const std::string& glob_pattern, std::string* regex_pattern) {
186
286
    regex_pattern->clear();
187
286
    regex_pattern->append("^");
188
286
    bool is_escaped = false;
189
286
    size_t pattern_length = glob_pattern.size();
190
3.14k
    for (size_t index = 0; index < pattern_length; ++index) {
191
2.86k
        char current_char = glob_pattern[index];
192
2.86k
        if (is_escaped) {
193
10
            append_escaped_regex_char(regex_pattern, current_char);
194
10
            is_escaped = false;
195
10
            continue;
196
10
        }
197
2.85k
        if (current_char == '\\') {
198
14
            is_escaped = true;
199
14
            continue;
200
14
        }
201
2.83k
        if (current_char == '*') {
202
68
            regex_pattern->append(".*");
203
68
            continue;
204
68
        }
205
2.77k
        if (current_char == '?') {
206
15
            regex_pattern->append(".");
207
15
            continue;
208
15
        }
209
2.75k
        if (current_char == '[') {
210
33
            size_t class_index = index + 1;
211
33
            bool class_closed = false;
212
33
            bool is_class_escaped = false;
213
33
            std::string class_buffer;
214
33
            if (class_index < pattern_length &&
215
33
                (glob_pattern[class_index] == '!' || glob_pattern[class_index] == '^')) {
216
9
                class_buffer.push_back('^');
217
9
                ++class_index;
218
9
            }
219
99
            for (; class_index < pattern_length; ++class_index) {
220
95
                char class_char = glob_pattern[class_index];
221
95
                if (is_class_escaped) {
222
10
                    class_buffer.push_back(class_char);
223
10
                    is_class_escaped = false;
224
10
                    continue;
225
10
                }
226
85
                if (class_char == '\\') {
227
10
                    is_class_escaped = true;
228
10
                    continue;
229
10
                }
230
75
                if (class_char == ']') {
231
29
                    class_closed = true;
232
29
                    break;
233
29
                }
234
46
                class_buffer.push_back(class_char);
235
46
            }
236
33
            if (!class_closed) {
237
4
                return Status::InvalidArgument("Unclosed character class in glob pattern: {}",
238
4
                                               glob_pattern);
239
4
            }
240
29
            regex_pattern->append("[");
241
29
            regex_pattern->append(class_buffer);
242
29
            regex_pattern->append("]");
243
29
            index = class_index;
244
29
            continue;
245
33
        }
246
2.72k
        append_escaped_regex_char(regex_pattern, current_char);
247
2.72k
    }
248
282
    if (is_escaped) {
249
4
        append_escaped_regex_char(regex_pattern, '\\');
250
4
    }
251
282
    regex_pattern->append("$");
252
282
    return Status::OK();
253
286
}
254
255
150k
bool glob_match_re2(const std::string& glob_pattern, const std::string& candidate_path) {
256
150k
    auto compiled = get_or_build_re2(glob_pattern);
257
150k
    if (compiled == nullptr) {
258
5
        return false;
259
5
    }
260
150k
    return RE2::FullMatch(candidate_path, *compiled);
261
150k
}
262
263
2
bool is_regular_ng_compaction_subpath(const PathInData& path) {
264
2
    const std::string& relative_path = path.get_path();
265
2
    return !relative_path.empty() && !path.has_nested_part() &&
266
2
           !segment_v2::contains_nested_group_marker(relative_path) &&
267
2
           !segment_v2::is_root_nested_group_path(relative_path) &&
268
2
           relative_path != SPARSE_COLUMN_PATH &&
269
2
           relative_path.find(DOC_VALUE_COLUMN_PATH) == std::string::npos;
270
2
}
271
272
2
bool should_keep_existing_ng_compaction_subcolumn(const TabletColumn& column) {
273
2
    if (!column.is_extracted_column()) {
274
0
        return false;
275
0
    }
276
2
    return is_regular_ng_compaction_subpath(column.path_info_ptr()->copy_pop_front());
277
2
}
278
279
1
// Produces a copy of every TabletIndex in `indexes`, each held by a new
// shared_ptr, preserving the input order.
TabletIndexes clone_tablet_indexes(const std::vector<const TabletIndex*>& indexes) {
    TabletIndexes copies;
    copies.reserve(indexes.size());
    for (size_t pos = 0; pos < indexes.size(); ++pos) {
        const TabletIndex& original = *indexes[pos];
        copies.emplace_back(std::make_shared<TabletIndex>(original));
    }
    return copies;
}
287
288
void keep_existing_ng_compaction_subcolumn(const TabletSchemaSPtr& source_schema,
289
                                           const TabletColumnPtr& extracted_column,
290
                                           TabletSchemaSPtr& output_schema,
291
1
                                           TabletSchema::PathsSetInfo& paths_set_info) {
292
1
    DCHECK(extracted_column->is_extracted_column());
293
1
    DCHECK(should_keep_existing_ng_compaction_subcolumn(*extracted_column));
294
295
1
    auto relative_path = extracted_column->path_info_ptr()->copy_pop_front();
296
1
    const std::string path = relative_path.get_path();
297
1
    output_schema->append_column(*extracted_column);
298
299
1
    auto indexes = clone_tablet_indexes(source_schema->inverted_indexs(*extracted_column));
300
1
    if (extracted_column->path_info_ptr()->get_is_typed()) {
301
1
        TabletSchema::SubColumnInfo sub_column_info {.column = *extracted_column,
302
1
                                                     .indexes = std::move(indexes)};
303
1
        paths_set_info.typed_path_set.emplace(path, std::move(sub_column_info));
304
1
        return;
305
1
    }
306
307
0
    paths_set_info.sub_path_set.emplace(path);
308
0
    if (!indexes.empty()) {
309
0
        paths_set_info.subcolumn_indexes.emplace(path, std::move(indexes));
310
0
    }
311
0
}
312
313
// Existing extracted subcolumns grouped by the unique id of their parent
// column (see collect_existing_ng_compaction_subcolumns()).
using ExistingNgCompactionSubcolumns = std::unordered_map<int32_t, std::vector<TabletColumnPtr>>;

// Result of build_nested_group_compaction_materialization_plan().
struct NestedGroupCompactionMaterializationPlan {
    // Existing subcolumns that are carried over unchanged.
    std::vector<TabletColumnPtr> preserved_regular_subcolumns;
    // Root-less paths of every column in the plan: the preserved ones plus the
    // additional ones below.
    std::unordered_set<PathInData, PathInData::Hash> materialized_regular_paths;
    // Regular paths taken from the extended info that are not already covered
    // by a preserved subcolumn, with their collected data types.
    PathToDataTypes additional_regular_path_to_data_types;
};
320
321
bool should_preserve_existing_ng_compaction_subcolumns(
322
        const TabletColumnPtr& column,
323
664
        const std::unordered_map<int32_t, VariantExtendedInfo>& uid_to_variant_extended_info) {
324
664
    const auto info_it = uid_to_variant_extended_info.find(column->unique_id());
325
664
    return column->variant_enable_nested_group() ||
326
664
           (info_it != uid_to_variant_extended_info.end() && info_it->second.has_nested_group);
327
664
}
328
329
std::unordered_set<int32_t> collect_ng_compaction_root_uids(
330
        const TabletSchemaSPtr& target,
331
8.08k
        const std::unordered_map<int32_t, VariantExtendedInfo>& uid_to_variant_extended_info) {
332
8.08k
    std::unordered_set<int32_t> root_uids;
333
92.3k
    for (const TabletColumnPtr& column : target->columns()) {
334
92.3k
        if (column->is_variant_type() && should_preserve_existing_ng_compaction_subcolumns(
335
665
                                                 column, uid_to_variant_extended_info)) {
336
1
            root_uids.insert(column->unique_id());
337
1
        }
338
92.3k
    }
339
8.08k
    return root_uids;
340
8.08k
}
341
342
// Groups, by parent unique id, every extracted column of `target` whose parent
// is listed in `ng_root_uids` and which qualifies under
// should_keep_existing_ng_compaction_subcolumn().
ExistingNgCompactionSubcolumns collect_existing_ng_compaction_subcolumns(
        const TabletSchemaSPtr& target, const std::unordered_set<int32_t>& ng_root_uids) {
    ExistingNgCompactionSubcolumns grouped_by_parent;
    for (const TabletColumnPtr& candidate : target->columns()) {
        const bool keep = candidate->is_extracted_column() &&
                          ng_root_uids.contains(candidate->parent_unique_id()) &&
                          should_keep_existing_ng_compaction_subcolumn(*candidate);
        if (keep) {
            grouped_by_parent[candidate->parent_unique_id()].push_back(candidate);
        }
    }
    return grouped_by_parent;
}
354
355
// Builds the materialization plan for one variant column.
// All `existing_subcolumns` are preserved and their root-less paths recorded.
// Each regular path from `extended_info.path_to_data_types` that is not
// already recorded is then added as an additional path with its data types.
NestedGroupCompactionMaterializationPlan build_nested_group_compaction_materialization_plan(
        const std::vector<TabletColumnPtr>& existing_subcolumns,
        const VariantExtendedInfo& extended_info) {
    NestedGroupCompactionMaterializationPlan result;
    result.preserved_regular_subcolumns = existing_subcolumns;
    for (const auto& preserved : existing_subcolumns) {
        result.materialized_regular_paths.insert(preserved->path_info_ptr()->copy_pop_front());
    }

    for (const auto& [candidate_path, candidate_types] : extended_info.path_to_data_types) {
        if (!is_regular_ng_compaction_subpath(candidate_path)) {
            continue;
        }
        // emplace() reports whether the path was new; known paths are skipped.
        const bool newly_added = result.materialized_regular_paths.emplace(candidate_path).second;
        if (newly_added) {
            result.additional_regular_path_to_data_types.emplace(candidate_path, candidate_types);
        }
    }
    return result;
}
373
374
void append_nested_group_compaction_columns(const TabletSchemaSPtr& target,
375
                                            const TabletColumnPtr& column,
376
                                            const NestedGroupCompactionMaterializationPlan& plan,
377
                                            TabletSchemaSPtr& output_schema,
378
1
                                            TabletSchema::PathsSetInfo& paths_set_info) {
379
1
    for (const auto& existing_column : plan.preserved_regular_subcolumns) {
380
1
        keep_existing_ng_compaction_subcolumn(target, existing_column, output_schema,
381
1
                                              paths_set_info);
382
1
    }
383
1
    VariantCompactionUtil::get_compaction_subcolumns_from_data_types(
384
1
            paths_set_info, column, target, plan.additional_regular_path_to_data_types,
385
1
            output_schema);
386
1
}
387
388
963
size_t get_number_of_dimensions(const IDataType& type) {
389
963
    if (const auto* type_array = typeid_cast<const DataTypeArray*>(&type)) {
390
4
        return type_array->get_number_of_dimensions();
391
4
    }
392
959
    return 0;
393
963
}
394
3
size_t get_number_of_dimensions(const IColumn& column) {
395
3
    if (const auto* column_array = check_and_get_column<ColumnArray>(column)) {
396
2
        return column_array->get_number_of_dimensions();
397
2
    }
398
1
    return 0;
399
3
}
400
401
86.2k
// Peels all array layers (looking through Nullable wrappers on the way) and
// returns the nested type of the innermost array. When `type` is not an array
// at all, `type` itself is returned.
DataTypePtr get_base_type_of_array(const DataTypePtr& type) {
    // Work on raw pointers so no shared_ptr copies are made while descending.
    const auto skip_nullable = [](const IDataType* wrapped) -> const IDataType* {
        if (const auto* as_nullable = typeid_cast<const DataTypeNullable*>(wrapped)) {
            return as_nullable->get_nested_type().get();
        }
        return wrapped;
    };

    const IDataType* cursor = skip_nullable(type.get());
    const DataTypeArray* innermost_array = nullptr;
    for (const auto* as_array = typeid_cast<const DataTypeArray*>(cursor); as_array != nullptr;
         as_array = typeid_cast<const DataTypeArray*>(cursor)) {
        innermost_array = as_array;
        cursor = skip_nullable(as_array->get_nested_type().get());
    }

    if (innermost_array == nullptr) {
        return type;
    }
    return innermost_array->get_nested_type();
}
417
418
69.9k
// Casts the column in `arg` to `type` and stores the resulting column in
// `*result`.
//
// - Target is Variant: handled locally (no CAST function), see below.
// - Source type is INVALID_TYPE: result is a full column of default values.
// - Otherwise the registered "CAST" function is executed; if it fails, a
//   warning is logged and a full column of default values is returned instead.
//
// Returns InternalError only when no CAST function can be found; every other
// path returns OK.
Status cast_column(const ColumnWithTypeAndName& arg, const DataTypePtr& type, ColumnPtr* result) {
    ColumnsWithTypeAndName arguments {arg, {nullptr, type, type->get_name()}};

    // To prevent from null info lost, we should not call function since the function framework will wrap
    // nullable to Variant instead of the root of Variant
    // correct output: Nullable(Array(int)) -> Nullable(Variant(Nullable(Array(int))))
    // incorrect output: Nullable(Array(int)) -> Nullable(Variant(Array(int)))
    if (type->get_primitive_type() == TYPE_VARIANT) {
        // If source column is variant, so the nullable info is different from dst column
        if (arg.type->get_primitive_type() == TYPE_VARIANT) {
            // Variant -> Variant: only adjust the nullability of the column.
            *result = type->is_nullable() ? make_nullable(arg.column) : remove_nullable(arg.column);
            return Status::OK();
        }
        // set variant root column/type to from column/type
        // The source column must be nullable: its null map is reused below.
        CHECK(arg.column->is_nullable());
        auto to_type = remove_nullable(type);
        const auto& data_type_object = assert_cast<const DataTypeVariant&>(*to_type);
        auto variant = ColumnVariant::create(data_type_object.variant_max_subcolumns_count(),
                                             data_type_object.enable_doc_mode());

        variant->create_root(arg.type, arg.column->assume_mutable());
        // Wrap the variant with the source column's null map.
        ColumnPtr nullable = ColumnNullable::create(
                variant->get_ptr(),
                check_and_get_column<ColumnNullable>(arg.column.get())->get_null_map_column_ptr());
        *result = type->is_nullable() ? nullable : variant->get_ptr();
        return Status::OK();
    }

    auto function = SimpleFunctionFactory::instance().get_function("CAST", arguments, type);
    if (!function) {
        return Status::InternalError("Not found cast function {} to {}", arg.type->get_name(),
                                     type->get_name());
    }
    Block tmp_block {arguments};
    // Position of the result column, which is appended to tmp_block further down.
    uint32_t result_column = cast_set<uint32_t>(tmp_block.columns());
    RuntimeState state;
    auto ctx = FunctionContext::create_context(&state, {}, {});

    if (arg.type->get_primitive_type() == INVALID_TYPE) {
        // cast from nothing to any type should result in nulls
        *result = type->create_column_const_with_default_value(arg.column->size())
                          ->convert_to_full_column_if_const();
        return Status::OK();
    }

    // We convert column string to jsonb type just add a string jsonb field to dst column instead of parse
    // each line in original string column.
    ctx->set_string_as_jsonb_string(true);
    ctx->set_jsonb_string_as_string(true);
    tmp_block.insert({nullptr, type, arg.name});
    // TODO(lihangyu): we should handle this error in strict mode
    if (!function->execute(ctx.get(), tmp_block, {0}, result_column, arg.column->size())) {
        // Best effort: a failed cast is logged and replaced by default values.
        LOG_EVERY_N(WARNING, 100) << fmt::format("cast from {} to {}", arg.type->get_name(),
                                                 type->get_name());
        *result = type->create_column_const_with_default_value(arg.column->size())
                          ->convert_to_full_column_if_const();
        return Status::OK();
    }
    *result = tmp_block.get_by_position(result_column).column->convert_to_full_column_if_const();
    VLOG_DEBUG << fmt::format("{} before convert {}, after convert {}", arg.name,
                              arg.column->get_name(), (*result)->get_name());
    return Status::OK();
}
481
482
void get_column_by_type(const DataTypePtr& data_type, const std::string& name, TabletColumn& column,
483
172k
                        const ExtraInfo& ext_info) {
484
172k
    column.set_name(name);
485
172k
    column.set_type(data_type->get_storage_field_type());
486
172k
    if (ext_info.unique_id >= 0) {
487
4
        column.set_unique_id(ext_info.unique_id);
488
4
    }
489
172k
    if (ext_info.parent_unique_id >= 0) {
490
85.2k
        column.set_parent_unique_id(ext_info.parent_unique_id);
491
85.2k
    }
492
172k
    if (!ext_info.path_info.empty()) {
493
85.2k
        column.set_path_info(ext_info.path_info);
494
85.2k
    }
495
172k
    if (data_type->is_nullable()) {
496
86.4k
        const auto& real_type = static_cast<const DataTypeNullable&>(*data_type);
497
86.4k
        column.set_is_nullable(true);
498
86.4k
        get_column_by_type(real_type.get_nested_type(), name, column, {});
499
86.4k
        return;
500
86.4k
    }
501
86.4k
    if (data_type->get_primitive_type() == PrimitiveType::TYPE_ARRAY) {
502
1.21k
        TabletColumn child;
503
1.21k
        get_column_by_type(assert_cast<const DataTypeArray*>(data_type.get())->get_nested_type(),
504
1.21k
                           "", child, {});
505
1.21k
        column.set_length(TabletColumn::get_field_length_by_type(TPrimitiveType::ARRAY, 0));
506
1.21k
        column.add_sub_column(child);
507
1.21k
        return;
508
1.21k
    }
509
85.2k
    if (data_type->get_primitive_type() == PrimitiveType::TYPE_VARIANT) {
510
0
        const auto* dt_variant = assert_cast<const DataTypeVariant*>(data_type.get());
511
0
        column.set_variant_max_subcolumns_count(dt_variant->variant_max_subcolumns_count());
512
0
        column.set_variant_enable_doc_mode(dt_variant->enable_doc_mode());
513
0
        return;
514
0
    }
515
    // size is not fixed when type is string or json
516
85.2k
    if (is_string_type(data_type->get_primitive_type()) ||
517
85.2k
        data_type->get_primitive_type() == TYPE_JSONB) {
518
28.6k
        column.set_length(INT_MAX);
519
28.6k
        return;
520
28.6k
    }
521
522
56.6k
    PrimitiveType type = data_type->get_primitive_type();
523
56.6k
    if (is_int_or_bool(type) || is_string_type(type) || is_float_or_double(type) || is_ip(type) ||
524
56.6k
        is_date_or_datetime(type) || type == PrimitiveType::TYPE_DATEV2) {
525
56.4k
        column.set_length(cast_set<int32_t>(data_type->get_size_of_value_in_memory()));
526
56.4k
        return;
527
56.4k
    }
528
124
    if (is_decimal(type)) {
529
105
        column.set_precision(data_type->get_precision());
530
105
        column.set_frac(data_type->get_scale());
531
105
        return;
532
105
    }
533
    // datetimev2 needs scale
534
19
    if (type == PrimitiveType::TYPE_DATETIMEV2 || type == PrimitiveType::TYPE_TIMESTAMPTZ) {
535
17
        column.set_precision(-1);
536
17
        column.set_frac(data_type->get_scale());
537
17
        return;
538
17
    }
539
540
2
    throw doris::Exception(doris::ErrorCode::INTERNAL_ERROR,
541
2
                           "unexcepted data column type: {}, column name is: {}",
542
2
                           data_type->get_name(), name);
543
19
}
544
545
// Convenience overload: builds and returns a new TabletColumn instead of
// filling one supplied by the caller.
TabletColumn get_column_by_type(const DataTypePtr& data_type, const std::string& name,
                                const ExtraInfo& ext_info) {
    TabletColumn described_column;
    get_column_by_type(data_type, name, described_column, ext_info);
    return described_column;
}
551
552
// check if two paths which same prefix have different structure
553
static bool has_different_structure_in_same_path(const PathInData::Parts& lhs,
554
9.04k
                                                 const PathInData::Parts& rhs) {
555
9.04k
    if (lhs.size() != rhs.size()) {
556
1
        return false; // different size means different structure
557
1
    }
558
    // Since we group by path string, lhs and rhs must have the same size and keys
559
    // We only need to check if they have different nested structure
560
36.1k
    for (size_t i = 0; i < lhs.size(); ++i) {
561
27.0k
        if (lhs[i] != rhs[i]) {
562
5
            VLOG_DEBUG << fmt::format(
563
0
                    "Check different structure: {} vs {}, lhs[i].is_nested: {}, rhs[i].is_nested: "
564
0
                    "{}",
565
0
                    lhs[i].key, rhs[i].key, lhs[i].is_nested, rhs[i].is_nested);
566
5
            return true;
567
5
        }
568
27.0k
    }
569
9.03k
    return false;
570
9.04k
}
571
572
4.75k
// Verifies that no two entries of `tuple_paths` share the same path string
// while differing in their parts. Returns DataQualityError for the first
// conflicting pair found, OK otherwise.
Status check_variant_has_no_ambiguous_paths(const PathsInData& tuple_paths) {
    // Bucket the indexes by path string so only entries with an identical
    // string are compared against each other.
    std::unordered_map<std::string, std::vector<size_t>> indexes_by_path;
    size_t position = 0;
    for (const auto& tuple_path : tuple_paths) {
        indexes_by_path[tuple_path.get_path()].push_back(position);
        ++position;
        VLOG_DEBUG << "tuple_paths[i]: " << tuple_path.get_path();
    }

    for (const auto& bucket : indexes_by_path) {
        const std::vector<size_t>& members = bucket.second;
        const size_t member_count = members.size();
        if (member_count < 2) {
            continue; // a single entry cannot conflict with anything
        }
        // Compare each entry with every earlier entry of the same bucket.
        for (size_t later = 1; later < member_count; ++later) {
            const auto& later_path = tuple_paths[members[later]];
            for (size_t earlier = 0; earlier < later; ++earlier) {
                const auto& earlier_path = tuple_paths[members[earlier]];
                if (!has_different_structure_in_same_path(later_path.get_parts(),
                                                          earlier_path.get_parts())) {
                    continue;
                }
                return Status::DataQualityError(
                        "Ambiguous paths: {} vs {} with different nested part {} vs {}",
                        later_path.get_path(), earlier_path.get_path(),
                        later_path.has_nested_part(), earlier_path.has_nested_part());
            }
        }
    }
    return Status::OK();
}
605
606
// Appends one column per subcolumn path to `common_schema`.
//
// For each path in `subcolumns_types` a common type is chosen: Nullable(JSONB)
// when the collected types disagree on their number of array dimensions,
// otherwise the least supertype from get_least_supertype_jsonb(), made
// nullable. Paths equal to ColumnVariant::COLUMN_NAME_DUMMY are skipped.
//
// A path without nested part whose root-less form is a key of `typed_columns`
// reuses that predefined column definition (with parent id, path info and name
// set here); all other paths get a column derived from the common type.
// When `path_set` is non-null every appended path is inserted into it.
//
// `common_schema` must be uniquely owned (CHECKed). Always returns OK.
Status update_least_schema_internal(const std::map<PathInData, DataTypes>& subcolumns_types,
                                    TabletSchemaSPtr& common_schema, int32_t variant_col_unique_id,
                                    const std::map<std::string, TabletColumnPtr>& typed_columns,
                                    std::set<PathInData>* path_set) {
    PathsInData tuple_paths;
    DataTypes tuple_types;
    CHECK(common_schema.use_count() == 1);
    // Get the least common type for all paths.
    for (const auto& [key, subtypes] : subcolumns_types) {
        assert(!subtypes.empty());
        if (key.get_path() == ColumnVariant::COLUMN_NAME_DUMMY) {
            continue;
        }
        const size_t first_dim = get_number_of_dimensions(*subtypes[0]);
        tuple_paths.emplace_back(key);
        for (size_t i = 1; i < subtypes.size(); ++i) {
            if (first_dim != get_number_of_dimensions(*subtypes[i])) {
                tuple_types.emplace_back(make_nullable(std::make_shared<DataTypeJsonb>()));
                LOG(INFO) << fmt::format(
                        "Uncompatible types of subcolumn '{}': {} and {}, cast to JSONB",
                        key.get_path(), subtypes[0]->get_name(), subtypes[i]->get_name());
                break;
            }
        }
        // Sizes are equal only if the JSONB fallback above already supplied
        // the type for this path.
        if (tuple_paths.size() == tuple_types.size()) {
            continue;
        }
        DataTypePtr common_type;
        get_least_supertype_jsonb(subtypes, &common_type);
        if (!common_type->is_nullable()) {
            common_type = make_nullable(common_type);
        }
        tuple_types.emplace_back(common_type);
    }
    CHECK_EQ(tuple_paths.size(), tuple_types.size());

    // Append all common type columns of this variant
    for (size_t i = 0; i < tuple_paths.size(); ++i) {
        const auto& tuple_path = tuple_paths[i];
        TabletColumn common_column;
        // typed path not contains root part
        const auto path_without_root = tuple_path.copy_pop_front().get_path();
        const auto typed_it = typed_columns.find(path_without_root);
        if (typed_it != typed_columns.end() && !tuple_path.has_nested_part()) {
            common_column = *typed_it->second;
            // parent unique id and path may not be init in write path
            common_column.set_parent_unique_id(variant_col_unique_id);
            common_column.set_path_info(tuple_path);
            common_column.set_name(tuple_path.get_path());
        } else {
            get_column_by_type(tuple_types[i], tuple_path.get_path(), common_column,
                               ExtraInfo {.unique_id = -1,
                                          .parent_unique_id = variant_col_unique_id,
                                          .path_info = tuple_path});
        }
        common_schema->append_column(common_column);
        if (path_set != nullptr) {
            path_set->insert(tuple_path);
        }
    }
    return Status::OK();
}
667
668
// Gathers, across all `schemas`, the types of every subcolumn that belongs to
// the variant column `variant_col_unique_id`, rejects ambiguous paths, and
// then appends the resulting common-type columns to `common_schema` via
// update_least_schema_internal().
Status update_least_common_schema(const std::vector<TabletSchemaSPtr>& schemas,
                                  TabletSchemaSPtr& common_schema, int32_t variant_col_unique_id,
                                  std::set<PathInData>* path_set) {
    // Sub columns declared on the variant column itself, keyed by name.
    std::map<std::string, TabletColumnPtr> typed_columns;
    for (const TabletColumnPtr& declared :
         common_schema->column_by_uid(variant_col_unique_id).get_sub_columns()) {
        typed_columns[declared->name()] = declared;
    }

    // Types seen for each subcolumn path, plus every occurrence of a path so
    // that conflicts can be checked in one batch.
    std::map<PathInData, DataTypes> subcolumns_types;
    std::vector<PathInData> all_paths;

    for (const TabletSchemaSPtr& schema : schemas) {
        for (const TabletColumnPtr& candidate : schema->columns()) {
            // Only subcolumns of this variant are of interest.
            if (!candidate->has_path_info() || candidate->parent_unique_id() < 0 ||
                candidate->parent_unique_id() != variant_col_unique_id) {
                continue;
            }
            subcolumns_types[*candidate->path_info_ptr()].emplace_back(
                    DataTypeFactory::instance().create_data_type(*candidate,
                                                                 candidate->is_nullable()));
            all_paths.push_back(*candidate->path_info_ptr());
        }
    }

    // Batch check for conflicts
    RETURN_IF_ERROR(check_variant_has_no_ambiguous_paths(all_paths));

    return update_least_schema_internal(subcolumns_types, common_schema, variant_col_unique_id,
                                        typed_columns, path_set);
}
700
701
// Keep variant subcolumn BF support aligned with FE DDL checks.
702
93.0k
bool is_bf_supported_by_fe_for_variant_subcolumn(FieldType type) {
703
93.0k
    switch (type) {
704
91
    case FieldType::OLAP_FIELD_TYPE_SMALLINT:
705
418
    case FieldType::OLAP_FIELD_TYPE_INT:
706
51.9k
    case FieldType::OLAP_FIELD_TYPE_BIGINT:
707
52.0k
    case FieldType::OLAP_FIELD_TYPE_LARGEINT:
708
52.0k
    case FieldType::OLAP_FIELD_TYPE_CHAR:
709
52.0k
    case FieldType::OLAP_FIELD_TYPE_VARCHAR:
710
80.6k
    case FieldType::OLAP_FIELD_TYPE_STRING:
711
80.6k
    case FieldType::OLAP_FIELD_TYPE_DATE:
712
80.6k
    case FieldType::OLAP_FIELD_TYPE_DATETIME:
713
80.7k
    case FieldType::OLAP_FIELD_TYPE_DATEV2:
714
81.0k
    case FieldType::OLAP_FIELD_TYPE_DATETIMEV2:
715
81.0k
    case FieldType::OLAP_FIELD_TYPE_TIMESTAMPTZ:
716
81.0k
    case FieldType::OLAP_FIELD_TYPE_DECIMAL:
717
81.1k
    case FieldType::OLAP_FIELD_TYPE_DECIMAL32:
718
81.2k
    case FieldType::OLAP_FIELD_TYPE_DECIMAL64:
719
81.5k
    case FieldType::OLAP_FIELD_TYPE_DECIMAL128I:
720
81.6k
    case FieldType::OLAP_FIELD_TYPE_DECIMAL256:
721
81.8k
    case FieldType::OLAP_FIELD_TYPE_IPV4:
722
81.9k
    case FieldType::OLAP_FIELD_TYPE_IPV6:
723
81.9k
        return true;
724
11.1k
    default:
725
11.1k
        return false;
726
93.0k
    }
727
93.0k
}
728
729
void inherit_column_attributes(const TabletColumn& source, TabletColumn& target,
730
93.0k
                               TabletSchemaSPtr* target_schema) {
731
93.0k
    if (!target.is_extracted_column()) {
732
0
        return;
733
0
    }
734
93.0k
    target.set_aggregation_method(source.aggregation());
735
736
    // 1. bloom filter
737
93.0k
    if (is_bf_supported_by_fe_for_variant_subcolumn(target.type())) {
738
81.9k
        target.set_is_bf_column(source.is_bf_column());
739
81.9k
    }
740
741
93.0k
    if (!target_schema) {
742
87.4k
        return;
743
87.4k
    }
744
745
    // 2. inverted index
746
5.59k
    TabletIndexes indexes_to_add;
747
5.59k
    auto source_indexes = (*target_schema)->inverted_indexs(source.unique_id());
748
    // if target is variant type, we need to inherit all indexes
749
    // because this schema is a read schema from fe
750
5.59k
    if (target.is_variant_type()) {
751
4.67k
        for (auto& index : source_indexes) {
752
288
            auto index_info = std::make_shared<TabletIndex>(*index);
753
288
            index_info->set_escaped_escaped_index_suffix_path(target.path_info_ptr()->get_path());
754
288
            indexes_to_add.emplace_back(std::move(index_info));
755
288
        }
756
4.67k
    } else {
757
920
        inherit_index(source_indexes, indexes_to_add, target);
758
920
    }
759
5.59k
    auto target_indexes = (*target_schema)
760
5.59k
                                  ->inverted_indexs(target.parent_unique_id(),
761
5.59k
                                                    target.path_info_ptr()->get_path());
762
5.59k
    if (target_indexes.empty()) {
763
5.59k
        for (auto& index_info : indexes_to_add) {
764
299
            (*target_schema)->append_index(std::move(*index_info));
765
299
        }
766
5.59k
    }
767
768
    // 3. TODO: gnragm bf index
769
5.59k
}
770
771
6.98k
// Walks every column of `schema` and, for each extracted column whose parent
// column is still present, inherits the parent's attributes (including index
// metadata, which is appended to `schema` itself).
void inherit_column_attributes(TabletSchemaSPtr& schema) {
    // Add index meta if extracted column is missing index meta
    for (size_t idx = 0; idx < schema->num_columns(); ++idx) {
        TabletColumn& candidate = schema->mutable_column(idx);
        if (candidate.is_extracted_column()) {
            // A missing parent (field_index == -1) means it may have been dropped; skip.
            const bool parent_present = schema->field_index(candidate.parent_unique_id()) != -1;
            if (parent_present) {
                inherit_column_attributes(schema->column_by_uid(candidate.parent_unique_id()),
                                          candidate, &schema);
            }
        }
    }
}
785
786
// Builds `output_schema` as the least common schema of `schemas`.
//
// The base is `base_schema` when non-null; otherwise the element of `schemas`
// with the highest schema_version() is used. Extracted columns of the base are
// dropped, then for every variant column of the base the sub-columns found in
// `schemas` are merged back in via update_least_common_schema(), and finally
// column attributes are inherited from the variant parents.
//
// Returns:
//   - InternalError when `base_schema` is null and `schemas` is empty (there is
//     nothing to pick a base from);
//   - any error produced by update_least_common_schema();
//   - DataQualityError when `check_schema_size` is set and the merged schema has
//     more columns than config::variant_max_merged_tablet_schema_size;
//   - OK otherwise.
Status get_least_common_schema(const std::vector<TabletSchemaSPtr>& schemas,
                               const TabletSchemaSPtr& base_schema, TabletSchemaSPtr& output_schema,
                               bool check_schema_size) {
    std::vector<int32_t> variant_column_unique_id;
    // Construct a schema excluding the extracted columns and gather unique identifiers for variants.
    // Ensure that the output schema also excludes these extracted columns. This approach prevents
    // duplicated paths following the update_least_common_schema process.
    auto build_schema_without_extracted_columns = [&](const TabletSchemaSPtr& src_schema) {
        output_schema = std::make_shared<TabletSchema>();
        // not copy columns but only shadow copy other attributes
        output_schema->shawdow_copy_without_columns(*src_schema);
        // Get all columns without extracted columns and collect variant col unique id
        for (const TabletColumnPtr& col : src_schema->columns()) {
            if (col->is_variant_type()) {
                variant_column_unique_id.push_back(col->unique_id());
            }
            if (!col->is_extracted_column()) {
                output_schema->append_column(*col);
            }
        }
    };
    if (base_schema == nullptr) {
        // std::max_element returns end() for an empty range; dereferencing it would be
        // undefined behaviour, so fail explicitly instead.
        if (schemas.empty()) {
            return Status::InternalError(
                    "Cannot build least common schema: no base schema and no input schemas");
        }
        // Pick tablet schema with max schema version. The comparator takes the shared
        // pointers by reference to avoid a refcount round-trip per comparison.
        const auto& max_version_schema =
                *std::max_element(schemas.cbegin(), schemas.cend(),
                                  [](const TabletSchemaSPtr& a, const TabletSchemaSPtr& b) {
                                      return a->schema_version() < b->schema_version();
                                  });
        CHECK(max_version_schema);
        build_schema_without_extracted_columns(max_version_schema);
    } else {
        // use input base_schema schema as base schema
        build_schema_without_extracted_columns(base_schema);
    }

    for (int32_t unique_id : variant_column_unique_id) {
        std::set<PathInData> path_set;
        RETURN_IF_ERROR(update_least_common_schema(schemas, output_schema, unique_id, &path_set));
    }

    inherit_column_attributes(output_schema);
    if (check_schema_size &&
        output_schema->columns().size() > config::variant_max_merged_tablet_schema_size) {
        return Status::DataQualityError("Reached max column size limit {}",
                                        config::variant_max_merged_tablet_schema_size);
    }

    return Status::OK();
}
835
836
// sort by paths in lexicographical order
837
9.23k
ColumnVariant::Subcolumns get_sorted_subcolumns(const ColumnVariant::Subcolumns& subcolumns) {
838
    // sort by paths in lexicographical order
839
9.23k
    ColumnVariant::Subcolumns sorted = subcolumns;
840
2.56M
    std::sort(sorted.begin(), sorted.end(), [](const auto& lhsItem, const auto& rhsItem) {
841
2.56M
        return lhsItem->path < rhsItem->path;
842
2.56M
    });
843
9.23k
    return sorted;
844
9.23k
}
845
846
bool has_schema_index_diff(const TabletSchema* new_schema, const TabletSchema* old_schema,
847
17.8k
                           int32_t new_col_idx, int32_t old_col_idx) {
848
17.8k
    const auto& column_new = new_schema->column(new_col_idx);
849
17.8k
    const auto& column_old = old_schema->column(old_col_idx);
850
851
17.8k
    if (column_new.is_bf_column() != column_old.is_bf_column()) {
852
94
        return true;
853
94
    }
854
855
17.7k
    auto new_schema_inverted_indexs = new_schema->inverted_indexs(column_new);
856
17.7k
    auto old_schema_inverted_indexs = old_schema->inverted_indexs(column_old);
857
858
17.7k
    if (new_schema_inverted_indexs.size() != old_schema_inverted_indexs.size()) {
859
706
        return true;
860
706
    }
861
862
17.4k
    for (size_t i = 0; i < new_schema_inverted_indexs.size(); ++i) {
863
391
        if (!new_schema_inverted_indexs[i]->is_same_except_id(old_schema_inverted_indexs[i])) {
864
18
            return true;
865
18
        }
866
391
    }
867
868
17.0k
    return false;
869
17.0k
}
870
871
1.23k
// Builds the sparse column that belongs to the variant column `variant`.
//
// The column is named "<lower-cased variant name>.<SPARSE_COLUMN_PATH>", uses the
// same string as its path, has type MAP with two STRING sub-columns, inherits the
// variant's aggregation method, and points at the variant via parent_unique_id.
TabletColumn create_sparse_column(const TabletColumn& variant) {
    const std::string sparse_name = variant.name_lower_case() + "." + SPARSE_COLUMN_PATH;

    TabletColumn sparse;
    sparse.set_name(sparse_name);
    sparse.set_type(FieldType::OLAP_FIELD_TYPE_MAP);
    sparse.set_aggregation_method(variant.aggregation());
    sparse.set_path_info(PathInData {sparse_name});
    sparse.set_parent_unique_id(variant.unique_id());
    // set default value to "NULL" DefaultColumnIterator will call insert_many_defaults
    sparse.set_default_value("NULL");

    // The same STRING column definition is added twice as the MAP's sub-columns.
    TabletColumn string_child;
    string_child.set_type(FieldType::OLAP_FIELD_TYPE_STRING);
    for (int added = 0; added < 2; ++added) {
        sparse.add_sub_column(string_child);
    }
    return sparse;
}
886
887
19.4k
// Builds one bucket ("shard") of the sparse column for the variant column `variant`.
//
// The column is named
// "<lower-cased variant name>.<SPARSE_COLUMN_PATH>.b<bucket_index>", uses that
// name as its path, has type MAP with two STRING sub-columns, a "NULL" default
// value, the variant's aggregation method, and the variant's unique id as parent.
TabletColumn create_sparse_shard_column(const TabletColumn& variant, int bucket_index) {
    const std::string bucket_suffix = ".b" + std::to_string(bucket_index);
    std::string shard_name = variant.name_lower_case() + "." + SPARSE_COLUMN_PATH + bucket_suffix;

    TabletColumn shard;
    shard.set_name(shard_name);
    shard.set_type(FieldType::OLAP_FIELD_TYPE_MAP);
    shard.set_aggregation_method(variant.aggregation());
    shard.set_parent_unique_id(variant.unique_id());
    shard.set_default_value("NULL");
    PathInData shard_path(shard_name);
    shard.set_path_info(shard_path);

    // The same STRING column definition is added twice as the MAP's sub-columns.
    TabletColumn string_child;
    string_child.set_type(FieldType::OLAP_FIELD_TYPE_STRING);
    for (int added = 0; added < 2; ++added) {
        shard.add_sub_column(string_child);
    }
    return shard;
}
904
905
11.2k
// Builds one bucket of the doc-value column for the variant column `variant`.
//
// The column is named
// "<lower-cased variant name>.<DOC_VALUE_COLUMN_PATH>.b<bucket_index>", uses that
// name as its path, has type MAP with two STRING sub-columns, a "NULL" default
// value, the variant's aggregation method, and the variant's unique id as parent.
TabletColumn create_doc_value_column(const TabletColumn& variant, int bucket_index) {
    const std::string bucket_suffix = ".b" + std::to_string(bucket_index);
    std::string doc_name = variant.name_lower_case() + "." + DOC_VALUE_COLUMN_PATH + bucket_suffix;

    TabletColumn doc_column;
    doc_column.set_name(doc_name);
    doc_column.set_type(FieldType::OLAP_FIELD_TYPE_MAP);
    doc_column.set_aggregation_method(variant.aggregation());
    doc_column.set_parent_unique_id(variant.unique_id());
    doc_column.set_default_value("NULL");
    doc_column.set_path_info(PathInData {doc_name});

    // The same STRING column definition is added twice as the MAP's sub-columns.
    TabletColumn string_child;
    string_child.set_type(FieldType::OLAP_FIELD_TYPE_STRING);
    for (int added = 0; added < 2; ++added) {
        doc_column.add_sub_column(string_child);
    }
    return doc_column;
}
922
923
101k
uint32_t variant_binary_shard_of(const StringRef& path, uint32_t bucket_num) {
924
101k
    if (bucket_num <= 1) return 0;
925
81.6k
    SipHash hash;
926
81.6k
    hash.update(path.data, path.size);
927
81.6k
    uint64_t h = hash.get64();
928
81.6k
    return static_cast<uint32_t>(h % bucket_num);
929
101k
}
930
931
// Accumulates, per variant column uid, the non-null counts of every path found in
// the segments of rowset `rs` into `*uid_to_path_stats`.
//
// Columns that are not variants, have a negative unique id, or fail
// should_check_variant_path_stats() are ignored. For each remaining column and
// each segment, both the sparse-column and the sub-column non-null sizes reported
// by the variant column reader are added to the map. An entry for a uid is only
// created once at least one path is added for it.
//
// Returns the first error from loading segments, obtaining a column reader, or
// loading the reader's external meta; OK otherwise.
Status VariantCompactionUtil::aggregate_path_to_stats(
        const RowsetSharedPtr& rs,
        std::unordered_map<int32_t, PathToNoneNullValues>* uid_to_path_stats) {
    SegmentCacheHandle segment_cache;
    RETURN_IF_ERROR(SegmentLoader::instance()->load_segments(
            std::static_pointer_cast<BetaRowset>(rs), &segment_cache));

    // Adds every (path, size) pair of `path_sizes` to the stats of column `uid`.
    const auto accumulate = [uid_to_path_stats](auto uid, const auto& path_sizes) {
        for (const auto& [path, size] : path_sizes) {
            (*uid_to_path_stats)[uid][path] += size;
        }
    };

    for (const auto& column : rs->tablet_schema()->columns()) {
        const bool eligible = column->is_variant_type() && column->unique_id() >= 0;
        if (!eligible || !should_check_variant_path_stats(*column)) {
            continue;
        }
        for (const auto& segment : segment_cache.get_segments()) {
            std::shared_ptr<ColumnReader> reader;
            OlapReaderStatistics reader_stats;
            RETURN_IF_ERROR(segment->get_column_reader(column->unique_id(), &reader, &reader_stats));
            if (!reader) {
                continue;
            }

            CHECK(reader->get_meta_type() == FieldType::OLAP_FIELD_TYPE_VARIANT);
            auto* variant_reader = assert_cast<segment_v2::VariantColumnReader*>(reader.get());
            // load external meta before getting stats
            RETURN_IF_ERROR(variant_reader->load_external_meta_once());
            const auto* source_stats = variant_reader->get_stats();
            CHECK(source_stats);

            // agg path -> stats
            accumulate(column->unique_id(), source_stats->sparse_column_non_null_size);
            accumulate(column->unique_id(), source_stats->subcolumns_non_null_size);
        }
    }
    return Status::OK();
}
974
975
// Collects, per variant column uid, the extended information needed for
// compaction from every segment of rowset `rs` into
// `*uid_to_variant_extended_info`.
//
// For each variant column an entry is created (even when the rowset has no
// segments). When the column has nested groups enabled, `has_nested_group` is
// set and the flat path statistics and nested paths are NOT collected; the
// sub-column types and typed paths are collected in every case.
//
// Returns the first error from loading segments, obtaining a column reader, or
// loading the reader's external meta; OK otherwise.
Status VariantCompactionUtil::aggregate_variant_extended_info(
        const RowsetSharedPtr& rs,
        std::unordered_map<int32_t, VariantExtendedInfo>* uid_to_variant_extended_info) {
    SegmentCacheHandle segment_cache;
    RETURN_IF_ERROR(SegmentLoader::instance()->load_segments(
            std::static_pointer_cast<BetaRowset>(rs), &segment_cache));

    for (const auto& column : rs->tablet_schema()->columns()) {
        if (!column->is_variant_type()) {
            continue;
        }
        auto& info = (*uid_to_variant_extended_info)[column->unique_id()];
        const bool nested_group_enabled = column->variant_enable_nested_group();
        if (nested_group_enabled) {
            info.has_nested_group = true;
        }

        for (const auto& segment : segment_cache.get_segments()) {
            std::shared_ptr<ColumnReader> reader;
            OlapReaderStatistics reader_stats;
            RETURN_IF_ERROR(segment->get_column_reader(column->unique_id(), &reader, &reader_stats));
            if (!reader) {
                continue;
            }

            CHECK(reader->get_meta_type() == FieldType::OLAP_FIELD_TYPE_VARIANT);
            auto* variant_reader = assert_cast<segment_v2::VariantColumnReader*>(reader.get());
            // load external meta before getting stats
            RETURN_IF_ERROR(variant_reader->load_external_meta_once());
            const auto* source_stats = variant_reader->get_stats();
            CHECK(source_stats);

            // 1. agg path -> stats
            // NG roots still need type metadata for regular subpaths such as `v.owner`,
            // but their compaction schema should not be driven by flat path stats.
            if (!nested_group_enabled) {
                for (const auto& [path, size] : source_stats->sparse_column_non_null_size) {
                    info.path_to_none_null_values[path] += size;
                    info.sparse_paths.emplace(path);
                }
                for (const auto& [path, size] : source_stats->subcolumns_non_null_size) {
                    info.path_to_none_null_values[path] += size;
                }
            }

            // 2. agg path -> schema
            variant_reader->get_subcolumns_types(&info.path_to_data_types);

            // 3. extract typed paths
            variant_reader->get_typed_paths(&info.typed_paths);

            // 4. extract nested paths
            if (!nested_group_enabled) {
                variant_reader->get_nested_paths(&info.nested_paths);
            }
        }
    }
    return Status::OK();
}
1034
1035
// get the subpaths and sparse paths for the variant column
1036
void VariantCompactionUtil::get_subpaths(int32_t max_subcolumns_count,
1037
                                         const PathToNoneNullValues& stats,
1038
328
                                         TabletSchema::PathsSetInfo& paths_set_info) {
1039
    // max_subcolumns_count is 0 means no limit
1040
328
    if (max_subcolumns_count > 0 && stats.size() > max_subcolumns_count) {
1041
75
        std::vector<std::pair<size_t, std::string_view>> paths_with_sizes;
1042
75
        paths_with_sizes.reserve(stats.size());
1043
1.65k
        for (const auto& [path, size] : stats) {
1044
1.65k
            paths_with_sizes.emplace_back(size, path);
1045
1.65k
        }
1046
75
        std::sort(paths_with_sizes.begin(), paths_with_sizes.end(), std::greater());
1047
1048
        // Select top N paths as subcolumns, remaining paths as sparse columns
1049
1.65k
        for (const auto& [size, path] : paths_with_sizes) {
1050
1.65k
            if (paths_set_info.sub_path_set.size() < max_subcolumns_count) {
1051
169
                paths_set_info.sub_path_set.emplace(path);
1052
1.48k
            } else {
1053
1.48k
                paths_set_info.sparse_path_set.emplace(path);
1054
1.48k
            }
1055
1.65k
        }
1056
75
        LOG(INFO) << "subpaths " << paths_set_info.sub_path_set.size() << " sparse paths "
1057
75
                  << paths_set_info.sparse_path_set.size() << " variant max subcolumns count "
1058
75
                  << max_subcolumns_count << " stats size " << paths_with_sizes.size();
1059
253
    } else {
1060
        // Apply all paths as subcolumns
1061
655
        for (const auto& [path, _] : stats) {
1062
655
            paths_set_info.sub_path_set.emplace(path);
1063
655
        }
1064
253
    }
1065
328
}
1066
1067
// Sanity-checks that the per-path non-null statistics of the compaction output
// rowset are consistent with those aggregated from the input rowsets.
//
// The check is skipped (returns OK) when:
//   - the output schema has no variant column;
//   - any input or output variant column fails should_check_variant_path_stats();
//   - any input rowset schema contains an extracted column;
//   - the tablet is not DUP_KEYS (rows may be merged in other models);
//   - any input rowset carries a delete predicate.
// Outside BE_TEST builds, an extracted column in the output schema that is not a
// doc-value / sparse column is reported as InternalError.
//
// For every variant uid in the output stats (doc-mode variants excluded):
//   - the uid must exist in the input stats;
//   - if the output has more paths than variant_max_sparse_column_statistics_size,
//     input stats may be inaccurate, so only a single-segment output is checked,
//     and only for paths present on both sides (input size must not exceed output);
//   - otherwise every output path must exist in the input stats with equal size.
Status VariantCompactionUtil::check_path_stats(const std::vector<RowsetSharedPtr>& intputs,
                                               RowsetSharedPtr output, BaseTabletSPtr tablet) {
    const auto& output_schema = output->tablet_schema();
    if (output_schema->num_variant_columns() == 0) {
        return Status::OK();
    }
    for (const auto& input : intputs) {
        for (const auto& column : input->tablet_schema()->columns()) {
            if (column->is_variant_type() && !should_check_variant_path_stats(*column)) {
                return Status::OK();
            }
        }
    }
    // check no extended schema in input rowsets
    for (const auto& input : intputs) {
        for (const auto& column : input->tablet_schema()->columns()) {
            if (column->is_extracted_column()) {
                return Status::OK();
            }
        }
    }
#ifndef BE_TEST
    // check no extended schema in output rowset
    for (const auto& column : output_schema->columns()) {
        if (!column->is_extracted_column()) {
            continue;
        }
        const auto& name = column->name();
        const bool is_binary_column =
                name.find("." + DOC_VALUE_COLUMN_PATH + ".") != std::string::npos ||
                name.find("." + SPARSE_COLUMN_PATH + ".") != std::string::npos ||
                name.ends_with("." + SPARSE_COLUMN_PATH);
        if (is_binary_column) {
            continue;
        }
        return Status::InternalError("Unexpected extracted column {} in output rowset",
                                     column->name());
    }
#endif
    // only check path stats for dup_keys since the rows may be merged in other models
    if (tablet->keys_type() != KeysType::DUP_KEYS) {
        return Status::OK();
    }
    // if there is a delete predicate in the input rowsets, we skip the path stats check
    for (const auto& input : intputs) {
        if (input->rowset_meta()->has_delete_predicate()) {
            return Status::OK();
        }
    }
    for (const auto& column : output_schema->columns()) {
        if (column->is_variant_type() && !should_check_variant_path_stats(*column)) {
            return Status::OK();
        }
    }

    std::unordered_map<int32_t, PathToNoneNullValues> original_uid_to_path_stats;
    for (const auto& input : intputs) {
        RETURN_IF_ERROR(aggregate_path_to_stats(input, &original_uid_to_path_stats));
    }
    std::unordered_map<int32_t, PathToNoneNullValues> output_uid_to_path_stats;
    RETURN_IF_ERROR(aggregate_path_to_stats(output, &output_uid_to_path_stats));

    for (const auto& [uid, stats] : output_uid_to_path_stats) {
        const auto& output_column = output_schema->column_by_uid(uid);
        if (output_column.is_variant_type() && output_column.variant_enable_doc_mode()) {
            continue;
        }
        const auto input_entry = original_uid_to_path_stats.find(uid);
        if (input_entry == original_uid_to_path_stats.end()) {
            return Status::InternalError("Path stats not found for uid {}, tablet_id {}", uid,
                                         tablet->tablet_id());
        }
        const auto& input_stats = input_entry->second;

        // In input rowsets, some rowsets may have statistics values exceeding the maximum limit,
        // which leads to inaccurate statistics
        if (stats.size() > output_column.variant_max_sparse_column_statistics_size()) {
            // When there is only one segment, we can ensure that the size of each path in output stats is accurate
            if (output->num_segments() != 1) {
                continue;
            }
            for (const auto& [path, size] : stats) {
                const auto input_path = input_stats.find(path);
                if (input_path == input_stats.end()) {
                    continue;
                }
                if (input_path->second > size) {
                    return Status::InternalError(
                            "Path stats not smaller for uid {} with path `{}`, input size {}, "
                            "output size {}, tablet_id {}",
                            uid, path, input_path->second, size, tablet->tablet_id());
                }
            }
            continue;
        }

        // in this case, input stats is accurate, so we check the stats size and stats value
        for (const auto& [path, size] : stats) {
            const auto input_path = input_stats.find(path);
            if (input_path == input_stats.end()) {
                return Status::InternalError(
                        "Path stats not found for uid {}, path {}, tablet_id {}", uid, path,
                        tablet->tablet_id());
            }
            if (input_path->second != size) {
                return Status::InternalError(
                        "Path stats not match for uid {} with path `{}`, input size {}, "
                        "output size {}, tablet_id {}",
                        uid, path, input_path->second, size, tablet->tablet_id());
            }
        }
    }

    return Status::OK();
}
1180
1181
// Appends one column to `output_schema` for every path in `typed_paths` and
// records it in `paths_set_info.typed_path_set`.
//
// Nothing is done when the parent variant has "typed paths to sparse" enabled.
// For each path, the sub-column description is generated from `target`, the
// parent's attributes are inherited, and the column is appended.
//
// Returns InternalError as soon as a sub-column description cannot be generated
// for a path (columns appended before that point stay in `output_schema`);
// OK otherwise.
Status VariantCompactionUtil::get_compaction_typed_columns(
        const TabletSchemaSPtr& target, const std::unordered_set<std::string>& typed_paths,
        const TabletColumnPtr parent_column, TabletSchemaSPtr& output_schema,
        TabletSchema::PathsSetInfo& paths_set_info) {
    if (parent_column->variant_enable_typed_paths_to_sparse()) {
        return Status::OK();
    }
    for (const auto& typed_path : typed_paths) {
        TabletSchema::SubColumnInfo typed_info;
        const bool generated = generate_sub_column_info(*target, parent_column->unique_id(),
                                                        typed_path, &typed_info);
        if (!generated) {
            return Status::InternalError("Failed to generate sub column info for path {}",
                                         typed_path);
        }
        inherit_column_attributes(*parent_column, typed_info.column);
        output_schema->append_column(typed_info.column);
        paths_set_info.typed_path_set.insert({typed_path, std::move(typed_info)});
        VLOG_DEBUG << "append typed column " << typed_path;
    }
    return Status::OK();
}
1201
1202
// Appends one column to `output_schema` for every path in `nested_paths`.
//
// For each path the least common supertype of the data types recorded in
// `path_to_data_types` is computed, a column named
// "<lower-cased parent name>.<path>" is created with the parent's unique id as
// parent, attributes and inverted indexes are inherited from the parent, and the
// indexes are stored in `paths_set_info.subcolumn_indexes` under the path.
// Parent indexes are looked up in `output_schema`.
//
// Returns InternalError when a nested path has no (or an empty) type entry;
// OK otherwise.
Status VariantCompactionUtil::get_compaction_nested_columns(
        const std::unordered_set<PathInData, PathInData::Hash>& nested_paths,
        const PathToDataTypes& path_to_data_types, const TabletColumnPtr parent_column,
        TabletSchemaSPtr& output_schema, TabletSchema::PathsSetInfo& paths_set_info) {
    const auto& parent_indexes = output_schema->inverted_indexs(parent_column->unique_id());
    for (const auto& nested_path : nested_paths) {
        const auto types_entry = path_to_data_types.find(nested_path);
        const bool has_types =
                types_entry != path_to_data_types.end() && !types_entry->second.empty();
        if (!has_types) {
            return Status::InternalError("Nested path {} has no data type",
                                         nested_path.get_path());
        }
        DataTypePtr merged_type;
        get_least_supertype_jsonb(types_entry->second, &merged_type);

        const std::string column_name =
                parent_column->name_lower_case() + "." + nested_path.get_path();
        // The full path is the parent name followed by the nested path's own parts.
        PathInDataBuilder path_builder;
        auto full_path = path_builder.append(parent_column->name_lower_case(), false)
                                 .append(nested_path.get_parts(), false)
                                 .build();
        TabletColumn nested_column =
                get_column_by_type(merged_type, column_name,
                                   ExtraInfo {.unique_id = -1,
                                              .parent_unique_id = parent_column->unique_id(),
                                              .path_info = full_path});
        inherit_column_attributes(*parent_column, nested_column);

        TabletIndexes nested_indexes;
        inherit_index(parent_indexes, nested_indexes, nested_column);
        paths_set_info.subcolumn_indexes.emplace(nested_path.get_path(),
                                                 std::move(nested_indexes));
        output_schema->append_column(nested_column);
        VLOG_DEBUG << "append nested column " << nested_path.get_path();
    }
    return Status::OK();
}
1234
1235
void VariantCompactionUtil::get_compaction_subcolumns_from_subpaths(
1236
        TabletSchema::PathsSetInfo& paths_set_info, const TabletColumnPtr parent_column,
1237
        const TabletSchemaSPtr& target, const PathToDataTypes& path_to_data_types,
1238
271
        const std::unordered_set<std::string>& sparse_paths, TabletSchemaSPtr& output_schema) {
1239
271
    auto& path_set = paths_set_info.sub_path_set;
1240
271
    std::vector<StringRef> sorted_subpaths(path_set.begin(), path_set.end());
1241
271
    std::sort(sorted_subpaths.begin(), sorted_subpaths.end());
1242
271
    const auto& parent_indexes = target->inverted_indexs(parent_column->unique_id());
1243
    // append subcolumns
1244
731
    for (const auto& subpath : sorted_subpaths) {
1245
731
        auto column_name = parent_column->name_lower_case() + "." + subpath.to_string();
1246
731
        auto column_path = PathInData(column_name);
1247
1248
731
        const auto& find_data_types = path_to_data_types.find(PathInData(subpath));
1249
1250
        // some cases: the subcolumn type is variant
1251
        // 1. this path has no data type in segments
1252
        // 2. this path is in sparse paths
1253
        // 3. the sparse paths are too much
1254
731
        TabletSchema::SubColumnInfo sub_column_info;
1255
731
        if (parent_column->variant_enable_typed_paths_to_sparse() &&
1256
731
            generate_sub_column_info(*target, parent_column->unique_id(), std::string(subpath),
1257
68
                                     &sub_column_info)) {
1258
58
            inherit_column_attributes(*parent_column, sub_column_info.column);
1259
58
            output_schema->append_column(sub_column_info.column);
1260
58
            paths_set_info.subcolumn_indexes.emplace(subpath, std::move(sub_column_info.indexes));
1261
58
            VLOG_DEBUG << "append typed column " << subpath;
1262
673
        } else if (find_data_types == path_to_data_types.end() || find_data_types->second.empty() ||
1263
673
                   sparse_paths.find(std::string(subpath)) != sparse_paths.end() ||
1264
673
                   sparse_paths.size() >=
1265
640
                           parent_column->variant_max_sparse_column_statistics_size()) {
1266
37
            TabletColumn subcolumn;
1267
37
            subcolumn.set_name(column_name);
1268
37
            subcolumn.set_type(FieldType::OLAP_FIELD_TYPE_VARIANT);
1269
37
            subcolumn.set_parent_unique_id(parent_column->unique_id());
1270
37
            subcolumn.set_path_info(column_path);
1271
37
            subcolumn.set_aggregation_method(parent_column->aggregation());
1272
37
            subcolumn.set_variant_max_subcolumns_count(
1273
37
                    parent_column->variant_max_subcolumns_count());
1274
37
            subcolumn.set_variant_enable_doc_mode(parent_column->variant_enable_doc_mode());
1275
37
            subcolumn.set_is_nullable(true);
1276
37
            output_schema->append_column(subcolumn);
1277
37
            VLOG_DEBUG << "append sub column " << subpath << " data type "
1278
0
                       << "VARIANT";
1279
37
        }
1280
        // normal case: the subcolumn type can be calculated from the data types in segments
1281
636
        else {
1282
636
            DataTypePtr data_type;
1283
636
            get_least_supertype_jsonb(find_data_types->second, &data_type);
1284
636
            TabletColumn sub_column =
1285
636
                    get_column_by_type(data_type, column_name,
1286
636
                                       ExtraInfo {.unique_id = -1,
1287
636
                                                  .parent_unique_id = parent_column->unique_id(),
1288
636
                                                  .path_info = column_path});
1289
636
            inherit_column_attributes(*parent_column, sub_column);
1290
636
            TabletIndexes sub_column_indexes;
1291
636
            inherit_index(parent_indexes, sub_column_indexes, sub_column);
1292
636
            paths_set_info.subcolumn_indexes.emplace(subpath, std::move(sub_column_indexes));
1293
636
            output_schema->append_column(sub_column);
1294
636
            VLOG_DEBUG << "append sub column " << subpath << " data type " << data_type->get_name();
1295
636
        }
1296
731
    }
1297
271
}
1298
1299
void VariantCompactionUtil::get_compaction_subcolumns_from_data_types(
1300
        TabletSchema::PathsSetInfo& paths_set_info, const TabletColumnPtr parent_column,
1301
        const TabletSchemaSPtr& target, const PathToDataTypes& path_to_data_types,
1302
60
        TabletSchemaSPtr& output_schema) {
1303
60
    const auto& parent_indexes = target->inverted_indexs(parent_column->unique_id());
1304
118
    for (const auto& [path, data_types] : path_to_data_types) {
1305
118
        if (data_types.empty() || path.empty() || path.has_nested_part()) {
1306
13
            continue;
1307
13
        }
1308
105
        DataTypePtr data_type;
1309
105
        get_least_supertype_jsonb(data_types, &data_type);
1310
105
        auto column_name = parent_column->name_lower_case() + "." + path.get_path();
1311
105
        auto column_path = PathInData(column_name, path.get_is_typed());
1312
105
        TabletColumn sub_column =
1313
105
                get_column_by_type(data_type, column_name,
1314
105
                                   ExtraInfo {.unique_id = -1,
1315
105
                                              .parent_unique_id = parent_column->unique_id(),
1316
105
                                              .path_info = column_path});
1317
105
        inherit_column_attributes(*parent_column, sub_column);
1318
105
        TabletIndexes sub_column_indexes;
1319
105
        inherit_index(parent_indexes, sub_column_indexes, sub_column);
1320
105
        if (path.get_is_typed()) {
1321
2
            TabletSchema::SubColumnInfo sub_column_info {.column = sub_column,
1322
2
                                                         .indexes = std::move(sub_column_indexes)};
1323
2
            paths_set_info.typed_path_set.emplace(path.get_path(), std::move(sub_column_info));
1324
103
        } else {
1325
103
            paths_set_info.subcolumn_indexes.emplace(path.get_path(),
1326
103
                                                     std::move(sub_column_indexes));
1327
103
        }
1328
105
        output_schema->append_column(sub_column);
1329
105
        VLOG_DEBUG << "append sub column " << path.get_path() << " data type "
1330
0
                   << data_type->get_name();
1331
105
    }
1332
60
}
1333
1334
// Build the temporary schema for compaction
1335
// 1. aggregate path stats and data types from all rowsets
1336
// 2. append typed columns and nested columns to the output schema
1337
// 3. sort the subpaths and sparse paths for each unique id
1338
// 4. append the subpaths and sparse paths to the output schema
1339
// 5. set the path set info for each unique id
1340
// 6. return the output schema
1341
Status VariantCompactionUtil::get_extended_compaction_schema(
1342
8.11k
        const std::vector<RowsetSharedPtr>& rowsets, TabletSchemaSPtr& target) {
1343
8.11k
    std::unordered_map<int32_t, VariantExtendedInfo> uid_to_variant_extended_info;
1344
8.11k
    const bool has_extendable_variant =
1345
90.8k
            std::ranges::any_of(target->columns(), [](const TabletColumnPtr& column) {
1346
90.8k
                return column->is_variant_type() && should_check_variant_path_stats(*column);
1347
90.8k
            });
1348
8.11k
    if (has_extendable_variant) {
1349
        // collect path stats from all rowsets and segments
1350
4.80k
        for (const auto& rs : rowsets) {
1351
4.80k
            RETURN_IF_ERROR(aggregate_variant_extended_info(rs, &uid_to_variant_extended_info));
1352
4.80k
        }
1353
551
    }
1354
1355
    // build the output schema
1356
8.11k
    TabletSchemaSPtr output_schema = std::make_shared<TabletSchema>();
1357
8.11k
    output_schema->shawdow_copy_without_columns(*target);
1358
8.11k
    std::unordered_map<int32_t, TabletSchema::PathsSetInfo> uid_to_paths_set_info;
1359
8.11k
    const auto ng_root_uids = collect_ng_compaction_root_uids(target, uid_to_variant_extended_info);
1360
8.11k
    const auto uid_to_existing_ng_subcolumns =
1361
8.11k
            collect_existing_ng_compaction_subcolumns(target, ng_root_uids);
1362
93.3k
    for (const TabletColumnPtr& column : target->columns()) {
1363
93.3k
        if (!column->is_extracted_column()) {
1364
93.1k
            output_schema->append_column(*column);
1365
93.1k
        }
1366
93.3k
        if (!column->is_variant_type()) {
1367
92.6k
            continue;
1368
92.6k
        }
1369
18.4E
        VLOG_DEBUG << "column " << column->name() << " unique id " << column->unique_id();
1370
1371
628
        const auto info_it = uid_to_variant_extended_info.find(column->unique_id());
1372
628
        const VariantExtendedInfo empty_extended_info;
1373
628
        const VariantExtendedInfo& extended_info = info_it == uid_to_variant_extended_info.end()
1374
628
                                                           ? empty_extended_info
1375
628
                                                           : info_it->second;
1376
628
        auto& paths_set_info = uid_to_paths_set_info[column->unique_id()];
1377
628
        if (ng_root_uids.contains(column->unique_id())) {
1378
1
            const auto plan = build_nested_group_compaction_materialization_plan(
1379
1
                    uid_to_existing_ng_subcolumns.contains(column->unique_id())
1380
1
                            ? uid_to_existing_ng_subcolumns.at(column->unique_id())
1381
1
                            : std::vector<TabletColumnPtr> {},
1382
1
                    extended_info);
1383
1
            append_nested_group_compaction_columns(target, column, plan, output_schema,
1384
1
                                                   paths_set_info);
1385
1
            LOG(INFO) << "Variant column uid=" << column->unique_id()
1386
1
                      << " keeps nested-group root with regular extracted columns in compaction "
1387
1
                         "schema";
1388
1
            continue;
1389
1
        }
1390
627
        if (!should_check_variant_path_stats(*column)) {
1391
0
            VLOG_DEBUG << "skip extended schema compaction for variant uid=" << column->unique_id()
1392
0
                       << " because the column disables variant path stats";
1393
0
            continue;
1394
0
        }
1395
627
        if (extended_info.has_nested_group) {
1396
0
            LOG(INFO) << "Variant column uid=" << column->unique_id()
1397
0
                      << " has nested group, keep original column in compaction schema";
1398
0
            continue;
1399
0
        }
1400
1401
627
        if (column->variant_enable_doc_mode()) {
1402
344
            const int bucket_num = std::max(1, column->variant_doc_hash_shard_count());
1403
1.29k
            for (int b = 0; b < bucket_num; ++b) {
1404
952
                TabletColumn doc_value_bucket_column = create_doc_value_column(*column, b);
1405
952
                doc_value_bucket_column.set_type(FieldType::OLAP_FIELD_TYPE_VARIANT);
1406
952
                doc_value_bucket_column.set_is_nullable(false);
1407
952
                doc_value_bucket_column.set_variant_enable_doc_mode(true);
1408
952
                output_schema->append_column(doc_value_bucket_column);
1409
952
            }
1410
344
            continue;
1411
344
        }
1412
1413
        // 1. append typed columns
1414
283
        RETURN_IF_ERROR(get_compaction_typed_columns(target, extended_info.typed_paths, column,
1415
283
                                                     output_schema, paths_set_info));
1416
        // 2. append nested columns
1417
283
        RETURN_IF_ERROR(get_compaction_nested_columns(extended_info.nested_paths,
1418
283
                                                      extended_info.path_to_data_types, column,
1419
283
                                                      output_schema, paths_set_info));
1420
1421
        // 3. get the subpaths
1422
283
        get_subpaths(column->variant_max_subcolumns_count(), extended_info.path_to_none_null_values,
1423
283
                     paths_set_info);
1424
1425
        // 4. append subcolumns
1426
283
        if (column->variant_max_subcolumns_count() > 0 || !column->get_sub_columns().empty()) {
1427
263
            get_compaction_subcolumns_from_subpaths(paths_set_info, column, target,
1428
263
                                                    extended_info.path_to_data_types,
1429
263
                                                    extended_info.sparse_paths, output_schema);
1430
263
        }
1431
        // variant_max_subcolumns_count == 0 and no typed paths are materialized:
1432
        // this means that all subcolumns are materialized, which may be the case for old data
1433
20
        else {
1434
20
            get_compaction_subcolumns_from_data_types(paths_set_info, column, target,
1435
20
                                                      extended_info.path_to_data_types,
1436
20
                                                      output_schema);
1437
20
        }
1438
1439
        // append sparse column(s)
1440
        // If variant uses bucketized sparse columns, append one sparse bucket column per bucket.
1441
        // Otherwise, append the single sparse column.
1442
283
        int bucket_num = std::max(1, column->variant_sparse_hash_shard_count());
1443
283
        if (bucket_num > 1) {
1444
1.17k
            for (int b = 0; b < bucket_num; ++b) {
1445
910
                TabletColumn sparse_bucket_column = create_sparse_shard_column(*column, b);
1446
910
                output_schema->append_column(sparse_bucket_column);
1447
910
            }
1448
261
        } else {
1449
22
            TabletColumn sparse_column = create_sparse_column(*column);
1450
22
            output_schema->append_column(sparse_column);
1451
22
        }
1452
283
    }
1453
1454
8.11k
    target = output_schema;
1455
    // used to merge & filter path to sparse column during reading in compaction
1456
8.11k
    target->set_path_set_info(std::move(uid_to_paths_set_info));
1457
18.4E
    VLOG_DEBUG << "dump schema " << target->dump_full_schema();
1458
8.11k
    return Status::OK();
1459
8.11k
}
1460
1461
// Calculate statistics about variant data paths from the encoded sparse column
1462
void VariantCompactionUtil::calculate_variant_stats(const IColumn& encoded_sparse_column,
1463
                                                    segment_v2::VariantStatisticsPB* stats,
1464
                                                    size_t max_sparse_column_statistics_size,
1465
1.26k
                                                    size_t row_pos, size_t num_rows) {
1466
    // Cast input column to ColumnMap type since sparse column is stored as a map
1467
1.26k
    const auto& map_column = assert_cast<const ColumnMap&>(encoded_sparse_column);
1468
1469
    // Get the keys column which contains the paths as strings
1470
1.26k
    const auto& sparse_data_paths =
1471
1.26k
            assert_cast<const ColumnString*>(map_column.get_keys_ptr().get());
1472
1.26k
    const auto& serialized_sparse_column_offsets = map_column.get_offsets();
1473
1.26k
    auto& count_map = *stats->mutable_sparse_column_non_null_size();
1474
    // Iterate through all paths in the sparse column
1475
508k
    for (size_t i = row_pos; i != row_pos + num_rows; ++i) {
1476
507k
        size_t offset = serialized_sparse_column_offsets[i - 1];
1477
507k
        size_t end = serialized_sparse_column_offsets[i];
1478
2.03M
        for (size_t j = offset; j != end; ++j) {
1479
1.52M
            auto path = sparse_data_paths->get_data_at(j);
1480
1481
1.52M
            const auto& sparse_path = path.to_string();
1482
            // If path already exists in statistics, increment its count
1483
1.52M
            if (auto it = count_map.find(sparse_path); it != count_map.end()) {
1484
1.52M
                ++it->second;
1485
1.52M
            }
1486
            // If path doesn't exist and we haven't hit the max statistics size limit,
1487
            // add it with count 1
1488
1.43k
            else if (count_map.size() < max_sparse_column_statistics_size) {
1489
1.43k
                count_map.emplace(sparse_path, 1);
1490
1.43k
            }
1491
1.52M
        }
1492
507k
    }
1493
1494
1.26k
    if (stats->sparse_column_non_null_size().size() > max_sparse_column_statistics_size) {
1495
0
        throw doris::Exception(
1496
0
                ErrorCode::INTERNAL_ERROR,
1497
0
                "Sparse column non null size: {} is greater than max statistics size: {}",
1498
0
                stats->sparse_column_non_null_size().size(), max_sparse_column_statistics_size);
1499
0
    }
1500
1.26k
}
1501
1502
/// Calculates number of dimensions in array field.
1503
/// Returns 0 for scalar fields.
1504
class FieldVisitorToNumberOfDimensions : public StaticVisitor<size_t> {
1505
public:
1506
    FieldVisitorToNumberOfDimensions() = default;
1507
    template <PrimitiveType T>
1508
23.3M
    size_t apply(const typename PrimitiveTypeTraits<T>::CppType& x) {
1509
23.3M
        if constexpr (T == TYPE_ARRAY) {
1510
2.18M
            const size_t size = x.size();
1511
2.18M
            size_t dimensions = 0;
1512
5.28M
            for (size_t i = 0; i < size; ++i) {
1513
3.10M
                size_t element_dimensions = apply_visitor(*this, x[i]);
1514
3.10M
                dimensions = std::max(dimensions, element_dimensions);
1515
3.10M
            }
1516
2.18M
            return 1 + dimensions;
1517
21.1M
        } else {
1518
21.1M
            return 0;
1519
21.1M
        }
1520
23.3M
    }
_ZN5doris12variant_util32FieldVisitorToNumberOfDimensions5applyILNS_13PrimitiveTypeE1EEEmRKNS_19PrimitiveTypeTraitsIXT_EE7CppTypeE
Line
Count
Source
1508
121k
    size_t apply(const typename PrimitiveTypeTraits<T>::CppType& x) {
1509
        if constexpr (T == TYPE_ARRAY) {
1510
            const size_t size = x.size();
1511
            size_t dimensions = 0;
1512
            for (size_t i = 0; i < size; ++i) {
1513
                size_t element_dimensions = apply_visitor(*this, x[i]);
1514
                dimensions = std::max(dimensions, element_dimensions);
1515
            }
1516
            return 1 + dimensions;
1517
121k
        } else {
1518
121k
            return 0;
1519
121k
        }
1520
121k
    }
_ZN5doris12variant_util32FieldVisitorToNumberOfDimensions5applyILNS_13PrimitiveTypeE26EEEmRKNS_19PrimitiveTypeTraitsIXT_EE7CppTypeE
Line
Count
Source
1508
480
    size_t apply(const typename PrimitiveTypeTraits<T>::CppType& x) {
1509
        if constexpr (T == TYPE_ARRAY) {
1510
            const size_t size = x.size();
1511
            size_t dimensions = 0;
1512
            for (size_t i = 0; i < size; ++i) {
1513
                size_t element_dimensions = apply_visitor(*this, x[i]);
1514
                dimensions = std::max(dimensions, element_dimensions);
1515
            }
1516
            return 1 + dimensions;
1517
480
        } else {
1518
480
            return 0;
1519
480
        }
1520
480
    }
Unexecuted instantiation: _ZN5doris12variant_util32FieldVisitorToNumberOfDimensions5applyILNS_13PrimitiveTypeE42EEEmRKNS_19PrimitiveTypeTraitsIXT_EE7CppTypeE
_ZN5doris12variant_util32FieldVisitorToNumberOfDimensions5applyILNS_13PrimitiveTypeE7EEEmRKNS_19PrimitiveTypeTraitsIXT_EE7CppTypeE
Line
Count
Source
1508
41.9k
    size_t apply(const typename PrimitiveTypeTraits<T>::CppType& x) {
1509
        if constexpr (T == TYPE_ARRAY) {
1510
            const size_t size = x.size();
1511
            size_t dimensions = 0;
1512
            for (size_t i = 0; i < size; ++i) {
1513
                size_t element_dimensions = apply_visitor(*this, x[i]);
1514
                dimensions = std::max(dimensions, element_dimensions);
1515
            }
1516
            return 1 + dimensions;
1517
41.9k
        } else {
1518
41.9k
            return 0;
1519
41.9k
        }
1520
41.9k
    }
Unexecuted instantiation: _ZN5doris12variant_util32FieldVisitorToNumberOfDimensions5applyILNS_13PrimitiveTypeE12EEEmRKNS_19PrimitiveTypeTraitsIXT_EE7CppTypeE
Unexecuted instantiation: _ZN5doris12variant_util32FieldVisitorToNumberOfDimensions5applyILNS_13PrimitiveTypeE11EEEmRKNS_19PrimitiveTypeTraitsIXT_EE7CppTypeE
_ZN5doris12variant_util32FieldVisitorToNumberOfDimensions5applyILNS_13PrimitiveTypeE25EEEmRKNS_19PrimitiveTypeTraitsIXT_EE7CppTypeE
Line
Count
Source
1508
396
    size_t apply(const typename PrimitiveTypeTraits<T>::CppType& x) {
1509
        if constexpr (T == TYPE_ARRAY) {
1510
            const size_t size = x.size();
1511
            size_t dimensions = 0;
1512
            for (size_t i = 0; i < size; ++i) {
1513
                size_t element_dimensions = apply_visitor(*this, x[i]);
1514
                dimensions = std::max(dimensions, element_dimensions);
1515
            }
1516
            return 1 + dimensions;
1517
396
        } else {
1518
396
            return 0;
1519
396
        }
1520
396
    }
_ZN5doris12variant_util32FieldVisitorToNumberOfDimensions5applyILNS_13PrimitiveTypeE2EEEmRKNS_19PrimitiveTypeTraitsIXT_EE7CppTypeE
Line
Count
Source
1508
332k
    size_t apply(const typename PrimitiveTypeTraits<T>::CppType& x) {
1509
        if constexpr (T == TYPE_ARRAY) {
1510
            const size_t size = x.size();
1511
            size_t dimensions = 0;
1512
            for (size_t i = 0; i < size; ++i) {
1513
                size_t element_dimensions = apply_visitor(*this, x[i]);
1514
                dimensions = std::max(dimensions, element_dimensions);
1515
            }
1516
            return 1 + dimensions;
1517
332k
        } else {
1518
332k
            return 0;
1519
332k
        }
1520
332k
    }
_ZN5doris12variant_util32FieldVisitorToNumberOfDimensions5applyILNS_13PrimitiveTypeE3EEEmRKNS_19PrimitiveTypeTraitsIXT_EE7CppTypeE
Line
Count
Source
1508
1.03k
    size_t apply(const typename PrimitiveTypeTraits<T>::CppType& x) {
1509
        if constexpr (T == TYPE_ARRAY) {
1510
            const size_t size = x.size();
1511
            size_t dimensions = 0;
1512
            for (size_t i = 0; i < size; ++i) {
1513
                size_t element_dimensions = apply_visitor(*this, x[i]);
1514
                dimensions = std::max(dimensions, element_dimensions);
1515
            }
1516
            return 1 + dimensions;
1517
1.03k
        } else {
1518
1.03k
            return 0;
1519
1.03k
        }
1520
1.03k
    }
_ZN5doris12variant_util32FieldVisitorToNumberOfDimensions5applyILNS_13PrimitiveTypeE4EEEmRKNS_19PrimitiveTypeTraitsIXT_EE7CppTypeE
Line
Count
Source
1508
1.02k
    size_t apply(const typename PrimitiveTypeTraits<T>::CppType& x) {
1509
        if constexpr (T == TYPE_ARRAY) {
1510
            const size_t size = x.size();
1511
            size_t dimensions = 0;
1512
            for (size_t i = 0; i < size; ++i) {
1513
                size_t element_dimensions = apply_visitor(*this, x[i]);
1514
                dimensions = std::max(dimensions, element_dimensions);
1515
            }
1516
            return 1 + dimensions;
1517
1.02k
        } else {
1518
1.02k
            return 0;
1519
1.02k
        }
1520
1.02k
    }
_ZN5doris12variant_util32FieldVisitorToNumberOfDimensions5applyILNS_13PrimitiveTypeE5EEEmRKNS_19PrimitiveTypeTraitsIXT_EE7CppTypeE
Line
Count
Source
1508
1.95k
    size_t apply(const typename PrimitiveTypeTraits<T>::CppType& x) {
1509
        if constexpr (T == TYPE_ARRAY) {
1510
            const size_t size = x.size();
1511
            size_t dimensions = 0;
1512
            for (size_t i = 0; i < size; ++i) {
1513
                size_t element_dimensions = apply_visitor(*this, x[i]);
1514
                dimensions = std::max(dimensions, element_dimensions);
1515
            }
1516
            return 1 + dimensions;
1517
1.95k
        } else {
1518
1.95k
            return 0;
1519
1.95k
        }
1520
1.95k
    }
_ZN5doris12variant_util32FieldVisitorToNumberOfDimensions5applyILNS_13PrimitiveTypeE6EEEmRKNS_19PrimitiveTypeTraitsIXT_EE7CppTypeE
Line
Count
Source
1508
6.28M
    size_t apply(const typename PrimitiveTypeTraits<T>::CppType& x) {
1509
        if constexpr (T == TYPE_ARRAY) {
1510
            const size_t size = x.size();
1511
            size_t dimensions = 0;
1512
            for (size_t i = 0; i < size; ++i) {
1513
                size_t element_dimensions = apply_visitor(*this, x[i]);
1514
                dimensions = std::max(dimensions, element_dimensions);
1515
            }
1516
            return 1 + dimensions;
1517
6.28M
        } else {
1518
6.28M
            return 0;
1519
6.28M
        }
1520
6.28M
    }
Unexecuted instantiation: _ZN5doris12variant_util32FieldVisitorToNumberOfDimensions5applyILNS_13PrimitiveTypeE38EEEmRKNS_19PrimitiveTypeTraitsIXT_EE7CppTypeE
Unexecuted instantiation: _ZN5doris12variant_util32FieldVisitorToNumberOfDimensions5applyILNS_13PrimitiveTypeE39EEEmRKNS_19PrimitiveTypeTraitsIXT_EE7CppTypeE
_ZN5doris12variant_util32FieldVisitorToNumberOfDimensions5applyILNS_13PrimitiveTypeE8EEEmRKNS_19PrimitiveTypeTraitsIXT_EE7CppTypeE
Line
Count
Source
1508
859
    size_t apply(const typename PrimitiveTypeTraits<T>::CppType& x) {
1509
        if constexpr (T == TYPE_ARRAY) {
1510
            const size_t size = x.size();
1511
            size_t dimensions = 0;
1512
            for (size_t i = 0; i < size; ++i) {
1513
                size_t element_dimensions = apply_visitor(*this, x[i]);
1514
                dimensions = std::max(dimensions, element_dimensions);
1515
            }
1516
            return 1 + dimensions;
1517
859
        } else {
1518
859
            return 0;
1519
859
        }
1520
859
    }
Unexecuted instantiation: _ZN5doris12variant_util32FieldVisitorToNumberOfDimensions5applyILNS_13PrimitiveTypeE27EEEmRKNS_19PrimitiveTypeTraitsIXT_EE7CppTypeE
_ZN5doris12variant_util32FieldVisitorToNumberOfDimensions5applyILNS_13PrimitiveTypeE9EEEmRKNS_19PrimitiveTypeTraitsIXT_EE7CppTypeE
Line
Count
Source
1508
2.94M
    size_t apply(const typename PrimitiveTypeTraits<T>::CppType& x) {
1509
        if constexpr (T == TYPE_ARRAY) {
1510
            const size_t size = x.size();
1511
            size_t dimensions = 0;
1512
            for (size_t i = 0; i < size; ++i) {
1513
                size_t element_dimensions = apply_visitor(*this, x[i]);
1514
                dimensions = std::max(dimensions, element_dimensions);
1515
            }
1516
            return 1 + dimensions;
1517
2.94M
        } else {
1518
2.94M
            return 0;
1519
2.94M
        }
1520
2.94M
    }
_ZN5doris12variant_util32FieldVisitorToNumberOfDimensions5applyILNS_13PrimitiveTypeE36EEEmRKNS_19PrimitiveTypeTraitsIXT_EE7CppTypeE
Line
Count
Source
1508
306
    size_t apply(const typename PrimitiveTypeTraits<T>::CppType& x) {
1509
        if constexpr (T == TYPE_ARRAY) {
1510
            const size_t size = x.size();
1511
            size_t dimensions = 0;
1512
            for (size_t i = 0; i < size; ++i) {
1513
                size_t element_dimensions = apply_visitor(*this, x[i]);
1514
                dimensions = std::max(dimensions, element_dimensions);
1515
            }
1516
            return 1 + dimensions;
1517
306
        } else {
1518
306
            return 0;
1519
306
        }
1520
306
    }
_ZN5doris12variant_util32FieldVisitorToNumberOfDimensions5applyILNS_13PrimitiveTypeE37EEEmRKNS_19PrimitiveTypeTraitsIXT_EE7CppTypeE
Line
Count
Source
1508
304
    size_t apply(const typename PrimitiveTypeTraits<T>::CppType& x) {
1509
        if constexpr (T == TYPE_ARRAY) {
1510
            const size_t size = x.size();
1511
            size_t dimensions = 0;
1512
            for (size_t i = 0; i < size; ++i) {
1513
                size_t element_dimensions = apply_visitor(*this, x[i]);
1514
                dimensions = std::max(dimensions, element_dimensions);
1515
            }
1516
            return 1 + dimensions;
1517
304
        } else {
1518
304
            return 0;
1519
304
        }
1520
304
    }
_ZN5doris12variant_util32FieldVisitorToNumberOfDimensions5applyILNS_13PrimitiveTypeE23EEEmRKNS_19PrimitiveTypeTraitsIXT_EE7CppTypeE
Line
Count
Source
1508
11.2M
    size_t apply(const typename PrimitiveTypeTraits<T>::CppType& x) {
1509
        if constexpr (T == TYPE_ARRAY) {
1510
            const size_t size = x.size();
1511
            size_t dimensions = 0;
1512
            for (size_t i = 0; i < size; ++i) {
1513
                size_t element_dimensions = apply_visitor(*this, x[i]);
1514
                dimensions = std::max(dimensions, element_dimensions);
1515
            }
1516
            return 1 + dimensions;
1517
11.2M
        } else {
1518
11.2M
            return 0;
1519
11.2M
        }
1520
11.2M
    }
Unexecuted instantiation: _ZN5doris12variant_util32FieldVisitorToNumberOfDimensions5applyILNS_13PrimitiveTypeE15EEEmRKNS_19PrimitiveTypeTraitsIXT_EE7CppTypeE
Unexecuted instantiation: _ZN5doris12variant_util32FieldVisitorToNumberOfDimensions5applyILNS_13PrimitiveTypeE10EEEmRKNS_19PrimitiveTypeTraitsIXT_EE7CppTypeE
Unexecuted instantiation: _ZN5doris12variant_util32FieldVisitorToNumberOfDimensions5applyILNS_13PrimitiveTypeE41EEEmRKNS_19PrimitiveTypeTraitsIXT_EE7CppTypeE
_ZN5doris12variant_util32FieldVisitorToNumberOfDimensions5applyILNS_13PrimitiveTypeE17EEEmRKNS_19PrimitiveTypeTraitsIXT_EE7CppTypeE
Line
Count
Source
1508
2.18M
    size_t apply(const typename PrimitiveTypeTraits<T>::CppType& x) {
1509
2.18M
        if constexpr (T == TYPE_ARRAY) {
1510
2.18M
            const size_t size = x.size();
1511
2.18M
            size_t dimensions = 0;
1512
5.28M
            for (size_t i = 0; i < size; ++i) {
1513
3.10M
                size_t element_dimensions = apply_visitor(*this, x[i]);
1514
3.10M
                dimensions = std::max(dimensions, element_dimensions);
1515
3.10M
            }
1516
2.18M
            return 1 + dimensions;
1517
        } else {
1518
            return 0;
1519
        }
1520
2.18M
    }
_ZN5doris12variant_util32FieldVisitorToNumberOfDimensions5applyILNS_13PrimitiveTypeE16EEEmRKNS_19PrimitiveTypeTraitsIXT_EE7CppTypeE
Line
Count
Source
1508
1
    size_t apply(const typename PrimitiveTypeTraits<T>::CppType& x) {
1509
        if constexpr (T == TYPE_ARRAY) {
1510
            const size_t size = x.size();
1511
            size_t dimensions = 0;
1512
            for (size_t i = 0; i < size; ++i) {
1513
                size_t element_dimensions = apply_visitor(*this, x[i]);
1514
                dimensions = std::max(dimensions, element_dimensions);
1515
            }
1516
            return 1 + dimensions;
1517
1
        } else {
1518
1
            return 0;
1519
1
        }
1520
1
    }
Unexecuted instantiation: _ZN5doris12variant_util32FieldVisitorToNumberOfDimensions5applyILNS_13PrimitiveTypeE18EEEmRKNS_19PrimitiveTypeTraitsIXT_EE7CppTypeE
_ZN5doris12variant_util32FieldVisitorToNumberOfDimensions5applyILNS_13PrimitiveTypeE32EEEmRKNS_19PrimitiveTypeTraitsIXT_EE7CppTypeE
Line
Count
Source
1508
1
    size_t apply(const typename PrimitiveTypeTraits<T>::CppType& x) {
1509
        if constexpr (T == TYPE_ARRAY) {
1510
            const size_t size = x.size();
1511
            size_t dimensions = 0;
1512
            for (size_t i = 0; i < size; ++i) {
1513
                size_t element_dimensions = apply_visitor(*this, x[i]);
1514
                dimensions = std::max(dimensions, element_dimensions);
1515
            }
1516
            return 1 + dimensions;
1517
1
        } else {
1518
1
            return 0;
1519
1
        }
1520
1
    }
_ZN5doris12variant_util32FieldVisitorToNumberOfDimensions5applyILNS_13PrimitiveTypeE28EEEmRKNS_19PrimitiveTypeTraitsIXT_EE7CppTypeE
Line
Count
Source
1508
756
    size_t apply(const typename PrimitiveTypeTraits<T>::CppType& x) {
1509
        if constexpr (T == TYPE_ARRAY) {
1510
            const size_t size = x.size();
1511
            size_t dimensions = 0;
1512
            for (size_t i = 0; i < size; ++i) {
1513
                size_t element_dimensions = apply_visitor(*this, x[i]);
1514
                dimensions = std::max(dimensions, element_dimensions);
1515
            }
1516
            return 1 + dimensions;
1517
756
        } else {
1518
756
            return 0;
1519
756
        }
1520
756
    }
_ZN5doris12variant_util32FieldVisitorToNumberOfDimensions5applyILNS_13PrimitiveTypeE29EEEmRKNS_19PrimitiveTypeTraitsIXT_EE7CppTypeE
Line
Count
Source
1508
696
    size_t apply(const typename PrimitiveTypeTraits<T>::CppType& x) {
1509
        if constexpr (T == TYPE_ARRAY) {
1510
            const size_t size = x.size();
1511
            size_t dimensions = 0;
1512
            for (size_t i = 0; i < size; ++i) {
1513
                size_t element_dimensions = apply_visitor(*this, x[i]);
1514
                dimensions = std::max(dimensions, element_dimensions);
1515
            }
1516
            return 1 + dimensions;
1517
696
        } else {
1518
696
            return 0;
1519
696
        }
1520
696
    }
Unexecuted instantiation: _ZN5doris12variant_util32FieldVisitorToNumberOfDimensions5applyILNS_13PrimitiveTypeE20EEEmRKNS_19PrimitiveTypeTraitsIXT_EE7CppTypeE
_ZN5doris12variant_util32FieldVisitorToNumberOfDimensions5applyILNS_13PrimitiveTypeE30EEEmRKNS_19PrimitiveTypeTraitsIXT_EE7CppTypeE
Line
Count
Source
1508
80.9k
    size_t apply(const typename PrimitiveTypeTraits<T>::CppType& x) {
1509
        if constexpr (T == TYPE_ARRAY) {
1510
            const size_t size = x.size();
1511
            size_t dimensions = 0;
1512
            for (size_t i = 0; i < size; ++i) {
1513
                size_t element_dimensions = apply_visitor(*this, x[i]);
1514
                dimensions = std::max(dimensions, element_dimensions);
1515
            }
1516
            return 1 + dimensions;
1517
80.9k
        } else {
1518
80.9k
            return 0;
1519
80.9k
        }
1520
80.9k
    }
_ZN5doris12variant_util32FieldVisitorToNumberOfDimensions5applyILNS_13PrimitiveTypeE35EEEmRKNS_19PrimitiveTypeTraitsIXT_EE7CppTypeE
Line
Count
Source
1508
558
    size_t apply(const typename PrimitiveTypeTraits<T>::CppType& x) {
1509
        if constexpr (T == TYPE_ARRAY) {
1510
            const size_t size = x.size();
1511
            size_t dimensions = 0;
1512
            for (size_t i = 0; i < size; ++i) {
1513
                size_t element_dimensions = apply_visitor(*this, x[i]);
1514
                dimensions = std::max(dimensions, element_dimensions);
1515
            }
1516
            return 1 + dimensions;
1517
558
        } else {
1518
558
            return 0;
1519
558
        }
1520
558
    }
Unexecuted instantiation: _ZN5doris12variant_util32FieldVisitorToNumberOfDimensions5applyILNS_13PrimitiveTypeE22EEEmRKNS_19PrimitiveTypeTraitsIXT_EE7CppTypeE
Unexecuted instantiation: _ZN5doris12variant_util32FieldVisitorToNumberOfDimensions5applyILNS_13PrimitiveTypeE19EEEmRKNS_19PrimitiveTypeTraitsIXT_EE7CppTypeE
Unexecuted instantiation: _ZN5doris12variant_util32FieldVisitorToNumberOfDimensions5applyILNS_13PrimitiveTypeE24EEEmRKNS_19PrimitiveTypeTraitsIXT_EE7CppTypeE
_ZN5doris12variant_util32FieldVisitorToNumberOfDimensions5applyILNS_13PrimitiveTypeE31EEEmRKNS_19PrimitiveTypeTraitsIXT_EE7CppTypeE
Line
Count
Source
1508
46.8k
    size_t apply(const typename PrimitiveTypeTraits<T>::CppType& x) {
1509
        if constexpr (T == TYPE_ARRAY) {
1510
            const size_t size = x.size();
1511
            size_t dimensions = 0;
1512
            for (size_t i = 0; i < size; ++i) {
1513
                size_t element_dimensions = apply_visitor(*this, x[i]);
1514
                dimensions = std::max(dimensions, element_dimensions);
1515
            }
1516
            return 1 + dimensions;
1517
46.8k
        } else {
1518
46.8k
            return 0;
1519
46.8k
        }
1520
46.8k
    }
1521
};
1522
1523
// Visitor that allows to get type of scalar field
1524
// but exclude fields contain complex field.This is a faster version
1525
// for FieldVisitorToScalarType which does not support complex field.
1526
class SimpleFieldVisitorToScalarType : public StaticVisitor<size_t> {
1527
public:
1528
    template <PrimitiveType T>
1529
18.6M
    size_t apply(const typename PrimitiveTypeTraits<T>::CppType& x) {
1530
18.6M
        if constexpr (T == TYPE_ARRAY) {
1531
0
            throw doris::Exception(ErrorCode::INVALID_ARGUMENT, "Array type is not supported");
1532
109k
        } else if constexpr (T == TYPE_NULL) {
1533
109k
            have_nulls = true;
1534
109k
            return 1;
1535
18.5M
        } else {
1536
18.5M
            type = T;
1537
18.5M
            return 1;
1538
18.5M
        }
1539
18.6M
    }
_ZN5doris12variant_util30SimpleFieldVisitorToScalarType5applyILNS_13PrimitiveTypeE1EEEmRKNS_19PrimitiveTypeTraitsIXT_EE7CppTypeE
Line
Count
Source
1529
109k
    size_t apply(const typename PrimitiveTypeTraits<T>::CppType& x) {
1530
        if constexpr (T == TYPE_ARRAY) {
1531
            throw doris::Exception(ErrorCode::INVALID_ARGUMENT, "Array type is not supported");
1532
109k
        } else if constexpr (T == TYPE_NULL) {
1533
109k
            have_nulls = true;
1534
109k
            return 1;
1535
        } else {
1536
            type = T;
1537
            return 1;
1538
        }
1539
109k
    }
Unexecuted instantiation: _ZN5doris12variant_util30SimpleFieldVisitorToScalarType5applyILNS_13PrimitiveTypeE26EEEmRKNS_19PrimitiveTypeTraitsIXT_EE7CppTypeE
Unexecuted instantiation: _ZN5doris12variant_util30SimpleFieldVisitorToScalarType5applyILNS_13PrimitiveTypeE42EEEmRKNS_19PrimitiveTypeTraitsIXT_EE7CppTypeE
_ZN5doris12variant_util30SimpleFieldVisitorToScalarType5applyILNS_13PrimitiveTypeE7EEEmRKNS_19PrimitiveTypeTraitsIXT_EE7CppTypeE
Line
Count
Source
1529
12.3k
    size_t apply(const typename PrimitiveTypeTraits<T>::CppType& x) {
1530
        if constexpr (T == TYPE_ARRAY) {
1531
            throw doris::Exception(ErrorCode::INVALID_ARGUMENT, "Array type is not supported");
1532
        } else if constexpr (T == TYPE_NULL) {
1533
            have_nulls = true;
1534
            return 1;
1535
12.3k
        } else {
1536
12.3k
            type = T;
1537
12.3k
            return 1;
1538
12.3k
        }
1539
12.3k
    }
Unexecuted instantiation: _ZN5doris12variant_util30SimpleFieldVisitorToScalarType5applyILNS_13PrimitiveTypeE12EEEmRKNS_19PrimitiveTypeTraitsIXT_EE7CppTypeE
Unexecuted instantiation: _ZN5doris12variant_util30SimpleFieldVisitorToScalarType5applyILNS_13PrimitiveTypeE11EEEmRKNS_19PrimitiveTypeTraitsIXT_EE7CppTypeE
Unexecuted instantiation: _ZN5doris12variant_util30SimpleFieldVisitorToScalarType5applyILNS_13PrimitiveTypeE25EEEmRKNS_19PrimitiveTypeTraitsIXT_EE7CppTypeE
_ZN5doris12variant_util30SimpleFieldVisitorToScalarType5applyILNS_13PrimitiveTypeE2EEEmRKNS_19PrimitiveTypeTraitsIXT_EE7CppTypeE
Line
Count
Source
1529
273k
    size_t apply(const typename PrimitiveTypeTraits<T>::CppType& x) {
1530
        if constexpr (T == TYPE_ARRAY) {
1531
            throw doris::Exception(ErrorCode::INVALID_ARGUMENT, "Array type is not supported");
1532
        } else if constexpr (T == TYPE_NULL) {
1533
            have_nulls = true;
1534
            return 1;
1535
273k
        } else {
1536
273k
            type = T;
1537
273k
            return 1;
1538
273k
        }
1539
273k
    }
_ZN5doris12variant_util30SimpleFieldVisitorToScalarType5applyILNS_13PrimitiveTypeE3EEEmRKNS_19PrimitiveTypeTraitsIXT_EE7CppTypeE
Line
Count
Source
1529
2
    size_t apply(const typename PrimitiveTypeTraits<T>::CppType& x) {
1530
        if constexpr (T == TYPE_ARRAY) {
1531
            throw doris::Exception(ErrorCode::INVALID_ARGUMENT, "Array type is not supported");
1532
        } else if constexpr (T == TYPE_NULL) {
1533
            have_nulls = true;
1534
            return 1;
1535
2
        } else {
1536
2
            type = T;
1537
2
            return 1;
1538
2
        }
1539
2
    }
_ZN5doris12variant_util30SimpleFieldVisitorToScalarType5applyILNS_13PrimitiveTypeE4EEEmRKNS_19PrimitiveTypeTraitsIXT_EE7CppTypeE
Line
Count
Source
1529
7
    size_t apply(const typename PrimitiveTypeTraits<T>::CppType& x) {
1530
        if constexpr (T == TYPE_ARRAY) {
1531
            throw doris::Exception(ErrorCode::INVALID_ARGUMENT, "Array type is not supported");
1532
        } else if constexpr (T == TYPE_NULL) {
1533
            have_nulls = true;
1534
            return 1;
1535
7
        } else {
1536
7
            type = T;
1537
7
            return 1;
1538
7
        }
1539
7
    }
_ZN5doris12variant_util30SimpleFieldVisitorToScalarType5applyILNS_13PrimitiveTypeE5EEEmRKNS_19PrimitiveTypeTraitsIXT_EE7CppTypeE
Line
Count
Source
1529
570
    size_t apply(const typename PrimitiveTypeTraits<T>::CppType& x) {
1530
        if constexpr (T == TYPE_ARRAY) {
1531
            throw doris::Exception(ErrorCode::INVALID_ARGUMENT, "Array type is not supported");
1532
        } else if constexpr (T == TYPE_NULL) {
1533
            have_nulls = true;
1534
            return 1;
1535
570
        } else {
1536
570
            type = T;
1537
570
            return 1;
1538
570
        }
1539
570
    }
_ZN5doris12variant_util30SimpleFieldVisitorToScalarType5applyILNS_13PrimitiveTypeE6EEEmRKNS_19PrimitiveTypeTraitsIXT_EE7CppTypeE
Line
Count
Source
1529
4.94M
    size_t apply(const typename PrimitiveTypeTraits<T>::CppType& x) {
1530
        if constexpr (T == TYPE_ARRAY) {
1531
            throw doris::Exception(ErrorCode::INVALID_ARGUMENT, "Array type is not supported");
1532
        } else if constexpr (T == TYPE_NULL) {
1533
            have_nulls = true;
1534
            return 1;
1535
4.94M
        } else {
1536
4.94M
            type = T;
1537
4.94M
            return 1;
1538
4.94M
        }
1539
4.94M
    }
Unexecuted instantiation: _ZN5doris12variant_util30SimpleFieldVisitorToScalarType5applyILNS_13PrimitiveTypeE38EEEmRKNS_19PrimitiveTypeTraitsIXT_EE7CppTypeE
Unexecuted instantiation: _ZN5doris12variant_util30SimpleFieldVisitorToScalarType5applyILNS_13PrimitiveTypeE39EEEmRKNS_19PrimitiveTypeTraitsIXT_EE7CppTypeE
_ZN5doris12variant_util30SimpleFieldVisitorToScalarType5applyILNS_13PrimitiveTypeE8EEEmRKNS_19PrimitiveTypeTraitsIXT_EE7CppTypeE
Line
Count
Source
1529
1
    size_t apply(const typename PrimitiveTypeTraits<T>::CppType& x) {
1530
        if constexpr (T == TYPE_ARRAY) {
1531
            throw doris::Exception(ErrorCode::INVALID_ARGUMENT, "Array type is not supported");
1532
        } else if constexpr (T == TYPE_NULL) {
1533
            have_nulls = true;
1534
            return 1;
1535
1
        } else {
1536
1
            type = T;
1537
1
            return 1;
1538
1
        }
1539
1
    }
Unexecuted instantiation: _ZN5doris12variant_util30SimpleFieldVisitorToScalarType5applyILNS_13PrimitiveTypeE27EEEmRKNS_19PrimitiveTypeTraitsIXT_EE7CppTypeE
_ZN5doris12variant_util30SimpleFieldVisitorToScalarType5applyILNS_13PrimitiveTypeE9EEEmRKNS_19PrimitiveTypeTraitsIXT_EE7CppTypeE
Line
Count
Source
1529
2.76M
    size_t apply(const typename PrimitiveTypeTraits<T>::CppType& x) {
1530
        if constexpr (T == TYPE_ARRAY) {
1531
            throw doris::Exception(ErrorCode::INVALID_ARGUMENT, "Array type is not supported");
1532
        } else if constexpr (T == TYPE_NULL) {
1533
            have_nulls = true;
1534
            return 1;
1535
2.76M
        } else {
1536
2.76M
            type = T;
1537
2.76M
            return 1;
1538
2.76M
        }
1539
2.76M
    }
Unexecuted instantiation: _ZN5doris12variant_util30SimpleFieldVisitorToScalarType5applyILNS_13PrimitiveTypeE36EEEmRKNS_19PrimitiveTypeTraitsIXT_EE7CppTypeE
Unexecuted instantiation: _ZN5doris12variant_util30SimpleFieldVisitorToScalarType5applyILNS_13PrimitiveTypeE37EEEmRKNS_19PrimitiveTypeTraitsIXT_EE7CppTypeE
_ZN5doris12variant_util30SimpleFieldVisitorToScalarType5applyILNS_13PrimitiveTypeE23EEEmRKNS_19PrimitiveTypeTraitsIXT_EE7CppTypeE
Line
Count
Source
1529
10.5M
    size_t apply(const typename PrimitiveTypeTraits<T>::CppType& x) {
1530
        if constexpr (T == TYPE_ARRAY) {
1531
            throw doris::Exception(ErrorCode::INVALID_ARGUMENT, "Array type is not supported");
1532
        } else if constexpr (T == TYPE_NULL) {
1533
            have_nulls = true;
1534
            return 1;
1535
10.5M
        } else {
1536
10.5M
            type = T;
1537
10.5M
            return 1;
1538
10.5M
        }
1539
10.5M
    }
Unexecuted instantiation: _ZN5doris12variant_util30SimpleFieldVisitorToScalarType5applyILNS_13PrimitiveTypeE15EEEmRKNS_19PrimitiveTypeTraitsIXT_EE7CppTypeE
Unexecuted instantiation: _ZN5doris12variant_util30SimpleFieldVisitorToScalarType5applyILNS_13PrimitiveTypeE10EEEmRKNS_19PrimitiveTypeTraitsIXT_EE7CppTypeE
Unexecuted instantiation: _ZN5doris12variant_util30SimpleFieldVisitorToScalarType5applyILNS_13PrimitiveTypeE41EEEmRKNS_19PrimitiveTypeTraitsIXT_EE7CppTypeE
Unexecuted instantiation: _ZN5doris12variant_util30SimpleFieldVisitorToScalarType5applyILNS_13PrimitiveTypeE17EEEmRKNS_19PrimitiveTypeTraitsIXT_EE7CppTypeE
Unexecuted instantiation: _ZN5doris12variant_util30SimpleFieldVisitorToScalarType5applyILNS_13PrimitiveTypeE16EEEmRKNS_19PrimitiveTypeTraitsIXT_EE7CppTypeE
Unexecuted instantiation: _ZN5doris12variant_util30SimpleFieldVisitorToScalarType5applyILNS_13PrimitiveTypeE18EEEmRKNS_19PrimitiveTypeTraitsIXT_EE7CppTypeE
Unexecuted instantiation: _ZN5doris12variant_util30SimpleFieldVisitorToScalarType5applyILNS_13PrimitiveTypeE32EEEmRKNS_19PrimitiveTypeTraitsIXT_EE7CppTypeE
Unexecuted instantiation: _ZN5doris12variant_util30SimpleFieldVisitorToScalarType5applyILNS_13PrimitiveTypeE28EEEmRKNS_19PrimitiveTypeTraitsIXT_EE7CppTypeE
Unexecuted instantiation: _ZN5doris12variant_util30SimpleFieldVisitorToScalarType5applyILNS_13PrimitiveTypeE29EEEmRKNS_19PrimitiveTypeTraitsIXT_EE7CppTypeE
Unexecuted instantiation: _ZN5doris12variant_util30SimpleFieldVisitorToScalarType5applyILNS_13PrimitiveTypeE20EEEmRKNS_19PrimitiveTypeTraitsIXT_EE7CppTypeE
Unexecuted instantiation: _ZN5doris12variant_util30SimpleFieldVisitorToScalarType5applyILNS_13PrimitiveTypeE30EEEmRKNS_19PrimitiveTypeTraitsIXT_EE7CppTypeE
Unexecuted instantiation: _ZN5doris12variant_util30SimpleFieldVisitorToScalarType5applyILNS_13PrimitiveTypeE35EEEmRKNS_19PrimitiveTypeTraitsIXT_EE7CppTypeE
Unexecuted instantiation: _ZN5doris12variant_util30SimpleFieldVisitorToScalarType5applyILNS_13PrimitiveTypeE22EEEmRKNS_19PrimitiveTypeTraitsIXT_EE7CppTypeE
Unexecuted instantiation: _ZN5doris12variant_util30SimpleFieldVisitorToScalarType5applyILNS_13PrimitiveTypeE19EEEmRKNS_19PrimitiveTypeTraitsIXT_EE7CppTypeE
Unexecuted instantiation: _ZN5doris12variant_util30SimpleFieldVisitorToScalarType5applyILNS_13PrimitiveTypeE24EEEmRKNS_19PrimitiveTypeTraitsIXT_EE7CppTypeE
_ZN5doris12variant_util30SimpleFieldVisitorToScalarType5applyILNS_13PrimitiveTypeE31EEEmRKNS_19PrimitiveTypeTraitsIXT_EE7CppTypeE
Line
Count
Source
1529
46.8k
    size_t apply(const typename PrimitiveTypeTraits<T>::CppType& x) {
1530
        if constexpr (T == TYPE_ARRAY) {
1531
            throw doris::Exception(ErrorCode::INVALID_ARGUMENT, "Array type is not supported");
1532
        } else if constexpr (T == TYPE_NULL) {
1533
            have_nulls = true;
1534
            return 1;
1535
46.8k
        } else {
1536
46.8k
            type = T;
1537
46.8k
            return 1;
1538
46.8k
        }
1539
46.8k
    }
1540
18.3M
    void get_scalar_type(PrimitiveType* data_type) const { *data_type = type; }
1541
18.3M
    bool contain_nulls() const { return have_nulls; }
1542
1543
18.3M
    bool need_convert_field() const { return false; }
1544
1545
private:
1546
    PrimitiveType type = PrimitiveType::INVALID_TYPE;
1547
    bool have_nulls = false;
1548
};
1549
1550
/// Visitor that allows to get type of scalar field
1551
/// or least common type of scalars in array.
1552
/// More optimized version of FieldToDataType.
1553
class FieldVisitorToScalarType : public StaticVisitor<size_t> {
1554
public:
1555
    template <PrimitiveType T>
1556
4.66M
    size_t apply(const typename PrimitiveTypeTraits<T>::CppType& x) {
1557
4.66M
        if constexpr (T == TYPE_ARRAY) {
1558
2.17M
            size_t size = x.size();
1559
5.28M
            for (size_t i = 0; i < size; ++i) {
1560
3.10M
                apply_visitor(*this, x[i]);
1561
3.10M
            }
1562
2.17M
            return 0;
1563
2.17M
        } else if constexpr (T == TYPE_NULL) {
1564
12.2k
            have_nulls = true;
1565
12.2k
            return 0;
1566
2.47M
        } else {
1567
2.47M
            field_types.insert(T);
1568
2.47M
            type_indexes.insert(T);
1569
2.47M
            return 0;
1570
2.47M
        }
1571
4.66M
    }
_ZN5doris12variant_util24FieldVisitorToScalarType5applyILNS_13PrimitiveTypeE1EEEmRKNS_19PrimitiveTypeTraitsIXT_EE7CppTypeE
Line
Count
Source
1556
12.2k
    size_t apply(const typename PrimitiveTypeTraits<T>::CppType& x) {
1557
        if constexpr (T == TYPE_ARRAY) {
1558
            size_t size = x.size();
1559
            for (size_t i = 0; i < size; ++i) {
1560
                apply_visitor(*this, x[i]);
1561
            }
1562
            return 0;
1563
12.2k
        } else if constexpr (T == TYPE_NULL) {
1564
12.2k
            have_nulls = true;
1565
12.2k
            return 0;
1566
        } else {
1567
            field_types.insert(T);
1568
            type_indexes.insert(T);
1569
            return 0;
1570
        }
1571
12.2k
    }
_ZN5doris12variant_util24FieldVisitorToScalarType5applyILNS_13PrimitiveTypeE26EEEmRKNS_19PrimitiveTypeTraitsIXT_EE7CppTypeE
Line
Count
Source
1556
480
    size_t apply(const typename PrimitiveTypeTraits<T>::CppType& x) {
1557
        if constexpr (T == TYPE_ARRAY) {
1558
            size_t size = x.size();
1559
            for (size_t i = 0; i < size; ++i) {
1560
                apply_visitor(*this, x[i]);
1561
            }
1562
            return 0;
1563
        } else if constexpr (T == TYPE_NULL) {
1564
            have_nulls = true;
1565
            return 0;
1566
480
        } else {
1567
480
            field_types.insert(T);
1568
480
            type_indexes.insert(T);
1569
480
            return 0;
1570
480
        }
1571
480
    }
Unexecuted instantiation: _ZN5doris12variant_util24FieldVisitorToScalarType5applyILNS_13PrimitiveTypeE42EEEmRKNS_19PrimitiveTypeTraitsIXT_EE7CppTypeE
_ZN5doris12variant_util24FieldVisitorToScalarType5applyILNS_13PrimitiveTypeE7EEEmRKNS_19PrimitiveTypeTraitsIXT_EE7CppTypeE
Line
Count
Source
1556
29.6k
    size_t apply(const typename PrimitiveTypeTraits<T>::CppType& x) {
1557
        if constexpr (T == TYPE_ARRAY) {
1558
            size_t size = x.size();
1559
            for (size_t i = 0; i < size; ++i) {
1560
                apply_visitor(*this, x[i]);
1561
            }
1562
            return 0;
1563
        } else if constexpr (T == TYPE_NULL) {
1564
            have_nulls = true;
1565
            return 0;
1566
29.6k
        } else {
1567
29.6k
            field_types.insert(T);
1568
29.6k
            type_indexes.insert(T);
1569
29.6k
            return 0;
1570
29.6k
        }
1571
29.6k
    }
Unexecuted instantiation: _ZN5doris12variant_util24FieldVisitorToScalarType5applyILNS_13PrimitiveTypeE12EEEmRKNS_19PrimitiveTypeTraitsIXT_EE7CppTypeE
Unexecuted instantiation: _ZN5doris12variant_util24FieldVisitorToScalarType5applyILNS_13PrimitiveTypeE11EEEmRKNS_19PrimitiveTypeTraitsIXT_EE7CppTypeE
_ZN5doris12variant_util24FieldVisitorToScalarType5applyILNS_13PrimitiveTypeE25EEEmRKNS_19PrimitiveTypeTraitsIXT_EE7CppTypeE
Line
Count
Source
1556
396
    size_t apply(const typename PrimitiveTypeTraits<T>::CppType& x) {
1557
        if constexpr (T == TYPE_ARRAY) {
1558
            size_t size = x.size();
1559
            for (size_t i = 0; i < size; ++i) {
1560
                apply_visitor(*this, x[i]);
1561
            }
1562
            return 0;
1563
        } else if constexpr (T == TYPE_NULL) {
1564
            have_nulls = true;
1565
            return 0;
1566
396
        } else {
1567
396
            field_types.insert(T);
1568
396
            type_indexes.insert(T);
1569
396
            return 0;
1570
396
        }
1571
396
    }
_ZN5doris12variant_util24FieldVisitorToScalarType5applyILNS_13PrimitiveTypeE2EEEmRKNS_19PrimitiveTypeTraitsIXT_EE7CppTypeE
Line
Count
Source
1556
58.3k
    size_t apply(const typename PrimitiveTypeTraits<T>::CppType& x) {
1557
        if constexpr (T == TYPE_ARRAY) {
1558
            size_t size = x.size();
1559
            for (size_t i = 0; i < size; ++i) {
1560
                apply_visitor(*this, x[i]);
1561
            }
1562
            return 0;
1563
        } else if constexpr (T == TYPE_NULL) {
1564
            have_nulls = true;
1565
            return 0;
1566
58.3k
        } else {
1567
58.3k
            field_types.insert(T);
1568
58.3k
            type_indexes.insert(T);
1569
58.3k
            return 0;
1570
58.3k
        }
1571
58.3k
    }
_ZN5doris12variant_util24FieldVisitorToScalarType5applyILNS_13PrimitiveTypeE3EEEmRKNS_19PrimitiveTypeTraitsIXT_EE7CppTypeE
Line
Count
Source
1556
1.03k
    size_t apply(const typename PrimitiveTypeTraits<T>::CppType& x) {
1557
        if constexpr (T == TYPE_ARRAY) {
1558
            size_t size = x.size();
1559
            for (size_t i = 0; i < size; ++i) {
1560
                apply_visitor(*this, x[i]);
1561
            }
1562
            return 0;
1563
        } else if constexpr (T == TYPE_NULL) {
1564
            have_nulls = true;
1565
            return 0;
1566
1.03k
        } else {
1567
1.03k
            field_types.insert(T);
1568
1.03k
            type_indexes.insert(T);
1569
1.03k
            return 0;
1570
1.03k
        }
1571
1.03k
    }
_ZN5doris12variant_util24FieldVisitorToScalarType5applyILNS_13PrimitiveTypeE4EEEmRKNS_19PrimitiveTypeTraitsIXT_EE7CppTypeE
Line
Count
Source
1556
1.02k
    size_t apply(const typename PrimitiveTypeTraits<T>::CppType& x) {
1557
        if constexpr (T == TYPE_ARRAY) {
1558
            size_t size = x.size();
1559
            for (size_t i = 0; i < size; ++i) {
1560
                apply_visitor(*this, x[i]);
1561
            }
1562
            return 0;
1563
        } else if constexpr (T == TYPE_NULL) {
1564
            have_nulls = true;
1565
            return 0;
1566
1.02k
        } else {
1567
1.02k
            field_types.insert(T);
1568
1.02k
            type_indexes.insert(T);
1569
1.02k
            return 0;
1570
1.02k
        }
1571
1.02k
    }
_ZN5doris12variant_util24FieldVisitorToScalarType5applyILNS_13PrimitiveTypeE5EEEmRKNS_19PrimitiveTypeTraitsIXT_EE7CppTypeE
Line
Count
Source
1556
1.38k
    size_t apply(const typename PrimitiveTypeTraits<T>::CppType& x) {
1557
        if constexpr (T == TYPE_ARRAY) {
1558
            size_t size = x.size();
1559
            for (size_t i = 0; i < size; ++i) {
1560
                apply_visitor(*this, x[i]);
1561
            }
1562
            return 0;
1563
        } else if constexpr (T == TYPE_NULL) {
1564
            have_nulls = true;
1565
            return 0;
1566
1.38k
        } else {
1567
1.38k
            field_types.insert(T);
1568
1.38k
            type_indexes.insert(T);
1569
1.38k
            return 0;
1570
1.38k
        }
1571
1.38k
    }
_ZN5doris12variant_util24FieldVisitorToScalarType5applyILNS_13PrimitiveTypeE6EEEmRKNS_19PrimitiveTypeTraitsIXT_EE7CppTypeE
Line
Count
Source
1556
1.35M
    size_t apply(const typename PrimitiveTypeTraits<T>::CppType& x) {
1557
        if constexpr (T == TYPE_ARRAY) {
1558
            size_t size = x.size();
1559
            for (size_t i = 0; i < size; ++i) {
1560
                apply_visitor(*this, x[i]);
1561
            }
1562
            return 0;
1563
        } else if constexpr (T == TYPE_NULL) {
1564
            have_nulls = true;
1565
            return 0;
1566
1.35M
        } else {
1567
1.35M
            field_types.insert(T);
1568
1.35M
            type_indexes.insert(T);
1569
1.35M
            return 0;
1570
1.35M
        }
1571
1.35M
    }
Unexecuted instantiation: _ZN5doris12variant_util24FieldVisitorToScalarType5applyILNS_13PrimitiveTypeE38EEEmRKNS_19PrimitiveTypeTraitsIXT_EE7CppTypeE
Unexecuted instantiation: _ZN5doris12variant_util24FieldVisitorToScalarType5applyILNS_13PrimitiveTypeE39EEEmRKNS_19PrimitiveTypeTraitsIXT_EE7CppTypeE
_ZN5doris12variant_util24FieldVisitorToScalarType5applyILNS_13PrimitiveTypeE8EEEmRKNS_19PrimitiveTypeTraitsIXT_EE7CppTypeE
Line
Count
Source
1556
858
    size_t apply(const typename PrimitiveTypeTraits<T>::CppType& x) {
1557
        if constexpr (T == TYPE_ARRAY) {
1558
            size_t size = x.size();
1559
            for (size_t i = 0; i < size; ++i) {
1560
                apply_visitor(*this, x[i]);
1561
            }
1562
            return 0;
1563
        } else if constexpr (T == TYPE_NULL) {
1564
            have_nulls = true;
1565
            return 0;
1566
858
        } else {
1567
858
            field_types.insert(T);
1568
858
            type_indexes.insert(T);
1569
858
            return 0;
1570
858
        }
1571
858
    }
Unexecuted instantiation: _ZN5doris12variant_util24FieldVisitorToScalarType5applyILNS_13PrimitiveTypeE27EEEmRKNS_19PrimitiveTypeTraitsIXT_EE7CppTypeE
_ZN5doris12variant_util24FieldVisitorToScalarType5applyILNS_13PrimitiveTypeE9EEEmRKNS_19PrimitiveTypeTraitsIXT_EE7CppTypeE
Line
Count
Source
1556
183k
    size_t apply(const typename PrimitiveTypeTraits<T>::CppType& x) {
1557
        if constexpr (T == TYPE_ARRAY) {
1558
            size_t size = x.size();
1559
            for (size_t i = 0; i < size; ++i) {
1560
                apply_visitor(*this, x[i]);
1561
            }
1562
            return 0;
1563
        } else if constexpr (T == TYPE_NULL) {
1564
            have_nulls = true;
1565
            return 0;
1566
183k
        } else {
1567
183k
            field_types.insert(T);
1568
183k
            type_indexes.insert(T);
1569
183k
            return 0;
1570
183k
        }
1571
183k
    }
_ZN5doris12variant_util24FieldVisitorToScalarType5applyILNS_13PrimitiveTypeE36EEEmRKNS_19PrimitiveTypeTraitsIXT_EE7CppTypeE
Line
Count
Source
1556
306
    size_t apply(const typename PrimitiveTypeTraits<T>::CppType& x) {
1557
        if constexpr (T == TYPE_ARRAY) {
1558
            size_t size = x.size();
1559
            for (size_t i = 0; i < size; ++i) {
1560
                apply_visitor(*this, x[i]);
1561
            }
1562
            return 0;
1563
        } else if constexpr (T == TYPE_NULL) {
1564
            have_nulls = true;
1565
            return 0;
1566
306
        } else {
1567
306
            field_types.insert(T);
1568
306
            type_indexes.insert(T);
1569
306
            return 0;
1570
306
        }
1571
306
    }
_ZN5doris12variant_util24FieldVisitorToScalarType5applyILNS_13PrimitiveTypeE37EEEmRKNS_19PrimitiveTypeTraitsIXT_EE7CppTypeE
Line
Count
Source
1556
304
    size_t apply(const typename PrimitiveTypeTraits<T>::CppType& x) {
1557
        if constexpr (T == TYPE_ARRAY) {
1558
            size_t size = x.size();
1559
            for (size_t i = 0; i < size; ++i) {
1560
                apply_visitor(*this, x[i]);
1561
            }
1562
            return 0;
1563
        } else if constexpr (T == TYPE_NULL) {
1564
            have_nulls = true;
1565
            return 0;
1566
304
        } else {
1567
304
            field_types.insert(T);
1568
304
            type_indexes.insert(T);
1569
304
            return 0;
1570
304
        }
1571
304
    }
_ZN5doris12variant_util24FieldVisitorToScalarType5applyILNS_13PrimitiveTypeE23EEEmRKNS_19PrimitiveTypeTraitsIXT_EE7CppTypeE
Line
Count
Source
1556
759k
    size_t apply(const typename PrimitiveTypeTraits<T>::CppType& x) {
1557
        if constexpr (T == TYPE_ARRAY) {
1558
            size_t size = x.size();
1559
            for (size_t i = 0; i < size; ++i) {
1560
                apply_visitor(*this, x[i]);
1561
            }
1562
            return 0;
1563
        } else if constexpr (T == TYPE_NULL) {
1564
            have_nulls = true;
1565
            return 0;
1566
759k
        } else {
1567
759k
            field_types.insert(T);
1568
759k
            type_indexes.insert(T);
1569
759k
            return 0;
1570
759k
        }
1571
759k
    }
Unexecuted instantiation: _ZN5doris12variant_util24FieldVisitorToScalarType5applyILNS_13PrimitiveTypeE15EEEmRKNS_19PrimitiveTypeTraitsIXT_EE7CppTypeE
Unexecuted instantiation: _ZN5doris12variant_util24FieldVisitorToScalarType5applyILNS_13PrimitiveTypeE10EEEmRKNS_19PrimitiveTypeTraitsIXT_EE7CppTypeE
Unexecuted instantiation: _ZN5doris12variant_util24FieldVisitorToScalarType5applyILNS_13PrimitiveTypeE41EEEmRKNS_19PrimitiveTypeTraitsIXT_EE7CppTypeE
_ZN5doris12variant_util24FieldVisitorToScalarType5applyILNS_13PrimitiveTypeE17EEEmRKNS_19PrimitiveTypeTraitsIXT_EE7CppTypeE
Line
Count
Source
1556
2.17M
    size_t apply(const typename PrimitiveTypeTraits<T>::CppType& x) {
1557
2.17M
        if constexpr (T == TYPE_ARRAY) {
1558
2.17M
            size_t size = x.size();
1559
5.28M
            for (size_t i = 0; i < size; ++i) {
1560
3.10M
                apply_visitor(*this, x[i]);
1561
3.10M
            }
1562
2.17M
            return 0;
1563
        } else if constexpr (T == TYPE_NULL) {
1564
            have_nulls = true;
1565
            return 0;
1566
        } else {
1567
            field_types.insert(T);
1568
            type_indexes.insert(T);
1569
            return 0;
1570
        }
1571
2.17M
    }
_ZN5doris12variant_util24FieldVisitorToScalarType5applyILNS_13PrimitiveTypeE16EEEmRKNS_19PrimitiveTypeTraitsIXT_EE7CppTypeE
Line
Count
Source
1556
1
    size_t apply(const typename PrimitiveTypeTraits<T>::CppType& x) {
1557
        if constexpr (T == TYPE_ARRAY) {
1558
            size_t size = x.size();
1559
            for (size_t i = 0; i < size; ++i) {
1560
                apply_visitor(*this, x[i]);
1561
            }
1562
            return 0;
1563
        } else if constexpr (T == TYPE_NULL) {
1564
            have_nulls = true;
1565
            return 0;
1566
1
        } else {
1567
1
            field_types.insert(T);
1568
1
            type_indexes.insert(T);
1569
1
            return 0;
1570
1
        }
1571
1
    }
Unexecuted instantiation: _ZN5doris12variant_util24FieldVisitorToScalarType5applyILNS_13PrimitiveTypeE18EEEmRKNS_19PrimitiveTypeTraitsIXT_EE7CppTypeE
_ZN5doris12variant_util24FieldVisitorToScalarType5applyILNS_13PrimitiveTypeE32EEEmRKNS_19PrimitiveTypeTraitsIXT_EE7CppTypeE
Line
Count
Source
1556
1
    size_t apply(const typename PrimitiveTypeTraits<T>::CppType& x) {
1557
        if constexpr (T == TYPE_ARRAY) {
1558
            size_t size = x.size();
1559
            for (size_t i = 0; i < size; ++i) {
1560
                apply_visitor(*this, x[i]);
1561
            }
1562
            return 0;
1563
        } else if constexpr (T == TYPE_NULL) {
1564
            have_nulls = true;
1565
            return 0;
1566
1
        } else {
1567
1
            field_types.insert(T);
1568
1
            type_indexes.insert(T);
1569
1
            return 0;
1570
1
        }
1571
1
    }
_ZN5doris12variant_util24FieldVisitorToScalarType5applyILNS_13PrimitiveTypeE28EEEmRKNS_19PrimitiveTypeTraitsIXT_EE7CppTypeE
Line
Count
Source
1556
756
    size_t apply(const typename PrimitiveTypeTraits<T>::CppType& x) {
1557
        if constexpr (T == TYPE_ARRAY) {
1558
            size_t size = x.size();
1559
            for (size_t i = 0; i < size; ++i) {
1560
                apply_visitor(*this, x[i]);
1561
            }
1562
            return 0;
1563
        } else if constexpr (T == TYPE_NULL) {
1564
            have_nulls = true;
1565
            return 0;
1566
756
        } else {
1567
756
            field_types.insert(T);
1568
756
            type_indexes.insert(T);
1569
756
            return 0;
1570
756
        }
1571
756
    }
_ZN5doris12variant_util24FieldVisitorToScalarType5applyILNS_13PrimitiveTypeE29EEEmRKNS_19PrimitiveTypeTraitsIXT_EE7CppTypeE
Line
Count
Source
1556
696
    size_t apply(const typename PrimitiveTypeTraits<T>::CppType& x) {
1557
        if constexpr (T == TYPE_ARRAY) {
1558
            size_t size = x.size();
1559
            for (size_t i = 0; i < size; ++i) {
1560
                apply_visitor(*this, x[i]);
1561
            }
1562
            return 0;
1563
        } else if constexpr (T == TYPE_NULL) {
1564
            have_nulls = true;
1565
            return 0;
1566
696
        } else {
1567
696
            field_types.insert(T);
1568
696
            type_indexes.insert(T);
1569
696
            return 0;
1570
696
        }
1571
696
    }
Unexecuted instantiation: _ZN5doris12variant_util24FieldVisitorToScalarType5applyILNS_13PrimitiveTypeE20EEEmRKNS_19PrimitiveTypeTraitsIXT_EE7CppTypeE
_ZN5doris12variant_util24FieldVisitorToScalarType5applyILNS_13PrimitiveTypeE30EEEmRKNS_19PrimitiveTypeTraitsIXT_EE7CppTypeE
Line
Count
Source
1556
80.9k
    size_t apply(const typename PrimitiveTypeTraits<T>::CppType& x) {
1557
        if constexpr (T == TYPE_ARRAY) {
1558
            size_t size = x.size();
1559
            for (size_t i = 0; i < size; ++i) {
1560
                apply_visitor(*this, x[i]);
1561
            }
1562
            return 0;
1563
        } else if constexpr (T == TYPE_NULL) {
1564
            have_nulls = true;
1565
            return 0;
1566
80.9k
        } else {
1567
80.9k
            field_types.insert(T);
1568
80.9k
            type_indexes.insert(T);
1569
80.9k
            return 0;
1570
80.9k
        }
1571
80.9k
    }
_ZN5doris12variant_util24FieldVisitorToScalarType5applyILNS_13PrimitiveTypeE35EEEmRKNS_19PrimitiveTypeTraitsIXT_EE7CppTypeE
Line
Count
Source
1556
558
    size_t apply(const typename PrimitiveTypeTraits<T>::CppType& x) {
1557
        if constexpr (T == TYPE_ARRAY) {
1558
            size_t size = x.size();
1559
            for (size_t i = 0; i < size; ++i) {
1560
                apply_visitor(*this, x[i]);
1561
            }
1562
            return 0;
1563
        } else if constexpr (T == TYPE_NULL) {
1564
            have_nulls = true;
1565
            return 0;
1566
558
        } else {
1567
558
            field_types.insert(T);
1568
558
            type_indexes.insert(T);
1569
558
            return 0;
1570
558
        }
1571
558
    }
Unexecuted instantiation: _ZN5doris12variant_util24FieldVisitorToScalarType5applyILNS_13PrimitiveTypeE22EEEmRKNS_19PrimitiveTypeTraitsIXT_EE7CppTypeE
Unexecuted instantiation: _ZN5doris12variant_util24FieldVisitorToScalarType5applyILNS_13PrimitiveTypeE19EEEmRKNS_19PrimitiveTypeTraitsIXT_EE7CppTypeE
Unexecuted instantiation: _ZN5doris12variant_util24FieldVisitorToScalarType5applyILNS_13PrimitiveTypeE24EEEmRKNS_19PrimitiveTypeTraitsIXT_EE7CppTypeE
_ZN5doris12variant_util24FieldVisitorToScalarType5applyILNS_13PrimitiveTypeE31EEEmRKNS_19PrimitiveTypeTraitsIXT_EE7CppTypeE
Line
Count
Source
1556
44
    size_t apply(const typename PrimitiveTypeTraits<T>::CppType& x) {
1557
        if constexpr (T == TYPE_ARRAY) {
1558
            size_t size = x.size();
1559
            for (size_t i = 0; i < size; ++i) {
1560
                apply_visitor(*this, x[i]);
1561
            }
1562
            return 0;
1563
        } else if constexpr (T == TYPE_NULL) {
1564
            have_nulls = true;
1565
            return 0;
1566
44
        } else {
1567
44
            field_types.insert(T);
1568
44
            type_indexes.insert(T);
1569
44
            return 0;
1570
44
        }
1571
44
    }
1572
1.56M
    void get_scalar_type(PrimitiveType* type) const {
1573
1.56M
        if (type_indexes.size() == 1) {
1574
            // Most cases will have only one type
1575
1.48M
            *type = *type_indexes.begin();
1576
1.48M
            return;
1577
1.48M
        }
1578
74.1k
        DataTypePtr data_type;
1579
74.1k
        get_least_supertype_jsonb(type_indexes, &data_type);
1580
74.1k
        *type = data_type->get_primitive_type();
1581
74.1k
    }
1582
1.56M
    bool contain_nulls() const { return have_nulls; }
1583
1.56M
    bool need_convert_field() const { return field_types.size() > 1; }
1584
1585
private:
1586
    phmap::flat_hash_set<PrimitiveType> type_indexes;
1587
    phmap::flat_hash_set<PrimitiveType> field_types;
1588
    bool have_nulls = false;
1589
};
1590
1591
template <typename Visitor>
1592
20.2M
void get_field_info_impl(const Field& field, FieldInfo* info) {
1593
20.2M
    Visitor to_scalar_type_visitor;
1594
20.2M
    apply_visitor(to_scalar_type_visitor, field);
1595
20.2M
    PrimitiveType type_id;
1596
20.2M
    to_scalar_type_visitor.get_scalar_type(&type_id);
1597
    // array item's dimension may missmatch, eg. [1, 2, [1, 2, 3]]
1598
20.2M
    *info = {type_id, to_scalar_type_visitor.contain_nulls(),
1599
20.2M
             to_scalar_type_visitor.need_convert_field(),
1600
20.2M
             apply_visitor(FieldVisitorToNumberOfDimensions(), field)};
1601
20.2M
}
_ZN5doris12variant_util19get_field_info_implINS0_24FieldVisitorToScalarTypeEEEvRKNS_5FieldEPNS_9FieldInfoE
Line
Count
Source
1592
1.56M
void get_field_info_impl(const Field& field, FieldInfo* info) {
1593
1.56M
    Visitor to_scalar_type_visitor;
1594
1.56M
    apply_visitor(to_scalar_type_visitor, field);
1595
1.56M
    PrimitiveType type_id;
1596
1.56M
    to_scalar_type_visitor.get_scalar_type(&type_id);
1597
    // array item's dimension may missmatch, eg. [1, 2, [1, 2, 3]]
1598
1.56M
    *info = {type_id, to_scalar_type_visitor.contain_nulls(),
1599
1.56M
             to_scalar_type_visitor.need_convert_field(),
1600
1.56M
             apply_visitor(FieldVisitorToNumberOfDimensions(), field)};
1601
1.56M
}
_ZN5doris12variant_util19get_field_info_implINS0_30SimpleFieldVisitorToScalarTypeEEEvRKNS_5FieldEPNS_9FieldInfoE
Line
Count
Source
1592
18.7M
void get_field_info_impl(const Field& field, FieldInfo* info) {
1593
18.7M
    Visitor to_scalar_type_visitor;
1594
18.7M
    apply_visitor(to_scalar_type_visitor, field);
1595
18.7M
    PrimitiveType type_id;
1596
18.7M
    to_scalar_type_visitor.get_scalar_type(&type_id);
1597
    // array item's dimension may missmatch, eg. [1, 2, [1, 2, 3]]
1598
18.7M
    *info = {type_id, to_scalar_type_visitor.contain_nulls(),
1599
18.7M
             to_scalar_type_visitor.need_convert_field(),
1600
18.7M
             apply_visitor(FieldVisitorToNumberOfDimensions(), field)};
1601
18.7M
}
1602
1603
20.3M
void get_field_info(const Field& field, FieldInfo* info) {
1604
20.3M
    if (field.is_complex_field()) {
1605
1.56M
        get_field_info_impl<FieldVisitorToScalarType>(field, info);
1606
18.7M
    } else {
1607
18.7M
        get_field_info_impl<SimpleFieldVisitorToScalarType>(field, info);
1608
18.7M
    }
1609
20.3M
}
1610
1611
bool generate_sub_column_info(const TabletSchema& schema, int32_t col_unique_id,
1612
                              const std::string& path,
1613
258k
                              TabletSchema::SubColumnInfo* sub_column_info) {
1614
258k
    const auto& parent_column = schema.column_by_uid(col_unique_id);
1615
258k
    std::function<void(const TabletColumn&, TabletColumn*)> generate_result_column =
1616
258k
            [&](const TabletColumn& from_column, TabletColumn* to_column) {
1617
12.2k
                to_column->set_name(parent_column.name_lower_case() + "." + path);
1618
12.2k
                to_column->set_type(from_column.type());
1619
12.2k
                to_column->set_parent_unique_id(parent_column.unique_id());
1620
12.2k
                bool is_typed = !parent_column.variant_enable_typed_paths_to_sparse();
1621
12.2k
                to_column->set_path_info(
1622
12.2k
                        PathInData(parent_column.name_lower_case() + "." + path, is_typed));
1623
12.2k
                to_column->set_aggregation_method(parent_column.aggregation());
1624
12.2k
                to_column->set_is_nullable(true);
1625
12.2k
                to_column->set_parent_unique_id(parent_column.unique_id());
1626
12.2k
                if (from_column.is_decimal()) {
1627
12.1k
                    to_column->set_precision(from_column.precision());
1628
12.1k
                }
1629
12.2k
                to_column->set_frac(from_column.frac());
1630
1631
12.2k
                if (from_column.is_array_type()) {
1632
2.60k
                    TabletColumn nested_column;
1633
2.60k
                    generate_result_column(*from_column.get_sub_columns()[0], &nested_column);
1634
2.60k
                    to_column->add_sub_column(nested_column);
1635
2.60k
                }
1636
12.2k
            };
1637
1638
258k
    auto generate_index = [&](const std::string& pattern) {
1639
        // 1. find subcolumn's index
1640
9.60k
        if (const auto& indexes = schema.inverted_index_by_field_pattern(col_unique_id, pattern);
1641
9.60k
            !indexes.empty()) {
1642
3.59k
            for (const auto& index : indexes) {
1643
3.59k
                auto index_ptr = std::make_shared<TabletIndex>(*index);
1644
3.59k
                index_ptr->set_escaped_escaped_index_suffix_path(
1645
3.59k
                        sub_column_info->column.path_info_ptr()->get_path());
1646
3.59k
                sub_column_info->indexes.emplace_back(std::move(index_ptr));
1647
3.59k
            }
1648
3.53k
        }
1649
        // 2. find parent column's index
1650
6.06k
        else if (const auto parent_index = schema.inverted_indexs(col_unique_id);
1651
6.06k
                 !parent_index.empty()) {
1652
328
            inherit_index(parent_index, sub_column_info->indexes, sub_column_info->column);
1653
5.73k
        } else {
1654
5.73k
            sub_column_info->indexes.clear();
1655
5.73k
        }
1656
9.60k
    };
1657
1658
258k
    const auto& sub_columns = parent_column.get_sub_columns();
1659
258k
    for (const auto& sub_column : sub_columns) {
1660
155k
        const char* pattern = sub_column->name().c_str();
1661
155k
        switch (sub_column->pattern_type()) {
1662
4.59k
        case PatternTypePB::MATCH_NAME: {
1663
4.59k
            if (strcmp(pattern, path.c_str()) == 0) {
1664
1.13k
                generate_result_column(*sub_column, &sub_column_info->column);
1665
1.13k
                generate_index(sub_column->name());
1666
1.13k
                return true;
1667
1.13k
            }
1668
3.45k
            break;
1669
4.59k
        }
1670
150k
        case PatternTypePB::MATCH_NAME_GLOB: {
1671
150k
            if (glob_match_re2(pattern, path)) {
1672
8.46k
                generate_result_column(*sub_column, &sub_column_info->column);
1673
8.46k
                generate_index(sub_column->name());
1674
8.46k
                return true;
1675
8.46k
            }
1676
142k
            break;
1677
150k
        }
1678
142k
        default:
1679
0
            break;
1680
155k
        }
1681
155k
    }
1682
249k
    return false;
1683
258k
}
1684
1685
/// Builds a schema that extends each rowset's tablet schema with the variant
/// sub-columns found in the rowset's segments, then merges those per-segment
/// schemas through get_least_common_schema().
///
/// @param rowsets      rowsets to inspect; rowsets without segments are skipped
/// @param base_schema  schema passed to get_least_common_schema() and returned
///                     as the fallback on failure
/// @return nullptr when `rowsets` is empty; `base_schema` when segments cannot be
///         loaded or the merge fails; the merged schema otherwise
TabletSchemaSPtr VariantCompactionUtil::calculate_variant_extended_schema(
        const std::vector<RowsetSharedPtr>& rowsets, const TabletSchemaSPtr& base_schema) {
    if (rowsets.empty()) {
        return nullptr;
    }

    std::vector<TabletSchemaSPtr> schemas;
    for (const auto& rs : rowsets) {
        if (rs->num_segments() == 0) {
            continue;
        }
        const auto& tablet_schema = rs->tablet_schema();
        SegmentCacheHandle segment_cache;
        auto st = SegmentLoader::instance()->load_segments(std::static_pointer_cast<BetaRowset>(rs),
                                                           &segment_cache);
        if (!st.ok()) {
            // Logged like the other failures in this function instead of being dropped.
            LOG(WARNING) << "Failed to load segments for variant extended schema"
                         << " error: " << st.to_string();
            return base_schema;
        }
        for (const auto& segment : segment_cache.get_segments()) {
            TabletSchemaSPtr schema = tablet_schema->copy_without_variant_extracted_columns();
            for (const auto& column : tablet_schema->columns()) {
                if (!column->is_variant_type()) {
                    continue;
                }
                std::shared_ptr<ColumnReader> column_reader;
                OlapReaderStatistics stats;
                st = segment->get_column_reader(column->unique_id(), &column_reader, &stats);
                if (!st.ok()) {
                    LOG(WARNING) << "Failed to get column reader for column: " << column->name()
                                 << " error: " << st.to_string();
                    continue;
                }
                if (!column_reader) {
                    continue;
                }

                CHECK(column_reader->get_meta_type() == FieldType::OLAP_FIELD_TYPE_VARIANT);
                auto* variant_column_reader =
                        assert_cast<segment_v2::VariantColumnReader*>(column_reader.get());
                // load external meta before getting subcolumn meta info
                st = variant_column_reader->load_external_meta_once();
                if (!st.ok()) {
                    LOG(WARNING) << "Failed to load external meta for column: " << column->name()
                                 << " error: " << st.to_string();
                    continue;
                }
                const auto* subcolumn_meta_info = variant_column_reader->get_subcolumns_meta_info();
                for (const auto& entry : *subcolumn_meta_info) {
                    // Entries with an empty path are not appended as sub-columns.
                    if (entry->path.empty()) {
                        continue;
                    }
                    const std::string column_name =
                            column->name_lower_case() + "." + entry->path.get_path();
                    const DataTypePtr& data_type = entry->data.file_column_type;
                    PathInDataBuilder full_path_builder;
                    auto full_path = full_path_builder.append(column->name_lower_case(), false)
                                             .append(entry->path.get_parts(), false)
                                             .build();
                    TabletColumn subcolumn =
                            get_column_by_type(data_type, column_name,
                                               ExtraInfo {.unique_id = -1,
                                                          .parent_unique_id = column->unique_id(),
                                                          .path_info = full_path});
                    schema->append_column(subcolumn);
                }
            }
            schemas.emplace_back(schema);
        }
    }

    TabletSchemaSPtr least_common_schema;
    auto st = get_least_common_schema(schemas, base_schema, least_common_schema, false);
    if (!st.ok()) {
        LOG(WARNING) << "Failed to get least common schema for variant extended schema"
                     << " error: " << st.to_string();
        return base_schema;
    }
    return least_common_schema;
}
1761
1762
bool inherit_index(const std::vector<const TabletIndex*>& parent_indexes,
1763
                   TabletIndexes& subcolumns_indexes, FieldType column_type,
1764
85.1k
                   const std::string& suffix_path, bool is_array_nested_type) {
1765
85.1k
    if (parent_indexes.empty()) {
1766
76.2k
        return false;
1767
76.2k
    }
1768
8.97k
    subcolumns_indexes.clear();
1769
    // bkd index or array index only need to inherit one index
1770
8.97k
    if (field_is_numeric_type(column_type) ||
1771
8.97k
        (is_array_nested_type &&
1772
6.35k
         (field_is_numeric_type(column_type) || field_is_slice_type(column_type)))) {
1773
2.63k
        auto index_ptr = std::make_shared<TabletIndex>(*parent_indexes[0]);
1774
2.63k
        index_ptr->set_escaped_escaped_index_suffix_path(suffix_path);
1775
        // no need parse for bkd index or array index
1776
2.63k
        index_ptr->remove_parser_and_analyzer();
1777
2.63k
        subcolumns_indexes.emplace_back(std::move(index_ptr));
1778
2.63k
        return true;
1779
2.63k
    }
1780
    // string type need to inherit all indexes
1781
6.34k
    else if (field_is_slice_type(column_type) && !is_array_nested_type) {
1782
6.31k
        for (const auto& index : parent_indexes) {
1783
6.31k
            auto index_ptr = std::make_shared<TabletIndex>(*index);
1784
6.31k
            index_ptr->set_escaped_escaped_index_suffix_path(suffix_path);
1785
6.31k
            subcolumns_indexes.emplace_back(std::move(index_ptr));
1786
6.31k
        }
1787
6.29k
        return true;
1788
6.29k
    }
1789
49
    return false;
1790
8.97k
}
1791
1792
bool inherit_index(const std::vector<const TabletIndex*>& parent_indexes,
1793
85.1k
                   TabletIndexes& subcolumns_indexes, const TabletColumn& column) {
1794
85.1k
    if (!column.is_extracted_column()) {
1795
3
        return false;
1796
3
    }
1797
85.1k
    if (column.is_array_type()) {
1798
1.12k
        if (column.get_sub_columns().empty()) {
1799
0
            return false;
1800
0
        }
1801
1.12k
        const TabletColumn* nested = column.get_sub_columns()[0].get();
1802
1.12k
        while (nested != nullptr && nested->is_array_type()) {
1803
0
            if (nested->get_sub_columns().empty()) {
1804
0
                return false;
1805
0
            }
1806
0
            nested = nested->get_sub_columns()[0].get();
1807
0
        }
1808
1.12k
        if (nested == nullptr) {
1809
0
            return false;
1810
0
        }
1811
1.12k
        return inherit_index(parent_indexes, subcolumns_indexes, nested->type(),
1812
1.12k
                             column.path_info_ptr()->get_path(), true);
1813
1.12k
    }
1814
84.0k
    return inherit_index(parent_indexes, subcolumns_indexes, column.type(),
1815
84.0k
                         column.path_info_ptr()->get_path());
1816
85.1k
}
1817
1818
bool inherit_index(const std::vector<const TabletIndex*>& parent_indexes,
1819
0
                   TabletIndexes& subcolumns_indexes, const ColumnMetaPB& column_pb) {
1820
0
    if (!column_pb.has_column_path_info()) {
1821
0
        return false;
1822
0
    }
1823
0
    if (column_pb.type() == (int)FieldType::OLAP_FIELD_TYPE_ARRAY) {
1824
0
        if (column_pb.children_columns_size() == 0) {
1825
0
            return false;
1826
0
        }
1827
0
        const ColumnMetaPB* nested = &column_pb.children_columns(0);
1828
0
        while (nested != nullptr && nested->type() == (int)FieldType::OLAP_FIELD_TYPE_ARRAY) {
1829
0
            if (nested->children_columns_size() == 0) {
1830
0
                return false;
1831
0
            }
1832
0
            nested = &nested->children_columns(0);
1833
0
        }
1834
0
        if (nested == nullptr) {
1835
0
            return false;
1836
0
        }
1837
0
        return inherit_index(parent_indexes, subcolumns_indexes, (FieldType)nested->type(),
1838
0
                             column_pb.column_path_info().path(), true);
1839
0
    }
1840
0
    return inherit_index(parent_indexes, subcolumns_indexes, (FieldType)column_pb.type(),
1841
0
                         column_pb.column_path_info().path());
1842
0
}
1843
1844
// ============ Implementation from parse2column.cpp ============
1845
1846
/** Pool for objects that cannot be used from different threads simultaneously.
  * Allows to create an object for each thread.
  * Pool has unbounded size and objects are not destroyed before destruction of pool.
  *
  * Use it in cases when thread local storage is not appropriate
  *  (when maximum number of simultaneously used objects is less
  *   than number of running/sleeping threads, that has ever used object,
  *   and creation/destruction of objects is expensive).
  *
  * Every Pointer handed out by the pool returns its object to the pool on
  * destruction, so it must not outlive the pool it came from.
  */
template <typename T>
class SimpleObjectPool {
protected:
    /// Guards `stack`.
    std::mutex mutex;
    /// Holds all currently available objects.
    std::stack<std::unique_ptr<T>> stack;

    /// Specialized deleter for std::unique_ptr.
    /// Returns the underlying pointer back to the stack, thus reclaiming its ownership.
    struct Deleter {
        SimpleObjectPool<T>* parent;
        Deleter(SimpleObjectPool<T>* parent_ = nullptr) : parent {parent_} {} /// NOLINT
        void operator()(T* owning_ptr) const {
            if (parent == nullptr) {
                // Deleter not bound to a pool: there is nowhere to return the object to.
                delete owning_ptr;
                return;
            }
            std::lock_guard lock {parent->mutex};
            parent->stack.emplace(owning_ptr);
        }
    };

public:
    using Pointer = std::unique_ptr<T, Deleter>;

    /// Extracts and returns a pointer from the stack if it's not empty,
    ///  creates a new one by calling provided f() otherwise.
    /// f() runs without the pool mutex held, so a slow factory does not block
    /// other users and a factory that uses the pool itself cannot deadlock.
    /// @param f  callable returning an owning T*
    template <typename Factory>
    Pointer get(Factory&& f) {
        {
            std::lock_guard lock {mutex};
            if (!stack.empty()) {
                std::unique_ptr<T> pooled = std::move(stack.top());
                stack.pop();
                return Pointer(pooled.release(), Deleter(this));
            }
        }
        return Pointer(f(), Deleter(this));
    }

    /// Like get(), but creates object using default constructor.
    Pointer getDefault() {
        return get([] { return new T; });
    }
};
1891
1892
/// File-scope pool of JsonParser instances; the parse_json_to_variant() overloads
/// borrow a parser from it when the caller does not supply one.
SimpleObjectPool<JsonParser> parsers_pool;
1893
1894
/// Shorthand for ColumnVariant::Subcolumns::Node.
using Node = typename ColumnVariant::Subcolumns::Node;
1895
1896
6.49M
/// Appends `size` raw bytes starting at `data` to the end of `chars`.
/// @param chars  destination buffer; grown by `size` bytes
/// @param data   source bytes; may be null only when `size` is 0
/// @param size   number of bytes to append
static inline void append_binary_bytes(ColumnString::Chars& chars, const void* data, size_t size) {
    if (size == 0) {
        // Nothing to append. Also keeps memcpy away from a possibly null source,
        // which is undefined behaviour even for a zero-length copy.
        return;
    }
    const auto old_size = chars.size();
    chars.resize(old_size + size);
    memcpy(chars.data() + old_size, data, size);
}
1901
1902
2.50M
/// Appends `type` to `chars` as a single byte (the enum value cast to uint8_t).
static inline void append_binary_type(ColumnString::Chars& chars, FieldType type) {
    const auto type_tag = static_cast<uint8_t>(type);
    append_binary_bytes(chars, &type_tag, sizeof(type_tag));
}
1906
1907
1.81M
/// Appends the in-memory bytes of `v` (sizeof(size_t) of them) to `chars`.
static inline void append_binary_sizet(ColumnString::Chars& chars, size_t v) {
    const void* raw = &v;
    append_binary_bytes(chars, raw, sizeof(v));
}
1910
1911
2.50M
static void append_field_to_binary_chars(const Field& field, ColumnString::Chars& chars) {
1912
2.50M
    switch (field.get_type()) {
1913
1
    case PrimitiveType::TYPE_NULL: {
1914
1
        append_binary_type(chars, FieldType::OLAP_FIELD_TYPE_NONE);
1915
1
        return;
1916
0
    }
1917
132k
    case PrimitiveType::TYPE_BOOLEAN: {
1918
132k
        append_binary_type(chars,
1919
132k
                           TabletColumn::get_field_type_by_type(PrimitiveType::TYPE_BOOLEAN));
1920
132k
        const auto v = static_cast<UInt8>(field.get<PrimitiveType::TYPE_BOOLEAN>());
1921
132k
        append_binary_bytes(chars, &v, sizeof(UInt8));
1922
132k
        return;
1923
0
    }
1924
526k
    case PrimitiveType::TYPE_BIGINT: {
1925
526k
        append_binary_type(chars, TabletColumn::get_field_type_by_type(PrimitiveType::TYPE_BIGINT));
1926
526k
        const auto v = field.get<PrimitiveType::TYPE_BIGINT>();
1927
526k
        append_binary_bytes(chars, &v, sizeof(Int64));
1928
526k
        return;
1929
0
    }
1930
1
    case PrimitiveType::TYPE_LARGEINT: {
1931
1
        append_binary_type(chars,
1932
1
                           TabletColumn::get_field_type_by_type(PrimitiveType::TYPE_LARGEINT));
1933
1
        const auto v = field.get<PrimitiveType::TYPE_LARGEINT>();
1934
1
        append_binary_bytes(chars, &v, sizeof(int128_t));
1935
1
        return;
1936
0
    }
1937
55.4k
    case PrimitiveType::TYPE_DOUBLE: {
1938
55.4k
        append_binary_type(chars, TabletColumn::get_field_type_by_type(PrimitiveType::TYPE_DOUBLE));
1939
55.4k
        const auto v = field.get<PrimitiveType::TYPE_DOUBLE>();
1940
55.4k
        append_binary_bytes(chars, &v, sizeof(Float64));
1941
55.4k
        return;
1942
0
    }
1943
1.62M
    case PrimitiveType::TYPE_STRING: {
1944
1.62M
        append_binary_type(chars, FieldType::OLAP_FIELD_TYPE_STRING);
1945
1.62M
        const auto& v = field.get<PrimitiveType::TYPE_STRING>();
1946
1.62M
        append_binary_sizet(chars, v.size());
1947
1.62M
        append_binary_bytes(chars, v.data(), v.size());
1948
1.62M
        return;
1949
0
    }
1950
29.1k
    case PrimitiveType::TYPE_JSONB: {
1951
29.1k
        append_binary_type(chars, FieldType::OLAP_FIELD_TYPE_JSONB);
1952
29.1k
        const auto& v = field.get<PrimitiveType::TYPE_JSONB>();
1953
29.1k
        append_binary_sizet(chars, v.get_size());
1954
29.1k
        append_binary_bytes(chars, v.get_value(), v.get_size());
1955
29.1k
        return;
1956
0
    }
1957
169k
    case PrimitiveType::TYPE_ARRAY: {
1958
169k
        append_binary_type(chars, FieldType::OLAP_FIELD_TYPE_ARRAY);
1959
169k
        const auto& a = field.get<PrimitiveType::TYPE_ARRAY>();
1960
169k
        append_binary_sizet(chars, a.size());
1961
176k
        for (const auto& elem : a) {
1962
176k
            append_field_to_binary_chars(elem, chars);
1963
176k
        }
1964
169k
        return;
1965
0
    }
1966
0
    default:
1967
0
        throw doris::Exception(ErrorCode::INVALID_ARGUMENT, "Unsupported field type {}",
1968
0
                               field.get_type());
1969
2.50M
    }
1970
2.50M
}
1971
template <typename ParserImpl>
1972
void parse_json_to_variant_impl(IColumn& column, const char* src, size_t length,
1973
1.35M
                                JSONDataParser<ParserImpl>* parser, const ParseConfig& config) {
1974
1.35M
    auto& column_variant = assert_cast<ColumnVariant&>(column);
1975
1.35M
    std::optional<ParseResult> result;
1976
    /// Treat empty string as an empty object
1977
    /// for better CAST from String to Object.
1978
1.35M
    if (length > 0) {
1979
1.34M
        result = parser->parse(src, length, config);
1980
1.34M
    } else {
1981
2.53k
        result = ParseResult {};
1982
2.53k
    }
1983
1.35M
    if (!result) {
1984
664
        VLOG_DEBUG << "failed to parse " << std::string_view(src, length) << ", length= " << length;
1985
664
        if (config::variant_throw_exeception_on_invalid_json) {
1986
0
            throw doris::Exception(ErrorCode::INVALID_ARGUMENT, "Failed to parse object {}",
1987
0
                                   std::string_view(src, length));
1988
0
        }
1989
        // Treat as string
1990
664
        PathInData root_path;
1991
664
        Field field = Field::create_field<TYPE_STRING>(String(src, length));
1992
664
        result = ParseResult {{root_path}, {field}};
1993
664
    }
1994
1.35M
    auto& [paths, values] = *result;
1995
1.35M
    assert(paths.size() == values.size());
1996
1.35M
    size_t old_num_rows = column_variant.rows();
1997
1.35M
    if (config.deprecated_enable_flatten_nested) {
1998
        // here we should check the paths in variant and paths in result,
1999
        // if two paths which same prefix have different structure, we should throw an exception
2000
3.02k
        std::vector<PathInData> check_paths;
2001
12.0k
        for (const auto& entry : column_variant.get_subcolumns()) {
2002
12.0k
            check_paths.push_back(entry->path);
2003
12.0k
        }
2004
3.02k
        check_paths.insert(check_paths.end(), paths.begin(), paths.end());
2005
3.02k
        THROW_IF_ERROR(check_variant_has_no_ambiguous_paths(check_paths));
2006
3.02k
    }
2007
1.35M
    auto [doc_value_data_paths, doc_value_data_values] =
2008
1.35M
            column_variant.get_doc_value_data_paths_and_values();
2009
1.35M
    auto& doc_value_data_offsets = column_variant.serialized_doc_value_column_offsets();
2010
2011
16.4M
    auto flush_defaults = [](ColumnVariant::Subcolumn* subcolumn) {
2012
16.4M
        const auto num_defaults = subcolumn->cur_num_of_defaults();
2013
16.4M
        if (num_defaults > 0) {
2014
1.97M
            subcolumn->insert_many_defaults(num_defaults);
2015
1.97M
            subcolumn->reset_current_num_of_defaults();
2016
1.97M
        }
2017
16.4M
    };
2018
2019
1.35M
    auto get_or_create_subcolumn = [&](const PathInData& path, size_t index_hint,
2020
16.3M
                                       const FieldInfo& field_info) -> ColumnVariant::Subcolumn* {
2021
16.3M
        if (column_variant.get_subcolumn(path, index_hint) == nullptr) {
2022
95.6k
            if (path.has_nested_part()) {
2023
17
                column_variant.add_nested_subcolumn(path, field_info, old_num_rows);
2024
95.6k
            } else {
2025
95.6k
                column_variant.add_sub_column(path, old_num_rows);
2026
95.6k
            }
2027
95.6k
        }
2028
16.3M
        auto* subcolumn = column_variant.get_subcolumn(path, index_hint);
2029
16.3M
        if (!subcolumn) {
2030
0
            throw doris::Exception(ErrorCode::INVALID_ARGUMENT, "Failed to find sub column {}",
2031
0
                                   path.get_path());
2032
0
        }
2033
16.3M
        return subcolumn;
2034
16.3M
    };
2035
2036
1.35M
    auto insert_into_subcolumn = [&](size_t i,
2037
16.3M
                                     bool check_size_mismatch) -> ColumnVariant::Subcolumn* {
2038
16.3M
        FieldInfo field_info;
2039
16.3M
        get_field_info(values[i], &field_info);
2040
16.3M
        if (field_info.scalar_type_id == PrimitiveType::INVALID_TYPE) {
2041
63.9k
            return nullptr;
2042
63.9k
        }
2043
16.2M
        auto* subcolumn = get_or_create_subcolumn(paths[i], i, field_info);
2044
16.2M
        flush_defaults(subcolumn);
2045
16.4M
        if (check_size_mismatch && subcolumn->size() != old_num_rows) {
2046
0
            throw doris::Exception(ErrorCode::INVALID_ARGUMENT,
2047
0
                                   "subcolumn {} size missmatched, may contains duplicated entry",
2048
0
                                   paths[i].get_path());
2049
0
        }
2050
16.2M
        subcolumn->insert(std::move(values[i]), std::move(field_info));
2051
16.2M
        return subcolumn;
2052
16.2M
    };
2053
2054
1.35M
    switch (config.parse_to) {
2055
1.22M
    case ParseConfig::ParseTo::OnlySubcolumns:
2056
17.4M
        for (size_t i = 0; i < paths.size(); ++i) {
2057
16.2M
            insert_into_subcolumn(i, true);
2058
16.2M
        }
2059
1.22M
        break;
2060
124k
    case ParseConfig::ParseTo::OnlyDocValueColumn: {
2061
18.4E
        CHECK(column_variant.enable_doc_mode()) << "OnlyDocValueColumn requires doc mode enabled";
2062
124k
        std::vector<size_t> doc_item_indexes;
2063
124k
        doc_item_indexes.reserve(paths.size());
2064
124k
        phmap::flat_hash_set<StringRef, StringRefHash> seen_paths;
2065
124k
        seen_paths.reserve(paths.size());
2066
2067
2.54M
        for (size_t i = 0; i < paths.size(); ++i) {
2068
2.42M
            FieldInfo field_info;
2069
2.42M
            get_field_info(values[i], &field_info);
2070
2.42M
            if (paths[i].empty()) {
2071
63
                auto* subcolumn = column_variant.get_subcolumn(paths[i]);
2072
63
                DCHECK(subcolumn != nullptr);
2073
63
                flush_defaults(subcolumn);
2074
63
                subcolumn->insert(std::move(values[i]), std::move(field_info));
2075
63
                continue;
2076
63
            }
2077
2.42M
            if (field_info.scalar_type_id == PrimitiveType::INVALID_TYPE ||
2078
2.42M
                values[i].get_type() == PrimitiveType::TYPE_NULL) {
2079
52.8k
                continue;
2080
52.8k
            }
2081
2.36M
            const auto& path_str = paths[i].get_path();
2082
2.36M
            StringRef path_ref {path_str.data(), path_str.size()};
2083
2.36M
            if (UNLIKELY(!seen_paths.emplace(path_ref).second)) {
2084
2
                throw doris::Exception(ErrorCode::INVALID_ARGUMENT,
2085
2
                                       "may contains duplicated entry : {}",
2086
2
                                       std::string_view(path_str));
2087
2
            }
2088
2.36M
            doc_item_indexes.push_back(i);
2089
2.36M
        }
2090
2091
124k
        std::sort(doc_item_indexes.begin(), doc_item_indexes.end(),
2092
15.2M
                  [&](size_t l, size_t r) { return paths[l].get_path() < paths[r].get_path(); });
2093
2.31M
        for (const auto idx : doc_item_indexes) {
2094
2.31M
            const auto& path_str = paths[idx].get_path();
2095
2.31M
            doc_value_data_paths->insert_data(path_str.data(), path_str.size());
2096
2.31M
            auto& chars = doc_value_data_values->get_chars();
2097
2.31M
            append_field_to_binary_chars(values[idx], chars);
2098
2.31M
            doc_value_data_values->get_offsets().push_back(chars.size());
2099
2.31M
        }
2100
124k
    } break;
2101
1.35M
    }
2102
1.34M
    doc_value_data_offsets.push_back(doc_value_data_paths->size());
2103
    // /// Insert default values to missed subcolumns.
2104
1.34M
    const auto& subcolumns = column_variant.get_subcolumns();
2105
39.1M
    for (const auto& entry : subcolumns) {
2106
39.1M
        if (entry->data.size() == old_num_rows) {
2107
            // Handle nested paths differently from simple paths
2108
22.5M
            if (entry->path.has_nested_part()) {
2109
                // Try to insert default from nested, if failed, insert regular default
2110
0
                bool success = UNLIKELY(column_variant.try_insert_default_from_nested(entry));
2111
0
                if (!success) {
2112
0
                    entry->data.insert_default();
2113
0
                }
2114
22.5M
            } else {
2115
                // For non-nested paths, increment default counter
2116
22.5M
                entry->data.increment_default_counter();
2117
22.5M
            }
2118
22.5M
        }
2119
39.1M
    }
2120
1.34M
    column_variant.incr_num_rows();
2121
1.34M
    auto sparse_column = column_variant.get_sparse_column();
2122
1.34M
    if (sparse_column->size() == old_num_rows) {
2123
1.34M
        sparse_column->assume_mutable()->insert_default();
2124
1.34M
    }
2125
1.34M
#ifndef NDEBUG
2126
1.34M
    column_variant.check_consistency();
2127
1.34M
#endif
2128
1.34M
}
2129
2130
// exposed interfaces
2131
/// Parses a single JSON string into `column`.
/// @param column  destination variant column
/// @param json    JSON text to parse
/// @param parser  parser to use; when null, one is borrowed from `parsers_pool`
///                for the duration of the call
/// @param config  parse configuration forwarded to the implementation
void parse_json_to_variant(IColumn& column, const StringRef& json, JsonParser* parser,
                           const ParseConfig& config) {
    if (parser != nullptr) {
        parse_json_to_variant_impl(column, json.data, json.size, parser, config);
        return;
    }
    auto borrowed = parsers_pool.get([] { return new JsonParser(); });
    parse_json_to_variant_impl(column, json.data, json.size, borrowed.get(), config);
}
2140
2141
// Parses every row of `raw_json_column` as a JSON document into `column`, in row
// order, then finalizes `column`.
//
// One parser is taken from `parsers_pool` up front and reused for all rows.
void parse_json_to_variant(IColumn& column, const ColumnString& raw_json_column,
                           const ParseConfig& config) {
    auto pooled_parser = parsers_pool.get([] { return new JsonParser(); });
    auto* const json_parser = pooled_parser.get();

    for (size_t row = 0; row != raw_json_column.size(); ++row) {
        const StringRef document = raw_json_column.get_data_at(row);
        parse_json_to_variant_impl(column, document.data, document.size, json_parser, config);
    }

    // Finalize once, after all rows have been appended.
    column.finalize();
}
2150
2151
// parse the doc snapshot column to subcolumns
2152
0
void materialize_docs_to_subcolumns(ColumnVariant& column_variant) {
2153
0
    auto subcolumns = materialize_docs_to_subcolumns_map(column_variant);
2154
2155
0
    for (auto& entry : subcolumns) {
2156
0
        entry.second.finalize();
2157
0
        if (!column_variant.add_sub_column(PathInData(entry.first),
2158
0
                                           IColumn::mutate(entry.second.get_finalized_column_ptr()),
2159
0
                                           entry.second.get_least_common_type())) {
2160
0
            throw doris::Exception(ErrorCode::INTERNAL_ERROR,
2161
0
                                   "Failed to add subcolumn {}, which is from doc snapshot column",
2162
0
                                   entry.first);
2163
0
        }
2164
0
    }
2165
2166
0
    column_variant.finalize();
2167
0
}
2168
2169
// ============ Implementation from variant_util.cpp ============
2170
2171
// Builds one Subcolumn per distinct path found in the variant's serialized
// doc-value data and returns them keyed by path. Every returned subcolumn has
// exactly as many rows as the offsets column; rows in which a path does not
// occur are filled with defaults.
//
// `expected_unique_paths` is a reservation hint for the result map. When it is 0
// the map reserves min(number of stored path entries, kInitialPathReserve).
//
// The map keys are string_views built from the data pointers returned by the
// paths column; presumably they alias memory owned by `variant`, so the result
// should not outlive it -- TODO confirm against StringRef/get_data_at semantics.
phmap::flat_hash_map<std::string_view, ColumnVariant::Subcolumn> materialize_docs_to_subcolumns_map(
        const ColumnVariant& variant, size_t expected_unique_paths) {
    constexpr size_t kInitialPathReserve = 8192;
    phmap::flat_hash_map<std::string_view, ColumnVariant::Subcolumn> path_to_subcolumn;

    const auto [paths_column, values_column] = variant.get_doc_value_data_paths_and_values();
    const auto& row_offsets = variant.serialized_doc_value_column_offsets();
    const size_t row_count = row_offsets.size();

    DCHECK_EQ(row_count, variant.size()) << "doc snapshot offsets size mismatch with variant rows";

    if (expected_unique_paths != 0) {
        path_to_subcolumn.reserve(expected_unique_paths);
    } else {
        path_to_subcolumn.reserve(std::min<size_t>(paths_column->size(), kInitialPathReserve));
    }

    for (size_t row = 0; row < row_count; ++row) {
        // Entries of `row` occupy [row_offsets[row - 1], row_offsets[row]).
        // NOTE(review): for row == 0 the index `row - 1` wraps around as size_t;
        // presumably the offsets container accepts index -1 and yields 0 there
        // (padded array) -- confirm.
        const size_t entries_begin = row_offsets[row - 1];
        const size_t entries_end = row_offsets[row];

        for (size_t entry = entries_begin; entry < entries_end; ++entry) {
            const auto& raw_path = paths_column->get_data_at(entry);
            const std::string_view path(raw_path.data, raw_path.size);

            auto emplaced =
                    path_to_subcolumn.try_emplace(path, ColumnVariant::Subcolumn {0, true, false});
            auto& subcolumn = emplaced.first->second;
            if (emplaced.second) {
                // First occurrence of this path: back-fill all earlier rows.
                subcolumn.insert_many_defaults(row);
            } else if (subcolumn.size() != row) {
                // Path was absent from some rows since it was last seen.
                subcolumn.insert_many_defaults(row - subcolumn.size());
            }
            subcolumn.deserialize_from_binary_column(values_column, entry);
        }
    }

    // Pad paths that do not appear in the trailing rows.
    for (auto& path_and_subcolumn : path_to_subcolumn) {
        auto& subcolumn = path_and_subcolumn.second;
        if (subcolumn.size() != row_count) {
            subcolumn.insert_many_defaults(row_count - subcolumn.size());
        }
    }

    return path_to_subcolumn;
}
2213
2214
// Parses the scalar variant columns of `block` in place.
//
// `variant_pos[idx]` is a position inside `block`; `configs[idx]` is the parse
// configuration for that column. `configs` is indexed without a bounds check, so
// it must hold at least `variant_pos.size()` entries.
//
// For each listed column the variant (optionally wrapped in ColumnNullable) is
// finalized. Columns that are not scalar variants are left untouched. Otherwise
// the root is parsed as JSON text into a fresh ColumnVariant, or, when the root
// is not a string column, the existing variant's root type is adjusted instead.
// The outcome is written back to the same block position, re-wrapped with the
// original null map when the source column was nullable.
//
// Returns the error from cast_column() if the JSONB-to-string cast fails,
// Status::OK() otherwise.
Status _parse_and_materialize_variant_columns(Block& block,
                                              const std::vector<uint32_t>& variant_pos,
                                              const std::vector<ParseConfig>& configs) {
    for (size_t idx = 0; idx < variant_pos.size(); ++idx) {
        const uint32_t block_pos = variant_pos[idx];
        auto source_column = block.get_by_position(block_pos).column;
        const bool wrapped_in_nullable = source_column->is_nullable();

        // Reach the variant itself, looking through the Nullable wrapper if present.
        MutableColumnPtr variant_holder = source_column->assume_mutable();
        if (wrapped_in_nullable) {
            const auto& nullable_source = assert_cast<const ColumnNullable&>(*source_column);
            variant_holder = nullable_source.get_nested_column_ptr()->assume_mutable();
        }
        auto& variant = assert_cast<ColumnVariant&>(*variant_holder);
        variant_holder->finalize();

        if (!variant.is_scalar_variant()) {
            // already parsed
            continue;
        }

        VLOG_DEBUG << "parse scalar variant column: " << variant.get_root_type()->get_name();

        // Obtain the root as a non-nullable column.
        ColumnPtr unwrapped_root;
        if (variant.get_root_type()->get_primitive_type() == TYPE_JSONB) {
            // TODO more efficient way to parse jsonb type, currently we just convert jsonb to
            // json str and parse them into variant
            RETURN_IF_ERROR(cast_column({variant.get_root(), variant.get_root_type(), ""},
                                        variant.get_root()->is_nullable()
                                                ? make_nullable(std::make_shared<DataTypeString>())
                                                : std::make_shared<DataTypeString>(),
                                        &unwrapped_root));
            if (unwrapped_root->is_nullable()) {
                unwrapped_root = assert_cast<const ColumnNullable*>(unwrapped_root.get())
                                         ->get_nested_column_ptr();
            }
        } else {
            const auto& root_column = *variant.get_root();
            if (root_column.is_nullable()) {
                unwrapped_root =
                        assert_cast<const ColumnNullable&>(root_column).get_nested_column_ptr();
            } else {
                unwrapped_root = variant.get_root();
            }
        }

        MutableColumnPtr parsed_variant;
        if (unwrapped_root->is_column_string()) {
            parsed_variant = ColumnVariant::create(0, variant.enable_doc_mode());
            parse_json_to_variant(*parsed_variant,
                                  assert_cast<const ColumnString&>(*unwrapped_root), configs[idx]);
        } else {
            // The root may hold a type other than string, e.g. ColumnVariant(Int32).
            // In that case the already finalized variant is kept and its root is
            // switched to the nullable MostCommonType instead of being re-parsed.
            auto expected_root_type =
                    make_nullable(std::make_shared<ColumnVariant::MostCommonType>());
            variant.ensure_root_node_type(expected_root_type);
            parsed_variant = variant.assume_mutable();
        }

        // Wrap variant with nullmap if it is nullable
        ColumnPtr replacement = parsed_variant->get_ptr();
        if (wrapped_in_nullable) {
            const auto& source_null_map =
                    assert_cast<const ColumnNullable&>(*source_column).get_null_map_column_ptr();
            replacement = ColumnNullable::create(replacement, source_null_map);
        }
        block.get_by_position(block_pos).column = replacement;
    }
    return Status::OK();
}
2281
2282
// Exception-guarded entry point for _parse_and_materialize_variant_columns().
//
// The call is made inside RETURN_IF_CATCH_EXCEPTION, which is expected to turn an
// exception thrown during parsing into a returned Status (see the macro
// definition for the exact set of exceptions handled). On the non-throwing path
// the callee's Status is returned as is.
//
// `variant_pos` holds positions inside `block`; `configs[i]` belongs to
// `variant_pos[i]`.
Status parse_and_materialize_variant_columns(Block& block, const std::vector<uint32_t>& variant_pos,
                                             const std::vector<ParseConfig>& configs) {
    RETURN_IF_CATCH_EXCEPTION({
        return _parse_and_materialize_variant_columns(block, variant_pos, configs);
    });
}
2287
2288
// Parses all variant columns of `block`, deriving the per-column parse
// configuration from `tablet_schema`.
//
// `column_pos[i]` is the tablet-schema index of the i-th column of `block`. The
// schema index is used only to look up column metadata; the block is always
// addressed by `i`. These two differ whenever the block carries a subset or a
// reordering of the schema columns, so the position handed on to the parser must
// be the block position, not the schema position.
//
// Per variant column:
//   - deprecated_enable_flatten_nested is copied from the tablet schema (the
//     deprecated legacy flatten-nested switch, distinct from
//     variant_enable_nested_group);
//   - parse_to is OnlyDocValueColumn when the column has doc mode enabled, and
//     OnlySubcolumns otherwise (no need to build the doc value column then).
//
// Returns Status::OK() without touching the block when no listed column is a
// variant; otherwise returns the Status of the position/config overload.
Status parse_and_materialize_variant_columns(Block& block, const TabletSchema& tablet_schema,
                                             const std::vector<uint32_t>& column_pos) {
    std::vector<uint32_t> variant_column_pos;
    std::vector<ParseConfig> configs;
    variant_column_pos.reserve(column_pos.size());
    configs.reserve(column_pos.size());

    for (size_t block_pos = 0; block_pos < column_pos.size(); ++block_pos) {
        const auto& column = tablet_schema.column(column_pos[block_pos]);
        if (!column.is_variant_type()) {
            continue;
        }

        // Record where the column sits in the block; the schema index was only
        // needed for the metadata lookup above.
        variant_column_pos.push_back(static_cast<uint32_t>(block_pos));

        ParseConfig& config = configs.emplace_back();
        config.deprecated_enable_flatten_nested =
                tablet_schema.deprecated_variant_flatten_nested();
        config.parse_to = column.variant_enable_doc_mode()
                                  ? ParseConfig::ParseTo::OnlyDocValueColumn
                                  : ParseConfig::ParseTo::OnlySubcolumns;
    }

    if (variant_column_pos.empty()) {
        return Status::OK();
    }

    return parse_and_materialize_variant_columns(block, variant_column_pos, configs);
}
2329
2330
} // namespace doris::variant_util