Coverage Report

Created: 2026-07-03 17:03

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
be/src/exec/common/variant_util.cpp
Line
Count
Source
1
// Licensed to the Apache Software Foundation (ASF) under one
2
// or more contributor license agreements.  See the NOTICE file
3
// distributed with this work for additional information
4
// regarding copyright ownership.  The ASF licenses this file
5
// to you under the Apache License, Version 2.0 (the
6
// "License"); you may not use this file except in compliance
7
// with the License.  You may obtain a copy of the License at
8
//
9
//   http://www.apache.org/licenses/LICENSE-2.0
10
//
11
// Unless required by applicable law or agreed to in writing,
12
// software distributed under the License is distributed on an
13
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14
// KIND, either express or implied.  See the License for the
15
// specific language governing permissions and limitations
16
// under the License.
17
18
#include "exec/common/variant_util.h"
19
20
#include <fmt/format.h>
21
#include <gen_cpp/FrontendService.h>
22
#include <gen_cpp/FrontendService_types.h>
23
#include <gen_cpp/HeartbeatService_types.h>
24
#include <gen_cpp/MasterService_types.h>
25
#include <gen_cpp/Status_types.h>
26
#include <gen_cpp/Types_types.h>
27
#include <glog/logging.h>
28
#include <rapidjson/document.h>
29
#include <rapidjson/stringbuffer.h>
30
#include <rapidjson/writer.h>
31
#include <simdjson/simdjson.h> // IWYU pragma: keep
32
#include <unicode/uchar.h>
33
34
#include <algorithm>
35
#include <cassert>
36
#include <cstddef>
37
#include <cstdint>
38
#include <cstring>
39
#include <list>
40
#include <memory>
41
#include <mutex>
42
#include <optional>
43
#include <ostream>
44
#include <ranges>
45
#include <set>
46
#include <stack>
47
#include <string>
48
#include <string_view>
49
#include <unordered_map>
50
#include <utility>
51
#include <vector>
52
53
#include "common/config.h"
54
#include "common/status.h"
55
#include "core/assert_cast.h"
56
#include "core/block/block.h"
57
#include "core/block/column_numbers.h"
58
#include "core/block/column_with_type_and_name.h"
59
#include "core/column/column.h"
60
#include "core/column/column_array.h"
61
#include "core/column/column_map.h"
62
#include "core/column/column_nullable.h"
63
#include "core/column/column_string.h"
64
#include "core/column/column_variant.h"
65
#include "core/data_type/data_type.h"
66
#include "core/data_type/data_type_array.h"
67
#include "core/data_type/data_type_factory.hpp"
68
#include "core/data_type/data_type_jsonb.h"
69
#include "core/data_type/data_type_nullable.h"
70
#include "core/data_type/data_type_string.h"
71
#include "core/data_type/data_type_variant.h"
72
#include "core/data_type/define_primitive_type.h"
73
#include "core/data_type/get_least_supertype.h"
74
#include "core/data_type/primitive_type.h"
75
#include "core/field.h"
76
#include "core/typeid_cast.h"
77
#include "core/types.h"
78
#include "exec/common/field_visitors.h"
79
#include "exec/common/sip_hash.h"
80
#include "exprs/function/function.h"
81
#include "exprs/function/simple_function_factory.h"
82
#include "exprs/function_context.h"
83
#include "exprs/json_functions.h"
84
#include "re2/re2.h"
85
#include "runtime/exec_env.h"
86
#include "runtime/runtime_state.h"
87
#include "storage/olap_common.h"
88
#include "storage/rowset/beta_rowset.h"
89
#include "storage/rowset/rowset.h"
90
#include "storage/rowset/rowset_fwd.h"
91
#include "storage/segment/segment_loader.h"
92
#include "storage/segment/variant/nested_group_path.h"
93
#include "storage/segment/variant/variant_column_reader.h"
94
#include "storage/segment/variant/variant_column_writer_impl.h"
95
#include "storage/tablet/tablet.h"
96
#include "storage/tablet/tablet_fwd.h"
97
#include "storage/tablet/tablet_schema.h"
98
#include "util/client_cache.h"
99
#include "util/defer_op.h"
100
#include "util/json/json_parser.h"
101
#include "util/json/path_in_data.h"
102
#include "util/json/simd_json_parser.h"
103
#include "util/jsonb_utils.h"
104
105
namespace doris::variant_util {
106
107
namespace {
108
109
2.34k
// Builds the full path of a variant subcolumn: `<lower-cased parent name>.<path>`.
//
// An empty `path` is not collapsed into the parent: it is kept as an explicit, empty
// second path part. The variant root is `parts.empty()`; an empty key is
// `parts.size() == 1 && parts[0].key.empty()` after popping root.
PathInData make_full_subcolumn_path(const TabletColumnPtr& parent_column, std::string_view path) {
    if (path.empty()) {
        // Go through the builder so that the empty key survives as a real part.
        PathInDataBuilder empty_key_builder;
        return empty_key_builder.append(parent_column->name_lower_case(), false)
                .append("", false)
                .build();
    }

    std::string dotted_path = parent_column->name_lower_case();
    dotted_path += ".";
    dotted_path += std::string(path);
    return PathInData(std::move(dotted_path));
}
119
120
void append_empty_key_subcolumn_from_stats(TabletSchema::PathsSetInfo& paths_set_info,
121
                                           const TabletColumnPtr& parent_column,
122
97
                                           TabletSchemaSPtr& output_schema) {
123
97
    if (!paths_set_info.sub_path_set.contains("") || paths_set_info.sparse_path_set.contains("") ||
124
97
        paths_set_info.subcolumn_indexes.contains("")) {
125
96
        return;
126
96
    }
127
128
1
    auto column_name = parent_column->name_lower_case() + ".";
129
1
    auto column_path = make_full_subcolumn_path(parent_column, "");
130
131
1
    TabletColumn subcolumn;
132
1
    subcolumn.set_name(column_name);
133
1
    subcolumn.set_type(FieldType::OLAP_FIELD_TYPE_VARIANT);
134
1
    subcolumn.set_parent_unique_id(parent_column->unique_id());
135
1
    subcolumn.set_path_info(column_path);
136
1
    subcolumn.set_aggregation_method(parent_column->aggregation());
137
1
    subcolumn.set_variant_max_subcolumns_count(parent_column->variant_max_subcolumns_count());
138
1
    subcolumn.set_variant_enable_doc_mode(parent_column->variant_enable_doc_mode());
139
1
    subcolumn.set_is_nullable(true);
140
1
    output_schema->append_column(subcolumn);
141
1
}
142
143
} // namespace
144
145
2.84k
// Appends `ch` to `*regex_output` as a regex literal: the characters
// . ^ $ + * ? ( ) | { } [ ] and backslash are prefixed with a backslash, every other
// character is appended unchanged.
inline void append_escaped_regex_char(std::string* regex_output, char ch) {
    static constexpr std::string_view kRegexMetaChars = ".^$+*?()|{}[]\\";
    const bool needs_escape = kRegexMetaChars.find(ch) != std::string_view::npos;
    if (needs_escape) {
        regex_output->push_back('\\');
    }
    regex_output->push_back(ch);
}
169
170
// Small LRU to cap compiled glob patterns
// Maximum number of entries kept in g_glob_regex_cache; get_or_build_re2 evicts the entry at
// the back of the LRU list once the map grows past this size.
171
constexpr size_t kGlobRegexCacheCapacity = 256;
172
173
// One cached pattern: the compiled regex plus the position of its key in the LRU list.
struct GlobRegexCacheEntry {
174
    // Compiled regex for the glob pattern used as the map key.
    std::shared_ptr<RE2> re2;
175
    // Iterator to this pattern's node in g_glob_regex_cache_lru, used to splice it to the front.
    std::list<std::string>::iterator lru_it;
176
};
177
178
// Locked by get_or_build_re2 around every access to the two containers below.
static std::mutex g_glob_regex_cache_mutex;
179
// Glob patterns in recency order: front is the most recently used, back is evicted first.
static std::list<std::string> g_glob_regex_cache_lru;
180
// Glob pattern -> cache entry.
static std::unordered_map<std::string, GlobRegexCacheEntry> g_glob_regex_cache;
181
182
210k
std::shared_ptr<RE2> get_or_build_re2(const std::string& glob_pattern) {
183
210k
    {
184
210k
        std::lock_guard<std::mutex> lock(g_glob_regex_cache_mutex);
185
210k
        auto it = g_glob_regex_cache.find(glob_pattern);
186
210k
        if (it != g_glob_regex_cache.end()) {
187
210k
            g_glob_regex_cache_lru.splice(g_glob_regex_cache_lru.begin(), g_glob_regex_cache_lru,
188
210k
                                          it->second.lru_it);
189
210k
            return it->second.re2;
190
210k
        }
191
210k
    }
192
99
    std::string regex_pattern;
193
99
    Status st = glob_to_regex(glob_pattern, &regex_pattern);
194
99
    if (!st.ok()) {
195
2
        return nullptr;
196
2
    }
197
97
    auto compiled = std::make_shared<RE2>(regex_pattern);
198
97
    if (!compiled->ok()) {
199
3
        return nullptr;
200
3
    }
201
94
    {
202
94
        std::lock_guard<std::mutex> lock(g_glob_regex_cache_mutex);
203
94
        auto it = g_glob_regex_cache.find(glob_pattern);
204
94
        if (it != g_glob_regex_cache.end()) {
205
0
            g_glob_regex_cache_lru.splice(g_glob_regex_cache_lru.begin(), g_glob_regex_cache_lru,
206
0
                                          it->second.lru_it);
207
0
            return it->second.re2;
208
0
        }
209
94
        g_glob_regex_cache_lru.push_front(glob_pattern);
210
94
        g_glob_regex_cache.emplace(glob_pattern,
211
94
                                   GlobRegexCacheEntry {compiled, g_glob_regex_cache_lru.begin()});
212
94
        if (g_glob_regex_cache.size() > kGlobRegexCacheCapacity) {
213
0
            const std::string& evict_key = g_glob_regex_cache_lru.back();
214
0
            g_glob_regex_cache.erase(evict_key);
215
0
            g_glob_regex_cache_lru.pop_back();
216
0
        }
217
94
    }
218
0
    return compiled;
219
94
}
220
221
// Convert a restricted glob pattern into a regex.
222
// Supported: '*', '?', '[...]', '\\' escape. Others are treated as literals.
223
310
Status glob_to_regex(const std::string& glob_pattern, std::string* regex_pattern) {
224
310
    regex_pattern->clear();
225
310
    regex_pattern->append("^");
226
310
    bool is_escaped = false;
227
310
    size_t pattern_length = glob_pattern.size();
228
3.27k
    for (size_t index = 0; index < pattern_length; ++index) {
229
2.97k
        char current_char = glob_pattern[index];
230
2.97k
        if (is_escaped) {
231
10
            append_escaped_regex_char(regex_pattern, current_char);
232
10
            is_escaped = false;
233
10
            continue;
234
10
        }
235
2.96k
        if (current_char == '\\') {
236
14
            is_escaped = true;
237
14
            continue;
238
14
        }
239
2.94k
        if (current_char == '*') {
240
72
            regex_pattern->append(".*");
241
72
            continue;
242
72
        }
243
2.87k
        if (current_char == '?') {
244
15
            regex_pattern->append(".");
245
15
            continue;
246
15
        }
247
2.86k
        if (current_char == '[') {
248
33
            size_t class_index = index + 1;
249
33
            bool class_closed = false;
250
33
            bool is_class_escaped = false;
251
33
            std::string class_buffer;
252
33
            if (class_index < pattern_length &&
253
33
                (glob_pattern[class_index] == '!' || glob_pattern[class_index] == '^')) {
254
9
                class_buffer.push_back('^');
255
9
                ++class_index;
256
9
            }
257
99
            for (; class_index < pattern_length; ++class_index) {
258
95
                char class_char = glob_pattern[class_index];
259
95
                if (is_class_escaped) {
260
10
                    class_buffer.push_back(class_char);
261
10
                    is_class_escaped = false;
262
10
                    continue;
263
10
                }
264
85
                if (class_char == '\\') {
265
10
                    is_class_escaped = true;
266
10
                    continue;
267
10
                }
268
75
                if (class_char == ']') {
269
29
                    class_closed = true;
270
29
                    break;
271
29
                }
272
46
                class_buffer.push_back(class_char);
273
46
            }
274
33
            if (!class_closed) {
275
4
                return Status::InvalidArgument("Unclosed character class in glob pattern: {}",
276
4
                                               glob_pattern);
277
4
            }
278
29
            regex_pattern->append("[");
279
29
            regex_pattern->append(class_buffer);
280
29
            regex_pattern->append("]");
281
29
            index = class_index;
282
29
            continue;
283
33
        }
284
2.82k
        append_escaped_regex_char(regex_pattern, current_char);
285
2.82k
    }
286
306
    if (is_escaped) {
287
4
        append_escaped_regex_char(regex_pattern, '\\');
288
4
    }
289
306
    regex_pattern->append("$");
290
306
    return Status::OK();
291
310
}
292
293
210k
bool glob_match_re2(const std::string& glob_pattern, const std::string& candidate_path) {
294
210k
    auto compiled = get_or_build_re2(glob_pattern);
295
210k
    if (compiled == nullptr) {
296
5
        return false;
297
5
    }
298
210k
    return RE2::FullMatch(candidate_path, *compiled);
299
210k
}
300
301
// NestedGroup's physical children and offsets are produced by NestedGroupWriteProvider, not by
302
// appending TabletSchema extracted columns here. This predicate keeps only ordinary Variant paths
303
// that are outside the NG tree, for example `v.owner` beside `v.items[*]`.
304
0
bool is_regular_path_outside_nested_group(const PathInData& path) {
305
0
    const std::string& relative_path = path.get_path();
306
0
    return !relative_path.empty() && !path.get_is_typed() && !path.has_nested_part() &&
307
0
           !segment_v2::contains_nested_group_marker(relative_path) &&
308
0
           !segment_v2::is_root_nested_group_path(relative_path) &&
309
0
           relative_path != SPARSE_COLUMN_PATH &&
310
0
           relative_path.find(DOC_VALUE_COLUMN_PATH) == std::string::npos;
311
0
}
312
313
bool should_materialize_nested_group_regular_subcolumns(
314
        const TabletColumnPtr& column,
315
666
        const std::unordered_map<int32_t, VariantExtendedInfo>& uid_to_variant_extended_info) {
316
666
    const auto info_it = uid_to_variant_extended_info.find(column->unique_id());
317
666
    return column->variant_enable_nested_group() ||
318
666
           (info_it != uid_to_variant_extended_info.end() && info_it->second.has_nested_group);
319
666
}
320
321
std::unordered_set<int32_t> collect_nested_group_compaction_root_uids(
322
        const TabletSchemaSPtr& target,
323
7.69k
        const std::unordered_map<int32_t, VariantExtendedInfo>& uid_to_variant_extended_info) {
324
7.69k
    std::unordered_set<int32_t> root_uids;
325
86.8k
    for (const TabletColumnPtr& column : target->columns()) {
326
86.8k
        if (column->is_variant_type() && should_materialize_nested_group_regular_subcolumns(
327
667
                                                 column, uid_to_variant_extended_info)) {
328
1
            root_uids.insert(column->unique_id());
329
1
        }
330
86.8k
    }
331
7.69k
    return root_uids;
332
7.69k
}
333
334
// Returns the subset of `extended_info.path_to_data_types` whose paths satisfy
// is_regular_path_outside_nested_group().
PathToDataTypes collect_regular_types_outside_nested_group(
        const VariantExtendedInfo& extended_info) {
    PathToDataTypes kept;
    for (const auto& entry : extended_info.path_to_data_types) {
        if (is_regular_path_outside_nested_group(entry.first)) {
            kept.emplace(entry.first, entry.second);
        }
    }
    return kept;
}
345
346
1.02k
size_t get_number_of_dimensions(const IDataType& type) {
347
1.02k
    if (const auto* type_array = typeid_cast<const DataTypeArray*>(&type)) {
348
4
        return type_array->get_number_of_dimensions();
349
4
    }
350
1.01k
    return 0;
351
1.02k
}
352
3
size_t get_number_of_dimensions(const IColumn& column) {
353
3
    if (const auto* column_array = check_and_get_column<ColumnArray>(column)) {
354
2
        return column_array->get_number_of_dimensions();
355
2
    }
356
1
    return 0;
357
3
}
358
359
103k
// Returns the element type of the innermost array in `type`, looking through Nullable wrappers
// at every level. If `type` (after removing one Nullable wrapper) is not an array, `type` itself
// is returned unchanged. The returned element type keeps its own Nullable wrapper, if any.
DataTypePtr get_base_type_of_array(const DataTypePtr& type) {
    /// Work on raw pointers to avoid extra copying of type pointers.
    const auto unwrap_nullable = [](const IDataType* candidate) -> const IDataType* {
        const auto* as_nullable = typeid_cast<const DataTypeNullable*>(candidate);
        return as_nullable != nullptr ? as_nullable->get_nested_type().get() : candidate;
    };

    const DataTypeArray* innermost_array = nullptr;
    const IDataType* cursor = unwrap_nullable(type.get());
    for (const auto* as_array = typeid_cast<const DataTypeArray*>(cursor); as_array != nullptr;
         as_array = typeid_cast<const DataTypeArray*>(cursor)) {
        innermost_array = as_array;
        cursor = unwrap_nullable(as_array->get_nested_type().get());
    }

    if (innermost_array == nullptr) {
        return type;
    }
    return innermost_array->get_nested_type();
}
375
376
938k
// Casts `arg` to `type` and stores the resulting full (non-const) column in `*result`.
//
// Behavior by case:
//  * target is VARIANT, source is VARIANT: only the nullability is adjusted to match `type`.
//  * target is VARIANT, source is anything else: the source column becomes the root of a new
//    ColumnVariant; the source must be a nullable column (enforced with CHECK) and its null map
//    is reused when `type` is nullable.
//  * source type is INVALID_TYPE: the result is a column of default values of `type` with the
//    same number of rows as `arg.column`.
//  * otherwise the "CAST" function is executed. If execution fails, a warning is logged and the
//    result is filled with default values of `type`.
//
// @return Status::InternalError when no CAST function is found for a non-VARIANT target;
//         Status::OK() in every other case (including a failed cast execution).
Status cast_column(const ColumnWithTypeAndName& arg, const DataTypePtr& type, ColumnPtr* result) {
    ColumnsWithTypeAndName arguments {arg, {nullptr, type, type->get_name()}};

    // To prevent from null info lost, we should not call function since the function framework will wrap
    // nullable to Variant instead of the root of Variant
    // correct output: Nullable(Array(int)) -> Nullable(Variant(Nullable(Array(int))))
    // incorrect output: Nullable(Array(int)) -> Nullable(Variant(Array(int)))
    if (type->get_primitive_type() == TYPE_VARIANT) {
        // If source column is variant, so the nullable info is different from dst column
        if (arg.type->get_primitive_type() == TYPE_VARIANT) {
            *result = type->is_nullable() ? make_nullable(arg.column) : remove_nullable(arg.column);
            return Status::OK();
        }
        // set variant root column/type to from column/type
        CHECK(is_column_nullable(*arg.column));
        auto to_type = remove_nullable(type);
        const auto& data_type_object = assert_cast<const DataTypeVariant&>(*to_type);
        auto variant = ColumnVariant::create(data_type_object.variant_max_subcolumns_count(),
                                             data_type_object.enable_doc_mode());

        variant->create_root(arg.type, IColumn::mutate(arg.column));
        ColumnPtr nullable = ColumnNullable::create(
                variant->get_ptr(),
                assert_cast<const ColumnNullable*>(arg.column.get())->get_null_map_column_ptr());
        *result = type->is_nullable() ? nullable : variant->get_ptr();
        return Status::OK();
    }

    auto function = SimpleFunctionFactory::instance().get_function("CAST", arguments, type);
    if (!function) {
        return Status::InternalError("Not found cast function {} to {}", arg.type->get_name(),
                                     type->get_name());
    }

    // Short-circuit before building the block / runtime state / function context: none of them
    // is needed to produce a column of defaults.
    if (arg.type->get_primitive_type() == INVALID_TYPE) {
        // cast from nothing to any type should result in nulls
        *result = type->create_column_const_with_default_value(arg.column->size())
                          ->convert_to_full_column_if_const();
        return Status::OK();
    }

    Block tmp_block {arguments};
    uint32_t result_column = cast_set<uint32_t>(tmp_block.columns());
    RuntimeState state;
    auto ctx = FunctionContext::create_context(&state, {}, {});

    // We convert column string to jsonb type just add a string jsonb field to dst column instead of parse
    // each line in original string column.
    ctx->set_string_as_jsonb_string(true);
    ctx->set_jsonb_string_as_string(true);
    tmp_block.insert({nullptr, type, arg.name});
    // TODO(lihangyu): we should handle this error in strict mode
    if (!function->execute(ctx.get(), tmp_block, {0}, result_column, arg.column->size())) {
        LOG_EVERY_N(WARNING, 100) << fmt::format(
                "failed to cast from {} to {}, fill with default values", arg.type->get_name(),
                type->get_name());
        *result = type->create_column_const_with_default_value(arg.column->size())
                          ->convert_to_full_column_if_const();
        return Status::OK();
    }
    *result = tmp_block.get_by_position(result_column).column->convert_to_full_column_if_const();
    VLOG_DEBUG << fmt::format("{} before convert {}, after convert {}", arg.name,
                              arg.column->get_name(), (*result)->get_name());
    return Status::OK();
}
439
440
32
// Converts a JSONB root column into a string column holding the JSON text of every row.
//
// `root` is expanded if it is a const column and may be nullable; its (nested) data column is
// accessed as a ColumnString. Rows that are null, and rows whose JSONB payload is empty,
// receive the string column's default value.
ColumnPtr jsonb_root_to_json_string_column(const IColumn& root) {
    auto full_root = root.convert_to_full_column_if_const();

    const IColumn* payload_column = full_root.get();
    const NullMap* null_flags = nullptr;
    if (full_root->is_nullable()) {
        const auto& nullable_root = assert_cast<const ColumnNullable&>(*full_root);
        payload_column = &nullable_root.get_nested_column();
        null_flags = &nullable_root.get_null_map_data();
    }

    const auto& jsonb_values = assert_cast<const ColumnString&>(*payload_column);
    auto json_texts = ColumnString::create();
    json_texts->reserve(jsonb_values.size());

    for (size_t row = 0; row < jsonb_values.size(); ++row) {
        const bool row_is_null = null_flags != nullptr && (*null_flags)[row];
        if (!row_is_null) {
            const auto jsonb = jsonb_values.get_data_at(row);
            if (jsonb.size != 0) {
                const auto json = JsonbToJson::jsonb_to_json_string(jsonb.data, jsonb.size);
                json_texts->insert_data(json.data(), json.size());
                continue;
            }
        }
        // Null row or empty payload.
        json_texts->insert_default();
    }
    return json_texts->get_ptr();
}
470
471
void get_column_by_type(const DataTypePtr& data_type, const std::string& name, TabletColumn& column,
472
215k
                        const ExtraInfo& ext_info) {
473
215k
    column.set_name(name);
474
215k
    column.set_type(data_type->get_storage_field_type());
475
215k
    if (ext_info.unique_id >= 0) {
476
4
        column.set_unique_id(ext_info.unique_id);
477
4
    }
478
215k
    if (ext_info.parent_unique_id >= 0) {
479
104k
        column.set_parent_unique_id(ext_info.parent_unique_id);
480
104k
    }
481
215k
    if (!ext_info.path_info.empty()) {
482
104k
        column.set_path_info(ext_info.path_info);
483
104k
    }
484
215k
    if (data_type->is_nullable()) {
485
107k
        const auto& real_type = static_cast<const DataTypeNullable&>(*data_type);
486
107k
        column.set_is_nullable(true);
487
107k
        get_column_by_type(real_type.get_nested_type(), name, column, {});
488
107k
        return;
489
107k
    }
490
107k
    if (data_type->get_primitive_type() == PrimitiveType::TYPE_ARRAY) {
491
3.37k
        TabletColumn child;
492
3.37k
        get_column_by_type(assert_cast<const DataTypeArray*>(data_type.get())->get_nested_type(),
493
3.37k
                           "", child, {});
494
3.37k
        column.set_length(TabletColumn::get_field_length_by_type(TPrimitiveType::ARRAY, 0));
495
3.37k
        column.add_sub_column(child);
496
3.37k
        return;
497
3.37k
    }
498
104k
    if (data_type->get_primitive_type() == PrimitiveType::TYPE_VARIANT) {
499
0
        const auto* dt_variant = assert_cast<const DataTypeVariant*>(data_type.get());
500
0
        column.set_variant_max_subcolumns_count(dt_variant->variant_max_subcolumns_count());
501
0
        column.set_variant_enable_doc_mode(dt_variant->enable_doc_mode());
502
0
        return;
503
0
    }
504
    // size is not fixed when type is string or json
505
104k
    if (is_string_type(data_type->get_primitive_type()) ||
506
104k
        data_type->get_primitive_type() == TYPE_JSONB) {
507
33.9k
        column.set_length(INT_MAX);
508
33.9k
        return;
509
33.9k
    }
510
511
70.2k
    PrimitiveType type = data_type->get_primitive_type();
512
70.2k
    if (is_int_or_bool(type) || is_string_type(type) || is_float_or_double(type) || is_ip(type) ||
513
70.2k
        is_date_or_datetime(type) || type == PrimitiveType::TYPE_DATEV2) {
514
70.2k
        column.set_length(cast_set<int32_t>(data_type->get_size_of_value_in_memory()));
515
70.2k
        return;
516
70.2k
    }
517
105
    if (is_decimal(type)) {
518
105
        column.set_precision(data_type->get_precision());
519
105
        column.set_frac(data_type->get_scale());
520
105
        return;
521
105
    }
522
    // datetimev2 needs scale
523
18.4E
    if (type == PrimitiveType::TYPE_DATETIMEV2 || type == PrimitiveType::TYPE_TIMESTAMPTZ) {
524
19
        column.set_precision(-1);
525
19
        column.set_frac(data_type->get_scale());
526
19
        return;
527
19
    }
528
529
18.4E
    throw doris::Exception(doris::ErrorCode::INTERNAL_ERROR,
530
18.4E
                           "unexcepted data column type: {}, column name is: {}",
531
18.4E
                           data_type->get_name(), name);
532
18.4E
}
533
534
// Convenience overload: builds a fresh TabletColumn through the out-parameter overload of
// get_column_by_type() and returns it by value.
TabletColumn get_column_by_type(const DataTypePtr& data_type, const std::string& name,
                                const ExtraInfo& ext_info) {
    TabletColumn described_column;
    get_column_by_type(data_type, name, described_column, ext_info);
    return described_column;
}
540
541
// check if two paths which same prefix have different structure
542
static bool has_different_structure_in_same_path(const PathInData::Parts& lhs,
543
9.06k
                                                 const PathInData::Parts& rhs) {
544
9.06k
    if (lhs.size() != rhs.size()) {
545
1
        return false; // different size means different structure
546
1
    }
547
    // Since we group by path string, lhs and rhs must have the same size and keys
548
    // We only need to check if they have different nested structure
549
36.1k
    for (size_t i = 0; i < lhs.size(); ++i) {
550
27.1k
        if (lhs[i] != rhs[i]) {
551
5
            VLOG_DEBUG << fmt::format(
552
0
                    "Check different structure: {} vs {}, lhs[i].is_nested: {}, rhs[i].is_nested: "
553
0
                    "{}",
554
0
                    lhs[i].key, rhs[i].key, lhs[i].is_nested, rhs[i].is_nested);
555
5
            return true;
556
5
        }
557
27.1k
    }
558
9.05k
    return false;
559
9.06k
}
560
561
4.79k
// Verifies that no two paths in `tuple_paths` share the same path string while differing in
// structure (see has_different_structure_in_same_path).
//
// @return Status::DataQualityError naming the first conflicting pair, otherwise Status::OK().
Status check_variant_has_no_ambiguous_paths(const PathsInData& tuple_paths) {
    // Bucket the positions of the paths by path string so that only paths with an identical
    // string are compared against each other.
    std::unordered_map<std::string, std::vector<size_t>> positions_by_path;
    for (size_t position = 0; position < tuple_paths.size(); ++position) {
        const auto& current = tuple_paths[position];
        positions_by_path[current.get_path()].push_back(position);
        VLOG_DEBUG << "tuple_paths[i]: " << current.get_path();
    }

    for (const auto& bucket : positions_by_path) {
        const std::vector<size_t>& members = bucket.second;
        if (members.size() < 2) {
            continue; // a single path cannot conflict with anything
        }

        // Compare every member with all members that precede it in the bucket.
        for (size_t later = 0; later < members.size(); ++later) {
            const auto& later_path = tuple_paths[members[later]];
            for (size_t earlier = 0; earlier < later; ++earlier) {
                const auto& earlier_path = tuple_paths[members[earlier]];
                if (!has_different_structure_in_same_path(later_path.get_parts(),
                                                          earlier_path.get_parts())) {
                    continue;
                }
                return Status::DataQualityError(
                        "Ambiguous paths: {} vs {} with different nested part {} vs {}",
                        later_path.get_path(), earlier_path.get_path(),
                        later_path.has_nested_part(), earlier_path.has_nested_part());
            }
        }
    }
    return Status::OK();
}
594
595
// Appends one column per subcolumn path to `common_schema`, using the least common type of all
// types observed for that path.
//
// For every entry of `subcolumns_types` (except the dummy path ColumnVariant::COLUMN_NAME_DUMMY):
//  * if the observed types disagree on the number of array dimensions, the path falls back to
//    Nullable(JSONB);
//  * otherwise the type is the result of get_least_supertype_jsonb(), made nullable if needed.
// A path whose root-less form is a key of `typed_columns` and that has no nested part reuses
// that predefined column (with parent id, path info and name overwritten) instead of a column
// derived from the computed type.
//
// @param subcolumns_types       observed types per path; every vector must be non-empty
// @param common_schema          schema to append to; must be uniquely owned (CHECK)
// @param variant_col_unique_id  unique id of the parent variant column
// @param typed_columns          predefined columns keyed by path without the root part
// @param path_set               optional output; receives every appended path when non-null
// @return always Status::OK()
Status update_least_schema_internal(const std::map<PathInData, DataTypes>& subcolumns_types,
                                    TabletSchemaSPtr& common_schema, int32_t variant_col_unique_id,
                                    const std::map<std::string, TabletColumnPtr>& typed_columns,
                                    std::set<PathInData>* path_set) {
    PathsInData tuple_paths;
    DataTypes tuple_types;
    CHECK(common_schema.use_count() == 1);
    // Get the least common type for all paths.
    for (const auto& [key, subtypes] : subcolumns_types) {
        assert(!subtypes.empty());
        if (key.get_path() == ColumnVariant::COLUMN_NAME_DUMMY) {
            continue;
        }
        tuple_paths.emplace_back(key);

        const size_t first_dim = get_number_of_dimensions(*subtypes[0]);
        bool dimensions_conflict = false;
        for (size_t i = 1; i < subtypes.size(); ++i) {
            if (first_dim != get_number_of_dimensions(*subtypes[i])) {
                LOG(INFO) << fmt::format(
                        "Incompatible types of subcolumn '{}': {} and {}, cast to JSONB",
                        key.get_path(), subtypes[0]->get_name(), subtypes[i]->get_name());
                dimensions_conflict = true;
                break;
            }
        }
        if (dimensions_conflict) {
            // Array depths disagree, so no common array type exists: store the path as JSONB.
            tuple_types.emplace_back(make_nullable(std::make_shared<DataTypeJsonb>()));
            continue;
        }

        DataTypePtr common_type;
        get_least_supertype_jsonb(subtypes, &common_type);
        if (!common_type->is_nullable()) {
            common_type = make_nullable(common_type);
        }
        tuple_types.emplace_back(common_type);
    }
    CHECK_EQ(tuple_paths.size(), tuple_types.size());

    // Append all common type columns of this variant
    for (size_t i = 0; i < tuple_paths.size(); ++i) {
        TabletColumn common_column;
        // typed path not contains root part
        auto path_without_root = tuple_paths[i].copy_pop_front().get_path();
        if (typed_columns.contains(path_without_root) && !tuple_paths[i].has_nested_part()) {
            common_column = *typed_columns.at(path_without_root);
            // parent unique id and path may not be init in write path
            common_column.set_parent_unique_id(variant_col_unique_id);
            common_column.set_path_info(tuple_paths[i]);
            common_column.set_name(tuple_paths[i].get_path());
        } else {
            get_column_by_type(tuple_types[i], tuple_paths[i].get_path(), common_column,
                               ExtraInfo {.unique_id = -1,
                                          .parent_unique_id = variant_col_unique_id,
                                          .path_info = tuple_paths[i]});
        }
        common_schema->append_column(common_column);
        if (path_set != nullptr) {
            path_set->insert(tuple_paths[i]);
        }
    }
    return Status::OK();
}
656
657
// Merges the subcolumns of one variant column found across `schemas` into `common_schema`.
//
// Collects, for every column in `schemas` that has path info and whose parent unique id equals
// `variant_col_unique_id`, the data type observed for its path; rejects ambiguous paths; then
// delegates to update_least_schema_internal() to append the merged columns.
//
// @return the error from check_variant_has_no_ambiguous_paths(), or the result of
//         update_least_schema_internal().
Status update_least_common_schema(const std::vector<TabletSchemaSPtr>& schemas,
                                  TabletSchemaSPtr& common_schema, int32_t variant_col_unique_id,
                                  std::set<PathInData>* path_set) {
    // Sub columns already declared on the variant column, keyed by their name.
    std::map<std::string, TabletColumnPtr> typed_columns;
    for (const TabletColumnPtr& declared :
         common_schema->column_by_uid(variant_col_unique_id).get_sub_columns()) {
        typed_columns[declared->name()] = declared;
    }

    // Types of subcolumns by path from all tuples.
    std::map<PathInData, DataTypes> subcolumns_types;
    // Every collected path (duplicates included) for the batched ambiguity check.
    std::vector<PathInData> all_paths;

    for (const TabletSchemaSPtr& schema : schemas) {
        for (const TabletColumnPtr& candidate : schema->columns()) {
            // Only subcolumns of this variant are of interest.
            const bool belongs_to_variant = candidate->has_path_info() &&
                                            candidate->parent_unique_id() >= 0 &&
                                            candidate->parent_unique_id() == variant_col_unique_id;
            if (!belongs_to_variant) {
                continue;
            }
            subcolumns_types[*candidate->path_info_ptr()].emplace_back(
                    DataTypeFactory::instance().create_data_type(*candidate,
                                                                 candidate->is_nullable()));
            all_paths.push_back(*candidate->path_info_ptr());
        }
    }

    // Batch check for conflicts
    RETURN_IF_ERROR(check_variant_has_no_ambiguous_paths(all_paths));

    return update_least_schema_internal(subcolumns_types, common_schema, variant_col_unique_id,
                                        typed_columns, path_set);
}
689
690
// Keep variant subcolumn BF support aligned with FE DDL checks.
691
112k
bool is_bf_supported_by_fe_for_variant_subcolumn(FieldType type) {
692
112k
    switch (type) {
693
91
    case FieldType::OLAP_FIELD_TYPE_SMALLINT:
694
526
    case FieldType::OLAP_FIELD_TYPE_INT:
695
60.5k
    case FieldType::OLAP_FIELD_TYPE_BIGINT:
696
60.7k
    case FieldType::OLAP_FIELD_TYPE_LARGEINT:
697
60.7k
    case FieldType::OLAP_FIELD_TYPE_CHAR:
698
60.7k
    case FieldType::OLAP_FIELD_TYPE_VARCHAR:
699
93.6k
    case FieldType::OLAP_FIELD_TYPE_STRING:
700
93.6k
    case FieldType::OLAP_FIELD_TYPE_DATE:
701
93.6k
    case FieldType::OLAP_FIELD_TYPE_DATETIME:
702
93.8k
    case FieldType::OLAP_FIELD_TYPE_DATEV2:
703
94.0k
    case FieldType::OLAP_FIELD_TYPE_DATETIMEV2:
704
94.0k
    case FieldType::OLAP_FIELD_TYPE_TIMESTAMPTZ:
705
94.0k
    case FieldType::OLAP_FIELD_TYPE_DECIMAL:
706
94.1k
    case FieldType::OLAP_FIELD_TYPE_DECIMAL32:
707
94.2k
    case FieldType::OLAP_FIELD_TYPE_DECIMAL64:
708
94.5k
    case FieldType::OLAP_FIELD_TYPE_DECIMAL128I:
709
94.6k
    case FieldType::OLAP_FIELD_TYPE_DECIMAL256:
710
94.8k
    case FieldType::OLAP_FIELD_TYPE_IPV4:
711
94.9k
    case FieldType::OLAP_FIELD_TYPE_IPV6:
712
94.9k
        return true;
713
17.3k
    default:
714
17.3k
        return false;
715
112k
    }
716
112k
}
717
718
void inherit_column_attributes(const TabletColumn& source, TabletColumn& target,
719
112k
                               TabletSchemaSPtr* target_schema) {
720
112k
    if (!target.is_extracted_column()) {
721
0
        return;
722
0
    }
723
112k
    target.set_aggregation_method(source.aggregation());
724
725
    // 1. bloom filter
726
112k
    if (is_bf_supported_by_fe_for_variant_subcolumn(target.type())) {
727
94.9k
        target.set_is_bf_column(source.is_bf_column());
728
94.9k
    }
729
730
112k
    if (!target_schema) {
731
106k
        return;
732
106k
    }
733
734
    // 2. inverted index
735
5.69k
    TabletIndexes indexes_to_add;
736
5.69k
    auto source_indexes = (*target_schema)->inverted_indexs(source.unique_id());
737
    // if target is variant type, we need to inherit all indexes
738
    // because this schema is a read schema from fe
739
5.69k
    if (target.is_variant_type()) {
740
4.83k
        for (auto& index : source_indexes) {
741
404
            auto index_info = std::make_shared<TabletIndex>(*index);
742
404
            index_info->set_escaped_escaped_index_suffix_path(target.path_info_ptr()->get_path());
743
404
            indexes_to_add.emplace_back(std::move(index_info));
744
404
        }
745
4.83k
    } else {
746
857
        inherit_index(source_indexes, indexes_to_add, target);
747
857
    }
748
5.69k
    auto target_indexes = (*target_schema)
749
5.69k
                                  ->inverted_indexs(target.parent_unique_id(),
750
5.69k
                                                    target.path_info_ptr()->get_path());
751
5.79k
    if (target_indexes.empty()) {
752
5.79k
        for (auto& index_info : indexes_to_add) {
753
418
            (*target_schema)->append_index(std::move(*index_info));
754
418
        }
755
5.79k
    }
756
757
    // 3. TODO: gnragm bf index
758
5.69k
}
759
760
6.97k
// Walks every column of `schema` and, for each extracted column whose parent column is
// still present in the schema, inherits the parent's attributes (including index meta that
// the extracted column is missing).
void inherit_column_attributes(TabletSchemaSPtr& schema) {
    for (size_t idx = 0; idx < schema->num_columns(); ++idx) {
        TabletColumn& column = schema->mutable_column(idx);
        // A missing parent (field_index == -1) means the parent column is gone, maybe dropped.
        const bool has_live_parent = column.is_extracted_column() &&
                                     schema->field_index(column.parent_unique_id()) != -1;
        if (!has_live_parent) {
            continue;
        }
        const TabletColumn& parent = schema->column_by_uid(column.parent_unique_id());
        inherit_column_attributes(parent, column, &schema);
    }
}
774
775
// Builds `output_schema` as the least common schema of `schemas`.
//
// schemas:           input tablet schemas whose variant sub-columns are merged.
// base_schema:       schema supplying the non-extracted columns and other attributes. When
//                    null, the element of `schemas` with the highest schema version is used.
// output_schema:     replaced with a newly allocated schema holding the result.
// check_schema_size: when true, fail if the merged schema has more columns than
//                    config::variant_max_merged_tablet_schema_size.
//
// Returns InternalError when neither a base schema nor any input schema is available,
// DataQualityError when the size check fails, or the first error reported by
// update_least_common_schema().
Status get_least_common_schema(const std::vector<TabletSchemaSPtr>& schemas,
                               const TabletSchemaSPtr& base_schema, TabletSchemaSPtr& output_schema,
                               bool check_schema_size) {
    std::vector<int32_t> variant_column_unique_id;
    // Construct a schema excluding the extracted columns and gather unique identifiers for variants.
    // Ensure that the output schema also excludes these extracted columns. This approach prevents
    // duplicated paths following the update_least_common_schema process.
    auto build_schema_without_extracted_columns = [&](const TabletSchemaSPtr& source_schema) {
        output_schema = std::make_shared<TabletSchema>();
        // not copy columns but only shadow copy other attributes
        output_schema->shawdow_copy_without_columns(*source_schema);
        // Get all columns without extracted columns and collect variant col unique id
        for (const TabletColumnPtr& col : source_schema->columns()) {
            if (col->is_variant_type()) {
                variant_column_unique_id.push_back(col->unique_id());
            }
            if (!col->is_extracted_column()) {
                output_schema->append_column(*col);
            }
        }
    };
    if (base_schema == nullptr) {
        // std::max_element returns the end iterator for an empty range; dereferencing it
        // would be undefined behaviour, so report the problem instead.
        if (schemas.empty()) {
            return Status::InternalError(
                    "Can not build least common schema: no base schema and no input schemas");
        }
        // Pick tablet schema with max schema version. The comparator takes references so
        // that no shared_ptr is copied per comparison.
        TabletSchemaSPtr max_version_schema =
                *std::max_element(schemas.cbegin(), schemas.cend(),
                                  [](const TabletSchemaSPtr& a, const TabletSchemaSPtr& b) {
                                      return a->schema_version() < b->schema_version();
                                  });
        CHECK(max_version_schema);
        build_schema_without_extracted_columns(max_version_schema);
    } else {
        // use input base_schema schema as base schema
        build_schema_without_extracted_columns(base_schema);
    }

    for (int32_t unique_id : variant_column_unique_id) {
        std::set<PathInData> path_set;
        RETURN_IF_ERROR(update_least_common_schema(schemas, output_schema, unique_id, &path_set));
    }

    inherit_column_attributes(output_schema);
    if (check_schema_size &&
        output_schema->columns().size() > config::variant_max_merged_tablet_schema_size) {
        return Status::DataQualityError("Reached max column size limit {}",
                                        config::variant_max_merged_tablet_schema_size);
    }

    return Status::OK();
}
824
825
// sort by paths in lexicographical order
826
2.09k
ColumnVariant::Subcolumns get_sorted_subcolumns(const ColumnVariant::Subcolumns& subcolumns) {
827
    // sort by paths in lexicographical order
828
2.09k
    ColumnVariant::Subcolumns sorted = subcolumns;
829
1.22M
    std::sort(sorted.begin(), sorted.end(), [](const auto& lhsItem, const auto& rhsItem) {
830
1.22M
        return lhsItem->path < rhsItem->path;
831
1.22M
    });
832
2.09k
    return sorted;
833
2.09k
}
834
835
bool has_schema_index_diff(const TabletSchema* new_schema, const TabletSchema* old_schema,
836
24.2k
                           int32_t new_col_idx, int32_t old_col_idx) {
837
24.2k
    const auto& column_new = new_schema->column(new_col_idx);
838
24.2k
    const auto& column_old = old_schema->column(old_col_idx);
839
840
24.2k
    if (column_new.is_bf_column() != column_old.is_bf_column()) {
841
94
        return true;
842
94
    }
843
844
24.1k
    auto new_schema_inverted_indexs = new_schema->inverted_indexs(column_new);
845
24.1k
    auto old_schema_inverted_indexs = old_schema->inverted_indexs(column_old);
846
847
24.1k
    if (new_schema_inverted_indexs.size() != old_schema_inverted_indexs.size()) {
848
705
        return true;
849
705
    }
850
851
23.8k
    for (size_t i = 0; i < new_schema_inverted_indexs.size(); ++i) {
852
387
        if (!new_schema_inverted_indexs[i]->is_same_except_id(old_schema_inverted_indexs[i])) {
853
19
            return true;
854
19
        }
855
387
    }
856
857
23.4k
    return false;
858
23.4k
}
859
860
1.12k
// Creates the sparse column of `variant`: a MAP column named
// "<variant lower-case name>.<SPARSE_COLUMN_PATH>" whose path info equals its name, which
// shares the variant's aggregation method and points to the variant as parent.
TabletColumn create_sparse_column(const TabletColumn& variant) {
    const std::string sparse_name = variant.name_lower_case() + "." + SPARSE_COLUMN_PATH;

    TabletColumn sparse;
    sparse.set_name(sparse_name);
    sparse.set_type(FieldType::OLAP_FIELD_TYPE_MAP);
    sparse.set_aggregation_method(variant.aggregation());
    sparse.set_path_info(PathInData {sparse_name});
    sparse.set_parent_unique_id(variant.unique_id());
    // set default value to "NULL" DefaultColumnIterator will call insert_many_defaults
    sparse.set_default_value("NULL");

    // The MAP carries two STRING sub-columns (presumably key and value -- confirm).
    TabletColumn string_child;
    string_child.set_type(FieldType::OLAP_FIELD_TYPE_STRING);
    for (int i = 0; i < 2; ++i) {
        sparse.add_sub_column(string_child);
    }
    return sparse;
}
875
876
27.4k
// Creates one bucket of the sharded sparse column of `variant`: a MAP column named
// "<variant lower-case name>.<SPARSE_COLUMN_PATH>.b<bucket_index>" whose path info equals
// its name, which shares the variant's aggregation method and points to the variant as parent.
TabletColumn create_sparse_shard_column(const TabletColumn& variant, int bucket_index) {
    const std::string shard_name = variant.name_lower_case() + "." + SPARSE_COLUMN_PATH + ".b" +
                                   std::to_string(bucket_index);

    TabletColumn shard;
    shard.set_name(shard_name);
    shard.set_type(FieldType::OLAP_FIELD_TYPE_MAP);
    shard.set_aggregation_method(variant.aggregation());
    shard.set_parent_unique_id(variant.unique_id());
    shard.set_default_value("NULL");
    PathInData shard_path(shard_name);
    shard.set_path_info(shard_path);

    // The MAP carries two STRING sub-columns (presumably key and value -- confirm).
    TabletColumn string_child;
    string_child.set_type(FieldType::OLAP_FIELD_TYPE_STRING);
    for (int i = 0; i < 2; ++i) {
        shard.add_sub_column(string_child);
    }
    return shard;
}
893
894
7.01k
// Creates one bucket of the doc-value column of `variant`: a MAP column named
// "<variant lower-case name>.<DOC_VALUE_COLUMN_PATH>.b<bucket_index>" whose path info equals
// its name, which shares the variant's aggregation method and points to the variant as parent.
TabletColumn create_doc_value_column(const TabletColumn& variant, int bucket_index) {
    const std::string doc_value_name = variant.name_lower_case() + "." + DOC_VALUE_COLUMN_PATH +
                                       ".b" + std::to_string(bucket_index);

    TabletColumn doc_value;
    doc_value.set_name(doc_value_name);
    doc_value.set_type(FieldType::OLAP_FIELD_TYPE_MAP);
    doc_value.set_aggregation_method(variant.aggregation());
    doc_value.set_parent_unique_id(variant.unique_id());
    doc_value.set_default_value("NULL");
    doc_value.set_path_info(PathInData {doc_value_name});

    // The MAP carries two STRING sub-columns (presumably key and value -- confirm).
    TabletColumn string_child;
    string_child.set_type(FieldType::OLAP_FIELD_TYPE_STRING);
    for (int i = 0; i < 2; ++i) {
        doc_value.add_sub_column(string_child);
    }
    return doc_value;
}
911
912
114k
uint32_t variant_binary_shard_of(const StringRef& path, uint32_t bucket_num) {
913
114k
    if (bucket_num <= 1) return 0;
914
88.7k
    SipHash hash;
915
88.7k
    hash.update(path.data, path.size);
916
88.7k
    uint64_t h = hash.get64();
917
88.7k
    return static_cast<uint32_t>(h % bucket_num);
918
114k
}
919
920
// Adds the per-path non-null counts of every checked variant column of rowset `rs` to
// `uid_to_path_stats` (variant column unique id -> path -> accumulated count).
//
// Variant columns with a negative unique id, or rejected by
// should_check_variant_path_stats(), are ignored. Both the sparse-column and the subcolumn
// statistics of each segment are accumulated. A map entry for a column is created only
// when at least one path is added for it.
//
// Returns the first error raised while loading segments, obtaining a column reader or
// loading the reader's external meta.
Status VariantCompactionUtil::aggregate_path_to_stats(
        const RowsetSharedPtr& rs,
        std::unordered_map<int32_t, PathToNoneNullValues>* uid_to_path_stats) {
    SegmentCacheHandle segment_cache;
    RETURN_IF_ERROR(SegmentLoader::instance()->load_segments(
            std::static_pointer_cast<BetaRowset>(rs), &segment_cache));

    for (const auto& column : rs->tablet_schema()->columns()) {
        const bool is_checked_variant = column->is_variant_type() && column->unique_id() >= 0 &&
                                        should_check_variant_path_stats(*column);
        if (!is_checked_variant) {
            continue;
        }

        // agg path -> stats; entries are created lazily, one path at a time.
        const auto accumulate = [&](const auto& path_sizes) {
            for (const auto& [path, size] : path_sizes) {
                (*uid_to_path_stats)[column->unique_id()][path] += size;
            }
        };

        for (const auto& segment : segment_cache.get_segments()) {
            std::shared_ptr<ColumnReader> column_reader;
            OlapReaderStatistics reader_stats;
            RETURN_IF_ERROR(segment->get_column_reader(column->unique_id(), &column_reader,
                                                       &reader_stats));
            if (!column_reader) {
                continue;
            }

            CHECK(column_reader->get_meta_type() == FieldType::OLAP_FIELD_TYPE_VARIANT);
            auto* variant_reader =
                    assert_cast<segment_v2::VariantColumnReader*>(column_reader.get());
            // load external meta before getting stats
            RETURN_IF_ERROR(variant_reader->load_external_meta_once());
            const auto* source_stats = variant_reader->get_stats();
            CHECK(source_stats);

            accumulate(source_stats->sparse_column_non_null_size);
            accumulate(source_stats->subcolumns_non_null_size);
        }
    }
    return Status::OK();
}
963
964
// Collects, for every variant column of rowset `rs`, the information used to build the
// compaction schema and merges it into `uid_to_variant_extended_info` (keyed by the
// column's unique id). An entry is created for every variant column, even when the rowset
// has no segment for it.
//
// Per segment the following is gathered:
//   1. path -> non-null count and the set of sparse paths (skipped for nested-group columns),
//   2. path -> data types,
//   3. typed paths,
//   4. nested paths (skipped for nested-group columns).
//
// Returns the first error raised while loading segments, obtaining a column reader or
// loading the reader's external meta.
Status VariantCompactionUtil::aggregate_variant_extended_info(
        const RowsetSharedPtr& rs,
        std::unordered_map<int32_t, VariantExtendedInfo>* uid_to_variant_extended_info) {
    SegmentCacheHandle segment_cache;
    RETURN_IF_ERROR(SegmentLoader::instance()->load_segments(
            std::static_pointer_cast<BetaRowset>(rs), &segment_cache));

    for (const auto& column : rs->tablet_schema()->columns()) {
        if (!column->is_variant_type()) {
            continue;
        }
        auto& info = (*uid_to_variant_extended_info)[column->unique_id()];
        const bool nested_group_enabled = column->variant_enable_nested_group();
        if (nested_group_enabled) {
            info.has_nested_group = true;
        }

        for (const auto& segment : segment_cache.get_segments()) {
            std::shared_ptr<ColumnReader> column_reader;
            OlapReaderStatistics reader_stats;
            RETURN_IF_ERROR(segment->get_column_reader(column->unique_id(), &column_reader,
                                                       &reader_stats));
            if (!column_reader) {
                continue;
            }

            CHECK(column_reader->get_meta_type() == FieldType::OLAP_FIELD_TYPE_VARIANT);
            auto* variant_reader =
                    assert_cast<segment_v2::VariantColumnReader*>(column_reader.get());
            // load external meta before getting stats
            RETURN_IF_ERROR(variant_reader->load_external_meta_once());
            const auto* source_stats = variant_reader->get_stats();
            CHECK(source_stats);

            // 1. agg path -> stats
            // NG roots still need type metadata for regular subpaths such as `v.owner`,
            // but their compaction schema should not be driven by flat path stats.
            if (!nested_group_enabled) {
                for (const auto& [path, size] : source_stats->sparse_column_non_null_size) {
                    info.path_to_none_null_values[path] += size;
                    info.sparse_paths.emplace(path);
                }
                for (const auto& [path, size] : source_stats->subcolumns_non_null_size) {
                    info.path_to_none_null_values[path] += size;
                }
            }

            // 2. agg path -> schema
            variant_reader->get_subcolumns_types(&info.path_to_data_types);

            // 3. extract typed paths
            variant_reader->get_typed_paths(&info.typed_paths);

            // 4. extract nested paths
            if (!nested_group_enabled) {
                variant_reader->get_nested_paths(&info.nested_paths);
            }
        }
    }
    return Status::OK();
}
1023
1024
// get the subpaths and sparse paths for the variant column
1025
void VariantCompactionUtil::get_subpaths(int32_t max_subcolumns_count,
1026
                                         const PathToNoneNullValues& stats,
1027
415
                                         TabletSchema::PathsSetInfo& paths_set_info) {
1028
    // max_subcolumns_count is 0 means no limit
1029
415
    if (max_subcolumns_count > 0 && stats.size() > max_subcolumns_count) {
1030
103
        std::vector<std::pair<size_t, std::string_view>> paths_with_sizes;
1031
103
        paths_with_sizes.reserve(stats.size());
1032
2.16k
        for (const auto& [path, size] : stats) {
1033
2.16k
            paths_with_sizes.emplace_back(size, path);
1034
2.16k
        }
1035
103
        std::sort(paths_with_sizes.begin(), paths_with_sizes.end(), std::greater());
1036
1037
        // Select top N paths as subcolumns, remaining paths as sparse columns
1038
2.16k
        for (const auto& [size, path] : paths_with_sizes) {
1039
2.16k
            if (paths_set_info.sub_path_set.size() < max_subcolumns_count) {
1040
214
                paths_set_info.sub_path_set.emplace(path);
1041
1.95k
            } else {
1042
1.95k
                paths_set_info.sparse_path_set.emplace(path);
1043
1.95k
            }
1044
2.16k
        }
1045
103
        LOG(INFO) << "subpaths " << paths_set_info.sub_path_set.size() << " sparse paths "
1046
103
                  << paths_set_info.sparse_path_set.size() << " variant max subcolumns count "
1047
103
                  << max_subcolumns_count << " stats size " << paths_with_sizes.size();
1048
312
    } else {
1049
        // Apply all paths as subcolumns
1050
2.12k
        for (const auto& [path, _] : stats) {
1051
2.12k
            paths_set_info.sub_path_set.emplace(path);
1052
2.12k
        }
1053
312
    }
1054
415
}
1055
1056
// Sanity check run after compaction: verifies that the per-path non-null counts of the
// variant columns in `output` are consistent with the counts aggregated over `intputs`.
//
// The check is skipped (Status::OK) when
//   - the output schema has no variant column,
//   - any input or output variant column is rejected by should_check_variant_path_stats(),
//   - any input rowset schema contains an extracted column,
//   - the tablet is not DUP_KEYS (rows may be merged in other models),
//   - any input rowset has a delete predicate.
// Variant columns in doc mode are skipped individually.
//
// Returns InternalError when the output schema holds an unexpected extracted column
// (non-BE_TEST builds only), when input statistics are missing for a uid or path, or when
// the counts are inconsistent; also propagates errors of aggregate_path_to_stats().
Status VariantCompactionUtil::check_path_stats(const std::vector<RowsetSharedPtr>& intputs,
                                               RowsetSharedPtr output, BaseTabletSPtr tablet) {
    if (output->tablet_schema()->num_variant_columns() == 0) {
        return Status::OK();
    }
    for (const auto& input_rowset : intputs) {
        for (const auto& column : input_rowset->tablet_schema()->columns()) {
            if (column->is_variant_type() && !should_check_variant_path_stats(*column)) {
                return Status::OK();
            }
        }
    }
    // check no extended schema in input rowsets
    for (const auto& input_rowset : intputs) {
        for (const auto& column : input_rowset->tablet_schema()->columns()) {
            if (column->is_extracted_column()) {
                return Status::OK();
            }
        }
    }
#ifndef BE_TEST
    // check no extended schema in output rowset; only the doc-value / sparse columns
    // (recognized by name) are tolerated
    for (const auto& column : output->tablet_schema()->columns()) {
        if (!column->is_extracted_column()) {
            continue;
        }
        const auto& name = column->name();
        const bool is_tolerated =
                name.find("." + DOC_VALUE_COLUMN_PATH + ".") != std::string::npos ||
                name.find("." + SPARSE_COLUMN_PATH + ".") != std::string::npos ||
                name.ends_with("." + SPARSE_COLUMN_PATH);
        if (!is_tolerated) {
            return Status::InternalError("Unexpected extracted column {} in output rowset",
                                         column->name());
        }
    }
#endif
    // only check path stats for dup_keys since the rows may be merged in other models
    if (tablet->keys_type() != KeysType::DUP_KEYS) {
        return Status::OK();
    }
    // if there is a delete predicate in the input rowsets, we skip the path stats check
    for (auto& input_rowset : intputs) {
        if (input_rowset->rowset_meta()->has_delete_predicate()) {
            return Status::OK();
        }
    }
    for (const auto& column : output->tablet_schema()->columns()) {
        if (column->is_variant_type() && !should_check_variant_path_stats(*column)) {
            return Status::OK();
        }
    }

    std::unordered_map<int32_t, PathToNoneNullValues> original_uid_to_path_stats;
    for (const auto& input_rowset : intputs) {
        RETURN_IF_ERROR(aggregate_path_to_stats(input_rowset, &original_uid_to_path_stats));
    }
    std::unordered_map<int32_t, PathToNoneNullValues> output_uid_to_path_stats;
    RETURN_IF_ERROR(aggregate_path_to_stats(output, &output_uid_to_path_stats));

    for (const auto& [uid, output_stats] : output_uid_to_path_stats) {
        const auto& variant_column = output->tablet_schema()->column_by_uid(uid);
        if (variant_column.is_variant_type() && variant_column.variant_enable_doc_mode()) {
            continue;
        }
        const auto input_entry = original_uid_to_path_stats.find(uid);
        if (input_entry == original_uid_to_path_stats.end()) {
            return Status::InternalError("Path stats not found for uid {}, tablet_id {}", uid,
                                         tablet->tablet_id());
        }
        const auto& input_stats = input_entry->second;

        // In input rowsets, some rowsets may have statistics values exceeding the maximum limit,
        // which leads to inaccurate statistics
        if (output_stats.size() > variant_column.variant_max_sparse_column_statistics_size()) {
            // When there is only one segment, we can ensure that the size of each path in output stats is accurate
            if (output->num_segments() != 1) {
                continue;
            }
            for (const auto& [path, size] : output_stats) {
                const auto input_path = input_stats.find(path);
                if (input_path == input_stats.end()) {
                    continue;
                }
                if (input_path->second > size) {
                    return Status::InternalError(
                            "Path stats not smaller for uid {} with path `{}`, input size {}, "
                            "output size {}, tablet_id {}",
                            uid, path, input_path->second, size, tablet->tablet_id());
                }
            }
            continue;
        }

        // in this case, input stats is accurate, so we check the stats size and stats value
        for (const auto& [path, size] : output_stats) {
            const auto input_path = input_stats.find(path);
            if (input_path == input_stats.end()) {
                return Status::InternalError(
                        "Path stats not found for uid {}, path {}, tablet_id {}", uid, path,
                        tablet->tablet_id());
            }
            if (input_path->second != size) {
                return Status::InternalError(
                        "Path stats not match for uid {} with path `{}`, input size {}, output "
                        "size {}, tablet_id {}",
                        uid, path, input_path->second, size, tablet->tablet_id());
            }
        }
    }

    return Status::OK();
}
1169
1170
// Appends one column per typed path of `parent_column` to `output_schema` and records it in
// `paths_set_info.typed_path_set`.
//
// Nothing is done when the parent column has variant_enable_typed_paths_to_sparse() set.
// Returns InternalError as soon as generate_sub_column_info() fails for a path; columns
// appended for earlier paths stay in `output_schema`.
Status VariantCompactionUtil::get_compaction_typed_columns(
        const TabletSchemaSPtr& target, const std::unordered_set<std::string>& typed_paths,
        const TabletColumnPtr parent_column, TabletSchemaSPtr& output_schema,
        TabletSchema::PathsSetInfo& paths_set_info) {
    if (parent_column->variant_enable_typed_paths_to_sparse()) {
        return Status::OK();
    }
    for (const auto& typed_path : typed_paths) {
        TabletSchema::SubColumnInfo typed_info;
        const bool generated = generate_sub_column_info(*target, parent_column->unique_id(),
                                                        typed_path, &typed_info);
        if (!generated) {
            return Status::InternalError("Failed to generate sub column info for path {}",
                                         typed_path);
        }
        inherit_column_attributes(*parent_column, typed_info.column);
        output_schema->append_column(typed_info.column);
        paths_set_info.typed_path_set.insert({typed_path, std::move(typed_info)});
        VLOG_DEBUG << "append typed column " << typed_path;
    }
    return Status::OK();
}
1190
1191
// Appends one column per nested path of `parent_column` to `output_schema`.
//
// The column type is the least supertype (get_least_supertype_jsonb) of the data types
// recorded for the path in `path_to_data_types`. The column is named
// "<parent lower-case name>.<path>", inherits the parent's attributes, and the indexes
// inherited from the parent are stored in `paths_set_info.subcolumn_indexes` under the path.
//
// Returns InternalError when a nested path has no (or an empty) data type list.
Status VariantCompactionUtil::get_compaction_nested_columns(
        const std::unordered_set<PathInData, PathInData::Hash>& nested_paths,
        const PathToDataTypes& path_to_data_types, const TabletColumnPtr parent_column,
        TabletSchemaSPtr& output_schema, TabletSchema::PathsSetInfo& paths_set_info) {
    const auto& parent_indexes = output_schema->inverted_indexs(parent_column->unique_id());
    for (const auto& nested_path : nested_paths) {
        const auto types_it = path_to_data_types.find(nested_path);
        const bool has_types = types_it != path_to_data_types.end() && !types_it->second.empty();
        if (!has_types) {
            return Status::InternalError("Nested path {} has no data type",
                                         nested_path.get_path());
        }
        DataTypePtr merged_type;
        get_least_supertype_jsonb(types_it->second, &merged_type);

        const std::string column_name =
                parent_column->name_lower_case() + "." + nested_path.get_path();
        // Full path = parent column name followed by the parts of the nested path.
        PathInDataBuilder path_builder;
        auto full_path = path_builder.append(parent_column->name_lower_case(), false)
                                 .append(nested_path.get_parts(), false)
                                 .build();
        TabletColumn nested_column =
                get_column_by_type(merged_type, column_name,
                                   ExtraInfo {.unique_id = -1,
                                              .parent_unique_id = parent_column->unique_id(),
                                              .path_info = full_path});
        inherit_column_attributes(*parent_column, nested_column);

        TabletIndexes nested_indexes;
        inherit_index(parent_indexes, nested_indexes, nested_column);
        paths_set_info.subcolumn_indexes.emplace(nested_path.get_path(),
                                                 std::move(nested_indexes));
        output_schema->append_column(nested_column);
        VLOG_DEBUG << "append nested column " << nested_path.get_path();
    }
    return Status::OK();
}
1223
1224
void VariantCompactionUtil::get_compaction_subcolumns_from_subpaths(
1225
        TabletSchema::PathsSetInfo& paths_set_info, const TabletColumnPtr parent_column,
1226
        const TabletSchemaSPtr& target, const PathToDataTypes& path_to_data_types,
1227
317
        const std::unordered_set<std::string>& sparse_paths, TabletSchemaSPtr& output_schema) {
1228
317
    auto& path_set = paths_set_info.sub_path_set;
1229
317
    std::vector<StringRef> sorted_subpaths(path_set.begin(), path_set.end());
1230
317
    std::sort(sorted_subpaths.begin(), sorted_subpaths.end());
1231
317
    const auto& parent_indexes = target->inverted_indexs(parent_column->unique_id());
1232
    // append subcolumns
1233
937
    for (const auto& subpath : sorted_subpaths) {
1234
937
        auto column_name = parent_column->name_lower_case() + "." + subpath.to_string();
1235
937
        auto column_path = make_full_subcolumn_path(parent_column,
1236
937
                                                    std::string_view(subpath.data, subpath.size));
1237
1238
937
        const auto& find_data_types = path_to_data_types.find(PathInData(subpath));
1239
1240
        // some cases: the subcolumn type is variant
1241
        // 1. this path has no data type in segments
1242
        // 2. this path is in sparse paths
1243
        // 3. the sparse paths are too much
1244
937
        TabletSchema::SubColumnInfo sub_column_info;
1245
937
        if (parent_column->variant_enable_typed_paths_to_sparse() &&
1246
937
            generate_sub_column_info(*target, parent_column->unique_id(), std::string(subpath),
1247
72
                                     &sub_column_info)) {
1248
62
            inherit_column_attributes(*parent_column, sub_column_info.column);
1249
62
            output_schema->append_column(sub_column_info.column);
1250
62
            paths_set_info.subcolumn_indexes.emplace(subpath, std::move(sub_column_info.indexes));
1251
62
            VLOG_DEBUG << "append typed column " << subpath;
1252
875
        } else if (find_data_types == path_to_data_types.end() || find_data_types->second.empty() ||
1253
875
                   sparse_paths.find(std::string(subpath)) != sparse_paths.end() ||
1254
875
                   sparse_paths.size() >=
1255
775
                           parent_column->variant_max_sparse_column_statistics_size()) {
1256
107
            TabletColumn subcolumn;
1257
107
            subcolumn.set_name(column_name);
1258
107
            subcolumn.set_type(FieldType::OLAP_FIELD_TYPE_VARIANT);
1259
107
            subcolumn.set_parent_unique_id(parent_column->unique_id());
1260
107
            subcolumn.set_path_info(column_path);
1261
107
            subcolumn.set_aggregation_method(parent_column->aggregation());
1262
107
            subcolumn.set_variant_max_subcolumns_count(
1263
107
                    parent_column->variant_max_subcolumns_count());
1264
107
            subcolumn.set_variant_enable_doc_mode(parent_column->variant_enable_doc_mode());
1265
107
            subcolumn.set_is_nullable(true);
1266
107
            output_schema->append_column(subcolumn);
1267
107
            VLOG_DEBUG << "append sub column " << subpath << " data type "
1268
0
                       << "VARIANT";
1269
107
        }
1270
        // normal case: the subcolumn type can be calculated from the data types in segments
1271
768
        else {
1272
768
            DataTypePtr data_type;
1273
768
            get_least_supertype_jsonb(find_data_types->second, &data_type);
1274
768
            TabletColumn sub_column =
1275
768
                    get_column_by_type(data_type, column_name,
1276
768
                                       ExtraInfo {.unique_id = -1,
1277
768
                                                  .parent_unique_id = parent_column->unique_id(),
1278
768
                                                  .path_info = column_path});
1279
768
            inherit_column_attributes(*parent_column, sub_column);
1280
768
            TabletIndexes sub_column_indexes;
1281
768
            inherit_index(parent_indexes, sub_column_indexes, sub_column);
1282
768
            paths_set_info.subcolumn_indexes.emplace(subpath, std::move(sub_column_indexes));
1283
768
            output_schema->append_column(sub_column);
1284
768
            VLOG_DEBUG << "append sub column " << subpath << " data type " << data_type->get_name();
1285
768
        }
1286
937
    }
1287
317
}
1288
1289
void VariantCompactionUtil::get_compaction_subcolumns_from_data_types(
1290
        TabletSchema::PathsSetInfo& paths_set_info, const TabletColumnPtr parent_column,
1291
        const TabletSchemaSPtr& target, const PathToDataTypes& path_to_data_types,
1292
97
        TabletSchemaSPtr& output_schema) {
1293
97
    const auto& parent_indexes = target->inverted_indexs(parent_column->unique_id());
1294
1.45k
    for (const auto& [path, data_types] : path_to_data_types) {
1295
        // Typed paths are materialized by get_compaction_typed_columns(); this helper only
1296
        // materializes regular subcolumns inferred from rowset data types.
1297
1.45k
        if (data_types.empty() || path.empty() || path.get_is_typed() || path.has_nested_part()) {
1298
57
            continue;
1299
57
        }
1300
1.40k
        DataTypePtr data_type;
1301
1.40k
        get_least_supertype_jsonb(data_types, &data_type);
1302
1.40k
        auto column_name = parent_column->name_lower_case() + "." + path.get_path();
1303
1.40k
        auto column_path = make_full_subcolumn_path(parent_column, path.get_path());
1304
1.40k
        TabletColumn sub_column =
1305
1.40k
                get_column_by_type(data_type, column_name,
1306
1.40k
                                   ExtraInfo {.unique_id = -1,
1307
1.40k
                                              .parent_unique_id = parent_column->unique_id(),
1308
1.40k
                                              .path_info = column_path});
1309
1.40k
        inherit_column_attributes(*parent_column, sub_column);
1310
1.40k
        TabletIndexes sub_column_indexes;
1311
1.40k
        inherit_index(parent_indexes, sub_column_indexes, sub_column);
1312
1.40k
        paths_set_info.sub_path_set.emplace(path.get_path());
1313
1.40k
        paths_set_info.subcolumn_indexes.emplace(path.get_path(), std::move(sub_column_indexes));
1314
1.40k
        output_schema->append_column(sub_column);
1315
1.40k
        VLOG_DEBUG << "append sub column " << path.get_path() << " data type "
1316
0
                   << data_type->get_name();
1317
1.40k
    }
1318
    // The data-type map can contain the variant root as PathInData(), while an empty JSON key
1319
    // only appears in path stats as "". If stats selected it as a materialized path, append it
1320
    // with an explicit empty path part so it does not collide with root.
1321
97
    append_empty_key_subcolumn_from_stats(paths_set_info, parent_column, output_schema);
1322
97
}
1323
1324
// Build the temporary schema for compaction.
1325
// NestedGroup roots are special: the root VARIANT column owns the NG tree and the streaming NG
1326
// writer handles NG children, while regular non-NG paths beside the arrays are materialized as
1327
// ordinary extracted subcolumns. NG typed paths still use get_compaction_typed_columns(), keeping
1328
// typed-column rules out of the NG-specific regular-path filtering.
1329
Status VariantCompactionUtil::get_extended_compaction_schema(
1330
7.74k
        const std::vector<RowsetSharedPtr>& rowsets, TabletSchemaSPtr& target) {
1331
7.74k
    std::unordered_map<int32_t, VariantExtendedInfo> uid_to_variant_extended_info;
1332
7.74k
    const bool needs_variant_extended_info =
1333
84.6k
            std::ranges::any_of(target->columns(), [](const TabletColumnPtr& column) {
1334
84.6k
                return column->is_variant_type() && (should_check_variant_path_stats(*column) ||
1335
574
                                                     column->variant_enable_nested_group());
1336
84.6k
            });
1337
7.74k
    if (needs_variant_extended_info) {
1338
        // collect path stats from all rowsets and segments
1339
4.97k
        for (const auto& rs : rowsets) {
1340
4.97k
            RETURN_IF_ERROR(aggregate_variant_extended_info(rs, &uid_to_variant_extended_info));
1341
4.97k
        }
1342
573
    }
1343
1344
    // build the output schema
1345
7.74k
    TabletSchemaSPtr output_schema = std::make_shared<TabletSchema>();
1346
7.74k
    output_schema->shawdow_copy_without_columns(*target);
1347
7.74k
    std::unordered_map<int32_t, TabletSchema::PathsSetInfo> uid_to_paths_set_info;
1348
7.74k
    const auto ng_root_uids =
1349
7.74k
            collect_nested_group_compaction_root_uids(target, uid_to_variant_extended_info);
1350
88.1k
    for (const TabletColumnPtr& column : target->columns()) {
1351
88.1k
        if (!column->is_extracted_column()) {
1352
87.9k
            output_schema->append_column(*column);
1353
87.9k
        }
1354
88.1k
        if (!column->is_variant_type()) {
1355
87.5k
            continue;
1356
87.5k
        }
1357
18.4E
        VLOG_DEBUG << "column " << column->name() << " unique id " << column->unique_id();
1358
1359
622
        const auto info_it = uid_to_variant_extended_info.find(column->unique_id());
1360
622
        const VariantExtendedInfo empty_extended_info;
1361
622
        const VariantExtendedInfo& extended_info = info_it == uid_to_variant_extended_info.end()
1362
622
                                                           ? empty_extended_info
1363
622
                                                           : info_it->second;
1364
622
        auto& paths_set_info = uid_to_paths_set_info[column->unique_id()];
1365
622
        const bool use_nested_group_compaction_schema = ng_root_uids.contains(column->unique_id());
1366
1367
622
        if (use_nested_group_compaction_schema) {
1368
            // 1. append typed columns. Keep this shared with the non-NG typed helper; only the
1369
            // regular-path selection below is NG-specific.
1370
1
            RETURN_IF_ERROR(get_compaction_typed_columns(target, extended_info.typed_paths, column,
1371
1
                                                         output_schema, paths_set_info));
1372
1373
            // NG roots do not record path-count stats for ordinary Variant paths, so their regular
1374
            // non-NG subcolumns use the same data-types materialization helper as the
1375
            // all-materialized non-NG branch below.
1376
1
            auto regular_path_to_data_types =
1377
1
                    collect_regular_types_outside_nested_group(extended_info);
1378
1
            get_compaction_subcolumns_from_data_types(paths_set_info, column, target,
1379
1
                                                      regular_path_to_data_types, output_schema);
1380
1
            LOG(INFO) << "Variant column uid=" << column->unique_id()
1381
1
                      << " keeps nested-group root and materializes regular non-NG subcolumns in "
1382
1
                         "compaction schema";
1383
1
            continue;
1384
1
        }
1385
1386
621
        if (column->variant_enable_doc_mode()) {
1387
263
            const int bucket_num = std::max(1, column->variant_doc_hash_shard_count());
1388
655
            for (int b = 0; b < bucket_num; ++b) {
1389
392
                TabletColumn doc_value_bucket_column = create_doc_value_column(*column, b);
1390
392
                doc_value_bucket_column.set_type(FieldType::OLAP_FIELD_TYPE_VARIANT);
1391
392
                doc_value_bucket_column.set_is_nullable(false);
1392
392
                doc_value_bucket_column.set_variant_enable_doc_mode(true);
1393
392
                output_schema->append_column(doc_value_bucket_column);
1394
392
            }
1395
263
            continue;
1396
263
        }
1397
1398
        // 1. append typed columns
1399
358
        RETURN_IF_ERROR(get_compaction_typed_columns(target, extended_info.typed_paths, column,
1400
358
                                                     output_schema, paths_set_info));
1401
1402
        // 2. append nested columns
1403
358
        RETURN_IF_ERROR(get_compaction_nested_columns(extended_info.nested_paths,
1404
358
                                                      extended_info.path_to_data_types, column,
1405
358
                                                      output_schema, paths_set_info));
1406
1407
        // 3. get the subpaths
1408
358
        get_subpaths(column->variant_max_subcolumns_count(), extended_info.path_to_none_null_values,
1409
358
                     paths_set_info);
1410
1411
        // 4. append subcolumns
1412
358
        if (column->variant_max_subcolumns_count() > 0 || !column->get_sub_columns().empty()) {
1413
309
            get_compaction_subcolumns_from_subpaths(paths_set_info, column, target,
1414
309
                                                    extended_info.path_to_data_types,
1415
309
                                                    extended_info.sparse_paths, output_schema);
1416
309
        }
1417
        // variant_max_subcolumns_count == 0 and no typed paths materialized
1418
        // it means that all subcolumns are materialized, may be from old data
1419
49
        else {
1420
49
            get_compaction_subcolumns_from_data_types(paths_set_info, column, target,
1421
49
                                                      extended_info.path_to_data_types,
1422
49
                                                      output_schema);
1423
49
        }
1424
1425
        // append sparse column(s)
1426
        // If variant uses bucketized sparse columns, append one sparse bucket column per bucket.
1427
        // Otherwise, append the single sparse column.
1428
358
        int bucket_num = std::max(1, column->variant_sparse_hash_shard_count());
1429
369
        if (bucket_num > 1) {
1430
1.85k
            for (int b = 0; b < bucket_num; ++b) {
1431
1.48k
                TabletColumn sparse_bucket_column = create_sparse_shard_column(*column, b);
1432
1.48k
                output_schema->append_column(sparse_bucket_column);
1433
1.48k
            }
1434
18.4E
        } else {
1435
18.4E
            TabletColumn sparse_column = create_sparse_column(*column);
1436
18.4E
            output_schema->append_column(sparse_column);
1437
18.4E
        }
1438
358
    }
1439
1440
7.74k
    target = output_schema;
1441
    // used to merge & filter path to sparse column during reading in compaction
1442
7.74k
    target->set_path_set_info(std::move(uid_to_paths_set_info));
1443
18.4E
    VLOG_DEBUG << "dump schema " << target->dump_full_schema();
1444
7.74k
    return Status::OK();
1445
7.74k
}
1446
1447
// Calculate statistics about variant data paths from the encoded sparse column
1448
void VariantCompactionUtil::calculate_variant_stats(const IColumn& encoded_sparse_column,
1449
                                                    segment_v2::VariantStatisticsPB* stats,
1450
                                                    size_t max_sparse_column_statistics_size,
1451
1.46k
                                                    size_t row_pos, size_t num_rows) {
1452
    // Cast input column to ColumnMap type since sparse column is stored as a map
1453
1.46k
    const auto& map_column = assert_cast<const ColumnMap&>(encoded_sparse_column);
1454
1455
    // Get the keys column which contains the paths as strings
1456
1.46k
    const auto& sparse_data_paths =
1457
1.46k
            assert_cast<const ColumnString*>(map_column.get_keys_ptr().get());
1458
1.46k
    const auto& serialized_sparse_column_offsets = map_column.get_offsets();
1459
1.46k
    auto& count_map = *stats->mutable_sparse_column_non_null_size();
1460
    // Iterate through all paths in the sparse column
1461
316k
    for (size_t i = row_pos; i != row_pos + num_rows; ++i) {
1462
314k
        size_t offset = serialized_sparse_column_offsets[i - 1];
1463
314k
        size_t end = serialized_sparse_column_offsets[i];
1464
1.64M
        for (size_t j = offset; j != end; ++j) {
1465
1.33M
            auto path = sparse_data_paths->get_data_at(j);
1466
1467
1.33M
            const auto& sparse_path = path.to_string();
1468
            // If path already exists in statistics, increment its count
1469
1.33M
            if (auto it = count_map.find(sparse_path); it != count_map.end()) {
1470
1.33M
                ++it->second;
1471
1.33M
            }
1472
            // If path doesn't exist and we haven't hit the max statistics size limit,
1473
            // add it with count 1
1474
1.92k
            else if (count_map.size() < max_sparse_column_statistics_size) {
1475
1.90k
                count_map.emplace(sparse_path, 1);
1476
1.90k
            }
1477
1.33M
        }
1478
314k
    }
1479
1480
1.46k
    if (stats->sparse_column_non_null_size().size() > max_sparse_column_statistics_size) {
1481
0
        throw doris::Exception(
1482
0
                ErrorCode::INTERNAL_ERROR,
1483
0
                "Sparse column non null size: {} is greater than max statistics size: {}",
1484
0
                stats->sparse_column_non_null_size().size(), max_sparse_column_statistics_size);
1485
0
    }
1486
1.46k
}
1487
1488
/// Calculates number of dimensions in array field.
1489
/// Returns 0 for scalar fields.
1490
class FieldVisitorToNumberOfDimensions : public StaticVisitor<size_t> {
1491
public:
1492
    FieldVisitorToNumberOfDimensions() = default;
1493
    template <PrimitiveType T>
1494
24.6M
    size_t apply(const typename PrimitiveTypeTraits<T>::CppType& x) {
1495
24.6M
        if constexpr (T == TYPE_ARRAY) {
1496
2.63M
            const size_t size = x.size();
1497
2.63M
            size_t dimensions = 0;
1498
6.44M
            for (size_t i = 0; i < size; ++i) {
1499
3.80M
                size_t element_dimensions = apply_visitor(*this, x[i]);
1500
3.80M
                dimensions = std::max(dimensions, element_dimensions);
1501
3.80M
            }
1502
2.63M
            return 1 + dimensions;
1503
21.9M
        } else {
1504
21.9M
            return 0;
1505
21.9M
        }
1506
24.6M
    }
_ZN5doris12variant_util32FieldVisitorToNumberOfDimensions5applyILNS_13PrimitiveTypeE1EEEmRKNS_19PrimitiveTypeTraitsIXT_EE7CppTypeE
Line
Count
Source
1494
121k
    size_t apply(const typename PrimitiveTypeTraits<T>::CppType& x) {
1495
        if constexpr (T == TYPE_ARRAY) {
1496
            const size_t size = x.size();
1497
            size_t dimensions = 0;
1498
            for (size_t i = 0; i < size; ++i) {
1499
                size_t element_dimensions = apply_visitor(*this, x[i]);
1500
                dimensions = std::max(dimensions, element_dimensions);
1501
            }
1502
            return 1 + dimensions;
1503
121k
        } else {
1504
121k
            return 0;
1505
121k
        }
1506
121k
    }
_ZN5doris12variant_util32FieldVisitorToNumberOfDimensions5applyILNS_13PrimitiveTypeE26EEEmRKNS_19PrimitiveTypeTraitsIXT_EE7CppTypeE
Line
Count
Source
1494
480
    size_t apply(const typename PrimitiveTypeTraits<T>::CppType& x) {
1495
        if constexpr (T == TYPE_ARRAY) {
1496
            const size_t size = x.size();
1497
            size_t dimensions = 0;
1498
            for (size_t i = 0; i < size; ++i) {
1499
                size_t element_dimensions = apply_visitor(*this, x[i]);
1500
                dimensions = std::max(dimensions, element_dimensions);
1501
            }
1502
            return 1 + dimensions;
1503
480
        } else {
1504
480
            return 0;
1505
480
        }
1506
480
    }
Unexecuted instantiation: _ZN5doris12variant_util32FieldVisitorToNumberOfDimensions5applyILNS_13PrimitiveTypeE42EEEmRKNS_19PrimitiveTypeTraitsIXT_EE7CppTypeE
_ZN5doris12variant_util32FieldVisitorToNumberOfDimensions5applyILNS_13PrimitiveTypeE7EEEmRKNS_19PrimitiveTypeTraitsIXT_EE7CppTypeE
Line
Count
Source
1494
41.9k
    size_t apply(const typename PrimitiveTypeTraits<T>::CppType& x) {
1495
        if constexpr (T == TYPE_ARRAY) {
1496
            const size_t size = x.size();
1497
            size_t dimensions = 0;
1498
            for (size_t i = 0; i < size; ++i) {
1499
                size_t element_dimensions = apply_visitor(*this, x[i]);
1500
                dimensions = std::max(dimensions, element_dimensions);
1501
            }
1502
            return 1 + dimensions;
1503
41.9k
        } else {
1504
41.9k
            return 0;
1505
41.9k
        }
1506
41.9k
    }
Unexecuted instantiation: _ZN5doris12variant_util32FieldVisitorToNumberOfDimensions5applyILNS_13PrimitiveTypeE12EEEmRKNS_19PrimitiveTypeTraitsIXT_EE7CppTypeE
Unexecuted instantiation: _ZN5doris12variant_util32FieldVisitorToNumberOfDimensions5applyILNS_13PrimitiveTypeE11EEEmRKNS_19PrimitiveTypeTraitsIXT_EE7CppTypeE
_ZN5doris12variant_util32FieldVisitorToNumberOfDimensions5applyILNS_13PrimitiveTypeE25EEEmRKNS_19PrimitiveTypeTraitsIXT_EE7CppTypeE
Line
Count
Source
1494
396
    size_t apply(const typename PrimitiveTypeTraits<T>::CppType& x) {
1495
        if constexpr (T == TYPE_ARRAY) {
1496
            const size_t size = x.size();
1497
            size_t dimensions = 0;
1498
            for (size_t i = 0; i < size; ++i) {
1499
                size_t element_dimensions = apply_visitor(*this, x[i]);
1500
                dimensions = std::max(dimensions, element_dimensions);
1501
            }
1502
            return 1 + dimensions;
1503
396
        } else {
1504
396
            return 0;
1505
396
        }
1506
396
    }
_ZN5doris12variant_util32FieldVisitorToNumberOfDimensions5applyILNS_13PrimitiveTypeE2EEEmRKNS_19PrimitiveTypeTraitsIXT_EE7CppTypeE
Line
Count
Source
1494
332k
    size_t apply(const typename PrimitiveTypeTraits<T>::CppType& x) {
1495
        if constexpr (T == TYPE_ARRAY) {
1496
            const size_t size = x.size();
1497
            size_t dimensions = 0;
1498
            for (size_t i = 0; i < size; ++i) {
1499
                size_t element_dimensions = apply_visitor(*this, x[i]);
1500
                dimensions = std::max(dimensions, element_dimensions);
1501
            }
1502
            return 1 + dimensions;
1503
332k
        } else {
1504
332k
            return 0;
1505
332k
        }
1506
332k
    }
_ZN5doris12variant_util32FieldVisitorToNumberOfDimensions5applyILNS_13PrimitiveTypeE3EEEmRKNS_19PrimitiveTypeTraitsIXT_EE7CppTypeE
Line
Count
Source
1494
1.03k
    size_t apply(const typename PrimitiveTypeTraits<T>::CppType& x) {
1495
        if constexpr (T == TYPE_ARRAY) {
1496
            const size_t size = x.size();
1497
            size_t dimensions = 0;
1498
            for (size_t i = 0; i < size; ++i) {
1499
                size_t element_dimensions = apply_visitor(*this, x[i]);
1500
                dimensions = std::max(dimensions, element_dimensions);
1501
            }
1502
            return 1 + dimensions;
1503
1.03k
        } else {
1504
1.03k
            return 0;
1505
1.03k
        }
1506
1.03k
    }
_ZN5doris12variant_util32FieldVisitorToNumberOfDimensions5applyILNS_13PrimitiveTypeE4EEEmRKNS_19PrimitiveTypeTraitsIXT_EE7CppTypeE
Line
Count
Source
1494
1.02k
    size_t apply(const typename PrimitiveTypeTraits<T>::CppType& x) {
1495
        if constexpr (T == TYPE_ARRAY) {
1496
            const size_t size = x.size();
1497
            size_t dimensions = 0;
1498
            for (size_t i = 0; i < size; ++i) {
1499
                size_t element_dimensions = apply_visitor(*this, x[i]);
1500
                dimensions = std::max(dimensions, element_dimensions);
1501
            }
1502
            return 1 + dimensions;
1503
1.02k
        } else {
1504
1.02k
            return 0;
1505
1.02k
        }
1506
1.02k
    }
_ZN5doris12variant_util32FieldVisitorToNumberOfDimensions5applyILNS_13PrimitiveTypeE5EEEmRKNS_19PrimitiveTypeTraitsIXT_EE7CppTypeE
Line
Count
Source
1494
2.21k
    size_t apply(const typename PrimitiveTypeTraits<T>::CppType& x) {
1495
        if constexpr (T == TYPE_ARRAY) {
1496
            const size_t size = x.size();
1497
            size_t dimensions = 0;
1498
            for (size_t i = 0; i < size; ++i) {
1499
                size_t element_dimensions = apply_visitor(*this, x[i]);
1500
                dimensions = std::max(dimensions, element_dimensions);
1501
            }
1502
            return 1 + dimensions;
1503
2.21k
        } else {
1504
2.21k
            return 0;
1505
2.21k
        }
1506
2.21k
    }
_ZN5doris12variant_util32FieldVisitorToNumberOfDimensions5applyILNS_13PrimitiveTypeE6EEEmRKNS_19PrimitiveTypeTraitsIXT_EE7CppTypeE
Line
Count
Source
1494
6.62M
    size_t apply(const typename PrimitiveTypeTraits<T>::CppType& x) {
1495
        if constexpr (T == TYPE_ARRAY) {
1496
            const size_t size = x.size();
1497
            size_t dimensions = 0;
1498
            for (size_t i = 0; i < size; ++i) {
1499
                size_t element_dimensions = apply_visitor(*this, x[i]);
1500
                dimensions = std::max(dimensions, element_dimensions);
1501
            }
1502
            return 1 + dimensions;
1503
6.62M
        } else {
1504
6.62M
            return 0;
1505
6.62M
        }
1506
6.62M
    }
Unexecuted instantiation: _ZN5doris12variant_util32FieldVisitorToNumberOfDimensions5applyILNS_13PrimitiveTypeE38EEEmRKNS_19PrimitiveTypeTraitsIXT_EE7CppTypeE
Unexecuted instantiation: _ZN5doris12variant_util32FieldVisitorToNumberOfDimensions5applyILNS_13PrimitiveTypeE39EEEmRKNS_19PrimitiveTypeTraitsIXT_EE7CppTypeE
_ZN5doris12variant_util32FieldVisitorToNumberOfDimensions5applyILNS_13PrimitiveTypeE8EEEmRKNS_19PrimitiveTypeTraitsIXT_EE7CppTypeE
Line
Count
Source
1494
859
    size_t apply(const typename PrimitiveTypeTraits<T>::CppType& x) {
1495
        if constexpr (T == TYPE_ARRAY) {
1496
            const size_t size = x.size();
1497
            size_t dimensions = 0;
1498
            for (size_t i = 0; i < size; ++i) {
1499
                size_t element_dimensions = apply_visitor(*this, x[i]);
1500
                dimensions = std::max(dimensions, element_dimensions);
1501
            }
1502
            return 1 + dimensions;
1503
859
        } else {
1504
859
            return 0;
1505
859
        }
1506
859
    }
Unexecuted instantiation: _ZN5doris12variant_util32FieldVisitorToNumberOfDimensions5applyILNS_13PrimitiveTypeE27EEEmRKNS_19PrimitiveTypeTraitsIXT_EE7CppTypeE
_ZN5doris12variant_util32FieldVisitorToNumberOfDimensions5applyILNS_13PrimitiveTypeE9EEEmRKNS_19PrimitiveTypeTraitsIXT_EE7CppTypeE
Line
Count
Source
1494
2.97M
    size_t apply(const typename PrimitiveTypeTraits<T>::CppType& x) {
1495
        if constexpr (T == TYPE_ARRAY) {
1496
            const size_t size = x.size();
1497
            size_t dimensions = 0;
1498
            for (size_t i = 0; i < size; ++i) {
1499
                size_t element_dimensions = apply_visitor(*this, x[i]);
1500
                dimensions = std::max(dimensions, element_dimensions);
1501
            }
1502
            return 1 + dimensions;
1503
2.97M
        } else {
1504
2.97M
            return 0;
1505
2.97M
        }
1506
2.97M
    }
_ZN5doris12variant_util32FieldVisitorToNumberOfDimensions5applyILNS_13PrimitiveTypeE36EEEmRKNS_19PrimitiveTypeTraitsIXT_EE7CppTypeE
Line
Count
Source
1494
306
    size_t apply(const typename PrimitiveTypeTraits<T>::CppType& x) {
1495
        if constexpr (T == TYPE_ARRAY) {
1496
            const size_t size = x.size();
1497
            size_t dimensions = 0;
1498
            for (size_t i = 0; i < size; ++i) {
1499
                size_t element_dimensions = apply_visitor(*this, x[i]);
1500
                dimensions = std::max(dimensions, element_dimensions);
1501
            }
1502
            return 1 + dimensions;
1503
306
        } else {
1504
306
            return 0;
1505
306
        }
1506
306
    }
_ZN5doris12variant_util32FieldVisitorToNumberOfDimensions5applyILNS_13PrimitiveTypeE37EEEmRKNS_19PrimitiveTypeTraitsIXT_EE7CppTypeE
Line
Count
Source
1494
275
    size_t apply(const typename PrimitiveTypeTraits<T>::CppType& x) {
1495
        if constexpr (T == TYPE_ARRAY) {
1496
            const size_t size = x.size();
1497
            size_t dimensions = 0;
1498
            for (size_t i = 0; i < size; ++i) {
1499
                size_t element_dimensions = apply_visitor(*this, x[i]);
1500
                dimensions = std::max(dimensions, element_dimensions);
1501
            }
1502
            return 1 + dimensions;
1503
275
        } else {
1504
275
            return 0;
1505
275
        }
1506
275
    }
_ZN5doris12variant_util32FieldVisitorToNumberOfDimensions5applyILNS_13PrimitiveTypeE23EEEmRKNS_19PrimitiveTypeTraitsIXT_EE7CppTypeE
Line
Count
Source
1494
11.7M
    size_t apply(const typename PrimitiveTypeTraits<T>::CppType& x) {
1495
        if constexpr (T == TYPE_ARRAY) {
1496
            const size_t size = x.size();
1497
            size_t dimensions = 0;
1498
            for (size_t i = 0; i < size; ++i) {
1499
                size_t element_dimensions = apply_visitor(*this, x[i]);
1500
                dimensions = std::max(dimensions, element_dimensions);
1501
            }
1502
            return 1 + dimensions;
1503
11.7M
        } else {
1504
11.7M
            return 0;
1505
11.7M
        }
1506
11.7M
    }
Unexecuted instantiation: _ZN5doris12variant_util32FieldVisitorToNumberOfDimensions5applyILNS_13PrimitiveTypeE15EEEmRKNS_19PrimitiveTypeTraitsIXT_EE7CppTypeE
Unexecuted instantiation: _ZN5doris12variant_util32FieldVisitorToNumberOfDimensions5applyILNS_13PrimitiveTypeE10EEEmRKNS_19PrimitiveTypeTraitsIXT_EE7CppTypeE
Unexecuted instantiation: _ZN5doris12variant_util32FieldVisitorToNumberOfDimensions5applyILNS_13PrimitiveTypeE41EEEmRKNS_19PrimitiveTypeTraitsIXT_EE7CppTypeE
_ZN5doris12variant_util32FieldVisitorToNumberOfDimensions5applyILNS_13PrimitiveTypeE17EEEmRKNS_19PrimitiveTypeTraitsIXT_EE7CppTypeE
Line
Count
Source
1494
2.63M
    size_t apply(const typename PrimitiveTypeTraits<T>::CppType& x) {
1495
2.63M
        if constexpr (T == TYPE_ARRAY) {
1496
2.63M
            const size_t size = x.size();
1497
2.63M
            size_t dimensions = 0;
1498
6.44M
            for (size_t i = 0; i < size; ++i) {
1499
3.80M
                size_t element_dimensions = apply_visitor(*this, x[i]);
1500
3.80M
                dimensions = std::max(dimensions, element_dimensions);
1501
3.80M
            }
1502
2.63M
            return 1 + dimensions;
1503
        } else {
1504
            return 0;
1505
        }
1506
2.63M
    }
_ZN5doris12variant_util32FieldVisitorToNumberOfDimensions5applyILNS_13PrimitiveTypeE16EEEmRKNS_19PrimitiveTypeTraitsIXT_EE7CppTypeE
Line
Count
Source
1494
1
    size_t apply(const typename PrimitiveTypeTraits<T>::CppType& x) {
1495
        if constexpr (T == TYPE_ARRAY) {
1496
            const size_t size = x.size();
1497
            size_t dimensions = 0;
1498
            for (size_t i = 0; i < size; ++i) {
1499
                size_t element_dimensions = apply_visitor(*this, x[i]);
1500
                dimensions = std::max(dimensions, element_dimensions);
1501
            }
1502
            return 1 + dimensions;
1503
1
        } else {
1504
1
            return 0;
1505
1
        }
1506
1
    }
Unexecuted instantiation: _ZN5doris12variant_util32FieldVisitorToNumberOfDimensions5applyILNS_13PrimitiveTypeE18EEEmRKNS_19PrimitiveTypeTraitsIXT_EE7CppTypeE
_ZN5doris12variant_util32FieldVisitorToNumberOfDimensions5applyILNS_13PrimitiveTypeE32EEEmRKNS_19PrimitiveTypeTraitsIXT_EE7CppTypeE
Line
Count
Source
1494
1
    size_t apply(const typename PrimitiveTypeTraits<T>::CppType& x) {
1495
        if constexpr (T == TYPE_ARRAY) {
1496
            const size_t size = x.size();
1497
            size_t dimensions = 0;
1498
            for (size_t i = 0; i < size; ++i) {
1499
                size_t element_dimensions = apply_visitor(*this, x[i]);
1500
                dimensions = std::max(dimensions, element_dimensions);
1501
            }
1502
            return 1 + dimensions;
1503
1
        } else {
1504
1
            return 0;
1505
1
        }
1506
1
    }
_ZN5doris12variant_util32FieldVisitorToNumberOfDimensions5applyILNS_13PrimitiveTypeE28EEEmRKNS_19PrimitiveTypeTraitsIXT_EE7CppTypeE
Line
Count
Source
1494
756
    size_t apply(const typename PrimitiveTypeTraits<T>::CppType& x) {
1495
        if constexpr (T == TYPE_ARRAY) {
1496
            const size_t size = x.size();
1497
            size_t dimensions = 0;
1498
            for (size_t i = 0; i < size; ++i) {
1499
                size_t element_dimensions = apply_visitor(*this, x[i]);
1500
                dimensions = std::max(dimensions, element_dimensions);
1501
            }
1502
            return 1 + dimensions;
1503
756
        } else {
1504
756
            return 0;
1505
756
        }
1506
756
    }
_ZN5doris12variant_util32FieldVisitorToNumberOfDimensions5applyILNS_13PrimitiveTypeE29EEEmRKNS_19PrimitiveTypeTraitsIXT_EE7CppTypeE
Line
Count
Source
1494
696
    size_t apply(const typename PrimitiveTypeTraits<T>::CppType& x) {
1495
        if constexpr (T == TYPE_ARRAY) {
1496
            const size_t size = x.size();
1497
            size_t dimensions = 0;
1498
            for (size_t i = 0; i < size; ++i) {
1499
                size_t element_dimensions = apply_visitor(*this, x[i]);
1500
                dimensions = std::max(dimensions, element_dimensions);
1501
            }
1502
            return 1 + dimensions;
1503
696
        } else {
1504
696
            return 0;
1505
696
        }
1506
696
    }
Unexecuted instantiation: _ZN5doris12variant_util32FieldVisitorToNumberOfDimensions5applyILNS_13PrimitiveTypeE20EEEmRKNS_19PrimitiveTypeTraitsIXT_EE7CppTypeE
_ZN5doris12variant_util32FieldVisitorToNumberOfDimensions5applyILNS_13PrimitiveTypeE30EEEmRKNS_19PrimitiveTypeTraitsIXT_EE7CppTypeE
Line
Count
Source
1494
70.7k
    size_t apply(const typename PrimitiveTypeTraits<T>::CppType& x) {
1495
        if constexpr (T == TYPE_ARRAY) {
1496
            const size_t size = x.size();
1497
            size_t dimensions = 0;
1498
            for (size_t i = 0; i < size; ++i) {
1499
                size_t element_dimensions = apply_visitor(*this, x[i]);
1500
                dimensions = std::max(dimensions, element_dimensions);
1501
            }
1502
            return 1 + dimensions;
1503
70.7k
        } else {
1504
70.7k
            return 0;
1505
70.7k
        }
1506
70.7k
    }
_ZN5doris12variant_util32FieldVisitorToNumberOfDimensions5applyILNS_13PrimitiveTypeE35EEEmRKNS_19PrimitiveTypeTraitsIXT_EE7CppTypeE
Line
Count
Source
1494
558
    size_t apply(const typename PrimitiveTypeTraits<T>::CppType& x) {
1495
        if constexpr (T == TYPE_ARRAY) {
1496
            const size_t size = x.size();
1497
            size_t dimensions = 0;
1498
            for (size_t i = 0; i < size; ++i) {
1499
                size_t element_dimensions = apply_visitor(*this, x[i]);
1500
                dimensions = std::max(dimensions, element_dimensions);
1501
            }
1502
            return 1 + dimensions;
1503
558
        } else {
1504
558
            return 0;
1505
558
        }
1506
558
    }
Unexecuted instantiation: _ZN5doris12variant_util32FieldVisitorToNumberOfDimensions5applyILNS_13PrimitiveTypeE22EEEmRKNS_19PrimitiveTypeTraitsIXT_EE7CppTypeE
Unexecuted instantiation: _ZN5doris12variant_util32FieldVisitorToNumberOfDimensions5applyILNS_13PrimitiveTypeE19EEEmRKNS_19PrimitiveTypeTraitsIXT_EE7CppTypeE
Unexecuted instantiation: _ZN5doris12variant_util32FieldVisitorToNumberOfDimensions5applyILNS_13PrimitiveTypeE24EEEmRKNS_19PrimitiveTypeTraitsIXT_EE7CppTypeE
_ZN5doris12variant_util32FieldVisitorToNumberOfDimensions5applyILNS_13PrimitiveTypeE31EEEmRKNS_19PrimitiveTypeTraitsIXT_EE7CppTypeE
Line
Count
Source
1494
46.9k
    size_t apply(const typename PrimitiveTypeTraits<T>::CppType& x) {
1495
        if constexpr (T == TYPE_ARRAY) {
1496
            const size_t size = x.size();
1497
            size_t dimensions = 0;
1498
            for (size_t i = 0; i < size; ++i) {
1499
                size_t element_dimensions = apply_visitor(*this, x[i]);
1500
                dimensions = std::max(dimensions, element_dimensions);
1501
            }
1502
            return 1 + dimensions;
1503
46.9k
        } else {
1504
46.9k
            return 0;
1505
46.9k
        }
1506
46.9k
    }
1507
};
1508
1509
// Visitor that allows to get type of scalar field
1510
// but exclude fields contain complex field.This is a faster version
1511
// for FieldVisitorToScalarType which does not support complex field.
1512
class SimpleFieldVisitorToScalarType : public StaticVisitor<size_t> {
1513
public:
1514
    template <PrimitiveType T>
1515
18.8M
    size_t apply(const typename PrimitiveTypeTraits<T>::CppType& x) {
1516
18.8M
        if constexpr (T == TYPE_ARRAY) {
1517
0
            throw doris::Exception(ErrorCode::INVALID_ARGUMENT, "Array type is not supported");
1518
109k
        } else if constexpr (T == TYPE_NULL) {
1519
109k
            have_nulls = true;
1520
109k
            return 1;
1521
18.7M
        } else {
1522
18.7M
            type = T;
1523
18.7M
            return 1;
1524
18.7M
        }
1525
18.8M
    }
_ZN5doris12variant_util30SimpleFieldVisitorToScalarType5applyILNS_13PrimitiveTypeE1EEEmRKNS_19PrimitiveTypeTraitsIXT_EE7CppTypeE
Line
Count
Source
1515
109k
    size_t apply(const typename PrimitiveTypeTraits<T>::CppType& x) {
1516
        if constexpr (T == TYPE_ARRAY) {
1517
            throw doris::Exception(ErrorCode::INVALID_ARGUMENT, "Array type is not supported");
1518
109k
        } else if constexpr (T == TYPE_NULL) {
1519
109k
            have_nulls = true;
1520
109k
            return 1;
1521
        } else {
1522
            type = T;
1523
            return 1;
1524
        }
1525
109k
    }
Unexecuted instantiation: _ZN5doris12variant_util30SimpleFieldVisitorToScalarType5applyILNS_13PrimitiveTypeE26EEEmRKNS_19PrimitiveTypeTraitsIXT_EE7CppTypeE
Unexecuted instantiation: _ZN5doris12variant_util30SimpleFieldVisitorToScalarType5applyILNS_13PrimitiveTypeE42EEEmRKNS_19PrimitiveTypeTraitsIXT_EE7CppTypeE
_ZN5doris12variant_util30SimpleFieldVisitorToScalarType5applyILNS_13PrimitiveTypeE7EEEmRKNS_19PrimitiveTypeTraitsIXT_EE7CppTypeE
Line
Count
Source
1515
12.3k
    size_t apply(const typename PrimitiveTypeTraits<T>::CppType& x) {
1516
        if constexpr (T == TYPE_ARRAY) {
1517
            throw doris::Exception(ErrorCode::INVALID_ARGUMENT, "Array type is not supported");
1518
        } else if constexpr (T == TYPE_NULL) {
1519
            have_nulls = true;
1520
            return 1;
1521
12.3k
        } else {
1522
12.3k
            type = T;
1523
12.3k
            return 1;
1524
12.3k
        }
1525
12.3k
    }
Unexecuted instantiation: _ZN5doris12variant_util30SimpleFieldVisitorToScalarType5applyILNS_13PrimitiveTypeE12EEEmRKNS_19PrimitiveTypeTraitsIXT_EE7CppTypeE
Unexecuted instantiation: _ZN5doris12variant_util30SimpleFieldVisitorToScalarType5applyILNS_13PrimitiveTypeE11EEEmRKNS_19PrimitiveTypeTraitsIXT_EE7CppTypeE
Unexecuted instantiation: _ZN5doris12variant_util30SimpleFieldVisitorToScalarType5applyILNS_13PrimitiveTypeE25EEEmRKNS_19PrimitiveTypeTraitsIXT_EE7CppTypeE
_ZN5doris12variant_util30SimpleFieldVisitorToScalarType5applyILNS_13PrimitiveTypeE2EEEmRKNS_19PrimitiveTypeTraitsIXT_EE7CppTypeE
Line
Count
Source
1515
273k
    size_t apply(const typename PrimitiveTypeTraits<T>::CppType& x) {
1516
        if constexpr (T == TYPE_ARRAY) {
1517
            throw doris::Exception(ErrorCode::INVALID_ARGUMENT, "Array type is not supported");
1518
        } else if constexpr (T == TYPE_NULL) {
1519
            have_nulls = true;
1520
            return 1;
1521
273k
        } else {
1522
273k
            type = T;
1523
273k
            return 1;
1524
273k
        }
1525
273k
    }
_ZN5doris12variant_util30SimpleFieldVisitorToScalarType5applyILNS_13PrimitiveTypeE3EEEmRKNS_19PrimitiveTypeTraitsIXT_EE7CppTypeE
Line
Count
Source
1515
2
    size_t apply(const typename PrimitiveTypeTraits<T>::CppType& x) {
1516
        if constexpr (T == TYPE_ARRAY) {
1517
            throw doris::Exception(ErrorCode::INVALID_ARGUMENT, "Array type is not supported");
1518
        } else if constexpr (T == TYPE_NULL) {
1519
            have_nulls = true;
1520
            return 1;
1521
2
        } else {
1522
2
            type = T;
1523
2
            return 1;
1524
2
        }
1525
2
    }
_ZN5doris12variant_util30SimpleFieldVisitorToScalarType5applyILNS_13PrimitiveTypeE4EEEmRKNS_19PrimitiveTypeTraitsIXT_EE7CppTypeE
Line
Count
Source
1515
7
    size_t apply(const typename PrimitiveTypeTraits<T>::CppType& x) {
1516
        if constexpr (T == TYPE_ARRAY) {
1517
            throw doris::Exception(ErrorCode::INVALID_ARGUMENT, "Array type is not supported");
1518
        } else if constexpr (T == TYPE_NULL) {
1519
            have_nulls = true;
1520
            return 1;
1521
7
        } else {
1522
7
            type = T;
1523
7
            return 1;
1524
7
        }
1525
7
    }
_ZN5doris12variant_util30SimpleFieldVisitorToScalarType5applyILNS_13PrimitiveTypeE5EEEmRKNS_19PrimitiveTypeTraitsIXT_EE7CppTypeE
Line
Count
Source
1515
676
    size_t apply(const typename PrimitiveTypeTraits<T>::CppType& x) {
1516
        if constexpr (T == TYPE_ARRAY) {
1517
            throw doris::Exception(ErrorCode::INVALID_ARGUMENT, "Array type is not supported");
1518
        } else if constexpr (T == TYPE_NULL) {
1519
            have_nulls = true;
1520
            return 1;
1521
676
        } else {
1522
676
            type = T;
1523
676
            return 1;
1524
676
        }
1525
676
    }
_ZN5doris12variant_util30SimpleFieldVisitorToScalarType5applyILNS_13PrimitiveTypeE6EEEmRKNS_19PrimitiveTypeTraitsIXT_EE7CppTypeE
Line
Count
Source
1515
5.08M
    size_t apply(const typename PrimitiveTypeTraits<T>::CppType& x) {
1516
        if constexpr (T == TYPE_ARRAY) {
1517
            throw doris::Exception(ErrorCode::INVALID_ARGUMENT, "Array type is not supported");
1518
        } else if constexpr (T == TYPE_NULL) {
1519
            have_nulls = true;
1520
            return 1;
1521
5.08M
        } else {
1522
5.08M
            type = T;
1523
5.08M
            return 1;
1524
5.08M
        }
1525
5.08M
    }
Unexecuted instantiation: _ZN5doris12variant_util30SimpleFieldVisitorToScalarType5applyILNS_13PrimitiveTypeE38EEEmRKNS_19PrimitiveTypeTraitsIXT_EE7CppTypeE
Unexecuted instantiation: _ZN5doris12variant_util30SimpleFieldVisitorToScalarType5applyILNS_13PrimitiveTypeE39EEEmRKNS_19PrimitiveTypeTraitsIXT_EE7CppTypeE
_ZN5doris12variant_util30SimpleFieldVisitorToScalarType5applyILNS_13PrimitiveTypeE8EEEmRKNS_19PrimitiveTypeTraitsIXT_EE7CppTypeE
Line
Count
Source
1515
1
    size_t apply(const typename PrimitiveTypeTraits<T>::CppType& x) {
1516
        if constexpr (T == TYPE_ARRAY) {
1517
            throw doris::Exception(ErrorCode::INVALID_ARGUMENT, "Array type is not supported");
1518
        } else if constexpr (T == TYPE_NULL) {
1519
            have_nulls = true;
1520
            return 1;
1521
1
        } else {
1522
1
            type = T;
1523
1
            return 1;
1524
1
        }
1525
1
    }
Unexecuted instantiation: _ZN5doris12variant_util30SimpleFieldVisitorToScalarType5applyILNS_13PrimitiveTypeE27EEEmRKNS_19PrimitiveTypeTraitsIXT_EE7CppTypeE
_ZN5doris12variant_util30SimpleFieldVisitorToScalarType5applyILNS_13PrimitiveTypeE9EEEmRKNS_19PrimitiveTypeTraitsIXT_EE7CppTypeE
Line
Count
Source
1515
2.77M
    size_t apply(const typename PrimitiveTypeTraits<T>::CppType& x) {
1516
        if constexpr (T == TYPE_ARRAY) {
1517
            throw doris::Exception(ErrorCode::INVALID_ARGUMENT, "Array type is not supported");
1518
        } else if constexpr (T == TYPE_NULL) {
1519
            have_nulls = true;
1520
            return 1;
1521
2.77M
        } else {
1522
2.77M
            type = T;
1523
2.77M
            return 1;
1524
2.77M
        }
1525
2.77M
    }
Unexecuted instantiation: _ZN5doris12variant_util30SimpleFieldVisitorToScalarType5applyILNS_13PrimitiveTypeE36EEEmRKNS_19PrimitiveTypeTraitsIXT_EE7CppTypeE
Unexecuted instantiation: _ZN5doris12variant_util30SimpleFieldVisitorToScalarType5applyILNS_13PrimitiveTypeE37EEEmRKNS_19PrimitiveTypeTraitsIXT_EE7CppTypeE
_ZN5doris12variant_util30SimpleFieldVisitorToScalarType5applyILNS_13PrimitiveTypeE23EEEmRKNS_19PrimitiveTypeTraitsIXT_EE7CppTypeE
Line
Count
Source
1515
10.5M
    size_t apply(const typename PrimitiveTypeTraits<T>::CppType& x) {
1516
        if constexpr (T == TYPE_ARRAY) {
1517
            throw doris::Exception(ErrorCode::INVALID_ARGUMENT, "Array type is not supported");
1518
        } else if constexpr (T == TYPE_NULL) {
1519
            have_nulls = true;
1520
            return 1;
1521
10.5M
        } else {
1522
10.5M
            type = T;
1523
10.5M
            return 1;
1524
10.5M
        }
1525
10.5M
    }
Unexecuted instantiation: _ZN5doris12variant_util30SimpleFieldVisitorToScalarType5applyILNS_13PrimitiveTypeE15EEEmRKNS_19PrimitiveTypeTraitsIXT_EE7CppTypeE
Unexecuted instantiation: _ZN5doris12variant_util30SimpleFieldVisitorToScalarType5applyILNS_13PrimitiveTypeE10EEEmRKNS_19PrimitiveTypeTraitsIXT_EE7CppTypeE
Unexecuted instantiation: _ZN5doris12variant_util30SimpleFieldVisitorToScalarType5applyILNS_13PrimitiveTypeE41EEEmRKNS_19PrimitiveTypeTraitsIXT_EE7CppTypeE
Unexecuted instantiation: _ZN5doris12variant_util30SimpleFieldVisitorToScalarType5applyILNS_13PrimitiveTypeE17EEEmRKNS_19PrimitiveTypeTraitsIXT_EE7CppTypeE
Unexecuted instantiation: _ZN5doris12variant_util30SimpleFieldVisitorToScalarType5applyILNS_13PrimitiveTypeE16EEEmRKNS_19PrimitiveTypeTraitsIXT_EE7CppTypeE
Unexecuted instantiation: _ZN5doris12variant_util30SimpleFieldVisitorToScalarType5applyILNS_13PrimitiveTypeE18EEEmRKNS_19PrimitiveTypeTraitsIXT_EE7CppTypeE
Unexecuted instantiation: _ZN5doris12variant_util30SimpleFieldVisitorToScalarType5applyILNS_13PrimitiveTypeE32EEEmRKNS_19PrimitiveTypeTraitsIXT_EE7CppTypeE
Unexecuted instantiation: _ZN5doris12variant_util30SimpleFieldVisitorToScalarType5applyILNS_13PrimitiveTypeE28EEEmRKNS_19PrimitiveTypeTraitsIXT_EE7CppTypeE
Unexecuted instantiation: _ZN5doris12variant_util30SimpleFieldVisitorToScalarType5applyILNS_13PrimitiveTypeE29EEEmRKNS_19PrimitiveTypeTraitsIXT_EE7CppTypeE
Unexecuted instantiation: _ZN5doris12variant_util30SimpleFieldVisitorToScalarType5applyILNS_13PrimitiveTypeE20EEEmRKNS_19PrimitiveTypeTraitsIXT_EE7CppTypeE
Unexecuted instantiation: _ZN5doris12variant_util30SimpleFieldVisitorToScalarType5applyILNS_13PrimitiveTypeE30EEEmRKNS_19PrimitiveTypeTraitsIXT_EE7CppTypeE
Unexecuted instantiation: _ZN5doris12variant_util30SimpleFieldVisitorToScalarType5applyILNS_13PrimitiveTypeE35EEEmRKNS_19PrimitiveTypeTraitsIXT_EE7CppTypeE
Unexecuted instantiation: _ZN5doris12variant_util30SimpleFieldVisitorToScalarType5applyILNS_13PrimitiveTypeE22EEEmRKNS_19PrimitiveTypeTraitsIXT_EE7CppTypeE
Unexecuted instantiation: _ZN5doris12variant_util30SimpleFieldVisitorToScalarType5applyILNS_13PrimitiveTypeE19EEEmRKNS_19PrimitiveTypeTraitsIXT_EE7CppTypeE
Unexecuted instantiation: _ZN5doris12variant_util30SimpleFieldVisitorToScalarType5applyILNS_13PrimitiveTypeE24EEEmRKNS_19PrimitiveTypeTraitsIXT_EE7CppTypeE
_ZN5doris12variant_util30SimpleFieldVisitorToScalarType5applyILNS_13PrimitiveTypeE31EEEmRKNS_19PrimitiveTypeTraitsIXT_EE7CppTypeE
Line
Count
Source
1515
46.8k
    size_t apply(const typename PrimitiveTypeTraits<T>::CppType& x) {
1516
        if constexpr (T == TYPE_ARRAY) {
1517
            throw doris::Exception(ErrorCode::INVALID_ARGUMENT, "Array type is not supported");
1518
        } else if constexpr (T == TYPE_NULL) {
1519
            have_nulls = true;
1520
            return 1;
1521
46.8k
        } else {
1522
46.8k
            type = T;
1523
46.8k
            return 1;
1524
46.8k
        }
1525
46.8k
    }
1526
18.5M
    void get_scalar_type(PrimitiveType* data_type) const { *data_type = type; }
1527
18.6M
    bool contain_nulls() const { return have_nulls; }
1528
1529
18.6M
    bool need_convert_field() const { return false; }
1530
1531
private:
1532
    PrimitiveType type = PrimitiveType::INVALID_TYPE;
1533
    bool have_nulls = false;
1534
};
1535
1536
/// Visitor that allows to get type of scalar field
1537
/// or least common type of scalars in array.
1538
/// More optimized version of FieldToDataType.
1539
class FieldVisitorToScalarType : public StaticVisitor<size_t> {
1540
public:
1541
    template <PrimitiveType T>
1542
5.75M
    size_t apply(const typename PrimitiveTypeTraits<T>::CppType& x) {
1543
5.75M
        if constexpr (T == TYPE_ARRAY) {
1544
2.63M
            size_t size = x.size();
1545
6.44M
            for (size_t i = 0; i < size; ++i) {
1546
3.80M
                apply_visitor(*this, x[i]);
1547
3.80M
            }
1548
2.63M
            return 0;
1549
2.63M
        } else if constexpr (T == TYPE_NULL) {
1550
12.2k
            have_nulls = true;
1551
12.2k
            return 0;
1552
3.10M
        } else {
1553
3.10M
            field_types.insert(T);
1554
3.10M
            type_indexes.insert(T);
1555
3.10M
            return 0;
1556
3.10M
        }
1557
5.75M
    }
_ZN5doris12variant_util24FieldVisitorToScalarType5applyILNS_13PrimitiveTypeE1EEEmRKNS_19PrimitiveTypeTraitsIXT_EE7CppTypeE
Line
Count
Source
1542
12.2k
    size_t apply(const typename PrimitiveTypeTraits<T>::CppType& x) {
1543
        if constexpr (T == TYPE_ARRAY) {
1544
            size_t size = x.size();
1545
            for (size_t i = 0; i < size; ++i) {
1546
                apply_visitor(*this, x[i]);
1547
            }
1548
            return 0;
1549
12.2k
        } else if constexpr (T == TYPE_NULL) {
1550
12.2k
            have_nulls = true;
1551
12.2k
            return 0;
1552
        } else {
1553
            field_types.insert(T);
1554
            type_indexes.insert(T);
1555
            return 0;
1556
        }
1557
12.2k
    }
_ZN5doris12variant_util24FieldVisitorToScalarType5applyILNS_13PrimitiveTypeE26EEEmRKNS_19PrimitiveTypeTraitsIXT_EE7CppTypeE
Line
Count
Source
1542
480
    size_t apply(const typename PrimitiveTypeTraits<T>::CppType& x) {
1543
        if constexpr (T == TYPE_ARRAY) {
1544
            size_t size = x.size();
1545
            for (size_t i = 0; i < size; ++i) {
1546
                apply_visitor(*this, x[i]);
1547
            }
1548
            return 0;
1549
        } else if constexpr (T == TYPE_NULL) {
1550
            have_nulls = true;
1551
            return 0;
1552
480
        } else {
1553
480
            field_types.insert(T);
1554
480
            type_indexes.insert(T);
1555
480
            return 0;
1556
480
        }
1557
480
    }
Unexecuted instantiation: _ZN5doris12variant_util24FieldVisitorToScalarType5applyILNS_13PrimitiveTypeE42EEEmRKNS_19PrimitiveTypeTraitsIXT_EE7CppTypeE
_ZN5doris12variant_util24FieldVisitorToScalarType5applyILNS_13PrimitiveTypeE7EEEmRKNS_19PrimitiveTypeTraitsIXT_EE7CppTypeE
Line
Count
Source
1542
29.6k
    size_t apply(const typename PrimitiveTypeTraits<T>::CppType& x) {
1543
        if constexpr (T == TYPE_ARRAY) {
1544
            size_t size = x.size();
1545
            for (size_t i = 0; i < size; ++i) {
1546
                apply_visitor(*this, x[i]);
1547
            }
1548
            return 0;
1549
        } else if constexpr (T == TYPE_NULL) {
1550
            have_nulls = true;
1551
            return 0;
1552
29.6k
        } else {
1553
29.6k
            field_types.insert(T);
1554
29.6k
            type_indexes.insert(T);
1555
29.6k
            return 0;
1556
29.6k
        }
1557
29.6k
    }
Unexecuted instantiation: _ZN5doris12variant_util24FieldVisitorToScalarType5applyILNS_13PrimitiveTypeE12EEEmRKNS_19PrimitiveTypeTraitsIXT_EE7CppTypeE
Unexecuted instantiation: _ZN5doris12variant_util24FieldVisitorToScalarType5applyILNS_13PrimitiveTypeE11EEEmRKNS_19PrimitiveTypeTraitsIXT_EE7CppTypeE
_ZN5doris12variant_util24FieldVisitorToScalarType5applyILNS_13PrimitiveTypeE25EEEmRKNS_19PrimitiveTypeTraitsIXT_EE7CppTypeE
Line
Count
Source
1542
396
    size_t apply(const typename PrimitiveTypeTraits<T>::CppType& x) {
1543
        if constexpr (T == TYPE_ARRAY) {
1544
            size_t size = x.size();
1545
            for (size_t i = 0; i < size; ++i) {
1546
                apply_visitor(*this, x[i]);
1547
            }
1548
            return 0;
1549
        } else if constexpr (T == TYPE_NULL) {
1550
            have_nulls = true;
1551
            return 0;
1552
396
        } else {
1553
396
            field_types.insert(T);
1554
396
            type_indexes.insert(T);
1555
396
            return 0;
1556
396
        }
1557
396
    }
_ZN5doris12variant_util24FieldVisitorToScalarType5applyILNS_13PrimitiveTypeE2EEEmRKNS_19PrimitiveTypeTraitsIXT_EE7CppTypeE
Line
Count
Source
1542
58.3k
    size_t apply(const typename PrimitiveTypeTraits<T>::CppType& x) {
1543
        if constexpr (T == TYPE_ARRAY) {
1544
            size_t size = x.size();
1545
            for (size_t i = 0; i < size; ++i) {
1546
                apply_visitor(*this, x[i]);
1547
            }
1548
            return 0;
1549
        } else if constexpr (T == TYPE_NULL) {
1550
            have_nulls = true;
1551
            return 0;
1552
58.3k
        } else {
1553
58.3k
            field_types.insert(T);
1554
58.3k
            type_indexes.insert(T);
1555
58.3k
            return 0;
1556
58.3k
        }
1557
58.3k
    }
_ZN5doris12variant_util24FieldVisitorToScalarType5applyILNS_13PrimitiveTypeE3EEEmRKNS_19PrimitiveTypeTraitsIXT_EE7CppTypeE
Line
Count
Source
1542
1.03k
    size_t apply(const typename PrimitiveTypeTraits<T>::CppType& x) {
1543
        if constexpr (T == TYPE_ARRAY) {
1544
            size_t size = x.size();
1545
            for (size_t i = 0; i < size; ++i) {
1546
                apply_visitor(*this, x[i]);
1547
            }
1548
            return 0;
1549
        } else if constexpr (T == TYPE_NULL) {
1550
            have_nulls = true;
1551
            return 0;
1552
1.03k
        } else {
1553
1.03k
            field_types.insert(T);
1554
1.03k
            type_indexes.insert(T);
1555
1.03k
            return 0;
1556
1.03k
        }
1557
1.03k
    }
_ZN5doris12variant_util24FieldVisitorToScalarType5applyILNS_13PrimitiveTypeE4EEEmRKNS_19PrimitiveTypeTraitsIXT_EE7CppTypeE
Line
Count
Source
1542
1.02k
    size_t apply(const typename PrimitiveTypeTraits<T>::CppType& x) {
1543
        if constexpr (T == TYPE_ARRAY) {
1544
            size_t size = x.size();
1545
            for (size_t i = 0; i < size; ++i) {
1546
                apply_visitor(*this, x[i]);
1547
            }
1548
            return 0;
1549
        } else if constexpr (T == TYPE_NULL) {
1550
            have_nulls = true;
1551
            return 0;
1552
1.02k
        } else {
1553
1.02k
            field_types.insert(T);
1554
1.02k
            type_indexes.insert(T);
1555
1.02k
            return 0;
1556
1.02k
        }
1557
1.02k
    }
_ZN5doris12variant_util24FieldVisitorToScalarType5applyILNS_13PrimitiveTypeE5EEEmRKNS_19PrimitiveTypeTraitsIXT_EE7CppTypeE
Line
Count
Source
1542
1.53k
    size_t apply(const typename PrimitiveTypeTraits<T>::CppType& x) {
1543
        if constexpr (T == TYPE_ARRAY) {
1544
            size_t size = x.size();
1545
            for (size_t i = 0; i < size; ++i) {
1546
                apply_visitor(*this, x[i]);
1547
            }
1548
            return 0;
1549
        } else if constexpr (T == TYPE_NULL) {
1550
            have_nulls = true;
1551
            return 0;
1552
1.53k
        } else {
1553
1.53k
            field_types.insert(T);
1554
1.53k
            type_indexes.insert(T);
1555
1.53k
            return 0;
1556
1.53k
        }
1557
1.53k
    }
_ZN5doris12variant_util24FieldVisitorToScalarType5applyILNS_13PrimitiveTypeE6EEEmRKNS_19PrimitiveTypeTraitsIXT_EE7CppTypeE
Line
Count
Source
1542
1.55M
    size_t apply(const typename PrimitiveTypeTraits<T>::CppType& x) {
1543
        if constexpr (T == TYPE_ARRAY) {
1544
            size_t size = x.size();
1545
            for (size_t i = 0; i < size; ++i) {
1546
                apply_visitor(*this, x[i]);
1547
            }
1548
            return 0;
1549
        } else if constexpr (T == TYPE_NULL) {
1550
            have_nulls = true;
1551
            return 0;
1552
1.55M
        } else {
1553
1.55M
            field_types.insert(T);
1554
1.55M
            type_indexes.insert(T);
1555
1.55M
            return 0;
1556
1.55M
        }
1557
1.55M
    }
Unexecuted instantiation: _ZN5doris12variant_util24FieldVisitorToScalarType5applyILNS_13PrimitiveTypeE38EEEmRKNS_19PrimitiveTypeTraitsIXT_EE7CppTypeE
Unexecuted instantiation: _ZN5doris12variant_util24FieldVisitorToScalarType5applyILNS_13PrimitiveTypeE39EEEmRKNS_19PrimitiveTypeTraitsIXT_EE7CppTypeE
_ZN5doris12variant_util24FieldVisitorToScalarType5applyILNS_13PrimitiveTypeE8EEEmRKNS_19PrimitiveTypeTraitsIXT_EE7CppTypeE
Line
Count
Source
1542
858
    size_t apply(const typename PrimitiveTypeTraits<T>::CppType& x) {
1543
        if constexpr (T == TYPE_ARRAY) {
1544
            size_t size = x.size();
1545
            for (size_t i = 0; i < size; ++i) {
1546
                apply_visitor(*this, x[i]);
1547
            }
1548
            return 0;
1549
        } else if constexpr (T == TYPE_NULL) {
1550
            have_nulls = true;
1551
            return 0;
1552
858
        } else {
1553
858
            field_types.insert(T);
1554
858
            type_indexes.insert(T);
1555
858
            return 0;
1556
858
        }
1557
858
    }
Unexecuted instantiation: _ZN5doris12variant_util24FieldVisitorToScalarType5applyILNS_13PrimitiveTypeE27EEEmRKNS_19PrimitiveTypeTraitsIXT_EE7CppTypeE
_ZN5doris12variant_util24FieldVisitorToScalarType5applyILNS_13PrimitiveTypeE9EEEmRKNS_19PrimitiveTypeTraitsIXT_EE7CppTypeE
Line
Count
Source
1542
208k
    size_t apply(const typename PrimitiveTypeTraits<T>::CppType& x) {
1543
        if constexpr (T == TYPE_ARRAY) {
1544
            size_t size = x.size();
1545
            for (size_t i = 0; i < size; ++i) {
1546
                apply_visitor(*this, x[i]);
1547
            }
1548
            return 0;
1549
        } else if constexpr (T == TYPE_NULL) {
1550
            have_nulls = true;
1551
            return 0;
1552
208k
        } else {
1553
208k
            field_types.insert(T);
1554
208k
            type_indexes.insert(T);
1555
208k
            return 0;
1556
208k
        }
1557
208k
    }
_ZN5doris12variant_util24FieldVisitorToScalarType5applyILNS_13PrimitiveTypeE36EEEmRKNS_19PrimitiveTypeTraitsIXT_EE7CppTypeE
Line
Count
Source
1542
306
    size_t apply(const typename PrimitiveTypeTraits<T>::CppType& x) {
1543
        if constexpr (T == TYPE_ARRAY) {
1544
            size_t size = x.size();
1545
            for (size_t i = 0; i < size; ++i) {
1546
                apply_visitor(*this, x[i]);
1547
            }
1548
            return 0;
1549
        } else if constexpr (T == TYPE_NULL) {
1550
            have_nulls = true;
1551
            return 0;
1552
306
        } else {
1553
306
            field_types.insert(T);
1554
306
            type_indexes.insert(T);
1555
306
            return 0;
1556
306
        }
1557
306
    }
_ZN5doris12variant_util24FieldVisitorToScalarType5applyILNS_13PrimitiveTypeE37EEEmRKNS_19PrimitiveTypeTraitsIXT_EE7CppTypeE
Line
Count
Source
1542
275
    size_t apply(const typename PrimitiveTypeTraits<T>::CppType& x) {
1543
        if constexpr (T == TYPE_ARRAY) {
1544
            size_t size = x.size();
1545
            for (size_t i = 0; i < size; ++i) {
1546
                apply_visitor(*this, x[i]);
1547
            }
1548
            return 0;
1549
        } else if constexpr (T == TYPE_NULL) {
1550
            have_nulls = true;
1551
            return 0;
1552
275
        } else {
1553
275
            field_types.insert(T);
1554
275
            type_indexes.insert(T);
1555
275
            return 0;
1556
275
        }
1557
275
    }
_ZN5doris12variant_util24FieldVisitorToScalarType5applyILNS_13PrimitiveTypeE23EEEmRKNS_19PrimitiveTypeTraitsIXT_EE7CppTypeE
Line
Count
Source
1542
1.17M
    size_t apply(const typename PrimitiveTypeTraits<T>::CppType& x) {
1543
        if constexpr (T == TYPE_ARRAY) {
1544
            size_t size = x.size();
1545
            for (size_t i = 0; i < size; ++i) {
1546
                apply_visitor(*this, x[i]);
1547
            }
1548
            return 0;
1549
        } else if constexpr (T == TYPE_NULL) {
1550
            have_nulls = true;
1551
            return 0;
1552
1.17M
        } else {
1553
1.17M
            field_types.insert(T);
1554
1.17M
            type_indexes.insert(T);
1555
1.17M
            return 0;
1556
1.17M
        }
1557
1.17M
    }
Unexecuted instantiation: _ZN5doris12variant_util24FieldVisitorToScalarType5applyILNS_13PrimitiveTypeE15EEEmRKNS_19PrimitiveTypeTraitsIXT_EE7CppTypeE
Unexecuted instantiation: _ZN5doris12variant_util24FieldVisitorToScalarType5applyILNS_13PrimitiveTypeE10EEEmRKNS_19PrimitiveTypeTraitsIXT_EE7CppTypeE
Unexecuted instantiation: _ZN5doris12variant_util24FieldVisitorToScalarType5applyILNS_13PrimitiveTypeE41EEEmRKNS_19PrimitiveTypeTraitsIXT_EE7CppTypeE
_ZN5doris12variant_util24FieldVisitorToScalarType5applyILNS_13PrimitiveTypeE17EEEmRKNS_19PrimitiveTypeTraitsIXT_EE7CppTypeE
Line
Count
Source
1542
2.63M
    size_t apply(const typename PrimitiveTypeTraits<T>::CppType& x) {
1543
2.63M
        if constexpr (T == TYPE_ARRAY) {
1544
2.63M
            size_t size = x.size();
1545
6.44M
            for (size_t i = 0; i < size; ++i) {
1546
3.80M
                apply_visitor(*this, x[i]);
1547
3.80M
            }
1548
2.63M
            return 0;
1549
        } else if constexpr (T == TYPE_NULL) {
1550
            have_nulls = true;
1551
            return 0;
1552
        } else {
1553
            field_types.insert(T);
1554
            type_indexes.insert(T);
1555
            return 0;
1556
        }
1557
2.63M
    }
_ZN5doris12variant_util24FieldVisitorToScalarType5applyILNS_13PrimitiveTypeE16EEEmRKNS_19PrimitiveTypeTraitsIXT_EE7CppTypeE
Line
Count
Source
1542
1
    size_t apply(const typename PrimitiveTypeTraits<T>::CppType& x) {
1543
        if constexpr (T == TYPE_ARRAY) {
1544
            size_t size = x.size();
1545
            for (size_t i = 0; i < size; ++i) {
1546
                apply_visitor(*this, x[i]);
1547
            }
1548
            return 0;
1549
        } else if constexpr (T == TYPE_NULL) {
1550
            have_nulls = true;
1551
            return 0;
1552
1
        } else {
1553
1
            field_types.insert(T);
1554
1
            type_indexes.insert(T);
1555
1
            return 0;
1556
1
        }
1557
1
    }
Unexecuted instantiation: _ZN5doris12variant_util24FieldVisitorToScalarType5applyILNS_13PrimitiveTypeE18EEEmRKNS_19PrimitiveTypeTraitsIXT_EE7CppTypeE
_ZN5doris12variant_util24FieldVisitorToScalarType5applyILNS_13PrimitiveTypeE32EEEmRKNS_19PrimitiveTypeTraitsIXT_EE7CppTypeE
Line
Count
Source
1542
1
    size_t apply(const typename PrimitiveTypeTraits<T>::CppType& x) {
1543
        if constexpr (T == TYPE_ARRAY) {
1544
            size_t size = x.size();
1545
            for (size_t i = 0; i < size; ++i) {
1546
                apply_visitor(*this, x[i]);
1547
            }
1548
            return 0;
1549
        } else if constexpr (T == TYPE_NULL) {
1550
            have_nulls = true;
1551
            return 0;
1552
1
        } else {
1553
1
            field_types.insert(T);
1554
1
            type_indexes.insert(T);
1555
1
            return 0;
1556
1
        }
1557
1
    }
_ZN5doris12variant_util24FieldVisitorToScalarType5applyILNS_13PrimitiveTypeE28EEEmRKNS_19PrimitiveTypeTraitsIXT_EE7CppTypeE
Line
Count
Source
1542
756
    size_t apply(const typename PrimitiveTypeTraits<T>::CppType& x) {
1543
        if constexpr (T == TYPE_ARRAY) {
1544
            size_t size = x.size();
1545
            for (size_t i = 0; i < size; ++i) {
1546
                apply_visitor(*this, x[i]);
1547
            }
1548
            return 0;
1549
        } else if constexpr (T == TYPE_NULL) {
1550
            have_nulls = true;
1551
            return 0;
1552
756
        } else {
1553
756
            field_types.insert(T);
1554
756
            type_indexes.insert(T);
1555
756
            return 0;
1556
756
        }
1557
756
    }
_ZN5doris12variant_util24FieldVisitorToScalarType5applyILNS_13PrimitiveTypeE29EEEmRKNS_19PrimitiveTypeTraitsIXT_EE7CppTypeE
Line
Count
Source
1542
696
    size_t apply(const typename PrimitiveTypeTraits<T>::CppType& x) {
1543
        if constexpr (T == TYPE_ARRAY) {
1544
            size_t size = x.size();
1545
            for (size_t i = 0; i < size; ++i) {
1546
                apply_visitor(*this, x[i]);
1547
            }
1548
            return 0;
1549
        } else if constexpr (T == TYPE_NULL) {
1550
            have_nulls = true;
1551
            return 0;
1552
696
        } else {
1553
696
            field_types.insert(T);
1554
696
            type_indexes.insert(T);
1555
696
            return 0;
1556
696
        }
1557
696
    }
Unexecuted instantiation: _ZN5doris12variant_util24FieldVisitorToScalarType5applyILNS_13PrimitiveTypeE20EEEmRKNS_19PrimitiveTypeTraitsIXT_EE7CppTypeE
_ZN5doris12variant_util24FieldVisitorToScalarType5applyILNS_13PrimitiveTypeE30EEEmRKNS_19PrimitiveTypeTraitsIXT_EE7CppTypeE
Line
Count
Source
1542
70.7k
    size_t apply(const typename PrimitiveTypeTraits<T>::CppType& x) {
1543
        if constexpr (T == TYPE_ARRAY) {
1544
            size_t size = x.size();
1545
            for (size_t i = 0; i < size; ++i) {
1546
                apply_visitor(*this, x[i]);
1547
            }
1548
            return 0;
1549
        } else if constexpr (T == TYPE_NULL) {
1550
            have_nulls = true;
1551
            return 0;
1552
70.7k
        } else {
1553
70.7k
            field_types.insert(T);
1554
70.7k
            type_indexes.insert(T);
1555
70.7k
            return 0;
1556
70.7k
        }
1557
70.7k
    }
_ZN5doris12variant_util24FieldVisitorToScalarType5applyILNS_13PrimitiveTypeE35EEEmRKNS_19PrimitiveTypeTraitsIXT_EE7CppTypeE
Line
Count
Source
1542
558
    size_t apply(const typename PrimitiveTypeTraits<T>::CppType& x) {
1543
        if constexpr (T == TYPE_ARRAY) {
1544
            size_t size = x.size();
1545
            for (size_t i = 0; i < size; ++i) {
1546
                apply_visitor(*this, x[i]);
1547
            }
1548
            return 0;
1549
        } else if constexpr (T == TYPE_NULL) {
1550
            have_nulls = true;
1551
            return 0;
1552
558
        } else {
1553
558
            field_types.insert(T);
1554
558
            type_indexes.insert(T);
1555
558
            return 0;
1556
558
        }
1557
558
    }
Unexecuted instantiation: _ZN5doris12variant_util24FieldVisitorToScalarType5applyILNS_13PrimitiveTypeE22EEEmRKNS_19PrimitiveTypeTraitsIXT_EE7CppTypeE
Unexecuted instantiation: _ZN5doris12variant_util24FieldVisitorToScalarType5applyILNS_13PrimitiveTypeE19EEEmRKNS_19PrimitiveTypeTraitsIXT_EE7CppTypeE
Unexecuted instantiation: _ZN5doris12variant_util24FieldVisitorToScalarType5applyILNS_13PrimitiveTypeE24EEEmRKNS_19PrimitiveTypeTraitsIXT_EE7CppTypeE
_ZN5doris12variant_util24FieldVisitorToScalarType5applyILNS_13PrimitiveTypeE31EEEmRKNS_19PrimitiveTypeTraitsIXT_EE7CppTypeE
Line
Count
Source
1542
44
    size_t apply(const typename PrimitiveTypeTraits<T>::CppType& x) {
1543
        if constexpr (T == TYPE_ARRAY) {
1544
            size_t size = x.size();
1545
            for (size_t i = 0; i < size; ++i) {
1546
                apply_visitor(*this, x[i]);
1547
            }
1548
            return 0;
1549
        } else if constexpr (T == TYPE_NULL) {
1550
            have_nulls = true;
1551
            return 0;
1552
44
        } else {
1553
44
            field_types.insert(T);
1554
44
            type_indexes.insert(T);
1555
44
            return 0;
1556
44
        }
1557
44
    }
1558
1.94M
    void get_scalar_type(PrimitiveType* type) const {
1559
1.94M
        if (type_indexes.size() == 1) {
1560
            // Most cases will have only one type
1561
1.85M
            *type = *type_indexes.begin();
1562
1.85M
            return;
1563
1.85M
        }
1564
90.6k
        DataTypePtr data_type;
1565
90.6k
        get_least_supertype_jsonb(type_indexes, &data_type);
1566
90.6k
        *type = data_type->get_primitive_type();
1567
90.6k
    }
1568
1.94M
    bool contain_nulls() const { return have_nulls; }
1569
1.94M
    bool need_convert_field() const { return field_types.size() > 1; }
1570
1571
private:
1572
    phmap::flat_hash_set<PrimitiveType> type_indexes;
1573
    phmap::flat_hash_set<PrimitiveType> field_types;
1574
    bool have_nulls = false;
1575
};
1576
1577
template <typename Visitor>
1578
20.7M
void get_field_info_impl(const Field& field, FieldInfo* info) {
1579
20.7M
    Visitor to_scalar_type_visitor;
1580
20.7M
    apply_visitor(to_scalar_type_visitor, field);
1581
20.7M
    PrimitiveType type_id;
1582
20.7M
    to_scalar_type_visitor.get_scalar_type(&type_id);
1583
    // array item's dimension may missmatch, eg. [1, 2, [1, 2, 3]]
1584
20.7M
    *info = {type_id, to_scalar_type_visitor.contain_nulls(),
1585
20.7M
             to_scalar_type_visitor.need_convert_field(),
1586
20.7M
             apply_visitor(FieldVisitorToNumberOfDimensions(), field)};
1587
20.7M
}
_ZN5doris12variant_util19get_field_info_implINS0_24FieldVisitorToScalarTypeEEEvRKNS_5FieldEPNS_9FieldInfoE
Line
Count
Source
1578
1.94M
void get_field_info_impl(const Field& field, FieldInfo* info) {
1579
1.94M
    Visitor to_scalar_type_visitor;
1580
1.94M
    apply_visitor(to_scalar_type_visitor, field);
1581
1.94M
    PrimitiveType type_id;
1582
1.94M
    to_scalar_type_visitor.get_scalar_type(&type_id);
1583
    // array item's dimension may missmatch, eg. [1, 2, [1, 2, 3]]
1584
1.94M
    *info = {type_id, to_scalar_type_visitor.contain_nulls(),
1585
1.94M
             to_scalar_type_visitor.need_convert_field(),
1586
1.94M
             apply_visitor(FieldVisitorToNumberOfDimensions(), field)};
1587
1.94M
}
_ZN5doris12variant_util19get_field_info_implINS0_30SimpleFieldVisitorToScalarTypeEEEvRKNS_5FieldEPNS_9FieldInfoE
Line
Count
Source
1578
18.8M
void get_field_info_impl(const Field& field, FieldInfo* info) {
1579
18.8M
    Visitor to_scalar_type_visitor;
1580
18.8M
    apply_visitor(to_scalar_type_visitor, field);
1581
18.8M
    PrimitiveType type_id;
1582
18.8M
    to_scalar_type_visitor.get_scalar_type(&type_id);
1583
    // array item's dimension may missmatch, eg. [1, 2, [1, 2, 3]]
1584
18.8M
    *info = {type_id, to_scalar_type_visitor.contain_nulls(),
1585
18.8M
             to_scalar_type_visitor.need_convert_field(),
1586
18.8M
             apply_visitor(FieldVisitorToNumberOfDimensions(), field)};
1587
18.8M
}
1588
1589
20.7M
void get_field_info(const Field& field, FieldInfo* info) {
1590
20.7M
    if (field.is_complex_field()) {
1591
1.94M
        get_field_info_impl<FieldVisitorToScalarType>(field, info);
1592
18.8M
    } else {
1593
18.8M
        get_field_info_impl<SimpleFieldVisitorToScalarType>(field, info);
1594
18.8M
    }
1595
20.7M
}
1596
1597
bool generate_sub_column_info(const TabletSchema& schema, int32_t col_unique_id,
1598
                              const std::string& path,
1599
272k
                              TabletSchema::SubColumnInfo* sub_column_info) {
1600
272k
    const auto& parent_column = schema.column_by_uid(col_unique_id);
1601
272k
    std::function<void(const TabletColumn&, TabletColumn*)> generate_result_column =
1602
272k
            [&](const TabletColumn& from_column, TabletColumn* to_column) {
1603
15.0k
                to_column->set_name(parent_column.name_lower_case() + "." + path);
1604
15.0k
                to_column->set_type(from_column.type());
1605
15.0k
                to_column->set_parent_unique_id(parent_column.unique_id());
1606
15.0k
                bool is_typed = !parent_column.variant_enable_typed_paths_to_sparse();
1607
15.0k
                to_column->set_path_info(
1608
15.0k
                        PathInData(parent_column.name_lower_case() + "." + path, is_typed));
1609
15.0k
                to_column->set_aggregation_method(parent_column.aggregation());
1610
15.0k
                to_column->set_is_nullable(true);
1611
15.0k
                to_column->set_parent_unique_id(parent_column.unique_id());
1612
15.0k
                if (from_column.is_decimal()) {
1613
14.8k
                    to_column->set_precision(from_column.precision());
1614
14.8k
                }
1615
15.0k
                to_column->set_frac(from_column.frac());
1616
1617
15.0k
                if (from_column.is_array_type()) {
1618
2.56k
                    TabletColumn nested_column;
1619
2.56k
                    generate_result_column(*from_column.get_sub_columns()[0], &nested_column);
1620
2.56k
                    to_column->add_sub_column(nested_column);
1621
2.56k
                }
1622
15.0k
            };
1623
1624
272k
    auto generate_index = [&](const std::string& pattern) {
1625
        // 1. find subcolumn's index
1626
12.4k
        if (const auto& indexes = schema.inverted_index_by_field_pattern(col_unique_id, pattern);
1627
12.4k
            !indexes.empty()) {
1628
5.03k
            for (const auto& index : indexes) {
1629
5.03k
                auto index_ptr = std::make_shared<TabletIndex>(*index);
1630
5.03k
                index_ptr->set_escaped_escaped_index_suffix_path(
1631
5.03k
                        sub_column_info->column.path_info_ptr()->get_path());
1632
5.03k
                sub_column_info->indexes.emplace_back(std::move(index_ptr));
1633
5.03k
            }
1634
4.94k
        }
1635
        // 2. find parent column's index
1636
7.53k
        else if (const auto parent_index = schema.inverted_indexs(col_unique_id);
1637
7.53k
                 !parent_index.empty()) {
1638
494
            inherit_index(parent_index, sub_column_info->indexes, sub_column_info->column);
1639
7.03k
        } else {
1640
7.03k
            sub_column_info->indexes.clear();
1641
7.03k
        }
1642
12.4k
    };
1643
1644
272k
    const auto& sub_columns = parent_column.get_sub_columns();
1645
272k
    for (const auto& sub_column : sub_columns) {
1646
215k
        const char* pattern = sub_column->name().c_str();
1647
215k
        switch (sub_column->pattern_type()) {
1648
5.11k
        case PatternTypePB::MATCH_NAME: {
1649
5.11k
            if (strcmp(pattern, path.c_str()) == 0) {
1650
1.12k
                generate_result_column(*sub_column, &sub_column_info->column);
1651
1.12k
                generate_index(sub_column->name());
1652
1.12k
                return true;
1653
1.12k
            }
1654
3.99k
            break;
1655
5.11k
        }
1656
210k
        case PatternTypePB::MATCH_NAME_GLOB: {
1657
210k
            if (glob_match_re2(pattern, path)) {
1658
11.3k
                generate_result_column(*sub_column, &sub_column_info->column);
1659
11.3k
                generate_index(sub_column->name());
1660
11.3k
                return true;
1661
11.3k
            }
1662
199k
            break;
1663
210k
        }
1664
199k
        default:
1665
0
            break;
1666
215k
        }
1667
215k
    }
1668
259k
    return false;
1669
272k
}
1670
1671
TabletSchemaSPtr VariantCompactionUtil::calculate_variant_extended_schema(
1672
1.45k
        const std::vector<RowsetSharedPtr>& rowsets, const TabletSchemaSPtr& base_schema) {
1673
1.45k
    if (rowsets.empty()) {
1674
1
        return nullptr;
1675
1
    }
1676
1677
1.45k
    std::vector<TabletSchemaSPtr> schemas;
1678
3.35k
    for (const auto& rs : rowsets) {
1679
3.35k
        if (rs->num_segments() == 0) {
1680
3.14k
            continue;
1681
3.14k
        }
1682
217
        const auto& tablet_schema = rs->tablet_schema();
1683
217
        SegmentCacheHandle segment_cache;
1684
217
        auto st = SegmentLoader::instance()->load_segments(std::static_pointer_cast<BetaRowset>(rs),
1685
217
                                                           &segment_cache);
1686
217
        if (!st.ok()) {
1687
0
            return base_schema;
1688
0
        }
1689
224
        for (const auto& segment : segment_cache.get_segments()) {
1690
224
            TabletSchemaSPtr schema = tablet_schema->copy_without_variant_extracted_columns();
1691
454
            for (const auto& column : tablet_schema->columns()) {
1692
454
                if (!column->is_variant_type()) {
1693
224
                    continue;
1694
224
                }
1695
230
                std::shared_ptr<ColumnReader> column_reader;
1696
230
                OlapReaderStatistics stats;
1697
230
                st = segment->get_column_reader(column->unique_id(), &column_reader, &stats);
1698
230
                if (!st.ok()) {
1699
0
                    LOG(WARNING) << "Failed to get column reader for column: " << column->name()
1700
0
                                 << " error: " << st.to_string();
1701
0
                    continue;
1702
0
                }
1703
230
                if (!column_reader) {
1704
0
                    continue;
1705
0
                }
1706
1707
230
                CHECK(column_reader->get_meta_type() == FieldType::OLAP_FIELD_TYPE_VARIANT);
1708
230
                auto* variant_column_reader =
1709
230
                        assert_cast<segment_v2::VariantColumnReader*>(column_reader.get());
1710
                // load external meta before getting subcolumn meta info
1711
230
                st = variant_column_reader->load_external_meta_once();
1712
230
                if (!st.ok()) {
1713
0
                    LOG(WARNING) << "Failed to load external meta for column: " << column->name()
1714
0
                                 << " error: " << st.to_string();
1715
0
                    continue;
1716
0
                }
1717
230
                const auto* subcolumn_meta_info = variant_column_reader->get_subcolumns_meta_info();
1718
627
                for (const auto& entry : *subcolumn_meta_info) {
1719
627
                    if (entry->path.empty()) {
1720
230
                        continue;
1721
230
                    }
1722
397
                    const std::string& column_name =
1723
397
                            column->name_lower_case() + "." + entry->path.get_path();
1724
397
                    const DataTypePtr& data_type = entry->data.file_column_type;
1725
397
                    PathInDataBuilder full_path_builder;
1726
397
                    auto full_path = full_path_builder.append(column->name_lower_case(), false)
1727
397
                                             .append(entry->path.get_parts(), false)
1728
397
                                             .build();
1729
397
                    TabletColumn subcolumn =
1730
397
                            get_column_by_type(data_type, column_name,
1731
397
                                               ExtraInfo {.unique_id = -1,
1732
397
                                                          .parent_unique_id = column->unique_id(),
1733
397
                                                          .path_info = full_path});
1734
397
                    schema->append_column(subcolumn);
1735
397
                }
1736
230
            }
1737
224
            schemas.emplace_back(schema);
1738
224
        }
1739
217
    }
1740
1.45k
    TabletSchemaSPtr least_common_schema;
1741
1.45k
    auto st = get_least_common_schema(schemas, base_schema, least_common_schema, false);
1742
1.45k
    if (!st.ok()) {
1743
0
        return base_schema;
1744
0
    }
1745
1.45k
    return least_common_schema;
1746
1.45k
}
1747
1748
bool inherit_index(const std::vector<const TabletIndex*>& parent_indexes,
1749
                   TabletIndexes& subcolumns_indexes, FieldType column_type,
1750
104k
                   const std::string& suffix_path, bool is_array_nested_type) {
1751
104k
    if (parent_indexes.empty()) {
1752
99.7k
        return false;
1753
99.7k
    }
1754
4.46k
    subcolumns_indexes.clear();
1755
    // bkd index or array index only need to inherit one index
1756
4.46k
    if (field_is_numeric_type(column_type) ||
1757
4.46k
        (is_array_nested_type &&
1758
2.49k
         (field_is_numeric_type(column_type) || field_is_slice_type(column_type)))) {
1759
1.99k
        auto index_ptr = std::make_shared<TabletIndex>(*parent_indexes[0]);
1760
1.99k
        index_ptr->set_escaped_escaped_index_suffix_path(suffix_path);
1761
        // no need parse for bkd index or array index
1762
1.99k
        index_ptr->remove_parser_and_analyzer();
1763
1.99k
        subcolumns_indexes.emplace_back(std::move(index_ptr));
1764
1.99k
        return true;
1765
1.99k
    }
1766
    // string type need to inherit all indexes
1767
2.47k
    else if (field_is_slice_type(column_type) && !is_array_nested_type) {
1768
2.49k
        for (const auto& index : parent_indexes) {
1769
2.49k
            auto index_ptr = std::make_shared<TabletIndex>(*index);
1770
2.49k
            index_ptr->set_escaped_escaped_index_suffix_path(suffix_path);
1771
2.49k
            subcolumns_indexes.emplace_back(std::move(index_ptr));
1772
2.49k
        }
1773
2.47k
        return true;
1774
2.47k
    }
1775
5
    return false;
1776
4.46k
}
1777
1778
bool inherit_index(const std::vector<const TabletIndex*>& parent_indexes,
1779
104k
                   TabletIndexes& subcolumns_indexes, const TabletColumn& column) {
1780
104k
    if (!column.is_extracted_column()) {
1781
3
        return false;
1782
3
    }
1783
104k
    if (column.is_array_type()) {
1784
3.28k
        if (column.get_sub_columns().empty()) {
1785
0
            return false;
1786
0
        }
1787
3.28k
        const TabletColumn* nested = column.get_sub_columns()[0].get();
1788
3.28k
        while (nested != nullptr && nested->is_array_type()) {
1789
0
            if (nested->get_sub_columns().empty()) {
1790
0
                return false;
1791
0
            }
1792
0
            nested = nested->get_sub_columns()[0].get();
1793
0
        }
1794
3.28k
        if (nested == nullptr) {
1795
0
            return false;
1796
0
        }
1797
3.28k
        return inherit_index(parent_indexes, subcolumns_indexes, nested->type(),
1798
3.28k
                             column.path_info_ptr()->get_path(), true);
1799
3.28k
    }
1800
100k
    return inherit_index(parent_indexes, subcolumns_indexes, column.type(),
1801
100k
                         column.path_info_ptr()->get_path());
1802
104k
}
1803
1804
bool inherit_index(const std::vector<const TabletIndex*>& parent_indexes,
1805
0
                   TabletIndexes& subcolumns_indexes, const ColumnMetaPB& column_pb) {
1806
0
    if (!column_pb.has_column_path_info()) {
1807
0
        return false;
1808
0
    }
1809
0
    if (column_pb.type() == (int)FieldType::OLAP_FIELD_TYPE_ARRAY) {
1810
0
        if (column_pb.children_columns_size() == 0) {
1811
0
            return false;
1812
0
        }
1813
0
        const ColumnMetaPB* nested = &column_pb.children_columns(0);
1814
0
        while (nested != nullptr && nested->type() == (int)FieldType::OLAP_FIELD_TYPE_ARRAY) {
1815
0
            if (nested->children_columns_size() == 0) {
1816
0
                return false;
1817
0
            }
1818
0
            nested = &nested->children_columns(0);
1819
0
        }
1820
0
        if (nested == nullptr) {
1821
0
            return false;
1822
0
        }
1823
0
        return inherit_index(parent_indexes, subcolumns_indexes, (FieldType)nested->type(),
1824
0
                             column_pb.column_path_info().path(), true);
1825
0
    }
1826
0
    return inherit_index(parent_indexes, subcolumns_indexes, (FieldType)column_pb.type(),
1827
0
                         column_pb.column_path_info().path());
1828
0
}
1829
1830
// ============ Implementation from parse2column.cpp ============
1831
1832
/** Pool for objects that cannot be used from different threads simultaneously.
1833
  * Allows to create an object for each thread.
1834
  * Pool has unbounded size and objects are not destroyed before destruction of pool.
1835
  *
1836
  * Use it in cases when thread local storage is not appropriate
1837
  *  (when maximum number of simultaneously used objects is less
1838
  *   than number of running/sleeping threads, that has ever used object,
1839
  *   and creation/destruction of objects is expensive).
1840
  */
1841
template <typename T>
1842
class SimpleObjectPool {
1843
protected:
1844
    /// Hold all available objects in stack.
1845
    std::mutex mutex;
1846
    std::stack<std::unique_ptr<T>> stack;
1847
    /// Specialized deleter for std::unique_ptr.
1848
    /// Returns underlying pointer back to stack thus reclaiming its ownership.
1849
    struct Deleter {
1850
        SimpleObjectPool<T>* parent;
1851
16.9k
        Deleter(SimpleObjectPool<T>* parent_ = nullptr) : parent {parent_} {} /// NOLINT
1852
16.9k
        void operator()(T* owning_ptr) const {
1853
16.9k
            std::lock_guard lock {parent->mutex};
1854
16.9k
            parent->stack.emplace(owning_ptr);
1855
16.9k
        }
1856
    };
1857
1858
public:
1859
    using Pointer = std::unique_ptr<T, Deleter>;
1860
    /// Extracts and returns a pointer from the stack if it's not empty,
1861
    ///  creates a new one by calling provided f() otherwise.
1862
    template <typename Factory>
1863
16.9k
    Pointer get(Factory&& f) {
1864
16.9k
        std::unique_lock lock(mutex);
1865
16.9k
        if (stack.empty()) {
1866
27
            return {f(), this};
1867
27
        }
1868
16.9k
        auto object = stack.top().release();
1869
16.9k
        stack.pop();
1870
16.9k
        return std::unique_ptr<T, Deleter>(object, Deleter(this));
1871
16.9k
    }
variant_util.cpp:_ZN5doris12variant_util16SimpleObjectPoolINS_14JSONDataParserINS_14SimdJSONParserEEEE3getIZNS0_21parse_json_to_variantERNS_7IColumnERKNS_9StringRefEPS4_RKNS_11ParseConfigEE3$_0EESt10unique_ptrIS4_NS5_7DeleterEEOT_
Line
Count
Source
1863
12.4k
    Pointer get(Factory&& f) {
1864
12.4k
        std::unique_lock lock(mutex);
1865
12.4k
        if (stack.empty()) {
1866
1
            return {f(), this};
1867
1
        }
1868
12.4k
        auto object = stack.top().release();
1869
12.4k
        stack.pop();
1870
12.4k
        return std::unique_ptr<T, Deleter>(object, Deleter(this));
1871
12.4k
    }
variant_util.cpp:_ZN5doris12variant_util16SimpleObjectPoolINS_14JSONDataParserINS_14SimdJSONParserEEEE3getIZNS0_21parse_json_to_variantERNS_7IColumnERKNS_9ColumnStrIjEERKNS_11ParseConfigEE3$_0EESt10unique_ptrIS4_NS5_7DeleterEEOT_
Line
Count
Source
1863
4.54k
    Pointer get(Factory&& f) {
1864
4.54k
        std::unique_lock lock(mutex);
1865
4.54k
        if (stack.empty()) {
1866
26
            return {f(), this};
1867
26
        }
1868
4.51k
        auto object = stack.top().release();
1869
4.51k
        stack.pop();
1870
4.51k
        return std::unique_ptr<T, Deleter>(object, Deleter(this));
1871
4.54k
    }
1872
    /// Like get(), but creates object using default constructor.
1873
    Pointer getDefault() {
1874
        return get([] { return new T; });
1875
    }
1876
};
1877
1878
SimpleObjectPool<JsonParser> parsers_pool;
1879
1880
using Node = typename ColumnVariant::Subcolumns::Node;
1881
1882
42.9M
static inline void append_binary_bytes(ColumnString::Chars& chars, const void* data, size_t size) {
1883
42.9M
    const auto old_size = chars.size();
1884
42.9M
    chars.resize(old_size + size);
1885
42.9M
    memcpy(chars.data() + old_size, reinterpret_cast<const char*>(data), size);
1886
42.9M
}
1887
1888
17.9M
static inline void append_binary_type(ColumnString::Chars& chars, FieldType type) {
1889
17.9M
    const uint8_t t = static_cast<uint8_t>(type);
1890
17.9M
    append_binary_bytes(chars, &t, sizeof(uint8_t));
1891
17.9M
}
1892
1893
10.8M
static inline void append_binary_sizet(ColumnString::Chars& chars, size_t v) {
1894
10.8M
    append_binary_bytes(chars, &v, sizeof(size_t));
1895
10.8M
}
1896
1897
17.9M
static void append_field_to_binary_chars(const Field& field, ColumnString::Chars& chars) {
1898
17.9M
    switch (field.get_type()) {
1899
14
    case PrimitiveType::TYPE_NULL: {
1900
14
        append_binary_type(chars, FieldType::OLAP_FIELD_TYPE_NONE);
1901
14
        return;
1902
0
    }
1903
261k
    case PrimitiveType::TYPE_BOOLEAN: {
1904
261k
        append_binary_type(chars,
1905
261k
                           TabletColumn::get_field_type_by_type(PrimitiveType::TYPE_BOOLEAN));
1906
261k
        const auto v = static_cast<UInt8>(field.get<PrimitiveType::TYPE_BOOLEAN>());
1907
261k
        append_binary_bytes(chars, &v, sizeof(UInt8));
1908
261k
        return;
1909
0
    }
1910
4.54M
    case PrimitiveType::TYPE_BIGINT: {
1911
4.54M
        append_binary_type(chars, TabletColumn::get_field_type_by_type(PrimitiveType::TYPE_BIGINT));
1912
4.54M
        const auto v = field.get<PrimitiveType::TYPE_BIGINT>();
1913
4.54M
        append_binary_bytes(chars, &v, sizeof(Int64));
1914
4.54M
        return;
1915
0
    }
1916
9
    case PrimitiveType::TYPE_LARGEINT: {
1917
9
        append_binary_type(chars,
1918
9
                           TabletColumn::get_field_type_by_type(PrimitiveType::TYPE_LARGEINT));
1919
9
        const auto v = field.get<PrimitiveType::TYPE_LARGEINT>();
1920
9
        append_binary_bytes(chars, &v, sizeof(int128_t));
1921
9
        return;
1922
0
    }
1923
2.76M
    case PrimitiveType::TYPE_DOUBLE: {
1924
2.76M
        append_binary_type(chars, TabletColumn::get_field_type_by_type(PrimitiveType::TYPE_DOUBLE));
1925
2.76M
        const auto v = field.get<PrimitiveType::TYPE_DOUBLE>();
1926
2.76M
        append_binary_bytes(chars, &v, sizeof(Float64));
1927
2.76M
        return;
1928
0
    }
1929
10.2M
    case PrimitiveType::TYPE_STRING: {
1930
10.2M
        append_binary_type(chars, FieldType::OLAP_FIELD_TYPE_STRING);
1931
10.2M
        const auto& v = field.get<PrimitiveType::TYPE_STRING>();
1932
10.2M
        append_binary_sizet(chars, v.size());
1933
10.2M
        append_binary_bytes(chars, v.data(), v.size());
1934
10.2M
        return;
1935
0
    }
1936
46.7k
    case PrimitiveType::TYPE_JSONB: {
1937
46.7k
        append_binary_type(chars, FieldType::OLAP_FIELD_TYPE_JSONB);
1938
46.7k
        const auto& v = field.get<PrimitiveType::TYPE_JSONB>();
1939
46.7k
        append_binary_sizet(chars, v.get_size());
1940
46.7k
        append_binary_bytes(chars, v.get_value(), v.get_size());
1941
46.7k
        return;
1942
0
    }
1943
535k
    case PrimitiveType::TYPE_ARRAY: {
1944
535k
        append_binary_type(chars, FieldType::OLAP_FIELD_TYPE_ARRAY);
1945
535k
        const auto& a = field.get<PrimitiveType::TYPE_ARRAY>();
1946
535k
        append_binary_sizet(chars, a.size());
1947
798k
        for (const auto& elem : a) {
1948
798k
            append_field_to_binary_chars(elem, chars);
1949
798k
        }
1950
535k
        return;
1951
0
    }
1952
0
    default:
1953
0
        throw doris::Exception(ErrorCode::INVALID_ARGUMENT, "Unsupported field type {}",
1954
0
                               field.get_type());
1955
17.9M
    }
1956
17.9M
}
1957
template <typename ParserImpl>
1958
void parse_json_to_variant_impl(IColumn& column, const char* src, size_t length,
1959
1.37M
                                JSONDataParser<ParserImpl>* parser, const ParseConfig& config) {
1960
1.37M
    auto& column_variant = assert_cast<ColumnVariant&>(column);
1961
1.37M
    std::optional<ParseResult> result;
1962
    /// Treat empty string as an empty object
1963
    /// for better CAST from String to Object.
1964
1.37M
    if (length > 0) {
1965
1.37M
        result = parser->parse(src, length, config);
1966
1.37M
    } else {
1967
2.43k
        result = ParseResult {};
1968
2.43k
    }
1969
1.37M
    if (!result) {
1970
657
        VLOG_DEBUG << "failed to parse " << std::string_view(src, length) << ", length= " << length;
1971
657
        if (config::variant_throw_exeception_on_invalid_json) {
1972
0
            throw doris::Exception(ErrorCode::INVALID_ARGUMENT, "Failed to parse object {}",
1973
0
                                   std::string_view(src, length));
1974
0
        }
1975
        // Treat as string
1976
657
        PathInData root_path;
1977
657
        Field field = Field::create_field<TYPE_STRING>(String(src, length));
1978
657
        result = ParseResult {{root_path}, {field}};
1979
657
    }
1980
1.37M
    auto& [paths, values] = *result;
1981
1.37M
    assert(paths.size() == values.size());
1982
1.37M
    size_t old_num_rows = column_variant.rows();
1983
1.37M
    if (config.deprecated_enable_flatten_nested) {
1984
        // here we should check the paths in variant and paths in result,
1985
        // if two paths which same prefix have different structure, we should throw an exception
1986
3.02k
        std::vector<PathInData> check_paths;
1987
12.0k
        for (const auto& entry : column_variant.get_subcolumns()) {
1988
12.0k
            check_paths.push_back(entry->path);
1989
12.0k
        }
1990
3.02k
        check_paths.insert(check_paths.end(), paths.begin(), paths.end());
1991
3.02k
        THROW_IF_ERROR(check_variant_has_no_ambiguous_paths(check_paths));
1992
3.02k
    }
1993
1.37M
    auto [doc_value_data_paths, doc_value_data_values] =
1994
1.37M
            column_variant.get_doc_value_data_paths_and_values();
1995
1.37M
    auto& doc_value_data_offsets = column_variant.serialized_doc_value_column_offsets();
1996
1997
1.44M
    auto flush_defaults = [](ColumnVariant::Subcolumn* subcolumn) {
1998
1.44M
        const auto num_defaults = subcolumn->cur_num_of_defaults();
1999
1.44M
        if (num_defaults > 0) {
2000
165k
            subcolumn->insert_many_defaults(num_defaults);
2001
165k
            subcolumn->reset_current_num_of_defaults();
2002
165k
        }
2003
1.44M
    };
2004
2005
1.37M
    auto is_plain_path = [](const PathInData& path) {
2006
13
        for (const auto& part : path.get_parts()) {
2007
13
            if (part.is_nested || part.anonymous_array_level != 0) {
2008
0
                return false;
2009
0
            }
2010
13
        }
2011
9
        return true;
2012
9
    };
2013
2014
1.37M
    auto get_or_create_subcolumn = [&](const PathInData& path, size_t index_hint,
2015
1.43M
                                       const FieldInfo& field_info) -> ColumnVariant::Subcolumn* {
2016
1.43M
        auto* subcolumn = column_variant.get_subcolumn(path, index_hint);
2017
1.43M
        if (subcolumn == nullptr) {
2018
3.82k
            if (path.has_nested_part()) {
2019
17
                column_variant.add_nested_subcolumn(path, field_info, old_num_rows);
2020
3.80k
            } else {
2021
3.80k
                column_variant.add_sub_column(path, old_num_rows);
2022
3.80k
            }
2023
3.82k
            subcolumn = column_variant.get_subcolumn(path, index_hint);
2024
3.82k
        }
2025
1.43M
        if (!subcolumn) {
2026
0
            throw doris::Exception(ErrorCode::INVALID_ARGUMENT, "Failed to find sub column {}",
2027
0
                                   path.get_path());
2028
0
        }
2029
1.43M
        return subcolumn;
2030
1.43M
    };
2031
2032
1.43M
    auto normalize_plain_path = [&](const PathInData& path) {
2033
1.43M
        if (!config.check_duplicate_json_path || path.empty() || !is_plain_path(path)) {
2034
1.43M
            return path;
2035
1.43M
        }
2036
9
        return PathInData(path.get_path());
2037
1.43M
    };
2038
2039
1.37M
    auto insert_into_subcolumn = [&](size_t i,
2040
1.43M
                                     bool check_size_mismatch) -> ColumnVariant::Subcolumn* {
2041
1.43M
        FieldInfo field_info;
2042
1.43M
        get_field_info(values[i], &field_info);
2043
1.43M
        if (field_info.scalar_type_id == PrimitiveType::INVALID_TYPE) {
2044
109
            return nullptr;
2045
109
        }
2046
1.43M
        auto path = normalize_plain_path(paths[i]);
2047
1.43M
        auto* subcolumn = get_or_create_subcolumn(path, i, field_info);
2048
1.43M
        flush_defaults(subcolumn);
2049
1.43M
        if (check_size_mismatch && subcolumn->size() != old_num_rows) {
2050
1
            throw doris::Exception(ErrorCode::INVALID_ARGUMENT,
2051
1
                                   "subcolumn {} size missmatched, may contains duplicated entry",
2052
1
                                   path.get_path());
2053
1
        }
2054
1.43M
        subcolumn->insert(std::move(values[i]), std::move(field_info));
2055
1.43M
        return subcolumn;
2056
1.43M
    };
2057
2058
1.37M
    switch (config.parse_to) {
2059
102k
    case ParseConfig::ParseTo::OnlySubcolumns:
2060
1.54M
        for (size_t i = 0; i < paths.size(); ++i) {
2061
1.43M
            insert_into_subcolumn(i, true);
2062
1.43M
        }
2063
102k
        break;
2064
1.27M
    case ParseConfig::ParseTo::OnlyDocValueColumn: {
2065
1.27M
        std::vector<size_t> doc_item_indexes;
2066
1.27M
        doc_item_indexes.reserve(paths.size());
2067
1.27M
        phmap::flat_hash_set<StringRef, StringRefHash> seen_paths;
2068
1.27M
        seen_paths.reserve(paths.size());
2069
2070
19.0M
        for (size_t i = 0; i < paths.size(); ++i) {
2071
17.8M
            FieldInfo field_info;
2072
17.8M
            get_field_info(values[i], &field_info);
2073
17.8M
            if (paths[i].empty()) {
2074
                // Plain non-doc VARIANT can use doc-value KV as writer-side staging. An
2075
                // invalid root entry from JSON object/array is neither a scalar root value nor
2076
                // a doc KV path, so leave this row's doc offset empty. Doc-mode and valid scalar
2077
                // roots still populate the root subcolumn below.
2078
776
                if (!column_variant.enable_doc_mode() &&
2079
776
                    field_info.scalar_type_id == PrimitiveType::INVALID_TYPE) {
2080
1
                    continue;
2081
1
                }
2082
775
                auto* subcolumn = column_variant.get_subcolumn(paths[i]);
2083
775
                DCHECK(subcolumn != nullptr);
2084
775
                flush_defaults(subcolumn);
2085
775
                subcolumn->insert(std::move(values[i]), std::move(field_info));
2086
775
                continue;
2087
776
            }
2088
17.8M
            if (field_info.scalar_type_id == PrimitiveType::INVALID_TYPE ||
2089
17.8M
                values[i].get_type() == PrimitiveType::TYPE_NULL) {
2090
116k
                continue;
2091
116k
            }
2092
17.6M
            const auto& path_str = paths[i].get_path();
2093
17.6M
            StringRef path_ref {path_str.data(), path_str.size()};
2094
17.6M
            if (UNLIKELY(!seen_paths.emplace(path_ref).second)) {
2095
0
                throw doris::Exception(ErrorCode::INVALID_ARGUMENT,
2096
0
                                       "may contains duplicated entry : {}",
2097
0
                                       std::string_view(path_str));
2098
0
            }
2099
17.6M
            doc_item_indexes.push_back(i);
2100
17.6M
        }
2101
2102
1.27M
        std::sort(doc_item_indexes.begin(), doc_item_indexes.end(),
2103
71.0M
                  [&](size_t l, size_t r) { return paths[l].get_path() < paths[r].get_path(); });
2104
15.9M
        for (const auto idx : doc_item_indexes) {
2105
15.9M
            const auto& path_str = paths[idx].get_path();
2106
15.9M
            doc_value_data_paths->insert_data(path_str.data(), path_str.size());
2107
15.9M
            auto& chars = doc_value_data_values->get_chars();
2108
15.9M
            append_field_to_binary_chars(values[idx], chars);
2109
15.9M
            doc_value_data_values->get_offsets().push_back(chars.size());
2110
15.9M
        }
2111
1.27M
    } break;
2112
1.37M
    }
2113
1.37M
    doc_value_data_offsets.push_back(doc_value_data_paths->size());
2114
    // /// Insert default values to missed subcolumns.
2115
1.37M
    const auto& subcolumns = column_variant.get_subcolumns();
2116
5.58M
    for (const auto& entry : subcolumns) {
2117
5.58M
        if (entry->data.size() == old_num_rows) {
2118
            // Handle nested paths differently from simple paths
2119
4.14M
            if (entry->path.has_nested_part()) {
2120
                // Try to insert default from nested, if failed, insert regular default
2121
0
                bool success = UNLIKELY(column_variant.try_insert_default_from_nested(entry));
2122
0
                if (!success) {
2123
0
                    entry->data.insert_default();
2124
0
                }
2125
4.14M
            } else {
2126
                // For non-nested paths, increment default counter
2127
4.14M
                entry->data.increment_default_counter();
2128
4.14M
            }
2129
4.14M
        }
2130
5.58M
    }
2131
1.37M
    column_variant.incr_num_rows();
2132
1.37M
    if (column_variant.get_sparse_column()->size() == old_num_rows) {
2133
1.37M
        column_variant.get_sparse_column_mutable().insert_default();
2134
1.37M
    }
2135
1.37M
#ifndef NDEBUG
2136
1.37M
    column_variant.check_consistency();
2137
1.37M
#endif
2138
1.37M
}
2139
2140
// exposed interfaces
2141
// Parses one JSON document (`json`) into `column`.
//
// `parser` may be null: in that case a JsonParser is obtained from `parsers_pool` (which is handed
// a factory lambda for creating one) and used for this call. A non-null `parser` is used as is.
void parse_json_to_variant(IColumn& column, const StringRef& json, JsonParser* parser,
                           const ParseConfig& config) {
    // Caller brought its own parser: nothing to acquire.
    if (parser != nullptr) {
        parse_json_to_variant_impl(column, json.data, json.size, parser, config);
        return;
    }

    auto pooled_parser = parsers_pool.get([] { return new JsonParser(); });
    parse_json_to_variant_impl(column, json.data, json.size, pooled_parser.get(), config);
}
2150
2151
// Parses every row of `raw_json_column` as a JSON document into `column`, then finalizes `column`.
//
// A single parser obtained from `parsers_pool` is shared by all rows of this call.
void parse_json_to_variant(IColumn& column, const ColumnString& raw_json_column,
                           const ParseConfig& config) {
    auto pooled_parser = parsers_pool.get([] { return new JsonParser(); });
    auto* json_parser = pooled_parser.get();

    size_t row = 0;
    while (row < raw_json_column.size()) {
        const StringRef document = raw_json_column.get_data_at(row);
        parse_json_to_variant_impl(column, document.data, document.size, json_parser, config);
        ++row;
    }

    column.finalize();
}
2160
2161
// parse the doc snapshot column to subcolumns
2162
0
// Expands the doc snapshot data of `column_variant` into regular subcolumns.
//
// Each path returned by materialize_docs_to_subcolumns_map() is finalized and registered on
// `column_variant` through add_sub_column(); the variant itself is finalized at the end.
// Throws doris::Exception(INTERNAL_ERROR) if a subcolumn cannot be added.
void materialize_docs_to_subcolumns(ColumnVariant& column_variant) {
    auto path_to_subcolumn = materialize_docs_to_subcolumns_map(column_variant);

    for (auto& [path, subcolumn] : path_to_subcolumn) {
        subcolumn.finalize();
        if (!column_variant.add_sub_column(PathInData(path),
                                           IColumn::mutate(subcolumn.get_finalized_column_ptr()),
                                           subcolumn.get_least_common_type())) {
            throw doris::Exception(ErrorCode::INTERNAL_ERROR,
                                   "Failed to add subcolumn {}, which is from doc snapshot column",
                                   path);
        }
    }

    column_variant.finalize();
}
2178
2179
// ============ Implementation from variant_util.cpp ============
2180
2181
// Builds one Subcolumn per distinct path found in the doc value data of `variant`.
//
// The doc value data is read as (path, value) entries; `serialized_doc_value_column_offsets()`
// gives, for each row, the end position of that row's entries. Every returned subcolumn ends up
// with exactly as many rows as there are offsets: rows that do not mention a path get defaults.
//
// `expected_unique_paths` is a reserve hint for the result map; 0 means "derive a hint from the
// number of stored paths, capped at kInitialPathReserve".
//
// The map keys are views built from the data returned by the path column's get_data_at().
phmap::flat_hash_map<std::string_view, ColumnVariant::Subcolumn> materialize_docs_to_subcolumns_map(
        const ColumnVariant& variant, size_t expected_unique_paths) {
    constexpr size_t kInitialPathReserve = 8192;
    phmap::flat_hash_map<std::string_view, ColumnVariant::Subcolumn> result;

    const auto [doc_paths, doc_values] = variant.get_doc_value_data_paths_and_values();
    const auto& row_offsets = variant.serialized_doc_value_column_offsets();
    const size_t row_count = row_offsets.size();

    DCHECK_EQ(row_count, variant.size()) << "doc snapshot offsets size mismatch with variant rows";

    size_t reserve_hint = expected_unique_paths;
    if (reserve_hint == 0) {
        reserve_hint = std::min<size_t>(doc_paths->size(), kInitialPathReserve);
    }
    result.reserve(reserve_hint);

    for (size_t row = 0; row < row_count; ++row) {
        // NOTE(review): for row == 0 this reads index -1 of the offsets container; presumably it
        // is a padded array whose element before the first is 0 - confirm.
        const size_t entry_begin = row_offsets[row - 1];
        const size_t entry_end = row_offsets[row];

        for (size_t entry = entry_begin; entry < entry_end; ++entry) {
            const auto& raw_path = doc_paths->get_data_at(entry);
            const std::string_view path(raw_path.data, raw_path.size);

            auto [slot, is_new_path] =
                    result.try_emplace(path, ColumnVariant::Subcolumn {0, true, false});
            auto& target = slot->second;

            // Pad the subcolumn up to the current row before appending this row's value: a new
            // path needs `row` defaults, an existing one only the rows it skipped.
            const size_t missing_rows = is_new_path ? row : row - target.size();
            if (is_new_path || missing_rows != 0) {
                target.insert_many_defaults(missing_rows);
            }
            target.deserialize_from_binary_column(doc_values, entry);
        }
    }

    // Paths that are absent from the trailing rows still need to reach the full row count.
    for (auto& path_and_subcolumn : result) {
        auto& subcolumn = path_and_subcolumn.second;
        if (subcolumn.size() != row_count) {
            subcolumn.insert_many_defaults(row_count - subcolumn.size());
        }
    }

    return result;
}
2223
2224
// Parses the scalar variant columns of `block` found at the block positions in `variant_pos`.
//
// `configs[i]` is the parse configuration used for the column at `variant_pos[i]`.
// For each position:
//   - a nullable column is split into its null map and its nested variant column;
//   - the variant is finalized; if it is not a scalar variant it is skipped and the block entry
//     is left untouched;
//   - a string root (a JSONB root is first converted with jsonb_root_to_json_string_column) is
//     parsed into a freshly created ColumnVariant; any other root type is kept and its root node
//     type is adjusted with ensure_root_node_type();
//   - the result, re-wrapped with the original null map when the input was nullable, replaces
//     the column in `block`.
// Always returns Status::OK(); failures surface as exceptions from the callees.
Status _parse_and_materialize_variant_columns(Block& block,
                                              const std::vector<uint32_t>& variant_pos,
                                              const std::vector<ParseConfig>& configs) {
    for (size_t idx = 0; idx < variant_pos.size(); ++idx) {
        const uint32_t pos = variant_pos[idx];

        auto source_column = block.get_by_position(pos).column;
        const bool wrapped_in_nullable = is_column_nullable(*source_column);
        MutableColumnPtr mutable_source = IColumn::mutate(std::move(source_column));

        ColumnPtr null_map;
        MutableColumnPtr variant_holder;
        if (!wrapped_in_nullable) {
            variant_holder = std::move(mutable_source);
        } else {
            const auto& nullable_source = assert_cast<const ColumnNullable&>(*mutable_source);
            null_map = nullable_source.get_null_map_column_ptr();
            variant_holder = IColumn::mutate(nullable_source.get_nested_column_ptr());
        }
        auto& variant = assert_cast<ColumnVariant&>(*variant_holder);
        variant_holder->finalize();

        if (!variant.is_scalar_variant()) {
            // already parsed
            continue;
        }

        VLOG_DEBUG << "parse scalar variant column: " << variant.get_root_type()->get_name();

        // Pick the column that holds the raw scalar payload.
        ColumnPtr scalar_root;
        if (variant.get_root_type()->get_primitive_type() == TYPE_JSONB) {
            scalar_root = jsonb_root_to_json_string_column(*variant.get_root());
        } else {
            const auto& root = *variant.get_root();
            scalar_root = is_column_nullable(root)
                                  ? assert_cast<const ColumnNullable&>(root).get_nested_column_ptr()
                                  : variant.get_root();
        }

        MutableColumnPtr parsed_column;
        if (scalar_root->is_column_string()) {
            parsed_column = ColumnVariant::create(0, variant.enable_doc_mode());
            parse_json_to_variant(*parsed_column, assert_cast<const ColumnString&>(*scalar_root),
                                  configs[idx]);
        } else {
            // Root maybe other types rather than string like ColumnVariant(Int32).
            // In this case, we should finalize the root and cast to JSON type
            auto expected_root_type =
                    make_nullable(std::make_shared<ColumnVariant::MostCommonType>());
            variant.ensure_root_node_type(expected_root_type);
            parsed_column = std::move(variant_holder);
        }

        // Wrap variant with nullmap if it is nullable
        ColumnPtr result = parsed_column->get_ptr();
        if (wrapped_in_nullable) {
            result = ColumnNullable::create(result, null_map);
        }
        block.get_by_position(pos).column = result;
    }
    return Status::OK();
}
2284
2285
// Status-returning entry point around _parse_and_materialize_variant_columns().
//
// The call is made inside RETURN_IF_CATCH_EXCEPTION so that the work is executed through that
// macro rather than directly; `variant_pos` holds block positions and `configs[i]` belongs to
// `variant_pos[i]`.
Status parse_and_materialize_variant_columns(Block& block, const std::vector<uint32_t>& variant_pos,
                                             const std::vector<ParseConfig>& configs) {
    RETURN_IF_CATCH_EXCEPTION({
        Status parse_status = _parse_and_materialize_variant_columns(block, variant_pos, configs);
        return parse_status;
    });
}
2290
2291
namespace {
2292
2293
// Chooses where a storage-side VARIANT column is parsed to: parse-time subcolumns or the
// doc-value column. The checks below are ordered; the first one that matches decides.
ParseConfig::ParseTo select_storage_variant_parse_target(const TabletColumn& column,
                                                         const ParseConfig& config) {
    using ParseTo = ParseConfig::ParseTo;

    // NestedGroup builds its nested storage structures from the parse-time subcolumn tree, so
    // doc-value staging must be bypassed for it.
    if (column.variant_enable_nested_group()) {
        return ParseTo::OnlySubcolumns;
    }

    // Persistent doc mode: doc-value bucket columns are owned by VariantDocWriter. This stays
    // separate from the plain non-doc staging optimization even if typed paths or parent indexes
    // are present.
    if (column.variant_enable_doc_mode()) {
        return ParseTo::OnlyDocValueColumn;
    }

    // The deprecated flatten-nested mode still needs parse-time subcolumns. Predefined typed
    // paths and parent inverted indexes are dealt with later by regular doc-value staging: typed
    // paths are forced into the materialized set unless typed-to-sparse is enabled, and
    // materialized dynamic subcolumns inherit parent indexes while sparse payloads stay
    // unindexed.
    if (config.deprecated_enable_flatten_nested) {
        return ParseTo::OnlySubcolumns;
    }

    // Plain dynamic non-doc VARIANT: staging into doc-value KV avoids eagerly creating thousands
    // of parse-time subcolumns, and the segment writer later picks the materialized/sparse split
    // from it. The BE switch lets tests and rollouts compare the old parse-time path with
    // staging under the same writer and schema.
    const auto parse_mode = config::variant_storage_parse_mode;
    if (parse_mode == 0 || parse_mode == 2) {
        return ParseTo::OnlyDocValueColumn;
    }
    if (parse_mode == 1) {
        return ParseTo::OnlySubcolumns;
    }

    CHECK(false) << "invalid variant_storage_parse_mode: "
                 << config::variant_storage_parse_mode;
    return ParseTo::OnlyDocValueColumn;
}
2331
2332
} // namespace
2333
2334
// Parses all VARIANT columns of `block`, deriving one ParseConfig per column from the schema.
//
// `column_pos[k]` is used as the ordinal in `tablet_schema` of the k-th entry; `k` itself is
// treated as that column's position inside `block` (NOTE(review): this assumes the block holds
// exactly the columns listed in `column_pos`, in that order - confirm against callers).
//
// Two parallel lists are collected for the variant columns:
//   - variant_column_pos: positions inside `block`, later used with block.get_by_position();
//   - variant_schema_pos: ordinals inside `tablet_schema`, used to look up column options.
// Keeping them apart matters whenever `column_pos` is not the identity mapping: indexing the
// block with a schema ordinal would then touch the wrong column or run out of range.
//
// Returns Status::OK() when there is no variant column, an InternalError if a collected schema
// ordinal does not refer to a variant column, or the status of the parsing step otherwise.
Status parse_and_materialize_variant_columns(Block& block, const TabletSchema& tablet_schema,
                                             const std::vector<uint32_t>& column_pos) {
    std::vector<uint32_t> variant_column_pos;
    std::vector<uint32_t> variant_schema_pos;
    variant_column_pos.reserve(column_pos.size());
    variant_schema_pos.reserve(column_pos.size());
    for (size_t block_pos = 0; block_pos < column_pos.size(); ++block_pos) {
        const uint32_t schema_pos = column_pos[block_pos];
        const auto& column = tablet_schema.column(schema_pos);
        if (column.is_variant_type()) {
            // Block position and schema ordinal are recorded separately on purpose.
            variant_column_pos.push_back(static_cast<uint32_t>(block_pos));
            variant_schema_pos.push_back(schema_pos);
        }
    }

    if (variant_column_pos.empty()) {
        return Status::OK();
    }

    std::vector<ParseConfig> configs(variant_column_pos.size());
    for (size_t i = 0; i < variant_column_pos.size(); ++i) {
        // Deprecated legacy flatten-nested switch. Distinct from variant_enable_nested_group.
        configs[i].deprecated_enable_flatten_nested =
                tablet_schema.deprecated_variant_flatten_nested();
        configs[i].check_duplicate_json_path = config::variant_enable_duplicate_json_path_check;
        const auto& column = tablet_schema.column(variant_schema_pos[i]);
        if (!column.is_variant_type()) {
            return Status::InternalError("column is not variant type, column name: {}",
                                         column.name());
        }
        // The parse target depends on the flags set just above, so it is selected last.
        configs[i].parse_to = select_storage_variant_parse_target(column, configs[i]);
    }

    RETURN_IF_ERROR(parse_and_materialize_variant_columns(block, variant_column_pos, configs));
    return Status::OK();
}
2370
2371
} // namespace doris::variant_util