Coverage Report

Created: 2026-05-17 19:09

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
be/src/exec/common/variant_util.cpp
Line
Count
Source
1
// Licensed to the Apache Software Foundation (ASF) under one
2
// or more contributor license agreements.  See the NOTICE file
3
// distributed with this work for additional information
4
// regarding copyright ownership.  The ASF licenses this file
5
// to you under the Apache License, Version 2.0 (the
6
// "License"); you may not use this file except in compliance
7
// with the License.  You may obtain a copy of the License at
8
//
9
//   http://www.apache.org/licenses/LICENSE-2.0
10
//
11
// Unless required by applicable law or agreed to in writing,
12
// software distributed under the License is distributed on an
13
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14
// KIND, either express or implied.  See the License for the
15
// specific language governing permissions and limitations
16
// under the License.
17
18
#include "exec/common/variant_util.h"
19
20
#include <assert.h>
21
#include <fmt/format.h>
22
#include <gen_cpp/FrontendService.h>
23
#include <gen_cpp/FrontendService_types.h>
24
#include <gen_cpp/HeartbeatService_types.h>
25
#include <gen_cpp/MasterService_types.h>
26
#include <gen_cpp/Status_types.h>
27
#include <gen_cpp/Types_types.h>
28
#include <glog/logging.h>
29
#include <rapidjson/document.h>
30
#include <rapidjson/stringbuffer.h>
31
#include <rapidjson/writer.h>
32
#include <simdjson/simdjson.h> // IWYU pragma: keep
33
#include <unicode/uchar.h>
34
35
#include <algorithm>
36
#include <cassert>
37
#include <cstddef>
38
#include <cstdint>
39
#include <cstring>
40
#include <list>
41
#include <memory>
42
#include <mutex>
43
#include <optional>
44
#include <ostream>
45
#include <ranges>
46
#include <set>
47
#include <stack>
48
#include <string>
49
#include <string_view>
50
#include <unordered_map>
51
#include <utility>
52
#include <vector>
53
54
#include "common/config.h"
55
#include "common/status.h"
56
#include "core/assert_cast.h"
57
#include "core/block/block.h"
58
#include "core/block/column_numbers.h"
59
#include "core/block/column_with_type_and_name.h"
60
#include "core/column/column.h"
61
#include "core/column/column_array.h"
62
#include "core/column/column_map.h"
63
#include "core/column/column_nullable.h"
64
#include "core/column/column_string.h"
65
#include "core/column/column_variant.h"
66
#include "core/data_type/data_type.h"
67
#include "core/data_type/data_type_array.h"
68
#include "core/data_type/data_type_factory.hpp"
69
#include "core/data_type/data_type_jsonb.h"
70
#include "core/data_type/data_type_nullable.h"
71
#include "core/data_type/data_type_string.h"
72
#include "core/data_type/data_type_variant.h"
73
#include "core/data_type/define_primitive_type.h"
74
#include "core/data_type/get_least_supertype.h"
75
#include "core/data_type/primitive_type.h"
76
#include "core/field.h"
77
#include "core/typeid_cast.h"
78
#include "core/types.h"
79
#include "exec/common/field_visitors.h"
80
#include "exec/common/sip_hash.h"
81
#include "exprs/function/function.h"
82
#include "exprs/function/simple_function_factory.h"
83
#include "exprs/function_context.h"
84
#include "exprs/json_functions.h"
85
#include "re2/re2.h"
86
#include "runtime/exec_env.h"
87
#include "runtime/runtime_state.h"
88
#include "storage/olap_common.h"
89
#include "storage/rowset/beta_rowset.h"
90
#include "storage/rowset/rowset.h"
91
#include "storage/rowset/rowset_fwd.h"
92
#include "storage/segment/segment_loader.h"
93
#include "storage/segment/variant/nested_group_path.h"
94
#include "storage/segment/variant/variant_column_reader.h"
95
#include "storage/segment/variant/variant_column_writer_impl.h"
96
#include "storage/tablet/tablet.h"
97
#include "storage/tablet/tablet_fwd.h"
98
#include "storage/tablet/tablet_schema.h"
99
#include "util/client_cache.h"
100
#include "util/defer_op.h"
101
#include "util/json/json_parser.h"
102
#include "util/json/path_in_data.h"
103
#include "util/json/simd_json_parser.h"
104
105
namespace doris::variant_util {
106
107
9.75k
static bool is_decimal_typed_path_column(const TabletColumn& column) {
108
9.75k
    if (column.is_array_type()) {
109
1.86k
        CHECK_EQ(column.get_sub_columns().size(), 1);
110
1.86k
        return is_decimal_typed_path_column(*column.get_sub_columns()[0]);
111
1.86k
    }
112
7.88k
    return is_decimal(TabletColumn::get_primitive_type_by_field_type(column.type()));
113
9.75k
}
114
115
static void configure_decimal_number_preserve_paths(const TabletColumn& column,
116
2.95k
                                                    ParseConfig* config) {
117
2.95k
    std::vector<std::string> glob_patterns;
118
7.94k
    for (const auto& sub_column : column.get_sub_columns()) {
119
7.94k
        if (!is_decimal_typed_path_column(*sub_column)) {
120
7.06k
            continue;
121
7.06k
        }
122
883
        switch (sub_column->pattern_type()) {
123
49
        case PatternTypePB::MATCH_NAME:
124
49
            config->preserve_decimal_number_paths.emplace(sub_column->name());
125
49
            break;
126
822
        case PatternTypePB::MATCH_NAME_GLOB:
127
822
            glob_patterns.emplace_back(sub_column->name());
128
822
            break;
129
883
        }
130
883
    }
131
2.93k
    if (!glob_patterns.empty()) {
132
186
        config->preserve_decimal_number_path_matcher =
133
1.92M
                [glob_patterns = std::move(glob_patterns)](std::string_view path) {
134
1.92M
                    std::string candidate_path(path);
135
3.76M
                    for (const auto& pattern : glob_patterns) {
136
3.76M
                        if (glob_match_re2(pattern, candidate_path)) {
137
273k
                            return true;
138
273k
                        }
139
3.76M
                    }
140
1.65M
                    return false;
141
1.92M
                };
142
186
    }
143
2.93k
}
144
145
2.89k
// Appends `ch` to `*regex_output`, preceded by a backslash when `ch` is one of the regex
// metacharacters . ^ $ + * ? ( ) | { } [ ] or the backslash itself. Any other character is
// appended unchanged.
inline void append_escaped_regex_char(std::string* regex_output, char ch) {
    static constexpr std::string_view kRegexMetaChars = ".^$+*?()|{}[]\\";
    if (kRegexMetaChars.find(ch) != std::string_view::npos) {
        regex_output->push_back('\\');
    }
    regex_output->push_back(ch);
}
169
170
// Small LRU cache that caps the number of compiled glob patterns kept alive.
// get_or_build_re2() compares the map size against this capacity after each insertion and
// evicts the key at the back of the LRU list once the capacity is exceeded.
171
constexpr size_t kGlobRegexCacheCapacity = 256;
172
173
// One cache slot: the compiled regex plus the position of its key inside the LRU list.
struct GlobRegexCacheEntry {
174
    std::shared_ptr<RE2> re2;
175
    std::list<std::string>::iterator lru_it;
176
};
177
178
// get_or_build_re2() holds this mutex around every access to the list and map below.
static std::mutex g_glob_regex_cache_mutex;
179
// Glob pattern keys; get_or_build_re2() moves or inserts the key it just used at the front.
static std::list<std::string> g_glob_regex_cache_lru;
180
// Glob pattern text -> cache slot.
static std::unordered_map<std::string, GlobRegexCacheEntry> g_glob_regex_cache;
181
182
3.92M
std::shared_ptr<RE2> get_or_build_re2(const std::string& glob_pattern) {
183
3.92M
    {
184
3.92M
        std::lock_guard<std::mutex> lock(g_glob_regex_cache_mutex);
185
3.92M
        auto it = g_glob_regex_cache.find(glob_pattern);
186
3.92M
        if (it != g_glob_regex_cache.end()) {
187
3.92M
            g_glob_regex_cache_lru.splice(g_glob_regex_cache_lru.begin(), g_glob_regex_cache_lru,
188
3.92M
                                          it->second.lru_it);
189
3.92M
            return it->second.re2;
190
3.92M
        }
191
3.92M
    }
192
18.4E
    std::string regex_pattern;
193
18.4E
    Status st = glob_to_regex(glob_pattern, &regex_pattern);
194
18.4E
    if (!st.ok()) {
195
2
        return nullptr;
196
2
    }
197
18.4E
    auto compiled = std::make_shared<RE2>(regex_pattern);
198
18.4E
    if (!compiled->ok()) {
199
3
        return nullptr;
200
3
    }
201
18.4E
    {
202
18.4E
        std::lock_guard<std::mutex> lock(g_glob_regex_cache_mutex);
203
18.4E
        auto it = g_glob_regex_cache.find(glob_pattern);
204
18.4E
        if (it != g_glob_regex_cache.end()) {
205
7
            g_glob_regex_cache_lru.splice(g_glob_regex_cache_lru.begin(), g_glob_regex_cache_lru,
206
7
                                          it->second.lru_it);
207
7
            return it->second.re2;
208
7
        }
209
18.4E
        g_glob_regex_cache_lru.push_front(glob_pattern);
210
18.4E
        g_glob_regex_cache.emplace(glob_pattern,
211
18.4E
                                   GlobRegexCacheEntry {compiled, g_glob_regex_cache_lru.begin()});
212
18.4E
        if (g_glob_regex_cache.size() > kGlobRegexCacheCapacity) {
213
0
            const std::string& evict_key = g_glob_regex_cache_lru.back();
214
0
            g_glob_regex_cache.erase(evict_key);
215
0
            g_glob_regex_cache_lru.pop_back();
216
0
        }
217
18.4E
    }
218
0
    return compiled;
219
18.4E
}
220
221
// Convert a restricted glob pattern into a regex.
222
// Supported: '*', '?', '[...]', '\\' escape. Others are treated as literals.
223
303
Status glob_to_regex(const std::string& glob_pattern, std::string* regex_pattern) {
224
303
    regex_pattern->clear();
225
303
    regex_pattern->append("^");
226
303
    bool is_escaped = false;
227
303
    size_t pattern_length = glob_pattern.size();
228
3.31k
    for (size_t index = 0; index < pattern_length; ++index) {
229
3.01k
        char current_char = glob_pattern[index];
230
3.01k
        if (is_escaped) {
231
10
            append_escaped_regex_char(regex_pattern, current_char);
232
10
            is_escaped = false;
233
10
            continue;
234
10
        }
235
3.00k
        if (current_char == '\\') {
236
14
            is_escaped = true;
237
14
            continue;
238
14
        }
239
2.99k
        if (current_char == '*') {
240
69
            regex_pattern->append(".*");
241
69
            continue;
242
69
        }
243
2.92k
        if (current_char == '?') {
244
15
            regex_pattern->append(".");
245
15
            continue;
246
15
        }
247
2.91k
        if (current_char == '[') {
248
33
            size_t class_index = index + 1;
249
33
            bool class_closed = false;
250
33
            bool is_class_escaped = false;
251
33
            std::string class_buffer;
252
33
            if (class_index < pattern_length &&
253
33
                (glob_pattern[class_index] == '!' || glob_pattern[class_index] == '^')) {
254
9
                class_buffer.push_back('^');
255
9
                ++class_index;
256
9
            }
257
99
            for (; class_index < pattern_length; ++class_index) {
258
95
                char class_char = glob_pattern[class_index];
259
95
                if (is_class_escaped) {
260
10
                    class_buffer.push_back(class_char);
261
10
                    is_class_escaped = false;
262
10
                    continue;
263
10
                }
264
85
                if (class_char == '\\') {
265
10
                    is_class_escaped = true;
266
10
                    continue;
267
10
                }
268
75
                if (class_char == ']') {
269
29
                    class_closed = true;
270
29
                    break;
271
29
                }
272
46
                class_buffer.push_back(class_char);
273
46
            }
274
33
            if (!class_closed) {
275
4
                return Status::InvalidArgument("Unclosed character class in glob pattern: {}",
276
4
                                               glob_pattern);
277
4
            }
278
29
            regex_pattern->append("[");
279
29
            regex_pattern->append(class_buffer);
280
29
            regex_pattern->append("]");
281
29
            index = class_index;
282
29
            continue;
283
33
        }
284
2.87k
        append_escaped_regex_char(regex_pattern, current_char);
285
2.87k
    }
286
299
    if (is_escaped) {
287
4
        append_escaped_regex_char(regex_pattern, '\\');
288
4
    }
289
299
    regex_pattern->append("$");
290
299
    return Status::OK();
291
303
}
292
293
3.92M
bool glob_match_re2(const std::string& glob_pattern, const std::string& candidate_path) {
294
3.92M
    auto compiled = get_or_build_re2(glob_pattern);
295
3.92M
    if (compiled == nullptr) {
296
5
        return false;
297
5
    }
298
3.92M
    return RE2::FullMatch(candidate_path, *compiled);
299
3.92M
}
300
301
// NestedGroup's physical children and offsets are produced by NestedGroupWriteProvider, not by
302
// appending TabletSchema extracted columns here. This predicate keeps only ordinary Variant paths
303
// that are outside the NG tree, for example `v.owner` beside `v.items[*]`.
304
0
bool is_regular_path_outside_nested_group(const PathInData& path) {
305
0
    const std::string& relative_path = path.get_path();
306
0
    return !relative_path.empty() && !path.get_is_typed() && !path.has_nested_part() &&
307
0
           !segment_v2::contains_nested_group_marker(relative_path) &&
308
0
           !segment_v2::is_root_nested_group_path(relative_path) &&
309
0
           relative_path != SPARSE_COLUMN_PATH &&
310
0
           relative_path.find(DOC_VALUE_COLUMN_PATH) == std::string::npos;
311
0
}
312
313
bool should_materialize_nested_group_regular_subcolumns(
314
        const TabletColumnPtr& column,
315
622
        const std::unordered_map<int32_t, VariantExtendedInfo>& uid_to_variant_extended_info) {
316
622
    const auto info_it = uid_to_variant_extended_info.find(column->unique_id());
317
622
    return column->variant_enable_nested_group() ||
318
625
           (info_it != uid_to_variant_extended_info.end() && info_it->second.has_nested_group);
319
622
}
320
321
std::unordered_set<int32_t> collect_nested_group_compaction_root_uids(
322
        const TabletSchemaSPtr& target,
323
7.95k
        const std::unordered_map<int32_t, VariantExtendedInfo>& uid_to_variant_extended_info) {
324
7.95k
    std::unordered_set<int32_t> root_uids;
325
92.2k
    for (const TabletColumnPtr& column : target->columns()) {
326
92.2k
        if (column->is_variant_type() && should_materialize_nested_group_regular_subcolumns(
327
626
                                                 column, uid_to_variant_extended_info)) {
328
1
            root_uids.insert(column->unique_id());
329
1
        }
330
92.2k
    }
331
7.95k
    return root_uids;
332
7.95k
}
333
334
// Copies from extended_info.path_to_data_types every entry whose path satisfies
// is_regular_path_outside_nested_group().
PathToDataTypes collect_regular_types_outside_nested_group(
        const VariantExtendedInfo& extended_info) {
    PathToDataTypes kept;
    for (const auto& entry : extended_info.path_to_data_types) {
        if (is_regular_path_outside_nested_group(entry.first)) {
            kept.emplace(entry.first, entry.second);
        }
    }
    return kept;
}
345
346
639
size_t get_number_of_dimensions(const IDataType& type) {
347
639
    if (const auto* type_array = typeid_cast<const DataTypeArray*>(&type)) {
348
4
        return type_array->get_number_of_dimensions();
349
4
    }
350
635
    return 0;
351
639
}
352
3
size_t get_number_of_dimensions(const IColumn& column) {
353
3
    if (const auto* column_array = check_and_get_column<ColumnArray>(column)) {
354
2
        return column_array->get_number_of_dimensions();
355
2
    }
356
1
    return 0;
357
3
}
358
359
82.2k
// Peels every array level off `type` (looking through a Nullable wrapper at each level) and
// returns the nested type of the innermost array. When `type` is not an array, possibly
// after removing one outer Nullable, `type` itself is returned.
DataTypePtr get_base_type_of_array(const DataTypePtr& type) {
    /// Work on raw pointers to avoid extra copies of the shared type pointers.
    const auto strip_nullable = [](const IDataType* candidate) -> const IDataType* {
        const auto* nullable = typeid_cast<const DataTypeNullable*>(candidate);
        return nullable != nullptr ? nullable->get_nested_type().get() : candidate;
    };

    const DataTypeArray* innermost_array = nullptr;
    const IDataType* cursor = strip_nullable(type.get());
    for (const auto* array_type = typeid_cast<const DataTypeArray*>(cursor);
         array_type != nullptr; array_type = typeid_cast<const DataTypeArray*>(cursor)) {
        innermost_array = array_type;
        cursor = strip_nullable(array_type->get_nested_type().get());
    }

    if (innermost_array == nullptr) {
        return type;
    }
    return innermost_array->get_nested_type();
}
375
376
69.9k
// Casts `arg` to `type` and stores the resulting full (non-const) column in `*result`.
//
// - Variant target, Variant source: only the Nullable wrapper is added or removed.
// - Variant target, other source: the source column (which must be nullable, enforced by
//   CHECK) becomes the root of a new ColumnVariant, reusing the source null map.
// - Source of INVALID_TYPE ("nothing"): the result is a column of default values.
// - Otherwise the "CAST" function is executed; when execution fails a warning is logged and
//   the result is a column of default values.
// Returns InternalError only when no CAST function can be found; all other paths return OK.
Status cast_column(const ColumnWithTypeAndName& arg, const DataTypePtr& type, ColumnPtr* result) {
    ColumnsWithTypeAndName cast_args {arg, {nullptr, type, type->get_name()}};

    // To avoid losing null info we must not go through the function framework here, since it
    // would wrap nullable around the Variant instead of around the root of the Variant.
    // correct output: Nullable(Array(int)) -> Nullable(Variant(Nullable(Array(int))))
    // incorrect output: Nullable(Array(int)) -> Nullable(Variant(Array(int)))
    if (type->get_primitive_type() == TYPE_VARIANT) {
        // The source is already a variant, so only the nullable info differs from the target.
        if (arg.type->get_primitive_type() == TYPE_VARIANT) {
            if (type->is_nullable()) {
                *result = make_nullable(arg.column);
            } else {
                *result = remove_nullable(arg.column);
            }
            return Status::OK();
        }
        // Install the source column/type as the variant root.
        CHECK(arg.column->is_nullable());
        auto bare_target_type = remove_nullable(type);
        const auto& variant_type = assert_cast<const DataTypeVariant&>(*bare_target_type);
        auto variant_column = ColumnVariant::create(variant_type.variant_max_subcolumns_count(),
                                                    variant_type.enable_doc_mode());

        variant_column->create_root(arg.type, arg.column->assume_mutable());
        ColumnPtr wrapped = ColumnNullable::create(
                variant_column->get_ptr(),
                assert_cast<const ColumnNullable*>(arg.column.get())->get_null_map_column_ptr());
        *result = type->is_nullable() ? wrapped : variant_column->get_ptr();
        return Status::OK();
    }

    auto cast_function = SimpleFunctionFactory::instance().get_function("CAST", cast_args, type);
    if (!cast_function) {
        return Status::InternalError("Not found cast function {} to {}", arg.type->get_name(),
                                     type->get_name());
    }
    Block scratch_block {cast_args};
    uint32_t result_position = cast_set<uint32_t>(scratch_block.columns());
    RuntimeState state;
    auto ctx = FunctionContext::create_context(&state, {}, {});

    // Builds a full column of default values with as many rows as the source column.
    const auto make_default_column = [&arg, &type]() {
        return type->create_column_const_with_default_value(arg.column->size())
                ->convert_to_full_column_if_const();
    };

    if (arg.type->get_primitive_type() == INVALID_TYPE) {
        // cast from nothing to any type should result in nulls
        *result = make_default_column();
        return Status::OK();
    }

    // Converting a string column to jsonb just adds a string jsonb field to the destination
    // column instead of parsing every row of the original string column.
    ctx->set_string_as_jsonb_string(true);
    ctx->set_jsonb_string_as_string(true);
    scratch_block.insert({nullptr, type, arg.name});
    // TODO(lihangyu): we should handle this error in strict mode
    if (!cast_function->execute(ctx.get(), scratch_block, {0}, result_position,
                                arg.column->size())) {
        LOG_EVERY_N(WARNING, 100) << fmt::format("cast from {} to {}", arg.type->get_name(),
                                                 type->get_name());
        *result = make_default_column();
        return Status::OK();
    }
    *result = scratch_block.get_by_position(result_position)
                      .column->convert_to_full_column_if_const();
    VLOG_DEBUG << fmt::format("{} before convert {}, after convert {}", arg.name,
                              arg.column->get_name(), (*result)->get_name());
    return Status::OK();
}
439
440
void get_column_by_type(const DataTypePtr& data_type, const std::string& name, TabletColumn& column,
441
163k
                        const ExtraInfo& ext_info) {
442
163k
    column.set_name(name);
443
163k
    column.set_type(data_type->get_storage_field_type());
444
163k
    if (ext_info.unique_id >= 0) {
445
4
        column.set_unique_id(ext_info.unique_id);
446
4
    }
447
163k
    if (ext_info.parent_unique_id >= 0) {
448
80.5k
        column.set_parent_unique_id(ext_info.parent_unique_id);
449
80.5k
    }
450
163k
    if (!ext_info.path_info.empty()) {
451
80.5k
        column.set_path_info(ext_info.path_info);
452
80.5k
    }
453
163k
    if (data_type->is_nullable()) {
454
81.6k
        const auto& real_type = static_cast<const DataTypeNullable&>(*data_type);
455
81.6k
        column.set_is_nullable(true);
456
81.6k
        get_column_by_type(real_type.get_nested_type(), name, column, {});
457
81.6k
        return;
458
81.6k
    }
459
81.6k
    if (data_type->get_primitive_type() == PrimitiveType::TYPE_ARRAY) {
460
1.04k
        TabletColumn child;
461
1.04k
        get_column_by_type(assert_cast<const DataTypeArray*>(data_type.get())->get_nested_type(),
462
1.04k
                           "", child, {});
463
1.04k
        column.set_length(TabletColumn::get_field_length_by_type(TPrimitiveType::ARRAY, 0));
464
1.04k
        column.add_sub_column(child);
465
1.04k
        return;
466
1.04k
    }
467
80.5k
    if (data_type->get_primitive_type() == PrimitiveType::TYPE_VARIANT) {
468
0
        const auto* dt_variant = assert_cast<const DataTypeVariant*>(data_type.get());
469
0
        column.set_variant_max_subcolumns_count(dt_variant->variant_max_subcolumns_count());
470
0
        column.set_variant_enable_doc_mode(dt_variant->enable_doc_mode());
471
0
        return;
472
0
    }
473
    // size is not fixed when type is string or json
474
80.5k
    if (is_string_type(data_type->get_primitive_type()) ||
475
80.5k
        data_type->get_primitive_type() == TYPE_JSONB) {
476
25.5k
        column.set_length(INT_MAX);
477
25.5k
        return;
478
25.5k
    }
479
480
55.0k
    PrimitiveType type = data_type->get_primitive_type();
481
55.0k
    if (is_int_or_bool(type) || is_string_type(type) || is_float_or_double(type) || is_ip(type) ||
482
55.0k
        is_date_or_datetime(type) || type == PrimitiveType::TYPE_DATEV2) {
483
55.0k
        column.set_length(cast_set<int32_t>(data_type->get_size_of_value_in_memory()));
484
55.0k
        return;
485
55.0k
    }
486
6
    if (is_decimal(type)) {
487
1
        column.set_precision(data_type->get_precision());
488
1
        column.set_frac(data_type->get_scale());
489
1
        return;
490
1
    }
491
    // datetimev2 needs scale
492
5
    if (type == PrimitiveType::TYPE_DATETIMEV2 || type == PrimitiveType::TYPE_TIMESTAMPTZ) {
493
1
        column.set_precision(-1);
494
1
        column.set_frac(data_type->get_scale());
495
1
        return;
496
1
    }
497
498
4
    throw doris::Exception(doris::ErrorCode::INTERNAL_ERROR,
499
4
                           "unexcepted data column type: {}, column name is: {}",
500
4
                           data_type->get_name(), name);
501
5
}
502
503
// Convenience overload: builds a fresh TabletColumn via the out-parameter overload and
// returns it by value.
TabletColumn get_column_by_type(const DataTypePtr& data_type, const std::string& name,
                                const ExtraInfo& ext_info) {
    TabletColumn built_column;
    get_column_by_type(data_type, name, built_column, ext_info);
    return built_column;
}
509
510
// check if two paths which same prefix have different structure
511
static bool has_different_structure_in_same_path(const PathInData::Parts& lhs,
512
9.04k
                                                 const PathInData::Parts& rhs) {
513
9.04k
    if (lhs.size() != rhs.size()) {
514
1
        return false; // different size means different structure
515
1
    }
516
    // Since we group by path string, lhs and rhs must have the same size and keys
517
    // We only need to check if they have different nested structure
518
36.1k
    for (size_t i = 0; i < lhs.size(); ++i) {
519
27.0k
        if (lhs[i] != rhs[i]) {
520
5
            VLOG_DEBUG << fmt::format(
521
0
                    "Check different structure: {} vs {}, lhs[i].is_nested: {}, rhs[i].is_nested: "
522
0
                    "{}",
523
0
                    lhs[i].key, rhs[i].key, lhs[i].is_nested, rhs[i].is_nested);
524
5
            return true;
525
5
        }
526
27.0k
    }
527
9.03k
    return false;
528
9.04k
}
529
530
4.54k
// Verifies that paths sharing the same string form also share the same part structure.
// Returns DataQualityError for the first conflicting pair found, Status::OK() otherwise.
Status check_variant_has_no_ambiguous_paths(const PathsInData& tuple_paths) {
    // Bucket path indexes by string form so only paths with equal text are compared.
    std::unordered_map<std::string, std::vector<size_t>> indexes_by_path;
    for (size_t i = 0; i < tuple_paths.size(); ++i) {
        indexes_by_path[tuple_paths[i].get_path()].push_back(i);
        VLOG_DEBUG << "tuple_paths[i]: " << tuple_paths[i].get_path();
    }

    for (const auto& bucket : indexes_by_path) {
        const std::vector<size_t>& members = bucket.second;
        if (members.size() <= 1) {
            continue; // a single path cannot conflict with anything
        }
        // Compare every pair inside the bucket.
        for (size_t later = 0; later < members.size(); ++later) {
            const auto& later_path = tuple_paths[members[later]];
            for (size_t earlier = 0; earlier < later; ++earlier) {
                const auto& earlier_path = tuple_paths[members[earlier]];
                if (!has_different_structure_in_same_path(later_path.get_parts(),
                                                          earlier_path.get_parts())) {
                    continue;
                }
                return Status::DataQualityError(
                        "Ambiguous paths: {} vs {} with different nested part {} vs {}",
                        later_path.get_path(), earlier_path.get_path(),
                        later_path.has_nested_part(), earlier_path.has_nested_part());
            }
        }
    }
    return Status::OK();
}
563
564
// Appends to `common_schema` one column per path in `subcolumns_types`, typed with the least
// common type of that path's collected types.
// - The dummy path (ColumnVariant::COLUMN_NAME_DUMMY) is skipped.
// - When the collected types of a path disagree on the number of array dimensions, the path
//   is typed as Nullable(JSONB).
// - Otherwise the least supertype is used, wrapped in Nullable when it is not already.
// - A path whose root-less form is found in `typed_columns` (and that has no nested part)
//   reuses that predefined column definition instead of one derived from the type.
// Every appended path is also inserted into `path_set` when it is not null.
// `common_schema` must be uniquely owned (CHECK on use_count).
Status update_least_schema_internal(const std::map<PathInData, DataTypes>& subcolumns_types,
                                    TabletSchemaSPtr& common_schema, int32_t variant_col_unique_id,
                                    const std::map<std::string, TabletColumnPtr>& typed_columns,
                                    std::set<PathInData>* path_set) {
    PathsInData tuple_paths;
    DataTypes tuple_types;
    CHECK(common_schema.use_count() == 1);

    // Get the least common type for all paths.
    for (const auto& [key, subtypes] : subcolumns_types) {
        assert(!subtypes.empty());
        if (key.get_path() == ColumnVariant::COLUMN_NAME_DUMMY) {
            continue;
        }
        const size_t first_dim = get_number_of_dimensions(*subtypes[0]);
        tuple_paths.emplace_back(key);

        bool dimensions_conflict = false;
        for (size_t i = 1; i < subtypes.size(); ++i) {
            if (first_dim == get_number_of_dimensions(*subtypes[i])) {
                continue;
            }
            tuple_types.emplace_back(make_nullable(std::make_shared<DataTypeJsonb>()));
            LOG(INFO) << fmt::format(
                    "Uncompatible types of subcolumn '{}': {} and {}, cast to JSONB",
                    key.get_path(), subtypes[0]->get_name(), subtypes[i]->get_name());
            dimensions_conflict = true;
            break;
        }
        if (dimensions_conflict) {
            // The JSONB fallback type was already recorded for this path.
            continue;
        }

        DataTypePtr common_type;
        get_least_supertype_jsonb(subtypes, &common_type);
        if (!common_type->is_nullable()) {
            common_type = make_nullable(common_type);
        }
        tuple_types.emplace_back(common_type);
    }
    CHECK_EQ(tuple_paths.size(), tuple_types.size());

    // Append all common type columns of this variant
    for (size_t i = 0; i < tuple_paths.size(); ++i) {
        const auto& path = tuple_paths[i];
        TabletColumn common_column;
        // typed path not contains root part
        const std::string path_without_root = path.copy_pop_front().get_path();
        const auto typed_it = typed_columns.find(path_without_root);
        if (typed_it != typed_columns.end() && !path.has_nested_part()) {
            common_column = *typed_it->second;
            // parent unique id and path may not be init in write path
            common_column.set_parent_unique_id(variant_col_unique_id);
            common_column.set_path_info(path);
            common_column.set_name(path.get_path());
        } else {
            get_column_by_type(tuple_types[i], path.get_path(), common_column,
                               ExtraInfo {.unique_id = -1,
                                          .parent_unique_id = variant_col_unique_id,
                                          .path_info = path});
        }
        common_schema->append_column(common_column);
        if (path_set != nullptr) {
            path_set->insert(path);
        }
    }
    return Status::OK();
}
625
626
// Gathers, from every schema in `schemas`, the sub columns that belong to the variant column
// `variant_col_unique_id`, rejects ambiguous paths, and then appends the least-common-type
// columns to `common_schema` through update_least_schema_internal().
Status update_least_common_schema(const std::vector<TabletSchemaSPtr>& schemas,
                                  TabletSchemaSPtr& common_schema, int32_t variant_col_unique_id,
                                  std::set<PathInData>* path_set) {
    // Predefined (typed) sub columns of the variant, keyed by name.
    std::map<std::string, TabletColumnPtr> typed_columns;
    for (const TabletColumnPtr& typed :
         common_schema->column_by_uid(variant_col_unique_id).get_sub_columns()) {
        typed_columns.insert_or_assign(typed->name(), typed);
    }

    // Types of subcolumns by path from all tuples.
    std::map<PathInData, DataTypes> subcolumns_types;
    // Every collected path, kept for the batch ambiguity check below.
    std::vector<PathInData> all_paths;

    for (const TabletSchemaSPtr& schema : schemas) {
        for (const TabletColumnPtr& col : schema->columns()) {
            // Keep only subcolumns of this variant.
            const bool belongs_to_variant = col->has_path_info() &&
                                            col->parent_unique_id() >= 0 &&
                                            col->parent_unique_id() == variant_col_unique_id;
            if (!belongs_to_variant) {
                continue;
            }
            subcolumns_types[*col->path_info_ptr()].emplace_back(
                    DataTypeFactory::instance().create_data_type(*col, col->is_nullable()));
            all_paths.push_back(*col->path_info_ptr());
        }
    }

    // Batch check for conflicts
    RETURN_IF_ERROR(check_variant_has_no_ambiguous_paths(all_paths));

    return update_least_schema_internal(subcolumns_types, common_schema, variant_col_unique_id,
                                        typed_columns, path_set);
}
658
659
// Keep variant subcolumn BF support aligned with FE DDL checks.
660
90.6k
bool is_bf_supported_by_fe_for_variant_subcolumn(FieldType type) {
661
90.6k
    switch (type) {
662
80
    case FieldType::OLAP_FIELD_TYPE_SMALLINT:
663
414
    case FieldType::OLAP_FIELD_TYPE_INT:
664
50.7k
    case FieldType::OLAP_FIELD_TYPE_BIGINT:
665
50.9k
    case FieldType::OLAP_FIELD_TYPE_LARGEINT:
666
50.9k
    case FieldType::OLAP_FIELD_TYPE_CHAR:
667
50.9k
    case FieldType::OLAP_FIELD_TYPE_VARCHAR:
668
76.4k
    case FieldType::OLAP_FIELD_TYPE_STRING:
669
76.4k
    case FieldType::OLAP_FIELD_TYPE_DATE:
670
76.4k
    case FieldType::OLAP_FIELD_TYPE_DATETIME:
671
76.6k
    case FieldType::OLAP_FIELD_TYPE_DATEV2:
672
76.9k
    case FieldType::OLAP_FIELD_TYPE_DATETIMEV2:
673
76.9k
    case FieldType::OLAP_FIELD_TYPE_TIMESTAMPTZ:
674
76.9k
    case FieldType::OLAP_FIELD_TYPE_DECIMAL:
675
76.9k
    case FieldType::OLAP_FIELD_TYPE_DECIMAL32:
676
77.0k
    case FieldType::OLAP_FIELD_TYPE_DECIMAL64:
677
77.3k
    case FieldType::OLAP_FIELD_TYPE_DECIMAL128I:
678
77.4k
    case FieldType::OLAP_FIELD_TYPE_DECIMAL256:
679
77.6k
    case FieldType::OLAP_FIELD_TYPE_IPV4:
680
77.7k
    case FieldType::OLAP_FIELD_TYPE_IPV6:
681
77.7k
        return true;
682
12.8k
    default:
683
12.8k
        return false;
684
90.6k
    }
685
90.6k
}
686
687
void inherit_column_attributes(const TabletColumn& source, TabletColumn& target,
688
90.6k
                               TabletSchemaSPtr* target_schema) {
689
90.6k
    if (!target.is_extracted_column()) {
690
0
        return;
691
0
    }
692
90.6k
    target.set_aggregation_method(source.aggregation());
693
694
    // 1. bloom filter
695
90.6k
    if (is_bf_supported_by_fe_for_variant_subcolumn(target.type())) {
696
77.7k
        target.set_is_bf_column(source.is_bf_column());
697
77.7k
    }
698
699
90.6k
    if (!target_schema) {
700
83.2k
        return;
701
83.2k
    }
702
703
    // 2. inverted index
704
7.34k
    TabletIndexes indexes_to_add;
705
7.34k
    auto source_indexes = (*target_schema)->inverted_indexs(source.unique_id());
706
    // if target is variant type, we need to inherit all indexes
707
    // because this schema is a read schema from fe
708
7.34k
    if (target.is_variant_type()) {
709
6.75k
        for (auto& index : source_indexes) {
710
276
            auto index_info = std::make_shared<TabletIndex>(*index);
711
276
            index_info->set_escaped_escaped_index_suffix_path(target.path_info_ptr()->get_path());
712
276
            indexes_to_add.emplace_back(std::move(index_info));
713
276
        }
714
6.75k
    } else {
715
596
        inherit_index(source_indexes, indexes_to_add, target);
716
596
    }
717
7.34k
    auto target_indexes = (*target_schema)
718
7.34k
                                  ->inverted_indexs(target.parent_unique_id(),
719
7.34k
                                                    target.path_info_ptr()->get_path());
720
7.35k
    if (target_indexes.empty()) {
721
7.35k
        for (auto& index_info : indexes_to_add) {
722
283
            (*target_schema)->append_index(std::move(*index_info));
723
283
        }
724
7.35k
    }
725
726
    // 3. TODO: gnragm bf index
727
7.34k
}
728
729
7.95k
void inherit_column_attributes(TabletSchemaSPtr& schema) {
730
    // Add index meta if extracted column is missing index meta
731
100k
    for (size_t i = 0; i < schema->num_columns(); ++i) {
732
92.1k
        TabletColumn& col = schema->mutable_column(i);
733
92.1k
        if (!col.is_extracted_column()) {
734
84.7k
            continue;
735
84.7k
        }
736
7.35k
        if (schema->field_index(col.parent_unique_id()) == -1) {
737
            // parent column is missing, maybe dropped
738
0
            continue;
739
0
        }
740
7.35k
        inherit_column_attributes(schema->column_by_uid(col.parent_unique_id()), col, &schema);
741
7.35k
    }
742
7.95k
}
743
744
Status get_least_common_schema(const std::vector<TabletSchemaSPtr>& schemas,
745
                               const TabletSchemaSPtr& base_schema, TabletSchemaSPtr& output_schema,
746
1.46k
                               bool check_schema_size) {
747
1.46k
    std::vector<int32_t> variant_column_unique_id;
748
    // Construct a schema excluding the extracted columns and gather unique identifiers for variants.
749
    // Ensure that the output schema also excludes these extracted columns. This approach prevents
750
    // duplicated paths following the update_least_common_schema process.
751
1.46k
    auto build_schema_without_extracted_columns = [&](const TabletSchemaSPtr& base_schema) {
752
1.46k
        output_schema = std::make_shared<TabletSchema>();
753
        // not copy columns but only shadow copy other attributes
754
1.46k
        output_schema->shawdow_copy_without_columns(*base_schema);
755
        // Get all columns without extracted columns and collect variant col unique id
756
3.55k
        for (const TabletColumnPtr& col : base_schema->columns()) {
757
3.55k
            if (col->is_variant_type()) {
758
1.49k
                variant_column_unique_id.push_back(col->unique_id());
759
1.49k
            }
760
3.55k
            if (!col->is_extracted_column()) {
761
3.23k
                output_schema->append_column(*col);
762
3.23k
            }
763
3.55k
        }
764
1.46k
    };
765
1.46k
    if (base_schema == nullptr) {
766
        // Pick tablet schema with max schema version
767
120
        auto max_version_schema =
768
120
                *std::max_element(schemas.cbegin(), schemas.cend(),
769
1.35k
                                  [](const TabletSchemaSPtr a, const TabletSchemaSPtr b) {
770
1.35k
                                      return a->schema_version() < b->schema_version();
771
1.35k
                                  });
772
120
        CHECK(max_version_schema);
773
120
        build_schema_without_extracted_columns(max_version_schema);
774
1.34k
    } else {
775
        // use input base_schema schema as base schema
776
1.34k
        build_schema_without_extracted_columns(base_schema);
777
1.34k
    }
778
779
1.49k
    for (int32_t unique_id : variant_column_unique_id) {
780
1.49k
        std::set<PathInData> path_set;
781
1.49k
        RETURN_IF_ERROR(update_least_common_schema(schemas, output_schema, unique_id, &path_set));
782
1.49k
    }
783
784
1.46k
    inherit_column_attributes(output_schema);
785
1.46k
    if (check_schema_size &&
786
1.46k
        output_schema->columns().size() > config::variant_max_merged_tablet_schema_size) {
787
0
        return Status::DataQualityError("Reached max column size limit {}",
788
0
                                        config::variant_max_merged_tablet_schema_size);
789
0
    }
790
791
1.46k
    return Status::OK();
792
1.46k
}
793
794
// sort by paths in lexicographical order
795
8.92k
ColumnVariant::Subcolumns get_sorted_subcolumns(const ColumnVariant::Subcolumns& subcolumns) {
796
    // sort by paths in lexicographical order
797
8.92k
    ColumnVariant::Subcolumns sorted = subcolumns;
798
2.62M
    std::sort(sorted.begin(), sorted.end(), [](const auto& lhsItem, const auto& rhsItem) {
799
2.62M
        return lhsItem->path < rhsItem->path;
800
2.62M
    });
801
8.92k
    return sorted;
802
8.92k
}
803
804
bool has_schema_index_diff(const TabletSchema* new_schema, const TabletSchema* old_schema,
805
8.08k
                           int32_t new_col_idx, int32_t old_col_idx) {
806
8.08k
    const auto& column_new = new_schema->column(new_col_idx);
807
8.08k
    const auto& column_old = old_schema->column(old_col_idx);
808
809
8.08k
    if (column_new.is_bf_column() != column_old.is_bf_column()) {
810
94
        return true;
811
94
    }
812
813
7.99k
    auto new_schema_inverted_indexs = new_schema->inverted_indexs(column_new);
814
7.99k
    auto old_schema_inverted_indexs = old_schema->inverted_indexs(column_old);
815
816
7.99k
    if (new_schema_inverted_indexs.size() != old_schema_inverted_indexs.size()) {
817
706
        return true;
818
706
    }
819
820
7.65k
    for (size_t i = 0; i < new_schema_inverted_indexs.size(); ++i) {
821
391
        if (!new_schema_inverted_indexs[i]->is_same_except_id(old_schema_inverted_indexs[i])) {
822
20
            return true;
823
20
        }
824
391
    }
825
826
7.26k
    return false;
827
7.28k
}
828
829
1.15k
TabletColumn create_sparse_column(const TabletColumn& variant) {
830
1.15k
    TabletColumn res;
831
1.15k
    res.set_name(variant.name_lower_case() + "." + SPARSE_COLUMN_PATH);
832
1.15k
    res.set_type(FieldType::OLAP_FIELD_TYPE_MAP);
833
1.15k
    res.set_aggregation_method(variant.aggregation());
834
1.15k
    res.set_path_info(PathInData {variant.name_lower_case() + "." + SPARSE_COLUMN_PATH});
835
1.15k
    res.set_parent_unique_id(variant.unique_id());
836
    // set default value to "NULL" DefaultColumnIterator will call insert_many_defaults
837
1.15k
    res.set_default_value("NULL");
838
1.15k
    TabletColumn child_tcolumn;
839
1.15k
    child_tcolumn.set_type(FieldType::OLAP_FIELD_TYPE_STRING);
840
1.15k
    res.add_sub_column(child_tcolumn);
841
1.15k
    res.add_sub_column(child_tcolumn);
842
1.15k
    return res;
843
1.15k
}
844
845
20.1k
TabletColumn create_sparse_shard_column(const TabletColumn& variant, int bucket_index) {
846
20.1k
    TabletColumn res;
847
20.1k
    std::string name = variant.name_lower_case() + "." + SPARSE_COLUMN_PATH + ".b" +
848
20.1k
                       std::to_string(bucket_index);
849
20.1k
    res.set_name(name);
850
20.1k
    res.set_type(FieldType::OLAP_FIELD_TYPE_MAP);
851
20.1k
    res.set_aggregation_method(variant.aggregation());
852
20.1k
    res.set_parent_unique_id(variant.unique_id());
853
20.1k
    res.set_default_value("NULL");
854
20.1k
    PathInData path(name);
855
20.1k
    res.set_path_info(path);
856
20.1k
    TabletColumn child_tcolumn;
857
20.1k
    child_tcolumn.set_type(FieldType::OLAP_FIELD_TYPE_STRING);
858
20.1k
    res.add_sub_column(child_tcolumn);
859
20.1k
    res.add_sub_column(child_tcolumn);
860
20.1k
    return res;
861
20.1k
}
862
863
10.4k
TabletColumn create_doc_value_column(const TabletColumn& variant, int bucket_index) {
864
10.4k
    TabletColumn res;
865
10.4k
    std::string name = variant.name_lower_case() + "." + DOC_VALUE_COLUMN_PATH + ".b" +
866
10.4k
                       std::to_string(bucket_index);
867
10.4k
    res.set_name(name);
868
10.4k
    res.set_type(FieldType::OLAP_FIELD_TYPE_MAP);
869
10.4k
    res.set_aggregation_method(variant.aggregation());
870
10.4k
    res.set_parent_unique_id(variant.unique_id());
871
10.4k
    res.set_default_value("NULL");
872
10.4k
    res.set_path_info(PathInData {name});
873
874
10.4k
    TabletColumn child_tcolumn;
875
10.4k
    child_tcolumn.set_type(FieldType::OLAP_FIELD_TYPE_STRING);
876
10.4k
    res.add_sub_column(child_tcolumn);
877
10.4k
    res.add_sub_column(child_tcolumn);
878
10.4k
    return res;
879
10.4k
}
880
881
131k
uint32_t variant_binary_shard_of(const StringRef& path, uint32_t bucket_num) {
882
131k
    if (bucket_num <= 1) return 0;
883
104k
    SipHash hash;
884
104k
    hash.update(path.data, path.size);
885
104k
    uint64_t h = hash.get64();
886
104k
    return static_cast<uint32_t>(h % bucket_num);
887
131k
}
888
889
Status VariantCompactionUtil::aggregate_path_to_stats(
890
        const RowsetSharedPtr& rs,
891
2.81k
        std::unordered_map<int32_t, PathToNoneNullValues>* uid_to_path_stats) {
892
2.81k
    SegmentCacheHandle segment_cache;
893
2.81k
    RETURN_IF_ERROR(SegmentLoader::instance()->load_segments(
894
2.81k
            std::static_pointer_cast<BetaRowset>(rs), &segment_cache));
895
896
8.21k
    for (const auto& column : rs->tablet_schema()->columns()) {
897
8.21k
        if (!column->is_variant_type() || column->unique_id() < 0) {
898
4.36k
            continue;
899
4.36k
        }
900
3.85k
        if (!should_check_variant_path_stats(*column)) {
901
0
            continue;
902
0
        }
903
3.85k
        for (const auto& segment : segment_cache.get_segments()) {
904
1.93k
            std::shared_ptr<ColumnReader> column_reader;
905
1.93k
            OlapReaderStatistics stats;
906
1.93k
            RETURN_IF_ERROR(
907
1.93k
                    segment->get_column_reader(column->unique_id(), &column_reader, &stats));
908
1.93k
            if (!column_reader) {
909
0
                continue;
910
0
            }
911
912
1.93k
            CHECK(column_reader->get_meta_type() == FieldType::OLAP_FIELD_TYPE_VARIANT);
913
1.93k
            auto* variant_column_reader =
914
1.93k
                    assert_cast<segment_v2::VariantColumnReader*>(column_reader.get());
915
            // load external meta before getting stats
916
1.93k
            RETURN_IF_ERROR(variant_column_reader->load_external_meta_once());
917
1.93k
            const auto* source_stats = variant_column_reader->get_stats();
918
1.93k
            CHECK(source_stats);
919
920
            // agg path -> stats
921
3.46k
            for (const auto& [path, size] : source_stats->sparse_column_non_null_size) {
922
3.46k
                (*uid_to_path_stats)[column->unique_id()][path] += size;
923
3.46k
            }
924
925
5.92k
            for (const auto& [path, size] : source_stats->subcolumns_non_null_size) {
926
5.92k
                (*uid_to_path_stats)[column->unique_id()][path] += size;
927
5.92k
            }
928
1.93k
        }
929
3.85k
    }
930
2.81k
    return Status::OK();
931
2.81k
}
932
933
Status VariantCompactionUtil::aggregate_variant_extended_info(
934
        const RowsetSharedPtr& rs,
935
4.68k
        std::unordered_map<int32_t, VariantExtendedInfo>* uid_to_variant_extended_info) {
936
4.68k
    SegmentCacheHandle segment_cache;
937
4.68k
    RETURN_IF_ERROR(SegmentLoader::instance()->load_segments(
938
4.68k
            std::static_pointer_cast<BetaRowset>(rs), &segment_cache));
939
940
18.1k
    for (const auto& column : rs->tablet_schema()->columns()) {
941
18.1k
        if (!column->is_variant_type()) {
942
12.3k
            continue;
943
12.3k
        }
944
5.79k
        auto& extended_info = (*uid_to_variant_extended_info)[column->unique_id()];
945
5.79k
        if (column->variant_enable_nested_group()) {
946
0
            extended_info.has_nested_group = true;
947
0
        }
948
5.79k
        for (const auto& segment : segment_cache.get_segments()) {
949
3.43k
            std::shared_ptr<ColumnReader> column_reader;
950
3.43k
            OlapReaderStatistics stats;
951
3.43k
            RETURN_IF_ERROR(
952
3.43k
                    segment->get_column_reader(column->unique_id(), &column_reader, &stats));
953
3.43k
            if (!column_reader) {
954
0
                continue;
955
0
            }
956
957
3.43k
            CHECK(column_reader->get_meta_type() == FieldType::OLAP_FIELD_TYPE_VARIANT);
958
3.43k
            auto* variant_column_reader =
959
3.43k
                    assert_cast<segment_v2::VariantColumnReader*>(column_reader.get());
960
            // load external meta before getting stats
961
3.43k
            RETURN_IF_ERROR(variant_column_reader->load_external_meta_once());
962
3.43k
            const auto* source_stats = variant_column_reader->get_stats();
963
3.43k
            CHECK(source_stats);
964
965
3.43k
            if (!column->variant_enable_nested_group()) {
966
                // NG roots still need type metadata for regular subpaths such as `v.owner`,
967
                // but their compaction schema should not be driven by flat path stats.
968
3.43k
                for (const auto& [path, size] : source_stats->sparse_column_non_null_size) {
969
2.02k
                    extended_info.path_to_none_null_values[path] += size;
970
2.02k
                    extended_info.sparse_paths.emplace(path);
971
2.02k
                }
972
973
5.08k
                for (const auto& [path, size] : source_stats->subcolumns_non_null_size) {
974
5.08k
                    extended_info.path_to_none_null_values[path] += size;
975
5.08k
                }
976
3.43k
            }
977
978
            //2. agg path -> schema
979
3.43k
            variant_column_reader->get_subcolumns_types(&extended_info.path_to_data_types);
980
981
            // 3. extract typed paths
982
3.43k
            variant_column_reader->get_typed_paths(&extended_info.typed_paths);
983
984
            // 4. extract nested paths
985
3.43k
            if (!column->variant_enable_nested_group()) {
986
3.43k
                variant_column_reader->get_nested_paths(&extended_info.nested_paths);
987
3.43k
            }
988
3.43k
        }
989
5.79k
    }
990
4.68k
    return Status::OK();
991
4.68k
}
992
993
// get the subpaths and sparse paths for the variant column
994
void VariantCompactionUtil::get_subpaths(int32_t max_subcolumns_count,
995
                                         const PathToNoneNullValues& stats,
996
263
                                         TabletSchema::PathsSetInfo& paths_set_info) {
997
    // max_subcolumns_count is 0 means no limit
998
263
    if (max_subcolumns_count > 0 && stats.size() > max_subcolumns_count) {
999
61
        std::vector<std::pair<size_t, std::string_view>> paths_with_sizes;
1000
61
        paths_with_sizes.reserve(stats.size());
1001
873
        for (const auto& [path, size] : stats) {
1002
873
            paths_with_sizes.emplace_back(size, path);
1003
873
        }
1004
61
        std::sort(paths_with_sizes.begin(), paths_with_sizes.end(), std::greater());
1005
1006
        // Select top N paths as subcolumns, remaining paths as sparse columns
1007
871
        for (const auto& [size, path] : paths_with_sizes) {
1008
871
            if (paths_set_info.sub_path_set.size() < max_subcolumns_count) {
1009
140
                paths_set_info.sub_path_set.emplace(path);
1010
731
            } else {
1011
731
                paths_set_info.sparse_path_set.emplace(path);
1012
731
            }
1013
871
        }
1014
61
        LOG(INFO) << "subpaths " << paths_set_info.sub_path_set.size() << " sparse paths "
1015
61
                  << paths_set_info.sparse_path_set.size() << " variant max subcolumns count "
1016
61
                  << max_subcolumns_count << " stats size " << paths_with_sizes.size();
1017
202
    } else {
1018
        // Apply all paths as subcolumns
1019
438
        for (const auto& [path, _] : stats) {
1020
438
            paths_set_info.sub_path_set.emplace(path);
1021
438
        }
1022
202
    }
1023
263
}
1024
1025
Status VariantCompactionUtil::check_path_stats(const std::vector<RowsetSharedPtr>& intputs,
1026
8.44k
                                               RowsetSharedPtr output, BaseTabletSPtr tablet) {
1027
8.44k
    if (output->tablet_schema()->num_variant_columns() == 0) {
1028
7.93k
        return Status::OK();
1029
7.93k
    }
1030
4.69k
    for (const auto& rowset : intputs) {
1031
18.1k
        for (const auto& column : rowset->tablet_schema()->columns()) {
1032
18.1k
            if (column->is_variant_type() && !should_check_variant_path_stats(*column)) {
1033
0
                return Status::OK();
1034
0
            }
1035
18.1k
        }
1036
4.69k
    }
1037
    // check no extended schema in input rowsets
1038
4.69k
    for (const auto& rowset : intputs) {
1039
18.1k
        for (const auto& column : rowset->tablet_schema()->columns()) {
1040
18.1k
            if (column->is_extracted_column()) {
1041
0
                return Status::OK();
1042
0
            }
1043
18.1k
        }
1044
4.69k
    }
1045
514
#ifndef BE_TEST
1046
    // check no extended schema in output rowset
1047
1.92k
    for (const auto& column : output->tablet_schema()->columns()) {
1048
1.92k
        if (column->is_extracted_column()) {
1049
0
            const auto& name = column->name();
1050
0
            if (name.find("." + DOC_VALUE_COLUMN_PATH + ".") != std::string::npos ||
1051
0
                name.find("." + SPARSE_COLUMN_PATH + ".") != std::string::npos ||
1052
0
                name.ends_with("." + SPARSE_COLUMN_PATH)) {
1053
0
                continue;
1054
0
            }
1055
0
            return Status::InternalError("Unexpected extracted column {} in output rowset",
1056
0
                                         column->name());
1057
0
        }
1058
1.92k
    }
1059
514
#endif
1060
    // only check path stats for dup_keys since the rows may be merged in other models
1061
514
    if (tablet->keys_type() != KeysType::DUP_KEYS) {
1062
195
        return Status::OK();
1063
195
    }
1064
    // if there is a delete predicate in the input rowsets, we skip the path stats check
1065
2.51k
    for (auto& rowset : intputs) {
1066
2.51k
        if (rowset->rowset_meta()->has_delete_predicate()) {
1067
4
            return Status::OK();
1068
4
        }
1069
2.51k
    }
1070
936
    for (const auto& column : output->tablet_schema()->columns()) {
1071
936
        if (column->is_variant_type() && !should_check_variant_path_stats(*column)) {
1072
0
            return Status::OK();
1073
0
        }
1074
936
    }
1075
315
    std::unordered_map<int32_t, PathToNoneNullValues> original_uid_to_path_stats;
1076
2.48k
    for (const auto& rs : intputs) {
1077
2.48k
        RETURN_IF_ERROR(aggregate_path_to_stats(rs, &original_uid_to_path_stats));
1078
2.48k
    }
1079
315
    std::unordered_map<int32_t, PathToNoneNullValues> output_uid_to_path_stats;
1080
315
    RETURN_IF_ERROR(aggregate_path_to_stats(output, &output_uid_to_path_stats));
1081
315
    for (const auto& [uid, stats] : output_uid_to_path_stats) {
1082
192
        if (output->tablet_schema()->column_by_uid(uid).is_variant_type() &&
1083
192
            output->tablet_schema()->column_by_uid(uid).variant_enable_doc_mode()) {
1084
62
            continue;
1085
62
        }
1086
130
        if (original_uid_to_path_stats.find(uid) == original_uid_to_path_stats.end()) {
1087
0
            return Status::InternalError("Path stats not found for uid {}, tablet_id {}", uid,
1088
0
                                         tablet->tablet_id());
1089
0
        }
1090
1091
        // In input rowsets, some rowsets may have statistics values exceeding the maximum limit,
1092
        // which leads to inaccurate statistics
1093
130
        if (stats.size() > output->tablet_schema()
1094
130
                                   ->column_by_uid(uid)
1095
130
                                   .variant_max_sparse_column_statistics_size()) {
1096
            // When there is only one segment, we can ensure that the size of each path in output stats is accurate
1097
1
            if (output->num_segments() == 1) {
1098
13
                for (const auto& [path, size] : stats) {
1099
13
                    if (original_uid_to_path_stats.at(uid).find(path) ==
1100
13
                        original_uid_to_path_stats.at(uid).end()) {
1101
0
                        continue;
1102
0
                    }
1103
13
                    if (original_uid_to_path_stats.at(uid).at(path) > size) {
1104
0
                        return Status::InternalError(
1105
0
                                "Path stats not smaller for uid {} with path `{}`, input size {}, "
1106
0
                                "output "
1107
0
                                "size {}, "
1108
0
                                "tablet_id {}",
1109
0
                                uid, path, original_uid_to_path_stats.at(uid).at(path), size,
1110
0
                                tablet->tablet_id());
1111
0
                    }
1112
13
                }
1113
1
            }
1114
1
        }
1115
        // in this case, input stats is accurate, so we check the stats size and stats value
1116
129
        else {
1117
1.12k
            for (const auto& [path, size] : stats) {
1118
1.12k
                if (original_uid_to_path_stats.at(uid).find(path) ==
1119
1.12k
                    original_uid_to_path_stats.at(uid).end()) {
1120
0
                    return Status::InternalError(
1121
0
                            "Path stats not found for uid {}, path {}, tablet_id {}", uid, path,
1122
0
                            tablet->tablet_id());
1123
0
                }
1124
1.12k
                if (original_uid_to_path_stats.at(uid).at(path) != size) {
1125
0
                    return Status::InternalError(
1126
0
                            "Path stats not match for uid {} with path `{}`, input size {}, output "
1127
0
                            "size {}, "
1128
0
                            "tablet_id {}",
1129
0
                            uid, path, original_uid_to_path_stats.at(uid).at(path), size,
1130
0
                            tablet->tablet_id());
1131
0
                }
1132
1.12k
            }
1133
129
        }
1134
130
    }
1135
1136
315
    return Status::OK();
1137
315
}
1138
1139
Status VariantCompactionUtil::get_compaction_typed_columns(
1140
        const TabletSchemaSPtr& target, const std::unordered_set<std::string>& typed_paths,
1141
        const TabletColumnPtr parent_column, TabletSchemaSPtr& output_schema,
1142
259
        TabletSchema::PathsSetInfo& paths_set_info) {
1143
259
    if (parent_column->variant_enable_typed_paths_to_sparse()) {
1144
4
        return Status::OK();
1145
4
    }
1146
479
    for (const auto& path : typed_paths) {
1147
479
        TabletSchema::SubColumnInfo sub_column_info;
1148
479
        if (generate_sub_column_info(*target, parent_column->unique_id(), path, &sub_column_info)) {
1149
478
            inherit_column_attributes(*parent_column, sub_column_info.column);
1150
478
            output_schema->append_column(sub_column_info.column);
1151
478
            paths_set_info.typed_path_set.insert({path, std::move(sub_column_info)});
1152
478
            VLOG_DEBUG << "append typed column " << path;
1153
478
        } else {
1154
1
            return Status::InternalError("Failed to generate sub column info for path {}", path);
1155
1
        }
1156
479
    }
1157
254
    return Status::OK();
1158
255
}
1159
1160
Status VariantCompactionUtil::get_compaction_nested_columns(
1161
        const std::unordered_set<PathInData, PathInData::Hash>& nested_paths,
1162
        const PathToDataTypes& path_to_data_types, const TabletColumnPtr parent_column,
1163
257
        TabletSchemaSPtr& output_schema, TabletSchema::PathsSetInfo& paths_set_info) {
1164
257
    const auto& parent_indexes = output_schema->inverted_indexs(parent_column->unique_id());
1165
257
    for (const auto& path : nested_paths) {
1166
3
        const auto& find_data_types = path_to_data_types.find(path);
1167
3
        if (find_data_types == path_to_data_types.end() || find_data_types->second.empty()) {
1168
1
            return Status::InternalError("Nested path {} has no data type", path.get_path());
1169
1
        }
1170
2
        DataTypePtr data_type;
1171
2
        get_least_supertype_jsonb(find_data_types->second, &data_type);
1172
1173
2
        const std::string& column_name = parent_column->name_lower_case() + "." + path.get_path();
1174
2
        PathInDataBuilder full_path_builder;
1175
2
        auto full_path = full_path_builder.append(parent_column->name_lower_case(), false)
1176
2
                                 .append(path.get_parts(), false)
1177
2
                                 .build();
1178
2
        TabletColumn nested_column =
1179
2
                get_column_by_type(data_type, column_name,
1180
2
                                   ExtraInfo {.unique_id = -1,
1181
2
                                              .parent_unique_id = parent_column->unique_id(),
1182
2
                                              .path_info = full_path});
1183
2
        inherit_column_attributes(*parent_column, nested_column);
1184
2
        TabletIndexes sub_column_indexes;
1185
2
        inherit_index(parent_indexes, sub_column_indexes, nested_column);
1186
2
        paths_set_info.subcolumn_indexes.emplace(path.get_path(), std::move(sub_column_indexes));
1187
2
        output_schema->append_column(nested_column);
1188
2
        VLOG_DEBUG << "append nested column " << path.get_path();
1189
2
    }
1190
256
    return Status::OK();
1191
257
}
1192
1193
void VariantCompactionUtil::get_compaction_subcolumns_from_subpaths(
1194
        TabletSchema::PathsSetInfo& paths_set_info, const TabletColumnPtr parent_column,
1195
        const TabletSchemaSPtr& target, const PathToDataTypes& path_to_data_types,
1196
200
        const std::unordered_set<std::string>& sparse_paths, TabletSchemaSPtr& output_schema) {
1197
200
    auto& path_set = paths_set_info.sub_path_set;
1198
200
    std::vector<StringRef> sorted_subpaths(path_set.begin(), path_set.end());
1199
200
    std::sort(sorted_subpaths.begin(), sorted_subpaths.end());
1200
200
    const auto& parent_indexes = target->inverted_indexs(parent_column->unique_id());
1201
    // append subcolumns
1202
488
    for (const auto& subpath : sorted_subpaths) {
1203
488
        auto column_name = parent_column->name_lower_case() + "." + subpath.to_string();
1204
488
        auto column_path = PathInData(column_name);
1205
1206
488
        const auto& find_data_types = path_to_data_types.find(PathInData(subpath));
1207
1208
        // some cases: the subcolumn type is variant
1209
        // 1. this path has no data type in segments
1210
        // 2. this path is in sparse paths
1211
        // 3. the sparse paths are too much
1212
488
        TabletSchema::SubColumnInfo sub_column_info;
1213
488
        if (parent_column->variant_enable_typed_paths_to_sparse() &&
1214
488
            generate_sub_column_info(*target, parent_column->unique_id(), std::string(subpath),
1215
32
                                     &sub_column_info)) {
1216
23
            inherit_column_attributes(*parent_column, sub_column_info.column);
1217
23
            output_schema->append_column(sub_column_info.column);
1218
23
            paths_set_info.subcolumn_indexes.emplace(subpath, std::move(sub_column_info.indexes));
1219
23
            VLOG_DEBUG << "append typed column " << subpath;
1220
465
        } else if (find_data_types == path_to_data_types.end() || find_data_types->second.empty() ||
1221
465
                   sparse_paths.find(std::string(subpath)) != sparse_paths.end() ||
1222
465
                   sparse_paths.size() >=
1223
395
                           parent_column->variant_max_sparse_column_statistics_size()) {
1224
74
            TabletColumn subcolumn;
1225
74
            subcolumn.set_name(column_name);
1226
74
            subcolumn.set_type(FieldType::OLAP_FIELD_TYPE_VARIANT);
1227
74
            subcolumn.set_parent_unique_id(parent_column->unique_id());
1228
74
            subcolumn.set_path_info(column_path);
1229
74
            subcolumn.set_aggregation_method(parent_column->aggregation());
1230
74
            subcolumn.set_variant_max_subcolumns_count(
1231
74
                    parent_column->variant_max_subcolumns_count());
1232
74
            subcolumn.set_variant_enable_doc_mode(parent_column->variant_enable_doc_mode());
1233
74
            subcolumn.set_is_nullable(true);
1234
74
            output_schema->append_column(subcolumn);
1235
74
            VLOG_DEBUG << "append sub column " << subpath << " data type "
1236
0
                       << "VARIANT";
1237
74
        }
1238
        // normal case: the subcolumn type can be calculated from the data types in segments
1239
391
        else {
1240
391
            DataTypePtr data_type;
1241
391
            get_least_supertype_jsonb(find_data_types->second, &data_type);
1242
391
            TabletColumn sub_column =
1243
391
                    get_column_by_type(data_type, column_name,
1244
391
                                       ExtraInfo {.unique_id = -1,
1245
391
                                                  .parent_unique_id = parent_column->unique_id(),
1246
391
                                                  .path_info = column_path});
1247
391
            inherit_column_attributes(*parent_column, sub_column);
1248
391
            TabletIndexes sub_column_indexes;
1249
391
            inherit_index(parent_indexes, sub_column_indexes, sub_column);
1250
391
            paths_set_info.subcolumn_indexes.emplace(subpath, std::move(sub_column_indexes));
1251
391
            output_schema->append_column(sub_column);
1252
391
            VLOG_DEBUG << "append sub column " << subpath << " data type " << data_type->get_name();
1253
391
        }
1254
488
    }
1255
200
}
1256
1257
void VariantCompactionUtil::get_compaction_subcolumns_from_data_types(
1258
        TabletSchema::PathsSetInfo& paths_set_info, const TabletColumnPtr parent_column,
1259
        const TabletSchemaSPtr& target, const PathToDataTypes& path_to_data_types,
1260
66
        TabletSchemaSPtr& output_schema) {
1261
66
    const auto& parent_indexes = target->inverted_indexs(parent_column->unique_id());
1262
119
    for (const auto& [path, data_types] : path_to_data_types) {
1263
        // Typed paths are materialized by get_compaction_typed_columns(); this helper only
1264
        // materializes regular subcolumns inferred from rowset data types.
1265
119
        if (data_types.empty() || path.empty() || path.get_is_typed() || path.has_nested_part()) {
1266
19
            continue;
1267
19
        }
1268
100
        DataTypePtr data_type;
1269
100
        get_least_supertype_jsonb(data_types, &data_type);
1270
100
        auto column_name = parent_column->name_lower_case() + "." + path.get_path();
1271
100
        auto column_path = PathInData(column_name);
1272
100
        TabletColumn sub_column =
1273
100
                get_column_by_type(data_type, column_name,
1274
100
                                   ExtraInfo {.unique_id = -1,
1275
100
                                              .parent_unique_id = parent_column->unique_id(),
1276
100
                                              .path_info = column_path});
1277
100
        inherit_column_attributes(*parent_column, sub_column);
1278
100
        TabletIndexes sub_column_indexes;
1279
100
        inherit_index(parent_indexes, sub_column_indexes, sub_column);
1280
100
        paths_set_info.sub_path_set.emplace(path.get_path());
1281
100
        paths_set_info.subcolumn_indexes.emplace(path.get_path(), std::move(sub_column_indexes));
1282
100
        output_schema->append_column(sub_column);
1283
100
        VLOG_DEBUG << "append sub column " << path.get_path() << " data type "
1284
0
                   << data_type->get_name();
1285
100
    }
1286
66
}
1287
1288
// Build the temporary schema for compaction.
1289
// NestedGroup roots are special: the root VARIANT column owns the NG tree and the streaming NG
1290
// writer handles NG children, while regular non-NG paths beside the arrays are materialized as
1291
// ordinary extracted subcolumns. NG typed paths still use get_compaction_typed_columns(), keeping
1292
// typed-column rules out of the NG-specific regular-path filtering.
1293
Status VariantCompactionUtil::get_extended_compaction_schema(
1294
7.98k
        const std::vector<RowsetSharedPtr>& rowsets, TabletSchemaSPtr& target) {
1295
7.98k
    std::unordered_map<int32_t, VariantExtendedInfo> uid_to_variant_extended_info;
1296
7.98k
    const bool needs_variant_extended_info =
1297
91.0k
            std::ranges::any_of(target->columns(), [](const TabletColumnPtr& column) {
1298
91.0k
                return column->is_variant_type() && (should_check_variant_path_stats(*column) ||
1299
517
                                                     column->variant_enable_nested_group());
1300
91.0k
            });
1301
7.98k
    if (needs_variant_extended_info) {
1302
        // collect path stats from all rowsets and segments
1303
4.68k
        for (const auto& rs : rowsets) {
1304
4.68k
            RETURN_IF_ERROR(aggregate_variant_extended_info(rs, &uid_to_variant_extended_info));
1305
4.68k
        }
1306
516
    }
1307
1308
    // build the output schema
1309
7.98k
    TabletSchemaSPtr output_schema = std::make_shared<TabletSchema>();
1310
7.98k
    output_schema->shawdow_copy_without_columns(*target);
1311
7.98k
    std::unordered_map<int32_t, TabletSchema::PathsSetInfo> uid_to_paths_set_info;
1312
7.98k
    const auto ng_root_uids =
1313
7.98k
            collect_nested_group_compaction_root_uids(target, uid_to_variant_extended_info);
1314
93.0k
    for (const TabletColumnPtr& column : target->columns()) {
1315
93.0k
        if (!column->is_extracted_column()) {
1316
92.9k
            output_schema->append_column(*column);
1317
92.9k
        }
1318
93.0k
        if (!column->is_variant_type()) {
1319
92.4k
            continue;
1320
92.4k
        }
1321
18.4E
        VLOG_DEBUG << "column " << column->name() << " unique id " << column->unique_id();
1322
1323
603
        const auto info_it = uid_to_variant_extended_info.find(column->unique_id());
1324
603
        const VariantExtendedInfo empty_extended_info;
1325
603
        const VariantExtendedInfo& extended_info = info_it == uid_to_variant_extended_info.end()
1326
603
                                                           ? empty_extended_info
1327
603
                                                           : info_it->second;
1328
603
        auto& paths_set_info = uid_to_paths_set_info[column->unique_id()];
1329
603
        const bool use_nested_group_compaction_schema = ng_root_uids.contains(column->unique_id());
1330
1331
603
        if (use_nested_group_compaction_schema) {
1332
            // 1. append typed columns. Keep this shared with the non-NG typed helper; only the
1333
            // regular-path selection below is NG-specific.
1334
1
            RETURN_IF_ERROR(get_compaction_typed_columns(target, extended_info.typed_paths, column,
1335
1
                                                         output_schema, paths_set_info));
1336
1337
            // NG roots do not record path-count stats for ordinary Variant paths, so their regular
1338
            // non-NG subcolumns use the same data-types materialization helper as the
1339
            // all-materialized non-NG branch below.
1340
1
            auto regular_path_to_data_types =
1341
1
                    collect_regular_types_outside_nested_group(extended_info);
1342
1
            get_compaction_subcolumns_from_data_types(paths_set_info, column, target,
1343
1
                                                      regular_path_to_data_types, output_schema);
1344
1
            LOG(INFO) << "Variant column uid=" << column->unique_id()
1345
1
                      << " keeps nested-group root and materializes regular non-NG subcolumns in "
1346
1
                         "compaction schema";
1347
1
            continue;
1348
1
        }
1349
1350
602
        if (column->variant_enable_doc_mode()) {
1351
368
            const int bucket_num = std::max(1, column->variant_doc_hash_shard_count());
1352
1.13k
            for (int b = 0; b < bucket_num; ++b) {
1353
766
                TabletColumn doc_value_bucket_column = create_doc_value_column(*column, b);
1354
766
                doc_value_bucket_column.set_type(FieldType::OLAP_FIELD_TYPE_VARIANT);
1355
766
                doc_value_bucket_column.set_is_nullable(false);
1356
766
                doc_value_bucket_column.set_variant_enable_doc_mode(true);
1357
766
                output_schema->append_column(doc_value_bucket_column);
1358
766
            }
1359
368
            continue;
1360
368
        }
1361
1362
        // 1. append typed columns
1363
234
        RETURN_IF_ERROR(get_compaction_typed_columns(target, extended_info.typed_paths, column,
1364
234
                                                     output_schema, paths_set_info));
1365
1366
        // 2. append nested columns
1367
234
        RETURN_IF_ERROR(get_compaction_nested_columns(extended_info.nested_paths,
1368
234
                                                      extended_info.path_to_data_types, column,
1369
234
                                                      output_schema, paths_set_info));
1370
1371
        // 3. get the subpaths
1372
234
        get_subpaths(column->variant_max_subcolumns_count(), extended_info.path_to_none_null_values,
1373
234
                     paths_set_info);
1374
1375
        // 4. append subcolumns
1376
234
        if (column->variant_max_subcolumns_count() > 0 || !column->get_sub_columns().empty()) {
1377
192
            get_compaction_subcolumns_from_subpaths(paths_set_info, column, target,
1378
192
                                                    extended_info.path_to_data_types,
1379
192
                                                    extended_info.sparse_paths, output_schema);
1380
192
        }
1381
        // variant_max_subcolumns_count == 0 and no typed paths materialized
1382
        // it means that all subcolumns are materialized, may be from old data
1383
42
        else {
1384
42
            get_compaction_subcolumns_from_data_types(paths_set_info, column, target,
1385
42
                                                      extended_info.path_to_data_types,
1386
42
                                                      output_schema);
1387
42
        }
1388
1389
        // append sparse column(s)
1390
        // If variant uses bucketized sparse columns, append one sparse bucket column per bucket.
1391
        // Otherwise, append the single sparse column.
1392
234
        int bucket_num = std::max(1, column->variant_sparse_hash_shard_count());
1393
240
        if (bucket_num > 1) {
1394
1.02k
            for (int b = 0; b < bucket_num; ++b) {
1395
782
                TabletColumn sparse_bucket_column = create_sparse_shard_column(*column, b);
1396
782
                output_schema->append_column(sparse_bucket_column);
1397
782
            }
1398
18.4E
        } else {
1399
18.4E
            TabletColumn sparse_column = create_sparse_column(*column);
1400
18.4E
            output_schema->append_column(sparse_column);
1401
18.4E
        }
1402
234
    }
1403
1404
7.98k
    target = output_schema;
1405
    // used to merge & filter path to sparse column during reading in compaction
1406
7.98k
    target->set_path_set_info(std::move(uid_to_paths_set_info));
1407
18.4E
    VLOG_DEBUG << "dump schema " << target->dump_full_schema();
1408
7.98k
    return Status::OK();
1409
7.98k
}
1410
1411
// Calculate statistics about variant data paths from the encoded sparse column
1412
void VariantCompactionUtil::calculate_variant_stats(const IColumn& encoded_sparse_column,
1413
                                                    segment_v2::VariantStatisticsPB* stats,
1414
                                                    size_t max_sparse_column_statistics_size,
1415
859
                                                    size_t row_pos, size_t num_rows) {
1416
    // Cast input column to ColumnMap type since sparse column is stored as a map
1417
859
    const auto& map_column = assert_cast<const ColumnMap&>(encoded_sparse_column);
1418
1419
    // Get the keys column which contains the paths as strings
1420
859
    const auto& sparse_data_paths =
1421
859
            assert_cast<const ColumnString*>(map_column.get_keys_ptr().get());
1422
859
    const auto& serialized_sparse_column_offsets = map_column.get_offsets();
1423
859
    auto& count_map = *stats->mutable_sparse_column_non_null_size();
1424
    // Iterate through all paths in the sparse column
1425
335k
    for (size_t i = row_pos; i != row_pos + num_rows; ++i) {
1426
334k
        size_t offset = serialized_sparse_column_offsets[i - 1];
1427
334k
        size_t end = serialized_sparse_column_offsets[i];
1428
1.84M
        for (size_t j = offset; j != end; ++j) {
1429
1.50M
            auto path = sparse_data_paths->get_data_at(j);
1430
1431
1.50M
            const auto& sparse_path = path.to_string();
1432
            // If path already exists in statistics, increment its count
1433
1.50M
            if (auto it = count_map.find(sparse_path); it != count_map.end()) {
1434
1.50M
                ++it->second;
1435
1.50M
            }
1436
            // If path doesn't exist and we haven't hit the max statistics size limit,
1437
            // add it with count 1
1438
690
            else if (count_map.size() < max_sparse_column_statistics_size) {
1439
690
                count_map.emplace(sparse_path, 1);
1440
690
            }
1441
1.50M
        }
1442
334k
    }
1443
1444
859
    if (stats->sparse_column_non_null_size().size() > max_sparse_column_statistics_size) {
1445
0
        throw doris::Exception(
1446
0
                ErrorCode::INTERNAL_ERROR,
1447
0
                "Sparse column non null size: {} is greater than max statistics size: {}",
1448
0
                stats->sparse_column_non_null_size().size(), max_sparse_column_statistics_size);
1449
0
    }
1450
859
}
1451
1452
/// Calculates number of dimensions in array field.
1453
/// Returns 0 for scalar fields.
1454
class FieldVisitorToNumberOfDimensions : public StaticVisitor<size_t> {
1455
public:
1456
    FieldVisitorToNumberOfDimensions() = default;
1457
    template <PrimitiveType T>
1458
23.4M
    size_t apply(const typename PrimitiveTypeTraits<T>::CppType& x) {
1459
23.4M
        if constexpr (T == TYPE_ARRAY) {
1460
2.26M
            const size_t size = x.size();
1461
2.26M
            size_t dimensions = 0;
1462
5.44M
            for (size_t i = 0; i < size; ++i) {
1463
3.18M
                size_t element_dimensions = apply_visitor(*this, x[i]);
1464
3.18M
                dimensions = std::max(dimensions, element_dimensions);
1465
3.18M
            }
1466
2.26M
            return 1 + dimensions;
1467
21.2M
        } else {
1468
21.2M
            return 0;
1469
21.2M
        }
1470
23.4M
    }
_ZN5doris12variant_util32FieldVisitorToNumberOfDimensions5applyILNS_13PrimitiveTypeE1EEEmRKNS_19PrimitiveTypeTraitsIXT_EE7CppTypeE
Line
Count
Source
1458
121k
    size_t apply(const typename PrimitiveTypeTraits<T>::CppType& x) {
1459
        if constexpr (T == TYPE_ARRAY) {
1460
            const size_t size = x.size();
1461
            size_t dimensions = 0;
1462
            for (size_t i = 0; i < size; ++i) {
1463
                size_t element_dimensions = apply_visitor(*this, x[i]);
1464
                dimensions = std::max(dimensions, element_dimensions);
1465
            }
1466
            return 1 + dimensions;
1467
121k
        } else {
1468
121k
            return 0;
1469
121k
        }
1470
121k
    }
_ZN5doris12variant_util32FieldVisitorToNumberOfDimensions5applyILNS_13PrimitiveTypeE26EEEmRKNS_19PrimitiveTypeTraitsIXT_EE7CppTypeE
Line
Count
Source
1458
1
    size_t apply(const typename PrimitiveTypeTraits<T>::CppType& x) {
1459
        if constexpr (T == TYPE_ARRAY) {
1460
            const size_t size = x.size();
1461
            size_t dimensions = 0;
1462
            for (size_t i = 0; i < size; ++i) {
1463
                size_t element_dimensions = apply_visitor(*this, x[i]);
1464
                dimensions = std::max(dimensions, element_dimensions);
1465
            }
1466
            return 1 + dimensions;
1467
1
        } else {
1468
1
            return 0;
1469
1
        }
1470
1
    }
Unexecuted instantiation: _ZN5doris12variant_util32FieldVisitorToNumberOfDimensions5applyILNS_13PrimitiveTypeE42EEEmRKNS_19PrimitiveTypeTraitsIXT_EE7CppTypeE
_ZN5doris12variant_util32FieldVisitorToNumberOfDimensions5applyILNS_13PrimitiveTypeE7EEEmRKNS_19PrimitiveTypeTraitsIXT_EE7CppTypeE
Line
Count
Source
1458
40.9k
    size_t apply(const typename PrimitiveTypeTraits<T>::CppType& x) {
1459
        if constexpr (T == TYPE_ARRAY) {
1460
            const size_t size = x.size();
1461
            size_t dimensions = 0;
1462
            for (size_t i = 0; i < size; ++i) {
1463
                size_t element_dimensions = apply_visitor(*this, x[i]);
1464
                dimensions = std::max(dimensions, element_dimensions);
1465
            }
1466
            return 1 + dimensions;
1467
40.9k
        } else {
1468
40.9k
            return 0;
1469
40.9k
        }
1470
40.9k
    }
Unexecuted instantiation: _ZN5doris12variant_util32FieldVisitorToNumberOfDimensions5applyILNS_13PrimitiveTypeE12EEEmRKNS_19PrimitiveTypeTraitsIXT_EE7CppTypeE
Unexecuted instantiation: _ZN5doris12variant_util32FieldVisitorToNumberOfDimensions5applyILNS_13PrimitiveTypeE11EEEmRKNS_19PrimitiveTypeTraitsIXT_EE7CppTypeE
_ZN5doris12variant_util32FieldVisitorToNumberOfDimensions5applyILNS_13PrimitiveTypeE25EEEmRKNS_19PrimitiveTypeTraitsIXT_EE7CppTypeE
Line
Count
Source
1458
1
    size_t apply(const typename PrimitiveTypeTraits<T>::CppType& x) {
1459
        if constexpr (T == TYPE_ARRAY) {
1460
            const size_t size = x.size();
1461
            size_t dimensions = 0;
1462
            for (size_t i = 0; i < size; ++i) {
1463
                size_t element_dimensions = apply_visitor(*this, x[i]);
1464
                dimensions = std::max(dimensions, element_dimensions);
1465
            }
1466
            return 1 + dimensions;
1467
1
        } else {
1468
1
            return 0;
1469
1
        }
1470
1
    }
_ZN5doris12variant_util32FieldVisitorToNumberOfDimensions5applyILNS_13PrimitiveTypeE2EEEmRKNS_19PrimitiveTypeTraitsIXT_EE7CppTypeE
Line
Count
Source
1458
331k
    size_t apply(const typename PrimitiveTypeTraits<T>::CppType& x) {
1459
        if constexpr (T == TYPE_ARRAY) {
1460
            const size_t size = x.size();
1461
            size_t dimensions = 0;
1462
            for (size_t i = 0; i < size; ++i) {
1463
                size_t element_dimensions = apply_visitor(*this, x[i]);
1464
                dimensions = std::max(dimensions, element_dimensions);
1465
            }
1466
            return 1 + dimensions;
1467
331k
        } else {
1468
331k
            return 0;
1469
331k
        }
1470
331k
    }
_ZN5doris12variant_util32FieldVisitorToNumberOfDimensions5applyILNS_13PrimitiveTypeE3EEEmRKNS_19PrimitiveTypeTraitsIXT_EE7CppTypeE
Line
Count
Source
1458
7
    size_t apply(const typename PrimitiveTypeTraits<T>::CppType& x) {
1459
        if constexpr (T == TYPE_ARRAY) {
1460
            const size_t size = x.size();
1461
            size_t dimensions = 0;
1462
            for (size_t i = 0; i < size; ++i) {
1463
                size_t element_dimensions = apply_visitor(*this, x[i]);
1464
                dimensions = std::max(dimensions, element_dimensions);
1465
            }
1466
            return 1 + dimensions;
1467
7
        } else {
1468
7
            return 0;
1469
7
        }
1470
7
    }
_ZN5doris12variant_util32FieldVisitorToNumberOfDimensions5applyILNS_13PrimitiveTypeE4EEEmRKNS_19PrimitiveTypeTraitsIXT_EE7CppTypeE
Line
Count
Source
1458
8
    size_t apply(const typename PrimitiveTypeTraits<T>::CppType& x) {
1459
        if constexpr (T == TYPE_ARRAY) {
1460
            const size_t size = x.size();
1461
            size_t dimensions = 0;
1462
            for (size_t i = 0; i < size; ++i) {
1463
                size_t element_dimensions = apply_visitor(*this, x[i]);
1464
                dimensions = std::max(dimensions, element_dimensions);
1465
            }
1466
            return 1 + dimensions;
1467
8
        } else {
1468
8
            return 0;
1469
8
        }
1470
8
    }
_ZN5doris12variant_util32FieldVisitorToNumberOfDimensions5applyILNS_13PrimitiveTypeE5EEEmRKNS_19PrimitiveTypeTraitsIXT_EE7CppTypeE
Line
Count
Source
1458
952
    size_t apply(const typename PrimitiveTypeTraits<T>::CppType& x) {
1459
        if constexpr (T == TYPE_ARRAY) {
1460
            const size_t size = x.size();
1461
            size_t dimensions = 0;
1462
            for (size_t i = 0; i < size; ++i) {
1463
                size_t element_dimensions = apply_visitor(*this, x[i]);
1464
                dimensions = std::max(dimensions, element_dimensions);
1465
            }
1466
            return 1 + dimensions;
1467
952
        } else {
1468
952
            return 0;
1469
952
        }
1470
952
    }
_ZN5doris12variant_util32FieldVisitorToNumberOfDimensions5applyILNS_13PrimitiveTypeE6EEEmRKNS_19PrimitiveTypeTraitsIXT_EE7CppTypeE
Line
Count
Source
1458
6.34M
    size_t apply(const typename PrimitiveTypeTraits<T>::CppType& x) {
1459
        if constexpr (T == TYPE_ARRAY) {
1460
            const size_t size = x.size();
1461
            size_t dimensions = 0;
1462
            for (size_t i = 0; i < size; ++i) {
1463
                size_t element_dimensions = apply_visitor(*this, x[i]);
1464
                dimensions = std::max(dimensions, element_dimensions);
1465
            }
1466
            return 1 + dimensions;
1467
6.34M
        } else {
1468
6.34M
            return 0;
1469
6.34M
        }
1470
6.34M
    }
Unexecuted instantiation: _ZN5doris12variant_util32FieldVisitorToNumberOfDimensions5applyILNS_13PrimitiveTypeE38EEEmRKNS_19PrimitiveTypeTraitsIXT_EE7CppTypeE
Unexecuted instantiation: _ZN5doris12variant_util32FieldVisitorToNumberOfDimensions5applyILNS_13PrimitiveTypeE39EEEmRKNS_19PrimitiveTypeTraitsIXT_EE7CppTypeE
_ZN5doris12variant_util32FieldVisitorToNumberOfDimensions5applyILNS_13PrimitiveTypeE8EEEmRKNS_19PrimitiveTypeTraitsIXT_EE7CppTypeE
Line
Count
Source
1458
2
    size_t apply(const typename PrimitiveTypeTraits<T>::CppType& x) {
1459
        if constexpr (T == TYPE_ARRAY) {
1460
            const size_t size = x.size();
1461
            size_t dimensions = 0;
1462
            for (size_t i = 0; i < size; ++i) {
1463
                size_t element_dimensions = apply_visitor(*this, x[i]);
1464
                dimensions = std::max(dimensions, element_dimensions);
1465
            }
1466
            return 1 + dimensions;
1467
2
        } else {
1468
2
            return 0;
1469
2
        }
1470
2
    }
Unexecuted instantiation: _ZN5doris12variant_util32FieldVisitorToNumberOfDimensions5applyILNS_13PrimitiveTypeE27EEEmRKNS_19PrimitiveTypeTraitsIXT_EE7CppTypeE
_ZN5doris12variant_util32FieldVisitorToNumberOfDimensions5applyILNS_13PrimitiveTypeE9EEEmRKNS_19PrimitiveTypeTraitsIXT_EE7CppTypeE
Line
Count
Source
1458
2.94M
    size_t apply(const typename PrimitiveTypeTraits<T>::CppType& x) {
1459
        if constexpr (T == TYPE_ARRAY) {
1460
            const size_t size = x.size();
1461
            size_t dimensions = 0;
1462
            for (size_t i = 0; i < size; ++i) {
1463
                size_t element_dimensions = apply_visitor(*this, x[i]);
1464
                dimensions = std::max(dimensions, element_dimensions);
1465
            }
1466
            return 1 + dimensions;
1467
2.94M
        } else {
1468
2.94M
            return 0;
1469
2.94M
        }
1470
2.94M
    }
_ZN5doris12variant_util32FieldVisitorToNumberOfDimensions5applyILNS_13PrimitiveTypeE36EEEmRKNS_19PrimitiveTypeTraitsIXT_EE7CppTypeE
Line
Count
Source
1458
1
    size_t apply(const typename PrimitiveTypeTraits<T>::CppType& x) {
1459
        if constexpr (T == TYPE_ARRAY) {
1460
            const size_t size = x.size();
1461
            size_t dimensions = 0;
1462
            for (size_t i = 0; i < size; ++i) {
1463
                size_t element_dimensions = apply_visitor(*this, x[i]);
1464
                dimensions = std::max(dimensions, element_dimensions);
1465
            }
1466
            return 1 + dimensions;
1467
1
        } else {
1468
1
            return 0;
1469
1
        }
1470
1
    }
_ZN5doris12variant_util32FieldVisitorToNumberOfDimensions5applyILNS_13PrimitiveTypeE37EEEmRKNS_19PrimitiveTypeTraitsIXT_EE7CppTypeE
Line
Count
Source
1458
95
    size_t apply(const typename PrimitiveTypeTraits<T>::CppType& x) {
1459
        if constexpr (T == TYPE_ARRAY) {
1460
            const size_t size = x.size();
1461
            size_t dimensions = 0;
1462
            for (size_t i = 0; i < size; ++i) {
1463
                size_t element_dimensions = apply_visitor(*this, x[i]);
1464
                dimensions = std::max(dimensions, element_dimensions);
1465
            }
1466
            return 1 + dimensions;
1467
95
        } else {
1468
95
            return 0;
1469
95
        }
1470
95
    }
_ZN5doris12variant_util32FieldVisitorToNumberOfDimensions5applyILNS_13PrimitiveTypeE23EEEmRKNS_19PrimitiveTypeTraitsIXT_EE7CppTypeE
Line
Count
Source
1458
11.3M
    size_t apply(const typename PrimitiveTypeTraits<T>::CppType& x) {
1459
        if constexpr (T == TYPE_ARRAY) {
1460
            const size_t size = x.size();
1461
            size_t dimensions = 0;
1462
            for (size_t i = 0; i < size; ++i) {
1463
                size_t element_dimensions = apply_visitor(*this, x[i]);
1464
                dimensions = std::max(dimensions, element_dimensions);
1465
            }
1466
            return 1 + dimensions;
1467
11.3M
        } else {
1468
11.3M
            return 0;
1469
11.3M
        }
1470
11.3M
    }
Unexecuted instantiation: _ZN5doris12variant_util32FieldVisitorToNumberOfDimensions5applyILNS_13PrimitiveTypeE15EEEmRKNS_19PrimitiveTypeTraitsIXT_EE7CppTypeE
Unexecuted instantiation: _ZN5doris12variant_util32FieldVisitorToNumberOfDimensions5applyILNS_13PrimitiveTypeE10EEEmRKNS_19PrimitiveTypeTraitsIXT_EE7CppTypeE
Unexecuted instantiation: _ZN5doris12variant_util32FieldVisitorToNumberOfDimensions5applyILNS_13PrimitiveTypeE41EEEmRKNS_19PrimitiveTypeTraitsIXT_EE7CppTypeE
_ZN5doris12variant_util32FieldVisitorToNumberOfDimensions5applyILNS_13PrimitiveTypeE17EEEmRKNS_19PrimitiveTypeTraitsIXT_EE7CppTypeE
Line
Count
Source
1458
2.26M
    size_t apply(const typename PrimitiveTypeTraits<T>::CppType& x) {
1459
2.26M
        if constexpr (T == TYPE_ARRAY) {
1460
2.26M
            const size_t size = x.size();
1461
2.26M
            size_t dimensions = 0;
1462
5.44M
            for (size_t i = 0; i < size; ++i) {
1463
3.18M
                size_t element_dimensions = apply_visitor(*this, x[i]);
1464
3.18M
                dimensions = std::max(dimensions, element_dimensions);
1465
3.18M
            }
1466
2.26M
            return 1 + dimensions;
1467
        } else {
1468
            return 0;
1469
        }
1470
2.26M
    }
_ZN5doris12variant_util32FieldVisitorToNumberOfDimensions5applyILNS_13PrimitiveTypeE16EEEmRKNS_19PrimitiveTypeTraitsIXT_EE7CppTypeE
Line
Count
Source
1458
1
    size_t apply(const typename PrimitiveTypeTraits<T>::CppType& x) {
1459
        if constexpr (T == TYPE_ARRAY) {
1460
            const size_t size = x.size();
1461
            size_t dimensions = 0;
1462
            for (size_t i = 0; i < size; ++i) {
1463
                size_t element_dimensions = apply_visitor(*this, x[i]);
1464
                dimensions = std::max(dimensions, element_dimensions);
1465
            }
1466
            return 1 + dimensions;
1467
1
        } else {
1468
1
            return 0;
1469
1
        }
1470
1
    }
Unexecuted instantiation: _ZN5doris12variant_util32FieldVisitorToNumberOfDimensions5applyILNS_13PrimitiveTypeE18EEEmRKNS_19PrimitiveTypeTraitsIXT_EE7CppTypeE
_ZN5doris12variant_util32FieldVisitorToNumberOfDimensions5applyILNS_13PrimitiveTypeE32EEEmRKNS_19PrimitiveTypeTraitsIXT_EE7CppTypeE
Line
Count
Source
1458
1
    size_t apply(const typename PrimitiveTypeTraits<T>::CppType& x) {
1459
        if constexpr (T == TYPE_ARRAY) {
1460
            const size_t size = x.size();
1461
            size_t dimensions = 0;
1462
            for (size_t i = 0; i < size; ++i) {
1463
                size_t element_dimensions = apply_visitor(*this, x[i]);
1464
                dimensions = std::max(dimensions, element_dimensions);
1465
            }
1466
            return 1 + dimensions;
1467
1
        } else {
1468
1
            return 0;
1469
1
        }
1470
1
    }
_ZN5doris12variant_util32FieldVisitorToNumberOfDimensions5applyILNS_13PrimitiveTypeE28EEEmRKNS_19PrimitiveTypeTraitsIXT_EE7CppTypeE
Line
Count
Source
1458
1
    size_t apply(const typename PrimitiveTypeTraits<T>::CppType& x) {
1459
        if constexpr (T == TYPE_ARRAY) {
1460
            const size_t size = x.size();
1461
            size_t dimensions = 0;
1462
            for (size_t i = 0; i < size; ++i) {
1463
                size_t element_dimensions = apply_visitor(*this, x[i]);
1464
                dimensions = std::max(dimensions, element_dimensions);
1465
            }
1466
            return 1 + dimensions;
1467
1
        } else {
1468
1
            return 0;
1469
1
        }
1470
1
    }
_ZN5doris12variant_util32FieldVisitorToNumberOfDimensions5applyILNS_13PrimitiveTypeE29EEEmRKNS_19PrimitiveTypeTraitsIXT_EE7CppTypeE
Line
Count
Source
1458
1
    size_t apply(const typename PrimitiveTypeTraits<T>::CppType& x) {
1459
        if constexpr (T == TYPE_ARRAY) {
1460
            const size_t size = x.size();
1461
            size_t dimensions = 0;
1462
            for (size_t i = 0; i < size; ++i) {
1463
                size_t element_dimensions = apply_visitor(*this, x[i]);
1464
                dimensions = std::max(dimensions, element_dimensions);
1465
            }
1466
            return 1 + dimensions;
1467
1
        } else {
1468
1
            return 0;
1469
1
        }
1470
1
    }
Unexecuted instantiation: _ZN5doris12variant_util32FieldVisitorToNumberOfDimensions5applyILNS_13PrimitiveTypeE20EEEmRKNS_19PrimitiveTypeTraitsIXT_EE7CppTypeE
_ZN5doris12variant_util32FieldVisitorToNumberOfDimensions5applyILNS_13PrimitiveTypeE30EEEmRKNS_19PrimitiveTypeTraitsIXT_EE7CppTypeE
Line
Count
Source
1458
80.3k
    size_t apply(const typename PrimitiveTypeTraits<T>::CppType& x) {
1459
        if constexpr (T == TYPE_ARRAY) {
1460
            const size_t size = x.size();
1461
            size_t dimensions = 0;
1462
            for (size_t i = 0; i < size; ++i) {
1463
                size_t element_dimensions = apply_visitor(*this, x[i]);
1464
                dimensions = std::max(dimensions, element_dimensions);
1465
            }
1466
            return 1 + dimensions;
1467
80.3k
        } else {
1468
80.3k
            return 0;
1469
80.3k
        }
1470
80.3k
    }
_ZN5doris12variant_util32FieldVisitorToNumberOfDimensions5applyILNS_13PrimitiveTypeE35EEEmRKNS_19PrimitiveTypeTraitsIXT_EE7CppTypeE
Line
Count
Source
1458
1
    size_t apply(const typename PrimitiveTypeTraits<T>::CppType& x) {
1459
        if constexpr (T == TYPE_ARRAY) {
1460
            const size_t size = x.size();
1461
            size_t dimensions = 0;
1462
            for (size_t i = 0; i < size; ++i) {
1463
                size_t element_dimensions = apply_visitor(*this, x[i]);
1464
                dimensions = std::max(dimensions, element_dimensions);
1465
            }
1466
            return 1 + dimensions;
1467
1
        } else {
1468
1
            return 0;
1469
1
        }
1470
1
    }
Unexecuted instantiation: _ZN5doris12variant_util32FieldVisitorToNumberOfDimensions5applyILNS_13PrimitiveTypeE22EEEmRKNS_19PrimitiveTypeTraitsIXT_EE7CppTypeE
Unexecuted instantiation: _ZN5doris12variant_util32FieldVisitorToNumberOfDimensions5applyILNS_13PrimitiveTypeE19EEEmRKNS_19PrimitiveTypeTraitsIXT_EE7CppTypeE
Unexecuted instantiation: _ZN5doris12variant_util32FieldVisitorToNumberOfDimensions5applyILNS_13PrimitiveTypeE24EEEmRKNS_19PrimitiveTypeTraitsIXT_EE7CppTypeE
_ZN5doris12variant_util32FieldVisitorToNumberOfDimensions5applyILNS_13PrimitiveTypeE31EEEmRKNS_19PrimitiveTypeTraitsIXT_EE7CppTypeE
Line
Count
Source
1458
46.8k
    size_t apply(const typename PrimitiveTypeTraits<T>::CppType& x) {
1459
        if constexpr (T == TYPE_ARRAY) {
1460
            const size_t size = x.size();
1461
            size_t dimensions = 0;
1462
            for (size_t i = 0; i < size; ++i) {
1463
                size_t element_dimensions = apply_visitor(*this, x[i]);
1464
                dimensions = std::max(dimensions, element_dimensions);
1465
            }
1466
            return 1 + dimensions;
1467
46.8k
        } else {
1468
46.8k
            return 0;
1469
46.8k
        }
1470
46.8k
    }
1471
};
1472
1473
// Visitor that allows to get type of scalar field
1474
// but exclude fields contain complex field.This is a faster version
1475
// for FieldVisitorToScalarType which does not support complex field.
1476
class SimpleFieldVisitorToScalarType : public StaticVisitor<size_t> {
1477
public:
1478
    template <PrimitiveType T>
1479
18.7M
    size_t apply(const typename PrimitiveTypeTraits<T>::CppType& x) {
1480
18.7M
        if constexpr (T == TYPE_ARRAY) {
1481
0
            throw doris::Exception(ErrorCode::INVALID_ARGUMENT, "Array type is not supported");
1482
109k
        } else if constexpr (T == TYPE_NULL) {
1483
109k
            have_nulls = true;
1484
109k
            return 1;
1485
18.6M
        } else {
1486
18.6M
            type = T;
1487
18.6M
            return 1;
1488
18.6M
        }
1489
18.7M
    }
_ZN5doris12variant_util30SimpleFieldVisitorToScalarType5applyILNS_13PrimitiveTypeE1EEEmRKNS_19PrimitiveTypeTraitsIXT_EE7CppTypeE
Line
Count
Source
1479
109k
    size_t apply(const typename PrimitiveTypeTraits<T>::CppType& x) {
1480
        if constexpr (T == TYPE_ARRAY) {
1481
            throw doris::Exception(ErrorCode::INVALID_ARGUMENT, "Array type is not supported");
1482
109k
        } else if constexpr (T == TYPE_NULL) {
1483
109k
            have_nulls = true;
1484
109k
            return 1;
1485
        } else {
1486
            type = T;
1487
            return 1;
1488
        }
1489
109k
    }
Unexecuted instantiation: _ZN5doris12variant_util30SimpleFieldVisitorToScalarType5applyILNS_13PrimitiveTypeE26EEEmRKNS_19PrimitiveTypeTraitsIXT_EE7CppTypeE
Unexecuted instantiation: _ZN5doris12variant_util30SimpleFieldVisitorToScalarType5applyILNS_13PrimitiveTypeE42EEEmRKNS_19PrimitiveTypeTraitsIXT_EE7CppTypeE
_ZN5doris12variant_util30SimpleFieldVisitorToScalarType5applyILNS_13PrimitiveTypeE7EEEmRKNS_19PrimitiveTypeTraitsIXT_EE7CppTypeE
Line
Count
Source
1479
12.3k
    size_t apply(const typename PrimitiveTypeTraits<T>::CppType& x) {
1480
        if constexpr (T == TYPE_ARRAY) {
1481
            throw doris::Exception(ErrorCode::INVALID_ARGUMENT, "Array type is not supported");
1482
        } else if constexpr (T == TYPE_NULL) {
1483
            have_nulls = true;
1484
            return 1;
1485
12.3k
        } else {
1486
12.3k
            type = T;
1487
12.3k
            return 1;
1488
12.3k
        }
1489
12.3k
    }
Unexecuted instantiation: _ZN5doris12variant_util30SimpleFieldVisitorToScalarType5applyILNS_13PrimitiveTypeE12EEEmRKNS_19PrimitiveTypeTraitsIXT_EE7CppTypeE
Unexecuted instantiation: _ZN5doris12variant_util30SimpleFieldVisitorToScalarType5applyILNS_13PrimitiveTypeE11EEEmRKNS_19PrimitiveTypeTraitsIXT_EE7CppTypeE
Unexecuted instantiation: _ZN5doris12variant_util30SimpleFieldVisitorToScalarType5applyILNS_13PrimitiveTypeE25EEEmRKNS_19PrimitiveTypeTraitsIXT_EE7CppTypeE
_ZN5doris12variant_util30SimpleFieldVisitorToScalarType5applyILNS_13PrimitiveTypeE2EEEmRKNS_19PrimitiveTypeTraitsIXT_EE7CppTypeE
Line
Count
Source
1479
273k
    size_t apply(const typename PrimitiveTypeTraits<T>::CppType& x) {
1480
        if constexpr (T == TYPE_ARRAY) {
1481
            throw doris::Exception(ErrorCode::INVALID_ARGUMENT, "Array type is not supported");
1482
        } else if constexpr (T == TYPE_NULL) {
1483
            have_nulls = true;
1484
            return 1;
1485
273k
        } else {
1486
273k
            type = T;
1487
273k
            return 1;
1488
273k
        }
1489
273k
    }
_ZN5doris12variant_util30SimpleFieldVisitorToScalarType5applyILNS_13PrimitiveTypeE3EEEmRKNS_19PrimitiveTypeTraitsIXT_EE7CppTypeE
Line
Count
Source
1479
2
    size_t apply(const typename PrimitiveTypeTraits<T>::CppType& x) {
1480
        if constexpr (T == TYPE_ARRAY) {
1481
            throw doris::Exception(ErrorCode::INVALID_ARGUMENT, "Array type is not supported");
1482
        } else if constexpr (T == TYPE_NULL) {
1483
            have_nulls = true;
1484
            return 1;
1485
2
        } else {
1486
2
            type = T;
1487
2
            return 1;
1488
2
        }
1489
2
    }
_ZN5doris12variant_util30SimpleFieldVisitorToScalarType5applyILNS_13PrimitiveTypeE4EEEmRKNS_19PrimitiveTypeTraitsIXT_EE7CppTypeE
Line
Count
Source
1479
7
    size_t apply(const typename PrimitiveTypeTraits<T>::CppType& x) {
1480
        if constexpr (T == TYPE_ARRAY) {
1481
            throw doris::Exception(ErrorCode::INVALID_ARGUMENT, "Array type is not supported");
1482
        } else if constexpr (T == TYPE_NULL) {
1483
            have_nulls = true;
1484
            return 1;
1485
7
        } else {
1486
7
            type = T;
1487
7
            return 1;
1488
7
        }
1489
7
    }
_ZN5doris12variant_util30SimpleFieldVisitorToScalarType5applyILNS_13PrimitiveTypeE5EEEmRKNS_19PrimitiveTypeTraitsIXT_EE7CppTypeE
Line
Count
Source
1479
570
    size_t apply(const typename PrimitiveTypeTraits<T>::CppType& x) {
1480
        if constexpr (T == TYPE_ARRAY) {
1481
            throw doris::Exception(ErrorCode::INVALID_ARGUMENT, "Array type is not supported");
1482
        } else if constexpr (T == TYPE_NULL) {
1483
            have_nulls = true;
1484
            return 1;
1485
570
        } else {
1486
570
            type = T;
1487
570
            return 1;
1488
570
        }
1489
570
    }
_ZN5doris12variant_util30SimpleFieldVisitorToScalarType5applyILNS_13PrimitiveTypeE6EEEmRKNS_19PrimitiveTypeTraitsIXT_EE7CppTypeE
Line
Count
Source
1479
4.93M
    size_t apply(const typename PrimitiveTypeTraits<T>::CppType& x) {
1480
        if constexpr (T == TYPE_ARRAY) {
1481
            throw doris::Exception(ErrorCode::INVALID_ARGUMENT, "Array type is not supported");
1482
        } else if constexpr (T == TYPE_NULL) {
1483
            have_nulls = true;
1484
            return 1;
1485
4.93M
        } else {
1486
4.93M
            type = T;
1487
4.93M
            return 1;
1488
4.93M
        }
1489
4.93M
    }
Unexecuted instantiation: _ZN5doris12variant_util30SimpleFieldVisitorToScalarType5applyILNS_13PrimitiveTypeE38EEEmRKNS_19PrimitiveTypeTraitsIXT_EE7CppTypeE
Unexecuted instantiation: _ZN5doris12variant_util30SimpleFieldVisitorToScalarType5applyILNS_13PrimitiveTypeE39EEEmRKNS_19PrimitiveTypeTraitsIXT_EE7CppTypeE
_ZN5doris12variant_util30SimpleFieldVisitorToScalarType5applyILNS_13PrimitiveTypeE8EEEmRKNS_19PrimitiveTypeTraitsIXT_EE7CppTypeE
Line
Count
Source
1479
1
    size_t apply(const typename PrimitiveTypeTraits<T>::CppType& x) {
1480
        if constexpr (T == TYPE_ARRAY) {
1481
            throw doris::Exception(ErrorCode::INVALID_ARGUMENT, "Array type is not supported");
1482
        } else if constexpr (T == TYPE_NULL) {
1483
            have_nulls = true;
1484
            return 1;
1485
1
        } else {
1486
1
            type = T;
1487
1
            return 1;
1488
1
        }
1489
1
    }
Unexecuted instantiation: _ZN5doris12variant_util30SimpleFieldVisitorToScalarType5applyILNS_13PrimitiveTypeE27EEEmRKNS_19PrimitiveTypeTraitsIXT_EE7CppTypeE
_ZN5doris12variant_util30SimpleFieldVisitorToScalarType5applyILNS_13PrimitiveTypeE9EEEmRKNS_19PrimitiveTypeTraitsIXT_EE7CppTypeE
Line
Count
Source
1479
2.76M
    size_t apply(const typename PrimitiveTypeTraits<T>::CppType& x) {
1480
        if constexpr (T == TYPE_ARRAY) {
1481
            throw doris::Exception(ErrorCode::INVALID_ARGUMENT, "Array type is not supported");
1482
        } else if constexpr (T == TYPE_NULL) {
1483
            have_nulls = true;
1484
            return 1;
1485
2.76M
        } else {
1486
2.76M
            type = T;
1487
2.76M
            return 1;
1488
2.76M
        }
1489
2.76M
    }
Unexecuted instantiation: _ZN5doris12variant_util30SimpleFieldVisitorToScalarType5applyILNS_13PrimitiveTypeE36EEEmRKNS_19PrimitiveTypeTraitsIXT_EE7CppTypeE
Unexecuted instantiation: _ZN5doris12variant_util30SimpleFieldVisitorToScalarType5applyILNS_13PrimitiveTypeE37EEEmRKNS_19PrimitiveTypeTraitsIXT_EE7CppTypeE
_ZN5doris12variant_util30SimpleFieldVisitorToScalarType5applyILNS_13PrimitiveTypeE23EEEmRKNS_19PrimitiveTypeTraitsIXT_EE7CppTypeE
Line
Count
Source
1479
10.5M
    size_t apply(const typename PrimitiveTypeTraits<T>::CppType& x) {
1480
        if constexpr (T == TYPE_ARRAY) {
1481
            throw doris::Exception(ErrorCode::INVALID_ARGUMENT, "Array type is not supported");
1482
        } else if constexpr (T == TYPE_NULL) {
1483
            have_nulls = true;
1484
            return 1;
1485
10.5M
        } else {
1486
10.5M
            type = T;
1487
10.5M
            return 1;
1488
10.5M
        }
1489
10.5M
    }
Unexecuted instantiation: _ZN5doris12variant_util30SimpleFieldVisitorToScalarType5applyILNS_13PrimitiveTypeE15EEEmRKNS_19PrimitiveTypeTraitsIXT_EE7CppTypeE
Unexecuted instantiation: _ZN5doris12variant_util30SimpleFieldVisitorToScalarType5applyILNS_13PrimitiveTypeE10EEEmRKNS_19PrimitiveTypeTraitsIXT_EE7CppTypeE
Unexecuted instantiation: _ZN5doris12variant_util30SimpleFieldVisitorToScalarType5applyILNS_13PrimitiveTypeE41EEEmRKNS_19PrimitiveTypeTraitsIXT_EE7CppTypeE
Unexecuted instantiation: _ZN5doris12variant_util30SimpleFieldVisitorToScalarType5applyILNS_13PrimitiveTypeE17EEEmRKNS_19PrimitiveTypeTraitsIXT_EE7CppTypeE
Unexecuted instantiation: _ZN5doris12variant_util30SimpleFieldVisitorToScalarType5applyILNS_13PrimitiveTypeE16EEEmRKNS_19PrimitiveTypeTraitsIXT_EE7CppTypeE
Unexecuted instantiation: _ZN5doris12variant_util30SimpleFieldVisitorToScalarType5applyILNS_13PrimitiveTypeE18EEEmRKNS_19PrimitiveTypeTraitsIXT_EE7CppTypeE
Unexecuted instantiation: _ZN5doris12variant_util30SimpleFieldVisitorToScalarType5applyILNS_13PrimitiveTypeE32EEEmRKNS_19PrimitiveTypeTraitsIXT_EE7CppTypeE
Unexecuted instantiation: _ZN5doris12variant_util30SimpleFieldVisitorToScalarType5applyILNS_13PrimitiveTypeE28EEEmRKNS_19PrimitiveTypeTraitsIXT_EE7CppTypeE
Unexecuted instantiation: _ZN5doris12variant_util30SimpleFieldVisitorToScalarType5applyILNS_13PrimitiveTypeE29EEEmRKNS_19PrimitiveTypeTraitsIXT_EE7CppTypeE
Unexecuted instantiation: _ZN5doris12variant_util30SimpleFieldVisitorToScalarType5applyILNS_13PrimitiveTypeE20EEEmRKNS_19PrimitiveTypeTraitsIXT_EE7CppTypeE
Unexecuted instantiation: _ZN5doris12variant_util30SimpleFieldVisitorToScalarType5applyILNS_13PrimitiveTypeE30EEEmRKNS_19PrimitiveTypeTraitsIXT_EE7CppTypeE
Unexecuted instantiation: _ZN5doris12variant_util30SimpleFieldVisitorToScalarType5applyILNS_13PrimitiveTypeE35EEEmRKNS_19PrimitiveTypeTraitsIXT_EE7CppTypeE
Unexecuted instantiation: _ZN5doris12variant_util30SimpleFieldVisitorToScalarType5applyILNS_13PrimitiveTypeE22EEEmRKNS_19PrimitiveTypeTraitsIXT_EE7CppTypeE
Unexecuted instantiation: _ZN5doris12variant_util30SimpleFieldVisitorToScalarType5applyILNS_13PrimitiveTypeE19EEEmRKNS_19PrimitiveTypeTraitsIXT_EE7CppTypeE
Unexecuted instantiation: _ZN5doris12variant_util30SimpleFieldVisitorToScalarType5applyILNS_13PrimitiveTypeE24EEEmRKNS_19PrimitiveTypeTraitsIXT_EE7CppTypeE
_ZN5doris12variant_util30SimpleFieldVisitorToScalarType5applyILNS_13PrimitiveTypeE31EEEmRKNS_19PrimitiveTypeTraitsIXT_EE7CppTypeE
Line
Count
Source
1479
46.8k
    size_t apply(const typename PrimitiveTypeTraits<T>::CppType& x) {
1480
        if constexpr (T == TYPE_ARRAY) {
1481
            throw doris::Exception(ErrorCode::INVALID_ARGUMENT, "Array type is not supported");
1482
        } else if constexpr (T == TYPE_NULL) {
1483
            have_nulls = true;
1484
            return 1;
1485
46.8k
        } else {
1486
46.8k
            type = T;
1487
46.8k
            return 1;
1488
46.8k
        }
1489
46.8k
    }
1490
18.4M
    void get_scalar_type(PrimitiveType* data_type) const { *data_type = type; }
1491
18.4M
    bool contain_nulls() const { return have_nulls; }
1492
1493
18.4M
    bool need_convert_field() const { return false; }
1494
1495
private:
1496
    PrimitiveType type = PrimitiveType::INVALID_TYPE;
1497
    bool have_nulls = false;
1498
};
1499
1500
/// Visitor that allows to get type of scalar field
1501
/// or least common type of scalars in array.
1502
/// More optimized version of FieldToDataType.
1503
class FieldVisitorToScalarType : public StaticVisitor<size_t> {
1504
public:
1505
    template <PrimitiveType T>
1506
4.80M
    size_t apply(const typename PrimitiveTypeTraits<T>::CppType& x) {
1507
4.80M
        if constexpr (T == TYPE_ARRAY) {
1508
2.26M
            size_t size = x.size();
1509
5.44M
            for (size_t i = 0; i < size; ++i) {
1510
3.18M
                apply_visitor(*this, x[i]);
1511
3.18M
            }
1512
2.26M
            return 0;
1513
2.26M
        } else if constexpr (T == TYPE_NULL) {
1514
11.6k
            have_nulls = true;
1515
11.6k
            return 0;
1516
2.52M
        } else {
1517
2.52M
            field_types.insert(T);
1518
2.52M
            type_indexes.insert(T);
1519
2.52M
            return 0;
1520
2.52M
        }
1521
4.80M
    }
_ZN5doris12variant_util24FieldVisitorToScalarType5applyILNS_13PrimitiveTypeE1EEEmRKNS_19PrimitiveTypeTraitsIXT_EE7CppTypeE
Line
Count
Source
1506
11.6k
    size_t apply(const typename PrimitiveTypeTraits<T>::CppType& x) {
1507
        if constexpr (T == TYPE_ARRAY) {
1508
            size_t size = x.size();
1509
            for (size_t i = 0; i < size; ++i) {
1510
                apply_visitor(*this, x[i]);
1511
            }
1512
            return 0;
1513
11.6k
        } else if constexpr (T == TYPE_NULL) {
1514
11.6k
            have_nulls = true;
1515
11.6k
            return 0;
1516
        } else {
1517
            field_types.insert(T);
1518
            type_indexes.insert(T);
1519
            return 0;
1520
        }
1521
11.6k
    }
_ZN5doris12variant_util24FieldVisitorToScalarType5applyILNS_13PrimitiveTypeE26EEEmRKNS_19PrimitiveTypeTraitsIXT_EE7CppTypeE
Line
Count
Source
1506
1
    size_t apply(const typename PrimitiveTypeTraits<T>::CppType& x) {
1507
        if constexpr (T == TYPE_ARRAY) {
1508
            size_t size = x.size();
1509
            for (size_t i = 0; i < size; ++i) {
1510
                apply_visitor(*this, x[i]);
1511
            }
1512
            return 0;
1513
        } else if constexpr (T == TYPE_NULL) {
1514
            have_nulls = true;
1515
            return 0;
1516
1
        } else {
1517
1
            field_types.insert(T);
1518
1
            type_indexes.insert(T);
1519
1
            return 0;
1520
1
        }
1521
1
    }
Unexecuted instantiation: _ZN5doris12variant_util24FieldVisitorToScalarType5applyILNS_13PrimitiveTypeE42EEEmRKNS_19PrimitiveTypeTraitsIXT_EE7CppTypeE
_ZN5doris12variant_util24FieldVisitorToScalarType5applyILNS_13PrimitiveTypeE7EEEmRKNS_19PrimitiveTypeTraitsIXT_EE7CppTypeE
Line
Count
Source
1506
28.6k
    size_t apply(const typename PrimitiveTypeTraits<T>::CppType& x) {
1507
        if constexpr (T == TYPE_ARRAY) {
1508
            size_t size = x.size();
1509
            for (size_t i = 0; i < size; ++i) {
1510
                apply_visitor(*this, x[i]);
1511
            }
1512
            return 0;
1513
        } else if constexpr (T == TYPE_NULL) {
1514
            have_nulls = true;
1515
            return 0;
1516
28.6k
        } else {
1517
28.6k
            field_types.insert(T);
1518
28.6k
            type_indexes.insert(T);
1519
28.6k
            return 0;
1520
28.6k
        }
1521
28.6k
    }
Unexecuted instantiation: _ZN5doris12variant_util24FieldVisitorToScalarType5applyILNS_13PrimitiveTypeE12EEEmRKNS_19PrimitiveTypeTraitsIXT_EE7CppTypeE
Unexecuted instantiation: _ZN5doris12variant_util24FieldVisitorToScalarType5applyILNS_13PrimitiveTypeE11EEEmRKNS_19PrimitiveTypeTraitsIXT_EE7CppTypeE
_ZN5doris12variant_util24FieldVisitorToScalarType5applyILNS_13PrimitiveTypeE25EEEmRKNS_19PrimitiveTypeTraitsIXT_EE7CppTypeE
Line
Count
Source
1506
1
    size_t apply(const typename PrimitiveTypeTraits<T>::CppType& x) {
1507
        if constexpr (T == TYPE_ARRAY) {
1508
            size_t size = x.size();
1509
            for (size_t i = 0; i < size; ++i) {
1510
                apply_visitor(*this, x[i]);
1511
            }
1512
            return 0;
1513
        } else if constexpr (T == TYPE_NULL) {
1514
            have_nulls = true;
1515
            return 0;
1516
1
        } else {
1517
1
            field_types.insert(T);
1518
1
            type_indexes.insert(T);
1519
1
            return 0;
1520
1
        }
1521
1
    }
_ZN5doris12variant_util24FieldVisitorToScalarType5applyILNS_13PrimitiveTypeE2EEEmRKNS_19PrimitiveTypeTraitsIXT_EE7CppTypeE
Line
Count
Source
1506
57.2k
    size_t apply(const typename PrimitiveTypeTraits<T>::CppType& x) {
1507
        if constexpr (T == TYPE_ARRAY) {
1508
            size_t size = x.size();
1509
            for (size_t i = 0; i < size; ++i) {
1510
                apply_visitor(*this, x[i]);
1511
            }
1512
            return 0;
1513
        } else if constexpr (T == TYPE_NULL) {
1514
            have_nulls = true;
1515
            return 0;
1516
57.2k
        } else {
1517
57.2k
            field_types.insert(T);
1518
57.2k
            type_indexes.insert(T);
1519
57.2k
            return 0;
1520
57.2k
        }
1521
57.2k
    }
_ZN5doris12variant_util24FieldVisitorToScalarType5applyILNS_13PrimitiveTypeE3EEEmRKNS_19PrimitiveTypeTraitsIXT_EE7CppTypeE
Line
Count
Source
1506
5
    size_t apply(const typename PrimitiveTypeTraits<T>::CppType& x) {
1507
        if constexpr (T == TYPE_ARRAY) {
1508
            size_t size = x.size();
1509
            for (size_t i = 0; i < size; ++i) {
1510
                apply_visitor(*this, x[i]);
1511
            }
1512
            return 0;
1513
        } else if constexpr (T == TYPE_NULL) {
1514
            have_nulls = true;
1515
            return 0;
1516
5
        } else {
1517
5
            field_types.insert(T);
1518
5
            type_indexes.insert(T);
1519
5
            return 0;
1520
5
        }
1521
5
    }
_ZN5doris12variant_util24FieldVisitorToScalarType5applyILNS_13PrimitiveTypeE4EEEmRKNS_19PrimitiveTypeTraitsIXT_EE7CppTypeE
Line
Count
Source
1506
1
    size_t apply(const typename PrimitiveTypeTraits<T>::CppType& x) {
1507
        if constexpr (T == TYPE_ARRAY) {
1508
            size_t size = x.size();
1509
            for (size_t i = 0; i < size; ++i) {
1510
                apply_visitor(*this, x[i]);
1511
            }
1512
            return 0;
1513
        } else if constexpr (T == TYPE_NULL) {
1514
            have_nulls = true;
1515
            return 0;
1516
1
        } else {
1517
1
            field_types.insert(T);
1518
1
            type_indexes.insert(T);
1519
1
            return 0;
1520
1
        }
1521
1
    }
_ZN5doris12variant_util24FieldVisitorToScalarType5applyILNS_13PrimitiveTypeE5EEEmRKNS_19PrimitiveTypeTraitsIXT_EE7CppTypeE
Line
Count
Source
1506
382
    size_t apply(const typename PrimitiveTypeTraits<T>::CppType& x) {
1507
        if constexpr (T == TYPE_ARRAY) {
1508
            size_t size = x.size();
1509
            for (size_t i = 0; i < size; ++i) {
1510
                apply_visitor(*this, x[i]);
1511
            }
1512
            return 0;
1513
        } else if constexpr (T == TYPE_NULL) {
1514
            have_nulls = true;
1515
            return 0;
1516
382
        } else {
1517
382
            field_types.insert(T);
1518
382
            type_indexes.insert(T);
1519
382
            return 0;
1520
382
        }
1521
382
    }
_ZN5doris12variant_util24FieldVisitorToScalarType5applyILNS_13PrimitiveTypeE6EEEmRKNS_19PrimitiveTypeTraitsIXT_EE7CppTypeE
Line
Count
Source
1506
1.41M
    size_t apply(const typename PrimitiveTypeTraits<T>::CppType& x) {
1507
        if constexpr (T == TYPE_ARRAY) {
1508
            size_t size = x.size();
1509
            for (size_t i = 0; i < size; ++i) {
1510
                apply_visitor(*this, x[i]);
1511
            }
1512
            return 0;
1513
        } else if constexpr (T == TYPE_NULL) {
1514
            have_nulls = true;
1515
            return 0;
1516
1.41M
        } else {
1517
1.41M
            field_types.insert(T);
1518
1.41M
            type_indexes.insert(T);
1519
1.41M
            return 0;
1520
1.41M
        }
1521
1.41M
    }
Unexecuted instantiation: _ZN5doris12variant_util24FieldVisitorToScalarType5applyILNS_13PrimitiveTypeE38EEEmRKNS_19PrimitiveTypeTraitsIXT_EE7CppTypeE
Unexecuted instantiation: _ZN5doris12variant_util24FieldVisitorToScalarType5applyILNS_13PrimitiveTypeE39EEEmRKNS_19PrimitiveTypeTraitsIXT_EE7CppTypeE
_ZN5doris12variant_util24FieldVisitorToScalarType5applyILNS_13PrimitiveTypeE8EEEmRKNS_19PrimitiveTypeTraitsIXT_EE7CppTypeE
Line
Count
Source
1506
1
    size_t apply(const typename PrimitiveTypeTraits<T>::CppType& x) {
1507
        if constexpr (T == TYPE_ARRAY) {
1508
            size_t size = x.size();
1509
            for (size_t i = 0; i < size; ++i) {
1510
                apply_visitor(*this, x[i]);
1511
            }
1512
            return 0;
1513
        } else if constexpr (T == TYPE_NULL) {
1514
            have_nulls = true;
1515
            return 0;
1516
1
        } else {
1517
1
            field_types.insert(T);
1518
1
            type_indexes.insert(T);
1519
1
            return 0;
1520
1
        }
1521
1
    }
Unexecuted instantiation: _ZN5doris12variant_util24FieldVisitorToScalarType5applyILNS_13PrimitiveTypeE27EEEmRKNS_19PrimitiveTypeTraitsIXT_EE7CppTypeE
_ZN5doris12variant_util24FieldVisitorToScalarType5applyILNS_13PrimitiveTypeE9EEEmRKNS_19PrimitiveTypeTraitsIXT_EE7CppTypeE
Line
Count
Source
1506
182k
    size_t apply(const typename PrimitiveTypeTraits<T>::CppType& x) {
1507
        if constexpr (T == TYPE_ARRAY) {
1508
            size_t size = x.size();
1509
            for (size_t i = 0; i < size; ++i) {
1510
                apply_visitor(*this, x[i]);
1511
            }
1512
            return 0;
1513
        } else if constexpr (T == TYPE_NULL) {
1514
            have_nulls = true;
1515
            return 0;
1516
182k
        } else {
1517
182k
            field_types.insert(T);
1518
182k
            type_indexes.insert(T);
1519
182k
            return 0;
1520
182k
        }
1521
182k
    }
_ZN5doris12variant_util24FieldVisitorToScalarType5applyILNS_13PrimitiveTypeE36EEEmRKNS_19PrimitiveTypeTraitsIXT_EE7CppTypeE
Line
Count
Source
1506
1
    size_t apply(const typename PrimitiveTypeTraits<T>::CppType& x) {
1507
        if constexpr (T == TYPE_ARRAY) {
1508
            size_t size = x.size();
1509
            for (size_t i = 0; i < size; ++i) {
1510
                apply_visitor(*this, x[i]);
1511
            }
1512
            return 0;
1513
        } else if constexpr (T == TYPE_NULL) {
1514
            have_nulls = true;
1515
            return 0;
1516
1
        } else {
1517
1
            field_types.insert(T);
1518
1
            type_indexes.insert(T);
1519
1
            return 0;
1520
1
        }
1521
1
    }
_ZN5doris12variant_util24FieldVisitorToScalarType5applyILNS_13PrimitiveTypeE37EEEmRKNS_19PrimitiveTypeTraitsIXT_EE7CppTypeE
Line
Count
Source
1506
95
    size_t apply(const typename PrimitiveTypeTraits<T>::CppType& x) {
1507
        if constexpr (T == TYPE_ARRAY) {
1508
            size_t size = x.size();
1509
            for (size_t i = 0; i < size; ++i) {
1510
                apply_visitor(*this, x[i]);
1511
            }
1512
            return 0;
1513
        } else if constexpr (T == TYPE_NULL) {
1514
            have_nulls = true;
1515
            return 0;
1516
95
        } else {
1517
95
            field_types.insert(T);
1518
95
            type_indexes.insert(T);
1519
95
            return 0;
1520
95
        }
1521
95
    }
_ZN5doris12variant_util24FieldVisitorToScalarType5applyILNS_13PrimitiveTypeE23EEEmRKNS_19PrimitiveTypeTraitsIXT_EE7CppTypeE
Line
Count
Source
1506
760k
    size_t apply(const typename PrimitiveTypeTraits<T>::CppType& x) {
1507
        if constexpr (T == TYPE_ARRAY) {
1508
            size_t size = x.size();
1509
            for (size_t i = 0; i < size; ++i) {
1510
                apply_visitor(*this, x[i]);
1511
            }
1512
            return 0;
1513
        } else if constexpr (T == TYPE_NULL) {
1514
            have_nulls = true;
1515
            return 0;
1516
760k
        } else {
1517
760k
            field_types.insert(T);
1518
760k
            type_indexes.insert(T);
1519
760k
            return 0;
1520
760k
        }
1521
760k
    }
Unexecuted instantiation: _ZN5doris12variant_util24FieldVisitorToScalarType5applyILNS_13PrimitiveTypeE15EEEmRKNS_19PrimitiveTypeTraitsIXT_EE7CppTypeE
Unexecuted instantiation: _ZN5doris12variant_util24FieldVisitorToScalarType5applyILNS_13PrimitiveTypeE10EEEmRKNS_19PrimitiveTypeTraitsIXT_EE7CppTypeE
Unexecuted instantiation: _ZN5doris12variant_util24FieldVisitorToScalarType5applyILNS_13PrimitiveTypeE41EEEmRKNS_19PrimitiveTypeTraitsIXT_EE7CppTypeE
_ZN5doris12variant_util24FieldVisitorToScalarType5applyILNS_13PrimitiveTypeE17EEEmRKNS_19PrimitiveTypeTraitsIXT_EE7CppTypeE
Line
Count
Source
1506
2.26M
    size_t apply(const typename PrimitiveTypeTraits<T>::CppType& x) {
1507
2.26M
        if constexpr (T == TYPE_ARRAY) {
1508
2.26M
            size_t size = x.size();
1509
5.44M
            for (size_t i = 0; i < size; ++i) {
1510
3.18M
                apply_visitor(*this, x[i]);
1511
3.18M
            }
1512
2.26M
            return 0;
1513
        } else if constexpr (T == TYPE_NULL) {
1514
            have_nulls = true;
1515
            return 0;
1516
        } else {
1517
            field_types.insert(T);
1518
            type_indexes.insert(T);
1519
            return 0;
1520
        }
1521
2.26M
    }
_ZN5doris12variant_util24FieldVisitorToScalarType5applyILNS_13PrimitiveTypeE16EEEmRKNS_19PrimitiveTypeTraitsIXT_EE7CppTypeE
Line
Count
Source
1506
1
    size_t apply(const typename PrimitiveTypeTraits<T>::CppType& x) {
1507
        if constexpr (T == TYPE_ARRAY) {
1508
            size_t size = x.size();
1509
            for (size_t i = 0; i < size; ++i) {
1510
                apply_visitor(*this, x[i]);
1511
            }
1512
            return 0;
1513
        } else if constexpr (T == TYPE_NULL) {
1514
            have_nulls = true;
1515
            return 0;
1516
1
        } else {
1517
1
            field_types.insert(T);
1518
1
            type_indexes.insert(T);
1519
1
            return 0;
1520
1
        }
1521
1
    }
Unexecuted instantiation: _ZN5doris12variant_util24FieldVisitorToScalarType5applyILNS_13PrimitiveTypeE18EEEmRKNS_19PrimitiveTypeTraitsIXT_EE7CppTypeE
_ZN5doris12variant_util24FieldVisitorToScalarType5applyILNS_13PrimitiveTypeE32EEEmRKNS_19PrimitiveTypeTraitsIXT_EE7CppTypeE
Line
Count
Source
1506
1
    size_t apply(const typename PrimitiveTypeTraits<T>::CppType& x) {
1507
        if constexpr (T == TYPE_ARRAY) {
1508
            size_t size = x.size();
1509
            for (size_t i = 0; i < size; ++i) {
1510
                apply_visitor(*this, x[i]);
1511
            }
1512
            return 0;
1513
        } else if constexpr (T == TYPE_NULL) {
1514
            have_nulls = true;
1515
            return 0;
1516
1
        } else {
1517
1
            field_types.insert(T);
1518
1
            type_indexes.insert(T);
1519
1
            return 0;
1520
1
        }
1521
1
    }
_ZN5doris12variant_util24FieldVisitorToScalarType5applyILNS_13PrimitiveTypeE28EEEmRKNS_19PrimitiveTypeTraitsIXT_EE7CppTypeE
Line
Count
Source
1506
1
    size_t apply(const typename PrimitiveTypeTraits<T>::CppType& x) {
1507
        if constexpr (T == TYPE_ARRAY) {
1508
            size_t size = x.size();
1509
            for (size_t i = 0; i < size; ++i) {
1510
                apply_visitor(*this, x[i]);
1511
            }
1512
            return 0;
1513
        } else if constexpr (T == TYPE_NULL) {
1514
            have_nulls = true;
1515
            return 0;
1516
1
        } else {
1517
1
            field_types.insert(T);
1518
1
            type_indexes.insert(T);
1519
1
            return 0;
1520
1
        }
1521
1
    }
_ZN5doris12variant_util24FieldVisitorToScalarType5applyILNS_13PrimitiveTypeE29EEEmRKNS_19PrimitiveTypeTraitsIXT_EE7CppTypeE
Line
Count
Source
1506
1
    size_t apply(const typename PrimitiveTypeTraits<T>::CppType& x) {
1507
        if constexpr (T == TYPE_ARRAY) {
1508
            size_t size = x.size();
1509
            for (size_t i = 0; i < size; ++i) {
1510
                apply_visitor(*this, x[i]);
1511
            }
1512
            return 0;
1513
        } else if constexpr (T == TYPE_NULL) {
1514
            have_nulls = true;
1515
            return 0;
1516
1
        } else {
1517
1
            field_types.insert(T);
1518
1
            type_indexes.insert(T);
1519
1
            return 0;
1520
1
        }
1521
1
    }
Unexecuted instantiation: _ZN5doris12variant_util24FieldVisitorToScalarType5applyILNS_13PrimitiveTypeE20EEEmRKNS_19PrimitiveTypeTraitsIXT_EE7CppTypeE
_ZN5doris12variant_util24FieldVisitorToScalarType5applyILNS_13PrimitiveTypeE30EEEmRKNS_19PrimitiveTypeTraitsIXT_EE7CppTypeE
Line
Count
Source
1506
80.3k
    size_t apply(const typename PrimitiveTypeTraits<T>::CppType& x) {
1507
        if constexpr (T == TYPE_ARRAY) {
1508
            size_t size = x.size();
1509
            for (size_t i = 0; i < size; ++i) {
1510
                apply_visitor(*this, x[i]);
1511
            }
1512
            return 0;
1513
        } else if constexpr (T == TYPE_NULL) {
1514
            have_nulls = true;
1515
            return 0;
1516
80.3k
        } else {
1517
80.3k
            field_types.insert(T);
1518
80.3k
            type_indexes.insert(T);
1519
80.3k
            return 0;
1520
80.3k
        }
1521
80.3k
    }
_ZN5doris12variant_util24FieldVisitorToScalarType5applyILNS_13PrimitiveTypeE35EEEmRKNS_19PrimitiveTypeTraitsIXT_EE7CppTypeE
Line
Count
Source
1506
1
    size_t apply(const typename PrimitiveTypeTraits<T>::CppType& x) {
1507
        if constexpr (T == TYPE_ARRAY) {
1508
            size_t size = x.size();
1509
            for (size_t i = 0; i < size; ++i) {
1510
                apply_visitor(*this, x[i]);
1511
            }
1512
            return 0;
1513
        } else if constexpr (T == TYPE_NULL) {
1514
            have_nulls = true;
1515
            return 0;
1516
1
        } else {
1517
1
            field_types.insert(T);
1518
1
            type_indexes.insert(T);
1519
1
            return 0;
1520
1
        }
1521
1
    }
Unexecuted instantiation: _ZN5doris12variant_util24FieldVisitorToScalarType5applyILNS_13PrimitiveTypeE22EEEmRKNS_19PrimitiveTypeTraitsIXT_EE7CppTypeE
Unexecuted instantiation: _ZN5doris12variant_util24FieldVisitorToScalarType5applyILNS_13PrimitiveTypeE19EEEmRKNS_19PrimitiveTypeTraitsIXT_EE7CppTypeE
Unexecuted instantiation: _ZN5doris12variant_util24FieldVisitorToScalarType5applyILNS_13PrimitiveTypeE24EEEmRKNS_19PrimitiveTypeTraitsIXT_EE7CppTypeE
_ZN5doris12variant_util24FieldVisitorToScalarType5applyILNS_13PrimitiveTypeE31EEEmRKNS_19PrimitiveTypeTraitsIXT_EE7CppTypeE
Line
Count
Source
1506
26
    size_t apply(const typename PrimitiveTypeTraits<T>::CppType& x) {
1507
        if constexpr (T == TYPE_ARRAY) {
1508
            size_t size = x.size();
1509
            for (size_t i = 0; i < size; ++i) {
1510
                apply_visitor(*this, x[i]);
1511
            }
1512
            return 0;
1513
        } else if constexpr (T == TYPE_NULL) {
1514
            have_nulls = true;
1515
            return 0;
1516
26
        } else {
1517
26
            field_types.insert(T);
1518
26
            type_indexes.insert(T);
1519
26
            return 0;
1520
26
        }
1521
26
    }
1522
1.61M
    void get_scalar_type(PrimitiveType* type) const {
1523
1.61M
        if (type_indexes.size() == 1) {
1524
            // Most cases will have only one type
1525
1.53M
            *type = *type_indexes.begin();
1526
1.53M
            return;
1527
1.53M
        }
1528
74.1k
        DataTypePtr data_type;
1529
74.1k
        get_least_supertype_jsonb(type_indexes, &data_type);
1530
74.1k
        *type = data_type->get_primitive_type();
1531
74.1k
    }
1532
1.61M
    bool contain_nulls() const { return have_nulls; }
1533
1.61M
    bool need_convert_field() const { return field_types.size() > 1; }
1534
1535
private:
1536
    phmap::flat_hash_set<PrimitiveType> type_indexes;
1537
    phmap::flat_hash_set<PrimitiveType> field_types;
1538
    bool have_nulls = false;
1539
};
1540
1541
template <typename Visitor>
1542
20.4M
void get_field_info_impl(const Field& field, FieldInfo* info) {
1543
20.4M
    Visitor to_scalar_type_visitor;
1544
20.4M
    apply_visitor(to_scalar_type_visitor, field);
1545
20.4M
    PrimitiveType type_id;
1546
20.4M
    to_scalar_type_visitor.get_scalar_type(&type_id);
1547
    // array item's dimension may missmatch, eg. [1, 2, [1, 2, 3]]
1548
20.4M
    *info = {type_id, to_scalar_type_visitor.contain_nulls(),
1549
20.4M
             to_scalar_type_visitor.need_convert_field(),
1550
20.4M
             apply_visitor(FieldVisitorToNumberOfDimensions(), field)};
1551
20.4M
}
_ZN5doris12variant_util19get_field_info_implINS0_24FieldVisitorToScalarTypeEEEvRKNS_5FieldEPNS_9FieldInfoE
Line
Count
Source
1542
1.61M
void get_field_info_impl(const Field& field, FieldInfo* info) {
1543
1.61M
    Visitor to_scalar_type_visitor;
1544
1.61M
    apply_visitor(to_scalar_type_visitor, field);
1545
1.61M
    PrimitiveType type_id;
1546
1.61M
    to_scalar_type_visitor.get_scalar_type(&type_id);
1547
    // array item's dimension may missmatch, eg. [1, 2, [1, 2, 3]]
1548
1.61M
    *info = {type_id, to_scalar_type_visitor.contain_nulls(),
1549
1.61M
             to_scalar_type_visitor.need_convert_field(),
1550
1.61M
             apply_visitor(FieldVisitorToNumberOfDimensions(), field)};
1551
1.61M
}
_ZN5doris12variant_util19get_field_info_implINS0_30SimpleFieldVisitorToScalarTypeEEEvRKNS_5FieldEPNS_9FieldInfoE
Line
Count
Source
1542
18.8M
void get_field_info_impl(const Field& field, FieldInfo* info) {
1543
18.8M
    Visitor to_scalar_type_visitor;
1544
18.8M
    apply_visitor(to_scalar_type_visitor, field);
1545
18.8M
    PrimitiveType type_id;
1546
18.8M
    to_scalar_type_visitor.get_scalar_type(&type_id);
1547
    // array item's dimension may missmatch, eg. [1, 2, [1, 2, 3]]
1548
18.8M
    *info = {type_id, to_scalar_type_visitor.contain_nulls(),
1549
18.8M
             to_scalar_type_visitor.need_convert_field(),
1550
18.8M
             apply_visitor(FieldVisitorToNumberOfDimensions(), field)};
1551
18.8M
}
1552
1553
20.4M
void get_field_info(const Field& field, FieldInfo* info) {
1554
20.4M
    if (field.is_complex_field()) {
1555
1.61M
        get_field_info_impl<FieldVisitorToScalarType>(field, info);
1556
18.7M
    } else {
1557
18.7M
        get_field_info_impl<SimpleFieldVisitorToScalarType>(field, info);
1558
18.7M
    }
1559
20.4M
}
1560
1561
bool generate_sub_column_info(const TabletSchema& schema, int32_t col_unique_id,
1562
                              const std::string& path,
1563
253k
                              TabletSchema::SubColumnInfo* sub_column_info) {
1564
253k
    const auto& parent_column = schema.column_by_uid(col_unique_id);
1565
253k
    std::function<void(const TabletColumn&, TabletColumn*)> generate_result_column =
1566
253k
            [&](const TabletColumn& from_column, TabletColumn* to_column) {
1567
11.7k
                to_column->set_name(parent_column.name_lower_case() + "." + path);
1568
11.7k
                to_column->set_type(from_column.type());
1569
11.7k
                to_column->set_parent_unique_id(parent_column.unique_id());
1570
11.7k
                bool is_typed = !parent_column.variant_enable_typed_paths_to_sparse();
1571
11.7k
                to_column->set_path_info(
1572
11.7k
                        PathInData(parent_column.name_lower_case() + "." + path, is_typed));
1573
11.7k
                to_column->set_aggregation_method(parent_column.aggregation());
1574
11.7k
                to_column->set_is_nullable(true);
1575
11.7k
                to_column->set_parent_unique_id(parent_column.unique_id());
1576
11.7k
                if (from_column.is_decimal()) {
1577
11.7k
                    to_column->set_precision(from_column.precision());
1578
11.7k
                }
1579
11.7k
                to_column->set_frac(from_column.frac());
1580
1581
11.7k
                if (from_column.is_array_type()) {
1582
2.28k
                    TabletColumn nested_column;
1583
2.28k
                    generate_result_column(*from_column.get_sub_columns()[0], &nested_column);
1584
2.28k
                    to_column->add_sub_column(nested_column);
1585
2.28k
                }
1586
11.7k
            };
1587
1588
253k
    auto generate_index = [&](const std::string& pattern) {
1589
        // 1. find subcolumn's index
1590
9.46k
        if (const auto& indexes = schema.inverted_index_by_field_pattern(col_unique_id, pattern);
1591
9.46k
            !indexes.empty()) {
1592
3.30k
            for (const auto& index : indexes) {
1593
3.30k
                auto index_ptr = std::make_shared<TabletIndex>(*index);
1594
3.30k
                index_ptr->set_escaped_escaped_index_suffix_path(
1595
3.30k
                        sub_column_info->column.path_info_ptr()->get_path());
1596
3.30k
                sub_column_info->indexes.emplace_back(std::move(index_ptr));
1597
3.30k
            }
1598
3.24k
        }
1599
        // 2. find parent column's index
1600
6.21k
        else if (const auto parent_index = schema.inverted_indexs(col_unique_id);
1601
6.21k
                 !parent_index.empty()) {
1602
391
            inherit_index(parent_index, sub_column_info->indexes, sub_column_info->column);
1603
5.82k
        } else {
1604
5.82k
            sub_column_info->indexes.clear();
1605
5.82k
        }
1606
9.46k
    };
1607
1608
253k
    const auto& sub_columns = parent_column.get_sub_columns();
1609
253k
    for (const auto& sub_column : sub_columns) {
1610
165k
        const char* pattern = sub_column->name().c_str();
1611
165k
        switch (sub_column->pattern_type()) {
1612
4.19k
        case PatternTypePB::MATCH_NAME: {
1613
4.19k
            if (strcmp(pattern, path.c_str()) == 0) {
1614
837
                generate_result_column(*sub_column, &sub_column_info->column);
1615
837
                generate_index(sub_column->name());
1616
837
                return true;
1617
837
            }
1618
3.35k
            break;
1619
4.19k
        }
1620
161k
        case PatternTypePB::MATCH_NAME_GLOB: {
1621
161k
            if (glob_match_re2(pattern, path)) {
1622
8.63k
                generate_result_column(*sub_column, &sub_column_info->column);
1623
8.63k
                generate_index(sub_column->name());
1624
8.63k
                return true;
1625
8.63k
            }
1626
152k
            break;
1627
161k
        }
1628
152k
        default:
1629
0
            break;
1630
165k
        }
1631
165k
    }
1632
243k
    return false;
1633
253k
}
1634
1635
TabletSchemaSPtr VariantCompactionUtil::calculate_variant_extended_schema(
1636
1.34k
        const std::vector<RowsetSharedPtr>& rowsets, const TabletSchemaSPtr& base_schema) {
1637
1.34k
    if (rowsets.empty()) {
1638
0
        return nullptr;
1639
0
    }
1640
1641
1.34k
    std::vector<TabletSchemaSPtr> schemas;
1642
3.13k
    for (const auto& rs : rowsets) {
1643
3.13k
        if (rs->num_segments() == 0) {
1644
3.06k
            continue;
1645
3.06k
        }
1646
70
        const auto& tablet_schema = rs->tablet_schema();
1647
70
        SegmentCacheHandle segment_cache;
1648
70
        auto st = SegmentLoader::instance()->load_segments(std::static_pointer_cast<BetaRowset>(rs),
1649
70
                                                           &segment_cache);
1650
70
        if (!st.ok()) {
1651
0
            return base_schema;
1652
0
        }
1653
70
        for (const auto& segment : segment_cache.get_segments()) {
1654
70
            TabletSchemaSPtr schema = tablet_schema->copy_without_variant_extracted_columns();
1655
146
            for (const auto& column : tablet_schema->columns()) {
1656
146
                if (!column->is_variant_type()) {
1657
70
                    continue;
1658
70
                }
1659
76
                std::shared_ptr<ColumnReader> column_reader;
1660
76
                OlapReaderStatistics stats;
1661
76
                st = segment->get_column_reader(column->unique_id(), &column_reader, &stats);
1662
76
                if (!st.ok()) {
1663
0
                    LOG(WARNING) << "Failed to get column reader for column: " << column->name()
1664
0
                                 << " error: " << st.to_string();
1665
0
                    continue;
1666
0
                }
1667
76
                if (!column_reader) {
1668
0
                    continue;
1669
0
                }
1670
1671
76
                CHECK(column_reader->get_meta_type() == FieldType::OLAP_FIELD_TYPE_VARIANT);
1672
76
                auto* variant_column_reader =
1673
76
                        assert_cast<segment_v2::VariantColumnReader*>(column_reader.get());
1674
                // load external meta before getting subcolumn meta info
1675
76
                st = variant_column_reader->load_external_meta_once();
1676
76
                if (!st.ok()) {
1677
0
                    LOG(WARNING) << "Failed to load external meta for column: " << column->name()
1678
0
                                 << " error: " << st.to_string();
1679
0
                    continue;
1680
0
                }
1681
76
                const auto* subcolumn_meta_info = variant_column_reader->get_subcolumns_meta_info();
1682
301
                for (const auto& entry : *subcolumn_meta_info) {
1683
301
                    if (entry->path.empty()) {
1684
76
                        continue;
1685
76
                    }
1686
225
                    const std::string& column_name =
1687
225
                            column->name_lower_case() + "." + entry->path.get_path();
1688
225
                    const DataTypePtr& data_type = entry->data.file_column_type;
1689
225
                    PathInDataBuilder full_path_builder;
1690
225
                    auto full_path = full_path_builder.append(column->name_lower_case(), false)
1691
225
                                             .append(entry->path.get_parts(), false)
1692
225
                                             .build();
1693
225
                    TabletColumn subcolumn =
1694
225
                            get_column_by_type(data_type, column_name,
1695
225
                                               ExtraInfo {.unique_id = -1,
1696
225
                                                          .parent_unique_id = column->unique_id(),
1697
225
                                                          .path_info = full_path});
1698
225
                    schema->append_column(subcolumn);
1699
225
                }
1700
76
            }
1701
70
            schemas.emplace_back(schema);
1702
70
        }
1703
70
    }
1704
1.34k
    TabletSchemaSPtr least_common_schema;
1705
1.34k
    auto st = get_least_common_schema(schemas, base_schema, least_common_schema, false);
1706
1.34k
    if (!st.ok()) {
1707
0
        return base_schema;
1708
0
    }
1709
1.34k
    return least_common_schema;
1710
1.34k
}
1711
1712
bool inherit_index(const std::vector<const TabletIndex*>& parent_indexes,
1713
                   TabletIndexes& subcolumns_indexes, FieldType column_type,
1714
80.6k
                   const std::string& suffix_path, bool is_array_nested_type) {
1715
80.6k
    if (parent_indexes.empty()) {
1716
74.9k
        return false;
1717
74.9k
    }
1718
5.75k
    subcolumns_indexes.clear();
1719
    // bkd index or array index only need to inherit one index
1720
5.75k
    if (field_is_numeric_type(column_type) ||
1721
5.75k
        (is_array_nested_type &&
1722
3.57k
         (field_is_numeric_type(column_type) || field_is_slice_type(column_type)))) {
1723
2.20k
        auto index_ptr = std::make_shared<TabletIndex>(*parent_indexes[0]);
1724
2.20k
        index_ptr->set_escaped_escaped_index_suffix_path(suffix_path);
1725
        // no need parse for bkd index or array index
1726
2.20k
        index_ptr->remove_parser_and_analyzer();
1727
2.20k
        subcolumns_indexes.emplace_back(std::move(index_ptr));
1728
2.20k
        return true;
1729
2.20k
    }
1730
    // string type need to inherit all indexes
1731
3.55k
    else if (field_is_slice_type(column_type) && !is_array_nested_type) {
1732
3.55k
        for (const auto& index : parent_indexes) {
1733
3.55k
            auto index_ptr = std::make_shared<TabletIndex>(*index);
1734
3.55k
            index_ptr->set_escaped_escaped_index_suffix_path(suffix_path);
1735
3.55k
            subcolumns_indexes.emplace_back(std::move(index_ptr));
1736
3.55k
        }
1737
3.52k
        return true;
1738
3.52k
    }
1739
23
    return false;
1740
5.75k
}
1741
1742
bool inherit_index(const std::vector<const TabletIndex*>& parent_indexes,
1743
80.6k
                   TabletIndexes& subcolumns_indexes, const TabletColumn& column) {
1744
80.6k
    if (!column.is_extracted_column()) {
1745
3
        return false;
1746
3
    }
1747
80.6k
    if (column.is_array_type()) {
1748
1.01k
        if (column.get_sub_columns().empty()) {
1749
0
            return false;
1750
0
        }
1751
1.01k
        const TabletColumn* nested = column.get_sub_columns()[0].get();
1752
1.01k
        while (nested != nullptr && nested->is_array_type()) {
1753
0
            if (nested->get_sub_columns().empty()) {
1754
0
                return false;
1755
0
            }
1756
0
            nested = nested->get_sub_columns()[0].get();
1757
0
        }
1758
1.01k
        if (nested == nullptr) {
1759
0
            return false;
1760
0
        }
1761
1.01k
        return inherit_index(parent_indexes, subcolumns_indexes, nested->type(),
1762
1.01k
                             column.path_info_ptr()->get_path(), true);
1763
1.01k
    }
1764
79.6k
    return inherit_index(parent_indexes, subcolumns_indexes, column.type(),
1765
79.6k
                         column.path_info_ptr()->get_path());
1766
80.6k
}
1767
1768
bool inherit_index(const std::vector<const TabletIndex*>& parent_indexes,
1769
0
                   TabletIndexes& subcolumns_indexes, const ColumnMetaPB& column_pb) {
1770
0
    if (!column_pb.has_column_path_info()) {
1771
0
        return false;
1772
0
    }
1773
0
    if (column_pb.type() == (int)FieldType::OLAP_FIELD_TYPE_ARRAY) {
1774
0
        if (column_pb.children_columns_size() == 0) {
1775
0
            return false;
1776
0
        }
1777
0
        const ColumnMetaPB* nested = &column_pb.children_columns(0);
1778
0
        while (nested != nullptr && nested->type() == (int)FieldType::OLAP_FIELD_TYPE_ARRAY) {
1779
0
            if (nested->children_columns_size() == 0) {
1780
0
                return false;
1781
0
            }
1782
0
            nested = &nested->children_columns(0);
1783
0
        }
1784
0
        if (nested == nullptr) {
1785
0
            return false;
1786
0
        }
1787
0
        return inherit_index(parent_indexes, subcolumns_indexes, (FieldType)nested->type(),
1788
0
                             column_pb.column_path_info().path(), true);
1789
0
    }
1790
0
    return inherit_index(parent_indexes, subcolumns_indexes, (FieldType)column_pb.type(),
1791
0
                         column_pb.column_path_info().path());
1792
0
}
1793
1794
// ============ Implementation from parse2column.cpp ============
1795
1796
/** Pool for objects that cannot be used from different threads simultaneously.
1797
  * Allows to create an object for each thread.
1798
  * Pool has unbounded size and objects are not destroyed before destruction of pool.
1799
  *
1800
  * Use it in cases when thread local storage is not appropriate
1801
  *  (when maximum number of simultaneously used objects is less
1802
  *   than number of running/sleeping threads, that has ever used object,
1803
  *   and creation/destruction of objects is expensive).
1804
  */
1805
template <typename T>
1806
class SimpleObjectPool {
1807
protected:
1808
    /// Hold all available objects in stack.
1809
    std::mutex mutex;
1810
    std::stack<std::unique_ptr<T>> stack;
1811
    /// Specialized deleter for std::unique_ptr.
1812
    /// Returns underlying pointer back to stack thus reclaiming its ownership.
1813
    struct Deleter {
1814
        SimpleObjectPool<T>* parent;
1815
16.8k
        Deleter(SimpleObjectPool<T>* parent_ = nullptr) : parent {parent_} {} /// NOLINT
1816
16.8k
        void operator()(T* owning_ptr) const {
1817
16.8k
            std::lock_guard lock {parent->mutex};
1818
16.8k
            parent->stack.emplace(owning_ptr);
1819
16.8k
        }
1820
    };
1821
1822
public:
1823
    using Pointer = std::unique_ptr<T, Deleter>;
1824
    /// Extracts and returns a pointer from the stack if it's not empty,
1825
    ///  creates a new one by calling provided f() otherwise.
1826
    template <typename Factory>
1827
16.7k
    Pointer get(Factory&& f) {
1828
16.7k
        std::unique_lock lock(mutex);
1829
16.7k
        if (stack.empty()) {
1830
77
            return {f(), this};
1831
77
        }
1832
16.7k
        auto object = stack.top().release();
1833
16.7k
        stack.pop();
1834
16.7k
        return std::unique_ptr<T, Deleter>(object, Deleter(this));
1835
16.7k
    }
variant_util.cpp:_ZN5doris12variant_util16SimpleObjectPoolINS_14JSONDataParserINS_14SimdJSONParserEEEE3getIZNS0_21parse_json_to_variantERNS_7IColumnERKNS_9StringRefEPS4_RKNS_11ParseConfigEE3$_0EESt10unique_ptrIS4_NS5_7DeleterEEOT_
Line
Count
Source
1827
12.4k
    Pointer get(Factory&& f) {
1828
12.4k
        std::unique_lock lock(mutex);
1829
12.4k
        if (stack.empty()) {
1830
1
            return {f(), this};
1831
1
        }
1832
12.4k
        auto object = stack.top().release();
1833
12.4k
        stack.pop();
1834
12.4k
        return std::unique_ptr<T, Deleter>(object, Deleter(this));
1835
12.4k
    }
variant_util.cpp:_ZN5doris12variant_util16SimpleObjectPoolINS_14JSONDataParserINS_14SimdJSONParserEEEE3getIZNS0_21parse_json_to_variantERNS_7IColumnERKNS_9ColumnStrIjEERKNS_11ParseConfigEE3$_0EESt10unique_ptrIS4_NS5_7DeleterEEOT_
Line
Count
Source
1827
4.36k
    Pointer get(Factory&& f) {
1828
4.36k
        std::unique_lock lock(mutex);
1829
4.36k
        if (stack.empty()) {
1830
76
            return {f(), this};
1831
76
        }
1832
4.28k
        auto object = stack.top().release();
1833
4.28k
        stack.pop();
1834
4.28k
        return std::unique_ptr<T, Deleter>(object, Deleter(this));
1835
4.36k
    }
1836
    /// Like get(), but creates object using default constructor.
1837
    Pointer getDefault() {
1838
        return get([] { return new T; });
1839
    }
1840
};
1841
1842
SimpleObjectPool<JsonParser> parsers_pool;
1843
1844
using Node = typename ColumnVariant::Subcolumns::Node;
1845
1846
6.10M
static inline void append_binary_bytes(ColumnString::Chars& chars, const void* data, size_t size) {
1847
6.10M
    const auto old_size = chars.size();
1848
6.10M
    chars.resize(old_size + size);
1849
6.10M
    memcpy(chars.data() + old_size, reinterpret_cast<const char*>(data), size);
1850
6.10M
}
1851
1852
2.35M
static inline void append_binary_type(ColumnString::Chars& chars, FieldType type) {
1853
2.35M
    const uint8_t t = static_cast<uint8_t>(type);
1854
2.35M
    append_binary_bytes(chars, &t, sizeof(uint8_t));
1855
2.35M
}
1856
1857
1.68M
static inline void append_binary_sizet(ColumnString::Chars& chars, size_t v) {
1858
1.68M
    append_binary_bytes(chars, &v, sizeof(size_t));
1859
1.68M
}
1860
1861
2.35M
static void append_field_to_binary_chars(const Field& field, ColumnString::Chars& chars) {
1862
2.35M
    switch (field.get_type()) {
1863
0
    case PrimitiveType::TYPE_NULL: {
1864
0
        append_binary_type(chars, FieldType::OLAP_FIELD_TYPE_NONE);
1865
0
        return;
1866
0
    }
1867
130k
    case PrimitiveType::TYPE_BOOLEAN: {
1868
130k
        append_binary_type(chars,
1869
130k
                           TabletColumn::get_field_type_by_type(PrimitiveType::TYPE_BOOLEAN));
1870
130k
        const auto v = static_cast<UInt8>(field.get<PrimitiveType::TYPE_BOOLEAN>());
1871
130k
        append_binary_bytes(chars, &v, sizeof(UInt8));
1872
130k
        return;
1873
0
    }
1874
509k
    case PrimitiveType::TYPE_BIGINT: {
1875
509k
        append_binary_type(chars, TabletColumn::get_field_type_by_type(PrimitiveType::TYPE_BIGINT));
1876
509k
        const auto v = field.get<PrimitiveType::TYPE_BIGINT>();
1877
509k
        append_binary_bytes(chars, &v, sizeof(Int64));
1878
509k
        return;
1879
0
    }
1880
1
    case PrimitiveType::TYPE_LARGEINT: {
1881
1
        append_binary_type(chars,
1882
1
                           TabletColumn::get_field_type_by_type(PrimitiveType::TYPE_LARGEINT));
1883
1
        const auto v = field.get<PrimitiveType::TYPE_LARGEINT>();
1884
1
        append_binary_bytes(chars, &v, sizeof(int128_t));
1885
1
        return;
1886
0
    }
1887
55.5k
    case PrimitiveType::TYPE_DOUBLE: {
1888
55.5k
        append_binary_type(chars, TabletColumn::get_field_type_by_type(PrimitiveType::TYPE_DOUBLE));
1889
55.5k
        const auto v = field.get<PrimitiveType::TYPE_DOUBLE>();
1890
55.5k
        append_binary_bytes(chars, &v, sizeof(Float64));
1891
55.5k
        return;
1892
0
    }
1893
1.49M
    case PrimitiveType::TYPE_STRING: {
1894
1.49M
        append_binary_type(chars, FieldType::OLAP_FIELD_TYPE_STRING);
1895
1.49M
        const auto& v = field.get<PrimitiveType::TYPE_STRING>();
1896
1.49M
        append_binary_sizet(chars, v.size());
1897
1.49M
        append_binary_bytes(chars, v.data(), v.size());
1898
1.49M
        return;
1899
0
    }
1900
22.1k
    case PrimitiveType::TYPE_JSONB: {
1901
22.1k
        append_binary_type(chars, FieldType::OLAP_FIELD_TYPE_JSONB);
1902
22.1k
        const auto& v = field.get<PrimitiveType::TYPE_JSONB>();
1903
22.1k
        append_binary_sizet(chars, v.get_size());
1904
22.1k
        append_binary_bytes(chars, v.get_value(), v.get_size());
1905
22.1k
        return;
1906
0
    }
1907
170k
    case PrimitiveType::TYPE_ARRAY: {
1908
170k
        append_binary_type(chars, FieldType::OLAP_FIELD_TYPE_ARRAY);
1909
170k
        const auto& a = field.get<PrimitiveType::TYPE_ARRAY>();
1910
170k
        append_binary_sizet(chars, a.size());
1911
182k
        for (const auto& elem : a) {
1912
182k
            append_field_to_binary_chars(elem, chars);
1913
182k
        }
1914
170k
        return;
1915
0
    }
1916
0
    default:
1917
0
        throw doris::Exception(ErrorCode::INVALID_ARGUMENT, "Unsupported field type {}",
1918
0
                               field.get_type());
1919
2.35M
    }
1920
2.35M
}
1921
template <typename ParserImpl>
1922
void parse_json_to_variant_impl(IColumn& column, const char* src, size_t length,
1923
1.35M
                                JSONDataParser<ParserImpl>* parser, const ParseConfig& config) {
1924
1.35M
    auto& column_variant = assert_cast<ColumnVariant&>(column);
1925
1.35M
    std::optional<ParseResult> result;
1926
    /// Treat empty string as an empty object
1927
    /// for better CAST from String to Object.
1928
1.35M
    if (length > 0) {
1929
1.34M
        result = parser->parse(src, length, config);
1930
1.34M
    } else {
1931
3.36k
        result = ParseResult {};
1932
3.36k
    }
1933
1.35M
    if (!result) {
1934
664
        VLOG_DEBUG << "failed to parse " << std::string_view(src, length) << ", length= " << length;
1935
664
        if (config::variant_throw_exeception_on_invalid_json) {
1936
0
            throw doris::Exception(ErrorCode::INVALID_ARGUMENT, "Failed to parse object {}",
1937
0
                                   std::string_view(src, length));
1938
0
        }
1939
        // Treat as string
1940
664
        PathInData root_path;
1941
664
        Field field = Field::create_field<TYPE_STRING>(String(src, length));
1942
664
        result = ParseResult {{root_path}, {field}};
1943
664
    }
1944
1.35M
    auto& [paths, values] = *result;
1945
1.35M
    assert(paths.size() == values.size());
1946
1.35M
    size_t old_num_rows = column_variant.rows();
1947
1.35M
    if (config.deprecated_enable_flatten_nested) {
1948
        // here we should check the paths in variant and paths in result,
1949
        // if two paths which same prefix have different structure, we should throw an exception
1950
3.02k
        std::vector<PathInData> check_paths;
1951
12.0k
        for (const auto& entry : column_variant.get_subcolumns()) {
1952
12.0k
            check_paths.push_back(entry->path);
1953
12.0k
        }
1954
3.02k
        check_paths.insert(check_paths.end(), paths.begin(), paths.end());
1955
3.02k
        THROW_IF_ERROR(check_variant_has_no_ambiguous_paths(check_paths));
1956
3.02k
    }
1957
1.35M
    auto [doc_value_data_paths, doc_value_data_values] =
1958
1.35M
            column_variant.get_doc_value_data_paths_and_values();
1959
1.35M
    auto& doc_value_data_offsets = column_variant.serialized_doc_value_column_offsets();
1960
1961
16.6M
    auto flush_defaults = [](ColumnVariant::Subcolumn* subcolumn) {
1962
16.6M
        const auto num_defaults = subcolumn->cur_num_of_defaults();
1963
16.6M
        if (num_defaults > 0) {
1964
2.02M
            subcolumn->insert_many_defaults(num_defaults);
1965
2.02M
            subcolumn->reset_current_num_of_defaults();
1966
2.02M
        }
1967
16.6M
    };
1968
1969
1.35M
    auto is_plain_path = [](const PathInData& path) {
1970
8.35k
        for (const auto& part : path.get_parts()) {
1971
8.35k
            if (part.is_nested || part.anonymous_array_level != 0) {
1972
0
                return false;
1973
0
            }
1974
8.35k
        }
1975
8.30k
        return true;
1976
8.30k
    };
1977
1978
1.35M
    auto get_or_create_subcolumn = [&](const PathInData& path, size_t index_hint,
1979
16.7M
                                       const FieldInfo& field_info) -> ColumnVariant::Subcolumn* {
1980
16.7M
        auto* subcolumn = column_variant.get_subcolumn(path, index_hint);
1981
16.7M
        if (subcolumn == nullptr) {
1982
96.0k
            if (path.has_nested_part()) {
1983
17
                column_variant.add_nested_subcolumn(path, field_info, old_num_rows);
1984
96.0k
            } else {
1985
96.0k
                column_variant.add_sub_column(path, old_num_rows);
1986
96.0k
            }
1987
96.0k
            subcolumn = column_variant.get_subcolumn(path, index_hint);
1988
96.0k
        }
1989
16.7M
        if (!subcolumn) {
1990
0
            throw doris::Exception(ErrorCode::INVALID_ARGUMENT, "Failed to find sub column {}",
1991
0
                                   path.get_path());
1992
0
        }
1993
16.7M
        return subcolumn;
1994
16.7M
    };
1995
1996
16.5M
    auto normalize_plain_path = [&](const PathInData& path) {
1997
16.5M
        if (!config.check_duplicate_json_path || path.empty() || !is_plain_path(path)) {
1998
16.5M
            return path;
1999
16.5M
        }
2000
18.4E
        return PathInData(path.get_path());
2001
16.5M
    };
2002
2003
1.35M
    auto insert_into_subcolumn = [&](size_t i,
2004
16.6M
                                     bool check_size_mismatch) -> ColumnVariant::Subcolumn* {
2005
16.6M
        FieldInfo field_info;
2006
16.6M
        get_field_info(values[i], &field_info);
2007
16.6M
        if (field_info.scalar_type_id == PrimitiveType::INVALID_TYPE) {
2008
47.2k
            return nullptr;
2009
47.2k
        }
2010
16.6M
        auto path = normalize_plain_path(paths[i]);
2011
16.6M
        auto* subcolumn = get_or_create_subcolumn(path, i, field_info);
2012
16.6M
        flush_defaults(subcolumn);
2013
16.6M
        if (check_size_mismatch && subcolumn->size() != old_num_rows) {
2014
2
            throw doris::Exception(ErrorCode::INVALID_ARGUMENT,
2015
2
                                   "subcolumn {} size missmatched, may contains duplicated entry",
2016
2
                                   path.get_path());
2017
2
        }
2018
16.6M
        subcolumn->insert(std::move(values[i]), std::move(field_info));
2019
16.6M
        return subcolumn;
2020
16.6M
    };
2021
2022
1.35M
    switch (config.parse_to) {
2023
1.18M
    case ParseConfig::ParseTo::OnlySubcolumns:
2024
17.9M
        for (size_t i = 0; i < paths.size(); ++i) {
2025
16.7M
            insert_into_subcolumn(i, true);
2026
16.7M
        }
2027
1.18M
        break;
2028
165k
    case ParseConfig::ParseTo::OnlyDocValueColumn: {
2029
18.4E
        CHECK(column_variant.enable_doc_mode()) << "OnlyDocValueColumn requires doc mode enabled";
2030
165k
        std::vector<size_t> doc_item_indexes;
2031
165k
        doc_item_indexes.reserve(paths.size());
2032
165k
        phmap::flat_hash_set<StringRef, StringRefHash> seen_paths;
2033
165k
        seen_paths.reserve(paths.size());
2034
2035
2.44M
        for (size_t i = 0; i < paths.size(); ++i) {
2036
2.27M
            FieldInfo field_info;
2037
2.27M
            get_field_info(values[i], &field_info);
2038
2.27M
            if (paths[i].empty()) {
2039
709
                auto* subcolumn = column_variant.get_subcolumn(paths[i]);
2040
709
                DCHECK(subcolumn != nullptr);
2041
709
                flush_defaults(subcolumn);
2042
709
                subcolumn->insert(std::move(values[i]), std::move(field_info));
2043
709
                continue;
2044
709
            }
2045
2.27M
            if (field_info.scalar_type_id == PrimitiveType::INVALID_TYPE ||
2046
2.27M
                values[i].get_type() == PrimitiveType::TYPE_NULL) {
2047
69.5k
                continue;
2048
69.5k
            }
2049
2.20M
            const auto& path_str = paths[i].get_path();
2050
2.20M
            StringRef path_ref {path_str.data(), path_str.size()};
2051
2.20M
            if (UNLIKELY(!seen_paths.emplace(path_ref).second)) {
2052
0
                throw doris::Exception(ErrorCode::INVALID_ARGUMENT,
2053
0
                                       "may contains duplicated entry : {}",
2054
0
                                       std::string_view(path_str));
2055
0
            }
2056
2.20M
            doc_item_indexes.push_back(i);
2057
2.20M
        }
2058
2059
165k
        std::sort(doc_item_indexes.begin(), doc_item_indexes.end(),
2060
14.4M
                  [&](size_t l, size_t r) { return paths[l].get_path() < paths[r].get_path(); });
2061
2.15M
        for (const auto idx : doc_item_indexes) {
2062
2.15M
            const auto& path_str = paths[idx].get_path();
2063
2.15M
            doc_value_data_paths->insert_data(path_str.data(), path_str.size());
2064
2.15M
            auto& chars = doc_value_data_values->get_chars();
2065
2.15M
            append_field_to_binary_chars(values[idx], chars);
2066
2.15M
            doc_value_data_values->get_offsets().push_back(chars.size());
2067
2.15M
        }
2068
165k
    } break;
2069
1.35M
    }
2070
1.34M
    doc_value_data_offsets.push_back(doc_value_data_paths->size());
2071
    // /// Insert default values to missed subcolumns.
2072
1.34M
    const auto& subcolumns = column_variant.get_subcolumns();
2073
44.4M
    for (const auto& entry : subcolumns) {
2074
44.4M
        if (entry->data.size() == old_num_rows) {
2075
            // Handle nested paths differently from simple paths
2076
27.6M
            if (entry->path.has_nested_part()) {
2077
                // Try to insert default from nested, if failed, insert regular default
2078
0
                bool success = UNLIKELY(column_variant.try_insert_default_from_nested(entry));
2079
0
                if (!success) {
2080
0
                    entry->data.insert_default();
2081
0
                }
2082
27.6M
            } else {
2083
                // For non-nested paths, increment default counter
2084
27.6M
                entry->data.increment_default_counter();
2085
27.6M
            }
2086
27.6M
        }
2087
44.4M
    }
2088
1.34M
    column_variant.incr_num_rows();
2089
1.34M
    auto sparse_column = column_variant.get_sparse_column();
2090
1.34M
    if (sparse_column->size() == old_num_rows) {
2091
1.34M
        sparse_column->assume_mutable()->insert_default();
2092
1.34M
    }
2093
1.34M
#ifndef NDEBUG
2094
1.34M
    column_variant.check_consistency();
2095
1.34M
#endif
2096
1.34M
}
2097
2098
// exposed interfaces
2099
/// Parses one JSON document into `column` by delegating to parse_json_to_variant_impl.
///
/// @param column  destination column, forwarded unchanged.
/// @param json    raw JSON text; its `data` pointer and `size` are forwarded.
/// @param parser  parser to use. When null, a JsonParser is obtained from `parsers_pool`
///                and used for this call only.
/// @param config  parsing options, forwarded unchanged.
void parse_json_to_variant(IColumn& column, const StringRef& json, JsonParser* parser,
                           const ParseConfig& config) {
    // Fast path: the caller supplied its own parser.
    if (parser != nullptr) {
        parse_json_to_variant_impl(column, json.data, json.size, parser, config);
        return;
    }

    // No parser supplied: borrow one from the pool. The handle stays alive until this
    // function returns, which covers the whole parse.
    auto pooled = parsers_pool.get([] { return new JsonParser(); });
    parse_json_to_variant_impl(column, json.data, json.size, pooled.get(), config);
}
2108
2109
/// Parses every row of `raw_json_column` into `column`, then finalizes `column`.
///
/// A single parser is taken from `parsers_pool` and reused for all rows.
///
/// @param column           destination column; `finalize()` is called on it once all rows
///                         have been parsed.
/// @param raw_json_column  string column holding one JSON document per row.
/// @param config           parsing options, forwarded unchanged for every row.
void parse_json_to_variant(IColumn& column, const ColumnString& raw_json_column,
                           const ParseConfig& config) {
    auto pooled = parsers_pool.get([] { return new JsonParser(); });

    const size_t row_count = raw_json_column.size();
    for (size_t row = 0; row < row_count; ++row) {
        const StringRef doc = raw_json_column.get_data_at(row);
        parse_json_to_variant_impl(column, doc.data, doc.size, pooled.get(), config);
    }

    column.finalize();
}
2118
2119
// parse the doc snapshot column to subcolumns
2120
0
void materialize_docs_to_subcolumns(ColumnVariant& column_variant) {
2121
0
    auto subcolumns = materialize_docs_to_subcolumns_map(column_variant);
2122
2123
0
    for (auto& entry : subcolumns) {
2124
0
        entry.second.finalize();
2125
0
        if (!column_variant.add_sub_column(PathInData(entry.first),
2126
0
                                           IColumn::mutate(entry.second.get_finalized_column_ptr()),
2127
0
                                           entry.second.get_least_common_type())) {
2128
0
            throw doris::Exception(ErrorCode::INTERNAL_ERROR,
2129
0
                                   "Failed to add subcolumn {}, which is from doc snapshot column",
2130
0
                                   entry.first);
2131
0
        }
2132
0
    }
2133
2134
0
    column_variant.finalize();
2135
0
}
2136
2137
// ============ Implementation from variant_util.cpp ============
2138
2139
/// Rebuilds one Subcolumn per distinct path from the serialized doc-value data of `variant`.
///
/// The doc-value data is read as three parts: a key column (paths), a value column (binary
/// encoded values) and a per-row offsets array delimiting each row's entries. Entries are
/// replayed row by row; rows in which a path is absent are filled with defaults, so every
/// returned subcolumn ends up with exactly `num_rows` values.
///
/// @param variant                source column holding the doc-value data.
/// @param expected_unique_paths  reserve hint for the result map; 0 means "derive a hint
///                               from the key column size, capped at 8192".
/// @return map keyed by path. The keys are string_views built from the pointers returned by
///         the key column's get_data_at(); NOTE(review): they presumably stay valid only
///         while that key column is alive and unmodified — confirm at call sites.
phmap::flat_hash_map<std::string_view, ColumnVariant::Subcolumn> materialize_docs_to_subcolumns_map(
        const ColumnVariant& variant, size_t expected_unique_paths) {
    constexpr size_t kInitialPathReserve = 8192;

    const auto [column_key, column_value] = variant.get_doc_value_data_paths_and_values();
    const auto& column_offsets = variant.serialized_doc_value_column_offsets();
    const size_t num_rows = column_offsets.size();

    DCHECK_EQ(num_rows, variant.size()) << "doc snapshot offsets size mismatch with variant rows";

    // Pick the reserve size: the caller's hint wins, otherwise a capped guess.
    size_t reserve_hint = expected_unique_paths;
    if (reserve_hint == 0) {
        reserve_hint = std::min<size_t>(column_key->size(), kInitialPathReserve);
    }

    phmap::flat_hash_map<std::string_view, ColumnVariant::Subcolumn> subcolumns;
    subcolumns.reserve(reserve_hint);

    for (size_t row = 0; row < num_rows; ++row) {
        // NOTE(review): for row 0 the index `row - 1` wraps around; this presumably relies
        // on the offsets container being left-padded so that index -1 reads 0 — confirm.
        const size_t first_entry = column_offsets[row - 1];
        const size_t last_entry = column_offsets[row];

        for (size_t entry_idx = first_entry; entry_idx < last_entry; ++entry_idx) {
            const auto& key = column_key->get_data_at(entry_idx);
            const std::string_view path_sv(key.data, key.size);

            auto [slot, is_new_path] =
                    subcolumns.try_emplace(path_sv, ColumnVariant::Subcolumn {0, true, false});
            auto& target = slot->second;

            if (is_new_path) {
                // First occurrence of this path: back-fill all earlier rows with defaults.
                target.insert_many_defaults(row);
            } else if (target.size() != row) {
                // Path was missing in some rows since it was last seen: fill the gap.
                target.insert_many_defaults(row - target.size());
            }
            target.deserialize_from_binary_column(column_value, entry_idx);
        }
    }

    // Pad paths that are absent from the trailing rows so all subcolumns have num_rows values.
    for (auto& path_and_subcolumn : subcolumns) {
        auto& target = path_and_subcolumn.second;
        if (target.size() != num_rows) {
            target.insert_many_defaults(num_rows - target.size());
        }
    }

    return subcolumns;
}
2181
2182
/// Replaces every scalar variant column listed in `variant_pos` with its parsed form.
///
/// For each position the (possibly nullable-wrapped) ColumnVariant is finalized. Columns
/// that are not scalar variants are treated as already parsed and left alone. Otherwise:
///  - a JSONB root is first cast to a string column;
///  - a string root is parsed row by row into a fresh ColumnVariant using `configs[i]`;
///  - any other root type keeps the existing variant after its root node type has been
///    switched to nullable MostCommonType.
/// The result is re-wrapped with the original null map when the input was nullable and is
/// written back into `block`.
///
/// @param block        block whose columns are updated in place.
/// @param variant_pos  positions in `block` of the variant columns.
/// @param configs      per-column parse options, indexed in parallel with `variant_pos`;
///                     it is indexed without a bounds check, so it must be at least as long.
/// @return Status::OK() on success, or the error produced by cast_column.
Status _parse_and_materialize_variant_columns(Block& block,
                                              const std::vector<uint32_t>& variant_pos,
                                              const std::vector<ParseConfig>& configs) {
    for (size_t i = 0; i < variant_pos.size(); ++i) {
        const uint32_t block_pos = variant_pos[i];
        auto column_ref = block.get_by_position(block_pos).column;
        const bool is_nullable = column_ref->is_nullable();

        // Unwrap the nullable shell, if any, to reach the variant column itself.
        MutableColumnPtr var_column;
        if (is_nullable) {
            const auto& nullable = assert_cast<const ColumnNullable&>(*column_ref);
            var_column = nullable.get_nested_column_ptr()->assume_mutable();
        } else {
            var_column = column_ref->assume_mutable();
        }
        auto& var = assert_cast<ColumnVariant&>(*var_column);
        var_column->finalize();

        if (!var.is_scalar_variant()) {
            // already parsed
            continue;
        }

        VLOG_DEBUG << "parse scalar variant column: " << var.get_root_type()->get_name();

        // Resolve the non-nullable root column that carries the scalar payload.
        ColumnPtr scalar_root_column;
        if (var.get_root_type()->get_primitive_type() == TYPE_JSONB) {
            // TODO more efficient way to parse jsonb type, currently we just convert jsonb to
            // json str and parse them into variant
            auto string_type = var.get_root()->is_nullable()
                                       ? make_nullable(std::make_shared<DataTypeString>())
                                       : std::make_shared<DataTypeString>();
            RETURN_IF_ERROR(cast_column({var.get_root(), var.get_root_type(), ""}, string_type,
                                        &scalar_root_column));
            if (scalar_root_column->is_nullable()) {
                scalar_root_column = assert_cast<const ColumnNullable*>(scalar_root_column.get())
                                             ->get_nested_column_ptr();
            }
        } else {
            const auto& root = *var.get_root();
            scalar_root_column =
                    root.is_nullable()
                            ? assert_cast<const ColumnNullable&>(root).get_nested_column_ptr()
                            : var.get_root();
        }

        MutableColumnPtr variant_column;
        if (scalar_root_column->is_column_string()) {
            // String root: parse each JSON row into a brand-new variant column.
            variant_column = ColumnVariant::create(0, var.enable_doc_mode());
            parse_json_to_variant(*variant_column,
                                  assert_cast<const ColumnString&>(*scalar_root_column),
                                  configs[i]);
        } else {
            // Root maybe other types rather than string like ColumnVariant(Int32).
            // In this case, we should finalize the root and cast to JSON type
            auto expected_root_type =
                    make_nullable(std::make_shared<ColumnVariant::MostCommonType>());
            var.ensure_root_node_type(expected_root_type);
            variant_column = var.assume_mutable();
        }

        // Wrap variant with nullmap if it is nullable
        ColumnPtr replaced = variant_column->get_ptr();
        if (is_nullable) {
            const auto& null_map =
                    assert_cast<const ColumnNullable&>(*column_ref).get_null_map_column_ptr();
            replaced = ColumnNullable::create(replaced, null_map);
        }
        block.get_by_position(block_pos).column = replaced;
    }
    return Status::OK();
}
2249
2250
/// Exception-safe entry point around _parse_and_materialize_variant_columns.
///
/// The work runs inside RETURN_IF_CATCH_EXCEPTION; NOTE(review): that macro presumably turns
/// a thrown doris::Exception into a returned error Status — confirm against its definition.
///
/// @param block        block whose variant columns are parsed in place.
/// @param variant_pos  positions in `block` of the variant columns.
/// @param configs      per-column parse options, parallel to `variant_pos`.
/// @return the Status produced by the inner call, or the Status the macro returns.
Status parse_and_materialize_variant_columns(Block& block, const std::vector<uint32_t>& variant_pos,
                                             const std::vector<ParseConfig>& configs) {
    Status status = Status::OK();
    RETURN_IF_CATCH_EXCEPTION(
            { status = _parse_and_materialize_variant_columns(block, variant_pos, configs); });
    return status;
}
2255
2256
/// Finds the variant columns among `column_pos`, builds a ParseConfig for each from the
/// tablet schema, and parses them in `block`.
///
/// Per-column configuration:
///  - `deprecated_enable_flatten_nested` comes from the tablet schema;
///  - `check_duplicate_json_path` comes from config::variant_enable_duplicate_json_path_check;
///  - when doc mode is enabled the column is parsed to the doc value column only; otherwise
///    it is parsed to subcolumns only, after configure_decimal_number_preserve_paths is applied.
///
/// @param block          block whose variant columns are parsed in place.
/// @param tablet_schema  schema used to look up each column's definition.
/// @param column_pos     schema positions to inspect.
/// @return Status::OK() when there is nothing to parse or parsing succeeds; otherwise the
///         error from the parsing overload, or InternalError for a non-variant column.
Status parse_and_materialize_variant_columns(Block& block, const TabletSchema& tablet_schema,
                                             const std::vector<uint32_t>& column_pos) {
    // NOTE(review): both vectors receive the schema position, so `block` is indexed by schema
    // position below — presumably the block is laid out in schema order; confirm with callers.
    std::vector<uint32_t> variant_column_pos;
    std::vector<uint32_t> variant_schema_pos;
    variant_column_pos.reserve(column_pos.size());
    variant_schema_pos.reserve(column_pos.size());
    for (const uint32_t schema_pos : column_pos) {
        if (tablet_schema.column(schema_pos).is_variant_type()) {
            variant_column_pos.push_back(schema_pos);
            variant_schema_pos.push_back(schema_pos);
        }
    }

    if (variant_column_pos.empty()) {
        return Status::OK();
    }

    std::vector<ParseConfig> configs(variant_column_pos.size());
    for (size_t i = 0; i < variant_column_pos.size(); ++i) {
        ParseConfig& cfg = configs[i];
        // Deprecated legacy flatten-nested switch. Distinct from variant_enable_nested_group.
        cfg.deprecated_enable_flatten_nested = tablet_schema.deprecated_variant_flatten_nested();
        cfg.check_duplicate_json_path = config::variant_enable_duplicate_json_path_check;

        const auto& column = tablet_schema.column(variant_schema_pos[i]);
        if (!column.is_variant_type()) {
            return Status::InternalError("column is not variant type, column name: {}",
                                         column.name());
        }

        if (column.variant_enable_doc_mode()) {
            cfg.parse_to = ParseConfig::ParseTo::OnlyDocValueColumn;
        } else {
            // if doc mode is not enabled, no need to parse to doc value column
            configure_decimal_number_preserve_paths(column, &cfg);
            cfg.parse_to = ParseConfig::ParseTo::OnlySubcolumns;
        }
    }

    RETURN_IF_ERROR(parse_and_materialize_variant_columns(block, variant_column_pos, configs));
    return Status::OK();
}
2299
2300
} // namespace doris::variant_util