Coverage Report

Created: 2026-03-21 03:59

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
be/src/exec/common/variant_util.h
Line
Count
Source
1
// Licensed to the Apache Software Foundation (ASF) under one
2
// or more contributor license agreements.  See the NOTICE file
3
// distributed with this work for additional information
4
// regarding copyright ownership.  The ASF licenses this file
5
// to you under the Apache License, Version 2.0 (the
6
// "License"); you may not use this file except in compliance
7
// with the License.  You may obtain a copy of the License at
8
//
9
//   http://www.apache.org/licenses/LICENSE-2.0
10
//
11
// Unless required by applicable law or agreed to in writing,
12
// software distributed under the License is distributed on an
13
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14
// KIND, either express or implied.  See the License for the
15
// specific language governing permissions and limitations
16
// under the License.
17
18
#pragma once
19
20
#include <gen_cpp/Descriptors_types.h>
21
#include <parallel_hashmap/phmap.h>
22
23
#include <cstddef>
24
#include <cstdint>
25
#include <map>
#include <set>
26
#include <string>
27
#include <string_view>
28
#include <unordered_map>
#include <unordered_set>
29
#include <vector>
30
31
#include "common/status.h"
32
#include "core/column/column.h"
33
#include "core/column/column_variant.h"
34
#include "core/data_type/data_type.h"
35
#include "core/field.h"
36
#include "core/string_ref.h"
37
#include "core/types.h"
38
#include "exprs/aggregate/aggregate_function.h"
39
#include "storage/tablet/tablet_fwd.h"
40
#include "storage/tablet/tablet_schema.h"
41
#include "util/json/json_parser.h"
42
43
// Forward declarations of doris types that are only named (by reference, pointer or as
// template arguments) in the declarations of this header.
namespace doris {
44
class TabletSchema;
45
enum class FieldType;
46
namespace segment_v2 {
47
// Named as a pointer parameter by VariantCompactionUtil::calculate_variant_stats.
struct VariantStatisticsPB;
48
} // namespace segment_v2
49
class Block;
50
class IColumn;
51
struct ColumnWithTypeAndName;
52
class SimdJSONParser;
53
enum class ExtractType;
54
template <typename ParserImpl>
55
class JSONDataParser;
56
template <typename T>
57
class ColumnStr;
58
// String column type taken by the batch overload of parse_json_to_variant.
using ColumnString = ColumnStr<UInt32>;
59
// JSON parser type taken by the single-document overload of parse_json_to_variant.
using JsonParser = JSONDataParser<SimdJSONParser>;
60
} // namespace doris
61
62
// Path component used when naming variant sparse columns (see create_sparse_shard_column).
// Declared `inline` (C++17) so that all translation units including this header share one
// definition instead of each getting its own dynamically-initialized internal-linkage copy.
inline const std::string SPARSE_COLUMN_PATH = "__DORIS_VARIANT_SPARSE__";
63
// Reserved path name for variant doc-value columns.
// NOTE(review): presumably consumed by create_doc_value_column — confirm in the .cpp.
// Declared `inline` (C++17) so that all translation units including this header share one
// definition instead of each getting its own dynamically-initialized internal-linkage copy.
inline const std::string DOC_VALUE_COLUMN_PATH = "__DORIS_VARIANT_DOC_VALUE__";
64
namespace doris::variant_util {
65
66
// Convert a restricted glob pattern into a regex (for tests/internal use).
67
Status glob_to_regex(const std::string& glob_pattern, std::string* regex_pattern);
68
69
// Match a glob pattern against a path using RE2.
70
bool glob_match_re2(const std::string& glob_pattern, const std::string& candidate_path);
71
72
using PathToNoneNullValues = std::unordered_map<std::string, int64_t>;
73
using PathToDataTypes = std::unordered_map<PathInData, std::vector<DataTypePtr>, PathInData::Hash>;
74
75
63.6k
inline bool should_record_variant_path_stats(const TabletColumn& column) {
76
63.6k
    return !column.variant_enable_nested_group();
77
63.6k
}
78
79
11.5k
inline bool should_write_variant_binary_columns(const TabletColumn& column) {
80
11.5k
    return !column.variant_enable_nested_group();
81
11.5k
}
82
83
14.2k
inline bool should_check_variant_path_stats(const TabletColumn& column) {
84
14.2k
    return !column.variant_enable_nested_group();
85
14.2k
}
86
87
struct VariantExtendedInfo {
88
    PathToNoneNullValues path_to_none_null_values; // key: path, value: number of none null values
89
    std::unordered_set<std::string> sparse_paths;  // sparse paths in this variant column
90
    std::unordered_set<std::string> typed_paths;   // typed paths in this variant column
91
    std::unordered_set<PathInData, PathInData::Hash>
92
            nested_paths;               // nested paths in this variant column
93
    PathToDataTypes path_to_data_types; // key: path, value: data types
94
    bool has_nested_group = false;      // whether this variant column has nested group
95
};
96
97
/// Returns number of dimensions in Array type. 0 if type is not array.
98
size_t get_number_of_dimensions(const IDataType& type);
99
100
/// Returns number of dimensions in Array column. 0 if column is not array.
101
size_t get_number_of_dimensions(const IColumn& column);
102
103
/// Returns type of scalars of Array of arbitrary dimensions.
104
DataTypePtr get_base_type_of_array(const DataTypePtr& type);
105
106
// Cast column to dst type
107
Status cast_column(const ColumnWithTypeAndName& arg, const DataTypePtr& type, ColumnPtr* result);
108
109
struct ExtraInfo {
110
    // -1 indicates it's not a Frontend generated column
111
    int32_t unique_id = -1;
112
    int32_t parent_unique_id = -1;
113
    PathInData path_info;
114
};
115
116
TabletColumn get_column_by_type(const DataTypePtr& data_type, const std::string& name,
117
                                const ExtraInfo& ext_info);
118
119
// check if the tuple_paths has ambiguous paths
120
// situation:
121
// throw exception if there exists a prefix with matched names, but not matched structure (is Nested, number of dimensions).
122
Status check_variant_has_no_ambiguous_paths(const std::vector<PathInData>& paths);
123
124
// Pick the tablet schema with the highest schema version as the reference.
125
// Then update all variant columns to there least common types.
126
// Return the final merged schema as common schema.
127
// If base_schema == nullptr, the tablet schema with the max schema version is picked as the base schema
128
Status get_least_common_schema(const std::vector<TabletSchemaSPtr>& schemas,
129
                               const TabletSchemaSPtr& base_schema, TabletSchemaSPtr& result,
130
                               bool check_schema_size = false);
131
132
// Get least common types for extracted columns which has Path info,
133
// with a specified variant column's unique id
134
Status update_least_common_schema(const std::vector<TabletSchemaSPtr>& schemas,
135
                                  TabletSchemaSPtr& common_schema, int32_t variant_col_unique_id,
136
                                  std::set<PathInData>* path_set);
137
138
// inherit attributes like index/agg info from its parent column
139
void inherit_column_attributes(TabletSchemaSPtr& schema);
140
141
// source: variant column
142
// target: extracted column from variant column
143
void inherit_column_attributes(const TabletColumn& source, TabletColumn& target,
144
                               TabletSchemaSPtr* target_schema = nullptr);
145
146
// Align variant subcolumn BF inheritance with FE BF-supported types.
147
bool is_bf_supported_by_fe_for_variant_subcolumn(FieldType type);
148
149
// get sorted subcolumns of variant
150
ColumnVariant::Subcolumns get_sorted_subcolumns(const ColumnVariant::Subcolumns& subcolumns);
151
152
bool has_schema_index_diff(const TabletSchema* new_schema, const TabletSchema* old_schema,
153
                           int32_t new_col_idx, int32_t old_col_idx);
154
155
// create ColumnMap<String, String>
156
TabletColumn create_sparse_column(const TabletColumn& variant);
157
158
// Create one bucket sparse column: name = variant.name_lower_case() + "." + SPARSE_COLUMN_PATH + ".b{index}"
159
TabletColumn create_sparse_shard_column(const TabletColumn& variant, int bucket_index);
160
161
TabletColumn create_doc_value_column(const TabletColumn& variant, int bucket_index);
162
163
// Compute bucket id for given path string using SipHash64(path) % bucket_num.
164
uint32_t variant_binary_shard_of(const StringRef& path, uint32_t bucket_num);
165
166
void get_field_info(const Field& field, FieldInfo* info);
167
168
// inherit index from parent column
169
bool inherit_index(const std::vector<const TabletIndex*>& parent_indexes,
170
                   TabletIndexes& sub_column_indexes, FieldType column_type,
171
                   const std::string& suffix_path, bool is_array_nested_type = false);
172
173
bool inherit_index(const std::vector<const TabletIndex*>& parent_indexes,
174
                   TabletIndexes& sub_column_indexes, const TabletColumn& column);
175
176
bool inherit_index(const std::vector<const TabletIndex*>& parent_indexes,
177
                   TabletIndexes& sub_column_indexes, const segment_v2::ColumnMetaPB& column_pb);
178
179
Status update_least_schema_internal(const std::map<PathInData, DataTypes>& subcolumns_types,
180
                                    TabletSchemaSPtr& common_schema, int32_t variant_col_unique_id,
181
                                    const std::map<std::string, TabletColumnPtr>& typed_columns,
182
                                    std::set<PathInData>* path_set = nullptr);
183
184
bool generate_sub_column_info(const TabletSchema& schema, int32_t col_unique_id,
185
                              const std::string& path,
186
                              TabletSchema::SubColumnInfo* sub_column_info);
187
188
// Collection of static helpers for compacting variant columns: gathering per-path
// statistics and extended info from rowsets, building the extended compaction schema,
// and checking path statistics after compaction. All members are static.
class VariantCompactionUtil {
189
public:
190
    // Get the subpaths and sparse paths for the variant column.
    // Inputs are the per-path non-null counts (`path_stats`) and `max_subcolumns_count`;
    // `paths_set_info` is the only mutable parameter and receives the result.
191
    static void get_subpaths(int32_t max_subcolumns_count, const PathToNoneNullValues& path_stats,
192
                             TabletSchema::PathsSetInfo& paths_set_info);
193
194
    // Collect extended info from the variant columns of rowset `rs`.
    // `uid_to_variant_extended_info` is keyed by int32_t
    // (NOTE(review): presumably the variant column unique id — confirm).
195
    static Status aggregate_variant_extended_info(
196
            const RowsetSharedPtr& rs,
197
            std::unordered_map<int32_t, VariantExtendedInfo>* uid_to_variant_extended_info);
198
199
    // Collect path stats from the variant columns of rowset `rs`.
    // `uid_to_path_stats` is keyed by int32_t
    // (NOTE(review): presumably the variant column unique id — confirm).
200
    static Status aggregate_path_to_stats(
201
            const RowsetSharedPtr& rs,
202
            std::unordered_map<int32_t, PathToNoneNullValues>* uid_to_path_stats);
203
204
    // Build the temporary schema for compaction, this will reduce the memory usage of compacting variant columns
205
    static Status get_extended_compaction_schema(const std::vector<RowsetSharedPtr>& rowsets,
206
                                                 TabletSchemaSPtr& target);
207
208
    // Used to collect all the subcolumns types of variant column from rowsets
209
    static TabletSchemaSPtr calculate_variant_extended_schema(
210
            const std::vector<RowsetSharedPtr>& rowsets, const TabletSchemaSPtr& base_schema);
211
212
    // Check if the path stats are consistent between the input rowsets and the output rowset.
213
    // Used to check the correctness of compaction.
214
    static Status check_path_stats(const std::vector<RowsetSharedPtr>& intputs,
215
                                   RowsetSharedPtr output, BaseTabletSPtr tablet);
216
217
    // Calculate statistics about variant data paths from the encoded sparse column
    // NOTE(review): `row_pos` / `num_rows` presumably select the row range to scan and
    // `max_sparse_column_statistics_size` presumably caps the number of recorded paths — confirm.
218
    static void calculate_variant_stats(const IColumn& encoded_sparse_column,
219
                                        segment_v2::VariantStatisticsPB* stats,
220
                                        size_t max_sparse_column_statistics_size, size_t row_pos,
221
                                        size_t num_rows);
222
223
    // NOTE(review): undocumented in the original. Judging by the name and the mutable
    // `paths_set_info` / `output_schema` parameters, presumably derives the compaction
    // subcolumns of `parent_column` from previously computed subpaths — confirm in the .cpp.
    static void get_compaction_subcolumns_from_subpaths(
224
            TabletSchema::PathsSetInfo& paths_set_info, const TabletColumnPtr parent_column,
225
            const TabletSchemaSPtr& target, const PathToDataTypes& path_to_data_types,
226
            const std::unordered_set<std::string>& sparse_paths, TabletSchemaSPtr& output_schema);
227
228
    // NOTE(review): undocumented in the original. Same parameter list as
    // get_compaction_subcolumns_from_subpaths minus `sparse_paths`; presumably derives the
    // subcolumns from `path_to_data_types` only — confirm in the .cpp.
    static void get_compaction_subcolumns_from_data_types(
229
            TabletSchema::PathsSetInfo& paths_set_info, const TabletColumnPtr parent_column,
230
            const TabletSchemaSPtr& target, const PathToDataTypes& path_to_data_types,
231
            TabletSchemaSPtr& output_schema);
232
233
    // NOTE(review): undocumented in the original. Presumably adds the columns for
    // `typed_paths` of `parent_column` to `output_schema` — confirm in the .cpp.
    static Status get_compaction_typed_columns(const TabletSchemaSPtr& target,
234
                                               const std::unordered_set<std::string>& typed_paths,
235
                                               const TabletColumnPtr parent_column,
236
                                               TabletSchemaSPtr& output_schema,
237
                                               TabletSchema::PathsSetInfo& paths_set_info);
238
239
    // NOTE(review): undocumented in the original. Presumably adds the columns for
    // `nested_paths` of `parent_column` to `output_schema` — confirm in the .cpp.
    static Status get_compaction_nested_columns(
240
            const std::unordered_set<PathInData, PathInData::Hash>& nested_paths,
241
            const PathToDataTypes& path_to_data_types, const TabletColumnPtr parent_column,
242
            TabletSchemaSPtr& output_schema, TabletSchema::PathsSetInfo& paths_set_info);
243
};
244
245
// parse a batch of json strings into column object, throws doris::Exception when failed
246
// only UT test
247
void parse_json_to_variant(IColumn& column, const ColumnString& raw_json_column,
248
                           const ParseConfig& config);
249
250
// Parse variant columns by picking variant positions from `variant_pos` and using provided ParseConfigs.
251
// only UT test
252
Status parse_and_materialize_variant_columns(Block& block, const std::vector<uint32_t>& variant_pos,
253
                                             const std::vector<ParseConfig>& configs);
254
255
// parse a single json, throws doris::Exception when failed
256
void parse_json_to_variant(IColumn& column, const StringRef& jsons, JsonParser* parser,
257
                           const ParseConfig& config);
258
259
// Parse variant columns by picking variant positions from `column_pos` and generating ParseConfig
260
// based on tablet schema settings (flatten nested / doc snapshot mode).
261
Status parse_and_materialize_variant_columns(Block& block, const TabletSchema& tablet_schema,
262
                                             const std::vector<uint32_t>& column_pos);
263
264
// Parse doc snapshot column (paths/values/offsets stored in ColumnVariant) into per-path subcolumns.
265
// NOTE: Returned map keys are `std::string_view` pointing into the underlying doc snapshot paths
266
// column, so the input `variant` must outlive the returned map.
267
phmap::flat_hash_map<std::string_view, ColumnVariant::Subcolumn> materialize_docs_to_subcolumns_map(
268
        const ColumnVariant& variant);
269
270
} // namespace  doris::variant_util