Coverage Report

Created: 2026-04-14 10:07

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
be/src/storage/tablet/tablet_schema.h
Line
Count
Source
1
// Licensed to the Apache Software Foundation (ASF) under one
2
// or more contributor license agreements.  See the NOTICE file
3
// distributed with this work for additional information
4
// regarding copyright ownership.  The ASF licenses this file
5
// to you under the Apache License, Version 2.0 (the
6
// "License"); you may not use this file except in compliance
7
// with the License.  You may obtain a copy of the License at
8
//
9
//   http://www.apache.org/licenses/LICENSE-2.0
10
//
11
// Unless required by applicable law or agreed to in writing,
12
// software distributed under the License is distributed on an
13
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14
// KIND, either express or implied.  See the License for the
15
// specific language governing permissions and limitations
16
// under the License.
17
18
#pragma once
19
20
#include <gen_cpp/Types_types.h>
21
#include <gen_cpp/olap_common.pb.h>
22
#include <gen_cpp/olap_file.pb.h>
23
#include <gen_cpp/segment_v2.pb.h>
24
#include <parallel_hashmap/phmap.h>
25
26
#include <algorithm>
27
#include <cstdint>
28
#include <map>
29
#include <memory>
30
#include <string>
31
#include <unordered_map>
32
#include <unordered_set>
33
#include <utility>
34
#include <vector>
35
36
#include "common/consts.h"
37
#include "common/status.h"
38
#include "core/data_type/define_primitive_type.h"
39
#include "core/string_ref.h"
40
#include "core/types.h"
41
#include "exec/common/string_utils/string_utils.h"
42
#include "exprs/aggregate/aggregate_function.h"
43
#include "runtime/descriptors.h"
44
#include "runtime/memory/lru_cache_policy.h"
45
#include "storage/index/inverted/inverted_index_parser.h"
46
#include "storage/metadata_adder.h"
47
#include "storage/olap_common.h"
48
#include "storage/segment/options.h"
49
#include "util/debug_points.h"
50
#include "util/json/path_in_data.h"
51
#include "util/string_parser.hpp"
52
#include "util/string_util.h"
53
54
namespace doris {
55
class Block;
56
class PathInData;
57
class IDataType;
58
59
struct OlapTableIndexSchema;
60
class TColumn;
61
class TOlapTableIndex;
62
class TabletColumn;
63
64
using TabletColumnPtr = std::shared_ptr<TabletColumn>;
65
66
// In-memory description of a single column of a tablet schema.
//
// Holds the column identity (unique id, name), its storage type, key /
// nullability / aggregation attributes, default value, decimal precision,
// length information, nested sub-columns and, for columns extracted from a
// variant column, the parent unique id and the path inside the variant.
// It can be initialized from, and serialized to, ColumnPB, and initialized
// from the thrift TColumn.
class TabletColumn : public MetadataAdder<TabletColumn> {
public:
    // Per-column settings that only apply to variant columns.
    struct VariantParams {
        int32_t max_subcolumns_count = 0;
        bool enable_typed_paths_to_sparse = false;
        int32_t max_sparse_column_statistics_size =
                BeConsts::DEFAULT_VARIANT_MAX_SPARSE_COLUMN_STATS_SIZE;
        // default to 0, no shard
        int32_t sparse_hash_shard_count = 0;

        bool enable_doc_mode = false;
        int64_t doc_materialization_min_rows = 0;
        int32_t doc_hash_shard_count = 64;

        bool enable_nested_group = false;
    };

    TabletColumn();
    TabletColumn(const ColumnPB& column);
    TabletColumn(const TColumn& column);
    TabletColumn(FieldAggregationMethod agg, FieldType type);
    TabletColumn(FieldAggregationMethod agg, FieldType field_type, bool is_nullable);
    TabletColumn(FieldAggregationMethod agg, FieldType field_type, bool is_nullable,
                 int32_t unique_id, size_t length);

#ifdef BE_TEST
    // Only test builds get a virtual destructor.
    virtual ~TabletColumn() = default;
#endif

    void init_from_pb(const ColumnPB& column);
    void init_from_thrift(const TColumn& column);
    void to_schema_pb(ColumnPB* column) const;

    int32_t unique_id() const { return _unique_id; }
    void set_unique_id(int32_t id) { _unique_id = id; }
    const std::string& name() const { return _col_name; }
    const std::string& name_lower_case() const { return _col_name_lower_case; }
    // Stores the column name and refreshes the cached lower-case copy.
    // The argument is taken by value, so it is moved into place rather than
    // copied a second time.
    void set_name(std::string col_name) {
        _col_name = std::move(col_name);
        _col_name_lower_case = to_lower(_col_name);
    }
    MOCK_FUNCTION FieldType type() const { return _type; }
    void set_type(FieldType type) { _type = type; }
    bool is_key() const { return _is_key; }
    bool is_nullable() const { return _is_nullable; }
    bool is_auto_increment() const { return _is_auto_increment; }
    // True when the column name equals SEQUENCE_COL.
    bool is_seqeunce_col() const { return _col_name == SEQUENCE_COL; }
    bool is_on_update_current_timestamp() const { return _is_on_update_current_timestamp; }
    bool is_variant_type() const { return _type == FieldType::OLAP_FIELD_TYPE_VARIANT; }
    bool is_bf_column() const { return _is_bf_column; }
    bool is_array_type() const { return _type == FieldType::OLAP_FIELD_TYPE_ARRAY; }
    bool is_agg_state_type() const { return _type == FieldType::OLAP_FIELD_TYPE_AGG_STATE; }
    bool is_jsonb_type() const { return _type == FieldType::OLAP_FIELD_TYPE_JSONB; }
    // True for CHAR, VARCHAR, STRING, HLL, BITMAP, QUANTILE_STATE and AGG_STATE.
    bool is_length_variable_type() const {
        return _type == FieldType::OLAP_FIELD_TYPE_CHAR ||
               _type == FieldType::OLAP_FIELD_TYPE_VARCHAR ||
               _type == FieldType::OLAP_FIELD_TYPE_STRING ||
               _type == FieldType::OLAP_FIELD_TYPE_HLL ||
               _type == FieldType::OLAP_FIELD_TYPE_BITMAP ||
               _type == FieldType::OLAP_FIELD_TYPE_QUANTILE_STATE ||
               _type == FieldType::OLAP_FIELD_TYPE_AGG_STATE;
    }
    // Such columns do not exist in the frontend schema info, so we need to
    // add them into tablet_schema for later column indexing.
    static TabletColumn create_materialized_variant_column(const std::string& root,
                                                           const std::vector<std::string>& paths,
                                                           int32_t parent_unique_id,
                                                           int32_t max_subcolumns_count,
                                                           bool enable_doc_mode = false);
    bool has_default_value() const { return _has_default_value; }
    std::string default_value() const { return _default_value; }
    int32_t length() const { return _length; }
    void set_length(int32_t length) { _length = length; }
    // Stores the default value and marks the column as having one.
    void set_default_value(const std::string& default_value) {
        _default_value = default_value;
        _has_default_value = true;
    }
    int32_t index_length() const { return _index_length; }
    void set_index_length(int32_t index_length) { _index_length = index_length; }
    void set_is_key(bool is_key) { _is_key = is_key; }
    void set_is_nullable(bool is_nullable) { _is_nullable = is_nullable; }
    void set_is_auto_increment(bool is_auto_increment) { _is_auto_increment = is_auto_increment; }
    void set_is_on_update_current_timestamp(bool is_on_update_current_timestamp) {
        _is_on_update_current_timestamp = is_on_update_current_timestamp;
    }
    void set_path_info(const PathInData& path);
    FieldAggregationMethod aggregation() const { return _aggregation; }
    AggregateFunctionPtr get_aggregate_function_union(DataTypePtr type,
                                                      int current_be_exec_version) const;
    AggregateFunctionPtr get_aggregate_function(std::string suffix,
                                                int current_be_exec_version) const;
    int precision() const { return _precision; }
    int frac() const { return _frac; }
    bool visible() const { return _visible; }
    bool has_char_type() const;

    // Sets the aggregation method and keeps the cached aggregation name in sync.
    void set_aggregation_method(FieldAggregationMethod agg) {
        _aggregation = agg;
        _aggregation_name = get_string_by_aggregation_type(agg);
    }

    /**
     * Add a sub column.
     */
    void add_sub_column(TabletColumn& sub_column);

    uint32_t get_subtype_count() const { return _sub_column_count; }
    // No bounds check is performed on `i`.
    MOCK_FUNCTION const TabletColumn& get_sub_column(uint64_t i) const { return *_sub_columns[i]; }
    const std::vector<TabletColumnPtr>& get_sub_columns() const { return _sub_columns; }

    friend bool operator==(const TabletColumn& a, const TabletColumn& b);
    friend bool operator!=(const TabletColumn& a, const TabletColumn& b);

    static std::string get_string_by_field_type(FieldType type);
    static std::string get_string_by_aggregation_type(FieldAggregationMethod aggregation_type);
    static FieldType get_field_type_by_string(const std::string& str);
    static FieldType get_field_type_by_type(PrimitiveType type);
    static PrimitiveType get_primitive_type_by_field_type(FieldType type);
    static FieldAggregationMethod get_aggregation_type_by_string(const std::string& str);
    static uint32_t get_field_length_by_type(TPrimitiveType::type type, uint32_t string_length);
    bool is_row_store_column() const;
    std::string get_aggregation_name() const { return _aggregation_name; }
    bool get_result_is_nullable() const { return _result_is_nullable; }
    int get_be_exec_version() const { return _be_exec_version; }
    // True when a path is set and it is not empty.
    bool has_path_info() const { return _column_path != nullptr && !_column_path->empty(); }
    const PathInDataPtr& path_info_ptr() const { return _column_path; }
    // If it is an extracted column from variant column: it has a non-empty
    // path and a positive parent column unique id.
    bool is_extracted_column() const { return has_path_info() && _parent_col_unique_id > 0; }
    // The path of an extracted column, or an empty string for any other column.
    std::string suffix_path() const {
        return is_extracted_column() ? _column_path->get_path() : "";
    }
    bool is_nested_subcolumn() const {
        return _column_path != nullptr && _column_path->has_nested_part();
    }
    int32_t parent_unique_id() const { return _parent_col_unique_id; }
    void set_parent_unique_id(int32_t col_unique_id) { _parent_col_unique_id = col_unique_id; }
    void set_is_bf_column(bool is_bf_column) { _is_bf_column = is_bf_column; }
    std::shared_ptr<const IDataType> get_vec_type() const;

    // Returns NotSupported when a bloom filter is requested on an ARRAY,
    // STRUCT or MAP column; OK in every other case.
    Status check_valid() const {
        if (type() != FieldType::OLAP_FIELD_TYPE_ARRAY &&
            type() != FieldType::OLAP_FIELD_TYPE_STRUCT &&
            type() != FieldType::OLAP_FIELD_TYPE_MAP) {
            return Status::OK();
        }
        if (is_bf_column()) {
            return Status::NotSupported("Do not support bloom filter index, type={}",
                                        get_string_by_field_type(type()));
        }
        return Status::OK();
    }

    // Setting a precision also flags the column as decimal.
    void set_precision(int precision) {
        _precision = precision;
        _is_decimal = true;
    }

    void set_frac(int frac) { _frac = frac; }

    const VariantParams& variant_params() const { return _variant; }
    VariantParams* mutable_variant_params() { return &_variant; }

    int32_t variant_max_subcolumns_count() const { return _variant.max_subcolumns_count; }

    void set_variant_max_subcolumns_count(int32_t variant_max_subcolumns_count) {
        _variant.max_subcolumns_count = variant_max_subcolumns_count;
    }

    PatternTypePB pattern_type() const { return _pattern_type; }

    bool variant_enable_typed_paths_to_sparse() const {
        return _variant.enable_typed_paths_to_sparse;
    }

    int32_t variant_max_sparse_column_statistics_size() const {
        return _variant.max_sparse_column_statistics_size;
    }

    int32_t variant_sparse_hash_shard_count() const { return _variant.sparse_hash_shard_count; }

    bool variant_enable_doc_mode() const { return _variant.enable_doc_mode; }

    int64_t variant_doc_materialization_min_rows() const {
        return _variant.doc_materialization_min_rows;
    }

    int32_t variant_doc_hash_shard_count() const { return _variant.doc_hash_shard_count; }

    void set_variant_doc_materialization_min_rows(int64_t variant_doc_materialization_min_rows) {
        _variant.doc_materialization_min_rows = variant_doc_materialization_min_rows;
    }

    void set_variant_doc_hash_shard_count(int32_t variant_doc_hash_shard_count) {
        _variant.doc_hash_shard_count = variant_doc_hash_shard_count;
    }

    void set_variant_max_sparse_column_statistics_size(
            int32_t variant_max_sparse_column_statistics_size) {
        _variant.max_sparse_column_statistics_size = variant_max_sparse_column_statistics_size;
    }

    void set_variant_sparse_hash_shard_count(int32_t variant_sparse_hash_shard_count) {
        _variant.sparse_hash_shard_count = variant_sparse_hash_shard_count;
    }

    void set_variant_enable_doc_mode(bool variant_enable_doc_mode) {
        _variant.enable_doc_mode = variant_enable_doc_mode;
    }

    void set_variant_enable_typed_paths_to_sparse(bool variant_enable_typed_paths_to_sparse) {
        _variant.enable_typed_paths_to_sparse = variant_enable_typed_paths_to_sparse;
    }

    bool variant_enable_nested_group() const { return _variant.enable_nested_group; }

    void set_variant_enable_nested_group(bool val) { _variant.enable_nested_group = val; }

    bool is_decimal() const { return _is_decimal; }

private:
    int32_t _unique_id = -1;
    std::string _col_name;
    std::string _col_name_lower_case;
    // The field _type is converted from TPrimitiveType
    // to string by 'EnumToString(TPrimitiveType, tcolumn.column_type.type, data_type);' (reference: TabletMeta::init_column_from_tcolumn)
    // and then to FieldType by 'TabletColumn::get_field_type_by_string' (reference: TabletColumn::init_from_pb).
    // The type in ColumnPB is a string, produced from FieldType by 'get_string_by_field_type' (reference: TabletColumn::to_schema_pb).
    // NOTE(review): _type and _aggregation have no in-class initializer; confirm every constructor sets them.
    FieldType _type;
    bool _is_key = false;
    FieldAggregationMethod _aggregation;
    std::string _aggregation_name;
    bool _is_nullable = false;
    bool _is_auto_increment = false;
    bool _is_on_update_current_timestamp {false};

    bool _has_default_value = false;
    std::string _default_value;

    bool _is_decimal = false;
    int32_t _precision = -1;
    int32_t _frac = -1;

    int32_t _length = -1;
    int32_t _index_length = -1;

    bool _is_bf_column = false;

    bool _visible = true;

    std::vector<TabletColumnPtr> _sub_columns;
    uint32_t _sub_column_count = 0;

    bool _result_is_nullable = false;
    int _be_exec_version = -1;

    // The extracted sub-columns from "variant" contain the following information:
    int32_t _parent_col_unique_id = -1; // "variant" -> col_unique_id
    PathInDataPtr _column_path;         // the path of the sub-columns themselves
    PatternTypePB _pattern_type = PatternTypePB::MATCH_NAME_GLOB;

    VariantParams _variant;
};
330
331
bool operator==(const TabletColumn& a, const TabletColumn& b);
332
bool operator!=(const TabletColumn& a, const TabletColumn& b);
333
334
// Metadata of one index defined on a tablet: its id, name, type, the unique
// ids of the columns it covers and a string-to-string property map.
class TabletIndex : public MetadataAdder<TabletIndex> {
public:
    TabletIndex() = default;
    void init_from_thrift(const TOlapTableIndex& index, const TabletSchema& tablet_schema);
    void init_from_thrift(const TOlapTableIndex& index, const std::vector<int32_t>& column_uids);
    void init_from_pb(const TabletIndexPB& index);
    void to_schema_pb(TabletIndexPB* index) const;

    int64_t index_id() const { return _index_id; }
    const std::string& index_name() const { return _index_name; }
    MOCK_FUNCTION IndexType index_type() const { return _index_type; }
    const std::vector<int32_t>& col_unique_ids() const { return _col_unique_ids; }
    MOCK_FUNCTION const std::map<std::string, std::string>& properties() const {
        return _properties;
    }

    // Value of the "gram_size" property parsed with std::stoi, or 0 when the
    // property is absent.
    int32_t get_gram_size() const {
        const auto found = _properties.find("gram_size");
        if (found == _properties.end()) {
            return 0;
        }
        return std::stoi(found->second);
    }

    // Value of the "bf_size" property parsed with std::stoi, or 0 when the
    // property is absent.
    int32_t get_gram_bf_size() const {
        const auto found = _properties.find("bf_size");
        if (found == _properties.end()) {
            return 0;
        }
        return std::stoi(found->second);
    }

    const std::string& get_index_suffix() const { return _escaped_index_suffix_path; }

    void set_escaped_escaped_index_suffix_path(const std::string& name);

    bool is_inverted_index() const { return _index_type == IndexType::INVERTED; }

    bool is_ann_index() const { return _index_type == IndexType::ANN; }

    // Drops the parser, parser-alias, analyzer-name and normalizer-name
    // entries from the property map.
    void remove_parser_and_analyzer() {
        _properties.erase(INVERTED_INDEX_PARSER_KEY);
        _properties.erase(INVERTED_INDEX_PARSER_KEY_ALIAS);
        _properties.erase(INVERTED_INDEX_ANALYZER_NAME_KEY);
        _properties.erase(INVERTED_INDEX_NORMALIZER_NAME_KEY);
    }

    // Value of the "field_pattern" property, or an empty string when absent.
    std::string field_pattern() const {
        const auto found = _properties.find("field_pattern");
        if (found != _properties.end()) {
            return found->second;
        }
        return "";
    }

    // Compares every field except _index_id. `other` is dereferenced
    // without a null check.
    bool is_same_except_id(const TabletIndex* other) const {
        if (!(_escaped_index_suffix_path == other->_escaped_index_suffix_path)) {
            return false;
        }
        if (!(_index_name == other->_index_name)) {
            return false;
        }
        if (!(_index_type == other->_index_type)) {
            return false;
        }
        if (!(_col_unique_ids == other->_col_unique_ids)) {
            return false;
        }
        return _properties == other->_properties;
    }

private:
    int64_t _index_id = -1;
    // Identify the different index with the same _index_id
    std::string _escaped_index_suffix_path;
    std::string _index_name;
    IndexType _index_type;
    std::vector<int32_t> _col_unique_ids;
    std::map<std::string, std::string> _properties;
};
401
402
using TabletIndexPtr = std::shared_ptr<TabletIndex>;
403
using TabletIndexes = std::vector<std::shared_ptr<TabletIndex>>;
404
using PathSet = phmap::flat_hash_set<std::string>;
405
406
class TabletSchema : public MetadataAdder<TabletSchema> {
407
public:
408
    enum class ColumnType { NORMAL = 0, DROPPED = 1, VARIANT = 2 };
409
    // TODO(yingchun): better to make constructor as private to avoid
410
    // manually init members incorrectly, and define a new function like
411
    // void create_from_pb(const TabletSchemaPB& schema, TabletSchema* tablet_schema).
412
    TabletSchema();
413
    ~TabletSchema() override;
414
415
    // Init from pb
416
    // ignore_extracted_columns: ignore the extracted columns from variant column
417
    // reuse_cached_column: reuse the cached column in the schema if they are the same, to reduce memory usage
418
    void init_from_pb(const TabletSchemaPB& schema, bool ignore_extracted_columns = false,
419
                      bool reuse_cached_column = false);
420
    // Notice: Use deterministic way to serialize protobuf,
421
    // since serialize Map in protobuf may could lead to un-deterministic by default
422
    template <class PbType>
423
4.50M
    static std::string deterministic_string_serialize(const PbType& pb) {
424
4.50M
        std::string output;
425
4.50M
        google::protobuf::io::StringOutputStream string_output_stream(&output);
426
4.50M
        google::protobuf::io::CodedOutputStream output_stream(&string_output_stream);
427
4.50M
        output_stream.SetSerializationDeterministic(true);
428
4.50M
        pb.SerializeToCodedStream(&output_stream);
429
4.50M
        return output;
430
4.50M
    }
_ZN5doris12TabletSchema30deterministic_string_serializeINS_14TabletSchemaPBEEENSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEEERKT_
Line
Count
Source
423
1.84M
    static std::string deterministic_string_serialize(const PbType& pb) {
424
1.84M
        std::string output;
425
1.84M
        google::protobuf::io::StringOutputStream string_output_stream(&output);
426
1.84M
        google::protobuf::io::CodedOutputStream output_stream(&string_output_stream);
427
1.84M
        output_stream.SetSerializationDeterministic(true);
428
1.84M
        pb.SerializeToCodedStream(&output_stream);
429
1.84M
        return output;
430
1.84M
    }
_ZN5doris12TabletSchema30deterministic_string_serializeINS_8ColumnPBEEENSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEEERKT_
Line
Count
Source
423
2.48M
    static std::string deterministic_string_serialize(const PbType& pb) {
424
2.48M
        std::string output;
425
2.48M
        google::protobuf::io::StringOutputStream string_output_stream(&output);
426
2.48M
        google::protobuf::io::CodedOutputStream output_stream(&string_output_stream);
427
2.48M
        output_stream.SetSerializationDeterministic(true);
428
2.48M
        pb.SerializeToCodedStream(&output_stream);
429
2.48M
        return output;
430
2.48M
    }
_ZN5doris12TabletSchema30deterministic_string_serializeINS_13TabletIndexPBEEENSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEEERKT_
Line
Count
Source
423
167k
    static std::string deterministic_string_serialize(const PbType& pb) {
424
167k
        std::string output;
425
167k
        google::protobuf::io::StringOutputStream string_output_stream(&output);
426
167k
        google::protobuf::io::CodedOutputStream output_stream(&string_output_stream);
427
167k
        output_stream.SetSerializationDeterministic(true);
428
167k
        pb.SerializeToCodedStream(&output_stream);
429
167k
        return output;
430
167k
    }
431
    void to_schema_pb(TabletSchemaPB* tablet_meta_pb) const;
432
    void append_column(TabletColumn column, ColumnType col_type = ColumnType::NORMAL);
433
    void append_index(TabletIndex&& index);
434
    void remove_index(int64_t index_id);
435
    void clear_index();
436
    // Must make sure the row column is always the last column
437
    void add_row_column();
438
    void copy_from(const TabletSchema& tablet_schema);
439
    // lightweight copy, take care of lifecycle of TabletColumn
440
    void shawdow_copy_without_columns(const TabletSchema& tablet_schema);
441
    void update_index_info_from(const TabletSchema& tablet_schema);
442
    std::string to_key() const;
443
    // get_metadata_size covers only the memory of the TabletSchema itself; it does not include child objects.
444
47.0k
    int64_t mem_size() const { return get_metadata_size(); }
445
    size_t row_size() const;
446
    int32_t field_index(const std::string& field_name) const;
447
    int32_t field_index(const PathInData& path) const;
448
    int32_t field_index(int32_t col_unique_id) const;
449
    const TabletColumn& column(size_t ordinal) const;
450
    Result<const TabletColumn*> column(const std::string& field_name) const;
451
    Status have_column(const std::string& field_name) const;
452
    bool exist_column(const std::string& field_name) const;
453
    bool has_column_unique_id(int32_t col_unique_id) const;
454
    const TabletColumn& column_by_uid(int32_t col_unique_id) const;
455
    TabletColumn& mutable_column_by_uid(int32_t col_unique_id);
456
    TabletColumn& mutable_column(size_t ordinal);
457
    void replace_column(size_t pos, TabletColumn new_col);
458
    const std::vector<TabletColumnPtr>& columns() const;
459
11.7M
    size_t num_columns() const { return _num_columns; }
460
35.8M
    size_t num_key_columns() const { return _num_key_columns; }
461
16.4M
    const std::vector<uint32_t>& cluster_key_uids() const { return _cluster_key_uids; }
462
0
    size_t num_null_columns() const { return _num_null_columns; }
463
969k
    size_t num_short_key_columns() const { return _num_short_key_columns; }
464
186k
    size_t num_rows_per_row_block() const { return _num_rows_per_row_block; }
465
2.03M
    size_t num_variant_columns() const { return _num_variant_columns; };
466
0
    size_t num_virtual_columns() const { return _num_virtual_columns; }
467
62.8M
    KeysType keys_type() const { return _keys_type; }
468
2.15M
    SortType sort_type() const { return _sort_type; }
469
185k
    size_t sort_col_num() const { return _sort_col_num; }
470
199k
    CompressKind compress_kind() const { return _compress_kind; }
471
187k
    size_t next_column_unique_id() const { return _next_column_unique_id; }
472
1.95k
    bool has_bf_fpp() const { return _has_bf_fpp; }
473
9.88k
    double bloom_filter_fpp() const { return _bf_fpp; }
474
22.6M
    bool is_in_memory() const { return _is_in_memory; }
475
0
    void set_is_in_memory(bool is_in_memory) { _is_in_memory = is_in_memory; }
476
7
    void set_disable_auto_compaction(bool disable_auto_compaction) {
477
7
        _disable_auto_compaction = disable_auto_compaction;
478
7
    }
479
612M
    bool disable_auto_compaction() const { return _disable_auto_compaction; }
480
    // Deprecated legacy switch for flatten-nested variant behavior.
481
    // It is distinct from variant_enable_nested_group.
482
0
    void set_deprecated_variant_flatten_nested(bool flatten_nested) {
483
0
        _deprecated_enable_variant_flatten_nested = flatten_nested;
484
0
    }
485
191k
    bool deprecated_variant_flatten_nested() const {
486
191k
        return _deprecated_enable_variant_flatten_nested;
487
191k
    }
488
0
    void set_enable_single_replica_compaction(bool enable_single_replica_compaction) {
489
0
        _enable_single_replica_compaction = enable_single_replica_compaction;
490
0
    }
491
27.4M
    bool enable_single_replica_compaction() const { return _enable_single_replica_compaction; }
492
    // indicate if full row store column(all the columns encodes as row) exists
493
101k
    bool has_row_store_for_all_columns() const {
494
101k
        return _store_row_column && row_columns_uids().empty();
495
101k
    }
496
0
    void set_skip_write_index_on_load(bool skip) { _skip_write_index_on_load = skip; }
497
807k
    bool skip_write_index_on_load() const { return _skip_write_index_on_load; }
498
1.24M
    int32_t delete_sign_idx() const { return _delete_sign_idx; }
499
24.4M
    bool has_sequence_col() const { return _sequence_col_idx != -1; }
500
172k
    int32_t sequence_col_idx() const { return _sequence_col_idx; }
501
0
    void set_version_col_idx(int32_t version_col_idx) { _version_col_idx = version_col_idx; }
502
8.94k
    int32_t version_col_idx() const { return _version_col_idx; }
503
530
    bool has_skip_bitmap_col() const { return _skip_bitmap_col_idx != -1; }
504
11.1k
    int32_t skip_bitmap_col_idx() const { return _skip_bitmap_col_idx; }
505
15.7k
    segment_v2::CompressionTypePB compression_type() const { return _compression_type; }
506
0
    void set_row_store_page_size(long page_size) { _row_store_page_size = page_size; }
507
187k
    long row_store_page_size() const { return _row_store_page_size; }
508
0
    void set_storage_page_size(long storage_page_size) { _storage_page_size = storage_page_size; }
509
912k
    long storage_page_size() const { return _storage_page_size; }
510
0
    void set_storage_dict_page_size(long storage_dict_page_size) {
511
0
        _storage_dict_page_size = storage_dict_page_size;
512
0
    }
513
912k
    long storage_dict_page_size() const { return _storage_dict_page_size; }
514
1.19M
    bool has_global_row_id() const {
515
19.2M
        for (auto [col_name, _] : _field_name_to_index) {
516
19.2M
            if (col_name.start_with(StringRef(BeConsts::GLOBAL_ROWID_COL.data(),
517
19.2M
                                              BeConsts::GLOBAL_ROWID_COL.size()))) {
518
7.77k
                return true;
519
7.77k
            }
520
19.2M
        }
521
1.18M
        return false;
522
1.19M
    }
523
524
11.1k
    const std::vector<const TabletIndex*> inverted_indexes() const {
525
11.1k
        std::vector<const TabletIndex*> inverted_indexes;
526
11.1k
        for (const auto& index : _indexes) {
527
4.32k
            if (index->index_type() == IndexType::INVERTED) {
528
4.15k
                inverted_indexes.emplace_back(index.get());
529
4.15k
            }
530
4.32k
        }
531
11.1k
        return inverted_indexes;
532
11.1k
    }
533
486k
    bool has_inverted_index() const {
534
486k
        for (const auto& index : _indexes) {
535
45.8k
            DBUG_EXECUTE_IF("tablet_schema::has_inverted_index", {
536
45.8k
                if (index->col_unique_ids().empty()) {
537
45.8k
                    throw Exception(Status::InternalError("col unique ids cannot be empty"));
538
45.8k
                }
539
45.8k
            });
540
541
45.8k
            if (index->index_type() == IndexType::INVERTED) {
542
                //if index_id == -1, ignore it.
543
45.1k
                if (!index->col_unique_ids().empty() && index->col_unique_ids()[0] >= 0) {
544
45.1k
                    return true;
545
45.1k
                }
546
45.1k
            }
547
45.8k
        }
548
440k
        return false;
549
486k
    }
550
551
379k
    bool has_ann_index() const {
552
379k
        for (const auto& index : _indexes) {
553
576
            if (index->index_type() == IndexType::ANN) {
554
294
                if (!index->col_unique_ids().empty() && index->col_unique_ids()[0] >= 0) {
555
294
                    return true;
556
294
                }
557
293
            }
558
576
        }
559
379k
        return false;
560
379k
    }
561
562
    bool has_inverted_index_with_index_id(int64_t index_id) const;
563
564
    std::vector<const TabletIndex*> inverted_indexs(const TabletColumn& col) const;
565
566
    std::vector<const TabletIndex*> inverted_indexs(int32_t col_unique_id,
567
                                                    const std::string& suffix_path = "") const;
568
    const TabletIndex* ann_index(const TabletColumn& col) const;
569
570
    // Regardless of whether this column supports inverted index
571
    // TabletIndex information will be returned as long as it exists.
572
    const TabletIndex* ann_index(int32_t col_unique_id, const std::string& suffix_path = "") const;
573
574
    std::vector<TabletIndexPtr> inverted_index_by_field_pattern(
575
            int32_t col_unique_id, const std::string& field_pattern) const;
576
577
    bool has_ngram_bf_index(int32_t col_unique_id) const;
578
    const TabletIndex* get_ngram_bf_index(int32_t col_unique_id) const;
579
    const TabletIndex* get_index(int32_t col_unique_id, IndexType index_type,
580
                                 const std::string& suffix_path) const;
581
    void update_indexes_from_thrift(const std::vector<doris::TOlapTableIndex>& indexes);
582
    // If schema version is not set, it should be -1
583
1.75M
    int32_t schema_version() const { return _schema_version; }
584
    void clear_columns();
585
    Block create_block(
586
            const std::vector<uint32_t>& return_columns,
587
            const std::unordered_set<uint32_t>* tablet_columns_need_convert_null = nullptr) const;
588
    Block create_block() const;
589
1.40M
    void set_schema_version(int32_t version) { _schema_version = version; }
590
3.75k
    void set_auto_increment_column(const std::string& auto_increment_column) {
591
3.75k
        _auto_increment_column = auto_increment_column;
592
3.75k
    }
593
2.93k
    // Returns a copy of the stored auto-increment column string.
    std::string auto_increment_column() const {
        return _auto_increment_column;
    }
594
595
189k
    // Records the id of the table this schema is associated with.
    void set_table_id(int64_t table_id) {
        _table_id = table_id;
    }
596
1.23M
    // Returns the stored table id.
    int64_t table_id() const {
        return _table_id;
    }
597
188k
    // Records the id of the database this schema is associated with.
    void set_db_id(int64_t db_id) {
        _db_id = db_id;
    }
598
0
    // Returns the stored database id.
    int64_t db_id() const {
        return _db_id;
    }
599
    void build_current_tablet_schema(int64_t index_id, int32_t version,
600
                                     const OlapTableIndexSchema* index,
601
                                     const TabletSchema& out_tablet_schema);
602
603
    // Merge columns that do not exist in the current schema. These columns were dropped from the current schema
604
    // but they are useful in some cases. For example,
605
    // 1. origin schema is  ColA, ColB
606
    // 2. insert values     1, 2
607
    // 3. delete where ColB = 2
608
    // 4. drop ColB
609
    // 5. insert values  3
610
    // 6. add column ColB; although it is named ColB, it is different from the previous ColB, so we call the new one ColB'
611
    // 7. insert value  4, 5
612
    // Then the read schema should be ColA, ColB, ColB' because the delete predicate needs ColB to remove related data.
613
    // Because they have the same name, the dropped column should not be added to the map; it is only identified by unique id.
614
    void merge_dropped_columns(const TabletSchema& src_schema);
615
616
    bool is_dropped_column(const TabletColumn& col) const;
617
618
    // copy extracted columns from src_schema
619
    void copy_extracted_columns(const TabletSchema& src_schema);
620
621
    // only reserve extracted columns
622
    void reserve_extracted_columns();
623
624
4.04k
    std::string get_all_field_names() const {
625
4.04k
        std::string str = "[";
626
4.04k
        for (auto p : _field_name_to_index) {
627
4.04k
            if (str.size() > 1) {
628
0
                str += ", ";
629
0
            }
630
4.04k
            str += p.first.to_string() + "(" + std::to_string(_cols[p.second]->unique_id()) + ")";
631
4.04k
        }
632
4.04k
        str += "]";
633
4.04k
        return str;
634
4.04k
    }
635
636
    // Dump [(name, type, is_nullable), ...]
637
1
    std::string dump_structure() const {
638
1
        std::string str = "[";
639
15
        for (auto p : _cols) {
640
15
            if (str.size() > 1) {
641
14
                str += ", ";
642
14
            }
643
15
            str += "(";
644
15
            str += p->name();
645
15
            str += ", ";
646
15
            str += TabletColumn::get_string_by_field_type(p->type());
647
15
            str += ", ";
648
15
            str += "is_nullable:";
649
15
            str += (p->is_nullable() ? "true" : "false");
650
15
            str += ")";
651
15
        }
652
1
        str += "]";
653
1
        return str;
654
1
    }
655
656
1
    std::string dump_full_schema() const {
657
1
        std::string str = "[";
658
4
        for (auto p : _cols) {
659
4
            if (str.size() > 1) {
660
3
                str += ", ";
661
3
            }
662
4
            ColumnPB col_pb;
663
4
            p->to_schema_pb(&col_pb);
664
4
            str += "(";
665
4
            str += col_pb.ShortDebugString();
666
4
            str += ")";
667
4
        }
668
1
        str += "]";
669
1
        return str;
670
1
    }
671
672
    Block create_block_by_cids(const std::vector<uint32_t>& cids) const;
673
674
    std::shared_ptr<TabletSchema> copy_without_variant_extracted_columns();
675
2.33M
    // Returns the inverted index storage format recorded on this schema.
    auto get_inverted_index_storage_format() const -> InvertedIndexStorageFormatPB {
        return _inverted_index_storage_format;
    }
678
679
    void update_tablet_columns(const TabletSchema& tablet_schema,
680
                               const std::vector<TColumn>& t_columns);
681
682
13.9k
    // Read-only access to the unique ids of the columns encoded into the row store.
    const std::vector<int32_t>& row_columns_uids() const {
        return _row_store_column_unique_ids;
    }
683
684
    int64_t get_metadata_size() const override;
685
686
    struct SubColumnInfo {
687
        TabletColumn column;
688
        TabletIndexes indexes;
689
    };
690
691
    // all paths in path_set_info are relative to the parent column
692
    struct PathsSetInfo {
693
        std::unordered_map<std::string, SubColumnInfo> typed_path_set;    // typed columns
694
        std::unordered_map<std::string, TabletIndexes> subcolumn_indexes; // subcolumns indexes
695
        PathSet sub_path_set;                                             // extracted columns
696
        PathSet sparse_path_set;                                          // sparse columns
697
    };
698
699
8.06k
    void set_path_set_info(std::unordered_map<int32_t, PathsSetInfo>&& path_set_info_map) {
700
8.06k
        _path_set_info_map = std::move(path_set_info_map);
701
8.06k
    }
702
703
5.18k
    const PathsSetInfo& path_set_info(int32_t unique_id) const {
704
5.18k
        return _path_set_info_map.at(unique_id);
705
5.18k
    }
706
707
    // True exactly when variant_max_subcolumns_count() reports 0.
    bool need_record_variant_extended_schema() const {
        return variant_max_subcolumns_count() == 0;
    }
708
709
    int32_t variant_max_subcolumns_count() const {
710
        for (const auto& col : _cols) {
711
            if (col->is_variant_type()) {
712
                return col->variant_max_subcolumns_count();
713
            }
714
        }
715
        return 0;
716
    }
717
    const std::unordered_map<uint32_t, std::vector<uint32_t>>& seq_col_idx_to_value_cols_idx()
718
76
            const {
719
76
        return _seq_col_idx_to_value_cols_idx;
720
76
    }
721
722
4.90M
    // True when the sequence-column-index -> value-column-indexes map has at least one entry.
    bool has_seq_map() const {
        return !_seq_col_idx_to_value_cols_idx.empty();
    }
723
724
39
    const std::unordered_map<uint32_t, uint32_t>& value_col_idx_to_seq_col_idx() const {
725
39
        return _value_col_idx_to_seq_col_idx;
726
39
    }
727
728
63.0k
    void add_pruned_columns_data_type(int32_t col_unique_id, DataTypePtr data_type) {
729
63.0k
        _pruned_columns_data_type[col_unique_id] = std::move(data_type);
730
63.0k
    }
731
732
0
    // Drops every registered pruned-column data type.
    void clear_pruned_columns_data_type() {
        _pruned_columns_data_type.clear();
    }
733
734
0
    // True when at least one pruned-column data type has been registered.
    bool has_pruned_columns() const {
        return !_pruned_columns_data_type.empty();
    }
735
736
    // Whether new segments use externalized ColumnMetaPB layout (CMO) by default
737
72.1k
    bool is_external_segment_column_meta_used() const {
738
72.1k
        return _is_external_segment_column_meta_used;
739
72.1k
    }
740
741
34
    void set_external_segment_meta_used_default(bool v) {
742
34
        _is_external_segment_column_meta_used = v;
743
34
    }
744
745
725k
    bool integer_type_default_use_plain_encoding() const {
746
725k
        return _integer_type_default_use_plain_encoding;
747
725k
    }
748
749
0
    void set_integer_type_default_use_plain_encoding(bool v) {
750
0
        _integer_type_default_use_plain_encoding = v;
751
0
    }
752
753
725k
    // Returns the stored default binary plain encoding implementation.
    auto binary_plain_encoding_default_impl() const -> BinaryPlainEncodingTypePB {
        return _binary_plain_encoding_default_impl;
    }
756
757
0
    // Sets the value that binary_plain_encoding_default_impl() reports.
    void set_binary_plain_encoding_default_impl(BinaryPlainEncodingTypePB impl) {
        this->_binary_plain_encoding_default_impl = impl;
    }
760
761
private:
762
    friend bool operator==(const TabletSchema& a, const TabletSchema& b);
763
    friend bool operator!=(const TabletSchema& a, const TabletSchema& b);
764
    TabletSchema(const TabletSchema&) = default;
765
766
    KeysType _keys_type = DUP_KEYS;
767
    SortType _sort_type = SortType::LEXICAL;
768
    size_t _sort_col_num = 0;
769
    std::vector<TabletColumnPtr> _cols;
770
771
    std::vector<TabletIndexPtr> _indexes;
772
    std::unordered_map<StringRef, int32_t, StringRefHash> _field_name_to_index;
773
    std::unordered_map<int32_t, int32_t> _field_uniqueid_to_index;
774
    std::unordered_map<PathInDataRef, int32_t, PathInDataRef::Hash> _field_path_to_index;
775
776
    // index_type/col_unique_id/suffix -> idxs in _indexes
777
    using IndexKey = std::tuple<IndexType, int32_t, std::string>;
778
    struct IndexKeyHash {
779
31.1M
        size_t operator()(const IndexKey& t) const {
780
31.1M
            uint32_t seed = 0;
781
31.1M
            seed = doris::HashUtil::hash((const char*)&std::get<0>(t), sizeof(std::get<0>(t)),
782
31.1M
                                         seed);
783
31.1M
            seed = doris::HashUtil::hash((const char*)&std::get<1>(t), sizeof(std::get<1>(t)),
784
31.1M
                                         seed);
785
31.1M
            seed = doris::HashUtil::hash((const char*)std::get<2>(t).c_str(),
786
31.1M
                                         static_cast<uint32_t>(std::get<2>(t).size()), seed);
787
31.1M
            return seed;
788
31.1M
        }
789
    };
790
    std::unordered_map<IndexKey, std::vector<size_t>, IndexKeyHash> _col_id_suffix_to_index;
791
792
    int32_t _num_columns = 0;
793
    size_t _num_variant_columns = 0;
794
    size_t _num_virtual_columns = 0;
795
    size_t _num_key_columns = 0;
796
    std::vector<uint32_t> _cluster_key_uids;
797
    size_t _num_null_columns = 0;
798
    size_t _num_short_key_columns = 0;
799
    size_t _num_rows_per_row_block = 0;
800
    CompressKind _compress_kind = COMPRESS_NONE;
801
    segment_v2::CompressionTypePB _compression_type = segment_v2::CompressionTypePB::LZ4F;
802
    long _row_store_page_size = segment_v2::ROW_STORE_PAGE_SIZE_DEFAULT_VALUE;
803
    long _storage_page_size = segment_v2::STORAGE_PAGE_SIZE_DEFAULT_VALUE;
804
    long _storage_dict_page_size = segment_v2::STORAGE_DICT_PAGE_SIZE_DEFAULT_VALUE;
805
    size_t _next_column_unique_id = 0;
806
    std::string _auto_increment_column;
807
808
    bool _has_bf_fpp = false;
809
    double _bf_fpp = 0;
810
    bool _is_in_memory = false;
811
    int32_t _delete_sign_idx = -1;
812
    int32_t _sequence_col_idx = -1;
813
    int32_t _version_col_idx = -1;
814
    int32_t _skip_bitmap_col_idx = -1;
815
    int32_t _schema_version = -1;
816
    int64_t _table_id = -1;
817
    int64_t _db_id = -1;
818
    bool _disable_auto_compaction = false;
819
    bool _enable_single_replica_compaction = false;
820
    bool _store_row_column = false;
821
    bool _skip_write_index_on_load = false;
822
    InvertedIndexStorageFormatPB _inverted_index_storage_format = InvertedIndexStorageFormatPB::V1;
823
824
    // Contains column ids of which columns should be encoded into row store.
825
    // ATTN: For compatibility reasons, empty cids means all columns of the tablet schema are encoded into the row column
826
    std::vector<int32_t> _row_store_column_unique_ids;
827
    bool _deprecated_enable_variant_flatten_nested = false;
828
829
    std::map<size_t, int32_t> _vir_col_idx_to_unique_id;
830
    std::map<int32_t, DataTypePtr> _pruned_columns_data_type;
831
832
    // value: extracted path set and sparse path set
833
    std::unordered_map<int32_t, PathsSetInfo> _path_set_info_map;
834
835
    // key: field_pattern
836
    // value: indexes
837
    using PatternToIndex = std::unordered_map<std::string, std::vector<TabletIndexPtr>>;
838
    std::unordered_map<int32_t, PatternToIndex> _index_by_unique_id_with_pattern;
839
840
    // Default behavior for new segments: use external ColumnMeta region + CMO table if true
841
    bool _is_external_segment_column_meta_used = false;
842
843
    bool _integer_type_default_use_plain_encoding {false};
844
    BinaryPlainEncodingTypePB _binary_plain_encoding_default_impl {
845
            BinaryPlainEncodingTypePB::BINARY_PLAIN_ENCODING_V1};
846
    // Sequence column unique id mapping to value columns unique id
847
    std::unordered_map<uint32_t, std::vector<uint32_t>> _seq_col_uid_to_value_cols_uid;
848
    // Value column unique id mapping to sequence column unique id (also maps the sequence column itself)
849
    std::unordered_map<uint32_t, uint32_t> _value_col_uid_to_seq_col_uid;
850
    // Sequence column index mapping to value column index
851
    std::unordered_map<uint32_t, std::vector<uint32_t>> _seq_col_idx_to_value_cols_idx;
852
    // Value column index mapping to sequence column index (also maps the sequence column itself)
853
    std::unordered_map<uint32_t, uint32_t> _value_col_idx_to_seq_col_idx;
854
};
855
856
bool operator==(const TabletSchema& a, const TabletSchema& b);
857
bool operator!=(const TabletSchema& a, const TabletSchema& b);
858
859
using TabletSchemaSPtr = std::shared_ptr<TabletSchema>;
860
861
} // namespace doris