Coverage Report

Created: 2025-10-23 18:51

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/root/doris/be/src/olap/tablet_schema.h
Line
Count
Source
1
// Licensed to the Apache Software Foundation (ASF) under one
2
// or more contributor license agreements.  See the NOTICE file
3
// distributed with this work for additional information
4
// regarding copyright ownership.  The ASF licenses this file
5
// to you under the Apache License, Version 2.0 (the
6
// "License"); you may not use this file except in compliance
7
// with the License.  You may obtain a copy of the License at
8
//
9
//   http://www.apache.org/licenses/LICENSE-2.0
10
//
11
// Unless required by applicable law or agreed to in writing,
12
// software distributed under the License is distributed on an
13
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14
// KIND, either express or implied.  See the License for the
15
// specific language governing permissions and limitations
16
// under the License.
17
18
#pragma once
19
20
#include <gen_cpp/Types_types.h>
21
#include <gen_cpp/olap_common.pb.h>
22
#include <gen_cpp/olap_file.pb.h>
23
#include <gen_cpp/segment_v2.pb.h>
24
#include <parallel_hashmap/phmap.h>
25
26
#include <algorithm>
27
#include <cstdint>
28
#include <map>
29
#include <memory>
30
#include <string>
31
#include <unordered_map>
32
#include <unordered_set>
33
#include <utility>
34
#include <vector>
35
36
#include "common/consts.h"
37
#include "common/status.h"
38
#include "olap/inverted_index_parser.h"
39
#include "olap/metadata_adder.h"
40
#include "olap/olap_common.h"
41
#include "olap/rowset/segment_v2/options.h"
42
#include "runtime/define_primitive_type.h"
43
#include "runtime/descriptors.h"
44
#include "runtime/memory/lru_cache_policy.h"
45
#include "util/debug_points.h"
46
#include "util/string_parser.hpp"
47
#include "util/string_util.h"
48
#include "vec/aggregate_functions/aggregate_function.h"
49
#include "vec/common/string_ref.h"
50
#include "vec/common/string_utils/string_utils.h"
51
#include "vec/core/types.h"
52
#include "vec/json/path_in_data.h"
53
54
namespace doris {
55
namespace vectorized {
56
class Block;
57
class PathInData;
58
class IDataType;
59
} // namespace vectorized
60
61
#include "common/compile_check_begin.h"
62
63
struct OlapTableIndexSchema;
64
class TColumn;
65
class TOlapTableIndex;
66
class TabletColumn;
67
68
using TabletColumnPtr = std::shared_ptr<TabletColumn>;
69
70
class TabletColumn : public MetadataAdder<TabletColumn> {
71
public:
72
    TabletColumn();
73
    TabletColumn(const ColumnPB& column);
74
    TabletColumn(const TColumn& column);
75
    TabletColumn(FieldAggregationMethod agg, FieldType type);
76
    TabletColumn(FieldAggregationMethod agg, FieldType filed_type, bool is_nullable);
77
    TabletColumn(FieldAggregationMethod agg, FieldType filed_type, bool is_nullable,
78
                 int32_t unique_id, size_t length);
79
80
#ifdef BE_TEST
81
61.0k
    virtual ~TabletColumn() = default;
82
#endif
83
84
    void init_from_pb(const ColumnPB& column);
85
    void init_from_thrift(const TColumn& column);
86
    void to_schema_pb(ColumnPB* column) const;
87
88
306k
    int32_t unique_id() const { return _unique_id; }
89
464
    void set_unique_id(int32_t id) { _unique_id = id; }
90
195k
    const std::string& name() const { return _col_name; }
91
21.5k
    const std::string& name_lower_case() const { return _col_name_lower_case; }
92
2.65k
    void set_name(std::string col_name) {
93
2.65k
        _col_name = col_name;
94
2.65k
        _col_name_lower_case = to_lower(_col_name);
95
2.65k
    }
96
1.48M
    MOCK_FUNCTION FieldType type() const { return _type; }
97
2.64k
    void set_type(FieldType type) { _type = type; }
98
92.4k
    bool is_key() const { return _is_key; }
99
241k
    bool is_nullable() const { return _is_nullable; }
100
0
    bool is_auto_increment() const { return _is_auto_increment; }
101
0
    bool is_seqeunce_col() const { return _col_name == SEQUENCE_COL; }
102
0
    bool is_on_update_current_timestamp() const { return _is_on_update_current_timestamp; }
103
74.0k
    bool is_variant_type() const { return _type == FieldType::OLAP_FIELD_TYPE_VARIANT; }
104
15.4k
    bool is_bf_column() const { return _is_bf_column; }
105
15.1k
    bool has_bitmap_index() const { return _has_bitmap_index; }
106
41.2k
    bool is_array_type() const { return _type == FieldType::OLAP_FIELD_TYPE_ARRAY; }
107
12.5k
    bool is_agg_state_type() const { return _type == FieldType::OLAP_FIELD_TYPE_AGG_STATE; }
108
0
    bool is_jsonb_type() const { return _type == FieldType::OLAP_FIELD_TYPE_JSONB; }
109
0
    bool is_length_variable_type() const {
110
0
        return _type == FieldType::OLAP_FIELD_TYPE_CHAR ||
111
0
               _type == FieldType::OLAP_FIELD_TYPE_VARCHAR ||
112
0
               _type == FieldType::OLAP_FIELD_TYPE_STRING ||
113
0
               _type == FieldType::OLAP_FIELD_TYPE_HLL ||
114
0
               _type == FieldType::OLAP_FIELD_TYPE_BITMAP ||
115
0
               _type == FieldType::OLAP_FIELD_TYPE_QUANTILE_STATE ||
116
0
               _type == FieldType::OLAP_FIELD_TYPE_AGG_STATE;
117
0
    }
118
    // Such columns do not exist in the frontend schema info, so we need to
119
    // add them into tablet_schema for later column indexing.
120
    static TabletColumn create_materialized_variant_column(const std::string& root,
121
                                                           const std::vector<std::string>& paths,
122
                                                           int32_t parent_unique_id,
123
                                                           int32_t max_subcolumns_count);
124
193
    bool has_default_value() const { return _has_default_value; }
125
15.4k
    std::string default_value() const { return _default_value; }
126
240k
    int32_t length() const { return _length; }
127
1.04k
    void set_length(int32_t length) { _length = length; }
128
256
    void set_default_value(const std::string& default_value) {
129
256
        _default_value = default_value;
130
256
        _has_default_value = true;
131
256
    }
132
33.4k
    int32_t index_length() const { return _index_length; }
133
470
    void set_index_length(int32_t index_length) { _index_length = index_length; }
134
252
    void set_is_key(bool is_key) { _is_key = is_key; }
135
1.11k
    void set_is_nullable(bool is_nullable) { _is_nullable = is_nullable; }
136
0
    void set_is_auto_increment(bool is_auto_increment) { _is_auto_increment = is_auto_increment; }
137
0
    void set_is_on_update_current_timestamp(bool is_on_update_current_timestamp) {
138
0
        _is_on_update_current_timestamp = is_on_update_current_timestamp;
139
0
    }
140
    void set_path_info(const vectorized::PathInData& path);
141
24.0k
    FieldAggregationMethod aggregation() const { return _aggregation; }
142
    vectorized::AggregateFunctionPtr get_aggregate_function_union(
143
            vectorized::DataTypePtr type, int current_be_exec_version) const;
144
    vectorized::AggregateFunctionPtr get_aggregate_function(std::string suffix,
145
                                                            int current_be_exec_version) const;
146
210k
    int precision() const { return _precision; }
147
211k
    int frac() const { return _frac; }
148
2
    inline bool visible() const { return _visible; }
149
    bool has_char_type() const;
150
151
1.15k
    void set_aggregation_method(FieldAggregationMethod agg) {
152
1.15k
        _aggregation = agg;
153
1.15k
        _aggregation_name = get_string_by_aggregation_type(agg);
154
1.15k
    }
155
156
    /**
157
     * Add a sub column.
158
     */
159
    void add_sub_column(TabletColumn& sub_column);
160
161
17.4k
    uint32_t get_subtype_count() const { return _sub_column_count; }
162
2.62k
    MOCK_FUNCTION const TabletColumn& get_sub_column(uint64_t i) const { return *_sub_columns[i]; }
163
2.86k
    const std::vector<TabletColumnPtr>& get_sub_columns() const { return _sub_columns; }
164
165
    friend bool operator==(const TabletColumn& a, const TabletColumn& b);
166
    friend bool operator!=(const TabletColumn& a, const TabletColumn& b);
167
168
    static std::string get_string_by_field_type(FieldType type);
169
    static std::string get_string_by_aggregation_type(FieldAggregationMethod aggregation_type);
170
    static FieldType get_field_type_by_string(const std::string& str);
171
    static FieldType get_field_type_by_type(PrimitiveType type);
172
    static PrimitiveType get_primitive_type_by_field_type(FieldType type);
173
    static FieldAggregationMethod get_aggregation_type_by_string(const std::string& str);
174
    static uint32_t get_field_length_by_type(TPrimitiveType::type type, uint32_t string_length);
175
    bool is_row_store_column() const;
176
13.8k
    std::string get_aggregation_name() const { return _aggregation_name; }
177
13.8k
    bool get_result_is_nullable() const { return _result_is_nullable; }
178
13.8k
    int get_be_exec_version() const { return _be_exec_version; }
179
96.3k
    bool has_path_info() const { return _column_path != nullptr && !_column_path->empty(); }
180
128k
    const vectorized::PathInDataPtr& path_info_ptr() const { return _column_path; }
181
    // Whether this is a column extracted from a variant column
182
175k
    bool is_extracted_column() const {
183
175k
        return _column_path != nullptr && !_column_path->empty() && _parent_col_unique_id > 0;
184
175k
    };
185
36.6k
    std::string suffix_path() const {
186
36.6k
        return is_extracted_column() ? _column_path->get_path() : "";
187
36.6k
    }
188
513
    bool is_nested_subcolumn() const {
189
513
        return _column_path != nullptr && _column_path->has_nested_part();
190
513
    }
191
32.1k
    int32_t parent_unique_id() const { return _parent_col_unique_id; }
192
1.24k
    void set_parent_unique_id(int32_t col_unique_id) { _parent_col_unique_id = col_unique_id; }
193
332
    void set_is_bf_column(bool is_bf_column) { _is_bf_column = is_bf_column; }
194
0
    void set_has_bitmap_index(bool has_bitmap_index) { _has_bitmap_index = has_bitmap_index; }
195
    std::shared_ptr<const vectorized::IDataType> get_vec_type() const;
196
197
505
    Status check_valid() const {
198
505
        if (type() != FieldType::OLAP_FIELD_TYPE_ARRAY &&
199
505
            type() != FieldType::OLAP_FIELD_TYPE_STRUCT &&
200
505
            type() != FieldType::OLAP_FIELD_TYPE_MAP) {
201
505
            return Status::OK();
202
505
        }
203
0
        if (is_bf_column()) {
204
0
            return Status::NotSupported("Do not support bloom filter index, type={}",
205
0
                                        get_string_by_field_type(type()));
206
0
        }
207
0
        if (has_bitmap_index()) {
208
0
            return Status::NotSupported("Do not support bitmap index, type={}",
209
0
                                        get_string_by_field_type(type()));
210
0
        }
211
0
        return Status::OK();
212
0
    }
213
214
2
    void set_precision(int precision) {
215
2
        _precision = precision;
216
2
        _is_decimal = true;
217
2
    }
218
219
53
    void set_frac(int frac) { _frac = frac; }
220
221
68
    void set_variant_max_subcolumns_count(int32_t variant_max_subcolumns_count) {
222
68
        _variant_max_subcolumns_count = variant_max_subcolumns_count;
223
68
    }
224
225
13
    void set_variant_enable_typed_paths_to_sparse(bool enable) {
226
13
        _variant_enable_typed_paths_to_sparse = enable;
227
13
    }
228
229
    void set_variant_max_sparse_column_statistics_size(
230
2
            int32_t variant_max_sparse_column_statistics_size) {
231
2
        _variant_max_sparse_column_statistics_size = variant_max_sparse_column_statistics_size;
232
2
    }
233
234
2.11k
    int32_t variant_max_subcolumns_count() const { return _variant_max_subcolumns_count; }
235
236
154
    PatternTypePB pattern_type() const { return _pattern_type; }
237
238
370
    bool variant_enable_typed_paths_to_sparse() const {
239
370
        return _variant_enable_typed_paths_to_sparse;
240
370
    }
241
242
22.3k
    int32_t variant_max_sparse_column_statistics_size() const {
243
22.3k
        return _variant_max_sparse_column_statistics_size;
244
22.3k
    }
245
246
51
    bool is_decimal() const { return _is_decimal; }
247
248
private:
249
    int32_t _unique_id = -1;
250
    std::string _col_name;
251
    std::string _col_name_lower_case;
252
    // the field _type will change from TPrimitiveType
253
    // to string by 'EnumToString(TPrimitiveType, tcolumn.column_type.type, data_type);' (reference: TabletMeta::init_column_from_tcolumn)
254
    // to FieldType by 'TabletColumn::get_field_type_by_string' (reference: TabletColumn::init_from_pb).
255
    // The _type in ColumnPB is a string; it is converted from FieldType by 'get_string_by_field_type' (reference: TabletColumn::to_schema_pb).
256
    FieldType _type;
257
    bool _is_key = false;
258
    FieldAggregationMethod _aggregation;
259
    std::string _aggregation_name;
260
    bool _is_nullable = false;
261
    bool _is_auto_increment = false;
262
    bool _is_on_update_current_timestamp {false};
263
264
    bool _has_default_value = false;
265
    std::string _default_value;
266
267
    bool _is_decimal = false;
268
    int32_t _precision = -1;
269
    int32_t _frac = -1;
270
271
    int32_t _length = -1;
272
    int32_t _index_length = -1;
273
274
    bool _is_bf_column = false;
275
276
    bool _has_bitmap_index = false;
277
    bool _visible = true;
278
279
    std::vector<TabletColumnPtr> _sub_columns;
280
    uint32_t _sub_column_count = 0;
281
282
    bool _result_is_nullable = false;
283
    int _be_exec_version = -1;
284
285
    // The extracted sub-columns from "variant" contain the following information:
286
    int32_t _parent_col_unique_id = -1;     // "variant" -> col_unique_id
287
    vectorized::PathInDataPtr _column_path; // the path of the sub-columns themselves
288
289
    int32_t _variant_max_subcolumns_count = 0;
290
    PatternTypePB _pattern_type = PatternTypePB::MATCH_NAME_GLOB;
291
    bool _variant_enable_typed_paths_to_sparse = false;
292
    // set variant_max_sparse_column_statistics_size
293
    int32_t _variant_max_sparse_column_statistics_size =
294
            BeConsts::DEFAULT_VARIANT_MAX_SPARSE_COLUMN_STATS_SIZE;
295
};
296
297
bool operator==(const TabletColumn& a, const TabletColumn& b);
298
bool operator!=(const TabletColumn& a, const TabletColumn& b);
299
300
class TabletIndex : public MetadataAdder<TabletIndex> {
301
public:
302
7.55k
    TabletIndex() = default;
303
    void init_from_thrift(const TOlapTableIndex& index, const TabletSchema& tablet_schema);
304
    void init_from_thrift(const TOlapTableIndex& index, const std::vector<int32_t>& column_uids);
305
    void init_from_pb(const TabletIndexPB& index);
306
    void to_schema_pb(TabletIndexPB* index) const;
307
308
19.0k
    int64_t index_id() const { return _index_id; }
309
21
    const std::string& index_name() const { return _index_name; }
310
13.3k
    MOCK_FUNCTION IndexType index_type() const { return _index_type; }
311
9.29k
    const std::vector<int32_t>& col_unique_ids() const { return _col_unique_ids; }
312
25.7k
    MOCK_FUNCTION const std::map<std::string, std::string>& properties() const {
313
25.7k
        return _properties;
314
25.7k
    }
315
1
    int32_t get_gram_size() const {
316
1
        if (_properties.contains("gram_size")) {
317
1
            return std::stoi(_properties.at("gram_size"));
318
1
        }
319
320
0
        return 0;
321
1
    }
322
1
    int32_t get_gram_bf_size() const {
323
1
        if (_properties.contains("bf_size")) {
324
1
            return std::stoi(_properties.at("bf_size"));
325
1
        }
326
327
0
        return 0;
328
1
    }
329
330
15.8k
    const std::string& get_index_suffix() const { return _escaped_index_suffix_path; }
331
332
    void set_escaped_escaped_index_suffix_path(const std::string& name);
333
334
2
    bool is_inverted_index() const { return _index_type == IndexType::INVERTED; }
335
336
8
    void remove_parser_and_analyzer() {
337
8
        _properties.erase(INVERTED_INDEX_PARSER_KEY);
338
8
        _properties.erase(INVERTED_INDEX_PARSER_KEY_ALIAS);
339
8
        _properties.erase(INVERTED_INDEX_CUSTOM_ANALYZER_KEY);
340
8
    }
341
342
7.45k
    std::string field_pattern() const {
343
7.45k
        if (_properties.contains("field_pattern")) {
344
8
            return _properties.at("field_pattern");
345
8
        }
346
7.44k
        return "";
347
7.45k
    }
348
349
8
    bool is_same_except_id(const TabletIndex* other) const {
350
8
        return _escaped_index_suffix_path == other->_escaped_index_suffix_path &&
351
8
               _index_name == other->_index_name && _index_type == other->_index_type &&
352
8
               _col_unique_ids == other->_col_unique_ids && _properties == other->_properties;
353
8
    }
354
355
private:
356
    int64_t _index_id = -1;
357
    // Distinguishes different indexes that share the same _index_id
358
    std::string _escaped_index_suffix_path;
359
    std::string _index_name;
360
    IndexType _index_type;
361
    std::vector<int32_t> _col_unique_ids;
362
    std::map<std::string, std::string> _properties;
363
};
364
365
using TabletIndexPtr = std::shared_ptr<TabletIndex>;
366
using TabletIndexes = std::vector<std::shared_ptr<TabletIndex>>;
367
using PathSet = phmap::flat_hash_set<std::string>;
368
369
class TabletSchema : public MetadataAdder<TabletSchema> {
370
public:
371
    enum class ColumnType { NORMAL = 0, DROPPED = 1, VARIANT = 2 };
372
    // TODO(yingchun): better to make the constructor private to avoid
373
    // initializing members incorrectly by hand, and define a new function like
374
    // void create_from_pb(const TabletSchemaPB& schema, TabletSchema* tablet_schema).
375
    TabletSchema();
376
    ~TabletSchema() override;
377
378
    // Init from pb
379
    // ignore_extracted_columns: ignore the extracted columns from variant column
380
    // reuse_cached_column: reuse the cached column in the schema if they are the same, to reduce memory usage
381
    void init_from_pb(const TabletSchemaPB& schema, bool ignore_extracted_columns = false,
382
                      bool reuse_cached_column = false);
383
    // Note: use the deterministic way to serialize protobuf,
384
    // since serializing a Map in protobuf may be non-deterministic by default
385
    template <class PbType>
386
4.33k
    static std::string deterministic_string_serialize(const PbType& pb) {
387
4.33k
        std::string output;
388
4.33k
        google::protobuf::io::StringOutputStream string_output_stream(&output);
389
4.33k
        google::protobuf::io::CodedOutputStream output_stream(&string_output_stream);
390
4.33k
        output_stream.SetSerializationDeterministic(true);
391
4.33k
        pb.SerializeToCodedStream(&output_stream);
392
4.33k
        return output;
393
4.33k
    }
_ZN5doris12TabletSchema30deterministic_string_serializeINS_14TabletSchemaPBEEENSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEEERKT_
Line
Count
Source
386
3.86k
    static std::string deterministic_string_serialize(const PbType& pb) {
387
3.86k
        std::string output;
388
3.86k
        google::protobuf::io::StringOutputStream string_output_stream(&output);
389
3.86k
        google::protobuf::io::CodedOutputStream output_stream(&string_output_stream);
390
3.86k
        output_stream.SetSerializationDeterministic(true);
391
3.86k
        pb.SerializeToCodedStream(&output_stream);
392
3.86k
        return output;
393
3.86k
    }
_ZN5doris12TabletSchema30deterministic_string_serializeINS_8ColumnPBEEENSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEEERKT_
Line
Count
Source
386
320
    static std::string deterministic_string_serialize(const PbType& pb) {
387
320
        std::string output;
388
320
        google::protobuf::io::StringOutputStream string_output_stream(&output);
389
320
        google::protobuf::io::CodedOutputStream output_stream(&string_output_stream);
390
320
        output_stream.SetSerializationDeterministic(true);
391
320
        pb.SerializeToCodedStream(&output_stream);
392
320
        return output;
393
320
    }
_ZN5doris12TabletSchema30deterministic_string_serializeINS_13TabletIndexPBEEENSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEEERKT_
Line
Count
Source
386
149
    static std::string deterministic_string_serialize(const PbType& pb) {
387
149
        std::string output;
388
149
        google::protobuf::io::StringOutputStream string_output_stream(&output);
389
149
        google::protobuf::io::CodedOutputStream output_stream(&string_output_stream);
390
149
        output_stream.SetSerializationDeterministic(true);
391
149
        pb.SerializeToCodedStream(&output_stream);
392
149
        return output;
393
149
    }
394
    void to_schema_pb(TabletSchemaPB* tablet_meta_pb) const;
395
    void append_column(TabletColumn column, ColumnType col_type = ColumnType::NORMAL);
396
    void append_index(TabletIndex&& index);
397
    void remove_index(int64_t index_id);
398
    void clear_index();
399
    // Must make sure the row column is always the last column
400
    void add_row_column();
401
    void copy_from(const TabletSchema& tablet_schema);
402
    // lightweight copy, take care of lifecycle of TabletColumn
403
    void shawdow_copy_without_columns(const TabletSchema& tablet_schema);
404
    void update_index_info_from(const TabletSchema& tablet_schema);
405
    std::string to_key() const;
406
    // get_metadata_size covers only the memory of the TabletSchema itself, not including child objects.
407
57
    int64_t mem_size() const { return get_metadata_size(); }
408
    size_t row_size() const;
409
    int32_t field_index(const std::string& field_name) const;
410
    int32_t field_index(const vectorized::PathInData& path) const;
411
    int32_t field_index(int32_t col_unique_id) const;
412
    const TabletColumn& column(size_t ordinal) const;
413
    Result<const TabletColumn*> column(const std::string& field_name) const;
414
    Status have_column(const std::string& field_name) const;
415
    bool exist_column(const std::string& field_name) const;
416
    bool has_column_unique_id(int32_t col_unique_id) const;
417
    const TabletColumn& column_by_uid(int32_t col_unique_id) const;
418
    TabletColumn& mutable_column_by_uid(int32_t col_unique_id);
419
    TabletColumn& mutable_column(size_t ordinal);
420
    void replace_column(size_t pos, TabletColumn new_col);
421
    const std::vector<TabletColumnPtr>& columns() const;
422
594k
    size_t num_columns() const { return _num_columns; }
423
1.08M
    size_t num_key_columns() const { return _num_key_columns; }
424
136k
    const std::vector<uint32_t>& cluster_key_uids() const { return _cluster_key_uids; }
425
0
    size_t num_null_columns() const { return _num_null_columns; }
426
5.33k
    size_t num_short_key_columns() const { return _num_short_key_columns; }
427
0
    size_t num_rows_per_row_block() const { return _num_rows_per_row_block; }
428
1.13k
    size_t num_variant_columns() const { return _num_variant_columns; };
429
0
    size_t num_virtual_columns() const { return _num_virtual_columns; }
430
7.11M
    KeysType keys_type() const { return _keys_type; }
431
5.62k
    SortType sort_type() const { return _sort_type; }
432
0
    size_t sort_col_num() const { return _sort_col_num; }
433
0
    CompressKind compress_kind() const { return _compress_kind; }
434
0
    size_t next_column_unique_id() const { return _next_column_unique_id; }
435
4
    bool has_bf_fpp() const { return _has_bf_fpp; }
436
4
    double bloom_filter_fpp() const { return _bf_fpp; }
437
18.8k
    bool is_in_memory() const { return _is_in_memory; }
438
0
    void set_is_in_memory(bool is_in_memory) { _is_in_memory = is_in_memory; }
439
2
    void set_disable_auto_compaction(bool disable_auto_compaction) {
440
2
        _disable_auto_compaction = disable_auto_compaction;
441
2
    }
442
288
    bool disable_auto_compaction() const { return _disable_auto_compaction; }
443
0
    void set_enable_variant_flatten_nested(bool flatten_nested) {
444
0
        _enable_variant_flatten_nested = flatten_nested;
445
0
    }
446
0
    bool variant_flatten_nested() const { return _enable_variant_flatten_nested; }
447
0
    void set_enable_single_replica_compaction(bool enable_single_replica_compaction) {
448
0
        _enable_single_replica_compaction = enable_single_replica_compaction;
449
0
    }
450
351
    bool enable_single_replica_compaction() const { return _enable_single_replica_compaction; }
451
    // Indicates whether a full row store column (all the columns encoded as a row) exists
452
0
    bool has_row_store_for_all_columns() const {
453
0
        return _store_row_column && row_columns_uids().empty();
454
0
    }
455
0
    void set_skip_write_index_on_load(bool skip) { _skip_write_index_on_load = skip; }
456
73
    bool skip_write_index_on_load() const { return _skip_write_index_on_load; }
457
8.20k
    int32_t delete_sign_idx() const { return _delete_sign_idx; }
458
0
    void set_delete_sign_idx(int32_t delete_sign_idx) { _delete_sign_idx = delete_sign_idx; }
459
142k
    bool has_sequence_col() const { return _sequence_col_idx != -1; }
460
64.7k
    int32_t sequence_col_idx() const { return _sequence_col_idx; }
461
0
    void set_version_col_idx(int32_t version_col_idx) { _version_col_idx = version_col_idx; }
462
0
    int32_t version_col_idx() const { return _version_col_idx; }
463
0
    bool has_skip_bitmap_col() const { return _skip_bitmap_col_idx != -1; }
464
0
    int32_t skip_bitmap_col_idx() const { return _skip_bitmap_col_idx; }
465
5.21k
    segment_v2::CompressionTypePB compression_type() const { return _compression_type; }
466
0
    void set_row_store_page_size(long page_size) { _row_store_page_size = page_size; }
467
0
    long row_store_page_size() const { return _row_store_page_size; }
468
0
    void set_storage_page_size(long storage_page_size) { _storage_page_size = storage_page_size; }
469
13.8k
    long storage_page_size() const { return _storage_page_size; }
470
0
    void set_storage_dict_page_size(long storage_dict_page_size) {
471
0
        _storage_dict_page_size = storage_dict_page_size;
472
0
    }
473
13.8k
    long storage_dict_page_size() const { return _storage_dict_page_size; }
474
0
    bool has_global_row_id() const {
475
0
        for (auto [col_name, _] : _field_name_to_index) {
476
0
            if (col_name.start_with(StringRef(BeConsts::GLOBAL_ROWID_COL.data(),
477
0
                                              BeConsts::GLOBAL_ROWID_COL.size()))) {
478
0
                return true;
479
0
            }
480
0
        }
481
0
        return false;
482
0
    }
483
484
120
    const std::vector<const TabletIndex*> inverted_indexes() const {
485
120
        std::vector<const TabletIndex*> inverted_indexes;
486
1.25k
        for (const auto& index : _indexes) {
487
1.25k
            if (index->index_type() == IndexType::INVERTED) {
488
1.25k
                inverted_indexes.emplace_back(index.get());
489
1.25k
            }
490
1.25k
        }
491
120
        return inverted_indexes;
492
120
    }
493
11.3k
    bool has_inverted_index() const {
494
11.3k
        for (const auto& index : _indexes) {
495
706
            DBUG_EXECUTE_IF("tablet_schema::has_inverted_index", {
496
706
                if (index->col_unique_ids().empty()) {
497
706
                    throw Exception(Status::InternalError("col unique ids cannot be empty"));
498
706
                }
499
706
            });
500
501
706
            if (index->index_type() == IndexType::INVERTED) {
502
                // Ignore the index unless it has a non-negative first column unique id.
503
706
                if (!index->col_unique_ids().empty() && index->col_unique_ids()[0] >= 0) {
504
706
                    return true;
505
706
                }
506
706
            }
507
706
        }
508
10.6k
        return false;
509
11.3k
    }
510
511
10.6k
    bool has_ann_index() const {
512
10.6k
        for (const auto& index : _indexes) {
513
0
            if (index->index_type() == IndexType::ANN) {
514
0
                if (!index->col_unique_ids().empty() && index->col_unique_ids()[0] >= 0) {
515
0
                    return true;
516
0
                }
517
0
            }
518
0
        }
519
10.6k
        return false;
520
10.6k
    }
521
522
    bool has_inverted_index_with_index_id(int64_t index_id) const;
523
524
    std::vector<const TabletIndex*> inverted_indexs(const TabletColumn& col) const;
525
526
    std::vector<const TabletIndex*> inverted_indexs(int32_t col_unique_id,
527
                                                    const std::string& suffix_path = "") const;
528
    const TabletIndex* ann_index(const TabletColumn& col) const;
529
530
    // Regardless of whether this column supports inverted index
531
    // TabletIndex information will be returned as long as it exists.
532
    const TabletIndex* ann_index(int32_t col_unique_id, const std::string& suffix_path = "") const;
533
534
    std::vector<TabletIndexPtr> inverted_index_by_field_pattern(
535
            int32_t col_unique_id, const std::string& field_pattern) const;
536
537
    bool has_ngram_bf_index(int32_t col_unique_id) const;
538
    const TabletIndex* get_ngram_bf_index(int32_t col_unique_id) const;
539
    void update_indexes_from_thrift(const std::vector<doris::TOlapTableIndex>& indexes);
540
    // If schema version is not set, it should be -1
541
1.56k
    int32_t schema_version() const { return _schema_version; }
542
    void clear_columns();
543
    vectorized::Block create_block(
544
            const std::vector<uint32_t>& return_columns,
545
            const std::unordered_set<uint32_t>* tablet_columns_need_convert_null = nullptr) const;
546
    vectorized::Block create_block(bool ignore_dropped_col = true) const;
547
2
    void set_schema_version(int32_t version) { _schema_version = version; }
548
0
    void set_auto_increment_column(const std::string& auto_increment_column) {
549
0
        _auto_increment_column = auto_increment_column;
550
0
    }
551
0
    std::string auto_increment_column() const { return _auto_increment_column; }
552
553
28
    void set_table_id(int64_t table_id) { _table_id = table_id; }
554
476
    int64_t table_id() const { return _table_id; }
555
28
    void set_db_id(int64_t db_id) { _db_id = db_id; }
556
0
    int64_t db_id() const { return _db_id; }
557
    void build_current_tablet_schema(int64_t index_id, int32_t version,
558
                                     const OlapTableIndexSchema* index,
559
                                     const TabletSchema& out_tablet_schema);
560
561
    // Merge columns that do not exist in the current schema; these columns were dropped from the current schema
562
    // but they are useful in some cases. For example,
563
    // 1. origin schema is  ColA, ColB
564
    // 2. insert values     1, 2
565
    // 3. delete where ColB = 2
566
    // 4. drop ColB
567
    // 5. insert values  3
568
    // 6. add column ColB; although it is named ColB, it differs from the previous ColB, so we call the new one ColB'
569
    // 7. insert value  4, 5
570
    // Then the read schema should be ColA, ColB, ColB' because the delete predicate needs ColB to remove related data.
571
    // Because they have the same name, the dropped column should not be added to the map; it is referenced only by unique id.
572
    void merge_dropped_columns(const TabletSchema& src_schema);
573
574
    bool is_dropped_column(const TabletColumn& col) const;
575
576
    // copy extracted columns from src_schema
577
    void copy_extracted_columns(const TabletSchema& src_schema);
578
579
    // only reserve extracted columns
580
    void reserve_extracted_columns();
581
582
4.04k
    // Renders every entry of _field_name_to_index as "name(unique_id)" and
    // returns them as one bracketed, ", "-separated string, e.g. "[a(1), b(2)]".
    // The unique id is read from the column at the mapped index in _cols.
    // Entries appear in the iteration order of the underlying unordered_map.
    std::string get_all_field_names() const {
583
4.04k
        std::string str = "[";
584
4.04k
        // Bind each entry by const reference rather than copying the pair.
        for (const auto& p : _field_name_to_index) {
585
4.04k
            // Anything beyond the opening "[" means an entry was already written.
            if (str.size() > 1) {
586
0
                str += ", ";
587
0
            }
588
4.04k
            str += p.first.to_string() + "(" + std::to_string(_cols[p.second]->unique_id()) + ")";
589
4.04k
        }
590
4.04k
        str += "]";
591
4.04k
        return str;
592
4.04k
    }
593
594
    // Dump [(name, type, is_nullable), ...]
595
1
    // Returns one "(name, type, is_nullable:true|false)" tuple per column in
    // _cols, in _cols order, joined with ", " and wrapped in "[" ... "]".
    // The type text comes from TabletColumn::get_string_by_field_type().
    std::string dump_structure() const {
596
1
        std::string str = "[";
597
15
        // Bind by const reference so the element handle is not copied on
        // every iteration.
        for (const auto& p : _cols) {
598
15
            // Anything beyond the opening "[" means a tuple was already written.
            if (str.size() > 1) {
599
14
                str += ", ";
600
14
            }
601
15
            str += "(";
602
15
            str += p->name();
603
15
            str += ", ";
604
15
            str += TabletColumn::get_string_by_field_type(p->type());
605
15
            str += ", ";
606
15
            str += "is_nullable:";
607
15
            str += (p->is_nullable() ? "true" : "false");
608
15
            str += ")";
609
15
        }
610
1
        str += "]";
611
1
        return str;
612
1
    }
613
614
1
    // Serialises every column in _cols to a ColumnPB via to_schema_pb() and
    // returns the ShortDebugString() of each, parenthesised, joined with ", "
    // and wrapped in "[" ... "]". Columns appear in _cols order.
    std::string dump_full_schema() const {
615
1
        std::string str = "[";
616
4
        // Bind by const reference so the element handle is not copied on
        // every iteration.
        for (const auto& p : _cols) {
617
4
            // Anything beyond the opening "[" means a column was already written.
            if (str.size() > 1) {
618
3
                str += ", ";
619
3
            }
620
4
            ColumnPB col_pb;
621
4
            p->to_schema_pb(&col_pb);
622
4
            str += "(";
623
4
            str += col_pb.ShortDebugString();
624
4
            str += ")";
625
4
        }
626
1
        str += "]";
627
1
        return str;
628
1
    }
629
630
    vectorized::Block create_block_by_cids(const std::vector<uint32_t>& cids) const;
631
632
    std::shared_ptr<TabletSchema> copy_without_variant_extracted_columns();
633
9.34k
    // Returns _inverted_index_storage_format, whose in-class default is
    // InvertedIndexStorageFormatPB::V1.
    InvertedIndexStorageFormatPB get_inverted_index_storage_format() const {
634
9.34k
        return _inverted_index_storage_format;
635
9.34k
    }
636
637
    void update_tablet_columns(const TabletSchema& tablet_schema,
638
                               const std::vector<TColumn>& t_columns);
639
640
0
    // Returns a const reference to _row_store_column_unique_ids; the reference
    // points into this object, so it must not outlive the schema.
    const std::vector<int32_t>& row_columns_uids() const { return _row_store_column_unique_ids; }
641
642
    int64_t get_metadata_size() const override;
643
644
    // Pairs a TabletColumn with its TabletIndexes. Used as the mapped value of
    // PathsSetInfo::typed_path_set.
    struct SubColumnInfo {
645
        TabletColumn column;
646
        TabletIndexes indexes;
647
    };
648
649
    // all paths in path_set_info are relative to the parent column
650
    // Path bookkeeping held per int32_t key in _path_set_info_map (see
    // set_path_set_info() / path_set_info(), which look entries up by unique_id).
    // The two maps are keyed by std::string.
    struct PathsSetInfo {
651
        std::unordered_map<std::string, SubColumnInfo> typed_path_set;    // typed columns
652
        std::unordered_map<std::string, TabletIndexes> subcolumn_indexes; // subcolumns indexes
653
        PathSet sub_path_set;                                             // extracted columns
654
        PathSet sparse_path_set;                                          // sparse columns
655
    };
656
657
41
    // Replaces _path_set_info_map by move-assigning from `path_set_info_map`.
    // The parameter is an rvalue reference, so the caller's map is moved from.
    void set_path_set_info(std::unordered_map<int32_t, PathsSetInfo>&& path_set_info_map) {
658
41
        _path_set_info_map = std::move(path_set_info_map);
659
41
    }
660
661
2
    // Returns the PathsSetInfo stored under `unique_id` in _path_set_info_map.
    // Lookup uses std::unordered_map::at(), which throws std::out_of_range when
    // no entry exists for `unique_id`.
    const PathsSetInfo& path_set_info(int32_t unique_id) const {
662
2
        return _path_set_info_map.at(unique_id);
663
2
    }
664
665
6
    // True exactly when variant_max_subcolumns_count() returns 0.
    bool need_record_variant_extended_schema() const { return variant_max_subcolumns_count() == 0; }
666
667
11
    // Scans _cols in order and returns variant_max_subcolumns_count() of the
    // first column for which is_variant_type() is true. Later variant columns
    // are not consulted. Returns 0 when no column is of variant type.
    int32_t variant_max_subcolumns_count() const {
668
12
        for (const auto& col : _cols) {
669
12
            if (col->is_variant_type()) {
670
7
                return col->variant_max_subcolumns_count();
671
7
            }
672
12
        }
673
4
        return 0;
674
11
    }
675
676
private:
677
    friend bool operator==(const TabletSchema& a, const TabletSchema& b);
678
    friend bool operator!=(const TabletSchema& a, const TabletSchema& b);
679
0
    // Defaulted (member-wise) copy constructor, declared in the private section.
    TabletSchema(const TabletSchema&) = default;
680
681
    KeysType _keys_type = DUP_KEYS;
682
    SortType _sort_type = SortType::LEXICAL;
683
    size_t _sort_col_num = 0;
684
    std::vector<TabletColumnPtr> _cols;
685
686
    std::vector<TabletIndexPtr> _indexes;
687
    std::unordered_map<StringRef, int32_t, StringRefHash> _field_name_to_index;
688
    std::unordered_map<int32_t, int32_t> _field_uniqueid_to_index;
689
    std::unordered_map<vectorized::PathInDataRef, int32_t, vectorized::PathInDataRef::Hash>
690
            _field_path_to_index;
691
692
    // index_type/col_unique_id/suffix -> idxs in _indexes
693
    using IndexKey = std::tuple<IndexType, int32_t, std::string>;
694
    // Hash functor for IndexKey, used by _col_id_suffix_to_index.
    // Chains doris::HashUtil::hash over the three tuple elements, starting from
    // seed 0 and feeding each result in as the seed of the next call.
    struct IndexKeyHash {
695
51.9k
        size_t operator()(const IndexKey& t) const {
696
51.9k
            uint32_t seed = 0;
697
51.9k
            // Element 0 (IndexType): hashed over its raw object bytes.
            seed = doris::HashUtil::hash((const char*)&std::get<0>(t), sizeof(std::get<0>(t)),
698
51.9k
                                         seed);
699
51.9k
            // Element 1 (int32_t): hashed over its raw object bytes.
            seed = doris::HashUtil::hash((const char*)&std::get<1>(t), sizeof(std::get<1>(t)),
700
51.9k
                                         seed);
701
51.9k
            // Element 2 (std::string): hashed over its character data; the
            // length is narrowed to uint32_t for the call.
            seed = doris::HashUtil::hash((const char*)std::get<2>(t).c_str(),
702
51.9k
                                         static_cast<uint32_t>(std::get<2>(t).size()), seed);
703
51.9k
            // The 32-bit result is implicitly widened to size_t on return.
            return seed;
704
51.9k
        }
705
    };
706
    std::unordered_map<IndexKey, std::vector<size_t>, IndexKeyHash> _col_id_suffix_to_index;
707
708
    int32_t _num_columns = 0;
709
    size_t _num_variant_columns = 0;
710
    size_t _num_virtual_columns = 0;
711
    size_t _num_key_columns = 0;
712
    std::vector<uint32_t> _cluster_key_uids;
713
    size_t _num_null_columns = 0;
714
    size_t _num_short_key_columns = 0;
715
    size_t _num_rows_per_row_block = 0;
716
    CompressKind _compress_kind = COMPRESS_NONE;
717
    segment_v2::CompressionTypePB _compression_type = segment_v2::CompressionTypePB::LZ4F;
718
    long _row_store_page_size = segment_v2::ROW_STORE_PAGE_SIZE_DEFAULT_VALUE;
719
    long _storage_page_size = segment_v2::STORAGE_PAGE_SIZE_DEFAULT_VALUE;
720
    long _storage_dict_page_size = segment_v2::STORAGE_DICT_PAGE_SIZE_DEFAULT_VALUE;
721
    size_t _next_column_unique_id = 0;
722
    std::string _auto_increment_column;
723
724
    bool _has_bf_fpp = false;
725
    double _bf_fpp = 0;
726
    bool _is_in_memory = false;
727
    int32_t _delete_sign_idx = -1;
728
    int32_t _sequence_col_idx = -1;
729
    int32_t _version_col_idx = -1;
730
    int32_t _skip_bitmap_col_idx = -1;
731
    int32_t _schema_version = -1;
732
    int64_t _table_id = -1;
733
    int64_t _db_id = -1;
734
    bool _disable_auto_compaction = false;
735
    bool _enable_single_replica_compaction = false;
736
    bool _store_row_column = false;
737
    bool _skip_write_index_on_load = false;
738
    InvertedIndexStorageFormatPB _inverted_index_storage_format = InvertedIndexStorageFormatPB::V1;
739
740
    // Contains column ids of which columns should be encoded into row store.
741
    // ATTN: For compatibility reasons, empty cids means all columns of the tablet schema are encoded into the row column
742
    std::vector<int32_t> _row_store_column_unique_ids;
743
    bool _enable_variant_flatten_nested = false;
744
745
    std::map<size_t, int32_t> _vir_col_idx_to_unique_id;
746
747
    // value: extracted path set and sparse path set
748
    std::unordered_map<int32_t, PathsSetInfo> _path_set_info_map;
749
750
    // key: field_pattern
751
    // value: indexes
752
    using PatternToIndex = std::unordered_map<std::string, std::vector<TabletIndexPtr>>;
753
    std::unordered_map<int32_t, PatternToIndex> _index_by_unique_id_with_pattern;
754
};
755
756
bool operator==(const TabletSchema& a, const TabletSchema& b);
757
bool operator!=(const TabletSchema& a, const TabletSchema& b);
758
759
using TabletSchemaSPtr = std::shared_ptr<TabletSchema>;
760
761
#include "common/compile_check_end.h"
762
} // namespace doris