Coverage Report

Created: 2025-07-28 21:04

/root/doris/be/src/olap/tablet_schema.h
Line
Count
Source (jump to first uncovered line)
1
// Licensed to the Apache Software Foundation (ASF) under one
2
// or more contributor license agreements.  See the NOTICE file
3
// distributed with this work for additional information
4
// regarding copyright ownership.  The ASF licenses this file
5
// to you under the Apache License, Version 2.0 (the
6
// "License"); you may not use this file except in compliance
7
// with the License.  You may obtain a copy of the License at
8
//
9
//   http://www.apache.org/licenses/LICENSE-2.0
10
//
11
// Unless required by applicable law or agreed to in writing,
12
// software distributed under the License is distributed on an
13
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14
// KIND, either express or implied.  See the License for the
15
// specific language governing permissions and limitations
16
// under the License.
17
18
#pragma once
19
20
#include <gen_cpp/Types_types.h>
21
#include <gen_cpp/olap_common.pb.h>
22
#include <gen_cpp/olap_file.pb.h>
23
#include <gen_cpp/segment_v2.pb.h>
24
#include <parallel_hashmap/phmap.h>
25
26
#include <algorithm>
27
#include <cstdint>
28
#include <map>
29
#include <memory>
30
#include <string>
31
#include <unordered_map>
32
#include <unordered_set>
33
#include <utility>
34
#include <vector>
35
36
#include "common/consts.h"
37
#include "common/status.h"
38
#include "gutil/stringprintf.h"
39
#include "olap/inverted_index_parser.h"
40
#include "olap/metadata_adder.h"
41
#include "olap/olap_common.h"
42
#include "olap/rowset/segment_v2/options.h"
43
#include "runtime/define_primitive_type.h"
44
#include "runtime/descriptors.h"
45
#include "runtime/memory/lru_cache_policy.h"
46
#include "util/debug_points.h"
47
#include "util/string_util.h"
48
#include "vec/aggregate_functions/aggregate_function.h"
49
#include "vec/common/string_ref.h"
50
#include "vec/common/string_utils/string_utils.h"
51
#include "vec/core/types.h"
52
#include "vec/json/path_in_data.h"
53
54
namespace doris {
55
namespace vectorized {
56
class Block;
57
class PathInData;
58
class IDataType;
59
} // namespace vectorized
60
61
struct OlapTableIndexSchema;
62
class TColumn;
63
class TOlapTableIndex;
64
class TabletColumn;
65
66
using TabletColumnPtr = std::shared_ptr<TabletColumn>;
67
68
class TabletColumn : public MetadataAdder<TabletColumn> {
69
public:
70
    TabletColumn();
71
    TabletColumn(const ColumnPB& column);
72
    TabletColumn(const TColumn& column);
73
    TabletColumn(FieldAggregationMethod agg, FieldType type);
74
    TabletColumn(FieldAggregationMethod agg, FieldType filed_type, bool is_nullable);
75
    TabletColumn(FieldAggregationMethod agg, FieldType filed_type, bool is_nullable,
76
                 int32_t unique_id, size_t length);
77
    void init_from_pb(const ColumnPB& column);
78
    void init_from_thrift(const TColumn& column);
79
    void to_schema_pb(ColumnPB* column) const;
80
81
280k
    int32_t unique_id() const { return _unique_id; }
82
141
    void set_unique_id(int32_t id) { _unique_id = id; }
83
175k
    const std::string& name() const { return _col_name; }
84
22.8k
    const std::string& name_lower_case() const { return _col_name_lower_case; }
85
3.58k
    void set_name(std::string col_name) {
86
3.58k
        _col_name = col_name;
87
3.58k
        _col_name_lower_case = to_lower(_col_name);
88
3.58k
    }
89
1.79M
    FieldType type() const { return _type; }
90
3.57k
    void set_type(FieldType type) { _type = type; }
91
93.1k
    bool is_key() const { return _is_key; }
92
290k
    bool is_nullable() const { return _is_nullable; }
93
0
    bool is_auto_increment() const { return _is_auto_increment; }
94
0
    bool is_seqeunce_col() const { return _col_name == SEQUENCE_COL; }
95
0
    bool is_on_update_current_timestamp() const { return _is_on_update_current_timestamp; }
96
91.5k
    bool is_variant_type() const { return _type == FieldType::OLAP_FIELD_TYPE_VARIANT; }
97
15.8k
    bool is_bf_column() const { return _is_bf_column; }
98
15.4k
    bool has_bitmap_index() const { return _has_bitmap_index; }
99
28.4k
    bool is_array_type() const { return _type == FieldType::OLAP_FIELD_TYPE_ARRAY; }
100
0
    bool is_jsonb_type() const { return _type == FieldType::OLAP_FIELD_TYPE_JSONB; }
101
0
    bool is_length_variable_type() const {
102
0
        return _type == FieldType::OLAP_FIELD_TYPE_CHAR ||
103
0
               _type == FieldType::OLAP_FIELD_TYPE_VARCHAR ||
104
0
               _type == FieldType::OLAP_FIELD_TYPE_STRING ||
105
0
               _type == FieldType::OLAP_FIELD_TYPE_HLL ||
106
0
               _type == FieldType::OLAP_FIELD_TYPE_OBJECT ||
107
0
               _type == FieldType::OLAP_FIELD_TYPE_QUANTILE_STATE ||
108
0
               _type == FieldType::OLAP_FIELD_TYPE_AGG_STATE;
109
0
    }
110
    // Such columns are not exist in frontend schema info, so we need to
111
    // add them into tablet_schema for later column indexing.
112
    static TabletColumn create_materialized_variant_column(const std::string& root,
113
                                                           const std::vector<std::string>& paths,
114
                                                           int32_t parent_unique_id,
115
                                                           int32_t max_subcolumns_count);
116
192
    bool has_default_value() const { return _has_default_value; }
117
15.8k
    std::string default_value() const { return _default_value; }
118
46.3k
    size_t length() const { return _length; }
119
1.23k
    void set_length(size_t length) { _length = length; }
120
309
    void set_default_value(const std::string& default_value) {
121
309
        _default_value = default_value;
122
309
        _has_default_value = true;
123
309
    }
124
34.1k
    size_t index_length() const { return _index_length; }
125
335
    void set_index_length(size_t index_length) { _index_length = index_length; }
126
77
    void set_is_key(bool is_key) { _is_key = is_key; }
127
1.61k
    void set_is_nullable(bool is_nullable) { _is_nullable = is_nullable; }
128
0
    void set_is_auto_increment(bool is_auto_increment) { _is_auto_increment = is_auto_increment; }
129
0
    void set_has_default_value(bool has) { _has_default_value = has; }
130
0
    void set_is_on_update_current_timestamp(bool is_on_update_current_timestamp) {
131
0
        _is_on_update_current_timestamp = is_on_update_current_timestamp;
132
0
    }
133
    void set_path_info(const vectorized::PathInData& path);
134
25.1k
    FieldAggregationMethod aggregation() const { return _aggregation; }
135
    vectorized::AggregateFunctionPtr get_aggregate_function_union(
136
            vectorized::DataTypePtr type, int current_be_exec_version) const;
137
    vectorized::AggregateFunctionPtr get_aggregate_function(std::string suffix,
138
                                                            int current_be_exec_version) const;
139
354
    void set_precision(int precision) { _precision = precision; }
140
261k
    int precision() const { return _precision; }
141
354
    void set_frac(int frac) { _frac = frac; }
142
261k
    int frac() const { return _frac; }
143
0
    inline bool visible() const { return _visible; }
144
    bool has_char_type() const;
145
146
1.66k
    void set_aggregation_method(FieldAggregationMethod agg) {
147
1.66k
        _aggregation = agg;
148
1.66k
        _aggregation_name = get_string_by_aggregation_type(agg);
149
1.66k
    }
150
151
    /**
152
     * Add a sub column.
153
     */
154
    void add_sub_column(TabletColumn& sub_column);
155
156
18.9k
    uint32_t get_subtype_count() const { return _sub_column_count; }
157
4.46k
    const TabletColumn& get_sub_column(uint32_t i) const { return *_sub_columns[i]; }
158
3.87k
    const std::vector<TabletColumnPtr>& get_sub_columns() const { return _sub_columns; }
159
160
    friend bool operator==(const TabletColumn& a, const TabletColumn& b);
161
    friend bool operator!=(const TabletColumn& a, const TabletColumn& b);
162
163
    static std::string get_string_by_field_type(FieldType type);
164
    static std::string get_string_by_aggregation_type(FieldAggregationMethod aggregation_type);
165
    static FieldType get_field_type_by_string(const std::string& str);
166
    static FieldType get_field_type_by_type(PrimitiveType type);
167
    static FieldAggregationMethod get_aggregation_type_by_string(const std::string& str);
168
    static uint32_t get_field_length_by_type(TPrimitiveType::type type, uint32_t string_length);
169
    bool is_row_store_column() const;
170
13.9k
    std::string get_aggregation_name() const { return _aggregation_name; }
171
13.9k
    bool get_result_is_nullable() const { return _result_is_nullable; }
172
13.9k
    int get_be_exec_version() const { return _be_exec_version; }
173
79.7k
    bool has_path_info() const { return _column_path != nullptr && !_column_path->empty(); }
  Branch (173:41): [True: 4.64k, False: 75.0k]
  Branch (173:68): [True: 4.64k, False: 0]
174
157k
    const vectorized::PathInDataPtr& path_info_ptr() const { return _column_path; }
175
    // If it is an extracted column from variant column
176
156k
    bool is_extracted_column() const {
177
156k
        return _column_path != nullptr && !_column_path->empty() && _parent_col_unique_id > 0;
  Branch (177:16): [True: 5.05k, False: 151k]
  Branch (177:43): [True: 5.05k, False: 0]
  Branch (177:69): [True: 3.37k, False: 1.68k]
178
156k
    }
179
    // If it is sparse column of variant type
180
    bool is_sparse_column() const;
181
27.2k
    std::string suffix_path() const {
182
27.2k
        return is_extracted_column() ? _column_path->get_path() : "";
  Branch (182:16): [True: 268, False: 27.0k]
183
27.2k
    }
184
2.10k
    bool is_nested_subcolumn() const {
185
2.10k
        return _column_path != nullptr && _column_path->has_nested_part();
  Branch (185:16): [True: 2.10k, False: 0]
  Branch (185:43): [True: 6, False: 2.10k]
186
2.10k
    }
187
37.7k
    int32_t parent_unique_id() const { return _parent_col_unique_id; }
188
2.27k
    void set_parent_unique_id(int32_t col_unique_id) { _parent_col_unique_id = col_unique_id; }
189
415
    void set_is_bf_column(bool is_bf_column) { _is_bf_column = is_bf_column; }
190
0
    void set_has_bitmap_index(bool has_bitmap_index) { _has_bitmap_index = has_bitmap_index; }
191
    std::shared_ptr<const vectorized::IDataType> get_vec_type() const;
192
193
    void append_sparse_column(TabletColumn column);
194
    const TabletColumn& sparse_column_at(size_t oridinal) const;
195
    const std::vector<TabletColumnPtr>& sparse_columns() const;
196
0
    size_t num_sparse_columns() const { return _num_sparse_columns; }
197
198
2
    void set_precision_frac(int32_t precision, int32_t frac) {
199
2
        _precision = precision;
200
2
        _frac = frac;
201
2
    }
202
203
355
    void set_is_decimal(bool is_decimal) { _is_decimal = is_decimal; }
204
354
    bool is_decimal() const { return _is_decimal; }
205
1.73k
    PatternTypePB pattern_type() const { return _pattern_type; }
206
207
607
    Status check_valid() const {
208
607
        if (type() != FieldType::OLAP_FIELD_TYPE_ARRAY &&
  Branch (208:13): [True: 607, False: 0]
209
607
            type() != FieldType::OLAP_FIELD_TYPE_STRUCT &&
  Branch (209:13): [True: 607, False: 0]
210
607
            type() != FieldType::OLAP_FIELD_TYPE_MAP) {
  Branch (210:13): [True: 607, False: 0]
211
607
            return Status::OK();
212
607
        }
213
0
        if (is_bf_column()) {
  Branch (213:13): [True: 0, False: 0]
214
0
            return Status::NotSupported("Do not support bloom filter index, type={}",
215
0
                                        get_string_by_field_type(type()));
216
0
        }
217
0
        if (has_bitmap_index()) {
  Branch (217:13): [True: 0, False: 0]
218
0
            return Status::NotSupported("Do not support bitmap index, type={}",
219
0
                                        get_string_by_field_type(type()));
220
0
        }
221
0
        return Status::OK();
222
0
    }
223
224
55
    void set_variant_max_subcolumns_count(int32_t variant_max_subcolumns_count) {
225
55
        _variant_max_subcolumns_count = variant_max_subcolumns_count;
226
55
    }
227
4.44k
    int32_t variant_max_subcolumns_count() const { return _variant_max_subcolumns_count; }
228
229
13
    void set_variant_enable_typed_paths_to_sparse(bool variant_enable_typed_paths_to_sparse) {
230
13
        _variant_enable_typed_paths_to_sparse = variant_enable_typed_paths_to_sparse;
231
13
    }
232
233
810
    bool variant_enable_typed_paths_to_sparse() const {
234
810
        return _variant_enable_typed_paths_to_sparse;
235
810
    }
236
237
private:
238
    int32_t _unique_id = -1;
239
    std::string _col_name;
240
    std::string _col_name_lower_case;
241
    // the field _type will change from TPrimitiveType
242
    // to string by 'EnumToString(TPrimitiveType, tcolumn.column_type.type, data_type);' (reference: TabletMeta::init_column_from_tcolumn)
243
    // to FieldType by 'TabletColumn::get_field_type_by_string' (reference: TabletColumn::init_from_pb).
244
    // And the _type in columnPB is string and it changed from FieldType by 'get_string_by_field_type' (reference: TabletColumn::to_schema_pb).
245
    FieldType _type;
246
    bool _is_key = false;
247
    FieldAggregationMethod _aggregation;
248
    std::string _aggregation_name;
249
    bool _is_nullable = false;
250
    bool _is_auto_increment = false;
251
    bool _is_on_update_current_timestamp {false};
252
253
    bool _has_default_value = false;
254
    std::string _default_value;
255
256
    bool _is_decimal = false;
257
    int32_t _precision = -1;
258
    int32_t _frac = -1;
259
260
    int32_t _length = -1;
261
    int32_t _index_length = -1;
262
263
    bool _is_bf_column = false;
264
265
    bool _has_bitmap_index = false;
266
    bool _visible = true;
267
268
    std::vector<TabletColumnPtr> _sub_columns;
269
    uint32_t _sub_column_count = 0;
270
271
    bool _result_is_nullable = false;
272
    int _be_exec_version = -1;
273
274
    // The extracted sub-columns from "variant" contain the following information:
275
    int32_t _parent_col_unique_id = -1;     // "variant" -> col_unique_id
276
    vectorized::PathInDataPtr _column_path; // the path of the sub-columns themselves
277
278
    // Record information about columns merged into a sparse column within a variant
279
    // `{"id": 100, "name" : "jack", "point" : 3.9}`
280
    // If the information mentioned above is inserted into the variant column,
281
    // 'id' and 'name' are correctly extracted, while 'point' is merged into the sparse column due to its sparsity.
282
    // The path_info and type of 'point' will be recorded using the TabletColumn.
283
    // Use shared_ptr for reuse and reducing column memory usage
284
    std::vector<TabletColumnPtr> _sparse_cols;
285
    size_t _num_sparse_columns = 0;
286
    int32_t _variant_max_subcolumns_count = 0;
287
    PatternTypePB _pattern_type = PatternTypePB::MATCH_NAME_GLOB;
288
    bool _variant_enable_typed_paths_to_sparse = false;
289
};
290
291
bool operator==(const TabletColumn& a, const TabletColumn& b);
292
bool operator!=(const TabletColumn& a, const TabletColumn& b);
293
294
class TabletIndex : public MetadataAdder<TabletIndex> {
295
public:
296
7.38k
    TabletIndex() = default;
297
    void init_from_thrift(const TOlapTableIndex& index, const TabletSchema& tablet_schema);
298
    void init_from_thrift(const TOlapTableIndex& index, const std::vector<int32_t>& column_uids);
299
    void init_from_pb(const TabletIndexPB& index);
300
    void to_schema_pb(TabletIndexPB* index) const;
301
302
20.6k
    int64_t index_id() const { return _index_id; }
303
9
    const std::string& index_name() const { return _index_name; }
304
9.40k
    IndexType index_type() const { return _index_type; }
305
9.30k
    const vector<int32_t>& col_unique_ids() const { return _col_unique_ids; }
306
26.5k
    const std::map<string, string>& properties() const { return _properties; }
307
0
    int32_t get_gram_size() const {
308
0
        if (_properties.contains("gram_size")) {
  Branch (308:13): [True: 0, False: 0]
309
0
            return std::stoi(_properties.at("gram_size"));
310
0
        }
311
312
0
        return 0;
313
0
    }
314
0
    int32_t get_gram_bf_size() const {
315
0
        if (_properties.contains("bf_size")) {
  Branch (315:13): [True: 0, False: 0]
316
0
            return std::stoi(_properties.at("bf_size"));
317
0
        }
318
319
0
        return 0;
320
0
    }
321
322
15.6k
    const std::string& get_index_suffix() const { return _escaped_index_suffix_path; }
323
324
    void set_escaped_escaped_index_suffix_path(const std::string& name);
325
326
0
    bool is_inverted_index() const { return _index_type == IndexType::INVERTED; }
327
328
204
    void remove_parser_and_analyzer() { _properties.erase(INVERTED_INDEX_PARSER_KEY); }
329
330
7.40k
    std::string field_pattern() const {
331
7.40k
        if (_properties.contains("field_pattern")) {
  Branch (331:13): [True: 22, False: 7.38k]
332
22
            return _properties.at("field_pattern");
333
22
        }
334
7.38k
        return "";
335
7.40k
    }
336
337
1
    bool is_same_except_id(const TabletIndex* other) const {
338
1
        return _escaped_index_suffix_path == other->_escaped_index_suffix_path &&
  Branch (338:16): [True: 1, False: 0]
339
1
               _index_name == other->_index_name && _index_type == other->_index_type &&
  Branch (339:16): [True: 1, False: 0]
  Branch (339:53): [True: 1, False: 0]
340
1
               _col_unique_ids == other->_col_unique_ids && _properties == other->_properties;
  Branch (340:16): [True: 1, False: 0]
  Branch (340:61): [True: 1, False: 0]
341
1
    }
342
343
private:
344
    int64_t _index_id = -1;
345
    // Identify the different index with the same _index_id
346
    std::string _escaped_index_suffix_path;
347
    std::string _index_name;
348
    IndexType _index_type;
349
    std::vector<int32_t> _col_unique_ids;
350
    std::map<string, string> _properties;
351
352
    friend class TabletSchemaMultiIndexTest;
353
};
354
355
using TabletIndexPtr = std::shared_ptr<TabletIndex>;
356
using TabletIndexes = std::vector<std::shared_ptr<TabletIndex>>;
357
358
class TabletSchema : public MetadataAdder<TabletSchema> {
359
public:
360
    enum ColumnType { NORMAL = 0, DROPPED = 1, VARIANT = 2 };
361
    // TODO(yingchun): better to make constructor as private to avoid
362
    // manually init members incorrectly, and define a new function like
363
    // void create_from_pb(const TabletSchemaPB& schema, TabletSchema* tablet_schema).
364
    TabletSchema();
365
    virtual ~TabletSchema();
366
367
    // Init from pb
368
    // ignore_extracted_columns: ignore the extracted columns from variant column
369
    // reuse_cached_column: reuse the cached column in the schema if they are the same, to reduce memory usage
370
    void init_from_pb(const TabletSchemaPB& schema, bool ignore_extracted_columns = false,
371
                      bool reuse_cached_column = false);
372
    // Notice: Use deterministic way to serialize protobuf,
373
    // since serialize Map in protobuf may could lead to un-deterministic by default
374
    template <class PbType>
375
4.12k
    static std::string deterministic_string_serialize(const PbType& pb) {
376
4.12k
        std::string output;
377
4.12k
        google::protobuf::io::StringOutputStream string_output_stream(&output);
378
4.12k
        google::protobuf::io::CodedOutputStream output_stream(&string_output_stream);
379
4.12k
        output_stream.SetSerializationDeterministic(true);
380
4.12k
        pb.SerializeToCodedStream(&output_stream);
381
4.12k
        return output;
382
4.12k
    }
_ZN5doris12TabletSchema30deterministic_string_serializeINS_14TabletSchemaPBEEENSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEEERKT_
Line
Count
Source
375
3.71k
    static std::string deterministic_string_serialize(const PbType& pb) {
376
3.71k
        std::string output;
377
3.71k
        google::protobuf::io::StringOutputStream string_output_stream(&output);
378
3.71k
        google::protobuf::io::CodedOutputStream output_stream(&string_output_stream);
379
3.71k
        output_stream.SetSerializationDeterministic(true);
380
3.71k
        pb.SerializeToCodedStream(&output_stream);
381
3.71k
        return output;
382
3.71k
    }
_ZN5doris12TabletSchema30deterministic_string_serializeINS_8ColumnPBEEENSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEEERKT_
Line
Count
Source
375
281
    static std::string deterministic_string_serialize(const PbType& pb) {
376
281
        std::string output;
377
281
        google::protobuf::io::StringOutputStream string_output_stream(&output);
378
281
        google::protobuf::io::CodedOutputStream output_stream(&string_output_stream);
379
281
        output_stream.SetSerializationDeterministic(true);
380
281
        pb.SerializeToCodedStream(&output_stream);
381
281
        return output;
382
281
    }
_ZN5doris12TabletSchema30deterministic_string_serializeINS_13TabletIndexPBEEENSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEEERKT_
Line
Count
Source
375
122
    static std::string deterministic_string_serialize(const PbType& pb) {
376
122
        std::string output;
377
122
        google::protobuf::io::StringOutputStream string_output_stream(&output);
378
122
        google::protobuf::io::CodedOutputStream output_stream(&string_output_stream);
379
122
        output_stream.SetSerializationDeterministic(true);
380
122
        pb.SerializeToCodedStream(&output_stream);
381
122
        return output;
382
122
    }
383
    void to_schema_pb(TabletSchemaPB* tablet_meta_pb) const;
384
    void append_column(TabletColumn column, ColumnType col_type = ColumnType::NORMAL);
385
    void append_index(TabletIndex&& index);
386
    void update_index(const TabletColumn& column, const IndexType& index_type,
387
                      std::vector<TabletIndex>&& indexs);
388
    void remove_index(int64_t index_id);
389
    void clear_index();
390
    // Must make sure the row column is always the last column
391
    void add_row_column();
392
    void copy_from(const TabletSchema& tablet_schema);
393
    // lightweight copy, take care of lifecycle of TabletColumn
394
    void shawdow_copy_without_columns(const TabletSchema& tablet_schema);
395
    void update_index_info_from(const TabletSchema& tablet_schema);
396
    std::string to_key() const;
397
    // get_metadata_size is only the memory of the TabletSchema itself, not include child objects.
398
58
    int64_t mem_size() const { return get_metadata_size(); }
399
    size_t row_size() const;
400
    int32_t field_index(const std::string& field_name) const;
401
    int32_t field_index(const vectorized::PathInData& path) const;
402
    int32_t field_index(int32_t col_unique_id) const;
403
    const TabletColumn& column(size_t ordinal) const;
404
    Result<const TabletColumn*> column(const std::string& field_name) const;
405
    Status have_column(const std::string& field_name) const;
406
    bool exist_column(const std::string& field_name) const;
407
    bool has_column_unique_id(int32_t col_unique_id) const;
408
    const TabletColumn& column_by_uid(int32_t col_unique_id) const;
409
    TabletColumn& mutable_column_by_uid(int32_t col_unique_id);
410
    TabletColumn& mutable_column(size_t ordinal);
411
    void replace_column(size_t pos, TabletColumn new_col);
412
    const std::vector<TabletColumnPtr>& columns() const;
413
594k
    size_t num_columns() const { return _num_columns; }
414
1.08M
    size_t num_key_columns() const { return _num_key_columns; }
415
135k
    const std::vector<uint32_t>& cluster_key_idxes() const { return _cluster_key_idxes; }
416
0
    size_t num_null_columns() const { return _num_null_columns; }
417
5.37k
    size_t num_short_key_columns() const { return _num_short_key_columns; }
418
0
    size_t num_rows_per_row_block() const { return _num_rows_per_row_block; }
419
1.36k
    size_t num_variant_columns() const { return _num_variant_columns; };
420
7.11M
    KeysType keys_type() const { return _keys_type; }
421
5.73k
    SortType sort_type() const { return _sort_type; }
422
0
    size_t sort_col_num() const { return _sort_col_num; }
423
0
    CompressKind compress_kind() const { return _compress_kind; }
424
0
    size_t next_column_unique_id() const { return _next_column_unique_id; }
425
4
    bool has_bf_fpp() const { return _has_bf_fpp; }
426
0
    double bloom_filter_fpp() const { return _bf_fpp; }
427
20.3k
    bool is_in_memory() const { return _is_in_memory; }
428
0
    void set_is_in_memory(bool is_in_memory) { _is_in_memory = is_in_memory; }
429
2
    void set_disable_auto_compaction(bool disable_auto_compaction) {
430
2
        _disable_auto_compaction = disable_auto_compaction;
431
2
    }
432
288
    bool disable_auto_compaction() const { return _disable_auto_compaction; }
433
0
    void set_variant_enable_flatten_nested(bool flatten_nested) {
434
0
        _variant_enable_flatten_nested = flatten_nested;
435
0
    }
436
0
    bool variant_flatten_nested() const { return _variant_enable_flatten_nested; }
437
0
    void set_enable_single_replica_compaction(bool enable_single_replica_compaction) {
438
0
        _enable_single_replica_compaction = enable_single_replica_compaction;
439
0
    }
440
350
    bool enable_single_replica_compaction() const { return _enable_single_replica_compaction; }
441
    // indicate if full row store column(all the columns encodes as row) exists
442
0
    bool has_row_store_for_all_columns() const {
443
0
        return _store_row_column && row_columns_uids().empty();
  Branch (443:16): [True: 0, False: 0]
  Branch (443:37): [True: 0, False: 0]
444
0
    }
445
0
    void set_skip_write_index_on_load(bool skip) { _skip_write_index_on_load = skip; }
446
123
    bool skip_write_index_on_load() const { return _skip_write_index_on_load; }
447
4.15k
    int32_t delete_sign_idx() const { return _delete_sign_idx; }
448
0
    void set_delete_sign_idx(int32_t delete_sign_idx) { _delete_sign_idx = delete_sign_idx; }
449
142k
    bool has_sequence_col() const { return _sequence_col_idx != -1; }
450
64.7k
    int32_t sequence_col_idx() const { return _sequence_col_idx; }
451
0
    void set_version_col_idx(int32_t version_col_idx) { _version_col_idx = version_col_idx; }
452
0
    int32_t version_col_idx() const { return _version_col_idx; }
453
0
    bool has_skip_bitmap_col() const { return _skip_bitmap_col_idx != -1; }
454
0
    int32_t skip_bitmap_col_idx() const { return _skip_bitmap_col_idx; }
455
5.25k
    segment_v2::CompressionTypePB compression_type() const { return _compression_type; }
456
0
    void set_row_store_page_size(long page_size) { _row_store_page_size = page_size; }
457
0
    long row_store_page_size() const { return _row_store_page_size; }
458
0
    void set_storage_page_size(long storage_page_size) { _storage_page_size = storage_page_size; }
459
13.8k
    long storage_page_size() const { return _storage_page_size; }
460
461
    // Currently if variant_max_subcolumns_count = 0, then we need to record variant extended schema
462
    // for compability reason
463
2.35k
    bool need_record_variant_extended_schema() const { return variant_max_subcolumns_count() == 0; }
464
465
2.35k
    int32_t variant_max_subcolumns_count() const {
466
19.8k
        for (const auto& col : _cols) {
  Branch (466:30): [True: 19.8k, False: 2.21k]
467
19.8k
            if (col->is_variant_type()) {
  Branch (467:17): [True: 136, False: 19.7k]
468
136
                return col->variant_max_subcolumns_count();
469
136
            }
470
19.8k
        }
471
2.21k
        return 0;
472
2.35k
    }
473
0
    void set_storage_dict_page_size(long storage_dict_page_size) {
474
0
        _storage_dict_page_size = storage_dict_page_size;
475
0
    }
476
13.8k
    long storage_dict_page_size() const { return _storage_dict_page_size; }
477
478
110
    const std::vector<const TabletIndex*> inverted_indexes() const {
479
110
        std::vector<const TabletIndex*> inverted_indexes;
480
1.24k
        for (const auto& index : _indexes) {
  Branch (480:32): [True: 1.24k, False: 110]
481
1.24k
            if (index->index_type() == IndexType::INVERTED) {
  Branch (481:17): [True: 1.24k, False: 1]
482
1.24k
                inverted_indexes.emplace_back(index.get());
483
1.24k
            }
484
1.24k
        }
485
110
        return inverted_indexes;
486
110
    }
487
11.4k
    bool has_inverted_index() const {
488
11.4k
        for (const auto& index : _indexes) {
  Branch (488:32): [True: 730, False: 10.7k]
489
730
            DBUG_EXECUTE_IF("tablet_schema::has_inverted_index", {
Line
Count
Source
37
730
    if (UNLIKELY(config::enable_debug_points)) {                              \
Line
Count
Source
36
730
#define UNLIKELY(expr) __builtin_expect(!!(expr), 0)
  Branch (36:24): [True: 3, False: 727]
38
3
        auto dp = DebugPoints::instance()->get_debug_point(debug_point_name); \
39
3
        if (dp) {                                                             \
  Branch (39:13): [True: 0, False: 3]
40
0
            [[maybe_unused]] auto DP_NAME = debug_point_name;                 \
41
0
            { code; }                                                         \
  Branch (41:15): [True: 0, False: 0]
42
0
        }                                                                     \
43
3
    }
490
730
                if (index->col_unique_ids().empty()) {
491
730
                    throw Exception(Status::InternalError("col unique ids cannot be empty"));
492
730
                }
493
730
            });
494
495
730
            if (index->index_type() == IndexType::INVERTED) {
  Branch (495:17): [True: 730, False: 0]
496
                //if index_id == -1, ignore it.
497
730
                if (!index->col_unique_ids().empty() && index->col_unique_ids()[0] >= 0) {
  Branch (497:21): [True: 730, False: 0]
  Branch (497:57): [True: 730, False: 0]
498
730
                    return true;
499
730
                }
500
730
            }
501
730
        }
502
10.7k
        return false;
503
11.4k
    }
504
    bool has_inverted_index_with_index_id(int64_t index_id) const;
505
    // Check whether this column supports inverted index
506
    // Some columns (Float, Double, JSONB ...) from the variant do not support index, but they are listed in TabletIndex.
507
    std::vector<const TabletIndex*> inverted_indexs(const TabletColumn& col) const;
508
509
    // Regardless of whether this column supports inverted index
510
    // TabletIndex information will be returned as long as it exists.
511
    std::vector<const TabletIndex*> inverted_indexs(int32_t col_unique_id,
512
                                                    const std::string& suffix_path = "") const;
513
    std::vector<TabletIndexPtr> inverted_index_by_field_pattern(
514
            int32_t col_unique_id, const std::string& field_pattern) const;
515
    bool has_ngram_bf_index(int32_t col_unique_id) const;
516
    const TabletIndex* get_ngram_bf_index(int32_t col_unique_id) const;
517
    void update_indexes_from_thrift(const std::vector<doris::TOlapTableIndex>& indexes);
518
    // If schema version is not set, it should be -1
519
1.59k
    int32_t schema_version() const {
        return _schema_version;
    }
520
    void clear_columns();
521
    vectorized::Block create_block(
522
            const std::vector<uint32_t>& return_columns,
523
            const std::unordered_set<uint32_t>* tablet_columns_need_convert_null = nullptr) const;
524
    vectorized::Block create_block(bool ignore_dropped_col = true) const;
525
0
    void set_schema_version(int32_t version) { _schema_version = version; }
526
0
    void set_auto_increment_column(const std::string& auto_increment_column) {
527
0
        _auto_increment_column = auto_increment_column;
528
0
    }
529
0
    // Returns a copy of _auto_increment_column.
    std::string auto_increment_column() const {
        return _auto_increment_column;
    }
530
531
28
    void set_table_id(int64_t table_id) { _table_id = table_id; }
532
472
    // Returns the current value of _table_id.
    int64_t table_id() const {
        return _table_id;
    }
533
28
    void set_db_id(int64_t db_id) { _db_id = db_id; }
534
0
    // Returns the current value of _db_id.
    int64_t db_id() const {
        return _db_id;
    }
535
    void build_current_tablet_schema(int64_t index_id, int32_t version,
536
                                     const OlapTableIndexSchema* index,
537
                                     const TabletSchema& out_tablet_schema);
538
539
    // Merge columns that do not exist in the current schema; these columns were dropped from the current schema
540
    // but they are useful in some cases. For example,
541
    // 1. origin schema is  ColA, ColB
542
    // 2. insert values     1, 2
543
    // 3. delete where ColB = 2
544
    // 4. drop ColB
545
    // 5. insert values  3
546
    // 6. add column ColB; although it is named ColB, it is different from the previous ColB, so we call the new column ColB'
547
    // 7. insert value  4, 5
548
    // Then the read schema should be ColA, ColB, ColB' because the delete predicate needs ColB to remove the related data.
549
    // Because they have the same name, the dropped column should not be added to the map by name, only by unique id.
550
    void merge_dropped_columns(const TabletSchema& src_schema);
551
552
    bool is_dropped_column(const TabletColumn& col) const;
553
554
    // copy extracted columns from src_schema
555
    void copy_extracted_columns(const TabletSchema& src_schema);
556
557
    // only reserve extracted columns
558
    void reserve_extracted_columns();
559
560
0
    // Renders every entry of _field_name_to_index as "name(unique_id)" and joins
    // them with ", " inside square brackets, e.g. "[a(0), b(1)]".
    // The unique id is read from the column stored in _cols at the mapped position.
    // Entries appear in the iteration order of _field_name_to_index.
    string get_all_field_names() const {
        string str = "[";
        // Bind by const reference so no map entry is copied per iteration.
        for (const auto& [field_name, col_pos] : _field_name_to_index) {
            if (str.size() > 1) {
                str += ", ";
            }
            // Append piecewise to avoid building intermediate temporaries.
            str += field_name.to_string();
            str += "(";
            str += std::to_string(_cols[col_pos]->unique_id());
            str += ")";
        }
        str += "]";
        return str;
    }
571
572
    // Dump [(name, type, is_nullable), ...]
573
1
    // Renders _cols as "[(name, type, is_nullable:true|false), ...]".
    // The type text comes from TabletColumn::get_string_by_field_type(); entries
    // are separated by ", " and follow the order of _cols.
    string dump_structure() const {
        string str = "[";
        // Bind by const reference: the elements are only read, so copying each
        // column pointer per iteration is unnecessary.
        for (const auto& col : _cols) {
            if (str.size() > 1) {
                str += ", ";
            }
            str += "(";
            str += col->name();
            str += ", ";
            str += TabletColumn::get_string_by_field_type(col->type());
            str += ", ";
            str += "is_nullable:";
            str += (col->is_nullable() ? "true" : "false");
            str += ")";
        }
        str += "]";
        return str;
    }
591
592
1
    // Renders _cols as "[(<ColumnPB short debug string>), ...]".
    // Each column is serialized into a ColumnPB with to_schema_pb() and printed
    // with ShortDebugString(); entries are separated by ", " in _cols order.
    string dump_full_schema() const {
        string str = "[";
        // Bind by const reference: the elements are only read, so copying each
        // column pointer per iteration is unnecessary.
        for (const auto& col : _cols) {
            if (str.size() > 1) {
                str += ", ";
            }
            ColumnPB col_pb;
            col->to_schema_pb(&col_pb);
            str += "(";
            str += col_pb.ShortDebugString();
            str += ")";
        }
        str += "]";
        return str;
    }
607
608
    vectorized::Block create_block_by_cids(const std::vector<uint32_t>& cids) const;
609
610
    std::shared_ptr<TabletSchema> copy_without_variant_extracted_columns();
611
9.68k
    // Returns the current value of _inverted_index_storage_format.
    InvertedIndexStorageFormatPB get_inverted_index_storage_format() const {
        const InvertedIndexStorageFormatPB format = _inverted_index_storage_format;
        return format;
    }
614
615
    void update_tablet_columns(const TabletSchema& tablet_schema,
616
                               const std::vector<TColumn>& t_columns);
617
618
0
    // Returns a const reference to _row_store_column_unique_ids (no copy is made).
    const std::vector<int32_t>& row_columns_uids() const {
        return _row_store_column_unique_ids;
    }
619
620
    int64_t get_metadata_size() const override;
621
622
    using PathSet = phmap::flat_hash_set<std::string>;
623
624
    struct SubColumnInfo {
625
        TabletColumn column;
626
        TabletIndexes indexes;
627
    };
628
629
    // all paths in path_set_info are relative to the parent column
630
    struct PathsSetInfo {
631
        std::unordered_map<std::string, SubColumnInfo> typed_path_set;    // typed columns
632
        std::unordered_map<std::string, TabletIndexes> subcolumn_indexes; // subcolumns indexes
633
        PathSet sub_path_set;                                             // extracted columns
634
        PathSet sparse_path_set;                                          // sparse columns
635
    };
636
637
51
    // Looks up the PathsSetInfo stored under `unique_id` in _path_set_info_map.
    // The lookup uses at(), so a missing key throws std::out_of_range instead
    // of inserting a default entry.
    const PathsSetInfo& path_set_info(int32_t unique_id) const {
        const PathsSetInfo& found = _path_set_info_map.at(unique_id);
        return found;
    }
640
641
7
    void set_path_set_info(std::unordered_map<int32_t, PathsSetInfo>&& path_set_info_map) {
642
7
        _path_set_info_map = std::move(path_set_info_map);
643
7
    }
644
645
0
    // Removes every entry from _path_set_info_map.
    void clear_path_set_info() {
        _path_set_info_map.clear();
    }
646
647
private:
648
    friend bool operator==(const TabletSchema& a, const TabletSchema& b);
649
    friend bool operator!=(const TabletSchema& a, const TabletSchema& b);
650
    TabletSchema(const TabletSchema&) = default;
651
652
    KeysType _keys_type = DUP_KEYS;
653
    SortType _sort_type = SortType::LEXICAL;
654
    size_t _sort_col_num = 0;
655
    std::vector<TabletColumnPtr> _cols;
656
657
    std::vector<TabletIndexPtr> _indexes;
658
    std::unordered_map<StringRef, int32_t, StringRefHash> _field_name_to_index;
659
    std::unordered_map<int32_t, int32_t> _field_id_to_index;
660
    std::unordered_map<vectorized::PathInDataRef, int32_t, vectorized::PathInDataRef::Hash>
661
            _field_path_to_index;
662
663
    // index_type/col_unique_id/suffix -> idxs in _indexes
664
    using IndexKey = std::tuple<IndexType, int32_t, std::string>;
665
    struct IndexKeyHash {
666
52.8k
        size_t operator()(const IndexKey& t) const {
667
52.8k
            std::size_t seed = 0;
668
52.8k
            seed = doris::HashUtil::hash((const char*)&std::get<0>(t), sizeof(std::get<0>(t)),
669
52.8k
                                         seed);
670
52.8k
            seed = doris::HashUtil::hash((const char*)&std::get<1>(t), sizeof(std::get<1>(t)),
671
52.8k
                                         seed);
672
52.8k
            seed = doris::HashUtil::hash((const char*)std::get<2>(t).c_str(), std::get<2>(t).size(),
673
52.8k
                                         seed);
674
52.8k
            return seed;
675
52.8k
        }
676
    };
677
    std::unordered_map<IndexKey, std::vector<int32_t>, IndexKeyHash> _col_id_suffix_to_index;
678
679
    size_t _num_columns = 0;
680
    size_t _num_variant_columns = 0;
681
    size_t _num_key_columns = 0;
682
    std::vector<uint32_t> _cluster_key_idxes;
683
    size_t _num_null_columns = 0;
684
    size_t _num_short_key_columns = 0;
685
    size_t _num_rows_per_row_block = 0;
686
    CompressKind _compress_kind = COMPRESS_NONE;
687
    segment_v2::CompressionTypePB _compression_type = segment_v2::CompressionTypePB::LZ4F;
688
    long _row_store_page_size = segment_v2::ROW_STORE_PAGE_SIZE_DEFAULT_VALUE;
689
    long _storage_page_size = segment_v2::STORAGE_PAGE_SIZE_DEFAULT_VALUE;
690
    long _storage_dict_page_size = segment_v2::STORAGE_DICT_PAGE_SIZE_DEFAULT_VALUE;
691
    size_t _next_column_unique_id = 0;
692
    std::string _auto_increment_column;
693
694
    bool _has_bf_fpp = false;
695
    double _bf_fpp = 0;
696
    bool _is_in_memory = false;
697
    int32_t _delete_sign_idx = -1;
698
    int32_t _sequence_col_idx = -1;
699
    int32_t _version_col_idx = -1;
700
    int32_t _skip_bitmap_col_idx = -1;
701
    int32_t _schema_version = -1;
702
    int64_t _table_id = -1;
703
    int64_t _db_id = -1;
704
    bool _disable_auto_compaction = false;
705
    bool _enable_single_replica_compaction = false;
706
    bool _store_row_column = false;
707
    bool _skip_write_index_on_load = false;
708
    InvertedIndexStorageFormatPB _inverted_index_storage_format = InvertedIndexStorageFormatPB::V1;
709
710
    // Contains column ids of which columns should be encoded into row store.
711
    // ATTN: For compatibility reasons, empty cids means all columns of the tablet schema are encoded into the row column
712
    std::vector<int32_t> _row_store_column_unique_ids;
713
    bool _variant_enable_flatten_nested = false;
714
    int64_t _vl_field_mem_size {0}; // variable length field
715
    // key: unique_id of column
716
    // value: extracted path set and sparse path set
717
    std::unordered_map<int32_t, PathsSetInfo> _path_set_info_map;
718
719
    // key: field_pattern
720
    // value: index
721
    using PatternToIndex = std::unordered_map<std::string, std::vector<TabletIndexPtr>>;
722
    std::unordered_map<int32_t, PatternToIndex> _index_by_unique_id_with_pattern;
723
};
724
725
bool operator==(const TabletSchema& a, const TabletSchema& b);
726
bool operator!=(const TabletSchema& a, const TabletSchema& b);
727
728
using TabletSchemaSPtr = std::shared_ptr<TabletSchema>;
729
730
} // namespace doris