Coverage Report

Created: 2026-05-09 10:24

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
be/src/storage/tablet/tablet_meta.cpp
Line
Count
Source
1
// Licensed to the Apache Software Foundation (ASF) under one
2
// or more contributor license agreements.  See the NOTICE file
3
// distributed with this work for additional information
4
// regarding copyright ownership.  The ASF licenses this file
5
// to you under the Apache License, Version 2.0 (the
6
// "License"); you may not use this file except in compliance
7
// with the License.  You may obtain a copy of the License at
8
//
9
//   http://www.apache.org/licenses/LICENSE-2.0
10
//
11
// Unless required by applicable law or agreed to in writing,
12
// software distributed under the License is distributed on an
13
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14
// KIND, either express or implied.  See the License for the
15
// specific language governing permissions and limitations
16
// under the License.
17
18
#include "storage/tablet/tablet_meta.h"
19
20
#include <bvar/bvar.h>
21
#include <gen_cpp/Descriptors_types.h>
22
#include <gen_cpp/FrontendService_types.h>
23
#include <gen_cpp/Types_types.h>
24
#include <gen_cpp/olap_common.pb.h>
25
#include <gen_cpp/olap_file.pb.h>
26
#include <gen_cpp/segment_v2.pb.h>
27
#include <gen_cpp/types.pb.h>
28
#include <json2pb/pb_to_json.h>
29
#include <time.h>
30
31
#include <cstdint>
32
#include <memory>
33
#include <random>
34
#include <set>
35
#include <utility>
36
37
#include "cloud/cloud_meta_mgr.h"
38
#include "cloud/cloud_storage_engine.h"
39
#include "cloud/config.h"
40
#include "common/config.h"
41
#include "io/fs/file_writer.h"
42
#include "io/fs/local_file_system.h"
43
#include "storage/data_dir.h"
44
#include "storage/file_header.h"
45
#include "storage/olap_common.h"
46
#include "storage/olap_define.h"
47
#include "storage/rowset/rowset.h"
48
#include "storage/rowset/rowset_meta_manager.h"
49
#include "storage/tablet/tablet_fwd.h"
50
#include "storage/tablet/tablet_meta_manager.h"
51
#include "storage/tablet/tablet_schema_cache.h"
52
#include "storage/utils.h"
53
#include "util/debug_points.h"
54
#include "util/lru_cache.h"
55
#include "util/mem_info.h"
56
#include "util/parse_util.h"
57
#include "util/string_util.h"
58
#include "util/time.h"
59
#include "util/uid_util.h"
60
61
using std::string;
62
using std::unordered_map;
63
using std::vector;
64
65
namespace doris {
66
using namespace ErrorCode;
67
68
// bvar metrics named after "contains_agg_with_cache_if_eligible": three cumulative
// counters (total / partial hit / full hit) and, for each of them, a bvar::Window
// exposed under the same name with a "_1m" suffix.
// The Window objects take the address of the matching Adder, so each Adder must be
// defined before the Window that wraps it.
// NOTE(review): the window argument 60 together with the "_1m" suffix suggests a
// 60-second window -- confirm against the bvar::Window documentation.
// These counters are not referenced in the part of the file visible here.
bvar::Adder<uint64_t> g_contains_agg_with_cache_if_eligible_total(
69
        "g_contains_agg_with_cache_if_eligible_total");
70
bvar::Adder<uint64_t> g_contains_agg_with_cache_if_eligible_partial_hit(
71
        "g_contains_agg_with_cache_if_eligible_partial_hit");
72
bvar::Adder<uint64_t> g_contains_agg_with_cache_if_eligible_full_hit(
73
        "g_contains_agg_with_cache_if_eligible_full_hit");
74
bvar::Window<bvar::Adder<uint64_t>> g_contains_agg_with_cache_if_eligible_total_minute(
75
        "g_contains_agg_with_cache_if_eligible_total_1m",
76
        &g_contains_agg_with_cache_if_eligible_total, 60);
77
bvar::Window<bvar::Adder<uint64_t>> g_contains_agg_with_cache_if_eligible_partial_hit_minute(
78
        "g_contains_agg_with_cache_if_eligible_partial_hit_1m",
79
        &g_contains_agg_with_cache_if_eligible_partial_hit, 60);
80
bvar::Window<bvar::Adder<uint64_t>> g_contains_agg_with_cache_if_eligible_full_hit_minute(
81
        "g_contains_agg_with_cache_if_eligible_full_hit_1m",
82
        &g_contains_agg_with_cache_if_eligible_full_hit, 60);
83
84
// Builds a shared TabletMeta from a thrift create-tablet request.
//
// Optional request fields (guarded by request.__isset) fall back to these values:
//   binlog_config                             -> empty optional
//   tablet_type                               -> TTabletType::TABLET_TYPE_DISK
//   compression_type                          -> TCompressionType::LZ4F
//   storage_policy_id                         -> -1
//   enable_unique_key_merge_on_write          -> false
//   storage_format                            -> TStorageFormat::V2
//   vertical_compaction_num_columns_per_group -> 5
//   row_binlog_schema                         -> nullptr
// All other request fields are forwarded without an __isset check.
TabletMetaSharedPtr TabletMeta::create(
        const TCreateTabletReq& request, const TabletUid& tablet_uid, uint64_t shard_id,
        uint32_t next_unique_id,
        const unordered_map<uint32_t, uint32_t>& col_ordinal_to_unique_id) {
    const auto& isset = request.__isset;

    std::optional<TBinlogConfig> binlog_config;
    if (isset.binlog_config) {
        binlog_config.emplace(request.binlog_config);
    }

    TInvertedIndexFileStorageFormat::type index_file_format =
            request.inverted_index_file_storage_format;

    // We will discard this format. Don't make any further changes here.
    // When the legacy field is set to V1 or V2 it overrides the file storage format;
    // any other legacy value leaves the file storage format untouched.
    if (isset.inverted_index_storage_format) {
        const auto legacy_format = request.inverted_index_storage_format;
        if (legacy_format == TInvertedIndexStorageFormat::V1) {
            index_file_format = TInvertedIndexFileStorageFormat::V1;
        } else if (legacy_format == TInvertedIndexStorageFormat::V2) {
            index_file_format = TInvertedIndexFileStorageFormat::V2;
        }
    }

    // Decide storage format for this tablet. DEFAULT / not-set fall back to V2 on BE side.
    const TStorageFormat::type storage_format =
            isset.storage_format ? request.storage_format : TStorageFormat::V2;

    // Resolve the remaining optional fields up front so the constructor call stays readable.
    const auto tablet_type =
            isset.tablet_type ? request.tablet_type : TTabletType::TABLET_TYPE_DISK;
    const auto compression_type =
            isset.compression_type ? request.compression_type : TCompressionType::LZ4F;
    const auto storage_policy_id = isset.storage_policy_id ? request.storage_policy_id : -1;
    const auto merge_on_write = isset.enable_unique_key_merge_on_write
                                        ? request.enable_unique_key_merge_on_write
                                        : false;
    const auto vertical_group_columns = isset.vertical_compaction_num_columns_per_group
                                                ? request.vertical_compaction_num_columns_per_group
                                                : 5;
    const auto* row_binlog_schema = isset.row_binlog_schema ? &request.row_binlog_schema : nullptr;

    return std::make_shared<TabletMeta>(
            request.table_id, request.partition_id, request.tablet_id, request.replica_id,
            request.tablet_schema.schema_hash, shard_id, request.tablet_schema, next_unique_id,
            col_ordinal_to_unique_id, tablet_uid, tablet_type, compression_type,
            storage_policy_id, merge_on_write, std::move(binlog_config),
            request.compaction_policy, request.time_series_compaction_goal_size_mbytes,
            request.time_series_compaction_file_count_threshold,
            request.time_series_compaction_time_threshold_seconds,
            request.time_series_compaction_empty_rowsets_threshold,
            request.time_series_compaction_level_threshold, index_file_format,
            request.tde_algorithm, storage_format, vertical_group_columns, row_binlog_schema);
}
133
134
359k
// Hands the schema-cache handle back to TabletSchemaCache when one is held.
TabletMeta::~TabletMeta() {
    if (!_handle) {
        return;
    }
    TabletSchemaCache::instance()->release(_handle);
}
139
140
// Default constructor: tablet uid (0, 0), a freshly allocated TabletSchema, and two
// freshly allocated DeleteBitmap objects (_delete_bitmap and _binlog_delvec), both
// constructed from _tablet_id.
// NOTE(review): _tablet_id is read inside the initializer list, so this relies on
// _tablet_id being declared before _delete_bitmap/_binlog_delvec and having a default
// member initializer in the class definition -- confirm in the header.
TabletMeta::TabletMeta()
141
590k
        : _tablet_uid(0, 0),
142
590k
          _schema(new TabletSchema),
143
590k
          _delete_bitmap(new DeleteBitmap(_tablet_id)),
144
590k
          _binlog_delvec(new DeleteBitmap(_tablet_id)) {}
145
146
// Constructs a TabletMeta from explicit creation parameters.
//
// The constructor assembles a temporary TabletMetaPB from its arguments (ids, state,
// compaction settings, the data schema, the optional row-binlog schema, the optional
// binlog config and the encryption algorithm) and then initializes this object from
// that protobuf through init_from_pb().
//
// row_binlog_schema may be nullptr; when it is not, binlog_config is expected (DCHECK)
// to be present, enabled and in ROW format.
TabletMeta::TabletMeta(int64_t table_id, int64_t partition_id, int64_t tablet_id,
                       int64_t replica_id, int32_t schema_hash, int32_t shard_id,
                       const TTabletSchema& tablet_schema, uint32_t next_unique_id,
                       const std::unordered_map<uint32_t, uint32_t>& col_ordinal_to_unique_id,
                       TabletUid tablet_uid, TTabletType::type tabletType,
                       TCompressionType::type compression_type, int64_t storage_policy_id,
                       bool enable_unique_key_merge_on_write,
                       std::optional<TBinlogConfig> binlog_config, std::string compaction_policy,
                       int64_t time_series_compaction_goal_size_mbytes,
                       int64_t time_series_compaction_file_count_threshold,
                       int64_t time_series_compaction_time_threshold_seconds,
                       int64_t time_series_compaction_empty_rowsets_threshold,
                       int64_t time_series_compaction_level_threshold,
                       TInvertedIndexFileStorageFormat::type inverted_index_file_storage_format,
                       TEncryptionAlgorithm::type tde_algorithm,
                       TStorageFormat::type storage_format,
                       int32_t vertical_compaction_num_columns_per_group,
                       const TTabletSchema* row_binlog_schema)
        : _tablet_uid(0, 0),
          _schema(new TabletSchema),
          _delete_bitmap(new DeleteBitmap(tablet_id)),
          _binlog_delvec(new DeleteBitmap(tablet_id)),
          _storage_format(storage_format) {
    TabletMetaPB meta_pb;

    // Identity and basic state.
    meta_pb.set_table_id(table_id);
    meta_pb.set_partition_id(partition_id);
    meta_pb.set_tablet_id(tablet_id);
    meta_pb.set_replica_id(replica_id);
    meta_pb.set_schema_hash(schema_hash);
    meta_pb.set_shard_id(shard_id);
    // Persist the creation time, but it is not used
    meta_pb.set_creation_time(time(nullptr));
    meta_pb.set_cumulative_layer_point(-1);
    meta_pb.set_tablet_state(PB_RUNNING);
    *(meta_pb.mutable_tablet_uid()) = tablet_uid.to_proto();
    // Anything that is not TABLET_TYPE_DISK is stored as TABLET_TYPE_MEMORY.
    meta_pb.set_tablet_type(tabletType == TTabletType::TABLET_TYPE_DISK
                                    ? TabletTypePB::TABLET_TYPE_DISK
                                    : TabletTypePB::TABLET_TYPE_MEMORY);
    meta_pb.set_enable_unique_key_merge_on_write(enable_unique_key_merge_on_write);
    meta_pb.set_storage_policy_id(storage_policy_id);

    // Compaction settings.
    meta_pb.set_compaction_policy(compaction_policy);
    meta_pb.set_time_series_compaction_goal_size_mbytes(time_series_compaction_goal_size_mbytes);
    meta_pb.set_time_series_compaction_file_count_threshold(
            time_series_compaction_file_count_threshold);
    meta_pb.set_time_series_compaction_time_threshold_seconds(
            time_series_compaction_time_threshold_seconds);
    meta_pb.set_time_series_compaction_empty_rowsets_threshold(
            time_series_compaction_empty_rowsets_threshold);
    meta_pb.set_time_series_compaction_level_threshold(time_series_compaction_level_threshold);
    meta_pb.set_vertical_compaction_num_columns_per_group(
            vertical_compaction_num_columns_per_group);

    // Data schema.
    SchemaCreateOptions data_schema_options = {
            .col_ordinal_to_unique_id = col_ordinal_to_unique_id,
            .compression_type = compression_type,
            .inverted_index_file_storage_format = inverted_index_file_storage_format,
            .next_unique_id = next_unique_id};
    TabletSchemaPB* data_schema_pb = meta_pb.mutable_schema();
    init_schema_from_thrift(tablet_schema, data_schema_options, data_schema_pb);

    meta_pb.set_in_restore_mode(false);

    // Optional row-binlog schema.
    TabletSchemaPB* row_binlog_schema_pb = nullptr;
    if (row_binlog_schema != nullptr) {
        meta_pb.set_row_binlog_schema_hash(row_binlog_schema->schema_hash);
        DCHECK(binlog_config.has_value());
        DCHECK(binlog_config->enable && binlog_config->binlog_format == TBinlogFormat::ROW);

        // Derive ordinal -> unique-id for the row-binlog columns: a column's own
        // non-negative col_unique_id wins, otherwise its ordinal is used. The next free
        // unique id ends up one past the largest id assigned.
        std::unordered_map<uint32_t, uint32_t> binlog_ordinal_to_uid;
        uint32_t binlog_next_uid = 0;
        uint32_t ordinal = 0;
        for (const auto& binlog_column : row_binlog_schema->columns) {
            const uint32_t uid = binlog_column.col_unique_id >= 0
                                         ? static_cast<uint32_t>(binlog_column.col_unique_id)
                                         : ordinal;
            binlog_ordinal_to_uid[ordinal] = uid;
            if (binlog_next_uid <= uid) {
                binlog_next_uid = uid + 1;
            }
            ++ordinal;
        }

        SchemaCreateOptions row_binlog_schema_options = {
                .col_ordinal_to_unique_id = binlog_ordinal_to_uid,
                .compression_type = compression_type,
                .inverted_index_file_storage_format = inverted_index_file_storage_format,
                .next_unique_id = binlog_next_uid};
        row_binlog_schema_pb = meta_pb.mutable_row_binlog_schema();
        init_schema_from_thrift(*row_binlog_schema, row_binlog_schema_options,
                                row_binlog_schema_pb);
    }

    if (binlog_config.has_value()) {
        BinlogConfig converted_binlog_config;
        converted_binlog_config = binlog_config.value();
        converted_binlog_config.to_pb(meta_pb.mutable_binlog_config());
    }

    // Map the thrift encryption algorithm; anything other than AES256 / SM4 is PLAINTEXT.
    auto encryption = EncryptionAlgorithmPB::PLAINTEXT;
    if (tde_algorithm == doris::TEncryptionAlgorithm::AES256) {
        encryption = EncryptionAlgorithmPB::AES_256_CTR;
    } else if (tde_algorithm == doris::TEncryptionAlgorithm::SM4) {
        encryption = EncryptionAlgorithmPB::SM4_128_CTR;
    }
    meta_pb.set_encryption_algorithm(encryption);

    // Initialize default external ColumnMeta usage according to storage format.
    // V2: legacy behavior, inline ColumnMetaPB only.
    // V3: V2 + external ColumnMetaPB (CMO) enabled by default.
    // Only V3 changes anything; every other storage format leaves the schemas as built.
    if (_storage_format == TStorageFormat::V3) {
        data_schema_pb->set_is_external_segment_column_meta_used(true);
        _schema->set_external_segment_meta_used_default(true);

        data_schema_pb->set_integer_type_default_use_plain_encoding(true);
        _schema->set_integer_type_default_use_plain_encoding(true);
        data_schema_pb->set_binary_plain_encoding_default_impl(
                BinaryPlainEncodingTypePB::BINARY_PLAIN_ENCODING_V2);
        _schema->set_binary_plain_encoding_default_impl(
                BinaryPlainEncodingTypePB::BINARY_PLAIN_ENCODING_V2);

        if (row_binlog_schema_pb != nullptr) {
            row_binlog_schema_pb->set_is_external_segment_column_meta_used(true);
            row_binlog_schema_pb->set_integer_type_default_use_plain_encoding(true);
            row_binlog_schema_pb->set_binary_plain_encoding_default_impl(
                    BinaryPlainEncodingTypePB::BINARY_PLAIN_ENCODING_V2);
        }
    }

    init_from_pb(meta_pb);
}
290
291
// Copy constructor: member-wise copy of the fields listed in the initializer list below.
// _schema, _delete_bitmap, _binlog_delvec and _row_binlog_schema are copied from `b`
// as-is; NOTE(review): if these members are shared pointers the copy shares the
// underlying objects with `b` instead of deep-copying them -- confirm in the header.
// NOTE(review): the parameterized constructor initializes _storage_format from its
// argument, but this initializer list does not mention it, so the copy keeps whatever
// the member's default is -- confirm that is intended.
// NOTE(review): _handle (released by the destructor when non-null) is not copied here
// either; presumably deliberate so that only one object releases it -- confirm.
TabletMeta::TabletMeta(const TabletMeta& b)
292
1.27k
        : MetadataAdder(b),
293
1.27k
          _table_id(b._table_id),
294
1.27k
          _index_id(b._index_id),
295
1.27k
          _partition_id(b._partition_id),
296
1.27k
          _tablet_id(b._tablet_id),
297
1.27k
          _replica_id(b._replica_id),
298
1.27k
          _schema_hash(b._schema_hash),
299
1.27k
          _shard_id(b._shard_id),
300
1.27k
          _creation_time(b._creation_time),
301
1.27k
          _cumulative_layer_point(b._cumulative_layer_point),
302
1.27k
          _tablet_uid(b._tablet_uid),
303
1.27k
          _tablet_type(b._tablet_type),
304
1.27k
          _tablet_state(b._tablet_state),
305
1.27k
          _schema(b._schema),
306
1.27k
          _rs_metas(b._rs_metas),
307
1.27k
          _stale_rs_metas(b._stale_rs_metas),
308
1.27k
          _in_restore_mode(b._in_restore_mode),
309
1.27k
          _preferred_rowset_type(b._preferred_rowset_type),
310
1.27k
          _storage_policy_id(b._storage_policy_id),
311
1.27k
          _cooldown_meta_id(b._cooldown_meta_id),
312
1.27k
          _enable_unique_key_merge_on_write(b._enable_unique_key_merge_on_write),
313
1.27k
          _delete_bitmap(b._delete_bitmap),
314
1.27k
          _binlog_delvec(b._binlog_delvec),
315
1.27k
          _row_binlog_schema_hash(b._row_binlog_schema_hash),
316
1.27k
          _row_binlog_schema(b._row_binlog_schema),
317
1.27k
          _row_binlog_rs_metas(b._row_binlog_rs_metas),
318
1.27k
          _binlog_config(b._binlog_config),
319
1.27k
          _compaction_policy(b._compaction_policy),
320
1.27k
          _time_series_compaction_goal_size_mbytes(b._time_series_compaction_goal_size_mbytes),
321
          _time_series_compaction_file_count_threshold(
322
1.27k
                  b._time_series_compaction_file_count_threshold),
323
          _time_series_compaction_time_threshold_seconds(
324
1.27k
                  b._time_series_compaction_time_threshold_seconds),
325
          _time_series_compaction_empty_rowsets_threshold(
326
1.27k
                  b._time_series_compaction_empty_rowsets_threshold),
327
1.27k
          _time_series_compaction_level_threshold(b._time_series_compaction_level_threshold),
328
          _vertical_compaction_num_columns_per_group(
329
1.27k
                  b._vertical_compaction_num_columns_per_group) {};
330
331
void TabletMeta::init_column_from_tcolumn(uint32_t unique_id, const TColumn& tcolumn,
332
25.2M
                                          ColumnPB* column) {
333
25.2M
    column->set_unique_id(unique_id);
334
25.2M
    column->set_name(tcolumn.column_name);
335
25.2M
    column->set_is_auto_increment(tcolumn.is_auto_increment);
336
25.3M
    if (tcolumn.__isset.is_on_update_current_timestamp) {
337
25.3M
        column->set_is_on_update_current_timestamp(tcolumn.is_on_update_current_timestamp);
338
25.3M
    }
339
25.2M
    string data_type;
340
25.2M
    EnumToString(TPrimitiveType, tcolumn.column_type.type, data_type);
341
25.2M
    column->set_type(data_type);
342
343
25.2M
    uint32_t length = TabletColumn::get_field_length_by_type(tcolumn.column_type.type,
344
25.2M
                                                             tcolumn.column_type.len);
345
25.2M
    column->set_length(length);
346
25.2M
    column->set_index_length(length);
347
25.2M
    column->set_precision(tcolumn.column_type.precision);
348
25.2M
    column->set_frac(tcolumn.column_type.scale);
349
350
25.2M
    if (tcolumn.__isset.result_is_nullable) {
351
2.66k
        column->set_result_is_nullable(tcolumn.result_is_nullable);
352
2.66k
    }
353
354
25.2M
    if (tcolumn.__isset.be_exec_version) {
355
25.2M
        column->set_be_exec_version(tcolumn.be_exec_version);
356
25.2M
    }
357
358
25.2M
    if (tcolumn.column_type.type == TPrimitiveType::VARCHAR ||
359
25.2M
        tcolumn.column_type.type == TPrimitiveType::STRING) {
360
10.3M
        if (!tcolumn.column_type.__isset.index_len) {
361
204
            column->set_index_length(10);
362
10.3M
        } else {
363
10.3M
            column->set_index_length(tcolumn.column_type.index_len);
364
10.3M
        }
365
10.3M
    }
366
25.2M
    if (!tcolumn.is_key) {
367
17.8M
        column->set_is_key(false);
368
17.8M
        if (tcolumn.__isset.aggregation) {
369
2.67k
            column->set_aggregation(tcolumn.aggregation);
370
17.8M
        } else {
371
17.8M
            string aggregation_type;
372
17.8M
            EnumToString(TAggregationType, tcolumn.aggregation_type, aggregation_type);
373
17.8M
            column->set_aggregation(aggregation_type);
374
17.8M
        }
375
17.8M
    } else {
376
7.42M
        column->set_is_key(true);
377
7.42M
        column->set_aggregation("NONE");
378
7.42M
    }
379
25.2M
    column->set_is_nullable(tcolumn.is_allow_null);
380
25.2M
    if (tcolumn.__isset.default_value) {
381
2.37M
        column->set_default_value(tcolumn.default_value);
382
2.37M
    }
383
25.2M
    if (tcolumn.__isset.is_bloom_filter_column) {
384
12.6k
        column->set_is_bf_column(tcolumn.is_bloom_filter_column);
385
12.6k
    }
386
25.3M
    if (tcolumn.__isset.visible) {
387
25.3M
        column->set_visible(tcolumn.visible);
388
25.3M
    }
389
27.6M
    for (size_t i = 0; i < tcolumn.children_column.size(); i++) {
390
2.38M
        ColumnPB* children_column = column->add_children_columns();
391
2.38M
        init_column_from_tcolumn(tcolumn.children_column[i].col_unique_id,
392
2.38M
                                 tcolumn.children_column[i], children_column);
393
2.38M
    }
394
25.3M
    if (tcolumn.column_type.__isset.variant_max_subcolumns_count) {
395
25.3M
        column->set_variant_max_subcolumns_count(tcolumn.column_type.variant_max_subcolumns_count);
396
25.3M
    }
397
25.2M
    if (tcolumn.__isset.pattern_type) {
398
33.7k
        switch (tcolumn.pattern_type) {
399
2.08k
        case TPatternType::MATCH_NAME:
400
2.08k
            column->set_pattern_type(PatternTypePB::MATCH_NAME);
401
2.08k
            break;
402
31.6k
        case TPatternType::MATCH_NAME_GLOB:
403
31.6k
            column->set_pattern_type(PatternTypePB::MATCH_NAME_GLOB);
404
33.7k
        }
405
33.7k
    }
406
25.3M
    if (tcolumn.__isset.variant_enable_typed_paths_to_sparse) {
407
25.3M
        column->set_variant_enable_typed_paths_to_sparse(
408
25.3M
                tcolumn.variant_enable_typed_paths_to_sparse);
409
25.3M
    }
410
25.3M
    if (tcolumn.__isset.variant_max_sparse_column_statistics_size) {
411
25.3M
        column->set_variant_max_sparse_column_statistics_size(
412
25.3M
                tcolumn.variant_max_sparse_column_statistics_size);
413
25.3M
    }
414
25.2M
    if (tcolumn.__isset.variant_sparse_hash_shard_count) {
415
22.9M
        column->set_variant_sparse_hash_shard_count(tcolumn.variant_sparse_hash_shard_count);
416
22.9M
    }
417
25.3M
    if (tcolumn.column_type.__isset.variant_enable_doc_mode) {
418
25.3M
        column->set_variant_enable_doc_mode(tcolumn.column_type.variant_enable_doc_mode);
419
25.3M
    }
420
25.2M
    if (tcolumn.__isset.variant_doc_materialization_min_rows) {
421
22.9M
        column->set_variant_doc_materialization_min_rows(
422
22.9M
                tcolumn.variant_doc_materialization_min_rows);
423
22.9M
    }
424
25.2M
    if (tcolumn.__isset.variant_doc_hash_shard_count) {
425
22.9M
        column->set_variant_doc_hash_shard_count(tcolumn.variant_doc_hash_shard_count);
426
22.9M
    }
427
25.2M
    if (tcolumn.__isset.variant_enable_nested_group) {
428
22.9M
        column->set_variant_enable_nested_group(tcolumn.variant_enable_nested_group);
429
22.9M
    }
430
25.2M
}
431
432
void TabletMeta::init_schema_from_thrift(const TTabletSchema& tablet_schema,
433
                                         const SchemaCreateOptions& schema_create_options,
434
7.27k
                                         TabletSchemaPB* tablet_schema_pb) {
435
7.27k
    const std::unordered_map<uint32_t, uint32_t>& col_ordinal_to_unique_id =
436
7.27k
            schema_create_options.col_ordinal_to_unique_id;
437
7.27k
    TCompressionType::type compression_type = schema_create_options.compression_type;
438
7.27k
    TInvertedIndexFileStorageFormat::type inverted_index_file_storage_format =
439
7.27k
            schema_create_options.inverted_index_file_storage_format;
440
7.27k
    uint32_t next_unique_id = schema_create_options.next_unique_id;
441
442
7.27k
    tablet_schema_pb->set_num_short_key_columns(tablet_schema.short_key_column_count);
443
7.27k
    tablet_schema_pb->set_num_rows_per_row_block(config::default_num_rows_per_column_file_block);
444
7.27k
    tablet_schema_pb->set_sequence_col_idx(tablet_schema.sequence_col_idx);
445
7.27k
    auto p_seq_map = tablet_schema_pb->mutable_seq_map(); // ColumnGroupsPB
446
7.27k
    for (auto& it : tablet_schema.seq_map) {              // std::vector< ::doris::TColumnGroup>
447
0
        uint32_t key = it.sequence_column;
448
0
        ColumnGroupPB* cg_pb = p_seq_map->add_cg(); // ColumnGroupPB {key: {v1, v2, v3}}
449
0
        cg_pb->set_sequence_column(key);
450
0
        for (auto v : it.columns_in_group) {
451
0
            cg_pb->add_columns_in_group(v);
452
0
        }
453
0
    }
454
455
7.27k
    switch (tablet_schema.keys_type) {
456
4.95k
    case TKeysType::DUP_KEYS:
457
4.95k
        tablet_schema_pb->set_keys_type(KeysType::DUP_KEYS);
458
4.95k
        break;
459
1.31k
    case TKeysType::UNIQUE_KEYS:
460
1.31k
        tablet_schema_pb->set_keys_type(KeysType::UNIQUE_KEYS);
461
1.31k
        break;
462
746
    case TKeysType::AGG_KEYS:
463
746
        tablet_schema_pb->set_keys_type(KeysType::AGG_KEYS);
464
746
        break;
465
257
    default:
466
257
        LOG(WARNING) << "unknown tablet keys type";
467
257
        break;
468
7.27k
    }
469
470
    // compress_kind used to compress segment files
471
7.26k
    tablet_schema_pb->set_compress_kind(COMPRESS_LZ4);
472
473
    // compression_type used to compress segment page
474
7.26k
    switch (compression_type) {
475
0
    case TCompressionType::NO_COMPRESSION:
476
0
        tablet_schema_pb->set_compression_type(segment_v2::NO_COMPRESSION);
477
0
        break;
478
0
    case TCompressionType::SNAPPY:
479
0
        tablet_schema_pb->set_compression_type(segment_v2::SNAPPY);
480
0
        break;
481
0
    case TCompressionType::LZ4:
482
0
        tablet_schema_pb->set_compression_type(segment_v2::LZ4);
483
0
        break;
484
681
    case TCompressionType::LZ4F:
485
681
        tablet_schema_pb->set_compression_type(segment_v2::LZ4F);
486
681
        break;
487
0
    case TCompressionType::ZLIB:
488
0
        tablet_schema_pb->set_compression_type(segment_v2::ZLIB);
489
0
        break;
490
6.58k
    case TCompressionType::ZSTD:
491
6.58k
        tablet_schema_pb->set_compression_type(segment_v2::ZSTD);
492
6.58k
        break;
493
0
    default:
494
0
        tablet_schema_pb->set_compression_type(segment_v2::LZ4F);
495
0
        break;
496
7.26k
    }
497
498
7.25k
    switch (inverted_index_file_storage_format) {
499
0
    case TInvertedIndexFileStorageFormat::V1:
500
0
        tablet_schema_pb->set_inverted_index_storage_format(InvertedIndexStorageFormatPB::V1);
501
0
        break;
502
681
    case TInvertedIndexFileStorageFormat::V2:
503
681
        tablet_schema_pb->set_inverted_index_storage_format(InvertedIndexStorageFormatPB::V2);
504
681
        break;
505
6.57k
    case TInvertedIndexFileStorageFormat::V3:
506
6.57k
        tablet_schema_pb->set_inverted_index_storage_format(InvertedIndexStorageFormatPB::V3);
507
6.57k
        break;
508
0
    default:
509
0
        tablet_schema_pb->set_inverted_index_storage_format(InvertedIndexStorageFormatPB::V3);
510
0
        break;
511
7.25k
    }
512
513
7.25k
    switch (tablet_schema.sort_type) {
514
0
    case TSortType::type::ZORDER:
515
0
        tablet_schema_pb->set_sort_type(SortType::ZORDER);
516
0
        break;
517
7.25k
    default:
518
7.25k
        tablet_schema_pb->set_sort_type(SortType::LEXICAL);
519
7.25k
    }
520
7.25k
    tablet_schema_pb->set_sort_col_num(tablet_schema.sort_col_num);
521
7.25k
    for (const auto& i : tablet_schema.cluster_key_uids) {
522
2
        tablet_schema_pb->add_cluster_key_uids(i);
523
2
    }
524
525
    // set column information
526
7.25k
    uint32_t col_ordinal = 0;
527
7.25k
    bool has_bf_columns = false;
528
42.9k
    for (TColumn tcolumn : tablet_schema.columns) {
529
42.9k
        ColumnPB* column = tablet_schema_pb->add_column();
530
42.9k
        uint32_t unique_id = -1;
531
42.9k
        if (tcolumn.col_unique_id >= 0) {
532
40.7k
            unique_id = tcolumn.col_unique_id;
533
40.7k
        } else {
534
2.16k
            unique_id = col_ordinal_to_unique_id.at(col_ordinal);
535
2.16k
        }
536
42.9k
        col_ordinal++;
537
42.9k
        init_column_from_tcolumn(unique_id, tcolumn, column);
538
539
42.9k
        if (column->is_bf_column()) {
540
0
            has_bf_columns = true;
541
0
        }
542
543
42.9k
        if (tablet_schema.__isset.indexes) {
544
2
            for (auto& index : tablet_schema.indexes) {
545
2
                if (index.index_type == TIndexType::type::BLOOMFILTER ||
546
2
                    index.index_type == TIndexType::type::NGRAM_BF) {
547
0
                    DCHECK_EQ(index.columns.size(), 1);
548
0
                    if (iequal(tcolumn.column_name, index.columns[0])) {
549
0
                        column->set_is_bf_column(true);
550
0
                        break;
551
0
                    }
552
0
                }
553
2
            }
554
2
        }
555
42.9k
    }
556
557
    // copy index meta
558
7.25k
    if (tablet_schema.__isset.indexes) {
559
1
        for (auto& index : tablet_schema.indexes) {
560
1
            TabletIndexPB* index_pb = tablet_schema_pb->add_index();
561
1
            index_pb->set_index_id(index.index_id);
562
1
            index_pb->set_index_name(index.index_name);
563
            // init col_unique_id in index at be side, since col_unique_id may be -1 at fe side
564
            // get column unique id by name
565
1
            for (auto column_name : index.columns) {
566
2
                for (auto column : tablet_schema_pb->column()) {
567
2
                    if (iequal(column.name(), column_name)) {
568
1
                        index_pb->add_col_unique_id(column.unique_id());
569
1
                    }
570
2
                }
571
1
            }
572
1
            switch (index.index_type) {
573
1
            case TIndexType::BITMAP:
574
1
                index_pb->set_index_type(IndexType::BITMAP);
575
1
                break;
576
0
            case TIndexType::INVERTED:
577
0
                index_pb->set_index_type(IndexType::INVERTED);
578
0
                break;
579
0
            case TIndexType::ANN:
580
0
                index_pb->set_index_type(IndexType::ANN);
581
0
                break;
582
0
            case TIndexType::BLOOMFILTER:
583
0
                index_pb->set_index_type(IndexType::BLOOMFILTER);
584
0
                break;
585
0
            case TIndexType::NGRAM_BF:
586
0
                index_pb->set_index_type(IndexType::NGRAM_BF);
587
0
                break;
588
1
            }
589
590
1
            if (index.__isset.properties) {
591
0
                auto properties = index_pb->mutable_properties();
592
0
                for (auto kv : index.properties) {
593
0
                    (*properties)[kv.first] = kv.second;
594
0
                }
595
0
            }
596
1
        }
597
1
    }
598
599
7.25k
    tablet_schema_pb->set_next_column_unique_id(next_unique_id);
600
7.25k
    if (has_bf_columns && tablet_schema.__isset.bloom_filter_fpp) {
601
0
        tablet_schema_pb->set_bf_fpp(tablet_schema.bloom_filter_fpp);
602
0
    }
603
604
7.25k
    if (tablet_schema.__isset.is_in_memory) {
605
6.59k
        tablet_schema_pb->set_is_in_memory(tablet_schema.is_in_memory);
606
6.59k
    }
607
608
7.25k
    if (tablet_schema.__isset.disable_auto_compaction) {
609
6.59k
        tablet_schema_pb->set_disable_auto_compaction(tablet_schema.disable_auto_compaction);
610
6.59k
    }
611
612
    // Deprecated legacy flatten-nested switch. Distinct from variant_enable_nested_group.
613
7.26k
    if (tablet_schema.__isset.variant_enable_flatten_nested) {
614
7.26k
        tablet_schema_pb->set_enable_variant_flatten_nested(
615
7.26k
                tablet_schema.variant_enable_flatten_nested);
616
7.26k
    }
617
618
7.27k
    if (tablet_schema.__isset.enable_single_replica_compaction) {
619
7.27k
        tablet_schema_pb->set_enable_single_replica_compaction(
620
7.27k
                tablet_schema.enable_single_replica_compaction);
621
7.27k
    }
622
623
7.26k
    if (tablet_schema.__isset.delete_sign_idx) {
624
7.26k
        tablet_schema_pb->set_delete_sign_idx(tablet_schema.delete_sign_idx);
625
7.26k
    }
626
7.26k
    if (tablet_schema.__isset.store_row_column) {
627
7.26k
        tablet_schema_pb->set_store_row_column(tablet_schema.store_row_column);
628
7.26k
    }
629
7.26k
    if (tablet_schema.__isset.row_store_page_size) {
630
7.26k
        tablet_schema_pb->set_row_store_page_size(tablet_schema.row_store_page_size);
631
7.26k
    }
632
7.26k
    if (tablet_schema.__isset.storage_page_size) {
633
7.26k
        tablet_schema_pb->set_storage_page_size(tablet_schema.storage_page_size);
634
7.26k
    }
635
7.26k
    if (tablet_schema.__isset.storage_dict_page_size) {
636
7.26k
        tablet_schema_pb->set_storage_dict_page_size(tablet_schema.storage_dict_page_size);
637
7.26k
    }
638
7.26k
    if (tablet_schema.__isset.skip_write_index_on_load) {
639
7.26k
        tablet_schema_pb->set_skip_write_index_on_load(tablet_schema.skip_write_index_on_load);
640
7.26k
    }
641
7.25k
    if (tablet_schema.__isset.row_store_col_cids) {
642
6.58k
        tablet_schema_pb->mutable_row_store_column_unique_ids()->Add(
643
6.58k
                tablet_schema.row_store_col_cids.begin(), tablet_schema.row_store_col_cids.end());
644
6.58k
    }
645
7.25k
}
646
647
58.4k
void TabletMeta::remove_rowset_delete_bitmap(const RowsetId& rowset_id, const Version& version) {
648
58.4k
    if (_enable_unique_key_merge_on_write) {
649
33.9k
        delete_bitmap().remove({rowset_id, 0, 0}, {rowset_id, UINT32_MAX, 0});
650
33.9k
        if (config::enable_mow_verbose_log) {
651
0
            LOG_INFO("delete rowset delete bitmap. tablet={}, rowset={}, version={}", tablet_id(),
652
0
                     rowset_id.to_string(), version.to_string());
653
0
        }
654
33.9k
        size_t rowset_cache_version_size = delete_bitmap().remove_rowset_cache_version(rowset_id);
655
33.9k
        _check_mow_rowset_cache_version_size(rowset_cache_version_size);
656
33.9k
    }
657
58.4k
}
658
659
176
// Loads a TabletMetaPB from the header file at `file_path` and initializes
// this object from it. Returns the load error unchanged on failure.
Status TabletMeta::create_from_file(const string& file_path) {
    TabletMetaPB loaded_pb;
    RETURN_IF_ERROR(load_from_file(file_path, &loaded_pb));

    init_from_pb(loaded_pb);
    return Status::OK();
}
665
666
356
// Reads the header file at `file_path` and copies the contained message into
// `*tablet_meta_pb`.
// Returns the deserialize error as-is, or PARSE_PROTOBUF_ERROR when copying
// the message throws.
Status TabletMeta::load_from_file(const string& file_path, TabletMetaPB* tablet_meta_pb) {
    FileHeader<TabletMetaPB> header(file_path);
    // deserialize() validates file length, signature and protobuf checksum.
    RETURN_IF_ERROR(header.deserialize());

    try {
        tablet_meta_pb->CopyFrom(header.message());
    } catch (const std::exception& ex) {
        LOG(WARNING) << "Failed to copy protocol buffer object: " << ex.what()
                     << ", file=" << file_path;
        return Status::Error<PARSE_PROTOBUF_ERROR>(
                "fail to copy protocol buffer object. file={}, error={}", file_path, ex.what());
    }
    return Status::OK();
}
680
681
6
// Initializes this object from a serialized header held in memory
// (`buffer`, `buffer_size` bytes).
// Returns the deserialize error as-is, or PARSE_PROTOBUF_ERROR when copying
// the message throws.
Status TabletMeta::create_from_buffer(const uint8_t* buffer, size_t buffer_size) {
    // No backing file: the header is parsed straight from memory.
    FileHeader<TabletMetaPB> header("");
    RETURN_IF_ERROR(header.deserialize_from_memory(buffer, buffer_size));

    TabletMetaPB parsed_pb;
    try {
        parsed_pb.CopyFrom(header.message());
    } catch (const std::exception& ex) {
        LOG(WARNING) << "Failed to copy protocol buffer object from buffer: " << ex.what();
        return Status::Error<ErrorCode::PARSE_PROTOBUF_ERROR>(
                "fail to copy protocol buffer object from buffer. error={}", ex.what());
    }

    init_from_pb(parsed_pb);
    return Status::OK();
}
697
698
// Builds the header file path "<schema_hash_path>/<tablet_id>.hdr".
std::string TabletMeta::construct_header_file_path(const string& schema_hash_path,
                                                   int64_t tablet_id) {
    std::ostringstream path_builder;
    path_builder << schema_hash_path;
    path_builder << "/";
    path_builder << tablet_id;
    path_builder << ".hdr";
    return path_builder.str();
}
704
705
0
// Renders this tablet meta as pretty-printed JSON (bytes fields encoded as
// base64) and writes it to `file_path` on the local file system.
// Returns the first error from creating, appending to or closing the file.
Status TabletMeta::save_as_json(const string& file_path) {
    json2pb::Pb2JsonOptions render_options;
    render_options.pretty_json = true;
    render_options.bytes_to_base64 = true;

    std::string json_text;
    to_json(&json_text, render_options);

    // Persist the rendered text.
    io::FileWriterPtr writer;
    RETURN_IF_ERROR(io::global_local_filesystem()->create_file(file_path, &writer));
    RETURN_IF_ERROR(writer->append(json_text));
    RETURN_IF_ERROR(writer->close());
    return Status::OK();
}
718
719
5.16k
// Converts this tablet meta to its protobuf form and writes it to `file_path`
// through the two-argument save() overload.
Status TabletMeta::save(const string& file_path) {
    TabletMetaPB meta_snapshot;
    to_meta_pb(&meta_snapshot, false);
    return TabletMeta::save(file_path, meta_snapshot);
}
724
725
5.34k
// Writes `tablet_meta_pb` to the header file at `file_path`.
//
// Returns INTERNAL_ERROR when copying the message into the file header
// throws; otherwise returns the first error from prepare()/serialize(), or OK.
Status TabletMeta::save(const string& file_path, const TabletMetaPB& tablet_meta_pb) {
    DCHECK(!file_path.empty());
    FileHeader<TabletMetaPB> file_header(file_path);
    try {
        file_header.mutable_message()->CopyFrom(tablet_meta_pb);
    } catch (const std::exception& e) {
        // Include the exception text so the failure cause is not lost.
        LOG(WARNING) << "fail to copy protocol buffer object. file='" << file_path
                     << "', error=" << e.what();
        return Status::Error<ErrorCode::INTERNAL_ERROR>(
                "fail to copy protocol buffer object. file={}", file_path);
    } catch (...) {
        // The original message opened a quote before the path and never closed it.
        LOG(WARNING) << "fail to copy protocol buffer object. file='" << file_path << "'";
        return Status::Error<ErrorCode::INTERNAL_ERROR>(
                "fail to copy protocol buffer object. file={}", file_path);
    }
    RETURN_IF_ERROR(file_header.prepare());
    RETURN_IF_ERROR(file_header.serialize());
    return Status::OK();
}
739
740
20.6k
// Persists this tablet meta into `data_dir` while holding _meta_lock
// exclusively; the actual work is done by _save_meta().
Status TabletMeta::save_meta(DataDir* data_dir) {
    std::lock_guard<std::shared_mutex> exclusive_guard(_meta_lock);
    return _save_meta(data_dir);
}
744
745
20.5k
// Serializes this tablet meta and stores it through TabletMetaManager::save().
// Does not take _meta_lock itself (save_meta() takes it before calling here).
//
// Logs at FATAL level when the tablet uid is all-zero or when the save fails.
// Logs at INFO level when serialize + save together take more than one second.
Status TabletMeta::_save_meta(DataDir* data_dir) {
    // An all-zero uid is treated as invalid.
    const bool uid_is_zero = (_tablet_uid.hi == 0) && (_tablet_uid.lo == 0);
    if (uid_is_zero) {
        LOG(FATAL) << "tablet_uid is invalid"
                   << " tablet=" << tablet_id() << " _tablet_uid=" << _tablet_uid.to_string();
    }

    string meta_binary;
    const auto serialize_begin = MonotonicMicros();
    serialize(&meta_binary);
    const auto serialize_end = MonotonicMicros();

    Status status = TabletMetaManager::save(data_dir, tablet_id(), schema_hash(), meta_binary);
    if (!status.ok()) {
        LOG(FATAL) << "fail to save tablet_meta. status=" << status << ", tablet_id=" << tablet_id()
                   << ", schema_hash=" << schema_hash();
    }
    const auto save_end = MonotonicMicros();

    const auto total_cost = save_end - serialize_begin;
    if (total_cost > 1 * 1000 * 1000) {
        LOG(INFO) << "save tablet(" << tablet_id() << ") meta too slow. serialize cost "
                  << serialize_end - serialize_begin
                  << "(us), serialized binary size: " << meta_binary.length()
                  << "(bytes), write rocksdb cost " << save_end - serialize_end << "(us)";
    }
    return status;
}
770
771
20.7k
// Serializes this tablet meta into `*meta_binary`.
//
// While doing so it also:
//  - warns when the partition id is not positive;
//  - refreshes _avg_rs_meta_serialize_size when the tablet has any rowset metas;
//  - drops the stale rowset metas from the serialized form and serializes
//    again when the first attempt failed or exceeded
//    config::tablet_meta_serialize_size_limit.
// Logs at FATAL level when the final serialization attempt failed.
void TabletMeta::serialize(string* meta_binary) {
    TabletMetaPB tablet_meta_pb;
    to_meta_pb(&tablet_meta_pb, false);

    if (tablet_meta_pb.partition_id() <= 0) {
        LOG(WARNING) << "invalid partition id " << tablet_meta_pb.partition_id() << " tablet "
                     << tablet_meta_pb.tablet_id();
    }
    DBUG_EXECUTE_IF("TabletMeta::serialize::zero_partition_id", {
        long old_partition_id = tablet_meta_pb.partition_id();
        tablet_meta_pb.set_partition_id(0);
        LOG(WARNING) << "set debug point TabletMeta::serialize::zero_partition_id old="
                     << old_partition_id << " new=" << tablet_meta_pb.DebugString();
    });

    bool serialize_success = tablet_meta_pb.SerializeToString(meta_binary);

    const auto rowset_meta_count = _rs_metas.size() + _stale_rs_metas.size();
    if (rowset_meta_count > 0) {
        _avg_rs_meta_serialize_size = meta_binary->length() / rowset_meta_count;

        const bool needs_retry =
                !serialize_success ||
                meta_binary->length() > config::tablet_meta_serialize_size_limit;
        if (needs_retry) {
            // Remember the numbers of the first attempt for the log line below.
            int64_t origin_meta_size = meta_binary->length();
            int64_t stale_rowsets_num = tablet_meta_pb.stale_rs_metas().size();

            tablet_meta_pb.clear_stale_rs_metas();
            meta_binary->clear();
            serialize_success = tablet_meta_pb.SerializeToString(meta_binary);

            LOG(WARNING) << "tablet meta serialization size exceeds limit: "
                         << config::tablet_meta_serialize_size_limit
                         << " clean up stale rowsets, tablet id: " << tablet_id()
                         << " stale rowset num: " << stale_rowsets_num
                         << " serialization size before clean " << origin_meta_size
                         << " serialization size after clean " << meta_binary->length();
        }
    }

    if (!serialize_success) {
        LOG(FATAL) << "failed to serialize meta " << tablet_id();
    }
}
808
809
284k
// Parses `meta_binary` as a TabletMetaPB and initializes this object from it.
// Returns INIT_FAILED when the bytes cannot be parsed.
Status TabletMeta::deserialize(std::string_view meta_binary) {
    TabletMetaPB parsed_pb;
    const auto byte_count = static_cast<int32_t>(meta_binary.size());
    if (!parsed_pb.ParseFromArray(meta_binary.data(), byte_count)) {
        return Status::Error<INIT_FAILED>("parse tablet meta failed");
    }

    init_from_pb(parsed_pb);
    return Status::OK();
}
819
820
597k
void TabletMeta::init_from_pb(const TabletMetaPB& tablet_meta_pb) {
821
597k
    _table_id = tablet_meta_pb.table_id();
822
597k
    _index_id = tablet_meta_pb.index_id();
823
597k
    _partition_id = tablet_meta_pb.partition_id();
824
597k
    _tablet_id = tablet_meta_pb.tablet_id();
825
597k
    _replica_id = tablet_meta_pb.replica_id();
826
597k
    _schema_hash = tablet_meta_pb.schema_hash();
827
597k
    _shard_id = tablet_meta_pb.shard_id();
828
597k
    _creation_time = tablet_meta_pb.creation_time();
829
597k
    _cumulative_layer_point = tablet_meta_pb.cumulative_layer_point();
830
597k
    _tablet_uid = TabletUid(tablet_meta_pb.tablet_uid());
831
597k
    _ttl_seconds = tablet_meta_pb.ttl_seconds();
832
597k
    if (tablet_meta_pb.has_tablet_type()) {
833
596k
        _tablet_type = tablet_meta_pb.tablet_type();
834
596k
    } else {
835
143
        _tablet_type = TabletTypePB::TABLET_TYPE_DISK;
836
143
    }
837
838
    // init _tablet_state
839
597k
    switch (tablet_meta_pb.tablet_state()) {
840
20.4k
    case PB_NOTREADY:
841
20.4k
        _tablet_state = TabletState::TABLET_NOTREADY;
842
20.4k
        break;
843
569k
    case PB_RUNNING:
844
569k
        _tablet_state = TabletState::TABLET_RUNNING;
845
569k
        break;
846
0
    case PB_TOMBSTONED:
847
0
        _tablet_state = TabletState::TABLET_TOMBSTONED;
848
0
        break;
849
0
    case PB_STOPPED:
850
0
        _tablet_state = TabletState::TABLET_STOPPED;
851
0
        break;
852
5.73k
    case PB_SHUTDOWN:
853
5.73k
        _tablet_state = TabletState::TABLET_SHUTDOWN;
854
5.73k
        break;
855
0
    default:
856
0
        LOG(WARNING) << "tablet has no state. tablet=" << tablet_id()
857
0
                     << ", schema_hash=" << schema_hash();
858
597k
    }
859
860
    // init _schema
861
596k
    TabletSchemaSPtr schema = std::make_shared<TabletSchema>();
862
596k
    schema->init_from_pb(tablet_meta_pb.schema());
863
596k
    if (_handle) {
864
4
        TabletSchemaCache::instance()->release(_handle);
865
4
    }
866
596k
    auto pair = TabletSchemaCache::instance()->insert(schema->to_key());
867
596k
    _handle = pair.first;
868
596k
    _schema = pair.second;
869
870
596k
    if (tablet_meta_pb.has_row_binlog_schema()) {
871
1
        TabletSchemaSPtr row_binlog_schema = std::make_shared<TabletSchema>();
872
1
        row_binlog_schema->init_from_pb(tablet_meta_pb.row_binlog_schema());
873
1
        _row_binlog_schema = std::move(row_binlog_schema);
874
1
        _row_binlog_schema_hash = tablet_meta_pb.row_binlog_schema_hash();
875
1
    }
876
877
597k
    if (tablet_meta_pb.has_enable_unique_key_merge_on_write()) {
878
597k
        _enable_unique_key_merge_on_write = tablet_meta_pb.enable_unique_key_merge_on_write();
879
597k
        _delete_bitmap->set_tablet_id(_tablet_id);
880
597k
        _binlog_delvec->set_tablet_id(_tablet_id);
881
597k
    }
882
883
    // init _rs_metas
884
596k
    for (auto& it : tablet_meta_pb.rs_metas()) {
885
493k
        RowsetMetaSharedPtr rs_meta(new RowsetMeta());
886
493k
        rs_meta->init_from_pb(it);
887
493k
        _rs_metas.emplace(rs_meta->version(), rs_meta);
888
493k
    }
889
890
    // For mow table, delete bitmap of stale rowsets has not been persisted.
891
    // When be restart, query should not read the stale rowset, otherwise duplicate keys
892
    // will be read out. Therefore, we don't add them to _stale_rs_meta for mow table.
893
597k
    if (!config::skip_loading_stale_rowset_meta && !_enable_unique_key_merge_on_write) {
894
466k
        for (auto& it : tablet_meta_pb.stale_rs_metas()) {
895
13.6k
            RowsetMetaSharedPtr rs_meta(new RowsetMeta());
896
13.6k
            rs_meta->init_from_pb(it);
897
13.6k
            _stale_rs_metas.emplace(rs_meta->version(), rs_meta);
898
13.6k
        }
899
466k
    }
900
901
596k
    for (auto& it : tablet_meta_pb.row_binlog_rs_metas()) {
902
0
        RowsetMetaSharedPtr rs_meta(new RowsetMeta());
903
0
        rs_meta->init_from_pb(it);
904
0
        _row_binlog_rs_metas.emplace(rs_meta->version(), rs_meta);
905
0
    }
906
907
597k
    if (tablet_meta_pb.has_in_restore_mode()) {
908
597k
        _in_restore_mode = tablet_meta_pb.in_restore_mode();
909
597k
    }
910
911
596k
    if (tablet_meta_pb.has_preferred_rowset_type()) {
912
590k
        _preferred_rowset_type = tablet_meta_pb.preferred_rowset_type();
913
590k
    }
914
915
596k
    _storage_policy_id = tablet_meta_pb.storage_policy_id();
916
596k
    if (tablet_meta_pb.has_cooldown_meta_id()) {
917
304k
        _cooldown_meta_id = tablet_meta_pb.cooldown_meta_id();
918
304k
    }
919
920
596k
    if (tablet_meta_pb.has_delete_bitmap()) {
921
50.9k
        int rst_ids_size = tablet_meta_pb.delete_bitmap().rowset_ids_size();
922
50.9k
        int seg_ids_size = tablet_meta_pb.delete_bitmap().segment_ids_size();
923
50.9k
        int versions_size = tablet_meta_pb.delete_bitmap().versions_size();
924
50.9k
        int seg_maps_size = tablet_meta_pb.delete_bitmap().segment_delete_bitmaps_size();
925
50.9k
        int binlog_mark_size = tablet_meta_pb.delete_bitmap().is_binlog_delvec_size();
926
50.9k
        CHECK(rst_ids_size == seg_ids_size && seg_ids_size == seg_maps_size &&
927
50.9k
              seg_maps_size == versions_size);
928
50.9k
        CHECK(binlog_mark_size == 0 || binlog_mark_size == rst_ids_size);
929
54.8k
        for (int i = 0; i < rst_ids_size; ++i) {
930
3.88k
            RowsetId rst_id;
931
3.88k
            rst_id.init(tablet_meta_pb.delete_bitmap().rowset_ids(i));
932
3.88k
            auto seg_id = tablet_meta_pb.delete_bitmap().segment_ids(i);
933
3.88k
            auto ver = tablet_meta_pb.delete_bitmap().versions(i);
934
3.88k
            auto bitmap = tablet_meta_pb.delete_bitmap().segment_delete_bitmaps(i).data();
935
3.88k
            bool from_binlog = tablet_meta_pb.delete_bitmap().is_binlog_delvec_size() > 0
936
3.88k
                                       ? tablet_meta_pb.delete_bitmap().is_binlog_delvec(i)
937
3.88k
                                       : false;
938
3.88k
            if (!from_binlog) {
939
3.88k
                delete_bitmap().delete_bitmap[{rst_id, seg_id, ver}] =
940
3.88k
                        roaring::Roaring::read(bitmap);
941
3.88k
            } else {
942
0
                binlog_delvec().delete_bitmap[{rst_id, seg_id, ver}] =
943
0
                        roaring::Roaring::read(bitmap);
944
0
            }
945
3.88k
        }
946
50.9k
    }
947
948
596k
    if (tablet_meta_pb.has_binlog_config()) {
949
291k
        _binlog_config = tablet_meta_pb.binlog_config();
950
291k
    }
951
596k
    _compaction_policy = tablet_meta_pb.compaction_policy();
952
596k
    _time_series_compaction_goal_size_mbytes =
953
596k
            tablet_meta_pb.time_series_compaction_goal_size_mbytes();
954
596k
    _time_series_compaction_file_count_threshold =
955
596k
            tablet_meta_pb.time_series_compaction_file_count_threshold();
956
596k
    _time_series_compaction_time_threshold_seconds =
957
596k
            tablet_meta_pb.time_series_compaction_time_threshold_seconds();
958
596k
    _time_series_compaction_empty_rowsets_threshold =
959
596k
            tablet_meta_pb.time_series_compaction_empty_rowsets_threshold();
960
596k
    _time_series_compaction_level_threshold =
961
596k
            tablet_meta_pb.time_series_compaction_level_threshold();
962
596k
    _vertical_compaction_num_columns_per_group =
963
596k
            tablet_meta_pb.vertical_compaction_num_columns_per_group();
964
965
597k
    if (tablet_meta_pb.has_encryption_algorithm()) {
966
597k
        _encryption_algorithm = tablet_meta_pb.encryption_algorithm();
967
597k
    }
968
969
596k
    if (tablet_meta_pb.has_row_binlog_schema_hash()) {
970
1
        _row_binlog_schema_hash = tablet_meta_pb.row_binlog_schema_hash();
971
1
    }
972
596k
}
973
974
26.2k
// Writes this TabletMeta into `*tablet_meta_pb`.
//
// Rowset metas are exported only when not running in cloud mode, or when
// `cloud_get_rowset_meta` is true. For merge-on-write tablets the delete
// bitmap is exported too, skipping entries that belong to stale rowsets;
// binlog delete vectors are appended to the same arrays with their
// is_binlog_delvec flag set.
void TabletMeta::to_meta_pb(TabletMetaPB* tablet_meta_pb, bool cloud_get_rowset_meta) {
    // Scalar identity / bookkeeping fields.
    tablet_meta_pb->set_table_id(table_id());
    tablet_meta_pb->set_index_id(index_id());
    tablet_meta_pb->set_partition_id(partition_id());
    tablet_meta_pb->set_tablet_id(tablet_id());
    tablet_meta_pb->set_replica_id(replica_id());
    tablet_meta_pb->set_schema_hash(schema_hash());
    tablet_meta_pb->set_shard_id(shard_id());
    tablet_meta_pb->set_creation_time(creation_time());
    tablet_meta_pb->set_cumulative_layer_point(cumulative_layer_point());
    *(tablet_meta_pb->mutable_tablet_uid()) = tablet_uid().to_proto();
    tablet_meta_pb->set_tablet_type(_tablet_type);
    tablet_meta_pb->set_ttl_seconds(_ttl_seconds);

    // Tablet state: map the in-memory enum onto the protobuf enum.
    switch (tablet_state()) {
    case TABLET_NOTREADY:
        tablet_meta_pb->set_tablet_state(PB_NOTREADY);
        break;
    case TABLET_RUNNING:
        tablet_meta_pb->set_tablet_state(PB_RUNNING);
        break;
    case TABLET_TOMBSTONED:
        tablet_meta_pb->set_tablet_state(PB_TOMBSTONED);
        break;
    case TABLET_STOPPED:
        tablet_meta_pb->set_tablet_state(PB_STOPPED);
        break;
    case TABLET_SHUTDOWN:
        tablet_meta_pb->set_tablet_state(PB_SHUTDOWN);
        break;
    }

    // RowsetMetaPB is separated from TabletMetaPB
    const bool export_rowset_metas = !config::is_cloud_mode() || cloud_get_rowset_meta;
    if (export_rowset_metas) {
        for (const auto& [_, rs_meta] : _rs_metas) {
            rs_meta->to_rowset_pb(tablet_meta_pb->add_rs_metas());
        }
        for (const auto& [_, rs_meta] : _stale_rs_metas) {
            rs_meta->to_rowset_pb(tablet_meta_pb->add_stale_rs_metas());
        }
        for (const auto& [_, rs_meta] : _row_binlog_rs_metas) {
            rs_meta->to_rowset_pb(tablet_meta_pb->add_row_binlog_rs_metas());
        }
    }

    _schema->to_schema_pb(tablet_meta_pb->mutable_schema());

    if (_row_binlog_schema != nullptr) {
        _row_binlog_schema->to_schema_pb(tablet_meta_pb->mutable_row_binlog_schema());
        tablet_meta_pb->set_row_binlog_schema_hash(_row_binlog_schema_hash);
    }

    tablet_meta_pb->set_in_restore_mode(in_restore_mode());

    // Only write optional fields when they carry a meaningful value, to
    // change the persisted tablet meta as little as possible.
    if (_preferred_rowset_type == BETA_ROWSET) {
        tablet_meta_pb->set_preferred_rowset_type(_preferred_rowset_type);
    }
    if (_storage_policy_id > 0) {
        tablet_meta_pb->set_storage_policy_id(_storage_policy_id);
    }
    if (_cooldown_meta_id.initialized()) {
        tablet_meta_pb->mutable_cooldown_meta_id()->CopyFrom(_cooldown_meta_id.to_proto());
    }

    tablet_meta_pb->set_enable_unique_key_merge_on_write(_enable_unique_key_merge_on_write);

    if (_enable_unique_key_merge_on_write) {
        // Rowset ids of stale rowsets; their delete bitmaps are not exported.
        std::set<RowsetId> stale_rs_ids;
        for (const auto& [_, stale_meta] : _stale_rs_metas) {
            stale_rs_ids.insert(stale_meta->rowset_id());
        }

        DeleteBitmapPB* delete_bitmap_pb = tablet_meta_pb->mutable_delete_bitmap();

        // Appends one entry to the parallel arrays of the delete-bitmap message.
        auto append_entry = [delete_bitmap_pb](const auto& rowset_id, const auto& segment_id,
                                               const auto& ver, auto& bitmap,
                                               bool is_binlog_delvec) {
            delete_bitmap_pb->add_rowset_ids(rowset_id.to_string());
            delete_bitmap_pb->add_segment_ids(segment_id);
            delete_bitmap_pb->add_versions(ver);
            delete_bitmap_pb->add_is_binlog_delvec(is_binlog_delvec);
            std::string bitmap_data(bitmap.getSizeInBytes(), '\0');
            bitmap.write(bitmap_data.data());
            *(delete_bitmap_pb->add_segment_delete_bitmaps()) = std::move(bitmap_data);
        };

        for (auto& [id, bitmap] : delete_bitmap().snapshot().delete_bitmap) {
            auto& [rowset_id, segment_id, ver] = id;
            if (stale_rs_ids.find(rowset_id) != stale_rs_ids.end()) {
                continue;
            }
            append_entry(rowset_id, segment_id, ver, bitmap, false);
        }

        for (auto& [id, bitmap] : binlog_delvec().snapshot().delete_bitmap) {
            auto& [rowset_id, segment_id, ver] = id;
            append_entry(rowset_id, segment_id, ver, bitmap, true);
        }
    }

    _binlog_config.to_pb(tablet_meta_pb->mutable_binlog_config());

    // Compaction settings.
    tablet_meta_pb->set_compaction_policy(compaction_policy());
    tablet_meta_pb->set_time_series_compaction_goal_size_mbytes(
            time_series_compaction_goal_size_mbytes());
    tablet_meta_pb->set_time_series_compaction_file_count_threshold(
            time_series_compaction_file_count_threshold());
    tablet_meta_pb->set_time_series_compaction_time_threshold_seconds(
            time_series_compaction_time_threshold_seconds());
    tablet_meta_pb->set_time_series_compaction_empty_rowsets_threshold(
            time_series_compaction_empty_rowsets_threshold());
    tablet_meta_pb->set_time_series_compaction_level_threshold(
            time_series_compaction_level_threshold());
    tablet_meta_pb->set_vertical_compaction_num_columns_per_group(
            vertical_compaction_num_columns_per_group());

    tablet_meta_pb->set_encryption_algorithm(_encryption_algorithm);
}
1088
1089
3
// Converts this tablet meta to its protobuf form and renders that message as
// JSON into `*json_string` using `options`.
void TabletMeta::to_json(string* json_string, json2pb::Pb2JsonOptions& options) {
    TabletMetaPB meta_snapshot;
    to_meta_pb(&meta_snapshot, false);
    json2pb::ProtoMessageToJson(meta_snapshot, json_string, options);
}
1094
1095
1.34M
// Returns the version of the rowset in _rs_metas with the greatest end
// version, or {-1, 0} when no rowset has an end version greater than 0.
Version TabletMeta::max_version() const {
    Version newest = {-1, 0};
    for (const auto& [_, rs_meta] : _rs_metas) {
        if (rs_meta->end_version() <= newest.second) {
            continue;
        }
        newest = rs_meta->version();
    }
    return newest;
}
1104
1105
932k
size_t TabletMeta::version_count_cross_with_range(const Version& range) const {
1106
932k
    size_t count = 0;
1107
1.63M
    for (const auto& [_, rs_meta] : _rs_metas) {
1108
1.63M
        if (!(range.first > rs_meta->version().second || range.second < rs_meta->version().first)) {
1109
1.63M
            count++;
1110
1.63M
        }
1111
1.63M
    }
1112
932k
    return count;
1113
932k
}
1114
1115
36.8k
// Adds `rs_meta` to _rs_metas.
//
// Returns:
//  - PUSH_VERSION_ALREADY_EXIST when a different rowset already occupies the
//    same version;
//  - OK without modifying anything when the same rowset (same id and version)
//    is already present, i.e. a duplicate request;
//  - OK after inserting otherwise.
//
// _rs_metas is keyed by version (every insertion uses the rowset's own
// version as the key), so a keyed lookup replaces the previous full scan of
// the map.
// NOTE(review): this relies on a rowset meta's version never changing after
// it has been inserted — confirm.
Status TabletMeta::add_rs_meta(const RowsetMetaSharedPtr& rs_meta) {
    // check RowsetMeta is valid
    if (auto it = _rs_metas.find(rs_meta->version()); it != _rs_metas.end()) {
        const auto& existing = it->second;
        if (existing->rowset_id() != rs_meta->rowset_id()) {
            return Status::Error<PUSH_VERSION_ALREADY_EXIST>(
                    "version already exist. rowset_id={}, version={}, tablet={}",
                    existing->rowset_id().to_string(), existing->version().to_string(),
                    tablet_id());
        }
        // rowsetid,version is equal, it is a duplicate req, skip it
        return Status::OK();
    }
    _rs_metas.emplace(rs_meta->version(), rs_meta);
    return Status::OK();
}
1132
1133
1
// Adds `row_binlog_meta` to _row_binlog_rs_metas.
//
// Returns:
//  - PUSH_VERSION_ALREADY_EXIST when a different rowset already occupies the
//    same version;
//  - OK without modifying anything when the same rowset (same id and version)
//    is already present, i.e. a duplicate request;
//  - OK after inserting otherwise.
//
// _row_binlog_rs_metas is keyed by version (every insertion uses the rowset's
// own version as the key), so a keyed lookup replaces the previous full scan.
// NOTE(review): this relies on a rowset meta's version never changing after
// it has been inserted — confirm.
Status TabletMeta::add_row_binlog_rs_meta(const RowsetMetaSharedPtr& row_binlog_meta) {
    // check RowsetMeta is valid
    if (auto it = _row_binlog_rs_metas.find(row_binlog_meta->version());
        it != _row_binlog_rs_metas.end()) {
        const auto& existing = it->second;
        if (existing->rowset_id() != row_binlog_meta->rowset_id()) {
            return Status::Error<PUSH_VERSION_ALREADY_EXIST>(
                    "binlog version already exist. binlog_rowset_id={}, version={}, tablet={}",
                    existing->rowset_id().to_string(), existing->version().to_string(),
                    tablet_id());
        }
        // rowsetid,version is equal, it is a duplicate req, skip it
        return Status::OK();
    }
    _row_binlog_rs_metas.emplace(row_binlog_meta->version(), row_binlog_meta);
    return Status::OK();
}
1150
1151
304k
void TabletMeta::add_rowsets_unchecked(const std::vector<RowsetSharedPtr>& to_add) {
1152
320k
    for (const auto& rs : to_add) {
1153
320k
        _rs_metas.emplace(rs->rowset_meta()->version(), rs->rowset_meta());
1154
320k
    }
1155
304k
}
1156
1157
void TabletMeta::delete_rs_meta_by_version(const Version& version,
1158
834
                                           std::vector<RowsetMetaSharedPtr>* deleted_rs_metas) {
1159
834
    size_t rowset_cache_version_size = 0;
1160
835
    if (auto it = _rs_metas.find(version); it != _rs_metas.end()) {
1161
835
        if (deleted_rs_metas != nullptr) {
1162
0
            deleted_rs_metas->push_back(it->second);
1163
0
        }
1164
835
        auto rowset_id = it->second->rowset_id();
1165
835
        _rs_metas.erase(it);
1166
835
        if (_enable_unique_key_merge_on_write) {
1167
33
            rowset_cache_version_size = _delete_bitmap->remove_rowset_cache_version(rowset_id);
1168
33
        }
1169
835
        return;
1170
835
    }
1171
18.4E
    _check_mow_rowset_cache_version_size(rowset_cache_version_size);
1172
18.4E
}
1173
1174
void TabletMeta::modify_rs_metas(const std::vector<RowsetMetaSharedPtr>& to_add,
1175
                                 const std::vector<RowsetMetaSharedPtr>& to_delete,
1176
7.78k
                                 bool same_version) {
1177
7.78k
    size_t rowset_cache_version_size = 0;
1178
    // Remove to_delete rowsets from _rs_metas
1179
64.4k
    for (auto rs_to_del : to_delete) {
1180
64.4k
        if (auto it = _rs_metas.find(rs_to_del->version()); it != _rs_metas.end()) {
1181
64.3k
            auto rowset_id = it->second->rowset_id();
1182
64.3k
            _rs_metas.erase(it);
1183
64.3k
            if (_enable_unique_key_merge_on_write) {
1184
34.9k
                rowset_cache_version_size = _delete_bitmap->remove_rowset_cache_version(rowset_id);
1185
34.9k
            }
1186
64.3k
        }
1187
64.4k
    }
1188
7.78k
    if (!same_version) {
1189
        // put to_delete rowsets in _stale_rs_metas.
1190
64.3k
        for (auto rs_to_del : to_delete) {
1191
64.3k
            _stale_rs_metas.emplace(rs_to_del->version(), rs_to_del);
1192
64.3k
        }
1193
7.75k
    }
1194
1195
    // put to_add rowsets in _rs_metas.
1196
7.78k
    for (auto rs_to_add : to_add) {
1197
1.49k
        _rs_metas.emplace(rs_to_add->version(), rs_to_add);
1198
1.49k
    }
1199
7.78k
    _check_mow_rowset_cache_version_size(rowset_cache_version_size);
1200
7.78k
}
1201
1202
// Use the passing "rs_metas" to replace the rs meta in this tablet meta
1203
// Also clear the _stale_rs_metas because this tablet meta may have been copied from
1204
// an existing tablet before. And after revise, only the passing "rs_metas"
1205
// is needed.
1206
323
void TabletMeta::revise_rs_metas(std::vector<RowsetMetaSharedPtr>&& rs_metas) {
1207
323
    {
1208
323
        std::lock_guard<std::shared_mutex> wrlock(_meta_lock);
1209
323
        _rs_metas.clear();
1210
478
        for (auto& rs_meta : rs_metas) {
1211
478
            _rs_metas.emplace(rs_meta->version(), rs_meta);
1212
478
        }
1213
323
        _stale_rs_metas.clear();
1214
323
    }
1215
323
    if (_enable_unique_key_merge_on_write) {
1216
40
        _delete_bitmap->clear_rowset_cache_version();
1217
40
    }
1218
323
}
1219
1220
// This method should call after revise_rs_metas, since new rs_metas might be a subset
1221
// of original tablet, we should revise the delete_bitmap according to current rowset.
1222
//
1223
// Delete bitmap is protected by Tablet::_meta_lock, we don't need to acquire the
1224
// TabletMeta's _meta_lock
1225
41
void TabletMeta::revise_delete_bitmap_unlocked(const DeleteBitmap& delete_bitmap) {
1226
41
    _delete_bitmap = std::make_unique<DeleteBitmap>(tablet_id());
1227
50
    for (const auto& [_, rs] : _rs_metas) {
1228
50
        DeleteBitmap rs_bm(tablet_id());
1229
50
        delete_bitmap.subset({rs->rowset_id(), 0, 0}, {rs->rowset_id(), UINT32_MAX, INT64_MAX},
1230
50
                             &rs_bm);
1231
50
        _delete_bitmap->merge(rs_bm);
1232
50
    }
1233
41
    for (const auto& [_, rs] : _stale_rs_metas) {
1234
0
        DeleteBitmap rs_bm(tablet_id());
1235
0
        delete_bitmap.subset({rs->rowset_id(), 0, 0}, {rs->rowset_id(), UINT32_MAX, INT64_MAX},
1236
0
                             &rs_bm);
1237
0
        _delete_bitmap->merge(rs_bm);
1238
0
    }
1239
41
}
1240
1241
69.0k
void TabletMeta::delete_stale_rs_meta_by_version(const Version& version) {
1242
69.0k
    _stale_rs_metas.erase(version);
1243
69.0k
}
1244
1245
0
// Returns the rowset meta stored under `version` in _rs_metas, or nullptr
// when there is none.
RowsetMetaSharedPtr TabletMeta::acquire_rs_meta_by_version(const Version& version) const {
    const auto found = _rs_metas.find(version);
    if (found == _rs_metas.end()) {
        return nullptr;
    }
    return found->second;
}
1251
1252
15.7k
// Returns the rowset meta stored under `version` in _stale_rs_metas, or
// nullptr when there is none.
RowsetMetaSharedPtr TabletMeta::acquire_stale_rs_meta_by_version(const Version& version) const {
    const auto found = _stale_rs_metas.find(version);
    if (found == _stale_rs_metas.end()) {
        return nullptr;
    }
    return found->second;
}
1258
1259
23
// Sets the partition id and always returns OK.
// A warning is logged when the new id is smaller than 1, or when a positive
// id is already set and differs from the new one; the assignment still
// happens in both cases.
Status TabletMeta::set_partition_id(int64_t partition_id) {
    const bool conflicts_with_current = _partition_id > 0 && _partition_id != partition_id;
    const bool new_id_invalid = partition_id < 1;
    if (conflicts_with_current || new_id_invalid) {
        LOG(WARNING) << "cur partition id=" << _partition_id << " new partition id=" << partition_id
                     << " not equal";
    }

    _partition_id = partition_id;
    return Status::OK();
}
1267
1268
0
void TabletMeta::clear_stale_rowset() {
1269
0
    _stale_rs_metas.clear();
1270
0
    if (_enable_unique_key_merge_on_write) {
1271
0
        _delete_bitmap->clear_rowset_cache_version();
1272
0
    }
1273
0
}
1274
1275
0
void TabletMeta::clear_rowsets() {
1276
0
    _rs_metas.clear();
1277
0
    if (_enable_unique_key_merge_on_write) {
1278
0
        _delete_bitmap->clear_rowset_cache_version();
1279
0
    }
1280
0
}
1281
1282
41.6k
void TabletMeta::_check_mow_rowset_cache_version_size(size_t rowset_cache_version_size) {
1283
41.6k
    if (_enable_unique_key_merge_on_write && config::enable_mow_verbose_log &&
1284
41.6k
        rowset_cache_version_size > _rs_metas.size() + _stale_rs_metas.size()) {
1285
0
        std::stringstream ss;
1286
0
        auto rowset_ids = _delete_bitmap->get_rowset_cache_version();
1287
0
        std::set<std::string> tablet_rowset_ids;
1288
0
        {
1289
0
            std::shared_lock rlock(_meta_lock);
1290
0
            for (const auto& [_, rs_meta] : _rs_metas) {
1291
0
                tablet_rowset_ids.emplace(rs_meta->rowset_id().to_string());
1292
0
            }
1293
0
            for (const auto& [_, rs_meta] : _stale_rs_metas) {
1294
0
                tablet_rowset_ids.emplace(rs_meta->rowset_id().to_string());
1295
0
            }
1296
0
        }
1297
0
        for (const auto& rowset_id : rowset_ids) {
1298
0
            if (tablet_rowset_ids.find(rowset_id) == tablet_rowset_ids.end()) {
1299
0
                ss << rowset_id << ", ";
1300
0
            }
1301
0
        }
1302
        // size(rowset_cache_version) <= size(_rs_metas) + size(_stale_rs_metas) + size(_unused_rs)
1303
0
        std::string msg = fmt::format(
1304
0
                "tablet: {}, rowset_cache_version size: {}, "
1305
0
                "_rs_metas size: {}, _stale_rs_metas size: {}, delta: {}. rowset only in cache: {}",
1306
0
                _tablet_id, rowset_cache_version_size, _rs_metas.size(), _stale_rs_metas.size(),
1307
0
                rowset_cache_version_size - _rs_metas.size() - _stale_rs_metas.size(), ss.str());
1308
0
        LOG(INFO) << msg;
1309
0
    }
1310
41.6k
}
1311
1312
3
// Field-by-field equality of two TabletMeta objects. Only the members listed here
// take part in the comparison; the checks run in the order written and stop at the
// first mismatch.
bool operator==(const TabletMeta& a, const TabletMeta& b) {
    // Identifiers.
    if (a._table_id != b._table_id || a._index_id != b._index_id ||
        a._partition_id != b._partition_id || a._tablet_id != b._tablet_id ||
        a._replica_id != b._replica_id || a._schema_hash != b._schema_hash ||
        a._shard_id != b._shard_id) {
        return false;
    }
    // Creation time, cumulative point, uid, type and state.
    if (a._creation_time != b._creation_time ||
        a._cumulative_layer_point != b._cumulative_layer_point ||
        a._tablet_uid != b._tablet_uid || a._tablet_type != b._tablet_type ||
        a._tablet_state != b._tablet_state) {
        return false;
    }
    // Schema (compared by value through the pointer) and rowset metas.
    if (*a._schema != *b._schema || a._rs_metas != b._rs_metas) {
        return false;
    }
    // Restore flag, rowset type, storage policy and compaction policy.
    if (a._in_restore_mode != b._in_restore_mode ||
        a._preferred_rowset_type != b._preferred_rowset_type ||
        a._storage_policy_id != b._storage_policy_id ||
        a._compaction_policy != b._compaction_policy) {
        return false;
    }
    // Time-series compaction settings.
    const bool time_series_settings_differ =
            a._time_series_compaction_goal_size_mbytes !=
                    b._time_series_compaction_goal_size_mbytes ||
            a._time_series_compaction_file_count_threshold !=
                    b._time_series_compaction_file_count_threshold ||
            a._time_series_compaction_time_threshold_seconds !=
                    b._time_series_compaction_time_threshold_seconds ||
            a._time_series_compaction_empty_rowsets_threshold !=
                    b._time_series_compaction_empty_rowsets_threshold ||
            a._time_series_compaction_level_threshold !=
                    b._time_series_compaction_level_threshold;
    return !time_series_settings_differ;
}
1346
1347
0
// Negation of operator==(const TabletMeta&, const TabletMeta&).
bool operator!=(const TabletMeta& a, const TabletMeta& b) {
    const bool metas_equal = (a == b);
    return !metas_equal;
}
1350
1351
// We cannot just copy the underlying memory to construct a string
1352
// due to equivalent objects may have different padding bytes.
1353
// Reading padding bytes is undefined behavior, neither copy nor
1354
// placement new will help simplify the code.
1355
// Refer to C11 standards §6.2.6.1/6 and §6.7.9/21 for more info.
1356
5.42M
static std::string agg_cache_key(int64_t tablet_id, const DeleteBitmap::BitmapKey& bmk) {
1357
5.42M
    std::string ret(sizeof(tablet_id) + sizeof(bmk), '\0');
1358
5.42M
    *reinterpret_cast<int64_t*>(ret.data()) = tablet_id;
1359
5.42M
    auto t = reinterpret_cast<DeleteBitmap::BitmapKey*>(ret.data() + sizeof(tablet_id));
1360
5.42M
    std::get<RowsetId>(*t).version = std::get<RowsetId>(bmk).version;
1361
5.42M
    std::get<RowsetId>(*t).hi = std::get<RowsetId>(bmk).hi;
1362
5.42M
    std::get<RowsetId>(*t).mi = std::get<RowsetId>(bmk).mi;
1363
5.42M
    std::get<RowsetId>(*t).lo = std::get<RowsetId>(bmk).lo;
1364
5.42M
    std::get<1>(*t) = std::get<1>(bmk);
1365
5.42M
    std::get<2>(*t) = std::get<2>(bmk);
1366
5.42M
    return ret;
1367
5.42M
}
1368
1369
// decode cache key info from a agg_cache_key
1370
static void decode_agg_cache_key(const std::string& key_str, int64_t& tablet_id,
1371
43.4k
                                 DeleteBitmap::BitmapKey& bmk) {
1372
43.4k
    const char* ptr = key_str.data();
1373
43.4k
    tablet_id = *reinterpret_cast<const int64_t*>(ptr);
1374
43.4k
    ptr += sizeof(tablet_id);
1375
43.4k
    const auto* t = reinterpret_cast<const DeleteBitmap::BitmapKey*>(ptr);
1376
43.4k
    std::get<RowsetId>(bmk).version = std::get<RowsetId>(*t).version;
1377
43.4k
    std::get<RowsetId>(bmk).hi = std::get<RowsetId>(*t).hi;
1378
43.4k
    std::get<RowsetId>(bmk).mi = std::get<RowsetId>(*t).mi;
1379
43.4k
    std::get<RowsetId>(bmk).lo = std::get<RowsetId>(*t).lo;
1380
43.4k
    std::get<1>(bmk) = std::get<1>(*t);
1381
43.4k
    std::get<2>(bmk) = std::get<2>(*t);
1382
43.4k
}
1383
1384
// Constructs the aggregate delete-bitmap cache as a size-based LRU cache of the
// given `capacity`, with the stale sweep time taken from
// config::delete_bitmap_agg_cache_stale_sweep_time_sec.
DeleteBitmapAggCache::DeleteBitmapAggCache(size_t capacity)
        : LRUCachePolicy(CachePolicy::CacheType::DELETE_BITMAP_AGG_CACHE, capacity,
                         LRUCacheType::SIZE,
                         config::delete_bitmap_agg_cache_stale_sweep_time_sec,
                         /*num_shards=*/256,
                         /*element_count_capacity=*/0,
                         /*enable_prune=*/true,
                         /*is_lru_k=*/false) {}
1390
1391
11.6M
// Returns the cache instance held by the process-wide ExecEnv.
DeleteBitmapAggCache* DeleteBitmapAggCache::instance() {
    auto* exec_env = ExecEnv::GetInstance();
    return exec_env->delete_bitmap_agg_cache();
}
1394
1395
8
// Heap-allocates a new cache with the given `capacity` and returns the raw pointer.
// Nothing here releases it, so ownership passes to the caller.
DeleteBitmapAggCache* DeleteBitmapAggCache::create_instance(size_t capacity) {
    auto* cache = new DeleteBitmapAggCache(capacity);
    return cache;
}
1398
1399
2
// Walks every entry of the cache instance and copies those whose key decodes to
// `tablet_id` into a fresh DeleteBitmap, which is returned.
DeleteBitmap DeleteBitmapAggCache::snapshot(int64_t tablet_id) {
    DeleteBitmap result(tablet_id);
    auto collect_matching = [&](const LRUHandle* handle) {
        const auto raw_key = handle->key().to_string();
        int64_t owner_tablet_id;
        DeleteBitmap::BitmapKey bitmap_key;
        decode_agg_cache_key(raw_key, owner_tablet_id, bitmap_key);
        if (owner_tablet_id != tablet_id) {
            return; // entry belongs to another tablet
        }
        const auto* cached_value = reinterpret_cast<DeleteBitmapAggCache::Value*>(handle->value);
        result.set(bitmap_key, cached_value->bitmap);
    };
    DeleteBitmapAggCache::instance()->for_each_entry(collect_matching);
    return result;
}
1414
1415
1.40M
// Creates an empty delete bitmap bound to `tablet_id`.
DeleteBitmap::DeleteBitmap(int64_t tablet_id)
        : _tablet_id(tablet_id) {
}
1416
1417
11.9k
// Copy constructor: copies the bitmap map and tablet id from `o` while holding a
// shared lock on `o.lock`. `_rowset_cache_version` is not copied.
DeleteBitmap::DeleteBitmap(const DeleteBitmap& o) {
    std::shared_lock source_guard(o.lock);
    _tablet_id = o._tablet_id;
    delete_bitmap = o.delete_bitmap;
}
1422
1423
26.7k
// Copy assignment: replaces this object's bitmap map and tablet id with those of `o`.
// Takes an exclusive lock on this object's `lock` and a shared lock on `o.lock`;
// the two are always acquired lower address first, so that two threads assigning
// in opposite directions take them in the same order.
DeleteBitmap& DeleteBitmap::operator=(const DeleteBitmap& o) {
    if (this == &o) return *this;

    std::unique_lock self_guard(lock, std::defer_lock);
    std::shared_lock source_guard(o.lock, std::defer_lock);
    if (this < &o) {
        self_guard.lock();
        source_guard.lock();
    } else {
        source_guard.lock();
        self_guard.lock();
    }

    delete_bitmap = o.delete_bitmap;
    _tablet_id = o._tablet_id;
    return *this;
}
1438
1439
0
// Move constructor: takes over the bitmap map and tablet id of `o` while holding
// both of `o`'s locks, then clears `o`'s rowset cache versions.
DeleteBitmap::DeleteBitmap(DeleteBitmap&& o) noexcept {
    std::scoped_lock source_guard(o.lock, o._rowset_cache_version_lock);
    _tablet_id = std::move(o._tablet_id);
    delete_bitmap = std::move(o.delete_bitmap);
    o._rowset_cache_version.clear();
}
1445
1446
40
// Move assignment: takes over the bitmap map and tablet id of `o`, then clears
// `o`'s rowset cache versions. This object's `lock` and both of `o`'s locks are
// held for the duration; self-assignment is a no-op.
DeleteBitmap& DeleteBitmap::operator=(DeleteBitmap&& o) noexcept {
    if (this != &o) {
        std::scoped_lock guard(lock, o.lock, o._rowset_cache_version_lock);
        _tablet_id = std::move(o._tablet_id);
        delete_bitmap = std::move(o.delete_bitmap);
        o._rowset_cache_version.clear();
    }
    return *this;
}
1454
1455
0
// Builds a DeleteBitmap for `tablet_id` from its protobuf form. `pb` holds four
// parallel arrays (rowset ids, segment ids, versions, serialized bitmaps); entry i
// of each together describes one (key, bitmap) pair. Matching array lengths are
// only checked with DCHECK.
DeleteBitmap DeleteBitmap::from_pb(const DeleteBitmapPB& pb, int64_t tablet_id) {
    const size_t entry_count = pb.rowset_ids().size();
    DCHECK_EQ(entry_count, pb.segment_ids().size());
    DCHECK_EQ(entry_count, pb.versions().size());

    DeleteBitmap result(tablet_id);
    for (int32_t idx = 0; idx < entry_count; ++idx) {
        RowsetId rowset_id;
        rowset_id.init(pb.rowset_ids(idx));
        BitmapKey key = {rowset_id, pb.segment_ids(idx), pb.versions(idx)};
        const auto& serialized_bitmap = pb.segment_delete_bitmaps(idx);
        result.delete_bitmap[key] = roaring::Roaring::read(serialized_bitmap.data());
    }
    return result;
}
1469
1470
0
// Serializes every (key, bitmap) entry into a DeleteBitmapPB under a shared lock.
// Entry i of the four parallel arrays holds the rowset id string, segment id,
// version and serialized bitmap of one entry.
DeleteBitmapPB DeleteBitmap::to_pb() {
    std::shared_lock l(lock);
    DeleteBitmapPB pb;
    for (const auto& [key, bitmap] : delete_bitmap) {
        const auto& [rowset_id, segment_id, version] = key;
        pb.mutable_rowset_ids()->Add(rowset_id.to_string());
        pb.mutable_segment_ids()->Add(segment_id);
        pb.mutable_versions()->Add(version);

        // Serialize into a buffer sized exactly for this bitmap.
        std::string serialized(bitmap.getSizeInBytes(), '\0');
        bitmap.write(serialized.data());
        pb.mutable_segment_delete_bitmaps()->Add(std::move(serialized));
    }
    return pb;
}
1483
1484
8.31k
// Returns a copy of this delete bitmap (bitmap map and tablet id).
//
// No lock is taken here on purpose: the copy constructor already holds a shared
// lock on the source's `lock` while it copies. Acquiring the same lock in this
// function as well would lock it twice from one thread; for std::shared_mutex that
// is undefined behavior and can deadlock if a writer queues between the two
// acquisitions. NOTE(review): `lock` is declared in the header, not in this file;
// this assumes it is a non-recursive shared mutex such as std::shared_mutex.
DeleteBitmap DeleteBitmap::snapshot() const {
    return DeleteBitmap(*this);
}
1488
1489
43
// Returns a copy of this delete bitmap restricted to entries whose version is not
// greater than `version`. The full snapshot is taken first and the newer entries
// are then removed from the copy.
DeleteBitmap DeleteBitmap::snapshot(Version version) const {
    DeleteBitmap result = this->snapshot();
    auto& entries = result.delete_bitmap;
    for (auto it = entries.begin(); it != entries.end();) {
        const bool newer_than_requested = std::get<2>(it->first) > version;
        if (newer_than_requested) {
            it = entries.erase(it);
        } else {
            ++it;
        }
    }
    return result;
}
1502
1503
4.73M
void DeleteBitmap::add(const BitmapKey& bmk, uint32_t row_id) {
1504
4.73M
    std::lock_guard l(lock);
1505
4.73M
    delete_bitmap[bmk].add(row_id);
1506
4.73M
}
1507
1508
0
int DeleteBitmap::remove(const BitmapKey& bmk, uint32_t row_id) {
1509
0
    std::lock_guard l(lock);
1510
0
    auto it = delete_bitmap.find(bmk);
1511
0
    if (it == delete_bitmap.end()) return -1;
1512
0
    it->second.remove(row_id);
1513
0
    return 0;
1514
0
}
1515
1516
67.8k
void DeleteBitmap::remove(const BitmapKey& start, const BitmapKey& end) {
1517
67.8k
    std::lock_guard l(lock);
1518
83.1k
    for (auto it = delete_bitmap.lower_bound(start); it != delete_bitmap.end();) {
1519
54.4k
        auto& [k, _] = *it;
1520
54.4k
        if (k >= end) {
1521
39.1k
            break;
1522
39.1k
        }
1523
15.3k
        it = delete_bitmap.erase(it);
1524
15.3k
    }
1525
67.8k
}
1526
1527
914
void DeleteBitmap::remove(const std::vector<std::tuple<BitmapKey, BitmapKey>>& key_ranges) {
1528
914
    std::lock_guard l(lock);
1529
914
    for (auto& [start, end] : key_ranges) {
1530
1.84k
        for (auto it = delete_bitmap.lower_bound(start); it != delete_bitmap.end();) {
1531
1.84k
            auto& [k, _] = *it;
1532
1.84k
            if (k >= end) {
1533
914
                break;
1534
914
            }
1535
935
            it = delete_bitmap.erase(it);
1536
935
        }
1537
914
    }
1538
914
}
1539
1540
3.67M
bool DeleteBitmap::contains(const BitmapKey& bmk, uint32_t row_id) const {
1541
3.67M
    std::shared_lock l(lock);
1542
3.67M
    auto it = delete_bitmap.find(bmk);
1543
3.67M
    return it != delete_bitmap.end() && it->second.contains(row_id);
1544
3.67M
}
1545
1546
2
bool DeleteBitmap::contains_agg(const BitmapKey& bmk, uint32_t row_id) const {
1547
2
    return get_agg(bmk)->contains(row_id);
1548
2
}
1549
1550
0
bool DeleteBitmap::empty() const {
1551
0
    std::shared_lock l(lock);
1552
0
    return delete_bitmap.empty();
1553
0
}
1554
1555
277k
uint64_t DeleteBitmap::cardinality() const {
1556
277k
    std::shared_lock l(lock);
1557
277k
    uint64_t res = 0;
1558
1.24M
    for (auto entry : delete_bitmap) {
1559
1.24M
        if (std::get<1>(entry.first) != DeleteBitmap::INVALID_SEGMENT_ID) {
1560
50.6k
            res += entry.second.cardinality();
1561
50.6k
        }
1562
1.24M
    }
1563
277k
    return res;
1564
277k
}
1565
1566
6
uint64_t DeleteBitmap::get_size() const {
1567
6
    std::shared_lock l(lock);
1568
6
    uint64_t charge = 0;
1569
44
    for (auto& [k, v] : delete_bitmap) {
1570
44
        if (std::get<1>(k) != DeleteBitmap::INVALID_SEGMENT_ID) {
1571
44
            charge += v.getSizeInBytes();
1572
44
        }
1573
44
    }
1574
6
    return charge;
1575
6
}
1576
1577
bool DeleteBitmap::contains_agg_with_cache_if_eligible(const BitmapKey& bmk,
1578
3.67M
                                                       uint32_t row_id) const {
1579
3.67M
    g_contains_agg_with_cache_if_eligible_total << 1;
1580
3.67M
    int64_t start_version {0};
1581
3.69M
    if (config::enable_mow_get_agg_by_cache) {
1582
3.69M
        auto deleter = [&](Cache::Handle* handle) {
1583
1.40M
            DeleteBitmapAggCache::instance()->release(handle);
1584
1.40M
        };
1585
3.69M
        std::unique_ptr<Cache::Handle, decltype(deleter)> dbm_handle(nullptr, deleter);
1586
3.69M
        int64_t cached_version = 0;
1587
        // 1. try to lookup the desired key directly
1588
3.69M
        dbm_handle.reset(DeleteBitmapAggCache::instance()->lookup(agg_cache_key(_tablet_id, bmk)));
1589
3.69M
        if (dbm_handle != nullptr) {
1590
1.40M
            cached_version = std::get<2>(bmk);
1591
2.29M
        } else {
1592
            // 2. if not found, try to lookup with cached version
1593
2.29M
            cached_version = _get_rowset_cache_version(bmk);
1594
2.29M
            if (cached_version > 0) {
1595
280
                if (cached_version > std::get<2>(bmk)) {
1596
44
                    cached_version = 0;
1597
236
                } else {
1598
236
                    dbm_handle.reset(DeleteBitmapAggCache::instance()->lookup(agg_cache_key(
1599
236
                            _tablet_id, {std::get<0>(bmk), std::get<1>(bmk), cached_version})));
1600
236
                }
1601
280
            }
1602
2.29M
        }
1603
3.69M
        if (dbm_handle != nullptr) {
1604
1.40M
            const auto& cached_dbm =
1605
1.40M
                    reinterpret_cast<DeleteBitmapAggCache::Value*>(
1606
1.40M
                            DeleteBitmapAggCache::instance()->value(dbm_handle.get()))
1607
1.40M
                            ->bitmap;
1608
1.40M
            if (cached_version == std::get<2>(bmk)) {
1609
1.40M
                g_contains_agg_with_cache_if_eligible_full_hit << 1;
1610
1.40M
            } else {
1611
2.18k
                g_contains_agg_with_cache_if_eligible_partial_hit << 1;
1612
2.18k
            }
1613
1.40M
            if (cached_dbm.contains(row_id)) {
1614
345
                return true;
1615
345
            }
1616
1.40M
            if (cached_version == std::get<2>(bmk)) {
1617
1.40M
                return false;
1618
1.40M
            }
1619
1.23k
            start_version = cached_version + 1;
1620
1.23k
        }
1621
3.69M
    }
1622
2.27M
    DeleteBitmap::BitmapKey start {std::get<0>(bmk), std::get<1>(bmk), start_version};
1623
2.27M
    std::shared_lock l(lock);
1624
2.27M
    for (auto it = delete_bitmap.lower_bound(start); it != delete_bitmap.end(); ++it) {
1625
1.46M
        auto& [k, bm] = *it;
1626
1.46M
        if (std::get<0>(k) != std::get<0>(bmk) || std::get<1>(k) != std::get<1>(bmk) ||
1627
1.46M
            std::get<2>(k) > std::get<2>(bmk)) {
1628
1.46M
            break;
1629
1.46M
        }
1630
336
        if (bm.contains(row_id)) {
1631
20
            return true;
1632
20
        }
1633
336
    }
1634
2.27M
    return false;
1635
2.27M
}
1636
1637
8
void DeleteBitmap::remove_sentinel_marks() {
1638
8
    std::lock_guard l(lock);
1639
64
    for (auto it = delete_bitmap.begin(), end = delete_bitmap.end(); it != end;) {
1640
56
        if (std::get<1>(it->first) == DeleteBitmap::INVALID_SEGMENT_ID) {
1641
55
            it = delete_bitmap.erase(it);
1642
55
        } else {
1643
1
            ++it;
1644
1
        }
1645
56
    }
1646
8
}
1647
1648
8.77k
// Stores a copy of `segment_delete_bitmap` under `bmk`, replacing any bitmap
// already stored there. Returns 1 when a new entry was inserted and 0 when an
// existing one was overwritten.
int DeleteBitmap::set(const BitmapKey& bmk, const roaring::Roaring& segment_delete_bitmap) {
    std::lock_guard l(lock);
    const bool inserted = delete_bitmap.insert_or_assign(bmk, segment_delete_bitmap).second;
    return inserted;
}
1653
1654
7
int DeleteBitmap::get(const BitmapKey& bmk, roaring::Roaring* segment_delete_bitmap) const {
1655
7
    std::shared_lock l(lock);
1656
7
    auto it = delete_bitmap.find(bmk);
1657
7
    if (it == delete_bitmap.end()) return -1;
1658
7
    *segment_delete_bitmap = it->second; // copy
1659
7
    return 0;
1660
7
}
1661
1662
54
// Returns the address of the bitmap stored under `bmk`, or nullptr when the key is
// absent. The shared lock is released on return, so the pointer is not protected
// by `lock` afterwards.
const roaring::Roaring* DeleteBitmap::get(const BitmapKey& bmk) const {
    std::shared_lock l(lock);
    const auto found = delete_bitmap.find(bmk);
    const bool missing = (found == delete_bitmap.end());
    if (missing) {
        return nullptr;
    }
    return &(found->second); // get address
}
1668
1669
void DeleteBitmap::subset(const BitmapKey& start, const BitmapKey& end,
1670
31.5k
                          DeleteBitmap* subset_rowset_map) const {
1671
31.5k
    DCHECK(start < end);
1672
31.5k
    std::shared_lock l(lock);
1673
38.3k
    for (auto it = delete_bitmap.lower_bound(start); it != delete_bitmap.end(); ++it) {
1674
16.5k
        auto& [k, bm] = *it;
1675
16.5k
        if (k >= end) {
1676
9.70k
            break;
1677
9.70k
        }
1678
6.80k
        subset_rowset_map->set(k, bm);
1679
6.80k
    }
1680
31.5k
}
1681
1682
void DeleteBitmap::subset(std::vector<std::pair<RowsetId, int64_t>>& rowset_ids,
1683
                          int64_t start_version, int64_t end_version,
1684
0
                          DeleteBitmap* subset_delete_map) const {
1685
0
    DCHECK(start_version <= end_version);
1686
0
    for (auto& [rowset_id, _] : rowset_ids) {
1687
0
        BitmapKey start {rowset_id, 0, 0};
1688
0
        BitmapKey end {rowset_id, UINT32_MAX, end_version + 1};
1689
0
        std::shared_lock l(lock);
1690
0
        for (auto it = delete_bitmap.lower_bound(start); it != delete_bitmap.end(); ++it) {
1691
0
            auto& [k, bm] = *it;
1692
0
            if (k >= end) {
1693
0
                break;
1694
0
            }
1695
0
            auto version = std::get<2>(k);
1696
0
            if (version >= start_version && version <= end_version) {
1697
0
                subset_delete_map->merge(k, bm);
1698
0
                VLOG_DEBUG << "subset delete bitmap, tablet=" << _tablet_id << ", version=["
1699
0
                           << start_version << ", " << end_version
1700
0
                           << "]. rowset=" << std::get<0>(k).to_string()
1701
0
                           << ", segment=" << std::get<1>(k) << ", version=" << version
1702
0
                           << ", cardinality=" << bm.cardinality();
1703
0
            }
1704
0
        }
1705
0
    }
1706
0
}
1707
1708
void DeleteBitmap::subset_and_agg(std::vector<std::pair<RowsetId, int64_t>>& rowset_ids,
1709
                                  int64_t start_version, int64_t end_version,
1710
1
                                  DeleteBitmap* subset_delete_map) const {
1711
1
    DCHECK(start_version <= end_version);
1712
2
    for (auto& [rowset_id, segment_num] : rowset_ids) {
1713
6
        for (int64_t seg_id = 0; seg_id < segment_num; ++seg_id) {
1714
4
            BitmapKey end {rowset_id, seg_id, end_version};
1715
4
            auto bm = get_agg_without_cache(end, start_version);
1716
4
            VLOG_DEBUG << "subset delete bitmap, tablet=" << _tablet_id << ", rowset=" << rowset_id
1717
0
                       << ", segment=" << seg_id << ", version=[" << start_version << "-"
1718
0
                       << end_version << "], cardinality=" << bm->cardinality();
1719
4
            if (bm->isEmpty()) {
1720
0
                continue;
1721
0
            }
1722
4
            subset_delete_map->merge(end, *bm);
1723
4
        }
1724
2
    }
1725
1
}
1726
1727
670
size_t DeleteBitmap::get_count_with_range(const BitmapKey& start, const BitmapKey& end) const {
1728
670
    DCHECK(start < end);
1729
670
    size_t count = 0;
1730
670
    std::shared_lock l(lock);
1731
884
    for (auto it = delete_bitmap.lower_bound(start); it != delete_bitmap.end(); ++it) {
1732
584
        auto& [k, bm] = *it;
1733
584
        if (k >= end) {
1734
370
            break;
1735
370
        }
1736
214
        count++;
1737
214
    }
1738
670
    return count;
1739
670
}
1740
1741
17.1k
// Unions `segment_delete_bitmap` into the bitmap stored under `bmk`; when the key
// is absent, a copy of `segment_delete_bitmap` is stored as the new entry.
void DeleteBitmap::merge(const BitmapKey& bmk, const roaring::Roaring& segment_delete_bitmap) {
    std::lock_guard l(lock);
    const auto emplaced = delete_bitmap.emplace(bmk, segment_delete_bitmap);
    if (emplaced.second) {
        return; // new entry already holds the bitmap
    }
    emplaced.first->second |= segment_delete_bitmap;
}
1748
1749
47.1k
void DeleteBitmap::merge(const DeleteBitmap& other) {
1750
47.1k
    std::lock_guard l(lock);
1751
47.1k
    for (auto& i : other.delete_bitmap) {
1752
2.28k
        auto [j, succ] = this->delete_bitmap.insert(i);
1753
2.28k
        if (!succ) j->second |= i.second;
1754
2.28k
    }
1755
47.1k
}
1756
1757
732k
uint64_t DeleteBitmap::get_delete_bitmap_count() {
1758
732k
    std::shared_lock l(lock);
1759
732k
    uint64_t count = 0;
1760
1.22M
    for (auto it = delete_bitmap.begin(); it != delete_bitmap.end(); it++) {
1761
494k
        if (std::get<1>(it->first) != DeleteBitmap::INVALID_SEGMENT_ID) {
1762
50.3k
            count++;
1763
50.3k
        }
1764
494k
    }
1765
732k
    return count;
1766
732k
}
1767
1768
void DeleteBitmap::traverse_rowset_and_version(
1769
0
        const std::function<int(const RowsetId& rowsetId, int64_t version)>& func) const {
1770
0
    std::shared_lock l(lock);
1771
0
    auto it = delete_bitmap.cbegin();
1772
0
    while (it != delete_bitmap.cend()) {
1773
0
        RowsetId rowset_id = std::get<0>(it->first);
1774
0
        int64_t version = std::get<2>(it->first);
1775
0
        int result = func(rowset_id, version);
1776
0
        if (result == -2) {
1777
            // find next <rowset, version>
1778
0
            it++;
1779
0
        } else {
1780
            // find next <rowset>
1781
0
            it = delete_bitmap.upper_bound({rowset_id, std::numeric_limits<SegmentId>::max(),
1782
0
                                            std::numeric_limits<Version>::max()});
1783
0
        }
1784
0
    }
1785
0
}
1786
1787
0
bool DeleteBitmap::has_calculated_for_multi_segments(const RowsetId& rowset_id) const {
1788
0
    return contains({rowset_id, INVALID_SEGMENT_ID, TEMP_VERSION_COMMON}, ROWSET_SENTINEL_MARK);
1789
0
}
1790
1791
68.9k
size_t DeleteBitmap::remove_rowset_cache_version(const RowsetId& rowset_id) {
1792
68.9k
    std::lock_guard l(_rowset_cache_version_lock);
1793
68.9k
    _rowset_cache_version.erase(rowset_id);
1794
68.9k
    VLOG_DEBUG << "remove agg cache version for tablet=" << _tablet_id
1795
0
               << ", rowset=" << rowset_id.to_string();
1796
68.9k
    return _rowset_cache_version.size();
1797
68.9k
}
1798
1799
40
void DeleteBitmap::clear_rowset_cache_version() {
1800
40
    std::lock_guard l(_rowset_cache_version_lock);
1801
40
    _rowset_cache_version.clear();
1802
40
    VLOG_DEBUG << "clear agg cache version for tablet=" << _tablet_id;
1803
40
}
1804
1805
0
std::set<std::string> DeleteBitmap::get_rowset_cache_version() {
1806
0
    std::set<std::string> set;
1807
0
    std::shared_lock l(_rowset_cache_version_lock);
1808
0
    for (auto& [k, _] : _rowset_cache_version) {
1809
0
        set.insert(k.to_string());
1810
0
    }
1811
0
    return set;
1812
0
}
1813
1814
2.35M
// Returns the cache version recorded for bmk's (rowset, segment) pair, or 0 when
// no record exists for the rowset or for the segment.
DeleteBitmap::Version DeleteBitmap::_get_rowset_cache_version(const BitmapKey& bmk) const {
    std::shared_lock l(_rowset_cache_version_lock);
    const auto rowset_it = _rowset_cache_version.find(std::get<0>(bmk));
    if (rowset_it == _rowset_cache_version.end()) {
        return 0;
    }
    auto& versions_by_segment = rowset_it->second;
    const auto segment_it = versions_by_segment.find(std::get<1>(bmk));
    if (segment_it == versions_by_segment.end()) {
        return 0;
    }
    return segment_it->second;
}
1825
1826
2
// Returns DeleteBitmapAggCache's snapshot of the entries belonging to this tablet.
DeleteBitmap DeleteBitmap::agg_cache_snapshot() {
    auto* agg_cache = DeleteBitmapAggCache::instance();
    return agg_cache->snapshot(_tablet_id);
}
1829
1830
1.19M
void DeleteBitmap::set_tablet_id(int64_t tablet_id) {
1831
1.19M
    _tablet_id = tablet_id;
1832
1.19M
}
1833
1834
1.72M
std::shared_ptr<roaring::Roaring> DeleteBitmap::get_agg(const BitmapKey& bmk) const {
1835
1.72M
    std::string key_str = agg_cache_key(_tablet_id, bmk); // Cache key container
1836
1.72M
    CacheKey key(key_str);
1837
1.72M
    Cache::Handle* handle = DeleteBitmapAggCache::instance()->lookup(key);
1838
1839
1.72M
    DeleteBitmapAggCache::Value* val =
1840
1.72M
            handle == nullptr ? nullptr
1841
1.72M
                              : reinterpret_cast<DeleteBitmapAggCache::Value*>(
1842
1.66M
                                        DeleteBitmapAggCache::instance()->value(handle));
1843
    // FIXME: do we need a mutex here to get rid of duplicated initializations
1844
    //        of cache entries in some cases?
1845
1.72M
    if (val == nullptr) { // Renew if needed, put a new Value to cache
1846
61.5k
        val = new DeleteBitmapAggCache::Value();
1847
61.5k
        Version start_version =
1848
61.5k
                config::enable_mow_get_agg_by_cache ? _get_rowset_cache_version(bmk) : 0;
1849
61.5k
        if (start_version > 0) {
1850
11.8k
            Cache::Handle* handle2 = DeleteBitmapAggCache::instance()->lookup(
1851
11.8k
                    agg_cache_key(_tablet_id, {std::get<0>(bmk), std::get<1>(bmk), start_version}));
1852
1853
11.8k
            DBUG_EXECUTE_IF("DeleteBitmap::get_agg.cache_miss", {
1854
11.8k
                if (handle2 != nullptr) {
1855
11.8k
                    auto p = dp->param("percent", 0.3);
1856
11.8k
                    std::mt19937 gen {std::random_device {}()};
1857
11.8k
                    std::bernoulli_distribution inject_fault {p};
1858
11.8k
                    if (inject_fault(gen)) {
1859
11.8k
                        LOG_INFO("injection DeleteBitmap::get_agg.cache_miss, tablet_id={}",
1860
11.8k
                                 _tablet_id);
1861
11.8k
                        handle2 = nullptr;
1862
11.8k
                    }
1863
11.8k
                }
1864
11.8k
            });
1865
11.8k
            if (handle2 == nullptr || start_version > std::get<2>(bmk)) {
1866
84
                start_version = 0;
1867
11.7k
            } else {
1868
11.7k
                val->bitmap |= reinterpret_cast<DeleteBitmapAggCache::Value*>(
1869
11.7k
                                       DeleteBitmapAggCache::instance()->value(handle2))
1870
11.7k
                                       ->bitmap;
1871
11.7k
                VLOG_DEBUG << "get agg cache version=" << start_version
1872
40
                           << " for tablet=" << _tablet_id
1873
40
                           << ", rowset=" << std::get<0>(bmk).to_string()
1874
40
                           << ", segment=" << std::get<1>(bmk);
1875
11.7k
                start_version += 1;
1876
11.7k
            }
1877
11.8k
            if (handle2 != nullptr) {
1878
11.8k
                DeleteBitmapAggCache::instance()->release(handle2);
1879
11.8k
            }
1880
11.8k
        }
1881
61.5k
        {
1882
61.5k
            std::shared_lock l(lock);
1883
61.5k
            DeleteBitmap::BitmapKey start {std::get<0>(bmk), std::get<1>(bmk), start_version};
1884
69.5k
            for (auto it = delete_bitmap.lower_bound(start); it != delete_bitmap.end(); ++it) {
1885
24.5k
                auto& [k, bm] = *it;
1886
24.5k
                if (std::get<0>(k) != std::get<0>(bmk) || std::get<1>(k) != std::get<1>(bmk) ||
1887
24.5k
                    std::get<2>(k) > std::get<2>(bmk)) {
1888
16.5k
                    break;
1889
16.5k
                }
1890
7.98k
                val->bitmap |= bm;
1891
7.98k
            }
1892
61.5k
        }
1893
61.5k
        size_t charge = val->bitmap.getSizeInBytes() + sizeof(DeleteBitmapAggCache::Value);
1894
61.5k
        handle = DeleteBitmapAggCache::instance()->insert(key, val, charge, charge,
1895
61.5k
                                                          CachePriority::NORMAL);
1896
61.5k
        if (config::enable_mow_get_agg_by_cache && !val->bitmap.isEmpty()) {
1897
17.9k
            std::lock_guard l(_rowset_cache_version_lock);
1898
            // this version is already agg
1899
17.9k
            _rowset_cache_version[std::get<0>(bmk)][std::get<1>(bmk)] = std::get<2>(bmk);
1900
17.9k
            VLOG_DEBUG << "set agg cache version=" << std::get<2>(bmk)
1901
24
                       << " for tablet=" << _tablet_id
1902
24
                       << ", rowset=" << std::get<0>(bmk).to_string()
1903
24
                       << ", segment=" << std::get<1>(bmk);
1904
17.9k
        }
1905
61.5k
        if (start_version > 0 && config::enable_mow_get_agg_correctness_check_core) {
1906
0
            std::shared_ptr<roaring::Roaring> bitmap = get_agg_without_cache(bmk);
1907
0
            if (val->bitmap != *bitmap) {
1908
0
                CHECK(false) << ". get agg correctness check failed for tablet=" << _tablet_id
1909
0
                             << ", rowset=" << std::get<0>(bmk).to_string()
1910
0
                             << ", segment=" << std::get<1>(bmk) << ", version=" << std::get<2>(bmk)
1911
0
                             << ". start_version from cache=" << start_version
1912
0
                             << ", delete_bitmap cardinality with cache="
1913
0
                             << val->bitmap.cardinality()
1914
0
                             << ", delete_bitmap cardinality without cache="
1915
0
                             << bitmap->cardinality();
1916
0
            }
1917
0
        }
1918
61.5k
    }
1919
1920
    // It is natural for the cache to reclaim the underlying memory
1921
1.72M
    return std::shared_ptr<roaring::Roaring>(
1922
1.72M
            &val->bitmap, [handle](...) { DeleteBitmapAggCache::instance()->release(handle); });
1923
1.72M
}
1924
1925
std::shared_ptr<roaring::Roaring> DeleteBitmap::get_agg_without_cache(
1926
6.35k
        const BitmapKey& bmk, const int64_t start_version) const {
1927
6.35k
    std::shared_ptr<roaring::Roaring> bitmap = std::make_shared<roaring::Roaring>();
1928
6.35k
    std::shared_lock l(lock);
1929
6.35k
    DeleteBitmap::BitmapKey start {std::get<0>(bmk), std::get<1>(bmk), start_version};
1930
26.8k
    for (auto it = delete_bitmap.lower_bound(start); it != delete_bitmap.end(); ++it) {
1931
25.1k
        auto& [k, bm] = *it;
1932
25.1k
        if (std::get<0>(k) != std::get<0>(bmk) || std::get<1>(k) != std::get<1>(bmk) ||
1933
25.1k
            std::get<2>(k) > std::get<2>(bmk)) {
1934
4.67k
            break;
1935
4.67k
        }
1936
20.5k
        *bitmap |= bm;
1937
20.5k
    }
1938
6.35k
    return bitmap;
1939
6.35k
}
1940
1941
0
// Returns a new DeleteBitmap for the same tablet that holds a copy of every
// entry of `delete_bitmap` whose key is NOT in `key_set`.
DeleteBitmap DeleteBitmap::diffset(const std::set<BitmapKey>& key_set) const {
    std::shared_lock l(lock);

    DeleteBitmap dbm(_tablet_id);
    // Copy straight from the map entries instead of looking each key up again
    // through get(). The value is already at hand, and this avoids calling
    // another member (which may itself acquire `lock`) while the shared lock
    // is held.
    for (const auto& [key, bitmap] : delete_bitmap) {
        if (key_set.contains(key)) {
            continue;
        }
        dbm.delete_bitmap.insert_or_assign(key, bitmap);
    }
    return dbm;
}
1956
1957
0
// Maps a TabletState to its enumerator name. A value that matches none of the
// handled enumerators is rendered as "TabletState(<numeric value>)".
std::string tablet_state_name(TabletState state) {
    const char* known_name = nullptr;
    switch (state) {
    case TABLET_NOTREADY:
        known_name = "TABLET_NOTREADY";
        break;
    case TABLET_RUNNING:
        known_name = "TABLET_RUNNING";
        break;
    case TABLET_TOMBSTONED:
        known_name = "TABLET_TOMBSTONED";
        break;
    case TABLET_STOPPED:
        known_name = "TABLET_STOPPED";
        break;
    case TABLET_SHUTDOWN:
        known_name = "TABLET_SHUTDOWN";
        break;
    default:
        break;
    }

    if (known_name != nullptr) {
        return known_name;
    }
    return "TabletState(" + std::to_string(state) + ")";
}
1978
1979
} // namespace doris