Coverage Report

Created: 2026-06-02 13:57

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
be/src/storage/tablet/tablet_meta.cpp
Line
Count
Source
1
// Licensed to the Apache Software Foundation (ASF) under one
2
// or more contributor license agreements.  See the NOTICE file
3
// distributed with this work for additional information
4
// regarding copyright ownership.  The ASF licenses this file
5
// to you under the Apache License, Version 2.0 (the
6
// "License"); you may not use this file except in compliance
7
// with the License.  You may obtain a copy of the License at
8
//
9
//   http://www.apache.org/licenses/LICENSE-2.0
10
//
11
// Unless required by applicable law or agreed to in writing,
12
// software distributed under the License is distributed on an
13
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14
// KIND, either express or implied.  See the License for the
15
// specific language governing permissions and limitations
16
// under the License.
17
18
#include "storage/tablet/tablet_meta.h"
19
20
#include <bvar/bvar.h>
21
#include <gen_cpp/Descriptors_types.h>
22
#include <gen_cpp/FrontendService_types.h>
23
#include <gen_cpp/Types_types.h>
24
#include <gen_cpp/olap_common.pb.h>
25
#include <gen_cpp/olap_file.pb.h>
26
#include <gen_cpp/segment_v2.pb.h>
27
#include <gen_cpp/types.pb.h>
28
#include <json2pb/pb_to_json.h>
29
#include <time.h>
30
31
#include <cstdint>
32
#include <memory>
33
#include <random>
34
#include <set>
35
#include <utility>
36
37
#include "cloud/cloud_meta_mgr.h"
38
#include "cloud/cloud_storage_engine.h"
39
#include "cloud/config.h"
40
#include "common/config.h"
41
#include "io/fs/file_writer.h"
42
#include "io/fs/local_file_system.h"
43
#include "storage/data_dir.h"
44
#include "storage/file_header.h"
45
#include "storage/olap_common.h"
46
#include "storage/olap_define.h"
47
#include "storage/rowset/rowset.h"
48
#include "storage/rowset/rowset_meta_manager.h"
49
#include "storage/tablet/tablet_fwd.h"
50
#include "storage/tablet/tablet_meta_manager.h"
51
#include "storage/tablet/tablet_schema_cache.h"
52
#include "storage/utils.h"
53
#include "util/debug_points.h"
54
#include "util/lru_cache.h"
55
#include "util/mem_info.h"
56
#include "util/parse_util.h"
57
#include "util/string_util.h"
58
#include "util/time.h"
59
#include "util/uid_util.h"
60
61
using std::string;
62
using std::unordered_map;
63
using std::vector;
64
65
namespace doris {
66
using namespace ErrorCode;
67
68
bvar::Adder<uint64_t> g_contains_agg_with_cache_if_eligible_total(
69
        "g_contains_agg_with_cache_if_eligible_total");
70
bvar::Adder<uint64_t> g_contains_agg_with_cache_if_eligible_partial_hit(
71
        "g_contains_agg_with_cache_if_eligible_partial_hit");
72
bvar::Adder<uint64_t> g_contains_agg_with_cache_if_eligible_full_hit(
73
        "g_contains_agg_with_cache_if_eligible_full_hit");
74
bvar::Window<bvar::Adder<uint64_t>> g_contains_agg_with_cache_if_eligible_total_minute(
75
        "g_contains_agg_with_cache_if_eligible_total_1m",
76
        &g_contains_agg_with_cache_if_eligible_total, 60);
77
bvar::Window<bvar::Adder<uint64_t>> g_contains_agg_with_cache_if_eligible_partial_hit_minute(
78
        "g_contains_agg_with_cache_if_eligible_partial_hit_1m",
79
        &g_contains_agg_with_cache_if_eligible_partial_hit, 60);
80
bvar::Window<bvar::Adder<uint64_t>> g_contains_agg_with_cache_if_eligible_full_hit_minute(
81
        "g_contains_agg_with_cache_if_eligible_full_hit_1m",
82
        &g_contains_agg_with_cache_if_eligible_full_hit, 60);
83
84
TabletMetaSharedPtr TabletMeta::create(
85
        const TCreateTabletReq& request, const TabletUid& tablet_uid, uint64_t shard_id,
86
        uint32_t next_unique_id,
87
311
        const unordered_map<uint32_t, uint32_t>& col_ordinal_to_unique_id) {
88
311
    std::optional<TBinlogConfig> binlog_config;
89
311
    if (request.__isset.binlog_config) {
90
6
        binlog_config = request.binlog_config;
91
6
    }
92
311
    TInvertedIndexFileStorageFormat::type inverted_index_file_storage_format =
93
311
            request.inverted_index_file_storage_format;
94
95
    // We will discard this format. Don't make any further changes here.
96
311
    if (request.__isset.inverted_index_storage_format) {
97
311
        switch (request.inverted_index_storage_format) {
98
0
        case TInvertedIndexStorageFormat::V1:
99
0
            inverted_index_file_storage_format = TInvertedIndexFileStorageFormat::V1;
100
0
            break;
101
0
        case TInvertedIndexStorageFormat::V2:
102
0
            inverted_index_file_storage_format = TInvertedIndexFileStorageFormat::V2;
103
0
            break;
104
311
        default:
105
311
            break;
106
311
        }
107
311
    }
108
    // Decide storage format for this tablet. DEFAULT / not-set fall back to V2 on BE side.
109
311
    TStorageFormat::type storage_format =
110
311
            request.__isset.storage_format ? request.storage_format : TStorageFormat::V2;
111
311
    return std::make_shared<TabletMeta>(
112
311
            request.table_id, request.partition_id, request.tablet_id, request.replica_id,
113
311
            request.tablet_schema.schema_hash, shard_id, request.tablet_schema, next_unique_id,
114
311
            col_ordinal_to_unique_id, tablet_uid,
115
311
            request.__isset.tablet_type ? request.tablet_type : TTabletType::TABLET_TYPE_DISK,
116
311
            request.__isset.compression_type ? request.compression_type : TCompressionType::LZ4F,
117
311
            request.__isset.storage_policy_id ? request.storage_policy_id : -1,
118
311
            request.__isset.enable_unique_key_merge_on_write
119
311
                    ? request.enable_unique_key_merge_on_write
120
311
                    : false,
121
311
            std::move(binlog_config), request.compaction_policy,
122
311
            request.time_series_compaction_goal_size_mbytes,
123
311
            request.time_series_compaction_file_count_threshold,
124
311
            request.time_series_compaction_time_threshold_seconds,
125
311
            request.time_series_compaction_empty_rowsets_threshold,
126
311
            request.time_series_compaction_level_threshold, inverted_index_file_storage_format,
127
311
            request.tde_algorithm, storage_format,
128
311
            request.__isset.vertical_compaction_num_columns_per_group
129
311
                    ? request.vertical_compaction_num_columns_per_group
130
311
                    : 5,
131
311
            request.__isset.row_binlog_schema ? &request.row_binlog_schema : nullptr);
132
311
}
133
134
150k
TabletMeta::~TabletMeta() {
135
150k
    if (_handle) {
136
150k
        TabletSchemaCache::instance()->release(_handle);
137
150k
    }
138
150k
}
139
140
TabletMeta::TabletMeta()
141
294k
        : _tablet_uid(0, 0),
142
294k
          _schema(new TabletSchema),
143
294k
          _delete_bitmap(new DeleteBitmap(_tablet_id)),
144
294k
          _binlog_delvec(new DeleteBitmap(_tablet_id)) {}
145
146
TabletMeta::TabletMeta(int64_t table_id, int64_t partition_id, int64_t tablet_id,
147
                       int64_t replica_id, int32_t schema_hash, int32_t shard_id,
148
                       const TTabletSchema& tablet_schema, uint32_t next_unique_id,
149
                       const std::unordered_map<uint32_t, uint32_t>& col_ordinal_to_unique_id,
150
                       TabletUid tablet_uid, TTabletType::type tabletType,
151
                       TCompressionType::type compression_type, int64_t storage_policy_id,
152
                       bool enable_unique_key_merge_on_write,
153
                       std::optional<TBinlogConfig> binlog_config, std::string compaction_policy,
154
                       int64_t time_series_compaction_goal_size_mbytes,
155
                       int64_t time_series_compaction_file_count_threshold,
156
                       int64_t time_series_compaction_time_threshold_seconds,
157
                       int64_t time_series_compaction_empty_rowsets_threshold,
158
                       int64_t time_series_compaction_level_threshold,
159
                       TInvertedIndexFileStorageFormat::type inverted_index_file_storage_format,
160
                       TEncryptionAlgorithm::type tde_algorithm,
161
                       TStorageFormat::type storage_format,
162
                       int32_t vertical_compaction_num_columns_per_group,
163
                       const TTabletSchema* row_binlog_schema)
164
690
        : _tablet_uid(0, 0),
165
690
          _schema(new TabletSchema),
166
690
          _delete_bitmap(new DeleteBitmap(tablet_id)),
167
690
          _binlog_delvec(new DeleteBitmap(tablet_id)),
168
690
          _storage_format(storage_format) {
169
690
    TabletMetaPB tablet_meta_pb;
170
690
    tablet_meta_pb.set_table_id(table_id);
171
690
    tablet_meta_pb.set_partition_id(partition_id);
172
690
    tablet_meta_pb.set_tablet_id(tablet_id);
173
690
    tablet_meta_pb.set_replica_id(replica_id);
174
690
    tablet_meta_pb.set_schema_hash(schema_hash);
175
690
    tablet_meta_pb.set_shard_id(shard_id);
176
    // Persist the creation time, but it is not used
177
690
    tablet_meta_pb.set_creation_time(time(nullptr));
178
690
    tablet_meta_pb.set_cumulative_layer_point(-1);
179
690
    tablet_meta_pb.set_tablet_state(PB_RUNNING);
180
690
    *(tablet_meta_pb.mutable_tablet_uid()) = tablet_uid.to_proto();
181
690
    tablet_meta_pb.set_tablet_type(tabletType == TTabletType::TABLET_TYPE_DISK
182
690
                                           ? TabletTypePB::TABLET_TYPE_DISK
183
690
                                           : TabletTypePB::TABLET_TYPE_MEMORY);
184
690
    tablet_meta_pb.set_enable_unique_key_merge_on_write(enable_unique_key_merge_on_write);
185
690
    tablet_meta_pb.set_storage_policy_id(storage_policy_id);
186
690
    tablet_meta_pb.set_compaction_policy(compaction_policy);
187
690
    tablet_meta_pb.set_time_series_compaction_goal_size_mbytes(
188
690
            time_series_compaction_goal_size_mbytes);
189
690
    tablet_meta_pb.set_time_series_compaction_file_count_threshold(
190
690
            time_series_compaction_file_count_threshold);
191
690
    tablet_meta_pb.set_time_series_compaction_time_threshold_seconds(
192
690
            time_series_compaction_time_threshold_seconds);
193
690
    tablet_meta_pb.set_time_series_compaction_empty_rowsets_threshold(
194
690
            time_series_compaction_empty_rowsets_threshold);
195
690
    tablet_meta_pb.set_time_series_compaction_level_threshold(
196
690
            time_series_compaction_level_threshold);
197
690
    tablet_meta_pb.set_vertical_compaction_num_columns_per_group(
198
690
            vertical_compaction_num_columns_per_group);
199
690
    SchemaCreateOptions schema_create_options_for_data = {
200
690
            .col_ordinal_to_unique_id = col_ordinal_to_unique_id,
201
690
            .compression_type = compression_type,
202
690
            .inverted_index_file_storage_format = inverted_index_file_storage_format,
203
690
            .next_unique_id = next_unique_id};
204
690
    TabletSchemaPB* schema_pb_for_data = tablet_meta_pb.mutable_schema();
205
690
    init_schema_from_thrift(tablet_schema, schema_create_options_for_data, schema_pb_for_data);
206
207
690
    tablet_meta_pb.set_in_restore_mode(false);
208
209
690
    TabletSchemaPB* schema_pb_for_row_binlog = nullptr;
210
690
    if (row_binlog_schema != nullptr) {
211
6
        tablet_meta_pb.set_row_binlog_schema_hash(row_binlog_schema->schema_hash);
212
6
        DCHECK(binlog_config.has_value());
213
6
        DCHECK(binlog_config->enable && binlog_config->binlog_format == TBinlogFormat::ROW);
214
215
6
        std::unordered_map<uint32_t, uint32_t> row_binlog_col_ordinal_to_unique_id;
216
6
        uint32_t row_binlog_next_unique_id = 0;
217
39
        for (uint32_t col_ordinal = 0; col_ordinal < row_binlog_schema->columns.size();
218
33
             ++col_ordinal) {
219
33
            const auto& tcolumn = row_binlog_schema->columns[col_ordinal];
220
33
            uint32_t unique_id = 0;
221
33
            if (tcolumn.col_unique_id >= 0) {
222
0
                unique_id = tcolumn.col_unique_id;
223
33
            } else {
224
33
                unique_id = col_ordinal;
225
33
            }
226
33
            row_binlog_col_ordinal_to_unique_id[col_ordinal] = unique_id;
227
33
            if (row_binlog_next_unique_id <= unique_id) {
228
33
                row_binlog_next_unique_id = unique_id + 1;
229
33
            }
230
33
        }
231
232
6
        SchemaCreateOptions schema_create_options_for_row_binlog = {
233
6
                .col_ordinal_to_unique_id = row_binlog_col_ordinal_to_unique_id,
234
6
                .compression_type = compression_type,
235
6
                .inverted_index_file_storage_format = inverted_index_file_storage_format,
236
6
                .next_unique_id = row_binlog_next_unique_id};
237
6
        schema_pb_for_row_binlog = tablet_meta_pb.mutable_row_binlog_schema();
238
6
        init_schema_from_thrift(*row_binlog_schema, schema_create_options_for_row_binlog,
239
6
                                schema_pb_for_row_binlog);
240
6
    }
241
690
    if (binlog_config.has_value()) {
242
6
        BinlogConfig tmp_binlog_config;
243
6
        tmp_binlog_config = binlog_config.value();
244
6
        tmp_binlog_config.to_pb(tablet_meta_pb.mutable_binlog_config());
245
6
    }
246
247
690
    switch (tde_algorithm) {
248
0
    case doris::TEncryptionAlgorithm::AES256:
249
0
        tablet_meta_pb.set_encryption_algorithm(EncryptionAlgorithmPB::AES_256_CTR);
250
0
        break;
251
0
    case doris::TEncryptionAlgorithm::SM4:
252
0
        tablet_meta_pb.set_encryption_algorithm(EncryptionAlgorithmPB::SM4_128_CTR);
253
0
        break;
254
690
    default:
255
690
        tablet_meta_pb.set_encryption_algorithm(EncryptionAlgorithmPB::PLAINTEXT);
256
690
    }
257
258
    // Initialize default external ColumnMeta usage according to storage format.
259
    // V2: legacy behavior, inline ColumnMetaPB only.
260
    // V3: V2 + external ColumnMetaPB (CMO) enabled by default.
261
690
    switch (_storage_format) {
262
690
    case TStorageFormat::V2:
263
690
    case TStorageFormat::DEFAULT:
264
690
    case TStorageFormat::V1:
265
690
        break;
266
0
    case TStorageFormat::V3:
267
0
        schema_pb_for_data->set_storage_format(TabletStorageFormatPB::TABLET_STORAGE_FORMAT_V3);
268
0
        _schema->set_storage_format(TabletStorageFormatPB::TABLET_STORAGE_FORMAT_V3);
269
0
        if (schema_pb_for_row_binlog != nullptr) {
270
0
            schema_pb_for_row_binlog->set_storage_format(
271
0
                    TabletStorageFormatPB::TABLET_STORAGE_FORMAT_V3);
272
0
        }
273
0
        break;
274
0
    default:
275
0
        break;
276
690
    }
277
278
690
    init_from_pb(tablet_meta_pb);
279
690
}
280
281
TabletMeta::TabletMeta(const TabletMeta& b)
282
288
        : MetadataAdder(b),
283
288
          _table_id(b._table_id),
284
288
          _index_id(b._index_id),
285
288
          _partition_id(b._partition_id),
286
288
          _tablet_id(b._tablet_id),
287
288
          _replica_id(b._replica_id),
288
288
          _schema_hash(b._schema_hash),
289
288
          _shard_id(b._shard_id),
290
288
          _creation_time(b._creation_time),
291
288
          _cumulative_layer_point(b._cumulative_layer_point),
292
288
          _tablet_uid(b._tablet_uid),
293
288
          _tablet_type(b._tablet_type),
294
288
          _tablet_state(b._tablet_state),
295
288
          _schema(b._schema),
296
288
          _rs_metas(b._rs_metas),
297
288
          _stale_rs_metas(b._stale_rs_metas),
298
288
          _in_restore_mode(b._in_restore_mode),
299
288
          _preferred_rowset_type(b._preferred_rowset_type),
300
288
          _storage_policy_id(b._storage_policy_id),
301
288
          _cooldown_meta_id(b._cooldown_meta_id),
302
288
          _enable_unique_key_merge_on_write(b._enable_unique_key_merge_on_write),
303
288
          _delete_bitmap(b._delete_bitmap),
304
288
          _binlog_delvec(b._binlog_delvec),
305
288
          _row_binlog_schema_hash(b._row_binlog_schema_hash),
306
288
          _row_binlog_schema(b._row_binlog_schema),
307
288
          _row_binlog_rs_metas(b._row_binlog_rs_metas),
308
288
          _binlog_config(b._binlog_config),
309
288
          _compaction_policy(b._compaction_policy),
310
288
          _time_series_compaction_goal_size_mbytes(b._time_series_compaction_goal_size_mbytes),
311
          _time_series_compaction_file_count_threshold(
312
288
                  b._time_series_compaction_file_count_threshold),
313
          _time_series_compaction_time_threshold_seconds(
314
288
                  b._time_series_compaction_time_threshold_seconds),
315
          _time_series_compaction_empty_rowsets_threshold(
316
288
                  b._time_series_compaction_empty_rowsets_threshold),
317
288
          _time_series_compaction_level_threshold(b._time_series_compaction_level_threshold),
318
          _vertical_compaction_num_columns_per_group(
319
288
                  b._vertical_compaction_num_columns_per_group) {};
320
321
void TabletMeta::init_column_from_tcolumn(uint32_t unique_id, const TColumn& tcolumn,
322
1.92M
                                          ColumnPB* column) {
323
1.92M
    column->set_unique_id(unique_id);
324
1.92M
    column->set_name(tcolumn.column_name);
325
1.92M
    column->set_is_auto_increment(tcolumn.is_auto_increment);
326
1.92M
    if (tcolumn.__isset.is_on_update_current_timestamp) {
327
1.92M
        column->set_is_on_update_current_timestamp(tcolumn.is_on_update_current_timestamp);
328
1.92M
    }
329
1.92M
    string data_type;
330
1.92M
    EnumToString(TPrimitiveType, tcolumn.column_type.type, data_type);
331
1.92M
    column->set_type(data_type);
332
333
1.92M
    uint32_t length = TabletColumn::get_field_length_by_type(tcolumn.column_type.type,
334
1.92M
                                                             tcolumn.column_type.len);
335
1.92M
    column->set_length(length);
336
1.92M
    column->set_index_length(length);
337
1.92M
    column->set_precision(tcolumn.column_type.precision);
338
1.92M
    column->set_frac(tcolumn.column_type.scale);
339
340
1.92M
    if (tcolumn.__isset.result_is_nullable) {
341
8
        column->set_result_is_nullable(tcolumn.result_is_nullable);
342
8
    }
343
344
1.92M
    if (tcolumn.__isset.be_exec_version) {
345
1.92M
        column->set_be_exec_version(tcolumn.be_exec_version);
346
1.92M
    }
347
348
1.92M
    if (tcolumn.column_type.type == TPrimitiveType::VARCHAR ||
349
1.92M
        tcolumn.column_type.type == TPrimitiveType::STRING) {
350
832k
        if (!tcolumn.column_type.__isset.index_len) {
351
109
            column->set_index_length(10);
352
832k
        } else {
353
832k
            column->set_index_length(tcolumn.column_type.index_len);
354
832k
        }
355
832k
    }
356
1.92M
    if (!tcolumn.is_key) {
357
1.34M
        column->set_is_key(false);
358
1.34M
        if (tcolumn.__isset.aggregation) {
359
8
            column->set_aggregation(tcolumn.aggregation);
360
1.34M
        } else {
361
1.34M
            string aggregation_type;
362
1.34M
            EnumToString(TAggregationType, tcolumn.aggregation_type, aggregation_type);
363
1.34M
            column->set_aggregation(aggregation_type);
364
1.34M
        }
365
1.34M
    } else {
366
579k
        column->set_is_key(true);
367
579k
        column->set_aggregation("NONE");
368
579k
    }
369
1.92M
    column->set_is_nullable(tcolumn.is_allow_null);
370
1.92M
    if (tcolumn.__isset.default_value) {
371
195k
        column->set_default_value(tcolumn.default_value);
372
195k
    }
373
1.92M
    if (tcolumn.__isset.is_bloom_filter_column) {
374
2.37k
        column->set_is_bf_column(tcolumn.is_bloom_filter_column);
375
2.37k
    }
376
1.92M
    if (tcolumn.__isset.visible) {
377
1.92M
        column->set_visible(tcolumn.visible);
378
1.92M
    }
379
2.06M
    for (size_t i = 0; i < tcolumn.children_column.size(); i++) {
380
137k
        ColumnPB* children_column = column->add_children_columns();
381
137k
        init_column_from_tcolumn(tcolumn.children_column[i].col_unique_id,
382
137k
                                 tcolumn.children_column[i], children_column);
383
137k
    }
384
1.93M
    if (tcolumn.column_type.__isset.variant_max_subcolumns_count) {
385
1.93M
        column->set_variant_max_subcolumns_count(tcolumn.column_type.variant_max_subcolumns_count);
386
1.93M
    }
387
1.92M
    if (tcolumn.__isset.pattern_type) {
388
2.18k
        switch (tcolumn.pattern_type) {
389
5
        case TPatternType::MATCH_NAME:
390
5
            column->set_pattern_type(PatternTypePB::MATCH_NAME);
391
5
            break;
392
2.17k
        case TPatternType::MATCH_NAME_GLOB:
393
2.17k
            column->set_pattern_type(PatternTypePB::MATCH_NAME_GLOB);
394
2.18k
        }
395
2.18k
    }
396
1.92M
    if (tcolumn.__isset.variant_enable_typed_paths_to_sparse) {
397
1.92M
        column->set_variant_enable_typed_paths_to_sparse(
398
1.92M
                tcolumn.variant_enable_typed_paths_to_sparse);
399
1.92M
    }
400
1.92M
    if (tcolumn.__isset.variant_max_sparse_column_statistics_size) {
401
1.92M
        column->set_variant_max_sparse_column_statistics_size(
402
1.92M
                tcolumn.variant_max_sparse_column_statistics_size);
403
1.92M
    }
404
1.92M
    if (tcolumn.__isset.variant_sparse_hash_shard_count) {
405
1.79M
        column->set_variant_sparse_hash_shard_count(tcolumn.variant_sparse_hash_shard_count);
406
1.79M
    }
407
1.93M
    if (tcolumn.column_type.__isset.variant_enable_doc_mode) {
408
1.93M
        column->set_variant_enable_doc_mode(tcolumn.column_type.variant_enable_doc_mode);
409
1.93M
    }
410
1.92M
    if (tcolumn.__isset.variant_doc_materialization_min_rows) {
411
1.79M
        column->set_variant_doc_materialization_min_rows(
412
1.79M
                tcolumn.variant_doc_materialization_min_rows);
413
1.79M
    }
414
1.92M
    if (tcolumn.__isset.variant_doc_hash_shard_count) {
415
1.79M
        column->set_variant_doc_hash_shard_count(tcolumn.variant_doc_hash_shard_count);
416
1.79M
    }
417
1.92M
    if (tcolumn.__isset.variant_enable_nested_group) {
418
1.79M
        column->set_variant_enable_nested_group(tcolumn.variant_enable_nested_group);
419
1.79M
    }
420
1.92M
}
421
422
void TabletMeta::init_schema_from_thrift(const TTabletSchema& tablet_schema,
423
                                         const SchemaCreateOptions& schema_create_options,
424
696
                                         TabletSchemaPB* tablet_schema_pb) {
425
696
    const std::unordered_map<uint32_t, uint32_t>& col_ordinal_to_unique_id =
426
696
            schema_create_options.col_ordinal_to_unique_id;
427
696
    TCompressionType::type compression_type = schema_create_options.compression_type;
428
696
    TInvertedIndexFileStorageFormat::type inverted_index_file_storage_format =
429
696
            schema_create_options.inverted_index_file_storage_format;
430
696
    uint32_t next_unique_id = schema_create_options.next_unique_id;
431
432
696
    tablet_schema_pb->set_num_short_key_columns(tablet_schema.short_key_column_count);
433
696
    tablet_schema_pb->set_num_rows_per_row_block(config::default_num_rows_per_column_file_block);
434
696
    tablet_schema_pb->set_sequence_col_idx(tablet_schema.sequence_col_idx);
435
696
    auto p_seq_map = tablet_schema_pb->mutable_seq_map(); // ColumnGroupsPB
436
696
    for (auto& it : tablet_schema.seq_map) {              // std::vector< ::doris::TColumnGroup>
437
0
        uint32_t key = it.sequence_column;
438
0
        ColumnGroupPB* cg_pb = p_seq_map->add_cg(); // ColumnGroupPB {key: {v1, v2, v3}}
439
0
        cg_pb->set_sequence_column(key);
440
0
        for (auto v : it.columns_in_group) {
441
0
            cg_pb->add_columns_in_group(v);
442
0
        }
443
0
    }
444
445
696
    switch (tablet_schema.keys_type) {
446
57
    case TKeysType::DUP_KEYS:
447
57
        tablet_schema_pb->set_keys_type(KeysType::DUP_KEYS);
448
57
        break;
449
309
    case TKeysType::UNIQUE_KEYS:
450
309
        tablet_schema_pb->set_keys_type(KeysType::UNIQUE_KEYS);
451
309
        break;
452
69
    case TKeysType::AGG_KEYS:
453
69
        tablet_schema_pb->set_keys_type(KeysType::AGG_KEYS);
454
69
        break;
455
261
    default:
456
261
        LOG(WARNING) << "unknown tablet keys type";
457
261
        break;
458
696
    }
459
460
    // compress_kind used to compress segment files
461
696
    tablet_schema_pb->set_compress_kind(COMPRESS_LZ4);
462
463
    // compression_type used to compress segment page
464
696
    switch (compression_type) {
465
0
    case TCompressionType::NO_COMPRESSION:
466
0
        tablet_schema_pb->set_compression_type(segment_v2::NO_COMPRESSION);
467
0
        break;
468
0
    case TCompressionType::SNAPPY:
469
0
        tablet_schema_pb->set_compression_type(segment_v2::SNAPPY);
470
0
        break;
471
0
    case TCompressionType::LZ4:
472
0
        tablet_schema_pb->set_compression_type(segment_v2::LZ4);
473
0
        break;
474
696
    case TCompressionType::LZ4F:
475
696
        tablet_schema_pb->set_compression_type(segment_v2::LZ4F);
476
696
        break;
477
0
    case TCompressionType::ZLIB:
478
0
        tablet_schema_pb->set_compression_type(segment_v2::ZLIB);
479
0
        break;
480
0
    case TCompressionType::ZSTD:
481
0
        tablet_schema_pb->set_compression_type(segment_v2::ZSTD);
482
0
        break;
483
0
    default:
484
0
        tablet_schema_pb->set_compression_type(segment_v2::LZ4F);
485
0
        break;
486
696
    }
487
488
696
    switch (inverted_index_file_storage_format) {
489
0
    case TInvertedIndexFileStorageFormat::V1:
490
0
        tablet_schema_pb->set_inverted_index_storage_format(InvertedIndexStorageFormatPB::V1);
491
0
        break;
492
696
    case TInvertedIndexFileStorageFormat::V2:
493
696
        tablet_schema_pb->set_inverted_index_storage_format(InvertedIndexStorageFormatPB::V2);
494
696
        break;
495
0
    case TInvertedIndexFileStorageFormat::V3:
496
0
        tablet_schema_pb->set_inverted_index_storage_format(InvertedIndexStorageFormatPB::V3);
497
0
        break;
498
0
    default:
499
0
        tablet_schema_pb->set_inverted_index_storage_format(InvertedIndexStorageFormatPB::V3);
500
0
        break;
501
696
    }
502
503
696
    switch (tablet_schema.sort_type) {
504
0
    case TSortType::type::ZORDER:
505
0
        tablet_schema_pb->set_sort_type(SortType::ZORDER);
506
0
        break;
507
696
    default:
508
696
        tablet_schema_pb->set_sort_type(SortType::LEXICAL);
509
696
    }
510
696
    tablet_schema_pb->set_sort_col_num(tablet_schema.sort_col_num);
511
696
    for (const auto& i : tablet_schema.cluster_key_uids) {
512
2
        tablet_schema_pb->add_cluster_key_uids(i);
513
2
    }
514
515
    // set column information
516
696
    uint32_t col_ordinal = 0;
517
696
    bool has_bf_columns = false;
518
2.27k
    for (TColumn tcolumn : tablet_schema.columns) {
519
2.27k
        ColumnPB* column = tablet_schema_pb->add_column();
520
2.27k
        uint32_t unique_id = -1;
521
2.27k
        if (tcolumn.col_unique_id >= 0) {
522
11
            unique_id = tcolumn.col_unique_id;
523
2.26k
        } else {
524
2.26k
            unique_id = col_ordinal_to_unique_id.at(col_ordinal);
525
2.26k
        }
526
2.27k
        col_ordinal++;
527
2.27k
        init_column_from_tcolumn(unique_id, tcolumn, column);
528
529
2.27k
        if (column->is_bf_column()) {
530
0
            has_bf_columns = true;
531
0
        }
532
533
2.27k
        if (tablet_schema.__isset.indexes) {
534
2
            for (auto& index : tablet_schema.indexes) {
535
2
                if (index.index_type == TIndexType::type::BLOOMFILTER ||
536
2
                    index.index_type == TIndexType::type::NGRAM_BF) {
537
0
                    DCHECK_EQ(index.columns.size(), 1);
538
0
                    if (iequal(tcolumn.column_name, index.columns[0])) {
539
0
                        column->set_is_bf_column(true);
540
0
                        break;
541
0
                    }
542
0
                }
543
2
            }
544
2
        }
545
2.27k
    }
546
547
    // copy index meta
548
696
    if (tablet_schema.__isset.indexes) {
549
1
        for (auto& index : tablet_schema.indexes) {
550
1
            TabletIndexPB* index_pb = tablet_schema_pb->add_index();
551
1
            index_pb->set_index_id(index.index_id);
552
1
            index_pb->set_index_name(index.index_name);
553
            // init col_unique_id in index at be side, since col_unique_id may be -1 at fe side
554
            // get column unique id by name
555
1
            for (auto column_name : index.columns) {
556
2
                for (auto column : tablet_schema_pb->column()) {
557
2
                    if (iequal(column.name(), column_name)) {
558
1
                        index_pb->add_col_unique_id(column.unique_id());
559
1
                    }
560
2
                }
561
1
            }
562
1
            switch (index.index_type) {
563
1
            case TIndexType::BITMAP:
564
1
                index_pb->set_index_type(IndexType::BITMAP);
565
1
                break;
566
0
            case TIndexType::INVERTED:
567
0
                index_pb->set_index_type(IndexType::INVERTED);
568
0
                break;
569
0
            case TIndexType::ANN:
570
0
                index_pb->set_index_type(IndexType::ANN);
571
0
                break;
572
0
            case TIndexType::BLOOMFILTER:
573
0
                index_pb->set_index_type(IndexType::BLOOMFILTER);
574
0
                break;
575
0
            case TIndexType::NGRAM_BF:
576
0
                index_pb->set_index_type(IndexType::NGRAM_BF);
577
0
                break;
578
1
            }
579
580
1
            if (index.__isset.properties) {
581
0
                auto properties = index_pb->mutable_properties();
582
0
                for (auto kv : index.properties) {
583
0
                    (*properties)[kv.first] = kv.second;
584
0
                }
585
0
            }
586
1
        }
587
1
    }
588
589
696
    tablet_schema_pb->set_next_column_unique_id(next_unique_id);
590
696
    if (has_bf_columns && tablet_schema.__isset.bloom_filter_fpp) {
591
0
        tablet_schema_pb->set_bf_fpp(tablet_schema.bloom_filter_fpp);
592
0
    }
593
594
696
    if (tablet_schema.__isset.is_in_memory) {
595
0
        tablet_schema_pb->set_is_in_memory(tablet_schema.is_in_memory);
596
0
    }
597
598
696
    if (tablet_schema.__isset.disable_auto_compaction) {
599
10
        tablet_schema_pb->set_disable_auto_compaction(tablet_schema.disable_auto_compaction);
600
10
    }
601
602
    // Deprecated legacy flatten-nested switch. Distinct from variant_enable_nested_group.
603
696
    if (tablet_schema.__isset.variant_enable_flatten_nested) {
604
696
        tablet_schema_pb->set_enable_variant_flatten_nested(
605
696
                tablet_schema.variant_enable_flatten_nested);
606
696
    }
607
608
696
    if (tablet_schema.__isset.enable_single_replica_compaction) {
609
696
        tablet_schema_pb->set_enable_single_replica_compaction(
610
696
                tablet_schema.enable_single_replica_compaction);
611
696
    }
612
613
696
    if (tablet_schema.__isset.delete_sign_idx) {
614
696
        tablet_schema_pb->set_delete_sign_idx(tablet_schema.delete_sign_idx);
615
696
    }
616
696
    if (tablet_schema.__isset.store_row_column) {
617
696
        tablet_schema_pb->set_store_row_column(tablet_schema.store_row_column);
618
696
    }
619
696
    if (tablet_schema.__isset.row_store_page_size) {
620
696
        tablet_schema_pb->set_row_store_page_size(tablet_schema.row_store_page_size);
621
696
    }
622
696
    if (tablet_schema.__isset.storage_page_size) {
623
696
        tablet_schema_pb->set_storage_page_size(tablet_schema.storage_page_size);
624
696
    }
625
696
    if (tablet_schema.__isset.storage_dict_page_size) {
626
696
        tablet_schema_pb->set_storage_dict_page_size(tablet_schema.storage_dict_page_size);
627
696
    }
628
696
    if (tablet_schema.__isset.skip_write_index_on_load) {
629
696
        tablet_schema_pb->set_skip_write_index_on_load(tablet_schema.skip_write_index_on_load);
630
696
    }
631
696
    if (tablet_schema.__isset.row_store_col_cids) {
632
0
        tablet_schema_pb->mutable_row_store_column_unique_ids()->Add(
633
0
                tablet_schema.row_store_col_cids.begin(), tablet_schema.row_store_col_cids.end());
634
0
    }
635
696
}
636
637
7.52k
void TabletMeta::remove_rowset_delete_bitmap(const RowsetId& rowset_id, const Version& version) {
638
7.52k
    if (_enable_unique_key_merge_on_write) {
639
4.46k
        delete_bitmap().remove({rowset_id, 0, 0}, {rowset_id, UINT32_MAX, 0});
640
4.46k
        if (config::enable_mow_verbose_log) {
641
0
            LOG_INFO("delete rowset delete bitmap. tablet={}, rowset={}, version={}", tablet_id(),
642
0
                     rowset_id.to_string(), version.to_string());
643
0
        }
644
4.46k
        size_t rowset_cache_version_size = delete_bitmap().remove_rowset_cache_version(rowset_id);
645
4.46k
        _check_mow_rowset_cache_version_size(rowset_cache_version_size);
646
4.46k
    }
647
7.52k
}
648
649
4
// Loads a TabletMetaPB from the header file at `file_path` and initializes this
// object from it. Returns the load error unchanged if reading the file fails.
Status TabletMeta::create_from_file(const string& file_path) {
    TabletMetaPB loaded_meta_pb;
    RETURN_IF_ERROR(load_from_file(file_path, &loaded_meta_pb));

    init_from_pb(loaded_meta_pb);
    return Status::OK();
}
655
656
14
// Reads the header file at `file_path` and copies its protobuf message into
// `*tablet_meta_pb`.
// Returns the deserialize error if the header cannot be read, or
// PARSE_PROTOBUF_ERROR if copying the message throws.
Status TabletMeta::load_from_file(const string& file_path, TabletMetaPB* tablet_meta_pb) {
    FileHeader<TabletMetaPB> header(file_path);
    // deserialize() validates the file length, signature and protobuf checksum.
    RETURN_IF_ERROR(header.deserialize());

    try {
        tablet_meta_pb->CopyFrom(header.message());
    } catch (const std::exception& ex) {
        LOG(WARNING) << "Failed to copy protocol buffer object: " << ex.what()
                     << ", file=" << file_path;
        return Status::Error<PARSE_PROTOBUF_ERROR>(
                "fail to copy protocol buffer object. file={}, error={}", file_path, ex.what());
    }
    return Status::OK();
}
670
671
6
// Initializes this object from a serialized header held in memory.
// `buffer`/`buffer_size` are handed to FileHeader::deserialize_from_memory();
// its error is returned unchanged. A throwing protobuf copy is reported as
// PARSE_PROTOBUF_ERROR.
Status TabletMeta::create_from_buffer(const uint8_t* buffer, size_t buffer_size) {
    FileHeader<TabletMetaPB> header(""); // empty file path
    RETURN_IF_ERROR(header.deserialize_from_memory(buffer, buffer_size));

    TabletMetaPB parsed_meta_pb;
    try {
        parsed_meta_pb.CopyFrom(header.message());
    } catch (const std::exception& ex) {
        LOG(WARNING) << "Failed to copy protocol buffer object from buffer: " << ex.what();
        return Status::Error<ErrorCode::PARSE_PROTOBUF_ERROR>(
                "fail to copy protocol buffer object from buffer. error={}", ex.what());
    }

    init_from_pb(parsed_meta_pb);
    return Status::OK();
}
687
688
std::string TabletMeta::construct_header_file_path(const string& schema_hash_path,
689
3
                                                   int64_t tablet_id) {
690
3
    std::stringstream header_name_stream;
691
3
    header_name_stream << schema_hash_path << "/" << tablet_id << ".hdr";
692
3
    return header_name_stream.str();
693
3
}
694
695
0
// Renders this tablet meta as pretty-printed JSON (bytes encoded as base64) and
// writes it to `file_path` through the global local filesystem.
// Returns the first error from creating, appending to, or closing the file.
Status TabletMeta::save_as_json(const string& file_path) {
    json2pb::Pb2JsonOptions pb_json_options;
    pb_json_options.pretty_json = true;
    pb_json_options.bytes_to_base64 = true;

    std::string meta_as_json;
    to_json(&meta_as_json, pb_json_options);

    // Persist the JSON text.
    io::FileWriterPtr writer;
    RETURN_IF_ERROR(io::global_local_filesystem()->create_file(file_path, &writer));
    RETURN_IF_ERROR(writer->append(meta_as_json));
    RETURN_IF_ERROR(writer->close());
    return Status::OK();
}
708
709
2.25k
// Converts this tablet meta to a TabletMetaPB (with cloud_get_rowset_meta set
// to false) and writes it to `file_path` via the two-argument save().
Status TabletMeta::save(const string& file_path) {
    TabletMetaPB current_meta_pb;
    to_meta_pb(&current_meta_pb, /*cloud_get_rowset_meta=*/false);
    return TabletMeta::save(file_path, current_meta_pb);
}
714
715
2.26k
// Writes `tablet_meta_pb` to the header file at `file_path`.
//
// `file_path` must not be empty (DCHECK). The message is copied into a
// FileHeader, which is then prepared and serialized; the first failing step's
// status is returned. A throwing protobuf copy is reported as INTERNAL_ERROR.
Status TabletMeta::save(const string& file_path, const TabletMetaPB& tablet_meta_pb) {
    DCHECK(!file_path.empty());
    FileHeader<TabletMetaPB> file_header(file_path);
    try {
        file_header.mutable_message()->CopyFrom(tablet_meta_pb);
    } catch (...) {
        // The path is quoted on both sides so it is unambiguous in the log.
        LOG(WARNING) << "fail to copy protocol buffer object. file='" << file_path << "'";
        return Status::Error<ErrorCode::INTERNAL_ERROR>(
                "fail to copy protocol buffer object. file={}", file_path);
    }
    RETURN_IF_ERROR(file_header.prepare());
    RETURN_IF_ERROR(file_header.serialize());
    return Status::OK();
}
729
730
4.05k
// Persists this tablet meta for `data_dir` while holding _meta_lock
// exclusively; the actual work is done by _save_meta().
Status TabletMeta::save_meta(DataDir* data_dir) {
    std::lock_guard<std::shared_mutex> exclusive_guard(_meta_lock);
    return _save_meta(data_dir);
}
734
735
4.05k
// Serializes this tablet meta and stores it through TabletMetaManager::save().
// Logs FATAL when the tablet uid is all-zero or when the save fails, and logs
// an INFO line with a timing breakdown when the whole operation takes longer
// than one second. Returns the status reported by TabletMetaManager::save().
Status TabletMeta::_save_meta(DataDir* data_dir) {
    // An all-zero uid is treated as invalid.
    const bool uid_is_zero = _tablet_uid.hi == 0 && _tablet_uid.lo == 0;
    if (uid_is_zero) {
        LOG(FATAL) << "tablet_uid is invalid"
                   << " tablet=" << tablet_id() << " _tablet_uid=" << _tablet_uid.to_string();
    }

    string serialized_meta;
    const auto serialize_begin = MonotonicMicros();
    serialize(&serialized_meta);
    const auto serialize_end = MonotonicMicros();

    Status save_status =
            TabletMetaManager::save(data_dir, tablet_id(), schema_hash(), serialized_meta);
    if (!save_status.ok()) {
        LOG(FATAL) << "fail to save tablet_meta. status=" << save_status
                   << ", tablet_id=" << tablet_id() << ", schema_hash=" << schema_hash();
    }
    const auto save_end = MonotonicMicros();

    // Report slow saves (> 1s in total) with the per-phase costs.
    const auto total_cost = save_end - serialize_begin;
    if (total_cost > 1 * 1000 * 1000) {
        LOG(INFO) << "save tablet(" << tablet_id() << ") meta too slow. serialize cost "
                  << serialize_end - serialize_begin
                  << "(us), serialized binary size: " << serialized_meta.length()
                  << "(bytes), write rocksdb cost " << save_end - serialize_end << "(us)";
    }
    return save_status;
}
760
761
4.06k
// Serializes this tablet meta into `*meta_binary`.
//
// When the tablet holds any (active or stale) rowset metas, the average
// serialized size per rowset meta is recorded; if the output exceeds
// config::tablet_meta_serialize_size_limit or serialization failed, the stale
// rowset metas are dropped from the protobuf and serialization is retried.
// A final serialization failure is logged as FATAL.
void TabletMeta::serialize(string* meta_binary) {
    TabletMetaPB meta_pb;
    to_meta_pb(&meta_pb, false);
    if (meta_pb.partition_id() <= 0) {
        LOG(WARNING) << "invalid partition id " << meta_pb.partition_id() << " tablet "
                     << meta_pb.tablet_id();
    }
    // Debug point: force the partition id to zero.
    DBUG_EXECUTE_IF("TabletMeta::serialize::zero_partition_id", {
        long partition_id = meta_pb.partition_id();
        meta_pb.set_partition_id(0);
        LOG(WARNING) << "set debug point TabletMeta::serialize::zero_partition_id old="
                     << partition_id << " new=" << meta_pb.DebugString();
    });

    bool serialized_ok = meta_pb.SerializeToString(meta_binary);

    const bool holds_rowsets = !_rs_metas.empty() || !_stale_rs_metas.empty();
    if (holds_rowsets) {
        _avg_rs_meta_serialize_size =
                meta_binary->length() / (_rs_metas.size() + _stale_rs_metas.size());

        const bool needs_retry =
                meta_binary->length() > config::tablet_meta_serialize_size_limit || !serialized_ok;
        if (needs_retry) {
            // Retry without the stale rowset metas.
            int64_t size_before_clean = meta_binary->length();
            int64_t stale_rowset_count = meta_pb.stale_rs_metas().size();
            meta_pb.clear_stale_rs_metas();
            meta_binary->clear();
            serialized_ok = meta_pb.SerializeToString(meta_binary);
            LOG(WARNING) << "tablet meta serialization size exceeds limit: "
                         << config::tablet_meta_serialize_size_limit
                         << " clean up stale rowsets, tablet id: " << tablet_id()
                         << " stale rowset num: " << stale_rowset_count
                         << " serialization size before clean " << size_before_clean
                         << " serialization size after clean " << meta_binary->length();
        }
    }

    if (!serialized_ok) {
        LOG(FATAL) << "failed to serialize meta " << tablet_id();
    }
}
798
799
279k
// Parses `meta_binary` as a TabletMetaPB and initializes this object from it.
// Returns INIT_FAILED when the bytes cannot be parsed.
Status TabletMeta::deserialize(std::string_view meta_binary) {
    TabletMetaPB parsed_meta_pb;
    const auto byte_count = static_cast<int32_t>(meta_binary.size());
    if (!parsed_meta_pb.ParseFromArray(meta_binary.data(), byte_count)) {
        return Status::Error<INIT_FAILED>("parse tablet meta failed");
    }

    init_from_pb(parsed_meta_pb);
    return Status::OK();
}
809
810
294k
void TabletMeta::init_from_pb(const TabletMetaPB& tablet_meta_pb) {
811
294k
    _table_id = tablet_meta_pb.table_id();
812
294k
    _index_id = tablet_meta_pb.index_id();
813
294k
    _partition_id = tablet_meta_pb.partition_id();
814
294k
    _tablet_id = tablet_meta_pb.tablet_id();
815
294k
    _replica_id = tablet_meta_pb.replica_id();
816
294k
    _schema_hash = tablet_meta_pb.schema_hash();
817
294k
    _shard_id = tablet_meta_pb.shard_id();
818
294k
    _creation_time = tablet_meta_pb.creation_time();
819
294k
    _cumulative_layer_point = tablet_meta_pb.cumulative_layer_point();
820
294k
    _tablet_uid = TabletUid(tablet_meta_pb.tablet_uid());
821
294k
    _ttl_seconds = tablet_meta_pb.ttl_seconds();
822
294k
    if (tablet_meta_pb.has_tablet_type()) {
823
294k
        _tablet_type = tablet_meta_pb.tablet_type();
824
294k
    } else {
825
57
        _tablet_type = TabletTypePB::TABLET_TYPE_DISK;
826
57
    }
827
828
    // init _tablet_state
829
294k
    switch (tablet_meta_pb.tablet_state()) {
830
857
    case PB_NOTREADY:
831
857
        _tablet_state = TabletState::TABLET_NOTREADY;
832
857
        break;
833
291k
    case PB_RUNNING:
834
291k
        _tablet_state = TabletState::TABLET_RUNNING;
835
291k
        break;
836
0
    case PB_TOMBSTONED:
837
0
        _tablet_state = TabletState::TABLET_TOMBSTONED;
838
0
        break;
839
0
    case PB_STOPPED:
840
0
        _tablet_state = TabletState::TABLET_STOPPED;
841
0
        break;
842
2.55k
    case PB_SHUTDOWN:
843
2.55k
        _tablet_state = TabletState::TABLET_SHUTDOWN;
844
2.55k
        break;
845
0
    default:
846
0
        LOG(WARNING) << "tablet has no state. tablet=" << tablet_id()
847
0
                     << ", schema_hash=" << schema_hash();
848
294k
    }
849
850
    // init _schema
851
294k
    TabletSchemaSPtr schema = std::make_shared<TabletSchema>();
852
294k
    schema->init_from_pb(tablet_meta_pb.schema());
853
294k
    if (_handle) {
854
4
        TabletSchemaCache::instance()->release(_handle);
855
4
    }
856
294k
    auto pair = TabletSchemaCache::instance()->insert(schema->to_key());
857
294k
    _handle = pair.first;
858
294k
    _schema = pair.second;
859
860
294k
    if (tablet_meta_pb.has_row_binlog_schema()) {
861
50
        TabletSchemaSPtr row_binlog_schema = std::make_shared<TabletSchema>();
862
50
        row_binlog_schema->init_from_pb(tablet_meta_pb.row_binlog_schema());
863
50
        _row_binlog_schema = std::move(row_binlog_schema);
864
50
        _row_binlog_schema_hash = tablet_meta_pb.row_binlog_schema_hash();
865
50
    }
866
867
294k
    if (tablet_meta_pb.has_enable_unique_key_merge_on_write()) {
868
294k
        _enable_unique_key_merge_on_write = tablet_meta_pb.enable_unique_key_merge_on_write();
869
294k
        _delete_bitmap->set_tablet_id(_tablet_id);
870
294k
        _binlog_delvec->set_tablet_id(_tablet_id);
871
294k
    }
872
873
    // init _rs_metas
874
487k
    for (auto& it : tablet_meta_pb.rs_metas()) {
875
487k
        RowsetMetaSharedPtr rs_meta(new RowsetMeta());
876
487k
        rs_meta->init_from_pb(it);
877
487k
        _rs_metas.emplace(rs_meta->version(), rs_meta);
878
487k
    }
879
880
    // For mow table, delete bitmap of stale rowsets has not been persisted.
881
    // When be restart, query should not read the stale rowset, otherwise duplicate keys
882
    // will be read out. Therefore, we don't add them to _stale_rs_meta for mow table.
883
294k
    if (!config::skip_loading_stale_rowset_meta && !_enable_unique_key_merge_on_write) {
884
238k
        for (auto& it : tablet_meta_pb.stale_rs_metas()) {
885
4.68k
            RowsetMetaSharedPtr rs_meta(new RowsetMeta());
886
4.68k
            rs_meta->init_from_pb(it);
887
4.68k
            _stale_rs_metas.emplace(rs_meta->version(), rs_meta);
888
4.68k
        }
889
238k
    }
890
891
294k
    for (auto& it : tablet_meta_pb.row_binlog_rs_metas()) {
892
44
        RowsetMetaSharedPtr rs_meta(new RowsetMeta());
893
44
        rs_meta->init_from_pb(it);
894
44
        _row_binlog_rs_metas.emplace(rs_meta->version(), rs_meta);
895
44
    }
896
897
294k
    if (tablet_meta_pb.has_in_restore_mode()) {
898
294k
        _in_restore_mode = tablet_meta_pb.in_restore_mode();
899
294k
    }
900
901
294k
    if (tablet_meta_pb.has_preferred_rowset_type()) {
902
294k
        _preferred_rowset_type = tablet_meta_pb.preferred_rowset_type();
903
294k
    }
904
905
294k
    _storage_policy_id = tablet_meta_pb.storage_policy_id();
906
294k
    if (tablet_meta_pb.has_cooldown_meta_id()) {
907
14.9k
        _cooldown_meta_id = tablet_meta_pb.cooldown_meta_id();
908
14.9k
    }
909
910
294k
    if (tablet_meta_pb.has_delete_bitmap()) {
911
50.6k
        int rst_ids_size = tablet_meta_pb.delete_bitmap().rowset_ids_size();
912
50.6k
        int seg_ids_size = tablet_meta_pb.delete_bitmap().segment_ids_size();
913
50.6k
        int versions_size = tablet_meta_pb.delete_bitmap().versions_size();
914
50.6k
        int seg_maps_size = tablet_meta_pb.delete_bitmap().segment_delete_bitmaps_size();
915
50.6k
        int binlog_mark_size = tablet_meta_pb.delete_bitmap().is_binlog_delvec_size();
916
50.6k
        CHECK(rst_ids_size == seg_ids_size && seg_ids_size == seg_maps_size &&
917
50.6k
              seg_maps_size == versions_size);
918
50.6k
        CHECK(binlog_mark_size == 0 || binlog_mark_size == rst_ids_size);
919
54.4k
        for (int i = 0; i < rst_ids_size; ++i) {
920
3.89k
            RowsetId rst_id;
921
3.89k
            rst_id.init(tablet_meta_pb.delete_bitmap().rowset_ids(i));
922
3.89k
            auto seg_id = tablet_meta_pb.delete_bitmap().segment_ids(i);
923
3.89k
            auto ver = tablet_meta_pb.delete_bitmap().versions(i);
924
3.89k
            auto bitmap = tablet_meta_pb.delete_bitmap().segment_delete_bitmaps(i).data();
925
3.89k
            bool from_binlog = tablet_meta_pb.delete_bitmap().is_binlog_delvec_size() > 0
926
3.89k
                                       ? tablet_meta_pb.delete_bitmap().is_binlog_delvec(i)
927
3.89k
                                       : false;
928
3.89k
            if (!from_binlog) {
929
3.89k
                delete_bitmap().delete_bitmap[{rst_id, seg_id, ver}] =
930
3.89k
                        roaring::Roaring::read(bitmap);
931
3.89k
            } else {
932
0
                binlog_delvec().delete_bitmap[{rst_id, seg_id, ver}] =
933
0
                        roaring::Roaring::read(bitmap);
934
0
            }
935
3.89k
        }
936
50.6k
    }
937
938
294k
    if (tablet_meta_pb.has_binlog_config()) {
939
279k
        _binlog_config = tablet_meta_pb.binlog_config();
940
279k
    }
941
294k
    _compaction_policy = tablet_meta_pb.compaction_policy();
942
294k
    _time_series_compaction_goal_size_mbytes =
943
294k
            tablet_meta_pb.time_series_compaction_goal_size_mbytes();
944
294k
    _time_series_compaction_file_count_threshold =
945
294k
            tablet_meta_pb.time_series_compaction_file_count_threshold();
946
294k
    _time_series_compaction_time_threshold_seconds =
947
294k
            tablet_meta_pb.time_series_compaction_time_threshold_seconds();
948
294k
    _time_series_compaction_empty_rowsets_threshold =
949
294k
            tablet_meta_pb.time_series_compaction_empty_rowsets_threshold();
950
294k
    _time_series_compaction_level_threshold =
951
294k
            tablet_meta_pb.time_series_compaction_level_threshold();
952
294k
    _vertical_compaction_num_columns_per_group =
953
294k
            tablet_meta_pb.vertical_compaction_num_columns_per_group();
954
955
294k
    if (tablet_meta_pb.has_encryption_algorithm()) {
956
294k
        _encryption_algorithm = tablet_meta_pb.encryption_algorithm();
957
294k
    }
958
959
294k
    if (tablet_meta_pb.has_row_binlog_schema_hash()) {
960
50
        _row_binlog_schema_hash = tablet_meta_pb.row_binlog_schema_hash();
961
50
    }
962
294k
}
963
964
6.32k
// Fills `*tablet_meta_pb` from this TabletMeta.
//
// Rowset metas are exported when not running in cloud mode, or when
// `cloud_get_rowset_meta` is true. For merge-on-write tablets the delete bitmap
// is exported too, skipping entries that belong to stale rowsets, followed by
// the binlog delete vectors (flagged through is_binlog_delvec).
void TabletMeta::to_meta_pb(TabletMetaPB* tablet_meta_pb, bool cloud_get_rowset_meta) {
    tablet_meta_pb->set_table_id(table_id());
    tablet_meta_pb->set_index_id(index_id());
    tablet_meta_pb->set_partition_id(partition_id());
    tablet_meta_pb->set_tablet_id(tablet_id());
    tablet_meta_pb->set_replica_id(replica_id());
    tablet_meta_pb->set_schema_hash(schema_hash());
    tablet_meta_pb->set_shard_id(shard_id());
    tablet_meta_pb->set_creation_time(creation_time());
    tablet_meta_pb->set_cumulative_layer_point(cumulative_layer_point());
    *(tablet_meta_pb->mutable_tablet_uid()) = tablet_uid().to_proto();
    tablet_meta_pb->set_tablet_type(_tablet_type);
    tablet_meta_pb->set_ttl_seconds(_ttl_seconds);

    // Translate TabletState into its protobuf counterpart.
    switch (tablet_state()) {
    case TABLET_NOTREADY:
        tablet_meta_pb->set_tablet_state(PB_NOTREADY);
        break;
    case TABLET_RUNNING:
        tablet_meta_pb->set_tablet_state(PB_RUNNING);
        break;
    case TABLET_TOMBSTONED:
        tablet_meta_pb->set_tablet_state(PB_TOMBSTONED);
        break;
    case TABLET_STOPPED:
        tablet_meta_pb->set_tablet_state(PB_STOPPED);
        break;
    case TABLET_SHUTDOWN:
        tablet_meta_pb->set_tablet_state(PB_SHUTDOWN);
        break;
    }

    // RowsetMetaPB is separated from TabletMetaPB.
    const bool export_rowset_metas = !config::is_cloud_mode() || cloud_get_rowset_meta;
    if (export_rowset_metas) {
        for (const auto& entry : _rs_metas) {
            entry.second->to_rowset_pb(tablet_meta_pb->add_rs_metas());
        }
        for (const auto& entry : _stale_rs_metas) {
            entry.second->to_rowset_pb(tablet_meta_pb->add_stale_rs_metas());
        }
        for (const auto& entry : _row_binlog_rs_metas) {
            entry.second->to_rowset_pb(tablet_meta_pb->add_row_binlog_rs_metas());
        }
    }

    _schema->to_schema_pb(tablet_meta_pb->mutable_schema());

    if (_row_binlog_schema != nullptr) {
        _row_binlog_schema->to_schema_pb(tablet_meta_pb->mutable_row_binlog_schema());
        tablet_meta_pb->set_row_binlog_schema_hash(_row_binlog_schema_hash);
    }

    tablet_meta_pb->set_in_restore_mode(in_restore_mode());

    // Only BETA_ROWSET is written, to keep changes to the tablet meta minimal.
    if (_preferred_rowset_type == BETA_ROWSET) {
        tablet_meta_pb->set_preferred_rowset_type(_preferred_rowset_type);
    }
    if (_storage_policy_id > 0) {
        tablet_meta_pb->set_storage_policy_id(_storage_policy_id);
    }
    if (_cooldown_meta_id.initialized()) {
        tablet_meta_pb->mutable_cooldown_meta_id()->CopyFrom(_cooldown_meta_id.to_proto());
    }

    tablet_meta_pb->set_enable_unique_key_merge_on_write(_enable_unique_key_merge_on_write);

    if (_enable_unique_key_merge_on_write) {
        std::set<RowsetId> stale_rowset_ids;
        for (const auto& entry : _stale_rs_metas) {
            stale_rowset_ids.insert(entry.second->rowset_id());
        }

        DeleteBitmapPB* delete_bitmap_pb = tablet_meta_pb->mutable_delete_bitmap();
        // Appends one (rowset, segment, version, bitmap) record to the parallel
        // arrays of the protobuf delete bitmap.
        auto append_bitmap = [delete_bitmap_pb](const RowsetId& rowset_id, auto segment_id,
                                                auto ver, const auto& bitmap, bool is_binlog) {
            delete_bitmap_pb->add_rowset_ids(rowset_id.to_string());
            delete_bitmap_pb->add_segment_ids(segment_id);
            delete_bitmap_pb->add_versions(ver);
            delete_bitmap_pb->add_is_binlog_delvec(is_binlog);
            std::string bitmap_data(bitmap.getSizeInBytes(), '\0');
            bitmap.write(bitmap_data.data());
            *(delete_bitmap_pb->add_segment_delete_bitmaps()) = std::move(bitmap_data);
        };

        for (auto& [key, bitmap] : delete_bitmap().snapshot().delete_bitmap) {
            auto& [rowset_id, segment_id, ver] = key;
            // Bitmaps of stale rowsets are not exported.
            if (stale_rowset_ids.find(rowset_id) != stale_rowset_ids.end()) {
                continue;
            }
            append_bitmap(rowset_id, segment_id, ver, bitmap, false);
        }

        for (auto& [key, bitmap] : binlog_delvec().snapshot().delete_bitmap) {
            auto& [rowset_id, segment_id, ver] = key;
            append_bitmap(rowset_id, segment_id, ver, bitmap, true);
        }
    }

    _binlog_config.to_pb(tablet_meta_pb->mutable_binlog_config());
    tablet_meta_pb->set_compaction_policy(compaction_policy());
    tablet_meta_pb->set_time_series_compaction_goal_size_mbytes(
            time_series_compaction_goal_size_mbytes());
    tablet_meta_pb->set_time_series_compaction_file_count_threshold(
            time_series_compaction_file_count_threshold());
    tablet_meta_pb->set_time_series_compaction_time_threshold_seconds(
            time_series_compaction_time_threshold_seconds());
    tablet_meta_pb->set_time_series_compaction_empty_rowsets_threshold(
            time_series_compaction_empty_rowsets_threshold());
    tablet_meta_pb->set_time_series_compaction_level_threshold(
            time_series_compaction_level_threshold());
    tablet_meta_pb->set_vertical_compaction_num_columns_per_group(
            vertical_compaction_num_columns_per_group());

    tablet_meta_pb->set_encryption_algorithm(_encryption_algorithm);
}
1078
1079
2
// Converts this tablet meta to a TabletMetaPB (with cloud_get_rowset_meta set
// to false) and renders it as JSON into `*json_string` using `options`.
void TabletMeta::to_json(string* json_string, json2pb::Pb2JsonOptions& options) {
    TabletMetaPB meta_pb;
    to_meta_pb(&meta_pb, /*cloud_get_rowset_meta=*/false);
    json2pb::ProtoMessageToJson(meta_pb, json_string, options);
}
1084
1085
412k
// Returns the version of the rowset meta in _rs_metas with the greatest end
// version, or {-1, 0} when no rowset meta ends above 0.
Version TabletMeta::max_version() const {
    Version newest = {-1, 0};
    for (const auto& entry : _rs_metas) {
        const auto& candidate = entry.second;
        if (candidate->end_version() > newest.second) {
            newest = candidate->version();
        }
    }
    return newest;
}
1094
1095
824k
size_t TabletMeta::version_count_cross_with_range(const Version& range) const {
1096
824k
    size_t count = 0;
1097
1.40M
    for (const auto& [_, rs_meta] : _rs_metas) {
1098
1.40M
        if (!(range.first > rs_meta->version().second || range.second < rs_meta->version().first)) {
1099
1.40M
            count++;
1100
1.40M
        }
1101
1.40M
    }
1102
824k
    return count;
1103
824k
}
1104
1105
16.3k
// Adds `rs_meta` to _rs_metas, keyed by its version.
//
// Returns PUSH_VERSION_ALREADY_EXIST if a different rowset is already
// registered for the same version. Adding the same (rowset id, version) again
// is treated as a duplicate request and returns OK without changes.
//
// _rs_metas is keyed by the rowset version, so the duplicate check is a single
// lookup rather than a scan over every rowset meta.
Status TabletMeta::add_rs_meta(const RowsetMetaSharedPtr& rs_meta) {
    if (auto it = _rs_metas.find(rs_meta->version()); it != _rs_metas.end()) {
        const auto& existing = it->second;
        if (existing->rowset_id() != rs_meta->rowset_id()) {
            return Status::Error<PUSH_VERSION_ALREADY_EXIST>(
                    "version already exist. rowset_id={}, version={}, tablet={}",
                    existing->rowset_id().to_string(), existing->version().to_string(),
                    tablet_id());
        }
        // Same rowset id and version: duplicate request, nothing to do.
        return Status::OK();
    }
    _rs_metas.emplace(rs_meta->version(), rs_meta);
    return Status::OK();
}
1122
1123
32
// Adds `row_binlog_meta` to _row_binlog_rs_metas, keyed by its version.
//
// Returns PUSH_VERSION_ALREADY_EXIST if a different binlog rowset is already
// registered for the same version. Adding the same (rowset id, version) again
// is treated as a duplicate request and returns OK without changes.
//
// _row_binlog_rs_metas is keyed by the rowset version, so the duplicate check
// is a single lookup rather than a scan over every binlog rowset meta.
Status TabletMeta::add_row_binlog_rs_meta(const RowsetMetaSharedPtr& row_binlog_meta) {
    if (auto it = _row_binlog_rs_metas.find(row_binlog_meta->version());
        it != _row_binlog_rs_metas.end()) {
        const auto& existing = it->second;
        if (existing->rowset_id() != row_binlog_meta->rowset_id()) {
            return Status::Error<PUSH_VERSION_ALREADY_EXIST>(
                    "binlog version already exist. binlog_rowset_id={}, version={}, tablet={}",
                    existing->rowset_id().to_string(), existing->version().to_string(),
                    tablet_id());
        }
        // Same rowset id and version: duplicate request, nothing to do.
        return Status::OK();
    }
    _row_binlog_rs_metas.emplace(row_binlog_meta->version(), row_binlog_meta);
    return Status::OK();
}
1140
1141
31.7k
void TabletMeta::add_rowsets_unchecked(const std::vector<RowsetSharedPtr>& to_add) {
1142
32.6k
    for (const auto& rs : to_add) {
1143
32.6k
        _rs_metas.emplace(rs->rowset_meta()->version(), rs->rowset_meta());
1144
32.6k
    }
1145
31.7k
}
1146
1147
void TabletMeta::delete_rs_meta_by_version(const Version& version,
1148
0
                                           std::vector<RowsetMetaSharedPtr>* deleted_rs_metas) {
1149
0
    size_t rowset_cache_version_size = 0;
1150
0
    if (auto it = _rs_metas.find(version); it != _rs_metas.end()) {
1151
0
        if (deleted_rs_metas != nullptr) {
1152
0
            deleted_rs_metas->push_back(it->second);
1153
0
        }
1154
0
        auto rowset_id = it->second->rowset_id();
1155
0
        _rs_metas.erase(it);
1156
0
        if (_enable_unique_key_merge_on_write) {
1157
0
            rowset_cache_version_size = _delete_bitmap->remove_rowset_cache_version(rowset_id);
1158
0
        }
1159
0
        return;
1160
0
    }
1161
0
    _check_mow_rowset_cache_version_size(rowset_cache_version_size);
1162
0
}
1163
1164
void TabletMeta::modify_rs_metas(const std::vector<RowsetMetaSharedPtr>& to_add,
1165
                                 const std::vector<RowsetMetaSharedPtr>& to_delete,
1166
1.36k
                                 bool same_version) {
1167
1.36k
    size_t rowset_cache_version_size = 0;
1168
    // Remove to_delete rowsets from _rs_metas
1169
12.8k
    for (auto rs_to_del : to_delete) {
1170
12.8k
        if (auto it = _rs_metas.find(rs_to_del->version()); it != _rs_metas.end()) {
1171
12.8k
            auto rowset_id = it->second->rowset_id();
1172
12.8k
            _rs_metas.erase(it);
1173
12.8k
            if (_enable_unique_key_merge_on_write) {
1174
5.18k
                rowset_cache_version_size = _delete_bitmap->remove_rowset_cache_version(rowset_id);
1175
5.18k
            }
1176
12.8k
        }
1177
12.8k
    }
1178
1.36k
    if (!same_version) {
1179
        // put to_delete rowsets in _stale_rs_metas.
1180
12.8k
        for (auto rs_to_del : to_delete) {
1181
12.8k
            _stale_rs_metas.emplace(rs_to_del->version(), rs_to_del);
1182
12.8k
        }
1183
1.34k
    }
1184
1185
    // put to_add rowsets in _rs_metas.
1186
1.36k
    for (auto rs_to_add : to_add) {
1187
63
        _rs_metas.emplace(rs_to_add->version(), rs_to_add);
1188
63
    }
1189
1.36k
    _check_mow_rowset_cache_version_size(rowset_cache_version_size);
1190
1.36k
}
1191
1192
void TabletMeta::modify_row_binlog_rs_metas(const std::vector<RowsetMetaSharedPtr>& to_add,
1193
0
                                            const std::vector<RowsetMetaSharedPtr>& to_delete) {
1194
0
    for (const auto& rs_to_del : to_delete) {
1195
0
        _row_binlog_rs_metas.erase(rs_to_del->version());
1196
0
    }
1197
1198
0
    for (const auto& rs_to_add : to_add) {
1199
0
        _row_binlog_rs_metas.emplace(rs_to_add->version(), rs_to_add);
1200
0
    }
1201
0
}
1202
1203
// Use the passing "rs_metas" to replace the rs meta in this tablet meta
1204
// Also clear the _stale_rs_metas because this tablet meta may have been copied from
1205
// an existing tablet before. After the revise, only the passed-in "rs_metas"
1206
// is needed.
1207
5
void TabletMeta::revise_rs_metas(std::vector<RowsetMetaSharedPtr>&& rs_metas) {
1208
5
    {
1209
5
        std::lock_guard<std::shared_mutex> wrlock(_meta_lock);
1210
5
        _rs_metas.clear();
1211
10
        for (auto& rs_meta : rs_metas) {
1212
10
            _rs_metas.emplace(rs_meta->version(), rs_meta);
1213
10
        }
1214
5
        _stale_rs_metas.clear();
1215
5
    }
1216
5
    if (_enable_unique_key_merge_on_write) {
1217
0
        _delete_bitmap->clear_rowset_cache_version();
1218
0
    }
1219
5
}
1220
1221
0
void TabletMeta::revise_row_binlog_rs_metas(std::vector<RowsetMetaSharedPtr>&& rs_metas) {
1222
0
    std::lock_guard<std::shared_mutex> wrlock(_meta_lock);
1223
0
    _row_binlog_rs_metas.clear();
1224
0
    for (auto& rs_meta : rs_metas) {
1225
0
        _row_binlog_rs_metas.emplace(rs_meta->version(), rs_meta);
1226
0
    }
1227
0
}
1228
1229
// This method should call after revise_rs_metas, since new rs_metas might be a subset
1230
// of original tablet, we should revise the delete_bitmap according to current rowset.
1231
//
1232
// Delete bitmap is protected by Tablet::_meta_lock, we don't need to acquire the
1233
// TabletMeta's _meta_lock
1234
1
void TabletMeta::revise_delete_bitmap_unlocked(const DeleteBitmap& delete_bitmap) {
1235
1
    _delete_bitmap = std::make_unique<DeleteBitmap>(tablet_id());
1236
2
    for (const auto& [_, rs] : _rs_metas) {
1237
2
        DeleteBitmap rs_bm(tablet_id());
1238
2
        delete_bitmap.subset({rs->rowset_id(), 0, 0}, {rs->rowset_id(), UINT32_MAX, INT64_MAX},
1239
2
                             &rs_bm);
1240
2
        _delete_bitmap->merge(rs_bm);
1241
2
    }
1242
1
    for (const auto& [_, rs] : _stale_rs_metas) {
1243
0
        DeleteBitmap rs_bm(tablet_id());
1244
0
        delete_bitmap.subset({rs->rowset_id(), 0, 0}, {rs->rowset_id(), UINT32_MAX, INT64_MAX},
1245
0
                             &rs_bm);
1246
0
        _delete_bitmap->merge(rs_bm);
1247
0
    }
1248
1
}
1249
1250
0
void TabletMeta::revise_binlog_delvec_unlocked(const DeleteBitmap& binlog_delvec) {
1251
0
    _binlog_delvec = std::make_unique<DeleteBitmap>(tablet_id());
1252
0
    for (const auto& [_, rs] : _row_binlog_rs_metas) {
1253
0
        DeleteBitmap rs_bm(tablet_id());
1254
0
        binlog_delvec.subset({rs->rowset_id(), 0, 0}, {rs->rowset_id(), UINT32_MAX, INT64_MAX},
1255
0
                             &rs_bm);
1256
0
        _binlog_delvec->merge(rs_bm);
1257
0
    }
1258
0
}
1259
1260
10.6k
void TabletMeta::delete_stale_rs_meta_by_version(const Version& version) {
1261
10.6k
    _stale_rs_metas.erase(version);
1262
10.6k
}
1263
1264
0
// Looks up the rowset meta registered under exactly `version` in _rs_metas.
// Returns nullptr when no such version exists.
RowsetMetaSharedPtr TabletMeta::acquire_rs_meta_by_version(const Version& version) const {
    const auto found = _rs_metas.find(version);
    if (found == _rs_metas.end()) {
        return nullptr;
    }
    return found->second;
}
1270
1271
4.38k
// Looks up the rowset meta registered under exactly `version` in _stale_rs_metas.
// Returns nullptr when no such version exists.
RowsetMetaSharedPtr TabletMeta::acquire_stale_rs_meta_by_version(const Version& version) const {
    const auto found = _stale_rs_metas.find(version);
    if (found == _stale_rs_metas.end()) {
        return nullptr;
    }
    return found->second;
}
1277
1278
// Looks up the rowset meta registered under exactly `version` in _row_binlog_rs_metas.
// Returns nullptr when no such version exists.
RowsetMetaSharedPtr TabletMeta::acquire_row_binlog_rs_meta_by_version(
        const Version& version) const {
    const auto found = _row_binlog_rs_metas.find(version);
    if (found == _row_binlog_rs_metas.end()) {
        return nullptr;
    }
    return found->second;
}
1285
1286
23
// Sets the partition id of this tablet.
// A warning is logged when the current id is already set (> 0) and differs from the
// new one, or when the new id is < 1. The warning is informational only: the id is
// assigned unconditionally and Status::OK() is always returned.
Status TabletMeta::set_partition_id(int64_t partition_id) {
    const bool conflicts_with_current = _partition_id > 0 && _partition_id != partition_id;
    const bool new_id_invalid = partition_id < 1;
    if (conflicts_with_current || new_id_invalid) {
        LOG(WARNING) << "cur partition id=" << _partition_id << " new partition id=" << partition_id
                     << " not equal";
    }
    _partition_id = partition_id;
    return Status::OK();
}
1294
1295
0
void TabletMeta::clear_stale_rowset() {
1296
0
    _stale_rs_metas.clear();
1297
0
    if (_enable_unique_key_merge_on_write) {
1298
0
        _delete_bitmap->clear_rowset_cache_version();
1299
0
    }
1300
0
}
1301
1302
0
void TabletMeta::clear_rowsets() {
1303
0
    _rs_metas.clear();
1304
0
    if (_enable_unique_key_merge_on_write) {
1305
0
        _delete_bitmap->clear_rowset_cache_version();
1306
0
    }
1307
0
}
1308
1309
5.83k
void TabletMeta::_check_mow_rowset_cache_version_size(size_t rowset_cache_version_size) {
1310
5.83k
    if (_enable_unique_key_merge_on_write && config::enable_mow_verbose_log &&
1311
5.83k
        rowset_cache_version_size > _rs_metas.size() + _stale_rs_metas.size()) {
1312
0
        std::stringstream ss;
1313
0
        auto rowset_ids = _delete_bitmap->get_rowset_cache_version();
1314
0
        std::set<std::string> tablet_rowset_ids;
1315
0
        {
1316
0
            std::shared_lock rlock(_meta_lock);
1317
0
            for (const auto& [_, rs_meta] : _rs_metas) {
1318
0
                tablet_rowset_ids.emplace(rs_meta->rowset_id().to_string());
1319
0
            }
1320
0
            for (const auto& [_, rs_meta] : _stale_rs_metas) {
1321
0
                tablet_rowset_ids.emplace(rs_meta->rowset_id().to_string());
1322
0
            }
1323
0
        }
1324
0
        for (const auto& rowset_id : rowset_ids) {
1325
0
            if (tablet_rowset_ids.find(rowset_id) == tablet_rowset_ids.end()) {
1326
0
                ss << rowset_id << ", ";
1327
0
            }
1328
0
        }
1329
        // size(rowset_cache_version) <= size(_rs_metas) + size(_stale_rs_metas) + size(_unused_rs)
1330
0
        std::string msg = fmt::format(
1331
0
                "tablet: {}, rowset_cache_version size: {}, "
1332
0
                "_rs_metas size: {}, _stale_rs_metas size: {}, delta: {}. rowset only in cache: {}",
1333
0
                _tablet_id, rowset_cache_version_size, _rs_metas.size(), _stale_rs_metas.size(),
1334
0
                rowset_cache_version_size - _rs_metas.size() - _stale_rs_metas.size(), ss.str());
1335
0
        LOG(INFO) << msg;
1336
0
    }
1337
5.83k
}
1338
1339
3
// Field-wise equality of two TabletMeta objects.
// Only the fields listed below take part in the comparison (for instance
// _stale_rs_metas is not compared). Checks run in the order written and stop at the
// first difference.
bool operator==(const TabletMeta& a, const TabletMeta& b) {
    // Identity and placement of the tablet.
    if (a._table_id != b._table_id || a._index_id != b._index_id ||
        a._partition_id != b._partition_id || a._tablet_id != b._tablet_id ||
        a._replica_id != b._replica_id || a._schema_hash != b._schema_hash ||
        a._shard_id != b._shard_id || a._creation_time != b._creation_time ||
        a._cumulative_layer_point != b._cumulative_layer_point ||
        a._tablet_uid != b._tablet_uid) {
        return false;
    }

    // Type, state, schema, rowsets and storage settings.
    if (a._tablet_type != b._tablet_type || a._tablet_state != b._tablet_state ||
        *a._schema != *b._schema || a._rs_metas != b._rs_metas ||
        a._in_restore_mode != b._in_restore_mode ||
        a._preferred_rowset_type != b._preferred_rowset_type ||
        a._storage_policy_id != b._storage_policy_id ||
        a._compaction_policy != b._compaction_policy) {
        return false;
    }

    // Time-series compaction tuning knobs.
    const bool time_series_settings_differ =
            a._time_series_compaction_goal_size_mbytes !=
                    b._time_series_compaction_goal_size_mbytes ||
            a._time_series_compaction_file_count_threshold !=
                    b._time_series_compaction_file_count_threshold ||
            a._time_series_compaction_time_threshold_seconds !=
                    b._time_series_compaction_time_threshold_seconds ||
            a._time_series_compaction_empty_rowsets_threshold !=
                    b._time_series_compaction_empty_rowsets_threshold ||
            a._time_series_compaction_level_threshold != b._time_series_compaction_level_threshold;
    return !time_series_settings_differ;
}
1373
1374
0
// Negation of operator==(const TabletMeta&, const TabletMeta&).
bool operator!=(const TabletMeta& a, const TabletMeta& b) {
    const bool same = (a == b);
    return !same;
}
1377
1378
// We cannot just copy the underlying memory to construct a string
1379
// due to equivalent objects may have different padding bytes.
1380
// Reading padding bytes is undefined behavior, neither copy nor
1381
// placement new will help simplify the code.
1382
// Refer to C11 standards §6.2.6.1/6 and §6.7.9/21 for more info.
1383
476k
static std::string agg_cache_key(int64_t tablet_id, const DeleteBitmap::BitmapKey& bmk) {
1384
476k
    std::string ret(sizeof(tablet_id) + sizeof(bmk), '\0');
1385
476k
    *reinterpret_cast<int64_t*>(ret.data()) = tablet_id;
1386
476k
    auto t = reinterpret_cast<DeleteBitmap::BitmapKey*>(ret.data() + sizeof(tablet_id));
1387
476k
    std::get<RowsetId>(*t).version = std::get<RowsetId>(bmk).version;
1388
476k
    std::get<RowsetId>(*t).hi = std::get<RowsetId>(bmk).hi;
1389
476k
    std::get<RowsetId>(*t).mi = std::get<RowsetId>(bmk).mi;
1390
476k
    std::get<RowsetId>(*t).lo = std::get<RowsetId>(bmk).lo;
1391
476k
    std::get<1>(*t) = std::get<1>(bmk);
1392
476k
    std::get<2>(*t) = std::get<2>(bmk);
1393
476k
    return ret;
1394
476k
}
1395
1396
// decode cache key info from a agg_cache_key
1397
static void decode_agg_cache_key(const std::string& key_str, int64_t& tablet_id,
1398
0
                                 DeleteBitmap::BitmapKey& bmk) {
1399
0
    const char* ptr = key_str.data();
1400
0
    tablet_id = *reinterpret_cast<const int64_t*>(ptr);
1401
0
    ptr += sizeof(tablet_id);
1402
0
    const auto* t = reinterpret_cast<const DeleteBitmap::BitmapKey*>(ptr);
1403
0
    std::get<RowsetId>(bmk).version = std::get<RowsetId>(*t).version;
1404
0
    std::get<RowsetId>(bmk).hi = std::get<RowsetId>(*t).hi;
1405
0
    std::get<RowsetId>(bmk).mi = std::get<RowsetId>(*t).mi;
1406
0
    std::get<RowsetId>(bmk).lo = std::get<RowsetId>(*t).lo;
1407
0
    std::get<1>(bmk) = std::get<1>(*t);
1408
0
    std::get<2>(bmk) = std::get<2>(*t);
1409
0
}
1410
1411
// Constructs the delete-bitmap aggregation cache as a size-based LRU cache with the
// given capacity. The stale sweep time comes from
// config::delete_bitmap_agg_cache_stale_sweep_time_sec.
DeleteBitmapAggCache::DeleteBitmapAggCache(size_t capacity)
        : LRUCachePolicy(CachePolicy::CacheType::DELETE_BITMAP_AGG_CACHE, capacity,
                         LRUCacheType::SIZE,
                         config::delete_bitmap_agg_cache_stale_sweep_time_sec,
                         /*num_shards*/ 256,
                         /*element_count_capacity*/ 0,
                         /*enable_prune*/ true,
                         /*is_lru_k*/ false) {}
1417
1418
771k
// Returns the cache instance owned by the global ExecEnv.
DeleteBitmapAggCache* DeleteBitmapAggCache::instance() {
    auto* exec_env = ExecEnv::GetInstance();
    return exec_env->delete_bitmap_agg_cache();
}
1421
1422
8
// Heap-allocates a new cache with the given capacity and returns the raw pointer;
// the caller is responsible for the returned object.
DeleteBitmapAggCache* DeleteBitmapAggCache::create_instance(size_t capacity) {
    auto* cache = new DeleteBitmapAggCache(capacity);
    return cache;
}
1425
1426
0
// Walks every entry of the global agg cache and copies those whose key belongs to
// `tablet_id` into a new DeleteBitmap, which is returned.
DeleteBitmap DeleteBitmapAggCache::snapshot(int64_t tablet_id) {
    DeleteBitmap result(tablet_id);
    auto collect_matching = [&](const LRUHandle* handle) {
        int64_t owner_tablet_id {0};
        DeleteBitmap::BitmapKey decoded_key;
        decode_agg_cache_key(handle->key().to_string(), owner_tablet_id, decoded_key);
        if (owner_tablet_id != tablet_id) {
            return; // entry belongs to another tablet
        }
        const auto& cached_bitmap =
                reinterpret_cast<DeleteBitmapAggCache::Value*>(handle->value)->bitmap;
        result.set(decoded_key, cached_bitmap);
    };
    DeleteBitmapAggCache::instance()->for_each_entry(collect_matching);
    return result;
}
1441
1442
608k
// Creates an empty delete bitmap associated with `tablet_id`.
DeleteBitmap::DeleteBitmap(int64_t tablet_id) : _tablet_id(tablet_id) {
}
1443
1444
1.12k
// Copy constructor: copies the tablet id and the bitmap map while holding a shared
// lock on the source. The source's _rowset_cache_version is not copied.
DeleteBitmap::DeleteBitmap(const DeleteBitmap& o) {
    std::shared_lock source_guard(o.lock);
    _tablet_id = o._tablet_id;
    delete_bitmap = o.delete_bitmap;
}
1449
1450
2.05k
// Copy assignment: copies the bitmap map and the tablet id from `o`.
// This object's lock is taken exclusively and the source's lock shared; the two locks
// are always acquired in ascending address order of the owning objects.
DeleteBitmap& DeleteBitmap::operator=(const DeleteBitmap& o) {
    if (this == &o) {
        return *this;
    }
    auto copy_fields = [&] {
        delete_bitmap = o.delete_bitmap;
        _tablet_id = o._tablet_id;
    };
    if (this < &o) {
        std::unique_lock self_guard(lock);
        std::shared_lock source_guard(o.lock);
        copy_fields();
    } else {
        std::shared_lock source_guard(o.lock);
        std::unique_lock self_guard(lock);
        copy_fields();
    }
    return *this;
}
1465
1466
0
// Move constructor: steals the bitmap map from `o` and copies its tablet id while
// holding both of the source's locks. The source's _rowset_cache_version is cleared
// rather than transferred.
DeleteBitmap::DeleteBitmap(DeleteBitmap&& o) noexcept {
    std::scoped_lock source_guard(o.lock, o._rowset_cache_version_lock);
    _tablet_id = o._tablet_id;
    delete_bitmap = std::move(o.delete_bitmap);
    o._rowset_cache_version.clear();
}
1472
1473
0
// Move assignment: steals the bitmap map from `o` and copies its tablet id.
// This object's `lock` plus both of the source's locks are acquired together through
// std::scoped_lock. The source's _rowset_cache_version is cleared, not transferred.
DeleteBitmap& DeleteBitmap::operator=(DeleteBitmap&& o) noexcept {
    if (this != &o) {
        std::scoped_lock guard(lock, o.lock, o._rowset_cache_version_lock);
        delete_bitmap = std::move(o.delete_bitmap);
        _tablet_id = o._tablet_id;
        o._rowset_cache_version.clear();
    }
    return *this;
}
1481
1482
0
// Builds a DeleteBitmap for `tablet_id` from its protobuf form.
// `pb` stores four parallel arrays (rowset ids, segment ids, versions and serialized
// bitmaps); element i of each array describes one (rowset, segment, version) entry.
// All four arrays must have the same length; this is only checked via DCHECK.
DeleteBitmap DeleteBitmap::from_pb(const DeleteBitmapPB& pb, int64_t tablet_id) {
    // Repeated-field sizes and indices are `int` in the protobuf API, so keep the loop
    // in `int` instead of mixing signed and unsigned operands.
    const int len = pb.rowset_ids().size();
    DCHECK_EQ(len, pb.segment_ids().size());
    DCHECK_EQ(len, pb.versions().size());
    // The serialized bitmaps are indexed with the same `i` below, so they must line up too.
    DCHECK_EQ(len, pb.segment_delete_bitmaps().size());
    DeleteBitmap delete_bitmap(tablet_id);
    for (int i = 0; i < len; ++i) {
        RowsetId rs_id;
        rs_id.init(pb.rowset_ids(i));
        BitmapKey key = {rs_id, pb.segment_ids(i), pb.versions(i)};
        delete_bitmap.delete_bitmap[key] =
                roaring::Roaring::read(pb.segment_delete_bitmaps(i).data());
    }
    return delete_bitmap;
}
1496
1497
0
// Serializes every entry of this delete bitmap into a DeleteBitmapPB made of four
// parallel arrays (rowset id string, segment id, version, serialized Roaring bitmap).
// A shared lock on `lock` is held for the whole traversal.
DeleteBitmapPB DeleteBitmap::to_pb() {
    std::shared_lock guard(lock);
    DeleteBitmapPB pb;
    for (const auto& entry : delete_bitmap) {
        const auto& key = entry.first;
        const auto& bitmap = entry.second;
        pb.mutable_rowset_ids()->Add(std::get<0>(key).to_string());
        pb.mutable_segment_ids()->Add(std::get<1>(key));
        pb.mutable_versions()->Add(std::get<2>(key));
        // Serialize the bitmap into a buffer of exactly the size it reports.
        std::string serialized(bitmap.getSizeInBytes(), '\0');
        bitmap.write(serialized.data());
        pb.mutable_segment_delete_bitmaps()->Add(std::move(serialized));
    }
    return pb;
}
1510
1511
1.03k
// Returns a copy of this delete bitmap.
//
// No lock is taken here on purpose: the copy constructor already acquires a shared
// lock on `lock` for the duration of the copy. Taking it here as well would make the
// same thread acquire shared ownership of the mutex twice, which is undefined
// behavior for std::shared_mutex and can deadlock if a writer starts waiting between
// the two acquisitions.
DeleteBitmap DeleteBitmap::snapshot() const {
    return DeleteBitmap(*this);
}
1515
1516
3
// Returns a copy of this delete bitmap restricted to entries whose version is not
// greater than `version`.
DeleteBitmap DeleteBitmap::snapshot(Version version) const {
    // Take a full snapshot first, then prune the private copy.
    DeleteBitmap pruned = this->snapshot();
    auto& entries = pruned.delete_bitmap;
    for (auto cur = entries.begin(); cur != entries.end();) {
        const bool too_new = std::get<2>(cur->first) > version;
        if (too_new) {
            cur = entries.erase(cur);
        } else {
            ++cur;
        }
    }
    return pruned;
}
1529
1530
838k
void DeleteBitmap::add(const BitmapKey& bmk, uint32_t row_id) {
1531
838k
    std::lock_guard l(lock);
1532
838k
    delete_bitmap[bmk].add(row_id);
1533
838k
}
1534
1535
0
int DeleteBitmap::remove(const BitmapKey& bmk, uint32_t row_id) {
1536
0
    std::lock_guard l(lock);
1537
0
    auto it = delete_bitmap.find(bmk);
1538
0
    if (it == delete_bitmap.end()) return -1;
1539
0
    it->second.remove(row_id);
1540
0
    return 0;
1541
0
}
1542
1543
4.97k
void DeleteBitmap::remove(const BitmapKey& start, const BitmapKey& end) {
1544
4.97k
    std::lock_guard l(lock);
1545
5.39k
    for (auto it = delete_bitmap.lower_bound(start); it != delete_bitmap.end();) {
1546
1.04k
        auto& [k, _] = *it;
1547
1.04k
        if (k >= end) {
1548
627
            break;
1549
627
        }
1550
419
        it = delete_bitmap.erase(it);
1551
419
    }
1552
4.97k
}
1553
1554
0
void DeleteBitmap::remove(const std::vector<std::tuple<BitmapKey, BitmapKey>>& key_ranges) {
1555
0
    std::lock_guard l(lock);
1556
0
    for (auto& [start, end] : key_ranges) {
1557
0
        for (auto it = delete_bitmap.lower_bound(start); it != delete_bitmap.end();) {
1558
0
            auto& [k, _] = *it;
1559
0
            if (k >= end) {
1560
0
                break;
1561
0
            }
1562
0
            it = delete_bitmap.erase(it);
1563
0
        }
1564
0
    }
1565
0
}
1566
1567
357k
bool DeleteBitmap::contains(const BitmapKey& bmk, uint32_t row_id) const {
1568
357k
    std::shared_lock l(lock);
1569
357k
    auto it = delete_bitmap.find(bmk);
1570
357k
    return it != delete_bitmap.end() && it->second.contains(row_id);
1571
357k
}
1572
1573
0
bool DeleteBitmap::contain_rowsets(const RowsetIdUnorderedSet& rowset_ids) const {
1574
0
    std::shared_lock l(lock);
1575
0
    return std::any_of(delete_bitmap.begin(), delete_bitmap.end(), [&](const auto& entry) {
1576
0
        return rowset_ids.contains(std::get<0>(entry.first));
1577
0
    });
1578
0
}
1579
1580
2
bool DeleteBitmap::contains_agg(const BitmapKey& bmk, uint32_t row_id) const {
1581
2
    return get_agg(bmk)->contains(row_id);
1582
2
}
1583
1584
0
bool DeleteBitmap::empty() const {
1585
0
    std::shared_lock l(lock);
1586
0
    return delete_bitmap.empty();
1587
0
}
1588
1589
17.7k
uint64_t DeleteBitmap::cardinality() const {
1590
17.7k
    std::shared_lock l(lock);
1591
17.7k
    uint64_t res = 0;
1592
74.0k
    for (auto entry : delete_bitmap) {
1593
74.0k
        if (std::get<1>(entry.first) != DeleteBitmap::INVALID_SEGMENT_ID) {
1594
3.52k
            res += entry.second.cardinality();
1595
3.52k
        }
1596
74.0k
    }
1597
17.7k
    return res;
1598
17.7k
}
1599
1600
0
uint64_t DeleteBitmap::get_size() const {
1601
0
    std::shared_lock l(lock);
1602
0
    uint64_t charge = 0;
1603
0
    for (auto& [k, v] : delete_bitmap) {
1604
0
        if (std::get<1>(k) != DeleteBitmap::INVALID_SEGMENT_ID) {
1605
0
            charge += v.getSizeInBytes();
1606
0
        }
1607
0
    }
1608
0
    return charge;
1609
0
}
1610
1611
bool DeleteBitmap::contains_agg_with_cache_if_eligible(const BitmapKey& bmk,
1612
356k
                                                       uint32_t row_id) const {
1613
356k
    g_contains_agg_with_cache_if_eligible_total << 1;
1614
356k
    int64_t start_version {0};
1615
357k
    if (config::enable_mow_get_agg_by_cache) {
1616
357k
        auto deleter = [&](Cache::Handle* handle) {
1617
27.9k
            DeleteBitmapAggCache::instance()->release(handle);
1618
27.9k
        };
1619
357k
        std::unique_ptr<Cache::Handle, decltype(deleter)> dbm_handle(nullptr, deleter);
1620
357k
        int64_t cached_version = 0;
1621
        // 1. try to lookup the desired key directly
1622
357k
        dbm_handle.reset(DeleteBitmapAggCache::instance()->lookup(agg_cache_key(_tablet_id, bmk)));
1623
357k
        if (dbm_handle != nullptr) {
1624
27.9k
            cached_version = std::get<2>(bmk);
1625
329k
        } else {
1626
            // 2. if not found, try to lookup with cached version
1627
329k
            cached_version = _get_rowset_cache_version(bmk);
1628
329k
            if (cached_version > 0) {
1629
0
                if (cached_version > std::get<2>(bmk)) {
1630
0
                    cached_version = 0;
1631
0
                } else {
1632
0
                    dbm_handle.reset(DeleteBitmapAggCache::instance()->lookup(agg_cache_key(
1633
0
                            _tablet_id, {std::get<0>(bmk), std::get<1>(bmk), cached_version})));
1634
0
                }
1635
0
            }
1636
329k
        }
1637
357k
        if (dbm_handle != nullptr) {
1638
27.9k
            const auto& cached_dbm =
1639
27.9k
                    reinterpret_cast<DeleteBitmapAggCache::Value*>(
1640
27.9k
                            DeleteBitmapAggCache::instance()->value(dbm_handle.get()))
1641
27.9k
                            ->bitmap;
1642
27.9k
            if (cached_version == std::get<2>(bmk)) {
1643
27.9k
                g_contains_agg_with_cache_if_eligible_full_hit << 1;
1644
27.9k
            } else {
1645
0
                g_contains_agg_with_cache_if_eligible_partial_hit << 1;
1646
0
            }
1647
27.9k
            if (cached_dbm.contains(row_id)) {
1648
14
                return true;
1649
14
            }
1650
27.9k
            if (cached_version == std::get<2>(bmk)) {
1651
27.9k
                return false;
1652
27.9k
            }
1653
18.4E
            start_version = cached_version + 1;
1654
18.4E
        }
1655
357k
    }
1656
328k
    DeleteBitmap::BitmapKey start {std::get<0>(bmk), std::get<1>(bmk), start_version};
1657
328k
    std::shared_lock l(lock);
1658
328k
    for (auto it = delete_bitmap.lower_bound(start); it != delete_bitmap.end(); ++it) {
1659
0
        auto& [k, bm] = *it;
1660
0
        if (std::get<0>(k) != std::get<0>(bmk) || std::get<1>(k) != std::get<1>(bmk) ||
1661
0
            std::get<2>(k) > std::get<2>(bmk)) {
1662
0
            break;
1663
0
        }
1664
0
        if (bm.contains(row_id)) {
1665
0
            return true;
1666
0
        }
1667
0
    }
1668
328k
    return false;
1669
328k
}
1670
1671
3
void DeleteBitmap::remove_sentinel_marks() {
1672
3
    std::lock_guard l(lock);
1673
23
    for (auto it = delete_bitmap.begin(), end = delete_bitmap.end(); it != end;) {
1674
20
        if (std::get<1>(it->first) == DeleteBitmap::INVALID_SEGMENT_ID) {
1675
20
            it = delete_bitmap.erase(it);
1676
20
        } else {
1677
0
            ++it;
1678
0
        }
1679
20
    }
1680
3
}
1681
1682
375
// Stores a copy of `segment_delete_bitmap` under `bmk`, overwriting any existing entry.
// Returns 1 when a new entry was inserted and 0 when an existing one was replaced.
int DeleteBitmap::set(const BitmapKey& bmk, const roaring::Roaring& segment_delete_bitmap) {
    std::lock_guard guard(lock);
    const bool inserted = delete_bitmap.insert_or_assign(bmk, segment_delete_bitmap).second;
    return inserted;
}
1687
1688
7
int DeleteBitmap::get(const BitmapKey& bmk, roaring::Roaring* segment_delete_bitmap) const {
1689
7
    std::shared_lock l(lock);
1690
7
    auto it = delete_bitmap.find(bmk);
1691
7
    if (it == delete_bitmap.end()) return -1;
1692
7
    *segment_delete_bitmap = it->second; // copy
1693
7
    return 0;
1694
7
}
1695
1696
54
// Returns the address of the bitmap stored under `bmk`, or nullptr when the key is
// absent. The shared lock is only held during the lookup; the returned pointer refers
// to the element inside the map and is used by the caller after the lock is released.
const roaring::Roaring* DeleteBitmap::get(const BitmapKey& bmk) const {
    std::shared_lock guard(lock);
    const auto found = delete_bitmap.find(bmk);
    if (found != delete_bitmap.end()) {
        return &(found->second); // get address
    }
    return nullptr;
}
1702
1703
void DeleteBitmap::subset(const BitmapKey& start, const BitmapKey& end,
1704
2.46k
                          DeleteBitmap* subset_rowset_map) const {
1705
2.46k
    DCHECK(start < end);
1706
2.46k
    std::shared_lock l(lock);
1707
2.82k
    for (auto it = delete_bitmap.lower_bound(start); it != delete_bitmap.end(); ++it) {
1708
1.02k
        auto& [k, bm] = *it;
1709
1.02k
        if (k >= end) {
1710
666
            break;
1711
666
        }
1712
360
        subset_rowset_map->set(k, bm);
1713
360
    }
1714
2.46k
}
1715
1716
void DeleteBitmap::subset(std::vector<std::pair<RowsetId, int64_t>>& rowset_ids,
1717
                          int64_t start_version, int64_t end_version,
1718
0
                          DeleteBitmap* subset_delete_map) const {
1719
0
    DCHECK(start_version <= end_version);
1720
0
    for (auto& [rowset_id, _] : rowset_ids) {
1721
0
        BitmapKey start {rowset_id, 0, 0};
1722
0
        BitmapKey end {rowset_id, UINT32_MAX, end_version + 1};
1723
0
        std::shared_lock l(lock);
1724
0
        for (auto it = delete_bitmap.lower_bound(start); it != delete_bitmap.end(); ++it) {
1725
0
            auto& [k, bm] = *it;
1726
0
            if (k >= end) {
1727
0
                break;
1728
0
            }
1729
0
            auto version = std::get<2>(k);
1730
0
            if (version >= start_version && version <= end_version) {
1731
0
                subset_delete_map->merge(k, bm);
1732
0
                VLOG_DEBUG << "subset delete bitmap, tablet=" << _tablet_id << ", version=["
1733
0
                           << start_version << ", " << end_version
1734
0
                           << "]. rowset=" << std::get<0>(k).to_string()
1735
0
                           << ", segment=" << std::get<1>(k) << ", version=" << version
1736
0
                           << ", cardinality=" << bm.cardinality();
1737
0
            }
1738
0
        }
1739
0
    }
1740
0
}
1741
1742
void DeleteBitmap::subset_and_agg(std::vector<std::pair<RowsetId, int64_t>>& rowset_ids,
1743
                                  int64_t start_version, int64_t end_version,
1744
1
                                  DeleteBitmap* subset_delete_map) const {
1745
1
    DCHECK(start_version <= end_version);
1746
2
    for (auto& [rowset_id, segment_num] : rowset_ids) {
1747
6
        for (int64_t seg_id = 0; seg_id < segment_num; ++seg_id) {
1748
4
            BitmapKey end {rowset_id, seg_id, end_version};
1749
4
            auto bm = get_agg_without_cache(end, start_version);
1750
4
            VLOG_DEBUG << "subset delete bitmap, tablet=" << _tablet_id << ", rowset=" << rowset_id
1751
0
                       << ", segment=" << seg_id << ", version=[" << start_version << "-"
1752
0
                       << end_version << "], cardinality=" << bm->cardinality();
1753
4
            if (bm->isEmpty()) {
1754
0
                continue;
1755
0
            }
1756
4
            subset_delete_map->merge(end, *bm);
1757
4
        }
1758
2
    }
1759
1
}
1760
1761
79
size_t DeleteBitmap::get_count_with_range(const BitmapKey& start, const BitmapKey& end) const {
1762
79
    DCHECK(start < end);
1763
79
    size_t count = 0;
1764
79
    std::shared_lock l(lock);
1765
136
    for (auto it = delete_bitmap.lower_bound(start); it != delete_bitmap.end(); ++it) {
1766
112
        auto& [k, bm] = *it;
1767
112
        if (k >= end) {
1768
55
            break;
1769
55
        }
1770
57
        count++;
1771
57
    }
1772
79
    return count;
1773
79
}
1774
1775
1.04k
// Merges `segment_delete_bitmap` into the entry stored under `bmk`: inserts a copy
// when the key is new, otherwise ORs it into the existing bitmap.
// Takes `lock` exclusively.
void DeleteBitmap::merge(const BitmapKey& bmk, const roaring::Roaring& segment_delete_bitmap) {
    std::lock_guard guard(lock);
    auto emplaced = delete_bitmap.emplace(bmk, segment_delete_bitmap);
    const bool key_already_present = !emplaced.second;
    if (key_already_present) {
        emplaced.first->second |= segment_delete_bitmap;
    }
}
1782
1783
3.71k
void DeleteBitmap::merge(const DeleteBitmap& other) {
1784
3.71k
    std::lock_guard l(lock);
1785
3.71k
    for (auto& i : other.delete_bitmap) {
1786
49
        auto [j, succ] = this->delete_bitmap.insert(i);
1787
49
        if (!succ) j->second |= i.second;
1788
49
    }
1789
3.71k
}
1790
1791
20.2k
uint64_t DeleteBitmap::get_delete_bitmap_count() {
1792
20.2k
    std::shared_lock l(lock);
1793
20.2k
    uint64_t count = 0;
1794
47.7k
    for (auto it = delete_bitmap.begin(); it != delete_bitmap.end(); it++) {
1795
27.5k
        if (std::get<1>(it->first) != DeleteBitmap::INVALID_SEGMENT_ID) {
1796
1.94k
            count++;
1797
1.94k
        }
1798
27.5k
    }
1799
20.2k
    return count;
1800
20.2k
}
1801
1802
void DeleteBitmap::traverse_rowset_and_version(
1803
0
        const std::function<int(const RowsetId& rowsetId, int64_t version)>& func) const {
1804
0
    std::shared_lock l(lock);
1805
0
    auto it = delete_bitmap.cbegin();
1806
0
    while (it != delete_bitmap.cend()) {
1807
0
        RowsetId rowset_id = std::get<0>(it->first);
1808
0
        int64_t version = std::get<2>(it->first);
1809
0
        int result = func(rowset_id, version);
1810
0
        if (result == -2) {
1811
            // find next <rowset, version>
1812
0
            it++;
1813
0
        } else {
1814
            // find next <rowset>
1815
0
            it = delete_bitmap.upper_bound({rowset_id, std::numeric_limits<SegmentId>::max(),
1816
0
                                            std::numeric_limits<Version>::max()});
1817
0
        }
1818
0
    }
1819
0
}
1820
1821
0
bool DeleteBitmap::has_calculated_for_multi_segments(const RowsetId& rowset_id) const {
1822
0
    return contains({rowset_id, INVALID_SEGMENT_ID, TEMP_VERSION_COMMON}, ROWSET_SENTINEL_MARK);
1823
0
}
1824
1825
9.64k
size_t DeleteBitmap::remove_rowset_cache_version(const RowsetId& rowset_id) {
1826
9.64k
    std::lock_guard l(_rowset_cache_version_lock);
1827
9.64k
    _rowset_cache_version.erase(rowset_id);
1828
18.4E
    VLOG_DEBUG << "remove agg cache version for tablet=" << _tablet_id
1829
18.4E
               << ", rowset=" << rowset_id.to_string();
1830
9.64k
    return _rowset_cache_version.size();
1831
9.64k
}
1832
1833
0
void DeleteBitmap::clear_rowset_cache_version() {
1834
0
    std::lock_guard l(_rowset_cache_version_lock);
1835
0
    _rowset_cache_version.clear();
1836
0
    VLOG_DEBUG << "clear agg cache version for tablet=" << _tablet_id;
1837
0
}
1838
1839
0
std::set<std::string> DeleteBitmap::get_rowset_cache_version() {
1840
0
    std::set<std::string> set;
1841
0
    std::shared_lock l(_rowset_cache_version_lock);
1842
0
    for (auto& [k, _] : _rowset_cache_version) {
1843
0
        set.insert(k.to_string());
1844
0
    }
1845
0
    return set;
1846
0
}
1847
1848
336k
// Returns the cache version recorded for the (rowset, segment) of `bmk`, or 0 when
// either the rowset or the segment has no record.
// Takes _rowset_cache_version_lock shared.
DeleteBitmap::Version DeleteBitmap::_get_rowset_cache_version(const BitmapKey& bmk) const {
    std::shared_lock guard(_rowset_cache_version_lock);
    const auto rowset_it = _rowset_cache_version.find(std::get<0>(bmk));
    if (rowset_it == _rowset_cache_version.end()) {
        return 0;
    }
    auto& per_segment_versions = rowset_it->second;
    const auto segment_it = per_segment_versions.find(std::get<1>(bmk));
    if (segment_it == per_segment_versions.end()) {
        return 0;
    }
    return segment_it->second;
}
1859
1860
0
// Returns the agg-cache entries that belong to this bitmap's tablet, as collected by
// DeleteBitmapAggCache::snapshot().
DeleteBitmap DeleteBitmap::agg_cache_snapshot() {
    auto* agg_cache = DeleteBitmapAggCache::instance();
    return agg_cache->snapshot(_tablet_id);
}
1863
1864
589k
void DeleteBitmap::set_tablet_id(int64_t tablet_id) {
1865
589k
    _tablet_id = tablet_id;
1866
589k
}
1867
1868
118k
std::shared_ptr<roaring::Roaring> DeleteBitmap::get_agg(const BitmapKey& bmk) const {
1869
118k
    std::string key_str = agg_cache_key(_tablet_id, bmk); // Cache key container
1870
118k
    CacheKey key(key_str);
1871
118k
    Cache::Handle* handle = DeleteBitmapAggCache::instance()->lookup(key);
1872
1873
118k
    DeleteBitmapAggCache::Value* val =
1874
118k
            handle == nullptr ? nullptr
1875
118k
                              : reinterpret_cast<DeleteBitmapAggCache::Value*>(
1876
111k
                                        DeleteBitmapAggCache::instance()->value(handle));
1877
    // FIXME: do we need a mutex here to get rid of duplicated initializations
1878
    //        of cache entries in some cases?
1879
118k
    if (val == nullptr) { // Renew if needed, put a new Value to cache
1880
7.62k
        val = new DeleteBitmapAggCache::Value();
1881
7.62k
        Version start_version =
1882
7.62k
                config::enable_mow_get_agg_by_cache ? _get_rowset_cache_version(bmk) : 0;
1883
7.62k
        if (start_version > 0) {
1884
788
            Cache::Handle* handle2 = DeleteBitmapAggCache::instance()->lookup(
1885
788
                    agg_cache_key(_tablet_id, {std::get<0>(bmk), std::get<1>(bmk), start_version}));
1886
1887
788
            DBUG_EXECUTE_IF("DeleteBitmap::get_agg.cache_miss", {
1888
788
                if (handle2 != nullptr) {
1889
788
                    auto p = dp->param("percent", 0.3);
1890
788
                    std::mt19937 gen {std::random_device {}()};
1891
788
                    std::bernoulli_distribution inject_fault {p};
1892
788
                    if (inject_fault(gen)) {
1893
788
                        LOG_INFO("injection DeleteBitmap::get_agg.cache_miss, tablet_id={}",
1894
788
                                 _tablet_id);
1895
788
                        handle2 = nullptr;
1896
788
                    }
1897
788
                }
1898
788
            });
1899
788
            if (handle2 == nullptr || start_version > std::get<2>(bmk)) {
1900
3
                start_version = 0;
1901
785
            } else {
1902
785
                val->bitmap |= reinterpret_cast<DeleteBitmapAggCache::Value*>(
1903
785
                                       DeleteBitmapAggCache::instance()->value(handle2))
1904
785
                                       ->bitmap;
1905
785
                VLOG_DEBUG << "get agg cache version=" << start_version
1906
0
                           << " for tablet=" << _tablet_id
1907
0
                           << ", rowset=" << std::get<0>(bmk).to_string()
1908
0
                           << ", segment=" << std::get<1>(bmk);
1909
785
                start_version += 1;
1910
785
            }
1911
788
            if (handle2 != nullptr) {
1912
788
                DeleteBitmapAggCache::instance()->release(handle2);
1913
788
            }
1914
788
        }
1915
7.62k
        {
1916
7.62k
            std::shared_lock l(lock);
1917
7.62k
            DeleteBitmap::BitmapKey start {std::get<0>(bmk), std::get<1>(bmk), start_version};
1918
8.19k
            for (auto it = delete_bitmap.lower_bound(start); it != delete_bitmap.end(); ++it) {
1919
2.26k
                auto& [k, bm] = *it;
1920
2.26k
                if (std::get<0>(k) != std::get<0>(bmk) || std::get<1>(k) != std::get<1>(bmk) ||
1921
2.26k
                    std::get<2>(k) > std::get<2>(bmk)) {
1922
1.69k
                    break;
1923
1.69k
                }
1924
570
                val->bitmap |= bm;
1925
570
            }
1926
7.62k
        }
1927
7.62k
        size_t charge = val->bitmap.getSizeInBytes() + sizeof(DeleteBitmapAggCache::Value);
1928
7.62k
        handle = DeleteBitmapAggCache::instance()->insert(key, val, charge, charge,
1929
7.62k
                                                          CachePriority::NORMAL);
1930
7.62k
        if (config::enable_mow_get_agg_by_cache && !val->bitmap.isEmpty()) {
1931
1.32k
            std::lock_guard l(_rowset_cache_version_lock);
1932
            // this version is already agg
1933
1.32k
            _rowset_cache_version[std::get<0>(bmk)][std::get<1>(bmk)] = std::get<2>(bmk);
1934
1.32k
            VLOG_DEBUG << "set agg cache version=" << std::get<2>(bmk)
1935
1
                       << " for tablet=" << _tablet_id
1936
1
                       << ", rowset=" << std::get<0>(bmk).to_string()
1937
1
                       << ", segment=" << std::get<1>(bmk);
1938
1.32k
        }
1939
7.62k
        if (start_version > 0 && config::enable_mow_get_agg_correctness_check_core) {
1940
0
            std::shared_ptr<roaring::Roaring> bitmap = get_agg_without_cache(bmk);
1941
0
            if (val->bitmap != *bitmap) {
1942
0
                CHECK(false) << ". get agg correctness check failed for tablet=" << _tablet_id
1943
0
                             << ", rowset=" << std::get<0>(bmk).to_string()
1944
0
                             << ", segment=" << std::get<1>(bmk) << ", version=" << std::get<2>(bmk)
1945
0
                             << ". start_version from cache=" << start_version
1946
0
                             << ", delete_bitmap cardinality with cache="
1947
0
                             << val->bitmap.cardinality()
1948
0
                             << ", delete_bitmap cardinality without cache="
1949
0
                             << bitmap->cardinality();
1950
0
            }
1951
0
        }
1952
7.62k
    }
1953
1954
    // It is natural for the cache to reclaim the underlying memory
1955
118k
    return std::shared_ptr<roaring::Roaring>(
1956
118k
            &val->bitmap, [handle](...) { DeleteBitmapAggCache::instance()->release(handle); });
1957
118k
}
1958
1959
std::shared_ptr<roaring::Roaring> DeleteBitmap::get_agg_without_cache(
1960
426
        const BitmapKey& bmk, const int64_t start_version) const {
1961
426
    std::shared_ptr<roaring::Roaring> bitmap = std::make_shared<roaring::Roaring>();
1962
426
    std::shared_lock l(lock);
1963
426
    DeleteBitmap::BitmapKey start {std::get<0>(bmk), std::get<1>(bmk), start_version};
1964
446
    for (auto it = delete_bitmap.lower_bound(start); it != delete_bitmap.end(); ++it) {
1965
44
        auto& [k, bm] = *it;
1966
44
        if (std::get<0>(k) != std::get<0>(bmk) || std::get<1>(k) != std::get<1>(bmk) ||
1967
44
            std::get<2>(k) > std::get<2>(bmk)) {
1968
24
            break;
1969
24
        }
1970
20
        *bitmap |= bm;
1971
20
    }
1972
426
    return bitmap;
1973
426
}
1974
1975
0
DeleteBitmap DeleteBitmap::diffset(const std::set<BitmapKey>& key_set) const {
1976
0
    std::shared_lock l(lock);
1977
0
    auto diff_key_set_view =
1978
0
            delete_bitmap | std::ranges::views::transform([](const auto& kv) { return kv.first; }) |
1979
0
            std::ranges::views::filter(
1980
0
                    [&key_set](const auto& key) { return !key_set.contains(key); });
1981
1982
0
    DeleteBitmap dbm(_tablet_id);
1983
0
    for (const auto& key : diff_key_set_view) {
1984
0
        const auto* bitmap = get(key);
1985
0
        DCHECK_NE(bitmap, nullptr);
1986
0
        dbm.delete_bitmap[key] = *bitmap;
1987
0
    }
1988
0
    return dbm;
1989
0
}
1990
1991
0
std::string tablet_state_name(TabletState state) {
1992
0
    switch (state) {
1993
0
    case TABLET_NOTREADY:
1994
0
        return "TABLET_NOTREADY";
1995
1996
0
    case TABLET_RUNNING:
1997
0
        return "TABLET_RUNNING";
1998
1999
0
    case TABLET_TOMBSTONED:
2000
0
        return "TABLET_TOMBSTONED";
2001
2002
0
    case TABLET_STOPPED:
2003
0
        return "TABLET_STOPPED";
2004
2005
0
    case TABLET_SHUTDOWN:
2006
0
        return "TABLET_SHUTDOWN";
2007
2008
0
    default:
2009
0
        return "TabletState(" + std::to_string(state) + ")";
2010
0
    }
2011
0
}
2012
2013
} // namespace doris