Coverage Report

Created: 2026-02-24 17:57

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/root/doris/be/src/olap/tablet_meta.cpp
Line
Count
Source
1
// Licensed to the Apache Software Foundation (ASF) under one
2
// or more contributor license agreements.  See the NOTICE file
3
// distributed with this work for additional information
4
// regarding copyright ownership.  The ASF licenses this file
5
// to you under the Apache License, Version 2.0 (the
6
// "License"); you may not use this file except in compliance
7
// with the License.  You may obtain a copy of the License at
8
//
9
//   http://www.apache.org/licenses/LICENSE-2.0
10
//
11
// Unless required by applicable law or agreed to in writing,
12
// software distributed under the License is distributed on an
13
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14
// KIND, either express or implied.  See the License for the
15
// specific language governing permissions and limitations
16
// under the License.
17
18
#include "olap/tablet_meta.h"
19
20
#include <bvar/bvar.h>
21
#include <gen_cpp/Descriptors_types.h>
22
#include <gen_cpp/FrontendService_types.h>
23
#include <gen_cpp/Types_types.h>
24
#include <gen_cpp/olap_common.pb.h>
25
#include <gen_cpp/olap_file.pb.h>
26
#include <gen_cpp/segment_v2.pb.h>
27
#include <gen_cpp/types.pb.h>
28
#include <json2pb/pb_to_json.h>
29
#include <time.h>
30
31
#include <cstdint>
32
#include <memory>
33
#include <random>
34
#include <set>
35
#include <utility>
36
37
#include "cloud/cloud_meta_mgr.h"
38
#include "cloud/cloud_storage_engine.h"
39
#include "cloud/config.h"
40
#include "common/config.h"
41
#include "io/fs/file_writer.h"
42
#include "io/fs/local_file_system.h"
43
#include "olap/data_dir.h"
44
#include "olap/file_header.h"
45
#include "olap/lru_cache.h"
46
#include "olap/olap_common.h"
47
#include "olap/olap_define.h"
48
#include "olap/rowset/rowset.h"
49
#include "olap/rowset/rowset_meta_manager.h"
50
#include "olap/tablet_fwd.h"
51
#include "olap/tablet_meta_manager.h"
52
#include "olap/tablet_schema_cache.h"
53
#include "olap/utils.h"
54
#include "util/debug_points.h"
55
#include "util/mem_info.h"
56
#include "util/parse_util.h"
57
#include "util/string_util.h"
58
#include "util/time.h"
59
#include "util/uid_util.h"
60
61
using std::string;
62
using std::unordered_map;
63
using std::vector;
64
65
namespace doris {
66
#include "common/compile_check_begin.h"
67
using namespace ErrorCode;
68
69
bvar::Adder<uint64_t> g_contains_agg_with_cache_if_eligible_total(
70
        "g_contains_agg_with_cache_if_eligible_total");
71
bvar::Adder<uint64_t> g_contains_agg_with_cache_if_eligible_partial_hit(
72
        "g_contains_agg_with_cache_if_eligible_partial_hit");
73
bvar::Adder<uint64_t> g_contains_agg_with_cache_if_eligible_full_hit(
74
        "g_contains_agg_with_cache_if_eligible_full_hit");
75
bvar::Window<bvar::Adder<uint64_t>> g_contains_agg_with_cache_if_eligible_total_minute(
76
        "g_contains_agg_with_cache_if_eligible_total_1m",
77
        &g_contains_agg_with_cache_if_eligible_total, 60);
78
bvar::Window<bvar::Adder<uint64_t>> g_contains_agg_with_cache_if_eligible_partial_hit_minute(
79
        "g_contains_agg_with_cache_if_eligible_partial_hit_1m",
80
        &g_contains_agg_with_cache_if_eligible_partial_hit, 60);
81
bvar::Window<bvar::Adder<uint64_t>> g_contains_agg_with_cache_if_eligible_full_hit_minute(
82
        "g_contains_agg_with_cache_if_eligible_full_hit_1m",
83
        &g_contains_agg_with_cache_if_eligible_full_hit, 60);
84
85
namespace {
86
87
3
inline PatternTypePB to_pattern_type_pb(TPatternType::type pattern_type) {
88
3
    return static_cast<PatternTypePB>(pattern_type);
89
3
}
90
91
} // namespace
92
93
TabletMetaSharedPtr TabletMeta::create(
94
        const TCreateTabletReq& request, const TabletUid& tablet_uid, uint64_t shard_id,
95
        uint32_t next_unique_id,
96
304
        const unordered_map<uint32_t, uint32_t>& col_ordinal_to_unique_id) {
97
304
    std::optional<TBinlogConfig> binlog_config;
98
304
    if (request.__isset.binlog_config) {
99
0
        binlog_config = request.binlog_config;
100
0
    }
101
304
    TInvertedIndexFileStorageFormat::type inverted_index_file_storage_format =
102
304
            request.inverted_index_file_storage_format;
103
104
    // We will discard this format. Don't make any further changes here.
105
304
    if (request.__isset.inverted_index_storage_format) {
106
304
        switch (request.inverted_index_storage_format) {
107
0
        case TInvertedIndexStorageFormat::V1:
108
0
            inverted_index_file_storage_format = TInvertedIndexFileStorageFormat::V1;
109
0
            break;
110
0
        case TInvertedIndexStorageFormat::V2:
111
0
            inverted_index_file_storage_format = TInvertedIndexFileStorageFormat::V2;
112
0
            break;
113
304
        default:
114
304
            break;
115
304
        }
116
304
    }
117
    // Decide storage format for this tablet. DEFAULT / not-set fall back to V2 on BE side.
118
304
    TStorageFormat::type storage_format =
119
304
            request.__isset.storage_format ? request.storage_format : TStorageFormat::V2;
120
304
    return std::make_shared<TabletMeta>(
121
304
            request.table_id, request.partition_id, request.tablet_id, request.replica_id,
122
304
            request.tablet_schema.schema_hash, shard_id, request.tablet_schema, next_unique_id,
123
304
            col_ordinal_to_unique_id, tablet_uid,
124
304
            request.__isset.tablet_type ? request.tablet_type : TTabletType::TABLET_TYPE_DISK,
125
304
            request.__isset.compression_type ? request.compression_type : TCompressionType::LZ4F,
126
304
            request.__isset.storage_policy_id ? request.storage_policy_id : -1,
127
304
            request.__isset.enable_unique_key_merge_on_write
128
304
                    ? request.enable_unique_key_merge_on_write
129
304
                    : false,
130
304
            std::move(binlog_config), request.compaction_policy,
131
304
            request.time_series_compaction_goal_size_mbytes,
132
304
            request.time_series_compaction_file_count_threshold,
133
304
            request.time_series_compaction_time_threshold_seconds,
134
304
            request.time_series_compaction_empty_rowsets_threshold,
135
304
            request.time_series_compaction_level_threshold, inverted_index_file_storage_format,
136
304
            request.tde_algorithm, storage_format);
137
304
}
138
139
1.35k
TabletMeta::~TabletMeta() {
140
1.35k
    if (_handle) {
141
1.14k
        TabletSchemaCache::instance()->release(_handle);
142
1.14k
    }
143
1.35k
}
144
145
TabletMeta::TabletMeta()
146
557
        : _tablet_uid(0, 0),
147
557
          _schema(new TabletSchema),
148
557
          _delete_bitmap(new DeleteBitmap(_tablet_id)) {}
149
150
TabletMeta::TabletMeta(int64_t table_id, int64_t partition_id, int64_t tablet_id,
151
                       int64_t replica_id, int32_t schema_hash, int32_t shard_id,
152
                       const TTabletSchema& tablet_schema, uint32_t next_unique_id,
153
                       const std::unordered_map<uint32_t, uint32_t>& col_ordinal_to_unique_id,
154
                       TabletUid tablet_uid, TTabletType::type tabletType,
155
                       TCompressionType::type compression_type, int64_t storage_policy_id,
156
                       bool enable_unique_key_merge_on_write,
157
                       std::optional<TBinlogConfig> binlog_config, std::string compaction_policy,
158
                       int64_t time_series_compaction_goal_size_mbytes,
159
                       int64_t time_series_compaction_file_count_threshold,
160
                       int64_t time_series_compaction_time_threshold_seconds,
161
                       int64_t time_series_compaction_empty_rowsets_threshold,
162
                       int64_t time_series_compaction_level_threshold,
163
                       TInvertedIndexFileStorageFormat::type inverted_index_file_storage_format,
164
                       TEncryptionAlgorithm::type tde_algorithm,
165
                       TStorageFormat::type storage_format)
166
645
        : _tablet_uid(0, 0),
167
645
          _schema(new TabletSchema),
168
645
          _delete_bitmap(new DeleteBitmap(tablet_id)),
169
645
          _storage_format(storage_format) {
170
645
    TabletMetaPB tablet_meta_pb;
171
645
    tablet_meta_pb.set_table_id(table_id);
172
645
    tablet_meta_pb.set_partition_id(partition_id);
173
645
    tablet_meta_pb.set_tablet_id(tablet_id);
174
645
    tablet_meta_pb.set_replica_id(replica_id);
175
645
    tablet_meta_pb.set_schema_hash(schema_hash);
176
645
    tablet_meta_pb.set_shard_id(shard_id);
177
    // Persist the creation time, but it is not used
178
645
    tablet_meta_pb.set_creation_time(time(nullptr));
179
645
    tablet_meta_pb.set_cumulative_layer_point(-1);
180
645
    tablet_meta_pb.set_tablet_state(PB_RUNNING);
181
645
    *(tablet_meta_pb.mutable_tablet_uid()) = tablet_uid.to_proto();
182
645
    tablet_meta_pb.set_tablet_type(tabletType == TTabletType::TABLET_TYPE_DISK
183
645
                                           ? TabletTypePB::TABLET_TYPE_DISK
184
645
                                           : TabletTypePB::TABLET_TYPE_MEMORY);
185
645
    tablet_meta_pb.set_enable_unique_key_merge_on_write(enable_unique_key_merge_on_write);
186
645
    tablet_meta_pb.set_storage_policy_id(storage_policy_id);
187
645
    tablet_meta_pb.set_compaction_policy(compaction_policy);
188
645
    tablet_meta_pb.set_time_series_compaction_goal_size_mbytes(
189
645
            time_series_compaction_goal_size_mbytes);
190
645
    tablet_meta_pb.set_time_series_compaction_file_count_threshold(
191
645
            time_series_compaction_file_count_threshold);
192
645
    tablet_meta_pb.set_time_series_compaction_time_threshold_seconds(
193
645
            time_series_compaction_time_threshold_seconds);
194
645
    tablet_meta_pb.set_time_series_compaction_empty_rowsets_threshold(
195
645
            time_series_compaction_empty_rowsets_threshold);
196
645
    tablet_meta_pb.set_time_series_compaction_level_threshold(
197
645
            time_series_compaction_level_threshold);
198
645
    TabletSchemaPB* schema = tablet_meta_pb.mutable_schema();
199
645
    schema->set_num_short_key_columns(tablet_schema.short_key_column_count);
200
645
    schema->set_num_rows_per_row_block(config::default_num_rows_per_column_file_block);
201
645
    schema->set_sequence_col_idx(tablet_schema.sequence_col_idx);
202
645
    auto p_seq_map = schema->mutable_seq_map(); // ColumnGroupsPB
203
204
645
    for (auto& it : tablet_schema.seq_map) { // std::vector< ::doris::TColumnGroup>
205
0
        uint32_t key = it.sequence_column;
206
0
        ColumnGroupPB* cg_pb = p_seq_map->add_cg(); // ColumnGroupPB {key: {v1, v2, v3}}
207
0
        cg_pb->set_sequence_column(key);
208
0
        for (auto v : it.columns_in_group) {
209
0
            cg_pb->add_columns_in_group(v);
210
0
        }
211
0
    }
212
645
    switch (tablet_schema.keys_type) {
213
50
    case TKeysType::DUP_KEYS:
214
50
        schema->set_keys_type(KeysType::DUP_KEYS);
215
50
        break;
216
303
    case TKeysType::UNIQUE_KEYS:
217
303
        schema->set_keys_type(KeysType::UNIQUE_KEYS);
218
303
        break;
219
68
    case TKeysType::AGG_KEYS:
220
68
        schema->set_keys_type(KeysType::AGG_KEYS);
221
68
        break;
222
224
    default:
223
224
        LOG(WARNING) << "unknown tablet keys type";
224
224
        break;
225
645
    }
226
    // compress_kind used to compress segment files
227
645
    schema->set_compress_kind(COMPRESS_LZ4);
228
229
    // compression_type used to compress segment page
230
645
    switch (compression_type) {
231
0
    case TCompressionType::NO_COMPRESSION:
232
0
        schema->set_compression_type(segment_v2::NO_COMPRESSION);
233
0
        break;
234
0
    case TCompressionType::SNAPPY:
235
0
        schema->set_compression_type(segment_v2::SNAPPY);
236
0
        break;
237
0
    case TCompressionType::LZ4:
238
0
        schema->set_compression_type(segment_v2::LZ4);
239
0
        break;
240
645
    case TCompressionType::LZ4F:
241
645
        schema->set_compression_type(segment_v2::LZ4F);
242
645
        break;
243
0
    case TCompressionType::ZLIB:
244
0
        schema->set_compression_type(segment_v2::ZLIB);
245
0
        break;
246
0
    case TCompressionType::ZSTD:
247
0
        schema->set_compression_type(segment_v2::ZSTD);
248
0
        break;
249
0
    default:
250
0
        schema->set_compression_type(segment_v2::LZ4F);
251
0
        break;
252
645
    }
253
254
645
    switch (inverted_index_file_storage_format) {
255
0
    case TInvertedIndexFileStorageFormat::V1:
256
0
        schema->set_inverted_index_storage_format(InvertedIndexStorageFormatPB::V1);
257
0
        break;
258
645
    case TInvertedIndexFileStorageFormat::V2:
259
645
        schema->set_inverted_index_storage_format(InvertedIndexStorageFormatPB::V2);
260
645
        break;
261
0
    case TInvertedIndexFileStorageFormat::V3:
262
0
        schema->set_inverted_index_storage_format(InvertedIndexStorageFormatPB::V3);
263
0
        break;
264
0
    default:
265
0
        schema->set_inverted_index_storage_format(InvertedIndexStorageFormatPB::V3);
266
0
        break;
267
645
    }
268
269
645
    switch (tablet_schema.sort_type) {
270
0
    case TSortType::type::ZORDER:
271
0
        schema->set_sort_type(SortType::ZORDER);
272
0
        break;
273
645
    default:
274
645
        schema->set_sort_type(SortType::LEXICAL);
275
645
    }
276
645
    schema->set_sort_col_num(tablet_schema.sort_col_num);
277
645
    for (const auto& i : tablet_schema.cluster_key_uids) {
278
2
        schema->add_cluster_key_uids(i);
279
2
    }
280
645
    tablet_meta_pb.set_in_restore_mode(false);
281
282
    // set column information
283
645
    uint32_t col_ordinal = 0;
284
645
    bool has_bf_columns = false;
285
2.22k
    for (TColumn tcolumn : tablet_schema.columns) {
286
2.22k
        ColumnPB* column = schema->add_column();
287
2.22k
        uint32_t unique_id = -1;
288
2.22k
        if (tcolumn.col_unique_id >= 0) {
289
11
            unique_id = tcolumn.col_unique_id;
290
2.21k
        } else {
291
2.21k
            unique_id = col_ordinal_to_unique_id.at(col_ordinal);
292
2.21k
        }
293
2.22k
        col_ordinal++;
294
2.22k
        init_column_from_tcolumn(unique_id, tcolumn, column);
295
296
2.22k
        if (column->is_bf_column()) {
297
0
            has_bf_columns = true;
298
0
        }
299
300
2.22k
        if (tablet_schema.__isset.indexes) {
301
2
            for (auto& index : tablet_schema.indexes) {
302
2
                if (index.index_type == TIndexType::type::BLOOMFILTER ||
303
2
                    index.index_type == TIndexType::type::NGRAM_BF) {
304
0
                    DCHECK_EQ(index.columns.size(), 1);
305
0
                    if (iequal(tcolumn.column_name, index.columns[0])) {
306
0
                        column->set_is_bf_column(true);
307
0
                        break;
308
0
                    }
309
0
                }
310
2
            }
311
2
        }
312
2.22k
    }
313
314
    // copy index meta
315
645
    if (tablet_schema.__isset.indexes) {
316
1
        for (auto& index : tablet_schema.indexes) {
317
1
            TabletIndexPB* index_pb = schema->add_index();
318
1
            index_pb->set_index_id(index.index_id);
319
1
            index_pb->set_index_name(index.index_name);
320
            // init col_unique_id in index at be side, since col_unique_id may be -1 at fe side
321
            // get column unique id by name
322
1
            for (auto column_name : index.columns) {
323
2
                for (auto column : schema->column()) {
324
2
                    if (iequal(column.name(), column_name)) {
325
1
                        index_pb->add_col_unique_id(column.unique_id());
326
1
                    }
327
2
                }
328
1
            }
329
1
            switch (index.index_type) {
330
1
            case TIndexType::BITMAP:
331
1
                index_pb->set_index_type(IndexType::BITMAP);
332
1
                break;
333
0
            case TIndexType::INVERTED:
334
0
                index_pb->set_index_type(IndexType::INVERTED);
335
0
                break;
336
0
            case TIndexType::ANN:
337
0
                index_pb->set_index_type(IndexType::ANN);
338
0
                break;
339
0
            case TIndexType::BLOOMFILTER:
340
0
                index_pb->set_index_type(IndexType::BLOOMFILTER);
341
0
                break;
342
0
            case TIndexType::NGRAM_BF:
343
0
                index_pb->set_index_type(IndexType::NGRAM_BF);
344
0
                break;
345
1
            }
346
347
1
            if (index.__isset.properties) {
348
0
                auto properties = index_pb->mutable_properties();
349
0
                for (auto kv : index.properties) {
350
0
                    (*properties)[kv.first] = kv.second;
351
0
                }
352
0
            }
353
1
        }
354
1
    }
355
356
645
    schema->set_next_column_unique_id(next_unique_id);
357
645
    if (has_bf_columns && tablet_schema.__isset.bloom_filter_fpp) {
358
0
        schema->set_bf_fpp(tablet_schema.bloom_filter_fpp);
359
0
    }
360
361
645
    if (tablet_schema.__isset.is_in_memory) {
362
0
        schema->set_is_in_memory(tablet_schema.is_in_memory);
363
0
    }
364
365
645
    if (tablet_schema.__isset.disable_auto_compaction) {
366
10
        schema->set_disable_auto_compaction(tablet_schema.disable_auto_compaction);
367
10
    }
368
369
645
    if (tablet_schema.__isset.variant_enable_flatten_nested) {
370
645
        schema->set_enable_variant_flatten_nested(tablet_schema.variant_enable_flatten_nested);
371
645
    }
372
373
645
    if (tablet_schema.__isset.enable_single_replica_compaction) {
374
645
        schema->set_enable_single_replica_compaction(
375
645
                tablet_schema.enable_single_replica_compaction);
376
645
    }
377
378
645
    if (tablet_schema.__isset.delete_sign_idx) {
379
645
        schema->set_delete_sign_idx(tablet_schema.delete_sign_idx);
380
645
    }
381
645
    if (tablet_schema.__isset.store_row_column) {
382
645
        schema->set_store_row_column(tablet_schema.store_row_column);
383
645
    }
384
645
    if (tablet_schema.__isset.row_store_page_size) {
385
645
        schema->set_row_store_page_size(tablet_schema.row_store_page_size);
386
645
    }
387
645
    if (tablet_schema.__isset.storage_page_size) {
388
645
        schema->set_storage_page_size(tablet_schema.storage_page_size);
389
645
    }
390
645
    if (tablet_schema.__isset.storage_dict_page_size) {
391
645
        schema->set_storage_dict_page_size(tablet_schema.storage_dict_page_size);
392
645
    }
393
645
    if (tablet_schema.__isset.skip_write_index_on_load) {
394
645
        schema->set_skip_write_index_on_load(tablet_schema.skip_write_index_on_load);
395
645
    }
396
645
    if (tablet_schema.__isset.row_store_col_cids) {
397
0
        schema->mutable_row_store_column_unique_ids()->Add(tablet_schema.row_store_col_cids.begin(),
398
0
                                                           tablet_schema.row_store_col_cids.end());
399
0
    }
400
645
    if (binlog_config.has_value()) {
401
0
        BinlogConfig tmp_binlog_config;
402
0
        tmp_binlog_config = binlog_config.value();
403
0
        tmp_binlog_config.to_pb(tablet_meta_pb.mutable_binlog_config());
404
0
    }
405
406
645
    switch (tde_algorithm) {
407
0
    case doris::TEncryptionAlgorithm::AES256:
408
0
        tablet_meta_pb.set_encryption_algorithm(EncryptionAlgorithmPB::AES_256_CTR);
409
0
        break;
410
0
    case doris::TEncryptionAlgorithm::SM4:
411
0
        tablet_meta_pb.set_encryption_algorithm(EncryptionAlgorithmPB::SM4_128_CTR);
412
0
        break;
413
645
    default:
414
645
        tablet_meta_pb.set_encryption_algorithm(EncryptionAlgorithmPB::PLAINTEXT);
415
645
    }
416
417
    // Initialize default external ColumnMeta usage according to storage format.
418
    // V2: legacy behavior, inline ColumnMetaPB only.
419
    // V3: V2 + external ColumnMetaPB (CMO) enabled by default.
420
645
    switch (_storage_format) {
421
645
    case TStorageFormat::V2:
422
645
    case TStorageFormat::DEFAULT:
423
645
    case TStorageFormat::V1:
424
645
        break;
425
0
    case TStorageFormat::V3:
426
0
        schema->set_is_external_segment_column_meta_used(true);
427
0
        _schema->set_external_segment_meta_used_default(true);
428
429
0
        schema->set_integer_type_default_use_plain_encoding(true);
430
0
        _schema->set_integer_type_default_use_plain_encoding(true);
431
0
        schema->set_binary_plain_encoding_default_impl(
432
0
                BinaryPlainEncodingTypePB::BINARY_PLAIN_ENCODING_V2);
433
0
        _schema->set_binary_plain_encoding_default_impl(
434
0
                BinaryPlainEncodingTypePB::BINARY_PLAIN_ENCODING_V2);
435
0
        break;
436
0
    default:
437
0
        break;
438
645
    }
439
440
645
    init_from_pb(tablet_meta_pb);
441
645
}
442
443
TabletMeta::TabletMeta(const TabletMeta& b)
444
97
        : MetadataAdder(b),
445
97
          _table_id(b._table_id),
446
97
          _index_id(b._index_id),
447
97
          _partition_id(b._partition_id),
448
97
          _tablet_id(b._tablet_id),
449
97
          _replica_id(b._replica_id),
450
97
          _schema_hash(b._schema_hash),
451
97
          _shard_id(b._shard_id),
452
97
          _creation_time(b._creation_time),
453
97
          _cumulative_layer_point(b._cumulative_layer_point),
454
97
          _tablet_uid(b._tablet_uid),
455
97
          _tablet_type(b._tablet_type),
456
97
          _tablet_state(b._tablet_state),
457
97
          _schema(b._schema),
458
97
          _rs_metas(b._rs_metas),
459
97
          _stale_rs_metas(b._stale_rs_metas),
460
97
          _in_restore_mode(b._in_restore_mode),
461
97
          _preferred_rowset_type(b._preferred_rowset_type),
462
97
          _storage_policy_id(b._storage_policy_id),
463
97
          _cooldown_meta_id(b._cooldown_meta_id),
464
97
          _enable_unique_key_merge_on_write(b._enable_unique_key_merge_on_write),
465
97
          _delete_bitmap(b._delete_bitmap),
466
97
          _binlog_config(b._binlog_config),
467
97
          _compaction_policy(b._compaction_policy),
468
97
          _time_series_compaction_goal_size_mbytes(b._time_series_compaction_goal_size_mbytes),
469
          _time_series_compaction_file_count_threshold(
470
97
                  b._time_series_compaction_file_count_threshold),
471
          _time_series_compaction_time_threshold_seconds(
472
97
                  b._time_series_compaction_time_threshold_seconds),
473
          _time_series_compaction_empty_rowsets_threshold(
474
97
                  b._time_series_compaction_empty_rowsets_threshold),
475
97
          _time_series_compaction_level_threshold(b._time_series_compaction_level_threshold) {};
476
477
void TabletMeta::init_column_from_tcolumn(uint32_t unique_id, const TColumn& tcolumn,
478
2.23k
                                          ColumnPB* column) {
479
2.23k
    column->set_unique_id(unique_id);
480
2.23k
    column->set_name(tcolumn.column_name);
481
2.23k
    column->set_is_auto_increment(tcolumn.is_auto_increment);
482
2.23k
    if (tcolumn.__isset.is_on_update_current_timestamp) {
483
2.23k
        column->set_is_on_update_current_timestamp(tcolumn.is_on_update_current_timestamp);
484
2.23k
    }
485
2.23k
    string data_type;
486
2.23k
    EnumToString(TPrimitiveType, tcolumn.column_type.type, data_type);
487
2.23k
    column->set_type(data_type);
488
489
2.23k
    uint32_t length = TabletColumn::get_field_length_by_type(tcolumn.column_type.type,
490
2.23k
                                                             tcolumn.column_type.len);
491
2.23k
    column->set_length(length);
492
2.23k
    column->set_index_length(length);
493
2.23k
    column->set_precision(tcolumn.column_type.precision);
494
2.23k
    column->set_frac(tcolumn.column_type.scale);
495
496
2.23k
    if (tcolumn.__isset.result_is_nullable) {
497
0
        column->set_result_is_nullable(tcolumn.result_is_nullable);
498
0
    }
499
500
2.23k
    if (tcolumn.__isset.be_exec_version) {
501
2.23k
        column->set_be_exec_version(tcolumn.be_exec_version);
502
2.23k
    }
503
504
2.23k
    if (tcolumn.column_type.type == TPrimitiveType::VARCHAR ||
505
2.23k
        tcolumn.column_type.type == TPrimitiveType::STRING) {
506
110
        if (!tcolumn.column_type.__isset.index_len) {
507
110
            column->set_index_length(10);
508
110
        } else {
509
0
            column->set_index_length(tcolumn.column_type.index_len);
510
0
        }
511
110
    }
512
2.23k
    if (!tcolumn.is_key) {
513
1.17k
        column->set_is_key(false);
514
1.17k
        if (tcolumn.__isset.aggregation) {
515
0
            column->set_aggregation(tcolumn.aggregation);
516
1.17k
        } else {
517
1.17k
            string aggregation_type;
518
1.17k
            EnumToString(TAggregationType, tcolumn.aggregation_type, aggregation_type);
519
1.17k
            column->set_aggregation(aggregation_type);
520
1.17k
        }
521
1.17k
    } else {
522
1.06k
        column->set_is_key(true);
523
1.06k
        column->set_aggregation("NONE");
524
1.06k
    }
525
2.23k
    column->set_is_nullable(tcolumn.is_allow_null);
526
2.23k
    if (tcolumn.__isset.default_value) {
527
1
        column->set_default_value(tcolumn.default_value);
528
1
    }
529
2.23k
    if (tcolumn.__isset.is_bloom_filter_column) {
530
1
        column->set_is_bf_column(tcolumn.is_bloom_filter_column);
531
1
    }
532
2.23k
    if (tcolumn.__isset.visible) {
533
2.23k
        column->set_visible(tcolumn.visible);
534
2.23k
    }
535
2.23k
    for (size_t i = 0; i < tcolumn.children_column.size(); i++) {
536
0
        ColumnPB* children_column = column->add_children_columns();
537
0
        init_column_from_tcolumn(tcolumn.children_column[i].col_unique_id,
538
0
                                 tcolumn.children_column[i], children_column);
539
0
    }
540
2.23k
    if (tcolumn.column_type.__isset.variant_max_subcolumns_count) {
541
2.23k
        column->set_variant_max_subcolumns_count(tcolumn.column_type.variant_max_subcolumns_count);
542
2.23k
    }
543
2.23k
    if (tcolumn.__isset.pattern_type) {
544
3
        column->set_pattern_type(to_pattern_type_pb(tcolumn.pattern_type));
545
3
    }
546
2.23k
    if (tcolumn.__isset.variant_enable_typed_paths_to_sparse) {
547
2.23k
        column->set_variant_enable_typed_paths_to_sparse(
548
2.23k
                tcolumn.variant_enable_typed_paths_to_sparse);
549
2.23k
    }
550
2.23k
    if (tcolumn.__isset.variant_max_sparse_column_statistics_size) {
551
2.23k
        column->set_variant_max_sparse_column_statistics_size(
552
2.23k
                tcolumn.variant_max_sparse_column_statistics_size);
553
2.23k
    }
554
2.23k
    if (tcolumn.__isset.variant_sparse_hash_shard_count) {
555
0
        column->set_variant_sparse_hash_shard_count(tcolumn.variant_sparse_hash_shard_count);
556
0
    }
557
2.23k
    if (tcolumn.__isset.variant_enable_doc_mode) {
558
0
        column->set_variant_enable_doc_mode(tcolumn.variant_enable_doc_mode);
559
0
    }
560
2.23k
    if (tcolumn.__isset.variant_doc_materialization_min_rows) {
561
0
        column->set_variant_doc_materialization_min_rows(
562
0
                tcolumn.variant_doc_materialization_min_rows);
563
0
    }
564
2.23k
    if (tcolumn.__isset.variant_doc_hash_shard_count) {
565
0
        column->set_variant_doc_hash_shard_count(tcolumn.variant_doc_hash_shard_count);
566
0
    }
567
2.23k
}
568
569
0
void TabletMeta::remove_rowset_delete_bitmap(const RowsetId& rowset_id, const Version& version) {
570
0
    if (_enable_unique_key_merge_on_write) {
571
0
        delete_bitmap().remove({rowset_id, 0, 0}, {rowset_id, UINT32_MAX, 0});
572
0
        if (config::enable_mow_verbose_log) {
573
0
            LOG_INFO("delete rowset delete bitmap. tablet={}, rowset={}, version={}", tablet_id(),
574
0
                     rowset_id.to_string(), version.to_string());
575
0
        }
576
0
        size_t rowset_cache_version_size = delete_bitmap().remove_rowset_cache_version(rowset_id);
577
0
        _check_mow_rowset_cache_version_size(rowset_cache_version_size);
578
0
    }
579
0
}
580
581
4
Status TabletMeta::create_from_file(const string& file_path) {
582
4
    TabletMetaPB tablet_meta_pb;
583
4
    RETURN_IF_ERROR(load_from_file(file_path, &tablet_meta_pb));
584
4
    init_from_pb(tablet_meta_pb);
585
4
    return Status::OK();
586
4
}
587
588
12
Status TabletMeta::load_from_file(const string& file_path, TabletMetaPB* tablet_meta_pb) {
589
12
    FileHeader<TabletMetaPB> file_header(file_path);
590
    // In file_header.deserialize(), it validates file length, signature, checksum of protobuf.
591
12
    RETURN_IF_ERROR(file_header.deserialize());
592
12
    try {
593
12
        tablet_meta_pb->CopyFrom(file_header.message());
594
12
    } catch (const std::exception& e) {
595
0
        LOG(WARNING) << "Failed to copy protocol buffer object: " << e.what()
596
0
                     << ", file=" << file_path;
597
0
        return Status::Error<PARSE_PROTOBUF_ERROR>(
598
0
                "fail to copy protocol buffer object. file={}, error={}", file_path, e.what());
599
0
    }
600
12
    return Status::OK();
601
12
}
602
603
6
Status TabletMeta::create_from_buffer(const uint8_t* buffer, size_t buffer_size) {
604
6
    FileHeader<TabletMetaPB> file_header(""); // empty file path
605
6
    RETURN_IF_ERROR(file_header.deserialize_from_memory(buffer, buffer_size));
606
607
2
    TabletMetaPB tablet_meta_pb;
608
2
    try {
609
2
        tablet_meta_pb.CopyFrom(file_header.message());
610
2
    } catch (const std::exception& e) {
611
0
        LOG(WARNING) << "Failed to copy protocol buffer object from buffer: " << e.what();
612
0
        return Status::Error<ErrorCode::PARSE_PROTOBUF_ERROR>(
613
0
                "fail to copy protocol buffer object from buffer. error={}", e.what());
614
0
    }
615
616
2
    init_from_pb(tablet_meta_pb);
617
2
    return Status::OK();
618
2
}
619
620
std::string TabletMeta::construct_header_file_path(const string& schema_hash_path,
621
3
                                                   int64_t tablet_id) {
622
3
    std::stringstream header_name_stream;
623
3
    header_name_stream << schema_hash_path << "/" << tablet_id << ".hdr";
624
3
    return header_name_stream.str();
625
3
}
626
627
0
Status TabletMeta::save_as_json(const string& file_path) {
628
0
    std::string json_meta;
629
0
    json2pb::Pb2JsonOptions json_options;
630
0
    json_options.pretty_json = true;
631
0
    json_options.bytes_to_base64 = true;
632
0
    to_json(&json_meta, json_options);
633
    // save to file
634
0
    io::FileWriterPtr file_writer;
635
0
    RETURN_IF_ERROR(io::global_local_filesystem()->create_file(file_path, &file_writer));
636
0
    RETURN_IF_ERROR(file_writer->append(json_meta));
637
0
    RETURN_IF_ERROR(file_writer->close());
638
0
    return Status::OK();
639
0
}
640
641
230
Status TabletMeta::save(const string& file_path) {
642
230
    TabletMetaPB tablet_meta_pb;
643
230
    to_meta_pb(&tablet_meta_pb, false);
644
230
    return TabletMeta::save(file_path, tablet_meta_pb);
645
230
}
646
647
236
Status TabletMeta::save(const string& file_path, const TabletMetaPB& tablet_meta_pb) {
648
236
    DCHECK(!file_path.empty());
649
236
    FileHeader<TabletMetaPB> file_header(file_path);
650
236
    try {
651
236
        file_header.mutable_message()->CopyFrom(tablet_meta_pb);
652
236
    } catch (...) {
653
0
        LOG(WARNING) << "fail to copy protocol buffer object. file='" << file_path;
654
0
        return Status::Error<ErrorCode::INTERNAL_ERROR>(
655
0
                "fail to copy protocol buffer object. file={}", file_path);
656
0
    }
657
236
    RETURN_IF_ERROR(file_header.prepare());
658
236
    RETURN_IF_ERROR(file_header.serialize());
659
236
    return Status::OK();
660
236
}
661
662
572
Status TabletMeta::save_meta(DataDir* data_dir) {
663
572
    std::lock_guard<std::shared_mutex> wrlock(_meta_lock);
664
572
    return _save_meta(data_dir);
665
572
}
666
667
572
Status TabletMeta::_save_meta(DataDir* data_dir) {
668
    // check if tablet uid is valid
669
572
    if (_tablet_uid.hi == 0 && _tablet_uid.lo == 0) {
670
0
        LOG(FATAL) << "tablet_uid is invalid"
671
0
                   << " tablet=" << tablet_id() << " _tablet_uid=" << _tablet_uid.to_string();
672
0
    }
673
572
    string meta_binary;
674
675
572
    auto t1 = MonotonicMicros();
676
572
    serialize(&meta_binary);
677
572
    auto t2 = MonotonicMicros();
678
572
    Status status = TabletMetaManager::save(data_dir, tablet_id(), schema_hash(), meta_binary);
679
572
    if (!status.ok()) {
680
0
        LOG(FATAL) << "fail to save tablet_meta. status=" << status << ", tablet_id=" << tablet_id()
681
0
                   << ", schema_hash=" << schema_hash();
682
0
    }
683
572
    auto t3 = MonotonicMicros();
684
572
    auto cost = t3 - t1;
685
572
    if (cost > 1 * 1000 * 1000) {
686
0
        LOG(INFO) << "save tablet(" << tablet_id() << ") meta too slow. serialize cost " << t2 - t1
687
0
                  << "(us), serialized binary size: " << meta_binary.length()
688
0
                  << "(bytes), write rocksdb cost " << t3 - t2 << "(us)";
689
0
    }
690
572
    return status;
691
572
}
692
693
577
void TabletMeta::serialize(string* meta_binary) {
694
577
    TabletMetaPB tablet_meta_pb;
695
577
    to_meta_pb(&tablet_meta_pb, false);
696
577
    if (tablet_meta_pb.partition_id() <= 0) {
697
468
        LOG(WARNING) << "invalid partition id " << tablet_meta_pb.partition_id() << " tablet "
698
468
                     << tablet_meta_pb.tablet_id();
699
468
    }
700
577
    DBUG_EXECUTE_IF("TabletMeta::serialize::zero_partition_id", {
701
577
        long partition_id = tablet_meta_pb.partition_id();
702
577
        tablet_meta_pb.set_partition_id(0);
703
577
        LOG(WARNING) << "set debug point TabletMeta::serialize::zero_partition_id old="
704
577
                     << partition_id << " new=" << tablet_meta_pb.DebugString();
705
577
    });
706
577
    bool serialize_success = tablet_meta_pb.SerializeToString(meta_binary);
707
577
    if (!_rs_metas.empty() || !_stale_rs_metas.empty()) {
708
577
        _avg_rs_meta_serialize_size =
709
577
                meta_binary->length() / (_rs_metas.size() + _stale_rs_metas.size());
710
577
        if (meta_binary->length() > config::tablet_meta_serialize_size_limit ||
711
577
            !serialize_success) {
712
0
            int64_t origin_meta_size = meta_binary->length();
713
0
            int64_t stale_rowsets_num = tablet_meta_pb.stale_rs_metas().size();
714
0
            tablet_meta_pb.clear_stale_rs_metas();
715
0
            meta_binary->clear();
716
0
            serialize_success = tablet_meta_pb.SerializeToString(meta_binary);
717
0
            LOG(WARNING) << "tablet meta serialization size exceeds limit: "
718
0
                         << config::tablet_meta_serialize_size_limit
719
0
                         << " clean up stale rowsets, tablet id: " << tablet_id()
720
0
                         << " stale rowset num: " << stale_rowsets_num
721
0
                         << " serialization size before clean " << origin_meta_size
722
0
                         << " serialization size after clean " << meta_binary->length();
723
0
        }
724
577
    }
725
726
577
    if (!serialize_success) {
727
0
        LOG(FATAL) << "failed to serialize meta " << tablet_id();
728
0
    }
729
577
}
730
731
461
Status TabletMeta::deserialize(std::string_view meta_binary) {
732
461
    TabletMetaPB tablet_meta_pb;
733
461
    bool parsed = tablet_meta_pb.ParseFromArray(meta_binary.data(),
734
461
                                                static_cast<int32_t>(meta_binary.size()));
735
461
    if (!parsed) {
736
0
        return Status::Error<INIT_FAILED>("parse tablet meta failed");
737
0
    }
738
461
    init_from_pb(tablet_meta_pb);
739
461
    return Status::OK();
740
461
}
741
742
1.14k
void TabletMeta::init_from_pb(const TabletMetaPB& tablet_meta_pb) {
743
1.14k
    _table_id = tablet_meta_pb.table_id();
744
1.14k
    _index_id = tablet_meta_pb.index_id();
745
1.14k
    _partition_id = tablet_meta_pb.partition_id();
746
1.14k
    _tablet_id = tablet_meta_pb.tablet_id();
747
1.14k
    _replica_id = tablet_meta_pb.replica_id();
748
1.14k
    _schema_hash = tablet_meta_pb.schema_hash();
749
1.14k
    _shard_id = tablet_meta_pb.shard_id();
750
1.14k
    _creation_time = tablet_meta_pb.creation_time();
751
1.14k
    _cumulative_layer_point = tablet_meta_pb.cumulative_layer_point();
752
1.14k
    _tablet_uid = TabletUid(tablet_meta_pb.tablet_uid());
753
1.14k
    _ttl_seconds = tablet_meta_pb.ttl_seconds();
754
1.14k
    if (tablet_meta_pb.has_tablet_type()) {
755
1.12k
        _tablet_type = tablet_meta_pb.tablet_type();
756
1.12k
    } else {
757
29
        _tablet_type = TabletTypePB::TABLET_TYPE_DISK;
758
29
    }
759
760
    // init _tablet_state
761
1.14k
    switch (tablet_meta_pb.tablet_state()) {
762
32
    case PB_NOTREADY:
763
32
        _tablet_state = TabletState::TABLET_NOTREADY;
764
32
        break;
765
892
    case PB_RUNNING:
766
892
        _tablet_state = TabletState::TABLET_RUNNING;
767
892
        break;
768
0
    case PB_TOMBSTONED:
769
0
        _tablet_state = TabletState::TABLET_TOMBSTONED;
770
0
        break;
771
0
    case PB_STOPPED:
772
0
        _tablet_state = TabletState::TABLET_STOPPED;
773
0
        break;
774
225
    case PB_SHUTDOWN:
775
225
        _tablet_state = TabletState::TABLET_SHUTDOWN;
776
225
        break;
777
0
    default:
778
0
        LOG(WARNING) << "tablet has no state. tablet=" << tablet_id()
779
0
                     << ", schema_hash=" << schema_hash();
780
1.14k
    }
781
782
    // init _schema
783
1.14k
    TabletSchemaSPtr schema = std::make_shared<TabletSchema>();
784
1.14k
    schema->init_from_pb(tablet_meta_pb.schema());
785
1.14k
    if (_handle) {
786
4
        TabletSchemaCache::instance()->release(_handle);
787
4
    }
788
1.14k
    auto pair = TabletSchemaCache::instance()->insert(schema->to_key());
789
1.14k
    _handle = pair.first;
790
1.14k
    _schema = pair.second;
791
792
1.14k
    if (tablet_meta_pb.has_enable_unique_key_merge_on_write()) {
793
1.12k
        _enable_unique_key_merge_on_write = tablet_meta_pb.enable_unique_key_merge_on_write();
794
1.12k
        _delete_bitmap->set_tablet_id(_tablet_id);
795
1.12k
    }
796
797
    // init _rs_metas
798
10.8k
    for (auto& it : tablet_meta_pb.rs_metas()) {
799
10.8k
        RowsetMetaSharedPtr rs_meta(new RowsetMeta());
800
10.8k
        rs_meta->init_from_pb(it);
801
10.8k
        _rs_metas.emplace(rs_meta->version(), rs_meta);
802
10.8k
    }
803
804
    // For mow table, delete bitmap of stale rowsets has not been persisted.
805
    // When be restart, query should not read the stale rowset, otherwise duplicate keys
806
    // will be read out. Therefore, we don't add them to _stale_rs_meta for mow table.
807
1.14k
    if (!config::skip_loading_stale_rowset_meta && !_enable_unique_key_merge_on_write) {
808
1.11k
        for (auto& it : tablet_meta_pb.stale_rs_metas()) {
809
0
            RowsetMetaSharedPtr rs_meta(new RowsetMeta());
810
0
            rs_meta->init_from_pb(it);
811
0
            _stale_rs_metas.emplace(rs_meta->version(), rs_meta);
812
0
        }
813
1.11k
    }
814
815
1.14k
    if (tablet_meta_pb.has_in_restore_mode()) {
816
1.12k
        _in_restore_mode = tablet_meta_pb.in_restore_mode();
817
1.12k
    }
818
819
1.14k
    if (tablet_meta_pb.has_preferred_rowset_type()) {
820
475
        _preferred_rowset_type = tablet_meta_pb.preferred_rowset_type();
821
475
    }
822
823
1.14k
    _storage_policy_id = tablet_meta_pb.storage_policy_id();
824
1.14k
    if (tablet_meta_pb.has_cooldown_meta_id()) {
825
0
        _cooldown_meta_id = tablet_meta_pb.cooldown_meta_id();
826
0
    }
827
828
1.14k
    if (tablet_meta_pb.has_delete_bitmap()) {
829
0
        int rst_ids_size = tablet_meta_pb.delete_bitmap().rowset_ids_size();
830
0
        int seg_ids_size = tablet_meta_pb.delete_bitmap().segment_ids_size();
831
0
        int versions_size = tablet_meta_pb.delete_bitmap().versions_size();
832
0
        int seg_maps_size = tablet_meta_pb.delete_bitmap().segment_delete_bitmaps_size();
833
0
        CHECK(rst_ids_size == seg_ids_size && seg_ids_size == seg_maps_size &&
834
0
              seg_maps_size == versions_size);
835
0
        for (int i = 0; i < rst_ids_size; ++i) {
836
0
            RowsetId rst_id;
837
0
            rst_id.init(tablet_meta_pb.delete_bitmap().rowset_ids(i));
838
0
            auto seg_id = tablet_meta_pb.delete_bitmap().segment_ids(i);
839
0
            auto ver = tablet_meta_pb.delete_bitmap().versions(i);
840
0
            auto bitmap = tablet_meta_pb.delete_bitmap().segment_delete_bitmaps(i).data();
841
0
            delete_bitmap().delete_bitmap[{rst_id, seg_id, ver}] = roaring::Roaring::read(bitmap);
842
0
        }
843
0
    }
844
845
1.14k
    if (tablet_meta_pb.has_binlog_config()) {
846
473
        _binlog_config = tablet_meta_pb.binlog_config();
847
473
    }
848
1.14k
    _compaction_policy = tablet_meta_pb.compaction_policy();
849
1.14k
    _time_series_compaction_goal_size_mbytes =
850
1.14k
            tablet_meta_pb.time_series_compaction_goal_size_mbytes();
851
1.14k
    _time_series_compaction_file_count_threshold =
852
1.14k
            tablet_meta_pb.time_series_compaction_file_count_threshold();
853
1.14k
    _time_series_compaction_time_threshold_seconds =
854
1.14k
            tablet_meta_pb.time_series_compaction_time_threshold_seconds();
855
1.14k
    _time_series_compaction_empty_rowsets_threshold =
856
1.14k
            tablet_meta_pb.time_series_compaction_empty_rowsets_threshold();
857
1.14k
    _time_series_compaction_level_threshold =
858
1.14k
            tablet_meta_pb.time_series_compaction_level_threshold();
859
860
1.14k
    if (tablet_meta_pb.has_encryption_algorithm()) {
861
1.11k
        _encryption_algorithm = tablet_meta_pb.encryption_algorithm();
862
1.11k
    }
863
1.14k
}
864
865
822
void TabletMeta::to_meta_pb(TabletMetaPB* tablet_meta_pb, bool cloud_get_rowset_meta) {
866
822
    tablet_meta_pb->set_table_id(table_id());
867
822
    tablet_meta_pb->set_index_id(index_id());
868
822
    tablet_meta_pb->set_partition_id(partition_id());
869
822
    tablet_meta_pb->set_tablet_id(tablet_id());
870
822
    tablet_meta_pb->set_replica_id(replica_id());
871
822
    tablet_meta_pb->set_schema_hash(schema_hash());
872
822
    tablet_meta_pb->set_shard_id(shard_id());
873
822
    tablet_meta_pb->set_creation_time(creation_time());
874
822
    tablet_meta_pb->set_cumulative_layer_point(cumulative_layer_point());
875
822
    *(tablet_meta_pb->mutable_tablet_uid()) = tablet_uid().to_proto();
876
822
    tablet_meta_pb->set_tablet_type(_tablet_type);
877
822
    tablet_meta_pb->set_ttl_seconds(_ttl_seconds);
878
822
    switch (tablet_state()) {
879
9
    case TABLET_NOTREADY:
880
9
        tablet_meta_pb->set_tablet_state(PB_NOTREADY);
881
9
        break;
882
332
    case TABLET_RUNNING:
883
332
        tablet_meta_pb->set_tablet_state(PB_RUNNING);
884
332
        break;
885
0
    case TABLET_TOMBSTONED:
886
0
        tablet_meta_pb->set_tablet_state(PB_TOMBSTONED);
887
0
        break;
888
0
    case TABLET_STOPPED:
889
0
        tablet_meta_pb->set_tablet_state(PB_STOPPED);
890
0
        break;
891
481
    case TABLET_SHUTDOWN:
892
481
        tablet_meta_pb->set_tablet_state(PB_SHUTDOWN);
893
481
        break;
894
822
    }
895
896
    // RowsetMetaPB is separated from TabletMetaPB
897
822
    if (!config::is_cloud_mode() || cloud_get_rowset_meta) {
898
21.6k
        for (const auto& [_, rs] : _rs_metas) {
899
21.6k
            rs->to_rowset_pb(tablet_meta_pb->add_rs_metas());
900
21.6k
        }
901
822
        for (const auto& [_, rs] : _stale_rs_metas) {
902
0
            rs->to_rowset_pb(tablet_meta_pb->add_stale_rs_metas());
903
0
        }
904
822
    }
905
906
822
    _schema->to_schema_pb(tablet_meta_pb->mutable_schema());
907
908
822
    tablet_meta_pb->set_in_restore_mode(in_restore_mode());
909
910
    // to avoid modify tablet meta to the greatest extend
911
822
    if (_preferred_rowset_type == BETA_ROWSET) {
912
822
        tablet_meta_pb->set_preferred_rowset_type(_preferred_rowset_type);
913
822
    }
914
822
    if (_storage_policy_id > 0) {
915
5
        tablet_meta_pb->set_storage_policy_id(_storage_policy_id);
916
5
    }
917
822
    if (_cooldown_meta_id.initialized()) {
918
5
        tablet_meta_pb->mutable_cooldown_meta_id()->CopyFrom(_cooldown_meta_id.to_proto());
919
5
    }
920
921
822
    tablet_meta_pb->set_enable_unique_key_merge_on_write(_enable_unique_key_merge_on_write);
922
923
822
    if (_enable_unique_key_merge_on_write) {
924
4
        std::set<RowsetId> stale_rs_ids;
925
4
        for (const auto& [_, rowset] : _stale_rs_metas) {
926
0
            stale_rs_ids.insert(rowset->rowset_id());
927
0
        }
928
4
        DeleteBitmapPB* delete_bitmap_pb = tablet_meta_pb->mutable_delete_bitmap();
929
4
        for (auto& [id, bitmap] : delete_bitmap().snapshot().delete_bitmap) {
930
2
            auto& [rowset_id, segment_id, ver] = id;
931
2
            if (stale_rs_ids.count(rowset_id) != 0) {
932
0
                continue;
933
0
            }
934
2
            delete_bitmap_pb->add_rowset_ids(rowset_id.to_string());
935
2
            delete_bitmap_pb->add_segment_ids(segment_id);
936
2
            delete_bitmap_pb->add_versions(ver);
937
2
            std::string bitmap_data(bitmap.getSizeInBytes(), '\0');
938
2
            bitmap.write(bitmap_data.data());
939
2
            *(delete_bitmap_pb->add_segment_delete_bitmaps()) = std::move(bitmap_data);
940
2
        }
941
4
    }
942
822
    _binlog_config.to_pb(tablet_meta_pb->mutable_binlog_config());
943
822
    tablet_meta_pb->set_compaction_policy(compaction_policy());
944
822
    tablet_meta_pb->set_time_series_compaction_goal_size_mbytes(
945
822
            time_series_compaction_goal_size_mbytes());
946
822
    tablet_meta_pb->set_time_series_compaction_file_count_threshold(
947
822
            time_series_compaction_file_count_threshold());
948
822
    tablet_meta_pb->set_time_series_compaction_time_threshold_seconds(
949
822
            time_series_compaction_time_threshold_seconds());
950
822
    tablet_meta_pb->set_time_series_compaction_empty_rowsets_threshold(
951
822
            time_series_compaction_empty_rowsets_threshold());
952
822
    tablet_meta_pb->set_time_series_compaction_level_threshold(
953
822
            time_series_compaction_level_threshold());
954
955
822
    tablet_meta_pb->set_encryption_algorithm(_encryption_algorithm);
956
822
}
957
958
2
void TabletMeta::to_json(string* json_string, json2pb::Pb2JsonOptions& options) {
959
2
    TabletMetaPB tablet_meta_pb;
960
2
    to_meta_pb(&tablet_meta_pb, false);
961
2
    json2pb::ProtoMessageToJson(tablet_meta_pb, json_string, options);
962
2
}
963
964
165
Version TabletMeta::max_version() const {
965
165
    Version max_version = {-1, 0};
966
3.15k
    for (const auto& [_, rs_meta] : _rs_metas) {
967
3.15k
        if (rs_meta->end_version() > max_version.second) {
968
151
            max_version = rs_meta->version();
969
151
        }
970
3.15k
    }
971
165
    return max_version;
972
165
}
973
974
0
size_t TabletMeta::version_count_cross_with_range(const Version& range) const {
975
0
    size_t count = 0;
976
0
    for (const auto& [_, rs_meta] : _rs_metas) {
977
0
        if (!(range.first > rs_meta->version().second || range.second < rs_meta->version().first)) {
978
0
            count++;
979
0
        }
980
0
    }
981
0
    return count;
982
0
}
983
984
14.1k
Status TabletMeta::add_rs_meta(const RowsetMetaSharedPtr& rs_meta) {
985
    // check RowsetMeta is valid
986
690k
    for (const auto& [_, rs] : _rs_metas) {
987
690k
        if (rs->version() == rs_meta->version()) {
988
0
            if (rs->rowset_id() != rs_meta->rowset_id()) {
989
0
                return Status::Error<PUSH_VERSION_ALREADY_EXIST>(
990
0
                        "version already exist. rowset_id={}, version={}, tablet={}",
991
0
                        rs->rowset_id().to_string(), rs->version().to_string(), tablet_id());
992
0
            } else {
993
                // rowsetid,version is equal, it is a duplicate req, skip it
994
0
                return Status::OK();
995
0
            }
996
0
        }
997
690k
    }
998
14.1k
    _rs_metas.emplace(rs_meta->version(), rs_meta);
999
14.1k
    return Status::OK();
1000
14.1k
}
1001
1002
247
void TabletMeta::add_rowsets_unchecked(const std::vector<RowsetSharedPtr>& to_add) {
1003
732
    for (const auto& rs : to_add) {
1004
732
        _rs_metas.emplace(rs->rowset_meta()->version(), rs->rowset_meta());
1005
732
    }
1006
247
}
1007
1008
void TabletMeta::delete_rs_meta_by_version(const Version& version,
1009
0
                                           std::vector<RowsetMetaSharedPtr>* deleted_rs_metas) {
1010
0
    size_t rowset_cache_version_size = 0;
1011
0
    if (auto it = _rs_metas.find(version); it != _rs_metas.end()) {
1012
0
        if (deleted_rs_metas != nullptr) {
1013
0
            deleted_rs_metas->push_back(it->second);
1014
0
        }
1015
0
        auto rowset_id = it->second->rowset_id();
1016
0
        _rs_metas.erase(it);
1017
0
        if (_enable_unique_key_merge_on_write) {
1018
0
            rowset_cache_version_size = _delete_bitmap->remove_rowset_cache_version(rowset_id);
1019
0
        }
1020
0
        return;
1021
0
    }
1022
0
    _check_mow_rowset_cache_version_size(rowset_cache_version_size);
1023
0
}
1024
1025
void TabletMeta::modify_rs_metas(const std::vector<RowsetMetaSharedPtr>& to_add,
1026
                                 const std::vector<RowsetMetaSharedPtr>& to_delete,
1027
122
                                 bool same_version) {
1028
122
    size_t rowset_cache_version_size = 0;
1029
    // Remove to_delete rowsets from _rs_metas
1030
581
    for (auto rs_to_del : to_delete) {
1031
581
        if (auto it = _rs_metas.find(rs_to_del->version()); it != _rs_metas.end()) {
1032
581
            auto rowset_id = it->second->rowset_id();
1033
581
            _rs_metas.erase(it);
1034
581
            if (_enable_unique_key_merge_on_write) {
1035
180
                rowset_cache_version_size = _delete_bitmap->remove_rowset_cache_version(rowset_id);
1036
180
            }
1037
581
        }
1038
581
    }
1039
122
    if (!same_version) {
1040
        // put to_delete rowsets in _stale_rs_metas.
1041
560
        for (auto rs_to_del : to_delete) {
1042
560
            _stale_rs_metas.emplace(rs_to_del->version(), rs_to_del);
1043
560
        }
1044
101
    }
1045
1046
    // put to_add rowsets in _rs_metas.
1047
122
    for (auto rs_to_add : to_add) {
1048
23
        _rs_metas.emplace(rs_to_add->version(), rs_to_add);
1049
23
    }
1050
122
    _check_mow_rowset_cache_version_size(rowset_cache_version_size);
1051
122
}
1052
1053
// Use the passing "rs_metas" to replace the rs meta in this tablet meta
1054
// Also clear the _stale_rs_metas because this tablet meta maybe copyied from
1055
// an existing tablet before. Add after revise, only the passing "rs_metas"
1056
// is needed.
1057
5
void TabletMeta::revise_rs_metas(std::vector<RowsetMetaSharedPtr>&& rs_metas) {
1058
5
    {
1059
5
        std::lock_guard<std::shared_mutex> wrlock(_meta_lock);
1060
5
        _rs_metas.clear();
1061
10
        for (auto& rs_meta : rs_metas) {
1062
10
            _rs_metas.emplace(rs_meta->version(), rs_meta);
1063
10
        }
1064
5
        _stale_rs_metas.clear();
1065
5
    }
1066
5
    if (_enable_unique_key_merge_on_write) {
1067
0
        _delete_bitmap->clear_rowset_cache_version();
1068
0
    }
1069
5
}
1070
1071
// This method should call after revise_rs_metas, since new rs_metas might be a subset
1072
// of original tablet, we should revise the delete_bitmap according to current rowset.
1073
//
1074
// Delete bitmap is protected by Tablet::_meta_lock, we don't need to acquire the
1075
// TabletMeta's _meta_lock
1076
1
void TabletMeta::revise_delete_bitmap_unlocked(const DeleteBitmap& delete_bitmap) {
1077
1
    _delete_bitmap = std::make_unique<DeleteBitmap>(tablet_id());
1078
2
    for (const auto& [_, rs] : _rs_metas) {
1079
2
        DeleteBitmap rs_bm(tablet_id());
1080
2
        delete_bitmap.subset({rs->rowset_id(), 0, 0}, {rs->rowset_id(), UINT32_MAX, INT64_MAX},
1081
2
                             &rs_bm);
1082
2
        _delete_bitmap->merge(rs_bm);
1083
2
    }
1084
1
    for (const auto& [_, rs] : _stale_rs_metas) {
1085
0
        DeleteBitmap rs_bm(tablet_id());
1086
0
        delete_bitmap.subset({rs->rowset_id(), 0, 0}, {rs->rowset_id(), UINT32_MAX, INT64_MAX},
1087
0
                             &rs_bm);
1088
0
        _delete_bitmap->merge(rs_bm);
1089
0
    }
1090
1
}
1091
1092
0
void TabletMeta::delete_stale_rs_meta_by_version(const Version& version) {
1093
0
    _stale_rs_metas.erase(version);
1094
0
}
1095
1096
0
RowsetMetaSharedPtr TabletMeta::acquire_rs_meta_by_version(const Version& version) const {
1097
0
    if (auto it = _rs_metas.find(version); it != _rs_metas.end()) {
1098
0
        return it->second;
1099
0
    }
1100
0
    return nullptr;
1101
0
}
1102
1103
8
RowsetMetaSharedPtr TabletMeta::acquire_stale_rs_meta_by_version(const Version& version) const {
1104
8
    if (auto it = _stale_rs_metas.find(version); it != _stale_rs_metas.end()) {
1105
0
        return it->second;
1106
0
    }
1107
8
    return nullptr;
1108
8
}
1109
1110
23
Status TabletMeta::set_partition_id(int64_t partition_id) {
1111
23
    if ((_partition_id > 0 && _partition_id != partition_id) || partition_id < 1) {
1112
0
        LOG(WARNING) << "cur partition id=" << _partition_id << " new partition id=" << partition_id
1113
0
                     << " not equal";
1114
0
    }
1115
23
    _partition_id = partition_id;
1116
23
    return Status::OK();
1117
23
}
1118
1119
0
void TabletMeta::clear_stale_rowset() {
1120
0
    _stale_rs_metas.clear();
1121
0
    if (_enable_unique_key_merge_on_write) {
1122
0
        _delete_bitmap->clear_rowset_cache_version();
1123
0
    }
1124
0
}
1125
1126
0
void TabletMeta::clear_rowsets() {
1127
0
    _rs_metas.clear();
1128
0
    if (_enable_unique_key_merge_on_write) {
1129
0
        _delete_bitmap->clear_rowset_cache_version();
1130
0
    }
1131
0
}
1132
1133
122
void TabletMeta::_check_mow_rowset_cache_version_size(size_t rowset_cache_version_size) {
1134
122
    if (_enable_unique_key_merge_on_write && config::enable_mow_verbose_log &&
1135
122
        rowset_cache_version_size > _rs_metas.size() + _stale_rs_metas.size()) {
1136
0
        std::stringstream ss;
1137
0
        auto rowset_ids = _delete_bitmap->get_rowset_cache_version();
1138
0
        std::set<std::string> tablet_rowset_ids;
1139
0
        {
1140
0
            std::shared_lock rlock(_meta_lock);
1141
0
            for (const auto& [_, rs_meta] : _rs_metas) {
1142
0
                tablet_rowset_ids.emplace(rs_meta->rowset_id().to_string());
1143
0
            }
1144
0
            for (const auto& [_, rs_meta] : _stale_rs_metas) {
1145
0
                tablet_rowset_ids.emplace(rs_meta->rowset_id().to_string());
1146
0
            }
1147
0
        }
1148
0
        for (const auto& rowset_id : rowset_ids) {
1149
0
            if (tablet_rowset_ids.find(rowset_id) == tablet_rowset_ids.end()) {
1150
0
                ss << rowset_id << ", ";
1151
0
            }
1152
0
        }
1153
        // size(rowset_cache_version) <= size(_rs_metas) + size(_stale_rs_metas) + size(_unused_rs)
1154
0
        std::string msg = fmt::format(
1155
0
                "tablet: {}, rowset_cache_version size: {}, "
1156
0
                "_rs_metas size: {}, _stale_rs_metas size: {}, delta: {}. rowset only in cache: {}",
1157
0
                _tablet_id, rowset_cache_version_size, _rs_metas.size(), _stale_rs_metas.size(),
1158
0
                rowset_cache_version_size - _rs_metas.size() - _stale_rs_metas.size(), ss.str());
1159
0
        LOG(INFO) << msg;
1160
0
    }
1161
122
}
1162
1163
3
bool operator==(const TabletMeta& a, const TabletMeta& b) {
1164
3
    if (a._table_id != b._table_id) return false;
1165
3
    if (a._index_id != b._index_id) return false;
1166
3
    if (a._partition_id != b._partition_id) return false;
1167
3
    if (a._tablet_id != b._tablet_id) return false;
1168
3
    if (a._replica_id != b._replica_id) return false;
1169
3
    if (a._schema_hash != b._schema_hash) return false;
1170
3
    if (a._shard_id != b._shard_id) return false;
1171
3
    if (a._creation_time != b._creation_time) return false;
1172
3
    if (a._cumulative_layer_point != b._cumulative_layer_point) return false;
1173
3
    if (a._tablet_uid != b._tablet_uid) return false;
1174
3
    if (a._tablet_type != b._tablet_type) return false;
1175
3
    if (a._tablet_state != b._tablet_state) return false;
1176
3
    if (*a._schema != *b._schema) return false;
1177
3
    if (a._rs_metas != b._rs_metas) return false;
1178
3
    if (a._in_restore_mode != b._in_restore_mode) return false;
1179
3
    if (a._preferred_rowset_type != b._preferred_rowset_type) return false;
1180
3
    if (a._storage_policy_id != b._storage_policy_id) return false;
1181
3
    if (a._compaction_policy != b._compaction_policy) return false;
1182
3
    if (a._time_series_compaction_goal_size_mbytes != b._time_series_compaction_goal_size_mbytes)
1183
0
        return false;
1184
3
    if (a._time_series_compaction_file_count_threshold !=
1185
3
        b._time_series_compaction_file_count_threshold)
1186
0
        return false;
1187
3
    if (a._time_series_compaction_time_threshold_seconds !=
1188
3
        b._time_series_compaction_time_threshold_seconds)
1189
0
        return false;
1190
3
    if (a._time_series_compaction_empty_rowsets_threshold !=
1191
3
        b._time_series_compaction_empty_rowsets_threshold)
1192
0
        return false;
1193
3
    if (a._time_series_compaction_level_threshold != b._time_series_compaction_level_threshold)
1194
0
        return false;
1195
3
    return true;
1196
3
}
1197
1198
0
bool operator!=(const TabletMeta& a, const TabletMeta& b) {
1199
0
    return !(a == b);
1200
0
}
1201
1202
// We cannot just copy the underlying memory to construct a string
1203
// due to equivalent objects may have different padding bytes.
1204
// Reading padding bytes is undefined behavior, neither copy nor
1205
// placement new will help simplify the code.
1206
// Refer to C11 standards §6.2.6.1/6 and §6.7.9/21 for more info.
1207
64
static std::string agg_cache_key(int64_t tablet_id, const DeleteBitmap::BitmapKey& bmk) {
1208
64
    std::string ret(sizeof(tablet_id) + sizeof(bmk), '\0');
1209
64
    *reinterpret_cast<int64_t*>(ret.data()) = tablet_id;
1210
64
    auto t = reinterpret_cast<DeleteBitmap::BitmapKey*>(ret.data() + sizeof(tablet_id));
1211
64
    std::get<RowsetId>(*t).version = std::get<RowsetId>(bmk).version;
1212
64
    std::get<RowsetId>(*t).hi = std::get<RowsetId>(bmk).hi;
1213
64
    std::get<RowsetId>(*t).mi = std::get<RowsetId>(bmk).mi;
1214
64
    std::get<RowsetId>(*t).lo = std::get<RowsetId>(bmk).lo;
1215
64
    std::get<1>(*t) = std::get<1>(bmk);
1216
64
    std::get<2>(*t) = std::get<2>(bmk);
1217
64
    return ret;
1218
64
}
1219
1220
// decode cache key info from a agg_cache_key
1221
static void decode_agg_cache_key(const std::string& key_str, int64_t& tablet_id,
1222
0
                                 DeleteBitmap::BitmapKey& bmk) {
1223
0
    const char* ptr = key_str.data();
1224
0
    tablet_id = *reinterpret_cast<const int64_t*>(ptr);
1225
0
    ptr += sizeof(tablet_id);
1226
0
    const auto* t = reinterpret_cast<const DeleteBitmap::BitmapKey*>(ptr);
1227
0
    std::get<RowsetId>(bmk).version = std::get<RowsetId>(*t).version;
1228
0
    std::get<RowsetId>(bmk).hi = std::get<RowsetId>(*t).hi;
1229
0
    std::get<RowsetId>(bmk).mi = std::get<RowsetId>(*t).mi;
1230
0
    std::get<RowsetId>(bmk).lo = std::get<RowsetId>(*t).lo;
1231
0
    std::get<1>(bmk) = std::get<1>(*t);
1232
0
    std::get<2>(bmk) = std::get<2>(*t);
1233
0
}
1234
1235
DeleteBitmapAggCache::DeleteBitmapAggCache(size_t capacity)
1236
1
        : LRUCachePolicy(CachePolicy::CacheType::DELETE_BITMAP_AGG_CACHE, capacity,
1237
1
                         LRUCacheType::SIZE, config::delete_bitmap_agg_cache_stale_sweep_time_sec,
1238
1
                         /*num_shards*/ 256,
1239
1
                         /*element_count_capacity*/ 0, /*enable_prune*/ true,
1240
1
                         /*is_lru_k*/ false) {}
1241
1242
190
DeleteBitmapAggCache* DeleteBitmapAggCache::instance() {
1243
190
    return ExecEnv::GetInstance()->delete_bitmap_agg_cache();
1244
190
}
1245
1246
1
DeleteBitmapAggCache* DeleteBitmapAggCache::create_instance(size_t capacity) {
1247
1
    return new DeleteBitmapAggCache(capacity);
1248
1
}
1249
1250
0
DeleteBitmap DeleteBitmapAggCache::snapshot(int64_t tablet_id) {
1251
0
    DeleteBitmap ret(tablet_id);
1252
0
    auto collector = [&](const LRUHandle* handle) {
1253
0
        auto key = handle->key().to_string();
1254
0
        int64_t key_tablet_id;
1255
0
        DeleteBitmap::BitmapKey bmk;
1256
0
        decode_agg_cache_key(key, key_tablet_id, bmk);
1257
0
        if (key_tablet_id == tablet_id) {
1258
0
            const auto& dbm = reinterpret_cast<DeleteBitmapAggCache::Value*>(handle->value)->bitmap;
1259
0
            ret.set(bmk, dbm);
1260
0
        }
1261
0
    };
1262
0
    DeleteBitmapAggCache::instance()->for_each_entry(collector);
1263
0
    return ret;
1264
0
}
1265
1266
1.23k
DeleteBitmap::DeleteBitmap(int64_t tablet_id) : _tablet_id(tablet_id) {}
1267
1268
7
DeleteBitmap::DeleteBitmap(const DeleteBitmap& o) {
1269
7
    std::shared_lock l1(o.lock);
1270
7
    delete_bitmap = o.delete_bitmap;
1271
7
    _tablet_id = o._tablet_id;
1272
7
}
1273
1274
0
DeleteBitmap& DeleteBitmap::operator=(const DeleteBitmap& o) {
1275
0
    if (this == &o) return *this;
1276
0
    if (this < &o) {
1277
0
        std::unique_lock l1(lock);
1278
0
        std::shared_lock l2(o.lock);
1279
0
        delete_bitmap = o.delete_bitmap;
1280
0
        _tablet_id = o._tablet_id;
1281
0
    } else {
1282
0
        std::shared_lock l2(o.lock);
1283
0
        std::unique_lock l1(lock);
1284
0
        delete_bitmap = o.delete_bitmap;
1285
0
        _tablet_id = o._tablet_id;
1286
0
    }
1287
0
    return *this;
1288
0
}
1289
1290
0
DeleteBitmap::DeleteBitmap(DeleteBitmap&& o) noexcept {
1291
0
    std::scoped_lock l(o.lock, o._rowset_cache_version_lock);
1292
0
    delete_bitmap = std::move(o.delete_bitmap);
1293
0
    _tablet_id = std::move(o._tablet_id);
1294
0
    o._rowset_cache_version.clear();
1295
0
}
1296
1297
0
DeleteBitmap& DeleteBitmap::operator=(DeleteBitmap&& o) noexcept {
1298
0
    if (this == &o) return *this;
1299
0
    std::scoped_lock l(lock, o.lock, o._rowset_cache_version_lock);
1300
0
    delete_bitmap = std::move(o.delete_bitmap);
1301
0
    _tablet_id = std::move(o._tablet_id);
1302
0
    o._rowset_cache_version.clear();
1303
0
    return *this;
1304
0
}
1305
1306
0
DeleteBitmap DeleteBitmap::from_pb(const DeleteBitmapPB& pb, int64_t tablet_id) {
1307
0
    size_t len = pb.rowset_ids().size();
1308
0
    DCHECK_EQ(len, pb.segment_ids().size());
1309
0
    DCHECK_EQ(len, pb.versions().size());
1310
0
    DeleteBitmap delete_bitmap(tablet_id);
1311
0
    for (int32_t i = 0; i < len; ++i) {
1312
0
        RowsetId rs_id;
1313
0
        rs_id.init(pb.rowset_ids(i));
1314
0
        BitmapKey key = {rs_id, pb.segment_ids(i), pb.versions(i)};
1315
0
        delete_bitmap.delete_bitmap[key] =
1316
0
                roaring::Roaring::read(pb.segment_delete_bitmaps(i).data());
1317
0
    }
1318
0
    return delete_bitmap;
1319
0
}
1320
1321
0
DeleteBitmapPB DeleteBitmap::to_pb() {
1322
0
    std::shared_lock l(lock);
1323
0
    DeleteBitmapPB ret;
1324
0
    for (const auto& [k, v] : delete_bitmap) {
1325
0
        ret.mutable_rowset_ids()->Add(std::get<0>(k).to_string());
1326
0
        ret.mutable_segment_ids()->Add(std::get<1>(k));
1327
0
        ret.mutable_versions()->Add(std::get<2>(k));
1328
0
        std::string bitmap_data(v.getSizeInBytes(), '\0');
1329
0
        v.write(bitmap_data.data());
1330
0
        ret.mutable_segment_delete_bitmaps()->Add(std::move(bitmap_data));
1331
0
    }
1332
0
    return ret;
1333
0
}
1334
1335
7
DeleteBitmap DeleteBitmap::snapshot() const {
1336
7
    std::shared_lock l(lock);
1337
7
    return DeleteBitmap(*this);
1338
7
}
1339
1340
3
DeleteBitmap DeleteBitmap::snapshot(Version version) const {
1341
    // Take snapshot first, then remove keys greater than given version.
1342
3
    DeleteBitmap snapshot = this->snapshot();
1343
3
    auto it = snapshot.delete_bitmap.begin();
1344
412
    while (it != snapshot.delete_bitmap.end()) {
1345
409
        if (std::get<2>(it->first) > version) {
1346
4
            it = snapshot.delete_bitmap.erase(it);
1347
405
        } else {
1348
405
            it++;
1349
405
        }
1350
409
    }
1351
3
    return snapshot;
1352
3
}
1353
1354
463k
void DeleteBitmap::add(const BitmapKey& bmk, uint32_t row_id) {
1355
463k
    std::lock_guard l(lock);
1356
463k
    delete_bitmap[bmk].add(row_id);
1357
463k
}
1358
1359
0
int DeleteBitmap::remove(const BitmapKey& bmk, uint32_t row_id) {
1360
0
    std::lock_guard l(lock);
1361
0
    auto it = delete_bitmap.find(bmk);
1362
0
    if (it == delete_bitmap.end()) return -1;
1363
0
    it->second.remove(row_id);
1364
0
    return 0;
1365
0
}
1366
1367
8
void DeleteBitmap::remove(const BitmapKey& start, const BitmapKey& end) {
1368
8
    std::lock_guard l(lock);
1369
107
    for (auto it = delete_bitmap.lower_bound(start); it != delete_bitmap.end();) {
1370
101
        auto& [k, _] = *it;
1371
101
        if (k >= end) {
1372
2
            break;
1373
2
        }
1374
99
        it = delete_bitmap.erase(it);
1375
99
    }
1376
8
}
1377
1378
0
void DeleteBitmap::remove(const std::vector<std::tuple<BitmapKey, BitmapKey>>& key_ranges) {
1379
0
    std::lock_guard l(lock);
1380
0
    for (auto& [start, end] : key_ranges) {
1381
0
        for (auto it = delete_bitmap.lower_bound(start); it != delete_bitmap.end();) {
1382
0
            auto& [k, _] = *it;
1383
0
            if (k >= end) {
1384
0
                break;
1385
0
            }
1386
0
            it = delete_bitmap.erase(it);
1387
0
        }
1388
0
    }
1389
0
}
1390
1391
6
bool DeleteBitmap::contains(const BitmapKey& bmk, uint32_t row_id) const {
1392
6
    std::shared_lock l(lock);
1393
6
    auto it = delete_bitmap.find(bmk);
1394
6
    return it != delete_bitmap.end() && it->second.contains(row_id);
1395
6
}
1396
1397
2
bool DeleteBitmap::contains_agg(const BitmapKey& bmk, uint32_t row_id) const {
1398
2
    return get_agg(bmk)->contains(row_id);
1399
2
}
1400
1401
0
bool DeleteBitmap::empty() const {
1402
0
    std::shared_lock l(lock);
1403
0
    return delete_bitmap.empty();
1404
0
}
1405
1406
63
uint64_t DeleteBitmap::cardinality() const {
1407
63
    std::shared_lock l(lock);
1408
63
    uint64_t res = 0;
1409
320
    for (auto entry : delete_bitmap) {
1410
320
        if (std::get<1>(entry.first) != DeleteBitmap::INVALID_SEGMENT_ID) {
1411
320
            res += entry.second.cardinality();
1412
320
        }
1413
320
    }
1414
63
    return res;
1415
63
}
1416
1417
0
uint64_t DeleteBitmap::get_size() const {
1418
0
    std::shared_lock l(lock);
1419
0
    uint64_t charge = 0;
1420
0
    for (auto& [k, v] : delete_bitmap) {
1421
0
        if (std::get<1>(k) != DeleteBitmap::INVALID_SEGMENT_ID) {
1422
0
            charge += v.getSizeInBytes();
1423
0
        }
1424
0
    }
1425
0
    return charge;
1426
0
}
1427
1428
bool DeleteBitmap::contains_agg_with_cache_if_eligible(const BitmapKey& bmk,
1429
1
                                                       uint32_t row_id) const {
1430
1
    g_contains_agg_with_cache_if_eligible_total << 1;
1431
1
    int64_t start_version {0};
1432
1
    if (config::enable_mow_get_agg_by_cache) {
1433
1
        auto deleter = [&](Cache::Handle* handle) {
1434
0
            DeleteBitmapAggCache::instance()->release(handle);
1435
0
        };
1436
1
        std::unique_ptr<Cache::Handle, decltype(deleter)> dbm_handle(nullptr, deleter);
1437
1
        int64_t cached_version = 0;
1438
        // 1. try to lookup the desired key directly
1439
1
        dbm_handle.reset(DeleteBitmapAggCache::instance()->lookup(agg_cache_key(_tablet_id, bmk)));
1440
1
        if (dbm_handle != nullptr) {
1441
0
            cached_version = std::get<2>(bmk);
1442
1
        } else {
1443
            // 2. if not found, try to lookup with cached version
1444
1
            cached_version = _get_rowset_cache_version(bmk);
1445
1
            if (cached_version > 0) {
1446
0
                if (cached_version > std::get<2>(bmk)) {
1447
0
                    cached_version = 0;
1448
0
                } else {
1449
0
                    dbm_handle.reset(DeleteBitmapAggCache::instance()->lookup(agg_cache_key(
1450
0
                            _tablet_id, {std::get<0>(bmk), std::get<1>(bmk), cached_version})));
1451
0
                }
1452
0
            }
1453
1
        }
1454
1
        if (dbm_handle != nullptr) {
1455
0
            const auto& cached_dbm =
1456
0
                    reinterpret_cast<DeleteBitmapAggCache::Value*>(
1457
0
                            DeleteBitmapAggCache::instance()->value(dbm_handle.get()))
1458
0
                            ->bitmap;
1459
0
            if (cached_version == std::get<2>(bmk)) {
1460
0
                g_contains_agg_with_cache_if_eligible_full_hit << 1;
1461
0
            } else {
1462
0
                g_contains_agg_with_cache_if_eligible_partial_hit << 1;
1463
0
            }
1464
0
            if (cached_dbm.contains(row_id)) {
1465
0
                return true;
1466
0
            }
1467
0
            if (cached_version == std::get<2>(bmk)) {
1468
0
                return false;
1469
0
            }
1470
0
            start_version = cached_version + 1;
1471
0
        }
1472
1
    }
1473
1
    DeleteBitmap::BitmapKey start {std::get<0>(bmk), std::get<1>(bmk), start_version};
1474
1
    std::shared_lock l(lock);
1475
1
    for (auto it = delete_bitmap.lower_bound(start); it != delete_bitmap.end(); ++it) {
1476
0
        auto& [k, bm] = *it;
1477
0
        if (std::get<0>(k) != std::get<0>(bmk) || std::get<1>(k) != std::get<1>(bmk) ||
1478
0
            std::get<2>(k) > std::get<2>(bmk)) {
1479
0
            break;
1480
0
        }
1481
0
        if (bm.contains(row_id)) {
1482
0
            return true;
1483
0
        }
1484
0
    }
1485
1
    return false;
1486
1
}
1487
1488
0
void DeleteBitmap::remove_sentinel_marks() {
1489
0
    std::lock_guard l(lock);
1490
0
    for (auto it = delete_bitmap.begin(), end = delete_bitmap.end(); it != end;) {
1491
0
        if (std::get<1>(it->first) == DeleteBitmap::INVALID_SEGMENT_ID) {
1492
0
            it = delete_bitmap.erase(it);
1493
0
        } else {
1494
0
            ++it;
1495
0
        }
1496
0
    }
1497
0
}
1498
1499
38
int DeleteBitmap::set(const BitmapKey& bmk, const roaring::Roaring& segment_delete_bitmap) {
1500
38
    std::lock_guard l(lock);
1501
38
    auto [_, inserted] = delete_bitmap.insert_or_assign(bmk, segment_delete_bitmap);
1502
38
    return inserted;
1503
38
}
1504
1505
7
int DeleteBitmap::get(const BitmapKey& bmk, roaring::Roaring* segment_delete_bitmap) const {
1506
7
    std::shared_lock l(lock);
1507
7
    auto it = delete_bitmap.find(bmk);
1508
7
    if (it == delete_bitmap.end()) return -1;
1509
7
    *segment_delete_bitmap = it->second; // copy
1510
7
    return 0;
1511
7
}
1512
1513
54
const roaring::Roaring* DeleteBitmap::get(const BitmapKey& bmk) const {
1514
54
    std::shared_lock l(lock);
1515
54
    auto it = delete_bitmap.find(bmk);
1516
54
    if (it == delete_bitmap.end()) return nullptr;
1517
41
    return &(it->second); // get address
1518
54
}
1519
1520
void DeleteBitmap::subset(const BitmapKey& start, const BitmapKey& end,
1521
3
                          DeleteBitmap* subset_rowset_map) const {
1522
3
    DCHECK(start < end);
1523
3
    std::shared_lock l(lock);
1524
26
    for (auto it = delete_bitmap.lower_bound(start); it != delete_bitmap.end(); ++it) {
1525
25
        auto& [k, bm] = *it;
1526
25
        if (k >= end) {
1527
2
            break;
1528
2
        }
1529
23
        subset_rowset_map->set(k, bm);
1530
23
    }
1531
3
}
1532
1533
void DeleteBitmap::subset(std::vector<std::pair<RowsetId, int64_t>>& rowset_ids,
1534
                          int64_t start_version, int64_t end_version,
1535
0
                          DeleteBitmap* subset_delete_map) const {
1536
0
    DCHECK(start_version <= end_version);
1537
0
    for (auto& [rowset_id, _] : rowset_ids) {
1538
0
        BitmapKey start {rowset_id, 0, 0};
1539
0
        BitmapKey end {rowset_id, UINT32_MAX, end_version + 1};
1540
0
        std::shared_lock l(lock);
1541
0
        for (auto it = delete_bitmap.lower_bound(start); it != delete_bitmap.end(); ++it) {
1542
0
            auto& [k, bm] = *it;
1543
0
            if (k >= end) {
1544
0
                break;
1545
0
            }
1546
0
            auto version = std::get<2>(k);
1547
0
            if (version >= start_version && version <= end_version) {
1548
0
                subset_delete_map->merge(k, bm);
1549
0
                VLOG_DEBUG << "subset delete bitmap, tablet=" << _tablet_id << ", version=["
1550
0
                           << start_version << ", " << end_version
1551
0
                           << "]. rowset=" << std::get<0>(k).to_string()
1552
0
                           << ", segment=" << std::get<1>(k) << ", version=" << version
1553
0
                           << ", cardinality=" << bm.cardinality();
1554
0
            }
1555
0
        }
1556
0
    }
1557
0
}
1558
1559
void DeleteBitmap::subset_and_agg(std::vector<std::pair<RowsetId, int64_t>>& rowset_ids,
1560
                                  int64_t start_version, int64_t end_version,
1561
1
                                  DeleteBitmap* subset_delete_map) const {
1562
1
    DCHECK(start_version <= end_version);
1563
2
    for (auto& [rowset_id, segment_num] : rowset_ids) {
1564
6
        for (int64_t seg_id = 0; seg_id < segment_num; ++seg_id) {
1565
4
            BitmapKey end {rowset_id, seg_id, end_version};
1566
4
            auto bm = get_agg_without_cache(end, start_version);
1567
4
            VLOG_DEBUG << "subset delete bitmap, tablet=" << _tablet_id << ", rowset=" << rowset_id
1568
0
                       << ", segment=" << seg_id << ", version=[" << start_version << "-"
1569
0
                       << end_version << "], cardinality=" << bm->cardinality();
1570
4
            if (bm->isEmpty()) {
1571
0
                continue;
1572
0
            }
1573
4
            subset_delete_map->merge(end, *bm);
1574
4
        }
1575
2
    }
1576
1
}
1577
1578
0
size_t DeleteBitmap::get_count_with_range(const BitmapKey& start, const BitmapKey& end) const {
1579
0
    DCHECK(start < end);
1580
0
    size_t count = 0;
1581
0
    std::shared_lock l(lock);
1582
0
    for (auto it = delete_bitmap.lower_bound(start); it != delete_bitmap.end(); ++it) {
1583
0
        auto& [k, bm] = *it;
1584
0
        if (k >= end) {
1585
0
            break;
1586
0
        }
1587
0
        count++;
1588
0
    }
1589
0
    return count;
1590
0
}
1591
1592
6
void DeleteBitmap::merge(const BitmapKey& bmk, const roaring::Roaring& segment_delete_bitmap) {
1593
6
    std::lock_guard l(lock);
1594
6
    auto [iter, succ] = delete_bitmap.emplace(bmk, segment_delete_bitmap);
1595
6
    if (!succ) {
1596
0
        iter->second |= segment_delete_bitmap;
1597
0
    }
1598
6
}
1599
1600
9
void DeleteBitmap::merge(const DeleteBitmap& other) {
1601
9
    std::lock_guard l(lock);
1602
29
    for (auto& i : other.delete_bitmap) {
1603
29
        auto [j, succ] = this->delete_bitmap.insert(i);
1604
29
        if (!succ) j->second |= i.second;
1605
29
    }
1606
9
}
1607
1608
63
uint64_t DeleteBitmap::get_delete_bitmap_count() {
1609
63
    std::shared_lock l(lock);
1610
63
    uint64_t count = 0;
1611
383
    for (auto it = delete_bitmap.begin(); it != delete_bitmap.end(); it++) {
1612
320
        if (std::get<1>(it->first) != DeleteBitmap::INVALID_SEGMENT_ID) {
1613
320
            count++;
1614
320
        }
1615
320
    }
1616
63
    return count;
1617
63
}
1618
1619
void DeleteBitmap::traverse_rowset_and_version(
1620
0
        const std::function<int(const RowsetId& rowsetId, int64_t version)>& func) const {
1621
0
    std::shared_lock l(lock);
1622
0
    auto it = delete_bitmap.cbegin();
1623
0
    while (it != delete_bitmap.cend()) {
1624
0
        RowsetId rowset_id = std::get<0>(it->first);
1625
0
        int64_t version = std::get<2>(it->first);
1626
0
        int result = func(rowset_id, version);
1627
0
        if (result == -2) {
1628
            // find next <rowset, version>
1629
0
            it++;
1630
0
        } else {
1631
            // find next <rowset>
1632
0
            it = delete_bitmap.upper_bound({rowset_id, std::numeric_limits<SegmentId>::max(),
1633
0
                                            std::numeric_limits<Version>::max()});
1634
0
        }
1635
0
    }
1636
0
}
1637
1638
0
bool DeleteBitmap::has_calculated_for_multi_segments(const RowsetId& rowset_id) const {
1639
0
    return contains({rowset_id, INVALID_SEGMENT_ID, TEMP_VERSION_COMMON}, ROWSET_SENTINEL_MARK);
1640
0
}
1641
1642
180
size_t DeleteBitmap::remove_rowset_cache_version(const RowsetId& rowset_id) {
1643
180
    std::lock_guard l(_rowset_cache_version_lock);
1644
180
    _rowset_cache_version.erase(rowset_id);
1645
180
    VLOG_DEBUG << "remove agg cache version for tablet=" << _tablet_id
1646
0
               << ", rowset=" << rowset_id.to_string();
1647
180
    return _rowset_cache_version.size();
1648
180
}
1649
1650
0
void DeleteBitmap::clear_rowset_cache_version() {
1651
0
    std::lock_guard l(_rowset_cache_version_lock);
1652
0
    _rowset_cache_version.clear();
1653
0
    VLOG_DEBUG << "clear agg cache version for tablet=" << _tablet_id;
1654
0
}
1655
1656
0
std::set<std::string> DeleteBitmap::get_rowset_cache_version() {
1657
0
    std::set<std::string> set;
1658
0
    std::shared_lock l(_rowset_cache_version_lock);
1659
0
    for (auto& [k, _] : _rowset_cache_version) {
1660
0
        set.insert(k.to_string());
1661
0
    }
1662
0
    return set;
1663
0
}
1664
1665
49
DeleteBitmap::Version DeleteBitmap::_get_rowset_cache_version(const BitmapKey& bmk) const {
1666
49
    std::shared_lock l(_rowset_cache_version_lock);
1667
49
    if (auto it = _rowset_cache_version.find(std::get<0>(bmk)); it != _rowset_cache_version.end()) {
1668
39
        auto& segment_cache_version = it->second;
1669
39
        if (auto it1 = segment_cache_version.find(std::get<1>(bmk));
1670
39
            it1 != segment_cache_version.end()) {
1671
10
            return it1->second;
1672
10
        }
1673
39
    }
1674
39
    return 0;
1675
49
}
1676
1677
0
DeleteBitmap DeleteBitmap::agg_cache_snapshot() {
1678
0
    return DeleteBitmapAggCache::instance()->snapshot(_tablet_id);
1679
0
}
1680
1681
1.12k
void DeleteBitmap::set_tablet_id(int64_t tablet_id) {
1682
1.12k
    _tablet_id = tablet_id;
1683
1.12k
}
1684
1685
54
std::shared_ptr<roaring::Roaring> DeleteBitmap::get_agg(const BitmapKey& bmk) const {
1686
54
    std::string key_str = agg_cache_key(_tablet_id, bmk); // Cache key container
1687
54
    CacheKey key(key_str);
1688
54
    Cache::Handle* handle = DeleteBitmapAggCache::instance()->lookup(key);
1689
1690
54
    DeleteBitmapAggCache::Value* val =
1691
54
            handle == nullptr ? nullptr
1692
54
                              : reinterpret_cast<DeleteBitmapAggCache::Value*>(
1693
6
                                        DeleteBitmapAggCache::instance()->value(handle));
1694
    // FIXME: do we need a mutex here to get rid of duplicated initializations
1695
    //        of cache entries in some cases?
1696
54
    if (val == nullptr) { // Renew if needed, put a new Value to cache
1697
48
        val = new DeleteBitmapAggCache::Value();
1698
48
        Version start_version =
1699
48
                config::enable_mow_get_agg_by_cache ? _get_rowset_cache_version(bmk) : 0;
1700
48
        if (start_version > 0) {
1701
9
            Cache::Handle* handle2 = DeleteBitmapAggCache::instance()->lookup(
1702
9
                    agg_cache_key(_tablet_id, {std::get<0>(bmk), std::get<1>(bmk), start_version}));
1703
1704
9
            DBUG_EXECUTE_IF("DeleteBitmap::get_agg.cache_miss", {
1705
9
                if (handle2 != nullptr) {
1706
9
                    auto p = dp->param("percent", 0.3);
1707
9
                    std::mt19937 gen {std::random_device {}()};
1708
9
                    std::bernoulli_distribution inject_fault {p};
1709
9
                    if (inject_fault(gen)) {
1710
9
                        LOG_INFO("injection DeleteBitmap::get_agg.cache_miss, tablet_id={}",
1711
9
                                 _tablet_id);
1712
9
                        handle2 = nullptr;
1713
9
                    }
1714
9
                }
1715
9
            });
1716
9
            if (handle2 == nullptr || start_version > std::get<2>(bmk)) {
1717
0
                start_version = 0;
1718
9
            } else {
1719
9
                val->bitmap |= reinterpret_cast<DeleteBitmapAggCache::Value*>(
1720
9
                                       DeleteBitmapAggCache::instance()->value(handle2))
1721
9
                                       ->bitmap;
1722
9
                VLOG_DEBUG << "get agg cache version=" << start_version
1723
0
                           << " for tablet=" << _tablet_id
1724
0
                           << ", rowset=" << std::get<0>(bmk).to_string()
1725
0
                           << ", segment=" << std::get<1>(bmk);
1726
9
                start_version += 1;
1727
9
            }
1728
9
            if (handle2 != nullptr) {
1729
9
                DeleteBitmapAggCache::instance()->release(handle2);
1730
9
            }
1731
9
        }
1732
48
        {
1733
48
            std::shared_lock l(lock);
1734
48
            DeleteBitmap::BitmapKey start {std::get<0>(bmk), std::get<1>(bmk), start_version};
1735
87
            for (auto it = delete_bitmap.lower_bound(start); it != delete_bitmap.end(); ++it) {
1736
84
                auto& [k, bm] = *it;
1737
84
                if (std::get<0>(k) != std::get<0>(bmk) || std::get<1>(k) != std::get<1>(bmk) ||
1738
84
                    std::get<2>(k) > std::get<2>(bmk)) {
1739
45
                    break;
1740
45
                }
1741
39
                val->bitmap |= bm;
1742
39
            }
1743
48
        }
1744
48
        size_t charge = val->bitmap.getSizeInBytes() + sizeof(DeleteBitmapAggCache::Value);
1745
48
        handle = DeleteBitmapAggCache::instance()->insert(key, val, charge, charge,
1746
48
                                                          CachePriority::NORMAL);
1747
48
        if (config::enable_mow_get_agg_by_cache && !val->bitmap.isEmpty()) {
1748
37
            std::lock_guard l(_rowset_cache_version_lock);
1749
            // this version is already agg
1750
37
            _rowset_cache_version[std::get<0>(bmk)][std::get<1>(bmk)] = std::get<2>(bmk);
1751
37
            VLOG_DEBUG << "set agg cache version=" << std::get<2>(bmk)
1752
0
                       << " for tablet=" << _tablet_id
1753
0
                       << ", rowset=" << std::get<0>(bmk).to_string()
1754
0
                       << ", segment=" << std::get<1>(bmk);
1755
37
        }
1756
48
        if (start_version > 0 && config::enable_mow_get_agg_correctness_check_core) {
1757
0
            std::shared_ptr<roaring::Roaring> bitmap = get_agg_without_cache(bmk);
1758
0
            if (val->bitmap != *bitmap) {
1759
0
                CHECK(false) << ". get agg correctness check failed for tablet=" << _tablet_id
1760
0
                             << ", rowset=" << std::get<0>(bmk).to_string()
1761
0
                             << ", segment=" << std::get<1>(bmk) << ", version=" << std::get<2>(bmk)
1762
0
                             << ". start_version from cache=" << start_version
1763
0
                             << ", delete_bitmap cardinality with cache="
1764
0
                             << val->bitmap.cardinality()
1765
0
                             << ", delete_bitmap cardinality without cache="
1766
0
                             << bitmap->cardinality();
1767
0
            }
1768
0
        }
1769
48
    }
1770
1771
    // It is natural for the cache to reclaim the underlying memory
1772
54
    return std::shared_ptr<roaring::Roaring>(
1773
54
            &val->bitmap, [handle](...) { DeleteBitmapAggCache::instance()->release(handle); });
1774
54
}
1775
1776
std::shared_ptr<roaring::Roaring> DeleteBitmap::get_agg_without_cache(
1777
4
        const BitmapKey& bmk, const int64_t start_version) const {
1778
4
    std::shared_ptr<roaring::Roaring> bitmap = std::make_shared<roaring::Roaring>();
1779
4
    std::shared_lock l(lock);
1780
4
    DeleteBitmap::BitmapKey start {std::get<0>(bmk), std::get<1>(bmk), start_version};
1781
24
    for (auto it = delete_bitmap.lower_bound(start); it != delete_bitmap.end(); ++it) {
1782
23
        auto& [k, bm] = *it;
1783
23
        if (std::get<0>(k) != std::get<0>(bmk) || std::get<1>(k) != std::get<1>(bmk) ||
1784
23
            std::get<2>(k) > std::get<2>(bmk)) {
1785
3
            break;
1786
3
        }
1787
20
        *bitmap |= bm;
1788
20
    }
1789
4
    return bitmap;
1790
4
}
1791
1792
0
DeleteBitmap DeleteBitmap::diffset(const std::set<BitmapKey>& key_set) const {
1793
0
    std::shared_lock l(lock);
1794
0
    auto diff_key_set_view =
1795
0
            delete_bitmap | std::ranges::views::transform([](const auto& kv) { return kv.first; }) |
1796
0
            std::ranges::views::filter(
1797
0
                    [&key_set](const auto& key) { return !key_set.contains(key); });
1798
1799
0
    DeleteBitmap dbm(_tablet_id);
1800
0
    for (const auto& key : diff_key_set_view) {
1801
0
        const auto* bitmap = get(key);
1802
0
        DCHECK_NE(bitmap, nullptr);
1803
0
        dbm.delete_bitmap[key] = *bitmap;
1804
0
    }
1805
0
    return dbm;
1806
0
}
1807
1808
0
std::string tablet_state_name(TabletState state) {
1809
0
    switch (state) {
1810
0
    case TABLET_NOTREADY:
1811
0
        return "TABLET_NOTREADY";
1812
1813
0
    case TABLET_RUNNING:
1814
0
        return "TABLET_RUNNING";
1815
1816
0
    case TABLET_TOMBSTONED:
1817
0
        return "TABLET_TOMBSTONED";
1818
1819
0
    case TABLET_STOPPED:
1820
0
        return "TABLET_STOPPED";
1821
1822
0
    case TABLET_SHUTDOWN:
1823
0
        return "TABLET_SHUTDOWN";
1824
1825
0
    default:
1826
0
        return "TabletState(" + std::to_string(state) + ")";
1827
0
    }
1828
0
}
1829
1830
#include "common/compile_check_end.h"
1831
} // namespace doris