Coverage Report

Created: 2025-09-05 19:41

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/root/doris/be/src/olap/tablet_meta.cpp
Line
Count
Source
1
// Licensed to the Apache Software Foundation (ASF) under one
2
// or more contributor license agreements.  See the NOTICE file
3
// distributed with this work for additional information
4
// regarding copyright ownership.  The ASF licenses this file
5
// to you under the Apache License, Version 2.0 (the
6
// "License"); you may not use this file except in compliance
7
// with the License.  You may obtain a copy of the License at
8
//
9
//   http://www.apache.org/licenses/LICENSE-2.0
10
//
11
// Unless required by applicable law or agreed to in writing,
12
// software distributed under the License is distributed on an
13
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14
// KIND, either express or implied.  See the License for the
15
// specific language governing permissions and limitations
16
// under the License.
17
18
#include "olap/tablet_meta.h"
19
20
#include <gen_cpp/Descriptors_types.h>
21
#include <gen_cpp/FrontendService_types.h>
22
#include <gen_cpp/Types_types.h>
23
#include <gen_cpp/olap_common.pb.h>
24
#include <gen_cpp/olap_file.pb.h>
25
#include <gen_cpp/segment_v2.pb.h>
26
#include <gen_cpp/types.pb.h>
27
#include <json2pb/pb_to_json.h>
28
#include <time.h>
29
30
#include <cstdint>
31
#include <memory>
32
#include <random>
33
#include <set>
34
#include <utility>
35
36
#include "cloud/cloud_meta_mgr.h"
37
#include "cloud/cloud_storage_engine.h"
38
#include "cloud/config.h"
39
#include "common/config.h"
40
#include "io/fs/file_writer.h"
41
#include "io/fs/local_file_system.h"
42
#include "olap/data_dir.h"
43
#include "olap/file_header.h"
44
#include "olap/olap_common.h"
45
#include "olap/olap_define.h"
46
#include "olap/rowset/rowset.h"
47
#include "olap/rowset/rowset_meta_manager.h"
48
#include "olap/tablet_fwd.h"
49
#include "olap/tablet_meta_manager.h"
50
#include "olap/tablet_schema_cache.h"
51
#include "olap/utils.h"
52
#include "util/debug_points.h"
53
#include "util/mem_info.h"
54
#include "util/parse_util.h"
55
#include "util/string_util.h"
56
#include "util/time.h"
57
#include "util/uid_util.h"
58
59
using std::string;
60
using std::unordered_map;
61
using std::vector;
62
63
namespace doris {
64
#include "common/compile_check_begin.h"
65
using namespace ErrorCode;
66
67
// Factory: translates a create-tablet thrift request into a shared TabletMeta.
//
// Optional request fields are resolved here before being forwarded to the
// TabletMeta constructor:
//   - binlog_config is forwarded only when the request has it set;
//   - tablet_type falls back to TABLET_TYPE_DISK when unset;
//   - enable_unique_key_merge_on_write falls back to false when unset;
//   - the legacy inverted_index_storage_format (V1/V2), when set, overrides
//     inverted_index_file_storage_format; any other legacy value leaves the
//     file storage format from the request untouched.
TabletMetaSharedPtr TabletMeta::create(
        const TCreateTabletReq& request, const TabletUid& tablet_uid, uint64_t shard_id,
        uint32_t next_unique_id,
        const unordered_map<uint32_t, uint32_t>& col_ordinal_to_unique_id) {
    std::optional<TBinlogConfig> binlog_config;
    if (request.__isset.binlog_config) {
        binlog_config = request.binlog_config;
    }

    auto index_file_format = request.inverted_index_file_storage_format;

    // We will discard this format. Don't make any further changes here.
    if (request.__isset.inverted_index_storage_format) {
        const auto legacy_format = request.inverted_index_storage_format;
        if (legacy_format == TInvertedIndexStorageFormat::V1) {
            index_file_format = TInvertedIndexFileStorageFormat::V1;
        } else if (legacy_format == TInvertedIndexStorageFormat::V2) {
            index_file_format = TInvertedIndexFileStorageFormat::V2;
        }
    }

    const TTabletType::type tablet_type =
            request.__isset.tablet_type ? request.tablet_type : TTabletType::TABLET_TYPE_DISK;
    const bool merge_on_write = request.__isset.enable_unique_key_merge_on_write &&
                                request.enable_unique_key_merge_on_write;

    return std::make_shared<TabletMeta>(
            request.table_id, request.partition_id, request.tablet_id, request.replica_id,
            request.tablet_schema.schema_hash, shard_id, request.tablet_schema, next_unique_id,
            col_ordinal_to_unique_id, tablet_uid, tablet_type, request.compression_type,
            request.storage_policy_id, merge_on_write, std::move(binlog_config),
            request.compaction_policy, request.time_series_compaction_goal_size_mbytes,
            request.time_series_compaction_file_count_threshold,
            request.time_series_compaction_time_threshold_seconds,
            request.time_series_compaction_empty_rowsets_threshold,
            request.time_series_compaction_level_threshold, index_file_format,
            request.tde_algorithm);
}
108
109
1.11k
// Hands the schema-cache handle back to TabletSchemaCache, if one is held.
TabletMeta::~TabletMeta() {
    if (!_handle) {
        return; // nothing was taken from the cache
    }
    TabletSchemaCache::instance()->release(_handle);
}
114
115
// Default constructor: zero tablet uid (0, 0), a freshly allocated empty
// TabletSchema, and a new DeleteBitmap constructed from _tablet_id.
// NOTE(review): _tablet_id is read inside the initializer list; this relies on
// it being initialized before _delete_bitmap (declaration order / in-class
// default in the header) - confirm.
TabletMeta::TabletMeta()
116
553
        : _tablet_uid(0, 0),
117
553
          _schema(new TabletSchema),
118
553
          _delete_bitmap(new DeleteBitmap(_tablet_id)) {}
119
120
// Builds a TabletMeta from thrift-level creation parameters.
//
// The constructor fills a temporary TabletMetaPB (ids, state, schema, columns,
// indexes, compaction settings, binlog config, encryption algorithm) and then
// hands it to init_from_pb(), so all member state is derived from that message.
//
// Notable mappings performed here:
//   - tabletType: anything other than TABLET_TYPE_DISK is stored as
//     TABLET_TYPE_MEMORY.
//   - keys_type: an unrecognised value only logs a warning; the keys type of
//     the schema message is then left unset.
//   - compression_type: unrecognised values fall back to LZ4F.
//   - inverted_index_file_storage_format: unrecognised values fall back to V2.
//   - sort_type: everything except ZORDER is stored as LEXICAL.
//   - tde_algorithm: everything except AES256 / SM4 is stored as PLAINTEXT.
//   - column unique ids: a non-negative TColumn::col_unique_id wins, otherwise
//     the id is looked up by ordinal in col_ordinal_to_unique_id (at() throws
//     std::out_of_range when the ordinal is missing).
TabletMeta::TabletMeta(int64_t table_id, int64_t partition_id, int64_t tablet_id,
                       int64_t replica_id, int32_t schema_hash, int32_t shard_id,
                       const TTabletSchema& tablet_schema, uint32_t next_unique_id,
                       const std::unordered_map<uint32_t, uint32_t>& col_ordinal_to_unique_id,
                       TabletUid tablet_uid, TTabletType::type tabletType,
                       TCompressionType::type compression_type, int64_t storage_policy_id,
                       bool enable_unique_key_merge_on_write,
                       std::optional<TBinlogConfig> binlog_config, std::string compaction_policy,
                       int64_t time_series_compaction_goal_size_mbytes,
                       int64_t time_series_compaction_file_count_threshold,
                       int64_t time_series_compaction_time_threshold_seconds,
                       int64_t time_series_compaction_empty_rowsets_threshold,
                       int64_t time_series_compaction_level_threshold,
                       TInvertedIndexFileStorageFormat::type inverted_index_file_storage_format,
                       TEncryptionAlgorithm::type tde_algorithm)
        : _tablet_uid(0, 0),
          _schema(new TabletSchema),
          _delete_bitmap(new DeleteBitmap(tablet_id)) {
    TabletMetaPB tablet_meta_pb;
    tablet_meta_pb.set_table_id(table_id);
    tablet_meta_pb.set_partition_id(partition_id);
    tablet_meta_pb.set_tablet_id(tablet_id);
    tablet_meta_pb.set_replica_id(replica_id);
    tablet_meta_pb.set_schema_hash(schema_hash);
    tablet_meta_pb.set_shard_id(shard_id);
    // Persist the creation time, but it is not used
    tablet_meta_pb.set_creation_time(time(nullptr));
    tablet_meta_pb.set_cumulative_layer_point(-1);
    tablet_meta_pb.set_tablet_state(PB_RUNNING);
    *(tablet_meta_pb.mutable_tablet_uid()) = tablet_uid.to_proto();
    tablet_meta_pb.set_tablet_type(tabletType == TTabletType::TABLET_TYPE_DISK
                                           ? TabletTypePB::TABLET_TYPE_DISK
                                           : TabletTypePB::TABLET_TYPE_MEMORY);
    tablet_meta_pb.set_enable_unique_key_merge_on_write(enable_unique_key_merge_on_write);
    tablet_meta_pb.set_storage_policy_id(storage_policy_id);
    tablet_meta_pb.set_compaction_policy(compaction_policy);
    tablet_meta_pb.set_time_series_compaction_goal_size_mbytes(
            time_series_compaction_goal_size_mbytes);
    tablet_meta_pb.set_time_series_compaction_file_count_threshold(
            time_series_compaction_file_count_threshold);
    tablet_meta_pb.set_time_series_compaction_time_threshold_seconds(
            time_series_compaction_time_threshold_seconds);
    tablet_meta_pb.set_time_series_compaction_empty_rowsets_threshold(
            time_series_compaction_empty_rowsets_threshold);
    tablet_meta_pb.set_time_series_compaction_level_threshold(
            time_series_compaction_level_threshold);
    TabletSchemaPB* schema = tablet_meta_pb.mutable_schema();
    schema->set_num_short_key_columns(tablet_schema.short_key_column_count);
    schema->set_num_rows_per_row_block(config::default_num_rows_per_column_file_block);
    schema->set_sequence_col_idx(tablet_schema.sequence_col_idx);
    switch (tablet_schema.keys_type) {
    case TKeysType::DUP_KEYS:
        schema->set_keys_type(KeysType::DUP_KEYS);
        break;
    case TKeysType::UNIQUE_KEYS:
        schema->set_keys_type(KeysType::UNIQUE_KEYS);
        break;
    case TKeysType::AGG_KEYS:
        schema->set_keys_type(KeysType::AGG_KEYS);
        break;
    default:
        LOG(WARNING) << "unknown tablet keys type";
        break;
    }
    // compress_kind used to compress segment files
    schema->set_compress_kind(COMPRESS_LZ4);

    // compression_type used to compress segment page
    switch (compression_type) {
    case TCompressionType::NO_COMPRESSION:
        schema->set_compression_type(segment_v2::NO_COMPRESSION);
        break;
    case TCompressionType::SNAPPY:
        schema->set_compression_type(segment_v2::SNAPPY);
        break;
    case TCompressionType::LZ4:
        schema->set_compression_type(segment_v2::LZ4);
        break;
    case TCompressionType::LZ4F:
        schema->set_compression_type(segment_v2::LZ4F);
        break;
    case TCompressionType::ZLIB:
        schema->set_compression_type(segment_v2::ZLIB);
        break;
    case TCompressionType::ZSTD:
        schema->set_compression_type(segment_v2::ZSTD);
        break;
    default:
        schema->set_compression_type(segment_v2::LZ4F);
        break;
    }

    switch (inverted_index_file_storage_format) {
    case TInvertedIndexFileStorageFormat::V1:
        schema->set_inverted_index_storage_format(InvertedIndexStorageFormatPB::V1);
        break;
    case TInvertedIndexFileStorageFormat::V2:
        schema->set_inverted_index_storage_format(InvertedIndexStorageFormatPB::V2);
        break;
    case TInvertedIndexFileStorageFormat::V3:
        schema->set_inverted_index_storage_format(InvertedIndexStorageFormatPB::V3);
        break;
    default:
        schema->set_inverted_index_storage_format(InvertedIndexStorageFormatPB::V2);
        break;
    }

    switch (tablet_schema.sort_type) {
    case TSortType::type::ZORDER:
        schema->set_sort_type(SortType::ZORDER);
        break;
    default:
        schema->set_sort_type(SortType::LEXICAL);
    }
    schema->set_sort_col_num(tablet_schema.sort_col_num);
    for (const auto& i : tablet_schema.cluster_key_uids) {
        schema->add_cluster_key_uids(i);
    }
    tablet_meta_pb.set_in_restore_mode(false);

    // set column information
    uint32_t col_ordinal = 0;
    bool has_bf_columns = false;
    // Iterate by const reference: TColumn carries strings and nested children,
    // so the former by-value loop variable deep-copied every column.
    for (const TColumn& tcolumn : tablet_schema.columns) {
        ColumnPB* column = schema->add_column();
        // Prefer the unique id carried by the column itself; fall back to the
        // ordinal -> unique-id map supplied by the caller.
        const uint32_t unique_id = tcolumn.col_unique_id >= 0
                                           ? static_cast<uint32_t>(tcolumn.col_unique_id)
                                           : col_ordinal_to_unique_id.at(col_ordinal);
        col_ordinal++;
        init_column_from_tcolumn(unique_id, tcolumn, column);

        // NOTE(review): has_bf_columns is sampled before the index loop below,
        // so a column that becomes a bloom-filter column only through a
        // BLOOMFILTER / NGRAM_BF index does not cause bf_fpp to be written.
        // Behavior kept as-is; confirm whether that is intended.
        if (column->is_bf_column()) {
            has_bf_columns = true;
        }

        if (tablet_schema.__isset.indexes) {
            for (const auto& index : tablet_schema.indexes) {
                if (index.index_type == TIndexType::type::BITMAP) {
                    DCHECK_EQ(index.columns.size(), 1);
                    // The emptiness check keeps release builds (where DCHECK is
                    // compiled out) from indexing into an empty column list.
                    if (!index.columns.empty() &&
                        iequal(tcolumn.column_name, index.columns[0])) {
                        column->set_has_bitmap_index(true);
                        break;
                    }
                } else if (index.index_type == TIndexType::type::BLOOMFILTER ||
                           index.index_type == TIndexType::type::NGRAM_BF) {
                    DCHECK_EQ(index.columns.size(), 1);
                    if (!index.columns.empty() &&
                        iequal(tcolumn.column_name, index.columns[0])) {
                        column->set_is_bf_column(true);
                        break;
                    }
                }
            }
        }
    }

    // copy index meta
    if (tablet_schema.__isset.indexes) {
        for (const auto& index : tablet_schema.indexes) {
            TabletIndexPB* index_pb = schema->add_index();
            index_pb->set_index_id(index.index_id);
            index_pb->set_index_name(index.index_name);
            // init col_unique_id in index at be side, since col_unique_id may be -1 at fe side
            // get column unique id by name
            // Both loops bind by const reference; the by-value forms copied a
            // std::string and a whole ColumnPB message per iteration.
            for (const auto& column_name : index.columns) {
                for (const auto& column : schema->column()) {
                    if (iequal(column.name(), column_name)) {
                        index_pb->add_col_unique_id(column.unique_id());
                    }
                }
            }
            switch (index.index_type) {
            case TIndexType::BITMAP:
                index_pb->set_index_type(IndexType::BITMAP);
                break;
            case TIndexType::INVERTED:
                index_pb->set_index_type(IndexType::INVERTED);
                break;
            case TIndexType::ANN:
                index_pb->set_index_type(IndexType::ANN);
                break;
            case TIndexType::BLOOMFILTER:
                index_pb->set_index_type(IndexType::BLOOMFILTER);
                break;
            case TIndexType::NGRAM_BF:
                index_pb->set_index_type(IndexType::NGRAM_BF);
                break;
            }

            if (index.__isset.properties) {
                auto* properties = index_pb->mutable_properties();
                for (const auto& kv : index.properties) {
                    (*properties)[kv.first] = kv.second;
                }
            }
        }
    }

    schema->set_next_column_unique_id(next_unique_id);
    if (has_bf_columns && tablet_schema.__isset.bloom_filter_fpp) {
        schema->set_bf_fpp(tablet_schema.bloom_filter_fpp);
    }

    if (tablet_schema.__isset.is_in_memory) {
        schema->set_is_in_memory(tablet_schema.is_in_memory);
    }

    if (tablet_schema.__isset.disable_auto_compaction) {
        schema->set_disable_auto_compaction(tablet_schema.disable_auto_compaction);
    }

    if (tablet_schema.__isset.variant_enable_flatten_nested) {
        schema->set_enable_variant_flatten_nested(tablet_schema.variant_enable_flatten_nested);
    }

    if (tablet_schema.__isset.enable_single_replica_compaction) {
        schema->set_enable_single_replica_compaction(
                tablet_schema.enable_single_replica_compaction);
    }

    if (tablet_schema.__isset.delete_sign_idx) {
        schema->set_delete_sign_idx(tablet_schema.delete_sign_idx);
    }
    if (tablet_schema.__isset.store_row_column) {
        schema->set_store_row_column(tablet_schema.store_row_column);
    }
    if (tablet_schema.__isset.row_store_page_size) {
        schema->set_row_store_page_size(tablet_schema.row_store_page_size);
    }
    if (tablet_schema.__isset.storage_page_size) {
        schema->set_storage_page_size(tablet_schema.storage_page_size);
    }
    if (tablet_schema.__isset.storage_dict_page_size) {
        schema->set_storage_dict_page_size(tablet_schema.storage_dict_page_size);
    }
    if (tablet_schema.__isset.skip_write_index_on_load) {
        schema->set_skip_write_index_on_load(tablet_schema.skip_write_index_on_load);
    }
    if (tablet_schema.__isset.row_store_col_cids) {
        schema->mutable_row_store_column_unique_ids()->Add(tablet_schema.row_store_col_cids.begin(),
                                                           tablet_schema.row_store_col_cids.end());
    }
    if (binlog_config.has_value()) {
        BinlogConfig tmp_binlog_config;
        tmp_binlog_config = binlog_config.value();
        tmp_binlog_config.to_pb(tablet_meta_pb.mutable_binlog_config());
    }

    switch (tde_algorithm) {
    case doris::TEncryptionAlgorithm::AES256:
        tablet_meta_pb.set_encryption_algorithm(EncryptionAlgorithmPB::AES_256_CTR);
        break;
    case doris::TEncryptionAlgorithm::SM4:
        tablet_meta_pb.set_encryption_algorithm(EncryptionAlgorithmPB::SM4_128_CTR);
        break;
    default:
        tablet_meta_pb.set_encryption_algorithm(EncryptionAlgorithmPB::PLAINTEXT);
    }

    init_from_pb(tablet_meta_pb);
}
383
384
// Copy constructor: copies the MetadataAdder base and then each listed member
// from `b`, one initializer per member. The body is empty.
// NOTE(review): _schema, _rs_metas, _stale_rs_metas and _delete_bitmap are
// copied with their own copy constructors; if they are shared-pointer-like,
// the new object aliases the same underlying objects as `b` - confirm against
// the header.
// NOTE(review): members that other functions in this file touch but that are
// absent from this list (_handle, _ttl_seconds, _avg_rs_meta_serialize_size)
// are not taken from `b` here - confirm that is intentional.
TabletMeta::TabletMeta(const TabletMeta& b)
385
20
        : MetadataAdder(b),
386
20
          _table_id(b._table_id),
387
20
          _index_id(b._index_id),
388
20
          _partition_id(b._partition_id),
389
20
          _tablet_id(b._tablet_id),
390
20
          _replica_id(b._replica_id),
391
20
          _schema_hash(b._schema_hash),
392
20
          _shard_id(b._shard_id),
393
20
          _creation_time(b._creation_time),
394
20
          _cumulative_layer_point(b._cumulative_layer_point),
395
20
          _tablet_uid(b._tablet_uid),
396
20
          _tablet_type(b._tablet_type),
397
20
          _tablet_state(b._tablet_state),
398
20
          _schema(b._schema),
399
20
          _rs_metas(b._rs_metas),
400
20
          _stale_rs_metas(b._stale_rs_metas),
401
20
          _in_restore_mode(b._in_restore_mode),
402
20
          _preferred_rowset_type(b._preferred_rowset_type),
403
20
          _storage_policy_id(b._storage_policy_id),
404
20
          _cooldown_meta_id(b._cooldown_meta_id),
405
20
          _enable_unique_key_merge_on_write(b._enable_unique_key_merge_on_write),
406
20
          _delete_bitmap(b._delete_bitmap),
407
20
          _binlog_config(b._binlog_config),
408
20
          _compaction_policy(b._compaction_policy),
409
20
          _time_series_compaction_goal_size_mbytes(b._time_series_compaction_goal_size_mbytes),
410
          _time_series_compaction_file_count_threshold(
411
20
                  b._time_series_compaction_file_count_threshold),
412
          _time_series_compaction_time_threshold_seconds(
413
20
                  b._time_series_compaction_time_threshold_seconds),
414
          _time_series_compaction_empty_rowsets_threshold(
415
20
                  b._time_series_compaction_empty_rowsets_threshold),
416
20
          _time_series_compaction_level_threshold(b._time_series_compaction_level_threshold) {};
417
418
void TabletMeta::init_column_from_tcolumn(uint32_t unique_id, const TColumn& tcolumn,
419
2.15k
                                          ColumnPB* column) {
420
2.15k
    column->set_unique_id(unique_id);
421
2.15k
    column->set_name(tcolumn.column_name);
422
2.15k
    column->set_has_bitmap_index(tcolumn.has_bitmap_index);
423
2.15k
    column->set_is_auto_increment(tcolumn.is_auto_increment);
424
2.15k
    if (tcolumn.__isset.is_on_update_current_timestamp) {
425
2.15k
        column->set_is_on_update_current_timestamp(tcolumn.is_on_update_current_timestamp);
426
2.15k
    }
427
2.15k
    string data_type;
428
2.15k
    EnumToString(TPrimitiveType, tcolumn.column_type.type, data_type);
429
2.15k
    column->set_type(data_type);
430
431
2.15k
    uint32_t length = TabletColumn::get_field_length_by_type(tcolumn.column_type.type,
432
2.15k
                                                             tcolumn.column_type.len);
433
2.15k
    column->set_length(length);
434
2.15k
    column->set_index_length(length);
435
2.15k
    column->set_precision(tcolumn.column_type.precision);
436
2.15k
    column->set_frac(tcolumn.column_type.scale);
437
438
2.15k
    if (tcolumn.__isset.result_is_nullable) {
439
0
        column->set_result_is_nullable(tcolumn.result_is_nullable);
440
0
    }
441
442
2.15k
    if (tcolumn.__isset.be_exec_version) {
443
2.15k
        column->set_be_exec_version(tcolumn.be_exec_version);
444
2.15k
    }
445
446
2.15k
    if (tcolumn.column_type.type == TPrimitiveType::VARCHAR ||
447
2.15k
        tcolumn.column_type.type == TPrimitiveType::STRING) {
448
105
        if (!tcolumn.column_type.__isset.index_len) {
449
105
            column->set_index_length(10);
450
105
        } else {
451
0
            column->set_index_length(tcolumn.column_type.index_len);
452
0
        }
453
105
    }
454
2.15k
    if (!tcolumn.is_key) {
455
1.15k
        column->set_is_key(false);
456
1.15k
        if (tcolumn.__isset.aggregation) {
457
0
            column->set_aggregation(tcolumn.aggregation);
458
1.15k
        } else {
459
1.15k
            string aggregation_type;
460
1.15k
            EnumToString(TAggregationType, tcolumn.aggregation_type, aggregation_type);
461
1.15k
            column->set_aggregation(aggregation_type);
462
1.15k
        }
463
1.15k
    } else {
464
1.00k
        column->set_is_key(true);
465
1.00k
        column->set_aggregation("NONE");
466
1.00k
    }
467
2.15k
    column->set_is_nullable(tcolumn.is_allow_null);
468
2.15k
    if (tcolumn.__isset.default_value) {
469
1
        column->set_default_value(tcolumn.default_value);
470
1
    }
471
2.15k
    if (tcolumn.__isset.is_bloom_filter_column) {
472
1
        column->set_is_bf_column(tcolumn.is_bloom_filter_column);
473
1
    }
474
2.15k
    if (tcolumn.__isset.visible) {
475
2.15k
        column->set_visible(tcolumn.visible);
476
2.15k
    }
477
2.15k
    for (size_t i = 0; i < tcolumn.children_column.size(); i++) {
478
0
        ColumnPB* children_column = column->add_children_columns();
479
0
        init_column_from_tcolumn(tcolumn.children_column[i].col_unique_id,
480
0
                                 tcolumn.children_column[i], children_column);
481
0
    }
482
2.15k
    if (tcolumn.column_type.__isset.variant_max_subcolumns_count) {
483
2.15k
        column->set_variant_max_subcolumns_count(tcolumn.column_type.variant_max_subcolumns_count);
484
2.15k
    }
485
2.15k
    if (tcolumn.__isset.pattern_type) {
486
1
        switch (tcolumn.pattern_type) {
487
0
        case TPatternType::MATCH_NAME:
488
0
            column->set_pattern_type(PatternTypePB::MATCH_NAME);
489
0
            break;
490
1
        case TPatternType::MATCH_NAME_GLOB:
491
1
            column->set_pattern_type(PatternTypePB::MATCH_NAME_GLOB);
492
1
        }
493
1
    }
494
2.15k
    if (tcolumn.__isset.variant_enable_typed_paths_to_sparse) {
495
2.15k
        column->set_variant_enable_typed_paths_to_sparse(
496
2.15k
                tcolumn.variant_enable_typed_paths_to_sparse);
497
2.15k
    }
498
2.15k
    if (tcolumn.__isset.variant_max_sparse_column_statistics_size) {
499
2.15k
        column->set_variant_max_sparse_column_statistics_size(
500
2.15k
                tcolumn.variant_max_sparse_column_statistics_size);
501
2.15k
    }
502
2.15k
}
503
504
0
void TabletMeta::remove_rowset_delete_bitmap(const RowsetId& rowset_id, const Version& version) {
505
0
    if (_enable_unique_key_merge_on_write) {
506
0
        delete_bitmap().remove({rowset_id, 0, 0}, {rowset_id, UINT32_MAX, 0});
507
0
        if (config::enable_mow_verbose_log) {
508
0
            LOG_INFO("delete rowset delete bitmap. tablet={}, rowset={}, version={}", tablet_id(),
509
0
                     rowset_id.to_string(), version.to_string());
510
0
        }
511
0
        size_t rowset_cache_version_size = delete_bitmap().remove_rowset_cache_version(rowset_id);
512
0
        _check_mow_rowset_cache_version_size(rowset_cache_version_size);
513
0
    }
514
0
}
515
516
4
// Loads the tablet meta stored at `file_path` and re-initializes this object
// from it. Returns the load error unchanged if reading fails; in that case
// this object is not modified by this function.
Status TabletMeta::create_from_file(const string& file_path) {
    TabletMetaPB loaded_meta;
    RETURN_IF_ERROR(load_from_file(file_path, &loaded_meta));

    init_from_pb(loaded_meta);
    return Status::OK();
}
522
523
10
// Reads a tablet meta header file into `tablet_meta_pb`.
//
// Returns the deserialize error as-is, or PARSE_PROTOBUF_ERROR when copying
// the decoded message throws.
Status TabletMeta::load_from_file(const string& file_path, TabletMetaPB* tablet_meta_pb) {
    FileHeader<TabletMetaPB> header(file_path);
    // In file_header.deserialize(), it validates file length, signature, checksum of protobuf.
    RETURN_IF_ERROR(header.deserialize());

    try {
        tablet_meta_pb->CopyFrom(header.message());
    } catch (const std::exception& ex) {
        LOG(WARNING) << "Failed to copy protocol buffer object: " << ex.what()
                     << ", file=" << file_path;
        return Status::Error<PARSE_PROTOBUF_ERROR>(
                "fail to copy protocol buffer object. file={}, error={}", file_path, ex.what());
    }

    return Status::OK();
}
537
538
6
// Re-initializes this object from an in-memory header image.
//
// `buffer` / `buffer_size` are handed to FileHeader::deserialize_from_memory;
// its error is returned unchanged. A throwing protobuf copy is reported as
// PARSE_PROTOBUF_ERROR.
Status TabletMeta::create_from_buffer(const uint8_t* buffer, size_t buffer_size) {
    FileHeader<TabletMetaPB> header(""); // empty file path
    RETURN_IF_ERROR(header.deserialize_from_memory(buffer, buffer_size));

    TabletMetaPB decoded_meta;
    try {
        decoded_meta.CopyFrom(header.message());
    } catch (const std::exception& ex) {
        LOG(WARNING) << "Failed to copy protocol buffer object from buffer: " << ex.what();
        return Status::Error<ErrorCode::PARSE_PROTOBUF_ERROR>(
                "fail to copy protocol buffer object from buffer. error={}", ex.what());
    }

    init_from_pb(decoded_meta);
    return Status::OK();
}
554
555
std::string TabletMeta::construct_header_file_path(const string& schema_hash_path,
556
3
                                                   int64_t tablet_id) {
557
3
    std::stringstream header_name_stream;
558
3
    header_name_stream << schema_hash_path << "/" << tablet_id << ".hdr";
559
3
    return header_name_stream.str();
560
3
}
561
562
0
// Writes this tablet meta as pretty-printed JSON (bytes fields base64-encoded)
// to `file_path` on the local file system. Returns the first failing status
// from create / append / close.
Status TabletMeta::save_as_json(const string& file_path) {
    json2pb::Pb2JsonOptions options;
    options.pretty_json = true;
    options.bytes_to_base64 = true;

    std::string json_text;
    to_json(&json_text, options);

    // save to file
    io::FileWriterPtr writer;
    RETURN_IF_ERROR(io::global_local_filesystem()->create_file(file_path, &writer));
    RETURN_IF_ERROR(writer->append(json_text));
    RETURN_IF_ERROR(writer->close());
    return Status::OK();
}
575
576
230
// Converts this object to a TabletMetaPB and writes it to `file_path` through
// the two-argument save() overload.
Status TabletMeta::save(const string& file_path) {
    TabletMetaPB snapshot;
    to_meta_pb(&snapshot);
    Status result = TabletMeta::save(file_path, snapshot);
    return result;
}
581
582
234
// Writes `tablet_meta_pb` to `file_path` as a FileHeader-framed file.
//
// Returns INTERNAL_ERROR when copying the message throws, otherwise the first
// failing status from FileHeader::prepare() / serialize(), or OK.
// `file_path` must not be empty (checked with DCHECK only).
Status TabletMeta::save(const string& file_path, const TabletMetaPB& tablet_meta_pb) {
    DCHECK(!file_path.empty());
    FileHeader<TabletMetaPB> file_header(file_path);
    try {
        file_header.mutable_message()->CopyFrom(tablet_meta_pb);
    } catch (const std::exception& e) {
        // Report the exception text, as load_from_file / create_from_buffer do.
        LOG(WARNING) << "fail to copy protocol buffer object. file='" << file_path
                     << "', error=" << e.what();
        return Status::Error<ErrorCode::INTERNAL_ERROR>(
                "fail to copy protocol buffer object. file={}, error={}", file_path, e.what());
    } catch (...) {
        // Non-std exceptions carry no message; keep the original generic error.
        LOG(WARNING) << "fail to copy protocol buffer object. file='" << file_path << "'";
        return Status::Error<ErrorCode::INTERNAL_ERROR>(
                "fail to copy protocol buffer object. file={}", file_path);
    }
    RETURN_IF_ERROR(file_header.prepare());
    RETURN_IF_ERROR(file_header.serialize());
    return Status::OK();
}
596
597
565
// Persists this meta via _save_meta() while holding _meta_lock exclusively.
Status TabletMeta::save_meta(DataDir* data_dir) {
    std::lock_guard<std::shared_mutex> meta_write_guard(_meta_lock);
    Status result = _save_meta(data_dir);
    return result;
}
601
602
565
// Serializes this meta and stores it through TabletMetaManager::save().
//
// A zero tablet uid or a failed save is reported with LOG(FATAL). When the
// whole operation takes longer than one second (timestamps are in
// microseconds), the serialize and store timings are logged.
Status TabletMeta::_save_meta(DataDir* data_dir) {
    // check if tablet uid is valid
    const bool uid_is_zero = _tablet_uid.hi == 0 && _tablet_uid.lo == 0;
    if (uid_is_zero) {
        LOG(FATAL) << "tablet_uid is invalid"
                   << " tablet=" << tablet_id() << " _tablet_uid=" << _tablet_uid.to_string();
    }

    string meta_binary;
    auto serialize_begin = MonotonicMicros();
    serialize(&meta_binary);
    auto serialize_end = MonotonicMicros();

    Status status = TabletMetaManager::save(data_dir, tablet_id(), schema_hash(), meta_binary);
    if (!status.ok()) {
        LOG(FATAL) << "fail to save tablet_meta. status=" << status << ", tablet_id=" << tablet_id()
                   << ", schema_hash=" << schema_hash();
    }
    auto store_end = MonotonicMicros();

    auto total_cost = store_end - serialize_begin;
    if (total_cost > 1 * 1000 * 1000) {
        LOG(INFO) << "save tablet(" << tablet_id() << ") meta too slow. serialize cost "
                  << serialize_end - serialize_begin
                  << "(us), serialized binary size: " << meta_binary.length()
                  << "(bytes), write rocksdb cost " << store_end - serialize_end << "(us)";
    }
    return status;
}
627
628
570
// Serializes this meta into `meta_binary`.
//
// Steps:
//   1. Convert to TabletMetaPB; warn when the partition id is not positive.
//   2. Serialize. When at least one rowset meta (active or stale) exists,
//      record the average serialized size per rowset meta.
//   3. If the result exceeds config::tablet_meta_serialize_size_limit, or the
//      first attempt failed, drop the stale rowset metas from the message and
//      serialize again.
//   4. A failure that remains after that is reported with LOG(FATAL).
void TabletMeta::serialize(string* meta_binary) {
    TabletMetaPB meta_pb;
    to_meta_pb(&meta_pb);
    if (meta_pb.partition_id() <= 0) {
        LOG(WARNING) << "invalid partition id " << meta_pb.partition_id() << " tablet "
                     << meta_pb.tablet_id();
    }
    DBUG_EXECUTE_IF("TabletMeta::serialize::zero_partition_id", {
        long partition_id = meta_pb.partition_id();
        meta_pb.set_partition_id(0);
        LOG(WARNING) << "set debug point TabletMeta::serialize::zero_partition_id old="
                     << partition_id << " new=" << meta_pb.DebugString();
    });

    bool serialized_ok = meta_pb.SerializeToString(meta_binary);

    const bool has_rowset_metas = !(_rs_metas.empty() && _stale_rs_metas.empty());
    if (has_rowset_metas) {
        _avg_rs_meta_serialize_size =
                meta_binary->length() / (_rs_metas.size() + _stale_rs_metas.size());

        const bool over_limit = meta_binary->length() > config::tablet_meta_serialize_size_limit;
        if (over_limit || !serialized_ok) {
            // Retry without the stale rowset metas.
            int64_t size_before_clean = meta_binary->length();
            int64_t stale_rowset_count = meta_pb.stale_rs_metas().size();
            meta_pb.clear_stale_rs_metas();
            meta_binary->clear();
            serialized_ok = meta_pb.SerializeToString(meta_binary);
            LOG(WARNING) << "tablet meta serialization size exceeds limit: "
                         << config::tablet_meta_serialize_size_limit
                         << " clean up stale rowsets, tablet id: " << tablet_id()
                         << " stale rowset num: " << stale_rowset_count
                         << " serialization size before clean " << size_before_clean
                         << " serialization size after clean " << meta_binary->length();
        }
    }

    if (!serialized_ok) {
        LOG(FATAL) << "failed to serialize meta " << tablet_id();
    }
}
665
666
461
// Parses `meta_binary` as a TabletMetaPB and re-initializes this object from
// it. Returns INIT_FAILED when the bytes do not parse; the object is left
// untouched by this function in that case.
Status TabletMeta::deserialize(std::string_view meta_binary) {
    TabletMetaPB parsed_meta;
    const auto byte_count = static_cast<int32_t>(meta_binary.size());
    if (!parsed_meta.ParseFromArray(meta_binary.data(), byte_count)) {
        return Status::Error<INIT_FAILED>("parse tablet meta failed");
    }

    init_from_pb(parsed_meta);
    return Status::OK();
}
676
677
992
void TabletMeta::init_from_pb(const TabletMetaPB& tablet_meta_pb) {
678
992
    _table_id = tablet_meta_pb.table_id();
679
992
    _index_id = tablet_meta_pb.index_id();
680
992
    _partition_id = tablet_meta_pb.partition_id();
681
992
    _tablet_id = tablet_meta_pb.tablet_id();
682
992
    _replica_id = tablet_meta_pb.replica_id();
683
992
    _schema_hash = tablet_meta_pb.schema_hash();
684
992
    _shard_id = tablet_meta_pb.shard_id();
685
992
    _creation_time = tablet_meta_pb.creation_time();
686
992
    _cumulative_layer_point = tablet_meta_pb.cumulative_layer_point();
687
992
    _tablet_uid = TabletUid(tablet_meta_pb.tablet_uid());
688
992
    _ttl_seconds = tablet_meta_pb.ttl_seconds();
689
992
    if (tablet_meta_pb.has_tablet_type()) {
690
966
        _tablet_type = tablet_meta_pb.tablet_type();
691
966
    } else {
692
26
        _tablet_type = TabletTypePB::TABLET_TYPE_DISK;
693
26
    }
694
695
    // init _tablet_state
696
992
    switch (tablet_meta_pb.tablet_state()) {
697
28
    case PB_NOTREADY:
698
28
        _tablet_state = TabletState::TABLET_NOTREADY;
699
28
        break;
700
739
    case PB_RUNNING:
701
739
        _tablet_state = TabletState::TABLET_RUNNING;
702
739
        break;
703
0
    case PB_TOMBSTONED:
704
0
        _tablet_state = TabletState::TABLET_TOMBSTONED;
705
0
        break;
706
0
    case PB_STOPPED:
707
0
        _tablet_state = TabletState::TABLET_STOPPED;
708
0
        break;
709
225
    case PB_SHUTDOWN:
710
225
        _tablet_state = TabletState::TABLET_SHUTDOWN;
711
225
        break;
712
0
    default:
713
0
        LOG(WARNING) << "tablet has no state. tablet=" << tablet_id()
714
0
                     << ", schema_hash=" << schema_hash();
715
992
    }
716
717
    // init _schema
718
992
    TabletSchemaSPtr schema = std::make_shared<TabletSchema>();
719
992
    schema->init_from_pb(tablet_meta_pb.schema());
720
992
    if (_handle) {
721
3
        TabletSchemaCache::instance()->release(_handle);
722
3
    }
723
992
    auto pair = TabletSchemaCache::instance()->insert(schema->to_key());
724
992
    _handle = pair.first;
725
992
    _schema = pair.second;
726
727
992
    if (tablet_meta_pb.has_enable_unique_key_merge_on_write()) {
728
966
        _enable_unique_key_merge_on_write = tablet_meta_pb.enable_unique_key_merge_on_write();
729
966
    }
730
731
    // init _rs_metas
732
10.8k
    for (auto& it : tablet_meta_pb.rs_metas()) {
733
10.8k
        RowsetMetaSharedPtr rs_meta(new RowsetMeta());
734
10.8k
        rs_meta->init_from_pb(it);
735
10.8k
        _rs_metas.push_back(std::move(rs_meta));
736
10.8k
    }
737
738
    // For mow table, delete bitmap of stale rowsets has not been persisted.
739
    // When be restart, query should not read the stale rowset, otherwise duplicate keys
740
    // will be read out. Therefore, we don't add them to _stale_rs_meta for mow table.
741
992
    if (!config::skip_loading_stale_rowset_meta && !_enable_unique_key_merge_on_write) {
742
957
        for (auto& it : tablet_meta_pb.stale_rs_metas()) {
743
0
            RowsetMetaSharedPtr rs_meta(new RowsetMeta());
744
0
            rs_meta->init_from_pb(it);
745
0
            _stale_rs_metas.push_back(std::move(rs_meta));
746
0
        }
747
957
    }
748
749
992
    if (tablet_meta_pb.has_in_restore_mode()) {
750
966
        _in_restore_mode = tablet_meta_pb.in_restore_mode();
751
966
    }
752
753
992
    if (tablet_meta_pb.has_preferred_rowset_type()) {
754
474
        _preferred_rowset_type = tablet_meta_pb.preferred_rowset_type();
755
474
    }
756
757
992
    _storage_policy_id = tablet_meta_pb.storage_policy_id();
758
992
    if (tablet_meta_pb.has_cooldown_meta_id()) {
759
0
        _cooldown_meta_id = tablet_meta_pb.cooldown_meta_id();
760
0
    }
761
762
992
    if (tablet_meta_pb.has_delete_bitmap()) {
763
0
        int rst_ids_size = tablet_meta_pb.delete_bitmap().rowset_ids_size();
764
0
        int seg_ids_size = tablet_meta_pb.delete_bitmap().segment_ids_size();
765
0
        int versions_size = tablet_meta_pb.delete_bitmap().versions_size();
766
0
        int seg_maps_size = tablet_meta_pb.delete_bitmap().segment_delete_bitmaps_size();
767
0
        CHECK(rst_ids_size == seg_ids_size && seg_ids_size == seg_maps_size &&
768
0
              seg_maps_size == versions_size);
769
0
        for (int i = 0; i < rst_ids_size; ++i) {
770
0
            RowsetId rst_id;
771
0
            rst_id.init(tablet_meta_pb.delete_bitmap().rowset_ids(i));
772
0
            auto seg_id = tablet_meta_pb.delete_bitmap().segment_ids(i);
773
0
            auto ver = tablet_meta_pb.delete_bitmap().versions(i);
774
0
            auto bitmap = tablet_meta_pb.delete_bitmap().segment_delete_bitmaps(i).data();
775
0
            delete_bitmap().delete_bitmap[{rst_id, seg_id, ver}] = roaring::Roaring::read(bitmap);
776
0
        }
777
0
    }
778
779
992
    if (tablet_meta_pb.has_binlog_config()) {
780
472
        _binlog_config = tablet_meta_pb.binlog_config();
781
472
    }
782
992
    _compaction_policy = tablet_meta_pb.compaction_policy();
783
992
    _time_series_compaction_goal_size_mbytes =
784
992
            tablet_meta_pb.time_series_compaction_goal_size_mbytes();
785
992
    _time_series_compaction_file_count_threshold =
786
992
            tablet_meta_pb.time_series_compaction_file_count_threshold();
787
992
    _time_series_compaction_time_threshold_seconds =
788
992
            tablet_meta_pb.time_series_compaction_time_threshold_seconds();
789
992
    _time_series_compaction_empty_rowsets_threshold =
790
992
            tablet_meta_pb.time_series_compaction_empty_rowsets_threshold();
791
992
    _time_series_compaction_level_threshold =
792
992
            tablet_meta_pb.time_series_compaction_level_threshold();
793
794
992
    if (tablet_meta_pb.has_encryption_algorithm()) {
795
964
        _encryption_algorithm = tablet_meta_pb.encryption_algorithm();
796
964
    }
797
992
}
798
799
814
// Writes the current state of this TabletMeta into `tablet_meta_pb`.
//
// Rowset metas are only embedded outside cloud mode. For merge-on-write
// tablets a snapshot of the delete bitmap is exported as well, leaving out
// entries that belong to stale rowsets.
void TabletMeta::to_meta_pb(TabletMetaPB* tablet_meta_pb) {
    // Identity and bookkeeping scalars.
    tablet_meta_pb->set_table_id(table_id());
    tablet_meta_pb->set_index_id(index_id());
    tablet_meta_pb->set_partition_id(partition_id());
    tablet_meta_pb->set_tablet_id(tablet_id());
    tablet_meta_pb->set_replica_id(replica_id());
    tablet_meta_pb->set_schema_hash(schema_hash());
    tablet_meta_pb->set_shard_id(shard_id());
    tablet_meta_pb->set_creation_time(creation_time());
    tablet_meta_pb->set_cumulative_layer_point(cumulative_layer_point());
    *(tablet_meta_pb->mutable_tablet_uid()) = tablet_uid().to_proto();
    tablet_meta_pb->set_tablet_type(_tablet_type);
    tablet_meta_pb->set_ttl_seconds(_ttl_seconds);

    // Map the in-memory state onto its protobuf counterpart.
    switch (tablet_state()) {
    case TABLET_NOTREADY:
        tablet_meta_pb->set_tablet_state(PB_NOTREADY);
        break;
    case TABLET_RUNNING:
        tablet_meta_pb->set_tablet_state(PB_RUNNING);
        break;
    case TABLET_TOMBSTONED:
        tablet_meta_pb->set_tablet_state(PB_TOMBSTONED);
        break;
    case TABLET_STOPPED:
        tablet_meta_pb->set_tablet_state(PB_STOPPED);
        break;
    case TABLET_SHUTDOWN:
        tablet_meta_pb->set_tablet_state(PB_SHUTDOWN);
        break;
    }

    // In cloud mode RowsetMetaPB is kept separately from TabletMetaPB, so the
    // rowset lists are only embedded otherwise.
    if (!config::is_cloud_mode()) {
        for (const auto& visible_rs : _rs_metas) {
            visible_rs->to_rowset_pb(tablet_meta_pb->add_rs_metas());
        }
        for (const auto& stale_rs : _stale_rs_metas) {
            stale_rs->to_rowset_pb(tablet_meta_pb->add_stale_rs_metas());
        }
    }

    _schema->to_schema_pb(tablet_meta_pb->mutable_schema());

    tablet_meta_pb->set_in_restore_mode(in_restore_mode());

    // The optional fields below are only written when they carry a
    // non-default value, to change the persisted meta as little as possible.
    if (_preferred_rowset_type == BETA_ROWSET) {
        tablet_meta_pb->set_preferred_rowset_type(_preferred_rowset_type);
    }
    if (_storage_policy_id > 0) {
        tablet_meta_pb->set_storage_policy_id(_storage_policy_id);
    }
    if (_cooldown_meta_id.initialized()) {
        tablet_meta_pb->mutable_cooldown_meta_id()->CopyFrom(_cooldown_meta_id.to_proto());
    }

    tablet_meta_pb->set_enable_unique_key_merge_on_write(_enable_unique_key_merge_on_write);

    if (_enable_unique_key_merge_on_write) {
        // Rowsets whose bitmap entries must not be persisted.
        std::set<RowsetId> stale_rs_ids;
        for (const auto& stale_rs : _stale_rs_metas) {
            stale_rs_ids.insert(stale_rs->rowset_id());
        }

        DeleteBitmapPB* delete_bitmap_pb = tablet_meta_pb->mutable_delete_bitmap();
        // Work on a snapshot so the live bitmap is not iterated directly.
        const DeleteBitmap bitmap_snapshot = delete_bitmap().snapshot();
        for (const auto& [key, segment_bitmap] : bitmap_snapshot.delete_bitmap) {
            const auto& [rowset_id, segment_id, ver] = key;
            if (stale_rs_ids.find(rowset_id) != stale_rs_ids.end()) {
                continue;
            }
            // Four parallel arrays: entry i of each describes one bitmap.
            delete_bitmap_pb->add_rowset_ids(rowset_id.to_string());
            delete_bitmap_pb->add_segment_ids(segment_id);
            delete_bitmap_pb->add_versions(ver);
            std::string bitmap_data(segment_bitmap.getSizeInBytes(), '\0');
            segment_bitmap.write(bitmap_data.data());
            *(delete_bitmap_pb->add_segment_delete_bitmaps()) = std::move(bitmap_data);
        }
    }

    _binlog_config.to_pb(tablet_meta_pb->mutable_binlog_config());

    // Compaction policy and its time-series tuning knobs.
    tablet_meta_pb->set_compaction_policy(compaction_policy());
    tablet_meta_pb->set_time_series_compaction_goal_size_mbytes(
            time_series_compaction_goal_size_mbytes());
    tablet_meta_pb->set_time_series_compaction_file_count_threshold(
            time_series_compaction_file_count_threshold());
    tablet_meta_pb->set_time_series_compaction_time_threshold_seconds(
            time_series_compaction_time_threshold_seconds());
    tablet_meta_pb->set_time_series_compaction_empty_rowsets_threshold(
            time_series_compaction_empty_rowsets_threshold());
    tablet_meta_pb->set_time_series_compaction_level_threshold(
            time_series_compaction_level_threshold());

    tablet_meta_pb->set_encryption_algorithm(_encryption_algorithm);
}
891
892
2
// Renders this TabletMeta as JSON into `json_string`: the meta is first
// converted with to_meta_pb() and the protobuf is then serialised by json2pb
// using the caller-supplied `options`.
void TabletMeta::to_json(string* json_string, json2pb::Pb2JsonOptions& options) {
    TabletMetaPB meta_as_pb;
    to_meta_pb(&meta_as_pb);
    json2pb::ProtoMessageToJson(meta_as_pb, json_string, options);
}
897
898
124
// Returns the version of the visible rowset with the greatest end version.
// When no rowset has an end version above 0 the sentinel {-1, 0} is returned.
Version TabletMeta::max_version() const {
    Version newest = {-1, 0};
    for (const auto& meta : _rs_metas) {
        if (meta->end_version() <= newest.second) {
            continue;
        }
        newest = meta->version();
    }
    return newest;
}
907
908
0
size_t TabletMeta::version_count_cross_with_range(const Version& range) const {
909
0
    size_t count = 0;
910
0
    for (const auto& rs_meta : _rs_metas) {
911
0
        if (!(range.first > rs_meta->version().second || range.second < rs_meta->version().first)) {
912
0
            count++;
913
0
        }
914
0
    }
915
0
    return count;
916
0
}
917
918
11.1k
// Appends `rs_meta` to the visible rowset list unless a rowset with the same
// version is already present.
//
// Returns:
//   - Status::OK() when the meta was appended, or when the same rowset id with
//     the same version already exists (duplicate request, nothing is added);
//   - PUSH_VERSION_ALREADY_EXIST when a different rowset already occupies that
//     version.
Status TabletMeta::add_rs_meta(const RowsetMetaSharedPtr& rs_meta) {
    for (const auto& existing : _rs_metas) {
        if (!(existing->version() == rs_meta->version())) {
            continue;
        }
        if (existing->rowset_id() != rs_meta->rowset_id()) {
            return Status::Error<PUSH_VERSION_ALREADY_EXIST>(
                    "version already exist. rowset_id={}, version={}, tablet={}",
                    existing->rowset_id().to_string(), existing->version().to_string(),
                    tablet_id());
        }
        // Same rowset id and version: this is a repeated request, skip it.
        return Status::OK();
    }
    _rs_metas.push_back(rs_meta);
    return Status::OK();
}
935
936
11
void TabletMeta::add_rowsets_unchecked(const std::vector<RowsetSharedPtr>& to_add) {
937
34
    for (const auto& rs : to_add) {
938
34
        _rs_metas.push_back(rs->rowset_meta());
939
34
    }
940
11
}
941
942
void TabletMeta::delete_rs_meta_by_version(const Version& version,
943
0
                                           std::vector<RowsetMetaSharedPtr>* deleted_rs_metas) {
944
0
    size_t rowset_cache_version_size = 0;
945
0
    auto it = _rs_metas.begin();
946
0
    while (it != _rs_metas.end()) {
947
0
        if ((*it)->version() == version) {
948
0
            if (deleted_rs_metas != nullptr) {
949
0
                deleted_rs_metas->push_back(*it);
950
0
            }
951
0
            _rs_metas.erase(it);
952
0
            if (_enable_unique_key_merge_on_write) {
953
0
                rowset_cache_version_size =
954
0
                        _delete_bitmap->remove_rowset_cache_version((*it)->rowset_id());
955
0
            }
956
0
            return;
957
0
        } else {
958
0
            ++it;
959
0
        }
960
0
    }
961
0
    _check_mow_rowset_cache_version_size(rowset_cache_version_size);
962
0
}
963
964
void TabletMeta::modify_rs_metas(const std::vector<RowsetMetaSharedPtr>& to_add,
965
                                 const std::vector<RowsetMetaSharedPtr>& to_delete,
966
25
                                 bool same_version) {
967
25
    size_t rowset_cache_version_size = 0;
968
    // Remove to_delete rowsets from _rs_metas
969
38
    for (auto rs_to_del : to_delete) {
970
38
        auto it = _rs_metas.begin();
971
200
        while (it != _rs_metas.end()) {
972
200
            if (rs_to_del->version() == (*it)->version()) {
973
38
                _rs_metas.erase(it);
974
38
                if (_enable_unique_key_merge_on_write) {
975
1
                    rowset_cache_version_size =
976
1
                            _delete_bitmap->remove_rowset_cache_version((*it)->rowset_id());
977
1
                }
978
                // there should be only one rowset match the version
979
38
                break;
980
162
            } else {
981
162
                ++it;
982
162
            }
983
200
        }
984
38
    }
985
25
    if (!same_version) {
986
        // put to_delete rowsets in _stale_rs_metas.
987
7
        _stale_rs_metas.insert(_stale_rs_metas.end(), to_delete.begin(), to_delete.end());
988
7
    }
989
990
    // put to_add rowsets in _rs_metas.
991
25
    _rs_metas.insert(_rs_metas.end(), to_add.begin(), to_add.end());
992
25
    _check_mow_rowset_cache_version_size(rowset_cache_version_size);
993
25
}
994
995
// Use the passing "rs_metas" to replace the rs meta in this tablet meta
996
// Also clear the _stale_rs_metas because this tablet meta maybe copyied from
997
// an existing tablet before. Add after revise, only the passing "rs_metas"
998
// is needed.
999
5
void TabletMeta::revise_rs_metas(std::vector<RowsetMetaSharedPtr>&& rs_metas) {
1000
5
    {
1001
5
        std::lock_guard<std::shared_mutex> wrlock(_meta_lock);
1002
5
        _rs_metas = std::move(rs_metas);
1003
5
        _stale_rs_metas.clear();
1004
5
    }
1005
5
    if (_enable_unique_key_merge_on_write) {
1006
0
        _delete_bitmap->clear_rowset_cache_version();
1007
0
    }
1008
5
}
1009
1010
// This method should call after revise_rs_metas, since new rs_metas might be a subset
1011
// of original tablet, we should revise the delete_bitmap according to current rowset.
1012
//
1013
// Delete bitmap is protected by Tablet::_meta_lock, we don't need to acquire the
1014
// TabletMeta's _meta_lock
1015
1
void TabletMeta::revise_delete_bitmap_unlocked(const DeleteBitmap& delete_bitmap) {
1016
1
    _delete_bitmap = std::make_unique<DeleteBitmap>(tablet_id());
1017
2
    for (auto rs : _rs_metas) {
1018
2
        DeleteBitmap rs_bm(tablet_id());
1019
2
        delete_bitmap.subset({rs->rowset_id(), 0, 0}, {rs->rowset_id(), UINT32_MAX, INT64_MAX},
1020
2
                             &rs_bm);
1021
2
        _delete_bitmap->merge(rs_bm);
1022
2
    }
1023
1
    for (auto rs : _stale_rs_metas) {
1024
0
        DeleteBitmap rs_bm(tablet_id());
1025
0
        delete_bitmap.subset({rs->rowset_id(), 0, 0}, {rs->rowset_id(), UINT32_MAX, INT64_MAX},
1026
0
                             &rs_bm);
1027
0
        _delete_bitmap->merge(rs_bm);
1028
0
    }
1029
1
}
1030
1031
0
void TabletMeta::delete_stale_rs_meta_by_version(const Version& version) {
1032
0
    auto it = _stale_rs_metas.begin();
1033
0
    while (it != _stale_rs_metas.end()) {
1034
0
        if ((*it)->version() == version) {
1035
0
            it = _stale_rs_metas.erase(it);
1036
0
        } else {
1037
0
            it++;
1038
0
        }
1039
0
    }
1040
0
}
1041
1042
0
// Returns the first visible rowset meta whose version equals `version`, or
// nullptr when there is none.
RowsetMetaSharedPtr TabletMeta::acquire_rs_meta_by_version(const Version& version) const {
    for (const auto& candidate : _rs_metas) {
        if (candidate->version() == version) {
            return candidate;
        }
    }
    return nullptr;
}
1050
1051
8
// Returns the first stale rowset meta whose version equals `version`, or
// nullptr when there is none.
RowsetMetaSharedPtr TabletMeta::acquire_stale_rs_meta_by_version(const Version& version) const {
    for (const auto& candidate : _stale_rs_metas) {
        if (candidate->version() == version) {
            return candidate;
        }
    }
    return nullptr;
}
1059
1060
23
// Sets the partition id of this tablet meta.
//
// A warning is logged when the new id is smaller than 1 or differs from an
// already assigned (positive) id, but the value is stored regardless and
// Status::OK() is always returned.
Status TabletMeta::set_partition_id(int64_t partition_id) {
    const bool overrides_existing = _partition_id > 0 && _partition_id != partition_id;
    const bool suspicious = overrides_existing || partition_id < 1;
    if (suspicious) {
        LOG(WARNING) << "cur partition id=" << _partition_id << " new partition id=" << partition_id
                     << " not equal";
    }
    _partition_id = partition_id;
    return Status::OK();
}
1068
1069
0
void TabletMeta::clear_stale_rowset() {
1070
0
    _stale_rs_metas.clear();
1071
0
    if (_enable_unique_key_merge_on_write) {
1072
0
        _delete_bitmap->clear_rowset_cache_version();
1073
0
    }
1074
0
}
1075
1076
0
void TabletMeta::clear_rowsets() {
1077
0
    _rs_metas.clear();
1078
0
    if (_enable_unique_key_merge_on_write) {
1079
0
        _delete_bitmap->clear_rowset_cache_version();
1080
0
    }
1081
0
}
1082
1083
25
void TabletMeta::_check_mow_rowset_cache_version_size(size_t rowset_cache_version_size) {
1084
25
    if (_enable_unique_key_merge_on_write && config::enable_mow_verbose_log &&
1085
25
        rowset_cache_version_size > _rs_metas.size() + _stale_rs_metas.size()) {
1086
0
        std::stringstream ss;
1087
0
        auto rowset_ids = _delete_bitmap->get_rowset_cache_version();
1088
0
        std::set<std::string> tablet_rowset_ids;
1089
0
        {
1090
0
            std::shared_lock rlock(_meta_lock);
1091
0
            for (auto& rs_meta : _rs_metas) {
1092
0
                tablet_rowset_ids.emplace(rs_meta->rowset_id().to_string());
1093
0
            }
1094
0
            for (auto& rs_meta : _stale_rs_metas) {
1095
0
                tablet_rowset_ids.emplace(rs_meta->rowset_id().to_string());
1096
0
            }
1097
0
        }
1098
0
        for (const auto& rowset_id : rowset_ids) {
1099
0
            if (tablet_rowset_ids.find(rowset_id) == tablet_rowset_ids.end()) {
1100
0
                ss << rowset_id << ", ";
1101
0
            }
1102
0
        }
1103
        // size(rowset_cache_version) <= size(_rs_metas) + size(_stale_rs_metas) + size(_unused_rs)
1104
0
        std::string msg = fmt::format(
1105
0
                "tablet: {}, rowset_cache_version size: {}, "
1106
0
                "_rs_metas size: {}, _stale_rs_metas size: {}, delta: {}. rowset only in cache: {}",
1107
0
                _tablet_id, rowset_cache_version_size, _rs_metas.size(), _stale_rs_metas.size(),
1108
0
                rowset_cache_version_size - _rs_metas.size() - _stale_rs_metas.size(), ss.str());
1109
0
        LOG(INFO) << msg;
1110
0
    }
1111
25
}
1112
1113
3
// Field-wise equality of two TabletMeta objects. Compares the identity
// fields, tablet type and state, schema (by value), the visible rowset meta
// lists, restore mode, preferred rowset type, storage policy id and the
// compaction settings. Fields not listed here are not part of the comparison.
bool operator==(const TabletMeta& a, const TabletMeta& b) {
    // Identity and bookkeeping scalars.
    if (a._table_id != b._table_id || a._index_id != b._index_id ||
        a._partition_id != b._partition_id || a._tablet_id != b._tablet_id ||
        a._replica_id != b._replica_id || a._schema_hash != b._schema_hash ||
        a._shard_id != b._shard_id || a._creation_time != b._creation_time ||
        a._cumulative_layer_point != b._cumulative_layer_point ||
        a._tablet_uid != b._tablet_uid || a._tablet_type != b._tablet_type ||
        a._tablet_state != b._tablet_state) {
        return false;
    }

    if (*a._schema != *b._schema) {
        return false;
    }

    // NOTE(review): the elements are RowsetMetaSharedPtr, so this presumably
    // compares pointer identity rather than the rowset metas' contents --
    // confirm that this is intended.
    const size_t rowset_count = a._rs_metas.size();
    if (rowset_count != b._rs_metas.size()) {
        return false;
    }
    for (size_t i = 0; i < rowset_count; ++i) {
        if (a._rs_metas[i] != b._rs_metas[i]) {
            return false;
        }
    }

    if (a._in_restore_mode != b._in_restore_mode ||
        a._preferred_rowset_type != b._preferred_rowset_type ||
        a._storage_policy_id != b._storage_policy_id ||
        a._compaction_policy != b._compaction_policy) {
        return false;
    }

    // Time-series compaction tuning knobs.
    if (a._time_series_compaction_goal_size_mbytes != b._time_series_compaction_goal_size_mbytes ||
        a._time_series_compaction_file_count_threshold !=
                b._time_series_compaction_file_count_threshold ||
        a._time_series_compaction_time_threshold_seconds !=
                b._time_series_compaction_time_threshold_seconds ||
        a._time_series_compaction_empty_rowsets_threshold !=
                b._time_series_compaction_empty_rowsets_threshold ||
        a._time_series_compaction_level_threshold != b._time_series_compaction_level_threshold) {
        return false;
    }
    return true;
}
1150
1151
0
// Negation of operator==(const TabletMeta&, const TabletMeta&).
bool operator!=(const TabletMeta& a, const TabletMeta& b) {
    const bool same = (a == b);
    return !same;
}
1154
1155
// Constructs the cache on top of LRUCachePolicy, registered under the
// DELETE_BITMAP_AGG_CACHE type with SIZE-based LRU accounting, the given
// `capacity`, and the stale sweep time taken from
// config::delete_bitmap_agg_cache_stale_sweep_time_sec.
// NOTE(review): the trailing 256 is presumably the shard count -- confirm
// against the LRUCachePolicy constructor.
DeleteBitmapAggCache::DeleteBitmapAggCache(size_t capacity)
        : LRUCachePolicy(CachePolicy::CacheType::DELETE_BITMAP_AGG_CACHE,
                         capacity,
                         LRUCacheType::SIZE,
                         config::delete_bitmap_agg_cache_stale_sweep_time_sec,
                         256) {}
1159
1160
189
// Returns the DeleteBitmapAggCache owned by the global ExecEnv.
DeleteBitmapAggCache* DeleteBitmapAggCache::instance() {
    auto* exec_env = ExecEnv::GetInstance();
    return exec_env->delete_bitmap_agg_cache();
}
1163
1164
1
// Heap-allocates a new DeleteBitmapAggCache with the given `capacity` and
// returns the raw pointer; the caller is responsible for the object.
DeleteBitmapAggCache* DeleteBitmapAggCache::create_instance(size_t capacity) {
    auto* cache = new DeleteBitmapAggCache(capacity);
    return cache;
}
1167
1168
1.07k
DeleteBitmap::DeleteBitmap(int64_t tablet_id) : _tablet_id(tablet_id) {}
1169
1170
7
// Copy constructor: copies the bitmap map and the tablet id of `o` while
// holding a shared lock on `o`. The rowset cache version state is not copied.
DeleteBitmap::DeleteBitmap(const DeleteBitmap& o) {
    std::shared_lock source_guard(o.lock);
    _tablet_id = o._tablet_id;
    delete_bitmap = o.delete_bitmap;
}
1175
1176
0
// Copy assignment: replaces this object's bitmap map and tablet id with those
// of `o`. Self-assignment is a no-op.
//
// This object is locked exclusively and `o` is locked shared. The two locks
// are always acquired in ascending address order of the objects, so that two
// threads assigning in opposite directions cannot deadlock.
DeleteBitmap& DeleteBitmap::operator=(const DeleteBitmap& o) {
    if (this == &o) return *this;

    std::unique_lock self_guard(lock, std::defer_lock);
    std::shared_lock source_guard(o.lock, std::defer_lock);
    if (this < &o) {
        self_guard.lock();
        source_guard.lock();
    } else {
        source_guard.lock();
        self_guard.lock();
    }

    delete_bitmap = o.delete_bitmap;
    _tablet_id = o._tablet_id;
    return *this;
}
1191
1192
0
// Move constructor: takes over the bitmap map and tablet id of `o` and clears
// `o`'s rowset cache versions, all while holding both of `o`'s locks.
DeleteBitmap::DeleteBitmap(DeleteBitmap&& o) noexcept {
    std::scoped_lock source_guard(o.lock, o._rowset_cache_version_lock);
    _tablet_id = std::move(o._tablet_id);
    delete_bitmap = std::move(o.delete_bitmap);
    o._rowset_cache_version.clear();
}
1198
1199
0
// Move assignment: takes over the bitmap map and tablet id of `o` and clears
// `o`'s rowset cache versions. This object's lock and both of `o`'s locks are
// acquired together through std::scoped_lock. Self-assignment is a no-op.
DeleteBitmap& DeleteBitmap::operator=(DeleteBitmap&& o) noexcept {
    if (this != &o) {
        std::scoped_lock guard(lock, o.lock, o._rowset_cache_version_lock);
        delete_bitmap = std::move(o.delete_bitmap);
        _tablet_id = std::move(o._tablet_id);
        o._rowset_cache_version.clear();
    }
    return *this;
}
1207
1208
7
// Returns a copy of this delete bitmap (bitmap map and tablet id).
//
// No lock is taken here on purpose: the copy constructor already acquires a
// shared lock on its source for the duration of the copy. Taking `lock` in
// shared mode here as well would make the same thread own the shared_mutex
// twice, which is undefined behaviour for std::shared_mutex and can deadlock
// if a writer queues up between the two acquisitions.
DeleteBitmap DeleteBitmap::snapshot() const {
    return DeleteBitmap(*this);
}
1212
1213
3
// Returns a copy of this delete bitmap restricted to entries whose version is
// not greater than `version`.
DeleteBitmap DeleteBitmap::snapshot(Version version) const {
    // Copy everything first, then drop the entries that are too new.
    DeleteBitmap trimmed = snapshot();
    auto& entries = trimmed.delete_bitmap;
    for (auto it = entries.begin(); it != entries.end();) {
        if (std::get<2>(it->first) > version) {
            it = entries.erase(it);
        } else {
            ++it;
        }
    }
    return trimmed;
}
1226
1227
463k
void DeleteBitmap::add(const BitmapKey& bmk, uint32_t row_id) {
1228
463k
    std::lock_guard l(lock);
1229
463k
    delete_bitmap[bmk].add(row_id);
1230
463k
}
1231
1232
0
int DeleteBitmap::remove(const BitmapKey& bmk, uint32_t row_id) {
1233
0
    std::lock_guard l(lock);
1234
0
    auto it = delete_bitmap.find(bmk);
1235
0
    if (it == delete_bitmap.end()) return -1;
1236
0
    it->second.remove(row_id);
1237
0
    return 0;
1238
0
}
1239
1240
8
void DeleteBitmap::remove(const BitmapKey& start, const BitmapKey& end) {
1241
8
    std::lock_guard l(lock);
1242
107
    for (auto it = delete_bitmap.lower_bound(start); it != delete_bitmap.end();) {
1243
101
        auto& [k, _] = *it;
1244
101
        if (k >= end) {
1245
2
            break;
1246
2
        }
1247
99
        it = delete_bitmap.erase(it);
1248
99
    }
1249
8
}
1250
1251
0
void DeleteBitmap::remove(const std::vector<std::tuple<BitmapKey, BitmapKey>>& key_ranges) {
1252
0
    std::lock_guard l(lock);
1253
0
    for (auto& [start, end] : key_ranges) {
1254
0
        for (auto it = delete_bitmap.lower_bound(start); it != delete_bitmap.end();) {
1255
0
            auto& [k, _] = *it;
1256
0
            if (k >= end) {
1257
0
                break;
1258
0
            }
1259
0
            it = delete_bitmap.erase(it);
1260
0
        }
1261
0
    }
1262
0
}
1263
1264
6
bool DeleteBitmap::contains(const BitmapKey& bmk, uint32_t row_id) const {
1265
6
    std::shared_lock l(lock);
1266
6
    auto it = delete_bitmap.find(bmk);
1267
6
    return it != delete_bitmap.end() && it->second.contains(row_id);
1268
6
}
1269
1270
2
bool DeleteBitmap::contains_agg(const BitmapKey& bmk, uint32_t row_id) const {
1271
2
    return get_agg(bmk)->contains(row_id);
1272
2
}
1273
1274
0
bool DeleteBitmap::empty() const {
1275
0
    std::shared_lock l(lock);
1276
0
    return delete_bitmap.empty();
1277
0
}
1278
1279
63
uint64_t DeleteBitmap::cardinality() const {
1280
63
    std::shared_lock l(lock);
1281
63
    uint64_t res = 0;
1282
314
    for (auto entry : delete_bitmap) {
1283
314
        if (std::get<1>(entry.first) != DeleteBitmap::INVALID_SEGMENT_ID) {
1284
314
            res += entry.second.cardinality();
1285
314
        }
1286
314
    }
1287
63
    return res;
1288
63
}
1289
1290
0
uint64_t DeleteBitmap::get_size() const {
1291
0
    std::shared_lock l(lock);
1292
0
    uint64_t charge = 0;
1293
0
    for (auto& [k, v] : delete_bitmap) {
1294
0
        if (std::get<1>(k) != DeleteBitmap::INVALID_SEGMENT_ID) {
1295
0
            charge += v.getSizeInBytes();
1296
0
        }
1297
0
    }
1298
0
    return charge;
1299
0
}
1300
1301
1
bool DeleteBitmap::contains_agg_without_cache(const BitmapKey& bmk, uint32_t row_id) const {
1302
1
    std::shared_lock l(lock);
1303
1
    DeleteBitmap::BitmapKey start {std::get<0>(bmk), std::get<1>(bmk), 0};
1304
1
    for (auto it = delete_bitmap.lower_bound(start); it != delete_bitmap.end(); ++it) {
1305
0
        auto& [k, bm] = *it;
1306
0
        if (std::get<0>(k) != std::get<0>(bmk) || std::get<1>(k) != std::get<1>(bmk) ||
1307
0
            std::get<2>(k) > std::get<2>(bmk)) {
1308
0
            break;
1309
0
        }
1310
0
        if (bm.contains(row_id)) {
1311
0
            return true;
1312
0
        }
1313
0
    }
1314
1
    return false;
1315
1
}
1316
1317
0
void DeleteBitmap::remove_sentinel_marks() {
1318
0
    std::lock_guard l(lock);
1319
0
    for (auto it = delete_bitmap.begin(), end = delete_bitmap.end(); it != end;) {
1320
0
        if (std::get<1>(it->first) == DeleteBitmap::INVALID_SEGMENT_ID) {
1321
0
            it = delete_bitmap.erase(it);
1322
0
        } else {
1323
0
            ++it;
1324
0
        }
1325
0
    }
1326
0
}
1327
1328
38
// Stores a copy of `segment_delete_bitmap` under `bmk`, replacing any bitmap
// already stored there.
//
// Returns 1 when a new entry was inserted and 0 when an existing entry was
// overwritten.
int DeleteBitmap::set(const BitmapKey& bmk, const roaring::Roaring& segment_delete_bitmap) {
    std::lock_guard write_guard(lock);
    const auto outcome = delete_bitmap.insert_or_assign(bmk, segment_delete_bitmap);
    return outcome.second;
}
1333
1334
7
int DeleteBitmap::get(const BitmapKey& bmk, roaring::Roaring* segment_delete_bitmap) const {
1335
7
    std::shared_lock l(lock);
1336
7
    auto it = delete_bitmap.find(bmk);
1337
7
    if (it == delete_bitmap.end()) return -1;
1338
7
    *segment_delete_bitmap = it->second; // copy
1339
7
    return 0;
1340
7
}
1341
1342
54
// Returns a pointer to the bitmap stored under `bmk`, or nullptr when there is
// none. The pointer refers to the element inside this object's map; the shared
// lock is only held for the lookup itself, not while the caller uses the
// pointer.
const roaring::Roaring* DeleteBitmap::get(const BitmapKey& bmk) const {
    std::shared_lock read_guard(lock);
    const auto it = delete_bitmap.find(bmk);
    return it != delete_bitmap.end() ? &(it->second) : nullptr;
}
1348
1349
void DeleteBitmap::subset(const BitmapKey& start, const BitmapKey& end,
1350
3
                          DeleteBitmap* subset_rowset_map) const {
1351
3
    DCHECK(start < end);
1352
3
    std::shared_lock l(lock);
1353
26
    for (auto it = delete_bitmap.lower_bound(start); it != delete_bitmap.end(); ++it) {
1354
25
        auto& [k, bm] = *it;
1355
25
        if (k >= end) {
1356
2
            break;
1357
2
        }
1358
23
        subset_rowset_map->set(k, bm);
1359
23
    }
1360
3
}
1361
1362
void DeleteBitmap::subset(std::vector<std::pair<RowsetId, int64_t>>& rowset_ids,
1363
                          int64_t start_version, int64_t end_version,
1364
0
                          DeleteBitmap* subset_delete_map) const {
1365
0
    DCHECK(start_version <= end_version);
1366
0
    for (auto& [rowset_id, _] : rowset_ids) {
1367
0
        BitmapKey start {rowset_id, 0, 0};
1368
0
        BitmapKey end {rowset_id, UINT32_MAX, end_version + 1};
1369
0
        std::shared_lock l(lock);
1370
0
        for (auto it = delete_bitmap.lower_bound(start); it != delete_bitmap.end(); ++it) {
1371
0
            auto& [k, bm] = *it;
1372
0
            if (k >= end) {
1373
0
                break;
1374
0
            }
1375
0
            auto version = std::get<2>(k);
1376
0
            if (version >= start_version && version <= end_version) {
1377
0
                subset_delete_map->merge(k, bm);
1378
0
                VLOG_DEBUG << "subset delete bitmap, tablet=" << _tablet_id << ", version=["
1379
0
                           << start_version << ", " << end_version
1380
0
                           << "]. rowset=" << std::get<0>(k).to_string()
1381
0
                           << ", segment=" << std::get<1>(k) << ", version=" << version
1382
0
                           << ", cardinality=" << bm.cardinality();
1383
0
            }
1384
0
        }
1385
0
    }
1386
0
}
1387
1388
void DeleteBitmap::subset_and_agg(std::vector<std::pair<RowsetId, int64_t>>& rowset_ids,
1389
                                  int64_t start_version, int64_t end_version,
1390
1
                                  DeleteBitmap* subset_delete_map) const {
1391
1
    DCHECK(start_version <= end_version);
1392
2
    for (auto& [rowset_id, segment_num] : rowset_ids) {
1393
6
        for (int64_t seg_id = 0; seg_id < segment_num; ++seg_id) {
1394
4
            BitmapKey end {rowset_id, seg_id, end_version};
1395
4
            auto bm = get_agg_without_cache(end, start_version);
1396
4
            VLOG_DEBUG << "subset delete bitmap, tablet=" << _tablet_id << ", rowset=" << rowset_id
1397
0
                       << ", segment=" << seg_id << ", version=[" << start_version << "-"
1398
0
                       << end_version << "], cardinality=" << bm->cardinality();
1399
4
            if (bm->isEmpty()) {
1400
0
                continue;
1401
0
            }
1402
4
            subset_delete_map->merge(end, *bm);
1403
4
        }
1404
2
    }
1405
1
}
1406
1407
0
size_t DeleteBitmap::get_count_with_range(const BitmapKey& start, const BitmapKey& end) const {
1408
0
    DCHECK(start < end);
1409
0
    size_t count = 0;
1410
0
    std::shared_lock l(lock);
1411
0
    for (auto it = delete_bitmap.lower_bound(start); it != delete_bitmap.end(); ++it) {
1412
0
        auto& [k, bm] = *it;
1413
0
        if (k >= end) {
1414
0
            break;
1415
0
        }
1416
0
        count++;
1417
0
    }
1418
0
    return count;
1419
0
}
1420
1421
6
// Inserts `segment_delete_bitmap` under `bmk`; if the key already exists, ORs
// the given bitmap into the stored one. Runs under the exclusive lock.
void DeleteBitmap::merge(const BitmapKey& bmk, const roaring::Roaring& segment_delete_bitmap) {
    std::lock_guard write_guard(lock);
    auto emplaced = delete_bitmap.emplace(bmk, segment_delete_bitmap);
    if (emplaced.second) {
        // Fresh key: the copy made by emplace() is already the final value.
        return;
    }
    emplaced.first->second |= segment_delete_bitmap;
}
1428
1429
9
void DeleteBitmap::merge(const DeleteBitmap& other) {
1430
9
    std::lock_guard l(lock);
1431
29
    for (auto& i : other.delete_bitmap) {
1432
29
        auto [j, succ] = this->delete_bitmap.insert(i);
1433
29
        if (!succ) j->second |= i.second;
1434
29
    }
1435
9
}
1436
1437
63
uint64_t DeleteBitmap::get_delete_bitmap_count() {
1438
63
    std::shared_lock l(lock);
1439
63
    uint64_t count = 0;
1440
377
    for (auto it = delete_bitmap.begin(); it != delete_bitmap.end(); it++) {
1441
314
        if (std::get<1>(it->first) != DeleteBitmap::INVALID_SEGMENT_ID) {
1442
314
            count++;
1443
314
        }
1444
314
    }
1445
63
    return count;
1446
63
}
1447
1448
void DeleteBitmap::traverse_rowset_and_version(
1449
0
        const std::function<int(const RowsetId& rowsetId, int64_t version)>& func) const {
1450
0
    std::shared_lock l(lock);
1451
0
    auto it = delete_bitmap.cbegin();
1452
0
    while (it != delete_bitmap.cend()) {
1453
0
        RowsetId rowset_id = std::get<0>(it->first);
1454
0
        int64_t version = std::get<2>(it->first);
1455
0
        int result = func(rowset_id, version);
1456
0
        if (result == -2) {
1457
            // find next <rowset, version>
1458
0
            it++;
1459
0
        } else {
1460
            // find next <rowset>
1461
0
            it = delete_bitmap.upper_bound({rowset_id, std::numeric_limits<SegmentId>::max(),
1462
0
                                            std::numeric_limits<Version>::max()});
1463
0
        }
1464
0
    }
1465
0
}
1466
1467
0
bool DeleteBitmap::has_calculated_for_multi_segments(const RowsetId& rowset_id) const {
1468
0
    return contains({rowset_id, INVALID_SEGMENT_ID, TEMP_VERSION_COMMON}, ROWSET_SENTINEL_MARK);
1469
0
}
1470
1471
1
size_t DeleteBitmap::remove_rowset_cache_version(const RowsetId& rowset_id) {
1472
1
    std::lock_guard l(_rowset_cache_version_lock);
1473
1
    _rowset_cache_version.erase(rowset_id);
1474
1
    VLOG_DEBUG << "remove agg cache version for tablet=" << _tablet_id
1475
0
               << ", rowset=" << rowset_id.to_string();
1476
1
    return _rowset_cache_version.size();
1477
1
}
1478
1479
0
void DeleteBitmap::clear_rowset_cache_version() {
1480
0
    std::lock_guard l(_rowset_cache_version_lock);
1481
0
    _rowset_cache_version.clear();
1482
0
    VLOG_DEBUG << "clear agg cache version for tablet=" << _tablet_id;
1483
0
}
1484
1485
0
std::set<std::string> DeleteBitmap::get_rowset_cache_version() {
1486
0
    std::set<std::string> set;
1487
0
    std::shared_lock l(_rowset_cache_version_lock);
1488
0
    for (auto& [k, _] : _rowset_cache_version) {
1489
0
        set.insert(k.to_string());
1490
0
    }
1491
0
    return set;
1492
0
}
1493
1494
48
// Looks up the version recorded for the (rowset, segment) part of `bmk` in
// _rowset_cache_version. Returns 0 when either the rowset or the segment has
// no record.
DeleteBitmap::Version DeleteBitmap::_get_rowset_cache_version(const BitmapKey& bmk) const {
    std::shared_lock version_guard(_rowset_cache_version_lock);
    auto rowset_it = _rowset_cache_version.find(std::get<0>(bmk));
    if (rowset_it == _rowset_cache_version.end()) {
        return 0;
    }
    auto& per_segment = rowset_it->second;
    auto segment_it = per_segment.find(std::get<1>(bmk));
    if (segment_it == per_segment.end()) {
        return 0;
    }
    return segment_it->second;
}
1505
1506
// We cannot just copy the underlying memory to construct a string
1507
// due to equivalent objects may have different padding bytes.
1508
// Reading padding bytes is undefined behavior, neither copy nor
1509
// placement new will help simplify the code.
1510
// Refer to C11 standards §6.2.6.1/6 and §6.7.9/21 for more info.
1511
63
static std::string agg_cache_key(int64_t tablet_id, const DeleteBitmap::BitmapKey& bmk) {
1512
63
    std::string ret(sizeof(tablet_id) + sizeof(bmk), '\0');
1513
63
    *reinterpret_cast<int64_t*>(ret.data()) = tablet_id;
1514
63
    auto t = reinterpret_cast<DeleteBitmap::BitmapKey*>(ret.data() + sizeof(tablet_id));
1515
63
    std::get<RowsetId>(*t).version = std::get<RowsetId>(bmk).version;
1516
63
    std::get<RowsetId>(*t).hi = std::get<RowsetId>(bmk).hi;
1517
63
    std::get<RowsetId>(*t).mi = std::get<RowsetId>(bmk).mi;
1518
63
    std::get<RowsetId>(*t).lo = std::get<RowsetId>(bmk).lo;
1519
63
    std::get<1>(*t) = std::get<1>(bmk);
1520
63
    std::get<2>(*t) = std::get<2>(bmk);
1521
63
    return ret;
1522
63
}
1523
1524
54
std::shared_ptr<roaring::Roaring> DeleteBitmap::get_agg(const BitmapKey& bmk) const {
1525
54
    std::string key_str = agg_cache_key(_tablet_id, bmk); // Cache key container
1526
54
    CacheKey key(key_str);
1527
54
    Cache::Handle* handle = DeleteBitmapAggCache::instance()->lookup(key);
1528
1529
54
    DeleteBitmapAggCache::Value* val =
1530
54
            handle == nullptr ? nullptr
1531
54
                              : reinterpret_cast<DeleteBitmapAggCache::Value*>(
1532
6
                                        DeleteBitmapAggCache::instance()->value(handle));
1533
    // FIXME: do we need a mutex here to get rid of duplicated initializations
1534
    //        of cache entries in some cases?
1535
54
    if (val == nullptr) { // Renew if needed, put a new Value to cache
1536
48
        val = new DeleteBitmapAggCache::Value();
1537
48
        Version start_version =
1538
48
                config::enable_mow_get_agg_by_cache ? _get_rowset_cache_version(bmk) : 0;
1539
48
        if (start_version > 0) {
1540
9
            Cache::Handle* handle2 = DeleteBitmapAggCache::instance()->lookup(
1541
9
                    agg_cache_key(_tablet_id, {std::get<0>(bmk), std::get<1>(bmk), start_version}));
1542
1543
9
            DBUG_EXECUTE_IF("DeleteBitmap::get_agg.cache_miss", {
1544
9
                if (handle2 != nullptr) {
1545
9
                    auto p = dp->param("percent", 0.3);
1546
9
                    std::mt19937 gen {std::random_device {}()};
1547
9
                    std::bernoulli_distribution inject_fault {p};
1548
9
                    if (inject_fault(gen)) {
1549
9
                        LOG_INFO("injection DeleteBitmap::get_agg.cache_miss, tablet_id={}",
1550
9
                                 _tablet_id);
1551
9
                        handle2 = nullptr;
1552
9
                    }
1553
9
                }
1554
9
            });
1555
9
            if (handle2 == nullptr || start_version > std::get<2>(bmk)) {
1556
0
                start_version = 0;
1557
9
            } else {
1558
9
                val->bitmap |= reinterpret_cast<DeleteBitmapAggCache::Value*>(
1559
9
                                       DeleteBitmapAggCache::instance()->value(handle2))
1560
9
                                       ->bitmap;
1561
9
                VLOG_DEBUG << "get agg cache version=" << start_version
1562
0
                           << " for tablet=" << _tablet_id
1563
0
                           << ", rowset=" << std::get<0>(bmk).to_string()
1564
0
                           << ", segment=" << std::get<1>(bmk);
1565
9
                start_version += 1;
1566
9
            }
1567
9
            if (handle2 != nullptr) {
1568
9
                DeleteBitmapAggCache::instance()->release(handle2);
1569
9
            }
1570
9
        }
1571
48
        {
1572
48
            std::shared_lock l(lock);
1573
48
            DeleteBitmap::BitmapKey start {std::get<0>(bmk), std::get<1>(bmk), start_version};
1574
87
            for (auto it = delete_bitmap.lower_bound(start); it != delete_bitmap.end(); ++it) {
1575
84
                auto& [k, bm] = *it;
1576
84
                if (std::get<0>(k) != std::get<0>(bmk) || std::get<1>(k) != std::get<1>(bmk) ||
1577
84
                    std::get<2>(k) > std::get<2>(bmk)) {
1578
45
                    break;
1579
45
                }
1580
39
                val->bitmap |= bm;
1581
39
            }
1582
48
        }
1583
48
        size_t charge = val->bitmap.getSizeInBytes() + sizeof(DeleteBitmapAggCache::Value);
1584
48
        handle = DeleteBitmapAggCache::instance()->insert(key, val, charge, charge,
1585
48
                                                          CachePriority::NORMAL);
1586
48
        if (config::enable_mow_get_agg_by_cache && !val->bitmap.isEmpty()) {
1587
37
            std::lock_guard l(_rowset_cache_version_lock);
1588
            // this version is already agg
1589
37
            _rowset_cache_version[std::get<0>(bmk)][std::get<1>(bmk)] = std::get<2>(bmk);
1590
37
            VLOG_DEBUG << "set agg cache version=" << std::get<2>(bmk)
1591
0
                       << " for tablet=" << _tablet_id
1592
0
                       << ", rowset=" << std::get<0>(bmk).to_string()
1593
0
                       << ", segment=" << std::get<1>(bmk);
1594
37
        }
1595
48
        if (start_version > 0 && config::enable_mow_get_agg_correctness_check_core) {
1596
0
            std::shared_ptr<roaring::Roaring> bitmap = get_agg_without_cache(bmk);
1597
0
            if (val->bitmap != *bitmap) {
1598
0
                CHECK(false) << ". get agg correctness check failed for tablet=" << _tablet_id
1599
0
                             << ", rowset=" << std::get<0>(bmk).to_string()
1600
0
                             << ", segment=" << std::get<1>(bmk) << ", version=" << std::get<2>(bmk)
1601
0
                             << ". start_version from cache=" << start_version
1602
0
                             << ", delete_bitmap cardinality with cache="
1603
0
                             << val->bitmap.cardinality()
1604
0
                             << ", delete_bitmap cardinality without cache="
1605
0
                             << bitmap->cardinality();
1606
0
            }
1607
0
        }
1608
48
    }
1609
1610
    // It is natural for the cache to reclaim the underlying memory
1611
54
    return std::shared_ptr<roaring::Roaring>(
1612
54
            &val->bitmap, [handle](...) { DeleteBitmapAggCache::instance()->release(handle); });
1613
54
}
1614
1615
std::shared_ptr<roaring::Roaring> DeleteBitmap::get_agg_without_cache(
1616
4
        const BitmapKey& bmk, const int64_t start_version) const {
1617
4
    std::shared_ptr<roaring::Roaring> bitmap = std::make_shared<roaring::Roaring>();
1618
4
    std::shared_lock l(lock);
1619
4
    DeleteBitmap::BitmapKey start {std::get<0>(bmk), std::get<1>(bmk), start_version};
1620
24
    for (auto it = delete_bitmap.lower_bound(start); it != delete_bitmap.end(); ++it) {
1621
23
        auto& [k, bm] = *it;
1622
23
        if (std::get<0>(k) != std::get<0>(bmk) || std::get<1>(k) != std::get<1>(bmk) ||
1623
23
            std::get<2>(k) > std::get<2>(bmk)) {
1624
3
            break;
1625
3
        }
1626
20
        *bitmap |= bm;
1627
20
    }
1628
4
    return bitmap;
1629
4
}
1630
1631
0
// Returns the enumerator name of `state` as text. Values without a matching
// case are rendered as "TabletState(<numeric value>)".
std::string tablet_state_name(TabletState state) {
    const char* known_name = nullptr;
    switch (state) {
    case TABLET_NOTREADY:
        known_name = "TABLET_NOTREADY";
        break;
    case TABLET_RUNNING:
        known_name = "TABLET_RUNNING";
        break;
    case TABLET_TOMBSTONED:
        known_name = "TABLET_TOMBSTONED";
        break;
    case TABLET_STOPPED:
        known_name = "TABLET_STOPPED";
        break;
    case TABLET_SHUTDOWN:
        known_name = "TABLET_SHUTDOWN";
        break;
    default:
        break;
    }
    if (known_name != nullptr) {
        return known_name;
    }
    return "TabletState(" + std::to_string(state) + ")";
}
1652
1653
#include "common/compile_check_end.h"
1654
} // namespace doris