Coverage Report

Created: 2025-06-23 17:53

/root/doris/be/src/olap/tablet_meta.cpp
Line
Count
Source (jump to first uncovered line)
1
// Licensed to the Apache Software Foundation (ASF) under one
2
// or more contributor license agreements.  See the NOTICE file
3
// distributed with this work for additional information
4
// regarding copyright ownership.  The ASF licenses this file
5
// to you under the Apache License, Version 2.0 (the
6
// "License"); you may not use this file except in compliance
7
// with the License.  You may obtain a copy of the License at
8
//
9
//   http://www.apache.org/licenses/LICENSE-2.0
10
//
11
// Unless required by applicable law or agreed to in writing,
12
// software distributed under the License is distributed on an
13
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14
// KIND, either express or implied.  See the License for the
15
// specific language governing permissions and limitations
16
// under the License.
17
18
#include "olap/tablet_meta.h"
19
20
#include <gen_cpp/Descriptors_types.h>
21
#include <gen_cpp/Types_types.h>
22
#include <gen_cpp/olap_common.pb.h>
23
#include <gen_cpp/olap_file.pb.h>
24
#include <gen_cpp/segment_v2.pb.h>
25
#include <gen_cpp/types.pb.h>
26
#include <json2pb/pb_to_json.h>
27
#include <time.h>
28
29
#include <cstdint>
30
#include <limits>
31
#include <memory>
32
#include <set>
33
#include <utility>
34
35
#include "common/config.h"
36
#include "gutil/integral_types.h"
37
#include "io/fs/file_writer.h"
38
#include "olap/data_dir.h"
39
#include "olap/file_header.h"
40
#include "olap/olap_common.h"
41
#include "olap/olap_define.h"
42
#include "olap/rowset/rowset.h"
43
#include "olap/rowset/rowset_meta_manager.h"
44
#include "olap/tablet_fwd.h"
45
#include "olap/tablet_meta_manager.h"
46
#include "olap/tablet_schema_cache.h"
47
#include "olap/utils.h"
48
#include "util/debug_points.h"
49
#include "util/mem_info.h"
50
#include "util/parse_util.h"
51
#include "util/string_util.h"
52
#include "util/time.h"
53
#include "util/uid_util.h"
54
55
using std::string;
56
using std::unordered_map;
57
using std::vector;
58
59
namespace doris {
60
using namespace ErrorCode;
61
62
TabletMetaSharedPtr TabletMeta::create(
63
        const TCreateTabletReq& request, const TabletUid& tablet_uid, uint64_t shard_id,
64
        uint32_t next_unique_id,
65
295
        const unordered_map<uint32_t, uint32_t>& col_ordinal_to_unique_id) {
66
295
    std::optional<TBinlogConfig> binlog_config;
67
295
    if (request.__isset.binlog_config) {
68
0
        binlog_config = request.binlog_config;
69
0
    }
70
295
    return std::make_shared<TabletMeta>(
71
295
            request.table_id, request.partition_id, request.tablet_id, request.replica_id,
72
295
            request.tablet_schema.schema_hash, shard_id, request.tablet_schema, next_unique_id,
73
295
            col_ordinal_to_unique_id, tablet_uid,
74
295
            request.__isset.tablet_type ? request.tablet_type : TTabletType::TABLET_TYPE_DISK,
75
295
            request.compression_type, request.storage_policy_id,
76
295
            request.__isset.enable_unique_key_merge_on_write
77
295
                    ? request.enable_unique_key_merge_on_write
78
295
                    : false,
79
295
            std::move(binlog_config), request.compaction_policy,
80
295
            request.time_series_compaction_goal_size_mbytes,
81
295
            request.time_series_compaction_file_count_threshold,
82
295
            request.time_series_compaction_time_threshold_seconds,
83
295
            request.time_series_compaction_empty_rowsets_threshold,
84
295
            request.inverted_index_storage_format, request.time_series_compaction_level_threshold);
85
295
}
86
87
974
TabletMeta::~TabletMeta() {
88
974
    if (_handle) {
89
923
        TabletSchemaCache::instance()->release(_handle);
90
923
    }
91
974
}
92
93
TabletMeta::TabletMeta()
94
        : _tablet_uid(0, 0),
95
          _schema(new TabletSchema),
96
536
          _delete_bitmap(new DeleteBitmap(_tablet_id)) {}
97
98
TabletMeta::TabletMeta(int64_t table_id, int64_t partition_id, int64_t tablet_id,
99
                       int64_t replica_id, int32_t schema_hash, uint64_t shard_id,
100
                       const TTabletSchema& tablet_schema, uint32_t next_unique_id,
101
                       const std::unordered_map<uint32_t, uint32_t>& col_ordinal_to_unique_id,
102
                       TabletUid tablet_uid, TTabletType::type tabletType,
103
                       TCompressionType::type compression_type, int64_t storage_policy_id,
104
                       bool enable_unique_key_merge_on_write,
105
                       std::optional<TBinlogConfig> binlog_config, std::string compaction_policy,
106
                       int64_t time_series_compaction_goal_size_mbytes,
107
                       int64_t time_series_compaction_file_count_threshold,
108
                       int64_t time_series_compaction_time_threshold_seconds,
109
                       int64_t time_series_compaction_empty_rowsets_threshold,
110
                       TInvertedIndexStorageFormat::type inverted_index_storage_format,
111
                       int64_t time_series_compaction_level_threshold)
112
        : _tablet_uid(0, 0),
113
          _schema(new TabletSchema),
114
438
          _delete_bitmap(new DeleteBitmap(tablet_id)) {
115
438
    TabletMetaPB tablet_meta_pb;
116
438
    tablet_meta_pb.set_table_id(table_id);
117
438
    tablet_meta_pb.set_partition_id(partition_id);
118
438
    tablet_meta_pb.set_tablet_id(tablet_id);
119
438
    tablet_meta_pb.set_replica_id(replica_id);
120
438
    tablet_meta_pb.set_schema_hash(schema_hash);
121
438
    tablet_meta_pb.set_shard_id(shard_id);
122
    // Persist the creation time, but it is not used
123
438
    tablet_meta_pb.set_creation_time(time(nullptr));
124
438
    tablet_meta_pb.set_cumulative_layer_point(-1);
125
438
    tablet_meta_pb.set_tablet_state(PB_RUNNING);
126
438
    *(tablet_meta_pb.mutable_tablet_uid()) = tablet_uid.to_proto();
127
438
    tablet_meta_pb.set_tablet_type(tabletType == TTabletType::TABLET_TYPE_DISK
128
438
                                           ? TabletTypePB::TABLET_TYPE_DISK
129
438
                                           : TabletTypePB::TABLET_TYPE_MEMORY);
130
438
    tablet_meta_pb.set_enable_unique_key_merge_on_write(enable_unique_key_merge_on_write);
131
438
    tablet_meta_pb.set_storage_policy_id(storage_policy_id);
132
438
    tablet_meta_pb.set_compaction_policy(compaction_policy);
133
438
    tablet_meta_pb.set_time_series_compaction_goal_size_mbytes(
134
438
            time_series_compaction_goal_size_mbytes);
135
438
    tablet_meta_pb.set_time_series_compaction_file_count_threshold(
136
438
            time_series_compaction_file_count_threshold);
137
438
    tablet_meta_pb.set_time_series_compaction_time_threshold_seconds(
138
438
            time_series_compaction_time_threshold_seconds);
139
438
    tablet_meta_pb.set_time_series_compaction_empty_rowsets_threshold(
140
438
            time_series_compaction_empty_rowsets_threshold);
141
438
    tablet_meta_pb.set_time_series_compaction_level_threshold(
142
438
            time_series_compaction_level_threshold);
143
438
    TabletSchemaPB* schema = tablet_meta_pb.mutable_schema();
144
438
    schema->set_num_short_key_columns(tablet_schema.short_key_column_count);
145
438
    schema->set_num_rows_per_row_block(config::default_num_rows_per_column_file_block);
146
438
    schema->set_sequence_col_idx(tablet_schema.sequence_col_idx);
147
438
    switch (tablet_schema.keys_type) {
148
30
    case TKeysType::DUP_KEYS:
149
30
        schema->set_keys_type(KeysType::DUP_KEYS);
150
30
        break;
151
299
    case TKeysType::UNIQUE_KEYS:
152
299
        schema->set_keys_type(KeysType::UNIQUE_KEYS);
153
299
        break;
154
61
    case TKeysType::AGG_KEYS:
155
61
        schema->set_keys_type(KeysType::AGG_KEYS);
156
61
        break;
157
48
    default:
158
48
        LOG(WARNING) << "unknown tablet keys type";
159
48
        break;
160
438
    }
161
    // compress_kind used to compress segment files
162
438
    schema->set_compress_kind(COMPRESS_LZ4);
163
164
    // compression_type used to compress segment page
165
438
    switch (compression_type) {
166
0
    case TCompressionType::NO_COMPRESSION:
167
0
        schema->set_compression_type(segment_v2::NO_COMPRESSION);
168
0
        break;
169
0
    case TCompressionType::SNAPPY:
170
0
        schema->set_compression_type(segment_v2::SNAPPY);
171
0
        break;
172
0
    case TCompressionType::LZ4:
173
0
        schema->set_compression_type(segment_v2::LZ4);
174
0
        break;
175
438
    case TCompressionType::LZ4F:
176
438
        schema->set_compression_type(segment_v2::LZ4F);
177
438
        break;
178
0
    case TCompressionType::ZLIB:
179
0
        schema->set_compression_type(segment_v2::ZLIB);
180
0
        break;
181
0
    case TCompressionType::ZSTD:
182
0
        schema->set_compression_type(segment_v2::ZSTD);
183
0
        break;
184
0
    default:
185
0
        schema->set_compression_type(segment_v2::LZ4F);
186
0
        break;
187
438
    }
188
189
438
    switch (inverted_index_storage_format) {
190
438
    case TInvertedIndexStorageFormat::V1:
191
438
        schema->set_inverted_index_storage_format(InvertedIndexStorageFormatPB::V1);
192
438
        break;
193
0
    case TInvertedIndexStorageFormat::V2:
194
0
        schema->set_inverted_index_storage_format(InvertedIndexStorageFormatPB::V2);
195
0
        break;
196
0
    default:
197
0
        schema->set_inverted_index_storage_format(InvertedIndexStorageFormatPB::V1);
198
0
        break;
199
438
    }
200
201
438
    switch (tablet_schema.sort_type) {
202
0
    case TSortType::type::ZORDER:
203
0
        schema->set_sort_type(SortType::ZORDER);
204
0
        break;
205
438
    default:
206
438
        schema->set_sort_type(SortType::LEXICAL);
207
438
    }
208
438
    schema->set_sort_col_num(tablet_schema.sort_col_num);
209
438
    for (const auto& i : tablet_schema.cluster_key_idxes) {
210
0
        schema->add_cluster_key_idxes(i);
211
0
    }
212
438
    tablet_meta_pb.set_in_restore_mode(false);
213
214
    // set column information
215
438
    uint32_t col_ordinal = 0;
216
438
    bool has_bf_columns = false;
217
2.09k
    for (TColumn tcolumn : tablet_schema.columns) {
218
2.09k
        ColumnPB* column = schema->add_column();
219
2.09k
        uint32_t unique_id = -1;
220
2.09k
        if (tcolumn.col_unique_id >= 0) {
221
0
            unique_id = tcolumn.col_unique_id;
222
2.09k
        } else {
223
2.09k
            unique_id = col_ordinal_to_unique_id.at(col_ordinal);
224
2.09k
        }
225
2.09k
        col_ordinal++;
226
2.09k
        init_column_from_tcolumn(unique_id, tcolumn, column);
227
228
2.09k
        if (column->is_bf_column()) {
229
0
            has_bf_columns = true;
230
0
        }
231
232
2.09k
        if (tablet_schema.__isset.indexes) {
233
0
            for (auto& index : tablet_schema.indexes) {
234
0
                if (index.index_type == TIndexType::type::BITMAP) {
235
0
                    DCHECK_EQ(index.columns.size(), 1);
236
0
                    if (iequal(tcolumn.column_name, index.columns[0])) {
237
0
                        column->set_has_bitmap_index(true);
238
0
                        break;
239
0
                    }
240
0
                } else if (index.index_type == TIndexType::type::BLOOMFILTER ||
241
0
                           index.index_type == TIndexType::type::NGRAM_BF) {
242
0
                    DCHECK_EQ(index.columns.size(), 1);
243
0
                    if (iequal(tcolumn.column_name, index.columns[0])) {
244
0
                        column->set_is_bf_column(true);
245
0
                        break;
246
0
                    }
247
0
                }
248
0
            }
249
0
        }
250
2.09k
    }
251
252
    // copy index meta
253
438
    if (tablet_schema.__isset.indexes) {
254
0
        for (auto& index : tablet_schema.indexes) {
255
0
            TabletIndexPB* index_pb = schema->add_index();
256
0
            index_pb->set_index_id(index.index_id);
257
0
            index_pb->set_index_name(index.index_name);
258
            // init col_unique_id in index at be side, since col_unique_id may be -1 at fe side
259
            // get column unique id by name
260
0
            for (auto column_name : index.columns) {
261
0
                for (auto column : schema->column()) {
262
0
                    if (iequal(column.name(), column_name)) {
263
0
                        index_pb->add_col_unique_id(column.unique_id());
264
0
                    }
265
0
                }
266
0
            }
267
0
            switch (index.index_type) {
268
0
            case TIndexType::BITMAP:
269
0
                index_pb->set_index_type(IndexType::BITMAP);
270
0
                break;
271
0
            case TIndexType::INVERTED:
272
0
                index_pb->set_index_type(IndexType::INVERTED);
273
0
                break;
274
0
            case TIndexType::BLOOMFILTER:
275
0
                index_pb->set_index_type(IndexType::BLOOMFILTER);
276
0
                break;
277
0
            case TIndexType::NGRAM_BF:
278
0
                index_pb->set_index_type(IndexType::NGRAM_BF);
279
0
                break;
280
0
            }
281
282
0
            if (index.__isset.properties) {
283
0
                auto properties = index_pb->mutable_properties();
284
0
                for (auto kv : index.properties) {
285
0
                    (*properties)[kv.first] = kv.second;
286
0
                }
287
0
            }
288
0
        }
289
0
    }
290
291
438
    schema->set_next_column_unique_id(next_unique_id);
292
438
    if (has_bf_columns && tablet_schema.__isset.bloom_filter_fpp) {
293
0
        schema->set_bf_fpp(tablet_schema.bloom_filter_fpp);
294
0
    }
295
296
438
    if (tablet_schema.__isset.is_in_memory) {
297
0
        schema->set_is_in_memory(tablet_schema.is_in_memory);
298
0
    }
299
300
438
    if (tablet_schema.__isset.disable_auto_compaction) {
301
0
        schema->set_disable_auto_compaction(tablet_schema.disable_auto_compaction);
302
0
    }
303
304
438
    if (tablet_schema.__isset.enable_single_replica_compaction) {
305
438
        schema->set_enable_single_replica_compaction(
306
438
                tablet_schema.enable_single_replica_compaction);
307
438
    }
308
309
438
    if (tablet_schema.__isset.delete_sign_idx) {
310
438
        schema->set_delete_sign_idx(tablet_schema.delete_sign_idx);
311
438
    }
312
438
    if (tablet_schema.__isset.store_row_column) {
313
438
        schema->set_store_row_column(tablet_schema.store_row_column);
314
438
    }
315
438
    if (tablet_schema.__isset.row_store_page_size) {
316
438
        schema->set_row_store_page_size(tablet_schema.row_store_page_size);
317
438
    }
318
438
    if (tablet_schema.__isset.skip_write_index_on_load) {
319
438
        schema->set_skip_write_index_on_load(tablet_schema.skip_write_index_on_load);
320
438
    }
321
438
    if (binlog_config.has_value()) {
322
0
        BinlogConfig tmp_binlog_config;
323
0
        tmp_binlog_config = binlog_config.value();
324
0
        tmp_binlog_config.to_pb(tablet_meta_pb.mutable_binlog_config());
325
0
    }
326
327
438
    init_from_pb(tablet_meta_pb);
328
438
}
329
330
TabletMeta::TabletMeta(const TabletMeta& b)
331
        : MetadataAdder(b),
332
          _table_id(b._table_id),
333
          _partition_id(b._partition_id),
334
          _tablet_id(b._tablet_id),
335
          _replica_id(b._replica_id),
336
          _schema_hash(b._schema_hash),
337
          _shard_id(b._shard_id),
338
          _creation_time(b._creation_time),
339
          _cumulative_layer_point(b._cumulative_layer_point),
340
          _tablet_uid(b._tablet_uid),
341
          _tablet_type(b._tablet_type),
342
          _tablet_state(b._tablet_state),
343
          _schema(b._schema),
344
          _rs_metas(b._rs_metas),
345
          _stale_rs_metas(b._stale_rs_metas),
346
          _in_restore_mode(b._in_restore_mode),
347
          _preferred_rowset_type(b._preferred_rowset_type),
348
          _storage_policy_id(b._storage_policy_id),
349
          _cooldown_meta_id(b._cooldown_meta_id),
350
          _enable_unique_key_merge_on_write(b._enable_unique_key_merge_on_write),
351
          _delete_bitmap(b._delete_bitmap),
352
          _binlog_config(b._binlog_config),
353
          _compaction_policy(b._compaction_policy),
354
          _time_series_compaction_goal_size_mbytes(b._time_series_compaction_goal_size_mbytes),
355
          _time_series_compaction_file_count_threshold(
356
                  b._time_series_compaction_file_count_threshold),
357
          _time_series_compaction_time_threshold_seconds(
358
                  b._time_series_compaction_time_threshold_seconds),
359
          _time_series_compaction_empty_rowsets_threshold(
360
                  b._time_series_compaction_empty_rowsets_threshold),
361
0
          _time_series_compaction_level_threshold(b._time_series_compaction_level_threshold) {};
362
363
void TabletMeta::init_column_from_tcolumn(uint32_t unique_id, const TColumn& tcolumn,
364
2.09k
                                          ColumnPB* column) {
365
2.09k
    column->set_unique_id(unique_id);
366
2.09k
    column->set_name(tcolumn.column_name);
367
2.09k
    column->set_has_bitmap_index(tcolumn.has_bitmap_index);
368
2.09k
    column->set_is_auto_increment(tcolumn.is_auto_increment);
369
2.09k
    string data_type;
370
2.09k
    EnumToString(TPrimitiveType, tcolumn.column_type.type, data_type);
371
2.09k
    column->set_type(data_type);
372
373
2.09k
    uint32_t length = TabletColumn::get_field_length_by_type(tcolumn.column_type.type,
374
2.09k
                                                             tcolumn.column_type.len);
375
2.09k
    column->set_length(length);
376
2.09k
    column->set_index_length(length);
377
2.09k
    column->set_precision(tcolumn.column_type.precision);
378
2.09k
    column->set_frac(tcolumn.column_type.scale);
379
380
2.09k
    if (tcolumn.__isset.result_is_nullable) {
381
0
        column->set_result_is_nullable(tcolumn.result_is_nullable);
382
0
    }
383
384
2.09k
    if (tcolumn.column_type.type == TPrimitiveType::VARCHAR ||
385
2.09k
        tcolumn.column_type.type == TPrimitiveType::STRING) {
386
101
        if (!tcolumn.column_type.__isset.index_len) {
387
101
            column->set_index_length(10);
388
101
        } else {
389
0
            column->set_index_length(tcolumn.column_type.index_len);
390
0
        }
391
101
    }
392
2.09k
    if (!tcolumn.is_key) {
393
1.13k
        column->set_is_key(false);
394
1.13k
        if (tcolumn.__isset.aggregation) {
395
0
            column->set_aggregation(tcolumn.aggregation);
396
1.13k
        } else {
397
1.13k
            string aggregation_type;
398
1.13k
            EnumToString(TAggregationType, tcolumn.aggregation_type, aggregation_type);
399
1.13k
            column->set_aggregation(aggregation_type);
400
1.13k
        }
401
1.13k
    } else {
402
967
        column->set_is_key(true);
403
967
        column->set_aggregation("NONE");
404
967
    }
405
2.09k
    column->set_is_nullable(tcolumn.is_allow_null);
406
2.09k
    if (tcolumn.__isset.default_value) {
407
0
        column->set_default_value(tcolumn.default_value);
408
0
    }
409
2.09k
    if (tcolumn.__isset.is_bloom_filter_column) {
410
0
        column->set_is_bf_column(tcolumn.is_bloom_filter_column);
411
0
    }
412
2.09k
    for (size_t i = 0; i < tcolumn.children_column.size(); i++) {
413
0
        ColumnPB* children_column = column->add_children_columns();
414
0
        init_column_from_tcolumn(tcolumn.children_column[i].col_unique_id,
415
0
                                 tcolumn.children_column[i], children_column);
416
0
    }
417
2.09k
}
418
419
5
Status TabletMeta::create_from_file(const string& file_path) {
420
5
    FileHeader<TabletMetaPB> file_header(file_path);
421
    // In file_header.deserialize(), it validates file length, signature, checksum of protobuf.
422
5
    RETURN_IF_ERROR(file_header.deserialize());
423
5
    TabletMetaPB tablet_meta_pb;
424
5
    try {
425
5
        tablet_meta_pb.CopyFrom(file_header.message());
426
5
    } catch (...) {
427
0
        return Status::Error<PARSE_PROTOBUF_ERROR>("fail to copy protocol buffer object. file={}",
428
0
                                                   file_path);
429
0
    }
430
431
5
    init_from_pb(tablet_meta_pb);
432
5
    return Status::OK();
433
5
}
434
435
std::string TabletMeta::construct_header_file_path(const string& schema_hash_path,
436
2
                                                   int64_t tablet_id) {
437
2
    std::stringstream header_name_stream;
438
2
    header_name_stream << schema_hash_path << "/" << tablet_id << ".hdr";
439
2
    return header_name_stream.str();
440
2
}
441
442
0
Status TabletMeta::save_as_json(const string& file_path, DataDir* dir) {
443
0
    std::string json_meta;
444
0
    json2pb::Pb2JsonOptions json_options;
445
0
    json_options.pretty_json = true;
446
0
    json_options.bytes_to_base64 = true;
447
0
    to_json(&json_meta, json_options);
448
    // save to file
449
0
    io::FileWriterPtr file_writer;
450
0
    RETURN_IF_ERROR(dir->fs()->create_file(file_path, &file_writer));
451
0
    RETURN_IF_ERROR(file_writer->append(json_meta));
452
0
    RETURN_IF_ERROR(file_writer->close());
453
0
    return Status::OK();
454
0
}
455
456
227
Status TabletMeta::save(const string& file_path) {
457
227
    TabletMetaPB tablet_meta_pb;
458
227
    to_meta_pb(&tablet_meta_pb);
459
227
    return TabletMeta::save(file_path, tablet_meta_pb);
460
227
}
461
462
229
Status TabletMeta::save(const string& file_path, const TabletMetaPB& tablet_meta_pb) {
463
229
    DCHECK(!file_path.empty());
464
229
    FileHeader<TabletMetaPB> file_header(file_path);
465
229
    try {
466
229
        file_header.mutable_message()->CopyFrom(tablet_meta_pb);
467
229
    } catch (...) {
468
0
        LOG(WARNING) << "fail to copy protocol buffer object. file='" << file_path;
469
0
        return Status::Error<ErrorCode::INTERNAL_ERROR>(
470
0
                "fail to copy protocol buffer object. file={}", file_path);
471
0
    }
472
229
    RETURN_IF_ERROR(file_header.prepare());
473
229
    RETURN_IF_ERROR(file_header.serialize());
474
229
    return Status::OK();
475
229
}
476
477
554
Status TabletMeta::save_meta(DataDir* data_dir) {
478
554
    std::lock_guard<std::shared_mutex> wrlock(_meta_lock);
479
554
    return _save_meta(data_dir);
480
554
}
481
482
554
Status TabletMeta::_save_meta(DataDir* data_dir) {
483
    // check if tablet uid is valid
484
554
    if (_tablet_uid.hi == 0 && _tablet_uid.lo == 0) {
485
0
        LOG(FATAL) << "tablet_uid is invalid"
486
0
                   << " tablet=" << tablet_id() << " _tablet_uid=" << _tablet_uid.to_string();
487
0
    }
488
554
    string meta_binary;
489
490
554
    auto t1 = MonotonicMicros();
491
554
    serialize(&meta_binary);
492
554
    auto t2 = MonotonicMicros();
493
554
    Status status = TabletMetaManager::save(data_dir, tablet_id(), schema_hash(), meta_binary);
494
554
    if (!status.ok()) {
495
0
        LOG(FATAL) << "fail to save tablet_meta. status=" << status << ", tablet_id=" << tablet_id()
496
0
                   << ", schema_hash=" << schema_hash();
497
0
    }
498
554
    auto t3 = MonotonicMicros();
499
554
    auto cost = t3 - t1;
500
554
    if (cost > 1 * 1000 * 1000) {
501
0
        LOG(INFO) << "save tablet(" << tablet_id() << ") meta too slow. serialize cost " << t2 - t1
502
0
                  << "(us), serialized binary size: " << meta_binary.length()
503
0
                  << "(bytes), write rocksdb cost " << t3 - t2 << "(us)";
504
0
    }
505
554
    return status;
506
554
}
507
508
557
void TabletMeta::serialize(string* meta_binary) {
509
557
    TabletMetaPB tablet_meta_pb;
510
557
    to_meta_pb(&tablet_meta_pb);
511
557
    if (tablet_meta_pb.partition_id() <= 0) {
512
461
        LOG(WARNING) << "invalid partition id " << tablet_meta_pb.partition_id() << " tablet "
513
461
                     << tablet_meta_pb.tablet_id();
514
461
    }
515
557
    DBUG_EXECUTE_IF("TabletMeta::serialize::zero_partition_id", {
516
557
        long partition_id = tablet_meta_pb.partition_id();
517
557
        tablet_meta_pb.set_partition_id(0);
518
557
        LOG(WARNING) << "set debug point TabletMeta::serialize::zero_partition_id old="
519
557
                     << partition_id << " new=" << tablet_meta_pb.DebugString();
520
557
    });
521
557
    bool serialize_success = tablet_meta_pb.SerializeToString(meta_binary);
522
557
    if (!_rs_metas.empty() || !_stale_rs_metas.empty()) {
523
557
        _avg_rs_meta_serialize_size =
524
557
                meta_binary->length() / (_rs_metas.size() + _stale_rs_metas.size());
525
557
        if (meta_binary->length() > config::tablet_meta_serialize_size_limit ||
526
557
            !serialize_success) {
527
0
            int64_t origin_meta_size = meta_binary->length();
528
0
            int64_t stale_rowsets_num = tablet_meta_pb.stale_rs_metas().size();
529
0
            tablet_meta_pb.clear_stale_rs_metas();
530
0
            meta_binary->clear();
531
0
            serialize_success = tablet_meta_pb.SerializeToString(meta_binary);
532
0
            LOG(WARNING) << "tablet meta serialization size exceeds limit: "
533
0
                         << config::tablet_meta_serialize_size_limit
534
0
                         << " clean up stale rowsets, tablet id: " << tablet_id()
535
0
                         << " stale rowset num: " << stale_rowsets_num
536
0
                         << " serialization size before clean " << origin_meta_size
537
0
                         << " serialization size after clean " << meta_binary->length();
538
0
        }
539
557
    }
540
541
557
    if (!serialize_success) {
542
0
        LOG(FATAL) << "failed to serialize meta " << tablet_id();
543
0
    }
544
557
}
545
546
457
Status TabletMeta::deserialize(const string& meta_binary) {
547
457
    TabletMetaPB tablet_meta_pb;
548
457
    bool parsed = tablet_meta_pb.ParseFromString(meta_binary);
549
457
    if (!parsed) {
550
0
        return Status::Error<INIT_FAILED>("parse tablet meta failed");
551
0
    }
552
457
    init_from_pb(tablet_meta_pb);
553
457
    return Status::OK();
554
457
}
555
556
2
void TabletMeta::init_rs_metas_fs(const io::FileSystemSPtr& fs) {
557
4
    for (auto& rs_meta : _rs_metas) {
558
4
        if (rs_meta->is_local()) {
559
4
            rs_meta->set_fs(fs);
560
4
        }
561
4
    }
562
2
    for (auto& rs_meta : _stale_rs_metas) {
563
0
        if (rs_meta->is_local()) {
564
0
            rs_meta->set_fs(fs);
565
0
        }
566
0
    }
567
2
}
568
569
926
void TabletMeta::init_from_pb(const TabletMetaPB& tablet_meta_pb) {
570
926
    _table_id = tablet_meta_pb.table_id();
571
926
    _partition_id = tablet_meta_pb.partition_id();
572
926
    _tablet_id = tablet_meta_pb.tablet_id();
573
926
    _replica_id = tablet_meta_pb.replica_id();
574
926
    _schema_hash = tablet_meta_pb.schema_hash();
575
926
    _shard_id = tablet_meta_pb.shard_id();
576
926
    _creation_time = tablet_meta_pb.creation_time();
577
926
    _cumulative_layer_point = tablet_meta_pb.cumulative_layer_point();
578
926
    _tablet_uid = TabletUid(tablet_meta_pb.tablet_uid());
579
926
    if (tablet_meta_pb.has_tablet_type()) {
580
905
        _tablet_type = tablet_meta_pb.tablet_type();
581
905
    } else {
582
21
        _tablet_type = TabletTypePB::TABLET_TYPE_DISK;
583
21
    }
584
585
    // init _tablet_state
586
926
    switch (tablet_meta_pb.tablet_state()) {
587
27
    case PB_NOTREADY:
588
27
        _tablet_state = TabletState::TABLET_NOTREADY;
589
27
        break;
590
675
    case PB_RUNNING:
591
675
        _tablet_state = TabletState::TABLET_RUNNING;
592
675
        break;
593
0
    case PB_TOMBSTONED:
594
0
        _tablet_state = TabletState::TABLET_TOMBSTONED;
595
0
        break;
596
0
    case PB_STOPPED:
597
0
        _tablet_state = TabletState::TABLET_STOPPED;
598
0
        break;
599
224
    case PB_SHUTDOWN:
600
224
        _tablet_state = TabletState::TABLET_SHUTDOWN;
601
224
        break;
602
0
    default:
603
0
        LOG(WARNING) << "tablet has no state. tablet=" << tablet_id()
604
0
                     << ", schema_hash=" << schema_hash();
605
926
    }
606
607
    // init _schema
608
926
    TabletSchemaSPtr schema = std::make_shared<TabletSchema>();
609
926
    schema->init_from_pb(tablet_meta_pb.schema());
610
926
    if (_handle) {
611
3
        TabletSchemaCache::instance()->release(_handle);
612
3
    }
613
926
    auto pair = TabletSchemaCache::instance()->insert(schema->to_key());
614
926
    _handle = pair.first;
615
926
    _schema = pair.second;
616
617
926
    if (tablet_meta_pb.has_enable_unique_key_merge_on_write()) {
618
905
        _enable_unique_key_merge_on_write = tablet_meta_pb.enable_unique_key_merge_on_write();
619
905
    }
620
621
    // init _rs_metas
622
10.6k
    for (auto& it : tablet_meta_pb.rs_metas()) {
623
10.6k
        RowsetMetaSharedPtr rs_meta(new RowsetMeta());
624
10.6k
        rs_meta->init_from_pb(it);
625
10.6k
        _rs_metas.push_back(std::move(rs_meta));
626
10.6k
    }
627
628
    // For mow table, delete bitmap of stale rowsets has not been persisted.
629
    // When be restart, query should not read the stale rowset, otherwise duplicate keys
630
    // will be read out. Therefore, we don't add them to _stale_rs_meta for mow table.
631
926
    if (!config::skip_loading_stale_rowset_meta && !_enable_unique_key_merge_on_write) {
632
892
        for (auto& it : tablet_meta_pb.stale_rs_metas()) {
633
0
            RowsetMetaSharedPtr rs_meta(new RowsetMeta());
634
0
            rs_meta->init_from_pb(it);
635
0
            _stale_rs_metas.push_back(std::move(rs_meta));
636
0
        }
637
892
    }
638
639
926
    if (tablet_meta_pb.has_in_restore_mode()) {
640
905
        _in_restore_mode = tablet_meta_pb.in_restore_mode();
641
905
    }
642
643
926
    if (tablet_meta_pb.has_preferred_rowset_type()) {
644
467
        _preferred_rowset_type = tablet_meta_pb.preferred_rowset_type();
645
467
    }
646
647
926
    _storage_policy_id = tablet_meta_pb.storage_policy_id();
648
926
    if (tablet_meta_pb.has_cooldown_meta_id()) {
649
0
        _cooldown_meta_id = tablet_meta_pb.cooldown_meta_id();
650
0
    }
651
652
926
    if (tablet_meta_pb.has_delete_bitmap()) {
653
0
        int rst_ids_size = tablet_meta_pb.delete_bitmap().rowset_ids_size();
654
0
        int seg_ids_size = tablet_meta_pb.delete_bitmap().segment_ids_size();
655
0
        int versions_size = tablet_meta_pb.delete_bitmap().versions_size();
656
0
        int seg_maps_size = tablet_meta_pb.delete_bitmap().segment_delete_bitmaps_size();
657
0
        CHECK(rst_ids_size == seg_ids_size && seg_ids_size == seg_maps_size &&
658
0
              seg_maps_size == versions_size);
659
0
        for (size_t i = 0; i < rst_ids_size; ++i) {
660
0
            RowsetId rst_id;
661
0
            rst_id.init(tablet_meta_pb.delete_bitmap().rowset_ids(i));
662
0
            auto seg_id = tablet_meta_pb.delete_bitmap().segment_ids(i);
663
0
            uint32_t ver = tablet_meta_pb.delete_bitmap().versions(i);
664
0
            auto bitmap = tablet_meta_pb.delete_bitmap().segment_delete_bitmaps(i).data();
665
0
            delete_bitmap().delete_bitmap[{rst_id, seg_id, ver}] = roaring::Roaring::read(bitmap);
666
0
        }
667
0
    }
668
669
926
    if (tablet_meta_pb.has_binlog_config()) {
670
465
        _binlog_config = tablet_meta_pb.binlog_config();
671
465
    }
672
926
    _compaction_policy = tablet_meta_pb.compaction_policy();
673
926
    _time_series_compaction_goal_size_mbytes =
674
926
            tablet_meta_pb.time_series_compaction_goal_size_mbytes();
675
926
    _time_series_compaction_file_count_threshold =
676
926
            tablet_meta_pb.time_series_compaction_file_count_threshold();
677
926
    _time_series_compaction_time_threshold_seconds =
678
926
            tablet_meta_pb.time_series_compaction_time_threshold_seconds();
679
926
    _time_series_compaction_empty_rowsets_threshold =
680
926
            tablet_meta_pb.time_series_compaction_empty_rowsets_threshold();
681
926
    _time_series_compaction_level_threshold =
682
926
            tablet_meta_pb.time_series_compaction_level_threshold();
683
926
}
684
685
793
void TabletMeta::to_meta_pb(TabletMetaPB* tablet_meta_pb) {
686
793
    tablet_meta_pb->set_table_id(table_id());
687
793
    tablet_meta_pb->set_partition_id(partition_id());
688
793
    tablet_meta_pb->set_tablet_id(tablet_id());
689
793
    tablet_meta_pb->set_replica_id(replica_id());
690
793
    tablet_meta_pb->set_schema_hash(schema_hash());
691
793
    tablet_meta_pb->set_shard_id(shard_id());
692
793
    tablet_meta_pb->set_creation_time(creation_time());
693
793
    tablet_meta_pb->set_cumulative_layer_point(cumulative_layer_point());
694
793
    *(tablet_meta_pb->mutable_tablet_uid()) = tablet_uid().to_proto();
695
793
    tablet_meta_pb->set_tablet_type(_tablet_type);
696
793
    switch (tablet_state()) {
697
6
    case TABLET_NOTREADY:
698
6
        tablet_meta_pb->set_tablet_state(PB_NOTREADY);
699
6
        break;
700
312
    case TABLET_RUNNING:
701
312
        tablet_meta_pb->set_tablet_state(PB_RUNNING);
702
312
        break;
703
0
    case TABLET_TOMBSTONED:
704
0
        tablet_meta_pb->set_tablet_state(PB_TOMBSTONED);
705
0
        break;
706
0
    case TABLET_STOPPED:
707
0
        tablet_meta_pb->set_tablet_state(PB_STOPPED);
708
0
        break;
709
475
    case TABLET_SHUTDOWN:
710
475
        tablet_meta_pb->set_tablet_state(PB_SHUTDOWN);
711
475
        break;
712
793
    }
713
714
21.2k
    for (auto& rs : _rs_metas) {
715
21.2k
        rs->to_rowset_pb(tablet_meta_pb->add_rs_metas());
716
21.2k
    }
717
793
    for (auto rs : _stale_rs_metas) {
718
0
        rs->to_rowset_pb(tablet_meta_pb->add_stale_rs_metas());
719
0
    }
720
793
    _schema->to_schema_pb(tablet_meta_pb->mutable_schema());
721
722
793
    tablet_meta_pb->set_in_restore_mode(in_restore_mode());
723
724
    // to avoid modify tablet meta to the greatest extend
725
793
    if (_preferred_rowset_type == BETA_ROWSET) {
726
793
        tablet_meta_pb->set_preferred_rowset_type(_preferred_rowset_type);
727
793
    }
728
793
    if (_storage_policy_id > 0) {
729
5
        tablet_meta_pb->set_storage_policy_id(_storage_policy_id);
730
5
    }
731
793
    if (_cooldown_meta_id.initialized()) {
732
5
        tablet_meta_pb->mutable_cooldown_meta_id()->CopyFrom(_cooldown_meta_id.to_proto());
733
5
    }
734
735
793
    tablet_meta_pb->set_enable_unique_key_merge_on_write(_enable_unique_key_merge_on_write);
736
737
793
    if (_enable_unique_key_merge_on_write) {
738
2
        std::set<RowsetId> stale_rs_ids;
739
2
        for (const auto& rowset : _stale_rs_metas) {
740
0
            stale_rs_ids.insert(rowset->rowset_id());
741
0
        }
742
2
        DeleteBitmapPB* delete_bitmap_pb = tablet_meta_pb->mutable_delete_bitmap();
743
2
        for (auto& [id, bitmap] : delete_bitmap().snapshot().delete_bitmap) {
744
2
            auto& [rowset_id, segment_id, ver] = id;
745
2
            if (stale_rs_ids.count(rowset_id) != 0) {
746
0
                continue;
747
0
            }
748
2
            delete_bitmap_pb->add_rowset_ids(rowset_id.to_string());
749
2
            delete_bitmap_pb->add_segment_ids(segment_id);
750
2
            delete_bitmap_pb->add_versions(ver);
751
2
            std::string bitmap_data(bitmap.getSizeInBytes(), '\0');
752
2
            bitmap.write(bitmap_data.data());
753
2
            *(delete_bitmap_pb->add_segment_delete_bitmaps()) = std::move(bitmap_data);
754
2
        }
755
2
    }
756
793
    _binlog_config.to_pb(tablet_meta_pb->mutable_binlog_config());
757
793
    tablet_meta_pb->set_compaction_policy(compaction_policy());
758
793
    tablet_meta_pb->set_time_series_compaction_goal_size_mbytes(
759
793
            time_series_compaction_goal_size_mbytes());
760
793
    tablet_meta_pb->set_time_series_compaction_file_count_threshold(
761
793
            time_series_compaction_file_count_threshold());
762
793
    tablet_meta_pb->set_time_series_compaction_time_threshold_seconds(
763
793
            time_series_compaction_time_threshold_seconds());
764
793
    tablet_meta_pb->set_time_series_compaction_empty_rowsets_threshold(
765
793
            time_series_compaction_empty_rowsets_threshold());
766
793
    tablet_meta_pb->set_time_series_compaction_level_threshold(
767
793
            time_series_compaction_level_threshold());
768
793
}
769
770
548
int64_t TabletMeta::mem_size() const {
771
548
    auto size = sizeof(TabletMeta);
772
548
    size += _schema->mem_size();
773
548
    return size;
774
548
}
775
776
2
void TabletMeta::to_json(string* json_string, json2pb::Pb2JsonOptions& options) {
777
2
    TabletMetaPB tablet_meta_pb;
778
2
    to_meta_pb(&tablet_meta_pb);
779
2
    json2pb::ProtoMessageToJson(tablet_meta_pb, json_string, options);
780
2
}
781
782
91
Version TabletMeta::max_version() const {
783
91
    Version max_version = {-1, 0};
784
150
    for (auto& rs_meta : _rs_metas) {
785
150
        if (rs_meta->end_version() > max_version.second) {
786
148
            max_version = rs_meta->version();
787
148
        }
788
150
    }
789
91
    return max_version;
790
91
}
791
792
0
size_t TabletMeta::version_count_cross_with_range(const Version& range) const {
793
0
    size_t count = 0;
794
0
    for (const auto& rs_meta : _rs_metas) {
795
0
        if (!(range.first > rs_meta->version().second || range.second < rs_meta->version().first)) {
796
0
            count++;
797
0
        }
798
0
    }
799
0
    return count;
800
0
}
801
802
10.8k
Status TabletMeta::add_rs_meta(const RowsetMetaSharedPtr& rs_meta) {
803
    // check RowsetMeta is valid
804
337k
    for (auto& rs : _rs_metas) {
805
337k
        if (rs->version() == rs_meta->version()) {
806
0
            if (rs->rowset_id() != rs_meta->rowset_id()) {
807
0
                return Status::Error<PUSH_VERSION_ALREADY_EXIST>(
808
0
                        "version already exist. rowset_id={}, version={}, tablet={}",
809
0
                        rs->rowset_id().to_string(), rs->version().to_string(), tablet_id());
810
0
            } else {
811
                // rowsetid,version is equal, it is a duplicate req, skip it
812
0
                return Status::OK();
813
0
            }
814
0
        }
815
337k
    }
816
10.8k
    _rs_metas.push_back(rs_meta);
817
10.8k
    return Status::OK();
818
10.8k
}
819
820
void TabletMeta::delete_rs_meta_by_version(const Version& version,
821
0
                                           std::vector<RowsetMetaSharedPtr>* deleted_rs_metas) {
822
0
    auto it = _rs_metas.begin();
823
0
    while (it != _rs_metas.end()) {
824
0
        if ((*it)->version() == version) {
825
0
            if (deleted_rs_metas != nullptr) {
826
0
                deleted_rs_metas->push_back(*it);
827
0
            }
828
0
            _rs_metas.erase(it);
829
0
            return;
830
0
        } else {
831
0
            ++it;
832
0
        }
833
0
    }
834
0
}
835
836
void TabletMeta::modify_rs_metas(const std::vector<RowsetMetaSharedPtr>& to_add,
837
                                 const std::vector<RowsetMetaSharedPtr>& to_delete,
838
24
                                 bool same_version) {
839
    // Remove to_delete rowsets from _rs_metas
840
24
    for (auto rs_to_del : to_delete) {
841
18
        auto it = _rs_metas.begin();
842
20
        while (it != _rs_metas.end()) {
843
20
            if (rs_to_del->version() == (*it)->version()) {
844
18
                _rs_metas.erase(it);
845
                // there should be only one rowset match the version
846
18
                break;
847
18
            } else {
848
2
                ++it;
849
2
            }
850
20
        }
851
18
    }
852
24
    if (!same_version) {
853
        // put to_delete rowsets in _stale_rs_metas.
854
6
        _stale_rs_metas.insert(_stale_rs_metas.end(), to_delete.begin(), to_delete.end());
855
6
    }
856
    // put to_add rowsets in _rs_metas.
857
24
    _rs_metas.insert(_rs_metas.end(), to_add.begin(), to_add.end());
858
24
}
859
860
// Use the passing "rs_metas" to replace the rs meta in this tablet meta
861
// Also clear the _stale_rs_metas because this tablet meta maybe copyied from
862
// an existing tablet before. Add after revise, only the passing "rs_metas"
863
// is needed.
864
3
void TabletMeta::revise_rs_metas(std::vector<RowsetMetaSharedPtr>&& rs_metas) {
865
3
    std::lock_guard<std::shared_mutex> wrlock(_meta_lock);
866
3
    _rs_metas = std::move(rs_metas);
867
3
    _stale_rs_metas.clear();
868
3
}
869
870
// This method should call after revise_rs_metas, since new rs_metas might be a subset
871
// of original tablet, we should revise the delete_bitmap according to current rowset.
872
//
873
// Delete bitmap is protected by Tablet::_meta_lock, we don't need to acquire the
874
// TabletMeta's _meta_lock
875
1
void TabletMeta::revise_delete_bitmap_unlocked(const DeleteBitmap& delete_bitmap) {
876
1
    _delete_bitmap = std::make_unique<DeleteBitmap>(tablet_id());
877
2
    for (auto rs : _rs_metas) {
878
2
        DeleteBitmap rs_bm(tablet_id());
879
2
        delete_bitmap.subset({rs->rowset_id(), 0, 0}, {rs->rowset_id(), UINT32_MAX, INT64_MAX},
880
2
                             &rs_bm);
881
2
        _delete_bitmap->merge(rs_bm);
882
2
    }
883
1
    for (auto rs : _stale_rs_metas) {
884
0
        DeleteBitmap rs_bm(tablet_id());
885
0
        delete_bitmap.subset({rs->rowset_id(), 0, 0}, {rs->rowset_id(), UINT32_MAX, INT64_MAX},
886
0
                             &rs_bm);
887
0
        _delete_bitmap->merge(rs_bm);
888
0
    }
889
1
}
890
891
0
void TabletMeta::delete_stale_rs_meta_by_version(const Version& version) {
892
0
    auto it = _stale_rs_metas.begin();
893
0
    while (it != _stale_rs_metas.end()) {
894
0
        if ((*it)->version() == version) {
895
0
            if (_enable_unique_key_merge_on_write) {
896
                // remove rowset delete bitmap
897
0
                delete_bitmap().remove({(*it)->rowset_id(), 0, 0},
898
0
                                       {(*it)->rowset_id(), UINT32_MAX, 0});
899
0
            }
900
0
            it = _stale_rs_metas.erase(it);
901
0
        } else {
902
0
            it++;
903
0
        }
904
0
    }
905
0
}
906
907
0
RowsetMetaSharedPtr TabletMeta::acquire_rs_meta_by_version(const Version& version) const {
908
0
    for (auto it : _rs_metas) {
909
0
        if (it->version() == version) {
910
0
            return it;
911
0
        }
912
0
    }
913
0
    return nullptr;
914
0
}
915
916
8
RowsetMetaSharedPtr TabletMeta::acquire_stale_rs_meta_by_version(const Version& version) const {
917
8
    for (auto it : _stale_rs_metas) {
918
0
        if (it->version() == version) {
919
0
            return it;
920
0
        }
921
0
    }
922
8
    return nullptr;
923
8
}
924
925
22
Status TabletMeta::set_partition_id(int64_t partition_id) {
926
22
    if ((_partition_id > 0 && _partition_id != partition_id) || partition_id < 1) {
927
0
        LOG(WARNING) << "cur partition id=" << _partition_id << " new partition id=" << partition_id
928
0
                     << " not equal";
929
0
    }
930
22
    _partition_id = partition_id;
931
22
    return Status::OK();
932
22
}
933
934
1
bool operator==(const TabletMeta& a, const TabletMeta& b) {
935
1
    if (a._table_id != b._table_id) return false;
936
1
    if (a._partition_id != b._partition_id) return false;
937
1
    if (a._tablet_id != b._tablet_id) return false;
938
1
    if (a._replica_id != b._replica_id) return false;
939
1
    if (a._schema_hash != b._schema_hash) return false;
940
1
    if (a._shard_id != b._shard_id) return false;
941
1
    if (a._creation_time != b._creation_time) return false;
942
1
    if (a._cumulative_layer_point != b._cumulative_layer_point) return false;
943
1
    if (a._tablet_uid != b._tablet_uid) return false;
944
1
    if (a._tablet_type != b._tablet_type) return false;
945
1
    if (a._tablet_state != b._tablet_state) return false;
946
1
    if (*a._schema != *b._schema) return false;
947
1
    if (a._rs_metas.size() != b._rs_metas.size()) return false;
948
1
    for (int i = 0; i < a._rs_metas.size(); ++i) {
949
0
        if (a._rs_metas[i] != b._rs_metas[i]) return false;
950
0
    }
951
1
    if (a._in_restore_mode != b._in_restore_mode) return false;
952
1
    if (a._preferred_rowset_type != b._preferred_rowset_type) return false;
953
1
    if (a._storage_policy_id != b._storage_policy_id) return false;
954
1
    if (a._compaction_policy != b._compaction_policy) return false;
955
1
    if (a._time_series_compaction_goal_size_mbytes != b._time_series_compaction_goal_size_mbytes)
956
0
        return false;
957
1
    if (a._time_series_compaction_file_count_threshold !=
958
1
        b._time_series_compaction_file_count_threshold)
959
0
        return false;
960
1
    if (a._time_series_compaction_time_threshold_seconds !=
961
1
        b._time_series_compaction_time_threshold_seconds)
962
0
        return false;
963
1
    if (a._time_series_compaction_empty_rowsets_threshold !=
964
1
        b._time_series_compaction_empty_rowsets_threshold)
965
0
        return false;
966
1
    if (a._time_series_compaction_level_threshold != b._time_series_compaction_level_threshold)
967
0
        return false;
968
1
    return true;
969
1
}
970
971
0
bool operator!=(const TabletMeta& a, const TabletMeta& b) {
972
0
    return !(a == b);
973
0
}
974
975
1.00k
DeleteBitmap::DeleteBitmap(int64_t tablet_id) : _tablet_id(tablet_id) {
976
    // The default delete bitmap cache is set to 100MB,
977
    // which can be insufficient and cause performance issues when the amount of user data is large.
978
    // To mitigate the problem of an inadequate cache,
979
    // we will take the larger of 0.5% of the total memory and 100MB as the delete bitmap cache size.
980
1.00k
    bool is_percent = false;
981
1.00k
    int64_t delete_bitmap_agg_cache_cache_limit =
982
1.00k
            ParseUtil::parse_mem_spec(config::delete_bitmap_dynamic_agg_cache_limit,
983
1.00k
                                      MemInfo::mem_limit(), MemInfo::physical_mem(), &is_percent);
984
1.00k
    _agg_cache.reset(new AggCache(delete_bitmap_agg_cache_cache_limit >
985
1.00k
                                                  config::delete_bitmap_agg_cache_capacity
986
1.00k
                                          ? delete_bitmap_agg_cache_cache_limit
987
1.00k
                                          : config::delete_bitmap_agg_cache_capacity));
988
1.00k
}
989
990
5
DeleteBitmap::DeleteBitmap(const DeleteBitmap& o) {
991
5
    delete_bitmap = o.delete_bitmap; // just copy data
992
5
    _tablet_id = o._tablet_id;
993
5
}
994
995
0
DeleteBitmap& DeleteBitmap::operator=(const DeleteBitmap& o) {
996
0
    delete_bitmap = o.delete_bitmap; // just copy data
997
0
    _tablet_id = o._tablet_id;
998
0
    return *this;
999
0
}
1000
1001
0
DeleteBitmap::DeleteBitmap(DeleteBitmap&& o) {
1002
0
    delete_bitmap = std::move(o.delete_bitmap);
1003
0
    _tablet_id = o._tablet_id;
1004
0
}
1005
1006
0
DeleteBitmap& DeleteBitmap::operator=(DeleteBitmap&& o) {
1007
0
    delete_bitmap = std::move(o.delete_bitmap);
1008
0
    _tablet_id = o._tablet_id;
1009
0
    return *this;
1010
0
}
1011
1012
5
DeleteBitmap DeleteBitmap::snapshot() const {
1013
5
    std::shared_lock l(lock);
1014
5
    return DeleteBitmap(*this);
1015
5
}
1016
1017
3
DeleteBitmap DeleteBitmap::snapshot(Version version) const {
1018
    // Take snapshot first, then remove keys greater than given version.
1019
3
    DeleteBitmap snapshot = this->snapshot();
1020
3
    auto it = snapshot.delete_bitmap.begin();
1021
412
    while (it != snapshot.delete_bitmap.end()) {
1022
409
        if (std::get<2>(it->first) > version) {
1023
4
            it = snapshot.delete_bitmap.erase(it);
1024
405
        } else {
1025
405
            it++;
1026
405
        }
1027
409
    }
1028
3
    return snapshot;
1029
3
}
1030
1031
459k
void DeleteBitmap::add(const BitmapKey& bmk, uint32_t row_id) {
1032
459k
    std::lock_guard l(lock);
1033
459k
    delete_bitmap[bmk].add(row_id);
1034
459k
}
1035
1036
0
int DeleteBitmap::remove(const BitmapKey& bmk, uint32_t row_id) {
1037
0
    std::lock_guard l(lock);
1038
0
    auto it = delete_bitmap.find(bmk);
1039
0
    if (it == delete_bitmap.end()) return -1;
1040
0
    it->second.remove(row_id);
1041
0
    return 0;
1042
0
}
1043
1044
8
void DeleteBitmap::remove(const BitmapKey& start, const BitmapKey& end) {
1045
8
    std::lock_guard l(lock);
1046
107
    for (auto it = delete_bitmap.lower_bound(start); it != delete_bitmap.end();) {
1047
101
        auto& [k, _] = *it;
1048
101
        if (k >= end) {
1049
2
            break;
1050
2
        }
1051
99
        it = delete_bitmap.erase(it);
1052
99
    }
1053
8
}
1054
1055
6
bool DeleteBitmap::contains(const BitmapKey& bmk, uint32_t row_id) const {
1056
6
    std::shared_lock l(lock);
1057
6
    auto it = delete_bitmap.find(bmk);
1058
6
    return it != delete_bitmap.end() && it->second.contains(row_id);
1059
6
}
1060
1061
2
bool DeleteBitmap::contains_agg(const BitmapKey& bmk, uint32_t row_id) const {
1062
2
    return get_agg(bmk)->contains(row_id);
1063
2
}
1064
1065
0
bool DeleteBitmap::empty() const {
1066
0
    std::shared_lock l(lock);
1067
0
    return delete_bitmap.empty();
1068
0
}
1069
1070
62
uint64_t DeleteBitmap::cardinality() const {
1071
62
    std::shared_lock l {lock};
1072
62
    uint64_t res = 0;
1073
62
    for (auto entry : delete_bitmap) {
1074
0
        if (std::get<1>(entry.first) != DeleteBitmap::INVALID_SEGMENT_ID) {
1075
0
            res += entry.second.cardinality();
1076
0
        }
1077
0
    }
1078
62
    return res;
1079
62
}
1080
1081
62
uint64_t DeleteBitmap::get_delete_bitmap_count() const {
1082
62
    std::shared_lock l(lock);
1083
62
    uint64_t count = 0;
1084
62
    for (auto it = delete_bitmap.begin(); it != delete_bitmap.end(); it++) {
1085
0
        if (std::get<1>(it->first) != DeleteBitmap::INVALID_SEGMENT_ID) {
1086
0
            count++;
1087
0
        }
1088
0
    }
1089
62
    return count;
1090
62
}
1091
1092
1
bool DeleteBitmap::contains_agg_without_cache(const BitmapKey& bmk, uint32_t row_id) const {
1093
1
    std::shared_lock l(lock);
1094
1
    DeleteBitmap::BitmapKey start {std::get<0>(bmk), std::get<1>(bmk), 0};
1095
1
    for (auto it = delete_bitmap.lower_bound(start); it != delete_bitmap.end(); ++it) {
1096
0
        auto& [k, bm] = *it;
1097
0
        if (std::get<0>(k) != std::get<0>(bmk) || std::get<1>(k) != std::get<1>(bmk) ||
1098
0
            std::get<2>(k) > std::get<2>(bmk)) {
1099
0
            break;
1100
0
        }
1101
0
        if (bm.contains(row_id)) {
1102
0
            return true;
1103
0
        }
1104
0
    }
1105
1
    return false;
1106
1
}
1107
1108
38
int DeleteBitmap::set(const BitmapKey& bmk, const roaring::Roaring& segment_delete_bitmap) {
1109
38
    std::lock_guard l(lock);
1110
38
    auto [_, inserted] = delete_bitmap.insert_or_assign(bmk, segment_delete_bitmap);
1111
38
    return inserted;
1112
38
}
1113
1114
3
int DeleteBitmap::get(const BitmapKey& bmk, roaring::Roaring* segment_delete_bitmap) const {
1115
3
    std::shared_lock l(lock);
1116
3
    auto it = delete_bitmap.find(bmk);
1117
3
    if (it == delete_bitmap.end()) return -1;
1118
3
    *segment_delete_bitmap = it->second; // copy
1119
3
    return 0;
1120
3
}
1121
1122
54
const roaring::Roaring* DeleteBitmap::get(const BitmapKey& bmk) const {
1123
54
    std::shared_lock l(lock);
1124
54
    auto it = delete_bitmap.find(bmk);
1125
54
    if (it == delete_bitmap.end()) return nullptr;
1126
41
    return &(it->second); // get address
1127
54
}
1128
1129
void DeleteBitmap::subset(const BitmapKey& start, const BitmapKey& end,
1130
3
                          DeleteBitmap* subset_rowset_map) const {
1131
3
    roaring::Roaring roaring;
1132
3
    DCHECK(start < end);
1133
3
    std::shared_lock l(lock);
1134
26
    for (auto it = delete_bitmap.lower_bound(start); it != delete_bitmap.end(); ++it) {
1135
25
        auto& [k, bm] = *it;
1136
25
        if (k >= end) {
1137
2
            break;
1138
2
        }
1139
23
        subset_rowset_map->set(k, bm);
1140
23
    }
1141
3
}
1142
1143
2
void DeleteBitmap::merge(const BitmapKey& bmk, const roaring::Roaring& segment_delete_bitmap) {
1144
2
    std::lock_guard l(lock);
1145
2
    auto [iter, succ] = delete_bitmap.emplace(bmk, segment_delete_bitmap);
1146
2
    if (!succ) {
1147
0
        iter->second |= segment_delete_bitmap;
1148
0
    }
1149
2
}
1150
1151
9
void DeleteBitmap::merge(const DeleteBitmap& other) {
1152
9
    std::lock_guard l(lock);
1153
29
    for (auto& i : other.delete_bitmap) {
1154
29
        auto [j, succ] = this->delete_bitmap.insert(i);
1155
29
        if (!succ) j->second |= i.second;
1156
29
    }
1157
9
}
1158
1159
0
bool DeleteBitmap::has_calculated_for_multi_segments(const RowsetId& rowset_id) const {
1160
0
    return contains({rowset_id, INVALID_SEGMENT_ID, TEMP_VERSION_COMMON}, ROWSET_SENTINEL_MARK);
1161
0
}
1162
1163
void DeleteBitmap::traverse_rowset_id_prefix(
1164
0
        const std::function<void(const DeleteBitmap&, const RowsetId& rowsetId)>& func) const {
1165
0
    std::shared_lock rlock {lock};
1166
0
    auto it = delete_bitmap.cbegin();
1167
0
    while (it != delete_bitmap.cend()) {
1168
0
        RowsetId rowset_id = std::get<0>(it->first);
1169
0
        func(*this, rowset_id);
1170
        // find next rowset id
1171
0
        it = delete_bitmap.upper_bound({rowset_id, std::numeric_limits<SegmentId>::max(),
1172
0
                                        std::numeric_limits<Version>::max()});
1173
0
    }
1174
0
}
1175
1176
0
uint64_t DeleteBitmap::count_key_with_rowset_id_unlocked(const RowsetId& rowset_id) const {
1177
0
    auto lower_bound = delete_bitmap.lower_bound({rowset_id, 0, 0});
1178
0
    auto upper_bound = delete_bitmap.upper_bound({rowset_id, std::numeric_limits<SegmentId>::max(),
1179
0
                                                  std::numeric_limits<Version>::max()});
1180
0
    return std::distance(lower_bound, upper_bound);
1181
0
}
1182
1183
// We cannot just copy the underlying memory to construct a string
1184
// due to equivalent objects may have different padding bytes.
1185
// Reading padding bytes is undefined behavior, neither copy nor
1186
// placement new will help simplify the code.
1187
// Refer to C11 standards §6.2.6.1/6 and §6.7.9/21 for more info.
1188
44
static std::string agg_cache_key(int64_t tablet_id, const DeleteBitmap::BitmapKey& bmk) {
1189
44
    std::string ret(sizeof(tablet_id) + sizeof(bmk), '\0');
1190
44
    *reinterpret_cast<int64_t*>(ret.data()) = tablet_id;
1191
44
    auto t = reinterpret_cast<DeleteBitmap::BitmapKey*>(ret.data() + sizeof(tablet_id));
1192
44
    std::get<RowsetId>(*t).version = std::get<RowsetId>(bmk).version;
1193
44
    std::get<RowsetId>(*t).hi = std::get<RowsetId>(bmk).hi;
1194
44
    std::get<RowsetId>(*t).mi = std::get<RowsetId>(bmk).mi;
1195
44
    std::get<RowsetId>(*t).lo = std::get<RowsetId>(bmk).lo;
1196
44
    std::get<1>(*t) = std::get<1>(bmk);
1197
44
    std::get<2>(*t) = std::get<2>(bmk);
1198
44
    return ret;
1199
44
}
1200
1201
44
std::shared_ptr<roaring::Roaring> DeleteBitmap::get_agg(const BitmapKey& bmk) const {
1202
44
    std::string key_str = agg_cache_key(_tablet_id, bmk); // Cache key container
1203
44
    CacheKey key(key_str);
1204
44
    Cache::Handle* handle = _agg_cache->repr()->lookup(key);
1205
1206
44
    AggCache::Value* val =
1207
44
            handle == nullptr
1208
44
                    ? nullptr
1209
44
                    : reinterpret_cast<AggCache::Value*>(_agg_cache->repr()->value(handle));
1210
    // FIXME: do we need a mutex here to get rid of duplicated initializations
1211
    //        of cache entries in some cases?
1212
44
    if (val == nullptr) { // Renew if needed, put a new Value to cache
1213
38
        val = new AggCache::Value();
1214
38
        {
1215
38
            std::shared_lock l(lock);
1216
38
            DeleteBitmap::BitmapKey start {std::get<0>(bmk), std::get<1>(bmk), 0};
1217
69
            for (auto it = delete_bitmap.lower_bound(start); it != delete_bitmap.end(); ++it) {
1218
66
                auto& [k, bm] = *it;
1219
66
                if (std::get<0>(k) != std::get<0>(bmk) || std::get<1>(k) != std::get<1>(bmk) ||
1220
66
                    std::get<2>(k) > std::get<2>(bmk)) {
1221
35
                    break;
1222
35
                }
1223
31
                val->bitmap |= bm;
1224
31
            }
1225
38
        }
1226
38
        size_t charge = val->bitmap.getSizeInBytes() + sizeof(AggCache::Value);
1227
38
        handle = _agg_cache->repr()->insert(key, val, charge, charge, CachePriority::NORMAL);
1228
38
    }
1229
1230
    // It is natural for the cache to reclaim the underlying memory
1231
44
    return std::shared_ptr<roaring::Roaring>(
1232
44
            &val->bitmap, [this, handle](...) { _agg_cache->repr()->release(handle); });
1233
44
}
1234
1235
std::atomic<DeleteBitmap::AggCachePolicy*> DeleteBitmap::AggCache::s_repr {nullptr};
1236
1237
0
std::string tablet_state_name(TabletState state) {
1238
0
    switch (state) {
1239
0
    case TABLET_NOTREADY:
1240
0
        return "TABLET_NOTREADY";
1241
1242
0
    case TABLET_RUNNING:
1243
0
        return "TABLET_RUNNING";
1244
1245
0
    case TABLET_TOMBSTONED:
1246
0
        return "TABLET_TOMBSTONED";
1247
1248
0
    case TABLET_STOPPED:
1249
0
        return "TABLET_STOPPED";
1250
1251
0
    case TABLET_SHUTDOWN:
1252
0
        return "TABLET_SHUTDOWN";
1253
1254
0
    default:
1255
0
        return "TabletState(" + std::to_string(state) + ")";
1256
0
    }
1257
0
}
1258
1259
} // namespace doris