Coverage Report

Created: 2024-11-22 11:49

/root/doris/be/src/olap/tablet_meta.cpp
Line
Count
Source (jump to first uncovered line)
1
// Licensed to the Apache Software Foundation (ASF) under one
2
// or more contributor license agreements.  See the NOTICE file
3
// distributed with this work for additional information
4
// regarding copyright ownership.  The ASF licenses this file
5
// to you under the Apache License, Version 2.0 (the
6
// "License"); you may not use this file except in compliance
7
// with the License.  You may obtain a copy of the License at
8
//
9
//   http://www.apache.org/licenses/LICENSE-2.0
10
//
11
// Unless required by applicable law or agreed to in writing,
12
// software distributed under the License is distributed on an
13
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14
// KIND, either express or implied.  See the License for the
15
// specific language governing permissions and limitations
16
// under the License.
17
18
#include "olap/tablet_meta.h"
19
20
#include <gen_cpp/Descriptors_types.h>
21
#include <gen_cpp/Types_types.h>
22
#include <gen_cpp/olap_common.pb.h>
23
#include <gen_cpp/olap_file.pb.h>
24
#include <gen_cpp/segment_v2.pb.h>
25
#include <gen_cpp/types.pb.h>
26
#include <json2pb/pb_to_json.h>
27
#include <time.h>
28
29
#include <cstdint>
30
#include <set>
31
#include <utility>
32
33
#include "cloud/cloud_meta_mgr.h"
34
#include "cloud/cloud_storage_engine.h"
35
#include "cloud/config.h"
36
#include "common/config.h"
37
#include "gutil/integral_types.h"
38
#include "io/fs/file_writer.h"
39
#include "io/fs/local_file_system.h"
40
#include "olap/data_dir.h"
41
#include "olap/file_header.h"
42
#include "olap/olap_common.h"
43
#include "olap/olap_define.h"
44
#include "olap/rowset/rowset.h"
45
#include "olap/rowset/rowset_meta_manager.h"
46
#include "olap/tablet_meta_manager.h"
47
#include "olap/utils.h"
48
#include "util/debug_points.h"
49
#include "util/mem_info.h"
50
#include "util/parse_util.h"
51
#include "util/string_util.h"
52
#include "util/time.h"
53
#include "util/uid_util.h"
54
55
using std::string;
56
using std::unordered_map;
57
using std::vector;
58
59
namespace doris {
60
using namespace ErrorCode;
61
62
// Factory that builds a shared TabletMeta from a create-tablet thrift request.
//
// Optional request fields are resolved here before being handed to the constructor:
//   - binlog_config is forwarded only when it is set on the request;
//   - tablet_type falls back to TABLET_TYPE_DISK when unset;
//   - enable_unique_key_merge_on_write falls back to false when unset;
//   - the inverted index file storage format is taken from
//     `inverted_index_file_storage_format`, unless the legacy
//     `inverted_index_storage_format` field is set to V1 or V2, in which case the
//     legacy value wins.
TabletMetaSharedPtr TabletMeta::create(
        const TCreateTabletReq& request, const TabletUid& tablet_uid, uint64_t shard_id,
        uint32_t next_unique_id,
        const unordered_map<uint32_t, uint32_t>& col_ordinal_to_unique_id) {
    std::optional<TBinlogConfig> maybe_binlog_config;
    if (request.__isset.binlog_config) {
        maybe_binlog_config = request.binlog_config;
    }

    TInvertedIndexFileStorageFormat::type index_file_format =
            request.inverted_index_file_storage_format;

    // We will discard this format. Don't make any further changes here.
    if (request.__isset.inverted_index_storage_format) {
        const auto legacy_format = request.inverted_index_storage_format;
        if (legacy_format == TInvertedIndexStorageFormat::V1) {
            index_file_format = TInvertedIndexFileStorageFormat::V1;
        } else if (legacy_format == TInvertedIndexStorageFormat::V2) {
            index_file_format = TInvertedIndexFileStorageFormat::V2;
        }
        // Any other legacy value keeps the file storage format chosen above.
    }

    const TTabletType::type resolved_tablet_type =
            request.__isset.tablet_type ? request.tablet_type : TTabletType::TABLET_TYPE_DISK;
    const bool merge_on_write = request.__isset.enable_unique_key_merge_on_write
                                        ? request.enable_unique_key_merge_on_write
                                        : false;

    return std::make_shared<TabletMeta>(
            request.table_id, request.partition_id, request.tablet_id, request.replica_id,
            request.tablet_schema.schema_hash, shard_id, request.tablet_schema, next_unique_id,
            col_ordinal_to_unique_id, tablet_uid, resolved_tablet_type, request.compression_type,
            request.storage_policy_id, merge_on_write, std::move(maybe_binlog_config),
            request.compaction_policy, request.time_series_compaction_goal_size_mbytes,
            request.time_series_compaction_file_count_threshold,
            request.time_series_compaction_time_threshold_seconds,
            request.time_series_compaction_empty_rowsets_threshold,
            request.time_series_compaction_level_threshold, index_file_format);
}
102
103
// Default constructor: starts with a zero (0, 0) tablet uid, a freshly allocated empty
// TabletSchema and a DeleteBitmap constructed from _tablet_id.
// NOTE(review): _tablet_id is read while the initializer list runs, so this relies on
// _tablet_id being declared (and default-initialized) before _delete_bitmap in the
// header — confirm.
TabletMeta::TabletMeta()
104
        : _tablet_uid(0, 0),
105
          _schema(new TabletSchema),
106
108
          _delete_bitmap(new DeleteBitmap(_tablet_id)) {}
107
108
// Builds a TabletMeta from the individual fields of a create-tablet request.
//
// The constructor fills a temporary TabletMetaPB (tablet identity, compaction settings,
// schema, columns, index metadata and the optional binlog config) and then loads this
// object from it through init_from_pb().
//
// Parameters worth calling out:
//   tabletType               - anything other than TABLET_TYPE_DISK is stored as
//                              TABLET_TYPE_MEMORY.
//   compression_type         - page compression; unknown values fall back to LZ4F.
//   col_ordinal_to_unique_id - unique id for every column whose thrift `col_unique_id`
//                              is negative, keyed by column ordinal. A missing ordinal
//                              makes unordered_map::at() throw std::out_of_range.
//   binlog_config            - written to the protobuf only when it holds a value.
//   inverted_index_file_storage_format - V1 is kept; every other value is stored as V2.
TabletMeta::TabletMeta(int64_t table_id, int64_t partition_id, int64_t tablet_id,
                       int64_t replica_id, int32_t schema_hash, uint64_t shard_id,
                       const TTabletSchema& tablet_schema, uint32_t next_unique_id,
                       const std::unordered_map<uint32_t, uint32_t>& col_ordinal_to_unique_id,
                       TabletUid tablet_uid, TTabletType::type tabletType,
                       TCompressionType::type compression_type, int64_t storage_policy_id,
                       bool enable_unique_key_merge_on_write,
                       std::optional<TBinlogConfig> binlog_config, std::string compaction_policy,
                       int64_t time_series_compaction_goal_size_mbytes,
                       int64_t time_series_compaction_file_count_threshold,
                       int64_t time_series_compaction_time_threshold_seconds,
                       int64_t time_series_compaction_empty_rowsets_threshold,
                       int64_t time_series_compaction_level_threshold,
                       TInvertedIndexFileStorageFormat::type inverted_index_file_storage_format)
        : _tablet_uid(0, 0),
          _schema(new TabletSchema),
          _delete_bitmap(new DeleteBitmap(tablet_id)) {
    TabletMetaPB tablet_meta_pb;
    tablet_meta_pb.set_table_id(table_id);
    tablet_meta_pb.set_partition_id(partition_id);
    tablet_meta_pb.set_tablet_id(tablet_id);
    tablet_meta_pb.set_replica_id(replica_id);
    tablet_meta_pb.set_schema_hash(schema_hash);
    tablet_meta_pb.set_shard_id(shard_id);
    // Persist the creation time, but it is not used
    tablet_meta_pb.set_creation_time(time(nullptr));
    tablet_meta_pb.set_cumulative_layer_point(-1);
    tablet_meta_pb.set_tablet_state(PB_RUNNING);
    *(tablet_meta_pb.mutable_tablet_uid()) = tablet_uid.to_proto();
    tablet_meta_pb.set_tablet_type(tabletType == TTabletType::TABLET_TYPE_DISK
                                           ? TabletTypePB::TABLET_TYPE_DISK
                                           : TabletTypePB::TABLET_TYPE_MEMORY);
    tablet_meta_pb.set_enable_unique_key_merge_on_write(enable_unique_key_merge_on_write);
    tablet_meta_pb.set_storage_policy_id(storage_policy_id);
    tablet_meta_pb.set_compaction_policy(compaction_policy);
    tablet_meta_pb.set_time_series_compaction_goal_size_mbytes(
            time_series_compaction_goal_size_mbytes);
    tablet_meta_pb.set_time_series_compaction_file_count_threshold(
            time_series_compaction_file_count_threshold);
    tablet_meta_pb.set_time_series_compaction_time_threshold_seconds(
            time_series_compaction_time_threshold_seconds);
    tablet_meta_pb.set_time_series_compaction_empty_rowsets_threshold(
            time_series_compaction_empty_rowsets_threshold);
    tablet_meta_pb.set_time_series_compaction_level_threshold(
            time_series_compaction_level_threshold);

    TabletSchemaPB* schema = tablet_meta_pb.mutable_schema();
    schema->set_num_short_key_columns(tablet_schema.short_key_column_count);
    schema->set_num_rows_per_row_block(config::default_num_rows_per_column_file_block);
    schema->set_sequence_col_idx(tablet_schema.sequence_col_idx);
    switch (tablet_schema.keys_type) {
    case TKeysType::DUP_KEYS:
        schema->set_keys_type(KeysType::DUP_KEYS);
        break;
    case TKeysType::UNIQUE_KEYS:
        schema->set_keys_type(KeysType::UNIQUE_KEYS);
        break;
    case TKeysType::AGG_KEYS:
        schema->set_keys_type(KeysType::AGG_KEYS);
        break;
    default:
        // The keys type is left unset in the protobuf in this case.
        LOG(WARNING) << "unknown tablet keys type";
        break;
    }
    // compress_kind used to compress segment files
    schema->set_compress_kind(COMPRESS_LZ4);

    // compression_type used to compress segment page
    switch (compression_type) {
    case TCompressionType::NO_COMPRESSION:
        schema->set_compression_type(segment_v2::NO_COMPRESSION);
        break;
    case TCompressionType::SNAPPY:
        schema->set_compression_type(segment_v2::SNAPPY);
        break;
    case TCompressionType::LZ4:
        schema->set_compression_type(segment_v2::LZ4);
        break;
    case TCompressionType::LZ4F:
        schema->set_compression_type(segment_v2::LZ4F);
        break;
    case TCompressionType::ZLIB:
        schema->set_compression_type(segment_v2::ZLIB);
        break;
    case TCompressionType::ZSTD:
        schema->set_compression_type(segment_v2::ZSTD);
        break;
    default:
        schema->set_compression_type(segment_v2::LZ4F);
        break;
    }

    switch (inverted_index_file_storage_format) {
    case TInvertedIndexFileStorageFormat::V1:
        schema->set_inverted_index_storage_format(InvertedIndexStorageFormatPB::V1);
        break;
    case TInvertedIndexFileStorageFormat::V2:
        schema->set_inverted_index_storage_format(InvertedIndexStorageFormatPB::V2);
        break;
    default:
        schema->set_inverted_index_storage_format(InvertedIndexStorageFormatPB::V2);
        break;
    }

    switch (tablet_schema.sort_type) {
    case TSortType::type::ZORDER:
        schema->set_sort_type(SortType::ZORDER);
        break;
    default:
        schema->set_sort_type(SortType::LEXICAL);
    }
    schema->set_sort_col_num(tablet_schema.sort_col_num);
    for (const auto& i : tablet_schema.cluster_key_idxes) {
        schema->add_cluster_key_idxes(i);
    }
    tablet_meta_pb.set_in_restore_mode(false);

    // set column information
    uint32_t col_ordinal = 0;
    bool has_bf_columns = false;
    // Bind by const reference: a by-value loop variable copies the whole thrift column
    // (name, type descriptor, children, default value, ...) on every iteration.
    for (const TColumn& tcolumn : tablet_schema.columns) {
        ColumnPB* column = schema->add_column();
        // Prefer the unique id carried by the thrift column; otherwise look it up by ordinal.
        const uint32_t unique_id = tcolumn.col_unique_id >= 0
                                           ? static_cast<uint32_t>(tcolumn.col_unique_id)
                                           : col_ordinal_to_unique_id.at(col_ordinal);
        col_ordinal++;
        init_column_from_tcolumn(unique_id, tcolumn, column);

        // NOTE(review): this is sampled before the index scan below, so a column that
        // only becomes a bloom-filter column through `indexes` does not enable bf_fpp.
        // Behavior kept as-is — confirm it is intended.
        if (column->is_bf_column()) {
            has_bf_columns = true;
        }

        if (tablet_schema.__isset.indexes) {
            // The first BITMAP / BLOOMFILTER / NGRAM_BF index that names this column
            // sets the matching flag and ends the scan.
            for (const auto& index : tablet_schema.indexes) {
                if (index.index_type == TIndexType::type::BITMAP) {
                    DCHECK_EQ(index.columns.size(), 1);
                    if (iequal(tcolumn.column_name, index.columns[0])) {
                        column->set_has_bitmap_index(true);
                        break;
                    }
                } else if (index.index_type == TIndexType::type::BLOOMFILTER ||
                           index.index_type == TIndexType::type::NGRAM_BF) {
                    DCHECK_EQ(index.columns.size(), 1);
                    if (iequal(tcolumn.column_name, index.columns[0])) {
                        column->set_is_bf_column(true);
                        break;
                    }
                }
            }
        }
    }

    // copy index meta
    if (tablet_schema.__isset.indexes) {
        for (const auto& index : tablet_schema.indexes) {
            TabletIndexPB* index_pb = schema->add_index();
            index_pb->set_index_id(index.index_id);
            index_pb->set_index_name(index.index_name);
            // init col_unique_id in index at be side, since col_unique_id may be -1 at fe side
            // get column unique id by name
            // Both loops iterate by const reference so that neither the column-name
            // strings nor the ColumnPB messages are copied per comparison.
            for (const auto& column_name : index.columns) {
                for (const auto& column : schema->column()) {
                    if (iequal(column.name(), column_name)) {
                        index_pb->add_col_unique_id(column.unique_id());
                    }
                }
            }
            switch (index.index_type) {
            case TIndexType::BITMAP:
                index_pb->set_index_type(IndexType::BITMAP);
                break;
            case TIndexType::INVERTED:
                index_pb->set_index_type(IndexType::INVERTED);
                break;
            case TIndexType::BLOOMFILTER:
                index_pb->set_index_type(IndexType::BLOOMFILTER);
                break;
            case TIndexType::NGRAM_BF:
                index_pb->set_index_type(IndexType::NGRAM_BF);
                break;
            }

            if (index.__isset.properties) {
                auto properties = index_pb->mutable_properties();
                for (const auto& [key, value] : index.properties) {
                    (*properties)[key] = value;
                }
            }
        }
    }

    schema->set_next_column_unique_id(next_unique_id);
    if (has_bf_columns && tablet_schema.__isset.bloom_filter_fpp) {
        schema->set_bf_fpp(tablet_schema.bloom_filter_fpp);
    }

    // The remaining schema attributes are optional in the thrift schema and are only
    // written when the sender set them.
    if (tablet_schema.__isset.is_in_memory) {
        schema->set_is_in_memory(tablet_schema.is_in_memory);
    }

    if (tablet_schema.__isset.disable_auto_compaction) {
        schema->set_disable_auto_compaction(tablet_schema.disable_auto_compaction);
    }

    if (tablet_schema.__isset.variant_enable_flatten_nested) {
        schema->set_variant_enable_flatten_nested(tablet_schema.variant_enable_flatten_nested);
    }

    if (tablet_schema.__isset.enable_single_replica_compaction) {
        schema->set_enable_single_replica_compaction(
                tablet_schema.enable_single_replica_compaction);
    }

    if (tablet_schema.__isset.delete_sign_idx) {
        schema->set_delete_sign_idx(tablet_schema.delete_sign_idx);
    }
    if (tablet_schema.__isset.store_row_column) {
        schema->set_store_row_column(tablet_schema.store_row_column);
    }
    if (tablet_schema.__isset.row_store_page_size) {
        schema->set_row_store_page_size(tablet_schema.row_store_page_size);
    }
    if (tablet_schema.__isset.storage_page_size) {
        schema->set_storage_page_size(tablet_schema.storage_page_size);
    }
    if (tablet_schema.__isset.skip_write_index_on_load) {
        schema->set_skip_write_index_on_load(tablet_schema.skip_write_index_on_load);
    }
    if (tablet_schema.__isset.row_store_col_cids) {
        schema->mutable_row_store_column_unique_ids()->Add(tablet_schema.row_store_col_cids.begin(),
                                                           tablet_schema.row_store_col_cids.end());
    }
    if (binlog_config.has_value()) {
        BinlogConfig tmp_binlog_config;
        tmp_binlog_config = binlog_config.value();
        tmp_binlog_config.to_pb(tablet_meta_pb.mutable_binlog_config());
    }

    init_from_pb(tablet_meta_pb);
}
350
351
// Copy constructor: member-wise copy of the identity, state, schema, rowset meta lists,
// delete bitmap, binlog config and compaction settings of `b`.
// NOTE(review): _schema, _rs_metas, _stale_rs_metas and _delete_bitmap are copied with
// their own copy constructors; if they are shared pointers (or containers of them) the
// copy shares the pointees with `b` — confirm against the header.
TabletMeta::TabletMeta(const TabletMeta& b)
        : MetadataAdder(b),
          _table_id(b._table_id),
          _index_id(b._index_id),
          _partition_id(b._partition_id),
          _tablet_id(b._tablet_id),
          _replica_id(b._replica_id),
          _schema_hash(b._schema_hash),
          _shard_id(b._shard_id),
          _creation_time(b._creation_time),
          _cumulative_layer_point(b._cumulative_layer_point),
          _tablet_uid(b._tablet_uid),
          _tablet_type(b._tablet_type),
          _tablet_state(b._tablet_state),
          _schema(b._schema),
          _rs_metas(b._rs_metas),
          _stale_rs_metas(b._stale_rs_metas),
          _in_restore_mode(b._in_restore_mode),
          _preferred_rowset_type(b._preferred_rowset_type),
          _storage_policy_id(b._storage_policy_id),
          _cooldown_meta_id(b._cooldown_meta_id),
          _enable_unique_key_merge_on_write(b._enable_unique_key_merge_on_write),
          _delete_bitmap(b._delete_bitmap),
          _binlog_config(b._binlog_config),
          _compaction_policy(b._compaction_policy),
          _time_series_compaction_goal_size_mbytes(b._time_series_compaction_goal_size_mbytes),
          _time_series_compaction_file_count_threshold(
                  b._time_series_compaction_file_count_threshold),
          _time_series_compaction_time_threshold_seconds(
                  b._time_series_compaction_time_threshold_seconds),
          _time_series_compaction_empty_rowsets_threshold(
                  b._time_series_compaction_empty_rowsets_threshold),
          _time_series_compaction_level_threshold(b._time_series_compaction_level_threshold) {
    // _ttl_seconds is loaded by init_from_pb() and written back by to_meta_pb(), so a
    // copy has to carry it too; otherwise the copy would serialize a different TTL than
    // its source. It is assigned in the body rather than the initializer list because
    // the member's declaration position is not visible from this file.
    _ttl_seconds = b._ttl_seconds;
}
384
385
void TabletMeta::init_column_from_tcolumn(uint32_t unique_id, const TColumn& tcolumn,
386
1.48k
                                          ColumnPB* column) {
387
1.48k
    column->set_unique_id(unique_id);
388
1.48k
    column->set_name(tcolumn.column_name);
389
1.48k
    column->set_has_bitmap_index(tcolumn.has_bitmap_index);
390
1.48k
    column->set_is_auto_increment(tcolumn.is_auto_increment);
391
1.48k
    string data_type;
392
1.48k
    EnumToString(TPrimitiveType, tcolumn.column_type.type, data_type);
393
1.48k
    column->set_type(data_type);
394
395
1.48k
    uint32_t length = TabletColumn::get_field_length_by_type(tcolumn.column_type.type,
396
1.48k
                                                             tcolumn.column_type.len);
397
1.48k
    column->set_length(length);
398
1.48k
    column->set_index_length(length);
399
1.48k
    column->set_precision(tcolumn.column_type.precision);
400
1.48k
    column->set_frac(tcolumn.column_type.scale);
401
402
1.48k
    if (tcolumn.__isset.result_is_nullable) {
403
0
        column->set_result_is_nullable(tcolumn.result_is_nullable);
404
0
    }
405
406
1.48k
    if (tcolumn.__isset.be_exec_version) {
407
1.48k
        column->set_be_exec_version(tcolumn.be_exec_version);
408
1.48k
    }
409
410
1.48k
    if (tcolumn.column_type.type == TPrimitiveType::VARCHAR ||
411
1.48k
        tcolumn.column_type.type == TPrimitiveType::STRING) {
412
101
        if (!tcolumn.column_type.__isset.index_len) {
413
101
            column->set_index_length(10);
414
101
        } else {
415
0
            column->set_index_length(tcolumn.column_type.index_len);
416
0
        }
417
101
    }
418
1.48k
    if (!tcolumn.is_key) {
419
723
        column->set_is_key(false);
420
723
        if (tcolumn.__isset.aggregation) {
421
0
            column->set_aggregation(tcolumn.aggregation);
422
723
        } else {
423
723
            string aggregation_type;
424
723
            EnumToString(TAggregationType, tcolumn.aggregation_type, aggregation_type);
425
723
            column->set_aggregation(aggregation_type);
426
723
        }
427
764
    } else {
428
764
        column->set_is_key(true);
429
764
        column->set_aggregation("NONE");
430
764
    }
431
1.48k
    column->set_is_nullable(tcolumn.is_allow_null);
432
1.48k
    if (tcolumn.__isset.default_value) {
433
0
        column->set_default_value(tcolumn.default_value);
434
0
    }
435
1.48k
    if (tcolumn.__isset.is_bloom_filter_column) {
436
0
        column->set_is_bf_column(tcolumn.is_bloom_filter_column);
437
0
    }
438
1.48k
    for (size_t i = 0; i < tcolumn.children_column.size(); i++) {
439
0
        ColumnPB* children_column = column->add_children_columns();
440
0
        init_column_from_tcolumn(tcolumn.children_column[i].col_unique_id,
441
0
                                 tcolumn.children_column[i], children_column);
442
0
    }
443
1.48k
}
444
445
5
// Loads this TabletMeta from the header file at `file_path`.
//
// Returns the error from FileHeader::deserialize() if reading/validation fails,
// PARSE_PROTOBUF_ERROR if copying the decoded message throws, and OK otherwise.
Status TabletMeta::create_from_file(const string& file_path) {
    FileHeader<TabletMetaPB> header(file_path);
    // In file_header.deserialize(), it validates file length, signature, checksum of protobuf.
    RETURN_IF_ERROR(header.deserialize());

    TabletMetaPB meta_pb;
    try {
        meta_pb.CopyFrom(header.message());
    } catch (...) {
        return Status::Error<PARSE_PROTOBUF_ERROR>("fail to copy protocol buffer object. file={}",
                                                   file_path);
    }

    init_from_pb(meta_pb);
    return Status::OK();
}
460
461
// Returns "<schema_hash_path>/<tablet_id>.hdr", the path of a tablet's header file
// inside its schema-hash directory.
std::string TabletMeta::construct_header_file_path(const string& schema_hash_path,
                                                   int64_t tablet_id) {
    std::ostringstream path_builder;
    path_builder << schema_hash_path;
    path_builder << "/";
    path_builder << tablet_id;
    path_builder << ".hdr";
    return path_builder.str();
}
467
468
0
// Writes this tablet meta as pretty-printed JSON (bytes fields base64-encoded) to
// `file_path` on the local file system. Returns the first error reported while
// creating, appending to or closing the file, or OK.
Status TabletMeta::save_as_json(const string& file_path) {
    json2pb::Pb2JsonOptions options;
    options.pretty_json = true;
    options.bytes_to_base64 = true;

    std::string json_text;
    to_json(&json_text, options);

    // save to file
    io::FileWriterPtr writer;
    RETURN_IF_ERROR(io::global_local_filesystem()->create_file(file_path, &writer));
    RETURN_IF_ERROR(writer->append(json_text));
    RETURN_IF_ERROR(writer->close());
    return Status::OK();
}
481
482
26
// Converts this object to a TabletMetaPB and saves it to `file_path` through the
// two-argument save() overload.
Status TabletMeta::save(const string& file_path) {
    TabletMetaPB meta_pb;
    to_meta_pb(&meta_pb);
    return TabletMeta::save(file_path, meta_pb);
}
487
488
28
// Saves `tablet_meta_pb` to the header file at `file_path`.
//
// `file_path` must not be empty (checked with DCHECK). Returns INTERNAL_ERROR if
// copying the message into the file header throws, the error from
// FileHeader::prepare() / FileHeader::serialize() if either fails, and OK otherwise.
Status TabletMeta::save(const string& file_path, const TabletMetaPB& tablet_meta_pb) {
    DCHECK(!file_path.empty());
    FileHeader<TabletMetaPB> file_header(file_path);
    try {
        file_header.mutable_message()->CopyFrom(tablet_meta_pb);
    } catch (...) {
        // Close the quote that is opened before the path so the log line is well-formed.
        LOG(WARNING) << "fail to copy protocol buffer object. file='" << file_path << "'";
        return Status::Error<ErrorCode::INTERNAL_ERROR>(
                "fail to copy protocol buffer object. file={}", file_path);
    }
    RETURN_IF_ERROR(file_header.prepare());
    RETURN_IF_ERROR(file_header.serialize());
    return Status::OK();
}
502
503
145
// Persists this tablet meta into `data_dir` while holding _meta_lock exclusively for
// the whole operation; the actual work is done by _save_meta().
Status TabletMeta::save_meta(DataDir* data_dir) {
    std::lock_guard<std::shared_mutex> exclusive_guard(_meta_lock);
    Status result = _save_meta(data_dir);
    return result;
}
507
508
145
// Serializes this tablet meta and stores it through TabletMetaManager::save().
//
// An all-zero tablet uid or a failed save is reported with LOG(FATAL). When the whole
// operation takes longer than one second, the serialize and store timings are logged.
// Returns the status produced by TabletMetaManager::save().
Status TabletMeta::_save_meta(DataDir* data_dir) {
    // check if tablet uid is valid
    const bool uid_is_zero = (_tablet_uid.hi == 0 && _tablet_uid.lo == 0);
    if (uid_is_zero) {
        LOG(FATAL) << "tablet_uid is invalid"
                   << " tablet=" << tablet_id() << " _tablet_uid=" << _tablet_uid.to_string();
    }

    string serialized_meta;
    const auto serialize_begin_us = MonotonicMicros();
    serialize(&serialized_meta);
    const auto serialize_end_us = MonotonicMicros();

    Status status =
            TabletMetaManager::save(data_dir, tablet_id(), schema_hash(), serialized_meta);
    if (!status.ok()) {
        LOG(FATAL) << "fail to save tablet_meta. status=" << status << ", tablet_id=" << tablet_id()
                   << ", schema_hash=" << schema_hash();
    }
    const auto save_end_us = MonotonicMicros();

    // One second, expressed in microseconds.
    constexpr int kSlowSaveThresholdUs = 1 * 1000 * 1000;
    if (save_end_us - serialize_begin_us > kSlowSaveThresholdUs) {
        const auto serialize_cost_us = serialize_end_us - serialize_begin_us;
        const auto store_cost_us = save_end_us - serialize_end_us;
        LOG(INFO) << "save tablet(" << tablet_id() << ") meta too slow. serialize cost "
                  << serialize_cost_us
                  << "(us), serialized binary size: " << serialized_meta.length()
                  << "(bytes), write rocksdb cost " << store_cost_us << "(us)";
    }
    return status;
}
533
534
148
// Serializes this tablet meta into `*meta_binary`.
//
// A non-positive partition id is only warned about. When the tablet has rowset metas,
// the average serialized size per rowset meta is recorded, and if the result is larger
// than config::tablet_meta_serialize_size_limit (or serialization failed) the stale
// rowset metas are dropped from the protobuf and serialization is retried once.
// A failure that remains after that is reported with LOG(FATAL).
void TabletMeta::serialize(string* meta_binary) {
    TabletMetaPB meta_pb;
    to_meta_pb(&meta_pb);
    if (meta_pb.partition_id() <= 0) {
        LOG(WARNING) << "invalid partition id " << meta_pb.partition_id() << " tablet "
                     << meta_pb.tablet_id();
    }
    // Debug hook: zeroes the partition id when the named debug point is enabled.
    DBUG_EXECUTE_IF("TabletMeta::serialize::zero_partition_id", {
        long partition_id = meta_pb.partition_id();
        meta_pb.set_partition_id(0);
        LOG(WARNING) << "set debug point TabletMeta::serialize::zero_partition_id old="
                     << partition_id << " new=" << meta_pb.DebugString();
    });
    bool serialized_ok = meta_pb.SerializeToString(meta_binary);

    const auto rowset_meta_count = _rs_metas.size() + _stale_rs_metas.size();
    if (rowset_meta_count != 0) {
        _avg_rs_meta_serialize_size = meta_binary->length() / rowset_meta_count;

        const bool exceeds_limit =
                meta_binary->length() > config::tablet_meta_serialize_size_limit;
        if (exceeds_limit || !serialized_ok) {
            int64_t size_before_cleanup = meta_binary->length();
            int64_t stale_rowset_count = meta_pb.stale_rs_metas().size();

            // Retry without the stale rowset metas.
            meta_pb.clear_stale_rs_metas();
            meta_binary->clear();
            serialized_ok = meta_pb.SerializeToString(meta_binary);

            LOG(WARNING) << "tablet meta serialization size exceeds limit: "
                         << config::tablet_meta_serialize_size_limit
                         << " clean up stale rowsets, tablet id: " << tablet_id()
                         << " stale rowset num: " << stale_rowset_count
                         << " serialization size before clean " << size_before_cleanup
                         << " serialization size after clean " << meta_binary->length();
        }
    }

    if (!serialized_ok) {
        LOG(FATAL) << "failed to serialize meta " << tablet_id();
    }
}
571
572
51
// Parses `meta_binary` as a serialized TabletMetaPB and loads this object from it.
// Returns INIT_FAILED when the bytes cannot be parsed, OK otherwise.
Status TabletMeta::deserialize(std::string_view meta_binary) {
    TabletMetaPB meta_pb;
    if (!meta_pb.ParseFromArray(meta_binary.data(), meta_binary.size())) {
        return Status::Error<INIT_FAILED>("parse tablet meta failed");
    }

    init_from_pb(meta_pb);
    return Status::OK();
}
581
582
292
void TabletMeta::init_from_pb(const TabletMetaPB& tablet_meta_pb) {
583
292
    _table_id = tablet_meta_pb.table_id();
584
292
    _index_id = tablet_meta_pb.index_id();
585
292
    _partition_id = tablet_meta_pb.partition_id();
586
292
    _tablet_id = tablet_meta_pb.tablet_id();
587
292
    _replica_id = tablet_meta_pb.replica_id();
588
292
    _schema_hash = tablet_meta_pb.schema_hash();
589
292
    _shard_id = tablet_meta_pb.shard_id();
590
292
    _creation_time = tablet_meta_pb.creation_time();
591
292
    _cumulative_layer_point = tablet_meta_pb.cumulative_layer_point();
592
292
    _tablet_uid = TabletUid(tablet_meta_pb.tablet_uid());
593
292
    _ttl_seconds = tablet_meta_pb.ttl_seconds();
594
292
    if (tablet_meta_pb.has_tablet_type()) {
595
292
        _tablet_type = tablet_meta_pb.tablet_type();
596
292
    } else {
597
0
        _tablet_type = TabletTypePB::TABLET_TYPE_DISK;
598
0
    }
599
600
    // init _tablet_state
601
292
    switch (tablet_meta_pb.tablet_state()) {
602
3
    case PB_NOTREADY:
603
3
        _tablet_state = TabletState::TABLET_NOTREADY;
604
3
        break;
605
266
    case PB_RUNNING:
606
266
        _tablet_state = TabletState::TABLET_RUNNING;
607
266
        break;
608
0
    case PB_TOMBSTONED:
609
0
        _tablet_state = TabletState::TABLET_TOMBSTONED;
610
0
        break;
611
0
    case PB_STOPPED:
612
0
        _tablet_state = TabletState::TABLET_STOPPED;
613
0
        break;
614
23
    case PB_SHUTDOWN:
615
23
        _tablet_state = TabletState::TABLET_SHUTDOWN;
616
23
        break;
617
0
    default:
618
0
        LOG(WARNING) << "tablet has no state. tablet=" << tablet_id()
619
0
                     << ", schema_hash=" << schema_hash();
620
292
    }
621
622
    // init _schema
623
292
    _schema->init_from_pb(tablet_meta_pb.schema());
624
625
292
    if (tablet_meta_pb.has_enable_unique_key_merge_on_write()) {
626
292
        _enable_unique_key_merge_on_write = tablet_meta_pb.enable_unique_key_merge_on_write();
627
292
    }
628
629
    // init _rs_metas
630
338
    for (auto& it : tablet_meta_pb.rs_metas()) {
631
338
        RowsetMetaSharedPtr rs_meta(new RowsetMeta());
632
338
        rs_meta->init_from_pb(it);
633
338
        _rs_metas.push_back(std::move(rs_meta));
634
338
    }
635
636
    // For mow table, delete bitmap of stale rowsets has not been persisted.
637
    // When be restart, query should not read the stale rowset, otherwise duplicate keys
638
    // will be read out. Therefore, we don't add them to _stale_rs_meta for mow table.
639
292
    if (!config::skip_loading_stale_rowset_meta && !_enable_unique_key_merge_on_write) {
640
257
        for (auto& it : tablet_meta_pb.stale_rs_metas()) {
641
0
            RowsetMetaSharedPtr rs_meta(new RowsetMeta());
642
0
            rs_meta->init_from_pb(it);
643
0
            _stale_rs_metas.push_back(std::move(rs_meta));
644
0
        }
645
257
    }
646
647
292
    if (tablet_meta_pb.has_in_restore_mode()) {
648
292
        _in_restore_mode = tablet_meta_pb.in_restore_mode();
649
292
    }
650
651
292
    if (tablet_meta_pb.has_preferred_rowset_type()) {
652
58
        _preferred_rowset_type = tablet_meta_pb.preferred_rowset_type();
653
58
    }
654
655
292
    _storage_policy_id = tablet_meta_pb.storage_policy_id();
656
292
    if (tablet_meta_pb.has_cooldown_meta_id()) {
657
0
        _cooldown_meta_id = tablet_meta_pb.cooldown_meta_id();
658
0
    }
659
660
292
    if (tablet_meta_pb.has_delete_bitmap()) {
661
0
        int rst_ids_size = tablet_meta_pb.delete_bitmap().rowset_ids_size();
662
0
        int seg_ids_size = tablet_meta_pb.delete_bitmap().segment_ids_size();
663
0
        int versions_size = tablet_meta_pb.delete_bitmap().versions_size();
664
0
        int seg_maps_size = tablet_meta_pb.delete_bitmap().segment_delete_bitmaps_size();
665
0
        CHECK(rst_ids_size == seg_ids_size && seg_ids_size == seg_maps_size &&
666
0
              seg_maps_size == versions_size);
667
0
        for (size_t i = 0; i < rst_ids_size; ++i) {
668
0
            RowsetId rst_id;
669
0
            rst_id.init(tablet_meta_pb.delete_bitmap().rowset_ids(i));
670
0
            auto seg_id = tablet_meta_pb.delete_bitmap().segment_ids(i);
671
0
            uint32_t ver = tablet_meta_pb.delete_bitmap().versions(i);
672
0
            auto bitmap = tablet_meta_pb.delete_bitmap().segment_delete_bitmaps(i).data();
673
0
            delete_bitmap().delete_bitmap[{rst_id, seg_id, ver}] = roaring::Roaring::read(bitmap);
674
0
        }
675
0
    }
676
677
292
    if (tablet_meta_pb.has_binlog_config()) {
678
56
        _binlog_config = tablet_meta_pb.binlog_config();
679
56
    }
680
292
    _compaction_policy = tablet_meta_pb.compaction_policy();
681
292
    _time_series_compaction_goal_size_mbytes =
682
292
            tablet_meta_pb.time_series_compaction_goal_size_mbytes();
683
292
    _time_series_compaction_file_count_threshold =
684
292
            tablet_meta_pb.time_series_compaction_file_count_threshold();
685
292
    _time_series_compaction_time_threshold_seconds =
686
292
            tablet_meta_pb.time_series_compaction_time_threshold_seconds();
687
292
    _time_series_compaction_empty_rowsets_threshold =
688
292
            tablet_meta_pb.time_series_compaction_empty_rowsets_threshold();
689
292
    _time_series_compaction_level_threshold =
690
292
            tablet_meta_pb.time_series_compaction_level_threshold();
691
292
}
692
693
180
// Serializes this TabletMeta into `tablet_meta_pb`.
//
// Rowset metas (active and stale) are only embedded when the process is not
// running in cloud mode. When merge-on-write is enabled, a snapshot of the
// delete bitmap is written as four parallel repeated fields (rowset id,
// segment id, version, serialized bitmap); entries that belong to stale
// rowsets are left out.
void TabletMeta::to_meta_pb(TabletMetaPB* tablet_meta_pb) {
    tablet_meta_pb->set_table_id(table_id());
    tablet_meta_pb->set_index_id(index_id());
    tablet_meta_pb->set_partition_id(partition_id());
    tablet_meta_pb->set_tablet_id(tablet_id());
    tablet_meta_pb->set_replica_id(replica_id());
    tablet_meta_pb->set_schema_hash(schema_hash());
    tablet_meta_pb->set_shard_id(shard_id());
    tablet_meta_pb->set_creation_time(creation_time());
    tablet_meta_pb->set_cumulative_layer_point(cumulative_layer_point());
    *(tablet_meta_pb->mutable_tablet_uid()) = tablet_uid().to_proto();
    tablet_meta_pb->set_tablet_type(_tablet_type);
    tablet_meta_pb->set_ttl_seconds(_ttl_seconds);

    // Map the in-memory tablet state onto its protobuf counterpart.
    switch (tablet_state()) {
    case TABLET_NOTREADY:
        tablet_meta_pb->set_tablet_state(PB_NOTREADY);
        break;
    case TABLET_RUNNING:
        tablet_meta_pb->set_tablet_state(PB_RUNNING);
        break;
    case TABLET_TOMBSTONED:
        tablet_meta_pb->set_tablet_state(PB_TOMBSTONED);
        break;
    case TABLET_STOPPED:
        tablet_meta_pb->set_tablet_state(PB_STOPPED);
        break;
    case TABLET_SHUTDOWN:
        tablet_meta_pb->set_tablet_state(PB_SHUTDOWN);
        break;
    }

    // RowsetMetaPB is separated from TabletMetaPB
    if (!config::is_cloud_mode()) {
        for (const auto& rs : _rs_metas) {
            rs->to_rowset_pb(tablet_meta_pb->add_rs_metas());
        }
        // Iterate by reference: copying the shared pointer per element is unnecessary.
        for (const auto& rs : _stale_rs_metas) {
            rs->to_rowset_pb(tablet_meta_pb->add_stale_rs_metas());
        }
    }

    _schema->to_schema_pb(tablet_meta_pb->mutable_schema());

    tablet_meta_pb->set_in_restore_mode(in_restore_mode());

    // To avoid modifying the tablet meta as far as possible, the following
    // optional fields are only written when they carry a meaningful value.
    if (_preferred_rowset_type == BETA_ROWSET) {
        tablet_meta_pb->set_preferred_rowset_type(_preferred_rowset_type);
    }
    if (_storage_policy_id > 0) {
        tablet_meta_pb->set_storage_policy_id(_storage_policy_id);
    }
    if (_cooldown_meta_id.initialized()) {
        tablet_meta_pb->mutable_cooldown_meta_id()->CopyFrom(_cooldown_meta_id.to_proto());
    }

    tablet_meta_pb->set_enable_unique_key_merge_on_write(_enable_unique_key_merge_on_write);

    if (_enable_unique_key_merge_on_write) {
        // Collect ids of stale rowsets so their delete bitmap entries can be skipped.
        std::set<RowsetId> stale_rs_ids;
        for (const auto& rowset : _stale_rs_metas) {
            stale_rs_ids.insert(rowset->rowset_id());
        }
        DeleteBitmapPB* delete_bitmap_pb = tablet_meta_pb->mutable_delete_bitmap();
        // The snapshot temporary stays alive for the whole loop because the
        // range expression binds directly to one of its members.
        for (auto& [id, bitmap] : delete_bitmap().snapshot().delete_bitmap) {
            auto& [rowset_id, segment_id, ver] = id;
            if (stale_rs_ids.count(rowset_id) != 0) {
                continue;
            }
            delete_bitmap_pb->add_rowset_ids(rowset_id.to_string());
            delete_bitmap_pb->add_segment_ids(segment_id);
            delete_bitmap_pb->add_versions(ver);
            // Serialize the roaring bitmap into a buffer of exactly the required size.
            std::string bitmap_data(bitmap.getSizeInBytes(), '\0');
            bitmap.write(bitmap_data.data());
            *(delete_bitmap_pb->add_segment_delete_bitmaps()) = std::move(bitmap_data);
        }
    }
    _binlog_config.to_pb(tablet_meta_pb->mutable_binlog_config());
    tablet_meta_pb->set_compaction_policy(compaction_policy());
    tablet_meta_pb->set_time_series_compaction_goal_size_mbytes(
            time_series_compaction_goal_size_mbytes());
    tablet_meta_pb->set_time_series_compaction_file_count_threshold(
            time_series_compaction_file_count_threshold());
    tablet_meta_pb->set_time_series_compaction_time_threshold_seconds(
            time_series_compaction_time_threshold_seconds());
    tablet_meta_pb->set_time_series_compaction_empty_rowsets_threshold(
            time_series_compaction_empty_rowsets_threshold());
    tablet_meta_pb->set_time_series_compaction_level_threshold(
            time_series_compaction_level_threshold());
}
783
784
139
int64_t TabletMeta::mem_size() const {
785
139
    auto size = sizeof(TabletMeta);
786
139
    size += _schema->mem_size();
787
139
    return size;
788
139
}
789
790
2
// Renders this tablet meta as JSON: the meta is first converted to a
// TabletMetaPB, which is then handed to json2pb together with `options`.
void TabletMeta::to_json(string* json_string, json2pb::Pb2JsonOptions& options) {
    TabletMetaPB pb;
    to_meta_pb(&pb);
    json2pb::ProtoMessageToJson(pb, json_string, options);
}
795
796
95
// Returns the version of the rowset with the largest end version among
// _rs_metas. If no rowset has an end version greater than 0, the initial
// value {-1, 0} is returned.
Version TabletMeta::max_version() const {
    Version newest = {-1, 0};
    for (const auto& candidate : _rs_metas) {
        if (!(candidate->end_version() > newest.second)) {
            continue;
        }
        newest = candidate->version();
    }
    return newest;
}
805
806
0
size_t TabletMeta::version_count_cross_with_range(const Version& range) const {
807
0
    size_t count = 0;
808
0
    for (const auto& rs_meta : _rs_metas) {
809
0
        if (!(range.first > rs_meta->version().second || range.second < rs_meta->version().first)) {
810
0
            count++;
811
0
        }
812
0
    }
813
0
    return count;
814
0
}
815
816
701
// Appends `rs_meta` to _rs_metas unless a rowset with the same version exists.
//
// - Same version, different rowset id: returns PUSH_VERSION_ALREADY_EXIST.
// - Same version, same rowset id: treated as a duplicate request, returns OK
//   without adding anything.
Status TabletMeta::add_rs_meta(const RowsetMetaSharedPtr& rs_meta) {
    // check RowsetMeta is valid
    for (auto& existing : _rs_metas) {
        if (!(existing->version() == rs_meta->version())) {
            continue;
        }
        if (existing->rowset_id() != rs_meta->rowset_id()) {
            return Status::Error<PUSH_VERSION_ALREADY_EXIST>(
                    "version already exist. rowset_id={}, version={}, tablet={}",
                    existing->rowset_id().to_string(), existing->version().to_string(),
                    tablet_id());
        }
        // rowsetid,version is equal, it is a duplicate req, skip it
        return Status::OK();
    }
    _rs_metas.push_back(rs_meta);
    return Status::OK();
}
833
834
0
void TabletMeta::add_rowsets_unchecked(const std::vector<RowsetSharedPtr>& to_add) {
835
0
    for (const auto& rs : to_add) {
836
0
        _rs_metas.push_back(rs->rowset_meta());
837
0
    }
838
0
}
839
840
void TabletMeta::delete_rs_meta_by_version(const Version& version,
841
0
                                           std::vector<RowsetMetaSharedPtr>* deleted_rs_metas) {
842
0
    auto it = _rs_metas.begin();
843
0
    while (it != _rs_metas.end()) {
844
0
        if ((*it)->version() == version) {
845
0
            if (deleted_rs_metas != nullptr) {
846
0
                deleted_rs_metas->push_back(*it);
847
0
            }
848
0
            _rs_metas.erase(it);
849
0
            return;
850
0
        } else {
851
0
            ++it;
852
0
        }
853
0
    }
854
0
}
855
856
void TabletMeta::modify_rs_metas(const std::vector<RowsetMetaSharedPtr>& to_add,
857
                                 const std::vector<RowsetMetaSharedPtr>& to_delete,
858
12
                                 bool same_version) {
859
    // Remove to_delete rowsets from _rs_metas
860
25
    for (auto rs_to_del : to_delete) {
861
25
        auto it = _rs_metas.begin();
862
187
        while (it != _rs_metas.end()) {
863
187
            if (rs_to_del->version() == (*it)->version()) {
864
25
                _rs_metas.erase(it);
865
                // there should be only one rowset match the version
866
25
                break;
867
162
            } else {
868
162
                ++it;
869
162
            }
870
187
        }
871
25
    }
872
12
    if (!same_version) {
873
        // put to_delete rowsets in _stale_rs_metas.
874
7
        _stale_rs_metas.insert(_stale_rs_metas.end(), to_delete.begin(), to_delete.end());
875
7
    }
876
    // put to_add rowsets in _rs_metas.
877
12
    _rs_metas.insert(_rs_metas.end(), to_add.begin(), to_add.end());
878
12
}
879
880
// Use the passing "rs_metas" to replace the rs meta in this tablet meta
881
// Also clear the _stale_rs_metas because this tablet meta maybe copyied from
882
// an existing tablet before. Add after revise, only the passing "rs_metas"
883
// is needed.
884
3
void TabletMeta::revise_rs_metas(std::vector<RowsetMetaSharedPtr>&& rs_metas) {
885
3
    std::lock_guard<std::shared_mutex> wrlock(_meta_lock);
886
3
    _rs_metas = std::move(rs_metas);
887
3
    _stale_rs_metas.clear();
888
3
}
889
890
// This method should call after revise_rs_metas, since new rs_metas might be a subset
891
// of original tablet, we should revise the delete_bitmap according to current rowset.
892
//
893
// Delete bitmap is protected by Tablet::_meta_lock, we don't need to acquire the
894
// TabletMeta's _meta_lock
895
1
void TabletMeta::revise_delete_bitmap_unlocked(const DeleteBitmap& delete_bitmap) {
896
1
    _delete_bitmap = std::make_unique<DeleteBitmap>(tablet_id());
897
2
    for (auto rs : _rs_metas) {
898
2
        DeleteBitmap rs_bm(tablet_id());
899
2
        delete_bitmap.subset({rs->rowset_id(), 0, 0}, {rs->rowset_id(), UINT32_MAX, INT64_MAX},
900
2
                             &rs_bm);
901
2
        _delete_bitmap->merge(rs_bm);
902
2
    }
903
1
    for (auto rs : _stale_rs_metas) {
904
0
        DeleteBitmap rs_bm(tablet_id());
905
0
        delete_bitmap.subset({rs->rowset_id(), 0, 0}, {rs->rowset_id(), UINT32_MAX, INT64_MAX},
906
0
                             &rs_bm);
907
0
        _delete_bitmap->merge(rs_bm);
908
0
    }
909
1
}
910
911
0
void TabletMeta::delete_stale_rs_meta_by_version(const Version& version) {
912
0
    auto it = _stale_rs_metas.begin();
913
0
    while (it != _stale_rs_metas.end()) {
914
0
        if ((*it)->version() == version) {
915
0
            if (_enable_unique_key_merge_on_write) {
916
                // remove rowset delete bitmap
917
0
                delete_bitmap().remove({(*it)->rowset_id(), 0, 0},
918
0
                                       {(*it)->rowset_id(), UINT32_MAX, 0});
919
0
            }
920
0
            it = _stale_rs_metas.erase(it);
921
0
        } else {
922
0
            it++;
923
0
        }
924
0
    }
925
0
}
926
927
0
// Returns the rowset meta in _rs_metas whose version equals `version`,
// or nullptr if there is none.
RowsetMetaSharedPtr TabletMeta::acquire_rs_meta_by_version(const Version& version) const {
    // Iterate by reference so that only the returned element is copied.
    for (const auto& rs_meta : _rs_metas) {
        if (rs_meta->version() == version) {
            return rs_meta;
        }
    }
    return nullptr;
}
935
936
8
// Returns the rowset meta in _stale_rs_metas whose version equals `version`,
// or nullptr if there is none.
RowsetMetaSharedPtr TabletMeta::acquire_stale_rs_meta_by_version(const Version& version) const {
    // Iterate by reference so that only the returned element is copied.
    for (const auto& rs_meta : _stale_rs_metas) {
        if (rs_meta->version() == version) {
            return rs_meta;
        }
    }
    return nullptr;
}
944
945
23
// Sets the partition id. A warning is logged when a positive partition id is
// about to be overwritten with a different value, or when the new id is less
// than 1; the assignment happens regardless and OK is always returned.
Status TabletMeta::set_partition_id(int64_t partition_id) {
    const bool overwrites_other_id = _partition_id > 0 && _partition_id != partition_id;
    const bool new_id_not_positive = partition_id < 1;
    if (overwrites_other_id || new_id_not_positive) {
        LOG(WARNING) << "cur partition id=" << _partition_id << " new partition id=" << partition_id
                     << " not equal";
    }
    _partition_id = partition_id;
    return Status::OK();
}
953
954
1
// Field-wise equality of two tablet metas.
//
// Only the members listed below take part in the comparison. The schema is
// compared by value (through the pointed-to objects), whereas the entries of
// _rs_metas are compared element by element with the `!=` of their handle type.
// NOTE(review): RowsetMetaSharedPtr looks like a shared pointer, in which case
// this is a pointer-identity comparison rather than a deep one - confirm.
bool operator==(const TabletMeta& a, const TabletMeta& b) {
    if (a._table_id != b._table_id) return false;
    if (a._index_id != b._index_id) return false;
    if (a._partition_id != b._partition_id) return false;
    if (a._tablet_id != b._tablet_id) return false;
    if (a._replica_id != b._replica_id) return false;
    if (a._schema_hash != b._schema_hash) return false;
    if (a._shard_id != b._shard_id) return false;
    if (a._creation_time != b._creation_time) return false;
    if (a._cumulative_layer_point != b._cumulative_layer_point) return false;
    if (a._tablet_uid != b._tablet_uid) return false;
    if (a._tablet_type != b._tablet_type) return false;
    if (a._tablet_state != b._tablet_state) return false;
    if (*a._schema != *b._schema) return false;
    if (a._rs_metas.size() != b._rs_metas.size()) return false;
    // size_t index: avoids the signed/unsigned comparison against size().
    for (size_t i = 0; i < a._rs_metas.size(); ++i) {
        if (a._rs_metas[i] != b._rs_metas[i]) return false;
    }
    if (a._in_restore_mode != b._in_restore_mode) return false;
    if (a._preferred_rowset_type != b._preferred_rowset_type) return false;
    if (a._storage_policy_id != b._storage_policy_id) return false;
    if (a._compaction_policy != b._compaction_policy) return false;
    if (a._time_series_compaction_goal_size_mbytes != b._time_series_compaction_goal_size_mbytes)
        return false;
    if (a._time_series_compaction_file_count_threshold !=
        b._time_series_compaction_file_count_threshold)
        return false;
    if (a._time_series_compaction_time_threshold_seconds !=
        b._time_series_compaction_time_threshold_seconds)
        return false;
    if (a._time_series_compaction_empty_rowsets_threshold !=
        b._time_series_compaction_empty_rowsets_threshold)
        return false;
    if (a._time_series_compaction_level_threshold != b._time_series_compaction_level_threshold)
        return false;
    return true;
}
991
992
0
// Inequality is defined as the negation of operator==.
bool operator!=(const TabletMeta& a, const TabletMeta& b) {
    const bool equal = (a == b);
    return !equal;
}
995
996
371
// Creates an empty delete bitmap for `tablet_id` and sets up its aggregation cache.
DeleteBitmap::DeleteBitmap(int64_t tablet_id) : _tablet_id(tablet_id) {
    // The default delete bitmap cache is set to 100MB,
    // which can be insufficient and cause performance issues when the amount of user data is large.
    // To mitigate the problem of an inadequate cache,
    // we will take the larger of 0.5% of the total memory and 100MB as the delete bitmap cache size.
    bool is_percent = false;
    int64_t dynamic_limit = ParseUtil::parse_mem_spec(
            config::delete_bitmap_dynamic_agg_cache_limit, MemInfo::mem_limit(),
            MemInfo::physical_mem(), &is_percent);
    // Use whichever is larger: the parsed dynamic limit or the configured capacity.
    auto capacity = dynamic_limit > config::delete_bitmap_agg_cache_capacity
                            ? dynamic_limit
                            : config::delete_bitmap_agg_cache_capacity;
    _agg_cache.reset(new AggCache(capacity));
}
1010
1011
7
// Copy constructor: duplicates only the tablet id and the bitmap entries of `o`.
// No other member is assigned here.
DeleteBitmap::DeleteBitmap(const DeleteBitmap& o) {
    _tablet_id = o._tablet_id;
    delete_bitmap = o.delete_bitmap; // just copy data
}
1015
1016
0
// Copy assignment: overwrites the tablet id and the bitmap entries with those
// of `o`. No other member is assigned here.
DeleteBitmap& DeleteBitmap::operator=(const DeleteBitmap& o) {
    _tablet_id = o._tablet_id;
    delete_bitmap = o.delete_bitmap; // just copy data
    return *this;
}
1021
1022
0
// Move constructor: takes over the bitmap entries of `o` and copies its tablet
// id. No other member is assigned here.
DeleteBitmap::DeleteBitmap(DeleteBitmap&& o) {
    _tablet_id = o._tablet_id;
    delete_bitmap = std::move(o.delete_bitmap);
}
1026
1027
0
// Move assignment: takes over the bitmap entries of `o` and copies its tablet
// id. No other member is assigned here.
DeleteBitmap& DeleteBitmap::operator=(DeleteBitmap&& o) {
    _tablet_id = o._tablet_id;
    delete_bitmap = std::move(o.delete_bitmap);
    return *this;
}
1032
1033
7
// Returns a copy of this delete bitmap, taken while holding `lock` in shared mode.
DeleteBitmap DeleteBitmap::snapshot() const {
    std::shared_lock read_guard(lock);
    return DeleteBitmap {*this};
}
1037
1038
3
// Returns a copy of this delete bitmap restricted to entries whose key version
// is not greater than `version`.
DeleteBitmap DeleteBitmap::snapshot(Version version) const {
    // Take snapshot first, then remove keys greater than given version.
    DeleteBitmap result = this->snapshot();
    auto& entries = result.delete_bitmap;
    for (auto pos = entries.begin(); pos != entries.end();) {
        if (std::get<2>(pos->first) > version) {
            pos = entries.erase(pos);
        } else {
            pos++;
        }
    }
    return result;
}
1051
1052
459k
void DeleteBitmap::add(const BitmapKey& bmk, uint32_t row_id) {
1053
459k
    std::lock_guard l(lock);
1054
459k
    delete_bitmap[bmk].add(row_id);
1055
459k
}
1056
1057
0
int DeleteBitmap::remove(const BitmapKey& bmk, uint32_t row_id) {
1058
0
    std::lock_guard l(lock);
1059
0
    auto it = delete_bitmap.find(bmk);
1060
0
    if (it == delete_bitmap.end()) return -1;
1061
0
    it->second.remove(row_id);
1062
0
    return 0;
1063
0
}
1064
1065
8
void DeleteBitmap::remove(const BitmapKey& start, const BitmapKey& end) {
1066
8
    std::lock_guard l(lock);
1067
107
    for (auto it = delete_bitmap.lower_bound(start); it != delete_bitmap.end();) {
1068
101
        auto& [k, _] = *it;
1069
101
        if (k >= end) {
1070
2
            break;
1071
2
        }
1072
99
        it = delete_bitmap.erase(it);
1073
99
    }
1074
8
}
1075
1076
6
bool DeleteBitmap::contains(const BitmapKey& bmk, uint32_t row_id) const {
1077
6
    std::shared_lock l(lock);
1078
6
    auto it = delete_bitmap.find(bmk);
1079
6
    return it != delete_bitmap.end() && it->second.contains(row_id);
1080
6
}
1081
1082
2
bool DeleteBitmap::contains_agg(const BitmapKey& bmk, uint32_t row_id) const {
1083
2
    return get_agg(bmk)->contains(row_id);
1084
2
}
1085
1086
0
bool DeleteBitmap::empty() const {
1087
0
    std::shared_lock l(lock);
1088
0
    return delete_bitmap.empty();
1089
0
}
1090
1091
0
uint64_t DeleteBitmap::cardinality() const {
1092
0
    std::shared_lock l(lock);
1093
0
    uint64_t res = 0;
1094
0
    for (auto entry : delete_bitmap) {
1095
0
        res += entry.second.cardinality();
1096
0
    }
1097
0
    return res;
1098
0
}
1099
1100
0
uint64_t DeleteBitmap::get_size() const {
1101
0
    std::shared_lock l(lock);
1102
0
    uint64_t charge = 0;
1103
0
    for (auto& [k, v] : delete_bitmap) {
1104
0
        charge += v.getSizeInBytes();
1105
0
    }
1106
0
    return charge;
1107
0
}
1108
1109
1
bool DeleteBitmap::contains_agg_without_cache(const BitmapKey& bmk, uint32_t row_id) const {
1110
1
    std::shared_lock l(lock);
1111
1
    DeleteBitmap::BitmapKey start {std::get<0>(bmk), std::get<1>(bmk), 0};
1112
1
    for (auto it = delete_bitmap.lower_bound(start); it != delete_bitmap.end(); ++it) {
1113
0
        auto& [k, bm] = *it;
1114
0
        if (std::get<0>(k) != std::get<0>(bmk) || std::get<1>(k) != std::get<1>(bmk) ||
1115
0
            std::get<2>(k) > std::get<2>(bmk)) {
1116
0
            break;
1117
0
        }
1118
0
        if (bm.contains(row_id)) {
1119
0
            return true;
1120
0
        }
1121
0
    }
1122
1
    return false;
1123
1
}
1124
1125
0
void DeleteBitmap::remove_sentinel_marks() {
1126
0
    for (auto it = delete_bitmap.begin(), end = delete_bitmap.end(); it != end;) {
1127
0
        if (std::get<1>(it->first) == DeleteBitmap::INVALID_SEGMENT_ID) {
1128
0
            it = delete_bitmap.erase(it);
1129
0
        } else {
1130
0
            ++it;
1131
0
        }
1132
0
    }
1133
0
}
1134
1135
38
// Stores a copy of `segment_delete_bitmap` under `bmk`, replacing any existing
// entry. Returns 1 if a new entry was inserted and 0 if an existing one was
// overwritten.
int DeleteBitmap::set(const BitmapKey& bmk, const roaring::Roaring& segment_delete_bitmap) {
    std::lock_guard write_guard(lock);
    const auto result = delete_bitmap.insert_or_assign(bmk, segment_delete_bitmap);
    return result.second;
}
1140
1141
3
int DeleteBitmap::get(const BitmapKey& bmk, roaring::Roaring* segment_delete_bitmap) const {
1142
3
    std::shared_lock l(lock);
1143
3
    auto it = delete_bitmap.find(bmk);
1144
3
    if (it == delete_bitmap.end()) return -1;
1145
3
    *segment_delete_bitmap = it->second; // copy
1146
3
    return 0;
1147
3
}
1148
1149
54
// Returns the address of the bitmap stored under `bmk`, or nullptr if absent.
// The shared lock is only held during the lookup; the returned pointer refers
// to the entry inside the container.
const roaring::Roaring* DeleteBitmap::get(const BitmapKey& bmk) const {
    std::shared_lock read_guard(lock);
    const auto pos = delete_bitmap.find(bmk);
    const bool found = pos != delete_bitmap.end();
    return found ? &(pos->second) : nullptr; // get address
}
1155
1156
void DeleteBitmap::subset(const BitmapKey& start, const BitmapKey& end,
1157
3
                          DeleteBitmap* subset_rowset_map) const {
1158
3
    roaring::Roaring roaring;
1159
3
    DCHECK(start < end);
1160
3
    std::shared_lock l(lock);
1161
26
    for (auto it = delete_bitmap.lower_bound(start); it != delete_bitmap.end(); ++it) {
1162
25
        auto& [k, bm] = *it;
1163
25
        if (k >= end) {
1164
2
            break;
1165
2
        }
1166
23
        subset_rowset_map->set(k, bm);
1167
23
    }
1168
3
}
1169
1170
2
// Merges `segment_delete_bitmap` into the entry for `bmk`: inserted as-is when
// the key is new, OR-ed into the existing bitmap otherwise.
void DeleteBitmap::merge(const BitmapKey& bmk, const roaring::Roaring& segment_delete_bitmap) {
    std::lock_guard write_guard(lock);
    const auto result = delete_bitmap.emplace(bmk, segment_delete_bitmap);
    const bool key_already_present = !result.second;
    if (key_already_present) {
        result.first->second |= segment_delete_bitmap;
    }
}
1177
1178
8
void DeleteBitmap::merge(const DeleteBitmap& other) {
1179
8
    std::lock_guard l(lock);
1180
29
    for (auto& i : other.delete_bitmap) {
1181
29
        auto [j, succ] = this->delete_bitmap.insert(i);
1182
29
        if (!succ) j->second |= i.second;
1183
29
    }
1184
8
}
1185
1186
void DeleteBitmap::add_to_remove_queue(
1187
        const std::string& version_str,
1188
        const std::vector<std::tuple<int64_t, DeleteBitmap::BitmapKey, DeleteBitmap::BitmapKey>>&
1189
0
                vector) {
1190
0
    std::shared_lock l(stale_delete_bitmap_lock);
1191
0
    _stale_delete_bitmap.emplace(version_str, vector);
1192
0
}
1193
1194
0
void DeleteBitmap::remove_stale_delete_bitmap_from_queue(const std::vector<std::string>& vector) {
1195
0
    if (!config::enable_delete_bitmap_merge_on_compaction) {
1196
0
        return;
1197
0
    }
1198
0
    std::shared_lock l(stale_delete_bitmap_lock);
1199
    //<rowset_id, start_version, end_version>
1200
0
    std::vector<std::tuple<std::string, uint64_t, uint64_t>> to_delete;
1201
0
    int64_t tablet_id = -1;
1202
0
    for (auto& version_str : vector) {
1203
0
        auto it = _stale_delete_bitmap.find(version_str);
1204
0
        if (it != _stale_delete_bitmap.end()) {
1205
0
            auto delete_bitmap_vector = it->second;
1206
0
            for (auto& delete_bitmap_tuple : it->second) {
1207
0
                if (tablet_id < 0) {
1208
0
                    tablet_id = std::get<0>(delete_bitmap_tuple);
1209
0
                }
1210
0
                auto start_bmk = std::get<1>(delete_bitmap_tuple);
1211
0
                auto end_bmk = std::get<2>(delete_bitmap_tuple);
1212
                // the key range of to be removed is [start_bmk,end_bmk),
1213
                // due to the different definitions of the right boundary,
1214
                // so use end_bmk as right boundary when removing local delete bitmap,
1215
                // use (end_bmk - 1) as right boundary when removing ms delete bitmap
1216
0
                remove(start_bmk, end_bmk);
1217
0
                to_delete.emplace_back(std::make_tuple(std::get<0>(start_bmk).to_string(), 0,
1218
0
                                                       std::get<2>(end_bmk) - 1));
1219
0
            }
1220
0
            _stale_delete_bitmap.erase(version_str);
1221
0
        }
1222
0
    }
1223
0
    if (tablet_id == -1 || to_delete.empty()) {
1224
0
        return;
1225
0
    }
1226
0
    CloudStorageEngine& engine = ExecEnv::GetInstance()->storage_engine().to_cloud();
1227
0
    auto st = engine.meta_mgr().remove_old_version_delete_bitmap(tablet_id, to_delete);
1228
0
    if (!st.ok()) {
1229
0
        LOG(WARNING) << "fail to remove_stale_delete_bitmap_from_queue for tablet=" << tablet_id
1230
0
                     << ",st=" << st;
1231
0
    }
1232
0
}
1233
1234
0
uint64_t DeleteBitmap::get_delete_bitmap_count() {
1235
0
    std::shared_lock l(lock);
1236
0
    return delete_bitmap.size();
1237
0
}
1238
1239
// We cannot just copy the underlying memory to construct a string
1240
// due to equivalent objects may have different padding bytes.
1241
// Reading padding bytes is undefined behavior, neither copy nor
1242
// placement new will help simplify the code.
1243
// Refer to C11 standards §6.2.6.1/6 and §6.7.9/21 for more info.
1244
44
static std::string agg_cache_key(int64_t tablet_id, const DeleteBitmap::BitmapKey& bmk) {
1245
44
    std::string ret(sizeof(tablet_id) + sizeof(bmk), '\0');
1246
44
    *reinterpret_cast<int64_t*>(ret.data()) = tablet_id;
1247
44
    auto t = reinterpret_cast<DeleteBitmap::BitmapKey*>(ret.data() + sizeof(tablet_id));
1248
44
    std::get<RowsetId>(*t).version = std::get<RowsetId>(bmk).version;
1249
44
    std::get<RowsetId>(*t).hi = std::get<RowsetId>(bmk).hi;
1250
44
    std::get<RowsetId>(*t).mi = std::get<RowsetId>(bmk).mi;
1251
44
    std::get<RowsetId>(*t).lo = std::get<RowsetId>(bmk).lo;
1252
44
    std::get<1>(*t) = std::get<1>(bmk);
1253
44
    std::get<2>(*t) = std::get<2>(bmk);
1254
44
    return ret;
1255
44
}
1256
1257
44
std::shared_ptr<roaring::Roaring> DeleteBitmap::get_agg(const BitmapKey& bmk) const {
1258
44
    std::string key_str = agg_cache_key(_tablet_id, bmk); // Cache key container
1259
44
    CacheKey key(key_str);
1260
44
    Cache::Handle* handle = _agg_cache->repr()->lookup(key);
1261
1262
44
    AggCache::Value* val =
1263
44
            handle == nullptr
1264
44
                    ? nullptr
1265
44
                    : reinterpret_cast<AggCache::Value*>(_agg_cache->repr()->value(handle));
1266
    // FIXME: do we need a mutex here to get rid of duplicated initializations
1267
    //        of cache entries in some cases?
1268
44
    if (val == nullptr) { // Renew if needed, put a new Value to cache
1269
38
        val = new AggCache::Value();
1270
38
        {
1271
38
            std::shared_lock l(lock);
1272
38
            DeleteBitmap::BitmapKey start {std::get<0>(bmk), std::get<1>(bmk), 0};
1273
69
            for (auto it = delete_bitmap.lower_bound(start); it != delete_bitmap.end(); ++it) {
1274
66
                auto& [k, bm] = *it;
1275
66
                if (std::get<0>(k) != std::get<0>(bmk) || std::get<1>(k) != std::get<1>(bmk) ||
1276
66
                    std::get<2>(k) > std::get<2>(bmk)) {
1277
35
                    break;
1278
35
                }
1279
31
                val->bitmap |= bm;
1280
31
            }
1281
38
        }
1282
38
        size_t charge = val->bitmap.getSizeInBytes() + sizeof(AggCache::Value);
1283
38
        handle = _agg_cache->repr()->insert(key, val, charge, charge, CachePriority::NORMAL);
1284
38
    }
1285
1286
    // It is natural for the cache to reclaim the underlying memory
1287
44
    return std::shared_ptr<roaring::Roaring>(
1288
44
            &val->bitmap, [this, handle](...) { _agg_cache->repr()->release(handle); });
1289
44
}
1290
1291
// Out-of-class definition of the static atomic pointer AggCache::s_repr; it starts out as nullptr.
std::atomic<DeleteBitmap::AggCachePolicy*> DeleteBitmap::AggCache::s_repr {nullptr};
1292
1293
0
// Returns the symbolic name of `state`. Values outside the known enumerators
// are rendered as "TabletState(<numeric value>)".
std::string tablet_state_name(TabletState state) {
    const char* known_name = nullptr;
    switch (state) {
    case TABLET_NOTREADY:
        known_name = "TABLET_NOTREADY";
        break;
    case TABLET_RUNNING:
        known_name = "TABLET_RUNNING";
        break;
    case TABLET_TOMBSTONED:
        known_name = "TABLET_TOMBSTONED";
        break;
    case TABLET_STOPPED:
        known_name = "TABLET_STOPPED";
        break;
    case TABLET_SHUTDOWN:
        known_name = "TABLET_SHUTDOWN";
        break;
    default:
        break;
    }
    if (known_name != nullptr) {
        return known_name;
    }
    return "TabletState(" + std::to_string(state) + ")";
}
1314
1315
} // namespace doris