/root/doris/be/src/olap/tablet_meta.cpp
Line | Count | Source (jump to first uncovered line) |
1 | | // Licensed to the Apache Software Foundation (ASF) under one |
2 | | // or more contributor license agreements. See the NOTICE file |
3 | | // distributed with this work for additional information |
4 | | // regarding copyright ownership. The ASF licenses this file |
5 | | // to you under the Apache License, Version 2.0 (the |
6 | | // "License"); you may not use this file except in compliance |
7 | | // with the License. You may obtain a copy of the License at |
8 | | // |
9 | | // http://www.apache.org/licenses/LICENSE-2.0 |
10 | | // |
11 | | // Unless required by applicable law or agreed to in writing, |
12 | | // software distributed under the License is distributed on an |
13 | | // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY |
14 | | // KIND, either express or implied. See the License for the |
15 | | // specific language governing permissions and limitations |
16 | | // under the License. |
17 | | |
18 | | #include "olap/tablet_meta.h" |
19 | | |
20 | | #include <gen_cpp/Descriptors_types.h> |
21 | | #include <gen_cpp/Types_types.h> |
22 | | #include <gen_cpp/olap_common.pb.h> |
23 | | #include <gen_cpp/olap_file.pb.h> |
24 | | #include <gen_cpp/segment_v2.pb.h> |
25 | | #include <gen_cpp/types.pb.h> |
26 | | #include <json2pb/pb_to_json.h> |
27 | | #include <time.h> |
28 | | |
29 | | #include <cstdint> |
30 | | #include <set> |
31 | | #include <utility> |
32 | | |
33 | | #include "common/config.h" |
34 | | #include "gutil/integral_types.h" |
35 | | #include "io/fs/file_writer.h" |
36 | | #include "olap/data_dir.h" |
37 | | #include "olap/file_header.h" |
38 | | #include "olap/olap_common.h" |
39 | | #include "olap/olap_define.h" |
40 | | #include "olap/rowset/rowset.h" |
41 | | #include "olap/rowset/rowset_meta_manager.h" |
42 | | #include "olap/tablet_meta_manager.h" |
43 | | #include "olap/utils.h" |
44 | | #include "util/debug_points.h" |
45 | | #include "util/mem_info.h" |
46 | | #include "util/parse_util.h" |
47 | | #include "util/string_util.h" |
48 | | #include "util/time.h" |
49 | | #include "util/uid_util.h" |
50 | | |
51 | | using std::string; |
52 | | using std::unordered_map; |
53 | | using std::vector; |
54 | | |
55 | | namespace doris { |
56 | | using namespace ErrorCode; |
57 | | |
58 | | TabletMetaSharedPtr TabletMeta::create( |
59 | | const TCreateTabletReq& request, const TabletUid& tablet_uid, uint64_t shard_id, |
60 | | uint32_t next_unique_id, |
61 | 69 | const unordered_map<uint32_t, uint32_t>& col_ordinal_to_unique_id) { |
62 | 69 | std::optional<TBinlogConfig> binlog_config; |
63 | 69 | if (request.__isset.binlog_config) { |
64 | 0 | binlog_config = request.binlog_config; |
65 | 0 | } |
66 | 69 | return std::make_shared<TabletMeta>( |
67 | 69 | request.table_id, request.partition_id, request.tablet_id, request.replica_id, |
68 | 69 | request.tablet_schema.schema_hash, shard_id, request.tablet_schema, next_unique_id, |
69 | 69 | col_ordinal_to_unique_id, tablet_uid, |
70 | 69 | request.__isset.tablet_type ? request.tablet_type : TTabletType::TABLET_TYPE_DISK, |
71 | 69 | request.compression_type, request.storage_policy_id, |
72 | 69 | request.__isset.enable_unique_key_merge_on_write |
73 | 69 | ? request.enable_unique_key_merge_on_write |
74 | 69 | : false, |
75 | 69 | std::move(binlog_config), request.compaction_policy, |
76 | 69 | request.time_series_compaction_goal_size_mbytes, |
77 | 69 | request.time_series_compaction_file_count_threshold, |
78 | 69 | request.time_series_compaction_time_threshold_seconds, |
79 | 69 | request.time_series_compaction_empty_rowsets_threshold, |
80 | 69 | request.inverted_index_storage_format, request.time_series_compaction_level_threshold); |
81 | 69 | } |
82 | | |
83 | | TabletMeta::TabletMeta() |
84 | | : _tablet_uid(0, 0), |
85 | | _schema(new TabletSchema), |
86 | 67 | _delete_bitmap(new DeleteBitmap(_tablet_id)) {} |
87 | | |
88 | | TabletMeta::TabletMeta(int64_t table_id, int64_t partition_id, int64_t tablet_id, |
89 | | int64_t replica_id, int32_t schema_hash, uint64_t shard_id, |
90 | | const TTabletSchema& tablet_schema, uint32_t next_unique_id, |
91 | | const std::unordered_map<uint32_t, uint32_t>& col_ordinal_to_unique_id, |
92 | | TabletUid tablet_uid, TTabletType::type tabletType, |
93 | | TCompressionType::type compression_type, int64_t storage_policy_id, |
94 | | bool enable_unique_key_merge_on_write, |
95 | | std::optional<TBinlogConfig> binlog_config, std::string compaction_policy, |
96 | | int64_t time_series_compaction_goal_size_mbytes, |
97 | | int64_t time_series_compaction_file_count_threshold, |
98 | | int64_t time_series_compaction_time_threshold_seconds, |
99 | | int64_t time_series_compaction_empty_rowsets_threshold, |
100 | | TInvertedIndexStorageFormat::type inverted_index_storage_format, |
101 | | int64_t time_series_compaction_level_threshold) |
102 | | : _tablet_uid(0, 0), |
103 | | _schema(new TabletSchema), |
104 | 212 | _delete_bitmap(new DeleteBitmap(tablet_id)) { |
105 | 212 | TabletMetaPB tablet_meta_pb; |
106 | 212 | tablet_meta_pb.set_table_id(table_id); |
107 | 212 | tablet_meta_pb.set_partition_id(partition_id); |
108 | 212 | tablet_meta_pb.set_tablet_id(tablet_id); |
109 | 212 | tablet_meta_pb.set_replica_id(replica_id); |
110 | 212 | tablet_meta_pb.set_schema_hash(schema_hash); |
111 | 212 | tablet_meta_pb.set_shard_id(shard_id); |
112 | | // Persist the creation time, but it is not used |
113 | 212 | tablet_meta_pb.set_creation_time(time(nullptr)); |
114 | 212 | tablet_meta_pb.set_cumulative_layer_point(-1); |
115 | 212 | tablet_meta_pb.set_tablet_state(PB_RUNNING); |
116 | 212 | *(tablet_meta_pb.mutable_tablet_uid()) = tablet_uid.to_proto(); |
117 | 212 | tablet_meta_pb.set_tablet_type(tabletType == TTabletType::TABLET_TYPE_DISK |
118 | 212 | ? TabletTypePB::TABLET_TYPE_DISK |
119 | 212 | : TabletTypePB::TABLET_TYPE_MEMORY); |
120 | 212 | tablet_meta_pb.set_enable_unique_key_merge_on_write(enable_unique_key_merge_on_write); |
121 | 212 | tablet_meta_pb.set_storage_policy_id(storage_policy_id); |
122 | 212 | tablet_meta_pb.set_compaction_policy(compaction_policy); |
123 | 212 | tablet_meta_pb.set_time_series_compaction_goal_size_mbytes( |
124 | 212 | time_series_compaction_goal_size_mbytes); |
125 | 212 | tablet_meta_pb.set_time_series_compaction_file_count_threshold( |
126 | 212 | time_series_compaction_file_count_threshold); |
127 | 212 | tablet_meta_pb.set_time_series_compaction_time_threshold_seconds( |
128 | 212 | time_series_compaction_time_threshold_seconds); |
129 | 212 | tablet_meta_pb.set_time_series_compaction_empty_rowsets_threshold( |
130 | 212 | time_series_compaction_empty_rowsets_threshold); |
131 | 212 | tablet_meta_pb.set_time_series_compaction_level_threshold( |
132 | 212 | time_series_compaction_level_threshold); |
133 | 212 | TabletSchemaPB* schema = tablet_meta_pb.mutable_schema(); |
134 | 212 | schema->set_num_short_key_columns(tablet_schema.short_key_column_count); |
135 | 212 | schema->set_num_rows_per_row_block(config::default_num_rows_per_column_file_block); |
136 | 212 | schema->set_sequence_col_idx(tablet_schema.sequence_col_idx); |
137 | 212 | switch (tablet_schema.keys_type) { |
138 | 30 | case TKeysType::DUP_KEYS: |
139 | 30 | schema->set_keys_type(KeysType::DUP_KEYS); |
140 | 30 | break; |
141 | 73 | case TKeysType::UNIQUE_KEYS: |
142 | 73 | schema->set_keys_type(KeysType::UNIQUE_KEYS); |
143 | 73 | break; |
144 | 61 | case TKeysType::AGG_KEYS: |
145 | 61 | schema->set_keys_type(KeysType::AGG_KEYS); |
146 | 61 | break; |
147 | 48 | default: |
148 | 48 | LOG(WARNING) << "unknown tablet keys type"; |
149 | 48 | break; |
150 | 212 | } |
151 | | // compress_kind used to compress segment files |
152 | 212 | schema->set_compress_kind(COMPRESS_LZ4); |
153 | | |
154 | | // compression_type used to compress segment page |
155 | 212 | switch (compression_type) { |
156 | 0 | case TCompressionType::NO_COMPRESSION: |
157 | 0 | schema->set_compression_type(segment_v2::NO_COMPRESSION); |
158 | 0 | break; |
159 | 0 | case TCompressionType::SNAPPY: |
160 | 0 | schema->set_compression_type(segment_v2::SNAPPY); |
161 | 0 | break; |
162 | 0 | case TCompressionType::LZ4: |
163 | 0 | schema->set_compression_type(segment_v2::LZ4); |
164 | 0 | break; |
165 | 212 | case TCompressionType::LZ4F: |
166 | 212 | schema->set_compression_type(segment_v2::LZ4F); |
167 | 212 | break; |
168 | 0 | case TCompressionType::ZLIB: |
169 | 0 | schema->set_compression_type(segment_v2::ZLIB); |
170 | 0 | break; |
171 | 0 | case TCompressionType::ZSTD: |
172 | 0 | schema->set_compression_type(segment_v2::ZSTD); |
173 | 0 | break; |
174 | 0 | default: |
175 | 0 | schema->set_compression_type(segment_v2::LZ4F); |
176 | 0 | break; |
177 | 212 | } |
178 | | |
179 | 212 | switch (inverted_index_storage_format) { |
180 | 212 | case TInvertedIndexStorageFormat::V1: |
181 | 212 | schema->set_inverted_index_storage_format(InvertedIndexStorageFormatPB::V1); |
182 | 212 | break; |
183 | 0 | case TInvertedIndexStorageFormat::V2: |
184 | 0 | schema->set_inverted_index_storage_format(InvertedIndexStorageFormatPB::V2); |
185 | 0 | break; |
186 | 0 | default: |
187 | 0 | schema->set_inverted_index_storage_format(InvertedIndexStorageFormatPB::V1); |
188 | 0 | break; |
189 | 212 | } |
190 | | |
191 | 212 | switch (tablet_schema.sort_type) { |
192 | 0 | case TSortType::type::ZORDER: |
193 | 0 | schema->set_sort_type(SortType::ZORDER); |
194 | 0 | break; |
195 | 212 | default: |
196 | 212 | schema->set_sort_type(SortType::LEXICAL); |
197 | 212 | } |
198 | 212 | schema->set_sort_col_num(tablet_schema.sort_col_num); |
199 | 212 | for (const auto& i : tablet_schema.cluster_key_idxes) { |
200 | 0 | schema->add_cluster_key_idxes(i); |
201 | 0 | } |
202 | 212 | tablet_meta_pb.set_in_restore_mode(false); |
203 | | |
204 | | // set column information |
205 | 212 | uint32_t col_ordinal = 0; |
206 | 212 | bool has_bf_columns = false; |
207 | 1.41k | for (TColumn tcolumn : tablet_schema.columns) { |
208 | 1.41k | ColumnPB* column = schema->add_column(); |
209 | 1.41k | uint32_t unique_id = -1; |
210 | 1.41k | if (tcolumn.col_unique_id >= 0) { |
211 | 0 | unique_id = tcolumn.col_unique_id; |
212 | 1.41k | } else { |
213 | 1.41k | unique_id = col_ordinal_to_unique_id.at(col_ordinal); |
214 | 1.41k | } |
215 | 1.41k | col_ordinal++; |
216 | 1.41k | init_column_from_tcolumn(unique_id, tcolumn, column); |
217 | | |
218 | 1.41k | if (column->is_bf_column()) { |
219 | 0 | has_bf_columns = true; |
220 | 0 | } |
221 | | |
222 | 1.41k | if (tablet_schema.__isset.indexes) { |
223 | 0 | for (auto& index : tablet_schema.indexes) { |
224 | 0 | if (index.index_type == TIndexType::type::BITMAP) { |
225 | 0 | DCHECK_EQ(index.columns.size(), 1); |
226 | 0 | if (iequal(tcolumn.column_name, index.columns[0])) { |
227 | 0 | column->set_has_bitmap_index(true); |
228 | 0 | break; |
229 | 0 | } |
230 | 0 | } else if (index.index_type == TIndexType::type::BLOOMFILTER || |
231 | 0 | index.index_type == TIndexType::type::NGRAM_BF) { |
232 | 0 | DCHECK_EQ(index.columns.size(), 1); |
233 | 0 | if (iequal(tcolumn.column_name, index.columns[0])) { |
234 | 0 | column->set_is_bf_column(true); |
235 | 0 | break; |
236 | 0 | } |
237 | 0 | } |
238 | 0 | } |
239 | 0 | } |
240 | 1.41k | } |
241 | | |
242 | | // copy index meta |
243 | 212 | if (tablet_schema.__isset.indexes) { |
244 | 0 | for (auto& index : tablet_schema.indexes) { |
245 | 0 | TabletIndexPB* index_pb = schema->add_index(); |
246 | 0 | index_pb->set_index_id(index.index_id); |
247 | 0 | index_pb->set_index_name(index.index_name); |
248 | | // init col_unique_id in index at be side, since col_unique_id may be -1 at fe side |
249 | | // get column unique id by name |
250 | 0 | for (auto column_name : index.columns) { |
251 | 0 | for (auto column : schema->column()) { |
252 | 0 | if (iequal(column.name(), column_name)) { |
253 | 0 | index_pb->add_col_unique_id(column.unique_id()); |
254 | 0 | } |
255 | 0 | } |
256 | 0 | } |
257 | 0 | switch (index.index_type) { |
258 | 0 | case TIndexType::BITMAP: |
259 | 0 | index_pb->set_index_type(IndexType::BITMAP); |
260 | 0 | break; |
261 | 0 | case TIndexType::INVERTED: |
262 | 0 | index_pb->set_index_type(IndexType::INVERTED); |
263 | 0 | break; |
264 | 0 | case TIndexType::BLOOMFILTER: |
265 | 0 | index_pb->set_index_type(IndexType::BLOOMFILTER); |
266 | 0 | break; |
267 | 0 | case TIndexType::NGRAM_BF: |
268 | 0 | index_pb->set_index_type(IndexType::NGRAM_BF); |
269 | 0 | break; |
270 | 0 | } |
271 | | |
272 | 0 | if (index.__isset.properties) { |
273 | 0 | auto properties = index_pb->mutable_properties(); |
274 | 0 | for (auto kv : index.properties) { |
275 | 0 | (*properties)[kv.first] = kv.second; |
276 | 0 | } |
277 | 0 | } |
278 | 0 | } |
279 | 0 | } |
280 | | |
281 | 212 | schema->set_next_column_unique_id(next_unique_id); |
282 | 212 | if (has_bf_columns && tablet_schema.__isset.bloom_filter_fpp) { |
283 | 0 | schema->set_bf_fpp(tablet_schema.bloom_filter_fpp); |
284 | 0 | } |
285 | | |
286 | 212 | if (tablet_schema.__isset.is_in_memory) { |
287 | 0 | schema->set_is_in_memory(tablet_schema.is_in_memory); |
288 | 0 | } |
289 | | |
290 | 212 | if (tablet_schema.__isset.disable_auto_compaction) { |
291 | 0 | schema->set_disable_auto_compaction(tablet_schema.disable_auto_compaction); |
292 | 0 | } |
293 | | |
294 | 212 | if (tablet_schema.__isset.enable_single_replica_compaction) { |
295 | 212 | schema->set_enable_single_replica_compaction( |
296 | 212 | tablet_schema.enable_single_replica_compaction); |
297 | 212 | } |
298 | | |
299 | 212 | if (tablet_schema.__isset.delete_sign_idx) { |
300 | 212 | schema->set_delete_sign_idx(tablet_schema.delete_sign_idx); |
301 | 212 | } |
302 | 212 | if (tablet_schema.__isset.store_row_column) { |
303 | 212 | schema->set_store_row_column(tablet_schema.store_row_column); |
304 | 212 | } |
305 | 212 | if (tablet_schema.__isset.row_store_page_size) { |
306 | 212 | schema->set_row_store_page_size(tablet_schema.row_store_page_size); |
307 | 212 | } |
308 | 212 | if (tablet_schema.__isset.skip_write_index_on_load) { |
309 | 212 | schema->set_skip_write_index_on_load(tablet_schema.skip_write_index_on_load); |
310 | 212 | } |
311 | 212 | if (binlog_config.has_value()) { |
312 | 0 | BinlogConfig tmp_binlog_config; |
313 | 0 | tmp_binlog_config = binlog_config.value(); |
314 | 0 | tmp_binlog_config.to_pb(tablet_meta_pb.mutable_binlog_config()); |
315 | 0 | } |
316 | | |
317 | 212 | init_from_pb(tablet_meta_pb); |
318 | 212 | } |
319 | | |
320 | | TabletMeta::TabletMeta(const TabletMeta& b) |
321 | | : _table_id(b._table_id), |
322 | | _partition_id(b._partition_id), |
323 | | _tablet_id(b._tablet_id), |
324 | | _replica_id(b._replica_id), |
325 | | _schema_hash(b._schema_hash), |
326 | | _shard_id(b._shard_id), |
327 | | _creation_time(b._creation_time), |
328 | | _cumulative_layer_point(b._cumulative_layer_point), |
329 | | _tablet_uid(b._tablet_uid), |
330 | | _tablet_type(b._tablet_type), |
331 | | _tablet_state(b._tablet_state), |
332 | | _schema(b._schema), |
333 | | _rs_metas(b._rs_metas), |
334 | | _stale_rs_metas(b._stale_rs_metas), |
335 | | _in_restore_mode(b._in_restore_mode), |
336 | | _preferred_rowset_type(b._preferred_rowset_type), |
337 | | _storage_policy_id(b._storage_policy_id), |
338 | | _cooldown_meta_id(b._cooldown_meta_id), |
339 | | _enable_unique_key_merge_on_write(b._enable_unique_key_merge_on_write), |
340 | | _delete_bitmap(b._delete_bitmap), |
341 | | _binlog_config(b._binlog_config), |
342 | | _compaction_policy(b._compaction_policy), |
343 | | _time_series_compaction_goal_size_mbytes(b._time_series_compaction_goal_size_mbytes), |
344 | | _time_series_compaction_file_count_threshold( |
345 | | b._time_series_compaction_file_count_threshold), |
346 | | _time_series_compaction_time_threshold_seconds( |
347 | | b._time_series_compaction_time_threshold_seconds), |
348 | | _time_series_compaction_empty_rowsets_threshold( |
349 | | b._time_series_compaction_empty_rowsets_threshold), |
350 | 0 | _time_series_compaction_level_threshold(b._time_series_compaction_level_threshold) {}; |
351 | | |
352 | | void TabletMeta::init_column_from_tcolumn(uint32_t unique_id, const TColumn& tcolumn, |
353 | 1.41k | ColumnPB* column) { |
354 | 1.41k | column->set_unique_id(unique_id); |
355 | 1.41k | column->set_name(tcolumn.column_name); |
356 | 1.41k | column->set_has_bitmap_index(tcolumn.has_bitmap_index); |
357 | 1.41k | column->set_is_auto_increment(tcolumn.is_auto_increment); |
358 | 1.41k | string data_type; |
359 | 1.41k | EnumToString(TPrimitiveType, tcolumn.column_type.type, data_type); |
360 | 1.41k | column->set_type(data_type); |
361 | | |
362 | 1.41k | uint32_t length = TabletColumn::get_field_length_by_type(tcolumn.column_type.type, |
363 | 1.41k | tcolumn.column_type.len); |
364 | 1.41k | column->set_length(length); |
365 | 1.41k | column->set_index_length(length); |
366 | 1.41k | column->set_precision(tcolumn.column_type.precision); |
367 | 1.41k | column->set_frac(tcolumn.column_type.scale); |
368 | | |
369 | 1.41k | if (tcolumn.__isset.result_is_nullable) { |
370 | 0 | column->set_result_is_nullable(tcolumn.result_is_nullable); |
371 | 0 | } |
372 | | |
373 | 1.41k | if (tcolumn.column_type.type == TPrimitiveType::VARCHAR || |
374 | 1.41k | tcolumn.column_type.type == TPrimitiveType::STRING) { |
375 | 101 | if (!tcolumn.column_type.__isset.index_len) { |
376 | 101 | column->set_index_length(10); |
377 | 101 | } else { |
378 | 0 | column->set_index_length(tcolumn.column_type.index_len); |
379 | 0 | } |
380 | 101 | } |
381 | 1.41k | if (!tcolumn.is_key) { |
382 | 678 | column->set_is_key(false); |
383 | 678 | if (tcolumn.__isset.aggregation) { |
384 | 0 | column->set_aggregation(tcolumn.aggregation); |
385 | 678 | } else { |
386 | 678 | string aggregation_type; |
387 | 678 | EnumToString(TAggregationType, tcolumn.aggregation_type, aggregation_type); |
388 | 678 | column->set_aggregation(aggregation_type); |
389 | 678 | } |
390 | 741 | } else { |
391 | 741 | column->set_is_key(true); |
392 | 741 | column->set_aggregation("NONE"); |
393 | 741 | } |
394 | 1.41k | column->set_is_nullable(tcolumn.is_allow_null); |
395 | 1.41k | if (tcolumn.__isset.default_value) { |
396 | 0 | column->set_default_value(tcolumn.default_value); |
397 | 0 | } |
398 | 1.41k | if (tcolumn.__isset.is_bloom_filter_column) { |
399 | 0 | column->set_is_bf_column(tcolumn.is_bloom_filter_column); |
400 | 0 | } |
401 | 1.41k | for (size_t i = 0; i < tcolumn.children_column.size(); i++) { |
402 | 0 | ColumnPB* children_column = column->add_children_columns(); |
403 | 0 | init_column_from_tcolumn(tcolumn.children_column[i].col_unique_id, |
404 | 0 | tcolumn.children_column[i], children_column); |
405 | 0 | } |
406 | 1.41k | } |
407 | | |
408 | 5 | Status TabletMeta::create_from_file(const string& file_path) { |
409 | 5 | FileHeader<TabletMetaPB> file_header(file_path); |
410 | | // In file_header.deserialize(), it validates file length, signature, checksum of protobuf. |
411 | 5 | RETURN_IF_ERROR(file_header.deserialize()); |
412 | 5 | TabletMetaPB tablet_meta_pb; |
413 | 5 | try { |
414 | 5 | tablet_meta_pb.CopyFrom(file_header.message()); |
415 | 5 | } catch (...) { |
416 | 0 | return Status::Error<PARSE_PROTOBUF_ERROR>("fail to copy protocol buffer object. file={}", |
417 | 0 | file_path); |
418 | 0 | } |
419 | | |
420 | 5 | init_from_pb(tablet_meta_pb); |
421 | 5 | return Status::OK(); |
422 | 5 | } |
423 | | |
424 | | std::string TabletMeta::construct_header_file_path(const string& schema_hash_path, |
425 | 2 | int64_t tablet_id) { |
426 | 2 | std::stringstream header_name_stream; |
427 | 2 | header_name_stream << schema_hash_path << "/" << tablet_id << ".hdr"; |
428 | 2 | return header_name_stream.str(); |
429 | 2 | } |
430 | | |
431 | 0 | Status TabletMeta::save_as_json(const string& file_path, DataDir* dir) { |
432 | 0 | std::string json_meta; |
433 | 0 | json2pb::Pb2JsonOptions json_options; |
434 | 0 | json_options.pretty_json = true; |
435 | 0 | json_options.bytes_to_base64 = true; |
436 | 0 | to_json(&json_meta, json_options); |
437 | | // save to file |
438 | 0 | io::FileWriterPtr file_writer; |
439 | 0 | RETURN_IF_ERROR(dir->fs()->create_file(file_path, &file_writer)); |
440 | 0 | RETURN_IF_ERROR(file_writer->append(json_meta)); |
441 | 0 | RETURN_IF_ERROR(file_writer->close()); |
442 | 0 | return Status::OK(); |
443 | 0 | } |
444 | | |
445 | 6 | Status TabletMeta::save(const string& file_path) { |
446 | 6 | TabletMetaPB tablet_meta_pb; |
447 | 6 | to_meta_pb(&tablet_meta_pb); |
448 | 6 | return TabletMeta::save(file_path, tablet_meta_pb); |
449 | 6 | } |
450 | | |
451 | 8 | Status TabletMeta::save(const string& file_path, const TabletMetaPB& tablet_meta_pb) { |
452 | 8 | DCHECK(!file_path.empty()); |
453 | 8 | FileHeader<TabletMetaPB> file_header(file_path); |
454 | 8 | try { |
455 | 8 | file_header.mutable_message()->CopyFrom(tablet_meta_pb); |
456 | 8 | } catch (...) { |
457 | 0 | LOG(WARNING) << "fail to copy protocol buffer object. file='" << file_path; |
458 | 0 | return Status::Error<ErrorCode::INTERNAL_ERROR>( |
459 | 0 | "fail to copy protocol buffer object. file={}", file_path); |
460 | 0 | } |
461 | 8 | RETURN_IF_ERROR(file_header.prepare()); |
462 | 8 | RETURN_IF_ERROR(file_header.serialize()); |
463 | 8 | return Status::OK(); |
464 | 8 | } |
465 | | |
466 | 102 | Status TabletMeta::save_meta(DataDir* data_dir) { |
467 | 102 | std::lock_guard<std::shared_mutex> wrlock(_meta_lock); |
468 | 102 | return _save_meta(data_dir); |
469 | 102 | } |
470 | | |
471 | 102 | Status TabletMeta::_save_meta(DataDir* data_dir) { |
472 | | // check if tablet uid is valid |
473 | 102 | if (_tablet_uid.hi == 0 && _tablet_uid.lo == 0) { |
474 | 0 | LOG(FATAL) << "tablet_uid is invalid" |
475 | 0 | << " tablet=" << tablet_id() << " _tablet_uid=" << _tablet_uid.to_string(); |
476 | 0 | } |
477 | 102 | string meta_binary; |
478 | | |
479 | 102 | auto t1 = MonotonicMicros(); |
480 | 102 | serialize(&meta_binary); |
481 | 102 | auto t2 = MonotonicMicros(); |
482 | 102 | Status status = TabletMetaManager::save(data_dir, tablet_id(), schema_hash(), meta_binary); |
483 | 102 | if (!status.ok()) { |
484 | 0 | LOG(FATAL) << "fail to save tablet_meta. status=" << status << ", tablet_id=" << tablet_id() |
485 | 0 | << ", schema_hash=" << schema_hash(); |
486 | 0 | } |
487 | 102 | auto t3 = MonotonicMicros(); |
488 | 102 | auto cost = t3 - t1; |
489 | 102 | if (cost > 1 * 1000 * 1000) { |
490 | 0 | LOG(INFO) << "save tablet(" << tablet_id() << ") meta too slow. serialize cost " << t2 - t1 |
491 | 0 | << "(us), serialized binary size: " << meta_binary.length() |
492 | 0 | << "(bytes), write rocksdb cost " << t3 - t2 << "(us)"; |
493 | 0 | } |
494 | 102 | return status; |
495 | 102 | } |
496 | | |
497 | 105 | void TabletMeta::serialize(string* meta_binary) { |
498 | 105 | TabletMetaPB tablet_meta_pb; |
499 | 105 | to_meta_pb(&tablet_meta_pb); |
500 | 105 | if (tablet_meta_pb.partition_id() <= 0) { |
501 | 9 | LOG(WARNING) << "invalid partition id " << tablet_meta_pb.partition_id() << " tablet " |
502 | 9 | << tablet_meta_pb.tablet_id(); |
503 | 9 | } |
504 | 105 | DBUG_EXECUTE_IF("TabletMeta::serialize::zero_partition_id", { |
505 | 105 | long partition_id = tablet_meta_pb.partition_id(); |
506 | 105 | tablet_meta_pb.set_partition_id(0); |
507 | 105 | LOG(WARNING) << "set debug point TabletMeta::serialize::zero_partition_id old=" |
508 | 105 | << partition_id << " new=" << tablet_meta_pb.DebugString(); |
509 | 105 | }); |
510 | 105 | bool serialize_success = tablet_meta_pb.SerializeToString(meta_binary); |
511 | 105 | if (!_rs_metas.empty() || !_stale_rs_metas.empty()) { |
512 | 105 | _avg_rs_meta_serialize_size = |
513 | 105 | meta_binary->length() / (_rs_metas.size() + _stale_rs_metas.size()); |
514 | 105 | if (meta_binary->length() > config::tablet_meta_serialize_size_limit || |
515 | 105 | !serialize_success) { |
516 | 0 | int64_t origin_meta_size = meta_binary->length(); |
517 | 0 | int64_t stale_rowsets_num = tablet_meta_pb.stale_rs_metas().size(); |
518 | 0 | tablet_meta_pb.clear_stale_rs_metas(); |
519 | 0 | meta_binary->clear(); |
520 | 0 | serialize_success = tablet_meta_pb.SerializeToString(meta_binary); |
521 | 0 | LOG(WARNING) << "tablet meta serialization size exceeds limit: " |
522 | 0 | << config::tablet_meta_serialize_size_limit |
523 | 0 | << " clean up stale rowsets, tablet id: " << tablet_id() |
524 | 0 | << " stale rowset num: " << stale_rowsets_num |
525 | 0 | << " serialization size before clean " << origin_meta_size |
526 | 0 | << " serialization size after clean " << meta_binary->length(); |
527 | 0 | } |
528 | 105 | } |
529 | | |
530 | 105 | if (!serialize_success) { |
531 | 0 | LOG(FATAL) << "failed to serialize meta " << tablet_id(); |
532 | 0 | } |
533 | 105 | } |
534 | | |
535 | 10 | Status TabletMeta::deserialize(const string& meta_binary) { |
536 | 10 | TabletMetaPB tablet_meta_pb; |
537 | 10 | bool parsed = tablet_meta_pb.ParseFromString(meta_binary); |
538 | 10 | if (!parsed) { |
539 | 0 | return Status::Error<INIT_FAILED>("parse tablet meta failed"); |
540 | 0 | } |
541 | 10 | init_from_pb(tablet_meta_pb); |
542 | 10 | return Status::OK(); |
543 | 10 | } |
544 | | |
545 | 2 | void TabletMeta::init_rs_metas_fs(const io::FileSystemSPtr& fs) { |
546 | 4 | for (auto& rs_meta : _rs_metas) { |
547 | 4 | if (rs_meta->is_local()) { |
548 | 4 | rs_meta->set_fs(fs); |
549 | 4 | } |
550 | 4 | } |
551 | 2 | for (auto& rs_meta : _stale_rs_metas) { |
552 | 0 | if (rs_meta->is_local()) { |
553 | 0 | rs_meta->set_fs(fs); |
554 | 0 | } |
555 | 0 | } |
556 | 2 | } |
557 | | |
558 | 229 | void TabletMeta::init_from_pb(const TabletMetaPB& tablet_meta_pb) { |
559 | 229 | _table_id = tablet_meta_pb.table_id(); |
560 | 229 | _partition_id = tablet_meta_pb.partition_id(); |
561 | 229 | _tablet_id = tablet_meta_pb.tablet_id(); |
562 | 229 | _replica_id = tablet_meta_pb.replica_id(); |
563 | 229 | _schema_hash = tablet_meta_pb.schema_hash(); |
564 | 229 | _shard_id = tablet_meta_pb.shard_id(); |
565 | 229 | _creation_time = tablet_meta_pb.creation_time(); |
566 | 229 | _cumulative_layer_point = tablet_meta_pb.cumulative_layer_point(); |
567 | 229 | _tablet_uid = TabletUid(tablet_meta_pb.tablet_uid()); |
568 | 229 | if (tablet_meta_pb.has_tablet_type()) { |
569 | 229 | _tablet_type = tablet_meta_pb.tablet_type(); |
570 | 229 | } else { |
571 | 0 | _tablet_type = TabletTypePB::TABLET_TYPE_DISK; |
572 | 0 | } |
573 | | |
574 | | // init _tablet_state |
575 | 229 | switch (tablet_meta_pb.tablet_state()) { |
576 | 3 | case PB_NOTREADY: |
577 | 3 | _tablet_state = TabletState::TABLET_NOTREADY; |
578 | 3 | break; |
579 | 223 | case PB_RUNNING: |
580 | 223 | _tablet_state = TabletState::TABLET_RUNNING; |
581 | 223 | break; |
582 | 0 | case PB_TOMBSTONED: |
583 | 0 | _tablet_state = TabletState::TABLET_TOMBSTONED; |
584 | 0 | break; |
585 | 0 | case PB_STOPPED: |
586 | 0 | _tablet_state = TabletState::TABLET_STOPPED; |
587 | 0 | break; |
588 | 3 | case PB_SHUTDOWN: |
589 | 3 | _tablet_state = TabletState::TABLET_SHUTDOWN; |
590 | 3 | break; |
591 | 0 | default: |
592 | 0 | LOG(WARNING) << "tablet has no state. tablet=" << tablet_id() |
593 | 0 | << ", schema_hash=" << schema_hash(); |
594 | 229 | } |
595 | | |
596 | | // init _schema |
597 | 229 | _schema->init_from_pb(tablet_meta_pb.schema()); |
598 | | |
599 | 229 | if (tablet_meta_pb.has_enable_unique_key_merge_on_write()) { |
600 | 229 | _enable_unique_key_merge_on_write = tablet_meta_pb.enable_unique_key_merge_on_write(); |
601 | 229 | } |
602 | | |
603 | | // init _rs_metas |
604 | 229 | for (auto& it : tablet_meta_pb.rs_metas()) { |
605 | 27 | RowsetMetaSharedPtr rs_meta(new RowsetMeta()); |
606 | 27 | rs_meta->init_from_pb(it); |
607 | 27 | _rs_metas.push_back(std::move(rs_meta)); |
608 | 27 | } |
609 | | |
610 | | // For mow table, delete bitmap of stale rowsets has not been persisted. |
611 | | // When be restart, query should not read the stale rowset, otherwise duplicate keys |
612 | | // will be read out. Therefore, we don't add them to _stale_rs_meta for mow table. |
613 | 229 | if (!config::skip_loading_stale_rowset_meta && !_enable_unique_key_merge_on_write) { |
614 | 195 | for (auto& it : tablet_meta_pb.stale_rs_metas()) { |
615 | 0 | RowsetMetaSharedPtr rs_meta(new RowsetMeta()); |
616 | 0 | rs_meta->init_from_pb(it); |
617 | 0 | _stale_rs_metas.push_back(std::move(rs_meta)); |
618 | 0 | } |
619 | 195 | } |
620 | | |
621 | 229 | if (tablet_meta_pb.has_in_restore_mode()) { |
622 | 229 | _in_restore_mode = tablet_meta_pb.in_restore_mode(); |
623 | 229 | } |
624 | | |
625 | 229 | if (tablet_meta_pb.has_preferred_rowset_type()) { |
626 | 17 | _preferred_rowset_type = tablet_meta_pb.preferred_rowset_type(); |
627 | 17 | } |
628 | | |
629 | 229 | _storage_policy_id = tablet_meta_pb.storage_policy_id(); |
630 | 229 | if (tablet_meta_pb.has_cooldown_meta_id()) { |
631 | 0 | _cooldown_meta_id = tablet_meta_pb.cooldown_meta_id(); |
632 | 0 | } |
633 | | |
634 | 229 | if (tablet_meta_pb.has_delete_bitmap()) { |
635 | 0 | int rst_ids_size = tablet_meta_pb.delete_bitmap().rowset_ids_size(); |
636 | 0 | int seg_ids_size = tablet_meta_pb.delete_bitmap().segment_ids_size(); |
637 | 0 | int versions_size = tablet_meta_pb.delete_bitmap().versions_size(); |
638 | 0 | int seg_maps_size = tablet_meta_pb.delete_bitmap().segment_delete_bitmaps_size(); |
639 | 0 | CHECK(rst_ids_size == seg_ids_size && seg_ids_size == seg_maps_size && |
640 | 0 | seg_maps_size == versions_size); |
641 | 0 | for (size_t i = 0; i < rst_ids_size; ++i) { |
642 | 0 | RowsetId rst_id; |
643 | 0 | rst_id.init(tablet_meta_pb.delete_bitmap().rowset_ids(i)); |
644 | 0 | auto seg_id = tablet_meta_pb.delete_bitmap().segment_ids(i); |
645 | 0 | uint32_t ver = tablet_meta_pb.delete_bitmap().versions(i); |
646 | 0 | auto bitmap = tablet_meta_pb.delete_bitmap().segment_delete_bitmaps(i).data(); |
647 | 0 | delete_bitmap().delete_bitmap[{rst_id, seg_id, ver}] = roaring::Roaring::read(bitmap); |
648 | 0 | } |
649 | 0 | } |
650 | | |
651 | 229 | if (tablet_meta_pb.has_binlog_config()) { |
652 | 15 | _binlog_config = tablet_meta_pb.binlog_config(); |
653 | 15 | } |
654 | 229 | _compaction_policy = tablet_meta_pb.compaction_policy(); |
655 | 229 | _time_series_compaction_goal_size_mbytes = |
656 | 229 | tablet_meta_pb.time_series_compaction_goal_size_mbytes(); |
657 | 229 | _time_series_compaction_file_count_threshold = |
658 | 229 | tablet_meta_pb.time_series_compaction_file_count_threshold(); |
659 | 229 | _time_series_compaction_time_threshold_seconds = |
660 | 229 | tablet_meta_pb.time_series_compaction_time_threshold_seconds(); |
661 | 229 | _time_series_compaction_empty_rowsets_threshold = |
662 | 229 | tablet_meta_pb.time_series_compaction_empty_rowsets_threshold(); |
663 | 229 | _time_series_compaction_level_threshold = |
664 | 229 | tablet_meta_pb.time_series_compaction_level_threshold(); |
665 | 229 | } |
666 | | |
667 | 117 | void TabletMeta::to_meta_pb(TabletMetaPB* tablet_meta_pb) { |
668 | 117 | tablet_meta_pb->set_table_id(table_id()); |
669 | 117 | tablet_meta_pb->set_partition_id(partition_id()); |
670 | 117 | tablet_meta_pb->set_tablet_id(tablet_id()); |
671 | 117 | tablet_meta_pb->set_replica_id(replica_id()); |
672 | 117 | tablet_meta_pb->set_schema_hash(schema_hash()); |
673 | 117 | tablet_meta_pb->set_shard_id(shard_id()); |
674 | 117 | tablet_meta_pb->set_creation_time(creation_time()); |
675 | 117 | tablet_meta_pb->set_cumulative_layer_point(cumulative_layer_point()); |
676 | 117 | *(tablet_meta_pb->mutable_tablet_uid()) = tablet_uid().to_proto(); |
677 | 117 | tablet_meta_pb->set_tablet_type(_tablet_type); |
678 | 117 | switch (tablet_state()) { |
679 | 3 | case TABLET_NOTREADY: |
680 | 3 | tablet_meta_pb->set_tablet_state(PB_NOTREADY); |
681 | 3 | break; |
682 | 86 | case TABLET_RUNNING: |
683 | 86 | tablet_meta_pb->set_tablet_state(PB_RUNNING); |
684 | 86 | break; |
685 | 0 | case TABLET_TOMBSTONED: |
686 | 0 | tablet_meta_pb->set_tablet_state(PB_TOMBSTONED); |
687 | 0 | break; |
688 | 0 | case TABLET_STOPPED: |
689 | 0 | tablet_meta_pb->set_tablet_state(PB_STOPPED); |
690 | 0 | break; |
691 | 28 | case TABLET_SHUTDOWN: |
692 | 28 | tablet_meta_pb->set_tablet_state(PB_SHUTDOWN); |
693 | 28 | break; |
694 | 117 | } |
695 | | |
696 | 158 | for (auto& rs : _rs_metas) { |
697 | 158 | rs->to_rowset_pb(tablet_meta_pb->add_rs_metas()); |
698 | 158 | } |
699 | 117 | for (auto rs : _stale_rs_metas) { |
700 | 0 | rs->to_rowset_pb(tablet_meta_pb->add_stale_rs_metas()); |
701 | 0 | } |
702 | 117 | _schema->to_schema_pb(tablet_meta_pb->mutable_schema()); |
703 | | |
704 | 117 | tablet_meta_pb->set_in_restore_mode(in_restore_mode()); |
705 | | |
706 | | // to avoid modify tablet meta to the greatest extend |
707 | 117 | if (_preferred_rowset_type == BETA_ROWSET) { |
708 | 117 | tablet_meta_pb->set_preferred_rowset_type(_preferred_rowset_type); |
709 | 117 | } |
710 | 117 | if (_storage_policy_id > 0) { |
711 | 5 | tablet_meta_pb->set_storage_policy_id(_storage_policy_id); |
712 | 5 | } |
713 | 117 | if (_cooldown_meta_id.initialized()) { |
714 | 5 | tablet_meta_pb->mutable_cooldown_meta_id()->CopyFrom(_cooldown_meta_id.to_proto()); |
715 | 5 | } |
716 | | |
717 | 117 | tablet_meta_pb->set_enable_unique_key_merge_on_write(_enable_unique_key_merge_on_write); |
718 | | |
719 | 117 | if (_enable_unique_key_merge_on_write) { |
720 | 2 | std::set<RowsetId> stale_rs_ids; |
721 | 2 | for (const auto& rowset : _stale_rs_metas) { |
722 | 0 | stale_rs_ids.insert(rowset->rowset_id()); |
723 | 0 | } |
724 | 2 | DeleteBitmapPB* delete_bitmap_pb = tablet_meta_pb->mutable_delete_bitmap(); |
725 | 2 | for (auto& [id, bitmap] : delete_bitmap().snapshot().delete_bitmap) { |
726 | 2 | auto& [rowset_id, segment_id, ver] = id; |
727 | 2 | if (stale_rs_ids.count(rowset_id) != 0) { |
728 | 0 | continue; |
729 | 0 | } |
730 | 2 | delete_bitmap_pb->add_rowset_ids(rowset_id.to_string()); |
731 | 2 | delete_bitmap_pb->add_segment_ids(segment_id); |
732 | 2 | delete_bitmap_pb->add_versions(ver); |
733 | 2 | std::string bitmap_data(bitmap.getSizeInBytes(), '\0'); |
734 | 2 | bitmap.write(bitmap_data.data()); |
735 | 2 | *(delete_bitmap_pb->add_segment_delete_bitmaps()) = std::move(bitmap_data); |
736 | 2 | } |
737 | 2 | } |
738 | 117 | _binlog_config.to_pb(tablet_meta_pb->mutable_binlog_config()); |
739 | 117 | tablet_meta_pb->set_compaction_policy(compaction_policy()); |
740 | 117 | tablet_meta_pb->set_time_series_compaction_goal_size_mbytes( |
741 | 117 | time_series_compaction_goal_size_mbytes()); |
742 | 117 | tablet_meta_pb->set_time_series_compaction_file_count_threshold( |
743 | 117 | time_series_compaction_file_count_threshold()); |
744 | 117 | tablet_meta_pb->set_time_series_compaction_time_threshold_seconds( |
745 | 117 | time_series_compaction_time_threshold_seconds()); |
746 | 117 | tablet_meta_pb->set_time_series_compaction_empty_rowsets_threshold( |
747 | 117 | time_series_compaction_empty_rowsets_threshold()); |
748 | 117 | tablet_meta_pb->set_time_series_compaction_level_threshold( |
749 | 117 | time_series_compaction_level_threshold()); |
750 | 117 | } |
751 | | |
752 | 96 | int64_t TabletMeta::mem_size() const { |
753 | 96 | auto size = sizeof(TabletMeta); |
754 | 96 | size += _schema->mem_size(); |
755 | 96 | return size; |
756 | 96 | } |
757 | | |
758 | 2 | void TabletMeta::to_json(string* json_string, json2pb::Pb2JsonOptions& options) { |
759 | 2 | TabletMetaPB tablet_meta_pb; |
760 | 2 | to_meta_pb(&tablet_meta_pb); |
761 | 2 | json2pb::ProtoMessageToJson(tablet_meta_pb, json_string, options); |
762 | 2 | } |
763 | | |
764 | 91 | Version TabletMeta::max_version() const { |
765 | 91 | Version max_version = {-1, 0}; |
766 | 145 | for (auto& rs_meta : _rs_metas) { |
767 | 145 | if (rs_meta->end_version() > max_version.second) { |
768 | 143 | max_version = rs_meta->version(); |
769 | 143 | } |
770 | 145 | } |
771 | 91 | return max_version; |
772 | 91 | } |
773 | | |
774 | 0 | size_t TabletMeta::version_count_cross_with_range(const Version& range) const { |
775 | 0 | size_t count = 0; |
776 | 0 | for (const auto& rs_meta : _rs_metas) { |
777 | 0 | if (!(range.first > rs_meta->version().second || range.second < rs_meta->version().first)) { |
778 | 0 | count++; |
779 | 0 | } |
780 | 0 | } |
781 | 0 | return count; |
782 | 0 | } |
783 | | |
784 | 362 | Status TabletMeta::add_rs_meta(const RowsetMetaSharedPtr& rs_meta) { |
785 | | // check RowsetMeta is valid |
786 | 1.36k | for (auto& rs : _rs_metas) { |
787 | 1.36k | if (rs->version() == rs_meta->version()) { |
788 | 0 | if (rs->rowset_id() != rs_meta->rowset_id()) { |
789 | 0 | return Status::Error<PUSH_VERSION_ALREADY_EXIST>( |
790 | 0 | "version already exist. rowset_id={}, version={}, tablet={}", |
791 | 0 | rs->rowset_id().to_string(), rs->version().to_string(), tablet_id()); |
792 | 0 | } else { |
793 | | // rowsetid,version is equal, it is a duplicate req, skip it |
794 | 0 | return Status::OK(); |
795 | 0 | } |
796 | 0 | } |
797 | 1.36k | } |
798 | 362 | _rs_metas.push_back(rs_meta); |
799 | 362 | return Status::OK(); |
800 | 362 | } |
801 | | |
802 | | void TabletMeta::delete_rs_meta_by_version(const Version& version, |
803 | 0 | std::vector<RowsetMetaSharedPtr>* deleted_rs_metas) { |
804 | 0 | auto it = _rs_metas.begin(); |
805 | 0 | while (it != _rs_metas.end()) { |
806 | 0 | if ((*it)->version() == version) { |
807 | 0 | if (deleted_rs_metas != nullptr) { |
808 | 0 | deleted_rs_metas->push_back(*it); |
809 | 0 | } |
810 | 0 | _rs_metas.erase(it); |
811 | 0 | return; |
812 | 0 | } else { |
813 | 0 | ++it; |
814 | 0 | } |
815 | 0 | } |
816 | 0 | } |
817 | | |
818 | | void TabletMeta::modify_rs_metas(const std::vector<RowsetMetaSharedPtr>& to_add, |
819 | | const std::vector<RowsetMetaSharedPtr>& to_delete, |
820 | 11 | bool same_version) { |
821 | | // Remove to_delete rowsets from _rs_metas |
822 | 11 | for (auto rs_to_del : to_delete) { |
823 | 5 | auto it = _rs_metas.begin(); |
824 | 7 | while (it != _rs_metas.end()) { |
825 | 7 | if (rs_to_del->version() == (*it)->version()) { |
826 | 5 | _rs_metas.erase(it); |
827 | | // there should be only one rowset match the version |
828 | 5 | break; |
829 | 5 | } else { |
830 | 2 | ++it; |
831 | 2 | } |
832 | 7 | } |
833 | 5 | } |
834 | 11 | if (!same_version) { |
835 | | // put to_delete rowsets in _stale_rs_metas. |
836 | 6 | _stale_rs_metas.insert(_stale_rs_metas.end(), to_delete.begin(), to_delete.end()); |
837 | 6 | } |
838 | | // put to_add rowsets in _rs_metas. |
839 | 11 | _rs_metas.insert(_rs_metas.end(), to_add.begin(), to_add.end()); |
840 | 11 | } |
841 | | |
842 | | // Use the passing "rs_metas" to replace the rs meta in this tablet meta |
843 | | // Also clear the _stale_rs_metas because this tablet meta maybe copyied from |
844 | | // an existing tablet before. Add after revise, only the passing "rs_metas" |
845 | | // is needed. |
846 | 3 | void TabletMeta::revise_rs_metas(std::vector<RowsetMetaSharedPtr>&& rs_metas) { |
847 | 3 | std::lock_guard<std::shared_mutex> wrlock(_meta_lock); |
848 | 3 | _rs_metas = std::move(rs_metas); |
849 | 3 | _stale_rs_metas.clear(); |
850 | 3 | } |
851 | | |
852 | | // This method should call after revise_rs_metas, since new rs_metas might be a subset |
853 | | // of original tablet, we should revise the delete_bitmap according to current rowset. |
854 | | // |
855 | | // Delete bitmap is protected by Tablet::_meta_lock, we don't need to acquire the |
856 | | // TabletMeta's _meta_lock |
857 | 1 | void TabletMeta::revise_delete_bitmap_unlocked(const DeleteBitmap& delete_bitmap) { |
858 | 1 | _delete_bitmap = std::make_unique<DeleteBitmap>(tablet_id()); |
859 | 2 | for (auto rs : _rs_metas) { |
860 | 2 | DeleteBitmap rs_bm(tablet_id()); |
861 | 2 | delete_bitmap.subset({rs->rowset_id(), 0, 0}, {rs->rowset_id(), UINT32_MAX, INT64_MAX}, |
862 | 2 | &rs_bm); |
863 | 2 | _delete_bitmap->merge(rs_bm); |
864 | 2 | } |
865 | 1 | for (auto rs : _stale_rs_metas) { |
866 | 0 | DeleteBitmap rs_bm(tablet_id()); |
867 | 0 | delete_bitmap.subset({rs->rowset_id(), 0, 0}, {rs->rowset_id(), UINT32_MAX, INT64_MAX}, |
868 | 0 | &rs_bm); |
869 | 0 | _delete_bitmap->merge(rs_bm); |
870 | 0 | } |
871 | 1 | } |
872 | | |
873 | 0 | void TabletMeta::delete_stale_rs_meta_by_version(const Version& version) { |
874 | 0 | auto it = _stale_rs_metas.begin(); |
875 | 0 | while (it != _stale_rs_metas.end()) { |
876 | 0 | if ((*it)->version() == version) { |
877 | 0 | if (_enable_unique_key_merge_on_write) { |
878 | | // remove rowset delete bitmap |
879 | 0 | delete_bitmap().remove({(*it)->rowset_id(), 0, 0}, |
880 | 0 | {(*it)->rowset_id(), UINT32_MAX, 0}); |
881 | 0 | } |
882 | 0 | it = _stale_rs_metas.erase(it); |
883 | 0 | } else { |
884 | 0 | it++; |
885 | 0 | } |
886 | 0 | } |
887 | 0 | } |
888 | | |
889 | 0 | RowsetMetaSharedPtr TabletMeta::acquire_rs_meta_by_version(const Version& version) const { |
890 | 0 | for (auto it : _rs_metas) { |
891 | 0 | if (it->version() == version) { |
892 | 0 | return it; |
893 | 0 | } |
894 | 0 | } |
895 | 0 | return nullptr; |
896 | 0 | } |
897 | | |
898 | 8 | RowsetMetaSharedPtr TabletMeta::acquire_stale_rs_meta_by_version(const Version& version) const { |
899 | 8 | for (auto it : _stale_rs_metas) { |
900 | 0 | if (it->version() == version) { |
901 | 0 | return it; |
902 | 0 | } |
903 | 0 | } |
904 | 8 | return nullptr; |
905 | 8 | } |
906 | | |
907 | 22 | Status TabletMeta::set_partition_id(int64_t partition_id) { |
908 | 22 | if ((_partition_id > 0 && _partition_id != partition_id) || partition_id < 1) { |
909 | 0 | LOG(WARNING) << "cur partition id=" << _partition_id << " new partition id=" << partition_id |
910 | 0 | << " not equal"; |
911 | 0 | } |
912 | 22 | _partition_id = partition_id; |
913 | 22 | return Status::OK(); |
914 | 22 | } |
915 | | |
916 | 1 | bool operator==(const TabletMeta& a, const TabletMeta& b) { |
917 | 1 | if (a._table_id != b._table_id) return false; |
918 | 1 | if (a._partition_id != b._partition_id) return false; |
919 | 1 | if (a._tablet_id != b._tablet_id) return false; |
920 | 1 | if (a._replica_id != b._replica_id) return false; |
921 | 1 | if (a._schema_hash != b._schema_hash) return false; |
922 | 1 | if (a._shard_id != b._shard_id) return false; |
923 | 1 | if (a._creation_time != b._creation_time) return false; |
924 | 1 | if (a._cumulative_layer_point != b._cumulative_layer_point) return false; |
925 | 1 | if (a._tablet_uid != b._tablet_uid) return false; |
926 | 1 | if (a._tablet_type != b._tablet_type) return false; |
927 | 1 | if (a._tablet_state != b._tablet_state) return false; |
928 | 1 | if (*a._schema != *b._schema) return false; |
929 | 1 | if (a._rs_metas.size() != b._rs_metas.size()) return false; |
930 | 1 | for (int i = 0; i < a._rs_metas.size(); ++i) { |
931 | 0 | if (a._rs_metas[i] != b._rs_metas[i]) return false; |
932 | 0 | } |
933 | 1 | if (a._in_restore_mode != b._in_restore_mode) return false; |
934 | 1 | if (a._preferred_rowset_type != b._preferred_rowset_type) return false; |
935 | 1 | if (a._storage_policy_id != b._storage_policy_id) return false; |
936 | 1 | if (a._compaction_policy != b._compaction_policy) return false; |
937 | 1 | if (a._time_series_compaction_goal_size_mbytes != b._time_series_compaction_goal_size_mbytes) |
938 | 0 | return false; |
939 | 1 | if (a._time_series_compaction_file_count_threshold != |
940 | 1 | b._time_series_compaction_file_count_threshold) |
941 | 0 | return false; |
942 | 1 | if (a._time_series_compaction_time_threshold_seconds != |
943 | 1 | b._time_series_compaction_time_threshold_seconds) |
944 | 0 | return false; |
945 | 1 | if (a._time_series_compaction_empty_rowsets_threshold != |
946 | 1 | b._time_series_compaction_empty_rowsets_threshold) |
947 | 0 | return false; |
948 | 1 | if (a._time_series_compaction_level_threshold != b._time_series_compaction_level_threshold) |
949 | 0 | return false; |
950 | 1 | return true; |
951 | 1 | } |
952 | | |
953 | 0 | bool operator!=(const TabletMeta& a, const TabletMeta& b) { |
954 | 0 | return !(a == b); |
955 | 0 | } |
956 | | |
957 | 307 | DeleteBitmap::DeleteBitmap(int64_t tablet_id) : _tablet_id(tablet_id) { |
958 | | // The default delete bitmap cache is set to 100MB, |
959 | | // which can be insufficient and cause performance issues when the amount of user data is large. |
960 | | // To mitigate the problem of an inadequate cache, |
961 | | // we will take the larger of 0.5% of the total memory and 100MB as the delete bitmap cache size. |
962 | 307 | bool is_percent = false; |
963 | 307 | int64_t delete_bitmap_agg_cache_cache_limit = |
964 | 307 | ParseUtil::parse_mem_spec(config::delete_bitmap_dynamic_agg_cache_limit, |
965 | 307 | MemInfo::mem_limit(), MemInfo::physical_mem(), &is_percent); |
966 | 307 | _agg_cache.reset(new AggCache(delete_bitmap_agg_cache_cache_limit > |
967 | 307 | config::delete_bitmap_agg_cache_capacity |
968 | 307 | ? delete_bitmap_agg_cache_cache_limit |
969 | 307 | : config::delete_bitmap_agg_cache_capacity)); |
970 | 307 | } |
971 | | |
972 | 5 | DeleteBitmap::DeleteBitmap(const DeleteBitmap& o) { |
973 | 5 | delete_bitmap = o.delete_bitmap; // just copy data |
974 | 5 | _tablet_id = o._tablet_id; |
975 | 5 | } |
976 | | |
977 | 0 | DeleteBitmap& DeleteBitmap::operator=(const DeleteBitmap& o) { |
978 | 0 | delete_bitmap = o.delete_bitmap; // just copy data |
979 | 0 | _tablet_id = o._tablet_id; |
980 | 0 | return *this; |
981 | 0 | } |
982 | | |
983 | 0 | DeleteBitmap::DeleteBitmap(DeleteBitmap&& o) { |
984 | 0 | delete_bitmap = std::move(o.delete_bitmap); |
985 | 0 | _tablet_id = o._tablet_id; |
986 | 0 | } |
987 | | |
988 | 0 | DeleteBitmap& DeleteBitmap::operator=(DeleteBitmap&& o) { |
989 | 0 | delete_bitmap = std::move(o.delete_bitmap); |
990 | 0 | _tablet_id = o._tablet_id; |
991 | 0 | return *this; |
992 | 0 | } |
993 | | |
994 | 5 | DeleteBitmap DeleteBitmap::snapshot() const { |
995 | 5 | std::shared_lock l(lock); |
996 | 5 | return DeleteBitmap(*this); |
997 | 5 | } |
998 | | |
999 | 3 | DeleteBitmap DeleteBitmap::snapshot(Version version) const { |
1000 | | // Take snapshot first, then remove keys greater than given version. |
1001 | 3 | DeleteBitmap snapshot = this->snapshot(); |
1002 | 3 | auto it = snapshot.delete_bitmap.begin(); |
1003 | 412 | while (it != snapshot.delete_bitmap.end()) { |
1004 | 409 | if (std::get<2>(it->first) > version) { |
1005 | 4 | it = snapshot.delete_bitmap.erase(it); |
1006 | 405 | } else { |
1007 | 405 | it++; |
1008 | 405 | } |
1009 | 409 | } |
1010 | 3 | return snapshot; |
1011 | 3 | } |
1012 | | |
1013 | 459k | void DeleteBitmap::add(const BitmapKey& bmk, uint32_t row_id) { |
1014 | 459k | std::lock_guard l(lock); |
1015 | 459k | delete_bitmap[bmk].add(row_id); |
1016 | 459k | } |
1017 | | |
1018 | 0 | int DeleteBitmap::remove(const BitmapKey& bmk, uint32_t row_id) { |
1019 | 0 | std::lock_guard l(lock); |
1020 | 0 | auto it = delete_bitmap.find(bmk); |
1021 | 0 | if (it == delete_bitmap.end()) return -1; |
1022 | 0 | it->second.remove(row_id); |
1023 | 0 | return 0; |
1024 | 0 | } |
1025 | | |
1026 | 8 | void DeleteBitmap::remove(const BitmapKey& start, const BitmapKey& end) { |
1027 | 8 | std::lock_guard l(lock); |
1028 | 107 | for (auto it = delete_bitmap.lower_bound(start); it != delete_bitmap.end();) { |
1029 | 101 | auto& [k, _] = *it; |
1030 | 101 | if (k >= end) { |
1031 | 2 | break; |
1032 | 2 | } |
1033 | 99 | it = delete_bitmap.erase(it); |
1034 | 99 | } |
1035 | 8 | } |
1036 | | |
1037 | 6 | bool DeleteBitmap::contains(const BitmapKey& bmk, uint32_t row_id) const { |
1038 | 6 | std::shared_lock l(lock); |
1039 | 6 | auto it = delete_bitmap.find(bmk); |
1040 | 6 | return it != delete_bitmap.end() && it->second.contains(row_id); |
1041 | 6 | } |
1042 | | |
1043 | 2 | bool DeleteBitmap::contains_agg(const BitmapKey& bmk, uint32_t row_id) const { |
1044 | 2 | return get_agg(bmk)->contains(row_id); |
1045 | 2 | } |
1046 | | |
1047 | 0 | bool DeleteBitmap::empty() const { |
1048 | 0 | std::shared_lock l(lock); |
1049 | 0 | return delete_bitmap.empty(); |
1050 | 0 | } |
1051 | | |
1052 | 0 | uint64_t DeleteBitmap::cardinality() const { |
1053 | 0 | uint64_t res = 0; |
1054 | 0 | for (auto entry : delete_bitmap) { |
1055 | 0 | res += entry.second.cardinality(); |
1056 | 0 | } |
1057 | 0 | return res; |
1058 | 0 | } |
1059 | | |
1060 | 1 | bool DeleteBitmap::contains_agg_without_cache(const BitmapKey& bmk, uint32_t row_id) const { |
1061 | 1 | std::shared_lock l(lock); |
1062 | 1 | DeleteBitmap::BitmapKey start {std::get<0>(bmk), std::get<1>(bmk), 0}; |
1063 | 1 | for (auto it = delete_bitmap.lower_bound(start); it != delete_bitmap.end(); ++it) { |
1064 | 0 | auto& [k, bm] = *it; |
1065 | 0 | if (std::get<0>(k) != std::get<0>(bmk) || std::get<1>(k) != std::get<1>(bmk) || |
1066 | 0 | std::get<2>(k) > std::get<2>(bmk)) { |
1067 | 0 | break; |
1068 | 0 | } |
1069 | 0 | if (bm.contains(row_id)) { |
1070 | 0 | return true; |
1071 | 0 | } |
1072 | 0 | } |
1073 | 1 | return false; |
1074 | 1 | } |
1075 | | |
1076 | 38 | int DeleteBitmap::set(const BitmapKey& bmk, const roaring::Roaring& segment_delete_bitmap) { |
1077 | 38 | std::lock_guard l(lock); |
1078 | 38 | auto [_, inserted] = delete_bitmap.insert_or_assign(bmk, segment_delete_bitmap); |
1079 | 38 | return inserted; |
1080 | 38 | } |
1081 | | |
1082 | 3 | int DeleteBitmap::get(const BitmapKey& bmk, roaring::Roaring* segment_delete_bitmap) const { |
1083 | 3 | std::shared_lock l(lock); |
1084 | 3 | auto it = delete_bitmap.find(bmk); |
1085 | 3 | if (it == delete_bitmap.end()) return -1; |
1086 | 3 | *segment_delete_bitmap = it->second; // copy |
1087 | 3 | return 0; |
1088 | 3 | } |
1089 | | |
1090 | 54 | const roaring::Roaring* DeleteBitmap::get(const BitmapKey& bmk) const { |
1091 | 54 | std::shared_lock l(lock); |
1092 | 54 | auto it = delete_bitmap.find(bmk); |
1093 | 54 | if (it == delete_bitmap.end()) return nullptr; |
1094 | 41 | return &(it->second); // get address |
1095 | 54 | } |
1096 | | |
1097 | | void DeleteBitmap::subset(const BitmapKey& start, const BitmapKey& end, |
1098 | 3 | DeleteBitmap* subset_rowset_map) const { |
1099 | 3 | roaring::Roaring roaring; |
1100 | 3 | DCHECK(start < end); |
1101 | 3 | std::shared_lock l(lock); |
1102 | 26 | for (auto it = delete_bitmap.lower_bound(start); it != delete_bitmap.end(); ++it) { |
1103 | 25 | auto& [k, bm] = *it; |
1104 | 25 | if (k >= end) { |
1105 | 2 | break; |
1106 | 2 | } |
1107 | 23 | subset_rowset_map->set(k, bm); |
1108 | 23 | } |
1109 | 3 | } |
1110 | | |
1111 | 2 | void DeleteBitmap::merge(const BitmapKey& bmk, const roaring::Roaring& segment_delete_bitmap) { |
1112 | 2 | std::lock_guard l(lock); |
1113 | 2 | auto [iter, succ] = delete_bitmap.emplace(bmk, segment_delete_bitmap); |
1114 | 2 | if (!succ) { |
1115 | 0 | iter->second |= segment_delete_bitmap; |
1116 | 0 | } |
1117 | 2 | } |
1118 | | |
1119 | 8 | void DeleteBitmap::merge(const DeleteBitmap& other) { |
1120 | 8 | std::lock_guard l(lock); |
1121 | 29 | for (auto& i : other.delete_bitmap) { |
1122 | 29 | auto [j, succ] = this->delete_bitmap.insert(i); |
1123 | 29 | if (!succ) j->second |= i.second; |
1124 | 29 | } |
1125 | 8 | } |
1126 | | |
1127 | | // We cannot just copy the underlying memory to construct a string |
1128 | | // due to equivalent objects may have different padding bytes. |
1129 | | // Reading padding bytes is undefined behavior, neither copy nor |
1130 | | // placement new will help simplify the code. |
1131 | | // Refer to C11 standards §6.2.6.1/6 and §6.7.9/21 for more info. |
1132 | 44 | static std::string agg_cache_key(int64_t tablet_id, const DeleteBitmap::BitmapKey& bmk) { |
1133 | 44 | std::string ret(sizeof(tablet_id) + sizeof(bmk), '\0'); |
1134 | 44 | *reinterpret_cast<int64_t*>(ret.data()) = tablet_id; |
1135 | 44 | auto t = reinterpret_cast<DeleteBitmap::BitmapKey*>(ret.data() + sizeof(tablet_id)); |
1136 | 44 | std::get<RowsetId>(*t).version = std::get<RowsetId>(bmk).version; |
1137 | 44 | std::get<RowsetId>(*t).hi = std::get<RowsetId>(bmk).hi; |
1138 | 44 | std::get<RowsetId>(*t).mi = std::get<RowsetId>(bmk).mi; |
1139 | 44 | std::get<RowsetId>(*t).lo = std::get<RowsetId>(bmk).lo; |
1140 | 44 | std::get<1>(*t) = std::get<1>(bmk); |
1141 | 44 | std::get<2>(*t) = std::get<2>(bmk); |
1142 | 44 | return ret; |
1143 | 44 | } |
1144 | | |
1145 | 44 | std::shared_ptr<roaring::Roaring> DeleteBitmap::get_agg(const BitmapKey& bmk) const { |
1146 | 44 | std::string key_str = agg_cache_key(_tablet_id, bmk); // Cache key container |
1147 | 44 | CacheKey key(key_str); |
1148 | 44 | Cache::Handle* handle = _agg_cache->repr()->lookup(key); |
1149 | | |
1150 | 44 | AggCache::Value* val = |
1151 | 44 | handle == nullptr |
1152 | 44 | ? nullptr |
1153 | 44 | : reinterpret_cast<AggCache::Value*>(_agg_cache->repr()->value(handle)); |
1154 | | // FIXME: do we need a mutex here to get rid of duplicated initializations |
1155 | | // of cache entries in some cases? |
1156 | 44 | if (val == nullptr) { // Renew if needed, put a new Value to cache |
1157 | 38 | val = new AggCache::Value(); |
1158 | 38 | { |
1159 | 38 | std::shared_lock l(lock); |
1160 | 38 | DeleteBitmap::BitmapKey start {std::get<0>(bmk), std::get<1>(bmk), 0}; |
1161 | 69 | for (auto it = delete_bitmap.lower_bound(start); it != delete_bitmap.end(); ++it) { |
1162 | 66 | auto& [k, bm] = *it; |
1163 | 66 | if (std::get<0>(k) != std::get<0>(bmk) || std::get<1>(k) != std::get<1>(bmk) || |
1164 | 66 | std::get<2>(k) > std::get<2>(bmk)) { |
1165 | 35 | break; |
1166 | 35 | } |
1167 | 31 | val->bitmap |= bm; |
1168 | 31 | } |
1169 | 38 | } |
1170 | 38 | size_t charge = val->bitmap.getSizeInBytes() + sizeof(AggCache::Value); |
1171 | 38 | handle = _agg_cache->repr()->insert(key, val, charge, charge, CachePriority::NORMAL); |
1172 | 38 | } |
1173 | | |
1174 | | // It is natural for the cache to reclaim the underlying memory |
1175 | 44 | return std::shared_ptr<roaring::Roaring>( |
1176 | 44 | &val->bitmap, [this, handle](...) { _agg_cache->repr()->release(handle); }); |
1177 | 44 | } |
1178 | | |
1179 | | std::atomic<DeleteBitmap::AggCachePolicy*> DeleteBitmap::AggCache::s_repr {nullptr}; |
1180 | | |
1181 | 0 | std::string tablet_state_name(TabletState state) { |
1182 | 0 | switch (state) { |
1183 | 0 | case TABLET_NOTREADY: |
1184 | 0 | return "TABLET_NOTREADY"; |
1185 | | |
1186 | 0 | case TABLET_RUNNING: |
1187 | 0 | return "TABLET_RUNNING"; |
1188 | | |
1189 | 0 | case TABLET_TOMBSTONED: |
1190 | 0 | return "TABLET_TOMBSTONED"; |
1191 | | |
1192 | 0 | case TABLET_STOPPED: |
1193 | 0 | return "TABLET_STOPPED"; |
1194 | | |
1195 | 0 | case TABLET_SHUTDOWN: |
1196 | 0 | return "TABLET_SHUTDOWN"; |
1197 | | |
1198 | 0 | default: |
1199 | 0 | return "TabletState(" + std::to_string(state) + ")"; |
1200 | 0 | } |
1201 | 0 | } |
1202 | | |
1203 | | } // namespace doris |