be/src/storage/segment/vertical_segment_writer.cpp
Line | Count | Source |
1 | | // Licensed to the Apache Software Foundation (ASF) under one |
2 | | // or more contributor license agreements. See the NOTICE file |
3 | | // distributed with this work for additional information |
4 | | // regarding copyright ownership. The ASF licenses this file |
5 | | // to you under the Apache License, Version 2.0 (the |
6 | | // "License"); you may not use this file except in compliance |
7 | | // with the License. You may obtain a copy of the License at |
8 | | // |
9 | | // http://www.apache.org/licenses/LICENSE-2.0 |
10 | | // |
11 | | // Unless required by applicable law or agreed to in writing, |
12 | | // software distributed under the License is distributed on an |
13 | | // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY |
14 | | // KIND, either express or implied. See the License for the |
15 | | // specific language governing permissions and limitations |
16 | | // under the License. |
17 | | |
18 | | #include "storage/segment/vertical_segment_writer.h" |
19 | | |
20 | | #include <crc32c/crc32c.h> |
21 | | #include <gen_cpp/olap_file.pb.h> |
22 | | #include <gen_cpp/segment_v2.pb.h> |
23 | | #include <parallel_hashmap/phmap.h> |
24 | | |
25 | | #include <cassert> |
26 | | #include <memory> |
27 | | #include <ostream> |
28 | | #include <string> |
29 | | #include <unordered_map> |
30 | | #include <unordered_set> |
31 | | #include <utility> |
32 | | |
33 | | #include "cloud/config.h" |
34 | | #include "common/cast_set.h" |
35 | | #include "common/compiler_util.h" // IWYU pragma: keep |
36 | | #include "common/config.h" |
37 | | #include "common/logging.h" // LOG |
38 | | #include "common/status.h" |
39 | | #include "core/assert_cast.h" |
40 | | #include "core/block/block.h" |
41 | | #include "core/block/column_with_type_and_name.h" |
42 | | #include "core/column/column_nullable.h" |
43 | | #include "core/column/column_vector.h" |
44 | | #include "core/data_type/data_type.h" |
45 | | #include "core/data_type/data_type_factory.hpp" |
46 | | #include "core/data_type/data_type_number.h" // IWYU pragma: keep |
47 | | #include "core/types.h" |
48 | | #include "exec/common/variant_util.h" |
49 | | #include "io/fs/file_writer.h" |
50 | | #include "io/fs/local_file_system.h" |
51 | | #include "runtime/exec_env.h" |
52 | | #include "runtime/memory/mem_tracker.h" |
53 | | #include "service/point_query_executor.h" |
54 | | #include "storage/data_dir.h" |
55 | | #include "storage/index/index_file_writer.h" |
56 | | #include "storage/index/inverted/inverted_index_desc.h" |
57 | | #include "storage/index/inverted/inverted_index_fs_directory.h" |
58 | | #include "storage/index/primary_key_index.h" |
59 | | #include "storage/index/short_key_index.h" |
60 | | #include "storage/iterator/olap_data_convertor.h" |
61 | | #include "storage/key_coder.h" |
62 | | #include "storage/olap_common.h" |
63 | | #include "storage/partial_update_info.h" |
64 | | #include "storage/row_cursor.h" // RowCursor // IWYU pragma: keep |
65 | | #include "storage/rowset/rowset_fwd.h" |
66 | | #include "storage/rowset/rowset_writer_context.h" // RowsetWriterContext |
67 | | #include "storage/rowset/segment_creator.h" |
68 | | #include "storage/segment/column_writer.h" // ColumnWriter |
69 | | #include "storage/segment/external_col_meta_util.h" |
70 | | #include "storage/segment/page_io.h" |
71 | | #include "storage/segment/page_pointer.h" |
72 | | #include "storage/segment/segment_loader.h" |
73 | | #include "storage/segment/variant/variant_ext_meta_writer.h" |
74 | | #include "storage/tablet/base_tablet.h" |
75 | | #include "storage/tablet/tablet_schema.h" |
76 | | #include "storage/utils.h" |
77 | | #include "util/coding.h" |
78 | | #include "util/debug_points.h" |
79 | | #include "util/faststring.h" |
80 | | #include "util/json/path_in_data.h" |
81 | | #include "util/jsonb/serialize.h" |
82 | | namespace doris::segment_v2 { |
83 | | |
84 | | using namespace ErrorCode; |
85 | | using namespace KeyConsts; |
86 | | |
// Magic string constant and its length in bytes (the terminating NUL is not counted).
// Both are compile-time constants; the pointer itself is const so it cannot be re-seated.
static constexpr const char* k_segment_magic = "D0R1";
static constexpr uint32_t k_segment_magic_length = 4;
89 | | |
90 | 1 | Block materialize_rows_in_block_slice(const RowsInBlock& data) { |
91 | 1 | DCHECK_LE(data.row_pos, data.block->rows()); |
92 | 1 | DCHECK_LE(data.num_rows, data.block->rows() - data.row_pos); |
93 | | |
94 | 1 | Block sliced_block = data.block->clone_empty(); |
95 | 5 | for (size_t cid = 0; cid < data.block->columns(); ++cid) { |
96 | 4 | sliced_block.replace_by_position( |
97 | 4 | cid, data.block->get_by_position(cid).column->cut(data.row_pos, data.num_rows)); |
98 | 4 | } |
99 | 1 | return sliced_block; |
100 | 1 | } |
101 | | |
// Builds the label used for a segment writer's MemTracker:
// "VerticalSegmentWriter:Segment-<segment_id>".
inline std::string vertical_segment_writer_mem_tracker_name(uint32_t segment_id) {
    std::string name("VerticalSegmentWriter:Segment-");
    name += std::to_string(segment_id);
    return name;
}
105 | | |
106 | | VerticalSegmentWriter::VerticalSegmentWriter(io::FileWriter* file_writer, uint32_t segment_id, |
107 | | TabletSchemaSPtr tablet_schema, BaseTabletSPtr tablet, |
108 | | DataDir* data_dir, |
109 | | const VerticalSegmentWriterOptions& opts, |
110 | | IndexFileWriter* index_file_writer) |
111 | 12 | : _segment_id(segment_id), |
112 | 12 | _tablet_schema(std::move(tablet_schema)), |
113 | 12 | _tablet(std::move(tablet)), |
114 | 12 | _data_dir(data_dir), |
115 | 12 | _opts(opts), |
116 | 12 | _file_writer(file_writer), |
117 | 12 | _index_file_writer(index_file_writer), |
118 | 12 | _mem_tracker(std::make_unique<MemTracker>( |
119 | 12 | vertical_segment_writer_mem_tracker_name(segment_id))), |
120 | 12 | _mow_context(std::move(opts.mow_ctx)), |
121 | 12 | _block_aggregator(*this) { |
122 | 12 | CHECK_NOTNULL(file_writer); |
123 | 12 | _num_sort_key_columns = _tablet_schema->num_key_columns(); |
124 | 12 | _num_short_key_columns = _tablet_schema->num_short_key_columns(); |
125 | 12 | if (!_is_mow_with_cluster_key()) { |
126 | 11 | DCHECK(_num_sort_key_columns >= _num_short_key_columns) |
127 | 0 | << ", table_id=" << _tablet_schema->table_id() |
128 | 0 | << ", num_key_columns=" << _num_sort_key_columns |
129 | 0 | << ", num_short_key_columns=" << _num_short_key_columns |
130 | 0 | << ", cluster_key_columns=" << _tablet_schema->cluster_key_uids().size(); |
131 | 11 | } |
132 | 43 | for (size_t cid = 0; cid < _num_sort_key_columns; ++cid) { |
133 | 31 | const auto& column = _tablet_schema->column(cid); |
134 | 31 | _key_coders.push_back(get_key_coder(column.type())); |
135 | 31 | _key_index_size.push_back(cast_set<uint16_t>(column.index_length())); |
136 | 31 | } |
137 | | // encode the sequence id into the primary key index |
138 | 12 | if (_is_mow()) { |
139 | 3 | if (_tablet_schema->has_sequence_col()) { |
140 | 3 | const auto& column = _tablet_schema->column(_tablet_schema->sequence_col_idx()); |
141 | 3 | _seq_coder = get_key_coder(column.type()); |
142 | 3 | } |
143 | | // encode the rowid into the primary key index |
144 | 3 | if (_is_mow_with_cluster_key()) { |
145 | 1 | const auto* type_info = get_scalar_type_info<FieldType::OLAP_FIELD_TYPE_UNSIGNED_INT>(); |
146 | 1 | _rowid_coder = get_key_coder(type_info->type()); |
147 | | // primary keys |
148 | 1 | _primary_key_coders.swap(_key_coders); |
149 | | // cluster keys |
150 | 1 | _key_coders.clear(); |
151 | 1 | _key_index_size.clear(); |
152 | 1 | _num_sort_key_columns = _tablet_schema->cluster_key_uids().size(); |
153 | 2 | for (auto cid : _tablet_schema->cluster_key_uids()) { |
154 | 2 | const auto& column = _tablet_schema->column_by_uid(cid); |
155 | 2 | _key_coders.push_back(get_key_coder(column.type())); |
156 | 2 | _key_index_size.push_back(cast_set<uint16_t>(column.index_length())); |
157 | 2 | } |
158 | 1 | } |
159 | 3 | } |
160 | 12 | } |
161 | | |
162 | 12 | VerticalSegmentWriter::~VerticalSegmentWriter() { |
163 | 12 | _mem_tracker->release(_mem_tracker->consumption()); |
164 | 12 | } |
165 | | |
166 | | void VerticalSegmentWriter::_init_column_meta(ColumnMetaPB* meta, uint32_t column_id, |
167 | 61 | const TabletColumn& column) { |
168 | 61 | meta->set_column_id(column_id); |
169 | 61 | meta->set_type(int(column.type())); |
170 | 61 | meta->set_length(cast_set<int32_t>(column.length())); |
171 | 61 | meta->set_encoding(DEFAULT_ENCODING); |
172 | 61 | meta->set_compression(_opts.compression_type); |
173 | 61 | meta->set_is_nullable(column.is_nullable()); |
174 | 61 | meta->set_default_value(column.default_value()); |
175 | 61 | meta->set_precision(column.precision()); |
176 | 61 | meta->set_frac(column.frac()); |
177 | 61 | if (column.has_path_info()) { |
178 | 0 | column.path_info_ptr()->to_protobuf(meta->mutable_column_path_info(), |
179 | 0 | column.parent_unique_id()); |
180 | 0 | } |
181 | 61 | meta->set_unique_id(column.unique_id()); |
182 | 61 | for (uint32_t i = 0; i < column.get_subtype_count(); ++i) { |
183 | 0 | _init_column_meta(meta->add_children_columns(), column_id, column.get_sub_column(i)); |
184 | 0 | } |
185 | 61 | if (column.is_variant_type()) { |
186 | 0 | meta->set_variant_max_subcolumns_count(column.variant_max_subcolumns_count()); |
187 | 0 | meta->set_variant_enable_doc_mode(column.variant_enable_doc_mode()); |
188 | 0 | } |
189 | 61 | meta->set_result_is_nullable(column.get_result_is_nullable()); |
190 | 61 | meta->set_function_name(column.get_aggregation_name()); |
191 | 61 | meta->set_be_exec_version(column.get_be_exec_version()); |
192 | 61 | } |
193 | | |
194 | | Status VerticalSegmentWriter::_create_column_writer(uint32_t cid, const TabletColumn& column, |
195 | 61 | const TabletSchemaSPtr& tablet_schema) { |
196 | 61 | ColumnWriterOptions opts; |
197 | 61 | opts.meta = _footer.add_columns(); |
198 | | |
199 | 61 | _init_column_meta(opts.meta, cid, column); |
200 | | |
201 | | // now we create zone map for key columns in AGG_KEYS or all column in UNIQUE_KEYS or DUP_KEYS |
202 | | // except for columns whose type don't support zone map. |
203 | 61 | opts.need_zone_map = column.is_key() || tablet_schema->keys_type() != KeysType::AGG_KEYS; |
204 | 61 | opts.need_bloom_filter = column.is_bf_column(); |
205 | 61 | if (opts.need_bloom_filter) { |
206 | 0 | opts.bf_options.fpp = |
207 | 0 | tablet_schema->has_bf_fpp() ? tablet_schema->bloom_filter_fpp() : 0.05; |
208 | 0 | } |
209 | 61 | auto* tablet_index = tablet_schema->get_ngram_bf_index(column.unique_id()); |
210 | 61 | if (tablet_index) { |
211 | 0 | opts.need_bloom_filter = true; |
212 | 0 | opts.is_ngram_bf_index = true; |
213 | | //narrow convert from int32_t to uint8_t and uint16_t which is dangerous |
214 | 0 | auto gram_size = tablet_index->get_gram_size(); |
215 | 0 | auto gram_bf_size = tablet_index->get_gram_bf_size(); |
216 | 0 | if (gram_size > 256 || gram_size < 1) { |
217 | 0 | return Status::NotSupported("Do not support ngram bloom filter for ngram_size: ", |
218 | 0 | gram_size); |
219 | 0 | } |
220 | 0 | if (gram_bf_size > 65535 || gram_bf_size < 64) { |
221 | 0 | return Status::NotSupported("Do not support ngram bloom filter for bf_size: ", |
222 | 0 | gram_bf_size); |
223 | 0 | } |
224 | 0 | opts.gram_size = cast_set<uint8_t>(gram_size); |
225 | 0 | opts.gram_bf_size = cast_set<uint16_t>(gram_bf_size); |
226 | 0 | } |
227 | | |
228 | 61 | bool skip_inverted_index = false; |
229 | 61 | if (_opts.rowset_ctx != nullptr) { |
230 | | // skip write inverted index for index compaction column |
231 | 61 | skip_inverted_index = |
232 | 61 | _opts.rowset_ctx->columns_to_do_index_compaction.contains(column.unique_id()); |
233 | 61 | } |
234 | | // skip write inverted index on load if skip_write_index_on_load is true |
235 | 61 | if (_opts.write_type == DataWriteType::TYPE_DIRECT && |
236 | 61 | tablet_schema->skip_write_index_on_load()) { |
237 | 0 | skip_inverted_index = true; |
238 | 0 | } |
239 | 61 | if (!skip_inverted_index) { |
240 | 61 | auto inverted_indexs = tablet_schema->inverted_indexs(column); |
241 | 61 | if (!inverted_indexs.empty()) { |
242 | 0 | opts.inverted_indexes = inverted_indexs; |
243 | 0 | opts.need_inverted_index = true; |
244 | 0 | DCHECK(_index_file_writer != nullptr); |
245 | 0 | } |
246 | 61 | } |
247 | 61 | opts.index_file_writer = _index_file_writer; |
248 | | |
249 | 61 | if (const auto& index = tablet_schema->ann_index(column); index != nullptr) { |
250 | 0 | opts.ann_index = index; |
251 | 0 | opts.need_ann_index = true; |
252 | 0 | DCHECK(_index_file_writer != nullptr); |
253 | 0 | opts.index_file_writer = _index_file_writer; |
254 | 0 | } |
255 | | |
256 | 61 | #define DISABLE_INDEX_IF_FIELD_TYPE(TYPE) \ |
257 | 549 | if (column.type() == FieldType::OLAP_FIELD_TYPE_##TYPE) { \ |
258 | 0 | opts.need_zone_map = false; \ |
259 | 0 | opts.need_bloom_filter = false; \ |
260 | 0 | } |
261 | | |
262 | 61 | DISABLE_INDEX_IF_FIELD_TYPE(STRUCT) |
263 | 61 | DISABLE_INDEX_IF_FIELD_TYPE(ARRAY) |
264 | 61 | DISABLE_INDEX_IF_FIELD_TYPE(JSONB) |
265 | 61 | DISABLE_INDEX_IF_FIELD_TYPE(AGG_STATE) |
266 | 61 | DISABLE_INDEX_IF_FIELD_TYPE(MAP) |
267 | 61 | DISABLE_INDEX_IF_FIELD_TYPE(BITMAP) |
268 | 61 | DISABLE_INDEX_IF_FIELD_TYPE(HLL) |
269 | 61 | DISABLE_INDEX_IF_FIELD_TYPE(QUANTILE_STATE) |
270 | 61 | DISABLE_INDEX_IF_FIELD_TYPE(VARIANT) |
271 | | |
272 | 61 | #undef DISABLE_INDEX_IF_FIELD_TYPE |
273 | | |
274 | 61 | #undef CHECK_FIELD_TYPE |
275 | | |
276 | 61 | int64_t storage_page_size = _tablet_schema->storage_page_size(); |
277 | | // storage_page_size must be between 4KB and 10MB. |
278 | 61 | if (storage_page_size >= 4096 && storage_page_size <= 10485760) { |
279 | 61 | opts.data_page_size = storage_page_size; |
280 | 61 | } |
281 | 61 | opts.dict_page_size = _tablet_schema->storage_dict_page_size(); |
282 | 61 | DBUG_EXECUTE_IF("VerticalSegmentWriter._create_column_writer.storage_page_size", { |
283 | 61 | auto table_id = DebugPoints::instance()->get_debug_param_or_default<int64_t>( |
284 | 61 | "VerticalSegmentWriter._create_column_writer.storage_page_size", "table_id", |
285 | 61 | INT_MIN); |
286 | 61 | auto target_data_page_size = DebugPoints::instance()->get_debug_param_or_default<int64_t>( |
287 | 61 | "VerticalSegmentWriter._create_column_writer.storage_page_size", |
288 | 61 | "storage_page_size", INT_MIN); |
289 | 61 | if (table_id == INT_MIN || target_data_page_size == INT_MIN) { |
290 | 61 | return Status::Error<ErrorCode::INTERNAL_ERROR>( |
291 | 61 | "Debug point parameters missing: either 'table_id' or 'storage_page_size' not " |
292 | 61 | "set."); |
293 | 61 | } |
294 | 61 | if (table_id == _tablet_schema->table_id() && |
295 | 61 | opts.data_page_size != target_data_page_size) { |
296 | 61 | return Status::Error<ErrorCode::INTERNAL_ERROR>( |
297 | 61 | "Mismatch in 'storage_page_size': expected size does not match the current " |
298 | 61 | "data page size. " |
299 | 61 | "Expected: " + |
300 | 61 | std::to_string(target_data_page_size) + |
301 | 61 | ", Actual: " + std::to_string(opts.data_page_size) + "."); |
302 | 61 | } |
303 | 61 | }) |
304 | 61 | if (column.is_row_store_column()) { |
305 | | // smaller page size for row store column |
306 | 0 | auto page_size = _tablet_schema->row_store_page_size(); |
307 | 0 | opts.data_page_size = |
308 | 0 | (page_size > 0) ? page_size : segment_v2::ROW_STORE_PAGE_SIZE_DEFAULT_VALUE; |
309 | 0 | } |
310 | | |
311 | 61 | opts.rowset_ctx = _opts.rowset_ctx; |
312 | 61 | opts.file_writer = _file_writer; |
313 | 61 | opts.compression_type = _opts.compression_type; |
314 | 61 | opts.footer = &_footer; |
315 | 61 | opts.input_rs_readers = _opts.rowset_ctx->input_rs_readers; |
316 | | |
317 | 61 | opts.encoding_preference = {.integer_type_default_use_plain_encoding = |
318 | 61 | _tablet_schema->integer_type_default_use_plain_encoding(), |
319 | 61 | .binary_plain_encoding_default_impl = |
320 | 61 | _tablet_schema->binary_plain_encoding_default_impl()}; |
321 | 61 | std::unique_ptr<ColumnWriter> writer; |
322 | 61 | RETURN_IF_ERROR(ColumnWriter::create(opts, &column, _file_writer, &writer)); |
323 | 61 | RETURN_IF_ERROR(writer->init()); |
324 | 61 | _column_writers[cid] = std::move(writer); |
325 | 61 | _olap_data_convertor->add_column_data_convertor_at(column, cid); |
326 | 61 | return Status::OK(); |
327 | 61 | }; |
328 | | |
329 | 12 | Status VerticalSegmentWriter::init() { |
330 | 12 | DCHECK(_column_writers.empty()); |
331 | 12 | if (_opts.compression_type == UNKNOWN_COMPRESSION) { |
332 | 0 | _opts.compression_type = _tablet_schema->compression_type(); |
333 | 0 | } |
334 | 12 | _olap_data_convertor = std::make_unique<OlapBlockDataConvertor>(); |
335 | 12 | _olap_data_convertor->resize(_tablet_schema->num_columns()); |
336 | 12 | _column_writers.resize(_tablet_schema->num_columns()); |
337 | | // we don't need the short key index for unique key merge on write table. |
338 | 12 | if (_is_mow()) { |
339 | 3 | size_t seq_col_length = 0; |
340 | 3 | if (_tablet_schema->has_sequence_col()) { |
341 | 3 | seq_col_length = |
342 | 3 | _tablet_schema->column(_tablet_schema->sequence_col_idx()).length() + 1; |
343 | 3 | } |
344 | 3 | size_t rowid_length = 0; |
345 | 3 | if (_is_mow_with_cluster_key()) { |
346 | 1 | rowid_length = PrimaryKeyIndexReader::ROW_ID_LENGTH; |
347 | 1 | _short_key_index_builder.reset( |
348 | 1 | new ShortKeyIndexBuilder(_segment_id, _opts.num_rows_per_block)); |
349 | 1 | } |
350 | 3 | _primary_key_index_builder.reset( |
351 | 3 | new PrimaryKeyIndexBuilder(_file_writer, seq_col_length, rowid_length)); |
352 | 3 | RETURN_IF_ERROR(_primary_key_index_builder->init()); |
353 | 9 | } else { |
354 | 9 | _short_key_index_builder.reset( |
355 | 9 | new ShortKeyIndexBuilder(_segment_id, _opts.num_rows_per_block)); |
356 | 9 | } |
357 | 12 | return Status::OK(); |
358 | 12 | } |
359 | | |
360 | 8 | void VerticalSegmentWriter::_maybe_invalid_row_cache(const std::string& key) const { |
361 | | // Just invalid row cache for simplicity, since the rowset is not visible at present. |
362 | | // If we update/insert cache, if load failed rowset will not be visible but cached data |
363 | | // will be visible, and lead to inconsistency. |
364 | 8 | if (!config::disable_storage_row_cache && _tablet_schema->has_row_store_for_all_columns() && |
365 | 8 | _opts.write_type == DataWriteType::TYPE_DIRECT) { |
366 | | // invalidate cache |
367 | 0 | RowCache::instance()->erase({_opts.rowset_ctx->tablet_id, key}); |
368 | 0 | } |
369 | 8 | } |
370 | | |
371 | 12 | void VerticalSegmentWriter::_serialize_block_to_row_column(const Block& block) { |
372 | 12 | if (block.rows() == 0) { |
373 | 0 | return; |
374 | 0 | } |
375 | 12 | MonotonicStopWatch watch; |
376 | 12 | watch.start(); |
377 | 12 | int row_column_id = 0; |
378 | 73 | for (int i = 0; i < _tablet_schema->num_columns(); ++i) { |
379 | 61 | if (_tablet_schema->column(i).is_row_store_column()) { |
380 | 0 | auto* row_store_column = static_cast<ColumnString*>( |
381 | 0 | block.get_by_position(i).column->assume_mutable_ref().assume_mutable().get()); |
382 | 0 | row_store_column->clear(); |
383 | 0 | DataTypeSerDeSPtrs serdes = create_data_type_serdes(block.get_data_types()); |
384 | 0 | std::unordered_set<int> row_store_cids_set(_tablet_schema->row_columns_uids().begin(), |
385 | 0 | _tablet_schema->row_columns_uids().end()); |
386 | 0 | JsonbSerializeUtil::block_to_jsonb(*_tablet_schema, block, *row_store_column, |
387 | 0 | cast_set<int>(_tablet_schema->num_columns()), serdes, |
388 | 0 | row_store_cids_set); |
389 | 0 | break; |
390 | 0 | } |
391 | 61 | } |
392 | | |
393 | 12 | VLOG_DEBUG << "serialize , num_rows:" << block.rows() << ", row_column_id:" << row_column_id |
394 | 0 | << ", total_byte_size:" << block.allocated_bytes() << ", serialize_cost(us)" |
395 | 0 | << watch.elapsed_time() / 1000; |
396 | 12 | } |
397 | | |
398 | | Status VerticalSegmentWriter::_probe_key_for_mow( |
399 | | std::string key, std::size_t segment_pos, bool have_input_seq_column, bool have_delete_sign, |
400 | | const std::vector<RowsetSharedPtr>& specified_rowsets, |
401 | | std::vector<std::unique_ptr<SegmentCacheHandle>>& segment_caches, |
402 | | bool& has_default_or_nullable, std::vector<bool>& use_default_or_null_flag, |
403 | | const std::function<void(const RowLocation& loc)>& found_cb, |
404 | 0 | const std::function<Status()>& not_found_cb, PartialUpdateStats& stats) { |
405 | 0 | RowLocation loc; |
406 | | // save rowset shared ptr so this rowset wouldn't delete |
407 | 0 | RowsetSharedPtr rowset; |
408 | 0 | auto st = _tablet->lookup_row_key(key, _tablet_schema.get(), have_input_seq_column, |
409 | 0 | specified_rowsets, &loc, _mow_context->max_version, |
410 | 0 | segment_caches, &rowset); |
411 | 0 | if (st.is<KEY_NOT_FOUND>()) { |
412 | 0 | if (!have_delete_sign) { |
413 | 0 | RETURN_IF_ERROR(not_found_cb()); |
414 | 0 | } |
415 | 0 | ++stats.num_rows_new_added; |
416 | 0 | has_default_or_nullable = true; |
417 | 0 | use_default_or_null_flag.emplace_back(true); |
418 | 0 | return Status::OK(); |
419 | 0 | } |
420 | 0 | if (!st.ok() && !st.is<KEY_ALREADY_EXISTS>()) { |
421 | 0 | LOG(WARNING) << "failed to lookup row key, error: " << st; |
422 | 0 | return st; |
423 | 0 | } |
424 | | |
425 | | // 1. if the delete sign is marked, it means that the value columns of the row will not |
426 | | // be read. So we don't need to read the missing values from the previous rows. |
427 | | // 2. the one exception is when there are sequence columns in the table, we need to read |
428 | | // the sequence columns, otherwise it may cause the merge-on-read based compaction |
429 | | // policy to produce incorrect results |
430 | | |
431 | | // 3. In flexible partial update, we may delete the existing rows before if there exists |
432 | | // insert after delete in one load. In this case, the insert should also be treated |
433 | | // as newly inserted rows, note that the sequence column value is filled in |
434 | | // BlockAggregator::aggregate_for_insert_after_delete() if this row doesn't specify the sequence column |
435 | 0 | if (st.is<KEY_ALREADY_EXISTS>() || (have_delete_sign && !_tablet_schema->has_sequence_col()) || |
436 | 0 | (_opts.rowset_ctx->partial_update_info->is_flexible_partial_update() && |
437 | 0 | _mow_context->delete_bitmap->contains( |
438 | 0 | {loc.rowset_id, loc.segment_id, DeleteBitmap::TEMP_VERSION_COMMON}, loc.row_id))) { |
439 | 0 | has_default_or_nullable = true; |
440 | 0 | use_default_or_null_flag.emplace_back(true); |
441 | 0 | } else { |
442 | | // partial update should not contain invisible columns |
443 | 0 | use_default_or_null_flag.emplace_back(false); |
444 | 0 | _rsid_to_rowset.emplace(rowset->rowset_id(), rowset); |
445 | 0 | found_cb(loc); |
446 | 0 | } |
447 | |
|
448 | 0 | if (st.is<KEY_ALREADY_EXISTS>()) { |
449 | | // although we need to mark delete current row, we still need to read missing columns |
450 | | // for this row, we need to ensure that each column is aligned |
451 | 0 | _mow_context->delete_bitmap->add( |
452 | 0 | {_opts.rowset_ctx->rowset_id, _segment_id, DeleteBitmap::TEMP_VERSION_COMMON}, |
453 | 0 | cast_set<uint32_t>(segment_pos)); |
454 | 0 | ++stats.num_rows_deleted; |
455 | 0 | } else { |
456 | 0 | _mow_context->delete_bitmap->add( |
457 | 0 | {loc.rowset_id, loc.segment_id, DeleteBitmap::TEMP_VERSION_COMMON}, loc.row_id); |
458 | 0 | ++stats.num_rows_updated; |
459 | 0 | } |
460 | 0 | return Status::OK(); |
461 | 0 | } |
462 | | |
463 | 61 | Status VerticalSegmentWriter::_finalize_column_writer_and_update_meta(size_t cid) { |
464 | 61 | RETURN_IF_ERROR(_column_writers[cid]->finish()); |
465 | 61 | RETURN_IF_ERROR(_column_writers[cid]->write_data()); |
466 | | |
467 | 61 | auto* column_meta = _column_writers[cid]->get_column_meta(); |
468 | 61 | column_meta->set_compressed_data_bytes( |
469 | 61 | _column_writers[cid]->get_total_compressed_data_pages_bytes()); |
470 | 61 | column_meta->set_uncompressed_data_bytes( |
471 | 61 | _column_writers[cid]->get_total_uncompressed_data_pages_bytes()); |
472 | 61 | column_meta->set_raw_data_bytes(_column_writers[cid]->get_raw_data_bytes()); |
473 | 61 | return Status::OK(); |
474 | 61 | } |
475 | | |
476 | | Status VerticalSegmentWriter::_partial_update_preconditions_check(size_t row_pos, |
477 | 0 | bool is_flexible_update) { |
478 | 0 | if (!_is_mow()) { |
479 | 0 | auto msg = fmt::format( |
480 | 0 | "Can only do partial update on merge-on-write unique table, but found: " |
481 | 0 | "keys_type={}, _opts.enable_unique_key_merge_on_write={}, tablet_id={}", |
482 | 0 | _tablet_schema->keys_type(), _opts.enable_unique_key_merge_on_write, |
483 | 0 | _tablet->tablet_id()); |
484 | 0 | DCHECK(false) << msg; |
485 | 0 | return Status::InternalError<false>(msg); |
486 | 0 | } |
487 | 0 | if (_opts.rowset_ctx->partial_update_info == nullptr) { |
488 | 0 | auto msg = |
489 | 0 | fmt::format("partial_update_info should not be nullptr, please check, tablet_id={}", |
490 | 0 | _tablet->tablet_id()); |
491 | 0 | DCHECK(false) << msg; |
492 | 0 | return Status::InternalError<false>(msg); |
493 | 0 | } |
494 | 0 | if (!is_flexible_update) { |
495 | 0 | if (!_opts.rowset_ctx->partial_update_info->is_fixed_partial_update()) { |
496 | 0 | auto msg = fmt::format( |
497 | 0 | "in fixed partial update code, but update_mode={}, please check, tablet_id={}", |
498 | 0 | _opts.rowset_ctx->partial_update_info->update_mode(), _tablet->tablet_id()); |
499 | 0 | DCHECK(false) << msg; |
500 | 0 | return Status::InternalError<false>(msg); |
501 | 0 | } |
502 | 0 | } else { |
503 | 0 | if (!_opts.rowset_ctx->partial_update_info->is_flexible_partial_update()) { |
504 | 0 | auto msg = fmt::format( |
505 | 0 | "in flexible partial update code, but update_mode={}, please check, " |
506 | 0 | "tablet_id={}", |
507 | 0 | _opts.rowset_ctx->partial_update_info->update_mode(), _tablet->tablet_id()); |
508 | 0 | DCHECK(false) << msg; |
509 | 0 | return Status::InternalError<false>(msg); |
510 | 0 | } |
511 | 0 | } |
512 | 0 | if (!is_flexible_update && row_pos != 0) { |
513 | 0 | auto msg = fmt::format("row_pos should be 0, but found {}, tablet_id={}", row_pos, |
514 | 0 | _tablet->tablet_id()); |
515 | 0 | DCHECK(false) << msg; |
516 | 0 | return Status::InternalError<false>(msg); |
517 | 0 | } |
518 | 0 | return Status::OK(); |
519 | 0 | } |
520 | | |
521 | | // for partial update, we should do following steps to fill content of block: |
522 | | // 1. set block data to data convertor, and get all key_column's converted slice |
523 | | // 2. get pk of input block, and read missing columns |
524 | | // 2.1 first find key location{rowset_id, segment_id, row_id} |
525 | | // 2.2 build read plan to read by batch |
526 | | // 2.3 fill block |
527 | | // 3. set columns to data convertor and then write all columns |
528 | | Status VerticalSegmentWriter::_append_block_with_partial_content(RowsInBlock& data, |
529 | 0 | Block& full_block) { |
530 | 0 | DBUG_EXECUTE_IF("_append_block_with_partial_content.block", DBUG_BLOCK); |
531 | |
|
532 | 0 | RETURN_IF_ERROR(_partial_update_preconditions_check(data.row_pos, false)); |
533 | | // create full block and fill with input columns |
534 | 0 | full_block = _tablet_schema->create_block(); |
535 | 0 | const auto& including_cids = _opts.rowset_ctx->partial_update_info->update_cids; |
536 | 0 | size_t input_id = 0; |
537 | 0 | for (auto i : including_cids) { |
538 | 0 | full_block.replace_by_position(i, data.block->get_by_position(input_id++).column); |
539 | 0 | } |
540 | |
|
541 | 0 | if (_opts.rowset_ctx->write_type != DataWriteType::TYPE_COMPACTION && |
542 | 0 | _tablet_schema->num_variant_columns() > 0) { |
543 | 0 | RETURN_IF_ERROR(variant_util::parse_and_materialize_variant_columns( |
544 | 0 | full_block, *_tablet_schema, including_cids)); |
545 | 0 | } |
546 | 0 | bool have_input_seq_column = false; |
547 | | // write including columns |
548 | 0 | std::vector<IOlapColumnDataAccessor*> key_columns; |
549 | 0 | IOlapColumnDataAccessor* seq_column = nullptr; |
550 | 0 | uint32_t segment_start_pos = 0; |
551 | 0 | for (auto cid : including_cids) { |
552 | 0 | RETURN_IF_ERROR(_create_column_writer(cid, _tablet_schema->column(cid), _tablet_schema)); |
553 | 0 | RETURN_IF_ERROR(_olap_data_convertor->set_source_content_with_specifid_columns( |
554 | 0 | &full_block, data.row_pos, data.num_rows, std::vector<uint32_t> {cid})); |
555 | | // here we get segment column row num before append data. |
556 | 0 | segment_start_pos = cast_set<uint32_t>(_column_writers[cid]->get_next_rowid()); |
557 | | // olap data convertor alway start from id = 0 |
558 | 0 | auto [status, column] = _olap_data_convertor->convert_column_data(cid); |
559 | 0 | if (!status.ok()) { |
560 | 0 | return status; |
561 | 0 | } |
562 | 0 | if (cid < _num_sort_key_columns) { |
563 | 0 | key_columns.push_back(column); |
564 | 0 | } else if (_tablet_schema->has_sequence_col() && |
565 | 0 | cid == _tablet_schema->sequence_col_idx()) { |
566 | 0 | seq_column = column; |
567 | 0 | have_input_seq_column = true; |
568 | 0 | } |
569 | 0 | RETURN_IF_ERROR(_column_writers[cid]->append(column->get_nullmap(), column->get_data(), |
570 | 0 | data.num_rows)); |
571 | 0 | RETURN_IF_ERROR(_finalize_column_writer_and_update_meta(cid)); |
572 | | // Don't clear source content for key columns and sequence column here, |
573 | | // as they will be used later in _full_encode_keys() and _generate_primary_key_index(). |
574 | | // They will be cleared at the end of this method. |
575 | 0 | bool is_key_column = (cid < _num_sort_key_columns); |
576 | 0 | bool is_seq_column = (_tablet_schema->has_sequence_col() && |
577 | 0 | cid == _tablet_schema->sequence_col_idx() && have_input_seq_column); |
578 | 0 | if (!is_key_column && !is_seq_column) { |
579 | 0 | _olap_data_convertor->clear_source_content(cid); |
580 | 0 | } |
581 | 0 | } |
582 | | |
583 | 0 | bool has_default_or_nullable = false; |
584 | 0 | std::vector<bool> use_default_or_null_flag; |
585 | 0 | use_default_or_null_flag.reserve(data.num_rows); |
586 | 0 | const auto* delete_signs = |
587 | 0 | BaseTablet::get_delete_sign_column_data(full_block, data.row_pos + data.num_rows); |
588 | |
|
589 | 0 | DBUG_EXECUTE_IF("VerticalSegmentWriter._append_block_with_partial_content.sleep", |
590 | 0 | { sleep(60); }) |
591 | 0 | const std::vector<RowsetSharedPtr>& specified_rowsets = _mow_context->rowset_ptrs; |
592 | 0 | std::vector<std::unique_ptr<SegmentCacheHandle>> segment_caches(specified_rowsets.size()); |
593 | |
|
594 | 0 | FixedReadPlan read_plan; |
595 | | |
596 | | // locate rows in base data |
597 | 0 | PartialUpdateStats stats; |
598 | |
|
599 | 0 | for (size_t block_pos = data.row_pos; block_pos < data.row_pos + data.num_rows; block_pos++) { |
600 | | // block segment |
601 | | // 2 -> 0 |
602 | | // 3 -> 1 |
603 | | // 4 -> 2 |
604 | | // 5 -> 3 |
605 | | // here row_pos = 2, num_rows = 4. |
606 | 0 | size_t delta_pos = block_pos - data.row_pos; |
607 | 0 | size_t segment_pos = segment_start_pos + delta_pos; |
608 | 0 | std::string key = _full_encode_keys(key_columns, delta_pos); |
609 | 0 | _maybe_invalid_row_cache(key); |
610 | 0 | if (have_input_seq_column) { |
611 | 0 | _encode_seq_column(seq_column, delta_pos, &key); |
612 | 0 | } |
613 | | // If the table have sequence column, and the include-cids don't contain the sequence |
614 | | // column, we need to update the primary key index builder at the end of this method. |
615 | | // At that time, we have a valid sequence column to encode the key with seq col. |
616 | 0 | if (!_tablet_schema->has_sequence_col() || have_input_seq_column) { |
617 | 0 | RETURN_IF_ERROR(_primary_key_index_builder->add_item(key)); |
618 | 0 | } |
619 | | |
620 | | // mark key with delete sign as deleted. |
621 | 0 | bool have_delete_sign = (delete_signs != nullptr && delete_signs[block_pos] != 0); |
622 | |
|
623 | 0 | auto not_found_cb = [&]() { |
624 | 0 | return _opts.rowset_ctx->partial_update_info->handle_new_key( |
625 | 0 | *_tablet_schema, [&]() -> std::string { |
626 | 0 | return data.block->dump_one_line(block_pos, |
627 | 0 | cast_set<int>(_num_sort_key_columns)); |
628 | 0 | }); |
629 | 0 | }; |
630 | 0 | auto update_read_plan = [&](const RowLocation& loc) { |
631 | 0 | read_plan.prepare_to_read(loc, segment_pos); |
632 | 0 | }; |
633 | 0 | RETURN_IF_ERROR(_probe_key_for_mow(std::move(key), segment_pos, have_input_seq_column, |
634 | 0 | have_delete_sign, specified_rowsets, segment_caches, |
635 | 0 | has_default_or_nullable, use_default_or_null_flag, |
636 | 0 | update_read_plan, not_found_cb, stats)); |
637 | 0 | } |
638 | 0 | CHECK_EQ(use_default_or_null_flag.size(), data.num_rows); |
639 | |
|
640 | 0 | if (config::enable_merge_on_write_correctness_check) { |
641 | 0 | _tablet->add_sentinel_mark_to_delete_bitmap(_mow_context->delete_bitmap.get(), |
642 | 0 | *_mow_context->rowset_ids); |
643 | 0 | } |
644 | | |
645 | | // read to fill full_block |
646 | 0 | RETURN_IF_ERROR(read_plan.fill_missing_columns( |
647 | 0 | _opts.rowset_ctx, _rsid_to_rowset, *_tablet_schema, full_block, |
648 | 0 | use_default_or_null_flag, has_default_or_nullable, segment_start_pos, data.block)); |
649 | | |
650 | 0 | if (_tablet_schema->num_variant_columns() > 0) { |
651 | 0 | RETURN_IF_ERROR(variant_util::parse_and_materialize_variant_columns( |
652 | 0 | full_block, *_tablet_schema, _opts.rowset_ctx->partial_update_info->missing_cids)); |
653 | 0 | } |
654 | | |
655 | | // row column should be filled here |
656 | | // convert block to row store format |
657 | 0 | _serialize_block_to_row_column(full_block); |
658 | | |
659 | | // convert missing columns and send to column writer |
660 | 0 | const auto& missing_cids = _opts.rowset_ctx->partial_update_info->missing_cids; |
661 | 0 | for (auto cid : missing_cids) { |
662 | 0 | RETURN_IF_ERROR(_create_column_writer(cid, _tablet_schema->column(cid), _tablet_schema)); |
663 | 0 | RETURN_IF_ERROR(_olap_data_convertor->set_source_content_with_specifid_columns( |
664 | 0 | &full_block, data.row_pos, data.num_rows, std::vector<uint32_t> {cid})); |
665 | 0 | auto [status, column] = _olap_data_convertor->convert_column_data(cid); |
666 | 0 | if (!status.ok()) { |
667 | 0 | return status; |
668 | 0 | } |
669 | 0 | if (_tablet_schema->has_sequence_col() && !have_input_seq_column && |
670 | 0 | cid == _tablet_schema->sequence_col_idx()) { |
671 | 0 | DCHECK_EQ(seq_column, nullptr); |
672 | 0 | seq_column = column; |
673 | 0 | } |
674 | 0 | RETURN_IF_ERROR(_column_writers[cid]->append(column->get_nullmap(), column->get_data(), |
675 | 0 | data.num_rows)); |
676 | 0 | RETURN_IF_ERROR(_finalize_column_writer_and_update_meta(cid)); |
677 | | // Don't clear source content for sequence column here if it will be used later |
678 | | // in _generate_primary_key_index(). It will be cleared at the end of this method. |
679 | 0 | bool is_seq_column = (_tablet_schema->has_sequence_col() && !have_input_seq_column && |
680 | 0 | cid == _tablet_schema->sequence_col_idx()); |
681 | 0 | if (!is_seq_column) { |
682 | 0 | _olap_data_convertor->clear_source_content(cid); |
683 | 0 | } |
684 | 0 | } |
685 | | |
686 | 0 | _num_rows_updated += stats.num_rows_updated; |
687 | 0 | _num_rows_deleted += stats.num_rows_deleted; |
688 | 0 | _num_rows_new_added += stats.num_rows_new_added; |
689 | 0 | _num_rows_filtered += stats.num_rows_filtered; |
690 | 0 | if (_tablet_schema->has_sequence_col() && !have_input_seq_column) { |
691 | 0 | DCHECK_NE(seq_column, nullptr); |
692 | 0 | if (_num_rows_written != data.row_pos || |
693 | 0 | _primary_key_index_builder->num_rows() != _num_rows_written) { |
694 | 0 | return Status::InternalError( |
695 | 0 | "Correctness check failed, _num_rows_written: {}, row_pos: {}, primary key " |
696 | 0 | "index builder num rows: {}", |
697 | 0 | _num_rows_written, data.row_pos, _primary_key_index_builder->num_rows()); |
698 | 0 | } |
699 | 0 | RETURN_IF_ERROR(_generate_primary_key_index(_key_coders, key_columns, seq_column, |
700 | 0 | data.num_rows, false)); |
701 | 0 | } |
702 | | |
703 | 0 | _num_rows_written += data.num_rows; |
704 | 0 | DCHECK_EQ(_primary_key_index_builder->num_rows(), _num_rows_written) |
705 | 0 | << "primary key index builder num rows(" << _primary_key_index_builder->num_rows() |
706 | 0 | << ") not equal to segment writer's num rows written(" << _num_rows_written << ")"; |
707 | 0 | _olap_data_convertor->clear_source_content(); |
708 | 0 | return Status::OK(); |
709 | 0 | } |
710 | | |
// Appends one batch of rows of a flexible partial update to the segment.
//
// The input rows share the full tablet schema, but each row carries a skip bitmap (stored in the
// skip bitmap column) that records which cells were not provided by the load. The method
//   1. aggregates duplicate rows inside the batch,
//   2. converts and writes the primary key columns,
//   3. probes existing data to build a read plan for the cells that must be read back,
//   4. fills `full_block` with the merged rows and writes all non-key columns,
//   5. appends the encoded keys to the primary key index.
//
// @param input_data  rows to append (block pointer, start row and row count).
// @param full_block  output block; it is re-created from the tablet schema here and filled with
//                    the merged rows.
// @return the first non-OK status returned by any step, otherwise Status::OK().
Status VerticalSegmentWriter::_append_block_with_flexible_partial_content(RowsInBlock& input_data,
                                                                          Block& full_block) {
    RETURN_IF_ERROR(_partial_update_preconditions_check(input_data.row_pos, true));

    // When the input does not start at row 0, work on a slice that does, so that from here on
    // `data.row_pos` is 0 (presumably materialize_rows_in_block_slice copies the row range
    // [row_pos, row_pos + num_rows) into a new block -- confirm against its definition).
    Block sliced_block;
    RowsInBlock normalized_data = input_data;
    if (input_data.row_pos != 0) {
        sliced_block = materialize_rows_in_block_slice(input_data);
        normalized_data = RowsInBlock {&sliced_block, 0, input_data.num_rows};
    }
    RowsInBlock& data = normalized_data;

    // data.block has the same schema as full_block
    DCHECK(data.block->columns() == _tablet_schema->num_columns());

    // create full block and fill with sort key columns
    full_block = _tablet_schema->create_block();

    // Use _num_rows_written instead of creating column writer 0, since all column writers
    // should have the same row count, which equals _num_rows_written.
    uint32_t segment_start_pos = cast_set<uint32_t>(_num_rows_written);

    DCHECK(_tablet_schema->has_skip_bitmap_col());
    auto skip_bitmap_col_idx = _tablet_schema->skip_bitmap_col_idx();

    // Outputs of the read-plan generation: one flag per row is appended to
    // use_default_or_null_flag (its size is checked against data.num_rows below).
    bool has_default_or_nullable = false;
    std::vector<bool> use_default_or_null_flag;
    use_default_or_null_flag.reserve(data.num_rows);

    int32_t seq_map_col_unique_id = _opts.rowset_ctx->partial_update_info->sequence_map_col_uid();
    bool schema_has_sequence_col = _tablet_schema->has_sequence_col();

    // Debug point: sleeps for 60 seconds when enabled.
    DBUG_EXECUTE_IF("VerticalSegmentWriter._append_block_with_flexible_partial_content.sleep",
                    { sleep(60); })
    const std::vector<RowsetSharedPtr>& specified_rowsets = _mow_context->rowset_ptrs;
    std::vector<std::unique_ptr<SegmentCacheHandle>> segment_caches(specified_rowsets.size());

    // Ensure all primary key column writers and sequence column writer are created before
    // aggregate_for_flexible_partial_update, because it internally calls convert_pk_columns
    // and convert_seq_column which need the convertors in _olap_data_convertor
    for (uint32_t cid = 0; cid < _tablet_schema->num_key_columns(); ++cid) {
        RETURN_IF_ERROR(_create_column_writer(cid, _tablet_schema->column(cid), _tablet_schema));
    }
    if (schema_has_sequence_col) {
        uint32_t cid = _tablet_schema->sequence_col_idx();
        RETURN_IF_ERROR(_create_column_writer(cid, _tablet_schema->column(cid), _tablet_schema));
    }

    // Per-row skip bitmaps, taken from the skip bitmap column of the input block.
    std::vector<BitmapValue>* skip_bitmaps = &(
            assert_cast<ColumnBitmap*>(
                    data.block->get_by_position(skip_bitmap_col_idx).column->assume_mutable().get())
                    ->get_data());
    if (_tablet_schema->num_variant_columns() > 0) {
        // Collect the non-key variant columns, materialize them in the input block, and for
        // every row that provides a variant cell (its unique id is not in the skip bitmap)
        // let mark_variant_patch_paths update that row's skip bitmap.
        std::vector<uint32_t> variant_cids;
        variant_cids.reserve(_tablet_schema->num_variant_columns());
        for (size_t cid = _tablet_schema->num_key_columns(); cid < _tablet_schema->num_columns();
             ++cid) {
            const auto& column = _tablet_schema->column(cid);
            if (!column.is_variant_type()) {
                continue;
            }
            variant_cids.push_back(cast_set<uint32_t>(cid));
        }
        RETURN_IF_ERROR(variant_util::parse_and_materialize_variant_columns(
                *const_cast<Block*>(data.block), *_tablet_schema, variant_cids, true));
        for (auto cid : variant_cids) {
            const auto& column = _tablet_schema->column(cid);
            for (size_t block_pos = data.row_pos; block_pos < data.row_pos + data.num_rows;
                 ++block_pos) {
                auto& skip_bitmap = skip_bitmaps->at(block_pos);
                if (!skip_bitmap.contains(column.unique_id())) {
                    RETURN_IF_ERROR(variant_util::mark_variant_patch_paths(
                            *data.block->get_by_position(cid).column, block_pos, column.unique_id(),
                            &skip_bitmap));
                }
            }
        }
    }

    // 1. aggregate duplicate rows in block
    RETURN_IF_ERROR(_block_aggregator.aggregate_for_flexible_partial_update(
            const_cast<Block*>(data.block), data.num_rows, specified_rowsets, segment_caches));
    // The aggregation may have changed the number of rows in the block; if so, adopt the new
    // row count and drop any source content held by the convertor.
    if (data.block->rows() != data.num_rows) {
        data.num_rows = data.block->rows();
        _olap_data_convertor->clear_source_content();
    }
    // Re-fetch the skip bitmaps from the block after aggregation.
    skip_bitmaps = &(
            assert_cast<ColumnBitmap*>(
                    data.block->get_by_position(skip_bitmap_col_idx).column->assume_mutable().get())
                    ->get_data());

    // 2. encode primary key columns
    // we can only encode primary key columns currently because all non-primary columns in flexible partial update
    // can have missing cells
    std::vector<IOlapColumnDataAccessor*> key_columns {};
    RETURN_IF_ERROR(_block_aggregator.convert_pk_columns(const_cast<Block*>(data.block),
                                                         data.row_pos, data.num_rows, key_columns));
    // 3. encode sequence column
    // We encode the sequence column even though it may have invalid values in some rows because we need to
    // encode the value of sequence column in key for rows that have a valid value in sequence column during
    // lookup_raw_key. We will encode the sequence column again at the end of this method. At that time, we have
    // a valid sequence column to encode the key with seq col.
    IOlapColumnDataAccessor* seq_column {nullptr};
    RETURN_IF_ERROR(_block_aggregator.convert_seq_column(const_cast<Block*>(data.block),
                                                         data.row_pos, data.num_rows, seq_column));

    const auto* delete_signs =
            BaseTablet::get_delete_sign_column_data(*data.block, data.row_pos + data.num_rows);
    DCHECK(delete_signs != nullptr);

    // The key columns of full_block are taken directly from the input block.
    for (std::size_t cid {0}; cid < _tablet_schema->num_key_columns(); cid++) {
        full_block.replace_by_position(cid, data.block->get_by_position(cid).column);
    }

    // 4. write primary key columns data
    for (std::size_t cid {0}; cid < _tablet_schema->num_key_columns(); cid++) {
        const auto& column = key_columns[cid];
        DCHECK(_column_writers[cid]->get_next_rowid() == _num_rows_written);
        RETURN_IF_ERROR(_column_writers[cid]->append(column->get_nullmap(), column->get_data(),
                                                     data.num_rows));
        DCHECK(_column_writers[cid]->get_next_rowid() == _num_rows_written + data.num_rows);
        RETURN_IF_ERROR(_finalize_column_writer_and_update_meta(cid));
    }

    // 5. generate read plan
    FlexibleReadPlan read_plan {_tablet_schema->has_row_store_for_all_columns()};
    PartialUpdateStats stats;
    RETURN_IF_ERROR(_generate_flexible_read_plan(
            read_plan, data, segment_start_pos, schema_has_sequence_col, seq_map_col_unique_id,
            skip_bitmaps, key_columns, seq_column, delete_signs, specified_rowsets, segment_caches,
            has_default_or_nullable, use_default_or_null_flag, stats));
    CHECK_EQ(use_default_or_null_flag.size(), data.num_rows);

    if (config::enable_merge_on_write_correctness_check) {
        _tablet->add_sentinel_mark_to_delete_bitmap(_mow_context->delete_bitmap.get(),
                                                    *_mow_context->rowset_ids);
    }

    // 6. read according to the plan to fill full_block
    RETURN_IF_ERROR(read_plan.fill_non_primary_key_columns(
            _opts.rowset_ctx, _rsid_to_rowset, *_tablet_schema, full_block,
            use_default_or_null_flag, has_default_or_nullable, segment_start_pos,
            cast_set<uint32_t>(data.row_pos), data.block, skip_bitmaps));

    // TODO(bobhan1): should we replace the skip bitmap column with empty bitmaps to reduce storage occupation?
    // this column is not needed in read path for merge-on-write table

    // 7. fill row store column
    _serialize_block_to_row_column(full_block);

    // Materialize variant columns of the merged block (all column ids are passed), except when
    // the rowset is written by compaction.
    std::vector<uint32_t> column_ids;
    for (uint32_t i = 0; i < _tablet_schema->num_columns(); ++i) {
        column_ids.emplace_back(i);
    }
    if (_opts.rowset_ctx->write_type != DataWriteType::TYPE_COMPACTION &&
        _tablet_schema->num_variant_columns() > 0) {
        RETURN_IF_ERROR(variant_util::parse_and_materialize_variant_columns(
                full_block, *_tablet_schema, column_ids));
    }

    // 8. encode and write all non-primary key columns(including sequence column if exists)
    for (auto cid = _tablet_schema->num_key_columns(); cid < _tablet_schema->num_columns(); cid++) {
        // The sequence column writer was already created before the aggregation step above.
        if (cid != _tablet_schema->sequence_col_idx()) {
            RETURN_IF_ERROR(_create_column_writer(cast_set<uint32_t>(cid),
                                                  _tablet_schema->column(cid), _tablet_schema));
        }
        RETURN_IF_ERROR(_olap_data_convertor->set_source_content_with_specifid_column(
                full_block.get_by_position(cid), data.row_pos, data.num_rows,
                cast_set<uint32_t>(cid)));
        auto [status, column] = _olap_data_convertor->convert_column_data(cid);
        if (!status.ok()) {
            return status;
        }
        if (cid == _tablet_schema->sequence_col_idx()) {
            // should use the latest encoded sequence column to build the primary index
            seq_column = column;
        }
        DCHECK(_column_writers[cid]->get_next_rowid() == _num_rows_written);
        RETURN_IF_ERROR(_column_writers[cid]->append(column->get_nullmap(), column->get_data(),
                                                     data.num_rows));
        DCHECK(_column_writers[cid]->get_next_rowid() == _num_rows_written + data.num_rows);
        RETURN_IF_ERROR(_finalize_column_writer_and_update_meta(cid));
    }

    // Fold the per-batch statistics into the writer-level counters.
    _num_rows_updated += stats.num_rows_updated;
    _num_rows_deleted += stats.num_rows_deleted;
    _num_rows_new_added += stats.num_rows_new_added;
    _num_rows_filtered += stats.num_rows_filtered;

    // No key of this batch has been added to the primary key index yet, so its row count must
    // still equal the number of rows written before this batch.
    if (_primary_key_index_builder->num_rows() != _num_rows_written) {
        return Status::InternalError(
                "Correctness check failed, _num_rows_written: {}, primary key "
                "index builder num rows: {}",
                _num_rows_written, _primary_key_index_builder->num_rows());
    }

    // 9. build primary key index
    RETURN_IF_ERROR(_generate_primary_key_index(_key_coders, key_columns, seq_column, data.num_rows,
                                                false));

    _num_rows_written += data.num_rows;
    DCHECK_EQ(_primary_key_index_builder->num_rows(), _num_rows_written)
            << "primary key index builder num rows(" << _primary_key_index_builder->num_rows()
            << ") not equal to segment writer's num rows written(" << _num_rows_written << ")";
    _olap_data_convertor->clear_source_content();
    return Status::OK();
}
918 | | |
919 | | Status VerticalSegmentWriter::_generate_encoded_default_seq_value(const TabletSchema& tablet_schema, |
920 | | const PartialUpdateInfo& info, |
921 | 0 | std::string* encoded_value) { |
922 | 0 | const auto& seq_column = tablet_schema.column(tablet_schema.sequence_col_idx()); |
923 | 0 | auto block = tablet_schema.create_block_by_cids( |
924 | 0 | {cast_set<uint32_t>(tablet_schema.sequence_col_idx())}); |
925 | 0 | if (seq_column.has_default_value()) { |
926 | 0 | auto idx = tablet_schema.sequence_col_idx() - tablet_schema.num_key_columns(); |
927 | 0 | const auto& default_value = info.default_values[idx]; |
928 | 0 | StringRef str {default_value}; |
929 | 0 | RETURN_IF_ERROR(block.get_by_position(0).type->get_serde()->default_from_string( |
930 | 0 | str, *block.get_by_position(0).column->assume_mutable().get())); |
931 | |
|
932 | 0 | } else { |
933 | 0 | block.get_by_position(0).column->assume_mutable()->insert_default(); |
934 | 0 | } |
935 | 0 | DCHECK_EQ(block.rows(), 1); |
936 | 0 | auto olap_data_convertor = std::make_unique<OlapBlockDataConvertor>(); |
937 | 0 | olap_data_convertor->add_column_data_convertor(seq_column); |
938 | 0 | olap_data_convertor->set_source_content(&block, 0, 1); |
939 | 0 | auto [status, column] = olap_data_convertor->convert_column_data(0); |
940 | 0 | if (!status.ok()) { |
941 | 0 | return status; |
942 | 0 | } |
943 | | // include marker |
944 | 0 | _encode_seq_column(column, 0, encoded_value); |
945 | 0 | return Status::OK(); |
946 | 0 | } |
947 | | |
948 | | Status VerticalSegmentWriter::_generate_flexible_read_plan( |
949 | | FlexibleReadPlan& read_plan, RowsInBlock& data, size_t segment_start_pos, |
950 | | bool schema_has_sequence_col, int32_t seq_map_col_unique_id, |
951 | | std::vector<BitmapValue>* skip_bitmaps, |
952 | | const std::vector<IOlapColumnDataAccessor*>& key_columns, |
953 | | IOlapColumnDataAccessor* seq_column, const signed char* delete_signs, |
954 | | const std::vector<RowsetSharedPtr>& specified_rowsets, |
955 | | std::vector<std::unique_ptr<SegmentCacheHandle>>& segment_caches, |
956 | | bool& has_default_or_nullable, std::vector<bool>& use_default_or_null_flag, |
957 | 0 | PartialUpdateStats& stats) { |
958 | 0 | int32_t delete_sign_col_unique_id = |
959 | 0 | _tablet_schema->column(_tablet_schema->delete_sign_idx()).unique_id(); |
960 | 0 | int32_t seq_col_unique_id = |
961 | 0 | (_tablet_schema->has_sequence_col() |
962 | 0 | ? _tablet_schema->column(_tablet_schema->sequence_col_idx()).unique_id() |
963 | 0 | : -1); |
964 | 0 | for (size_t block_pos = data.row_pos; block_pos < data.row_pos + data.num_rows; block_pos++) { |
965 | 0 | size_t delta_pos = block_pos - data.row_pos; |
966 | 0 | size_t segment_pos = segment_start_pos + delta_pos; |
967 | 0 | auto& skip_bitmap = skip_bitmaps->at(block_pos); |
968 | |
|
969 | 0 | std::string key = _full_encode_keys(key_columns, delta_pos); |
970 | 0 | _maybe_invalid_row_cache(key); |
971 | 0 | bool row_has_sequence_col = |
972 | 0 | (schema_has_sequence_col && !skip_bitmap.contains(seq_col_unique_id)); |
973 | 0 | if (row_has_sequence_col) { |
974 | 0 | _encode_seq_column(seq_column, delta_pos, &key); |
975 | 0 | } |
976 | | |
977 | | // mark key with delete sign as deleted. |
978 | 0 | bool have_delete_sign = |
979 | 0 | (!skip_bitmap.contains(delete_sign_col_unique_id) && delete_signs[block_pos] != 0); |
980 | |
|
981 | 0 | auto not_found_cb = [&]() { |
982 | 0 | return _opts.rowset_ctx->partial_update_info->handle_new_key( |
983 | 0 | *_tablet_schema, |
984 | 0 | [&]() -> std::string { |
985 | 0 | return data.block->dump_one_line(block_pos, |
986 | 0 | cast_set<int>(_num_sort_key_columns)); |
987 | 0 | }, |
988 | 0 | &skip_bitmap); |
989 | 0 | }; |
990 | 0 | auto update_read_plan = [&](const RowLocation& loc) { |
991 | 0 | BitmapValue read_skip_bitmap(skip_bitmap); |
992 | 0 | if (!have_delete_sign) { |
993 | 0 | bool should_merge_variant = false; |
994 | 0 | for (size_t cid = _tablet_schema->num_key_columns(); |
995 | 0 | cid < _tablet_schema->num_columns(); ++cid) { |
996 | 0 | const auto& column = _tablet_schema->column(cid); |
997 | 0 | if (column.is_variant_type() && !skip_bitmap.contains(column.unique_id())) { |
998 | 0 | read_skip_bitmap.add(column.unique_id()); |
999 | 0 | should_merge_variant = true; |
1000 | 0 | } |
1001 | 0 | } |
1002 | 0 | if (should_merge_variant) { |
1003 | 0 | read_skip_bitmap.add(delete_sign_col_unique_id); |
1004 | 0 | } |
1005 | 0 | } |
1006 | 0 | read_plan.prepare_to_read(loc, segment_pos, read_skip_bitmap); |
1007 | 0 | }; |
1008 | |
|
1009 | 0 | RETURN_IF_ERROR(_probe_key_for_mow(std::move(key), segment_pos, row_has_sequence_col, |
1010 | 0 | have_delete_sign, specified_rowsets, segment_caches, |
1011 | 0 | has_default_or_nullable, use_default_or_null_flag, |
1012 | 0 | update_read_plan, not_found_cb, stats)); |
1013 | 0 | } |
1014 | 0 | return Status::OK(); |
1015 | 0 | } |
1016 | | |
1017 | 12 | Status VerticalSegmentWriter::batch_block(const Block* block, size_t row_pos, size_t num_rows) { |
1018 | 12 | if (_opts.rowset_ctx->partial_update_info && |
1019 | 12 | _opts.rowset_ctx->partial_update_info->is_partial_update() && |
1020 | 12 | _opts.write_type == DataWriteType::TYPE_DIRECT && |
1021 | 12 | !_opts.rowset_ctx->is_transient_rowset_writer) { |
1022 | 0 | if (_opts.rowset_ctx->partial_update_info->is_flexible_partial_update()) { |
1023 | 0 | if (block->columns() != _tablet_schema->num_columns()) { |
1024 | 0 | return Status::InvalidArgument( |
1025 | 0 | "illegal flexible partial update block columns, block columns = {}, " |
1026 | 0 | "tablet_schema columns = {}", |
1027 | 0 | block->dump_structure(), _tablet_schema->dump_structure()); |
1028 | 0 | } |
1029 | 0 | } else { |
1030 | 0 | if (block->columns() < _tablet_schema->num_key_columns() || |
1031 | 0 | block->columns() >= _tablet_schema->num_columns()) { |
1032 | 0 | return Status::InvalidArgument(fmt::format( |
1033 | 0 | "illegal partial update block columns: {}, num key columns: {}, total " |
1034 | 0 | "schema columns: {}", |
1035 | 0 | block->columns(), _tablet_schema->num_key_columns(), |
1036 | 0 | _tablet_schema->num_columns())); |
1037 | 0 | } |
1038 | 0 | } |
1039 | 12 | } else if (block->columns() != _tablet_schema->num_columns()) { |
1040 | 0 | return Status::InvalidArgument( |
1041 | 0 | "illegal block columns, block columns = {}, tablet_schema columns = {}", |
1042 | 0 | block->dump_structure(), _tablet_schema->dump_structure()); |
1043 | 0 | } |
1044 | 12 | _batched_blocks.emplace_back(block, row_pos, num_rows); |
1045 | 12 | return Status::OK(); |
1046 | 12 | } |
1047 | | |
// Writes all blocks queued by batch_block() into the segment.
//
// Direct partial update writes (not from a transient rowset writer) are dispatched block by
// block to the flexible or fixed partial update append routine. All other writes are written
// column by column: for each column a writer is created, every batched block is converted and
// appended, and the column writer is finalized; afterwards the key index is generated per block.
//
// @return the first non-OK status returned by any step, otherwise Status::OK().
Status VerticalSegmentWriter::write_batch() {
    if (_opts.rowset_ctx->partial_update_info &&
        _opts.rowset_ctx->partial_update_info->is_partial_update() &&
        _opts.write_type == DataWriteType::TYPE_DIRECT &&
        !_opts.rowset_ctx->is_transient_rowset_writer) {
        bool is_flexible_partial_update =
                _opts.rowset_ctx->partial_update_info->is_flexible_partial_update();
        Block full_block;
        for (auto& data : _batched_blocks) {
            if (is_flexible_partial_update) {
                RETURN_IF_ERROR(_append_block_with_flexible_partial_content(data, full_block));
            } else {
                RETURN_IF_ERROR(_append_block_with_partial_content(data, full_block));
            }
        }
        // NOTE(review): unlike the regular path below, this path returns without clearing
        // _batched_blocks -- confirm this is intended.
        return Status::OK();
    }
    // Row column should be filled here when it's a direct write from memtable
    // or it's a schema change write (since column data type may be changed, so we should rebuild)
    if (_opts.write_type == DataWriteType::TYPE_DIRECT ||
        _opts.write_type == DataWriteType::TYPE_SCHEMA_CHANGE) {
        for (auto& data : _batched_blocks) {
            // TODO: maybe we should pass range to this method
            _serialize_block_to_row_column(*data.block);
        }
    }

    // Materialize variant columns of every batched block (all column ids are passed), except
    // when the rowset is written by compaction.
    std::vector<uint32_t> column_ids;
    for (uint32_t i = 0; i < _tablet_schema->num_columns(); ++i) {
        column_ids.emplace_back(i);
    }
    if (_opts.rowset_ctx->write_type != DataWriteType::TYPE_COMPACTION &&
        _tablet_schema->num_variant_columns() > 0) {
        for (auto& data : _batched_blocks) {
            RETURN_IF_ERROR(variant_util::parse_and_materialize_variant_columns(
                    const_cast<Block&>(*data.block), *_tablet_schema, column_ids));
        }
    }

    // Converted columns collected while writing; they are used for key index generation below.
    std::vector<IOlapColumnDataAccessor*> key_columns;
    IOlapColumnDataAccessor* seq_column = nullptr;
    // the key is cluster key column unique id
    std::map<uint32_t, IOlapColumnDataAccessor*> cid_to_column;
    for (uint32_t cid = 0; cid < _tablet_schema->num_columns(); ++cid) {
        RETURN_IF_ERROR(_create_column_writer(cid, _tablet_schema->column(cid), _tablet_schema));
        for (auto& data : _batched_blocks) {
            RETURN_IF_ERROR(_olap_data_convertor->set_source_content_with_specifid_columns(
                    data.block, data.row_pos, data.num_rows, std::vector<uint32_t> {cid}));

            // convert column data from engine format to storage layer format
            auto [status, column] = _olap_data_convertor->convert_column_data(cid);
            if (!status.ok()) {
                return status;
            }
            // NOTE(review): key_columns receives one entry per (key column, batched block)
            // pair, so with more than one batched block it holds more than num_key_columns
            // entries -- confirm callers batch a single block per write_batch() call.
            if (cid < _tablet_schema->num_key_columns()) {
                key_columns.push_back(column);
            }
            if (_tablet_schema->has_sequence_col() && cid == _tablet_schema->sequence_col_idx()) {
                seq_column = column;
            }
            // Remember the converted column of every cluster key column, keyed by unique id.
            auto column_unique_id = _tablet_schema->column(cid).unique_id();
            if (_is_mow_with_cluster_key() &&
                std::find(_tablet_schema->cluster_key_uids().begin(),
                          _tablet_schema->cluster_key_uids().end(),
                          column_unique_id) != _tablet_schema->cluster_key_uids().end()) {
                cid_to_column[column_unique_id] = column;
            }
            RETURN_IF_ERROR(_column_writers[cid]->append(column->get_nullmap(), column->get_data(),
                                                         data.num_rows));
            _olap_data_convertor->clear_source_content();
        }
        // Stop before finalizing the column if the data dir reports that the estimated buffer
        // size would reach its capacity limit.
        if (_data_dir != nullptr &&
            _data_dir->reach_capacity_limit(_column_writers[cid]->estimate_buffer_size())) {
            return Status::Error<DISK_REACH_CAPACITY_LIMIT>("disk {} exceed capacity limit.",
                                                            _data_dir->path_hash());
        }
        RETURN_IF_ERROR(_finalize_column_writer_and_update_meta(cid));
    }

    // Generate the key index block by block; _num_rows_written is advanced after each block
    // because _generate_key_index reads it.
    for (auto& data : _batched_blocks) {
        _olap_data_convertor->set_source_content(data.block, data.row_pos, data.num_rows);
        RETURN_IF_ERROR(_generate_key_index(data, key_columns, seq_column, cid_to_column));
        _olap_data_convertor->clear_source_content();
        _num_rows_written += data.num_rows;
    }

    _batched_blocks.clear();
    return Status::OK();
}
1137 | | |
1138 | | Status VerticalSegmentWriter::_generate_key_index( |
1139 | | RowsInBlock& data, std::vector<IOlapColumnDataAccessor*>& key_columns, |
1140 | | IOlapColumnDataAccessor* seq_column, |
1141 | 12 | std::map<uint32_t, IOlapColumnDataAccessor*>& cid_to_column) { |
1142 | | // find all row pos for short key indexes |
1143 | 12 | std::vector<size_t> short_key_pos; |
1144 | | // We build a short key index every `_opts.num_rows_per_block` rows. Specifically, we |
1145 | | // build a short key index using 1st rows for first block and `_short_key_row_pos - _row_count` |
1146 | | // for next blocks. |
1147 | 12 | if (_short_key_row_pos == 0 && _num_rows_written == 0) { |
1148 | 12 | short_key_pos.push_back(0); |
1149 | 12 | } |
1150 | 12 | while (_short_key_row_pos + _opts.num_rows_per_block < _num_rows_written + data.num_rows) { |
1151 | 0 | _short_key_row_pos += _opts.num_rows_per_block; |
1152 | 0 | short_key_pos.push_back(_short_key_row_pos - _num_rows_written); |
1153 | 0 | } |
1154 | 12 | if (_is_mow_with_cluster_key()) { |
1155 | | // 1. generate primary key index |
1156 | 1 | RETURN_IF_ERROR(_generate_primary_key_index(_primary_key_coders, key_columns, seq_column, |
1157 | 1 | data.num_rows, true)); |
1158 | | // 2. generate short key index (use cluster key) |
1159 | 1 | std::vector<IOlapColumnDataAccessor*> short_key_columns; |
1160 | 2 | for (const auto& cid : _tablet_schema->cluster_key_uids()) { |
1161 | 2 | short_key_columns.push_back(cid_to_column[cid]); |
1162 | 2 | } |
1163 | 1 | RETURN_IF_ERROR(_generate_short_key_index(short_key_columns, data.num_rows, short_key_pos)); |
1164 | 11 | } else if (_is_mow()) { |
1165 | 2 | RETURN_IF_ERROR(_generate_primary_key_index(_key_coders, key_columns, seq_column, |
1166 | 2 | data.num_rows, false)); |
1167 | 9 | } else { // other tables |
1168 | 9 | RETURN_IF_ERROR(_generate_short_key_index(key_columns, data.num_rows, short_key_pos)); |
1169 | 9 | } |
1170 | 12 | return Status::OK(); |
1171 | 12 | } |
1172 | | |
1173 | | Status VerticalSegmentWriter::_generate_primary_key_index( |
1174 | | const std::vector<const KeyCoder*>& primary_key_coders, |
1175 | | const std::vector<IOlapColumnDataAccessor*>& primary_key_columns, |
1176 | 3 | IOlapColumnDataAccessor* seq_column, size_t num_rows, bool need_sort) { |
1177 | 3 | if (!need_sort) { // mow table without cluster key |
1178 | 2 | std::string last_key; |
1179 | 6 | for (size_t pos = 0; pos < num_rows; pos++) { |
1180 | | // use _key_coders |
1181 | 4 | std::string key = _full_encode_keys(primary_key_columns, pos); |
1182 | 4 | _maybe_invalid_row_cache(key); |
1183 | 4 | if (_tablet_schema->has_sequence_col()) { |
1184 | 4 | _encode_seq_column(seq_column, pos, &key); |
1185 | 4 | } |
1186 | 4 | DCHECK(key.compare(last_key) > 0) |
1187 | 0 | << "found duplicate key or key is not sorted! current key: " << key |
1188 | 0 | << ", last key: " << last_key; |
1189 | 4 | RETURN_IF_ERROR(_primary_key_index_builder->add_item(key)); |
1190 | 4 | last_key = std::move(key); |
1191 | 4 | } |
1192 | 2 | } else { // mow table with cluster key |
1193 | | // 1. generate primary keys in memory |
1194 | 1 | std::vector<std::string> primary_keys; |
1195 | 5 | for (uint32_t pos = 0; pos < num_rows; pos++) { |
1196 | 4 | std::string key = _full_encode_keys(primary_key_coders, primary_key_columns, pos); |
1197 | 4 | _maybe_invalid_row_cache(key); |
1198 | 4 | if (_tablet_schema->has_sequence_col()) { |
1199 | 4 | _encode_seq_column(seq_column, pos, &key); |
1200 | 4 | } |
1201 | 4 | _encode_rowid(pos, &key); |
1202 | 4 | primary_keys.emplace_back(std::move(key)); |
1203 | 4 | } |
1204 | | // 2. sort primary keys |
1205 | 1 | std::sort(primary_keys.begin(), primary_keys.end()); |
1206 | | // 3. write primary keys index |
1207 | 1 | std::string last_key; |
1208 | 4 | for (const auto& key : primary_keys) { |
1209 | 4 | DCHECK(key.compare(last_key) > 0) |
1210 | 0 | << "found duplicate key or key is not sorted! current key: " << key |
1211 | 0 | << ", last key: " << last_key; |
1212 | 4 | RETURN_IF_ERROR(_primary_key_index_builder->add_item(key)); |
1213 | 4 | last_key = key; |
1214 | 4 | } |
1215 | 1 | } |
1216 | 3 | return Status::OK(); |
1217 | 3 | } |
1218 | | |
1219 | | Status VerticalSegmentWriter::_generate_short_key_index( |
1220 | | std::vector<IOlapColumnDataAccessor*>& key_columns, size_t num_rows, |
1221 | 10 | const std::vector<size_t>& short_key_pos) { |
1222 | | // use _key_coders |
1223 | 10 | _set_min_key(_full_encode_keys(key_columns, 0)); |
1224 | 10 | _set_max_key(_full_encode_keys(key_columns, num_rows - 1)); |
1225 | 10 | DCHECK(Slice(_max_key.data(), _max_key.size()) |
1226 | 0 | .compare(Slice(_min_key.data(), _min_key.size())) >= 0) |
1227 | 0 | << "key is not sorted! min key: " << _min_key << ", max key: " << _max_key; |
1228 | | |
1229 | 10 | key_columns.resize(_num_short_key_columns); |
1230 | 10 | std::string last_key; |
1231 | 10 | for (const auto pos : short_key_pos) { |
1232 | 10 | std::string key = _encode_keys(key_columns, pos); |
1233 | 10 | DCHECK(key.compare(last_key) >= 0) |
1234 | 0 | << "key is not sorted! current key: " << key << ", last key: " << last_key; |
1235 | 10 | RETURN_IF_ERROR(_short_key_index_builder->add_item(key)); |
1236 | 10 | last_key = std::move(key); |
1237 | 10 | } |
1238 | 10 | return Status::OK(); |
1239 | 10 | } |
1240 | | |
1241 | 4 | void VerticalSegmentWriter::_encode_rowid(const uint32_t rowid, std::string* encoded_keys) { |
1242 | 4 | encoded_keys->push_back(KEY_NORMAL_MARKER); |
1243 | 4 | _rowid_coder->full_encode_ascending(&rowid, encoded_keys); |
1244 | 4 | } |
1245 | | |
1246 | | std::string VerticalSegmentWriter::_full_encode_keys( |
1247 | 24 | const std::vector<IOlapColumnDataAccessor*>& key_columns, size_t pos) { |
1248 | 24 | assert(_key_index_size.size() == _num_sort_key_columns); |
1249 | 24 | if (!(key_columns.size() == _num_sort_key_columns && |
1250 | 24 | _key_coders.size() == _num_sort_key_columns)) { |
1251 | 0 | LOG_INFO("key_columns.size()={}, _key_coders.size()={}, _num_sort_key_columns={}, ", |
1252 | 0 | key_columns.size(), _key_coders.size(), _num_sort_key_columns); |
1253 | 0 | } |
1254 | 24 | assert(key_columns.size() == _num_sort_key_columns && |
1255 | 24 | _key_coders.size() == _num_sort_key_columns); |
1256 | 24 | return _full_encode_keys(_key_coders, key_columns, pos); |
1257 | 24 | } |
1258 | | |
1259 | | std::string VerticalSegmentWriter::_full_encode_keys( |
1260 | | const std::vector<const KeyCoder*>& key_coders, |
1261 | 28 | const std::vector<IOlapColumnDataAccessor*>& key_columns, size_t pos) { |
1262 | 28 | assert(key_columns.size() == key_coders.size()); |
1263 | | |
1264 | 28 | std::string encoded_keys; |
1265 | 28 | size_t cid = 0; |
1266 | 70 | for (const auto& column : key_columns) { |
1267 | 70 | auto field = column->get_data_at(pos); |
1268 | 70 | if (UNLIKELY(!field)) { |
1269 | 0 | encoded_keys.push_back(KEY_NULL_FIRST_MARKER); |
1270 | 0 | ++cid; |
1271 | 0 | continue; |
1272 | 0 | } |
1273 | 70 | encoded_keys.push_back(KEY_NORMAL_MARKER); |
1274 | 70 | DCHECK(key_coders[cid] != nullptr); |
1275 | 70 | key_coders[cid]->full_encode_ascending(field, &encoded_keys); |
1276 | 70 | ++cid; |
1277 | 70 | } |
1278 | 28 | return encoded_keys; |
1279 | 28 | } |
1280 | | |
1281 | | void VerticalSegmentWriter::_encode_seq_column(const IOlapColumnDataAccessor* seq_column, |
1282 | 8 | size_t pos, std::string* encoded_keys) { |
1283 | 8 | const auto* field = seq_column->get_data_at(pos); |
1284 | | // To facilitate the use of the primary key index, encode the seq column |
1285 | | // to the minimum value of the corresponding length when the seq column |
1286 | | // is null |
1287 | 8 | if (UNLIKELY(!field)) { |
1288 | 0 | encoded_keys->push_back(KEY_NULL_FIRST_MARKER); |
1289 | 0 | size_t seq_col_length = _tablet_schema->column(_tablet_schema->sequence_col_idx()).length(); |
1290 | 0 | encoded_keys->append(seq_col_length, KEY_MINIMAL_MARKER); |
1291 | 0 | return; |
1292 | 0 | } |
1293 | 8 | encoded_keys->push_back(KEY_NORMAL_MARKER); |
1294 | 8 | _seq_coder->full_encode_ascending(field, encoded_keys); |
1295 | 8 | } |
1296 | | |
1297 | | std::string VerticalSegmentWriter::_encode_keys( |
1298 | 10 | const std::vector<IOlapColumnDataAccessor*>& key_columns, size_t pos) { |
1299 | 10 | assert(key_columns.size() == _num_short_key_columns); |
1300 | | |
1301 | 10 | std::string encoded_keys; |
1302 | 10 | size_t cid = 0; |
1303 | 22 | for (const auto& column : key_columns) { |
1304 | 22 | auto field = column->get_data_at(pos); |
1305 | 22 | if (UNLIKELY(!field)) { |
1306 | 0 | encoded_keys.push_back(KEY_NULL_FIRST_MARKER); |
1307 | 0 | ++cid; |
1308 | 0 | continue; |
1309 | 0 | } |
1310 | 22 | encoded_keys.push_back(KEY_NORMAL_MARKER); |
1311 | 22 | _key_coders[cid]->encode_ascending(field, _key_index_size[cid], &encoded_keys); |
1312 | 22 | ++cid; |
1313 | 22 | } |
1314 | 10 | return encoded_keys; |
1315 | 10 | } |
1316 | | |
1317 | | // TODO(lingbin): Currently this function does not include the size of various indexes, |
1318 | | // We should make this more precise. |
1319 | 12 | uint64_t VerticalSegmentWriter::_estimated_remaining_size() { |
1320 | | // footer_size(4) + checksum(4) + segment_magic(4) |
1321 | 12 | uint64_t size = 12; |
1322 | 12 | if (_is_mow_with_cluster_key()) { |
1323 | 1 | size += _primary_key_index_builder->size() + _short_key_index_builder->size(); |
1324 | 11 | } else if (_is_mow()) { |
1325 | 2 | size += _primary_key_index_builder->size(); |
1326 | 9 | } else { |
1327 | 9 | size += _short_key_index_builder->size(); |
1328 | 9 | } |
1329 | | |
1330 | | // update the mem_tracker of segment size |
1331 | 12 | _mem_tracker->consume(size - _mem_tracker->consumption()); |
1332 | 12 | return size; |
1333 | 12 | } |
1334 | | |
1335 | 12 | Status VerticalSegmentWriter::finalize_columns_index(uint64_t* index_size) { |
1336 | 12 | uint64_t index_start = _file_writer->bytes_appended(); |
1337 | 12 | RETURN_IF_ERROR(_write_ordinal_index()); |
1338 | 12 | RETURN_IF_ERROR(_write_zone_map()); |
1339 | 12 | RETURN_IF_ERROR(_write_inverted_index()); |
1340 | 12 | RETURN_IF_ERROR(_write_ann_index()); |
1341 | 12 | RETURN_IF_ERROR(_write_bloom_filter_index()); |
1342 | | |
1343 | 12 | *index_size = _file_writer->bytes_appended() - index_start; |
1344 | 12 | if (_is_mow_with_cluster_key()) { |
1345 | 1 | RETURN_IF_ERROR(_write_short_key_index()); |
1346 | 1 | *index_size = _file_writer->bytes_appended() - index_start; |
1347 | 1 | RETURN_IF_ERROR(_write_primary_key_index()); |
1348 | 1 | *index_size += _primary_key_index_builder->disk_size(); |
1349 | 11 | } else if (_is_mow()) { |
1350 | 2 | RETURN_IF_ERROR(_write_primary_key_index()); |
1351 | | // IndexedColumnWriter write data pages mixed with segment data, we should use |
1352 | | // the stat from primary key index builder. |
1353 | 2 | *index_size += _primary_key_index_builder->disk_size(); |
1354 | 9 | } else { |
1355 | 9 | RETURN_IF_ERROR(_write_short_key_index()); |
1356 | 9 | *index_size = _file_writer->bytes_appended() - index_start; |
1357 | 9 | } |
1358 | | |
1359 | | // reset all column writers and data_conveter |
1360 | 12 | clear(); |
1361 | | |
1362 | 12 | return Status::OK(); |
1363 | 12 | } |
1364 | | |
1365 | 12 | Status VerticalSegmentWriter::finalize_footer(uint64_t* segment_file_size) { |
1366 | 12 | RETURN_IF_ERROR(_write_footer()); |
1367 | | // finish |
1368 | 12 | RETURN_IF_ERROR(_file_writer->close(true)); |
1369 | 12 | *segment_file_size = _file_writer->bytes_appended(); |
1370 | 12 | if (*segment_file_size == 0) { |
1371 | 0 | return Status::Corruption("Bad segment, file size = 0"); |
1372 | 0 | } |
1373 | 12 | return Status::OK(); |
1374 | 12 | } |
1375 | | |
1376 | 12 | Status VerticalSegmentWriter::finalize(uint64_t* segment_file_size, uint64_t* index_size) { |
1377 | 12 | MonotonicStopWatch timer; |
1378 | 12 | timer.start(); |
1379 | | // check disk capacity |
1380 | 12 | if (_data_dir != nullptr && |
1381 | 12 | _data_dir->reach_capacity_limit((int64_t)_estimated_remaining_size())) { |
1382 | 0 | return Status::Error<DISK_REACH_CAPACITY_LIMIT>("disk {} exceed capacity limit.", |
1383 | 0 | _data_dir->path_hash()); |
1384 | 0 | } |
1385 | 12 | _row_count = _num_rows_written; |
1386 | 12 | _num_rows_written = 0; |
1387 | | // write index |
1388 | 12 | RETURN_IF_ERROR(finalize_columns_index(index_size)); |
1389 | | // write footer |
1390 | 12 | RETURN_IF_ERROR(finalize_footer(segment_file_size)); |
1391 | | |
1392 | 12 | if (timer.elapsed_time() > 5000000000L) { |
1393 | 0 | LOG(INFO) << "segment flush consumes a lot time_ns " << timer.elapsed_time() |
1394 | 0 | << ", segmemt_size " << *segment_file_size; |
1395 | 0 | } |
1396 | 12 | return Status::OK(); |
1397 | 12 | } |
1398 | | |
1399 | 12 | void VerticalSegmentWriter::clear() { |
1400 | 61 | for (auto& column_writer : _column_writers) { |
1401 | 61 | column_writer.reset(); |
1402 | 61 | } |
1403 | 12 | _column_writers.clear(); |
1404 | 12 | _olap_data_convertor.reset(); |
1405 | 12 | } |
1406 | | |
1407 | | // write ordinal index after data has been written |
1408 | 12 | Status VerticalSegmentWriter::_write_ordinal_index() { |
1409 | 61 | for (auto& column_writer : _column_writers) { |
1410 | 61 | RETURN_IF_ERROR(column_writer->write_ordinal_index()); |
1411 | 61 | } |
1412 | 12 | return Status::OK(); |
1413 | 12 | } |
1414 | | |
1415 | 12 | Status VerticalSegmentWriter::_write_zone_map() { |
1416 | 61 | for (auto& column_writer : _column_writers) { |
1417 | 61 | RETURN_IF_ERROR(column_writer->write_zone_map()); |
1418 | 61 | } |
1419 | 12 | return Status::OK(); |
1420 | 12 | } |
1421 | | |
1422 | 12 | Status VerticalSegmentWriter::_write_inverted_index() { |
1423 | 61 | for (auto& column_writer : _column_writers) { |
1424 | 61 | RETURN_IF_ERROR(column_writer->write_inverted_index()); |
1425 | 61 | } |
1426 | 12 | return Status::OK(); |
1427 | 12 | } |
1428 | | |
1429 | 12 | Status VerticalSegmentWriter::_write_ann_index() { |
1430 | 61 | for (auto& column_writer : _column_writers) { |
1431 | 61 | RETURN_IF_ERROR(column_writer->write_ann_index()); |
1432 | 61 | } |
1433 | 12 | return Status::OK(); |
1434 | 12 | } |
1435 | | |
1436 | 12 | Status VerticalSegmentWriter::_write_bloom_filter_index() { |
1437 | 61 | for (auto& column_writer : _column_writers) { |
1438 | 61 | RETURN_IF_ERROR(column_writer->write_bloom_filter_index()); |
1439 | 61 | } |
1440 | 12 | return Status::OK(); |
1441 | 12 | } |
1442 | | |
1443 | 10 | Status VerticalSegmentWriter::_write_short_key_index() { |
1444 | 10 | std::vector<Slice> body; |
1445 | 10 | PageFooterPB footer; |
1446 | 10 | RETURN_IF_ERROR(_short_key_index_builder->finalize(_row_count, &body, &footer)); |
1447 | 10 | PagePointer pp; |
1448 | | // short key index page is not compressed right now |
1449 | 10 | RETURN_IF_ERROR(PageIO::write_page(_file_writer, body, footer, &pp)); |
1450 | 10 | pp.to_proto(_footer.mutable_short_key_index_page()); |
1451 | 10 | return Status::OK(); |
1452 | 10 | } |
1453 | | |
1454 | 3 | Status VerticalSegmentWriter::_write_primary_key_index() { |
1455 | 3 | CHECK_EQ(_primary_key_index_builder->num_rows(), _row_count); |
1456 | 3 | return _primary_key_index_builder->finalize(_footer.mutable_primary_key_index_meta()); |
1457 | 3 | } |
1458 | | |
1459 | 12 | Status VerticalSegmentWriter::_write_footer() { |
1460 | 12 | _footer.set_num_rows(_row_count); |
1461 | | |
1462 | | // Decide whether to externalize ColumnMetaPB by tablet default, and stamp footer version |
1463 | | |
1464 | 12 | if (_tablet_schema->is_external_segment_column_meta_used()) { |
1465 | 0 | _footer.set_version(SEGMENT_FOOTER_VERSION_V3_EXT_COL_META); |
1466 | 0 | VLOG_DEBUG << "use external column meta"; |
1467 | | // External ColumnMetaPB writing (optional) |
1468 | 0 | RETURN_IF_ERROR(ExternalColMetaUtil::write_external_column_meta( |
1469 | 0 | _file_writer, &_footer, _opts.compression_type, |
1470 | 0 | [this](const std::vector<Slice>& slices) { return _write_raw_data(slices); })); |
1471 | 0 | } |
1472 | | |
1473 | | // Footer := SegmentFooterPB, FooterPBSize(4), FooterPBChecksum(4), MagicNumber(4) |
1474 | 12 | VLOG_DEBUG << "footer " << _footer.DebugString(); |
1475 | 12 | std::string footer_buf; |
1476 | 12 | if (!_footer.SerializeToString(&footer_buf)) { |
1477 | 0 | return Status::InternalError("failed to serialize segment footer"); |
1478 | 0 | } |
1479 | | |
1480 | 12 | faststring fixed_buf; |
1481 | | // footer's size |
1482 | 12 | put_fixed32_le(&fixed_buf, cast_set<uint32_t>(footer_buf.size())); |
1483 | | // footer's checksum |
1484 | 12 | uint32_t checksum = crc32c::Crc32c(footer_buf.data(), footer_buf.size()); |
1485 | 12 | put_fixed32_le(&fixed_buf, checksum); |
1486 | | // Append magic number. we don't write magic number in the header because |
1487 | | // that will need an extra seek when reading |
1488 | 12 | fixed_buf.append(k_segment_magic, k_segment_magic_length); |
1489 | | |
1490 | 12 | std::vector<Slice> slices {footer_buf, fixed_buf}; |
1491 | 12 | return _write_raw_data(slices); |
1492 | 12 | } |
1493 | | |
1494 | 12 | Status VerticalSegmentWriter::_write_raw_data(const std::vector<Slice>& slices) { |
1495 | 12 | RETURN_IF_ERROR(_file_writer->appendv(&slices[0], slices.size())); |
1496 | 12 | return Status::OK(); |
1497 | 12 | } |
1498 | | |
1499 | 12 | Slice VerticalSegmentWriter::min_encoded_key() { |
1500 | 12 | return (_primary_key_index_builder == nullptr) ? Slice(_min_key.data(), _min_key.size()) |
1501 | 12 | : _primary_key_index_builder->min_key(); |
1502 | 12 | } |
1503 | 12 | Slice VerticalSegmentWriter::max_encoded_key() { |
1504 | 12 | return (_primary_key_index_builder == nullptr) ? Slice(_max_key.data(), _max_key.size()) |
1505 | 12 | : _primary_key_index_builder->max_key(); |
1506 | 12 | } |
1507 | | |
1508 | 0 | void VerticalSegmentWriter::_set_min_max_key(const Slice& key) { |
1509 | 0 | if (UNLIKELY(_is_first_row)) { |
1510 | 0 | _min_key.append(key.get_data(), key.get_size()); |
1511 | 0 | _is_first_row = false; |
1512 | 0 | } |
1513 | 0 | if (key.compare(_max_key) > 0) { |
1514 | 0 | _max_key.clear(); |
1515 | 0 | _max_key.append(key.get_data(), key.get_size()); |
1516 | 0 | } |
1517 | 0 | } |
1518 | | |
1519 | 10 | void VerticalSegmentWriter::_set_min_key(const Slice& key) { |
1520 | 10 | if (UNLIKELY(_is_first_row)) { |
1521 | 10 | _min_key.append(key.get_data(), key.get_size()); |
1522 | 10 | _is_first_row = false; |
1523 | 10 | } |
1524 | 10 | } |
1525 | | |
1526 | 10 | void VerticalSegmentWriter::_set_max_key(const Slice& key) { |
1527 | 10 | _max_key.clear(); |
1528 | 10 | _max_key.append(key.get_data(), key.get_size()); |
1529 | 10 | } |
1530 | | |
1531 | 172 | inline bool VerticalSegmentWriter::_is_mow() { |
1532 | 172 | return _tablet_schema->keys_type() == UNIQUE_KEYS && _opts.enable_unique_key_merge_on_write; |
1533 | 172 | } |
1534 | | |
1535 | 115 | inline bool VerticalSegmentWriter::_is_mow_with_cluster_key() { |
1536 | 115 | return _is_mow() && !_tablet_schema->cluster_key_uids().empty(); |
1537 | 115 | } |
1538 | | |
1539 | | } // namespace doris::segment_v2 |