be/src/storage/merger.cpp
Line | Count | Source |
1 | | // Licensed to the Apache Software Foundation (ASF) under one |
2 | | // or more contributor license agreements. See the NOTICE file |
3 | | // distributed with this work for additional information |
4 | | // regarding copyright ownership. The ASF licenses this file |
5 | | // to you under the Apache License, Version 2.0 (the |
6 | | // "License"); you may not use this file except in compliance |
7 | | // with the License. You may obtain a copy of the License at |
8 | | // |
9 | | // http://www.apache.org/licenses/LICENSE-2.0 |
10 | | // |
11 | | // Unless required by applicable law or agreed to in writing, |
12 | | // software distributed under the License is distributed on an |
13 | | // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY |
14 | | // KIND, either express or implied. See the License for the |
15 | | // specific language governing permissions and limitations |
16 | | // under the License. |
17 | | |
18 | | #include "storage/merger.h" |
19 | | |
20 | | #include <gen_cpp/olap_file.pb.h> |
21 | | #include <gen_cpp/types.pb.h> |
22 | | #include <stddef.h> |
23 | | #include <unistd.h> |
24 | | |
25 | | #include <algorithm> |
26 | | #include <iterator> |
27 | | #include <memory> |
28 | | #include <mutex> |
29 | | #include <numeric> |
30 | | #include <ostream> |
31 | | #include <shared_mutex> |
32 | | #include <string> |
33 | | #include <unordered_map> |
34 | | #include <utility> |
35 | | #include <vector> |
36 | | |
37 | | #include "cloud/config.h" |
38 | | #include "common/config.h" |
39 | | #include "common/logging.h" |
40 | | #include "common/status.h" |
41 | | #include "core/block/block.h" |
42 | | #include "storage/iterator/block_reader.h" |
43 | | #include "storage/iterator/vertical_block_reader.h" |
44 | | #include "storage/iterator/vertical_merge_iterator.h" |
45 | | #include "storage/iterators.h" |
46 | | #include "storage/olap_common.h" |
47 | | #include "storage/olap_define.h" |
48 | | #include "storage/rowid_conversion.h" |
49 | | #include "storage/rowset/beta_rowset.h" |
50 | | #include "storage/rowset/rowset.h" |
51 | | #include "storage/rowset/rowset_meta.h" |
52 | | #include "storage/rowset/rowset_writer.h" |
53 | | #include "storage/segment/segment.h" |
54 | | #include "storage/segment/segment_writer.h" |
55 | | #include "storage/storage_engine.h" |
56 | | #include "storage/tablet/base_tablet.h" |
57 | | #include "storage/tablet/tablet.h" |
58 | | #include "storage/tablet/tablet_fwd.h" |
59 | | #include "storage/tablet/tablet_meta.h" |
60 | | #include "storage/tablet/tablet_reader.h" |
61 | | #include "storage/types.h" |
62 | | #include "storage/utils.h" |
63 | | #include "util/slice.h" |
64 | | |
65 | | namespace doris { |
66 | | Status Merger::vmerge_rowsets(BaseTabletSPtr tablet, ReaderType reader_type, |
67 | | const TabletSchema& cur_tablet_schema, |
68 | | const std::vector<RowsetReaderSharedPtr>& src_rowset_readers, |
69 | 1.45k | RowsetWriter* dst_rowset_writer, Statistics* stats_output) { |
70 | 1.45k | if (!cur_tablet_schema.cluster_key_uids().empty()) { |
71 | 0 | return Status::InternalError( |
72 | 0 | "mow table with cluster keys does not support non vertical compaction"); |
73 | 0 | } |
74 | 1.45k | BlockReader reader; |
75 | 1.45k | TabletReader::ReaderParams reader_params; |
76 | 1.45k | reader_params.tablet = tablet; |
77 | 1.45k | reader_params.reader_type = reader_type; |
78 | | |
79 | 1.45k | TabletReadSource read_source; |
80 | 1.45k | read_source.rs_splits.reserve(src_rowset_readers.size()); |
81 | 1.56k | for (const RowsetReaderSharedPtr& rs_reader : src_rowset_readers) { |
82 | 1.56k | read_source.rs_splits.emplace_back(rs_reader); |
83 | 1.56k | } |
84 | 1.45k | read_source.fill_delete_predicates(); |
85 | 1.45k | reader_params.set_read_source(std::move(read_source)); |
86 | | |
87 | 1.45k | reader_params.version = dst_rowset_writer->version(); |
88 | | |
89 | 1.45k | TabletSchemaSPtr merge_tablet_schema = std::make_shared<TabletSchema>(); |
90 | 1.45k | merge_tablet_schema->copy_from(cur_tablet_schema); |
91 | | |
92 | | // Merge the columns in delete predicate that not in latest schema in to current tablet schema |
93 | 1.45k | for (auto& del_pred_rs : reader_params.delete_predicates) { |
94 | 24 | merge_tablet_schema->merge_dropped_columns(*del_pred_rs->tablet_schema()); |
95 | 24 | } |
96 | 1.45k | reader_params.tablet_schema = merge_tablet_schema; |
97 | 1.45k | if (!tablet->tablet_schema()->cluster_key_uids().empty()) { |
98 | 0 | reader_params.delete_bitmap = tablet->tablet_meta()->delete_bitmap_ptr(); |
99 | 0 | } |
100 | 1.45k | if (reader_params.reader_type == ReaderType::READER_BINLOG_COMPACTION) { |
101 | 0 | reader_params.delete_bitmap = tablet->tablet_meta()->binlog_delvec_ptr(); |
102 | 0 | } |
103 | | |
104 | 1.45k | if (stats_output && stats_output->rowid_conversion) { |
105 | 48 | reader_params.record_rowids = true; |
106 | 48 | reader_params.rowid_conversion = stats_output->rowid_conversion; |
107 | 48 | stats_output->rowid_conversion->set_dst_rowset_id(dst_rowset_writer->rowset_id()); |
108 | 48 | } |
109 | | |
110 | 1.45k | reader_params.return_columns.resize(cur_tablet_schema.num_columns()); |
111 | 1.45k | std::iota(reader_params.return_columns.begin(), reader_params.return_columns.end(), 0); |
112 | 1.45k | reader_params.origin_return_columns = &reader_params.return_columns; |
113 | 1.45k | RETURN_IF_ERROR(reader.init(reader_params)); |
114 | | |
115 | 1.45k | Block block = cur_tablet_schema.create_block(reader_params.return_columns); |
116 | 1.45k | size_t output_rows = 0; |
117 | 1.45k | bool eof = false; |
118 | 4.24k | while (!eof && !ExecEnv::GetInstance()->storage_engine().stopped()) { |
119 | 2.79k | auto tablet_state = tablet->tablet_state(); |
120 | 2.79k | if (tablet_state != TABLET_RUNNING && tablet_state != TABLET_NOTREADY) { |
121 | 0 | tablet->clear_cache(); |
122 | 0 | return Status::Error<INTERNAL_ERROR>("tablet {} is not used any more", |
123 | 0 | tablet->tablet_id()); |
124 | 0 | } |
125 | | |
126 | | // Read one block from block reader |
127 | 2.79k | RETURN_NOT_OK_STATUS_WITH_WARN(reader.next_block_with_aggregation(&block, &eof), |
128 | 2.79k | "failed to read next block when merging rowsets of tablet " + |
129 | 2.79k | std::to_string(tablet->tablet_id())); |
130 | 2.79k | RETURN_NOT_OK_STATUS_WITH_WARN(dst_rowset_writer->add_block(&block), |
131 | 2.79k | "failed to write block when merging rowsets of tablet " + |
132 | 2.79k | std::to_string(tablet->tablet_id())); |
133 | | |
134 | 2.79k | if (reader_params.record_rowids && block.rows() > 0) { |
135 | 578 | std::vector<uint32_t> segment_num_rows; |
136 | 578 | RETURN_IF_ERROR(dst_rowset_writer->get_segment_num_rows(&segment_num_rows)); |
137 | 578 | stats_output->rowid_conversion->add(reader.current_block_row_locations(), |
138 | 578 | segment_num_rows); |
139 | 578 | } |
140 | | |
141 | 2.79k | output_rows += block.rows(); |
142 | 2.79k | block.clear_column_data(); |
143 | 2.79k | } |
144 | 1.45k | if (ExecEnv::GetInstance()->storage_engine().stopped()) { |
145 | 0 | return Status::Error<INTERNAL_ERROR>("tablet {} failed to do compaction, engine stopped", |
146 | 0 | tablet->tablet_id()); |
147 | 0 | } |
148 | | |
149 | 1.45k | if (stats_output != nullptr) { |
150 | 1.45k | stats_output->output_rows = output_rows; |
151 | 1.45k | stats_output->merged_rows = reader.merged_rows(); |
152 | 1.45k | stats_output->filtered_rows = reader.filtered_rows(); |
153 | 1.45k | stats_output->bytes_read_from_local = reader.stats().file_cache_stats.bytes_read_from_local; |
154 | 1.45k | stats_output->bytes_read_from_remote = |
155 | 1.45k | reader.stats().file_cache_stats.bytes_read_from_remote; |
156 | 1.45k | stats_output->cached_bytes_total = reader.stats().file_cache_stats.bytes_write_into_cache; |
157 | 1.45k | if (config::is_cloud_mode()) { |
158 | 1.41k | stats_output->cloud_local_read_time = |
159 | 1.41k | reader.stats().file_cache_stats.local_io_timer / 1000; |
160 | 1.41k | stats_output->cloud_remote_read_time = |
161 | 1.41k | reader.stats().file_cache_stats.remote_io_timer / 1000; |
162 | 1.41k | } |
163 | 1.45k | } |
164 | | |
165 | 1.45k | RETURN_NOT_OK_STATUS_WITH_WARN(dst_rowset_writer->flush(), |
166 | 1.45k | "failed to flush rowset when merging rowsets of tablet " + |
167 | 1.45k | std::to_string(tablet->tablet_id())); |
168 | | |
169 | 1.45k | return Status::OK(); |
170 | 1.45k | } |
171 | | |
172 | | // split columns into several groups, make sure all keys in one group |
173 | | // unique_key should consider sequence&delete column |
174 | | void Merger::vertical_split_columns(const TabletSchema& tablet_schema, |
175 | | std::vector<std::vector<uint32_t>>* column_groups, |
176 | | std::vector<uint32_t>* key_group_cluster_key_idxes, |
177 | 10.2k | int32_t num_columns_per_group) { |
178 | 10.2k | size_t num_key_cols = tablet_schema.num_key_columns(); |
179 | 10.2k | size_t total_cols = tablet_schema.num_columns(); |
180 | 10.2k | std::vector<uint32_t> key_columns; |
181 | 47.1k | for (auto i = 0; i < num_key_cols; ++i) { |
182 | 36.9k | key_columns.emplace_back(i); |
183 | 36.9k | } |
184 | | // in unique key, sequence & delete sign column should merge with key columns |
185 | 10.2k | int32_t sequence_col_idx = -1; |
186 | 10.2k | int32_t delete_sign_idx = -1; |
187 | | // in key column compaction, seq_col real index is _num_key_columns |
188 | | // and delete_sign column is _block->columns() - 1 |
189 | 10.2k | if (tablet_schema.keys_type() == KeysType::UNIQUE_KEYS) { |
190 | 5.80k | if (tablet_schema.has_sequence_col()) { |
191 | 86 | sequence_col_idx = tablet_schema.sequence_col_idx(); |
192 | 86 | key_columns.emplace_back(sequence_col_idx); |
193 | 86 | } |
194 | 5.80k | delete_sign_idx = tablet_schema.field_index(DELETE_SIGN); |
195 | 5.80k | if (delete_sign_idx != -1) { |
196 | 5.79k | key_columns.emplace_back(delete_sign_idx); |
197 | 5.79k | } |
198 | 5.80k | if (!tablet_schema.cluster_key_uids().empty()) { |
199 | 442 | for (const auto& cid : tablet_schema.cluster_key_uids()) { |
200 | 442 | auto idx = tablet_schema.field_index(cid); |
201 | 442 | DCHECK(idx >= 0) << "could not find cluster key column with unique_id=" << cid |
202 | 0 | << " in tablet schema, table_id=" << tablet_schema.table_id(); |
203 | 442 | if (idx >= num_key_cols) { |
204 | 213 | key_columns.emplace_back(idx); |
205 | 213 | } |
206 | 442 | } |
207 | | // tablet schema unique ids: [1, 2, 5, 3, 6, 4], [1 2] is key columns |
208 | | // cluster key unique ids: [3, 1, 4] |
209 | | // the key_columns should be [0, 1, 3, 5] |
210 | | // the key_group_cluster_key_idxes should be [2, 1, 3] |
211 | 441 | for (const auto& cid : tablet_schema.cluster_key_uids()) { |
212 | 441 | auto idx = tablet_schema.field_index(cid); |
213 | 2.53k | for (auto i = 0; i < key_columns.size(); ++i) { |
214 | 2.53k | if (idx == key_columns[i]) { |
215 | 442 | key_group_cluster_key_idxes->emplace_back(i); |
216 | 442 | break; |
217 | 442 | } |
218 | 2.53k | } |
219 | 441 | } |
220 | 149 | } |
221 | 5.80k | } |
222 | 10.2k | VLOG_NOTICE << "sequence_col_idx=" << sequence_col_idx |
223 | 46 | << ", delete_sign_idx=" << delete_sign_idx; |
224 | | // for duplicate no keys |
225 | 10.2k | if (!key_columns.empty()) { |
226 | 10.1k | column_groups->emplace_back(key_columns); |
227 | 10.1k | } |
228 | | |
229 | 10.2k | std::vector<uint32_t> value_columns; |
230 | | |
231 | 79.9k | for (size_t i = num_key_cols; i < total_cols; ++i) { |
232 | 69.7k | if (i == sequence_col_idx || i == delete_sign_idx || |
233 | 69.7k | key_columns.end() != std::find(key_columns.begin(), key_columns.end(), i)) { |
234 | 6.07k | continue; |
235 | 6.07k | } |
236 | | |
237 | 63.6k | if (!value_columns.empty() && value_columns.size() % num_columns_per_group == 0) { |
238 | 6.91k | column_groups->push_back(value_columns); |
239 | 6.91k | value_columns.clear(); |
240 | 6.91k | } |
241 | 63.6k | value_columns.push_back(cast_set<uint32_t>(i)); |
242 | 63.6k | } |
243 | | |
244 | 10.2k | if (!value_columns.empty()) { |
245 | 9.68k | column_groups->push_back(value_columns); |
246 | 9.68k | } |
247 | 10.2k | } |
248 | | |
249 | | Status Merger::vertical_compact_one_group( |
250 | | BaseTabletSPtr tablet, ReaderType reader_type, const TabletSchema& tablet_schema, |
251 | | bool is_key, const std::vector<uint32_t>& column_group, RowSourcesBuffer* row_source_buf, |
252 | | const std::vector<RowsetReaderSharedPtr>& src_rowset_readers, |
253 | | RowsetWriter* dst_rowset_writer, uint32_t max_rows_per_segment, Statistics* stats_output, |
254 | | std::vector<uint32_t> key_group_cluster_key_idxes, int64_t batch_size, |
255 | 26.8k | CompactionSampleInfo* sample_info, bool enable_sparse_optimization) { |
256 | | // build tablet reader |
257 | 26.8k | VLOG_NOTICE << "vertical compact one group, max_rows_per_segment=" << max_rows_per_segment; |
258 | 26.8k | VerticalBlockReader reader(row_source_buf); |
259 | 26.8k | TabletReader::ReaderParams reader_params; |
260 | 26.8k | reader_params.is_key_column_group = is_key; |
261 | 26.8k | reader_params.key_group_cluster_key_idxes = key_group_cluster_key_idxes; |
262 | 26.8k | reader_params.tablet = tablet; |
263 | 26.8k | reader_params.reader_type = reader_type; |
264 | 26.8k | reader_params.enable_sparse_optimization = enable_sparse_optimization; |
265 | | |
266 | 26.8k | TabletReadSource read_source; |
267 | 26.8k | read_source.rs_splits.reserve(src_rowset_readers.size()); |
268 | 216k | for (const RowsetReaderSharedPtr& rs_reader : src_rowset_readers) { |
269 | 216k | read_source.rs_splits.emplace_back(rs_reader); |
270 | 216k | } |
271 | 26.8k | read_source.fill_delete_predicates(); |
272 | 26.8k | reader_params.set_read_source(std::move(read_source)); |
273 | | |
274 | 26.8k | reader_params.version = dst_rowset_writer->version(); |
275 | | |
276 | 26.8k | TabletSchemaSPtr merge_tablet_schema = std::make_shared<TabletSchema>(); |
277 | 26.8k | merge_tablet_schema->copy_from(tablet_schema); |
278 | | |
279 | 26.8k | for (auto& del_pred_rs : reader_params.delete_predicates) { |
280 | 670 | merge_tablet_schema->merge_dropped_columns(*del_pred_rs->tablet_schema()); |
281 | 670 | } |
282 | | |
283 | 26.8k | reader_params.tablet_schema = merge_tablet_schema; |
284 | 26.8k | bool has_cluster_key = false; |
285 | 26.8k | if (!tablet->tablet_schema()->cluster_key_uids().empty()) { |
286 | 420 | reader_params.delete_bitmap = tablet->tablet_meta()->delete_bitmap_ptr(); |
287 | 420 | has_cluster_key = true; |
288 | 420 | } |
289 | 26.8k | if (reader_params.reader_type == ReaderType::READER_BINLOG_COMPACTION) { |
290 | 0 | reader_params.delete_bitmap = tablet->tablet_meta()->binlog_delvec_ptr(); |
291 | 0 | } |
292 | | |
293 | 26.8k | if (is_key && stats_output && stats_output->rowid_conversion) { |
294 | 6.08k | reader_params.record_rowids = true; |
295 | 6.08k | reader_params.rowid_conversion = stats_output->rowid_conversion; |
296 | 6.08k | stats_output->rowid_conversion->set_dst_rowset_id(dst_rowset_writer->rowset_id()); |
297 | 6.08k | } |
298 | | |
299 | 26.8k | reader_params.return_columns = column_group; |
300 | 26.8k | reader_params.origin_return_columns = &reader_params.return_columns; |
301 | 26.8k | reader_params.batch_size = batch_size; |
302 | 26.8k | RETURN_IF_ERROR(reader.init(reader_params, sample_info)); |
303 | | |
304 | 26.8k | Block block = tablet_schema.create_block(reader_params.return_columns); |
305 | 26.8k | size_t output_rows = 0; |
306 | 26.8k | bool eof = false; |
307 | 64.5k | while (!eof && !ExecEnv::GetInstance()->storage_engine().stopped()) { |
308 | 37.7k | auto tablet_state = tablet->tablet_state(); |
309 | 37.7k | if (tablet_state != TABLET_RUNNING && tablet_state != TABLET_NOTREADY) { |
310 | 0 | tablet->clear_cache(); |
311 | 0 | return Status::Error<INTERNAL_ERROR>("tablet {} is not used any more", |
312 | 0 | tablet->tablet_id()); |
313 | 0 | } |
314 | | // Read one block from block reader |
315 | 37.7k | RETURN_NOT_OK_STATUS_WITH_WARN(reader.next_block_with_aggregation(&block, &eof), |
316 | 37.7k | "failed to read next block when merging rowsets of tablet " + |
317 | 37.7k | std::to_string(tablet->tablet_id())); |
318 | 37.7k | RETURN_NOT_OK_STATUS_WITH_WARN( |
319 | 37.7k | dst_rowset_writer->add_columns(&block, column_group, is_key, max_rows_per_segment, |
320 | 37.7k | has_cluster_key), |
321 | 37.7k | "failed to write block when merging rowsets of tablet " + |
322 | 37.7k | std::to_string(tablet->tablet_id())); |
323 | | |
324 | 37.7k | if (is_key && reader_params.record_rowids && block.rows() > 0) { |
325 | 6.33k | std::vector<uint32_t> segment_num_rows; |
326 | 6.33k | RETURN_IF_ERROR(dst_rowset_writer->get_segment_num_rows(&segment_num_rows)); |
327 | 6.33k | stats_output->rowid_conversion->add(reader.current_block_row_locations(), |
328 | 6.33k | segment_num_rows); |
329 | 6.33k | } |
330 | 37.7k | output_rows += block.rows(); |
331 | 37.7k | block.clear_column_data(); |
332 | 37.7k | } |
333 | 26.8k | if (ExecEnv::GetInstance()->storage_engine().stopped()) { |
334 | 0 | return Status::Error<INTERNAL_ERROR>("tablet {} failed to do compaction, engine stopped", |
335 | 0 | tablet->tablet_id()); |
336 | 0 | } |
337 | | |
338 | 26.8k | if (stats_output != nullptr) { |
339 | 26.7k | if (is_key) { |
340 | 10.1k | stats_output->output_rows = output_rows; |
341 | 10.1k | stats_output->merged_rows = reader.merged_rows(); |
342 | 10.1k | stats_output->filtered_rows = reader.filtered_rows(); |
343 | 10.1k | } |
344 | 26.7k | stats_output->bytes_read_from_local = reader.stats().file_cache_stats.bytes_read_from_local; |
345 | 26.7k | stats_output->bytes_read_from_remote = |
346 | 26.7k | reader.stats().file_cache_stats.bytes_read_from_remote; |
347 | 26.7k | stats_output->cached_bytes_total = reader.stats().file_cache_stats.bytes_write_into_cache; |
348 | 26.7k | if (config::is_cloud_mode()) { |
349 | 18.9k | stats_output->cloud_local_read_time = |
350 | 18.9k | reader.stats().file_cache_stats.local_io_timer / 1000; |
351 | 18.9k | stats_output->cloud_remote_read_time = |
352 | 18.9k | reader.stats().file_cache_stats.remote_io_timer / 1000; |
353 | 18.9k | } |
354 | 26.7k | } |
355 | 26.8k | RETURN_IF_ERROR(dst_rowset_writer->flush_columns(is_key)); |
356 | | |
357 | 26.8k | return Status::OK(); |
358 | 26.8k | } |
359 | | |
360 | | // for segcompaction |
361 | | Status Merger::vertical_compact_one_group( |
362 | | int64_t tablet_id, ReaderType reader_type, const TabletSchema& tablet_schema, bool is_key, |
363 | | const std::vector<uint32_t>& column_group, RowSourcesBuffer* row_source_buf, |
364 | | VerticalBlockReader& src_block_reader, segment_v2::SegmentWriter& dst_segment_writer, |
365 | | Statistics* stats_output, uint64_t* index_size, KeyBoundsPB& key_bounds, |
366 | 22 | SimpleRowIdConversion* rowid_conversion) { |
367 | | // TODO: record_rowids |
368 | 22 | Block block = tablet_schema.create_block(column_group); |
369 | 22 | size_t output_rows = 0; |
370 | 22 | bool eof = false; |
371 | 138 | while (!eof && !ExecEnv::GetInstance()->storage_engine().stopped()) { |
372 | | // Read one block from block reader |
373 | 116 | RETURN_NOT_OK_STATUS_WITH_WARN(src_block_reader.next_block_with_aggregation(&block, &eof), |
374 | 116 | "failed to read next block when merging rowsets of tablet " + |
375 | 116 | std::to_string(tablet_id)); |
376 | 116 | if (!block.rows()) { |
377 | 0 | break; |
378 | 0 | } |
379 | 116 | RETURN_NOT_OK_STATUS_WITH_WARN(dst_segment_writer.append_block(&block, 0, block.rows()), |
380 | 116 | "failed to write block when merging rowsets of tablet " + |
381 | 116 | std::to_string(tablet_id)); |
382 | | |
383 | 116 | if (is_key && rowid_conversion != nullptr) { |
384 | 30 | rowid_conversion->add(src_block_reader.current_block_row_locations()); |
385 | 30 | } |
386 | 116 | output_rows += block.rows(); |
387 | 116 | block.clear_column_data(); |
388 | 116 | } |
389 | 22 | if (ExecEnv::GetInstance()->storage_engine().stopped()) { |
390 | 0 | return Status::Error<INTERNAL_ERROR>("tablet {} failed to do compaction, engine stopped", |
391 | 0 | tablet_id); |
392 | 0 | } |
393 | | |
394 | 22 | if (stats_output != nullptr) { |
395 | 22 | if (is_key) { |
396 | 11 | stats_output->output_rows = output_rows; |
397 | 11 | stats_output->merged_rows = src_block_reader.merged_rows(); |
398 | 11 | stats_output->filtered_rows = src_block_reader.filtered_rows(); |
399 | 11 | } |
400 | 22 | stats_output->bytes_read_from_local = |
401 | 22 | src_block_reader.stats().file_cache_stats.bytes_read_from_local; |
402 | 22 | stats_output->bytes_read_from_remote = |
403 | 22 | src_block_reader.stats().file_cache_stats.bytes_read_from_remote; |
404 | 22 | stats_output->cached_bytes_total = |
405 | 22 | src_block_reader.stats().file_cache_stats.bytes_write_into_cache; |
406 | 22 | } |
407 | | |
408 | | // segcompaction produce only one segment at once |
409 | 22 | RETURN_IF_ERROR(dst_segment_writer.finalize_columns_data()); |
410 | 22 | RETURN_IF_ERROR(dst_segment_writer.finalize_columns_index(index_size)); |
411 | | |
412 | 22 | if (is_key) { |
413 | 11 | Slice min_key = dst_segment_writer.min_encoded_key(); |
414 | 11 | Slice max_key = dst_segment_writer.max_encoded_key(); |
415 | 11 | DCHECK_LE(min_key.compare(max_key), 0); |
416 | 11 | key_bounds.set_min_key(min_key.to_string()); |
417 | 11 | key_bounds.set_max_key(max_key.to_string()); |
418 | 11 | } |
419 | | |
420 | 22 | return Status::OK(); |
421 | 22 | } |
422 | | |
423 | | int64_t estimate_batch_size(int group_index, BaseTabletSPtr tablet, int64_t way_cnt, |
424 | | ReaderType reader_type, int64_t group_per_row_from_footer, |
425 | 26.5k | bool footer_fallback) { |
426 | 26.5k | auto& sample_info_lock = tablet->get_sample_info_lock(reader_type); |
427 | 26.5k | auto& sample_infos = tablet->get_sample_infos(reader_type); |
428 | 26.5k | std::unique_lock<std::mutex> lock(sample_info_lock); |
429 | 26.5k | CompactionSampleInfo info = sample_infos[group_index]; |
430 | 26.5k | if (way_cnt <= 0) { |
431 | 6.86k | LOG(INFO) << "estimate batch size for vertical compaction, tablet id: " |
432 | 6.86k | << tablet->tablet_id() << " way cnt: " << way_cnt; |
433 | 6.86k | return 4096 - 32; |
434 | 6.86k | } |
435 | 19.7k | int64_t block_mem_limit = config::compaction_memory_bytes_limit / way_cnt; |
436 | 19.7k | if (tablet->last_compaction_status.is<ErrorCode::MEM_LIMIT_EXCEEDED>()) { |
437 | 0 | block_mem_limit /= 4; |
438 | 0 | } |
439 | | |
440 | 19.7k | int64_t group_data_size = 0; |
441 | 19.7k | if (info.group_data_size > 0 && info.bytes > 0 && info.rows > 0) { |
442 | 0 | double smoothing_factor = 0.5; |
443 | 0 | group_data_size = |
444 | 0 | int64_t((cast_set<double>(info.group_data_size) * (1 - smoothing_factor)) + |
445 | 0 | (cast_set<double>(info.bytes / info.rows) * smoothing_factor)); |
446 | 0 | sample_infos[group_index].group_data_size = group_data_size; |
447 | 19.7k | } else if (info.group_data_size > 0 && (info.bytes <= 0 || info.rows <= 0)) { |
448 | 0 | group_data_size = info.group_data_size; |
449 | 19.7k | } else if (info.group_data_size <= 0 && info.bytes > 0 && info.rows > 0) { |
450 | 12.1k | group_data_size = info.bytes / info.rows; |
451 | 12.1k | sample_infos[group_index].group_data_size = group_data_size; |
452 | 12.1k | } else { |
453 | | // No historical sampling data available. |
454 | | // Try to use raw_data_bytes from segment footer for a better estimate. |
455 | 7.53k | if (!footer_fallback && group_per_row_from_footer > 0) { |
456 | 7.12k | int64_t batch_size = block_mem_limit / group_per_row_from_footer; |
457 | 7.12k | int64_t res = std::max(std::min(batch_size, int64_t(4096 - 32)), int64_t(32L)); |
458 | 7.12k | LOG(INFO) << "estimate batch size from footer for vertical compaction, tablet id: " |
459 | 7.12k | << tablet->tablet_id() |
460 | 7.12k | << " group_per_row_from_footer: " << group_per_row_from_footer |
461 | 7.12k | << " way cnt: " << way_cnt << " batch size: " << res; |
462 | 7.12k | return res; |
463 | 7.12k | } |
464 | 7.53k | LOG(INFO) << "estimate batch size for vertical compaction, tablet id: " |
465 | 413 | << tablet->tablet_id() << " group data size: " << info.group_data_size |
466 | 413 | << " row num: " << info.rows << " consume bytes: " << info.bytes |
467 | 413 | << " footer_fallback: " << footer_fallback; |
468 | 413 | return 1024 - 32; |
469 | 7.53k | } |
470 | | |
471 | 12.1k | if (group_data_size <= 0) { |
472 | 0 | LOG(WARNING) << "estimate batch size for vertical compaction, tablet id: " |
473 | 0 | << tablet->tablet_id() << " unexpected group data size: " << group_data_size; |
474 | 0 | return 4096 - 32; |
475 | 0 | } |
476 | | |
477 | 12.1k | sample_infos[group_index].bytes = 0; |
478 | 12.1k | sample_infos[group_index].rows = 0; |
479 | | |
480 | 12.1k | int64_t batch_size = block_mem_limit / group_data_size; |
481 | 12.1k | int64_t res = std::max(std::min(batch_size, int64_t(4096 - 32)), int64_t(32L)); |
482 | 12.1k | LOG(INFO) << "estimate batch size for vertical compaction, tablet id: " << tablet->tablet_id() |
483 | 12.1k | << " group data size: " << info.group_data_size << " row num: " << info.rows |
484 | 12.1k | << " consume bytes: " << info.bytes << " way cnt: " << way_cnt |
485 | 12.1k | << " batch size: " << res; |
486 | 12.1k | return res; |
487 | 12.1k | } |
488 | | |
489 | | // steps to do vertical merge: |
490 | | // 1. split columns into column groups |
491 | | // 2. compact groups one by one, generate a row_source_buf when compact key group |
492 | | // and use this row_source_buf to compact value column groups |
493 | | // 3. build output rowset |
494 | | Status Merger::vertical_merge_rowsets(BaseTabletSPtr tablet, ReaderType reader_type, |
495 | | const TabletSchema& tablet_schema, |
496 | | const std::vector<RowsetReaderSharedPtr>& src_rowset_readers, |
497 | | RowsetWriter* dst_rowset_writer, |
498 | | uint32_t max_rows_per_segment, int64_t merge_way_num, |
499 | | Statistics* stats_output, |
500 | 10.1k | VerticalCompactionProgressCallback progress_cb) { |
501 | 10.1k | LOG(INFO) << "Start to do vertical compaction, tablet_id: " << tablet->tablet_id(); |
502 | 10.1k | std::vector<std::vector<uint32_t>> column_groups; |
503 | 10.1k | std::vector<uint32_t> key_group_cluster_key_idxes; |
504 | | // If BE config vertical_compaction_num_columns_per_group has been modified from |
505 | | // its default value (5), use the BE config; otherwise use the tablet meta value. |
506 | 10.1k | constexpr int32_t default_num_columns_per_group = 5; |
507 | 10.1k | int32_t num_columns_per_group = |
508 | 10.1k | config::vertical_compaction_num_columns_per_group != default_num_columns_per_group |
509 | 10.1k | ? config::vertical_compaction_num_columns_per_group |
510 | 10.1k | : tablet->tablet_meta()->vertical_compaction_num_columns_per_group(); |
511 | | |
512 | 10.1k | DBUG_EXECUTE_IF("Merger.vertical_merge_rowsets.check_num_columns_per_group", { |
513 | 10.1k | auto expected_value = DebugPoints::instance()->get_debug_param_or_default<int32_t>( |
514 | 10.1k | "Merger.vertical_merge_rowsets.check_num_columns_per_group", "expected_value", -1); |
515 | 10.1k | auto expected_tablet_id = DebugPoints::instance()->get_debug_param_or_default<int64_t>( |
516 | 10.1k | "Merger.vertical_merge_rowsets.check_num_columns_per_group", "tablet_id", -1); |
517 | 10.1k | if (expected_tablet_id != -1 && expected_tablet_id == tablet->tablet_id()) { |
518 | 10.1k | if (expected_value != -1 && expected_value != num_columns_per_group) { |
519 | 10.1k | LOG(FATAL) << "DEBUG_POINT CHECK FAILED: expected num_columns_per_group=" |
520 | 10.1k | << expected_value << " but got " << num_columns_per_group |
521 | 10.1k | << " for tablet_id=" << tablet->tablet_id(); |
522 | 10.1k | } else { |
523 | 10.1k | LOG(INFO) << "DEBUG_POINT CHECK PASSED: num_columns_per_group=" |
524 | 10.1k | << num_columns_per_group << ", tablet_id=" << tablet->tablet_id(); |
525 | 10.1k | } |
526 | 10.1k | } |
527 | 10.1k | }); |
528 | | |
529 | 10.1k | vertical_split_columns(tablet_schema, &column_groups, &key_group_cluster_key_idxes, |
530 | 10.1k | num_columns_per_group); |
531 | | |
532 | 10.1k | if (progress_cb) { |
533 | 10.1k | progress_cb(column_groups.size(), 0); |
534 | 10.1k | } |
535 | | |
536 | | // Calculate total rows for density calculation after compaction |
537 | 10.1k | int64_t total_rows = 0; |
538 | 83.5k | for (const auto& rs_reader : src_rowset_readers) { |
539 | 83.5k | total_rows += rs_reader->rowset()->rowset_meta()->num_rows(); |
540 | 83.5k | } |
541 | | |
542 | | // Use historical density for sparse wide table optimization |
543 | | // density = (total_cells - null_cells) / total_cells, smaller means more sparse |
544 | | // When density <= threshold, enable sparse optimization |
545 | | // threshold = 0 means disable, 1 means always enable (default) |
546 | 10.1k | bool enable_sparse_optimization = false; |
547 | 10.1k | if (config::sparse_column_compaction_threshold_percent > 0 && |
548 | 10.2k | tablet->keys_type() == KeysType::UNIQUE_KEYS) { |
549 | 5.79k | double density = tablet->compaction_density.load(); |
550 | 5.79k | enable_sparse_optimization = density <= config::sparse_column_compaction_threshold_percent; |
551 | | |
552 | 5.79k | LOG(INFO) << "Vertical compaction sparse optimization check: tablet_id=" |
553 | 5.79k | << tablet->tablet_id() << ", density=" << density |
554 | 5.79k | << ", threshold=" << config::sparse_column_compaction_threshold_percent |
555 | 5.79k | << ", total_rows=" << total_rows |
556 | 5.79k | << ", num_columns=" << tablet_schema.num_columns() |
557 | 5.79k | << ", total_cells=" << total_rows * tablet_schema.num_columns() |
558 | 5.79k | << ", enable_sparse_optimization=" << enable_sparse_optimization; |
559 | 5.79k | } |
560 | | |
561 | 10.1k | RowSourcesBuffer row_sources_buf(tablet->tablet_id(), dst_rowset_writer->context().tablet_path, |
562 | 10.1k | reader_type); |
563 | 10.1k | Merger::Statistics total_stats; |
564 | 10.2k | if (stats_output != nullptr) { |
565 | 10.2k | total_stats.rowid_conversion = stats_output->rowid_conversion; |
566 | 10.2k | } |
567 | 10.1k | auto& sample_info_lock = tablet->get_sample_info_lock(reader_type); |
568 | 10.1k | auto& sample_infos = tablet->get_sample_infos(reader_type); |
569 | 10.1k | { |
570 | 10.1k | std::unique_lock<std::mutex> lock(sample_info_lock); |
571 | 10.1k | sample_infos.resize(column_groups.size()); |
572 | 10.1k | } |
573 | | // Collect per-column raw_data_bytes from segment footer for first-time batch size estimation. |
574 | | // raw_data_bytes is the original data size before encoding, close to runtime Block::bytes(). |
575 | | // Only collect when needed: skip if manual batch_size override is set, or if ALL groups |
576 | | // already have historical sampling data. Use per-group granularity so that schema evolution |
577 | | // (new groups without history) still gets footer-based estimation. |
578 | 10.1k | struct ColumnRawSizeInfo { |
579 | 10.1k | int64_t total_raw_bytes = 0; |
580 | 10.1k | int64_t rows_with_data = 0; |
581 | 10.1k | }; |
582 | 10.1k | std::unordered_map<int32_t, ColumnRawSizeInfo> column_raw_sizes; |
583 | 10.1k | bool need_footer_collection = false; |
584 | 10.1k | if (config::compaction_batch_size == -1) { |
585 | 10.1k | std::unique_lock<std::mutex> lock(sample_info_lock); |
586 | 19.2k | for (const auto& info : sample_infos) { |
587 | 19.2k | if (info.group_data_size <= 0 && info.bytes <= 0 && info.rows <= 0) { |
588 | 4.89k | need_footer_collection = true; |
589 | 4.89k | break; |
590 | 4.89k | } |
591 | 19.2k | } |
592 | 10.1k | } |
593 | 10.1k | if (need_footer_collection) { |
594 | 35.0k | for (const auto& rs_reader : src_rowset_readers) { |
595 | 35.0k | auto beta_rowset = std::dynamic_pointer_cast<BetaRowset>(rs_reader->rowset()); |
596 | 35.0k | if (!beta_rowset) { |
597 | 0 | continue; |
598 | 0 | } |
599 | 35.0k | std::vector<segment_v2::SegmentSharedPtr> segments; |
600 | 35.0k | auto st = beta_rowset->load_segments(&segments); |
601 | 35.0k | if (!st.ok()) { |
602 | 0 | LOG(WARNING) << "Failed to load segments for footer raw_data_bytes collection" |
603 | 0 | << ", tablet_id: " << tablet->tablet_id() |
604 | 0 | << ", rowset_id: " << beta_rowset->rowset_id() << ", status: " << st; |
605 | 0 | continue; |
606 | 0 | } |
607 | 35.0k | for (const auto& segment : segments) { |
608 | 13.6k | int64_t row_count = segment->num_rows(); |
609 | 13.6k | auto collect_st = segment->traverse_column_meta_pbs( |
610 | 132k | [&](const segment_v2::ColumnMetaPB& meta) { |
611 | 132k | int32_t uid = meta.unique_id(); |
612 | 132k | if (uid >= 0 && meta.has_raw_data_bytes()) { |
613 | 125k | auto& info = column_raw_sizes[uid]; |
614 | 125k | info.total_raw_bytes += meta.raw_data_bytes(); |
615 | 125k | info.rows_with_data += row_count; |
616 | 125k | } |
617 | 132k | }); |
618 | 13.6k | if (!collect_st.ok()) { |
619 | 0 | LOG(WARNING) << "Failed to traverse column meta for footer collection" |
620 | 0 | << ", tablet_id: " << tablet->tablet_id() |
621 | 0 | << ", status: " << collect_st; |
622 | 0 | } |
623 | 13.6k | } |
624 | 35.0k | } |
625 | 4.88k | } |
626 | | |
627 | | // Pre-compute per-row estimate for each column group from footer data. |
628 | 10.1k | std::vector<int64_t> group_per_row_from_footer(column_groups.size(), 0); |
629 | 10.1k | std::vector<bool> group_footer_fallback(column_groups.size(), false); |
630 | 36.9k | for (size_t i = 0; i < column_groups.size(); ++i) { |
631 | 26.7k | int64_t group_per_row = 0; |
632 | 26.7k | bool need_fallback = false; |
633 | 44.9k | for (uint32_t col_ordinal : column_groups[i]) { |
634 | 44.9k | const auto& col = tablet_schema.column(col_ordinal); |
635 | 44.9k | int32_t uid = col.unique_id(); |
636 | | |
637 | | // Variant columns (root or subcolumn): raw_data_bytes is 0 (TODO in writer), |
638 | | // cannot estimate from footer, fallback to default for the entire group. |
639 | 44.9k | if (uid < 0 || col.is_variant_type()) { |
640 | 1.03k | need_fallback = true; |
641 | 1.03k | break; |
642 | 1.03k | } |
643 | | |
644 | | // Any column without footer data (e.g. legacy segments written before |
645 | | // raw_data_bytes existed) makes the group sample partial and unreliable. |
646 | | // Fall back to the default for the whole group instead of summing only |
647 | | // the columns we measured. |
648 | 43.9k | auto it = column_raw_sizes.find(uid); |
649 | 43.9k | if (it == column_raw_sizes.end() || it->second.rows_with_data <= 0) { |
650 | 18.6k | need_fallback = true; |
651 | 18.6k | break; |
652 | 18.6k | } |
653 | | |
654 | 25.3k | int64_t raw_per_row = it->second.total_raw_bytes / it->second.rows_with_data; |
655 | 25.3k | int64_t col_per_row = 0; |
656 | | |
657 | 25.3k | if (col.type() == FieldType::OLAP_FIELD_TYPE_ARRAY || |
658 | 25.3k | col.type() == FieldType::OLAP_FIELD_TYPE_MAP || |
659 | 25.3k | col.type() == FieldType::OLAP_FIELD_TYPE_STRUCT) { |
660 | | // Complex types: raw_data_bytes recursively aggregates sub-writers. |
661 | 1.32k | col_per_row = raw_per_row; |
662 | 24.0k | } else if (col.is_length_variable_type()) { |
663 | | // Variable-length scalar (VARCHAR/STRING/HLL/BITMAP/...): raw_per_row |
664 | | // is the average char payload across all rows; reader still pays an |
665 | | // 8-byte offset entry per row regardless of null-ness. |
666 | 9.01k | col_per_row = raw_per_row + 8; |
667 | 9.01k | if (col.is_nullable()) { |
668 | 4.72k | col_per_row += 1; // null map |
669 | 4.72k | } |
670 | 15.0k | } else { |
671 | | // Fixed-width scalar (INT/BIGINT/DOUBLE/DATE/...). |
672 | | // raw_data_bytes only counts non-null payload (append_nulls() does |
673 | | // not advance the page builder), but FileColumnIterator::next_batch |
674 | | // still calls ColumnNullable::insert_many_defaults() for null runs, |
675 | | // which grows the nested PODArray by N * type_size. So the runtime |
676 | | // per-row footprint is at least type_size, no matter how sparse. |
677 | 15.0k | int64_t type_size = field_type_size(col.type()); |
678 | 15.0k | col_per_row = std::max(raw_per_row, type_size); |
679 | 15.0k | if (col.is_nullable()) { |
680 | 8.66k | col_per_row += 1; // null map |
681 | 8.66k | } |
682 | 15.0k | } |
683 | | |
684 | 25.3k | group_per_row += col_per_row; |
685 | 25.3k | } |
686 | 26.7k | group_per_row_from_footer[i] = group_per_row; |
687 | 26.7k | group_footer_fallback[i] = need_fallback; |
688 | 26.7k | } |
689 | | |
690 | | // compact group one by one |
691 | 36.9k | for (auto i = 0; i < column_groups.size(); ++i) { |
692 | 26.7k | VLOG_NOTICE << "row source size: " << row_sources_buf.total_size(); |
693 | 26.7k | bool is_key = (i == 0); |
694 | 26.7k | int64_t batch_size = config::compaction_batch_size != -1 |
695 | 26.7k | ? config::compaction_batch_size |
696 | 26.7k | : estimate_batch_size(i, tablet, merge_way_num, reader_type, |
697 | 26.6k | group_per_row_from_footer[i], |
698 | 26.6k | group_footer_fallback[i]); |
699 | 26.7k | CompactionSampleInfo sample_info; |
700 | 26.7k | Merger::Statistics group_stats; |
701 | 26.7k | group_stats.rowid_conversion = total_stats.rowid_conversion; |
702 | 18.4E | Merger::Statistics* group_stats_ptr = stats_output != nullptr ? &group_stats : nullptr; |
703 | 26.7k | Status st = vertical_compact_one_group( |
704 | 26.7k | tablet, reader_type, tablet_schema, is_key, column_groups[i], &row_sources_buf, |
705 | 26.7k | src_rowset_readers, dst_rowset_writer, max_rows_per_segment, group_stats_ptr, |
706 | 26.7k | key_group_cluster_key_idxes, batch_size, &sample_info, enable_sparse_optimization); |
707 | 26.7k | { |
708 | 26.7k | std::unique_lock<std::mutex> lock(sample_info_lock); |
709 | 26.7k | sample_infos[i] = sample_info; |
710 | 26.7k | } |
711 | 26.7k | RETURN_IF_ERROR(st); |
712 | 26.7k | if (stats_output != nullptr) { |
713 | 26.7k | total_stats.bytes_read_from_local += group_stats.bytes_read_from_local; |
714 | 26.7k | total_stats.bytes_read_from_remote += group_stats.bytes_read_from_remote; |
715 | 26.7k | total_stats.cached_bytes_total += group_stats.cached_bytes_total; |
716 | 26.7k | total_stats.cloud_local_read_time += group_stats.cloud_local_read_time; |
717 | 26.7k | total_stats.cloud_remote_read_time += group_stats.cloud_remote_read_time; |
718 | 26.7k | if (is_key) { |
719 | 10.1k | total_stats.output_rows = group_stats.output_rows; |
720 | 10.1k | total_stats.merged_rows = group_stats.merged_rows; |
721 | 10.1k | total_stats.filtered_rows = group_stats.filtered_rows; |
722 | 10.1k | total_stats.rowid_conversion = group_stats.rowid_conversion; |
723 | 10.1k | } |
724 | 26.7k | } |
725 | 26.7k | if (progress_cb) { |
726 | 26.4k | progress_cb(column_groups.size(), i + 1); |
727 | 26.4k | } |
728 | 26.7k | if (is_key) { |
729 | 10.2k | RETURN_IF_ERROR(row_sources_buf.flush()); |
730 | 10.2k | } |
731 | 26.7k | RETURN_IF_ERROR(row_sources_buf.seek_to_begin()); |
732 | 26.7k | } |
733 | | |
734 | | // Calculate and store density for next compaction's sparse optimization threshold |
735 | | // density = (total_cells - total_null_count) / total_cells |
736 | | // Smaller density means more sparse |
737 | 10.1k | { |
738 | 10.1k | std::unique_lock<std::mutex> lock(sample_info_lock); |
739 | 10.1k | int64_t total_null_count = 0; |
740 | 26.8k | for (const auto& info : sample_infos) { |
741 | 26.8k | total_null_count += info.null_count; |
742 | 26.8k | } |
743 | 10.1k | int64_t total_cells = total_rows * tablet_schema.num_columns(); |
744 | 10.1k | if (total_cells > 0) { |
745 | 7.39k | double density = static_cast<double>(total_cells - total_null_count) / |
746 | 7.39k | static_cast<double>(total_cells); |
747 | 7.39k | tablet->compaction_density.store(density); |
748 | 7.39k | LOG(INFO) << "Vertical compaction density update: tablet_id=" << tablet->tablet_id() |
749 | 7.39k | << ", total_cells=" << total_cells |
750 | 7.39k | << ", total_null_count=" << total_null_count << ", density=" << density; |
751 | 7.39k | } |
752 | 10.1k | } |
753 | | |
754 | | // finish compact, build output rowset |
755 | 10.1k | VLOG_NOTICE << "finish compact groups"; |
756 | 10.1k | RETURN_IF_ERROR(dst_rowset_writer->final_flush()); |
757 | | |
758 | 10.2k | if (stats_output != nullptr) { |
759 | 10.2k | *stats_output = total_stats; |
760 | 10.2k | } |
761 | | |
762 | 10.1k | return Status::OK(); |
763 | 10.1k | } |
764 | | } // namespace doris |