/root/doris/be/src/olap/compaction.cpp
Line | Count | Source |
1 | | // Licensed to the Apache Software Foundation (ASF) under one |
2 | | // or more contributor license agreements. See the NOTICE file |
3 | | // distributed with this work for additional information |
4 | | // regarding copyright ownership. The ASF licenses this file |
5 | | // to you under the Apache License, Version 2.0 (the |
6 | | // "License"); you may not use this file except in compliance |
7 | | // with the License. You may obtain a copy of the License at |
8 | | // |
9 | | // http://www.apache.org/licenses/LICENSE-2.0 |
10 | | // |
11 | | // Unless required by applicable law or agreed to in writing, |
12 | | // software distributed under the License is distributed on an |
13 | | // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY |
14 | | // KIND, either express or implied. See the License for the |
15 | | // specific language governing permissions and limitations |
16 | | // under the License. |
17 | | |
18 | | #include "olap/compaction.h" |
19 | | |
20 | | #include <fmt/format.h> |
21 | | #include <gen_cpp/olap_file.pb.h> |
22 | | #include <glog/logging.h> |
23 | | |
24 | | #include <algorithm> |
25 | | #include <cstdlib> |
26 | | #include <list> |
27 | | #include <map> |
28 | | #include <memory> |
29 | | #include <mutex> |
30 | | #include <nlohmann/json.hpp> |
31 | | #include <numeric> |
32 | | #include <ostream> |
33 | | #include <set> |
34 | | #include <shared_mutex> |
35 | | #include <utility> |
36 | | |
37 | | #include "common/config.h" |
38 | | #include "common/status.h" |
39 | | #include "io/fs/file_system.h" |
40 | | #include "io/fs/file_writer.h" |
41 | | #include "io/fs/remote_file_system.h" |
42 | | #include "olap/cumulative_compaction_policy.h" |
43 | | #include "olap/cumulative_compaction_time_series_policy.h" |
44 | | #include "olap/data_dir.h" |
45 | | #include "olap/olap_common.h" |
46 | | #include "olap/olap_define.h" |
47 | | #include "olap/rowset/beta_rowset.h" |
48 | | #include "olap/rowset/rowset.h" |
49 | | #include "olap/rowset/rowset_meta.h" |
50 | | #include "olap/rowset/rowset_writer.h" |
51 | | #include "olap/rowset/rowset_writer_context.h" |
52 | | #include "olap/rowset/segment_v2/inverted_index_compaction.h" |
53 | | #include "olap/rowset/segment_v2/inverted_index_file_reader.h" |
54 | | #include "olap/rowset/segment_v2/inverted_index_file_writer.h" |
55 | | #include "olap/rowset/segment_v2/inverted_index_fs_directory.h" |
56 | | #include "olap/storage_engine.h" |
57 | | #include "olap/storage_policy.h" |
58 | | #include "olap/tablet.h" |
59 | | #include "olap/tablet_meta.h" |
60 | | #include "olap/tablet_meta_manager.h" |
61 | | #include "olap/task/engine_checksum_task.h" |
62 | | #include "olap/txn_manager.h" |
63 | | #include "olap/utils.h" |
64 | | #include "runtime/memory/mem_tracker_limiter.h" |
65 | | #include "runtime/thread_context.h" |
66 | | #include "util/time.h" |
67 | | #include "util/trace.h" |
68 | | |
69 | | using std::vector; |
70 | | |
71 | | namespace doris { |
72 | | using namespace ErrorCode; |
73 | | |
74 | | Compaction::Compaction(const TabletSharedPtr& tablet, const std::string& label) |
75 | | : _tablet(tablet), |
76 | | _input_rowsets_size(0), |
77 | | _input_row_num(0), |
78 | | _input_num_segments(0), |
79 | | _input_index_size(0), |
80 | 19 | _state(CompactionState::INITED) { |
81 | 19 | _mem_tracker = MemTrackerLimiter::create_shared(MemTrackerLimiter::Type::COMPACTION, label); |
82 | 19 | init_profile(label); |
83 | 19 | } |
84 | | |
85 | 19 | Compaction::~Compaction() { |
86 | 19 | SCOPED_SWITCH_THREAD_MEM_TRACKER_LIMITER(_mem_tracker); |
87 | 19 | _output_rs_writer.reset(); |
88 | 19 | _tablet.reset(); |
89 | 19 | _input_rowsets.clear(); |
90 | 19 | _output_rowset.reset(); |
91 | 19 | _cur_tablet_schema.reset(); |
92 | 19 | } |
93 | | |
94 | 19 | void Compaction::init_profile(const std::string& label) { |
95 | 19 | _profile = std::make_unique<RuntimeProfile>(label); |
96 | | |
97 | 19 | _input_rowsets_data_size_counter = |
98 | 19 | ADD_COUNTER(_profile, "input_rowsets_data_size", TUnit::BYTES); |
99 | 19 | _input_rowsets_counter = ADD_COUNTER(_profile, "input_rowsets_count", TUnit::UNIT); |
100 | 19 | _input_row_num_counter = ADD_COUNTER(_profile, "input_row_num", TUnit::UNIT); |
101 | 19 | _input_segments_num_counter = ADD_COUNTER(_profile, "input_segments_num", TUnit::UNIT); |
102 | 19 | _merged_rows_counter = ADD_COUNTER(_profile, "merged_rows", TUnit::UNIT); |
103 | 19 | _filtered_rows_counter = ADD_COUNTER(_profile, "filtered_rows", TUnit::UNIT); |
104 | 19 | _output_rowset_data_size_counter = |
105 | 19 | ADD_COUNTER(_profile, "output_rowset_data_size", TUnit::BYTES); |
106 | 19 | _output_row_num_counter = ADD_COUNTER(_profile, "output_row_num", TUnit::UNIT); |
107 | 19 | _output_segments_num_counter = ADD_COUNTER(_profile, "output_segments_num", TUnit::UNIT); |
108 | 19 | _merge_rowsets_latency_timer = ADD_TIMER(_profile, "merge_rowsets_latency"); |
109 | 19 | } |
110 | | |
111 | 0 | Status Compaction::compact() { |
112 | 0 | RETURN_IF_ERROR(prepare_compact()); |
113 | 0 | RETURN_IF_ERROR(execute_compact()); |
114 | 0 | return Status::OK(); |
115 | 0 | } |
116 | | |
117 | 0 | Status Compaction::execute_compact() { |
118 | 0 | Status st = execute_compact_impl(); |
119 | 0 | if (!st.ok()) { |
120 | 0 | gc_output_rowset(); |
121 | 0 | } |
122 | 0 | return st; |
123 | 0 | } |
124 | | |
125 | 0 | Status Compaction::do_compaction(int64_t permits) { |
126 | 0 | uint32_t checksum_before; |
127 | 0 | uint32_t checksum_after; |
128 | 0 | if (config::enable_compaction_checksum) { |
129 | 0 | EngineChecksumTask checksum_task(_tablet->tablet_id(), _tablet->schema_hash(), |
130 | 0 | _input_rowsets.back()->end_version(), &checksum_before); |
131 | 0 | RETURN_IF_ERROR(checksum_task.execute()); |
132 | 0 | } |
133 | | |
134 | 0 | _tablet->data_dir()->disks_compaction_score_increment(permits); |
135 | 0 | _tablet->data_dir()->disks_compaction_num_increment(1); |
136 | 0 | Status st = do_compaction_impl(permits); |
137 | 0 | _tablet->data_dir()->disks_compaction_score_increment(-permits); |
138 | 0 | _tablet->data_dir()->disks_compaction_num_increment(-1); |
139 | |
140 | 0 | if (config::enable_compaction_checksum) { |
141 | 0 | EngineChecksumTask checksum_task(_tablet->tablet_id(), _tablet->schema_hash(), |
142 | 0 | _input_rowsets.back()->end_version(), &checksum_after); |
143 | 0 | RETURN_IF_ERROR(checksum_task.execute()); |
144 | 0 | if (checksum_before != checksum_after) { |
145 | 0 | LOG(WARNING) << "Compaction tablet=" << _tablet->tablet_id() |
146 | 0 | << " checksum not consistent" |
147 | 0 | << ", before=" << checksum_before << ", after=" << checksum_after; |
148 | 0 | } |
149 | 0 | } |
150 | 0 | if (st.ok()) { |
151 | 0 | _load_segment_to_cache(); |
152 | 0 | } |
153 | 0 | return st; |
154 | 0 | } |
155 | | |
156 | 0 | bool Compaction::should_vertical_compaction() { |
157 | | // some conditions under which vertical compaction is not used |
158 | 0 | if (!config::enable_vertical_compaction) { |
159 | 0 | return false; |
160 | 0 | } |
161 | 0 | return true; |
162 | 0 | } |
163 | | |
164 | 0 | int64_t Compaction::get_avg_segment_rows() { |
165 | | // take care of empty rowsets |
166 | | // _input_rowsets_size is the total disk_size of the input rowsets; it is the |
167 | | // final size after encoding and compression, so the expected destination |
168 | | // segment file size on disk is config::vertical_compaction_max_segment_size |
169 | 0 | const auto& meta = _tablet->tablet_meta(); |
170 | 0 | if (meta->compaction_policy() == CUMULATIVE_TIME_SERIES_POLICY) { |
171 | 0 | int64_t compaction_goal_size_mbytes = meta->time_series_compaction_goal_size_mbytes(); |
172 | 0 | return (compaction_goal_size_mbytes * 1024 * 1024 * 2) / |
173 | 0 | (_input_rowsets_size / (_input_row_num + 1) + 1); |
174 | 0 | } |
175 | 0 | return config::vertical_compaction_max_segment_size / |
176 | 0 | (_input_rowsets_size / (_input_row_num + 1) + 1); |
177 | 0 | } |
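 | |
 | | // Worked example of the formula above (assumed numbers, for illustration only): |
 | | // with _input_rowsets_size = 536870912 (512 MB) and _input_row_num = 4194304, |
 | | // the average on-disk row size is 536870912 / 4194305 + 1 = 128 bytes, so with |
 | | // vertical_compaction_max_segment_size = 268435456 (256 MB) this returns |
 | | // 268435456 / 128 = 2097152 rows per output segment. The "+ 1" terms only |
 | | // guard against division by zero on empty input. |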
178 | | |
179 | 5 | bool Compaction::is_rowset_tidy(std::string& pre_max_key, const RowsetSharedPtr& rhs) { |
180 | 5 | size_t min_tidy_size = config::ordered_data_compaction_min_segment_size; |
181 | 5 | if (rhs->num_segments() == 0) { |
182 | 0 | return true; |
183 | 0 | } |
184 | 5 | if (rhs->is_segments_overlapping()) { |
185 | 0 | return false; |
186 | 0 | } |
187 | | // check segment size |
188 | 5 | auto beta_rowset = reinterpret_cast<BetaRowset*>(rhs.get()); |
189 | 5 | std::vector<size_t> segments_size; |
190 | 5 | RETURN_FALSE_IF_ERROR(beta_rowset->get_segments_size(&segments_size)); |
191 | 10 | for (auto segment_size : segments_size) { |
192 | | // if a segment is too small, compaction is needed |
193 | 10 | if (segment_size < min_tidy_size) { |
194 | 0 | return false; |
195 | 0 | } |
196 | 10 | } |
197 | 5 | std::string min_key; |
198 | 5 | auto ret = rhs->first_key(&min_key); |
199 | 5 | if (!ret) { |
200 | 0 | return false; |
201 | 0 | } |
202 | 5 | if (min_key <= pre_max_key) { |
203 | 0 | return false; |
204 | 0 | } |
205 | 5 | CHECK(rhs->last_key(&pre_max_key)); |
206 | | |
207 | 5 | return true; |
208 | 5 | } |
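 | |
 | | // A minimal sketch (hypothetical keys, for illustration) of how pre_max_key |
 | | // threads the ordering check across successive calls: |
 | | // |
 | | //   std::string pre_max_key;          // starts empty |
 | | //   is_rowset_tidy(pre_max_key, rs1); // keys "a".."f" -> true,  pre_max_key = "f" |
 | | //   is_rowset_tidy(pre_max_key, rs2); // keys "g".."k" -> true,  pre_max_key = "k" |
 | | //   is_rowset_tidy(pre_max_key, rs3); // keys "c".."m" -> false, since "c" <= "k" |
 | | // |
 | | // Linking segments in input order is only correct when every rowset's min key |
 | | // is strictly greater than the previous rowset's max key, so any key-range |
 | | // overlap breaks the chain. |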
209 | | |
210 | 1 | Status Compaction::do_compact_ordered_rowsets() { |
211 | 1 | build_basic_info(); |
212 | 1 | RowsetWriterContext ctx; |
213 | 1 | RETURN_IF_ERROR(construct_output_rowset_writer(ctx)); |
214 | | |
215 | 1 | LOG(INFO) << "start to do ordered data compaction, tablet=" << _tablet->tablet_id() |
216 | 1 | << ", output_version=" << _output_version; |
217 | | // link data to new rowset |
218 | 1 | auto seg_id = 0; |
219 | 1 | std::vector<KeyBoundsPB> segment_key_bounds; |
220 | 5 | for (auto rowset : _input_rowsets) { |
221 | 5 | RETURN_IF_ERROR(rowset->link_files_to(_tablet->tablet_path(), |
222 | 5 | _output_rs_writer->rowset_id(), seg_id)); |
223 | 5 | seg_id += rowset->num_segments(); |
224 | | |
225 | 5 | std::vector<KeyBoundsPB> key_bounds; |
226 | 5 | RETURN_IF_ERROR(rowset->get_segments_key_bounds(&key_bounds)); |
227 | 5 | segment_key_bounds.insert(segment_key_bounds.end(), key_bounds.begin(), key_bounds.end()); |
228 | 5 | } |
229 | | // build output rowset |
230 | 1 | RowsetMetaSharedPtr rowset_meta = std::make_shared<RowsetMeta>(); |
231 | 1 | rowset_meta->set_num_rows(_input_row_num); |
232 | 1 | rowset_meta->set_total_disk_size(_input_rowsets_size); |
233 | 1 | rowset_meta->set_data_disk_size(_input_rowsets_size); |
234 | 1 | rowset_meta->set_index_disk_size(_input_index_size); |
235 | 1 | rowset_meta->set_empty(_input_row_num == 0); |
236 | 1 | rowset_meta->set_num_segments(_input_num_segments); |
237 | 1 | rowset_meta->set_segments_overlap(NONOVERLAPPING); |
238 | 1 | rowset_meta->set_rowset_state(VISIBLE); |
239 | | |
240 | 1 | rowset_meta->set_segments_key_bounds(segment_key_bounds); |
241 | 1 | _output_rowset = _output_rs_writer->manual_build(rowset_meta); |
242 | 1 | return Status::OK(); |
243 | 1 | } |
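 | |
 | | // Note that ordered compaction never rewrites data: each input segment is |
 | | // hard-linked under the output rowset id with a shifted segment id (e.g., a |
 | | // hypothetical input of a 3-segment rowset followed by a 2-segment rowset |
 | | // becomes output segments 0..4), and the output meta is stitched together |
 | | // from the input sizes, row counts, and key bounds collected above. |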
244 | | |
245 | 3 | void Compaction::build_basic_info() { |
246 | 11 | for (auto& rowset : _input_rowsets) { |
247 | 11 | _input_rowsets_size += rowset->data_disk_size(); |
248 | 11 | _input_index_size += rowset->index_disk_size(); |
249 | 11 | _input_row_num += rowset->num_rows(); |
250 | 11 | _input_num_segments += rowset->num_segments(); |
251 | 11 | } |
252 | 3 | COUNTER_UPDATE(_input_rowsets_data_size_counter, _input_rowsets_size); |
253 | 3 | COUNTER_UPDATE(_input_row_num_counter, _input_row_num); |
254 | 3 | COUNTER_UPDATE(_input_segments_num_counter, _input_num_segments); |
255 | | |
256 | 3 | _output_version = |
257 | 3 | Version(_input_rowsets.front()->start_version(), _input_rowsets.back()->end_version()); |
258 | | |
259 | 3 | _newest_write_timestamp = _input_rowsets.back()->newest_write_timestamp(); |
260 | | |
261 | 3 | std::vector<RowsetMetaSharedPtr> rowset_metas(_input_rowsets.size()); |
262 | 3 | std::transform(_input_rowsets.begin(), _input_rowsets.end(), rowset_metas.begin(), |
263 | 11 | [](const RowsetSharedPtr& rowset) { return rowset->rowset_meta(); }); |
264 | 3 | _cur_tablet_schema = _tablet->tablet_schema_with_merged_max_schema_version(rowset_metas); |
265 | 3 | } |
266 | | |
267 | 1 | bool Compaction::handle_ordered_data_compaction() { |
268 | 1 | if (!config::enable_ordered_data_compaction) { |
269 | 0 | return false; |
270 | 0 | } |
271 | 1 | if (compaction_type() == ReaderType::READER_COLD_DATA_COMPACTION) { |
272 | | // The remote file system does not support linking files. |
273 | 0 | return false; |
274 | 0 | } |
275 | 1 | if (_tablet->keys_type() == KeysType::UNIQUE_KEYS && |
276 | 1 | _tablet->enable_unique_key_merge_on_write()) { |
277 | 0 | return false; |
278 | 0 | } |
279 | | |
280 | 1 | if (_tablet->tablet_meta()->tablet_schema()->skip_write_index_on_load()) { |
281 | | // The index is expected to be created through normal compaction |
282 | 0 | return false; |
283 | 0 | } |
284 | | |
285 | | // check delete version: if this is a base compaction and any input rowset |
286 | | // carries a delete predicate, fall back to the original compaction path |
287 | 1 | if (compaction_type() == ReaderType::READER_BASE_COMPACTION) { |
288 | 0 | for (auto& rowset : _input_rowsets) { |
289 | 0 | if (rowset->rowset_meta()->has_delete_predicate()) { |
290 | 0 | return false; |
291 | 0 | } |
292 | 0 | } |
293 | 0 | } |
294 | | |
295 | | // check if rowsets are tidy so we can handle the compaction by just |
296 | | // modifying the meta and linking the files |
297 | 1 | auto input_size = _input_rowsets.size(); |
298 | 1 | std::string pre_max_key; |
299 | 6 | for (auto i = 0; i < input_size; ++i) { |
300 | 5 | if (!is_rowset_tidy(pre_max_key, _input_rowsets[i])) { |
301 | 0 | if (i <= input_size / 2) { |
302 | 0 | return false; |
303 | 0 | } else { |
304 | 0 | _input_rowsets.resize(i); |
305 | 0 | break; |
306 | 0 | } |
307 | 0 | } |
308 | 5 | } |
309 | | // most rowsets of the current compaction are nonoverlapping, |
310 | | // so just handle the nonoverlapping rowsets |
311 | 1 | auto st = do_compact_ordered_rowsets(); |
312 | 1 | if (!st.ok()) { |
313 | 0 | LOG(WARNING) << "failed to compact ordered rowsets: " << st; |
314 | 0 | _pending_rs_guard.drop(); |
315 | 0 | } |
316 | | |
317 | 1 | return st.ok(); |
318 | 1 | } |
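 | |
 | | // Example of the truncation rule above (hypothetical input, for illustration): |
 | | // with 10 input rowsets of which only the first 7 are tidy, the check fails at |
 | | // i == 7 > 10 / 2, so _input_rowsets is resized to the 7-rowset tidy prefix and |
 | | // those are link-compacted. Had only the first 4 been tidy (4 <= 5), ordered |
 | | // compaction would be abandoned and normal compaction would run on all 10. |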
319 | | |
320 | 0 | int64_t Compaction::merge_way_num() { |
321 | 0 | int64_t way_num = 0; |
322 | 0 | for (auto&& rowset : _input_rowsets) { |
323 | 0 | way_num += rowset->rowset_meta()->get_merge_way_num(); |
324 | 0 | } |
325 | |
326 | 0 | return way_num; |
327 | 0 | } |
328 | | |
329 | 0 | Status Compaction::do_compaction_impl(int64_t permits) { |
330 | 0 | OlapStopWatch watch; |
331 | |
332 | 0 | if (handle_ordered_data_compaction()) { |
333 | 0 | RETURN_IF_ERROR(modify_rowsets()); |
334 | | |
335 | 0 | int64_t now = UnixMillis(); |
336 | 0 | if (compaction_type() == ReaderType::READER_CUMULATIVE_COMPACTION) { |
337 | | // Under TIME_SERIES_POLICY, generating an empty rowset doesn't need to update the timestamp. |
338 | 0 | if (!(_tablet->tablet_meta()->compaction_policy() == CUMULATIVE_TIME_SERIES_POLICY && |
339 | 0 | _output_rowset->num_segments() == 0)) { |
340 | 0 | _tablet->set_last_cumu_compaction_success_time(now); |
341 | 0 | } |
342 | 0 | } else if (compaction_type() == ReaderType::READER_BASE_COMPACTION) { |
343 | 0 | _tablet->set_last_base_compaction_success_time(now); |
344 | 0 | } else if (compaction_type() == ReaderType::READER_FULL_COMPACTION) { |
345 | 0 | _tablet->set_last_full_compaction_success_time(now); |
346 | 0 | } |
347 | 0 | auto cumu_policy = _tablet->cumulative_compaction_policy(); |
348 | 0 | LOG(INFO) << "succeed to do ordered data " << compaction_name() |
349 | 0 | << ". tablet=" << _tablet->tablet_id() << ", output_version=" << _output_version |
350 | 0 | << ", disk=" << _tablet->data_dir()->path() |
351 | 0 | << ", segments=" << _input_num_segments << ", input_row_num=" << _input_row_num |
352 | 0 | << ", output_row_num=" << _output_rowset->num_rows() |
353 | 0 | << ", input_rowset_size=" << _input_rowsets_size |
354 | 0 | << ", output_rowset_size=" << _output_rowset->data_disk_size() |
355 | 0 | << ". elapsed time=" << watch.get_elapse_second() |
356 | 0 | << "s. cumulative_compaction_policy=" |
357 | 0 | << (cumu_policy == nullptr ? "quick" : cumu_policy->name()); |
358 | 0 | return Status::OK(); |
359 | 0 | } |
360 | 0 | build_basic_info(); |
361 | |
362 | 0 | VLOG_DEBUG << "dump tablet schema: " << _cur_tablet_schema->dump_structure(); |
363 | |
364 | 0 | LOG(INFO) << "start " << compaction_name() << ". tablet=" << _tablet->tablet_id() |
365 | 0 | << ", output_version=" << _output_version << ", permits: " << permits; |
366 | 0 | bool vertical_compaction = should_vertical_compaction(); |
367 | 0 | RowsetWriterContext ctx; |
368 | 0 | RETURN_IF_ERROR(construct_input_rowset_readers()); |
369 | 0 | RETURN_IF_ERROR(construct_output_rowset_writer(ctx, vertical_compaction)); |
370 | | |
371 | | // 2. write merged rows to output rowset |
372 | | // Test results show that the merger has a low memory footprint, so there is no need to track its mem pool |
373 | 0 | Merger::Statistics stats; |
374 | | // If ctx.columns_to_do_index_compaction is not empty, we need to do inverted index compaction, |
375 | | // and the row ID conversion matrix is needed for it. |
376 | 0 | if (!ctx.columns_to_do_index_compaction.empty() || |
377 | 0 | (_tablet->keys_type() == KeysType::UNIQUE_KEYS && |
378 | 0 | _tablet->enable_unique_key_merge_on_write())) { |
379 | 0 | stats.rowid_conversion = &_rowid_conversion; |
380 | 0 | } |
381 | 0 | int64_t way_num = merge_way_num(); |
382 | |
383 | 0 | Status res; |
384 | 0 | { |
385 | 0 | SCOPED_TIMER(_merge_rowsets_latency_timer); |
386 | 0 | if (vertical_compaction) { |
387 | 0 | res = Merger::vertical_merge_rowsets(_tablet, compaction_type(), _cur_tablet_schema, |
388 | 0 | _input_rs_readers, _output_rs_writer.get(), |
389 | 0 | get_avg_segment_rows(), way_num, &stats); |
390 | 0 | } else { |
391 | 0 | res = Merger::vmerge_rowsets(_tablet, compaction_type(), _cur_tablet_schema, |
392 | 0 | _input_rs_readers, _output_rs_writer.get(), &stats); |
393 | 0 | } |
394 | 0 | } |
395 | |
396 | 0 | _tablet->last_compaction_status = res; |
397 | |
398 | 0 | if (!res.ok()) { |
399 | 0 | LOG(WARNING) << "fail to do " << compaction_name() << ". res=" << res |
400 | 0 | << ", tablet=" << _tablet->tablet_id() |
401 | 0 | << ", output_version=" << _output_version; |
402 | 0 | return res; |
403 | 0 | } |
404 | 0 | COUNTER_UPDATE(_merged_rows_counter, stats.merged_rows); |
405 | 0 | COUNTER_UPDATE(_filtered_rows_counter, stats.filtered_rows); |
406 | |
407 | 0 | RETURN_NOT_OK_STATUS_WITH_WARN(_output_rs_writer->build(_output_rowset), |
408 | 0 | fmt::format("rowset writer build failed. output_version: {}", |
409 | 0 | _output_version.to_string())); |
410 | | // Now we support delete in cumu compaction. To make all data in rowsets whose version |
411 | | // is below output_version deletable by a future base compaction, we should carry |
412 | | // all delete predicates in the output rowset. |
413 | | // An output start version > 2 means we must set the delete predicate in the output rowset |
414 | 0 | if (allow_delete_in_cumu_compaction() && _output_rowset->version().first > 2) { |
415 | 0 | DeletePredicatePB delete_predicate; |
416 | 0 | std::accumulate( |
417 | 0 | _input_rs_readers.begin(), _input_rs_readers.end(), &delete_predicate, |
418 | 0 | [](DeletePredicatePB* delete_predicate, const RowsetReaderSharedPtr& reader) { |
419 | 0 | if (reader->rowset()->rowset_meta()->has_delete_predicate()) { |
420 | 0 | delete_predicate->MergeFrom( |
421 | 0 | reader->rowset()->rowset_meta()->delete_predicate()); |
422 | 0 | } |
423 | 0 | return delete_predicate; |
424 | 0 | }); |
425 | | // now version in delete_predicate is deprecated |
426 | 0 | if (!delete_predicate.in_predicates().empty() || |
427 | 0 | !delete_predicate.sub_predicates_v2().empty() || |
428 | 0 | !delete_predicate.sub_predicates().empty()) { |
429 | 0 | _output_rowset->rowset_meta()->set_delete_predicate(std::move(delete_predicate)); |
430 | 0 | } |
431 | 0 | } |
432 | |
433 | 0 | COUNTER_UPDATE(_output_rowset_data_size_counter, _output_rowset->data_disk_size()); |
434 | 0 | COUNTER_UPDATE(_output_row_num_counter, _output_rowset->num_rows()); |
435 | 0 | COUNTER_UPDATE(_output_segments_num_counter, _output_rowset->num_segments()); |
436 | | |
437 | | // 3. check correctness |
438 | 0 | RETURN_IF_ERROR(check_correctness(stats)); |
439 | | |
440 | 0 | if (_input_row_num > 0 && stats.rowid_conversion && config::inverted_index_compaction_enable && |
441 | 0 | !ctx.columns_to_do_index_compaction.empty()) { |
442 | 0 | OlapStopWatch inverted_watch; |
443 | | |
444 | | // translation vec |
445 | | // <<dest_idx_num, dest_docId>> |
446 | | // the first level vector: index indicates src segment. |
447 | | // the second level vector: index indicates row id of source segment, |
448 | | // value indicates row id of destination segment. |
449 | | // <UINT32_MAX, UINT32_MAX> indicates the row no longer exists. |
450 | 0 | std::vector<std::vector<std::pair<uint32_t, uint32_t>>> trans_vec = |
451 | 0 | stats.rowid_conversion->get_rowid_conversion_map(); |
452 | | |
453 | | // source rowset,segment -> index_id |
454 | 0 | std::map<std::pair<RowsetId, uint32_t>, uint32_t> src_seg_to_id_map = |
455 | 0 | stats.rowid_conversion->get_src_segment_to_id_map(); |
456 | | // dest rowset id |
457 | 0 | RowsetId dest_rowset_id = stats.rowid_conversion->get_dst_rowset_id(); |
458 | | // dest segment id -> num rows |
459 | 0 | std::vector<uint32_t> dest_segment_num_rows; |
460 | 0 | RETURN_IF_ERROR(_output_rs_writer->get_segment_num_rows(&dest_segment_num_rows)); |
461 | | |
462 | 0 | auto src_segment_num = src_seg_to_id_map.size(); |
463 | 0 | auto dest_segment_num = dest_segment_num_rows.size(); |
464 | |
465 | 0 | if (dest_segment_num > 0) { |
466 | | // src index files |
467 | | // format: rowsetId_segmentId |
468 | 0 | std::vector<std::string> src_index_files(src_segment_num); |
469 | 0 | for (const auto& m : src_seg_to_id_map) { |
470 | 0 | std::pair<RowsetId, uint32_t> p = m.first; |
471 | 0 | src_index_files[m.second] = p.first.to_string() + "_" + std::to_string(p.second); |
472 | 0 | } |
473 | | |
474 | | // dest index files |
475 | | // format: rowsetId_segmentId |
476 | 0 | std::vector<std::string> dest_index_files(dest_segment_num); |
477 | 0 | for (int i = 0; i < dest_segment_num; ++i) { |
478 | 0 | auto prefix = dest_rowset_id.to_string() + "_" + std::to_string(i); |
479 | 0 | dest_index_files[i] = prefix; |
480 | 0 | } |
481 | | |
482 | | // Only write info files when debug index compaction is enabled. |
483 | | // The files are used to debug index compaction and work with index_tool. |
484 | 0 | if (config::debug_inverted_index_compaction) { |
485 | 0 | auto write_json_to_file = [&](const nlohmann::json& json_obj, |
486 | 0 | const std::string& file_name) { |
487 | 0 | io::FileWriterPtr file_writer; |
488 | 0 | std::string file_path = |
489 | 0 | fmt::format("{}/{}.json", std::string(getenv("LOG_DIR")), file_name); |
490 | 0 | RETURN_IF_ERROR( |
491 | 0 | io::global_local_filesystem()->create_file(file_path, &file_writer)); |
492 | 0 | RETURN_IF_ERROR(file_writer->append(json_obj.dump())); |
493 | 0 | RETURN_IF_ERROR(file_writer->append("\n")); |
494 | 0 | return file_writer->close(); |
495 | 0 | }; |
496 | | |
497 | | // Convert trans_vec to JSON and write it to a file |
498 | 0 | nlohmann::json trans_vec_json = trans_vec; |
499 | 0 | auto output_version = _output_version.to_string().substr( |
500 | 0 | 1, _output_version.to_string().size() - 2); |
501 | 0 | RETURN_IF_ERROR(write_json_to_file( |
502 | 0 | trans_vec_json, |
503 | 0 | fmt::format("trans_vec_{}_{}", _tablet->tablet_id(), output_version))); |
504 | | |
505 | 0 | nlohmann::json src_index_files_json = src_index_files; |
506 | 0 | RETURN_IF_ERROR(write_json_to_file( |
507 | 0 | src_index_files_json, |
508 | 0 | fmt::format("src_idx_dirs_{}_{}", _tablet->tablet_id(), output_version))); |
509 | | |
510 | 0 | nlohmann::json dest_index_files_json = dest_index_files; |
511 | 0 | RETURN_IF_ERROR(write_json_to_file( |
512 | 0 | dest_index_files_json, |
513 | 0 | fmt::format("dest_idx_dirs_{}_{}", _tablet->tablet_id(), output_version))); |
514 | | |
515 | 0 | nlohmann::json dest_segment_num_rows_json = dest_segment_num_rows; |
516 | 0 | RETURN_IF_ERROR( |
517 | 0 | write_json_to_file(dest_segment_num_rows_json, |
518 | 0 | fmt::format("dest_seg_num_rows_{}_{}", |
519 | 0 | _tablet->tablet_id(), output_version))); |
520 | 0 | } |
521 | | |
522 | | // create index writers to compact the indexes |
523 | 0 | const auto& fs = _output_rowset->rowset_meta()->fs(); |
524 | 0 | const auto& tablet_path = _tablet->tablet_path(); |
525 | | |
526 | | // src index dirs |
527 | | // format: rowsetId_segmentId |
528 | 0 | std::vector<std::unique_ptr<InvertedIndexFileReader>> inverted_index_file_readers( |
529 | 0 | src_segment_num); |
530 | 0 | for (const auto& m : src_seg_to_id_map) { |
531 | 0 | std::pair<RowsetId, uint32_t> p = m.first; |
532 | 0 | auto segment_file_name = |
533 | 0 | p.first.to_string() + "_" + std::to_string(p.second) + ".dat"; |
534 | 0 | auto inverted_index_file_reader = std::make_unique<InvertedIndexFileReader>( |
535 | 0 | fs, tablet_path, segment_file_name, |
536 | 0 | _cur_tablet_schema->get_inverted_index_storage_format()); |
537 | 0 | bool open_idx_file_cache = false; |
538 | 0 | auto st = inverted_index_file_reader->init(config::inverted_index_read_buffer_size, |
539 | 0 | open_idx_file_cache); |
540 | 0 | if (!st.ok()) { |
541 | 0 | LOG(ERROR) << "init inverted index " |
542 | 0 | << InvertedIndexDescriptor::get_index_file_name(segment_file_name) |
543 | 0 | << " failed in compaction when initializing the inverted index file reader"; |
544 | 0 | return st; |
545 | 0 | } |
546 | 0 | inverted_index_file_readers[m.second] = std::move(inverted_index_file_reader); |
547 | 0 | } |
548 | | |
549 | | // dest index files |
550 | | // format: rowsetId_segmentId |
551 | 0 | std::vector<std::unique_ptr<InvertedIndexFileWriter>> inverted_index_file_writers( |
552 | 0 | dest_segment_num); |
553 | | |
554 | | // Some columns have already been indexed |
555 | | // key: seg_id, value: inverted index file size |
556 | 0 | std::unordered_map<int, int64_t> compacted_idx_file_size; |
557 | 0 | for (int seg_id = 0; seg_id < dest_segment_num; ++seg_id) { |
558 | 0 | auto prefix = dest_rowset_id.to_string() + "_" + std::to_string(seg_id) + ".dat"; |
559 | 0 | auto inverted_index_file_reader = std::make_unique<InvertedIndexFileReader>( |
560 | 0 | fs, tablet_path, prefix, |
561 | 0 | _cur_tablet_schema->get_inverted_index_storage_format()); |
562 | 0 | bool open_idx_file_cache = false; |
563 | 0 | auto st = inverted_index_file_reader->init(config::inverted_index_read_buffer_size, |
564 | 0 | open_idx_file_cache); |
565 | 0 | if (st.ok()) { |
566 | 0 | auto index_not_need_to_compact = |
567 | 0 | DORIS_TRY(inverted_index_file_reader->get_all_directories()); |
568 | | // V1: each index is a separate file |
569 | | // V2: all indexes are in a single file |
570 | 0 | if (_cur_tablet_schema->get_inverted_index_storage_format() != |
571 | 0 | doris::InvertedIndexStorageFormatPB::V1) { |
572 | 0 | int64_t fsize = 0; |
573 | 0 | st = fs->file_size(InvertedIndexDescriptor::get_index_file_name(prefix), |
574 | 0 | &fsize); |
575 | 0 | if (!st.ok()) { |
576 | 0 | LOG(ERROR) << "file size error in index compaction, error:" << st.msg(); |
577 | 0 | return st; |
578 | 0 | } |
579 | 0 | compacted_idx_file_size[seg_id] = fsize; |
580 | 0 | } |
581 | 0 | auto inverted_index_file_writer = std::make_unique<InvertedIndexFileWriter>( |
582 | 0 | fs, tablet_path, prefix, |
583 | 0 | _cur_tablet_schema->get_inverted_index_storage_format()); |
584 | 0 | RETURN_NOT_OK_STATUS_WITH_WARN( |
585 | 0 | inverted_index_file_writer->initialize(index_not_need_to_compact), |
586 | 0 | "failed to initialize inverted_index_file_writer for " + |
587 | 0 | inverted_index_file_writer->get_index_file_name()); |
588 | 0 | inverted_index_file_writers[seg_id] = std::move(inverted_index_file_writer); |
589 | 0 | } else if (st.is<ErrorCode::INVERTED_INDEX_FILE_NOT_FOUND>()) { |
590 | 0 | auto inverted_index_file_writer = std::make_unique<InvertedIndexFileWriter>( |
591 | 0 | fs, tablet_path, prefix, |
592 | 0 | _cur_tablet_schema->get_inverted_index_storage_format()); |
593 | 0 | inverted_index_file_writers[seg_id] = std::move(inverted_index_file_writer); |
594 | | // no index file |
595 | 0 | compacted_idx_file_size[seg_id] = 0; |
596 | 0 | } else { |
597 | 0 | LOG(ERROR) << "init inverted index " |
598 | 0 | << InvertedIndexDescriptor::get_index_file_name(prefix) |
599 | 0 | << " failed in compaction when creating the inverted index file writer"; |
600 | 0 | return st; |
601 | 0 | } |
602 | 0 | } |
603 | | |
604 | | // we choose the first destination segment name as the temporary index writer path, |
605 | | // used to distinguish between different index compactions |
606 | 0 | auto index_tmp_path = tablet_path + "/" + dest_rowset_id.to_string() + "_" + "tmp"; |
607 | 0 | LOG(INFO) << "start index compaction" |
608 | 0 | << ". tablet=" << _tablet->tablet_id() |
609 | 0 | << ", source index size=" << src_segment_num |
610 | 0 | << ", destination index size=" << dest_segment_num << "."; |
611 | |
612 | 0 | auto error_handler = [this](int64_t index_id, int64_t column_uniq_id) { |
613 | 0 | LOG(WARNING) << "failed to do index compaction" |
614 | 0 | << ". tablet=" << _tablet->tablet_id() |
615 | 0 | << ". column uniq id=" << column_uniq_id << ". index_id=" << index_id; |
616 | 0 | for (auto& rowset : _input_rowsets) { |
617 | 0 | rowset->set_skip_index_compaction(column_uniq_id); |
618 | 0 | LOG(INFO) << "mark skipping inverted index compaction next time" |
619 | 0 | << ". tablet=" << _tablet->tablet_id() |
620 | 0 | << ", rowset=" << rowset->rowset_id() |
621 | 0 | << ", column uniq id=" << column_uniq_id << ", index_id=" << index_id; |
622 | 0 | } |
623 | 0 | }; |
624 | |
625 | 0 | Status status = Status::OK(); |
626 | 0 | for (auto&& column_uniq_id : ctx.columns_to_do_index_compaction) { |
627 | 0 | auto col = _cur_tablet_schema->column_by_uid(column_uniq_id); |
628 | 0 | const auto* index_meta = _cur_tablet_schema->get_inverted_index(col); |
629 | | |
630 | | // if index properties are different, index compaction may need to be skipped. |
631 | 0 | bool is_continue = false; |
632 | 0 | std::optional<std::map<std::string, std::string>> first_properties; |
633 | 0 | for (const auto& rowset : _input_rowsets) { |
634 | 0 | const auto* tablet_index = rowset->tablet_schema()->get_inverted_index(col); |
635 | | // no inverted index or index id is different from current index id |
636 | 0 | if (tablet_index == nullptr || |
637 | 0 | tablet_index->index_id() != index_meta->index_id()) { |
638 | 0 | error_handler(index_meta->index_id(), column_uniq_id); |
639 | 0 | status = Status::Error<INVERTED_INDEX_COMPACTION_ERROR>( |
640 | 0 | "index ids are different, skip index compaction"); |
641 | 0 | is_continue = true; |
642 | 0 | break; |
643 | 0 | } |
644 | 0 | const auto& properties = tablet_index->properties(); |
645 | 0 | if (!first_properties.has_value()) { |
646 | 0 | first_properties = properties; |
647 | 0 | } else { |
648 | 0 | if (properties != first_properties.value()) { |
649 | 0 | error_handler(index_meta->index_id(), column_uniq_id); |
650 | 0 | status = Status::Error<INVERTED_INDEX_COMPACTION_ERROR>( |
651 | 0 | "if index properties are different, index compaction needs to " |
652 | 0 | "be " |
653 | 0 | "skipped."); |
654 | 0 | is_continue = true; |
655 | 0 | break; |
656 | 0 | } |
657 | 0 | } |
658 | 0 | } |
659 | 0 | if (is_continue) { |
660 | 0 | continue; |
661 | 0 | } |
662 | | |
663 | 0 | std::vector<lucene::store::Directory*> dest_index_dirs(dest_segment_num); |
664 | 0 | try { |
665 | 0 | std::vector<std::unique_ptr<DorisCompoundReader>> src_idx_dirs(src_segment_num); |
666 | 0 | for (int src_segment_id = 0; src_segment_id < src_segment_num; |
667 | 0 | src_segment_id++) { |
668 | 0 | src_idx_dirs[src_segment_id] = DORIS_TRY( |
669 | 0 | inverted_index_file_readers[src_segment_id]->open(index_meta)); |
670 | 0 | } |
671 | 0 | for (int dest_segment_id = 0; dest_segment_id < dest_segment_num; |
672 | 0 | dest_segment_id++) { |
673 | 0 | auto* dest_dir = DORIS_TRY( |
674 | 0 | inverted_index_file_writers[dest_segment_id]->open(index_meta)); |
675 | 0 | dest_index_dirs[dest_segment_id] = dest_dir; |
676 | 0 | } |
677 | 0 | auto st = compact_column(index_meta->index_id(), src_idx_dirs, dest_index_dirs, |
678 | 0 | fs, index_tmp_path, trans_vec, dest_segment_num_rows); |
679 | 0 | if (!st.ok()) { |
680 | 0 | error_handler(index_meta->index_id(), column_uniq_id); |
681 | 0 | status = Status::Error<INVERTED_INDEX_COMPACTION_ERROR>(st.msg()); |
682 | 0 | } |
683 | 0 | } catch (CLuceneError& e) { |
684 | 0 | error_handler(index_meta->index_id(), column_uniq_id); |
685 | 0 | status = Status::Error<INVERTED_INDEX_COMPACTION_ERROR>(e.what()); |
686 | 0 | } |
687 | 0 | } |
688 | 0 | uint64_t inverted_index_file_size = 0; |
689 | 0 | for (int seg_id = 0; seg_id < dest_segment_num; ++seg_id) { |
690 | 0 | auto inverted_index_file_writer = inverted_index_file_writers[seg_id].get(); |
691 | 0 | if (Status st = inverted_index_file_writer->close(); !st.ok()) { |
692 | 0 | status = Status::Error<INVERTED_INDEX_COMPACTION_ERROR>(st.msg()); |
693 | 0 | } else { |
694 | 0 | inverted_index_file_size += inverted_index_file_writer->get_index_file_size(); |
695 | 0 | inverted_index_file_size -= compacted_idx_file_size[seg_id]; |
696 | 0 | } |
697 | 0 | } |
698 | | // check index compaction status. If status is not ok, we should return error and end this compaction round. |
699 | 0 | if (!status.ok()) { |
700 | 0 | return status; |
701 | 0 | } |
702 | | |
703 | | // index compaction should update total disk size and index disk size |
704 | 0 | _output_rowset->rowset_meta()->set_data_disk_size(_output_rowset->data_disk_size() + |
705 | 0 | inverted_index_file_size); |
706 | 0 | _output_rowset->rowset_meta()->set_total_disk_size(_output_rowset->data_disk_size() + |
707 | 0 | inverted_index_file_size); |
708 | 0 | _output_rowset->rowset_meta()->set_index_disk_size(_output_rowset->index_disk_size() + |
709 | 0 | inverted_index_file_size); |
710 | |
|
711 | 0 | COUNTER_UPDATE(_output_rowset_data_size_counter, _output_rowset->data_disk_size()); |
712 | 0 | LOG(INFO) << "succeed to do index compaction" |
713 | 0 | << ". tablet=" << _tablet->tablet_id() |
714 | 0 | << ", input row number=" << _input_row_num |
715 | 0 | << ", output row number=" << _output_rowset->num_rows() |
716 | 0 | << ", input_rowset_size=" << _input_rowsets_size |
717 | 0 | << ", output_rowset_size=" << _output_rowset->data_disk_size() |
718 | 0 | << ", inverted index file size=" << inverted_index_file_size |
719 | 0 | << ". elapsed time=" << inverted_watch.get_elapse_second() << "s."; |
720 | 0 | } else { |
721 | 0 | LOG(INFO) << "skip doing index compaction due to no output segments" |
722 | 0 | << ". tablet=" << _tablet->tablet_id() |
723 | 0 | << ", input row number=" << _input_row_num |
724 | 0 | << ", output row number=" << _output_rowset->num_rows() |
725 | 0 | << ". elapsed time=" << inverted_watch.get_elapse_second() << "s."; |
726 | 0 | } |
727 | 0 | } |
728 | | |
729 | | // 4. modify rowsets in memory |
730 | 0 | RETURN_IF_ERROR(modify_rowsets(&stats)); |
731 | | |
732 | | // 5. update last success compaction time |
733 | 0 | int64_t now = UnixMillis(); |
734 | | // TODO(yingchun): do the judge in Tablet class |
735 | 0 | if (compaction_type() == ReaderType::READER_CUMULATIVE_COMPACTION) { |
736 | | // Under TIME_SERIES_POLICY, generating an empty rowset doesn't need to update the timestamp. |
737 | 0 | if (!(_tablet->tablet_meta()->compaction_policy() == CUMULATIVE_TIME_SERIES_POLICY && |
738 | 0 | _output_rowset->num_segments() == 0)) { |
739 | 0 | _tablet->set_last_cumu_compaction_success_time(now); |
740 | 0 | } |
741 | 0 | } else if (compaction_type() == ReaderType::READER_BASE_COMPACTION) { |
742 | 0 | _tablet->set_last_base_compaction_success_time(now); |
743 | 0 | } else if (compaction_type() == ReaderType::READER_FULL_COMPACTION) { |
744 | 0 | _tablet->set_last_full_compaction_success_time(now); |
745 | 0 | } |
746 | |
747 | 0 | int64_t current_max_version; |
748 | 0 | { |
749 | 0 | std::shared_lock rdlock(_tablet->get_header_lock()); |
750 | 0 | RowsetSharedPtr max_rowset = _tablet->rowset_with_max_version(); |
751 | 0 | if (max_rowset == nullptr) { |
752 | 0 | current_max_version = -1; |
753 | 0 | } else { |
754 | 0 | current_max_version = _tablet->rowset_with_max_version()->end_version(); |
755 | 0 | } |
756 | 0 | } |
757 | |
758 | 0 | auto cumu_policy = _tablet->cumulative_compaction_policy(); |
759 | 0 | DCHECK(cumu_policy); |
760 | 0 | LOG(INFO) << "succeed to do " << compaction_name() << " is_vertical=" << vertical_compaction |
761 | 0 | << ". tablet=" << _tablet->tablet_id() << ", output_version=" << _output_version |
762 | 0 | << ", current_max_version=" << current_max_version |
763 | 0 | << ", disk=" << _tablet->data_dir()->path() << ", segments=" << _input_num_segments |
764 | 0 | << ", input_rowset_size=" << _input_rowsets_size |
765 | 0 | << ", output_rowset_size=" << _output_rowset->data_disk_size() |
766 | 0 | << ", input_row_num=" << _input_row_num |
767 | 0 | << ", output_row_num=" << _output_rowset->num_rows() |
768 | 0 | << ", filtered_row_num=" << stats.filtered_rows |
769 | 0 | << ", merged_row_num=" << stats.merged_rows |
770 | 0 | << ". elapsed time=" << watch.get_elapse_second() |
771 | 0 | << "s. cumulative_compaction_policy=" << cumu_policy->name() |
772 | 0 | << ", compact_row_per_second=" << int(_input_row_num / watch.get_elapse_second()); |
773 | |
774 | 0 | return Status::OK(); |
775 | 0 | } |
776 | | |
777 | 3 | Status Compaction::construct_output_rowset_writer(RowsetWriterContext& ctx, bool is_vertical) { |
778 | 3 | ctx.version = _output_version; |
779 | 3 | ctx.rowset_state = VISIBLE; |
780 | 3 | ctx.segments_overlap = NONOVERLAPPING; |
781 | 3 | ctx.tablet_schema = _cur_tablet_schema; |
782 | 3 | ctx.newest_write_timestamp = _newest_write_timestamp; |
783 | 3 | ctx.write_type = DataWriteType::TYPE_COMPACTION; |
784 | 3 | if (config::inverted_index_compaction_enable && |
785 | 3 | (((_tablet->keys_type() == KeysType::UNIQUE_KEYS && |
786 | 1 | _tablet->enable_unique_key_merge_on_write()) || |
787 | 1 | _tablet->keys_type() == KeysType::DUP_KEYS)) && |
788 | 3 | _cur_tablet_schema->get_inverted_index_storage_format() == |
789 | 1 | InvertedIndexStorageFormatPB::V1) { |
790 | 4 | for (const auto& index : _cur_tablet_schema->indexes()) { |
791 | 4 | if (index.index_type() == IndexType::INVERTED) { |
792 | 4 | auto col_unique_ids = index.col_unique_ids(); |
793 | | // check if the column unique ids are empty to avoid a crash |
794 | 4 | if (col_unique_ids.empty()) { |
795 | 0 | LOG(WARNING) << "tablet[" << _tablet->tablet_id() << "] index[" |
796 | 0 | << index.index_id() |
797 | 0 | << "] has no column unique id, will skip index compaction." |
798 | 0 | << " tablet_schema=" << _cur_tablet_schema->dump_full_schema(); |
799 | 0 | continue; |
800 | 0 | } |
801 | 4 | auto col_unique_id = col_unique_ids[0]; |
802 | | // Avoid doing inverted index compaction on non-slice type columns |
803 | 4 | if (!field_is_slice_type(_cur_tablet_schema->column_by_uid(col_unique_id).type())) { |
804 | 2 | continue; |
805 | 2 | } |
806 | | // NOTE: src_rs may still be building its index, so it may not contain inverted index info. |
807 | 2 | bool all_have_inverted_index = std::all_of( |
808 | 6 | _input_rowsets.begin(), _input_rowsets.end(), [&](const auto& src_rs) { |
809 | 6 | BetaRowsetSharedPtr rowset = |
810 | 6 | std::static_pointer_cast<BetaRowset>(src_rs); |
811 | 6 | if (rowset == nullptr) { |
812 | 0 | LOG(WARNING) << "tablet[" << _tablet->tablet_id() |
813 | 0 | << "] rowset is null, will skip index compaction"; |
814 | 0 | return false; |
815 | 0 | } |
816 | 6 | if (rowset->is_skip_index_compaction(col_unique_id)) { |
817 | 0 | LOG(WARNING) |
818 | 0 | << "tablet[" << _tablet->tablet_id() << "] rowset[" |
819 | 0 | << rowset->rowset_id() << "] column_unique_id[" |
820 | 0 | << col_unique_id |
821 | 0 | << "] skip inverted index compaction due to last failure"; |
822 | 0 | return false; |
823 | 0 | } |
824 | 6 | auto fs = rowset->rowset_meta()->fs(); |
825 | | |
826 | 6 | const auto* index_meta = |
827 | 6 | rowset->tablet_schema()->get_inverted_index(col_unique_id, ""); |
828 | 6 | if (index_meta == nullptr) { |
829 | 0 | LOG(WARNING) << "tablet[" << _tablet->tablet_id() |
830 | 0 | << "] column_unique_id[" << col_unique_id |
831 | 0 | << "] index meta is null, will skip index compaction"; |
832 | 0 | return false; |
833 | 0 | } |
834 | 26 | for (auto i = 0; i < rowset->num_segments(); i++) { |
835 | 20 | auto segment_file = rowset->segment_file_path(i); |
836 | 20 | io::Path segment_path(segment_file); |
837 | 20 | auto inverted_index_file_reader = |
838 | 20 | std::make_unique<InvertedIndexFileReader>( |
839 | 20 | fs, segment_path.parent_path(), |
840 | 20 | segment_path.filename(), |
841 | 20 | _cur_tablet_schema |
842 | 20 | ->get_inverted_index_storage_format()); |
843 | 20 | bool open_idx_file_cache = false; |
844 | 20 | auto st = inverted_index_file_reader->init( |
845 | 20 | config::inverted_index_read_buffer_size, |
846 | 20 | open_idx_file_cache); |
847 | 20 | if (!st.ok()) { |
848 | 0 | LOG(WARNING) << "init index " |
849 | 0 | << inverted_index_file_reader->get_index_file_path( |
850 | 0 | index_meta) |
851 | 0 | << " error:" << st; |
852 | 0 | return false; |
853 | 0 | } |
854 | | |
855 | 20 | bool exists = false; |
856 | 20 | if (!inverted_index_file_reader |
857 | 20 | ->index_file_exist(index_meta, &exists) |
858 | 20 | .ok()) { |
859 | 0 | LOG(ERROR) << inverted_index_file_reader->get_index_file_path( |
860 | 0 | index_meta) |
861 | 0 | << " fs->exists error"; |
862 | 0 | return false; |
863 | 0 | } |
864 | | |
865 | 20 | if (!exists) { |
866 | 0 | LOG(WARNING) << "tablet[" << _tablet->tablet_id() |
867 | 0 | << "] column_unique_id[" << col_unique_id << "]," |
868 | 0 | << inverted_index_file_reader->get_index_file_path( |
869 | 0 | index_meta) |
870 | 0 | << " does not exist, will skip index compaction"; |
871 | 0 | return false; |
872 | 0 | } |
873 | | |
874 | | // check index meta |
875 | 20 | auto result = inverted_index_file_reader->open(index_meta); |
876 | 20 | if (!result.has_value()) { |
877 | 0 | LOG(WARNING) << "open index " |
878 | 0 | << inverted_index_file_reader->get_index_file_path( |
879 | 0 | index_meta) |
880 | 0 | << " error:" << result.error(); |
881 | 0 | return false; |
882 | 0 | } |
883 | 20 | auto reader = std::move(result.value()); |
884 | 20 | std::vector<std::string> files; |
885 | 20 | reader->list(&files); |
886 | 20 | reader->close(); |
887 | | |
888 | | // why 3? |
889 | | // a slice-type index file has at least 3 files: null_bitmap, segments_N, segments.gen |
890 | 20 | if (files.size() < 3) { |
891 | 0 | LOG(WARNING) << "tablet[" << _tablet->tablet_id() |
892 | 0 | << "] column_unique_id[" << col_unique_id << "]," |
893 | 0 | << inverted_index_file_reader->get_index_file_path( |
894 | 0 | index_meta) |
895 | 0 | << " is corrupted, will skip index compaction"; |
896 | 0 | return false; |
897 | 0 | } |
898 | 20 | } |
899 | 6 | return true; |
901 | 6 | }); |
902 | 2 | if (all_have_inverted_index) { |
903 | 2 | ctx.columns_to_do_index_compaction.insert(col_unique_id); |
904 | 2 | } |
905 | 2 | } |
906 | 4 | } |
907 | 1 | } |
908 | 3 | if (compaction_type() == ReaderType::READER_COLD_DATA_COMPACTION) { |
909 | | // write output rowset to storage policy resource |
910 | 0 | auto storage_policy = get_storage_policy(_tablet->storage_policy_id()); |
911 | 0 | if (storage_policy == nullptr) { |
912 | 0 | return Status::InternalError("could not find storage_policy, storage_policy_id={}", |
913 | 0 | _tablet->storage_policy_id()); |
914 | 0 | } |
915 | 0 | auto resource = get_storage_resource(storage_policy->resource_id); |
916 | 0 | if (resource.fs == nullptr) { |
917 | 0 | return Status::InternalError("could not find resource, resource_id={}", |
918 | 0 | storage_policy->resource_id); |
919 | 0 | } |
920 | 0 | DCHECK(atol(resource.fs->id().c_str()) == storage_policy->resource_id); |
921 | 0 | DCHECK(resource.fs->type() != io::FileSystemType::LOCAL); |
922 | 0 | ctx.fs = std::move(resource.fs); |
923 | 0 | } |
924 | 3 | _output_rs_writer = DORIS_TRY(_tablet->create_rowset_writer(ctx, is_vertical)); |
925 | 3 | _pending_rs_guard = StorageEngine::instance()->add_pending_rowset(ctx); |
926 | 3 | return Status::OK(); |
927 | 3 | } |
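 | |
 | | // To recap the eligibility gate built above: a column joins |
 | | // ctx.columns_to_do_index_compaction only when inverted index compaction is |
 | | // enabled, the merged schema stores indexes in the V1 format, the keys model |
 | | // is merge-on-write unique or duplicate, the column is a slice type, and every |
 | | // input rowset has a usable, non-corrupt index file for it; one bad segment |
 | | // disqualifies the whole column. |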
928 | | |
929 | 0 | Status Compaction::construct_input_rowset_readers() { |
930 | 0 | for (auto& rowset : _input_rowsets) { |
931 | 0 | RowsetReaderSharedPtr rs_reader; |
932 | 0 | RETURN_IF_ERROR(rowset->create_reader(&rs_reader)); |
933 | 0 | _input_rs_readers.push_back(std::move(rs_reader)); |
934 | 0 | } |
935 | 0 | return Status::OK(); |
936 | 0 | } |
937 | | |
938 | 0 | Status Compaction::modify_rowsets(const Merger::Statistics* stats) { |
939 | 0 | std::vector<RowsetSharedPtr> output_rowsets; |
940 | 0 | output_rowsets.push_back(_output_rowset); |
941 | |
942 | 0 | if (_tablet->keys_type() == KeysType::UNIQUE_KEYS && |
943 | 0 | _tablet->enable_unique_key_merge_on_write() && |
944 | 0 | _tablet->tablet_schema()->cluster_key_idxes().empty()) { |
945 | 0 | Version version = _tablet->max_version(); |
946 | 0 | DeleteBitmap output_rowset_delete_bitmap(_tablet->tablet_id()); |
947 | 0 | std::unique_ptr<RowLocationSet> missed_rows; |
948 | 0 | if (config::enable_missing_rows_correctness_check && !allow_delete_in_cumu_compaction() && |
949 | 0 | compaction_type() == ReaderType::READER_CUMULATIVE_COMPACTION) { |
950 | 0 | missed_rows = std::make_unique<RowLocationSet>(); |
951 | 0 | LOG(INFO) << "RowLocation Set inited succ for tablet:" << _tablet->tablet_id(); |
952 | 0 | } |
953 | 0 | std::unique_ptr<std::map<RowsetSharedPtr, RowLocationPairList>> location_map; |
954 | 0 | if (config::enable_rowid_conversion_correctness_check) { |
955 | 0 | location_map = std::make_unique<std::map<RowsetSharedPtr, RowLocationPairList>>(); |
956 | 0 | LOG(INFO) << "Location Map inited succ for tablet:" << _tablet->tablet_id(); |
957 | 0 | } |
958 | | // Convert the delete bitmap of the input rowsets to the output rowset. |
959 | | // New loads are not blocked, so some keys of the input rowsets might |
960 | | // be deleted during this time. We deal with the delete bitmap |
961 | | // of the incremental data later. |
962 | | // TODO(LiaoXin): check if there are duplicate keys |
963 | 0 | std::size_t missed_rows_size = 0; |
964 | 0 | _tablet->calc_compaction_output_rowset_delete_bitmap( |
965 | 0 | _input_rowsets, _rowid_conversion, 0, version.second + 1, missed_rows.get(), |
966 | 0 | location_map.get(), _tablet->tablet_meta()->delete_bitmap(), |
967 | 0 | &output_rowset_delete_bitmap); |
968 | 0 | if (missed_rows) { |
969 | 0 | missed_rows_size = missed_rows->size(); |
970 | 0 | if (_tablet->tablet_state() == TABLET_RUNNING && stats != nullptr && |
971 | 0 | stats->merged_rows != missed_rows_size) { |
972 | 0 | std::stringstream ss; |
973 | 0 | ss << "cumulative compaction: the merged rows(" << stats->merged_rows |
974 | 0 | << ") is not equal to missed rows(" << missed_rows_size |
975 | 0 | << ") in rowid conversion, tablet_id: " << _tablet->tablet_id() |
976 | 0 | << ", table_id:" << _tablet->table_id(); |
977 | 0 | if (missed_rows_size == 0) { |
978 | 0 | ss << ", debug info: "; |
979 | 0 | DeleteBitmap subset_map(_tablet->tablet_id()); |
980 | 0 | for (auto rs : _input_rowsets) { |
981 | 0 | _tablet->tablet_meta()->delete_bitmap().subset( |
982 | 0 | {rs->rowset_id(), 0, 0}, |
983 | 0 | {rs->rowset_id(), rs->num_segments(), version.second + 1}, |
984 | 0 | &subset_map); |
985 | 0 | ss << "(rowset id: " << rs->rowset_id() |
986 | 0 | << ", delete bitmap cardinality: " << subset_map.cardinality() << ")"; |
987 | 0 | } |
988 | 0 | ss << ", version[0-" << version.second + 1 << "]"; |
989 | 0 | } |
990 | 0 | DCHECK(false) << ss.str(); |
991 | 0 | LOG(WARNING) << ss.str(); |
992 | 0 | } |
993 | 0 | } |
994 | |
995 | 0 | if (config::enable_rowid_conversion_correctness_check) { |
996 | 0 | RETURN_IF_ERROR(_tablet->check_rowid_conversion(_output_rowset, *location_map)); |
997 | 0 | location_map->clear(); |
998 | 0 | } |
999 | | |
1000 | 0 | { |
1001 | 0 | std::lock_guard<std::mutex> wrlock_(_tablet->get_rowset_update_lock()); |
1002 | 0 | std::lock_guard<std::shared_mutex> wrlock(_tablet->get_header_lock()); |
1003 | 0 | SCOPED_SIMPLE_TRACE_IF_TIMEOUT(TRACE_TABLET_LOCK_THRESHOLD); |
1004 | | |
1005 | | // Here we calculate the delete bitmaps of all rowsets that are committed but not yet published, |
1006 | | // to reduce the calculation pressure of the publish phase. |
1007 | | // All rowsets that need recalculation have been published, so we don't need to acquire the lock. |
1008 | | // Step 1: collect the delete bitmaps of all this tablet's committed rowsets |
1009 | 0 | CommitTabletTxnInfoVec commit_tablet_txn_info_vec {}; |
1010 | 0 | StorageEngine::instance()->txn_manager()->get_all_commit_tablet_txn_info_by_tablet( |
1011 | 0 | _tablet, &commit_tablet_txn_info_vec); |
1012 | | |
1013 | | // Step 2: calculate the delete bitmaps of all rowsets published during compaction. |
1014 | 0 | for (auto& it : commit_tablet_txn_info_vec) { |
1015 | 0 | if (!_check_if_includes_input_rowsets(it.rowset_ids)) { |
1016 | | // When calculating the delete bitmap of all committed rowsets relative to the compaction, |
1017 | | // the compacted rowsets may be newer than some committed rowsets. |
1018 | | // In that case row id conversion cannot be performed, otherwise data would be lost. |
1019 | | // Therefore, we need to check that every committed rowset has calculated delete bitmaps for |
1020 | | // all compaction input rowsets. |
1021 | 0 | continue; |
1022 | 0 | } |
1023 | 0 | DeleteBitmap txn_output_delete_bitmap(_tablet->tablet_id()); |
1024 | 0 | _tablet->calc_compaction_output_rowset_delete_bitmap( |
1025 | 0 | _input_rowsets, _rowid_conversion, 0, UINT64_MAX, missed_rows.get(), |
1026 | 0 | location_map.get(), *it.delete_bitmap.get(), &txn_output_delete_bitmap); |
1027 | 0 | if (config::enable_merge_on_write_correctness_check) { |
1028 | 0 | RowsetIdUnorderedSet rowsetids; |
1029 | 0 | rowsetids.insert(_output_rowset->rowset_id()); |
1030 | 0 | _tablet->add_sentinel_mark_to_delete_bitmap(&txn_output_delete_bitmap, |
1031 | 0 | rowsetids); |
1032 | 0 | } |
1033 | 0 | it.delete_bitmap->merge(txn_output_delete_bitmap); |
1034 | | // Step 3: write back the updated delete bitmap and tablet info. |
1035 | 0 | it.rowset_ids.insert(_output_rowset->rowset_id()); |
1036 | 0 | StorageEngine::instance()->txn_manager()->set_txn_related_delete_bitmap( |
1037 | 0 | it.partition_id, it.transaction_id, _tablet->tablet_id(), |
1038 | 0 | _tablet->tablet_uid(), true, it.delete_bitmap, it.rowset_ids, |
1039 | 0 | it.partial_update_info); |
1040 | 0 | } |
1041 | | |
1042 | | // Convert the delete bitmap of the input rowsets to the output rowset for |
1043 | | // the incremental data. |
1044 | 0 | _tablet->calc_compaction_output_rowset_delete_bitmap( |
1045 | 0 | _input_rowsets, _rowid_conversion, version.second, UINT64_MAX, |
1046 | 0 | missed_rows.get(), location_map.get(), _tablet->tablet_meta()->delete_bitmap(), |
1047 | 0 | &output_rowset_delete_bitmap); |
1048 | |
1049 | 0 | if (missed_rows) { |
1050 | 0 | DCHECK_EQ(missed_rows->size(), missed_rows_size); |
1051 | 0 | if (missed_rows->size() != missed_rows_size) { |
1052 | 0 | LOG(WARNING) << "missed rows don't match, before: " << missed_rows_size |
1053 | 0 | << " after: " << missed_rows->size(); |
1054 | 0 | } |
1055 | 0 | } |
1056 | |
1057 | 0 | if (location_map) { |
1058 | 0 | RETURN_IF_ERROR(_tablet->check_rowid_conversion(_output_rowset, *location_map)); |
1059 | 0 | } |
1060 | | |
1061 | 0 | _tablet->merge_delete_bitmap(output_rowset_delete_bitmap); |
1062 | 0 | RETURN_IF_ERROR(_tablet->modify_rowsets(output_rowsets, _input_rowsets, true)); |
1063 | 0 | } |
1064 | 0 | } else { |
1065 | 0 | std::lock_guard<std::shared_mutex> wrlock(_tablet->get_header_lock()); |
1066 | 0 | SCOPED_SIMPLE_TRACE_IF_TIMEOUT(TRACE_TABLET_LOCK_THRESHOLD); |
1067 | 0 | RETURN_IF_ERROR(_tablet->modify_rowsets(output_rowsets, _input_rowsets, true)); |
1068 | 0 | } |
1069 | | |
1070 | 0 | if (config::tablet_rowset_stale_sweep_by_size && |
1071 | 0 | _tablet->tablet_meta()->all_stale_rs_metas().size() >= |
1072 | 0 | config::tablet_rowset_stale_sweep_threshold_size) { |
1073 | 0 | _tablet->delete_expired_stale_rowset(); |
1074 | 0 | } |
1075 | |
1076 | 0 | int64_t cur_max_version = 0; |
1077 | 0 | { |
1078 | 0 | std::shared_lock rlock(_tablet->get_header_lock()); |
1079 | 0 | cur_max_version = _tablet->max_version_unlocked().second; |
1080 | 0 | _tablet->save_meta(); |
1081 | 0 | } |
1082 | 0 | if (_tablet->keys_type() == KeysType::UNIQUE_KEYS && |
1083 | 0 | _tablet->enable_unique_key_merge_on_write()) { |
1084 | 0 | auto st = TabletMetaManager::remove_old_version_delete_bitmap( |
1085 | 0 | _tablet->data_dir(), _tablet->tablet_id(), cur_max_version); |
1086 | 0 | if (!st.ok()) { |
1087 | 0 | LOG(WARNING) << "failed to remove old version delete bitmap, st: " << st; |
1088 | 0 | } |
1089 | 0 | } |
1090 | 0 | return Status::OK(); |
1091 | 0 | } |
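 | |
 | | // A sketch of the delete-bitmap version windows used above (illustrative |
 | | // only), where V is _tablet->max_version().second at function entry: |
 | | // |
 | | //   [0, V + 1)       converted first, before taking the header lock |
 | | //   committed txns   recalculated under the lock via the TxnManager |
 | | //   [V, UINT64_MAX)  incremental deletes that landed during compaction, |
 | | //                    converted under the lock before the rowset swap |
 | | // |
 | | // The two plain windows share the boundary version V; since conversion |
 | | // produces bitmaps and merging is a union, the overlap appears harmless. |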
1092 | | |
1093 | | bool Compaction::_check_if_includes_input_rowsets( |
1094 | 0 | const RowsetIdUnorderedSet& commit_rowset_ids_set) const { |
1095 | 0 | std::vector<RowsetId> commit_rowset_ids {}; |
1096 | 0 | commit_rowset_ids.insert(commit_rowset_ids.end(), commit_rowset_ids_set.begin(), |
1097 | 0 | commit_rowset_ids_set.end()); |
1098 | 0 | std::sort(commit_rowset_ids.begin(), commit_rowset_ids.end()); |
1099 | 0 | std::vector<RowsetId> input_rowset_ids {}; |
1100 | 0 | for (const auto& rowset : _input_rowsets) { |
1101 | 0 | input_rowset_ids.emplace_back(rowset->rowset_meta()->rowset_id()); |
1102 | 0 | } |
1103 | 0 | std::sort(input_rowset_ids.begin(), input_rowset_ids.end()); |
1104 | 0 | return std::includes(commit_rowset_ids.begin(), commit_rowset_ids.end(), |
1105 | 0 | input_rowset_ids.begin(), input_rowset_ids.end()); |
1106 | 0 | } |
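 | |
 | | // std::includes requires both ranges to be sorted with the same comparator, |
 | | // which is why both id vectors are sorted first. A minimal illustration |
 | | // (hypothetical ids): |
 | | // |
 | | //   commit = {r1, r2, r3, r5}, input = {r2, r3} -> true  (all inputs committed) |
 | | //   commit = {r1, r3},         input = {r2, r3} -> false (r2 missing, so this |
 | | //                                                         txn is skipped above) |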
1107 | | |
1108 | 0 | void Compaction::gc_output_rowset() { |
1109 | 0 | if (_state != CompactionState::SUCCESS && _output_rowset != nullptr) { |
1110 | 0 | if (!_output_rowset->is_local()) { |
1111 | 0 | _tablet->record_unused_remote_rowset(_output_rowset->rowset_id(), |
1112 | 0 | _output_rowset->rowset_meta()->resource_id(), |
1113 | 0 | _output_rowset->num_segments()); |
1114 | 0 | return; |
1115 | 0 | } |
1116 | 0 | StorageEngine::instance()->add_unused_rowset(_output_rowset); |
1117 | 0 | } |
1118 | 0 | } |
1119 | | |
1120 | | // Find the longest prefix of consecutive versions in "rowsets", starting from the beginning. |
1121 | | // If missing_version is not null, the versions on both sides of the first gap |
1122 | | // are appended to it. |
1123 | | Status Compaction::find_longest_consecutive_version(std::vector<RowsetSharedPtr>* rowsets, |
1124 | 16 | std::vector<Version>* missing_version) { |
1125 | 16 | if (rowsets->empty()) { |
1126 | 2 | return Status::OK(); |
1127 | 2 | } |
1128 | 14 | RowsetSharedPtr prev_rowset = rowsets->front(); |
1129 | 14 | size_t i = 1; |
1130 | 288 | for (; i < rowsets->size(); ++i) { |
1131 | 276 | RowsetSharedPtr rowset = (*rowsets)[i]; |
1132 | 276 | if (rowset->start_version() != prev_rowset->end_version() + 1) { |
1133 | 2 | if (missing_version != nullptr) { |
1134 | 0 | missing_version->push_back(prev_rowset->version()); |
1135 | 0 | missing_version->push_back(rowset->version()); |
1136 | 0 | } |
1137 | 2 | break; |
1138 | 2 | } |
1139 | 274 | prev_rowset = rowset; |
1140 | 274 | } |
1141 | | |
1142 | 14 | rowsets->resize(i); |
1143 | 14 | return Status::OK(); |
1144 | 16 | } |
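 | |
 | | // Example (illustrative versions): given rowsets [0-5][6-9][11-12], the loop |
 | | // stops at [11-12] because 11 != 9 + 1; rowsets is resized to {[0-5], [6-9]} |
 | | // and, if requested, missing_version receives {[6-9], [11-12]} -- the versions |
 | | // on either side of the gap. |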
1145 | | |
1146 | 1 | Status Compaction::check_version_continuity(const std::vector<RowsetSharedPtr>& rowsets) { |
1147 | 1 | if (rowsets.empty()) { |
1148 | 0 | return Status::OK(); |
1149 | 0 | } |
1150 | 1 | RowsetSharedPtr prev_rowset = rowsets.front(); |
1151 | 24 | for (size_t i = 1; i < rowsets.size(); ++i) { |
1152 | 23 | RowsetSharedPtr rowset = rowsets[i]; |
1153 | 23 | if (rowset->start_version() != prev_rowset->end_version() + 1) { |
1154 | 0 | return Status::Error<CUMULATIVE_MISS_VERSION>( |
1155 | 0 | "There are missed versions among rowsets. prev_rowset version={}-{}, rowset " |
1156 | 0 | "version={}-{}", |
1157 | 0 | prev_rowset->start_version(), prev_rowset->end_version(), |
1158 | 0 | rowset->start_version(), rowset->end_version()); |
1159 | 0 | } |
1160 | 23 | prev_rowset = rowset; |
1161 | 23 | } |
1162 | | |
1163 | 1 | return Status::OK(); |
1164 | 1 | } |
1165 | | |
1166 | 0 | Status Compaction::check_correctness(const Merger::Statistics& stats) { |
1167 | | // 1. check row number |
1168 | 0 | if (_input_row_num != _output_rowset->num_rows() + stats.merged_rows + stats.filtered_rows) { |
1169 | 0 | return Status::Error<CHECK_LINES_ERROR>( |
1170 | 0 | "row_num does not match between cumulative input and output! tablet={}, " |
1171 | 0 | "input_row_num={}, merged_row_num={}, filtered_row_num={}, output_row_num={}", |
1172 | 0 | _tablet->tablet_id(), _input_row_num, stats.merged_rows, stats.filtered_rows, |
1173 | 0 | _output_rowset->num_rows()); |
1174 | 0 | } |
1175 | 0 | return Status::OK(); |
1176 | 0 | } |
1177 | | |
1178 | 0 | int64_t Compaction::get_compaction_permits() { |
1179 | 0 | int64_t permits = 0; |
1180 | 0 | for (auto rowset : _input_rowsets) { |
1181 | 0 | permits += rowset->rowset_meta()->get_compaction_score(); |
1182 | 0 | } |
1183 | 0 | return permits; |
1184 | 0 | } |
1185 | | |
1186 | 0 | void Compaction::_load_segment_to_cache() { |
1187 | | // Load new rowset's segments to cache. |
1188 | 0 | SegmentCacheHandle handle; |
1189 | 0 | auto st = SegmentLoader::instance()->load_segments( |
1190 | 0 | std::static_pointer_cast<BetaRowset>(_output_rowset), &handle, true); |
1191 | 0 | if (!st.ok()) { |
1192 | 0 | LOG(WARNING) << "failed to load segment to cache! output rowset version=" |
1193 | 0 | << _output_rowset->start_version() << "-" << _output_rowset->end_version() |
1194 | 0 | << "."; |
1195 | 0 | } |
1196 | 0 | } |
1197 | | |
1198 | | #ifdef BE_TEST |
1199 | 1 | void Compaction::set_input_rowset(const std::vector<RowsetSharedPtr>& rowsets) { |
1200 | 1 | _input_rowsets = rowsets; |
1201 | 1 | } |
1202 | | |
1203 | 1 | RowsetSharedPtr Compaction::output_rowset() { |
1204 | 1 | return _output_rowset; |
1205 | 1 | } |
1206 | | #endif |
1207 | | |
1208 | | } // namespace doris |