be/src/load/delta_writer/delta_writer.cpp
Line | Count | Source |
1 | | // Licensed to the Apache Software Foundation (ASF) under one |
2 | | // or more contributor license agreements. See the NOTICE file |
3 | | // distributed with this work for additional information |
4 | | // regarding copyright ownership. The ASF licenses this file |
5 | | // to you under the Apache License, Version 2.0 (the |
6 | | // "License"); you may not use this file except in compliance |
7 | | // with the License. You may obtain a copy of the License at |
8 | | // |
9 | | // http://www.apache.org/licenses/LICENSE-2.0 |
10 | | // |
11 | | // Unless required by applicable law or agreed to in writing, |
12 | | // software distributed under the License is distributed on an |
13 | | // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY |
14 | | // KIND, either express or implied. See the License for the |
15 | | // specific language governing permissions and limitations |
16 | | // under the License. |
17 | | |
18 | | #include "load/delta_writer/delta_writer.h" |
19 | | |
20 | | #include <brpc/controller.h> |
21 | | #include <butil/errno.h> |
22 | | #include <fmt/format.h> |
23 | | #include <gen_cpp/internal_service.pb.h> |
24 | | #include <gen_cpp/olap_file.pb.h> |
25 | | |
#include <filesystem>
#include <ostream>
#include <string>
#include <system_error>
#include <utility>
30 | | |
31 | | // IWYU pragma: no_include <opentelemetry/common/threadlocal.h> |
32 | | #include "common/compiler_util.h" // IWYU pragma: keep |
33 | | #include "common/config.h" |
34 | | #include "common/logging.h" |
35 | | #include "common/status.h" |
36 | | #include "core/block/block.h" |
37 | | #include "io/fs/file_writer.h" // IWYU pragma: keep |
38 | | #include "load/memtable/memtable_flush_executor.h" |
39 | | #include "runtime/exec_env.h" |
40 | | #include "runtime/thread_context.h" |
41 | | #include "service/backend_options.h" |
42 | | #include "storage/index/inverted/inverted_index_desc.h" |
43 | | #include "storage/olap_define.h" |
44 | | #include "storage/rowset/beta_rowset.h" |
45 | | #include "storage/rowset/beta_rowset_writer.h" |
46 | | #include "storage/rowset/rowset_meta.h" |
47 | | #include "storage/rowset_builder.h" |
48 | | #include "storage/schema_change/schema_change.h" |
49 | | #include "storage/storage_engine.h" |
50 | | #include "storage/tablet/tablet_manager.h" |
51 | | #include "storage/tablet_info.h" |
52 | | #include "storage/txn/txn_manager.h" |
53 | | #include "util/brpc_client_cache.h" |
54 | | #include "util/brpc_closure.h" |
55 | | #include "util/mem_info.h" |
56 | | #include "util/stopwatch.hpp" |
57 | | #include "util/time.h" |
58 | | |
59 | | namespace doris { |
60 | | using namespace ErrorCode; |
61 | | |
62 | | BaseDeltaWriter::BaseDeltaWriter(const WriteRequest& req, RuntimeProfile* profile, |
63 | | const UniqueId& load_id) |
64 | 262k | : _req(req), _memtable_writer(new MemTableWriter(req)) { |
65 | 262k | _init_profile(profile); |
66 | 262k | } |
67 | | |
68 | | DeltaWriter::DeltaWriter(StorageEngine& engine, const WriteRequest& req, RuntimeProfile* profile, |
69 | | const UniqueId& load_id) |
70 | 2.70k | : BaseDeltaWriter(req, profile, load_id), _engine(engine) { |
71 | 2.70k | _rowset_builder = std::make_unique<RowsetBuilder>(_engine, req, profile); |
72 | 2.70k | } |
73 | | |
74 | 262k | void BaseDeltaWriter::_init_profile(RuntimeProfile* profile) { |
75 | 262k | _profile = profile->create_child(fmt::format("DeltaWriter {}", _req.tablet_id), true, true); |
76 | 262k | _close_wait_timer = ADD_TIMER(_profile, "CloseWaitTime"); |
77 | 262k | _wait_flush_limit_timer = ADD_TIMER(_profile, "WaitFlushLimitTime"); |
78 | 262k | } |
79 | | |
80 | 0 | void DeltaWriter::_init_profile(RuntimeProfile* profile) { |
81 | 0 | BaseDeltaWriter::_init_profile(profile); |
82 | 0 | _commit_txn_timer = ADD_TIMER(_profile, "CommitTxnTime"); |
83 | 0 | } |
84 | | |
85 | 262k | BaseDeltaWriter::~BaseDeltaWriter() { |
86 | 262k | if (!_is_init) { |
87 | 208k | return; |
88 | 208k | } |
89 | | |
90 | | // cancel and wait all memtables in flush queue to be finished |
91 | 54.1k | static_cast<void>(_memtable_writer->cancel()); |
92 | | |
93 | 54.1k | if (_rowset_builder->tablet() != nullptr) { |
94 | 54.1k | const FlushStatistic& stat = _memtable_writer->get_flush_token_stats(); |
95 | 54.1k | _rowset_builder->tablet()->flush_bytes->increment(stat.flush_size_bytes); |
96 | 54.1k | _rowset_builder->tablet()->flush_finish_count->increment(stat.flush_finish_count); |
97 | 54.1k | } |
98 | 54.1k | } |
99 | | |
100 | | void BaseDeltaWriter::collect_tablet_load_rowset_num_info( |
101 | | BaseTablet* tablet, |
102 | 127k | google::protobuf::RepeatedPtrField<PTabletLoadRowsetInfo>* tablet_infos) { |
103 | 127k | if (tablet == nullptr) { |
104 | 0 | return; |
105 | 0 | } |
106 | 127k | auto max_version_config = tablet->max_version_config(); |
107 | 127k | if (auto version_cnt = tablet->tablet_meta()->version_count(); |
108 | 127k | UNLIKELY(version_cnt > |
109 | 127k | (max_version_config * config::load_back_pressure_version_threshold / 100))) { |
110 | 0 | auto* load_info = tablet_infos->Add(); |
111 | 0 | load_info->set_current_rowset_nums(static_cast<int32_t>(version_cnt)); |
112 | 0 | load_info->set_max_config_rowset_nums(max_version_config); |
113 | 0 | } |
114 | 127k | } |
115 | | |
116 | | void BaseDeltaWriter::set_tablet_load_rowset_num_info( |
117 | 113k | google::protobuf::RepeatedPtrField<PTabletLoadRowsetInfo>* tablet_infos) { |
118 | 113k | auto* tablet = _rowset_builder->tablet().get(); |
119 | 113k | collect_tablet_load_rowset_num_info(tablet, tablet_infos); |
120 | 113k | } |
121 | | |
122 | 2.70k | DeltaWriter::~DeltaWriter() = default; |
123 | | |
124 | 54.1k | Status BaseDeltaWriter::init() { |
125 | 54.1k | if (_is_init) { |
126 | 0 | return Status::OK(); |
127 | 0 | } |
128 | 54.1k | std::shared_ptr<WorkloadGroup> wg_sptr = nullptr; |
129 | 54.1k | if (doris::thread_context()->is_attach_task()) { |
130 | 54.0k | wg_sptr = doris::thread_context()->resource_ctx()->workload_group(); |
131 | 54.0k | } |
132 | 54.1k | RETURN_IF_ERROR(_rowset_builder->init()); |
133 | 54.1k | RETURN_IF_ERROR(_memtable_writer->init( |
134 | 54.1k | _rowset_builder->rowset_writer(), _rowset_builder->tablet_schema(), |
135 | 54.1k | _rowset_builder->get_partial_update_info(), wg_sptr, |
136 | 54.1k | _rowset_builder->tablet()->enable_unique_key_merge_on_write())); |
137 | 54.1k | ExecEnv::GetInstance()->memtable_memory_limiter()->register_writer(_memtable_writer); |
138 | 54.1k | _is_init = true; |
139 | 54.1k | return Status::OK(); |
140 | 54.1k | } |
141 | | |
142 | 606 | Status DeltaWriter::write(const Block* block, const DorisVector<uint32_t>& row_idxs) { |
143 | 606 | if (UNLIKELY(row_idxs.empty())) { |
144 | 0 | return Status::OK(); |
145 | 0 | } |
146 | 606 | _lock_watch.start(); |
147 | 606 | std::lock_guard<std::mutex> l(_lock); |
148 | 606 | _lock_watch.stop(); |
149 | 606 | if (!_is_init && !_is_cancelled) { |
150 | 598 | RETURN_IF_ERROR(init()); |
151 | 598 | } |
152 | 606 | { |
153 | 606 | SCOPED_TIMER(_wait_flush_limit_timer); |
154 | 606 | while (_memtable_writer->flush_running_count() >= |
155 | 606 | config::memtable_flush_running_count_limit) { |
156 | 0 | std::this_thread::sleep_for(std::chrono::milliseconds(10)); |
157 | 0 | } |
158 | 606 | } |
159 | 606 | return _memtable_writer->write(block, row_idxs); |
160 | 606 | } |
161 | | |
162 | 1.77k | Status BaseDeltaWriter::wait_flush() { |
163 | 1.77k | return _memtable_writer->wait_flush(); |
164 | 1.77k | } |
165 | | |
166 | 1.78k | Status DeltaWriter::close() { |
167 | 1.78k | _lock_watch.start(); |
168 | 1.78k | std::lock_guard<std::mutex> l(_lock); |
169 | 1.78k | _lock_watch.stop(); |
170 | 1.78k | if (!_is_init && !_is_cancelled) { |
171 | | // if this delta writer is not initialized, but close() is called. |
172 | | // which means this tablet has no data loaded, but at least one tablet |
173 | | // in same partition has data loaded. |
174 | | // so we have to also init this DeltaWriter, so that it can create an empty rowset |
175 | | // for this tablet when being closed. |
176 | 1.18k | RETURN_IF_ERROR(init()); |
177 | 1.18k | } |
178 | 1.78k | return _memtable_writer->close(); |
179 | 1.78k | } |
180 | | |
181 | 54.1k | Status BaseDeltaWriter::build_rowset() { |
182 | 54.1k | SCOPED_TIMER(_close_wait_timer); |
183 | 54.1k | RETURN_IF_ERROR(_memtable_writer->close_wait(_profile)); |
184 | 54.1k | return _rowset_builder->build_rowset(); |
185 | 54.1k | } |
186 | | |
187 | 1.78k | Status DeltaWriter::build_rowset() { |
188 | 1.78k | std::lock_guard<std::mutex> l(_lock); |
189 | 1.78k | DCHECK(_is_init) |
190 | 0 | << "delta writer is supposed be to initialized before build_rowset() being called"; |
191 | 1.78k | return BaseDeltaWriter::build_rowset(); |
192 | 1.78k | } |
193 | | |
194 | 171k | Status BaseDeltaWriter::submit_calc_delete_bitmap_task() { |
195 | 171k | return _rowset_builder->submit_calc_delete_bitmap_task(); |
196 | 171k | } |
197 | | |
198 | 171k | Status BaseDeltaWriter::wait_calc_delete_bitmap() { |
199 | 171k | return _rowset_builder->wait_calc_delete_bitmap(); |
200 | 171k | } |
201 | | |
202 | 1.78k | RowsetBuilder* DeltaWriter::rowset_builder() { |
203 | 1.78k | return static_cast<RowsetBuilder*>(_rowset_builder.get()); |
204 | 1.78k | } |
205 | | |
206 | 1.78k | Status DeltaWriter::commit_txn(const PSlaveTabletNodes& slave_tablet_nodes) { |
207 | 1.78k | std::lock_guard<std::mutex> l(_lock); |
208 | 1.78k | SCOPED_TIMER(_commit_txn_timer); |
209 | 1.78k | RETURN_IF_ERROR(rowset_builder()->commit_txn()); |
210 | | |
211 | 1.78k | for (auto&& node_info : slave_tablet_nodes.slave_nodes()) { |
212 | 0 | _request_slave_tablet_pull_rowset(node_info); |
213 | 0 | } |
214 | 1.78k | return Status::OK(); |
215 | 1.78k | } |
216 | | |
217 | | bool DeltaWriter::check_slave_replicas_done( |
218 | 0 | google::protobuf::Map<int64_t, PSuccessSlaveTabletNodeIds>* success_slave_tablet_node_ids) { |
219 | 0 | std::lock_guard<std::shared_mutex> lock(_slave_node_lock); |
220 | 0 | if (_unfinished_slave_node.empty()) { |
221 | 0 | success_slave_tablet_node_ids->insert({_req.tablet_id, _success_slave_node_ids}); |
222 | 0 | return true; |
223 | 0 | } |
224 | 0 | return false; |
225 | 0 | } |
226 | | |
227 | | void DeltaWriter::add_finished_slave_replicas( |
228 | 0 | google::protobuf::Map<int64_t, PSuccessSlaveTabletNodeIds>* success_slave_tablet_node_ids) { |
229 | 0 | std::lock_guard<std::shared_mutex> lock(_slave_node_lock); |
230 | 0 | success_slave_tablet_node_ids->insert({_req.tablet_id, _success_slave_node_ids}); |
231 | 0 | } |
232 | | |
233 | 90.5k | Status BaseDeltaWriter::cancel() { |
234 | 90.5k | return cancel_with_status(Status::Cancelled("already cancelled")); |
235 | 90.5k | } |
236 | | |
237 | 90.5k | Status BaseDeltaWriter::cancel_with_status(const Status& st) { |
238 | 90.5k | if (_is_cancelled) { |
239 | 0 | return Status::OK(); |
240 | 0 | } |
241 | 90.5k | RETURN_IF_ERROR(_memtable_writer->cancel_with_status(st)); |
242 | 90.5k | _is_cancelled = true; |
243 | 90.5k | return Status::OK(); |
244 | 90.5k | } |
245 | | |
246 | 924 | Status DeltaWriter::cancel_with_status(const Status& st) { |
247 | 924 | std::lock_guard<std::mutex> l(_lock); |
248 | 924 | return BaseDeltaWriter::cancel_with_status(st); |
249 | 924 | } |
250 | | |
251 | 88 | int64_t BaseDeltaWriter::mem_consumption(MemType mem) { |
252 | 88 | return _memtable_writer->mem_consumption(mem); |
253 | 88 | } |
254 | | |
255 | 0 | void DeltaWriter::_request_slave_tablet_pull_rowset(const PNodeInfo& node_info) { |
256 | 0 | std::shared_ptr<PBackendService_Stub> stub = |
257 | 0 | ExecEnv::GetInstance()->brpc_internal_client_cache()->get_client( |
258 | 0 | node_info.host(), node_info.async_internal_port()); |
259 | 0 | if (stub == nullptr) { |
260 | 0 | LOG(WARNING) << "failed to send pull rowset request to slave replica. get rpc stub failed, " |
261 | 0 | "slave host=" |
262 | 0 | << node_info.host() << ", port=" << node_info.async_internal_port() |
263 | 0 | << ", tablet_id=" << _req.tablet_id << ", txn_id=" << _req.txn_id; |
264 | 0 | return; |
265 | 0 | } |
266 | | |
267 | 0 | _engine.txn_manager()->add_txn_tablet_delta_writer(_req.txn_id, _req.tablet_id, this); |
268 | 0 | { |
269 | 0 | std::lock_guard<std::shared_mutex> lock(_slave_node_lock); |
270 | 0 | _unfinished_slave_node.insert(node_info.id()); |
271 | 0 | } |
272 | |
|
273 | 0 | std::vector<std::pair<int64_t, std::string>> indices_ids; |
274 | 0 | auto cur_rowset = _rowset_builder->rowset(); |
275 | 0 | auto tablet_schema = cur_rowset->rowset_meta()->tablet_schema(); |
276 | 0 | if (!tablet_schema->skip_write_index_on_load()) { |
277 | 0 | for (const auto& column : tablet_schema->columns()) { |
278 | 0 | auto index_metas = tablet_schema->inverted_indexs(*column); |
279 | 0 | for (const auto* index_meta : index_metas) { |
280 | 0 | indices_ids.emplace_back(index_meta->index_id(), index_meta->get_index_suffix()); |
281 | 0 | } |
282 | 0 | } |
283 | 0 | } |
284 | |
|
285 | 0 | auto request = std::make_shared<PTabletWriteSlaveRequest>(); |
286 | 0 | auto* request_mutable_rs_meta = request->mutable_rowset_meta(); |
287 | 0 | *request_mutable_rs_meta = cur_rowset->rowset_meta()->get_rowset_pb(); |
288 | 0 | if (request_mutable_rs_meta != nullptr && request_mutable_rs_meta->has_partition_id() && |
289 | 0 | request_mutable_rs_meta->partition_id() == 0) { |
290 | | // TODO(dx): remove log after fix partition id eq 0 bug |
291 | 0 | request_mutable_rs_meta->set_partition_id(_req.partition_id); |
292 | 0 | LOG(WARNING) << "cant get partition id from local rs pb, get from _req, partition_id=" |
293 | 0 | << _req.partition_id; |
294 | 0 | } |
295 | 0 | request->set_host(BackendOptions::get_localhost()); |
296 | 0 | request->set_http_port(config::webserver_port); |
297 | 0 | const auto& tablet_path = cur_rowset->tablet_path(); |
298 | 0 | request->set_rowset_path(tablet_path); |
299 | 0 | request->set_token(ExecEnv::GetInstance()->token()); |
300 | 0 | request->set_brpc_port(config::brpc_port); |
301 | 0 | request->set_node_id(static_cast<int32_t>(node_info.id())); |
302 | 0 | for (int segment_id = 0; segment_id < cur_rowset->rowset_meta()->num_segments(); segment_id++) { |
303 | 0 | auto seg_path = |
304 | 0 | local_segment_path(tablet_path, cur_rowset->rowset_id().to_string(), segment_id); |
305 | 0 | int64_t segment_size = std::filesystem::file_size(seg_path); |
306 | 0 | request->mutable_segments_size()->insert({segment_id, segment_size}); |
307 | 0 | auto index_path_prefix = InvertedIndexDescriptor::get_index_file_path_prefix(seg_path); |
308 | 0 | if (!indices_ids.empty()) { |
309 | 0 | if (tablet_schema->get_inverted_index_storage_format() == |
310 | 0 | InvertedIndexStorageFormatPB::V1) { |
311 | 0 | for (auto index_meta : indices_ids) { |
312 | 0 | std::string inverted_index_file = |
313 | 0 | InvertedIndexDescriptor::get_index_file_path_v1( |
314 | 0 | index_path_prefix, index_meta.first, index_meta.second); |
315 | 0 | int64_t size = std::filesystem::file_size(inverted_index_file); |
316 | 0 | PTabletWriteSlaveRequest::IndexSize index_size; |
317 | 0 | index_size.set_indexid(index_meta.first); |
318 | 0 | index_size.set_size(size); |
319 | 0 | index_size.set_suffix_path(index_meta.second); |
320 | | // Fetch the map value for the current segment_id. |
321 | | // If it doesn't exist, this will insert a new default-constructed IndexSizeMapValue |
322 | 0 | auto& index_size_map_value = |
323 | 0 | (*(request->mutable_inverted_indices_size()))[segment_id]; |
324 | | // Add the new index size to the map value. |
325 | 0 | *index_size_map_value.mutable_index_sizes()->Add() = std::move(index_size); |
326 | 0 | } |
327 | 0 | } else { |
328 | 0 | std::string inverted_index_file = |
329 | 0 | InvertedIndexDescriptor::get_index_file_path_v2(index_path_prefix); |
330 | 0 | int64_t size = std::filesystem::file_size(inverted_index_file); |
331 | 0 | PTabletWriteSlaveRequest::IndexSize index_size; |
332 | | // special id for non-V1 format |
333 | 0 | index_size.set_indexid(0); |
334 | 0 | index_size.set_size(size); |
335 | 0 | index_size.set_suffix_path(""); |
336 | | // Fetch the map value for the current segment_id. |
337 | | // If it doesn't exist, this will insert a new default-constructed IndexSizeMapValue |
338 | 0 | auto& index_size_map_value = |
339 | 0 | (*(request->mutable_inverted_indices_size()))[segment_id]; |
340 | | // Add the new index size to the map value. |
341 | 0 | *index_size_map_value.mutable_index_sizes()->Add() = std::move(index_size); |
342 | 0 | } |
343 | 0 | } |
344 | 0 | } |
345 | |
|
346 | 0 | auto pull_callback = DummyBrpcCallback<PTabletWriteSlaveResult>::create_shared(); |
347 | 0 | auto closure = AutoReleaseClosure< |
348 | 0 | PTabletWriteSlaveRequest, |
349 | 0 | DummyBrpcCallback<PTabletWriteSlaveResult>>::create_unique(request, pull_callback); |
350 | 0 | closure->cntl_->set_timeout_ms(config::slave_replica_writer_rpc_timeout_sec * 1000); |
351 | 0 | closure->cntl_->ignore_eovercrowded(); |
352 | 0 | stub->request_slave_tablet_pull_rowset(closure->cntl_.get(), closure->request_.get(), |
353 | 0 | closure->response_.get(), closure.get()); |
354 | |
|
355 | 0 | closure.release(); |
356 | 0 | pull_callback->join(); |
357 | 0 | if (pull_callback->cntl_->Failed()) { |
358 | 0 | if (!ExecEnv::GetInstance()->brpc_internal_client_cache()->available( |
359 | 0 | stub, node_info.host(), node_info.async_internal_port())) { |
360 | 0 | ExecEnv::GetInstance()->brpc_internal_client_cache()->erase( |
361 | 0 | pull_callback->cntl_->remote_side()); |
362 | 0 | } |
363 | 0 | LOG(WARNING) << "failed to send pull rowset request to slave replica, error=" |
364 | 0 | << berror(pull_callback->cntl_->ErrorCode()) |
365 | 0 | << ", error_text=" << pull_callback->cntl_->ErrorText() |
366 | 0 | << ". slave host: " << node_info.host() << ", tablet_id=" << _req.tablet_id |
367 | 0 | << ", txn_id=" << _req.txn_id; |
368 | 0 | std::lock_guard<std::shared_mutex> lock(_slave_node_lock); |
369 | 0 | _unfinished_slave_node.erase(node_info.id()); |
370 | 0 | } |
371 | 0 | } |
372 | | |
373 | 0 | void DeltaWriter::finish_slave_tablet_pull_rowset(int64_t node_id, bool is_succeed) { |
374 | 0 | std::lock_guard<std::shared_mutex> lock(_slave_node_lock); |
375 | 0 | if (is_succeed) { |
376 | 0 | _success_slave_node_ids.add_slave_node_ids(node_id); |
377 | 0 | VLOG_CRITICAL << "record successful slave replica for txn [" << _req.txn_id |
378 | 0 | << "], tablet_id=" << _req.tablet_id << ", node_id=" << node_id; |
379 | 0 | } |
380 | 0 | _unfinished_slave_node.erase(node_id); |
381 | 0 | } |
382 | | |
383 | 173k | int64_t BaseDeltaWriter::num_rows_filtered() const { |
384 | 173k | auto rowset_writer = _rowset_builder->rowset_writer(); |
385 | 173k | return rowset_writer == nullptr ? 0 : rowset_writer->num_rows_filtered(); |
386 | 173k | } |
387 | | |
388 | | } // namespace doris |