Coverage Report

Created: 2026-06-09 14:13

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
be/src/load/delta_writer/delta_writer.cpp
Line
Count
Source
1
// Licensed to the Apache Software Foundation (ASF) under one
2
// or more contributor license agreements.  See the NOTICE file
3
// distributed with this work for additional information
4
// regarding copyright ownership.  The ASF licenses this file
5
// to you under the Apache License, Version 2.0 (the
6
// "License"); you may not use this file except in compliance
7
// with the License.  You may obtain a copy of the License at
8
//
9
//   http://www.apache.org/licenses/LICENSE-2.0
10
//
11
// Unless required by applicable law or agreed to in writing,
12
// software distributed under the License is distributed on an
13
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14
// KIND, either express or implied.  See the License for the
15
// specific language governing permissions and limitations
16
// under the License.
17
18
#include "load/delta_writer/delta_writer.h"
19
20
#include <brpc/controller.h>
21
#include <butil/errno.h>
22
#include <fmt/format.h>
23
#include <gen_cpp/internal_service.pb.h>
24
#include <gen_cpp/olap_file.pb.h>
25
26
#include <filesystem>
27
#include <ostream>
28
#include <string>
29
#include <utility>
30
31
// IWYU pragma: no_include <opentelemetry/common/threadlocal.h>
32
#include "common/compiler_util.h" // IWYU pragma: keep
33
#include "common/config.h"
34
#include "common/logging.h"
35
#include "common/status.h"
36
#include "core/block/block.h"
37
#include "io/fs/file_writer.h" // IWYU pragma: keep
38
#include "load/memtable/memtable_flush_executor.h"
39
#include "load/memtable/memtable_memory_limiter.h"
40
#include "runtime/exec_env.h"
41
#include "runtime/thread_context.h"
42
#include "service/backend_options.h"
43
#include "storage/index/inverted/inverted_index_desc.h"
44
#include "storage/olap_define.h"
45
#include "storage/rowset/beta_rowset.h"
46
#include "storage/rowset/beta_rowset_writer.h"
47
#include "storage/rowset/rowset_meta.h"
48
#include "storage/rowset_builder.h"
49
#include "storage/schema_change/schema_change.h"
50
#include "storage/storage_engine.h"
51
#include "storage/tablet/tablet_manager.h"
52
#include "storage/tablet_info.h"
53
#include "storage/txn/txn_manager.h"
54
#include "util/brpc_client_cache.h"
55
#include "util/brpc_closure.h"
56
#include "util/mem_info.h"
57
#include "util/stopwatch.hpp"
58
#include "util/time.h"
59
60
namespace doris {
61
using namespace ErrorCode;
62
63
BaseDeltaWriter::BaseDeltaWriter(const WriteRequest& req, RuntimeProfile* profile,
64
                                 const UniqueId& load_id)
65
15
        : _req(req), _memtable_writer(new MemTableWriter(req)) {
66
15
    if (profile != nullptr) {
67
15
        _init_profile(profile);
68
15
    }
69
15
}
70
71
DeltaWriter::DeltaWriter(StorageEngine& engine, const WriteRequest& req, RuntimeProfile* profile,
72
                         const UniqueId& load_id)
73
15
        : BaseDeltaWriter(req, profile, load_id), _engine(engine) {
74
15
    DCHECK(req.write_req_type == WriteRequestType::DATA);
75
15
    _rowset_builder = std::make_unique<RowsetBuilder>(_engine, req, profile);
76
15
}
77
78
DeltaWriter::DeltaWriter(StorageEngine& engine, const WriteRequest& group_build_req,
79
                         const WriteRequest& sub_data_req, const WriteRequest& sub_row_binlog_req,
80
                         RuntimeProfile* profile, const UniqueId& load_id)
81
0
        : BaseDeltaWriter(group_build_req, profile, load_id), _engine(engine) {
82
0
    DCHECK(group_build_req.write_req_type == WriteRequestType::GROUP &&
83
0
           sub_data_req.write_req_type == WriteRequestType::DATA &&
84
0
           sub_row_binlog_req.write_req_type == WriteRequestType::ROW_BINLOG);
85
0
    _rowset_builder = std::make_unique<GroupRowsetBuilder>(_engine, group_build_req, sub_data_req,
86
0
                                                           sub_row_binlog_req, profile);
87
0
}
88
89
15
void BaseDeltaWriter::_init_profile(RuntimeProfile* profile) {
90
15
    DCHECK(profile != nullptr);
91
15
    _profile = profile->create_child(fmt::format("DeltaWriter {}", _req.tablet_id), true, true);
92
15
    _close_wait_timer = ADD_TIMER(_profile, "CloseWaitTime");
93
15
    _wait_flush_limit_timer = ADD_TIMER(_profile, "WaitFlushLimitTime");
94
15
}
95
96
0
void DeltaWriter::_init_profile(RuntimeProfile* profile) {
97
0
    DCHECK(profile != nullptr);
98
0
    BaseDeltaWriter::_init_profile(profile);
99
0
    _commit_txn_timer = ADD_TIMER(_profile, "CommitTxnTime");
100
0
}
101
102
15
BaseDeltaWriter::~BaseDeltaWriter() {
103
15
    if (!_is_init) {
104
0
        return;
105
0
    }
106
107
    // cancel and wait for all memtables in the flush queue to finish
108
15
    static_cast<void>(_memtable_writer->cancel());
109
110
15
    if (_rowset_builder->tablet() != nullptr) {
111
15
        const FlushStatistic& stat = _memtable_writer->get_flush_token_stats();
112
15
        _rowset_builder->tablet()->flush_bytes->increment(stat.flush_size_bytes);
113
15
        _rowset_builder->tablet()->flush_finish_count->increment(stat.flush_finish_count);
114
15
    }
115
15
}
116
117
void BaseDeltaWriter::collect_tablet_load_rowset_num_info(
118
        BaseTablet* tablet,
119
0
        google::protobuf::RepeatedPtrField<PTabletLoadRowsetInfo>* tablet_infos) {
120
0
    if (tablet == nullptr) {
121
0
        return;
122
0
    }
123
0
    auto max_version_config = tablet->max_version_config();
124
0
    if (auto version_cnt = tablet->tablet_meta()->version_count();
125
0
        UNLIKELY(version_cnt >
126
0
                 (max_version_config * config::load_back_pressure_version_threshold / 100))) {
127
0
        auto* load_info = tablet_infos->Add();
128
0
        load_info->set_current_rowset_nums(static_cast<int32_t>(version_cnt));
129
0
        load_info->set_max_config_rowset_nums(max_version_config);
130
0
    }
131
0
}
132
133
void BaseDeltaWriter::set_tablet_load_rowset_num_info(
134
0
        google::protobuf::RepeatedPtrField<PTabletLoadRowsetInfo>* tablet_infos) {
135
0
    auto* tablet = _rowset_builder->tablet().get();
136
0
    collect_tablet_load_rowset_num_info(tablet, tablet_infos);
137
0
}
138
139
0
int64_t BaseDeltaWriter::table_id() const {
140
0
    DORIS_CHECK(_req.table_schema_param != nullptr);
141
0
    return _req.table_schema_param->table_id();
142
0
}
143
144
15
DeltaWriter::~DeltaWriter() = default;
145
146
15
Status BaseDeltaWriter::init() {
147
15
    if (_is_init) {
148
0
        return Status::OK();
149
0
    }
150
15
    std::shared_ptr<WorkloadGroup> wg_sptr = nullptr;
151
15
    if (doris::thread_context()->is_attach_task()) {
152
0
        wg_sptr = doris::thread_context()->resource_ctx()->workload_group();
153
0
    }
154
15
    RETURN_IF_ERROR(_rowset_builder->init());
155
15
    RETURN_IF_ERROR(_memtable_writer->init(
156
15
            _rowset_builder->rowset_writer(), _rowset_builder->tablet_schema(),
157
15
            _rowset_builder->get_partial_update_info(), wg_sptr,
158
15
            _rowset_builder->tablet_sptr()->enable_unique_key_merge_on_write()));
159
15
    ExecEnv::GetInstance()->memtable_memory_limiter()->register_writer(_memtable_writer);
160
15
    _is_init = true;
161
15
    return Status::OK();
162
15
}
163
164
Status DeltaWriter::write(const Block* block, const DorisVector<uint32_t>& row_idxs,
165
20
                          bool* memtable_flushed) {
166
20
    if (memtable_flushed != nullptr) {
167
0
        *memtable_flushed = false;
168
0
    }
169
20
    if (UNLIKELY(row_idxs.empty())) {
170
0
        return Status::OK();
171
0
    }
172
20
    if (_req.enable_table_memtable_backpressure) {
173
0
        ExecEnv::GetInstance()->memtable_memory_limiter()->handle_table_memtable_backpressure(
174
0
                [this]() {
175
0
                    std::lock_guard<std::mutex> l(_lock);
176
0
                    return _is_cancelled;
177
0
                },
178
0
                table_id());
179
0
    }
180
20
    _lock_watch.start();
181
20
    std::lock_guard<std::mutex> l(_lock);
182
20
    _lock_watch.stop();
183
20
    if (!_is_init && !_is_cancelled) {
184
12
        RETURN_IF_ERROR(init());
185
12
    }
186
20
    {
187
20
        SCOPED_TIMER(_wait_flush_limit_timer);
188
20
        while (_memtable_writer->flush_running_count() >=
189
20
               config::memtable_flush_running_count_limit) {
190
0
            std::this_thread::sleep_for(std::chrono::milliseconds(10));
191
0
        }
192
20
    }
193
20
    return _memtable_writer->write(block, row_idxs, memtable_flushed);
194
20
}
195
196
9
Status BaseDeltaWriter::wait_flush() {
197
9
    return _memtable_writer->wait_flush();
198
9
}
199
200
0
Status BaseDeltaWriter::flush_memtable_async() {
201
0
    return _memtable_writer->flush_async();
202
0
}
203
204
0
Status DeltaWriter::flush_memtable_async() {
205
0
    _lock_watch.start();
206
0
    std::lock_guard<std::mutex> l(_lock);
207
0
    _lock_watch.stop();
208
0
    return BaseDeltaWriter::flush_memtable_async();
209
0
}
210
211
15
Status DeltaWriter::close() {
212
15
    _lock_watch.start();
213
15
    std::lock_guard<std::mutex> l(_lock);
214
15
    _lock_watch.stop();
215
15
    if (!_is_init && !_is_cancelled) {
216
        // If close() is called before this delta writer has been initialized,
217
        // it means this tablet has no data loaded, but at least one tablet
218
        // in the same partition has data loaded.
219
        // So we still have to init this DeltaWriter, so that it can create an empty rowset
220
        // for this tablet when being closed.
221
3
        RETURN_IF_ERROR(init());
222
3
    }
223
15
    return _memtable_writer->close();
224
15
}
225
226
15
Status BaseDeltaWriter::build_rowset() {
227
15
    SCOPED_TIMER(_close_wait_timer);
228
15
    RETURN_IF_ERROR(_memtable_writer->close_wait(_profile));
229
15
    return _rowset_builder->build_rowset();
230
15
}
231
232
15
Status DeltaWriter::build_rowset() {
233
15
    std::lock_guard<std::mutex> l(_lock);
234
15
    DCHECK(_is_init)
235
0
            << "delta writer is supposed be to initialized before build_rowset() being called";
236
15
    return BaseDeltaWriter::build_rowset();
237
15
}
238
239
9
Status BaseDeltaWriter::submit_calc_delete_bitmap_task() {
240
9
    return _rowset_builder->submit_calc_delete_bitmap_task();
241
9
}
242
243
9
Status BaseDeltaWriter::wait_calc_delete_bitmap() {
244
9
    return _rowset_builder->wait_calc_delete_bitmap();
245
9
}
246
247
15
Status DeltaWriter::commit_txn(const PSlaveTabletNodes& slave_tablet_nodes) {
248
15
    std::lock_guard<std::mutex> l(_lock);
249
15
    SCOPED_TIMER(_commit_txn_timer);
250
15
    RETURN_IF_ERROR(_rowset_builder->commit_txn());
251
252
15
    for (auto&& node_info : slave_tablet_nodes.slave_nodes()) {
253
0
        _request_slave_tablet_pull_rowset(node_info);
254
0
    }
255
15
    return Status::OK();
256
15
}
257
258
bool DeltaWriter::check_slave_replicas_done(
259
0
        google::protobuf::Map<int64_t, PSuccessSlaveTabletNodeIds>* success_slave_tablet_node_ids) {
260
0
    std::lock_guard<std::shared_mutex> lock(_slave_node_lock);
261
0
    if (_unfinished_slave_node.empty()) {
262
0
        success_slave_tablet_node_ids->insert({_req.tablet_id, _success_slave_node_ids});
263
0
        return true;
264
0
    }
265
0
    return false;
266
0
}
267
268
void DeltaWriter::add_finished_slave_replicas(
269
0
        google::protobuf::Map<int64_t, PSuccessSlaveTabletNodeIds>* success_slave_tablet_node_ids) {
270
0
    std::lock_guard<std::shared_mutex> lock(_slave_node_lock);
271
0
    success_slave_tablet_node_ids->insert({_req.tablet_id, _success_slave_node_ids});
272
0
}
273
274
0
Status BaseDeltaWriter::cancel() {
275
0
    return cancel_with_status(Status::Cancelled("already cancelled"));
276
0
}
277
278
0
Status BaseDeltaWriter::cancel_with_status(const Status& st) {
279
0
    if (_is_cancelled) {
280
0
        return Status::OK();
281
0
    }
282
0
    RETURN_IF_ERROR(_memtable_writer->cancel_with_status(st));
283
0
    _is_cancelled = true;
284
0
    return Status::OK();
285
0
}
286
287
0
Status DeltaWriter::cancel_with_status(const Status& st) {
288
0
    std::lock_guard<std::mutex> l(_lock);
289
0
    return BaseDeltaWriter::cancel_with_status(st);
290
0
}
291
292
0
int64_t BaseDeltaWriter::mem_consumption(MemType mem) {
293
0
    return _memtable_writer->mem_consumption(mem);
294
0
}
295
296
0
void DeltaWriter::_request_slave_tablet_pull_rowset(const PNodeInfo& node_info) {
297
0
    std::shared_ptr<PBackendService_Stub> stub =
298
0
            ExecEnv::GetInstance()->brpc_internal_client_cache()->get_client(
299
0
                    node_info.host(), node_info.async_internal_port());
300
0
    if (stub == nullptr) {
301
0
        LOG(WARNING) << "failed to send pull rowset request to slave replica. get rpc stub failed, "
302
0
                        "slave host="
303
0
                     << node_info.host() << ", port=" << node_info.async_internal_port()
304
0
                     << ", tablet_id=" << _req.tablet_id << ", txn_id=" << _req.txn_id;
305
0
        return;
306
0
    }
307
308
0
    _engine.txn_manager()->add_txn_tablet_delta_writer(_req.txn_id, _req.tablet_id, this);
309
0
    {
310
0
        std::lock_guard<std::shared_mutex> lock(_slave_node_lock);
311
0
        _unfinished_slave_node.insert(node_info.id());
312
0
    }
313
314
0
    std::vector<std::pair<int64_t, std::string>> indices_ids;
315
0
    auto cur_rowset = _rowset_builder->rowset();
316
0
    auto tablet_schema = cur_rowset->rowset_meta()->tablet_schema();
317
0
    if (!tablet_schema->skip_write_index_on_load()) {
318
0
        for (const auto& column : tablet_schema->columns()) {
319
0
            auto index_metas = tablet_schema->inverted_indexs(*column);
320
0
            for (const auto* index_meta : index_metas) {
321
0
                indices_ids.emplace_back(index_meta->index_id(), index_meta->get_index_suffix());
322
0
            }
323
0
        }
324
0
    }
325
326
0
    auto request = std::make_shared<PTabletWriteSlaveRequest>();
327
0
    auto* request_mutable_rs_meta = request->mutable_rowset_meta();
328
0
    *request_mutable_rs_meta = cur_rowset->rowset_meta()->get_rowset_pb();
329
0
    if (request_mutable_rs_meta != nullptr && request_mutable_rs_meta->has_partition_id() &&
330
0
        request_mutable_rs_meta->partition_id() == 0) {
331
        // TODO(dx): remove this log once the "partition id equals 0" bug is fixed
332
0
        request_mutable_rs_meta->set_partition_id(_req.partition_id);
333
0
        LOG(WARNING) << "cant get partition id from local rs pb, get from _req, partition_id="
334
0
                     << _req.partition_id;
335
0
    }
336
0
    request->set_host(BackendOptions::get_localhost());
337
0
    request->set_http_port(config::webserver_port);
338
0
    const auto& tablet_path = cur_rowset->tablet_path();
339
0
    request->set_rowset_path(tablet_path);
340
0
    request->set_token(ExecEnv::GetInstance()->token());
341
0
    request->set_brpc_port(config::brpc_port);
342
0
    request->set_node_id(static_cast<int32_t>(node_info.id()));
343
0
    for (int segment_id = 0; segment_id < cur_rowset->rowset_meta()->num_segments(); segment_id++) {
344
0
        auto seg_path =
345
0
                local_segment_path(tablet_path, cur_rowset->rowset_id().to_string(), segment_id);
346
0
        int64_t segment_size = std::filesystem::file_size(seg_path);
347
0
        request->mutable_segments_size()->insert({segment_id, segment_size});
348
0
        auto index_path_prefix = InvertedIndexDescriptor::get_index_file_path_prefix(seg_path);
349
0
        if (!indices_ids.empty()) {
350
0
            if (tablet_schema->get_inverted_index_storage_format() ==
351
0
                InvertedIndexStorageFormatPB::V1) {
352
0
                for (auto index_meta : indices_ids) {
353
0
                    std::string inverted_index_file =
354
0
                            InvertedIndexDescriptor::get_index_file_path_v1(
355
0
                                    index_path_prefix, index_meta.first, index_meta.second);
356
0
                    int64_t size = std::filesystem::file_size(inverted_index_file);
357
0
                    PTabletWriteSlaveRequest::IndexSize index_size;
358
0
                    index_size.set_indexid(index_meta.first);
359
0
                    index_size.set_size(size);
360
0
                    index_size.set_suffix_path(index_meta.second);
361
                    // Fetch the map value for the current segment_id.
362
                    // If it doesn't exist, this will insert a new default-constructed IndexSizeMapValue
363
0
                    auto& index_size_map_value =
364
0
                            (*(request->mutable_inverted_indices_size()))[segment_id];
365
                    // Add the new index size to the map value.
366
0
                    *index_size_map_value.mutable_index_sizes()->Add() = std::move(index_size);
367
0
                }
368
0
            } else {
369
0
                std::string inverted_index_file =
370
0
                        InvertedIndexDescriptor::get_index_file_path_v2(index_path_prefix);
371
0
                int64_t size = std::filesystem::file_size(inverted_index_file);
372
0
                PTabletWriteSlaveRequest::IndexSize index_size;
373
                // special id for non-V1 format
374
0
                index_size.set_indexid(0);
375
0
                index_size.set_size(size);
376
0
                index_size.set_suffix_path("");
377
                // Fetch the map value for the current segment_id.
378
                // If it doesn't exist, this will insert a new default-constructed IndexSizeMapValue
379
0
                auto& index_size_map_value =
380
0
                        (*(request->mutable_inverted_indices_size()))[segment_id];
381
                // Add the new index size to the map value.
382
0
                *index_size_map_value.mutable_index_sizes()->Add() = std::move(index_size);
383
0
            }
384
0
        }
385
0
    }
386
387
0
    auto pull_callback = DummyBrpcCallback<PTabletWriteSlaveResult>::create_shared();
388
0
    auto closure = AutoReleaseClosure<
389
0
            PTabletWriteSlaveRequest,
390
0
            DummyBrpcCallback<PTabletWriteSlaveResult>>::create_unique(request, pull_callback);
391
0
    closure->cntl_->set_timeout_ms(config::slave_replica_writer_rpc_timeout_sec * 1000);
392
0
    closure->cntl_->ignore_eovercrowded();
393
0
    stub->request_slave_tablet_pull_rowset(closure->cntl_.get(), closure->request_.get(),
394
0
                                           closure->response_.get(), closure.get());
395
396
0
    closure.release();
397
0
    pull_callback->join();
398
0
    if (pull_callback->cntl_->Failed()) {
399
0
        if (!ExecEnv::GetInstance()->brpc_internal_client_cache()->available(
400
0
                    stub, node_info.host(), node_info.async_internal_port())) {
401
0
            ExecEnv::GetInstance()->brpc_internal_client_cache()->erase(
402
0
                    pull_callback->cntl_->remote_side());
403
0
        }
404
0
        LOG(WARNING) << "failed to send pull rowset request to slave replica, error="
405
0
                     << berror(pull_callback->cntl_->ErrorCode())
406
0
                     << ", error_text=" << pull_callback->cntl_->ErrorText()
407
0
                     << ". slave host: " << node_info.host() << ", tablet_id=" << _req.tablet_id
408
0
                     << ", txn_id=" << _req.txn_id;
409
0
        std::lock_guard<std::shared_mutex> lock(_slave_node_lock);
410
0
        _unfinished_slave_node.erase(node_info.id());
411
0
    }
412
0
}
413
414
0
void DeltaWriter::finish_slave_tablet_pull_rowset(int64_t node_id, bool is_succeed) {
415
0
    std::lock_guard<std::shared_mutex> lock(_slave_node_lock);
416
0
    if (is_succeed) {
417
0
        _success_slave_node_ids.add_slave_node_ids(node_id);
418
0
        VLOG_CRITICAL << "record successful slave replica for txn [" << _req.txn_id
419
0
                      << "], tablet_id=" << _req.tablet_id << ", node_id=" << node_id;
420
0
    }
421
0
    _unfinished_slave_node.erase(node_id);
422
0
}
423
424
0
int64_t BaseDeltaWriter::num_rows_filtered() const {
425
0
    auto rowset_writer = _rowset_builder->rowset_writer();
426
0
    return rowset_writer == nullptr ? 0 : rowset_writer->num_rows_filtered();
427
0
}
428
429
} // namespace doris