Coverage Report

Created: 2025-06-20 11:05

/root/doris/be/src/olap/delta_writer.cpp
Line
Count
Source (jump to first uncovered line)
1
// Licensed to the Apache Software Foundation (ASF) under one
2
// or more contributor license agreements.  See the NOTICE file
3
// distributed with this work for additional information
4
// regarding copyright ownership.  The ASF licenses this file
5
// to you under the Apache License, Version 2.0 (the
6
// "License"); you may not use this file except in compliance
7
// with the License.  You may obtain a copy of the License at
8
//
9
//   http://www.apache.org/licenses/LICENSE-2.0
10
//
11
// Unless required by applicable law or agreed to in writing,
12
// software distributed under the License is distributed on an
13
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14
// KIND, either express or implied.  See the License for the
15
// specific language governing permissions and limitations
16
// under the License.
17
18
#include "olap/delta_writer.h"
19
20
#include <brpc/controller.h>
21
#include <butil/errno.h>
22
#include <fmt/format.h>
23
#include <gen_cpp/internal_service.pb.h>
24
#include <gen_cpp/olap_file.pb.h>
25
26
#include <filesystem>
27
#include <ostream>
28
#include <string>
29
#include <utility>
30
31
// IWYU pragma: no_include <opentelemetry/common/threadlocal.h>
32
#include "common/compiler_util.h" // IWYU pragma: keep
33
#include "common/config.h"
34
#include "common/logging.h"
35
#include "common/status.h"
36
#include "exec/tablet_info.h"
37
#include "gutil/strings/numbers.h"
38
#include "io/fs/file_writer.h" // IWYU pragma: keep
39
#include "olap/memtable_flush_executor.h"
40
#include "olap/rowset/beta_rowset.h"
41
#include "olap/rowset/beta_rowset_writer.h"
42
#include "olap/rowset/rowset_meta.h"
43
#include "olap/rowset/segment_v2/inverted_index_desc.h"
44
#include "olap/rowset_builder.h"
45
#include "olap/schema_change.h"
46
#include "olap/storage_engine.h"
47
#include "olap/tablet_manager.h"
48
#include "olap/txn_manager.h"
49
#include "runtime/exec_env.h"
50
#include "service/backend_options.h"
51
#include "util/brpc_client_cache.h"
52
#include "util/mem_info.h"
53
#include "util/ref_count_closure.h"
54
#include "util/stopwatch.hpp"
55
#include "util/time.h"
56
#include "vec/core/block.h"
57
58
namespace doris {
59
using namespace ErrorCode;
60
61
// Sets up the state shared by all delta writers: `_req` is initialized from
// `*req`, a MemTableWriter is allocated for the same request, and the profile
// counters are registered under `profile`.
// `load_id` is part of the signature but is not read by this constructor.
BaseDeltaWriter::BaseDeltaWriter(WriteRequest* req, RuntimeProfile* profile,
                                 const UniqueId& load_id)
        : _req(*req), _memtable_writer(new MemTableWriter(*req)) {
    // A call made from a constructor always resolves to this class's own
    // version, so the qualification only spells out what already happens.
    BaseDeltaWriter::_init_profile(profile);
}
66
67
// Constructs a delta writer bound to `engine` and creates the RowsetBuilder
// used for the tablet described by `*req`.
DeltaWriter::DeltaWriter(StorageEngine& engine, WriteRequest* req, RuntimeProfile* profile,
                         const UniqueId& load_id)
        : BaseDeltaWriter(req, profile, load_id), _engine(engine) {
    // The base-class constructor can only run BaseDeltaWriter::_init_profile
    // (a constructor never dispatches to a derived version), so
    // DeltaWriter::_init_profile is not reached during construction and
    // `_commit_txn_timer` would otherwise stay unset even though commit_txn()
    // times itself with it. Register the counter here, on the child profile
    // the base constructor has already created.
    _commit_txn_timer = ADD_TIMER(_profile, "CommitTxnTime");
    _rowset_builder = std::make_unique<RowsetBuilder>(_engine, *req, profile);
}
72
73
11
// Creates this writer's child profile (named after the tablet id) under
// `profile` and registers the timers used by the base class.
void BaseDeltaWriter::_init_profile(RuntimeProfile* profile) {
    const auto child_name = fmt::format("DeltaWriter {}", _req.tablet_id);
    _profile = profile->create_child(child_name, true, true);

    // Registration order is kept as-is: CloseWaitTime, then WaitFlushLimitTime.
    _close_wait_timer = ADD_TIMER(_profile, "CloseWaitTime");
    _wait_flush_limit_timer = ADD_TIMER(_profile, "WaitFlushLimitTime");
}
78
79
0
// Registers the base-class counters first, then adds the commit timer that
// only DeltaWriter uses.
void DeltaWriter::_init_profile(RuntimeProfile* profile) {
    // Must run first: it is what assigns `_profile`, which the timer below
    // is attached to.
    BaseDeltaWriter::_init_profile(profile);

    _commit_txn_timer = ADD_TIMER(_profile, "CommitTxnTime");
}
83
84
11
// Tears down an initialized writer: cancels the memtable writer and, when a
// tablet is attached, adds the flush statistics to that tablet's counters.
// A writer that never completed init() has nothing to clean up here.
BaseDeltaWriter::~BaseDeltaWriter() {
    if (_is_init) {
        // cancel and wait all memtables in flush queue to be finished
        // (the returned status is intentionally discarded).
        static_cast<void>(_memtable_writer->cancel());

        if (auto&& tablet = _rowset_builder->tablet(); tablet != nullptr) {
            const FlushStatistic& flush_stats = _memtable_writer->get_flush_token_stats();
            tablet->flush_bytes->increment(flush_stats.flush_size_bytes);
            tablet->flush_finish_count->increment(flush_stats.flush_finish_count);
        }
    }
}
98
99
11
// Defaulted destructor: no DeltaWriter-specific teardown is written here, so
// cleanup is whatever the member destructors and ~BaseDeltaWriter perform.
DeltaWriter::~DeltaWriter() = default;
100
101
11
// One-time initialization of the rowset builder and the memtable writer.
//
// Returns OK immediately when already initialized. Otherwise initializes the
// rowset builder, then the memtable writer (handing it the builder's rowset
// writer, tablet schema and partial-update info plus the current thread's
// workload group, if any), registers the memtable writer with the memtable
// memory limiter and marks this writer as initialized. Any error from the two
// init() calls is returned and leaves `_is_init` false.
Status BaseDeltaWriter::init() {
    if (_is_init) {
        return Status::OK();
    }

    // Stays empty when there is no thread context, or when lock() yields
    // nothing.
    std::shared_ptr<WorkloadGroup> workload_group;
    if (auto* thread_ctx = doris::thread_context(true); thread_ctx != nullptr) {
        workload_group = thread_ctx->workload_group().lock();
    }

    RETURN_IF_ERROR(_rowset_builder->init());

    const auto merge_on_write = _rowset_builder->tablet()->enable_unique_key_merge_on_write();
    RETURN_IF_ERROR(_memtable_writer->init(
            _rowset_builder->rowset_writer(), _rowset_builder->tablet_schema(),
            _rowset_builder->get_partial_update_info(), workload_group, merge_on_write));

    ExecEnv::GetInstance()->memtable_memory_limiter()->register_writer(_memtable_writer);
    _is_init = true;
    return Status::OK();
}
119
120
// Writes the rows of `block` selected by `row_idxs` through the memtable
// writer.
//
// An empty `row_idxs` is a no-op that returns OK without taking the lock.
// Otherwise the writer lock is taken (time spent acquiring it is measured by
// `_lock_watch`), the writer is lazily initialized unless it is already
// initialized or cancelled, and the call polls in 10 ms steps while the number
// of running flushes is at or above
// config::memtable_flush_running_count_limit. The polling time is accounted to
// `_wait_flush_limit_timer`.
Status BaseDeltaWriter::write(const vectorized::Block* block,
                              const std::vector<uint32_t>& row_idxs) {
    if (UNLIKELY(row_idxs.empty())) {
        return Status::OK();
    }

    _lock_watch.start();
    std::lock_guard<std::mutex> guard(_lock);
    _lock_watch.stop();

    const bool needs_init = !(_is_init || _is_cancelled);
    if (needs_init) {
        RETURN_IF_ERROR(init());
    }

    {
        // Scope the timer to the back-pressure wait only.
        SCOPED_TIMER(_wait_flush_limit_timer);
        const auto poll_interval = std::chrono::milliseconds(10);
        while (config::memtable_flush_running_count_limit <=
               _memtable_writer->flush_running_count()) {
            std::this_thread::sleep_for(poll_interval);
        }
    }

    return _memtable_writer->write(block, row_idxs);
}
140
5
// Forwards to MemTableWriter::wait_flush() and returns its status unchanged.
// Unlike write()/close(), this does not take `_lock`.
Status BaseDeltaWriter::wait_flush() {
    Status flush_status = _memtable_writer->wait_flush();
    return flush_status;
}
143
144
11
// Closes the memtable writer under the writer lock, lazily initializing this
// writer first when it is neither initialized nor cancelled.
Status BaseDeltaWriter::close() {
    _lock_watch.start();
    std::lock_guard<std::mutex> guard(_lock);
    _lock_watch.stop();

    const bool needs_init = !(_is_init || _is_cancelled);
    if (needs_init) {
        // close() on a writer that was never initialized means this tablet
        // received no data while at least one other tablet in the same
        // partition did. The writer still has to be initialized so that it
        // can create an empty rowset for this tablet when being closed.
        RETURN_IF_ERROR(init());
    }

    return _memtable_writer->close();
}
158
159
11
// Waits for the memtable writer to finish closing, then asks the rowset
// builder to build the rowset.
//
// Runs under the writer lock. The whole body after the DCHECK is accounted to
// `_close_wait_timer`. Returns the first error from close_wait(), otherwise
// the status of RowsetBuilder::build_rowset().
// Precondition (debug-checked): the writer has been initialized.
Status BaseDeltaWriter::build_rowset() {
    std::lock_guard<std::mutex> l(_lock);
    DCHECK(_is_init)
            << "delta writer is supposed to be initialized before build_rowset() being called";

    SCOPED_TIMER(_close_wait_timer);
    RETURN_IF_ERROR(_memtable_writer->close_wait(_profile));
    return _rowset_builder->build_rowset();
}
168
169
5
// Forwards to RowsetBuilder::submit_calc_delete_bitmap_task() and returns its
// status unchanged. `_lock` is not taken here.
Status BaseDeltaWriter::submit_calc_delete_bitmap_task() {
    Status submit_status = _rowset_builder->submit_calc_delete_bitmap_task();
    return submit_status;
}
172
173
5
// Forwards to RowsetBuilder::wait_calc_delete_bitmap() and returns its status
// unchanged. `_lock` is not taken here.
Status BaseDeltaWriter::wait_calc_delete_bitmap() {
    Status wait_status = _rowset_builder->wait_calc_delete_bitmap();
    return wait_status;
}
176
177
11
// Commits the transaction through the rowset builder and then sends a
// pull-rowset request to every slave node listed in `slave_tablet_nodes`.
//
// Runs under the writer lock and is timed by `_commit_txn_timer`. A failed
// commit is returned immediately and no slave is contacted. Failures of the
// individual slave requests are not reflected in the returned status.
Status DeltaWriter::commit_txn(const PSlaveTabletNodes& slave_tablet_nodes) {
    std::lock_guard<std::mutex> guard(_lock);
    SCOPED_TIMER(_commit_txn_timer);

    RETURN_IF_ERROR(_rowset_builder->commit_txn());

    const auto& slave_nodes = slave_tablet_nodes.slave_nodes();
    for (const auto& slave_node : slave_nodes) {
        _request_slave_tablet_pull_rowset(slave_node);
    }
    return Status::OK();
}
187
188
bool DeltaWriter::check_slave_replicas_done(
189
0
        google::protobuf::Map<int64_t, PSuccessSlaveTabletNodeIds>* success_slave_tablet_node_ids) {
190
0
    std::lock_guard<std::shared_mutex> lock(_slave_node_lock);
191
0
    if (_unfinished_slave_node.empty()) {
192
0
        success_slave_tablet_node_ids->insert({_req.tablet_id, _success_slave_node_ids});
193
0
        return true;
194
0
    }
195
0
    return false;
196
0
}
197
198
void DeltaWriter::add_finished_slave_replicas(
199
0
        google::protobuf::Map<int64_t, PSuccessSlaveTabletNodeIds>* success_slave_tablet_node_ids) {
200
0
    std::lock_guard<std::shared_mutex> lock(_slave_node_lock);
201
0
    success_slave_tablet_node_ids->insert({_req.tablet_id, _success_slave_node_ids});
202
0
}
203
204
0
// Cancels this writer with a generic Cancelled status; see
// cancel_with_status() for the actual work.
Status BaseDeltaWriter::cancel() {
    const Status reason = Status::Cancelled("already cancelled");
    return cancel_with_status(reason);
}
207
208
0
// Cancels the memtable writer with `st` and marks this writer as cancelled.
//
// Runs under the writer lock. Calling it on an already-cancelled writer is a
// no-op that returns OK. If the memtable writer fails to cancel, that error is
// returned and `_is_cancelled` stays false.
Status BaseDeltaWriter::cancel_with_status(const Status& st) {
    std::lock_guard<std::mutex> guard(_lock);
    if (!_is_cancelled) {
        RETURN_IF_ERROR(_memtable_writer->cancel_with_status(st));
        _is_cancelled = true;
    }
    return Status::OK();
}
217
218
0
// Returns the memtable writer's memory consumption for the requested `mem`
// category. `_lock` is not taken here.
int64_t BaseDeltaWriter::mem_consumption(MemType mem) {
    const int64_t consumed = _memtable_writer->mem_consumption(mem);
    return consumed;
}
221
222
0
// Asks one slave replica (`node_info`) to pull the rowset this writer has
// just built.
//
// Steps:
//   1. Get a brpc stub for the slave; log and give up if none is available.
//   2. Register this writer with the txn manager and record the slave as
//      "unfinished".
//   3. Build a PTabletWriteSlaveRequest carrying the rowset meta, this
//      backend's address/ports/token, and the on-disk size of every segment
//      file and inverted-index file.
//   4. Send the request and wait for the RPC to complete. If the RPC itself
//      failed, log it and remove the slave from the "unfinished" set again.
//
// NOTE(review): std::filesystem::file_size(path) throws
// std::filesystem::filesystem_error on failure and nothing in this function
// catches it; by then the slave is already recorded as unfinished. Confirm
// that the segment/index files are guaranteed to exist at this point.
void DeltaWriter::_request_slave_tablet_pull_rowset(PNodeInfo node_info) {
    std::shared_ptr<PBackendService_Stub> stub =
            ExecEnv::GetInstance()->brpc_internal_client_cache()->get_client(
                    node_info.host(), node_info.async_internal_port());
    if (stub == nullptr) {
        LOG(WARNING) << "failed to send pull rowset request to slave replica. get rpc stub failed, "
                        "slave host="
                     << node_info.host() << ", port=" << node_info.async_internal_port()
                     << ", tablet_id=" << _req.tablet_id << ", txn_id=" << _req.txn_id;
        return;
    }

    _engine.txn_manager()->add_txn_tablet_delta_writer(_req.txn_id, _req.tablet_id, this);
    {
        std::lock_guard<std::shared_mutex> lock(_slave_node_lock);
        _unfinished_slave_node.insert(node_info.id());
    }

    // (index id, index suffix) of every column that has an inverted index;
    // left empty when the schema skips writing indexes on load.
    std::vector<std::pair<int64_t, std::string>> indices_ids;
    auto cur_rowset = _rowset_builder->rowset();
    auto tablet_schema = cur_rowset->rowset_meta()->tablet_schema();
    if (!tablet_schema->skip_write_index_on_load()) {
        for (auto& column : tablet_schema->columns()) {
            const TabletIndex* index_meta = tablet_schema->get_inverted_index(*column);
            if (index_meta) {
                indices_ids.emplace_back(index_meta->index_id(), index_meta->get_index_suffix());
            }
        }
    }

    auto request = std::make_shared<PTabletWriteSlaveRequest>();
    auto* request_mutable_rs_meta = request->mutable_rowset_meta();
    *request_mutable_rs_meta = cur_rowset->rowset_meta()->get_rowset_pb();
    // No null check: the pointer has just been dereferenced above.
    if (request_mutable_rs_meta->has_partition_id() &&
        request_mutable_rs_meta->partition_id() == 0) {
        // TODO(dx): remove log after fix partition id eq 0 bug
        request_mutable_rs_meta->set_partition_id(_req.partition_id);
        LOG(WARNING) << "cant get partition id from local rs pb, get from _req, partition_id="
                     << _req.partition_id;
    }
    request->set_host(BackendOptions::get_localhost());
    request->set_http_port(config::webserver_port);
    const std::string tablet_path = _rowset_builder->tablet()->tablet_path();
    request->set_rowset_path(tablet_path);
    request->set_token(ExecEnv::GetInstance()->token());
    request->set_brpc_port(config::brpc_port);
    request->set_node_id(node_info.id());

    // Appends one IndexSize entry to the request's per-segment list.
    // operator[] on the map inserts a default-constructed value the first
    // time a segment id is seen.
    auto add_index_size = [&request](int segment_id, int64_t index_id,
                                     const std::string& suffix_path, int64_t size) {
        PTabletWriteSlaveRequest::IndexSize index_size;
        index_size.set_indexid(index_id);
        index_size.set_size(size);
        index_size.set_suffix_path(suffix_path);
        auto& index_size_map_value = (*(request->mutable_inverted_indices_size()))[segment_id];
        *index_size_map_value.mutable_index_sizes()->Add() = std::move(index_size);
    };

    const auto num_segments = cur_rowset->rowset_meta()->num_segments();
    for (int segment_id = 0; segment_id < num_segments; segment_id++) {
        std::stringstream segment_name;
        segment_name << cur_rowset->rowset_id() << "_" << segment_id << ".dat";
        // Built once per segment and reused for the segment and index lookups.
        const std::string segment_path = tablet_path + "/" + segment_name.str();

        int64_t segment_size = std::filesystem::file_size(segment_path);
        request->mutable_segments_size()->insert({segment_id, segment_size});

        if (indices_ids.empty()) {
            continue;
        }

        if (tablet_schema->get_inverted_index_storage_format() !=
            InvertedIndexStorageFormatPB::V1) {
            // Non-V1 format: a single index file per segment, reported with
            // the special index id 0 and an empty suffix.
            std::string inverted_index_file =
                    InvertedIndexDescriptor::get_index_file_name(segment_path);
            int64_t size = std::filesystem::file_size(inverted_index_file);
            add_index_size(segment_id, 0, "", size);
        } else {
            // V1 format: one index file per (index id, suffix) pair.
            for (const auto& index_id : indices_ids) {
                std::string inverted_index_file = InvertedIndexDescriptor::get_index_file_name(
                        segment_path, index_id.first, index_id.second);
                int64_t size = std::filesystem::file_size(inverted_index_file);
                add_index_size(segment_id, index_id.first, index_id.second, size);
            }
        }
    }

    auto pull_callback = DummyBrpcCallback<PTabletWriteSlaveResult>::create_shared();
    auto closure = AutoReleaseClosure<
            PTabletWriteSlaveRequest,
            DummyBrpcCallback<PTabletWriteSlaveResult>>::create_unique(request, pull_callback);
    closure->cntl_->set_timeout_ms(config::slave_replica_writer_rpc_timeout_sec * 1000);
    closure->cntl_->ignore_eovercrowded();
    stub->request_slave_tablet_pull_rowset(closure->cntl_.get(), closure->request_.get(),
                                           closure->response_.get(), closure.get());

    // The closure was handed to the RPC call above as its done-callback, so
    // the unique_ptr gives up ownership here instead of deleting it.
    closure.release();
    pull_callback->join();
    if (pull_callback->cntl_->Failed()) {
        if (!ExecEnv::GetInstance()->brpc_internal_client_cache()->available(
                    stub, node_info.host(), node_info.async_internal_port())) {
            ExecEnv::GetInstance()->brpc_internal_client_cache()->erase(
                    pull_callback->cntl_->remote_side());
        }
        LOG(WARNING) << "failed to send pull rowset request to slave replica, error="
                     << berror(pull_callback->cntl_->ErrorCode())
                     << ", error_text=" << pull_callback->cntl_->ErrorText()
                     << ". slave host: " << node_info.host() << ", tablet_id=" << _req.tablet_id
                     << ", txn_id=" << _req.txn_id;
        // The request never reached the slave, so stop waiting for it.
        std::lock_guard<std::shared_mutex> lock(_slave_node_lock);
        _unfinished_slave_node.erase(node_info.id());
    }
}
339
340
0
void DeltaWriter::finish_slave_tablet_pull_rowset(int64_t node_id, bool is_succeed) {
341
0
    std::lock_guard<std::shared_mutex> lock(_slave_node_lock);
342
0
    if (is_succeed) {
343
0
        _success_slave_node_ids.add_slave_node_ids(node_id);
344
0
        VLOG_CRITICAL << "record successful slave replica for txn [" << _req.txn_id
345
0
                      << "], tablet_id=" << _req.tablet_id << ", node_id=" << node_id;
346
0
    }
347
0
    _unfinished_slave_node.erase(node_id);
348
0
}
349
350
0
int64_t BaseDeltaWriter::num_rows_filtered() const {
351
0
    auto rowset_writer = _rowset_builder->rowset_writer();
352
0
    return rowset_writer == nullptr ? 0 : rowset_writer->num_rows_filtered();
353
0
}
354
355
} // namespace doris