Coverage Report

Created: 2026-06-02 18:18

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
be/src/storage/task/engine_clone_task.cpp
Line
Count
Source
1
// Licensed to the Apache Software Foundation (ASF) under one
2
// or more contributor license agreements.  See the NOTICE file
3
// distributed with this work for additional information
4
// regarding copyright ownership.  The ASF licenses this file
5
// to you under the Apache License, Version 2.0 (the
6
// "License"); you may not use this file except in compliance
7
// with the License.  You may obtain a copy of the License at
8
//
9
//   http://www.apache.org/licenses/LICENSE-2.0
10
//
11
// Unless required by applicable law or agreed to in writing,
12
// software distributed under the License is distributed on an
13
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14
// KIND, either express or implied.  See the License for the
15
// specific language governing permissions and limitations
16
// under the License.
17
18
#include "storage/task/engine_clone_task.h"
19
20
#include <absl/strings/str_split.h>
21
#include <curl/curl.h>
22
#include <fcntl.h>
23
#include <fmt/format.h>
24
#include <gen_cpp/AgentService_types.h>
25
#include <gen_cpp/BackendService.h>
26
#include <gen_cpp/HeartbeatService_types.h>
27
#include <gen_cpp/MasterService_types.h>
28
#include <gen_cpp/Status_types.h>
29
#include <gen_cpp/Types_constants.h>
30
#include <sys/stat.h>
31
32
#include <filesystem>
33
#include <memory>
34
#include <mutex>
35
#include <ostream>
36
#include <shared_mutex>
37
#include <system_error>
38
#include <unordered_map>
39
#include <unordered_set>
40
#include <utility>
41
#include <vector>
42
43
#include "common/config.h"
44
#include "common/logging.h"
45
#include "io/fs/file_system.h"
46
#include "io/fs/local_file_system.h"
47
#include "io/fs/path.h"
48
#include "runtime/memory/mem_tracker_limiter.h"
49
#include "runtime/thread_context.h"
50
#include "service/http/http_client.h"
51
#include "service/http/utils.h"
52
#include "storage/binlog.h"
53
#include "storage/data_dir.h"
54
#include "storage/olap_common.h"
55
#include "storage/olap_define.h"
56
#include "storage/pb_helper.h"
57
#include "storage/rowset/rowset.h"
58
#include "storage/snapshot/snapshot_manager.h"
59
#include "storage/storage_engine.h"
60
#include "storage/tablet/tablet.h"
61
#include "storage/tablet/tablet_manager.h"
62
#include "storage/tablet/tablet_meta.h"
63
#include "util/client_cache.h"
64
#include "util/debug_points.h"
65
#include "util/defer_op.h"
66
#include "util/network_util.h"
67
#include "util/security.h"
68
#include "util/stopwatch.hpp"
69
#include "util/thrift_rpc_helper.h"
70
#include "util/trace.h"
71
72
using std::stringstream;
73
74
namespace doris {
75
using namespace ErrorCode;
76
77
namespace {
78
/// if binlog file exist, then check if binlog file md5sum equal
79
/// if equal, then skip link file
80
/// if not equal, then return error
81
/// return value: if binlog file not exist, then return to binlog file path
82
Result<std::string> check_dest_binlog_valid(const std::string& tablet_dir,
83
                                            const std::string& clone_dir,
84
0
                                            const std::string& clone_file, bool* skip_link_file) {
85
0
    std::string from, to;
86
0
    std::string new_clone_file = clone_file;
87
0
    if (clone_file.ends_with(".binlog")) {
88
        // change clone_file suffix from .binlog to .dat
89
0
        new_clone_file.replace(clone_file.size() - 7, 7, ".dat");
90
0
    } else if (clone_file.ends_with(".binlog-index")) {
91
        // change clone_file suffix from .binlog-index to .idx
92
0
        new_clone_file.replace(clone_file.size() - 13, 13, ".idx");
93
0
    }
94
0
    from = fmt::format("{}/{}", clone_dir, clone_file);
95
0
    to = fmt::format("{}/_binlog/{}", tablet_dir, new_clone_file);
96
97
    // check to to file exist
98
0
    bool exists = true;
99
0
    auto status = io::global_local_filesystem()->exists(to, &exists);
100
0
    if (!status.ok()) {
101
0
        return ResultError(std::move(status));
102
0
    }
103
104
0
    if (!exists) {
105
0
        return to;
106
0
    }
107
108
0
    LOG(WARNING) << "binlog file already exist. "
109
0
                 << "tablet_dir=" << tablet_dir << ", clone_file=" << from << ", to=" << to;
110
111
0
    std::string clone_file_md5sum;
112
0
    status = io::global_local_filesystem()->md5sum(from, &clone_file_md5sum);
113
0
    if (!status.ok()) {
114
0
        return ResultError(std::move(status));
115
0
    }
116
0
    std::string to_file_md5sum;
117
0
    status = io::global_local_filesystem()->md5sum(to, &to_file_md5sum);
118
0
    if (!status.ok()) {
119
0
        return ResultError(std::move(status));
120
0
    }
121
122
0
    if (clone_file_md5sum == to_file_md5sum) {
123
        // if md5sum equal, then skip link file
124
0
        *skip_link_file = true;
125
0
        return to;
126
0
    } else {
127
0
        auto err_msg = fmt::format(
128
0
                "binlog file already exist, but md5sum not equal. "
129
0
                "tablet_dir={}, clone_file={}",
130
0
                tablet_dir, clone_file);
131
0
        LOG(WARNING) << err_msg;
132
0
        return ResultError(Status::InternalError(std::move(err_msg)));
133
0
    }
134
0
}
135
} // namespace
136
137
// Evaluates `stmt`, assigns the result to the caller-supplied variable `status`, and
// returns `status` from the enclosing function when `status.ok()` is false. Unlike a plain
// early return, the failure stays visible through `status` to code that inspects it later
// (for example a Defer declared in the same scope).
#define RETURN_IF_ERROR_(status, stmt)                 \
    do {                                               \
        if (status = (stmt); UNLIKELY(!status.ok())) { \
            return status;                             \
        }                                              \
    } while (false)
144
145
/// Stores the clone request and its collaborators, and creates a memory tracker of type
/// OTHER labelled with the tablet id of the request.
///
/// @param engine       storage engine the task operates on
/// @param clone_req    clone request to execute
/// @param cluster_info cluster information (its token is used when downloading files)
/// @param signature    task signature, used in log messages
/// @param tablet_infos output list that receives the info of the cloned tablet
EngineCloneTask::EngineCloneTask(StorageEngine& engine, const TCloneReq& clone_req,
                                 const ClusterInfo* cluster_info, int64_t signature,
                                 std::vector<TTabletInfo>* tablet_infos)
        : _engine(engine),
          _clone_req(clone_req),
          _tablet_infos(tablet_infos),
          _signature(signature),
          _cluster_info(cluster_info) {
    const std::string tracker_label =
            fmt::format("EngineCloneTask#tabletId={}", _clone_req.tablet_id);
    _mem_tracker =
            MemTrackerLimiter::create_shared(MemTrackerLimiter::Type::OTHER, tracker_label);
}
157
158
0
/// Runs the clone, then hands the request's (partition id, version) pair to the tablet
/// manager as the partition's visible version. The visible-version update is issued
/// whether or not the clone succeeded.
///
/// @return the status produced by `_do_clone()`.
Status EngineCloneTask::execute() {
    Status clone_status = _do_clone();
    _engine.tablet_manager()->update_partitions_visible_version(
            {{_clone_req.partition_id, _clone_req.version}});
    return clone_status;
}
165
166
0
/// Clones the tablet named by `_clone_req` from one of the request's source backends.
///
/// The tablet is registered as a "clone" transition tablet for the duration of the call.
/// Two cases are handled:
/// - no usable local tablet (none found, or a TABLET_NOTREADY one that gets dropped first):
///   a complete copy is downloaded into a newly obtained shard path and loaded from there;
/// - a local tablet exists: only the versions it is missing are requested, downloaded into
///   its clone directory and applied through `_finish_clone`.
/// Both cases finish by reporting the tablet through `_set_tablet_info()`.
Status EngineCloneTask::_do_clone() {
    DBUG_EXECUTE_IF("EngineCloneTask.wait_clone", {
        std::this_thread::sleep_for(
                std::chrono::milliseconds(dp->param("duration", 10 * 1000)));
    });

    DBUG_EXECUTE_IF("EngineCloneTask.failed_clone", {
        LOG_WARNING("EngineCloneTask.failed_clone")
                .tag("tablet_id", _clone_req.tablet_id)
                .tag("replica_id", _clone_req.replica_id)
                .tag("version", _clone_req.version);
        return Status::InternalError(
                "in debug point, EngineCloneTask.failed_clone tablet={}, replica={}, version={}",
                _clone_req.tablet_id, _clone_req.replica_id, _clone_req.version);
    });

    // Reject an invalid copy type before touching any tablet state.
    int32_t copy_type =
            _clone_req.__isset.copy_type ? _clone_req.copy_type : TabletCopyType::DEFAULT;
    RETURN_IF_ERROR(TabletCopyType::validate(copy_type));
    bool copy_row_binlog = TabletCopyType::has(copy_type, TTabletCopyType::ROW_BINLOG);

    RETURN_IF_ERROR(
            _engine.tablet_manager()->register_transition_tablet(_clone_req.tablet_id, "clone"));
    Defer unregister_transition {[&]() {
        _engine.tablet_manager()->unregister_transition_tablet(_clone_req.tablet_id, "clone");
    }};

    std::string src_file_path;
    TBackend src_host;
    Versions missed_versions; // stays empty when a whole new tablet is cloned
    bool allow_incremental_clone = false;

    // Check local tablet exist or not
    TabletSharedPtr tablet = _engine.tablet_manager()->get_tablet(_clone_req.tablet_id);

    // A tablet that is not ready is a leftover of a failed schema change. Drop it and clone a
    // new tablet from the remote BE instead. This basically only happens when the
    // be_rebalancer_fuzzy_test configuration is enabled.
    if (tablet && tablet->tablet_state() == TABLET_NOTREADY) {
        LOG(WARNING) << "tablet state is not ready when clone, need to drop old tablet, tablet_id="
                     << tablet->tablet_id();
        RETURN_IF_ERROR(_engine.tablet_manager()->drop_tablet(tablet->tablet_id(),
                                                              tablet->replica_id(), false));
        tablet.reset();
    }
    _is_new_tablet = (tablet == nullptr);

    if (tablet == nullptr) {
        LOG(INFO) << "clone tablet not exist, begin clone a new tablet from remote be. "
                  << "signature=" << _signature << ", tablet_id=" << _clone_req.tablet_id
                  << ", visible_version=" << _clone_req.version
                  << ", req replica=" << _clone_req.replica_id;

        // Pick the local disk / shard that will host the new tablet.
        std::string shard_root_path;
        DataDir* store = nullptr;
        RETURN_IF_ERROR(_engine.obtain_shard_path(_clone_req.storage_medium,
                                                  _clone_req.dest_path_hash, &shard_root_path,
                                                  &store, _clone_req.partition_id));
        auto tablet_dir = fmt::format("{}/{}/{}", shard_root_path, _clone_req.tablet_id,
                                      _clone_req.schema_hash);

        // `clone_status` is what the cleanup below looks at: only failures recorded in it
        // (via RETURN_IF_ERROR_ or an explicit assignment) remove the tablet directory.
        Status clone_status = Status::OK();
        Defer remove_useless_dir {[&] {
            if (!clone_status.ok()) {
                LOG(INFO) << "clone failed. want to delete local dir: " << tablet_dir
                          << ". signature: " << _signature;
                WARN_IF_ERROR(io::global_local_filesystem()->delete_directory(tablet_dir),
                              "failed to delete useless clone dir ");
                WARN_IF_ERROR(DataDir::delete_tablet_parent_path_if_empty(tablet_dir),
                              "failed to delete parent dir");
            }
        }};

        // Start from a clean destination directory.
        bool dir_exists = true;
        Status exists_st = io::global_local_filesystem()->exists(tablet_dir, &dir_exists);
        if (!exists_st) {
            LOG(WARNING) << "cant get path=" << tablet_dir << " state, st=" << exists_st;
            return exists_st;
        }
        if (dir_exists) {
            LOG(WARNING) << "before clone dest path=" << tablet_dir << " exist, remove it first";
            RETURN_IF_ERROR(io::global_local_filesystem()->delete_directory(tablet_dir));
        }

        RETURN_IF_ERROR_(clone_status,
                         _make_and_download_snapshots(*store, tablet_dir, &src_host, &src_file_path,
                                                      missed_versions, &allow_incremental_clone));

        LOG(INFO) << "clone copy done. src_host: " << src_host.host
                  << " src_file_path: " << src_file_path;

        auto tablet_manager = _engine.tablet_manager();
        RETURN_IF_ERROR_(clone_status,
                         tablet_manager->load_tablet_from_dir(store, _clone_req.tablet_id,
                                                              _clone_req.schema_hash, tablet_dir,
                                                              false));
        auto loaded_tablet = tablet_manager->get_tablet(_clone_req.tablet_id);
        if (!loaded_tablet) {
            clone_status = Status::NotFound("tablet not found, tablet_id={}", _clone_req.tablet_id);
            return clone_status;
        }
        // MUST reset `replica_id` to request `replica_id` to keep consistent with FE
        loaded_tablet->tablet_meta()->set_replica_id(_clone_req.replica_id);
        // clone success, delete .hdr file because tablet meta is stored in rocksdb
        std::string header_path =
                TabletMeta::construct_header_file_path(tablet_dir, _clone_req.tablet_id);
        RETURN_IF_ERROR(io::global_local_filesystem()->delete_file(header_path));
    } else {
        // Repair an existing tablet that is missing versions.
        std::shared_lock migration_rlock(tablet->get_migration_lock(), std::try_to_lock);
        if (!migration_rlock.owns_lock()) {
            return Status::Error<TRY_LOCK_FAILED>(
                    "EngineCloneTask::_do_clone meet try lock failed");
        }
        if (tablet->replica_id() < _clone_req.replica_id) {
            // `tablet` may be a replica that FE has already dropped, e.g. BE1 migrates a
            // replica of tablet_1 to BE2, and before BE1 drops it another new replica of
            // tablet_1 is migrated to BE1. Clone can still continue, but to keep `replica_id`
            // consistent with FE it MUST be reset to the request's `replica_id`.
            tablet->tablet_meta()->set_replica_id(_clone_req.replica_id);
        }

        // Directory the snapshot files are downloaded into.
        auto clone_dir = fmt::format("{}/{}", tablet->tablet_path(), CLONE_PREFIX);

        // For merge-on-write tablets, do not clone past the version right before the
        // smallest pending publish version.
        int64_t target_version = _clone_req.version;
        if (tablet->enable_unique_key_merge_on_write()) {
            int64_t min_pending_ver = _engine.get_pending_publish_min_version(tablet->tablet_id());
            if (min_pending_ver - 1 < target_version) {
                LOG(INFO) << "use min pending publish version for clone, min_pending_ver: "
                          << min_pending_ver << " visible_version: " << _clone_req.version;
                target_version = min_pending_ver - 1;
            }
        }

        missed_versions = tablet->get_missed_versions(target_version);

        // With no missed version the local data is already complete, so cloning from a remote
        // BE is useless; the remote BE would only return the header without rowset files and
        // the clone would fail.
        if (missed_versions.empty()) {
            LOG(INFO) << "missed version size = 0, skip clone and return success. tablet_id="
                      << _clone_req.tablet_id << " replica_id=" << _clone_req.replica_id;
            RETURN_IF_ERROR(_set_tablet_info());
            return Status::OK();
        }

        LOG(INFO) << "clone to existed tablet. missed_versions_size=" << missed_versions.size()
                  << ", allow_incremental_clone=" << allow_incremental_clone
                  << ", signature=" << _signature << ", tablet_id=" << _clone_req.tablet_id
                  << ", visible_version=" << _clone_req.version
                  << ", replica_id=" << _clone_req.replica_id;

        // Try to download the missing versions from a source backend. If the tablet on the
        // source backend does not contain them, all versions are downloaded and
        // allow_incremental_clone is set to false.
        RETURN_IF_ERROR(_make_and_download_snapshots(*(tablet->data_dir()), clone_dir, &src_host,
                                                     &src_file_path, missed_versions,
                                                     &allow_incremental_clone));
        RETURN_IF_ERROR(_finish_clone(tablet.get(), clone_dir, target_version,
                                      allow_incremental_clone, copy_row_binlog));
    }

    return _set_tablet_info();
}
327
328
0
/// Fetches the info of the cloned tablet and appends it to `_tablet_infos`.
///
/// When the request carries a version and the reported tablet version is lower than it, the
/// clone is treated as failed: a newly cloned tablet is dropped (an incrementally cloned one
/// is kept) and an InternalError is returned.
///
/// @return OK when the tablet info was recorded; otherwise the error from
///         `report_tablet_info` or the version-mismatch InternalError.
Status EngineCloneTask::_set_tablet_info() {
    TTabletInfo tablet_info;
    tablet_info.__set_tablet_id(_clone_req.tablet_id);
    tablet_info.__set_replica_id(_clone_req.replica_id);
    tablet_info.__set_schema_hash(_clone_req.schema_hash);
    RETURN_IF_ERROR(_engine.tablet_manager()->report_tablet_info(&tablet_info));

    const bool version_too_old =
            _clone_req.__isset.version && tablet_info.version < _clone_req.version;
    if (!version_too_old) {
        LOG(INFO) << "clone get tablet info success. tablet_id:" << _clone_req.tablet_id
                  << ", schema_hash:" << _clone_req.schema_hash << ", signature:" << _signature
                  << ", replica id:" << _clone_req.replica_id
                  << ", version:" << tablet_info.version;
        _tablet_infos->push_back(tablet_info);
        return Status::OK();
    }

    // Only a freshly cloned tablet may be removed here; after an incremental clone the
    // tablet must not be dropped.
    if (_is_new_tablet) {
        // The cloned tablet does not have the expected version, so it may be a stale
        // leftover that is waiting to be dropped. Drop it.
        LOG(WARNING) << "begin to drop the stale tablet. tablet_id:" << _clone_req.tablet_id
                     << ", replica_id:" << _clone_req.replica_id
                     << ", schema_hash:" << _clone_req.schema_hash
                     << ", signature:" << _signature << ", version:" << tablet_info.version
                     << ", expected_version: " << _clone_req.version;
        WARN_IF_ERROR(_engine.tablet_manager()->drop_tablet(_clone_req.tablet_id,
                                                            _clone_req.replica_id, false),
                      "drop stale cloned table failed");
    }
    return Status::InternalError("unexpected version. tablet version: {}, expected version: {}",
                                 tablet_info.version, _clone_req.version);
}
360
361
/// This method will do following things:
362
/// 1. Make snapshots on source BE.
363
/// 2. Download all snapshots to CLONE dir.
364
/// 3. Convert rowset ids of downloaded snapshots(would also change the replica id).
365
/// 4. Release the snapshots on source BE.
366
Status EngineCloneTask::_make_and_download_snapshots(DataDir& data_dir,
367
                                                     const std::string& local_data_path,
368
                                                     TBackend* src_host, std::string* snapshot_path,
369
                                                     const std::vector<Version>& missed_versions,
370
0
                                                     bool* allow_incremental_clone) {
371
0
    Status status;
372
373
0
    const auto& token = _cluster_info->token;
374
375
0
    int timeout_s = 0;
376
0
    if (_clone_req.__isset.timeout_s) {
377
0
        timeout_s = _clone_req.timeout_s;
378
0
    }
379
380
0
    for (auto&& src : _clone_req.src_backends) {
381
        // Make snapshot in remote olap engine
382
0
        *src_host = src;
383
        // make snapshot
384
0
        status = _make_snapshot(src.host, src.be_port, _clone_req.tablet_id, _clone_req.schema_hash,
385
0
                                timeout_s, missed_versions, snapshot_path, allow_incremental_clone);
386
0
        if (!status.ok()) [[unlikely]] {
387
0
            LOG_WARNING("failed to make snapshot in remote BE")
388
0
                    .tag("host", src.host)
389
0
                    .tag("port", src.be_port)
390
0
                    .tag("tablet", _clone_req.tablet_id)
391
0
                    .tag("signature", _signature)
392
0
                    .tag("missed_versions", missed_versions)
393
0
                    .error(status);
394
0
            continue; // Try another BE
395
0
        }
396
0
        LOG_INFO("successfully make snapshot in remote BE")
397
0
                .tag("host", src.host)
398
0
                .tag("port", src.be_port)
399
0
                .tag("tablet", _clone_req.tablet_id)
400
0
                .tag("snapshot_path", *snapshot_path)
401
0
                .tag("signature", _signature)
402
0
                .tag("missed_versions", missed_versions);
403
0
        Defer defer {[host = src.host, port = src.be_port, &snapshot_path = *snapshot_path, this] {
404
            // TODO(plat1ko): Async release snapshot
405
0
            auto st = _release_snapshot(host, port, snapshot_path);
406
0
            if (!st.ok()) [[unlikely]] {
407
0
                LOG_WARNING("failed to release snapshot in remote BE")
408
0
                        .tag("host", host)
409
0
                        .tag("port", port)
410
0
                        .tag("snapshot_path", snapshot_path)
411
0
                        .error(st);
412
0
            }
413
0
        }};
414
415
0
        std::string remote_dir;
416
0
        {
417
0
            std::stringstream ss;
418
0
            if (snapshot_path->back() == '/') {
419
0
                ss << *snapshot_path << _clone_req.tablet_id << "/" << _clone_req.schema_hash
420
0
                   << "/";
421
0
            } else {
422
0
                ss << *snapshot_path << "/" << _clone_req.tablet_id << "/" << _clone_req.schema_hash
423
0
                   << "/";
424
0
            }
425
0
            remote_dir = ss.str();
426
0
        }
427
428
0
        std::string address = get_host_port(src.host, src.http_port);
429
0
        int32_t copy_type =
430
0
                _clone_req.__isset.copy_type ? _clone_req.copy_type : TabletCopyType::DEFAULT;
431
0
        RETURN_IF_ERROR(TabletCopyType::validate(copy_type));
432
0
        bool copy_row_binlog = TabletCopyType::has(copy_type, TTabletCopyType::ROW_BINLOG);
433
434
0
        if (config::enable_batch_download && is_support_batch_download(address).ok()) {
435
            // download files via batch api.
436
0
            LOG_INFO("remote BE supports batch download, use batch file download")
437
0
                    .tag("address", address)
438
0
                    .tag("remote_dir", remote_dir);
439
0
            status = _batch_download_files(&data_dir, address, remote_dir, local_data_path);
440
0
            if (!status.ok()) [[unlikely]] {
441
0
                LOG_WARNING("failed to download snapshot from remote BE in batch")
442
0
                        .tag("address", address)
443
0
                        .tag("remote_dir", remote_dir)
444
0
                        .error(status);
445
0
                continue; // Try another BE
446
0
            }
447
0
            if (copy_row_binlog) {
448
0
                std::string row_binlog_remote_dir =
449
0
                        fmt::format("{}{}/", remote_dir, FDRowBinlogSuffix);
450
0
                std::string row_binlog_local_path =
451
0
                        fmt::format("{}/{}", local_data_path, FDRowBinlogSuffix);
452
0
                status = _batch_download_files(&data_dir, address, row_binlog_remote_dir,
453
0
                                               row_binlog_local_path);
454
0
                if (!status.ok()) [[unlikely]] {
455
0
                    LOG_WARNING("failed to download row binlog snapshot from remote BE in batch")
456
0
                            .tag("address", address)
457
0
                            .tag("remote_dir", row_binlog_remote_dir)
458
0
                            .error(status);
459
0
                    continue; // Try another BE
460
0
                }
461
0
            }
462
0
        } else {
463
0
            if (config::enable_batch_download) {
464
0
                LOG_INFO("remote BE does not support batch download, use single file download")
465
0
                        .tag("address", address)
466
0
                        .tag("remote_dir", remote_dir);
467
0
            } else {
468
0
                LOG_INFO("batch download is disabled, use single file download")
469
0
                        .tag("address", address)
470
0
                        .tag("remote_dir", remote_dir);
471
0
            }
472
473
0
            std::string remote_url_prefix;
474
0
            {
475
0
                std::stringstream ss;
476
0
                ss << "http://" << address << HTTP_REQUEST_PREFIX << HTTP_REQUEST_TOKEN_PARAM
477
0
                   << token << HTTP_REQUEST_FILE_PARAM << remote_dir;
478
0
                remote_url_prefix = ss.str();
479
0
            }
480
481
0
            status = _download_files(&data_dir, remote_url_prefix, local_data_path);
482
0
            if (!status.ok()) [[unlikely]] {
483
0
                LOG_WARNING("failed to download snapshot from remote BE")
484
0
                        .tag("url", mask_token(remote_url_prefix))
485
0
                        .error(status);
486
0
                continue; // Try another BE
487
0
            }
488
0
            if (copy_row_binlog) {
489
0
                std::string row_binlog_remote_url_prefix;
490
0
                {
491
0
                    std::stringstream ss;
492
0
                    ss << "http://" << address << HTTP_REQUEST_PREFIX << HTTP_REQUEST_TOKEN_PARAM
493
0
                       << token << HTTP_REQUEST_FILE_PARAM << remote_dir << FDRowBinlogSuffix
494
0
                       << "/";
495
0
                    row_binlog_remote_url_prefix = ss.str();
496
0
                }
497
0
                std::string row_binlog_local_path =
498
0
                        fmt::format("{}/{}", local_data_path, FDRowBinlogSuffix);
499
0
                status = _download_files(&data_dir, row_binlog_remote_url_prefix,
500
0
                                         row_binlog_local_path);
501
0
                if (!status.ok()) [[unlikely]] {
502
0
                    LOG_WARNING("failed to download row binlog snapshot from remote BE")
503
0
                            .tag("url", mask_token(row_binlog_remote_url_prefix))
504
0
                            .error(status);
505
0
                    continue; // Try another BE
506
0
                }
507
0
            }
508
0
        }
509
510
        // No need to try again with another BE
511
0
        _pending_rs_guards = DORIS_TRY(_engine.snapshot_mgr()->convert_rowset_ids(
512
0
                local_data_path, _clone_req.tablet_id, _clone_req.replica_id, _clone_req.table_id,
513
0
                _clone_req.partition_id, _clone_req.schema_hash));
514
0
        break;
515
0
    } // clone copy from one backend
516
0
    return status;
517
0
}
518
519
/// Asks the backend at `ip:port` to make a snapshot of the tablet.
///
/// The request carries the version and copy type of `_clone_req`, the first version of every
/// entry in `missed_versions`, and `timeout_s` when it is positive.
///
/// @param ip                      host of the source backend
/// @param port                    thrift port of the source backend
/// @param tablet_id               id of the tablet to snapshot
/// @param schema_hash             schema hash of the tablet
/// @param timeout_s               snapshot timeout in seconds; values <= 0 are not sent
/// @param missed_versions         versions missing locally
/// @param snapshot_path           [out] snapshot path returned by the backend, always ending
///                                with '/'
/// @param allow_incremental_clone [out] written only when the backend reports this flag
/// @return OK on success; the validation error for an invalid copy type; the RPC error; the
///         status reported by the backend; or an InternalError when the backend reports
///         success without a usable snapshot path.
Status EngineCloneTask::_make_snapshot(const std::string& ip, int port, TTableId tablet_id,
                                       TSchemaHash schema_hash, int timeout_s,
                                       const std::vector<Version>& missed_versions,
                                       std::string* snapshot_path, bool* allow_incremental_clone) {
    TSnapshotRequest request;
    request.__set_tablet_id(tablet_id);
    request.__set_schema_hash(schema_hash);
    request.__set_preferred_snapshot_version(g_Types_constants.TPREFER_SNAPSHOT_REQ_VERSION);
    request.__set_version(_clone_req.version);

    const int32_t copy_type =
            _clone_req.__isset.copy_type ? _clone_req.copy_type : TabletCopyType::DEFAULT;
    RETURN_IF_ERROR(TabletCopyType::validate(copy_type));
    request.__set_copy_type(copy_type);
    request.__set_is_copy_binlog(TabletCopyType::has(copy_type, TTabletCopyType::CCR_BINLOG));

    // TODO: this assumes every missing version is a singleton delta; if that stops being
    // true this place should be rewritten.
    // Every TSnapshotRequest sent from a BE has __isset.missing_version = true, so a BE that
    // receives a request with __isset.missing_version = false knows it was sent by FE
    // (FE never sets this field).
    request.__isset.missing_version = true;
    request.missing_version.reserve(missed_versions.size());
    for (const auto& version : missed_versions) {
        request.missing_version.push_back(version.first);
    }
    if (timeout_s > 0) {
        request.__set_timeout(timeout_s);
    }

    TAgentResult result;
    RETURN_IF_ERROR(ThriftRpcHelper::rpc<BackendServiceClient>(
            ip, port, [&request, &result](BackendServiceConnection& client) {
                client->make_snapshot(result, request);
            }));
    if (result.status.status_code != TStatusCode::OK) {
        return Status::create(result.status);
    }

    if (!result.__isset.snapshot_path) {
        return Status::InternalError("success snapshot request without snapshot path");
    }
    // An empty path cannot be used, and inspecting its last character would be out of range
    // (the former `at(length() - 1)` threw std::out_of_range here), so reject it explicitly.
    if (result.snapshot_path.empty()) {
        return Status::InternalError("success snapshot request with empty snapshot path");
    }
    *snapshot_path = result.snapshot_path;
    if (snapshot_path->back() != '/') {
        snapshot_path->push_back('/');
    }

    if (result.__isset.allow_incremental_clone) {
        // During a rolling upgrade some BE nodes may still run an older version that does
        // not support incremental clone; those nodes do not set this field, in which case
        // the caller's value is left untouched.
        *allow_incremental_clone = result.allow_incremental_clone;
    }
    return Status::OK();
}
571
572
/// Asks the backend at `ip:port` to release the snapshot stored at `snapshot_path`.
///
/// @return the RPC error when the call could not be made, otherwise the status reported by
///         the backend.
Status EngineCloneTask::_release_snapshot(const std::string& ip, int port,
                                          const std::string& snapshot_path) {
    TAgentResult rpc_result;
    auto release_on_remote = [&rpc_result, &snapshot_path](BackendServiceConnection& client) {
        client->release_snapshot(rpc_result, snapshot_path);
    };
    RETURN_IF_ERROR(ThriftRpcHelper::rpc<BackendServiceClient>(ip, port, release_on_remote));
    return Status::create(rpc_result.status);
}
581
582
/// Downloads every file listed under `remote_url_prefix` into `local_path`, one file at a time
/// (a HEAD request for the length followed by a download request for the content).
///
/// @param data_dir          data dir whose capacity limit is checked before each file is fetched
/// @param remote_url_prefix URL prefix; requesting it returns a newline-separated file list and
///                          `remote_url_prefix + file_name` addresses a single file
/// @param local_path        local directory; it is deleted and recreated before downloading
/// @return OK on success, otherwise the first failing Status (listing, length query, capacity
///         limit, download, size mismatch or permission change).
///
/// On success `_copy_size` and `_copy_time_ms` are updated with the totals of this run.
Status EngineCloneTask::_download_files(DataDir* data_dir, const std::string& remote_url_prefix,
                                        const std::string& local_path) {
    // local_path is tabletid/clone; for a specific tablet there should be only one such folder.
    // If it already exists it must be removed first: e.g. a clone from BE 1 downloaded file 1
    // with version (2,2) and then failed; the retry against BE 2 would find file 1 already
    // present, but a file with the same name may hold a different version.
    RETURN_IF_ERROR(io::global_local_filesystem()->delete_directory(local_path));
    RETURN_IF_ERROR(io::global_local_filesystem()->create_directory(local_path));

    // Get remote dir file list
    std::string file_list_str;
    auto list_files_cb = [&remote_url_prefix, &file_list_str](HttpClient* client) {
        RETURN_IF_ERROR(client->init(remote_url_prefix));
        client->set_timeout_ms(LIST_REMOTE_FILE_TIMEOUT * 1000);
        return client->execute(&file_list_str);
    };
    RETURN_IF_ERROR(HttpClient::execute_with_retry(DOWNLOAD_FILE_MAX_RETRY, 1, list_files_cb));
    std::vector<std::string> file_name_list =
            absl::StrSplit(file_list_str, "\n", absl::SkipWhitespace());

    // If the header file does not exist, the tablet can't be loaded by the olap engine.
    // To avoid exposing incomplete data, the header file (name ends with .hdr) is copied last.
    // The index is size_t so the comparison against size() does not mix signed and unsigned.
    for (size_t i = 0; i + 1 < file_name_list.size(); ++i) {
        if (file_name_list[i].ends_with(".hdr")) {
            std::swap(file_name_list[i], file_name_list.back());
            break;
        }
    }

    // Get copy from remote
    uint64_t total_file_size = 0;
    MonotonicStopWatch watch;
    watch.start();
    for (auto& file_name : file_name_list) {
        auto remote_file_url = remote_url_prefix + file_name;

        // get file length
        uint64_t file_size = 0;
        auto get_file_size_cb = [&remote_file_url, &file_size](HttpClient* client) {
            RETURN_IF_ERROR(client->init(remote_file_url));
            client->set_timeout_ms(GET_LENGTH_TIMEOUT * 1000);
            RETURN_IF_ERROR(client->head());
            RETURN_IF_ERROR(client->get_content_length(&file_size));
            return Status::OK();
        };
        RETURN_IF_ERROR(
                HttpClient::execute_with_retry(DOWNLOAD_FILE_MAX_RETRY, 1, get_file_size_cb));
        // check disk capacity
        if (data_dir->reach_capacity_limit(file_size)) {
            return Status::Error<EXCEEDED_LIMIT>(
                    "reach the capacity limit of path {}, file_size={}", data_dir->path(),
                    file_size);
        }

        total_file_size += file_size;
        // Timeout derived from the configured minimum speed, never below
        // config::download_low_speed_time. A non-positive speed limit would otherwise divide
        // by zero, so in that case only the minimum timeout is used.
        uint64_t estimate_timeout = 0;
        if (config::download_low_speed_limit_kbps > 0) {
            estimate_timeout = file_size / config::download_low_speed_limit_kbps / 1024;
        }
        if (estimate_timeout < config::download_low_speed_time) {
            estimate_timeout = config::download_low_speed_time;
        }

        std::string local_file_path = local_path + "/" + file_name;

        LOG(INFO) << "clone begin to download file from: " << mask_token(remote_file_url)
                  << " to: " << local_file_path << ". size(B): " << file_size
                  << ", timeout(s): " << estimate_timeout;

        auto download_cb = [&remote_file_url, estimate_timeout, &local_file_path,
                            file_size](HttpClient* client) {
            RETURN_IF_ERROR(client->init(remote_file_url));
            client->set_timeout_ms(estimate_timeout * 1000);
            RETURN_IF_ERROR(client->download(local_file_path));

            std::error_code ec;
            // Check file length
            uint64_t local_file_size = std::filesystem::file_size(local_file_path, ec);
            if (ec) {
                LOG(WARNING) << "download file error: " << ec.message();
                return Status::IOError("can't retrive file_size of {}, due to {}", local_file_path,
                                       ec.message());
            }
            if (local_file_size != file_size) {
                LOG(WARNING) << "download file length error"
                             << ", remote_path=" << mask_token(remote_file_url)
                             << ", file_size=" << file_size
                             << ", local_file_size=" << local_file_size;
                return Status::InternalError("downloaded file size is not equal");
            }
            return io::global_local_filesystem()->permission(local_file_path,
                                                             io::LocalFileSystem::PERMS_OWNER_RW);
        };
        RETURN_IF_ERROR(HttpClient::execute_with_retry(DOWNLOAD_FILE_MAX_RETRY, 1, download_cb));
    } // Clone files from remote backend

    // total_time_ms is unsigned, so no clamp to zero is needed.
    const uint64_t total_time_ms = watch.elapsed_time() / 1000 / 1000;
    double copy_rate = 0.0;
    if (total_time_ms > 0) {
        // bytes per millisecond divided by 1000 gives the MB/s figure that is logged below
        copy_rate = static_cast<double>(total_file_size) / static_cast<double>(total_time_ms) / 1000;
    }
    _copy_size = static_cast<int64_t>(total_file_size);
    _copy_time_ms = static_cast<int64_t>(total_time_ms);
    LOG(INFO) << "succeed to copy tablet " << _signature
              << ", total files: " << file_name_list.size()
              << ", total file size: " << total_file_size << " B, cost: " << total_time_ms << " ms"
              << ", rate: " << copy_rate << " MB/s";
    return Status::OK();
}
692
693
/// Downloads all files of `remote_dir` on `address` into `local_dir`, several files per request.
///
/// Files are grouped into batches limited by file count (BATCH_FILE_NUM) and accumulated size
/// (BATCH_FILE_SIZE); the last file of the list always goes into a batch of its own.
///
/// @param data_dir   data dir whose capacity limit is checked before each batch is fetched
/// @param address    address of the source backend, passed through to the v2 download helpers
/// @param remote_dir directory on the source backend to copy
/// @param local_dir  local directory; it is deleted and recreated before downloading
/// @return OK on success, otherwise the first failing Status.
///
/// On success `_copy_size` and `_copy_time_ms` are updated with the totals of this run.
Status EngineCloneTask::_batch_download_files(DataDir* data_dir, const std::string& address,
                                              const std::string& remote_dir,
                                              const std::string& local_dir) {
    constexpr size_t BATCH_FILE_SIZE = 64 << 20; // 64MB
    constexpr size_t BATCH_FILE_NUM = 64;

    // local_dir is tabletid/clone; for a specific tablet there should be only one such folder.
    // If it already exists it must be removed first: e.g. a clone from BE 1 downloaded file 1
    // with version (2,2) and then failed; the retry against BE 2 would find file 1 already
    // present, but a file with the same name may hold a different version.
    RETURN_IF_ERROR(io::global_local_filesystem()->delete_directory(local_dir));
    RETURN_IF_ERROR(io::global_local_filesystem()->create_directory(local_dir));

    const std::string& token = _cluster_info->token;
    // (file name, file size) pairs as reported by the source backend
    std::vector<std::pair<std::string, size_t>> file_info_list;
    RETURN_IF_ERROR(list_remote_files_v2(address, token, remote_dir, &file_info_list));

    // If the header file does not exist, the tablet can't be loaded by the olap engine.
    // To avoid exposing incomplete data, the header file (name ends with .hdr) is copied last.
    // The index is size_t so the comparison against size() does not mix signed and unsigned.
    for (size_t i = 0; i + 1 < file_info_list.size(); ++i) {
        if (file_info_list[i].first.ends_with(".hdr")) {
            std::swap(file_info_list[i], file_info_list.back());
            break;
        }
    }

    MonotonicStopWatch watch;
    watch.start();

    size_t total_file_size = 0;
    const size_t total_files = file_info_list.size();
    std::vector<std::pair<std::string, size_t>> batch_files;
    for (size_t i = 0; i < total_files;) {
        size_t batch_file_size = 0;
        for (size_t j = i; j < total_files; j++) {
            // Split batches by file number and file size,
            if (BATCH_FILE_NUM <= batch_files.size() || BATCH_FILE_SIZE <= batch_file_size ||
                // ... or separate the last .hdr file into a single batch.
                (j + 1 == total_files && !batch_files.empty())) {
                break;
            }
            // The first candidate of a batch is always accepted (the batch is empty and its
            // size is zero at that point), so every outer iteration makes progress.
            batch_files.push_back(file_info_list[j]);
            batch_file_size += file_info_list[j].second;
        }

        // check disk capacity
        if (data_dir->reach_capacity_limit(batch_file_size)) {
            return Status::Error<EXCEEDED_LIMIT>(
                    "reach the capacity limit of path {}, file_size={}", data_dir->path(),
                    batch_file_size);
        }

        RETURN_IF_ERROR(download_files_v2(address, token, remote_dir, local_dir, batch_files));

        total_file_size += batch_file_size;
        i += batch_files.size();
        batch_files.clear();
    }

    // total_time_ms is unsigned, so no clamp to zero is needed.
    const uint64_t total_time_ms = watch.elapsed_time() / 1000 / 1000;
    double copy_rate = 0.0;
    if (total_time_ms > 0) {
        // bytes per millisecond divided by 1000 gives the MB/s figure that is logged below
        copy_rate = static_cast<double>(total_file_size) / static_cast<double>(total_time_ms) / 1000;
    }
    _copy_size = static_cast<int64_t>(total_file_size);
    _copy_time_ms = static_cast<int64_t>(total_time_ms);
    LOG(INFO) << "succeed to copy tablet " << _signature
              << ", total files: " << file_info_list.size()
              << ", total file size: " << total_file_size << " B, cost: " << total_time_ms << " ms"
              << ", rate: " << copy_rate << " MB/s";

    return Status::OK();
}
770
771
/// This method will only be called if tablet already exist in this BE when doing clone.
772
/// This method will do the following things:
773
/// 1. Link all files from CLONE dir to tablet dir if file does not exist in tablet dir
774
/// 2. Call _finish_xx_clone() to revise the tablet meta.
775
Status EngineCloneTask::_finish_clone(Tablet* tablet, const std::string& clone_dir, int64_t version,
776
0
                                      bool is_incremental_clone, bool copy_row_binlog) {
777
0
    Defer remove_clone_dir {[&]() {
778
0
        std::error_code ec;
779
0
        std::filesystem::remove_all(clone_dir, ec);
780
0
        if (ec) {
781
0
            LOG(WARNING) << "failed to remove=" << clone_dir << " msg=" << ec.message();
782
0
        }
783
0
    }};
784
785
    // check clone dir existed
786
0
    bool exists = true;
787
0
    RETURN_IF_ERROR(io::global_local_filesystem()->exists(clone_dir, &exists));
788
0
    if (!exists) {
789
0
        return Status::InternalError("clone dir not existed. clone_dir={}", clone_dir);
790
0
    }
791
792
    // Load src header.
793
    // The tablet meta info is downloaded from source BE as .hdr file.
794
    // So we load it and generate cloned_tablet_meta.
795
0
    auto cloned_tablet_meta_file = fmt::format("{}/{}.hdr", clone_dir, tablet->tablet_id());
796
0
    auto cloned_tablet_meta = std::make_shared<TabletMeta>();
797
0
    RETURN_IF_ERROR(cloned_tablet_meta->create_from_file(cloned_tablet_meta_file));
798
799
    // remove the cloned meta file
800
0
    RETURN_IF_ERROR(io::global_local_filesystem()->delete_file(cloned_tablet_meta_file));
801
802
    // remove rowset binlog metas
803
0
    const auto& tablet_dir = tablet->tablet_path();
804
0
    auto binlog_metas_file = fmt::format("{}/rowset_binlog_metas.pb", clone_dir);
805
0
    bool binlog_metas_file_exists = false;
806
0
    auto file_exists_status =
807
0
            io::global_local_filesystem()->exists(binlog_metas_file, &binlog_metas_file_exists);
808
0
    if (!file_exists_status.ok()) {
809
0
        return file_exists_status;
810
0
    }
811
0
    bool contain_binlog = false;
812
0
    RowsetBinlogMetasPB rowset_binlog_metas_pb;
813
0
    if (binlog_metas_file_exists) {
814
0
        std::error_code ec;
815
0
        auto binlog_meta_filesize = std::filesystem::file_size(binlog_metas_file, ec);
816
0
        if (ec) {
817
0
            LOG(WARNING) << "get file size error" << ec.message();
818
0
            return Status::IOError("can't retrive file_size of {}, due to {}", binlog_metas_file,
819
0
                                   ec.message());
820
0
        }
821
0
        if (binlog_meta_filesize > 0) {
822
0
            contain_binlog = true;
823
0
            RETURN_IF_ERROR(read_pb(binlog_metas_file, &rowset_binlog_metas_pb));
824
0
        }
825
0
        RETURN_IF_ERROR(io::global_local_filesystem()->delete_file(binlog_metas_file));
826
0
    }
827
0
    if (contain_binlog) {
828
0
        auto binlog_dir = fmt::format("{}/_binlog", tablet_dir);
829
0
        RETURN_IF_ERROR(io::global_local_filesystem()->create_directory(binlog_dir));
830
0
    }
831
832
0
    enum class CloneFileType { DATA, ROW_BINLOG, CCR_BINLOG };
833
834
    // check all files in /clone and /tablet
835
0
    std::vector<io::FileInfo> clone_files;
836
0
    RETURN_IF_ERROR(io::global_local_filesystem()->list(clone_dir, true, &clone_files, &exists));
837
0
    std::vector<std::pair<CloneFileType, std::string>> clone_file_names;
838
0
    for (auto& file : clone_files) {
839
0
        CloneFileType file_type = CloneFileType::DATA;
840
0
        if (file.file_name.ends_with(".binlog") || file.file_name.ends_with(".binlog-index")) {
841
0
            file_type = CloneFileType::CCR_BINLOG;
842
0
        }
843
0
        clone_file_names.emplace_back(file_type, file.file_name);
844
0
    }
845
0
    auto row_binlog_clone_dir = fmt::format("{}/{}", clone_dir, FDRowBinlogSuffix);
846
0
    if (copy_row_binlog) {
847
0
        clone_files.clear();
848
0
        RETURN_IF_ERROR(io::global_local_filesystem()->list(row_binlog_clone_dir, true,
849
0
                                                            &clone_files, &exists));
850
0
        if (!exists) {
851
0
            return Status::InternalError("row binlog clone dir not existed. clone_dir={}",
852
0
                                         row_binlog_clone_dir);
853
0
        }
854
0
        for (auto& file : clone_files) {
855
0
            clone_file_names.emplace_back(CloneFileType::ROW_BINLOG, file.file_name);
856
0
        }
857
0
    }
858
859
0
    std::vector<io::FileInfo> local_files;
860
0
    RETURN_IF_ERROR(io::global_local_filesystem()->list(tablet_dir, true, &local_files, &exists));
861
0
    std::unordered_set<std::string> data_local_file_names;
862
0
    for (auto& file : local_files) {
863
0
        data_local_file_names.insert(file.file_name);
864
0
    }
865
0
    auto row_binlog_dir = tablet->row_binlog_path();
866
0
    std::unordered_set<std::string> row_binlog_local_file_names;
867
0
    if (copy_row_binlog) {
868
0
        RETURN_IF_ERROR(io::global_local_filesystem()->create_directory(row_binlog_dir));
869
0
        local_files.clear();
870
0
        RETURN_IF_ERROR(
871
0
                io::global_local_filesystem()->list(row_binlog_dir, true, &local_files, &exists));
872
0
        for (auto& file : local_files) {
873
0
            row_binlog_local_file_names.insert(file.file_name);
874
0
        }
875
0
    }
876
877
0
    Status status;
878
0
    std::vector<std::string> linked_success_files;
879
0
    Defer remove_linked_files {[&]() { // clear linked files if errors happen
880
0
        if (!status.ok()) {
881
0
            std::vector<io::Path> paths;
882
0
            for (auto& file : linked_success_files) {
883
0
                paths.emplace_back(file);
884
0
            }
885
0
            static_cast<void>(io::global_local_filesystem()->batch_delete(paths));
886
0
        }
887
0
    }};
888
    /// Traverse all downloaded clone files in CLONE dir.
889
    /// If it does not exist in local tablet dir, link the file to local tablet dir
890
    /// And save all linked files in linked_success_files.
891
0
    for (const auto& [clone_file_type, clone_file] : clone_file_names) {
892
0
        if (clone_file_type == CloneFileType::DATA &&
893
0
            data_local_file_names.find(clone_file) != data_local_file_names.end()) {
894
0
            VLOG_NOTICE << "find same file when clone, skip it. "
895
0
                        << "tablet=" << tablet->tablet_id() << ", clone_file=" << clone_file;
896
0
            continue;
897
0
        }
898
0
        if (clone_file_type == CloneFileType::ROW_BINLOG &&
899
0
            row_binlog_local_file_names.find(clone_file) != row_binlog_local_file_names.end()) {
900
0
            VLOG_NOTICE << "find same row binlog file when clone, skip it. "
901
0
                        << "tablet=" << tablet->tablet_id() << ", clone_file=" << clone_file;
902
0
            continue;
903
0
        }
904
905
        /// if binlog exist in clone dir and md5sum equal, then skip link file
906
0
        bool skip_link_file = false;
907
0
        std::string to;
908
0
        if (clone_file_type == CloneFileType::CCR_BINLOG) {
909
0
            if (!contain_binlog) {
910
0
                LOG(WARNING) << "clone binlog file, but not contain binlog metas. "
911
0
                             << "tablet=" << tablet->tablet_id() << ", clone_file=" << clone_file;
912
0
                break;
913
0
            }
914
915
0
            if (auto&& result =
916
0
                        check_dest_binlog_valid(tablet_dir, clone_dir, clone_file, &skip_link_file);
917
0
                result) {
918
0
                to = std::move(result.value());
919
0
            } else {
920
0
                status = std::move(result.error());
921
0
                return status;
922
0
            }
923
0
        } else {
924
0
            auto& local_dir =
925
0
                    clone_file_type == CloneFileType::ROW_BINLOG ? row_binlog_dir : tablet_dir;
926
0
            to = fmt::format("{}/{}", local_dir, clone_file);
927
0
        }
928
929
0
        if (!skip_link_file) {
930
0
            auto& clone_file_dir =
931
0
                    clone_file_type == CloneFileType::ROW_BINLOG ? row_binlog_clone_dir : clone_dir;
932
0
            auto from = fmt::format("{}/{}", clone_file_dir, clone_file);
933
0
            status = io::global_local_filesystem()->link_file(from, to);
934
0
            if (!status.ok()) {
935
0
                return status;
936
0
            }
937
0
            linked_success_files.emplace_back(std::move(to));
938
0
        }
939
0
    }
940
0
    if (contain_binlog) {
941
0
        status = tablet->ingest_binlog_metas(&rowset_binlog_metas_pb);
942
0
        if (!status.ok()) {
943
0
            return status;
944
0
        }
945
0
    }
946
947
    // clone and compaction operation should be performed sequentially
948
0
    std::lock_guard base_compaction_lock(tablet->get_base_compaction_lock());
949
0
    std::lock_guard cumulative_compaction_lock(tablet->get_cumulative_compaction_lock());
950
0
    std::lock_guard cold_compaction_lock(tablet->get_cold_compaction_lock());
951
0
    std::lock_guard build_inverted_index_lock(tablet->get_build_inverted_index_lock());
952
0
    std::lock_guard<std::mutex> push_lock(tablet->get_push_lock());
953
0
    std::lock_guard<std::mutex> rwlock(tablet->get_rowset_update_lock());
954
0
    std::lock_guard<std::shared_mutex> wrlock(tablet->get_header_lock());
955
0
    SCOPED_SIMPLE_TRACE_IF_TIMEOUT(TRACE_TABLET_LOCK_THRESHOLD);
956
0
    if (is_incremental_clone) {
957
0
        status = _finish_incremental_clone(tablet, cloned_tablet_meta, version, copy_row_binlog);
958
0
    } else {
959
0
        status = _finish_full_clone(tablet, cloned_tablet_meta, copy_row_binlog);
960
0
    }
961
962
    // if full clone success, need to update cumulative layer point
963
0
    if (!is_incremental_clone && status.ok()) {
964
0
        tablet->set_cumulative_layer_point(Tablet::K_INVALID_CUMULATIVE_POINT);
965
0
    }
966
967
    // clear clone dir
968
0
    return status;
969
0
}
970
971
/// This method will do:
972
/// 1. Get missing version from local tablet again and check if they exist in cloned tablet.
973
/// 2. Revise the local tablet meta to add all incremental cloned rowset's meta.
974
Status EngineCloneTask::_finish_incremental_clone(Tablet* tablet,
975
                                                  const TabletMetaSharedPtr& cloned_tablet_meta,
976
0
                                                  int64_t version, bool copy_row_binlog) {
977
0
    LOG(INFO) << "begin to finish incremental clone. tablet=" << tablet->tablet_id()
978
0
              << ", visible_version=" << version
979
0
              << ", cloned_tablet_replica_id=" << cloned_tablet_meta->replica_id();
980
981
    /// Get missing versions again from local tablet.
982
    /// We got it before outside the lock, so it has to be got again.
983
0
    Versions missed_versions = tablet->get_missed_versions_unlocked(version);
984
0
    VLOG_NOTICE << "get missed versions again when finish incremental clone. "
985
0
                << "tablet=" << tablet->tablet_id() << ", clone version=" << version
986
0
                << ", missed_versions_size=" << missed_versions.size();
987
988
    // check missing versions exist in clone src
989
0
    std::vector<RowsetSharedPtr> rowsets_to_clone;
990
0
    for (Version nested_version : missed_versions) {
991
0
        auto rs_meta = cloned_tablet_meta->acquire_rs_meta_by_version(nested_version);
992
0
        if (rs_meta == nullptr) {
993
0
            return Status::InternalError("missed version {} is not found in cloned tablet meta",
994
0
                                         nested_version.to_string());
995
0
        }
996
0
        RowsetSharedPtr rs;
997
0
        RETURN_IF_ERROR(tablet->create_rowset(rs_meta, &rs));
998
0
        rowsets_to_clone.push_back(std::move(rs));
999
0
        if (copy_row_binlog) {
1000
0
            rs_meta = cloned_tablet_meta->acquire_row_binlog_rs_meta_by_version(nested_version);
1001
0
            if (rs_meta == nullptr) {
1002
0
                return Status::InternalError(
1003
0
                        "missed version {} row binlog is not found in cloned tablet meta",
1004
0
                        nested_version.to_string());
1005
0
            }
1006
0
            RETURN_IF_ERROR(tablet->create_rowset(rs_meta, &rs));
1007
0
            rowsets_to_clone.push_back(std::move(rs));
1008
0
        }
1009
0
    }
1010
1011
0
    if (copy_row_binlog && tablet->enable_unique_key_merge_on_write()) {
1012
0
        tablet->tablet_meta()->binlog_delvec().merge(cloned_tablet_meta->binlog_delvec());
1013
0
    }
1014
1015
    /// clone_data to tablet
1016
    /// For incremental clone, nothing will be deleted.
1017
    /// So versions_to_delete is empty.
1018
0
    return tablet->revise_tablet_meta(rowsets_to_clone, {}, true, copy_row_binlog);
1019
0
}
1020
1021
/// This method will do:
1022
/// 1. Compare the version of local tablet and cloned tablet to decide which version to keep
1023
/// 2. Revise the local tablet meta
1024
Status EngineCloneTask::_finish_full_clone(Tablet* tablet,
1025
                                           const TabletMetaSharedPtr& cloned_tablet_meta,
1026
0
                                           bool copy_row_binlog) {
1027
0
    Version cloned_max_version = cloned_tablet_meta->max_version();
1028
0
    LOG(INFO) << "begin to finish full clone. tablet=" << tablet->tablet_id()
1029
0
              << ", cloned_max_version=" << cloned_max_version;
1030
1031
    // Compare the version of local tablet and cloned tablet.
1032
    // For example:
1033
    // clone version is 8
1034
    //
1035
    //      local tablet: [0-1] [2-5] [6-6] [7-7] [9-10]
1036
    //      clone tablet: [0-1] [2-4] [5-6] [7-8]
1037
    //
1038
    // after compare, the version mark with "x" will be deleted
1039
    //
1040
    //      local tablet: [0-1]x [2-5]x [6-6]x [7-7]x [9-10]
1041
    //      clone tablet: [0-1]  [2-4]  [5-6]  [7-8]
1042
1043
0
    std::vector<RowsetSharedPtr> to_delete;
1044
0
    std::vector<RowsetSharedPtr> to_add;
1045
0
    for (auto& [v, rs] : tablet->rowset_map()) {
1046
        // if local version cross src latest, clone failed
1047
        // if local version is : 0-0, 1-1, 2-10, 12-14, 15-15,16-16
1048
        // cloned max version is 13-13, this clone is failed, because could not
1049
        // fill local data by using cloned data.
1050
        // It should not happen because if there is a hole, the following delta will not
1051
        // do compaction.
1052
0
        if (v.first <= cloned_max_version.second && v.second > cloned_max_version.second) {
1053
0
            return Status::InternalError(
1054
0
                    "version cross src latest. cloned_max_version={}, local_version={}",
1055
0
                    cloned_max_version.second, v.to_string());
1056
0
        }
1057
0
        if (v.second <= cloned_max_version.second) {
1058
0
            to_delete.push_back(rs);
1059
0
        } else {
1060
            // cooldowned rowsets MUST be continuous, so rowsets whose version > missed version MUST be local rowset
1061
0
            DCHECK(rs->is_local());
1062
0
        }
1063
0
    }
1064
0
    if (copy_row_binlog) {
1065
0
        for (auto& [v, rs] : tablet->row_binlog_rowset_map()) {
1066
0
            if (v.first <= cloned_max_version.second && v.second > cloned_max_version.second) {
1067
0
                return Status::InternalError(
1068
0
                        "row binlog version cross src latest. cloned_max_version={}, "
1069
0
                        "local_version={}",
1070
0
                        cloned_max_version.second, v.to_string());
1071
0
            }
1072
0
            if (v.second <= cloned_max_version.second) {
1073
0
                to_delete.push_back(rs);
1074
0
            } else {
1075
0
                DCHECK(rs->is_local());
1076
0
            }
1077
0
        }
1078
0
    }
1079
1080
0
    to_add.reserve(cloned_tablet_meta->all_rs_metas().size() +
1081
0
                   cloned_tablet_meta->all_row_binlog_rs_metas().size());
1082
0
    for (const auto& [_, rs_meta] : cloned_tablet_meta->all_rs_metas()) {
1083
0
        RowsetSharedPtr rs;
1084
0
        RETURN_IF_ERROR(tablet->create_rowset(rs_meta, &rs));
1085
0
        to_add.push_back(std::move(rs));
1086
0
    }
1087
0
    if (copy_row_binlog) {
1088
0
        for (const auto& [_, rs_meta] : cloned_tablet_meta->all_row_binlog_rs_metas()) {
1089
0
            RowsetSharedPtr rs;
1090
0
            RETURN_IF_ERROR(tablet->create_rowset(rs_meta, &rs));
1091
0
            to_add.push_back(std::move(rs));
1092
0
        }
1093
0
    }
1094
0
    {
1095
0
        std::shared_lock cooldown_conf_rlock(tablet->get_cooldown_conf_lock());
1096
0
        if (tablet->cooldown_conf_unlocked().cooldown_replica_id == tablet->replica_id()) {
1097
            // If this replica is cooldown replica, MUST generate a new `cooldown_meta_id` to avoid use `cooldown_meta_id`
1098
            // generated in old cooldown term which may lead to such situation:
1099
            // Replica A is cooldown replica, cooldown_meta_id=2,
1100
            // Replica B: cooldown_replica=A, cooldown_meta_id=1
1101
            // Replica A: full clone Replica A, cooldown_meta_id=1, but remote cooldown_meta is still with cooldown_meta_id=2
1102
            // After tablet report. FE finds all replicas' cooldowned data is consistent
1103
            // Replica A: confirm_unused_remote_files, delete some cooldowned data of cooldown_meta_id=2
1104
            // Replica B: follow_cooldown_data, cooldown_meta_id=2, data lost
1105
0
            tablet->tablet_meta()->set_cooldown_meta_id(UniqueId::gen_uid());
1106
0
        } else {
1107
0
            tablet->tablet_meta()->set_cooldown_meta_id(cloned_tablet_meta->cooldown_meta_id());
1108
0
        }
1109
0
    }
1110
0
    if (tablet->enable_unique_key_merge_on_write()) {
1111
0
        tablet->tablet_meta()->delete_bitmap().merge(cloned_tablet_meta->delete_bitmap());
1112
0
        if (copy_row_binlog) {
1113
0
            tablet->tablet_meta()->binlog_delvec().merge(cloned_tablet_meta->binlog_delvec());
1114
0
        }
1115
0
    }
1116
0
    return tablet->revise_tablet_meta(to_add, to_delete, false, copy_row_binlog);
1117
    // TODO(plat1ko): write cooldown meta to remote if this replica is cooldown replica
1118
0
}
1119
} // namespace doris