be/src/storage/task/engine_clone_task.cpp
Line | Count | Source |
1 | | // Licensed to the Apache Software Foundation (ASF) under one |
2 | | // or more contributor license agreements. See the NOTICE file |
3 | | // distributed with this work for additional information |
4 | | // regarding copyright ownership. The ASF licenses this file |
5 | | // to you under the Apache License, Version 2.0 (the |
6 | | // "License"); you may not use this file except in compliance |
7 | | // with the License. You may obtain a copy of the License at |
8 | | // |
9 | | // http://www.apache.org/licenses/LICENSE-2.0 |
10 | | // |
11 | | // Unless required by applicable law or agreed to in writing, |
12 | | // software distributed under the License is distributed on an |
13 | | // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY |
14 | | // KIND, either express or implied. See the License for the |
15 | | // specific language governing permissions and limitations |
16 | | // under the License. |
17 | | |
18 | | #include "storage/task/engine_clone_task.h" |
19 | | |
20 | | #include <absl/strings/str_split.h> |
21 | | #include <curl/curl.h> |
22 | | #include <fcntl.h> |
23 | | #include <fmt/format.h> |
24 | | #include <gen_cpp/AgentService_types.h> |
25 | | #include <gen_cpp/BackendService.h> |
26 | | #include <gen_cpp/HeartbeatService_types.h> |
27 | | #include <gen_cpp/MasterService_types.h> |
28 | | #include <gen_cpp/Status_types.h> |
29 | | #include <gen_cpp/Types_constants.h> |
30 | | #include <sys/stat.h> |
31 | | |
32 | | #include <filesystem> |
33 | | #include <memory> |
34 | | #include <mutex> |
35 | | #include <ostream> |
36 | | #include <shared_mutex> |
37 | | #include <system_error> |
38 | | #include <unordered_map> |
39 | | #include <unordered_set> |
40 | | #include <utility> |
41 | | #include <vector> |
42 | | |
43 | | #include "common/config.h" |
44 | | #include "common/logging.h" |
45 | | #include "io/fs/file_system.h" |
46 | | #include "io/fs/local_file_system.h" |
47 | | #include "io/fs/path.h" |
48 | | #include "runtime/memory/mem_tracker_limiter.h" |
49 | | #include "runtime/thread_context.h" |
50 | | #include "service/http/http_client.h" |
51 | | #include "service/http/utils.h" |
52 | | #include "storage/binlog.h" |
53 | | #include "storage/data_dir.h" |
54 | | #include "storage/olap_common.h" |
55 | | #include "storage/olap_define.h" |
56 | | #include "storage/pb_helper.h" |
57 | | #include "storage/rowset/rowset.h" |
58 | | #include "storage/snapshot/snapshot_manager.h" |
59 | | #include "storage/storage_engine.h" |
60 | | #include "storage/tablet/tablet.h" |
61 | | #include "storage/tablet/tablet_manager.h" |
62 | | #include "storage/tablet/tablet_meta.h" |
63 | | #include "util/client_cache.h" |
64 | | #include "util/debug_points.h" |
65 | | #include "util/defer_op.h" |
66 | | #include "util/network_util.h" |
67 | | #include "util/security.h" |
68 | | #include "util/stopwatch.hpp" |
69 | | #include "util/thrift_rpc_helper.h" |
70 | | #include "util/trace.h" |
71 | | |
72 | | using std::stringstream; |
73 | | |
74 | | namespace doris { |
75 | | using namespace ErrorCode; |
76 | | |
77 | | namespace { |
78 | | /// if binlog file exist, then check if binlog file md5sum equal |
79 | | /// if equal, then skip link file |
80 | | /// if not equal, then return error |
81 | | /// return value: if binlog file not exist, then return to binlog file path |
82 | | Result<std::string> check_dest_binlog_valid(const std::string& tablet_dir, |
83 | | const std::string& clone_dir, |
84 | 0 | const std::string& clone_file, bool* skip_link_file) { |
85 | 0 | std::string from, to; |
86 | 0 | std::string new_clone_file = clone_file; |
87 | 0 | if (clone_file.ends_with(".binlog")) { |
88 | | // change clone_file suffix from .binlog to .dat |
89 | 0 | new_clone_file.replace(clone_file.size() - 7, 7, ".dat"); |
90 | 0 | } else if (clone_file.ends_with(".binlog-index")) { |
91 | | // change clone_file suffix from .binlog-index to .idx |
92 | 0 | new_clone_file.replace(clone_file.size() - 13, 13, ".idx"); |
93 | 0 | } |
94 | 0 | from = fmt::format("{}/{}", clone_dir, clone_file); |
95 | 0 | to = fmt::format("{}/_binlog/{}", tablet_dir, new_clone_file); |
96 | | |
97 | | // check to to file exist |
98 | 0 | bool exists = true; |
99 | 0 | auto status = io::global_local_filesystem()->exists(to, &exists); |
100 | 0 | if (!status.ok()) { |
101 | 0 | return ResultError(std::move(status)); |
102 | 0 | } |
103 | | |
104 | 0 | if (!exists) { |
105 | 0 | return to; |
106 | 0 | } |
107 | | |
108 | 0 | LOG(WARNING) << "binlog file already exist. " |
109 | 0 | << "tablet_dir=" << tablet_dir << ", clone_file=" << from << ", to=" << to; |
110 | |
|
111 | 0 | std::string clone_file_md5sum; |
112 | 0 | status = io::global_local_filesystem()->md5sum(from, &clone_file_md5sum); |
113 | 0 | if (!status.ok()) { |
114 | 0 | return ResultError(std::move(status)); |
115 | 0 | } |
116 | 0 | std::string to_file_md5sum; |
117 | 0 | status = io::global_local_filesystem()->md5sum(to, &to_file_md5sum); |
118 | 0 | if (!status.ok()) { |
119 | 0 | return ResultError(std::move(status)); |
120 | 0 | } |
121 | | |
122 | 0 | if (clone_file_md5sum == to_file_md5sum) { |
123 | | // if md5sum equal, then skip link file |
124 | 0 | *skip_link_file = true; |
125 | 0 | return to; |
126 | 0 | } else { |
127 | 0 | auto err_msg = fmt::format( |
128 | 0 | "binlog file already exist, but md5sum not equal. " |
129 | 0 | "tablet_dir={}, clone_file={}", |
130 | 0 | tablet_dir, clone_file); |
131 | 0 | LOG(WARNING) << err_msg; |
132 | 0 | return ResultError(Status::InternalError(std::move(err_msg))); |
133 | 0 | } |
134 | 0 | } |
135 | | } // namespace |
136 | | |
// Evaluate `stmt`, store its result in the caller-supplied `status` object and, when that result
// is not OK, return it from the enclosing function.
//
// The result is written to `status` before returning, so code that inspects `status` on the way
// out of the function observes the failure.
//
// `status` is parenthesized wherever an operator is applied to it, so that an expression
// argument (for example `*status_ptr`) expands with the intended precedence.
#define RETURN_IF_ERROR_(status, stmt)  \
    do {                                \
        (status) = (stmt);              \
        if (UNLIKELY(!(status).ok())) { \
            return status;              \
        }                               \
    } while (false)
144 | | |
145 | | EngineCloneTask::EngineCloneTask(StorageEngine& engine, const TCloneReq& clone_req, |
146 | | const ClusterInfo* cluster_info, int64_t signature, |
147 | | std::vector<TTabletInfo>* tablet_infos) |
148 | 0 | : _engine(engine), |
149 | 0 | _clone_req(clone_req), |
150 | 0 | _tablet_infos(tablet_infos), |
151 | 0 | _signature(signature), |
152 | 0 | _cluster_info(cluster_info) { |
153 | 0 | _mem_tracker = MemTrackerLimiter::create_shared( |
154 | 0 | MemTrackerLimiter::Type::OTHER, |
155 | 0 | "EngineCloneTask#tabletId=" + std::to_string(_clone_req.tablet_id)); |
156 | 0 | } |
157 | | |
158 | 0 | Status EngineCloneTask::execute() { |
159 | | // register the tablet to avoid it is deleted by gc thread during clone process |
160 | 0 | Status st = _do_clone(); |
161 | 0 | _engine.tablet_manager()->update_partitions_visible_version( |
162 | 0 | {{_clone_req.partition_id, _clone_req.version}}); |
163 | 0 | return st; |
164 | 0 | } |
165 | | |
166 | 0 | Status EngineCloneTask::_do_clone() { |
167 | 0 | DBUG_EXECUTE_IF("EngineCloneTask.wait_clone", { |
168 | 0 | auto duration = std::chrono::milliseconds(dp->param("duration", 10 * 1000)); |
169 | 0 | std::this_thread::sleep_for(duration); |
170 | 0 | }); |
171 | |
|
172 | 0 | DBUG_EXECUTE_IF("EngineCloneTask.failed_clone", { |
173 | 0 | LOG_WARNING("EngineCloneTask.failed_clone") |
174 | 0 | .tag("tablet_id", _clone_req.tablet_id) |
175 | 0 | .tag("replica_id", _clone_req.replica_id) |
176 | 0 | .tag("version", _clone_req.version); |
177 | 0 | return Status::InternalError( |
178 | 0 | "in debug point, EngineCloneTask.failed_clone tablet={}, replica={}, version={}", |
179 | 0 | _clone_req.tablet_id, _clone_req.replica_id, _clone_req.version); |
180 | 0 | }); |
181 | 0 | Status status = Status::OK(); |
182 | 0 | std::string src_file_path; |
183 | 0 | TBackend src_host; |
184 | 0 | int32_t copy_type = |
185 | 0 | _clone_req.__isset.copy_type ? _clone_req.copy_type : TabletCopyType::DEFAULT; |
186 | 0 | RETURN_IF_ERROR(TabletCopyType::validate(copy_type)); |
187 | 0 | bool copy_row_binlog = TabletCopyType::has(copy_type, TTabletCopyType::ROW_BINLOG); |
188 | 0 | RETURN_IF_ERROR( |
189 | 0 | _engine.tablet_manager()->register_transition_tablet(_clone_req.tablet_id, "clone")); |
190 | 0 | Defer defer {[&]() { |
191 | 0 | _engine.tablet_manager()->unregister_transition_tablet(_clone_req.tablet_id, "clone"); |
192 | 0 | }}; |
193 | | |
194 | | // Check local tablet exist or not |
195 | 0 | TabletSharedPtr tablet = _engine.tablet_manager()->get_tablet(_clone_req.tablet_id); |
196 | | |
197 | | // The status of a tablet is not ready, indicating that it is a residual tablet after a schema |
198 | | // change failure. Clone a new tablet from remote be to overwrite it. This situation basically only |
199 | | // occurs when the be_rebalancer_fuzzy_test configuration is enabled. |
200 | 0 | if (tablet && tablet->tablet_state() == TABLET_NOTREADY) { |
201 | 0 | LOG(WARNING) << "tablet state is not ready when clone, need to drop old tablet, tablet_id=" |
202 | 0 | << tablet->tablet_id(); |
203 | 0 | RETURN_IF_ERROR(_engine.tablet_manager()->drop_tablet(tablet->tablet_id(), |
204 | 0 | tablet->replica_id(), false)); |
205 | 0 | tablet.reset(); |
206 | 0 | } |
207 | 0 | _is_new_tablet = tablet == nullptr; |
208 | | // try to incremental clone |
209 | 0 | Versions missed_versions; |
210 | | // try to repair a tablet with missing version |
211 | 0 | if (tablet != nullptr) { |
212 | 0 | std::shared_lock migration_rlock(tablet->get_migration_lock(), std::try_to_lock); |
213 | 0 | if (!migration_rlock.owns_lock()) { |
214 | 0 | return Status::Error<TRY_LOCK_FAILED>( |
215 | 0 | "EngineCloneTask::_do_clone meet try lock failed"); |
216 | 0 | } |
217 | 0 | if (tablet->replica_id() < _clone_req.replica_id) { |
218 | | // `tablet` may be a dropped replica in FE, e.g: |
219 | | // BE1 migrates replica of tablet_1 to BE2, but before BE1 drop this replica, another new replica of tablet_1 is migrated to BE1. |
220 | | // Clone can still continue in this case. But to keep `replica_id` consitent with FE, MUST reset `replica_id` with request `replica_id`. |
221 | 0 | tablet->tablet_meta()->set_replica_id(_clone_req.replica_id); |
222 | 0 | } |
223 | | |
224 | | // get download path |
225 | 0 | auto local_data_path = fmt::format("{}/{}", tablet->tablet_path(), CLONE_PREFIX); |
226 | 0 | bool allow_incremental_clone = false; |
227 | |
|
228 | 0 | int64_t specified_version = _clone_req.version; |
229 | 0 | if (tablet->enable_unique_key_merge_on_write()) { |
230 | 0 | int64_t min_pending_ver = _engine.get_pending_publish_min_version(tablet->tablet_id()); |
231 | 0 | if (min_pending_ver - 1 < specified_version) { |
232 | 0 | LOG(INFO) << "use min pending publish version for clone, min_pending_ver: " |
233 | 0 | << min_pending_ver << " visible_version: " << _clone_req.version; |
234 | 0 | specified_version = min_pending_ver - 1; |
235 | 0 | } |
236 | 0 | } |
237 | |
|
238 | 0 | missed_versions = tablet->get_missed_versions(specified_version); |
239 | | |
240 | | // if missed version size is 0, then it is useless to clone from remote be, it means local data is |
241 | | // completed. Or remote be will just return header not the rowset files. clone will failed. |
242 | 0 | if (missed_versions.empty()) { |
243 | 0 | LOG(INFO) << "missed version size = 0, skip clone and return success. tablet_id=" |
244 | 0 | << _clone_req.tablet_id << " replica_id=" << _clone_req.replica_id; |
245 | 0 | RETURN_IF_ERROR(_set_tablet_info()); |
246 | 0 | return Status::OK(); |
247 | 0 | } |
248 | | |
249 | 0 | LOG(INFO) << "clone to existed tablet. missed_versions_size=" << missed_versions.size() |
250 | 0 | << ", allow_incremental_clone=" << allow_incremental_clone |
251 | 0 | << ", signature=" << _signature << ", tablet_id=" << _clone_req.tablet_id |
252 | 0 | << ", visible_version=" << _clone_req.version |
253 | 0 | << ", replica_id=" << _clone_req.replica_id; |
254 | | |
255 | | // try to download missing version from src backend. |
256 | | // if tablet on src backend does not contains missing version, it will download all versions, |
257 | | // and set allow_incremental_clone to false |
258 | 0 | RETURN_IF_ERROR(_make_and_download_snapshots(*(tablet->data_dir()), local_data_path, |
259 | 0 | &src_host, &src_file_path, missed_versions, |
260 | 0 | &allow_incremental_clone)); |
261 | 0 | RETURN_IF_ERROR(_finish_clone(tablet.get(), local_data_path, specified_version, |
262 | 0 | allow_incremental_clone, copy_row_binlog)); |
263 | 0 | } else { |
264 | 0 | LOG(INFO) << "clone tablet not exist, begin clone a new tablet from remote be. " |
265 | 0 | << "signature=" << _signature << ", tablet_id=" << _clone_req.tablet_id |
266 | 0 | << ", visible_version=" << _clone_req.version |
267 | 0 | << ", req replica=" << _clone_req.replica_id; |
268 | | // create a new tablet in this be |
269 | | // Get local disk from olap |
270 | 0 | std::string local_shard_root_path; |
271 | 0 | DataDir* store = nullptr; |
272 | 0 | RETURN_IF_ERROR(_engine.obtain_shard_path(_clone_req.storage_medium, |
273 | 0 | _clone_req.dest_path_hash, &local_shard_root_path, |
274 | 0 | &store, _clone_req.partition_id)); |
275 | 0 | auto tablet_dir = fmt::format("{}/{}/{}", local_shard_root_path, _clone_req.tablet_id, |
276 | 0 | _clone_req.schema_hash); |
277 | |
|
278 | 0 | Defer remove_useless_dir {[&] { |
279 | 0 | if (status.ok()) { |
280 | 0 | return; |
281 | 0 | } |
282 | 0 | LOG(INFO) << "clone failed. want to delete local dir: " << tablet_dir |
283 | 0 | << ". signature: " << _signature; |
284 | 0 | WARN_IF_ERROR(io::global_local_filesystem()->delete_directory(tablet_dir), |
285 | 0 | "failed to delete useless clone dir "); |
286 | 0 | WARN_IF_ERROR(DataDir::delete_tablet_parent_path_if_empty(tablet_dir), |
287 | 0 | "failed to delete parent dir"); |
288 | 0 | }}; |
289 | |
|
290 | 0 | bool exists = true; |
291 | 0 | Status exists_st = io::global_local_filesystem()->exists(tablet_dir, &exists); |
292 | 0 | if (!exists_st) { |
293 | 0 | LOG(WARNING) << "cant get path=" << tablet_dir << " state, st=" << exists_st; |
294 | 0 | return exists_st; |
295 | 0 | } |
296 | 0 | if (exists) { |
297 | 0 | LOG(WARNING) << "before clone dest path=" << tablet_dir << " exist, remove it first"; |
298 | 0 | RETURN_IF_ERROR(io::global_local_filesystem()->delete_directory(tablet_dir)); |
299 | 0 | } |
300 | | |
301 | 0 | bool allow_incremental_clone = false; |
302 | 0 | RETURN_IF_ERROR_(status, |
303 | 0 | _make_and_download_snapshots(*store, tablet_dir, &src_host, &src_file_path, |
304 | 0 | missed_versions, &allow_incremental_clone)); |
305 | | |
306 | 0 | LOG(INFO) << "clone copy done. src_host: " << src_host.host |
307 | 0 | << " src_file_path: " << src_file_path; |
308 | 0 | auto tablet_manager = _engine.tablet_manager(); |
309 | 0 | RETURN_IF_ERROR_(status, tablet_manager->load_tablet_from_dir(store, _clone_req.tablet_id, |
310 | 0 | _clone_req.schema_hash, |
311 | 0 | tablet_dir, false)); |
312 | 0 | auto nested_tablet = tablet_manager->get_tablet(_clone_req.tablet_id); |
313 | 0 | if (!nested_tablet) { |
314 | 0 | status = Status::NotFound("tablet not found, tablet_id={}", _clone_req.tablet_id); |
315 | 0 | return status; |
316 | 0 | } |
317 | | // MUST reset `replica_id` to request `replica_id` to keep consistent with FE |
318 | 0 | nested_tablet->tablet_meta()->set_replica_id(_clone_req.replica_id); |
319 | | // clone success, delete .hdr file because tablet meta is stored in rocksdb |
320 | 0 | std::string header_path = |
321 | 0 | TabletMeta::construct_header_file_path(tablet_dir, _clone_req.tablet_id); |
322 | 0 | RETURN_IF_ERROR(io::global_local_filesystem()->delete_file(header_path)); |
323 | 0 | } |
324 | | |
325 | 0 | return _set_tablet_info(); |
326 | 0 | } |
327 | | |
328 | 0 | Status EngineCloneTask::_set_tablet_info() { |
329 | | // Get clone tablet info |
330 | 0 | TTabletInfo tablet_info; |
331 | 0 | tablet_info.__set_tablet_id(_clone_req.tablet_id); |
332 | 0 | tablet_info.__set_replica_id(_clone_req.replica_id); |
333 | 0 | tablet_info.__set_schema_hash(_clone_req.schema_hash); |
334 | 0 | RETURN_IF_ERROR(_engine.tablet_manager()->report_tablet_info(&tablet_info)); |
335 | 0 | if (_clone_req.__isset.version && tablet_info.version < _clone_req.version) { |
336 | | // if it is a new tablet and clone failed, then remove the tablet |
337 | | // if it is incremental clone, then must not drop the tablet |
338 | 0 | if (_is_new_tablet) { |
339 | | // we need to check if this cloned table's version is what we expect. |
340 | | // if not, maybe this is a stale remaining table which is waiting for drop. |
341 | | // we drop it. |
342 | 0 | LOG(WARNING) << "begin to drop the stale tablet. tablet_id:" << _clone_req.tablet_id |
343 | 0 | << ", replica_id:" << _clone_req.replica_id |
344 | 0 | << ", schema_hash:" << _clone_req.schema_hash |
345 | 0 | << ", signature:" << _signature << ", version:" << tablet_info.version |
346 | 0 | << ", expected_version: " << _clone_req.version; |
347 | 0 | WARN_IF_ERROR(_engine.tablet_manager()->drop_tablet(_clone_req.tablet_id, |
348 | 0 | _clone_req.replica_id, false), |
349 | 0 | "drop stale cloned table failed"); |
350 | 0 | } |
351 | 0 | return Status::InternalError("unexpected version. tablet version: {}, expected version: {}", |
352 | 0 | tablet_info.version, _clone_req.version); |
353 | 0 | } |
354 | 0 | LOG(INFO) << "clone get tablet info success. tablet_id:" << _clone_req.tablet_id |
355 | 0 | << ", schema_hash:" << _clone_req.schema_hash << ", signature:" << _signature |
356 | 0 | << ", replica id:" << _clone_req.replica_id << ", version:" << tablet_info.version; |
357 | 0 | _tablet_infos->push_back(tablet_info); |
358 | 0 | return Status::OK(); |
359 | 0 | } |
360 | | |
361 | | /// This method will do following things: |
362 | | /// 1. Make snapshots on source BE. |
363 | | /// 2. Download all snapshots to CLONE dir. |
364 | | /// 3. Convert rowset ids of downloaded snapshots(would also change the replica id). |
365 | | /// 4. Release the snapshots on source BE. |
366 | | Status EngineCloneTask::_make_and_download_snapshots(DataDir& data_dir, |
367 | | const std::string& local_data_path, |
368 | | TBackend* src_host, std::string* snapshot_path, |
369 | | const std::vector<Version>& missed_versions, |
370 | 0 | bool* allow_incremental_clone) { |
371 | 0 | Status status; |
372 | |
|
373 | 0 | const auto& token = _cluster_info->token; |
374 | |
|
375 | 0 | int timeout_s = 0; |
376 | 0 | if (_clone_req.__isset.timeout_s) { |
377 | 0 | timeout_s = _clone_req.timeout_s; |
378 | 0 | } |
379 | |
|
380 | 0 | for (auto&& src : _clone_req.src_backends) { |
381 | | // Make snapshot in remote olap engine |
382 | 0 | *src_host = src; |
383 | | // make snapshot |
384 | 0 | status = _make_snapshot(src.host, src.be_port, _clone_req.tablet_id, _clone_req.schema_hash, |
385 | 0 | timeout_s, missed_versions, snapshot_path, allow_incremental_clone); |
386 | 0 | if (!status.ok()) [[unlikely]] { |
387 | 0 | LOG_WARNING("failed to make snapshot in remote BE") |
388 | 0 | .tag("host", src.host) |
389 | 0 | .tag("port", src.be_port) |
390 | 0 | .tag("tablet", _clone_req.tablet_id) |
391 | 0 | .tag("signature", _signature) |
392 | 0 | .tag("missed_versions", missed_versions) |
393 | 0 | .error(status); |
394 | 0 | continue; // Try another BE |
395 | 0 | } |
396 | 0 | LOG_INFO("successfully make snapshot in remote BE") |
397 | 0 | .tag("host", src.host) |
398 | 0 | .tag("port", src.be_port) |
399 | 0 | .tag("tablet", _clone_req.tablet_id) |
400 | 0 | .tag("snapshot_path", *snapshot_path) |
401 | 0 | .tag("signature", _signature) |
402 | 0 | .tag("missed_versions", missed_versions); |
403 | 0 | Defer defer {[host = src.host, port = src.be_port, &snapshot_path = *snapshot_path, this] { |
404 | | // TODO(plat1ko): Async release snapshot |
405 | 0 | auto st = _release_snapshot(host, port, snapshot_path); |
406 | 0 | if (!st.ok()) [[unlikely]] { |
407 | 0 | LOG_WARNING("failed to release snapshot in remote BE") |
408 | 0 | .tag("host", host) |
409 | 0 | .tag("port", port) |
410 | 0 | .tag("snapshot_path", snapshot_path) |
411 | 0 | .error(st); |
412 | 0 | } |
413 | 0 | }}; |
414 | |
|
415 | 0 | std::string remote_dir; |
416 | 0 | { |
417 | 0 | std::stringstream ss; |
418 | 0 | if (snapshot_path->back() == '/') { |
419 | 0 | ss << *snapshot_path << _clone_req.tablet_id << "/" << _clone_req.schema_hash |
420 | 0 | << "/"; |
421 | 0 | } else { |
422 | 0 | ss << *snapshot_path << "/" << _clone_req.tablet_id << "/" << _clone_req.schema_hash |
423 | 0 | << "/"; |
424 | 0 | } |
425 | 0 | remote_dir = ss.str(); |
426 | 0 | } |
427 | |
|
428 | 0 | std::string address = get_host_port(src.host, src.http_port); |
429 | 0 | int32_t copy_type = |
430 | 0 | _clone_req.__isset.copy_type ? _clone_req.copy_type : TabletCopyType::DEFAULT; |
431 | 0 | RETURN_IF_ERROR(TabletCopyType::validate(copy_type)); |
432 | 0 | bool copy_row_binlog = TabletCopyType::has(copy_type, TTabletCopyType::ROW_BINLOG); |
433 | |
|
434 | 0 | if (config::enable_batch_download && is_support_batch_download(address).ok()) { |
435 | | // download files via batch api. |
436 | 0 | LOG_INFO("remote BE supports batch download, use batch file download") |
437 | 0 | .tag("address", address) |
438 | 0 | .tag("remote_dir", remote_dir); |
439 | 0 | status = _batch_download_files(&data_dir, address, remote_dir, local_data_path); |
440 | 0 | if (!status.ok()) [[unlikely]] { |
441 | 0 | LOG_WARNING("failed to download snapshot from remote BE in batch") |
442 | 0 | .tag("address", address) |
443 | 0 | .tag("remote_dir", remote_dir) |
444 | 0 | .error(status); |
445 | 0 | continue; // Try another BE |
446 | 0 | } |
447 | 0 | if (copy_row_binlog) { |
448 | 0 | std::string row_binlog_remote_dir = |
449 | 0 | fmt::format("{}{}/", remote_dir, FDRowBinlogSuffix); |
450 | 0 | std::string row_binlog_local_path = |
451 | 0 | fmt::format("{}/{}", local_data_path, FDRowBinlogSuffix); |
452 | 0 | status = _batch_download_files(&data_dir, address, row_binlog_remote_dir, |
453 | 0 | row_binlog_local_path); |
454 | 0 | if (!status.ok()) [[unlikely]] { |
455 | 0 | LOG_WARNING("failed to download row binlog snapshot from remote BE in batch") |
456 | 0 | .tag("address", address) |
457 | 0 | .tag("remote_dir", row_binlog_remote_dir) |
458 | 0 | .error(status); |
459 | 0 | continue; // Try another BE |
460 | 0 | } |
461 | 0 | } |
462 | 0 | } else { |
463 | 0 | if (config::enable_batch_download) { |
464 | 0 | LOG_INFO("remote BE does not support batch download, use single file download") |
465 | 0 | .tag("address", address) |
466 | 0 | .tag("remote_dir", remote_dir); |
467 | 0 | } else { |
468 | 0 | LOG_INFO("batch download is disabled, use single file download") |
469 | 0 | .tag("address", address) |
470 | 0 | .tag("remote_dir", remote_dir); |
471 | 0 | } |
472 | |
|
473 | 0 | std::string remote_url_prefix; |
474 | 0 | { |
475 | 0 | std::stringstream ss; |
476 | 0 | ss << "http://" << address << HTTP_REQUEST_PREFIX << HTTP_REQUEST_TOKEN_PARAM |
477 | 0 | << token << HTTP_REQUEST_FILE_PARAM << remote_dir; |
478 | 0 | remote_url_prefix = ss.str(); |
479 | 0 | } |
480 | |
|
481 | 0 | status = _download_files(&data_dir, remote_url_prefix, local_data_path); |
482 | 0 | if (!status.ok()) [[unlikely]] { |
483 | 0 | LOG_WARNING("failed to download snapshot from remote BE") |
484 | 0 | .tag("url", mask_token(remote_url_prefix)) |
485 | 0 | .error(status); |
486 | 0 | continue; // Try another BE |
487 | 0 | } |
488 | 0 | if (copy_row_binlog) { |
489 | 0 | std::string row_binlog_remote_url_prefix; |
490 | 0 | { |
491 | 0 | std::stringstream ss; |
492 | 0 | ss << "http://" << address << HTTP_REQUEST_PREFIX << HTTP_REQUEST_TOKEN_PARAM |
493 | 0 | << token << HTTP_REQUEST_FILE_PARAM << remote_dir << FDRowBinlogSuffix |
494 | 0 | << "/"; |
495 | 0 | row_binlog_remote_url_prefix = ss.str(); |
496 | 0 | } |
497 | 0 | std::string row_binlog_local_path = |
498 | 0 | fmt::format("{}/{}", local_data_path, FDRowBinlogSuffix); |
499 | 0 | status = _download_files(&data_dir, row_binlog_remote_url_prefix, |
500 | 0 | row_binlog_local_path); |
501 | 0 | if (!status.ok()) [[unlikely]] { |
502 | 0 | LOG_WARNING("failed to download row binlog snapshot from remote BE") |
503 | 0 | .tag("url", mask_token(row_binlog_remote_url_prefix)) |
504 | 0 | .error(status); |
505 | 0 | continue; // Try another BE |
506 | 0 | } |
507 | 0 | } |
508 | 0 | } |
509 | | |
510 | | // No need to try again with another BE |
511 | 0 | _pending_rs_guards = DORIS_TRY(_engine.snapshot_mgr()->convert_rowset_ids( |
512 | 0 | local_data_path, _clone_req.tablet_id, _clone_req.replica_id, _clone_req.table_id, |
513 | 0 | _clone_req.partition_id, _clone_req.schema_hash)); |
514 | 0 | break; |
515 | 0 | } // clone copy from one backend |
516 | 0 | return status; |
517 | 0 | } |
518 | | |
519 | | Status EngineCloneTask::_make_snapshot(const std::string& ip, int port, TTableId tablet_id, |
520 | | TSchemaHash schema_hash, int timeout_s, |
521 | | const std::vector<Version>& missed_versions, |
522 | 0 | std::string* snapshot_path, bool* allow_incremental_clone) { |
523 | 0 | TSnapshotRequest request; |
524 | 0 | request.__set_tablet_id(tablet_id); |
525 | 0 | request.__set_schema_hash(schema_hash); |
526 | 0 | request.__set_preferred_snapshot_version(g_Types_constants.TPREFER_SNAPSHOT_REQ_VERSION); |
527 | 0 | request.__set_version(_clone_req.version); |
528 | 0 | int32_t copy_type = |
529 | 0 | _clone_req.__isset.copy_type ? _clone_req.copy_type : TabletCopyType::DEFAULT; |
530 | 0 | RETURN_IF_ERROR(TabletCopyType::validate(copy_type)); |
531 | 0 | request.__set_copy_type(copy_type); |
532 | 0 | request.__set_is_copy_binlog(TabletCopyType::has(copy_type, TTabletCopyType::CCR_BINLOG)); |
533 | | // TODO: missing version composed of singleton delta. |
534 | | // if not, this place should be rewrote. |
535 | | // we make every TSnapshotRequest sent from be with __isset.missing_version = true |
536 | | // then if one be received one req with __isset.missing_version = false it means |
537 | | // this req is sent from FE(FE would never set this field) |
538 | 0 | request.__isset.missing_version = true; |
539 | 0 | for (auto& version : missed_versions) { |
540 | 0 | request.missing_version.push_back(version.first); |
541 | 0 | } |
542 | 0 | if (timeout_s > 0) { |
543 | 0 | request.__set_timeout(timeout_s); |
544 | 0 | } |
545 | |
|
546 | 0 | TAgentResult result; |
547 | 0 | RETURN_IF_ERROR(ThriftRpcHelper::rpc<BackendServiceClient>( |
548 | 0 | ip, port, [&request, &result](BackendServiceConnection& client) { |
549 | 0 | client->make_snapshot(result, request); |
550 | 0 | })); |
551 | 0 | if (result.status.status_code != TStatusCode::OK) { |
552 | 0 | return Status::create(result.status); |
553 | 0 | } |
554 | | |
555 | 0 | if (!result.__isset.snapshot_path) { |
556 | 0 | return Status::InternalError("success snapshot request without snapshot path"); |
557 | 0 | } |
558 | 0 | *snapshot_path = result.snapshot_path; |
559 | 0 | if (snapshot_path->at(snapshot_path->length() - 1) != '/') { |
560 | 0 | snapshot_path->append("/"); |
561 | 0 | } |
562 | |
|
563 | 0 | if (result.__isset.allow_incremental_clone) { |
564 | | // During upgrading, some BE nodes still be installed an old previous old. |
565 | | // which incremental clone is not ready in those nodes. |
566 | | // should add a symbol to indicate it. |
567 | 0 | *allow_incremental_clone = result.allow_incremental_clone; |
568 | 0 | } |
569 | 0 | return Status::OK(); |
570 | 0 | } |
571 | | |
572 | | Status EngineCloneTask::_release_snapshot(const std::string& ip, int port, |
573 | 0 | const std::string& snapshot_path) { |
574 | 0 | TAgentResult result; |
575 | 0 | RETURN_IF_ERROR(ThriftRpcHelper::rpc<BackendServiceClient>( |
576 | 0 | ip, port, [&snapshot_path, &result](BackendServiceConnection& client) { |
577 | 0 | client->release_snapshot(result, snapshot_path); |
578 | 0 | })); |
579 | 0 | return Status::create(result.status); |
580 | 0 | } |
581 | | |
582 | | Status EngineCloneTask::_download_files(DataDir* data_dir, const std::string& remote_url_prefix, |
583 | 0 | const std::string& local_path) { |
584 | | // Check local path exist, if exist, remove it, then create the dir |
585 | | // local_file_full_path = tabletid/clone, for a specific tablet, there should be only one folder |
586 | | // if this folder exists, then should remove it |
587 | | // for example, BE clone from BE 1 to download file 1 with version (2,2), but clone from BE 1 failed |
588 | | // then it will try to clone from BE 2, but it will find the file 1 already exist, but file 1 with same |
589 | | // name may have different versions. |
590 | 0 | RETURN_IF_ERROR(io::global_local_filesystem()->delete_directory(local_path)); |
591 | 0 | RETURN_IF_ERROR(io::global_local_filesystem()->create_directory(local_path)); |
592 | | |
593 | | // Get remote dir file list |
594 | 0 | std::string file_list_str; |
595 | 0 | auto list_files_cb = [&remote_url_prefix, &file_list_str](HttpClient* client) { |
596 | 0 | RETURN_IF_ERROR(client->init(remote_url_prefix)); |
597 | 0 | client->set_timeout_ms(LIST_REMOTE_FILE_TIMEOUT * 1000); |
598 | 0 | return client->execute(&file_list_str); |
599 | 0 | }; |
600 | 0 | RETURN_IF_ERROR(HttpClient::execute_with_retry(DOWNLOAD_FILE_MAX_RETRY, 1, list_files_cb)); |
601 | 0 | std::vector<std::string> file_name_list = |
602 | 0 | absl::StrSplit(file_list_str, "\n", absl::SkipWhitespace()); |
603 | | |
604 | | // If the header file is not exist, the table couldn't loaded by olap engine. |
605 | | // Avoid of data is not complete, we copy the header file at last. |
606 | | // The header file's name is end of .hdr. |
607 | 0 | for (int i = 0; i + 1 < file_name_list.size(); ++i) { |
608 | 0 | if (file_name_list[i].ends_with(".hdr")) { |
609 | 0 | std::swap(file_name_list[i], file_name_list[file_name_list.size() - 1]); |
610 | 0 | break; |
611 | 0 | } |
612 | 0 | } |
613 | | |
614 | | // Get copy from remote |
615 | 0 | uint64_t total_file_size = 0; |
616 | 0 | MonotonicStopWatch watch; |
617 | 0 | watch.start(); |
618 | 0 | for (auto& file_name : file_name_list) { |
619 | 0 | auto remote_file_url = remote_url_prefix + file_name; |
620 | | |
621 | | // get file length |
622 | 0 | uint64_t file_size = 0; |
623 | 0 | auto get_file_size_cb = [&remote_file_url, &file_size](HttpClient* client) { |
624 | 0 | RETURN_IF_ERROR(client->init(remote_file_url)); |
625 | 0 | client->set_timeout_ms(GET_LENGTH_TIMEOUT * 1000); |
626 | 0 | RETURN_IF_ERROR(client->head()); |
627 | 0 | RETURN_IF_ERROR(client->get_content_length(&file_size)); |
628 | 0 | return Status::OK(); |
629 | 0 | }; |
630 | 0 | RETURN_IF_ERROR( |
631 | 0 | HttpClient::execute_with_retry(DOWNLOAD_FILE_MAX_RETRY, 1, get_file_size_cb)); |
632 | | // check disk capacity |
633 | 0 | if (data_dir->reach_capacity_limit(file_size)) { |
634 | 0 | return Status::Error<EXCEEDED_LIMIT>( |
635 | 0 | "reach the capacity limit of path {}, file_size={}", data_dir->path(), |
636 | 0 | file_size); |
637 | 0 | } |
638 | | |
639 | 0 | total_file_size += file_size; |
640 | 0 | uint64_t estimate_timeout = file_size / config::download_low_speed_limit_kbps / 1024; |
641 | 0 | if (estimate_timeout < config::download_low_speed_time) { |
642 | 0 | estimate_timeout = config::download_low_speed_time; |
643 | 0 | } |
644 | |
|
645 | 0 | std::string local_file_path = local_path + "/" + file_name; |
646 | |
|
647 | 0 | LOG(INFO) << "clone begin to download file from: " << mask_token(remote_file_url) |
648 | 0 | << " to: " << local_file_path << ". size(B): " << file_size |
649 | 0 | << ", timeout(s): " << estimate_timeout; |
650 | |
|
651 | 0 | auto download_cb = [&remote_file_url, estimate_timeout, &local_file_path, |
652 | 0 | file_size](HttpClient* client) { |
653 | 0 | RETURN_IF_ERROR(client->init(remote_file_url)); |
654 | 0 | client->set_timeout_ms(estimate_timeout * 1000); |
655 | 0 | RETURN_IF_ERROR(client->download(local_file_path)); |
656 | | |
657 | 0 | std::error_code ec; |
658 | | // Check file length |
659 | 0 | uint64_t local_file_size = std::filesystem::file_size(local_file_path, ec); |
660 | 0 | if (ec) { |
661 | 0 | LOG(WARNING) << "download file error" << ec.message(); |
662 | 0 | return Status::IOError("can't retrive file_size of {}, due to {}", local_file_path, |
663 | 0 | ec.message()); |
664 | 0 | } |
665 | 0 | if (local_file_size != file_size) { |
666 | 0 | LOG(WARNING) << "download file length error" |
667 | 0 | << ", remote_path=" << mask_token(remote_file_url) |
668 | 0 | << ", file_size=" << file_size |
669 | 0 | << ", local_file_size=" << local_file_size; |
670 | 0 | return Status::InternalError("downloaded file size is not equal"); |
671 | 0 | } |
672 | 0 | return io::global_local_filesystem()->permission(local_file_path, |
673 | 0 | io::LocalFileSystem::PERMS_OWNER_RW); |
674 | 0 | }; |
675 | 0 | RETURN_IF_ERROR(HttpClient::execute_with_retry(DOWNLOAD_FILE_MAX_RETRY, 1, download_cb)); |
676 | 0 | } // Clone files from remote backend |
677 | | |
678 | 0 | uint64_t total_time_ms = watch.elapsed_time() / 1000 / 1000; |
679 | 0 | total_time_ms = total_time_ms > 0 ? total_time_ms : 0; |
680 | 0 | double copy_rate = 0.0; |
681 | 0 | if (total_time_ms > 0) { |
682 | 0 | copy_rate = total_file_size / ((double)total_time_ms) / 1000; |
683 | 0 | } |
684 | 0 | _copy_size = (int64_t)total_file_size; |
685 | 0 | _copy_time_ms = (int64_t)total_time_ms; |
686 | 0 | LOG(INFO) << "succeed to copy tablet " << _signature |
687 | 0 | << ", total files: " << file_name_list.size() |
688 | 0 | << ", total file size: " << total_file_size << " B, cost: " << total_time_ms << " ms" |
689 | 0 | << ", rate: " << copy_rate << " MB/s"; |
690 | 0 | return Status::OK(); |
691 | 0 | } |
692 | | |
693 | | Status EngineCloneTask::_batch_download_files(DataDir* data_dir, const std::string& address, |
694 | | const std::string& remote_dir, |
695 | 0 | const std::string& local_dir) { |
696 | 0 | constexpr size_t BATCH_FILE_SIZE = 64 << 20; // 64MB |
697 | 0 | constexpr size_t BATCH_FILE_NUM = 64; |
698 | | |
699 | | // Check local path exist, if exist, remove it, then create the dir |
700 | | // local_file_full_path = tabletid/clone, for a specific tablet, there should be only one folder |
701 | | // if this folder exists, then should remove it |
702 | | // for example, BE clone from BE 1 to download file 1 with version (2,2), but clone from BE 1 failed |
703 | | // then it will try to clone from BE 2, but it will find the file 1 already exist, but file 1 with same |
704 | | // name may have different versions. |
705 | 0 | RETURN_IF_ERROR(io::global_local_filesystem()->delete_directory(local_dir)); |
706 | 0 | RETURN_IF_ERROR(io::global_local_filesystem()->create_directory(local_dir)); |
707 | | |
708 | 0 | const std::string& token = _cluster_info->token; |
709 | 0 | std::vector<std::pair<std::string, size_t>> file_info_list; |
710 | 0 | RETURN_IF_ERROR(list_remote_files_v2(address, token, remote_dir, &file_info_list)); |
711 | | |
712 | | // If the header file is not exist, the table couldn't loaded by olap engine. |
713 | | // Avoid of data is not complete, we copy the header file at last. |
714 | | // The header file's name is end of .hdr. |
715 | 0 | for (int i = 0; i + 1 < file_info_list.size(); ++i) { |
716 | 0 | if (file_info_list[i].first.ends_with(".hdr")) { |
717 | 0 | std::swap(file_info_list[i], file_info_list[file_info_list.size() - 1]); |
718 | 0 | break; |
719 | 0 | } |
720 | 0 | } |
721 | |
|
722 | 0 | MonotonicStopWatch watch; |
723 | 0 | watch.start(); |
724 | |
|
725 | 0 | size_t total_file_size = 0; |
726 | 0 | size_t total_files = file_info_list.size(); |
727 | 0 | std::vector<std::pair<std::string, size_t>> batch_files; |
728 | 0 | for (size_t i = 0; i < total_files;) { |
729 | 0 | size_t batch_file_size = 0; |
730 | 0 | for (size_t j = i; j < total_files; j++) { |
731 | | // Split batchs by file number and file size, |
732 | 0 | if (BATCH_FILE_NUM <= batch_files.size() || BATCH_FILE_SIZE <= batch_file_size || |
733 | | // ... or separate the last .hdr file into a single batch. |
734 | 0 | (j + 1 == total_files && !batch_files.empty())) { |
735 | 0 | break; |
736 | 0 | } |
737 | 0 | batch_files.push_back(file_info_list[j]); |
738 | 0 | batch_file_size += file_info_list[j].second; |
739 | 0 | } |
740 | | |
741 | | // check disk capacity |
742 | 0 | if (data_dir->reach_capacity_limit(batch_file_size)) { |
743 | 0 | return Status::Error<EXCEEDED_LIMIT>( |
744 | 0 | "reach the capacity limit of path {}, file_size={}", data_dir->path(), |
745 | 0 | batch_file_size); |
746 | 0 | } |
747 | | |
748 | 0 | RETURN_IF_ERROR(download_files_v2(address, token, remote_dir, local_dir, batch_files)); |
749 | | |
750 | 0 | total_file_size += batch_file_size; |
751 | 0 | i += batch_files.size(); |
752 | 0 | batch_files.clear(); |
753 | 0 | } |
754 | | |
755 | 0 | uint64_t total_time_ms = watch.elapsed_time() / 1000 / 1000; |
756 | 0 | total_time_ms = total_time_ms > 0 ? total_time_ms : 0; |
757 | 0 | double copy_rate = 0.0; |
758 | 0 | if (total_time_ms > 0) { |
759 | 0 | copy_rate = total_file_size / ((double)total_time_ms) / 1000; |
760 | 0 | } |
761 | 0 | _copy_size = (int64_t)total_file_size; |
762 | 0 | _copy_time_ms = (int64_t)total_time_ms; |
763 | 0 | LOG(INFO) << "succeed to copy tablet " << _signature |
764 | 0 | << ", total files: " << file_info_list.size() |
765 | 0 | << ", total file size: " << total_file_size << " B, cost: " << total_time_ms << " ms" |
766 | 0 | << ", rate: " << copy_rate << " MB/s"; |
767 | |
|
768 | 0 | return Status::OK(); |
769 | 0 | } |
770 | | |
771 | | /// This method will only be called if tablet already exist in this BE when doing clone. |
772 | | /// This method will do the following things: |
773 | | /// 1. Link all files from CLONE dir to tablet dir if file does not exist in tablet dir |
774 | | /// 2. Call _finish_xx_clone() to revise the tablet meta. |
775 | | Status EngineCloneTask::_finish_clone(Tablet* tablet, const std::string& clone_dir, int64_t version, |
776 | 0 | bool is_incremental_clone, bool copy_row_binlog) { |
777 | 0 | Defer remove_clone_dir {[&]() { |
778 | 0 | std::error_code ec; |
779 | 0 | std::filesystem::remove_all(clone_dir, ec); |
780 | 0 | if (ec) { |
781 | 0 | LOG(WARNING) << "failed to remove=" << clone_dir << " msg=" << ec.message(); |
782 | 0 | } |
783 | 0 | }}; |
784 | | |
785 | | // check clone dir existed |
786 | 0 | bool exists = true; |
787 | 0 | RETURN_IF_ERROR(io::global_local_filesystem()->exists(clone_dir, &exists)); |
788 | 0 | if (!exists) { |
789 | 0 | return Status::InternalError("clone dir not existed. clone_dir={}", clone_dir); |
790 | 0 | } |
791 | | |
792 | | // Load src header. |
793 | | // The tablet meta info is downloaded from source BE as .hdr file. |
794 | | // So we load it and generate cloned_tablet_meta. |
795 | 0 | auto cloned_tablet_meta_file = fmt::format("{}/{}.hdr", clone_dir, tablet->tablet_id()); |
796 | 0 | auto cloned_tablet_meta = std::make_shared<TabletMeta>(); |
797 | 0 | RETURN_IF_ERROR(cloned_tablet_meta->create_from_file(cloned_tablet_meta_file)); |
798 | | |
799 | | // remove the cloned meta file |
800 | 0 | RETURN_IF_ERROR(io::global_local_filesystem()->delete_file(cloned_tablet_meta_file)); |
801 | | |
802 | | // remove rowset binlog metas |
803 | 0 | const auto& tablet_dir = tablet->tablet_path(); |
804 | 0 | auto binlog_metas_file = fmt::format("{}/rowset_binlog_metas.pb", clone_dir); |
805 | 0 | bool binlog_metas_file_exists = false; |
806 | 0 | auto file_exists_status = |
807 | 0 | io::global_local_filesystem()->exists(binlog_metas_file, &binlog_metas_file_exists); |
808 | 0 | if (!file_exists_status.ok()) { |
809 | 0 | return file_exists_status; |
810 | 0 | } |
811 | 0 | bool contain_binlog = false; |
812 | 0 | RowsetBinlogMetasPB rowset_binlog_metas_pb; |
813 | 0 | if (binlog_metas_file_exists) { |
814 | 0 | std::error_code ec; |
815 | 0 | auto binlog_meta_filesize = std::filesystem::file_size(binlog_metas_file, ec); |
816 | 0 | if (ec) { |
817 | 0 | LOG(WARNING) << "get file size error" << ec.message(); |
818 | 0 | return Status::IOError("can't retrive file_size of {}, due to {}", binlog_metas_file, |
819 | 0 | ec.message()); |
820 | 0 | } |
821 | 0 | if (binlog_meta_filesize > 0) { |
822 | 0 | contain_binlog = true; |
823 | 0 | RETURN_IF_ERROR(read_pb(binlog_metas_file, &rowset_binlog_metas_pb)); |
824 | 0 | } |
825 | 0 | RETURN_IF_ERROR(io::global_local_filesystem()->delete_file(binlog_metas_file)); |
826 | 0 | } |
827 | 0 | if (contain_binlog) { |
828 | 0 | auto binlog_dir = fmt::format("{}/_binlog", tablet_dir); |
829 | 0 | RETURN_IF_ERROR(io::global_local_filesystem()->create_directory(binlog_dir)); |
830 | 0 | } |
831 | | |
832 | 0 | enum class CloneFileType { DATA, ROW_BINLOG, CCR_BINLOG }; |
833 | | |
834 | | // check all files in /clone and /tablet |
835 | 0 | std::vector<io::FileInfo> clone_files; |
836 | 0 | RETURN_IF_ERROR(io::global_local_filesystem()->list(clone_dir, true, &clone_files, &exists)); |
837 | 0 | std::vector<std::pair<CloneFileType, std::string>> clone_file_names; |
838 | 0 | for (auto& file : clone_files) { |
839 | 0 | CloneFileType file_type = CloneFileType::DATA; |
840 | 0 | if (file.file_name.ends_with(".binlog") || file.file_name.ends_with(".binlog-index")) { |
841 | 0 | file_type = CloneFileType::CCR_BINLOG; |
842 | 0 | } |
843 | 0 | clone_file_names.emplace_back(file_type, file.file_name); |
844 | 0 | } |
845 | 0 | auto row_binlog_clone_dir = fmt::format("{}/{}", clone_dir, FDRowBinlogSuffix); |
846 | 0 | if (copy_row_binlog) { |
847 | 0 | clone_files.clear(); |
848 | 0 | RETURN_IF_ERROR(io::global_local_filesystem()->list(row_binlog_clone_dir, true, |
849 | 0 | &clone_files, &exists)); |
850 | 0 | if (!exists) { |
851 | 0 | return Status::InternalError("row binlog clone dir not existed. clone_dir={}", |
852 | 0 | row_binlog_clone_dir); |
853 | 0 | } |
854 | 0 | for (auto& file : clone_files) { |
855 | 0 | clone_file_names.emplace_back(CloneFileType::ROW_BINLOG, file.file_name); |
856 | 0 | } |
857 | 0 | } |
858 | | |
859 | 0 | std::vector<io::FileInfo> local_files; |
860 | 0 | RETURN_IF_ERROR(io::global_local_filesystem()->list(tablet_dir, true, &local_files, &exists)); |
861 | 0 | std::unordered_set<std::string> data_local_file_names; |
862 | 0 | for (auto& file : local_files) { |
863 | 0 | data_local_file_names.insert(file.file_name); |
864 | 0 | } |
865 | 0 | auto row_binlog_dir = tablet->row_binlog_path(); |
866 | 0 | std::unordered_set<std::string> row_binlog_local_file_names; |
867 | 0 | if (copy_row_binlog) { |
868 | 0 | RETURN_IF_ERROR(io::global_local_filesystem()->create_directory(row_binlog_dir)); |
869 | 0 | local_files.clear(); |
870 | 0 | RETURN_IF_ERROR( |
871 | 0 | io::global_local_filesystem()->list(row_binlog_dir, true, &local_files, &exists)); |
872 | 0 | for (auto& file : local_files) { |
873 | 0 | row_binlog_local_file_names.insert(file.file_name); |
874 | 0 | } |
875 | 0 | } |
876 | | |
877 | 0 | Status status; |
878 | 0 | std::vector<std::string> linked_success_files; |
879 | 0 | Defer remove_linked_files {[&]() { // clear linked files if errors happen |
880 | 0 | if (!status.ok()) { |
881 | 0 | std::vector<io::Path> paths; |
882 | 0 | for (auto& file : linked_success_files) { |
883 | 0 | paths.emplace_back(file); |
884 | 0 | } |
885 | 0 | static_cast<void>(io::global_local_filesystem()->batch_delete(paths)); |
886 | 0 | } |
887 | 0 | }}; |
888 | | /// Traverse all downloaded clone files in CLONE dir. |
889 | | /// If it does not exist in local tablet dir, link the file to local tablet dir |
890 | | /// And save all linked files in linked_success_files. |
891 | 0 | for (const auto& [clone_file_type, clone_file] : clone_file_names) { |
892 | 0 | if (clone_file_type == CloneFileType::DATA && |
893 | 0 | data_local_file_names.find(clone_file) != data_local_file_names.end()) { |
894 | 0 | VLOG_NOTICE << "find same file when clone, skip it. " |
895 | 0 | << "tablet=" << tablet->tablet_id() << ", clone_file=" << clone_file; |
896 | 0 | continue; |
897 | 0 | } |
898 | 0 | if (clone_file_type == CloneFileType::ROW_BINLOG && |
899 | 0 | row_binlog_local_file_names.find(clone_file) != row_binlog_local_file_names.end()) { |
900 | 0 | VLOG_NOTICE << "find same row binlog file when clone, skip it. " |
901 | 0 | << "tablet=" << tablet->tablet_id() << ", clone_file=" << clone_file; |
902 | 0 | continue; |
903 | 0 | } |
904 | | |
905 | | /// if binlog exist in clone dir and md5sum equal, then skip link file |
906 | 0 | bool skip_link_file = false; |
907 | 0 | std::string to; |
908 | 0 | if (clone_file_type == CloneFileType::CCR_BINLOG) { |
909 | 0 | if (!contain_binlog) { |
910 | 0 | LOG(WARNING) << "clone binlog file, but not contain binlog metas. " |
911 | 0 | << "tablet=" << tablet->tablet_id() << ", clone_file=" << clone_file; |
912 | 0 | break; |
913 | 0 | } |
914 | | |
915 | 0 | if (auto&& result = |
916 | 0 | check_dest_binlog_valid(tablet_dir, clone_dir, clone_file, &skip_link_file); |
917 | 0 | result) { |
918 | 0 | to = std::move(result.value()); |
919 | 0 | } else { |
920 | 0 | status = std::move(result.error()); |
921 | 0 | return status; |
922 | 0 | } |
923 | 0 | } else { |
924 | 0 | auto& local_dir = |
925 | 0 | clone_file_type == CloneFileType::ROW_BINLOG ? row_binlog_dir : tablet_dir; |
926 | 0 | to = fmt::format("{}/{}", local_dir, clone_file); |
927 | 0 | } |
928 | | |
929 | 0 | if (!skip_link_file) { |
930 | 0 | auto& clone_file_dir = |
931 | 0 | clone_file_type == CloneFileType::ROW_BINLOG ? row_binlog_clone_dir : clone_dir; |
932 | 0 | auto from = fmt::format("{}/{}", clone_file_dir, clone_file); |
933 | 0 | status = io::global_local_filesystem()->link_file(from, to); |
934 | 0 | if (!status.ok()) { |
935 | 0 | return status; |
936 | 0 | } |
937 | 0 | linked_success_files.emplace_back(std::move(to)); |
938 | 0 | } |
939 | 0 | } |
940 | 0 | if (contain_binlog) { |
941 | 0 | status = tablet->ingest_binlog_metas(&rowset_binlog_metas_pb); |
942 | 0 | if (!status.ok()) { |
943 | 0 | return status; |
944 | 0 | } |
945 | 0 | } |
946 | | |
947 | | // clone and compaction operation should be performed sequentially |
948 | 0 | std::lock_guard base_compaction_lock(tablet->get_base_compaction_lock()); |
949 | 0 | std::lock_guard cumulative_compaction_lock(tablet->get_cumulative_compaction_lock()); |
950 | 0 | std::lock_guard cold_compaction_lock(tablet->get_cold_compaction_lock()); |
951 | 0 | std::lock_guard build_inverted_index_lock(tablet->get_build_inverted_index_lock()); |
952 | 0 | std::lock_guard<std::mutex> push_lock(tablet->get_push_lock()); |
953 | 0 | std::lock_guard<std::mutex> rwlock(tablet->get_rowset_update_lock()); |
954 | 0 | std::lock_guard<std::shared_mutex> wrlock(tablet->get_header_lock()); |
955 | 0 | SCOPED_SIMPLE_TRACE_IF_TIMEOUT(TRACE_TABLET_LOCK_THRESHOLD); |
956 | 0 | if (is_incremental_clone) { |
957 | 0 | status = _finish_incremental_clone(tablet, cloned_tablet_meta, version, copy_row_binlog); |
958 | 0 | } else { |
959 | 0 | status = _finish_full_clone(tablet, cloned_tablet_meta, copy_row_binlog); |
960 | 0 | } |
961 | | |
962 | | // if full clone success, need to update cumulative layer point |
963 | 0 | if (!is_incremental_clone && status.ok()) { |
964 | 0 | tablet->set_cumulative_layer_point(Tablet::K_INVALID_CUMULATIVE_POINT); |
965 | 0 | } |
966 | | |
967 | | // clear clone dir |
968 | 0 | return status; |
969 | 0 | } |
970 | | |
971 | | /// This method will do: |
972 | | /// 1. Get missing version from local tablet again and check if they exist in cloned tablet. |
973 | | /// 2. Revise the local tablet meta to add all incremental cloned rowset's meta. |
974 | | Status EngineCloneTask::_finish_incremental_clone(Tablet* tablet, |
975 | | const TabletMetaSharedPtr& cloned_tablet_meta, |
976 | 0 | int64_t version, bool copy_row_binlog) { |
977 | 0 | LOG(INFO) << "begin to finish incremental clone. tablet=" << tablet->tablet_id() |
978 | 0 | << ", visible_version=" << version |
979 | 0 | << ", cloned_tablet_replica_id=" << cloned_tablet_meta->replica_id(); |
980 | | |
981 | | /// Get missing versions again from local tablet. |
982 | | /// We got it before outside the lock, so it has to be got again. |
983 | 0 | Versions missed_versions = tablet->get_missed_versions_unlocked(version); |
984 | 0 | VLOG_NOTICE << "get missed versions again when finish incremental clone. " |
985 | 0 | << "tablet=" << tablet->tablet_id() << ", clone version=" << version |
986 | 0 | << ", missed_versions_size=" << missed_versions.size(); |
987 | | |
988 | | // check missing versions exist in clone src |
989 | 0 | std::vector<RowsetSharedPtr> rowsets_to_clone; |
990 | 0 | for (Version nested_version : missed_versions) { |
991 | 0 | auto rs_meta = cloned_tablet_meta->acquire_rs_meta_by_version(nested_version); |
992 | 0 | if (rs_meta == nullptr) { |
993 | 0 | return Status::InternalError("missed version {} is not found in cloned tablet meta", |
994 | 0 | nested_version.to_string()); |
995 | 0 | } |
996 | 0 | RowsetSharedPtr rs; |
997 | 0 | RETURN_IF_ERROR(tablet->create_rowset(rs_meta, &rs)); |
998 | 0 | rowsets_to_clone.push_back(std::move(rs)); |
999 | 0 | if (copy_row_binlog) { |
1000 | 0 | rs_meta = cloned_tablet_meta->acquire_row_binlog_rs_meta_by_version(nested_version); |
1001 | 0 | if (rs_meta == nullptr) { |
1002 | 0 | return Status::InternalError( |
1003 | 0 | "missed version {} row binlog is not found in cloned tablet meta", |
1004 | 0 | nested_version.to_string()); |
1005 | 0 | } |
1006 | 0 | RETURN_IF_ERROR(tablet->create_rowset(rs_meta, &rs)); |
1007 | 0 | rowsets_to_clone.push_back(std::move(rs)); |
1008 | 0 | } |
1009 | 0 | } |
1010 | | |
1011 | 0 | if (copy_row_binlog && tablet->enable_unique_key_merge_on_write()) { |
1012 | 0 | tablet->tablet_meta()->binlog_delvec().merge(cloned_tablet_meta->binlog_delvec()); |
1013 | 0 | } |
1014 | | |
1015 | | /// clone_data to tablet |
1016 | | /// For incremental clone, nothing will be deleted. |
1017 | | /// So versions_to_delete is empty. |
1018 | 0 | return tablet->revise_tablet_meta(rowsets_to_clone, {}, true, copy_row_binlog); |
1019 | 0 | } |
1020 | | |
1021 | | /// This method will do: |
1022 | | /// 1. Compare the version of local tablet and cloned tablet to decide which version to keep |
1023 | | /// 2. Revise the local tablet meta |
1024 | | Status EngineCloneTask::_finish_full_clone(Tablet* tablet, |
1025 | | const TabletMetaSharedPtr& cloned_tablet_meta, |
1026 | 0 | bool copy_row_binlog) { |
1027 | 0 | Version cloned_max_version = cloned_tablet_meta->max_version(); |
1028 | 0 | LOG(INFO) << "begin to finish full clone. tablet=" << tablet->tablet_id() |
1029 | 0 | << ", cloned_max_version=" << cloned_max_version; |
1030 | | |
1031 | | // Compare the version of local tablet and cloned tablet. |
1032 | | // For example: |
1033 | | // clone version is 8 |
1034 | | // |
1035 | | // local tablet: [0-1] [2-5] [6-6] [7-7] [9-10] |
1036 | | // clone tablet: [0-1] [2-4] [5-6] [7-8] |
1037 | | // |
1038 | | // after compare, the version mark with "x" will be deleted |
1039 | | // |
1040 | | // local tablet: [0-1]x [2-5]x [6-6]x [7-7]x [9-10] |
1041 | | // clone tablet: [0-1] [2-4] [5-6] [7-8] |
1042 | |
|
1043 | 0 | std::vector<RowsetSharedPtr> to_delete; |
1044 | 0 | std::vector<RowsetSharedPtr> to_add; |
1045 | 0 | for (auto& [v, rs] : tablet->rowset_map()) { |
1046 | | // if local version cross src latest, clone failed |
1047 | | // if local version is : 0-0, 1-1, 2-10, 12-14, 15-15,16-16 |
1048 | | // cloned max version is 13-13, this clone is failed, because could not |
1049 | | // fill local data by using cloned data. |
1050 | | // It should not happen because if there is a hole, the following delta will not |
1051 | | // do compaction. |
1052 | 0 | if (v.first <= cloned_max_version.second && v.second > cloned_max_version.second) { |
1053 | 0 | return Status::InternalError( |
1054 | 0 | "version cross src latest. cloned_max_version={}, local_version={}", |
1055 | 0 | cloned_max_version.second, v.to_string()); |
1056 | 0 | } |
1057 | 0 | if (v.second <= cloned_max_version.second) { |
1058 | 0 | to_delete.push_back(rs); |
1059 | 0 | } else { |
1060 | | // cooldowned rowsets MUST be continuous, so rowsets whose version > missed version MUST be local rowset |
1061 | 0 | DCHECK(rs->is_local()); |
1062 | 0 | } |
1063 | 0 | } |
1064 | 0 | if (copy_row_binlog) { |
1065 | 0 | for (auto& [v, rs] : tablet->row_binlog_rowset_map()) { |
1066 | 0 | if (v.first <= cloned_max_version.second && v.second > cloned_max_version.second) { |
1067 | 0 | return Status::InternalError( |
1068 | 0 | "row binlog version cross src latest. cloned_max_version={}, " |
1069 | 0 | "local_version={}", |
1070 | 0 | cloned_max_version.second, v.to_string()); |
1071 | 0 | } |
1072 | 0 | if (v.second <= cloned_max_version.second) { |
1073 | 0 | to_delete.push_back(rs); |
1074 | 0 | } else { |
1075 | 0 | DCHECK(rs->is_local()); |
1076 | 0 | } |
1077 | 0 | } |
1078 | 0 | } |
1079 | | |
1080 | 0 | to_add.reserve(cloned_tablet_meta->all_rs_metas().size() + |
1081 | 0 | cloned_tablet_meta->all_row_binlog_rs_metas().size()); |
1082 | 0 | for (const auto& [_, rs_meta] : cloned_tablet_meta->all_rs_metas()) { |
1083 | 0 | RowsetSharedPtr rs; |
1084 | 0 | RETURN_IF_ERROR(tablet->create_rowset(rs_meta, &rs)); |
1085 | 0 | to_add.push_back(std::move(rs)); |
1086 | 0 | } |
1087 | 0 | if (copy_row_binlog) { |
1088 | 0 | for (const auto& [_, rs_meta] : cloned_tablet_meta->all_row_binlog_rs_metas()) { |
1089 | 0 | RowsetSharedPtr rs; |
1090 | 0 | RETURN_IF_ERROR(tablet->create_rowset(rs_meta, &rs)); |
1091 | 0 | to_add.push_back(std::move(rs)); |
1092 | 0 | } |
1093 | 0 | } |
1094 | 0 | { |
1095 | 0 | std::shared_lock cooldown_conf_rlock(tablet->get_cooldown_conf_lock()); |
1096 | 0 | if (tablet->cooldown_conf_unlocked().cooldown_replica_id == tablet->replica_id()) { |
1097 | | // If this replica is cooldown replica, MUST generate a new `cooldown_meta_id` to avoid use `cooldown_meta_id` |
1098 | | // generated in old cooldown term which may lead to such situation: |
1099 | | // Replica A is cooldown replica, cooldown_meta_id=2, |
1100 | | // Replica B: cooldown_replica=A, cooldown_meta_id=1 |
1101 | | // Replica A: full clone Replica A, cooldown_meta_id=1, but remote cooldown_meta is still with cooldown_meta_id=2 |
1102 | | // After tablet report. FE finds all replicas' cooldowned data is consistent |
1103 | | // Replica A: confirm_unused_remote_files, delete some cooldowned data of cooldown_meta_id=2 |
1104 | | // Replica B: follow_cooldown_data, cooldown_meta_id=2, data lost |
1105 | 0 | tablet->tablet_meta()->set_cooldown_meta_id(UniqueId::gen_uid()); |
1106 | 0 | } else { |
1107 | 0 | tablet->tablet_meta()->set_cooldown_meta_id(cloned_tablet_meta->cooldown_meta_id()); |
1108 | 0 | } |
1109 | 0 | } |
1110 | 0 | if (tablet->enable_unique_key_merge_on_write()) { |
1111 | 0 | tablet->tablet_meta()->delete_bitmap().merge(cloned_tablet_meta->delete_bitmap()); |
1112 | 0 | if (copy_row_binlog) { |
1113 | 0 | tablet->tablet_meta()->binlog_delvec().merge(cloned_tablet_meta->binlog_delvec()); |
1114 | 0 | } |
1115 | 0 | } |
1116 | 0 | return tablet->revise_tablet_meta(to_add, to_delete, false, copy_row_binlog); |
1117 | | // TODO(plat1ko): write cooldown meta to remote if this replica is cooldown replica |
1118 | 0 | } |
1119 | | } // namespace doris |