Coverage Report

Created: 2024-11-18 10:37

/root/doris/be/src/olap/tablet_manager.cpp
Line
Count
Source (jump to first uncovered line)
1
// Licensed to the Apache Software Foundation (ASF) under one
2
// or more contributor license agreements.  See the NOTICE file
3
// distributed with this work for additional information
4
// regarding copyright ownership.  The ASF licenses this file
5
// to you under the Apache License, Version 2.0 (the
6
// "License"); you may not use this file except in compliance
7
// with the License.  You may obtain a copy of the License at
8
//
9
//   http://www.apache.org/licenses/LICENSE-2.0
10
//
11
// Unless required by applicable law or agreed to in writing,
12
// software distributed under the License is distributed on an
13
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14
// KIND, either express or implied.  See the License for the
15
// specific language governing permissions and limitations
16
// under the License.
17
18
#include "olap/tablet_manager.h"
19
20
#include <fmt/format.h>
21
#include <gen_cpp/AgentService_types.h>
22
#include <gen_cpp/BackendService_types.h>
23
#include <gen_cpp/Descriptors_types.h>
24
#include <gen_cpp/MasterService_types.h>
25
#include <gen_cpp/Types_types.h>
26
#include <gen_cpp/olap_file.pb.h>
27
#include <re2/re2.h>
28
#include <unistd.h>
29
30
#include <algorithm>
31
#include <list>
32
#include <mutex>
33
#include <ostream>
34
35
// IWYU pragma: no_include <opentelemetry/common/threadlocal.h>
36
#include "common/compiler_util.h" // IWYU pragma: keep
37
#include "common/config.h"
38
#include "common/logging.h"
39
#include "gutil/integral_types.h"
40
#include "gutil/strings/strcat.h"
41
#include "gutil/strings/substitute.h"
42
#include "io/fs/local_file_system.h"
43
#include "olap/cumulative_compaction_time_series_policy.h"
44
#include "olap/data_dir.h"
45
#include "olap/olap_common.h"
46
#include "olap/olap_define.h"
47
#include "olap/olap_meta.h"
48
#include "olap/pb_helper.h"
49
#include "olap/rowset/beta_rowset.h"
50
#include "olap/rowset/rowset.h"
51
#include "olap/rowset/rowset_meta_manager.h"
52
#include "olap/storage_engine.h"
53
#include "olap/tablet.h"
54
#include "olap/tablet_meta.h"
55
#include "olap/tablet_meta_manager.h"
56
#include "olap/tablet_schema.h"
57
#include "olap/txn_manager.h"
58
#include "runtime/exec_env.h"
59
#include "runtime/memory/mem_tracker.h"
60
#include "runtime/thread_context.h"
61
#include "service/backend_options.h"
62
#include "util/defer_op.h"
63
#include "util/doris_metrics.h"
64
#include "util/histogram.h"
65
#include "util/metrics.h"
66
#include "util/path_util.h"
67
#include "util/scoped_cleanup.h"
68
#include "util/stopwatch.hpp"
69
#include "util/time.h"
70
#include "util/trace.h"
71
#include "util/uid_util.h"
72
73
namespace doris {
74
class CumulativeCompactionPolicy;
75
} // namespace doris
76
77
using std::map;
78
using std::set;
79
using std::string;
80
using std::vector;
81
82
namespace doris {
83
using namespace ErrorCode;
84
85
// Prototype of the `tablet_meta_mem_consumption` gauge metric: its unit is bytes and it
// carries the label type=tablet_meta. TabletManager's constructor registers the hook that
// supplies its value.
DEFINE_GAUGE_METRIC_PROTOTYPE_5ARG(tablet_meta_mem_consumption, MetricUnit::BYTES, "",
86
                                   mem_consumption, Labels({{"type", "tablet_meta"}}));
87
88
// Constructs the manager with `tablet_map_lock_shard_size` tablet-map shards.
//
// Creates the "TabletManager" and "TabletMeta" memory trackers (each constructed with
// ExecEnv's experimental_mem_tracker()), resizes the shard vector, and registers the
// value hook for the tablet_meta_mem_consumption metric.
//
// `tablet_map_lock_shard_size` must be greater than 0 and a power of two; both conditions
// are enforced with CHECKs below.
TabletManager::TabletManager(int32_t tablet_map_lock_shard_size)
89
        : _mem_tracker(std::make_shared<MemTracker>(
90
                  "TabletManager", ExecEnv::GetInstance()->experimental_mem_tracker())),
91
          _tablet_meta_mem_tracker(std::make_shared<MemTracker>(
92
                  "TabletMeta", ExecEnv::GetInstance()->experimental_mem_tracker())),
93
          _tablets_shards_size(tablet_map_lock_shard_size),
94
57
          _tablets_shards_mask(tablet_map_lock_shard_size - 1) {
95
57
    CHECK_GT(_tablets_shards_size, 0);
96
57
    // size & (size - 1) == 0 holds only for powers of two, so the mask keeps exactly the
    // low bits of a tablet id.
    CHECK_EQ(_tablets_shards_size & _tablets_shards_mask, 0);
97
57
    _tablets_shards.resize(_tablets_shards_size);
98
57
    // NOTE(review): the metric is labelled "tablet_meta" but the hook reports
    // _mem_tracker ("TabletManager"), not _tablet_meta_mem_tracker ("TabletMeta") -- confirm
    // this is intended.
    REGISTER_HOOK_METRIC(tablet_meta_mem_consumption,
99
57
                         [this]() { return _mem_tracker->consumption(); });
100
57
}
101
102
55
// Deregisters the tablet_meta_mem_consumption hook that the constructor registered; the
// hook's lambda captures `this`.
TabletManager::~TabletManager() {
103
55
    DEREGISTER_HOOK_METRIC(tablet_meta_mem_consumption);
104
55
}
105
106
// Adds `tablet` to the manager under `tablet_id`, resolving a collision with a tablet that
// is already registered under the same id.
//
// - No existing tablet: delegates to _add_tablet_to_map_unlocked (keep_files=false,
//   drop_old=false).
// - Existing tablet and `force` is false: returns ENGINE_INSERT_EXISTS_TABLE when the new
//   tablet has the same tablet path or the same data dir as the existing one.
// - Returns ENGINE_INSERT_EXISTS_TABLE when the new tablet has no rowset while an old
//   tablet exists.
// - Otherwise the new tablet replaces the old one when `force` is true, or when its max
//   end_version is higher, or equal with creation_time >= the old one's. If it is older it
//   is set to TABLET_SHUTDOWN, its meta is saved, it is appended to _shutdown_tablets and
//   ENGINE_INSERT_OLD_TABLET is returned.
//
// `update_meta` is forwarded to _add_tablet_to_map_unlocked. `profile` is dereferenced
// unconditionally, so it must not be null. This function takes no shard lock itself;
// presumably the caller holds the shard's write lock (create_tablet does) -- verify for
// other callers.
Status TabletManager::_add_tablet_unlocked(TTabletId tablet_id, const TabletSharedPtr& tablet,
107
26
                                           bool update_meta, bool force, RuntimeProfile* profile) {
108
26
    if (profile->get_counter("AddTablet") == nullptr) {
109
2
        ADD_TIMER(profile, "AddTablet");
110
2
    }
111
26
    Status res = Status::OK();
112
26
    VLOG_NOTICE << "begin to add tablet to TabletManager. "
113
0
                << "tablet_id=" << tablet_id << ", force=" << force;
114
115
26
    TabletSharedPtr existed_tablet = nullptr;
116
26
    tablet_map_t& tablet_map = _get_tablet_map(tablet_id);
117
26
    const auto& iter = tablet_map.find(tablet_id);
118
26
    if (iter != tablet_map.end()) {
119
2
        existed_tablet = iter->second;
120
2
    }
121
122
26
    if (existed_tablet == nullptr) {
123
24
        return _add_tablet_to_map_unlocked(tablet_id, tablet, update_meta, false /*keep_files*/,
124
24
                                           false /*drop_old*/, profile);
125
24
    }
126
    // During the restore process the tablet already exists: the snapshot loader replaces the
127
    // tablet's rowsets and then reloads the tablet, so the tablet path stays the same.
128
2
    if (!force) {
129
2
        if (existed_tablet->tablet_path() == tablet->tablet_path()) {
130
0
            return Status::Error<ENGINE_INSERT_EXISTS_TABLE>(
131
0
                    "add the same tablet twice! tablet_id={}, tablet_path={}", tablet_id,
132
0
                    tablet->tablet_path());
133
0
        }
134
2
        if (existed_tablet->data_dir() == tablet->data_dir()) {
135
0
            return Status::Error<ENGINE_INSERT_EXISTS_TABLE>(
136
0
                    "add tablet with same data dir twice! tablet_id={}", tablet_id);
137
0
        }
138
2
    }
139
140
2
    MonotonicStopWatch watch;
141
2
    watch.start();
142
143
    // During storage migration the tablet is moved to another disk, so we have to check
144
    // that the new tablet's rowset version is not behind the old one, to prevent losing data during
145
    // migration.
146
2
    int64_t old_time, new_time;
147
2
    int32_t old_version, new_version;
148
2
    {
149
2
        std::shared_lock rdlock(existed_tablet->get_header_lock());
150
2
        const RowsetSharedPtr old_rowset = existed_tablet->rowset_with_max_version();
151
2
        const RowsetSharedPtr new_rowset = tablet->rowset_with_max_version();
152
        // If the new tablet is empty, it is a newly created schema change tablet.
153
        // The old tablet is dropped before the add, so no old tablet should exist here.
154
2
        if (new_rowset == nullptr) {
155
            // it seems useless to call unlock and return here.
156
            // it could prevent error when log level is changed in the future.
157
0
            return Status::Error<ENGINE_INSERT_EXISTS_TABLE>(
158
0
                    "new tablet is empty and old tablet exists. it should not happen. tablet_id={}",
159
0
                    tablet_id);
160
0
        }
161
2
        old_time = old_rowset == nullptr ? -1 : old_rowset->creation_time();
162
2
        new_time = new_rowset->creation_time();
163
2
        old_version = old_rowset == nullptr ? -1 : old_rowset->end_version();
164
2
        new_version = new_rowset->end_version();
165
2
    }
166
2
    COUNTER_UPDATE(ADD_CHILD_TIMER(profile, "GetExistTabletVersion", "AddTablet"),
167
2
                   static_cast<int64_t>(watch.reset()));
168
169
    // In the restore process, we replace all original files in the tablet dir with
170
    // the downloaded snapshot files. Then we try to reload the tablet header.
171
    // force == true means we forcibly replace the Tablet in tablet_map
172
    // with the new one. But if we do so, the files in the tablet dir will be
173
    // dropped when the original Tablet is destructed.
174
    // So we set keep_files == true to not delete files when the
175
    // original Tablet is destructed.
176
    // During the restore process, the snapshot loader
177
    // replaced the old tablet's rowsets with new rowsets, but the tablet path is reused. If files were dropped
178
    // here, the new rowsets' files would also be dropped, so keep the files here.
179
2
    bool keep_files = force ? true : false;
180
2
    if (force ||
181
2
        (new_version > old_version || (new_version == old_version && new_time >= old_time))) {
182
        // check if new tablet's meta is in store and add new tablet's meta to meta store
183
2
        res = _add_tablet_to_map_unlocked(tablet_id, tablet, update_meta, keep_files,
184
2
                                          true /*drop_old*/, profile);
185
2
    } else {
186
0
        tablet->set_tablet_state(TABLET_SHUTDOWN);
187
0
        tablet->save_meta();
188
0
        COUNTER_UPDATE(ADD_CHILD_TIMER(profile, "SaveMeta", "AddTablet"),
189
0
                       static_cast<int64_t>(watch.reset()));
190
0
        {
191
0
            std::lock_guard<std::shared_mutex> shutdown_tablets_wrlock(_shutdown_tablets_lock);
192
0
            _shutdown_tablets.push_back(tablet);
193
0
        }
194
195
0
        res = Status::Error<ENGINE_INSERT_OLD_TABLET>(
196
0
                "set tablet to shutdown state. tablet_id={}, tablet_path={}", tablet->tablet_id(),
197
0
                tablet->tablet_path());
198
0
    }
199
2
    LOG(WARNING) << "add duplicated tablet. force=" << force << ", res=" << res
200
2
                 << ", tablet_id=" << tablet_id << ", old_version=" << old_version
201
2
                 << ", new_version=" << new_version << ", old_time=" << old_time
202
2
                 << ", new_time=" << new_time
203
2
                 << ", old_tablet_path=" << existed_tablet->tablet_path()
204
2
                 << ", new_tablet_path=" << tablet->tablet_path();
205
206
2
    return res;
207
2
}
208
209
// Registers `tablet` in the shard map under `tablet_id`.
//
// Steps, in order:
//   1. when `update_meta` is true, call tablet->save_meta();
//   2. when `drop_old` is true, drop the tablet currently mapped to `tablet_id` through
//      _drop_tablet_unlocked (replica_id 0, `keep_files` forwarded); a failed drop is
//      handed to RETURN_NOT_OK_STATUS_WITH_WARN;
//   3. register the tablet into its data dir, store it in the map, call
//      _add_tablet_to_partition, and charge _tablet_meta_mem_tracker with twice the
//      tablet meta's mem_size().
//
// The elapsed time of each step is recorded in `profile` as a child timer of "AddTablet".
// This function takes no lock itself.
Status TabletManager::_add_tablet_to_map_unlocked(TTabletId tablet_id,
210
                                                  const TabletSharedPtr& tablet, bool update_meta,
211
                                                  bool keep_files, bool drop_old,
212
26
                                                  RuntimeProfile* profile) {
213
    // check if new tablet's meta is in store and add new tablet's meta to meta store
214
26
    Status res = Status::OK();
215
26
    MonotonicStopWatch watch;
216
26
    watch.start();
217
26
    if (update_meta) {
218
        // call tablet save meta in order to validate the meta
219
26
        tablet->save_meta();
220
26
        COUNTER_UPDATE(ADD_CHILD_TIMER(profile, "SaveMeta", "AddTablet"),
221
26
                       static_cast<int64_t>(watch.reset()));
222
26
    }
223
26
    if (drop_old) {
224
        // If the new tablet is fresher than the existing one, then replace
225
        // the existing tablet with the new one.
226
        // Use the default replica_id (0) so the drop does not check whether replica ids match.
227
2
        Status status = _drop_tablet_unlocked(tablet_id, /* replica_id */ 0, keep_files, false);
228
2
        COUNTER_UPDATE(ADD_CHILD_TIMER(profile, "DropOldTablet", "AddTablet"),
229
2
                       static_cast<int64_t>(watch.reset()));
230
2
        RETURN_NOT_OK_STATUS_WITH_WARN(
231
2
                status, strings::Substitute("failed to drop old tablet when add new tablet. "
232
2
                                            "tablet_id=$0",
233
2
                                            tablet_id));
234
2
    }
235
    // Register tablet into DataDir, so that we can manage tablet from
236
    // the perspective of root path.
237
    // Example: unregister all tablets when a bad disk is found.
238
26
    tablet->register_tablet_into_dir();
239
26
    tablet_map_t& tablet_map = _get_tablet_map(tablet_id);
240
26
    tablet_map[tablet_id] = tablet;
241
26
    _add_tablet_to_partition(tablet);
242
    // TODO: remove multiply 2 of tablet meta mem size
243
    // Because table schema will copy in tablet, there will be double mem cost
244
    // so here multiply 2
245
26
    _tablet_meta_mem_tracker->consume(tablet->tablet_meta()->mem_size() * 2);
246
26
    COUNTER_UPDATE(ADD_CHILD_TIMER(profile, "RegisterTabletInfo", "AddTablet"),
247
26
                   static_cast<int64_t>(watch.reset()));
248
249
26
    VLOG_NOTICE << "add tablet to map successfully."
250
0
                << " tablet_id=" << tablet_id;
251
252
26
    return res;
253
26
}
254
255
0
// Returns true when `tablet_id` is present in its shard's tablet map. Holds that shard's
// lock in shared mode for the duration of the lookup.
bool TabletManager::check_tablet_id_exist(TTabletId tablet_id) {
256
0
    std::shared_lock rdlock(_get_tablets_shard_lock(tablet_id));
257
0
    return _check_tablet_id_exist_unlocked(tablet_id);
258
0
}
259
260
0
// Returns true when `tablet_id` is a key of the tablet map selected by _get_tablet_map.
// Takes no lock itself; check_tablet_id_exist calls it while holding the shard lock.
bool TabletManager::_check_tablet_id_exist_unlocked(TTabletId tablet_id) {
261
0
    tablet_map_t& tablet_map = _get_tablet_map(tablet_id);
262
0
    return tablet_map.find(tablet_id) != tablet_map.end();
263
0
}
264
265
// Creates the tablet described by `request` on one of the data dirs in `stores`.
//
// - Holds the lock of the shard that owns request.tablet_id exclusively for the rest of the
//   call. When the request names a base tablet (base_tablet_id > 0) whose shard differs,
//   _two_tablet_mtx is locked first and released right after the base tablet is fetched.
// - Returns OK without doing anything when a tablet with this id is already registered.
// - With a base tablet: returns TABLE_CREATE_META_ERROR if the base tablet cannot be found;
//   when the request is in restore mode or its storage medium equals the base tablet's,
//   `stores` is replaced by the base tablet's data dir.
// - Returns CE_CMD_PARAMS_ERROR when _internal_create_tablet_unlocked returns nullptr.
//
// `stores` is taken by value because it may be overwritten locally. Request and failure
// counters in DorisMetrics are incremented, and lock/lookup timings are added to `profile`.
Status TabletManager::create_tablet(const TCreateTabletReq& request, std::vector<DataDir*> stores,
266
27
                                    RuntimeProfile* profile) {
267
27
    SCOPED_CONSUME_MEM_TRACKER(_mem_tracker);
268
27
    DorisMetrics::instance()->create_tablet_requests_total->increment(1);
269
270
27
    int64_t tablet_id = request.tablet_id;
271
27
    LOG(INFO) << "begin to create tablet. tablet_id=" << tablet_id
272
27
              << ", table_id=" << request.table_id << ", partition_id=" << request.partition_id
273
27
              << ", replica_id=" << request.replica_id;
274
275
    // When we create rollup tablet A (assume on shard-1) from tablet B (assume on shard-2),
276
    // we need the write lock on shard-1 and then the read lock on shard-2.
277
    // If rollup tablet C (assume on shard-2) is created from tablet D (assume on shard-1) at the same time, the two requests would deadlock.
278
27
    std::unique_lock two_tablet_lock(_two_tablet_mtx, std::defer_lock);
279
27
    bool in_restore_mode = request.__isset.in_restore_mode && request.in_restore_mode;
280
27
    bool is_schema_change_or_atomic_restore =
281
27
            request.__isset.base_tablet_id && request.base_tablet_id > 0;
282
27
    bool need_two_lock =
283
27
            is_schema_change_or_atomic_restore &&
284
27
            ((_tablets_shards_mask & request.base_tablet_id) != (_tablets_shards_mask & tablet_id));
285
27
    if (need_two_lock) {
286
0
        SCOPED_TIMER(ADD_TIMER(profile, "GetTwoTableLock"));
287
0
        two_tablet_lock.lock();
288
0
    }
289
290
27
    MonotonicStopWatch shard_lock_watch;
291
27
    shard_lock_watch.start();
292
27
    std::lock_guard wrlock(_get_tablets_shard_lock(tablet_id));
293
27
    shard_lock_watch.stop();
294
27
    COUNTER_UPDATE(ADD_TIMER(profile, "GetShardLock"),
295
27
                   static_cast<int64_t>(shard_lock_watch.elapsed_time()));
296
    // Make create_tablet operation to be idempotent:
297
    // 1. Return true if tablet with same tablet_id and schema_hash exist;
298
    //           false if tablet with same tablet_id but different schema_hash exist.
299
    // 2. When this is an alter task, if the tablet(both tablet_id and schema_hash are
300
    // same) already exist, then just return true(a duplicate request). But if
301
    // tablet_id exist but with different schema_hash, return an error(report task will
302
    // eventually trigger its deletion).
    // NOTE(review): the check below looks the tablet up by tablet_id only and returns
    // Status::OK() on a hit; schema_hash is not compared here -- the description above
    // looks outdated, confirm.
303
27
    {
304
27
        SCOPED_TIMER(ADD_TIMER(profile, "GetTabletUnlocked"));
305
27
        if (_get_tablet_unlocked(tablet_id) != nullptr) {
306
3
            LOG(INFO) << "success to create tablet. tablet already exist. tablet_id=" << tablet_id;
307
3
            return Status::OK();
308
3
        }
309
27
    }
310
311
24
    TabletSharedPtr base_tablet = nullptr;
312
    // If the CreateTabletReq has base_tablet_id then it is an alter-tablet request
313
24
    if (is_schema_change_or_atomic_restore) {
314
        // If base_tablet_id's shard lock is different from new_tablet_id's, we need to lock it.
315
0
        if (need_two_lock) {
316
0
            SCOPED_TIMER(ADD_TIMER(profile, "GetBaseTablet"));
317
0
            base_tablet = get_tablet(request.base_tablet_id);
318
0
            two_tablet_lock.unlock();
319
0
        } else {
320
0
            SCOPED_TIMER(ADD_TIMER(profile, "GetBaseTabletUnlocked"));
321
0
            base_tablet = _get_tablet_unlocked(request.base_tablet_id);
322
0
        }
323
0
        if (base_tablet == nullptr) {
324
0
            DorisMetrics::instance()->create_tablet_requests_failed->increment(1);
325
0
            return Status::Error<TABLE_CREATE_META_ERROR>(
326
0
                    "fail to create tablet(change schema/atomic restore), base tablet does not "
327
0
                    "exist. new_tablet_id={}, base_tablet_id={}",
328
0
                    tablet_id, request.base_tablet_id);
329
0
        }
330
        // If we are doing schema-change or atomic-restore, we should use the same data dir
331
        // TODO(lingbin): A little trick here, the directory should be determined before
332
        // entering this method
333
        //
334
        // ATTN: Since all restored replicas will be saved to HDD, so no storage_medium check here.
335
0
        if (in_restore_mode ||
336
0
            request.storage_medium == base_tablet->data_dir()->storage_medium()) {
337
0
            LOG(INFO) << "create tablet use the base tablet data dir. tablet_id=" << tablet_id
338
0
                      << ", base tablet_id=" << request.base_tablet_id
339
0
                      << ", data dir=" << base_tablet->data_dir()->path();
340
0
            stores.clear();
341
0
            stores.push_back(base_tablet->data_dir());
342
0
        }
343
0
    }
344
345
    // set alter type to schema-change. it is useless
346
24
    TabletSharedPtr tablet = _internal_create_tablet_unlocked(
347
24
            request, is_schema_change_or_atomic_restore, base_tablet.get(), stores, profile);
348
24
    if (tablet == nullptr) {
349
0
        DorisMetrics::instance()->create_tablet_requests_failed->increment(1);
350
0
        return Status::Error<CE_CMD_PARAMS_ERROR>("fail to create tablet. tablet_id={}",
351
0
                                                  request.tablet_id);
352
0
    }
353
354
24
    LOG(INFO) << "success to create tablet. tablet_id=" << tablet_id;
355
24
    return Status::OK();
356
24
}
357
358
// Builds a new tablet for `request` and registers it in the manager.
//
// Sequence: create the tablet meta and directory (_create_tablet_meta_and_dir_unlocked),
// tablet->init(), tablet->create_initial_rowset(request.version), set TABLET_NOTREADY when
// `is_schema_change`, _add_tablet_unlocked(update_meta=true, force=false), then read the
// tablet back from the map. The first failing step aborts the sequence.
//
// Cleanup on failure: if the tablet had already been added it is dropped through
// _drop_tablet_unlocked; otherwise its files are deleted and its meta is removed with
// TabletMetaManager::remove.
//
// Returns the new tablet, or nullptr on any failure. `is_schema_change` and `base_tablet`
// must be both set or both unset (DCHECK). Timings are added to `profile` as children of
// "InternalCreateTablet". Takes no lock itself; create_tablet calls it while holding the
// shard's write lock.
TabletSharedPtr TabletManager::_internal_create_tablet_unlocked(
359
        const TCreateTabletReq& request, const bool is_schema_change, const Tablet* base_tablet,
360
24
        const std::vector<DataDir*>& data_dirs, RuntimeProfile* profile) {
361
    // If in schema-change state, base_tablet must also be provided.
362
    // i.e., is_schema_change and base_tablet are either assigned or not assigned
363
24
    DCHECK((is_schema_change && base_tablet) || (!is_schema_change && !base_tablet));
364
365
    // NOTE: The existence of tablet_id and schema_hash has already been checked,
366
    // no need check again here.
367
368
24
    const std::string parent_timer_name = "InternalCreateTablet";
369
24
    SCOPED_TIMER(ADD_TIMER(profile, parent_timer_name));
370
371
24
    MonotonicStopWatch watch;
372
24
    watch.start();
373
24
    auto create_meta_timer = ADD_CHILD_TIMER(profile, "CreateMeta", parent_timer_name);
374
24
    auto tablet = _create_tablet_meta_and_dir_unlocked(request, is_schema_change, base_tablet,
375
24
                                                       data_dirs, profile);
376
24
    COUNTER_UPDATE(create_meta_timer, static_cast<int64_t>(watch.reset()));
377
24
    if (tablet == nullptr) {
378
0
        return nullptr;
379
0
    }
380
381
24
    int64_t new_tablet_id = request.tablet_id;
382
24
    int32_t new_schema_hash = request.tablet_schema.schema_hash;
383
384
    // should remove the tablet's pending_id no matter create-tablet success or not
385
24
    DataDir* data_dir = tablet->data_dir();
386
387
    // TODO(yiguolei)
388
    // the following code is very difficult to understand because it mixed alter tablet v2
389
    // and alter tablet v1 should remove alter tablet v1 code after v0.12
390
24
    Status res = Status::OK();
391
24
    bool is_tablet_added = false;
392
24
    do {
393
24
        res = tablet->init();
394
24
        COUNTER_UPDATE(ADD_CHILD_TIMER(profile, "TabletInit", parent_timer_name),
395
24
                       static_cast<int64_t>(watch.reset()));
396
24
        if (!res.ok()) {
397
0
            LOG(WARNING) << "tablet init failed. tablet:" << tablet->full_name();
398
0
            break;
399
0
        }
400
401
        // Create init version if this is not a restore mode replica and request.version is set
402
        // bool in_restore_mode = request.__isset.in_restore_mode && request.in_restore_mode;
403
        // if (!in_restore_mode && request.__isset.version) {
404
        // create initial rowset before add it to storage engine could omit many locks
405
24
        res = tablet->create_initial_rowset(request.version);
406
24
        COUNTER_UPDATE(ADD_CHILD_TIMER(profile, "InitRowset", parent_timer_name),
407
24
                       static_cast<int64_t>(watch.reset()));
408
24
        if (!res.ok()) {
409
0
            LOG(WARNING) << "fail to create initial version for tablet. res=" << res;
410
0
            break;
411
0
        }
412
413
24
        if (is_schema_change) {
414
            // if this is a new alter tablet, has to set its state to not ready
415
            // because schema change handler depends on it to check whether history data
416
            // convert finished
417
0
            tablet->set_tablet_state(TabletState::TABLET_NOTREADY);
418
0
        }
419
        // Add tablet to StorageEngine will make it visible to user
420
        // Will persist tablet meta
421
24
        auto add_tablet_timer = ADD_CHILD_TIMER(profile, "AddTablet", parent_timer_name);
422
24
        res = _add_tablet_unlocked(new_tablet_id, tablet, /*update_meta*/ true, false, profile);
423
24
        COUNTER_UPDATE(add_tablet_timer, static_cast<int64_t>(watch.reset()));
424
24
        if (!res.ok()) {
425
0
            LOG(WARNING) << "fail to add tablet to StorageEngine. res=" << res;
426
0
            break;
427
0
        }
428
24
        is_tablet_added = true;
429
430
        // TODO(lingbin): The following logic seems useless, can be removed?
431
        // Because if _add_tablet_unlocked() return OK, we must can get it from map.
432
24
        TabletSharedPtr tablet_ptr = _get_tablet_unlocked(new_tablet_id);
433
24
        COUNTER_UPDATE(ADD_CHILD_TIMER(profile, "GetTablet", parent_timer_name),
434
24
                       static_cast<int64_t>(watch.reset()));
435
24
        if (tablet_ptr == nullptr) {
436
0
            // NOTE(review): `res` is still the OK status from _add_tablet_unlocked when it
            // is formatted into this message, so "res=" carries no failure detail.
            res = Status::Error<TABLE_NOT_FOUND>("fail to get tablet. res={}", res);
437
0
            break;
438
0
        }
439
24
    } while (false);
440
441
24
    if (res.ok()) {
442
24
        return tablet;
443
24
    }
444
    // something is wrong, we need clear environment
445
0
    if (is_tablet_added) {
446
0
        Status status = _drop_tablet_unlocked(new_tablet_id, request.replica_id, false, false);
447
0
        COUNTER_UPDATE(ADD_CHILD_TIMER(profile, "DropTablet", parent_timer_name),
448
0
                       static_cast<int64_t>(watch.reset()));
449
0
        if (!status.ok()) {
450
0
            // NOTE(review): this logs `res` (the create failure), not `status` (the drop
            // failure that triggered this branch) -- confirm which one is intended.
            LOG(WARNING) << "fail to drop tablet when create tablet failed. res=" << res;
451
0
        }
452
0
    } else {
453
0
        tablet->delete_all_files();
454
0
        TabletMetaManager::remove(data_dir, new_tablet_id, new_schema_hash);
455
0
        COUNTER_UPDATE(ADD_CHILD_TIMER(profile, "RemoveTabletFiles", parent_timer_name),
456
0
                       static_cast<int64_t>(watch.reset()));
457
0
    }
458
0
    return nullptr;
459
24
}
460
461
24
// Builds a tablet directory path by joining, in order, `dir`, DATA_PREFIX, the decimal
// `shard_id` and the decimal `tablet_id` with path_util::join_path_segments.
static string _gen_tablet_dir(const string& dir, int16_t shard_id, int64_t tablet_id) {
462
24
    string path = dir;
463
24
    path = path_util::join_path_segments(path, DATA_PREFIX);
464
24
    path = path_util::join_path_segments(path, std::to_string(shard_id));
465
24
    path = path_util::join_path_segments(path, std::to_string(tablet_id));
466
24
    return path;
467
24
}
468
469
// Tries the data dirs in `data_dirs` in order and returns a tablet created on the first
// one that works, or nullptr when none does.
//
// For each data dir: create the tablet meta (_create_tablet_meta_unlocked), derive the
// schema-hash directory <tablet dir>/<schema_hash>, and move on to the next data dir when
// meta creation fails, the existence check fails, the directory already exists, or the
// directory cannot be created. Otherwise the tablet is built from the meta with
// Tablet::create_tablet_from_meta and returned.
//
// Only the meta-creation failure and the "path exists" case are logged; failures of
// exists() and create_directory() are skipped silently. Timings are added to `profile`
// as children of "CreateMeta".
TabletSharedPtr TabletManager::_create_tablet_meta_and_dir_unlocked(
470
        const TCreateTabletReq& request, const bool is_schema_change, const Tablet* base_tablet,
471
24
        const std::vector<DataDir*>& data_dirs, RuntimeProfile* profile) {
472
24
    // NOTE(review): `pending_id` is not read anywhere else in this function.
    string pending_id = StrCat(TABLET_ID_PREFIX, request.tablet_id);
473
    // Many attempts are made here in the hope that even if a disk fails, it can still continue.
474
24
    std::string parent_timer_name = "CreateMeta";
475
24
    MonotonicStopWatch watch;
476
24
    watch.start();
477
24
    for (auto& data_dir : data_dirs) {
478
24
        // NOTE(review): no pending-id removal happens before this timer is updated; it only
        // captures the time since the previous watch.reset()/start().
        COUNTER_UPDATE(ADD_CHILD_TIMER(profile, "RemovePendingIds", parent_timer_name),
479
24
                       static_cast<int64_t>(watch.reset()));
480
481
24
        TabletMetaSharedPtr tablet_meta;
482
        // if create meta failed, do not need to clean dir, because it is only in memory
483
24
        Status res = _create_tablet_meta_unlocked(request, data_dir, is_schema_change, base_tablet,
484
24
                                                  &tablet_meta);
485
24
        COUNTER_UPDATE(ADD_CHILD_TIMER(profile, "CreateMetaUnlock", parent_timer_name),
486
24
                       static_cast<int64_t>(watch.reset()));
487
24
        if (!res.ok()) {
488
0
            LOG(WARNING) << "fail to create tablet meta. res=" << res
489
0
                         << ", root=" << data_dir->path();
490
0
            continue;
491
0
        }
492
493
24
        string tablet_dir =
494
24
                _gen_tablet_dir(data_dir->path(), tablet_meta->shard_id(), request.tablet_id);
495
24
        string schema_hash_dir = path_util::join_path_segments(
496
24
                tablet_dir, std::to_string(request.tablet_schema.schema_hash));
497
498
        // Because the tablet is removed asynchronously, the dir may still exist when BE
499
        // receives the create-tablet request again, for example on a retried schema-change request.
500
24
        bool exists = true;
501
24
        res = io::global_local_filesystem()->exists(schema_hash_dir, &exists);
502
24
        if (!res.ok()) {
503
0
            continue;
504
0
        }
505
24
        if (exists) {
506
0
            LOG(WARNING) << "skip this dir because tablet path exist, path=" << schema_hash_dir;
507
0
            continue;
508
24
        } else {
509
24
            Status st = io::global_local_filesystem()->create_directory(schema_hash_dir);
510
24
            if (!st.ok()) {
511
0
                continue;
512
0
            }
513
24
        }
514
515
24
        // A non-positive partition id is only warned about; creation continues.
        if (tablet_meta->partition_id() <= 0) {
516
1
            LOG(WARNING) << "invalid partition id " << tablet_meta->partition_id() << ", tablet "
517
1
                         << tablet_meta->tablet_id();
518
1
        }
519
520
24
        TabletSharedPtr new_tablet = Tablet::create_tablet_from_meta(tablet_meta, data_dir);
521
24
        COUNTER_UPDATE(ADD_CHILD_TIMER(profile, "CreateTabletFromMeta", parent_timer_name),
522
24
                       static_cast<int64_t>(watch.reset()));
523
24
        DCHECK(new_tablet != nullptr);
524
24
        return new_tablet;
525
24
    }
526
0
    return nullptr;
527
24
}
528
529
// Drops the tablet registered under `tablet_id`: locks the owning shard exclusively and
// forwards to _drop_tablet_unlocked with keep_files=false. `replica_id` and
// `is_drop_table_or_partition` are passed through unchanged.
Status TabletManager::drop_tablet(TTabletId tablet_id, TReplicaId replica_id,
530
22
                                  bool is_drop_table_or_partition) {
531
22
    auto& shard = _get_tablets_shard(tablet_id);
532
22
    std::lock_guard wrlock(shard.lock);
533
22
    return _drop_tablet_unlocked(tablet_id, replica_id, false, is_drop_table_or_partition);
534
22
}
535
536
// Drops the tablet registered under `tablet_id`.
//
// - Registers a "drop tablet" transition for `tablet_id` for the duration of the call; a
//   failure to register is returned to the caller.
// - Returns OK (after a warning) when no tablet is registered under `tablet_id`.
// - Returns Aborted when `replica_id` is non-zero and differs from the tablet's replica id.
// - Removes the tablet from its partition and from the shard map and clears its cache.
// - Unless `keep_files` is true: under the tablet's header lock, sets the state to
//   TABLET_SHUTDOWN, calls remove_all_remote_rowsets() when `is_drop_table_or_partition`
//   is true, saves the meta and appends the tablet to _shutdown_tablets.
// - Finally deregisters the tablet from its data dir and releases twice the meta's
//   mem_size() from _tablet_meta_mem_tracker.
//
// Takes no shard lock itself; drop_tablet and create_tablet call it with the shard's write
// lock held.
// NOTE(review): when remove_all_remote_rowsets() fails, RETURN_IF_ERROR returns after the
// tablet was already erased from the map but before deregister_tablet_from_dir() and the
// tracker release -- confirm this partial state is acceptable.
537
Status TabletManager::_drop_tablet_unlocked(TTabletId tablet_id, TReplicaId replica_id,
538
24
                                            bool keep_files, bool is_drop_table_or_partition) {
539
24
    LOG(INFO) << "begin drop tablet. tablet_id=" << tablet_id << ", replica_id=" << replica_id
540
24
              << ", is_drop_table_or_partition=" << is_drop_table_or_partition;
541
24
    DorisMetrics::instance()->drop_tablet_requests_total->increment(1);
542
543
24
    RETURN_IF_ERROR(register_transition_tablet(tablet_id, "drop tablet"));
544
24
    Defer defer {[&]() { unregister_transition_tablet(tablet_id, "drop tablet"); }};
545
546
    // Fetch tablet which need to be dropped
547
24
    TabletSharedPtr to_drop_tablet = _get_tablet_unlocked(tablet_id);
548
24
    if (to_drop_tablet == nullptr) {
549
1
        LOG(WARNING) << "fail to drop tablet because it does not exist. "
550
1
                     << "tablet_id=" << tablet_id;
551
1
        return Status::OK();
552
1
    }
553
554
    // We should compare replica id to avoid dropping new cloned tablet.
555
    // If the request replica id is 0, FE may be an older release, so we drop this tablet as before.
556
23
    if (to_drop_tablet->replica_id() != replica_id && replica_id != 0) {
557
0
        return Status::Aborted("replica_id not match({} vs {})", to_drop_tablet->replica_id(),
558
0
                               replica_id);
559
0
    }
560
561
23
    _remove_tablet_from_partition(to_drop_tablet);
562
23
    tablet_map_t& tablet_map = _get_tablet_map(tablet_id);
563
23
    tablet_map.erase(tablet_id);
564
23
    to_drop_tablet->clear_cache();
565
566
23
    if (!keep_files) {
567
        // drop tablet will update tablet meta, should lock
568
23
        std::lock_guard<std::shared_mutex> wrlock(to_drop_tablet->get_header_lock());
569
23
        SCOPED_SIMPLE_TRACE_IF_TIMEOUT(TRACE_TABLET_LOCK_THRESHOLD);
570
23
        LOG(INFO) << "set tablet to shutdown state and remove it from memory. "
571
23
                  << "tablet_id=" << tablet_id << ", tablet_path=" << to_drop_tablet->tablet_path();
572
        // NOTE: has to update tablet here, but must not update tablet meta directly.
573
        // because other thread may hold the tablet object, they may save meta too.
574
        // If update meta directly here, other thread may override the meta
575
        // and the tablet will be loaded at restart time.
576
        // To avoid this exception, we first set the state of the tablet to `SHUTDOWN`.
577
23
        to_drop_tablet->set_tablet_state(TABLET_SHUTDOWN);
578
        // We must record unused remote rowsets path info to OlapMeta before tablet state is marked as TABLET_SHUTDOWN in OlapMeta,
579
        // otherwise if BE shuts down after saving the tablet state, this remote rowsets path info will be lost.
580
23
        if (is_drop_table_or_partition) {
581
0
            RETURN_IF_ERROR(to_drop_tablet->remove_all_remote_rowsets());
582
0
        }
583
23
        to_drop_tablet->save_meta();
584
23
        {
585
23
            std::lock_guard<std::shared_mutex> wrdlock(_shutdown_tablets_lock);
586
23
            _shutdown_tablets.push_back(to_drop_tablet);
587
23
        }
588
23
    }
589
590
23
    to_drop_tablet->deregister_tablet_from_dir();
591
23
    _tablet_meta_mem_tracker->release(to_drop_tablet->tablet_meta()->mem_size() * 2);
592
23
    return Status::OK();
593
23
}
594
595
2.10k
// Looks up the tablet registered under `tablet_id` while holding the owning shard's lock in
// shared mode. `include_deleted` and `err` are forwarded unchanged to the three-argument
// _get_tablet_unlocked overload, whose result is returned.
TabletSharedPtr TabletManager::get_tablet(TTabletId tablet_id, bool include_deleted, string* err) {
596
2.10k
    std::shared_lock rdlock(_get_tablets_shard_lock(tablet_id));
597
2.10k
    return _get_tablet_unlocked(tablet_id, include_deleted, err);
598
2.10k
}
599
600
25
// Returns every tablet accepted by `filter`, collected through for_each_tablet. `filter`
// is moved into that call.
std::vector<TabletSharedPtr> TabletManager::get_all_tablet(std::function<bool(Tablet*)>&& filter) {
601
25
    std::vector<TabletSharedPtr> res;
602
25
    for_each_tablet([&](const TabletSharedPtr& tablet) { res.emplace_back(tablet); },
603
25
                    std::move(filter));
604
25
    return res;
605
25
}
606
607
void TabletManager::for_each_tablet(std::function<void(const TabletSharedPtr&)>&& handler,
608
143
                                    std::function<bool(Tablet*)>&& filter) {
609
143
    std::vector<TabletSharedPtr> tablets;
610
143
    for (const auto& tablets_shard : _tablets_shards) {
611
143
        tablets.clear();
612
143
        {
613
143
            std::shared_lock rdlock(tablets_shard.lock);
614
143
            for (const auto& [id, tablet] : tablets_shard.tablet_map) {
615
4
                if (filter(tablet.get())) {
616
4
                    tablets.emplace_back(tablet);
617
4
                }
618
4
            }
619
143
        }
620
143
        for (const auto& tablet : tablets) {
621
4
            handler(tablet);
622
4
        }
623
143
    }
624
143
}
625
626
std::pair<TabletSharedPtr, Status> TabletManager::get_tablet_and_status(TTabletId tablet_id,
627
0
                                                                        bool include_deleted) {
628
0
    std::string err;
629
0
    auto tablet = get_tablet(tablet_id, include_deleted, &err);
630
0
    if (tablet == nullptr) {
631
0
        return {tablet,
632
0
                Status::InternalError("failed to get tablet: {}, reason: {}", tablet_id, err)};
633
0
    }
634
635
0
    return {tablet, Status::OK()};
636
0
}
637
638
// Lookup that takes no shard lock itself (the get_tablet overloads acquire it before
// calling here).
//
// The tablet is first searched through the single-argument _get_tablet_unlocked. If that
// finds nothing and `include_deleted` is set, the list of shutdown tablets is scanned under
// _shutdown_tablets_lock (shared mode).
//
// Returns nullptr, and fills `*err` when `err` is non-null, if no tablet is found or if the
// tablet reports !is_used().
TabletSharedPtr TabletManager::_get_tablet_unlocked(TTabletId tablet_id, bool include_deleted,
                                                    string* err) {
    // Writes "<prefix><local host>" into *err when the caller asked for an error message.
    auto fill_err = [err](const char* prefix) {
        if (err != nullptr) {
            *err = prefix + BackendOptions::get_localhost();
        }
    };

    TabletSharedPtr result = _get_tablet_unlocked(tablet_id);
    if (result == nullptr && include_deleted) {
        std::shared_lock shutdown_guard(_shutdown_tablets_lock);
        for (auto& candidate : _shutdown_tablets) {
            CHECK(candidate != nullptr) << "deleted tablet is nullptr";
            if (candidate->tablet_id() != tablet_id) {
                continue;
            }
            result = candidate;
            break;
        }
    }

    if (result == nullptr) {
        fill_err("tablet does not exist. ");
        return nullptr;
    }

    if (result->is_used()) {
        return result;
    }

    LOG(WARNING) << "tablet cannot be used. tablet=" << tablet_id;
    fill_err("tablet cannot be used. ");
    return nullptr;
}
670
671
// Like get_tablet(tablet_id, include_deleted, err), but additionally requires the tablet's
// uid to equal `tablet_uid`. Returns nullptr when the tablet is missing or the uid differs.
TabletSharedPtr TabletManager::get_tablet(TTabletId tablet_id, TabletUid tablet_uid,
                                          bool include_deleted, string* err) {
    std::shared_lock read_guard(_get_tablets_shard_lock(tablet_id));
    TabletSharedPtr candidate = _get_tablet_unlocked(tablet_id, include_deleted, err);
    if (candidate == nullptr || !(candidate->tablet_uid() == tablet_uid)) {
        return nullptr;
    }
    return candidate;
}
680
681
0
uint64_t TabletManager::get_rowset_nums() {
682
0
    uint64_t rowset_nums = 0;
683
0
    for_each_tablet([&](const TabletSharedPtr& tablet) { rowset_nums += tablet->version_count(); },
684
0
                    filter_all_tablets);
685
0
    return rowset_nums;
686
0
}
687
688
0
uint64_t TabletManager::get_segment_nums() {
689
0
    uint64_t segment_nums = 0;
690
0
    for_each_tablet([&](const TabletSharedPtr& tablet) { segment_nums += tablet->segment_count(); },
691
0
                    filter_all_tablets);
692
0
    return segment_nums;
693
0
}
694
695
bool TabletManager::get_tablet_id_and_schema_hash_from_path(const string& path,
696
                                                            TTabletId* tablet_id,
697
11
                                                            TSchemaHash* schema_hash) {
698
    // the path like: /data/14/10080/964828783/
699
11
    static re2::RE2 normal_re("/data/\\d+/(\\d+)/(\\d+)($|/)");
700
    // match tablet schema hash data path, for example, the path is /data/1/16791/29998
701
    // 1 is shard id , 16791 is tablet id, 29998 is schema hash
702
11
    if (RE2::PartialMatch(path, normal_re, tablet_id, schema_hash)) {
703
7
        return true;
704
7
    }
705
706
    // If we can't match normal path pattern, this may be a path which is a empty tablet
707
    // directory. Use this pattern to match empty tablet directory. In this case schema_hash
708
    // will be set to zero.
709
4
    static re2::RE2 empty_tablet_re("/data/\\d+/(\\d+)($|/$)");
710
4
    if (!RE2::PartialMatch(path, empty_tablet_re, tablet_id)) {
711
2
        return false;
712
2
    }
713
2
    *schema_hash = 0;
714
2
    return true;
715
4
}
716
717
4
bool TabletManager::get_rowset_id_from_path(const string& path, RowsetId* rowset_id) {
718
    // the path like: /data/14/10080/964828783/02000000000000969144d8725cb62765f9af6cd3125d5a91_0.dat
719
4
    static re2::RE2 re("/data/\\d+/\\d+/\\d+/([A-Fa-f0-9]+)_.*");
720
4
    string id_str;
721
4
    bool ret = RE2::PartialMatch(path, re, &id_str);
722
4
    if (ret) {
723
2
        rowset_id->init(id_str);
724
2
        return true;
725
2
    }
726
2
    return false;
727
4
}
728
729
0
// Copies the cached tablet statistics list into `result`.
// Only the shared_ptr to the cache is copied under _tablet_stat_cache_mutex; the vector
// itself is read after the mutex has been released.
void TabletManager::get_tablet_stat(TTabletStatResult* result) {
    const std::shared_ptr<std::vector<TTabletStat>> snapshot = [this] {
        std::lock_guard<std::mutex> guard(_tablet_stat_cache_mutex);
        return _tablet_stat_list_cache;
    }();
    result->__set_tablet_stat_list(*snapshot);
}
737
738
// Scans all tablets and returns the one with the highest compaction score for the given
// compaction type, or nullptr if no tablet qualifies.
//
// compaction_type:                    base or cumulative compaction.
// data_dir:                           its path_hash() is passed to can_do_compaction().
// tablet_submitted_compaction:        ids of tablets that are skipped.
// score:                              receives the winning score; only written when a tablet
//                                     is returned.
// all_cumulative_compaction_policies: policy lookup keyed by the tablet's compaction policy
//                                     name (accessed with at()).
TabletSharedPtr TabletManager::find_best_tablet_to_compaction(
        CompactionType compaction_type, DataDir* data_dir,
        const std::unordered_set<TTabletId>& tablet_submitted_compaction, uint32_t* score,
        const std::unordered_map<std::string_view, std::shared_ptr<CumulativeCompactionPolicy>>&
                all_cumulative_compaction_policies) {
    const int64_t now_ms = UnixMillis();
    const bool is_base = compaction_type == CompactionType::BASE_COMPACTION;
    const string compaction_type_str = is_base ? "base" : "cumulative";
    // Best score seen so far; a candidate must be strictly greater to replace the winner.
    uint32_t top_score = 0;
    TabletSharedPtr best_tablet;

    auto consider = [&](const TabletSharedPtr& candidate) {
        if (candidate->tablet_meta()->tablet_schema()->disable_auto_compaction()) {
            LOG_EVERY_N(INFO, 500) << "Tablet " << candidate->tablet_id()
                                   << " will be ignored by automatic compaction tasks since it's "
                                   << "set to disabled automatic compaction.";
            return;
        }

        if (config::enable_skip_tablet_compaction &&
            candidate->should_skip_compaction(compaction_type, UnixSeconds())) {
            return;
        }
        if (!candidate->can_do_compaction(data_dir->path_hash(), compaction_type)) {
            return;
        }

        if (tablet_submitted_compaction.count(candidate->tablet_id()) != 0) {
            return;
        }

        int64_t last_failure_ms = candidate->last_cumu_compaction_failure_time();
        if (is_base) {
            last_failure_ms = candidate->last_base_compaction_failure_time();
        }
        // Skip tablets whose last failure of this compaction type is at most 5000 ms old.
        if (now_ms - last_failure_ms <= 5000) {
            VLOG_DEBUG << "Too often to check compaction, skip it. "
                       << "compaction_type=" << compaction_type_str
                       << ", last_failure_time_ms=" << last_failure_ms
                       << ", tablet_id=" << candidate->tablet_id();
            return;
        }

        // Probe the matching compaction mutex. The lock is dropped again as soon as this
        // inner scope ends, i.e. before the score is calculated.
        {
            std::mutex& compaction_mutex = is_base ? candidate->get_base_compaction_lock()
                                                   : candidate->get_cumulative_compaction_lock();
            std::unique_lock<std::mutex> probe(compaction_mutex, std::try_to_lock);
            if (!probe.owns_lock()) {
                LOG(INFO) << (is_base ? "can not get base lock: " : "can not get cumu lock: ")
                          << candidate->tablet_id();
                return;
            }
        }

        auto policy = all_cumulative_compaction_policies.at(
                candidate->tablet_meta()->compaction_policy());
        const uint32_t candidate_score = candidate->calc_compaction_score(compaction_type, policy);
        if (candidate_score < 5) {
            candidate->set_skip_compaction(true, compaction_type, UnixSeconds());
        }
        if (candidate_score > top_score) {
            top_score = candidate_score;
            best_tablet = candidate;
        }
    };

    for_each_tablet(consider, filter_all_tablets);
    if (best_tablet == nullptr) {
        return best_tablet;
    }

    VLOG_CRITICAL << "Found the best tablet for compaction. "
                  << "compaction_type=" << compaction_type_str
                  << ", tablet_id=" << best_tablet->tablet_id() << ", path=" << data_dir->path()
                  << ", compaction_score=" << top_score
                  << ", highest_score=" << top_score;
    *score = top_score;
    return best_tablet;
}
822
823
// Builds a tablet from a serialized tablet meta and registers it in the tablet map.
//
// data_dir:    data dir the tablet belongs to.
// tablet_id / schema_hash: expected identity; the deserialized meta must match both.
// meta_binary: serialized tablet meta.
// update_meta / force: forwarded to _add_tablet_unlocked.
// restore:     when true the meta state is reset to TABLET_RUNNING before the tablet is
//              created (restoring from trash).
// check_path:  when true the tablet directory must exist on the local filesystem.
//
// Returns an error Status when the meta cannot be parsed or is inconsistent, when the tablet
// path is missing, when the tablet is in SHUTDOWN state (it is then queued in
// _shutdown_tablets), when a RUNNING tablet has no version, or when init/add fails.
Status TabletManager::load_tablet_from_meta(DataDir* data_dir, TTabletId tablet_id,
                                            TSchemaHash schema_hash, const string& meta_binary,
                                            bool update_meta, bool force, bool restore,
                                            bool check_path) {
    SCOPED_CONSUME_MEM_TRACKER(_mem_tracker);
    TabletMetaSharedPtr tablet_meta(new TabletMeta());
    if (Status parse_st = tablet_meta->deserialize(meta_binary); !parse_st.ok()) {
        return Status::Error<HEADER_PB_PARSE_FAILED>(
                "fail to load tablet because can not parse meta_binary string. tablet_id={}, "
                "schema_hash={}, path={}, status={}",
                tablet_id, schema_hash, data_dir->path(), parse_st);
    }
    tablet_meta->init_rs_metas_fs(data_dir->fs());

    // The meta must describe exactly the tablet the caller asked for.
    const bool identity_matches =
            tablet_meta->tablet_id() == tablet_id && tablet_meta->schema_hash() == schema_hash;
    if (!identity_matches) {
        return Status::Error<HEADER_PB_PARSE_FAILED>(
                "fail to load tablet because meet invalid tablet meta. trying to load "
                "tablet(tablet_id={}, schema_hash={}), but meet tablet={}, path={}",
                tablet_id, schema_hash, tablet_meta->full_name(), data_dir->path());
    }
    if (tablet_meta->tablet_uid().hi == 0 && tablet_meta->tablet_uid().lo == 0) {
        return Status::Error<HEADER_PB_PARSE_FAILED>(
                "fail to load tablet because its uid == 0. tablet={}, path={}",
                tablet_meta->full_name(), data_dir->path());
    }

    if (restore) {
        // Restoring from trash: the state goes from shutdown back to running.
        tablet_meta->set_tablet_state(TABLET_RUNNING);
    }

    if (tablet_meta->partition_id() == 0) {
        LOG(WARNING) << "tablet=" << tablet_id << " load from meta but partition id eq 0";
    }

    TabletSharedPtr tablet = Tablet::create_tablet_from_meta(tablet_meta, data_dir);
    if (tablet == nullptr) {
        return Status::Error<TABLE_CREATE_FROM_HEADER_ERROR>(
                "fail to load tablet. tablet_id={}, schema_hash={}", tablet_id, schema_hash);
    }

    // This method is reached in two situations:
    //   1. BE start: no path check is needed, the tablet meta state alone tells whether the
    //      tablet has been deleted.
    //   2. Clone task / restore: a freshly copied tablet directory may have been removed by
    //      the gc thread (see perform_path_gc_by_tablet) because its meta was not loaded into
    //      memory yet, so the path is verified and the task fails and retries if it is gone.
    if (check_path) {
        bool path_exists = true;
        RETURN_IF_ERROR(io::global_local_filesystem()->exists(tablet->tablet_path(), &path_exists));
        if (!path_exists) {
            return Status::Error<TABLE_ALREADY_DELETED_ERROR>(
                    "tablet path not exists, create tablet failed, path={}", tablet->tablet_path());
        }
    }

    if (tablet_meta->tablet_state() == TABLET_SHUTDOWN) {
        {
            std::lock_guard<std::shared_mutex> shutdown_guard(_shutdown_tablets_lock);
            _shutdown_tablets.push_back(tablet);
        }
        return Status::Error<TABLE_ALREADY_DELETED_ERROR>(
                "fail to load tablet because it is to be deleted. tablet_id={}, schema_hash={}, "
                "path={}",
                tablet_id, schema_hash, data_dir->path());
    }

    // The tablet's initial version is deliberately not checked: if BE restarts while a
    // tablet is doing schema change, an empty tablet may legitimately show up here.
    const bool running_without_delta =
            tablet->max_version().first == -1 && tablet->tablet_state() == TABLET_RUNNING;
    if (running_without_delta) {
        // Invalid state; reject the tablet.
        return Status::Error<TABLE_INDEX_VALIDATE_ERROR>(
                "fail to load tablet. it is in running state but without delta. tablet={}, path={}",
                tablet->full_name(), data_dir->path());
    }

    RETURN_NOT_OK_STATUS_WITH_WARN(
            tablet->init(),
            strings::Substitute("tablet init failed. tablet=$0", tablet->full_name()));

    RuntimeProfile profile("CreateTablet");
    std::lock_guard<std::shared_mutex> shard_guard(_get_tablets_shard_lock(tablet_id));
    RETURN_NOT_OK_STATUS_WITH_WARN(
            _add_tablet_unlocked(tablet_id, tablet, update_meta, force, &profile),
            strings::Substitute("fail to add tablet. tablet=$0", tablet->full_name()));

    return Status::OK();
}
914
915
// Loads a tablet from a schema-hash directory that already contains its header (.hdr) file.
//
// store:            data dir that will own the tablet.
// tablet_id / schema_hash: identity of the tablet to load.
// schema_hash_path: directory holding the tablet files; the shard id is derived from its
//                   grand-parent directory name.
// force / restore:  forwarded to load_tablet_from_meta.
//
// Steps:
//   1. read the tablet meta from the header file;
//   2. if a non-empty rowset_binlog_metas.pb is present, move the *.binlog / *.binlog-index
//      files into <schema_hash_path>/_binlog (renamed to *.dat / *.idx) and ingest the binlog
//      metas; the .pb file is deleted whenever it exists;
//   3. overwrite the shard id and assign a freshly generated tablet uid;
//   4. hand the re-serialized meta to load_tablet_from_meta (update_meta = true,
//      check_path = true).
//
// Returns an error Status if any step fails.
Status TabletManager::load_tablet_from_dir(DataDir* store, TTabletId tablet_id,
                                           SchemaHash schema_hash, const string& schema_hash_path,
                                           bool force, bool restore) {
    SCOPED_CONSUME_MEM_TRACKER(_mem_tracker);
    LOG(INFO) << "begin to load tablet from dir. "
              << " tablet_id=" << tablet_id << " schema_hash=" << schema_hash
              << " path = " << schema_hash_path << " force = " << force << " restore = " << restore;
    // not add lock here, because load_tablet_from_meta already add lock
    std::string header_path = TabletMeta::construct_header_file_path(schema_hash_path, tablet_id);
    // should change shard id before load tablet
    std::string shard_path =
            path_util::dir_name(path_util::dir_name(path_util::dir_name(header_path)));
    std::string shard_str = shard_path.substr(shard_path.find_last_of('/') + 1);
    int32_t shard = stol(shard_str);

    bool exists = false;
    RETURN_IF_ERROR(io::global_local_filesystem()->exists(header_path, &exists));
    if (!exists) {
        return Status::Error<FILE_NOT_EXIST>("fail to find header file. [header_path={}]",
                                             header_path);
    }

    TabletMetaSharedPtr tablet_meta(new TabletMeta());
    if (!tablet_meta->create_from_file(header_path).ok()) {
        return Status::Error<ENGINE_LOAD_INDEX_TABLE_ERROR>(
                "fail to load tablet_meta. file_path={}", header_path);
    }
    TabletUid tablet_uid = TabletUid::gen_uid();

    // remove rowset binlog metas
    auto binlog_metas_file = fmt::format("{}/rowset_binlog_metas.pb", schema_hash_path);
    bool binlog_metas_file_exists = false;
    auto file_exists_status =
            io::global_local_filesystem()->exists(binlog_metas_file, &binlog_metas_file_exists);
    if (!file_exists_status.ok()) {
        return file_exists_status;
    }
    bool contain_binlog = false;
    RowsetBinlogMetasPB rowset_binlog_metas_pb;
    if (binlog_metas_file_exists) {
        // Use the non-throwing overload: every other failure in this function is reported
        // through Status, so a filesystem error must not escape as an exception.
        std::error_code size_ec;
        auto binlog_meta_filesize = std::filesystem::file_size(binlog_metas_file, size_ec);
        if (size_ec) {
            return Status::InternalError(
                    "fail to get size of rowset binlog metas file. file_path={}, error={}",
                    binlog_metas_file, size_ec.message());
        }
        if (binlog_meta_filesize > 0) {
            contain_binlog = true;
            RETURN_IF_ERROR(read_pb(binlog_metas_file, &rowset_binlog_metas_pb));
            VLOG_DEBUG << "load rowset binlog metas from file. file_path=" << binlog_metas_file;
        }
        RETURN_IF_ERROR(io::global_local_filesystem()->delete_file(binlog_metas_file));
    }
    if (contain_binlog) {
        auto binlog_dir = fmt::format("{}/_binlog", schema_hash_path);
        RETURN_IF_ERROR(io::global_local_filesystem()->create_directory(binlog_dir));

        std::vector<io::FileInfo> files;
        RETURN_IF_ERROR(
                io::global_local_filesystem()->list(schema_hash_path, true, &files, &exists));
        for (auto& file : files) {
            auto& filename = file.file_name;
            std::string new_suffix;
            std::string old_suffix;

            // Only binlog data/index files are moved; everything else stays in place.
            if (filename.ends_with(".binlog")) {
                old_suffix = ".binlog";
                new_suffix = ".dat";
            } else if (filename.ends_with(".binlog-index")) {
                old_suffix = ".binlog-index";
                new_suffix = ".idx";
            } else {
                continue;
            }

            std::string new_filename = filename;
            new_filename.replace(filename.size() - old_suffix.size(), old_suffix.size(),
                                 new_suffix);
            auto from = fmt::format("{}/{}", schema_hash_path, filename);
            auto to = fmt::format("{}/_binlog/{}", schema_hash_path, new_filename);
            RETURN_IF_ERROR(io::global_local_filesystem()->rename(from, to));
        }

        auto* meta = store->get_meta();
        // if ingest binlog metas error, it will be gc in gc_unused_binlog_metas
        RETURN_IF_ERROR(
                RowsetMetaManager::ingest_binlog_metas(meta, tablet_uid, &rowset_binlog_metas_pb));
    }

    // has to change shard id here, because meta file maybe copied from other source
    // its shard is different from local shard
    tablet_meta->set_shard_id(shard);
    // load dir is called by clone, restore, storage migration
    // should change tablet uid when tablet object changed
    tablet_meta->set_tablet_uid(std::move(tablet_uid));
    std::string meta_binary;
    tablet_meta->serialize(&meta_binary);
    RETURN_NOT_OK_STATUS_WITH_WARN(
            load_tablet_from_meta(store, tablet_id, schema_hash, meta_binary, true, force, restore,
                                  true),
            strings::Substitute("fail to load tablet. header_path=$0", header_path));

    return Status::OK();
}
1014
1015
0
// Fills `tablet_info` with the report data of the tablet identified by
// tablet_info->tablet_id. Increments the report_tablet_requests_total metric on every call.
// Returns TABLE_NOT_FOUND when get_tablet() does not return a tablet.
Status TabletManager::report_tablet_info(TTabletInfo* tablet_info) {
    DorisMetrics::instance()->report_tablet_requests_total->increment(1);
    LOG(INFO) << "begin to process report tablet info."
              << "tablet_id=" << tablet_info->tablet_id;

    TabletSharedPtr tablet = get_tablet(tablet_info->tablet_id);
    if (tablet == nullptr) {
        return Status::Error<TABLE_NOT_FOUND>("can't find tablet={}", tablet_info->tablet_id);
    }

    tablet->build_tablet_report_info(tablet_info);
    VLOG_TRACE << "success to process report tablet info.";
    return Status::OK();
}
1031
1032
0
// Builds the report info of every tablet into `tablets_info` (keyed by tablet id) and
// refreshes the cached tablet statistics list that get_tablet_stat() serves.
// Also attaches expired transaction ids to the matching tablets and publishes the
// version-count histogram metric. Always returns Status::OK().
Status TabletManager::build_all_report_tablets_info(std::map<TTabletId, TTablet>* tablets_info) {
    DCHECK(tablets_info != nullptr);
    VLOG_NOTICE << "begin to build all report tablets info";

    // The expired txn map is built up front, before any tablet shard lock is taken.
    std::map<TabletInfo, std::vector<int64_t>> expire_txn_map;
    StorageEngine::instance()->txn_manager()->build_expire_txn_map(&expire_txn_map);
    LOG(INFO) << "find expired transactions for " << expire_txn_map.size() << " tablets";

    DorisMetrics::instance()->report_all_tablets_requests_total->increment(1);
    HistogramStat version_num_hist;
    auto fresh_stats = std::make_shared<std::vector<TTabletStat>>();
    auto report_one = [&](const TabletSharedPtr& tablet) {
        TTablet& report_entry = (*tablets_info)[tablet->tablet_id()];
        TTabletInfo& info = report_entry.tablet_infos.emplace_back();
        tablet->build_tablet_report_info(&info, true, true);

        // Attach the expired transactions of this tablet, if any, and consume the map entry.
        TabletInfo key(tablet->tablet_id(), tablet->schema_hash(), tablet->tablet_uid());
        if (auto txn_it = expire_txn_map.find(key); txn_it != expire_txn_map.end()) {
            info.__set_transaction_ids(txn_it->second);
            expire_txn_map.erase(txn_it);
        }
        version_num_hist.add(tablet->version_count());

        TTabletStat& stat = fresh_stats->emplace_back();
        stat.__set_tablet_id(info.tablet_id);
        stat.__set_data_size(info.data_size);
        stat.__set_remote_data_size(info.remote_data_size);
        stat.__set_row_num(info.row_count);
        stat.__set_version_count(info.version_count);
        stat.__set_visible_version(info.version);
    };
    for_each_tablet(report_one, filter_all_tablets);

    {
        std::lock_guard<std::mutex> cache_guard(_tablet_stat_cache_mutex);
        _tablet_stat_list_cache.swap(fresh_stats);
    }
    DorisMetrics::instance()->tablet_version_num_distribution->set_histogram(version_num_hist);
    LOG(INFO) << "success to build all report tablets info. tablet_count=" << tablets_info->size();
    return Status::OK();
}
1075
1076
16
// Garbage-collection pass over the tablets.
//
// Returns immediately if _gc_tablets_lock cannot be acquired without blocking. Otherwise:
//   1. every tablet gets delete_expired_stale_rowset() called;
//   2. shutdown tablets that are referenced only by the _shutdown_tablets list are removed
//      from that list and passed to _move_tablet_to_trash, up to 200 successful moves per
//      round, with a one second sleep between rounds (no sleep under BE_TEST);
//   3. tablets whose move failed are appended back to _shutdown_tablets.
// Always returns Status::OK().
Status TabletManager::start_trash_sweep() {
    DBUG_EXECUTE_IF("TabletManager.start_trash_sweep.sleep", DBUG_BLOCK);
    std::unique_lock<std::mutex> gc_guard(_gc_tablets_lock, std::try_to_lock);
    if (!gc_guard.owns_lock()) {
        return Status::OK();
    }

    SCOPED_CONSUME_MEM_TRACKER(_mem_tracker);
    auto purge_stale = [](const TabletSharedPtr& tablet) { tablet->delete_expired_stale_rowset(); };
    for_each_tablet(purge_stale, filter_all_tablets);

    // Position inside _shutdown_tablets where the next batch starts.
    std::list<TabletSharedPtr>::iterator cursor;
    {
        std::shared_lock shutdown_read_guard(_shutdown_tablets_lock);
        cursor = _shutdown_tablets.begin();
        if (cursor == _shutdown_tablets.end()) {
            return Status::OK();
        }
    }

    // Removes up to `limit` tablets from _shutdown_tablets, starting at `cursor`, and
    // returns them. Tablets still referenced elsewhere are left in the list.
    auto take_batch = [this, &cursor](int limit) {
        std::vector<TabletSharedPtr> taken;
        std::lock_guard<std::shared_mutex> shutdown_write_guard(_shutdown_tablets_lock);
        while (cursor != _shutdown_tablets.end() && taken.size() < static_cast<size_t>(limit)) {
            if (cursor->use_count() > 1) {
                // Referenced by another holder besides the list: skip it this time.
                ++cursor;
                continue;
            }
            taken.push_back(*cursor);
            cursor = _shutdown_tablets.erase(cursor);
        }

        return taken;
    };

    std::list<TabletSharedPtr> failed_tablets;
    // Returns true when the per-round quota was used up, i.e. another round is needed.
    auto delete_one_batch = [this, &take_batch, &failed_tablets]() -> bool {
        int remaining = 200;
        while (true) {
            auto batch = take_batch(remaining);
            for (const auto& tablet : batch) {
                if (!_move_tablet_to_trash(tablet)) {
                    failed_tablets.push_back(tablet);
                    continue;
                }
                --remaining;
            }
            if (remaining <= 0) {
                return true;
            }
            if (batch.empty()) {
                return false;
            }
        }
    };

    while (delete_one_batch()) {
#ifndef BE_TEST
        sleep(1);
#endif
    }

    if (!failed_tablets.empty()) {
        std::lock_guard<std::shared_mutex> shutdown_write_guard(_shutdown_tablets_lock);
        _shutdown_tablets.splice(_shutdown_tablets.end(), failed_tablets);
    }

    return Status::OK();
}
1149
1150
3
bool TabletManager::_move_tablet_to_trash(const TabletSharedPtr& tablet) {
1151
3
    RETURN_IF_ERROR(register_transition_tablet(tablet->tablet_id(), "move to trash"));
1152
3
    Defer defer {[&]() { unregister_transition_tablet(tablet->tablet_id(), "move to trash"); }};
1153
1154
3
    TabletSharedPtr tablet_in_not_shutdown = get_tablet(tablet->tablet_id());
1155
3
    if (tablet_in_not_shutdown) {
1156
0
        TSchemaHash schema_hash_not_shutdown = tablet_in_not_shutdown->schema_hash();
1157
0
        size_t path_hash_not_shutdown = tablet_in_not_shutdown->data_dir()->path_hash();
1158
0
        if (tablet->schema_hash() == schema_hash_not_shutdown &&
1159
0
            tablet->data_dir()->path_hash() == path_hash_not_shutdown) {
1160
0
            tablet->clear_cache();
1161
            // shard_id in memory not eq shard_id in shutdown
1162
0
            if (tablet_in_not_shutdown->tablet_path() != tablet->tablet_path()) {
1163
0
                LOG(INFO) << "tablet path not eq shutdown tablet path, move it to trash, tablet_id="
1164
0
                          << tablet_in_not_shutdown->tablet_id()
1165
0
                          << " mem manager tablet path=" << tablet_in_not_shutdown->tablet_path()
1166
0
                          << " shutdown tablet path=" << tablet->tablet_path();
1167
0
                return tablet->data_dir()->move_to_trash(tablet->tablet_path());
1168
0
            } else {
1169
0
                LOG(INFO) << "tablet path eq shutdown tablet path, not move to trash, tablet_id="
1170
0
                          << tablet_in_not_shutdown->tablet_id()
1171
0
                          << " mem manager tablet path=" << tablet_in_not_shutdown->tablet_path()
1172
0
                          << " shutdown tablet path=" << tablet->tablet_path();
1173
0
                return true;
1174
0
            }
1175
0
        }
1176
0
    }
1177
1178
3
    TabletMetaSharedPtr tablet_meta(new TabletMeta());
1179
3
    int64_t get_meta_ts = MonotonicMicros();
1180
3
    Status check_st = TabletMetaManager::get_meta(tablet->data_dir(), tablet->tablet_id(),
1181
3
                                                  tablet->schema_hash(), tablet_meta);
1182
3
    if (check_st.ok()) {
1183
3
        if (tablet_meta->tablet_state() != TABLET_SHUTDOWN ||
1184
3
            tablet_meta->tablet_uid() != tablet->tablet_uid()) {
1185
0
            LOG(WARNING) << "tablet's state changed to normal, skip remove dirs"
1186
0
                         << " tablet id = " << tablet_meta->tablet_id()
1187
0
                         << " schema hash = " << tablet_meta->schema_hash()
1188
0
                         << " old tablet_uid=" << tablet->tablet_uid()
1189
0
                         << " cur tablet_uid=" << tablet_meta->tablet_uid();
1190
0
            return true;
1191
0
        }
1192
1193
3
        tablet->clear_cache();
1194
1195
        // move data to trash
1196
3
        const auto& tablet_path = tablet->tablet_path();
1197
3
        bool exists = false;
1198
3
        Status exists_st = io::global_local_filesystem()->exists(tablet_path, &exists);
1199
3
        if (!exists_st) {
1200
0
            return false;
1201
0
        }
1202
3
        if (exists) {
1203
            // take snapshot of tablet meta
1204
3
            auto meta_file_path = fmt::format("{}/{}.hdr", tablet_path, tablet->tablet_id());
1205
3
            int64_t save_meta_ts = MonotonicMicros();
1206
3
            auto save_st = tablet->tablet_meta()->save(meta_file_path);
1207
3
            if (!save_st.ok()) {
1208
0
                LOG(WARNING) << "failed to save meta, tablet_id=" << tablet_meta->tablet_id()
1209
0
                             << ", tablet_uid=" << tablet_meta->tablet_uid()
1210
0
                             << ", error=" << save_st;
1211
0
                return false;
1212
0
            }
1213
3
            int64_t now = MonotonicMicros();
1214
3
            LOG(INFO) << "start to move tablet to trash. " << tablet_path
1215
3
                      << ". rocksdb get meta cost " << (save_meta_ts - get_meta_ts)
1216
3
                      << " us, rocksdb save meta cost " << (now - save_meta_ts) << " us";
1217
3
            Status rm_st = tablet->data_dir()->move_to_trash(tablet_path);
1218
3
            if (!rm_st.ok()) {
1219
0
                LOG(WARNING) << "fail to move dir to trash. " << tablet_path;
1220
0
                return false;
1221
0
            }
1222
3
        }
1223
        // remove tablet meta
1224
3
        auto remove_st = TabletMetaManager::remove(tablet->data_dir(), tablet->tablet_id(),
1225
3
                                                   tablet->schema_hash());
1226
3
        if (!remove_st.ok()) {
1227
0
            LOG(WARNING) << "failed to remove meta, tablet_id=" << tablet_meta->tablet_id()
1228
0
                         << ", tablet_uid=" << tablet_meta->tablet_uid() << ", error=" << remove_st;
1229
0
            return false;
1230
0
        }
1231
3
        LOG(INFO) << "successfully move tablet to trash. "
1232
3
                  << "tablet_id=" << tablet->tablet_id()
1233
3
                  << ", schema_hash=" << tablet->schema_hash() << ", tablet_path=" << tablet_path;
1234
3
        return true;
1235
3
    } else {
1236
0
        tablet->clear_cache();
1237
        // if could not find tablet info in meta store, then check if dir existed
1238
0
        const auto& tablet_path = tablet->tablet_path();
1239
0
        bool exists = false;
1240
0
        Status exists_st = io::global_local_filesystem()->exists(tablet_path, &exists);
1241
0
        if (!exists_st) {
1242
0
            return false;
1243
0
        }
1244
0
        if (exists) {
1245
0
            if (check_st.is<META_KEY_NOT_FOUND>()) {
1246
0
                LOG(INFO) << "could not find tablet meta in rocksdb, so just delete it path "
1247
0
                          << "tablet_id=" << tablet->tablet_id()
1248
0
                          << ", schema_hash=" << tablet->schema_hash()
1249
0
                          << ", delete tablet_path=" << tablet_path;
1250
0
                RETURN_IF_ERROR(io::global_local_filesystem()->delete_directory(tablet_path));
1251
0
                RETURN_IF_ERROR(DataDir::delete_tablet_parent_path_if_empty(tablet_path));
1252
0
                return true;
1253
0
            }
1254
0
            LOG(WARNING) << "errors while load meta from store, skip this tablet. "
1255
0
                         << "tablet_id=" << tablet->tablet_id()
1256
0
                         << ", schema_hash=" << tablet->schema_hash();
1257
0
            return false;
1258
0
        } else {
1259
0
            LOG(INFO) << "could not find tablet dir, skip it and remove it from gc-queue. "
1260
0
                      << "tablet_id=" << tablet->tablet_id()
1261
0
                      << ", schema_hash=" << tablet->schema_hash()
1262
0
                      << ", tablet_path=" << tablet_path;
1263
0
            return true;
1264
0
        }
1265
0
    }
1266
3
}
1267
1268
29
// Marks `tablet_id` as being under transition by the calling thread.
//
// The registration is re-entrant per thread: a repeated call from the thread that already
// holds the entry only bumps its lock count. A call from any other thread fails with an
// InternalError. `reason` is stored on first registration and used in log messages.
Status TabletManager::register_transition_tablet(int64_t tablet_id, std::string reason) {
    tablets_shard& shard = _get_tablets_shard(tablet_id);
    const std::thread::id caller = std::this_thread::get_id();
    std::lock_guard<std::mutex> transition_guard(shard.lock_for_transition);

    auto entry = shard.tablets_under_transition.find(tablet_id);
    if (entry == shard.tablets_under_transition.end()) {
        // First registration for this tablet.
        shard.tablets_under_transition[tablet_id] = std::make_tuple(reason, caller, 1);
        LOG(INFO) << "add tablet_id= " << tablet_id << " to map, reason=" << reason
                  << " lock times=1 thread_id_in_map=" << caller;
        return Status::OK();
    }

    auto& [held_reason, owner, lock_times] = entry->second;
    if (caller != owner) {
        // Held by another thread: refuse.
        LOG(INFO) << "tablet_id = " << tablet_id << " is doing " << held_reason
                  << " thread_id_in_map=" << owner << " , add reason=" << reason
                  << " thread_id=" << caller;
        return Status::InternalError<false>("{} failed try later, tablet_id={}", reason,
                                            tablet_id);
    }

    // Same thread again: count the nested registration.
    ++lock_times;
    LOG(INFO) << "add tablet_id= " << tablet_id << " to map, reason=" << reason
              << " lock times=" << lock_times << " thread_id_in_map=" << owner;
    return Status::OK();
}
1297
1298
29
void TabletManager::unregister_transition_tablet(int64_t tablet_id, std::string reason) {
1299
29
    tablets_shard& shard = _get_tablets_shard(tablet_id);
1300
29
    std::thread::id thread_id = std::this_thread::get_id();
1301
29
    std::lock_guard<std::mutex> lk(shard.lock_for_transition);
1302
29
    if (auto search = shard.tablets_under_transition.find(tablet_id);
1303
29
        search == shard.tablets_under_transition.end()) {
1304
        // impossible, bug
1305
0
        DCHECK(false) << "tablet " << tablet_id
1306
0
                      << " must be found, before unreg must have been reg";
1307
29
    } else {
1308
29
        auto& [r, thread_id_in_map, lock_times] = search->second;
1309
29
        if (thread_id_in_map != thread_id) {
1310
            // impossible, bug
1311
0
            DCHECK(false) << "tablet " << tablet_id << " unreg thread must same reg thread";
1312
0
        }
1313
        // sub lock times
1314
29
        --lock_times;
1315
29
        if (lock_times != 0) {
1316
2
            LOG(INFO) << "erase tablet_id= " << tablet_id << " from map, reason=" << reason
1317
2
                      << " left=" << lock_times << " thread_id_in_map=" << thread_id_in_map;
1318
27
        } else {
1319
27
            LOG(INFO) << "erase tablet_id= " << tablet_id << " from map, reason=" << reason
1320
27
                      << " thread_id_in_map=" << thread_id_in_map;
1321
27
            shard.tablets_under_transition.erase(tablet_id);
1322
27
        }
1323
29
    }
1324
29
}
1325
1326
void TabletManager::try_delete_unused_tablet_path(DataDir* data_dir, TTabletId tablet_id,
1327
                                                  SchemaHash schema_hash,
1328
                                                  const string& schema_hash_path,
1329
2
                                                  int16_t shard_id) {
1330
2
    SCOPED_CONSUME_MEM_TRACKER(_mem_tracker);
1331
    // acquire the read lock, so that there is no creating tablet or load tablet from meta tasks
1332
    // create tablet and load tablet task should check whether the dir exists
1333
2
    tablets_shard& shard = _get_tablets_shard(tablet_id);
1334
2
    std::shared_lock rdlock(shard.lock);
1335
1336
    // check if meta already exists
1337
2
    TabletMetaSharedPtr tablet_meta(new TabletMeta());
1338
2
    Status check_st = TabletMetaManager::get_meta(data_dir, tablet_id, schema_hash, tablet_meta);
1339
2
    if (check_st.ok() && tablet_meta->shard_id() == shard_id) {
1340
2
        return;
1341
2
    }
1342
1343
0
    LOG(INFO) << "tablet meta not exists, try delete tablet path " << schema_hash_path;
1344
1345
0
    bool succ = register_transition_tablet(tablet_id, "path gc");
1346
0
    if (!succ) {
1347
0
        return;
1348
0
    }
1349
0
    Defer defer {[&]() { unregister_transition_tablet(tablet_id, "path gc"); }};
1350
1351
0
    TabletSharedPtr tablet = _get_tablet_unlocked(tablet_id);
1352
0
    if (tablet != nullptr && tablet->tablet_path() == schema_hash_path) {
1353
0
        LOG(INFO) << "tablet , skip delete the path " << schema_hash_path;
1354
0
        return;
1355
0
    }
1356
1357
    // TODO(ygl): may do other checks in the future
1358
0
    bool exists = false;
1359
0
    Status exists_st = io::global_local_filesystem()->exists(schema_hash_path, &exists);
1360
0
    if (exists_st && exists) {
1361
0
        LOG(INFO) << "start to move tablet to trash. tablet_path = " << schema_hash_path;
1362
0
        Status rm_st = data_dir->move_to_trash(schema_hash_path);
1363
0
        if (!rm_st.ok()) {
1364
0
            LOG(WARNING) << "fail to move dir to trash. dir=" << schema_hash_path;
1365
0
        } else {
1366
0
            LOG(INFO) << "move path " << schema_hash_path << " to trash successfully";
1367
0
        }
1368
0
    }
1369
0
}
1370
1371
void TabletManager::update_root_path_info(std::map<string, DataDirInfo>* path_map,
1372
12
                                          size_t* tablet_count) {
1373
12
    DCHECK(tablet_count);
1374
12
    *tablet_count = 0;
1375
12
    auto filter = [path_map, tablet_count](Tablet* t) -> bool {
1376
0
        ++(*tablet_count);
1377
0
        auto iter = path_map->find(t->data_dir()->path());
1378
0
        return iter != path_map->end() && iter->second.is_used;
1379
0
    };
1380
1381
12
    auto handler = [&](const TabletSharedPtr& tablet) {
1382
0
        auto& data_dir_info = (*path_map)[tablet->data_dir()->path()];
1383
0
        data_dir_info.local_used_capacity += tablet->tablet_local_size();
1384
0
        data_dir_info.remote_used_capacity += tablet->tablet_remote_size();
1385
0
    };
1386
1387
12
    for_each_tablet(handler, filter);
1388
12
}
1389
1390
void TabletManager::get_partition_related_tablets(int64_t partition_id,
1391
0
                                                  std::set<TabletInfo>* tablet_infos) {
1392
0
    std::shared_lock rdlock(_partition_tablet_map_lock);
1393
0
    if (_partition_tablet_map.find(partition_id) != _partition_tablet_map.end()) {
1394
0
        *tablet_infos = _partition_tablet_map[partition_id];
1395
0
    }
1396
0
}
1397
1398
13
// Runs a meta checkpoint on every tablet that is in TABLET_RUNNING state, lives
// on `data_dir` (matched by path hash), is used and has been initialised
// successfully. Logs how many tablets reported a checkpoint and the elapsed time.
void TabletManager::do_tablet_meta_checkpoint(DataDir* data_dir) {
    SCOPED_CONSUME_MEM_TRACKER(_mem_tracker);
    auto filter = [data_dir](Tablet* tablet) -> bool {
        return tablet->tablet_state() == TABLET_RUNNING &&
               tablet->data_dir()->path_hash() == data_dir->path_hash() && tablet->is_used() &&
               tablet->init_succeeded();
    };

    std::vector<TabletSharedPtr> related_tablets = get_all_tablet(filter);
    int counter = 0;
    MonotonicStopWatch watch;
    watch.start();
    // Iterate by const reference: the vector already owns the tablets, so there
    // is no need to copy a shared pointer per element.
    for (const TabletSharedPtr& tablet : related_tablets) {
        if (tablet->do_tablet_meta_checkpoint()) {
            ++counter;
        }
    }
    // The message labels this value as milliseconds; presumably elapsed_time()
    // is in nanoseconds -- TODO confirm against MonotonicStopWatch.
    int64_t cost = watch.elapsed_time() / 1000 / 1000;
    LOG(INFO) << "finish to do meta checkpoint on dir: " << data_dir->path()
              << ", number: " << counter << ", cost(ms): " << cost;
}
1419
1420
// Builds the TabletMeta for a tablet described by `request`.
//
// Column unique ids are assigned as follows:
//  - regular create: each column's unique id is its ordinal in the request and
//    next_unique_id is the number of columns;
//  - schema change: a column that also exists (by name) in `base_tablet` keeps
//    the base tablet's unique id; a column that is new takes the next free id,
//    starting from the base tablet's next_unique_id.
//
// @param request          create-tablet request carrying the schema
// @param store            data dir that supplies the shard id
// @param is_schema_change selects the id-assignment mode described above
// @param base_tablet      source tablet; dereferenced only when is_schema_change
// @param tablet_meta      out: receives the created meta
// @return the failing Status from shard allocation or TabletMeta::create, a
//         CE_CMD_PARAMS_ERROR for an unknown storage format, otherwise the
//         status returned by TabletMeta::create
Status TabletManager::_create_tablet_meta_unlocked(const TCreateTabletReq& request, DataDir* store,
                                                   const bool is_schema_change,
                                                   const Tablet* base_tablet,
                                                   TabletMetaSharedPtr* tablet_meta) {
    const auto& columns = request.tablet_schema.columns;
    uint32_t next_unique_id = 0;
    std::unordered_map<uint32_t, uint32_t> col_idx_to_unique_id;

    if (is_schema_change) {
        next_unique_id = base_tablet->next_unique_id();
        for (uint32_t idx = 0; idx < columns.size(); ++idx) {
            const TColumn& column = columns[idx];
            // For schema change, compare old_tablet and new_tablet:
            // 1. if column exist in both new_tablet and old_tablet, choose the column's
            //    unique_id in old_tablet to be the column's ordinal number in new_tablet
            // 2. if column exists only in new_tablet, assign next_unique_id of old_tablet
            //    to the new column
            int32_t base_idx = base_tablet->tablet_schema()->field_index(column.column_name);
            if (base_idx == -1) {
                // Not exist in old tablet, it is a new added column
                col_idx_to_unique_id[idx] = next_unique_id++;
            } else {
                uint32_t base_unique_id =
                        base_tablet->tablet_schema()->column(base_idx).unique_id();
                col_idx_to_unique_id[idx] = base_unique_id;
            }
        }
    } else {
        for (uint32_t idx = 0; idx < columns.size(); ++idx) {
            col_idx_to_unique_id[idx] = idx;
        }
        next_unique_id = columns.size();
    }
    VLOG_NOTICE << "creating tablet meta. next_unique_id=" << next_unique_id;

    // We generate a new tablet_uid for this new tablet.
    uint64_t shard_id = 0;
    RETURN_NOT_OK_STATUS_WITH_WARN(store->get_shard(&shard_id), "fail to get root path shard");
    Status res = TabletMeta::create(request, TabletUid::gen_uid(), shard_id, next_unique_id,
                                    col_idx_to_unique_id, tablet_meta);
    RETURN_IF_ERROR(res);

    // An explicitly requested storage format picks the preferred rowset type.
    if (request.__isset.storage_format) {
        switch (request.storage_format) {
        case TStorageFormat::DEFAULT:
            (*tablet_meta)
                    ->set_preferred_rowset_type(StorageEngine::instance()->default_rowset_type());
            break;
        case TStorageFormat::V1:
            (*tablet_meta)->set_preferred_rowset_type(ALPHA_ROWSET);
            break;
        case TStorageFormat::V2:
            (*tablet_meta)->set_preferred_rowset_type(BETA_ROWSET);
            break;
        default:
            return Status::Error<CE_CMD_PARAMS_ERROR>("invalid TStorageFormat: {}",
                                                      request.storage_format);
        }
    }
    return res;
}
1475
1476
2.18k
// Looks up `tablet_id` in the tablet map of its shard.
// Takes no lock itself (hence "_unlocked").
//
// @return the tablet, or nullptr when the shard's map has no such id
TabletSharedPtr TabletManager::_get_tablet_unlocked(TTabletId tablet_id) {
    VLOG_NOTICE << "begin to get tablet. tablet_id=" << tablet_id;
    tablet_map_t& shard_tablets = _get_tablet_map(tablet_id);
    auto found = shard_tablets.find(tablet_id);
    if (found == shard_tablets.end()) {
        return nullptr;
    }
    return found->second;
}
1485
1486
26
void TabletManager::_add_tablet_to_partition(const TabletSharedPtr& tablet) {
1487
26
    std::lock_guard<std::shared_mutex> wrlock(_partition_tablet_map_lock);
1488
26
    _partition_tablet_map[tablet->partition_id()].insert(tablet->get_tablet_info());
1489
26
}
1490
1491
23
void TabletManager::_remove_tablet_from_partition(const TabletSharedPtr& tablet) {
1492
23
    std::lock_guard<std::shared_mutex> wrlock(_partition_tablet_map_lock);
1493
23
    _partition_tablet_map[tablet->partition_id()].erase(tablet->get_tablet_info());
1494
23
    if (_partition_tablet_map[tablet->partition_id()].empty()) {
1495
23
        _partition_tablet_map.erase(tablet->partition_id());
1496
23
    }
1497
23
}
1498
1499
void TabletManager::obtain_specific_quantity_tablets(vector<TabletInfo>& tablets_info,
1500
0
                                                     int64_t num) {
1501
0
    for (const auto& tablets_shard : _tablets_shards) {
1502
0
        std::shared_lock rdlock(tablets_shard.lock);
1503
0
        for (const auto& item : tablets_shard.tablet_map) {
1504
0
            TabletSharedPtr tablet = item.second;
1505
0
            if (tablets_info.size() >= num) {
1506
0
                return;
1507
0
            }
1508
0
            if (tablet == nullptr) {
1509
0
                continue;
1510
0
            }
1511
0
            tablets_info.push_back(tablet->get_tablet_info());
1512
0
        }
1513
0
    }
1514
0
}
1515
1516
2.13k
// Returns the lock of the shard that `tabletId` maps to.
std::shared_mutex& TabletManager::_get_tablets_shard_lock(TTabletId tabletId) {
    tablets_shard& shard = _get_tablets_shard(tabletId);
    return shard.lock;
}
1519
1520
2.25k
// Returns the tablet map of the shard that `tabletId` maps to.
TabletManager::tablet_map_t& TabletManager::_get_tablet_map(TTabletId tabletId) {
    tablets_shard& shard = _get_tablets_shard(tabletId);
    return shard.tablet_map;
}
1523
1524
4.47k
// Returns the shard that `tabletId` maps to; the shard index is the tablet id
// masked with _tablets_shards_mask.
TabletManager::tablets_shard& TabletManager::_get_tablets_shard(TTabletId tabletId) {
    const auto shard_index = tabletId & _tablets_shards_mask;
    return _tablets_shards[shard_index];
}
1527
1528
void TabletManager::get_tablets_distribution_on_different_disks(
1529
        std::map<int64_t, std::map<DataDir*, int64_t>>& tablets_num_on_disk,
1530
0
        std::map<int64_t, std::map<DataDir*, std::vector<TabletSize>>>& tablets_info_on_disk) {
1531
0
    std::vector<DataDir*> data_dirs = StorageEngine::instance()->get_stores();
1532
0
    std::map<int64_t, std::set<TabletInfo>> partition_tablet_map;
1533
0
    {
1534
        // When drop tablet, '_partition_tablet_map_lock' is locked in 'tablet_shard_lock'.
1535
        // To avoid locking 'tablet_shard_lock' in '_partition_tablet_map_lock', we lock and
1536
        // copy _partition_tablet_map here.
1537
0
        std::shared_lock rdlock(_partition_tablet_map_lock);
1538
0
        partition_tablet_map = _partition_tablet_map;
1539
0
    }
1540
0
    std::map<int64_t, std::set<TabletInfo>>::iterator partition_iter = partition_tablet_map.begin();
1541
0
    for (; partition_iter != partition_tablet_map.end(); ++partition_iter) {
1542
0
        std::map<DataDir*, int64_t> tablets_num;
1543
0
        std::map<DataDir*, std::vector<TabletSize>> tablets_info;
1544
0
        for (int i = 0; i < data_dirs.size(); i++) {
1545
0
            tablets_num[data_dirs[i]] = 0;
1546
0
        }
1547
0
        int64_t partition_id = partition_iter->first;
1548
0
        std::set<TabletInfo>::iterator tablet_info_iter = (partition_iter->second).begin();
1549
0
        for (; tablet_info_iter != (partition_iter->second).end(); ++tablet_info_iter) {
1550
            // get_tablet() will hold 'tablet_shard_lock'
1551
0
            TabletSharedPtr tablet = get_tablet(tablet_info_iter->tablet_id);
1552
0
            if (tablet == nullptr) {
1553
0
                continue;
1554
0
            }
1555
0
            DataDir* data_dir = tablet->data_dir();
1556
0
            size_t tablet_footprint = tablet->tablet_footprint();
1557
0
            tablets_num[data_dir]++;
1558
0
            TabletSize tablet_size(tablet_info_iter->tablet_id, tablet_info_iter->schema_hash,
1559
0
                                   tablet_footprint);
1560
0
            tablets_info[data_dir].push_back(tablet_size);
1561
0
        }
1562
0
        tablets_num_on_disk[partition_id] = tablets_num;
1563
0
        tablets_info_on_disk[partition_id] = tablets_info;
1564
0
    }
1565
0
}
1566
1567
struct SortCtx {
1568
    SortCtx(TabletSharedPtr tablet, int64_t cooldown_timestamp, int64_t file_size)
1569
0
            : tablet(tablet), cooldown_timestamp(cooldown_timestamp), file_size(file_size) {}
1570
    TabletSharedPtr tablet;
1571
    // to ensure the tablet with -1 would always be greater than other
1572
    uint64_t cooldown_timestamp;
1573
    int64_t file_size;
1574
0
    bool operator<(const SortCtx& other) const {
1575
0
        if (this->cooldown_timestamp == other.cooldown_timestamp) {
1576
0
            return this->file_size > other.file_size;
1577
0
        }
1578
0
        return this->cooldown_timestamp < other.cooldown_timestamp;
1579
0
    }
1580
};
1581
1582
void TabletManager::get_cooldown_tablets(std::vector<TabletSharedPtr>* tablets,
1583
14
                                         std::function<bool(const TabletSharedPtr&)> skip_tablet) {
1584
14
    std::vector<SortCtx> sort_ctx_vec;
1585
14
    std::vector<std::weak_ptr<Tablet>> candidates;
1586
14
    for_each_tablet([&](const TabletSharedPtr& tablet) { candidates.emplace_back(tablet); },
1587
14
                    filter_all_tablets);
1588
14
    auto get_cooldown_tablet = [&sort_ctx_vec, &skip_tablet](std::weak_ptr<Tablet>& t) {
1589
0
        const TabletSharedPtr& tablet = t.lock();
1590
0
        if (UNLIKELY(nullptr == tablet)) {
1591
0
            return;
1592
0
        }
1593
0
        std::shared_lock rdlock(tablet->get_header_lock());
1594
0
        int64_t cooldown_timestamp = -1;
1595
0
        size_t file_size = -1;
1596
0
        if (!skip_tablet(tablet) && tablet->need_cooldown(&cooldown_timestamp, &file_size)) {
1597
0
            sort_ctx_vec.emplace_back(tablet, cooldown_timestamp, file_size);
1598
0
        }
1599
0
    };
1600
14
    std::for_each(candidates.begin(), candidates.end(), get_cooldown_tablet);
1601
1602
14
    std::sort(sort_ctx_vec.begin(), sort_ctx_vec.end());
1603
1604
14
    for (SortCtx& ctx : sort_ctx_vec) {
1605
0
        VLOG_DEBUG << "get cooldown tablet: " << ctx.tablet->tablet_id();
1606
0
        tablets->push_back(std::move(ctx.tablet));
1607
0
    }
1608
14
}
1609
1610
0
// Sorts every tablet id into `result->v2_tablets` (tablet reports all_beta())
// or `result->v1_tablets` (otherwise) and marks both fields as set.
void TabletManager::get_all_tablets_storage_format(TCheckStorageFormatResult* result) {
    DCHECK(result != nullptr);
    auto classify = [result](const TabletSharedPtr& tablet) {
        auto& bucket = tablet->all_beta() ? result->v2_tablets : result->v1_tablets;
        bucket.push_back(tablet->tablet_id());
    };

    for_each_tablet(classify, filter_all_tablets);
    // Both lists are flagged as present, even when one of them stays empty.
    result->__isset.v1_tablets = true;
    result->__isset.v2_tablets = true;
}
1624
1625
0
std::set<int64_t> TabletManager::check_all_tablet_segment(bool repair) {
1626
0
    std::set<int64_t> bad_tablets;
1627
0
    std::map<int64_t, std::vector<int64_t>> repair_shard_bad_tablets;
1628
0
    auto handler = [&](const TabletSharedPtr& tablet) {
1629
0
        if (!tablet->check_all_rowset_segment()) {
1630
0
            int64_t tablet_id = tablet->tablet_id();
1631
0
            bad_tablets.insert(tablet_id);
1632
0
            if (repair) {
1633
0
                repair_shard_bad_tablets[tablet_id & _tablets_shards_mask].push_back(tablet_id);
1634
0
            }
1635
0
        }
1636
0
    };
1637
0
    for_each_tablet(handler, filter_all_tablets);
1638
1639
0
    for (const auto& [shard_index, shard_tablets] : repair_shard_bad_tablets) {
1640
0
        auto& tablets_shard = _tablets_shards[shard_index];
1641
0
        auto& tablet_map = tablets_shard.tablet_map;
1642
0
        std::lock_guard<std::shared_mutex> wrlock(tablets_shard.lock);
1643
0
        for (auto tablet_id : shard_tablets) {
1644
0
            auto it = tablet_map.find(tablet_id);
1645
0
            if (it == tablet_map.end()) {
1646
0
                bad_tablets.erase(tablet_id);
1647
0
                LOG(WARNING) << "Bad tablet has be removed. tablet_id=" << tablet_id;
1648
0
            } else {
1649
0
                const auto& tablet = it->second;
1650
0
                tablet->set_tablet_state(TABLET_SHUTDOWN);
1651
0
                tablet->save_meta();
1652
0
                {
1653
0
                    std::lock_guard<std::shared_mutex> shutdown_tablets_wrlock(
1654
0
                            _shutdown_tablets_lock);
1655
0
                    _shutdown_tablets.push_back(tablet);
1656
0
                }
1657
0
                LOG(WARNING) << "There are some segments lost, set tablet to shutdown state."
1658
0
                             << "tablet_id=" << tablet->tablet_id()
1659
0
                             << ", tablet_path=" << tablet->tablet_path();
1660
0
            }
1661
0
        }
1662
0
    }
1663
1664
0
    return bad_tablets;
1665
0
}
1666
1667
// Moves a tablet to `partition_id`: removes it from the partition index,
// updates the partition id in its meta, then indexes it again.
//
// If updating the meta fails, the tablet is put back into the index under the
// partition id it currently reports, so a failed update does not leave the
// tablet missing from _partition_tablet_map.
//
// @return true on success; false when the tablet is unknown or the meta
//         update fails
bool TabletManager::update_tablet_partition_id(::doris::TPartitionId partition_id,
                                               ::doris::TTabletId tablet_id) {
    std::shared_lock rdlock(_get_tablets_shard_lock(tablet_id));
    TabletSharedPtr tablet = _get_tablet_unlocked(tablet_id);
    if (tablet == nullptr) {
        LOG(WARNING) << "get tablet err partition_id: " << partition_id
                     << " tablet_id:" << tablet_id;
        return false;
    }
    // The index is keyed by the tablet's own partition id, so the tablet has to
    // be taken out before that id changes.
    _remove_tablet_from_partition(tablet);
    auto st = tablet->tablet_meta()->set_partition_id(partition_id);
    if (!st.ok()) {
        LOG(WARNING) << "set partition id err partition_id: " << partition_id
                     << " tablet_id:" << tablet_id;
        // Restore the index entry removed above.
        _add_tablet_to_partition(tablet);
        return false;
    }
    _add_tablet_to_partition(tablet);
    return true;
}
1686
1687
} // end namespace doris