Coverage Report

Created: 2025-04-29 12:50

/root/doris/be/src/olap/tablet_manager.cpp
Line
Count
Source (jump to first uncovered line)
1
// Licensed to the Apache Software Foundation (ASF) under one
2
// or more contributor license agreements.  See the NOTICE file
3
// distributed with this work for additional information
4
// regarding copyright ownership.  The ASF licenses this file
5
// to you under the Apache License, Version 2.0 (the
6
// "License"); you may not use this file except in compliance
7
// with the License.  You may obtain a copy of the License at
8
//
9
//   http://www.apache.org/licenses/LICENSE-2.0
10
//
11
// Unless required by applicable law or agreed to in writing,
12
// software distributed under the License is distributed on an
13
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14
// KIND, either express or implied.  See the License for the
15
// specific language governing permissions and limitations
16
// under the License.
17
18
#include "olap/tablet_manager.h"
19
20
#include <fmt/format.h>
21
#include <gen_cpp/AgentService_types.h>
22
#include <gen_cpp/BackendService_types.h>
23
#include <gen_cpp/Descriptors_types.h>
24
#include <gen_cpp/MasterService_types.h>
25
#include <gen_cpp/Types_types.h>
26
#include <gen_cpp/olap_file.pb.h>
27
#include <re2/re2.h>
28
#include <unistd.h>
29
30
#include <algorithm>
31
#include <list>
32
#include <mutex>
33
#include <ostream>
34
#include <string_view>
35
36
#include "bvar/bvar.h"
37
#include "common/compiler_util.h" // IWYU pragma: keep
38
#include "common/config.h"
39
#include "common/logging.h"
40
#include "gutil/integral_types.h"
41
#include "gutil/strings/strcat.h"
42
#include "gutil/strings/substitute.h"
43
#include "io/fs/local_file_system.h"
44
#include "olap/cumulative_compaction_time_series_policy.h"
45
#include "olap/data_dir.h"
46
#include "olap/olap_common.h"
47
#include "olap/olap_define.h"
48
#include "olap/olap_meta.h"
49
#include "olap/pb_helper.h"
50
#include "olap/rowset/beta_rowset.h"
51
#include "olap/rowset/rowset.h"
52
#include "olap/rowset/rowset_meta_manager.h"
53
#include "olap/storage_engine.h"
54
#include "olap/tablet.h"
55
#include "olap/tablet_meta.h"
56
#include "olap/tablet_meta_manager.h"
57
#include "olap/tablet_schema.h"
58
#include "olap/txn_manager.h"
59
#include "runtime/exec_env.h"
60
#include "service/backend_options.h"
61
#include "util/defer_op.h"
62
#include "util/doris_metrics.h"
63
#include "util/histogram.h"
64
#include "util/metrics.h"
65
#include "util/path_util.h"
66
#include "util/scoped_cleanup.h"
67
#include "util/stopwatch.hpp"
68
#include "util/time.h"
69
#include "util/trace.h"
70
#include "util/uid_util.h"
71
72
namespace doris {
73
class CumulativeCompactionPolicy;
74
} // namespace doris
75
76
using std::map;
77
using std::set;
78
using std::string;
79
using std::vector;
80
81
namespace doris {
82
using namespace ErrorCode;
83
84
bvar::Adder<int64_t> g_tablet_meta_schema_columns_count("tablet_meta_schema_columns_count");
85
86
TabletManager::TabletManager(StorageEngine& engine, int32_t tablet_map_lock_shard_size)
87
        : _engine(engine),
88
          _tablets_shards_size(tablet_map_lock_shard_size),
89
189
          _tablets_shards_mask(tablet_map_lock_shard_size - 1) {
90
189
    CHECK_GT(_tablets_shards_size, 0);
91
189
    CHECK_EQ(_tablets_shards_size & _tablets_shards_mask, 0);
92
189
    _tablets_shards.resize(_tablets_shards_size);
93
189
}
94
95
189
TabletManager::~TabletManager() = default;
96
97
Status TabletManager::_add_tablet_unlocked(TTabletId tablet_id, const TabletSharedPtr& tablet,
98
304
                                           bool update_meta, bool force, RuntimeProfile* profile) {
99
304
    if (profile->get_counter("AddTablet") == nullptr) {
100
4
        ADD_TIMER(profile, "AddTablet");
101
4
    }
102
304
    Status res = Status::OK();
103
304
    VLOG_NOTICE << "begin to add tablet to TabletManager. "
104
0
                << "tablet_id=" << tablet_id << ", force=" << force;
105
106
304
    TabletSharedPtr existed_tablet = nullptr;
107
304
    tablet_map_t& tablet_map = _get_tablet_map(tablet_id);
108
304
    const auto& iter = tablet_map.find(tablet_id);
109
304
    if (iter != tablet_map.end()) {
110
4
        existed_tablet = iter->second;
111
4
    }
112
113
304
    if (existed_tablet == nullptr) {
114
300
        return _add_tablet_to_map_unlocked(tablet_id, tablet, update_meta, false /*keep_files*/,
115
300
                                           false /*drop_old*/, profile);
116
300
    }
117
    // During restore process, the tablet is exist and snapshot loader will replace the tablet's rowsets
118
    // and then reload the tablet, the tablet's path will the same
119
4
    if (!force) {
120
2
        if (existed_tablet->tablet_path() == tablet->tablet_path()) {
121
0
            return Status::Error<ENGINE_INSERT_EXISTS_TABLE>(
122
0
                    "add the same tablet twice! tablet_id={}, tablet_path={}", tablet_id,
123
0
                    tablet->tablet_path());
124
0
        }
125
2
        if (existed_tablet->data_dir() == tablet->data_dir()) {
126
0
            return Status::Error<ENGINE_INSERT_EXISTS_TABLE>(
127
0
                    "add tablet with same data dir twice! tablet_id={}", tablet_id);
128
0
        }
129
2
    }
130
131
4
    MonotonicStopWatch watch;
132
4
    watch.start();
133
134
    // During storage migration, the tablet is moved to another disk, have to check
135
    // if the new tablet's rowset version is larger than the old one to prevent losting data during
136
    // migration
137
4
    int64_t old_time, new_time;
138
4
    int32_t old_version, new_version;
139
4
    {
140
4
        std::shared_lock rdlock(existed_tablet->get_header_lock());
141
4
        const RowsetSharedPtr old_rowset = existed_tablet->get_rowset_with_max_version();
142
4
        const RowsetSharedPtr new_rowset = tablet->get_rowset_with_max_version();
143
        // If new tablet is empty, it is a newly created schema change tablet.
144
        // the old tablet is dropped before add tablet. it should not exist old tablet
145
4
        if (new_rowset == nullptr) {
146
            // it seems useless to call unlock and return here.
147
            // it could prevent error when log level is changed in the future.
148
0
            return Status::Error<ENGINE_INSERT_EXISTS_TABLE>(
149
0
                    "new tablet is empty and old tablet exists. it should not happen. tablet_id={}",
150
0
                    tablet_id);
151
0
        }
152
4
        old_time = old_rowset == nullptr ? -1 : old_rowset->creation_time();
153
4
        new_time = new_rowset->creation_time();
154
4
        old_version = old_rowset == nullptr ? -1 : old_rowset->end_version();
155
4
        new_version = new_rowset->end_version();
156
4
    }
157
4
    COUNTER_UPDATE(ADD_CHILD_TIMER(profile, "GetExistTabletVersion", "AddTablet"),
158
4
                   static_cast<int64_t>(watch.reset()));
159
160
    // In restore process, we replace all origin files in tablet dir with
161
    // the downloaded snapshot files. Then we try to reload tablet header.
162
    // force == true means we forcibly replace the Tablet in tablet_map
163
    // with the new one. But if we do so, the files in the tablet dir will be
164
    // dropped when the origin Tablet deconstruct.
165
    // So we set keep_files == true to not delete files when the
166
    // origin Tablet deconstruct.
167
    // During restore process, snapshot loader
168
    // replaced the old tablet's rowset with new rowsets, but the tablet path is reused, if drop files
169
    // here, the new rowset's file will also be dropped, so use keep files here
170
4
    bool keep_files = force;
171
4
    if (force ||
172
4
        (new_version > old_version || (new_version == old_version && new_time >= old_time))) {
173
        // check if new tablet's meta is in store and add new tablet's meta to meta store
174
4
        res = _add_tablet_to_map_unlocked(tablet_id, tablet, update_meta, keep_files,
175
4
                                          true /*drop_old*/, profile);
176
4
    } else {
177
0
        RETURN_IF_ERROR(tablet->set_tablet_state(TABLET_SHUTDOWN));
178
0
        tablet->save_meta();
179
0
        COUNTER_UPDATE(ADD_CHILD_TIMER(profile, "SaveMeta", "AddTablet"),
180
0
                       static_cast<int64_t>(watch.reset()));
181
0
        {
182
0
            std::lock_guard<std::shared_mutex> shutdown_tablets_wrlock(_shutdown_tablets_lock);
183
0
            _shutdown_tablets.push_back(tablet);
184
0
        }
185
186
0
        res = Status::Error<ENGINE_INSERT_OLD_TABLET>(
187
0
                "set tablet to shutdown state. tablet_id={}, tablet_path={}", tablet->tablet_id(),
188
0
                tablet->tablet_path());
189
0
    }
190
4
    LOG(WARNING) << "add duplicated tablet. force=" << force << ", res=" << res
191
4
                 << ", tablet_id=" << tablet_id << ", old_version=" << old_version
192
4
                 << ", new_version=" << new_version << ", old_time=" << old_time
193
4
                 << ", new_time=" << new_time
194
4
                 << ", old_tablet_path=" << existed_tablet->tablet_path()
195
4
                 << ", new_tablet_path=" << tablet->tablet_path();
196
197
4
    return res;
198
4
}
199
200
Status TabletManager::_add_tablet_to_map_unlocked(TTabletId tablet_id,
201
                                                  const TabletSharedPtr& tablet, bool update_meta,
202
                                                  bool keep_files, bool drop_old,
203
304
                                                  RuntimeProfile* profile) {
204
    // check if new tablet's meta is in store and add new tablet's meta to meta store
205
304
    Status res = Status::OK();
206
304
    MonotonicStopWatch watch;
207
304
    watch.start();
208
304
    if (update_meta) {
209
        // call tablet save meta in order to valid the meta
210
304
        tablet->save_meta();
211
304
        COUNTER_UPDATE(ADD_CHILD_TIMER(profile, "SaveMeta", "AddTablet"),
212
304
                       static_cast<int64_t>(watch.reset()));
213
304
    }
214
304
    if (drop_old) {
215
        // If the new tablet is fresher than the existing one, then replace
216
        // the existing tablet with the new one.
217
        // Use default replica_id to ignore whether replica_id is match when drop tablet.
218
4
        Status status = _drop_tablet(tablet_id, /* replica_id */ 0, keep_files, false, true);
219
4
        COUNTER_UPDATE(ADD_CHILD_TIMER(profile, "DropOldTablet", "AddTablet"),
220
4
                       static_cast<int64_t>(watch.reset()));
221
4
        RETURN_NOT_OK_STATUS_WITH_WARN(
222
4
                status, strings::Substitute("failed to drop old tablet when add new tablet. "
223
4
                                            "tablet_id=$0",
224
4
                                            tablet_id));
225
4
    }
226
    // Register tablet into DataDir, so that we can manage tablet from
227
    // the perspective of root path.
228
    // Example: unregister all tables when a bad disk found.
229
304
    tablet->register_tablet_into_dir();
230
304
    tablet_map_t& tablet_map = _get_tablet_map(tablet_id);
231
304
    tablet_map[tablet_id] = tablet;
232
304
    _add_tablet_to_partition(tablet);
233
304
    g_tablet_meta_schema_columns_count << tablet->tablet_meta()->tablet_columns_num();
234
304
    COUNTER_UPDATE(ADD_CHILD_TIMER(profile, "RegisterTabletInfo", "AddTablet"),
235
304
                   static_cast<int64_t>(watch.reset()));
236
237
304
    VLOG_NOTICE << "add tablet to map successfully."
238
0
                << " tablet_id=" << tablet_id;
239
240
304
    return res;
241
304
}
242
243
0
bool TabletManager::check_tablet_id_exist(TTabletId tablet_id) {
244
0
    std::shared_lock rdlock(_get_tablets_shard_lock(tablet_id));
245
0
    return _check_tablet_id_exist_unlocked(tablet_id);
246
0
}
247
248
0
bool TabletManager::_check_tablet_id_exist_unlocked(TTabletId tablet_id) {
249
0
    tablet_map_t& tablet_map = _get_tablet_map(tablet_id);
250
0
    return tablet_map.find(tablet_id) != tablet_map.end();
251
0
}
252
253
Status TabletManager::create_tablet(const TCreateTabletReq& request, std::vector<DataDir*> stores,
254
303
                                    RuntimeProfile* profile) {
255
303
    DorisMetrics::instance()->create_tablet_requests_total->increment(1);
256
257
303
    int64_t tablet_id = request.tablet_id;
258
303
    LOG(INFO) << "begin to create tablet. tablet_id=" << tablet_id
259
303
              << ", table_id=" << request.table_id << ", partition_id=" << request.partition_id
260
303
              << ", replica_id=" << request.replica_id << ", stores.size=" << stores.size()
261
303
              << ", first store=" << stores[0]->path();
262
263
    // when we create rollup tablet A(assume on shard-1) from tablet B(assume on shard-2)
264
    // we need use write lock on shard-1 and then use read lock on shard-2
265
    // if there have create rollup tablet C(assume on shard-2) from tablet D(assume on shard-1) at the same time, we will meet deadlock
266
303
    std::unique_lock two_tablet_lock(_two_tablet_mtx, std::defer_lock);
267
303
    bool in_restore_mode = request.__isset.in_restore_mode && request.in_restore_mode;
268
303
    bool is_schema_change_or_atomic_restore =
269
303
            request.__isset.base_tablet_id && request.base_tablet_id > 0;
270
303
    bool need_two_lock =
271
303
            is_schema_change_or_atomic_restore &&
272
303
            ((_tablets_shards_mask & request.base_tablet_id) != (_tablets_shards_mask & tablet_id));
273
303
    if (need_two_lock) {
274
0
        SCOPED_TIMER(ADD_TIMER(profile, "GetTwoTableLock"));
275
0
        two_tablet_lock.lock();
276
0
    }
277
278
303
    MonotonicStopWatch shard_lock_watch;
279
303
    shard_lock_watch.start();
280
303
    std::lock_guard wrlock(_get_tablets_shard_lock(tablet_id));
281
303
    shard_lock_watch.stop();
282
303
    COUNTER_UPDATE(ADD_TIMER(profile, "GetShardLock"),
283
303
                   static_cast<int64_t>(shard_lock_watch.elapsed_time()));
284
    // Make create_tablet operation to be idempotent:
285
    // 1. Return true if tablet with same tablet_id and schema_hash exist;
286
    //           false if tablet with same tablet_id but different schema_hash exist.
287
    // 2. When this is an alter task, if the tablet(both tablet_id and schema_hash are
288
    // same) already exist, then just return true(an duplicate request). But if
289
    // tablet_id exist but with different schema_hash, return an error(report task will
290
    // eventually trigger its deletion).
291
303
    {
292
303
        SCOPED_TIMER(ADD_TIMER(profile, "GetTabletUnlocked"));
293
303
        if (_get_tablet_unlocked(tablet_id) != nullptr) {
294
3
            LOG(INFO) << "success to create tablet. tablet already exist. tablet_id=" << tablet_id;
295
3
            return Status::OK();
296
3
        }
297
303
    }
298
299
300
    TabletSharedPtr base_tablet = nullptr;
300
    // If the CreateTabletReq has base_tablet_id then it is a alter-tablet request
301
300
    if (is_schema_change_or_atomic_restore) {
302
        // if base_tablet_id's lock diffrent with new_tablet_id, we need lock it.
303
0
        if (need_two_lock) {
304
0
            SCOPED_TIMER(ADD_TIMER(profile, "GetBaseTablet"));
305
0
            base_tablet = get_tablet(request.base_tablet_id);
306
0
            two_tablet_lock.unlock();
307
0
        } else {
308
0
            SCOPED_TIMER(ADD_TIMER(profile, "GetBaseTabletUnlocked"));
309
0
            base_tablet = _get_tablet_unlocked(request.base_tablet_id);
310
0
        }
311
0
        if (base_tablet == nullptr) {
312
0
            DorisMetrics::instance()->create_tablet_requests_failed->increment(1);
313
0
            return Status::Error<TABLE_CREATE_META_ERROR>(
314
0
                    "fail to create tablet(change schema/atomic restore), base tablet does not "
315
0
                    "exist. new_tablet_id={}, base_tablet_id={}",
316
0
                    tablet_id, request.base_tablet_id);
317
0
        }
318
        // If we are doing schema-change or atomic-restore, we should use the same data dir
319
        // TODO(lingbin): A litter trick here, the directory should be determined before
320
        // entering this method
321
        //
322
        // ATTN: Since all restored replicas will be saved to HDD, so no storage_medium check here.
323
0
        if (in_restore_mode ||
324
0
            request.storage_medium == base_tablet->data_dir()->storage_medium()) {
325
0
            LOG(INFO) << "create tablet use the base tablet data dir. tablet_id=" << tablet_id
326
0
                      << ", base tablet_id=" << request.base_tablet_id
327
0
                      << ", data dir=" << base_tablet->data_dir()->path();
328
0
            stores.clear();
329
0
            stores.push_back(base_tablet->data_dir());
330
0
        }
331
0
    }
332
333
    // set alter type to schema-change. it is useless
334
300
    TabletSharedPtr tablet = _internal_create_tablet_unlocked(
335
300
            request, is_schema_change_or_atomic_restore, base_tablet.get(), stores, profile);
336
300
    if (tablet == nullptr) {
337
0
        DorisMetrics::instance()->create_tablet_requests_failed->increment(1);
338
0
        return Status::Error<CE_CMD_PARAMS_ERROR>("fail to create tablet. tablet_id={}",
339
0
                                                  request.tablet_id);
340
0
    }
341
342
300
    LOG(INFO) << "success to create tablet. tablet_id=" << tablet_id
343
300
              << ", tablet_path=" << tablet->tablet_path();
344
300
    return Status::OK();
345
300
}
346
347
TabletSharedPtr TabletManager::_internal_create_tablet_unlocked(
348
        const TCreateTabletReq& request, const bool is_schema_change, const Tablet* base_tablet,
349
300
        const std::vector<DataDir*>& data_dirs, RuntimeProfile* profile) {
350
    // If in schema-change state, base_tablet must also be provided.
351
    // i.e., is_schema_change and base_tablet are either assigned or not assigned
352
300
    DCHECK((is_schema_change && base_tablet) || (!is_schema_change && !base_tablet));
353
354
    // NOTE: The existence of tablet_id and schema_hash has already been checked,
355
    // no need check again here.
356
357
300
    const std::string parent_timer_name = "InternalCreateTablet";
358
300
    SCOPED_TIMER(ADD_TIMER(profile, parent_timer_name));
359
360
300
    MonotonicStopWatch watch;
361
300
    watch.start();
362
300
    auto create_meta_timer = ADD_CHILD_TIMER(profile, "CreateMeta", parent_timer_name);
363
300
    auto tablet = _create_tablet_meta_and_dir_unlocked(request, is_schema_change, base_tablet,
364
300
                                                       data_dirs, profile);
365
300
    COUNTER_UPDATE(create_meta_timer, static_cast<int64_t>(watch.reset()));
366
300
    if (tablet == nullptr) {
367
0
        return nullptr;
368
0
    }
369
370
300
    int64_t new_tablet_id = request.tablet_id;
371
300
    int32_t new_schema_hash = request.tablet_schema.schema_hash;
372
373
    // should remove the tablet's pending_id no matter create-tablet success or not
374
300
    DataDir* data_dir = tablet->data_dir();
375
376
    // TODO(yiguolei)
377
    // the following code is very difficult to understand because it mixed alter tablet v2
378
    // and alter tablet v1 should remove alter tablet v1 code after v0.12
379
300
    Status res = Status::OK();
380
300
    bool is_tablet_added = false;
381
300
    do {
382
300
        res = tablet->init();
383
300
        COUNTER_UPDATE(ADD_CHILD_TIMER(profile, "TabletInit", parent_timer_name),
384
300
                       static_cast<int64_t>(watch.reset()));
385
300
        if (!res.ok()) {
386
0
            LOG(WARNING) << "tablet init failed. tablet:" << tablet->tablet_id();
387
0
            break;
388
0
        }
389
390
        // Create init version if this is not a restore mode replica and request.version is set
391
        // bool in_restore_mode = request.__isset.in_restore_mode && request.in_restore_mode;
392
        // if (!in_restore_mode && request.__isset.version) {
393
        // create initial rowset before add it to storage engine could omit many locks
394
300
        res = tablet->create_initial_rowset(request.version);
395
300
        COUNTER_UPDATE(ADD_CHILD_TIMER(profile, "InitRowset", parent_timer_name),
396
300
                       static_cast<int64_t>(watch.reset()));
397
300
        if (!res.ok()) {
398
0
            LOG(WARNING) << "fail to create initial version for tablet. res=" << res;
399
0
            break;
400
0
        }
401
402
300
        if (is_schema_change) {
403
            // if this is a new alter tablet, has to set its state to not ready
404
            // because schema change handler depends on it to check whether history data
405
            // convert finished
406
0
            static_cast<void>(tablet->set_tablet_state(TabletState::TABLET_NOTREADY));
407
0
        }
408
        // Add tablet to StorageEngine will make it visible to user
409
        // Will persist tablet meta
410
300
        auto add_tablet_timer = ADD_CHILD_TIMER(profile, "AddTablet", parent_timer_name);
411
300
        res = _add_tablet_unlocked(new_tablet_id, tablet, /*update_meta*/ true, false, profile);
412
300
        COUNTER_UPDATE(add_tablet_timer, static_cast<int64_t>(watch.reset()));
413
300
        if (!res.ok()) {
414
0
            LOG(WARNING) << "fail to add tablet to StorageEngine. res=" << res;
415
0
            break;
416
0
        }
417
300
        is_tablet_added = true;
418
419
        // TODO(lingbin): The following logic seems useless, can be removed?
420
        // Because if _add_tablet_unlocked() return OK, we must can get it from map.
421
300
        TabletSharedPtr tablet_ptr = _get_tablet_unlocked(new_tablet_id);
422
300
        COUNTER_UPDATE(ADD_CHILD_TIMER(profile, "GetTablet", parent_timer_name),
423
300
                       static_cast<int64_t>(watch.reset()));
424
300
        if (tablet_ptr == nullptr) {
425
0
            res = Status::Error<TABLE_NOT_FOUND>("fail to get tablet. res={}", res);
426
0
            break;
427
0
        }
428
300
    } while (false);
429
430
300
    if (res.ok()) {
431
300
        return tablet;
432
300
    }
433
    // something is wrong, we need clear environment
434
0
    if (is_tablet_added) {
435
0
        Status status = _drop_tablet(new_tablet_id, request.replica_id, false, false, true);
436
0
        COUNTER_UPDATE(ADD_CHILD_TIMER(profile, "DropTablet", parent_timer_name),
437
0
                       static_cast<int64_t>(watch.reset()));
438
0
        if (!status.ok()) {
439
0
            LOG(WARNING) << "fail to drop tablet when create tablet failed. res=" << res;
440
0
        }
441
0
    } else {
442
0
        tablet->delete_all_files();
443
0
        static_cast<void>(TabletMetaManager::remove(data_dir, new_tablet_id, new_schema_hash));
444
0
        COUNTER_UPDATE(ADD_CHILD_TIMER(profile, "RemoveTabletFiles", parent_timer_name),
445
0
                       static_cast<int64_t>(watch.reset()));
446
0
    }
447
0
    return nullptr;
448
300
}
449
450
300
static string _gen_tablet_dir(const string& dir, int16_t shard_id, int64_t tablet_id) {
451
300
    string path = dir;
452
300
    path = path_util::join_path_segments(path, DATA_PREFIX);
453
300
    path = path_util::join_path_segments(path, std::to_string(shard_id));
454
300
    path = path_util::join_path_segments(path, std::to_string(tablet_id));
455
300
    return path;
456
300
}
457
458
TabletSharedPtr TabletManager::_create_tablet_meta_and_dir_unlocked(
459
        const TCreateTabletReq& request, const bool is_schema_change, const Tablet* base_tablet,
460
300
        const std::vector<DataDir*>& data_dirs, RuntimeProfile* profile) {
461
300
    string pending_id = StrCat(TABLET_ID_PREFIX, request.tablet_id);
462
    // Many attempts are made here in the hope that even if a disk fails, it can still continue.
463
300
    std::string parent_timer_name = "CreateMeta";
464
300
    MonotonicStopWatch watch;
465
300
    watch.start();
466
300
    for (auto& data_dir : data_dirs) {
467
300
        COUNTER_UPDATE(ADD_CHILD_TIMER(profile, "RemovePendingIds", parent_timer_name),
468
300
                       static_cast<int64_t>(watch.reset()));
469
470
300
        TabletMetaSharedPtr tablet_meta;
471
        // if create meta failed, do not need to clean dir, because it is only in memory
472
300
        Status res = _create_tablet_meta_unlocked(request, data_dir, is_schema_change, base_tablet,
473
300
                                                  &tablet_meta);
474
300
        COUNTER_UPDATE(ADD_CHILD_TIMER(profile, "CreateMetaUnlock", parent_timer_name),
475
300
                       static_cast<int64_t>(watch.reset()));
476
300
        if (!res.ok()) {
477
0
            LOG(WARNING) << "fail to create tablet meta. res=" << res
478
0
                         << ", root=" << data_dir->path();
479
0
            continue;
480
0
        }
481
482
300
        string tablet_dir =
483
300
                _gen_tablet_dir(data_dir->path(), tablet_meta->shard_id(), request.tablet_id);
484
300
        string schema_hash_dir = path_util::join_path_segments(
485
300
                tablet_dir, std::to_string(request.tablet_schema.schema_hash));
486
487
        // Because the tablet is removed asynchronously, so that the dir may still exist when BE
488
        // receive create-tablet request again, For example retried schema-change request
489
300
        bool exists = true;
490
300
        res = io::global_local_filesystem()->exists(schema_hash_dir, &exists);
491
300
        if (!res.ok()) {
492
0
            continue;
493
0
        }
494
300
        if (exists) {
495
0
            LOG(WARNING) << "skip this dir because tablet path exist, path=" << schema_hash_dir;
496
0
            continue;
497
300
        } else {
498
300
            Status st = io::global_local_filesystem()->create_directory(schema_hash_dir);
499
300
            if (!st.ok()) {
500
0
                continue;
501
0
            }
502
300
        }
503
504
300
        if (tablet_meta->partition_id() <= 0) {
505
233
            LOG(WARNING) << "invalid partition id " << tablet_meta->partition_id() << ", tablet "
506
233
                         << tablet_meta->tablet_id();
507
233
        }
508
300
        TabletSharedPtr new_tablet =
509
300
                std::make_shared<Tablet>(_engine, std::move(tablet_meta), data_dir);
510
300
        COUNTER_UPDATE(ADD_CHILD_TIMER(profile, "CreateTabletFromMeta", parent_timer_name),
511
300
                       static_cast<int64_t>(watch.reset()));
512
300
        return new_tablet;
513
300
    }
514
0
    return nullptr;
515
300
}
516
517
Status TabletManager::drop_tablet(TTabletId tablet_id, TReplicaId replica_id,
518
252
                                  bool is_drop_table_or_partition) {
519
252
    return _drop_tablet(tablet_id, replica_id, false, is_drop_table_or_partition, false);
520
252
}
521
522
// Drop specified tablet.
523
Status TabletManager::_drop_tablet(TTabletId tablet_id, TReplicaId replica_id, bool keep_files,
524
256
                                   bool is_drop_table_or_partition, bool had_held_shard_lock) {
525
256
    LOG(INFO) << "begin drop tablet. tablet_id=" << tablet_id << ", replica_id=" << replica_id
526
256
              << ", is_drop_table_or_partition=" << is_drop_table_or_partition
527
256
              << ", keep_files=" << keep_files;
528
256
    DorisMetrics::instance()->drop_tablet_requests_total->increment(1);
529
530
256
    RETURN_IF_ERROR(register_transition_tablet(tablet_id, "drop tablet"));
531
256
    Defer defer {[&]() { unregister_transition_tablet(tablet_id, "drop tablet"); }};
532
533
    // Fetch tablet which need to be dropped
534
256
    TabletSharedPtr to_drop_tablet;
535
256
    {
536
256
        std::unique_lock<std::shared_mutex> wlock(_get_tablets_shard_lock(tablet_id),
537
256
                                                  std::defer_lock);
538
256
        if (!had_held_shard_lock) {
539
252
            wlock.lock();
540
252
        }
541
256
        to_drop_tablet = _get_tablet_unlocked(tablet_id);
542
256
        if (to_drop_tablet == nullptr) {
543
1
            LOG(WARNING) << "fail to drop tablet because it does not exist. "
544
1
                         << "tablet_id=" << tablet_id;
545
1
            return Status::OK();
546
1
        }
547
548
        // We should compare replica id to avoid dropping new cloned tablet.
549
        // Iff request replica id is 0, FE may be an older release, then we drop this tablet as before.
550
255
        if (to_drop_tablet->replica_id() != replica_id && replica_id != 0) {
551
0
            return Status::Aborted("replica_id not match({} vs {})", to_drop_tablet->replica_id(),
552
0
                                   replica_id);
553
0
        }
554
555
255
        _remove_tablet_from_partition(to_drop_tablet);
556
255
        tablet_map_t& tablet_map = _get_tablet_map(tablet_id);
557
255
        tablet_map.erase(tablet_id);
558
255
    }
559
560
0
    to_drop_tablet->clear_cache();
561
562
255
    {
563
        // drop tablet will update tablet meta, should lock
564
255
        std::lock_guard<std::shared_mutex> wrlock(to_drop_tablet->get_header_lock());
565
255
        SCOPED_SIMPLE_TRACE_IF_TIMEOUT(TRACE_TABLET_LOCK_THRESHOLD);
566
        // NOTE: has to update tablet here, but must not update tablet meta directly.
567
        // because other thread may hold the tablet object, they may save meta too.
568
        // If update meta directly here, other thread may override the meta
569
        // and the tablet will be loaded at restart time.
570
        // To avoid this exception, we first set the state of the tablet to `SHUTDOWN`.
571
        //
572
        // Until now, only the restore task uses keep files.
573
255
        RETURN_IF_ERROR(to_drop_tablet->set_tablet_state(TABLET_SHUTDOWN));
574
255
        if (!keep_files) {
575
253
            LOG(INFO) << "set tablet to shutdown state and remove it from memory. "
576
253
                      << "tablet_id=" << tablet_id
577
253
                      << ", tablet_path=" << to_drop_tablet->tablet_path();
578
            // We must record unused remote rowsets path info to OlapMeta before tablet state is marked as TABLET_SHUTDOWN in OlapMeta,
579
            // otherwise if BE shutdown after saving tablet state, these remote rowsets path info will lost.
580
253
            if (is_drop_table_or_partition) {
581
0
                RETURN_IF_ERROR(to_drop_tablet->remove_all_remote_rowsets());
582
0
            }
583
253
            to_drop_tablet->save_meta();
584
253
            {
585
253
                std::lock_guard<std::shared_mutex> wrdlock(_shutdown_tablets_lock);
586
253
                _shutdown_tablets.push_back(to_drop_tablet);
587
253
            }
588
253
        }
589
255
    }
590
591
255
    to_drop_tablet->deregister_tablet_from_dir();
592
255
    g_tablet_meta_schema_columns_count << -to_drop_tablet->tablet_meta()->tablet_columns_num();
593
255
    return Status::OK();
594
255
}
595
596
2.69k
TabletSharedPtr TabletManager::get_tablet(TTabletId tablet_id, bool include_deleted, string* err) {
597
2.69k
    std::shared_lock rdlock(_get_tablets_shard_lock(tablet_id));
598
2.69k
    return _get_tablet_unlocked(tablet_id, include_deleted, err);
599
2.69k
}
600
601
0
std::vector<TabletSharedPtr> TabletManager::get_all_tablet(std::function<bool(Tablet*)>&& filter) {
602
0
    std::vector<TabletSharedPtr> res;
603
0
    for_each_tablet([&](const TabletSharedPtr& tablet) { res.emplace_back(tablet); },
604
0
                    std::move(filter));
605
0
    return res;
606
0
}
607
608
void TabletManager::for_each_tablet(std::function<void(const TabletSharedPtr&)>&& handler,
609
11
                                    std::function<bool(Tablet*)>&& filter) {
610
11
    std::vector<TabletSharedPtr> tablets;
611
11
    for (const auto& tablets_shard : _tablets_shards) {
612
11
        tablets.clear();
613
11
        {
614
11
            std::shared_lock rdlock(tablets_shard.lock);
615
257
            for (const auto& [id, tablet] : tablets_shard.tablet_map) {
616
257
                if (filter(tablet.get())) {
617
257
                    tablets.emplace_back(tablet);
618
257
                }
619
257
            }
620
11
        }
621
257
        for (const auto& tablet : tablets) {
622
257
            handler(tablet);
623
257
        }
624
11
    }
625
11
}
626
627
TabletSharedPtr TabletManager::_get_tablet_unlocked(TTabletId tablet_id, bool include_deleted,
628
2.69k
                                                    string* err) {
629
2.69k
    TabletSharedPtr tablet;
630
2.69k
    tablet = _get_tablet_unlocked(tablet_id);
631
2.69k
    if (tablet == nullptr && include_deleted) {
632
4
        std::shared_lock rdlock(_shutdown_tablets_lock);
633
4
        for (auto& deleted_tablet : _shutdown_tablets) {
634
2
            CHECK(deleted_tablet != nullptr) << "deleted tablet is nullptr";
635
2
            if (deleted_tablet->tablet_id() == tablet_id) {
636
2
                tablet = deleted_tablet;
637
2
                break;
638
2
            }
639
2
        }
640
4
    }
641
642
2.69k
    if (tablet == nullptr) {
643
239
        if (err != nullptr) {
644
1
            *err = "tablet does not exist. " + BackendOptions::get_localhost();
645
1
        }
646
239
        return nullptr;
647
239
    }
648
#ifndef BE_TEST
649
    if (!tablet->is_used()) {
650
        LOG(WARNING) << "tablet cannot be used. tablet=" << tablet_id;
651
        if (err != nullptr) {
652
            *err = "tablet cannot be used. " + BackendOptions::get_localhost();
653
        }
654
        return nullptr;
655
    }
656
#endif
657
658
2.45k
    return tablet;
659
2.69k
}
660
661
TabletSharedPtr TabletManager::get_tablet(TTabletId tablet_id, TabletUid tablet_uid,
662
0
                                          bool include_deleted, string* err) {
663
0
    std::shared_lock rdlock(_get_tablets_shard_lock(tablet_id));
664
0
    TabletSharedPtr tablet = _get_tablet_unlocked(tablet_id, include_deleted, err);
665
0
    if (tablet != nullptr && tablet->tablet_uid() == tablet_uid) {
666
0
        return tablet;
667
0
    }
668
0
    return nullptr;
669
0
}
670
671
0
uint64_t TabletManager::get_rowset_nums() {
672
0
    uint64_t rowset_nums = 0;
673
0
    for_each_tablet([&](const TabletSharedPtr& tablet) { rowset_nums += tablet->version_count(); },
674
0
                    filter_all_tablets);
675
0
    return rowset_nums;
676
0
}
677
678
0
uint64_t TabletManager::get_segment_nums() {
679
0
    uint64_t segment_nums = 0;
680
0
    for_each_tablet([&](const TabletSharedPtr& tablet) { segment_nums += tablet->segment_count(); },
681
0
                    filter_all_tablets);
682
0
    return segment_nums;
683
0
}
684
685
bool TabletManager::get_tablet_id_and_schema_hash_from_path(const string& path,
686
                                                            TTabletId* tablet_id,
687
37
                                                            TSchemaHash* schema_hash) {
688
    // the path like: /data/14/10080/964828783/
689
37
    static re2::RE2 normal_re("/data/\\d+/(\\d+)/(\\d+)($|/)");
690
    // match tablet schema hash data path, for example, the path is /data/1/16791/29998
691
    // 1 is shard id , 16791 is tablet id, 29998 is schema hash
692
37
    if (RE2::PartialMatch(path, normal_re, tablet_id, schema_hash)) {
693
33
        return true;
694
33
    }
695
696
    // If we can't match normal path pattern, this may be a path which is a empty tablet
697
    // directory. Use this pattern to match empty tablet directory. In this case schema_hash
698
    // will be set to zero.
699
4
    static re2::RE2 empty_tablet_re("/data/\\d+/(\\d+)($|/$)");
700
4
    if (!RE2::PartialMatch(path, empty_tablet_re, tablet_id)) {
701
2
        return false;
702
2
    }
703
2
    *schema_hash = 0;
704
2
    return true;
705
4
}
706
707
3
bool TabletManager::get_rowset_id_from_path(const string& path, RowsetId* rowset_id) {
708
    // the path like: /data/14/10080/964828783/02000000000000969144d8725cb62765f9af6cd3125d5a91_0.dat
709
3
    static re2::RE2 re("/data/\\d+/\\d+/\\d+/([A-Fa-f0-9]+)_.*");
710
3
    string id_str;
711
3
    bool ret = RE2::PartialMatch(path, re, &id_str);
712
3
    if (ret) {
713
1
        rowset_id->init(id_str);
714
1
        return true;
715
1
    }
716
2
    return false;
717
3
}
718
719
0
void TabletManager::get_tablet_stat(TTabletStatResult* result) {
720
0
    std::shared_ptr<std::vector<TTabletStat>> local_cache;
721
0
    {
722
0
        std::lock_guard<std::mutex> guard(_tablet_stat_cache_mutex);
723
0
        local_cache = _tablet_stat_list_cache;
724
0
    }
725
0
    result->__set_tablet_stat_list(*local_cache);
726
0
}
727
728
struct TabletScore {
729
    TabletSharedPtr tablet_ptr;
730
    int score;
731
};
732
733
std::vector<TabletSharedPtr> TabletManager::find_best_tablets_to_compaction(
734
        CompactionType compaction_type, DataDir* data_dir,
735
        const std::unordered_set<TabletSharedPtr>& tablet_submitted_compaction, uint32_t* score,
736
        const std::unordered_map<std::string_view, std::shared_ptr<CumulativeCompactionPolicy>>&
737
6
                all_cumulative_compaction_policies) {
738
6
    int64_t now_ms = UnixMillis();
739
6
    const string& compaction_type_str =
740
6
            compaction_type == CompactionType::BASE_COMPACTION ? "base" : "cumulative";
741
6
    uint32_t highest_score = 0;
742
    // find the single compaction tablet
743
6
    uint32_t single_compact_highest_score = 0;
744
6
    TabletSharedPtr best_tablet;
745
6
    TabletSharedPtr best_single_compact_tablet;
746
92
    auto cmp = [](TabletScore left, TabletScore right) { return left.score > right.score; };
747
6
    std::priority_queue<TabletScore, std::vector<TabletScore>, decltype(cmp)> top_tablets(cmp);
748
749
257
    auto handler = [&](const TabletSharedPtr& tablet_ptr) {
750
257
        if (tablet_ptr->tablet_meta()->tablet_schema()->disable_auto_compaction()) {
751
0
            LOG_EVERY_N(INFO, 500) << "Tablet " << tablet_ptr->tablet_id()
752
0
                                   << " will be ignored by automatic compaction tasks since it's "
753
0
                                   << "set to disabled automatic compaction.";
754
0
            return;
755
0
        }
756
757
257
        if (config::enable_skip_tablet_compaction &&
758
257
            tablet_ptr->should_skip_compaction(compaction_type, UnixSeconds())) {
759
2
            return;
760
2
        }
761
255
        if (!tablet_ptr->can_do_compaction(data_dir->path_hash(), compaction_type)) {
762
0
            return;
763
0
        }
764
765
255
        auto search = tablet_submitted_compaction.find(tablet_ptr);
766
255
        if (search != tablet_submitted_compaction.end()) {
767
0
            return;
768
0
        }
769
770
255
        int64_t last_failure_ms = tablet_ptr->last_cumu_compaction_failure_time();
771
255
        if (compaction_type == CompactionType::BASE_COMPACTION) {
772
0
            last_failure_ms = tablet_ptr->last_base_compaction_failure_time();
773
0
        }
774
255
        if (now_ms - last_failure_ms <= config::tablet_sched_delay_time_ms) {
775
0
            VLOG_DEBUG << "Too often to check compaction, skip it. "
776
0
                       << "compaction_type=" << compaction_type_str
777
0
                       << ", last_failure_time_ms=" << last_failure_ms
778
0
                       << ", tablet_id=" << tablet_ptr->tablet_id();
779
0
            return;
780
0
        }
781
782
255
        if (compaction_type == CompactionType::BASE_COMPACTION) {
783
0
            std::unique_lock<std::mutex> lock(tablet_ptr->get_base_compaction_lock(),
784
0
                                              std::try_to_lock);
785
0
            if (!lock.owns_lock()) {
786
0
                LOG(INFO) << "can not get base lock: " << tablet_ptr->tablet_id();
787
0
                return;
788
0
            }
789
255
        } else {
790
255
            std::unique_lock<std::mutex> lock(tablet_ptr->get_cumulative_compaction_lock(),
791
255
                                              std::try_to_lock);
792
255
            if (!lock.owns_lock()) {
793
0
                LOG(INFO) << "can not get cumu lock: " << tablet_ptr->tablet_id();
794
0
                return;
795
0
            }
796
255
        }
797
255
        auto cumulative_compaction_policy = all_cumulative_compaction_policies.at(
798
255
                tablet_ptr->tablet_meta()->compaction_policy());
799
255
        uint32_t current_compaction_score =
800
255
                tablet_ptr->calc_compaction_score(compaction_type, cumulative_compaction_policy);
801
255
        if (current_compaction_score < 5) {
802
11
            tablet_ptr->set_skip_compaction(true, compaction_type, UnixSeconds());
803
11
        }
804
805
        // tablet should do single compaction
806
255
        if (current_compaction_score > single_compact_highest_score &&
807
255
            tablet_ptr->should_fetch_from_peer()) {
808
3
            single_compact_highest_score = current_compaction_score;
809
3
            best_single_compact_tablet = tablet_ptr;
810
3
        }
811
812
255
        if (config::compaction_num_per_round > 1 && !tablet_ptr->should_fetch_from_peer()) {
813
205
            TabletScore ts;
814
205
            ts.score = current_compaction_score;
815
205
            ts.tablet_ptr = tablet_ptr;
816
205
            if ((top_tablets.size() >= config::compaction_num_per_round &&
817
205
                 current_compaction_score > top_tablets.top().score) ||
818
205
                top_tablets.size() < config::compaction_num_per_round) {
819
25
                top_tablets.push(ts);
820
25
                if (top_tablets.size() > config::compaction_num_per_round) {
821
0
                    top_tablets.pop();
822
0
                }
823
25
                if (current_compaction_score > highest_score) {
824
3
                    highest_score = current_compaction_score;
825
3
                }
826
25
            }
827
205
        } else {
828
50
            if (current_compaction_score > highest_score && !tablet_ptr->should_fetch_from_peer()) {
829
3
                highest_score = current_compaction_score;
830
3
                best_tablet = tablet_ptr;
831
3
            }
832
50
        }
833
255
    };
834
835
6
    for_each_tablet(handler, filter_all_tablets);
836
6
    std::vector<TabletSharedPtr> picked_tablet;
837
6
    if (best_tablet != nullptr) {
838
3
        VLOG_CRITICAL << "Found the best tablet for compaction. "
839
0
                      << "compaction_type=" << compaction_type_str
840
0
                      << ", tablet_id=" << best_tablet->tablet_id() << ", path=" << data_dir->path()
841
0
                      << ", highest_score=" << highest_score
842
0
                      << ", fetch from peer: " << best_tablet->should_fetch_from_peer();
843
3
        picked_tablet.emplace_back(std::move(best_tablet));
844
3
    }
845
846
6
    std::vector<TabletSharedPtr> reverse_top_tablets;
847
31
    while (!top_tablets.empty()) {
848
25
        reverse_top_tablets.emplace_back(top_tablets.top().tablet_ptr);
849
25
        top_tablets.pop();
850
25
    }
851
852
31
    for (auto it = reverse_top_tablets.rbegin(); it != reverse_top_tablets.rend(); ++it) {
853
25
        picked_tablet.emplace_back(*it);
854
25
    }
855
856
    // pick single compaction tablet needs the highest score
857
6
    if (best_single_compact_tablet != nullptr && single_compact_highest_score >= highest_score) {
858
2
        VLOG_CRITICAL << "Found the best tablet for single compaction. "
859
0
                      << "compaction_type=" << compaction_type_str
860
0
                      << ", tablet_id=" << best_single_compact_tablet->tablet_id()
861
0
                      << ", path=" << data_dir->path()
862
0
                      << ", highest_score=" << single_compact_highest_score << ", fetch from peer: "
863
0
                      << best_single_compact_tablet->should_fetch_from_peer();
864
2
        picked_tablet.emplace_back(std::move(best_single_compact_tablet));
865
2
    }
866
6
    *score = highest_score > single_compact_highest_score ? highest_score
867
6
                                                          : single_compact_highest_score;
868
6
    return picked_tablet;
869
6
}
870
871
Status TabletManager::load_tablet_from_meta(DataDir* data_dir, TTabletId tablet_id,
872
                                            TSchemaHash schema_hash, std::string_view meta_binary,
873
                                            bool update_meta, bool force, bool restore,
874
4
                                            bool check_path) {
875
4
    TabletMetaSharedPtr tablet_meta(new TabletMeta());
876
4
    Status status = tablet_meta->deserialize(meta_binary);
877
4
    if (!status.ok()) {
878
0
        return Status::Error<HEADER_PB_PARSE_FAILED>(
879
0
                "fail to load tablet because can not parse meta_binary string. tablet_id={}, "
880
0
                "schema_hash={}, path={}, status={}",
881
0
                tablet_id, schema_hash, data_dir->path(), status);
882
0
    }
883
884
    // check if tablet meta is valid
885
4
    if (tablet_meta->tablet_id() != tablet_id || tablet_meta->schema_hash() != schema_hash) {
886
0
        return Status::Error<HEADER_PB_PARSE_FAILED>(
887
0
                "fail to load tablet because meet invalid tablet meta. trying to load "
888
0
                "tablet(tablet_id={}, schema_hash={}), but meet tablet={}, path={}",
889
0
                tablet_id, schema_hash, tablet_meta->tablet_id(), data_dir->path());
890
0
    }
891
4
    if (tablet_meta->tablet_uid().hi == 0 && tablet_meta->tablet_uid().lo == 0) {
892
0
        return Status::Error<HEADER_PB_PARSE_FAILED>(
893
0
                "fail to load tablet because its uid == 0. tablet={}, path={}",
894
0
                tablet_meta->tablet_id(), data_dir->path());
895
0
    }
896
897
4
    if (restore) {
898
        // we're restoring tablet from trash, tablet state should be changed from shutdown back to running
899
0
        tablet_meta->set_tablet_state(TABLET_RUNNING);
900
0
    }
901
902
4
    if (tablet_meta->partition_id() == 0) {
903
1
        LOG(WARNING) << "tablet=" << tablet_id << " load from meta but partition id eq 0";
904
1
    }
905
906
4
    TabletSharedPtr tablet = std::make_shared<Tablet>(_engine, std::move(tablet_meta), data_dir);
907
908
    // NOTE: method load_tablet_from_meta could be called by two cases as below
909
    // case 1: BE start;
910
    // case 2: Clone Task/Restore
911
    // For case 1 doesn't need path check because BE is just starting and not ready,
912
    // just check tablet meta status to judge whether tablet is delete is enough.
913
    // For case 2, If a tablet has just been copied to local BE,
914
    // it may be cleared by gc-thread(see perform_tablet_gc) because the tablet meta may not be loaded to memory.
915
    // So clone task should check path and then failed and retry in this case.
916
4
    if (check_path) {
917
4
        bool exists = true;
918
4
        RETURN_IF_ERROR(io::global_local_filesystem()->exists(tablet->tablet_path(), &exists));
919
4
        if (!exists) {
920
0
            return Status::Error<TABLE_ALREADY_DELETED_ERROR>(
921
0
                    "tablet path not exists, create tablet failed, path={}", tablet->tablet_path());
922
0
        }
923
4
    }
924
925
4
    if (tablet->tablet_meta()->tablet_state() == TABLET_SHUTDOWN) {
926
0
        {
927
0
            std::lock_guard<std::shared_mutex> shutdown_tablets_wrlock(_shutdown_tablets_lock);
928
0
            _shutdown_tablets.push_back(tablet);
929
0
        }
930
0
        return Status::Error<TABLE_ALREADY_DELETED_ERROR>(
931
0
                "fail to load tablet because it is to be deleted. tablet_id={}, schema_hash={}, "
932
0
                "path={}",
933
0
                tablet_id, schema_hash, data_dir->path());
934
0
    }
935
    // NOTE: We do not check tablet's initial version here, because if BE restarts when
936
    // one tablet is doing schema-change, we may meet empty tablet.
937
4
    if (tablet->max_version().first == -1 && tablet->tablet_state() == TABLET_RUNNING) {
938
        // tablet state is invalid, drop tablet
939
0
        return Status::Error<TABLE_INDEX_VALIDATE_ERROR>(
940
0
                "fail to load tablet. it is in running state but without delta. tablet={}, path={}",
941
0
                tablet->tablet_id(), data_dir->path());
942
0
    }
943
944
4
    RETURN_NOT_OK_STATUS_WITH_WARN(
945
4
            tablet->init(),
946
4
            strings::Substitute("tablet init failed. tablet=$0", tablet->tablet_id()));
947
948
4
    RuntimeProfile profile("CreateTablet");
949
4
    std::lock_guard<std::shared_mutex> wrlock(_get_tablets_shard_lock(tablet_id));
950
4
    RETURN_NOT_OK_STATUS_WITH_WARN(
951
4
            _add_tablet_unlocked(tablet_id, tablet, update_meta, force, &profile),
952
4
            strings::Substitute("fail to add tablet. tablet=$0", tablet->tablet_id()));
953
954
4
    return Status::OK();
955
4
}
956
957
Status TabletManager::load_tablet_from_dir(DataDir* store, TTabletId tablet_id,
958
                                           SchemaHash schema_hash, const string& schema_hash_path,
959
3
                                           bool force, bool restore) {
960
3
    LOG(INFO) << "begin to load tablet from dir. "
961
3
              << " tablet_id=" << tablet_id << " schema_hash=" << schema_hash
962
3
              << " path = " << schema_hash_path << " force = " << force << " restore = " << restore;
963
    // not add lock here, because load_tablet_from_meta already add lock
964
3
    std::string header_path = TabletMeta::construct_header_file_path(schema_hash_path, tablet_id);
965
    // should change shard id before load tablet
966
3
    std::string shard_path =
967
3
            path_util::dir_name(path_util::dir_name(path_util::dir_name(header_path)));
968
3
    std::string shard_str = shard_path.substr(shard_path.find_last_of('/') + 1);
969
3
    int32_t shard = stol(shard_str);
970
971
3
    bool exists = false;
972
3
    RETURN_IF_ERROR(io::global_local_filesystem()->exists(header_path, &exists));
973
3
    if (!exists) {
974
0
        return Status::Error<NOT_FOUND>("fail to find header file. [header_path={}]", header_path);
975
0
    }
976
977
3
    TabletMetaSharedPtr tablet_meta(new TabletMeta());
978
3
    if (!tablet_meta->create_from_file(header_path).ok()) {
979
0
        return Status::Error<ENGINE_LOAD_INDEX_TABLE_ERROR>(
980
0
                "fail to load tablet_meta. file_path={}", header_path);
981
0
    }
982
3
    TabletUid tablet_uid = TabletUid::gen_uid();
983
984
    // remove rowset binlog metas
985
3
    auto binlog_metas_file = fmt::format("{}/rowset_binlog_metas.pb", schema_hash_path);
986
3
    bool binlog_metas_file_exists = false;
987
3
    auto file_exists_status =
988
3
            io::global_local_filesystem()->exists(binlog_metas_file, &binlog_metas_file_exists);
989
3
    if (!file_exists_status.ok()) {
990
0
        return file_exists_status;
991
0
    }
992
3
    bool contain_binlog = false;
993
3
    RowsetBinlogMetasPB rowset_binlog_metas_pb;
994
3
    if (binlog_metas_file_exists) {
995
0
        auto binlog_meta_filesize = std::filesystem::file_size(binlog_metas_file);
996
0
        if (binlog_meta_filesize > 0) {
997
0
            contain_binlog = true;
998
0
            RETURN_IF_ERROR(read_pb(binlog_metas_file, &rowset_binlog_metas_pb));
999
0
            VLOG_DEBUG << "load rowset binlog metas from file. file_path=" << binlog_metas_file;
1000
0
        }
1001
0
        RETURN_IF_ERROR(io::global_local_filesystem()->delete_file(binlog_metas_file));
1002
0
    }
1003
3
    if (contain_binlog) {
1004
0
        auto binlog_dir = fmt::format("{}/_binlog", schema_hash_path);
1005
0
        RETURN_IF_ERROR(io::global_local_filesystem()->create_directory(binlog_dir));
1006
1007
0
        std::vector<io::FileInfo> files;
1008
0
        RETURN_IF_ERROR(
1009
0
                io::global_local_filesystem()->list(schema_hash_path, true, &files, &exists));
1010
0
        for (auto& file : files) {
1011
0
            auto& filename = file.file_name;
1012
0
            std::string new_suffix;
1013
0
            std::string old_suffix;
1014
1015
0
            if (filename.ends_with(".binlog")) {
1016
0
                old_suffix = ".binlog";
1017
0
                new_suffix = ".dat";
1018
0
            } else if (filename.ends_with(".binlog-index")) {
1019
0
                old_suffix = ".binlog-index";
1020
0
                new_suffix = ".idx";
1021
0
            } else {
1022
0
                continue;
1023
0
            }
1024
1025
0
            std::string new_filename = filename;
1026
0
            new_filename.replace(filename.size() - old_suffix.size(), old_suffix.size(),
1027
0
                                 new_suffix);
1028
0
            auto from = fmt::format("{}/{}", schema_hash_path, filename);
1029
0
            auto to = fmt::format("{}/_binlog/{}", schema_hash_path, new_filename);
1030
0
            RETURN_IF_ERROR(io::global_local_filesystem()->rename(from, to));
1031
0
        }
1032
1033
0
        auto* meta = store->get_meta();
1034
        // if ingest binlog metas error, it will be gc in gc_unused_binlog_metas
1035
0
        RETURN_IF_ERROR(
1036
0
                RowsetMetaManager::ingest_binlog_metas(meta, tablet_uid, &rowset_binlog_metas_pb));
1037
0
    }
1038
1039
    // has to change shard id here, because meta file maybe copied from other source
1040
    // its shard is different from local shard
1041
3
    tablet_meta->set_shard_id(shard);
1042
    // load dir is called by clone, restore, storage migration
1043
    // should change tablet uid when tablet object changed
1044
3
    tablet_meta->set_tablet_uid(std::move(tablet_uid));
1045
3
    std::string meta_binary;
1046
3
    tablet_meta->serialize(&meta_binary);
1047
3
    RETURN_NOT_OK_STATUS_WITH_WARN(
1048
3
            load_tablet_from_meta(store, tablet_id, schema_hash, meta_binary, true, force, restore,
1049
3
                                  true),
1050
3
            strings::Substitute("fail to load tablet. header_path=$0", header_path));
1051
1052
3
    return Status::OK();
1053
3
}
1054
1055
0
Status TabletManager::report_tablet_info(TTabletInfo* tablet_info) {
1056
0
    LOG(INFO) << "begin to process report tablet info."
1057
0
              << "tablet_id=" << tablet_info->tablet_id;
1058
1059
0
    Status res = Status::OK();
1060
1061
0
    TabletSharedPtr tablet = get_tablet(tablet_info->tablet_id);
1062
0
    if (tablet == nullptr) {
1063
0
        return Status::Error<TABLE_NOT_FOUND>("can't find tablet={}", tablet_info->tablet_id);
1064
0
    }
1065
1066
0
    tablet->build_tablet_report_info(tablet_info);
1067
0
    VLOG_TRACE << "success to process report tablet info.";
1068
0
    return res;
1069
0
}
1070
1071
0
void TabletManager::build_all_report_tablets_info(std::map<TTabletId, TTablet>* tablets_info) {
1072
0
    DCHECK(tablets_info != nullptr);
1073
0
    VLOG_NOTICE << "begin to build all report tablets info";
1074
1075
    // build the expired txn map first, outside the tablet map lock
1076
0
    std::map<TabletInfo, std::vector<int64_t>> expire_txn_map;
1077
0
    _engine.txn_manager()->build_expire_txn_map(&expire_txn_map);
1078
0
    LOG(INFO) << "find expired transactions for " << expire_txn_map.size() << " tablets";
1079
1080
0
    HistogramStat tablet_version_num_hist;
1081
0
    auto local_cache = std::make_shared<std::vector<TTabletStat>>();
1082
0
    auto handler = [&](const TabletSharedPtr& tablet) {
1083
0
        auto& t_tablet = (*tablets_info)[tablet->tablet_id()];
1084
0
        TTabletInfo& tablet_info = t_tablet.tablet_infos.emplace_back();
1085
0
        tablet->build_tablet_report_info(&tablet_info, true, true);
1086
        // find expired transaction corresponding to this tablet
1087
0
        TabletInfo tinfo(tablet->tablet_id(), tablet->tablet_uid());
1088
0
        auto find = expire_txn_map.find(tinfo);
1089
0
        if (find != expire_txn_map.end()) {
1090
0
            tablet_info.__set_transaction_ids(find->second);
1091
0
            expire_txn_map.erase(find);
1092
0
        }
1093
0
        tablet_version_num_hist.add(tablet_info.total_version_count);
1094
0
        auto& t_tablet_stat = local_cache->emplace_back();
1095
0
        t_tablet_stat.__set_tablet_id(tablet_info.tablet_id);
1096
0
        t_tablet_stat.__set_data_size(tablet_info.data_size);
1097
0
        t_tablet_stat.__set_remote_data_size(tablet_info.remote_data_size);
1098
0
        t_tablet_stat.__set_row_count(tablet_info.row_count);
1099
0
        t_tablet_stat.__set_total_version_count(tablet_info.total_version_count);
1100
0
        t_tablet_stat.__set_visible_version_count(tablet_info.visible_version_count);
1101
0
        t_tablet_stat.__set_visible_version(tablet_info.version);
1102
0
        t_tablet_stat.__set_local_index_size(tablet_info.local_index_size);
1103
0
        t_tablet_stat.__set_local_segment_size(tablet_info.local_segment_size);
1104
0
        t_tablet_stat.__set_remote_index_size(tablet_info.remote_index_size);
1105
0
        t_tablet_stat.__set_remote_segment_size(tablet_info.remote_segment_size);
1106
0
    };
1107
0
    for_each_tablet(handler, filter_all_tablets);
1108
1109
0
    {
1110
0
        std::lock_guard<std::mutex> guard(_tablet_stat_cache_mutex);
1111
0
        _tablet_stat_list_cache.swap(local_cache);
1112
0
    }
1113
0
    DorisMetrics::instance()->tablet_version_num_distribution->set_histogram(
1114
0
            tablet_version_num_hist);
1115
0
    LOG(INFO) << "success to build all report tablets info. tablet_count=" << tablets_info->size();
1116
0
}
1117
1118
5
Status TabletManager::start_trash_sweep() {
1119
5
    DBUG_EXECUTE_IF("TabletManager.start_trash_sweep.sleep", DBUG_BLOCK);
1120
5
    std::unique_lock<std::mutex> lock(_gc_tablets_lock, std::defer_lock);
1121
5
    if (!lock.try_lock()) {
1122
0
        return Status::OK();
1123
0
    }
1124
1125
5
    for_each_tablet([](const TabletSharedPtr& tablet) { tablet->delete_expired_stale_rowset(); },
1126
5
                    filter_all_tablets);
1127
1128
5
    std::list<TabletSharedPtr>::iterator last_it;
1129
5
    {
1130
5
        std::shared_lock rdlock(_shutdown_tablets_lock);
1131
5
        last_it = _shutdown_tablets.begin();
1132
5
        if (last_it == _shutdown_tablets.end()) {
1133
0
            return Status::OK();
1134
0
        }
1135
5
    }
1136
1137
10
    auto get_batch_tablets = [this, &last_it](int limit) {
1138
10
        std::vector<TabletSharedPtr> batch_tablets;
1139
10
        std::lock_guard<std::shared_mutex> wrdlock(_shutdown_tablets_lock);
1140
241
        while (last_it != _shutdown_tablets.end() && batch_tablets.size() < limit) {
1141
            // it means current tablet is referenced by other thread
1142
231
            if (last_it->use_count() > 1) {
1143
6
                last_it++;
1144
225
            } else {
1145
225
                batch_tablets.push_back(*last_it);
1146
225
                last_it = _shutdown_tablets.erase(last_it);
1147
225
            }
1148
231
        }
1149
1150
10
        return batch_tablets;
1151
10
    };
1152
1153
5
    std::list<TabletSharedPtr> failed_tablets;
1154
    // return true if need continue delete
1155
6
    auto delete_one_batch = [this, get_batch_tablets, &failed_tablets]() -> bool {
1156
6
        int limit = 200;
1157
10
        for (;;) {
1158
10
            auto batch_tablets = get_batch_tablets(limit);
1159
225
            for (const auto& tablet : batch_tablets) {
1160
225
                if (_move_tablet_to_trash(tablet)) {
1161
225
                    limit--;
1162
225
                } else {
1163
0
                    failed_tablets.push_back(tablet);
1164
0
                }
1165
225
            }
1166
10
            if (limit <= 0) {
1167
1
                return true;
1168
1
            }
1169
9
            if (batch_tablets.empty()) {
1170
5
                return false;
1171
5
            }
1172
9
        }
1173
1174
0
        return false;
1175
6
    };
1176
1177
6
    while (delete_one_batch()) {
1178
#ifndef BE_TEST
1179
        sleep(1);
1180
#endif
1181
1
    }
1182
1183
5
    if (!failed_tablets.empty()) {
1184
0
        std::lock_guard<std::shared_mutex> wrlock(_shutdown_tablets_lock);
1185
0
        _shutdown_tablets.splice(_shutdown_tablets.end(), failed_tablets);
1186
0
    }
1187
1188
5
    return Status::OK();
1189
5
}
1190
1191
225
bool TabletManager::_move_tablet_to_trash(const TabletSharedPtr& tablet) {
1192
225
    RETURN_IF_ERROR(register_transition_tablet(tablet->tablet_id(), "move to trash"));
1193
225
    Defer defer {[&]() { unregister_transition_tablet(tablet->tablet_id(), "move to trash"); }};
1194
1195
225
    TabletSharedPtr tablet_in_not_shutdown = get_tablet(tablet->tablet_id());
1196
225
    if (tablet_in_not_shutdown) {
1197
0
        TSchemaHash schema_hash_not_shutdown = tablet_in_not_shutdown->schema_hash();
1198
0
        size_t path_hash_not_shutdown = tablet_in_not_shutdown->data_dir()->path_hash();
1199
0
        if (tablet->schema_hash() == schema_hash_not_shutdown &&
1200
0
            tablet->data_dir()->path_hash() == path_hash_not_shutdown) {
1201
0
            tablet->clear_cache();
1202
            // shard_id in memory not eq shard_id in shutdown
1203
0
            if (tablet_in_not_shutdown->tablet_path() != tablet->tablet_path()) {
1204
0
                LOG(INFO) << "tablet path not eq shutdown tablet path, move it to trash, tablet_id="
1205
0
                          << tablet_in_not_shutdown->tablet_id()
1206
0
                          << ", mem manager tablet path=" << tablet_in_not_shutdown->tablet_path()
1207
0
                          << ", shutdown tablet path=" << tablet->tablet_path();
1208
0
                return tablet->data_dir()->move_to_trash(tablet->tablet_path());
1209
0
            } else {
1210
0
                LOG(INFO) << "tablet path eq shutdown tablet path, not move to trash, tablet_id="
1211
0
                          << tablet_in_not_shutdown->tablet_id()
1212
0
                          << ", mem manager tablet path=" << tablet_in_not_shutdown->tablet_path()
1213
0
                          << ", shutdown tablet path=" << tablet->tablet_path();
1214
0
                return true;
1215
0
            }
1216
0
        }
1217
0
    }
1218
1219
225
    TabletMetaSharedPtr tablet_meta(new TabletMeta());
1220
225
    int64_t get_meta_ts = MonotonicMicros();
1221
225
    Status check_st = TabletMetaManager::get_meta(tablet->data_dir(), tablet->tablet_id(),
1222
225
                                                  tablet->schema_hash(), tablet_meta);
1223
225
    if (check_st.ok()) {
1224
225
        if (tablet_meta->tablet_state() != TABLET_SHUTDOWN ||
1225
225
            tablet_meta->tablet_uid() != tablet->tablet_uid()) {
1226
0
            LOG(WARNING) << "tablet's state changed to normal, skip remove dirs"
1227
0
                         << " tablet id = " << tablet_meta->tablet_id()
1228
0
                         << " schema hash = " << tablet_meta->schema_hash()
1229
0
                         << " old tablet_uid=" << tablet->tablet_uid()
1230
0
                         << " cur tablet_uid=" << tablet_meta->tablet_uid();
1231
0
            return true;
1232
0
        }
1233
1234
225
        tablet->clear_cache();
1235
1236
        // move data to trash
1237
225
        const auto& tablet_path = tablet->tablet_path();
1238
225
        bool exists = false;
1239
225
        Status exists_st = io::global_local_filesystem()->exists(tablet_path, &exists);
1240
225
        if (!exists_st) {
1241
0
            return false;
1242
0
        }
1243
225
        if (exists) {
1244
            // take snapshot of tablet meta
1245
225
            auto meta_file_path = fmt::format("{}/{}.hdr", tablet_path, tablet->tablet_id());
1246
225
            int64_t save_meta_ts = MonotonicMicros();
1247
225
            auto save_st = tablet->tablet_meta()->save(meta_file_path);
1248
225
            if (!save_st.ok()) {
1249
0
                LOG(WARNING) << "failed to save meta, tablet_id=" << tablet_meta->tablet_id()
1250
0
                             << ", tablet_uid=" << tablet_meta->tablet_uid()
1251
0
                             << ", error=" << save_st;
1252
0
                return false;
1253
0
            }
1254
225
            int64_t now = MonotonicMicros();
1255
225
            LOG(INFO) << "start to move tablet to trash. " << tablet_path
1256
225
                      << ". rocksdb get meta cost " << (save_meta_ts - get_meta_ts)
1257
225
                      << " us, rocksdb save meta cost " << (now - save_meta_ts) << " us";
1258
225
            Status rm_st = tablet->data_dir()->move_to_trash(tablet_path);
1259
225
            if (!rm_st.ok()) {
1260
0
                LOG(WARNING) << "fail to move dir to trash. " << tablet_path;
1261
0
                return false;
1262
0
            }
1263
225
        }
1264
        // remove tablet meta
1265
225
        auto remove_st = TabletMetaManager::remove(tablet->data_dir(), tablet->tablet_id(),
1266
225
                                                   tablet->schema_hash());
1267
225
        if (!remove_st.ok()) {
1268
0
            LOG(WARNING) << "failed to remove meta, tablet_id=" << tablet_meta->tablet_id()
1269
0
                         << ", tablet_uid=" << tablet_meta->tablet_uid() << ", error=" << remove_st;
1270
0
            return false;
1271
0
        }
1272
225
        LOG(INFO) << "successfully move tablet to trash. "
1273
225
                  << "tablet_id=" << tablet->tablet_id()
1274
225
                  << ", schema_hash=" << tablet->schema_hash() << ", tablet_path=" << tablet_path;
1275
225
        return true;
1276
225
    } else {
1277
0
        tablet->clear_cache();
1278
        // if could not find tablet info in meta store, then check if dir existed
1279
0
        const auto& tablet_path = tablet->tablet_path();
1280
0
        bool exists = false;
1281
0
        Status exists_st = io::global_local_filesystem()->exists(tablet_path, &exists);
1282
0
        if (!exists_st) {
1283
0
            return false;
1284
0
        }
1285
0
        if (exists) {
1286
0
            if (check_st.is<META_KEY_NOT_FOUND>()) {
1287
0
                LOG(INFO) << "could not find tablet meta in rocksdb, so just delete it path "
1288
0
                          << "tablet_id=" << tablet->tablet_id()
1289
0
                          << ", schema_hash=" << tablet->schema_hash()
1290
0
                          << ", delete tablet_path=" << tablet_path;
1291
0
                RETURN_IF_ERROR(io::global_local_filesystem()->delete_directory(tablet_path));
1292
0
                RETURN_IF_ERROR(DataDir::delete_tablet_parent_path_if_empty(tablet_path));
1293
0
                return true;
1294
0
            }
1295
0
            LOG(WARNING) << "errors while load meta from store, skip this tablet. "
1296
0
                         << "tablet_id=" << tablet->tablet_id()
1297
0
                         << ", schema_hash=" << tablet->schema_hash();
1298
0
            return false;
1299
0
        } else {
1300
0
            LOG(INFO) << "could not find tablet dir, skip it and remove it from gc-queue. "
1301
0
                      << "tablet_id=" << tablet->tablet_id()
1302
0
                      << ", schema_hash=" << tablet->schema_hash()
1303
0
                      << ", tablet_path=" << tablet_path;
1304
0
            return true;
1305
0
        }
1306
0
    }
1307
225
}
1308
1309
493
Status TabletManager::register_transition_tablet(int64_t tablet_id, std::string reason) {
1310
493
    tablets_shard& shard = _get_tablets_shard(tablet_id);
1311
493
    std::thread::id thread_id = std::this_thread::get_id();
1312
493
    std::lock_guard<std::mutex> lk(shard.lock_for_transition);
1313
493
    if (auto search = shard.tablets_under_transition.find(tablet_id);
1314
493
        search == shard.tablets_under_transition.end()) {
1315
        // not found
1316
491
        shard.tablets_under_transition[tablet_id] = std::make_tuple(reason, thread_id, 1);
1317
491
        LOG(INFO) << "add tablet_id= " << tablet_id << " to map, reason=" << reason
1318
491
                  << ", lock times=1, thread_id_in_map=" << thread_id;
1319
491
        return Status::OK();
1320
491
    } else {
1321
        // found
1322
2
        auto& [r, thread_id_in_map, lock_times] = search->second;
1323
2
        if (thread_id != thread_id_in_map) {
1324
            // other thread, failed
1325
0
            LOG(INFO) << "tablet_id = " << tablet_id << " is doing " << r
1326
0
                      << ", thread_id_in_map=" << thread_id_in_map << " , add reason=" << reason
1327
0
                      << ", thread_id=" << thread_id;
1328
0
            return Status::InternalError<false>("{} failed try later, tablet_id={}", reason,
1329
0
                                                tablet_id);
1330
0
        }
1331
        // add lock times
1332
2
        ++lock_times;
1333
2
        LOG(INFO) << "add tablet_id= " << tablet_id << " to map, reason=" << reason
1334
2
                  << ", lock times=" << lock_times << ", thread_id_in_map=" << thread_id_in_map;
1335
2
        return Status::OK();
1336
2
    }
1337
493
}
1338
1339
493
void TabletManager::unregister_transition_tablet(int64_t tablet_id, std::string reason) {
1340
493
    tablets_shard& shard = _get_tablets_shard(tablet_id);
1341
493
    std::thread::id thread_id = std::this_thread::get_id();
1342
493
    std::lock_guard<std::mutex> lk(shard.lock_for_transition);
1343
493
    if (auto search = shard.tablets_under_transition.find(tablet_id);
1344
493
        search == shard.tablets_under_transition.end()) {
1345
        // impossible, bug
1346
0
        DCHECK(false) << "tablet " << tablet_id
1347
0
                      << " must be found, before unreg must have been reg";
1348
493
    } else {
1349
493
        auto& [r, thread_id_in_map, lock_times] = search->second;
1350
493
        if (thread_id_in_map != thread_id) {
1351
            // impossible, bug
1352
0
            DCHECK(false) << "tablet " << tablet_id << " unreg thread must same reg thread";
1353
0
        }
1354
        // sub lock times
1355
493
        --lock_times;
1356
493
        if (lock_times != 0) {
1357
2
            LOG(INFO) << "erase tablet_id= " << tablet_id << " from map, reason=" << reason
1358
2
                      << ", left=" << lock_times << ", thread_id_in_map=" << thread_id_in_map;
1359
491
        } else {
1360
491
            LOG(INFO) << "erase tablet_id= " << tablet_id << " from map, reason=" << reason
1361
491
                      << ", thread_id_in_map=" << thread_id_in_map;
1362
491
            shard.tablets_under_transition.erase(tablet_id);
1363
491
        }
1364
493
    }
1365
493
}
1366
1367
void TabletManager::try_delete_unused_tablet_path(DataDir* data_dir, TTabletId tablet_id,
1368
                                                  SchemaHash schema_hash,
1369
                                                  const string& schema_hash_path,
1370
10
                                                  int16_t shard_id) {
1371
    // acquire the read lock, so that there is no creating tablet or load tablet from meta tasks
1372
    // create tablet and load tablet task should check whether the dir exists
1373
10
    tablets_shard& shard = _get_tablets_shard(tablet_id);
1374
10
    std::shared_lock rdlock(shard.lock);
1375
1376
    // check if meta already exists
1377
10
    TabletMetaSharedPtr tablet_meta(new TabletMeta());
1378
10
    Status check_st = TabletMetaManager::get_meta(data_dir, tablet_id, schema_hash, tablet_meta);
1379
10
    if (check_st.ok() && tablet_meta->shard_id() == shard_id) {
1380
0
        return;
1381
0
    }
1382
1383
10
    LOG(INFO) << "tablet meta not exists, try delete tablet path " << schema_hash_path;
1384
1385
10
    bool succ = register_transition_tablet(tablet_id, "path gc");
1386
10
    if (!succ) {
1387
0
        return;
1388
0
    }
1389
10
    Defer defer {[&]() { unregister_transition_tablet(tablet_id, "path gc"); }};
1390
1391
10
    TabletSharedPtr tablet = _get_tablet_unlocked(tablet_id);
1392
10
    if (tablet != nullptr && tablet->tablet_path() == schema_hash_path) {
1393
0
        LOG(INFO) << "tablet exists, skip delete the path " << schema_hash_path;
1394
0
        return;
1395
0
    }
1396
1397
    // TODO(ygl): may do other checks in the future
1398
10
    bool exists = false;
1399
10
    Status exists_st = io::global_local_filesystem()->exists(schema_hash_path, &exists);
1400
10
    if (exists_st && exists) {
1401
10
        LOG(INFO) << "start to move tablet to trash. tablet_path = " << schema_hash_path;
1402
10
        Status rm_st = data_dir->move_to_trash(schema_hash_path);
1403
10
        if (!rm_st.ok()) {
1404
0
            LOG(WARNING) << "fail to move dir to trash. dir=" << schema_hash_path;
1405
10
        } else {
1406
10
            LOG(INFO) << "move path " << schema_hash_path << " to trash successfully";
1407
10
        }
1408
10
    }
1409
10
}
1410
1411
void TabletManager::update_root_path_info(std::map<string, DataDirInfo>* path_map,
1412
0
                                          size_t* tablet_count) {
1413
0
    DCHECK(tablet_count);
1414
0
    *tablet_count = 0;
1415
0
    auto filter = [path_map, tablet_count](Tablet* t) -> bool {
1416
0
        ++(*tablet_count);
1417
0
        auto iter = path_map->find(t->data_dir()->path());
1418
0
        return iter != path_map->end() && iter->second.is_used;
1419
0
    };
1420
1421
0
    auto handler = [&](const TabletSharedPtr& tablet) {
1422
0
        auto& data_dir_info = (*path_map)[tablet->data_dir()->path()];
1423
0
        data_dir_info.local_used_capacity += tablet->tablet_local_size();
1424
0
        data_dir_info.remote_used_capacity += tablet->tablet_remote_size();
1425
0
    };
1426
1427
0
    for_each_tablet(handler, filter);
1428
0
}
1429
1430
void TabletManager::get_partition_related_tablets(int64_t partition_id,
1431
0
                                                  std::set<TabletInfo>* tablet_infos) {
1432
0
    std::shared_lock rdlock(_partitions_lock);
1433
0
    auto it = _partitions.find(partition_id);
1434
0
    if (it != _partitions.end()) {
1435
0
        *tablet_infos = it->second.tablets;
1436
0
    }
1437
0
}
1438
1439
0
void TabletManager::get_partitions_visible_version(std::map<int64_t, int64_t>* partitions_version) {
1440
0
    std::shared_lock rdlock(_partitions_lock);
1441
0
    for (const auto& [partition_id, partition] : _partitions) {
1442
0
        partitions_version->insert(
1443
0
                {partition_id, partition.visible_version->version.load(std::memory_order_relaxed)});
1444
0
    }
1445
0
}
1446
1447
void TabletManager::update_partitions_visible_version(
1448
0
        const std::map<int64_t, int64_t>& partitions_version) {
1449
0
    std::shared_lock rdlock(_partitions_lock);
1450
0
    for (auto [partition_id, version] : partitions_version) {
1451
0
        auto it = _partitions.find(partition_id);
1452
0
        if (it != _partitions.end()) {
1453
0
            it->second.visible_version->update_version_monoto(version);
1454
0
        }
1455
0
    }
1456
0
}
1457
1458
0
void TabletManager::do_tablet_meta_checkpoint(DataDir* data_dir) {
1459
0
    auto filter = [data_dir](Tablet* tablet) -> bool {
1460
0
        return tablet->tablet_state() == TABLET_RUNNING &&
1461
0
               tablet->data_dir()->path_hash() == data_dir->path_hash() && tablet->is_used() &&
1462
0
               tablet->init_succeeded();
1463
0
    };
1464
1465
0
    std::vector<TabletSharedPtr> related_tablets = get_all_tablet(filter);
1466
0
    int counter = 0;
1467
0
    MonotonicStopWatch watch;
1468
0
    watch.start();
1469
0
    for (TabletSharedPtr tablet : related_tablets) {
1470
0
        if (tablet->do_tablet_meta_checkpoint()) {
1471
0
            ++counter;
1472
0
        }
1473
0
    }
1474
0
    int64_t cost = watch.elapsed_time() / 1000 / 1000;
1475
0
    LOG(INFO) << "finish to do meta checkpoint on dir: " << data_dir->path()
1476
0
              << ", number: " << counter << ", cost(ms): " << cost;
1477
0
}
1478
1479
Status TabletManager::_create_tablet_meta_unlocked(const TCreateTabletReq& request, DataDir* store,
1480
                                                   const bool is_schema_change,
1481
                                                   const Tablet* base_tablet,
1482
300
                                                   TabletMetaSharedPtr* tablet_meta) {
1483
300
    uint32_t next_unique_id = 0;
1484
300
    std::unordered_map<uint32_t, uint32_t> col_idx_to_unique_id;
1485
300
    if (!is_schema_change) {
1486
2.15k
        for (uint32_t col_idx = 0; col_idx < request.tablet_schema.columns.size(); ++col_idx) {
1487
1.85k
            col_idx_to_unique_id[col_idx] = col_idx;
1488
1.85k
        }
1489
300
        next_unique_id = request.tablet_schema.columns.size();
1490
300
    } else {
1491
0
        next_unique_id = base_tablet->next_unique_id();
1492
0
        auto& new_columns = request.tablet_schema.columns;
1493
0
        for (uint32_t new_col_idx = 0; new_col_idx < new_columns.size(); ++new_col_idx) {
1494
0
            const TColumn& column = new_columns[new_col_idx];
1495
            // For schema change, compare old_tablet and new_tablet:
1496
            // 1. if column exist in both new_tablet and old_tablet, choose the column's
1497
            //    unique_id in old_tablet to be the column's ordinal number in new_tablet
1498
            // 2. if column exists only in new_tablet, assign next_unique_id of old_tablet
1499
            //    to the new column
1500
0
            int32_t old_col_idx = base_tablet->tablet_schema()->field_index(column.column_name);
1501
0
            if (old_col_idx != -1) {
1502
0
                uint32_t old_unique_id =
1503
0
                        base_tablet->tablet_schema()->column(old_col_idx).unique_id();
1504
0
                col_idx_to_unique_id[new_col_idx] = old_unique_id;
1505
0
            } else {
1506
                // Not exist in old tablet, it is a new added column
1507
0
                col_idx_to_unique_id[new_col_idx] = next_unique_id++;
1508
0
            }
1509
0
        }
1510
0
    }
1511
300
    VLOG_NOTICE << "creating tablet meta. next_unique_id=" << next_unique_id;
1512
1513
    // We generate a new tablet_uid for this new tablet.
1514
300
    uint64_t shard_id = store->get_shard();
1515
300
    *tablet_meta = TabletMeta::create(request, TabletUid::gen_uid(), shard_id, next_unique_id,
1516
300
                                      col_idx_to_unique_id);
1517
300
    if (request.__isset.storage_format) {
1518
52
        if (request.storage_format == TStorageFormat::DEFAULT) {
1519
0
            (*tablet_meta)->set_preferred_rowset_type(_engine.default_rowset_type());
1520
52
        } else if (request.storage_format == TStorageFormat::V1) {
1521
0
            (*tablet_meta)->set_preferred_rowset_type(ALPHA_ROWSET);
1522
52
        } else if (request.storage_format == TStorageFormat::V2) {
1523
52
            (*tablet_meta)->set_preferred_rowset_type(BETA_ROWSET);
1524
52
        } else {
1525
0
            return Status::Error<CE_CMD_PARAMS_ERROR>("invalid TStorageFormat: {}",
1526
0
                                                      request.storage_format);
1527
0
        }
1528
52
    }
1529
300
    return Status::OK();
1530
300
}
1531
1532
3.56k
TabletSharedPtr TabletManager::_get_tablet_unlocked(TTabletId tablet_id) {
1533
3.56k
    VLOG_NOTICE << "begin to get tablet. tablet_id=" << tablet_id;
1534
3.56k
    tablet_map_t& tablet_map = _get_tablet_map(tablet_id);
1535
3.56k
    const auto& iter = tablet_map.find(tablet_id);
1536
3.56k
    if (iter != tablet_map.end()) {
1537
3.01k
        return iter->second;
1538
3.01k
    }
1539
552
    return nullptr;
1540
3.56k
}
1541
1542
304
void TabletManager::_add_tablet_to_partition(const TabletSharedPtr& tablet) {
1543
304
    std::lock_guard<std::shared_mutex> wrlock(_partitions_lock);
1544
304
    auto& partition = _partitions[tablet->partition_id()];
1545
304
    partition.tablets.insert(tablet->get_tablet_info());
1546
304
    tablet->set_visible_version(
1547
304
            std::static_pointer_cast<const VersionWithTime>(partition.visible_version));
1548
304
}
1549
1550
255
void TabletManager::_remove_tablet_from_partition(const TabletSharedPtr& tablet) {
1551
255
    tablet->set_visible_version(nullptr);
1552
255
    std::lock_guard<std::shared_mutex> wrlock(_partitions_lock);
1553
255
    auto it = _partitions.find(tablet->partition_id());
1554
255
    if (it == _partitions.end()) {
1555
0
        return;
1556
0
    }
1557
1558
255
    auto& tablets = it->second.tablets;
1559
255
    tablets.erase(tablet->get_tablet_info());
1560
255
    if (tablets.empty()) {
1561
32
        _partitions.erase(it);
1562
32
    }
1563
255
}
1564
1565
void TabletManager::obtain_specific_quantity_tablets(vector<TabletInfo>& tablets_info,
1566
0
                                                     int64_t num) {
1567
0
    for (const auto& tablets_shard : _tablets_shards) {
1568
0
        std::shared_lock rdlock(tablets_shard.lock);
1569
0
        for (const auto& item : tablets_shard.tablet_map) {
1570
0
            TabletSharedPtr tablet = item.second;
1571
0
            if (tablets_info.size() >= num) {
1572
0
                return;
1573
0
            }
1574
0
            if (tablet == nullptr) {
1575
0
                continue;
1576
0
            }
1577
0
            tablets_info.push_back(tablet->get_tablet_info());
1578
0
        }
1579
0
    }
1580
0
}
1581
1582
3.25k
std::shared_mutex& TabletManager::_get_tablets_shard_lock(TTabletId tabletId) {
1583
3.25k
    return _get_tablets_shard(tabletId).lock;
1584
3.25k
}
1585
1586
4.45k
TabletManager::tablet_map_t& TabletManager::_get_tablet_map(TTabletId tabletId) {
1587
4.45k
    return _get_tablets_shard(tabletId).tablet_map;
1588
4.45k
}
1589
1590
8.70k
TabletManager::tablets_shard& TabletManager::_get_tablets_shard(TTabletId tabletId) {
1591
8.70k
    return _tablets_shards[tabletId & _tablets_shards_mask];
1592
8.70k
}
1593
1594
void TabletManager::get_tablets_distribution_on_different_disks(
1595
        std::map<int64_t, std::map<DataDir*, int64_t>>& tablets_num_on_disk,
1596
0
        std::map<int64_t, std::map<DataDir*, std::vector<TabletSize>>>& tablets_info_on_disk) {
1597
0
    std::vector<DataDir*> data_dirs = _engine.get_stores();
1598
0
    std::map<int64_t, Partition> partitions;
1599
0
    {
1600
        // When drop tablet, '_partitions_lock' is locked in 'tablet_shard_lock'.
1601
        // To avoid locking 'tablet_shard_lock' in '_partitions_lock', we lock and
1602
        // copy _partitions here.
1603
0
        std::shared_lock rdlock(_partitions_lock);
1604
0
        partitions = _partitions;
1605
0
    }
1606
1607
0
    for (const auto& [partition_id, partition] : partitions) {
1608
0
        std::map<DataDir*, int64_t> tablets_num;
1609
0
        std::map<DataDir*, std::vector<TabletSize>> tablets_info;
1610
0
        for (auto* data_dir : data_dirs) {
1611
0
            tablets_num[data_dir] = 0;
1612
0
        }
1613
1614
0
        for (const auto& tablet_info : partition.tablets) {
1615
            // get_tablet() will hold 'tablet_shard_lock'
1616
0
            TabletSharedPtr tablet = get_tablet(tablet_info.tablet_id);
1617
0
            if (tablet == nullptr) {
1618
0
                continue;
1619
0
            }
1620
0
            DataDir* data_dir = tablet->data_dir();
1621
0
            size_t tablet_footprint = tablet->tablet_footprint();
1622
0
            tablets_num[data_dir]++;
1623
0
            TabletSize tablet_size(tablet_info.tablet_id, tablet_footprint);
1624
0
            tablets_info[data_dir].push_back(tablet_size);
1625
0
        }
1626
0
        tablets_num_on_disk[partition_id] = tablets_num;
1627
0
        tablets_info_on_disk[partition_id] = tablets_info;
1628
0
    }
1629
0
}
1630
1631
struct SortCtx {
1632
    SortCtx(TabletSharedPtr tablet, RowsetSharedPtr rowset, int64_t cooldown_timestamp,
1633
            int64_t file_size)
1634
0
            : tablet(tablet), cooldown_timestamp(cooldown_timestamp), file_size(file_size) {}
1635
    TabletSharedPtr tablet;
1636
    RowsetSharedPtr rowset;
1637
    // to ensure the tablet with -1 would always be greater than other
1638
    uint64_t cooldown_timestamp;
1639
    int64_t file_size;
1640
0
    bool operator<(const SortCtx& other) const {
1641
0
        if (this->cooldown_timestamp == other.cooldown_timestamp) {
1642
0
            return this->file_size > other.file_size;
1643
0
        }
1644
0
        return this->cooldown_timestamp < other.cooldown_timestamp;
1645
0
    }
1646
};
1647
1648
void TabletManager::get_cooldown_tablets(std::vector<TabletSharedPtr>* tablets,
1649
                                         std::vector<RowsetSharedPtr>* rowsets,
1650
0
                                         std::function<bool(const TabletSharedPtr&)> skip_tablet) {
1651
0
    std::vector<SortCtx> sort_ctx_vec;
1652
0
    std::vector<std::weak_ptr<Tablet>> candidates;
1653
0
    for_each_tablet([&](const TabletSharedPtr& tablet) { candidates.emplace_back(tablet); },
1654
0
                    filter_all_tablets);
1655
0
    auto get_cooldown_tablet = [&sort_ctx_vec, &skip_tablet](std::weak_ptr<Tablet>& t) {
1656
0
        const TabletSharedPtr& tablet = t.lock();
1657
0
        RowsetSharedPtr rowset = nullptr;
1658
0
        if (UNLIKELY(nullptr == tablet)) {
1659
0
            return;
1660
0
        }
1661
0
        int64_t cooldown_timestamp = -1;
1662
0
        size_t file_size = -1;
1663
0
        if (!skip_tablet(tablet) &&
1664
0
            (rowset = tablet->need_cooldown(&cooldown_timestamp, &file_size))) {
1665
0
            sort_ctx_vec.emplace_back(tablet, rowset, cooldown_timestamp, file_size);
1666
0
        }
1667
0
    };
1668
0
    std::for_each(candidates.begin(), candidates.end(), get_cooldown_tablet);
1669
1670
0
    std::sort(sort_ctx_vec.begin(), sort_ctx_vec.end());
1671
1672
0
    for (SortCtx& ctx : sort_ctx_vec) {
1673
0
        VLOG_DEBUG << "get cooldown tablet: " << ctx.tablet->tablet_id();
1674
0
        tablets->push_back(std::move(ctx.tablet));
1675
0
        rowsets->push_back(std::move(ctx.rowset));
1676
0
    }
1677
0
}
1678
1679
0
void TabletManager::get_all_tablets_storage_format(TCheckStorageFormatResult* result) {
1680
0
    DCHECK(result != nullptr);
1681
0
    auto handler = [result](const TabletSharedPtr& tablet) {
1682
0
        if (tablet->all_beta()) {
1683
0
            result->v2_tablets.push_back(tablet->tablet_id());
1684
0
        } else {
1685
0
            result->v1_tablets.push_back(tablet->tablet_id());
1686
0
        }
1687
0
    };
1688
1689
0
    for_each_tablet(handler, filter_all_tablets);
1690
0
    result->__isset.v1_tablets = true;
1691
0
    result->__isset.v2_tablets = true;
1692
0
}
1693
1694
0
std::set<int64_t> TabletManager::check_all_tablet_segment(bool repair) {
1695
0
    std::set<int64_t> bad_tablets;
1696
0
    std::map<int64_t, std::vector<int64_t>> repair_shard_bad_tablets;
1697
0
    auto handler = [&](const TabletSharedPtr& tablet) {
1698
0
        if (!tablet->check_all_rowset_segment()) {
1699
0
            int64_t tablet_id = tablet->tablet_id();
1700
0
            bad_tablets.insert(tablet_id);
1701
0
            if (repair) {
1702
0
                repair_shard_bad_tablets[tablet_id & _tablets_shards_mask].push_back(tablet_id);
1703
0
            }
1704
0
        }
1705
0
    };
1706
0
    for_each_tablet(handler, filter_all_tablets);
1707
1708
0
    for (const auto& [shard_index, shard_tablets] : repair_shard_bad_tablets) {
1709
0
        auto& tablets_shard = _tablets_shards[shard_index];
1710
0
        auto& tablet_map = tablets_shard.tablet_map;
1711
0
        std::lock_guard<std::shared_mutex> wrlock(tablets_shard.lock);
1712
0
        for (auto tablet_id : shard_tablets) {
1713
0
            auto it = tablet_map.find(tablet_id);
1714
0
            if (it == tablet_map.end()) {
1715
0
                bad_tablets.erase(tablet_id);
1716
0
                LOG(WARNING) << "Bad tablet has be removed. tablet_id=" << tablet_id;
1717
0
            } else {
1718
0
                const auto& tablet = it->second;
1719
0
                static_cast<void>(tablet->set_tablet_state(TABLET_SHUTDOWN));
1720
0
                tablet->save_meta();
1721
0
                {
1722
0
                    std::lock_guard<std::shared_mutex> shutdown_tablets_wrlock(
1723
0
                            _shutdown_tablets_lock);
1724
0
                    _shutdown_tablets.push_back(tablet);
1725
0
                }
1726
0
                LOG(WARNING) << "There are some segments lost, set tablet to shutdown state."
1727
0
                             << "tablet_id=" << tablet->tablet_id()
1728
0
                             << ", tablet_path=" << tablet->tablet_path();
1729
0
            }
1730
0
        }
1731
0
    }
1732
1733
0
    return bad_tablets;
1734
0
}
1735
1736
bool TabletManager::update_tablet_partition_id(::doris::TPartitionId partition_id,
1737
0
                                               ::doris::TTabletId tablet_id) {
1738
0
    std::shared_lock rdlock(_get_tablets_shard_lock(tablet_id));
1739
0
    TabletSharedPtr tablet = _get_tablet_unlocked(tablet_id);
1740
0
    if (tablet == nullptr) {
1741
0
        LOG(WARNING) << "get tablet err partition_id: " << partition_id
1742
0
                     << " tablet_id:" << tablet_id;
1743
0
        return false;
1744
0
    }
1745
0
    _remove_tablet_from_partition(tablet);
1746
0
    auto st = tablet->tablet_meta()->set_partition_id(partition_id);
1747
0
    if (!st.ok()) {
1748
0
        LOG(WARNING) << "set partition id err partition_id: " << partition_id
1749
0
                     << " tablet_id:" << tablet_id;
1750
0
        return false;
1751
0
    }
1752
0
    _add_tablet_to_partition(tablet);
1753
0
    return true;
1754
0
}
1755
1756
void TabletManager::get_topn_tablet_delete_bitmap_score(
1757
0
        uint64_t* max_delete_bitmap_score, uint64_t* max_base_rowset_delete_bitmap_score) {
1758
0
    int64_t max_delete_bitmap_score_tablet_id = 0;
1759
0
    int64_t max_base_rowset_delete_bitmap_score_tablet_id = 0;
1760
0
    OlapStopWatch watch;
1761
0
    uint64_t total_delete_map_count = 0;
1762
0
    int n = config::check_tablet_delete_bitmap_score_top_n;
1763
0
    std::vector<std::pair<std::shared_ptr<Tablet>, int64_t>> buf;
1764
0
    buf.reserve(n + 1);
1765
0
    auto handler = [&](const TabletSharedPtr& tablet) {
1766
0
        uint64_t delete_bitmap_count =
1767
0
                tablet->tablet_meta()->delete_bitmap().get_delete_bitmap_count();
1768
0
        total_delete_map_count += delete_bitmap_count;
1769
0
        if (delete_bitmap_count > *max_delete_bitmap_score) {
1770
0
            max_delete_bitmap_score_tablet_id = tablet->tablet_id();
1771
0
            *max_delete_bitmap_score = delete_bitmap_count;
1772
0
        }
1773
0
        buf.emplace_back(std::move(tablet), delete_bitmap_count);
1774
0
        std::sort(buf.begin(), buf.end(), [](auto& a, auto& b) { return a.second > b.second; });
1775
0
        if (buf.size() > n) {
1776
0
            buf.pop_back();
1777
0
        }
1778
0
    };
1779
0
    for_each_tablet(handler, filter_all_tablets);
1780
0
    for (auto& [t, _] : buf) {
1781
0
        t->get_base_rowset_delete_bitmap_count(max_base_rowset_delete_bitmap_score,
1782
0
                                               &max_base_rowset_delete_bitmap_score_tablet_id);
1783
0
    }
1784
0
    std::stringstream ss;
1785
0
    for (auto& i : buf) {
1786
0
        ss << i.first->tablet_id() << ":" << i.second << ",";
1787
0
    }
1788
0
    LOG(INFO) << "get_topn_tablet_delete_bitmap_score, n=" << n
1789
0
              << ",tablet size=" << _tablets_shards.size()
1790
0
              << ",total_delete_map_count=" << total_delete_map_count
1791
0
              << ",cost(us)=" << watch.get_elapse_time_us()
1792
0
              << ",max_delete_bitmap_score=" << *max_delete_bitmap_score
1793
0
              << ",max_delete_bitmap_score_tablet_id=" << max_delete_bitmap_score_tablet_id
1794
0
              << ",max_base_rowset_delete_bitmap_score=" << *max_base_rowset_delete_bitmap_score
1795
0
              << ",max_base_rowset_delete_bitmap_score_tablet_id="
1796
0
              << max_base_rowset_delete_bitmap_score_tablet_id << ",tablets=[" << ss.str() << "]";
1797
0
}
1798
1799
} // end namespace doris