Coverage Report

Created: 2026-06-10 12:23

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
be/src/cloud/cloud_internal_service.cpp
Line
Count
Source
1
// Licensed to the Apache Software Foundation (ASF) under one
2
// or more contributor license agreements.  See the NOTICE file
3
// distributed with this work for additional information
4
// regarding copyright ownership.  The ASF licenses this file
5
// to you under the Apache License, Version 2.0 (the
6
// "License"); you may not use this file except in compliance
7
// with the License.  You may obtain a copy of the License at
8
//
9
//   http://www.apache.org/licenses/LICENSE-2.0
10
//
11
// Unless required by applicable law or agreed to in writing,
12
// software distributed under the License is distributed on an
13
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14
// KIND, either express or implied.  See the License for the
15
// specific language governing permissions and limitations
16
// under the License.
17
18
#include "cloud/cloud_internal_service.h"
19
20
#include <bthread/countdown_event.h>
21
22
#include <algorithm>
23
#include <chrono>
24
#include <optional>
25
#include <thread>
26
27
#include "cloud/cloud_storage_engine.h"
28
#include "cloud/cloud_tablet.h"
29
#include "cloud/cloud_tablet_mgr.h"
30
#include "cloud/cloud_warm_up_manager.h"
31
#include "cloud/config.h"
32
#include "io/cache/block_file_cache.h"
33
#include "io/cache/block_file_cache_downloader.h"
34
#include "io/cache/block_file_cache_factory.h"
35
#include "runtime/thread_context.h"
36
#include "runtime/workload_management/io_throttle.h"
37
#include "util/async_io.h"
38
#include "util/debug_points.h"
39
40
namespace doris {
41
#include "common/compile_check_avoid_begin.h"
42
43
// ---- Metrics for serving cached file data to peer BEs (see fetch_peer_data) ----
// Requests accepted by the work pool, blocks read for peers, and outcome counters.
bvar::Adder<uint64_t> g_file_cache_get_by_peer_num("file_cache_get_by_peer_num");
bvar::Adder<uint64_t> g_file_cache_get_by_peer_blocks_num("file_cache_get_by_peer_blocks_num");
bvar::Adder<uint64_t> g_file_cache_get_by_peer_success_num("file_cache_get_by_peer_success_num");
bvar::Adder<uint64_t> g_file_cache_get_by_peer_failed_num("file_cache_get_by_peer_failed_num");

// Latencies below are fed with microsecond deltas taken from std::chrono::steady_clock.
bvar::LatencyRecorder g_file_cache_get_by_peer_server_latency(
        "file_cache_get_by_peer_server_latency");
bvar::LatencyRecorder g_file_cache_get_by_peer_read_cache_file_latency(
        "file_cache_get_by_peer_read_cache_file_latency");
bvar::LatencyRecorder g_cloud_internal_service_get_file_cache_meta_by_tablet_id_latency(
        "cloud_internal_service_get_file_cache_meta_by_tablet_id_latency");

// ---- Metrics for the sync_tablet_meta RPC ----
// One "requests" tick per handled RPC; the other three are per-tablet outcome totals.
bvar::Adder<int64_t> g_cloud_sync_tablet_meta_requests_total(
        "cloud_sync_tablet_meta_requests_total");
bvar::Adder<int64_t> g_cloud_sync_tablet_meta_synced_total("cloud_sync_tablet_meta_synced_total");
bvar::Adder<int64_t> g_cloud_sync_tablet_meta_skipped_total("cloud_sync_tablet_meta_skipped_total");
bvar::Adder<int64_t> g_cloud_sync_tablet_meta_failed_total("cloud_sync_tablet_meta_failed_total");
58
59
// Builds the cloud flavour of the internal service: `exec_env` is handed to the
// PInternalService base, and `engine` is kept by reference in `_engine`.
CloudInternalServiceImpl::CloudInternalServiceImpl(CloudStorageEngine& engine, ExecEnv* exec_env)
        : PInternalService(exec_env), _engine(engine) {}

// Nothing to release explicitly; the compiler-generated destructor is used.
CloudInternalServiceImpl::~CloudInternalServiceImpl() = default;
63
64
void CloudInternalServiceImpl::sync_tablet_meta(google::protobuf::RpcController* controller,
65
                                                const PSyncTabletMetaRequest* request,
66
                                                PSyncTabletMetaResponse* response,
67
19
                                                google::protobuf::Closure* done) {
68
19
    auto start_time = std::chrono::steady_clock::now();
69
19
    bool ret = _light_work_pool.try_offer([this, request, response, done, start_time]() {
70
18
        brpc::ClosureGuard closure_guard(done);
71
18
        LOG(INFO) << "begin to sync tablet meta, request=" << request->ShortDebugString();
72
18
        int64_t synced = 0;
73
18
        int64_t skipped = 0;
74
18
        int64_t failed = 0;
75
18
        g_cloud_sync_tablet_meta_requests_total << 1;
76
34
        for (const auto tablet_id : request->tablet_ids()) {
77
34
            auto tablet = _engine.tablet_mgr().get_tablet_if_cached(tablet_id);
78
34
            if (!tablet) {
79
25
                ++skipped;
80
25
                continue;
81
25
            }
82
9
            auto st = tablet->sync_meta();
83
9
            if (!st.ok()) {
84
1
                ++failed;
85
1
                LOG(WARNING) << "failed to sync tablet meta from cloud meta service, tablet="
86
1
                             << tablet_id << ", err=" << st;
87
1
                continue;
88
1
            }
89
8
            ++synced;
90
8
        }
91
18
        g_cloud_sync_tablet_meta_synced_total << synced;
92
18
        g_cloud_sync_tablet_meta_skipped_total << skipped;
93
18
        g_cloud_sync_tablet_meta_failed_total << failed;
94
18
        response->set_synced_tablets(synced);
95
18
        response->set_skipped_tablets(skipped);
96
18
        response->set_failed_tablets(failed);
97
18
        Status::OK().to_protobuf(response->mutable_status());
98
18
        auto cost_ms = std::chrono::duration_cast<std::chrono::milliseconds>(
99
18
                               std::chrono::steady_clock::now() - start_time)
100
18
                               .count();
101
18
        LOG(INFO) << "finish to sync tablet meta, request=" << request->ShortDebugString()
102
18
                  << ", response=" << response->ShortDebugString() << ", cost_ms=" << cost_ms;
103
18
    });
104
19
    if (!ret) {
105
1
        brpc::ClosureGuard closure_guard(done);
106
1
        Status::InternalError("failed to offer sync_tablet_meta request to work pool")
107
1
                .to_protobuf(response->mutable_status());
108
1
        auto cost_ms = std::chrono::duration_cast<std::chrono::milliseconds>(
109
1
                               std::chrono::steady_clock::now() - start_time)
110
1
                               .count();
111
1
        LOG(WARNING) << "failed to offer sync_tablet_meta request to work pool, request="
112
1
                     << request->ShortDebugString() << ", response=" << response->ShortDebugString()
113
1
                     << ", cost_ms=" << cost_ms;
114
1
    }
115
19
}
116
117
void CloudInternalServiceImpl::alter_vault_sync(google::protobuf::RpcController* controller,
118
                                                const doris::PAlterVaultSyncRequest* request,
119
                                                PAlterVaultSyncResponse* response,
120
0
                                                google::protobuf::Closure* done) {
121
0
    LOG(INFO) << "alter be to sync vault info from Meta Service";
122
    // If the vaults containing hdfs vault then it would try to create hdfs connection using jni
123
    // which would acuiqre one thread local jniEnv. But bthread context can't guarantee that the brpc
124
    // worker thread wouldn't do bthread switch between worker threads.
125
0
    bool ret = _heavy_work_pool.try_offer([this, done]() {
126
0
        brpc::ClosureGuard closure_guard(done);
127
0
        _engine.sync_storage_vault();
128
0
    });
129
0
    if (!ret) {
130
0
        brpc::ClosureGuard closure_guard(done);
131
0
        LOG(WARNING) << "fail to offer alter_vault_sync request to the work pool, pool="
132
0
                     << _heavy_work_pool.get_info();
133
0
    }
134
0
}
135
136
0
// Maps the in-memory io::FileCacheType onto its protobuf counterpart.
// TTL, INDEX and NORMAL map one-to-one; any other value trips DCHECK(false) and, where that
// check is compiled out, falls back to NORMAL.
FileCacheType cache_type_to_pb(io::FileCacheType type) {
    if (type == io::FileCacheType::TTL) {
        return FileCacheType::TTL;
    }
    if (type == io::FileCacheType::INDEX) {
        return FileCacheType::INDEX;
    }
    if (type == io::FileCacheType::NORMAL) {
        return FileCacheType::NORMAL;
    }
    DCHECK(false);
    return FileCacheType::NORMAL;
}
149
150
0
// Returns the current wall-clock time (std::chrono::system_clock) as microseconds since the
// clock's epoch.
static int64_t current_unix_time_us() {
    using std::chrono::microseconds;
    const auto since_epoch = std::chrono::system_clock::now().time_since_epoch();
    return std::chrono::duration_cast<microseconds>(since_epoch).count();
}
155
156
// Computes `end_unix_ts_us - start_unix_ts_us`, or std::nullopt when the pair is unusable.
//
// The start timestamp comes from the caller BE: mixed-version callers may omit it (so it is
// not positive), and system clocks on different BEs are not guaranteed to be ordered (so the
// end may precede the start). Both cases yield std::nullopt rather than a bogus latency.
static std::optional<int64_t> warm_up_rowset_cross_host_latency_us(int64_t start_unix_ts_us,
                                                                   int64_t end_unix_ts_us) {
    const bool has_start = start_unix_ts_us > 0;
    const bool is_ordered = end_unix_ts_us >= start_unix_ts_us;
    if (has_start && is_ordered) {
        return end_unix_ts_us - start_unix_ts_us;
    }
    return std::nullopt;
}
165
166
static void add_file_cache_block_meta_to_response(
167
        PGetFileCacheMetaResponse* resp, int64_t tablet_id, const std::string& rowset_id,
168
        int32_t segment_id, const std::string& file_name,
169
        const std::tuple<int64_t, int64_t, io::FileCacheType, int64_t>& tuple,
170
0
        const RowsetSharedPtr& rowset, bool is_index) {
171
0
    FileCacheBlockMeta* meta = resp->add_file_cache_block_metas();
172
0
    meta->set_tablet_id(tablet_id);
173
0
    meta->set_rowset_id(rowset_id);
174
0
    meta->set_segment_id(segment_id);
175
0
    meta->set_file_name(file_name);
176
177
0
    if (!is_index) {
178
        // .dat
179
0
        meta->set_file_size(rowset->rowset_meta()->segment_file_size(segment_id));
180
0
        meta->set_file_type(doris::FileType::SEGMENT_FILE);
181
0
    } else {
182
        // .idx
183
0
        const auto& idx_file_info = rowset->rowset_meta()->inverted_index_file_info(segment_id);
184
0
        meta->set_file_size(idx_file_info.has_index_size() ? idx_file_info.index_size() : -1);
185
0
        meta->set_file_type(doris::FileType::INVERTED_INDEX_FILE);
186
0
    }
187
188
0
    meta->set_offset(std::get<0>(tuple));
189
0
    meta->set_size(std::get<1>(tuple));
190
0
    meta->set_cache_type(cache_type_to_pb(std::get<2>(tuple)));
191
0
    meta->set_expiration_time(std::get<3>(tuple));
192
0
}
193
194
static void process_segment_file_cache_meta(PGetFileCacheMetaResponse* resp,
195
                                            const RowsetSharedPtr& rowset, int64_t tablet_id,
196
                                            const std::string& rowset_id, int32_t segment_id,
197
0
                                            bool is_index) {
198
0
    const char* extension = is_index ? ".idx" : ".dat";
199
0
    std::string file_name = fmt::format("{}_{}{}", rowset_id, segment_id, extension);
200
0
    auto cache_key = io::BlockFileCache::hash(file_name);
201
0
    auto* cache = io::FileCacheFactory::instance()->get_by_path(cache_key);
202
0
    if (!cache) return;
203
0
    auto segments_meta = cache->get_hot_blocks_meta(cache_key);
204
0
    for (const auto& tuple : segments_meta) {
205
0
        add_file_cache_block_meta_to_response(resp, tablet_id, rowset_id, segment_id, file_name,
206
0
                                              tuple, rowset, is_index);
207
0
    }
208
0
}
209
210
void CloudInternalServiceImpl::get_file_cache_meta_by_tablet_id(
211
        google::protobuf::RpcController* controller [[maybe_unused]],
212
        const PGetFileCacheMetaRequest* request, PGetFileCacheMetaResponse* response,
213
0
        google::protobuf::Closure* done) {
214
0
    brpc::ClosureGuard closure_guard(done);
215
0
    if (!config::enable_file_cache) {
216
0
        LOG_WARNING("try to access tablet file cache meta, but file cache not enabled");
217
0
        return;
218
0
    }
219
0
    auto begin_ts = std::chrono::duration_cast<std::chrono::microseconds>(
220
0
                            std::chrono::steady_clock::now().time_since_epoch())
221
0
                            .count();
222
0
    std::ostringstream tablet_ids_stream;
223
0
    int count = 0;
224
0
    for (const auto& tablet_id : request->tablet_ids()) {
225
0
        tablet_ids_stream << tablet_id << ", ";
226
0
        count++;
227
0
        if (count >= 10) {
228
0
            break;
229
0
        }
230
0
    }
231
0
    LOG(INFO) << "warm up get meta from this be, tablets num=" << request->tablet_ids().size()
232
0
              << ", first 10 tablet_ids=[ " << tablet_ids_stream.str() << " ]";
233
0
    for (const auto& tablet_id : request->tablet_ids()) {
234
0
        auto res = _engine.tablet_mgr().get_tablet(tablet_id);
235
0
        if (!res.has_value()) {
236
0
            LOG(ERROR) << "failed to get tablet: " << tablet_id
237
0
                       << " err msg: " << res.error().msg();
238
0
            continue;
239
0
        }
240
0
        CloudTabletSPtr tablet = std::move(res.value());
241
0
        auto st = tablet->sync_rowsets();
242
0
        if (!st) {
243
            // just log failed, try it best
244
0
            LOG(WARNING) << "failed to sync rowsets: " << tablet_id
245
0
                         << " err msg: " << st.to_string();
246
0
        }
247
0
        auto rowsets = tablet->get_snapshot_rowset();
248
249
0
        for (const RowsetSharedPtr& rowset : rowsets) {
250
0
            std::string rowset_id = rowset->rowset_id().to_string();
251
0
            for (int32_t segment_id = 0; segment_id < rowset->num_segments(); ++segment_id) {
252
0
                process_segment_file_cache_meta(response, rowset, tablet_id, rowset_id, segment_id,
253
0
                                                false);
254
0
                process_segment_file_cache_meta(response, rowset, tablet_id, rowset_id, segment_id,
255
0
                                                true);
256
0
            }
257
0
        }
258
0
    }
259
0
    auto end_ts = std::chrono::duration_cast<std::chrono::microseconds>(
260
0
                          std::chrono::steady_clock::now().time_since_epoch())
261
0
                          .count();
262
0
    g_cloud_internal_service_get_file_cache_meta_by_tablet_id_latency << (end_ts - begin_ts);
263
0
    LOG(INFO) << "get file cache meta by tablet ids = [ " << tablet_ids_stream.str() << " ] took "
264
0
              << end_ts - begin_ts << " us";
265
0
    VLOG_DEBUG << "get file cache meta by tablet id request=" << request->DebugString()
266
0
               << ", response=" << response->DebugString();
267
0
}
268
269
namespace {
270
// Helper functions for fetch_peer_data
271
272
0
// Serves a PEER_FILE_RANGE request: every cache data entry the file cache factory returns for
// `path` is moved into `response->datas`. Always returns OK; an empty result simply leaves
// `datas` empty.
Status handle_peer_file_range_request(const std::string& path, PFetchPeerDataResponse* response) {
    auto cached_blocks = io::FileCacheFactory::instance()->get_cache_data_by_path(path);
    for (auto& cached_block : cached_blocks) {
        auto* slot = response->add_datas();
        *slot = std::move(cached_block);
    }
    return Status::OK();
}
280
281
0
// Marks `response` as failed: appends `error_msg` to the status messages and sets the status
// code to INTERNAL_ERROR. Messages accumulate if this is called more than once.
void set_error_response(PFetchPeerDataResponse* response, const std::string& error_msg) {
    auto* status_pb = response->mutable_status();
    status_pb->add_error_msgs(error_msg);
    status_pb->set_status_code(TStatusCode::INTERNAL_ERROR);
}
285
286
// Reads one cached block from the start of the block and stores it in `output`.
//
// @param file_block  cached block to read
// @param file_size   size of the whole file; used to cap the read at the end of the file
// @param output      filled with block offset, bytes read and data on success only
// @return OK on success; InternalError when `file_size` lies before the block offset; otherwise
//         the status returned by FileBlock::read (also counted in the failed metric).
Status read_file_block(const std::shared_ptr<io::FileBlock>& file_block, size_t file_size,
                       doris::CacheBlockPB* output) {
    std::string buffer;
    // ATTN: the block meta may overstate the block's right boundary, so the read length is
    // capped by the bytes remaining in the file.
    // see CachedRemoteFileReader::read_at_impl for more details.
    // Guard first so that `file_size - offset` below cannot underflow.
    if (file_size < file_block->offset()) {
        LOG(WARNING) << "file_size (" << file_size << ") < file_block->offset("
                     << file_block->offset() << ")";
        return Status::InternalError<false>("file_size less than block offset");
    }
    const size_t bytes_left_in_file = static_cast<size_t>(file_size - file_block->offset());
    const size_t read_size = std::min(bytes_left_in_file, file_block->range().size());
    buffer.resize(read_size);

    // Steady-clock reading in microseconds, used only for the read latency metric.
    auto steady_now_us = []() {
        return std::chrono::duration_cast<std::chrono::microseconds>(
                       std::chrono::steady_clock::now().time_since_epoch())
                .count();
    };
    const auto begin_read_file_ts = steady_now_us();

    SCOPED_ATTACH_TASK(ExecEnv::GetInstance()->s3_file_buffer_tracker());
    Slice slice(buffer.data(), buffer.size());
    Status read_st = file_block->read(slice, /*read_offset=*/0);

    const auto end_read_file_ts = steady_now_us();
    g_file_cache_get_by_peer_read_cache_file_latency << (end_read_file_ts - begin_read_file_ts);

    if (!read_st.ok()) {
        g_file_cache_get_by_peer_failed_num << 1;
        LOG(WARNING) << "read cache block failed: " << read_st;
        return read_st;
    }

    output->set_block_offset(static_cast<int64_t>(file_block->offset()));
    output->set_block_size(static_cast<int64_t>(read_size));
    output->set_data(std::move(buffer));
    return Status::OK();
}
325
326
// Serves a PEER_FILE_CACHE_BLOCK request: for every (offset, size) entry in
// `request->cache_req()`, looks the range up in the file cache for `request->path()` and
// appends each downloaded block's bytes to `response->datas`.
//
// @return OK when every requested block was read. On the first failure (no cache instance,
//         a block that is not in DOWNLOADED state, or a read error) the failure is written to
//         `response` via set_error_response and a non-OK status is returned; blocks appended
//         before the failure stay in the response.
//
// Requests whose size is not positive are skipped with a warning instead of being passed to
// the cache.
Status handle_peer_file_cache_block_request(const PFetchPeerDataRequest* request,
                                            PFetchPeerDataResponse* response) {
    const auto& path = request->path();
    auto hash = io::BlockFileCache::hash(path);
    auto* cache = io::FileCacheFactory::instance()->get_by_path(hash);
    if (cache == nullptr) {
        g_file_cache_get_by_peer_failed_num << 1;
        set_error_response(response, "can't get file cache instance");
        return Status::InternalError<false>("can't get file cache instance");
    }

    io::CacheContext ctx {};
    io::ReadStatistics local_stats;
    ctx.stats = &local_stats;

    for (const auto& cb_req : request->cache_req()) {
        // The wire fields are signed; negative values are clamped to 0 before the cast.
        size_t offset = static_cast<size_t>(std::max<int64_t>(0, cb_req.block_offset()));
        size_t size = static_cast<size_t>(std::max<int64_t>(0, cb_req.block_size()));
        if (size == 0) {
            // Nothing can be read for an empty range, and a zero length must not reach the
            // cache lookup. NOTE(review): get_or_set presumably builds an inclusive range
            // [offset, offset + size - 1], which would wrap for size == 0 -- confirm.
            LOG(WARNING) << "skip peer cache block request with non-positive size, path=" << path
                         << ", block_offset=" << cb_req.block_offset()
                         << ", block_size=" << cb_req.block_size();
            continue;
        }
        auto holder = cache->get_or_set(hash, offset, size, ctx);

        for (auto& fb : holder.file_blocks) {
            // Only fully downloaded blocks can be served to a peer.
            if (fb->state() != io::FileBlock::State::DOWNLOADED) {
                g_file_cache_get_by_peer_failed_num << 1;
                LOG(WARNING) << "read cache block failed, state=" << fb->state();
                set_error_response(response, "read cache file error");
                return Status::InternalError<false>("cache block not downloaded");
            }

            g_file_cache_get_by_peer_blocks_num << 1;
            doris::CacheBlockPB* out = response->add_datas();
            Status read_status = read_file_block(fb, request->file_size(), out);
            if (!read_status.ok()) {
                set_error_response(response, "read cache file error");
                return read_status;
            }
        }
    }

    return Status::OK();
}
366
} // namespace
367
368
void CloudInternalServiceImpl::fetch_peer_data(google::protobuf::RpcController* controller
369
                                               [[maybe_unused]],
370
                                               const PFetchPeerDataRequest* request,
371
                                               PFetchPeerDataResponse* response,
372
0
                                               google::protobuf::Closure* done) {
373
0
    bool ret = _heavy_work_pool.try_offer([request, response, done]() {
374
0
        brpc::ClosureGuard closure_guard(done);
375
0
        g_file_cache_get_by_peer_num << 1;
376
377
0
        if (!config::enable_file_cache) {
378
0
            LOG_WARNING("try to access file cache data, but file cache not enabled");
379
0
            return;
380
0
        }
381
382
0
        auto begin_ts = std::chrono::duration_cast<std::chrono::microseconds>(
383
0
                                std::chrono::steady_clock::now().time_since_epoch())
384
0
                                .count();
385
386
0
        const auto type = request->type();
387
0
        const auto& path = request->path();
388
0
        response->mutable_status()->set_status_code(TStatusCode::OK);
389
390
0
        Status status = Status::OK();
391
0
        if (type == PFetchPeerDataRequest_Type_PEER_FILE_RANGE) {
392
0
            status = handle_peer_file_range_request(path, response);
393
0
        } else if (type == PFetchPeerDataRequest_Type_PEER_FILE_CACHE_BLOCK) {
394
0
            status = handle_peer_file_cache_block_request(request, response);
395
0
        }
396
397
0
        if (!status.ok()) {
398
0
            LOG(WARNING) << "fetch peer data failed: " << status.to_string();
399
0
            set_error_response(response, status.to_string());
400
0
        }
401
402
0
        DBUG_EXECUTE_IF("CloudInternalServiceImpl::fetch_peer_data_slower", {
403
0
            int st_us = dp->param<int>("sleep", 1000);
404
0
            LOG_WARNING("CloudInternalServiceImpl::fetch_peer_data_slower").tag("sleep", st_us);
405
0
            bthread_usleep(st_us);
406
0
        });
407
408
0
        auto end_ts = std::chrono::duration_cast<std::chrono::microseconds>(
409
0
                              std::chrono::steady_clock::now().time_since_epoch())
410
0
                              .count();
411
0
        g_file_cache_get_by_peer_server_latency << (end_ts - begin_ts);
412
0
        g_file_cache_get_by_peer_success_num << 1;
413
414
0
        VLOG_DEBUG << "fetch cache request=" << request->DebugString()
415
0
                   << ", response=" << response->DebugString();
416
0
    });
417
418
0
    if (!ret) {
419
0
        brpc::ClosureGuard closure_guard(done);
420
0
        LOG(WARNING) << "fail to offer fetch peer data request to the work pool, pool="
421
0
                     << _heavy_work_pool.get_info();
422
0
    }
423
0
}
424
425
// ---- Event-driven warm up: segment data files (count and byte size per outcome) ----
bvar::Adder<uint64_t> g_file_cache_event_driven_warm_up_submitted_segment_num(
        "file_cache_event_driven_warm_up_submitted_segment_num");
bvar::Adder<uint64_t> g_file_cache_event_driven_warm_up_finished_segment_num(
        "file_cache_event_driven_warm_up_finished_segment_num");
bvar::Adder<uint64_t> g_file_cache_event_driven_warm_up_failed_segment_num(
        "file_cache_event_driven_warm_up_failed_segment_num");
bvar::Adder<uint64_t> g_file_cache_event_driven_warm_up_submitted_segment_size(
        "file_cache_event_driven_warm_up_submitted_segment_size");
bvar::Adder<uint64_t> g_file_cache_event_driven_warm_up_finished_segment_size(
        "file_cache_event_driven_warm_up_finished_segment_size");
bvar::Adder<uint64_t> g_file_cache_event_driven_warm_up_failed_segment_size(
        "file_cache_event_driven_warm_up_failed_segment_size");

// ---- Event-driven warm up: inverted index files (count and byte size per outcome) ----
bvar::Adder<uint64_t> g_file_cache_event_driven_warm_up_submitted_index_num(
        "file_cache_event_driven_warm_up_submitted_index_num");
bvar::Adder<uint64_t> g_file_cache_event_driven_warm_up_finished_index_num(
        "file_cache_event_driven_warm_up_finished_index_num");
bvar::Adder<uint64_t> g_file_cache_event_driven_warm_up_failed_index_num(
        "file_cache_event_driven_warm_up_failed_index_num");
bvar::Adder<uint64_t> g_file_cache_event_driven_warm_up_submitted_index_size(
        "file_cache_event_driven_warm_up_submitted_index_size");
bvar::Adder<uint64_t> g_file_cache_event_driven_warm_up_finished_index_size(
        "file_cache_event_driven_warm_up_finished_index_size");
bvar::Adder<uint64_t> g_file_cache_event_driven_warm_up_failed_index_size(
        "file_cache_event_driven_warm_up_failed_index_size");

// ---- warm_up_rowset: last-seen wall-clock timestamps, both initialised to 0 ----
bvar::Status<int64_t> g_file_cache_warm_up_rowset_last_handle_unix_ts(
        "file_cache_warm_up_rowset_last_handle_unix_ts", 0);
bvar::Status<int64_t> g_file_cache_warm_up_rowset_last_finish_unix_ts(
        "file_cache_warm_up_rowset_last_finish_unix_ts", 0);

// ---- warm_up_rowset: latency recorders and their slow-path counters ----
bvar::LatencyRecorder g_file_cache_warm_up_rowset_latency("file_cache_warm_up_rowset_latency");
bvar::LatencyRecorder g_file_cache_warm_up_rowset_request_to_handle_latency(
        "file_cache_warm_up_rowset_request_to_handle_latency");
bvar::LatencyRecorder g_file_cache_warm_up_rowset_handle_to_finish_latency(
        "file_cache_warm_up_rowset_handle_to_finish_latency");
bvar::Adder<uint64_t> g_file_cache_warm_up_rowset_slow_count(
        "file_cache_warm_up_rowset_slow_count");
bvar::Adder<uint64_t> g_file_cache_warm_up_rowset_request_to_handle_slow_count(
        "file_cache_warm_up_rowset_request_to_handle_slow_count");
bvar::Adder<uint64_t> g_file_cache_warm_up_rowset_handle_to_finish_slow_count(
        "file_cache_warm_up_rowset_handle_to_finish_slow_count");

// ---- warm_up_rowset: synchronous waits requested through sync_wait_timeout_ms ----
bvar::Adder<uint64_t> g_file_cache_warm_up_rowset_wait_for_compaction_num(
        "file_cache_warm_up_rowset_wait_for_compaction_num");
bvar::Adder<uint64_t> g_file_cache_warm_up_rowset_wait_for_compaction_timeout_num(
        "file_cache_warm_up_rowset_wait_for_compaction_timeout_num");
468
469
void handle_segment_download_done(Status st, int64_t tablet_id, const RowsetId& rowset_id,
470
                                  int64_t segment_id, std::shared_ptr<CloudTablet> tablet,
471
                                  std::shared_ptr<bthread::CountdownEvent> wait, Version version,
472
0
                                  int64_t segment_size, int64_t request_ts, int64_t handle_ts) {
473
0
    DBUG_EXECUTE_IF("CloudInternalServiceImpl::warm_up_rowset.download_segment", {
474
0
        auto sleep_time = dp->param<int>("sleep", 3);
475
0
        LOG_INFO("[verbose] block download for rowset={}, version={}, sleep={}",
476
0
                 rowset_id.to_string(), version.to_string(), sleep_time);
477
0
        std::this_thread::sleep_for(std::chrono::seconds(sleep_time));
478
0
    });
479
0
    DBUG_EXECUTE_IF(
480
0
            "CloudInternalServiceImpl::warm_up_rowset.download_segment.inject_"
481
0
            "error",
482
0
            {
483
0
                st = Status::InternalError("injected error");
484
0
                LOG_INFO("[verbose] inject error, tablet={}, rowset={}, st={}", tablet_id,
485
0
                         rowset_id.to_string(), st.to_string());
486
0
            });
487
0
    if (st.ok()) {
488
0
        g_file_cache_event_driven_warm_up_finished_segment_num << 1;
489
0
        g_file_cache_event_driven_warm_up_finished_segment_size << segment_size;
490
0
        int64_t now_ts = current_unix_time_us();
491
0
        g_file_cache_warm_up_rowset_last_finish_unix_ts.set_value(now_ts);
492
0
        auto rowset_latency_us = warm_up_rowset_cross_host_latency_us(request_ts, now_ts);
493
0
        if (rowset_latency_us.has_value()) {
494
0
            g_file_cache_warm_up_rowset_latency << *rowset_latency_us;
495
0
        }
496
0
        g_file_cache_warm_up_rowset_handle_to_finish_latency << (now_ts - handle_ts);
497
0
        if (rowset_latency_us.has_value() &&
498
0
            *rowset_latency_us > config::warm_up_rowset_slow_log_ms * 1000) {
499
0
            g_file_cache_warm_up_rowset_slow_count << 1;
500
0
            LOG(INFO) << "warm up rowset took " << *rowset_latency_us
501
0
                      << " us, tablet_id: " << tablet_id << ", rowset_id: " << rowset_id.to_string()
502
0
                      << ", segment_id: " << segment_id;
503
0
        }
504
0
        if (now_ts - handle_ts > config::warm_up_rowset_slow_log_ms * 1000) {
505
0
            g_file_cache_warm_up_rowset_handle_to_finish_slow_count << 1;
506
0
            LOG(INFO) << "warm up rowset (handle to finish) took " << now_ts - handle_ts
507
0
                      << " us, tablet_id: " << tablet_id << ", rowset_id: " << rowset_id.to_string()
508
0
                      << ", segment_id: " << segment_id;
509
0
        }
510
0
    } else {
511
0
        g_file_cache_event_driven_warm_up_failed_segment_num << 1;
512
0
        g_file_cache_event_driven_warm_up_failed_segment_size << segment_size;
513
0
        LOG(WARNING) << "download segment failed, tablet_id: " << tablet_id
514
0
                     << " rowset_id: " << rowset_id.to_string() << ", error: " << st;
515
0
    }
516
0
    if (tablet->complete_rowset_segment_warmup(WarmUpTriggerSource::EVENT_DRIVEN, rowset_id, st, 1,
517
0
                                               0)
518
0
                .trigger_source == WarmUpTriggerSource::EVENT_DRIVEN) {
519
0
        VLOG_DEBUG << "warmup rowset " << version.to_string() << "(" << rowset_id.to_string()
520
0
                   << ") completed";
521
0
    }
522
0
    if (wait) {
523
0
        wait->signal();
524
0
    }
525
0
}
526
527
void handle_inverted_index_download_done(Status st, int64_t tablet_id, const RowsetId& rowset_id,
528
                                         int64_t segment_id, std::string index_path,
529
                                         std::shared_ptr<CloudTablet> tablet,
530
                                         std::shared_ptr<bthread::CountdownEvent> wait,
531
                                         Version version, uint64_t idx_size, int64_t request_ts,
532
0
                                         int64_t handle_ts) {
533
0
    DBUG_EXECUTE_IF("CloudInternalServiceImpl::warm_up_rowset.download_inverted_idx", {
534
0
        auto sleep_time = dp->param<int>("sleep", 3);
535
0
        LOG_INFO(
536
0
                "[verbose] block download for rowset={}, inverted index "
537
0
                "file={}, sleep={}",
538
0
                rowset_id.to_string(), index_path, sleep_time);
539
0
        std::this_thread::sleep_for(std::chrono::seconds(sleep_time));
540
0
    });
541
0
    if (st.ok()) {
542
0
        g_file_cache_event_driven_warm_up_finished_index_num << 1;
543
0
        g_file_cache_event_driven_warm_up_finished_index_size << idx_size;
544
0
        int64_t now_ts = current_unix_time_us();
545
0
        g_file_cache_warm_up_rowset_last_finish_unix_ts.set_value(now_ts);
546
0
        auto rowset_latency_us = warm_up_rowset_cross_host_latency_us(request_ts, now_ts);
547
0
        if (rowset_latency_us.has_value()) {
548
0
            g_file_cache_warm_up_rowset_latency << *rowset_latency_us;
549
0
        }
550
0
        g_file_cache_warm_up_rowset_handle_to_finish_latency << (now_ts - handle_ts);
551
0
        if (rowset_latency_us.has_value() &&
552
0
            *rowset_latency_us > config::warm_up_rowset_slow_log_ms * 1000) {
553
0
            g_file_cache_warm_up_rowset_slow_count << 1;
554
0
            LOG(INFO) << "warm up rowset took " << *rowset_latency_us
555
0
                      << " us, tablet_id: " << tablet_id << ", rowset_id: " << rowset_id.to_string()
556
0
                      << ", segment_id: " << segment_id;
557
0
        }
558
0
        if (now_ts - handle_ts > config::warm_up_rowset_slow_log_ms * 1000) {
559
0
            g_file_cache_warm_up_rowset_handle_to_finish_slow_count << 1;
560
0
            LOG(INFO) << "warm up rowset (handle to finish) took " << now_ts - handle_ts
561
0
                      << " us, tablet_id: " << tablet_id << ", rowset_id: " << rowset_id.to_string()
562
0
                      << ", segment_id: " << segment_id;
563
0
        }
564
0
    } else {
565
0
        g_file_cache_event_driven_warm_up_failed_index_num << 1;
566
0
        g_file_cache_event_driven_warm_up_failed_index_size << idx_size;
567
0
        LOG(WARNING) << "download inverted index failed, tablet_id: " << tablet_id
568
0
                     << " rowset_id: " << rowset_id << ", error: " << st;
569
0
    }
570
0
    if (tablet->complete_rowset_segment_warmup(WarmUpTriggerSource::EVENT_DRIVEN, rowset_id, st, 0,
571
0
                                               1)
572
0
                .trigger_source == WarmUpTriggerSource::EVENT_DRIVEN) {
573
0
        VLOG_DEBUG << "warmup rowset " << version.to_string() << "(" << rowset_id.to_string()
574
0
                   << ") completed";
575
0
    }
576
0
    if (wait) {
577
0
        wait->signal();
578
0
    }
579
0
}
580
581
void CloudInternalServiceImpl::warm_up_rowset(google::protobuf::RpcController* controller
582
                                              [[maybe_unused]],
583
                                              const PWarmUpRowsetRequest* request,
584
                                              PWarmUpRowsetResponse* response,
585
0
                                              google::protobuf::Closure* done) {
586
0
    brpc::ClosureGuard closure_guard(done);
587
0
    std::shared_ptr<bthread::CountdownEvent> wait = nullptr;
588
0
    timespec due_time;
589
0
    if (request->has_sync_wait_timeout_ms() && request->sync_wait_timeout_ms() > 0) {
590
0
        g_file_cache_warm_up_rowset_wait_for_compaction_num << 1;
591
0
        wait = std::make_shared<bthread::CountdownEvent>(0);
592
0
        VLOG_DEBUG << "sync_wait_timeout: " << request->sync_wait_timeout_ms() << " ms";
593
0
        due_time = butil::milliseconds_from_now(request->sync_wait_timeout_ms());
594
0
    }
595
596
0
    for (auto& rs_meta_pb : request->rowset_metas()) {
597
0
        RowsetMeta rs_meta;
598
0
        rs_meta.init_from_pb(rs_meta_pb);
599
0
        auto storage_resource = rs_meta.remote_storage_resource();
600
0
        if (!storage_resource) {
601
0
            LOG(WARNING) << storage_resource.error();
602
0
            continue;
603
0
        }
604
0
        int64_t tablet_id = rs_meta.tablet_id();
605
0
        auto rowset_id = rs_meta.rowset_id();
606
0
        bool local_only = !(request->has_skip_existence_check() && request->skip_existence_check());
607
0
        auto res = _engine.tablet_mgr().get_tablet(tablet_id, /* warmup_data = */ false,
608
0
                                                   /* sync_delete_bitmap = */ true,
609
0
                                                   /* sync_stats = */ nullptr,
610
0
                                                   /* local_only = */ local_only);
611
0
        if (!res.has_value()) {
612
0
            LOG_WARNING("Warm up error ").tag("tablet_id", tablet_id).error(res.error());
613
0
            if (res.error().msg().find("local_only=true") != std::string::npos ||
614
0
                res.error().msg().find("force_use_only_cached=true") != std::string::npos) {
615
0
                res.error().set_code(ErrorCode::TABLE_NOT_FOUND);
616
0
            }
617
0
            res.error().to_protobuf(response->mutable_status());
618
0
            continue;
619
0
        }
620
0
        auto tablet = res.value();
621
0
        auto tablet_meta = tablet->tablet_meta();
622
623
0
        int64_t handle_ts = current_unix_time_us();
624
0
        g_file_cache_warm_up_rowset_last_handle_unix_ts.set_value(handle_ts);
625
0
        int64_t request_ts = request->has_unix_ts_us() ? request->unix_ts_us() : 0;
626
0
        auto request_to_handle_latency_us =
627
0
                warm_up_rowset_cross_host_latency_us(request_ts, handle_ts);
628
0
        if (request_to_handle_latency_us.has_value()) {
629
0
            g_file_cache_warm_up_rowset_request_to_handle_latency << *request_to_handle_latency_us;
630
0
        }
631
0
        if (request_to_handle_latency_us.has_value() &&
632
0
            *request_to_handle_latency_us > config::warm_up_rowset_slow_log_ms * 1000) {
633
0
            g_file_cache_warm_up_rowset_request_to_handle_slow_count << 1;
634
0
            LOG(INFO) << "warm up rowset (request to handle) took " << *request_to_handle_latency_us
635
0
                      << " us, tablet_id: " << rs_meta.tablet_id()
636
0
                      << ", rowset_id: " << rowset_id.to_string();
637
0
        }
638
0
        int64_t expiration_time = tablet_meta->ttl_seconds();
639
640
0
        if (!tablet->add_rowset_warmup_state(rs_meta, WarmUpTriggerSource::EVENT_DRIVEN)) {
641
0
            LOG(INFO) << "found duplicate warmup task for rowset " << rowset_id.to_string()
642
0
                      << ", skip it";
643
0
            continue;
644
0
        }
645
646
0
        for (int64_t segment_id = 0; segment_id < rs_meta.num_segments(); segment_id++) {
647
0
            if (!config::file_cache_enable_only_warm_up_idx) {
648
0
                auto segment_size = rs_meta.segment_file_size(segment_id);
649
650
                // Use rs_meta.fs() instead of storage_resource.value()->fs to support packed files.
651
                // PackedFileSystem wrapper in rs_meta.fs() handles the index_map lookup and
652
                // reads from the correct packed file.
653
0
                io::DownloadFileMeta download_meta {
654
0
                        .path = storage_resource.value()->remote_segment_path(rs_meta, segment_id),
655
0
                        .file_size = segment_size,
656
0
                        .offset = 0,
657
0
                        .download_size = segment_size,
658
0
                        .file_system = rs_meta.fs(),
659
0
                        .ctx = {.is_index_data = false,
660
0
                                .expiration_time = expiration_time,
661
0
                                .is_dryrun = config::enable_reader_dryrun_when_download_file_cache,
662
0
                                .is_warmup = true},
663
0
                        .download_done =
664
0
                                [=, version = rs_meta.version()](Status st) {
665
0
                                    handle_segment_download_done(
666
0
                                            st, tablet_id, rowset_id, segment_id, tablet, wait,
667
0
                                            version, segment_size, request_ts, handle_ts);
668
0
                                },
669
0
                        .tablet_id = tablet_id};
670
671
0
                g_file_cache_event_driven_warm_up_submitted_segment_num << 1;
672
0
                g_file_cache_event_driven_warm_up_submitted_segment_size << segment_size;
673
0
                if (wait) {
674
0
                    wait->add_count();
675
0
                }
676
677
0
                _engine.file_cache_block_downloader().submit_download_task(download_meta);
678
0
            }
679
680
            // Use rs_meta.fs() to support packed files for inverted index download.
681
0
            auto download_inverted_index = [&, tablet](std::string index_path, uint64_t idx_size) {
682
0
                io::DownloadFileMeta download_meta {
683
0
                        .path = io::Path(index_path),
684
0
                        .file_size = static_cast<int64_t>(idx_size),
685
0
                        .file_system = rs_meta.fs(),
686
0
                        .ctx = {.is_index_data = false, // DORIS-20877
687
0
                                .expiration_time = expiration_time,
688
0
                                .is_dryrun = config::enable_reader_dryrun_when_download_file_cache,
689
0
                                .is_warmup = true},
690
0
                        .download_done =
691
0
                                [=, version = rs_meta.version()](Status st) {
692
0
                                    handle_inverted_index_download_done(
693
0
                                            st, tablet_id, rowset_id, segment_id, index_path,
694
0
                                            tablet, wait, version, idx_size, request_ts, handle_ts);
695
0
                                },
696
0
                        .tablet_id = tablet_id};
697
0
                g_file_cache_event_driven_warm_up_submitted_index_num << 1;
698
0
                g_file_cache_event_driven_warm_up_submitted_index_size << idx_size;
699
0
                tablet->update_rowset_warmup_state_inverted_idx_num(
700
0
                        WarmUpTriggerSource::EVENT_DRIVEN, rowset_id, 1);
701
0
                if (wait) {
702
0
                    wait->add_count();
703
0
                }
704
0
                _engine.file_cache_block_downloader().submit_download_task(download_meta);
705
0
            };
706
707
            // inverted index
708
0
            auto schema_ptr = rs_meta.tablet_schema();
709
0
            auto idx_version = schema_ptr->get_inverted_index_storage_format();
710
711
0
            if (schema_ptr->has_inverted_index() || schema_ptr->has_ann_index()) {
712
0
                if (idx_version == InvertedIndexStorageFormatPB::V1) {
713
0
                    auto&& inverted_index_info = rs_meta.inverted_index_file_info(segment_id);
714
0
                    std::unordered_map<int64_t, int64_t> index_size_map;
715
0
                    for (const auto& info : inverted_index_info.index_info()) {
716
0
                        if (info.index_file_size() != -1) {
717
0
                            index_size_map[info.index_id()] = info.index_file_size();
718
0
                        } else {
719
0
                            VLOG_DEBUG << "Invalid index_file_size for segment_id " << segment_id
720
0
                                       << ", index_id " << info.index_id();
721
0
                        }
722
0
                    }
723
0
                    for (const auto& index : schema_ptr->inverted_indexes()) {
724
0
                        auto idx_path = storage_resource.value()->remote_idx_v1_path(
725
0
                                rs_meta, segment_id, index->index_id(), index->get_index_suffix());
726
0
                        download_inverted_index(idx_path, index_size_map[index->index_id()]);
727
0
                    }
728
0
                } else { // InvertedIndexStorageFormatPB::V2
729
0
                    auto&& inverted_index_info = rs_meta.inverted_index_file_info(segment_id);
730
0
                    int64_t idx_size = 0;
731
0
                    if (inverted_index_info.has_index_size()) {
732
0
                        idx_size = inverted_index_info.index_size();
733
0
                    } else {
734
0
                        VLOG_DEBUG << "index_size is not set for segment " << segment_id;
735
0
                    }
736
0
                    auto idx_path =
737
0
                            storage_resource.value()->remote_idx_v2_path(rs_meta, segment_id);
738
0
                    download_inverted_index(idx_path, idx_size);
739
0
                }
740
0
            }
741
0
        }
742
0
    }
743
0
    if (wait && wait->timed_wait(due_time)) {
744
0
        g_file_cache_warm_up_rowset_wait_for_compaction_timeout_num << 1;
745
0
        LOG_WARNING("the time spent warming up {} rowsets exceeded {} ms",
746
0
                    request->rowset_metas().size(), request->sync_wait_timeout_ms());
747
0
    }
748
0
}
749
750
// Counter bumped by recycle_cache() once per segment cache key for which an asynchronous
// cache removal was requested; constructed with the metric name
// "file_cache_recycle_cache_finished_segment_num".
bvar::Adder<uint64_t> g_file_cache_recycle_cache_finished_segment_num(
751
        "file_cache_recycle_cache_finished_segment_num");
752
// Counter bumped by recycle_cache() once per inverted index file name for which an
// asynchronous cache removal was requested; constructed with the metric name
// "file_cache_recycle_cache_finished_index_num".
bvar::Adder<uint64_t> g_file_cache_recycle_cache_finished_index_num(
753
        "file_cache_recycle_cache_finished_index_num");
754
755
void CloudInternalServiceImpl::recycle_cache(google::protobuf::RpcController* controller
756
                                             [[maybe_unused]],
757
                                             const PRecycleCacheRequest* request,
758
                                             PRecycleCacheResponse* response,
759
0
                                             google::protobuf::Closure* done) {
760
0
    brpc::ClosureGuard closure_guard(done);
761
762
0
    if (!config::enable_file_cache) {
763
0
        return;
764
0
    }
765
0
    for (const auto& meta : request->cache_metas()) {
766
0
        for (int64_t segment_id = 0; segment_id < meta.num_segments(); segment_id++) {
767
0
            auto file_key = Segment::file_cache_key(meta.rowset_id(), segment_id);
768
0
            auto* file_cache = io::FileCacheFactory::instance()->get_by_path(file_key);
769
0
            file_cache->remove_if_cached_async(file_key);
770
0
            g_file_cache_recycle_cache_finished_segment_num << 1;
771
0
        }
772
773
        // inverted index
774
0
        for (const auto& file_name : meta.index_file_names()) {
775
0
            auto file_key = io::BlockFileCache::hash(file_name);
776
0
            auto* file_cache = io::FileCacheFactory::instance()->get_by_path(file_key);
777
0
            file_cache->remove_if_cached_async(file_key);
778
0
            g_file_cache_recycle_cache_finished_index_num << 1;
779
0
        }
780
0
    }
781
0
}
782
783
#include "common/compile_check_avoid_end.h"
784
} // namespace doris