Coverage Report

Created: 2026-03-12 14:13

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
be/src/io/fs/file_handle_cache.h
Line
Count
Source
1
// Licensed to the Apache Software Foundation (ASF) under one
2
// or more contributor license agreements.  See the NOTICE file
3
// distributed with this work for additional information
4
// regarding copyright ownership.  The ASF licenses this file
5
// to you under the Apache License, Version 2.0 (the
6
// "License"); you may not use this file except in compliance
7
// with the License.  You may obtain a copy of the License at
8
//
9
//   http://www.apache.org/licenses/LICENSE-2.0
10
//
11
// Unless required by applicable law or agreed to in writing,
12
// software distributed under the License is distributed on an
13
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14
// KIND, either express or implied.  See the License for the
15
// specific language governing permissions and limitations
16
// under the License.
17
//
18
// This file is copied from
19
// https://github.com/apache/impala/blob/master/be/src/runtime/io/handle-cache.h
20
// and modified by Doris
21
22
#pragma once
23
24
#include <array>
25
#include <list>
26
#include <map>
27
#include <memory>
28
29
#include "common/status.h"
30
#include "io/fs/file_system.h"
31
#include "io/fs/hdfs.h"
32
#include "util/aligned_new.h"
33
#include "util/lru_multi_cache.inline.h"
34
#include "util/thread.h"
35
36
namespace doris::io {
37
38
/// This abstract class is a small wrapper around the hdfsFile handle and the file system
39
/// instance which is needed to close the file handle. The handle incorporates
40
/// the last modified time of the file when it was opened. This is used to distinguish
41
/// between file handles for files that can be updated or overwritten.
42
/// This is used only through its subclasses, CachedHdfsFileHandle and
43
/// ExclusiveHdfsFileHandle.
44
class HdfsFileHandle {
45
public:
46
    /// Destructor will close the file handle
47
    ~HdfsFileHandle();
48
49
    /// Init opens the file handle
50
    Status init(int64_t file_size);
51
52
0
    hdfsFS fs() const { return _fs; }
53
0
    hdfsFile file() const { return _hdfs_file; }
54
0
    int64_t mtime() const { return _mtime; }
55
0
    int64_t file_size() const { return _file_size; }
56
57
protected:
58
    HdfsFileHandle(const hdfsFS& fs, const std::string& fname, int64_t mtime)
59
0
            : _fs(fs), _fname(fname), _mtime(mtime) {}
60
61
private:
62
    hdfsFS _fs;
63
    const std::string _fname;
64
    hdfsFile _hdfs_file = nullptr;
65
    int64_t _mtime;
66
    int64_t _file_size;
67
};
68
69
/// CachedHdfsFileHandles are owned by the file handle cache and are used for no
70
/// other purpose.
71
class CachedHdfsFileHandle : public HdfsFileHandle {
72
public:
73
    CachedHdfsFileHandle(const hdfsFS& fs, const std::string& fname, int64_t mtime);
74
    ~CachedHdfsFileHandle();
75
};
76
77
/// ExclusiveHdfsFileHandles are used for all purposes where a CachedHdfsFileHandle
78
/// is not appropriate.
79
class ExclusiveHdfsFileHandle : public HdfsFileHandle {
80
public:
81
    ExclusiveHdfsFileHandle(const hdfsFS& fs, const std::string& fname, int64_t mtime)
82
0
            : HdfsFileHandle(fs, fname, mtime) {}
83
};
84
85
/// The FileHandleCache is a data structure that owns HdfsFileHandles to share between
86
/// threads. The HdfsFileHandles are hash partitioned across `num_partitions` partitions.
87
/// Each partition operates independently with its own locks, reducing contention
88
/// between concurrent threads. The `capacity` is split between the partitions and is
89
/// enforced independently.
90
///
91
/// Threads check out a file handle for exclusive access, released automatically by RAII
92
/// accessor. If the file handle is not already present in the cache or all file handles
93
/// for this file are checked out, the file handle is emplaced in the cache. The cache can
94
/// contain multiple file handles for the same file. If a file handle is checked out, it
95
/// cannot be evicted from the cache. In this case, a cache can exceed the specified
96
/// capacity.
97
///
98
/// Remote file systems could keep a connection as part of the file handle without support
99
/// for unbuffering. The file handle cache is not suitable for those systems, as the cache
100
/// size can exceed the limit on the number of concurrent connections. HDFS does not
101
/// maintain a connection in the file handle, S3A client supports unbuffering since
102
/// IMPALA-8428, so those do not have this restriction.
103
///
104
/// If there is a file handle in the cache and the underlying file is deleted,
105
/// the file handle might keep the file from being deleted at the OS level. This can
106
/// take up disk space and impact correctness. To avoid this, the cache will evict any
107
/// file handle that has been unused for longer than threshold specified by
108
/// `unused_handle_timeout_secs`. Eviction is disabled when the threshold is 0.
109
///
110
/// TODO: The cache should also evict file handles more aggressively if the file handle's
111
/// mtime is older than the file's current mtime.
112
class FileHandleCache {
113
private:
114
    /// Each partition operates independently, and thus has its own thread-safe cache.
115
    /// To avoid contention on the lock_ due to false sharing the partitions are
116
    /// aligned to cache line boundaries.
117
    struct FileHandleCachePartition : public CacheLineAligned {
118
        // Cache key is a pair of filename and mtime
119
        // Using std::pair to spare boilerplate of hash function
120
        typedef LruMultiCache<std::pair<std::string, int64_t>, CachedHdfsFileHandle> CacheType;
121
        CacheType cache;
122
    };
123
124
public:
125
    /// RAII accessor built over LruMultiCache::Accessor to handle metrics and unbuffering.
126
    /// Composition is used instead of inheritance to support the usage as in/out parameter
127
    class Accessor {
128
    public:
129
        Accessor();
130
        Accessor(FileHandleCachePartition::CacheType::Accessor&& cache_accessor);
131
0
        Accessor(Accessor&&) = default;
132
        Accessor& operator=(Accessor&&) = default;
133
134
        DISALLOW_COPY_AND_ASSIGN(Accessor);
135
136
        /// Handles metrics and unbuffering
137
        ~Accessor();
138
139
        /// Set function can be used if the Accessor is used as in/out parameter.
140
        void set(FileHandleCachePartition::CacheType::Accessor&& cache_accessor);
141
142
        /// Interface mimics LruMultiCache::Accessor's interface, handles metrics
143
        CachedHdfsFileHandle* get();
144
        void release();
145
        void destroy();
146
147
    private:
148
        FileHandleCachePartition::CacheType::Accessor _cache_accessor;
149
    };
150
151
    /// Instantiates the cache with `capacity` split evenly across NUM_PARTITIONS
152
    /// partitions. If the capacity does not split evenly, then the capacity is rounded
153
    /// up. The cache will age out any file handle that is unused for
154
    /// `unused_handle_timeout_secs` seconds. Age out is disabled if this is set to zero.
155
    FileHandleCache(size_t capacity, size_t num_partitions, uint64_t unused_handle_timeout_secs);
156
157
    /// Destructor is only called for backend tests
158
    ~FileHandleCache();
159
160
    /// Starts up a thread that monitors the age of file handles and evicts any that
161
    /// exceed the limit.
162
    Status init() WARN_UNUSED_RESULT;
163
164
    /// Get a file handle accessor from the cache for the specified filename (fname) and
165
    /// last modification time (mtime). This will hash the filename to determine
166
    /// which partition to use for this file handle.
167
    ///
168
    /// If 'require_new_handle' is false and the partition contains an available handle,
169
    /// an accessor is returned and cache_hit is set to true. Otherwise, the partition will
170
    /// emplace file handle, an accessor to it will be returned with cache_hit set to false.
171
    /// On failure, empty accessor will be returned. In either case, the partition may evict
172
    /// a file handle to make room for the new file handle.
173
    ///
174
    /// This obtains exclusive control over the returned file handle.
175
    Status get_file_handle(const hdfsFS& fs, const std::string& fname, int64_t mtime,
176
                           int64_t file_size, bool require_new_handle, Accessor* accessor,
177
                           bool* cache_hit) WARN_UNUSED_RESULT;
178
179
private:
180
    /// Periodic check to evict unused file handles. Only executed by _eviction_thread.
181
    void _evict_handles_loop();
182
183
    std::vector<FileHandleCachePartition> _cache_partitions;
184
185
    /// Maximum time before an unused file handle is aged out of the cache.
186
    /// Aging out is disabled if this is set to 0.
187
    uint64_t _unused_handle_timeout_secs;
188
189
    /// Thread to check for unused file handles to evict. This thread will exit when
190
    /// the _shut_down_promise is set.
191
    std::shared_ptr<Thread> _eviction_thread;
192
    std::atomic<bool> _is_shut_down = {false};
193
};
194
195
} // namespace doris::io