Coverage Report

Created: 2026-05-22 13:29

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
be/src/util/dns_cache.cpp
Line
Count
Source
1
// Licensed to the Apache Software Foundation (ASF) under one
2
// or more contributor license agreements.  See the NOTICE file
3
// distributed with this work for additional information
4
// regarding copyright ownership.  The ASF licenses this file
5
// to you under the Apache License, Version 2.0 (the
6
// "License"); you may not use this file except in compliance
7
// with the License.  You may obtain a copy of the License at
8
//
9
//   http://www.apache.org/licenses/LICENSE-2.0
10
//
11
// Unless required by applicable law or agreed to in writing,
12
// software distributed under the License is distributed on an
13
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14
// KIND, either express or implied.  See the License for the
15
// specific language governing permissions and limitations
16
// under the License.
17
18
#include "util/dns_cache.h"
19
20
#include <algorithm>
21
#include <atomic>
22
#include <unordered_set>
23
24
#include "common/config.h"
25
#include "service/backend_options.h"
26
#include "util/network_util.h"
27
28
namespace doris {
29
30
4
// Default constructor: starts the background thread, which runs
// _refresh_cache() on this instance.
DNSCache::DNSCache() {
    refresh_thread = std::thread([this] { _refresh_cache(); });
}
33
34
5
// Constructs a cache that keeps the supplied `resolver` callable in _resolver.
// Unlike the default constructor, this body does not start the refresh
// thread.
DNSCache::DNSCache(Resolver resolver)
        : _resolver(std::move(resolver)) {
}
35
36
9
// Asks the refresh loop to stop and, when a refresh thread is joinable,
// waits for it to exit.
DNSCache::~DNSCache() {
    stop_refresh = true;
    if (!refresh_thread.joinable()) {
        // No background thread to wait for.
        return;
    }
    refresh_thread.join();
}
42
43
25
// Looks up the IP for `hostname`, resolving it on a cache miss.
//
// @param hostname  host to look up.
// @param ip        out-parameter receiving the IP on success.
// @return OK with `*ip` set on success; any error from _update() is returned
//         unchanged; InternalError if the entry disappears from the cache
//         between the update and the final read.
Status DNSCache::get(const std::string& hostname, std::string* ip) {
    {
        std::shared_lock<std::shared_mutex> lock(mutex);
        auto it = cache.find(hostname);
        if (it != cache.end()) {
            *ip = it->second;
            return Status::OK();
        }
    }
    // Update if not found
    RETURN_IF_ERROR(_update(hostname));

    // Re-read with find() rather than operator[]: operator[] inserts a
    // default-constructed entry when the key is absent, which is a write and
    // must not happen while only a shared (reader) lock is held.  The key can
    // be absent here because _erase() may remove it after _update() has
    // released its lock and before this lookup takes the shared lock.
    std::shared_lock<std::shared_mutex> lock(mutex);
    auto it = cache.find(hostname);
    if (it == cache.end()) {
        return Status::InternalError(
                "Failed to get ip of hostname {}: entry was removed from DNS cache concurrently",
                hostname);
    }
    *ip = it->second;
    return Status::OK();
}
60
61
// Resolve hostname to IP address, similar to Java's DNSCache.resolveHostname.
62
// If resolution fails, falls back to cached IP if available.
63
// Returns the resolved IP, or cached IP on failure, or empty string if no cache available.
64
31
std::string DNSCache::_resolve_hostname(const std::string& hostname) {
65
    // Get cached IP first (if any)
66
31
    std::string cached_ip;
67
31
    {
68
31
        std::shared_lock<std::shared_mutex> lock(mutex);
69
31
        auto it = cache.find(hostname);
70
31
        if (it != cache.end()) {
71
14
            cached_ip = it->second;
72
14
        }
73
31
    }
74
75
    // Try to resolve hostname
76
31
    std::string resolved_ip;
77
31
    Status status = _resolver
78
31
                            ? _resolver(hostname, resolved_ip, BackendOptions::is_bind_ipv6())
79
31
                            : hostname_to_ip(hostname, resolved_ip, BackendOptions::is_bind_ipv6());
80
81
31
    if (!status.ok() || resolved_ip.empty()) {
82
23
        if (!cached_ip.empty()) {
83
            // Only track failure counts for hosts that are currently in the cache.
84
            // Hosts that were never cached or have already been evicted are not
85
            // tracked, which prevents unbounded growth of failure_count.
86
11
            uint32_t failures = 0;
87
11
            {
88
11
                std::unique_lock<std::shared_mutex> lock(mutex);
89
                // Re-check that the host is still cached under the unique_lock:
90
                // it may have been evicted by the refresh thread between our
91
                // earlier shared_lock read of cached_ip and now (hostname_to_ip
92
                // can block for seconds on DNS timeout, widening the window).
93
                // Skipping the bump here preserves keys(failure_count) ⊆ keys(cache).
94
11
                if (cache.find(hostname) != cache.end()) {
95
10
                    failures = ++failure_count[hostname];
96
10
                }
97
11
            }
98
            // Throttle the log: only every N failures or the first failure.
99
11
            if (failures > 0) {
100
10
                int32_t every_n = std::max(1, config::dns_cache_log_every_n_failures);
101
10
                if (failures == 1 || failures % static_cast<uint32_t>(every_n) == 0) {
102
4
                    LOG(WARNING) << "Failed to resolve hostname " << hostname
103
4
                                 << " (consecutive failures: " << failures
104
4
                                 << "), use cached ip: " << cached_ip;
105
4
                }
106
10
            }
107
11
            return cached_ip;
108
12
        } else {
109
            // Throttle to avoid flooding be.WARNING when callers repeatedly
110
            // query an evicted or never-resolvable hostname.  This branch
111
            // deliberately does not maintain a per-hostname counter (that
112
            // would break the keys(failure_count) ⊆ keys(cache) invariant),
113
            // so the throttle is a coarse global rate limit shared across
114
            // all hostnames hitting this code path.
115
12
            static std::atomic<uint64_t> no_cache_warn_counter {0};
116
12
            uint64_t n = no_cache_warn_counter.fetch_add(1, std::memory_order_relaxed) + 1;
117
12
            int32_t every_n = std::max(1, config::dns_cache_log_every_n_failures);
118
12
            if (n == 1 || n % static_cast<uint64_t>(every_n) == 0) {
119
1
                LOG(WARNING) << "Failed to resolve hostname " << hostname
120
1
                             << ", no cached ip available";
121
1
            }
122
12
            return "";
123
12
        }
124
23
    }
125
126
    // Resolution succeeded - clear failure counter for this hostname.
127
8
    {
128
8
        std::unique_lock<std::shared_mutex> lock(mutex);
129
8
        failure_count.erase(hostname);
130
8
    }
131
8
    return resolved_ip;
132
31
}
133
134
4
void DNSCache::_erase(const std::string& hostname) {
135
4
    std::unique_lock<std::shared_mutex> lock(mutex);
136
4
    cache.erase(hostname);
137
4
    failure_count.erase(hostname);
138
4
}
139
140
30
// Resolves `hostname` via _resolve_hostname() and stores the result in the
// cache.
//
// @param hostname      host to refresh.
// @param out_failures  optional; when non-null receives the failure counter
//                      recorded for `hostname` (0 if none, and 0 on error).
// @return InternalError when _resolve_hostname() yields an empty string,
//         otherwise OK.
Status DNSCache::_update(const std::string& hostname, uint32_t* out_failures) {
    const std::string real_ip = _resolve_hostname(hostname);
    if (real_ip.empty()) {
        if (out_failures != nullptr) {
            *out_failures = 0;
        }
        return Status::InternalError("Failed to resolve hostname {} and no cached ip available",
                                     hostname);
    }

    std::unique_lock<std::shared_mutex> write_guard(mutex);
    auto entry = cache.find(hostname);
    const bool changed = entry == cache.end() || entry->second != real_ip;
    if (changed) {
        cache[hostname] = real_ip;
        LOG(INFO) << "update hostname " << hostname << "'s ip to " << real_ip;
    }

    // Report the failure counter while the exclusive lock is still held, so
    // the caller does not have to lock again to decide on eviction.
    if (out_failures != nullptr) {
        auto counter = failure_count.find(hostname);
        if (counter == failure_count.end()) {
            *out_failures = 0;
        } else {
            *out_failures = counter->second;
        }
    }
    return Status::OK();
}
162
163
15
void DNSCache::_refresh_once() {
164
15
    std::unordered_set<std::string> keys;
165
15
    {
166
15
        std::shared_lock<std::shared_mutex> lock(mutex);
167
15
        std::transform(cache.begin(), cache.end(), std::inserter(keys, keys.end()),
168
15
                       [](const auto& pair) { return pair.first; });
169
15
    }
170
15
    for (auto& key : keys) {
171
13
        uint32_t failures = 0;
172
13
        Status st = _update(key, &failures);
173
13
        if (!st.ok()) {
174
            // _update only returns an error when _resolve_hostname returns an
175
            // empty string, which happens only if the hostname has never been
176
            // successfully resolved (no cached IP to fall back to).  Keys in
177
            // the refresh loop come from `cache`, so they all have a prior IP;
178
            // during DNS failure _resolve_hostname returns that cached IP and
179
            // _update returns OK.  This branch is therefore effectively dead
180
            // under normal refresh semantics — the eviction logic below handles
181
            // the long-running failure case instead.
182
0
            LOG(WARNING) << "Failed to update DNS cache for hostname " << key << ": "
183
0
                         << st.to_string();
184
0
        }
185
        // Evict hostnames that have failed to resolve for too long.
186
        // This avoids two pathological symptoms after a backend is dropped
187
        // from the cluster and its DNS record is removed:
188
        //   1) be.WARNING gets flooded with `failed to get ip from host`.
189
        //   2) brpc keeps re-using the stale IP from cache, producing
190
        //      `Fail to wait EPOLLOUT ... Connection timed out`.
191
13
        int32_t threshold = config::dns_cache_max_consecutive_failures;
192
13
        if (threshold > 0 && failures >= static_cast<uint32_t>(threshold)) {
193
3
            LOG(WARNING) << "Evicting hostname " << key << " from DNS cache after " << failures
194
3
                         << " consecutive resolution failures";
195
3
            _erase(key);
196
3
        }
197
13
    }
198
15
}
199
200
4
void DNSCache::_refresh_cache() {
201
8
    while (!stop_refresh) {
202
        // refresh every 1 min
203
4
        std::this_thread::sleep_for(std::chrono::minutes(1));
204
4
        _refresh_once();
205
4
    }
206
4
}
207
208
} // end of namespace doris