be/src/util/dns_cache.cpp
Line | Count | Source |
1 | | // Licensed to the Apache Software Foundation (ASF) under one |
2 | | // or more contributor license agreements. See the NOTICE file |
3 | | // distributed with this work for additional information |
4 | | // regarding copyright ownership. The ASF licenses this file |
5 | | // to you under the Apache License, Version 2.0 (the |
6 | | // "License"); you may not use this file except in compliance |
7 | | // with the License. You may obtain a copy of the License at |
8 | | // |
9 | | // http://www.apache.org/licenses/LICENSE-2.0 |
10 | | // |
11 | | // Unless required by applicable law or agreed to in writing, |
12 | | // software distributed under the License is distributed on an |
13 | | // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY |
14 | | // KIND, either express or implied. See the License for the |
15 | | // specific language governing permissions and limitations |
16 | | // under the License. |
17 | | |
18 | | #include "util/dns_cache.h" |
19 | | |
20 | | #include <algorithm> |
21 | | #include <atomic> |
22 | | #include <unordered_set> |
23 | | |
24 | | #include "common/config.h" |
25 | | #include "service/backend_options.h" |
26 | | #include "util/network_util.h" |
27 | | |
28 | | namespace doris { |
29 | | |
30 | 4 | DNSCache::DNSCache() { |
31 | 4 | refresh_thread = std::thread(&DNSCache::_refresh_cache, this); |
32 | 4 | } |
33 | | |
34 | 5 | DNSCache::DNSCache(Resolver resolver) : _resolver(std::move(resolver)) {} |
35 | | |
36 | 9 | DNSCache::~DNSCache() { |
37 | 9 | stop_refresh = true; |
38 | 9 | if (refresh_thread.joinable()) { |
39 | 4 | refresh_thread.join(); |
40 | 4 | } |
41 | 9 | } |
42 | | |
43 | 25 | Status DNSCache::get(const std::string& hostname, std::string* ip) { |
44 | 25 | { |
45 | 25 | std::shared_lock<std::shared_mutex> lock(mutex); |
46 | 25 | auto it = cache.find(hostname); |
47 | 25 | if (it != cache.end()) { |
48 | 8 | *ip = it->second; |
49 | 8 | return Status::OK(); |
50 | 8 | } |
51 | 25 | } |
52 | | // Update if not found |
53 | 17 | RETURN_IF_ERROR(_update(hostname)); |
54 | 5 | { |
55 | 5 | std::shared_lock<std::shared_mutex> lock(mutex); |
56 | 5 | *ip = cache[hostname]; |
57 | 5 | return Status::OK(); |
58 | 17 | } |
59 | 17 | } |
60 | | |
61 | | // Resolve hostname to IP address, similar to Java's DNSCache.resolveHostname. |
62 | | // If resolution fails, falls back to cached IP if available. |
63 | | // Returns the resolved IP, or cached IP on failure, or empty string if no cache available. |
64 | 31 | std::string DNSCache::_resolve_hostname(const std::string& hostname) { |
65 | | // Get cached IP first (if any) |
66 | 31 | std::string cached_ip; |
67 | 31 | { |
68 | 31 | std::shared_lock<std::shared_mutex> lock(mutex); |
69 | 31 | auto it = cache.find(hostname); |
70 | 31 | if (it != cache.end()) { |
71 | 14 | cached_ip = it->second; |
72 | 14 | } |
73 | 31 | } |
74 | | |
75 | | // Try to resolve hostname |
76 | 31 | std::string resolved_ip; |
77 | 31 | Status status = _resolver |
78 | 31 | ? _resolver(hostname, resolved_ip, BackendOptions::is_bind_ipv6()) |
79 | 31 | : hostname_to_ip(hostname, resolved_ip, BackendOptions::is_bind_ipv6()); |
80 | | |
81 | 31 | if (!status.ok() || resolved_ip.empty()) { |
82 | 23 | if (!cached_ip.empty()) { |
83 | | // Only track failure counts for hosts that are currently in the cache. |
84 | | // Hosts that were never cached or have already been evicted are not |
85 | | // tracked, which prevents unbounded growth of failure_count. |
86 | 11 | uint32_t failures = 0; |
87 | 11 | { |
88 | 11 | std::unique_lock<std::shared_mutex> lock(mutex); |
89 | | // Re-check that the host is still cached under the unique_lock: |
90 | | // it may have been evicted by the refresh thread between our |
91 | | // earlier shared_lock read of cached_ip and now (hostname_to_ip |
92 | | // can block for seconds on DNS timeout, widening the window). |
93 | | // Skipping the bump here preserves keys(failure_count) ⊆ keys(cache). |
94 | 11 | if (cache.find(hostname) != cache.end()) { |
95 | 10 | failures = ++failure_count[hostname]; |
96 | 10 | } |
97 | 11 | } |
98 | | // Throttle the log: only every N failures or the first failure. |
99 | 11 | if (failures > 0) { |
100 | 10 | int32_t every_n = std::max(1, config::dns_cache_log_every_n_failures); |
101 | 10 | if (failures == 1 || failures % static_cast<uint32_t>(every_n) == 0) { |
102 | 4 | LOG(WARNING) << "Failed to resolve hostname " << hostname |
103 | 4 | << " (consecutive failures: " << failures |
104 | 4 | << "), use cached ip: " << cached_ip; |
105 | 4 | } |
106 | 10 | } |
107 | 11 | return cached_ip; |
108 | 12 | } else { |
109 | | // Throttle to avoid flooding be.WARNING when callers repeatedly |
110 | | // query an evicted or never-resolvable hostname. This branch |
111 | | // deliberately does not maintain a per-hostname counter (that |
112 | | // would break the keys(failure_count) ⊆ keys(cache) invariant), |
113 | | // so the throttle is a coarse global rate limit shared across |
114 | | // all hostnames hitting this code path. |
115 | 12 | static std::atomic<uint64_t> no_cache_warn_counter {0}; |
116 | 12 | uint64_t n = no_cache_warn_counter.fetch_add(1, std::memory_order_relaxed) + 1; |
117 | 12 | int32_t every_n = std::max(1, config::dns_cache_log_every_n_failures); |
118 | 12 | if (n == 1 || n % static_cast<uint64_t>(every_n) == 0) { |
119 | 1 | LOG(WARNING) << "Failed to resolve hostname " << hostname |
120 | 1 | << ", no cached ip available"; |
121 | 1 | } |
122 | 12 | return ""; |
123 | 12 | } |
124 | 23 | } |
125 | | |
126 | | // Resolution succeeded - clear failure counter for this hostname. |
127 | 8 | { |
128 | 8 | std::unique_lock<std::shared_mutex> lock(mutex); |
129 | 8 | failure_count.erase(hostname); |
130 | 8 | } |
131 | 8 | return resolved_ip; |
132 | 31 | } |
133 | | |
134 | 4 | void DNSCache::_erase(const std::string& hostname) { |
135 | 4 | std::unique_lock<std::shared_mutex> lock(mutex); |
136 | 4 | cache.erase(hostname); |
137 | 4 | failure_count.erase(hostname); |
138 | 4 | } |
139 | | |
140 | 30 | Status DNSCache::_update(const std::string& hostname, uint32_t* out_failures) { |
141 | 30 | std::string real_ip = _resolve_hostname(hostname); |
142 | 30 | if (real_ip.empty()) { |
143 | 12 | if (out_failures) *out_failures = 0; |
144 | 12 | return Status::InternalError("Failed to resolve hostname {} and no cached ip available", |
145 | 12 | hostname); |
146 | 12 | } |
147 | | |
148 | 18 | std::unique_lock<std::shared_mutex> lock(mutex); |
149 | 18 | auto it = cache.find(hostname); |
150 | 18 | if (it == cache.end() || it->second != real_ip) { |
151 | 5 | cache[hostname] = real_ip; |
152 | 5 | LOG(INFO) << "update hostname " << hostname << "'s ip to " << real_ip; |
153 | 5 | } |
154 | | // Read failure_count under the same lock we already hold, so _refresh_once |
155 | | // does not need a second lock acquisition to decide on eviction. |
156 | 18 | if (out_failures) { |
157 | 13 | auto fc_it = failure_count.find(hostname); |
158 | 13 | *out_failures = fc_it != failure_count.end() ? fc_it->second : 0; |
159 | 13 | } |
160 | 18 | return Status::OK(); |
161 | 30 | } |
162 | | |
163 | 15 | void DNSCache::_refresh_once() { |
164 | 15 | std::unordered_set<std::string> keys; |
165 | 15 | { |
166 | 15 | std::shared_lock<std::shared_mutex> lock(mutex); |
167 | 15 | std::transform(cache.begin(), cache.end(), std::inserter(keys, keys.end()), |
168 | 15 | [](const auto& pair) { return pair.first; }); |
169 | 15 | } |
170 | 15 | for (auto& key : keys) { |
171 | 13 | uint32_t failures = 0; |
172 | 13 | Status st = _update(key, &failures); |
173 | 13 | if (!st.ok()) { |
174 | | // _update only returns an error when _resolve_hostname returns an |
175 | | // empty string, which happens only if the hostname has never been |
176 | | // successfully resolved (no cached IP to fall back to). Keys in |
177 | | // the refresh loop come from `cache`, so they all have a prior IP; |
178 | | // during DNS failure _resolve_hostname returns that cached IP and |
179 | | // _update returns OK. This branch is therefore effectively dead |
180 | | // under normal refresh semantics — the eviction logic below handles |
181 | | // the long-running failure case instead. |
182 | 0 | LOG(WARNING) << "Failed to update DNS cache for hostname " << key << ": " |
183 | 0 | << st.to_string(); |
184 | 0 | } |
185 | | // Evict hostnames that have failed to resolve for too long. |
186 | | // This avoids two pathological symptoms after a backend is dropped |
187 | | // from the cluster and its DNS record is removed: |
188 | | // 1) be.WARNING gets flooded with `failed to get ip from host`. |
189 | | // 2) brpc keeps re-using the stale IP from cache, producing |
190 | | // `Fail to wait EPOLLOUT ... Connection timed out`. |
191 | 13 | int32_t threshold = config::dns_cache_max_consecutive_failures; |
192 | 13 | if (threshold > 0 && failures >= static_cast<uint32_t>(threshold)) { |
193 | 3 | LOG(WARNING) << "Evicting hostname " << key << " from DNS cache after " << failures |
194 | 3 | << " consecutive resolution failures"; |
195 | 3 | _erase(key); |
196 | 3 | } |
197 | 13 | } |
198 | 15 | } |
199 | | |
200 | 4 | void DNSCache::_refresh_cache() { |
201 | 8 | while (!stop_refresh) { |
202 | | // refresh every 1 min |
203 | 4 | std::this_thread::sleep_for(std::chrono::minutes(1)); |
204 | 4 | _refresh_once(); |
205 | 4 | } |
206 | 4 | } |
207 | | |
208 | | } // end of namespace doris |