be/src/util/timezone_utils.cpp
Line | Count | Source |
1 | | // Licensed to the Apache Software Foundation (ASF) under one |
2 | | // or more contributor license agreements. See the NOTICE file |
3 | | // distributed with this work for additional information |
4 | | // regarding copyright ownership. The ASF licenses this file |
5 | | // to you under the Apache License, Version 2.0 (the |
6 | | // "License"); you may not use this file except in compliance |
7 | | // with the License. You may obtain a copy of the License at |
8 | | // |
9 | | // http://www.apache.org/licenses/LICENSE-2.0 |
10 | | // |
11 | | // Unless required by applicable law or agreed to in writing, |
12 | | // software distributed under the License is distributed on an |
13 | | // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY |
14 | | // KIND, either express or implied. See the License for the |
15 | | // specific language governing permissions and limitations |
16 | | // under the License. |
17 | | |
18 | | #include "util/timezone_utils.h" |
19 | | |
20 | | #include <cctz/civil_time.h> |
21 | | #include <cctz/time_zone.h> |
22 | | #include <fcntl.h> |
23 | | #include <glog/logging.h> |
24 | | #include <re2/re2.h> |
25 | | #include <re2/stringpiece.h> |
26 | | #include <sys/mman.h> |
27 | | #include <sys/stat.h> |
28 | | #include <sys/types.h> |
29 | | #include <unistd.h> |
30 | | |
31 | | #include <algorithm> |
32 | | #include <boost/algorithm/string.hpp> |
33 | | #include <boost/algorithm/string/case_conv.hpp> |
34 | | #include <cctype> |
35 | | #include <chrono> |
36 | | #include <cstdlib> |
37 | | #include <filesystem> |
38 | | #include <memory> |
39 | | #include <string> |
40 | | #include <string_view> |
41 | | |
42 | | #include "common/exception.h" |
43 | | #include "common/logging.h" |
44 | | #include "common/status.h" |
45 | | |
46 | | using boost::algorithm::to_lower_copy; |
47 | | |
48 | | namespace fs = std::filesystem; |
49 | | |
50 | | namespace doris { |
51 | | |
52 | | using ZoneList = std::unordered_map<std::string, cctz::time_zone>; |
53 | | |
54 | | RE2 time_zone_offset_format_reg(R"(^[+-]{1}\d{2}\:\d{2}$)"); // visiting is thread-safe |
55 | | |
56 | | // for ut, make it never nullptr. |
57 | | std::unique_ptr<ZoneList> lower_zone_cache_ = std::make_unique<ZoneList>(); |
58 | | |
59 | | const std::string TimezoneUtils::default_time_zone = "+08:00"; |
60 | | static const char* tzdir = "/usr/share/zoneinfo"; // default value, may change by TZDIR env var |
61 | | |
62 | 6 | void TimezoneUtils::clear_timezone_caches() { |
63 | 6 | lower_zone_cache_->clear(); |
64 | 6 | } |
65 | 2 | size_t TimezoneUtils::cache_size() { |
66 | 2 | return lower_zone_cache_->size(); |
67 | 2 | } |
68 | | |
69 | 148k | static bool parse_save_name_tz(const std::string& tz_name) { |
70 | 148k | cctz::time_zone tz; |
71 | 148k | PROPAGATE_FALSE(cctz::load_time_zone(tz_name, &tz)); |
72 | 146k | lower_zone_cache_->emplace(to_lower_copy(tz_name), tz); |
73 | 146k | return true; |
74 | 148k | } |
75 | | |
76 | 247 | void TimezoneUtils::load_timezones_to_cache() { |
77 | 247 | std::string base_str; |
78 | | // try get from system |
79 | 247 | char* tzdir_env = std::getenv("TZDIR"); |
80 | 247 | if (tzdir_env && *tzdir_env) { |
81 | 0 | tzdir = tzdir_env; |
82 | 0 | } |
83 | | |
84 | 247 | base_str = tzdir; |
85 | 247 | base_str += '/'; |
86 | | |
87 | 247 | const auto root_path = fs::path {base_str}; |
88 | 247 | if (!exists(root_path)) { |
89 | 0 | throw Exception(Status::FatalError("Cannot find system tzfile. Doris exiting!")); |
90 | 0 | } |
91 | | |
92 | 247 | std::set<std::string> ignore_paths = {"posix", "right"}; // duplications. ignore them. |
93 | | |
94 | 153k | for (fs::recursive_directory_iterator it {base_str}; it != end(it); it++) { |
95 | 153k | const auto& dir_entry = *it; |
96 | 153k | try { |
97 | 153k | if (dir_entry.is_regular_file() || |
98 | 153k | (dir_entry.is_symlink() && is_regular_file(read_symlink(dir_entry)))) { |
99 | 148k | auto tz_name = dir_entry.path().string().substr(base_str.length()); |
100 | 148k | if (!parse_save_name_tz(tz_name)) { |
101 | 1.24k | LOG(WARNING) << "Meet illegal tzdata file: " << tz_name << ". skipped"; |
102 | 1.24k | } |
103 | 148k | } else if (dir_entry.is_directory() && |
104 | 5.44k | ignore_paths.contains(dir_entry.path().filename())) { |
105 | 494 | it.disable_recursion_pending(); |
106 | 494 | } |
107 | 153k | } catch (const fs::filesystem_error& e) { |
108 | | // maybe symlink loop or to nowhere... |
109 | 0 | LOG(WARNING) << "filesystem error when loading timezone file from " << dir_entry.path() |
110 | 0 | << ": " << e.what(); |
111 | 0 | } |
112 | 153k | } |
113 | | // some special cases. Z = Zulu. CST = Asia/Shanghai |
114 | 247 | if (auto it = lower_zone_cache_->find("zulu"); it != lower_zone_cache_->end()) { |
115 | 247 | lower_zone_cache_->emplace("z", it->second); |
116 | 247 | } |
117 | 247 | if (auto it = lower_zone_cache_->find("asia/shanghai"); it != lower_zone_cache_->end()) { |
118 | 247 | lower_zone_cache_->emplace("cst", it->second); |
119 | 247 | } |
120 | | |
121 | 247 | lower_zone_cache_->erase("lmt"); // local mean time for every timezone |
122 | | |
123 | 247 | load_offsets_to_cache(); |
124 | 247 | LOG(INFO) << "Preloaded" << lower_zone_cache_->size() << " timezones."; |
125 | 247 | } |
126 | | |
// Formats an hour as text, zero-padding single-digit magnitudes to two digits:
// -9..-1 become "-09".."-01", 0..9 become "00".."09". Any other value is printed as-is
// by std::to_string. No '+' sign is ever added.
static std::string to_hour_string(int arg) {
    const bool single_digit = arg > -10 && arg < 10;
    if (!single_digit) {
        return std::to_string(arg);
    }

    std::string text = arg < 0 ? "-0" : "0";
    text += std::to_string(std::abs(arg));
    return text;
}
135 | | |
136 | 264 | void TimezoneUtils::load_offsets_to_cache() { |
137 | 264 | static constexpr int supported_minutes[] = {0, 30, 45}; |
138 | 7.39k | for (int hour = -12; hour <= +14; hour++) { |
139 | 21.3k | for (int minute : supported_minutes) { |
140 | 21.3k | char min_str[3]; |
141 | 21.3k | snprintf(min_str, sizeof(min_str), "%02d", minute); |
142 | 21.3k | std::string offset_str = (hour >= 0 ? "+" : "") + to_hour_string(hour) + ':' + min_str; |
143 | 21.3k | cctz::time_zone result; |
144 | 21.3k | parse_tz_offset_string(offset_str, result); |
145 | 21.3k | lower_zone_cache_->emplace(offset_str, result); |
146 | 21.3k | } |
147 | 7.12k | } |
148 | | // -00 for hour is also valid |
149 | 264 | std::string offset_str = "-00:00"; |
150 | 264 | cctz::time_zone result; |
151 | 264 | parse_tz_offset_string(offset_str, result); |
152 | 264 | lower_zone_cache_->emplace(offset_str, result); |
153 | 264 | offset_str = "-00:30"; |
154 | 264 | parse_tz_offset_string(offset_str, result); |
155 | 264 | lower_zone_cache_->emplace(offset_str, result); |
156 | 264 | offset_str = "-00:45"; |
157 | 264 | parse_tz_offset_string(offset_str, result); |
158 | 264 | lower_zone_cache_->emplace(offset_str, result); |
159 | 264 | } |
160 | | |
161 | 2.77M | bool TimezoneUtils::find_cctz_time_zone(const std::string& timezone, cctz::time_zone& ctz) { |
162 | 2.77M | if (auto it = lower_zone_cache_->find(to_lower_copy(timezone)); it != lower_zone_cache_->end()) |
163 | 2.72M | [[likely]] { |
164 | 2.72M | ctz = it->second; |
165 | 2.72M | return true; |
166 | 2.72M | } |
167 | | |
168 | 47.3k | std::string normalized; |
169 | 47.3k | if (!normalize_timezone_name(timezone, &normalized)) { |
170 | 315 | return false; |
171 | 315 | } |
172 | 47.0k | if (auto it = lower_zone_cache_->find(to_lower_copy(normalized)); |
173 | 47.0k | it != lower_zone_cache_->end()) [[likely]] { |
174 | 4 | ctz = it->second; |
175 | 4 | return true; |
176 | 4 | } |
177 | 46.9k | return parse_tz_offset_string(normalized, ctz); |
178 | 47.0k | } |
179 | | |
180 | | bool TimezoneUtils::try_get_fixed_offset_seconds(const cctz::time_zone& timezone, |
181 | 192k | int32_t* offset_seconds) { |
182 | 192k | const std::string& timezone_name = timezone.name(); |
183 | 192k | if (timezone_name == "UTC" || timezone_name == "Etc/UTC" || timezone_name == "Etc/GMT") { |
184 | 2.54k | *offset_seconds = 0; |
185 | 2.54k | return true; |
186 | 2.54k | } |
187 | | |
188 | | // cctz names fixed_time_zone() instances with the "Fixed/" prefix. TZDB's Etc/GMT* |
189 | | // zones are fixed offsets too; cctz handles their POSIX-style reversed sign in lookup_offset(). |
190 | | // If this naming convention changes, falling through to the generic path remains correct. |
191 | 189k | static const auto epoch = std::chrono::time_point_cast<cctz::sys_seconds>( |
192 | 189k | std::chrono::system_clock::from_time_t(0)); |
193 | 189k | if (timezone_name.compare(0, 6, "Fixed/") == 0 || timezone_name.compare(0, 7, "Etc/GMT") == 0) { |
194 | 236 | *offset_seconds = timezone.lookup_offset(epoch).offset; |
195 | 236 | return true; |
196 | 236 | } |
197 | 189k | return false; |
198 | 189k | } |
199 | | |
// Rewrites a signed UTC offset into the canonical "+HH:MM" / "-HH:MM" form.
//
// Accepted input is a leading '+' or '-' followed by either
//   - "H:MM" or "HH:MM", or
//   - "H" or "HH" alone, only when `allow_hour_only` is true (minutes default to 00).
// The hour may be at most 14 for '+' and 12 for '-', and minutes must be below 60.
//
// Returns true and stores the canonical text in `*normalized` on success. On failure it
// returns false and leaves `*normalized` unmodified.
static bool normalize_offset_string(const std::string& timezone, bool allow_hour_only,
                                    std::string* normalized) {
    if (timezone.size() < 2) {
        return false;
    }
    const char sign = timezone.front();
    if (sign != '+' && sign != '-') {
        return false;
    }

    // Value of a run of min_len..max_len ASCII digits, or -1 when `text` is not such a run.
    const auto digits_to_int = [](std::string_view text, size_t min_len, size_t max_len) -> int {
        if (text.size() < min_len || text.size() > max_len) {
            return -1;
        }
        int value = 0;
        for (const char c : text) {
            if (!std::isdigit(static_cast<unsigned char>(c))) {
                return -1;
            }
            value = value * 10 + (c - '0');
        }
        return value;
    };

    const std::string_view body = std::string_view(timezone).substr(1);
    const size_t colon = body.find(':');

    int hour = 0;
    int minute = 0;
    if (colon == std::string_view::npos) {
        // Hour-only spelling, e.g. "+8".
        if (!allow_hour_only) {
            return false;
        }
        hour = digits_to_int(body, 1, 2);
    } else {
        hour = digits_to_int(body.substr(0, colon), 1, 2);
        minute = digits_to_int(body.substr(colon + 1), 2, 2);
    }
    if (hour < 0 || minute < 0) {
        return false;
    }

    const int max_hour = (sign == '+') ? 14 : 12;
    if (hour > max_hour || minute >= 60) {
        return false;
    }

    std::string canonical(1, sign);
    if (hour < 10) {
        canonical += '0';
    }
    canonical += std::to_string(hour);
    canonical += ':';
    if (minute < 10) {
        canonical += '0';
    }
    canonical += std::to_string(minute);

    *normalized = canonical;
    return true;
}
247 | | |
248 | 120k | bool TimezoneUtils::normalize_timezone_name(const std::string& timezone, std::string* normalized) { |
249 | 120k | const std::string lower = to_lower_copy(timezone); |
250 | 120k | if (lower == "utc" || lower == "etc/utc" || lower == "zulu") { |
251 | 285 | *normalized = "UTC"; |
252 | 285 | return true; |
253 | 285 | } |
254 | | |
255 | 119k | if (lower.rfind("utc", 0) == 0 || lower.rfind("gmt", 0) == 0) { |
256 | 6 | if (timezone.size() <= 3) { |
257 | 0 | return false; |
258 | 0 | } |
259 | 6 | return normalize_offset_string(timezone.substr(3), true, normalized); |
260 | 6 | } |
261 | | |
262 | 119k | if (!timezone.empty() && (timezone[0] == '+' || timezone[0] == '-')) { |
263 | 119k | return normalize_offset_string(timezone, false, normalized); |
264 | 119k | } |
265 | | |
266 | 310 | return false; |
267 | 119k | } |
268 | | |
269 | 70.9k | bool TimezoneUtils::parse_tz_offset_string(const std::string& timezone, cctz::time_zone& ctz) { |
270 | 70.9k | std::string normalized; |
271 | 70.9k | if (!normalize_timezone_name(timezone, &normalized)) { |
272 | 5 | return false; |
273 | 5 | } |
274 | 70.9k | if (normalized == "UTC") { |
275 | 143 | ctz = cctz::utc_time_zone(); |
276 | 143 | return true; |
277 | 143 | } |
278 | | |
279 | 70.8k | re2::StringPiece value; |
280 | 70.8k | if (time_zone_offset_format_reg.Match(normalized, 0, normalized.size(), RE2::UNANCHORED, &value, |
281 | 70.8k | 1)) [[likely]] { |
282 | 70.8k | const bool positive = value[0] != '-'; |
283 | 70.8k | const int hour = std::stoi(value.substr(1, 2).as_string()); |
284 | 70.8k | const int minute = std::stoi(value.substr(4, 2).as_string()); |
285 | 70.8k | int offset = hour * 60 * 60 + minute * 60; |
286 | 70.8k | offset *= positive ? 1 : -1; |
287 | 70.8k | ctz = cctz::fixed_time_zone(cctz::seconds(offset)); |
288 | 70.8k | return true; |
289 | 70.8k | } |
290 | 0 | return false; |
291 | 70.8k | } |
292 | | |
293 | | } // namespace doris |