be/src/util/cgroup_util.cpp
Line | Count | Source |
1 | | // Licensed to the Apache Software Foundation (ASF) under one |
2 | | // or more contributor license agreements. See the NOTICE file |
3 | | // distributed with this work for additional information |
4 | | // regarding copyright ownership. The ASF licenses this file |
5 | | // to you under the Apache License, Version 2.0 (the |
6 | | // "License"); you may not use this file except in compliance |
7 | | // with the License. You may obtain a copy of the License at |
8 | | // |
9 | | // http://www.apache.org/licenses/LICENSE-2.0 |
10 | | // |
11 | | // Unless required by applicable law or agreed to in writing, |
12 | | // software distributed under the License is distributed on an |
13 | | // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY |
14 | | // KIND, either express or implied. See the License for the |
15 | | // specific language governing permissions and limitations |
16 | | // under the License. |
17 | | |
18 | | #include "util/cgroup_util.h" |
19 | | |
20 | | #include <absl/strings/escaping.h> |
21 | | #include <absl/strings/str_split.h> |
22 | | |
23 | | #include <algorithm> |
24 | | #include <boost/algorithm/string.hpp> |
25 | | #include <fstream> |
26 | | #include <utility> |
27 | | #include <vector> |
28 | | |
29 | | #include "io/fs/local_file_system.h" |
30 | | #include "util/error_util.h" |
31 | | #include "util/string_parser.hpp" |
32 | | |
33 | | using std::pair; |
34 | | |
35 | | namespace doris { |
36 | | |
37 | 215k | bool CGroupUtil::cgroupsv1_enable() { |
38 | 215k | bool exists = true; |
39 | 215k | Status st = io::global_local_filesystem()->exists("/proc/cgroups", &exists); |
40 | 215k | return st.ok() && exists; |
41 | 215k | } |
42 | | |
43 | 427k | bool CGroupUtil::cgroupsv2_enable() { |
44 | 427k | #if defined(OS_LINUX) |
45 | | // This file exists iff the host has cgroups v2 enabled. |
46 | 427k | auto controllers_file = default_cgroups_mount / "cgroup.controllers"; |
47 | 427k | bool exists = true; |
48 | 427k | Status st = io::global_local_filesystem()->exists(controllers_file, &exists); |
49 | 427k | return st.ok() && exists; |
50 | | #else |
51 | | return false; |
52 | | #endif |
53 | 427k | } |
54 | | |
55 | 214k | Status CGroupUtil::find_global_cgroupv1(const std::string& subsystem, std::string* path) { |
56 | 214k | std::ifstream proc_cgroups("/proc/self/cgroup", std::ios::in); |
57 | 214k | std::string line; |
58 | 2.00M | while (true) { |
59 | 2.00M | if (proc_cgroups.fail()) { |
60 | 0 | return Status::CgroupError("Error reading /proc/self/cgroup: {}", get_str_err_msg()); |
61 | 2.00M | } else if (proc_cgroups.peek() == std::ifstream::traits_type::eof()) { |
62 | 0 | return Status::CgroupError("Could not find subsystem {} in /proc/self/cgroup", |
63 | 0 | subsystem); |
64 | 0 | } |
65 | | // The line format looks like this: |
66 | | // 4:memory:/user.slice |
67 | | // 9:cpu,cpuacct:/user.slice |
68 | | // so field size will be 3 |
69 | 2.00M | getline(proc_cgroups, line); |
70 | 2.00M | if (!proc_cgroups.good()) { |
71 | 0 | continue; |
72 | 0 | } |
73 | 2.00M | std::vector<std::string> fields = absl::StrSplit(line, ":"); |
74 | | // ":" in the path does not appear to be escaped - bail in the unusual case that |
75 | | // we get too many tokens. |
76 | 2.00M | if (fields.size() != 3) { |
77 | 0 | return Status::InvalidArgument( |
78 | 0 | "Could not parse line from /proc/self/cgroup - had {} > 3 tokens: '{}'", |
79 | 0 | fields.size(), line); |
80 | 0 | } |
81 | 2.00M | std::vector<std::string> subsystems = absl::StrSplit(fields[1], ","); |
82 | 2.00M | auto it = std::find(subsystems.begin(), subsystems.end(), subsystem); |
83 | 2.00M | if (it != subsystems.end()) { |
84 | 214k | *path = std::move(fields[2]); |
85 | 214k | return Status::OK(); |
86 | 214k | } |
87 | 2.00M | } |
88 | 214k | } |
89 | | |
90 | 429k | static Status unescape_path(const std::string& escaped, std::string* unescaped) { |
91 | 429k | std::string err; |
92 | 429k | if (!absl::CUnescape(escaped, unescaped, &err)) { |
93 | 0 | return Status::InvalidArgument("Could not unescape path '{}': {}", escaped, err); |
94 | 0 | } |
95 | 429k | return Status::OK(); |
96 | 429k | } |
97 | | |
98 | | Status CGroupUtil::find_cgroupv1_mounts(const std::string& subsystem, |
99 | 214k | pair<std::string, std::string>* result) { |
100 | 214k | std::ifstream mountinfo("/proc/self/mountinfo", std::ios::in); |
101 | 214k | std::string line; |
102 | 3.58M | while (true) { |
103 | 3.58M | if (mountinfo.fail() || mountinfo.bad()) { |
104 | 0 | return Status::CgroupError("Error reading /proc/self/mountinfo: {}", get_str_err_msg()); |
105 | 3.58M | } else if (mountinfo.eof()) { |
106 | 0 | return Status::CgroupError("Could not find subsystem {} in /proc/self/mountinfo", |
107 | 0 | subsystem); |
108 | 0 | } |
109 | | // The relevant lines look like below (see proc manpage for full documentation). The |
110 | | // first example is running outside of a container, the second example is running |
111 | | // inside a docker container. Field 3 is the path relative to the root CGroup on |
112 | | // the host and Field 4 is the mount point from this process's point of view. |
113 | | // 34 29 0:28 / /sys/fs/cgroup/memory rw,nosuid,nodev,noexec,relatime shared:15 - |
114 | | // cgroup cgroup rw,memory |
115 | | // 275 271 0:28 /docker/f23eee6f88c2ba99fcce /sys/fs/cgroup/memory |
116 | | // ro,nosuid,nodev,noexec,relatime master:15 - cgroup cgroup rw,memory |
117 | 3.58M | getline(mountinfo, line); |
118 | 3.58M | if (!mountinfo.good()) { |
119 | 0 | continue; |
120 | 0 | } |
121 | 3.58M | std::vector<std::string> fields = absl::StrSplit(line, " ", absl::SkipWhitespace()); |
122 | 3.58M | if (fields.size() < 7) { |
123 | 0 | return Status::InvalidArgument( |
124 | 0 | "Could not parse line from /proc/self/mountinfo - had {} > 7 tokens: '{}'", |
125 | 0 | fields.size(), line); |
126 | 0 | } |
127 | 3.58M | if (fields[fields.size() - 3] != "cgroup") { |
128 | 2.79M | continue; |
129 | 2.79M | } |
130 | | // This is a cgroup mount. Check if it's the mount we're looking for. |
131 | 792k | std::vector<std::string> cgroup_opts = |
132 | 792k | absl::StrSplit(fields[fields.size() - 1], ",", absl::SkipWhitespace()); |
133 | 792k | auto it = std::find(cgroup_opts.begin(), cgroup_opts.end(), subsystem); |
134 | 792k | if (it == cgroup_opts.end()) { |
135 | 577k | continue; |
136 | 577k | } |
137 | | // This is the right mount. |
138 | 214k | std::string mount_path, system_path; |
139 | 214k | RETURN_IF_ERROR(unescape_path(fields[4], &mount_path)); |
140 | 214k | RETURN_IF_ERROR(unescape_path(fields[3], &system_path)); |
141 | | // Strip trailing "/" so that both returned paths match in whether they have a |
142 | | // trailing "/". |
143 | 214k | if (system_path[system_path.size() - 1] == '/') { |
144 | 214k | system_path.pop_back(); |
145 | 214k | } |
146 | 214k | *result = {mount_path, system_path}; |
147 | 214k | return Status::OK(); |
148 | 214k | } |
149 | 214k | } |
150 | | |
151 | 214k | Status CGroupUtil::find_abs_cgroupv1_path(const std::string& subsystem, std::string* path) { |
152 | 214k | if (!cgroupsv1_enable()) { |
153 | 0 | return Status::InvalidArgument("cgroup is not enabled!"); |
154 | 0 | } |
155 | 214k | RETURN_IF_ERROR(find_global_cgroupv1(subsystem, path)); |
156 | 214k | pair<std::string, std::string> paths; |
157 | 214k | RETURN_IF_ERROR(find_cgroupv1_mounts(subsystem, &paths)); |
158 | 214k | const std::string& mount_path = paths.first; |
159 | 214k | const std::string& system_path = paths.second; |
160 | 214k | if (path->compare(0, system_path.size(), system_path) != 0) { |
161 | 0 | return Status::InvalidArgument("Expected CGroup path '{}' to start with '{}'", *path, |
162 | 0 | system_path); |
163 | 0 | } |
164 | 214k | path->replace(0, system_path.size(), mount_path); |
165 | 214k | return Status::OK(); |
166 | 214k | } |
167 | | |
168 | 0 | std::string CGroupUtil::cgroupv2_of_process() { |
169 | 0 | #if defined(OS_LINUX) |
170 | 0 | if (!cgroupsv2_enable()) { |
171 | 0 | return ""; |
172 | 0 | } |
173 | | // All PIDs assigned to a cgroup are in /sys/fs/cgroups/{cgroup_name}/cgroup.procs |
174 | | // A simpler way to get the membership is: |
175 | 0 | std::ifstream cgroup_name_file("/proc/self/cgroup"); |
176 | 0 | if (!cgroup_name_file.is_open()) { |
177 | 0 | return ""; |
178 | 0 | } |
179 | | // With cgroups v2, there will be a *single* line with prefix "0::/" |
180 | | // (see https://docs.kernel.org/admin-guide/cgroup-v2.html) |
181 | 0 | std::string cgroup; |
182 | 0 | std::getline(cgroup_name_file, cgroup); |
183 | 0 | static const std::string v2_prefix = "0::/"; |
184 | 0 | if (!cgroup.starts_with(v2_prefix)) { |
185 | 0 | return ""; |
186 | 0 | } |
187 | 0 | cgroup = cgroup.substr(v2_prefix.length()); |
188 | 0 | return cgroup; |
189 | | #else |
190 | | return ""; |
191 | | #endif |
192 | 0 | } |
193 | | |
194 | 0 | std::optional<std::string> CGroupUtil::get_cgroupsv2_path(const std::string& subsystem) { |
195 | 0 | #if defined(OS_LINUX) |
196 | 0 | if (!CGroupUtil::cgroupsv2_enable()) { |
197 | 0 | return {}; |
198 | 0 | } |
199 | | |
200 | 0 | std::string cgroup = CGroupUtil::cgroupv2_of_process(); |
201 | 0 | auto current_cgroup = cgroup.empty() ? default_cgroups_mount : (default_cgroups_mount / cgroup); |
202 | | |
203 | | // Return the bottom-most nested current memory file. If there is no such file at the current |
204 | | // level, try again at the parent level as memory settings are inherited. |
205 | 0 | while (current_cgroup != default_cgroups_mount.parent_path()) { |
206 | 0 | if (std::filesystem::exists(current_cgroup / subsystem)) { |
207 | 0 | return {current_cgroup}; |
208 | 0 | } |
209 | 0 | current_cgroup = current_cgroup.parent_path(); |
210 | 0 | } |
211 | 0 | return {}; |
212 | | #else |
213 | | return {}; |
214 | | #endif |
215 | 0 | } |
216 | | |
217 | | Status CGroupUtil::read_int_line_from_cgroup_file(const std::filesystem::path& file_path, |
218 | 2.10k | int64_t* val) { |
219 | 2.10k | std::ifstream file_stream(file_path, std::ios::in); |
220 | 2.10k | if (!file_stream.is_open()) { |
221 | 1 | return Status::CgroupError("Error open {}", file_path.string()); |
222 | 1 | } |
223 | | |
224 | 2.10k | std::string line; |
225 | 2.10k | getline(file_stream, line); |
226 | 2.10k | if (file_stream.fail() || file_stream.bad()) { |
227 | 0 | return Status::CgroupError("Error reading {}: {}", file_path.string(), get_str_err_msg()); |
228 | 0 | } |
229 | 2.10k | StringParser::ParseResult pr; |
230 | | // Parse into an int64_t If it overflows, returning the max value of int64_t is ok because that |
231 | | // is effectively unlimited. |
232 | 2.10k | *val = StringParser::string_to_int<int64_t>(line.c_str(), line.size(), &pr); |
233 | 2.10k | if ((pr != StringParser::PARSE_SUCCESS && pr != StringParser::PARSE_OVERFLOW)) { |
234 | 0 | return Status::InvalidArgument("Failed to parse {} as int64: '{}'", file_path.string(), |
235 | 0 | line); |
236 | 0 | } |
237 | 2.10k | return Status::OK(); |
238 | 2.10k | } |
239 | | |
240 | | void CGroupUtil::read_int_metric_from_cgroup_file( |
241 | | const std::filesystem::path& file_path, |
242 | 211k | std::unordered_map<std::string, int64_t>& metrics_map) { |
243 | 211k | std::ifstream cgroup_file(file_path, std::ios::in); |
244 | 211k | std::string line; |
245 | 7.39M | while (cgroup_file.good() && !cgroup_file.eof()) { |
246 | 7.18M | getline(cgroup_file, line); |
247 | 7.18M | std::vector<std::string> fields = absl::StrSplit(line, " ", absl::SkipWhitespace()); |
248 | 7.18M | if (fields.size() < 2) { |
249 | 211k | continue; |
250 | 211k | } |
251 | 6.97M | std::string key = fields[0].substr(0, fields[0].size()); |
252 | | |
253 | 6.97M | StringParser::ParseResult result; |
254 | 6.97M | auto value = |
255 | 6.97M | StringParser::string_to_int<int64_t>(fields[1].data(), fields[1].size(), &result); |
256 | | |
257 | 6.97M | if (result == StringParser::PARSE_SUCCESS) { |
258 | 6.97M | if (fields.size() == 2) { |
259 | 6.97M | metrics_map[key] = value; |
260 | 6.97M | } else if (fields[2] == "kB") { |
261 | 0 | metrics_map[key] = value * 1024L; |
262 | 0 | } |
263 | 6.97M | } |
264 | 6.97M | } |
265 | 211k | if (cgroup_file.is_open()) { |
266 | 211k | cgroup_file.close(); |
267 | 211k | } |
268 | 211k | } |
269 | | |
270 | | Status CGroupUtil::read_string_line_from_cgroup_file(const std::filesystem::path& file_path, |
271 | 791 | std::string* line_ptr) { |
272 | 791 | std::ifstream file_stream(file_path, std::ios::in); |
273 | 791 | if (!file_stream.is_open()) { |
274 | 0 | return Status::CgroupError("Error open {}", file_path.string()); |
275 | 0 | } |
276 | 791 | std::string line; |
277 | 791 | getline(file_stream, line); |
278 | 791 | if (file_stream.fail() || file_stream.bad()) { |
279 | 0 | return Status::CgroupError("Error reading {}: {}", file_path.string(), get_str_err_msg()); |
280 | 0 | } |
281 | 791 | *line_ptr = line; |
282 | 791 | return Status::OK(); |
283 | 791 | } |
284 | | |
285 | 793 | Status CGroupUtil::parse_cpuset_line(std::string cpuset_line, int* cpu_count_ptr) { |
286 | 793 | if (cpuset_line.empty()) { |
287 | 0 | return Status::CgroupError("cpuset line is empty"); |
288 | 0 | } |
289 | 793 | std::vector<std::string> ranges; |
290 | 793 | boost::split(ranges, cpuset_line, boost::is_any_of(",")); |
291 | 793 | int cpu_count = 0; |
292 | | |
293 | 800 | for (const std::string& range : ranges) { |
294 | 800 | std::vector<std::string> cpu_values; |
295 | 800 | boost::split(cpu_values, range, boost::is_any_of("-")); |
296 | | |
297 | 800 | if (cpu_values.size() == 2) { |
298 | 795 | int start = std::stoi(cpu_values[0]); |
299 | 795 | int end = std::stoi(cpu_values[1]); |
300 | 795 | cpu_count += (end - start) + 1; |
301 | 795 | } else { |
302 | 5 | cpu_count++; |
303 | 5 | } |
304 | 800 | } |
305 | 793 | *cpu_count_ptr = cpu_count; |
306 | 793 | return Status::OK(); |
307 | 793 | } |
308 | | |
309 | 787 | int CGroupUtil::get_cgroup_limited_cpu_number(int physical_cores) { |
310 | 787 | if (physical_cores <= 0) { |
311 | 0 | return physical_cores; |
312 | 0 | } |
313 | 787 | int ret = physical_cores; |
314 | 787 | #if defined(OS_LINUX) |
315 | | // For cgroup v2 |
316 | | // Child cgroup's cpu.max may bigger than parent group's cpu.max, |
317 | | // so it should look up from current cgroup to top group. |
318 | | // For cpuset, child cgroup's cpuset.cpus could not bigger thant parent's cpuset.cpus. |
319 | 787 | if (CGroupUtil::cgroupsv2_enable()) { |
320 | 0 | std::string cgroupv2_process_path = CGroupUtil::cgroupv2_of_process(); |
321 | 0 | if (cgroupv2_process_path.empty()) { |
322 | 0 | return ret; |
323 | 0 | } |
324 | 0 | std::filesystem::path current_cgroup_path = (default_cgroups_mount / cgroupv2_process_path); |
325 | 0 | ret = get_cgroup_v2_cpu_quota_number(current_cgroup_path, default_cgroups_mount, ret); |
326 | |
|
327 | 0 | current_cgroup_path = (default_cgroups_mount / cgroupv2_process_path); |
328 | 0 | ret = get_cgroup_v2_cpuset_number(current_cgroup_path, default_cgroups_mount, ret); |
329 | 787 | } else if (CGroupUtil::cgroupsv1_enable()) { |
330 | | // cpu quota, should find first not empty config from current path to top. |
331 | | // because if a process attach to current cgroup, its cpu quota may not be set. |
332 | 787 | std::string cpu_quota_path = ""; |
333 | 787 | Status cpu_quota_ret = CGroupUtil::find_abs_cgroupv1_path("cpu", &cpu_quota_path); |
334 | 787 | if (cpu_quota_ret.ok() && !cpu_quota_path.empty()) { |
335 | 787 | std::filesystem::path current_cgroup_path = cpu_quota_path; |
336 | 787 | ret = get_cgroup_v1_cpu_quota_number(current_cgroup_path, default_cgroups_mount, ret); |
337 | 787 | } |
338 | | |
339 | | //cpuset |
340 | | // just lookup current process cgroup path is enough |
341 | | // because if a process attach to current cgroup, its cpuset.cpus must be set. |
342 | 787 | std::string cpuset_path = ""; |
343 | 787 | Status cpuset_ret = CGroupUtil::find_abs_cgroupv1_path("cpuset", &cpuset_path); |
344 | 787 | if (cpuset_ret.ok() && !cpuset_path.empty()) { |
345 | 787 | std::filesystem::path current_path = cpuset_path; |
346 | 787 | ret = get_cgroup_v1_cpuset_number(current_path, ret); |
347 | 787 | } |
348 | 787 | } |
349 | 787 | #endif |
350 | 787 | return ret; |
351 | 787 | } |
352 | | |
353 | | int CGroupUtil::get_cgroup_v2_cpu_quota_number(std::filesystem::path& current_path, |
354 | | const std::filesystem::path& default_cg_mout_path, |
355 | 4 | int cpu_num) { |
356 | 4 | int ret = cpu_num; |
357 | 12 | while (current_path != default_cg_mout_path.parent_path()) { |
358 | 8 | std::ifstream cpu_max_file(current_path / "cpu.max"); |
359 | 8 | if (cpu_max_file.is_open()) { |
360 | 8 | std::string cpu_limit_str; |
361 | 8 | double cpu_period; |
362 | 8 | cpu_max_file >> cpu_limit_str >> cpu_period; |
363 | 8 | if (cpu_limit_str != "max" && cpu_period != 0) { |
364 | 5 | double cpu_limit = std::stod(cpu_limit_str); |
365 | 5 | ret = std::min(static_cast<int>(std::ceil(cpu_limit / cpu_period)), ret); |
366 | 5 | } |
367 | 8 | } |
368 | 8 | current_path = current_path.parent_path(); |
369 | 8 | } |
370 | 4 | return ret; |
371 | 4 | } |
372 | | |
373 | | int CGroupUtil::get_cgroup_v2_cpuset_number(std::filesystem::path& current_path, |
374 | | const std::filesystem::path& default_cg_mout_path, |
375 | 2 | int cpu_num) { |
376 | 2 | int ret = cpu_num; |
377 | 3 | while (current_path != default_cg_mout_path.parent_path()) { |
378 | 3 | std::ifstream cpuset_cpus_file(current_path / "cpuset.cpus.effective"); |
379 | 3 | current_path = current_path.parent_path(); |
380 | 3 | if (cpuset_cpus_file.is_open()) { |
381 | 3 | std::string cpuset_line; |
382 | 3 | cpuset_cpus_file >> cpuset_line; |
383 | 3 | if (cpuset_line.empty()) { |
384 | 1 | continue; |
385 | 1 | } |
386 | 2 | int cpus_count = 0; |
387 | 2 | static_cast<void>(CGroupUtil::parse_cpuset_line(cpuset_line, &cpus_count)); |
388 | 2 | ret = std::min(cpus_count, ret); |
389 | 2 | break; |
390 | 3 | } |
391 | 3 | } |
392 | 2 | return ret; |
393 | 2 | } |
394 | | |
395 | | int CGroupUtil::get_cgroup_v1_cpu_quota_number(std::filesystem::path& current_path, |
396 | | const std::filesystem::path& default_cg_mout_path, |
397 | 790 | int cpu_num) { |
398 | 790 | int ret = cpu_num; |
399 | 3.40k | while (current_path != default_cg_mout_path.parent_path()) { |
400 | 2.62k | std::ifstream cpu_quota_file(current_path / "cpu.cfs_quota_us"); |
401 | 2.62k | std::ifstream cpu_period_file(current_path / "cpu.cfs_period_us"); |
402 | 2.62k | if (cpu_quota_file.is_open() && cpu_period_file.is_open()) { |
403 | 1.83k | double cpu_quota_value; |
404 | 1.83k | double cpu_period_value; |
405 | 1.83k | cpu_quota_file >> cpu_quota_value; |
406 | 1.83k | cpu_period_file >> cpu_period_value; |
407 | 1.83k | if (cpu_quota_value > 0 && cpu_period_value > 0) { |
408 | 2 | ret = std::min(ret, |
409 | 2 | static_cast<int>(std::ceil(cpu_quota_value / cpu_period_value))); |
410 | 2 | break; |
411 | 2 | } |
412 | 1.83k | } |
413 | 2.61k | current_path = current_path.parent_path(); |
414 | 2.61k | } |
415 | 790 | return ret; |
416 | 790 | } |
417 | | |
418 | 788 | int CGroupUtil::get_cgroup_v1_cpuset_number(std::filesystem::path& current_path, int cpu_num) { |
419 | 788 | int ret = cpu_num; |
420 | 788 | std::string cpuset_line = ""; |
421 | 788 | Status cpuset_ret = CGroupUtil::read_string_line_from_cgroup_file( |
422 | 788 | (current_path / "cpuset.cpus"), &cpuset_line); |
423 | 788 | if (cpuset_ret.ok() && !cpuset_line.empty()) { |
424 | 788 | int cpuset_count = 0; |
425 | 788 | static_cast<void>(CGroupUtil::parse_cpuset_line(cpuset_line, &cpuset_count)); |
426 | 788 | if (cpuset_count > 0) { |
427 | 788 | ret = std::min(ret, cpuset_count); |
428 | 788 | } |
429 | 788 | } |
430 | 788 | return ret; |
431 | 788 | } |
432 | | |
433 | | } // namespace doris |