be/src/common/metrics/system_metrics.cpp
Line | Count | Source |
1 | | // Licensed to the Apache Software Foundation (ASF) under one |
2 | | // or more contributor license agreements. See the NOTICE file |
3 | | // distributed with this work for additional information |
4 | | // regarding copyright ownership. The ASF licenses this file |
5 | | // to you under the Apache License, Version 2.0 (the |
6 | | // "License"); you may not use this file except in compliance |
7 | | // with the License. You may obtain a copy of the License at |
8 | | // |
9 | | // http://www.apache.org/licenses/LICENSE-2.0 |
10 | | // |
11 | | // Unless required by applicable law or agreed to in writing, |
12 | | // software distributed under the License is distributed on an |
13 | | // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY |
14 | | // KIND, either express or implied. See the License for the |
15 | | // specific language governing permissions and limitations |
16 | | // under the License. |
17 | | |
18 | | #include "common/metrics/system_metrics.h" |
19 | | |
20 | | #include <absl/strings/str_split.h> |
21 | | #include <glog/logging.h> |
22 | | |
23 | | #include <functional> |
24 | | #include <ostream> |
25 | | #include <unordered_map> |
26 | | #include <utility> |
27 | | |
28 | | #include "common/cast_set.h" |
29 | | #include "common/config.h" |
30 | | #include "runtime/memory/jemalloc_control.h" |
31 | | #include "util/cgroup_util.h" |
32 | | #include "util/perf_counters.h" |
33 | | |
34 | | namespace doris { |
35 | | DEFINE_COUNTER_METRIC_PROTOTYPE_2ARG(avail_cpu_num, MetricUnit::NOUNIT); |
36 | | |
37 | | DEFINE_COUNTER_METRIC_PROTOTYPE_2ARG(host_cpu_num, MetricUnit::NOUNIT); |
38 | | struct CpuNumberMetrics { |
39 | 12 | CpuNumberMetrics(MetricEntity* ent) : entity(ent) { |
40 | 12 | INT_COUNTER_METRIC_REGISTER(entity, host_cpu_num); |
41 | 12 | INT_COUNTER_METRIC_REGISTER(entity, avail_cpu_num); |
42 | 12 | } |
43 | | |
44 | | IntCounter* host_cpu_num {nullptr}; |
45 | | IntCounter* avail_cpu_num {nullptr}; |
46 | | MetricEntity* entity = nullptr; |
47 | | }; |
48 | | |
49 | | #define DEFINE_CPU_COUNTER_METRIC(metric) \ |
50 | | DEFINE_COUNTER_METRIC_PROTOTYPE_5ARG(cpu_##metric, MetricUnit::PERCENT, "", cpu, \ |
51 | | Labels({{"mode", #metric}})); |
52 | | DEFINE_CPU_COUNTER_METRIC(user); |
53 | | DEFINE_CPU_COUNTER_METRIC(nice); |
54 | | DEFINE_CPU_COUNTER_METRIC(system); |
55 | | DEFINE_CPU_COUNTER_METRIC(idle); |
56 | | DEFINE_CPU_COUNTER_METRIC(iowait); |
57 | | DEFINE_CPU_COUNTER_METRIC(irq); |
58 | | DEFINE_CPU_COUNTER_METRIC(soft_irq); |
59 | | DEFINE_CPU_COUNTER_METRIC(steal); |
60 | | DEFINE_CPU_COUNTER_METRIC(guest); |
61 | | DEFINE_CPU_COUNTER_METRIC(guest_nice); |
62 | | |
63 | | // /proc/stat: http://www.linuxhowtos.org/System/procstat.htm |
64 | | struct CpuMetrics { |
65 | 187 | CpuMetrics(MetricEntity* ent) : entity(ent) { |
66 | 187 | INT_COUNTER_METRIC_REGISTER(entity, cpu_user); |
67 | 187 | INT_COUNTER_METRIC_REGISTER(entity, cpu_nice); |
68 | 187 | INT_COUNTER_METRIC_REGISTER(entity, cpu_system); |
69 | 187 | INT_COUNTER_METRIC_REGISTER(entity, cpu_idle); |
70 | 187 | INT_COUNTER_METRIC_REGISTER(entity, cpu_iowait); |
71 | 187 | INT_COUNTER_METRIC_REGISTER(entity, cpu_irq); |
72 | 187 | INT_COUNTER_METRIC_REGISTER(entity, cpu_soft_irq); |
73 | 187 | INT_COUNTER_METRIC_REGISTER(entity, cpu_steal); |
74 | 187 | INT_COUNTER_METRIC_REGISTER(entity, cpu_guest); |
75 | 187 | INT_COUNTER_METRIC_REGISTER(entity, cpu_guest_nice); |
76 | | |
77 | 187 | metrics[0] = cpu_user; |
78 | 187 | metrics[1] = cpu_nice; |
79 | 187 | metrics[2] = cpu_system; |
80 | 187 | metrics[3] = cpu_idle; |
81 | 187 | metrics[4] = cpu_iowait; |
82 | 187 | metrics[5] = cpu_irq; |
83 | 187 | metrics[6] = cpu_soft_irq; |
84 | 187 | metrics[7] = cpu_steal; |
85 | 187 | metrics[8] = cpu_guest; |
86 | 187 | metrics[9] = cpu_guest_nice; |
87 | 187 | } |
88 | | |
89 | | static constexpr int cpu_num_metrics = 10; |
90 | | |
91 | | MetricEntity* entity = nullptr; |
92 | | IntCounter* cpu_user; |
93 | | IntCounter* cpu_nice; |
94 | | IntCounter* cpu_system; |
95 | | IntCounter* cpu_idle; |
96 | | IntCounter* cpu_iowait; |
97 | | IntCounter* cpu_irq; |
98 | | IntCounter* cpu_soft_irq; |
99 | | IntCounter* cpu_steal; |
100 | | IntCounter* cpu_guest; |
101 | | IntCounter* cpu_guest_nice; |
102 | | |
103 | | IntCounter* metrics[cpu_num_metrics]; |
104 | | }; |
105 | | |
106 | | #define DEFINE_MEMORY_GAUGE_METRIC(metric, unit) \ |
107 | | DEFINE_GAUGE_METRIC_PROTOTYPE_2ARG(memory_##metric, unit); |
108 | | DEFINE_MEMORY_GAUGE_METRIC(allocated_bytes, MetricUnit::BYTES); |
109 | | DEFINE_MEMORY_GAUGE_METRIC(pgpgin, MetricUnit::NOUNIT); |
110 | | DEFINE_MEMORY_GAUGE_METRIC(pgpgout, MetricUnit::NOUNIT); |
111 | | DEFINE_MEMORY_GAUGE_METRIC(pswpin, MetricUnit::NOUNIT); |
112 | | DEFINE_MEMORY_GAUGE_METRIC(pswpout, MetricUnit::NOUNIT); |
113 | | #ifndef USE_JEMALLOC |
114 | | DEFINE_MEMORY_GAUGE_METRIC(tcmalloc_allocated_bytes, MetricUnit::BYTES); |
115 | | DEFINE_MEMORY_GAUGE_METRIC(tcmalloc_total_thread_cache_bytes, MetricUnit::BYTES); |
116 | | DEFINE_MEMORY_GAUGE_METRIC(tcmalloc_central_cache_free_bytes, MetricUnit::BYTES); |
117 | | DEFINE_MEMORY_GAUGE_METRIC(tcmalloc_transfer_cache_free_bytes, MetricUnit::BYTES); |
118 | | DEFINE_MEMORY_GAUGE_METRIC(tcmalloc_thread_cache_free_bytes, MetricUnit::BYTES); |
119 | | DEFINE_MEMORY_GAUGE_METRIC(tcmalloc_pageheap_free_bytes, MetricUnit::BYTES); |
120 | | DEFINE_MEMORY_GAUGE_METRIC(tcmalloc_pageheap_unmapped_bytes, MetricUnit::BYTES); |
121 | | #else |
122 | | DEFINE_MEMORY_GAUGE_METRIC(jemalloc_allocated_bytes, MetricUnit::BYTES); |
123 | | DEFINE_MEMORY_GAUGE_METRIC(jemalloc_active_bytes, MetricUnit::BYTES); |
124 | | DEFINE_MEMORY_GAUGE_METRIC(jemalloc_metadata_bytes, MetricUnit::BYTES); |
125 | | DEFINE_MEMORY_GAUGE_METRIC(jemalloc_resident_bytes, MetricUnit::BYTES); |
126 | | DEFINE_MEMORY_GAUGE_METRIC(jemalloc_mapped_bytes, MetricUnit::BYTES); |
127 | | DEFINE_MEMORY_GAUGE_METRIC(jemalloc_retained_bytes, MetricUnit::BYTES); |
128 | | DEFINE_MEMORY_GAUGE_METRIC(jemalloc_tcache_bytes, MetricUnit::BYTES); |
129 | | DEFINE_MEMORY_GAUGE_METRIC(jemalloc_pactive_num, MetricUnit::NOUNIT); |
130 | | DEFINE_MEMORY_GAUGE_METRIC(jemalloc_pdirty_num, MetricUnit::NOUNIT); |
131 | | DEFINE_MEMORY_GAUGE_METRIC(jemalloc_pmuzzy_num, MetricUnit::NOUNIT); |
132 | | DEFINE_MEMORY_GAUGE_METRIC(jemalloc_dirty_purged_num, MetricUnit::NOUNIT); |
133 | | DEFINE_MEMORY_GAUGE_METRIC(jemalloc_muzzy_purged_num, MetricUnit::NOUNIT); |
134 | | #endif |
135 | | |
136 | | struct MemoryMetrics { |
137 | 12 | MemoryMetrics(MetricEntity* ent) : entity(ent) { |
138 | 12 | INT_GAUGE_METRIC_REGISTER(entity, memory_allocated_bytes); |
139 | 12 | INT_GAUGE_METRIC_REGISTER(entity, memory_pgpgin); |
140 | 12 | INT_GAUGE_METRIC_REGISTER(entity, memory_pgpgout); |
141 | 12 | INT_GAUGE_METRIC_REGISTER(entity, memory_pswpin); |
142 | 12 | INT_GAUGE_METRIC_REGISTER(entity, memory_pswpout); |
143 | | |
144 | 12 | #ifndef USE_JEMALLOC |
145 | 12 | INT_GAUGE_METRIC_REGISTER(entity, memory_tcmalloc_allocated_bytes); |
146 | 12 | INT_GAUGE_METRIC_REGISTER(entity, memory_tcmalloc_total_thread_cache_bytes); |
147 | 12 | INT_GAUGE_METRIC_REGISTER(entity, memory_tcmalloc_central_cache_free_bytes); |
148 | 12 | INT_GAUGE_METRIC_REGISTER(entity, memory_tcmalloc_transfer_cache_free_bytes); |
149 | 12 | INT_GAUGE_METRIC_REGISTER(entity, memory_tcmalloc_thread_cache_free_bytes); |
150 | 12 | INT_GAUGE_METRIC_REGISTER(entity, memory_tcmalloc_pageheap_free_bytes); |
151 | 12 | INT_GAUGE_METRIC_REGISTER(entity, memory_tcmalloc_pageheap_unmapped_bytes); |
152 | | #else |
153 | | INT_GAUGE_METRIC_REGISTER(entity, memory_jemalloc_allocated_bytes); |
154 | | INT_GAUGE_METRIC_REGISTER(entity, memory_jemalloc_active_bytes); |
155 | | INT_GAUGE_METRIC_REGISTER(entity, memory_jemalloc_metadata_bytes); |
156 | | INT_GAUGE_METRIC_REGISTER(entity, memory_jemalloc_resident_bytes); |
157 | | INT_GAUGE_METRIC_REGISTER(entity, memory_jemalloc_mapped_bytes); |
158 | | INT_GAUGE_METRIC_REGISTER(entity, memory_jemalloc_retained_bytes); |
159 | | INT_GAUGE_METRIC_REGISTER(entity, memory_jemalloc_tcache_bytes); |
160 | | INT_GAUGE_METRIC_REGISTER(entity, memory_jemalloc_pactive_num); |
161 | | INT_GAUGE_METRIC_REGISTER(entity, memory_jemalloc_pdirty_num); |
162 | | INT_GAUGE_METRIC_REGISTER(entity, memory_jemalloc_pmuzzy_num); |
163 | | INT_GAUGE_METRIC_REGISTER(entity, memory_jemalloc_dirty_purged_num); |
164 | | INT_GAUGE_METRIC_REGISTER(entity, memory_jemalloc_muzzy_purged_num); |
165 | | #endif |
166 | 12 | } |
167 | | |
168 | | MetricEntity* entity = nullptr; |
169 | | IntGauge* memory_allocated_bytes; |
170 | | IntGauge* memory_pgpgin; |
171 | | IntGauge* memory_pgpgout; |
172 | | IntGauge* memory_pswpin; |
173 | | IntGauge* memory_pswpout; |
174 | | |
175 | | #ifndef USE_JEMALLOC |
176 | | IntGauge* memory_tcmalloc_allocated_bytes; |
177 | | IntGauge* memory_tcmalloc_total_thread_cache_bytes; |
178 | | IntGauge* memory_tcmalloc_central_cache_free_bytes; |
179 | | IntGauge* memory_tcmalloc_transfer_cache_free_bytes; |
180 | | IntGauge* memory_tcmalloc_thread_cache_free_bytes; |
181 | | IntGauge* memory_tcmalloc_pageheap_free_bytes; |
182 | | IntGauge* memory_tcmalloc_pageheap_unmapped_bytes; |
183 | | #else |
184 | | IntGauge* memory_jemalloc_allocated_bytes; |
185 | | IntGauge* memory_jemalloc_active_bytes; |
186 | | IntGauge* memory_jemalloc_metadata_bytes; |
187 | | IntGauge* memory_jemalloc_resident_bytes; |
188 | | IntGauge* memory_jemalloc_mapped_bytes; |
189 | | IntGauge* memory_jemalloc_retained_bytes; |
190 | | IntGauge* memory_jemalloc_tcache_bytes; |
191 | | IntGauge* memory_jemalloc_pactive_num; |
192 | | IntGauge* memory_jemalloc_pdirty_num; |
193 | | IntGauge* memory_jemalloc_pmuzzy_num; |
194 | | IntGauge* memory_jemalloc_dirty_purged_num; |
195 | | IntGauge* memory_jemalloc_muzzy_purged_num; |
196 | | #endif |
197 | | }; |
198 | | |
199 | | #define DEFINE_DISK_COUNTER_METRIC(metric, unit) \ |
200 | | DEFINE_COUNTER_METRIC_PROTOTYPE_2ARG(disk_##metric, unit); |
201 | | DEFINE_DISK_COUNTER_METRIC(reads_completed, MetricUnit::OPERATIONS); |
202 | | DEFINE_DISK_COUNTER_METRIC(bytes_read, MetricUnit::BYTES); |
203 | | DEFINE_DISK_COUNTER_METRIC(read_time_ms, MetricUnit::MILLISECONDS); |
204 | | DEFINE_DISK_COUNTER_METRIC(writes_completed, MetricUnit::OPERATIONS); |
205 | | DEFINE_DISK_COUNTER_METRIC(bytes_written, MetricUnit::BYTES); |
206 | | DEFINE_DISK_COUNTER_METRIC(write_time_ms, MetricUnit::MILLISECONDS); |
207 | | DEFINE_DISK_COUNTER_METRIC(io_time_ms, MetricUnit::MILLISECONDS); |
208 | | DEFINE_DISK_COUNTER_METRIC(io_time_weigthed, MetricUnit::MILLISECONDS); |
209 | | |
210 | | struct DiskMetrics { |
211 | 9 | DiskMetrics(MetricEntity* ent) : entity(ent) { |
212 | 9 | INT_COUNTER_METRIC_REGISTER(entity, disk_reads_completed); |
213 | 9 | INT_COUNTER_METRIC_REGISTER(entity, disk_bytes_read); |
214 | 9 | INT_COUNTER_METRIC_REGISTER(entity, disk_read_time_ms); |
215 | 9 | INT_COUNTER_METRIC_REGISTER(entity, disk_writes_completed); |
216 | 9 | INT_COUNTER_METRIC_REGISTER(entity, disk_bytes_written); |
217 | 9 | INT_COUNTER_METRIC_REGISTER(entity, disk_write_time_ms); |
218 | 9 | INT_COUNTER_METRIC_REGISTER(entity, disk_io_time_ms); |
219 | 9 | INT_COUNTER_METRIC_REGISTER(entity, disk_io_time_weigthed); |
220 | 9 | } |
221 | | |
222 | | MetricEntity* entity = nullptr; |
223 | | IntCounter* disk_reads_completed; |
224 | | IntCounter* disk_bytes_read; |
225 | | IntCounter* disk_read_time_ms; |
226 | | IntCounter* disk_writes_completed; |
227 | | IntCounter* disk_bytes_written; |
228 | | IntCounter* disk_write_time_ms; |
229 | | IntCounter* disk_io_time_ms; |
230 | | IntCounter* disk_io_time_weigthed; |
231 | | }; |
232 | | |
233 | | #define DEFINE_NETWORK_COUNTER_METRIC(metric, unit) \ |
234 | | DEFINE_COUNTER_METRIC_PROTOTYPE_2ARG(network_##metric, unit); |
235 | | DEFINE_NETWORK_COUNTER_METRIC(receive_bytes, MetricUnit::BYTES); |
236 | | DEFINE_NETWORK_COUNTER_METRIC(receive_packets, MetricUnit::PACKETS); |
237 | | DEFINE_NETWORK_COUNTER_METRIC(send_bytes, MetricUnit::BYTES); |
238 | | DEFINE_NETWORK_COUNTER_METRIC(send_packets, MetricUnit::PACKETS); |
239 | | |
240 | | struct NetworkMetrics { |
241 | 37 | NetworkMetrics(MetricEntity* ent) : entity(ent) { |
242 | 37 | INT_COUNTER_METRIC_REGISTER(entity, network_receive_bytes); |
243 | 37 | INT_COUNTER_METRIC_REGISTER(entity, network_receive_packets); |
244 | 37 | INT_COUNTER_METRIC_REGISTER(entity, network_send_bytes); |
245 | 37 | INT_COUNTER_METRIC_REGISTER(entity, network_send_packets); |
246 | 37 | } |
247 | | |
248 | | MetricEntity* entity = nullptr; |
249 | | IntCounter* network_receive_bytes; |
250 | | IntCounter* network_receive_packets; |
251 | | IntCounter* network_send_bytes; |
252 | | IntCounter* network_send_packets; |
253 | | }; |
254 | | |
255 | | #define DEFINE_SNMP_COUNTER_METRIC(metric, unit, desc) \ |
256 | | DEFINE_COUNTER_METRIC_PROTOTYPE_3ARG(snmp_##metric, unit, desc); |
257 | | DEFINE_SNMP_COUNTER_METRIC(tcp_in_errs, MetricUnit::NOUNIT, |
258 | | "The number of all problematic TCP packets received"); |
259 | | DEFINE_SNMP_COUNTER_METRIC(tcp_retrans_segs, MetricUnit::NOUNIT, "All TCP packets retransmitted"); |
260 | | DEFINE_SNMP_COUNTER_METRIC(tcp_in_segs, MetricUnit::NOUNIT, "All received TCP packets"); |
261 | | DEFINE_SNMP_COUNTER_METRIC(tcp_out_segs, MetricUnit::NOUNIT, "All send TCP packets with RST mark"); |
262 | | |
263 | | // metrics read from /proc/net/snmp |
264 | | struct SnmpMetrics { |
265 | 12 | SnmpMetrics(MetricEntity* ent) : entity(ent) { |
266 | 12 | INT_COUNTER_METRIC_REGISTER(entity, snmp_tcp_in_errs); |
267 | 12 | INT_COUNTER_METRIC_REGISTER(entity, snmp_tcp_retrans_segs); |
268 | 12 | INT_COUNTER_METRIC_REGISTER(entity, snmp_tcp_in_segs); |
269 | 12 | INT_COUNTER_METRIC_REGISTER(entity, snmp_tcp_out_segs); |
270 | 12 | } |
271 | | |
272 | | MetricEntity* entity = nullptr; |
273 | | IntCounter* snmp_tcp_in_errs; |
274 | | IntCounter* snmp_tcp_retrans_segs; |
275 | | IntCounter* snmp_tcp_in_segs; |
276 | | IntCounter* snmp_tcp_out_segs; |
277 | | }; |
278 | | |
279 | | #define DEFINE_FD_COUNTER_METRIC(metric, unit) \ |
280 | | DEFINE_GAUGE_METRIC_PROTOTYPE_2ARG(fd_##metric, unit); |
281 | | DEFINE_FD_COUNTER_METRIC(num_limit, MetricUnit::NOUNIT); |
282 | | DEFINE_FD_COUNTER_METRIC(num_used, MetricUnit::NOUNIT); |
283 | | |
284 | | struct FileDescriptorMetrics { |
285 | 12 | FileDescriptorMetrics(MetricEntity* ent) : entity(ent) { |
286 | 12 | INT_GAUGE_METRIC_REGISTER(entity, fd_num_limit); |
287 | 12 | INT_GAUGE_METRIC_REGISTER(entity, fd_num_used); |
288 | 12 | } |
289 | | |
290 | | MetricEntity* entity = nullptr; |
291 | | IntGauge* fd_num_limit; |
292 | | IntGauge* fd_num_used; |
293 | | }; |
294 | | |
295 | | #define DEFINE_LOAD_AVERAGE_DOUBLE_METRIC(metric) \ |
296 | | DEFINE_GAUGE_METRIC_PROTOTYPE_5ARG(load_average_##metric, MetricUnit::NOUNIT, "", \ |
297 | | load_average, Labels({{"mode", #metric}})); |
298 | | DEFINE_LOAD_AVERAGE_DOUBLE_METRIC(1_minutes); |
299 | | DEFINE_LOAD_AVERAGE_DOUBLE_METRIC(5_minutes); |
300 | | DEFINE_LOAD_AVERAGE_DOUBLE_METRIC(15_minutes); |
301 | | |
302 | | struct LoadAverageMetrics { |
303 | 12 | LoadAverageMetrics(MetricEntity* ent) : entity(ent) { |
304 | 12 | DOUBLE_GAUGE_METRIC_REGISTER(entity, load_average_1_minutes); |
305 | 12 | DOUBLE_GAUGE_METRIC_REGISTER(entity, load_average_5_minutes); |
306 | 12 | DOUBLE_GAUGE_METRIC_REGISTER(entity, load_average_15_minutes); |
307 | 12 | } |
308 | | |
309 | | MetricEntity* entity = nullptr; |
310 | | DoubleGauge* load_average_1_minutes; |
311 | | DoubleGauge* load_average_5_minutes; |
312 | | DoubleGauge* load_average_15_minutes; |
313 | | }; |
314 | | |
315 | | #define DEFINE_PROC_STAT_COUNTER_METRIC(metric) \ |
316 | | DEFINE_COUNTER_METRIC_PROTOTYPE_5ARG(proc_##metric, MetricUnit::NOUNIT, "", proc, \ |
317 | | Labels({{"mode", #metric}})); |
318 | | DEFINE_PROC_STAT_COUNTER_METRIC(interrupt); |
319 | | DEFINE_PROC_STAT_COUNTER_METRIC(ctxt_switch); |
320 | | DEFINE_PROC_STAT_COUNTER_METRIC(procs_running); |
321 | | DEFINE_PROC_STAT_COUNTER_METRIC(procs_blocked); |
322 | | |
323 | | struct ProcMetrics { |
324 | 12 | ProcMetrics(MetricEntity* ent) : entity(ent) { |
325 | 12 | INT_COUNTER_METRIC_REGISTER(entity, proc_interrupt); |
326 | 12 | INT_COUNTER_METRIC_REGISTER(entity, proc_ctxt_switch); |
327 | 12 | INT_COUNTER_METRIC_REGISTER(entity, proc_procs_running); |
328 | 12 | INT_COUNTER_METRIC_REGISTER(entity, proc_procs_blocked); |
329 | 12 | } |
330 | | |
331 | | MetricEntity* entity = nullptr; |
332 | | |
333 | | IntCounter* proc_interrupt; |
334 | | IntCounter* proc_ctxt_switch; |
335 | | IntCounter* proc_procs_running; |
336 | | IntCounter* proc_procs_blocked; |
337 | | }; |
338 | | |
339 | | DEFINE_GAUGE_CORE_METRIC_PROTOTYPE_2ARG(max_disk_io_util_percent, MetricUnit::PERCENT); |
340 | | DEFINE_GAUGE_CORE_METRIC_PROTOTYPE_2ARG(max_network_send_bytes_rate, MetricUnit::BYTES); |
341 | | DEFINE_GAUGE_CORE_METRIC_PROTOTYPE_2ARG(max_network_receive_bytes_rate, MetricUnit::BYTES); |
342 | | |
343 | | const char* SystemMetrics::_s_hook_name = "system_metrics"; |
344 | | |
345 | | SystemMetrics::SystemMetrics(MetricRegistry* registry, const std::set<std::string>& disk_devices, |
346 | 12 | const std::vector<std::string>& network_interfaces) { |
347 | 12 | DCHECK(registry != nullptr); |
348 | 12 | _registry = registry; |
349 | 12 | _server_entity = _registry->register_entity("server"); |
350 | 12 | DCHECK(_server_entity != nullptr); |
351 | 12 | _server_entity->register_hook(_s_hook_name, std::bind(&SystemMetrics::update, this)); |
352 | 12 | _install_cpu_metrics(); |
353 | 12 | _install_memory_metrics(_server_entity.get()); |
354 | 12 | _install_disk_metrics(disk_devices); |
355 | 12 | _install_net_metrics(network_interfaces); |
356 | 12 | _install_fd_metrics(_server_entity.get()); |
357 | 12 | _install_snmp_metrics(_server_entity.get()); |
358 | 12 | _install_load_avg_metrics(_server_entity.get()); |
359 | 12 | _install_proc_metrics(_server_entity.get()); |
360 | | |
361 | 12 | INT_GAUGE_METRIC_REGISTER(_server_entity.get(), max_disk_io_util_percent); |
362 | 12 | INT_GAUGE_METRIC_REGISTER(_server_entity.get(), max_network_send_bytes_rate); |
363 | 12 | INT_GAUGE_METRIC_REGISTER(_server_entity.get(), max_network_receive_bytes_rate); |
364 | 12 | } |
365 | | |
366 | 8 | SystemMetrics::~SystemMetrics() { |
367 | 8 | DCHECK(_server_entity != nullptr); |
368 | 8 | _server_entity->deregister_hook(_s_hook_name); |
369 | | |
370 | 95 | for (auto& it : _cpu_metrics) { |
371 | 95 | delete it.second; |
372 | 95 | } |
373 | 8 | for (auto& it : _disk_metrics) { |
374 | 5 | delete it.second; |
375 | 5 | } |
376 | 17 | for (auto& it : _network_metrics) { |
377 | 17 | delete it.second; |
378 | 17 | } |
379 | 8 | if (_line_ptr != nullptr) { |
380 | 7 | free(_line_ptr); |
381 | 7 | } |
382 | 8 | } |
383 | | |
384 | 765 | void SystemMetrics::update() { |
385 | 765 | _update_cpu_metrics(); |
386 | 765 | _update_memory_metrics(); |
387 | 765 | _update_disk_metrics(); |
388 | 765 | _update_net_metrics(); |
389 | 765 | _update_fd_metrics(); |
390 | 765 | _update_snmp_metrics(); |
391 | 765 | _update_load_avg_metrics(); |
392 | 765 | _update_proc_metrics(); |
393 | 765 | } |
394 | | |
395 | 12 | void SystemMetrics::_install_cpu_metrics() { |
396 | 12 | get_cpu_name(); |
397 | | |
398 | 12 | int cpu_num = 0; |
399 | 187 | for (auto cpu_name : _cpu_names) { |
400 | | // NOTE: cpu_name comes from /proc/stat which named 'cpu' is not a real cpu name, it should be skipped. |
401 | 187 | if (cpu_name != "cpu") { |
402 | 176 | cpu_num++; |
403 | 176 | } |
404 | 187 | auto cpu_entity = _registry->register_entity(cpu_name, {{"device", cpu_name}}); |
405 | 187 | CpuMetrics* metrics = new CpuMetrics(cpu_entity.get()); |
406 | 187 | _cpu_metrics.emplace(cpu_name, metrics); |
407 | 187 | } |
408 | | |
409 | 12 | auto cpu_num_entity = _registry->register_entity("doris_be_host_cpu_num"); |
410 | 12 | _cpu_num_metrics = std::make_unique<CpuNumberMetrics>(cpu_num_entity.get()); |
411 | | |
412 | 12 | _cpu_num_metrics->host_cpu_num->set_value(cpu_num); |
413 | 12 | } |
414 | | |
#ifdef BE_TEST
// Test-only path overrides: under BE_TEST the updaters open these files
// instead of the real /proc entries.
const char* k_ut_stat_path = nullptr;
const char* k_ut_diskstats_path = nullptr;
const char* k_ut_net_dev_path = nullptr;
const char* k_ut_fd_path = nullptr;
const char* k_ut_net_snmp_path = nullptr;
const char* k_ut_load_avg_path = nullptr;
const char* k_ut_vmstat_path = nullptr;
#endif
424 | | |
425 | 765 | void SystemMetrics::_update_cpu_metrics() { |
426 | | #ifdef BE_TEST |
427 | | FILE* fp = fopen(k_ut_stat_path, "r"); |
428 | | #else |
429 | 765 | FILE* fp = fopen("/proc/stat", "r"); |
430 | 765 | #endif |
431 | 765 | if (fp == nullptr) { |
432 | 0 | char buf[64]; |
433 | 0 | LOG(WARNING) << "open /proc/stat failed, errno=" << errno |
434 | 0 | << ", message=" << strerror_r(errno, buf, 64); |
435 | 0 | if (errno == 24) { |
436 | 0 | _file_handle_deplenish_counter++; |
437 | 0 | } else { |
438 | 0 | _file_handle_deplenish_counter = 0; |
439 | 0 | } |
440 | | // Threshold of the number of consecutive failures |
441 | 0 | if (_file_handle_deplenish_counter >= config::file_handles_deplenish_frequency_times) { |
442 | 0 | LOG(FATAL) << "The system file handles are insufficient, causing service exceptions" |
443 | 0 | << ", BE will exit. please check the configs 'soft nofile'" |
444 | 0 | << " and 'hard nofile' of /etc/security/limits.conf "; |
445 | 0 | exit(-1); |
446 | 0 | } |
447 | 0 | return; |
448 | 0 | } |
449 | | |
450 | 22.8k | while (getline(&_line_ptr, &_line_buf_size, fp) > 0) { |
451 | 22.0k | char cpu[16]; |
452 | 22.0k | int64_t values[CpuMetrics::cpu_num_metrics]; |
453 | 22.0k | memset(values, 0, sizeof(values)); |
454 | 22.0k | int num = sscanf(_line_ptr, |
455 | 22.0k | "%15s" |
456 | 22.0k | " %" PRId64 " %" PRId64 " %" PRId64 " %" PRId64 " %" PRId64 " %" PRId64 |
457 | 22.0k | " %" PRId64 " %" PRId64 " %" PRId64 " %" PRId64, |
458 | 22.0k | cpu, &values[0], &values[1], &values[2], &values[3], &values[4], |
459 | 22.0k | &values[5], &values[6], &values[7], &values[8], &values[9]); |
460 | 22.0k | if (num < 4) { |
461 | 3.79k | continue; |
462 | 3.79k | } |
463 | | |
464 | 18.2k | std::string cpu_name(cpu); |
465 | 18.2k | auto it = _cpu_metrics.find(cpu_name); |
466 | 18.2k | if (it == _cpu_metrics.end()) { |
467 | 1.51k | continue; |
468 | 1.51k | } |
469 | | |
470 | 16.7k | if (cpu_name == "cpu") { |
471 | 765 | AggregateCpuTime aggregate_cpu_time; |
472 | 765 | aggregate_cpu_time.total_time = values[0] + values[1] + values[2] + values[3] + |
473 | 765 | values[4] + values[5] + values[6] + values[7]; |
474 | 765 | aggregate_cpu_time.idle_time = values[3] + values[4]; |
475 | 765 | aggregate_cpu_time.initialized = aggregate_cpu_time.total_time > 0; |
476 | | // Publish a consistent aggregate snapshot derived from one /proc/stat row. |
477 | 765 | std::lock_guard<std::mutex> lk(_aggregate_cpu_time_mutex); |
478 | 765 | _aggregate_cpu_time = aggregate_cpu_time; |
479 | 765 | } |
480 | | |
481 | 184k | for (int i = 0; i < CpuMetrics::cpu_num_metrics; ++i) { |
482 | 167k | it->second->metrics[i]->set_value(values[i]); |
483 | 167k | } |
484 | 16.7k | } |
485 | | |
486 | 765 | if (ferror(fp) != 0) { |
487 | 0 | char buf[64]; |
488 | 0 | LOG(WARNING) << "getline failed, errno=" << errno |
489 | 0 | << ", message=" << strerror_r(errno, buf, 64); |
490 | 0 | } |
491 | | |
492 | 765 | fclose(fp); |
493 | 765 | } |
494 | | |
495 | 12 | void SystemMetrics::_install_memory_metrics(MetricEntity* entity) { |
496 | 12 | _memory_metrics.reset(new MemoryMetrics(entity)); |
497 | 12 | } |
498 | | |
499 | 765 | void SystemMetrics::_update_memory_metrics() { |
500 | 765 | _memory_metrics->memory_allocated_bytes->set_value(PerfCounters::get_vm_rss()); |
501 | 765 | get_metrics_from_proc_vmstat(); |
502 | 765 | } |
503 | | |
504 | 0 | void SystemMetrics::update_allocator_metrics() { |
505 | 0 | #if defined(ADDRESS_SANITIZER) || defined(LEAK_SANITIZER) || defined(THREAD_SANITIZER) |
506 | 0 | LOG(INFO) << "Memory tracking is not available with address sanitizer builds."; |
507 | | #elif defined(USE_JEMALLOC) |
508 | | _memory_metrics->memory_jemalloc_allocated_bytes->set_value( |
509 | | JemallocControl::get_jemallctl_value<int64_t>("stats.allocated")); |
510 | | _memory_metrics->memory_jemalloc_active_bytes->set_value( |
511 | | JemallocControl::get_jemallctl_value<int64_t>("stats.active")); |
512 | | _memory_metrics->memory_jemalloc_metadata_bytes->set_value( |
513 | | JemallocControl::get_jemallctl_value<int64_t>("stats.metadata")); |
514 | | _memory_metrics->memory_jemalloc_resident_bytes->set_value( |
515 | | JemallocControl::get_jemallctl_value<int64_t>("stats.resident")); |
516 | | _memory_metrics->memory_jemalloc_mapped_bytes->set_value( |
517 | | JemallocControl::get_jemallctl_value<int64_t>("stats.mapped")); |
518 | | _memory_metrics->memory_jemalloc_retained_bytes->set_value( |
519 | | JemallocControl::get_jemallctl_value<int64_t>("stats.retained")); |
520 | | _memory_metrics->memory_jemalloc_tcache_bytes->set_value( |
521 | | JemallocControl::get_je_all_arena_metrics("tcache_bytes")); |
522 | | _memory_metrics->memory_jemalloc_pactive_num->set_value( |
523 | | JemallocControl::get_je_all_arena_metrics("pactive")); |
524 | | _memory_metrics->memory_jemalloc_pdirty_num->set_value( |
525 | | JemallocControl::get_je_all_arena_metrics("pdirty")); |
526 | | _memory_metrics->memory_jemalloc_pmuzzy_num->set_value( |
527 | | JemallocControl::get_je_all_arena_metrics("pmuzzy")); |
528 | | _memory_metrics->memory_jemalloc_dirty_purged_num->set_value( |
529 | | JemallocControl::get_je_all_arena_metrics("dirty_purged")); |
530 | | _memory_metrics->memory_jemalloc_muzzy_purged_num->set_value( |
531 | | JemallocControl::get_je_all_arena_metrics("muzzy_purged")); |
532 | | #else |
533 | | _memory_metrics->memory_tcmalloc_allocated_bytes->set_value( |
534 | | JemallocControl::get_tc_metrics("generic.total_physical_bytes")); |
535 | | _memory_metrics->memory_tcmalloc_total_thread_cache_bytes->set_value( |
536 | | JemallocControl::je_cache_bytes()); |
537 | | _memory_metrics->memory_tcmalloc_central_cache_free_bytes->set_value( |
538 | | JemallocControl::get_tc_metrics("tcmalloc.central_cache_free_bytes")); |
539 | | _memory_metrics->memory_tcmalloc_transfer_cache_free_bytes->set_value( |
540 | | JemallocControl::get_tc_metrics("tcmalloc.transfer_cache_free_bytes")); |
541 | | _memory_metrics->memory_tcmalloc_thread_cache_free_bytes->set_value( |
542 | | JemallocControl::get_tc_metrics("tcmalloc.thread_cache_free_bytes")); |
543 | | _memory_metrics->memory_tcmalloc_pageheap_free_bytes->set_value( |
544 | | JemallocControl::get_tc_metrics("tcmalloc.pageheap_free_bytes")); |
545 | | _memory_metrics->memory_tcmalloc_pageheap_unmapped_bytes->set_value( |
546 | | JemallocControl::get_tc_metrics("tcmalloc.pageheap_unmapped_bytes")); |
547 | | #endif |
548 | 0 | } |
549 | | |
550 | 12 | void SystemMetrics::_install_disk_metrics(const std::set<std::string>& disk_devices) { |
551 | 12 | for (auto& disk_device : disk_devices) { |
552 | 9 | auto disk_entity = _registry->register_entity(std::string("disk_metrics.") + disk_device, |
553 | 9 | {{"device", disk_device}}); |
554 | 9 | DiskMetrics* metrics = new DiskMetrics(disk_entity.get()); |
555 | 9 | _disk_metrics.emplace(disk_device, metrics); |
556 | 9 | } |
557 | 12 | } |
558 | | |
559 | 765 | void SystemMetrics::_update_disk_metrics() { |
560 | | #ifdef BE_TEST |
561 | | FILE* fp = fopen(k_ut_diskstats_path, "r"); |
562 | | #else |
563 | 765 | FILE* fp = fopen("/proc/diskstats", "r"); |
564 | 765 | #endif |
565 | 765 | if (fp == nullptr) { |
566 | 7 | char buf[64]; |
567 | 7 | LOG(WARNING) << "open /proc/diskstats failed, errno=" << errno |
568 | 7 | << ", message=" << strerror_r(errno, buf, 64); |
569 | 7 | return; |
570 | 7 | } |
571 | | |
572 | | // /proc/diskstats: https://www.kernel.org/doc/Documentation/ABI/testing/procfs-diskstats |
573 | | // 1 - major number |
574 | | // 2 - minor mumber |
575 | | // 3 - device name |
576 | | // 4 - reads completed successfully |
577 | | // 5 - reads merged |
578 | | // 6 - sectors read |
579 | | // 7 - time spent reading (ms) |
580 | | // 8 - writes completed |
581 | | // 9 - writes merged |
582 | | // 10 - sectors written |
583 | | // 11 - time spent writing (ms) |
584 | | // 12 - I/Os currently in progress |
585 | | // 13 - time spent doing I/Os (ms) |
586 | | // 14 - weighted time spent doing I/Os (ms) |
587 | | // I think 1024 is enough for device name |
588 | 758 | int major = 0; |
589 | 758 | int minor = 0; |
590 | 758 | char device[1024]; |
591 | 758 | int64_t values[11]; |
592 | 9.11k | while (getline(&_line_ptr, &_line_buf_size, fp) > 0) { |
593 | 8.36k | memset(values, 0, sizeof(values)); |
594 | 8.36k | int num = sscanf(_line_ptr, |
595 | 8.36k | "%d %d %1023s" |
596 | 8.36k | " %" PRId64 " %" PRId64 " %" PRId64 " %" PRId64 " %" PRId64 " %" PRId64 |
597 | 8.36k | " %" PRId64 " %" PRId64 " %" PRId64 " %" PRId64 " %" PRId64, |
598 | 8.36k | &major, &minor, device, &values[0], &values[1], &values[2], &values[3], |
599 | 8.36k | &values[4], &values[5], &values[6], &values[7], &values[8], &values[9], |
600 | 8.36k | &values[10]); |
601 | 8.36k | if (num < 4) { |
602 | 0 | continue; |
603 | 0 | } |
604 | 8.36k | auto it = _disk_metrics.find(device); |
605 | 8.36k | if (it == _disk_metrics.end()) { |
606 | 7.60k | continue; |
607 | 7.60k | } |
608 | | // update disk metrics |
609 | | // reads_completed: 4 reads completed successfully |
610 | 758 | it->second->disk_reads_completed->set_value(values[0]); |
611 | | // bytes_read: 6 sectors read * 512; 5 reads merged is ignored |
612 | 758 | it->second->disk_bytes_read->set_value(values[2] * 512); |
613 | | // read_time_ms: 7 time spent reading (ms) |
614 | 758 | it->second->disk_read_time_ms->set_value(values[3]); |
615 | | // writes_completed: 8 writes completed |
616 | 758 | it->second->disk_writes_completed->set_value(values[4]); |
617 | | // bytes_written: 10 sectors write * 512; 9 writes merged is ignored |
618 | 758 | it->second->disk_bytes_written->set_value(values[6] * 512); |
619 | | // write_time_ms: 11 time spent writing (ms) |
620 | 758 | it->second->disk_write_time_ms->set_value(values[7]); |
621 | | // io_time_ms: 13 time spent doing I/Os (ms) |
622 | 758 | it->second->disk_io_time_ms->set_value(values[9]); |
623 | | // io_time_weigthed: 14 - weighted time spent doing I/Os (ms) |
624 | 758 | it->second->disk_io_time_weigthed->set_value(values[10]); |
625 | 758 | } |
626 | 758 | if (ferror(fp) != 0) { |
627 | 0 | char buf[64]; |
628 | 0 | LOG(WARNING) << "getline failed, errno=" << errno |
629 | 0 | << ", message=" << strerror_r(errno, buf, 64); |
630 | 0 | } |
631 | 758 | fclose(fp); |
632 | 758 | } |
633 | | |
634 | 12 | void SystemMetrics::_install_net_metrics(const std::vector<std::string>& interfaces) { |
635 | 37 | for (auto& interface : interfaces) { |
636 | 37 | auto interface_entity = _registry->register_entity( |
637 | 37 | std::string("network_metrics.") + interface, {{"device", interface}}); |
638 | 37 | NetworkMetrics* metrics = new NetworkMetrics(interface_entity.get()); |
639 | 37 | _network_metrics.emplace(interface, metrics); |
640 | 37 | } |
641 | 12 | } |
642 | | |
643 | 12 | void SystemMetrics::_install_snmp_metrics(MetricEntity* entity) { |
644 | 12 | _snmp_metrics.reset(new SnmpMetrics(entity)); |
645 | 12 | } |
646 | | |
647 | 765 | void SystemMetrics::_update_net_metrics() { |
648 | | #ifdef BE_TEST |
649 | | // to mock proc |
650 | | FILE* fp = fopen(k_ut_net_dev_path, "r"); |
651 | | #else |
652 | 765 | FILE* fp = fopen("/proc/net/dev", "r"); |
653 | 765 | #endif |
654 | 765 | if (fp == nullptr) { |
655 | 7 | char buf[64]; |
656 | 7 | LOG(WARNING) << "open /proc/net/dev failed, errno=" << errno |
657 | 7 | << ", message=" << strerror_r(errno, buf, 64); |
658 | 7 | return; |
659 | 7 | } |
660 | | |
661 | | // Ignore header |
662 | 758 | if (getline(&_line_ptr, &_line_buf_size, fp) < 0 || |
663 | 758 | getline(&_line_ptr, &_line_buf_size, fp) < 0) { |
664 | 0 | char buf[64]; |
665 | 0 | LOG(WARNING) << "read /proc/net/dev first two line failed, errno=" << errno |
666 | 0 | << ", message=" << strerror_r(errno, buf, 64); |
667 | 0 | fclose(fp); |
668 | 0 | return; |
669 | 0 | } |
670 | 758 | if (_proc_net_dev_version == 0) { |
671 | 8 | if (strstr(_line_ptr, "compressed") != nullptr) { |
672 | 8 | _proc_net_dev_version = 3; |
673 | 8 | } else if (strstr(_line_ptr, "bytes") != nullptr) { |
674 | 0 | _proc_net_dev_version = 2; |
675 | 0 | } else { |
676 | 0 | _proc_net_dev_version = 1; |
677 | 0 | } |
678 | 8 | } |
679 | | |
680 | 19.5k | while (getline(&_line_ptr, &_line_buf_size, fp) > 0) { |
681 | 18.8k | char* ptr = strrchr(_line_ptr, ':'); |
682 | 18.8k | if (ptr == nullptr) { |
683 | 0 | continue; |
684 | 0 | } |
685 | 18.8k | char* start = _line_ptr; |
686 | 23.3k | while (isspace(*start)) { |
687 | 4.55k | start++; |
688 | 4.55k | } |
689 | 18.8k | std::string interface(start, ptr - start); |
690 | 18.8k | auto it = _network_metrics.find(interface); |
691 | 18.8k | if (it == _network_metrics.end()) { |
692 | 15.4k | continue; |
693 | 15.4k | } |
694 | 3.35k | ptr++; |
695 | 3.35k | int64_t receive_bytes = 0; |
696 | 3.35k | int64_t receive_packets = 0; |
697 | 3.35k | int64_t send_bytes = 0; |
698 | 3.35k | int64_t send_packets = 0; |
699 | 3.35k | switch (_proc_net_dev_version) { |
700 | 3.35k | case 3: |
701 | | // receive: bytes packets errs drop fifo frame compressed multicast |
702 | | // send: bytes packets errs drop fifo colls carrier compressed |
703 | 3.35k | sscanf(ptr, |
704 | 3.35k | " %" PRId64 " %" PRId64 |
705 | 3.35k | " %*d %*d %*d %*d %*d %*d" |
706 | 3.35k | " %" PRId64 " %" PRId64 " %*d %*d %*d %*d %*d %*d", |
707 | 3.35k | &receive_bytes, &receive_packets, &send_bytes, &send_packets); |
708 | 3.35k | break; |
709 | 0 | case 2: |
710 | | // receive: bytes packets errs drop fifo frame |
711 | | // send: bytes packets errs drop fifo colls carrier |
712 | 0 | sscanf(ptr, |
713 | 0 | " %" PRId64 " %" PRId64 |
714 | 0 | " %*d %*d %*d %*d" |
715 | 0 | " %" PRId64 " %" PRId64 " %*d %*d %*d %*d %*d", |
716 | 0 | &receive_bytes, &receive_packets, &send_bytes, &send_packets); |
717 | 0 | break; |
718 | 0 | case 1: |
719 | | // receive: packets errs drop fifo frame |
720 | | // send: packets errs drop fifo colls carrier |
721 | 0 | sscanf(ptr, |
722 | 0 | " %" PRId64 |
723 | 0 | " %*d %*d %*d %*d" |
724 | 0 | " %" PRId64 " %*d %*d %*d %*d %*d", |
725 | 0 | &receive_packets, &send_packets); |
726 | 0 | break; |
727 | 0 | default: |
728 | 0 | break; |
729 | 3.35k | } |
730 | 3.35k | it->second->network_receive_bytes->set_value(receive_bytes); |
731 | 3.35k | it->second->network_receive_packets->set_value(receive_packets); |
732 | 3.35k | it->second->network_send_bytes->set_value(send_bytes); |
733 | 3.35k | it->second->network_send_packets->set_value(send_packets); |
734 | 3.35k | } |
735 | 758 | if (ferror(fp) != 0) { |
736 | 0 | char buf[64]; |
737 | 0 | LOG(WARNING) << "getline failed, errno=" << errno |
738 | 0 | << ", message=" << strerror_r(errno, buf, 64); |
739 | 0 | } |
740 | 758 | fclose(fp); |
741 | 758 | } |
742 | | |
743 | 765 | void SystemMetrics::_update_snmp_metrics() { |
744 | | #ifdef BE_TEST |
745 | | // to mock proc |
746 | | FILE* fp = fopen(k_ut_net_snmp_path, "r"); |
747 | | #else |
748 | 765 | FILE* fp = fopen("/proc/net/snmp", "r"); |
749 | 765 | #endif |
750 | 765 | if (fp == nullptr) { |
751 | 7 | char buf[64]; |
752 | 7 | LOG(WARNING) << "open /proc/net/snmp failed, errno=" << errno |
753 | 7 | << ", message=" << strerror_r(errno, buf, 64); |
754 | 7 | return; |
755 | 7 | } |
756 | | |
757 | | // We only care about Tcp lines, so skip other lines in front of Tcp line |
758 | 758 | int64_t res = 0; |
759 | 5.30k | while ((res = getline(&_line_ptr, &_line_buf_size, fp)) > 0) { |
760 | 5.30k | if (strstr(_line_ptr, "Tcp") != nullptr) { |
761 | 758 | break; |
762 | 758 | } |
763 | 5.30k | } |
764 | 758 | if (res <= 0) { |
765 | 0 | char buf[64]; |
766 | 0 | LOG(WARNING) << "failed to skip lines of /proc/net/snmp, errno=" << errno |
767 | 0 | << ", message=" << strerror_r(errno, buf, 64); |
768 | 0 | fclose(fp); |
769 | 0 | return; |
770 | 0 | } |
771 | | |
772 | | // parse the Tcp header |
773 | | // Tcp: RtoAlgorithm RtoMin RtoMax MaxConn ActiveOpens PassiveOpens AttemptFails EstabResets CurrEstab InSegs OutSegs RetransSegs InErrs OutRsts InCsumErrors |
774 | 758 | std::vector<std::string> headers = absl::StrSplit(_line_ptr, " "); |
775 | 758 | std::unordered_map<std::string, int32_t> header_map; |
776 | 758 | int32_t pos = 0; |
777 | 12.1k | for (auto& h : headers) { |
778 | 12.1k | header_map.emplace(h, pos++); |
779 | 12.1k | } |
780 | | |
781 | | // read the metrics of TCP |
782 | 758 | if (getline(&_line_ptr, &_line_buf_size, fp) < 0) { |
783 | 0 | char buf[64]; |
784 | 0 | LOG(WARNING) << "failed to skip Tcp header line of /proc/net/snmp, errno=" << errno |
785 | 0 | << ", message=" << strerror_r(errno, buf, 64); |
786 | 0 | fclose(fp); |
787 | 0 | return; |
788 | 0 | } |
789 | | |
790 | | // metric line looks like: |
791 | | // Tcp: 1 200 120000 -1 47849374 38601877 3353843 2320314 276 1033354613 1166025166 825439 12694 23238924 0 |
792 | 758 | std::vector<std::string> metrics = absl::StrSplit(_line_ptr, " "); |
793 | 758 | if (metrics.size() != headers.size()) { |
794 | 0 | LOG(WARNING) << "invalid tcp metrics line: " << _line_ptr; |
795 | 0 | fclose(fp); |
796 | 0 | return; |
797 | 0 | } |
798 | 758 | int64_t retrans_segs = std::stoll(metrics[header_map["RetransSegs"]]); |
799 | 758 | int64_t in_errs = std::stoll(metrics[header_map["InErrs"]]); |
800 | 758 | int64_t in_segs = std::stoll(metrics[header_map["InSegs"]]); |
801 | 758 | int64_t out_segs = std::stoll(metrics[header_map["OutSegs"]]); |
802 | 758 | _snmp_metrics->snmp_tcp_retrans_segs->set_value(retrans_segs); |
803 | 758 | _snmp_metrics->snmp_tcp_in_errs->set_value(in_errs); |
804 | 758 | _snmp_metrics->snmp_tcp_in_segs->set_value(in_segs); |
805 | 758 | _snmp_metrics->snmp_tcp_out_segs->set_value(out_segs); |
806 | | |
807 | 758 | if (ferror(fp) != 0) { |
808 | 0 | char buf[64]; |
809 | 0 | LOG(WARNING) << "getline failed, errno=" << errno |
810 | 0 | << ", message=" << strerror_r(errno, buf, 64); |
811 | 0 | } |
812 | 758 | fclose(fp); |
813 | 758 | } |
814 | | |
815 | 12 | void SystemMetrics::_install_fd_metrics(MetricEntity* entity) { |
816 | 12 | _fd_metrics.reset(new FileDescriptorMetrics(entity)); |
817 | 12 | } |
818 | | |
819 | 765 | void SystemMetrics::_update_fd_metrics() { |
820 | | #ifdef BE_TEST |
821 | | FILE* fp = fopen(k_ut_fd_path, "r"); |
822 | | #else |
823 | 765 | FILE* fp = fopen("/proc/sys/fs/file-nr", "r"); |
824 | 765 | #endif |
825 | 765 | if (fp == nullptr) { |
826 | 7 | char buf[64]; |
827 | 7 | LOG(WARNING) << "open /proc/sys/fs/file-nr failed, errno=" << errno |
828 | 7 | << ", message=" << strerror_r(errno, buf, 64); |
829 | 7 | return; |
830 | 7 | } |
831 | | |
832 | | // /proc/sys/fs/file-nr: https://www.kernel.org/doc/Documentation/sysctl/fs.txt |
833 | | // 1 - the number of allocated file handles |
834 | | // 2 - the number of allocated but unused file handles |
835 | | // 3 - the maximum number of file handles |
836 | | |
837 | 758 | int64_t values[3]; |
838 | 758 | if (getline(&_line_ptr, &_line_buf_size, fp) > 0) { |
839 | 758 | memset(values, 0, sizeof(values)); |
840 | 758 | int num = sscanf(_line_ptr, "%" PRId64 " %" PRId64 " %" PRId64, &values[0], &values[1], |
841 | 758 | &values[2]); |
842 | 758 | if (num == 3) { |
843 | 758 | _fd_metrics->fd_num_limit->set_value(values[2]); |
844 | 758 | _fd_metrics->fd_num_used->set_value(values[0] - values[1]); |
845 | 758 | } |
846 | 758 | } |
847 | | |
848 | 758 | if (ferror(fp) != 0) { |
849 | 0 | char buf[64]; |
850 | 0 | LOG(WARNING) << "getline failed, errno=" << errno |
851 | 0 | << ", message=" << strerror_r(errno, buf, 64); |
852 | 0 | } |
853 | 758 | fclose(fp); |
854 | 758 | } |
855 | | |
856 | 12 | void SystemMetrics::_install_load_avg_metrics(MetricEntity* entity) { |
857 | 12 | _load_average_metrics.reset(new LoadAverageMetrics(entity)); |
858 | 12 | } |
859 | | |
860 | 765 | void SystemMetrics::_update_load_avg_metrics() { |
861 | | #ifdef BE_TEST |
862 | | FILE* fp = fopen(k_ut_load_avg_path, "r"); |
863 | | #else |
864 | 765 | FILE* fp = fopen("/proc/loadavg", "r"); |
865 | 765 | #endif |
866 | 765 | if (fp == nullptr) { |
867 | 7 | char buf[64]; |
868 | 7 | LOG(WARNING) << "open /proc/loadavg failed, errno=" << errno |
869 | 7 | << ", message=" << strerror_r(errno, buf, 64); |
870 | 7 | return; |
871 | 7 | } |
872 | | |
873 | 758 | double values[3]; |
874 | 758 | if (getline(&_line_ptr, &_line_buf_size, fp) > 0) { |
875 | 758 | memset(values, 0, sizeof(values)); |
876 | 758 | int num = sscanf(_line_ptr, "%lf %lf %lf", &values[0], &values[1], &values[2]); |
877 | 758 | if (num == 3) { |
878 | 758 | _load_average_metrics->load_average_1_minutes->set_value(values[0]); |
879 | 758 | _load_average_metrics->load_average_5_minutes->set_value(values[1]); |
880 | 758 | _load_average_metrics->load_average_15_minutes->set_value(values[2]); |
881 | 758 | } |
882 | 758 | } |
883 | | |
884 | 758 | if (ferror(fp) != 0) { |
885 | 0 | char buf[64]; |
886 | 0 | LOG(WARNING) << "getline failed, errno=" << errno |
887 | 0 | << ", message=" << strerror_r(errno, buf, 64); |
888 | 0 | } |
889 | 758 | fclose(fp); |
890 | 758 | } |
891 | | |
892 | | int64_t SystemMetrics::get_max_io_util(const std::map<std::string, int64_t>& lst_value, |
893 | 1.65k | int64_t interval_sec) { |
894 | 1.65k | int64_t max = 0; |
895 | 1.65k | for (auto& it : _disk_metrics) { |
896 | 1.65k | int64_t cur = it.second->disk_io_time_ms->value(); |
897 | 1.65k | const auto find = lst_value.find(it.first); |
898 | 1.65k | if (find == lst_value.end()) { |
899 | 6 | continue; |
900 | 6 | } |
901 | 1.64k | int64_t incr = cur - find->second; |
902 | 1.64k | if (incr > max) max = incr; |
903 | 1.64k | } |
904 | 1.65k | return max / interval_sec / 10; |
905 | 1.65k | } |
906 | | |
907 | 1.66k | void SystemMetrics::get_disks_io_time(std::map<std::string, int64_t>* map) { |
908 | 1.66k | map->clear(); |
909 | 1.66k | for (auto& it : _disk_metrics) { |
910 | 1.66k | map->emplace(it.first, it.second->disk_io_time_ms->value()); |
911 | 1.66k | } |
912 | 1.66k | } |
913 | | |
914 | 9.57k | double SystemMetrics::get_load_average_1_min() { |
915 | 9.57k | if (_load_average_metrics) { |
916 | 9.57k | return _load_average_metrics->load_average_1_minutes->value(); |
917 | 9.57k | } else { |
918 | 0 | return 0; |
919 | 0 | } |
920 | 9.57k | } |
921 | | |
922 | 4.53k | bool SystemMetrics::get_aggregate_cpu_time(int64_t* total_time, int64_t* idle_time) const { |
923 | 4.53k | DCHECK(total_time != nullptr); |
924 | 4.53k | DCHECK(idle_time != nullptr); |
925 | | |
926 | 4.53k | std::lock_guard<std::mutex> lk(_aggregate_cpu_time_mutex); |
927 | 4.53k | if (!_aggregate_cpu_time.initialized) { |
928 | 0 | return false; |
929 | 0 | } |
930 | | |
931 | 4.53k | *total_time = _aggregate_cpu_time.total_time; |
932 | 4.53k | *idle_time = _aggregate_cpu_time.idle_time; |
933 | 4.53k | return true; |
934 | 4.53k | } |
935 | | |
936 | | void SystemMetrics::get_network_traffic(std::map<std::string, int64_t>* send_map, |
937 | 757 | std::map<std::string, int64_t>* rcv_map) { |
938 | 757 | send_map->clear(); |
939 | 757 | rcv_map->clear(); |
940 | 3.79k | for (auto& it : _network_metrics) { |
941 | 3.79k | if (it.first == "lo") { |
942 | 757 | continue; |
943 | 757 | } |
944 | 3.03k | send_map->emplace(it.first, it.second->network_send_bytes->value()); |
945 | 3.03k | rcv_map->emplace(it.first, it.second->network_receive_bytes->value()); |
946 | 3.03k | } |
947 | 757 | } |
948 | | |
949 | | void SystemMetrics::get_max_net_traffic(const std::map<std::string, int64_t>& lst_send_map, |
950 | | const std::map<std::string, int64_t>& lst_rcv_map, |
951 | | int64_t interval_sec, int64_t* send_rate, |
952 | 750 | int64_t* rcv_rate) { |
953 | 750 | int64_t max_send = 0; |
954 | 750 | int64_t max_rcv = 0; |
955 | 3.76k | for (auto& it : _network_metrics) { |
956 | 3.76k | int64_t cur_send = it.second->network_send_bytes->value(); |
957 | 3.76k | int64_t cur_rcv = it.second->network_receive_bytes->value(); |
958 | | |
959 | 3.76k | const auto find_send = lst_send_map.find(it.first); |
960 | 3.76k | if (find_send != lst_send_map.end()) { |
961 | 3.01k | int64_t incr = cur_send - find_send->second; |
962 | 3.01k | if (incr > max_send) max_send = incr; |
963 | 3.01k | } |
964 | 3.76k | const auto find_rcv = lst_rcv_map.find(it.first); |
965 | 3.76k | if (find_rcv != lst_rcv_map.end()) { |
966 | 3.01k | int64_t incr = cur_rcv - find_rcv->second; |
967 | 3.01k | if (incr > max_rcv) max_rcv = incr; |
968 | 3.01k | } |
969 | 3.76k | } |
970 | | |
971 | 750 | *send_rate = max_send / interval_sec; |
972 | 750 | *rcv_rate = max_rcv / interval_sec; |
973 | 750 | } |
974 | | |
975 | | void SystemMetrics::update_max_disk_io_util_percent(const std::map<std::string, int64_t>& lst_value, |
976 | 750 | int64_t interval_sec) { |
977 | 750 | max_disk_io_util_percent->set_value(get_max_io_util(lst_value, interval_sec)); |
978 | 750 | } |
979 | | |
980 | 750 | void SystemMetrics::update_max_network_send_bytes_rate(int64_t max_send_bytes_rate) { |
981 | 750 | max_network_send_bytes_rate->set_value(max_send_bytes_rate); |
982 | 750 | } |
983 | | |
984 | 750 | void SystemMetrics::update_max_network_receive_bytes_rate(int64_t max_receive_bytes_rate) { |
985 | 750 | max_network_receive_bytes_rate->set_value(max_receive_bytes_rate); |
986 | 750 | } |
987 | | |
988 | 12 | void SystemMetrics::_install_proc_metrics(MetricEntity* entity) { |
989 | 12 | _proc_metrics.reset(new ProcMetrics(entity)); |
990 | 12 | } |
991 | | |
992 | 765 | void SystemMetrics::_update_proc_metrics() { |
993 | | #ifdef BE_TEST |
994 | | FILE* fp = fopen(k_ut_stat_path, "r"); |
995 | | #else |
996 | 765 | FILE* fp = fopen("/proc/stat", "r"); |
997 | 765 | #endif |
998 | 765 | if (fp == nullptr) { |
999 | 0 | char buf[64]; |
1000 | 0 | LOG(WARNING) << "open /proc/stat failed, errno=" << errno |
1001 | 0 | << ", message=" << strerror_r(errno, buf, 64); |
1002 | 0 | return; |
1003 | 0 | } |
1004 | | |
1005 | 765 | uint64_t inter = 0, ctxt = 0, procs_r = 0, procs_b = 0; |
1006 | 22.8k | while (getline(&_line_ptr, &_line_buf_size, fp) > 0) { |
1007 | 22.0k | char* start_pos = nullptr; |
1008 | 22.0k | start_pos = strstr(_line_ptr, "intr "); |
1009 | 22.0k | if (start_pos) { |
1010 | 758 | sscanf(start_pos, "intr %" PRIu64, &inter); |
1011 | 758 | _proc_metrics->proc_interrupt->set_value(inter); |
1012 | 758 | } |
1013 | | |
1014 | 22.0k | start_pos = strstr(_line_ptr, "ctxt "); |
1015 | 22.0k | if (start_pos) { |
1016 | 758 | sscanf(start_pos, "ctxt %" PRIu64, &ctxt); |
1017 | 758 | _proc_metrics->proc_ctxt_switch->set_value(ctxt); |
1018 | 758 | } |
1019 | | |
1020 | 22.0k | start_pos = strstr(_line_ptr, "procs_running "); |
1021 | 22.0k | if (start_pos) { |
1022 | 758 | sscanf(start_pos, "procs_running %" PRIu64, &procs_r); |
1023 | 758 | _proc_metrics->proc_procs_running->set_value(procs_r); |
1024 | 758 | } |
1025 | | |
1026 | 22.0k | start_pos = strstr(_line_ptr, "procs_blocked "); |
1027 | 22.0k | if (start_pos) { |
1028 | 758 | sscanf(start_pos, "procs_blocked %" PRIu64, &procs_b); |
1029 | 758 | _proc_metrics->proc_procs_blocked->set_value(procs_b); |
1030 | 758 | } |
1031 | 22.0k | } |
1032 | | |
1033 | 765 | if (ferror(fp) != 0) { |
1034 | 0 | char buf[64]; |
1035 | 0 | LOG(WARNING) << "getline failed, errno=" << errno |
1036 | 0 | << ", message=" << strerror_r(errno, buf, 64); |
1037 | 0 | } |
1038 | | |
1039 | 765 | fclose(fp); |
1040 | 765 | } |
1041 | | |
1042 | 750 | void SystemMetrics::update_be_avail_cpu_num() { |
1043 | 750 | int64_t physical_cpu_num = _cpu_num_metrics->host_cpu_num->value(); |
1044 | 750 | if (physical_cpu_num > 0) { |
1045 | 750 | physical_cpu_num = |
1046 | 750 | CGroupUtil::get_cgroup_limited_cpu_number(cast_set<int32_t>(physical_cpu_num)); |
1047 | 750 | _cpu_num_metrics->avail_cpu_num->set_value(physical_cpu_num); |
1048 | 750 | } |
1049 | 750 | } |
1050 | | |
1051 | 765 | void SystemMetrics::get_metrics_from_proc_vmstat() { |
1052 | | #ifdef BE_TEST |
1053 | | FILE* fp = fopen(k_ut_vmstat_path, "r"); |
1054 | | #else |
1055 | 765 | FILE* fp = fopen("/proc/vmstat", "r"); |
1056 | 765 | #endif |
1057 | 765 | if (fp == nullptr) { |
1058 | 7 | char buf[64]; |
1059 | 7 | LOG(WARNING) << "open /proc/vmstat failed, errno=" << errno |
1060 | 7 | << ", message=" << strerror_r(errno, buf, 64); |
1061 | 7 | return; |
1062 | 7 | } |
1063 | | |
1064 | 105k | while (getline(&_line_ptr, &_line_buf_size, fp) > 0) { |
1065 | 104k | uint64_t value; |
1066 | 104k | char name[64]; |
1067 | 104k | int num = sscanf(_line_ptr, "%s %" PRIu64, name, &value); |
1068 | 104k | if (num < 2) { |
1069 | 0 | continue; |
1070 | 0 | } |
1071 | | |
1072 | 104k | if (strcmp(name, "pgpgin") == 0) { |
1073 | 758 | _memory_metrics->memory_pgpgin->set_value(value); |
1074 | 103k | } else if (strcmp(name, "pgpgout") == 0) { |
1075 | 758 | _memory_metrics->memory_pgpgout->set_value(value); |
1076 | 102k | } else if (strcmp(name, "pswpin") == 0) { |
1077 | 758 | _memory_metrics->memory_pswpin->set_value(value); |
1078 | 102k | } else if (strcmp(name, "pswpout") == 0) { |
1079 | 758 | _memory_metrics->memory_pswpout->set_value(value); |
1080 | 758 | } |
1081 | 104k | } |
1082 | | |
1083 | 758 | if (ferror(fp) != 0) { |
1084 | 0 | char buf[64]; |
1085 | 0 | LOG(WARNING) << "getline failed, errno=" << errno |
1086 | 0 | << ", message=" << strerror_r(errno, buf, 64); |
1087 | 0 | } |
1088 | | |
1089 | 758 | fclose(fp); |
1090 | 758 | } |
1091 | | |
1092 | 12 | void SystemMetrics::get_cpu_name() { |
1093 | | #ifdef BE_TEST |
1094 | | FILE* fp = fopen(k_ut_stat_path, "r"); |
1095 | | #else |
1096 | 12 | FILE* fp = fopen("/proc/stat", "r"); |
1097 | 12 | #endif |
1098 | 12 | if (fp == nullptr) { |
1099 | 1 | char buf[64]; |
1100 | 1 | LOG(WARNING) << "open /proc/stat failed, errno=" << errno |
1101 | 1 | << ", message=" << strerror_r(errno, buf, 64); |
1102 | 1 | return; |
1103 | 1 | } |
1104 | | |
1105 | 254 | while (getline(&_line_ptr, &_line_buf_size, fp) > 0) { |
1106 | 243 | char cpu[16]; |
1107 | 243 | char* start_pos = nullptr; |
1108 | 243 | start_pos = strstr(_line_ptr, "cpu"); |
1109 | 243 | if (start_pos) { |
1110 | 187 | sscanf(_line_ptr, "%15s", cpu); |
1111 | 187 | std::string cpu_name(cpu); |
1112 | 187 | _cpu_names.push_back(cpu_name); |
1113 | 187 | } |
1114 | 243 | } |
1115 | | |
1116 | 11 | if (ferror(fp) != 0) { |
1117 | 0 | char buf[64]; |
1118 | 0 | LOG(WARNING) << "getline failed, errno=" << errno |
1119 | 0 | << ", message=" << strerror_r(errno, buf, 64); |
1120 | 0 | } |
1121 | | |
1122 | 11 | fclose(fp); |
1123 | 11 | } |
1124 | | |
1125 | | } // namespace doris |