Coverage Report

Created: 2026-05-09 10:59

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
be/src/common/metrics/system_metrics.cpp
Line
Count
Source
1
// Licensed to the Apache Software Foundation (ASF) under one
2
// or more contributor license agreements.  See the NOTICE file
3
// distributed with this work for additional information
4
// regarding copyright ownership.  The ASF licenses this file
5
// to you under the Apache License, Version 2.0 (the
6
// "License"); you may not use this file except in compliance
7
// with the License.  You may obtain a copy of the License at
8
//
9
//   http://www.apache.org/licenses/LICENSE-2.0
10
//
11
// Unless required by applicable law or agreed to in writing,
12
// software distributed under the License is distributed on an
13
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14
// KIND, either express or implied.  See the License for the
15
// specific language governing permissions and limitations
16
// under the License.
17
18
#include "common/metrics/system_metrics.h"
19
20
#include <absl/strings/str_split.h>
21
#include <glog/logging.h>
22
23
#include <functional>
24
#include <ostream>
25
#include <unordered_map>
26
#include <utility>
27
28
#include "common/cast_set.h"
29
#include "common/config.h"
30
#include "runtime/memory/jemalloc_control.h"
31
#include "util/cgroup_util.h"
32
#include "util/perf_counters.h"
33
34
namespace doris {
35
DEFINE_COUNTER_METRIC_PROTOTYPE_2ARG(avail_cpu_num, MetricUnit::NOUNIT);
36
37
DEFINE_COUNTER_METRIC_PROTOTYPE_2ARG(host_cpu_num, MetricUnit::NOUNIT);
38
struct CpuNumberMetrics {
39
12
    CpuNumberMetrics(MetricEntity* ent) : entity(ent) {
40
12
        INT_COUNTER_METRIC_REGISTER(entity, host_cpu_num);
41
12
        INT_COUNTER_METRIC_REGISTER(entity, avail_cpu_num);
42
12
    }
43
44
    IntCounter* host_cpu_num {nullptr};
45
    IntCounter* avail_cpu_num {nullptr};
46
    MetricEntity* entity = nullptr;
47
};
48
49
#define DEFINE_CPU_COUNTER_METRIC(metric)                                            \
50
    DEFINE_COUNTER_METRIC_PROTOTYPE_5ARG(cpu_##metric, MetricUnit::PERCENT, "", cpu, \
51
                                         Labels({{"mode", #metric}}));
52
DEFINE_CPU_COUNTER_METRIC(user);
53
DEFINE_CPU_COUNTER_METRIC(nice);
54
DEFINE_CPU_COUNTER_METRIC(system);
55
DEFINE_CPU_COUNTER_METRIC(idle);
56
DEFINE_CPU_COUNTER_METRIC(iowait);
57
DEFINE_CPU_COUNTER_METRIC(irq);
58
DEFINE_CPU_COUNTER_METRIC(soft_irq);
59
DEFINE_CPU_COUNTER_METRIC(steal);
60
DEFINE_CPU_COUNTER_METRIC(guest);
61
DEFINE_CPU_COUNTER_METRIC(guest_nice);
62
63
// /proc/stat: http://www.linuxhowtos.org/System/procstat.htm
64
struct CpuMetrics {
65
187
    CpuMetrics(MetricEntity* ent) : entity(ent) {
66
187
        INT_COUNTER_METRIC_REGISTER(entity, cpu_user);
67
187
        INT_COUNTER_METRIC_REGISTER(entity, cpu_nice);
68
187
        INT_COUNTER_METRIC_REGISTER(entity, cpu_system);
69
187
        INT_COUNTER_METRIC_REGISTER(entity, cpu_idle);
70
187
        INT_COUNTER_METRIC_REGISTER(entity, cpu_iowait);
71
187
        INT_COUNTER_METRIC_REGISTER(entity, cpu_irq);
72
187
        INT_COUNTER_METRIC_REGISTER(entity, cpu_soft_irq);
73
187
        INT_COUNTER_METRIC_REGISTER(entity, cpu_steal);
74
187
        INT_COUNTER_METRIC_REGISTER(entity, cpu_guest);
75
187
        INT_COUNTER_METRIC_REGISTER(entity, cpu_guest_nice);
76
77
187
        metrics[0] = cpu_user;
78
187
        metrics[1] = cpu_nice;
79
187
        metrics[2] = cpu_system;
80
187
        metrics[3] = cpu_idle;
81
187
        metrics[4] = cpu_iowait;
82
187
        metrics[5] = cpu_irq;
83
187
        metrics[6] = cpu_soft_irq;
84
187
        metrics[7] = cpu_steal;
85
187
        metrics[8] = cpu_guest;
86
187
        metrics[9] = cpu_guest_nice;
87
187
    }
88
89
    static constexpr int cpu_num_metrics = 10;
90
91
    MetricEntity* entity = nullptr;
92
    IntCounter* cpu_user;
93
    IntCounter* cpu_nice;
94
    IntCounter* cpu_system;
95
    IntCounter* cpu_idle;
96
    IntCounter* cpu_iowait;
97
    IntCounter* cpu_irq;
98
    IntCounter* cpu_soft_irq;
99
    IntCounter* cpu_steal;
100
    IntCounter* cpu_guest;
101
    IntCounter* cpu_guest_nice;
102
103
    IntCounter* metrics[cpu_num_metrics];
104
};
105
106
#define DEFINE_MEMORY_GAUGE_METRIC(metric, unit) \
107
    DEFINE_GAUGE_METRIC_PROTOTYPE_2ARG(memory_##metric, unit);
108
DEFINE_MEMORY_GAUGE_METRIC(allocated_bytes, MetricUnit::BYTES);
109
DEFINE_MEMORY_GAUGE_METRIC(pgpgin, MetricUnit::NOUNIT);
110
DEFINE_MEMORY_GAUGE_METRIC(pgpgout, MetricUnit::NOUNIT);
111
DEFINE_MEMORY_GAUGE_METRIC(pswpin, MetricUnit::NOUNIT);
112
DEFINE_MEMORY_GAUGE_METRIC(pswpout, MetricUnit::NOUNIT);
113
#ifndef USE_JEMALLOC
114
DEFINE_MEMORY_GAUGE_METRIC(tcmalloc_allocated_bytes, MetricUnit::BYTES);
115
DEFINE_MEMORY_GAUGE_METRIC(tcmalloc_total_thread_cache_bytes, MetricUnit::BYTES);
116
DEFINE_MEMORY_GAUGE_METRIC(tcmalloc_central_cache_free_bytes, MetricUnit::BYTES);
117
DEFINE_MEMORY_GAUGE_METRIC(tcmalloc_transfer_cache_free_bytes, MetricUnit::BYTES);
118
DEFINE_MEMORY_GAUGE_METRIC(tcmalloc_thread_cache_free_bytes, MetricUnit::BYTES);
119
DEFINE_MEMORY_GAUGE_METRIC(tcmalloc_pageheap_free_bytes, MetricUnit::BYTES);
120
DEFINE_MEMORY_GAUGE_METRIC(tcmalloc_pageheap_unmapped_bytes, MetricUnit::BYTES);
121
#else
122
DEFINE_MEMORY_GAUGE_METRIC(jemalloc_allocated_bytes, MetricUnit::BYTES);
123
DEFINE_MEMORY_GAUGE_METRIC(jemalloc_active_bytes, MetricUnit::BYTES);
124
DEFINE_MEMORY_GAUGE_METRIC(jemalloc_metadata_bytes, MetricUnit::BYTES);
125
DEFINE_MEMORY_GAUGE_METRIC(jemalloc_resident_bytes, MetricUnit::BYTES);
126
DEFINE_MEMORY_GAUGE_METRIC(jemalloc_mapped_bytes, MetricUnit::BYTES);
127
DEFINE_MEMORY_GAUGE_METRIC(jemalloc_retained_bytes, MetricUnit::BYTES);
128
DEFINE_MEMORY_GAUGE_METRIC(jemalloc_tcache_bytes, MetricUnit::BYTES);
129
DEFINE_MEMORY_GAUGE_METRIC(jemalloc_pactive_num, MetricUnit::NOUNIT);
130
DEFINE_MEMORY_GAUGE_METRIC(jemalloc_pdirty_num, MetricUnit::NOUNIT);
131
DEFINE_MEMORY_GAUGE_METRIC(jemalloc_pmuzzy_num, MetricUnit::NOUNIT);
132
DEFINE_MEMORY_GAUGE_METRIC(jemalloc_dirty_purged_num, MetricUnit::NOUNIT);
133
DEFINE_MEMORY_GAUGE_METRIC(jemalloc_muzzy_purged_num, MetricUnit::NOUNIT);
134
#endif
135
136
struct MemoryMetrics {
137
12
    MemoryMetrics(MetricEntity* ent) : entity(ent) {
138
12
        INT_GAUGE_METRIC_REGISTER(entity, memory_allocated_bytes);
139
12
        INT_GAUGE_METRIC_REGISTER(entity, memory_pgpgin);
140
12
        INT_GAUGE_METRIC_REGISTER(entity, memory_pgpgout);
141
12
        INT_GAUGE_METRIC_REGISTER(entity, memory_pswpin);
142
12
        INT_GAUGE_METRIC_REGISTER(entity, memory_pswpout);
143
144
12
#ifndef USE_JEMALLOC
145
12
        INT_GAUGE_METRIC_REGISTER(entity, memory_tcmalloc_allocated_bytes);
146
12
        INT_GAUGE_METRIC_REGISTER(entity, memory_tcmalloc_total_thread_cache_bytes);
147
12
        INT_GAUGE_METRIC_REGISTER(entity, memory_tcmalloc_central_cache_free_bytes);
148
12
        INT_GAUGE_METRIC_REGISTER(entity, memory_tcmalloc_transfer_cache_free_bytes);
149
12
        INT_GAUGE_METRIC_REGISTER(entity, memory_tcmalloc_thread_cache_free_bytes);
150
12
        INT_GAUGE_METRIC_REGISTER(entity, memory_tcmalloc_pageheap_free_bytes);
151
12
        INT_GAUGE_METRIC_REGISTER(entity, memory_tcmalloc_pageheap_unmapped_bytes);
152
#else
153
        INT_GAUGE_METRIC_REGISTER(entity, memory_jemalloc_allocated_bytes);
154
        INT_GAUGE_METRIC_REGISTER(entity, memory_jemalloc_active_bytes);
155
        INT_GAUGE_METRIC_REGISTER(entity, memory_jemalloc_metadata_bytes);
156
        INT_GAUGE_METRIC_REGISTER(entity, memory_jemalloc_resident_bytes);
157
        INT_GAUGE_METRIC_REGISTER(entity, memory_jemalloc_mapped_bytes);
158
        INT_GAUGE_METRIC_REGISTER(entity, memory_jemalloc_retained_bytes);
159
        INT_GAUGE_METRIC_REGISTER(entity, memory_jemalloc_tcache_bytes);
160
        INT_GAUGE_METRIC_REGISTER(entity, memory_jemalloc_pactive_num);
161
        INT_GAUGE_METRIC_REGISTER(entity, memory_jemalloc_pdirty_num);
162
        INT_GAUGE_METRIC_REGISTER(entity, memory_jemalloc_pmuzzy_num);
163
        INT_GAUGE_METRIC_REGISTER(entity, memory_jemalloc_dirty_purged_num);
164
        INT_GAUGE_METRIC_REGISTER(entity, memory_jemalloc_muzzy_purged_num);
165
#endif
166
12
    }
167
168
    MetricEntity* entity = nullptr;
169
    IntGauge* memory_allocated_bytes;
170
    IntGauge* memory_pgpgin;
171
    IntGauge* memory_pgpgout;
172
    IntGauge* memory_pswpin;
173
    IntGauge* memory_pswpout;
174
175
#ifndef USE_JEMALLOC
176
    IntGauge* memory_tcmalloc_allocated_bytes;
177
    IntGauge* memory_tcmalloc_total_thread_cache_bytes;
178
    IntGauge* memory_tcmalloc_central_cache_free_bytes;
179
    IntGauge* memory_tcmalloc_transfer_cache_free_bytes;
180
    IntGauge* memory_tcmalloc_thread_cache_free_bytes;
181
    IntGauge* memory_tcmalloc_pageheap_free_bytes;
182
    IntGauge* memory_tcmalloc_pageheap_unmapped_bytes;
183
#else
184
    IntGauge* memory_jemalloc_allocated_bytes;
185
    IntGauge* memory_jemalloc_active_bytes;
186
    IntGauge* memory_jemalloc_metadata_bytes;
187
    IntGauge* memory_jemalloc_resident_bytes;
188
    IntGauge* memory_jemalloc_mapped_bytes;
189
    IntGauge* memory_jemalloc_retained_bytes;
190
    IntGauge* memory_jemalloc_tcache_bytes;
191
    IntGauge* memory_jemalloc_pactive_num;
192
    IntGauge* memory_jemalloc_pdirty_num;
193
    IntGauge* memory_jemalloc_pmuzzy_num;
194
    IntGauge* memory_jemalloc_dirty_purged_num;
195
    IntGauge* memory_jemalloc_muzzy_purged_num;
196
#endif
197
};
198
199
#define DEFINE_DISK_COUNTER_METRIC(metric, unit) \
200
    DEFINE_COUNTER_METRIC_PROTOTYPE_2ARG(disk_##metric, unit);
201
DEFINE_DISK_COUNTER_METRIC(reads_completed, MetricUnit::OPERATIONS);
202
DEFINE_DISK_COUNTER_METRIC(bytes_read, MetricUnit::BYTES);
203
DEFINE_DISK_COUNTER_METRIC(read_time_ms, MetricUnit::MILLISECONDS);
204
DEFINE_DISK_COUNTER_METRIC(writes_completed, MetricUnit::OPERATIONS);
205
DEFINE_DISK_COUNTER_METRIC(bytes_written, MetricUnit::BYTES);
206
DEFINE_DISK_COUNTER_METRIC(write_time_ms, MetricUnit::MILLISECONDS);
207
DEFINE_DISK_COUNTER_METRIC(io_time_ms, MetricUnit::MILLISECONDS);
208
DEFINE_DISK_COUNTER_METRIC(io_time_weigthed, MetricUnit::MILLISECONDS);
209
210
struct DiskMetrics {
211
9
    DiskMetrics(MetricEntity* ent) : entity(ent) {
212
9
        INT_COUNTER_METRIC_REGISTER(entity, disk_reads_completed);
213
9
        INT_COUNTER_METRIC_REGISTER(entity, disk_bytes_read);
214
9
        INT_COUNTER_METRIC_REGISTER(entity, disk_read_time_ms);
215
9
        INT_COUNTER_METRIC_REGISTER(entity, disk_writes_completed);
216
9
        INT_COUNTER_METRIC_REGISTER(entity, disk_bytes_written);
217
9
        INT_COUNTER_METRIC_REGISTER(entity, disk_write_time_ms);
218
9
        INT_COUNTER_METRIC_REGISTER(entity, disk_io_time_ms);
219
9
        INT_COUNTER_METRIC_REGISTER(entity, disk_io_time_weigthed);
220
9
    }
221
222
    MetricEntity* entity = nullptr;
223
    IntCounter* disk_reads_completed;
224
    IntCounter* disk_bytes_read;
225
    IntCounter* disk_read_time_ms;
226
    IntCounter* disk_writes_completed;
227
    IntCounter* disk_bytes_written;
228
    IntCounter* disk_write_time_ms;
229
    IntCounter* disk_io_time_ms;
230
    IntCounter* disk_io_time_weigthed;
231
};
232
233
#define DEFINE_NETWORK_COUNTER_METRIC(metric, unit) \
234
    DEFINE_COUNTER_METRIC_PROTOTYPE_2ARG(network_##metric, unit);
235
DEFINE_NETWORK_COUNTER_METRIC(receive_bytes, MetricUnit::BYTES);
236
DEFINE_NETWORK_COUNTER_METRIC(receive_packets, MetricUnit::PACKETS);
237
DEFINE_NETWORK_COUNTER_METRIC(send_bytes, MetricUnit::BYTES);
238
DEFINE_NETWORK_COUNTER_METRIC(send_packets, MetricUnit::PACKETS);
239
240
struct NetworkMetrics {
241
37
    NetworkMetrics(MetricEntity* ent) : entity(ent) {
242
37
        INT_COUNTER_METRIC_REGISTER(entity, network_receive_bytes);
243
37
        INT_COUNTER_METRIC_REGISTER(entity, network_receive_packets);
244
37
        INT_COUNTER_METRIC_REGISTER(entity, network_send_bytes);
245
37
        INT_COUNTER_METRIC_REGISTER(entity, network_send_packets);
246
37
    }
247
248
    MetricEntity* entity = nullptr;
249
    IntCounter* network_receive_bytes;
250
    IntCounter* network_receive_packets;
251
    IntCounter* network_send_bytes;
252
    IntCounter* network_send_packets;
253
};
254
255
#define DEFINE_SNMP_COUNTER_METRIC(metric, unit, desc) \
256
    DEFINE_COUNTER_METRIC_PROTOTYPE_3ARG(snmp_##metric, unit, desc);
257
DEFINE_SNMP_COUNTER_METRIC(tcp_in_errs, MetricUnit::NOUNIT,
258
                           "The number of all problematic TCP packets received");
259
DEFINE_SNMP_COUNTER_METRIC(tcp_retrans_segs, MetricUnit::NOUNIT, "All TCP packets retransmitted");
260
DEFINE_SNMP_COUNTER_METRIC(tcp_in_segs, MetricUnit::NOUNIT, "All received TCP packets");
261
DEFINE_SNMP_COUNTER_METRIC(tcp_out_segs, MetricUnit::NOUNIT, "All send TCP packets with RST mark");
262
263
// metrics read from /proc/net/snmp
264
struct SnmpMetrics {
265
12
    SnmpMetrics(MetricEntity* ent) : entity(ent) {
266
12
        INT_COUNTER_METRIC_REGISTER(entity, snmp_tcp_in_errs);
267
12
        INT_COUNTER_METRIC_REGISTER(entity, snmp_tcp_retrans_segs);
268
12
        INT_COUNTER_METRIC_REGISTER(entity, snmp_tcp_in_segs);
269
12
        INT_COUNTER_METRIC_REGISTER(entity, snmp_tcp_out_segs);
270
12
    }
271
272
    MetricEntity* entity = nullptr;
273
    IntCounter* snmp_tcp_in_errs;
274
    IntCounter* snmp_tcp_retrans_segs;
275
    IntCounter* snmp_tcp_in_segs;
276
    IntCounter* snmp_tcp_out_segs;
277
};
278
279
#define DEFINE_FD_COUNTER_METRIC(metric, unit) \
280
    DEFINE_GAUGE_METRIC_PROTOTYPE_2ARG(fd_##metric, unit);
281
DEFINE_FD_COUNTER_METRIC(num_limit, MetricUnit::NOUNIT);
282
DEFINE_FD_COUNTER_METRIC(num_used, MetricUnit::NOUNIT);
283
284
struct FileDescriptorMetrics {
285
12
    FileDescriptorMetrics(MetricEntity* ent) : entity(ent) {
286
12
        INT_GAUGE_METRIC_REGISTER(entity, fd_num_limit);
287
12
        INT_GAUGE_METRIC_REGISTER(entity, fd_num_used);
288
12
    }
289
290
    MetricEntity* entity = nullptr;
291
    IntGauge* fd_num_limit;
292
    IntGauge* fd_num_used;
293
};
294
295
#define DEFINE_LOAD_AVERAGE_DOUBLE_METRIC(metric)                                     \
296
    DEFINE_GAUGE_METRIC_PROTOTYPE_5ARG(load_average_##metric, MetricUnit::NOUNIT, "", \
297
                                       load_average, Labels({{"mode", #metric}}));
298
DEFINE_LOAD_AVERAGE_DOUBLE_METRIC(1_minutes);
299
DEFINE_LOAD_AVERAGE_DOUBLE_METRIC(5_minutes);
300
DEFINE_LOAD_AVERAGE_DOUBLE_METRIC(15_minutes);
301
302
struct LoadAverageMetrics {
303
12
    LoadAverageMetrics(MetricEntity* ent) : entity(ent) {
304
12
        DOUBLE_GAUGE_METRIC_REGISTER(entity, load_average_1_minutes);
305
12
        DOUBLE_GAUGE_METRIC_REGISTER(entity, load_average_5_minutes);
306
12
        DOUBLE_GAUGE_METRIC_REGISTER(entity, load_average_15_minutes);
307
12
    }
308
309
    MetricEntity* entity = nullptr;
310
    DoubleGauge* load_average_1_minutes;
311
    DoubleGauge* load_average_5_minutes;
312
    DoubleGauge* load_average_15_minutes;
313
};
314
315
#define DEFINE_PROC_STAT_COUNTER_METRIC(metric)                                       \
316
    DEFINE_COUNTER_METRIC_PROTOTYPE_5ARG(proc_##metric, MetricUnit::NOUNIT, "", proc, \
317
                                         Labels({{"mode", #metric}}));
318
DEFINE_PROC_STAT_COUNTER_METRIC(interrupt);
319
DEFINE_PROC_STAT_COUNTER_METRIC(ctxt_switch);
320
DEFINE_PROC_STAT_COUNTER_METRIC(procs_running);
321
DEFINE_PROC_STAT_COUNTER_METRIC(procs_blocked);
322
323
struct ProcMetrics {
324
12
    ProcMetrics(MetricEntity* ent) : entity(ent) {
325
12
        INT_COUNTER_METRIC_REGISTER(entity, proc_interrupt);
326
12
        INT_COUNTER_METRIC_REGISTER(entity, proc_ctxt_switch);
327
12
        INT_COUNTER_METRIC_REGISTER(entity, proc_procs_running);
328
12
        INT_COUNTER_METRIC_REGISTER(entity, proc_procs_blocked);
329
12
    }
330
331
    MetricEntity* entity = nullptr;
332
333
    IntCounter* proc_interrupt;
334
    IntCounter* proc_ctxt_switch;
335
    IntCounter* proc_procs_running;
336
    IntCounter* proc_procs_blocked;
337
};
338
339
DEFINE_GAUGE_CORE_METRIC_PROTOTYPE_2ARG(max_disk_io_util_percent, MetricUnit::PERCENT);
340
DEFINE_GAUGE_CORE_METRIC_PROTOTYPE_2ARG(max_network_send_bytes_rate, MetricUnit::BYTES);
341
DEFINE_GAUGE_CORE_METRIC_PROTOTYPE_2ARG(max_network_receive_bytes_rate, MetricUnit::BYTES);
342
343
const char* SystemMetrics::_s_hook_name = "system_metrics";
344
345
SystemMetrics::SystemMetrics(MetricRegistry* registry, const std::set<std::string>& disk_devices,
346
12
                             const std::vector<std::string>& network_interfaces) {
347
12
    DCHECK(registry != nullptr);
348
12
    _registry = registry;
349
12
    _server_entity = _registry->register_entity("server");
350
12
    DCHECK(_server_entity != nullptr);
351
12
    _server_entity->register_hook(_s_hook_name, std::bind(&SystemMetrics::update, this));
352
12
    _install_cpu_metrics();
353
12
    _install_memory_metrics(_server_entity.get());
354
12
    _install_disk_metrics(disk_devices);
355
12
    _install_net_metrics(network_interfaces);
356
12
    _install_fd_metrics(_server_entity.get());
357
12
    _install_snmp_metrics(_server_entity.get());
358
12
    _install_load_avg_metrics(_server_entity.get());
359
12
    _install_proc_metrics(_server_entity.get());
360
361
12
    INT_GAUGE_METRIC_REGISTER(_server_entity.get(), max_disk_io_util_percent);
362
12
    INT_GAUGE_METRIC_REGISTER(_server_entity.get(), max_network_send_bytes_rate);
363
12
    INT_GAUGE_METRIC_REGISTER(_server_entity.get(), max_network_receive_bytes_rate);
364
12
}
365
366
8
// Removes the update hook, frees the per-CPU, per-disk and per-interface
// holders allocated with `new` by the install functions, and releases the
// shared getline() buffer.
SystemMetrics::~SystemMetrics() {
    DCHECK(_server_entity != nullptr);
    // Deregister first so the hook is gone before the holders are freed.
    _server_entity->deregister_hook(_s_hook_name);

    for (auto& cpu_entry : _cpu_metrics) {
        delete cpu_entry.second;
    }
    for (auto& disk_entry : _disk_metrics) {
        delete disk_entry.second;
    }
    for (auto& net_entry : _network_metrics) {
        delete net_entry.second;
    }

    // free() accepts a null pointer, so no guard is needed.
    free(_line_ptr);
}
383
384
765
void SystemMetrics::update() {
385
765
    _update_cpu_metrics();
386
765
    _update_memory_metrics();
387
765
    _update_disk_metrics();
388
765
    _update_net_metrics();
389
765
    _update_fd_metrics();
390
765
    _update_snmp_metrics();
391
765
    _update_load_avg_metrics();
392
765
    _update_proc_metrics();
393
765
}
394
395
12
void SystemMetrics::_install_cpu_metrics() {
396
12
    get_cpu_name();
397
398
12
    int cpu_num = 0;
399
187
    for (auto cpu_name : _cpu_names) {
400
        // NOTE: cpu_name comes from /proc/stat which named 'cpu' is not a real cpu name, it should be skipped.
401
187
        if (cpu_name != "cpu") {
402
176
            cpu_num++;
403
176
        }
404
187
        auto cpu_entity = _registry->register_entity(cpu_name, {{"device", cpu_name}});
405
187
        CpuMetrics* metrics = new CpuMetrics(cpu_entity.get());
406
187
        _cpu_metrics.emplace(cpu_name, metrics);
407
187
    }
408
409
12
    auto cpu_num_entity = _registry->register_entity("doris_be_host_cpu_num");
410
12
    _cpu_num_metrics = std::make_unique<CpuNumberMetrics>(cpu_num_entity.get());
411
412
12
    _cpu_num_metrics->host_cpu_num->set_value(cpu_num);
413
12
}
414
415
#ifdef BE_TEST
// Paths that the BE_TEST builds of the _update_* functions open in place of
// the real /proc files. They start out null (as the original uninitialized
// globals did), so a test must assign them before update() runs.
const char* k_ut_stat_path = nullptr;
const char* k_ut_diskstats_path = nullptr;
const char* k_ut_net_dev_path = nullptr;
const char* k_ut_fd_path = nullptr;
const char* k_ut_net_snmp_path = nullptr;
const char* k_ut_load_avg_path = nullptr;
const char* k_ut_vmstat_path = nullptr;
#endif
424
425
765
void SystemMetrics::_update_cpu_metrics() {
426
#ifdef BE_TEST
427
    FILE* fp = fopen(k_ut_stat_path, "r");
428
#else
429
765
    FILE* fp = fopen("/proc/stat", "r");
430
765
#endif
431
765
    if (fp == nullptr) {
432
0
        char buf[64];
433
0
        LOG(WARNING) << "open /proc/stat failed, errno=" << errno
434
0
                     << ", message=" << strerror_r(errno, buf, 64);
435
0
        if (errno == 24) {
436
0
            _file_handle_deplenish_counter++;
437
0
        } else {
438
0
            _file_handle_deplenish_counter = 0;
439
0
        }
440
        // Threshold of the number of consecutive failures
441
0
        if (_file_handle_deplenish_counter >= config::file_handles_deplenish_frequency_times) {
442
0
            LOG(FATAL) << "The system file handles are insufficient, causing service exceptions"
443
0
                       << ", BE will exit. please check the configs 'soft nofile'"
444
0
                       << " and 'hard nofile' of /etc/security/limits.conf ";
445
0
            exit(-1);
446
0
        }
447
0
        return;
448
0
    }
449
450
22.8k
    while (getline(&_line_ptr, &_line_buf_size, fp) > 0) {
451
22.0k
        char cpu[16];
452
22.0k
        int64_t values[CpuMetrics::cpu_num_metrics];
453
22.0k
        memset(values, 0, sizeof(values));
454
22.0k
        int num = sscanf(_line_ptr,
455
22.0k
                         "%15s"
456
22.0k
                         " %" PRId64 " %" PRId64 " %" PRId64 " %" PRId64 " %" PRId64 " %" PRId64
457
22.0k
                         " %" PRId64 " %" PRId64 " %" PRId64 " %" PRId64,
458
22.0k
                         cpu, &values[0], &values[1], &values[2], &values[3], &values[4],
459
22.0k
                         &values[5], &values[6], &values[7], &values[8], &values[9]);
460
22.0k
        if (num < 4) {
461
3.79k
            continue;
462
3.79k
        }
463
464
18.2k
        std::string cpu_name(cpu);
465
18.2k
        auto it = _cpu_metrics.find(cpu_name);
466
18.2k
        if (it == _cpu_metrics.end()) {
467
1.51k
            continue;
468
1.51k
        }
469
470
16.7k
        if (cpu_name == "cpu") {
471
765
            AggregateCpuTime aggregate_cpu_time;
472
765
            aggregate_cpu_time.total_time = values[0] + values[1] + values[2] + values[3] +
473
765
                                            values[4] + values[5] + values[6] + values[7];
474
765
            aggregate_cpu_time.idle_time = values[3] + values[4];
475
765
            aggregate_cpu_time.initialized = aggregate_cpu_time.total_time > 0;
476
            // Publish a consistent aggregate snapshot derived from one /proc/stat row.
477
765
            std::lock_guard<std::mutex> lk(_aggregate_cpu_time_mutex);
478
765
            _aggregate_cpu_time = aggregate_cpu_time;
479
765
        }
480
481
184k
        for (int i = 0; i < CpuMetrics::cpu_num_metrics; ++i) {
482
167k
            it->second->metrics[i]->set_value(values[i]);
483
167k
        }
484
16.7k
    }
485
486
765
    if (ferror(fp) != 0) {
487
0
        char buf[64];
488
0
        LOG(WARNING) << "getline failed, errno=" << errno
489
0
                     << ", message=" << strerror_r(errno, buf, 64);
490
0
    }
491
492
765
    fclose(fp);
493
765
}
494
495
12
// Creates the MemoryMetrics holder, registering its gauges on `entity`.
void SystemMetrics::_install_memory_metrics(MetricEntity* entity) {
    _memory_metrics = std::make_unique<MemoryMetrics>(entity);
}
498
499
765
void SystemMetrics::_update_memory_metrics() {
500
765
    _memory_metrics->memory_allocated_bytes->set_value(PerfCounters::get_vm_rss());
501
765
    get_metrics_from_proc_vmstat();
502
765
}
503
504
0
void SystemMetrics::update_allocator_metrics() {
505
0
#if defined(ADDRESS_SANITIZER) || defined(LEAK_SANITIZER) || defined(THREAD_SANITIZER)
506
0
    LOG(INFO) << "Memory tracking is not available with address sanitizer builds.";
507
#elif defined(USE_JEMALLOC)
508
    _memory_metrics->memory_jemalloc_allocated_bytes->set_value(
509
            JemallocControl::get_jemallctl_value<int64_t>("stats.allocated"));
510
    _memory_metrics->memory_jemalloc_active_bytes->set_value(
511
            JemallocControl::get_jemallctl_value<int64_t>("stats.active"));
512
    _memory_metrics->memory_jemalloc_metadata_bytes->set_value(
513
            JemallocControl::get_jemallctl_value<int64_t>("stats.metadata"));
514
    _memory_metrics->memory_jemalloc_resident_bytes->set_value(
515
            JemallocControl::get_jemallctl_value<int64_t>("stats.resident"));
516
    _memory_metrics->memory_jemalloc_mapped_bytes->set_value(
517
            JemallocControl::get_jemallctl_value<int64_t>("stats.mapped"));
518
    _memory_metrics->memory_jemalloc_retained_bytes->set_value(
519
            JemallocControl::get_jemallctl_value<int64_t>("stats.retained"));
520
    _memory_metrics->memory_jemalloc_tcache_bytes->set_value(
521
            JemallocControl::get_je_all_arena_metrics("tcache_bytes"));
522
    _memory_metrics->memory_jemalloc_pactive_num->set_value(
523
            JemallocControl::get_je_all_arena_metrics("pactive"));
524
    _memory_metrics->memory_jemalloc_pdirty_num->set_value(
525
            JemallocControl::get_je_all_arena_metrics("pdirty"));
526
    _memory_metrics->memory_jemalloc_pmuzzy_num->set_value(
527
            JemallocControl::get_je_all_arena_metrics("pmuzzy"));
528
    _memory_metrics->memory_jemalloc_dirty_purged_num->set_value(
529
            JemallocControl::get_je_all_arena_metrics("dirty_purged"));
530
    _memory_metrics->memory_jemalloc_muzzy_purged_num->set_value(
531
            JemallocControl::get_je_all_arena_metrics("muzzy_purged"));
532
#else
533
    _memory_metrics->memory_tcmalloc_allocated_bytes->set_value(
534
            JemallocControl::get_tc_metrics("generic.total_physical_bytes"));
535
    _memory_metrics->memory_tcmalloc_total_thread_cache_bytes->set_value(
536
            JemallocControl::je_cache_bytes());
537
    _memory_metrics->memory_tcmalloc_central_cache_free_bytes->set_value(
538
            JemallocControl::get_tc_metrics("tcmalloc.central_cache_free_bytes"));
539
    _memory_metrics->memory_tcmalloc_transfer_cache_free_bytes->set_value(
540
            JemallocControl::get_tc_metrics("tcmalloc.transfer_cache_free_bytes"));
541
    _memory_metrics->memory_tcmalloc_thread_cache_free_bytes->set_value(
542
            JemallocControl::get_tc_metrics("tcmalloc.thread_cache_free_bytes"));
543
    _memory_metrics->memory_tcmalloc_pageheap_free_bytes->set_value(
544
            JemallocControl::get_tc_metrics("tcmalloc.pageheap_free_bytes"));
545
    _memory_metrics->memory_tcmalloc_pageheap_unmapped_bytes->set_value(
546
            JemallocControl::get_tc_metrics("tcmalloc.pageheap_unmapped_bytes"));
547
#endif
548
0
}
549
550
12
void SystemMetrics::_install_disk_metrics(const std::set<std::string>& disk_devices) {
551
12
    for (auto& disk_device : disk_devices) {
552
9
        auto disk_entity = _registry->register_entity(std::string("disk_metrics.") + disk_device,
553
9
                                                      {{"device", disk_device}});
554
9
        DiskMetrics* metrics = new DiskMetrics(disk_entity.get());
555
9
        _disk_metrics.emplace(disk_device, metrics);
556
9
    }
557
12
}
558
559
765
void SystemMetrics::_update_disk_metrics() {
560
#ifdef BE_TEST
561
    FILE* fp = fopen(k_ut_diskstats_path, "r");
562
#else
563
765
    FILE* fp = fopen("/proc/diskstats", "r");
564
765
#endif
565
765
    if (fp == nullptr) {
566
7
        char buf[64];
567
7
        LOG(WARNING) << "open /proc/diskstats failed, errno=" << errno
568
7
                     << ", message=" << strerror_r(errno, buf, 64);
569
7
        return;
570
7
    }
571
572
    // /proc/diskstats: https://www.kernel.org/doc/Documentation/ABI/testing/procfs-diskstats
573
    // 1 - major number
574
    // 2 - minor mumber
575
    // 3 - device name
576
    // 4 - reads completed successfully
577
    // 5 - reads merged
578
    // 6 - sectors read
579
    // 7 - time spent reading (ms)
580
    // 8 - writes completed
581
    // 9 - writes merged
582
    // 10 - sectors written
583
    // 11 - time spent writing (ms)
584
    // 12 - I/Os currently in progress
585
    // 13 - time spent doing I/Os (ms)
586
    // 14 - weighted time spent doing I/Os (ms)
587
    // I think 1024 is enough for device name
588
758
    int major = 0;
589
758
    int minor = 0;
590
758
    char device[1024];
591
758
    int64_t values[11];
592
9.11k
    while (getline(&_line_ptr, &_line_buf_size, fp) > 0) {
593
8.36k
        memset(values, 0, sizeof(values));
594
8.36k
        int num = sscanf(_line_ptr,
595
8.36k
                         "%d %d %1023s"
596
8.36k
                         " %" PRId64 " %" PRId64 " %" PRId64 " %" PRId64 " %" PRId64 " %" PRId64
597
8.36k
                         " %" PRId64 " %" PRId64 " %" PRId64 " %" PRId64 " %" PRId64,
598
8.36k
                         &major, &minor, device, &values[0], &values[1], &values[2], &values[3],
599
8.36k
                         &values[4], &values[5], &values[6], &values[7], &values[8], &values[9],
600
8.36k
                         &values[10]);
601
8.36k
        if (num < 4) {
602
0
            continue;
603
0
        }
604
8.36k
        auto it = _disk_metrics.find(device);
605
8.36k
        if (it == _disk_metrics.end()) {
606
7.60k
            continue;
607
7.60k
        }
608
        // update disk metrics
609
        // reads_completed: 4 reads completed successfully
610
758
        it->second->disk_reads_completed->set_value(values[0]);
611
        // bytes_read: 6 sectors read * 512; 5 reads merged is ignored
612
758
        it->second->disk_bytes_read->set_value(values[2] * 512);
613
        // read_time_ms: 7 time spent reading (ms)
614
758
        it->second->disk_read_time_ms->set_value(values[3]);
615
        // writes_completed: 8 writes completed
616
758
        it->second->disk_writes_completed->set_value(values[4]);
617
        // bytes_written: 10 sectors write * 512; 9 writes merged is ignored
618
758
        it->second->disk_bytes_written->set_value(values[6] * 512);
619
        // write_time_ms: 11 time spent writing (ms)
620
758
        it->second->disk_write_time_ms->set_value(values[7]);
621
        // io_time_ms: 13 time spent doing I/Os (ms)
622
758
        it->second->disk_io_time_ms->set_value(values[9]);
623
        // io_time_weigthed: 14 - weighted time spent doing I/Os (ms)
624
758
        it->second->disk_io_time_weigthed->set_value(values[10]);
625
758
    }
626
758
    if (ferror(fp) != 0) {
627
0
        char buf[64];
628
0
        LOG(WARNING) << "getline failed, errno=" << errno
629
0
                     << ", message=" << strerror_r(errno, buf, 64);
630
0
    }
631
758
    fclose(fp);
632
758
}
633
634
12
void SystemMetrics::_install_net_metrics(const std::vector<std::string>& interfaces) {
635
37
    for (auto& interface : interfaces) {
636
37
        auto interface_entity = _registry->register_entity(
637
37
                std::string("network_metrics.") + interface, {{"device", interface}});
638
37
        NetworkMetrics* metrics = new NetworkMetrics(interface_entity.get());
639
37
        _network_metrics.emplace(interface, metrics);
640
37
    }
641
12
}
642
643
12
// Creates the SnmpMetrics holder, registering its counters on `entity`.
void SystemMetrics::_install_snmp_metrics(MetricEntity* entity) {
    _snmp_metrics = std::make_unique<SnmpMetrics>(entity);
}
646
647
765
void SystemMetrics::_update_net_metrics() {
648
#ifdef BE_TEST
649
    // to mock proc
650
    FILE* fp = fopen(k_ut_net_dev_path, "r");
651
#else
652
765
    FILE* fp = fopen("/proc/net/dev", "r");
653
765
#endif
654
765
    if (fp == nullptr) {
655
7
        char buf[64];
656
7
        LOG(WARNING) << "open /proc/net/dev failed, errno=" << errno
657
7
                     << ", message=" << strerror_r(errno, buf, 64);
658
7
        return;
659
7
    }
660
661
    // Ignore header
662
758
    if (getline(&_line_ptr, &_line_buf_size, fp) < 0 ||
663
758
        getline(&_line_ptr, &_line_buf_size, fp) < 0) {
664
0
        char buf[64];
665
0
        LOG(WARNING) << "read /proc/net/dev first two line failed, errno=" << errno
666
0
                     << ", message=" << strerror_r(errno, buf, 64);
667
0
        fclose(fp);
668
0
        return;
669
0
    }
670
758
    if (_proc_net_dev_version == 0) {
671
8
        if (strstr(_line_ptr, "compressed") != nullptr) {
672
8
            _proc_net_dev_version = 3;
673
8
        } else if (strstr(_line_ptr, "bytes") != nullptr) {
674
0
            _proc_net_dev_version = 2;
675
0
        } else {
676
0
            _proc_net_dev_version = 1;
677
0
        }
678
8
    }
679
680
19.5k
    while (getline(&_line_ptr, &_line_buf_size, fp) > 0) {
681
18.8k
        char* ptr = strrchr(_line_ptr, ':');
682
18.8k
        if (ptr == nullptr) {
683
0
            continue;
684
0
        }
685
18.8k
        char* start = _line_ptr;
686
23.3k
        while (isspace(*start)) {
687
4.55k
            start++;
688
4.55k
        }
689
18.8k
        std::string interface(start, ptr - start);
690
18.8k
        auto it = _network_metrics.find(interface);
691
18.8k
        if (it == _network_metrics.end()) {
692
15.4k
            continue;
693
15.4k
        }
694
3.35k
        ptr++;
695
3.35k
        int64_t receive_bytes = 0;
696
3.35k
        int64_t receive_packets = 0;
697
3.35k
        int64_t send_bytes = 0;
698
3.35k
        int64_t send_packets = 0;
699
3.35k
        switch (_proc_net_dev_version) {
700
3.35k
        case 3:
701
            // receive: bytes packets errs drop fifo frame compressed multicast
702
            // send:    bytes packets errs drop fifo colls carrier compressed
703
3.35k
            sscanf(ptr,
704
3.35k
                   " %" PRId64 " %" PRId64
705
3.35k
                   " %*d %*d %*d %*d %*d %*d"
706
3.35k
                   " %" PRId64 " %" PRId64 " %*d %*d %*d %*d %*d %*d",
707
3.35k
                   &receive_bytes, &receive_packets, &send_bytes, &send_packets);
708
3.35k
            break;
709
0
        case 2:
710
            // receive: bytes packets errs drop fifo frame
711
            // send:    bytes packets errs drop fifo colls carrier
712
0
            sscanf(ptr,
713
0
                   " %" PRId64 " %" PRId64
714
0
                   " %*d %*d %*d %*d"
715
0
                   " %" PRId64 " %" PRId64 " %*d %*d %*d %*d %*d",
716
0
                   &receive_bytes, &receive_packets, &send_bytes, &send_packets);
717
0
            break;
718
0
        case 1:
719
            // receive: packets errs drop fifo frame
720
            // send: packets errs drop fifo colls carrier
721
0
            sscanf(ptr,
722
0
                   " %" PRId64
723
0
                   " %*d %*d %*d %*d"
724
0
                   " %" PRId64 " %*d %*d %*d %*d %*d",
725
0
                   &receive_packets, &send_packets);
726
0
            break;
727
0
        default:
728
0
            break;
729
3.35k
        }
730
3.35k
        it->second->network_receive_bytes->set_value(receive_bytes);
731
3.35k
        it->second->network_receive_packets->set_value(receive_packets);
732
3.35k
        it->second->network_send_bytes->set_value(send_bytes);
733
3.35k
        it->second->network_send_packets->set_value(send_packets);
734
3.35k
    }
735
758
    if (ferror(fp) != 0) {
736
0
        char buf[64];
737
0
        LOG(WARNING) << "getline failed, errno=" << errno
738
0
                     << ", message=" << strerror_r(errno, buf, 64);
739
0
    }
740
758
    fclose(fp);
741
758
}
742
743
765
void SystemMetrics::_update_snmp_metrics() {
744
#ifdef BE_TEST
745
    // to mock proc
746
    FILE* fp = fopen(k_ut_net_snmp_path, "r");
747
#else
748
765
    FILE* fp = fopen("/proc/net/snmp", "r");
749
765
#endif
750
765
    if (fp == nullptr) {
751
7
        char buf[64];
752
7
        LOG(WARNING) << "open /proc/net/snmp failed, errno=" << errno
753
7
                     << ", message=" << strerror_r(errno, buf, 64);
754
7
        return;
755
7
    }
756
757
    // We only care about Tcp lines, so skip other lines in front of Tcp line
758
758
    int64_t res = 0;
759
5.30k
    while ((res = getline(&_line_ptr, &_line_buf_size, fp)) > 0) {
760
5.30k
        if (strstr(_line_ptr, "Tcp") != nullptr) {
761
758
            break;
762
758
        }
763
5.30k
    }
764
758
    if (res <= 0) {
765
0
        char buf[64];
766
0
        LOG(WARNING) << "failed to skip lines of /proc/net/snmp, errno=" << errno
767
0
                     << ", message=" << strerror_r(errno, buf, 64);
768
0
        fclose(fp);
769
0
        return;
770
0
    }
771
772
    // parse the Tcp header
773
    // Tcp: RtoAlgorithm RtoMin RtoMax MaxConn ActiveOpens PassiveOpens AttemptFails EstabResets CurrEstab InSegs OutSegs RetransSegs InErrs OutRsts InCsumErrors
774
758
    std::vector<std::string> headers = absl::StrSplit(_line_ptr, " ");
775
758
    std::unordered_map<std::string, int32_t> header_map;
776
758
    int32_t pos = 0;
777
12.1k
    for (auto& h : headers) {
778
12.1k
        header_map.emplace(h, pos++);
779
12.1k
    }
780
781
    // read the metrics of TCP
782
758
    if (getline(&_line_ptr, &_line_buf_size, fp) < 0) {
783
0
        char buf[64];
784
0
        LOG(WARNING) << "failed to skip Tcp header line of /proc/net/snmp, errno=" << errno
785
0
                     << ", message=" << strerror_r(errno, buf, 64);
786
0
        fclose(fp);
787
0
        return;
788
0
    }
789
790
    // metric line looks like:
791
    // Tcp: 1 200 120000 -1 47849374 38601877 3353843 2320314 276 1033354613 1166025166 825439 12694 23238924 0
792
758
    std::vector<std::string> metrics = absl::StrSplit(_line_ptr, " ");
793
758
    if (metrics.size() != headers.size()) {
794
0
        LOG(WARNING) << "invalid tcp metrics line: " << _line_ptr;
795
0
        fclose(fp);
796
0
        return;
797
0
    }
798
758
    int64_t retrans_segs = std::stoll(metrics[header_map["RetransSegs"]]);
799
758
    int64_t in_errs = std::stoll(metrics[header_map["InErrs"]]);
800
758
    int64_t in_segs = std::stoll(metrics[header_map["InSegs"]]);
801
758
    int64_t out_segs = std::stoll(metrics[header_map["OutSegs"]]);
802
758
    _snmp_metrics->snmp_tcp_retrans_segs->set_value(retrans_segs);
803
758
    _snmp_metrics->snmp_tcp_in_errs->set_value(in_errs);
804
758
    _snmp_metrics->snmp_tcp_in_segs->set_value(in_segs);
805
758
    _snmp_metrics->snmp_tcp_out_segs->set_value(out_segs);
806
807
758
    if (ferror(fp) != 0) {
808
0
        char buf[64];
809
0
        LOG(WARNING) << "getline failed, errno=" << errno
810
0
                     << ", message=" << strerror_r(errno, buf, 64);
811
0
    }
812
758
    fclose(fp);
813
758
}
814
815
12
// Creates the FileDescriptorMetrics holder bound to `entity` and installs it
// as _fd_metrics, replacing any previously installed holder.
void SystemMetrics::_install_fd_metrics(MetricEntity* entity) {
    auto* fd_holder = new FileDescriptorMetrics(entity);
    _fd_metrics.reset(fd_holder);
}
818
819
765
void SystemMetrics::_update_fd_metrics() {
820
#ifdef BE_TEST
821
    FILE* fp = fopen(k_ut_fd_path, "r");
822
#else
823
765
    FILE* fp = fopen("/proc/sys/fs/file-nr", "r");
824
765
#endif
825
765
    if (fp == nullptr) {
826
7
        char buf[64];
827
7
        LOG(WARNING) << "open /proc/sys/fs/file-nr failed, errno=" << errno
828
7
                     << ", message=" << strerror_r(errno, buf, 64);
829
7
        return;
830
7
    }
831
832
    // /proc/sys/fs/file-nr: https://www.kernel.org/doc/Documentation/sysctl/fs.txt
833
    // 1 - the number of allocated file handles
834
    // 2 - the number of allocated but unused file handles
835
    // 3 - the maximum number of file handles
836
837
758
    int64_t values[3];
838
758
    if (getline(&_line_ptr, &_line_buf_size, fp) > 0) {
839
758
        memset(values, 0, sizeof(values));
840
758
        int num = sscanf(_line_ptr, "%" PRId64 " %" PRId64 " %" PRId64, &values[0], &values[1],
841
758
                         &values[2]);
842
758
        if (num == 3) {
843
758
            _fd_metrics->fd_num_limit->set_value(values[2]);
844
758
            _fd_metrics->fd_num_used->set_value(values[0] - values[1]);
845
758
        }
846
758
    }
847
848
758
    if (ferror(fp) != 0) {
849
0
        char buf[64];
850
0
        LOG(WARNING) << "getline failed, errno=" << errno
851
0
                     << ", message=" << strerror_r(errno, buf, 64);
852
0
    }
853
758
    fclose(fp);
854
758
}
855
856
12
// Creates the LoadAverageMetrics holder bound to `entity` and installs it as
// _load_average_metrics, replacing any previously installed holder.
void SystemMetrics::_install_load_avg_metrics(MetricEntity* entity) {
    auto* load_holder = new LoadAverageMetrics(entity);
    _load_average_metrics.reset(load_holder);
}
859
860
765
void SystemMetrics::_update_load_avg_metrics() {
861
#ifdef BE_TEST
862
    FILE* fp = fopen(k_ut_load_avg_path, "r");
863
#else
864
765
    FILE* fp = fopen("/proc/loadavg", "r");
865
765
#endif
866
765
    if (fp == nullptr) {
867
7
        char buf[64];
868
7
        LOG(WARNING) << "open /proc/loadavg failed, errno=" << errno
869
7
                     << ", message=" << strerror_r(errno, buf, 64);
870
7
        return;
871
7
    }
872
873
758
    double values[3];
874
758
    if (getline(&_line_ptr, &_line_buf_size, fp) > 0) {
875
758
        memset(values, 0, sizeof(values));
876
758
        int num = sscanf(_line_ptr, "%lf %lf %lf", &values[0], &values[1], &values[2]);
877
758
        if (num == 3) {
878
758
            _load_average_metrics->load_average_1_minutes->set_value(values[0]);
879
758
            _load_average_metrics->load_average_5_minutes->set_value(values[1]);
880
758
            _load_average_metrics->load_average_15_minutes->set_value(values[2]);
881
758
        }
882
758
    }
883
884
758
    if (ferror(fp) != 0) {
885
0
        char buf[64];
886
0
        LOG(WARNING) << "getline failed, errno=" << errno
887
0
                     << ", message=" << strerror_r(errno, buf, 64);
888
0
    }
889
758
    fclose(fp);
890
758
}
891
892
int64_t SystemMetrics::get_max_io_util(const std::map<std::string, int64_t>& lst_value,
893
1.65k
                                       int64_t interval_sec) {
894
1.65k
    int64_t max = 0;
895
1.65k
    for (auto& it : _disk_metrics) {
896
1.65k
        int64_t cur = it.second->disk_io_time_ms->value();
897
1.65k
        const auto find = lst_value.find(it.first);
898
1.65k
        if (find == lst_value.end()) {
899
6
            continue;
900
6
        }
901
1.64k
        int64_t incr = cur - find->second;
902
1.64k
        if (incr > max) max = incr;
903
1.64k
    }
904
1.65k
    return max / interval_sec / 10;
905
1.65k
}
906
907
1.66k
void SystemMetrics::get_disks_io_time(std::map<std::string, int64_t>* map) {
908
1.66k
    map->clear();
909
1.66k
    for (auto& it : _disk_metrics) {
910
1.66k
        map->emplace(it.first, it.second->disk_io_time_ms->value());
911
1.66k
    }
912
1.66k
}
913
914
9.57k
double SystemMetrics::get_load_average_1_min() {
915
9.57k
    if (_load_average_metrics) {
916
9.57k
        return _load_average_metrics->load_average_1_minutes->value();
917
9.57k
    } else {
918
0
        return 0;
919
0
    }
920
9.57k
}
921
922
4.53k
bool SystemMetrics::get_aggregate_cpu_time(int64_t* total_time, int64_t* idle_time) const {
923
4.53k
    DCHECK(total_time != nullptr);
924
4.53k
    DCHECK(idle_time != nullptr);
925
926
4.53k
    std::lock_guard<std::mutex> lk(_aggregate_cpu_time_mutex);
927
4.53k
    if (!_aggregate_cpu_time.initialized) {
928
0
        return false;
929
0
    }
930
931
4.53k
    *total_time = _aggregate_cpu_time.total_time;
932
4.53k
    *idle_time = _aggregate_cpu_time.idle_time;
933
4.53k
    return true;
934
4.53k
}
935
936
void SystemMetrics::get_network_traffic(std::map<std::string, int64_t>* send_map,
937
757
                                        std::map<std::string, int64_t>* rcv_map) {
938
757
    send_map->clear();
939
757
    rcv_map->clear();
940
3.79k
    for (auto& it : _network_metrics) {
941
3.79k
        if (it.first == "lo") {
942
757
            continue;
943
757
        }
944
3.03k
        send_map->emplace(it.first, it.second->network_send_bytes->value());
945
3.03k
        rcv_map->emplace(it.first, it.second->network_receive_bytes->value());
946
3.03k
    }
947
757
}
948
949
void SystemMetrics::get_max_net_traffic(const std::map<std::string, int64_t>& lst_send_map,
950
                                        const std::map<std::string, int64_t>& lst_rcv_map,
951
                                        int64_t interval_sec, int64_t* send_rate,
952
750
                                        int64_t* rcv_rate) {
953
750
    int64_t max_send = 0;
954
750
    int64_t max_rcv = 0;
955
3.76k
    for (auto& it : _network_metrics) {
956
3.76k
        int64_t cur_send = it.second->network_send_bytes->value();
957
3.76k
        int64_t cur_rcv = it.second->network_receive_bytes->value();
958
959
3.76k
        const auto find_send = lst_send_map.find(it.first);
960
3.76k
        if (find_send != lst_send_map.end()) {
961
3.01k
            int64_t incr = cur_send - find_send->second;
962
3.01k
            if (incr > max_send) max_send = incr;
963
3.01k
        }
964
3.76k
        const auto find_rcv = lst_rcv_map.find(it.first);
965
3.76k
        if (find_rcv != lst_rcv_map.end()) {
966
3.01k
            int64_t incr = cur_rcv - find_rcv->second;
967
3.01k
            if (incr > max_rcv) max_rcv = incr;
968
3.01k
        }
969
3.76k
    }
970
971
750
    *send_rate = max_send / interval_sec;
972
750
    *rcv_rate = max_rcv / interval_sec;
973
750
}
974
975
void SystemMetrics::update_max_disk_io_util_percent(const std::map<std::string, int64_t>& lst_value,
976
750
                                                    int64_t interval_sec) {
977
750
    max_disk_io_util_percent->set_value(get_max_io_util(lst_value, interval_sec));
978
750
}
979
980
750
void SystemMetrics::update_max_network_send_bytes_rate(int64_t max_send_bytes_rate) {
981
750
    max_network_send_bytes_rate->set_value(max_send_bytes_rate);
982
750
}
983
984
750
void SystemMetrics::update_max_network_receive_bytes_rate(int64_t max_receive_bytes_rate) {
985
750
    max_network_receive_bytes_rate->set_value(max_receive_bytes_rate);
986
750
}
987
988
12
// Creates the ProcMetrics holder bound to `entity` and installs it as
// _proc_metrics, replacing any previously installed holder.
void SystemMetrics::_install_proc_metrics(MetricEntity* entity) {
    auto* proc_holder = new ProcMetrics(entity);
    _proc_metrics.reset(proc_holder);
}
991
992
765
void SystemMetrics::_update_proc_metrics() {
993
#ifdef BE_TEST
994
    FILE* fp = fopen(k_ut_stat_path, "r");
995
#else
996
765
    FILE* fp = fopen("/proc/stat", "r");
997
765
#endif
998
765
    if (fp == nullptr) {
999
0
        char buf[64];
1000
0
        LOG(WARNING) << "open /proc/stat failed, errno=" << errno
1001
0
                     << ", message=" << strerror_r(errno, buf, 64);
1002
0
        return;
1003
0
    }
1004
1005
765
    uint64_t inter = 0, ctxt = 0, procs_r = 0, procs_b = 0;
1006
22.8k
    while (getline(&_line_ptr, &_line_buf_size, fp) > 0) {
1007
22.0k
        char* start_pos = nullptr;
1008
22.0k
        start_pos = strstr(_line_ptr, "intr ");
1009
22.0k
        if (start_pos) {
1010
758
            sscanf(start_pos, "intr %" PRIu64, &inter);
1011
758
            _proc_metrics->proc_interrupt->set_value(inter);
1012
758
        }
1013
1014
22.0k
        start_pos = strstr(_line_ptr, "ctxt ");
1015
22.0k
        if (start_pos) {
1016
758
            sscanf(start_pos, "ctxt %" PRIu64, &ctxt);
1017
758
            _proc_metrics->proc_ctxt_switch->set_value(ctxt);
1018
758
        }
1019
1020
22.0k
        start_pos = strstr(_line_ptr, "procs_running ");
1021
22.0k
        if (start_pos) {
1022
758
            sscanf(start_pos, "procs_running %" PRIu64, &procs_r);
1023
758
            _proc_metrics->proc_procs_running->set_value(procs_r);
1024
758
        }
1025
1026
22.0k
        start_pos = strstr(_line_ptr, "procs_blocked ");
1027
22.0k
        if (start_pos) {
1028
758
            sscanf(start_pos, "procs_blocked %" PRIu64, &procs_b);
1029
758
            _proc_metrics->proc_procs_blocked->set_value(procs_b);
1030
758
        }
1031
22.0k
    }
1032
1033
765
    if (ferror(fp) != 0) {
1034
0
        char buf[64];
1035
0
        LOG(WARNING) << "getline failed, errno=" << errno
1036
0
                     << ", message=" << strerror_r(errno, buf, 64);
1037
0
    }
1038
1039
765
    fclose(fp);
1040
765
}
1041
1042
750
void SystemMetrics::update_be_avail_cpu_num() {
1043
750
    int64_t physical_cpu_num = _cpu_num_metrics->host_cpu_num->value();
1044
750
    if (physical_cpu_num > 0) {
1045
750
        physical_cpu_num =
1046
750
                CGroupUtil::get_cgroup_limited_cpu_number(cast_set<int32_t>(physical_cpu_num));
1047
750
        _cpu_num_metrics->avail_cpu_num->set_value(physical_cpu_num);
1048
750
    }
1049
750
}
1050
1051
765
void SystemMetrics::get_metrics_from_proc_vmstat() {
1052
#ifdef BE_TEST
1053
    FILE* fp = fopen(k_ut_vmstat_path, "r");
1054
#else
1055
765
    FILE* fp = fopen("/proc/vmstat", "r");
1056
765
#endif
1057
765
    if (fp == nullptr) {
1058
7
        char buf[64];
1059
7
        LOG(WARNING) << "open /proc/vmstat failed, errno=" << errno
1060
7
                     << ", message=" << strerror_r(errno, buf, 64);
1061
7
        return;
1062
7
    }
1063
1064
105k
    while (getline(&_line_ptr, &_line_buf_size, fp) > 0) {
1065
104k
        uint64_t value;
1066
104k
        char name[64];
1067
104k
        int num = sscanf(_line_ptr, "%s %" PRIu64, name, &value);
1068
104k
        if (num < 2) {
1069
0
            continue;
1070
0
        }
1071
1072
104k
        if (strcmp(name, "pgpgin") == 0) {
1073
758
            _memory_metrics->memory_pgpgin->set_value(value);
1074
103k
        } else if (strcmp(name, "pgpgout") == 0) {
1075
758
            _memory_metrics->memory_pgpgout->set_value(value);
1076
102k
        } else if (strcmp(name, "pswpin") == 0) {
1077
758
            _memory_metrics->memory_pswpin->set_value(value);
1078
102k
        } else if (strcmp(name, "pswpout") == 0) {
1079
758
            _memory_metrics->memory_pswpout->set_value(value);
1080
758
        }
1081
104k
    }
1082
1083
758
    if (ferror(fp) != 0) {
1084
0
        char buf[64];
1085
0
        LOG(WARNING) << "getline failed, errno=" << errno
1086
0
                     << ", message=" << strerror_r(errno, buf, 64);
1087
0
    }
1088
1089
758
    fclose(fp);
1090
758
}
1091
1092
12
void SystemMetrics::get_cpu_name() {
1093
#ifdef BE_TEST
1094
    FILE* fp = fopen(k_ut_stat_path, "r");
1095
#else
1096
12
    FILE* fp = fopen("/proc/stat", "r");
1097
12
#endif
1098
12
    if (fp == nullptr) {
1099
1
        char buf[64];
1100
1
        LOG(WARNING) << "open /proc/stat failed, errno=" << errno
1101
1
                     << ", message=" << strerror_r(errno, buf, 64);
1102
1
        return;
1103
1
    }
1104
1105
254
    while (getline(&_line_ptr, &_line_buf_size, fp) > 0) {
1106
243
        char cpu[16];
1107
243
        char* start_pos = nullptr;
1108
243
        start_pos = strstr(_line_ptr, "cpu");
1109
243
        if (start_pos) {
1110
187
            sscanf(_line_ptr, "%15s", cpu);
1111
187
            std::string cpu_name(cpu);
1112
187
            _cpu_names.push_back(cpu_name);
1113
187
        }
1114
243
    }
1115
1116
11
    if (ferror(fp) != 0) {
1117
0
        char buf[64];
1118
0
        LOG(WARNING) << "getline failed, errno=" << errno
1119
0
                     << ", message=" << strerror_r(errno, buf, 64);
1120
0
    }
1121
1122
11
    fclose(fp);
1123
11
}
1124
1125
} // namespace doris