/root/doris/be/src/util/mem_info.cpp

Source (jump to first uncovered line)
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements.  See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership.  The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License.  You may obtain a copy of the License at
//
//   http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied.  See the License for the
// specific language governing permissions and limitations
// under the License.
// This file is copied from
// https://github.com/apache/impala/blob/branch-2.9.0/be/src/util/mem-info.cc
// and modified by Doris

#include "mem_info.h"

#include "gutil/strings/split.h"

#ifdef __APPLE__
#include <sys/sysctl.h>
#endif

#include <bvar/bvar.h>
#include <fmt/format.h>
#include <gen_cpp/Metrics_types.h>
#include <gen_cpp/segment_v2.pb.h>
#include <jemalloc/jemalloc.h>

#include <algorithm>
#include <boost/algorithm/string/trim.hpp>
#include <fstream>
#include <unordered_map>

#include "common/cgroup_memory_ctl.h"
#include "common/config.h"
#include "common/status.h"
#include "runtime/memory/global_memory_arbitrator.h"
#include "util/cgroup_util.h"
#include "util/parse_util.h"
#include "util/pretty_printer.h"
#include "util/string_parser.hpp"

namespace doris {

// File-local bvar adders, each constructed with the metric name given as its argument.
// MemInfo::refresh_memory_bvar() adds `latest sample - current adder value` to each of them,
// so after a refresh every adder holds the most recently sampled value.
static bvar::Adder<int64_t> memory_jemalloc_cache_bytes("memory_jemalloc_cache_bytes");
static bvar::Adder<int64_t> memory_jemalloc_dirty_pages_bytes("memory_jemalloc_dirty_pages_bytes");
static bvar::Adder<int64_t> memory_jemalloc_metadata_bytes("memory_jemalloc_metadata_bytes");
static bvar::Adder<int64_t> memory_jemalloc_virtual_bytes("memory_jemalloc_virtual_bytes");
static bvar::Adder<int64_t> memory_cgroup_usage_bytes("memory_cgroup_usage_bytes");
static bvar::Adder<int64_t> memory_sys_available_bytes("memory_sys_available_bytes");
static bvar::Adder<int64_t> memory_arbitrator_sys_available_bytes(
        "memory_arbitrator_sys_available_bytes");
static bvar::Adder<int64_t> memory_arbitrator_process_usage_bytes(
        "memory_arbitrator_process_usage_bytes");
static bvar::Adder<int64_t> memory_arbitrator_reserve_memory_bytes(
        "memory_arbitrator_reserve_memory_bytes");
static bvar::Adder<int64_t> memory_arbitrator_refresh_interval_growth_bytes(
        "memory_arbitrator_refresh_interval_growth_bytes");

// Set to true at the end of MemInfo::init(); debug_string() DCHECKs it.
bool MemInfo::_s_initialized = false;
// Physical memory and the limits derived from it. They start at int64 max and are
// overwritten once the physical memory size has been determined.
std::atomic<int64_t> MemInfo::_s_physical_mem = std::numeric_limits<int64_t>::max();
std::atomic<int64_t> MemInfo::_s_mem_limit = std::numeric_limits<int64_t>::max();
std::atomic<int64_t> MemInfo::_s_soft_mem_limit = std::numeric_limits<int64_t>::max();

// Allocator statistics, written by refresh_allocator_mem().
std::atomic<int64_t> MemInfo::_s_allocator_cache_mem = 0;
std::atomic<int64_t> MemInfo::_s_allocator_metadata_mem = 0;
std::atomic<int64_t> MemInfo::_s_je_dirty_pages_mem = std::numeric_limits<int64_t>::min();
std::atomic<int64_t> MemInfo::_s_je_dirty_pages_mem_limit = std::numeric_limits<int64_t>::max();
std::atomic<int64_t> MemInfo::_s_virtual_memory_used = 0;

// Cgroup state, written by refresh_proc_meminfo().
// A limit of int64 max means "no cgroup limit found"; usage is reset to int64 min when reading
// it fails. `_s_cgroup_mem_refresh_state` is true only after both limit and usage were read.
std::atomic<int64_t> MemInfo::_s_cgroup_mem_limit = std::numeric_limits<int64_t>::max();
std::atomic<int64_t> MemInfo::_s_cgroup_mem_usage = std::numeric_limits<int64_t>::min();
std::atomic<bool> MemInfo::_s_cgroup_mem_refresh_state = false;
// Back-off counter for re-reading the cgroup limit: set to a negative value after each attempt
// and incremented on every refresh; the limit is read again once it is >= 0.
int64_t MemInfo::_s_cgroup_mem_refresh_wait_times = 0;

// Key -> value parsed from /proc/meminfo; values with a "kB" unit are stored in bytes.
static std::unordered_map<std::string, int64_t> _mem_info_bytes;
// -1 until the first refresh_proc_meminfo() call stores a value.
std::atomic<int64_t> MemInfo::_s_sys_mem_available = -1;
// Water marks are computed in init() when /proc/meminfo provides `MemAvailable`.
int64_t MemInfo::_s_sys_mem_available_low_water_mark = std::numeric_limits<int64_t>::min();
int64_t MemInfo::_s_sys_mem_available_warning_water_mark = std::numeric_limits<int64_t>::min();
// GC thresholds, parsed from config whenever the physical memory size changes.
std::atomic<int64_t> MemInfo::_s_process_minor_gc_size = -1;
std::atomic<int64_t> MemInfo::_s_process_full_gc_size = -1;
// NOTE(review): not used in this part of the file; presumably these coordinate the jemalloc
// dirty-page purge — confirm against their users.
std::mutex MemInfo::je_purge_dirty_pages_lock;
std::condition_variable MemInfo::je_purge_dirty_pages_cv;
std::atomic<bool> MemInfo::je_purge_dirty_pages_notify {false};

// Refreshes the cached allocator statistics (`_s_je_dirty_pages_mem`, `_s_allocator_cache_mem`,
// `_s_allocator_metadata_mem`, `_s_virtual_memory_used`) from the allocator in use.
// Sanitizer builds compile to an empty body; jemalloc builds query mallctl metrics; every other
// build queries tcmalloc metrics.
void MemInfo::refresh_allocator_mem() {
#if defined(ADDRESS_SANITIZER) || defined(LEAK_SANITIZER) || defined(THREAD_SANITIZER)
#elif defined(USE_JEMALLOC)
    // jemalloc mallctl refer to : https://jemalloc.net/jemalloc.3.html
    // https://www.bookstack.cn/read/aliyun-rds-core/4a0cdf677f62feb3.md
    //  Check the Doris BE web page `http://ip:webserver_port/memory` to get the Jemalloc Profile.

    // 'epoch' is a special mallctl -- it updates the statistics. Without it, all
    // the following calls will return stale values. It increments and returns
    // the current epoch number, which might be useful to log as a sanity check.
    uint64_t epoch = 0;
    size_t sz = sizeof(epoch);
    jemallctl("epoch", &epoch, &sz, &epoch, sz);

    // Number of extents of the given type in this arena in the bucket corresponding to page size index.
    // Large size class starts at 16384, the extents have three sizes before 16384: 4096, 8192, and 12288, so + 3
    // The bucket count is read once up front instead of on every loop iteration.
    const auto extent_bucket_count = get_je_unsigned_metrics("arenas.nlextents") + 3;
    int64_t dirty_pages_bytes = 0;
    for (unsigned i = 0; i < extent_bucket_count; i++) {
        dirty_pages_bytes += get_je_all_arena_extents_metrics(i, "dirty_bytes");
    }
    _s_je_dirty_pages_mem.store(dirty_pages_bytes, std::memory_order_relaxed);

    // Doris uses Jemalloc as default Allocator, Jemalloc Cache consists of two parts:
    // - Thread Cache, cache a specified number of Pages in Thread Cache.
    // - Dirty Page, memory Page that can be reused in all Arenas.
    _s_allocator_cache_mem.store(get_je_all_arena_metrics("tcache_bytes") + dirty_pages_bytes,
                                 std::memory_order_relaxed);
    // Total number of bytes dedicated to metadata, which comprise base allocations used
    // for bootstrap-sensitive allocator metadata structures.
    _s_allocator_metadata_mem.store(get_je_metrics("stats.metadata"), std::memory_order_relaxed);
    _s_virtual_memory_used.store(get_je_metrics("stats.mapped"), std::memory_order_relaxed);
#else
    // tcmalloc: the cache is the sum of the free bytes held by the page heap and the central,
    // transfer and thread caches.
    _s_allocator_cache_mem.store(get_tc_metrics("tcmalloc.pageheap_free_bytes") +
                                         get_tc_metrics("tcmalloc.central_cache_free_bytes") +
                                         get_tc_metrics("tcmalloc.transfer_cache_free_bytes") +
                                         get_tc_metrics("tcmalloc.thread_cache_free_bytes"),
                                 std::memory_order_relaxed);
    _s_virtual_memory_used.store(get_tc_metrics("generic.total_physical_bytes") +
                                         get_tc_metrics("tcmalloc.pageheap_unmapped_bytes"),
                                 std::memory_order_relaxed);
#endif
}

void MemInfo::refresh_memory_bvar() {
    memory_jemalloc_cache_bytes << MemInfo::allocator_cache_mem() -
                                           memory_jemalloc_cache_bytes.get_value();
    memory_jemalloc_dirty_pages_bytes
            << MemInfo::je_dirty_pages_mem() - memory_jemalloc_dirty_pages_bytes.get_value();
    memory_jemalloc_metadata_bytes
            << MemInfo::allocator_metadata_mem() - memory_jemalloc_metadata_bytes.get_value();
    memory_jemalloc_virtual_bytes << MemInfo::allocator_virtual_mem() -
                                             memory_jemalloc_virtual_bytes.get_value();

    memory_cgroup_usage_bytes << _s_cgroup_mem_usage - memory_cgroup_usage_bytes.get_value();
    memory_sys_available_bytes << _s_sys_mem_available - memory_sys_available_bytes.get_value();

    memory_arbitrator_sys_available_bytes
            << GlobalMemoryArbitrator::sys_mem_available() -
                       memory_arbitrator_sys_available_bytes.get_value();
    memory_arbitrator_process_usage_bytes
            << GlobalMemoryArbitrator::process_memory_usage() -
                       memory_arbitrator_process_usage_bytes.get_value();
    memory_arbitrator_reserve_memory_bytes
            << GlobalMemoryArbitrator::process_reserved_memory() -
                       memory_arbitrator_reserve_memory_bytes.get_value();
    memory_arbitrator_refresh_interval_growth_bytes
            << GlobalMemoryArbitrator::refresh_interval_memory_growth -
                       memory_arbitrator_refresh_interval_growth_bytes.get_value();
}

#ifndef __APPLE__
void MemInfo::refresh_proc_meminfo() {
    std::ifstream meminfo("/proc/meminfo", std::ios::in);
    std::string line;

    while (meminfo.good() && !meminfo.eof()) {
        getline(meminfo, line);
        std::vector<std::string> fields = strings::Split(line, " ", strings::SkipWhitespace());
        if (fields.size() < 2) {
            continue;
        }
        std::string key = fields[0].substr(0, fields[0].size() - 1);

        StringParser::ParseResult result;
        auto mem_value =
                StringParser::string_to_int<int64_t>(fields[1].data(), fields[1].size(), &result);

        if (result == StringParser::PARSE_SUCCESS) {
            if (fields.size() == 2) {
                _mem_info_bytes[key] = mem_value;
            } else if (fields[2] == "kB") {
                _mem_info_bytes[key] = mem_value * 1024L;
            }
        }
    }
    if (meminfo.is_open()) {
        meminfo.close();
    }

    // refresh cgroup memory
    if (config::enable_use_cgroup_memory_info) {
        if (_s_cgroup_mem_refresh_wait_times >= 0) {
            int64_t cgroup_mem_limit;
            auto status = CGroupMemoryCtl::find_cgroup_mem_limit(&cgroup_mem_limit);
            if (!status.ok()) {
                _s_cgroup_mem_limit = std::numeric_limits<int64_t>::max();
                // find cgroup limit failed, wait 300s, 1000 * 100ms.
                _s_cgroup_mem_refresh_wait_times = -3000;
                LOG(WARNING)
                        << "Refresh cgroup memory limit failed, refresh again after 300s, cgroup "
                           "mem limit: "
                        << _s_cgroup_mem_limit << ", " << status;
            } else {
                _s_cgroup_mem_limit = cgroup_mem_limit;
                // wait 10s, 100 * 100ms, avoid too frequently.
                _s_cgroup_mem_refresh_wait_times = -100;
            }
        } else {
            _s_cgroup_mem_refresh_wait_times++;
        }

        // cgroup mem limit is refreshed every 10 seconds,
        // cgroup mem usage is refreshed together with memInfo every time, which is very frequent.
        if (_s_cgroup_mem_limit != std::numeric_limits<int64_t>::max()) {
            int64_t cgroup_mem_usage;
            auto status = CGroupMemoryCtl::find_cgroup_mem_usage(&cgroup_mem_usage);
            if (!status.ok()) {
                _s_cgroup_mem_usage = std::numeric_limits<int64_t>::min();
                _s_cgroup_mem_refresh_state = false;
                LOG_EVERY_N(WARNING, 500)
                        << "Refresh cgroup memory usage failed, cgroup mem limit: "
                        << _s_cgroup_mem_limit << ", " << status;
            } else {
                _s_cgroup_mem_usage = cgroup_mem_usage;
                _s_cgroup_mem_refresh_state = true;
            }
        } else {
            _s_cgroup_mem_refresh_state = false;
        }
    } else {
        _s_cgroup_mem_refresh_state = false;
    }

    // 1. calculate physical_mem
    int64_t physical_mem = -1;

    physical_mem = _mem_info_bytes["MemTotal"];
    if (_s_cgroup_mem_refresh_state) {
        // In theory, always cgroup_mem_limit < physical_mem
        if (physical_mem < 0) {
            physical_mem = _s_cgroup_mem_limit;
        } else {
            physical_mem =
                    std::min(physical_mem, _s_cgroup_mem_limit.load(std::memory_order_relaxed));
        }
    }

    if (physical_mem <= 0) {
        LOG(WARNING)
                << "Could not determine amount of physical memory on this machine, physical_mem: "
                << physical_mem;
    }

    // 2. if physical_mem changed, refresh mem limit and gc size.
    if (physical_mem > 0 && _s_physical_mem.load(std::memory_order_relaxed) != physical_mem) {
        _s_physical_mem.store(physical_mem);

        bool is_percent = true;
        _s_mem_limit.store(
                ParseUtil::parse_mem_spec(config::mem_limit, -1, _s_physical_mem, &is_percent));
        if (_s_mem_limit <= 0) {
            LOG(WARNING) << "Failed to parse mem limit from '" + config::mem_limit + "'.";
        }
        if (_s_mem_limit > _s_physical_mem) {
            LOG(WARNING) << "Memory limit " << PrettyPrinter::print(_s_mem_limit, TUnit::BYTES)
                         << " exceeds physical memory of "
                         << PrettyPrinter::print(_s_physical_mem, TUnit::BYTES)
                         << ". Using physical memory instead";
            _s_mem_limit.store(_s_physical_mem);
        }
        _s_soft_mem_limit.store(int64_t(_s_mem_limit * config::soft_mem_limit_frac));

        _s_process_minor_gc_size.store(ParseUtil::parse_mem_spec(config::process_minor_gc_size, -1,
                                                                 _s_mem_limit, &is_percent));
        _s_process_full_gc_size.store(ParseUtil::parse_mem_spec(config::process_full_gc_size, -1,
                                                                _s_mem_limit, &is_percent));
        _s_je_dirty_pages_mem_limit.store(ParseUtil::parse_mem_spec(
                config::je_dirty_pages_mem_limit_percent, -1, _s_mem_limit, &is_percent));
    }

    // 3. refresh process available memory
    int64_t mem_available = -1;
    if (_mem_info_bytes.find("MemAvailable") != _mem_info_bytes.end()) {
        mem_available = _mem_info_bytes["MemAvailable"];
    }
    if (_s_cgroup_mem_refresh_state) {
        // Note, CgroupV2 MemAvailable is usually a little smaller than Process MemAvailable.
        // Process `MemAvailable = MemFree - LowWaterMark + (PageCache - min(PageCache / 2, LowWaterMark))`,
        // from `MemAvailable` in `/proc/meminfo`, calculated by OS.
        // CgroupV2 `MemAvailable = cgroup_mem_limit - cgroup_mem_usage`,
        // `cgroup_mem_usage = memory.current - inactive_file - slab_reclaimable`, in fact,
        // there seems to be some memory that can be reused in `cgroup_mem_usage`.
        if (mem_available < 0) {
            mem_available = _s_cgroup_mem_limit - _s_cgroup_mem_usage;
        } else {
            mem_available = std::min(mem_available, _s_cgroup_mem_limit - _s_cgroup_mem_usage);
        }
    }
    if (mem_available < 0) {
        LOG(WARNING) << "Failed to get available memory, set MAX_INT.";
        mem_available = std::numeric_limits<int64_t>::max();
    }
    if (_s_sys_mem_available.load(std::memory_order_relaxed) != mem_available) {
        _s_sys_mem_available.store(mem_available);
    }
}

void MemInfo::init() {
    refresh_proc_meminfo();

    std::string line;
    int64_t _s_vm_min_free_kbytes = 0;
    std::ifstream vminfo("/proc/sys/vm/min_free_kbytes", std::ios::in);
    if (vminfo.good() && !vminfo.eof()) {
        getline(vminfo, line);
        boost::algorithm::trim(line);
        StringParser::ParseResult result;
        auto mem_value = StringParser::string_to_int<int64_t>(line.data(), line.size(), &result);

        if (result == StringParser::PARSE_SUCCESS) {
            _s_vm_min_free_kbytes = mem_value * 1024L;
        }
    }
    if (vminfo.is_open()) {
        vminfo.close();
    }

    // Redhat 4.x OS, `/proc/meminfo` has no `MemAvailable`.
    if (_mem_info_bytes.find("MemAvailable") != _mem_info_bytes.end()) {
        // MemAvailable = MemFree - LowWaterMark + (PageCache - min(PageCache / 2, LowWaterMark))
        // LowWaterMark = /proc/sys/vm/min_free_kbytes
        // Ref:
        // https://serverfault.com/questions/940196/why-is-memavailable-a-lot-less-than-memfreebufferscached
        // https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=34e431b0ae398fc54ea69ff85ec700722c9da773
        //
        // smaller sys_mem_available_low_water_mark can avoid wasting too much memory.
        _s_sys_mem_available_low_water_mark =
                config::max_sys_mem_available_low_water_mark_bytes != -1
                        ? config::max_sys_mem_available_low_water_mark_bytes
                        : std::min<int64_t>(_s_physical_mem - _s_mem_limit,
                                            int64_t(_s_physical_mem * 0.05));
        _s_sys_mem_available_warning_water_mark = _s_sys_mem_available_low_water_mark * 2;
    }

    std::ifstream sys_transparent_hugepage("/sys/kernel/mm/transparent_hugepage/enabled",
                                           std::ios::in);
    std::string hugepage_enable;
    // If file not exist, getline returns an empty string.
    getline(sys_transparent_hugepage, hugepage_enable);
    if (sys_transparent_hugepage.is_open()) {
        sys_transparent_hugepage.close();
    }
    if (hugepage_enable == "[always] madvise never") {
        std::cout << "[WARNING!] /sys/kernel/mm/transparent_hugepage/enabled: " << hugepage_enable
                  << ", Doris not recommend turning on THP, which may cause the BE process to use "
                     "more memory and cannot be freed in time. Turn off THP: `echo madvise | sudo "
                     "tee /sys/kernel/mm/transparent_hugepage/enabled`"
                  << std::endl;
    }

    // Expect vm overcommit memory value to be 1, system will no longer throw bad_alloc, memory alloc are always accepted,
    // memory limit check is handed over to Doris Allocator, make sure throw exception position is controllable,
    // otherwise bad_alloc can be thrown anywhere and it will be difficult to achieve exception safety.
    std::ifstream sys_vm("/proc/sys/vm/overcommit_memory", std::ios::in);
    std::string vm_overcommit;
    getline(sys_vm, vm_overcommit);
    if (sys_vm.is_open()) {
        sys_vm.close();
    }
    if (!vm_overcommit.empty() && std::stoi(vm_overcommit) == 2) {
        std::cout << "[WARNING!] /proc/sys/vm/overcommit_memory: " << vm_overcommit
                  << ", expect is 1, memory limit check is handed over to Doris Allocator, "
                     "otherwise BE may crash even with remaining memory"
                  << std::endl;
    }

    LOG(INFO) << "Physical Memory: " << _mem_info_bytes["MemTotal"]
              << ", BE Available Physical Memory(consider cgroup): "
              << PrettyPrinter::print(_s_physical_mem, TUnit::BYTES) << ", Mem Limit: "
              << PrettyPrinter::print(_s_mem_limit.load(std::memory_order_relaxed), TUnit::BYTES)
              << ", origin config value: " << config::mem_limit
              << ", System Mem Available Min Reserve: "
              << PrettyPrinter::print(_s_sys_mem_available_low_water_mark, TUnit::BYTES)
              << ", Vm Min Free KBytes: "
              << PrettyPrinter::print(_s_vm_min_free_kbytes, TUnit::BYTES)
              << ", Vm Overcommit Memory: " << vm_overcommit;
    _s_initialized = true;
}
#else
// Apple build: intentionally a no-op. The non-Apple variant above reads /proc/meminfo and the
// cgroup memory controller; this build only samples physical memory once, in init().
void MemInfo::refresh_proc_meminfo() {}

void MemInfo::init() {
    size_t size = sizeof(_s_physical_mem);
    if (sysctlbyname("hw.memsize", &_s_physical_mem, &size, nullptr, 0) != 0) {
        LOG(WARNING) << "Could not determine amount of physical memory on this machine.";
        _s_physical_mem = -1;
    }

    bool is_percent = true;
    _s_mem_limit = ParseUtil::parse_mem_spec(config::mem_limit, -1, _s_physical_mem, &is_percent);
    _s_soft_mem_limit = static_cast<int64_t>(_s_mem_limit * config::soft_mem_limit_frac);

    LOG(INFO) << "Physical Memory: " << PrettyPrinter::print(_s_physical_mem, TUnit::BYTES);
    _s_initialized = true;
}
#endif

// Returns a three-line, human-readable summary: physical memory, memory limit and the cgroup
// controller's own debug string. DCHECKs that init() has already run.
std::string MemInfo::debug_string() {
    DCHECK(_s_initialized);
    std::stringstream stream;
    // '\n' instead of std::endl: flushing an in-memory stream has no effect.
    stream << "Physical Memory: " << PrettyPrinter::print(_s_physical_mem, TUnit::BYTES) << '\n';
    stream << "Memory Limit: " << PrettyPrinter::print(_s_mem_limit, TUnit::BYTES) << '\n';
    stream << "CGroup Info: " << doris::CGroupMemoryCtl::debug_string() << '\n';
    return stream.str();
}

} // namespace doris

Coverage Report

Created: 2025-04-14 12:46

Line	Count	Source (jump to first uncovered line)
1		// Licensed to the Apache Software Foundation (ASF) under one
2		// or more contributor license agreements. See the NOTICE file
3		// distributed with this work for additional information
4		// regarding copyright ownership. The ASF licenses this file
5		// to you under the Apache License, Version 2.0 (the
6		// "License"); you may not use this file except in compliance
7		// with the License. You may obtain a copy of the License at
8		//
9		// http://www.apache.org/licenses/LICENSE-2.0
10		//
11		// Unless required by applicable law or agreed to in writing,
12		// software distributed under the License is distributed on an
13		// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14		// KIND, either express or implied. See the License for the
15		// specific language governing permissions and limitations
16		// under the License.
17		// This file is copied from
18		// https://github.com/apache/impala/blob/branch-2.9.0/be/src/util/mem-info.cc
19		// and modified by Doris
20
21		#include "mem_info.h"
22
23		#include "gutil/strings/split.h"
24
25		#ifdef __APPLE__
26		#include <sys/sysctl.h>
27		#endif
28
29		#include <bvar/bvar.h>
30		#include <fmt/format.h>
31		#include <gen_cpp/Metrics_types.h>
32		#include <gen_cpp/segment_v2.pb.h>
33		#include <jemalloc/jemalloc.h>
34
35		#include <algorithm>
36		#include <boost/algorithm/string/trim.hpp>
37		#include <fstream>
38		#include <unordered_map>
39
40		#include "common/cgroup_memory_ctl.h"
41		#include "common/config.h"
42		#include "common/status.h"
43		#include "runtime/memory/global_memory_arbitrator.h"
44		#include "util/cgroup_util.h"
45		#include "util/parse_util.h"
46		#include "util/pretty_printer.h"
47		#include "util/string_parser.hpp"
48
49		namespace doris {
50
51		static bvar::Adder<int64_t> memory_jemalloc_cache_bytes("memory_jemalloc_cache_bytes");
52		static bvar::Adder<int64_t> memory_jemalloc_dirty_pages_bytes("memory_jemalloc_dirty_pages_bytes");
53		static bvar::Adder<int64_t> memory_jemalloc_metadata_bytes("memory_jemalloc_metadata_bytes");
54		static bvar::Adder<int64_t> memory_jemalloc_virtual_bytes("memory_jemalloc_virtual_bytes");
55		static bvar::Adder<int64_t> memory_cgroup_usage_bytes("memory_cgroup_usage_bytes");
56		static bvar::Adder<int64_t> memory_sys_available_bytes("memory_sys_available_bytes");
57		static bvar::Adder<int64_t> memory_arbitrator_sys_available_bytes(
58		"memory_arbitrator_sys_available_bytes");
59		static bvar::Adder<int64_t> memory_arbitrator_process_usage_bytes(
60		"memory_arbitrator_process_usage_bytes");
61		static bvar::Adder<int64_t> memory_arbitrator_reserve_memory_bytes(
62		"memory_arbitrator_reserve_memory_bytes");
63		static bvar::Adder<int64_t> memory_arbitrator_refresh_interval_growth_bytes(
64		"memory_arbitrator_refresh_interval_growth_bytes");
65
66		bool MemInfo::_s_initialized = false;
67		std::atomic<int64_t> MemInfo::_s_physical_mem = std::numeric_limits<int64_t>::max();
68		std::atomic<int64_t> MemInfo::_s_mem_limit = std::numeric_limits<int64_t>::max();
69		std::atomic<int64_t> MemInfo::_s_soft_mem_limit = std::numeric_limits<int64_t>::max();
70
71		std::atomic<int64_t> MemInfo::_s_allocator_cache_mem = 0;
72		std::atomic<int64_t> MemInfo::_s_allocator_metadata_mem = 0;
73		std::atomic<int64_t> MemInfo::_s_je_dirty_pages_mem = std::numeric_limits<int64_t>::min();
74		std::atomic<int64_t> MemInfo::_s_je_dirty_pages_mem_limit = std::numeric_limits<int64_t>::max();
75		std::atomic<int64_t> MemInfo::_s_virtual_memory_used = 0;
76
77		std::atomic<int64_t> MemInfo::_s_cgroup_mem_limit = std::numeric_limits<int64_t>::max();
78		std::atomic<int64_t> MemInfo::_s_cgroup_mem_usage = std::numeric_limits<int64_t>::min();
79		std::atomic<bool> MemInfo::_s_cgroup_mem_refresh_state = false;
80		int64_t MemInfo::_s_cgroup_mem_refresh_wait_times = 0;
81
82		static std::unordered_map<std::string, int64_t> _mem_info_bytes;
83		std::atomic<int64_t> MemInfo::_s_sys_mem_available = -1;
84		int64_t MemInfo::_s_sys_mem_available_low_water_mark = std::numeric_limits<int64_t>::min();
85		int64_t MemInfo::_s_sys_mem_available_warning_water_mark = std::numeric_limits<int64_t>::min();
86		std::atomic<int64_t> MemInfo::_s_process_minor_gc_size = -1;
87		std::atomic<int64_t> MemInfo::_s_process_full_gc_size = -1;
88		std::mutex MemInfo::je_purge_dirty_pages_lock;
89		std::condition_variable MemInfo::je_purge_dirty_pages_cv;
90		std::atomic<bool> MemInfo::je_purge_dirty_pages_notify {false};
91
92	0	void MemInfo::refresh_allocator_mem() {
93	0	#if defined(ADDRESS_SANITIZER) \|\| defined(LEAK_SANITIZER) \|\| defined(THREAD_SANITIZER)
94		#elif defined(USE_JEMALLOC)
95		// jemalloc mallctl refer to : https://jemalloc.net/jemalloc.3.html
96		// https://www.bookstack.cn/read/aliyun-rds-core/4a0cdf677f62feb3.md
97		// Check the Doris BE web page `http://ip:webserver_port/memory` to get the Jemalloc Profile.
98
99		// 'epoch' is a special mallctl -- it updates the statistics. Without it, all
100		// the following calls will return stale values. It increments and returns
101		// the current epoch number, which might be useful to log as a sanity check.
102		uint64_t epoch = 0;
103		size_t sz = sizeof(epoch);
104		jemallctl("epoch", &epoch, &sz, &epoch, sz);
105
106		// Number of extents of the given type in this arena in the bucket corresponding to page size index.
107		// Large size class starts at 16384, the extents have three sizes before 16384: 4096, 8192, and 12288, so + 3
108		int64_t dirty_pages_bytes = 0;
109		for (unsigned i = 0; i < get_je_unsigned_metrics("arenas.nlextents") + 3; i++) {
110		dirty_pages_bytes += get_je_all_arena_extents_metrics(i, "dirty_bytes");
111		}
112		_s_je_dirty_pages_mem.store(dirty_pages_bytes, std::memory_order_relaxed);
113
114		// Doris uses Jemalloc as default Allocator, Jemalloc Cache consists of two parts:
115		// - Thread Cache, cache a specified number of Pages in Thread Cache.
116		// - Dirty Page, memory Page that can be reused in all Arenas.
117		_s_allocator_cache_mem.store(get_je_all_arena_metrics("tcache_bytes") + dirty_pages_bytes,
118		std::memory_order_relaxed);
119		// Total number of bytes dedicated to metadata, which comprise base allocations used
120		// for bootstrap-sensitive allocator metadata structures.
121		_s_allocator_metadata_mem.store(get_je_metrics("stats.metadata"), std::memory_order_relaxed);
122		_s_virtual_memory_used.store(get_je_metrics("stats.mapped"), std::memory_order_relaxed);
123		#else
124		_s_allocator_cache_mem.store(get_tc_metrics("tcmalloc.pageheap_free_bytes") +
125		get_tc_metrics("tcmalloc.central_cache_free_bytes") +
126		get_tc_metrics("tcmalloc.transfer_cache_free_bytes") +
127		get_tc_metrics("tcmalloc.thread_cache_free_bytes"),
128		std::memory_order_relaxed);
129		_s_virtual_memory_used.store(get_tc_metrics("generic.total_physical_bytes") +
130		get_tc_metrics("tcmalloc.pageheap_unmapped_bytes"),
131		std::memory_order_relaxed);
132		#endif
133	0	}
134
135	0	void MemInfo::refresh_memory_bvar() {
136	0	memory_jemalloc_cache_bytes << MemInfo::allocator_cache_mem() -
137	0	memory_jemalloc_cache_bytes.get_value();
138	0	memory_jemalloc_dirty_pages_bytes
139	0	<< MemInfo::je_dirty_pages_mem() - memory_jemalloc_dirty_pages_bytes.get_value();
140	0	memory_jemalloc_metadata_bytes
141	0	<< MemInfo::allocator_metadata_mem() - memory_jemalloc_metadata_bytes.get_value();
142	0	memory_jemalloc_virtual_bytes << MemInfo::allocator_virtual_mem() -
143	0	memory_jemalloc_virtual_bytes.get_value();
144
145	0	memory_cgroup_usage_bytes << _s_cgroup_mem_usage - memory_cgroup_usage_bytes.get_value();
146	0	memory_sys_available_bytes << _s_sys_mem_available - memory_sys_available_bytes.get_value();
147
148	0	memory_arbitrator_sys_available_bytes
149	0	<< GlobalMemoryArbitrator::sys_mem_available() -
150	0	memory_arbitrator_sys_available_bytes.get_value();
151	0	memory_arbitrator_process_usage_bytes
152	0	<< GlobalMemoryArbitrator::process_memory_usage() -
153	0	memory_arbitrator_process_usage_bytes.get_value();
154	0	memory_arbitrator_reserve_memory_bytes
155	0	<< GlobalMemoryArbitrator::process_reserved_memory() -
156	0	memory_arbitrator_reserve_memory_bytes.get_value();
157	0	memory_arbitrator_refresh_interval_growth_bytes
158	0	<< GlobalMemoryArbitrator::refresh_interval_memory_growth -
159	0	memory_arbitrator_refresh_interval_growth_bytes.get_value();
160	0	}
161
162		#ifndef __APPLE__
163	1	void MemInfo::refresh_proc_meminfo() {
164	1	std::ifstream meminfo("/proc/meminfo", std::ios::in);
165	1	std::string line;
166
167	55	while (meminfo.good() && !meminfo.eof()) {
168	54	getline(meminfo, line);
169	54	std::vector<std::string> fields = strings::Split(line, " ", strings::SkipWhitespace());
170	54	if (fields.size() < 2) {
171	1	continue;
172	1	}
173	53	std::string key = fields[0].substr(0, fields[0].size() - 1);
174
175	53	StringParser::ParseResult result;
176	53	auto mem_value =
177	53	StringParser::string_to_int<int64_t>(fields[1].data(), fields[1].size(), &result);
178
179	53	if (result == StringParser::PARSE_SUCCESS) {
180	53	if (fields.size() == 2) {
181	4	_mem_info_bytes[key] = mem_value;
182	49	} else if (fields[2] == "kB") {
183	49	_mem_info_bytes[key] = mem_value * 1024L;
184	49	}
185	53	}
186	53	}
187	1	if (meminfo.is_open()) {
188	1	meminfo.close();
189	1	}
190
191		// refresh cgroup memory
192	1	if (config::enable_use_cgroup_memory_info) {
193	1	if (_s_cgroup_mem_refresh_wait_times >= 0) {
194	1	int64_t cgroup_mem_limit;
195	1	auto status = CGroupMemoryCtl::find_cgroup_mem_limit(&cgroup_mem_limit);
196	1	if (!status.ok()) {
197	0	_s_cgroup_mem_limit = std::numeric_limits<int64_t>::max();
198		// find cgroup limit failed, wait 300s, 1000 * 100ms.
199	0	_s_cgroup_mem_refresh_wait_times = -3000;
200	0	LOG(WARNING)
201	0	<< "Refresh cgroup memory limit failed, refresh again after 300s, cgroup "
202	0	"mem limit: "
203	0	<< _s_cgroup_mem_limit << ", " << status;
204	1	} else {
205	1	_s_cgroup_mem_limit = cgroup_mem_limit;
206		// wait 10s, 100 * 100ms, avoid too frequently.
207	1	_s_cgroup_mem_refresh_wait_times = -100;
208	1	}
209	1	} else {
210	0	_s_cgroup_mem_refresh_wait_times++;
211	0	}
212
213		// cgroup mem limit is refreshed every 10 seconds,
214		// cgroup mem usage is refreshed together with memInfo every time, which is very frequent.
215	1	if (_s_cgroup_mem_limit != std::numeric_limits<int64_t>::max()) {
216	1	int64_t cgroup_mem_usage;
217	1	auto status = CGroupMemoryCtl::find_cgroup_mem_usage(&cgroup_mem_usage);
218	1	if (!status.ok()) {
219	0	_s_cgroup_mem_usage = std::numeric_limits<int64_t>::min();
220	0	_s_cgroup_mem_refresh_state = false;
221	0	LOG_EVERY_N(WARNING, 500)
222	0	<< "Refresh cgroup memory usage failed, cgroup mem limit: "
223	0	<< _s_cgroup_mem_limit << ", " << status;
224	1	} else {
225	1	_s_cgroup_mem_usage = cgroup_mem_usage;
226	1	_s_cgroup_mem_refresh_state = true;
227	1	}
228	1	} else {
229	0	_s_cgroup_mem_refresh_state = false;
230	0	}
231	1	} else {
232	0	_s_cgroup_mem_refresh_state = false;
233	0	}
234
235		// 1. calculate physical_mem
236	1	int64_t physical_mem = -1;
237
238	1	physical_mem = _mem_info_bytes["MemTotal"];
239	1	if (_s_cgroup_mem_refresh_state) {
240		// In theory, always cgroup_mem_limit < physical_mem
241	1	if (physical_mem < 0) {
242	0	physical_mem = _s_cgroup_mem_limit;
243	1	} else {
244	1	physical_mem =
245	1	std::min(physical_mem, _s_cgroup_mem_limit.load(std::memory_order_relaxed));
246	1	}
247	1	}
248
249	1	if (physical_mem <= 0) {
250	0	LOG(WARNING)
251	0	<< "Could not determine amount of physical memory on this machine, physical_mem: "
252	0	<< physical_mem;
253	0	}
254
255		// 2. if physical_mem changed, refresh mem limit and gc size.
256	1	if (physical_mem > 0 && _s_physical_mem.load(std::memory_order_relaxed) != physical_mem) {
257	1	_s_physical_mem.store(physical_mem);
258
259	1	bool is_percent = true;
260	1	_s_mem_limit.store(
261	1	ParseUtil::parse_mem_spec(config::mem_limit, -1, _s_physical_mem, &is_percent));
262	1	if (_s_mem_limit <= 0) {
263	0	LOG(WARNING) << "Failed to parse mem limit from '" + config::mem_limit + "'.";
264	0	}
265	1	if (_s_mem_limit > _s_physical_mem) {
266	0	LOG(WARNING) << "Memory limit " << PrettyPrinter::print(_s_mem_limit, TUnit::BYTES)
267	0	<< " exceeds physical memory of "
268	0	<< PrettyPrinter::print(_s_physical_mem, TUnit::BYTES)
269	0	<< ". Using physical memory instead";
270	0	_s_mem_limit.store(_s_physical_mem);
271	0	}
272	1	_s_soft_mem_limit.store(int64_t(_s_mem_limit * config::soft_mem_limit_frac));
273
274	1	_s_process_minor_gc_size.store(ParseUtil::parse_mem_spec(config::process_minor_gc_size, -1,
275	1	_s_mem_limit, &is_percent));
276	1	_s_process_full_gc_size.store(ParseUtil::parse_mem_spec(config::process_full_gc_size, -1,
277	1	_s_mem_limit, &is_percent));
278	1	_s_je_dirty_pages_mem_limit.store(ParseUtil::parse_mem_spec(
279	1	config::je_dirty_pages_mem_limit_percent, -1, _s_mem_limit, &is_percent));
280	1	}
281
282		// 3. refresh process available memory
283	1	int64_t mem_available = -1;
284	1	if (_mem_info_bytes.find("MemAvailable") != _mem_info_bytes.end()) {
285	1	mem_available = _mem_info_bytes["MemAvailable"];
286	1	}
287	1	if (_s_cgroup_mem_refresh_state) {
288		// Note, CgroupV2 MemAvailable is usually a little smaller than Process MemAvailable.
289		// Process `MemAvailable = MemFree - LowWaterMark + (PageCache - min(PageCache / 2, LowWaterMark))`,
290		// from `MemAvailable` in `/proc/meminfo`, calculated by OS.
291		// CgroupV2 `MemAvailable = cgroup_mem_limit - cgroup_mem_usage`,
292		// `cgroup_mem_usage = memory.current - inactive_file - slab_reclaimable`, in fact,
293		// there seems to be some memory that can be reused in `cgroup_mem_usage`.
294	1	if (mem_available < 0) {
295	0	mem_available = _s_cgroup_mem_limit - _s_cgroup_mem_usage;
296	1	} else {
297	1	mem_available = std::min(mem_available, _s_cgroup_mem_limit - _s_cgroup_mem_usage);
298	1	}
299	1	}
300	1	if (mem_available < 0) {
301	0	LOG(WARNING) << "Failed to get available memory, set MAX_INT.";
302	0	mem_available = std::numeric_limits<int64_t>::max();
303	0	}
304	1	if (_s_sys_mem_available.load(std::memory_order_relaxed) != mem_available) {
305	1	_s_sys_mem_available.store(mem_available);
306	1	}
307	1	}
308
309	1	void MemInfo::init() {
310	1	refresh_proc_meminfo();
311
312	1	std::string line;
313	1	int64_t _s_vm_min_free_kbytes = 0;
314	1	std::ifstream vminfo("/proc/sys/vm/min_free_kbytes", std::ios::in);
315	1	if (vminfo.good() && !vminfo.eof()) {
316	1	getline(vminfo, line);
317	1	boost::algorithm::trim(line);
318	1	StringParser::ParseResult result;
319	1	auto mem_value = StringParser::string_to_int<int64_t>(line.data(), line.size(), &result);
320
321	1	if (result == StringParser::PARSE_SUCCESS) {
322	1	_s_vm_min_free_kbytes = mem_value * 1024L;
323	1	}
324	1	}
325	1	if (vminfo.is_open()) {
326	1	vminfo.close();
327	1	}
328
329		// Redhat 4.x OS, `/proc/meminfo` has no `MemAvailable`.
330	1	if (_mem_info_bytes.find("MemAvailable") != _mem_info_bytes.end()) {
331		// MemAvailable = MemFree - LowWaterMark + (PageCache - min(PageCache / 2, LowWaterMark))
332		// LowWaterMark = /proc/sys/vm/min_free_kbytes
333		// Ref:
334		// https://serverfault.com/questions/940196/why-is-memavailable-a-lot-less-than-memfreebufferscached
335		// https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=34e431b0ae398fc54ea69ff85ec700722c9da773
336		//
337		// smaller sys_mem_available_low_water_mark can avoid wasting too much memory.
338	1	_s_sys_mem_available_low_water_mark =
339	1	config::max_sys_mem_available_low_water_mark_bytes != -1
340	1	? config::max_sys_mem_available_low_water_mark_bytes
341	1	: std::min<int64_t>(_s_physical_mem - _s_mem_limit,
342	1	int64_t(_s_physical_mem * 0.05));
343	1	_s_sys_mem_available_warning_water_mark = _s_sys_mem_available_low_water_mark * 2;
344	1	}
345
346	1	std::ifstream sys_transparent_hugepage("/sys/kernel/mm/transparent_hugepage/enabled",
347	1	std::ios::in);
348	1	std::string hugepage_enable;
349		// If file not exist, getline returns an empty string.
350	1	getline(sys_transparent_hugepage, hugepage_enable);
351	1	if (sys_transparent_hugepage.is_open()) {
352	1	sys_transparent_hugepage.close();
353	1	}
354	1	if (hugepage_enable == "[always] madvise never") {
355	0	std::cout << "[WARNING!] /sys/kernel/mm/transparent_hugepage/enabled: " << hugepage_enable
356	0	<< ", Doris not recommend turning on THP, which may cause the BE process to use "
357	0	"more memory and cannot be freed in time. Turn off THP: `echo madvise \| sudo "
358	0	"tee /sys/kernel/mm/transparent_hugepage/enabled`"
359	0	<< std::endl;
360	0	}
361
362		// Expect vm overcommit memory value to be 1, system will no longer throw bad_alloc, memory alloc are always accepted,
363		// memory limit check is handed over to Doris Allocator, make sure throw exception position is controllable,
364		// otherwise bad_alloc can be thrown anywhere and it will be difficult to achieve exception safety.
365	1	std::ifstream sys_vm("/proc/sys/vm/overcommit_memory", std::ios::in);
366	1	std::string vm_overcommit;
367	1	getline(sys_vm, vm_overcommit);
368	1	if (sys_vm.is_open()) {
369	1	sys_vm.close();
370	1	}
371	1	if (!vm_overcommit.empty() && std::stoi(vm_overcommit) == 2) {
372	0	std::cout << "[WARNING!] /proc/sys/vm/overcommit_memory: " << vm_overcommit
373	0	<< ", expect is 1, memory limit check is handed over to Doris Allocator, "
374	0	"otherwise BE may crash even with remaining memory"
375	0	<< std::endl;
376	0	}
377
378	1	LOG(INFO) << "Physical Memory: " << _mem_info_bytes["MemTotal"]
379	1	<< ", BE Available Physical Memory(consider cgroup): "
380	1	<< PrettyPrinter::print(_s_physical_mem, TUnit::BYTES) << ", Mem Limit: "
381	1	<< PrettyPrinter::print(_s_mem_limit.load(std::memory_order_relaxed), TUnit::BYTES)
382	1	<< ", origin config value: " << config::mem_limit
383	1	<< ", System Mem Available Min Reserve: "
384	1	<< PrettyPrinter::print(_s_sys_mem_available_low_water_mark, TUnit::BYTES)
385	1	<< ", Vm Min Free KBytes: "
386	1	<< PrettyPrinter::print(_s_vm_min_free_kbytes, TUnit::BYTES)
387	1	<< ", Vm Overcommit Memory: " << vm_overcommit;
388	1	_s_initialized = true;
389	1	}
390		#else
391		void MemInfo::refresh_proc_meminfo() {}
392
393		void MemInfo::init() {
394		size_t size = sizeof(_s_physical_mem);
395		if (sysctlbyname("hw.memsize", &_s_physical_mem, &size, nullptr, 0) != 0) {
396		LOG(WARNING) << "Could not determine amount of physical memory on this machine.";
397		_s_physical_mem = -1;
398		}
399
400		bool is_percent = true;
401		_s_mem_limit = ParseUtil::parse_mem_spec(config::mem_limit, -1, _s_physical_mem, &is_percent);
402		_s_soft_mem_limit = static_cast<int64_t>(_s_mem_limit * config::soft_mem_limit_frac);
403
404		LOG(INFO) << "Physical Memory: " << PrettyPrinter::print(_s_physical_mem, TUnit::BYTES);
405		_s_initialized = true;
406		}
407		#endif
408
409	0	std::string MemInfo::debug_string() {
410	0	DCHECK(_s_initialized);
411	0	std::stringstream stream;
412	0	stream << "Physical Memory: " << PrettyPrinter::print(_s_physical_mem, TUnit::BYTES)
413	0	<< std::endl;
414	0	stream << "Memory Limt: " << PrettyPrinter::print(_s_mem_limit, TUnit::BYTES) << std::endl;
415	0	stream << "CGroup Info: " << doris::CGroupMemoryCtl::debug_string() << std::endl;
416	0	return stream.str();
417	0	}
418
419		} // namespace doris