Purpose
The reason for digging into how Linux charges memory is a recent report from a business team: according to our monitoring, a container's memory usage had reached 150% before it was OOM-killed. The metric we monitor is container_memory_usage_bytes - container_memory_cache, yet the kernel logs showed that the memory actually in use at OOM time had not exceeded the limit. Understanding how the kernel charges memory to a cgroup helps us build more accurate monitoring. The analysis below is based on kernel 5.4.
Starting from the page fault
Linux uses virtual memory management: every process has its own virtual address space, and what gets allocated and freed is virtual memory. When the CPU reads or writes a virtual address that has no physical page behind it, a page fault is triggered and a series of functions is called to allocate physical memory. Linux memory management is far too big to cover in one or two articles, so we start directly from the page fault and focus on the user-space anonymous-page path. The key flow is as follows:
- A page fault raises an exception that lands in __do_page_fault, which decides whether the fault happened in kernel or user space. This article only looks at user-space faults, since most of the memory consumed inside a cgroup is user-space memory, so the path continues into do_user_addr_fault.
- do_user_addr_fault calls find_vma to look up the vm_area_struct covering the faulting address (these areas are the familiar divisions of a process's virtual address space: text, data, bss, heap, stack, and so on), and then calls handle_mm_fault to map that area.
- Current kernels use up to five levels of page tables: PGD, P4D, PUD, PMD and PTE. __handle_mm_fault first checks whether the PGD, P4D, PUD and PMD entries exist; if not, it creates the corresponding page-directory entries, returning OOM on failure. On success it calls handle_pte_fault to set up the page-table entry.
- handle_pte_fault examines the pte: if it is null, the address has never been mapped, and depending on what kind of memory user space asked for, the fault is handled either as an anonymous page (do_anonymous_page) or as a file-backed page (do_fault; file mappings end up with the same charging logic as anonymous pages, so the analysis below concentrates on anonymous pages). If the pte existed before, the page was swapped out to disk and do_swap_page swaps it back in; since swap is disabled in our setup, we ignore that path as well.
static vm_fault_t handle_pte_fault(struct vm_fault *vmf)
{
pte_t entry;
if (unlikely(pmd_none(*vmf->pmd))) {
......
vmf->pte = NULL;
} else {
/* See comment in pte_alloc_one_map() */
if (pmd_devmap_trans_unstable(vmf->pmd))
return 0;
......
vmf->pte = pte_offset_map(vmf->pmd, vmf->address);
vmf->orig_pte = *vmf->pte;
......
barrier();
if (pte_none(vmf->orig_pte)) {
pte_unmap(vmf->pte);
vmf->pte = NULL;
}
}
if (!vmf->pte) {
if (vma_is_anonymous(vmf->vma))
return do_anonymous_page(vmf);
else
return do_fault(vmf);
}
if (!pte_present(vmf->orig_pte))
return do_swap_page(vmf);
if (pte_protnone(vmf->orig_pte) && vma_is_accessible(vmf->vma))
return do_numa_page(vmf);
vmf->ptl = pte_lockptr(vmf->vma->vm_mm, vmf->pmd);
spin_lock(vmf->ptl);
entry = vmf->orig_pte;
if (unlikely(!pte_same(*vmf->pte, entry)))
goto unlock;
if (vmf->flags & FAULT_FLAG_WRITE) {
if (!pte_write(entry))
return do_wp_page(vmf);
entry = pte_mkdirty(entry);
}
......
return 0;
}
- do_anonymous_page first makes sure a page table exists (pte_alloc); if that fails it returns OOM straight away. It then calls alloc_zeroed_user_highpage_movable to allocate a physical page, again returning OOM on failure. Next, mem_cgroup_try_charge_delay charges the page to the cgroup; if the cgroup limit is exceeded, the freshly allocated page is freed first and the function returns OOM (the next section analyzes this in detail). After that, mk_pte builds the page-table entry for the physical page, page_add_new_anon_rmap adds the reverse mapping from the pte to the anonymous page, mem_cgroup_commit_charge commits the cgroup charge and updates the per-type statistics, lru_cache_add_active_or_unevictable puts the page on the active LRU list of its zone (or on the unevictable list if it cannot be reclaimed), and set_pte_at finally installs the entry in the page table. A small user-space example that drives exactly this path follows the function.
static vm_fault_t do_anonymous_page(struct vm_fault *vmf)
{
......
if (pte_alloc(vma->vm_mm, vmf->pmd))
return VM_FAULT_OOM;
......
/* Allocate our own private page. */
if (unlikely(anon_vma_prepare(vma)))
goto oom;
page = alloc_zeroed_user_highpage_movable(vma, vmf->address);
if (!page)
goto oom;
if (mem_cgroup_try_charge_delay(page, vma->vm_mm, GFP_KERNEL, &memcg,
false))
goto oom_free_page;
......
entry = mk_pte(page, vma->vm_page_prot);
if (vma->vm_flags & VM_WRITE)
entry = pte_mkwrite(pte_mkdirty(entry));
vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd, vmf->address,
&vmf->ptl);
if (!pte_none(*vmf->pte))
goto release;
ret = check_stable_address_space(vma->vm_mm);
if (ret)
goto release;
/* Deliver the page fault to userland, check inside PT lock */
if (userfaultfd_missing(vma)) {
pte_unmap_unlock(vmf->pte, vmf->ptl);
mem_cgroup_cancel_charge(page, memcg, false);
put_page(page);
return handle_userfault(vmf, VM_UFFD_MISSING);
}
inc_mm_counter_fast(vma->vm_mm, MM_ANONPAGES);
page_add_new_anon_rmap(page, vma, vmf->address, false);
mem_cgroup_commit_charge(page, memcg, false, false);
lru_cache_add_active_or_unevictable(page, vma);
setpte:
set_pte_at(vma->vm_mm, vmf->address, vmf->pte, entry);
/* No need to invalidate - it was non-present before */
update_mmu_cache(vma, vmf->address, vmf->pte);
unlock:
pte_unmap_unlock(vmf->pte, vmf->ptl);
return ret;
release:
mem_cgroup_cancel_charge(page, memcg, false);
put_page(page);
goto unlock;
oom_free_page:
put_page(page);
oom:
return VM_FAULT_OOM;
}
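To tie this back to user space: mmap() with MAP_ANONYMOUS only creates the VMA; it is the first write to each page that takes the fault, runs do_anonymous_page, and charges the page to the task's memcg. A minimal illustration using only standard POSIX calls (nothing kernel-specific assumed):
#include <stdio.h>
#include <string.h>
#include <sys/mman.h>

int main(void)
{
	size_t len = 64 << 20;	/* 64 MiB of virtual address space, nothing charged yet */
	char *p = mmap(NULL, len, PROT_READ | PROT_WRITE,
		       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

	if (p == MAP_FAILED) {
		perror("mmap");
		return 1;
	}
	/* each page written here faults once, runs do_anonymous_page and bumps
	 * the memcg usage counter; watch memory.usage_in_bytes grow while it runs */
	memset(p, 1, len);
	printf("dirtied %zu bytes of anonymous memory\n", len);
	munmap(p, len);
	return 0;
}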
cgroup charging
The kernel's memory-cgroup documentation lays out the relationship between the main resource objects; the key structures are:
- struct mem_cgroup: everything belonging to a memory cgroup lives in this structure, and the resources themselves are accounted through the embedded page_counter objects (the kernel comment quoted further down mentions Rik Van Riel's clock-pro statistics only as a hoped-for future addition).
struct mem_cgroup {
struct cgroup_subsys_state css;
/* Private memcg ID. Used to ID objects that outlive the cgroup */
struct mem_cgroup_id id;
/* Accounted resources */
struct page_counter memory;
struct page_counter swap;
/* Legacy consumer-oriented counters */
struct page_counter memsw;
struct page_counter kmem;
struct page_counter tcpmem;
/* Upper bound of normal memory consumption range */
unsigned long high;
/* Range enforcement for interrupt charges */
struct work_struct high_work;
unsigned long soft_limit;
/* vmpressure notifications */
struct vmpressure vmpressure;
/*
* Should the accounting and control be hierarchical, per subtree?
*/
bool use_hierarchy;
/*
* Should the OOM killer kill all belonging tasks, had it kill one?
*/
bool oom_group;
- page_counter结构如下
struct page_counter {
atomic_long_t usage; // pages currently charged to this cgroup
unsigned long min;
unsigned long low;
unsigned long max; // the memory limit configured for the cgroup (limit_in_bytes, in pages)
struct page_counter *parent; // parent counter in the cgroup hierarchy
/* effective memory.min and memory.min usage tracking */
unsigned long emin;
atomic_long_t min_usage;
atomic_long_t children_min_usage;
/* effective memory.low and memory.low usage tracking */
unsigned long elow;
atomic_long_t low_usage;
atomic_long_t children_low_usage;
/* legacy */
unsigned long watermark; // historical peak usage
unsigned long failcnt; // number of times a charge failed because the limit was hit
};
- mm_struct, mem_cgroup, page and page_cgroup can all be linked to one another through the corresponding structures (a short sketch of these links follows the kernel comment below).
- The memory cgroup only accounts RSS and page cache, as the kernel's own comment on struct mem_cgroup says:
/*
* The memory controller data structure. The memory controller controls both
* page cache and RSS per cgroup. We would eventually like to provide
* statistics based on the statistics developed by Rik Van Riel for clock-pro,
* to help the administrator determine what knobs to tune.
*/
struct mem_cgroup {
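An abridged sketch of how these structures point at each other in 5.4: the real definitions live in include/linux/sched.h, mm_types.h and memcontrol.h, the field lists here are trimmed to the pointers relevant for charging, and in 5.4 the page-to-memcg link is the mem_cgroup pointer embedded in struct page (the separate page_cgroup structure is gone). The helpers that walk these links, get_mem_cgroup_from_mm() and mem_cgroup_from_task(), live in mm/memcontrol.c.
/* abridged illustration, not the full kernel definitions */
struct css_set;
struct mem_cgroup;

struct task_struct {
	struct css_set *cgroups;	/* task -> css_set -> mem_cgroup (mem_cgroup_from_task) */
};

struct mm_struct {
	struct task_struct *owner;	/* mm -> owning task (get_mem_cgroup_from_mm) */
};

struct page {
	struct mem_cgroup *mem_cgroup;	/* set when the page is charged, cleared on uncharge */
};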
- The main page_counter charging functions:
- The accounting itself is simply usage += PAGE_SIZE for each charged page, propagated up the hierarchy (see the sketch after the __memcg_kmem_charge_memcg excerpt below).
- mem_cgroup_try_charge(): charge and check whether the memory limit is exceeded.
- mem_cgroup_uncharge(): cancel a charge; for example, when the limit is exceeded the earlier charge must be undone before going to OOM.
- mem_cgroup_commit_charge(): commit the charge and update the rss / page cache statistics.
- add_to_page_cache_locked(): where page cache pages are charged.
- __memcg_kmem_charge_memcg(): where kernel-allocated memory is charged.
int __memcg_kmem_charge_memcg(struct page *page, gfp_t gfp, int order,
struct mem_cgroup *memcg)
{
unsigned int nr_pages = 1 << order;
struct page_counter *counter;
int ret;
ret = try_charge(memcg, gfp, nr_pages); // kernel allocations also go through try_charge, so kernel memory is counted in memcg->memory as well
if (ret)
return ret;
if (!cgroup_subsys_on_dfl(memory_cgrp_subsys) &&
!page_counter_try_charge(&memcg->kmem, nr_pages, &counter)) { // kernel memory is additionally tracked in the separate kmem counter
......
}
return 0;
}
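The unconditional variant of this bookkeeping, used on the force path of try_charge, simply walks up the parent hierarchy adding nr_pages to each level's usage and tracking the peak. Below is a user-space model of that loop; the structure and function names are illustrative (they mirror struct page_counter and page_counter_charge in mm/page_counter.c), and details such as protected-usage propagation are left out.
#include <stdatomic.h>

struct counter_model {
	atomic_long usage;		/* pages charged to this level */
	long watermark;			/* historical peak usage */
	struct counter_model *parent;	/* cgroup hierarchy */
};

/* usage += nr_pages at every level from the charged cgroup up to the root */
static void charge(struct counter_model *counter, long nr_pages)
{
	struct counter_model *c;

	for (c = counter; c; c = c->parent) {
		long new = atomic_fetch_add(&c->usage, nr_pages) + nr_pages;

		if (new > c->watermark)	/* racy, as in the kernel, but good enough */
			c->watermark = new;
	}
}

int main(void)
{
	struct counter_model root = { 0 }, child = { .parent = &root };

	charge(&child, 32);	/* one batch of 32 pages shows up at both levels */
	return 0;
}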
cgroup memory control
We use anonymous pages as the example; file-backed mappings are accounted the same way, and swap is not involved here.
- Setting limit_in_bytes
Writing limit_in_bytes goes through mem_cgroup_write -> mem_cgroup_resize_max -> page_counter_set_max, as shown below; a small user-space example follows the kernel excerpt.
static int mem_cgroup_resize_max(struct mem_cgroup *memcg,
unsigned long max, bool memsw)
{
struct page_counter *counter = memsw ? &memcg->memsw : &memcg->memory;
do {
......
limits_invariant = memsw ? max >= memcg->memory.max :
max <= memcg->memsw.max;
if (!limits_invariant) {
mutex_unlock(&memcg_max_mutex);
ret = -EINVAL;
break;
}
if (max > counter->max)
enlarge = true;
ret = page_counter_set_max(counter, max); // update the limit (limit_in_bytes)
......
return ret;
}
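From user space this path is triggered simply by writing a byte count into the cgroup's memory.limit_in_bytes file (cgroup v1 interface). A minimal sketch; the cgroup directory "demo" is a made-up example, substitute the real group:
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

int main(void)
{
	/* assumed example path; replace "demo" with the actual cgroup */
	const char *path = "/sys/fs/cgroup/memory/demo/memory.limit_in_bytes";
	const char *limit = "1073741824\n";	/* 1 GiB */
	int fd = open(path, O_WRONLY);

	if (fd < 0) {
		perror("open");
		return 1;
	}
	if (write(fd, limit, strlen(limit)) != (ssize_t)strlen(limit))
		perror("write");
	close(fd);
	return 0;
}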
- Reading the memory usage state of a mem cgroup always goes through memory_stat_show -> memory_stat_format. When memory is charged, the kernel records which category it belongs to: kernel or user space, anonymous or file-backed, reclaimable or unreclaimable, and so on, and updates the corresponding counters.
static char *memory_stat_format(struct mem_cgroup *memcg)
{
struct seq_buf s;
int i;
seq_buf_init(&s, kmalloc(PAGE_SIZE, GFP_KERNEL), PAGE_SIZE);
if (!s.buffer)
return NULL;
/*
* Provide statistics on the state of the memory subsystem as
* well as cumulative event counters that show past behavior.
*
* This list is ordered following a combination of these gradients:
* 1) generic big picture -> specifics and details
* 2) reflecting userspace activity -> reflecting kernel heuristics
*
* Current memory state:
*/
seq_buf_printf(&s, "anon %llu\n",
(u64)memcg_page_state(memcg, MEMCG_RSS) *
PAGE_SIZE);
seq_buf_printf(&s, "file %llu\n",
(u64)memcg_page_state(memcg, MEMCG_CACHE) *
PAGE_SIZE);
seq_buf_printf(&s, "kernel_stack %llu\n",
(u64)memcg_page_state(memcg, MEMCG_KERNEL_STACK_KB) *
1024);
seq_buf_printf(&s, "slab %llu\n",
(u64)(memcg_page_state(memcg, NR_SLAB_RECLAIMABLE) +
memcg_page_state(memcg, NR_SLAB_UNRECLAIMABLE)) *
PAGE_SIZE);
seq_buf_printf(&s, "sock %llu\n",
(u64)memcg_page_state(memcg, MEMCG_SOCK) *
PAGE_SIZE);
seq_buf_printf(&s, "shmem %llu\n",
(u64)memcg_page_state(memcg, NR_SHMEM) *
PAGE_SIZE);
seq_buf_printf(&s, "file_mapped %llu\n",
(u64)memcg_page_state(memcg, NR_FILE_MAPPED) *
PAGE_SIZE);
seq_buf_printf(&s, "file_dirty %llu\n",
(u64)memcg_page_state(memcg, NR_FILE_DIRTY) *
PAGE_SIZE);
seq_buf_printf(&s, "file_writeback %llu\n",
(u64)memcg_page_state(memcg, NR_WRITEBACK) *
PAGE_SIZE);
/*
* TODO: We should eventually replace our own MEMCG_RSS_HUGE counter
* with the NR_ANON_THP vm counter, but right now it's a pain in the
* arse because it requires migrating the work out of rmap to a place
* where the page->mem_cgroup is set up and stable.
*/
seq_buf_printf(&s, "anon_thp %llu\n",
(u64)memcg_page_state(memcg, MEMCG_RSS_HUGE) *
PAGE_SIZE);
for (i = 0; i < NR_LRU_LISTS; i++)
seq_buf_printf(&s, "%s %llu\n", mem_cgroup_lru_names[i],
(u64)memcg_page_state(memcg, NR_LRU_BASE + i) *
PAGE_SIZE);
seq_buf_printf(&s, "slab_reclaimable %llu\n",
(u64)memcg_page_state(memcg, NR_SLAB_RECLAIMABLE) *
PAGE_SIZE);
seq_buf_printf(&s, "slab_unreclaimable %llu\n",
(u64)memcg_page_state(memcg, NR_SLAB_UNRECLAIMABLE) *
PAGE_SIZE);
/* Accumulated memory events */
seq_buf_printf(&s, "pgfault %lu\n", memcg_events(memcg, PGFAULT));
seq_buf_printf(&s, "pgmajfault %lu\n", memcg_events(memcg, PGMAJFAULT));
seq_buf_printf(&s, "workingset_refault %lu\n",
memcg_page_state(memcg, WORKINGSET_REFAULT));
seq_buf_printf(&s, "workingset_activate %lu\n",
memcg_page_state(memcg, WORKINGSET_ACTIVATE));
seq_buf_printf(&s, "workingset_nodereclaim %lu\n",
memcg_page_state(memcg, WORKINGSET_NODERECLAIM));
seq_buf_printf(&s, "pgrefill %lu\n", memcg_events(memcg, PGREFILL));
seq_buf_printf(&s, "pgscan %lu\n",
memcg_events(memcg, PGSCAN_KSWAPD) +
memcg_events(memcg, PGSCAN_DIRECT));
seq_buf_printf(&s, "pgsteal %lu\n",
memcg_events(memcg, PGSTEAL_KSWAPD) +
memcg_events(memcg, PGSTEAL_DIRECT));
seq_buf_printf(&s, "pgactivate %lu\n", memcg_events(memcg, PGACTIVATE));
seq_buf_printf(&s, "pgdeactivate %lu\n", memcg_events(memcg, PGDEACTIVATE));
seq_buf_printf(&s, "pglazyfree %lu\n", memcg_events(memcg, PGLAZYFREE));
seq_buf_printf(&s, "pglazyfreed %lu\n", memcg_events(memcg, PGLAZYFREED));
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
seq_buf_printf(&s, "thp_fault_alloc %lu\n",
memcg_events(memcg, THP_FAULT_ALLOC));
seq_buf_printf(&s, "thp_collapse_alloc %lu\n",
memcg_events(memcg, THP_COLLAPSE_ALLOC));
#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
/* The above should easily fit into one page */
WARN_ON_ONCE(seq_buf_has_overflowed(&s));
return s.buffer;
}
- As noted above, once do_anonymous_page has allocated a page it calls mem_cgroup_try_charge_delay to charge it: mem_cgroup_try_charge_delay -> mem_cgroup_try_charge -> try_charge -> page_counter_try_charge. The core logic: if the charge would exceed the memory limit, the kernel attempts memory reclaim up to 5 times; if all attempts fail, it bails out and returns an error.
static int try_charge(struct mem_cgroup *memcg, gfp_t gfp_mask,
unsigned int nr_pages)
{
unsigned int batch = max(MEMCG_CHARGE_BATCH, nr_pages); // charge in batches of at least 32 pages (MEMCG_CHARGE_BATCH)
int nr_retries = MEM_CGROUP_RECLAIM_RETRIES;
if (mem_cgroup_is_root(memcg)) // the root cgroup is not charged
return 0;
retry:
if (consume_stock(memcg, nr_pages)) // use pre-charged pages left in the per-cpu stock from an earlier batch; if enough remain, return immediately
return 0;
if (!do_memsw_account() ||
page_counter_try_charge(&memcg->memsw, batch, &counter)) { // memsw accounting is off in our setup (swap disabled)
if (page_counter_try_charge(&memcg->memory, batch, &counter)) // charge the memory usage counter
goto done_restock;
if (do_memsw_account())
page_counter_uncharge(&memcg->memsw, batch);
mem_over_limit = mem_cgroup_from_counter(counter, memory);
} else {
mem_over_limit = mem_cgroup_from_counter(counter, memsw);
may_swap = false;
}
if (batch > nr_pages) {
batch = nr_pages;
goto retry;
}
if (gfp_mask & __GFP_ATOMIC)
goto force;
/*
* Unlike in global OOM situations, memcg is not in a physical
* memory shortage. Allow dying and OOM-killed tasks to
* bypass the last charges so that they can exit quickly and
* free their memory.
*/
if (unlikely(should_force_charge())) // if the task is already being OOM-killed or is exiting, force the charge so it can exit quickly
goto force;
......
nr_reclaimed = try_to_free_mem_cgroup_pages(mem_over_limit, nr_pages,
gfp_mask, may_swap); // over the limit: try to reclaim memory from this memcg
......
if (mem_cgroup_wait_acct_move(mem_over_limit))
goto retry;
if (nr_retries--) // up to 5 reclaim retries (MEM_CGROUP_RECLAIM_RETRIES)
goto retry;
if (gfp_mask & __GFP_RETRY_MAYFAIL)
goto nomem;
if (gfp_mask & __GFP_NOFAIL)
goto force;
if (fatal_signal_pending(current))
goto force;
......
nomem:
if (!(gfp_mask & __GFP_NOFAIL))
return -ENOMEM;
force:
/*
* The allocation either can't fail or will lead to more memory
* being freed very soon. Allow memory usage go over the limit
* temporarily by force charging it.
*/
page_counter_charge(&memcg->memory, nr_pages);
if (do_memsw_account())
page_counter_charge(&memcg->memsw, nr_pages);
css_get_many(&memcg->css, nr_pages);
return 0;
done_restock:
css_get_many(&memcg->css, batch);
if (batch > nr_pages)
refill_stock(memcg, batch - nr_pages);
......
do {
if (page_counter_read(&memcg->memory) > memcg->high) {
/* Don't bother a random interrupted task */
if (in_interrupt()) {
schedule_work(&memcg->high_work);
break;
}
current->memcg_nr_pages_over_high += batch;
set_notify_resume(current);
break;
}
} while ((memcg = parent_mem_cgroup(memcg)));
return 0;
}
page_counter_try_charge is straightforward: if usage would exceed c->max (limit_in_bytes), it increments the failure counter failcnt, rolls the charge back, and returns false; if the limit is not exceeded it returns true.
bool page_counter_try_charge(struct page_counter *counter,
unsigned long nr_pages,
struct page_counter **fail)
{
struct page_counter *c;
for (c = counter; c; c = c->parent) {
long new;
new = atomic_long_add_return(nr_pages, &c->usage);
if (new > c->max) {
atomic_long_sub(nr_pages, &c->usage);
propagate_protected_usage(counter, new);
......
c->failcnt++;
*fail = c;
goto failed;
}
propagate_protected_usage(counter, new);
......
if (new > c->watermark)
c->watermark = new;
}
return true;
failed:
for (c = counter; c != *fail; c = c->parent)
page_counter_cancel(c, nr_pages);
return false;
}
- Setting the page type (which LRU list the page is added to: inactive_anon or active_anon)
mem_cgroup_commit_charge->commit_charge->unlock_page_lru->add_page_to_lru_list
static void unlock_page_lru(struct page *page, int isolated)
{
......
if (isolated) {
......
add_page_to_lru_list(page, lruvec, page_lru(page));
}
spin_unlock_irq(&pgdat->lru_lock);
}
/**
* page_lru - which LRU list should a page be on?
* @page: the page to test
*
* Returns the LRU list a page should be on, as an index
* into the array of LRU lists.
*/
static __always_inline enum lru_list page_lru(struct page *page) // returns the index of the LRU list this page belongs on, so it can be added to the right list
{
enum lru_list lru;
if (PageUnevictable(page))
lru = LRU_UNEVICTABLE;
else {
lru = page_lru_base_type(page);
if (PageActive(page))
lru += LRU_ACTIVE;
}
return lru;
}
- cgroup memory reclaim
Proactive reclaim is the job of the kswapd kernel thread: when memory gets tight, kswapd calls shrink_node to reclaim memory, and this works the same way whether the pressure is inside a cgroup or global. Proactive slab reclaim is driven by the per-cpu reap_work work queues, which ultimately end up in shrink_slab.
Reactive reclaim happens when the cgroup memory limit is exceeded: reclaim then runs through try_to_free_mem_cgroup_pages -> do_try_to_free_pages.
static unsigned long do_try_to_free_pages(struct zonelist *zonelist,
struct scan_control *sc)
{
int initial_priority = sc->priority;
pg_data_t *last_pgdat;
struct zoneref *z;
struct zone *zone;
retry:
delayacct_freepages_start();
if (global_reclaim(sc))
__count_zid_vm_events(ALLOCSTALL, sc->reclaim_idx, 1);
do {
vmpressure_prio(sc->gfp_mask, sc->target_mem_cgroup,
sc->priority);
sc->nr_scanned = 0;
shrink_zones(zonelist, sc); // the main reclaim logic
if (sc->nr_reclaimed >= sc->nr_to_reclaim) // stop once the number of reclaimed pages reaches the reclaim target
break;
if (sc->compaction_ready)
break;
/*
* If we're getting trouble reclaiming, start doing
* writepage even in laptop mode.
*/
if (sc->priority < DEF_PRIORITY - 2)
sc->may_writepage = 1;
} while (--sc->priority >= 0);
......
return 0;
}
What gets reclaimed
- The LRU lists hold five kinds of pages: active and inactive anonymous, active and inactive file-backed, and unevictable. With swap disabled, anonymous pages are never reclaimed, so only the active and inactive file cache is reclaimed from the LRU. The main call chain is do_try_to_free_pages -> shrink_zones -> shrink_node -> shrink_node_memcg (the per-NUMA-node reclaim function for a cgroup).
static const char *const mem_cgroup_lru_names[] = {
"inactive_anon",
"active_anon",
"inactive_file",
"active_file",
"unevictable",
};
/*
 * This is a basic per-node page freer. Used by both kswapd and direct reclaim.
 */
static void shrink_node_memcg(struct pglist_data *pgdat, struct mem_cgroup *memcg,
struct scan_control *sc, unsigned long *lru_pages)
{
......
get_scan_count(lruvec, memcg, sc, nr, lru_pages); // work out how many pages from each LRU list should be scanned for reclaim
/* Record the original scan target for proportional adjustments later */
memcpy(targets, nr, sizeof(nr));
scan_adjusted = (global_reclaim(sc) && !current_is_kswapd() &&
sc->priority == DEF_PRIORITY);
blk_start_plug(&plug);
while (nr[LRU_INACTIVE_ANON] || nr[LRU_ACTIVE_FILE] ||
nr[LRU_INACTIVE_FILE]) {
unsigned long nr_anon, nr_file, percentage;
unsigned long nr_scanned;
for_each_evictable_lru(lru) {
if (nr[lru]) {
nr_to_scan = min(nr[lru], SWAP_CLUSTER_MAX);
nr[lru] -= nr_to_scan;
nr_reclaimed += shrink_list(lru, nr_to_scan, // where the actual reclaim happens
lruvec, sc);
}
}
cond_resched();
}
......
}
static void get_scan_count(struct lruvec *lruvec, struct mem_cgroup *memcg,
struct scan_control *sc, unsigned long *nr,
unsigned long *lru_pages)
{
......
// with swap disabled, anonymous pages are not scanned at all
/* If we have no swap space, do not bother scanning anon pages. */
if (!sc->may_swap || mem_cgroup_get_nr_swap_pages(memcg) <= 0) {
scan_balance = SCAN_FILE;
goto out;
}
......
}
- Slab reclaim mainly frees the dentry and inode caches. The main call chain is do_try_to_free_pages -> shrink_zones -> shrink_node -> shrink_slab -> shrink_slab_memcg.
On 3.10.* kernels we previously hit a problem where a cgroup under memory pressure did not reclaim its slab. On the current kernel, when an allocation runs into the limit, the memcg's own slab can be reclaimed without affecting global slab.
static unsigned long shrink_slab(gfp_t gfp_mask, int nid,
struct mem_cgroup *memcg,
int priority)
{
unsigned long ret, freed = 0;
struct shrinker *shrinker;
/*
* The root memcg might be allocated even though memcg is disabled
* via "cgroup_disable=memory" boot parameter. This could make
* mem_cgroup_is_root() return false, then just run memcg slab
* shrink, but skip global shrink. This may result in premature
* oom.
*/
if (!mem_cgroup_disabled() && !mem_cgroup_is_root(memcg))
return shrink_slab_memcg(gfp_mask, nid, memcg, priority);
......
}
Conclusion
- Based on the analysis above, the memory actually in use should be: real_used = memory.usage_in_bytes - memory.stat.(total_inactive_file + total_active_file) - memory.kmem.slabinfo.(inode_cache + xfs_inode). A minimal sketch that recomputes the page-cache part of this from the cgroup files follows below.
- The logic for deciding whether memory has reached the limit is clear-cut, so usage cannot possibly exceed the limit by 150%; the next step is to check whether our metric pipeline itself is computing the numbers correctly.
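As a sanity check for monitoring, the page-cache part of the formula can be recomputed directly from the cgroup v1 files. A minimal sketch, assuming the v1 hierarchy is mounted at /sys/fs/cgroup/memory and leaving out the memory.kmem.slabinfo terms for brevity:
#include <stdio.h>
#include <string.h>

/* Read a single-integer file such as memory.usage_in_bytes */
static unsigned long long read_value(const char *path)
{
	unsigned long long v = 0;
	FILE *f = fopen(path, "r");

	if (f) {
		if (fscanf(f, "%llu", &v) != 1)
			v = 0;
		fclose(f);
	}
	return v;
}

/* Pick one "key value" line out of memory.stat */
static unsigned long long read_stat(const char *path, const char *key)
{
	char name[64];
	unsigned long long v, ret = 0;
	FILE *f = fopen(path, "r");

	if (!f)
		return 0;
	while (fscanf(f, "%63s %llu", name, &v) == 2)
		if (strcmp(name, key) == 0)
			ret = v;
	fclose(f);
	return ret;
}

int main(void)
{
	/* assumed v1 mount point; append the container's cgroup directory */
	const char *base = "/sys/fs/cgroup/memory";
	char usage_path[256], stat_path[256];
	unsigned long long usage, inactive_file, active_file;

	snprintf(usage_path, sizeof(usage_path), "%s/memory.usage_in_bytes", base);
	snprintf(stat_path, sizeof(stat_path), "%s/memory.stat", base);

	usage = read_value(usage_path);
	inactive_file = read_stat(stat_path, "total_inactive_file");
	active_file = read_stat(stat_path, "total_active_file");

	/* slab terms (memory.kmem.slabinfo) are left out of this sketch */
	printf("real_used ~= %llu bytes\n", usage - inactive_file - active_file);
	return 0;
}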