在生产环境中某些主机经常报pod的liveness探针检查（ipv6地址）失败，kubelet的探针检查就是kubelet尝试去访问liveness的接口，因此登录到pod所在的主机，使用curl命令访问pod的liveness中的url，也会偶现失败，使用ping6命令访问pod的ip，也会偶现失败，重启network，重启主机后，问题依然存在。

环境信息

kubernetes版本： 1.23.4
kernel版本：4.19.113
centos版本：7.6
calico版本：3.20，配置为vxlan的crosssubnet模式

问题初步排查

使用curl命令访问pod的liveness接口的流程相对比较简单：宿主机发出tcp请求，经由cali网卡转到pod的命名空间内，pod内的进程处理后回复响应。通过抓取ping报文和curl报文，发现在cali网卡上能够看到pod内的回复，说明丢包发生在宿主机的命名空间内；抓包能看到响应，说明是在2层之上丢的包；ping和tcp包都会丢失，说明是在4层之下丢的包。基于此判定丢包最大可能发生在ip层的处理中。

经过梳理相关的内核代码，查看相关的metrics，并没有发现异常，内核日志也未发现异常。后来由同事搜索到了一篇文章，与生产环境的表现基本一致，通过调大net.ipv6.route.max_size这个内核参数可以解决这种偶发性的丢包，但是文中并没有解释这个内核参数过小为什么会引起概率性的丢包，本文将深入内核，确定丢包的位置，让真相大白。

观测路由缓存的使用量

这里先使用路由缓存这个词来描述net.ipv6.route.max_size，了解到调高这个参数可以解决问题，那么需要找到证据来证明当前的路由缓存的使用量超过了这个阈值。
有了关键字，找相关的代码就比较简单了。由下面代码得知，可以通过/proc/net/rt6_stats的倒数第二个字段查看路由缓存的使用量，即下面输出中的003e（十六进制表示）。

/*
 * Per-namespace late init: creates the read-only (0444) "rt6_stats" entry
 * under net->proc_net, with rt6_stats_seq_show as its show callback.
 *
 * Returns 0. The function is declared int, so it must return a value on
 * every path; falling off the end would hand the caller an indeterminate
 * result.
 */
static int __net_init ip6_route_net_init_late(struct net *net)
{
    proc_create_net_single("rt6_stats", 0444, net->proc_net,
            rt6_stats_seq_show, NULL);
    return 0;
}
/*
 * Show callback for the "rt6_stats" proc entry.
 *
 * Prints one line of seven zero-padded hexadecimal fields, in order:
 * fib_nodes, fib_route_nodes, fib_rt_alloc, fib_rt_entries, fib_rt_cache,
 * the dst entry count of ip6_dst_ops, and fib_discarded_routes.
 * Always returns 0.
 */
static int rt6_stats_seq_show(struct seq_file *seq, void *v)
{
    /* The network namespace is taken from seq->private. */
    struct net *net = (struct net *)seq->private;
    /*
     * The sixth field (second to last) is dst_entries_get_slow(), i.e. the
     * summed dst entry counter rather than the cheap approximate read.
     */
    seq_printf(seq, "%04x %04x %04x %04x %04x %04x %04x\n",
           net->ipv6.rt6_stats->fib_nodes,
           net->ipv6.rt6_stats->fib_route_nodes,
           atomic_read(&net->ipv6.rt6_stats->fib_rt_alloc),
           net->ipv6.rt6_stats->fib_rt_entries,
           net->ipv6.rt6_stats->fib_rt_cache,
           dst_entries_get_slow(&net->ipv6.ip6_dst_ops),
           net->ipv6.rt6_stats->fib_discarded_routes);

    return 0;
}
root@VM-12-3-ubuntu:~# cat /proc/net/rt6_stats 
0042 0023 01d1 0091 0000 003e 0050

接收ping响应包内核解析

收到的数据包为IPv6类型（ETH_P_IPV6），会调用ipv6_rcv这个入口函数进行处理。

/*
 * Packet-type descriptor for ETH_P_IPV6 (stored in network byte order via
 * cpu_to_be16). .func is ipv6_rcv and .list_func is ipv6_list_rcv
 * (presumably the batched variant of the same receive path).
 */
static struct packet_type ipv6_packet_type __read_mostly = {
    .type = cpu_to_be16(ETH_P_IPV6),
    .func = ipv6_rcv,
    .list_func = ipv6_list_rcv,
};

/*
 * IPv6 receive entry point (installed as ipv6_packet_type.func).
 *
 * Resolves the network namespace from skb->dev, passes the skb through
 * ip6_rcv_core(), then runs the NF_INET_PRE_ROUTING netfilter hook with
 * ip6_rcv_finish as the function to continue with. Returns whatever
 * NF_HOOK returns.
 */
int ipv6_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, struct net_device *orig_dev)
{
    struct net *net = dev_net(skb->dev);
    /*
     * NOTE(review): skb is reassigned from ip6_rcv_core() and used without a
     * NULL check. If ip6_rcv_core() can reject a packet by returning NULL,
     * a check is needed here; presumably it was elided from this excerpt --
     * confirm against the upstream source.
     */
    skb = ip6_rcv_core(skb, dev, net);
    return NF_HOOK(NFPROTO_IPV6, NF_INET_PRE_ROUTING,
               net, NULL, skb, dev, NULL,
               ip6_rcv_finish);
}

查询路由表，根据查询结果判定数据包是本地接收还是转发。
如果是本地接收，则调用ip6_input函数将包继续上送协议栈进行处理，如果需要转发，则调用ip6_forward将包转发到其他接口。

/*
 * Initialise the dst handlers of @rt from the FIB entry @ort.
 *
 * Clears dst.error, sets dst.output to ip6_output, and picks dst.input by
 * route kind:
 *   - RTN_LOCAL or RTN_ANYCAST      -> ip6_input     (local delivery)
 *   - multicast destination address -> ip6_mc_input
 *   - anything else                 -> ip6_forward
 */
static void ip6_rt_init_dst(struct rt6_info *rt, struct fib6_info *ort)
{
    rt->dst.error = 0;
    rt->dst.output = ip6_output;

    if (ort->fib6_type == RTN_LOCAL || ort->fib6_type == RTN_ANYCAST) {
        rt->dst.input = ip6_input;
    } else if (ipv6_addr_type(&ort->fib6_dst.addr) & IPV6_ADDR_MULTICAST) {
        rt->dst.input = ip6_mc_input;
    } else {
        rt->dst.input = ip6_forward;
    }
}

如果dst_entry缓存不存在，则创建dst_entry缓存
在申请dst_entry时，会去判断当前路由缓存的使用量是否超过了内核的配置net.ipv6.route.gc_thresh，如果超过的话，会调用ops->gc()也就是ip6_dst_gc进行清理，如果返回值非0，则说明申请失败。

/*
 * Allocate and initialise a dst_entry from ops->kmem_cachep.
 *
 * If @ops has a gc callback and the fast (approximate) entry count is above
 * ops->gc_thresh, the gc callback runs first; a non-zero result from it
 * aborts the allocation.
 *
 * Returns the initialised entry, or NULL when gc reports failure or the
 * slab allocation itself fails.
 */
void *dst_alloc(struct dst_ops *ops, struct net_device *dev,
        int initial_ref, int initial_obsolete, unsigned short flags)
{
    struct dst_entry *dst;

    if (ops->gc && dst_entries_get_fast(ops) > ops->gc_thresh) {
        if (ops->gc(ops))
            return NULL;
    }

    dst = kmem_cache_alloc(ops->kmem_cachep, GFP_ATOMIC);
    /* GFP_ATOMIC allocations can fail; never hand NULL to dst_init(). */
    if (!dst)
        return NULL;

    dst_init(dst, ops, dev, initial_ref, initial_obsolete, flags);

    return dst;
}

在本次操作中，是宿主机收到ping的回复报文，处理函数的入口是icmpv6_rcv

/*
 * ICMPv6 protocol descriptor: icmpv6_rcv is the receive handler and
 * icmpv6_err the error handler; flags are
 * INET6_PROTO_NOPOLICY | INET6_PROTO_FINAL.
 */
static const struct inet6_protocol icmpv6_protocol = {
    .handler    =   icmpv6_rcv,
    .err_handler    =   icmpv6_err,
    .flags      =   INET6_PROTO_NOPOLICY|INET6_PROTO_FINAL,
};

详细分析`ip6_dst_gc`函数

主要分为四个步骤：

使用dst_entries_get_fast获取路由缓存的使用量
调用fib6_run_gc做实际的清理工作
调用dst_entries_get_slow获取路由缓存的使用量
如果清理后，路由缓存依然大于max_size的值，则返回非0值（true）；dst_alloc据此返回NULL，即本次dst_entry申请失败

/*
 * Garbage-collection callback for the IPv6 dst_ops.
 *
 * Returns non-zero when the entry count is still above the ip6_rt_max_size
 * sysctl, and 0 otherwise. dst_alloc() treats a non-zero result as
 * "allocation refused" and returns NULL.
 */
static int ip6_dst_gc(struct dst_ops *ops)
{
    struct net *net = container_of(ops, struct net, ipv6.ip6_dst_ops);
    int rt_min_interval = net->ipv6.sysctl.ip6_rt_gc_min_interval;
    int rt_max_size = net->ipv6.sysctl.ip6_rt_max_size;
    int rt_elasticity = net->ipv6.sysctl.ip6_rt_gc_elasticity;
    int rt_gc_timeout = net->ipv6.sysctl.ip6_rt_gc_timeout;
    unsigned long rt_last_gc = net->ipv6.ip6_rt_last_gc;
    int entries;

    /* Cheap, approximate count: only the shared counter is read here. */
    entries = dst_entries_get_fast(ops);
    /*
     * Skip the GC run when the minimum interval since the last GC has not
     * yet elapsed AND the approximate count is within max_size. On this
     * path the return value below is based on the approximate count.
     */
    if (time_after(rt_last_gc + rt_min_interval, jiffies) &&
        entries <= rt_max_size)
        goto out;

    net->ipv6.ip6_rt_gc_expire++;
    fib6_run_gc(net->ipv6.ip6_rt_gc_expire, net, true);
    /* Recount after GC using the summed (shared + per-CPU) value. */
    entries = dst_entries_get_slow(ops);
    /* Below gc_thresh again: reset the expiry to half of gc_timeout. */
    if (entries < ops->gc_thresh)
        net->ipv6.ip6_rt_gc_expire = rt_gc_timeout>>1;
out:
    /* Decay the expiry by expire >> elasticity on every call. */
    net->ipv6.ip6_rt_gc_expire -= net->ipv6.ip6_rt_gc_expire>>rt_elasticity;
    return entries > rt_max_size;
}

slow和fast两种获取路由缓存的使用量的区别

这里需要仔细分辨下slow和fast的区别，因为在测试环境中，我将net.ipv6.route.gc_thresh和net.ipv6.route.max_size都调整为1，然后在本机ping主机上的pod，并未出现丢包。这一度让我以为整个流程都分析错了，直到仔细阅读并理解了fast和slow这两个函数的实现方式后，才恍然大悟。

percpu_counter数据结构
这个数据结构有两个类型的count计数，一个在外面，一个在percpu变量里。

/*
 * Counter split into one shared total plus per-CPU deltas.
 */
struct percpu_counter {
    /* Held while reading or updating count in the sum and add_batch paths. */
    raw_spinlock_t lock;
    /* Shared total; per-CPU deltas are folded into it in batches. */
    s64 count;
    /* Per-CPU deltas not yet folded into count. */
    s32 __percpu *counters;
};

fast的方式
可以看到fast的方式是直接将外部的count进行了返回

/*
 * Approximate dst entry count: reads only the shared counter of
 * dst->pcpuc_entries (clamped to >= 0), without summing per-CPU deltas.
 */
static inline int dst_entries_get_fast(struct dst_ops *dst)
{
    return percpu_counter_read_positive(&dst->pcpuc_entries);
}

/*
 * Return fbc->count, or 0 if it is currently negative. No lock is taken
 * and the per-CPU deltas are not consulted.
 */
static inline s64 percpu_counter_read_positive(struct percpu_counter *fbc)
{
    s64 ret = fbc->count;

    barrier();      /* Prevent reloads of fbc->count */
    if (ret >= 0)
        return ret;
    return 0;
}

slow的方式
slow的方式是将外部的count加上了每个cpu变量里的统计值

/*
 * Summed dst entry count: shared counter plus per-CPU deltas of
 * dst->pcpuc_entries, clamped to >= 0.
 */
static inline int dst_entries_get_slow(struct dst_ops *dst)
{
    return percpu_counter_sum_positive(&dst->pcpuc_entries);
}
/* Return __percpu_counter_sum(fbc), with negative results clamped to 0. */
static inline s64 percpu_counter_sum_positive(struct percpu_counter *fbc)
{
    s64 ret = __percpu_counter_sum(fbc);
    return ret < 0 ? 0 : ret;
}
/*
 * Return fbc->count plus the local delta of every online CPU. The whole
 * walk runs under fbc->lock (irqsave variant), which is why this is the
 * "slow" read.
 */
s64 __percpu_counter_sum(struct percpu_counter *fbc)
{
    s64 ret;
    int cpu;
    unsigned long flags;

    raw_spin_lock_irqsave(&fbc->lock, flags);
    /* Start from the shared total, then add each CPU's unflushed delta. */
    ret = fbc->count;
    for_each_online_cpu(cpu) {
        s32 *pcount = per_cpu_ptr(fbc->counters, cpu);
        ret += *pcount;
    }
    raw_spin_unlock_irqrestore(&fbc->lock, flags);
    return ret;
}

为何要区分slow和fast
linux内核为了提高效率，引入了这种percpu变量的方式，每个cpu都只需要操作本cpu的数据，这样就不需要进行加锁。那么如何汇总所有cpu上的统计值呢？这里又引入了批量提交延迟写入的方法。

在函数percpu_counter_add_batch中可以看到当计数值count未超过batch阈值时，直接加到每cpu的统计值中，如果超过batch阈值，则将每cpu中的统计值累加到外部的统计值中，并清理每cpu中的统计值。所以fast的方式获取的数据其实是不准确的，因为在多cpu环境下，可能很多cpu的统计值还没有达到batch的阈值，但是减少了加锁的次数，效率比较高，这或许就是一种折中吧。

那么batch的值是多少呢？在内核中是取了32与（nrcpu*2）中的最大值，在测试环境中，cpu个数是96，所以batch值为192。

/* Add @val to the dst entry counter dst->pcpuc_entries. */
static inline void dst_entries_add(struct dst_ops *dst, int val)
{
    percpu_counter_add(&dst->pcpuc_entries, val);
}
/* Add @amount to @fbc, using percpu_counter_batch as the batch threshold. */
static inline void percpu_counter_add(struct percpu_counter *fbc, s64 amount)
{
    percpu_counter_add_batch(fbc, amount, percpu_counter_batch);
}
/*
 * Add @amount to @fbc with batching.
 *
 * The new local value (this CPU's delta + @amount) stays in the per-CPU
 * counter while its magnitude is below @batch. Once it reaches +/-@batch it
 * is folded into the shared fbc->count under fbc->lock and the local delta
 * is reset to zero.
 */
void percpu_counter_add_batch(struct percpu_counter *fbc, s64 amount, s32 batch)
{
    s64 count;

    /* Preemption is disabled around the per-CPU read and update. */
    preempt_disable();
    count = __this_cpu_read(*fbc->counters) + amount;
    if (count >= batch || count <= -batch) {
        unsigned long flags;
        raw_spin_lock_irqsave(&fbc->lock, flags);
        fbc->count += count;
        /* count - amount is the old local delta: subtracting it zeroes it. */
        __this_cpu_sub(*fbc->counters, count - amount);
        raw_spin_unlock_irqrestore(&fbc->lock, flags);
    } else {
        /* Below the threshold: the shared count is left untouched. */
        this_cpu_add(*fbc->counters, amount);
    }
    preempt_enable();
}

丢包路径分析

当调用ip6_create_rt_rcu失败时，路由信息的返回值会被赋值一个默认值ip6_null_entry，该变量的input函数是ip6_pkt_discard，这样的话调用dst_input()相当于就是调用ip6_pkt_discard函数了，进而调用ip6_pkt_drop和kfree_skb，这样包就被丢了，也就不会将数据包继续上送到4层协议中处理了。

至此，可以得出结论，偶发性的丢包其实是与每cpu的统计值有关，当ping6命令所在的这个cpu的count值较高时，就会丢包，count值很低时，就不会丢包，是不是很神奇？

/*
 * Template for the "null" route entry: a reject route (RTF_REJECT |
 * RTF_NONEXTHOP) with error -ENETUNREACH, whose input and output handlers
 * are ip6_pkt_discard and ip6_pkt_discard_out.
 */
static const struct rt6_info ip6_null_entry_template = {
    .dst = {
        .__refcnt   = ATOMIC_INIT(1),
        .__use      = 1,
        .obsolete   = DST_OBSOLETE_FORCE_CHK,
        .error      = -ENETUNREACH,
        .input      = ip6_pkt_discard,
        .output     = ip6_pkt_discard_out,
    },
    .rt6i_flags = (RTF_REJECT | RTF_NONEXTHOP),
};

/*
 * Build an rt6_info (dst cache entry) for the FIB entry @rt.
 *
 * On success returns a new entry initialised by ip6_rt_copy_init(). If
 * ip6_dst_alloc() fails, returns the namespace's ip6_null_entry with an
 * extra reference taken via dst_hold(); the caller never sees NULL.
 */
static struct rt6_info *ip6_create_rt_rcu(struct fib6_info *rt)
{
    unsigned short flags = fib6_info_dst_flags(rt);
    struct net_device *dev = rt->fib6_nh.nh_dev;
    struct rt6_info *nrt;
    nrt = ip6_dst_alloc(dev_net(dev), dev, flags);
    if (!nrt) {
        /*
         * NOTE(review): this releases a reference on rt, but no matching
         * hold is visible in this excerpt -- presumably it was elided;
         * confirm against the upstream source.
         */
        fib6_info_release(rt);
        goto fallback;
    }

    ip6_rt_copy_init(nrt, rt);
    return nrt;

fallback:
    /* Allocation failed: hand back the shared null entry instead. */
    nrt = dev_net(dev)->ipv6.ip6_null_entry;
    dst_hold(&nrt->dst);
    return nrt;
}

/*
 * Runs ip6_rcv_finish_core() on the skb, then dispatches it through the
 * input handler of its attached dst entry and returns that handler's result.
 */
int ip6_rcv_finish(struct net *net, struct sock *sk, struct sk_buff *skb)
{
    ip6_rcv_finish_core(net, sk, skb);
    return dst_input(skb);
}

/*
 * Call the ->input handler of the dst entry attached to @skb and return
 * its result. The handler is whatever was stored in dst.input, e.g.
 * ip6_input, ip6_forward or ip6_pkt_discard.
 */
static inline int dst_input(struct sk_buff *skb)
{
    return skb_dst(skb)->input(skb);
}

使用ftrace验证丢包

经过上面的分析，对于丢包的逻辑已经非常清晰了，在实验环境中，为了使得问题更好的复现，减小net.ipv6.route.gc_thresh和net.ipv6.route.max_size这两个内核参数的值。之后创建15个pod，使用ping6命令直接在宿主机上访问pod。关注ip6_pkt_drop函数，观察ping6命令的输出，当出现丢包时，trace中会看到相关的函数调用栈。

root@VM-12-3-ubuntu:/sys/kernel/debug/tracing# echo ip6_pkt_drop >> set_ftrace_filter 
root@VM-12-3-ubuntu:/sys/kernel/debug/tracing# echo 1 >options/func_stack_trace
root@VM-12-3-ubuntu:/sys/kernel/debug/tracing# echo function > current_tracer

root@VM-12-3-ubuntu:/sys/kernel/debug/tracing# cat trace_pipe
 => ip6_pkt_drop
 => ip6_pkt_discard
 => ip6_rcv_finish
 => ipv6_rcv
 => __netif_receive_skb_one_core
 => __netif_receive_skb
 => process_backlog
 => net_rx_action
 => __do_softirq
 => do_softirq_own_stack
 => do_softirq.part.0
 => __local_bh_enable_ip
 => ip6_finish_output2
 => __ip6_finish_output
 => ip6_finish_output
 => ip6_output
 => ip6_local_out
 => ip6_send_skb
 => ip6_push_pending_frames
 => icmpv6_push_pending_frames
 => ping_v6_sendmsg
 => inet_sendmsg
 => sock_sendmsg
 => __sys_sendto
 => __x64_sys_sendto
 => do_syscall_64
 => entry_SYSCALL_64_after_hwframe

总结

在4.19内核中，并没有找到相关的metrics可以判断这种类型的丢包，内核日志中也没有相应的错误信息，这就给问题的排查带来了很大的困难。幸运的是，5.4的内核中dmesg会有“consider increasing sysctl net.ipv[4|6].route.max_size”的告警信息。如果没能搜索到相关的文章，更好的办法是使用ebpf技术hook在内核网络收包的关键函数上，这样当丢包发生时，就可以大致判断丢包的位置，再逐步调整hook的位置，直到找到最终的丢包点。

k8s环境ipv6通信偶发性丢包问题分析

k8s环境ipv6通信偶发性丢包问题分析

环境信息

问题初步排查

观测路由缓存的使用量

接收ping响应包内核解析

详细分析`ip6_dst_gc`函数

slow和fast两种获取路由缓存的使用量的区别

丢包路径分析

使用ftrace验证丢包

总结

推荐阅读更多精彩内容

k8s环境ipv6通信偶发性丢包问题分析

环境信息

问题初步排查

观测路由缓存的使用量

接收ping响应包内核解析

详细分析ip6_dst_gc函数

slow和fast两种获取路由缓存的使用量的区别

丢包路径分析

使用ftrace验证丢包

总结

推荐阅读更多精彩内容

详细分析`ip6_dst_gc`函数