本文探究一下僵尸进程的产生，首先会介绍一下进程id相关的概念，再介绍一下进程退出的流程，最后介绍一下父进程wait的流程。

进程关系

这里首先需要明确的一个概念，就是在linux里面，线程和进程到底是如何区分的呢？

线程和进程是操作系统理论中的概念，在windows和linux中的实现可能不同，对应到linux内核中，进程和线程都是用task_struct来表示的，所以在数据结构上linux内核并没有区分进程和线程。

进程id，容易让人迷惑，比如 TID，TGID，PID，PPID，PGID，SID。下面的例子列出了一些进程的这些id，如用户态1号进程systemd，内核态进程总管kthreadd，软中断进程ksoftirqd，还有一些用户态程序containerd，docker，还有三个僵尸进程app-test，通过实例去观察能更好的去理解这些id的关系。

如果进程中没有其他线程，则TID与PID是相同的。

如果是多线程的go程序，如containerd，对应下面输出中的TID为493，674，675，676。它们的PID都是493，其中有一个的TID、PID、TGID都是493，这个线程可以理解为主线程，也可以说containerd是一个有4个线程的进程，但是在内核中实实在在的对应了4个不同的task_struct结构。

root@iZt4n1u8u50jg1r5n6myn2Z:~# ps -eLo comm:20,tid,pid,tgid,ppid,pgid,sid | column -t 
COMMAND               TID        PID    TGID   PPID   PGID   SID
systemd               1          1      1      0      1      1
kthreadd              2          2      2      0      0      0
ksoftirqd/0           10         10     10     2      0      0
migration/1           17         17     17     2      0      0
kcompactd0            28         28     28     2      0      0
containerd            493        493    493    1      493    493
containerd            674        493    493    1      493    493
containerd            675        493    493    1      493    493
containerd            676        493    493    1      493    493
containerd-shim       31824      31824  31824  1      31824  493
containerd-shim       31825      31824  31824  1      31824  493
containerd-shim       31826      31824  31824  1      31824  493
containerd-shim       31827      31824  31824  1      31824  493
containerd-shim       31828      31824  31824  1      31824  493
containerd-shim       65521      65521  65521  1      65521  493
containerd-shim       65522      65521  65521  1      65521  493
containerd-shim       65523      65521  65521  1      65521  493
dockerd               27296      27296  27296  1      27296  27296
dockerd               27297      27296  27296  1      27296  27296
dockerd               27298      27296  27296  1      27296  27296
dockerd               27299      27296  27296  1      27296  27296
docker-proxy          28170      28170  28170  27296  27296  27296
docker-proxy          28171      28170  28170  27296  27296  27296
docker-proxy          28172      28170  28170  27296  27296  27296
docker-proxy          28173      28170  28170  27296  27296  27296
docker-proxy          28174      28170  28170  27296  27296  27296
app-test              65543      65543  65543  65521  65543  65543
app-test              <defunct>  65582  65582  65582  65543  65543  65543
app-test              <defunct>  65583  65583  65583  65543  65543  65543
app-test              <defunct>  65584  65584  65584  65543  65543  65543

https://man7.org/linux/man-pages/man7/credentials.7.html 这个链接介绍了PGID和SID的作用。

观测僵尸进程

其中的state是Z（zombie）表明这是一个僵尸进程，Threads是1，Pid与Tgid相同都可以表明这是一个单线程的进程。

root@iZt4n1u8u50jg1r5n6myn2Z:/proc# cat /proc/65582/status 
Name:   app-test
State:  Z (zombie)
Tgid:   65582
Ngid:   0
Pid:    65582
PPid:   65543
TracerPid:  0
Uid:    0   0   0   0
Gid:    0   0   0   0
FDSize: 0
Groups:  
Threads:    1

接下来从内核源码角度分析下status中State字段的由来，也借此记录下/proc/$pid的内核源码位置。下面代码中DIR("task")就表示/proc/$pid下面的文件夹，ONE("status")就表示/proc/$pid下面的文件。

// fs/proc/base.c
/*
 * Table of pid_entry items (abridged here; "...." marks omitted entries).
 * As used in this article, DIR() entries are directories under /proc/$pid
 * and ONE() entries are files there. The "status" entry names
 * proc_pid_status as its handler.
 */
static const struct pid_entry tgid_base_stuff[] = {
    DIR("task",       S_IRUGO|S_IXUGO, proc_task_inode_operations, proc_task_operations),
    DIR("fd",         S_IRUSR|S_IXUSR, proc_fd_inode_operations, proc_fd_operations),
    DIR("map_files",  S_IRUSR|S_IXUSR, proc_map_files_inode_operations, proc_map_files_operations),
    DIR("fdinfo",     S_IRUSR|S_IXUSR, proc_fdinfo_inode_operations, proc_fdinfo_operations),
    DIR("ns",     S_IRUSR|S_IXUGO, proc_ns_dir_inode_operations, proc_ns_dir_operations),
#ifdef CONFIG_NET
    DIR("net",        S_IRUGO|S_IXUGO, proc_net_inode_operations, proc_net_operations),
#endif
    REG("environ",    S_IRUSR, proc_environ_operations),
    REG("auxv",       S_IRUSR, proc_auxv_operations),
    /* The file examined in this article. */
    ONE("status",     S_IRUGO, proc_pid_status),
    ONE("personality", S_IRUSR, proc_pid_personality),
    ONE("limits",     S_IRUGO, proc_pid_limits),
    ....
}

status文件对应的函数是proc_pid_status。根据其中调用的函数名大概就能知道各自的作用，如task_state负责输出State字段。/proc/$pid下的文件描述了进程的详细信息，如果网上的资料不足以让你理解某些字段的含义，那么就需要阅读源码去探究一下这些字段的含义了。

/*
 * Handler named by the "status" entry: prints the "Name:" line, then lets
 * task_state() and the other task_*() helpers append their sections to m.
 * Always returns 0.
 */
int proc_pid_status(struct seq_file *m, struct pid_namespace *ns,
            struct pid *pid, struct task_struct *task)
{
    /* May be NULL; the memory sections below are skipped in that case. */
    struct mm_struct *mm = get_task_mm(task);

    seq_puts(m, "Name:\t");
    proc_task_name(m, task, true);
    seq_putc(m, '\n');

    /* Emits the "State:" line this article is about. */
    task_state(m, ns, pid, task);

    if (mm) {
        task_mem(m, mm);
        task_core_dumping(m, mm);
        /* Pairs with get_task_mm() above. */
        mmput(mm);
    }
    task_sig(m, task);
    task_cap(m, task);
    task_seccomp(m, task);
    task_cpus_allowed(m, task);
    cpuset_task_status_allowed(m, task);
    task_context_switch_counts(m, task);
    return 0;
}

这里只关注State字段，即task_state函数

/*
 * Excerpt ("...." marks elided code): writes the "State:" label followed by
 * the string returned by get_task_state(p).
 */
static inline void task_state(struct seq_file *m, struct pid_namespace *ns,
                struct pid *pid, struct task_struct *p)
{
    ....
    seq_puts(m, "State:\t");
    seq_puts(m, get_task_state(p));
    ....
}

从上述代码可以看出，get_task_state的结果就是/proc/$pid/status中State字段的值。

/*
 * Strings shown in the "State:" field. get_task_state() indexes this table
 * with the value returned by task_state_index(); the hex value next to each
 * entry is the state bit that entry stands for.
 */
static const char * const task_state_array[] = {

    /* states in TASK_REPORT: */
    "R (running)",      /* 0x00 */
    "S (sleeping)",     /* 0x01 */
    "D (disk sleep)",   /* 0x02 */
    "T (stopped)",      /* 0x04 */
    "t (tracing stop)", /* 0x08 */
    "X (dead)",     /* 0x10 */
    "Z (zombie)",       /* 0x20 */
    "P (parked)",       /* 0x40 */

    /* states beyond TASK_REPORT: */
    "I (idle)",     /* 0x80 */
};


/* Map a task to its display string: task_state_array[task_state_index(tsk)]. */
static inline const char *get_task_state(struct task_struct *tsk)
{
    return task_state_array[task_state_index(tsk)];
}

只要task_state_index返回的index是6，对应的字符串就是Z（zombie），继续分析task_state_index函数。代码中state的值就是tsk->state与tsk->exit_state进行位或之后再与TASK_REPORT进行位与。

这里直接说下结果，后面会有说明：其中tsk->state为TASK_DEAD(0x0080)，tsk->exit_state为EXIT_ZOMBIE(0x0020)，两者位或得到0x00a0；TASK_REPORT中不包含TASK_DEAD，位与之后只剩0x0020，经过fls函数（返回最高置位bit的位置，从1开始计数）之后，就是6。

/*
 * Mask of the state bits that get reported; task_state_index() applies it to
 * the combined state/exit_state value. TASK_DEAD is not part of the mask.
 */
#define TASK_REPORT         (TASK_RUNNING | TASK_INTERRUPTIBLE | \
                     TASK_UNINTERRUPTIBLE | __TASK_STOPPED | \
                     __TASK_TRACED | EXIT_DEAD | EXIT_ZOMBIE | \
                     TASK_PARKED)

/*
 * Compute the index used to look up a task's display string: OR together
 * tsk->state and tsk->exit_state, keep only the TASK_REPORT bits, and return
 * fls() of the result.
 */
static inline unsigned int task_state_index(struct task_struct *tsk)
{
    unsigned int tsk_state = READ_ONCE(tsk->state);
    unsigned int state = (tsk_state | tsk->exit_state) & TASK_REPORT;

    /* fls(): 1-based position of the highest set bit, 0 when no bit is set. */
    return fls(state);
}

进程的退出

有两个系统调用与进程主动退出有关，一个是exit，一个是exit_group。

/*
 * this kills every thread in the thread group. Note that any externally
 * wait4()-ing process will get the correct exit code - even if this
 * thread is not the thread group leader.
 */
SYSCALL_DEFINE1(exit_group, int, error_code)
{
    /* Only the low 8 bits of error_code are kept, shifted left by 8. */
    do_group_exit((error_code & 0xff) << 8);
    /* NOTREACHED */
    return 0;
}

/*
 * exit system call: hands the exit code to do_exit(), using the same
 * encoding as exit_group (low 8 bits of error_code, shifted left by 8).
 */
SYSCALL_DEFINE1(exit, int, error_code)
{
    do_exit((error_code&0xff)<<8);
}

exit_group和exit都会调用do_exit，接下来重点分析do_exit函数，do_exit函数的参数是退出码。

# 省略中间的代码
/*
 * Abridged exit path of the current task; code is the exit code stored in
 * tsk->exit_code. Intermediate code, including the declaration of
 * group_dead, is elided from this excerpt.
 */
void __noreturn do_exit(long code)
{
    struct task_struct *tsk = current;
    exit_signals(tsk);  /* sets PF_EXITING */
    tsk->exit_code = code;
    /*
     * NOTE(review): the exit_*() calls presumably release the task's mm,
     * semaphores, shared memory, files and fs state; their bodies are not
     * shown here.
     */
    exit_mm();
    exit_sem(tsk);
    exit_shm(tsk);
    exit_files(tsk);
    exit_fs(tsk);
    /* Sets tsk->exit_state to EXIT_DEAD or EXIT_ZOMBIE. */
    exit_notify(tsk, group_dead);
    /* Sets the task state to TASK_DEAD. */
    do_task_dead();
}

其中exit_notify中会将进程退出状态设置为EXIT_ZOMBIE：在父进程没有做特殊设置的情况下，do_notify_parent函数会返回false，所以autoreap值会为false。
从这里也可以看出，父进程如果做一些特别的设置（例如将SIGCHLD的处理方式设置为SIG_IGN，或者设置SA_NOCLDWAIT标志），即使父进程不调用wait，子进程也不会成为僵尸进程。

/*
 * Excerpt (declarations of dead and autoreap are elided): notifies the
 * parent and chooses the task's final exit_state. The task becomes
 * EXIT_DEAD when autoreap is true, EXIT_ZOMBIE otherwise.
 */
static void exit_notify(struct task_struct *tsk, int group_dead)
{
    // If there are children, find them a new parent.
    forget_original_parent(tsk, &dead);
    if (unlikely(tsk->ptrace)) {
        /* Traced task: the signal to send depends on the conditions below. */
        int sig = thread_group_leader(tsk) &&
                thread_group_empty(tsk) &&
                !ptrace_reparented(tsk) ?
            tsk->exit_signal : SIGCHLD;
        autoreap = do_notify_parent(tsk, sig);
    } else if (thread_group_leader(tsk)) {
        /* Leader: the parent is only notified once the thread group is empty. */
        autoreap = thread_group_empty(tsk) &&
            do_notify_parent(tsk, tsk->exit_signal);
    } else {
        /* Neither traced nor a thread group leader: always auto-reaped. */
        autoreap = true;
    }
    tsk->exit_state = autoreap ? EXIT_DEAD : EXIT_ZOMBIE;
}

/*
 * Excerpt: signals and wakes tsk->parent when tsk exits. Declarations of
 * psig, info and autoreap, and the return statement, are elided here.
 * NOTE(review): exit_notify() assigns this function's result to its own
 * autoreap, so the elided return presumably yields the autoreap set below.
 */
bool do_notify_parent(struct task_struct *tsk, int sig)
{
    if (!tsk->ptrace && sig == SIGCHLD &&
        (psig->action[SIGCHLD-1].sa.sa_handler == SIG_IGN ||
         (psig->action[SIGCHLD-1].sa.sa_flags & SA_NOCLDWAIT))) {
        /*
         * We are exiting and our parent doesn't care.  POSIX.1
         * defines special semantics for setting SIGCHLD to SIG_IGN
         * or setting the SA_NOCLDWAIT flag: we should be reaped
         * automatically and not left for our parent's wait4 call.
         * Rather than having the parent do it as a magic kind of
         * signal handler, we just set this to tell do_exit that we
         * can be cleaned up without becoming a zombie.  Note that
         * we still call __wake_up_parent in this case, because a
         * blocked sys_wait4 might now return -ECHILD.
         *
         * Whether we send SIGCHLD or not for SA_NOCLDWAIT
         * is implementation-defined: we do (if you don't want
         * it, just use SIG_IGN instead).
         */
        autoreap = true;
        /* With SIG_IGN no signal is sent at all. */
        if (psig->action[SIGCHLD-1].sa.sa_handler == SIG_IGN)
            sig = 0;
    }
    if (valid_signal(sig) && sig)
        __group_send_sig_info(sig, &info, tsk->parent);
    __wake_up_parent(tsk, tsk->parent);
}

在do_task_dead中会将tsk的state字段赋值为TASK_DEAD，这样一来，tsk的state字段和exit_state都已经赋值了，正好与上面分析的一致，所以/proc/$pid/status中State字段会为Z（zombie）。

/* Excerpt: marks the current task's state as TASK_DEAD; the rest is elided. */
void __noreturn do_task_dead(void)
{
    /* Causes final put_task_struct in finish_task_switch(): */
    set_special_state(TASK_DEAD);
}

父进程调用wait回收子进程

程序退出调用do_exit，变成僵尸进程后，在内核中只留了一个task_struct结构体还没有回收。

从逻辑上讲，一般的程序的父子进程并不是孤立的，而是有一定的关系的，父进程需要获得子进程的退出状态，才可以根据不同的退出状态做出不同的响应，是选择忽略还是新启一个子进程呢？接下来分析wait系统调用。

/*
 * Excerpt of the wait4 system call: forwards to kernel_wait4(). A
 * kernel-side rusage buffer is passed only when the caller supplied ru.
 * Handling of err and r after the call is elided.
 */
SYSCALL_DEFINE4(wait4, pid_t, upid, int __user *, stat_addr,
        int, options, struct rusage __user *, ru)
{
    struct rusage r;
    long err = kernel_wait4(upid, stat_addr, options, ru ? &r : NULL);
}
/*
 * Excerpt: the setup of wo and the handling of ret are elided; the wait
 * itself is delegated to do_wait().
 */
long kernel_wait4(pid_t upid, int __user *stat_addr, int options,
          struct rusage *ru)
{
    ret = do_wait(&wo);
}
/*
 * Excerpt: only the call to do_wait_thread() is shown; the declarations of
 * retval and tsk and the surrounding logic are elided.
 */
static long do_wait(struct wait_opts *wo)
{
    retval = do_wait_thread(wo, tsk);
}
/*
 * Excerpt: decides what a waiter does with task p based on its exit_state
 * (the declaration of exit_state and the remaining cases are elided).
 * EXIT_DEAD tasks are skipped; eligible EXIT_ZOMBIE tasks are handed to
 * wait_task_zombie().
 */
static int wait_consider_task(struct wait_opts *wo, int ptrace,
                struct task_struct *p)
{
    /* Already being reaped: nothing to do for this waiter. */
    if (unlikely(exit_state == EXIT_DEAD))
        return 0;
    if (exit_state == EXIT_ZOMBIE) {
        /* we don't reap group leaders with subthreads */
        if (!delay_group_leader(p)) {
            /*
             * A zombie ptracee is only visible to its ptracer.
             * Notification and reaping will be cascaded to the
             * real parent when the ptracer detaches.
             */
            if (unlikely(ptrace) || likely(!p->ptrace))
                return wait_task_zombie(wo, p);
        }
    }

}

在函数wait_task_zombie中，release_task会回收task_struct，将task_struct做一下清理后放回到slub中待用。

/*
 * Excerpt (declaration of state and other code elided): picks the zombie's
 * next state and calls release_task(p) when that state is EXIT_DEAD.
 */
static int wait_task_zombie(struct wait_opts *wo, struct task_struct *p)
{
    /* A ptrace-reparented thread group leader goes to EXIT_TRACE instead. */
    state = (ptrace_reparented(p) && thread_group_leader(p)) ?
        EXIT_TRACE : EXIT_DEAD;
    if (state == EXIT_DEAD)
        release_task(p);
}

示例制造僵尸进程

以下示例代码是制造僵尸进程的一个简单实现，一句话概括就是父进程不调用wait等待子进程的退出。

C语言版本

#include <stdio.h>
#include <stdlib.h>
#include <sys/types.h>
#include <sys/unistd.h>

/*
 * Create a zombie on purpose: fork a child that exits at once, then keep the
 * parent alive for 30 seconds without ever calling wait(), so the child's
 * exit status is never collected during that time.
 *
 * Returns EXIT_SUCCESS, or EXIT_FAILURE if fork() fails.
 */
int main(int argc, char *argv[])
{
    (void)argc;
    (void)argv;

    pid_t pid = fork();
    if (pid < 0) {
        /* No child was created, so there is nothing to observe. */
        perror("fork");
        return EXIT_FAILURE;
    }
    if (pid == 0) {
        /* Child: exit immediately. */
        exit(EXIT_SUCCESS);
    }

    /* Parent: report the pid of the child that will become a zombie. */
    printf("Parent created child %d\n", (int)pid);

    /* Deliberately no wait(): leaves time to inspect the child via ps or /proc. */
    sleep(30);
    return EXIT_SUCCESS;
}

等效的Go语言版本

package main

import (
    "time"
    "os"
    "syscall"
)

// main forks a child that exits immediately, then sleeps without ever
// waiting for it, so the child remains a zombie while the parent is alive.
func main() {
    // Raw fork(2); the third return value is the errno of the system call.
    id, _, errno := syscall.Syscall(syscall.SYS_FORK, 0, 0, 0)
    if errno != 0 {
        // No child was created, so there is nothing to observe.
        os.Stderr.WriteString("fork failed: " + errno.Error() + "\n")
        os.Exit(1)
    }
    if id == 0 {
        // Child: exit right away.
        os.Exit(0)
    }

    // Parent: deliberately never waits for the child.
    time.Sleep(60 * time.Second)
}

僵尸进程的产生

进程关系

观测僵尸进程

进程的退出

父进程调用wait回收子进程

示例制造僵尸进程

推荐阅读更多精彩内容