前言
大家都可能都在自己的应用中集成Crash收集服务,通常使用NSSetUncaughtExceptionHandler()
+ signal() / sigaction()
的方式。它可以帮助我们收集到大部分Crash,直到后来发现stack overflow并不能被以上方法扑捉到,而且其它一些SDK也未能收集。那这篇文章简单介绍下Mach异常与signal的联系。
OS X 、iOS系统架构
这张图片来之苹果Mac Technology Overview,除了用户体验层,OS X与iOS架构大体上是一直的。它们内核核心都是XNU(包含Mach、BSD)。Mach是微内核,负责操作系统中基本职责:进程和线程抽象、虚拟内存管理、任务调度、进程间通信和消息传递机制。BSD层简历在Mach上,提供一套可靠且更现代的API,提供了POSIX兼容性。
本文用到的XNU的版本号为3248.60.10的源码,下载地址。
你也可以在http://opensource.apple.com 中下载历史版本。
Mach exceptions 与 POSIX signals
Exception Type项通常会包含两个元素: Mach异常 和 Unix信号。
Mach exceptions: 允许在进程里或进程外处理,处理程序通过Mach RPC调用。
POSIX signals: 只在进程中处理,处理程序总是在发生错误的线程上调用。
Mach
异常首先是由处理器陷阱引发的。 通用的Mach异常处理程序exception_triage()
,负责将异常转换成Mach 消息。exception_triage()
通过调用exception_deliver()
尝试把异常投递到thread、task最后是host。首先尝试将异常抛给thread端口,然后尝试抛给task端口,最后再抛给host端口(默认端口),如果没有一个端口返回KERN_SUCCESS,那么任务就会被终止。
// 位于 osfmk/kern/exception.c
/*
* Routine: exception_triage
* Purpose:
* The current thread caught an exception.
* We make an up-call to the thread's exception server.
* Conditions:
* Nothing locked and no resources held.
* Called from an exception context, so
* thread_exception_return and thread_kdb_return
* are possible.
* Returns:
* KERN_SUCCESS if exception is handled by any of the handlers.
*/
kern_return_t
exception_triage(
exception_type_t exception,
mach_exception_data_t code,
mach_msg_type_number_t codeCnt)
{
thread_t thread;
task_t task;
host_priv_t host_priv;
lck_mtx_t *mutex;
kern_return_t kr = KERN_FAILURE;
assert(exception != EXC_RPC_ALERT);
/*
* If this behavior has been requested by the the kernel
* (due to the boot environment), we should panic if we
* enter this function. This is intended as a debugging
* aid; it should allow us to debug why we caught an
* exception in environments where debugging is especially
* difficult.
*/
if (panic_on_exception_triage) {
panic("called exception_triage when it was forbidden by the boot environment");
}
thread = current_thread();
// 分别尝试把异常投递到thread、task最后是host。
/*
* Try to raise the exception at the activation level.
*/
mutex = &thread->mutex;
if (KERN_SUCCESS == check_exc_receiver_dependency(exception, thread->exc_actions, mutex))
{
kr = exception_deliver(thread, exception, code, codeCnt, thread->exc_actions, mutex);
if (kr == KERN_SUCCESS || kr == MACH_RCV_PORT_DIED)
goto out;
}
/*
* Maybe the task level will handle it.
*/
task = current_task();
mutex = &task->lock;
if (KERN_SUCCESS == check_exc_receiver_dependency(exception, task->exc_actions, mutex))
{
kr = exception_deliver(thread, exception, code, codeCnt, task->exc_actions, mutex);
if (kr == KERN_SUCCESS || kr == MACH_RCV_PORT_DIED)
goto out;
}
/*
* How about at the host level?
*/
host_priv = host_priv_self();
mutex = &host_priv->lock;
if (KERN_SUCCESS == check_exc_receiver_dependency(exception, host_priv->exc_actions, mutex))
{
kr = exception_deliver(thread, exception, code, codeCnt, host_priv->exc_actions, mutex);
if (kr == KERN_SUCCESS || kr == MACH_RCV_PORT_DIED)
goto out;
}
out:
if ((exception != EXC_CRASH) && (exception != EXC_RESOURCE) &&
(exception != EXC_GUARD) && (exception != EXC_CORPSE_NOTIFY))
thread_exception_return();
return kr;
}
异常行为
/*
* Machine-independent exception behaviors
*/
# define EXCEPTION_DEFAULT 1 // Send a catch_exception_raise message including the identity.
# define EXCEPTION_STATE 2 // Send a catch_exception_raise_state message including the thread state.
# define EXCEPTION_STATE_IDENTITY 3 // Send a catch_exception_raise_state_identity message including the thread identity and state.
// 位于 osfmk/kern/exception.c
/*
* Routine: exception_deliver
* Purpose:
* Make an upcall to the exception server provided.
* Conditions:
* Nothing locked and no resources held.
* Called from an exception context, so
* thread_exception_return and thread_kdb_return
* are possible.
* Returns:
* KERN_SUCCESS if the exception was handled
*/
kern_return_t
exception_deliver(
thread_t thread,
exception_type_t exception,
mach_exception_data_t code,
mach_msg_type_number_t codeCnt,
struct exception_action *excp,
lck_mtx_t *mutex)
{
... // 省略部分代码
switch (behavior) {
case EXCEPTION_STATE: {
mach_msg_type_number_t state_cnt;
thread_state_data_t state;
c_thr_exc_raise_state++;
state_cnt = _MachineStateCount[flavor];
kr = thread_getstatus(thread, flavor,
(thread_state_t)state,
&state_cnt);
if (kr == KERN_SUCCESS) {
if (code64) {
kr = mach_exception_raise_state(exc_port,
exception,
code,
codeCnt,
&flavor,
state, state_cnt,
state, &state_cnt);
} else {
kr = exception_raise_state(exc_port, exception,
small_code,
codeCnt,
&flavor,
state, state_cnt,
state, &state_cnt);
}
if (kr == MACH_MSG_SUCCESS && exception != EXC_CORPSE_NOTIFY)
kr = thread_setstatus(thread, flavor,
(thread_state_t)state,
state_cnt);
}
return kr;
}
case EXCEPTION_DEFAULT:
c_thr_exc_raise++;
if (code64) {
kr = mach_exception_raise(exc_port,
retrieve_thread_self_fast(thread),
retrieve_task_self_fast(thread->task),
exception,
code,
codeCnt);
} else {
kr = exception_raise(exc_port,
retrieve_thread_self_fast(thread),
retrieve_task_self_fast(thread->task),
exception,
small_code,
codeCnt);
}
return kr;
case EXCEPTION_STATE_IDENTITY: {
mach_msg_type_number_t state_cnt;
thread_state_data_t state;
c_thr_exc_raise_state_id++;
state_cnt = _MachineStateCount[flavor];
kr = thread_getstatus(thread, flavor,
(thread_state_t)state,
&state_cnt);
if (kr == KERN_SUCCESS) {
if (code64) {
kr = mach_exception_raise_state_identity(exc_port,
retrieve_thread_self_fast(thread),
retrieve_task_self_fast(thread->task),
exception,
code,
codeCnt,
&flavor,
state, state_cnt,
state, &state_cnt);
} else {
kr = exception_raise_state_identity(exc_port,
retrieve_thread_self_fast(thread),
retrieve_task_self_fast(thread->task),
exception,
small_code,
codeCnt,
&flavor,
state, state_cnt,
state, &state_cnt);
}
if (kr == MACH_MSG_SUCCESS && exception != EXC_CORPSE_NOTIFY)
kr = thread_setstatus(thread, flavor,
(thread_state_t)state,
state_cnt);
}
return kr;
}
default:
panic ("bad exception behavior!");
return KERN_FAILURE;
}/* switch */
}
BSD
当第一个BSD进程调用bsdinit_task()
函数启动时,这函数还调用了ux_handler_init()
函数设置了一个Mach内核线程跑ux_handler()
的。
// 位于bsd/kern/bsd_init.c
/* Called with kernel funnel held */
void
bsdinit_task(void)
{
proc_t p = current_proc();
struct uthread *ut;
thread_t thread;
process_name("init", p);
ux_handler_init(); // 初始化handler
// 设置port
thread = current_thread();
(void) host_set_exception_ports(host_priv_self(),
EXC_MASK_ALL & ~(EXC_MASK_RPC_ALERT),//pilotfish (shark) needs this port
(mach_port_t) ux_exception_port,
EXCEPTION_DEFAULT| MACH_EXCEPTION_CODES,
0);
ut = (uthread_t)get_bsdthread_info(thread);
bsd_init_task = get_threadtask(thread);
init_task_failure_data[0] = 0;
#if CONFIG_MACF
mac_cred_label_associate_user(p->p_ucred);
mac_task_label_update_cred (p->p_ucred, (struct task *) p->task);
#endif
load_init_program(p);
lock_trace = 1;
}
// 位于bsd/uxkern/ux_exception.c
void
ux_handler_init(void)
{
thread = THREAD_NULL;
ux_exception_port = MACH_PORT_NULL;
(void) kernel_thread_start((thread_continue_t)ux_handler, NULL, &thread);
thread_deallocate(thread);
proc_list_lock();
if (ux_exception_port == MACH_PORT_NULL) {
(void)msleep(&ux_exception_port, proc_list_mlock, 0, "ux_handler_wait", 0);
}
proc_list_unlock();
}
static void
ux_handler(void)
{
task_t self = current_task();
mach_port_name_t exc_port_name;
mach_port_name_t exc_set_name;
/* self->kernel_vm_space = TRUE; */
ux_handler_self = self;
... // 省略部分
/* Message handling loop. */
// 消息处理循环
for (;;) {
struct rep_msg {
mach_msg_header_t Head;
NDR_record_t NDR;
kern_return_t RetCode;
} rep_msg;
struct exc_msg {
mach_msg_header_t Head;
/* start of the kernel processed data */
mach_msg_body_t msgh_body;
mach_msg_port_descriptor_t thread;
mach_msg_port_descriptor_t task;
/* end of the kernel processed data */
NDR_record_t NDR;
exception_type_t exception;
mach_msg_type_number_t codeCnt;
mach_exception_data_t code;
/* some times RCV_TO_LARGE probs */
char pad[512];
} exc_msg;
mach_port_name_t reply_port;
kern_return_t result;
exc_msg.Head.msgh_local_port = CAST_MACH_NAME_TO_PORT(exc_set_name);
exc_msg.Head.msgh_size = sizeof (exc_msg);
#if 0
result = mach_msg_receive(&exc_msg.Head);
#else
result = mach_msg_receive(&exc_msg.Head, MACH_RCV_MSG,
sizeof (exc_msg), exc_set_name,
MACH_MSG_TIMEOUT_NONE, MACH_PORT_NULL,
0);
#endif
if (result == MACH_MSG_SUCCESS) {
reply_port = CAST_MACH_PORT_TO_NAME(exc_msg.Head.msgh_remote_port);
// 消息处理
if (mach_exc_server(&exc_msg.Head, &rep_msg.Head)) {
result = mach_msg_send(&rep_msg.Head, MACH_SEND_MSG,
sizeof (rep_msg),MACH_MSG_TIMEOUT_NONE,MACH_PORT_NULL);
if (reply_port != 0 && result != MACH_MSG_SUCCESS)
mach_port_deallocate(get_task_ipcspace(ux_handler_self), reply_port);
}
}
else if (result == MACH_RCV_TOO_LARGE)
/* ignore oversized messages */;
else
panic("exception_handler");
}
}
每一个thread、task及host自身都有一个异常端口数组,通过调用xxx_set_exception_ports()
(xxx为thread、task或host)可以设置这些异常端口。 xxx_set_exception_ports()
第四个参数为exception_behavior_t behavior
,这将会使用到与行为相匹配的实现(exc.defs 或 mach_exc.defs)。
各种行为都在host层被catch_[mach]_exception_xxx处理,64位的对应的是有mach
函数(可在/bsd/uxkern/ux_exception.c查看)。
这些函数通过调用ux_exception()
将异常转换为对应的UNIX信号,并通过threadsignal()
将信号投递到出错线程。
// EXCEPTION_DEFAULT行为 64位处理函数
kern_return_t
catch_mach_exception_raise(
__unused mach_port_t exception_port,
mach_port_t thread,
mach_port_t task,
exception_type_t exception,
mach_exception_data_t code,
__unused mach_msg_type_number_t codeCnt
)
{
task_t self = current_task();
thread_t th_act;
ipc_port_t thread_port;
struct proc *p;
kern_return_t result = MACH_MSG_SUCCESS;
int ux_signal = 0;
mach_exception_code_t ucode = 0;
struct uthread *ut;
mach_port_name_t thread_name = CAST_MACH_PORT_TO_NAME(thread);
mach_port_name_t task_name = CAST_MACH_PORT_TO_NAME(task);
/*
* Convert local thread name to global port.
*/
if (MACH_PORT_VALID(thread_name) &&
(ipc_object_copyin(get_task_ipcspace(self), thread_name,
MACH_MSG_TYPE_PORT_SEND,
(void *) &thread_port) == MACH_MSG_SUCCESS)) {
if (IPC_PORT_VALID(thread_port)) {
th_act = convert_port_to_thread(thread_port);
ipc_port_release_send(thread_port);
} else {
th_act = THREAD_NULL;
}
/*
* Catch bogus ports
*/
if (th_act != THREAD_NULL) {
/*
* Convert exception to unix signal and code.
* 调用 ux_exception将exception转成UNIX信号
*/
ux_exception(exception, code[0], code[1], &ux_signal, &ucode);
ut = get_bsdthread_info(th_act);
p = proc_findthread(th_act);
/* Can't deliver a signal without a bsd process reference */
// 如果未找到进程,那么这个信号就不会投递了
if (p == NULL) {
ux_signal = 0;
result = KERN_FAILURE;
}
/*
* Stack overflow should result in a SIGSEGV signal
* on the alternate stack.
* but we have one or more guard pages after the
* stack top, so we would get a KERN_PROTECTION_FAILURE
* exception instead of KERN_INVALID_ADDRESS, resulting in
* a SIGBUS signal.
* Detect that situation and select the correct signal.
*/
if (code[0] == KERN_PROTECTION_FAILURE &&
ux_signal == SIGBUS) {
user_addr_t sp, stack_min, stack_max;
int mask;
struct sigacts *ps;
sp = code[1];
stack_max = p->user_stack;
stack_min = p->user_stack - MAXSSIZ;
if (sp >= stack_min &&
sp < stack_max) {
/*
* This is indeed a stack overflow. Deliver a
* SIGSEGV signal.
* 因为栈溢出需返回的是SIGSEGV,这里把SIGBUS替换成SIGSEGV
*/
ux_signal = SIGSEGV;
/*
* If the thread/process is not ready to handle
* SIGSEGV on an alternate stack, force-deliver
* SIGSEGV with a SIG_DFL handler.
*/
mask = sigmask(ux_signal);
ps = p->p_sigacts;
if ((p->p_sigignore & mask) ||
(ut->uu_sigwait & mask) ||
(ut->uu_sigmask & mask) ||
(ps->ps_sigact[SIGSEGV] == SIG_IGN) ||
(! (ps->ps_sigonstack & mask))) {
p->p_sigignore &= ~mask;
p->p_sigcatch &= ~mask;
ps->ps_sigact[SIGSEGV] = SIG_DFL;
ut->uu_sigwait &= ~mask;
ut->uu_sigmask &= ~mask;
}
}
}
/*
* Send signal. 发送信号
*/
if (ux_signal != 0) {
ut->uu_exception = exception;
//ut->uu_code = code[0]; // filled in by threadsignal
ut->uu_subcode = code[1];
threadsignal(th_act, ux_signal, code[0]);
}
if (p != NULL)
proc_rele(p);
thread_deallocate(th_act);
}
else
result = KERN_INVALID_ARGUMENT;
}
else
result = KERN_INVALID_ARGUMENT;
/*
* Delete our send rights to the task port.
*/
(void)mach_port_deallocate(get_task_ipcspace(ux_handler_self), task_name);
return (result);
}
所以,如果异常是栈溢出,那么signal是SIGSEGV而不是SIGBUS;如果进程退出了或者线程/进程未准备好处理signal,所注册的signal()
是无法接收信号的。
// 位于bsd/uxkern/ux_exception.c
/*
* ux_exception translates a mach exception, code and subcode to
* a signal and u.u_code. Calls machine_exception (machine dependent)
* to attempt translation first.
*/
static
void ux_exception(
int exception,
mach_exception_code_t code,
mach_exception_subcode_t subcode,
int *ux_signal,
mach_exception_code_t *ux_code)
{
/*
* Try machine-dependent translation first.
*/
if (machine_exception(exception, code, subcode, ux_signal, ux_code))
return;
switch(exception) {
case EXC_BAD_ACCESS:
if (code == KERN_INVALID_ADDRESS)
*ux_signal = SIGSEGV;
else
*ux_signal = SIGBUS;
break;
case EXC_BAD_INSTRUCTION:
*ux_signal = SIGILL;
break;
case EXC_ARITHMETIC:
*ux_signal = SIGFPE;
break;
case EXC_EMULATION:
*ux_signal = SIGEMT;
break;
case EXC_SOFTWARE:
switch (code) {
case EXC_UNIX_BAD_SYSCALL:
*ux_signal = SIGSYS;
break;
case EXC_UNIX_BAD_PIPE:
*ux_signal = SIGPIPE;
break;
case EXC_UNIX_ABORT:
*ux_signal = SIGABRT;
break;
case EXC_SOFT_SIGNAL:
*ux_signal = SIGKILL;
break;
}
break;
case EXC_BREAKPOINT:
*ux_signal = SIGTRAP;
break;
}
}
把Mach exception 和 UNIX signal 的转换制表后,如下
用下图来展示而Mach exception转换成signal的流程(图截自 Mac OS X and iOS Internals)
实践
在Mach中,异常是通过内核中的主要设施-消息传递机制-进行处理的。一个异常与一条消息并无差别,由出错的线程或任务(通过 msg_send()
)发送,并通过一个处理程(通过 msg_recv()
)接收。
由于Mach的异常以消息机制处理而不是通过函数调用,exception messages可以被转发到先前注册的Mach exception处理程序。这意味着你可以插入一个exception处理程序,而不干扰现有的无论是调试器或Apple's crash reporter。可以使用mach_msg() // flag MACH_SEND_MSG
发送原始消息到以前注册的处理程序的Mach端口,将消息转发到一个现有的处理程序。
知道以上这些,那我们来尝试扑捉一下Mach异常
void catchMACHExceptions() {
kern_return_t rc = 0;
exception_mask_t excMask = EXC_MASK_BAD_ACCESS |
EXC_MASK_BAD_INSTRUCTION |
EXC_MASK_ARITHMETIC |
EXC_MASK_SOFTWARE |
EXC_MASK_BREAKPOINT;
rc = mach_port_allocate(mach_task_self(), MACH_PORT_RIGHT_RECEIVE, &myExceptionPort);
if (rc != KERN_SUCCESS) {
fprintf(stderr, "------->Fail to allocate exception port\\\\\\\\n");
return;
}
rc = mach_port_insert_right(mach_task_self(), myExceptionPort, myExceptionPort, MACH_MSG_TYPE_MAKE_SEND);
if (rc != KERN_SUCCESS) {
fprintf(stderr, "-------->Fail to insert right");
return;
}
rc = thread_set_exception_ports(mach_thread_self(), excMask, myExceptionPort, EXCEPTION_DEFAULT, MACHINE_THREAD_STATE);
if (rc != KERN_SUCCESS) {
fprintf(stderr, "-------->Fail to set exception\\\\\\\\n");
return;
}
// at the end of catchMachExceptions, spawn the exception handling thread
pthread_t thread;
pthread_create(&thread, NULL, exc_handler, NULL);
} // end catchMACHExceptions
static void *exc_handler(void *ignored) {
// Exception handler – runs a message loop. Refactored into a standalone function
// so as to allow easy insertion into a thread (can be in same program or different)
mach_msg_return_t rc;
fprintf(stderr, "Exc handler listening\\\\\\\\n");
// The exception message, straight from mach/exc.defs (following MIG processing) // copied here for ease of reference.
typedef struct {
mach_msg_header_t Head;
/* start of the kernel processed data */
mach_msg_body_t msgh_body;
mach_msg_port_descriptor_t thread;
mach_msg_port_descriptor_t task;
/* end of the kernel processed data */
NDR_record_t NDR;
exception_type_t exception;
mach_msg_type_number_t codeCnt;
integer_t code[2];
int flavor;
mach_msg_type_number_t old_stateCnt;
natural_t old_state[144];
} Request;
Request exc;
struct rep_msg {
mach_msg_header_t Head;
NDR_record_t NDR;
kern_return_t RetCode;
} rep_msg;
for(;;) {
// Message Loop: Block indefinitely until we get a message, which has to be
// 这里会阻塞,直到接收到exception message,或者线程被中断。
// an exception message (nothing else arrives on an exception port)
rc = mach_msg( &exc.Head,
MACH_RCV_MSG|MACH_RCV_LARGE,
0,
sizeof(Request),
myExceptionPort, // Remember this was global – that's why.
MACH_MSG_TIMEOUT_NONE,
MACH_PORT_NULL);
if(rc != MACH_MSG_SUCCESS) {
/*... */
break ;
};
// Normally we would call exc_server or other. In this example, however, we wish
// to demonstrate the message contents:
printf("Got message %d. Exception : %d Flavor: %d. Code %lld/%lld. State count is %d\\\\\\\\n" ,
exc.Head.msgh_id, exc.exception, exc.flavor,
exc.code[0], exc.code[1], // can also print as 64-bit quantity
exc.old_stateCnt);
rep_msg.Head = exc.Head;
rep_msg.NDR = exc.NDR;
rep_msg.RetCode = KERN_FAILURE;
kern_return_t result;
if (rc == MACH_MSG_SUCCESS) {
result = mach_msg(&rep_msg.Head,
MACH_SEND_MSG,
sizeof (rep_msg),
0,
MACH_PORT_NULL,
MACH_MSG_TIMEOUT_NONE,
MACH_PORT_NULL);
}
}
return NULL;
} // end exc_handler
接下来,我们测试一下。
- (void)test
{
[self test];
}
调用这个方法,结果如下
Exc handler listening
Got message 2401. Exception : 1 Flavor: 0. Code 2/1486065656. State count is 8
(lldb)
我们可以查看mach/exception_types.h
对exception type的定义
Exception: 1 ,即为EXC_BAD_ACCESS
code: 2,即KERN_PROTECTION_FAILURE
而ux_exception()
函数告诉我们Code与signal是怎么转换的。
也就是 Exception = EXC_BAD_ACCESS
, code = 2 对应的是SIGBUS
信号,又因为为是stack overflow,信号应该是SIGSEGV。
那么结这个exception就是:
Exception Type: EXC_BAD_ACCESS (SIGSEGV)
Exception Subtype: KERN_PROTECTION_FAILURE
再试试其它的:
int *pi = (int*)0x00001111;
*pi = 17;
Exc handler listening
Got message 2401. Exception : 1 Flavor: 0. Code 1/4369. State count is 8
(lldb)
Exception Type: EXC_BAD_ACCESS (SIGSEGV)
Exception Subtype: KERN_INVALID_ADDRESS
最后
除了上面硬件产生的信号,另外还有软件产生的信号。软件产生的信号来自kill()
、pthread_kill()
两个函数的调用,大概过程是这样的:kill()
/pthread_kill()
--> ...
--> psignal_internal()
--> act_set_astbsd()
。最终也会调用act_set_astbsd()
发送信号到目标线程。这意味着Mach exception流程是不会走的。
另外,在abort()源码注释着:<rdar://problem/7397932> abort() should call pthread_kill to deliver a signal to the aborting thread
, 它是这样调用的(void)pthread_kill(pthread_self(), SIGABRT);
。Apple’s Crash Reporter 把SIGABRT的Mach exception type记为EXC_CRASH,不同与上面转变表。
Exception Type: EXC_CRASH (SIGABRT)
Exception Codes: 0x0000000000000000, 0x0000000000000000
所以尽管Mach exception handle 比 UNIX signal handle 更有优势,但我们还是须要注册signal handle用于处理EXC_SOFTWARE/EXC_CRASH。
由于篇幅有限,有些细节可能并没有展现出来,如果你想有深入的了解,可以阅读下面参考文献,或者一些开源框架。最后,希望这篇文章可以帮助你了解Mach exception。
参考文献:
Apple Kernel Programming Guide
漫谈iOS Crash收集框架
Mac OS X and iOS Internals:To the Apple’s Core(中文译名:深入解析 MAC OS X & IOS 操作系统)
Mach Exception Handlers
xnu-3248.60.10.tar.gz
Understanding Crash Reports on iPhone OS(wwdc)
Understanding and Analyzing iOS Application Crash Reports