我最近编写了一个信号处理程序,它使用 execinfo.h 中的 backtrace()。它在 macOS 上运行良好,但在 Linux(Ubuntu/Debian)上运行时,会无限期地等待一个锁。不确定这是否有帮助:我的多线程程序(pthread)使用 rocksdb 来存储数据,并且我故意在 rocksdb 中保留了一个段错误,以便在 rocksdb 一侧出现问题时测试我的信号处理程序,但我一直没能调试出它为什么会卡在等待锁上。
这是我在 gdb 上得到的堆栈跟踪:
#0 futex_wait (private=0, expected=2, futex_word=0x77e088a1ac80 <main_arena>) at ../sysdeps/nptl/futex-internal.h:146
#1 __GI___lll_lock_wait_private (futex=futex@entry=0x77e088a1ac80 <main_arena>) at ./nptl/lowlevellock.c:34
#2 0x000077e0888a53c8 in __GI___libc_malloc (bytes=408) at ./malloc/malloc.c:3327
#3 0x000077e088c024a3 in malloc (size=408) at ../include/rtld-malloc.h:56
#4 _dl_scope_free (old=old@entry=0x5660325219f0) at ./elf/dl-scope.c:34
#5 0x000077e088bf3308 in _dl_map_object_deps (map=map@entry=0x566032520dc0, preloads=preloads@entry=0x0, npreloads=npreloads@entry=0,
trace_mode=trace_mode@entry=0, open_mode=open_mode@entry=-2147483648) at ./elf/dl-deps.c:635
#6 0x000077e088bfda0f in dl_open_worker_begin (a=a@entry=0x7fff4a7a5010) at ./elf/dl-open.c:592
#7 0x000077e088974a98 in __GI__dl_catch_exception (exception=exception@entry=0x7fff4a7a4e70, operate=operate@entry=0x77e088bfd900 <dl_open_worker_begin>,
args=args@entry=0x7fff4a7a5010) at ./elf/dl-error-skeleton.c:208
#8 0x000077e088bfcf9a in dl_open_worker (a=a@entry=0x7fff4a7a5010) at ./elf/dl-open.c:782
#9 0x000077e088974a98 in __GI__dl_catch_exception (exception=exception@entry=0x7fff4a7a4ff0, operate=operate@entry=0x77e088bfcf60 <dl_open_worker>,
args=args@entry=0x7fff4a7a5010) at ./elf/dl-error-skeleton.c:208
#10 0x000077e088bfd34e in _dl_open (file=<optimized out>, mode=-2147483646, caller_dlopen=0x77e088925611 <__GI___libc_unwind_link_get+81>, nsid=-2, argc=3,
argv=<optimized out>, env=0x5660324f9fe0) at ./elf/dl-open.c:883
#11 0x000077e088974e01 in do_dlopen (ptr=ptr@entry=0x7fff4a7a5240) at ./elf/dl-libc.c:95
#12 0x000077e088974a98 in __GI__dl_catch_exception (exception=exception@entry=0x7fff4a7a51e0, operate=<optimized out>, args=<optimized out>)
at ./elf/dl-error-skeleton.c:208
#13 0x000077e088974b63 in __GI__dl_catch_error (objname=0x7fff4a7a5230, errstring=0x7fff4a7a5238, mallocedp=0x7fff4a7a522f, operate=<optimized out>,
args=<optimized out>) at ./elf/dl-error-skeleton.c:227
#14 0x000077e088974f37 in dlerror_run (args=0x7fff4a7a5240, operate=0x77e088974dc0 <do_dlopen>) at ./elf/dl-libc.c:45
#15 __libc_dlopen_mode (name=name@entry=0x77e0889db527 "libgcc_s.so.1", mode=mode@entry=-2147483646) at ./elf/dl-libc.c:162
#16 0x000077e088925611 in __GI___libc_unwind_link_get () at ./misc/unwind-link.c:50
#17 __GI___libc_unwind_link_get () at ./misc/unwind-link.c:40
#18 0x000077e088933b77 in __GI___backtrace (array=array@entry=0x77e088af0000 <backtrace_frames>, size=size@entry=1) at ./debug/backtrace.c:69
#19 0x000077e088a65f92 in dumpBackTrace () at my_faultHandler.c:366
#20 0x000077e088a66027 in faultHandler (signo=6) at my_faultHandler.c:344
#21 <signal handler called>
#22 __pthread_kill_implementation (no_tid=0, signo=6, threadid=131806249563968) at ./nptl/pthread_kill.c:44
#23 __pthread_kill_internal (signo=6, threadid=131806249563968) at ./nptl/pthread_kill.c:78
#24 __GI___pthread_kill (threadid=131806249563968, signo=signo@entry=6) at ./nptl/pthread_kill.c:89
#25 0x000077e088842476 in __GI_raise (sig=sig@entry=6) at ../sysdeps/posix/raise.c:26
#26 0x000077e0888287f3 in __GI_abort () at ./stdlib/abort.c:79
#27 0x000077e088889676 in __libc_message (action=action@entry=do_abort, fmt=fmt@entry=0x77e0889dbb77 "%s\n") at ../sysdeps/posix/libc_fatal.c:155
#28 0x000077e0888a0cfc in malloc_printerr (str=str@entry=0x77e0889de5b8 "malloc_consolidate(): unaligned fastbin chunk detected") at ./malloc/malloc.c:5664
#29 0x000077e0888a198c in malloc_consolidate (av=av@entry=0x77e088a1ac80 <main_arena>) at ./malloc/malloc.c:4750
#30 0x000077e0888a3bdb in _int_malloc (av=av@entry=0x77e088a1ac80 <main_arena>, bytes=bytes@entry=32816) at ./malloc/malloc.c:3965
#31 0x000077e0888a5139 in __GI___libc_malloc (bytes=bytes@entry=32816) at ./malloc/malloc.c:3329
#32 0x000077e0888e630b in __alloc_dir (statp=0x7fff4a7a5ec0, flags=0, close_fd=true, fd=39) at ../sysdeps/unix/sysv/linux/opendir.c:115
#33 opendir_tail (fd=39) at ../sysdeps/unix/sysv/linux/opendir.c:63
#34 __opendir (name=<optimized out>) at ../sysdeps/unix/sysv/linux/opendir.c:86
#35 0x000077e087f93748 in rocksdb::(anonymous namespace)::PosixEnv::GetChildren (this=<optimized out>,
dir="/home/dummy/rocks", result=0x7fff4a7a6080)
at /usr/include/c++/9/bits/basic_string.h:2309
#36 0x000077e087eeaae0 in rocksdb::DBImpl::FindObsoleteFiles (this=this@entry=0x56603283fc40, job_context=job_context@entry=0x7fff4a7a6180, force=force@entry=true,
no_full_scan=no_full_scan@entry=false) at db/db_impl_files.cc:200
#37 0x000077e087eccfd3 in rocksdb::DBImpl::~DBImpl (this=0x56603283fc40, __in_chrg=<optimized out>) at db/db_impl.cc:308
#38 0x000077e087ecd3f6 in rocksdb::DBImpl::~DBImpl (this=0x56603283fc40, __in_chrg=<optimized out>) at db/db_impl.cc:357
#39 0x000077e087e66e9d in rocksdb_close (db=0x5660328a2b20) at db/c.cc:627
信号处理程序代码:
void RegisterFaultHandler(void)
{
struct sigaction bt_action;
sigemptyset(&bt_action.sa_mask);
bt_action.sa_handler = &faultHandler;
bt_action.sa_flags = SA_RESTART | SA_ONSTACK;
if (sigaction(SIGSEGV, &bt_action, prev_action + SIGSEGV) || sigaction(SIGBUS, &bt_action, prev_action + SIGBUS) ||
sigaction(SIGILL, &bt_action, prev_action + SIGILL) || sigaction(SIGABRT, &bt_action, prev_action + SIGABRT) ||
sigaction(SIGFPE, &bt_action, prev_action + SIGFPE) || sigaction(SIGSYS, &bt_action, prev_action + SIGSYS))
{
int savedErrno = errno;
exit(1);
}
}
/*
 * Put back, for every fatal signal, the disposition stored in prev_action.
 * Return values of sigaction() are not checked.
 */
static void unRegisterFaultHandler()
{
    static const int fatal_signals[] = {SIGSEGV, SIGBUS, SIGILL, SIGABRT, SIGFPE, SIGSYS};
    const int signal_count = (int)(sizeof fatal_signals / sizeof fatal_signals[0]);

    for (int idx = 0; idx < signal_count; idx++)
    {
        const int signo = fatal_signals[idx];
        sigaction(signo, &prev_action[signo], NULL);
    }
}
/*
 * Handler for the fatal signals: restores the previously saved
 * dispositions, writes a backtrace, then raises the same signal again.
 *
 * signo: number of the signal being handled.
 */
static void faultHandler(int signo)
{
    /* Restore the saved handlers first, so that the signal raised below is
     * handled by them and not by this function. */
    unRegisterFaultHandler();

    dumpBackTrace();

    /* Hand the signal over to whichever disposition was just restored. */
    (void)raise(signo);
}
/*
 * Write the current call stack (at most MAX_FRAMES frames) to the backtrace
 * file.
 *
 * The file descriptor comes from openBackTraceFile() (per the original
 * author's note, a thin wrapper around the open() system call). Frames are
 * captured with backtrace() and written with backtrace_symbols_fd(). If the
 * file cannot be opened, a short message is written to stderr instead.
 *
 * NOTE(review): this function is called from a signal handler. The first
 * call to backtrace() in a process loads libgcc dynamically, which may call
 * malloc(); callers should make sure backtrace() has been called once in
 * normal context (e.g. at startup) before this can run in signal context.
 */
static void dumpBackTrace()
{
    enum { MAX_FRAMES = 10 };

    int bt_fd = openBackTraceFile();
    if (bt_fd < 0)
    {
        static const char error[] = "Cannot open backtrace file\n";
        /* sizeof counts the terminating NUL; do not send it to stderr. */
        (void)write(STDERR_FILENO, error, sizeof(error) - 1);
        return;
    }

    /* Static storage keeps the frame buffer off the handler's stack. */
    static void *backtrace_frames[MAX_FRAMES];
    int size = backtrace(backtrace_frames, MAX_FRAMES);
    if (size > 0)
    {
        backtrace_symbols_fd(backtrace_frames, size, bt_fd);
    }
    close(bt_fd);
}
我知道在第 3 帧中调用 malloc 可能是原因,因为它不安全,但我不知道如何解决这个问题。尝试在互联网上搜索答案,我只能找到 malloc 部分。如果您需要更多信息,请告诉我。
编辑-1: 只有当段错误发生在 rocksdb 一侧时,我才会遇到这个问题;当段错误发生在我自己的程序中时则不会出现。我认为这可能是因为 malloc_consolidate 中发生了错误,而 backtrace 自身又再次调用了 malloc。
您的信号处理程序从根本上就是有问题的——它不是异步信号安全的。
C 标准规定:标准库中的函数不保证可重入,并且可能会修改具有静态或线程存储期的对象。[188]
注意脚注 188 的内容:
- 因此,信号处理程序通常不能调用标准库函数。
根据 POSIX 7 的“信号概念”一节和 Linux
signal-safety
手册页,只有一组有限的“异步信号安全”函数可以从信号处理程序中调用。
不在这些列表中的函数都无法从信号处理程序中安全地调用。
在您的回溯中,特别值得注意的是这一行:
#3 0x000077e088c024a3 in malloc (size=408) at ../include/rtld-malloc.h:56
malloc()
不在任何异步信号安全函数列表中,从信号处理程序中调用 malloc()
(即使是间接调用)也是不安全的,并且可能会导致死锁等问题。
根据 glibc 文档,
backtrace()
是异步信号不安全的,原因有多种:
函数:int backtrace (void **buffer, int size) 初步:| MT-Safe |
AS-Unsafe init heap dlopen plugin lock | AC-Unsafe init mem lock fd | 请参阅 POSIX 安全概念
通用解决方案
通常,您可以使用
sigprocmask(SIG_BLOCK, ...)
在除一个专用线程之外的所有线程中阻塞相关信号的传递,该专用线程可以在
sigwaitinfo()
中等待信号,从而在非异步上下文中处理它们。
SEGV 解决方案
但这对
SIGSEGV
来说无论如何都不起作用,因为它是同步传递给出错线程的。因此,您仍然需要一个常规的信号处理程序,但它需要一种异步信号安全的方式来与其他(非异步)上下文通信,然后可能要休眠,直到该上下文处理完成。
这是完全可行的,只是对
backtrace()
的调用现在会发生在另一个线程中,而您并不关心那个线程的调用堆栈。
只需转储核心
如果使用 C++23 的
std::basic_stacktrace
并为其提供一个异步信号安全的分配器,那么这可能可行。
如果你不能使用C++23或者你的编译器还不支持它,你仍然可以使用之前的。 但是,请考虑关于 Boost 库中该主题的