...
 
Commits (1)
  • Kirill Smelkov's avatar
    bigfile: Terminate only current thread if loading-through-mmap from file fails · f9379a1c
    Kirill Smelkov authored
    On a usual kernel, if a file is mmaped, the mapped memory is then read,
    and the underlying file implementation returns -EIO, the kernel sends
    SIGBUS to the client thread, and if that SIGBUS is not handled the whole
    client process is terminated with a coredump.
    
    In bigfile/virtmem.c until now we were doing a similar thing - if in
    vma_on_pagefault() a read request to loadblk() failed, we aborted the
    whole process.
    
    This is however not very convenient, because if there is a multithreaded
    server with each request mapped to thread, and a handling thread for
    only 1 request fails this way, we kill the whole server process.
    
    What could be convenient is to somehow propagate the error to calling
    thread, e.g. unwinding the stack in a C++-style exceptions way and
    turning that back to python exception at some point. And in the future
    we maybe could try to do it.
    
    For now we take a small step forward - we can terminate only the thread
    which caused failed loadblk() - i.e. we still kill the code, without
    providing it a way to recover, but we kill only the working thread, not
    the whole process.
    
    To test the functionality, we leverage our tfault framework, which is now
    extended to verify not only in which function a testcase dies: more
    generally, it now examines the traceback (so that we can tell from the
    coredump which thread terminated), and it also verifies the exit status
    code and the terminating signal of the dying process.
    
    NOTE on Linux it is not easy to terminate only 1 thread and produce a
    coredump for debugging and have right process exit status if e.g. main
    thread is terminated this way.
    
    The reason is Linux hardcodes termination-with-coredump to kill all
    threads from a process, and even separate processes which happen to
    share virtual memory layout with the killing thread.
    
    So to do such termination, we use hacks and workarounds - see comments
    in newly introduced __abort_thread().
    
    NOTE2 for getting separate coredump files for several faulting threads
    
        /proc/sys/kernel/core_pattern   or
        /proc/sys/kernel/core_uses_pid
    
    are your friends.
    
    /cc @Tyagov
    /cc @klaus
    f9379a1c
......@@ -178,7 +178,7 @@ FAULTS := $(shell grep '{"fault.*"' $(tfault).c | sed 's/"/ /g' |awk '{print $$2
test.fault : $(FAULTS:%=%.tfault)
%.tfault : $(tfault).t
t/tfault-run $< $* $(shell grep '{"$*"' $(tfault).c | awk '{print $$NF}')
t/tfault-run $< $* $(shell grep '{"$*"' $(tfault).c | awk '{print $$4 " " $$5 " " $$6}')
# -*- benchmarking -*-
......
......@@ -950,7 +950,7 @@ void test_pagefault_savestate()
// TODO test for loadblk that returns -1
/* NOTE tests for loadblk that returns -1 are in bigfile/tests/tfault.c */
int main()
{
......
......@@ -18,7 +18,7 @@
*
* ~~~~
*
* All tests here end up crashing via segmentation violation. The calling
* Most tests here end up crashing via segmentation violation. The calling
* driver verifies test output prior to crash and that the crash happened in
* the right place.
*
......@@ -36,6 +36,7 @@
#include <ccan/array_size/array_size.h>
#include <stdio.h>
#include <string.h>
#include <pthread.h>
#include "../../t/t_utils.h"
......@@ -178,17 +179,114 @@ void fault_in_storeblk()
/* BigFile whose .loadblk() always returns an error
 *
 * err_loadblk ignores all of its arguments and unconditionally reports
 * failure by returning -1.
 */
int err_loadblk(BigFile *file, blk_t blk, void *buf)
{
    /* nothing is loaded - arguments are intentionally unused */
    (void)file;
    (void)blk;
    (void)buf;

    return -1;
}
/* file operations for a BigFile that cannot be loaded:
 * only .loadblk is set, and it points to err_loadblk. */
const struct bigfile_ops err_ops = {
.loadblk = err_loadblk,
};
/* loadblk error in main thread -> full abort
 *
 * Opens and mmaps a BigFile backed by err_ops and then touches page[0]
 * of the mapping from the calling thread.
 */
void abort_loadblkerr_t0()
{
    diag("testing loadblk error in main thread");

    // XXX save/restore sigaction ?
    ok1(!pagefault_init());

    RAM *ram = ram_new(NULL,NULL);
    ok1(ram);

    /* file with block size of one RAM page, whose loadblk always fails */
    BigFile f = {
        .blksize    = ram->pagesize,
        .file_ops   = &err_ops,
    };

    BigFileH fh;
    int err = fileh_open(&fh, &f, ram);
    ok1(!err);

    VMA vma_struct, *vma = &vma_struct;
    err = fileh_mmap(vma, &fh, 0, 2);
    ok1(!err);

    /* touch page[0] - should abort whole process because loadblk() returns -1 */
    prefault();
    b(vma, 0);
}
/* body of the second thread started by abort_loadblkerr_t1
 *
 * arg is the VMA to touch. Touching page[0] is expected to terminate this
 * thread; if execution continues past the access, the whole process is
 * aborted so that the failure is noticed.
 *
 * NOTE this is a plain file-scope function, not a GNU C nested function:
 * taking the address of a nested function requires an executable-stack
 * trampoline and is not supported by all compilers.
 *
 * NOTE the name `__t1` is kept on purpose - it is part of the traceback
 * expected for "fault_loadblkerr_t1" in tests[].
 */
static void *__t1(void *arg)
{
    VMA *vma = arg;

    /* touch page[0] - should abort only this thread because loadblk() returns -1 */
    prefault();
    b(vma, 0);

    /* should not get here - abort whole process */
    abort();
}

/* loadblk error in second thread -> abort only that thread, main continues to run
 *
 * Opens and mmaps a BigFile backed by err_ops, lets a second thread touch
 * page[0] of the mapping, joins that thread and exits with status 0.
 */
void abort_loadblkerr_t1()
{
    RAM *ram;
    BigFileH fh;
    VMA vma_struct, *vma = &vma_struct;
    int err;
    pthread_t t1;

    diag("testing loadblk error in second thread");

    // XXX save/restore sigaction ?
    ok1(!pagefault_init());

    ram = ram_new(NULL,NULL);
    ok1(ram);

    BigFile f = {
        .blksize    = ram->pagesize,
        .file_ops   = &err_ops,
    };

    err = fileh_open(&fh, &f, ram);
    ok1(!err);

    err = fileh_mmap(vma, &fh, 0, 2);
    ok1(!err);

    /* vma lives on this thread's stack; it stays valid for t1 because we
     * join t1 before leaving this function */
    err = pthread_create(&t1, NULL, __t1, vma);
    ok1(!err);

    /* but main thread stays alive */
    err = pthread_join(t1, NULL);
    ok1(!err);

    diag("I: main thread is still alive");
    exit(0);
}
/* table of fault testcases: testcase name -> function that runs it.
 *
 * NOTE the trailing `//` comment on each entry is also machine-read: the
 * Makefile greps entry lines out of the tfault source and passes
 * whitespace-separated fields of that line to t/tfault-run (expected
 * traceback, terminating signal and, optionally, exit code). Keep every
 * entry on one line and keep the order of fields in its comment.
 */
static const struct {
const char *name;   /* testcase name */
void (*test)(void); /* function implementing the testcase */
} tests[] = {
// XXX fragile - test names must start exactly with `{"fault` - Makefile extracts them this way
// name func-where-it-dies
{"faultr", fault_read}, // on_pagefault
{"faultw", fault_write}, // on_pagefault
{"fault_loadblk", fault_in_loadblk}, // faulty_loadblk
{"fault_storeblk", fault_in_storeblk}, // faulty_storeblk
// name mustdie traceback signal
{"faultr", fault_read}, // on_pagefault SIGSEGV
{"faultw", fault_write}, // on_pagefault SIGSEGV
{"fault_loadblk", fault_in_loadblk}, // faulty_loadblk SIGSEGV
{"fault_storeblk", fault_in_storeblk}, // faulty_storeblk
{"fault_loadblkerr_t0", abort_loadblkerr_t0}, // __GI_raise,__GI_abort,__abort_thread,vma_on_pagefault,on_pagefault,sighandler,abort_loadblkerr_t0 SIGABRT
{"fault_loadblkerr_t1", abort_loadblkerr_t1}, // __GI_raise,__GI_abort,__abort_thread,vma_on_pagefault,on_pagefault,sighandler,__t1 SIGABRT 0
};
int main(int argc, char *argv[])
......
......@@ -505,9 +505,12 @@ void vma_on_pagefault(VMA *vma, uintptr_t addr, int write)
* that it can abort current transaction, but not die.
*
* NOTE for analogue situation when read for mmaped file fails, the
* kernel sends SIGBUS
* kernel sends SIGBUS, which if not handled, terminates whole process.
*
* For now we just terminate current thread.
*/
TODO (err);
if (err)
ABORT_THREAD();
xmunmap(pageram, page_size(page));
......
......@@ -51,6 +51,10 @@
#define BUG_ON(expr) ASSERT(!(expr))
/* like abort() but aborts only current thread (or whole process if current thread is main)
 *
 * Expands to a call of __abort_thread() with the file, line and function
 * name of the place where the macro is used.
 */
#define ABORT_THREAD() \
__abort_thread(__FILE__, __LINE__, __func__)
void __todo(const char *, const char *, unsigned, const char *)
__attribute__((noreturn));
......@@ -61,6 +65,8 @@ void __bug_errno(const char *, unsigned, const char *)
__attribute__((noreturn));
void __bug_fail(const char *, const char *, unsigned, const char *)
__attribute__((noreturn));
/* backend of ABORT_THREAD(): file, line and func describe the call site;
 * declared as never returning to the caller */
void __abort_thread(const char *file, unsigned line, const char *func)
__attribute__((noreturn));
#endif
......@@ -23,8 +23,13 @@
#include <wendelin/bug.h>
#include <sys/types.h>
#include <sys/wait.h>
#include <sys/syscall.h>
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include <pthread.h>
#include <string.h>
#include <errno.h>
......@@ -65,3 +70,85 @@ void __todo(const char *expr, const char *file, unsigned line, const char *func)
fprintf(stderr, "%s:%u %s\tTODO %s\n", file, line, func, expr);
abort();
}
/* __abort_thread terminates the calling thread, reporting where it was called from.
 *
 * file, line and func identify the call site; they are printed to stderr
 * together with the thread-group id and thread id of the caller.
 *
 * - if called from the main thread, the whole process is terminated via abort();
 * - otherwise a forked child is aborted, so that a coredump can be produced,
 *   and then only the calling thread is terminated via pthread_exit().
 *
 * The function never returns.
 */
void __abort_thread(const char *file, unsigned line, const char *func)
{
    pid_t tgid = getpid();              /* thread-group id of current thread */
    pid_t tid  = syscall(SYS_gettid);   /* thread id of current thread */
    int main_thread = (tgid == tid);
    pid_t pid;

    fprintf(stderr, "%s:%u %s\tABORT_THREAD %i/%i%s\n", file, line, func, tgid, tid, main_thread ? " (main)" : "");

    /* if it is main thread - terminate whole process -
     * - it is logical and this way we get proper exit code */
    if (main_thread)
        abort();

    /* else try to produce coredump, terminate current thread, but do not kill the whole process
     *
     * ( on Linux, if a thread gets fatal signal and does not handle it, whole
     *   thread-group is terminated by kernel, after dumping core. OTOH, there is
     *   no other way to make the kernel dump core of us than to get a fatal
     *   signal without handling it.
     *
     *   What could work, is to first remove current thread from its
     *   thread-group, and then do usual abort(3) which is ~ raise(SIGABRT),
     *   but such leaving-thread-group functionality is non-existent as per linux-v4.1.
     *
     *   NOTE Once sys_ungroup(2) system call was mentioned as being handy long
     *   ago, but it is not implemented anywhere:
     *
     *   https://git.kernel.org/cgit/linux/kernel/git/history/history.git/commit/?id=63540cea
     *   https://lkml.org/lkml/2002/9/15/125 )
     *
     * ~~~~
     *
     * vfork is ~ clone(CLONE_VFORK | CLONE_VM) without CLONE_THREAD.
     *
     * - without CLONE_THREAD means the child will leave current thread-group
     * - CLONE_VM means it will share memory
     * - CLONE_VFORK means current thread will pause before clone finishes
     *
     * so it looks all we have to do to get a coredump and terminate only
     * current thread is vfork + abort in clone + pthread_exit in current.
     *
     * But it is not so - because on coredumping, Linux terminates all
     * processes who share mm with terminating process, not only processes from
     * thread group, and it was done on purpose:
     *
     *   https://git.kernel.org/cgit/linux/kernel/git/history/history.git/commit/?id=d89f3847
     *   ("properly wait for all threads that share the same MM ...")
     *
     *
     * So the only thing we are left to do, is to do usual fork() and coredump
     * from forked child.
     */
    pid = fork();
    if (pid == -1) {
        /* fork failed for some reason */
        BUGe();
    }
    else if (!pid) {
        /* child - abort it - this way we can get coredump.
         * NOTE it does not affect parent */
        abort();
    }
    else {
        /* forked ok - wait for child to abort and exit current thread */
        int status;
        pid_t waited;

        /* retry if the wait is interrupted by a signal: EINTR is not a
         * failure of the child, and treating it as a bug would kill the
         * whole process - exactly what we are trying to avoid here */
        do {
            waited = waitpid(pid, &status, 0);
        } while (waited == -1 && errno == EINTR);

        if (waited == -1)
            BUGe();

        ASSERT(waited == pid);          /* waitpid can only return for child */
        ASSERT(WIFSIGNALED(status));    /* child must have been terminated by a signal (SIGABRT from abort) */

        /* now we know child terminated the way we wanted it to terminate
         * -> we can exit current thread */
        pthread_exit(NULL);
    }
}
#!/bin/bash -e
# NOTE bash is required: the script relies on ${PIPESTATUS[0]} and on `test ... == ...`,
# which are not available in a plain POSIX sh such as dash.
# tfault-run <tfault> <arg> <mustdie>
# tfault-run <tfault> <arg> <mustdie> <signal> [exitcode]
# run `<tfault> <arg>` and verify that it produces correct coredump, dying for
# SIGSEGV in function <mustdie>.
# <signal> with traceback <mustdie>.
#
# NOTE mustdie is generally func1,func2,func3,... - traceback tail is checked to be func1<-func2<-func3
# NOTE if exitcode not present - it is checked to be non-zero
# die <msg> ...
die() {
......@@ -12,6 +15,8 @@ die() {
tfault=$(realpath $1)
arg=$2
mustdie=$3
signal=$4
exitcode=$5
# XXX ok to hardcode t/ ?
workdir=t/tfault-run.$arg
......@@ -22,21 +27,38 @@ cd "$workdir"
ulimit -c unlimited
$tfault $arg 2>&1 |tee run.out
st="${PIPESTATUS[0]}"
if test -z "$exitcode"; then
test $st != 0 || die "test exit code is 0"
else
test $st == $exitcode || die "test exit code unexpected ($st != $exitcode)"
fi
grep -q "^# going to fault" run.out || die "test didn't run to faulting point"
test -e core || die "no core after run"
gdb -q -batch $tfault core >core.info || die "can't gdb(core)"
grep -q "Program terminated with signal SIGSEGV, Segmentation fault." core.info || die "not SIGSEGV"
gdb -q -batch $tfault core >core.info || die "can't gdb-info(core)"
grep -q "Program terminated with signal $signal" core.info || die "not $signal"
gdb -batch-silent \
-ex "set logging file /dev/stdout" \
-ex "set logging on" \
-ex bt $tfault core >core.bt || die "can't gdb-bt(core)"
# #0 0x00000000004031ae in doublefault_loadblk (file=0x7fff0f25d9c0, blk=0, buf=0x7ff85a553000) at t/tfault.c:93
diefunc=$(grep '^#0' core.info | awk '{print $4}')
test -n "$diefunc" || die "can't extract diefunc"
# verify mustdie
mustdie_depth=`echo $mustdie | sed 's/,/ /g' | wc -w`
diebt=
for nframe in `seq 0 $(($mustdie_depth - 1))`; do
# #0 0x00000000004031ae in doublefault_loadblk (file=0x7fff0f25d9c0, blk=0, buf=0x7ff85a553000) at t/tfault.c:93
# #5 <signal handler called>
framefunc=$(grep "^#$nframe" core.bt | awk '{print $4}')
test "$framefunc" = "called>" && framefunc=sighandler # <signal handler called> -> sighandler
diebt=$diebt${diebt:+,}$framefunc
done
test "$diefunc" == "$mustdie" || die "must die in $mustdie, died in $diefunc"
test "$diebt" == "$mustdie" || die "must die via $mustdie, died via $diebt"
# run ok - cleanup
cd "$cwd"
rm -rf "$workdir"
echo "ok - crashed OK (in $diefunc)"
echo "ok - crashed OK (via $diebt)"