Commit f9379a1c authored by Kirill Smelkov's avatar Kirill Smelkov

bigfile: Terminate only current thread if loading-through-mmap from file fails

On a usual kernel, if a file is mmaped, its memory is then read, and the
underlying file implementation returns -EIO, the kernel sends SIGBUS to the
client thread, and if that SIGBUS is not handled the whole client process
is terminated with a coredump.

In bigfile/virtmem.c until now we were doing a similar thing - if in
vma_on_pagefault() a read request to loadblk() failed - we aborted the
whole process.

This is however not very convenient, because if there is a multithreaded
server with each request mapped to thread, and a handling thread for
only 1 request fails this way, we kill the whole server process.

What could be convenient is to somehow propagate the error to calling
thread, e.g. unwinding the stack in a C++-style exceptions way and
turning that back to python exception at some point. And in the future
we maybe could try to do it.

For now we take a small step forward - we can terminate only the thread
which caused failed loadblk() - i.e. we still kill the code, without
providing it a way to recover, but we kill only the working thread, not
the whole process.

To test the functionality, we leverage our tfault framework, which is now
extended to verify not only in which function a testcase dies, but more
generally to examine the whole traceback (so that we can tell from the
coredump which thread terminated), and which now also verifies the exit
status code and the terminating signal of the dying process.

NOTE on Linux it is not easy to terminate only 1 thread and produce a
coredump for debugging and have right process exit status if e.g. main
thread is terminated this way.

The reason is that Linux hardcodes termination-with-coredump to kill all
threads of a process, and even separate processes which happen to
share their virtual memory with the dying thread.

So to do such termination, we use hacks and workarounds - see comments
in newly introduced __abort_thread().

NOTE2 for getting separate coredump files for several faulting threads

    /proc/sys/kernel/core_pattern   or
    /proc/sys/kernel/core_uses_pid

are your friends.

/cc @Tyagov
/cc @klaus
parent de3fdb85
......@@ -178,7 +178,7 @@ FAULTS := $(shell grep '{"fault.*"' $(tfault).c | sed 's/"/ /g' |awk '{print $$2
test.fault : $(FAULTS:%=%.tfault)
# Run a single fault test through t/tfault-run.
#
# The expected traceback, terminating signal and optional exit code are taken
# from the comment which follows the test's entry in $(tfault).c (awk fields
# 4, 5 and 6 of the matching `{"<name>", <func>}, // ...` line).
%.tfault : $(tfault).t
	t/tfault-run $< $* $(shell grep '{"$*"' $(tfault).c | awk '{print $$4 " " $$5 " " $$6}')
# -*- benchmarking -*-
......
......@@ -950,7 +950,7 @@ void test_pagefault_savestate()
// TODO test for loadblk that returns -1
/* NOTE tests for loadblk that returns -1 are in bigfile/tests/tfault.c */
int main()
{
......
......@@ -18,7 +18,7 @@
*
* ~~~~
*
* All tests here end up crashing via segmentation violation. The calling
* Most tests here end up crashing via segmentation violation. The calling
* driver verifies test output prior to crash and that the crash happened in
* the right place.
*
......@@ -36,6 +36,7 @@
#include <ccan/array_size/array_size.h>
#include <stdio.h>
#include <string.h>
#include <pthread.h>
#include "../../t/t_utils.h"
......@@ -178,17 +179,114 @@ void fault_in_storeblk()
/* loadblk implementation for a BigFile whose every block load fails.
 *
 * All arguments are ignored; the function unconditionally reports an error
 * by returning -1. */
int err_loadblk(BigFile *file, blk_t blk, void *buf)
{
    /* nothing is read and nothing is written into buf - arguments are unused on purpose */
    (void)file;
    (void)blk;
    (void)buf;

    return -1;
}
/* file operations whose .loadblk is err_loadblk, i.e. always fails with -1;
 * no other operation is set here */
const struct bigfile_ops err_ops = {
.loadblk = err_loadblk,
};
/* loadblk error in main thread -> full abort
 *
 * Maps a file backed by err_ops (loadblk always returns -1) and touches its
 * first page from the calling thread. The test is expected not to return:
 * per its entry in tests[] it must die with SIGABRT via __abort_thread. */
void abort_loadblkerr_t0()
{
RAM *ram;
BigFileH fh;
VMA vma_struct, *vma = &vma_struct;
int err;
diag("testing loadblk error in main thread");
// XXX save/restore sigaction ?
ok1(!pagefault_init());
ram = ram_new(NULL,NULL);
ok1(ram);
/* file with block size equal to RAM page size and with always-failing loadblk */
BigFile f = {
.blksize = ram->pagesize,
.file_ops = &err_ops,
};
err = fileh_open(&fh, &f, ram);
ok1(!err);
/* NOTE(review): arguments 0, 2 are presumably the page offset and the mapping
 * length in pages - confirm against fileh_mmap() */
err = fileh_mmap(vma, &fh, 0, 2);
ok1(!err);
/* touch page[0] - should abort whole process because loadblk() returns -1 */
/* prefault() presumably announces the upcoming fault in the test output
 * (tfault-run greps for "# going to fault") - defined elsewhere */
prefault();
b(vma, 0);
}
/* loadblk error in second thread -> abort only that thread, main continues to run
 *
 * Same setup as for the main-thread variant (file backed by err_ops), but the
 * page is touched from a separately created thread. The main thread joins
 * that thread and then exits the process with status 0. */
void abort_loadblkerr_t1()
{
RAM *ram;
BigFileH fh;
VMA vma_struct, *vma = &vma_struct;
int err;
pthread_t t1;
diag("testing loadblk error in second thread");
// XXX save/restore sigaction ?
ok1(!pagefault_init());
ram = ram_new(NULL,NULL);
ok1(ram);
/* file with block size equal to RAM page size and with always-failing loadblk */
BigFile f = {
.blksize = ram->pagesize,
.file_ops = &err_ops,
};
err = fileh_open(&fh, &f, ram);
ok1(!err);
err = fileh_mmap(vma, &fh, 0, 2);
ok1(!err);
/* thread body; nested function (GCC extension) so that it can use `vma` of
 * the enclosing function. NOTE its name is part of the expected traceback
 * listed in tests[] - do not rename */
void *__t1(void *arg)
{
/* touch page[0] - should abort t1 thread because loadblk() returns -1 */
prefault();
b(vma, 0);
/* should not get here - abort whole process */
abort();
}
err = pthread_create(&t1, NULL, __t1, NULL);
ok1(!err);
/* but main thread stays alive */
err = pthread_join(t1, NULL);
ok1(!err);
diag("I: main thread is still alive");
/* exit status 0 is what tests[] lists as expected exit code for this test */
exit(0);
}
/* Table of fault tests: test name -> function which implements it.
 *
 * NOTE the trailing comment of every entry is not only documentation - the
 * Makefile extracts it with grep/awk and passes it to t/tfault-run:
 *
 *   - mustdie traceback: comma-separated innermost frames of expected backtrace,
 *   - signal:            signal the coredump must report,
 *   - exitcode:          (optional) expected exit status; non-zero is expected if absent.
 *
 * Every test must therefore be listed exactly once, on one line, with fields
 * in this order. */
static const struct {
    const char *name;
    void (*test)(void);
} tests[] = {
    // XXX fragile - test names must start exactly with `{"fault` - Makefile extracts them this way
    // name                                         mustdie traceback   signal  [exitcode]
    {"faultr",              fault_read},            // on_pagefault     SIGSEGV
    {"faultw",              fault_write},           // on_pagefault     SIGSEGV
    {"fault_loadblk",       fault_in_loadblk},      // faulty_loadblk   SIGSEGV
    {"fault_storeblk",      fault_in_storeblk},     // faulty_storeblk  SIGSEGV
    {"fault_loadblkerr_t0", abort_loadblkerr_t0},   // __GI_raise,__GI_abort,__abort_thread,vma_on_pagefault,on_pagefault,sighandler,abort_loadblkerr_t0 SIGABRT
    {"fault_loadblkerr_t1", abort_loadblkerr_t1},   // __GI_raise,__GI_abort,__abort_thread,vma_on_pagefault,on_pagefault,sighandler,__t1 SIGABRT 0
};
int main(int argc, char *argv[])
......
......@@ -505,9 +505,12 @@ void vma_on_pagefault(VMA *vma, uintptr_t addr, int write)
* that it can abort current transaction, but not die.
*
* NOTE for analogue situation when read for mmaped file fails, the
* kernel sends SIGBUS
* kernel sends SIGBUS, which if not handled, terminates whole process.
*
* For now we just terminate current thread.
*/
TODO (err);
if (err)
ABORT_THREAD();
xmunmap(pageram, page_size(page));
......
......@@ -51,6 +51,10 @@
/* BUG_ON(expr) asserts that expr is false - it expands to ASSERT(!(expr)) */
#define BUG_ON(expr) ASSERT(!(expr))
/* like abort() but aborts only current thread (or whole process if current thread is main)
 *
 * Expands to a call of __abort_thread() with the file, line and function name
 * of the place where the macro is used. */
#define ABORT_THREAD() \
__abort_thread(__FILE__, __LINE__, __func__)
/* __todo prints a "<file>:<line> <func>\tTODO <expr>" message to stderr and
 * aborts the process; it never returns.
 *
 * Arguments, in order: expr, file, line, func. */
void __todo(const char *, const char *, unsigned, const char *)
__attribute__((noreturn));
......@@ -61,6 +65,8 @@ void __bug_errno(const char *, unsigned, const char *)
__attribute__((noreturn));
void __bug_fail(const char *, const char *, unsigned, const char *)
__attribute__((noreturn));
/* __abort_thread terminates the calling thread; it never returns.
 *
 * It prints an ABORT_THREAD diagnostic with file/line/func to stderr. If the
 * caller is the main thread the whole process is abort()ed; otherwise a forked
 * child is aborted (to get a coredump) and only the calling thread exits.
 *
 * Normally used via ABORT_THREAD(). */
void __abort_thread(const char *file, unsigned line, const char *func)
__attribute__((noreturn));
#endif
......@@ -23,8 +23,13 @@
#include <wendelin/bug.h>
#include <sys/types.h>
#include <sys/wait.h>
#include <sys/syscall.h>
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include <pthread.h>
#include <string.h>
#include <errno.h>
......@@ -65,3 +70,85 @@ void __todo(const char *expr, const char *file, unsigned line, const char *func)
fprintf(stderr, "%s:%u %s\tTODO %s\n", file, line, func, expr);
abort();
}
/* __abort_thread terminates the calling thread after a fatal error.
 *
 * file, line, func - location of the caller; they are only printed in the
 *                    diagnostic written to stderr.
 *
 * If the calling thread is the main thread, the whole process is terminated
 * via abort(). Otherwise a coredump is produced by abort()ing a forked child,
 * and then only the calling thread is exited via pthread_exit() - the other
 * threads of the process keep running.
 *
 * Never returns. */
void __abort_thread(const char *file, unsigned line, const char *func)
{
    pid_t tgid = getpid();              /* thread-group id of current thread */
    pid_t tid  = syscall(SYS_gettid);   /* thread id of current thread */
    int main_thread = (tgid == tid);
    pid_t pid;

    fprintf(stderr, "%s:%u %s\tABORT_THREAD %i/%i%s\n", file, line, func, tgid, tid, main_thread ? " (main)" : "");

    /* if it is main thread - terminate whole process -
     * - it is logical and this way we get proper exit code */
    if (main_thread)
        abort();

    /* else try to produce coredump, terminate current thread, but do not kill the whole process
     *
     * ( on Linux, if a thread gets fatal signal and does not handle it, whole
     *   thread-group is terminated by kernel, after dumping core. OTOH, there is
     *   no other way to make the kernel dump core of us than to get a fatal
     *   signal without handling it.
     *
     *   What could work, is to first remove current thread from its
     *   thread-group, and then do usual abort(3) which is ~ raise(SIGABRT),
     *   but such leaving-thread-group functionality is non-existent as per linux-v4.1.
     *
     *   NOTE Once sys_ungroup(2) system call was mentioned as being handy long
     *   ago, but it is not implemented anywhere:
     *
     *   https://git.kernel.org/cgit/linux/kernel/git/history/history.git/commit/?id=63540cea
     *   https://lkml.org/lkml/2002/9/15/125 )
     *
     * ~~~~
     *
     * vfork is ~ clone(CLONE_VFORK | CLONE_VM) without CLONE_THREAD.
     *
     * - without CLONE_THREAD means the child will leave current thread-group
     * - CLONE_VM means it will share memory
     * - CLONE_VFORK means current thread will pause before clone finishes
     *
     * so it looks all we have to do to get a coredump and terminate only
     * current thread is vfork + abort in clone + pthread_exit in current.
     *
     * But it is not so - because on coredumping, Linux terminates all
     * processes who share mm with terminating process, not only processes from
     * thread group, and it was done on purpose:
     *
     *   https://git.kernel.org/cgit/linux/kernel/git/history/history.git/commit/?id=d89f3847
     *   ("properly wait for all threads that share the same MM ...")
     *
     *
     * So the only thing we are left to do, is to do usual fork() and coredump
     * from forked child.
     */
    pid = fork();
    if (pid == -1) {
        /* fork failed for some reason */
        BUGe();
    }
    else if (!pid) {
        /* child - abort it - this way we can get coredump.
         * NOTE it does not affect parent */
        abort();
    }
    else {
        /* forked ok - wait for child to abort and exit current thread */
        int status;
        pid_t waited;

        /* waitpid() fails with EINTR if an unrelated signal handler runs in
         * this thread while we wait. That is not an error - the child is
         * still there - so just wait again instead of treating it as a bug
         * and killing the whole process. */
        do {
            waited = waitpid(pid, &status, 0);
        } while (waited == -1 && errno == EINTR);

        if (waited == -1)
            BUGe();

        ASSERT(waited == pid);          /* waitpid can only return for child */
        ASSERT(WIFSIGNALED(status));    /* child must terminated via SIGABRT */

        /* now we know child terminated the way we wanted it to terminate
         * -> we can exit current thread */
        pthread_exit(NULL);
    }
}
#!/bin/sh -e
# tfault-run <tfault> <arg> <mustdie>
# tfault-run <tfault> <arg> <mustdie> <signal> [exitcode]
# run `<tfault> <arg>` and verify that it produces correct coredump, dying for
# SIGSEGV in function <mustdie>.
# <signal> with traceback <mustdie>.
#
# NOTE mustdie is generally func1,func2,func3,... - traceback tail is checked to be func1<-func2<-func3
# NOTE if exitcode not present - it is checked to be non-zero
# die <msg> ...
die() {
......@@ -12,6 +15,8 @@ die() {
tfault=$(realpath $1)
arg=$2
mustdie=$3
signal=$4
exitcode=$5
# XXX ok to hardcode t/ ?
workdir=t/tfault-run.$arg
......@@ -22,21 +27,38 @@ cd "$workdir"
ulimit -c unlimited

# run the test, keeping its combined output in run.out and its exit status in $st
#
# NOTE `cmd | tee run.out` + ${PIPESTATUS[0]} would be a bashism, while this
# script runs under /bin/sh - so redirect into the file and show it afterwards.
# `|| st=$?` also keeps `sh -e` from exiting on the (expected) test failure.
st=0
"$tfault" "$arg" >run.out 2>&1 || st=$?
cat run.out

# verify exit code: explicit one if given, else just non-zero
if test -z "$exitcode"; then
    test "$st" != 0 || die "test exit code is 0"
else
    test "$st" = "$exitcode" || die "test exit code unexpected ($st != $exitcode)"
fi

grep -q "^# going to fault" run.out || die "test didn't run to faulting point"
test -e core || die "no core after run"

# verify terminating signal
gdb -q -batch "$tfault" core >core.info || die "can't gdb-info(core)"
grep -q "Program terminated with signal $signal" core.info || die "not $signal"

# get backtrace from the core
gdb -batch-silent \
    -ex "set logging file /dev/stdout" \
    -ex "set logging on" \
    -ex bt "$tfault" core >core.bt || die "can't gdb-bt(core)"

# verify mustdie
mustdie_depth=`echo $mustdie | sed 's/,/ /g' | wc -w`
diebt=
for nframe in `seq 0 $(($mustdie_depth - 1))`; do
    # backtrace lines look like:
    #
    #   #0  0x00000000004031ae in doublefault_loadblk (file=0x7fff0f25d9c0, blk=0, buf=0x7ff85a553000) at t/tfault.c:93
    #   #0  some_func (arg=1) at file.c:50          <- gdb may omit "0x... in"
    #   #5  <signal handler called>                 -> reported as sighandler
    #
    # NOTE the space after frame number is needed so that e.g. #1 does not also match #10, #11, ...
    framefunc=$(grep "^#$nframe " core.bt | awk '{
        if ($2 == "<signal")    print "sighandler"
        else if ($3 == "in")    print $4
        else                    print $2
    }')
    diebt=$diebt${diebt:+,}$framefunc
done

test "$diebt" = "$mustdie" || die "must die via $mustdie, died via $diebt"

# run ok - cleanup
cd "$cwd"
rm -rf "$workdir"
echo "ok - crashed OK (via $diebt)"
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment