Commit 18488048 authored by Aleksey Midenkov's avatar Aleksey Midenkov

MDEV-29023 MTR hangs after multiple failures

Passing $opt_parallel as $childs is wrong: child can be killed before
it connects and you will never decrement $childs for this.

Another problem is (and that is the cause of this bug): child can be
killed and never close server socket. This can happen f.ex. after
unmaskable KILL signal. In such case the socket is closed by reaping
the child but that never happens inside reading the socket loop in
run_test_server().

The proper design is the waitless reap of children inside the socket
loop and if there is no more children we finish the socket loop. Since
there is Windows variation where we don't control the children via
waitpid(), all the clients must normally close the socket and only
this can finish the socket loop. For Unix variation we reckon that
case as all children closed the socket but not all yet died and for
that we do final waiting waitpid() (was done before the patch as
well).

To be more complete, we now handle 3 end-of-game scenarios in Unix:

   1. all children closed socket, all children died: everything is
      handled by the socket loop;

   2. all children closed socket, not all yet died: we wait for alive
      children to die after exiting the socket loop;

   3. not all children closed socket, all children died: everything is
      handled by the socket loop.

For Windows end-of-game scenario is only one:

   All children close the socket.
parent 7ca5c7d8
......@@ -78,6 +78,7 @@ use lib "lib";
use Cwd ;
use Cwd 'realpath';
use POSIX ":sys_wait_h";
use Getopt::Long qw(:config bundling);
use My::File::Path; # Patched version of File::Path
use File::Basename;
......@@ -649,7 +650,7 @@ sub main {
mark_time_used('init');
my ($prefix, $fail, $completed, $extra_warnings)=
run_test_server($server, $tests, $opt_parallel);
run_test_server($server, $tests, \%children);
exit(0) if $opt_start_exit;
......@@ -664,10 +665,12 @@ sub main {
if ($ret_pid == -1) {
# Child was automatically reaped. Probably not possible
# unless you $SIG{CHLD}= 'IGNORE'
mtr_report("Child ${pid} was automatically reaped (this should never happend)");
mtr_warning("Child ${pid} was automatically reaped (this should never happen)");
} elsif ($ret_pid != $pid) {
confess("Unexpected PID ${ret_pid} instead of expected ${pid}");
}
my $exit_status= ($? >> 8);
mtr_verbose2("Child ${pid} exited with status ${exit_status}");
delete $children{$ret_pid};
}
}
......@@ -727,7 +730,7 @@ sub main {
sub run_test_server ($$$) {
my ($server, $tests, $childs) = @_;
my ($server, $tests, $children) = @_;
my $num_saved_datadir= 0; # Number of datadirs saved in vardir/log/ so far.
my $num_failed_test= 0; # Number of tests failed so far
......@@ -742,6 +745,7 @@ sub run_test_server ($$$) {
my $suite_timeout= start_timer(suite_timeout());
my $s= IO::Select->new();
my $childs= 0;
$s->add($server);
while (1) {
if ($opt_stop_file)
......@@ -755,12 +759,14 @@ sub run_test_server ($$$) {
mark_time_used('admin');
my @ready = $s->can_read(1); # Wake up once every second
mtr_debug("Got ". (0 + @ready). " connection(s)");
mark_time_idle();
foreach my $sock (@ready) {
if ($sock == $server) {
# New client connected
++$childs;
my $child= $sock->accept();
mtr_verbose2("Client connected");
mtr_verbose2("Client connected (got ${childs} childs)");
$s->add($child);
print $child "HELLO\n";
}
......@@ -768,12 +774,10 @@ sub run_test_server ($$$) {
my $line= <$sock>;
if (!defined $line) {
# Client disconnected
mtr_verbose2("Child closed socket");
--$childs;
mtr_verbose2("Child closed socket (left ${childs} childs)");
$s->remove($sock);
$sock->close;
if (--$childs == 0){
return ("Completed", $test_failure, $completed, $extra_warnings);
}
next;
}
chomp($line);
......@@ -1016,6 +1020,33 @@ sub run_test_server ($$$) {
}
}
if (!IS_WINDOWS) {
foreach my $pid (keys %$children)
{
my $res= waitpid($pid, WNOHANG);
if ($res == $pid || $res == -1) {
if ($res == -1) {
# Child was automatically reaped. Probably not possible
# unless you $SIG{CHLD}= 'IGNORE'
mtr_warning("Child ${pid} was automatically reaped (this should never happen)");
}
my $exit_status= ($? >> 8);
mtr_verbose2("Child ${pid} exited with status ${exit_status}");
delete $children->{$pid};
if (!%$children && $childs) {
mtr_verbose2("${childs} children didn't close socket before dying!");
$childs= 0;
}
} elsif ($res != 0) {
confess("Unexpected result ${res} on waitpid(${pid}, WNOHANG)");
}
}
}
if ($childs == 0){
return ("Completed", $test_failure, $completed, $extra_warnings);
}
# ----------------------------------------------------
# Check if test suite timer expired
# ----------------------------------------------------
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment