Commit 10092ba1 authored by unknown's avatar unknown

Improved handling of marking processes as dead

Run ndb_mgmd with --nodaemon (in the foreground, not as a daemon)
Make extra attempt to check if processes are still alive


mysql-test/lib/mtr_process.pl:
  Add common function to mark processes as dead
  When all attempts to kill processes have failed, make an extra attempt with ping to check whether they really are still alive
mysql-test/mysql-test-run.pl:
  Run ndb_mgmd with --nodaemon
parent 906a8ad5
...@@ -272,40 +272,17 @@ sub spawn_parent_impl { ...@@ -272,40 +272,17 @@ sub spawn_parent_impl {
last; last;
} }
# If one of the mysqld processes died, we want to # If one of the processes died, we want to
# mark this, and kill the mysqltest process. # mark this, and kill the mysqltest process.
foreach my $idx (0..1) mark_process_dead($ret_pid);
{
if ( $::master->[$idx]->{'pid'} eq $ret_pid )
{
mtr_debug("child $ret_pid was master[$idx], " .
"exit during mysqltest run");
$::master->[$idx]->{'pid'}= 0;
last;
}
}
foreach my $idx (0..2)
{
if ( $::slave->[$idx]->{'pid'} eq $ret_pid )
{
mtr_debug("child $ret_pid was slave[$idx], " .
"exit during mysqltest run");
$::slave->[$idx]->{'pid'}= 0;
last;
}
}
mtr_debug("waitpid() caught exit of unknown child $ret_pid, " .
"exit during mysqltest run");
} }
if ( $ret_pid != $pid ) if ( $ret_pid != $pid )
{ {
# We terminated the waiting because a "mysqld" process died. # We terminated the waiting because a "mysqld" process died.
# Kill the mysqltest process. # Kill the mysqltest process.
mtr_verbose("Kill mysqltest because another process died");
kill(9,$pid); kill(9,$pid);
$ret_pid= waitpid($pid,0); $ret_pid= waitpid($pid,0);
...@@ -639,15 +616,21 @@ sub mtr_check_stop_servers ($) { ...@@ -639,15 +616,21 @@ sub mtr_check_stop_servers ($) {
mtr_warning("couldn't delete $file"); mtr_warning("couldn't delete $file");
} }
} }
$srv->{'pid'}= 0;
} }
} }
} }
if ( $errors ) if ( $errors )
{ {
# We are in trouble, just die.... # There were errors killing processes
# do one last attempt to ping the servers
# and if they can't be pinged, assume they are dead
if ( ! mtr_ping_with_timeout( \@$spec ) )
{
mtr_error("we could not kill or clean up all processes"); mtr_error("we could not kill or clean up all processes");
} }
} }
}
# FIXME We just assume they are all dead, for Cygwin we are not # FIXME We just assume they are all dead, for Cygwin we are not
# really sure # really sure
...@@ -773,39 +756,23 @@ sub mtr_ping_with_timeout($) { ...@@ -773,39 +756,23 @@ sub mtr_ping_with_timeout($) {
return $res; return $res;
} }
##############################################################################
# #
# The operating system will keep information about dead children, # Loop through our list of processes and look for an entry
# we read this information here, and if we have records the process # with the provided pid
# is alive, we mark it as dead. # Set the pid of that process to 0 if found
# #
############################################################################## sub mark_process_dead($)
{
sub mtr_record_dead_children () { my $ret_pid= shift;
my $ret_pid;
# Wait without blocking to see if any processes have died
# -1 or 0 means there are no more processes to wait for
while ( ($ret_pid= waitpid(-1,&WNOHANG)) != 0 and $ret_pid != -1)
{
mtr_warning("waitpid() caught exit of child $ret_pid");
foreach my $idx (0..1)
{
if ( $::master->[$idx]->{'pid'} eq $ret_pid )
{
mtr_warning("child $ret_pid was master[$idx]");
$::master->[$idx]->{'pid'}= 0;
}
}
foreach my $idx (0..2) foreach my $mysqld (@{$::master}, @{$::slave})
{ {
if ( $::slave->[$idx]->{'pid'} eq $ret_pid ) if ( $mysqld->{'pid'} eq $ret_pid )
{ {
mtr_warning("child $ret_pid was slave[$idx]"); mtr_verbose("$mysqld->{'type'} $mysqld->{'idx'} exited, pid: $ret_pid");
$::slave->[$idx]->{'pid'}= 0; $mysqld->{'pid'}= 0;
last; return;
} }
} }
...@@ -813,21 +780,43 @@ sub mtr_record_dead_children () { ...@@ -813,21 +780,43 @@ sub mtr_record_dead_children () {
{ {
if ( $cluster->{'pid'} eq $ret_pid ) if ( $cluster->{'pid'} eq $ret_pid )
{ {
mtr_warning("child $ret_pid was $cluster->{'name'} cluster ndb_mgmd"); mtr_verbose("$cluster->{'name'} cluster ndb_mgmd exited, pid: $ret_pid");
$cluster->{'pid'}= 0; $cluster->{'pid'}= 0;
last; return;
} }
foreach my $ndbd (@{$cluster->{'ndbds'}}) foreach my $ndbd (@{$cluster->{'ndbds'}})
{ {
if ( $ndbd->{'pid'} eq $ret_pid ) if ( $ndbd->{'pid'} eq $ret_pid )
{ {
mtr_warning("child $ret_pid was $cluster->{'name'} cluster ndbd"); mtr_verbose("$cluster->{'name'} cluster ndbd exited, pid: $ret_pid");
$ndbd->{'pid'}= 0; $ndbd->{'pid'}= 0;
last; return;
} }
} }
} }
mtr_warning("mark_process_dead couldn't find an entry for pid: $ret_pid");
}
##############################################################################
#
# The operating system will keep information about dead children,
# we read this information here, and if we have records the process
# is alive, we mark it as dead.
#
##############################################################################
sub mtr_record_dead_children () {
my $ret_pid;
# Wait without blocking to see if any processes have died
# -1 or 0 means there are no more processes to wait for
while ( ($ret_pid= waitpid(-1,&WNOHANG)) != 0 and $ret_pid != -1)
{
mtr_warning("mtr_record_dead_children: $ret_pid");
mark_process_dead($ret_pid);
} }
} }
...@@ -843,7 +832,8 @@ sub start_reap_all { ...@@ -843,7 +832,8 @@ sub start_reap_all {
my $pid; my $pid;
while(($pid= waitpid(-1, &WNOHANG)) != 0 and $pid != -1) while(($pid= waitpid(-1, &WNOHANG)) != 0 and $pid != -1)
{ {
print "start_reap_all: pid: $pid.\n"; mtr_warning("start_reap_all pid: $pid");
mark_process_dead($pid);
}; };
} }
...@@ -903,6 +893,7 @@ sub sleep_until_file_created ($$$) { ...@@ -903,6 +893,7 @@ sub sleep_until_file_created ($$$) {
# Check if it died after the fork() was successful # Check if it died after the fork() was successful
if ( $pid != 0 && waitpid($pid,&WNOHANG) == $pid ) if ( $pid != 0 && waitpid($pid,&WNOHANG) == $pid )
{ {
mtr_warning("Process $pid died");
return 0; return 0;
} }
......
...@@ -1687,6 +1687,7 @@ sub ndbcluster_wait_started($){ ...@@ -1687,6 +1687,7 @@ sub ndbcluster_wait_started($){
} }
sub mysqld_wait_started($){ sub mysqld_wait_started($){
my $mysqld= shift; my $mysqld= shift;
...@@ -1706,6 +1707,7 @@ sub ndb_mgmd_start ($) { ...@@ -1706,6 +1707,7 @@ sub ndb_mgmd_start ($) {
mtr_init_args(\$args); mtr_init_args(\$args);
mtr_add_arg($args, "--no-defaults"); mtr_add_arg($args, "--no-defaults");
mtr_add_arg($args, "--core"); mtr_add_arg($args, "--core");
mtr_add_arg($args, "--nodaemon");
mtr_add_arg($args, "--config-file=%s", "$cluster->{'data_dir'}/config.ini"); mtr_add_arg($args, "--config-file=%s", "$cluster->{'data_dir'}/config.ini");
...@@ -1716,9 +1718,23 @@ sub ndb_mgmd_start ($) { ...@@ -1716,9 +1718,23 @@ sub ndb_mgmd_start ($) {
"", "",
{ append_log_file => 1 }); { append_log_file => 1 });
# FIXME Should not be needed
# Unfortunately the cluster nodes will fail to start
# if ndb_mgmd has not started properly
sleep(1);
# if (!sleep_until_file_created($cluster->{'path_pid'},
# 30, # Seconds
# $pid))
# {
# mtr_warning("Failed to start ndb_mgd for $cluster->{'name'} cluster");
# return 1;
# }
# Remember pid of ndb_mgmd # Remember pid of ndb_mgmd
$cluster->{'pid'}= $pid; $cluster->{'pid'}= $pid;
mtr_verbose("ndb_mgmd_start, pid: $pid");
return $pid; return $pid;
} }
...@@ -1774,19 +1790,6 @@ sub ndbcluster_start ($$) { ...@@ -1774,19 +1790,6 @@ sub ndbcluster_start ($$) {
my $pid= ndb_mgmd_start($cluster); my $pid= ndb_mgmd_start($cluster);
# FIXME Should not be needed
# Unfortunately cluster will fail
# if ndb_mgmd has not started properly
# Wait for the ndb_mgmd pid file to be created
if (!sleep_until_file_created($cluster->{'path_pid'},
60,
$pid))
{
mtr_warning("Failed to start ndb_mgmd for $cluster->{'name'} cluster");
return 1;
}
for ( my $idx= 0; $idx < $cluster->{'nodes'}; $idx++ ) for ( my $idx= 0; $idx < $cluster->{'nodes'}; $idx++ )
{ {
ndbd_start($cluster, $idx, $extra_args); ndbd_start($cluster, $idx, $extra_args);
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment