From 2d89a14b905dc928affcd78fc91f20860af5c80f Mon Sep 17 00:00:00 2001
From: Nicolas Wavrant <nicolas.wavrant@nexedi.com>
Date: Mon, 7 Nov 2016 15:01:44 +0100
Subject: [PATCH] runner: improves feedback on importer script failure.

If slapgrid fails, print the end of the instance/software log in the resilient log.
Now the exit-code-file is always written, even if runner-importer fails.
The promise on this file gives the URL where the resilient log can
be accessed, to simplify debugging.
---
 software/slaprunner/common.cfg                |  4 +--
 .../slaprunner/instance-runner-import.cfg.in  | 20 +++++++++------
 .../template/runner-import.sh.jinja2          | 25 +++++++++++++------
 3 files changed, 31 insertions(+), 18 deletions(-)

diff --git a/software/slaprunner/common.cfg b/software/slaprunner/common.cfg
index a54e760c8..5288e631f 100644
--- a/software/slaprunner/common.cfg
+++ b/software/slaprunner/common.cfg
@@ -68,7 +68,7 @@ recipe = hexagonit.recipe.download
 ignore-existing = true
 url = ${:_profile_base_location_}/template/runner-import.sh.jinja2
 download-only = true
-md5sum = 3cebc5d793ff1b5c786392723babc510
+md5sum = 52ae874aad06acd2a9cc0eb2b2bd29b1
 filename = runner-import.sh.jinja2
 mode = 0644
 
@@ -76,7 +76,7 @@ mode = 0644
 recipe = slapos.recipe.template
 url = ${:_profile_base_location_}/instance-runner-import.cfg.in
 output = ${buildout:directory}/instance-runner-import.cfg
-md5sum = 91c34a55b7a45b14b0fac8b7faa202fe
+md5sum = 0950285e9a579a47e501d98d7e79e0fa
 mode = 0644
 
 [template-runner-export-script]
diff --git a/software/slaprunner/instance-runner-import.cfg.in b/software/slaprunner/instance-runner-import.cfg.in
index 4f99a42a8..d5118e795 100644
--- a/software/slaprunner/instance-runner-import.cfg.in
+++ b/software/slaprunner/instance-runner-import.cfg.in
@@ -79,34 +79,38 @@ rendered = $${directory:bin}/$${slap-parameter:namebase}-importer
 # backward compatibility for resilient stack
 wrapper = $${:rendered}
 mode = 700
-restore-exit-code-file=$${directory:srv}/importer-exit-code-file
+restore-exit-code-file = $${directory:srv}/$${:restore-exit-code-file-basename}
+restore-exit-code-file-basename = importer-exit-code-file
+resilient-log-basename = resilient.log
 context =
   key backend_url slaprunner:access-url
   key ipv4 slaprunner:ipv4
   key ipv6 slaprunner:ipv6
   key proxy_port slaprunner:proxy_port
   section directory directory
-  raw  output_log_file $${directory:log}/resilient.log
+  section supervisord supervisord
+  raw  output_log_file $${directory:log}/$${:resilient-log-basename}
   raw  shell_binary ${bash:location}/bin/bash
   raw  rsync_binary ${rsync:location}/bin/rsync
   raw  restore_exit_code_file $${:restore-exit-code-file}
 
 [importer-consistency-promise]
 # Test that the importer script and "after-import" subscripts
-# are not older than 1 day (24h), and have succeeded
+# are not older than 2 days (1 day + some slack), and have succeeded
 recipe = collective.recipe.template
 input = inline: #!/bin/sh
-  EXIT_CODE_FILE=$(find "$${importer:restore-exit-code-file}")
-  RECENT_EXIT_CODE_FILE=$(find "$${importer:restore-exit-code-file}" -mtime -1)
-  if [ -z "$EXIT_CODE_FILE" ]; then
+  EXIT_CODE_FILE="$${importer:restore-exit-code-file}"
+  RECENT_EXIT_CODE_FILE=$(find $${directory:srv} -maxdepth 1 -name "$${importer:restore-exit-code-file-basename}" -mtime -2)
+  RESILIENT_LOG_URL=$${publish:monitor-base-url}/log/$${importer:resilient-log-basename}
+  if [ ! -f "$EXIT_CODE_FILE" ]; then
     exit 0;
   else
     if [ -z "$RECENT_EXIT_CODE_FILE" ]; then
        echo "Consistency check is too old.";
        exit 1;
     else
-      EXIT_CODE=$(cat $EXIT_CODE_FILE)
-      exit $EXIT_CODE
+      echo "Error during import. Please check here : $RESILIENT_LOG_URL";
+      exit $(cat $EXIT_CODE_FILE);
     fi
   fi
   exit 1; # Something else went wrong
diff --git a/software/slaprunner/template/runner-import.sh.jinja2 b/software/slaprunner/template/runner-import.sh.jinja2
index ca89d5aee..1dc90cf85 100644
--- a/software/slaprunner/template/runner-import.sh.jinja2
+++ b/software/slaprunner/template/runner-import.sh.jinja2
@@ -8,7 +8,13 @@ umask 077
 exec > >(tee -ai {{ output_log_file }})
 exec 2>&1
 
-echo -e "\n\nrunner-import run at : $(date)"
+RESTORE_EXIT_CODE_FILE="{{ restore_exit_code_file }}"
+
+fail_with_exit_code () {
+  echo 1 > $RESTORE_EXIT_CODE_FILE
+}
+
+trap fail_with_exit_code ERR
 
 srv_directory={{ directory['srv'] }}
 restore_element () {
@@ -27,6 +33,8 @@ restore_element () {
   done
 }
 
+echo -e "\n\nrunner-import run at : $(date)"
+
 restore_element {{ directory['backup'] }}/runner/ $srv_directory/runner  instance project  proxy.db
 restore_element  {{ directory['backup'] }}/etc/ {{ directory['etc'] }} config.json
 cp -r {{ directory['backup'] }}/etc/.??* {{ directory['etc'] }};
@@ -70,16 +78,18 @@ $SQLITE3 $DATABASE "update partition_network11 set address='$IPV6' where netmask
 MASTERURL="http://{{ ipv4 }}:{{ proxy_port }}"
 
 echo "Building newest software..."
-$SLAPOS node software --cfg $HOME/etc/slapos.cfg --all --master-url=$MASTERURL --logfile $HOME/srv/runner/software.log --pidfile $HOME/var/run/slapos-node-software.pid >/dev/null 2>&1 ||
-$SLAPOS node software --cfg $HOME/etc/slapos.cfg --all --master-url=$MASTERURL --logfile $HOME/srv/runner/software.log --pidfile $HOME/var/run/slapos-node-software.pid >/dev/null 2>&1 ||
-$SLAPOS node software --cfg $HOME/etc/slapos.cfg --all --master-url=$MASTERURL --logfile $HOME/srv/runner/software.log --pidfile $HOME/var/run/slapos-node-software.pid >/dev/null 2>&1
+$SLAPOS node software --cfg {{ supervisord['slapos-cfg'] }} --all --master-url=$MASTERURL --logfile {{ supervisord['slapgrid-sr-log'] }} --pidfile {{ supervisord['slapgrid-sr-pid'] }} >/dev/null 2>&1 ||
+$SLAPOS node software --cfg {{ supervisord['slapos-cfg'] }} --all --master-url=$MASTERURL --logfile {{ supervisord['slapgrid-sr-log'] }} --pidfile {{ supervisord['slapgrid-sr-pid'] }} >/dev/null 2>&1 ||
+$SLAPOS node software --cfg {{ supervisord['slapos-cfg'] }} --all --master-url=$MASTERURL --logfile {{ supervisord['slapgrid-sr-log'] }} --pidfile {{ supervisord['slapgrid-sr-pid'] }} >/dev/null 2>&1 ||
+(tail -n 200 {{ supervisord['slapgrid-sr-log'] }} && false)
 # Remove defined scripts to force buildout to recreate them to have updated paths
 rm $srv_directory/runner/instance/slappart*/srv/runner-import-restore || true
 echo "Running slapos node instance..."
 # XXX hardcoded
-$SLAPOS node instance --cfg $HOME/etc/slapos.cfg --master-url=$MASTERURL --logfile $HOME/srv/runner/instance.log --pidfile $HOME/var/run/slapos-node-instance.pid >/dev/null 2>&1 ||
-$SLAPOS node instance --cfg $HOME/etc/slapos.cfg --master-url=$MASTERURL --logfile $HOME/srv/runner/instance.log --pidfile $HOME/var/run/slapos-node-instance.pid >/dev/null 2>&1 ||
-$SLAPOS node instance --cfg $HOME/etc/slapos.cfg --master-url=$MASTERURL --logfile $HOME/srv/runner/instance.log --pidfile $HOME/var/run/slapos-node-instance.pid >/dev/null 2>&1
+$SLAPOS node instance --cfg {{ supervisord['slapos-cfg'] }} --master-url=$MASTERURL --logfile {{ supervisord['slapgrid-cp-log'] }} --pidfile {{ supervisord['slapgrid-cp-pid'] }} >/dev/null 2>&1 ||
+$SLAPOS node instance --cfg {{ supervisord['slapos-cfg'] }} --master-url=$MASTERURL --logfile {{ supervisord['slapgrid-cp-log'] }} --pidfile {{ supervisord['slapgrid-cp-pid'] }} >/dev/null 2>&1 ||
+$SLAPOS node instance --cfg {{ supervisord['slapos-cfg'] }} --master-url=$MASTERURL --logfile {{ supervisord['slapgrid-cp-log'] }} --pidfile {{ supervisord['slapgrid-cp-pid'] }} >/dev/null 2>&1 ||
+(tail -n 200 {{ supervisord['slapgrid-cp-log'] }} && false)
 
 # Invoke defined scripts for each partition inside of slaprunner
 echo "Invoke custom import scripts defined by each instances..."
@@ -98,6 +108,5 @@ $SQLITE3 $DATABASE "update partition11 set requested_state='started';"
 
 # Write exit code to an arbitrary file that will be checked by promise/monitor
 echo "Write status file... End"
-RESTORE_EXIT_CODE_FILE="{{ restore_exit_code_file }}"
 echo $RESTORE_EXIT_CODE > $RESTORE_EXIT_CODE_FILE
 exit $RESTORE_EXIT_CODE
-- 
2.30.9