From 2d89a14b905dc928affcd78fc91f20860af5c80f Mon Sep 17 00:00:00 2001 From: Nicolas Wavrant <nicolas.wavrant@nexedi.com> Date: Mon, 7 Nov 2016 15:01:44 +0100 Subject: [PATCH] runner: improves feedback on importer script failure. If slapgrid fails, print the end of instance/software log in resilient log. Now the exit-code-file is always written, even if runner-importer fails. The promise on this file gives the URL where the resilient log can be accessed, to simplify the debugging --- software/slaprunner/common.cfg | 4 +-- .../slaprunner/instance-runner-import.cfg.in | 20 +++++++++------ .../template/runner-import.sh.jinja2 | 25 +++++++++++++------ 3 files changed, 31 insertions(+), 18 deletions(-) diff --git a/software/slaprunner/common.cfg b/software/slaprunner/common.cfg index a54e760c8..5288e631f 100644 --- a/software/slaprunner/common.cfg +++ b/software/slaprunner/common.cfg @@ -68,7 +68,7 @@ recipe = hexagonit.recipe.download ignore-existing = true url = ${:_profile_base_location_}/template/runner-import.sh.jinja2 download-only = true -md5sum = 3cebc5d793ff1b5c786392723babc510 +md5sum = 52ae874aad06acd2a9cc0eb2b2bd29b1 filename = runner-import.sh.jinja2 mode = 0644 @@ -76,7 +76,7 @@ mode = 0644 recipe = slapos.recipe.template url = ${:_profile_base_location_}/instance-runner-import.cfg.in output = ${buildout:directory}/instance-runner-import.cfg -md5sum = 91c34a55b7a45b14b0fac8b7faa202fe +md5sum = 0950285e9a579a47e501d98d7e79e0fa mode = 0644 [template-runner-export-script] diff --git a/software/slaprunner/instance-runner-import.cfg.in b/software/slaprunner/instance-runner-import.cfg.in index 4f99a42a8..d5118e795 100644 --- a/software/slaprunner/instance-runner-import.cfg.in +++ b/software/slaprunner/instance-runner-import.cfg.in @@ -79,34 +79,38 @@ rendered = $${directory:bin}/$${slap-parameter:namebase}-importer # backward compatibility for resilient stack wrapper = $${:rendered} mode = 700 -restore-exit-code-file=$${directory:srv}/importer-exit-code-file 
+restore-exit-code-file = $${directory:srv}/$${:restore-exit-code-file-basename} +restore-exit-code-file-basename = importer-exit-code-file +resilient-log-basename = resilient.log context = key backend_url slaprunner:access-url key ipv4 slaprunner:ipv4 key ipv6 slaprunner:ipv6 key proxy_port slaprunner:proxy_port section directory directory - raw output_log_file $${directory:log}/resilient.log + section supervisord supervisord + raw output_log_file $${directory:log}/$${:resilient-log-basename} raw shell_binary ${bash:location}/bin/bash raw rsync_binary ${rsync:location}/bin/rsync raw restore_exit_code_file $${:restore-exit-code-file} [importer-consistency-promise] # Test that the importer script and "after-import" subscripts -# are not older than 1 day (24h), and have succeeded +# are not older than 2 days (1 day + some slack), and have succeeded recipe = collective.recipe.template input = inline: #!/bin/sh - EXIT_CODE_FILE=$(find "$${importer:restore-exit-code-file}") - RECENT_EXIT_CODE_FILE=$(find "$${importer:restore-exit-code-file}" -mtime -1) - if [ -z "$EXIT_CODE_FILE" ]; then + EXIT_CODE_FILE="$${importer:restore-exit-code-file}" + RECENT_EXIT_CODE_FILE=$(find $${directory:srv} -maxdepth 1 -name "$${importer:restore-exit-code-file-basename}" -mtime -2) + RESILIENT_LOG_URL=$${publish:monitor-base-url}/log/$${importer:resilient-log-basename} + if [ ! -f "$EXIT_CODE_FILE" ]; then exit 0; else if [ -z "$RECENT_EXIT_CODE_FILE" ]; then echo "Consistency check is too old."; exit 1; else - EXIT_CODE=$(cat $EXIT_CODE_FILE) - exit $EXIT_CODE + echo "Error during import. 
Please check here : $RESILIENT_LOG_URL"; + exit $(cat $EXIT_CODE_FILE); fi fi exit 1; # Something else went wrong diff --git a/software/slaprunner/template/runner-import.sh.jinja2 b/software/slaprunner/template/runner-import.sh.jinja2 index ca89d5aee..1dc90cf85 100644 --- a/software/slaprunner/template/runner-import.sh.jinja2 +++ b/software/slaprunner/template/runner-import.sh.jinja2 @@ -8,7 +8,13 @@ umask 077 exec > >(tee -ai {{ output_log_file }}) exec 2>&1 -echo -e "\n\nrunner-import run at : $(date)" +RESTORE_EXIT_CODE_FILE="{{ restore_exit_code_file }}" + +fail_with_exit_code () { + echo 1 > $RESTORE_EXIT_CODE_FILE +} + +trap fail_with_exit_code ERR srv_directory={{ directory['srv'] }} restore_element () { @@ -27,6 +33,8 @@ restore_element () { done } +echo -e "\n\nrunner-import run at : $(date)" + restore_element {{ directory['backup'] }}/runner/ $srv_directory/runner instance project proxy.db restore_element {{ directory['backup'] }}/etc/ {{ directory['etc'] }} config.json cp -r {{ directory['backup'] }}/etc/.??* {{ directory['etc'] }}; @@ -70,16 +78,18 @@ $SQLITE3 $DATABASE "update partition_network11 set address='$IPV6' where netmask MASTERURL="http://{{ ipv4 }}:{{ proxy_port }}" echo "Building newest software..." 
-$SLAPOS node software --cfg $HOME/etc/slapos.cfg --all --master-url=$MASTERURL --logfile $HOME/srv/runner/software.log --pidfile $HOME/var/run/slapos-node-software.pid >/dev/null 2>&1 || -$SLAPOS node software --cfg $HOME/etc/slapos.cfg --all --master-url=$MASTERURL --logfile $HOME/srv/runner/software.log --pidfile $HOME/var/run/slapos-node-software.pid >/dev/null 2>&1 || -$SLAPOS node software --cfg $HOME/etc/slapos.cfg --all --master-url=$MASTERURL --logfile $HOME/srv/runner/software.log --pidfile $HOME/var/run/slapos-node-software.pid >/dev/null 2>&1 +$SLAPOS node software --cfg {{ supervisord['slapos-cfg'] }} --all --master-url=$MASTERURL --logfile {{ supervisord['slapgrid-sr-log'] }} --pidfile {{ supervisord['slapgrid-sr-pid'] }} >/dev/null 2>&1 || +$SLAPOS node software --cfg {{ supervisord['slapos-cfg'] }} --all --master-url=$MASTERURL --logfile {{ supervisord['slapgrid-sr-log'] }} --pidfile {{ supervisord['slapgrid-sr-pid'] }} >/dev/null 2>&1 || +$SLAPOS node software --cfg {{ supervisord['slapos-cfg'] }} --all --master-url=$MASTERURL --logfile {{ supervisord['slapgrid-sr-log'] }} --pidfile {{ supervisord['slapgrid-sr-pid'] }} >/dev/null 2>&1 || +(tail -n 200 {{ supervisord['slapgrid-sr-log'] }} && false) # Remove defined scripts to force buildout to recreate them to have updated paths rm $srv_directory/runner/instance/slappart*/srv/runner-import-restore || true echo "Running slapos node instance..." 
# XXX hardcoded -$SLAPOS node instance --cfg $HOME/etc/slapos.cfg --master-url=$MASTERURL --logfile $HOME/srv/runner/instance.log --pidfile $HOME/var/run/slapos-node-instance.pid >/dev/null 2>&1 || -$SLAPOS node instance --cfg $HOME/etc/slapos.cfg --master-url=$MASTERURL --logfile $HOME/srv/runner/instance.log --pidfile $HOME/var/run/slapos-node-instance.pid >/dev/null 2>&1 || -$SLAPOS node instance --cfg $HOME/etc/slapos.cfg --master-url=$MASTERURL --logfile $HOME/srv/runner/instance.log --pidfile $HOME/var/run/slapos-node-instance.pid >/dev/null 2>&1 +$SLAPOS node instance --cfg {{ supervisord['slapos-cfg'] }} --master-url=$MASTERURL --logfile {{ supervisord['slapgrid-cp-log'] }} --pidfile {{ supervisord['slapgrid-cp-pid'] }} >/dev/null 2>&1 || +$SLAPOS node instance --cfg {{ supervisord['slapos-cfg'] }} --master-url=$MASTERURL --logfile {{ supervisord['slapgrid-cp-log'] }} --pidfile {{ supervisord['slapgrid-cp-pid'] }} >/dev/null 2>&1 || +$SLAPOS node instance --cfg {{ supervisord['slapos-cfg'] }} --master-url=$MASTERURL --logfile {{ supervisord['slapgrid-cp-log'] }} --pidfile {{ supervisord['slapgrid-cp-pid'] }} >/dev/null 2>&1 || +(tail -n 200 {{ supervisord['slapgrid-cp-log'] }} && false) # Invoke defined scripts for each partition inside of slaprunner echo "Invoke custom import scripts defined by each instances..." @@ -98,6 +108,5 @@ $SQLITE3 $DATABASE "update partition11 set requested_state='started';" # Write exit code to an arbitrary file that will be checked by promise/monitor echo "Write status file... End" -RESTORE_EXIT_CODE_FILE="{{ restore_exit_code_file }}" echo $RESTORE_EXIT_CODE > $RESTORE_EXIT_CODE_FILE exit $RESTORE_EXIT_CODE -- 2.30.9