Commit 2d0d3884 authored by Łukasz Nowak

software/kvm: Switch to incremental backups

parent b5bfd2cd
......@@ -15,7 +15,7 @@
[template]
filename = instance.cfg.in
md5sum = 7d269cc9da65403476fe95b9975a565e
md5sum = 4e01b5b9541d8a0e0467895121d50b55
[template-kvm]
filename = instance-kvm.cfg.jinja2
......@@ -31,19 +31,19 @@ md5sum = 839fc16c112d3b87e2dbd2e382e326de
[template-kvm-import]
filename = instance-kvm-import.cfg.jinja2.in
md5sum = a463a5e3cd2287d275d6943c2a11b7e4
md5sum = 58647cb7c170086cbbebcf957c6701b7
[template-kvm-import-script]
filename = template/kvm-import.sh.jinja2
md5sum = 013725987114c82ca3fd11097d0a7f9f
md5sum = 64f063d9c51e80c0d29d35a235927696
[template-kvm-export]
filename = instance-kvm-export.cfg.jinja2
md5sum = 34d1b7cc8ca62bfdfce759a1dfbbaccd
md5sum = a02f0694dcb944c18d99f7f79afa2384
[template-kvm-export-script]
filename = template/kvm-export.sh.jinja2
md5sum = 64aa1ce8785f6b94aabd787fa3443082
md5sum = b69ddd6baa938cfd3bf0c1ba9e7a1cd2
[template-nbd]
filename = instance-nbd.cfg.jinja2
......@@ -96,3 +96,7 @@ md5sum = b4f6ffef08685bace1b9c01a3bd2620d
[whitelist-domains-default]
filename = template/whitelist-domains-default
md5sum = e9d40162ba77472775256637a2617d14
[check-backup-directory.sh]
filename = template/check-backup-directory.sh
md5sum = e569494a941e1d585c2e0bbf070cf1c9
......@@ -6,6 +6,7 @@ extends =
parts +=
cron-entry-backup
${instance-kvm-parts:parts}
check-backup-directory-promise
[slap-parameter]
{% for k, v in slapparameter_dict.items() -%}
......@@ -16,13 +17,10 @@ parts +=
{%- endif %}
{% endfor -%}
# Create the exporter executable, which is a simple shell script
[exporter]
recipe = slapos.recipe.template:jinja2
url = {{ template_kvm_export }}
output = ${directory:bin}/${slap-parameter:namebase}-exporter
# Resilient stack wants a "wrapper" parameter
wrapper = ${:output}
[directory]
tmp = ${buildout:directory}/tmp
[disk]
{%- set disk_type = slapparameter_dict.get('disk-type', 'virtio') %}
{%- if disk_type == "virtio" %}
device = virtio0
......@@ -32,12 +30,19 @@ device = ide0-hd0
{%- else %}
# unsupported disk-type {{ disk_type }}
{%- endif %}
[exporter]
recipe = slapos.recipe.template:jinja2
url = {{ template_kvm_export }}
output = ${directory:bin}/${slap-parameter:namebase}-exporter
# Resilient stack wants a "wrapper" parameter
wrapper = ${:output}
context =
section directory directory
section buildout buildout
key socket_path kvm-instance:socket-path
key device :device
raw gzip_binary {{ gzip_binary }}
section disk disk
raw qmpbackup {{ qmpbackup }}
# Extends publish section with resilient parameters
[publish-connection-information]
......@@ -56,3 +61,10 @@ username = {{ slapparameter_dict['monitor-username'] }}
password = {{ slapparameter_dict['monitor-password'] }}
{% endif -%}
[check-backup-directory-promise]
# Check that the backup directory holds at most one FULL backup, no INC backup without a FULL one and no .partial leftovers
<= monitor-promise-base
promise = check_command_execute
name = check-backup-directory.py
config-command = {{ check_backup_directory }} ${directory:backup}/${disk:device} ${directory:tmp}
failure_amount = 5
......@@ -48,6 +48,7 @@ bin = ${buildout:directory}/bin
srv = ${buildout:directory}/srv
var = ${buildout:directory}/var
log = ${:var}/log
tmp = ${buildout:directory}/tmp
scripts = ${:etc}/run
services = ${:etc}/service
novnc-conf = ${:etc}/novnc
......@@ -65,10 +66,7 @@ output = ${directory:bin}/${slap-parameter:namebase}-importer
wrapper = ${:output}
context =
section directory directory
raw zcat_binary {{ zcat_binary }}
raw gzip_binary {{ gzip_binary }}
backup-disk-path = ${directory:backup}/virtual.qcow2
raw qmprebase {{ qmprebase }}
[kvm-disk-image-corruption-bin]
recipe = collective.recipe.template
......
......@@ -126,6 +126,15 @@ template-replicated-destination = ${template-replicated:target}
import-list = file parts :template-parts-destination
file replicated :template-replicated-destination
[qmpbackup-binary]
# Wrapper around the qmpbackup script of the software release, run with PATH
# pointing at the bin directory of qemu.
# NOTE(review): presumably needed so that qmpbackup can find the qemu tools
# it calls - confirm. The doubled dollar sign in wrapper-path presumably
# defers that lookup to the instance buildout - confirm.
recipe = slapos.cookbook:wrapper
environment =
PATH=${qemu:location}/bin
wrapper-path =
$${buildout:bin-directory}/qmpbackup
command-line =
${buildout:bin-directory}/qmpbackup
[dynamic-template-kvm-export]
recipe = slapos.recipe.template:jinja2
url = ${template-kvm-export:location}/instance-kvm-export.cfg.jinja2
......@@ -137,8 +146,18 @@ context =
raw kvm_template $${dynamic-template-kvm:output}
raw template_kvm_export ${template-kvm-export-script:target}
key pbsready_export_template template-pbsready-export:output
raw gzip_binary ${gzip:location}/bin/gzip
key slapparameter_dict slap-configuration:configuration
key qmpbackup qmpbackup-binary:wrapper-path
raw check_backup_directory ${check-backup-directory.sh:output}
[qmprebase-binary]
# Wrapper around the qmprebase script of the software release, run with PATH
# pointing at the bin directory of qemu.
# NOTE(review): presumably needed so that qmprebase can find qemu-img when
# rebasing incremental backups - confirm.
recipe = slapos.cookbook:wrapper
environment =
PATH=${qemu:location}/bin
wrapper-path =
$${buildout:bin-directory}/qmprebase
command-line =
${buildout:bin-directory}/qmprebase
[dynamic-template-kvm-import]
recipe = slapos.recipe.template:jinja2
......@@ -152,8 +171,7 @@ context =
raw template_kvm_import ${template-kvm-import-script:target}
key pbsready_import_template template-pbsready-import:output
key slapparameter_dict slap-configuration:configuration
raw zcat_binary ${gzip:location}/bin/zcat
raw gzip_binary ${gzip:location}/bin/gzip
key qmprebase qmprebase-binary:wrapper-path
[dynamic-template-nbd]
<= jinja2-template-base
......
......@@ -10,7 +10,6 @@ extends =
../../component/netcat/buildout.cfg
../../component/nginx/buildout.cfg
../../component/pycurl/buildout.cfg
../../component/gzip/buildout.cfg
../../stack/slapos.cfg
../../stack/resilient/buildout.cfg
buildout.hash.cfg
......@@ -19,6 +18,7 @@ extends =
# to avoid versioning issues
common-parts =
template
qmpbackup
# XXX: we have to manually add this for resilience
pbs-recipe-egg
......@@ -32,6 +32,18 @@ parts = ${:common-parts}
# In qemu builtin vnc server, and make it available only for localhost
# so that only novnc can listen to it.
[qmpbackup]
# Install the qmpbackup and qemu.qmp eggs. The qmpbackup 0.29 source tarball
# is made discoverable through the GitHub release URL added to find-links.
recipe = zc.recipe.egg
eggs =
qemu.qmp
qmpbackup
find-links +=
https://github.com/abbbi/qmpbackup/releases/download/v0.29/qmpbackup-0.29.tar.gz
[versions]
qemu.qmp = 0.0.3:whl
qmpbackup = 0.29
[python-with-eggs]
recipe = zc.recipe.egg
interpreter = ${:_buildout_section_name_}
......@@ -129,3 +141,7 @@ context =
[whitelist-domains-default]
<= download-base
[check-backup-directory.sh]
# Backup directory checker script, placed in its own parts directory.
# Its settings other than the output path come from template-base.
<= template-base
output = ${buildout:parts-directory}/${:_buildout_section_name_}/check-backup-directory.sh
#!/bin/bash
# Sanity-check a backup directory holding FULL*/INC* qcow2 files.
#
# Usage: check-backup-directory.sh BACKUP_DIRECTORY TMP_DIRECTORY
#
# Exit status:
#   0 - BACKUP_DIRECTORY does not exist yet, or no problem was found
#   1 - more than one FULL backup, INC backups without any FULL backup,
#       leftover *.partial files, or the scratch file could not be created
# On failure the reason and the offending file names are printed on stdout.
directory=$1
tmp=$2

# support the case of a directory which is not ready yet
if [ ! -d "$directory" ] ; then
  exit 0
fi

# Without a scratch file the checks below cannot run: the redirections would
# fail and "wc -l" would read stdin instead of a file, so fail loudly here.
if ! tmpfile=$(mktemp -p "$tmp") ; then
  echo "Cannot create temporary file in $tmp"
  exit 1
fi
# single quotes: expand (and quote) the file name when the trap fires
trap 'rm -f "$tmpfile"' EXIT TERM INT

# at most one FULL backup is expected
find "$directory" -type f -name 'FULL*qcow2' -printf '%f\n' > "$tmpfile"
full_amount=$(wc -l < "$tmpfile")
if [ "$full_amount" -gt 1 ]; then
  echo "Too many FULL backups"
  cat "$tmpfile"
  exit 1
fi

# incremental backups are useless without the FULL backup they build on
find "$directory" -type f -name 'INC*qcow2' -printf '%f\n' > "$tmpfile"
if [ "$(wc -l < "$tmpfile")" -gt 0 ] && [ "$full_amount" -eq 0 ] ; then
  echo "INC present but no FULL backup"
  cat "$tmpfile"
  exit 1
fi

# a *.partial file is reported as a problem
find "$directory" -type f -name '*.partial' -printf '%f\n' > "$tmpfile"
if [ "$(wc -l < "$tmpfile")" -ne 0 ]; then
  echo "Partial file present"
  cat "$tmpfile"
  exit 1
fi

exit 0
......@@ -6,25 +6,38 @@ set -e
LC_ALL=C
export LC_ALL
BACKUP_DIR={{ directory['backup'] }}
BACKUP_FILE=virtual.qcow2
QMP_CLIENT={{ buildout['directory'] }}/software_release/bin/qemu-qmp-client
$QMP_CLIENT --socket {{ socket_path }} --drive-backup $BACKUP_DIR/$BACKUP_FILE {{ device }}
# Due to the way qmp works, the VM file cannot be compressed on the fly.
# Although the compression step is optional, the importer uses the .gz file
# if present. So, remove it if you are disabling the compression.
# The downside of compression, here, is the temporary usage of more disk space
# in the exporter node. The goal is to minimize disk usage on the PBS server.
log=$(mktemp --tmpdir={{ directory['tmp'] }})
trap "rm -f $log" EXIT TERM INT
set +e
qmpbackup="{{ qmpbackup }} --socket {{ socket_path }} backup --compress --target $BACKUP_DIR --include {{ disk['device'] }}"
$qmpbackup --level auto > $log
RESULT=$?
cat $log
if [ "$RESULT" -ne 0 ] ; then
  # Recover from an unfinished previous backup: when the failure message
  # reports a partial backup for this device, drop the *.partial files and
  # retry once. Any other failure is propagated unchanged.
  # grep -E replaces egrep, which GNU grep declares obsolescent.
  if grep -E -q 'Partial backup found in.*{{ disk['device']}}.*possible broken backup chain. Execute new full backup' "$log" ; then
    find "$BACKUP_DIR"/{{ disk['device'] }} -name '*.partial' -delete
    # $qmpbackup holds a whole command line, so it must stay unquoted
    $qmpbackup --level auto || exit $?
    echo "Recovered from partial backup by removing partial"
  else
    exit "$RESULT"
  fi
fi
set -e
# If you want to compress the file in-place:
# truncate -s $(gzip -c $BACKUP_FILE | dd of=$BACKUP_FILE conv=notrunc 2>&1 | sed -n '$ s/ .*$// p') $BACKUP_FILE
# but the importer script would have to be adapted.
# as new style backup went fine delete potential old style backup
rm -f $BACKUP_DIR/virtual.qcow2{,.gz}
echo "Compressing backup..."
{{ gzip_binary }} --force --rsyncable $BACKUP_DIR/$BACKUP_FILE
# cleanup the backup directory from too old backups, especially important after take-over
recent_full=$(find $BACKUP_DIR -type f -name 'FULL-*.qcow2' -exec ls -t1 {} + | head -n1)
if [ x"$recent_full" != x"" ] ; then
for f in $(find $BACKUP_DIR -type f -name '*qcow2' \! -newer $recent_full) ; do
if [ "$f" != "$recent_full" ] ; then
rm -vf $f
fi
done
fi
cd $BACKUP_DIR && find -type f ! -name backup.signature -print0 | xargs -0 sha256sum | LC_ALL=C sort -k 66 > backup.signature
......@@ -4,7 +4,6 @@ set -e
VM_DIR={{ directory['srv'] }}
BACKUP_DIR={{ directory['backup'] }}
VM_FILE=virtual.qcow2
LC_ALL=C
export LC_ALL
umask 077
......@@ -18,11 +17,30 @@ write_backup_proof () {
# For now we just make the diff before
write_backup_proof
if [ -f "$BACKUP_DIR/${VM_FILE}.gz" ]; then
{{ gzip_binary }} -t "$BACKUP_DIR/${VM_FILE}.gz" || exit 10
{{ zcat_binary }} "$BACKUP_DIR/${VM_FILE}.gz" > $VM_DIR/$VM_FILE
tmpfile=$(mktemp --tmpdir={{ directory['tmp'] }})
# use temporary space inside of the partition, as it can be quite big
tmpdir=$(mktemp -d --tmpdir={{ directory['tmp'] }})
# assure the temporary directory is cleaned up
trap "rm -rf $tmpdir $tmpfile" EXIT TERM INT
cp -a $BACKUP_DIR/* $tmpdir
if [ -d $tmpdir/ide0-hd0 ] ; then
disk_type="ide0-hd0"
elif [ -d $tmpdir/virtio0 ] ; then
disk_type=virtio0
else
rm $VM_DIR/$VM_FILE
cp $BACKUP_DIR/$VM_FILE $VM_DIR/$VM_FILE
echo "Unsupported disk type"
exit 1
fi
if [ $(find $tmpdir/$disk_type -name 'INC-*' | wc -l | cut -d ' ' -f 1) -gt 0 ] ; then
{{ qmprebase }} rebase --dir $tmpdir/$disk_type
fi
# assert that restore went well and there is only one file
find $tmpdir/$disk_type -type f -name 'FULL*qcow2' -printf '%f\n' > $tmpfile
if [ $(wc -l $tmpfile | cut -d ' ' -f 1) -ne 1 ] ; then
echo "Wrong amount of FULL backups"
cat $tmpfile
exit 1
fi
cp $tmpdir/$disk_type/* $VM_DIR/virtual.qcow2
......@@ -221,6 +221,12 @@ class KvmMixin:
return os.path.join(
cls.slap._instance_root, cls.getPartitionId(instance_type), *paths)
@classmethod
def getBackupPartitionPath(cls, *paths):
  """Return a path below the kvm-export backup directory of the disk.

  The directory is srv/backup/kvm/<device> inside the kvm-export partition,
  where <device> is looked up in cls.disk_type_backup_mapping using
  cls.disk_type; any extra *paths components are appended.
  """
  device_directory = cls.disk_type_backup_mapping[cls.disk_type]
  return cls.getPartitionPath(
    'kvm-export', 'srv', 'backup', 'kvm', device_directory, *paths)
def getConnectionParameterDictJson(self):
return json.loads(
self.computer_partition.getConnectionParameterDict()['_'])
......@@ -759,6 +765,151 @@ class CronMixin(object):
return job_list_output
class TestInstanceResilientBackupMixin(CronMixin, KvmMixin):
  """Shared setup and helpers for the kvm-resilient backup tests.

  Subclasses may override disk_type; disk_type_backup_mapping gives the
  device name used as backup sub-directory for each disk type.
  """
  __partition_reference__ = 'irb'
  instance_max_retry = 20
  disk_type = 'virtio'
  disk_type_backup_mapping = {
    'virtio': 'virtio0',
    'ide': 'ide0-hd0',
  }

  @classmethod
  def getInstanceParameterDict(cls):
    # virtio is requested implicitly, any other disk type explicitly
    if cls.disk_type == 'virtio':
      return {}
    return {'disk-type': cls.disk_type}

  @classmethod
  def getInstanceSoftwareType(cls):
    return 'kvm-resilient'

  def setUp(self):
    super().setUp()
    # exactly one partition is expected to contain template-kvm-import.cfg
    pattern = os.path.join(
      self.slap.instance_directory, '*', 'template-kvm-import.cfg')
    match_list = glob.glob(pattern)
    self.assertEqual(1, len(match_list))
    self.importer_partition = os.path.dirname(match_list[0])

  def call_exporter(self):
    # Run the backup cron job of kvm-export, require success and return
    # its decoded standard output.
    result = self.executeCronDJob('kvm-export', 'backup')
    self.assertEqual(len(result), 1)
    job = result[0]
    returncode = job.returncode
    output = job.stdout.decode('utf-8')
    self.assertEqual(0, returncode, output)
    return output
@skipUnlessKvm
class TestInstanceResilientBackupImporter(
  TestInstanceResilientBackupMixin, KVMTestCase):
  """Export, import and takeover scenario of the kvm-resilient backup."""

  def test(self):
    equeue_file = os.path.join(
      self.importer_partition, 'var', 'log', 'equeue.log')
    destination_qcow2 = os.path.join(
      self.importer_partition, 'srv', 'virtual.qcow2')
    destination_backup = os.path.join(
      self.importer_partition, 'srv', 'backup', 'kvm',
      self.disk_type_backup_mapping[self.disk_type])
    # sanity check - no export/import happened yet
    self.assertFalse(os.path.exists(self.getBackupPartitionPath()))
    self.call_exporter()

    def awaitBackup(equeue_file):
      # Poll the equeue log once per second, at most 30 times, until it
      # reports success; fail the test with the last log content otherwise.
      for _ in range(30):
        with open(equeue_file, 'r') as fh:
          equeue_log = fh.read()
        if 'finished successfully' in equeue_log:
          break
        time.sleep(1)
      else:
        self.fail('Backup not finished: %s' % (equeue_log))
      return equeue_log

    # first round: no rebase is expected to show up in the log
    equeue_log = awaitBackup(equeue_file)
    self.assertNotIn('qemu-img rebase', equeue_log)
    # os.listdir returns entries in arbitrary order, so compare sorted lists
    self.assertEqual(
      sorted(os.listdir(self.getBackupPartitionPath())),
      sorted(os.listdir(destination_backup))
    )
    self.assertTrue(os.path.exists(destination_qcow2))
    # clean up equeue file for precise assertion
    with open(equeue_file, 'w') as fh:
      fh.write('')
    # drop backup destination to assert its recreation
    os.unlink(destination_qcow2)
    # second round: this time a rebase has to be logged
    self.call_exporter()
    equeue_log = awaitBackup(equeue_file)
    self.assertIn('qemu-img rebase', equeue_log)
    self.assertEqual(
      sorted(os.listdir(self.getBackupPartitionPath())),
      sorted(os.listdir(destination_backup))
    )
    self.assertTrue(os.path.exists(destination_qcow2))
    # takeover
    connection_parameter = self.computer_partition.getConnectionParameterDict()
    takeover_result = requests.post(
      connection_parameter['takeover-kvm-1-url'],
      data={
        'password': connection_parameter['takeover-kvm-1-password']})
    self.assertEqual(httplib.OK, takeover_result.status_code)
    self.assertTrue(takeover_result.text.startswith('Success.'))
    # the real assertions come from re-stabilizing the instance tree
    self.slap.waitForInstance(max_retry=10)
    # check that all stabilizes after backup after takeover
    self.call_exporter()
    self.slap.waitForInstance(max_retry=10)
@skipUnlessKvm
class TestInstanceResilientBackupImporterIde(
TestInstanceResilientBackupImporter):
# Re-runs the inherited import scenario with the 'ide' disk type.
disk_type = 'ide'
@skipUnlessKvm
class TestInstanceResilientBackupExporter(
  TestInstanceResilientBackupMixin, KVMTestCase):
  """Exporter run on a clean backup directory and after a partial backup."""

  def test(self):
    recovered_message = 'Recovered from partial backup by removing partial'

    def backup_amount(pattern):
      # number of files in the backup directory matching the glob pattern
      return len(glob.glob(self.getBackupPartitionPath(pattern)))

    # the first export creates a single FULL backup without any recovery
    status_text = self.call_exporter()
    self.assertEqual(backup_amount('FULL-*.qcow2'), 1)
    self.assertEqual(backup_amount('INC-*.qcow2'), 0)
    self.assertNotIn(recovered_message, status_text)

    # cover .partial file in the backup directory with fallback to full
    full_backup = glob.glob(self.getBackupPartitionPath('FULL-*'))[0]
    with open(full_backup + '.partial', 'w') as partial:
      partial.write('')
    status_text = self.call_exporter()
    self.assertEqual(backup_amount('FULL-*.qcow2'), 1)
    self.assertEqual(backup_amount('INC-*.qcow2'), 1)
    self.assertIn(recovered_message, status_text)

    # the promise plugin has to be present in the kvm-export partition
    promise_path = self.getPartitionPath(
      'kvm-export', 'etc', 'plugin', 'check-backup-directory.py')
    self.assertTrue(os.path.exists(promise_path))
@skipUnlessKvm
class TestInstanceResilientBackupExporterIde(
TestInstanceResilientBackupExporter):
# Re-runs the inherited exporter scenario with the 'ide' disk type.
disk_type = 'ide'
@skipUnlessKvm
class TestInstanceResilient(KVMTestCase, KvmMixin):
__partition_reference__ = 'ir'
......@@ -775,25 +926,6 @@ class TestInstanceResilient(KVMTestCase, KvmMixin):
cls.kvm0_ipv6 = cls.getPartitionIPv6(cls.getPartitionId('kvm0'))
cls.kvm1_ipv6 = cls.getPartitionIPv6(cls.getPartitionId('kvm1'))
def test_kvm_exporter(self):
exporter_partition = os.path.join(
self.slap.instance_directory,
self.__partition_reference__ + '2')
backup_path = os.path.join(
exporter_partition, 'srv', 'backup', 'kvm', 'virtual.qcow2.gz')
exporter = os.path.join(exporter_partition, 'bin', 'exporter')
if os.path.exists(backup_path):
os.unlink(backup_path)
def call_exporter():
try:
return (0, subprocess.check_output(
[exporter], stderr=subprocess.STDOUT).decode('utf-8'))
except subprocess.CalledProcessError as e:
return (e.returncode, e.output.decode('utf-8'))
status_code, status_text = call_exporter()
self.assertEqual(0, status_code, status_text)
def test(self):
connection_parameter_dict = self\
.computer_partition.getConnectionParameterDict()
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment