#!/bin/bash -e
# pull/restore gitlab data into/from git-backup
# Copyright (C) 2015-2016 Nexedi SA and Contributors.
#                         Kirill Smelkov <kirr@nexedi.com>
#
# This program is free software: you can Use, Study, Modify and Redistribute
# it under the terms of the GNU General Public License version 3, or (at your
# option) any later version, as published by the Free Software Foundation.
#
# This program is distributed WITHOUT ANY WARRANTY; without even the implied
# warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
#
# See COPYING file for full licensing terms.

# `-e` in the shebang is lost when the script is started as `bash <script>`,
# so request it explicitly as well.
set -e

# die <message>...
# Print the message to stderr and terminate the script with status 1.
die() {
    echo "$@" 1>&2
    exit 1
}

GITLAB_BACKUP_PATH=
GITLAB_REPOS_PATH=

# database name & PostgreSQL settings how to connect to server
GITLAB_DATABASE=
export PGUSER=
export PGHOST=
export PGPORT=
export PGPASSWORD=

# need_gitlab_config
# Load GitLab settings into the GITLAB_* and PG* globals above.
# Does nothing if GITLAB_BACKUP_PATH is already set (config already loaded).
# Dies if a mandatory value cannot be detected or the DB is not postgresql.
need_gitlab_config() {
    if test -n "$GITLAB_BACKUP_PATH"; then
        return
    fi

    local gitlab_db_adapter END

    # various gitlab config values extracted in 1 go (gitlab is very slow to load)
    # NOTE `read -r` - values (e.g. the password) may contain backslashes.
    {
        read -r GITLAB_BACKUP_PATH
        read -r GITLAB_REPOS_PATH
        read -r GITLAB_DATABASE

        read -r gitlab_db_adapter
        test -n "$gitlab_db_adapter" || die "E: cannot detect GitLab DB adapter"
        test "$gitlab_db_adapter" == "postgresql" || \
            die "E: only postgresql is supported (gitlab db is $gitlab_db_adapter)"

        read -r PGUSER
        read -r PGHOST
        read -r PGPORT
        IFS= read -r PGPASSWORD     # keep leading/trailing whitespace of the password
        read -r END
    } < <(gitlab-rails r '
c = Gitlab.config
s = c.gitlab_shell
puts c.backup.path, s.repos_path

c = Backup::Database.new.config
puts c["database"]
puts c["adapter"], c["username"], c["host"], c["port"], c["password"]

puts "END"
')

    test -n "$GITLAB_BACKUP_PATH" || die "E: cannot detect GITLAB_BACKUP_PATH"
    test -n "$GITLAB_REPOS_PATH" || die "E: cannot detect GITLAB_REPOS_PATH"
    test -n "$GITLAB_DATABASE" || die "E: cannot detect GITLAB_DATABASE"
    test "$END" == "END" || die "E: self-consistency failure (END)"
}

# backup_pull
# Dump gitlab data (misc files + database) into a temporary directory under
# the current directory, make the dump git-friendly (unpacked tarballs, sorted
# table data) and pull it together with the repositories into git-backup.
backup_pull() {
    need_gitlab_config

    local tmpd backup_tar db_pgdump

    # 1. dump all gitlab data except repositories & db
    echo " * Dumping gitlab data (except repositories & db)"
    tmpd=$(mktemp -d "$PWD/gitlab-backup.XXXXXX")
    gitlab-rake gitlab:backup:create SKIP=repositories,db | tee "$tmpd/gitlab_backup_create.out"
    # the pipeline status is that of tee - check gitlab-rake itself
    test "${PIPESTATUS[0]}" -eq 0 || die "E: gitlab:backup:create failed"
    backup_tar=$(grep "^Creating backup archive: .* done" "$tmpd/gitlab_backup_create.out") || \
        die "E: Cannot detect backup tar"

    # 'Creating backup archive: 1440613567_gitlab_backup.tar ... done' -> 1440613567_gitlab_backup.tar
    backup_tar=$(grep -o '[^ ]*\.tar' <<<"$backup_tar")
    # 1440613567_gitlab_backup.tar -> /var/opt/gitlab/backups/1440613958_gitlab_backup.tar
    backup_tar="$GITLAB_BACKUP_PATH/$backup_tar"

    # 2. unpack backup_tar so it is ready to be pulled into git-backup
    mkdir "$tmpd/gitlab_backup"
    tar -C "$tmpd/gitlab_backup" -xf "$backup_tar"

    # unpack tarballs so files are better stored in git
    # ( each X.tar.gz file is replaced by X.tar.gz/ directory with its content )
    find "$tmpd/gitlab_backup" -maxdepth 1 -type f -name "*.tar.gz" | \
    while IFS= read -r tar; do
        mv "$tar" "$tar.x"
        mkdir "$tar"
        tar xf "$tar.x" -C "$tar"
        rm "$tar.x"
        # keep empty dirs too
        test -n "$(ls -A "$tar")" || touch "$tar/.gitlab-backup-keep"
    done

    # 3. dump database ...
    echo " * Dumping database"
    mkdir -p "$tmpd/gitlab_backup/db"
    db_pgdump="$tmpd/gitlab_backup/db/database.pgdump"
    # \\\" yields \" in the ruby source, i.e. a literal " in the command line
    # ruby hands to the shell - so the dump path stays one word there.
    gitlab-rake -e "exec \"pg_dump -Fd -Z0 -f \\\"$db_pgdump\\\" $GITLAB_DATABASE\""

    # ... sort each table data
    #
    # pg_dump dumps table data with `COPY ... TO stdout` which does not guarantee any ordering -
    #   http://git.postgresql.org/gitweb/?p=postgresql.git;a=blob;f=src/bin/pg_dump/pg_dump.c;h=aa01d6a6;hb=HEAD#l1590
    #   http://stackoverflow.com/questions/24622579/does-or-can-the-postgresql-copy-to-command-guarantee-a-particular-row-order
    # - in fact it dumps data as stored raw in DB pages, and every record update changes row order.
    #
    # On the other hand, Rails by default adds integer `id` first column to
    # every table as convention -
    #   http://edgeguides.rubyonrails.org/active_record_basics.html
    # and GitLab does not override this. So we can sort tables on id and this
    # way make data order stable.
    #
    # ( and even if there is no id column we can sort - as COPY does not
    #   guarantee ordering, we can change the order of rows in _whatever_ way and
    #   the dump will still be correct )
    find "$db_pgdump" -maxdepth 1 -type f -name "*.dat" -a \! -name toc.dat | \
    while IFS= read -r F; do
        # split file into data with numeric-start lines and tail with non-numeric lines
        touch "$F.tail"
        ntail=1
        while true; do
            tail --lines "$ntail" "$F" > "$F.tail.x"
            test "$ntail" == "$(wc -l <"$F.tail.x")" || break       # no data part at all ?
            head -1 "$F.tail.x" | grep -q '^[0-9]\+' && break       # first data line
            # this line was non-numeric too - prepare for next iteration
            mv "$F.tail.x" "$F.tail"
            ntail=$((ntail + 1))
        done
        ntail=$(wc -l <"$F.tail")
        head --lines=-"$ntail" "$F" >"$F.data"

        # sort data part
        sort -n "$F.data" >"$F.data.x"

        # re-glue data & tail together
        cat "$F.data.x" "$F.tail" >"$F.x"

        # assert #lines stayed the same (just in case)
        nline=$(wc -l <"$F")
        nlinex=$(wc -l <"$F.x")
        test "$nline" == "$nlinex" || die "E: assertion failed while sorting $F"

        mv "$F.x" "$F"
        rm -f "$F".data{,.x} "$F".tail{,.x}
    done

    # 4. pull gitlab data into git-backup
    # gitlab/misc - db + uploads + ...
    # gitlab/repo - git repositories
    echo " * git-backup pull everything"
    # NOTE $GIT_BACKUP is intentionally left unquoted (as it always was).
    $GIT_BACKUP pull "$tmpd/gitlab_backup:gitlab/misc" "$GITLAB_REPOS_PATH:gitlab/repo"

    # mark backup_tar as pulled and cleanup
    mv "$backup_tar" "$backup_tar.pulled"
    rm -rf "$tmpd"

    echo OK
}

# backup_restore <commit-ish>
# Extract gitlab data saved at <commit-ish> from git-backup, re-create the
# tarball gitlab:backup:restore expects under $GITLAB_BACKUP_PATH, extract
# repositories next to $GITLAB_REPOS_PATH and print the commands which finish
# the restoration (they are not run automatically).
backup_restore() {
    local HEAD=$1
    need_gitlab_config

    local tmpd backup_info db_pgdump db_sql backup_created_at backup_tar

    # 1. extract all gitlab data except repositories
    echo " * Extracting gitlab data (except repositories)"
    tmpd=$(mktemp -d "$PWD/gitlab-backup.XXXXXX")
    $GIT_BACKUP restore "$HEAD" gitlab/misc:"$tmpd/gitlab_backup"
    backup_info="$tmpd/gitlab_backup/backup_information.yml"

    # recreate tarballs from *.tar.gz directories
    find "$tmpd/gitlab_backup" -maxdepth 1 -type d -name "*.tar.gz" | \
    while IFS= read -r tar; do
        rm -f "$tar/.gitlab-backup-keep"
        mv "$tar" "$tar.x"
        tar cfz "$tar" -C "$tar.x" .
        rm -rf "$tar.x"
    done

    # if backup is in pgdump (not sql) format - decode it
    db_pgdump="$tmpd/gitlab_backup/db/database.pgdump"
    db_sql="$tmpd/gitlab_backup/db/database.sql"
    if [ -d "$db_pgdump" ]; then
        # convert database dump to plain-text sql (as gitlab restore expects)
        # ( \\\" -> literal " in the spawned command line, so paths stay quoted )
        gitlab-rake -e "exec \"pg_restore --clean \\\"$db_pgdump\\\" >\\\"$db_sql\\\" \""
        rm -rf "$db_pgdump"

        # tweak gitlab's backup_information.yml so it says db is included in the backup
        sed -i -e 's/:skipped: repositories,db/:skipped: repositories/' "$backup_info"
    fi
    gzip "$db_sql"      # gzip sql dump, as gitlab expects .gz

    # 2. find out backup timestamp as saved by gitlab
    backup_created_at=$(grep :backup_created_at: "$backup_info" | sed -e 's/:backup_created_at: //')
    # 2015-08-27 11:32:37.201345216 +02:00 -> 1440667957
    backup_created_at=$(echo "$backup_created_at" | gitlab-rake -e 'puts Time.parse(STDIN.read).to_i')

    # 3. prepare tarball as would be created by gitlab:backup:create
    backup_tar="${backup_created_at}_gitlab_backup.tar"
    backup_tar="$GITLAB_BACKUP_PATH/$backup_tar"
    if test -e "$backup_tar"; then
        die "E: $backup_tar already exists"
    fi
    tar -C "$tmpd/gitlab_backup" -cf "$backup_tar" .
    rm -rf "$tmpd"  # tmpd no longer needed

    # 4. extract repositories into .../repositories.<timestamp>
    $GIT_BACKUP restore "$HEAD" gitlab/repo:"${GITLAB_REPOS_PATH}.${backup_created_at}"

    # extraction complete - now proceed with actual backup restore
    # (which is mv repositories dir + load db)
    echo
    echo "Extraction complete. To actually restore data please do"
    echo "# TODO check, and make this run automatically"
    cat << EOF
# https://gitlab.com/gitlab-org/gitlab-ce/blob/master/doc/raketasks/backup_restore.md
gitlab-ctl stop unicorn
gitlab-ctl stop sidekiq
mv ${GITLAB_REPOS_PATH} ${GITLAB_REPOS_PATH}.old
mv ${GITLAB_REPOS_PATH}.${backup_created_at} ${GITLAB_REPOS_PATH}
gitlab-rake gitlab:backup:restore BACKUP=$backup_created_at
gitlab-ctl start
gitlab-rake gitlab:satellites:create # will go away after gitlab 8.0
gitlab-rake gitlab:check SANITIZE=true
EOF
}

# ----------------------------------------

# make sure git-backup is present or explicitly specified
GIT_BACKUP=${GIT_BACKUP:-$(command -v git-backup)} || die "E: where is git-backup ?"

# make sure we run under proper user used by gitlab
gitlab_user=$(gitlab-rake -e "exec 'whoami'")
me=$(whoami)
if test "$me" != "$gitlab_user"; then
    #echo respawning ...
    # re-run ourselves through gitlab-rake, from the same directory and with
    # the same arguments
    exec gitlab-rake -e "Dir.chdir \"$PWD\"; exec \"GIT_BACKUP=$GIT_BACKUP $0 $*\""
fi

# we are working with potentially sensitive data
# -> limit what could be read to current user only
umask 0077  # XXX maybe not good - e.g. git-data/repositories should (?) be rwxrwx---

# usage
# Print the command-line synopsis to stdout.
usage() {
    echo "Usage: gitlab-backup [pull | restore <commit-ish>]"
}

test $# -ge 1 || die "$(usage)"
action="$1"
shift

case "$action" in
    pull)
        backup_pull
        ;;
    restore)
        test $# -ge 1 || die "$(usage)"
        backup_restore "$1"
        ;;
    -h)
        usage
        exit 0
        ;;
    *)
        die "$(usage)"
        ;;
esac