#!/bin/bash -e
# pull/restore gitlab data into/from git-backup
# Copyright (C) 2015-2016  Nexedi SA and Contributors.
#                          Kirill Smelkov <kirr@nexedi.com>
#
# This program is free software: you can Use, Study, Modify and Redistribute
# it under the terms of the GNU General Public License version 3, or (at your
# option) any later version, as published by the Free Software Foundation.
#
# This program is distributed WITHOUT ANY WARRANTY; without even the implied
# warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
#
# See COPYING file for full licensing terms.

# die MESSAGE...
#
# Print MESSAGE (all arguments joined into one line) to stderr and terminate
# the current shell with exit status 1.
die() {
    # printf, not echo: echo would take a leading -n / -e / -E argument as an
    # option and silently drop it from the message.
    # "$*" joins the arguments with the first character of IFS (a space by default).
    printf '%s\n' "$*" 1>&2
    exit 1
}


# gitlab settings - all start empty and are filled in by need_gitlab_config
GITLAB_BACKUP_PATH=""
GITLAB_REPOS_PATH=""

# database name & PostgreSQL settings how to connect to server
# (PG* are exported so that child processes, e.g. pg_dump, see them)
GITLAB_DATABASE=""
PGUSER=""
PGHOST=""
PGPORT=""
PGPASSWORD=""
export PGUSER PGHOST PGPORT PGPASSWORD

# need_gitlab_config
#
# Make sure gitlab configuration is loaded into
#
#   GITLAB_BACKUP_PATH, GITLAB_REPOS_PATH, GITLAB_DATABASE,
#   PGUSER, PGHOST, PGPORT, PGPASSWORD
#
# The values are queried from gitlab-rails only once: when GITLAB_BACKUP_PATH
# is already non-empty the function returns immediately.
#
# Dies if the DB adapter is not postgresql, or if any mandatory value could
# not be detected.
need_gitlab_config() {
    test -n "$GITLAB_BACKUP_PATH" && return

    # scratch values - only needed for checks inside this function
    local gitlab_db_adapter END

    # various gitlab config values extracted in 1 go (gitlab is very slow to load)
    #
    # `IFS= read -r` takes every line verbatim: plain `read` would interpret
    # backslashes and strip leading/trailing whitespace, corrupting e.g. a
    # password which contains such characters.
    {
        IFS= read -r GITLAB_BACKUP_PATH
        IFS= read -r GITLAB_REPOS_PATH

        IFS= read -r GITLAB_DATABASE
        IFS= read -r gitlab_db_adapter
        test -n "$gitlab_db_adapter"    || die "E: cannot detect GitLab DB adapter"
        test "$gitlab_db_adapter" == "postgresql" || \
            die "E: only postgresql is supported (gitlab db is $gitlab_db_adapter)"
        IFS= read -r PGUSER
        IFS= read -r PGHOST
        IFS= read -r PGPORT
        IFS= read -r PGPASSWORD

        # sentinel: verifies that we consumed exactly the lines printed below
        IFS= read -r END
    } < <(gitlab-rails r '
        c = Gitlab.config
        s = c.gitlab_shell
        puts c.backup.path, s.repos_path

        c = Backup::Database.new.config
        puts c["database"]
        puts c["adapter"], c["username"], c["host"], c["port"], c["password"]

        puts "END"
    ')

    test -n "$GITLAB_BACKUP_PATH"   || die "E: cannot detect GITLAB_BACKUP_PATH"
    test -n "$GITLAB_REPOS_PATH"    || die "E: cannot detect GITLAB_REPOS_PATH"
    test -n "$GITLAB_DATABASE"      || die "E: cannot detect GITLAB_DATABASE"
    test "$END" == "END"            || die "E: self-consistency failure (END)"
}


# backup_pull
#
# Dump current gitlab state and pull it into git-backup:
#
#   1. `gitlab:backup:create` everything except repositories & db;
#   2. unpack the resulting tarball (and *.tar.gz inside it) into a temporary
#      directory created under current directory;
#   3. dump the database with pg_dump in directory format and sort the data of
#      each table, so that the dump is stable between runs;
#   4. `$GIT_BACKUP pull` the unpacked data as gitlab/misc and the repositories
#      as gitlab/repo; then rename the gitlab tarball to *.pulled and remove
#      the temporary directory.
#
# All paths are quoted, so the function also works when current directory or
# gitlab paths contain whitespace.
backup_pull() {
    need_gitlab_config

    local tmpd backup_tar db_pgdump
    local tgz F ntail nline nlinex

    # 1. dump all gitlab data except repositories & db
    echo " * Dumping gitlab data (except repositories & db)"
    tmpd=$(mktemp -d "$(pwd)/gitlab-backup.XXXXXX")
    gitlab-rake gitlab:backup:create SKIP=repositories,db | tee "$tmpd/gitlab_backup_create.out"
    backup_tar=$(grep "^Creating backup archive: .* done" "$tmpd/gitlab_backup_create.out") || \
        die "E: Cannot detect backup tar"

    # 'Creating backup archive: 1440613567_gitlab_backup.tar ... done' -> 1440613567_gitlab_backup.tar
    backup_tar=$(echo "$backup_tar" | grep -o '[^ ]*\.tar')
    # 1440613567_gitlab_backup.tar -> /var/opt/gitlab/backups/1440613567_gitlab_backup.tar
    backup_tar="$GITLAB_BACKUP_PATH/$backup_tar"


    # 2. unpack backup_tar so it is ready to be pulled into git-backup
    mkdir "$tmpd/gitlab_backup"
    tar -C "$tmpd/gitlab_backup" -xf "$backup_tar"
    # unpack tarballs so files are better stored in git
    # (every X.tar.gz file is replaced by directory X.tar.gz with its content)
    find "$tmpd/gitlab_backup" -maxdepth 1 -type f -name "*.tar.gz" | \
    while IFS= read -r tgz; do
        mv "$tgz" "$tgz.x"
        mkdir "$tgz"
        tar xf "$tgz.x" -C "$tgz"
        rm "$tgz.x"

        # keep empty dirs too
        test -n "$(ls -A "$tgz")" || touch "$tgz/.gitlab-backup-keep"
    done


    # 3. dump database ...
    echo " * Dumping database"
    mkdir -p "$tmpd/gitlab_backup/db"
    db_pgdump="$tmpd/gitlab_backup/db/database.pgdump"
    # The ruby code handed to gitlab-rake is
    #
    #   exec "pg_dump -Fd -Z0 -f \"<db_pgdump>\" <database>"
    #
    # i.e. the output path reaches pg_dump's command line inside double quotes.
    # ( \\\" is needed for that: with only \\" bash would close its own quote
    #   and expand $db_pgdump unquoted. )
    # NOTE a path containing " $ ` or \ is still not handled.
    gitlab-rake -e "exec \"pg_dump -Fd -Z0 -f \\\"$db_pgdump\\\" $GITLAB_DATABASE\""

    # ... sort each table data
    #
    # pg_dump dumps table data with `COPY ... TO stdout` which does not guarantee any ordering -
    #   http://git.postgresql.org/gitweb/?p=postgresql.git;a=blob;f=src/bin/pg_dump/pg_dump.c;h=aa01d6a6;hb=HEAD#l1590
    #   http://stackoverflow.com/questions/24622579/does-or-can-the-postgresql-copy-to-command-guarantee-a-particular-row-order
    # - in fact it dumps data as stored raw in DB pages, and every record update changes row order.
    #
    # On the other hand, Rails by default adds integer `id` first column to
    # every table as convention -
    #   http://edgeguides.rubyonrails.org/active_record_basics.html
    # and GitLab does not override this. So we can sort tables on id and this
    # way make data order stable.
    #
    # ( and even if there is no id column we can sort - as COPY does not
    #   guarantee ordering, we can change the order of rows in _whatever_ way and
    #   the dump will still be correct )
    #
    # NOTE(review): `sort -n` runs in the current locale, so ordering of rows
    # with equal numeric prefix may differ between locales - confirm whether
    # forcing LC_ALL=C is wanted.
    find "$db_pgdump" -maxdepth 1 -type f -name "*.dat" -a \! -name toc.dat | \
    while IFS= read -r F; do
        # split file into data with numeric-start lines and tail with non-numeric lines
        #
        # the tail is found by growing the window of last lines 1 line at a
        # time until its first line starts with a digit; $F.tail always holds
        # the last window which was still fully non-numeric.
        touch "$F.tail"
        ntail=1
        while true; do
            tail --lines "$ntail" "$F" > "$F.tail.x"
            test "$ntail" == "$(wc -l <"$F.tail.x")"    || break   # no data part at all ?
            head -n 1 "$F.tail.x" | grep -q '^[0-9]\+'  && break   # first data line

            # this line was non-numeric too - prepare for next iteration
            mv "$F.tail.x" "$F.tail"
            ntail=$((ntail + 1))
        done
        ntail=$(wc -l <"$F.tail")
        head --lines=-"$ntail" "$F" >"$F.data"

        # sort data part
        sort -n "$F.data" >"$F.data.x"

        # re-glue data & tail together
        cat "$F.data.x" "$F.tail" >"$F.x"

        # assert #lines stayed the same (just in case)
        nline=$(wc -l <"$F")
        nlinex=$(wc -l <"$F.x")
        test "$nline" == "$nlinex" || die "E: assertion failed while sorting $F"

        mv "$F.x" "$F"
        rm -f "$F.data"{,.x} "$F.tail"{,.x}
    done


    # 4. pull gitlab data into git-backup
    # gitlab/misc   - db + uploads + ...
    # gitlab/repo   - git repositories
    echo " * git-backup pull everything"
    # ( $GIT_BACKUP is left unquoted on purpose, as before, in case it holds a
    #   command with arguments )
    $GIT_BACKUP pull "$tmpd/gitlab_backup:gitlab/misc"  "$GITLAB_REPOS_PATH:gitlab/repo"

    # mark backup_tar as pulled and cleanup
    mv "$backup_tar" "$backup_tar.pulled"
    rm -rf "$tmpd"

    echo OK
}


# backup_restore <commit-ish>
#
# Extract gitlab data from git-backup at revision $1 and prepare it for
# `gitlab:backup:restore`:
#
#   1. restore gitlab/misc into a temporary directory (created under current
#      directory), recreate *.tar.gz files from the directories they were
#      unpacked to, and convert the database dump to gzipped plain sql;
#   2. find out the backup timestamp saved by gitlab;
#   3. pack everything as <timestamp>_gitlab_backup.tar in GITLAB_BACKUP_PATH
#      (dies if that file already exists);
#   4. restore gitlab/repo into ${GITLAB_REPOS_PATH}.<timestamp>.
#
# The final switch to restored data is not performed - the commands to do it
# are only printed.
#
# All paths and the commit-ish are quoted, so whitespace in them is handled.
backup_restore() {
    local HEAD=$1
    local tmpd backup_info db_pgdump backup_created_at backup_tar tgz

    need_gitlab_config

    # 1. extract all gitlab data except repositories
    echo " * Extracting gitlab data (except repositories)"
    tmpd=$(mktemp -d "$(pwd)/gitlab-backup.XXXXXX")

    # ( $GIT_BACKUP is left unquoted on purpose, as before, in case it holds a
    #   command with arguments )
    $GIT_BACKUP restore "$HEAD" gitlab/misc:"$tmpd/gitlab_backup"
    backup_info="$tmpd/gitlab_backup/backup_information.yml"

    # recreate tarballs from *.tar.gz directories
    find "$tmpd/gitlab_backup" -maxdepth 1 -type d -name "*.tar.gz" | \
    while IFS= read -r tgz; do
        # the keep file only marks an otherwise-empty directory - not original content
        rm -f "$tgz/.gitlab-backup-keep"

        mv "$tgz" "$tgz.x"
        tar cfz "$tgz" -C "$tgz.x" .
        rm -rf "$tgz.x"
    done

    # if backup is in pgdump (not sql) format - decode it
    db_pgdump="$tmpd/gitlab_backup/db/database.pgdump"
    if [ -d "$db_pgdump" ]; then
        # convert database dump to plain-text sql (as gitlab restore expects)
        #
        # The ruby code handed to gitlab-rake is
        #
        #   exec "pg_restore --clean \"<db_pgdump>\" >\"<...>/database.sql\" "
        #
        # i.e. both paths reach the command line inside double quotes.
        # ( \\\" is needed for that: with only \\" bash would close its own
        #   quote and expand $db_pgdump unquoted. )
        # NOTE a path containing " $ ` or \ is still not handled.
        gitlab-rake -e "exec \"pg_restore --clean \\\"$db_pgdump\\\" >\\\"$tmpd/gitlab_backup/db/database.sql\\\" \""
        rm -rf "$db_pgdump"

        # tweak gitlab's backup_information.yml so it says db is included in the backup
        sed -i -e 's/:skipped: repositories,db/:skipped: repositories/' "$backup_info"
    fi

    gzip "$tmpd/gitlab_backup/db/database.sql"  # gzip sql dump, as gitlab expects .gz

    # 2. find out backup timestamp as saved by gitlab
    backup_created_at=$(grep :backup_created_at: "$backup_info" |
                        sed -e s'/:backup_created_at: //')
    # 2015-08-27 11:32:37.201345216 +02:00 -> 1440667957
    backup_created_at=$(echo "$backup_created_at" |
                        gitlab-rake -e 'puts Time.parse(STDIN.read).to_i')

    # 3. prepare tarball as would be created by gitlab:backup:create
    backup_tar="${backup_created_at}_gitlab_backup.tar"
    backup_tar="$GITLAB_BACKUP_PATH/$backup_tar"
    if [ -e "$backup_tar" ]; then
        die "E: $backup_tar already exists"
    fi
    tar -C "$tmpd/gitlab_backup" -cf "$backup_tar" .

    rm -rf "$tmpd"  # tmpd no longer needed

    # 4. extract repositories into .../repositories.<timestamp>
    $GIT_BACKUP restore "$HEAD" gitlab/repo:"${GITLAB_REPOS_PATH}.${backup_created_at}"


    # extraction complete - now proceed with actual backup restore
    # (which is mv repositories dir + load db)
    echo
    echo "Extraction complete. To actually restore data please do"
    echo "# TODO check, and make this run automatically"
    cat << EOF
# https://gitlab.com/gitlab-org/gitlab-ce/blob/master/doc/raketasks/backup_restore.md
gitlab-ctl stop unicorn
gitlab-ctl stop sidekiq
mv ${GITLAB_REPOS_PATH} ${GITLAB_REPOS_PATH}.old
mv ${GITLAB_REPOS_PATH}.${backup_created_at} ${GITLAB_REPOS_PATH}
gitlab-rake gitlab:backup:restore BACKUP=$backup_created_at
gitlab-ctl start
gitlab-rake gitlab:satellites:create    # will go away after gitlab 8.0
gitlab-rake gitlab:check SANITIZE=true
EOF
}


# ----------------------------------------

# git-backup to use: either given explicitly via $GIT_BACKUP, or found in $PATH
GIT_BACKUP=${GIT_BACKUP:-$(which git-backup)} || die "E: where is git-backup ?"

# we have to run as the same user gitlab code runs under.
# `whoami` executed through gitlab-rake tells who that is; if it is not us -
# re-execute ourselves through gitlab-rake in the same directory, with the
# same git-backup and the same arguments.
gitlab_user=$(gitlab-rake -e "exec 'whoami'")
me=$(whoami)
if [ "$me" != "$gitlab_user" ]; then
    exec gitlab-rake -e "Dir.chdir \"$PWD\"; exec \"GIT_BACKUP=$GIT_BACKUP $0 $*\""
fi


# the data we handle is potentially sensitive
# -> files we create should be accessible by current user only
umask 0077    # XXX maybe not good - e.g. git-data/repositories should (?) be rwxrwx---


# usage - print one-line command synopsis to stdout
usage() {
    printf '%s\n' "Usage: gitlab-backup [pull | restore <commit-ish>]"
}


# command line handling: first argument selects the action,
# anything unexpected ends with usage on stderr and exit status 1
if [ $# -lt 1 ]; then
    die "$(usage)"
fi
action="$1"
shift

case "$action" in
    pull)
        backup_pull
        ;;
    restore)
        # restore needs <commit-ish> to restore from
        if [ $# -lt 1 ]; then
            die "$(usage)"
        fi
        backup_restore "$1"
        ;;
    -h)
        usage
        exit 0
        ;;
    *)
        die "$(usage)"
        ;;
esac