Commit c1b4fc2c authored by Thong Kuah's avatar Thong Kuah

Merge branch 'concurrent_repo_backup' into 'master'

Add concurrency support for Git repository backups

See merge request gitlab-org/gitlab!37158
parents cf995b42 c2306c34
...@@ -651,6 +651,8 @@ class Project < ApplicationRecord ...@@ -651,6 +651,8 @@ class Project < ApplicationRecord
scope :joins_import_state, -> { joins("INNER JOIN project_mirror_data import_state ON import_state.project_id = projects.id") } scope :joins_import_state, -> { joins("INNER JOIN project_mirror_data import_state ON import_state.project_id = projects.id") }
scope :for_group, -> (group) { where(group: group) } scope :for_group, -> (group) { where(group: group) }
scope :for_group_and_its_subgroups, ->(group) { where(namespace_id: group.self_and_descendants.select(:id)) } scope :for_group_and_its_subgroups, ->(group) { where(namespace_id: group.self_and_descendants.select(:id)) }
# Projects whose repository_storage column equals the given storage name.
scope :for_repository_storage, -> (repository_storage) { where(repository_storage: repository_storage) }
# Projects whose repository_storage does not match the given value. The
# repository backup passes the list of configured storage names here to detect
# projects that live on a storage missing from the configuration.
scope :excluding_repository_storage, -> (repository_storage) { where.not(repository_storage: repository_storage) }
class << self class << self
# Searches for a list of projects based on the query given in `query`. # Searches for a list of projects based on the query given in `query`.
......
---
title: Add concurrency support for Git repository backups
merge_request: 37158
author:
type: changed
...@@ -295,6 +295,30 @@ For installations from source: ...@@ -295,6 +295,30 @@ For installations from source:
sudo -u git -H bundle exec rake gitlab:backup:create SKIP=tar RAILS_ENV=production sudo -u git -H bundle exec rake gitlab:backup:create SKIP=tar RAILS_ENV=production
``` ```
#### Back up Git repositories concurrently
> [Introduced](https://gitlab.com/gitlab-org/gitlab/-/merge_requests/37158) in GitLab 13.3.
Repositories can be backed up concurrently to help fully utilise CPU time. The following variables
are available to modify the default behavior of the Rake task:
- `GITLAB_BACKUP_MAX_CONCURRENCY` sets the maximum number of projects to back up at the same time.
Defaults to 1.
- `GITLAB_BACKUP_MAX_STORAGE_CONCURRENCY` sets the maximum number of projects to back up at the same time on each storage. This allows the repository backups to be spread across storages.
Defaults to 1.
For example, for Omnibus GitLab installations:
```shell
sudo gitlab-backup create GITLAB_BACKUP_MAX_CONCURRENCY=4 GITLAB_BACKUP_MAX_STORAGE_CONCURRENCY=1
```
For example, for installations from source:
```shell
sudo -u git -H bundle exec rake gitlab:backup:create GITLAB_BACKUP_MAX_CONCURRENCY=4 GITLAB_BACKUP_MAX_STORAGE_CONCURRENCY=1
```
#### Uploading backups to a remote (cloud) storage #### Uploading backups to a remote (cloud) storage
Starting with GitLab 7.4 you can let the backup script upload the `.tar` file it creates. Starting with GitLab 7.4 you can let the backup script upload the `.tar` file it creates.
......
...@@ -10,34 +10,31 @@ module Backup ...@@ -10,34 +10,31 @@ module Backup
@progress = progress @progress = progress
end end
def dump def dump(max_concurrency:, max_storage_concurrency:)
prepare prepare
Project.find_each(batch_size: 1000) do |project| if max_concurrency <= 1 && max_storage_concurrency <= 1
progress.print " * #{display_repo_path(project)} ... " return dump_consecutive
end
if project.hashed_storage?(:repository)
FileUtils.mkdir_p(File.dirname(File.join(backup_repos_path, project.disk_path)))
else
FileUtils.mkdir_p(File.join(backup_repos_path, project.namespace.full_path)) if project.namespace
end
if !empty_repo?(project) if Project.excluding_repository_storage(Gitlab.config.repositories.storages.keys).exists?
backup_project(project) raise Error, 'repositories.storages in gitlab.yml is misconfigured'
progress.puts "[DONE]".color(:green) end
else
progress.puts "[SKIPPED]".color(:cyan)
end
wiki = ProjectWiki.new(project) semaphore = Concurrent::Semaphore.new(max_concurrency)
errors = Queue.new
if !empty_repo?(wiki) threads = Gitlab.config.repositories.storages.keys.map do |storage|
backup_project(wiki) Thread.new do
progress.puts "[DONE] Wiki".color(:green) dump_storage(storage, semaphore, max_storage_concurrency: max_storage_concurrency)
else rescue => e
progress.puts "[SKIPPED] Wiki".color(:cyan) errors << e
end end
end end
threads.each(&:join)
raise errors.pop unless errors.empty?
end end
def backup_project(project) def backup_project(project)
...@@ -146,6 +143,71 @@ module Backup ...@@ -146,6 +143,71 @@ module Backup
private private
# Backs up every project one after another on the calling thread, loading
# projects from the database in batches of 1000.
def dump_consecutive
  Project.find_each(batch_size: 1000) { |record| dump_project(record) }
end
# Backs up every project on one repository storage with a pool of worker threads.
#
# The project query feeds a one-slot queue that +max_storage_concurrency+
# workers drain; each worker wraps a single dump_project call in
# semaphore.acquire / semaphore.release.
#
# storage                 - storage name handed to Project.for_repository_storage
# semaphore               - object responding to #acquire and #release
# max_storage_concurrency - number of worker threads to start for this storage
#
# Raises the first error recorded by a worker, or whatever the project query
# itself raises. In both cases the queue is closed and all workers are joined
# before the error leaves this method.
def dump_storage(storage, semaphore, max_storage_concurrency:)
  errors = Queue.new
  queue = SizedQueue.new(1)

  threads = Array.new(max_storage_concurrency) do
    Thread.new do
      while (project = queue.pop)
        semaphore.acquire

        begin
          dump_project(project)
        rescue => e
          errors << e
          # Closing wakes a producer blocked in queue.push (it receives
          # ClosedQueueError), so it cannot wait forever on a full queue once
          # the workers have stopped consuming.
          queue.close
          break
        ensure
          semaphore.release
        end
      end
    end
  end

  begin
    Project.for_repository_storage(storage).find_each(batch_size: 100) do |project|
      # Stop feeding work as soon as any worker has reported a failure.
      break unless errors.empty?

      queue.push(project)
    end
  rescue ClosedQueueError
    # A worker failed and closed the queue; its error is raised below.
  ensure
    # Runs even when the query raises, so workers idling in queue.pop are
    # released and joined instead of being left blocked.
    queue.close
    threads.each(&:join)
  end

  raise errors.pop unless errors.empty?
end
# Backs up one project's repository and then its wiki, writing a progress line
# before starting and one status line for each of the two repositories.
def dump_project(project)
  progress.puts " * #{display_repo_path(project)} ... "

  # Hashed storage needs the parent directory of the disk path; otherwise the
  # namespace directory is created when the project has a namespace.
  target_dir =
    if project.hashed_storage?(:repository)
      File.dirname(File.join(backup_repos_path, project.disk_path))
    elsif project.namespace
      File.join(backup_repos_path, project.namespace.full_path)
    end
  FileUtils.mkdir_p(target_dir) if target_dir

  if empty_repo?(project)
    progress.puts " * #{display_repo_path(project)} ... #{"[SKIPPED]".color(:cyan)}"
  else
    backup_project(project)
    progress.puts " * #{display_repo_path(project)} ... #{"[DONE]".color(:green)}"
  end

  wiki = ProjectWiki.new(project)

  if empty_repo?(wiki)
    progress.puts " * #{display_repo_path(project)} ... #{"[SKIPPED] Wiki".color(:cyan)}"
  else
    backup_project(wiki)
    progress.puts " * #{display_repo_path(project)} ... #{"[DONE] Wiki".color(:green)}"
  end
end
def progress_warn(project, cmd, output) def progress_warn(project, cmd, output)
progress.puts "[WARNING] Executing #{cmd}".color(:orange) progress.puts "[WARNING] Executing #{cmd}".color(:orange)
progress.puts "Ignoring error on #{display_repo_path(project)} - #{output}".color(:orange) progress.puts "Ignoring error on #{display_repo_path(project)} - #{output}".color(:orange)
......
...@@ -93,10 +93,19 @@ namespace :gitlab do ...@@ -93,10 +93,19 @@ namespace :gitlab do
task create: :gitlab_environment do task create: :gitlab_environment do
puts_time "Dumping repositories ...".color(:blue) puts_time "Dumping repositories ...".color(:blue)
max_concurrency = ENV.fetch('GITLAB_BACKUP_MAX_CONCURRENCY', 1).to_i
max_storage_concurrency = ENV.fetch('GITLAB_BACKUP_MAX_STORAGE_CONCURRENCY', 1).to_i
if ENV["SKIP"] && ENV["SKIP"].include?("repositories") if ENV["SKIP"] && ENV["SKIP"].include?("repositories")
puts_time "[SKIPPED]".color(:cyan) puts_time "[SKIPPED]".color(:cyan)
elsif max_concurrency < 1 || max_storage_concurrency < 1
puts "GITLAB_BACKUP_MAX_CONCURRENCY and GITLAB_BACKUP_MAX_STORAGE_CONCURRENCY must have a value of at least 1".color(:red)
exit 1
else else
Backup::Repository.new(progress).dump Backup::Repository.new(progress).dump(
max_concurrency: max_concurrency,
max_storage_concurrency: max_storage_concurrency
)
puts_time "done".color(:green) puts_time "done".color(:green)
end end
end end
......
...@@ -3,8 +3,9 @@ ...@@ -3,8 +3,9 @@
require 'spec_helper' require 'spec_helper'
RSpec.describe Backup::Repository do RSpec.describe Backup::Repository do
let_it_be(:project) { create(:project, :wiki_repo) }
let(:progress) { StringIO.new } let(:progress) { StringIO.new }
let!(:project) { create(:project, :wiki_repo) }
subject { described_class.new(progress) } subject { described_class.new(progress) }
...@@ -19,13 +20,88 @@ RSpec.describe Backup::Repository do ...@@ -19,13 +20,88 @@ RSpec.describe Backup::Repository do
end end
describe '#dump' do describe '#dump' do
describe 'repo failure' do before do
before do allow(Gitlab.config.repositories.storages).to receive(:keys).and_return(storage_keys)
allow(Gitlab::Popen).to receive(:popen).and_return(['normal output', 0]) end
let_it_be(:projects) { create_list(:project, 5, :wiki_repo) + [project] }
let(:storage_keys) { %w[default test_second_storage] }
context 'no concurrency' do
it 'creates the expected number of threads' do
expect(Thread).not_to receive(:new)
projects.each do |project|
expect(subject).to receive(:dump_project).with(project).and_call_original
end
subject.dump(max_concurrency: 1, max_storage_concurrency: 1)
end end
it 'does not raise error' do describe 'command failure' do
expect { subject.dump }.not_to raise_error it 'dump_project raises an error' do
allow(subject).to receive(:dump_project).and_raise(IOError)
expect { subject.dump(max_concurrency: 1, max_storage_concurrency: 1) }.to raise_error(IOError)
end
it 'project query raises an error' do
allow(Project).to receive(:find_each).and_raise(ActiveRecord::StatementTimeout)
expect { subject.dump(max_concurrency: 1, max_storage_concurrency: 1) }.to raise_error(ActiveRecord::StatementTimeout)
end
end
end
[4, 10].each do |max_storage_concurrency|
context "max_storage_concurrency #{max_storage_concurrency}" do
it 'creates the expected number of threads' do
expect(Thread).to receive(:new)
.exactly(storage_keys.length * (max_storage_concurrency + 1)).times
.and_call_original
projects.each do |project|
expect(subject).to receive(:dump_project).with(project).and_call_original
end
subject.dump(max_concurrency: 1, max_storage_concurrency: max_storage_concurrency)
end
it 'creates the expected number of threads with extra max concurrency' do
expect(Thread).to receive(:new)
.exactly(storage_keys.length * (max_storage_concurrency + 1)).times
.and_call_original
projects.each do |project|
expect(subject).to receive(:dump_project).with(project).and_call_original
end
subject.dump(max_concurrency: 3, max_storage_concurrency: max_storage_concurrency)
end
describe 'command failure' do
it 'dump_project raises an error' do
allow(subject).to receive(:dump_project)
.and_raise(IOError)
expect { subject.dump(max_concurrency: 1, max_storage_concurrency: max_storage_concurrency) }.to raise_error(IOError)
end
it 'project query raises an error' do
allow(Project).to receive_message_chain('for_repository_storage.find_each').and_raise(ActiveRecord::StatementTimeout)
expect { subject.dump(max_concurrency: 1, max_storage_concurrency: max_storage_concurrency) }.to raise_error(ActiveRecord::StatementTimeout)
end
context 'misconfigured storages' do
let(:storage_keys) { %w[test_second_storage] }
it 'raises an error' do
expect { subject.dump(max_concurrency: 1, max_storage_concurrency: max_storage_concurrency) }.to raise_error(Backup::Error, 'repositories.storages in gitlab.yml is misconfigured')
end
end
end
end end
end end
end end
......
...@@ -5552,6 +5552,32 @@ RSpec.describe Project do ...@@ -5552,6 +5552,32 @@ RSpec.describe Project do
end end
end end
# Verifies that the scope keeps only projects stored on the requested storage.
describe '.for_repository_storage' do
it 'returns the projects for a given repository storage' do
# Register a second storage so a project can be created on it.
stub_storage_settings('test_second_storage' => {
'path' => TestEnv::SECOND_STORAGE_PATH,
'gitaly_address' => Gitlab.config.repositories.storages.default.gitaly_address
})
expected_project = create(:project, repository_storage: 'default')
# Project on the other storage; it must not appear in the result.
create(:project, repository_storage: 'test_second_storage')
expect(described_class.for_repository_storage('default')).to eq([expected_project])
end
end
# Verifies that the scope drops projects stored on the given storage.
describe '.excluding_repository_storage' do
it 'returns the projects excluding the given repository storage' do
# Register a second storage so a project can be created on it.
stub_storage_settings('test_second_storage' => {
'path' => TestEnv::SECOND_STORAGE_PATH,
'gitaly_address' => Gitlab.config.repositories.storages.default.gitaly_address
})
expected_project = create(:project, repository_storage: 'test_second_storage')
# Project on the excluded storage; it must not appear in the result.
create(:project, repository_storage: 'default')
expect(described_class.excluding_repository_storage('default')).to eq([expected_project])
end
end
describe '.deployments' do describe '.deployments' do
subject { project.deployments } subject { project.deployments }
......
...@@ -283,20 +283,7 @@ RSpec.describe 'gitlab:app namespace rake task', :delete do ...@@ -283,20 +283,7 @@ RSpec.describe 'gitlab:app namespace rake task', :delete do
end end
context 'multiple repository storages' do context 'multiple repository storages' do
let(:test_second_storage) do let_it_be(:default_storage_hash) { Gitlab.config.repositories.storages.default.to_h }
Gitlab::GitalyClient::StorageSettings.new(@default_storage_hash.merge('path' => 'tmp/tests/custom_storage'))
end
let(:storages) do
{
'default' => Gitlab.config.repositories.storages.default,
'test_second_storage' => test_second_storage
}
end
before(:all) do
@default_storage_hash = Gitlab.config.repositories.storages.default.to_h
end
before do before do
# We only need a backup of the repositories for this test # We only need a backup of the repositories for this test
...@@ -307,17 +294,6 @@ RSpec.describe 'gitlab:app namespace rake task', :delete do ...@@ -307,17 +294,6 @@ RSpec.describe 'gitlab:app namespace rake task', :delete do
# Avoid asking gitaly about the root ref (which will fail because of the # Avoid asking gitaly about the root ref (which will fail because of the
# mocked storages) # mocked storages)
allow_any_instance_of(Repository).to receive(:empty?).and_return(false) allow_any_instance_of(Repository).to receive(:empty?).and_return(false)
end
after do
FileUtils.rm_rf(Settings.absolute('tmp/tests/custom_storage'))
end
it 'includes repositories in all repository storages' do
project_a = create(:project, :repository)
project_b = create(:project, :repository, repository_storage: 'test_second_storage')
b_storage_dir = File.join(Settings.absolute('tmp/tests/custom_storage'), File.dirname(project_b.disk_path))
FileUtils.mkdir_p(b_storage_dir) FileUtils.mkdir_p(b_storage_dir)
...@@ -328,16 +304,91 @@ RSpec.describe 'gitlab:app namespace rake task', :delete do ...@@ -328,16 +304,91 @@ RSpec.describe 'gitlab:app namespace rake task', :delete do
Rails.root.join(storages['test_second_storage'].legacy_disk_path, project_b.repository.disk_path + '.git') Rails.root.join(storages['test_second_storage'].legacy_disk_path, project_b.repository.disk_path + '.git')
) )
end end
end
after do
FileUtils.rm_rf(test_second_storage_dir)
end
let(:test_second_storage_dir) { Dir.mktmpdir }
let(:test_second_storage) do
Gitlab::GitalyClient::StorageSettings.new(default_storage_hash.merge('path' => test_second_storage_dir))
end
let(:storages) do
{
'default' => Gitlab.config.repositories.storages.default,
'test_second_storage' => test_second_storage
}
end
let!(:project_a) { create(:project, :repository) }
let!(:project_b) { create(:project, :repository, repository_storage: 'test_second_storage') }
let!(:b_storage_dir) { File.join(test_second_storage_dir, File.dirname(project_b.disk_path)) }
# Runs the backup task with no concurrency variable stubbed in this context.
context 'no concurrency' do
it 'includes repositories in all repository storages' do
expect { run_rake_task('gitlab:backup:create') }.to output.to_stdout
# List the repositories directory of the generated tar archive.
tar_contents, exit_status = Gitlab::Popen.popen(
%W{tar -tvf #{backup_tar} repositories}
)
expect(exit_status).to eq(0)
# Bundles for the projects on both storages must be in the archive.
expect(tar_contents).to match("repositories/#{project_a.disk_path}.bundle")
expect(tar_contents).to match("repositories/#{project_b.disk_path}.bundle")
end
end
# Same archive check as the non-concurrent case, with
# GITLAB_BACKUP_MAX_CONCURRENCY stubbed to 4.
context 'with concurrency' do
before do
stub_env('GITLAB_BACKUP_MAX_CONCURRENCY', 4)
end
it 'includes repositories in all repository storages' do
expect { run_rake_task('gitlab:backup:create') }.to output.to_stdout
# List the repositories directory of the generated tar archive.
tar_contents, exit_status = Gitlab::Popen.popen(
%W{tar -tvf #{backup_tar} repositories}
)
expect(exit_status).to eq(0)
# Bundles for the projects on both storages must be in the archive.
expect(tar_contents).to match("repositories/#{project_a.disk_path}.bundle")
expect(tar_contents).to match("repositories/#{project_b.disk_path}.bundle")
end
end
end
context 'concurrency settings' do
before do
# We only need a backup of the repositories for this test
stub_env('SKIP', 'db,uploads,builds,artifacts,lfs,registry')
create(:project, :repository)
end
it 'has defaults' do
expect_next_instance_of(::Backup::Repository) do |instance|
expect(instance).to receive(:dump)
.with(max_concurrency: 1, max_storage_concurrency: 1)
.and_call_original
end
expect { run_rake_task('gitlab:backup:create') }.to output.to_stdout expect { run_rake_task('gitlab:backup:create') }.to output.to_stdout
end
tar_contents, exit_status = Gitlab::Popen.popen( it 'passes through concurrency environment variables' do
%W{tar -tvf #{backup_tar} repositories} stub_env('GITLAB_BACKUP_MAX_CONCURRENCY', 5)
) stub_env('GITLAB_BACKUP_MAX_STORAGE_CONCURRENCY', 2)
expect(exit_status).to eq(0) expect_next_instance_of(::Backup::Repository) do |instance|
expect(tar_contents).to match("repositories/#{project_a.disk_path}.bundle") expect(instance).to receive(:dump)
expect(tar_contents).to match("repositories/#{project_b.disk_path}.bundle") .with(max_concurrency: 5, max_storage_concurrency: 2)
.and_call_original
end
expect { run_rake_task('gitlab:backup:create') }.to output.to_stdout
end end
end end
end # backup_create task end # backup_create task
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment