Commit 7b27e4cb authored by Stan Hu's avatar Stan Hu

Merge branch 'bvl-robust-gpg-homedir-cleanup' into 'master'

Improve cleanup of gpg-homedirs

See merge request gitlab-org/gitlab!19311
parents b5a3f98d 7ac392b4
---
title: Improve handling of gpg-agent processes
merge_request: 19311
author:
type: changed
......@@ -4,6 +4,10 @@ module Gitlab
module Gpg
extend self
CleanupError = Class.new(StandardError)
BG_CLEANUP_RUNTIME_S = 2
FG_CLEANUP_RUNTIME_S = 0.5
MUTEX = Mutex.new
module CurrentKeyChain
......@@ -94,16 +98,55 @@ module Gitlab
previous_dir = current_home_dir
tmp_dir = Dir.mktmpdir
GPGME::Engine.home_dir = tmp_dir
tmp_keychains_created.increment
yield
ensure
# Ignore any errors when removing the tmp directory, as we may run into a
GPGME::Engine.home_dir = previous_dir
begin
cleanup_tmp_dir(tmp_dir)
rescue CleanupError => e
# This means we left a GPG-agent process hanging. Logging the problem in
# sentry will make this more visible.
Gitlab::Sentry.track_exception(e,
issue_url: 'https://gitlab.com/gitlab-org/gitlab/issues/20918',
extra: { tmp_dir: tmp_dir })
end
tmp_keychains_removed.increment unless File.exist?(tmp_dir)
end
def cleanup_tmp_dir(tmp_dir)
return FileUtils.remove_entry(tmp_dir, true) if Feature.disabled?(:gpg_cleanup_retries)
# Retry when removing the tmp directory failed, as we may run into a
# race condition:
# The `gpg-agent` agent process may clean up some files as well while
# `FileUtils.remove_entry` is iterating the directory and removing all
# its contained files and directories recursively, which could raise an
# error.
FileUtils.remove_entry(tmp_dir, true)
GPGME::Engine.home_dir = previous_dir
# Failing to remove the tmp directory could leave the `gpg-agent` process
# running forever.
Retriable.retriable(max_elapsed_time: cleanup_time, base_interval: 0.1) do
FileUtils.remove_entry(tmp_dir) if File.exist?(tmp_dir)
end
rescue => e
raise CleanupError, e
end
def cleanup_time
Sidekiq.server? ? BG_CLEANUP_RUNTIME_S : FG_CLEANUP_RUNTIME_S
end
def tmp_keychains_created
@tmp_keychains_created ||= Gitlab::Metrics.counter(:gpg_tmp_keychains_created_total,
'The number of temporary GPG keychains created')
end
def tmp_keychains_removed
@tmp_keychains_removed ||= Gitlab::Metrics.counter(:gpg_tmp_keychains_removed_total,
'The number of temporary GPG keychains removed')
end
end
end
......@@ -139,6 +139,96 @@ describe Gitlab::Gpg do
end
end.not_to raise_error
end
it 'keeps track of created and removed keychains in counters' do
created = Gitlab::Metrics.counter(:gpg_tmp_keychains_created_total, 'The number of temporary GPG keychains')
removed = Gitlab::Metrics.counter(:gpg_tmp_keychains_removed_total, 'The number of temporary GPG keychains')
initial_created = created.get
initial_removed = removed.get
described_class.using_tmp_keychain do
expect(created.get).to eq(initial_created + 1)
expect(removed.get).to eq(initial_removed)
end
expect(removed.get).to eq(initial_removed + 1)
end
it 'cleans up the tmp directory after finishing' do
tmp_directory = nil
described_class.using_tmp_keychain do
tmp_directory = described_class.current_home_dir
expect(File.exist?(tmp_directory)).to be true
end
expect(tmp_directory).not_to be_nil
expect(File.exist?(tmp_directory)).to be false
end
it 'does not fail if the homedir was deleted while running' do
expect do
described_class.using_tmp_keychain do
FileUtils.remove_entry(described_class.current_home_dir)
end
end.not_to raise_error
end
shared_examples 'multiple deletion attempts of the tmp-dir' do |seconds|
let(:tmp_dir) do
tmp_dir = Dir.mktmpdir
allow(Dir).to receive(:mktmpdir).and_return(tmp_dir)
tmp_dir
end
before do
# Stub all the other calls for `remove_entry`
allow(FileUtils).to receive(:remove_entry).with(any_args).and_call_original
end
it "tries for #{seconds}" do
expect(Retriable).to receive(:retriable).with(a_hash_including(max_elapsed_time: seconds))
described_class.using_tmp_keychain {}
end
it 'tries at least 2 times to remove the tmp dir before raising', :aggregate_failures do
expect(Retriable).to receive(:sleep).at_least(2).times
expect(FileUtils).to receive(:remove_entry).with(tmp_dir).at_least(2).times.and_raise('Deletion failed')
expect { described_class.using_tmp_keychain { } }.to raise_error(described_class::CleanupError)
end
it 'does not attempt multiple times when the deletion succeeds' do
expect(Retriable).to receive(:sleep).once
expect(FileUtils).to receive(:remove_entry).with(tmp_dir).once.and_raise('Deletion failed')
expect(FileUtils).to receive(:remove_entry).with(tmp_dir).and_call_original
expect { described_class.using_tmp_keychain { } }.not_to raise_error
expect(File.exist?(tmp_dir)).to be false
end
it 'does not retry when the feature flag is disabled' do
stub_feature_flags(gpg_cleanup_retries: false)
expect(FileUtils).to receive(:remove_entry).with(tmp_dir, true).and_call_original
expect(Retriable).not_to receive(:retriable)
described_class.using_tmp_keychain {}
end
end
it_behaves_like 'multiple deletion attempts of the tmp-dir', described_class::FG_CLEANUP_RUNTIME_S
context 'when running in Sidekiq' do
before do
allow(Sidekiq).to receive(:server?).and_return(true)
end
it_behaves_like 'multiple deletion attempts of the tmp-dir', described_class::BG_CLEANUP_RUNTIME_S
end
end
end
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment