Commit 7ac392b4 authored by Bob Van Landuyt's avatar Bob Van Landuyt Committed by Stan Hu

Improve cleanup of gpg-homedirs

The `gpg-agent` that could have been spawned here dies when it sees
it's socket disappear.

However, sometimes it seems like we fail to delete the homedir,
causing the `gpg-agent` to live on forever. We've noticed that the
deletion failed in
http://gitlab.com/gitlab-org/gitlab-foss/issues/36998: there was a
race condition during the deletion where `gpg-agent` would still be
modifying files while we've already called `FileUtils.remove_entry`.

This will attempt to delete the directory multiple times, at least 0.1
seconds apart. This is a naive way of trying to make sure
we clean up the homedir and count on `gpg-agent` to see that and make
itself go away.

On a web node we'll attempt for at most 0.5 seconds to clean up the
directory before failing. In a sidekiq process we'll attempt the
deletion for up to 2 seconds.

When the cleanup fails, we will now track that exception in
Sentry to gain some visibility.

This also adds counters for the creation and deletion of tmp
keychains, which we should be able to correlate to the nubmer of
zombie `gpg-agent` processes.
parent b5a3f98d
---
title: Improve handling of gpg-agent processes
merge_request: 19311
author:
type: changed
......@@ -4,6 +4,10 @@ module Gitlab
module Gpg
extend self
CleanupError = Class.new(StandardError)
BG_CLEANUP_RUNTIME_S = 2
FG_CLEANUP_RUNTIME_S = 0.5
MUTEX = Mutex.new
module CurrentKeyChain
......@@ -94,16 +98,55 @@ module Gitlab
previous_dir = current_home_dir
tmp_dir = Dir.mktmpdir
GPGME::Engine.home_dir = tmp_dir
tmp_keychains_created.increment
yield
ensure
# Ignore any errors when removing the tmp directory, as we may run into a
GPGME::Engine.home_dir = previous_dir
begin
cleanup_tmp_dir(tmp_dir)
rescue CleanupError => e
# This means we left a GPG-agent process hanging. Logging the problem in
# sentry will make this more visible.
Gitlab::Sentry.track_exception(e,
issue_url: 'https://gitlab.com/gitlab-org/gitlab/issues/20918',
extra: { tmp_dir: tmp_dir })
end
tmp_keychains_removed.increment unless File.exist?(tmp_dir)
end
def cleanup_tmp_dir(tmp_dir)
return FileUtils.remove_entry(tmp_dir, true) if Feature.disabled?(:gpg_cleanup_retries)
# Retry when removing the tmp directory failed, as we may run into a
# race condition:
# The `gpg-agent` agent process may clean up some files as well while
# `FileUtils.remove_entry` is iterating the directory and removing all
# its contained files and directories recursively, which could raise an
# error.
FileUtils.remove_entry(tmp_dir, true)
GPGME::Engine.home_dir = previous_dir
# Failing to remove the tmp directory could leave the `gpg-agent` process
# running forever.
Retriable.retriable(max_elapsed_time: cleanup_time, base_interval: 0.1) do
FileUtils.remove_entry(tmp_dir) if File.exist?(tmp_dir)
end
rescue => e
raise CleanupError, e
end
def cleanup_time
Sidekiq.server? ? BG_CLEANUP_RUNTIME_S : FG_CLEANUP_RUNTIME_S
end
def tmp_keychains_created
@tmp_keychains_created ||= Gitlab::Metrics.counter(:gpg_tmp_keychains_created_total,
'The number of temporary GPG keychains created')
end
def tmp_keychains_removed
@tmp_keychains_removed ||= Gitlab::Metrics.counter(:gpg_tmp_keychains_removed_total,
'The number of temporary GPG keychains removed')
end
end
end
......@@ -139,6 +139,96 @@ describe Gitlab::Gpg do
end
end.not_to raise_error
end
it 'keeps track of created and removed keychains in counters' do
created = Gitlab::Metrics.counter(:gpg_tmp_keychains_created_total, 'The number of temporary GPG keychains')
removed = Gitlab::Metrics.counter(:gpg_tmp_keychains_removed_total, 'The number of temporary GPG keychains')
initial_created = created.get
initial_removed = removed.get
described_class.using_tmp_keychain do
expect(created.get).to eq(initial_created + 1)
expect(removed.get).to eq(initial_removed)
end
expect(removed.get).to eq(initial_removed + 1)
end
it 'cleans up the tmp directory after finishing' do
tmp_directory = nil
described_class.using_tmp_keychain do
tmp_directory = described_class.current_home_dir
expect(File.exist?(tmp_directory)).to be true
end
expect(tmp_directory).not_to be_nil
expect(File.exist?(tmp_directory)).to be false
end
it 'does not fail if the homedir was deleted while running' do
expect do
described_class.using_tmp_keychain do
FileUtils.remove_entry(described_class.current_home_dir)
end
end.not_to raise_error
end
shared_examples 'multiple deletion attempts of the tmp-dir' do |seconds|
let(:tmp_dir) do
tmp_dir = Dir.mktmpdir
allow(Dir).to receive(:mktmpdir).and_return(tmp_dir)
tmp_dir
end
before do
# Stub all the other calls for `remove_entry`
allow(FileUtils).to receive(:remove_entry).with(any_args).and_call_original
end
it "tries for #{seconds}" do
expect(Retriable).to receive(:retriable).with(a_hash_including(max_elapsed_time: seconds))
described_class.using_tmp_keychain {}
end
it 'tries at least 2 times to remove the tmp dir before raising', :aggregate_failures do
expect(Retriable).to receive(:sleep).at_least(2).times
expect(FileUtils).to receive(:remove_entry).with(tmp_dir).at_least(2).times.and_raise('Deletion failed')
expect { described_class.using_tmp_keychain { } }.to raise_error(described_class::CleanupError)
end
it 'does not attempt multiple times when the deletion succeeds' do
expect(Retriable).to receive(:sleep).once
expect(FileUtils).to receive(:remove_entry).with(tmp_dir).once.and_raise('Deletion failed')
expect(FileUtils).to receive(:remove_entry).with(tmp_dir).and_call_original
expect { described_class.using_tmp_keychain { } }.not_to raise_error
expect(File.exist?(tmp_dir)).to be false
end
it 'does not retry when the feature flag is disabled' do
stub_feature_flags(gpg_cleanup_retries: false)
expect(FileUtils).to receive(:remove_entry).with(tmp_dir, true).and_call_original
expect(Retriable).not_to receive(:retriable)
described_class.using_tmp_keychain {}
end
end
it_behaves_like 'multiple deletion attempts of the tmp-dir', described_class::FG_CLEANUP_RUNTIME_S
context 'when running in Sidekiq' do
before do
allow(Sidekiq).to receive(:server?).and_return(true)
end
it_behaves_like 'multiple deletion attempts of the tmp-dir', described_class::BG_CLEANUP_RUNTIME_S
end
end
end
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment