Commit 90886008 authored by Douwe Maan's avatar Douwe Maan

Merge branch '30093-apply-bfg-object-map-to-database' into 'master'

Remove cleaned up OIDs from database and cache

Closes #30093

See merge request gitlab-org/gitlab-ce!26555
parents 0cc30909 8973f32d
......@@ -51,6 +51,10 @@ class MergeRequestDiff < ApplicationRecord
joins(:merge_request_diff_commits).where(merge_request_diff_commits: { sha: sha }).reorder(nil)
end
scope :by_project_id, -> (project_id) do
joins(:merge_request).where(merge_requests: { target_project_id: project_id })
end
scope :recent, -> { order(id: :desc).limit(100) }
scope :files_in_database, -> { where(stored_externally: [false, nil]) }
......
......@@ -7,6 +7,10 @@ class NoteDiffFile < ApplicationRecord
joins(:diff_note).where("resolved_at IS NULL OR noteable_type = 'Commit'")
end
scope :referencing_sha, -> (oids, project_id:) do
joins(:diff_note).where(notes: { project_id: project_id, commit_id: oids })
end
delegate :original_position, :project, to: :diff_note
belongs_to :diff_note, inverse_of: :note_diff_file
......
......@@ -18,9 +18,6 @@ module Projects
# per rewritten object, with the old and new SHAs space-separated. It can be
# used to update or remove content that references the objects that BFG has
# altered
#
# Currently, only the project repository is modified by this service, but we
# may wish to modify other data sources in the future.
def execute
apply_bfg_object_map!
......@@ -41,8 +38,50 @@ module Projects
raise NoUploadError unless project.bfg_object_map.exists?
project.bfg_object_map.open do |io|
repository_cleaner.apply_bfg_object_map(io)
repository_cleaner.apply_bfg_object_map_stream(io) do |response|
cleanup_diffs(response)
end
end
end
def cleanup_diffs(response)
old_commit_shas = extract_old_commit_shas(response.entries)
ActiveRecord::Base.transaction do
cleanup_merge_request_diffs(old_commit_shas)
cleanup_note_diff_files(old_commit_shas)
end
end
def extract_old_commit_shas(batch)
batch.lazy.select { |entry| entry.type == :COMMIT }.map(&:old_oid).force
end
def cleanup_merge_request_diffs(old_commit_shas)
merge_request_diffs = MergeRequestDiff
.by_project_id(project.id)
.by_commit_sha(old_commit_shas)
# It's important to run the ActiveRecord callbacks here
merge_request_diffs.destroy_all # rubocop:disable Cop/DestroyAll
# TODO: ensure the highlight cache is removed immediately. It's too hard
# to calculate the Redis keys at present.
#
# https://gitlab.com/gitlab-org/gitlab-ce/issues/61115
end
def cleanup_note_diff_files(old_commit_shas)
# Pluck the IDs instead of running the query twice to ensure we clear the
# cache for exactly the note diffs we remove
ids = NoteDiffFile
.referencing_sha(old_commit_shas, project_id: project.id)
.pluck_primary_key
NoteDiffFile.id_in(ids).delete_all
# A highlighted version of the diff is stored in redis. Remove it now.
Gitlab::DiscussionsDiff::HighlightCache.clear_multiple(ids)
end
def repository_cleaner
......
---
title: Remove cleaned up OIDs from database and cache
merge_request: 26555
author:
type: added
......@@ -98,6 +98,12 @@ up its own internal state, maximizing the space saved.
`git gc` against the repository. You will receive an email once it has
completed.
This process will remove some copies of the rewritten commits from GitLab's
cache and database, but there are still numerous gaps in coverage - at present,
some of the copies may persist indefinitely. [Clearing the instance cache]
(../../../administration/raketasks/maintenance.md#clear-redis-cache) may help to
remove some of them, but it should not be depended on for security purposes!
## Using `git filter-branch`
1. Navigate to your repository:
......
......@@ -52,6 +52,19 @@ module Gitlab
end
end
# Clears multiple cache keys at once.
#
# raw_keys - An Array of unique cache keys, without namespaces.
#
# It returns the number of cache keys cleared. Ex.: 42
def clear_multiple(raw_keys)
return [] if raw_keys.empty?
keys = raw_keys.map { |id| cache_key_for(id) }
Redis::Cache.with { |redis| redis.del(keys) }
end
def cache_key_for(raw_key)
"#{cache_key_prefix}:#{raw_key}"
end
......
......@@ -12,9 +12,9 @@ module Gitlab
@repository = repository
end
def apply_bfg_object_map(io)
def apply_bfg_object_map_stream(io, &blk)
wrapped_gitaly_errors do
gitaly_cleanup_client.apply_bfg_object_map(io)
gitaly_cleanup_client.apply_bfg_object_map_stream(io, &blk)
end
end
......
......@@ -12,25 +12,32 @@ module Gitlab
@storage = repository.storage
end
def apply_bfg_object_map(io)
first_request = Gitaly::ApplyBfgObjectMapRequest.new(repository: gitaly_repo)
def apply_bfg_object_map_stream(io, &blk)
responses = GitalyClient.call(
storage,
:cleanup_service,
:apply_bfg_object_map_stream,
build_object_map_enum(io),
timeout: GitalyClient.no_timeout
)
responses.each(&blk)
end
private
enum = Enumerator.new do |y|
y.yield first_request
def build_object_map_enum(io)
Enumerator.new do |y|
# First request. For simplicity, doesn't include any object map data
y << Gitaly::ApplyBfgObjectMapStreamRequest.new(repository: gitaly_repo)
# Now stream the BFG object map file to gitaly in chunks
while data = io.read(RepositoryService::MAX_MSG_SIZE)
y.yield Gitaly::ApplyBfgObjectMapRequest.new(object_map: data)
y << Gitaly::ApplyBfgObjectMapStreamRequest.new(object_map: data)
break if io&.eof?
end
end
GitalyClient.call(
storage,
:cleanup_service,
:apply_bfg_object_map,
enum,
timeout: GitalyClient.no_timeout
)
end
end
end
......
......@@ -3,31 +3,32 @@
require 'spec_helper'
describe Gitlab::DiscussionsDiff::HighlightCache, :clean_gitlab_redis_cache do
describe '#write_multiple' do
it 'sets multiple keys serializing content as JSON' do
mapping = {
3 => [
def fake_file(offset)
{
text: 'foo',
type: 'new',
index: 2,
old_pos: 10,
new_pos: 11,
index: 2 + offset,
old_pos: 10 + offset,
new_pos: 11 + offset,
line_code: 'xpto',
rich_text: '<blips>blops</blips>'
},
{
text: 'foo',
type: 'new',
index: 3,
old_pos: 11,
new_pos: 12,
line_code: 'xpto',
rich_text: '<blops>blips</blops>'
}
end
let(:mapping) do
{
3 => [
fake_file(0),
fake_file(1)
],
4 => [
fake_file(2)
]
}
end
describe '#write_multiple' do
it 'sets multiple keys serializing content as JSON' do
described_class.write_multiple(mapping)
mapping.each do |key, value|
......@@ -41,53 +42,16 @@ describe Gitlab::DiscussionsDiff::HighlightCache, :clean_gitlab_redis_cache do
describe '#read_multiple' do
it 'reads multiple keys and serializes content into Gitlab::Diff::Line objects' do
mapping = {
3 => [
{
text: 'foo',
type: 'new',
index: 2,
old_pos: 11,
new_pos: 12,
line_code: 'xpto',
rich_text: '<blips>blops</blips>'
},
{
text: 'foo',
type: 'new',
index: 3,
old_pos: 10,
new_pos: 11,
line_code: 'xpto',
rich_text: '<blips>blops</blips>'
}
]
}
described_class.write_multiple(mapping)
found = described_class.read_multiple(mapping.keys)
expect(found.size).to eq(1)
expect(found.size).to eq(2)
expect(found.first.size).to eq(2)
expect(found.first).to all(be_a(Gitlab::Diff::Line))
end
it 'returns nil when cached key is not found' do
mapping = {
3 => [
{
text: 'foo',
type: 'new',
index: 2,
old_pos: 11,
new_pos: 12,
line_code: 'xpto',
rich_text: '<blips>blops</blips>'
}
]
}
described_class.write_multiple(mapping)
found = described_class.read_multiple([2, 3])
......@@ -95,8 +59,30 @@ describe Gitlab::DiscussionsDiff::HighlightCache, :clean_gitlab_redis_cache do
expect(found.size).to eq(2)
expect(found.first).to eq(nil)
expect(found.second.size).to eq(1)
expect(found.second.size).to eq(2)
expect(found.second).to all(be_a(Gitlab::Diff::Line))
end
end
describe '#clear_multiple' do
it 'removes all named keys' do
described_class.write_multiple(mapping)
described_class.clear_multiple(mapping.keys)
expect(described_class.read_multiple(mapping.keys)).to all(be_nil)
end
it 'only removed named keys' do
to_clear, to_leave = mapping.keys
described_class.write_multiple(mapping)
described_class.clear_multiple([to_clear])
cleared, left = described_class.read_multiple([to_clear, to_leave])
expect(cleared).to be_nil
expect(left).to all(be_a(Gitlab::Diff::Line))
end
end
end
......@@ -6,32 +6,47 @@ describe Gitlab::Git::RepositoryCleaner do
let(:project) { create(:project, :repository) }
let(:repository) { project.repository }
let(:head_sha) { repository.head_commit.id }
let(:object_map_data) { "#{head_sha} #{'0' * 40}" }
let(:object_map_data) { "#{head_sha} #{Gitlab::Git::BLANK_SHA}" }
subject(:cleaner) { described_class.new(repository.raw) }
describe '#apply_bfg_object_map' do
let(:clean_refs) { %W[refs/environments/1 refs/merge-requests/1 refs/keep-around/#{head_sha}] }
let(:keep_refs) { %w[refs/heads/_keep refs/tags/_keep] }
subject(:cleaner) { described_class.new(repository.raw) }
shared_examples_for '#apply_bfg_object_map_stream' do
before do
(clean_refs + keep_refs).each { |ref| repository.create_ref(head_sha, ref) }
end
context 'from StringIO' do
let(:object_map) { StringIO.new(object_map_data) }
it 'removes internal references' do
cleaner.apply_bfg_object_map(object_map)
entries = []
cleaner.apply_bfg_object_map_stream(object_map) do |rsp|
entries.concat(rsp.entries)
end
aggregate_failures do
clean_refs.each { |ref| expect(repository.ref_exists?(ref)).to be_falsy }
keep_refs.each { |ref| expect(repository.ref_exists?(ref)).to be_truthy }
clean_refs.each { |ref| expect(repository.ref_exists?(ref)).to be(false) }
keep_refs.each { |ref| expect(repository.ref_exists?(ref)).to be(true) }
expect(entries).to contain_exactly(
Gitaly::ApplyBfgObjectMapStreamResponse::Entry.new(
type: :COMMIT,
old_oid: head_sha,
new_oid: Gitlab::Git::BLANK_SHA
)
)
end
end
end
context 'from Gitlab::HttpIO' do
describe '#apply_bfg_object_map_stream (from StringIO)' do
let(:object_map) { StringIO.new(object_map_data) }
include_examples '#apply_bfg_object_map_stream'
end
describe '#apply_bfg_object_map_stream (from Gitlab::HttpIO)' do
let(:url) { 'http://example.com/bfg_object_map.txt' }
let(:tempfile) { Tempfile.new }
let(:object_map) { Gitlab::HttpIO.new(url, object_map_data.size) }
......@@ -40,21 +55,13 @@ describe Gitlab::Git::RepositoryCleaner do
tempfile.write(object_map_data)
tempfile.close
stub_remote_url_200(url, tempfile.path)
example.run
ensure
tempfile.unlink
end
it 'removes internal references' do
stub_remote_url_200(url, tempfile.path)
cleaner.apply_bfg_object_map(object_map)
aggregate_failures do
clean_refs.each { |ref| expect(repository.ref_exists?(ref)).to be_falsy }
keep_refs.each { |ref| expect(repository.ref_exists?(ref)).to be_truthy }
end
end
end
include_examples '#apply_bfg_object_map_stream'
end
end
......@@ -6,14 +6,14 @@ describe Gitlab::GitalyClient::CleanupService do
let(:relative_path) { project.disk_path + '.git' }
let(:client) { described_class.new(project.repository) }
describe '#apply_bfg_object_map' do
it 'sends an apply_bfg_object_map message' do
describe '#apply_bfg_object_map_stream' do
it 'sends an apply_bfg_object_map_stream message' do
expect_any_instance_of(Gitaly::CleanupService::Stub)
.to receive(:apply_bfg_object_map)
.to receive(:apply_bfg_object_map_stream)
.with(kind_of(Enumerator), kind_of(Hash))
.and_return(double)
.and_return([])
client.apply_bfg_object_map(StringIO.new)
client.apply_bfg_object_map_stream(StringIO.new)
end
end
end
......@@ -10,4 +10,31 @@ describe NoteDiffFile do
describe 'validations' do
it { is_expected.to validate_presence_of(:diff_note) }
end
describe '.referencing_sha' do
let!(:diff_note) { create(:diff_note_on_commit) }
let(:note_diff_file) { diff_note.note_diff_file }
let(:project) { diff_note.project }
it 'finds note diff files by project and sha' do
found = described_class.referencing_sha(diff_note.commit_id, project_id: project.id)
expect(found).to contain_exactly(note_diff_file)
end
it 'excludes note diff files with the wrong project' do
other_project = create(:project)
found = described_class.referencing_sha(diff_note.commit_id, project_id: other_project.id)
expect(found).to be_empty
end
it 'excludes note diff files with the wrong sha' do
found = described_class.referencing_sha(Gitlab::Git::BLANK_SHA, project_id: project.id)
expect(found).to be_empty
end
end
end
......@@ -6,13 +6,13 @@ describe Projects::CleanupService do
let(:project) { create(:project, :repository, bfg_object_map: fixture_file_upload('spec/fixtures/bfg_object_map.txt')) }
let(:object_map) { project.bfg_object_map }
let(:cleaner) { service.__send__(:repository_cleaner) }
subject(:service) { described_class.new(project) }
describe '#execute' do
it 'runs the apply_bfg_object_map gitaly RPC' do
expect_next_instance_of(Gitlab::Git::RepositoryCleaner) do |cleaner|
expect(cleaner).to receive(:apply_bfg_object_map).with(kind_of(IO))
end
it 'runs the apply_bfg_object_map_stream gitaly RPC' do
expect(cleaner).to receive(:apply_bfg_object_map_stream).with(kind_of(IO))
service.execute
end
......@@ -37,10 +37,91 @@ describe Projects::CleanupService do
expect(object_map.exists?).to be_falsy
end
context 'with a tainted merge request diff' do
let(:merge_request) { create(:merge_request, source_project: project, target_project: project) }
let(:diff) { merge_request.merge_request_diff }
let(:entry) { build_entry(diff.commits.first.id) }
before do
allow(cleaner)
.to receive(:apply_bfg_object_map_stream)
.and_yield(Gitaly::ApplyBfgObjectMapStreamResponse.new(entries: [entry]))
end
it 'removes the tainted commit from the database' do
service.execute
expect(MergeRequestDiff.exists?(diff.id)).to be_falsy
end
it 'ignores non-commit responses from Gitaly' do
entry.type = :UNKNOWN
service.execute
expect(MergeRequestDiff.exists?(diff.id)).to be_truthy
end
end
context 'with a tainted diff note' do
let(:diff_note) { create(:diff_note_on_commit, project: project) }
let(:note_diff_file) { diff_note.note_diff_file }
let(:entry) { build_entry(diff_note.commit_id) }
let(:highlight_cache) { Gitlab::DiscussionsDiff::HighlightCache }
let(:cache_id) { note_diff_file.id }
before do
allow(cleaner)
.to receive(:apply_bfg_object_map_stream)
.and_yield(Gitaly::ApplyBfgObjectMapStreamResponse.new(entries: [entry]))
end
it 'removes the tainted commit from the database' do
service.execute
expect(NoteDiffFile.exists?(note_diff_file.id)).to be_falsy
end
it 'removes the highlight cache from redis' do
write_cache(highlight_cache, cache_id, [{}])
expect(read_cache(highlight_cache, cache_id)).not_to be_nil
service.execute
expect(read_cache(highlight_cache, cache_id)).to be_nil
end
it 'ignores non-commit responses from Gitaly' do
entry.type = :UNKNOWN
service.execute
expect(NoteDiffFile.exists?(note_diff_file.id)).to be_truthy
end
end
it 'raises an error if no object map can be found' do
object_map.remove!
expect { service.execute }.to raise_error(described_class::NoUploadError)
end
end
def build_entry(old_oid)
Gitaly::ApplyBfgObjectMapStreamResponse::Entry.new(
type: :COMMIT,
old_oid: old_oid,
new_oid: Gitlab::Git::BLANK_SHA
)
end
def read_cache(cache, key)
cache.read_multiple([key]).first
end
def write_cache(cache, key, value)
cache.write_multiple(key => value)
end
end
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment