Commit 6b426001 authored by Matthias Kaeppler

De-dup project tree entries in GL imports

This introduces ProjectTreeProcessor, which de-duplicates elements in
the project tree hash for better memory efficiency during imports.

As the measurements show, the optimization only pays off above a certain
project size, which is why it is only enabled for project export files
larger than 500 MB.
parent 6d92cec2
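The memory saving comes from collapsing equal but distinct Ruby objects into a single shared instance. A minimal standalone sketch of the idea (illustrative only; the JSON snippet and `seen` cache are invented, not code from this commit):

require 'json'

# Parsing equal JSON subtrees yields equal but distinct Ruby objects.
a = JSON.parse('{"author": {"name": "alice"}}')
b = JSON.parse('{"author": {"name": "alice"}}')

a['author'] == b['author']      # => true  (equal values)
a['author'].equal?(b['author']) # => false (two copies in memory)

# De-duplication keeps the first occurrence as the canonical copy and
# points every later parent at it, much like the processor below:
seen = {}
[a, b].each do |parent|
  node = parent['author']
  seen[node] ||= node
  parent['author'] = seen[node]
end

a['author'].equal?(b['author']) # => true (one copy retained)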
# frozen_string_literal: true

module Gitlab
  module ImportExport
    # this is mostly useful for testing and comparing results between
    # processed and unprocessed project trees without changing the
    # structure of the caller
    class IdentityProjectTreeProcessor
      def process(tree_hash)
        tree_hash
      end
    end

    # optimizes the project tree for memory efficiency by deduplicating entries
    class ProjectTreeProcessor
      LARGE_PROJECT_FILE_SIZE_BYTES = 500.megabyte

      class << self
        # some optimizations only yield amortized gains above a certain
        # project size, see https://gitlab.com/gitlab-org/gitlab/issues/27070
        def new_for_file(project_json_path)
          if Feature.enabled?(:dedup_project_import_metadata) && large_project?(project_json_path)
            ProjectTreeProcessor.new
          else
            IdentityProjectTreeProcessor.new
          end
        end

        private

        def large_project?(project_json_path)
          File.size(project_json_path) >= LARGE_PROJECT_FILE_SIZE_BYTES
        end
      end

      def process(tree_hash)
        dedup_tree(tree_hash)
      end

      private

      # This function removes duplicate entries from the given tree recursively
      # by caching nodes it encounters repeatedly. We only consider nodes for
      # which there can actually be multiple equivalent instances (e.g. strings,
      # hashes and arrays, but not `nil`s, numbers or booleans.)
      #
      # The algorithm uses a recursive depth-first descent with 3 cases, starting
      # with a root node (the tree/hash itself):
      # - a node has already been cached; in this case we return it from the cache
      # - a node has not been cached yet but should be; descend into its children
      # - a node is neither cached nor qualifies for caching; this is a no-op
      def dedup_tree(node, nodes_seen = {})
        if nodes_seen.key?(node)
          yield nodes_seen[node]
        elsif should_dedup?(node)
          nodes_seen[node] = node

          case node
          when Array
            node.each_index do |idx|
              dedup_tree(node[idx], nodes_seen) do |cached_node|
                node[idx] = cached_node
              end
            end
          when Hash
            node.each do |k, v|
              dedup_tree(v, nodes_seen) do |cached_node|
                node[k] = cached_node
              end
            end
          end
        end
      end

      # We do not need to consider nodes for which there cannot be multiple instances
      def should_dedup?(node)
        node && !(node.is_a?(Numeric) || node.is_a?(TrueClass) || node.is_a?(FalseClass))
      end
    end
  end
end
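For illustration, a hedged usage sketch of the processor above (the tree literal is invented; real import trees come from the parsed project.json):

processor = Gitlab::ImportExport::ProjectTreeProcessor.new

tree = {
  'labels' => [{ 'title' => 'bug' }],
  'issues' => [{ 'labels' => [{ 'title' => 'bug' }] }]
}

processor.process(tree)

# The second occurrence of the labels array was replaced by the cached
# first occurrence, so both paths now share one object graph:
tree['labels'].equal?(tree['issues'][0]['labels']) # => true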
@@ -7,15 +7,16 @@ module Gitlab
     attr_reader :shared
     attr_reader :project

-    def initialize(user:, shared:, project:)
+    def initialize(user:, shared:, project:, project_tree_processor: nil)
       @path = File.join(shared.export_path, 'project.json')
       @user = user
       @shared = shared
       @project = project
+      @project_tree_processor = project_tree_processor || ProjectTreeProcessor.new_for_file(@path)
     end

     def restore
-      @tree_hash = read_tree_hash
+      @tree_hash = @project_tree_processor.process(read_tree_hash)
       @project_members = @tree_hash.delete('project_members')

       RelationRenameService.rename(@tree_hash)
...
@@ -155,7 +155,7 @@ module Gitlab
     def build_relation(relation_key, relation_definition, data_hash)
       # TODO: This is hack to not create relation for the author
       # Rather make `RelationFactory#set_note_author` to take care of that
-      return data_hash if relation_key == 'author'
+      return data_hash if relation_key == 'author' || already_restored?(data_hash)

       # create relation objects recursively for all sub-objects
       relation_definition.each do |sub_relation_key, sub_relation_definition|
@@ -165,6 +165,13 @@ module Gitlab
       @relation_factory.create(relation_factory_params(relation_key, data_hash))
     end

+    # Since we update the data hash in place as we restore relation items,
+    # and since we also de-duplicate items, we might encounter items that
+    # have already been restored in a previous iteration.
+    def already_restored?(relation_item)
+      !relation_item.is_a?(Hash)
+    end
+
     def transform_sub_relations!(data_hash, sub_relation_key, sub_relation_definition)
       sub_data_hash = data_hash[sub_relation_key]
       return unless sub_data_hash
...
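The `already_restored?` guard matters because de-duplication makes subtrees shared: once one parent's copy has been restored in place, every other parent sees the restored object instead of a hash. A hedged sketch of that situation (`Note` stands in for any restored model; the hashes are invented):

# After de-duplication, two issues share one and the same notes array:
shared_notes = [{ 'note' => 'LGTM' }]
issue_a = { 'notes' => shared_notes }
issue_b = { 'notes' => shared_notes }

# Restoring issue_a transforms the shared sub-relation in place:
issue_a['notes'].map! { |note| note.is_a?(Hash) ? Note.new(note) : note }

# When issue_b is processed, its note is no longer a Hash, so the
# restorer must skip it rather than build it a second time:
issue_b['notes'].first.is_a?(Hash) # => false, i.e. already_restored?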
# frozen_string_literal: true

require 'spec_helper'

describe Gitlab::ImportExport::ProjectTreeProcessor do
  let(:tree) do
    {
      simple: 42,
      k1: { "v1": 1 },
      k2: ["v2"],
      array: [
        { k1: { "v1": 1 } },
        { k2: ["v2"] }
      ],
      nested: {
        k1: { "v1": 1 },
        k2: ["v2"],
        k3: ["don't touch"]
      }
    }
  end

  let(:processed_tree) { subject.process(tree) }

  it 'returns the processed tree' do
    expect(processed_tree).to be(tree)
  end

  it 'de-duplicates equal values' do
    expect(processed_tree[:k1]).to be(processed_tree[:array][0][:k1])
    expect(processed_tree[:k1]).to be(processed_tree[:nested][:k1])
    expect(processed_tree[:k2]).to be(processed_tree[:array][1][:k2])
    expect(processed_tree[:k2]).to be(processed_tree[:nested][:k2])
  end

  it 'keeps unique entries intact' do
    expect(processed_tree[:simple]).to eq(42)
    expect(processed_tree[:nested][:k3]).to eq(["don't touch"])
  end

  it 'maintains object equality' do
    expect { processed_tree }.not_to change { tree }
  end

  context 'obtaining a suitable processor' do
    context 'when the project file is above the size threshold' do
      it 'returns an optimizing processor' do
        stub_project_file_size(subject.class::LARGE_PROJECT_FILE_SIZE_BYTES)

        expect(subject.class.new_for_file('/path/to/project.json')).to(
          be_an_instance_of(Gitlab::ImportExport::ProjectTreeProcessor)
        )
      end
    end

    context 'when the file is below the size threshold' do
      it 'returns a no-op processor' do
        stub_project_file_size(subject.class::LARGE_PROJECT_FILE_SIZE_BYTES - 1)

        expect(subject.class.new_for_file('/path/to/project.json')).to(
          be_an_instance_of(Gitlab::ImportExport::IdentityProjectTreeProcessor)
        )
      end
    end

    def stub_project_file_size(size)
      allow(File).to receive(:size).and_return(size)
    end
  end
end
@@ -29,7 +29,8 @@ describe Gitlab::ImportExport::ProjectTreeRestorer do
       expect_any_instance_of(Gitlab::Git::Repository).to receive(:create_branch).with('feature', 'DCBA')
       allow_any_instance_of(Gitlab::Git::Repository).to receive(:create_branch)

-      project_tree_restorer = described_class.new(user: @user, shared: @shared, project: @project)
+      project_tree_restorer = described_class.new(user: @user, shared: @shared,
+        project: @project, project_tree_processor: Gitlab::ImportExport::ProjectTreeProcessor.new)

       @restored_project_json = project_tree_restorer.restore
     end
@@ -450,7 +451,10 @@ describe Gitlab::ImportExport::ProjectTreeRestorer do
   context 'project.json file access check' do
     let(:user) { create(:user) }
     let!(:project) { create(:project, :builds_disabled, :issues_disabled, name: 'project', path: 'project') }
-    let(:project_tree_restorer) { described_class.new(user: user, shared: shared, project: project) }
+    let(:project_tree_restorer) do
+      described_class.new(user: user, shared: shared, project: project,
+        project_tree_processor: Gitlab::ImportExport::ProjectTreeProcessor.new)
+    end
     let(:restored_project_json) { project_tree_restorer.restore }

     it 'does not read a symlink' do
@@ -673,7 +677,10 @@ describe Gitlab::ImportExport::ProjectTreeRestorer do
     let(:project) { create(:project) }
     let(:user) { create(:user) }
     let(:tree_hash) { { 'visibility_level' => visibility } }
-    let(:restorer) { described_class.new(user: user, shared: shared, project: project) }
+    let(:restorer) do
+      described_class.new(user: user, shared: shared, project: project,
+        project_tree_processor: Gitlab::ImportExport::ProjectTreeProcessor.new)
+    end

     before do
       expect(restorer).to receive(:read_tree_hash) { tree_hash }
...