Commit bcf38b7f authored by Qingyu Zhao, committed by Kamil Trzciński

Introduce relation reader abstraction for project import

 - reader can read JSON files for further processing
 - its only implementation for now reads legacy (fat) JSON files
 - split Project::TreeLoader into dedup_legacy_reader and legacy_reader
parent e8e2c02a
......@@ -17,9 +17,17 @@ module Gitlab
end
def restore
@tree_hash = @group_hash || read_tree_hash
@group_members = @tree_hash.delete('members')
@children = @tree_hash.delete('children')
@relation_reader ||=
if @group_hash.present?
ImportExport::JSON::LegacyReader::User.new(@group_hash, reader.group_relation_names)
else
ImportExport::JSON::LegacyReader::File.new(@path, reader.group_relation_names)
end
@group_members = @relation_reader.consume_relation('members')
@children = @relation_reader.consume_attribute('children')
@relation_reader.consume_attribute('name')
@relation_reader.consume_attribute('path')
if members_mapper.map && restorer.restore
@children&.each do |group_hash|
......@@ -45,21 +53,12 @@ module Gitlab
private
# Reads the file at @path and decodes it as JSON.
#
# Returns the structure produced by ActiveSupport::JSON.decode.
# Raises Gitlab::ImportExport::Error ('Incorrect JSON format') when the file
# cannot be read or decoded; the original exception is passed to
# @shared.error before the new error is raised.
def read_tree_hash
  # ::File.read, unlike IO.read, never treats a path that starts with "|" as
  # a command to spawn. The leading :: guards against a namespaced File class.
  json = ::File.read(@path)
  ActiveSupport::JSON.decode(json)
rescue => e
  @shared.error(e)
  raise Gitlab::ImportExport::Error, 'Incorrect JSON format'
end
def restorer
@relation_tree_restorer ||= RelationTreeRestorer.new(
user: @user,
shared: @shared,
importable: @group,
tree_hash: @tree_hash.except('name', 'path'),
relation_reader: @relation_reader,
members_mapper: members_mapper,
object_builder: object_builder,
relation_factory: relation_factory,
......
# frozen_string_literal: true
module Gitlab
module ImportExport
module JSON
# Reader over a "legacy" export tree: one decoded JSON hash that holds both
# the importable's plain attributes and its relations.
#
# Subclasses supply the hash through #tree_hash and implement #valid?.
# Keys listed in +relation_names+ are treated as relations; every other key
# is treated as an attribute. Consuming a relation or attribute removes it
# from the reader.
class LegacyReader
  # Reader backed by a JSON file on disk. The file is read and decoded the
  # first time the tree is needed.
  class File < LegacyReader
    # path           - location of the JSON file.
    # relation_names - keys of the tree that are relations.
    def initialize(path, relation_names)
      @path = path
      super(relation_names)
    end

    # True when a file exists at the configured path.
    def valid?
      ::File.exist?(@path)
    end

    private

    # Decoded tree, memoized after the first read.
    def tree_hash
      @tree_hash ||= read_hash
    end

    # Reads and decodes the file.
    #
    # Raises Gitlab::ImportExport::Error ('Incorrect JSON format') on any
    # read or decode failure, after logging the original exception.
    def read_hash
      # ::File.read, unlike IO.read, never treats a path starting with "|"
      # as a command to spawn. The leading :: is required because the
      # constant File inside this class refers to the reader itself.
      ActiveSupport::JSON.decode(::File.read(@path))
    rescue => e
      Gitlab::ErrorTracking.log_exception(e)
      raise Gitlab::ImportExport::Error, 'Incorrect JSON format'
    end
  end

  # Reader backed by a tree hash that the caller already holds in memory.
  class User < LegacyReader
    # tree_hash      - the already-decoded tree.
    # relation_names - keys of the tree that are relations.
    def initialize(tree_hash, relation_names)
      @tree_hash = tree_hash
      super(relation_names)
    end

    # True when a non-blank tree hash was supplied.
    def valid?
      @tree_hash.present?
    end

    protected

    attr_reader :tree_hash
  end

  # relation_names - keys of the tree that are relations; each is converted
  #                  to a String so it matches decoded JSON keys.
  def initialize(relation_names)
    @relation_names = relation_names.map(&:to_s)
  end

  # Whether the underlying source is usable. Must be implemented by subclasses.
  def valid?
    raise NotImplementedError
  end

  # Every reader in this hierarchy reads the legacy format.
  def legacy?
    true
  end

  # Returns the attributes that are still unconsumed, minus
  # +excluded_attributes+ (names are compared as Strings).
  def root_attributes(excluded_attributes = [])
    attributes.except(*excluded_attributes.map(&:to_s))
  end

  # Removes the relation stored under +key+ from the reader.
  #
  # Without a block the removed value (or nil) is returned.
  # With a block nothing is yielded for a missing/nil value; an Array is
  # yielded element by element as (item, index); any other value is yielded
  # once as (value, 0).
  def consume_relation(key)
    value = relations.delete(key)

    return value unless block_given?
    return if value.nil?

    if value.is_a?(Array)
      value.each_with_index { |item, idx| yield(item, idx) }
    else
      yield(value, 0)
    end
  end

  # Removes the attribute stored under +key+ and returns its value (or nil).
  def consume_attribute(key)
    attributes.delete(key)
  end

  # Sorts the 'ci_pipelines' relation in place by each entry's 'id'.
  # Does nothing when that relation is absent.
  def sort_ci_pipelines_by_id
    relations['ci_pipelines']&.sort_by! { |hash| hash['id'] }
  end

  private

  attr_reader :relation_names

  # The decoded tree. Must be implemented by subclasses.
  def tree_hash
    raise NotImplementedError
  end

  # Attribute part of the tree, split off on first use. Hash#slice!
  # (ActiveSupport) leaves only the relation keys in tree_hash and returns
  # the pairs it removed, i.e. the attributes.
  def attributes
    @attributes ||= tree_hash.slice!(*relation_names)
  end

  # Relation part of the tree, split off on first use. Hash#extract!
  # (ActiveSupport) removes the relation keys from tree_hash and returns them.
  def relations
    @relations ||= tree_hash.extract!(*relation_names)
  end
end
end
end
end
# frozen_string_literal: true
module Gitlab
module ImportExport
module Project
# Loads a JSON export file into a Ruby tree, optionally replacing repeated
# equal nodes with a single shared object.
class TreeLoader
  # Decodes the JSON file at +path+.
  #
  # path          - location of the JSON file.
  # dedup_entries - when true, equal nodes inside the tree are replaced in
  #                 place by one shared instance (see #dedup_tree).
  #
  # Returns the decoded tree.
  def load(path, dedup_entries: false)
    # ::File.read never interprets a leading "|" in the path as a command,
    # which IO.read does.
    tree_hash = ActiveSupport::JSON.decode(::File.read(path))

    # dedup_tree works in place; its own return value is nil when the root is
    # not an Array or Hash, so the decoded tree is returned explicitly.
    dedup_tree(tree_hash) if dedup_entries
    tree_hash
  end

  private

  # This function removes duplicate entries from the given tree recursively
  # by caching nodes it encounters repeatedly. We only consider nodes for
  # which there can actually be multiple equivalent instances (e.g. strings,
  # hashes and arrays, but not `nil`s, numbers or booleans.)
  #
  # The algorithm uses a recursive depth-first descent with 3 cases, starting
  # with a root node (the tree/hash itself):
  # - a node has already been cached; in this case we return it from the cache
  # - a node has not been cached yet but should be; descend into its children
  # - a node is neither cached nor qualifies for caching; this is a no-op
  #
  # A cached node is handed back to the caller through the block, which is
  # how the parent swaps its child for the shared instance.
  def dedup_tree(node, nodes_seen = {})
    if nodes_seen.key?(node) && distinguishable?(node)
      yield nodes_seen[node]
    elsif should_dedup?(node)
      nodes_seen[node] = node

      case node
      when Array
        node.each_index do |idx|
          dedup_tree(node[idx], nodes_seen) do |cached_node|
            node[idx] = cached_node
          end
        end
      when Hash
        node.each do |k, v|
          dedup_tree(v, nodes_seen) do |cached_node|
            node[k] = cached_node
          end
        end
      end
    else
      node
    end
  end

  # We do not need to consider nodes for which there cannot be multiple instances
  def should_dedup?(node)
    node && !(node.is_a?(Numeric) || node.is_a?(TrueClass) || node.is_a?(FalseClass))
  end

  # We can only safely de-dup values that are distinguishable. True value objects
  # are always distinguishable by nature. Hashes however can represent entities,
  # which are identified by ID, not value. We therefore disallow de-duping hashes
  # that do not have an `id` field, since we might risk dropping entities that
  # have equal attributes yet different identities.
  def distinguishable?(node)
    if node.is_a?(Hash)
      node.key?('id')
    else
      true
    end
  end
end
end
end
end
......@@ -4,8 +4,6 @@ module Gitlab
module ImportExport
module Project
class TreeRestorer
LARGE_PROJECT_FILE_SIZE_BYTES = 500.megabyte
attr_reader :user
attr_reader :shared
attr_reader :project
......@@ -14,12 +12,12 @@ module Gitlab
@user = user
@shared = shared
@project = project
@tree_loader = TreeLoader.new
end
def restore
@tree_hash = read_tree_hash
@project_members = @tree_hash.delete('project_members')
@relation_reader = ImportExport::JSON::LegacyReader::File.new(File.join(shared.export_path, 'project.json'), reader.project_relation_names)
@project_members = @relation_reader.consume_relation('project_members')
if relation_tree_restorer.restore
import_failure_service.with_retry(action: 'set_latest_merge_request_diff_ids!') do
......@@ -37,24 +35,12 @@ module Gitlab
private
# True when the file at +path+ is at least LARGE_PROJECT_FILE_SIZE_BYTES big.
def large_project?(path)
  byte_size = File.size(path)
  byte_size >= LARGE_PROJECT_FILE_SIZE_BYTES
end
# Loads 'project.json' from the shared export path through @tree_loader,
# requesting entry de-duplication when large_project? says the file is large.
#
# Any failure is logged and re-raised as Gitlab::ImportExport::Error with the
# message 'Incorrect JSON format'.
def read_tree_hash
  json_path = File.join(@shared.export_path, 'project.json')
  dedup = large_project?(json_path)

  @tree_loader.load(json_path, dedup_entries: dedup)
rescue => error
  Rails.logger.error("Import/Export error: #{error.message}") # rubocop:disable Gitlab/RailsLogger
  raise Gitlab::ImportExport::Error, 'Incorrect JSON format'
end
def relation_tree_restorer
@relation_tree_restorer ||= RelationTreeRestorer.new(
user: @user,
shared: @shared,
importable: @project,
tree_hash: @tree_hash,
relation_reader: @relation_reader,
object_builder: object_builder,
members_mapper: members_mapper,
relation_factory: relation_factory,
......
......@@ -17,10 +17,18 @@ module Gitlab
tree_by_key(:project)
end
# Keys of the relations tree that attributes_finder returns for :project.
def project_relation_names
  project_relations = attributes_finder.find_relations_tree(:project)
  project_relations.keys
end
# Returns whatever tree_by_key yields for the :group key.
def group_tree
  group_key = :group
  tree_by_key(group_key)
end
# Keys of the relations tree that attributes_finder returns for :group.
def group_relation_names
  group_relations = attributes_finder.find_relations_tree(:group)
  group_relations.keys
end
def group_members_tree
tree_by_key(:group_members)
end
......
......@@ -9,13 +9,13 @@ module Gitlab
attr_reader :user
attr_reader :shared
attr_reader :importable
attr_reader :tree_hash
attr_reader :relation_reader
def initialize(user:, shared:, importable:, tree_hash:, members_mapper:, object_builder:, relation_factory:, reader:)
def initialize(user:, shared:, importable:, relation_reader:, members_mapper:, object_builder:, relation_factory:, reader:)
@user = user
@shared = shared
@importable = importable
@tree_hash = tree_hash
@relation_reader = relation_reader
@members_mapper = members_mapper
@object_builder = object_builder
@relation_factory = relation_factory
......@@ -30,7 +30,7 @@ module Gitlab
bulk_inserts_enabled = @importable.class == ::Project &&
Feature.enabled?(:import_bulk_inserts, @importable.group)
BulkInsertableAssociations.with_bulk_insert(enabled: bulk_inserts_enabled) do
update_relation_hashes!
fix_ci_pipelines_not_sorted_on_legacy_project_json!
create_relations!
end
end
......@@ -57,18 +57,8 @@ module Gitlab
end
def process_relation!(relation_key, relation_definition)
data_hashes = @tree_hash.delete(relation_key)
return unless data_hashes
# we do not care if we process array or hash
data_hashes = [data_hashes] unless data_hashes.is_a?(Array)
relation_index = 0
# consume and remove objects from memory
while data_hash = data_hashes.shift
@relation_reader.consume_relation(relation_key) do |data_hash, relation_index|
process_relation_item!(relation_key, relation_definition, relation_index, data_hash)
relation_index += 1
end
end
......@@ -103,10 +93,7 @@ module Gitlab
end
def update_params!
params = @tree_hash.reject do |key, _|
relations.include?(key)
end
params = @relation_reader.root_attributes(relations.keys)
params = params.merge(present_override_params)
# Cleaning all imported and overridden params
......@@ -223,8 +210,13 @@ module Gitlab
}
end
def update_relation_hashes!
@tree_hash['ci_pipelines']&.sort_by! { |hash| hash['id'] }
# Workaround for https://gitlab.com/gitlab-org/gitlab/-/issues/27883, needed
# only while importing from a legacy project.json.
# Remove once the legacy JSON format is deprecated: the ndjson export file
# will fix the order during project export.
#
# Asks the relation reader to sort its CI pipelines by id when the reader
# reports itself as legacy; otherwise does nothing and returns nil.
def fix_ci_pipelines_not_sorted_on_legacy_project_json!
  relation_reader.sort_ci_pipelines_by_id if relation_reader.legacy?
end
end
end
......
......@@ -18,7 +18,7 @@ module Gitlab
# Serializes +tree+ to JSON and writes it to +filename+ inside +dir_path+,
# creating the directory first.
#
# tree     - the structure to serialize.
# dir_path - directory that will hold the file.
# filename - name of the file to write.
def save(tree, dir_path, filename)
  mkdir_p(dir_path)

  # Serialize once, and always through the top-level JSON library: an
  # unqualified JSON constant could resolve to a project namespace of the
  # same name (NOTE(review): e.g. Gitlab::ImportExport::JSON - confirm the
  # enclosing module of this method).
  tree_json = ::JSON.generate(tree)

  File.write(File.join(dir_path, filename), tree_json)
end
......
# frozen_string_literal: true
require 'spec_helper'
# Spec for the in-memory legacy reader: only #valid? is exercised here.
describe Gitlab::ImportExport::JSON::LegacyReader::User do
let(:relation_names) { [] }
# tree_hash is supplied by each context below.
let(:legacy_reader) { described_class.new(tree_hash, relation_names) }
describe '#valid?' do
subject { legacy_reader.valid? }
# A nil tree must be reported as invalid.
context 'tree_hash not present' do
let(:tree_hash) { nil }
it { is_expected.to be false }
end
# Any non-empty hash counts as valid.
context 'tree_hash presents' do
# Note: the quoted-key literal below produces the Symbol key :issues.
let(:tree_hash) { { "issues": [] } }
it { is_expected.to be true }
end
end
end
# Spec for the file-backed legacy reader, run against the "light" project
# fixture. project_tree is the fixture parsed independently of the reader and
# serves as the expected value.
describe Gitlab::ImportExport::JSON::LegacyReader::File do
let(:fixture) { 'spec/fixtures/lib/gitlab/import_export/light/project.json' }
let(:project_tree) { JSON.parse(File.read(fixture)) }
let(:relation_names) { [] }
# path is supplied by each describe/context below.
let(:legacy_reader) { described_class.new(path, relation_names) }
# valid? only depends on whether the path exists.
describe '#valid?' do
subject { legacy_reader.valid? }
context 'given valid path' do
let(:path) { fixture }
it { is_expected.to be true }
end
context 'given invalid path' do
let(:path) { 'spec/non-existing-folder/do-not-create-this-file.json' }
it { is_expected.to be false }
end
end
describe '#root_attributes' do
let(:path) { fixture }
subject { legacy_reader.root_attributes(excluded_attributes) }
# With no relation names and no exclusions every key is an attribute.
context 'No excluded attributes' do
let(:excluded_attributes) { [] }
let(:relation_names) { [] }
it 'returns the whole tree from parsed JSON' do
expect(subject).to eq(project_tree)
end
end
# Both explicitly excluded keys and keys declared as relations must be absent.
context 'Some attributes are excluded' do
let(:excluded_attributes) { %w[milestones labels issues services snippets] }
let(:relation_names) { %w[import_type archived] }
it 'returns hash without excluded attributes and relations' do
expect(subject).not_to include('milestones', 'labels', 'issues', 'services', 'snippets', 'import_type', 'archived')
end
end
end
# Most examples below replace the reader's private #relations with a canned
# hash so the yielded values are fully controlled by the example.
describe '#consume_relation' do
let(:path) { fixture }
let(:key) { 'description' }
context 'block not given' do
it 'returns value of the key' do
expect(legacy_reader).to receive(:relations).and_return({ key => 'test value' })
expect(legacy_reader.consume_relation(key)).to eq('test value')
end
end
# NOTE(review): relation_names is [] here, so the key is presumably never
# part of the relations in the first place - confirm this example really
# covers the "already consumed" path.
context 'key has been consumed' do
before do
legacy_reader.consume_relation(key)
end
it 'does not yield' do
expect do |blk|
legacy_reader.consume_relation(key, &blk)
end.not_to yield_control
end
end
context 'value is nil' do
before do
expect(legacy_reader).to receive(:relations).and_return({ key => nil })
end
it 'does not yield' do
expect do |blk|
legacy_reader.consume_relation(key, &blk)
end.not_to yield_control
end
end
# A single (non-array) value is yielded once, with index 0.
context 'value is not array' do
before do
expect(legacy_reader).to receive(:relations).and_return({ key => 'value' })
end
it 'yield the value with index 0' do
expect do |blk|
legacy_reader.consume_relation(key, &blk)
end.to yield_with_args('value', 0)
end
end
# Array values are yielded element by element together with their index.
context 'value is an array' do
before do
expect(legacy_reader).to receive(:relations).and_return({ key => %w[item1 item2 item3] })
end
it 'yield each array element with index' do
expect do |blk|
legacy_reader.consume_relation(key, &blk)
end.to yield_successive_args(['item1', 0], ['item2', 1], ['item3', 2])
end
end
end
# tree_hash is private on the reader, hence the use of send.
describe '#tree_hash' do
let(:path) { fixture }
subject { legacy_reader.send(:tree_hash) }
it 'parses the JSON into the expected tree' do
expect(subject).to eq(project_tree)
end
# A file that cannot be decoded must surface as the import/export error.
context 'invalid JSON' do
let(:path) { 'spec/fixtures/lib/gitlab/import_export/invalid_json/project.json' }
it 'raise Exception' do
expect { subject }.to raise_exception(Gitlab::ImportExport::Error, 'Incorrect JSON format')
end
end
end
end
# frozen_string_literal: true
require 'spec_helper'
# Spec for TreeLoader#load, with and without de-duplication, against a
# fixture that contains repeated nodes. project_tree is the fixture parsed
# independently of the loader and serves as the expected value.
describe Gitlab::ImportExport::Project::TreeLoader do
let(:fixture) { 'spec/fixtures/lib/gitlab/import_export/with_duplicates.json' }
let(:project_tree) { JSON.parse(File.read(fixture)) }
context 'without de-duplicating entries' do
let(:parsed_tree) do
subject.load(fixture)
end
it 'parses the JSON into the expected tree' do
expect(parsed_tree).to eq(project_tree)
end
# `be` checks object identity: equal nodes must remain separate objects.
it 'does not de-duplicate entries' do
expect(parsed_tree['duped_hash_with_id']).not_to be(parsed_tree['array'][0]['duped_hash_with_id'])
end
end
context 'with de-duplicating entries' do
let(:parsed_tree) do
subject.load(fixture, dedup_entries: true)
end
# De-duplication must not change the tree's value, only object sharing.
it 'parses the JSON into the expected tree' do
expect(parsed_tree).to eq(project_tree)
end
# Hashes carrying an id and arrays are expected to become one shared object.
it 'de-duplicates equal values' do
expect(parsed_tree['duped_hash_with_id']).to be(parsed_tree['array'][0]['duped_hash_with_id'])
expect(parsed_tree['duped_hash_with_id']).to be(parsed_tree['nested']['duped_hash_with_id'])
expect(parsed_tree['duped_array']).to be(parsed_tree['array'][1]['duped_array'])
expect(parsed_tree['duped_array']).to be(parsed_tree['nested']['duped_array'])
end
# Hashes without an id stay equal in value but are not merged into one object.
it 'does not de-duplicate hashes without IDs' do
expect(parsed_tree['duped_hash_no_id']).to eq(parsed_tree['array'][2]['duped_hash_no_id'])
expect(parsed_tree['duped_hash_no_id']).not_to be(parsed_tree['array'][2]['duped_hash_no_id'])
end
it 'keeps single entries intact' do
expect(parsed_tree['simple']).to eq(42)
expect(parsed_tree['nested']['array']).to eq(["don't touch"])
end
end
end
......@@ -783,7 +783,8 @@ describe Gitlab::ImportExport::Project::TreeRestorer do
end
before do
expect(restorer).to receive(:read_tree_hash) { tree_hash }
allow_any_instance_of(Gitlab::ImportExport::JSON::LegacyReader::File).to receive(:valid?).and_return(true)
allow_any_instance_of(Gitlab::ImportExport::JSON::LegacyReader::File).to receive(:tree_hash) { tree_hash }
end
context 'no group visibility' do
......
# frozen_string_literal: true
# This spec is a lightweight version of:
# * project_tree_restorer_spec.rb
# * project/tree_restorer_spec.rb
#
# In depth testing is being done in the above specs.
# This spec tests that restore project works
......@@ -25,7 +25,7 @@ describe Gitlab::ImportExport::RelationTreeRestorer do
described_class.new(
user: user,
shared: shared,
tree_hash: tree_hash,
relation_reader: relation_reader,
importable: importable,
object_builder: object_builder,
members_mapper: members_mapper,
......@@ -36,14 +36,7 @@ describe Gitlab::ImportExport::RelationTreeRestorer do
subject { relation_tree_restorer.restore }
context 'when restoring a project' do
let(:path) { 'spec/fixtures/lib/gitlab/import_export/complex/project.json' }
let(:importable) { create(:project, :builds_enabled, :issues_disabled, name: 'project', path: 'project') }
let(:object_builder) { Gitlab::ImportExport::Project::ObjectBuilder }
let(:relation_factory) { Gitlab::ImportExport::Project::RelationFactory }
let(:reader) { Gitlab::ImportExport::Reader.new(shared: shared) }
let(:tree_hash) { importable_hash }
shared_examples 'import project successfully' do
it 'restores project tree' do
expect(subject).to eq(true)
end
......@@ -66,4 +59,18 @@ describe Gitlab::ImportExport::RelationTreeRestorer do
end
end
end
context 'when restoring a project' do
let(:path) { 'spec/fixtures/lib/gitlab/import_export/complex/project.json' }
let(:importable) { create(:project, :builds_enabled, :issues_disabled, name: 'project', path: 'project') }
let(:object_builder) { Gitlab::ImportExport::Project::ObjectBuilder }
let(:relation_factory) { Gitlab::ImportExport::Project::RelationFactory }
let(:reader) { Gitlab::ImportExport::Reader.new(shared: shared) }
context 'using legacy reader' do
let(:relation_reader) { Gitlab::ImportExport::JSON::LegacyReader::File.new(path, reader.project_relation_names) }
it_behaves_like 'import project successfully'
end
end
end
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment