Commit 63bc19ed authored by Vladimir Shushlin's avatar Vladimir Shushlin

Automatically migrate pages to zip storage

Currently, the only way to migrate pages to zip storage is to manually
run the rake task:

```
gitlab-rake gitlab:pages:migrate_legacy_storage
```

It requires human intervention on migration and can take a long time
to  complete. (it took about a week on gitlab.com).

We can't remove this rake task because it allows users to control the
process and fix some errors:
* it prints all errors by default, such as invalid symlinks and
projects marked as deployed by absent on pages storage
* it also allows for fix these errors by ignoring invalid symlinks and
marking projects as not deployed
* and it allows users to see the progress/check if everything has been
migrated

However, making every instance admin run this task when upgrading to
`14.0` may create too much work for them.
So we want to run the same migration code as the background
migration.
This way we will migrate the data for most of the users without their
intervention.

It still will be recommended to run the rake task and make sure
everything has been migrated properly.
But in most cases everything should be migrated automatically and even
if users forgot to run the rake task everything should be fine.

* Refactor existing batch migration service to support
calling without threads and just proccessing the batch or projects
* Subsequently refactor rake task due to changed interface
* Shedule a background migration and execute zip migration service
there
parent 0f672aca
......@@ -2,10 +2,8 @@
module Pages
class MigrateFromLegacyStorageService
def initialize(logger, migration_threads:, batch_size:, ignore_invalid_entries:, mark_projects_as_not_deployed:)
def initialize(logger, ignore_invalid_entries:, mark_projects_as_not_deployed:)
@logger = logger
@migration_threads = migration_threads
@batch_size = batch_size
@ignore_invalid_entries = ignore_invalid_entries
@mark_projects_as_not_deployed = mark_projects_as_not_deployed
......@@ -14,25 +12,35 @@ module Pages
@counters_lock = Mutex.new
end
def execute
def execute_with_threads(threads:, batch_size:)
@queue = SizedQueue.new(1)
threads = start_migration_threads
migration_threads = start_migration_threads(threads)
ProjectPagesMetadatum.only_on_legacy_storage.each_batch(of: @batch_size) do |batch|
ProjectPagesMetadatum.only_on_legacy_storage.each_batch(of: batch_size) do |batch|
@queue.push(batch)
end
@queue.close
@logger.info("Waiting for threads to finish...")
threads.each(&:join)
migration_threads.each(&:join)
{ migrated: @migrated, errored: @errored }
end
def start_migration_threads
Array.new(@migration_threads) do
def execute_for_batch(project_ids)
batch = ProjectPagesMetadatum.only_on_legacy_storage.where(project_id: project_ids) # rubocop: disable CodeReuse/ActiveRecord
process_batch(batch)
{ migrated: @migrated, errored: @errored }
end
private
def start_migration_threads(count)
Array.new(count) do
Thread.new do
while batch = @queue.pop
Rails.application.executor.wrap do
......
---
title: Automatically try to migrate gitlab pages to zip storage
merge_request: 54578
author:
type: added
# frozen_string_literal: true
class ScheduleMigratePagesToZipStorage < ActiveRecord::Migration[6.0]
include Gitlab::Database::MigrationHelpers
DOWNTIME = false
MIGRATION = 'MigratePagesToZipStorage'
BATCH_SIZE = 100
BATCH_TIME = 6.minutes
disable_ddl_transaction!
class ProjectPagesMetadatum < ActiveRecord::Base
extend SuppressCompositePrimaryKeyWarning
include EachBatch
self.primary_key = :project_id
self.table_name = 'project_pages_metadata'
self.inheritance_column = :_type_disabled
scope :deployed, -> { where(deployed: true) }
scope :only_on_legacy_storage, -> { deployed.where(pages_deployment_id: nil) }
end
def up
queue_background_migration_jobs_by_range_at_intervals(
ProjectPagesMetadatum.only_on_legacy_storage,
MIGRATION,
BATCH_TIME,
batch_size: BATCH_SIZE,
primary_column_name: :project_id
)
end
end
7c562d43801c18af48dc526dc6574aebd11689b62bad864b107580d341ba64a1
\ No newline at end of file
# frozen_string_literal: true
module Gitlab
module BackgroundMigration
# migrates pages from legacy storage to zip format
# we intentionally use application code here because
# it has a lot of dependencies including models, carrierwave uploaders and service objects
# and copying all or part of this code in the background migration doesn't add much value
# see https://gitlab.com/gitlab-org/gitlab/-/merge_requests/54578 for discussion
class MigratePagesToZipStorage
def perform(start_id, stop_id)
::Pages::MigrateFromLegacyStorageService.new(Gitlab::AppLogger,
ignore_invalid_entries: false,
mark_projects_as_not_deployed: false)
.execute_for_batch(start_id..stop_id)
end
end
end
end
......@@ -9,10 +9,9 @@ namespace :gitlab do
logger.info('Starting to migrate legacy pages storage to zip deployments')
result = ::Pages::MigrateFromLegacyStorageService.new(logger,
migration_threads: migration_threads,
batch_size: batch_size,
ignore_invalid_entries: ignore_invalid_entries,
mark_projects_as_not_deployed: mark_projects_as_not_deployed).execute
mark_projects_as_not_deployed: mark_projects_as_not_deployed)
.execute_with_threads(threads: migration_threads, batch_size: batch_size)
logger.info("A total of #{result[:migrated] + result[:errored]} projects were processed.")
logger.info("- The #{result[:migrated]} projects migrated successfully")
......
# frozen_string_literal: true
require 'spec_helper'
RSpec.describe Gitlab::BackgroundMigration::MigratePagesToZipStorage do
let(:namespace) { create(:group) } # rubocop: disable RSpec/FactoriesInMigrationSpecs
let(:migration) { described_class.new }
describe '#perform' do
context 'when there is project to migrate' do
let!(:project) { create_project('project') }
after do
FileUtils.rm_rf(project.pages_path)
end
it 'migrates project to zip storage' do
expect_next_instance_of(::Pages::MigrateFromLegacyStorageService,
anything,
ignore_invalid_entries: false,
mark_projects_as_not_deployed: false) do |service|
expect(service).to receive(:execute_for_batch).with(project.id..project.id).and_call_original
end
migration.perform(project.id, project.id)
expect(project.reload.pages_metadatum.pages_deployment.file.filename).to eq("_migrated.zip")
end
end
end
def create_project(path)
project = create(:project) # rubocop: disable RSpec/FactoriesInMigrationSpecs
project.mark_pages_as_deployed
FileUtils.mkdir_p File.join(project.pages_path, "public")
File.open(File.join(project.pages_path, "public/index.html"), "w") do |f|
f.write("Hello!")
end
project
end
end
# frozen_string_literal: true
require 'spec_helper'
require Rails.root.join('db', 'post_migrate', '20210302150310_schedule_migrate_pages_to_zip_storage.rb')
RSpec.describe ScheduleMigratePagesToZipStorage, :sidekiq_might_not_need_inline, schema: 20201231133921 do
let(:migration_class) { described_class::MIGRATION }
let(:migration_name) { migration_class.to_s.demodulize }
let(:namespaces_table) { table(:namespaces) }
let(:projects_table) { table(:projects) }
let(:metadata_table) { table(:project_pages_metadata) }
let(:deployments_table) { table(:pages_deployments) }
let(:namespace) { namespaces_table.create!(path: "group", name: "group") }
def create_project_metadata(path, deployed, with_deployment)
project = projects_table.create!(path: path, namespace_id: namespace.id)
deployment_id = nil
if with_deployment
deployment_id = deployments_table.create!(project_id: project.id, file_store: 1, file: '1', file_count: 1, file_sha256: '123', size: 1).id
end
metadata_table.create!(project_id: project.id, deployed: deployed, pages_deployment_id: deployment_id)
end
it 'correctly schedules background migrations' do
Sidekiq::Testing.fake! do
freeze_time do
create_project_metadata("not-deployed-project", false, false)
first_id = create_project_metadata("project1", true, false).id
last_id = create_project_metadata("project2", true, false).id
create_project_metadata("project-with-deployment", true, true)
migrate!
expect(migration_name).to be_scheduled_delayed_migration(6.minutes, first_id, last_id)
expect(BackgroundMigrationWorker.jobs.size).to eq(1)
end
end
end
end
......@@ -5,104 +5,133 @@ require 'spec_helper'
RSpec.describe Pages::MigrateFromLegacyStorageService do
let(:batch_size) { 10 }
let(:mark_projects_as_not_deployed) { false }
let(:service) { described_class.new(Rails.logger, migration_threads: 3, batch_size: batch_size, ignore_invalid_entries: false, mark_projects_as_not_deployed: mark_projects_as_not_deployed) }
let(:service) { described_class.new(Rails.logger, ignore_invalid_entries: false, mark_projects_as_not_deployed: mark_projects_as_not_deployed) }
it 'does not try to migrate pages if pages are not deployed' do
expect(::Pages::MigrateLegacyStorageToDeploymentService).not_to receive(:new)
shared_examples "migrates projects properly" do
it 'does not try to migrate pages if pages are not deployed' do
expect(::Pages::MigrateLegacyStorageToDeploymentService).not_to receive(:new)
expect(service.execute).to eq(migrated: 0, errored: 0)
end
is_expected.to eq(migrated: 0, errored: 0)
end
context 'when there is work for multiple threads' do
let(:batch_size) { 2 } # override to force usage of multiple threads
context 'when pages are marked as deployed' do
let(:project) { create(:project) }
it 'uses multiple threads' do
projects = create_list(:project, 20)
projects.each do |project|
before do
project.mark_pages_as_deployed
end
context 'when pages directory does not exist' do
context 'when mark_projects_as_not_deployed is set' do
let(:mark_projects_as_not_deployed) { true }
it 'counts project as migrated' do
expect_next_instance_of(::Pages::MigrateLegacyStorageToDeploymentService, project, ignore_invalid_entries: false, mark_projects_as_not_deployed: true) do |service|
expect(service).to receive(:execute).and_call_original
end
is_expected.to eq(migrated: 1, errored: 0)
end
end
it 'counts project as errored' do
expect_next_instance_of(::Pages::MigrateLegacyStorageToDeploymentService, project, ignore_invalid_entries: false, mark_projects_as_not_deployed: false) do |service|
expect(service).to receive(:execute).and_call_original
end
FileUtils.mkdir_p File.join(project.pages_path, "public")
File.open(File.join(project.pages_path, "public/index.html"), "w") do |f|
f.write("Hello!")
is_expected.to eq(migrated: 0, errored: 1)
end
end
threads = Concurrent::Set.new
context 'when pages directory exists on disk' do
before do
FileUtils.mkdir_p File.join(project.pages_path, "public")
File.open(File.join(project.pages_path, "public/index.html"), "w") do |f|
f.write("Hello!")
end
end
it 'migrates pages projects without deployments' do
expect_next_instance_of(::Pages::MigrateLegacyStorageToDeploymentService, project, ignore_invalid_entries: false, mark_projects_as_not_deployed: false) do |service|
expect(service).to receive(:execute).and_call_original
end
expect(service).to receive(:migrate_project).exactly(20).times.and_wrap_original do |m, *args|
threads.add(Thread.current)
expect(project.pages_metadatum.reload.pages_deployment).to eq(nil)
expect(subject).to eq(migrated: 1, errored: 0)
expect(project.pages_metadatum.reload.pages_deployment).to be
end
# sleep to be 100% certain that once thread can't consume all the queue
# it works without it, but I want to avoid making this test flaky
sleep(0.01)
context 'when deployed already exists for the project' do
before do
deployment = create(:pages_deployment, project: project)
project.set_first_pages_deployment!(deployment)
end
m.call(*args)
end
it 'does not try to migrate project' do
expect(::Pages::MigrateLegacyStorageToDeploymentService).not_to receive(:new)
expect(service.execute).to eq(migrated: 20, errored: 0)
expect(threads.length).to eq(3)
is_expected.to eq(migrated: 0, errored: 0)
end
end
end
end
end
context 'when pages are marked as deployed' do
let(:project) { create(:project) }
describe '#execute_with_threads' do
subject { service.execute_with_threads(threads: 3, batch_size: batch_size) }
before do
project.mark_pages_as_deployed
end
include_examples "migrates projects properly"
context 'when pages directory does not exist' do
context 'when mark_projects_as_not_deployed is set' do
let(:mark_projects_as_not_deployed) { true }
context 'when there is work for multiple threads' do
let(:batch_size) { 2 } # override to force usage of multiple threads
it 'counts project as migrated' do
expect_next_instance_of(::Pages::MigrateLegacyStorageToDeploymentService, project, ignore_invalid_entries: false, mark_projects_as_not_deployed: true) do |service|
expect(service).to receive(:execute).and_call_original
end
it 'uses multiple threads' do
projects = create_list(:project, 20)
projects.each do |project|
project.mark_pages_as_deployed
expect(service.execute).to eq(migrated: 1, errored: 0)
FileUtils.mkdir_p File.join(project.pages_path, "public")
File.open(File.join(project.pages_path, "public/index.html"), "w") do |f|
f.write("Hello!")
end
end
end
it 'counts project as errored' do
expect_next_instance_of(::Pages::MigrateLegacyStorageToDeploymentService, project, ignore_invalid_entries: false, mark_projects_as_not_deployed: false) do |service|
expect(service).to receive(:execute).and_call_original
end
threads = Concurrent::Set.new
expect(service.execute).to eq(migrated: 0, errored: 1)
end
end
expect(service).to receive(:migrate_project).exactly(20).times.and_wrap_original do |m, *args|
threads.add(Thread.current)
context 'when pages directory exists on disk' do
before do
FileUtils.mkdir_p File.join(project.pages_path, "public")
File.open(File.join(project.pages_path, "public/index.html"), "w") do |f|
f.write("Hello!")
end
end
# sleep to be 100% certain that once thread can't consume all the queue
# it works without it, but I want to avoid making this test flaky
sleep(0.01)
it 'migrates pages projects without deployments' do
expect_next_instance_of(::Pages::MigrateLegacyStorageToDeploymentService, project, ignore_invalid_entries: false, mark_projects_as_not_deployed: false) do |service|
expect(service).to receive(:execute).and_call_original
m.call(*args)
end
expect do
expect(service.execute).to eq(migrated: 1, errored: 0)
end.to change { project.pages_metadatum.reload.pages_deployment }.from(nil)
is_expected.to eq(migrated: 20, errored: 0)
expect(threads.length).to eq(3)
end
end
end
context 'when deployed already exists for the project' do
before do
deployment = create(:pages_deployment, project: project)
project.set_first_pages_deployment!(deployment)
end
describe "#execute_for_batch" do
subject { service.execute_for_batch(Project.ids) }
include_examples "migrates projects properly"
it 'only tries to migrate projects with passed ids' do
projects = create_list(:project, 5)
it 'does not try to migrate project' do
expect(::Pages::MigrateLegacyStorageToDeploymentService).not_to receive(:new)
projects.each(&:mark_pages_as_deployed)
projects_to_migrate = projects.first(3)
expect(service.execute).to eq(migrated: 0, errored: 0)
projects_to_migrate.each do |project|
expect_next_instance_of(::Pages::MigrateLegacyStorageToDeploymentService, project, ignore_invalid_entries: false, mark_projects_as_not_deployed: false) do |service|
expect(service).to receive(:execute).and_call_original
end
end
expect(service.execute_for_batch(projects_to_migrate.pluck(:id))).to eq(migrated: 0, errored: 3)
end
end
end
......@@ -12,11 +12,9 @@ RSpec.describe 'gitlab:pages' do
it 'calls migration service' do
expect_next_instance_of(::Pages::MigrateFromLegacyStorageService, anything,
migration_threads: 3,
batch_size: 10,
ignore_invalid_entries: false,
mark_projects_as_not_deployed: false) do |service|
expect(service).to receive(:execute).and_call_original
expect(service).to receive(:execute_with_threads).with(threads: 3, batch_size: 10).and_call_original
end
subject
......@@ -26,11 +24,9 @@ RSpec.describe 'gitlab:pages' do
stub_env('PAGES_MIGRATION_THREADS', '5')
expect_next_instance_of(::Pages::MigrateFromLegacyStorageService, anything,
migration_threads: 5,
batch_size: 10,
ignore_invalid_entries: false,
mark_projects_as_not_deployed: false) do |service|
expect(service).to receive(:execute).and_call_original
expect(service).to receive(:execute_with_threads).with(threads: 5, batch_size: 10).and_call_original
end
subject
......@@ -40,11 +36,9 @@ RSpec.describe 'gitlab:pages' do
stub_env('PAGES_MIGRATION_BATCH_SIZE', '100')
expect_next_instance_of(::Pages::MigrateFromLegacyStorageService, anything,
migration_threads: 3,
batch_size: 100,
ignore_invalid_entries: false,
mark_projects_as_not_deployed: false) do |service|
expect(service).to receive(:execute).and_call_original
expect(service).to receive(:execute_with_threads).with(threads: 3, batch_size: 100).and_call_original
end
subject
......@@ -54,11 +48,9 @@ RSpec.describe 'gitlab:pages' do
stub_env('PAGES_MIGRATION_IGNORE_INVALID_ENTRIES', 'true')
expect_next_instance_of(::Pages::MigrateFromLegacyStorageService, anything,
migration_threads: 3,
batch_size: 10,
ignore_invalid_entries: true,
mark_projects_as_not_deployed: false) do |service|
expect(service).to receive(:execute).and_call_original
expect(service).to receive(:execute_with_threads).with(threads: 3, batch_size: 10).and_call_original
end
subject
......@@ -68,11 +60,9 @@ RSpec.describe 'gitlab:pages' do
stub_env('PAGES_MIGRATION_MARK_PROJECTS_AS_NOT_DEPLOYED', 'true')
expect_next_instance_of(::Pages::MigrateFromLegacyStorageService, anything,
migration_threads: 3,
batch_size: 10,
ignore_invalid_entries: false,
mark_projects_as_not_deployed: true) do |service|
expect(service).to receive(:execute).and_call_original
expect(service).to receive(:execute_with_threads).with(threads: 3, batch_size: 10).and_call_original
end
subject
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment