Commit 684114bb authored by Stan Hu

Merge branch 'ab/cleanup-untracked-uploads-migration' into 'master'

Cleanup and remove PopulateUntrackedUploads background migration

Closes #223857

See merge request gitlab-org/gitlab!37352
parents 2bbc4d78 2640a65c
......@@ -25,7 +25,6 @@ Layout/ClosingHeredocIndentation:
- 'app/graphql/mutations/merge_requests/set_wip.rb'
- 'ee/db/geo/migrate/20180322062741_migrate_ci_job_artifacts_to_separate_registry.rb'
- 'ee/lib/gitlab/geo/health_check.rb'
- 'lib/gitlab/background_migration/populate_untracked_uploads.rb'
- 'spec/features/merge_request/user_sees_diff_spec.rb'
- 'spec/lib/gitlab/asciidoc_spec.rb'
- 'spec/lib/gitlab/checks/project_moved_spec.rb'
......@@ -1536,7 +1535,6 @@ Rails/SaveBang:
- 'spec/support/helpers/stub_object_storage.rb'
- 'spec/support/migrations_helpers/cluster_helpers.rb'
- 'spec/support/migrations_helpers/namespaces_helper.rb'
- 'spec/support/migrations_helpers/track_untracked_uploads_helpers.rb'
- 'spec/support/shared_contexts/email_shared_context.rb'
- 'spec/support/shared_contexts/finders/group_projects_finder_shared_contexts.rb'
- 'spec/support/shared_contexts/mailers/notify_shared_context.rb'
......
---
title: Cleanup migration to drop temporary table untracked_files_for_uploads if exists
merge_request: 37352
author:
type: other
# Keep `untracked_files_for_uploads` out of the dumped schema: it is a
# temporary table used by a background migration, not part of the static schema.
ActiveRecord::SchemaDumper.ignore_tables = ["untracked_files_for_uploads"]
# Also ignore every table inside the dynamic partitions schema; those
# partitions are managed dynamically and do not belong in the static
# application schema.
ActiveRecord::SchemaDumper.ignore_tables += ["#{Gitlab::Database::DYNAMIC_PARTITIONS_SCHEMA}.*"]
# frozen_string_literal: true
# Removes the temporary `untracked_files_for_uploads` table when it is still
# present in the database.
class DropTemporaryTableUntrackedFilesForUploadsIfExists < ActiveRecord::Migration[6.0]
  DOWNTIME = false

  # `IF EXISTS` makes this a no-op when the table is already gone.
  def up
    execute('DROP TABLE IF EXISTS untracked_files_for_uploads')
  end

  # Intentionally empty: the table should not exist, so there is nothing to
  # restore.
  def down; end
end
......@@ -23960,6 +23960,7 @@ COPY "schema_migrations" (version) FROM STDIN;
20200703124823
20200703125016
20200703154822
20200703165434
20200704143633
20200704161600
20200706005325
......
# frozen_string_literal: true
module Gitlab
module BackgroundMigration
# This class processes a batch of rows in `untracked_files_for_uploads` by
# adding each file to the `uploads` table if it does not exist.
class PopulateUntrackedUploads
# Processes the `untracked_files_for_uploads` rows whose IDs lie in the
# inclusive range start_id..end_id: inserts the missing uploads, deletes the
# processed rows, then drops the temporary table when nothing is left.
#
# Does nothing when either required table is missing (see #migrate?).
def perform(start_id, end_id)
  return unless migrate?

  batch = Gitlab::BackgroundMigration::PopulateUntrackedUploadsDependencies::UntrackedFile.where(id: start_id..end_id)
  insert_uploads_if_needed(batch).delete_all
  drop_temp_table_if_finished
end
private
# True only when both the temporary tracking table and the `uploads` table
# exist. The second table is not checked when the first is missing.
def migrate?
  tracking_model = Gitlab::BackgroundMigration::PopulateUntrackedUploadsDependencies::UntrackedFile
  return false unless tracking_model.table_exists?

  Gitlab::BackgroundMigration::PopulateUntrackedUploadsDependencies::Upload.table_exists?
end
# Inserts an upload for every file in `files` that parses cleanly, is not
# already tracked and whose model still exists.
#
# Returns the relation of processed files: everything in `files` except the
# rows that failed to parse, so those stay in the table.
def insert_uploads_if_needed(files)
  parseable, unparseable = filter_error_files(files)
  untracked = filter_existing_uploads(parseable)
  insert(filter_deleted_models(untracked))

  files.where.not(id: unparseable.map(&:id))
end
# Splits `files` into [parseable, unparseable]. A file is unparseable when
# calling #to_h on it raises; the error is logged and the file lands in the
# second array instead of aborting the whole batch.
def filter_error_files(files)
  files.partition do |candidate|
    candidate.to_h
    true
  rescue StandardError => error
    Rails.logger.error(<<~MSG) # rubocop:disable Gitlab/RailsLogger
      Error parsing path "#{candidate.path}":
      #{error.message}
      #{error.backtrace.join("\n ")}
    MSG
    false
  end
end
# Drops the files whose upload path is already present in the `uploads`
# table, using a single query for the whole batch.
def filter_existing_uploads(files)
  candidate_paths = files.map(&:upload_path)
  tracked_paths = Gitlab::BackgroundMigration::PopulateUntrackedUploadsDependencies::Upload.where(path: candidate_paths).pluck(:path).to_set

  files.reject { |file| tracked_paths.include?(file.upload_path) }
end
# Files can remain on disk after their model was deleted, because deleting a
# model does not delete its files. Such files are not in the uploads table
# and must not be added, so they are rejected here.
def filter_deleted_models(files)
  deleted_ids_by_type = deleted_model_ids(files)

  files.reject { |file| deleted_ids_by_type[file.model_type].include?(file.model_id) }
end
# Returns a hash of model type => model IDs referenced by `files` that no
# longer exist in that model's table. All five model types are always
# present as keys, with an empty array when nothing is missing.
def deleted_model_ids(files)
  referenced_ids = {
    'Appearance' => [],
    'Namespace' => [],
    'Note' => [],
    'Project' => [],
    'User' => []
  }

  # Group the referenced model IDs by model type
  files.each { |file| referenced_ids[file.model_type] << file.model_id }

  # One lookup per model type; whatever was not found counts as deleted
  referenced_ids.each_with_object({}) do |(model_type, model_ids), deleted|
    model_class = "Gitlab::BackgroundMigration::PopulateUntrackedUploadsDependencies::#{model_type}".constantize
    found_ids = model_class.where(id: model_ids.uniq).pluck(:id)
    deleted[model_type] = model_ids - found_ids
  end
end
# Bulk-inserts one `uploads` row per file. `created_at` is sent as the raw
# SQL expression NOW(), which is why quoting is disabled for that column.
def insert(files)
  upload_rows = files.map { |file| file.to_h.merge(created_at: 'NOW()') }

  Gitlab::Database.bulk_insert('uploads', upload_rows, disable_quote: :created_at) # rubocop:disable Gitlab/BulkInsert
end
# Drops the temporary tracking table once it holds no rows. Skipped in the
# test environment.
def drop_temp_table_if_finished
  tracking_model = Gitlab::BackgroundMigration::PopulateUntrackedUploadsDependencies::UntrackedFile
  return unless tracking_model.all.empty?
  return if Rails.env.test? # Dropping a table intermittently breaks test cleanup

  tracking_model.connection.drop_table(:untracked_files_for_uploads, if_exists: true)
end
end
end
end
# frozen_string_literal: true
module Gitlab
module BackgroundMigration
module PopulateUntrackedUploadsDependencies
# This class is responsible for producing the attributes necessary to
# track an uploaded file in the `uploads` table.
class UntrackedFile < ActiveRecord::Base # rubocop:disable Metrics/ClassLength
self.table_name = 'untracked_files_for_uploads'

# Tail of a FileUploader path: /:random_hex/:filename
FILE_UPLOADER_PATH = %r{/\h+/[^/]+\z}.freeze
# Captures everything that precedes that tail
FULL_PATH_CAPTURE = /\A(.+)#{FILE_UPLOADER_PATH}/.freeze

# Each pattern is tested against a path relative to the upload directory;
# the first matching entry wins. When a pattern has a capture group, the
# captured digits are the model_id.
PATH_PATTERNS = [
  { pattern: %r{\A-/system/appearance/logo/(\d+)/}, uploader: 'AttachmentUploader', model_type: 'Appearance' },
  { pattern: %r{\A-/system/appearance/header_logo/(\d+)/}, uploader: 'AttachmentUploader', model_type: 'Appearance' },
  { pattern: %r{\A-/system/note/attachment/(\d+)/}, uploader: 'AttachmentUploader', model_type: 'Note' },
  { pattern: %r{\A-/system/user/avatar/(\d+)/}, uploader: 'AvatarUploader', model_type: 'User' },
  { pattern: %r{\A-/system/group/avatar/(\d+)/}, uploader: 'AvatarUploader', model_type: 'Namespace' },
  { pattern: %r{\A-/system/project/avatar/(\d+)/}, uploader: 'AvatarUploader', model_type: 'Project' },
  { pattern: FILE_UPLOADER_PATH, uploader: 'FileUploader', model_type: 'Project' }
].freeze
# Attributes for the `uploads` row describing this file. Built once and
# memoized; building it raises when the path cannot be parsed.
def to_h
  return @upload_hash if @upload_hash

  @upload_hash = {
    path: upload_path,
    uploader: uploader,
    model_type: model_type,
    model_id: model_id,
    size: file_size,
    checksum: checksum
  }
end
# Value for Upload#path. For FileUploader files this is the trailing
# ":random_hex/:filename" part without a leading slash; for every other
# uploader it is `path` unchanged.
def upload_path
  return @upload_path if @upload_path

  return @upload_path = path unless uploader == 'FileUploader'

  # Path relative to the project directory in uploads
  hex_and_filename = path_relative_to_upload_dir.match(FILE_UPLOADER_PATH)[0]
  @upload_path = hex_and_filename.sub(%r{\A/}, '') # remove leading slash
end
# Uploader class name (a String) taken from the PATH_PATTERNS entry that
# matches this file's path. Raises when no entry matches.
def uploader
matching_pattern_map[:uploader]
end
# Model class name (a String) taken from the PATH_PATTERNS entry that
# matches this file's path. Raises when no entry matches.
def model_type
matching_pattern_map[:model_type]
end
# ID of the model that owns this file. Taken from the path when the matching
# pattern captures one, otherwise looked up through the project full path.
# May be nil; `defined?` is used so that a nil result is memoized as well.
def model_id
  return @model_id if defined?(@model_id)

  pattern = matching_pattern_map[:pattern]
  captured_id = path_relative_to_upload_dir.match(pattern)[1]

  # Only the FileUploader pattern has no capture group, so only it falls
  # through to the full-path lookup
  @model_id = captured_id.nil? ? file_uploader_model_id : captured_id.to_i
end
# Size of the file on disk as reported by File.size for its absolute path.
def file_size
File.size(absolute_path)
end
# Hex-encoded SHA256 digest of the file contents on disk.
def checksum
Digest::SHA256.file(absolute_path).hexdigest
end
private
# First PATH_PATTERNS entry whose pattern matches this file's path relative
# to the upload directory (memoized).
#
# Raises a RuntimeError when no entry matches.
def matching_pattern_map
  @matching_pattern_map ||= PATH_PATTERNS.find do |candidate|
    path_relative_to_upload_dir.match?(candidate[:pattern])
  end
  return @matching_pattern_map if @matching_pattern_map

  raise "Unknown upload path pattern \"#{path}\""
end
# Looks up the project ID for a FileUploader path through the project
# full_path that precedes "/:random_hex/:filename".
#
# Returns nil when no project has that full path.
# Raises a RuntimeError when no full path can be captured from the path.
def file_uploader_model_id
  captured = path_relative_to_upload_dir.match(FULL_PATH_CAPTURE)

  raise(<<~MSG) unless captured
    Could not capture project full_path from a FileUploader path:
    "#{path_relative_to_upload_dir}"
  MSG

  owner = Gitlab::BackgroundMigration::PopulateUntrackedUploadsDependencies::Project.find_by_full_path(captured[1])
  owner.id if owner
end
# `path` with the leading "<RELATIVE_UPLOAD_DIR>/" prefix removed, so the
# result has no leading slash. Memoized; the anchor regexp is built only on
# the first call instead of on every call.
def path_relative_to_upload_dir
  @path_relative_to_upload_dir ||= begin
    upload_dir = Gitlab::BackgroundMigration::PrepareUntrackedUploads::RELATIVE_UPLOAD_DIR
    path.sub(%r{\A#{Regexp.escape(upload_dir)}/}, '')
  end
end
# Location of the file on disk: `path` joined onto the configured uploads
# storage path.
def absolute_path
File.join(Gitlab.config.uploads.storage_path, path)
end
end
# Minimal model bound directly to the `uploads` table, defined here to
# avoid using application code.
class Upload < ActiveRecord::Base
self.table_name = 'uploads'
end
# Minimal model bound directly to the `appearances` table, defined here to
# avoid using application code.
class Appearance < ActiveRecord::Base
self.table_name = 'appearances'
end
# Minimal model bound directly to the `namespaces` table, defined here to
# avoid using application code.
class Namespace < ActiveRecord::Base
self.table_name = 'namespaces'
end
# Minimal model bound directly to the `notes` table, defined here to avoid
# using application code.
class Note < ActiveRecord::Base
self.table_name = 'notes'
end
# Minimal model bound directly to the `users` table, defined here to avoid
# using application code.
class User < ActiveRecord::Base
self.table_name = 'users'
end
# Project Markdown upload paths do not contain the project ID, so the
# project has to be found by its full_path. Due to MySQL/PostgreSQL
# differences, and historical reasons, that lookup is somewhat complex, so
# it has mostly been copied in here.
class Project < ActiveRecord::Base
  self.table_name = 'projects'

  # Returns one project whose route path equals `path` case-insensitively,
  # preferring an exact-case match, or nil when there is none.
  def self.find_by_full_path(path)
    exact_case_first = Arel.sql("(CASE WHEN routes.path = #{connection.quote(path)} THEN 0 ELSE 1 END)")

    where_full_path_in(path).reorder(exact_case_first).take
  end

  # Relation of projects joined to their routes, restricted to routes whose
  # path equals `path` case-insensitively.
  def self.where_full_path_in(path)
    path_condition = "(LOWER(routes.path) = LOWER(#{connection.quote(path)}))"
    routes_join = "INNER JOIN routes ON routes.source_id = projects.id AND routes.source_type = 'Project'"

    joins(routes_join).where(path_condition)
  end
end
end
end
end
# frozen_string_literal: true
module Gitlab
module BackgroundMigration
# This class finds all non-hashed uploaded file paths and saves them to a
# `untracked_files_for_uploads` table.
class PrepareUntrackedUploads # rubocop:disable Metrics/ClassLength
# For bulk_queue_background_migration_jobs_by_range
include Database::MigrationHelpers
include ::Gitlab::Utils::StrongMemoize
# Number of file paths collected before each bulk insert
FIND_BATCH_SIZE = 500
RELATIVE_UPLOAD_DIR = "uploads"
ABSOLUTE_UPLOAD_DIR = File.join(
  Gitlab.config.uploads.storage_path,
  RELATIVE_UPLOAD_DIR
)
# Name of the background migration scheduled once the paths are stored
FOLLOW_UP_MIGRATION = 'PopulateUntrackedUploads'
# Matches the storage path prefix of an absolute file path. The storage path
# is escaped so that regexp metacharacters in it (".", "+", "(", ...) are
# matched literally rather than interpreted as pattern syntax.
START_WITH_ROOT_REGEX = %r{\A#{Regexp.escape(Gitlab.config.uploads.storage_path.to_s)}/}.freeze
# `find -path` patterns for directories that are excluded from the search
EXCLUDED_HASHED_UPLOADS_PATH = "#{ABSOLUTE_UPLOAD_DIR}/@hashed/*"
EXCLUDED_TMP_UPLOADS_PATH = "#{ABSOLUTE_UPLOAD_DIR}/tmp/*"
# This class is used to iterate over batches of
# `untracked_files_for_uploads` rows. It is bound directly to that table
# and mixes in EachBatch.
class UntrackedFile < ActiveRecord::Base
include EachBatch
self.table_name = 'untracked_files_for_uploads'
end
# Stores the path of every untracked upload file in the temporary tracking
# table, then either schedules the follow-up jobs or, when no path was
# found, drops the table again.
def perform
  ensure_temporary_tracking_table_exists

  # Postgres < 9.5 has no ON CONFLICT DO NOTHING, and inserting
  # if-not-exists without it would be slow, so on those versions we start
  # from an empty table. That keeps bulk inserts possible, at ~30x the speed
  # of individual inserts (~20 minutes worth of inserts at GitLab.com scale
  # instead of ~10 hours).
  # Every other installation gets bulk inserts and can also retry these jobs
  # without having to clear and reinsert.
  clear_untracked_file_paths unless can_bulk_insert_and_ignore_duplicates?

  store_untracked_file_paths

  return drop_temp_table if UntrackedFile.all.empty?

  schedule_populate_untracked_uploads_jobs
end
private
# Creates the `untracked_files_for_uploads` table unless it already exists:
# a non-null `path` string column (limit 600) with a unique index.
def ensure_temporary_tracking_table_exists
  tracking_table = :untracked_files_for_uploads
  return if ActiveRecord::Base.connection.table_exists?(tracking_table)

  UntrackedFile.connection.create_table(tracking_table) do |t|
    t.string :path, limit: 600, null: false
    t.index :path, unique: true
  end
end
# Deletes every row from the temporary tracking table; the table itself is
# kept.
def clear_untracked_file_paths
UntrackedFile.delete_all
end
# Walks the upload directory and inserts the file paths it finds, one batch
# of FIND_BATCH_SIZE paths at a time. Does nothing when the upload directory
# does not exist.
def store_untracked_file_paths
  return unless Dir.exist?(ABSOLUTE_UPLOAD_DIR)

  each_file_batch(ABSOLUTE_UPLOAD_DIR, FIND_BATCH_SIZE) { |batch| insert_file_paths(batch) }
end
# Runs the find command for `search_dir` and yields the paths it prints to
# the block in arrays of at most `batch_size`.
#
# Raises a RuntimeError when the command exits unsuccessfully.
def each_file_batch(search_dir, batch_size, &block)
  find_command = build_find_command(search_dir)

  Open3.popen2(*find_command) do |_stdin, find_output, wait_thread|
    yield_paths_in_batches(find_output, batch_size, &block)

    # The exit status is only checked after all output has been consumed
    raise "Find command failed" unless wait_thread.value.success?
  end
end
# Reads NUL-separated paths from `stdout`, strips the storage root prefix
# from each and yields them in arrays of `batch_size`. A final, smaller
# array is yielded for any remainder; nothing is yielded for empty input.
def yield_paths_in_batches(stdout, batch_size, &block)
  batch = []

  stdout.each_line("\0") do |raw_path|
    batch.push(raw_path.chomp("\0").sub(START_WITH_ROOT_REGEX, ''))
    next if batch.size < batch_size

    yield(batch)
    batch = []
  end

  yield(batch) unless batch.empty?
end
# Builds the argv array for a `find` that prints every regular file under
# `search_dir` NUL-separated, following symlinks and excluding the hashed
# and tmp upload directories. When ionice is available, the command is
# prefixed so that it runs in the Idle I/O class. The command is logged.
def build_find_command(search_dir)
  find_args = %W[find -L #{search_dir}
                 -type f
                 ! ( -path #{EXCLUDED_HASHED_UPLOADS_PATH} -prune )
                 ! ( -path #{EXCLUDED_TMP_UPLOADS_PATH} -prune )
                 -print0]

  ionice_path = which_ionice
  command = ionice_path ? %W[#{ionice_path} -c Idle] + find_args : find_args

  Rails.logger.info "PrepareUntrackedUploads find command: \"#{command.join(' ')}\"" # rubocop:disable Gitlab/RailsLogger

  command
end
# Returns whatever Gitlab::Utils.which reports for `ionice`, or false when
# that lookup raises a StandardError.
def which_ionice
Gitlab::Utils.which('ionice')
rescue StandardError
# In this case, returning false is relatively safe,
# even though it isn't very nice
false
end
# Inserts one batch of file paths into the tracking table with a single SQL
# statement.
def insert_file_paths(file_paths)
  statement = insert_sql(file_paths)

  ActiveRecord::Base.connection.execute(statement)
end
# INSERT statement for the given paths. On Postgres 9.5 and later it ends
# with ON CONFLICT DO NOTHING; older versions get a plain INSERT.
def insert_sql(file_paths)
  conflict_clause = postgresql_pre_9_5? ? '' : ' ON CONFLICT DO NOTHING'

  "INSERT INTO #{table_columns_and_values_for_insert(file_paths)}#{conflict_clause};"
end
# "<table> (path) VALUES (...), (...)" fragment for an INSERT. Each path is
# passed through ActiveRecord's array sanitizer before being embedded.
def table_columns_and_values_for_insert(file_paths)
  value_tuples = file_paths.map do |file_path|
    ActiveRecord::Base.send(:sanitize_sql_array, ['(?)', file_path]) # rubocop:disable GitlabSecurity/PublicSend
  end

  "#{UntrackedFile.table_name} (path) VALUES #{value_tuples.join(', ')}"
end
# True when the database is not older than Postgres 9.5, the case in which
# #insert_sql appends ON CONFLICT DO NOTHING.
def can_bulk_insert_and_ignore_duplicates?
!postgresql_pre_9_5?
end
# True when the database version, converted with to_f, is below 9.5.
# The answer is memoized with strong_memoize.
def postgresql_pre_9_5?
strong_memoize(:postgresql_pre_9_5) do
Gitlab::Database.version.to_f < 9.5
end
end
# Queues the follow-up migration (FOLLOW_UP_MIGRATION) over ID ranges of the
# tracking table.
def schedule_populate_untracked_uploads_jobs
  bulk_queue_background_migration_jobs_by_range(UntrackedFile, FOLLOW_UP_MIGRATION)
end
# Drops the temporary tracking table if it exists. Skipped in the test
# environment.
def drop_temp_table
  return if Rails.env.test? # Dropping a table intermittently breaks test cleanup

  UntrackedFile.connection.drop_table(:untracked_files_for_uploads, if_exists: true)
end
end
end
end
# frozen_string_literal: true
require 'spec_helper'
RSpec.describe Gitlab::BackgroundMigration::PopulateUntrackedUploadsDependencies::UntrackedFile do
include MigrationsHelpers::TrackUntrackedUploadsHelpers
# Handles for the tables the examples touch.
# NOTE(review): `table` is presumably provided by the migration spec
# helpers - confirm.
let!(:appearances) { table(:appearances) }
let!(:namespaces) { table(:namespaces) }
let!(:projects) { table(:projects) }
let!(:routes) { table(:routes) }
let!(:uploads) { table(:uploads) }
# The tracking table must exist before described_class can create rows in it
before(:all) do
ensure_temporary_tracking_table_exists
end
describe '#upload_path' do
  def assert_upload_path(file_path, expected_upload_path)
    expect(create_untracked_file(file_path).upload_path).to eq(expected_upload_path)
  end

  # For these files the upload path is the stored path itself, i.e. the
  # upload dir followed by the path given here.
  {
    'an appearance logo' => '/-/system/appearance/logo/1/some_logo.jpg',
    'an appearance header_logo' => '/-/system/appearance/header_logo/1/some_logo.jpg',
    'a pre-Markdown Note attachment' => '/-/system/note/attachment/1234/some_attachment.pdf',
    'a user avatar' => '/-/system/user/avatar/1234/avatar.jpg',
    'a group avatar' => '/-/system/group/avatar/1234/avatar.jpg',
    'a project avatar' => '/-/system/project/avatar/1234/avatar.jpg'
  }.each do |file_kind, file_path|
    context "for #{file_kind} file path" do
      it 'returns the file path relative to the CarrierWave root' do
        assert_upload_path(file_path, "uploads#{file_path}")
      end
    end
  end

  context 'for a project Markdown attachment (notes, issues, MR descriptions) file path' do
    it 'returns the file path relative to the project directory in uploads' do
      project = create_project
      random_hex = SecureRandom.hex

      assert_upload_path("/#{get_full_path(project)}/#{random_hex}/Some file.jpg", "#{random_hex}/Some file.jpg")
    end
  end
end
describe '#uploader' do
  def assert_uploader(file_path, expected_uploader)
    expect(create_untracked_file(file_path).uploader).to eq(expected_uploader)
  end

  # [kind of file, path below the upload dir, expected uploader]
  [
    ['an appearance logo', '/-/system/appearance/logo/1/some_logo.jpg', 'AttachmentUploader'],
    ['an appearance header_logo', '/-/system/appearance/header_logo/1/some_logo.jpg', 'AttachmentUploader'],
    ['a pre-Markdown Note attachment', '/-/system/note/attachment/1234/some_attachment.pdf', 'AttachmentUploader'],
    ['a user avatar', '/-/system/user/avatar/1234/avatar.jpg', 'AvatarUploader'],
    ['a group avatar', '/-/system/group/avatar/1234/avatar.jpg', 'AvatarUploader'],
    ['a project avatar', '/-/system/project/avatar/1234/avatar.jpg', 'AvatarUploader']
  ].each do |file_kind, file_path, expected_uploader|
    context "for #{file_kind} file path" do
      it "returns #{expected_uploader} as a string" do
        assert_uploader(file_path, expected_uploader)
      end
    end
  end

  context 'for a project Markdown attachment (notes, issues, MR descriptions) file path' do
    it 'returns FileUploader as a string' do
      project = create_project

      assert_uploader("/#{get_full_path(project)}/#{SecureRandom.hex}/Some file.jpg", 'FileUploader')
    end
  end
end
describe '#model_type' do
  def assert_model_type(file_path, expected_model_type)
    expect(create_untracked_file(file_path).model_type).to eq(expected_model_type)
  end

  # [kind of file, path below the upload dir, expected model type]
  [
    ['an appearance logo', '/-/system/appearance/logo/1/some_logo.jpg', 'Appearance'],
    ['an appearance header_logo', '/-/system/appearance/header_logo/1/some_logo.jpg', 'Appearance'],
    ['a pre-Markdown Note attachment', '/-/system/note/attachment/1234/some_attachment.pdf', 'Note'],
    ['a user avatar', '/-/system/user/avatar/1234/avatar.jpg', 'User'],
    ['a group avatar', '/-/system/group/avatar/1234/avatar.jpg', 'Namespace'],
    ['a project avatar', '/-/system/project/avatar/1234/avatar.jpg', 'Project']
  ].each do |file_kind, file_path, expected_model_type|
    context "for #{file_kind} file path" do
      it "returns #{expected_model_type} as a string" do
        assert_model_type(file_path, expected_model_type)
      end
    end
  end

  context 'for a project Markdown attachment (notes, issues, MR descriptions) file path' do
    it 'returns Project as a string' do
      project = create_project

      assert_model_type("/#{get_full_path(project)}/#{SecureRandom.hex}/Some file.jpg", 'Project')
    end
  end
end
describe '#model_id' do
  def assert_model_id(file_path, expected_model_id)
    expect(create_untracked_file(file_path).model_id).to eq(expected_model_id)
  end

  # The expected IDs are Integers, so the examples are named accordingly.
  # [kind of file, path below the upload dir, expected model ID]
  [
    ['an appearance logo', '/-/system/appearance/logo/1/some_logo.jpg', 1],
    ['an appearance header_logo', '/-/system/appearance/header_logo/1/some_logo.jpg', 1],
    ['a pre-Markdown Note attachment', '/-/system/note/attachment/1234/some_attachment.pdf', 1234],
    ['a user avatar', '/-/system/user/avatar/1234/avatar.jpg', 1234],
    ['a group avatar', '/-/system/group/avatar/1234/avatar.jpg', 1234],
    ['a project avatar', '/-/system/project/avatar/1234/avatar.jpg', 1234]
  ].each do |file_kind, file_path, expected_model_id|
    context "for #{file_kind} file path" do
      it 'returns the ID as an integer' do
        assert_model_id(file_path, expected_model_id)
      end
    end
  end

  context 'for a project Markdown attachment (notes, issues, MR descriptions) file path' do
    it 'returns the ID as an integer' do
      project = create_project

      assert_model_id("/#{get_full_path(project)}/#{SecureRandom.hex}/Some file.jpg", project.id)
    end
  end
end
# Every context expects 1062.
# NOTE(review): presumably the byte size of the fixture file the helpers
# attach - confirm against the helper module.
describe '#file_size' do
context 'for an appearance logo file path' do
let(:appearance) { create_or_update_appearance(logo: true) }
# The tracked upload's own path is reused as the untracked file path
let(:untracked_file) { described_class.create!(path: get_uploads(appearance, 'Appearance').first.path) }
it 'returns the file size' do
expect(untracked_file.file_size).to eq(1062)
end
end
context 'for a project avatar file path' do
let(:project) { create_project(avatar: true) }
let(:untracked_file) { described_class.create!(path: get_uploads(project, 'Project').first.path) }
it 'returns the file size' do
expect(untracked_file.file_size).to eq(1062)
end
end
context 'for a project Markdown attachment (notes, issues, MR descriptions) file path' do
let(:project) { create_project }
# The upload's path is prefixed with the project full path here
let(:untracked_file) { create_untracked_file("/#{get_full_path(project)}/#{get_uploads(project, 'Project').first.path}") }
# The attachment is added before the lazy `untracked_file` is first used
before do
add_markdown_attachment(project)
end
it 'returns the file size' do
expect(untracked_file.file_size).to eq(1062)
end
end
end
# Creates a tracking row whose path is the relative upload dir followed by
# the given path, which must start with a slash.
def create_untracked_file(path_relative_to_upload_dir)
  upload_dir = Gitlab::BackgroundMigration::PrepareUntrackedUploads::RELATIVE_UPLOAD_DIR

  described_class.create!(path: "#{upload_dir}#{path_relative_to_upload_dir}")
end
end
# frozen_string_literal: true
require 'spec_helper'
RSpec.describe Gitlab::BackgroundMigration::PopulateUntrackedUploads do
include MigrationsHelpers::TrackUntrackedUploadsHelpers
# A fresh migration instance per example
subject { described_class.new }
# Handles for the tables the examples touch.
# NOTE(review): `table` is presumably provided by the migration spec
# helpers - confirm.
let!(:appearances) { table(:appearances) }
let!(:namespaces) { table(:namespaces) }
let!(:notes) { table(:notes) }
let!(:projects) { table(:projects) }
let!(:routes) { table(:routes) }
let!(:untracked_files_for_uploads) { table(:untracked_files_for_uploads) }
let!(:uploads) { table(:uploads) }
let!(:users) { table(:users) }
# Each example starts with the tracking table present and `uploads` empty
before do
ensure_temporary_tracking_table_exists
uploads.delete_all
end
context 'with untracked files and tracked files in untracked_files_for_uploads' do
  let!(:appearance) { create_or_update_appearance(logo: true, header_logo: true) }
  let!(:user1) { create_user(avatar: true) }
  let!(:user2) { create_user(avatar: true) }
  let!(:project1) { create_project(avatar: true) }
  let!(:project2) { create_project(avatar: true) }

  # A path that matches none of the known upload path patterns
  let(:bad_path) { "#{Gitlab::BackgroundMigration::PrepareUntrackedUploads::RELATIVE_UPLOAD_DIR}/#{get_full_path(project2)}/._7d37bf4c747916390e596744117d5d1a" }

  before do
    add_markdown_attachment(project1)
    add_markdown_attachment(project2)

    # File records created by PrepareUntrackedUploads
    untracked_files_for_uploads.create!(path: get_uploads(appearance, 'Appearance').first.path)
    untracked_files_for_uploads.create!(path: get_uploads(appearance, 'Appearance').last.path)
    untracked_files_for_uploads.create!(path: get_uploads(user1, 'User').first.path)
    untracked_files_for_uploads.create!(path: get_uploads(user2, 'User').first.path)
    untracked_files_for_uploads.create!(path: get_uploads(project1, 'Project').first.path)
    untracked_files_for_uploads.create!(path: get_uploads(project2, 'Project').first.path)
    untracked_files_for_uploads.create!(path: "#{legacy_project_uploads_dir(project1).sub("#{MigrationsHelpers::TrackUntrackedUploadsHelpers::PUBLIC_DIR}/", '')}/#{get_uploads(project1, 'Project').last.path}")
    untracked_files_for_uploads.create!(path: "#{legacy_project_uploads_dir(project2).sub("#{MigrationsHelpers::TrackUntrackedUploadsHelpers::PUBLIC_DIR}/", '')}/#{get_uploads(project2, 'Project').last.path}")

    # Untrack 4 files
    get_uploads(user2, 'User').delete_all
    get_uploads(project2, 'Project').delete_all # 2 files: avatar and a Markdown upload
    get_uploads(appearance, 'Appearance').where("path like '%header_logo%'").delete_all
  end

  it 'adds untracked files to the uploads table' do
    expect do
      subject.perform(1, untracked_files_for_uploads.reorder(:id).last.id)
    end.to change { uploads.count }.from(4).to(8)

    expect(get_uploads(user2, 'User').count).to eq(1)
    expect(get_uploads(project2, 'Project').count).to eq(2)
    expect(get_uploads(appearance, 'Appearance').count).to eq(2)
  end

  it 'deletes rows after processing them' do
    expect(subject).to receive(:drop_temp_table_if_finished) # Don't drop the table so we can look at it

    expect do
      subject.perform(1, untracked_files_for_uploads.last.id)
    end.to change { untracked_files_for_uploads.count }.from(8).to(0)
  end

  it 'does not create duplicate uploads of already tracked files' do
    subject.perform(1, untracked_files_for_uploads.last.id)

    expect(get_uploads(user1, 'User').count).to eq(1)
    expect(get_uploads(project1, 'Project').count).to eq(2)
    expect(get_uploads(appearance, 'Appearance').count).to eq(2)
  end

  it 'uses the start and end batch ids [only 1st half]' do
    ids = untracked_files_for_uploads.all.order(:id).pluck(:id)
    start_id = ids[0]
    end_id = ids[3]

    expect do
      subject.perform(start_id, end_id)
    end.to change { uploads.count }.from(4).to(6)

    expect(get_uploads(user1, 'User').count).to eq(1)
    expect(get_uploads(user2, 'User').count).to eq(1)
    expect(get_uploads(appearance, 'Appearance').count).to eq(2)
    expect(get_uploads(project1, 'Project').count).to eq(2)
    expect(get_uploads(project2, 'Project').count).to eq(0)

    # Only 4 have been either confirmed or added to uploads
    expect(untracked_files_for_uploads.count).to eq(4)
  end

  it 'uses the start and end batch ids [only 2nd half]' do
    ids = untracked_files_for_uploads.all.order(:id).pluck(:id)
    start_id = ids[4]
    end_id = ids[7]

    expect do
      subject.perform(start_id, end_id)
    end.to change { uploads.count }.from(4).to(6)

    expect(get_uploads(user1, 'User').count).to eq(1)
    expect(get_uploads(user2, 'User').count).to eq(0)
    expect(get_uploads(appearance, 'Appearance').count).to eq(1)
    expect(get_uploads(project1, 'Project').count).to eq(2)
    expect(get_uploads(project2, 'Project').count).to eq(2)

    # Only 4 have been either confirmed or added to uploads
    expect(untracked_files_for_uploads.count).to eq(4)
  end

  it 'does not drop the temporary tracking table after processing the batch, if there are still untracked rows' do
    subject.perform(1, untracked_files_for_uploads.last.id - 1)

    expect(ActiveRecord::Base.connection.table_exists?(:untracked_files_for_uploads)).to be_truthy
  end

  it 'drops the temporary tracking table after processing the batch, if there are no untracked rows left' do
    expect(subject).to receive(:drop_temp_table_if_finished)

    subject.perform(1, untracked_files_for_uploads.last.id)
  end

  it 'does not block a whole batch because of one bad path' do
    untracked_files_for_uploads.create!(path: bad_path)

    expect(untracked_files_for_uploads.count).to eq(9)
    expect(uploads.count).to eq(4)

    subject.perform(1, untracked_files_for_uploads.last.id)

    # Only the unparseable row is left behind
    expect(untracked_files_for_uploads.count).to eq(1)
    expect(uploads.count).to eq(8)
  end

  it 'an unparseable path is shown in error output' do
    untracked_files_for_uploads.create!(path: bad_path)

    # The path is escaped so its characters (e.g. ".") are matched literally
    expect(Rails.logger).to receive(:error).with(/Error parsing path "#{Regexp.escape(bad_path)}":/)

    subject.perform(1, untracked_files_for_uploads.last.id)
  end
end
context 'with no untracked files' do
  # The ID range matches no rows, so perform must be a harmless no-op
  it 'does not add to the uploads table (and does not raise error)' do
    expect { subject.perform(1, 1000) }.not_to change { uploads.count }.from(0)
  end
end
# One context per supported path pattern. Each one creates a tracked upload,
# records its attributes, queues its path as an untracked file, deletes the
# upload row and then expects the migration to recreate it.
describe 'upload outcomes for each path pattern' do
# Expects the including context to define `model_uploads`.
shared_examples_for 'non_markdown_file' do
# Both are eager (let!) so that they are captured before the `before` block
# below deletes the upload rows
let!(:expected_upload_attrs) { model_uploads.first.attributes.slice('path', 'uploader', 'size', 'checksum') }
let!(:untracked_file) { untracked_files_for_uploads.create!(path: expected_upload_attrs['path']) }
before do
model_uploads.delete_all
end
it 'creates an Upload record' do
expect do
subject.perform(1, untracked_files_for_uploads.last.id)
end.to change { model_uploads.count }.from(0).to(1)
expect(model_uploads.first.attributes).to include(expected_upload_attrs)
end
end
context 'for an appearance logo file path' do
let(:model) { create_or_update_appearance(logo: true) }
let(:model_uploads) { get_uploads(model, 'Appearance') }
it_behaves_like 'non_markdown_file'
end
context 'for an appearance header_logo file path' do
let(:model) { create_or_update_appearance(header_logo: true) }
let(:model_uploads) { get_uploads(model, 'Appearance') }
it_behaves_like 'non_markdown_file'
end
context 'for a pre-Markdown Note attachment file path' do
let(:model) { create_note(attachment: true) }
let!(:expected_upload_attrs) { get_uploads(model, 'Note').first.attributes.slice('path', 'uploader', 'size', 'checksum') }
let!(:untracked_file) { untracked_files_for_uploads.create!(path: expected_upload_attrs['path']) }
before do
get_uploads(model, 'Note').delete_all
end
# Can't use the shared example because Note doesn't have an `uploads` association
it 'creates an Upload record' do
expect do
subject.perform(1, untracked_files_for_uploads.last.id)
end.to change { get_uploads(model, 'Note').count }.from(0).to(1)
expect(get_uploads(model, 'Note').first.attributes).to include(expected_upload_attrs)
end
end
context 'for a user avatar file path' do
let(:model) { create_user(avatar: true) }
let(:model_uploads) { get_uploads(model, 'User') }
it_behaves_like 'non_markdown_file'
end
context 'for a group avatar file path' do
let(:model) { create_group(avatar: true) }
let(:model_uploads) { get_uploads(model, 'Namespace') }
it_behaves_like 'non_markdown_file'
end
context 'for a project avatar file path' do
let(:model) { create_project(avatar: true) }
let(:model_uploads) { get_uploads(model, 'Project') }
it_behaves_like 'non_markdown_file'
end
context 'for a project Markdown attachment (notes, issues, MR descriptions) file path' do
let(:model) { create_project }
# The order of the steps below matters: the attributes must be read before
# the upload row is deleted
before do
# Upload the file
add_markdown_attachment(model)
# Create the untracked_files_for_uploads record
untracked_files_for_uploads.create!(path: "#{Gitlab::BackgroundMigration::PrepareUntrackedUploads::RELATIVE_UPLOAD_DIR}/#{get_full_path(model)}/#{get_uploads(model, 'Project').first.path}")
# Save the expected upload attributes
@expected_upload_attrs = get_uploads(model, 'Project').first.attributes.slice('path', 'uploader', 'size', 'checksum')
# Untrack the file
get_uploads(model, 'Project').delete_all
end
it 'creates an Upload record' do
expect do
subject.perform(1, untracked_files_for_uploads.last.id)
end.to change { get_uploads(model, 'Project').count }.from(0).to(1)
expect(get_uploads(model, 'Project').first.attributes).to include(@expected_upload_attrs)
end
end
end
end
# frozen_string_literal: true
require 'spec_helper'
# Rollback DB to 10.5 (later than this was originally written for) because it still needs to work.
# Specs for the PrepareUntrackedUploads background migration. They check that
# `perform` fills the temporary `untracked_files_for_uploads` table with the
# expected upload paths and schedules the follow-up migration.
RSpec.describe Gitlab::BackgroundMigration::PrepareUntrackedUploads do
  include MigrationsHelpers::TrackUntrackedUploadsHelpers

  # Table accessors used directly here and by the included helpers.
  let!(:untracked_files_for_uploads) { table(:untracked_files_for_uploads) }
  let!(:appearances) { table(:appearances) }
  let!(:namespaces) { table(:namespaces) }
  let!(:projects) { table(:projects) }
  let!(:routes) { table(:routes) }
  let!(:uploads) { table(:uploads) }
  let!(:users) { table(:users) }

  around do |example|
    # Especially important so the follow-up migration does not get run
    Sidekiq::Testing.fake! do
      example.run
    end
  end

  shared_examples 'prepares the untracked_files_for_uploads table' do
    context 'when files were uploaded before and after hashed storage was enabled' do
      # Five legacy-path files in total: two appearance logos, one user avatar,
      # one project avatar and (added below) one Markdown attachment on project1.
      let!(:appearance) { create_or_update_appearance(logo: true, header_logo: true) }
      let!(:user) { create_user(avatar: true) }
      let!(:project1) { create_project(avatar: true) }
      let(:project2) { create_project } # instantiate after enabling hashed_storage

      before do
        # Markdown upload before enabling hashed_storage
        add_markdown_attachment(project1)

        # Markdown upload after enabling hashed_storage
        add_markdown_attachment(project2, hashed_storage: true)
      end

      it 'has a path field long enough for really long paths' do
        described_class.new.perform

        component = 'a' * 255

        # 'uploads' (7) + '/' + 255 + '/' + 255 = 519 characters
        long_path = [
          'uploads',
          component, # project.full_path
          component # filename
        ].join('/')

        record = untracked_files_for_uploads.create!(path: long_path)
        expect(record.reload.path.size).to eq(519)
      end

      it 'adds unhashed files to the untracked_files_for_uploads table' do
        described_class.new.perform

        expect(untracked_files_for_uploads.count).to eq(5)
      end

      it 'adds files with paths relative to CarrierWave.root' do
        described_class.new.perform

        untracked_files_for_uploads.all.each do |file|
          # The matcher reports the offending path when the expectation fails
          expect(file.path).to start_with('uploads/')
        end
      end

      it 'does not add hashed files to the untracked_files_for_uploads table' do
        described_class.new.perform

        hashed_file_path = get_uploads(project2, 'Project').find_by(uploader: 'FileUploader').path

        # Bind the pattern rather than interpolating it into the SQL string
        matching_files = untracked_files_for_uploads.where('path LIKE ?', "%#{hashed_file_path}%")
        expect(matching_files.exists?).to be_falsey
      end

      it 'correctly schedules the follow-up background migration jobs' do
        described_class.new.perform

        ids = described_class::UntrackedFile.all.order(:id).pluck(:id)
        expect(described_class::FOLLOW_UP_MIGRATION).to be_scheduled_migration(ids.first, ids.last)
        expect(BackgroundMigrationWorker.jobs.size).to eq(1)
      end

      # E.g. from a previous failed run of this background migration
      context 'when there is existing data in untracked_files_for_uploads' do
        before do
          described_class.new.perform
        end

        it 'does not error or produce duplicates of existing data' do
          expect do
            described_class.new.perform
          end.not_to change { untracked_files_for_uploads.count }.from(5)
        end
      end

      # E.g. The installation is in use at the time of migration, and someone has
      # just uploaded a file
      context 'when there are files in /uploads/tmp' do
        let(:tmp_file) { Rails.root.join(described_class::ABSOLUTE_UPLOAD_DIR, 'tmp', 'some_file.jpg') }

        before do
          # mkdir_p instead of mkdir: the `after` hook removes only the file, so
          # the directory may already exist and plain mkdir would raise EEXIST.
          FileUtils.mkdir_p(File.dirname(tmp_file))
          FileUtils.touch(tmp_file)
        end

        after do
          FileUtils.rm(tmp_file)
        end

        it 'does not add files from /uploads/tmp' do
          described_class.new.perform

          expect(untracked_files_for_uploads.count).to eq(5)
        end
      end

      context 'when the last batch size exactly matches the max batch size' do
        it 'does not raise error' do
          stub_const("#{described_class}::FIND_BATCH_SIZE", 5)

          expect do
            described_class.new.perform
          end.not_to raise_error

          expect(untracked_files_for_uploads.count).to eq(5)
        end
      end
    end
  end

  # If running on Postgres 9.2 (like on CI), this whole context is skipped
  # since we're unable to use ON CONFLICT DO NOTHING or IGNORE.
  context "test bulk insert with ON CONFLICT DO NOTHING or IGNORE", if: described_class.new.send(:can_bulk_insert_and_ignore_duplicates?) do
    it_behaves_like 'prepares the untracked_files_for_uploads table'
  end

  # If running on Postgres 9.2 (like on CI), the stubbed method has no effect.
  #
  # If running on Postgres 9.5+ or MySQL, then this context effectively tests
  # the bulk insert functionality without ON CONFLICT DO NOTHING or IGNORE.
  context 'test bulk insert without ON CONFLICT DO NOTHING or IGNORE' do
    before do
      allow_any_instance_of(described_class).to receive(:postgresql_pre_9_5?).and_return(true)
    end

    it_behaves_like 'prepares the untracked_files_for_uploads table'
  end

  # Very new or lightly-used installations that are running this migration
  # may not have an upload directory because they have no uploads.
  context 'when no files were ever uploaded' do
    it 'deletes the `untracked_files_for_uploads` table (and does not raise error)' do
      background_migration = described_class.new

      expect(background_migration).to receive(:drop_temp_table)

      background_migration.perform
    end
  end
end
# frozen_string_literal: true
module MigrationsHelpers
  # Fixture helpers for the untracked-uploads background migration specs.
  #
  # The including spec must supply the table accessors these helpers call but
  # do not define: `appearances`, `namespaces`, `notes`, `projects`, `routes`,
  # `uploads` and `users`.
  #
  # Records are created with the bang variants (`create!`, `first_or_create!`)
  # so that a record which cannot be saved fails at the point of creation
  # instead of surfacing later as a nil id.
  module TrackUntrackedUploadsHelpers
    PUBLIC_DIR = File.join(Rails.root, 'tmp', 'tests', 'public')
    UPLOADS_DIR = File.join(PUBLIC_DIR, 'uploads')
    SYSTEM_DIR = File.join(UPLOADS_DIR, '-', 'system')
    UPLOAD_FILENAME = 'image.png'.freeze
    FIXTURE_FILE_PATH = File.join(Rails.root, 'spec', 'fixtures', 'dk.png')
    # NOTE(review): presumably the SHA-256 of the fixture file - confirm.
    FIXTURE_CHECKSUM = 'b804383982bb89b00e828e3f44c038cc991d3d1768009fc39ba8e2c081b9fb75'.freeze

    # Returns the first appearance row, creating one if none exists, and adds a
    # tracked upload for each requested logo.
    #
    # @param logo [Boolean] attach a `logo` upload
    # @param header_logo [Boolean] attach a `header_logo` upload
    # @return the appearance record
    def create_or_update_appearance(logo: false, header_logo: false)
      appearance = appearances.first_or_create!(
        title: 'foo',
        description: 'bar',
        logo: (UPLOAD_FILENAME if logo),
        header_logo: (UPLOAD_FILENAME if header_logo)
      )

      add_upload(appearance, 'Appearance', 'logo', 'AttachmentUploader') if logo
      add_upload(appearance, 'Appearance', 'header_logo', 'AttachmentUploader') if header_logo

      appearance
    end

    # Creates a uniquely named group (a row in `namespaces`), optionally with a
    # tracked avatar upload.
    #
    # @param avatar [Boolean] attach an avatar upload
    # @return the namespace record
    def create_group(avatar: false)
      index = unique_index(:group)
      group = namespaces.create!(name: "group#{index}", path: "group#{index}", avatar: (UPLOAD_FILENAME if avatar))

      add_upload(group, 'Group', 'avatar', 'AvatarUploader') if avatar

      group
    end

    # Creates a note, optionally with a tracked attachment upload.
    #
    # @param attachment [Boolean] attach an attachment upload
    # @return the note record
    def create_note(attachment: false)
      note = notes.create!(attachment: (UPLOAD_FILENAME if attachment))

      add_upload(note, 'Note', 'attachment', 'AttachmentUploader') if attachment

      note
    end

    # Creates a project inside a fresh group, together with its route, and
    # optionally a tracked avatar upload.
    #
    # @param avatar [Boolean] attach an avatar upload
    # @return the project record
    def create_project(avatar: false)
      group = create_group
      project = projects.create!(namespace_id: group.id, path: "project#{unique_index(:project)}", avatar: (UPLOAD_FILENAME if avatar))
      routes.create!(path: "#{group.path}/#{project.path}", source_id: project.id, source_type: 'Project') # so Project.find_by_full_path works

      add_upload(project, 'Project', 'avatar', 'AvatarUploader') if avatar

      project
    end

    # Creates a user with a unique email, optionally with a tracked avatar upload.
    #
    # @param avatar [Boolean] attach an avatar upload
    # @return the user record
    def create_user(avatar: false)
      user = users.create!(email: "foo#{unique_index(:user)}@bar.com", avatar: (UPLOAD_FILENAME if avatar), projects_limit: 100)

      add_upload(user, 'User', 'avatar', 'AvatarUploader') if avatar

      user
    end

    # Returns the next value of a per-name counter, starting at 1.
    #
    # @param name [Symbol] counter name
    # @return [Integer]
    def unique_index(name = :unnamed)
      @unique_index ||= {}
      @unique_index[name] ||= 0
      @unique_index[name] += 1
    end

    # Copies the fixture file into the model's upload location and records it
    # in `uploads`.
    #
    # @param model record that owns the upload
    # @param model_type [String] e.g. 'User'; 'Group' is stored as 'Namespace'
    # @param attachment_type [String] e.g. 'avatar'
    # @param uploader [String] uploader class name stored on the row
    # @return the created upload record
    def add_upload(model, model_type, attachment_type, uploader)
      file_path = upload_file_path(model, model_type, attachment_type)
      path_relative_to_public = file_path.sub("#{PUBLIC_DIR}/", '')
      create_file(file_path)

      uploads.create!(
        size: 1062, # NOTE(review): presumably the byte size of the fixture file - confirm
        path: path_relative_to_public,
        model_id: model.id,
        model_type: model_type == 'Group' ? 'Namespace' : model_type,
        uploader: uploader,
        checksum: FIXTURE_CHECKSUM
      )
    end

    # Copies the fixture file into a random subdirectory of the project's
    # uploads directory and records it in `uploads` with a path relative to
    # that directory.
    #
    # @param project project record that owns the attachment
    # @param hashed_storage [Boolean] use the hashed directory instead of the legacy one
    # @return the created upload record
    def add_markdown_attachment(project, hashed_storage: false)
      project_dir = hashed_storage ? hashed_project_uploads_dir(project) : legacy_project_uploads_dir(project)
      attachment_dir = File.join(project_dir, SecureRandom.hex)
      attachment_file_path = File.join(attachment_dir, UPLOAD_FILENAME)
      project_attachment_path_relative_to_project = attachment_file_path.sub("#{project_dir}/", '')
      create_file(attachment_file_path)

      uploads.create!(
        size: 1062,
        path: project_attachment_path_relative_to_project,
        model_id: project.id,
        model_type: 'Project',
        uploader: 'FileUploader',
        checksum: FIXTURE_CHECKSUM
      )
    end

    # @return [String] `<uploads>/<namespace path>/<project path>`
    def legacy_project_uploads_dir(project)
      namespace = namespaces.find_by(id: project.namespace_id)
      File.join(UPLOADS_DIR, namespace.path, project.path)
    end

    # Returns a fixed hashed-storage directory; `project` is accepted to mirror
    # `legacy_project_uploads_dir` but is not used.
    #
    # @return [String]
    def hashed_project_uploads_dir(project)
      File.join(UPLOADS_DIR, '@hashed', 'aa', 'aaaaaaaaaaaa')
    end

    # @return [String] `<system dir>/<model type, downcased>/<attachment type>/<model id>/<filename>`
    def upload_file_path(model, model_type, attachment_type)
      dir = File.join(upload_dir(model_type.downcase, attachment_type.to_s), model.id.to_s)
      File.join(dir, UPLOAD_FILENAME)
    end

    # @return [String] `<system dir>/<model_type>/<attachment_type>`
    def upload_dir(model_type, attachment_type)
      File.join(SYSTEM_DIR, model_type, attachment_type)
    end

    # Places a copy of the fixture file at `path`, replacing any existing file
    # and creating parent directories as needed.
    def create_file(path)
      File.delete(path) if File.exist?(path)
      FileUtils.mkdir_p(File.dirname(path))
      FileUtils.cp(FIXTURE_FILE_PATH, path)
    end

    # @return relation of `uploads` rows for the given model type and model id
    def get_uploads(model, model_type)
      uploads.where(model_type: model_type, model_id: model.id)
    end

    # @return [String] the project's path as stored in `routes`
    def get_full_path(project)
      routes.find_by(source_id: project.id, source_type: 'Project').path
    end

    # Delegates to the method of the same name on PrepareUntrackedUploads,
    # called through `send`.
    def ensure_temporary_tracking_table_exists
      Gitlab::BackgroundMigration::PrepareUntrackedUploads.new.send(:ensure_temporary_tracking_table_exists)
    end
  end
end
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment