Commit c50f3e8e authored by Nick Thomas's avatar Nick Thomas

Foreground verification of uploads and LFS objects

parent 68a13836
......@@ -18,4 +18,8 @@ class LfsObject < ActiveRecord::Base
.where(lfs_objects_projects: { id: nil })
.destroy_all
end
# Computes the OID for a file on disk.
#
# @param path [String] location of the file to hash
# @return [String] hex-encoded SHA-256 digest of the file's contents
def self.calculate_oid(path)
  sha256 = Digest::SHA256.new
  sha256.file(path)
  sha256.hexdigest
end
end
---
title: Foreground verification of uploads and LFS objects
merge_request: 17402
author:
type: added
......@@ -78,34 +78,41 @@ Example output:
## Uploaded Files Integrity
The uploads check Rake task will loop through all uploads in the database
and run two checks to determine the integrity of each file:
Various types of files can be uploaded to a GitLab installation by users.
Checksums are generated and stored in the database upon upload, and integrity
checks using those checksums can be run. These checks also detect missing files.
1. Check if the file exists on the file system.
1. Check if the checksum of the file on the file system matches the checksum in the database.
Currently, integrity checks are supported for the following types of file:
* LFS objects
* User uploads
**Omnibus Installation**
```
sudo gitlab-rake gitlab:lfs:check
sudo gitlab-rake gitlab:uploads:check
```
**Source Installation**
```bash
sudo -u git -H bundle exec rake gitlab:lfs:check RAILS_ENV=production
sudo -u git -H bundle exec rake gitlab:uploads:check RAILS_ENV=production
```
This task also accepts some environment variables which you can use to override
These tasks also accept some environment variables which you can use to override
certain values:
Variable | Type | Description
-------- | ---- | -----------
--------- | ------- | -----------
`BATCH` | integer | Specifies the size of the batch. Defaults to 200.
`ID_FROM` | integer | Specifies the ID to start from, inclusive of the value.
`ID_TO` | integer | Specifies the ID value to end at, inclusive of the value.
`VERBOSE` | boolean | Causes failures to be listed individually, rather than being summarized.
```bash
sudo gitlab-rake gitlab:lfs:check BATCH=100 ID_FROM=50 ID_TO=250
sudo gitlab-rake gitlab:uploads:check BATCH=100 ID_FROM=50 ID_TO=250
```
......
module Gitlab
  module Verify
    # Abstract base class that walks a relation in batches and compares each
    # record's stored checksum with a freshly recalculated one.
    #
    # Subclasses must implement #name, #describe, #relation,
    # #expected_checksum and #actual_checksum.
    class BatchVerifier
      attr_reader :batch_size, :start, :finish

      # @param batch_size [Integer] number of records per batch
      # @param start [Object, nil] forwarded to in_batches as `start`
      # @param finish [Object, nil] forwarded to in_batches as `finish`
      def initialize(batch_size:, start: nil, finish: nil)
        @batch_size = batch_size
        @start = start
        @finish = finish
      end

      # Yields a Range of IDs and a Hash of failed verifications (object => error)
      def run_batches
        # The block parameter is deliberately not called `relation`, so it
        # cannot be confused with the private #relation method.
        relation.in_batches(of: batch_size, start: start, finish: finish) do |batch| # rubocop: disable Cop/InBatches
          range = batch.first.id..batch.last.id
          failures = run_batch(batch)

          yield(range, failures)
        end
      end

      # Human-readable name of the kind of object being verified.
      def name
        raise NotImplementedError, "#{self.class} must implement ##{__method__}"
      end

      # Human-readable description of a single object.
      def describe(_object)
        raise NotImplementedError, "#{self.class} must implement ##{__method__}"
      end

      private

      # Verifies every object in the batch.
      #
      # @return [Hash] failed objects mapped to the error that was raised
      def run_batch(batch)
        batch.map { |object| verify(object) }.compact.to_h
      end

      # Returns nil when the object verifies, or an [object, error] pair when
      # it does not. Any StandardError raised while obtaining either checksum
      # is captured and reported as the failure.
      def verify(object)
        expected = expected_checksum(object)
        actual = actual_checksum(object)

        raise 'Checksum missing' unless expected.present?
        raise 'Checksum mismatch' unless expected == actual

        nil
      rescue => err
        [object, err]
      end

      # This should return an ActiveRecord::Relation suitable for calling #in_batches on
      def relation
        raise NotImplementedError, "#{self.class} must implement ##{__method__}"
      end

      # The checksum we expect the object to have
      def expected_checksum(_object)
        raise NotImplementedError, "#{self.class} must implement ##{__method__}"
      end

      # The freshly-recalculated checksum of the object
      def actual_checksum(_object)
        raise NotImplementedError, "#{self.class} must implement ##{__method__}"
      end
    end
  end
end
module Gitlab
  module Verify
    # Verifier for LFS objects: each record's stored OID is compared with the
    # OID recalculated from its file.
    class LfsObjects < BatchVerifier
      # Label for the kind of record being verified.
      def name
        'LFS objects'
      end

      # Identifies a single LFS object by its OID.
      def describe(object)
        "LFS object: #{object.oid}"
      end

      private

      # Every LFS object is a candidate for verification.
      def relation
        LfsObject.all
      end

      # The OID recorded for the object.
      def expected_checksum(record)
        record.oid
      end

      # The OID recalculated from the object's file.
      def actual_checksum(record)
        file_path = record.file.path

        LfsObject.calculate_oid(file_path)
      end
    end
  end
end
module Gitlab
  module Verify
    # Console front end for a verifier: runs it batch by batch and prints a
    # colored summary line per batch, plus per-object details when verbose.
    class RakeTask
      attr_reader :verifier, :output

      # Builds a verifier of the given class from the BATCH, ID_FROM and ID_TO
      # environment variables, then runs it. VERBOSE toggles detailed output.
      def self.run!(verify_kls)
        batch_size = ENV.fetch('BATCH', 200).to_i
        verifier = verify_kls.new(batch_size: batch_size, start: ENV['ID_FROM'], finish: ENV['ID_TO'])
        verbose = Gitlab::Utils.to_boolean(ENV['VERBOSE'])

        new(verifier, verbose).run!
      end

      def initialize(verifier, verbose)
        @verifier = verifier
        @verbose = verbose
      end

      # Prints a heading, one report per batch, and a closing line.
      def run!
        say "Checking integrity of #{verifier.name}"

        verifier.run_batches(&method(:run_batch))

        say 'Done!'
      end

      # Whether individual failures should be listed.
      def verbose?
        @verbose ? true : false
      end

      private

      def say(text)
        puts(text) # rubocop:disable Rails/Output
      end

      # Reports one batch: a summary line (green when clean, red otherwise),
      # followed by one line per failure when verbose.
      def run_batch(range, failures)
        status_color =
          if failures.empty?
            :green
          else
            :red
          end

        summary = "- #{range}: Failures: #{failures.count}"
        say(summary.color(status_color))

        if verbose?
          failures.each do |object, error|
            detail = " - #{verifier.describe(object)}: #{error.inspect}"
            say(detail.color(:red))
          end
        end
      end
    end
  end
end
module Gitlab
  module Verify
    # Verifier for uploads: each record's stored checksum is compared with
    # the digest recalculated from the file at its absolute path.
    class Uploads < BatchVerifier
      # Label for the kind of record being verified.
      def name
        'Uploads'
      end

      # Identifies a single upload by its ID.
      def describe(object)
        "Upload: #{object.id}"
      end

      private

      # Every upload is a candidate for verification.
      def relation
        Upload.all
      end

      # The checksum recorded for the upload.
      def expected_checksum(record)
        record.checksum
      end

      # The digest recalculated from the uploaded file.
      def actual_checksum(record)
        file_path = record.absolute_path

        Upload.hexdigest(file_path)
      end
    end
  end
end
# gitlab:lfs:check - runs the LFS object verifier through the shared
# Gitlab::Verify::RakeTask runner.
namespace :gitlab do
  namespace :lfs do
    desc 'GitLab | LFS | Check integrity of uploaded LFS objects'
    task(check: :environment) do
      verifier_class = Gitlab::Verify::LfsObjects

      Gitlab::Verify::RakeTask.run!(verifier_class)
    end
  end
end
# gitlab:uploads:check - runs the uploads verifier through the shared
# Gitlab::Verify::RakeTask runner.
namespace :gitlab do
  namespace :uploads do
    desc 'GitLab | Uploads | Check integrity of uploaded files'
    task(check: :environment) do
      verifier_class = Gitlab::Verify::Uploads

      Gitlab::Verify::RakeTask.run!(verifier_class)
    end
  end
end
......@@ -9,4 +9,10 @@ FactoryBot.define do
# Attaches the dk.png fixture as the object's file, declared with the PNG
# MIME type.
trait :with_file do
  file { fixture_file_upload(Rails.root + "spec/fixtures/dk.png", "image/png") }
end
# The uniqueness constraint means we can't use the correct OID for all LFS
# objects, so the test needs to decide which (if any) object gets it
#
# Declared with a block, like `file` in the :with_file trait, because
# FactoryBot 5 removed static attribute declarations.
trait :correct_oid do
  oid { 'b804383982bb89b00e828e3f44c038cc991d3d1768009fc39ba8e2c081b9fb75' }
end
end
require 'spec_helper'
# Specs for the LFS object verifier: batching behaviour (through the shared
# examples) and the failures reported by #run_batches.
describe Gitlab::Verify::LfsObjects do
include GitlabVerifyHelpers
# The shared examples read `objects` to derive the expected batch ranges.
it_behaves_like 'Gitlab::Verify::BatchVerifier subclass' do
let!(:objects) { create_list(:lfs_object, 3, :with_file) }
end
describe '#run_batches' do
# `failures` is lazy: the verifier only runs when an example first references
# it, which is after the example has tampered with the file on disk.
let(:failures) { collect_failures }
# The error recorded for `lfs_object`, or nil when it verified cleanly.
let(:failure) { failures[lfs_object] }
# Created eagerly so the record exists before the verifier runs.
let!(:lfs_object) { create(:lfs_object, :with_file, :correct_oid) }
it 'passes LFS objects with the correct file' do
expect(failures).to eq({})
end
it 'fails LFS objects with a missing file' do
# Remove the file so recalculating the OID cannot read it.
FileUtils.rm_f(lfs_object.file.path)
expect(failures.keys).to contain_exactly(lfs_object)
expect(failure).to be_a(Errno::ENOENT)
expect(failure.to_s).to include(lfs_object.file.path)
end
it 'fails LFS objects with a mismatched oid' do
# Empty the file so its recalculated OID no longer matches the stored one.
File.truncate(lfs_object.file.path, 0)
expect(failures.keys).to contain_exactly(lfs_object)
expect(failure.to_s).to include('Checksum mismatch')
end
end
end
require 'spec_helper'
# Specs for the uploads verifier: batching behaviour (through the shared
# examples) and the failures reported by #run_batches.
describe Gitlab::Verify::Uploads do
include GitlabVerifyHelpers
# The shared examples read `objects` to derive the expected batch ranges.
it_behaves_like 'Gitlab::Verify::BatchVerifier subclass' do
let(:projects) { create_list(:project, 3, :with_avatar) }
# Eager, and it forces `projects` to be created as well.
let!(:objects) { projects.flat_map(&:uploads) }
end
describe '#run_batches' do
let(:project) { create(:project, :with_avatar) }
# `failures` is lazy: the verifier only runs when an example first references
# it, which is after the example has altered the file or the record.
let(:failures) { collect_failures }
# The error recorded for `upload`, or nil when it verified cleanly.
let(:failure) { failures[upload] }
# Eager, so the project and its upload exist before the verifier runs.
let!(:upload) { project.uploads.first }
it 'passes uploads with the correct file' do
expect(failures).to eq({})
end
it 'fails uploads with a missing file' do
# Remove the file so recalculating the checksum cannot read it.
FileUtils.rm_f(upload.absolute_path)
expect(failures.keys).to contain_exactly(upload)
expect(failure).to be_a(Errno::ENOENT)
expect(failure.to_s).to include(upload.absolute_path)
end
it 'fails uploads with a mismatched checksum' do
# Change the stored checksum; the file itself is left untouched.
upload.update!(checksum: 'something incorrect')
expect(failures.keys).to contain_exactly(upload)
expect(failure.to_s).to include('Checksum mismatch')
end
it 'fails uploads with a missing precalculated checksum' do
# A blank stored checksum is reported as missing, not as a mismatch.
upload.update!(checksum: '')
expect(failures.keys).to contain_exactly(upload)
expect(failure.to_s).to include('Checksum missing')
end
end
end
# Shared batching examples for verifier classes.
#
# The including context must define `objects` (at least three records) and
# provide `collect_ranges`.
RSpec.shared_examples 'Gitlab::Verify::BatchVerifier subclass' do
  describe 'batching' do
    # Each expected batch is a range that starts and ends on one record's ID.
    let(:first_batch) { Range.new(objects[0].id, objects[0].id) }
    let(:second_batch) { Range.new(objects[1].id, objects[1].id) }
    let(:third_batch) { Range.new(objects[2].id, objects[2].id) }

    it 'iterates through objects in batches' do
      ranges = collect_ranges

      expect(ranges).to eq([first_batch, second_batch, third_batch])
    end

    it 'allows the starting ID to be specified' do
      ranges = collect_ranges(start: second_batch.first)

      expect(ranges).to eq([second_batch, third_batch])
    end

    it 'allows the finishing ID to be specified' do
      ranges = collect_ranges(finish: second_batch.last)

      expect(ranges).to eq([first_batch, second_batch])
    end
  end
end
# Spec helpers that run `described_class` as a verifier with a batch size of
# one and collect what #run_batches yields.
module GitlabVerifyHelpers
  # Runs the verifier and returns the ID range yielded for each batch.
  #
  # @param args [Hash] extra keyword arguments for the verifier (symbol keys)
  # @return [Array<Range>]
  def collect_ranges(args = {})
    # Double-splat the options: a bare Hash is not converted to keyword
    # arguments on Ruby 3, so it would be rejected by a keyword-only
    # initializer.
    verifier = described_class.new(**args.merge(batch_size: 1))

    collect_results(verifier).map(&:first)
  end

  # Runs the verifier and merges the failures of every batch into one Hash.
  #
  # @return [Hash] failed object => error
  def collect_failures
    verifier = described_class.new(batch_size: 1)

    collect_results(verifier).each_with_object({}) do |(_range, failures), merged|
      merged.merge!(failures)
    end
  end

  # Runs the verifier and returns the arguments of each yield, in order.
  #
  # @return [Array<Array>]
  def collect_results(verifier)
    out = []

    verifier.run_batches { |*args| out << args }

    out
  end
end
require 'rake_helper'
# Specs for the gitlab:lfs:check rake task, run with VERBOSE enabled so that
# individual failures appear in the output.
describe 'gitlab:lfs rake tasks' do
  describe 'check' do
    let!(:lfs_object) { create(:lfs_object, :with_file, :correct_oid) }

    before do
      Rake.application.rake_require('tasks/gitlab/lfs/check')
      stub_env('VERBOSE' => 'true')
    end

    # Invokes the task under test.
    def run_check
      run_rake_task('gitlab:lfs:check')
    end

    it 'outputs the integrity check for each batch' do
      expect { run_check }.to output(/Failures: 0/).to_stdout
    end

    it 'errors out about missing files on the file system' do
      file_path = lfs_object.file.path
      FileUtils.rm_f(file_path)

      expect { run_check }.to output(/No such file.*#{Regexp.quote(file_path)}/).to_stdout
    end

    it 'errors out about invalid checksum' do
      # Empty the file so its digest no longer matches the stored OID.
      File.truncate(lfs_object.file.path, 0)

      expect { run_check }.to output(/Checksum mismatch/).to_stdout
    end
  end
end
require 'rake_helper'
# Specs for the gitlab:uploads:check rake task, run with VERBOSE enabled so
# that individual failures appear in the output.
describe 'gitlab:uploads rake tasks' do
  describe 'check' do
    let!(:upload) { create(:upload, path: Rails.root.join('spec/fixtures/banana_sample.gif')) }

    before do
      Rake.application.rake_require('tasks/gitlab/uploads/check')
      stub_env('VERBOSE' => 'true')
    end

    # Invokes the task under test.
    def run_check
      run_rake_task('gitlab:uploads:check')
    end

    it 'outputs the integrity check for each batch' do
      expect { run_check }.to output(/Failures: 0/).to_stdout
    end

    it 'errors out about missing files on the file system' do
      missing_upload = create(:upload)
      missing_path = missing_upload.absolute_path

      expect { run_check }.to output(/No such file.*#{Regexp.quote(missing_path)}/).to_stdout
    end

    it 'errors out about invalid checksum' do
      # Overwrite the stored checksum directly, bypassing validations and callbacks.
      upload.update_column(:checksum, '01a3156db2cf4f67ec823680b40b7302f89ab39179124ad219f94919b8a1769e')

      expect { run_check }.to output(/Checksum mismatch/).to_stdout
    end
  end
end
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment