Commit f28ff040 authored by Stan Hu's avatar Stan Hu

Merge branch 'gitlab-elt' into 'master'

Adds method to move tables to CSV with redacted data.

Closes meltano/meltano#80

See merge request gitlab-org/gitlab-ee!5532
parents a43e2609 4b90a8f8
......@@ -373,3 +373,5 @@
= _('Geo allows you to replicate your GitLab instance to other geographical locations.')
.settings-content
= render partial: 'slack'
= render_if_exists 'admin/application_settings/pseudonymizer_settings', expanded: expanded
......@@ -145,6 +145,7 @@
- cronjob:ldap_all_groups_sync
- cronjob:ldap_sync
- cronjob:update_all_mirrors
- cronjob:pseudonymizer
- geo:geo_scheduler_scheduler
- geo:geo_scheduler_primary_scheduler
......
......@@ -311,6 +311,10 @@ production: &base
geo_migrated_local_files_clean_up_worker:
cron: "15 */6 * * *"
# Export pseudonymized data in CSV format for analysis
pseudonymizer_worker:
cron: "0 * * * *"
registry:
# enabled: true
# host: registry.example.com
......@@ -726,6 +730,20 @@ production: &base
# # Specifies Amazon S3 storage class to use for backups, this is optional
# # storage_class: 'STANDARD'
## Pseudonymizer exporter
pseudonymizer:
# Tables manifest that specifies the fields to extract and pseudonymize.
manifest: config/pseudonymizer.yml
upload:
# remote_directory: 'gitlab-elt'
# Fog storage connection settings, see http://fog.io/storage/ .
connection:
# provider: AWS
# region: eu-west-1
# aws_access_key_id: AKIAKIAKI
# aws_secret_access_key: 'secret123'
# # The remote 'directory' to store the CSV files. For S3, this would be the bucket name.
## GitLab Shell settings
gitlab_shell:
path: /home/git/gitlab-shell/
......@@ -876,6 +894,17 @@ test:
token: secret
backup:
path: tmp/tests/backups
pseudonymizer:
manifest: config/pseudonymizer.yml
upload:
# The remote 'directory' to store the CSV files. For S3, this would be the bucket name.
remote_directory: gitlab-elt.test
# Fog storage connection settings, see http://fog.io/storage/
connection:
provider: AWS # Only AWS supported at the moment
aws_access_key_id: AWS_ACCESS_KEY_ID
aws_secret_access_key: AWS_SECRET_ACCESS_KEY
region: us-east-1
gitlab_shell:
path: tmp/tests/gitlab-shell/
hooks_path: tmp/tests/gitlab-shell/hooks/
......
......@@ -370,6 +370,10 @@ Settings.cron_jobs['gitlab_usage_ping_worker'] ||= Settingslogic.new({})
Settings.cron_jobs['gitlab_usage_ping_worker']['cron'] ||= Settings.__send__(:cron_for_usage_ping)
Settings.cron_jobs['gitlab_usage_ping_worker']['job_class'] = 'GitlabUsagePingWorker'
Settings.cron_jobs['pseudonymizer_worker'] ||= Settingslogic.new({})
Settings.cron_jobs['pseudonymizer_worker']['cron'] ||= '0 23 * * *'
Settings.cron_jobs['pseudonymizer_worker']['job_class'] ||= 'PseudonymizerWorker'
Settings.cron_jobs['schedule_update_user_activity_worker'] ||= Settingslogic.new({})
Settings.cron_jobs['schedule_update_user_activity_worker']['cron'] ||= '30 0 * * *'
Settings.cron_jobs['schedule_update_user_activity_worker']['job_class'] = 'ScheduleUpdateUserActivityWorker'
......@@ -470,6 +474,14 @@ Settings.backup['upload']['multipart_chunk_size'] ||= 104857600
Settings.backup['upload']['encryption'] ||= nil
Settings.backup['upload']['storage_class'] ||= nil
#
# Pseudonymizer
#
Settings['pseudonymizer'] ||= Settingslogic.new({})
Settings.pseudonymizer['manifest'] = Settings.absolute(Settings.pseudonymizer['manifest'] || Rails.root.join("config/pseudonymizer.yml"))
Settings.pseudonymizer['upload'] ||= Settingslogic.new({ 'remote_directory' => nil, 'connection' => nil })
# Settings.pseudonymizer['upload']['multipart_chunk_size'] ||= 104857600
#
# Git
#
......
This diff is collapsed.
......@@ -206,6 +206,7 @@ ActiveRecord::Schema.define(version: 20180612175636) do
t.string "encrypted_external_auth_client_key_pass_iv"
t.string "email_additional_text"
t.boolean "enforce_terms", default: false
t.boolean "pseudonymizer_enabled", default: false, null: false
end
create_table "approvals", force: :cascade do |t|
......
......@@ -167,6 +167,10 @@ created in snippets, wikis, and repos.
- [Request Profiling](monitoring/performance/request_profiling.md): Get a detailed profile on slow requests.
- [Performance Bar](monitoring/performance/performance_bar.md): Get performance information for the current page.
## Analytics
- [Pseudonymizer](pseudonymizer.md): Export data from GitLab's database to CSV files in a secure way.
## Troubleshooting
- [Debugging tips](troubleshooting/debug.md): Tips to debug problems when things go wrong
......
# Pseudonymizer
> [Introduced](https://gitlab.com/gitlab-org/gitlab-ee/merge_requests/5532) in [GitLab Ultimate][ee] 11.1.
As GitLab's database hosts sensitive information, using it unfiltered for analytics
implies high security requirements. To help alleviate this constraint, the Pseudonymizer
service is used to export GitLab's data in a pseudonymized way.
CAUTION: **Warning:**
This process is not impervious. If the source data is available, it's possible for
a user to correlate data to the pseudonymized version.
The Pseudonymizer currently uses `HMAC(SHA256)` to mutate fields that shouldn't
be textually exported. This ensures that:
- the end-user of the data source cannot infer/revert the pseudonymized fields
- the referential integrity is maintained
## Configuration
To configure the pseudonymizer, you need to:
- Provide a manifest file that describes which fields should be included or
pseudonymized ([example `manifest.yml` file](https://gitlab.com/gitlab-org/gitlab-ee/tree/master/config/pseudonymizer.yml)).
A default manifest is provided with the GitLab installation. A relative file path is resolved against the Rails root.
Alternatively, you can use an absolute file path.
- Use an object storage and specify the connection parameters in the `pseudonymizer.upload.connection` configuration option.
**For Omnibus installations:**
1. Edit `/etc/gitlab/gitlab.rb` and add the following lines by replacing with
the values you want:
```ruby
gitlab_rails['pseudonymizer_manifest'] = 'config/pseudonymizer.yml'
gitlab_rails['pseudonymizer_upload_remote_directory'] = 'gitlab-elt'
gitlab_rails['pseudonymizer_upload_connection'] = {
'provider' => 'AWS',
'region' => 'eu-central-1',
'aws_access_key_id' => 'AWS_ACCESS_KEY_ID',
'aws_secret_access_key' => 'AWS_SECRET_ACCESS_KEY'
}
```
NOTE: **Note:**
If you are using AWS IAM profiles, be sure to omit the AWS access key and secret access key/value pairs.
```ruby
gitlab_rails['pseudonymizer_upload_connection'] = {
'provider' => 'AWS',
'region' => 'eu-central-1',
'use_iam_profile' => true
}
```
1. Save the file and [reconfigure GitLab](restart_gitlab.md#omnibus-gitlab-reconfigure)
for the changes to take effect.
---
**For installations from source:**
1. Edit `/home/git/gitlab/config/gitlab.yml` and add or amend the following
lines:
```yaml
pseudonymizer:
manifest: config/pseudonymizer.yml
upload:
remote_directory: 'gitlab-elt' # The bucket name
connection:
provider: AWS # Only AWS supported at the moment
aws_access_key_id: AWS_ACCESS_KEY_ID
aws_secret_access_key: AWS_SECRET_ACCESS_KEY
region: eu-central-1
```
1. Save the file and [restart GitLab](restart_gitlab.md#installations-from-source)
for the changes to take effect.
## Usage
You can configure the Pseudonymizer with the following optional environment variables:
- `PSEUDONYMIZER_OUTPUT_DIR` - where to store the output CSV files (defaults to `/tmp`)
- `PSEUDONYMIZER_BATCH` - the batch size when querying the DB (defaults to `100000`)
```bash
## Omnibus
sudo gitlab-rake gitlab:db:pseudonymizer
## Source
sudo -u git -H bundle exec rake gitlab:db:pseudonymizer RAILS_ENV=production
```
This will produce some CSV files that might be very large, so make sure the
`PSEUDONYMIZER_OUTPUT_DIR` has sufficient space. As a rule of thumb, at least
10% of the database size is recommended.
After the pseudonymizer has run, the output CSV files should be uploaded to the
configured object storage and deleted from the local disk.
[ee]: https://about.gitlab.com/pricing/
......@@ -20,6 +20,10 @@ module EE
attrs << :email_additional_text
end
if License.feature_available?(:pseudonymizer)
attrs << :pseudonymizer_enabled
end
attrs
end
end
......
......@@ -35,6 +35,18 @@ module EE
"and the value is encrypted at rest.")
end
def pseudonymizer_enabled_help_text
_("Enable Pseudonymizer data collection")
end
def pseudonymizer_description_text
_("GitLab will run a background job that will produce pseudonymized CSVs of the GitLab database that will be uploaded to your configured object storage directory.")
end
def pseudonymizer_disabled_description_text
_("The pseudonymizer data collection is disabled. When enabled, GitLab will run a background job that will produce pseudonymized CSVs of the GitLab database that will be uploaded to your configured object storage directory.")
end
override :visible_attributes
def visible_attributes
super + [
......@@ -55,7 +67,8 @@ module EE
:slack_app_id,
:slack_app_secret,
:slack_app_verification_token,
:allow_group_owners_to_manage_ldap
:allow_group_owners_to_manage_ldap,
:pseudonymizer_enabled
]
end
......
......@@ -100,11 +100,20 @@ module EE
slack_app_enabled: false,
slack_app_id: nil,
slack_app_secret: nil,
slack_app_verification_token: nil
slack_app_verification_token: nil,
pseudonymizer_enabled: false
)
end
end
def pseudonymizer_available?
License.feature_available?(:pseudonymizer)
end
def pseudonymizer_enabled?
pseudonymizer_available? && super
end
def should_check_namespace_plan?
check_namespace_plan? && (Rails.env.test? || ::Gitlab.dev_env_or_com?)
end
......
......@@ -73,6 +73,7 @@ class License < ActiveRecord::Base
ide
chatops
pod_logs
pseudonymizer
].freeze
# List all features available for early adopters,
......
= form_for @application_setting, url: admin_application_settings_path, html: { class: 'fieldset-form' } do |f|
= form_errors(@application_setting)
%fieldset
.form-group.row
.offset-sm-2.col-sm-10
- is_enabled = @application_setting.pseudonymizer_enabled?
.form-check
= f.label :pseudonymizer_enabled do
= f.check_box :pseudonymizer_enabled
= pseudonymizer_enabled_help_text
.form-text.text-muted
- if is_enabled
= pseudonymizer_description_text
- else
= pseudonymizer_disabled_description_text
= f.submit 'Save changes', class: "btn btn-success"
- if Gitlab::CurrentSettings.pseudonymizer_available?
%section.settings.as-pseudonymizer.no-animate#js-pseudonymizer-settings{ class: ('expanded' if expanded) }
.settings-header
%h4
= _('Pseudonymizer data collection')
%button.btn.btn-default.js-settings-toggle{ type: 'button' }
= expanded ? _('Collapse') : _('Expand')
%p
= _('Enable or disable the Pseudonymizer data collection.')
.settings-content
= render 'pseudonymizer'
class PseudonymizerWorker
include ApplicationWorker
include CronjobQueue
def perform
return unless Gitlab::CurrentSettings.pseudonymizer_enabled?
options = Pseudonymizer::Options.new(
config: YAML.load_file(Gitlab.config.pseudonymizer.manifest),
output_dir: ENV['PSEUDONYMIZER_OUTPUT_DIR']
)
dumper = Pseudonymizer::Dumper.new(options)
uploader = Pseudonymizer::Uploader.new(options, progress_output: File.open(File::NULL, "w"))
unless uploader.available?
Rails.logger.error("The pseudonymizer object storage must be configured.")
return
end
begin
dumper.tables_to_csv
uploader.upload
ensure
uploader.cleanup
end
end
end
---
title: Pseudonymizer to safely export data for analytics.
merge_request: 5532
author:
type: added
# See http://doc.gitlab.com/ce/development/migration_style_guide.html
# for more information on how to write migrations for GitLab.
class AddPseudonymizerEnabledToApplicationSettings < ActiveRecord::Migration
include Gitlab::Database::MigrationHelpers
# Set this constant to true if this migration requires downtime.
DOWNTIME = false
# When a migration requires downtime you **must** uncomment the following
# constant and define a short and easy to understand explanation as to why the
# migration requires downtime.
# DOWNTIME_REASON = ''
# When using the methods "add_concurrent_index", "remove_concurrent_index" or
# "add_column_with_default" you must disable the use of transactions
# as these methods can not run in an existing transaction.
# When using "add_concurrent_index" or "remove_concurrent_index" methods make sure
# that either of them is the _only_ method called in the migration,
# any other changes should go in a separate migration.
# This ensures that upon failure _only_ the index creation or removing fails
# and can be retried or reverted easily.
#
# To disable transactions uncomment the following line and remove these
# comments:
# disable_ddl_transaction!
def change
add_column :application_settings, :pseudonymizer_enabled, :boolean, null: false, default: false
end
end
module Pseudonymizer
class Dumper
attr_accessor :config, :output_dir
def initialize(options)
@config = options.config.deep_symbolize_keys
@output_dir = options.output_dir
@start_at = options.start_at
reset!
end
def reset!
@schema = Hash.new { |h, k| h[k] = {} }
@output_files = []
end
def tables_to_csv
return @output_files unless @output_files.empty?
tables = config[:tables]
FileUtils.mkdir_p(output_dir) unless File.directory?(output_dir)
@output_files = tables.map do |k, v|
table_to_csv(k, v[:whitelist], v[:pseudo])
end.compact
schema_to_yml
file_list_to_json
@output_files
end
private
def output_filename(basename = nil, ext = "csv.gz")
File.join(output_dir, "#{basename}.#{ext}")
end
def schema_to_yml
file_path = output_filename("schema", "yml")
File.write(file_path, @schema.to_yaml)
end
def file_list_to_json
file_path = output_filename("file_list", "json")
relative_files = @output_files.map(&File.method(:basename))
File.write(file_path, relative_files.to_json)
end
def table_to_csv(table, whitelist_columns, pseudonymity_columns)
table_to_schema(table)
write_to_csv_file(
table,
table_page_results(table,
whitelist_columns,
pseudonymity_columns)
)
rescue => e
Rails.logger.error("Failed to export #{table}: #{e}")
raise e
end
# yield every results, pagined, anonymized
def table_page_results(table, whitelist_columns, pseudonymity_columns)
filter = Filter.new(table, whitelist_columns, pseudonymity_columns)
pager = Pager.new(table, whitelist_columns)
Enumerator.new do |yielder|
pager.pages do |page|
filter.anonymize(page).each do |result|
yielder << result
end
end
end.lazy
end
def table_to_schema(table)
table_config = @config.dig(:tables, table)
type_results = ActiveRecord::Base.connection.columns(table)
type_results = type_results.select do |c|
table_config[:whitelist].include?(c.name)
end
type_results = type_results.map do |c|
data_type = c.sql_type
if table_config[:pseudo].include?(c.name)
data_type = "character varying"
end
{ name: c.name, data_type: data_type }
end
set_schema_column_types(table, type_results)
end
def set_schema_column_types(table, type_results)
has_id = type_results.any? {|c| c[:name] == "id" }
type_results.each do |type_result|
@schema[table.to_s][type_result[:name]] = type_result[:data_type]
end
if has_id
# if there is an ID, it is the mapping_key
@schema[table.to_s]["gl_mapping_key"] = "id"
end
end
def write_to_csv_file(table, contents)
file_path = output_filename(table)
headers = contents.peek.keys
Rails.logger.info "#{self.class.name} writing #{table} to #{file_path}."
Zlib::GzipWriter.open(file_path) do |io|
csv = CSV.new(io, headers: headers, write_headers: true)
contents.each { |row| csv << row.values }
end
file_path
rescue StopIteration
Rails.logger.info "#{self.class.name} table #{table} is empty."
nil
end
end
end
require 'openssl'
require 'digest'
module Pseudonymizer
class Filter
def initialize(table, whitelisted_fields, pseudonymized_fields)
@table = table
@pseudo_fields = pseudo_fields(whitelisted_fields, pseudonymized_fields)
end
def anonymize(results)
key = Rails.application.secrets[:secret_key_base]
digest = OpenSSL::Digest.new('sha256')
Enumerator.new do |yielder|
results.each do |result|
@pseudo_fields.each do |field|
next if result[field].nil?
result[field] = OpenSSL::HMAC.hexdigest(digest, key, String(result[field]))
end
yielder << result
end
end
end
private
def pseudo_fields(whitelisted, pseudonymized)
pseudo_extra_fields = pseudonymized - whitelisted
pseudo_extra_fields.each do |field|
Rails.logger.warn("#{self.class.name} extraneous pseudo: #{@table}.#{field} is not whitelisted and will be ignored.")
end
pseudonymized & whitelisted
end
end
end
module Pseudonymizer
class Options
attr_reader :config
attr_reader :start_at
attr_reader :output_dir
def initialize(config: {}, output_dir: nil)
@config = config
@start_at = Time.now.utc
base_dir = output_dir || File.join(Dir.tmpdir, 'gitlab-pseudonymizer')
@output_dir = File.join(base_dir, batch_dir)
end
def upload_dir
batch_dir
end
private
def batch_dir
start_at.iso8601
end
end
end
module Pseudonymizer
  # Streams rows of a table in fixed-size pages so very large tables can
  # be dumped with bounded memory.
  class Pager
    # Batch size for DB paging. ENV values are always strings, so coerce
    # to Integer — without `.to_i`, setting PSEUDONYMIZER_BATCH would make
    # `offset += PAGE_SIZE` and `results.count < PAGE_SIZE` raise at
    # runtime (Integer vs String).
    PAGE_SIZE = ENV.fetch('PSEUDONYMIZER_BATCH', 100_000).to_i

    # table:   table name to page through.
    # columns: column names to select.
    # NOTE: both are interpolated into SQL below; they come from the
    # trusted manifest file, not from user input.
    def initialize(table, columns)
      @table = table
      @columns = columns
    end

    # Yields each page of results to the block, picking keyset pagination
    # when an "id" column is selected, offset pagination otherwise.
    def pages(&block)
      if @columns.include?("id")
        # optimize the pagination using WHERE id > ?
        pages_per_id(&block)
      else
        # fallback to `LIMIT ? OFFSET ?` when "id" is unavailable
        pages_per_offset(&block)
      end
    end

    # Keyset pagination on the primary key: O(1) per page.
    def pages_per_id(&block)
      id_offset = 0

      loop do
        # a page of results
        results = ActiveRecord::Base.connection.exec_query(<<-SQL.squish)
          SELECT #{@columns.join(",")}
          FROM #{@table}
          WHERE id > #{id_offset}
          ORDER BY id
          LIMIT #{PAGE_SIZE}
        SQL
        Rails.logger.debug("#{self.class.name} fetch ids [#{id_offset}..+#{PAGE_SIZE}]")
        break if results.empty?

        id_offset = results.last["id"].to_i
        yield results

        # a short page means we just consumed the last rows
        break if results.count < PAGE_SIZE
      end
    end

    # Offset pagination fallback; each page re-scans `offset` rows, so it
    # is slower on large tables.
    def pages_per_offset(&block)
      offset = 0

      loop do
        # a page of results; ORDER BY all columns for a stable ordering
        results = ActiveRecord::Base.connection.exec_query(<<-SQL.squish)
          SELECT #{@columns.join(",")}
          FROM #{@table}
          ORDER BY #{@columns.join(",")}
          LIMIT #{PAGE_SIZE} OFFSET #{offset}
        SQL
        Rails.logger.debug("#{self.class.name} fetching offset [#{offset}..#{offset + PAGE_SIZE}]")
        break if results.empty?

        offset += PAGE_SIZE
        yield results

        break if results.count < PAGE_SIZE
      end
    end
  end
end
module Pseudonymizer
  ObjectStorageUnavailableError = Class.new(StandardError)

  # Uploads the pseudonymizer CSV dump to the configured object storage
  # (via Fog) and cleans up the local working directory afterwards.
  class Uploader
    include Gitlab::Utils::StrongMemoize

    # NOTE(review): defined but unused here; kept in case external code
    # references it.
    RemoteStorageUnavailableError = Class.new(StandardError)

    # Our settings use string keys, but Fog expects symbols
    def self.object_store_credentials
      Gitlab.config.pseudonymizer.upload.connection.to_hash.deep_symbolize_keys
    end

    def self.remote_directory
      Gitlab.config.pseudonymizer.upload.remote_directory
    end

    # options:         Pseudonymizer::Options for the current batch.
    # progress_output: IO for user feedback; defaults to $stdout.
    def initialize(options, progress_output: nil)
      @progress_output = progress_output || $stdout
      @config = options.config
      @output_dir = options.output_dir
      @upload_dir = options.upload_dir
      @remote_dir = self.class.remote_directory
      @connection_params = self.class.object_store_credentials
    end

    # True when the remote directory is reachable with the configured
    # credentials.
    def available?
      !connect_to_remote_directory.nil?
    rescue ObjectStorageUnavailableError
      false
    end

    # Uploads every file in the output directory to the remote directory.
    # Aborts the process when object storage is not configured.
    def upload
      progress_output.puts "Uploading output files to remote storage #{remote_directory}:"

      file_list.each do |file|
        upload_file(file, remote_directory)
      end
    rescue ObjectStorageUnavailableError
      # Fixed typo: was "pseudonimizer", which pointed users at a
      # non-existent settings key.
      abort "Cannot upload files, make sure the `pseudonymizer.upload.connection` is set properly"
    end

    # Removes the local output directory. Failures are reported but not
    # raised — cleanup is best-effort.
    def cleanup
      return unless File.exist?(@output_dir)

      progress_output.print "Deleting tmp directory #{@output_dir} ... "
      FileUtils.rm_rf(@output_dir)
      progress_output.puts "done"
    rescue
      progress_output.puts "failed"
    end

    private

    attr_reader :progress_output

    # Uploads a single file to the remote directory.
    def upload_file(file, directory)
      progress_output.print "\t#{file} ... "

      # Open with a block so the file descriptor is always released
      # (the original leaked one handle per uploaded file).
      created = File.open(file) do |body|
        directory.files.create(key: File.join(@upload_dir, File.basename(file)),
                               body: body,
                               public: false)
      end

      progress_output.puts(created ? "done" : "failed")
    end

    def remote_directory
      strong_memoize(:remote_directory) { connect_to_remote_directory }
    end

    # Connects to the configured storage and returns the target directory.
    # Raises ObjectStorageUnavailableError when no connection is configured.
    def connect_to_remote_directory
      if @connection_params.blank?
        raise ObjectStorageUnavailableError
      end

      connection = ::Fog::Storage.new(@connection_params)

      # We only attempt to create the directory for local backups. For AWS
      # and other cloud providers, we cannot guarantee the user will have
      # permission to create the bucket.
      if connection.service == ::Fog::Storage::Local
        connection.directories.create(key: @remote_dir)
      else
        connection.directories.get(@remote_dir)
      end
    end

    def file_list
      Dir[File.join(@output_dir, "*")]
    end
  end
end
namespace :gitlab do
namespace :db do
desc 'Output pseudonymity dump of selected tables'
task pseudonymizer: :environment do
abort "The pseudonymizer is not available with this license." unless License.feature_available?(:pseudonymizer)
abort "The pseudonymizer is disabled." unless Gitlab::CurrentSettings.pseudonymizer_enabled?
options = Pseudonymizer::Options.new(
config: YAML.load_file(Gitlab.config.pseudonymizer.manifest),
output_dir: ENV['PSEUDONYMIZER_OUTPUT_DIR']
)
dumper = Pseudonymizer::Dumper.new(options)
uploader = Pseudonymizer::Uploader.new(options)
abort "There is an error in the pseudonymizer object store configuration." unless uploader.available?
begin
dumper.tables_to_csv
uploader.upload
ensure
uploader.cleanup
end
end
end
end
require 'spec_helper'
describe Pseudonymizer::Dumper do
let!(:project) { create(:project) }
let(:base_dir) { Dir.mktmpdir }
let(:options) do
Pseudonymizer::Options.new(
config: YAML.load_file(Gitlab.config.pseudonymizer.manifest)
)
end
subject(:pseudo) { described_class.new(options) }
before do
allow(options).to receive(:output_dir).and_return(base_dir)
end
after do
FileUtils.rm_rf(base_dir)
end
describe 'Pseudo tables' do
it 'outputs project tables to csv' do
column_names = %w(id name path description)
pseudo.config[:tables] = {
projects: {
whitelist: column_names,
pseudo: %w(id)
}
}
expect(pseudo.output_dir).to eq(base_dir)
# grab the first table it outputs. There would only be 1.
project_table_file = pseudo.tables_to_csv[0]
expect(project_table_file).to end_with("projects.csv.gz")
columns = []
project_data = []
Zlib::GzipReader.open(project_table_file) do |gz|
csv = CSV.new(gz, headers: true)
# csv.shift # read the header row
project_data = csv.gets
columns = csv.headers
end
# check if CSV columns are correct
expect(columns).to include(*column_names)
# is it pseudonymous
# sha 256 is 64 chars in length
expect(project_data["id"].length).to eq(64)
end
it "warns when pseudonymized fields are extraneous" do
column_names = %w(id name path description)
pseudo.config[:tables] = {
projects: {
whitelist: column_names,
pseudo: %w(id extraneous)
}
}
expect(Rails.logger).to receive(:warn).with(/extraneous/)
pseudo.tables_to_csv
end
end
describe "manifest is valid" do
it "all tables exist" do
existing_tables = ActiveRecord::Base.connection.tables
tables = options.config['tables'].keys
expect(existing_tables).to include(*tables)
end
it "all whitelisted attributes exist" do
options.config['tables'].each do |table, table_def|
whitelisted = table_def['whitelist']
existing_columns = ActiveRecord::Base.connection.columns(table.to_sym).map(&:name)
diff = whitelisted - existing_columns
expect(diff).to be_empty, "#{table} should define columns #{whitelisted.inspect}: missing #{diff.inspect}"
end
end
it "all pseudonymized attributes are whitelisted" do
options.config['tables'].each do |table, table_def|
whitelisted = table_def['whitelist']
pseudonymized = table_def['pseudo']
diff = pseudonymized - whitelisted
expect(diff).to be_empty, "#{table} should whitelist columns #{pseudonymized.inspect}: missing #{diff.inspect}"
end
end
end
end
require 'spec_helper'
describe Pseudonymizer::Pager do
let(:page_size) { 1 }
let!(:projects) { create_list(:project, 10) }
subject { described_class.new("projects", whitelisted_columns) }
before do
stub_const("Pseudonymizer::Pager::PAGE_SIZE", page_size)
end
shared_examples "yield results in page" do
it do
page_count = 0
result_count = 0
subject.pages do |page|
result_count += page.count
page_count += 1
end
expect(result_count).to eq(projects.count)
expect(page_count).to eq(projects.count / page_size)
end
end
context "`id` column is present" do
let(:whitelisted_columns) { %w(id name) }
describe "#pages" do
it "delegates to #pages_per_id" do
expect(subject).to receive(:pages_per_id)
subject.pages {|page| nil}
end
include_examples "yield results in page"
end
end
context "`id` column is missing" do
let(:whitelisted_columns) { %w(name) }
describe "#pages" do
it "delegates to #pages_per_offset" do
expect(subject).to receive(:pages_per_offset)
subject.pages {|page| nil}
end
include_examples "yield results in page"
end
end
end
require 'spec_helper'
describe Pseudonymizer::Uploader do
let(:base_dir) { Dir.mktmpdir }
let(:options) do
Pseudonymizer::Options.new(
config: YAML.load_file(Gitlab.config.pseudonymizer.manifest)
)
end
let(:remote_directory) { subject.send(:remote_directory) }
subject { described_class.new(options) }
def mock_file(file_name)
FileUtils.touch(File.join(base_dir, file_name))
end
before do
allow(options).to receive(:output_dir).and_return(base_dir)
stub_object_storage_pseudonymizer
10.times {|i| mock_file("file_#{i}.test")}
mock_file("schema.yml")
mock_file("file_list.json")
end
after do
FileUtils.rm_rf(base_dir)
end
describe "#upload" do
it "upload all file in the directory" do
subject.upload
expect(remote_directory.files.all.count).to eq(12)
end
end
describe "#cleanup" do
it "cleans the directory" do
subject.cleanup
expect(Dir[File.join(base_dir, "*")].length).to eq(0)
end
end
end
......@@ -15,9 +15,14 @@ module StubObjectStorage
return unless enabled
stub_object_storage(connection_params: uploader.object_store_credentials,
remote_directory: remote_directory)
end
def stub_object_storage(connection_params:, remote_directory:)
Fog.mock!
::Fog::Storage.new(uploader.object_store_credentials).tap do |connection|
::Fog::Storage.new(connection_params).tap do |connection|
begin
connection.directories.create(key: remote_directory)
rescue Excon::Error::Conflict
......@@ -57,4 +62,9 @@ module StubObjectStorage
</InitiateMultipartUploadResult>
EOS
end
def stub_object_storage_pseudonymizer
stub_object_storage(connection_params: Pseudonymizer::Uploader.object_store_credentials,
remote_directory: Pseudonymizer::Uploader.remote_directory)
end
end
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment