Commit 51da5736 authored by Micaël Bergeron's avatar Micaël Bergeron

adds the object storage configuration

parent 0f52db38
# Cron-scheduled worker that dumps pseudonymized database tables to CSV.
# (The stale `GitlabELTDataDumpWorker` declaration was diff residue; the
# renamed `GitlabEltDataDumpWorker` is the one registered in settings.rb.)
class GitlabEltDataDumpWorker
  include ApplicationWorker
  include CronjobQueue

  def perform
    # Feature-gated: no-op unless the ELT database dump is enabled in settings.
    return unless Gitlab::CurrentSettings.elt_database_dump_enabled

    Pseudonymity::Table.new.tables_to_csv
  end
end
......@@ -726,6 +726,21 @@ production: &base
# # Specifies Amazon S3 storage class to use for backups, this is optional
# # storage_class: 'STANDARD'
## Pseudonym exporter
pseudonymizer:
# Tables manifest that specifies the fields to extract and pseudonymize.
# TODO: link to the Meltano configuration documentation.
manifest: config/pseudonymizer.yml
upload:
# Fog storage connection settings, see http://fog.io/storage/ .
connection:
# provider: AWS
# region: eu-west-1
# aws_access_key_id: AKIAKIAKI
# aws_secret_access_key: 'secret123'
# # The remote 'directory' to store the CSV files. For S3, this would be the bucket name.
# remote_directory: 'gitlab-elt'
## GitLab Shell settings
gitlab_shell:
path: /home/git/gitlab-shell/
......@@ -876,6 +891,17 @@ test:
token: secret
backup:
path: tmp/tests/backups
pseudonymizer:
manifest: config/pseudonymizer.test.yml
upload:
# The remote 'directory' to store the CSV files. For S3, this would be the bucket name.
remote_directory: gitlab-elt.test
# Fog storage connection settings, see http://fog.io/storage/
connection:
provider: AWS
region: us-east-1
aws_access_key_id: AWS_ACCESS_KEY_ID
aws_secret_access_key: AWS_SECRET_ACCESS_KEY
gitlab_shell:
path: tmp/tests/gitlab-shell/
hooks_path: tmp/tests/gitlab-shell/hooks/
......
......@@ -373,7 +373,7 @@ Settings.cron_jobs['gitlab_usage_ping_worker']['job_class'] = 'GitlabUsagePingWo
# Default schedule for the nightly ELT pseudonymized data dump.
# The stale 'GitlabELTDataDumpWorker' assignment was diff residue; only the
# renamed worker class is registered. Trailing semicolons dropped (un-idiomatic Ruby).
Settings.cron_jobs['gitlab_elt_database_dump'] ||= Settingslogic.new({})
Settings.cron_jobs['gitlab_elt_database_dump']['cron'] ||= '0 23 * * *'
Settings.cron_jobs['gitlab_elt_database_dump']['job_class'] ||= 'GitlabEltDataDumpWorker'
Settings.cron_jobs['schedule_update_user_activity_worker'] ||= Settingslogic.new({})
Settings.cron_jobs['schedule_update_user_activity_worker']['cron'] ||= '30 0 * * *'
......@@ -475,6 +475,14 @@ Settings.backup['upload']['multipart_chunk_size'] ||= 104857600
Settings.backup['upload']['encryption'] ||= nil
Settings.backup['upload']['storage_class'] ||= nil
#
# Pseudonymizer
#
Settings['pseudonymizer'] ||= Settingslogic.new({})
# `||=` instead of the redundant `x = x || y` form.
Settings.pseudonymizer['manifest'] ||= 'lib/pseudonymity/manifest.yml'
Settings.pseudonymizer['upload'] ||= Settingslogic.new({ 'remote_directory' => nil, 'connection' => nil })
# Settings.pseudonymizer['upload']['multipart_chunk_size'] ||= 104857600
# Git
#
......
......@@ -156,7 +156,6 @@ tables:
- last_edited_by_id
- discussion_locked
- closed_at
- closed_by_id
pseudo:
- id
- title
......@@ -487,8 +486,6 @@ tables:
- merge_merge_request
- failed_pipeline
- success_pipeline
- push_to_merge_request
- issue_due
pseudo:
- id
- user_id
......@@ -509,8 +506,6 @@ tables:
- merge_merge_request
- failed_pipeline
- success_pipeline
- push_to_merge_request
- issue_due
project_authorizations:
whitelist:
- user_id
......@@ -535,15 +530,6 @@ tables:
- updated_at
- enabled
- domain
project_ci_cd_settings:
whitelist:
- id
- project_id
- group_runners_enabled
pseudo:
- id
- project_id
- group_runners_enabled
project_custom_attributes:
whitelist:
- id
......@@ -559,17 +545,6 @@ tables:
- project_id
- key
- value
project_deploy_tokens:
whitelist:
- id
- project_id
- deploy_token_id
- created_at
pseudo:
- id
- project_id
- deploy_token_id
- created_at
project_features:
whitelist:
- id
......@@ -750,7 +725,6 @@ tables:
- mirror_overwrites_diverged_branches
- external_authorization_classification_label
- external_webhook_token
- pages_https_only
pseudo:
- id
- name
......@@ -820,7 +794,6 @@ tables:
- mirror_overwrites_diverged_branches
- external_authorization_classification_label
- external_webhook_token
- pages_https_only
subscriptions:
whitelist:
- id
......@@ -932,4 +905,4 @@ tables:
- two_factor_grace_period
- ghost
- rss_token
- theme_id
\ No newline at end of file
- theme_id
......@@ -27,27 +27,26 @@ module Pseudonymity
class Table
attr_accessor :config
attr_accessor :output_dir
def initialize
@config = {}
@csv_output = ""
parse_config
@config = parse_config
@output_dir = ""
@schema = {}
@output_files = []
end
def tables_to_csv
tables = config["tables"]
@csv_output = config["output"]["csv"]
unless File.directory?(@csv_output)
raise "No such directory #{@csv_output}"
end
@output_dir = File.join("/tmp/", SecureRandom.hex)
Dir.mkdir(@output_dir) unless File.directory?(@output_dir)
new_tables = tables.map do |k, v|
@schema[k] = {}
table_to_csv(k, v["whitelist"], v["pseudo"])
end
schema_to_yml
file_list_to_json
new_tables
......@@ -57,7 +56,7 @@ module Pseudonymity
file_timestamp = filename || "#{prefix}_#{Time.now.to_i}"
file_timestamp = "#{file_timestamp}.#{ext}"
@output_files << file_timestamp
File.join(@csv_output, file_timestamp)
File.join(@output_dir, file_timestamp)
end
def schema_to_yml
......@@ -103,10 +102,11 @@ module Pseudonymity
end
# Loads the pseudonymizer manifest (tables/fields to extract) from the path
# configured under `pseudonymizer.manifest` in gitlab.yml. The removed
# hard-coded `ee/lib/assets/pseudonymity_dump.yml` line was diff residue.
# Returns the parsed manifest Hash (callers assign it to @config).
def parse_config
  YAML.load_file(Rails.root.join(Gitlab.config.pseudonymizer.manifest))
end
def write_to_csv_file(title, contents)
Rails.logger.info "Writing #{title} ..."
file_path = get_and_log_file_name("csv", title)
column_names = contents.first.keys
contents = CSV.generate do |csv|
......
module Pseudonymity
  # Uploads the pseudonymizer's generated CSV/YML dump files to remote
  # (Fog-backed) object storage, then removes the local tmp directory.
  class UploadService
    RemoteStorageUnavailableError = Class.new(StandardError)

    # Fix: the methods below call `progress` as a reader, but no reader was
    # defined — only @progress was assigned — so every call raised NameError.
    attr_reader :progress

    # output_dir - local directory holding the generated .csv/.yml files.
    # progress   - IO-like object responding to #print and #puts.
    def initialize(output_dir, progress)
      @progress = progress
      @output_dir = output_dir
    end

    # Uploads every generated file to the configured remote directory.
    # Raises RemoteStorageUnavailableError when no connection is configured.
    def upload
      # Connect once up front instead of re-resolving the remote directory
      # (and re-opening a storage connection) on every loop iteration.
      directory = remote_directory

      progress.print "Uploading backup archive to remote storage #{config.upload.remote_directory} ... "
      file_list.each do |file|
        upload_file(file, directory)
      end
    end

    # Uploads a single file into the given Fog directory.
    def upload_file(file, directory)
      progress.print "\tUploading #{file} ... "
      # NOTE(review): File.open without a block relies on Fog/GC to close the
      # handle — confirm Fog closes the body stream after upload.
      if directory.files.create(key: File.basename(file), body: File.open(file), public: false)
        progress.puts "done".color(:green)
      else
        # Fix: was a bare `puts`, bypassing the injected progress IO.
        progress.puts "uploading CSV to #{config.upload.remote_directory} failed".color(:red)
      end
    end

    # Removes the local tmp directory created by the dump step.
    def cleanup
      progress.print "Deleting tmp directory #{@output_dir} ... "
      return unless File.exist?(@output_dir)

      # Fix: FileUtils.rm_rf returns its argument (always truthy), so the old
      # `if FileUtils.rm_rf(...)` could never reach the "failed" branch.
      # Verify removal explicitly instead.
      FileUtils.rm_rf(@output_dir)
      if File.exist?(@output_dir)
        progress.puts "failed".color(:red)
      else
        progress.puts "done".color(:green)
      end
    end

    private

    def config
      Gitlab.config.pseudonymizer
    end

    # Resolves (and connects to) the configured remote directory.
    def remote_directory
      connection_settings = config.upload.connection
      if connection_settings.blank?
        # Fix: error message had the config key misspelled ("pseudonimizer").
        progress.puts "Cannot upload files, make sure the `pseudonymizer.upload.connection` is set properly".color(:red)
        raise RemoteStorageUnavailableError.new(connection_settings)
      end

      connect_to_remote_directory(connection_settings)
    end

    def connect_to_remote_directory(connection_settings)
      # our settings use string keys, but Fog expects symbols
      connection = ::Fog::Storage.new(connection_settings.symbolize_keys)
      remote_dir = config.upload.remote_directory

      # We only attempt to create the directory for local backups. For AWS
      # and other cloud providers, we cannot guarantee the user will have
      # permission to create the bucket.
      if connection.service == ::Fog::Storage::Local
        connection.directories.create(key: remote_dir)
      else
        connection.directories.get(remote_dir)
      end
    end

    # Every generated artifact: the per-table CSVs plus the schema/file-list YMLs.
    def file_list
      Dir[File.join(@output_dir, "*.{csv,yml}")]
    end
  end
end
......@@ -78,6 +78,21 @@ namespace :gitlab do
task pseudonymity_dump: :environment do
table = Pseudonymity::Table.new
table.tables_to_csv
upload = Pseudonymity::UploadService.new(table.output_dir, progress)
upload.upload
upload.cleanup
end
# Output sink for the pseudonymizer rake task: interactive runs write to
# stdout; under cron (ENV['CRON'] set) the chatter is swallowed into an
# in-memory StringIO so it responds to #puts/#print without polluting logs.
def progress
  return $stdout unless ENV['CRON']

  require 'stringio'
  StringIO.new
end
end
end
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment