Commit 51da5736 authored by Micaël Bergeron's avatar Micaël Bergeron

adds the object storage configuration

parent 0f52db38
# Cron worker that exports pseudonymized database tables to CSV for the
# ELT (extract/load/transform) pipeline.
#
# NOTE(review): this span was a fused old+new diff render; this is the
# new-side (post-commit) code, which renames the class to the
# Rails-conventional `GitlabEltDataDumpWorker`.
class GitlabEltDataDumpWorker
  include ApplicationWorker
  include CronjobQueue

  # No-op unless the admin setting enabling the ELT database dump is on.
  def perform
    return unless Gitlab::CurrentSettings.elt_database_dump_enabled

    Pseudonymity::Table.new.tables_to_csv
  end
end
...@@ -726,6 +726,21 @@ production: &base ...@@ -726,6 +726,21 @@ production: &base
# # Specifies Amazon S3 storage class to use for backups, this is optional # # Specifies Amazon S3 storage class to use for backups, this is optional
# # storage_class: 'STANDARD' # # storage_class: 'STANDARD'
## Pseudonym exporter
pseudonymizer:
# Tables manifest that specifies the fields to extract and pseudonymize.
# TODO: link to the Meltano configuration documentation.
manifest: config/pseudonymizer.yml
upload:
# Fog storage connection settings, see http://fog.io/storage/ .
connection:
# provider: AWS
# region: eu-west-1
# aws_access_key_id: AKIAKIAKI
# aws_secret_access_key: 'secret123'
# # The remote 'directory' to store the CSV files. For S3, this would be the bucket name.
# remote_directory: 'gitlab-elt'
## GitLab Shell settings ## GitLab Shell settings
gitlab_shell: gitlab_shell:
path: /home/git/gitlab-shell/ path: /home/git/gitlab-shell/
...@@ -876,6 +891,17 @@ test: ...@@ -876,6 +891,17 @@ test:
token: secret token: secret
backup: backup:
path: tmp/tests/backups path: tmp/tests/backups
pseudonymizer:
manifest: config/pseudonymizer.test.yml
upload:
# The remote 'directory' to store the CSV files. For S3, this would be the bucket name.
remote_directory: gitlab-elt.test
# Fog storage connection settings, see http://fog.io/storage/
connection:
provider: AWS
region: us-east-1
aws_access_key_id: AWS_ACCESS_KEY_ID
aws_secret_access_key: AWS_SECRET_ACCESS_KEY
gitlab_shell: gitlab_shell:
path: tmp/tests/gitlab-shell/ path: tmp/tests/gitlab-shell/
hooks_path: tmp/tests/gitlab-shell/hooks/ hooks_path: tmp/tests/gitlab-shell/hooks/
......
...@@ -373,7 +373,7 @@ Settings.cron_jobs['gitlab_usage_ping_worker']['job_class'] = 'GitlabUsagePingWo ...@@ -373,7 +373,7 @@ Settings.cron_jobs['gitlab_usage_ping_worker']['job_class'] = 'GitlabUsagePingWo
Settings.cron_jobs['gitlab_elt_database_dump'] ||= Settingslogic.new({}) Settings.cron_jobs['gitlab_elt_database_dump'] ||= Settingslogic.new({})
Settings.cron_jobs['gitlab_elt_database_dump']['cron'] ||= '0 23 * * *'; Settings.cron_jobs['gitlab_elt_database_dump']['cron'] ||= '0 23 * * *';
Settings.cron_jobs['gitlab_elt_database_dump']['job_class'] ||= 'GitlabELTDataDumpWorker'; Settings.cron_jobs['gitlab_elt_database_dump']['job_class'] ||= 'GitlabEltDataDumpWorker';
Settings.cron_jobs['schedule_update_user_activity_worker'] ||= Settingslogic.new({}) Settings.cron_jobs['schedule_update_user_activity_worker'] ||= Settingslogic.new({})
Settings.cron_jobs['schedule_update_user_activity_worker']['cron'] ||= '30 0 * * *' Settings.cron_jobs['schedule_update_user_activity_worker']['cron'] ||= '30 0 * * *'
...@@ -475,6 +475,14 @@ Settings.backup['upload']['multipart_chunk_size'] ||= 104857600 ...@@ -475,6 +475,14 @@ Settings.backup['upload']['multipart_chunk_size'] ||= 104857600
Settings.backup['upload']['encryption'] ||= nil Settings.backup['upload']['encryption'] ||= nil
Settings.backup['upload']['storage_class'] ||= nil Settings.backup['upload']['storage_class'] ||= nil
#
# Pseudonymizer
#
# Defaults for the pseudonymized data dump exporter (see config/gitlab.yml).
Settings['pseudonymizer'] ||= Settingslogic.new({})
Settings.pseudonymizer['manifest'] ||= "lib/pseudonymity/manifest.yml"
Settings.pseudonymizer['upload'] ||= Settingslogic.new({ 'remote_directory' => nil, 'connection' => nil })
# Settings.pseudonymizer['upload']['multipart_chunk_size'] ||= 104857600
# #
# Git # Git
# #
......
...@@ -156,7 +156,6 @@ tables: ...@@ -156,7 +156,6 @@ tables:
- last_edited_by_id - last_edited_by_id
- discussion_locked - discussion_locked
- closed_at - closed_at
- closed_by_id
pseudo: pseudo:
- id - id
- title - title
...@@ -487,8 +486,6 @@ tables: ...@@ -487,8 +486,6 @@ tables:
- merge_merge_request - merge_merge_request
- failed_pipeline - failed_pipeline
- success_pipeline - success_pipeline
- push_to_merge_request
- issue_due
pseudo: pseudo:
- id - id
- user_id - user_id
...@@ -509,8 +506,6 @@ tables: ...@@ -509,8 +506,6 @@ tables:
- merge_merge_request - merge_merge_request
- failed_pipeline - failed_pipeline
- success_pipeline - success_pipeline
- push_to_merge_request
- issue_due
project_authorizations: project_authorizations:
whitelist: whitelist:
- user_id - user_id
...@@ -535,15 +530,6 @@ tables: ...@@ -535,15 +530,6 @@ tables:
- updated_at - updated_at
- enabled - enabled
- domain - domain
project_ci_cd_settings:
whitelist:
- id
- project_id
- group_runners_enabled
pseudo:
- id
- project_id
- group_runners_enabled
project_custom_attributes: project_custom_attributes:
whitelist: whitelist:
- id - id
...@@ -559,17 +545,6 @@ tables: ...@@ -559,17 +545,6 @@ tables:
- project_id - project_id
- key - key
- value - value
project_deploy_tokens:
whitelist:
- id
- project_id
- deploy_token_id
- created_at
pseudo:
- id
- project_id
- deploy_token_id
- created_at
project_features: project_features:
whitelist: whitelist:
- id - id
...@@ -750,7 +725,6 @@ tables: ...@@ -750,7 +725,6 @@ tables:
- mirror_overwrites_diverged_branches - mirror_overwrites_diverged_branches
- external_authorization_classification_label - external_authorization_classification_label
- external_webhook_token - external_webhook_token
- pages_https_only
pseudo: pseudo:
- id - id
- name - name
...@@ -820,7 +794,6 @@ tables: ...@@ -820,7 +794,6 @@ tables:
- mirror_overwrites_diverged_branches - mirror_overwrites_diverged_branches
- external_authorization_classification_label - external_authorization_classification_label
- external_webhook_token - external_webhook_token
- pages_https_only
subscriptions: subscriptions:
whitelist: whitelist:
- id - id
...@@ -932,4 +905,4 @@ tables: ...@@ -932,4 +905,4 @@ tables:
- two_factor_grace_period - two_factor_grace_period
- ghost - ghost
- rss_token - rss_token
- theme_id - theme_id
\ No newline at end of file
...@@ -27,27 +27,26 @@ module Pseudonymity ...@@ -27,27 +27,26 @@ module Pseudonymity
class Table class Table
attr_accessor :config attr_accessor :config
attr_accessor :output_dir
def initialize def initialize
@config = {} @config = parse_config
@csv_output = "" @output_dir = ""
parse_config
@schema = {} @schema = {}
@output_files = [] @output_files = []
end end
def tables_to_csv def tables_to_csv
tables = config["tables"] tables = config["tables"]
@csv_output = config["output"]["csv"]
unless File.directory?(@csv_output) @output_dir = File.join("/tmp/", SecureRandom.hex)
raise "No such directory #{@csv_output}" Dir.mkdir(@output_dir) unless File.directory?(@output_dir)
end
new_tables = tables.map do |k, v| new_tables = tables.map do |k, v|
@schema[k] = {} @schema[k] = {}
table_to_csv(k, v["whitelist"], v["pseudo"]) table_to_csv(k, v["whitelist"], v["pseudo"])
end end
schema_to_yml schema_to_yml
file_list_to_json file_list_to_json
new_tables new_tables
...@@ -57,7 +56,7 @@ module Pseudonymity ...@@ -57,7 +56,7 @@ module Pseudonymity
file_timestamp = filename || "#{prefix}_#{Time.now.to_i}" file_timestamp = filename || "#{prefix}_#{Time.now.to_i}"
file_timestamp = "#{file_timestamp}.#{ext}" file_timestamp = "#{file_timestamp}.#{ext}"
@output_files << file_timestamp @output_files << file_timestamp
File.join(@csv_output, file_timestamp) File.join(@output_dir, file_timestamp)
end end
def schema_to_yml def schema_to_yml
...@@ -103,10 +102,11 @@ module Pseudonymity ...@@ -103,10 +102,11 @@ module Pseudonymity
end end
def parse_config
  # Loads the pseudonymization manifest; its path is configured in
  # gitlab.yml under `pseudonymizer.manifest` (no longer hard-coded).
  # NOTE(review): de-garbled from a fused old+new diff render; this is
  # the new-side (post-commit) implementation.
  YAML.load_file(Rails.root.join(Gitlab.config.pseudonymizer.manifest))
end
def write_to_csv_file(title, contents) def write_to_csv_file(title, contents)
Rails.logger.info "Writing #{title} ..."
file_path = get_and_log_file_name("csv", title) file_path = get_and_log_file_name("csv", title)
column_names = contents.first.keys column_names = contents.first.keys
contents = CSV.generate do |csv| contents = CSV.generate do |csv|
......
module Pseudonymity
  # Uploads the pseudonymized CSV/YML dump produced by Pseudonymity::Table
  # to Fog-backed remote object storage, then cleans up the local files.
  class UploadService
    # Raised when the `pseudonymizer.upload.connection` settings are blank.
    RemoteStorageUnavailableError = Class.new(StandardError)

    # output_dir - local directory holding the generated files to upload.
    # progress   - IO-like object responding to #print / #puts for status output.
    def initialize(output_dir, progress)
      @progress = progress
      @output_dir = output_dir
    end

    # Uploads every generated file to the configured remote directory.
    def upload
      progress.print "Uploading backup archive to remote storage #{remote_directory} ... "

      file_list.each do |file|
        upload_file(file, remote_directory)
      end
    end

    # Uploads a single local file into the given Fog directory.
    def upload_file(file, directory)
      progress.print "\tUploading #{file} ... "
      if directory.files.create(key: File.basename(file), body: File.open(file), public: false)
        progress.puts "done".color(:green)
      else
        # Report failure through the progress IO so the message is not lost
        # when running under cron (was a bare `puts` to stdout).
        progress.puts "uploading CSV to #{remote_directory} failed".color(:red)
      end
    end

    # Removes the temporary output directory; no-op when it does not exist.
    def cleanup
      progress.print "Deleting tmp directory #{@output_dir} ... "
      return unless File.exist?(@output_dir)

      if FileUtils.rm_rf(@output_dir)
        progress.puts "done".color(:green)
      else
        progress.puts "failed".color(:red)
      end
    end

    private

    # Reader for the injected progress IO. Without this, the bare `progress`
    # calls above raise NameError — only @progress was ever assigned.
    attr_reader :progress

    def config
      Gitlab.config.pseudonymizer
    end

    # Resolves (and, for local storage, lazily creates) the remote directory.
    # Raises RemoteStorageUnavailableError when no connection is configured.
    def remote_directory
      connection_settings = config.upload.connection
      if connection_settings.blank?
        progress.puts "Cannot upload files, make sure the `pseudonymizer.upload.connection` is set properly".color(:red)
        raise RemoteStorageUnavailableError.new(connection_settings)
      end

      connect_to_remote_directory(connection_settings)
    end

    def connect_to_remote_directory(connection_settings)
      # our settings use string keys, but Fog expects symbols
      connection = ::Fog::Storage.new(connection_settings.symbolize_keys)
      remote_dir = config.upload.remote_directory

      # We only attempt to create the directory for local backups. For AWS
      # and other cloud providers, we cannot guarantee the user will have
      # permission to create the bucket.
      if connection.service == ::Fog::Storage::Local
        connection.directories.create(key: remote_dir)
      else
        connection.directories.get(remote_dir)
      end
    end

    # All files the dump step produced in the output directory.
    def file_list
      Dir[File.join(@output_dir, "*.{csv,yml}")]
    end
  end
end
...@@ -78,6 +78,21 @@ namespace :gitlab do ...@@ -78,6 +78,21 @@ namespace :gitlab do
task pseudonymity_dump: :environment do task pseudonymity_dump: :environment do
table = Pseudonymity::Table.new table = Pseudonymity::Table.new
table.tables_to_csv table.tables_to_csv
upload = Pseudonymity::UploadService.new(table.output_dir, progress)
upload.upload
upload.cleanup
end
# IO object for rake-task progress output: a throwaway StringIO when run
# from cron (suppresses console noise), $stdout when run interactively.
# NOTE(review): closing line was a fused old+new diff render ("end end");
# de-garbled to the single `end` of the new-side code.
def progress
  if ENV['CRON']
    # We need an object we can say 'puts' and 'print' to; let's use a
    # StringIO.
    require 'stringio'
    StringIO.new
  else
    $stdout
  end
end
end end
end end
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment