Commit 7525a79f authored by Micaël Bergeron's avatar Micaël Bergeron

apply feedback

parent 1c16cdec
......@@ -734,7 +734,7 @@ production: &base
pseudonymizer:
enabled: false
# Tables manifest that specifies the fields to extract and pseudonymize.
manifest: lib/pseudonymizer/manifest.yml
manifest: config/pseudonymizer.yml
upload:
# remote_directory: 'gitlab-elt'
# Fog storage connection settings, see http://fog.io/storage/ .
......@@ -897,7 +897,7 @@ test:
path: tmp/tests/backups
pseudonymizer:
enabled: false
manifest: lib/pseudonymizer/manifest.yml
manifest: config/pseudonymizer.yml
upload:
# The remote 'directory' to store the CSV files. For S3, this would be the bucket name.
remote_directory: gitlab-elt.test
......
......@@ -479,7 +479,7 @@ Settings.backup['upload']['storage_class'] ||= nil
#
Settings['pseudonymizer'] ||= Settingslogic.new({})
Settings.pseudonymizer['enabled'] = false if Settings.pseudonymizer['enabled'].nil?
Settings.pseudonymizer['manifest'] = Settings.absolute(Settings.pseudonymizer['manifest'] || Rails.root.join("lib/pseudonymizer/manifest.yml"))
Settings.pseudonymizer['manifest'] = Settings.absolute(Settings.pseudonymizer['manifest'] || Rails.root.join("config/pseudonymizer.yml"))
Settings.pseudonymizer['upload'] ||= Settingslogic.new({ 'remote_directory' => nil, 'connection' => nil })
# Settings.pseudonymizer['upload']['multipart_chunk_size'] ||= 104857600
......
......@@ -98,6 +98,8 @@ tables:
- iid
- updated_by_id
- last_edited_by_id
- title
- description
issue_assignees:
whitelist:
- user_id
......@@ -617,7 +619,6 @@ tables:
- has_external_wiki
- ci_config_path
- lfs_enabled
- description_html
- only_allow_merge_if_all_discussions_are_resolved
- repository_size_limit
- printing_merge_request_link_enabled
......@@ -670,7 +671,6 @@ tables:
- repository_storage
- repository_read_only
- ci_config_path
- description_html
- only_allow_merge_if_all_discussions_are_resolved
- repository_size_limit
- auto_cancel_pending_pipelines
......
......@@ -21,7 +21,7 @@ be textually exported. This ensures that:
To configure the pseudonymizer, you need to:
- Provide a manifest file that describes which fields should be included or
pseudonymized ([example `manifest.yml` file]()).
pseudonymized ([example `manifest.yml` file](https://gitlab.com/gitlab-org/gitlab-ee/tree/master/config/pseudonymizer.yml)).
- Use an object storage
**For Omnibus installations:**
......@@ -31,7 +31,7 @@ To configure the pseudonymizer, you need to:
```ruby
gitlab_rails['pseudonymizer_enabled'] = true
gitlab_rails['pseudonymizer_manifest'] = 'lib/pseudonymizer/manifest.yml'
gitlab_rails['pseudonymizer_manifest'] = 'config/pseudonymizer.yml'
gitlab_rails['pseudonymizer_upload_remote_directory'] = 'gitlab-elt'
gitlab_rails['pseudonymizer_upload_connection'] = {
'provider' => 'AWS',
......@@ -65,7 +65,7 @@ To configure the pseudonymizer, you need to:
```yaml
pseudonymizer:
enabled: true
manifest: lib/pseudonymizer/manifest.yml
manifest: config/pseudonymizer.yml
upload:
remote_directory: 'gitlab-elt' # The bucket name
connection:
......
......@@ -35,8 +35,12 @@ module EE
"and the value is encrypted at rest.")
end
# Translated label shown next to the checkbox that turns the
# Pseudonymizer data export on or off.
#
# Returns the localized String produced by the gettext helper `_`.
def pseudonymizer_enabled_help_text
  _('Enable Pseudonymizer data export')
end
def pseudonymizer_description_text
_("GitLab will run the pseudonymizer cron job which will send pseudoanonymized data to be processed and analyzed.")
_("GitLab will run the pseudonymizer cron job which will output pseudoanonymized data to be processed and analyzed.")
end
def pseudonymizer_disabled_description_text
......
......@@ -8,7 +8,7 @@
.form-check
= f.label :pseudonymizer_enabled do
= f.check_box :pseudonymizer_enabled
Enable Pseudonymizer Cron Job
= pseudonymizer_enabled_help_text
.form-text.text-muted
- if is_enabled
= pseudonymizer_description_text
......
......@@ -69,19 +69,18 @@ module Pseudonymizer
end
class Anon
def initialize(fields)
@anon_fields = fields
def initialize(table, whitelisted_fields, pseudonymized_fields)
@table = table
@pseudo_fields = pseudo_fields(whitelisted_fields, pseudonymized_fields)
end
def anonymize(results)
columns = results.columns # Assume they all have the same table
to_filter = @anon_fields & columns
key = Rails.application.secrets[:secret_key_base]
digest = OpenSSL::Digest.new('sha256')
Enumerator.new do |yielder|
results.each do |result|
to_filter.each do |field|
@pseudo_fields.each do |field|
next if result[field].nil?
result[field] = OpenSSL::HMAC.hexdigest(digest, key, String(result[field]))
......@@ -90,6 +89,17 @@ module Pseudonymizer
end
end
end
private
# Work out which columns of @table will actually be pseudonymized.
#
# A column listed as pseudonymized but missing from the whitelist is
# reported through Rails.logger.warn (one message per column) and is
# left out of the result.
#
# whitelisted   - Array of whitelisted column names.
# pseudonymized - Array of column names requested for pseudonymization.
#
# Returns the Array of column names present in both lists, in the order
# they appear in `pseudonymized`.
def pseudo_fields(whitelisted, pseudonymized)
  (pseudonymized - whitelisted).each do |extraneous|
    Rails.logger.warn("#{self.class.name} extraneous pseudo: #{@table}.#{extraneous} is not whitelisted and will be ignored.")
  end

  pseudonymized & whitelisted
end
end
class Dumper
......@@ -155,7 +165,7 @@ module Pseudonymizer
# yield every result, paginated and anonymized
def table_page_results(table, whitelist_columns, pseudonymity_columns)
anonymizer = Anon.new(pseudonymity_columns)
anonymizer = Anon.new(table, whitelist_columns, pseudonymity_columns)
pager = Pager.new(table, whitelist_columns)
Enumerator.new do |yielder|
......@@ -168,18 +178,17 @@ module Pseudonymizer
end
def table_to_schema(table)
whitelisted = ->(table) { @config.dig(:tables, table, :whitelist) }
pseudonymized = ->(table) { @config.dig(:tables, table, :pseudo) }
table_config = @config.dig(:tables, table)
type_results = ActiveRecord::Base.connection.columns(table)
type_results = type_results.select do |c|
whitelisted[table].include?(c.name)
table_config[:whitelist].include?(c.name)
end
type_results = type_results.map do |c|
data_type = c.sql_type
if pseudonymized[table].include?(c.name)
if table_config[:pseudo].include?(c.name)
data_type = "character varying"
end
......
......@@ -9,10 +9,16 @@ module Pseudonymizer
@start_at = Time.now.utc
base_dir = output_dir || File.join(Dir.tmpdir, 'gitlab-pseudonymizer')
@output_dir = File.join(base_dir, start_at.iso8601)
@output_dir = File.join(base_dir, batch_dir)
end
# Public accessor for the batch directory name (delegates to the private
# batch_dir helper).
# NOTE(review): presumably used as the remote path when uploading the
# generated files — confirm against the uploader.
def upload_dir
batch_dir
end
private
# Directory name for this run: start_at rendered as an ISO 8601 string.
# The constructor joins this onto the base directory to build @output_dir.
# NOTE(review): start_at is presumably a reader for the @start_at set in
# the constructor (Time.now.utc) — confirm.
def batch_dir
start_at.iso8601
end
end
......
......@@ -32,7 +32,7 @@ describe Pseudonymizer::Dumper do
# grab the first table it outputs. There would only be 1.
project_table_file = pseudo.tables_to_csv[0]
expect(project_table_file).to include("projects.csv.gz")
expect(project_table_file).to end_with("projects.csv.gz")
columns = []
project_data = []
......@@ -50,6 +50,20 @@ describe Pseudonymizer::Dumper do
# a SHA-256 hex digest is 64 characters long
expect(project_data["id"].length).to eq(64)
end
it "warns when pseudonymized fields are extraneous" do
  # "extraneous" is requested under :pseudo but is absent from :whitelist,
  # so running the export is expected to log a warning that mentions it.
  pseudo.config[:tables] = {
    projects: {
      whitelist: %w(id name path description),
      pseudo: %w(id extraneous)
    }
  }

  expect(Rails.logger).to receive(:warn).with(/extraneous/)

  pseudo.tables_to_csv
end
end
describe "manifest is valid" do
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment