Commit 7525a79f authored by Micaël Bergeron

apply feedback

parent 1c16cdec
...@@ -734,7 +734,7 @@ production: &base ...@@ -734,7 +734,7 @@ production: &base
pseudonymizer: pseudonymizer:
enabled: false enabled: false
# Tables manifest that specifies the fields to extract and pseudonymize. # Tables manifest that specifies the fields to extract and pseudonymize.
manifest: lib/pseudonymizer/manifest.yml manifest: config/pseudonymizer.yml
upload: upload:
# remote_directory: 'gitlab-elt' # remote_directory: 'gitlab-elt'
# Fog storage connection settings, see http://fog.io/storage/ . # Fog storage connection settings, see http://fog.io/storage/ .
...@@ -897,7 +897,7 @@ test: ...@@ -897,7 +897,7 @@ test:
path: tmp/tests/backups path: tmp/tests/backups
pseudonymizer: pseudonymizer:
enabled: false enabled: false
manifest: lib/pseudonymizer/manifest.yml manifest: config/pseudonymizer.yml
upload: upload:
# The remote 'directory' to store the CSV files. For S3, this would be the bucket name. # The remote 'directory' to store the CSV files. For S3, this would be the bucket name.
remote_directory: gitlab-elt.test remote_directory: gitlab-elt.test
......
...@@ -479,7 +479,7 @@ Settings.backup['upload']['storage_class'] ||= nil ...@@ -479,7 +479,7 @@ Settings.backup['upload']['storage_class'] ||= nil
# #
Settings['pseudonymizer'] ||= Settingslogic.new({}) Settings['pseudonymizer'] ||= Settingslogic.new({})
Settings.pseudonymizer['enabled'] = false if Settings.pseudonymizer['enabled'].nil? Settings.pseudonymizer['enabled'] = false if Settings.pseudonymizer['enabled'].nil?
Settings.pseudonymizer['manifest'] = Settings.absolute(Settings.pseudonymizer['manifest'] || Rails.root.join("lib/pseudonymizer/manifest.yml")) Settings.pseudonymizer['manifest'] = Settings.absolute(Settings.pseudonymizer['manifest'] || Rails.root.join("config/pseudonymizer.yml"))
Settings.pseudonymizer['upload'] ||= Settingslogic.new({ 'remote_directory' => nil, 'connection' => nil }) Settings.pseudonymizer['upload'] ||= Settingslogic.new({ 'remote_directory' => nil, 'connection' => nil })
# Settings.pseudonymizer['upload']['multipart_chunk_size'] ||= 104857600 # Settings.pseudonymizer['upload']['multipart_chunk_size'] ||= 104857600
......
...@@ -98,6 +98,8 @@ tables: ...@@ -98,6 +98,8 @@ tables:
- iid - iid
- updated_by_id - updated_by_id
- last_edited_by_id - last_edited_by_id
- title
- description
issue_assignees: issue_assignees:
whitelist: whitelist:
- user_id - user_id
...@@ -617,7 +619,6 @@ tables: ...@@ -617,7 +619,6 @@ tables:
- has_external_wiki - has_external_wiki
- ci_config_path - ci_config_path
- lfs_enabled - lfs_enabled
- description_html
- only_allow_merge_if_all_discussions_are_resolved - only_allow_merge_if_all_discussions_are_resolved
- repository_size_limit - repository_size_limit
- printing_merge_request_link_enabled - printing_merge_request_link_enabled
...@@ -670,7 +671,6 @@ tables: ...@@ -670,7 +671,6 @@ tables:
- repository_storage - repository_storage
- repository_read_only - repository_read_only
- ci_config_path - ci_config_path
- description_html
- only_allow_merge_if_all_discussions_are_resolved - only_allow_merge_if_all_discussions_are_resolved
- repository_size_limit - repository_size_limit
- auto_cancel_pending_pipelines - auto_cancel_pending_pipelines
......
...@@ -21,7 +21,7 @@ be textually exported. This ensures that: ...@@ -21,7 +21,7 @@ be textually exported. This ensures that:
To configure the pseudonymizer, you need to: To configure the pseudonymizer, you need to:
- Provide a manifest file that describes which fields should be included or - Provide a manifest file that describes which fields should be included or
pseudonymized ([example `manifest.yml` file]()). pseudonymized ([example `manifest.yml` file](https://gitlab.com/gitlab-org/gitlab-ee/tree/master/config/pseudonymizer.yml)).
- Use an object storage - Use an object storage
**For Omnibus installations:** **For Omnibus installations:**
...@@ -31,7 +31,7 @@ To configure the pseudonymizer, you need to: ...@@ -31,7 +31,7 @@ To configure the pseudonymizer, you need to:
```ruby ```ruby
gitlab_rails['pseudonymizer_enabled'] = true gitlab_rails['pseudonymizer_enabled'] = true
gitlab_rails['pseudonymizer_manifest'] = 'lib/pseudonymizer/manifest.yml' gitlab_rails['pseudonymizer_manifest'] = 'config/pseudonymizer.yml'
gitlab_rails['pseudonymizer_upload_remote_directory'] = 'gitlab-elt' gitlab_rails['pseudonymizer_upload_remote_directory'] = 'gitlab-elt'
gitlab_rails['pseudonymizer_upload_connection'] = { gitlab_rails['pseudonymizer_upload_connection'] = {
'provider' => 'AWS', 'provider' => 'AWS',
...@@ -65,7 +65,7 @@ To configure the pseudonymizer, you need to: ...@@ -65,7 +65,7 @@ To configure the pseudonymizer, you need to:
```yaml ```yaml
pseudonymizer: pseudonymizer:
enabled: true enabled: true
manifest: lib/pseudonymizer/manifest.yml manifest: config/pseudonymizer.yml
upload: upload:
remote_directory: 'gitlab-elt' # The bucket name remote_directory: 'gitlab-elt' # The bucket name
connection: connection:
......
...@@ -35,8 +35,12 @@ module EE ...@@ -35,8 +35,12 @@ module EE
"and the value is encrypted at rest.") "and the value is encrypted at rest.")
end end
def pseudonymizer_enabled_help_text
_("Enable Pseudonymizer data export")
end
def pseudonymizer_description_text def pseudonymizer_description_text
_("GitLab will run the pseudonymizer cron job which will send pseudoanonymized data to be processed and analyzed.") _("GitLab will run the pseudonymizer cron job which will output pseudoanonymized data to be processed and analyzed.")
end end
def pseudonymizer_disabled_description_text def pseudonymizer_disabled_description_text
......
...@@ -8,7 +8,7 @@ ...@@ -8,7 +8,7 @@
.form-check .form-check
= f.label :pseudonymizer_enabled do = f.label :pseudonymizer_enabled do
= f.check_box :pseudonymizer_enabled = f.check_box :pseudonymizer_enabled
Enable Pseudonymizer Cron Job = pseudonymizer_enabled_help_text
.form-text.text-muted .form-text.text-muted
- if is_enabled - if is_enabled
= pseudonymizer_description_text = pseudonymizer_description_text
......
...@@ -69,19 +69,18 @@ module Pseudonymizer ...@@ -69,19 +69,18 @@ module Pseudonymizer
end end
class Anon class Anon
def initialize(fields) def initialize(table, whitelisted_fields, pseudonymized_fields)
@anon_fields = fields @table = table
@pseudo_fields = pseudo_fields(whitelisted_fields, pseudonymized_fields)
end end
def anonymize(results) def anonymize(results)
columns = results.columns # Assume they all have the same table
to_filter = @anon_fields & columns
key = Rails.application.secrets[:secret_key_base] key = Rails.application.secrets[:secret_key_base]
digest = OpenSSL::Digest.new('sha256') digest = OpenSSL::Digest.new('sha256')
Enumerator.new do |yielder| Enumerator.new do |yielder|
results.each do |result| results.each do |result|
to_filter.each do |field| @pseudo_fields.each do |field|
next if result[field].nil? next if result[field].nil?
result[field] = OpenSSL::HMAC.hexdigest(digest, key, String(result[field])) result[field] = OpenSSL::HMAC.hexdigest(digest, key, String(result[field]))
...@@ -90,6 +89,17 @@ module Pseudonymizer ...@@ -90,6 +89,17 @@ module Pseudonymizer
end end
end end
end end
private
def pseudo_fields(whitelisted, pseudonymized)
pseudo_extra_fields = pseudonymized - whitelisted
pseudo_extra_fields.each do |field|
Rails.logger.warn("#{self.class.name} extraneous pseudo: #{@table}.#{field} is not whitelisted and will be ignored.")
end
pseudonymized & whitelisted
end
end end
class Dumper class Dumper
...@@ -155,7 +165,7 @@ module Pseudonymizer ...@@ -155,7 +165,7 @@ module Pseudonymizer
# yield every results, pagined, anonymized # yield every results, pagined, anonymized
def table_page_results(table, whitelist_columns, pseudonymity_columns) def table_page_results(table, whitelist_columns, pseudonymity_columns)
anonymizer = Anon.new(pseudonymity_columns) anonymizer = Anon.new(table, whitelist_columns, pseudonymity_columns)
pager = Pager.new(table, whitelist_columns) pager = Pager.new(table, whitelist_columns)
Enumerator.new do |yielder| Enumerator.new do |yielder|
...@@ -168,18 +178,17 @@ module Pseudonymizer ...@@ -168,18 +178,17 @@ module Pseudonymizer
end end
def table_to_schema(table) def table_to_schema(table)
whitelisted = ->(table) { @config.dig(:tables, table, :whitelist) } table_config = @config.dig(:tables, table)
pseudonymized = ->(table) { @config.dig(:tables, table, :pseudo) }
type_results = ActiveRecord::Base.connection.columns(table) type_results = ActiveRecord::Base.connection.columns(table)
type_results = type_results.select do |c| type_results = type_results.select do |c|
whitelisted[table].include?(c.name) table_config[:whitelist].include?(c.name)
end end
type_results = type_results.map do |c| type_results = type_results.map do |c|
data_type = c.sql_type data_type = c.sql_type
if pseudonymized[table].include?(c.name) if table_config[:pseudo].include?(c.name)
data_type = "character varying" data_type = "character varying"
end end
......
...@@ -9,10 +9,16 @@ module Pseudonymizer ...@@ -9,10 +9,16 @@ module Pseudonymizer
@start_at = Time.now.utc @start_at = Time.now.utc
base_dir = output_dir || File.join(Dir.tmpdir, 'gitlab-pseudonymizer') base_dir = output_dir || File.join(Dir.tmpdir, 'gitlab-pseudonymizer')
@output_dir = File.join(base_dir, start_at.iso8601) @output_dir = File.join(base_dir, batch_dir)
end end
def upload_dir def upload_dir
batch_dir
end
private
def batch_dir
start_at.iso8601 start_at.iso8601
end end
end end
......
...@@ -32,7 +32,7 @@ describe Pseudonymizer::Dumper do ...@@ -32,7 +32,7 @@ describe Pseudonymizer::Dumper do
# grab the first table it outputs. There would only be 1. # grab the first table it outputs. There would only be 1.
project_table_file = pseudo.tables_to_csv[0] project_table_file = pseudo.tables_to_csv[0]
expect(project_table_file).to include("projects.csv.gz") expect(project_table_file).to end_with("projects.csv.gz")
columns = [] columns = []
project_data = [] project_data = []
...@@ -50,6 +50,20 @@ describe Pseudonymizer::Dumper do ...@@ -50,6 +50,20 @@ describe Pseudonymizer::Dumper do
# sha 256 is 64 chars in length # sha 256 is 64 chars in length
expect(project_data["id"].length).to eq(64) expect(project_data["id"].length).to eq(64)
end end
it "warns when pseudonymized fields are extraneous" do
column_names = %w(id name path description)
pseudo.config[:tables] = {
projects: {
whitelist: column_names,
pseudo: %w(id extraneous)
}
}
expect(Rails.logger).to receive(:warn).with(/extraneous/)
pseudo.tables_to_csv
end
end end
describe "manifest is valid" do describe "manifest is valid" do
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment