Commit 9c010a97 authored by Micaël Bergeron

apply feedback

parent a4b43b89
...@@ -7,7 +7,7 @@ class PseudonymizerWorker ...@@ -7,7 +7,7 @@ class PseudonymizerWorker
options = Pseudonymizer::Options.new( options = Pseudonymizer::Options.new(
config: YAML.load_file(Rails.root.join(Gitlab.config.pseudonymizer.manifest)), config: YAML.load_file(Rails.root.join(Gitlab.config.pseudonymizer.manifest)),
start_at: Time.now.utc output_dir: ENV['PSEUDONYMIZER_OUTPUT_DIR']
) )
dumper = Pseudonymizer::Dumper.new(options) dumper = Pseudonymizer::Dumper.new(options)
......
...@@ -4,7 +4,7 @@ require 'csv' ...@@ -4,7 +4,7 @@ require 'csv'
require 'yaml' require 'yaml'
module Pseudonymizer module Pseudonymizer
PAGE_SIZE = 10000 PAGE_SIZE = ENV.fetch('PSEUDONYMIZER_BATCH', 100_000)
class Anon class Anon
def initialize(fields) def initialize(fields)
...@@ -38,45 +38,57 @@ module Pseudonymizer ...@@ -38,45 +38,57 @@ module Pseudonymizer
@output_dir = options.output_dir @output_dir = options.output_dir
@start_at = options.start_at @start_at = options.start_at
reset!
end
def reset!
@schema = Hash.new { |h, k| h[k] = {} } @schema = Hash.new { |h, k| h[k] = {} }
@output_files = [] @output_files = []
end end
def tables_to_csv def tables_to_csv
tables = config["tables"] reset!
tables = config["tables"]
FileUtils.mkdir_p(output_dir) unless File.directory?(output_dir) FileUtils.mkdir_p(output_dir) unless File.directory?(output_dir)
schema_to_yml schema_to_yml
file_list_to_json @output_files = tables.map do |k, v|
tables.map do |k, v|
table_to_csv(k, v['whitelist'], v['pseudo']) table_to_csv(k, v['whitelist'], v['pseudo'])
end end
file_list_to_json
@output_files
end end
private private
def get_and_log_file_name(ext, prefix = nil, filename = nil) def output_filename(basename = nil, ext = "csv.gz")
file_timestamp = filename || "#{prefix}_#{@start_at.to_i}" file_timestamp = "#{basename}.#{ext}"
file_timestamp = "#{file_timestamp}.#{ext}"
@output_files << file_timestamp
File.join(output_dir, file_timestamp) File.join(output_dir, file_timestamp)
end end
def schema_to_yml def schema_to_yml
file_path = get_and_log_file_name("yml", "schema") file_path = output_filename("schema", "yml")
File.open(file_path, 'w') { |file| file.write(@schema.to_yaml) } File.open(file_path, 'w') { |file| file.write(@schema.to_yaml) }
end end
def file_list_to_json def file_list_to_json
file_path = get_and_log_file_name("json", nil, "file_list") file_path = output_filename("file_list", "json")
File.open(file_path, 'w') { |file| file.write(@output_files.to_json) } File.open(file_path, 'w') do |file|
relative_files = @output_files.map(&File.method(:basename))
file.write(relative_files.to_json)
end
end end
def table_to_csv(table, whitelist_columns, pseudonymity_columns) def table_to_csv(table, whitelist_columns, pseudonymity_columns)
table_to_schema(table) table_to_schema(table)
write_to_csv_file(table, table_page_results(table, whitelist_columns, pseudonymity_columns)) write_to_csv_file(
table,
table_page_results(table,
whitelist_columns,
pseudonymity_columns)
)
rescue => e rescue => e
Rails.logger.error("Failed to export #{table}: #{e}") Rails.logger.error("Failed to export #{table}: #{e}")
end end
...@@ -134,15 +146,16 @@ module Pseudonymizer ...@@ -134,15 +146,16 @@ module Pseudonymizer
end end
def write_to_csv_file(table, contents) def write_to_csv_file(table, contents)
file_path = get_and_log_file_name("csv", table) file_path = output_filename(table, "csv.gz")
Rails.logger.info "#{self.class.name} writing #{table} to #{file_path}." Rails.logger.info "#{self.class.name} writing #{table} to #{file_path}."
CSV.open(file_path, 'w') do |csv| Zlib::GzipWriter.open(file_path) do |io|
csv = CSV.new(io)
contents.with_index do |row, i| contents.with_index do |row, i|
csv << row.keys if i == 0 # header csv << row.keys if i == 0 # header
csv << row.values csv << row.values
csv.flush if i % PAGE_SIZE
end end
csv.close
end end
file_path file_path
......
...@@ -96,15 +96,8 @@ tables: ...@@ -96,15 +96,8 @@ tables:
- author_id - author_id
- assignee_id - assignee_id
- iid - iid
- cached_markdown_version
- updated_by_id - updated_by_id
- last_edited_by_id - last_edited_by_id
- lock_version
- start_date
- end_date
- last_edited_at
- created_at
- updated_at
issue_assignees: issue_assignees:
whitelist: whitelist:
- user_id - user_id
...@@ -208,8 +201,6 @@ tables: ...@@ -208,8 +201,6 @@ tables:
- title - title
- color - color
- project_id - project_id
- created_at
- updated_at
- template - template
- type - type
- group_id - group_id
...@@ -423,12 +414,10 @@ tables: ...@@ -423,12 +414,10 @@ tables:
- created_at - created_at
- updated_at - updated_at
- project_id - project_id
- attachment
- line_code - line_code
- commit_id - commit_id
- noteable_id - noteable_id
- system - system
- st_diff
- updated_by_id - updated_by_id
- type - type
- position - position
...@@ -436,35 +425,18 @@ tables: ...@@ -436,35 +425,18 @@ tables:
- resolved_at - resolved_at
- resolved_by_id - resolved_by_id
- discussion_id - discussion_id
- note_html
- cached_markdown_version
- change_position - change_position
- resolved_by_push - resolved_by_push
pseudo: pseudo:
- id - id
- note - note
- noteable_type
- author_id - author_id
- created_at
- updated_at
- project_id - project_id
- attachment
- line_code
- commit_id - commit_id
- noteable_id - noteable_id
- system
- st_diff
- updated_by_id - updated_by_id
- type
- position
- original_position
- resolved_at
- resolved_by_id - resolved_by_id
- discussion_id - discussion_id
- note_html
- cached_markdown_version
- change_position
- resolved_by_push
notification_settings: notification_settings:
whitelist: whitelist:
- id - id
...@@ -492,8 +464,6 @@ tables: ...@@ -492,8 +464,6 @@ tables:
- source_id - source_id
- source_type - source_type
- level - level
- created_at
- updated_at
- new_note - new_note
- new_issue - new_issue
- reopen_issue - reopen_issue
...@@ -526,8 +496,6 @@ tables: ...@@ -526,8 +496,6 @@ tables:
pseudo: pseudo:
- id - id
- project_id - project_id
- created_at
- updated_at
- enabled - enabled
- domain - domain
project_custom_attributes: project_custom_attributes:
...@@ -540,8 +508,6 @@ tables: ...@@ -540,8 +508,6 @@ tables:
- value - value
pseudo: pseudo:
- id - id
- created_at
- updated_at
- project_id - project_id
- key - key
- value - value
...@@ -565,8 +531,6 @@ tables: ...@@ -565,8 +531,6 @@ tables:
- wiki_access_level - wiki_access_level
- snippets_access_level - snippets_access_level
- builds_access_level - builds_access_level
- created_at
- updated_at
- repository_access_level - repository_access_level
project_group_links: project_group_links:
whitelist: whitelist:
...@@ -581,8 +545,6 @@ tables: ...@@ -581,8 +545,6 @@ tables:
- id - id
- project_id - project_id
- group_id - group_id
- created_at
- updated_at
- group_access - group_access
- expires_at - expires_at
project_import_data: project_import_data:
...@@ -615,8 +577,6 @@ tables: ...@@ -615,8 +577,6 @@ tables:
- last_update_started_at - last_update_started_at
- last_update_scheduled_at - last_update_scheduled_at
- next_execution_timestamp - next_execution_timestamp
- created_at
- updated_at
project_repository_states: project_repository_states:
whitelist: whitelist:
- id - id
...@@ -730,8 +690,6 @@ tables: ...@@ -730,8 +690,6 @@ tables:
- name - name
- path - path
- description - description
- created_at
- updated_at
- creator_id - creator_id
- namespace_id - namespace_id
- last_activity_at - last_activity_at
...@@ -875,7 +833,6 @@ tables: ...@@ -875,7 +833,6 @@ tables:
pseudo: pseudo:
- id - id
- email - email
- remember_created_at
- current_sign_in_ip - current_sign_in_ip
- last_sign_in_ip - last_sign_in_ip
- name - name
...@@ -897,12 +854,10 @@ tables: ...@@ -897,12 +854,10 @@ tables:
- hide_project_limit - hide_project_limit
- note - note
- unlock_token - unlock_token
- otp_grace_period_started_at
- external - external
- incoming_email_token - incoming_email_token
- organization - organization
- auditor - auditor
- two_factor_grace_period - two_factor_grace_period
- ghost
- rss_token - rss_token
- theme_id - theme_id
...@@ -2,14 +2,14 @@ module Pseudonymizer ...@@ -2,14 +2,14 @@ module Pseudonymizer
class Options class Options
attr_reader :config attr_reader :config
attr_reader :start_at attr_reader :start_at
attr_reader :output_dir
def initialize(config: {}) def initialize(config: {}, output_dir: nil)
@config = config @config = config
@start_at = Time.now.utc @start_at = Time.now.utc
end
def output_dir base_dir = output_dir || File.join(Dir.tmpdir, 'gitlab-pseudonymizer')
File.join(Dir.tmpdir, 'gitlab-pseudonymizer', start_at.iso8601) @output_dir = File.join(base_dir, start_at.iso8601)
end end
def upload_dir def upload_dir
......
...@@ -76,7 +76,8 @@ namespace :gitlab do ...@@ -76,7 +76,8 @@ namespace :gitlab do
abort "The pseudonymizer is disabled." unless Gitlab::CurrentSettings.pseudonymizer_enabled? abort "The pseudonymizer is disabled." unless Gitlab::CurrentSettings.pseudonymizer_enabled?
options = Pseudonymizer::Options.new( options = Pseudonymizer::Options.new(
config: YAML.load_file(Rails.root.join(Gitlab.config.pseudonymizer.manifest)) config: YAML.load_file(Rails.root.join(Gitlab.config.pseudonymizer.manifest)),
output_dir: ENV['PSEUDONYMIZER_OUTPUT_DIR']
) )
dumper = Pseudonymizer::Dumper.new(options) dumper = Pseudonymizer::Dumper.new(options)
......
...@@ -20,9 +20,10 @@ describe Pseudonymizer::Dumper do ...@@ -20,9 +20,10 @@ describe Pseudonymizer::Dumper do
describe 'Pseudo tables' do describe 'Pseudo tables' do
it 'outputs project tables to csv' do it 'outputs project tables to csv' do
column_names = %w(id name path description)
pseudo.config["tables"] = { pseudo.config["tables"] = {
"projects" => { "projects" => {
"whitelist" => %w(id name path description), "whitelist" => column_names,
"pseudo" => %w(id) "pseudo" => %w(id)
} }
} }
...@@ -31,26 +32,52 @@ describe Pseudonymizer::Dumper do ...@@ -31,26 +32,52 @@ describe Pseudonymizer::Dumper do
# grab the first table it outputs. There would only be 1. # grab the first table it outputs. There would only be 1.
project_table_file = pseudo.tables_to_csv[0] project_table_file = pseudo.tables_to_csv[0]
expect(project_table_file).to include("projects.csv.gz")
expect(project_table_file.include? "projects_").to be true
expect(project_table_file.include? ".csv").to be true
columns = [] columns = []
project_data = [] project_data = []
File.foreach(project_table_file).with_index do |line, line_num| Zlib::GzipReader.open(project_table_file) do |gz|
if line_num == 0 csv = CSV.new(gz, headers: true)
columns = line.split(",") # csv.shift # read the header row
elsif line_num == 1 project_data = csv.gets
project_data = line.split(",") columns = csv.headers
break
end
end end
# check if CSV columns are correct # check if CSV columns are correct
expect(columns.to_set).to eq(%W(id name path description\n).to_set) expect(columns).to include(*column_names)
# is it pseudonymous # is it pseudonymous
expect(project_data[0]).not_to eq(1)
# sha 256 is 64 chars in length # sha 256 is 64 chars in length
expect(project_data[0].length).to eq(64) expect(project_data["id"].length).to eq(64)
end
end
describe "manifest is valid" do
it "all tables exist" do
existing_tables = ActiveRecord::Base.connection.tables
tables = options.config['tables'].keys
expect(existing_tables).to include(*tables)
end
it "all whitelisted attributes exist" do
options.config['tables'].each do |table, table_def|
whitelisted = table_def['whitelist']
existing_columns = ActiveRecord::Base.connection.columns(table.to_sym).map(&:name)
diff = whitelisted - existing_columns
expect(diff).to be_empty, "#{table} should define columns #{whitelisted.inspect}: missing #{diff.inspect}"
end
end
it "all pseudonymized attributes are whitelisted" do
options.config['tables'].each do |table, table_def|
whitelisted = table_def['whitelist']
pseudonymized = table_def['pseudo']
diff = pseudonymized - whitelisted
expect(diff).to be_empty, "#{table} should whitelist columns #{pseudonymized.inspect}: missing #{diff.inspect}"
end
end end
end end
end end
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment