Commit 9c010a97 authored by Micaël Bergeron

apply feedback

parent a4b43b89
......@@ -7,7 +7,7 @@ class PseudonymizerWorker
options = Pseudonymizer::Options.new(
config: YAML.load_file(Rails.root.join(Gitlab.config.pseudonymizer.manifest)),
start_at: Time.now.utc
output_dir: ENV['PSEUDONYMIZER_OUTPUT_DIR']
)
dumper = Pseudonymizer::Dumper.new(options)
......
......@@ -4,7 +4,7 @@ require 'csv'
require 'yaml'
module Pseudonymizer
PAGE_SIZE = 10000
# Batch size; override with the PSEUDONYMIZER_BATCH environment variable.
# ENV values are always Strings, so convert explicitly: arithmetic such as
# `i % PAGE_SIZE` raises TypeError when PAGE_SIZE is a String. Integer()
# also rejects non-numeric values at load time instead of failing mid-export.
PAGE_SIZE = Integer(ENV.fetch('PSEUDONYMIZER_BATCH', 100_000))
class Anon
def initialize(fields)
......@@ -38,45 +38,57 @@ module Pseudonymizer
@output_dir = options.output_dir
@start_at = options.start_at
reset!
end
# Resets the dumper's accumulated state: @schema becomes a hash that creates
# an empty Hash for any key on first access, and @output_files becomes an
# empty list. Returns the (empty) output file list.
def reset!
  @schema = Hash.new do |store, key|
    store[key] = {}
  end
  @output_files = []
end
# Exports every table listed under config["tables"] by calling table_to_csv
# with the table's 'whitelist' and 'pseudo' column lists, creating output_dir
# first when it does not exist.
# NOTE(review): this span is a rendered diff, so lines from before and after
# the change appear side by side (e.g. both `tables.map` lines).
def tables_to_csv
tables = config["tables"]
reset!
tables = config["tables"]
FileUtils.mkdir_p(output_dir) unless File.directory?(output_dir)
# NOTE(review): schema_to_yml runs right after reset! and before any
# table_to_csv call; if table_to_schema is what fills @schema, the schema
# file is written empty here — confirm the intended ordering.
schema_to_yml
file_list_to_json
tables.map do |k, v|
@output_files = tables.map do |k, v|
table_to_csv(k, v['whitelist'], v['pseudo'])
end
file_list_to_json
@output_files
end
private
def get_and_log_file_name(ext, prefix = nil, filename = nil)
file_timestamp = filename || "#{prefix}_#{@start_at.to_i}"
file_timestamp = "#{file_timestamp}.#{ext}"
@output_files << file_timestamp
# Builds the path of the file "<basename>.<ext>" inside output_dir.
# ext defaults to "csv.gz".
def output_filename(basename = nil, ext = "csv.gz")
  File.join(output_dir, format("%s.%s", basename, ext))
end
# Serializes @schema to YAML and writes it to the schema ".yml" file.
# NOTE(review): rendered diff — the two file_path assignments are the
# before/after versions of the same line; the second one wins.
def schema_to_yml
file_path = get_and_log_file_name("yml", "schema")
file_path = output_filename("schema", "yml")
File.open(file_path, 'w') { |file| file.write(@schema.to_yaml) }
end
# Writes the list of output files to the "file_list" JSON file.
# NOTE(review): rendered diff — the first two body lines are the previous
# implementation, the remaining lines are its replacement.
def file_list_to_json
file_path = get_and_log_file_name("json", nil, "file_list")
File.open(file_path, 'w') { |file| file.write(@output_files.to_json) }
file_path = output_filename("file_list", "json")
File.open(file_path, 'w') do |file|
# Record only each file's basename, not its full path.
relative_files = @output_files.map(&File.method(:basename))
file.write(relative_files.to_json)
end
end
# Records the table's schema via table_to_schema, then writes the table's
# page results to a CSV file through write_to_csv_file.
# Any StandardError raised along the way is logged and swallowed, so one
# failing table does not abort the caller.
# NOTE(review): on failure the method's value is whatever
# Rails.logger.error returns, not a file path — confirm callers that collect
# the results (and later call File.basename on them) tolerate that.
def table_to_csv(table, whitelist_columns, pseudonymity_columns)
table_to_schema(table)
write_to_csv_file(table, table_page_results(table, whitelist_columns, pseudonymity_columns))
write_to_csv_file(
table,
table_page_results(table,
whitelist_columns,
pseudonymity_columns)
)
rescue => e
Rails.logger.error("Failed to export #{table}: #{e}")
end
......@@ -134,15 +146,16 @@ module Pseudonymizer
end
# Writes the rows in contents to the table's output file: a header row taken
# from the first row's keys, then each row's values. The last visible
# statement is file_path, presumably the return value (the end of the method
# is outside this view).
# NOTE(review): rendered diff — CSV.open is the previous writer, the
# Zlib::GzipWriter / CSV.new pair is its gzip-compressed replacement.
def write_to_csv_file(table, contents)
file_path = get_and_log_file_name("csv", table)
file_path = output_filename(table, "csv.gz")
Rails.logger.info "#{self.class.name} writing #{table} to #{file_path}."
CSV.open(file_path, 'w') do |csv|
Zlib::GzipWriter.open(file_path) do |io|
csv = CSV.new(io)
contents.with_index do |row, i|
csv << row.keys if i == 0 # header
csv << row.values
# NOTE(review): `i % PAGE_SIZE` yields an Integer, and every Integer
# (including 0) is truthy in Ruby, so this flushes after every row;
# `(i % PAGE_SIZE).zero?` was probably intended.
csv.flush if i % PAGE_SIZE
end
csv.close
end
file_path
......
......@@ -96,15 +96,8 @@ tables:
- author_id
- assignee_id
- iid
- cached_markdown_version
- updated_by_id
- last_edited_by_id
- lock_version
- start_date
- end_date
- last_edited_at
- created_at
- updated_at
issue_assignees:
whitelist:
- user_id
......@@ -208,8 +201,6 @@ tables:
- title
- color
- project_id
- created_at
- updated_at
- template
- type
- group_id
......@@ -423,12 +414,10 @@ tables:
- created_at
- updated_at
- project_id
- attachment
- line_code
- commit_id
- noteable_id
- system
- st_diff
- updated_by_id
- type
- position
......@@ -436,35 +425,18 @@ tables:
- resolved_at
- resolved_by_id
- discussion_id
- note_html
- cached_markdown_version
- change_position
- resolved_by_push
pseudo:
- id
- note
- noteable_type
- author_id
- created_at
- updated_at
- project_id
- attachment
- line_code
- commit_id
- noteable_id
- system
- st_diff
- updated_by_id
- type
- position
- original_position
- resolved_at
- resolved_by_id
- discussion_id
- note_html
- cached_markdown_version
- change_position
- resolved_by_push
notification_settings:
whitelist:
- id
......@@ -492,8 +464,6 @@ tables:
- source_id
- source_type
- level
- created_at
- updated_at
- new_note
- new_issue
- reopen_issue
......@@ -526,8 +496,6 @@ tables:
pseudo:
- id
- project_id
- created_at
- updated_at
- enabled
- domain
project_custom_attributes:
......@@ -540,8 +508,6 @@ tables:
- value
pseudo:
- id
- created_at
- updated_at
- project_id
- key
- value
......@@ -565,8 +531,6 @@ tables:
- wiki_access_level
- snippets_access_level
- builds_access_level
- created_at
- updated_at
- repository_access_level
project_group_links:
whitelist:
......@@ -581,8 +545,6 @@ tables:
- id
- project_id
- group_id
- created_at
- updated_at
- group_access
- expires_at
project_import_data:
......@@ -615,8 +577,6 @@ tables:
- last_update_started_at
- last_update_scheduled_at
- next_execution_timestamp
- created_at
- updated_at
project_repository_states:
whitelist:
- id
......@@ -730,8 +690,6 @@ tables:
- name
- path
- description
- created_at
- updated_at
- creator_id
- namespace_id
- last_activity_at
......@@ -875,7 +833,6 @@ tables:
pseudo:
- id
- email
- remember_created_at
- current_sign_in_ip
- last_sign_in_ip
- name
......@@ -897,12 +854,10 @@ tables:
- hide_project_limit
- note
- unlock_token
- otp_grace_period_started_at
- external
- incoming_email_token
- organization
- auditor
- two_factor_grace_period
- ghost
- rss_token
- theme_id
......@@ -2,14 +2,14 @@ module Pseudonymizer
class Options
attr_reader :config
attr_reader :start_at
attr_reader :output_dir
# @param config [Hash] parsed pseudonymizer manifest
# @param output_dir [String, nil] base directory for the export; when nil,
#   a 'gitlab-pseudonymizer' directory under Dir.tmpdir is used
def initialize(config: {}, output_dir: nil)
  @config = config
  @start_at = Time.now.utc
  # Store the base directory in its own ivar. Reading it back through the
  # #output_dir method (as `output_dir || ...` inside that method did) calls
  # the method itself and recurses until the stack overflows.
  @base_dir = output_dir || File.join(Dir.tmpdir, 'gitlab-pseudonymizer')
end

# Directory for this run: the base directory plus a subdirectory named after
# the run's start time in ISO 8601 form.
#
# @return [String]
def output_dir
  File.join(@base_dir, start_at.iso8601)
end
def upload_dir
......
......@@ -76,7 +76,8 @@ namespace :gitlab do
abort "The pseudonymizer is disabled." unless Gitlab::CurrentSettings.pseudonymizer_enabled?
options = Pseudonymizer::Options.new(
config: YAML.load_file(Rails.root.join(Gitlab.config.pseudonymizer.manifest))
config: YAML.load_file(Rails.root.join(Gitlab.config.pseudonymizer.manifest)),
output_dir: ENV['PSEUDONYMIZER_OUTPUT_DIR']
)
dumper = Pseudonymizer::Dumper.new(options)
......
......@@ -20,9 +20,10 @@ describe Pseudonymizer::Dumper do
describe 'Pseudo tables' do
it 'outputs project tables to csv' do
column_names = %w(id name path description)
pseudo.config["tables"] = {
"projects" => {
"whitelist" => %w(id name path description),
"whitelist" => column_names,
"pseudo" => %w(id)
}
}
......@@ -31,26 +32,52 @@ describe Pseudonymizer::Dumper do
# grab the first table it outputs. There would only be 1.
project_table_file = pseudo.tables_to_csv[0]
expect(project_table_file).to include("projects.csv.gz")
expect(project_table_file.include? "projects_").to be true
expect(project_table_file.include? ".csv").to be true
columns = []
project_data = []
File.foreach(project_table_file).with_index do |line, line_num|
if line_num == 0
columns = line.split(",")
elsif line_num == 1
project_data = line.split(",")
break
end
Zlib::GzipReader.open(project_table_file) do |gz|
csv = CSV.new(gz, headers: true)
# csv.shift # read the header row
project_data = csv.gets
columns = csv.headers
end
# check if CSV columns are correct
expect(columns.to_set).to eq(%W(id name path description\n).to_set)
expect(columns).to include(*column_names)
# is it pseudonymous
expect(project_data[0]).not_to eq(1)
# sha 256 is 64 chars in length
expect(project_data[0].length).to eq(64)
expect(project_data["id"].length).to eq(64)
end
end
describe "manifest is valid" do
  # Every table named in the manifest must exist in the database.
  it "all tables exist" do
    known_tables = ActiveRecord::Base.connection.tables
    manifest_tables = options.config['tables'].keys

    expect(known_tables).to include(*manifest_tables)
  end

  # Every whitelisted column must be an actual column of its table.
  it "all whitelisted attributes exist" do
    options.config['tables'].each do |table, table_def|
      whitelisted = table_def['whitelist']
      table_columns = ActiveRecord::Base.connection.columns(table.to_sym)
      diff = whitelisted - table_columns.map { |column| column.name }

      expect(diff).to be_empty, "#{table} should define columns #{whitelisted.inspect}: missing #{diff.inspect}"
    end
  end

  # A column can only be pseudonymized if it is also exported (whitelisted).
  it "all pseudonymized attributes are whitelisted" do
    options.config['tables'].each do |table, table_def|
      whitelisted, pseudonymized = table_def.values_at('whitelist', 'pseudo')
      diff = pseudonymized - whitelisted

      expect(diff).to be_empty, "#{table} should whitelist columns #{pseudonymized.inspect}: missing #{diff.inspect}"
    end
  end
end
end
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment