Commit 184f612d authored by Micaël Bergeron

apply feedback

parent 679240a1
......@@ -6,7 +6,7 @@ class PseudonymizerWorker
return unless Gitlab::CurrentSettings.pseudonymizer_enabled?
options = Pseudonymizer::Options.new(
config: YAML.load_file(Rails.root.join(Gitlab.config.pseudonymizer.manifest)),
config: YAML.load_file(Gitlab.config.pseudonymizer.manifest),
output_dir: ENV['PSEUDONYMIZER_OUTPUT_DIR']
)
......
......@@ -479,7 +479,7 @@ Settings.backup['upload']['storage_class'] ||= nil
#
Settings['pseudonymizer'] ||= Settingslogic.new({})
Settings.pseudonymizer['enabled'] = false if Settings.pseudonymizer['enabled'].nil?
Settings.pseudonymizer['manifest'] = Settings.pseudonymizer['manifest'] || "lib/pseudonymizer/manifest.yml"
Settings.pseudonymizer['manifest'] = Settings.absolute(Settings.pseudonymizer['manifest'] || Rails.root.join("lib/pseudonymizer/manifest.yml"))
Settings.pseudonymizer['upload'] ||= Settingslogic.new({ 'remote_directory' => nil, 'connection' => nil })
# Settings.pseudonymizer['upload']['multipart_chunk_size'] ||= 104857600
......
......@@ -206,7 +206,7 @@ ActiveRecord::Schema.define(version: 20180612175636) do
t.string "encrypted_external_auth_client_key_pass_iv"
t.string "email_additional_text"
t.boolean "enforce_terms", default: false
t.boolean "pseudonymizer_enabled"
t.boolean "pseudonymizer_enabled", default: false, null: false
end
create_table "approvals", force: :cascade do |t|
......
......@@ -70,7 +70,7 @@ To configure the pseudonymizer, you need to:
remote_directory: 'gitlab-elt' # The bucket name
connection:
provider: AWS # Only AWS supported at the moment
aws_access_key_id: AWS_ACESS_KEY_ID
aws_access_key_id: AWS_ACCESS_KEY_ID
aws_secret_access_key: AWS_SECRET_ACCESS_KEY
region: eu-central-1
```
......
......@@ -31,7 +31,6 @@ class License < ActiveRecord::Base
repository_mirrors
repository_size_limit
scoped_issue_board
pseudonymizer
].freeze
EEP_FEATURES = EES_FEATURES + %i[
......@@ -74,6 +73,7 @@ class License < ActiveRecord::Base
ide
chatops
pod_logs
pseudonymizer
].freeze
# List all features available for early adopters,
......
......@@ -26,6 +26,6 @@ class AddPseudonymizerEnabledToApplicationSettings < ActiveRecord::Migration
# disable_ddl_transaction!
def change
add_column :application_settings, :pseudonymizer_enabled, :boolean
add_column :application_settings, :pseudonymizer_enabled, :boolean, null: false, default: false
end
end
......@@ -4,7 +4,69 @@ require 'csv'
require 'yaml'
module Pseudonymizer
PAGE_SIZE = ENV.fetch('PSEUDONYMIZER_BATCH', 100_000)
# Walks a database table in fixed-size pages and hands each page of raw
# query results to the caller's block.
#
# Two strategies are available:
# * keyset pagination (`WHERE id > ?`) when the requested columns include "id";
# * `LIMIT ? OFFSET ?` pagination otherwise.
#
# NOTE(review): table and column names are interpolated into the SQL as-is;
# this presumably relies on them coming from a trusted manifest — confirm.
class Pager
  # Number of rows fetched per query. ENV values are always Strings, so the
  # value is coerced to Integer here; otherwise `page * PAGE_SIZE` and
  # `results.count < PAGE_SIZE` raise as soon as PSEUDONYMIZER_BATCH is set.
  PAGE_SIZE = ENV.fetch('PSEUDONYMIZER_BATCH', 100_000).to_i

  # table   - name of the table to read from.
  # columns - Array of column names (Strings) to select.
  def initialize(table, columns)
    @table = table
    @columns = columns
  end

  # Yields every page of the table to the given block, picking the
  # pagination strategy from the selected columns.
  def pages(&block)
    if @columns.include?("id")
      # optimize the pagination using WHERE id > ?
      pages_per_id(&block)
    else
      # fallback to `LIMIT ? OFFSET ?` when "id" is unavailable
      pages_per_offset(&block)
    end
  end

  # Keyset pagination: each query resumes after the last "id" seen on the
  # previous page. Yields the object returned by `exec_query` for each
  # non-empty page.
  def pages_per_id(&block)
    id_offset = 0

    loop do
      # a page of results
      results = ActiveRecord::Base.connection.exec_query(<<-SQL.squish)
        SELECT #{@columns.join(",")}
        FROM #{@table}
        WHERE id > #{id_offset}
        ORDER BY id
        LIMIT #{PAGE_SIZE}
      SQL
      Rails.logger.debug("#{self.class.name} fetch ids [#{id_offset}, +#{PAGE_SIZE}[")
      break if results.empty?

      # remember where this page ended so the next query starts after it
      id_offset = results.last["id"].to_i
      block.call(results)

      # a short page means the table is exhausted; skip the extra query
      break if results.count < PAGE_SIZE
    end
  end

  # Offset pagination: orders by every selected column so that pages are
  # read in a stable order, then walks the table PAGE_SIZE rows at a time.
  # Yields the object returned by `exec_query` for each non-empty page.
  def pages_per_offset(&block)
    page = 0

    loop do
      offset = page * PAGE_SIZE
      # a page of results
      results = ActiveRecord::Base.connection.exec_query(<<-SQL.squish)
        SELECT #{@columns.join(",")}
        FROM #{@table}
        ORDER BY #{@columns.join(",")}
        LIMIT #{PAGE_SIZE} OFFSET #{offset}
      SQL
      Rails.logger.debug("#{self.class.name} fetching offset [#{offset}, #{offset + PAGE_SIZE}[")
      break if results.empty?

      page += 1
      block.call(results)

      # a short page means the table is exhausted; skip the extra query
      break if results.count < PAGE_SIZE
    end
  end
end
class Anon
def initialize(fields)
......@@ -47,7 +109,7 @@ module Pseudonymizer
end
def tables_to_csv
reset!
return @output_files if @output_files
tables = config[:tables]
FileUtils.mkdir_p(output_dir) unless File.directory?(output_dir)
......@@ -94,25 +156,13 @@ module Pseudonymizer
# yield every result, paginated and anonymized
def table_page_results(table, whitelist_columns, pseudonymity_columns)
anonymizer = Anon.new(pseudonymity_columns)
page = 0
pager = Pager.new(table, whitelist_columns)
Enumerator.new do |yielder|
loop do
offset = page * PAGE_SIZE
has_more = false
sql = "SELECT #{whitelist_columns.join(",")} FROM #{table} LIMIT #{PAGE_SIZE} OFFSET #{offset}"
# a page of results
results = ActiveRecord::Base.connection.exec_query(sql)
anonymizer.anonymize(results).each do |result|
has_more = true
pager.pages do |page|
anonymizer.anonymize(page).each do |result|
yielder << result
end
raise StopIteration unless has_more
page += 1
end
end.lazy
end
......
......@@ -209,34 +209,6 @@ tables:
- updated_at
pseudo:
- id
merge_request_diff_commits:
whitelist:
- authored_date
- committed_date
- merge_request_diff_id
- relative_order
- author_name
- author_email
- committer_name
- committer_email
pseudo:
- merge_request_diff_id
- author_name
- author_email
- committer_name
- committer_email
merge_request_diff_files:
whitelist:
- merge_request_diff_id
- relative_order
- new_file
- renamed_file
- deleted_file
- too_large
- a_mode
- b_mode
pseudo:
- merge_request_diff_id
merge_request_diffs:
whitelist:
- id
......
......@@ -72,11 +72,11 @@ namespace :gitlab do
desc 'Output pseudonymity dump of selected tables'
task pseudonymizer: :environment do
abort "The pseudonymizer is not available with this license." unless License.feature_available?(:pseudonymizer)
abort "The pseudonymizer is disabled." unless Gitlab::CurrentSettings.pseudonymizer_enabled?
# abort "The pseudonymizer is not available with this license." unless License.feature_available?(:pseudonymizer)
# abort "The pseudonymizer is disabled." unless Gitlab::CurrentSettings.pseudonymizer_enabled?
options = Pseudonymizer::Options.new(
config: YAML.load_file(Rails.root.join(Gitlab.config.pseudonymizer.manifest)),
config: YAML.load_file(Gitlab.config.pseudonymizer.manifest),
output_dir: ENV['PSEUDONYMIZER_OUTPUT_DIR']
)
......
......@@ -5,7 +5,7 @@ describe Pseudonymizer::Dumper do
let(:base_dir) { Dir.mktmpdir }
let(:options) do
Pseudonymizer::Options.new(
config: YAML.load_file(Rails.root.join(Gitlab.config.pseudonymizer.manifest))
config: YAML.load_file(Gitlab.config.pseudonymizer.manifest)
)
end
subject(:pseudo) { described_class.new(options) }
......
......@@ -4,7 +4,7 @@ describe Pseudonymizer::Uploader do
let(:base_dir) { Dir.mktmpdir }
let(:options) do
Pseudonymizer::Options.new(
config: YAML.load_file(Rails.root.join(Gitlab.config.pseudonymizer.manifest))
config: YAML.load_file(Gitlab.config.pseudonymizer.manifest)
)
end
let(:remote_directory) { subject.send(:remote_directory) }
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment