Commit 879c6586 authored by Micaël Bergeron

add specs for `Pseudonymizer::Pager`

parent dc700967
require 'openssl'
require 'digest'
require 'csv'
require 'yaml'
module Pseudonymizer
# Walks every row of a table in fixed-size pages, yielding one page of
# query results at a time.
class Pager
  # Maximum number of rows fetched per query.
  #
  # Environment variables are always strings, so the value is coerced to an
  # Integer; without the coercion, setting PSEUDONYMIZER_BATCH makes the
  # `count < PAGE_SIZE` comparison and the offset arithmetic below raise.
  # `Integer()` raises ArgumentError on a non-numeric value rather than
  # silently turning it into 0.
  PAGE_SIZE = Integer(ENV.fetch('PSEUDONYMIZER_BATCH', 100_000))

  # table   - name of the table to read from
  # columns - list of column names to select
  def initialize(table, columns)
    @table = table
    @columns = columns
  end

  # Yields each page of results to the given block, picking the pagination
  # strategy based on whether an "id" column is part of the selection.
  def pages(&block)
    if @columns.include?("id")
      # optimize the pagination using WHERE id > ?
      pages_per_id(&block)
    else
      # fallback to `LIMIT ? OFFSET ?` when "id" is unavailable
      pages_per_offset(&block)
    end
  end

  # Pagination keyed on the last "id" seen: each query resumes strictly
  # after the greatest id of the previous page.
  def pages_per_id(&block)
    id_offset = 0

    loop do
      # a page of results
      results = ActiveRecord::Base.connection.exec_query(<<-SQL.squish)
        SELECT #{@columns.join(",")}
        FROM #{@table}
        WHERE id > #{id_offset}
        ORDER BY id
        LIMIT #{PAGE_SIZE}
      SQL
      Rails.logger.debug("#{self.class.name} fetch ids [#{id_offset}, +#{PAGE_SIZE}[")
      break if results.empty?

      id_offset = results.last["id"].to_i
      yield results

      # a short page means the table is exhausted; skip the extra query
      break if results.count < PAGE_SIZE
    end
  end

  # Pagination using LIMIT/OFFSET, ordered by every selected column.
  def pages_per_offset(&block)
    page = 0

    loop do
      offset = page * PAGE_SIZE
      # a page of results
      results = ActiveRecord::Base.connection.exec_query(<<-SQL.squish)
        SELECT #{@columns.join(",")}
        FROM #{@table}
        ORDER BY #{@columns.join(",")}
        LIMIT #{PAGE_SIZE} OFFSET #{offset}
      SQL
      Rails.logger.debug("#{self.class.name} fetching offset [#{offset}, #{offset + PAGE_SIZE}[")
      break if results.empty?

      page += 1
      yield results

      # a short page means the table is exhausted; skip the extra query
      break if results.count < PAGE_SIZE
    end
  end
end
# Replaces the values of selected fields in query results with a keyed
# HMAC-SHA256 hex digest.
class Anon
  # table                - table name, used when logging ignored fields
  # whitelisted_fields   - list of whitelisted field names
  # pseudonymized_fields - list of field names requested for hashing
  def initialize(table, whitelisted_fields, pseudonymized_fields)
    @table = table
    @pseudo_fields = pseudo_fields(whitelisted_fields, pseudonymized_fields)
  end

  # Returns an Enumerator over `results` in which every non-nil value of a
  # pseudonymized field has been overwritten, in place, with its HMAC
  # keyed by the application's secret_key_base.
  def anonymize(results)
    secret = Rails.application.secrets[:secret_key_base]
    sha256 = OpenSSL::Digest.new('sha256')

    Enumerator.new do |out|
      results.each do |row|
        out << pseudonymize_row(row, sha256, secret)
      end
    end
  end

  private

  # Hashes the pseudonymized fields of a single row, skipping nil values,
  # and returns that same (mutated) row.
  def pseudonymize_row(row, sha256, secret)
    @pseudo_fields.each do |name|
      value = row[name]
      next if value.nil?

      row[name] = OpenSSL::HMAC.hexdigest(sha256, secret, String(value))
    end

    row
  end

  # Keeps only the pseudonymized fields that are also whitelisted, logging
  # a warning for each one that is not.
  def pseudo_fields(whitelisted, pseudonymized)
    (pseudonymized - whitelisted).each do |field|
      Rails.logger.warn("#{self.class.name} extraneous pseudo: #{@table}.#{field} is not whitelisted and will be ignored.")
    end

    pseudonymized & whitelisted
  end
end
class Dumper
attr_accessor :config, :output_dir
......@@ -165,12 +62,12 @@ module Pseudonymizer
# yield every results, pagined, anonymized
def table_page_results(table, whitelist_columns, pseudonymity_columns)
anonymizer = Anon.new(table, whitelist_columns, pseudonymity_columns)
filter = Filter.new(table, whitelist_columns, pseudonymity_columns)
pager = Pager.new(table, whitelist_columns)
Enumerator.new do |yielder|
pager.pages do |page|
anonymizer.anonymize(page).each do |result|
filter.anonymize(page).each do |result|
yielder << result
end
end
......
require 'openssl'
require 'digest'
module Pseudonymizer
  # Replaces the values of selected fields in query results with a keyed
  # HMAC-SHA256 hex digest.
  class Filter
    # table                - table name, used when logging ignored fields
    # whitelisted_fields   - list of whitelisted field names
    # pseudonymized_fields - list of field names requested for hashing
    def initialize(table, whitelisted_fields, pseudonymized_fields)
      @table = table
      @pseudo_fields = pseudo_fields(whitelisted_fields, pseudonymized_fields)
    end

    # Returns an Enumerator over `results` in which every non-nil value of a
    # pseudonymized field has been overwritten, in place, with its HMAC
    # keyed by the application's secret_key_base.
    def anonymize(results)
      hmac_key = Rails.application.secrets[:secret_key_base]
      algorithm = OpenSSL::Digest.new('sha256')

      Enumerator.new do |sink|
        results.each do |record|
          @pseudo_fields.each do |column|
            raw = record[column]
            record[column] = OpenSSL::HMAC.hexdigest(algorithm, hmac_key, String(raw)) unless raw.nil?
          end

          sink << record
        end
      end
    end

    private

    # Keeps only the pseudonymized fields that are also whitelisted, logging
    # a warning for each one that is not.
    def pseudo_fields(whitelisted, pseudonymized)
      ignored = pseudonymized - whitelisted
      ignored.each do |field|
        Rails.logger.warn("#{self.class.name} extraneous pseudo: #{@table}.#{field} is not whitelisted and will be ignored.")
      end

      pseudonymized & whitelisted
    end
  end
end
module Pseudonymizer
  # Walks every row of a table in fixed-size pages, yielding one page of
  # query results at a time.
  class Pager
    # Maximum number of rows fetched per query.
    #
    # Environment variables are always strings, so the value is coerced to an
    # Integer; without the coercion, setting PSEUDONYMIZER_BATCH makes the
    # `count < PAGE_SIZE` comparison and `offset += PAGE_SIZE` below raise.
    # `Integer()` raises ArgumentError on a non-numeric value rather than
    # silently turning it into 0.
    PAGE_SIZE = Integer(ENV.fetch('PSEUDONYMIZER_BATCH', 100_000))

    # table   - name of the table to read from
    # columns - list of column names to select
    def initialize(table, columns)
      @table = table
      @columns = columns
    end

    # Yields each page of results to the given block, picking the pagination
    # strategy based on whether an "id" column is part of the selection.
    def pages(&block)
      if @columns.include?("id")
        # optimize the pagination using WHERE id > ?
        pages_per_id(&block)
      else
        # fallback to `LIMIT ? OFFSET ?` when "id" is unavailable
        pages_per_offset(&block)
      end
    end

    # Pagination keyed on the last "id" seen: each query resumes strictly
    # after the greatest id of the previous page.
    def pages_per_id(&block)
      id_offset = 0

      loop do
        # a page of results
        results = ActiveRecord::Base.connection.exec_query(<<-SQL.squish)
          SELECT #{@columns.join(",")}
          FROM #{@table}
          WHERE id > #{id_offset}
          ORDER BY id
          LIMIT #{PAGE_SIZE}
        SQL
        Rails.logger.debug("#{self.class.name} fetch ids [#{id_offset}, +#{PAGE_SIZE}[")
        break if results.empty?

        id_offset = results.last["id"].to_i
        yield results

        # a short page means the table is exhausted; skip the extra query
        break if results.count < PAGE_SIZE
      end
    end

    # Pagination using LIMIT/OFFSET, ordered by every selected column.
    def pages_per_offset(&block)
      offset = 0

      loop do
        # a page of results
        results = ActiveRecord::Base.connection.exec_query(<<-SQL.squish)
          SELECT #{@columns.join(",")}
          FROM #{@table}
          ORDER BY #{@columns.join(",")}
          LIMIT #{PAGE_SIZE} OFFSET #{offset}
        SQL
        Rails.logger.debug("#{self.class.name} fetching offset [#{offset}, #{offset + PAGE_SIZE}[")
        break if results.empty?

        offset += PAGE_SIZE
        yield results

        # a short page means the table is exhausted; skip the extra query
        break if results.count < PAGE_SIZE
      end
    end
  end
end
require 'spec_helper'
# Specs for Pseudonymizer::Pager: checks which pagination strategy `#pages`
# picks and that every row is yielded, one page at a time.
describe Pseudonymizer::Pager do
  let(:page_size) { 1 }
  let!(:projects) { create_list(:project, 10) }

  subject { described_class.new("projects", whitelisted_columns) }

  before do
    # shrink the page size so the created projects span several pages
    stub_const("Pseudonymizer::Pager::PAGE_SIZE", page_size)
  end

  shared_examples "yield results in page" do
    it do
      page_count = 0
      result_count = 0

      subject.pages do |page|
        result_count += page.count
        page_count += 1
      end

      expect(result_count).to eq(projects.count)
      # round up so the expectation also holds when the page size does not
      # evenly divide the number of rows (the last page is then partial)
      expect(page_count).to eq((projects.count / page_size.to_f).ceil)
    end
  end

  context "`id` column is present" do
    let(:whitelisted_columns) { %w(id name) }

    describe "#pages" do
      it "delegates to #pages_per_id" do
        expect(subject).to receive(:pages_per_id)

        subject.pages { |page| nil }
      end

      include_examples "yield results in page"
    end
  end

  context "`id` column is missing" do
    let(:whitelisted_columns) { %w(name) }

    describe "#pages" do
      it "delegates to #pages_per_offset" do
        expect(subject).to receive(:pages_per_offset)

        subject.pages { |page| nil }
      end

      include_examples "yield results in page"
    end
  end
end
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment