require 'digest'
require 'csv'
require 'json'
require 'yaml'

# Dumps whitelisted database columns to CSV files, replacing the values of
# configured "pseudo" columns with their SHA-256 hex digest.
module Pseudonymity
  # Replaces the values of selected columns with their SHA-256 hex digest.
  class Anon
    # @param fields [Array<String>, nil] names of the columns to pseudonymize;
    #   nil is treated as "no columns".
    def initialize(fields)
      @anon_fields = Array(fields)
    end

    # Builds an enumerator over pseudonymized copies of the result rows.
    #
    # Rows are copied before being changed: the enumerator may be consumed
    # more than once (e.g. `first` followed by `each`), and mutating the
    # source rows in place would hash an already-hashed value again on the
    # second pass.
    #
    # @param results [#columns, #each] result set whose `columns` lists the
    #   column names and whose `each` yields one Hash per row.
    # @return [Enumerator] yields one Hash per row; nil values are left as nil.
    def anonymize(results)
      columns = results.columns # Assume they all have the same table
      to_filter = @anon_fields & columns

      Enumerator.new do |yielder|
        results.each do |result|
          row = result.dup
          to_filter.each do |field|
            value = row[field]
            next if value.nil?

            # to_s: hexdigest only accepts Strings, but column values may be
            # Integers or other non-String objects.
            row[field] = Digest::SHA2.new(256).hexdigest(value.to_s)
          end
          yielder << row
        end
      end
    end
  end

  # Exports the tables listed in the YAML config to CSV files, together with
  # a YAML schema file and a JSON list of every file that was written.
  class Table
    # Location of the YAML configuration read by #parse_config.
    CONFIG_PATH = './lib/assets/pseudonymity_dump.yml'

    attr_accessor :config

    def initialize
      @config = {}
      @csv_output = ""
      parse_config
      @schema = {}
      @output_files = []
    end

    # Writes one CSV per configured table, then the schema YAML and the JSON
    # file list. Prints a message and returns nil when the configured output
    # directory does not exist.
    def tables_to_csv
      tables = config["tables"]
      # Drop a single trailing slash so the path joins below do not double it.
      @csv_output = config["output"]["csv"].chomp("/")
      unless File.directory?(@csv_output)
        puts "No such directory #{@csv_output}"
        return
      end

      tables.each do |table, options|
        @schema[table] = {}
        table_to_csv(table, options["whitelist"], options["pseudo"])
      end

      schema_to_yml
      file_list_to_json
    end

    # Builds an output file name, records it in the list of written files and
    # returns its full path inside the output directory.
    #
    # @param ext [String] file extension without the dot.
    # @param prefix [String, nil] base name; a Unix timestamp is appended.
    # @param filename [String, nil] exact base name; takes precedence over
    #   prefix and gets no timestamp.
    # @return [String] path of the file inside the output directory.
    def get_and_log_file_name(ext, prefix=nil, filename=nil)
      base_name = filename || "#{prefix}_#{Time.now.to_i}"
      file_name = "#{base_name}.#{ext}"
      @output_files << file_name
      "#{@csv_output}/#{file_name}"
    end

    # Writes the collected per-table column types to a timestamped YAML file.
    def schema_to_yml
      file_path = get_and_log_file_name("yml", "schema")
      File.write(file_path, @schema.to_yaml)
    end

    # Writes the names of all files logged so far (including this one) to
    # file_list.json.
    def file_list_to_json
      file_path = get_and_log_file_name("json", nil, "file_list")
      File.write(file_path, @output_files.to_json)
    end

    # Records the column types of one table and dumps its whitelisted
    # columns to CSV, pseudonymizing the requested ones.
    #
    # NOTE(review): table and column names are interpolated into the SQL
    # as-is. tables_to_csv passes values read from the YAML config; do not
    # call this with untrusted input.
    #
    # @param table [String] table name.
    # @param whitelist_columns [Array<String>] columns to select.
    # @param pseudonymity_columns [Array<String>, nil] columns to hash.
    # @return [String, nil] path of the CSV file, or nil when the table has
    #   no rows.
    def table_to_csv(table, whitelist_columns, pseudonymity_columns)
      sql = "SELECT #{whitelist_columns.join(",")} FROM #{table};"
      type_sql = "SELECT column_name, data_type FROM information_schema.columns WHERE table_name = '#{table}';"
      results = ActiveRecord::Base.connection.exec_query(sql)
      type_results = ActiveRecord::Base.connection.exec_query(type_sql)
      set_schema_column_types(table, type_results)
      return if results.empty?

      anon = Anon.new(pseudonymity_columns)
      write_to_csv_file(table, anon.anonymize(results))
    end

    # Stores the data type of every column of the table in the schema.
    # Pseudonymized columns are recorded as "character varying" because
    # their values are replaced by hex digest strings.
    #
    # @param table [String] table name.
    # @param type_results [#each] rows with "column_name" and "data_type".
    def set_schema_column_types(table, type_results)
      @schema[table] ||= {}
      pseudo_columns = Array(@config["tables"][table]["pseudo"])

      type_results.each do |type_result|
        column_name = type_result["column_name"]
        data_type = type_result["data_type"]
        data_type = "character varying" if pseudo_columns.include?(column_name)
        @schema[table][column_name] = data_type
      end

      # hard coded because all mapping keys in GL are id
      @schema[table]["gl_mapping_key"] = "id"
    end

    # Loads the YAML configuration from CONFIG_PATH into @config.
    def parse_config
      @config = YAML.load_file(CONFIG_PATH)
    end

    # Writes a header row followed by one line per row to a timestamped CSV
    # file named after the table.
    #
    # @param title [String] prefix of the file name.
    # @param contents [Enumerable<Hash>] non-empty rows; the keys of the
    #   first row become the header.
    # @return [String] path of the written file.
    def write_to_csv_file(title, contents)
      file_path = get_and_log_file_name("csv", title)
      column_names = contents.first.keys

      CSV.open(file_path, 'w') do |csv|
        csv << column_names
        contents.each { |row| csv << row.values }
      end

      file_path
    end

    private :write_to_csv_file
  end
end