Commit f28ff040 authored by Stan Hu's avatar Stan Hu

Merge branch 'gitlab-elt' into 'master'

Adds method to move tables to CSV with redacted data.

Closes meltano/meltano#80

See merge request gitlab-org/gitlab-ee!5532
parents a43e2609 4b90a8f8
......@@ -373,3 +373,5 @@
= _('Geo allows you to replicate your GitLab instance to other geographical locations.')
.settings-content
= render partial: 'slack'
= render_if_exists 'admin/application_settings/pseudonymizer_settings', expanded: expanded
......@@ -145,6 +145,7 @@
- cronjob:ldap_all_groups_sync
- cronjob:ldap_sync
- cronjob:update_all_mirrors
- cronjob:pseudonymizer
- geo:geo_scheduler_scheduler
- geo:geo_scheduler_primary_scheduler
......
......@@ -311,6 +311,10 @@ production: &base
geo_migrated_local_files_clean_up_worker:
cron: "15 */6 * * *"
# Export pseudonymized data in CSV format for analysis
pseudonymizer_worker:
cron: "0 * * * *"
registry:
# enabled: true
# host: registry.example.com
......@@ -726,6 +730,20 @@ production: &base
# # Specifies Amazon S3 storage class to use for backups, this is optional
# # storage_class: 'STANDARD'
## Pseudonymizer exporter
pseudonymizer:
# Tables manifest that specifies the fields to extract and pseudonymize.
manifest: config/pseudonymizer.yml
upload:
# remote_directory: 'gitlab-elt'
# Fog storage connection settings, see http://fog.io/storage/ .
connection:
# provider: AWS
# region: eu-west-1
# aws_access_key_id: AKIAKIAKI
# aws_secret_access_key: 'secret123'
# # The remote 'directory' to store the CSV files. For S3, this would be the bucket name.
## GitLab Shell settings
gitlab_shell:
path: /home/git/gitlab-shell/
......@@ -876,6 +894,17 @@ test:
token: secret
backup:
path: tmp/tests/backups
pseudonymizer:
manifest: config/pseudonymizer.yml
upload:
# The remote 'directory' to store the CSV files. For S3, this would be the bucket name.
remote_directory: gitlab-elt.test
# Fog storage connection settings, see http://fog.io/storage/
connection:
provider: AWS # Only AWS supported at the moment
aws_access_key_id: AWS_ACCESS_KEY_ID
aws_secret_access_key: AWS_SECRET_ACCESS_KEY
region: us-east-1
gitlab_shell:
path: tmp/tests/gitlab-shell/
hooks_path: tmp/tests/gitlab-shell/hooks/
......
......@@ -370,6 +370,10 @@ Settings.cron_jobs['gitlab_usage_ping_worker'] ||= Settingslogic.new({})
Settings.cron_jobs['gitlab_usage_ping_worker']['cron'] ||= Settings.__send__(:cron_for_usage_ping)
Settings.cron_jobs['gitlab_usage_ping_worker']['job_class'] = 'GitlabUsagePingWorker'
Settings.cron_jobs['pseudonymizer_worker'] ||= Settingslogic.new({})
Settings.cron_jobs['pseudonymizer_worker']['cron'] ||= '0 23 * * *'
Settings.cron_jobs['pseudonymizer_worker']['job_class'] ||= 'PseudonymizerWorker'
Settings.cron_jobs['schedule_update_user_activity_worker'] ||= Settingslogic.new({})
Settings.cron_jobs['schedule_update_user_activity_worker']['cron'] ||= '30 0 * * *'
Settings.cron_jobs['schedule_update_user_activity_worker']['job_class'] = 'ScheduleUpdateUserActivityWorker'
......@@ -470,6 +474,14 @@ Settings.backup['upload']['multipart_chunk_size'] ||= 104857600
Settings.backup['upload']['encryption'] ||= nil
Settings.backup['upload']['storage_class'] ||= nil
#
# Pseudonymizer
#
Settings['pseudonymizer'] ||= Settingslogic.new({})
Settings.pseudonymizer['manifest'] = Settings.absolute(Settings.pseudonymizer['manifest'] || Rails.root.join("config/pseudonymizer.yml"))
Settings.pseudonymizer['upload'] ||= Settingslogic.new({ 'remote_directory' => nil, 'connection' => nil })
# Settings.pseudonymizer['upload']['multipart_chunk_size'] ||= 104857600
#
# Git
#
......
This diff is collapsed.
......@@ -206,6 +206,7 @@ ActiveRecord::Schema.define(version: 20180612175636) do
t.string "encrypted_external_auth_client_key_pass_iv"
t.string "email_additional_text"
t.boolean "enforce_terms", default: false
t.boolean "pseudonymizer_enabled", default: false, null: false
end
create_table "approvals", force: :cascade do |t|
......
......@@ -167,6 +167,10 @@ created in snippets, wikis, and repos.
- [Request Profiling](monitoring/performance/request_profiling.md): Get a detailed profile on slow requests.
- [Performance Bar](monitoring/performance/performance_bar.md): Get performance information for the current page.
## Analytics
- [Pseudonymizer](pseudonymizer.md): Export data from GitLab's database to CSV files in a secure way.
## Troubleshooting
- [Debugging tips](troubleshooting/debug.md): Tips to debug problems when things go wrong
......
# Pseudonymizer
> [Introduced](https://gitlab.com/gitlab-org/gitlab-ee/merge_requests/5532) in [GitLab Ultimate][ee] 11.1.
As GitLab's database hosts sensitive information, using it unfiltered for analytics
implies high security requirements. To help alleviate this constraint, the Pseudonymizer
service is used to export GitLab's data in a pseudonymized way.
CAUTION: **Warning:**
This process is not impervious. If the source data is available, it's possible for
a user to correlate data to the pseudonymized version.
The Pseudonymizer currently uses `HMAC(SHA256)` to mutate fields that shouldn't
be textually exported. This ensures that:
- the end-user of the data source cannot infer/revert the pseudonymized fields
- the referential integrity is maintained
## Configuration
To configure the pseudonymizer, you need to:
- Provide a manifest file that describes which fields should be included or
pseudonymized ([example `manifest.yml` file](https://gitlab.com/gitlab-org/gitlab-ee/tree/master/config/pseudonymizer.yml)).
A default manifest is provided with the GitLab installation. A relative file path is resolved against the Rails root.
Alternatively, you can use an absolute file path.
- Use an object storage and specify the connection parameters in the `pseudonymizer.upload.connection` configuration option.
**For Omnibus installations:**
1. Edit `/etc/gitlab/gitlab.rb` and add the following lines by replacing with
the values you want:
```ruby
gitlab_rails['pseudonymizer_manifest'] = 'config/pseudonymizer.yml'
gitlab_rails['pseudonymizer_upload_remote_directory'] = 'gitlab-elt'
gitlab_rails['pseudonymizer_upload_connection'] = {
'provider' => 'AWS',
'region' => 'eu-central-1',
'aws_access_key_id' => 'AWS_ACCESS_KEY_ID',
'aws_secret_access_key' => 'AWS_SECRET_ACCESS_KEY'
}
```
NOTE: **Note:**
If you are using AWS IAM profiles, be sure to omit the AWS access key and secret access key/value pairs.
```ruby
gitlab_rails['pseudonymizer_upload_connection'] = {
'provider' => 'AWS',
'region' => 'eu-central-1',
'use_iam_profile' => true
}
```
1. Save the file and [reconfigure GitLab](restart_gitlab.md#omnibus-gitlab-reconfigure)
for the changes to take effect.
---
**For installations from source:**
1. Edit `/home/git/gitlab/config/gitlab.yml` and add or amend the following
lines:
```yaml
pseudonymizer:
manifest: config/pseudonymizer.yml
upload:
remote_directory: 'gitlab-elt' # The bucket name
connection:
provider: AWS # Only AWS supported at the moment
aws_access_key_id: AWS_ACCESS_KEY_ID
aws_secret_access_key: AWS_SECRET_ACCESS_KEY
region: eu-central-1
```
1. Save the file and [restart GitLab](restart_gitlab.md#installations-from-source)
for the changes to take effect.
## Usage
You can configure the Pseudonymizer with the following optional environment variables:
- `PSEUDONYMIZER_OUTPUT_DIR` - where to store the output CSV files (defaults to `/tmp`)
- `PSEUDONYMIZER_BATCH` - the batch size when querying the DB (defaults to `100000`)
```bash
## Omnibus
sudo gitlab-rake gitlab:db:pseudonymizer
## Source
sudo -u git -H bundle exec rake gitlab:db:pseudonymizer RAILS_ENV=production
```
This will produce some CSV files that might be very large, so make sure the
`PSEUDONYMIZER_OUTPUT_DIR` has sufficient space. As a rule of thumb, at least
10% of the database size is recommended.
After the pseudonymizer has run, the output CSV files should be uploaded to the
configured object storage and deleted from the local disk.
[ee]: https://about.gitlab.com/pricing/
......@@ -20,6 +20,10 @@ module EE
attrs << :email_additional_text
end
if License.feature_available?(:pseudonymizer)
attrs << :pseudonymizer_enabled
end
attrs
end
end
......
......@@ -35,6 +35,18 @@ module EE
"and the value is encrypted at rest.")
end
def pseudonymizer_enabled_help_text
_("Enable Pseudonymizer data collection")
end
def pseudonymizer_description_text
_("GitLab will run a background job that will produce pseudonymized CSVs of the GitLab database that will be uploaded to your configured object storage directory.")
end
def pseudonymizer_disabled_description_text
_("The pseudonymizer data collection is disabled. When enabled, GitLab will run a background job that will produce pseudonymized CSVs of the GitLab database that will be uploaded to your configured object storage directory.")
end
override :visible_attributes
def visible_attributes
super + [
......@@ -55,7 +67,8 @@ module EE
:slack_app_id,
:slack_app_secret,
:slack_app_verification_token,
:allow_group_owners_to_manage_ldap
:allow_group_owners_to_manage_ldap,
:pseudonymizer_enabled
]
end
......
......@@ -100,11 +100,20 @@ module EE
slack_app_enabled: false,
slack_app_id: nil,
slack_app_secret: nil,
slack_app_verification_token: nil
slack_app_verification_token: nil,
pseudonymizer_enabled: false
)
end
end
def pseudonymizer_available?
License.feature_available?(:pseudonymizer)
end
def pseudonymizer_enabled?
pseudonymizer_available? && super
end
def should_check_namespace_plan?
check_namespace_plan? && (Rails.env.test? || ::Gitlab.dev_env_or_com?)
end
......
......@@ -73,6 +73,7 @@ class License < ActiveRecord::Base
ide
chatops
pod_logs
pseudonymizer
].freeze
# List all features available for early adopters,
......
= form_for @application_setting, url: admin_application_settings_path, html: { class: 'fieldset-form' } do |f|
= form_errors(@application_setting)
%fieldset
.form-group.row
.offset-sm-2.col-sm-10
- is_enabled = @application_setting.pseudonymizer_enabled?
.form-check
= f.label :pseudonymizer_enabled do
= f.check_box :pseudonymizer_enabled
= pseudonymizer_enabled_help_text
.form-text.text-muted
- if is_enabled
= pseudonymizer_description_text
- else
= pseudonymizer_disabled_description_text
= f.submit 'Save changes', class: "btn btn-success"
- if Gitlab::CurrentSettings.pseudonymizer_available?
%section.settings.as-pseudonymizer.no-animate#js-pseudonymizer-settings{ class: ('expanded' if expanded) }
.settings-header
%h4
= _('Pseudonymizer data collection')
%button.btn.btn-default.js-settings-toggle{ type: 'button' }
= expanded ? _('Collapse') : _('Expand')
%p
= _('Enable or disable the Pseudonymizer data collection.')
.settings-content
= render 'pseudonymizer'
class PseudonymizerWorker
include ApplicationWorker
include CronjobQueue
def perform
return unless Gitlab::CurrentSettings.pseudonymizer_enabled?
options = Pseudonymizer::Options.new(
config: YAML.load_file(Gitlab.config.pseudonymizer.manifest),
output_dir: ENV['PSEUDONYMIZER_OUTPUT_DIR']
)
dumper = Pseudonymizer::Dumper.new(options)
uploader = Pseudonymizer::Uploader.new(options, progress_output: File.open(File::NULL, "w"))
unless uploader.available?
Rails.logger.error("The pseudonymizer object storage must be configured.")
return
end
begin
dumper.tables_to_csv
uploader.upload
ensure
uploader.cleanup
end
end
end
---
title: Pseudonymizer to safely export data for analytics.
merge_request: 5532
author:
type: added
# See http://doc.gitlab.com/ce/development/migration_style_guide.html
# for more information on how to write migrations for GitLab.
class AddPseudonymizerEnabledToApplicationSettings < ActiveRecord::Migration
include Gitlab::Database::MigrationHelpers
# Set this constant to true if this migration requires downtime.
DOWNTIME = false
# When a migration requires downtime you **must** uncomment the following
# constant and define a short and easy to understand explanation as to why the
# migration requires downtime.
# DOWNTIME_REASON = ''
# When using the methods "add_concurrent_index", "remove_concurrent_index" or
# "add_column_with_default" you must disable the use of transactions
# as these methods can not run in an existing transaction.
# When using "add_concurrent_index" or "remove_concurrent_index" methods make sure
# that either of them is the _only_ method called in the migration,
# any other changes should go in a separate migration.
# This ensures that upon failure _only_ the index creation or removing fails
# and can be retried or reverted easily.
#
# To disable transactions uncomment the following line and remove these
# comments:
# disable_ddl_transaction!
def change
add_column :application_settings, :pseudonymizer_enabled, :boolean, null: false, default: false
end
end
module Pseudonymizer
class Dumper
attr_accessor :config, :output_dir
def initialize(options)
@config = options.config.deep_symbolize_keys
@output_dir = options.output_dir
@start_at = options.start_at
reset!
end
def reset!
@schema = Hash.new { |h, k| h[k] = {} }
@output_files = []
end
def tables_to_csv
return @output_files unless @output_files.empty?
tables = config[:tables]
FileUtils.mkdir_p(output_dir) unless File.directory?(output_dir)
@output_files = tables.map do |k, v|
table_to_csv(k, v[:whitelist], v[:pseudo])
end.compact
schema_to_yml
file_list_to_json
@output_files
end
private
def output_filename(basename = nil, ext = "csv.gz")
File.join(output_dir, "#{basename}.#{ext}")
end
def schema_to_yml
file_path = output_filename("schema", "yml")
File.write(file_path, @schema.to_yaml)
end
def file_list_to_json
file_path = output_filename("file_list", "json")
relative_files = @output_files.map(&File.method(:basename))
File.write(file_path, relative_files.to_json)
end
def table_to_csv(table, whitelist_columns, pseudonymity_columns)
table_to_schema(table)
write_to_csv_file(
table,
table_page_results(table,
whitelist_columns,
pseudonymity_columns)
)
rescue => e
Rails.logger.error("Failed to export #{table}: #{e}")
raise e
end
# yield every results, pagined, anonymized
def table_page_results(table, whitelist_columns, pseudonymity_columns)
filter = Filter.new(table, whitelist_columns, pseudonymity_columns)
pager = Pager.new(table, whitelist_columns)
Enumerator.new do |yielder|
pager.pages do |page|
filter.anonymize(page).each do |result|
yielder << result
end
end
end.lazy
end
def table_to_schema(table)
table_config = @config.dig(:tables, table)
type_results = ActiveRecord::Base.connection.columns(table)
type_results = type_results.select do |c|
table_config[:whitelist].include?(c.name)
end
type_results = type_results.map do |c|
data_type = c.sql_type
if table_config[:pseudo].include?(c.name)
data_type = "character varying"
end
{ name: c.name, data_type: data_type }
end
set_schema_column_types(table, type_results)
end
def set_schema_column_types(table, type_results)
has_id = type_results.any? {|c| c[:name] == "id" }
type_results.each do |type_result|
@schema[table.to_s][type_result[:name]] = type_result[:data_type]
end
if has_id
# if there is an ID, it is the mapping_key
@schema[table.to_s]["gl_mapping_key"] = "id"
end
end
def write_to_csv_file(table, contents)
file_path = output_filename(table)
headers = contents.peek.keys
Rails.logger.info "#{self.class.name} writing #{table} to #{file_path}."
Zlib::GzipWriter.open(file_path) do |io|
csv = CSV.new(io, headers: headers, write_headers: true)
contents.each { |row| csv << row.values }
end
file_path
rescue StopIteration
Rails.logger.info "#{self.class.name} table #{table} is empty."
nil
end
end
end
require 'openssl'
require 'digest'
module Pseudonymizer
class Filter
def initialize(table, whitelisted_fields, pseudonymized_fields)
@table = table
@pseudo_fields = pseudo_fields(whitelisted_fields, pseudonymized_fields)
end
def anonymize(results)
key = Rails.application.secrets[:secret_key_base]
digest = OpenSSL::Digest.new('sha256')
Enumerator.new do |yielder|
results.each do |result|
@pseudo_fields.each do |field|
next if result[field].nil?
result[field] = OpenSSL::HMAC.hexdigest(digest, key, String(result[field]))
end
yielder << result
end
end
end
private
def pseudo_fields(whitelisted, pseudonymized)
pseudo_extra_fields = pseudonymized - whitelisted
pseudo_extra_fields.each do |field|
Rails.logger.warn("#{self.class.name} extraneous pseudo: #{@table}.#{field} is not whitelisted and will be ignored.")
end
pseudonymized & whitelisted
end
end
end
module Pseudonymizer
class Options
attr_reader :config
attr_reader :start_at
attr_reader :output_dir
def initialize(config: {}, output_dir: nil)
@config = config
@start_at = Time.now.utc
base_dir = output_dir || File.join(Dir.tmpdir, 'gitlab-pseudonymizer')
@output_dir = File.join(base_dir, batch_dir)
end
def upload_dir
batch_dir
end
private
def batch_dir
start_at.iso8601
end
end
end
module Pseudonymizer
  # Streams rows of a table in fixed-size pages so very large tables can
  # be dumped with bounded memory.
  class Pager
    # Batch size for DB paging. ENV values are always strings, so coerce
    # to Integer — without `.to_i`, setting PSEUDONYMIZER_BATCH would make
    # `offset += PAGE_SIZE` and `results.count < PAGE_SIZE` raise at
    # runtime (Integer vs String).
    PAGE_SIZE = ENV.fetch('PSEUDONYMIZER_BATCH', 100_000).to_i

    # table:   table name to page through.
    # columns: column names to select.
    # NOTE: both are interpolated into SQL below; they come from the
    # trusted manifest file, not from user input.
    def initialize(table, columns)
      @table = table
      @columns = columns
    end

    # Yields each page of results to the block, picking keyset pagination
    # when an "id" column is selected, offset pagination otherwise.
    def pages(&block)
      if @columns.include?("id")
        # optimize the pagination using WHERE id > ?
        pages_per_id(&block)
      else
        # fallback to `LIMIT ? OFFSET ?` when "id" is unavailable
        pages_per_offset(&block)
      end
    end

    # Keyset pagination on the primary key: O(1) per page.
    def pages_per_id(&block)
      id_offset = 0

      loop do
        # a page of results
        results = ActiveRecord::Base.connection.exec_query(<<-SQL.squish)
          SELECT #{@columns.join(",")}
          FROM #{@table}
          WHERE id > #{id_offset}
          ORDER BY id
          LIMIT #{PAGE_SIZE}
        SQL
        Rails.logger.debug("#{self.class.name} fetch ids [#{id_offset}..+#{PAGE_SIZE}]")
        break if results.empty?

        id_offset = results.last["id"].to_i
        yield results

        # a short page means we just consumed the last rows
        break if results.count < PAGE_SIZE
      end
    end

    # Offset pagination fallback; each page re-scans `offset` rows, so it
    # is slower on large tables.
    def pages_per_offset(&block)
      offset = 0

      loop do
        # a page of results; ORDER BY all columns for a stable ordering
        results = ActiveRecord::Base.connection.exec_query(<<-SQL.squish)
          SELECT #{@columns.join(",")}
          FROM #{@table}
          ORDER BY #{@columns.join(",")}
          LIMIT #{PAGE_SIZE} OFFSET #{offset}
        SQL
        Rails.logger.debug("#{self.class.name} fetching offset [#{offset}..#{offset + PAGE_SIZE}]")
        break if results.empty?

        offset += PAGE_SIZE
        yield results

        break if results.count < PAGE_SIZE
      end
    end
  end
end
module Pseudonymizer
  ObjectStorageUnavailableError = Class.new(StandardError)

  # Uploads the pseudonymizer CSV dump to the configured object storage
  # (via Fog) and cleans up the local working directory afterwards.
  class Uploader
    include Gitlab::Utils::StrongMemoize

    # NOTE(review): defined but unused here; kept in case external code
    # references it.
    RemoteStorageUnavailableError = Class.new(StandardError)

    # Our settings use string keys, but Fog expects symbols
    def self.object_store_credentials
      Gitlab.config.pseudonymizer.upload.connection.to_hash.deep_symbolize_keys
    end

    def self.remote_directory
      Gitlab.config.pseudonymizer.upload.remote_directory
    end

    # options:         Pseudonymizer::Options for the current batch.
    # progress_output: IO for user feedback; defaults to $stdout.
    def initialize(options, progress_output: nil)
      @progress_output = progress_output || $stdout
      @config = options.config
      @output_dir = options.output_dir
      @upload_dir = options.upload_dir
      @remote_dir = self.class.remote_directory
      @connection_params = self.class.object_store_credentials
    end

    # True when the remote directory is reachable with the configured
    # credentials.
    def available?
      !connect_to_remote_directory.nil?
    rescue ObjectStorageUnavailableError
      false
    end

    # Uploads every file in the output directory to the remote directory.
    # Aborts the process when object storage is not configured.
    def upload
      progress_output.puts "Uploading output files to remote storage #{remote_directory}:"

      file_list.each do |file|
        upload_file(file, remote_directory)
      end
    rescue ObjectStorageUnavailableError
      # Fixed typo: was "pseudonimizer", which pointed users at a
      # non-existent settings key.
      abort "Cannot upload files, make sure the `pseudonymizer.upload.connection` is set properly"
    end

    # Removes the local output directory. Failures are reported but not
    # raised — cleanup is best-effort.
    def cleanup
      return unless File.exist?(@output_dir)

      progress_output.print "Deleting tmp directory #{@output_dir} ... "
      FileUtils.rm_rf(@output_dir)
      progress_output.puts "done"
    rescue
      progress_output.puts "failed"
    end

    private

    attr_reader :progress_output

    # Uploads a single file to the remote directory.
    def upload_file(file, directory)
      progress_output.print "\t#{file} ... "

      # Open with a block so the file descriptor is always released
      # (the original leaked one handle per uploaded file).
      created = File.open(file) do |body|
        directory.files.create(key: File.join(@upload_dir, File.basename(file)),
                               body: body,
                               public: false)
      end

      progress_output.puts(created ? "done" : "failed")
    end

    def remote_directory
      strong_memoize(:remote_directory) { connect_to_remote_directory }
    end

    # Connects to the configured storage and returns the target directory.
    # Raises ObjectStorageUnavailableError when no connection is configured.
    def connect_to_remote_directory
      if @connection_params.blank?
        raise ObjectStorageUnavailableError
      end

      connection = ::Fog::Storage.new(@connection_params)

      # We only attempt to create the directory for local backups. For AWS
      # and other cloud providers, we cannot guarantee the user will have
      # permission to create the bucket.
      if connection.service == ::Fog::Storage::Local
        connection.directories.create(key: @remote_dir)
      else
        connection.directories.get(@remote_dir)
      end
    end

    def file_list
      Dir[File.join(@output_dir, "*")]
    end
  end
end
namespace :gitlab do
namespace :db do
desc 'Output pseudonymity dump of selected tables'
task pseudonymizer: :environment do
abort "The pseudonymizer is not available with this license." unless License.feature_available?(:pseudonymizer)
abort "The pseudonymizer is disabled." unless Gitlab::CurrentSettings.pseudonymizer_enabled?
options = Pseudonymizer::Options.new(
config: YAML.load_file(Gitlab.config.pseudonymizer.manifest),
output_dir: ENV['PSEUDONYMIZER_OUTPUT_DIR']
)
dumper = Pseudonymizer::Dumper.new(options)
uploader = Pseudonymizer::Uploader.new(options)
abort "There is an error in the pseudonymizer object store configuration." unless uploader.available?
begin
dumper.tables_to_csv
uploader.upload
ensure
uploader.cleanup
end
end
end
end
require 'spec_helper'
describe Pseudonymizer::Dumper do
let!(:project) { create(:project) }
let(:base_dir) { Dir.mktmpdir }
let(:options) do
Pseudonymizer::Options.new(
config: YAML.load_file(Gitlab.config.pseudonymizer.manifest)
)
end
subject(:pseudo) { described_class.new(options) }
before do
allow(options).to receive(:output_dir).and_return(base_dir)
end
after do
FileUtils.rm_rf(base_dir)
end
describe 'Pseudo tables' do
it 'outputs project tables to csv' do
column_names = %w(id name path description)
pseudo.config[:tables] = {
projects: {
whitelist: column_names,
pseudo: %w(id)
}
}
expect(pseudo.output_dir).to eq(base_dir)
# grab the first table it outputs. There would only be 1.
project_table_file = pseudo.tables_to_csv[0]
expect(project_table_file).to end_with("projects.csv.gz")
columns = []
project_data = []
Zlib::GzipReader.open(project_table_file) do |gz|
csv = CSV.new(gz, headers: true)
# csv.shift # read the header row
project_data = csv.gets
columns = csv.headers
end
# check if CSV columns are correct
expect(columns).to include(*column_names)
# is it pseudonymous
# sha 256 is 64 chars in length
expect(project_data["id"].length).to eq(64)
end
it "warns when pseudonymized fields are extraneous" do
column_names = %w(id name path description)
pseudo.config[:tables] = {
projects: {
whitelist: column_names,
pseudo: %w(id extraneous)
}
}
expect(Rails.logger).to receive(:warn).with(/extraneous/)
pseudo.tables_to_csv
end
end
describe "manifest is valid" do
it "all tables exist" do
existing_tables = ActiveRecord::Base.connection.tables
tables = options.config['tables'].keys
expect(existing_tables).to include(*tables)
end
it "all whitelisted attributes exist" do
options.config['tables'].each do |table, table_def|
whitelisted = table_def['whitelist']
existing_columns = ActiveRecord::Base.connection.columns(table.to_sym).map(&:name)
diff = whitelisted - existing_columns
expect(diff).to be_empty, "#{table} should define columns #{whitelisted.inspect}: missing #{diff.inspect}"
end
end
it "all pseudonymized attributes are whitelisted" do
options.config['tables'].each do |table, table_def|
whitelisted = table_def['whitelist']
pseudonymized = table_def['pseudo']
diff = pseudonymized - whitelisted
expect(diff).to be_empty, "#{table} should whitelist columns #{pseudonymized.inspect}: missing #{diff.inspect}"
end
end
end
end
require 'spec_helper'
describe Pseudonymizer::Pager do
let(:page_size) { 1 }
let!(:projects) { create_list(:project, 10) }
subject { described_class.new("projects", whitelisted_columns) }
before do
stub_const("Pseudonymizer::Pager::PAGE_SIZE", page_size)
end
shared_examples "yield results in page" do
it do
page_count = 0
result_count = 0
subject.pages do |page|
result_count += page.count
page_count += 1
end
expect(result_count).to eq(projects.count)
expect(page_count).to eq(projects.count / page_size)
end
end
context "`id` column is present" do
let(:whitelisted_columns) { %w(id name) }
describe "#pages" do
it "delegates to #pages_per_id" do
expect(subject).to receive(:pages_per_id)
subject.pages {|page| nil}
end
include_examples "yield results in page"
end
end
context "`id` column is missing" do
let(:whitelisted_columns) { %w(name) }
describe "#pages" do
it "delegates to #pages_per_offset" do
expect(subject).to receive(:pages_per_offset)
subject.pages {|page| nil}
end
include_examples "yield results in page"
end
end
end
require 'spec_helper'
describe Pseudonymizer::Uploader do
let(:base_dir) { Dir.mktmpdir }
let(:options) do
Pseudonymizer::Options.new(
config: YAML.load_file(Gitlab.config.pseudonymizer.manifest)
)
end
let(:remote_directory) { subject.send(:remote_directory) }
subject { described_class.new(options) }
def mock_file(file_name)
FileUtils.touch(File.join(base_dir, file_name))
end
before do
allow(options).to receive(:output_dir).and_return(base_dir)
stub_object_storage_pseudonymizer
10.times {|i| mock_file("file_#{i}.test")}
mock_file("schema.yml")
mock_file("file_list.json")
end
after do
FileUtils.rm_rf(base_dir)
end
describe "#upload" do
it "upload all file in the directory" do
subject.upload
expect(remote_directory.files.all.count).to eq(12)
end
end
describe "#cleanup" do
it "cleans the directory" do
subject.cleanup
expect(Dir[File.join(base_dir, "*")].length).to eq(0)
end
end
end
......@@ -15,9 +15,14 @@ module StubObjectStorage
return unless enabled
stub_object_storage(connection_params: uploader.object_store_credentials,
remote_directory: remote_directory)
end
def stub_object_storage(connection_params:, remote_directory:)
Fog.mock!
::Fog::Storage.new(uploader.object_store_credentials).tap do |connection|
::Fog::Storage.new(connection_params).tap do |connection|
begin
connection.directories.create(key: remote_directory)
rescue Excon::Error::Conflict
......@@ -57,4 +62,9 @@ module StubObjectStorage
</InitiateMultipartUploadResult>
EOS
end
def stub_object_storage_pseudonymizer
stub_object_storage(connection_params: Pseudonymizer::Uploader.object_store_credentials,
remote_directory: Pseudonymizer::Uploader.remote_directory)
end
end
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment