Commit f28ff040 authored by Stan Hu's avatar Stan Hu

Merge branch 'gitlab-elt' into 'master'

Adds method to move tables to CSV with redacted data.

Closes meltano/meltano#80

See merge request gitlab-org/gitlab-ee!5532
parents a43e2609 4b90a8f8
......@@ -373,3 +373,5 @@
= _('Geo allows you to replicate your GitLab instance to other geographical locations.')
.settings-content
= render partial: 'slack'
= render_if_exists 'admin/application_settings/pseudonymizer_settings', expanded: expanded
......@@ -145,6 +145,7 @@
- cronjob:ldap_all_groups_sync
- cronjob:ldap_sync
- cronjob:update_all_mirrors
- cronjob:pseudonymizer
- geo:geo_scheduler_scheduler
- geo:geo_scheduler_primary_scheduler
......
......@@ -311,6 +311,10 @@ production: &base
geo_migrated_local_files_clean_up_worker:
cron: "15 */6 * * *"
# Export pseudonymized data in CSV format for analysis
pseudonymizer_worker:
cron: "0 * * * *"
registry:
# enabled: true
# host: registry.example.com
......@@ -726,6 +730,20 @@ production: &base
# # Specifies Amazon S3 storage class to use for backups, this is optional
# # storage_class: 'STANDARD'
## Pseudonymizer exporter
pseudonymizer:
# Tables manifest that specifies the fields to extract and pseudonymize.
manifest: config/pseudonymizer.yml
upload:
# remote_directory: 'gitlab-elt'
# Fog storage connection settings, see http://fog.io/storage/ .
connection:
# provider: AWS
# region: eu-west-1
# aws_access_key_id: AKIAKIAKI
# aws_secret_access_key: 'secret123'
# # The remote 'directory' to store the CSV files. For S3, this would be the bucket name.
## GitLab Shell settings
gitlab_shell:
path: /home/git/gitlab-shell/
......@@ -876,6 +894,17 @@ test:
token: secret
backup:
path: tmp/tests/backups
pseudonymizer:
manifest: config/pseudonymizer.yml
upload:
# The remote 'directory' to store the CSV files. For S3, this would be the bucket name.
remote_directory: gitlab-elt.test
# Fog storage connection settings, see http://fog.io/storage/
connection:
provider: AWS # Only AWS supported at the moment
aws_access_key_id: AWS_ACCESS_KEY_ID
aws_secret_access_key: AWS_SECRET_ACCESS_KEY
region: us-east-1
gitlab_shell:
path: tmp/tests/gitlab-shell/
hooks_path: tmp/tests/gitlab-shell/hooks/
......
......@@ -370,6 +370,10 @@ Settings.cron_jobs['gitlab_usage_ping_worker'] ||= Settingslogic.new({})
Settings.cron_jobs['gitlab_usage_ping_worker']['cron'] ||= Settings.__send__(:cron_for_usage_ping)
Settings.cron_jobs['gitlab_usage_ping_worker']['job_class'] = 'GitlabUsagePingWorker'
Settings.cron_jobs['pseudonymizer_worker'] ||= Settingslogic.new({})
Settings.cron_jobs['pseudonymizer_worker']['cron'] ||= '0 23 * * *'
Settings.cron_jobs['pseudonymizer_worker']['job_class'] ||= 'PseudonymizerWorker'
Settings.cron_jobs['schedule_update_user_activity_worker'] ||= Settingslogic.new({})
Settings.cron_jobs['schedule_update_user_activity_worker']['cron'] ||= '30 0 * * *'
Settings.cron_jobs['schedule_update_user_activity_worker']['job_class'] = 'ScheduleUpdateUserActivityWorker'
......@@ -470,6 +474,14 @@ Settings.backup['upload']['multipart_chunk_size'] ||= 104857600
Settings.backup['upload']['encryption'] ||= nil
Settings.backup['upload']['storage_class'] ||= nil
#
# Pseudonymizer
#
Settings['pseudonymizer'] ||= Settingslogic.new({})
Settings.pseudonymizer['manifest'] = Settings.absolute(Settings.pseudonymizer['manifest'] || Rails.root.join("config/pseudonymizer.yml"))
Settings.pseudonymizer['upload'] ||= Settingslogic.new({ 'remote_directory' => nil, 'connection' => nil })
# Settings.pseudonymizer['upload']['multipart_chunk_size'] ||= 104857600
#
# Git
#
......
tables:
approvals:
whitelist:
- id
- merge_request_id
- user_id
- created_at
- updated_at
pseudo:
- id
- merge_request_id
- user_id
approver_groups:
whitelist:
- id
- target_type
- group_id
- created_at
- updated_at
pseudo:
- id
- group_id
board_assignees:
whitelist:
- id
- board_id
- assignee_id
pseudo:
- id
- board_id
- assignee_id
board_labels:
whitelist:
- id
- board_id
- label_id
pseudo:
- id
- board_id
- label_id
boards:
whitelist:
- id
- project_id
- created_at
- updated_at
- milestone_id
- group_id
- weight
pseudo:
- id
- project_id
- milestone_id
- group_id
epic_issues:
whitelist:
- id
- epic_id
- issue_id
- relative_position
pseudo:
- id
- epic_id
- issue_id
epic_metrics:
whitelist:
- id
- epic_id
- created_at
- updated_at
pseudo:
- id
- epic_id
epics:
whitelist:
- id
- milestone_id
- group_id
- author_id
- assignee_id
- iid
- updated_by_id
- last_edited_by_id
- lock_version
- start_date
- end_date
- last_edited_at
- created_at
- updated_at
- title
- description
pseudo:
- id
- milestone_id
- group_id
- author_id
- assignee_id
- iid
- updated_by_id
- last_edited_by_id
- title
- description
issue_assignees:
whitelist:
- user_id
- issue_id
pseudo:
- user_id
- issue_id
issue_links:
whitelist:
- id
- source_id
- target_id
- created_at
- updated_at
pseudo:
- id
- source_id
- target_id
issue_metrics:
whitelist:
- id
- issue_id
- first_mentioned_in_commit_at
- first_associated_with_milestone_at
- first_added_to_board_at
- created_at
- updated_at
pseudo:
- id
- issue_id
issues:
whitelist:
- id
- title
- author_id
- project_id
- created_at
- updated_at
- description
- milestone_id
- state
- updated_by_id
- weight
- due_date
- moved_to_id
- lock_version
- time_estimate
- last_edited_at
- last_edited_by_id
- discussion_locked
- closed_at
pseudo:
- id
- title
- author_id
- project_id
- description
- milestone_id
- updated_by_id
- moved_to_id
- discussion_locked
label_links:
whitelist:
- id
- label_id
- target_id
- target_type
- created_at
- updated_at
pseudo:
- id
- label_id
- target_id
label_priorities:
whitelist:
- id
- project_id
- label_id
- priority
- created_at
- updated_at
pseudo:
- id
- project_id
- label_id
labels:
whitelist:
- id
- title
- color
- project_id
- created_at
- updated_at
- template
- type
- group_id
pseudo:
- id
- title
- color
- project_id
- template
- type
- group_id
licenses:
whitelist:
- id
- created_at
- updated_at
pseudo:
- id
merge_request_diffs:
whitelist:
- id
- state
- merge_request_id
- created_at
- updated_at
- base_commit_sha
- real_size
- head_commit_sha
- start_commit_sha
- commits_count
pseudo:
- id
- merge_request_id
- base_commit_sha
- head_commit_sha
- start_commit_sha
merge_request_metrics:
whitelist:
- id
- merge_request_id
- latest_build_started_at
- latest_build_finished_at
- first_deployed_to_production_at
- merged_at
- created_at
- updated_at
- pipeline_id
- merged_by_id
- latest_closed_by_id
- latest_closed_at
pseudo:
- id
- merge_request_id
- pipeline_id
- merged_by_id
- latest_closed_by_id
merge_requests:
whitelist:
- id
- target_branch
- source_branch
- source_project_id
- author_id
- assignee_id
- created_at
- updated_at
- milestone_id
- state
- merge_status
- target_project_id
- updated_by_id
- merge_error
- merge_params
- merge_when_pipeline_succeeds
- merge_user_id
- approvals_before_merge
- lock_version
- time_estimate
- squash
- last_edited_at
- last_edited_by_id
- head_pipeline_id
- discussion_locked
- latest_merge_request_diff_id
- allow_maintainer_to_push
pseudo:
- id
- target_branch
- source_branch
- source_project_id
- author_id
- assignee_id
- milestone_id
- target_project_id
- updated_by_id
- merge_user_id
- last_edited_by_id
- head_pipeline_id
- latest_merge_request_diff_id
merge_requests_closing_issues:
whitelist:
- id
- merge_request_id
- issue_id
- created_at
- updated_at
pseudo:
- id
- merge_request_id
- issue_id
milestones:
whitelist:
- id
- project_id
- due_date
- created_at
- updated_at
- state
- start_date
- group_id
pseudo:
- id
- project_id
- group_id
namespace_statistics:
whitelist:
- id
- namespace_id
- shared_runners_seconds
- shared_runners_seconds_last_reset
pseudo:
- id
- namespace_id
namespaces:
whitelist:
- id
- name
- path
- owner_id
- created_at
- updated_at
- type
- avatar
- membership_lock
- share_with_group_lock
- visibility_level
- request_access_enabled
- ldap_sync_status
- ldap_sync_error
- ldap_sync_last_update_at
- ldap_sync_last_successful_update_at
- ldap_sync_last_sync_at
- lfs_enabled
- parent_id
- shared_runners_minutes_limit
- repository_size_limit
- require_two_factor_authentication
- two_factor_grace_period
- plan_id
- project_creation_level
pseudo:
- id
- name
- path
- owner_id
- type
- avatar
- membership_lock
- share_with_group_lock
- visibility_level
- request_access_enabled
- ldap_sync_status
- ldap_sync_error
- parent_id
- shared_runners_minutes_limit
- repository_size_limit
- require_two_factor_authentication
- two_factor_grace_period
- plan_id
- project_creation_level
notes:
whitelist:
- id
- note
- noteable_type
- author_id
- created_at
- updated_at
- project_id
- line_code
- commit_id
- noteable_id
- system
- updated_by_id
- type
- position
- original_position
- resolved_at
- resolved_by_id
- discussion_id
- change_position
- resolved_by_push
pseudo:
- id
- note
- author_id
- project_id
- commit_id
- noteable_id
- updated_by_id
- resolved_by_id
- discussion_id
notification_settings:
whitelist:
- id
- user_id
- source_id
- source_type
- level
- created_at
- updated_at
- new_note
- new_issue
- reopen_issue
- close_issue
- reassign_issue
- new_merge_request
- reopen_merge_request
- close_merge_request
- reassign_merge_request
- merge_merge_request
- failed_pipeline
- success_pipeline
pseudo:
- id
- user_id
- source_id
- source_type
- level
- new_note
- new_issue
- reopen_issue
- close_issue
- reassign_issue
- new_merge_request
- reopen_merge_request
- close_merge_request
- reassign_merge_request
- merge_merge_request
- failed_pipeline
- success_pipeline
project_authorizations:
whitelist:
- user_id
- project_id
- access_level
pseudo:
- user_id
- project_id
- access_level
project_auto_devops:
whitelist:
- id
- project_id
- created_at
- updated_at
- enabled
- domain
pseudo:
- id
- project_id
- enabled
- domain
project_custom_attributes:
whitelist:
- id
- created_at
- updated_at
- project_id
- key
- value
pseudo:
- id
- project_id
- key
- value
project_features:
whitelist:
- id
- project_id
- merge_requests_access_level
- issues_access_level
- wiki_access_level
- snippets_access_level
- builds_access_level
- created_at
- updated_at
- repository_access_level
pseudo:
- id
- project_id
- merge_requests_access_level
- issues_access_level
- wiki_access_level
- snippets_access_level
- builds_access_level
- repository_access_level
project_group_links:
whitelist:
- id
- project_id
- group_id
- created_at
- updated_at
- group_access
- expires_at
pseudo:
- id
- project_id
- group_id
- group_access
project_import_data:
whitelist:
- id
- project_id
pseudo:
- id
- project_id
project_mirror_data:
whitelist:
- id
- project_id
- retry_count
- last_update_started_at
- last_update_scheduled_at
- next_execution_timestamp
pseudo:
- id
- project_id
project_repository_states:
whitelist:
- id
- project_id
- repository_verification_checksum
- wiki_verification_checksum
- last_repository_verification_failure
- last_wiki_verification_failure
pseudo:
- id
- project_id
- repository_verification_checksum
- wiki_verification_checksum
- last_repository_verification_failure
- last_wiki_verification_failure
project_statistics:
whitelist:
- id
- project_id
- namespace_id
- commit_count
- storage_size
- repository_size
- lfs_objects_size
- build_artifacts_size
- shared_runners_seconds
- shared_runners_seconds_last_reset
pseudo:
- id
- project_id
- namespace_id
- commit_count
- storage_size
- repository_size
- lfs_objects_size
- build_artifacts_size
- shared_runners_seconds
- shared_runners_seconds_last_reset
projects:
whitelist:
- id
- name
- path
- description
- created_at
- updated_at
- creator_id
- namespace_id
- last_activity_at
- import_url
- visibility_level
- archived
- avatar
- import_status
- merge_requests_template
- star_count
- merge_requests_rebase_enabled
- import_type
- import_source
- approvals_before_merge
- reset_approvals_on_push
- merge_requests_ff_only_enabled
- issues_template
- mirror
- mirror_last_update_at
- mirror_last_successful_update_at
- mirror_user_id
- import_error
- ci_id
- shared_runners_enabled
- build_coverage_regex
- build_allow_git_fetch
- build_timeout
- mirror_trigger_builds
- pending_delete
- public_builds
- last_repository_check_failed
- last_repository_check_at
- container_registry_enabled
- only_allow_merge_if_pipeline_succeeds
- has_external_issue_tracker
- repository_storage
- repository_read_only
- request_access_enabled
- has_external_wiki
- ci_config_path
- lfs_enabled
- only_allow_merge_if_all_discussions_are_resolved
- repository_size_limit
- printing_merge_request_link_enabled
- auto_cancel_pending_pipelines
- service_desk_enabled
- import_jid
- delete_error
- last_repository_updated_at
- disable_overriding_approvers_per_merge_request
- storage_version
- resolve_outdated_diff_discussions
- remote_mirror_available_overridden
- only_mirror_protected_branches
- pull_mirror_available_overridden
- mirror_overwrites_diverged_branches
- external_authorization_classification_label
pseudo:
- id
- name
- path
- description
- creator_id
- namespace_id
- import_url
- visibility_level
- archived
- avatar
- import_status
- merge_requests_template
- star_count
- import_type
- import_source
- approvals_before_merge
- reset_approvals_on_push
- issues_template
- mirror
- mirror_last_successful_update_at
- mirror_user_id
- import_error
- ci_id
- shared_runners_enabled
- build_coverage_regex
- build_allow_git_fetch
- build_timeout
- mirror_trigger_builds
- pending_delete
- public_builds
- last_repository_check_failed
- only_allow_merge_if_pipeline_succeeds
- repository_storage
- repository_read_only
- ci_config_path
- only_allow_merge_if_all_discussions_are_resolved
- repository_size_limit
- auto_cancel_pending_pipelines
- import_jid
- delete_error
- last_repository_updated_at
- disable_overriding_approvers_per_merge_request
- storage_version
- resolve_outdated_diff_discussions
- remote_mirror_available_overridden
- pull_mirror_available_overridden
- mirror_overwrites_diverged_branches
- external_authorization_classification_label
subscriptions:
whitelist:
- id
- user_id
- subscribable_id
- subscribable_type
- subscribed
- created_at
- updated_at
- project_id
pseudo:
- id
- user_id
- subscribable_id
- project_id
users:
whitelist:
- id
- email
- remember_created_at
- sign_in_count
- current_sign_in_at
- last_sign_in_at
- current_sign_in_ip
- last_sign_in_ip
- created_at
- updated_at
- name
- admin
- projects_limit
- skype
- linkedin
- twitter
- bio
- failed_attempts
- locked_at
- username
- can_create_group
- can_create_team
- state
- color_scheme_id
- password_expires_at
- created_by_id
- last_credential_check_at
- avatar
- confirmed_at
- confirmation_sent_at
- unconfirmed_email
- hide_no_ssh_key
- website_url
- admin_email_unsubscribed_at
- notification_email
- hide_no_password
- password_automatically_set
- location
- public_email
- dashboard
- project_view
- consumed_timestep
- layout
- hide_project_limit
- note
- otp_grace_period_started_at
- external
- organization
- auditor
- require_two_factor_authentication_from_group
- two_factor_grace_period
- ghost
- last_activity_on
- notified_of_own_activity
- support_bot
- preferred_language
- theme_id
pseudo:
- id
- email
- current_sign_in_ip
- last_sign_in_ip
- name
- admin
- skype
- linkedin
- twitter
- username
- created_by_id
- avatar
- unconfirmed_email
- hide_no_ssh_key
- website_url
- notification_email
- location
- public_email
- consumed_timestep
- hide_project_limit
- note
- external
- organization
- auditor
- two_factor_grace_period
- theme_id
......@@ -206,6 +206,7 @@ ActiveRecord::Schema.define(version: 20180612175636) do
t.string "encrypted_external_auth_client_key_pass_iv"
t.string "email_additional_text"
t.boolean "enforce_terms", default: false
t.boolean "pseudonymizer_enabled", default: false, null: false
end
create_table "approvals", force: :cascade do |t|
......
......@@ -167,6 +167,10 @@ created in snippets, wikis, and repos.
- [Request Profiling](monitoring/performance/request_profiling.md): Get a detailed profile on slow requests.
- [Performance Bar](monitoring/performance/performance_bar.md): Get performance information for the current page.
## Analytics
- [Pseudonymizer](pseudonymizer.md): Export data from GitLab's database to CSV files in a secure way.
## Troubleshooting
- [Debugging tips](troubleshooting/debug.md): Tips to debug problems when things go wrong
......
# Pseudonymizer
> [Introduced](https://gitlab.com/gitlab-org/gitlab-ee/merge_requests/5532) in [GitLab Ultimate][ee] 11.1.
As GitLab's database hosts sensitive information, using it unfiltered for analytics
implies high security requirements. To help alleviate this constraint, the Pseudonymizer
service is used to export GitLab's data in a pseudonymized way.
CAUTION: **Warning:**
This process is not impervious. If the source data is available, it's possible for
a user to correlate data to the pseudonymized version.
The Pseudonymizer currently uses `HMAC(SHA256)` to mutate fields that shouldn't
be textually exported. This ensures that:
- the end-user of the data source cannot infer/revert the pseudonymized fields
- the referential integrity is maintained
## Configuration
To configure the pseudonymizer, you need to:
- Provide a manifest file that describes which fields should be included or
pseudonymized ([example `manifest.yml` file](https://gitlab.com/gitlab-org/gitlab-ee/tree/master/config/pseudonymizer.yml)).
A default manifest is provided with the GitLab installation. A relative file path is resolved from the Rails root.
Alternatively, you can use an absolute file path.
- Use an object storage and specify the connection parameters in the `pseudonymizer.upload.connection` configuration option.
**For Omnibus installations:**
1. Edit `/etc/gitlab/gitlab.rb` and add the following lines by replacing with
the values you want:
```ruby
gitlab_rails['pseudonymizer_manifest'] = 'config/pseudonymizer.yml'
gitlab_rails['pseudonymizer_upload_remote_directory'] = 'gitlab-elt'
gitlab_rails['pseudonymizer_upload_connection'] = {
'provider' => 'AWS',
'region' => 'eu-central-1',
'aws_access_key_id' => 'AWS_ACCESS_KEY_ID',
'aws_secret_access_key' => 'AWS_SECRET_ACCESS_KEY'
}
```
NOTE: **Note:**
If you are using AWS IAM profiles, be sure to omit the AWS access key and secret access key/value pairs.
```ruby
gitlab_rails['pseudonymizer_upload_connection'] = {
'provider' => 'AWS',
'region' => 'eu-central-1',
'use_iam_profile' => true
}
```
1. Save the file and [reconfigure GitLab](restart_gitlab.md#omnibus-gitlab-reconfigure)
for the changes to take effect.
---
**For installations from source:**
1. Edit `/home/git/gitlab/config/gitlab.yml` and add or amend the following
lines:
```yaml
pseudonymizer:
manifest: config/pseudonymizer.yml
upload:
remote_directory: 'gitlab-elt' # The bucket name
connection:
provider: AWS # Only AWS supported at the moment
aws_access_key_id: AWS_ACCESS_KEY_ID
aws_secret_access_key: AWS_SECRET_ACCESS_KEY
region: eu-central-1
```
1. Save the file and [restart GitLab](restart_gitlab.md#installations-from-source)
for the changes to take effect.
## Usage
You can optionally configure the pseudonymizer with the following environment variables:
- `PSEUDONYMIZER_OUTPUT_DIR` - where to store the output CSV files (defaults to `/tmp`)
- `PSEUDONYMIZER_BATCH` - the batch size when querying the DB (defaults to `100000`)
```bash
## Omnibus
sudo gitlab-rake gitlab:db:pseudonymizer
## Source
sudo -u git -H bundle exec rake gitlab:db:pseudonymizer RAILS_ENV=production
```
This will produce some CSV files that might be very large, so make sure the
`PSEUDONYMIZER_OUTPUT_DIR` has sufficient space. As a rule of thumb, at least
10% of the database size is recommended.
After the pseudonymizer has run, the output CSV files should be uploaded to the
configured object storage and deleted from the local disk.
[ee]: https://about.gitlab.com/pricing/
......@@ -20,6 +20,10 @@ module EE
attrs << :email_additional_text
end
if License.feature_available?(:pseudonymizer)
attrs << :pseudonymizer_enabled
end
attrs
end
end
......
......@@ -35,6 +35,18 @@ module EE
"and the value is encrypted at rest.")
end
def pseudonymizer_enabled_help_text
_("Enable Pseudonymizer data collection")
end
def pseudonymizer_description_text
_("GitLab will run a background job that will produce pseudonymized CSVs of the GitLab database that will be uploaded to your configured object storage directory.")
end
def pseudonymizer_disabled_description_text
_("The pseudonymizer data collection is disabled. When enabled, GitLab will run a background job that will produce pseudonymized CSVs of the GitLab database that will be uploaded to your configured object storage directory.")
end
override :visible_attributes
def visible_attributes
super + [
......@@ -55,7 +67,8 @@ module EE
:slack_app_id,
:slack_app_secret,
:slack_app_verification_token,
:allow_group_owners_to_manage_ldap
:allow_group_owners_to_manage_ldap,
:pseudonymizer_enabled
]
end
......
......@@ -100,11 +100,20 @@ module EE
slack_app_enabled: false,
slack_app_id: nil,
slack_app_secret: nil,
slack_app_verification_token: nil
slack_app_verification_token: nil,
pseudonymizer_enabled: false
)
end
end
def pseudonymizer_available?
License.feature_available?(:pseudonymizer)
end
def pseudonymizer_enabled?
pseudonymizer_available? && super
end
def should_check_namespace_plan?
check_namespace_plan? && (Rails.env.test? || ::Gitlab.dev_env_or_com?)
end
......
......@@ -73,6 +73,7 @@ class License < ActiveRecord::Base
ide
chatops
pod_logs
pseudonymizer
].freeze
# List all features available for early adopters,
......
= form_for @application_setting, url: admin_application_settings_path, html: { class: 'fieldset-form' } do |f|
= form_errors(@application_setting)
%fieldset
.form-group.row
.offset-sm-2.col-sm-10
- is_enabled = @application_setting.pseudonymizer_enabled?
.form-check
= f.label :pseudonymizer_enabled do
= f.check_box :pseudonymizer_enabled
= pseudonymizer_enabled_help_text
.form-text.text-muted
- if is_enabled
= pseudonymizer_description_text
- else
= pseudonymizer_disabled_description_text
= f.submit 'Save changes', class: "btn btn-success"
- if Gitlab::CurrentSettings.pseudonymizer_available?
%section.settings.as-pseudonymizer.no-animate#js-pseudonymizer-settings{ class: ('expanded' if expanded) }
.settings-header
%h4
= _('Pseudonymizer data collection')
%button.btn.btn-default.js-settings-toggle{ type: 'button' }
= expanded ? _('Collapse') : _('Expand')
%p
= _('Enable or disable the Pseudonymizer data collection.')
.settings-content
= render 'pseudonymizer'
# Cron-scheduled worker that exports pseudonymized CSVs of the database
# and pushes them to the configured object storage. No-op unless the
# `pseudonymizer_enabled` application setting is on.
class PseudonymizerWorker
  include ApplicationWorker
  include CronjobQueue

  def perform
    return unless Gitlab::CurrentSettings.pseudonymizer_enabled?

    options = Pseudonymizer::Options.new(
      config: YAML.load_file(Gitlab.config.pseudonymizer.manifest),
      output_dir: ENV['PSEUDONYMIZER_OUTPUT_DIR']
    )

    # Silence progress output: this runs from Sidekiq, not a terminal.
    uploader = Pseudonymizer::Uploader.new(options, progress_output: File.open(File::NULL, "w"))

    unless uploader.available?
      Rails.logger.error("The pseudonymizer object storage must be configured.")
      return
    end

    begin
      Pseudonymizer::Dumper.new(options).tables_to_csv
      uploader.upload
    ensure
      # Always reclaim the local disk space, even on upload failure.
      uploader.cleanup
    end
  end
end
---
title: Pseudonymizer to safely export data for analytics.
merge_request: 5532
author:
type: added
# See http://doc.gitlab.com/ce/development/migration_style_guide.html
# for more information on how to write migrations for GitLab.

# Adds the `pseudonymizer_enabled` boolean flag (default: off) to
# application_settings. Plain add_column with a default on a small
# table, so no downtime and no disabled DDL transaction needed.
class AddPseudonymizerEnabledToApplicationSettings < ActiveRecord::Migration
  include Gitlab::Database::MigrationHelpers

  DOWNTIME = false

  def change
    add_column :application_settings, :pseudonymizer_enabled, :boolean, null: false, default: false
  end
end
module Pseudonymizer
  # Dumps the whitelisted columns of each manifest table into gzipped CSV
  # files, pseudonymizing the configured fields via Filter, and writes a
  # schema.yml / file_list.json describing the export.
  class Dumper
    attr_accessor :config, :output_dir

    # options - a Pseudonymizer::Options (parsed manifest, output dir,
    #           batch start time).
    def initialize(options)
      @config = options.config.deep_symbolize_keys
      @output_dir = options.output_dir
      @start_at = options.start_at
      reset!
    end

    def reset!
      # table name (String) -> { column name -> SQL data type }
      @schema = Hash.new { |h, k| h[k] = {} }
      @output_files = []
    end

    # Exports every manifest table to "<table>.csv.gz" under output_dir,
    # then writes the schema and file-list companions. Memoized: a second
    # call returns the previously produced file list without re-exporting.
    def tables_to_csv
      return @output_files unless @output_files.empty?

      tables = config[:tables]
      FileUtils.mkdir_p(output_dir) unless File.directory?(output_dir)

      @output_files = tables.map do |k, v|
        table_to_csv(k, v[:whitelist], v[:pseudo])
      end.compact # empty tables yield nil (see write_to_csv_file)

      schema_to_yml
      file_list_to_json

      @output_files
    end

    private

    def output_filename(basename = nil, ext = "csv.gz")
      File.join(output_dir, "#{basename}.#{ext}")
    end

    def schema_to_yml
      file_path = output_filename("schema", "yml")
      File.write(file_path, @schema.to_yaml)
    end

    def file_list_to_json
      file_path = output_filename("file_list", "json")
      # only basenames: consumers join them with the remote batch dir
      relative_files = @output_files.map(&File.method(:basename))
      File.write(file_path, relative_files.to_json)
    end

    # Exports one table; records its schema first, then streams its rows.
    # Re-raises after logging so a broken table aborts the whole dump.
    def table_to_csv(table, whitelist_columns, pseudonymity_columns)
      table_to_schema(table)
      write_to_csv_file(
        table,
        table_page_results(table,
                           whitelist_columns,
                           pseudonymity_columns)
      )
    rescue => e
      Rails.logger.error("Failed to export #{table}: #{e}")
      raise e
    end

    # Yields every result row, paginated and anonymized, as a lazy
    # enumerator so pages are fetched on demand.
    def table_page_results(table, whitelist_columns, pseudonymity_columns)
      filter = Filter.new(table, whitelist_columns, pseudonymity_columns)
      pager = Pager.new(table, whitelist_columns)

      Enumerator.new do |yielder|
        pager.pages do |page|
          filter.anonymize(page).each do |result|
            yielder << result
          end
        end
      end.lazy
    end

    # Records the SQL type of each whitelisted column. Pseudonymized
    # columns are reported as "character varying" because the HMAC digest
    # replaces the original typed value with text.
    def table_to_schema(table)
      table_config = @config.dig(:tables, table)

      type_results = ActiveRecord::Base.connection.columns(table)
      type_results = type_results.select do |c|
        table_config[:whitelist].include?(c.name)
      end
      type_results = type_results.map do |c|
        data_type = c.sql_type
        if table_config[:pseudo].include?(c.name)
          data_type = "character varying"
        end
        { name: c.name, data_type: data_type }
      end

      set_schema_column_types(table, type_results)
    end

    def set_schema_column_types(table, type_results)
      has_id = type_results.any? {|c| c[:name] == "id" }

      type_results.each do |type_result|
        @schema[table.to_s][type_result[:name]] = type_result[:data_type]
      end

      if has_id
        # if there is an ID, it is the mapping_key
        @schema[table.to_s]["gl_mapping_key"] = "id"
      end
    end

    # Streams `contents` (an enumerator of row hashes) into a gzipped CSV.
    # Returns the file path, or nil for an empty table (peek raises
    # StopIteration before any file content is written).
    def write_to_csv_file(table, contents)
      file_path = output_filename(table)
      headers = contents.peek.keys

      Rails.logger.info "#{self.class.name} writing #{table} to #{file_path}."
      Zlib::GzipWriter.open(file_path) do |io|
        csv = CSV.new(io, headers: headers, write_headers: true)
        contents.each { |row| csv << row.values }
      end

      file_path
    rescue StopIteration
      Rails.logger.info "#{self.class.name} table #{table} is empty."
      nil
    end
  end
end
require 'openssl'
require 'digest'

module Pseudonymizer
  # Replaces the configured fields of each result row with an HMAC-SHA256
  # digest keyed by the Rails secret_key_base, so values can be joined on
  # but not read.
  class Filter
    def initialize(table, whitelisted_fields, pseudonymized_fields)
      @table = table
      @pseudo_fields = pseudo_fields(whitelisted_fields, pseudonymized_fields)
    end

    # Returns an Enumerator over `results` in which every pseudonymized
    # field has been overwritten with its digest; nil values are left as-is.
    def anonymize(results)
      key = Rails.application.secrets[:secret_key_base]
      digest = OpenSSL::Digest.new('sha256')

      Enumerator.new do |out|
        results.each do |row|
          @pseudo_fields.each do |field|
            value = row[field]
            row[field] = OpenSSL::HMAC.hexdigest(digest, key, String(value)) unless value.nil?
          end

          out << row
        end
      end
    end

    private

    # Only whitelisted fields may be pseudonymized; any extraneous entry
    # in the pseudo list is logged and ignored.
    def pseudo_fields(whitelisted, pseudonymized)
      (pseudonymized - whitelisted).each do |field|
        Rails.logger.warn("#{self.class.name} extraneous pseudo: #{@table}.#{field} is not whitelisted and will be ignored.")
      end

      pseudonymized & whitelisted
    end
  end
end
module Pseudonymizer
  # Value object for one pseudonymizer run: the parsed manifest config,
  # the UTC start time, and the batch-scoped local output directory.
  class Options
    attr_reader :config, :start_at, :output_dir

    def initialize(config: {}, output_dir: nil)
      @config = config
      @start_at = Time.now.utc

      root = output_dir || File.join(Dir.tmpdir, 'gitlab-pseudonymizer')
      @output_dir = File.join(root, batch_dir)
    end

    # Remote path prefix for this batch; mirrors the local batch directory.
    def upload_dir
      batch_dir
    end

    private

    # One directory per run, named after the run's UTC start time.
    def batch_dir
      start_at.iso8601
    end
  end
end
module Pseudonymizer
  # Streams table rows page by page so the dumper never has to hold a
  # whole table in memory.
  class Pager
    # Rows fetched per query. ENV values are Strings, so coerce with
    # Integer(): without it, setting PSEUDONYMIZER_BATCH made PAGE_SIZE a
    # String, and `results.count < PAGE_SIZE` raised ArgumentError
    # (comparison of Integer with String); it also interpolated an
    # unvalidated string into the SQL LIMIT clause.
    PAGE_SIZE = Integer(ENV.fetch('PSEUDONYMIZER_BATCH', 100_000))

    # table   - the table name interpolated into the SQL (from the
    #           trusted manifest, not user input).
    # columns - whitelisted column names to select.
    def initialize(table, columns)
      @table = table
      @columns = columns
    end

    # Yields each page of results to the block.
    def pages(&block)
      if @columns.include?("id")
        # optimize the pagination using WHERE id > ?
        pages_per_id(&block)
      else
        # fallback to `LIMIT ? OFFSET ?` when "id" is unavailable
        pages_per_offset(&block)
      end
    end

    # Keyset pagination on the id column: each page resumes after the
    # last id seen, which stays fast on large tables.
    def pages_per_id(&block)
      id_offset = 0

      loop do
        # a page of results
        results = ActiveRecord::Base.connection.exec_query(<<-SQL.squish)
          SELECT #{@columns.join(",")}
          FROM #{@table}
          WHERE id > #{id_offset}
          ORDER BY id
          LIMIT #{PAGE_SIZE}
        SQL
        Rails.logger.debug("#{self.class.name} fetch ids [#{id_offset}..+#{PAGE_SIZE}]")
        break if results.empty?

        id_offset = results.last["id"].to_i
        yield results

        # a short page means the table is exhausted
        break if results.count < PAGE_SIZE
      end
    end

    # OFFSET-based pagination for tables without an id column; ordering
    # by all selected columns keeps page boundaries deterministic.
    def pages_per_offset(&block)
      offset = 0

      loop do
        # a page of results
        results = ActiveRecord::Base.connection.exec_query(<<-SQL.squish)
          SELECT #{@columns.join(",")}
          FROM #{@table}
          ORDER BY #{@columns.join(",")}
          LIMIT #{PAGE_SIZE} OFFSET #{offset}
        SQL
        Rails.logger.debug("#{self.class.name} fetching offset [#{offset}..#{offset + PAGE_SIZE}]")
        break if results.empty?

        offset += PAGE_SIZE
        yield results

        # a short page means the table is exhausted
        break if results.count < PAGE_SIZE
      end
    end
  end
end
module Pseudonymizer
ObjectStorageUnavailableError = Class.new(StandardError)
class Uploader
include Gitlab::Utils::StrongMemoize
RemoteStorageUnavailableError = Class.new(StandardError)
# Our settings use string keys, but Fog expects symbols
def self.object_store_credentials
Gitlab.config.pseudonymizer.upload.connection.to_hash.deep_symbolize_keys
end
def self.remote_directory
Gitlab.config.pseudonymizer.upload.remote_directory
end
def initialize(options, progress_output: nil)
@progress_output = progress_output || $stdout
@config = options.config
@output_dir = options.output_dir
@upload_dir = options.upload_dir
@remote_dir = self.class.remote_directory
@connection_params = self.class.object_store_credentials
end
def available?
!connect_to_remote_directory.nil?
rescue ObjectStorageUnavailableError
false
end
def upload
progress_output.puts "Uploading output files to remote storage #{remote_directory}:"
file_list.each do |file|
upload_file(file, remote_directory)
end
rescue ObjectStorageUnavailableError
abort "Cannot upload files, make sure the `pseudonimizer.upload.connection` is set properly"
end
def cleanup
return unless File.exist?(@output_dir)
progress_output.print "Deleting tmp directory #{@output_dir} ... "
FileUtils.rm_rf(@output_dir)
progress_output.puts "done"
rescue
progress_output.puts "failed"
end
private
attr_reader :progress_output
def upload_file(file, directory)
progress_output.print "\t#{file} ... "
if directory.files.create(key: File.join(@upload_dir, File.basename(file)),
body: File.open(file),
public: false)
progress_output.puts "done"
else
progress_output.puts "failed"
end
end
def remote_directory
strong_memoize(:remote_directory) { connect_to_remote_directory }
end
def connect_to_remote_directory
if @connection_params.blank?
raise ObjectStorageUnavailableError
end
connection = ::Fog::Storage.new(@connection_params)
# We only attempt to create the directory for local backups. For AWS
# and other cloud providers, we cannot guarantee the user will have
# permission to create the bucket.
if connection.service == ::Fog::Storage::Local
connection.directories.create(key: @remote_dir)
else
connection.directories.get(@remote_dir)
end
end
# Every file the dumper produced in the output directory.
def file_list
  Dir.glob(File.join(@output_dir, "*"))
end
end
end
namespace :gitlab do
  namespace :db do
    desc 'Output pseudonymity dump of selected tables'
    task pseudonymizer: :environment do
      # The pseudonymizer is an EE feature gated by license AND by an
      # admin application setting; both checks abort with a clear message.
      unless License.feature_available?(:pseudonymizer)
        abort "The pseudonymizer is not available with this license."
      end

      unless Gitlab::CurrentSettings.pseudonymizer_enabled?
        abort "The pseudonymizer is disabled."
      end

      options = Pseudonymizer::Options.new(
        config: YAML.load_file(Gitlab.config.pseudonymizer.manifest),
        output_dir: ENV['PSEUDONYMIZER_OUTPUT_DIR']
      )

      dumper = Pseudonymizer::Dumper.new(options)
      uploader = Pseudonymizer::Uploader.new(options)

      # Fail fast before dumping anything if the object store is unreachable.
      unless uploader.available?
        abort "There is an error in the pseudonymizer object store configuration."
      end

      # The temporary CSV output is removed even when the dump or the
      # upload raises.
      begin
        dumper.tables_to_csv
        uploader.upload
      ensure
        uploader.cleanup
      end
    end
  end
end
require 'spec_helper'
describe Pseudonymizer::Dumper do
  let!(:project) { create(:project) }
  let(:base_dir) { Dir.mktmpdir }
  let(:options) do
    Pseudonymizer::Options.new(
      config: YAML.load_file(Gitlab.config.pseudonymizer.manifest)
    )
  end

  subject(:pseudo) { described_class.new(options) }

  before do
    allow(options).to receive(:output_dir).and_return(base_dir)
  end

  after do
    FileUtils.rm_rf(base_dir)
  end

  describe 'Pseudo tables' do
    it 'outputs project tables to csv' do
      whitelisted_columns = %w(id name path description)
      pseudo.config[:tables] = {
        projects: { whitelist: whitelisted_columns, pseudo: %w(id) }
      }

      expect(pseudo.output_dir).to eq(base_dir)

      # Only one table is configured, so exactly one file is produced.
      csv_path = pseudo.tables_to_csv[0]
      expect(csv_path).to end_with("projects.csv.gz")

      headers = []
      first_row = []
      Zlib::GzipReader.open(csv_path) do |gz|
        csv = CSV.new(gz, headers: true)
        first_row = csv.gets
        headers = csv.headers
      end

      # the CSV carries every whitelisted column
      expect(headers).to include(*whitelisted_columns)

      # the id is pseudonymized: a SHA-256 digest is 64 hex characters
      expect(first_row["id"].length).to eq(64)
    end

    it "warns when pseudonymized fields are extraneous" do
      whitelisted_columns = %w(id name path description)
      pseudo.config[:tables] = {
        projects: { whitelist: whitelisted_columns, pseudo: %w(id extraneous) }
      }

      expect(Rails.logger).to receive(:warn).with(/extraneous/)

      pseudo.tables_to_csv
    end
  end

  describe "manifest is valid" do
    it "all tables exist" do
      existing_tables = ActiveRecord::Base.connection.tables
      manifest_tables = options.config['tables'].keys

      expect(existing_tables).to include(*manifest_tables)
    end

    it "all whitelisted attributes exist" do
      options.config['tables'].each do |table, table_def|
        whitelisted = table_def['whitelist']
        existing_columns = ActiveRecord::Base.connection.columns(table.to_sym).map(&:name)
        diff = whitelisted - existing_columns

        expect(diff).to be_empty, "#{table} should define columns #{whitelisted.inspect}: missing #{diff.inspect}"
      end
    end

    it "all pseudonymized attributes are whitelisted" do
      options.config['tables'].each do |table, table_def|
        whitelisted = table_def['whitelist']
        pseudonymized = table_def['pseudo']
        diff = pseudonymized - whitelisted

        expect(diff).to be_empty, "#{table} should whitelist columns #{pseudonymized.inspect}: missing #{diff.inspect}"
      end
    end
  end
end
require 'spec_helper'
describe Pseudonymizer::Pager do
  let(:page_size) { 1 }
  let!(:projects) { create_list(:project, 10) }

  subject { described_class.new("projects", whitelisted_columns) }

  before do
    stub_const("Pseudonymizer::Pager::PAGE_SIZE", page_size)
  end

  shared_examples "yield results in page" do
    it do
      pages_seen = 0
      rows_seen = 0

      subject.pages do |page|
        rows_seen += page.count
        pages_seen += 1
      end

      # every project is yielded exactly once, PAGE_SIZE rows per page
      expect(rows_seen).to eq(projects.count)
      expect(pages_seen).to eq(projects.count / page_size)
    end
  end

  context "`id` column is present" do
    let(:whitelisted_columns) { %w(id name) }

    describe "#pages" do
      it "delegates to #pages_per_id" do
        expect(subject).to receive(:pages_per_id)

        subject.pages { |page| nil }
      end

      include_examples "yield results in page"
    end
  end

  context "`id` column is missing" do
    let(:whitelisted_columns) { %w(name) }

    describe "#pages" do
      it "delegates to #pages_per_offset" do
        expect(subject).to receive(:pages_per_offset)

        subject.pages { |page| nil }
      end

      include_examples "yield results in page"
    end
  end
end
require 'spec_helper'
describe Pseudonymizer::Uploader do
  let(:base_dir) { Dir.mktmpdir }
  let(:options) do
    Pseudonymizer::Options.new(
      config: YAML.load_file(Gitlab.config.pseudonymizer.manifest)
    )
  end
  let(:remote_directory) { subject.send(:remote_directory) }

  subject { described_class.new(options) }

  # Creates an empty file in the fake dumper output directory.
  def touch_output_file(file_name)
    FileUtils.touch(File.join(base_dir, file_name))
  end

  before do
    allow(options).to receive(:output_dir).and_return(base_dir)
    stub_object_storage_pseudonymizer

    # 10 table dumps plus the schema and the file manifest: 12 files total
    10.times { |i| touch_output_file("file_#{i}.test") }
    touch_output_file("schema.yml")
    touch_output_file("file_list.json")
  end

  after do
    FileUtils.rm_rf(base_dir)
  end

  describe "#upload" do
    it "upload all file in the directory" do
      subject.upload

      expect(remote_directory.files.all.count).to eq(12)
    end
  end

  describe "#cleanup" do
    it "cleans the directory" do
      subject.cleanup

      expect(Dir[File.join(base_dir, "*")].length).to eq(0)
    end
  end
end
......@@ -15,9 +15,14 @@ module StubObjectStorage
return unless enabled
stub_object_storage(connection_params: uploader.object_store_credentials,
remote_directory: remote_directory)
end
def stub_object_storage(connection_params:, remote_directory:)
Fog.mock!
::Fog::Storage.new(uploader.object_store_credentials).tap do |connection|
::Fog::Storage.new(connection_params).tap do |connection|
begin
connection.directories.create(key: remote_directory)
rescue Excon::Error::Conflict
......@@ -57,4 +62,9 @@ module StubObjectStorage
</InitiateMultipartUploadResult>
EOS
end
# Stubs Fog object storage using the Pseudonymizer uploader's settings.
def stub_object_storage_pseudonymizer
  stub_object_storage(remote_directory: Pseudonymizer::Uploader.remote_directory,
                      connection_params: Pseudonymizer::Uploader.object_store_credentials)
end
end
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment