Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Support
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
G
gitlab-ce
Project overview
Project overview
Details
Activity
Releases
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Issues
0
Issues
0
List
Boards
Labels
Milestones
Merge Requests
1
Merge Requests
1
Analytics
Analytics
Repository
Value Stream
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Create a new issue
Commits
Issue Boards
Open sidebar
nexedi
gitlab-ce
Commits
9c010a97
Commit
9c010a97
authored
Jun 18, 2018
by
Micaël Bergeron
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
apply feedback
parent
a4b43b89
Changes
6
Show whitespace changes
Inline
Side-by-side
Showing
6 changed files
with
76 additions
and
80 deletions
+76
-80
app/workers/pseudonymizer_worker.rb
app/workers/pseudonymizer_worker.rb
+1
-1
lib/pseudonymizer/dumper.rb
lib/pseudonymizer/dumper.rb
+29
-16
lib/pseudonymizer/manifest.yml
lib/pseudonymizer/manifest.yml
+0
-45
lib/pseudonymizer/options.rb
lib/pseudonymizer/options.rb
+4
-4
lib/tasks/gitlab/db.rake
lib/tasks/gitlab/db.rake
+2
-1
spec/lib/pseudonymizer/dumper_spec.rb
spec/lib/pseudonymizer/dumper_spec.rb
+40
-13
No files found.
app/workers/pseudonymizer_worker.rb
View file @
9c010a97
...
@@ -7,7 +7,7 @@ class PseudonymizerWorker
...
@@ -7,7 +7,7 @@ class PseudonymizerWorker
options
=
Pseudonymizer
::
Options
.
new
(
options
=
Pseudonymizer
::
Options
.
new
(
config:
YAML
.
load_file
(
Rails
.
root
.
join
(
Gitlab
.
config
.
pseudonymizer
.
manifest
)),
config:
YAML
.
load_file
(
Rails
.
root
.
join
(
Gitlab
.
config
.
pseudonymizer
.
manifest
)),
start_at:
Time
.
now
.
utc
output_dir:
ENV
[
'PSEUDONYMIZER_OUTPUT_DIR'
]
)
)
dumper
=
Pseudonymizer
::
Dumper
.
new
(
options
)
dumper
=
Pseudonymizer
::
Dumper
.
new
(
options
)
...
...
lib/pseudonymizer/dumper.rb
View file @
9c010a97
...
@@ -4,7 +4,7 @@ require 'csv'
...
@@ -4,7 +4,7 @@ require 'csv'
require
'yaml'
require
'yaml'
module
Pseudonymizer
module
Pseudonymizer
PAGE_SIZE
=
10000
PAGE_SIZE
=
ENV
.
fetch
(
'PSEUDONYMIZER_BATCH'
,
100_000
)
class
Anon
class
Anon
def
initialize
(
fields
)
def
initialize
(
fields
)
...
@@ -38,45 +38,57 @@ module Pseudonymizer
...
@@ -38,45 +38,57 @@ module Pseudonymizer
@output_dir
=
options
.
output_dir
@output_dir
=
options
.
output_dir
@start_at
=
options
.
start_at
@start_at
=
options
.
start_at
reset!
end
def
reset!
@schema
=
Hash
.
new
{
|
h
,
k
|
h
[
k
]
=
{}
}
@schema
=
Hash
.
new
{
|
h
,
k
|
h
[
k
]
=
{}
}
@output_files
=
[]
@output_files
=
[]
end
end
def
tables_to_csv
def
tables_to_csv
tables
=
config
[
"tables"
]
reset!
tables
=
config
[
"tables"
]
FileUtils
.
mkdir_p
(
output_dir
)
unless
File
.
directory?
(
output_dir
)
FileUtils
.
mkdir_p
(
output_dir
)
unless
File
.
directory?
(
output_dir
)
schema_to_yml
schema_to_yml
file_list_to_json
@output_files
=
tables
.
map
do
|
k
,
v
|
tables
.
map
do
|
k
,
v
|
table_to_csv
(
k
,
v
[
'whitelist'
],
v
[
'pseudo'
])
table_to_csv
(
k
,
v
[
'whitelist'
],
v
[
'pseudo'
])
end
end
file_list_to_json
@output_files
end
end
private
private
def
get_and_log_file_name
(
ext
,
prefix
=
nil
,
filename
=
nil
)
def
output_filename
(
basename
=
nil
,
ext
=
"csv.gz"
)
file_timestamp
=
filename
||
"
#{
prefix
}
_
#{
@start_at
.
to_i
}
"
file_timestamp
=
"
#{
basename
}
.
#{
ext
}
"
file_timestamp
=
"
#{
file_timestamp
}
.
#{
ext
}
"
@output_files
<<
file_timestamp
File
.
join
(
output_dir
,
file_timestamp
)
File
.
join
(
output_dir
,
file_timestamp
)
end
end
def
schema_to_yml
def
schema_to_yml
file_path
=
get_and_log_file_name
(
"yml"
,
"schema
"
)
file_path
=
output_filename
(
"schema"
,
"yml
"
)
File
.
open
(
file_path
,
'w'
)
{
|
file
|
file
.
write
(
@schema
.
to_yaml
)
}
File
.
open
(
file_path
,
'w'
)
{
|
file
|
file
.
write
(
@schema
.
to_yaml
)
}
end
end
def
file_list_to_json
def
file_list_to_json
file_path
=
get_and_log_file_name
(
"json"
,
nil
,
"file_list"
)
file_path
=
output_filename
(
"file_list"
,
"json"
)
File
.
open
(
file_path
,
'w'
)
{
|
file
|
file
.
write
(
@output_files
.
to_json
)
}
File
.
open
(
file_path
,
'w'
)
do
|
file
|
relative_files
=
@output_files
.
map
(
&
File
.
method
(
:basename
))
file
.
write
(
relative_files
.
to_json
)
end
end
end
def
table_to_csv
(
table
,
whitelist_columns
,
pseudonymity_columns
)
def
table_to_csv
(
table
,
whitelist_columns
,
pseudonymity_columns
)
table_to_schema
(
table
)
table_to_schema
(
table
)
write_to_csv_file
(
table
,
table_page_results
(
table
,
whitelist_columns
,
pseudonymity_columns
))
write_to_csv_file
(
table
,
table_page_results
(
table
,
whitelist_columns
,
pseudonymity_columns
)
)
rescue
=>
e
rescue
=>
e
Rails
.
logger
.
error
(
"Failed to export
#{
table
}
:
#{
e
}
"
)
Rails
.
logger
.
error
(
"Failed to export
#{
table
}
:
#{
e
}
"
)
end
end
...
@@ -134,15 +146,16 @@ module Pseudonymizer
...
@@ -134,15 +146,16 @@ module Pseudonymizer
end
end
def
write_to_csv_file
(
table
,
contents
)
def
write_to_csv_file
(
table
,
contents
)
file_path
=
get_and_log_file_name
(
"csv"
,
table
)
file_path
=
output_filename
(
table
,
"csv.gz"
)
Rails
.
logger
.
info
"
#{
self
.
class
.
name
}
writing
#{
table
}
to
#{
file_path
}
."
Rails
.
logger
.
info
"
#{
self
.
class
.
name
}
writing
#{
table
}
to
#{
file_path
}
."
CSV
.
open
(
file_path
,
'w'
)
do
|
csv
|
Zlib
::
GzipWriter
.
open
(
file_path
)
do
|
io
|
csv
=
CSV
.
new
(
io
)
contents
.
with_index
do
|
row
,
i
|
contents
.
with_index
do
|
row
,
i
|
csv
<<
row
.
keys
if
i
==
0
# header
csv
<<
row
.
keys
if
i
==
0
# header
csv
<<
row
.
values
csv
<<
row
.
values
csv
.
flush
if
i
%
PAGE_SIZE
end
end
csv
.
close
end
end
file_path
file_path
...
...
lib/pseudonymizer/manifest.yml
View file @
9c010a97
...
@@ -96,15 +96,8 @@ tables:
...
@@ -96,15 +96,8 @@ tables:
-
author_id
-
author_id
-
assignee_id
-
assignee_id
-
iid
-
iid
-
cached_markdown_version
-
updated_by_id
-
updated_by_id
-
last_edited_by_id
-
last_edited_by_id
-
lock_version
-
start_date
-
end_date
-
last_edited_at
-
created_at
-
updated_at
issue_assignees
:
issue_assignees
:
whitelist
:
whitelist
:
-
user_id
-
user_id
...
@@ -208,8 +201,6 @@ tables:
...
@@ -208,8 +201,6 @@ tables:
-
title
-
title
-
color
-
color
-
project_id
-
project_id
-
created_at
-
updated_at
-
template
-
template
-
type
-
type
-
group_id
-
group_id
...
@@ -423,12 +414,10 @@ tables:
...
@@ -423,12 +414,10 @@ tables:
-
created_at
-
created_at
-
updated_at
-
updated_at
-
project_id
-
project_id
-
attachment
-
line_code
-
line_code
-
commit_id
-
commit_id
-
noteable_id
-
noteable_id
-
system
-
system
-
st_diff
-
updated_by_id
-
updated_by_id
-
type
-
type
-
position
-
position
...
@@ -436,35 +425,18 @@ tables:
...
@@ -436,35 +425,18 @@ tables:
-
resolved_at
-
resolved_at
-
resolved_by_id
-
resolved_by_id
-
discussion_id
-
discussion_id
-
note_html
-
cached_markdown_version
-
change_position
-
change_position
-
resolved_by_push
-
resolved_by_push
pseudo
:
pseudo
:
-
id
-
id
-
note
-
note
-
noteable_type
-
author_id
-
author_id
-
created_at
-
updated_at
-
project_id
-
project_id
-
attachment
-
line_code
-
commit_id
-
commit_id
-
noteable_id
-
noteable_id
-
system
-
st_diff
-
updated_by_id
-
updated_by_id
-
type
-
position
-
original_position
-
resolved_at
-
resolved_by_id
-
resolved_by_id
-
discussion_id
-
discussion_id
-
note_html
-
cached_markdown_version
-
change_position
-
resolved_by_push
notification_settings
:
notification_settings
:
whitelist
:
whitelist
:
-
id
-
id
...
@@ -492,8 +464,6 @@ tables:
...
@@ -492,8 +464,6 @@ tables:
-
source_id
-
source_id
-
source_type
-
source_type
-
level
-
level
-
created_at
-
updated_at
-
new_note
-
new_note
-
new_issue
-
new_issue
-
reopen_issue
-
reopen_issue
...
@@ -526,8 +496,6 @@ tables:
...
@@ -526,8 +496,6 @@ tables:
pseudo
:
pseudo
:
-
id
-
id
-
project_id
-
project_id
-
created_at
-
updated_at
-
enabled
-
enabled
-
domain
-
domain
project_custom_attributes
:
project_custom_attributes
:
...
@@ -540,8 +508,6 @@ tables:
...
@@ -540,8 +508,6 @@ tables:
-
value
-
value
pseudo
:
pseudo
:
-
id
-
id
-
created_at
-
updated_at
-
project_id
-
project_id
-
key
-
key
-
value
-
value
...
@@ -565,8 +531,6 @@ tables:
...
@@ -565,8 +531,6 @@ tables:
-
wiki_access_level
-
wiki_access_level
-
snippets_access_level
-
snippets_access_level
-
builds_access_level
-
builds_access_level
-
created_at
-
updated_at
-
repository_access_level
-
repository_access_level
project_group_links
:
project_group_links
:
whitelist
:
whitelist
:
...
@@ -581,8 +545,6 @@ tables:
...
@@ -581,8 +545,6 @@ tables:
-
id
-
id
-
project_id
-
project_id
-
group_id
-
group_id
-
created_at
-
updated_at
-
group_access
-
group_access
-
expires_at
-
expires_at
project_import_data
:
project_import_data
:
...
@@ -615,8 +577,6 @@ tables:
...
@@ -615,8 +577,6 @@ tables:
-
last_update_started_at
-
last_update_started_at
-
last_update_scheduled_at
-
last_update_scheduled_at
-
next_execution_timestamp
-
next_execution_timestamp
-
created_at
-
updated_at
project_repository_states
:
project_repository_states
:
whitelist
:
whitelist
:
-
id
-
id
...
@@ -730,8 +690,6 @@ tables:
...
@@ -730,8 +690,6 @@ tables:
-
name
-
name
-
path
-
path
-
description
-
description
-
created_at
-
updated_at
-
creator_id
-
creator_id
-
namespace_id
-
namespace_id
-
last_activity_at
-
last_activity_at
...
@@ -875,7 +833,6 @@ tables:
...
@@ -875,7 +833,6 @@ tables:
pseudo
:
pseudo
:
-
id
-
id
-
email
-
email
-
remember_created_at
-
current_sign_in_ip
-
current_sign_in_ip
-
last_sign_in_ip
-
last_sign_in_ip
-
name
-
name
...
@@ -897,12 +854,10 @@ tables:
...
@@ -897,12 +854,10 @@ tables:
-
hide_project_limit
-
hide_project_limit
-
note
-
note
-
unlock_token
-
unlock_token
-
otp_grace_period_started_at
-
external
-
external
-
incoming_email_token
-
incoming_email_token
-
organization
-
organization
-
auditor
-
auditor
-
two_factor_grace_period
-
two_factor_grace_period
-
ghost
-
rss_token
-
rss_token
-
theme_id
-
theme_id
lib/pseudonymizer/options.rb
View file @
9c010a97
...
@@ -2,14 +2,14 @@ module Pseudonymizer
...
@@ -2,14 +2,14 @@ module Pseudonymizer
class
Options
class
Options
attr_reader
:config
attr_reader
:config
attr_reader
:start_at
attr_reader
:start_at
attr_reader
:output_dir
def
initialize
(
config:
{})
def
initialize
(
config:
{}
,
output_dir:
nil
)
@config
=
config
@config
=
config
@start_at
=
Time
.
now
.
utc
@start_at
=
Time
.
now
.
utc
end
def
output_dir
base_dir
=
output_dir
||
File
.
join
(
Dir
.
tmpdir
,
'gitlab-pseudonymizer'
)
File
.
join
(
Dir
.
tmpdir
,
'gitlab-pseudonymizer'
,
start_at
.
iso8601
)
@output_dir
=
File
.
join
(
base_dir
,
start_at
.
iso8601
)
end
end
def
upload_dir
def
upload_dir
...
...
lib/tasks/gitlab/db.rake
View file @
9c010a97
...
@@ -76,7 +76,8 @@ namespace :gitlab do
...
@@ -76,7 +76,8 @@ namespace :gitlab do
abort
"The pseudonymizer is disabled."
unless
Gitlab
::
CurrentSettings
.
pseudonymizer_enabled?
abort
"The pseudonymizer is disabled."
unless
Gitlab
::
CurrentSettings
.
pseudonymizer_enabled?
options
=
Pseudonymizer
::
Options
.
new
(
options
=
Pseudonymizer
::
Options
.
new
(
config:
YAML
.
load_file
(
Rails
.
root
.
join
(
Gitlab
.
config
.
pseudonymizer
.
manifest
))
config:
YAML
.
load_file
(
Rails
.
root
.
join
(
Gitlab
.
config
.
pseudonymizer
.
manifest
)),
output_dir:
ENV
[
'PSEUDONYMIZER_OUTPUT_DIR'
]
)
)
dumper
=
Pseudonymizer
::
Dumper
.
new
(
options
)
dumper
=
Pseudonymizer
::
Dumper
.
new
(
options
)
...
...
spec/lib/pseudonymizer/dumper_spec.rb
View file @
9c010a97
...
@@ -20,9 +20,10 @@ describe Pseudonymizer::Dumper do
...
@@ -20,9 +20,10 @@ describe Pseudonymizer::Dumper do
describe
'Pseudo tables'
do
describe
'Pseudo tables'
do
it
'outputs project tables to csv'
do
it
'outputs project tables to csv'
do
column_names
=
%w(id name path description)
pseudo
.
config
[
"tables"
]
=
{
pseudo
.
config
[
"tables"
]
=
{
"projects"
=>
{
"projects"
=>
{
"whitelist"
=>
%w(id name path description)
,
"whitelist"
=>
column_names
,
"pseudo"
=>
%w(id)
"pseudo"
=>
%w(id)
}
}
}
}
...
@@ -31,26 +32,52 @@ describe Pseudonymizer::Dumper do
...
@@ -31,26 +32,52 @@ describe Pseudonymizer::Dumper do
# grab the first table it outputs. There would only be 1.
# grab the first table it outputs. There would only be 1.
project_table_file
=
pseudo
.
tables_to_csv
[
0
]
project_table_file
=
pseudo
.
tables_to_csv
[
0
]
expect
(
project_table_file
).
to
include
(
"projects.csv.gz"
)
expect
(
project_table_file
.
include?
"projects_"
).
to
be
true
expect
(
project_table_file
.
include?
".csv"
).
to
be
true
columns
=
[]
columns
=
[]
project_data
=
[]
project_data
=
[]
File
.
foreach
(
project_table_file
).
with_index
do
|
line
,
line_num
|
Zlib
::
GzipReader
.
open
(
project_table_file
)
do
|
gz
|
if
line_num
==
0
csv
=
CSV
.
new
(
gz
,
headers:
true
)
columns
=
line
.
split
(
","
)
# csv.shift # read the header row
elsif
line_num
==
1
project_data
=
csv
.
gets
project_data
=
line
.
split
(
","
)
columns
=
csv
.
headers
break
end
end
end
# check if CSV columns are correct
# check if CSV columns are correct
expect
(
columns
.
to_set
).
to
eq
(
%W(id name path description
\n
)
.
to_set
)
expect
(
columns
).
to
include
(
*
column_names
)
# is it pseudonymous
# is it pseudonymous
expect
(
project_data
[
0
]).
not_to
eq
(
1
)
# sha 256 is 64 chars in length
# sha 256 is 64 chars in length
expect
(
project_data
[
0
].
length
).
to
eq
(
64
)
expect
(
project_data
[
"id"
].
length
).
to
eq
(
64
)
end
end
describe
"manifest is valid"
do
it
"all tables exist"
do
existing_tables
=
ActiveRecord
::
Base
.
connection
.
tables
tables
=
options
.
config
[
'tables'
].
keys
expect
(
existing_tables
).
to
include
(
*
tables
)
end
it
"all whitelisted attributes exist"
do
options
.
config
[
'tables'
].
each
do
|
table
,
table_def
|
whitelisted
=
table_def
[
'whitelist'
]
existing_columns
=
ActiveRecord
::
Base
.
connection
.
columns
(
table
.
to_sym
).
map
(
&
:name
)
diff
=
whitelisted
-
existing_columns
expect
(
diff
).
to
be_empty
,
"
#{
table
}
should define columns
#{
whitelisted
.
inspect
}
: missing
#{
diff
.
inspect
}
"
end
end
it
"all pseudonymized attributes are whitelisted"
do
options
.
config
[
'tables'
].
each
do
|
table
,
table_def
|
whitelisted
=
table_def
[
'whitelist'
]
pseudonymized
=
table_def
[
'pseudo'
]
diff
=
pseudonymized
-
whitelisted
expect
(
diff
).
to
be_empty
,
"
#{
table
}
should whitelist columns
#{
pseudonymized
.
inspect
}
: missing
#{
diff
.
inspect
}
"
end
end
end
end
end
end
end
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment