Commit 3903395b authored by Dylan Griffith's avatar Dylan Griffith

Add Elasticsearch migration to backfill project permissions on MRs

As part of our efforts to avoid using joins in Elasticsearch queries
https://gitlab.com/groups/gitlab-org/-/epics/2054 we need to
"denormalize" (copy them into child docs) the permission related fields
needed for searching.

In order to allow us to search for merge requests without joining to the
project we need to store the `merge_requests_access_level` as well as
the `visibility_level` of the project on the merge request record. In
https://gitlab.com/gitlab-org/gitlab/-/merge_requests/59731 we started
storing these fields for any newly created or updated Merge Requests.

This MR is to backfill all the existing MRs in the index that don't have
this value set. Technically we are backfilling 3 fields but, since all
these fields were added in that MR, we can just check for the existence
of any of those fields to be sure that a record has them all.

This MR does not yet use these fields as it's only the 2nd step of all
the steps described in https://gitlab.com/groups/gitlab-org/-/epics/5468

Once these fields are backfilled we can start using them and get some
performance gains. Technically we could add that code in this MR to use
these new fields (conditioned upon the migration being finished) but in
the past we've found that we often ran into issues that were only with
one part of the code that led to us needing to revert and then do the
whole process again and code review again and this is less efficient.

This MR is actually pretty much an identical copy of the same thing we
already did for issues a while back in
https://gitlab.com/gitlab-org/gitlab/-/merge_requests/47819 but all I
did was find and replace `issues` with `merge_requests` since that's the
only difference.

Changelog: changed
parent 805cb39b
---
title: Add Elasticsearch migration to backfill project permissions on MRs
merge_request: 59836
author:
type: changed
# frozen_string_literal: true
class AddNewDataToMergeRequestsDocuments < Elastic::Migration
batched!
throttle_delay 5.minutes
QUERY_BATCH_SIZE = 5000
UPDATE_BATCH_SIZE = 100
def migrate
if completed?
log "Skipping adding visibility_level field to merge_requests documents migration since it is already applied"
return
end
log "Adding visibility_level field to merge_requests documents for batch of #{QUERY_BATCH_SIZE} documents"
query = {
size: QUERY_BATCH_SIZE,
query: {
bool: {
filter: merge_requests_missing_visibility_level_filter
}
}
}
results = client.search(index: helper.target_index_name, body: query)
hits = results.dig('hits', 'hits') || []
document_references = hits.map do |hit|
id = hit.dig('_source', 'id')
es_id = hit.dig('_id')
es_parent = hit.dig('_source', 'join_field', 'parent')
# ensure that any merge_requests missing from the database will be removed from Elasticsearch
# as the data is back-filled
Gitlab::Elastic::DocumentReference.new(MergeRequest, id, es_id, es_parent)
end
document_references.each_slice(UPDATE_BATCH_SIZE) do |refs|
Elastic::ProcessBookkeepingService.track!(*refs)
end
log "Adding visibility_level field to merge_requests documents is completed for batch of #{document_references.size} documents"
end
def completed?
log "completed check: Refreshing #{helper.target_index_name}"
helper.refresh_index(index_name: helper.target_index_name)
query = {
size: 0,
aggs: {
merge_requests: {
filter: merge_requests_missing_visibility_level_filter
}
}
}
results = client.search(index: helper.target_index_name, body: query)
doc_count = results.dig('aggregations', 'merge_requests', 'doc_count')
log "Migration has #{doc_count} documents remaining" if doc_count
doc_count && doc_count == 0
end
private
def merge_requests_missing_visibility_level_filter
{
bool: {
must_not: field_exists('visibility_level'),
filter: merge_request_type_filter
}
}
end
def merge_request_type_filter
{
term: {
type: {
value: 'merge_request'
}
}
}
end
def field_exists(field)
{
exists: {
field: field
}
}
end
end
# frozen_string_literal: true
require 'spec_helper'
require File.expand_path('ee/elastic/migrate/20210421140400_add_new_data_to_merge_requests_documents.rb')
RSpec.describe AddNewDataToMergeRequestsDocuments, :elastic, :sidekiq_inline do
let(:version) { 20210421140400 }
let(:migration) { described_class.new(version) }
let(:merge_requests) { create_list(:merge_request, 3) }
before do
stub_ee_application_setting(elasticsearch_search: true, elasticsearch_indexing: true)
# ensure merge_requests are indexed
merge_requests
ensure_elasticsearch_index!
end
describe 'migration_options' do
it 'has migration options set', :aggregate_failures do
expect(migration.batched?).to be_truthy
expect(migration.throttle_delay).to eq(5.minutes)
end
end
describe '.migrate' do
subject { migration.migrate }
context 'when migration is already completed' do
before do
add_visibility_level_for_merge_requests(merge_requests)
end
it 'does not modify data', :aggregate_failures do
expect(::Elastic::ProcessBookkeepingService).not_to receive(:track!)
subject
end
end
context 'migration process' do
before do
remove_visibility_level_for_merge_requests(merge_requests)
end
it 'updates all merge_request documents' do
# track calls are batched in groups of 100
expect(::Elastic::ProcessBookkeepingService).to receive(:track!).once do |*tracked_refs|
expect(tracked_refs.count).to eq(3)
end
subject
end
it 'only updates merge_request documents missing visibility_level', :aggregate_failures do
merge_request = merge_requests.first
add_visibility_level_for_merge_requests(merge_requests[1..-1])
expected = [Gitlab::Elastic::DocumentReference.new(MergeRequest, merge_request.id, merge_request.es_id, merge_request.es_parent)]
expect(::Elastic::ProcessBookkeepingService).to receive(:track!).with(*expected).once
subject
end
it 'processes in batches', :aggregate_failures do
stub_const("#{described_class}::QUERY_BATCH_SIZE", 2)
stub_const("#{described_class}::UPDATE_BATCH_SIZE", 1)
expect(::Elastic::ProcessBookkeepingService).to receive(:track!).exactly(3).times.and_call_original
# cannot use subject in spec because it is memoized
migration.migrate
ensure_elasticsearch_index!
migration.migrate
end
end
end
describe '.completed?' do
subject { migration.completed? }
context 'when documents are missing visibility_level' do
before do
remove_visibility_level_for_merge_requests(merge_requests)
end
it { is_expected.to be_falsey }
end
context 'when no documents are missing visibility_level' do
before do
add_visibility_level_for_merge_requests(merge_requests)
end
it { is_expected.to be_truthy }
end
end
private
def add_visibility_level_for_merge_requests(merge_requests)
script = {
source: "ctx._source['visibility_level'] = params.visibility_level;",
lang: "painless",
params: {
visibility_level: Gitlab::VisibilityLevel::PRIVATE
}
}
update_by_query(merge_requests, script)
end
def remove_visibility_level_for_merge_requests(merge_requests)
script = {
source: "ctx._source.remove('visibility_level')"
}
update_by_query(merge_requests, script)
end
def update_by_query(merge_requests, script)
merge_request_ids = merge_requests.map { |i| i.id }
client = MergeRequest.__elasticsearch__.client
client.update_by_query({
index: MergeRequest.__elasticsearch__.index_name,
wait_for_completion: true, # run synchronously
refresh: true, # make operation visible to search
body: {
script: script,
query: {
bool: {
must: [
{
terms: {
id: merge_request_ids
}
},
{
term: {
type: {
value: 'merge_request'
}
}
}
]
}
}
}
})
end
end
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment