Commit 4523cbbb authored by Jan Provaznik

Merge branch '205178-change-repository-indexing-to-sorted-sets-algorithm' into 'master'

Enable de-duplication of the ElasticCommitIndexerWorker jobs

See merge request gitlab-org/gitlab!31500
parents 80963589 5602d2f6
......@@ -14,8 +14,8 @@ module Elastic
end
end
# Enqueues an ElasticCommitIndexerWorker job for this project's repository.
#
# NOTE(review): this span is a diff hunk shown without +/- markers. The
# signature taking from_rev:/to_rev: looks like the removed version and the
# argument-less one like the added version — confirm against the merge request.
def index_commits_and_blobs(from_rev: nil, to_rev: nil)
::ElasticCommitIndexerWorker.perform_async(project.id, from_rev, to_rev)
def index_commits_and_blobs
# Only the project id is passed to the worker in this variant.
::ElasticCommitIndexerWorker.perform_async(project.id)
end
end
end
......@@ -8,8 +8,8 @@ module Elastic
delegate(:delete_index_for_commits_and_blobs, :elastic_search, to: :__elasticsearch__)
# Enqueues an ElasticCommitIndexerWorker job with the wiki flag (fourth
# positional argument) set to true.
#
# NOTE(review): this span is a diff hunk shown without +/- markers. The
# signature taking to_sha looks like the removed version and the argument-less
# one like the added version — confirm against the merge request.
def index_wiki_blobs(to_sha = nil)
ElasticCommitIndexerWorker.perform_async(project.id, nil, to_sha, true)
def index_wiki_blobs
# The two revision slots are sent as nil; only the project id and the wiki flag carry data.
ElasticCommitIndexerWorker.perform_async(project.id, nil, nil, true)
end
end
end
......@@ -61,8 +61,7 @@ module EE
# Hook for the started -> finished transition: when the project uses
# Elasticsearch, a commit indexing job is enqueued from inside
# `run_after_commit`.
after_transition started: :finished do |state, _|
if state.project.use_elasticsearch?
state.run_after_commit do
# NOTE(review): the next two lines look like the removed side of the diff
# (last indexed commit forwarded to the worker); the single-argument call
# below them looks like the added side — confirm against the merge request.
last_indexed_commit = state.project.index_status&.last_commit
ElasticCommitIndexerWorker.perform_async(state.project_id, last_indexed_commit)
ElasticCommitIndexerWorker.perform_async(state.project_id)
end
end
end
......
......@@ -18,7 +18,7 @@ module EE
# Requests indexing of the project's commits and blobs, unless
# `should_index_commits?` is falsy.
def enqueue_elasticsearch_indexing
return unless should_index_commits?
# NOTE(review): diff hunk without +/- markers — the call passing
# from_rev:/to_rev: looks like the removed line, the bare call like the added one.
project.repository.index_commits_and_blobs(from_rev: oldrev, to_rev: newrev)
project.repository.index_commits_and_blobs
end
def enqueue_update_external_pull_requests
......
......@@ -10,11 +10,9 @@ module EE
super
return unless project.use_elasticsearch?
return unless default_branch_changes.any?
# For all changes on the default branch (usually master) trigger an ES update
default_branch_changes.each do |change|
project.wiki.index_wiki_blobs(change[:newrev])
end
project.wiki.index_wiki_blobs
end
end
end
......
......@@ -485,7 +485,7 @@
:urgency: :throttled
:resource_boundary: :unknown
:weight: 1
:idempotent:
:idempotent: true
- :name: elastic_full_index
:feature_category: :global_search
:has_external_dependencies:
......
# frozen_string_literal: true
# Worker that runs Gitlab::Elastic::Indexer for a single project.
#
# NOTE(review): this class is shown as a diff hunk without +/- markers. The
# `class` line carrying the rubocop:disable comment and the `run(newrev)` call
# look like removed lines; the plain `class` line, `idempotent!` and the
# argument-less `run` look like added lines — confirm against the merge request.
class ElasticCommitIndexerWorker # rubocop:disable Scalability/IdempotentWorker
class ElasticCommitIndexerWorker
include ApplicationWorker
feature_category :global_search
sidekiq_options retry: 2
urgency :throttled
# Declares the worker idempotent; the commit title ties this to job de-duplication.
idempotent!
# Performs the commits and blobs indexation
#
# project_id - The ID of the project to index
# oldrev @deprecated - The revision to start indexing at (default: INDEXED_SHA)
# newrev @deprecated - The revision to stop indexing at (default: HEAD)
# wiki - Treat this project as a Wiki
#
# The indexation will cover all commits within INDEXED_SHA..HEAD
#
# Returns true without indexing when Elasticsearch indexing is disabled in
# the current settings or the project does not use Elasticsearch.
def perform(project_id, oldrev = nil, newrev = nil, wiki = false)
return true unless Gitlab::CurrentSettings.elasticsearch_indexing?
project = Project.find(project_id)
return true unless project.use_elasticsearch?
Gitlab::Elastic::Indexer.new(project, wiki: wiki).run(newrev)
# In this variant oldrev/newrev are accepted but not forwarded to the indexer.
Gitlab::Elastic::Indexer.new(project, wiki: wiki).run
end
end
---
title: Make the ElasticCommitIndexer idempotent to enable job de-duplication.
merge_request: 31500
author: mbergeron
type: performance
......@@ -40,9 +40,7 @@ module Elastic
def search_commit(query, page: 1, per: 20, options: {})
page ||= 1
fields = %w(message^10 sha^5 author.name^2 author.email^2 committer.name committer.email).map {|i| "commit.#{i}"}
query_with_prefix = query.split(/\s+/).map { |s| s.gsub(SHA_REGEX) { |sha| "#{sha}*" } }.join(' ')
query_hash = {
......
......@@ -16,7 +16,8 @@ module Gitlab
end
end
attr_reader :project, :index_status
attr_reader :project, :index_status, :wiki
alias_method :index_wiki?, :wiki
def initialize(project, wiki: false)
@project = project
......@@ -26,45 +27,52 @@ module Gitlab
@index_status = project.index_status
end
# NOTE(review): this span is a diff hunk shown without +/- markers, so two
# versions of `run` are interleaved. `run(to_sha = nil)` and the lines that
# use `to_sha`/`head_commit` look like the removed version; `run(ref = 'HEAD')`
# and the lines that use `commit` look like the added version — confirm
# against the merge request before relying on either reading.
def run(to_sha = nil)
to_sha = nil if to_sha == Gitlab::Git::BLANK_SHA
head_commit = repository.try(:commit)
if repository.nil? || !repository.exists? || repository.empty? || head_commit.nil?
update_index_status(Gitlab::Git::BLANK_SHA)
return
end
# Runs the indexation process, which is the following:
# - Purge the index for any unreachable commits;
# - Run the `gitlab-elasticsearch-indexer`;
# - Update the `index_status` for the associated project;
#
# ref - Git ref up to which the indexation will run (default: HEAD)
#
# Returns true after every writing target has been indexed. When no indexable
# commit is found for `ref`, it records BLANK_SHA through `update_index_status`
# and returns that call's result.
def run(ref = 'HEAD')
commit = find_indexable_commit(ref)
return update_index_status(Gitlab::Git::BLANK_SHA) unless commit
# One indexer run per Elasticsearch writing target.
repository.__elasticsearch__.elastic_writing_targets.each do |target|
run_indexer!(to_sha, target)
Sidekiq.logger.debug(message: "Indexation running for #{project.id} #{from_sha}..#{commit.sha}",
project_id: project.id,
wiki: index_wiki?)
run_indexer!(commit.sha, target)
end
update_index_status(to_sha)
# update the index status only if all writes were successful
update_index_status(commit.sha)
true
end
private
# NOTE(review): diff hunk without +/- markers — `wiki?` and its `@wiki` body
# look like removed lines; `find_indexable_commit` looks like the added method.
def wiki?
@wiki
# Looks up the commit that `ref` points to.
#
# Returns false when the repository is empty, otherwise whatever
# `repository.commit(ref)` returns.
def find_indexable_commit(ref)
!repository.empty? && repository.commit(ref)
end
private
# Repository this indexer works on: the project's wiki repository when the
# wiki flag is set, the project's own repository otherwise.
#
# NOTE(review): diff hunk without +/- markers — the `wiki?` line looks like
# the removed version and the `index_wiki?` line like the added version.
def repository
wiki? ? project.wiki.repository : project.repository
index_wiki? ? project.wiki.repository : project.repository
end
def run_indexer!(to_sha, target)
vars = build_envvars(to_sha, target)
if index_status && !repository_contains_last_indexed_commit?
target.delete_index_for_commits_and_blobs(wiki: wiki?)
# This might happen when default branch has been reset or rebased.
base_sha = if purge_unreachable_commits_from_index!(to_sha, target)
Gitlab::Git::EMPTY_TREE_ID
else
from_sha
end
vars = build_envvars(base_sha, to_sha, target)
path_to_indexer = Gitlab.config.elasticsearch.indexer_path
command =
if wiki?
if index_wiki?
[path_to_indexer, "--blob-type=wiki_blob", "--skip-commits", project.id.to_s, repository_path]
else
[path_to_indexer, project.id.to_s, repository_path]
......@@ -75,7 +83,19 @@ module Gitlab
raise Error, output unless status&.zero?
end
def build_envvars(to_sha, target)
# Deletes this project's indexed commits and blobs from +target+ unless the
# last indexed commit is an ancestor of +to_sha+.
#
# to_sha - revision the upcoming indexation stops at
# target - writing target that receives the delete request
#
# Returns false when nothing had to be purged and true once the delete
# request went through. If the delete raises BadRequest, the exception is
# tracked and the value of `track_exception` is returned instead.
# NOTE(review): confirm that last return value is what callers expect.
def purge_unreachable_commits_from_index!(to_sha, target)
  if last_commit_ancestor_of?(to_sha)
    false
  else
    target.delete_index_for_commits_and_blobs(wiki: index_wiki?)
    true
  end
rescue ::Elasticsearch::Transport::Transport::Errors::BadRequest => error
  Gitlab::ErrorTracking.track_exception(error, project_id: project.id)
end
def build_envvars(from_sha, to_sha, target)
# We accept any form of settings, including string and array
# This is why JSON is needed
vars = {
......@@ -96,16 +116,14 @@ module Gitlab
end
# Last indexed commit recorded on the index status: the wiki column when
# indexing a wiki, the repository column otherwise. The safe-navigation
# operator makes this nil when there is no index status.
#
# NOTE(review): diff hunk without +/- markers — the multi-line `if wiki?`
# form looks like the removed body and the one-line ternary like the added one.
def last_commit
if wiki?
index_status&.last_wiki_commit
else
index_status&.last_commit
end
index_wiki? ? index_status&.last_wiki_commit : index_status&.last_commit
end
# Revision the indexation starts from, memoized under :from_sha.
#
# Returns `last_commit` when the repository still contains the last indexed
# commit, otherwise Gitlab::Git::EMPTY_TREE_ID.
def from_sha
  strong_memoize(:from_sha) do
    if repository_contains_last_indexed_commit?
      last_commit
    else
      Gitlab::Git::EMPTY_TREE_ID
    end
  end
end
def repository_contains_last_indexed_commit?
strong_memoize(:repository_contains_last_indexed_commit) do
......@@ -113,6 +131,15 @@ module Gitlab
end
end
# Tells whether the starting revision (`from_sha`) is an ancestor of +to_sha+.
#
# Returns true when `from_sha` is BLANK_SHA, false when the repository no
# longer contains the last indexed commit, and otherwise the result of the
# ancestry check described below.
def last_commit_ancestor_of?(to_sha)
  if from_sha == Gitlab::Git::BLANK_SHA
    true
  elsif repository_contains_last_indexed_commit?
    # `EMPTY_TREE_ID` always counts as an ancestor so that an empty index
    # is never purged.
    from_sha == Gitlab::Git::EMPTY_TREE_ID || repository.ancestor?(from_sha, to_sha)
  else
    false
  end
end
# Returns the repository's `disk_path` with a `.git` suffix appended.
def repository_path
  format('%s.git', repository.disk_path)
end
......@@ -131,7 +158,7 @@ module Gitlab
# rubocop: disable CodeReuse/ActiveRecord
def update_index_status(to_sha)
head_commit = repository.try(:commit)
raise "Invalid sha #{to_sha}" unless to_sha.present?
# An index_status should always be created,
# even if the repository is empty, so we know it's been looked at.
......@@ -142,17 +169,11 @@ module Gitlab
retry
end
# Don't update the index status if we never reached HEAD
return if head_commit && to_sha && head_commit.sha != to_sha
sha = head_commit.try(:sha)
sha ||= Gitlab::Git::BLANK_SHA
attributes =
if wiki?
{ last_wiki_commit: sha, wiki_indexed_at: Time.now }
if index_wiki?
{ last_wiki_commit: to_sha, wiki_indexed_at: Time.now }
else
{ last_commit: sha, indexed_at: Time.now }
{ last_commit: to_sha, indexed_at: Time.now }
end
@index_status.update(attributes)
......
......@@ -7,20 +7,19 @@ describe Gitlab::Elastic::Indexer do
before do
stub_env('IN_MEMORY_APPLICATION_SETTINGS', 'true')
stub_ee_application_setting(ee_application_setting) if ee_application_setting.present?
end
let(:ee_application_setting) { { elasticsearch_url: ['http://localhost:9200'] } }
let(:project) { create(:project, :repository) }
let(:expected_from_sha) { Gitlab::Git::EMPTY_TREE_ID }
let(:to_commit) { project.commit }
let(:to_sha) { to_commit.try(:sha) }
let(:indexer) { described_class.new(project) }
let(:popen_success) { [[''], 0] }
let(:popen_failure) { [['error'], 1] }
context 'empty project' do
subject(:indexer) { described_class.new(project) }
context 'empty project', :elastic do
let(:project) { create(:project) }
it 'updates the index status without running the indexing command' do
......@@ -30,102 +29,59 @@ describe Gitlab::Elastic::Indexer do
expect_index_status(Gitlab::Git::BLANK_SHA)
end
end
context 'wikis' do
let(:project) { create(:project, :wiki_repo) }
let(:indexer) { described_class.new(project, wiki: true) }
before do
project.wiki.create_page('test.md', '# term')
end
it 'runs the indexer with the right flags' do
expect_popen.with(
[
TestEnv.indexer_bin_path,
'--blob-type=wiki_blob',
'--skip-commits',
project.id.to_s,
"#{project.wiki.repository.disk_path}.git"
],
nil,
hash_including(
'ELASTIC_CONNECTION_INFO' => elasticsearch_config.to_json,
'RAILS_ENV' => Rails.env,
'FROM_SHA' => expected_from_sha,
'TO_SHA' => nil
)
).and_return(popen_success)
context 'when indexing an unborn head', :elastic do
it 'updates the index status without running the indexing command' do
allow(project.repository).to receive(:exists?).and_return(false)
expect_popen.never
indexer.run
end
context 'when IndexStatus#last_wiki_commit is no longer in repository', :elastic do
let(:user) { project.owner }
let(:ee_application_setting) { nil }
before do
stub_ee_application_setting(elasticsearch_indexing: true)
ElasticIndexerWorker.new.perform('index', 'Project', project.id, project.es_id)
expect_index_status(Gitlab::Git::BLANK_SHA)
end
end
# Spec helper: yields to the given block (if any), then runs a wiki indexer
# up to the current `master` commit of the project's wiki repository and
# calls `ensure_elasticsearch_index!`.
#
# NOTE(review): this definition sits inside a diff hunk shown without +/-
# markers; an identical helper appears further down the spec, so this copy may
# be the removed side of a move — confirm against the merge request.
def change_wiki_and_index(project, &blk)
# The block is handed its own Proc as the yielded argument.
yield blk if blk
current_commit = project.wiki.repository.commit('master').sha
described_class.new(project, wiki: true).run(current_commit)
ensure_elasticsearch_index!
end
def indexed_wiki_paths_for(term)
blobs = ProjectWiki.elastic_search(
term,
type: 'wiki_blob'
)[:wiki_blobs][:results].response
describe '#find_indexable_commit' do
it 'is truthy for reachable commits' do
expect(indexer.find_indexable_commit(project.repository.commit.sha)).to be_an_instance_of(::Commit)
end
blobs.map do |blob|
blob['_source']['blob']['path']
it 'is falsey for unreachable commits', :aggregate_failures do
expect(indexer.find_indexable_commit(Gitlab::Git::BLANK_SHA)).to be_nil
expect(indexer.find_indexable_commit(Gitlab::Git::EMPTY_TREE_ID)).to be_nil
end
end
it 'reindexes from scratch' do
sha_for_reset = nil
context 'with an indexed project', :elastic do
let(:to_sha) { project.repository.commit.sha }
change_wiki_and_index(project) do
sha_for_reset = project.wiki.repository.create_file(user, '12', '', message: '12', branch_name: 'master')
project.wiki.repository.create_file(user, '23', '', message: '23', branch_name: 'master')
before do
# enable the indexing and index the project
stub_ee_application_setting(elasticsearch_indexing: true)
Elastic::IndexRecordService.new.execute(project, true)
end
expect(indexed_wiki_paths_for('12')).to include('12')
expect(indexed_wiki_paths_for('23')).to include('23')
project.index_status.update!(last_wiki_commit: '____________')
shared_examples 'index up to the specified commit' do
it 'updates the index status when the indexing is a success' do
expect_popen.and_return(popen_success)
change_wiki_and_index(project) do
project.wiki.repository.write_ref('master', sha_for_reset)
end
indexer.run(to_sha)
expect(indexed_wiki_paths_for('12')).to include('12')
expect(indexed_wiki_paths_for('23')).not_to include('23')
end
end
expect_index_status(to_sha)
end
context 'repository has unborn head' do
it 'updates the index status without running the indexing command' do
allow(project.repository).to receive(:exists?).and_return(false)
expect_popen.never
it 'leaves the index status untouched when the indexing fails' do
expect_popen.and_return(popen_failure)
indexer.run
expect { indexer.run }.to raise_error(Gitlab::Elastic::Indexer::Error)
expect_index_status(Gitlab::Git::BLANK_SHA)
expect(project.index_status).to be_nil
end
end
context 'test project' do
let(:project) { create(:project, :repository) }
context 'when indexing a HEAD commit', :elastic do
it_behaves_like 'index up to the specified commit'
it 'runs the indexing command' do
gitaly_connection_data = {
......@@ -149,7 +105,7 @@ describe Gitlab::Elastic::Indexer do
)
).and_return(popen_success)
indexer.run(to_sha)
indexer.run
end
context 'when IndexStatus exists' do
......@@ -169,40 +125,16 @@ describe Gitlab::Elastic::Indexer do
end
end
end
it 'updates the index status when the indexing is a success' do
expect_popen.and_return(popen_success)
indexer.run(to_sha)
expect_index_status(to_sha)
end
it 'leaves the index status untouched when indexing a non-HEAD commit' do
expect_popen.and_return(popen_success)
indexer.run(project.repository.commit('HEAD~1'))
expect(project.index_status).to be_nil
end
it 'leaves the index status untouched when the indexing fails' do
expect_popen.and_return(popen_failure)
expect { indexer.run }.to raise_error(Gitlab::Elastic::Indexer::Error)
context 'when indexing a non-HEAD commit', :elastic do
let(:to_sha) { project.repository.commit('HEAD~1').sha }
expect(project.index_status).to be_nil
end
end
it_behaves_like 'index up to the specified commit'
context 'reverting a change', :elastic do
context 'after reverting a change' do
let(:user) { project.owner }
let!(:initial_commit) { project.repository.commit('master').sha }
let(:ee_application_setting) { nil }
before do
stub_ee_application_setting(elasticsearch_indexing: true)
end
def change_repository_and_index(project, &blk)
yield blk if blk
......@@ -224,11 +156,18 @@ describe Gitlab::Elastic::Indexer do
end
end
context 'when IndexStatus#last_commit is no longer in repository' do
before do
ElasticIndexerWorker.new.perform('index', 'Project', project.id, project.es_id)
# Spec helper: searches commits for +term+ through `Repository.elastic_search`
# and returns the `sha` stored in each hit of the response.
def indexed_commits_for(term)
  search_result = Repository.elastic_search(term, type: 'commit')
  hits = search_result[:commits][:results].response

  hits.map { |hit| hit['_source']['commit']['sha'] }
end
context 'when IndexStatus#last_commit is no longer in repository' do
it 'reindexes from scratch' do
sha_for_reset = nil
......@@ -252,23 +191,104 @@ describe Gitlab::Elastic::Indexer do
end
context 'when branch is reset to an earlier commit' do
before do
it 'reverses already indexed commits' do
change_repository_and_index(project) do
project.repository.create_file(user, '12', '', message: '12', branch_name: 'master')
end
head = project.repository.commit.sha
expect(indexed_commits_for('12')).to include(head)
expect(indexed_file_paths_for('12')).to include('12')
end
it 'reverses already indexed commits' do
# resetting the repository should purge the index of the outstanding commits
change_repository_and_index(project) do
project.repository.write_ref('master', initial_commit)
end
expect(indexed_commits_for('12')).not_to include(head)
expect(indexed_file_paths_for('12')).not_to include('12')
end
end
end
end
context "when indexing a project's wiki", :elastic do
let(:project) { create(:project, :wiki_repo) }
let(:indexer) { described_class.new(project, wiki: true) }
let(:to_sha) { project.wiki.repository.commit('master').sha }
before do
project.wiki.create_page('test.md', '# term')
end
it 'runs the indexer with the right flags' do
expect_popen.with(
[
TestEnv.indexer_bin_path,
'--blob-type=wiki_blob',
'--skip-commits',
project.id.to_s,
"#{project.wiki.repository.disk_path}.git"
],
nil,
hash_including(
'ELASTIC_CONNECTION_INFO' => elasticsearch_config.to_json,
'RAILS_ENV' => Rails.env,
'FROM_SHA' => expected_from_sha,
'TO_SHA' => to_sha
)
).and_return(popen_success)
indexer.run
end
context 'when IndexStatus#last_wiki_commit is no longer in repository' do
let(:user) { project.owner }
# Spec helper: applies an optional change to the wiki, then indexes the wiki
# up to its current `master` commit and calls `ensure_elasticsearch_index!`.
#
# project - project whose wiki repository is indexed
# blk     - optional block; when given it is yielded to, receiving itself
#           as the argument
#
# Returns the result of `ensure_elasticsearch_index!`.
def change_wiki_and_index(project, &blk)
  yield blk unless blk.nil?

  head_sha = project.wiki.repository.commit('master').sha
  wiki_indexer = described_class.new(project, wiki: true)
  wiki_indexer.run(head_sha)

  ensure_elasticsearch_index!
end
# Spec helper: searches wiki blobs for +term+ through
# `ProjectWiki.elastic_search` and returns the blob `path` stored in each hit
# of the response.
def indexed_wiki_paths_for(term)
  search_result = ProjectWiki.elastic_search(term, type: 'wiki_blob')
  hits = search_result[:wiki_blobs][:results].response

  hits.map { |hit| hit['_source']['blob']['path'] }
end
it 'reindexes from scratch' do
sha_for_reset = nil
change_wiki_and_index(project) do
sha_for_reset = project.wiki.repository.create_file(user, '12', '', message: '12', branch_name: 'master')
project.wiki.repository.create_file(user, '23', '', message: '23', branch_name: 'master')
end
expect(indexed_wiki_paths_for('12')).to include('12')
expect(indexed_wiki_paths_for('23')).to include('23')
project.index_status.update!(last_wiki_commit: '____________')
change_wiki_and_index(project) do
project.wiki.repository.write_ref('master', sha_for_reset)
end
expect(indexed_wiki_paths_for('12')).to include('12')
expect(indexed_wiki_paths_for('23')).not_to include('23')
end
end
end
end
context 'when SSL env vars are not set explicitly' do
let(:ruby_cert_file) { OpenSSL::X509::DEFAULT_CERT_FILE }
......@@ -319,6 +339,7 @@ describe Gitlab::Elastic::Indexer do
# Spec helper: calls the indexer's `build_envvars` through `send`, passing
# BLANK_SHA for both revisions and the first Elasticsearch writing target of
# the project's repository.
def envvars
  subject_indexer = indexer
  blank_sha = Gitlab::Git::BLANK_SHA
  first_target = project.repository.__elasticsearch__.elastic_writing_targets.first

  subject_indexer.send(:build_envvars, blank_sha, blank_sha, first_target)
end
......
......@@ -33,13 +33,12 @@ describe ProjectWiki, :elastic do
Sidekiq::Testing.inline! do
project.wiki.find_page('omega_page').delete
last_commit = project.wiki.repository.commit.sha
expect_next_instance_of(Gitlab::Elastic::Indexer) do |indexer|
expect(indexer).to receive(:run).with(last_commit).and_call_original
expect(indexer).to receive(:run).and_call_original
end
project.wiki.index_wiki_blobs(last_commit)
project.wiki.index_wiki_blobs
ensure_elasticsearch_index!
end
......
......@@ -52,7 +52,7 @@ describe ProjectImportState, type: :model do
context 'no index status' do
it 'schedules a full index of the repository' do
expect(ElasticCommitIndexerWorker).to receive(:perform_async).with(import_state.project_id, nil)
expect(ElasticCommitIndexerWorker).to receive(:perform_async).with(import_state.project_id)
import_state.finish
end
......@@ -61,8 +61,8 @@ describe ProjectImportState, type: :model do
context 'with index status' do
let(:index_status) { IndexStatus.create!(project: project, indexed_at: Time.now, last_commit: 'foo') }
it 'schedules a progressive index of the repository' do
expect(ElasticCommitIndexerWorker).to receive(:perform_async).with(import_state.project_id, index_status.last_commit)
it 'schedules a full index of the repository' do
expect(ElasticCommitIndexerWorker).to receive(:perform_async).with(import_state.project_id)
import_state.finish
end
......
......@@ -55,7 +55,7 @@ describe Git::BranchPushService do
end
it 'runs ElasticCommitIndexerWorker' do
expect(ElasticCommitIndexerWorker).to receive(:perform_async).with(project.id, oldrev, newrev)
expect(ElasticCommitIndexerWorker).to receive(:perform_async).with(project.id)
subject.execute
end
......@@ -95,7 +95,7 @@ describe Git::BranchPushService do
end
it 'runs ElasticCommitIndexerWorker' do
expect(ElasticCommitIndexerWorker).to receive(:perform_async).with(project.id, oldrev, newrev)
expect(ElasticCommitIndexerWorker).to receive(:perform_async).with(project.id)
subject.execute
end
......@@ -110,7 +110,7 @@ describe Git::BranchPushService do
end
it 'runs ElasticCommitIndexerWorker' do
expect(ElasticCommitIndexerWorker).to receive(:perform_async).with(project.id, oldrev, newrev)
expect(ElasticCommitIndexerWorker).to receive(:perform_async).with(project.id)
subject.execute
end
......
......@@ -28,7 +28,7 @@ describe Git::WikiPushService do
end
it 'triggers a wiki update' do
expect(project.wiki).to receive(:index_wiki_blobs).with("797823")
expect(project.wiki).to receive(:index_wiki_blobs)
described_class.new(project, project.owner, changes: post_received.changes).execute
end
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment