Commit 6e415666 authored by Mario de la Ossa's avatar Mario de la Ossa

Fix elasticsearch initial index cancelled by push

When doing the initial Elasticsearch indexing, if a project hasn't been
indexed yet and a push is received, we index the data received on that
push, but then do not index data from before that push.

This is fixed by making ElasticBatchProjectIndexerWorker ignore the
Project's IndexStatus object unless we explicitly ask for it to update
indexed projects via the UPDATE_INDEX env flag
parent 0014186c
...@@ -366,6 +366,12 @@ Here are some common pitfalls and how to overcome them: ...@@ -366,6 +366,12 @@ Here are some common pitfalls and how to overcome them:
You will need to re-run all the rake tasks to re-index the database, repositories, and wikis. You will need to re-run all the rake tasks to re-index the database, repositories, and wikis.
- **No new data is added to the Elasticsearch index when I push code**
When performing the initial indexing of blobs, we lock all projects until the project finishes indexing. It could
happen that an error during the process causes one or multiple projects to remain locked. In order to unlock them,
run the `gitlab:elastic:clear_locked_projects` rake task.
- **"Can't specify parent if no parent field has been configured"** - **"Can't specify parent if no parent field has been configured"**
If you enabled Elasticsearch before GitLab 8.12 and have not rebuilt indexes you will get If you enabled Elasticsearch before GitLab 8.12 and have not rebuilt indexes you will get
......
...@@ -8,7 +8,7 @@ module EE ...@@ -8,7 +8,7 @@ module EE
override :execute_related_hooks override :execute_related_hooks
def execute_related_hooks def execute_related_hooks
if ::Gitlab::CurrentSettings.elasticsearch_indexing? && default_branch? if ::Gitlab::CurrentSettings.elasticsearch_indexing? && default_branch? && should_index_commits?
::ElasticCommitIndexerWorker.perform_async(project.id, params[:oldrev], params[:newrev]) ::ElasticCommitIndexerWorker.perform_async(project.id, params[:oldrev], params[:newrev])
end end
...@@ -17,6 +17,10 @@ module EE ...@@ -17,6 +17,10 @@ module EE
private private
def should_index_commits?
::Gitlab::Redis::SharedState.with { |redis| !redis.sismember(:elastic_projects_indexing, project.id) }
end
override :pipeline_options override :pipeline_options
def pipeline_options def pipeline_options
{ mirror_update: project.mirror? && project.repository.up_to_date_with_upstream?(branch_name) } { mirror_update: project.mirror? && project.repository.up_to_date_with_upstream?(branch_name) }
......
...@@ -11,17 +11,23 @@ class ElasticBatchProjectIndexerWorker ...@@ -11,17 +11,23 @@ class ElasticBatchProjectIndexerWorker
sidekiq_options retry: 10 sidekiq_options retry: 10
def perform(start, finish, update_index = false) def perform(start, finish, update_index = false)
projects = build_relation(start, finish, update_index) projects = build_relation(start, finish)
projects.find_each { |project| run_indexer(project) } projects.find_each { |project| run_indexer(project, update_index) }
end end
private private
def run_indexer(project) def run_indexer(project, update_index)
# Ensure we remove the hold on the project, no matter what, so ElasticCommitIndexerWorker can do its thing
# We do this before the indexer starts to avoid the possibility of pushes coming in during this time not
# being indexed.
Gitlab::Redis::SharedState.with { |redis| redis.srem(:elastic_projects_indexing, project.id) }
logger.info "Indexing #{project.full_name} (ID=#{project.id})..." logger.info "Indexing #{project.full_name} (ID=#{project.id})..."
last_commit = project.index_status.try(:last_commit) # Get the last commit if we're updating indexed projects - otherwise we'll want to index everything
last_commit = project.index_status.try(:last_commit) if update_index
Gitlab::Elastic::Indexer.new(project).run(last_commit) Gitlab::Elastic::Indexer.new(project).run(last_commit)
logger.info "Indexing #{project.full_name} (ID=#{project.id}) is done!" logger.info "Indexing #{project.full_name} (ID=#{project.id}) is done!"
...@@ -30,13 +36,9 @@ class ElasticBatchProjectIndexerWorker ...@@ -30,13 +36,9 @@ class ElasticBatchProjectIndexerWorker
end end
# rubocop: disable CodeReuse/ActiveRecord # rubocop: disable CodeReuse/ActiveRecord
def build_relation(start, finish, update_index) def build_relation(start, finish)
relation = Project.includes(:index_status) relation = Project.includes(:index_status)
unless update_index
relation = relation.where('index_statuses.id IS NULL').references(:index_statuses)
end
table = Project.arel_table table = Project.arel_table
relation = relation.where(table[:id].gteq(start)) if start relation = relation.where(table[:id].gteq(start)) if start
relation = relation.where(table[:id].lteq(finish)) if finish relation = relation.where(table[:id].lteq(finish)) if finish
......
---
title: Fix repo pushes while initial Elasticsearch indexing not permitting initial
indexing to complete
merge_request: 9478
author:
type: fixed
...@@ -2,6 +2,11 @@ namespace :gitlab do ...@@ -2,6 +2,11 @@ namespace :gitlab do
namespace :elastic do namespace :elastic do
desc "GitLab | Elasticsearch | Index eveything at once" desc "GitLab | Elasticsearch | Index eveything at once"
task :index do task :index do
# UPDATE_INDEX=true can cause some projects not to be indexed properly if someone were to push a commit to the
# project before the rake task could get to it, so we set it to `nil` here to avoid that. It doesn't make sense
# to use this configuration during a full re-index anyways.
ENV['UPDATE_INDEX'] = nil
Rake::Task["gitlab:elastic:create_empty_index"].invoke Rake::Task["gitlab:elastic:create_empty_index"].invoke
Rake::Task["gitlab:elastic:clear_index_status"].invoke Rake::Task["gitlab:elastic:clear_index_status"].invoke
Rake::Task["gitlab:elastic:index_wikis"].invoke Rake::Task["gitlab:elastic:index_wikis"].invoke
...@@ -40,6 +45,13 @@ namespace :gitlab do ...@@ -40,6 +45,13 @@ namespace :gitlab do
end end
end end
desc 'GitLab | Elasticsearch | Unlock repositories for indexing in case something gets stuck'
task clear_locked_projects: :environment do
Gitlab::Redis::SharedState.with { |redis| redis.del(:elastic_projects_indexing) }
puts 'Cleared all locked projects. Incremental indexing should work now.'
end
desc "GitLab | Elasticsearch | Index wiki repositories" desc "GitLab | Elasticsearch | Index wiki repositories"
task index_wikis: :environment do task index_wikis: :environment do
projects = apply_project_filters(Project.with_wiki_enabled) projects = apply_project_filters(Project.with_wiki_enabled)
...@@ -180,6 +192,7 @@ namespace :gitlab do ...@@ -180,6 +192,7 @@ namespace :gitlab do
relation.all.in_batches(of: batch_size, start: ENV['ID_FROM'], finish: ENV['ID_TO']) do |relation| # rubocop: disable Cop/InBatches relation.all.in_batches(of: batch_size, start: ENV['ID_FROM'], finish: ENV['ID_TO']) do |relation| # rubocop: disable Cop/InBatches
ids = relation.reorder(:id).pluck(:id) ids = relation.reorder(:id).pluck(:id)
Gitlab::Redis::SharedState.with { |redis| redis.sadd(:elastic_projects_indexing, ids) }
yield ids[0], ids[-1] yield ids[0], ids[-1]
end end
end end
......
...@@ -16,16 +16,43 @@ describe GitPushService do ...@@ -16,16 +16,43 @@ describe GitPushService do
described_class.new(project, user, oldrev: oldrev, newrev: newrev, ref: ref) described_class.new(project, user, oldrev: oldrev, newrev: newrev, ref: ref)
end end
before do
allow(project.repository).to receive(:commit).and_call_original
allow(project.repository).to receive(:commit).with("master").and_return(nil)
end
context 'deleted branch' do context 'deleted branch' do
let(:newrev) { blankrev } let(:newrev) { blankrev }
it 'handles when remote branch exists' do it 'handles when remote branch exists' do
allow(project.repository).to receive(:commit).and_call_original
allow(project.repository).to receive(:commit).with("master").and_return(nil)
expect(project.repository).to receive(:commit).with("refs/remotes/upstream/master").and_return(sample_commit) expect(project.repository).to receive(:commit).with("refs/remotes/upstream/master").and_return(sample_commit)
subject.execute subject.execute
end end
end end
context 'ElasticSearch indexing' do
before do
stub_ee_application_setting(elasticsearch_indexing?: true)
end
context 'when the project is locked by elastic.rake', :clean_gitlab_redis_shared_state do
before do
Gitlab::Redis::SharedState.with { |redis| redis.sadd(:elastic_projects_indexing, project.id) }
end
it 'does not run ElasticCommitIndexerWorker' do
expect(ElasticCommitIndexerWorker).not_to receive(:perform_async)
subject.execute
end
end
it 'runs ElasticCommitIndexerWorker' do
expect(ElasticCommitIndexerWorker).to receive(:perform_async).with(project.id, oldrev, newrev)
subject.execute
end
end
end end
end end
...@@ -6,23 +6,38 @@ describe ElasticBatchProjectIndexerWorker do ...@@ -6,23 +6,38 @@ describe ElasticBatchProjectIndexerWorker do
describe '#perform' do describe '#perform' do
it 'runs the indexer for projects in the batch range' do it 'runs the indexer for projects in the batch range' do
projects.each {|project| expect_index(project) } projects.each { |project| expect_index(project, false) }
worker.perform(projects.first.id, projects.last.id) worker.perform(projects.first.id, projects.last.id)
end end
it 'skips projects not in the batch range' do it 'skips projects not in the batch range' do
expect_index(projects.first).never expect_index(projects.first, false).never
expect_index(projects.last) expect_index(projects.last, false)
worker.perform(projects.last.id, projects.last.id) worker.perform(projects.last.id, projects.last.id)
end end
it 'clears the "locked" state from redis when the project finishes indexing' do
Gitlab::Redis::SharedState.with { |redis| redis.sadd(:elastic_projects_indexing, projects.first.id) }
expect_index(projects.first, false).and_call_original
expect_next_instance_of(Gitlab::Elastic::Indexer) do |indexer|
expect(indexer).to receive(:run).with(nil)
end
expect { worker.perform(projects.first.id, projects.first.id) }
.to change { project_locked?(projects.first) }.from(true).to(false)
end
context 'update_index = false' do context 'update_index = false' do
it 'skips projects that were already indexed' do it 'indexes all projects it receives even if already indexed' do
projects.first.create_index_status! projects.first.build_index_status.update!(last_commit: 'foo')
expect_index(projects.first).never expect_index(projects.first, false).and_call_original
expect_next_instance_of(Gitlab::Elastic::Indexer) do |indexer|
expect(indexer).to receive(:run).with(nil)
end
worker.perform(projects.first.id, projects.first.id) worker.perform(projects.first.id, projects.first.id)
end end
...@@ -32,8 +47,8 @@ describe ElasticBatchProjectIndexerWorker do ...@@ -32,8 +47,8 @@ describe ElasticBatchProjectIndexerWorker do
it 'reindexes projects that were already indexed' do it 'reindexes projects that were already indexed' do
projects.first.create_index_status! projects.first.create_index_status!
expect_index(projects.first) expect_index(projects.first, true)
expect_index(projects.last) expect_index(projects.last, true)
worker.perform(projects.first.id, projects.last.id, true) worker.perform(projects.first.id, projects.last.id, true)
end end
...@@ -41,7 +56,7 @@ describe ElasticBatchProjectIndexerWorker do ...@@ -41,7 +56,7 @@ describe ElasticBatchProjectIndexerWorker do
it 'starts indexing at the last indexed commit' do it 'starts indexing at the last indexed commit' do
projects.first.create_index_status!(last_commit: 'foo') projects.first.create_index_status!(last_commit: 'foo')
expect_index(projects.first).and_call_original expect_index(projects.first, true).and_call_original
expect_any_instance_of(Gitlab::Elastic::Indexer).to receive(:run).with('foo') expect_any_instance_of(Gitlab::Elastic::Indexer).to receive(:run).with('foo')
worker.perform(projects.first.id, projects.first.id, true) worker.perform(projects.first.id, projects.first.id, true)
...@@ -49,7 +64,11 @@ describe ElasticBatchProjectIndexerWorker do ...@@ -49,7 +64,11 @@ describe ElasticBatchProjectIndexerWorker do
end end
end end
def expect_index(project) def expect_index(project, update_index)
expect(worker).to receive(:run_indexer).with(project) expect(worker).to receive(:run_indexer).with(project, update_index)
end
def project_locked?(project)
Gitlab::Redis::SharedState.with { |redis| redis.sismember(:elastic_projects_indexing, project.id) }
end end
end end
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment