Commit f034fcf8 authored by Stan Hu's avatar Stan Hu

Fail jobs that fail to render registration response

When a runner registers for a build, the JSON payload may require
loading a few Gitaly RPCs, which may fail. If these any of these fail,
the build will be appear as though it has been assigned to the runner,
but the runner will have received a 500 error and not proceed.

To recover from situations like this, we try to render the payload in
`RegisterJobService`. If that fails, we mark the job as failed with a
`scheduler_failure` so that it may be retried again.

This came out of https://gitlab.com/gitlab-org/gitlab/-/issues/227215.
parent 1457a8b9
...@@ -11,7 +11,7 @@ module Ci ...@@ -11,7 +11,7 @@ module Ci
METRICS_SHARD_TAG_PREFIX = 'metrics_shard::'.freeze METRICS_SHARD_TAG_PREFIX = 'metrics_shard::'.freeze
DEFAULT_METRICS_SHARD = 'default'.freeze DEFAULT_METRICS_SHARD = 'default'.freeze
Result = Struct.new(:build, :valid?) Result = Struct.new(:build, :build_json, :valid?)
def initialize(runner) def initialize(runner)
@runner = runner @runner = runner
...@@ -59,7 +59,7 @@ module Ci ...@@ -59,7 +59,7 @@ module Ci
end end
register_failure register_failure
Result.new(nil, valid) Result.new(nil, nil, valid)
end end
# rubocop: enable CodeReuse/ActiveRecord # rubocop: enable CodeReuse/ActiveRecord
...@@ -71,7 +71,7 @@ module Ci ...@@ -71,7 +71,7 @@ module Ci
# In case when 2 runners try to assign the same build, second runner will be declined # In case when 2 runners try to assign the same build, second runner will be declined
# with StateMachines::InvalidTransition or StaleObjectError when doing run! or save method. # with StateMachines::InvalidTransition or StaleObjectError when doing run! or save method.
if assign_runner!(build, params) if assign_runner!(build, params)
Result.new(build, true) present_build!(build)
end end
rescue StateMachines::InvalidTransition, ActiveRecord::StaleObjectError rescue StateMachines::InvalidTransition, ActiveRecord::StaleObjectError
# We are looping to find another build that is not conflicting # We are looping to find another build that is not conflicting
...@@ -83,8 +83,10 @@ module Ci ...@@ -83,8 +83,10 @@ module Ci
# In case we hit the concurrency-access lock, # In case we hit the concurrency-access lock,
# we still have to return 409 in the end, # we still have to return 409 in the end,
# to make sure that this is properly handled by runner. # to make sure that this is properly handled by runner.
Result.new(nil, false) Result.new(nil, nil, false)
rescue => ex rescue => ex
# If an error (e.g. GRPC::DeadlineExceeded) occurred constructing
# the result, consider this as a failure to be retried.
scheduler_failure!(build) scheduler_failure!(build)
track_exception_for_build(ex, build) track_exception_for_build(ex, build)
...@@ -92,6 +94,15 @@ module Ci ...@@ -92,6 +94,15 @@ module Ci
nil nil
end end
# Force variables evaluation to occur now
def present_build!(build)
# We need to use the presenter here because Gitaly calls in the presenter
# may fail, and we need to ensure the response has been generated.
presented_build = ::Ci::BuildRunnerPresenter.new(build) # rubocop:disable CodeReuse/Presenter
build_json = ::API::Entities::JobRequest::Response.new(presented_build).to_json
Result.new(build, build_json, true)
end
def assign_runner!(build, params) def assign_runner!(build, params)
build.runner_id = runner.id build.runner_id = runner.id
build.runner_session_attributes = params[:session] if params[:session].present? build.runner_session_attributes = params[:session] if params[:session].present?
......
---
title: Fail jobs that fail to render registration response
merge_request: 36274
author:
type: fixed
...@@ -108,6 +108,20 @@ module API ...@@ -108,6 +108,20 @@ module API
end end
optional :job_age, type: Integer, desc: %q(Job should be older than passed age in seconds to be ran on runner) optional :job_age, type: Integer, desc: %q(Job should be older than passed age in seconds to be ran on runner)
end end
# Since we serialize the build output ourselves to ensure Gitaly
# gRPC calls succeed, we need a custom Grape format to handle
# this:
# 1. Grape will ordinarily call `JSON.dump` when Content-Type is set
# to application/json. To avoid this, we need to define a custom type in
# `content_type` and a custom formatter to go with it.
# 2. Grape will parse the request input with the parser defined for
# `content_type`. If no such parser exists, it will be treated as text. We
# reuse the existing JSON parser to preserve the previous behavior.
content_type :build_json, 'application/json'
formatter :build_json, ->(object, _) { object }
parser :build_json, ::Grape::Parser::Json
post '/request' do post '/request' do
authenticate_runner! authenticate_runner!
...@@ -128,9 +142,10 @@ module API ...@@ -128,9 +142,10 @@ module API
result = ::Ci::RegisterJobService.new(current_runner).execute(runner_params) result = ::Ci::RegisterJobService.new(current_runner).execute(runner_params)
if result.valid? if result.valid?
if result.build if result.build_json
Gitlab::Metrics.add_event(:build_found) Gitlab::Metrics.add_event(:build_found)
present ::Ci::BuildRunnerPresenter.new(result.build), with: Entities::JobRequest::Response env['api.format'] = :build_json
body result.build_json
else else
Gitlab::Metrics.add_event(:build_not_found) Gitlab::Metrics.add_event(:build_not_found)
header 'X-GitLab-Last-Update', new_update header 'X-GitLab-Last-Update', new_update
......
...@@ -518,6 +518,7 @@ RSpec.describe API::Ci::Runner, :clean_gitlab_redis_shared_state do ...@@ -518,6 +518,7 @@ RSpec.describe API::Ci::Runner, :clean_gitlab_redis_shared_state do
request_job info: { platform: :darwin } request_job info: { platform: :darwin }
expect(response).to have_gitlab_http_status(:created) expect(response).to have_gitlab_http_status(:created)
expect(response.headers['Content-Type']).to eq('application/json')
expect(response.headers).not_to have_key('X-GitLab-Last-Update') expect(response.headers).not_to have_key('X-GitLab-Last-Update')
expect(runner.reload.platform).to eq('darwin') expect(runner.reload.platform).to eq('darwin')
expect(json_response['id']).to eq(job.id) expect(json_response['id']).to eq(job.id)
...@@ -569,6 +570,24 @@ RSpec.describe API::Ci::Runner, :clean_gitlab_redis_shared_state do ...@@ -569,6 +570,24 @@ RSpec.describe API::Ci::Runner, :clean_gitlab_redis_shared_state do
end end
end end
context 'when a Gitaly exception is thrown during response' do
before do
allow_next_instance_of(Ci::BuildRunnerPresenter) do |instance|
allow(instance).to receive(:artifacts).and_raise(GRPC::DeadlineExceeded)
end
end
it 'fails the job as a scheduler failure' do
request_job
expect(response).to have_gitlab_http_status(:no_content)
expect(job.reload.failed?).to be_truthy
expect(job.failure_reason).to eq('scheduler_failure')
expect(job.runner_id).to eq(runner.id)
expect(job.runner_session).to be_nil
end
end
context 'when GIT_DEPTH is not specified and there is no default git depth for the project' do context 'when GIT_DEPTH is not specified and there is no default git depth for the project' do
before do before do
project.update!(ci_default_git_depth: nil) project.update!(ci_default_git_depth: nil)
...@@ -1090,7 +1109,7 @@ RSpec.describe API::Ci::Runner, :clean_gitlab_redis_shared_state do ...@@ -1090,7 +1109,7 @@ RSpec.describe API::Ci::Runner, :clean_gitlab_redis_shared_state do
def request_job(token = runner.token, **params) def request_job(token = runner.token, **params)
new_params = params.merge(token: token, last_update: last_update) new_params = params.merge(token: token, last_update: last_update)
post api('/jobs/request'), params: new_params, headers: { 'User-Agent' => user_agent } post api('/jobs/request'), params: new_params.to_json, headers: { 'User-Agent' => user_agent, 'Content-Type': 'application/json' }
end end
end end
......
...@@ -109,12 +109,14 @@ module Ci ...@@ -109,12 +109,14 @@ module Ci
end end
context 'shared runner' do context 'shared runner' do
let(:build) { execute(shared_runner) } let(:response) { described_class.new(shared_runner).execute }
let(:build) { response.build }
it { expect(build).to be_kind_of(Build) } it { expect(build).to be_kind_of(Build) }
it { expect(build).to be_valid } it { expect(build).to be_valid }
it { expect(build).to be_running } it { expect(build).to be_running }
it { expect(build.runner).to eq(shared_runner) } it { expect(build.runner).to eq(shared_runner) }
it { expect(Gitlab::Json.parse(response.build_json)['id']).to eq(build.id) }
end end
context 'specific runner' do context 'specific runner' do
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment