Commit f034fcf8 authored by Stan Hu's avatar Stan Hu

Fail jobs that fail to render registration response

When a runner registers for a build, the JSON payload may require
loading a few Gitaly RPCs, which may fail. If these any of these fail,
the build will be appear as though it has been assigned to the runner,
but the runner will have received a 500 error and not proceed.

To recover from situations like this, we try to render the payload in
`RegisterJobService`. If that fails, we mark the job as failed with a
`scheduler_failure` so that it may be retried again.

This came out of https://gitlab.com/gitlab-org/gitlab/-/issues/227215.
parent 1457a8b9
......@@ -11,7 +11,7 @@ module Ci
METRICS_SHARD_TAG_PREFIX = 'metrics_shard::'.freeze
DEFAULT_METRICS_SHARD = 'default'.freeze
Result = Struct.new(:build, :valid?)
Result = Struct.new(:build, :build_json, :valid?)
def initialize(runner)
@runner = runner
......@@ -59,7 +59,7 @@ module Ci
end
register_failure
Result.new(nil, valid)
Result.new(nil, nil, valid)
end
# rubocop: enable CodeReuse/ActiveRecord
......@@ -71,7 +71,7 @@ module Ci
# In case when 2 runners try to assign the same build, second runner will be declined
# with StateMachines::InvalidTransition or StaleObjectError when doing run! or save method.
if assign_runner!(build, params)
Result.new(build, true)
present_build!(build)
end
rescue StateMachines::InvalidTransition, ActiveRecord::StaleObjectError
# We are looping to find another build that is not conflicting
......@@ -83,8 +83,10 @@ module Ci
# In case we hit the concurrency-access lock,
# we still have to return 409 in the end,
# to make sure that this is properly handled by runner.
Result.new(nil, false)
Result.new(nil, nil, false)
rescue => ex
# If an error (e.g. GRPC::DeadlineExceeded) occurred constructing
# the result, consider this as a failure to be retried.
scheduler_failure!(build)
track_exception_for_build(ex, build)
......@@ -92,6 +94,15 @@ module Ci
nil
end
# Force variables evaluation to occur now
def present_build!(build)
# We need to use the presenter here because Gitaly calls in the presenter
# may fail, and we need to ensure the response has been generated.
presented_build = ::Ci::BuildRunnerPresenter.new(build) # rubocop:disable CodeReuse/Presenter
build_json = ::API::Entities::JobRequest::Response.new(presented_build).to_json
Result.new(build, build_json, true)
end
def assign_runner!(build, params)
build.runner_id = runner.id
build.runner_session_attributes = params[:session] if params[:session].present?
......
---
title: Fail jobs that fail to render registration response
merge_request: 36274
author:
type: fixed
......@@ -108,6 +108,20 @@ module API
end
optional :job_age, type: Integer, desc: %q(Job should be older than passed age in seconds to be ran on runner)
end
# Since we serialize the build output ourselves to ensure Gitaly
# gRPC calls succeed, we need a custom Grape format to handle
# this:
# 1. Grape will ordinarily call `JSON.dump` when Content-Type is set
# to application/json. To avoid this, we need to define a custom type in
# `content_type` and a custom formatter to go with it.
# 2. Grape will parse the request input with the parser defined for
# `content_type`. If no such parser exists, it will be treated as text. We
# reuse the existing JSON parser to preserve the previous behavior.
content_type :build_json, 'application/json'
formatter :build_json, ->(object, _) { object }
parser :build_json, ::Grape::Parser::Json
post '/request' do
authenticate_runner!
......@@ -128,9 +142,10 @@ module API
result = ::Ci::RegisterJobService.new(current_runner).execute(runner_params)
if result.valid?
if result.build
if result.build_json
Gitlab::Metrics.add_event(:build_found)
present ::Ci::BuildRunnerPresenter.new(result.build), with: Entities::JobRequest::Response
env['api.format'] = :build_json
body result.build_json
else
Gitlab::Metrics.add_event(:build_not_found)
header 'X-GitLab-Last-Update', new_update
......
......@@ -518,6 +518,7 @@ RSpec.describe API::Ci::Runner, :clean_gitlab_redis_shared_state do
request_job info: { platform: :darwin }
expect(response).to have_gitlab_http_status(:created)
expect(response.headers['Content-Type']).to eq('application/json')
expect(response.headers).not_to have_key('X-GitLab-Last-Update')
expect(runner.reload.platform).to eq('darwin')
expect(json_response['id']).to eq(job.id)
......@@ -569,6 +570,24 @@ RSpec.describe API::Ci::Runner, :clean_gitlab_redis_shared_state do
end
end
context 'when a Gitaly exception is thrown during response' do
before do
allow_next_instance_of(Ci::BuildRunnerPresenter) do |instance|
allow(instance).to receive(:artifacts).and_raise(GRPC::DeadlineExceeded)
end
end
it 'fails the job as a scheduler failure' do
request_job
expect(response).to have_gitlab_http_status(:no_content)
expect(job.reload.failed?).to be_truthy
expect(job.failure_reason).to eq('scheduler_failure')
expect(job.runner_id).to eq(runner.id)
expect(job.runner_session).to be_nil
end
end
context 'when GIT_DEPTH is not specified and there is no default git depth for the project' do
before do
project.update!(ci_default_git_depth: nil)
......@@ -1090,7 +1109,7 @@ RSpec.describe API::Ci::Runner, :clean_gitlab_redis_shared_state do
def request_job(token = runner.token, **params)
new_params = params.merge(token: token, last_update: last_update)
post api('/jobs/request'), params: new_params, headers: { 'User-Agent' => user_agent }
post api('/jobs/request'), params: new_params.to_json, headers: { 'User-Agent' => user_agent, 'Content-Type': 'application/json' }
end
end
......
......@@ -109,12 +109,14 @@ module Ci
end
context 'shared runner' do
let(:build) { execute(shared_runner) }
let(:response) { described_class.new(shared_runner).execute }
let(:build) { response.build }
it { expect(build).to be_kind_of(Build) }
it { expect(build).to be_valid }
it { expect(build).to be_running }
it { expect(build.runner).to eq(shared_runner) }
it { expect(Gitlab::Json.parse(response.build_json)['id']).to eq(build.id) }
end
context 'specific runner' do
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment