Commit d3a4b526 authored by Sean McGivern's avatar Sean McGivern

Merge branch 'feature/gb/live-build-traces-finalize-api' into 'master'

Migrate live traces before updating build state

See merge request gitlab-org/gitlab!41304
parents b0072e3c 39457e88
......@@ -38,8 +38,9 @@ module Ci
has_one :deployment, as: :deployable, class_name: 'Deployment'
has_one :resource, class_name: 'Ci::Resource', inverse_of: :build
has_one :pending_state, class_name: 'Ci::BuildPendingState', inverse_of: :build
has_many :trace_sections, class_name: 'Ci::BuildTraceSection'
has_many :trace_chunks, class_name: 'Ci::BuildTraceChunk', foreign_key: :build_id
has_many :trace_chunks, class_name: 'Ci::BuildTraceChunk', foreign_key: :build_id, inverse_of: :build
has_many :report_results, class_name: 'Ci::BuildReportResult', inverse_of: :build
has_many :job_artifacts, class_name: 'Ci::JobArtifact', foreign_key: :job_id, dependent: :destroy, inverse_of: :job # rubocop:disable Cop/ActiveRecordDependent
......@@ -956,6 +957,10 @@ module Ci
var[:value]&.to_i if var
end
def remove_pending_state!
pending_state.try(:delete)
end
private
def auto_retry
......
......@@ -5,5 +5,8 @@ class Ci::BuildPendingState < ApplicationRecord
belongs_to :build, class_name: 'Ci::Build', foreign_key: :build_id
enum state: Ci::Stage.statuses
enum failure_reason: CommitStatus.failure_reasons
validates :build, presence: true
end
......@@ -26,6 +26,8 @@ module Ci
fog: 3
}
scope :live, -> { redis }
class << self
def all_stores
@all_stores ||= self.data_stores.keys
......@@ -116,6 +118,16 @@ module Ci
redis?
end
##
# Build trace chunk is final (the last one that we do not expect to ever
# become full) when a runner submitted a build pending state and there is
# no chunk with higher index in the database.
#
def final?
build.pending_state.present? &&
build.trace_chunks.maximum(:chunk_index).to_i == chunk_index
end
private
def get_data
......@@ -128,8 +140,9 @@ module Ci
current_data = data
old_store_class = current_store
current_size = current_data&.bytesize.to_i
unless current_data&.bytesize.to_i == CHUNK_SIZE
unless current_size == CHUNK_SIZE || final?
raise FailedToPersistDataError, 'Data is not fulfilled in a bucket'
end
......
......@@ -13,6 +13,7 @@ module Ci
end
job.trace.archive!
job.remove_pending_state!
# TODO: Remove this logging once we confirmed new live trace architecture is functional.
# See https://gitlab.com/gitlab-com/gl-infra/infrastructure/issues/4667.
......
# frozen_string_literal: true
module Ci
class UpdateBuildStateService
Result = Struct.new(:status, keyword_init: true)
ACCEPT_TIMEOUT = 5.minutes.freeze
attr_reader :build, :params, :metrics
def initialize(build, params, metrics = ::Gitlab::Ci::Trace::Metrics.new)
@build = build
@params = params
@metrics = metrics
end
def execute
overwrite_trace! if has_trace?
if accept_request?
accept_build_state!
else
check_migration_state
update_build_state!
end
end
private
def accept_build_state!
if Time.current - ensure_pending_state.created_at > ACCEPT_TIMEOUT
metrics.increment_trace_operation(operation: :discarded)
return update_build_state!
end
build.trace_chunks.live.find_each do |chunk|
chunk.schedule_to_persist!
end
metrics.increment_trace_operation(operation: :accepted)
Result.new(status: 202)
end
def overwrite_trace!
metrics.increment_trace_operation(operation: :overwrite)
build.trace.set(params[:trace]) if Gitlab::Ci::Features.trace_overwrite?
end
def check_migration_state
return unless accept_available?
if has_chunks? && !live_chunks_pending?
metrics.increment_trace_operation(operation: :finalized)
end
end
def update_build_state!
case build_state
when 'running'
build.touch if build.needs_touch?
Result.new(status: 200)
when 'success'
build.success!
Result.new(status: 200)
when 'failed'
build.drop!(params[:failure_reason] || :unknown_failure)
Result.new(status: 200)
else
Result.new(status: 400)
end
end
def accept_available?
!build_running? && has_checksum? && chunks_migration_enabled?
end
def accept_request?
accept_available? && live_chunks_pending?
end
def build_state
params.dig(:state).to_s
end
def has_trace?
params.dig(:trace).present?
end
def has_checksum?
params.dig(:checksum).present?
end
def has_chunks?
build.trace_chunks.any?
end
def live_chunks_pending?
build.trace_chunks.live.any?
end
def build_running?
build_state == 'running'
end
def ensure_pending_state
Ci::BuildPendingState.create_or_find_by!(
build_id: build.id,
state: params.fetch(:state),
trace_checksum: params.fetch(:checksum),
failure_reason: params.dig(:failure_reason)
)
rescue ActiveRecord::RecordNotFound
metrics.increment_trace_operation(operation: :conflict)
build.pending_state
end
def chunks_migration_enabled?
::Gitlab::Ci::Features.accept_trace?(build.project)
end
end
end
---
title: Migrate live traces before updating build state
merge_request: 41304
author:
type: added
---
name: ci_accept_trace
introduced_by_url: https://gitlab.com/gitlab-org/gitlab/-/merge_requests/41304
rollout_issue_url:
group: group::continuous integration
type: ops
default_enabled: false
\ No newline at end of file
---
name: ci_trace_overwrite
introduced_by_url: https://gitlab.com/gitlab-org/gitlab/-/merge_requests/41304
rollout_issue_url:
group: group::continuous integration
type: ops
default_enabled: false
\ No newline at end of file
# frozen_string_literal: true
class ChangeBuildPendingStateEnums < ActiveRecord::Migration[6.0]
DOWNTIME = false
def up
change_column :ci_build_pending_states, :state, :integer, limit: 2
change_column :ci_build_pending_states, :failure_reason, :integer, limit: 2
end
def down
change_column :ci_build_pending_states, :state, :integer
change_column :ci_build_pending_states, :failure_reason, :integer
end
end
c46bafefa5fd79a6644cbe259260b66aaded36aad4ae28a84ddd8bb072c2167d
\ No newline at end of file
......@@ -9853,8 +9853,8 @@ CREATE TABLE public.ci_build_pending_states (
created_at timestamp with time zone NOT NULL,
updated_at timestamp with time zone NOT NULL,
build_id bigint NOT NULL,
state integer,
failure_reason integer,
state smallint,
failure_reason smallint,
trace_checksum bytea
);
......
......@@ -159,29 +159,29 @@ module API
end
desc 'Updates a job' do
http_codes [[200, 'Job was updated'], [403, 'Forbidden']]
http_codes [[200, 'Job was updated'],
[202, 'Update accepted'],
[400, 'Unknown parameters'],
[403, 'Forbidden']]
end
params do
requires :token, type: String, desc: %q(Runners's authentication token)
requires :id, type: Integer, desc: %q(Job's ID)
optional :trace, type: String, desc: %q(Job's full trace)
optional :state, type: String, desc: %q(Job's status: success, failed)
optional :checksum, type: String, desc: %q(Job's trace CRC32 checksum)
optional :failure_reason, type: String, desc: %q(Job's failure_reason)
end
put '/:id' do
job = authenticate_job!
job.trace.set(params[:trace]) if params[:trace]
Gitlab::Metrics.add_event(:update_build)
case params[:state].to_s
when 'running'
job.touch if job.needs_touch?
when 'success'
job.success!
when 'failed'
job.drop!(params[:failure_reason] || :unknown_failure)
service = ::Ci::UpdateBuildStateService
.new(job, declared_params(include_missing: false))
service.execute.then do |result|
status result.status
end
end
......
......@@ -69,6 +69,15 @@ module Gitlab
def self.child_of_child_pipeline_enabled?(project)
::Feature.enabled?(:ci_child_of_child_pipeline, project, default_enabled: false)
end
def self.trace_overwrite?
::Feature.enabled?(:ci_trace_overwrite, type: :ops, default_enabled: false)
end
def self.accept_trace?(project)
::Feature.enabled?(:ci_enable_live_trace, project) &&
::Feature.enabled?(:ci_accept_trace, project, type: :ops, default_enabled: false)
end
end
end
end
......@@ -6,7 +6,8 @@ module Gitlab
class Metrics
extend Gitlab::Utils::StrongMemoize
OPERATIONS = [:mutated].freeze
OPERATIONS = [:appended, :mutated, :overwrite, :accepted,
:finalized, :discarded, :flaky].freeze
def increment_trace_operation(operation: :unknown)
unless OPERATIONS.include?(operation)
......
# frozen_string_literal: true
FactoryBot.define do
factory :ci_build_pending_state, class: 'Ci::BuildPendingState' do
build factory: :ci_build
trace_checksum { 'crc32:12345678' }
state { 'success' }
end
end
......@@ -463,6 +463,8 @@ RSpec.describe Ci::BuildTraceChunk, :clean_gitlab_redis_shared_state do
end
describe '#persist_data!' do
let(:build) { create(:ci_build, :running) }
subject { build_trace_chunk.persist_data! }
shared_examples_for 'Atomic operation' do
......@@ -524,6 +526,18 @@ RSpec.describe Ci::BuildTraceChunk, :clean_gitlab_redis_shared_state do
expect(Ci::BuildTraceChunks::Database.new.data(build_trace_chunk)).to be_nil
expect(Ci::BuildTraceChunks::Fog.new.data(build_trace_chunk)).to be_nil
end
context 'when chunk is a final one' do
before do
create(:ci_build_pending_state, build: build)
end
it 'persists the data' do
subject
expect(build_trace_chunk.fog?).to be_truthy
end
end
end
end
......@@ -573,6 +587,18 @@ RSpec.describe Ci::BuildTraceChunk, :clean_gitlab_redis_shared_state do
expect(Ci::BuildTraceChunks::Database.new.data(build_trace_chunk)).to eq(data)
expect(Ci::BuildTraceChunks::Fog.new.data(build_trace_chunk)).to be_nil
end
context 'when chunk is a final one' do
before do
create(:ci_build_pending_state, build: build)
end
it 'persists the data' do
subject
expect(build_trace_chunk.fog?).to be_truthy
end
end
end
end
......@@ -622,6 +648,54 @@ RSpec.describe Ci::BuildTraceChunk, :clean_gitlab_redis_shared_state do
end
end
describe 'final?' do
let(:build) { create(:ci_build, :running) }
context 'when build pending state exists' do
before do
create(:ci_build_pending_state, build: build)
end
context 'when chunks is not the last one' do
before do
create(:ci_build_trace_chunk, chunk_index: 1, build: build)
end
it 'is not a final chunk' do
expect(build.reload.pending_state).to be_present
expect(build_trace_chunk).not_to be_final
end
end
context 'when chunks is the last one' do
it 'is a final chunk' do
expect(build.reload.pending_state).to be_present
expect(build_trace_chunk).to be_final
end
end
end
context 'when build pending state does not exist' do
context 'when chunks is not the last one' do
before do
create(:ci_build_trace_chunk, chunk_index: 1, build: build)
end
it 'is not a final chunk' do
expect(build.reload.pending_state).not_to be_present
expect(build_trace_chunk).not_to be_final
end
end
context 'when chunks is the last one' do
it 'is not a final chunk' do
expect(build.reload.pending_state).not_to be_present
expect(build_trace_chunk).not_to be_final
end
end
end
end
describe 'deletes data in redis after a parent record destroyed' do
let(:project) { create(:project) }
......
......@@ -105,6 +105,67 @@ RSpec.describe API::Ci::Runner, :clean_gitlab_redis_shared_state do
it { expect(job).to be_unmet_prerequisites }
end
context 'when unmigrated live trace chunks exist' do
context 'when accepting trace feature is enabled' do
before do
stub_feature_flags(ci_accept_trace: true)
end
context 'when checksum is present' do
context 'when live trace chunk is still live' do
it 'responds with 202' do
update_job(state: 'success', checksum: 'crc32:12345678')
expect(job.pending_state).to be_present
expect(response).to have_gitlab_http_status(:accepted)
end
end
context 'when runner retries request after receiving 202' do
it 'responds with 202 and then with 200', :sidekiq_inline do
perform_enqueued_jobs do
update_job(state: 'success', checksum: 'crc32:12345678')
end
expect(job.reload.pending_state).to be_present
expect(response).to have_gitlab_http_status(:accepted)
perform_enqueued_jobs do
update_job(state: 'success', checksum: 'crc32:12345678')
end
expect(job.reload.pending_state).not_to be_present
expect(response).to have_gitlab_http_status(:ok)
end
end
context 'when live trace chunk has been migrated' do
before do
job.trace_chunks.first.update!(data_store: :database)
end
it 'responds with 200' do
update_job(state: 'success', checksum: 'crc:12345678')
expect(job.reload).to be_success
expect(job.pending_state).not_to be_present
expect(response).to have_gitlab_http_status(:ok)
end
end
end
context 'when checksum is not present' do
it 'responds with 200' do
update_job(state: 'success')
expect(job.reload).to be_success
expect(job.pending_state).not_to be_present
expect(response).to have_gitlab_http_status(:ok)
end
end
end
end
end
context 'when trace is given' do
......
......@@ -50,7 +50,7 @@ RSpec.describe Ci::RetryBuildService do
metadata runner_session trace_chunks upstream_pipeline_id
artifacts_file artifacts_metadata artifacts_size commands
resource resource_group_id processed security_scans author
pipeline_id report_results].freeze
pipeline_id report_results pending_state].freeze
shared_examples 'build duplication' do
let(:another_pipeline) { create(:ci_empty_pipeline, project: project) }
......
# frozen_string_literal: true
require 'spec_helper'
RSpec.describe Ci::UpdateBuildStateService do
let(:project) { create(:project) }
let(:pipeline) { create(:ci_pipeline, project: project) }
let(:build) { create(:ci_build, :running, pipeline: pipeline) }
let(:metrics) { spy('metrics') }
subject { described_class.new(build, params, metrics) }
before do
stub_feature_flags(ci_enable_live_trace: true)
end
context 'when build does not have checksum' do
context 'when state has changed' do
let(:params) { { state: 'success' } }
it 'updates a state of a running build' do
subject.execute
expect(build).to be_success
end
it 'returns 200 OK status' do
result = subject.execute
expect(result.status).to eq 200
end
it 'does not increment finalized trace metric' do
subject.execute
expect(metrics)
.not_to have_received(:increment_trace_operation)
.with(operation: :finalized)
end
end
context 'when it is a heartbeat request' do
let(:params) { { state: 'success' } }
it 'updates a build timestamp' do
expect { subject.execute }.to change { build.updated_at }
end
end
context 'when request payload carries a trace' do
let(:params) { { state: 'success', trace: 'overwritten' } }
it 'overwrites a trace and updates trace operation metric' do
result = subject.execute
expect(build.trace.raw).to eq 'overwritten'
expect(result.status).to eq 200
expect(metrics)
.to have_received(:increment_trace_operation)
.with(operation: :overwrite)
end
end
context 'when state is unknown' do
let(:params) { { state: 'unknown' } }
it 'responds with 400 bad request' do
result = subject.execute
expect(result.status).to eq 400
expect(build).to be_running
end
end
end
context 'when build has a checksum' do
let(:params) do
{ checksum: 'crc32:12345678', state: 'failed', failure_reason: 'script_failure' }
end
context 'when build trace has been migrated' do
before do
create(:ci_build_trace_chunk, :database_with_data, build: build)
end
it 'updates a build state' do
subject.execute
expect(build).to be_failed
end
it 'responds with 200 OK status' do
result = subject.execute
expect(result.status).to eq 200
end
it 'increments trace finalized operation metric' do
subject.execute
expect(metrics)
.to have_received(:increment_trace_operation)
.with(operation: :finalized)
end
end
context 'when build trace has not been migrated yet' do
before do
create(:ci_build_trace_chunk, :redis_with_data, build: build)
end
it 'does not update a build state' do
subject.execute
expect(build).to be_running
end
it 'responds with 202 accepted' do
result = subject.execute
expect(result.status).to eq 202
end
it 'schedules live chunks for migration' do
expect(Ci::BuildTraceChunkFlushWorker)
.to receive(:perform_async)
.with(build.trace_chunks.first.id)
subject.execute
end
it 'increments trace accepted operation metric' do
subject.execute
expect(metrics)
.to have_received(:increment_trace_operation)
.with(operation: :accepted)
end
it 'creates a pending state record' do
subject.execute
build.pending_state.then do |status|
expect(status).to be_present
expect(status.state).to eq 'failed'
expect(status.trace_checksum).to eq 'crc32:12345678'
expect(status.failure_reason).to eq 'script_failure'
end
end
context 'when build pending state is outdated' do
before do
build.create_pending_state(
state: 'failed',
trace_checksum: 'crc32:12345678',
failure_reason: 'script_failure',
created_at: 10.minutes.ago
)
end
it 'responds with 200 OK' do
result = subject.execute
expect(result.status).to eq 200
end
it 'updates build state' do
subject.execute
expect(build.reload).to be_failed
expect(build.failure_reason).to eq 'script_failure'
end
it 'increments discarded traces metric' do
subject.execute
expect(metrics)
.to have_received(:increment_trace_operation)
.with(operation: :discarded)
end
it 'does not increment finalized trace metric' do
subject.execute
expect(metrics)
.not_to have_received(:increment_trace_operation)
.with(operation: :finalized)
end
end
context 'when build pending state has changes' do
before do
build.create_pending_state(
state: 'success',
created_at: 10.minutes.ago
)
end
it 'uses stored state and responds with 200 OK' do
result = subject.execute
expect(result.status).to eq 200
end
it 'increments conflict trace metric' do
subject.execute
expect(metrics)
.to have_received(:increment_trace_operation)
.with(operation: :conflict)
end
end
context 'when live traces are disabled' do
before do
stub_feature_flags(ci_enable_live_trace: false)
end
it 'responds with 200 OK' do
result = subject.execute
expect(result.status).to eq 200
end
end
end
end
end
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment