Commit ccf6b258 authored by Marius Bobin's avatar Marius Bobin Committed by Fabio Pitino

Skip archiving job traces after five failed attempts

parent 177ceaff
......@@ -724,6 +724,10 @@ module Ci
update_column(:trace, nil)
end
def ensure_trace_metadata!
Ci::BuildTraceMetadata.find_or_upsert_for!(id)
end
def artifacts_expose_as
options.dig(:artifacts, :expose_as)
end
......
......@@ -2,6 +2,7 @@
module Ci
class BuildTraceMetadata < Ci::ApplicationRecord
MAX_ATTEMPTS = 5
self.table_name = 'ci_build_trace_metadata'
self.primary_key = :build_id
......@@ -9,5 +10,49 @@ module Ci
belongs_to :trace_artifact, class_name: 'Ci::JobArtifact'
validates :build, presence: true
validates :archival_attempts, presence: true
def self.find_or_upsert_for!(build_id)
record = find_by(build_id: build_id)
return record if record
upsert({ build_id: build_id }, unique_by: :build_id)
find_by!(build_id: build_id)
end
# The job is retried around 5 times during the 7 days retention period for
# trace chunks as defined in `Ci::BuildTraceChunks::RedisBase::CHUNK_REDIS_TTL`
def can_attempt_archival_now?
return false unless archival_attempts_available?
return true unless last_archival_attempt_at
last_archival_attempt_at + backoff < Time.current
end
def archival_attempts_available?
archival_attempts <= MAX_ATTEMPTS
end
def increment_archival_attempts!
increment!(:archival_attempts, touch: :last_archival_attempt_at)
end
def track_archival!(trace_artifact_id)
update!(trace_artifact_id: trace_artifact_id, archived_at: Time.current)
end
def archival_attempts_message
if archival_attempts_available?
'The job can not be archived right now.'
else
'The job is out of archival attempts.'
end
end
private
def backoff
::Gitlab::Ci::Trace::Backoff.new(archival_attempts).value_with_jitter
end
end
end
......@@ -3,6 +3,13 @@
module Ci
class ArchiveTraceService
def execute(job, worker_name:)
unless job.trace.can_attempt_archival_now?
Sidekiq.logger.warn(class: worker_name,
message: job.trace.archival_attempts_message,
job_id: job.id)
return
end
# TODO: Remove this logging once we confirmed new live trace architecture is functional.
# See https://gitlab.com/gitlab-com/gl-infra/infrastructure/issues/4667.
unless job.has_live_trace?
......@@ -25,6 +32,8 @@ module Ci
rescue ::Gitlab::Ci::Trace::AlreadyArchivedError
# It's already archived, thus we can safely ignore this exception.
rescue StandardError => e
job.trace.increment_archival_attempts!
# Tracks this error with application logs, Sentry, and Prometheus.
# If `archive!` keeps failing for over a week, that could incur data loss.
# (See more https://docs.gitlab.com/ee/administration/job_logs.html#new-incremental-logging-architecture)
......
# frozen_string_literal: true
class AddTimestampColumnsToCiBuildTraceMetadata < Gitlab::Database::Migration[1.0]
def change
add_column :ci_build_trace_metadata, :last_archival_attempt_at, :datetime_with_timezone
add_column :ci_build_trace_metadata, :archived_at, :datetime_with_timezone
end
end
9fe4e2a3d5c50507220ac8363a9f7975ca1fc87575ee0c2ba8948c6d9bcd7019
\ No newline at end of file
......@@ -11329,7 +11329,9 @@ CREATE TABLE ci_build_trace_metadata (
trace_artifact_id bigint,
archival_attempts smallint DEFAULT 0 NOT NULL,
checksum bytea,
remote_checksum bytea
remote_checksum bytea,
last_archival_attempt_at timestamp with time zone,
archived_at timestamp with time zone
);
CREATE TABLE ci_builds (
......@@ -4,6 +4,7 @@ module Gitlab
module Ci
class Trace
include ::Gitlab::ExclusiveLeaseHelpers
include ::Gitlab::Utils::StrongMemoize
include Checksummable
LOCK_TTL = 10.minutes
......@@ -23,6 +24,8 @@ module Gitlab
attr_reader :job
delegate :old_trace, to: :job
delegate :can_attempt_archival_now?, :increment_archival_attempts!,
:archival_attempts_message, to: :trace_metadata
def initialize(job)
@job = job
......@@ -251,11 +254,19 @@ module Gitlab
File.open(path) do |stream|
# TODO: Set `file_format: :raw` after we've cleaned up legacy traces migration
# https://gitlab.com/gitlab-org/gitlab-foss/merge_requests/20307
job.create_job_artifacts_trace!(
trace_artifact = job.create_job_artifacts_trace!(
project: job.project,
file_type: :trace,
file: stream,
file_sha256: self.class.hexdigest(path))
trace_metadata.track_archival!(trace_artifact.id)
end
end
def trace_metadata
strong_memoize(:trace_metadata) do
job.ensure_trace_metadata!
end
end
......
# frozen_string_literal: true
module Gitlab
module Ci
class Trace
##
# Trace::Backoff class is responsible for calculating a backoff value
# for when to be able to retry archiving a build's trace
#
# Because we're updating `last_archival_attempt_at` timestamp with every
# failed archival attempt, we need to be sure that sum of the backoff values
# for 1..MAX_ATTEMPTS is under 7 days(CHUNK_REDIS_TTL).
#
class Backoff
include Gitlab::Utils::StrongMemoize
MAX_JITTER_VALUE = 4
attr_reader :archival_attempts
def initialize(archival_attempts)
@archival_attempts = archival_attempts
end
def value
(((chunks_ttl / (3.5 * max_attempts)) * archival_attempts) / 1.hour).hours
end
# This formula generates an increasing delay between executions
# 9.6, 19.2, 28.8, 38.4, 48.0 + a random amount of time to
# change the order of execution for the jobs.
# With maximum value for each call to rand(4), this sums up to 6.8 days
# and with minimum values is 6 days.
#
def value_with_jitter
value + jitter
end
private
def jitter
rand(MAX_JITTER_VALUE).hours
end
def chunks_ttl
::Ci::BuildTraceChunks::RedisBase::CHUNK_REDIS_TTL
end
def max_attempts
::Ci::BuildTraceMetadata::MAX_ATTEMPTS
end
end
end
end
end
# frozen_string_literal: true
FactoryBot.define do
factory :ci_build_trace_metadata, class: 'Ci::BuildTraceMetadata' do
build factory: :ci_build
end
end
# frozen_string_literal: true
require 'spec_helper'
RSpec.describe Gitlab::Ci::Trace::Backoff do
using RSpec::Parameterized::TableSyntax
subject(:backoff) { described_class.new(archival_attempts) }
it 'keeps the MAX_ATTEMPTS limit in sync' do
expect(Ci::BuildTraceMetadata::MAX_ATTEMPTS).to eq(5)
end
it 'keeps the Redis TTL limit in sync' do
expect(Ci::BuildTraceChunks::RedisBase::CHUNK_REDIS_TTL).to eq(7.days)
end
describe '#value' do
where(:archival_attempts, :result) do
1 | 9.6
2 | 19.2
3 | 28.8
4 | 38.4
5 | 48.0
end
with_them do
subject { backoff.value }
it { is_expected.to eq(result.hours) }
end
end
describe '#value_with_jitter' do
where(:archival_attempts, :min_value, :max_value) do
1 | 9.6 | 13.6
2 | 19.2 | 23.2
3 | 28.8 | 32.8
4 | 38.4 | 42.4
5 | 48.0 | 52.0
end
with_them do
subject { backoff.value_with_jitter }
it { is_expected.to be_in(min_value.hours..max_value.hours) }
end
end
it 'all retries are happening under the 7 days limit' do
backoff_total = 1.upto(Ci::BuildTraceMetadata::MAX_ATTEMPTS).sum do |attempt|
backoff = described_class.new(attempt)
expect(backoff).to receive(:rand)
.with(described_class::MAX_JITTER_VALUE)
.and_return(described_class::MAX_JITTER_VALUE)
backoff.value_with_jitter
end
expect(backoff_total).to be < Ci::BuildTraceChunks::RedisBase::CHUNK_REDIS_TTL
end
end
......@@ -130,4 +130,18 @@ RSpec.describe Gitlab::Ci::Trace, :clean_gitlab_redis_shared_state, factory_defa
end
end
end
describe '#can_attempt_archival_now?' do
it 'creates the record and returns true' do
expect(trace.can_attempt_archival_now?).to be_truthy
end
end
describe '#increment_archival_attempts!' do
it 'creates the record and increments its value' do
expect { trace.increment_archival_attempts! }
.to change { build.reload.trace_metadata&.archival_attempts }.from(nil).to(1)
.and change { build.reload.trace_metadata&.last_archival_attempt_at }
end
end
end
......@@ -5259,4 +5259,14 @@ RSpec.describe Ci::Build do
expect(described_class.with_coverage_regex).to eq([build_with_coverage_regex])
end
end
describe '#ensure_trace_metadata!' do
it 'delegates to Ci::BuildTraceMetadata' do
expect(Ci::BuildTraceMetadata)
.to receive(:find_or_upsert_for!)
.with(build.id)
build.ensure_trace_metadata!
end
end
end
......@@ -7,4 +7,128 @@ RSpec.describe Ci::BuildTraceMetadata do
it { is_expected.to belong_to(:trace_artifact) }
it { is_expected.to validate_presence_of(:build) }
it { is_expected.to validate_presence_of(:archival_attempts) }
describe '#can_attempt_archival_now?' do
let(:metadata) do
build(:ci_build_trace_metadata,
archival_attempts: archival_attempts,
last_archival_attempt_at: last_archival_attempt_at)
end
subject { metadata.can_attempt_archival_now? }
context 'when archival_attempts is over the limit' do
let(:archival_attempts) { described_class::MAX_ATTEMPTS + 1 }
let(:last_archival_attempt_at) {}
it { is_expected.to be_falsey }
end
context 'when last_archival_attempt_at is not set' do
let(:archival_attempts) { described_class::MAX_ATTEMPTS }
let(:last_archival_attempt_at) {}
it { is_expected.to be_truthy }
end
context 'when last_archival_attempt_at is set' do
let(:archival_attempts) { described_class::MAX_ATTEMPTS }
let(:last_archival_attempt_at) { 6.days.ago }
it { is_expected.to be_truthy }
end
context 'when last_archival_attempt_at is too close' do
let(:archival_attempts) { described_class::MAX_ATTEMPTS }
let(:last_archival_attempt_at) { 1.hour.ago }
it { is_expected.to be_falsey }
end
end
describe '#archival_attempts_available?' do
let(:metadata) do
build(:ci_build_trace_metadata, archival_attempts: archival_attempts)
end
subject { metadata.archival_attempts_available? }
context 'when archival_attempts is over the limit' do
let(:archival_attempts) { described_class::MAX_ATTEMPTS + 1 }
it { is_expected.to be_falsey }
end
context 'when archival_attempts is at the limit' do
let(:archival_attempts) { described_class::MAX_ATTEMPTS }
it { is_expected.to be_truthy }
end
end
describe '#increment_archival_attempts!' do
let_it_be(:metadata) do
create(:ci_build_trace_metadata,
archival_attempts: 2,
last_archival_attempt_at: 1.day.ago)
end
it 'increments the attempts' do
expect { metadata.increment_archival_attempts! }
.to change { metadata.reload.archival_attempts }
end
it 'updates the last_archival_attempt_at timestamp' do
expect { metadata.increment_archival_attempts! }
.to change { metadata.reload.last_archival_attempt_at }
end
end
describe '#track_archival!' do
let(:trace_artifact) { create(:ci_job_artifact) }
let(:metadata) { create(:ci_build_trace_metadata) }
it 'stores the artifact id and timestamp' do
expect(metadata.trace_artifact_id).to be_nil
metadata.track_archival!(trace_artifact.id)
metadata.reload
expect(metadata.trace_artifact_id).to eq(trace_artifact.id)
expect(metadata.archived_at).to be_like_time(Time.current)
end
end
describe '.find_or_upsert_for!' do
let_it_be(:build) { create(:ci_build) }
subject(:execute) do
described_class.find_or_upsert_for!(build.id)
end
it 'creates a new record' do
metadata = execute
expect(metadata).to be_a(described_class)
expect(metadata.id).to eq(build.id)
expect(metadata.archival_attempts).to eq(0)
end
context 'with existing records' do
before do
create(:ci_build_trace_metadata,
build: build,
archival_attempts: described_class::MAX_ATTEMPTS)
end
it 'returns the existing record' do
metadata = execute
expect(metadata).to be_a(described_class)
expect(metadata.id).to eq(build.id)
expect(metadata.archival_attempts).to eq(described_class::MAX_ATTEMPTS)
end
end
end
end
......@@ -12,6 +12,7 @@ RSpec.describe Ci::ArchiveTraceService, '#execute' do
expect { subject }.not_to raise_error
expect(job.reload.job_artifacts_trace).to be_exist
expect(job.trace_metadata.trace_artifact).to eq(job.job_artifacts_trace)
end
context 'when trace is already archived' do
......@@ -59,6 +60,54 @@ RSpec.describe Ci::ArchiveTraceService, '#execute' do
end
end
context 'when the job is out of archival attempts' do
before do
create(:ci_build_trace_metadata,
build: job,
archival_attempts: Ci::BuildTraceMetadata::MAX_ATTEMPTS + 1,
last_archival_attempt_at: 1.week.ago)
end
it 'skips archiving' do
expect(job.trace).not_to receive(:archive!)
subject
end
it 'leaves a warning message in sidekiq log' do
expect(Sidekiq.logger).to receive(:warn).with(
class: Ci::ArchiveTraceWorker.name,
message: 'The job is out of archival attempts.',
job_id: job.id)
subject
end
end
context 'when the archival process is backed off' do
before do
create(:ci_build_trace_metadata,
build: job,
archival_attempts: Ci::BuildTraceMetadata::MAX_ATTEMPTS - 1,
last_archival_attempt_at: 1.hour.ago)
end
it 'skips archiving' do
expect(job.trace).not_to receive(:archive!)
subject
end
it 'leaves a warning message in sidekiq log' do
expect(Sidekiq.logger).to receive(:warn).with(
class: Ci::ArchiveTraceWorker.name,
message: 'The job can not be archived right now.',
job_id: job.id)
subject
end
end
context 'when job failed to archive trace but did not raise an exception' do
before do
allow_next_instance_of(Gitlab::Ci::Trace) do |instance|
......@@ -98,6 +147,7 @@ RSpec.describe Ci::ArchiveTraceService, '#execute' do
.and_call_original
expect { subject }.not_to raise_error
expect(job.trace_metadata.archival_attempts).to eq(1)
end
end
end
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment