Commit 8ddcbf40 authored by Fabio Pitino's avatar Fabio Pitino

Merge branch 'ci-skip-processing-failed-traces' into 'master'

Skip archiving job traces after five failed attempts

See merge request gitlab-org/gitlab!68285
parents 5adf195b ccf6b258
...@@ -724,6 +724,10 @@ module Ci ...@@ -724,6 +724,10 @@ module Ci
update_column(:trace, nil) update_column(:trace, nil)
end end
def ensure_trace_metadata!
Ci::BuildTraceMetadata.find_or_upsert_for!(id)
end
def artifacts_expose_as def artifacts_expose_as
options.dig(:artifacts, :expose_as) options.dig(:artifacts, :expose_as)
end end
......
...@@ -2,6 +2,7 @@ ...@@ -2,6 +2,7 @@
module Ci module Ci
class BuildTraceMetadata < Ci::ApplicationRecord class BuildTraceMetadata < Ci::ApplicationRecord
MAX_ATTEMPTS = 5
self.table_name = 'ci_build_trace_metadata' self.table_name = 'ci_build_trace_metadata'
self.primary_key = :build_id self.primary_key = :build_id
...@@ -9,5 +10,49 @@ module Ci ...@@ -9,5 +10,49 @@ module Ci
belongs_to :trace_artifact, class_name: 'Ci::JobArtifact' belongs_to :trace_artifact, class_name: 'Ci::JobArtifact'
validates :build, presence: true validates :build, presence: true
validates :archival_attempts, presence: true
def self.find_or_upsert_for!(build_id)
record = find_by(build_id: build_id)
return record if record
upsert({ build_id: build_id }, unique_by: :build_id)
find_by!(build_id: build_id)
end
# The job is retried around 5 times during the 7 days retention period for
# trace chunks as defined in `Ci::BuildTraceChunks::RedisBase::CHUNK_REDIS_TTL`
def can_attempt_archival_now?
return false unless archival_attempts_available?
return true unless last_archival_attempt_at
last_archival_attempt_at + backoff < Time.current
end
def archival_attempts_available?
archival_attempts <= MAX_ATTEMPTS
end
def increment_archival_attempts!
increment!(:archival_attempts, touch: :last_archival_attempt_at)
end
def track_archival!(trace_artifact_id)
update!(trace_artifact_id: trace_artifact_id, archived_at: Time.current)
end
def archival_attempts_message
if archival_attempts_available?
'The job can not be archived right now.'
else
'The job is out of archival attempts.'
end
end
private
def backoff
::Gitlab::Ci::Trace::Backoff.new(archival_attempts).value_with_jitter
end
end end
end end
...@@ -3,6 +3,13 @@ ...@@ -3,6 +3,13 @@
module Ci module Ci
class ArchiveTraceService class ArchiveTraceService
def execute(job, worker_name:) def execute(job, worker_name:)
unless job.trace.can_attempt_archival_now?
Sidekiq.logger.warn(class: worker_name,
message: job.trace.archival_attempts_message,
job_id: job.id)
return
end
# TODO: Remove this logging once we confirmed new live trace architecture is functional. # TODO: Remove this logging once we confirmed new live trace architecture is functional.
# See https://gitlab.com/gitlab-com/gl-infra/infrastructure/issues/4667. # See https://gitlab.com/gitlab-com/gl-infra/infrastructure/issues/4667.
unless job.has_live_trace? unless job.has_live_trace?
...@@ -25,6 +32,8 @@ module Ci ...@@ -25,6 +32,8 @@ module Ci
rescue ::Gitlab::Ci::Trace::AlreadyArchivedError rescue ::Gitlab::Ci::Trace::AlreadyArchivedError
# It's already archived, thus we can safely ignore this exception. # It's already archived, thus we can safely ignore this exception.
rescue StandardError => e rescue StandardError => e
job.trace.increment_archival_attempts!
# Tracks this error with application logs, Sentry, and Prometheus. # Tracks this error with application logs, Sentry, and Prometheus.
# If `archive!` keeps failing for over a week, that could incur data loss. # If `archive!` keeps failing for over a week, that could incur data loss.
# (See more https://docs.gitlab.com/ee/administration/job_logs.html#new-incremental-logging-architecture) # (See more https://docs.gitlab.com/ee/administration/job_logs.html#new-incremental-logging-architecture)
......
# frozen_string_literal: true
class AddTimestampColumnsToCiBuildTraceMetadata < Gitlab::Database::Migration[1.0]
def change
add_column :ci_build_trace_metadata, :last_archival_attempt_at, :datetime_with_timezone
add_column :ci_build_trace_metadata, :archived_at, :datetime_with_timezone
end
end
9fe4e2a3d5c50507220ac8363a9f7975ca1fc87575ee0c2ba8948c6d9bcd7019
\ No newline at end of file
...@@ -11329,7 +11329,9 @@ CREATE TABLE ci_build_trace_metadata ( ...@@ -11329,7 +11329,9 @@ CREATE TABLE ci_build_trace_metadata (
trace_artifact_id bigint, trace_artifact_id bigint,
archival_attempts smallint DEFAULT 0 NOT NULL, archival_attempts smallint DEFAULT 0 NOT NULL,
checksum bytea, checksum bytea,
remote_checksum bytea remote_checksum bytea,
last_archival_attempt_at timestamp with time zone,
archived_at timestamp with time zone
); );
CREATE TABLE ci_builds ( CREATE TABLE ci_builds (
...@@ -4,6 +4,7 @@ module Gitlab ...@@ -4,6 +4,7 @@ module Gitlab
module Ci module Ci
class Trace class Trace
include ::Gitlab::ExclusiveLeaseHelpers include ::Gitlab::ExclusiveLeaseHelpers
include ::Gitlab::Utils::StrongMemoize
include Checksummable include Checksummable
LOCK_TTL = 10.minutes LOCK_TTL = 10.minutes
...@@ -23,6 +24,8 @@ module Gitlab ...@@ -23,6 +24,8 @@ module Gitlab
attr_reader :job attr_reader :job
delegate :old_trace, to: :job delegate :old_trace, to: :job
delegate :can_attempt_archival_now?, :increment_archival_attempts!,
:archival_attempts_message, to: :trace_metadata
def initialize(job) def initialize(job)
@job = job @job = job
...@@ -251,11 +254,19 @@ module Gitlab ...@@ -251,11 +254,19 @@ module Gitlab
File.open(path) do |stream| File.open(path) do |stream|
# TODO: Set `file_format: :raw` after we've cleaned up legacy traces migration # TODO: Set `file_format: :raw` after we've cleaned up legacy traces migration
# https://gitlab.com/gitlab-org/gitlab-foss/merge_requests/20307 # https://gitlab.com/gitlab-org/gitlab-foss/merge_requests/20307
job.create_job_artifacts_trace!( trace_artifact = job.create_job_artifacts_trace!(
project: job.project, project: job.project,
file_type: :trace, file_type: :trace,
file: stream, file: stream,
file_sha256: self.class.hexdigest(path)) file_sha256: self.class.hexdigest(path))
trace_metadata.track_archival!(trace_artifact.id)
end
end
def trace_metadata
strong_memoize(:trace_metadata) do
job.ensure_trace_metadata!
end end
end end
......
# frozen_string_literal: true
module Gitlab
module Ci
class Trace
##
# Trace::Backoff class is responsible for calculating a backoff value
# for when to be able to retry archiving a build's trace
#
# Because we're updating `last_archival_attempt_at` timestamp with every
# failed archival attempt, we need to be sure that sum of the backoff values
# for 1..MAX_ATTEMPTS is under 7 days(CHUNK_REDIS_TTL).
#
class Backoff
include Gitlab::Utils::StrongMemoize
MAX_JITTER_VALUE = 4
attr_reader :archival_attempts
def initialize(archival_attempts)
@archival_attempts = archival_attempts
end
def value
(((chunks_ttl / (3.5 * max_attempts)) * archival_attempts) / 1.hour).hours
end
# This formula generates an increasing delay between executions
# 9.6, 19.2, 28.8, 38.4, 48.0 + a random amount of time to
# change the order of execution for the jobs.
# With maximum value for each call to rand(4), this sums up to 6.8 days
# and with minimum values is 6 days.
#
def value_with_jitter
value + jitter
end
private
def jitter
rand(MAX_JITTER_VALUE).hours
end
def chunks_ttl
::Ci::BuildTraceChunks::RedisBase::CHUNK_REDIS_TTL
end
def max_attempts
::Ci::BuildTraceMetadata::MAX_ATTEMPTS
end
end
end
end
end
# frozen_string_literal: true
FactoryBot.define do
factory :ci_build_trace_metadata, class: 'Ci::BuildTraceMetadata' do
build factory: :ci_build
end
end
# frozen_string_literal: true
require 'spec_helper'
RSpec.describe Gitlab::Ci::Trace::Backoff do
using RSpec::Parameterized::TableSyntax
subject(:backoff) { described_class.new(archival_attempts) }
it 'keeps the MAX_ATTEMPTS limit in sync' do
expect(Ci::BuildTraceMetadata::MAX_ATTEMPTS).to eq(5)
end
it 'keeps the Redis TTL limit in sync' do
expect(Ci::BuildTraceChunks::RedisBase::CHUNK_REDIS_TTL).to eq(7.days)
end
describe '#value' do
where(:archival_attempts, :result) do
1 | 9.6
2 | 19.2
3 | 28.8
4 | 38.4
5 | 48.0
end
with_them do
subject { backoff.value }
it { is_expected.to eq(result.hours) }
end
end
describe '#value_with_jitter' do
where(:archival_attempts, :min_value, :max_value) do
1 | 9.6 | 13.6
2 | 19.2 | 23.2
3 | 28.8 | 32.8
4 | 38.4 | 42.4
5 | 48.0 | 52.0
end
with_them do
subject { backoff.value_with_jitter }
it { is_expected.to be_in(min_value.hours..max_value.hours) }
end
end
it 'all retries are happening under the 7 days limit' do
backoff_total = 1.upto(Ci::BuildTraceMetadata::MAX_ATTEMPTS).sum do |attempt|
backoff = described_class.new(attempt)
expect(backoff).to receive(:rand)
.with(described_class::MAX_JITTER_VALUE)
.and_return(described_class::MAX_JITTER_VALUE)
backoff.value_with_jitter
end
expect(backoff_total).to be < Ci::BuildTraceChunks::RedisBase::CHUNK_REDIS_TTL
end
end
...@@ -130,4 +130,18 @@ RSpec.describe Gitlab::Ci::Trace, :clean_gitlab_redis_shared_state, factory_defa ...@@ -130,4 +130,18 @@ RSpec.describe Gitlab::Ci::Trace, :clean_gitlab_redis_shared_state, factory_defa
end end
end end
end end
describe '#can_attempt_archival_now?' do
it 'creates the record and returns true' do
expect(trace.can_attempt_archival_now?).to be_truthy
end
end
describe '#increment_archival_attempts!' do
it 'creates the record and increments its value' do
expect { trace.increment_archival_attempts! }
.to change { build.reload.trace_metadata&.archival_attempts }.from(nil).to(1)
.and change { build.reload.trace_metadata&.last_archival_attempt_at }
end
end
end end
...@@ -5259,4 +5259,14 @@ RSpec.describe Ci::Build do ...@@ -5259,4 +5259,14 @@ RSpec.describe Ci::Build do
expect(described_class.with_coverage_regex).to eq([build_with_coverage_regex]) expect(described_class.with_coverage_regex).to eq([build_with_coverage_regex])
end end
end end
describe '#ensure_trace_metadata!' do
it 'delegates to Ci::BuildTraceMetadata' do
expect(Ci::BuildTraceMetadata)
.to receive(:find_or_upsert_for!)
.with(build.id)
build.ensure_trace_metadata!
end
end
end end
...@@ -7,4 +7,128 @@ RSpec.describe Ci::BuildTraceMetadata do ...@@ -7,4 +7,128 @@ RSpec.describe Ci::BuildTraceMetadata do
it { is_expected.to belong_to(:trace_artifact) } it { is_expected.to belong_to(:trace_artifact) }
it { is_expected.to validate_presence_of(:build) } it { is_expected.to validate_presence_of(:build) }
it { is_expected.to validate_presence_of(:archival_attempts) }
describe '#can_attempt_archival_now?' do
let(:metadata) do
build(:ci_build_trace_metadata,
archival_attempts: archival_attempts,
last_archival_attempt_at: last_archival_attempt_at)
end
subject { metadata.can_attempt_archival_now? }
context 'when archival_attempts is over the limit' do
let(:archival_attempts) { described_class::MAX_ATTEMPTS + 1 }
let(:last_archival_attempt_at) {}
it { is_expected.to be_falsey }
end
context 'when last_archival_attempt_at is not set' do
let(:archival_attempts) { described_class::MAX_ATTEMPTS }
let(:last_archival_attempt_at) {}
it { is_expected.to be_truthy }
end
context 'when last_archival_attempt_at is set' do
let(:archival_attempts) { described_class::MAX_ATTEMPTS }
let(:last_archival_attempt_at) { 6.days.ago }
it { is_expected.to be_truthy }
end
context 'when last_archival_attempt_at is too close' do
let(:archival_attempts) { described_class::MAX_ATTEMPTS }
let(:last_archival_attempt_at) { 1.hour.ago }
it { is_expected.to be_falsey }
end
end
describe '#archival_attempts_available?' do
let(:metadata) do
build(:ci_build_trace_metadata, archival_attempts: archival_attempts)
end
subject { metadata.archival_attempts_available? }
context 'when archival_attempts is over the limit' do
let(:archival_attempts) { described_class::MAX_ATTEMPTS + 1 }
it { is_expected.to be_falsey }
end
context 'when archival_attempts is at the limit' do
let(:archival_attempts) { described_class::MAX_ATTEMPTS }
it { is_expected.to be_truthy }
end
end
describe '#increment_archival_attempts!' do
let_it_be(:metadata) do
create(:ci_build_trace_metadata,
archival_attempts: 2,
last_archival_attempt_at: 1.day.ago)
end
it 'increments the attempts' do
expect { metadata.increment_archival_attempts! }
.to change { metadata.reload.archival_attempts }
end
it 'updates the last_archival_attempt_at timestamp' do
expect { metadata.increment_archival_attempts! }
.to change { metadata.reload.last_archival_attempt_at }
end
end
describe '#track_archival!' do
let(:trace_artifact) { create(:ci_job_artifact) }
let(:metadata) { create(:ci_build_trace_metadata) }
it 'stores the artifact id and timestamp' do
expect(metadata.trace_artifact_id).to be_nil
metadata.track_archival!(trace_artifact.id)
metadata.reload
expect(metadata.trace_artifact_id).to eq(trace_artifact.id)
expect(metadata.archived_at).to be_like_time(Time.current)
end
end
describe '.find_or_upsert_for!' do
let_it_be(:build) { create(:ci_build) }
subject(:execute) do
described_class.find_or_upsert_for!(build.id)
end
it 'creates a new record' do
metadata = execute
expect(metadata).to be_a(described_class)
expect(metadata.id).to eq(build.id)
expect(metadata.archival_attempts).to eq(0)
end
context 'with existing records' do
before do
create(:ci_build_trace_metadata,
build: build,
archival_attempts: described_class::MAX_ATTEMPTS)
end
it 'returns the existing record' do
metadata = execute
expect(metadata).to be_a(described_class)
expect(metadata.id).to eq(build.id)
expect(metadata.archival_attempts).to eq(described_class::MAX_ATTEMPTS)
end
end
end
end end
...@@ -12,6 +12,7 @@ RSpec.describe Ci::ArchiveTraceService, '#execute' do ...@@ -12,6 +12,7 @@ RSpec.describe Ci::ArchiveTraceService, '#execute' do
expect { subject }.not_to raise_error expect { subject }.not_to raise_error
expect(job.reload.job_artifacts_trace).to be_exist expect(job.reload.job_artifacts_trace).to be_exist
expect(job.trace_metadata.trace_artifact).to eq(job.job_artifacts_trace)
end end
context 'when trace is already archived' do context 'when trace is already archived' do
...@@ -59,6 +60,54 @@ RSpec.describe Ci::ArchiveTraceService, '#execute' do ...@@ -59,6 +60,54 @@ RSpec.describe Ci::ArchiveTraceService, '#execute' do
end end
end end
context 'when the job is out of archival attempts' do
before do
create(:ci_build_trace_metadata,
build: job,
archival_attempts: Ci::BuildTraceMetadata::MAX_ATTEMPTS + 1,
last_archival_attempt_at: 1.week.ago)
end
it 'skips archiving' do
expect(job.trace).not_to receive(:archive!)
subject
end
it 'leaves a warning message in sidekiq log' do
expect(Sidekiq.logger).to receive(:warn).with(
class: Ci::ArchiveTraceWorker.name,
message: 'The job is out of archival attempts.',
job_id: job.id)
subject
end
end
context 'when the archival process is backed off' do
before do
create(:ci_build_trace_metadata,
build: job,
archival_attempts: Ci::BuildTraceMetadata::MAX_ATTEMPTS - 1,
last_archival_attempt_at: 1.hour.ago)
end
it 'skips archiving' do
expect(job.trace).not_to receive(:archive!)
subject
end
it 'leaves a warning message in sidekiq log' do
expect(Sidekiq.logger).to receive(:warn).with(
class: Ci::ArchiveTraceWorker.name,
message: 'The job can not be archived right now.',
job_id: job.id)
subject
end
end
context 'when job failed to archive trace but did not raise an exception' do context 'when job failed to archive trace but did not raise an exception' do
before do before do
allow_next_instance_of(Gitlab::Ci::Trace) do |instance| allow_next_instance_of(Gitlab::Ci::Trace) do |instance|
...@@ -98,6 +147,7 @@ RSpec.describe Ci::ArchiveTraceService, '#execute' do ...@@ -98,6 +147,7 @@ RSpec.describe Ci::ArchiveTraceService, '#execute' do
.and_call_original .and_call_original
expect { subject }.not_to raise_error expect { subject }.not_to raise_error
expect(job.trace_metadata.archival_attempts).to eq(1)
end end
end end
end end
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment