Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Support
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
G
gitlab-ce
Project overview
Project overview
Details
Activity
Releases
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Issues
0
Issues
0
List
Boards
Labels
Milestones
Merge Requests
1
Merge Requests
1
Analytics
Analytics
Repository
Value Stream
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Create a new issue
Commits
Issue Boards
Open sidebar
nexedi
gitlab-ce
Commits
ccf6b258
Commit
ccf6b258
authored
Sep 06, 2021
by
Marius Bobin
Committed by
Fabio Pitino
Sep 06, 2021
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
Skip archiving job traces after five failed attempts
parent
177ceaff
Changes
14
Hide whitespace changes
Inline
Side-by-side
Showing
14 changed files
with
404 additions
and
2 deletions
+404
-2
app/models/ci/build.rb
app/models/ci/build.rb
+4
-0
app/models/ci/build_trace_metadata.rb
app/models/ci/build_trace_metadata.rb
+45
-0
app/services/ci/archive_trace_service.rb
app/services/ci/archive_trace_service.rb
+9
-0
db/migrate/20210824105038_add_timestamp_columns_to_ci_build_trace_metadata.rb
...05038_add_timestamp_columns_to_ci_build_trace_metadata.rb
+8
-0
db/schema_migrations/20210824105038
db/schema_migrations/20210824105038
+1
-0
db/structure.sql
db/structure.sql
+3
-1
lib/gitlab/ci/trace.rb
lib/gitlab/ci/trace.rb
+12
-1
lib/gitlab/ci/trace/backoff.rb
lib/gitlab/ci/trace/backoff.rb
+55
-0
spec/factories/ci/build_trace_metadata.rb
spec/factories/ci/build_trace_metadata.rb
+7
-0
spec/lib/gitlab/ci/trace/backoff_spec.rb
spec/lib/gitlab/ci/trace/backoff_spec.rb
+62
-0
spec/lib/gitlab/ci/trace_spec.rb
spec/lib/gitlab/ci/trace_spec.rb
+14
-0
spec/models/ci/build_spec.rb
spec/models/ci/build_spec.rb
+10
-0
spec/models/ci/build_trace_metadata_spec.rb
spec/models/ci/build_trace_metadata_spec.rb
+124
-0
spec/services/ci/archive_trace_service_spec.rb
spec/services/ci/archive_trace_service_spec.rb
+50
-0
No files found.
app/models/ci/build.rb
View file @
ccf6b258
...
...
@@ -724,6 +724,10 @@ module Ci
update_column
(
:trace
,
nil
)
end
def
ensure_trace_metadata!
Ci
::
BuildTraceMetadata
.
find_or_upsert_for!
(
id
)
end
def
artifacts_expose_as
options
.
dig
(
:artifacts
,
:expose_as
)
end
...
...
app/models/ci/build_trace_metadata.rb
View file @
ccf6b258
...
...
@@ -2,6 +2,7 @@
module
Ci
class
BuildTraceMetadata
<
Ci
::
ApplicationRecord
MAX_ATTEMPTS
=
5
self
.
table_name
=
'ci_build_trace_metadata'
self
.
primary_key
=
:build_id
...
...
@@ -9,5 +10,49 @@ module Ci
belongs_to
:trace_artifact
,
class_name:
'Ci::JobArtifact'
validates
:build
,
presence:
true
validates
:archival_attempts
,
presence:
true
def
self
.
find_or_upsert_for!
(
build_id
)
record
=
find_by
(
build_id:
build_id
)
return
record
if
record
upsert
({
build_id:
build_id
},
unique_by: :build_id
)
find_by!
(
build_id:
build_id
)
end
# The job is retried around 5 times during the 7 days retention period for
# trace chunks as defined in `Ci::BuildTraceChunks::RedisBase::CHUNK_REDIS_TTL`
def
can_attempt_archival_now?
return
false
unless
archival_attempts_available?
return
true
unless
last_archival_attempt_at
last_archival_attempt_at
+
backoff
<
Time
.
current
end
def
archival_attempts_available?
archival_attempts
<=
MAX_ATTEMPTS
end
def
increment_archival_attempts!
increment!
(
:archival_attempts
,
touch: :last_archival_attempt_at
)
end
def
track_archival!
(
trace_artifact_id
)
update!
(
trace_artifact_id:
trace_artifact_id
,
archived_at:
Time
.
current
)
end
def
archival_attempts_message
if
archival_attempts_available?
'The job can not be archived right now.'
else
'The job is out of archival attempts.'
end
end
private
def
backoff
::
Gitlab
::
Ci
::
Trace
::
Backoff
.
new
(
archival_attempts
).
value_with_jitter
end
end
end
app/services/ci/archive_trace_service.rb
View file @
ccf6b258
...
...
@@ -3,6 +3,13 @@
module
Ci
class
ArchiveTraceService
def
execute
(
job
,
worker_name
:)
unless
job
.
trace
.
can_attempt_archival_now?
Sidekiq
.
logger
.
warn
(
class:
worker_name
,
message:
job
.
trace
.
archival_attempts_message
,
job_id:
job
.
id
)
return
end
# TODO: Remove this logging once we confirmed new live trace architecture is functional.
# See https://gitlab.com/gitlab-com/gl-infra/infrastructure/issues/4667.
unless
job
.
has_live_trace?
...
...
@@ -25,6 +32,8 @@ module Ci
rescue
::
Gitlab
::
Ci
::
Trace
::
AlreadyArchivedError
# It's already archived, thus we can safely ignore this exception.
rescue
StandardError
=>
e
job
.
trace
.
increment_archival_attempts!
# Tracks this error with application logs, Sentry, and Prometheus.
# If `archive!` keeps failing for over a week, that could incur data loss.
# (See more https://docs.gitlab.com/ee/administration/job_logs.html#new-incremental-logging-architecture)
...
...
db/migrate/20210824105038_add_timestamp_columns_to_ci_build_trace_metadata.rb
0 → 100644
View file @
ccf6b258
# frozen_string_literal: true
class
AddTimestampColumnsToCiBuildTraceMetadata
<
Gitlab
::
Database
::
Migration
[
1.0
]
def
change
add_column
:ci_build_trace_metadata
,
:last_archival_attempt_at
,
:datetime_with_timezone
add_column
:ci_build_trace_metadata
,
:archived_at
,
:datetime_with_timezone
end
end
db/schema_migrations/20210824105038
0 → 100644
View file @
ccf6b258
9fe4e2a3d5c50507220ac8363a9f7975ca1fc87575ee0c2ba8948c6d9bcd7019
\ No newline at end of file
db/structure.sql
View file @
ccf6b258
...
...
@@ -11329,7 +11329,9 @@ CREATE TABLE ci_build_trace_metadata (
trace_artifact_id bigint,
archival_attempts smallint DEFAULT 0 NOT NULL,
checksum bytea,
remote_checksum bytea
remote_checksum bytea,
last_archival_attempt_at timestamp with time zone,
archived_at timestamp with time zone
);
CREATE TABLE ci_builds (
lib/gitlab/ci/trace.rb
View file @
ccf6b258
...
...
@@ -4,6 +4,7 @@ module Gitlab
module
Ci
class
Trace
include
::
Gitlab
::
ExclusiveLeaseHelpers
include
::
Gitlab
::
Utils
::
StrongMemoize
include
Checksummable
LOCK_TTL
=
10
.
minutes
...
...
@@ -23,6 +24,8 @@ module Gitlab
attr_reader
:job
delegate
:old_trace
,
to: :job
delegate
:can_attempt_archival_now?
,
:increment_archival_attempts!
,
:archival_attempts_message
,
to: :trace_metadata
def
initialize
(
job
)
@job
=
job
...
...
@@ -251,11 +254,19 @@ module Gitlab
File
.
open
(
path
)
do
|
stream
|
# TODO: Set `file_format: :raw` after we've cleaned up legacy traces migration
# https://gitlab.com/gitlab-org/gitlab-foss/merge_requests/20307
job
.
create_job_artifacts_trace!
(
trace_artifact
=
job
.
create_job_artifacts_trace!
(
project:
job
.
project
,
file_type: :trace
,
file:
stream
,
file_sha256:
self
.
class
.
hexdigest
(
path
))
trace_metadata
.
track_archival!
(
trace_artifact
.
id
)
end
end
def
trace_metadata
strong_memoize
(
:trace_metadata
)
do
job
.
ensure_trace_metadata!
end
end
...
...
lib/gitlab/ci/trace/backoff.rb
0 → 100644
View file @
ccf6b258
# frozen_string_literal: true
module
Gitlab
module
Ci
class
Trace
##
# Trace::Backoff class is responsible for calculating a backoff value
# for when to be able to retry archiving a build's trace
#
# Because we're updating `last_archival_attempt_at` timestamp with every
# failed archival attempt, we need to be sure that sum of the backoff values
# for 1..MAX_ATTEMPTS is under 7 days(CHUNK_REDIS_TTL).
#
class
Backoff
include
Gitlab
::
Utils
::
StrongMemoize
MAX_JITTER_VALUE
=
4
attr_reader
:archival_attempts
def
initialize
(
archival_attempts
)
@archival_attempts
=
archival_attempts
end
def
value
(((
chunks_ttl
/
(
3.5
*
max_attempts
))
*
archival_attempts
)
/
1
.
hour
).
hours
end
# This formula generates an increasing delay between executions
# 9.6, 19.2, 28.8, 38.4, 48.0 + a random amount of time to
# change the order of execution for the jobs.
# With maximum value for each call to rand(4), this sums up to 6.8 days
# and with minimum values is 6 days.
#
def
value_with_jitter
value
+
jitter
end
private
def
jitter
rand
(
MAX_JITTER_VALUE
).
hours
end
def
chunks_ttl
::
Ci
::
BuildTraceChunks
::
RedisBase
::
CHUNK_REDIS_TTL
end
def
max_attempts
::
Ci
::
BuildTraceMetadata
::
MAX_ATTEMPTS
end
end
end
end
end
spec/factories/ci/build_trace_metadata.rb
0 → 100644
View file @
ccf6b258
# frozen_string_literal: true
FactoryBot
.
define
do
factory
:ci_build_trace_metadata
,
class:
'Ci::BuildTraceMetadata'
do
build
factory: :ci_build
end
end
spec/lib/gitlab/ci/trace/backoff_spec.rb
0 → 100644
View file @
ccf6b258
# frozen_string_literal: true
require
'spec_helper'
RSpec
.
describe
Gitlab
::
Ci
::
Trace
::
Backoff
do
using
RSpec
::
Parameterized
::
TableSyntax
subject
(
:backoff
)
{
described_class
.
new
(
archival_attempts
)
}
it
'keeps the MAX_ATTEMPTS limit in sync'
do
expect
(
Ci
::
BuildTraceMetadata
::
MAX_ATTEMPTS
).
to
eq
(
5
)
end
it
'keeps the Redis TTL limit in sync'
do
expect
(
Ci
::
BuildTraceChunks
::
RedisBase
::
CHUNK_REDIS_TTL
).
to
eq
(
7
.
days
)
end
describe
'#value'
do
where
(
:archival_attempts
,
:result
)
do
1
|
9.6
2
|
19.2
3
|
28.8
4
|
38.4
5
|
48.0
end
with_them
do
subject
{
backoff
.
value
}
it
{
is_expected
.
to
eq
(
result
.
hours
)
}
end
end
describe
'#value_with_jitter'
do
where
(
:archival_attempts
,
:min_value
,
:max_value
)
do
1
|
9.6
|
13.6
2
|
19.2
|
23.2
3
|
28.8
|
32.8
4
|
38.4
|
42.4
5
|
48.0
|
52.0
end
with_them
do
subject
{
backoff
.
value_with_jitter
}
it
{
is_expected
.
to
be_in
(
min_value
.
hours
..
max_value
.
hours
)
}
end
end
it
'all retries are happening under the 7 days limit'
do
backoff_total
=
1
.
upto
(
Ci
::
BuildTraceMetadata
::
MAX_ATTEMPTS
).
sum
do
|
attempt
|
backoff
=
described_class
.
new
(
attempt
)
expect
(
backoff
).
to
receive
(
:rand
)
.
with
(
described_class
::
MAX_JITTER_VALUE
)
.
and_return
(
described_class
::
MAX_JITTER_VALUE
)
backoff
.
value_with_jitter
end
expect
(
backoff_total
).
to
be
<
Ci
::
BuildTraceChunks
::
RedisBase
::
CHUNK_REDIS_TTL
end
end
spec/lib/gitlab/ci/trace_spec.rb
View file @
ccf6b258
...
...
@@ -130,4 +130,18 @@ RSpec.describe Gitlab::Ci::Trace, :clean_gitlab_redis_shared_state, factory_defa
end
end
end
describe
'#can_attempt_archival_now?'
do
it
'creates the record and returns true'
do
expect
(
trace
.
can_attempt_archival_now?
).
to
be_truthy
end
end
describe
'#increment_archival_attempts!'
do
it
'creates the record and increments its value'
do
expect
{
trace
.
increment_archival_attempts!
}
.
to
change
{
build
.
reload
.
trace_metadata
&
.
archival_attempts
}.
from
(
nil
).
to
(
1
)
.
and
change
{
build
.
reload
.
trace_metadata
&
.
last_archival_attempt_at
}
end
end
end
spec/models/ci/build_spec.rb
View file @
ccf6b258
...
...
@@ -5259,4 +5259,14 @@ RSpec.describe Ci::Build do
expect
(
described_class
.
with_coverage_regex
).
to
eq
([
build_with_coverage_regex
])
end
end
describe
'#ensure_trace_metadata!'
do
it
'delegates to Ci::BuildTraceMetadata'
do
expect
(
Ci
::
BuildTraceMetadata
)
.
to
receive
(
:find_or_upsert_for!
)
.
with
(
build
.
id
)
build
.
ensure_trace_metadata!
end
end
end
spec/models/ci/build_trace_metadata_spec.rb
View file @
ccf6b258
...
...
@@ -7,4 +7,128 @@ RSpec.describe Ci::BuildTraceMetadata do
it
{
is_expected
.
to
belong_to
(
:trace_artifact
)
}
it
{
is_expected
.
to
validate_presence_of
(
:build
)
}
it
{
is_expected
.
to
validate_presence_of
(
:archival_attempts
)
}
describe
'#can_attempt_archival_now?'
do
let
(
:metadata
)
do
build
(
:ci_build_trace_metadata
,
archival_attempts:
archival_attempts
,
last_archival_attempt_at:
last_archival_attempt_at
)
end
subject
{
metadata
.
can_attempt_archival_now?
}
context
'when archival_attempts is over the limit'
do
let
(
:archival_attempts
)
{
described_class
::
MAX_ATTEMPTS
+
1
}
let
(
:last_archival_attempt_at
)
{}
it
{
is_expected
.
to
be_falsey
}
end
context
'when last_archival_attempt_at is not set'
do
let
(
:archival_attempts
)
{
described_class
::
MAX_ATTEMPTS
}
let
(
:last_archival_attempt_at
)
{}
it
{
is_expected
.
to
be_truthy
}
end
context
'when last_archival_attempt_at is set'
do
let
(
:archival_attempts
)
{
described_class
::
MAX_ATTEMPTS
}
let
(
:last_archival_attempt_at
)
{
6
.
days
.
ago
}
it
{
is_expected
.
to
be_truthy
}
end
context
'when last_archival_attempt_at is too close'
do
let
(
:archival_attempts
)
{
described_class
::
MAX_ATTEMPTS
}
let
(
:last_archival_attempt_at
)
{
1
.
hour
.
ago
}
it
{
is_expected
.
to
be_falsey
}
end
end
describe
'#archival_attempts_available?'
do
let
(
:metadata
)
do
build
(
:ci_build_trace_metadata
,
archival_attempts:
archival_attempts
)
end
subject
{
metadata
.
archival_attempts_available?
}
context
'when archival_attempts is over the limit'
do
let
(
:archival_attempts
)
{
described_class
::
MAX_ATTEMPTS
+
1
}
it
{
is_expected
.
to
be_falsey
}
end
context
'when archival_attempts is at the limit'
do
let
(
:archival_attempts
)
{
described_class
::
MAX_ATTEMPTS
}
it
{
is_expected
.
to
be_truthy
}
end
end
describe
'#increment_archival_attempts!'
do
let_it_be
(
:metadata
)
do
create
(
:ci_build_trace_metadata
,
archival_attempts:
2
,
last_archival_attempt_at:
1
.
day
.
ago
)
end
it
'increments the attempts'
do
expect
{
metadata
.
increment_archival_attempts!
}
.
to
change
{
metadata
.
reload
.
archival_attempts
}
end
it
'updates the last_archival_attempt_at timestamp'
do
expect
{
metadata
.
increment_archival_attempts!
}
.
to
change
{
metadata
.
reload
.
last_archival_attempt_at
}
end
end
describe
'#track_archival!'
do
let
(
:trace_artifact
)
{
create
(
:ci_job_artifact
)
}
let
(
:metadata
)
{
create
(
:ci_build_trace_metadata
)
}
it
'stores the artifact id and timestamp'
do
expect
(
metadata
.
trace_artifact_id
).
to
be_nil
metadata
.
track_archival!
(
trace_artifact
.
id
)
metadata
.
reload
expect
(
metadata
.
trace_artifact_id
).
to
eq
(
trace_artifact
.
id
)
expect
(
metadata
.
archived_at
).
to
be_like_time
(
Time
.
current
)
end
end
describe
'.find_or_upsert_for!'
do
let_it_be
(
:build
)
{
create
(
:ci_build
)
}
subject
(
:execute
)
do
described_class
.
find_or_upsert_for!
(
build
.
id
)
end
it
'creates a new record'
do
metadata
=
execute
expect
(
metadata
).
to
be_a
(
described_class
)
expect
(
metadata
.
id
).
to
eq
(
build
.
id
)
expect
(
metadata
.
archival_attempts
).
to
eq
(
0
)
end
context
'with existing records'
do
before
do
create
(
:ci_build_trace_metadata
,
build:
build
,
archival_attempts:
described_class
::
MAX_ATTEMPTS
)
end
it
'returns the existing record'
do
metadata
=
execute
expect
(
metadata
).
to
be_a
(
described_class
)
expect
(
metadata
.
id
).
to
eq
(
build
.
id
)
expect
(
metadata
.
archival_attempts
).
to
eq
(
described_class
::
MAX_ATTEMPTS
)
end
end
end
end
spec/services/ci/archive_trace_service_spec.rb
View file @
ccf6b258
...
...
@@ -12,6 +12,7 @@ RSpec.describe Ci::ArchiveTraceService, '#execute' do
expect
{
subject
}.
not_to
raise_error
expect
(
job
.
reload
.
job_artifacts_trace
).
to
be_exist
expect
(
job
.
trace_metadata
.
trace_artifact
).
to
eq
(
job
.
job_artifacts_trace
)
end
context
'when trace is already archived'
do
...
...
@@ -59,6 +60,54 @@ RSpec.describe Ci::ArchiveTraceService, '#execute' do
end
end
context
'when the job is out of archival attempts'
do
before
do
create
(
:ci_build_trace_metadata
,
build:
job
,
archival_attempts:
Ci
::
BuildTraceMetadata
::
MAX_ATTEMPTS
+
1
,
last_archival_attempt_at:
1
.
week
.
ago
)
end
it
'skips archiving'
do
expect
(
job
.
trace
).
not_to
receive
(
:archive!
)
subject
end
it
'leaves a warning message in sidekiq log'
do
expect
(
Sidekiq
.
logger
).
to
receive
(
:warn
).
with
(
class:
Ci
::
ArchiveTraceWorker
.
name
,
message:
'The job is out of archival attempts.'
,
job_id:
job
.
id
)
subject
end
end
context
'when the archival process is backed off'
do
before
do
create
(
:ci_build_trace_metadata
,
build:
job
,
archival_attempts:
Ci
::
BuildTraceMetadata
::
MAX_ATTEMPTS
-
1
,
last_archival_attempt_at:
1
.
hour
.
ago
)
end
it
'skips archiving'
do
expect
(
job
.
trace
).
not_to
receive
(
:archive!
)
subject
end
it
'leaves a warning message in sidekiq log'
do
expect
(
Sidekiq
.
logger
).
to
receive
(
:warn
).
with
(
class:
Ci
::
ArchiveTraceWorker
.
name
,
message:
'The job can not be archived right now.'
,
job_id:
job
.
id
)
subject
end
end
context
'when job failed to archive trace but did not raise an exception'
do
before
do
allow_next_instance_of
(
Gitlab
::
Ci
::
Trace
)
do
|
instance
|
...
...
@@ -98,6 +147,7 @@ RSpec.describe Ci::ArchiveTraceService, '#execute' do
.
and_call_original
expect
{
subject
}.
not_to
raise_error
expect
(
job
.
trace_metadata
.
archival_attempts
).
to
eq
(
1
)
end
end
end
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment