Commit 622a16a9 authored by Stan Hu's avatar Stan Hu

Always attempt retry of job trace read when file is missing

In https://gitlab.com/gitlab-org/gitlab/-/merge_requests/40075, we added
a check for the write lock to determine whether to retry reading a
missing, archived build log. However, in production, it appears this
issue is still happening about 1000 times a day, so let's always retry
to see if this reduces that incident rate.

This reverts commit f09bbb5808d1d9a3b57822233124a6ee6b9b10c8.
parent b70d1130
---
title: Always attempt retry of job trace read when file is missing
merge_request: 40516
author:
type: fixed
......@@ -79,9 +79,7 @@ module Gitlab
job.trace_chunks.any? || current_path.present? || old_trace.present?
end
def read(&block)
should_retry = true if lock_taken?(lock_key)
def read(should_retry: true, &block)
read_stream(&block)
rescue Errno::ENOENT
raise unless should_retry
......@@ -195,13 +193,10 @@ module Gitlab
end
def in_write_lock(&blk)
lock_key = "trace:write:lock:#{job.id}"
in_lock(lock_key, ttl: LOCK_TTL, retries: LOCK_RETRIES, sleep_sec: LOCK_SLEEP, &blk)
end
def lock_key
"trace:write:lock:#{job.id}"
end
def archive_stream!(stream)
clone_file!(stream, JobArtifactUploader.workhorse_upload_path) do |clone_path|
create_build_trace!(job, clone_path)
......
......@@ -39,10 +39,5 @@ module Gitlab
ensure
lease&.cancel
end
def lock_taken?(key)
lease = Gitlab::ExclusiveLease.new(key, timeout: 0)
lease.exists?
end
end
end
......@@ -23,26 +23,14 @@ RSpec.describe Gitlab::Ci::Trace, :clean_gitlab_redis_shared_state do
artifact1.file.migrate!(ObjectStorage::Store::REMOTE)
end
context 'when write lock is not present' do
it 'raises an exception' do
expect { artifact2.job.trace.raw }.to raise_error(Errno::ENOENT)
end
end
it 'reloads the trace after is it migrated' do
stub_const('Gitlab::HttpIO::BUFFER_SIZE', test_data.length)
context 'when write lock is present', :clean_gitlab_redis_shared_state do
before do
Gitlab::ExclusiveLease.new("trace:write:lock:#{job.id}", timeout: 10.seconds).try_obtain
expect_next_instance_of(Gitlab::HttpIO) do |http_io|
expect(http_io).to receive(:get_chunk).and_return(test_data, "")
end
it 'reloads the trace after is it migrated' do
stub_const('Gitlab::HttpIO::BUFFER_SIZE', test_data.length)
expect_next_instance_of(Gitlab::HttpIO) do |http_io|
expect(http_io).to receive(:get_chunk).and_return(test_data, "")
end
expect(artifact2.job.trace.raw).to eq(test_data)
end
expect(artifact2.job.trace.raw).to eq(test_data)
end
end
......
......@@ -111,14 +111,4 @@ RSpec.describe Gitlab::ExclusiveLeaseHelpers, :clean_gitlab_redis_shared_state d
end
end
end
describe '#lock_taken?' do
it 'returns true when lock has been taken' do
expect(class_instance.lock_taken?(unique_key)).to be false
class_instance.in_lock(unique_key) do
expect(class_instance.lock_taken?(unique_key)).to be true
end
end
end
end
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment