Commit c2e0e689 authored by Shinya Maeda's avatar Shinya Maeda

Validate the existence of archived traces before removing live trace

Often live traces are removed even though the archived trace
doesn't exist. This commit checkes the existence strictly.
parent 1a3fda63
......@@ -531,6 +531,14 @@ module Ci
trace.exist?
end
def has_live_trace?
trace.live_trace_exist?
end
def has_archived_trace?
trace.archived_trace_exist?
end
def artifacts_file
job_artifacts_archive&.file
end
......
......@@ -176,6 +176,10 @@ module Ci
end
end
def self.archived_trace_exists_for?(job_id)
where(job_id: job_id).trace.take&.file&.file&.exists?
end
private
def file_format_adapter_class
......
......@@ -2,8 +2,25 @@
module Ci
class ArchiveTraceService
def execute(job)
def execute(job, worker_name:)
# TODO: Remove this logging once we confirmed new live trace architecture is functional.
# See https://gitlab.com/gitlab-com/gl-infra/infrastructure/issues/4667.
unless job.has_live_trace?
Sidekiq.logger.warn(class: worker_name,
message: 'The job does not have live trace but going to be archived.',
job_id: job.id)
return
end
job.trace.archive!
# TODO: Remove this logging once we confirmed new live trace architecture is functional.
# See https://gitlab.com/gitlab-com/gl-infra/infrastructure/issues/4667.
unless job.has_archived_trace?
Sidekiq.logger.warn(class: worker_name,
message: 'The job does not have archived trace after archiving.',
job_id: job.id)
end
rescue ::Gitlab::Ci::Trace::AlreadyArchivedError
# It's already archived, thus we can safely ignore this exception.
rescue => e
......@@ -11,7 +28,7 @@ module Ci
# If `archive!` keeps failing for over a week, that could incur data loss.
# (See more https://docs.gitlab.com/ee/administration/job_traces.html#new-live-trace-architecture)
# In order to avoid interrupting the system, we do not raise an exception here.
archive_error(e, job)
archive_error(e, job, worker_name)
end
private
......@@ -22,9 +39,12 @@ module Ci
"Counter of failed attempts of trace archiving")
end
def archive_error(error, job)
def archive_error(error, job, worker_name)
failed_archive_counter.increment
Rails.logger.error "Failed to archive trace. id: #{job.id} message: #{error.message}" # rubocop:disable Gitlab/RailsLogger
Sidekiq.logger.warn(class: worker_name,
message: "Failed to archive trace. message: #{error.message}.",
job_id: job.id)
Gitlab::Sentry
.track_exception(error,
......
......@@ -7,7 +7,7 @@ class ArchiveTraceWorker
# rubocop: disable CodeReuse/ActiveRecord
def perform(job_id)
Ci::Build.without_archived_trace.find_by(id: job_id).try do |job|
Ci::ArchiveTraceService.new.execute(job)
Ci::ArchiveTraceService.new.execute(job, worker_name: self.class.name)
end
end
# rubocop: enable CodeReuse/ActiveRecord
......
......@@ -11,7 +11,7 @@ module Ci
# This could happen when ArchiveTraceWorker sidekiq jobs were lost by receiving SIGKILL
# More details in https://gitlab.com/gitlab-org/gitlab-ce/issues/36791
Ci::Build.finished.with_live_trace.find_each(batch_size: 100) do |build|
Ci::ArchiveTraceService.new.execute(build)
Ci::ArchiveTraceService.new.execute(build, worker_name: self.class.name)
end
end
# rubocop: enable CodeReuse/ActiveRecord
......
---
title: Extra logging for new live trace architecture
merge_request: 30892
author:
type: fixed
......@@ -63,7 +63,15 @@ module Gitlab
end
def exist?
trace_artifact&.exists? || job.trace_chunks.any? || current_path.present? || old_trace.present?
archived_trace_exist? || live_trace_exist?
end
def archived_trace_exist?
trace_artifact&.exists?
end
def live_trace_exist?
job.trace_chunks.any? || current_path.present? || old_trace.present?
end
def read
......@@ -167,7 +175,7 @@ module Gitlab
def clone_file!(src_stream, temp_dir)
FileUtils.mkdir_p(temp_dir)
Dir.mktmpdir('tmp-trace', temp_dir) do |dir_path|
Dir.mktmpdir("tmp-trace-#{job.id}", temp_dir) do |dir_path|
temp_path = File.join(dir_path, "job.log")
FileUtils.touch(temp_path)
size = IO.copy_stream(src_stream, temp_path)
......
......@@ -166,6 +166,13 @@ module Gitlab
end
def destroy!
# TODO: Remove this logging once we confirmed new live trace architecture is functional.
# See https://gitlab.com/gitlab-com/gl-infra/infrastructure/issues/4667.
unless build.has_archived_trace?
Sidekiq.logger.warn(message: 'The job does not have archived trace but going to be destroyed.',
job_id: build.id)
end
trace_chunks.fast_destroy_all
@tell = @size = 0
ensure
......
......@@ -442,5 +442,15 @@ describe Gitlab::Ci::Trace::ChunkedIO, :clean_gitlab_redis_cache do
expect(Ci::BuildTraceChunk.where(build: build).count).to eq(0)
end
context 'when the job does not have archived trace' do
it 'leaves a message in sidekiq log' do
expect(Sidekiq.logger).to receive(:warn).with(
message: 'The job does not have archived trace but going to be destroyed.',
job_id: build.id).and_call_original
subject
end
end
end
end
......@@ -692,6 +692,34 @@ describe Ci::Build do
end
end
describe '#has_live_trace?' do
subject { build.has_live_trace? }
let(:build) { create(:ci_build, :trace_live) }
it { is_expected.to be_truthy }
context 'when build does not have live trace' do
let(:build) { create(:ci_build) }
it { is_expected.to be_falsy }
end
end
describe '#has_archived_trace?' do
subject { build.has_archived_trace? }
let(:build) { create(:ci_build, :trace_artifact) }
it { is_expected.to be_truthy }
context 'when build does not have archived trace' do
let(:build) { create(:ci_build) }
it { is_expected.to be_falsy }
end
end
describe '#has_job_artifacts?' do
subject { build.has_job_artifacts? }
......
......@@ -70,6 +70,31 @@ describe Ci::JobArtifact do
end
end
describe '.archived_trace_exists_for?' do
subject { described_class.archived_trace_exists_for?(job_id) }
let!(:artifact) { create(:ci_job_artifact, :trace, job: job) }
let(:job) { create(:ci_build) }
context 'when the specified job_id exists' do
let(:job_id) { job.id }
it { is_expected.to be_truthy }
context 'when the job does have archived trace' do
let!(:artifact) { }
it { is_expected.to be_falsy }
end
end
context 'when the specified job_id does not exist' do
let(:job_id) { 10000 }
it { is_expected.to be_falsy }
end
end
describe 'callbacks' do
subject { create(:ci_job_artifact, :archive) }
......
......@@ -3,7 +3,7 @@
require 'spec_helper'
describe Ci::ArchiveTraceService, '#execute' do
subject { described_class.new.execute(job) }
subject { described_class.new.execute(job, worker_name: ArchiveTraceWorker.name) }
context 'when job is finished' do
let(:job) { create(:ci_build, :success, :trace_live) }
......@@ -25,6 +25,34 @@ describe Ci::ArchiveTraceService, '#execute' do
expect { subject }.not_to change { Ci::JobArtifact.trace.count }
end
end
context 'when job does not have trace' do
let(:job) { create(:ci_build, :success) }
it 'leaves a warning message in sidekiq log' do
expect(Sidekiq.logger).to receive(:warn).with(
class: ArchiveTraceWorker.name,
message: 'The job does not have live trace but going to be archived.',
job_id: job.id)
subject
end
end
context 'when job failed to archive trace but did not raise an exception' do
before do
allow_any_instance_of(Gitlab::Ci::Trace).to receive(:archive!) {}
end
it 'leaves a warning message in sidekiq log' do
expect(Sidekiq.logger).to receive(:warn).with(
class: ArchiveTraceWorker.name,
message: 'The job does not have archived trace after archiving.',
job_id: job.id)
subject
end
end
end
context 'when job is running' do
......@@ -37,10 +65,10 @@ describe Ci::ArchiveTraceService, '#execute' do
issue_url: 'https://gitlab.com/gitlab-org/gitlab-ce/issues/51502',
extra: { job_id: job.id } ).once
expect(Rails.logger)
.to receive(:error)
.with("Failed to archive trace. id: #{job.id} message: Job is not finished yet")
.and_call_original
expect(Sidekiq.logger).to receive(:warn).with(
class: ArchiveTraceWorker.name,
message: "Failed to archive trace. message: Job is not finished yet.",
job_id: job.id).and_call_original
expect(Gitlab::Metrics)
.to receive(:counter)
......
......@@ -720,6 +720,58 @@ shared_examples_for 'trace with enabled live trace feature' do
end
end
describe '#archived_trace_exist?' do
subject { trace.archived_trace_exist? }
context 'when trace does not exist' do
it { is_expected.to be_falsy }
end
context 'when archived trace exists' do
before do
create(:ci_job_artifact, :trace, job: build)
end
it { is_expected.to be_truthy }
end
context 'when live trace exists' do
before do
Gitlab::Ci::Trace::ChunkedIO.new(build) do |stream|
stream.write('abc')
end
end
it { is_expected.to be_falsy }
end
end
describe '#live_trace_exist?' do
subject { trace.live_trace_exist? }
context 'when trace does not exist' do
it { is_expected.to be_falsy }
end
context 'when archived trace exists' do
before do
create(:ci_job_artifact, :trace, job: build)
end
it { is_expected.to be_falsy }
end
context 'when live trace exists' do
before do
Gitlab::Ci::Trace::ChunkedIO.new(build) do |stream|
stream.write('abc')
end
end
it { is_expected.to be_truthy }
end
end
describe '#archive!' do
subject { trace.archive! }
......
......@@ -11,7 +11,7 @@ describe ArchiveTraceWorker do
it 'executes service' do
expect_any_instance_of(Ci::ArchiveTraceService)
.to receive(:execute).with(job)
.to receive(:execute).with(job, anything)
subject
end
......
......@@ -34,7 +34,7 @@ describe Ci::ArchiveTracesCronWorker do
it 'executes service' do
expect_any_instance_of(Ci::ArchiveTraceService)
.to receive(:execute).with(build)
.to receive(:execute).with(build, anything)
subject
end
......@@ -60,7 +60,10 @@ describe Ci::ArchiveTracesCronWorker do
end
it 'puts a log' do
expect(Rails.logger).to receive(:error).with("Failed to archive trace. id: #{build.id} message: Unexpected error")
expect(Sidekiq.logger).to receive(:warn).with(
class: described_class.name,
message: "Failed to archive trace. message: Unexpected error.",
job_id: build.id)
subject
end
......
......@@ -7,8 +7,6 @@ describe StuckCiJobsWorker do
let!(:runner) { create :ci_runner }
let!(:job) { create :ci_build, runner: runner }
let(:trace_lease_key) { "trace:write:lock:#{job.id}" }
let(:trace_lease_uuid) { SecureRandom.uuid }
let(:worker_lease_key) { StuckCiJobsWorker::EXCLUSIVE_LEASE_KEY }
let(:worker_lease_uuid) { SecureRandom.uuid }
......@@ -16,7 +14,6 @@ describe StuckCiJobsWorker do
before do
stub_exclusive_lease(worker_lease_key, worker_lease_uuid)
stub_exclusive_lease(trace_lease_key, trace_lease_uuid)
job.update!(status: status, updated_at: updated_at)
end
......@@ -195,7 +192,6 @@ describe StuckCiJobsWorker do
end
it 'cancels exclusive leases after worker perform' do
expect_to_cancel_exclusive_lease(trace_lease_key, trace_lease_uuid)
expect_to_cancel_exclusive_lease(worker_lease_key, worker_lease_uuid)
worker.perform
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment