Commit 4435cdde authored by Kamil Trzciński's avatar Kamil Trzciński

Merge branch 'safe-archiving-for-traces' into 'master'

Extra logging for new live trace architecture

See merge request gitlab-org/gitlab-ce!30892
parents 995e5b52 c2e0e689
......@@ -531,6 +531,14 @@ module Ci
trace.exist?
end
def has_live_trace?
trace.live_trace_exist?
end
def has_archived_trace?
trace.archived_trace_exist?
end
def artifacts_file
job_artifacts_archive&.file
end
......
......@@ -176,6 +176,10 @@ module Ci
end
end
def self.archived_trace_exists_for?(job_id)
where(job_id: job_id).trace.take&.file&.file&.exists?
end
private
def file_format_adapter_class
......
......@@ -2,8 +2,25 @@
module Ci
class ArchiveTraceService
def execute(job)
def execute(job, worker_name:)
# TODO: Remove this logging once we confirmed new live trace architecture is functional.
# See https://gitlab.com/gitlab-com/gl-infra/infrastructure/issues/4667.
unless job.has_live_trace?
Sidekiq.logger.warn(class: worker_name,
message: 'The job does not have live trace but going to be archived.',
job_id: job.id)
return
end
job.trace.archive!
# TODO: Remove this logging once we confirmed new live trace architecture is functional.
# See https://gitlab.com/gitlab-com/gl-infra/infrastructure/issues/4667.
unless job.has_archived_trace?
Sidekiq.logger.warn(class: worker_name,
message: 'The job does not have archived trace after archiving.',
job_id: job.id)
end
rescue ::Gitlab::Ci::Trace::AlreadyArchivedError
# It's already archived, thus we can safely ignore this exception.
rescue => e
......@@ -11,7 +28,7 @@ module Ci
# If `archive!` keeps failing for over a week, that could incur data loss.
# (See more https://docs.gitlab.com/ee/administration/job_traces.html#new-live-trace-architecture)
# In order to avoid interrupting the system, we do not raise an exception here.
archive_error(e, job)
archive_error(e, job, worker_name)
end
private
......@@ -22,9 +39,12 @@ module Ci
"Counter of failed attempts of trace archiving")
end
def archive_error(error, job)
def archive_error(error, job, worker_name)
failed_archive_counter.increment
Rails.logger.error "Failed to archive trace. id: #{job.id} message: #{error.message}" # rubocop:disable Gitlab/RailsLogger
Sidekiq.logger.warn(class: worker_name,
message: "Failed to archive trace. message: #{error.message}.",
job_id: job.id)
Gitlab::Sentry
.track_exception(error,
......
......@@ -7,7 +7,7 @@ class ArchiveTraceWorker
# rubocop: disable CodeReuse/ActiveRecord
def perform(job_id)
Ci::Build.without_archived_trace.find_by(id: job_id).try do |job|
Ci::ArchiveTraceService.new.execute(job)
Ci::ArchiveTraceService.new.execute(job, worker_name: self.class.name)
end
end
# rubocop: enable CodeReuse/ActiveRecord
......
......@@ -11,7 +11,7 @@ module Ci
# This could happen when ArchiveTraceWorker sidekiq jobs were lost by receiving SIGKILL
# More details in https://gitlab.com/gitlab-org/gitlab-ce/issues/36791
Ci::Build.finished.with_live_trace.find_each(batch_size: 100) do |build|
Ci::ArchiveTraceService.new.execute(build)
Ci::ArchiveTraceService.new.execute(build, worker_name: self.class.name)
end
end
# rubocop: enable CodeReuse/ActiveRecord
......
---
title: Extra logging for new live trace architecture
merge_request: 30892
author:
type: fixed
......@@ -63,7 +63,15 @@ module Gitlab
end
def exist?
trace_artifact&.exists? || job.trace_chunks.any? || current_path.present? || old_trace.present?
archived_trace_exist? || live_trace_exist?
end
def archived_trace_exist?
trace_artifact&.exists?
end
def live_trace_exist?
job.trace_chunks.any? || current_path.present? || old_trace.present?
end
def read
......@@ -167,7 +175,7 @@ module Gitlab
def clone_file!(src_stream, temp_dir)
FileUtils.mkdir_p(temp_dir)
Dir.mktmpdir('tmp-trace', temp_dir) do |dir_path|
Dir.mktmpdir("tmp-trace-#{job.id}", temp_dir) do |dir_path|
temp_path = File.join(dir_path, "job.log")
FileUtils.touch(temp_path)
size = IO.copy_stream(src_stream, temp_path)
......
......@@ -166,6 +166,13 @@ module Gitlab
end
def destroy!
# TODO: Remove this logging once we confirmed new live trace architecture is functional.
# See https://gitlab.com/gitlab-com/gl-infra/infrastructure/issues/4667.
unless build.has_archived_trace?
Sidekiq.logger.warn(message: 'The job does not have archived trace but going to be destroyed.',
job_id: build.id)
end
trace_chunks.fast_destroy_all
@tell = @size = 0
ensure
......
......@@ -442,5 +442,15 @@ describe Gitlab::Ci::Trace::ChunkedIO, :clean_gitlab_redis_cache do
expect(Ci::BuildTraceChunk.where(build: build).count).to eq(0)
end
context 'when the job does not have archived trace' do
it 'leaves a message in sidekiq log' do
expect(Sidekiq.logger).to receive(:warn).with(
message: 'The job does not have archived trace but going to be destroyed.',
job_id: build.id).and_call_original
subject
end
end
end
end
......@@ -692,6 +692,34 @@ describe Ci::Build do
end
end
describe '#has_live_trace?' do
subject { build.has_live_trace? }
let(:build) { create(:ci_build, :trace_live) }
it { is_expected.to be_truthy }
context 'when build does not have live trace' do
let(:build) { create(:ci_build) }
it { is_expected.to be_falsy }
end
end
describe '#has_archived_trace?' do
subject { build.has_archived_trace? }
let(:build) { create(:ci_build, :trace_artifact) }
it { is_expected.to be_truthy }
context 'when build does not have archived trace' do
let(:build) { create(:ci_build) }
it { is_expected.to be_falsy }
end
end
describe '#has_job_artifacts?' do
subject { build.has_job_artifacts? }
......
......@@ -70,6 +70,31 @@ describe Ci::JobArtifact do
end
end
describe '.archived_trace_exists_for?' do
subject { described_class.archived_trace_exists_for?(job_id) }
let!(:artifact) { create(:ci_job_artifact, :trace, job: job) }
let(:job) { create(:ci_build) }
context 'when the specified job_id exists' do
let(:job_id) { job.id }
it { is_expected.to be_truthy }
context 'when the job does have archived trace' do
let!(:artifact) { }
it { is_expected.to be_falsy }
end
end
context 'when the specified job_id does not exist' do
let(:job_id) { 10000 }
it { is_expected.to be_falsy }
end
end
describe 'callbacks' do
subject { create(:ci_job_artifact, :archive) }
......
......@@ -3,7 +3,7 @@
require 'spec_helper'
describe Ci::ArchiveTraceService, '#execute' do
subject { described_class.new.execute(job) }
subject { described_class.new.execute(job, worker_name: ArchiveTraceWorker.name) }
context 'when job is finished' do
let(:job) { create(:ci_build, :success, :trace_live) }
......@@ -25,6 +25,34 @@ describe Ci::ArchiveTraceService, '#execute' do
expect { subject }.not_to change { Ci::JobArtifact.trace.count }
end
end
context 'when job does not have trace' do
let(:job) { create(:ci_build, :success) }
it 'leaves a warning message in sidekiq log' do
expect(Sidekiq.logger).to receive(:warn).with(
class: ArchiveTraceWorker.name,
message: 'The job does not have live trace but going to be archived.',
job_id: job.id)
subject
end
end
context 'when job failed to archive trace but did not raise an exception' do
before do
allow_any_instance_of(Gitlab::Ci::Trace).to receive(:archive!) {}
end
it 'leaves a warning message in sidekiq log' do
expect(Sidekiq.logger).to receive(:warn).with(
class: ArchiveTraceWorker.name,
message: 'The job does not have archived trace after archiving.',
job_id: job.id)
subject
end
end
end
context 'when job is running' do
......@@ -37,10 +65,10 @@ describe Ci::ArchiveTraceService, '#execute' do
issue_url: 'https://gitlab.com/gitlab-org/gitlab-ce/issues/51502',
extra: { job_id: job.id } ).once
expect(Rails.logger)
.to receive(:error)
.with("Failed to archive trace. id: #{job.id} message: Job is not finished yet")
.and_call_original
expect(Sidekiq.logger).to receive(:warn).with(
class: ArchiveTraceWorker.name,
message: "Failed to archive trace. message: Job is not finished yet.",
job_id: job.id).and_call_original
expect(Gitlab::Metrics)
.to receive(:counter)
......
......@@ -720,6 +720,58 @@ shared_examples_for 'trace with enabled live trace feature' do
end
end
describe '#archived_trace_exist?' do
subject { trace.archived_trace_exist? }
context 'when trace does not exist' do
it { is_expected.to be_falsy }
end
context 'when archived trace exists' do
before do
create(:ci_job_artifact, :trace, job: build)
end
it { is_expected.to be_truthy }
end
context 'when live trace exists' do
before do
Gitlab::Ci::Trace::ChunkedIO.new(build) do |stream|
stream.write('abc')
end
end
it { is_expected.to be_falsy }
end
end
describe '#live_trace_exist?' do
subject { trace.live_trace_exist? }
context 'when trace does not exist' do
it { is_expected.to be_falsy }
end
context 'when archived trace exists' do
before do
create(:ci_job_artifact, :trace, job: build)
end
it { is_expected.to be_falsy }
end
context 'when live trace exists' do
before do
Gitlab::Ci::Trace::ChunkedIO.new(build) do |stream|
stream.write('abc')
end
end
it { is_expected.to be_truthy }
end
end
describe '#archive!' do
subject { trace.archive! }
......
......@@ -11,7 +11,7 @@ describe ArchiveTraceWorker do
it 'executes service' do
expect_any_instance_of(Ci::ArchiveTraceService)
.to receive(:execute).with(job)
.to receive(:execute).with(job, anything)
subject
end
......
......@@ -34,7 +34,7 @@ describe Ci::ArchiveTracesCronWorker do
it 'executes service' do
expect_any_instance_of(Ci::ArchiveTraceService)
.to receive(:execute).with(build)
.to receive(:execute).with(build, anything)
subject
end
......@@ -60,7 +60,10 @@ describe Ci::ArchiveTracesCronWorker do
end
it 'puts a log' do
expect(Rails.logger).to receive(:error).with("Failed to archive trace. id: #{build.id} message: Unexpected error")
expect(Sidekiq.logger).to receive(:warn).with(
class: described_class.name,
message: "Failed to archive trace. message: Unexpected error.",
job_id: build.id)
subject
end
......
......@@ -7,8 +7,6 @@ describe StuckCiJobsWorker do
let!(:runner) { create :ci_runner }
let!(:job) { create :ci_build, runner: runner }
let(:trace_lease_key) { "trace:write:lock:#{job.id}" }
let(:trace_lease_uuid) { SecureRandom.uuid }
let(:worker_lease_key) { StuckCiJobsWorker::EXCLUSIVE_LEASE_KEY }
let(:worker_lease_uuid) { SecureRandom.uuid }
......@@ -16,7 +14,6 @@ describe StuckCiJobsWorker do
before do
stub_exclusive_lease(worker_lease_key, worker_lease_uuid)
stub_exclusive_lease(trace_lease_key, trace_lease_uuid)
job.update!(status: status, updated_at: updated_at)
end
......@@ -195,7 +192,6 @@ describe StuckCiJobsWorker do
end
it 'cancels exclusive leases after worker perform' do
expect_to_cancel_exclusive_lease(trace_lease_key, trace_lease_uuid)
expect_to_cancel_exclusive_lease(worker_lease_key, worker_lease_uuid)
worker.perform
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment