Commit 62f8dab1 authored by Adam Hegyi's avatar Adam Hegyi

Merge branch '212322-remove-duplicate-vulnerabilities' into 'master'

Remove duplicates from vulnerability_occurrences

See merge request gitlab-org/gitlab!49937
parents a26f4166 f5c331fd
---
title: Remove duplicates from vulnerability_occurrences
merge_request: 49937
author:
type: other
# frozen_string_literal: true
class ScheduleRemoveDuplicateVulnerabilitiesFindings < ActiveRecord::Migration[6.0]
include Gitlab::Database::MigrationHelpers
DOWNTIME = false
INDEX_NAME = 'tmp_idx_deduplicate_vulnerability_occurrences'
MIGRATION = 'RemoveDuplicateVulnerabilitiesFindings'
DELAY_INTERVAL = 2.minutes.to_i
BATCH_SIZE = 5_000
disable_ddl_transaction!
class VulnerabilitiesFinding < ActiveRecord::Base
include ::EachBatch
self.table_name = "vulnerability_occurrences"
end
def up
add_concurrent_index :vulnerability_occurrences,
%i[project_id report_type location_fingerprint primary_identifier_id id],
name: INDEX_NAME
say "Scheduling #{MIGRATION} jobs"
queue_background_migration_jobs_by_range_at_intervals(
VulnerabilitiesFinding,
MIGRATION,
DELAY_INTERVAL,
batch_size: BATCH_SIZE
)
end
def down
remove_concurrent_index_by_name(:vulnerability_occurrences, INDEX_NAME)
end
end
322d7270e942c161cc8b50b8c3f531c93b6e6e938e415c1b6010a70b630bf82e
\ No newline at end of file
...@@ -23415,6 +23415,8 @@ CREATE INDEX temporary_index_vulnerabilities_on_id ON vulnerabilities USING btre ...@@ -23415,6 +23415,8 @@ CREATE INDEX temporary_index_vulnerabilities_on_id ON vulnerabilities USING btre
CREATE UNIQUE INDEX term_agreements_unique_index ON term_agreements USING btree (user_id, term_id); CREATE UNIQUE INDEX term_agreements_unique_index ON term_agreements USING btree (user_id, term_id);
CREATE INDEX tmp_idx_deduplicate_vulnerability_occurrences ON vulnerability_occurrences USING btree (project_id, report_type, location_fingerprint, primary_identifier_id, id);
CREATE INDEX tmp_index_oauth_applications_on_id_where_trusted ON oauth_applications USING btree (id) WHERE (trusted = true); CREATE INDEX tmp_index_oauth_applications_on_id_where_trusted ON oauth_applications USING btree (id) WHERE (trusted = true);
CREATE INDEX tmp_index_on_vulnerabilities_non_dismissed ON vulnerabilities USING btree (id) WHERE (state <> 2); CREATE INDEX tmp_index_on_vulnerabilities_non_dismissed ON vulnerabilities USING btree (id) WHERE (state <> 2);
......
# frozen_string_literal: true
# rubocop: disable Style/Documentation
class Gitlab::BackgroundMigration::RemoveDuplicateVulnerabilitiesFindings
DELETE_BATCH_SIZE = 100
# rubocop:disable Gitlab/NamespacedClass
class VulnerabilitiesFinding < ActiveRecord::Base
self.table_name = "vulnerability_occurrences"
end
# rubocop:enable Gitlab/NamespacedClass
def perform(start_id, end_id)
batch = VulnerabilitiesFinding.where(id: start_id..end_id)
cte = Gitlab::SQL::CTE.new(:batch, batch.select(:report_type, :location_fingerprint, :primary_identifier_id, :project_id))
query = VulnerabilitiesFinding
.select('batch.report_type', 'batch.location_fingerprint', 'batch.primary_identifier_id', 'batch.project_id', 'array_agg(id) as ids')
.distinct
.with(cte.to_arel)
.from(cte.alias_to(Arel.sql('batch')))
.joins(
%(
INNER JOIN
vulnerability_occurrences ON
vulnerability_occurrences.report_type = batch.report_type AND
vulnerability_occurrences.location_fingerprint = batch.location_fingerprint AND
vulnerability_occurrences.primary_identifier_id = batch.primary_identifier_id AND
vulnerability_occurrences.project_id = batch.project_id
)).group('batch.report_type', 'batch.location_fingerprint', 'batch.primary_identifier_id', 'batch.project_id')
.having('COUNT(*) > 1')
ids_to_delete = []
query.to_a.each do |record|
# We want to keep the latest finding since it might have recent metadata
duplicate_ids = record.ids.uniq.sort
duplicate_ids.pop
ids_to_delete.concat(duplicate_ids)
if ids_to_delete.size == DELETE_BATCH_SIZE
VulnerabilitiesFinding.where(id: ids_to_delete).delete_all
ids_to_delete.clear
end
end
VulnerabilitiesFinding.where(id: ids_to_delete).delete_all if ids_to_delete.any?
end
end
# frozen_string_literal: true
require 'spec_helper'
RSpec.describe Gitlab::BackgroundMigration::RemoveDuplicateVulnerabilitiesFindings do
let(:namespace) { table(:namespaces).create!(name: 'user', path: 'user') }
let(:users) { table(:users) }
let(:user) { create_user! }
let(:project) { table(:projects).create!(id: 123, namespace_id: namespace.id) }
let(:scanners) { table(:vulnerability_scanners) }
let!(:scanner) { scanners.create!(project_id: project.id, external_id: 'test 1', name: 'test scanner 1') }
let!(:scanner2) { scanners.create!(project_id: project.id, external_id: 'test 2', name: 'test scanner 2') }
let!(:scanner3) { scanners.create!(project_id: project.id, external_id: 'test 3', name: 'test scanner 3') }
let!(:unrelated_scanner) { scanners.create!(project_id: project.id, external_id: 'unreleated_scanner', name: 'unrelated scanner') }
let(:vulnerabilities) { table(:vulnerabilities) }
let(:vulnerability_findings) { table(:vulnerability_occurrences) }
let(:vulnerability_identifiers) { table(:vulnerability_identifiers) }
let(:vulnerability_identifier) do
vulnerability_identifiers.create!(
project_id: project.id,
external_type: 'vulnerability-identifier',
external_id: 'vulnerability-identifier',
fingerprint: '7e394d1b1eb461a7406d7b1e08f057a1cf11287a',
name: 'vulnerability identifier')
end
let!(:first_finding) do
create_finding!(
uuid: "test1",
vulnerability_id: nil,
report_type: 0,
location_fingerprint: '2bda3014914481791847d8eca38d1a8d13b6ad76',
primary_identifier_id: vulnerability_identifier.id,
scanner_id: scanner.id,
project_id: project.id
)
end
let!(:first_duplicate) do
create_finding!(
uuid: "test2",
vulnerability_id: nil,
report_type: 0,
location_fingerprint: '2bda3014914481791847d8eca38d1a8d13b6ad76',
primary_identifier_id: vulnerability_identifier.id,
scanner_id: scanner2.id,
project_id: project.id
)
end
let!(:second_duplicate) do
create_finding!(
uuid: "test3",
vulnerability_id: nil,
report_type: 0,
location_fingerprint: '2bda3014914481791847d8eca38d1a8d13b6ad76',
primary_identifier_id: vulnerability_identifier.id,
scanner_id: scanner3.id,
project_id: project.id
)
end
let!(:unrelated_finding) do
create_finding!(
uuid: "unreleated_finding",
vulnerability_id: nil,
report_type: 1,
location_fingerprint: 'random_location_fingerprint',
primary_identifier_id: vulnerability_identifier.id,
scanner_id: unrelated_scanner.id,
project_id: project.id
)
end
subject { described_class.new.perform(first_finding.id, unrelated_finding.id) }
before do
stub_const("#{described_class}::DELETE_BATCH_SIZE", 1)
end
it "removes entries which would result in duplicate UUIDv5" do
expect(vulnerability_findings.count).to eq(4)
expect { subject }.to change { vulnerability_findings.count }.from(4).to(2)
expect(vulnerability_findings.pluck(:id)).to eq([second_duplicate.id, unrelated_finding.id])
end
private
def create_vulnerability!(project_id:, author_id:, title: 'test', severity: 7, confidence: 7, report_type: 0)
vulnerabilities.create!(
project_id: project_id,
author_id: author_id,
title: title,
severity: severity,
confidence: confidence,
report_type: report_type
)
end
# rubocop:disable Metrics/ParameterLists
def create_finding!(
vulnerability_id:, project_id:, scanner_id:, primary_identifier_id:,
name: "test", severity: 7, confidence: 7, report_type: 0,
project_fingerprint: '123qweasdzxc', location_fingerprint: 'test',
metadata_version: 'test', raw_metadata: 'test', uuid: 'test')
vulnerability_findings.create!(
vulnerability_id: vulnerability_id,
project_id: project_id,
name: name,
severity: severity,
confidence: confidence,
report_type: report_type,
project_fingerprint: project_fingerprint,
scanner_id: scanner_id,
primary_identifier_id: vulnerability_identifier.id,
location_fingerprint: location_fingerprint,
metadata_version: metadata_version,
raw_metadata: raw_metadata,
uuid: uuid
)
end
# rubocop:enable Metrics/ParameterLists
def create_user!(name: "Example User", email: "user@example.com", user_type: nil, created_at: Time.zone.now, confirmed_at: Time.zone.now)
users.create!(
name: name,
email: email,
username: name,
projects_limit: 0,
user_type: user_type,
confirmed_at: confirmed_at
)
end
end
# frozen_string_literal: true
require 'spec_helper'
require Rails.root.join('db', 'post_migrate', '20201112130710_schedule_remove_duplicate_vulnerabilities_findings.rb')
RSpec.describe ScheduleRemoveDuplicateVulnerabilitiesFindings, :migration do
let(:namespace) { table(:namespaces).create!(name: 'user', path: 'user') }
let(:users) { table(:users) }
let(:user) { create_user! }
let(:project) { table(:projects).create!(id: 123, namespace_id: namespace.id) }
let(:scanners) { table(:vulnerability_scanners) }
let!(:scanner) { scanners.create!(project_id: project.id, external_id: 'test 1', name: 'test scanner 1') }
let!(:scanner2) { scanners.create!(project_id: project.id, external_id: 'test 2', name: 'test scanner 2') }
let!(:scanner3) { scanners.create!(project_id: project.id, external_id: 'test 3', name: 'test scanner 3') }
let!(:unrelated_scanner) { scanners.create!(project_id: project.id, external_id: 'unreleated_scanner', name: 'unrelated scanner') }
let(:vulnerabilities) { table(:vulnerabilities) }
let(:vulnerability_findings) { table(:vulnerability_occurrences) }
let(:vulnerability_identifiers) { table(:vulnerability_identifiers) }
let(:vulnerability_identifier) do
vulnerability_identifiers.create!(
project_id: project.id,
external_type: 'vulnerability-identifier',
external_id: 'vulnerability-identifier',
fingerprint: '7e394d1b1eb461a7406d7b1e08f057a1cf11287a',
name: 'vulnerability identifier')
end
let!(:first_finding) do
create_finding!(
uuid: "test1",
vulnerability_id: nil,
report_type: 0,
location_fingerprint: '2bda3014914481791847d8eca38d1a8d13b6ad76',
primary_identifier_id: vulnerability_identifier.id,
scanner_id: scanner.id,
project_id: project.id
)
end
let!(:first_duplicate) do
create_finding!(
uuid: "test2",
vulnerability_id: nil,
report_type: 0,
location_fingerprint: '2bda3014914481791847d8eca38d1a8d13b6ad76',
primary_identifier_id: vulnerability_identifier.id,
scanner_id: scanner2.id,
project_id: project.id
)
end
let!(:second_duplicate) do
create_finding!(
uuid: "test3",
vulnerability_id: nil,
report_type: 0,
location_fingerprint: '2bda3014914481791847d8eca38d1a8d13b6ad76',
primary_identifier_id: vulnerability_identifier.id,
scanner_id: scanner3.id,
project_id: project.id
)
end
let!(:unrelated_finding) do
create_finding!(
uuid: "unreleated_finding",
vulnerability_id: nil,
report_type: 1,
location_fingerprint: 'random_location_fingerprint',
primary_identifier_id: vulnerability_identifier.id,
scanner_id: unrelated_scanner.id,
project_id: project.id
)
end
before do
stub_const("#{described_class}::BATCH_SIZE", 1)
end
around do |example|
freeze_time { Sidekiq::Testing.fake! { example.run } }
end
it 'schedules background migration' do
migrate!
expect(BackgroundMigrationWorker.jobs.size).to eq(4)
expect(described_class::MIGRATION).to be_scheduled_migration(first_finding.id, first_finding.id)
expect(described_class::MIGRATION).to be_scheduled_migration(first_duplicate.id, first_duplicate.id)
expect(described_class::MIGRATION).to be_scheduled_migration(second_duplicate.id, second_duplicate.id)
expect(described_class::MIGRATION).to be_scheduled_migration(unrelated_finding.id, unrelated_finding.id)
end
private
def create_vulnerability!(project_id:, author_id:, title: 'test', severity: 7, confidence: 7, report_type: 0)
vulnerabilities.create!(
project_id: project_id,
author_id: author_id,
title: title,
severity: severity,
confidence: confidence,
report_type: report_type
)
end
# rubocop:disable Metrics/ParameterLists
def create_finding!(
vulnerability_id:, project_id:, scanner_id:, primary_identifier_id:,
name: "test", severity: 7, confidence: 7, report_type: 0,
project_fingerprint: '123qweasdzxc', location_fingerprint: 'test',
metadata_version: 'test', raw_metadata: 'test', uuid: 'test')
vulnerability_findings.create!(
vulnerability_id: vulnerability_id,
project_id: project_id,
name: name,
severity: severity,
confidence: confidence,
report_type: report_type,
project_fingerprint: project_fingerprint,
scanner_id: scanner_id,
primary_identifier_id: vulnerability_identifier.id,
location_fingerprint: location_fingerprint,
metadata_version: metadata_version,
raw_metadata: raw_metadata,
uuid: uuid
)
end
# rubocop:enable Metrics/ParameterLists
def create_user!(name: "Example User", email: "user@example.com", user_type: nil, created_at: Time.now, confirmed_at: Time.now)
users.create!(
name: name,
email: email,
username: name,
projects_limit: 0,
user_type: user_type,
confirmed_at: confirmed_at
)
end
end
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment