Add Prometheus metrics to track Geo autocorrect numbers

parent 3ab1648e
...@@ -11,7 +11,7 @@ ...@@ -11,7 +11,7 @@
# #
# It's strongly recommended that you check this file into your version control system. # It's strongly recommended that you check this file into your version control system.
ActiveRecord::Schema.define(version: 20180726172057) do ActiveRecord::Schema.define(version: 20180803001726) do
# These are extensions that must be enabled in order to support this database # These are extensions that must be enabled in order to support this database
enable_extension "plpgsql" enable_extension "plpgsql"
...@@ -1159,6 +1159,8 @@ ActiveRecord::Schema.define(version: 20180726172057) do ...@@ -1159,6 +1159,8 @@ ActiveRecord::Schema.define(version: 20180726172057) do
t.integer "wikis_checksum_failed_count" t.integer "wikis_checksum_failed_count"
t.integer "wikis_checksum_mismatch_count" t.integer "wikis_checksum_mismatch_count"
t.binary "storage_configuration_digest" t.binary "storage_configuration_digest"
t.integer "repositories_retrying_verification_count"
t.integer "wikis_retrying_verification_count"
end end
add_index "geo_node_statuses", ["geo_node_id"], name: "index_geo_node_statuses_on_geo_node_id", unique: true, using: :btree add_index "geo_node_statuses", ["geo_node_id"], name: "index_geo_node_statuses_on_geo_node_id", unique: true, using: :btree
......
...@@ -57,39 +57,41 @@ The following metrics are available: ...@@ -57,39 +57,41 @@ The following metrics are available:
Sidekiq jobs may also gather metrics, and these metrics can be accessed if the Sidekiq exporter is enabled (e.g. via Sidekiq jobs may also gather metrics, and these metrics can be accessed if the Sidekiq exporter is enabled (e.g. via
the `monitoring.sidekiq_exporter` configuration option in `gitlab.yml`. the `monitoring.sidekiq_exporter` configuration option in `gitlab.yml`.
| Metric | Type | Since | Description | Labels | | Metric | Type | Since | Description | Labels |
|:------------------------------------------- |:------- |:----- |:----------- |:------ | |:-------------------------------------------- |:------- |:----- |:----------- |:------ |
| geo_db_replication_lag_seconds | Gauge | 10.2 | Database replication lag (seconds) | url | geo_db_replication_lag_seconds | Gauge | 10.2 | Database replication lag (seconds) | url
| geo_repositories | Gauge | 10.2 | Total number of repositories available on primary | url | geo_repositories | Gauge | 10.2 | Total number of repositories available on primary | url
| geo_repositories_synced | Gauge | 10.2 | Number of repositories synced on secondary | url | geo_repositories_synced | Gauge | 10.2 | Number of repositories synced on secondary | url
| geo_repositories_failed | Gauge | 10.2 | Number of repositories failed to sync on secondary | url | geo_repositories_failed | Gauge | 10.2 | Number of repositories failed to sync on secondary | url
| geo_lfs_objects | Gauge | 10.2 | Total number of LFS objects available on primary | url | geo_lfs_objects | Gauge | 10.2 | Total number of LFS objects available on primary | url
| geo_lfs_objects_synced | Gauge | 10.2 | Number of LFS objects synced on secondary | url | geo_lfs_objects_synced | Gauge | 10.2 | Number of LFS objects synced on secondary | url
| geo_lfs_objects_failed | Gauge | 10.2 | Number of LFS objects failed to sync on secondary | url | geo_lfs_objects_failed | Gauge | 10.2 | Number of LFS objects failed to sync on secondary | url
| geo_attachments | Gauge | 10.2 | Total number of file attachments available on primary | url | geo_attachments | Gauge | 10.2 | Total number of file attachments available on primary | url
| geo_attachments_synced | Gauge | 10.2 | Number of attachments synced on secondary | url | geo_attachments_synced | Gauge | 10.2 | Number of attachments synced on secondary | url
| geo_attachments_failed | Gauge | 10.2 | Number of attachments failed to sync on secondary | url | geo_attachments_failed | Gauge | 10.2 | Number of attachments failed to sync on secondary | url
| geo_last_event_id | Gauge | 10.2 | Database ID of the latest event log entry on the primary | url | geo_last_event_id | Gauge | 10.2 | Database ID of the latest event log entry on the primary | url
| geo_last_event_timestamp | Gauge | 10.2 | UNIX timestamp of the latest event log entry on the primary | url | geo_last_event_timestamp | Gauge | 10.2 | UNIX timestamp of the latest event log entry on the primary | url
| geo_cursor_last_event_id | Gauge | 10.2 | Last database ID of the event log processed by the secondary | url | geo_cursor_last_event_id | Gauge | 10.2 | Last database ID of the event log processed by the secondary | url
| geo_cursor_last_event_timestamp | Gauge | 10.2 | Last UNIX timestamp of the event log processed by the secondary | url | geo_cursor_last_event_timestamp | Gauge | 10.2 | Last UNIX timestamp of the event log processed by the secondary | url
| geo_status_failed_total | Counter | 10.2 | Number of times retrieving the status from the Geo Node failed | url | geo_status_failed_total | Counter | 10.2 | Number of times retrieving the status from the Geo Node failed | url
| geo_last_successful_status_check_timestamp | Gauge | 10.2 | Last timestamp when the status was successfully updated | url | geo_last_successful_status_check_timestamp | Gauge | 10.2 | Last timestamp when the status was successfully updated | url
| geo_lfs_objects_synced_missing_on_primary | Gauge | 10.7 | Number of LFS objects marked as synced due to the file missing on the primary | url | geo_lfs_objects_synced_missing_on_primary | Gauge | 10.7 | Number of LFS objects marked as synced due to the file missing on the primary | url
| geo_job_artifacts_synced_missing_on_primary | Gauge | 10.7 | Number of job artifacts marked as synced due to the file missing on the primary | url | geo_job_artifacts_synced_missing_on_primary | Gauge | 10.7 | Number of job artifacts marked as synced due to the file missing on the primary | url
| geo_attachments_synced_missing_on_primary | Gauge | 10.7 | Number of attachments marked as synced due to the file missing on the primary | url | geo_attachments_synced_missing_on_primary | Gauge | 10.7 | Number of attachments marked as synced due to the file missing on the primary | url
| geo_repositories_checksummed_count | Gauge | 10.7 | Number of repositories checksummed on primary | url | geo_repositories_checksummed_count | Gauge | 10.7 | Number of repositories checksummed on primary | url
| geo_repositories_checksum_failed_count | Gauge | 10.7 | Number of repositories failed to calculate the checksum on primary | url | geo_repositories_checksum_failed_count | Gauge | 10.7 | Number of repositories failed to calculate the checksum on primary | url
| geo_wikis_checksummed_count | Gauge | 10.7 | Number of wikis checksummed on primary | url | geo_wikis_checksummed_count | Gauge | 10.7 | Number of wikis checksummed on primary | url
| geo_wikis_checksum_failed_count | Gauge | 10.7 | Number of wikis failed to calculate the checksum on primary | url | geo_wikis_checksum_failed_count | Gauge | 10.7 | Number of wikis failed to calculate the checksum on primary | url
| geo_repositories_verified_count | Gauge | 10.7 | Number of repositories verified on secondary | url | geo_repositories_verified_count | Gauge | 10.7 | Number of repositories verified on secondary | url
| geo_repositories_verification_failed_count | Gauge | 10.7 | Number of repositories failed to verify on secondary | url | geo_repositories_verification_failed_count | Gauge | 10.7 | Number of repositories failed to verify on secondary | url
| geo_repositories_checksum_mismatch_count | Gauge | 10.7 | Number of repositories that checksum mismatch on secondary | url | geo_repositories_checksum_mismatch_count | Gauge | 10.7 | Number of repositories that checksum mismatch on secondary | url
| geo_wikis_verified_count | Gauge | 10.7 | Number of wikis verified on secondary | url | geo_wikis_verified_count | Gauge | 10.7 | Number of wikis verified on secondary | url
| geo_wikis_verification_failed_count | Gauge | 10.7 | Number of wikis failed to verify on secondary | url | geo_wikis_verification_failed_count | Gauge | 10.7 | Number of wikis failed to verify on secondary | url
| geo_wikis_checksum_mismatch_count | Gauge | 10.7 | Number of wikis that checksum mismatch on secondary | url | geo_wikis_checksum_mismatch_count | Gauge | 10.7 | Number of wikis that checksum mismatch on secondary | url
| geo_repositories_checked_count | Gauge | 11.1 | Number of repositories that have been checked via `git fsck` | url | geo_repositories_checked_count | Gauge | 11.1 | Number of repositories that have been checked via `git fsck` | url
| geo_repositories_checked_failed_count | Gauge | 11.1 | Number of repositories that have a failure from `git fsck` | url | geo_repositories_checked_failed_count | Gauge | 11.1 | Number of repositories that have a failure from `git fsck` | url
| geo_repositories_retrying_verification_count | Gauge | 11.2 | Number of repository verification failures that Geo is actively trying to correct on secondary | url
| geo_wikis_retrying_verification_count | Gauge | 11.2 | Number of wiki verification failures that Geo is actively trying to correct on secondary | url
### Ruby metrics ### Ruby metrics
......
...@@ -206,6 +206,8 @@ Example response: ...@@ -206,6 +206,8 @@ Example response:
"wikis_verification_failed_count": 3, "wikis_verification_failed_count": 3,
"wikis_verified_in_percentage": "24.39%", "wikis_verified_in_percentage": "24.39%",
"wikis_checksum_mismatch_count": 1, "wikis_checksum_mismatch_count": 1,
"repositories_retrying_verification_count": 1,
"wikis_retrying_verification_count": 3,
"repositories_checked_count": 7, "repositories_checked_count": 7,
"repositories_checked_failed_count": 2, "repositories_checked_failed_count": 2,
"repositories_checked_in_percentage": "17.07%", "repositories_checked_in_percentage": "17.07%",
...@@ -265,6 +267,8 @@ Example response: ...@@ -265,6 +267,8 @@ Example response:
"wikis_verification_failed_count": 3, "wikis_verification_failed_count": 3,
"wikis_verified_in_percentage": "24.39%", "wikis_verified_in_percentage": "24.39%",
"wikis_checksum_mismatch_count": 1, "wikis_checksum_mismatch_count": 1,
"repositories_retrying_verification_count": 4,
"wikis_retrying_verification_count": 2,
"repositories_checked_count": 5, "repositories_checked_count": 5,
"repositories_checked_failed_count": 1, "repositories_checked_failed_count": 1,
"repositories_checked_in_percentage": "12.20%", "repositories_checked_in_percentage": "12.20%",
......
...@@ -88,6 +88,14 @@ module Geo ...@@ -88,6 +88,14 @@ module Geo
Geo::ProjectRegistry.wiki_checksum_mismatch.count Geo::ProjectRegistry.wiki_checksum_mismatch.count
end end
# Counts the registries returned by
# Geo::ProjectRegistry.repositories_retrying_verification.
def count_repositories_retrying_verification
  retrying_registries = Geo::ProjectRegistry.repositories_retrying_verification
  retrying_registries.count
end
# Counts the registries returned by
# Geo::ProjectRegistry.wikis_retrying_verification.
def count_wikis_retrying_verification
  retrying_registries = Geo::ProjectRegistry.wikis_retrying_verification
  retrying_registries.count
end
def count_verification_failed_repositories def count_verification_failed_repositories
find_verification_failed_project_registries('repository').count find_verification_failed_project_registries('repository').count
end end
......
...@@ -53,6 +53,20 @@ class Geo::ProjectRegistry < Geo::BaseRegistry ...@@ -53,6 +53,20 @@ class Geo::ProjectRegistry < Geo::BaseRegistry
where(repository_checksum_mismatch.or(wiki_checksum_mismatch)) where(repository_checksum_mismatch.or(wiki_checksum_mismatch))
end end
# Registries whose repository verification retry counter is above zero
# AND whose repository is flagged for resync.
def self.repositories_retrying_verification
  has_retried = arel_table[:repository_verification_retry_count].gt(0)
  needs_resync = arel_table[:resync_repository].eq(true)

  where(has_retried.and(needs_resync))
end
# Registries whose wiki verification retry counter is above zero
# AND whose wiki is flagged for resync.
def self.wikis_retrying_verification
  has_retried = arel_table[:wiki_verification_retry_count].gt(0)
  needs_resync = arel_table[:resync_wiki].eq(true)

  where(has_retried.and(needs_resync))
end
def self.retry_due def self.retry_due
where( where(
arel_table[:repository_retry_at].lt(Time.now) arel_table[:repository_retry_at].lt(Time.now)
......
...@@ -77,7 +77,9 @@ class GeoNodeStatus < ActiveRecord::Base ...@@ -77,7 +77,9 @@ class GeoNodeStatus < ActiveRecord::Base
hashed_storage_migrated_max_id: 'Highest ID present in projects migrated to hashed storage', hashed_storage_migrated_max_id: 'Highest ID present in projects migrated to hashed storage',
hashed_storage_attachments_max_id: 'Highest ID present in attachments migrated to hashed storage', hashed_storage_attachments_max_id: 'Highest ID present in attachments migrated to hashed storage',
repositories_checked_count: 'Number of repositories checked', repositories_checked_count: 'Number of repositories checked',
repositories_checked_failed_count: 'Number of failed repositories checked' repositories_checked_failed_count: 'Number of failed repositories checked',
repositories_retrying_verification_count: 'Number of repositories verification failures that Geo is actively trying to correct on secondary',
wikis_retrying_verification_count: 'Number of wikis verification failures that Geo is actively trying to correct on secondary'
}.freeze }.freeze
EXPIRATION_IN_MINUTES = 5 EXPIRATION_IN_MINUTES = 5
...@@ -238,6 +240,8 @@ class GeoNodeStatus < ActiveRecord::Base ...@@ -238,6 +240,8 @@ class GeoNodeStatus < ActiveRecord::Base
self.wikis_verified_count = projects_finder.count_verified_wikis self.wikis_verified_count = projects_finder.count_verified_wikis
self.wikis_verification_failed_count = projects_finder.count_verification_failed_wikis self.wikis_verification_failed_count = projects_finder.count_verification_failed_wikis
self.wikis_checksum_mismatch_count = projects_finder.count_wikis_checksum_mismatch self.wikis_checksum_mismatch_count = projects_finder.count_wikis_checksum_mismatch
self.repositories_retrying_verification_count = projects_finder.count_repositories_retrying_verification
self.wikis_retrying_verification_count = projects_finder.count_wikis_retrying_verification
end end
end end
......
# frozen_string_literal: true
# Adds two integer columns to geo_node_statuses that hold the
# "retrying verification" counts for repositories and wikis.
class AddVerificationRetryCountsToGeoNodeStatuses < ActiveRecord::Migration
  DOWNTIME = false

  def change
    # Each add_column is issued separately, in the same order as before.
    %i[
      repositories_retrying_verification_count
      wikis_retrying_verification_count
    ].each do |column_name|
      add_column :geo_node_statuses, column_name, :integer
    end
  end
end
...@@ -331,6 +331,9 @@ module EE ...@@ -331,6 +331,9 @@ module EE
end end
expose :wikis_checksum_mismatch_count expose :wikis_checksum_mismatch_count
expose :repositories_retrying_verification_count
expose :wikis_retrying_verification_count
expose :replication_slots_count expose :replication_slots_count
expose :replication_slots_used_count expose :replication_slots_used_count
expose :replication_slots_used_in_percentage do |node| expose :replication_slots_used_in_percentage do |node|
......
...@@ -33,6 +33,8 @@ FactoryBot.define do ...@@ -33,6 +33,8 @@ FactoryBot.define do
wikis_verified_count 499 wikis_verified_count 499
wikis_verification_failed_count 99 wikis_verification_failed_count 99
wikis_checksum_mismatch_count 10 wikis_checksum_mismatch_count 10
repositories_retrying_verification_count 25
wikis_retrying_verification_count 3
last_event_id 2 last_event_id 2
last_event_timestamp { Time.now.to_i } last_event_timestamp { Time.now.to_i }
cursor_last_event_id 1 cursor_last_event_id 1
......
...@@ -40,6 +40,8 @@ ...@@ -40,6 +40,8 @@
"wikis_verification_failed_count", "wikis_verification_failed_count",
"wikis_verified_in_percentage", "wikis_verified_in_percentage",
"wikis_checksum_mismatch_count", "wikis_checksum_mismatch_count",
"repositories_retrying_verification_count",
"wikis_retrying_verification_count",
"repositories_checked_count", "repositories_checked_count",
"repositories_checked_failed_count", "repositories_checked_failed_count",
"repositories_checked_in_percentage", "repositories_checked_in_percentage",
...@@ -103,6 +105,8 @@ ...@@ -103,6 +105,8 @@
"wikis_verification_failed_count": { "type": ["integer", "null"] }, "wikis_verification_failed_count": { "type": ["integer", "null"] },
"wikis_verified_in_percentage": { "type": "string" }, "wikis_verified_in_percentage": { "type": "string" },
"wikis_checksum_mismatch_count": { "type": ["integer", "null"] }, "wikis_checksum_mismatch_count": { "type": ["integer", "null"] },
"repositories_retrying_verification_count": { "type": ["integer", "null"] },
"wikis_retrying_verification_count": { "type": ["integer", "null"] },
"repositories_checked_count": { "type": ["integer", "null"] }, "repositories_checked_count": { "type": ["integer", "null"] },
"repositories_checked_failed_count": { "type": ["integer", "null"] }, "repositories_checked_failed_count": { "type": ["integer", "null"] },
"repositories_checked_in_percentage": { "type": "string" }, "repositories_checked_in_percentage": { "type": "string" },
......
...@@ -712,6 +712,27 @@ describe GeoNodeStatus, :geo do ...@@ -712,6 +712,27 @@ describe GeoNodeStatus, :geo do
end end
end end
# Covers GeoNodeStatus#repositories_retrying_verification_count on a secondary node.
describe '#repositories_retrying_verification_count' do
  before do
    stub_current_geo_node(secondary)
  end

  it 'returns the right number of repositories retrying verification' do
    # Only the first registry has a positive retry count; the nil-count
    # failure and the verified registry are expected to be left out.
    create(:geo_project_registry, :repository_verification_failed, repository_verification_retry_count: 1)
    create(:geo_project_registry, :repository_verification_failed, repository_verification_retry_count: nil)
    create(:geo_project_registry, :repository_verified)

    expect(subject.repositories_retrying_verification_count).to eq(1)
  end

  it 'returns existing value when feature flag is off' do
    allow(Gitlab::Geo).to receive(:repository_verification_enabled?).and_return(false)
    # 25 is presumably the value the persisted status gets from its factory.
    create(:geo_node_status, :healthy, geo_node: secondary)

    expect(subject.repositories_retrying_verification_count).to eq(25)
  end
end
describe '#wikis_verified_count' do describe '#wikis_verified_count' do
before do before do
stub_current_geo_node(secondary) stub_current_geo_node(secondary)
...@@ -773,6 +794,27 @@ describe GeoNodeStatus, :geo do ...@@ -773,6 +794,27 @@ describe GeoNodeStatus, :geo do
end end
end end
# Covers GeoNodeStatus#wikis_retrying_verification_count on a secondary node.
describe '#wikis_retrying_verification_count' do
  before do
    stub_current_geo_node(secondary)
  end

  it 'returns the right number of wikis retrying verification' do
    # Only the first registry has a positive retry count; the nil-count
    # failure and the verified registry are expected to be left out.
    create(:geo_project_registry, :wiki_verification_failed, wiki_verification_retry_count: 1)
    create(:geo_project_registry, :wiki_verification_failed, wiki_verification_retry_count: nil)
    create(:geo_project_registry, :wiki_verified)

    expect(subject.wikis_retrying_verification_count).to eq(1)
  end

  it 'returns existing value when feature flag is off' do
    allow(Gitlab::Geo).to receive(:repository_verification_enabled?).and_return(false)
    # 3 is presumably the value the persisted status gets from its factory.
    create(:geo_node_status, :healthy, geo_node: secondary)

    expect(subject.wikis_retrying_verification_count).to eq(3)
  end
end
describe '#last_event_id and #last_event_date' do describe '#last_event_id and #last_event_date' do
it 'returns nil when no events are available' do it 'returns nil when no events are available' do
expect(subject.last_event_id).to be_nil expect(subject.last_event_id).to be_nil
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment