# Snapshot of a Geo node's status: persisted counters plus a set of
# in-memory-only metrics (declared with attr_accessor) that are exposed to
# Prometheus but not stored in the database.
class GeoNodeStatus < ActiveRecord::Base
  include ShaAttribute

  belongs_to :geo_node

  delegate :selective_sync_type, to: :geo_node

  after_initialize :initialize_feature_flags

  attr_accessor :storage_shards

  attr_accessor :repository_verification_enabled

  # Prometheus metrics, no need to store them in the database
  attr_accessor :event_log_count, :event_log_max_id,
    :repository_created_max_id, :repository_updated_max_id,
    :repository_deleted_max_id, :repository_renamed_max_id, :repositories_changed_max_id,
    :lfs_object_deleted_max_id, :job_artifact_deleted_max_id,
    :lfs_objects_registry_count, :job_artifacts_registry_count, :attachments_registry_count,
    :hashed_storage_migrated_max_id, :hashed_storage_attachments_max_id,
    :repositories_checked_count, :repositories_checked_failed_count

  sha_attribute :storage_configuration_digest

  # It's needed for backward compatibility as we expose them via public API
  alias_attribute :wikis_count, :projects_count
  alias_attribute :repositories_count, :projects_count

  # Be sure to keep this consistent with Prometheus naming conventions
  PROMETHEUS_METRICS = {
    db_replication_lag_seconds: 'Database replication lag (seconds)',
    repositories_count: 'Total number of repositories available on primary',
    repositories_synced_count: 'Number of repositories synced on secondary',
    repositories_failed_count: 'Number of repositories failed to sync on secondary',
    wikis_synced_count: 'Number of wikis synced on secondary',
    wikis_failed_count: 'Number of wikis failed to sync on secondary',
    repositories_checksummed_count: 'Number of repositories checksummed on primary',
    repositories_checksum_failed_count: 'Number of repositories failed to calculate the checksum on primary',
    wikis_checksummed_count: 'Number of wikis checksummed on primary',
    wikis_checksum_failed_count: 'Number of wikis failed to calculate the checksum on primary',
    repositories_verified_count: 'Number of repositories verified on secondary',
    repositories_verification_failed_count: 'Number of repositories failed to verify on secondary',
    repositories_checksum_mismatch_count: 'Number of repositories that checksum mismatch on secondary',
    wikis_verified_count: 'Number of wikis verified on secondary',
    wikis_verification_failed_count: 'Number of wikis failed to verify on secondary',
    wikis_checksum_mismatch_count: 'Number of wikis that checksum mismatch on secondary',
    lfs_objects_count: 'Total number of syncable LFS objects available on primary',
    lfs_objects_synced_count: 'Number of syncable LFS objects synced on secondary',
    lfs_objects_failed_count: 'Number of syncable LFS objects failed to sync on secondary',
    lfs_objects_registry_count: 'Number of LFS objects in the registry',
    lfs_objects_synced_missing_on_primary_count: 'Number of LFS objects marked as synced due to the file missing on the primary',
    job_artifacts_count: 'Total number of syncable job artifacts available on primary',
    job_artifacts_synced_count: 'Number of syncable job artifacts synced on secondary',
    job_artifacts_failed_count: 'Number of syncable job artifacts failed to sync on secondary',
    job_artifacts_registry_count: 'Number of job artifacts in the registry',
    job_artifacts_synced_missing_on_primary_count: 'Number of job artifacts marked as synced due to the file missing on the primary',
    attachments_count: 'Total number of syncable file attachments available on primary',
    attachments_synced_count: 'Number of syncable file attachments synced on secondary',
    attachments_failed_count: 'Number of syncable file attachments failed to sync on secondary',
    attachments_registry_count: 'Number of attachments in the registry',
    attachments_synced_missing_on_primary_count: 'Number of attachments marked as synced due to the file missing on the primary',
    replication_slots_count: 'Total number of replication slots on the primary',
    replication_slots_used_count: 'Number of replication slots in use on the primary',
    replication_slots_max_retained_wal_bytes: 'Maximum number of bytes retained in the WAL on the primary',
    last_event_id: 'Database ID of the latest event log entry on the primary',
    last_event_timestamp: 'Time of the latest event log entry on the primary',
    cursor_last_event_id: 'Last database ID of the event log processed by the secondary',
    cursor_last_event_timestamp: 'Time of the event log processed by the secondary',
    last_successful_status_check_timestamp: 'Time when Geo node status was updated internally',
    status_message: 'Summary of health status',
    event_log_count: 'Number of entries in the Geo event log',
    event_log_max_id: 'Highest ID present in the Geo event log',
    repository_created_max_id: 'Highest ID present in repositories created',
    repository_updated_max_id: 'Highest ID present in repositories updated',
    repository_deleted_max_id: 'Highest ID present in repositories deleted',
    repository_renamed_max_id: 'Highest ID present in repositories renamed',
    repositories_changed_max_id: 'Highest ID present in repositories changed',
    lfs_object_deleted_max_id: 'Highest ID present in LFS objects deleted',
    job_artifact_deleted_max_id: 'Highest ID present in job artifacts deleted',
    hashed_storage_migrated_max_id: 'Highest ID present in projects migrated to hashed storage',
    hashed_storage_attachments_max_id: 'Highest ID present in attachments migrated to hashed storage',
    repositories_checked_count: 'Number of repositories checked',
    repositories_checked_failed_count: 'Number of failed repositories checked',
    repositories_retrying_verification_count: 'Number of repositories verification failures that Geo is actively trying to correct on secondary',
    wikis_retrying_verification_count: 'Number of wikis verification failures that Geo is actively trying to correct on secondary'
  }.freeze

  # A status whose updated_at is older than this many minutes is reported
  # as outdated (see #outdated?).
  EXPIRATION_IN_MINUTES = 5
  HEALTHY_STATUS = 'Healthy'.freeze
  UNHEALTHY_STATUS = 'Unhealthy'.freeze

  # Builds (or finds) the status of the current Geo node and refreshes it
  # with freshly computed data. The record is saved only on a primary node.
  #
  # @return [GeoNodeStatus, nil] nil when there is no current Geo node
  def self.current_node_status
    current_node = Gitlab::Geo.current_node
    return unless current_node

    status = current_node.find_or_build_status
    status.load_data_from_current_node
    status.save if Gitlab::Geo.primary?

    status
  end

  # Returns a status built from the cached attributes, if any. On a cache
  # miss a metrics update worker is scheduled and nil is returned.
  #
  # @return [GeoNodeStatus, nil]
  def self.fast_current_node_status
    attrs = Rails.cache.read(cache_key)

    if attrs
      new(attrs)
    else
      spawn_worker
      nil
    end
  end

  # Enqueues the worker that recomputes the node metrics.
  def self.spawn_worker
    ::Geo::MetricsUpdateWorker.perform_async
  end

  # Cache key under which the current node's status attributes are stored.
  def self.cache_key
    "geo-node:#{Gitlab::Geo.current_node.id}:status"
  end

  # Builds a new, unsaved status from a hash of attributes, keeping only
  # the keys listed in .allowed_params.
  #
  # The input hash is left untouched: a filtered copy is used instead of
  # destructively stripping keys from the caller's object.
  #
  # @param json_data [Hash] attributes keyed by name
  # @return [GeoNodeStatus]
  def self.from_json(json_data)
    permitted = json_data.slice(*allowed_params)

    GeoNodeStatus.new(HashWithIndifferentAccess.new(permitted))
  end

  EXCLUDED_PARAMS = %w[id created_at].freeze
  EXTRA_PARAMS = %w[
    last_event_timestamp
    cursor_last_event_timestamp
    storage_shards
  ].freeze

  # Attribute names accepted by .from_json: every column except
  # EXCLUDED_PARAMS, plus the virtual attributes in EXTRA_PARAMS.
  #
  # @return [Array<String>]
  def self.allowed_params
    self.column_names - EXCLUDED_PARAMS + EXTRA_PARAMS
  end

  # after_initialize callback: records whether repository verification is
  # enabled at the time the object is instantiated.
  def initialize_feature_flags
    self.repository_verification_enabled = Gitlab::Geo.repository_verification_enabled?
  end

  # Writes this record's attributes to the cache under .cache_key.
  def update_cache!
    Rails.cache.write(self.class.cache_key, attributes)
  end

  # Populates this status with data computed on the current node: health
  # check message, event log positions, object counts, storage
  # configuration and version, followed by the primary- or
  # secondary-specific data.
  #
  # @return [GeoNodeStatus] self
  def load_data_from_current_node
    self.status_message =
      begin
        HealthCheck::Utils.process_checks(['geo'])
      rescue NotImplementedError => e
        e.to_s
      end

    latest_event = Geo::EventLog.latest_event
    self.last_event_id = latest_event&.id
    self.last_event_date = latest_event&.created_at
    self.projects_count = projects_finder.count_projects
    self.lfs_objects_count = lfs_objects_finder.count_syncable
    self.job_artifacts_count = job_artifacts_finder.count_syncable
    self.attachments_count = attachments_finder.count_syncable
    self.last_successful_status_check_at = Time.now
    self.storage_shards = StorageShard.all
    self.storage_configuration_digest = StorageShard.build_digest

    self.version = Gitlab::VERSION
    self.revision = Gitlab.revision

    self.event_log_count = Geo::EventLog.count
    # Geo::PruneEventLogWorker might remove old events, so log maximum id
    self.event_log_max_id = Geo::EventLog.maximum(:id)
    self.repository_created_max_id = Geo::RepositoryCreatedEvent.maximum(:id)
    self.repository_updated_max_id = Geo::RepositoryUpdatedEvent.maximum(:id)
    self.repository_deleted_max_id = Geo::RepositoryDeletedEvent.maximum(:id)
    self.repository_renamed_max_id = Geo::RepositoryRenamedEvent.maximum(:id)
    self.repositories_changed_max_id = Geo::RepositoriesChangedEvent.maximum(:id)
    self.lfs_object_deleted_max_id = Geo::LfsObjectDeletedEvent.maximum(:id)
    self.job_artifact_deleted_max_id = Geo::JobArtifactDeletedEvent.maximum(:id)
    self.hashed_storage_migrated_max_id = Geo::HashedStorageMigratedEvent.maximum(:id)
    self.hashed_storage_attachments_max_id = Geo::HashedStorageAttachmentsEvent.maximum(:id)

    load_primary_data
    load_secondary_data

    self
  end

  # Loads the data that is only computed on a primary node: replication
  # slot figures, checksum counters (when verification is enabled) and
  # repository check counters. Does nothing on other nodes.
  def load_primary_data
    return unless Gitlab::Geo.primary?

    self.replication_slots_count = geo_node.replication_slots_count
    self.replication_slots_used_count = geo_node.replication_slots_used_count
    self.replication_slots_max_retained_wal_bytes = geo_node.replication_slots_max_retained_wal_bytes

    if repository_verification_enabled
      self.repositories_checksummed_count = repository_verification_finder.count_verified_repositories
      self.repositories_checksum_failed_count = repository_verification_finder.count_verification_failed_repositories
      self.wikis_checksummed_count = repository_verification_finder.count_verified_wikis
      self.wikis_checksum_failed_count = repository_verification_finder.count_verification_failed_wikis
    end

    self.repositories_checked_count = Project.where.not(last_repository_check_at: nil).count
    self.repositories_checked_failed_count = Project.where(last_repository_check_failed: true).count
  end

  # Loads the data that is only computed on a secondary node: replication
  # lag, cursor position, sync/registry counters, verification counters
  # and repository check counters. Does nothing on other nodes.
  def load_secondary_data
    return unless Gitlab::Geo.secondary?

    self.db_replication_lag_seconds = Gitlab::Geo::HealthCheck.db_replication_lag_seconds
    self.cursor_last_event_id = current_cursor_last_event_id
    self.cursor_last_event_date = Geo::EventLog.find_by(id: self.cursor_last_event_id)&.created_at
    self.repositories_synced_count = projects_finder.count_synced_repositories
    self.repositories_failed_count = projects_finder.count_failed_repositories
    self.wikis_synced_count = projects_finder.count_synced_wikis
    self.wikis_failed_count = projects_finder.count_failed_wikis
    self.lfs_objects_synced_count = lfs_objects_finder.count_synced
    self.lfs_objects_failed_count = lfs_objects_finder.count_failed
    self.lfs_objects_registry_count = lfs_objects_finder.count_registry
    self.lfs_objects_synced_missing_on_primary_count = lfs_objects_finder.count_synced_missing_on_primary
    self.job_artifacts_synced_count = job_artifacts_finder.count_synced
    self.job_artifacts_failed_count = job_artifacts_finder.count_failed
    self.job_artifacts_registry_count = job_artifacts_finder.count_registry
    self.job_artifacts_synced_missing_on_primary_count = job_artifacts_finder.count_synced_missing_on_primary
    self.attachments_synced_count = attachments_finder.count_synced
    self.attachments_failed_count = attachments_finder.count_failed
    self.attachments_registry_count = attachments_finder.count_registry
    self.attachments_synced_missing_on_primary_count = attachments_finder.count_synced_missing_on_primary

    load_verification_data

    self.repositories_checked_count = Geo::ProjectRegistry.where.not(last_repository_check_at: nil).count
    self.repositories_checked_failed_count = Geo::ProjectRegistry.where(last_repository_check_failed: true).count
  end

  # Loads the verification counters from the project registry finder.
  # Skipped when repository verification is not enabled.
  def load_verification_data
    return unless repository_verification_enabled

    self.repositories_verified_count = projects_finder.count_verified_repositories
    self.repositories_verification_failed_count = projects_finder.count_verification_failed_repositories
    self.repositories_checksum_mismatch_count = projects_finder.count_repositories_checksum_mismatch
    self.wikis_verified_count = projects_finder.count_verified_wikis
    self.wikis_verification_failed_count = projects_finder.count_verification_failed_wikis
    self.wikis_checksum_mismatch_count = projects_finder.count_wikis_checksum_mismatch
    self.repositories_retrying_verification_count = projects_finder.count_repositories_retrying_verification
    self.wikis_retrying_verification_count = projects_finder.count_wikis_retrying_verification
  end

  # The smaller of the lowest tracked event gap ID and the last processed
  # event ID, ignoring whichever is missing.
  #
  # @return [Integer, nil] nil on a non-secondary node or when neither
  #   value is available
  def current_cursor_last_event_id
    return unless Gitlab::Geo.secondary?

    min_gap_id = ::Gitlab::Geo::EventGapTracking.min_gap_id
    last_processed_id = Geo::EventLogState.last_processed&.event_id

    [min_gap_id, last_processed_id].compact.min
  end

  # True when the status is fresh and its message reports no problem.
  def healthy?
    !outdated? && status_message_healthy?
  end

  # Human-readable health: a staleness notice when the status is outdated,
  # otherwise the stored status message.
  def health
    if outdated?
      return "Status has not been updated in the past #{EXPIRATION_IN_MINUTES} minutes"
    end

    status_message
  end

  # @return [String] HEALTHY_STATUS or UNHEALTHY_STATUS
  def health_status
    healthy? ? HEALTHY_STATUS : UNHEALTHY_STATUS
  end

  # True when updated_at is older than EXPIRATION_IN_MINUTES. A status
  # without updated_at is not considered outdated.
  def outdated?
    return false unless updated_at

    updated_at < EXPIRATION_IN_MINUTES.minutes.ago
  end

  # A blank message or the literal HEALTHY_STATUS both count as healthy.
  def status_message_healthy?
    status_message.blank? || status_message == HEALTHY_STATUS
  end

  # The *_timestamp accessors below expose the corresponding time
  # attributes as integers (via #to_i) and accept values that are passed
  # to Time.at.

  def last_successful_status_check_timestamp
    self.last_successful_status_check_at.to_i
  end

  def last_successful_status_check_timestamp=(value)
    self.last_successful_status_check_at = Time.at(value)
  end

  def last_event_timestamp
    self.last_event_date.to_i
  end

  def last_event_timestamp=(value)
    self.last_event_date = Time.at(value)
  end

  def cursor_last_event_timestamp
    self.cursor_last_event_date.to_i
  end

  def cursor_last_event_timestamp=(value)
    self.cursor_last_event_date = Time.at(value)
  end

  # The *_in_percentage readers below all delegate to #calc_percentage
  # with the relevant total and partial counters.

  def repositories_synced_in_percentage
    calc_percentage(projects_count, repositories_synced_count)
  end

  def wikis_synced_in_percentage
    calc_percentage(projects_count, wikis_synced_count)
  end

  def repositories_checksummed_in_percentage
    calc_percentage(projects_count, repositories_checksummed_count)
  end

  def wikis_checksummed_in_percentage
    calc_percentage(projects_count, wikis_checksummed_count)
  end

  def repositories_verified_in_percentage
    calc_percentage(projects_count, repositories_verified_count)
  end

  def wikis_verified_in_percentage
    calc_percentage(projects_count, wikis_verified_count)
  end

  def repositories_checked_in_percentage
    calc_percentage(projects_count, repositories_checked_count)
  end

  def lfs_objects_synced_in_percentage
    calc_percentage(lfs_objects_count, lfs_objects_synced_count)
  end

  def job_artifacts_synced_in_percentage
    calc_percentage(job_artifacts_count, job_artifacts_synced_count)
  end

  def attachments_synced_in_percentage
    calc_percentage(attachments_count, attachments_synced_count)
  end

  def replication_slots_used_in_percentage
    calc_percentage(replication_slots_count, replication_slots_used_count)
  end

  # Whether this node's storage configuration digest equals the primary's.
  # Always true for a primary node; false when either digest is missing.
  def storage_shards_match?
    return true if geo_node.primary?
    return false unless storage_configuration_digest && primary_storage_digest

    storage_configuration_digest == primary_storage_digest
  end

  # Hash-style read access: dispatches to the public method named by key.
  def [](key)
    public_send(key) # rubocop:disable GitlabSecurity/PublicSend
  end

  private

  # Memoized storage configuration digest of the primary node's status.
  def primary_storage_digest
    @primary_storage_digest ||= Gitlab::Geo.primary_node.find_or_build_status.storage_configuration_digest
  end

  # The finders below are memoized per instance.

  def attachments_finder
    @attachments_finder ||= Geo::AttachmentRegistryFinder.new(current_node: geo_node)
  end

  def lfs_objects_finder
    @lfs_objects_finder ||= Geo::LfsObjectRegistryFinder.new(current_node: geo_node)
  end

  def job_artifacts_finder
    @job_artifacts_finder ||= Geo::JobArtifactRegistryFinder.new(current_node: geo_node)
  end

  def projects_finder
    @projects_finder ||= Geo::ProjectRegistryFinder.new(current_node: geo_node)
  end

  def repository_verification_finder
    @repository_verification_finder ||= Geo::RepositoryVerificationFinder.new
  end

  # Percentage of count relative to total.
  #
  # @return [Numeric] 0 when total is blank or zero, otherwise a Float
  def calc_percentage(total, count)
    return 0 if total.blank? || total.zero?

    (count.to_f / total.to_f) * 100.0
  end
end