From 0eff75fa2b6691b6fba31fcc2842f51debd249a9 Mon Sep 17 00:00:00 2001
From: Nick Thomas <nick@gitlab.com>
Date: Mon, 8 Jul 2019 17:11:59 +0100
Subject: [PATCH] Cache branch and tag names as Redis sets

This allows us to check inclusion for the *_exists? methods without
downloading the full list of branch names, which is over 100KiB in size
for gitlab-ce at the moment.
---
 app/models/repository.rb                      | 12 ++-
 .../64251-branch-name-set-cache.yml           |  5 ++
 lib/gitlab/repository_cache_adapter.rb        | 53 ++++++++++++++
 lib/gitlab/repository_set_cache.rb            | 67 +++++++++++++++++
 .../gitlab/repository_cache_adapter_spec.rb   |  7 +-
 spec/lib/gitlab/repository_set_cache_spec.rb  | 73 +++++++++++++++++++
 spec/models/repository_spec.rb                | 52 ++++++++++---
 7 files changed, 252 insertions(+), 17 deletions(-)
 create mode 100644 changelogs/unreleased/64251-branch-name-set-cache.yml
 create mode 100644 lib/gitlab/repository_set_cache.rb
 create mode 100644 spec/lib/gitlab/repository_set_cache_spec.rb

diff --git a/app/models/repository.rb b/app/models/repository.rb
index 9d45a12fa6e..561c6579086 100644
--- a/app/models/repository.rb
+++ b/app/models/repository.rb
@@ -239,13 +239,13 @@ class Repository
   def branch_exists?(branch_name)
     return false unless raw_repository
 
-    branch_names.include?(branch_name)
+    branch_names_include?(branch_name)
   end
 
   def tag_exists?(tag_name)
     return false unless raw_repository
 
-    tag_names.include?(tag_name)
+    tag_names_include?(tag_name)
   end
 
   def ref_exists?(ref)
@@ -561,10 +561,10 @@ class Repository
   end
 
   delegate :branch_names, to: :raw_repository
-  cache_method :branch_names, fallback: []
+  cache_method_as_redis_set :branch_names, fallback: []
 
   delegate :tag_names, to: :raw_repository
-  cache_method :tag_names, fallback: []
+  cache_method_as_redis_set :tag_names, fallback: []
 
   delegate :branch_count, :tag_count, :has_visible_content?, to: :raw_repository
   cache_method :branch_count, fallback: 0
@@ -1126,6 +1126,10 @@ class Repository
     @cache ||= Gitlab::RepositoryCache.new(self)
   end
 
+  def redis_set_cache
+    @redis_set_cache ||= Gitlab::RepositorySetCache.new(self)
+  end
+
   def request_store_cache
     @request_store_cache ||= Gitlab::RepositoryCache.new(self, backend: Gitlab::SafeRequestStore)
   end
diff --git a/changelogs/unreleased/64251-branch-name-set-cache.yml b/changelogs/unreleased/64251-branch-name-set-cache.yml
new file mode 100644
index 00000000000..6ce4bdf5e43
--- /dev/null
+++ b/changelogs/unreleased/64251-branch-name-set-cache.yml
@@ -0,0 +1,5 @@
+---
+title: Cache branch and tag names as Redis sets
+merge_request: 30476
+author:
+type: performance
diff --git a/lib/gitlab/repository_cache_adapter.rb b/lib/gitlab/repository_cache_adapter.rb
index e40c366ed02..75503ee1789 100644
--- a/lib/gitlab/repository_cache_adapter.rb
+++ b/lib/gitlab/repository_cache_adapter.rb
@@ -23,6 +23,37 @@ module Gitlab
         end
       end
 
+      # Caches and strongly memoizes the method as a Redis Set.
+      #
+      # This only works for methods that do not take any arguments. The method
+      # should return an Array of Strings to be cached.
+      #
+      # In addition to overriding the named method, a "name_include?" method is
+      # defined. This uses the "SISMEMBER" query to efficiently check membership
+      # without needing to load the entire set into memory.
+      #
+      # name     - The name of the method to be cached.
+      # fallback - A value to fall back to if the repository does not exist, or
+      #            in case of a Git error. Defaults to nil.
+      def cache_method_as_redis_set(name, fallback: nil)
+        uncached_name = alias_uncached_method(name)
+
+        define_method(name) do
+          cache_method_output_as_redis_set(name, fallback: fallback) do
+            __send__(uncached_name) # rubocop:disable GitlabSecurity/PublicSend
+          end
+        end
+
+        define_method("#{name}_include?") do |value|
+          # If the cache isn't populated, we can't rely on it
+          return redis_set_cache.include?(name, value) if redis_set_cache.exist?(name)
+
+          # Since we have to pull all branch names to populate the cache, use
+          # the data we already have to answer the query just this once
+          __send__(name).include?(value) # rubocop:disable GitlabSecurity/PublicSend
+        end
+      end
+
       # Caches truthy values from the method. All values are strongly memoized,
       # and cached in RequestStore.
       #
@@ -84,6 +115,11 @@ module Gitlab
       raise NotImplementedError
     end
 
+    # RepositorySetCache to be used. Should be overridden by the including class
+    def redis_set_cache
+      raise NotImplementedError
+    end
+
     # List of cached methods. Should be overridden by the including class
     def cached_methods
       raise NotImplementedError
@@ -100,6 +136,18 @@ module Gitlab
       end
     end
 
+    # Caches and strongly memoizes the supplied block as a Redis Set. The result
+    # will be provided as a sorted array.
+    #
+    # name     - The name of the method to be cached.
+    # fallback - A value to fall back to if the repository does not exist, or
+    #            in case of a Git error. Defaults to nil.
+    def cache_method_output_as_redis_set(name, fallback: nil, &block)
+      memoize_method_output(name, fallback: fallback) do
+        redis_set_cache.fetch(name, &block).sort
+      end
+    end
+
     # Caches truthy values from the supplied block. All values are strongly
     # memoized, and cached in RequestStore.
     #
@@ -154,6 +202,7 @@ module Gitlab
         clear_memoization(memoizable_name(name))
       end
 
+      expire_redis_set_method_caches(methods)
       expire_request_store_method_caches(methods)
     end
 
@@ -169,6 +218,10 @@ module Gitlab
       end
     end
 
+    def expire_redis_set_method_caches(methods)
+      methods.each { |name| redis_set_cache.expire(name) }
+    end
+
     # All cached repository methods depend on the existence of a Git repository,
     # so if the repository doesn't exist, we already know not to call it.
     def fallback_early?(method_name)
diff --git a/lib/gitlab/repository_set_cache.rb b/lib/gitlab/repository_set_cache.rb
new file mode 100644
index 00000000000..fb634328a95
--- /dev/null
+++ b/lib/gitlab/repository_set_cache.rb
@@ -0,0 +1,67 @@
+# frozen_string_literal: true
+
+# Interface to the Redis-backed cache store for keys that use a Redis set
+module Gitlab
+  class RepositorySetCache
+    attr_reader :repository, :namespace, :expires_in
+
+    def initialize(repository, extra_namespace: nil, expires_in: 2.weeks)
+      @repository = repository
+      @namespace = "#{repository.full_path}:#{repository.project.id}"
+      @namespace = "#{@namespace}:#{extra_namespace}" if extra_namespace
+      @expires_in = expires_in
+    end
+
+    def cache_key(type)
+      [type, namespace, 'set'].join(':')
+    end
+
+    def expire(key)
+      with { |redis| redis.del(cache_key(key)) }
+    end
+
+    def exist?(key)
+      with { |redis| redis.exists(cache_key(key)) }
+    end
+
+    def read(key)
+      with { |redis| redis.smembers(cache_key(key)) }
+    end
+
+    def write(key, value)
+      full_key = cache_key(key)
+
+      with do |redis|
+        redis.multi do
+          redis.del(full_key)
+
+          # Splitting into groups of 1000 prevents us from creating a too-long
+          # Redis command
+          value.in_groups_of(1000, false) { |subset| redis.sadd(full_key, subset) }
+
+          redis.expire(full_key, expires_in)
+        end
+      end
+
+      value
+    end
+
+    def fetch(key, &block)
+      if exist?(key)
+        read(key)
+      else
+        write(key, yield)
+      end
+    end
+
+    def include?(key, value)
+      with { |redis| redis.sismember(cache_key(key), value) }
+    end
+
+    private
+
+    def with(&blk)
+      Gitlab::Redis::Cache.with(&blk) # rubocop:disable CodeReuse/ActiveRecord
+    end
+  end
+end
diff --git a/spec/lib/gitlab/repository_cache_adapter_spec.rb b/spec/lib/gitlab/repository_cache_adapter_spec.rb
index 0295138fc3a..04fc24b6205 100644
--- a/spec/lib/gitlab/repository_cache_adapter_spec.rb
+++ b/spec/lib/gitlab/repository_cache_adapter_spec.rb
@@ -4,6 +4,7 @@ describe Gitlab::RepositoryCacheAdapter do
   let(:project) { create(:project, :repository) }
   let(:repository) { project.repository }
   let(:cache) { repository.send(:cache) }
+  let(:redis_set_cache) { repository.send(:redis_set_cache) }
 
   describe '#cache_method_output', :use_clean_rails_memory_store_caching do
     let(:fallback) { 10 }
@@ -206,9 +207,11 @@ describe Gitlab::RepositoryCacheAdapter do
   describe '#expire_method_caches' do
     it 'expires the caches of the given methods' do
       expect(cache).to receive(:expire).with(:rendered_readme)
-      expect(cache).to receive(:expire).with(:gitignore)
+      expect(cache).to receive(:expire).with(:branch_names)
+      expect(redis_set_cache).to receive(:expire).with(:rendered_readme)
+      expect(redis_set_cache).to receive(:expire).with(:branch_names)
 
-      repository.expire_method_caches(%i(rendered_readme gitignore))
+      repository.expire_method_caches(%i(rendered_readme branch_names))
     end
 
     it 'does not expire caches for non-existent methods' do
diff --git a/spec/lib/gitlab/repository_set_cache_spec.rb b/spec/lib/gitlab/repository_set_cache_spec.rb
new file mode 100644
index 00000000000..9695e13d842
--- /dev/null
+++ b/spec/lib/gitlab/repository_set_cache_spec.rb
@@ -0,0 +1,73 @@
+require 'spec_helper'
+
+describe Gitlab::RepositorySetCache, :clean_gitlab_redis_cache do
+  let(:project) { create(:project) }
+  let(:repository) { project.repository }
+  let(:namespace) { "#{repository.full_path}:#{project.id}" }
+  let(:cache) { described_class.new(repository) }
+
+  describe '#cache_key' do
+    subject { cache.cache_key(:foo) }
+
+    it 'includes the namespace' do
+      is_expected.to eq("foo:#{namespace}:set")
+    end
+
+    context 'with a given namespace' do
+      let(:extra_namespace) { 'my:data' }
+      let(:cache) { described_class.new(repository, extra_namespace: extra_namespace) }
+
+      it 'includes the full namespace' do
+        is_expected.to eq("foo:#{namespace}:#{extra_namespace}:set")
+      end
+    end
+  end
+
+  describe '#expire' do
+    it 'expires the given key from the cache' do
+      cache.write(:foo, ['value'])
+
+      expect(cache.read(:foo)).to contain_exactly('value')
+      expect(cache.expire(:foo)).to eq(1)
+      expect(cache.read(:foo)).to be_empty
+    end
+  end
+
+  describe '#exist?' do
+    it 'checks whether the key exists' do
+      expect(cache.exist?(:foo)).to be(false)
+
+      cache.write(:foo, ['value'])
+
+      expect(cache.exist?(:foo)).to be(true)
+    end
+  end
+
+  describe '#fetch' do
+    let(:blk) { -> { ['block value'] } }
+
+    subject { cache.fetch(:foo, &blk) }
+
+    it 'fetches the key from the cache when filled' do
+      cache.write(:foo, ['value'])
+
+      is_expected.to contain_exactly('value')
+    end
+
+    it 'writes the value of the provided block when empty' do
+      cache.expire(:foo)
+
+      is_expected.to contain_exactly('block value')
+      expect(cache.read(:foo)).to contain_exactly('block value')
+    end
+  end
+
+  describe '#include?' do
+    it 'checks inclusion in the Redis set' do
+      cache.write(:foo, ['value'])
+
+      expect(cache.include?(:foo, 'value')).to be(true)
+      expect(cache.include?(:foo, 'bar')).to be(false)
+    end
+  end
+end
diff --git a/spec/models/repository_spec.rb b/spec/models/repository_spec.rb
index e68de2e73a8..423334c66f2 100644
--- a/spec/models/repository_spec.rb
+++ b/spec/models/repository_spec.rb
@@ -1223,36 +1223,66 @@ describe Repository do
   end
 
   describe '#branch_exists?' do
-    it 'uses branch_names' do
-      allow(repository).to receive(:branch_names).and_return(['foobar'])
+    let(:branch) { repository.root_ref }
 
-      expect(repository.branch_exists?('foobar')).to eq(true)
-      expect(repository.branch_exists?('master')).to eq(false)
+    subject { repository.branch_exists?(branch) }
+
+    it 'delegates to branch_names when the cache is empty' do
+      repository.expire_branches_cache
+
+      expect(repository).to receive(:branch_names).and_call_original
+      is_expected.to eq(true)
+    end
+
+    it 'uses redis set caching when the cache is filled' do
+      repository.branch_names # ensure the branch name cache is filled
+
+      expect(repository)
+        .to receive(:branch_names_include?)
+        .with(branch)
+        .and_call_original
+
+      is_expected.to eq(true)
     end
   end
 
   describe '#tag_exists?' do
-    it 'uses tag_names' do
-      allow(repository).to receive(:tag_names).and_return(['foobar'])
+    let(:tag) { repository.tags.first.name }
+
+    subject { repository.tag_exists?(tag) }
+
+    it 'delegates to tag_names when the cache is empty' do
+      repository.expire_tags_cache
+
+      expect(repository).to receive(:tag_names).and_call_original
+      is_expected.to eq(true)
+    end
+
+    it 'uses redis set caching when the cache is filled' do
+      repository.tag_names # ensure the tag name cache is filled
+
+      expect(repository)
+        .to receive(:tag_names_include?)
+        .with(tag)
+        .and_call_original
 
-      expect(repository.tag_exists?('foobar')).to eq(true)
-      expect(repository.tag_exists?('master')).to eq(false)
+      is_expected.to eq(true)
     end
   end
 
-  describe '#branch_names', :use_clean_rails_memory_store_caching do
+  describe '#branch_names', :clean_gitlab_redis_cache do
     let(:fake_branch_names) { ['foobar'] }
 
     it 'gets cached across Repository instances' do
       allow(repository.raw_repository).to receive(:branch_names).once.and_return(fake_branch_names)
 
-      expect(repository.branch_names).to eq(fake_branch_names)
+      expect(repository.branch_names).to match_array(fake_branch_names)
 
       fresh_repository = Project.find(project.id).repository
       expect(fresh_repository.object_id).not_to eq(repository.object_id)
 
       expect(fresh_repository.raw_repository).not_to receive(:branch_names)
-      expect(fresh_repository.branch_names).to eq(fake_branch_names)
+      expect(fresh_repository.branch_names).to match_array(fake_branch_names)
     end
   end
 
-- 
2.30.9