Commit 573ec5c4 authored by Robert May's avatar Robert May Committed by Nick Thomas

Cached binary detection [RUN ALL RSPEC] [RUN AS-IF-FOSS]

parent 047ed0b5
---
name: cached_encoding_detection
introduced_by_url: https://gitlab.com/gitlab-org/gitlab/-/merge_requests/60128
rollout_issue_url: https://gitlab.com/gitlab-org/gitlab/-/issues/328819
milestone: '13.12'
type: development
group: group::source code
default_enabled: false
...@@ -38,7 +38,7 @@ module Gitlab ...@@ -38,7 +38,7 @@ module Gitlab
# If Charlock says its binary # If Charlock says its binary
else else
detect_encoding[:type] == :binary find_encoding[:type] == :binary
end end
end end
...@@ -137,23 +137,25 @@ module Gitlab ...@@ -137,23 +137,25 @@ module Gitlab
end end
def ruby_encoding def ruby_encoding
if hash = detect_encoding if hash = find_encoding
hash[:ruby_encoding] hash[:ruby_encoding]
end end
end end
def encoding def encoding
if hash = detect_encoding if hash = find_encoding
hash[:encoding] hash[:encoding]
end end
end end
def detect_encoding
@detect_encoding ||= CharlockHolmes::EncodingDetector.new.detect(data) if data # rubocop:disable Gitlab/ModuleWithInstanceVariables
end
def empty? def empty?
data.nil? || data == "" data.nil? || data == ""
end end
private
def find_encoding
@find_encoding ||= Gitlab::EncodingHelper.detect_encoding(data) if data # rubocop:disable Gitlab/ModuleWithInstanceVariables
end
end end
end end
...@@ -20,7 +20,7 @@ module Gitlab ...@@ -20,7 +20,7 @@ module Gitlab
return message if message.valid_encoding? return message if message.valid_encoding?
# return message if message type is binary # return message if message type is binary
detect = CharlockHolmes::EncodingDetector.detect(message) detect = detect_encoding(message)
return message.force_encoding("BINARY") if detect_binary?(message, detect) return message.force_encoding("BINARY") if detect_binary?(message, detect)
if detect && detect[:encoding] && detect[:confidence] > ENCODING_CONFIDENCE_THRESHOLD if detect && detect[:encoding] && detect[:confidence] > ENCODING_CONFIDENCE_THRESHOLD
...@@ -37,16 +37,30 @@ module Gitlab ...@@ -37,16 +37,30 @@ module Gitlab
"--broken encoding: #{encoding}" "--broken encoding: #{encoding}"
end end
def detect_encoding(data, limit: CharlockHolmes::EncodingDetector::DEFAULT_BINARY_SCAN_LEN, cache_key: nil)
return if data.nil?
if Feature.enabled?(:cached_encoding_detection, type: :development, default_enabled: :yaml)
return CharlockHolmes::EncodingDetector.new(limit).detect(data) unless cache_key.present?
Rails.cache.fetch([:detect_binary, CharlockHolmes::VERSION, cache_key], expires_in: 1.week) do
CharlockHolmes::EncodingDetector.new(limit).detect(data)
end
else
CharlockHolmes::EncodingDetector.new(limit).detect(data)
end
end
def detect_binary?(data, detect = nil) def detect_binary?(data, detect = nil)
detect ||= CharlockHolmes::EncodingDetector.detect(data) detect ||= detect_encoding(data)
detect && detect[:type] == :binary && detect[:confidence] == 100 detect && detect[:type] == :binary && detect[:confidence] == 100
end end
def detect_libgit2_binary?(data) # EncodingDetector checks the first 1024 * 1024 bytes for NUL byte, libgit2 checks
# EncodingDetector checks the first 1024 * 1024 bytes for NUL byte, libgit2 checks # only the first 8000 (https://github.com/libgit2/libgit2/blob/2ed855a9e8f9af211e7274021c2264e600c0f86b/src/filter.h#L15),
# only the first 8000 (https://github.com/libgit2/libgit2/blob/2ed855a9e8f9af211e7274021c2264e600c0f86b/src/filter.h#L15), # which is what we use below to keep a consistent behavior.
# which is what we use below to keep a consistent behavior. def detect_libgit2_binary?(data, cache_key: nil)
detect = CharlockHolmes::EncodingDetector.new(8000).detect(data) detect = detect_encoding(data, limit: 8000, cache_key: cache_key)
detect && detect[:type] == :binary detect && detect[:type] == :binary
end end
...@@ -54,7 +68,8 @@ module Gitlab ...@@ -54,7 +68,8 @@ module Gitlab
message = force_encode_utf8(message) message = force_encode_utf8(message)
return message if message.valid_encoding? return message if message.valid_encoding?
detect = CharlockHolmes::EncodingDetector.detect(message) detect = detect_encoding(message)
if detect && detect[:encoding] if detect && detect[:encoding]
begin begin
CharlockHolmes::Converter.convert(message, detect[:encoding], 'UTF-8') CharlockHolmes::Converter.convert(message, detect[:encoding], 'UTF-8')
......
...@@ -110,8 +110,8 @@ module Gitlab ...@@ -110,8 +110,8 @@ module Gitlab
end end
end end
def binary?(data) def binary?(data, cache_key: nil)
EncodingHelper.detect_libgit2_binary?(data) EncodingHelper.detect_libgit2_binary?(data, cache_key: cache_key)
end end
def size_could_be_lfs?(size) def size_could_be_lfs?(size)
......
...@@ -41,7 +41,7 @@ module Gitlab ...@@ -41,7 +41,7 @@ module Gitlab
size: blob_data[:size], size: blob_data[:size],
commit_id: blob_data[:revision], commit_id: blob_data[:revision],
data: data, data: data,
binary: Gitlab::Git::Blob.binary?(data) binary: Gitlab::Git::Blob.binary?(data, cache_key: blob_data[:oid])
) )
end end
end end
......
...@@ -2,7 +2,7 @@ ...@@ -2,7 +2,7 @@
require 'spec_helper' require 'spec_helper'
RSpec.describe 'Diff file viewer', :js do RSpec.describe 'Diff file viewer', :js, :with_clean_rails_cache do
let(:project) { create(:project, :public, :repository) } let(:project) { create(:project, :public, :repository) }
def visit_commit(sha, anchor: nil) def visit_commit(sha, anchor: nil)
......
...@@ -216,4 +216,63 @@ RSpec.describe Gitlab::EncodingHelper do ...@@ -216,4 +216,63 @@ RSpec.describe Gitlab::EncodingHelper do
expect(test).not_to eq(io_stream) expect(test).not_to eq(io_stream)
end end
end end
describe '#detect_encoding' do
subject { ext_class.detect_encoding(data, **kwargs) }
let(:data) { binary_string }
let(:kwargs) { {} }
shared_examples 'detects encoding' do
it { is_expected.to be_a(Hash) }
it 'correctly detects the binary' do
expect(subject[:type]).to eq(:binary)
end
context 'data is nil' do
let(:data) { nil }
it { is_expected.to be_nil }
end
context 'limit is provided' do
let(:kwargs) do
{ limit: 10 }
end
it 'correctly detects the binary' do
expect(subject[:type]).to eq(:binary)
end
end
end
context 'cached_encoding_detection is enabled' do
before do
stub_feature_flags(cached_encoding_detection: true)
end
it_behaves_like 'detects encoding'
context 'cache_key is provided' do
let(:kwargs) do
{ cache_key: %w(foo bar) }
end
it 'uses that cache_key to serve from the cache' do
expect(Rails.cache).to receive(:fetch).with([:detect_binary, CharlockHolmes::VERSION, %w(foo bar)], expires_in: 1.week).and_call_original
expect(subject[:type]).to eq(:binary)
end
end
end
context 'cached_encoding_detection is disabled' do
before do
stub_feature_flags(cached_encoding_detection: false)
end
it_behaves_like 'detects encoding'
end
end
end end
...@@ -53,7 +53,10 @@ RSpec.describe Gitlab::Git::Blame, :seed_helper do ...@@ -53,7 +53,10 @@ RSpec.describe Gitlab::Git::Blame, :seed_helper do
end end
it 'converts to UTF-8' do it 'converts to UTF-8' do
expect(CharlockHolmes::EncodingDetector).to receive(:detect).and_return(nil) expect_next_instance_of(CharlockHolmes::EncodingDetector) do |detector|
expect(detector).to receive(:detect).and_return(nil)
end
data = [] data = []
blame.each do |commit, line| blame.each do |commit, line|
data << { data << {
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment