Commit 372b9fe7 authored by Kerri Miller's avatar Kerri Miller

Merge branch '207124-encoding-undefinedconversionerror-in-highlight_cache-rb' into 'master'

Force convert invalid ASCII 8bit diffs and use a replacement ASCII character

See merge request gitlab-org/gitlab!79996
parents 9e948118 0ecf9ab8
---
name: convert_diff_to_utf8_with_replacement_symbol
introduced_by_url: https://gitlab.com/gitlab-org/gitlab/-/merge_requests/79996
rollout_issue_url: https://gitlab.com/gitlab-org/gitlab/-/issues/354526
milestone: '14.9'
type: development
group: group::code review
default_enabled: false
......@@ -15,6 +15,8 @@ module Gitlab
# https://gitlab.com/gitlab-org/gitlab_git/merge_requests/77#note_4754193
ENCODING_CONFIDENCE_THRESHOLD = 50
UNICODE_REPLACEMENT_CHARACTER = "�"
def encode!(message)
message = force_encode_utf8(message)
return message if message.valid_encoding?
......@@ -65,6 +67,10 @@ module Gitlab
message.encode(Encoding::UTF_8, invalid: :replace, undef: :replace)
end
def encode_utf8_with_replacement_character(data)
encode_utf8(data, replace: UNICODE_REPLACEMENT_CHARACTER)
end
def encode_utf8(message, replace: "")
message = force_encode_utf8(message)
return message if message.valid_encoding?
......
......@@ -140,7 +140,7 @@ module Gitlab
text.start_with?(BINARY_NOTICE_PATTERN)
end
end
def initialize(raw_diff, expanded: true)
def initialize(raw_diff, expanded: true, replace_invalid_utf8_chars: true)
@expanded = expanded
case raw_diff
......@@ -157,6 +157,8 @@ module Gitlab
else
raise "Invalid raw diff type: #{raw_diff.class}"
end
encode_diff_to_utf8(replace_invalid_utf8_chars)
end
def to_hash
......@@ -227,6 +229,13 @@ module Gitlab
private
def encode_diff_to_utf8(replace_invalid_utf8_chars)
return unless Feature.enabled?(:convert_diff_to_utf8_with_replacement_symbol, default_enabled: :yaml)
return unless replace_invalid_utf8_chars && !detect_binary?(@diff)
@diff = Gitlab::EncodingHelper.encode_utf8_with_replacement_character(@diff)
end
def init_from_hash(hash)
raw_diff = hash.symbolize_keys
......
......@@ -161,6 +161,52 @@ EOT
expect(diff).not_to have_binary_notice
end
end
context 'when diff contains invalid characters' do
let(:bad_string) { [0xae].pack("C*") }
let(:bad_string_two) { [0x89].pack("C*") }
let(:diff) { described_class.new(@raw_diff_hash.merge({ diff: bad_string })) }
let(:diff_two) { described_class.new(@raw_diff_hash.merge({ diff: bad_string_two })) }
context 'when replace_invalid_utf8_chars is true' do
it 'will convert invalid characters and not cause an encoding error' do
expect(diff.diff).to include(Gitlab::EncodingHelper::UNICODE_REPLACEMENT_CHARACTER)
expect(diff_two.diff).to include(Gitlab::EncodingHelper::UNICODE_REPLACEMENT_CHARACTER)
expect { Oj.dump(diff) }.not_to raise_error(EncodingError)
expect { Oj.dump(diff_two) }.not_to raise_error(EncodingError)
end
context 'when the diff is binary' do
let(:project) { create(:project, :repository) }
it 'will not try to replace characters' do
expect(Gitlab::EncodingHelper).not_to receive(:encode_utf8_with_replacement_character?)
expect(binary_diff(project).diff).not_to be_empty
end
end
context 'when convert_diff_to_utf8_with_replacement_symbol feature flag is disabled' do
before do
stub_feature_flags(convert_diff_to_utf8_with_replacement_symbol: false)
end
it 'will not try to convert invalid characters' do
expect(Gitlab::EncodingHelper).not_to receive(:encode_utf8_with_replacement_character?)
end
end
end
context 'when replace_invalid_utf8_chars is false' do
let(:not_replaced_diff) { described_class.new(@raw_diff_hash.merge({ diff: bad_string, replace_invalid_utf8_chars: false }) ) }
let(:not_replaced_diff_two) { described_class.new(@raw_diff_hash.merge({ diff: bad_string_two, replace_invalid_utf8_chars: false }) ) }
it 'will not try to convert invalid characters' do
expect(Gitlab::EncodingHelper).not_to receive(:encode_utf8_with_replacement_character?)
end
end
end
end
describe 'straight diffs' do
......@@ -255,12 +301,11 @@ EOT
let(:project) { create(:project, :repository) }
it 'fake binary message when it detects binary' do
# Rugged will not detect this as binary, but we can fake it
diff_message = "Binary files files/images/icn-time-tracking.pdf and files/images/icn-time-tracking.pdf differ\n"
binary_diff = described_class.between(project.repository, 'add-pdf-text-binary', 'add-pdf-text-binary^').first
expect(binary_diff.diff).not_to be_empty
expect(binary_diff.json_safe_diff).to eq(diff_message)
diff = binary_diff(project)
expect(diff.diff).not_to be_empty
expect(diff.json_safe_diff).to eq(diff_message)
end
it 'leave non-binary diffs as-is' do
......@@ -374,4 +419,9 @@ EOT
expect(diff.line_count).to eq(0)
end
end
def binary_diff(project)
# rugged will not detect this as binary, but we can fake it
described_class.between(project.repository, 'add-pdf-text-binary', 'add-pdf-text-binary^').first
end
end
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment