Commit 399056ed authored by Zeger-Jan van de Weg, committed by Stan Hu

Remove dependencies on Linguist

This saves about 128 MB of baseline RAM usage per Unicorn and
Sidekiq process (!).

Linguist wasn't detecting languages anymore from CE/EE since
9ae8b574. However, Linguist::BlobHelper
was still being depended on by BlobLike and others.

This removes the Linguist gem, given it isn't required anymore.
EscapeUtils was previously pulled in as a transitive dependency of Linguist;
since Banzai depends on it, it is now added to the Gemfile explicitly.

Previously, Linguist was used to detect the best ACE mode. Instead,
we rely on ACE to guess the best mode based on the file extension.
parent a5ecb5bb
......@@ -83,9 +83,6 @@ gem 'net-ldap'
# Only used to compute wiki page slugs
gem 'gitlab-gollum-lib', '~> 4.2', require: false
# Language detection
gem 'github-linguist', '~> 5.3.3', require: 'linguist'
# API
gem 'grape', '~> 1.1'
gem 'grape-entity', '~> 0.7.1'
......@@ -146,6 +143,7 @@ gem 'rouge', '~> 3.1'
gem 'truncato', '~> 0.7.9'
gem 'bootstrap_form', '~> 2.7.0'
gem 'nokogiri', '~> 1.8.2'
gem 'escape_utils', '~> 1.1'
# Calendar rendering
gem 'icalendar'
......
......@@ -277,11 +277,6 @@ GEM
gitaly-proto (0.118.1)
google-protobuf (~> 3.1)
grpc (~> 1.10)
github-linguist (5.3.3)
charlock_holmes (~> 0.7.5)
escape_utils (~> 1.1.0)
mime-types (>= 1.19)
rugged (>= 0.25.1)
github-markup (1.7.0)
gitlab-flowdock-git-hook (1.0.1)
flowdock (~> 0.7)
......@@ -1006,6 +1001,7 @@ DEPENDENCIES
ed25519 (~> 1.2)
email_reply_trimmer (~> 0.1)
email_spec (~> 2.2.0)
escape_utils (~> 1.1)
factory_bot_rails (~> 4.8.2)
faraday (~> 0.12)
fast_blank
......@@ -1028,7 +1024,6 @@ DEPENDENCIES
gettext_i18n_rails (~> 1.8.0)
gettext_i18n_rails_js (~> 1.3)
gitaly-proto (~> 0.118.1)
github-linguist (~> 5.3.3)
github-markup (~> 1.7.0)
gitlab-flowdock-git-hook (~> 1.0.1)
gitlab-gollum-lib (~> 4.2)
......@@ -1187,4 +1182,4 @@ DEPENDENCIES
wikicloth (= 0.8.1)
BUNDLED WITH
1.16.4
1.16.6
......@@ -280,11 +280,6 @@ GEM
gitaly-proto (0.118.1)
google-protobuf (~> 3.1)
grpc (~> 1.10)
github-linguist (5.3.3)
charlock_holmes (~> 0.7.5)
escape_utils (~> 1.1.0)
mime-types (>= 1.19)
rugged (>= 0.25.1)
github-markup (1.7.0)
gitlab-flowdock-git-hook (1.0.1)
flowdock (~> 0.7)
......@@ -1015,6 +1010,7 @@ DEPENDENCIES
ed25519 (~> 1.2)
email_reply_trimmer (~> 0.1)
email_spec (~> 2.2.0)
escape_utils (~> 1.1)
factory_bot_rails (~> 4.8.2)
faraday (~> 0.12)
fast_blank
......@@ -1037,7 +1033,6 @@ DEPENDENCIES
gettext_i18n_rails (~> 1.8.0)
gettext_i18n_rails_js (~> 1.3)
gitaly-proto (~> 0.118.1)
github-linguist (~> 5.3.3)
github-markup (~> 1.7.0)
gitlab-flowdock-git-hook (~> 1.0.1)
gitlab-gollum-lib (~> 4.2)
......@@ -1196,4 +1191,4 @@ DEPENDENCIES
wikicloth (= 0.8.1)
BUNDLED WITH
1.16.4
1.16.6
......@@ -13,11 +13,11 @@ export default () => {
if (editBlobForm.length) {
const urlRoot = editBlobForm.data('relativeUrlRoot');
const assetsPath = editBlobForm.data('assetsPrefix');
const blobLanguage = editBlobForm.data('blobLanguage');
const filePath = editBlobForm.data('blobFilename')
const currentAction = $('.js-file-title').data('currentAction');
const projectId = editBlobForm.data('project-id');
new EditBlob(`${urlRoot}${assetsPath}`, blobLanguage, currentAction, projectId);
new EditBlob(`${urlRoot}${assetsPath}`, filePath, currentAction, projectId);
new NewCommitForm(editBlobForm);
}
......
......@@ -5,6 +5,7 @@ import axios from '~/lib/utils/axios_utils';
import createFlash from '~/flash';
import { __ } from '~/locale';
import TemplateSelectorMediator from '../blob/file_template_mediator';
import getModeByFileExtension from '~/lib/utils/ace_utils';
export default class EditBlob {
constructor(assetsPath, aceMode, currentAction, projectId) {
......@@ -14,9 +15,10 @@ export default class EditBlob {
this.initFileSelectors(currentAction, projectId);
}
configureAceEditor(aceMode, assetsPath) {
configureAceEditor(filePath, assetsPath) {
ace.config.set('modePath', `${assetsPath}/ace`);
ace.config.loadModule('ace/ext/searchbox');
ace.config.loadModule('ace/ext/modelist');
this.editor = ace.edit('editor');
......@@ -25,8 +27,8 @@ export default class EditBlob {
this.editor.focus();
if (aceMode) {
this.editor.getSession().setMode(`ace/mode/${aceMode}`);
if (filePath) {
this.editor.getSession().setMode(getModeByFileExtension(filePath));
}
}
......
/*= require ace/ace */
/*= require ace/ext-modelist */
/*= require ace/ext-searchbox */
/*= require ./ace/ace_config_paths */
/* global ace */
/**
 * Asks ACE's modelist extension which editor mode matches a file path.
 *
 * @param {String} path - path (or file name) forwarded to `getModeForPath`.
 * @returns the `mode` property of the modelist match, suitable for passing
 *   to an ACE session's `setMode`.
 */
export default function getModeByFileExtension(path) {
  const { mode } = ace.require('ace/ext/modelist').getModeForPath(path);

  return mode;
}
......@@ -5,6 +5,7 @@ import Vue from 'vue';
import axios from '~/lib/utils/axios_utils';
import flash from '~/flash';
import { __ } from '~/locale';
import getModeByFileExtension from '~/lib/utils/ace_utils';
(global => {
global.mergeConflicts = global.mergeConflicts || {};
......@@ -72,7 +73,7 @@ import { __ } from '~/locale';
this.fileLoaded = true;
this.editor = ace.edit(content);
this.editor.$blockScrolling = Infinity; // Turn off annoying warning
this.editor.getSession().setMode(`ace/mode/${data.blob_ace_mode}`);
this.editor.getSession().setMode(getModeByFileExtension(data.new_path));
this.editor.on('change', () => {
this.saveDiffResolution();
});
......
......@@ -195,7 +195,7 @@ module BlobHelper
{
'relative-url-root' => Rails.application.config.relative_url_root,
'assets-prefix' => Gitlab::Application.config.assets.prefix,
'blob-language' => @blob && @blob.language.try(:ace_mode),
'blob-filename' => @blob && @blob.path,
'project-id' => project.id
}
end
......
......@@ -162,7 +162,7 @@ class Blob < SimpleDelegator
if stored_externally?
if rich_viewer
rich_viewer.binary?
elsif Linguist::Language.find_by_extension(name).any?
elsif known_extension?
false
elsif _mime_type
_mime_type.binary?
......
......@@ -2,7 +2,7 @@
module BlobLike
extend ActiveSupport::Concern
include Linguist::BlobHelper
include Gitlab::BlobHelper
def id
raise NotImplementedError
......
---
title: Remove Linguist gem, reducing Rails memory usage by 128MB per process
merge_request: 21008
author:
type: changed
# This has been extracted from https://github.com/github/linguist/blob/master/lib/linguist/blob_helper.rb
module Gitlab
  # Blob inspection helpers: extension, size, binary/text detection, MIME
  # type lookup, line splitting and encoding detection.
  #
  # The including class is expected to respond to `name`, `size` and `data`.
  module BlobHelper
    # Threshold that `large?` compares `size` against.
    MEGABYTE = 1024 * 1024

    # Returns the extension of `name` including the leading dot, or an empty
    # String when there is none (a nil `name` is treated as "").
    def extname
      File.extname(name.to_s)
    end

    # Returns true when the blob's extension is listed in
    # `LanguageData.extensions`.
    #
    # The lookup is case-insensitive: the extension set only holds downcased
    # entries, so the extension is downcased before the membership check
    # (otherwise `FOO.RB` would never match `.rb`).
    def known_extension?
      LanguageData.extensions.include?(extname.downcase)
    end

    # Returns true when the blob is text and not larger than MEGABYTE.
    def viewable?
      !large? && text?
    end

    # Returns true when `size` (coerced with `to_i`) exceeds MEGABYTE.
    def large?
      size.to_i > MEGABYTE
    end

    # Returns true when the blob should be treated as binary content.
    def binary?
      # Large blobs aren't even loaded into memory
      if data.nil?
        true

      # Treat blank files as text
      elsif data == ""
        false

      # Charlock doesn't know what to think
      elsif encoding.nil?
        true

      # If Charlock says it's binary
      else
        detect_encoding[:type] == :binary
      end
    end

    # Returns the negation of `binary?`.
    def text?
      !binary?
    end

    # Returns true for .png, .jpg, .jpeg and .gif extensions (any case).
    def image?
      ['.png', '.jpg', '.jpeg', '.gif'].include?(extname.downcase)
    end

    # Internal: Lookup mime type for extension.
    #
    # The result (including nil) is memoized, hence the `defined?` check
    # instead of `||=`.
    #
    # Returns a MIME::Type, or nil when the extension yields no guess.
    # rubocop:disable Gitlab/ModuleWithInstanceVariables
    def _mime_type
      if defined? @_mime_type
        @_mime_type
      else
        guesses = ::MIME::Types.type_for(extname.to_s)

        # Prefer text mime types over binary
        @_mime_type = guesses.detect { |type| type.ascii? } || guesses.first
      end
    end
    # rubocop:enable Gitlab/ModuleWithInstanceVariables

    # Public: Get the actual blob mime type
    #
    # Examples
    #
    #   # => 'text/plain'
    #   # => 'text/html'
    #
    # Returns a mime type String, 'text/plain' when no type could be guessed.
    def mime_type
      _mime_type ? _mime_type.to_s : 'text/plain'
    end

    # Returns true when the guessed MIME type is binary, false when it is
    # not or when no type could be guessed.
    def binary_mime_type?
      _mime_type ? _mime_type.binary? : false
    end

    # Returns the blob's content split into lines (memoized), or an empty
    # Array when the blob is not viewable or has no data.
    def lines
      @lines ||=
        if viewable? && data
          # `data` is usually encoded as ASCII-8BIT even when the content has
          # been detected as a different encoding. However, we are not allowed
          # to change the encoding of `data` because we've made the implicit
          # guarantee that each entry in `lines` is encoded the same way as
          # `data`.
          #
          # Instead, we re-encode each possible newline sequence as the
          # detected encoding, then force them back to the encoding of `data`
          # (usually a binary encoding like ASCII-8BIT). This means that the
          # byte sequence will match how newlines are likely encoded in the
          # file, but we don't have to change the encoding of `data` as far as
          # Ruby is concerned. This allows us to correctly parse out each line
          # without changing the encoding of `data`, and
          # also--importantly--without having to duplicate many (potentially
          # large) strings.
          begin
            data.split(encoded_newlines_re, -1)
          rescue Encoding::ConverterNotFoundError
            # The data is not splittable in the detected encoding. Assume it's
            # one big line.
            [data]
          end
        else
          []
        end
    end

    # Returns the content type String (memoized): the guessed MIME type for
    # binary blobs, otherwise "text/plain" with the detected charset appended
    # when an encoding is known.
    def content_type
      @content_type ||=
        if binary_mime_type? || binary?
          mime_type
        elsif encoding
          "text/plain; charset=#{encoding.downcase}"
        else
          "text/plain"
        end
    end

    # Returns a Regexp (memoized) matching "\r\n", "\r" or "\n" as they would
    # be encoded in the detected encoding, tagged with the encoding of `data`.
    def encoded_newlines_re
      @encoded_newlines_re ||=
        Regexp.union(["\r\n", "\r", "\n"].map { |nl| nl.encode(ruby_encoding, "ASCII-8BIT").force_encoding(data.encoding) })
    end

    # Returns the :ruby_encoding entry of the detection result, or nil when
    # nothing was detected.
    def ruby_encoding
      if hash = detect_encoding
        hash[:ruby_encoding]
      end
    end

    # Returns the :encoding entry of the detection result, or nil when
    # nothing was detected.
    def encoding
      if hash = detect_encoding
        hash[:encoding]
      end
    end

    # Runs CharlockHolmes encoding detection on `data` (memoized).
    #
    # Returns the detection result, or nil when `data` is nil.
    def detect_encoding
      @detect_encoding ||= CharlockHolmes::EncodingDetector.new.detect(data) if data # rubocop:disable Gitlab/ModuleWithInstanceVariables
    end

    # Returns true when `data` is nil or an empty String.
    def empty?
      data.nil? || data == ""
    end
  end
end
......@@ -158,7 +158,6 @@ module Gitlab
json_hash.tap do |json_hash|
if opts[:full_content]
json_hash[:content] = content
json_hash[:blob_ace_mode] = our_blob && our_blob.language.try(:ace_mode)
else
json_hash[:sections] = sections if type.text?
json_hash[:type] = type
......
......@@ -3,13 +3,13 @@
module Gitlab
module Git
class Blob
include Linguist::BlobHelper
include Gitlab::BlobHelper
include Gitlab::EncodingHelper
# This number is the maximum amount of data that we want to display to
# the user. We load as much as we can for encoding detection
# (Linguist) and LFS pointer parsing. All other cases where we need full
# blob data should use load_all_data!.
# the user. We load as much as we can for encoding detection and LFS
# pointer parsing. All other cases where we need full blob data should
# use load_all_data!.
MAX_DATA_DISPLAY_SIZE = 10.megabytes
# These limits are used as a heuristic to ignore files which can't be LFS
......
# Gitaly note: JV: no RPC's here.
module Gitlab
  module Git
    # Holds an excerpt of a file: the ref it was taken from, its lines, the
    # starting line number and the file name. Exposes `data`, `name`, `size`
    # and `mode` alongside the helpers mixed in from Linguist::BlobHelper.
    class BlobSnippet
      include Linguist::BlobHelper

      attr_accessor :ref, :lines, :filename, :startline

      def initialize(ref, lines, startline, filename)
        @ref = ref
        @lines = lines
        @startline = startline
        @filename = filename
      end

      # The snippet lines joined with "\n"; nil when there are no lines.
      def data
        return if lines.nil?

        lines.join("\n")
      end

      # The blob name is the snippet's file name.
      def name
        filename
      end

      # Length of the joined snippet text.
      def size
        data.length
      end

      # Snippets carry no file mode.
      def mode
        nil
      end
    end
  end
end
# frozen_string_literal: true
module Gitlab
  # Lazily loaded set of file extensions read from vendor/languages.yml.
  module LanguageData
    # Serializes loading and clearing of the memoized extension set.
    EXTENSION_MUTEX = Mutex.new

    class << self
      include Gitlab::Utils::StrongMemoize

      # Returns a Set of downcased extensions (each starting with a dot).
      # The YAML file is parsed on first use and the result is memoized.
      def extensions
        EXTENSION_MUTEX.synchronize do
          strong_memoize(:extensions) { load_extensions }
        end
      end

      # Drops the memoized set so the next `extensions` call reloads it.
      def clear_extensions!
        EXTENSION_MUTEX.synchronize do
          clear_memoization(:extensions)
        end
      end

      private

      # Parses vendor/languages.yml and collects every listed extension that
      # begins with a dot, downcased.
      def load_extensions
        languages = YAML.load_file(Rails.root.join('vendor', 'languages.yml'))

        languages.each_with_object(Set.new) do |(_name, details), collected|
          details['extensions']&.each do |ext|
            collected << ext.downcase if ext.start_with?('.')
          end
        end
      end
    end
  end
end
......@@ -150,7 +150,6 @@ describe Projects::MergeRequests::ConflictsController do
'new_path' => path,
'blob_icon' => 'file-text-o',
'blob_path' => a_string_ending_with(path),
'blob_ace_mode' => 'ruby',
'content' => content)
end
end
......
# frozen_string_literal: true
require 'spec_helper'
# Exercises the Gitlab::BlobHelper mixin through blobs built with `fake_blob`:
# a small text blob ('file.txt') and a 2 MB binary blob ('test.pdf').
describe Gitlab::BlobHelper do
  include FakeBlobHelpers

  let(:project) { create(:project) }
  let(:blob) { fake_blob(path: 'file.txt') }
  let(:large_blob) { fake_blob(path: 'test.pdf', size: 2.megabytes, binary: true) }

  describe '#extname' do
    it 'returns the extension' do
      expect(blob.extname).to eq('.txt')
    end
  end

  describe '#known_extension?' do
    it 'returns true' do
      expect(blob.known_extension?).to be_truthy
    end
  end

  describe '#viewable?' do
    it 'returns true' do
      expect(blob.viewable?).to be_truthy
    end

    it 'returns false' do
      expect(large_blob.viewable?).to be_falsey
    end
  end

  describe '#large?' do
    it 'returns false' do
      expect(blob.large?).to be_falsey
    end

    it 'returns true' do
      expect(large_blob.large?).to be_truthy
    end
  end

  describe '#binary?' do
    it 'returns true' do
      expect(large_blob.binary?).to be_truthy
    end

    it 'returns false' do
      expect(blob.binary?).to be_falsey
    end
  end

  describe '#text?' do
    it 'returns true' do
      expect(blob.text?).to be_truthy
    end

    it 'returns false' do
      expect(large_blob.text?).to be_falsey
    end
  end

  describe '#image?' do
    it 'returns false' do
      expect(blob.image?).to be_falsey
    end
  end

  describe '#mime_type' do
    it 'returns text/plain' do
      expect(blob.mime_type).to eq('text/plain')
    end

    it 'returns application/pdf' do
      expect(large_blob.mime_type).to eq('application/pdf')
    end
  end

  describe '#binary_mime_type?' do
    it 'returns false' do
      expect(blob.binary_mime_type?).to be_falsey
    end
  end

  describe '#lines' do
    it 'returns the payload in an Array' do
      expect(blob.lines).to eq(['foo'])
    end
  end

  describe '#content_type' do
    it 'returns text/plain' do
      expect(blob.content_type).to eq('text/plain; charset=utf-8')
    end

    it 'returns application/pdf' do
      expect(large_blob.content_type).to eq('application/pdf')
    end
  end

  describe '#encoded_newlines_re' do
    it 'returns a regular expression' do
      expect(blob.encoded_newlines_re).to eq(/\r\n|\r|\n/)
    end
  end

  describe '#ruby_encoding' do
    it 'returns UTF-8' do
      expect(blob.ruby_encoding).to eq('UTF-8')
    end
  end

  describe '#encoding' do
    # Must call #encoding itself; asserting #ruby_encoding here would leave
    # #encoding untested.
    it 'returns UTF-8' do
      expect(blob.encoding).to eq('UTF-8')
    end
  end

  describe '#empty?' do
    it 'returns false' do
      expect(blob.empty?).to be_falsey
    end
  end
end
......@@ -267,11 +267,6 @@ FILE
it 'includes the full content of the conflict' do
expect(conflict_file.as_json(full_content: true)).to have_key(:content)
end
it 'includes the detected language of the conflict file' do
expect(conflict_file.as_json(full_content: true)[:blob_ace_mode])
.to eq('ruby')
end
end
end
end
# encoding: UTF-8
require "spec_helper"
# Covers BlobSnippet#data for both a snippet without lines and one with lines.
describe Gitlab::Git::BlobSnippet, :seed_helper do
  describe '#data' do
    context 'empty lines' do
      let(:snippet) { described_class.new('master', nil, nil, nil) }

      it 'is nil' do
        expect(snippet.data).to be_nil
      end
    end

    context 'present lines' do
      let(:snippet) { described_class.new('master', %w[wow much], 1, 'wow.rb') }

      it 'joins the lines with newlines' do
        expect(snippet.data).to eq("wow\nmuch")
      end
    end
  end
end
# frozen_string_literal: true
require 'spec_helper'
# Verifies that the extension set is parsed from YAML a single time and then
# served from the memoized value.
describe Gitlab::LanguageData do
  describe '#extensions' do
    # Start from a cold cache so the YAML load is observable in the example.
    before { described_class.clear_extensions! }

    it 'loads the extensions once' do
      expect(YAML).to receive(:load_file).once.and_call_original

      2.times do
        expect(described_class.extensions).to be_a(Set)
        expect(described_class.extensions.count).to be_positive

        # Sanity check for known extensions
        expect(described_class.extensions).to include('.rb', '.yml', '.json')
      end
    end
  end
end
This diff is collapsed.
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment