Commit dbf1b5af authored by Thong Kuah's avatar Thong Kuah

Test that project homepage is accessible by crawlers

We simulate this by parsing robots.txt and blocking access to the
paths listed on the Disallow lines of robots.txt
parent 2e5d1039
require 'gitlab/testing/request_blocker_middleware'
require 'gitlab/testing/robots_blocker_middleware'
require 'gitlab/testing/request_inspector_middleware'
require 'gitlab/testing/clear_process_memory_cache_middleware'
require 'gitlab/utils'
......@@ -6,6 +7,7 @@ require 'gitlab/utils'
Rails.application.configure do
# Make sure the middleware is inserted first in middleware chain
config.middleware.insert_before(ActionDispatch::Static, Gitlab::Testing::RequestBlockerMiddleware)
config.middleware.insert_before(ActionDispatch::Static, Gitlab::Testing::RobotsBlockerMiddleware)
config.middleware.insert_before(ActionDispatch::Static, Gitlab::Testing::RequestInspectorMiddleware)
config.middleware.insert_before(ActionDispatch::Static, Gitlab::Testing::ClearProcessMemoryCacheMiddleware)
......
# frozen_string_literal: true

module Gitlab
  # Thin wrapper around the application's public/robots.txt: answers
  # whether a given request path is disallowed by a `Disallow:` rule.
  module RobotsTxt
    class << self
      # @param path [String] a request path, e.g. "/search"
      # @return [Boolean] true when any Disallow rule matches the path
      def disallowed?(path)
        parsed_robots_txt.disallowed?(path)
      end

      # Memoized parser — robots.txt is read from disk at most once
      # per process.
      def parsed_robots_txt
        @parsed_robots_txt ||= Parser.new(robots_txt)
      end

      # Raw contents of public/robots.txt.
      def robots_txt
        File.read(Rails.root.join('public', 'robots.txt'))
      end
    end
  end
end
# frozen_string_literal: true

module Gitlab
  module RobotsTxt
    # Minimal robots.txt parser: extracts `Disallow:` rules and answers
    # whether a given request path is blocked by any of them.
    class Parser
      # Array of Regexp objects (anchored at the start of the path),
      # one per non-empty Disallow rule.
      attr_reader :disallow_rules

      # @param content [String] raw robots.txt text
      def initialize(content)
        @raw_content = content
        @disallow_rules = parse_raw_content!
      end

      # @param path [String] a request path, e.g. "/search"
      # @return [Boolean] true if any Disallow rule matches the path
      def disallowed?(path)
        # match? avoids allocating a MatchData for every rule checked.
        disallow_rules.any? { |rule| rule.match?(path) }
      end

      private

      # This parser is very basic as it only knows about `Disallow:` lines,
      # and simply ignores all other lines.
      #
      # Order of precedence, `Allow:`, etc are ignored for now.
      #
      # Each rule value is Regexp-escaped, `*` wildcards are translated to
      # `.*`, and the result is anchored to the start of the path.
      def parse_raw_content!
        @raw_content.each_line.map do |line|
          next unless line.start_with?('Disallow:')

          value = line.sub('Disallow:', '').strip

          # Per robots.txt semantics, an empty `Disallow:` means "allow
          # everything". Without this guard it would compile to /^/ and
          # block every path.
          next if value.empty?

          value = Regexp.escape(value).gsub('\*', '.*')
          Regexp.new("^#{value}")
        end.compact
      end
    end
  end
end
# frozen_string_literal: true
# rubocop:disable Style/ClassVars

module Gitlab
  module Testing
    # Rack middleware used in tests to simulate crawler behaviour:
    # while blocking is enabled, any request whose path is disallowed
    # by robots.txt is answered with an HTTP 503 instead of reaching
    # the application.
    class RobotsBlockerMiddleware
      @@block_requests = Concurrent::AtomicBoolean.new(false)

      # Block requests according to robots.txt.
      # Any new requests disallowed by robots.txt will return an HTTP 503 status.
      def self.block_requests!
        @@block_requests.value = true
      end

      # Allows the server to accept requests again.
      def self.allow_requests!
        @@block_requests.value = false
      end

      def initialize(app)
        @app = app
      end

      def call(env)
        incoming = Rack::Request.new(env)

        # Pass through unless blocking is on AND robots.txt disallows
        # this path.
        return @app.call(env) unless block_requests? && Gitlab::RobotsTxt.disallowed?(incoming.path_info)

        block_request(env)
      end

      private

      def block_requests?
        @@block_requests.true?
      end

      # Minimal 503 Rack response triple: no headers, empty body.
      def block_request(env)
        [503, {}, []]
      end
    end
  end
end
......@@ -14,4 +14,25 @@ RSpec.describe 'Projects > Show > User sees README' do
expect(page).to have_content 'testme'
end
end
# Exercises RobotsBlockerMiddleware: while enabled, requests to paths
# disallowed by robots.txt are rejected with HTTP 503, so this verifies
# that loading the README only hits crawler-accessible paths.
context 'obeying robots.txt' do
before do
Gitlab::Testing::RobotsBlockerMiddleware.block_requests!
end
# Re-enable all requests so later examples are unaffected.
after do
Gitlab::Testing::RobotsBlockerMiddleware.allow_requests!
end
# For example, see this regression we had in
# https://gitlab.com/gitlab-org/gitlab/-/merge_requests/39520
it 'does not block the requests necessary to load the project README', :js do
visit project_path(project)
wait_for_requests
page.within('.readme-holder') do
expect(page).to have_content 'testme'
end
end
end
end
# frozen_string_literal: true

require 'fast_spec_helper'
require 'rspec-parameterized'

RSpec.describe Gitlab::RobotsTxt::Parser do
  describe '#disallowed?' do
    subject { described_class.new(content).disallowed?(path) }

    context 'a simple robots.txt file' do
      using RSpec::Parameterized::TableSyntax

      let(:content) do
        <<~TXT
          User-Agent: *
          Disallow: /autocomplete/users
          Disallow: /search
          Disallow: /api
        TXT
      end

      # Rules are anchored at the start of the path, so sub-paths of a
      # disallowed prefix are blocked too.
      where(:path, :result) do
        '/autocomplete/users'        | true
        '/autocomplete/users/a.html' | true
        '/search'                    | true
        '/search.html'               | true
        '/api'                       | true
        '/api/graphql'               | true
        '/api/index.html'            | true
        '/projects'                  | false
      end

      with_them do
        it { is_expected.to eq(result), "#{path} expected to be #{result}" }
      end
    end

    context 'robots.txt file with wildcard' do
      using RSpec::Parameterized::TableSyntax

      let(:content) do
        <<~TXT
          User-Agent: *
          Disallow: /search
          User-Agent: *
          Disallow: /*/*.git
          Disallow: /*/archive/
          Disallow: /*/repository/archive*
        TXT
      end

      # `*` in a rule is translated to `.*`, so a wildcard may match any
      # number of path segments.
      where(:path, :result) do
        '/search'                             | true
        '/namespace/project.git'              | true
        '/project/archive/'                   | true
        '/project/archive/file.gz'            | true
        '/project/repository/archive'         | true
        '/project/repository/archive.gz'      | true
        '/project/repository/archive/file.gz' | true
        '/projects'                           | false
        '/git'                                | false
        '/projects/git'                       | false
      end

      with_them do
        it { is_expected.to eq(result), "#{path} expected to be #{result}" }
      end
    end
  end
end
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment