Commit 96b78c1d authored by fjsanpedro's avatar fjsanpedro

Expand Robots.txt parser with Allow rules

In this commit we expand the Robots.txt parser.
`Allow` rules are now parsed and take precedence
over `Disallow` ones, and rule detection is case
insensitive.
parent 0dffbcb4
......@@ -3,34 +3,68 @@
module Gitlab
module RobotsTxt
class Parser
  # Directive matchers. The directive name is matched case insensitively and
  # whitespace before the name and around the colon is optional, so
  # `Disallow:/path` and `disallow : /path` are both recognized.
  DISALLOW_REGEX = /\A[ \t]*disallow[ \t]*:/i.freeze
  ALLOW_REGEX = /\A[ \t]*allow[ \t]*:/i.freeze

  # Arrays of Regexp built from the `Disallow:` / `Allow:` lines, in file order.
  attr_reader :disallow_rules, :allow_rules

  # @param content [String] raw robots.txt text
  def initialize(content)
    @raw_content = content
    @disallow_rules, @allow_rules = parse_raw_content!
  end

  # Tells whether `path` is blocked by the parsed rules.
  #
  # Any matching `Allow` rule wins over every `Disallow` rule, regardless of
  # the order or the length of the patterns.
  #
  # @param path [String] request path, e.g. '/users/sign_in'
  # @return [Boolean]
  def disallowed?(path)
    return false if allow_rules.any? { |rule| rule.match?(path) }

    disallow_rules.any? { |rule| rule.match?(path) }
  end

  private

  # This parser is very basic as it only knows about `Disallow:`
  # and `Allow:` lines, and simply ignores all other lines.
  #
  # Patterns ending in `$`, and `*` for 0 or more characters are recognized.
  #
  # It is case insensitive and `Allow` rules take precedence
  # over `Disallow`.
  #
  # Returns a two-element array: [disallow patterns, allow patterns].
  def parse_raw_content!
    disallowed = []
    allowed = []

    @raw_content.each_line do |line|
      if disallow_rule?(line)
        disallowed << get_disallow_pattern(line)
      elsif allow_rule?(line)
        allowed << get_allow_pattern(line)
      end
    end

    # get_pattern yields nil for directives without a path; drop those.
    [disallowed.compact, allowed.compact]
  end

  def disallow_rule?(line)
    DISALLOW_REGEX.match?(line)
  end

  def get_disallow_pattern(line)
    get_pattern(line, DISALLOW_REGEX)
  end

  def allow_rule?(line)
    ALLOW_REGEX.match?(line)
  end

  def get_allow_pattern(line)
    get_pattern(line, ALLOW_REGEX)
  end

  # Converts the path part of a rule line into an anchored Regexp.
  #
  # Returns nil when the directive has no path: an empty value would
  # otherwise become `/^/`, which matches every path.
  def get_pattern(line, rule_regex)
    value = line.sub(rule_regex, '').strip
    return if value.empty?

    # Escape everything, then turn the escaped `*` back into a wildcard.
    value = Regexp.escape(value).gsub('\*', '.*')
    # A trailing `$` in the rule is an end-of-path anchor, not a literal.
    value = value.sub(/\\\$$/, '$')

    Regexp.new("^#{value}")
  end
end
end
......
......@@ -21,6 +21,7 @@ Disallow: /dashboard
Disallow: /users
Disallow: /help
Disallow: /s/
Disallow: /-/profile
# Only specifically allow the Sign In page to avoid very ugly search results
Allow: /users/sign_in
......
......@@ -14,8 +14,13 @@ RSpec.describe Gitlab::RobotsTxt::Parser do
<<~TXT
User-Agent: *
Disallow: /autocomplete/users
Disallow: /search
disallow: /search
Disallow: /api
Allow: /users
Disallow: /help
allow: /help
Disallow: /test$
Disallow: /ex$mple$
TXT
end
......@@ -28,6 +33,12 @@ RSpec.describe Gitlab::RobotsTxt::Parser do
'/api/grapql' | true
'/api/index.html' | true
'/projects' | false
'/users' | false
'/help' | false
'/test' | true
'/testfoo' | false
'/ex$mple' | true
'/ex$mplefoo' | false
end
with_them do
......@@ -47,6 +58,7 @@ RSpec.describe Gitlab::RobotsTxt::Parser do
Disallow: /*/*.git
Disallow: /*/archive/
Disallow: /*/repository/archive*
Allow: /*/repository/archive/foo
TXT
end
......@@ -61,6 +73,7 @@ RSpec.describe Gitlab::RobotsTxt::Parser do
'/projects' | false
'/git' | false
'/projects/git' | false
'/project/repository/archive/foo' | false
end
with_them do
......
# frozen_string_literal: true
require 'spec_helper'
# Issues GET requests while `RobotsBlockerMiddleware.block_requests!` is in
# effect and checks, path by path, whether the response carries the
# :service_unavailable status.
RSpec.describe 'Robots.txt Requests', :aggregate_failures do
  # Blocking is switched on before each example and off again afterwards.
  before do
    Gitlab::Testing::RobotsBlockerMiddleware.block_requests!
  end

  after do
    Gitlab::Testing::RobotsBlockerMiddleware.allow_requests!
  end

  it 'allows the requests' do
    %w[/users/sign_in].each do |path|
      get path

      expect(response).not_to have_gitlab_http_status(:service_unavailable), "#{path} must be allowed"
    end
  end

  it 'blocks the requests' do
    blocked_paths = %w[
      /autocomplete/users
      /search
      /admin
      /profile
      /dashboard
      /users
      /users/foo
      /help
      /s/
      /-/profile
      /foo/bar/new
      /foo/bar/edit
      /foo/bar/raw
      /groups/foo/analytics
      /groups/foo/contribution_analytics
      /groups/foo/group_members
      /foo/bar/project.git
      /foo/bar/archive/foo
      /foo/bar/repository/archive
      /foo/bar/activity
      /foo/bar/blame
      /foo/bar/commits
      /foo/bar/commit
      /foo/bar/compare
      /foo/bar/network
      /foo/bar/graphs
      /foo/bar/merge_requests/1.patch
      /foo/bar/merge_requests/1.diff
      /foo/bar/merge_requests/1/diffs
      /foo/bar/deploy_keys
      /foo/bar/hooks
      /foo/bar/services
      /foo/bar/protected_branches
      /foo/bar/uploads/foo
      /foo/bar/project_members
      /foo/bar/settings
    ]

    blocked_paths.each do |path|
      get path

      expect(response).to have_gitlab_http_status(:service_unavailable), "#{path} must be disallowed"
    end
  end
end
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment