Commit 66c21aca authored by Brett Walker's avatar Brett Walker

Use xpath search of Nokogiri instead of css search

For large node trees, xpath is significantly faster and uses
less memory

Changelog: security
parent e38d70c7
......@@ -7,8 +7,11 @@ module Banzai
PRIVATE_IMAGE_PATH = '/secure/attachment/'
CSS_WITH_ATTACHMENT_ICON = 'with-attachment-icon'
CSS = 'img'
XPATH = Gitlab::Utils::Nokogiri.css_to_xpath(CSS).freeze
def call
doc.xpath('descendant-or-self::img').each do |img|
doc.xpath(XPATH).each do |img|
next unless img['src'].start_with?(PRIVATE_IMAGE_PATH)
img_link = "#{project.jira_service.url}#{img['src']}"
......
......@@ -9,8 +9,11 @@ module Gitlab
# Part of FileUploader::MARKDOWN_PATTERN but with a non-greedy file name matcher (?<file>.*) vs (?<file>.*?)
NON_GREEDY_UPLOAD_FILE_PATH_PATTERN = %r{/uploads/(?<secret>[0-9a-f]{32})/(?<file>.*)}.freeze
CSS = 'img'
XPATH = Gitlab::Utils::Nokogiri.css_to_xpath(CSS).freeze
def call
doc.css('img').each do |image_node|
doc.xpath(XPATH).each do |image_node|
image_node['class'] = 'gl-image'
original_src = image_node.delete('data-src').value
......
......@@ -9,14 +9,15 @@ module Gitlab
# +Banzai::Filter::ReferenceRedactorFilter+, so it's easier to find and
# anonymize `user` references.
class MentionAnonymizationFilter < HTML::Pipeline::Filter
LINK_CSS_SELECTOR = "a.gfm[data-reference-type='user']"
CSS = 'a.gfm[data-reference-type="user"]'
XPATH = Gitlab::Utils::Nokogiri.css_to_xpath(CSS).freeze
# Static for now. In https://gitlab.com/gitlab-org/gitlab/-/issues/209114
# we'll map names with a more sophisticated approach.
ANONYMIZED_NAME = 'Incident Responder'
def call
doc.css(LINK_CSS_SELECTOR).each do |link_node|
doc.xpath(XPATH).each do |link_node|
link_node.replace(ANONYMIZED_NAME)
end
......
......@@ -6,10 +6,13 @@ module Banzai
module Filter
# HTML filter that converts relative urls into absolute ones.
class AbsoluteLinkFilter < HTML::Pipeline::Filter
CSS = 'a.gfm'
XPATH = Gitlab::Utils::Nokogiri.css_to_xpath(CSS).freeze
def call
return doc unless context[:only_path] == false
doc.search('a.gfm').each do |el|
doc.xpath(XPATH).each do |el|
process_link_attr el.attribute('href')
end
......
......@@ -3,14 +3,20 @@
module Banzai
module Filter
class AsciiDocPostProcessingFilter < HTML::Pipeline::Filter
CSS_MATH = '[data-math-style]'
XPATH_MATH = Gitlab::Utils::Nokogiri.css_to_xpath(CSS_MATH).freeze
CSS_MERM = '[data-mermaid-style]'
XPATH_MERM = Gitlab::Utils::Nokogiri.css_to_xpath(CSS_MERM).freeze
def call
doc.search('[data-math-style]').each do |node|
doc.xpath(XPATH_MATH).each do |node|
node.set_attribute('class', 'code math js-render-math')
end
doc.search('[data-mermaid-style]').each do |node|
doc.xpath(XPATH_MERM).each do |node|
node.set_attribute('class', 'js-render-mermaid')
end
doc
end
end
......
......@@ -7,6 +7,9 @@ module Banzai
class BaseRelativeLinkFilter < HTML::Pipeline::Filter
include Gitlab::Utils::StrongMemoize
CSS = 'a:not(.gfm), img:not(.gfm), video:not(.gfm), audio:not(.gfm)'
XPATH = Gitlab::Utils::Nokogiri.css_to_xpath(CSS).freeze
protected
def linkable_attributes
......@@ -35,7 +38,7 @@ module Banzai
def fetch_linkable_attributes
attrs = []
attrs += doc.search('a:not(.gfm), img:not(.gfm), video:not(.gfm), audio:not(.gfm)').flat_map do |el|
attrs += doc.xpath(XPATH).flat_map do |el|
[el.attribute('href'), el.attribute('src'), el.attribute('data-src')]
end
......
......@@ -7,8 +7,11 @@ module Banzai
class ColorFilter < HTML::Pipeline::Filter
COLOR_CHIP_CLASS = 'gfm-color_chip'
CSS = 'code'
XPATH = Gitlab::Utils::Nokogiri.css_to_xpath(CSS).freeze
def call
doc.css('code').each do |node|
doc.xpath(XPATH).each do |node|
color = ColorParser.parse(node.content)
node << color_chip(color) if color
end
......
......@@ -11,7 +11,7 @@ module Banzai
return doc unless context[:project]
return doc unless Feature.enabled?(:custom_emoji, context[:project])
doc.search(".//text()").each do |node|
doc.xpath('descendant-or-self::text()').each do |node|
content = node.to_html
next if has_ancestor?(node, IGNORED_ANCESTOR_TAGS)
......
......@@ -11,7 +11,7 @@ module Banzai
IGNORE_UNICODE_EMOJIS = %w(™ © ®).freeze
def call
doc.search(".//text()").each do |node|
doc.xpath('descendant-or-self::text()').each do |node|
content = node.to_html
next if has_ancestor?(node, IGNORED_ANCESTOR_TAGS)
......
......@@ -23,17 +23,23 @@ module Banzai
FOOTNOTE_LINK_REFERENCE_PATTERN = /\A#{FOOTNOTE_LINK_ID_PREFIX}\d+\z/.freeze
FOOTNOTE_START_NUMBER = 1
CSS_SECTION = "ol > li[id=#{FOOTNOTE_ID_PREFIX}#{FOOTNOTE_START_NUMBER}]"
XPATH_SECTION = Gitlab::Utils::Nokogiri.css_to_xpath(CSS_SECTION).freeze
CSS_FOOTNOTE = 'sup > a[id]'
XPATH_FOOTNOTE = Gitlab::Utils::Nokogiri.css_to_xpath(CSS_FOOTNOTE).freeze
def call
return doc unless first_footnote = doc.at_css("ol > li[id=#{fn_id(FOOTNOTE_START_NUMBER)}]")
return doc unless first_footnote = doc.at_xpath(XPATH_SECTION)
# Sanitization stripped off the section wrapper - add it back in
first_footnote.parent.wrap('<section class="footnotes">')
rand_suffix = "-#{random_number}"
modified_footnotes = {}
doc.css('sup > a[id]').each do |link_node|
doc.xpath(XPATH_FOOTNOTE).each do |link_node|
ref_num = link_node[:id].delete_prefix(FOOTNOTE_LINK_ID_PREFIX)
footnote_node = doc.at_css("li[id=#{fn_id(ref_num)}]")
node_xpath = Gitlab::Utils::Nokogiri.css_to_xpath("li[id=#{fn_id(ref_num)}]")
footnote_node = doc.at_xpath(node_xpath)
if INTEGER_PATTERN.match?(ref_num) && (footnote_node || modified_footnotes[ref_num])
link_node[:href] += rand_suffix
......
......@@ -60,7 +60,7 @@ module Banzai
IGNORED_ANCESTOR_TAGS = %w(pre code tt).to_set
def call
doc.search(".//text()").each do |node|
doc.xpath('descendant-or-self::text()').each do |node|
next if has_ancestor?(node, IGNORED_ANCESTOR_TAGS)
next unless node.content =~ TAGS_PATTERN
......
......@@ -6,8 +6,11 @@ module Banzai
# HTML filter that moves the value of image `src` attributes to `data-src`
# so they can be lazy loaded.
class ImageLazyLoadFilter < HTML::Pipeline::Filter
CSS = 'img'
XPATH = Gitlab::Utils::Nokogiri.css_to_xpath(CSS).freeze
def call
doc.xpath('descendant-or-self::img').each do |img|
doc.xpath(XPATH).each do |img|
img.add_class('lazy')
img['data-src'] = img['src']
img['src'] = LazyImageTagHelper.placeholder_image
......
......@@ -7,7 +7,7 @@ module Banzai
IGNORED_ANCESTOR_TAGS = %w(pre code tt).to_set
def call
doc.search(".//text()").each do |node|
doc.xpath('descendant-or-self::text()').each do |node|
next if has_ancestor?(node, IGNORED_ANCESTOR_TAGS)
content = node.to_html
......
......@@ -8,6 +8,7 @@ module Banzai
include Gitlab::Utils::StrongMemoize
METRICS_CSS_CLASS = '.js-render-metrics'
XPATH = Gitlab::Utils::Nokogiri.css_to_xpath(METRICS_CSS_CLASS).freeze
EMBED_LIMIT = 100
Route = Struct.new(:regex, :permission)
......@@ -41,7 +42,7 @@ module Banzai
# @return [Nokogiri::XML::NodeSet]
def nodes
strong_memoize(:nodes) do
nodes = doc.css(METRICS_CSS_CLASS)
nodes = doc.xpath(XPATH)
nodes.drop(EMBED_LIMIT).each(&:remove)
nodes
......
......@@ -15,10 +15,11 @@ module Banzai
.map { |diagram_type| %(pre[lang="#{diagram_type}"] > code) }
.join(', ')
return doc unless doc.at(diagram_selectors)
xpath = Gitlab::Utils::Nokogiri.css_to_xpath(diagram_selectors)
return doc unless doc.at_xpath(xpath)
diagram_format = "svg"
doc.css(diagram_selectors).each do |node|
doc.xpath(xpath).each do |node|
diagram_type = node.parent['lang']
img_tag = Nokogiri::HTML::DocumentFragment.parse(%(<img src="#{create_image_src(diagram_type, diagram_format, node.content)}"/>))
node.parent.replace(img_tag)
......
......@@ -8,6 +8,11 @@ module Banzai
NOT_LITERAL_REGEX = %r{#{LITERAL_KEYWORD}-((%5C|\\).+?)-#{LITERAL_KEYWORD}}.freeze
SPAN_REGEX = %r{<span>(.*?)</span>}.freeze
CSS_A = 'a'
XPATH_A = Gitlab::Utils::Nokogiri.css_to_xpath(CSS_A).freeze
CSS_CODE = 'code'
XPATH_CODE = Gitlab::Utils::Nokogiri.css_to_xpath(CSS_CODE).freeze
def call
return doc unless result[:escaped_literals]
......@@ -24,12 +29,12 @@ module Banzai
# Banzai::Renderer::CommonMark::HTML. However, we eventually want to use
# the built-in compiled renderer, rather than the ruby version, for speed.
# So let's do this work here.
doc.css('a').each do |node|
doc.xpath(XPATH_A).each do |node|
node.attributes['href'].value = node.attributes['href'].value.gsub(SPAN_REGEX, '\1') if node.attributes['href']
node.attributes['title'].value = node.attributes['title'].value.gsub(SPAN_REGEX, '\1') if node.attributes['title']
end
doc.css('code').each do |node|
doc.xpath(XPATH_CODE).each do |node|
node.attributes['lang'].value = node.attributes['lang'].value.gsub(SPAN_REGEX, '\1') if node.attributes['lang']
end
......
......@@ -10,6 +10,11 @@ module Banzai
# HTML filter that adds class="code math" and removes the dollar sign in $`2+2`$.
#
class MathFilter < HTML::Pipeline::Filter
CSS_MATH = 'pre.code.language-math'
XPATH_MATH = Gitlab::Utils::Nokogiri.css_to_xpath(CSS_MATH).freeze
CSS_CODE = 'code'
XPATH_CODE = Gitlab::Utils::Nokogiri.css_to_xpath(CSS_CODE).freeze
# Attribute indicating inline or display math.
STYLE_ATTRIBUTE = 'data-math-style'
......@@ -21,7 +26,7 @@ module Banzai
DOLLAR_SIGN = '$'
def call
doc.css('code').each do |code|
doc.xpath(XPATH_CODE).each do |code|
closing = code.next
opening = code.previous
......@@ -39,7 +44,7 @@ module Banzai
end
end
doc.css('pre.code.language-math').each do |el|
doc.xpath(XPATH_MATH).each do |el|
el[STYLE_ATTRIBUTE] = 'display'
el[:class] += " #{TAG_CLASS}"
end
......
......@@ -4,8 +4,11 @@
module Banzai
module Filter
class MermaidFilter < HTML::Pipeline::Filter
CSS = 'pre[lang="mermaid"] > code'
XPATH = Gitlab::Utils::Nokogiri.css_to_xpath(CSS).freeze
def call
doc.css('pre[lang="mermaid"] > code').add_class('js-render-mermaid')
doc.xpath(XPATH).add_class('js-render-mermaid')
doc
end
......
......@@ -8,12 +8,15 @@ module Banzai
# HTML that replaces all `code plantuml` tags with PlantUML img tags.
#
class PlantumlFilter < HTML::Pipeline::Filter
CSS = 'pre > code[lang="plantuml"]'
XPATH = Gitlab::Utils::Nokogiri.css_to_xpath(CSS).freeze
def call
return doc unless settings.plantuml_enabled? && doc.at('pre > code[lang="plantuml"]')
return doc unless settings.plantuml_enabled? && doc.at_xpath(XPATH)
plantuml_setup
doc.css('pre > code[lang="plantuml"]').each do |node|
doc.xpath(XPATH).each do |node|
img_tag = Nokogiri::HTML::DocumentFragment.parse(
Asciidoctor::PlantUml::Processor.plantuml_content(node.content, {}))
node.parent.replace(img_tag)
......
......@@ -7,10 +7,13 @@ module Banzai
# Class used for tagging elements that should be rendered
TAG_CLASS = 'js-render-suggestion'
CSS = 'pre.language-suggestion > code'
XPATH = Gitlab::Utils::Nokogiri.css_to_xpath(CSS).freeze
def call
return doc unless suggestions_filter_enabled?
doc.search('pre.language-suggestion > code').each do |node|
doc.xpath(XPATH).each do |node|
node.add_class(TAG_CLASS)
end
......
......@@ -14,8 +14,11 @@ module Banzai
PARAMS_DELIMITER = ':'
LANG_PARAMS_ATTR = 'data-lang-params'
CSS = 'pre:not([data-math-style]):not([data-mermaid-style]):not([data-kroki-style]) > code'
XPATH = Gitlab::Utils::Nokogiri.css_to_xpath(CSS).freeze
def call
doc.search('pre:not([data-math-style]):not([data-mermaid-style]):not([data-kroki-style]) > code').each do |node|
doc.xpath(XPATH).each do |node|
highlight_node(node)
end
......
......@@ -19,6 +19,9 @@ module Banzai
class TableOfContentsFilter < HTML::Pipeline::Filter
include Gitlab::Utils::Markdown
CSS = 'h1, h2, h3, h4, h5, h6'
XPATH = Gitlab::Utils::Nokogiri.css_to_xpath(CSS).freeze
def call
return doc if context[:no_header_anchors]
......@@ -27,7 +30,7 @@ module Banzai
headers = Hash.new(0)
header_root = current_header = HeaderNode.new
doc.css('h1, h2, h3, h4, h5, h6').each do |node|
doc.xpath(XPATH).each do |node|
if header_content = node.children.first
id = string_to_anchor(node.text)
......
......@@ -10,14 +10,21 @@ module Banzai
class WikiLinkFilter < HTML::Pipeline::Filter
include Gitlab::Utils::SanitizeNodeLink
CSS_A = 'a:not(.gfm)'
XPATH_A = Gitlab::Utils::Nokogiri.css_to_xpath(CSS_A).freeze
CSS_VA = 'video, audio'
XPATH_VA = Gitlab::Utils::Nokogiri.css_to_xpath(CSS_VA).freeze
CSS_IMG = 'img'
XPATH_IMG = Gitlab::Utils::Nokogiri.css_to_xpath(CSS_IMG).freeze
def call
return doc unless wiki?
doc.search('a:not(.gfm)').each { |el| process_link(el.attribute('href'), el) }
doc.xpath(XPATH_A).each { |el| process_link(el.attribute('href'), el) }
doc.search('video, audio').each { |el| process_link(el.attribute('src'), el) }
doc.xpath(XPATH_VA).each { |el| process_link(el.attribute('src'), el) }
doc.search('img').each do |el|
doc.xpath(XPATH_IMG).each do |el|
attr = el.attribute('data-src') || el.attribute('src')
process_link(attr, el)
......
......@@ -6,6 +6,9 @@ module Gitlab
# Matches for instance "-1", "+1" or "-1+2".
SUGGESTION_CONTEXT = /^(\-(?<above>\d+))?(\+(?<below>\d+))?$/.freeze
CSS = 'pre.language-suggestion'
XPATH = Gitlab::Utils::Nokogiri.css_to_xpath(CSS).freeze
class << self
# Returns an array of Gitlab::Diff::Suggestion which represents each
# suggestion in the given text.
......@@ -17,7 +20,7 @@ module Gitlab
no_original_data: true,
suggestions_filter_enabled: supports_suggestion)
doc = Nokogiri::HTML(html)
suggestion_nodes = doc.search('pre.language-suggestion')
suggestion_nodes = doc.xpath(XPATH)
return [] if suggestion_nodes.empty?
......
# frozen_string_literal: true
module Gitlab
module Utils
class Nokogiri
class << self
# Use Nokogiri to convert a css selector into an xpath selector.
# Nokogiri can use css selectors with `doc.search()`. However
# for large node trees, it is _much_ slower than using xpath,
# by several orders of magnitude.
# https://gitlab.com/gitlab-org/gitlab/-/issues/329186
def css_to_xpath(css)
xpath = ::Nokogiri::CSS.xpath_for(css)
# Due to https://github.com/sparklemotion/nokogiri/issues/572,
# we remove the leading `//` and add `descendant-or-self::`
# in order to ensure we're searching from this node and all
# descendants.
xpath.map { |t| "descendant-or-self::#{t[2..-1]}" }.join('|')
end
end
end
end
end
......@@ -36,9 +36,11 @@ RSpec.describe Banzai::Pipeline::PostProcessPipeline do
end
let(:doc) { HTML::Pipeline.parse(html) }
let(:non_related_xpath_calls) { 2 }
it 'searches for attributes only once' do
expect(doc).to receive(:search).once.and_call_original
expect(doc).to receive(:xpath).exactly(non_related_xpath_calls + 1).times
.and_call_original
subject
end
......
# frozen_string_literal: true
require 'spec_helper'
RSpec.describe Gitlab::Utils::Nokogiri do
describe '#css_to_xpath' do
using RSpec::Parameterized::TableSyntax
where(:css, :xpath) do
'img' | "descendant-or-self::img"
'a.gfm' | "descendant-or-self::a[contains(concat(' ',normalize-space(@class),' '),' gfm ')]"
'a:not(.gfm)' | "descendant-or-self::a[not(contains(concat(' ',normalize-space(@class),' '),' gfm '))]"
'video, audio' | "descendant-or-self::video|descendant-or-self::audio"
'[data-math-style]' | "descendant-or-self::*[@data-math-style]"
'[data-mermaid-style]' | "descendant-or-self::*[@data-mermaid-style]"
'.js-render-metrics' | "descendant-or-self::*[contains(concat(' ',normalize-space(@class),' '),' js-render-metrics ')]"
'h1, h2, h3, h4, h5, h6' | "descendant-or-self::h1|descendant-or-self::h2|descendant-or-self::h3|descendant-or-self::h4|descendant-or-self::h5|descendant-or-self::h6"
'pre.code.language-math' | "descendant-or-self::pre[contains(concat(' ',normalize-space(@class),' '),' code ') and contains(concat(' ',normalize-space(@class),' '),' language-math ')]"
'pre > code[lang="plantuml"]' | "descendant-or-self::pre/code[@lang=\"plantuml\"]"
'pre[lang="mermaid"] > code' | "descendant-or-self::pre[@lang=\"mermaid\"]/code"
'pre.language-suggestion' | "descendant-or-self::pre[contains(concat(' ',normalize-space(@class),' '),' language-suggestion ')]"
'pre.language-suggestion > code' | "descendant-or-self::pre[contains(concat(' ',normalize-space(@class),' '),' language-suggestion ')]/code"
'a.gfm[data-reference-type="user"]' | "descendant-or-self::a[contains(concat(' ',normalize-space(@class),' '),' gfm ') and @data-reference-type=\"user\"]"
'a:not(.gfm), img:not(.gfm), video:not(.gfm), audio:not(.gfm)' | "descendant-or-self::a[not(contains(concat(' ',normalize-space(@class),' '),' gfm '))]|descendant-or-self::img[not(contains(concat(' ',normalize-space(@class),' '),' gfm '))]|descendant-or-self::video[not(contains(concat(' ',normalize-space(@class),' '),' gfm '))]|descendant-or-self::audio[not(contains(concat(' ',normalize-space(@class),' '),' gfm '))]"
'pre:not([data-math-style]):not([data-mermaid-style]):not([data-kroki-style]) > code' | "descendant-or-self::pre[not(@data-math-style) and not(@data-mermaid-style) and not(@data-kroki-style)]/code"
end
with_them do
it 'generates the xpath' do
expect(described_class.css_to_xpath(css)).to eq xpath
end
end
end
end
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment