Commit 8959c6b6 authored by Brett Walker, committed by Matthias Käppler

Add Banzai filter to tag markdown literals

so they don’t get confused as references
parent 69260e86
---
title: Escaped markdown should not be interpreted as shortcuts
merge_request: 45922
author:
type: changed
---
name: honor_escaped_markdown
introduced_by_url: https://gitlab.com/gitlab-org/gitlab/-/merge_requests/45922
rollout_issue_url: https://gitlab.com/gitlab-org/gitlab/-/issues/300531
milestone: '13.9'
type: development
group: 'group::project management'
default_enabled: false
# frozen_string_literal: true
module Banzai
  module Filter
    # Second half of the escaped-literal handling; MarkdownPreEscapeFilter is
    # the first half and wraps backslash-escaped characters in a
    # `LITERAL_KEYWORD-...-LITERAL_KEYWORD` sequence.
    #
    # This filter turns those sequences into `<span>` elements (which
    # short-circuits reference processing), and cleans up the places where the
    # sequence or the `<span>` must not remain.
    class MarkdownPostEscapeFilter < HTML::Pipeline::Filter
      LITERAL_KEYWORD   = MarkdownPreEscapeFilter::LITERAL_KEYWORD
      LITERAL_REGEX     = %r{#{LITERAL_KEYWORD}-(.*?)-#{LITERAL_KEYWORD}}.freeze
      NOT_LITERAL_REGEX = %r{#{LITERAL_KEYWORD}-((%5C|\\).+?)-#{LITERAL_KEYWORD}}.freeze
      SPAN_REGEX        = %r{<span>(.*?)</span>}.freeze

      # Returns `doc`. Nothing is rewritten unless `result[:escaped_literals]`
      # is set.
      def call
        return doc unless result[:escaped_literals]

        # A sequence that still carries its backslash (or `%5C`) was never
        # escape processed (for example inside a code block), so only the
        # surrounding marker is dropped and the content is kept as is.
        html.gsub!(NOT_LITERAL_REGEX, '\1')

        # Every remaining sequence is a real literal: wrap it in a `span` so
        # that reference processing skips it.
        html.gsub!(LITERAL_REGEX, '<span>\1</span>')

        # Literals are also converted inside link attributes, where the `span`
        # has to be removed again.
        # Note: the renderer (Banzai::Renderer::CommonMark::HTML) could have
        # done this, but the goal is to eventually use the built-in compiled
        # renderer instead of the ruby one for speed, so it is handled here.
        doc.css('a').each do |link|
          unwrap_span(link, 'href')
          unwrap_span(link, 'title')
        end

        doc.css('code').each { |code| unwrap_span(code, 'lang') }

        doc
      end

      private

      # Removes `<span>` wrappers from the value of the attribute `name` on
      # `node`; does nothing when the node has no such attribute.
      def unwrap_span(node, name)
        attribute = node.attributes[name]
        return unless attribute

        attribute.value = attribute.value.gsub(SPAN_REGEX, '\1')
      end
    end
  end
end
# frozen_string_literal: true
module Banzai
  module Filter
    # Users should be able to switch off our reference shortcuts (such as
    # # or !) by backslash escaping them, for example \#.
    #
    # CommonMark handles backslash escapes, but afterwards nothing records
    # which characters were literals. To keep that knowledge, every backslash
    # escaped ASCII punctuation character is surrounded with a custom
    # sequence before CommonMark runs: CommonMark still processes the escape
    # normally, and the sequence marks the result as a literal.
    #
    # The character is surrounded rather than just prefixed, because
    # CommonMark may turn it into an entity and the number of resulting
    # characters would then be unknown. In the end the whole literal has to be
    # wrapped in a `span` tag, which short-circuits reference processing.
    #
    # A custom HTML tag cannot be used for the marker: the text being wrapped
    # may sit inside an href, and CommonMark would then fail to parse the link
    # properly. Hence the `cmliteral-` / `-cmliteral` markers.
    #
    # https://spec.commonmark.org/0.29/#backslash-escapes
    #
    # This filter only adds the markers; MarkdownPostEscapeFilter converts
    # them into span tags.
    class MarkdownPreEscapeFilter < HTML::Pipeline::TextFilter
      ASCII_PUNCTUATION = %r{([\\][!"#$%&'()*+,-./:;<=>?@\[\\\]^_`{|}~])}.freeze
      LITERAL_KEYWORD   = 'cmliteral'

      # Returns the text with each backslash escaped punctuation character
      # wrapped in the literal markers, or the text as is when the
      # `honor_escaped_markdown` feature flag is disabled.
      def call
        flag_actor = context[:group] || context[:project]&.group
        return @text unless Feature.enabled?(:honor_escaped_markdown, flag_actor)

        @text.gsub(ASCII_PUNCTUATION) do |escaped_char|
          # Most markdown contains no literals at all; the post filter checks
          # this marker and bypasses its work when none were found.
          result[:escaped_literals] = true

          [LITERAL_KEYWORD, escaped_char, LITERAL_KEYWORD].join('-')
        end
      end
    end
  end
end
......@@ -5,7 +5,9 @@ module Banzai
# Pipeline whose filters are declared by `.filters` below.
class PlainMarkdownPipeline < BasePipeline
# Returns the FilterArray of filter classes used by this pipeline.
def self.filters
FilterArray[
# NOTE(review): this listing is a diff view without +/- markers. The entry
# directly below appears to be the pre-change line that the three entries
# after it replace — confirm against the actual file.
Filter::MarkdownFilter
# MarkdownPreEscapeFilter is listed before MarkdownFilter and
# MarkdownPostEscapeFilter after it.
Filter::MarkdownPreEscapeFilter,
Filter::MarkdownFilter,
Filter::MarkdownPostEscapeFilter
]
end
end
......
......@@ -170,6 +170,8 @@ References should be parseable even inside _<%= merge_request.to_reference %>_ e
- Ignores invalid: <%= User.reference_prefix %>fake_user
- Ignored in code: `<%= user.to_reference %>`
- Ignored in links: [Link to <%= user.to_reference %>](#user-link)
- Ignored when backslash escaped: \<%= user.to_reference %>
- Ignored when backslash escaped: \<%= group.to_reference %>
- Link to user by reference: [User](<%= user.to_reference %>)
#### IssueReferenceFilter
......@@ -178,6 +180,7 @@ References should be parseable even inside _<%= merge_request.to_reference %>_ e
- Issue in another project: <%= xissue.to_reference(project) %>
- Ignored in code: `<%= issue.to_reference %>`
- Ignored in links: [Link to <%= issue.to_reference %>](#issue-link)
- Ignored when backslash escaped: \<%= issue.to_reference %>
- Issue by URL: <%= urls.project_issue_url(issue.project, issue) %>
- Link to issue by reference: [Issue](<%= issue.to_reference %>)
- Link to issue by URL: [Issue](<%= urls.project_issue_url(issue.project, issue) %>)
......@@ -188,6 +191,7 @@ References should be parseable even inside _<%= merge_request.to_reference %>_ e
- Merge request in another project: <%= xmerge_request.to_reference(project) %>
- Ignored in code: `<%= merge_request.to_reference %>`
- Ignored in links: [Link to <%= merge_request.to_reference %>](#merge-request-link)
- Ignored when backslash escaped: \<%= merge_request.to_reference %>
- Merge request by URL: <%= urls.project_merge_request_url(merge_request.project, merge_request) %>
- Link to merge request by reference: [Merge request](<%= merge_request.to_reference %>)
- Link to merge request by URL: [Merge request](<%= urls.project_merge_request_url(merge_request.project, merge_request) %>)
......@@ -198,6 +202,7 @@ References should be parseable even inside _<%= merge_request.to_reference %>_ e
- Snippet in another project: <%= xsnippet.to_reference(project) %>
- Ignored in code: `<%= snippet.to_reference %>`
- Ignored in links: [Link to <%= snippet.to_reference %>](#snippet-link)
- Ignored when backslash escaped: \<%= snippet.to_reference %>
- Snippet by URL: <%= urls.project_snippet_url(snippet.project, snippet) %>
- Link to snippet by reference: [Snippet](<%= snippet.to_reference %>)
- Link to snippet by URL: [Snippet](<%= urls.project_snippet_url(snippet.project, snippet) %>)
......@@ -229,6 +234,7 @@ References should be parseable even inside _<%= merge_request.to_reference %>_ e
- Label by name in quotes: <%= label.to_reference(format: :name) %>
- Ignored in code: `<%= simple_label.to_reference %>`
- Ignored in links: [Link to <%= simple_label.to_reference %>](#label-link)
- Ignored when backslash escaped: \<%= simple_label.to_reference %>
- Link to label by reference: [Label](<%= label.to_reference %>)
#### MilestoneReferenceFilter
......@@ -239,6 +245,7 @@ References should be parseable even inside _<%= merge_request.to_reference %>_ e
- Milestone in another project: <%= xmilestone.to_reference(project) %>
- Ignored in code: `<%= simple_milestone.to_reference %>`
- Ignored in links: [Link to <%= simple_milestone.to_reference %>](#milestone-link)
- Ignored when backslash escaped: \<%= simple_milestone.to_reference %>
- Milestone by URL: <%= urls.milestone_url(milestone) %>
- Link to milestone by URL: [Milestone](<%= milestone.to_reference %>)
- Group milestone by name: <%= Milestone.reference_prefix %><%= group_milestone.name %>
......@@ -250,6 +257,7 @@ References should be parseable even inside _<%= merge_request.to_reference %>_ e
- Alert in another project: <%= xalert.to_reference(project) %>
- Ignored in code: `<%= alert.to_reference %>`
- Ignored in links: [Link to <%= alert.to_reference %>](#alert-link)
- Ignored when backslash escaped: \<%= alert.to_reference %>
- Alert by URL: <%= alert.details_url %>
- Link to alert by reference: [Alert](<%= alert.to_reference %>)
- Link to alert by URL: [Alert](<%= alert.details_url %>)
......
......@@ -131,4 +131,16 @@ RSpec.describe Banzai::Pipeline::FullPipeline do
expect(output).to include("test [[<em>TOC</em>]]")
end
end
# An issue reference that the user backslash escaped must stay plain text.
describe 'backslash escapes' do
  let_it_be(:project) { create(:project, :public) }
  let_it_be(:issue)   { create(:issue, project: project) }

  it 'does not convert an escaped reference' do
    # A literal backslash placed directly in front of the issue reference.
    escaped_reference = "\\#{issue.to_reference}"

    rendered = described_class.to_html(escaped_reference, project: project)

    # The escaped `#` ends up wrapped in a span, followed by the bare iid.
    expect(rendered).to include("<span>#</span>#{issue.iid}")
  end
end
end
# frozen_string_literal: true
require 'spec_helper'
RSpec.describe Banzai::Pipeline::PlainMarkdownPipeline do
  using RSpec::Parameterized::TableSyntax

  describe 'backslash escapes' do
    let_it_be(:project) { create(:project, :public) }
    let_it_be(:issue)   { create(:issue, project: project) }

    # Runs `markdown` through the pipeline with an empty context, expects the
    # rendered HTML to contain `expected`, and returns the pipeline result so
    # the caller can make further assertions on it.
    def correct_html_included(markdown, expected)
      result = described_class.call(markdown, {})

      expect(result[:output].to_html).to include(expected)

      result
    end

    context 'when feature flag honor_escaped_markdown is disabled' do
      before do
        stub_feature_flags(honor_escaped_markdown: false)
      end

      it 'does not escape the markdown' do
        result = described_class.call(%q(\!), project: project)
        output = result[:output].to_html

        expect(output).to eq('<p data-sourcepos="1:1-1:2">!</p>')
        expect(result[:escaped_literals]).to be_falsey
      end
    end

    # Test strings taken from https://spec.commonmark.org/0.29/#backslash-escapes
    describe 'CommonMark tests', :aggregate_failures do
      it 'converts all ASCII punctuation to literals' do
        markdown = %q(\!\"\#\$\%\&\'\*\+\,\-\.\/\:\;\<\=\>\?\@\[\]\^\_\`\{\|\}\~) + %q[\(\)\\\\]
        punctuation = %w(! " # $ % &amp; ' * + , - . / : ; &lt; = &gt; ? @ [ \\ ] ^ _ ` { | } ~) + %w[( )]

        result = described_class.call(markdown, project: project)
        output = result[:output].to_html

        punctuation.each { |char| expect(output).to include("<span>#{char}</span>") }
        expect(result[:escaped_literals]).to be_truthy
      end

      it 'does not convert other characters to literals' do
        markdown = %q(\→\A\a\ \3\φ\«)
        expected = '\→\A\a\ \3\φ\«'

        result = correct_html_included(markdown, expected)
        expect(result[:escaped_literals]).to be_falsey
      end

      describe 'escaped characters are treated as regular characters and do not have their usual Markdown meanings' do
        where(:markdown, :expected) do
          %q(\*not emphasized*)                 | %q(<span>*</span>not emphasized*)
          %q(\<br/> not a tag)                  | %q(<span>&lt;</span>br/&gt; not a tag)
          %q!\[not a link](/foo)!               | %q!<span>[</span>not a link](/foo)!
          %q(\`not code`)                       | %q(<span>`</span>not code`)
          %q(1\. not a list)                    | %q(1<span>.</span> not a list)
          %q(\# not a heading)                  | %q(<span>#</span> not a heading)
          %q(\[foo]: /url "not a reference")    | %q(<span>[</span>foo]: /url "not a reference")
          %q(\&ouml; not a character entity)    | %q(<span>&amp;</span>ouml; not a character entity)
        end

        with_them do
          it 'keeps them as literals' do
            correct_html_included(markdown, expected)
          end
        end
      end

      it 'backslash is itself escaped, the following character is not' do
        markdown = %q(\\\\*emphasis*)
        expected = %q(<span>\</span><em>emphasis</em>)

        correct_html_included(markdown, expected)
      end

      it 'backslash at the end of the line is a hard line break' do
        markdown = <<~MARKDOWN
          foo\\
          bar
        MARKDOWN
        expected = "foo<br>\nbar"

        correct_html_included(markdown, expected)
      end

      describe 'backslash escapes do not work in code blocks, code spans, autolinks, or raw HTML' do
        where(:markdown, :expected) do
          %q(`` \[\` ``)                        | %q(<code>\[\`</code>)
          # The four leading spaces are significant: they are what makes this
          # line an indented code block rather than a paragraph.
          %q(    \[\])                          | %Q(<code>\\[\\]\n</code>)
          %Q(~~~\n\\[\\]\n~~~)                  | %Q(<code>\\[\\]\n</code>)
          %q(<http://example.com?find=\*>)      | %q(<a href="http://example.com?find=%5C*">http://example.com?find=\*</a>)
          %q[<a href="/bar\/)">]                | %q[<a href="/bar%5C/)">]
        end

        with_them do
          it { correct_html_included(markdown, expected) }
        end
      end

      describe 'work in all other contexts, including URLs and link titles, link references, and info strings in fenced code blocks' do
        where(:markdown, :expected) do
          %q![foo](/bar\* "ti\*tle")!            | %q(<a href="/bar*" title="ti*tle">foo</a>)
          %Q![foo]\n\n[foo]: /bar\\* "ti\\*tle"! | %q(<a href="/bar*" title="ti*tle">foo</a>)
          %Q(``` foo\\+bar\nfoo\n```)            | %Q(<code lang="foo+bar">foo\n</code>)
        end

        with_them do
          it { correct_html_included(markdown, expected) }
        end
      end
    end
  end
end
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment