Commit 9610b616 authored by Robert Griesemer's avatar Robert Griesemer

go/doc: fix URL matched in ToHTML

Permit paired parentheses in URLs such as:

http://en.wikipedia.org/wiki/Camellia_(cipher)

Fixes #5043.

LGTM=rsc
R=rsc
CC=golang-codereviews
https://golang.org/cl/85610043
parent e79bab30
......@@ -45,13 +45,13 @@ func commentEscape(w io.Writer, text string, nice bool) {
const (
// Regexp for Go identifiers
identRx = `[a-zA-Z_][a-zA-Z_0-9]*` // TODO(gri) ASCII only for now - fix this
identRx = `[\pL_][\pL_0-9]*`
// Regexp for URLs
protocol = `(https?|ftp|file|gopher|mailto|news|nntp|telnet|wais|prospero):`
protocol = `https?|ftp|file|gopher|mailto|news|nntp|telnet|wais|prospero`
hostPart = `[a-zA-Z0-9_@\-]+`
filePart = `[a-zA-Z0-9_?%#~&/\-+=]+`
urlRx = protocol + `//` + // http://
filePart = `[a-zA-Z0-9_?%#~&/\-+=()]+` // parentheses may not be matching; see pairedParensPrefixLen
urlRx = `(` + protocol + `)://` + // http://
hostPart + `([.:]` + hostPart + `)*/?` + // //www.google.com:8080/
filePart + `([:.,]` + filePart + `)*`
)
......@@ -73,6 +73,29 @@ var (
html_endh = []byte("</h3>\n")
)
// pairedParensPrefixLen returns the length of the longest prefix of s containing paired parentheses.
func pairedParensPrefixLen(s string) int {
parens := 0
l := len(s)
for i, ch := range s {
switch ch {
case '(':
if parens == 0 {
l = i
}
parens++
case ')':
parens--
if parens == 0 {
l = len(s)
} else if parens < 0 {
return i
}
}
}
return l
}
// Emphasize and escape a line of text for HTML. URLs are converted into links;
// if the URL also appears in the words map, the link is taken from the map (if
// the corresponding map value is the empty string, the URL is not converted
......@@ -92,18 +115,26 @@ func emphasize(w io.Writer, line string, words map[string]string, nice bool) {
// write text before match
commentEscape(w, line[0:m[0]], nice)
// analyze match
// adjust match if necessary
match := line[m[0]:m[1]]
if n := pairedParensPrefixLen(match); n < len(match) {
// match contains unpaired parentheses (rare);
// redo matching with shortened line for correct indices
m = matchRx.FindStringSubmatchIndex(line[:m[0]+n])
match = match[:n]
}
// analyze match
url := ""
italics := false
if words != nil {
url, italics = words[string(match)]
url, italics = words[match]
}
if m[2] >= 0 {
// match against first parenthesized sub-regexp; must be match against urlRx
if !italics {
// no alternative URL in words list, use match instead
url = string(match)
url = match
}
italics = false // don't italicize URLs
}
......
......@@ -148,13 +148,16 @@ func TestToText(t *testing.T) {
}
var emphasizeTests = []struct {
in string
out string
in, out string
}{
{"http://www.google.com/", `<a href="http://www.google.com/">http://www.google.com/</a>`},
{"https://www.google.com/", `<a href="https://www.google.com/">https://www.google.com/</a>`},
{"http://www.google.com/path.", `<a href="http://www.google.com/path">http://www.google.com/path</a>.`},
{"http://en.wikipedia.org/wiki/Camellia_(cipher)", `<a href="http://en.wikipedia.org/wiki/Camellia_(cipher)">http://en.wikipedia.org/wiki/Camellia_(cipher)</a>`},
{"(http://www.google.com/)", `(<a href="http://www.google.com/">http://www.google.com/</a>)`},
{"http://gmail.com)", `<a href="http://gmail.com">http://gmail.com</a>)`},
{"((http://gmail.com))", `((<a href="http://gmail.com">http://gmail.com</a>))`},
{"http://gmail.com ((http://gmail.com)) ()", `<a href="http://gmail.com">http://gmail.com</a> ((<a href="http://gmail.com">http://gmail.com</a>)) ()`},
{"Foo bar http://example.com/ quux!", `Foo bar <a href="http://example.com/">http://example.com/</a> quux!`},
{"Hello http://example.com/%2f/ /world.", `Hello <a href="http://example.com/%2f/">http://example.com/%2f/</a> /world.`},
{"Lorem http: ipsum //host/path", "Lorem http: ipsum //host/path"},
......@@ -171,3 +174,34 @@ func TestEmphasize(t *testing.T) {
}
}
}
var pairedParensPrefixLenTests = []struct {
in, out string
}{
{"", ""},
{"foo", "foo"},
{"()", "()"},
{"foo()", "foo()"},
{"foo()()()", "foo()()()"},
{"foo()((()()))", "foo()((()()))"},
{"foo()((()()))bar", "foo()((()()))bar"},
{"foo)", "foo"},
{"foo))", "foo"},
{"foo)))))", "foo"},
{"(foo", ""},
{"((foo", ""},
{"(((((foo", ""},
{"(foo)", "(foo)"},
{"((((foo))))", "((((foo))))"},
{"foo()())", "foo()()"},
{"foo((()())", "foo"},
{"foo((()())) (() foo ", "foo((()())) "},
}
func TestPairedParensPrefixLen(t *testing.T) {
for i, tt := range pairedParensPrefixLenTests {
if out := tt.in[:pairedParensPrefixLen(tt.in)]; out != tt.out {
t.Errorf("#%d: mismatch\nhave: %q\nwant: %q", i, out, tt.out)
}
}
}
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment