Commit 4111f81c authored by Julien Muchembled

Make Contribution Tool accept non-conformant %-escaped URL (or unescaped URL)

This fixes TestWebCrawler.test_02_crawlWebSite

git-svn-id: https://svn.erp5.org/repos/public/erp5/trunk@41759 20353a03-c40f-0410-a6d1-a30d3c3de9de
parent 642f146d
......@@ -40,6 +40,7 @@ from Products.ERP5Type.Globals import InitializeClass, DTMLFile
from Products.CMFCore.utils import _checkPermission
from Products.ERP5Type.Tool.BaseTool import BaseTool
from Products.ERP5Type import Permissions
from Products.ERP5Type.Utils import reencodeUrlEscapes
from Products.ERP5 import _dtmldir
from Products.ERP5.Document.Url import no_crawl_protocol_list
from AccessControl import Unauthorized
......@@ -661,10 +662,7 @@ class ContributionTool(BaseTool):
return file_object, filename, content_type tuple
"""
# Quote path part of url
url_tuple = urlparse.urlsplit(url)
quoted_path = urllib.quote(url_tuple[2])
url = urlparse.urlunsplit((url_tuple[0], url_tuple[1], quoted_path,
url_tuple[3], url_tuple[4]))
url = reencodeUrlEscapes(url)
# build a new file from the url
url_file = urllib2.urlopen(urllib2.Request(url,
headers={'Accept':'*/*'}))
......
......@@ -3304,3 +3304,28 @@ def guessEncodingFromText(data, content_type='text/html'):
raise NotImplementedError, 'No encoding detector found.'\
' You must install chardet and python-magic'
# Lookup table used by reencodeUrlEscapes(): maps each of the 256 possible
# one-byte characters either to itself (when it may appear verbatim in a URL)
# or to its upper-case "%XX" escape.
_reencodeUrlEscapes_map = dict(
  (chr(x), chr(x) if chr(x) in (
      # safe
      "!'()*-." "0123456789" "_~"
      "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
      "abcdefghijklmnopqrstuvwxyz"
      # reserved (maybe unsafe)
      "#$&+,/:;=?@[]")
    else "%%%02X" % x)
  for x in range(256))

def reencodeUrlEscapes(url):
  """Fix a non-conformant %-escaped URL (or quote an unescaped one)

  This is a Python reimplementation of 'reencode_escapes' function of Wget 1.12

  - A '%' followed by two hexadecimal digits is an existing escape and is
    kept untouched (the case of the digits is preserved).
  - Any other '%' is escaped as '%25'.
  - Every character that is neither "safe" nor "reserved" (see
    _reencodeUrlEscapes_map) is replaced by its '%XX' escape.

  url -- the URL to fix, as a byte string or as a text string. A text string
         is encoded to UTF-8 first, so that non-ASCII characters are escaped
         byte per byte instead of raising KeyError.

  Return the fixed URL as a native 'str'.
  """
  from string import hexdigits
  # Normalise the input to a native 'str' holding exactly one character per
  # byte, which is what the keys of _reencodeUrlEscapes_map look like.
  if not isinstance(url, bytes):
    url = url.encode('utf-8')
  if not isinstance(url, str):
    # Only reached where 'bytes' and 'str' are distinct types: latin-1 maps
    # each byte value to the character with the same code point.
    url = url.decode('latin-1')
  escape = _reencodeUrlEscapes_map
  part_list = url.split('%')
  # Nothing before the first '%' can be part of an escape sequence.
  result = [escape[c] for c in part_list[0]]
  for part in part_list[1:]:
    # Each remaining part was preceded by a '%' in the original URL.
    if len(part) >= 2 and part[0] in hexdigits and part[1] in hexdigits:
      result.append('%')
    else:
      # Stray '%': escape the percent sign itself.
      result.append('%25')
    result += [escape[c] for c in part]
  return ''.join(result)
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment