Commit dd07ef91 authored by Tristan Cavelier's avatar Tristan Cavelier

erp5_web: fix <a> links not always being made absolute on html embedding

- <a href="c"> was not transformed to absolute url
- split WebPage_exportAsSingleFile and WebPage_extractReferredObjectDict
each into two scripts, so that html data given as a string can be processed.
+ tests
parent 6b85d617
"""
Export the web page and its components to a single (m)html file.
`data` is the html to convert.
`format` parameter could also be "mhtml".
`base_url` is the url used as the base when relative urls are found.
When it is given, the script resolves every href through `site_object_dict`.
(Don't forget the ending '/' !)
`site_object_dict` is a dict of (domain, object) used to get the object
corresponding to the absolute url found. By default the dict returned
by `context.ERP5Site_getWebSiteDomainDict()` is used.
TODO: export same components into one mhtml attachment if possible.
"""
from zExceptions import Unauthorized
from base64 import b64encode, b64decode
portal = context.getPortalObject()
# Keyword arguments passed to
# context.Base_formatAttachmentListToMIMEMultipartString() when `format` is
# "mhtml". "attachment_list" is filled in while the html is processed.
mhtml_message = {
  "subtype": "related",
  "param_list": [("type", "text/html")],
  "header_dict": {"From": "<Saved by ERP5>", "Subject": title},
  "attachment_list": [],
}
def main(data):
  """
  Convert `data` (html as str or unicode) and return the result as an
  utf-8 encoded string: either the html itself or, when `format` is
  "mhtml", the multipart message built from the html and its attachments.
  """
  html = data
  if isinstance(html, str):
    html = html.decode("utf-8")
  converted_part_list = []
  for fn, p in handleHtmlPartList(parseHtml(html)):
    converted_part_list.append(fn(p))
  html = u"".join(converted_part_list).encode("utf-8")
  if format != "mhtml":
    return html
  # The converted page is always the first part of the mhtml message.
  mhtml_message["attachment_list"].insert(0, {
    "mime_type": "text/html",
    "encode": "quoted-printable",
    "add_header_list": [("Content-Location", base_url)],
    "data": html,
  })
  return context.Base_formatAttachmentListToMIMEMultipartString(**mhtml_message)
def handleHtmlTag(tag, attrs):
  """
  Rewrite in place the url-carrying attributes of one tag and return
  `(tag, attrs)`.

  `attrs` is a list of (name, value) pairs. Attributes without a value
  (value is None, e.g. `<a href>`) are left untouched: there is nothing to
  resolve and the url helpers only accept strings.
  """
  #if tag == "base": and "href" in attrs: # should not exist in safe-html
  #  NotImplemented
  is_stylesheet_link = tag == "link" and anny(attrs, key=lambda a: a[0] == "rel" and a[1] == "stylesheet")
  for index in range(len(attrs)):
    name, value = attrs[index][0], attrs[index][1]
    if value is None:
      continue
    if tag == "object":
      if name == "data":
        attrs[index] = name, handleImageSource(value)
    elif tag == "img":
      if name == "src":
        attrs[index] = name, handleImageSource(value)
    elif is_stylesheet_link:
      if name == "href":
        # urls found inside the embedded stylesheet are converted too
        attrs[index] = name, replaceFromDataUri(handleCssHref(value), replaceCssUrl)
    elif tag == "script":
      if name == "src":
        attrs[index] = name, handleJsSource(value)
    elif name == "href" or name == "src":
      # any other tag (including non stylesheet <link>): only make absolute
      attrs[index] = name, makeHrefAbsolute(value)
  # inline css is handled for every tag, after the tag specific attributes
  for index in range(len(attrs)):
    name, value = attrs[index][0], attrs[index][1]
    if name == "style" and value is not None:
      attrs[index] = name, replaceCssUrl(value)
  return tag, attrs
def strHtmlPart(part):
  """
  Serialize one parsed html part (a tuple whose first item is the part
  type) back to html text. Start tags go through handleHtmlTag first.
  Returns None for an unknown part type.
  """
  part_type = part[0]
  if part_type in ("starttag", "startendtag"):
    tag, attrs = handleHtmlTag(part[1], part[2])
    attr_text_list = []
    for k, v in attrs:
      attr_text_list.append("%s=\"%s\"" % (escapeHtml(k), escapeHtml(v or "")))
    attrs_str = " ".join(attr_text_list)
    if attrs_str:
      attrs_str = " " + attrs_str
    closing = ""
    if part_type == "startendtag":
      closing = " /"
    return "<%s%s%s>" % (tag, attrs_str, closing)
  if part_type == "data":
    return part[1]
  template = {
    "endtag": "</%s>",
    "entityref": "&%s;",
    "charref": "&#%s;",
    "comment": "<!--%s-->",
    "decl": "<!%s>",
    "unknown_decl": "<!%s>",
    "pi": "<?%s>",
  }.get(part_type)
  if template is None:
    return None
  return template % part[1]
# <script> elements are dropped unless the caller passes allow_script.
disallow_script = not allow_script
def handleHtmlPartList(part_list):
  """
  Turn parsed html parts into a list of (function, argument) pairs; calling
  each function on its argument gives the output text of that piece.

  - with `disallow_script`, everything from <script> to </script> is dropped;
  - the content of <style> is gathered and emitted as one
    (replaceCssUrl, css_text) pair just before the </style> end tag;
  - every other part is emitted as (strHtmlPart, part).
  """
  result = []
  css_text = ""
  skipping_script = False
  inside_style = False
  for part in part_list:
    if skipping_script:
      # can only be data until </script> endtag
      if part[0] == "endtag" and part[1] == "script":
        skipping_script = False
      continue
    if inside_style:
      if part[0] == "endtag" and part[1] == "style":
        result.append((replaceCssUrl, css_text))
        result.append((strHtmlPart, part))
        css_text = ""
        inside_style = False
      else:
        # can only be data until </style> endtag
        css_text += strHtmlPart(part)
      continue
    if part[0] == "starttag":
      # when you save a page from a browser, every script tag are removed
      if part[1] == "script" and disallow_script:
        skipping_script = True
        continue
      if part[1] == "style":
        inside_style = True
    result.append((strHtmlPart, part))
  return result
def handleCssHref(href):
  """Convert the href of a stylesheet link; delegates to handleHref."""
  return handleHref(href)
def handleJsSource(href):
  """Convert the src of a script tag; delegates to handleHref."""
  return handleHref(href)
def handleHref(href):
  """
  Return the replacement for `href`: the embedded form of the object it
  points to, the absolute url when the object cannot be reached, or `href`
  unchanged when it is not a url.
  """
  if isHrefAUrl(href):
    try:
      obj = traverseHref(href)
    except (KeyError, Unauthorized):
      # KeyError can be site_object_dict[domain] KeyError
      # or restrictedTraverse(path) KeyError
      return makeHrefAbsolute(href)
    return handleHrefObject(obj, href)
  return href
def handleImageSource(src):
  """
  Same as handleHref but for image sources: a reachable object is handed
  to handleImageSourceObject so that conversion parameters are honoured.
  """
  if isHrefAUrl(src):
    try:
      obj = traverseHref(src)
    except (KeyError, Unauthorized):
      # KeyError can be site_object_dict[domain] KeyError
      # or restrictedTraverse(path) KeyError
      return makeHrefAbsolute(src)
    return handleImageSourceObject(obj, src)
  return src
def replaceCssUrl(data):
  """
  Return the css text `data` with the urls it references converted by
  handleImageSource. Parsing is delegated to context.Base_parseCssForUrl;
  for a "url" part, part[2] is passed to handleImageSource, otherwise
  part[1] is copied as is.
  """
  output = ""
  for part in context.Base_parseCssForUrl(data):
    if part[0] == "url" and isHrefAUrl(part[2]):
      output += handleImageSource(part[2])
    else:
      output += part[1]
  return output
def handleImageSourceObject(obj, src):
  """
  Embed the image `obj` referenced by `src`. When `obj` can be converted
  and `src` carries "format" and/or "display" query parameters, the
  converted data is embedded; otherwise falls back on handleHrefObject with
  the "cannot load image" picture as default content.
  """
  if hasattr(obj, "convert"):
    format_kw = {}
    for key, value in parseUrlSearch(extractUrlSearch(src)):
      if value is None:
        continue
      for name in ("format", "display"):
        if key == name:
          format_kw[name] = value
    if format_kw:
      mime, data = obj.convert(**format_kw)
      return handleLinkedData(mime, str(data), src)
  return handleHrefObject(obj, src, default_mimetype=bad_image_mime_type, default_data=bad_image_data)
def handleHrefObject(obj, src, default_mimetype="text/html", default_data="<p>Linked page not found</p>"):
  """
  Embed the content of `obj`, found at url `src`, through handleLinkedData.

  Objects with a content type provide their `data` attribute or
  `getData()` result; other callables without `getPortalType` are called
  and embedded as "text/html". Anything else is replaced by
  `default_mimetype` / `default_data`.
  """
  if hasattr(obj, "getContentType"):
    # handle File portal_skins/folder/file.png
    # XXX handle "?portal_skin=" parameter ?
    mime = obj.getContentType()
    if not mime:
      return handleLinkedData(default_mimetype, default_data, src)
    if hasattr(obj, "data"):
      data = obj.data or ""
    else:
      data = getattr(obj, "getData", lambda: str(obj))() or ""
  elif not hasattr(obj, "getPortalType") and callable(obj):
    # handle Object.view
    # XXX handle url query parameters ? Not so easy because we need to
    # use the same behavior as when we call a script from browser URL bar.
    mime, data = "text/html", obj()
  else:
    return handleLinkedData(default_mimetype, default_data, src)
  if isinstance(data, unicode):
    data = data.encode("utf-8")
  return handleLinkedData(mime, data, src)
# Placeholder picture embedded when a referenced image has no usable
# content (see handleImageSourceObject). Kept as a data url; the decoded
# bytes and the mime type are extracted just below.
bad_image_data_url = (
  "data:image/png;base64," +  # little image showing cannot load image
  "iVBORw0KGgoAAAANSUhEUgAAABEAAAATCAIAAAD5x3GmAAACWklEQVQokZWSX0hTcRTHz/" +
  "3TunMmMyxrQUzEPQSCFEI0fCi0HmSKdsUGg3q2h5I99dj7bE8Nw6cwLDb3kO7JP5m6h0TE" +
  "CmYQjJYgpaPc7q67+93de04P0zvnQ+CP78Pvdzgfzjnf3+GICE55+NMCACACACKOj49rmv" +
  "afvNHRUZ4/KkBEjLFQKJRTjXyRTqigUSwWI6JQKGSaJhEREQ8ApmkCgFrif+8bJ7RfMAGA" +
  "MRYMBsPhMCLWzFPUUdVI1cjjEj0usXLXdLJ6sTCx2jIBAd1otVVe11vPbKT1iqeJRMLKKp" +
  "fLVYaoChxGEAwDbt0ZsNs4ABAEbiLyoqYOEax/ZyfsYmX4q5iCAABQd1aoen3UGmDt/zod" +
  "/EWnuJczcgcIABzHu91um81W9YCI8Jga6rirqUV41O9pQqeDR6J6iRvs7VUeDFQZJCKEih" +
  "DxfINemIioq4ms7GtrwkaH4KovZ2WfujLL1/SGiIgZZSmavj2Veto0GYXO7vzawo7saztX" +
  "3JF9+bUF6Oyu8YAAtnLvNrJBAOPb7lbkizQyPZuWfX8+LeTaG00NHDe7r8Rmju0oQaawVA" +
  "Eqga+/Xkc+B1vexDSJzx+AJvEtk1FDEHjLAEXfXdt7ZgEA0H754UjH2GZgWFGR2UVFxc3A" +
  "sIh4yDDGFjPPdfxhAdea/Y87xpJy//bqnN3b05XK2r0928n55P2+w3kMw9CXmy/AE4u5Fw" +
  "h89A/tLM9d6urxTr9/G4/74zMfBvt+rsxzRKTruqIojNUsgSRJB+vrqVcv705Fc8ViqVSS" +
  "JMnpcMz5h/4B1Qxz9NOjZCgAAAAASUVORK5CYII="
)
bad_image_data = b64decode(bad_image_data_url.split(",", 1)[1])
bad_image_mime_type = "image/png"
# Default mapping of domain -> web site object.
if site_object_dict is None:
  site_object_dict = context.ERP5Site_getWebSiteDomainDict()
base_url_root_object = getattr(context, "getWebSiteValue", str)() or portal
base_url_object = context
# Resolve base_url by removing everything after the last slash
force_base_url = base_url is not None
if force_base_url:
  # An explicit base url must be an absolute http(s) url.
  if base_url.startswith("https://"):
    request_protocol = "https:"
  elif base_url.startswith("http://"):
    request_protocol = "http:"
  else:
    raise ValueError("invalid `base_url` argument")
  root_url = "/".join(base_url.split("/", 3)[:3])
  if root_url != base_url:
    base_url = base_url.rsplit("/", 1)[0]
else:
  # No explicit base url: urls are relative to the context document.
  request_protocol = context.REQUEST.SERVER_URL.split(":", 1)[0] + ":"
  root_url = base_url_root_object.absolute_url()
  base_url = base_url_object.absolute_url()
  object_relative_url = base_url_object.getRelativeUrl()
  root_relative_url = base_url_root_object.getRelativeUrl()
  assert object_relative_url.startswith(root_relative_url)
  # path of the context document relative to the root object
  base_path = object_relative_url[len(root_relative_url):]
  if not base_path.startswith("/"):
    base_path = "/" + base_path
def handleLinkedData(mime, data, href):
  """
  Register the content `data` (of type `mime`) referenced by `href` and
  return the url to put in the html: for "mhtml" the content is appended
  as an attachment and the absolute url is returned; otherwise a base64
  data url is returned.
  """
  if format != "mhtml":
    return "data:%s;base64,%s" % (mime, b64encode(data))
  url = makeHrefAbsolute(href)
  transfer_encoding = None
  if mime.startswith("text/"):
    transfer_encoding = "quoted-printable"
  mhtml_message["attachment_list"].append({
    "mime_type": mime,
    "encode": transfer_encoding,
    "add_header_list": [("Content-Location", url)],
    "data": str(data),
  })
  return url
def makeHrefAbsolute(href):
  """
  Return `href` as an absolute url. Absolute http(s) urls and non-url
  values are returned unchanged; "//host/..." gets the request protocol,
  "/path" is rooted at `root_url`, anything else is relative to `base_url`.
  """
  must_be_resolved = not isHrefAnAbsoluteUrl(href) and isHrefAUrl(href)
  if not must_be_resolved:
    return href
  if href.startswith("//"):
    prefix = request_protocol
  elif href.startswith("/"):
    prefix = root_url
  else:
    prefix = base_url + "/"
  return prefix + href
def isHrefAnAbsoluteUrl(href):
  """Tell whether `href` starts with an explicit http or https scheme."""
  return href.startswith(("https://", "http://"))
def isHrefAUrl(href):
  """
  Tell whether `href` can be resolved as a url: http(s) urls and values
  without a purely alphabetic "scheme:" prefix are urls, whereas
  "mailto:", "data:", "javascript:"... values are not.
  """
  if href.startswith(("https://", "http://")):
    return True
  scheme, colon, _rest = href.partition(":")
  if not colon:
    # no scheme at all: relative url
    return True
  return not scheme.isalpha()
# Options given to context.Base_normalizeUrlPathname for every path.
normalize_kw = {"keep_empty": False, "keep_trailing_slash": False}
def traverseHref(href, allow_hash=False):
  """
  Return the object `href` points to. The query string is ignored, and so
  is the fragment unless `allow_hash` is true.

  Urls with a host are looked up in `site_object_dict` (KeyError for an
  unknown domain); other urls are traversed from the root object, or
  turned into absolute urls first when an explicit base url is forced.
  """
  url = href.split("?", 1)[0]
  if not allow_hash:
    url = url.split("#", 1)[0]
  if url.startswith(("https://", "http://", "//")):
    # absolute url possibly on other sites
    url_part_list = url.split("/", 3)
    site_object = site_object_dict[url_part_list[2]]
    pathname = url[len("/".join(url_part_list[:3])):]
    if pathname[:1] != "/":
      pathname = "/" + pathname
    relative_path = context.Base_normalizeUrlPathname(pathname, **normalize_kw)[1:]
    return site_object.restrictedTraverse(str(relative_path))
  if url.startswith("/"):
    # absolute path, relative url
    if force_base_url:
      # use site_domain_dict
      return traverseHref(root_url + href, allow_hash=allow_hash)
    pathname = url
  else:
    # relative url
    if force_base_url:
      # use site_domain_dict
      return traverseHref(base_url + "/" + href, allow_hash=allow_hash)
    pathname = base_path + "/" + url
  return base_url_root_object.restrictedTraverse(str(context.Base_normalizeUrlPathname(pathname, **normalize_kw)[1:]))
def replaceFromDataUri(data_uri, replacer):
  """
  Apply `replacer` to the payload of a "text/css" data uri and return the
  rebuilt uri (base64 payloads are decoded, then encoded again). Anything
  that is not a css data uri is returned unchanged.
  """
  header, comma, payload = data_uri.partition(",")
  if not comma or "text/css" not in header:
    return data_uri
  if ";base64" in header:
    return "%s,%s" % (header, b64encode(replacer(b64decode(payload))))
  return "%s,%s" % (header, replacer(payload))
def extractUrlSearch(url):
  """
  Return the query string of `url`, leading "?" included and fragment
  excluded, or "" when there is none.
  """
  without_hash = url.split("#", 1)[0]
  question_mark = without_hash.find("?")
  if question_mark < 0:
    return ""
  return without_hash[question_mark:]
def parseUrlSearch(search):
  """
  Parse a query string (with or without its leading "?") into a list of
  (key, value) pairs, in order of appearance.

  `value` is everything after the first "=" of the parameter, and None
  when the parameter has no "=" at all ("?format"), which lets callers
  tell a missing value from an empty one ("?format=").
  """
  if search[:1] == "?":
    search = search[1:]
  result = []
  for part in search.split("&"):
    piece_list = part.split("=", 1)
    value = None
    if len(piece_list) > 1:
      value = piece_list[1]
    result.append((piece_list[0], value))
  return result
def parseHtml(text):
  """Parse html `text` into a list of parts; see context.Base_parseHtml."""
  return context.Base_parseHtml(text)
def escapeHtml(text):
  """Escape `&`, `<`, `>` and `"` so `text` fits in a quoted attribute."""
  # "&" must come first so that the entities added afterwards stay intact
  for character, entity in (("&", "&amp;"), ("<", "&lt;"), (">", "&gt;"), ("\"", "&quot;")):
    text = text.replace(character, entity)
  return text
def anny(iterable, key=None):
  """
  Return True as soon as one item of `iterable` is true, False otherwise.
  When `key` is given, the truth of `key(item)` is tested instead.
  """
  for item in iterable:
    candidate = key(item) if key else item
    if candidate:
      return True
  return False
# Script entry point: every helper above is defined, run the conversion.
return main(data)
<?xml version="1.0"?>
<ZopeData>
<record id="1" aka="AAAAAAAAAAE=">
<pickle>
<global name="PythonScript" module="Products.PythonScripts.PythonScript"/>
</pickle>
<pickle>
<dictionary>
<item>
<key> <string>Script_magic</string> </key>
<value> <int>3</int> </value>
</item>
<item>
<key> <string>_bind_names</string> </key>
<value>
<object>
<klass>
<global name="NameAssignments" module="Shared.DC.Scripts.Bindings"/>
</klass>
<tuple/>
<state>
<dictionary>
<item>
<key> <string>_asgns</string> </key>
<value>
<dictionary>
<item>
<key> <string>name_container</string> </key>
<value> <string>container</string> </value>
</item>
<item>
<key> <string>name_context</string> </key>
<value> <string>context</string> </value>
</item>
<item>
<key> <string>name_m_self</string> </key>
<value> <string>script</string> </value>
</item>
<item>
<key> <string>name_subpath</string> </key>
<value> <string>traverse_subpath</string> </value>
</item>
</dictionary>
</value>
</item>
</dictionary>
</state>
</object>
</value>
</item>
<item>
<key> <string>_params</string> </key>
<value> <string>data, allow_script=False, format="embedded_html", base_url=None, site_object_dict=None, title=\'Untitled\'</string> </value>
</item>
<item>
<key> <string>id</string> </key>
<value> <string>Base_convertHtmlToSingleFile</string> </value>
</item>
</dictionary>
</pickle>
</record>
</ZopeData>
"""
Extract all object referenced by html components
`data` is the html to parse.
`allow_tag_list` is the white list of tag to parse.
Default is to allow every tag.
`deny_tag_list` is the black list of tag to parse.
Default is to deny no tag.
`base_url` is the url used as the base when relative urls are found.
When it is given, the script resolves every href through `site_object_dict`.
(Don't forget the ending '/' !)
`site_object_dict` is a dict of (domain, object) used to get the object
corresponding to the absolute url found. By default the dict returned
by `context.ERP5Site_getWebSiteDomainDict()` is used.
"""
from zExceptions import Unauthorized
portal = context.getPortalObject()
# Result of the script: maps each href found to its object, or to None.
href_object_dict = {}
# Anything that is not a list or tuple means "no white list"...
if not isinstance(allow_tag_list, (list, tuple)):
  allow_tag_list = None
# ... or "empty black list".
if not isinstance(deny_tag_list, (list, tuple)):
  deny_tag_list = []
def main(data):
  """
  Parse the html `data` (str or unicode), process every part with
  handleHtmlPart, and return the resulting `href_object_dict`.
  """
  html = data.decode("utf-8") if isinstance(data, str) else data
  for part in context.Base_parseHtml(html):
    handleHtmlPart(part)
  return href_object_dict
def handleHtmlTag(tag, attrs):
  """
  Record the objects referenced by the attributes of one tag.

  Tags rejected by `allow_tag_list` / `deny_tag_list` are ignored. For
  <style>, the next data part is scheduled to be parsed as css. Attributes
  without a value (value is None, e.g. `<a href>`) are skipped because the
  url helpers only accept strings.
  """
  if allow_tag_list is not None and tag not in allow_tag_list:
    return
  if tag in deny_tag_list:
    return
  #if tag == "base": and "href" in attrs: # should not exist in safe-html
  #  NotImplemented
  if tag == "object":
    for attr in attrs:
      if attr[0] == "data" and attr[1] is not None:
        handleHref(attr[1])
  elif tag == "style":
    # for style tags, next data will always be the entire text until </style>
    on_next_data[0] = handleCss
  else:
    for attr in attrs:
      if attr[0] in ("src", "href") and attr[1] is not None:
        handleHref(attr[1])
  # inline css is inspected for every accepted tag
  for attr in attrs:
    if attr[0] == "style" and attr[1] is not None:
      handleCss(attr[1])
# One-slot holder for the handler of the next "data" part; a list is used so
# that nested functions can replace the handler.
on_next_data = [lambda x: x]
def handleHtmlPart(part):
  """
  Process one parsed html part. Start tags go to handleHtmlTag. A data part
  is given to the pending `on_next_data` handler, which is then cleared;
  with no pending handler the data text is returned. Other parts give None.
  """
  kind = part[0]
  if kind in ("starttag", "startendtag"):
    return handleHtmlTag(part[1], part[2])
  if kind != "data":
    return None
  handler = on_next_data[0]
  if handler is None:
    return part[1]
  handler(part[1])
  on_next_data[0] = None
  return None
def handleHref(href):
  """
  Store in `href_object_dict` the object `href` points to, or None when it
  cannot be reached. Values that are not urls are returned and not stored.
  """
  # handles "base_url/document_module/id"
  # handles "base_url/R-Document.Reference"
  # handles "base_url/R-Document.Reference/view"
  if isHrefAUrl(href):
    try:
      referred_object = traverseHref(href, allow_method=False)
    except (KeyError, Unauthorized):
      referred_object = None
    href_object_dict[href.encode("utf-8")] = referred_object
    return None
  return href
def handleCss(data):
  """Record every url found by context.Base_parseCssForUrl in css `data`."""
  for part in context.Base_parseCssForUrl(data):
    if part[0] != "url":
      continue
    handleHref(part[2])
def isHrefAUrl(href):
  """
  Tell whether `href` can be resolved as a url: http(s) urls and values
  without a purely alphabetic "scheme:" prefix are urls, whereas
  "mailto:", "data:", "javascript:"... values are not.
  """
  if href.startswith(("https://", "http://")):
    return True
  scheme, colon, _rest = href.partition(":")
  if not colon:
    # no scheme at all: relative url
    return True
  return not scheme.isalpha()
def traverseHref(url, allow_method=True, allow_hash=False):
  """
  Return the object `url` points to. With `allow_method` false, a result
  that has no `getUid` is replaced by the object found one path segment
  higher.
  """
  base_obj, relative_path = prepareHrefTraverse(url, allow_hash=allow_hash)
  found = base_obj.restrictedTraverse(relative_path)
  if not allow_method and found is not None:
    try:
      found.getUid()
    except AttributeError:
      parent_path = "/".join(relative_path.split("/")[:-1])
      found = base_obj.restrictedTraverse(parent_path)
  return found
# Default mapping of domain -> web site object.
if site_object_dict is None:
  site_object_dict = context.ERP5Site_getWebSiteDomainDict()
base_url_root_object = getattr(context, "getWebSiteValue", str)() or portal
base_url_object = context
# Resolve base_url by removing everything after the last slash
force_base_url = False
if base_url is not None:
  # an explicit base url must be an absolute http(s) url
  if base_url.startswith("https://") or base_url.startswith("http://"):
    force_base_url = True
  else:
    raise ValueError("invalid `base_url` argument")
if force_base_url:
  root_url = "/".join(base_url.split("/", 3)[:3])
  if root_url != base_url:
    base_url = "/".join(base_url.split("/")[:-1])
else:
  # no explicit base url: urls are relative to the context document
  root_url = base_url_root_object.absolute_url()
  base_url = base_url_object.absolute_url()
  base_path = "."
  if base_url_object.getRelativeUrl().startswith(base_url_root_object.getRelativeUrl()):
    # path of the context document relative to the root object
    base_path = base_url_object.getRelativeUrl()[len(base_url_root_object.getRelativeUrl()):]
  # NOTE(review): the nesting of this test was lost in the flattened
  # source; it is assumed to sit at this level - confirm.
  if base_path and not base_path.startswith("/"):
    base_path = "/" + base_path
# Options given to context.Base_normalizeUrlPathname for every path.
normalize_kw = {"keep_empty": False, "keep_trailing_slash": False}
def prepareHrefTraverse(href, allow_hash=False):
  """
  Return `(object, relative_path)`: the object to traverse from and the
  normalized path leading to what `href` points to. The query string is
  ignored, and so is the fragment unless `allow_hash` is true.

  Urls with a host are looked up in `site_object_dict` (KeyError for an
  unknown domain); other urls start from the root object, or are turned
  into absolute urls first when an explicit base url is forced.
  """
  url = href.split("?", 1)[0]
  if not allow_hash:
    url = url.split("#", 1)[0]
  if url.startswith(("https://", "http://", "//")):
    # absolute url possibly on other sites
    url_part_list = url.split("/", 3)
    site_object = site_object_dict[url_part_list[2]]
    pathname = url[len("/".join(url_part_list[:3])):]
    if pathname[:1] != "/":
      pathname = "/" + pathname
    relative_path = context.Base_normalizeUrlPathname(pathname, **normalize_kw)[1:]
    return site_object, str(relative_path)
  if url.startswith("/"):
    # absolute path, relative url
    if force_base_url:
      # use site_domain_dict
      return prepareHrefTraverse(root_url + href, allow_hash=allow_hash)
    pathname = url
  else:
    # relative path
    if force_base_url:
      # use site_domain_dict
      return prepareHrefTraverse(base_url + "/" + href, allow_hash=allow_hash)
    pathname = base_path + "/" + url
  return base_url_root_object, str(context.Base_normalizeUrlPathname(pathname, **normalize_kw)[1:])
# Script entry point: every helper above is defined, run the extraction.
return main(data)
<?xml version="1.0"?>
<ZopeData>
<record id="1" aka="AAAAAAAAAAE=">
<pickle>
<global name="PythonScript" module="Products.PythonScripts.PythonScript"/>
</pickle>
<pickle>
<dictionary>
<item>
<key> <string>Script_magic</string> </key>
<value> <int>3</int> </value>
</item>
<item>
<key> <string>_bind_names</string> </key>
<value>
<object>
<klass>
<global name="NameAssignments" module="Shared.DC.Scripts.Bindings"/>
</klass>
<tuple/>
<state>
<dictionary>
<item>
<key> <string>_asgns</string> </key>
<value>
<dictionary>
<item>
<key> <string>name_container</string> </key>
<value> <string>container</string> </value>
</item>
<item>
<key> <string>name_context</string> </key>
<value> <string>context</string> </value>
</item>
<item>
<key> <string>name_m_self</string> </key>
<value> <string>script</string> </value>
</item>
<item>
<key> <string>name_subpath</string> </key>
<value> <string>traverse_subpath</string> </value>
</item>
</dictionary>
</value>
</item>
</dictionary>
</state>
</object>
</value>
</item>
<item>
<key> <string>_params</string> </key>
<value> <string>data, allow_tag_list=None, deny_tag_list=None, base_url=None, site_object_dict=None</string> </value>
</item>
<item>
<key> <string>id</string> </key>
<value> <string>Base_extractReferredObjectDictFromHtml</string> </value>
</item>
</dictionary>
</pickle>
</record>
</ZopeData>
"""
Export the web page and its components to a single (m)html file.
`format` parameter could also be "mhtml".
TODO: export same components into one mhtml attachment if possible.
see Base_convertHtmlToSingleFile for documentation
"""
from zExceptions import Unauthorized
from base64 import b64encode, b64decode
portal = context.getPortalObject()
# Keyword arguments passed to
# context.Base_formatAttachmentListToMIMEMultipartString() when `format` is
# "mhtml". "attachment_list" is filled in while the html is processed.
mhtml_message = {
  "subtype": "related",
  "param_list": [("type", "text/html")],
  "header_dict": {"From": "<Saved by ERP5>", "Subject": "Untitled"},
  "attachment_list": [],
}
def main():
data = context.getTextContent("")
if isinstance(data, str):
data = data.decode("utf-8")
data = u"".join([fn(p) for fn, p in handleHtmlPartList(parseHtml(data))])
data = data.encode("utf-8")
data = context.Base_convertHtmlToSingleFile(
context.getTextContent(""),
allow_script=allow_script,
format=format,
base_url=base_url,
site_object_dict=site_object_dict,
title=context.getTitle() or "Untitled",
)
if REQUEST is not None:
if format == "mhtml":
mhtml_message["header_dict"]["Subject"] = context.getTitle() or "Untitled"
mhtml_message["attachment_list"].insert(0, {
"mime_type": "text/html",
"encode": "quoted-printable",
"add_header_list": [("Content-Location", context.absolute_url())],
"data": data,
})
res = context.Base_formatAttachmentListToMIMEMultipartString(**mhtml_message)
if REQUEST is not None:
REQUEST.RESPONSE.setHeader("Content-Type", "multipart/related")
REQUEST.RESPONSE.setHeader("Content-Disposition", 'attachment;filename="%s-%s-%s.mhtml"' % (
context.getReference("untitled").replace('"', '\\"'),
context.getVersion("001").replace('"', '\\"'),
context.getLanguage("en").replace('"', '\\"'),
))
return res
if REQUEST is not None:
REQUEST.RESPONSE.setHeader("Content-Type", "multipart/related")
REQUEST.RESPONSE.setHeader("Content-Disposition", 'attachment;filename="%s-%s-%s.mhtml"' % (
context.getReference("untitled").replace('"', '\\"'),
context.getVersion("001").replace('"', '\\"'),
context.getLanguage("en").replace('"', '\\"'),
))
else:
REQUEST.RESPONSE.setHeader("Content-Type", "text/html")
REQUEST.RESPONSE.setHeader("Content-Disposition", 'attachment;filename="%s-%s-%s.html"' % (
context.getReference("untitled").replace('"', '\\"'),
context.getVersion("001").replace('"', '\\"'),
context.getLanguage("en").replace('"', '\\"'),
))
return data
def handleHtmlTag(tag, attrs):
  """
  Rewrite in place the url-carrying attributes of one tag and return
  `(tag, attrs)`.

  `attrs` is a list of (name, value) pairs. Attributes without a value
  (value is None, e.g. `<a href>`) are left untouched: there is nothing to
  resolve and the url helpers only accept strings.
  """
  #if tag == "base": and "href" in attrs: # should not exist in safe-html
  #  NotImplemented
  is_stylesheet_link = tag == "link" and anny(attrs, key=lambda a: a[0] == "rel" and a[1] == "stylesheet")
  for index in range(len(attrs)):
    name, value = attrs[index][0], attrs[index][1]
    if value is None:
      continue
    if tag == "object":
      if name == "data":
        attrs[index] = name, handleImageSource(value)
    elif tag == "img":
      if name == "src":
        attrs[index] = name, handleImageSource(value)
    elif is_stylesheet_link:
      if name == "href":
        # urls found inside the embedded stylesheet are converted too
        attrs[index] = name, replaceFromDataUri(handleCssHref(value), replaceCssUrl)
    elif tag == "script":
      if name == "src":
        attrs[index] = name, handleJsSource(value)
    elif name == "href" or name == "src":
      # any other tag (including non stylesheet <link>): only make absolute
      attrs[index] = name, makeHrefAbsolute(value)
  # inline css is handled for every tag, after the tag specific attributes
  for index in range(len(attrs)):
    name, value = attrs[index][0], attrs[index][1]
    if name == "style" and value is not None:
      attrs[index] = name, replaceCssUrl(value)
  return tag, attrs
def strHtmlPart(part):
  """
  Serialize one parsed html part (a tuple whose first item is the part
  type) back to html text. Start tags go through handleHtmlTag first.
  Returns None for an unknown part type.
  """
  part_type = part[0]
  if part_type in ("starttag", "startendtag"):
    tag, attrs = handleHtmlTag(part[1], part[2])
    attr_text_list = []
    for k, v in attrs:
      attr_text_list.append("%s=\"%s\"" % (escapeHtml(k), escapeHtml(v or "")))
    attrs_str = " ".join(attr_text_list)
    if attrs_str:
      attrs_str = " " + attrs_str
    closing = ""
    if part_type == "startendtag":
      closing = " /"
    return "<%s%s%s>" % (tag, attrs_str, closing)
  if part_type == "data":
    return part[1]
  template = {
    "endtag": "</%s>",
    "entityref": "&%s;",
    "charref": "&#%s;",
    "comment": "<!--%s-->",
    "decl": "<!%s>",
    "unknown_decl": "<!%s>",
    "pi": "<?%s>",
  }.get(part_type)
  if template is None:
    return None
  return template % part[1]
# <script> elements are dropped unless the caller passes allow_script.
disallow_script = not allow_script
def handleHtmlPartList(part_list):
  """
  Turn parsed html parts into a list of (function, argument) pairs; calling
  each function on its argument gives the output text of that piece.

  - with `disallow_script`, everything from <script> to </script> is dropped;
  - the content of <style> is gathered and emitted as one
    (replaceCssUrl, css_text) pair just before the </style> end tag;
  - every other part is emitted as (strHtmlPart, part).
  """
  result = []
  css_text = ""
  skipping_script = False
  inside_style = False
  for part in part_list:
    if skipping_script:
      # can only be data until </script> endtag
      if part[0] == "endtag" and part[1] == "script":
        skipping_script = False
      continue
    if inside_style:
      if part[0] == "endtag" and part[1] == "style":
        result.append((replaceCssUrl, css_text))
        result.append((strHtmlPart, part))
        css_text = ""
        inside_style = False
      else:
        # can only be data until </style> endtag
        css_text += strHtmlPart(part)
      continue
    if part[0] == "starttag":
      # when you save a page from a browser, every script tag are removed
      if part[1] == "script" and disallow_script:
        skipping_script = True
        continue
      if part[1] == "style":
        inside_style = True
    result.append((strHtmlPart, part))
  return result
def handleCssHref(href):
  """Convert the href of a stylesheet link; delegates to handleHref."""
  return handleHref(href)
def handleJsSource(href):
  """Convert the src of a script tag; delegates to handleHref."""
  return handleHref(href)
def handleHref(href):
  """
  Return the replacement for `href`: the embedded form of the object it
  points to, the absolute url when the object cannot be reached, or `href`
  unchanged when it is not a url.
  """
  if isHrefAUrl(href):
    try:
      obj = traverseHref(href)
    except (KeyError, Unauthorized):
      # unknown domain, unknown path or forbidden object
      return makeHrefAbsolute(href)
    return handleHrefObject(obj, href)
  return href
def handleImageSource(src):
  """
  Same as handleHref but for image sources: a reachable object is handed
  to handleImageSourceObject so that conversion parameters are honoured.
  """
  if isHrefAUrl(src):
    try:
      obj = traverseHref(src)
    except (KeyError, Unauthorized):
      # unknown domain, unknown path or forbidden object
      return makeHrefAbsolute(src)
    return handleImageSourceObject(obj, src)
  return src
def replaceCssUrl(data):
  """
  Return the css text `data` with the urls it references converted by
  handleImageSource. Parsing is delegated to context.Base_parseCssForUrl;
  for a "url" part, part[2] is passed to handleImageSource, otherwise
  part[1] is copied as is.
  """
  output = ""
  for part in context.Base_parseCssForUrl(data):
    if part[0] == "url" and isHrefAUrl(part[2]):
      output += handleImageSource(part[2])
    else:
      output += part[1]
  return output
def handleImageSourceObject(obj, src):
  """
  Embed the image `obj` referenced by `src`. When `obj` can be converted
  and `src` carries "format" and/or "display" query parameters, the
  converted data is embedded; otherwise falls back on handleHrefObject with
  the "cannot load image" picture as default content.
  """
  if hasattr(obj, "convert"):
    format_kw = {}
    for key, value in parseUrlSearch(extractUrlSearch(src)):
      if value is None:
        continue
      for name in ("format", "display"):
        if key == name:
          format_kw[name] = value
    if format_kw:
      mime, data = obj.convert(**format_kw)
      return handleLinkedData(mime, str(data), src)
  return handleHrefObject(obj, src, default_mimetype=bad_image_mime_type, default_data=bad_image_data)
def handleHrefObject(obj, src, default_mimetype="text/html", default_data="<p>Linked page not found</p>"):
  """
  Embed the content of `obj`, found at url `src`, through handleLinkedData.

  Objects with a content type provide their `data` attribute or
  `getData()` result; other callables without `getPortalType` are called
  and embedded as "text/html". Anything else is replaced by
  `default_mimetype` / `default_data`.
  """
  if hasattr(obj, "getContentType"):
    # handle File portal_skins/folder/file.png
    # XXX handle "?portal_skin=" parameter ?
    mime = obj.getContentType()
    if not mime:
      return handleLinkedData(default_mimetype, default_data, src)
    if hasattr(obj, "data"):
      data = obj.data or ""
    else:
      data = getattr(obj, "getData", lambda: str(obj))() or ""
  elif not hasattr(obj, "getPortalType") and callable(obj):
    # handle Object.view
    # XXX handle url query parameters ? Not so easy because we need to
    # use the same behavior as when we call a script from browser URL bar.
    mime, data = "text/html", obj()
  else:
    return handleLinkedData(default_mimetype, default_data, src)
  if isinstance(data, unicode):
    data = data.encode("utf-8")
  return handleLinkedData(mime, data, src)
# Placeholder picture embedded when a referenced image has no usable
# content (see handleImageSourceObject). Kept as a data url; the decoded
# bytes and the mime type are extracted just below.
bad_image_data_url = (
  "data:image/png;base64," +  # little image showing cannot load image
  "iVBORw0KGgoAAAANSUhEUgAAABEAAAATCAIAAAD5x3GmAAACWklEQVQokZWSX0hTcRTHz/" +
  "3TunMmMyxrQUzEPQSCFEI0fCi0HmSKdsUGg3q2h5I99dj7bE8Nw6cwLDb3kO7JP5m6h0TE" +
  "CmYQjJYgpaPc7q67+93de04P0zvnQ+CP78Pvdzgfzjnf3+GICE55+NMCACACACKOj49rmv" +
  "afvNHRUZ4/KkBEjLFQKJRTjXyRTqigUSwWI6JQKGSaJhEREQ8ApmkCgFrif+8bJ7RfMAGA" +
  "MRYMBsPhMCLWzFPUUdVI1cjjEj0usXLXdLJ6sTCx2jIBAd1otVVe11vPbKT1iqeJRMLKKp" +
  "fLVYaoChxGEAwDbt0ZsNs4ABAEbiLyoqYOEax/ZyfsYmX4q5iCAABQd1aoen3UGmDt/zod" +
  "/EWnuJczcgcIABzHu91um81W9YCI8Jga6rirqUV41O9pQqeDR6J6iRvs7VUeDFQZJCKEih" +
  "DxfINemIioq4ms7GtrwkaH4KovZ2WfujLL1/SGiIgZZSmavj2Veto0GYXO7vzawo7saztX" +
  "3JF9+bUF6Oyu8YAAtnLvNrJBAOPb7lbkizQyPZuWfX8+LeTaG00NHDe7r8Rmju0oQaawVA" +
  "Eqga+/Xkc+B1vexDSJzx+AJvEtk1FDEHjLAEXfXdt7ZgEA0H754UjH2GZgWFGR2UVFxc3A" +
  "sIh4yDDGFjPPdfxhAdea/Y87xpJy//bqnN3b05XK2r0928n55P2+w3kMw9CXmy/AE4u5Fw" +
  "h89A/tLM9d6urxTr9/G4/74zMfBvt+rsxzRKTruqIojNUsgSRJB+vrqVcv705Fc8ViqVSS" +
  "JMnpcMz5h/4B1Qxz9NOjZCgAAAAASUVORK5CYII="
)
bad_image_data = b64decode(bad_image_data_url.split(",", 1)[1])
bad_image_mime_type = "image/png"
# Protocol of the current request, e.g. "http:", used for "//host/..." urls.
request_protocol = context.REQUEST.SERVER_URL.split(":", 1)[0] + ":"
site_object_dict = context.ERP5Site_getWebSiteDomainDict()
base_url_root_object = getattr(context, "getWebSiteValue", str)() or portal
base_url_object = context
assert base_url_object.getRelativeUrl().startswith(base_url_root_object.getRelativeUrl())
# Here `base_url` is the path of the context document relative to the root
# object, always starting with "/".
base_url = base_url_object.getRelativeUrl()[len(base_url_root_object.getRelativeUrl()):]
if not base_url.startswith("/"):
  base_url = "/" + base_url
def handleLinkedData(mime, data, href):
  """
  Register the content `data` (of type `mime`) referenced by `href` and
  return the url to put in the html: for "mhtml" the content is appended
  as an attachment and the absolute url is returned; otherwise a base64
  data url is returned.
  """
  if format != "mhtml":
    return "data:%s;base64,%s" % (mime, b64encode(data))
  url = makeHrefAbsolute(href)
  transfer_encoding = None
  if mime.startswith("text/"):
    transfer_encoding = "quoted-printable"
  mhtml_message["attachment_list"].append({
    "mime_type": mime,
    "encode": transfer_encoding,
    "add_header_list": [("Content-Location", url)],
    "data": str(data),
  })
  return url
def makeHrefAbsolute(href):
  """
  Return `href` as an absolute url. Absolute http(s) urls and non-url
  values are returned unchanged; "//host/..." gets the request protocol,
  "/path" is rooted at the root object url, anything else is relative to
  the url of the context document.
  """
  must_be_resolved = not isHrefAnAbsoluteUrl(href) and isHrefAUrl(href)
  if not must_be_resolved:
    return href
  if href.startswith("//"):
    prefix = request_protocol
  elif href.startswith("/"):
    prefix = base_url_root_object.absolute_url()
  else:
    prefix = base_url_object.absolute_url() + "/"
  return prefix + href
def isHrefAnAbsoluteUrl(href):
  """Tell whether `href` starts with an explicit http or https scheme."""
  return href.startswith(("https://", "http://"))
def isHrefAUrl(href):
  """
  Tell whether `href` can be resolved as a url: http(s) urls and values
  without a purely alphabetic "scheme:" prefix are urls, whereas
  "mailto:", "data:", "javascript:"... values are not.

  A value without any ":" has no scheme and is a relative url, so plain
  names such as "c" are urls too.
  """
  if href.startswith("https://") or href.startswith("http://"):
    return True
  split = href.split(":", 1)
  if len(split) == 1:
    # no scheme at all: relative url
    return True
  return not split[0].isalpha()
# Options given to context.Base_normalizeUrlPathname for every path.
normalize_kw = {"keep_empty": False, "keep_trailing_slash": False}
def traverseHref(url, allow_hash=False):
  """
  Return the object `url` points to. The query string is ignored, and so
  is the fragment unless `allow_hash` is true. Urls with a host are looked
  up in `site_object_dict` (KeyError for an unknown domain); other urls
  are traversed from the root object.
  """
  url = url.split("?", 1)[0]
  if not allow_hash:
    url = url.split("#", 1)[0]
  if url.startswith(("https://", "http://", "//")):
    # absolute url possibly on other sites
    url_part_list = url.split("/", 3)
    site_object = site_object_dict[url_part_list[2]]
    pathname = url[len("/".join(url_part_list[:3])):]
    if pathname[:1] != "/":
      pathname = "/" + pathname
    relative_path = context.Base_normalizeUrlPathname(pathname, **normalize_kw)[1:]
    return site_object.restrictedTraverse(str(relative_path))
  if url.startswith("/"):
    # absolute path, relative url
    pathname = url
  else:
    # relative url
    pathname = base_url + "/" + url
  return base_url_root_object.restrictedTraverse(str(context.Base_normalizeUrlPathname(pathname, **normalize_kw)[1:]))
def replaceFromDataUri(data_uri, replacer):
  """
  Apply `replacer` to the payload of a "text/css" data uri and return the
  rebuilt uri (base64 payloads are decoded, then encoded again). Anything
  that is not a css data uri is returned unchanged.
  """
  header, comma, payload = data_uri.partition(",")
  if not comma or "text/css" not in header:
    return data_uri
  if ";base64" in header:
    return "%s,%s" % (header, b64encode(replacer(b64decode(payload))))
  return "%s,%s" % (header, replacer(payload))
def extractUrlSearch(url):
  """
  Return the query string of `url`, leading "?" included and fragment
  excluded, or "" when there is none.
  """
  without_hash = url.split("#", 1)[0]
  question_mark = without_hash.find("?")
  if question_mark < 0:
    return ""
  return without_hash[question_mark:]
def parseUrlSearch(search):
  """
  Parse a query string (with or without its leading "?") into a list of
  (key, value) pairs, in order of appearance.

  `value` is everything after the first "=" of the parameter, and None
  when the parameter has no "=" at all ("?format"), which lets callers
  tell a missing value from an empty one ("?format=").
  """
  if search[:1] == "?":
    search = search[1:]
  result = []
  for part in search.split("&"):
    piece_list = part.split("=", 1)
    value = None
    if len(piece_list) > 1:
      value = piece_list[1]
    result.append((piece_list[0], value))
  return result
def parseHtml(text):
  """Parse html `text` into a list of parts; see context.Base_parseHtml."""
  return context.Base_parseHtml(text)
def escapeHtml(text):
  """Escape `&`, `<`, `>` and `"` so `text` fits in a quoted attribute."""
  # "&" must come first so that the entities added afterwards stay intact
  for character, entity in (("&", "&amp;"), ("<", "&lt;"), (">", "&gt;"), ("\"", "&quot;")):
    text = text.replace(character, entity)
  return text
def anny(iterable, key=None):
  """
  Return True as soon as one item of `iterable` is true, False otherwise.
  When `key` is given, the truth of `key(item)` is tested instead.
  """
  for item in iterable:
    candidate = key(item) if key else item
    if candidate:
      return True
  return False
return main()
return data
......@@ -50,7 +50,7 @@
</item>
<item>
<key> <string>_params</string> </key>
<value> <string>REQUEST=None, allow_script=False, format="embedded_html"</string> </value>
<value> <string>REQUEST=None, allow_script=False, format="embedded_html", base_url=None, site_object_dict=None</string> </value>
</item>
<item>
<key> <string>id</string> </key>
......
from zExceptions import Unauthorized
portal = context.getPortalObject()
"""
Extract all object referenced by html components
href_object_dict = {}
if not isinstance(allow_tag_list, (list, tuple)):
allow_tag_list = None
if not isinstance(deny_tag_list, (list, tuple)):
deny_tag_list = []
see Base_extractReferredObjectDictFromHtml for documentation
"""
def main():
  """
  Parse the text content of the context document, process every part
  with handleHtmlPart, and return the resulting `href_object_dict`.
  """
  html = context.getTextContent("").decode("utf-8")
  for part in context.Base_parseHtml(html):
    handleHtmlPart(part)
  return href_object_dict
def handleHtmlTag(tag, attrs):
  """
  Record the objects referenced by the attributes of one tag.

  Tags rejected by `allow_tag_list` / `deny_tag_list` are ignored. For
  <style>, the next data part is scheduled to be parsed as css. Attributes
  without a value (value is None, e.g. `<a href>`) are skipped because the
  url helpers only accept strings.
  """
  if allow_tag_list is not None and tag not in allow_tag_list:
    return
  if tag in deny_tag_list:
    return
  #if tag == "base": and "href" in attrs: # should not exist in safe-html
  #  NotImplemented
  if tag == "object":
    for attr in attrs:
      if attr[0] == "data" and attr[1] is not None:
        handleHref(attr[1])
  elif tag == "style":
    # for style tags, next data will always be the entire text until </style>
    on_next_data[0] = handleCss
  else:
    for attr in attrs:
      if attr[0] in ("src", "href") and attr[1] is not None:
        handleHref(attr[1])
  # inline css is inspected for every accepted tag
  for attr in attrs:
    if attr[0] == "style" and attr[1] is not None:
      handleCss(attr[1])
# One-slot list holding the callback to run on the next "data" part, so that
# nested functions can rebind it: handleHtmlTag stores handleCss in it when a
# <style> tag opens, handleHtmlPart sets it back to None once it has been
# called. It starts with a function that returns its argument unchanged.
on_next_data = [lambda x: x]
def handleHtmlPart(part):
  """
  Dispatch one parsed html part, a sequence whose first item is its type.

  - "starttag" / "startendtag": `part[1]` and `part[2]` are given to
    `handleHtmlTag` and its result is returned.
  - "data": when no callback is pending in `on_next_data`, the text
    `part[1]` is returned as is; otherwise the pending callback is called
    with the text and then cleared.
  - anything else is ignored.

  Returns None in every case not stated above.
  """
  kind = part[0]
  if kind == "starttag" or kind == "startendtag":
    return handleHtmlTag(part[1], part[2])
  if kind != "data":
    return None
  callback = on_next_data[0]
  if callback is None:
    return part[1]
  callback(part[1])
  # the callback only applies to the data directly following the tag
  on_next_data[0] = None
  return None
def handleHref(href):
  """
  Register in `href_object_dict` the object `href` points to.

  Handled forms:
  - "base_url/document_module/id"
  - "base_url/R-Document.Reference"
  - "base_url/R-Document.Reference/view"

  When `isHrefAUrl` rejects `href`, it is returned and nothing is
  registered. Otherwise `href` is traversed with `allow_method=False`, and
  None is registered when the traversal raises KeyError or Unauthorized.
  """
  if not isHrefAUrl(href):
    return href
  referred_object = None
  try:
    referred_object = traverseHref(href, allow_method=False)
  except (KeyError, Unauthorized):
    # unreachable or forbidden target: keep the href, mapped to None
    pass
  href_object_dict[href] = referred_object
def handleCss(data):
  """
  Give to `handleHref` every url reported by `context.Base_parseCssForUrl`
  for the css text `data`.

  Only parts whose first item is "url" are used; the url itself is read
  from index 2 of the part.
  """
  for css_part in context.Base_parseCssForUrl(data):
    if css_part[0] != "url":
      continue
    handleHref(css_part[2])
def isHrefAUrl(href):
  """
  Tell whether `href` is an http(s) url or a scheme-less (relative) url.

  Returns True for "http://..." and "https://..." hrefs, and for any href
  that does not start with an alphabetic scheme followed by ":" ("/b",
  "../img.png", "c", ""). Returns False for hrefs using another alphabetic
  scheme such as "mailto:..." or "javascript:...".
  """
  if href.startswith(("https://", "http://")):
    return True
  scheme, colon = href.partition(":")[:2]
  # Without any ":" there is no scheme at all: a bare relative path made of
  # letters only (like "c") must not be mistaken for a scheme name.
  return not (colon and scheme.isalpha())
def traverseHref(url, allow_method=True, allow_hash=False):
  """
  Return the object reached by traversing `url`.

  The starting object and the path to follow come from
  `prepareHrefTraverse` (`allow_hash` is forwarded to it). Exceptions raised
  by `restrictedTraverse` are not caught here.

  With `allow_method` false, a traversed object on which calling `getUid`
  raises AttributeError is replaced by the object found at the same path
  without its last segment (presumably the segment named a method or a view
  rather than a document - confirm).
  """
  base_obj, relative_path = prepareHrefTraverse(url, allow_hash=allow_hash)
  target = base_obj.restrictedTraverse(relative_path)
  if not allow_method and target is not None:
    try:
      target.getUid()
    except AttributeError:
      parent_path = "/".join(relative_path.split("/")[:-1])
      target = base_obj.restrictedTraverse(parent_path)
  return target
# Mapping used by prepareHrefTraverse to find, from the domain of an absolute
# url, the object to traverse from.
site_object_dict = context.ERP5Site_getWebSiteDomainDict()
# Object from which non absolute urls are traversed: the result of
# context.getWebSiteValue() when that method exists and returns a true value,
# else the portal (the `str` fallback returns "" when called, which is false).
base_url_root_object = getattr(context, "getWebSiteValue", str)() or portal
base_url_object = context
# Path prepended to relative urls; "." unless the context lives under the
# root object, in which case it is the part of the context relative url that
# follows the root object relative url.
base_url = "."
if base_url_object.getRelativeUrl().startswith(base_url_root_object.getRelativeUrl()):
base_url = base_url_object.getRelativeUrl()[len(base_url_root_object.getRelativeUrl()):]
# a non empty base url always starts with "/"
if base_url and not base_url.startswith("/"):
base_url = "/" + base_url
# Keyword arguments given to every Base_normalizeUrlPathname call made by
# prepareHrefTraverse.
normalize_kw = {"keep_empty": False, "keep_trailing_slash": False}
def prepareHrefTraverse(url, allow_hash=False):
  """
  Turn `url` into an (object, path) pair suitable for `restrictedTraverse`.

  The search part of `url` (from the first "?") is always dropped, and so is
  its fragment (from the first "#") unless `allow_hash` is true. Then:

  - "http://", "https://" and "//" urls are looked up by domain in
    `site_object_dict` (KeyError for an unknown domain) and the path is what
    follows the domain;
  - urls starting with "/" are taken from `base_url_root_object`;
  - any other url is taken from `base_url_root_object` after being prefixed
    with `base_url`.

  The path is always normalized by `context.Base_normalizeUrlPathname`,
  stripped of its first character and converted with `str`.
  """
  url = url.split("?")[0]
  if not allow_hash:
    url = url.split("#")[0]
  if url.startswith(("https://", "http://", "//")):
    # absolute url possibly on other sites
    head = url.split("/", 3)[:3]  # [scheme part, "", domain]
    site_object = site_object_dict[head[2]]
    pathname = url[len("/".join(head)):]
    if pathname[:1] == "/":
      pathname = pathname[1:]
    pathname = context.Base_normalizeUrlPathname("/" + pathname, **normalize_kw)[1:]
    return site_object, str(pathname)
  if url.startswith("/"):
    # absolute path, relative url
    pathname = url
  else:
    # relative path
    pathname = base_url + "/" + url
  return base_url_root_object, str(context.Base_normalizeUrlPathname(pathname, **normalize_kw)[1:])
return main()
return context.Base_extractReferredObjectDictFromHtml(context.getTextContent(""), **kw)
......@@ -50,7 +50,7 @@
</item>
<item>
<key> <string>_params</string> </key>
<value> <string>allow_tag_list=None, deny_tag_list=None</string> </value>
<value> <string>**kw</string> </value>
</item>
<item>
<key> <string>id</string> </key>
......
......@@ -1162,6 +1162,75 @@ return True
self.assertEqual(htmlmessage.get("Content-Location"), page.absolute_url())
self.assertEqual(quopri.decodestring(htmlmessage.get_payload()), html_data)
def test_WebPageAsEmbeddedHtml_pageWithLink(self):
  """Test convert one html page with links to embedded html file"""
  # Test init part: one page with a scheme-relative link, an absolute path
  # link and a relative path link
  text_content = "".join([
    "<p>Hello</p>",
    '<a href="//a.a/">aa</a>',
    '<a href="/b">bb</a>',
    '<a href="c">cc</a>',
  ])
  web_page_module = self.portal.getDefaultModule(portal_type="Web Page")
  page = web_page_module.newContent(portal_type="Web Page")
  page.edit(text_content=text_content)

  # Test part
  # no base_url given: links are expected to be made absolute from the
  # portal url and the page url
  ehtml_data = page.WebPage_exportAsSingleFile(format="embedded_html")
  portal_url = self.portal.absolute_url()
  expected_link_list = [
    "<p>Hello</p>",
    '<a href="%s//a.a/">aa</a>' % portal_url.split("/", 1)[0],
    '<a href="%s/b">bb</a>' % portal_url,
    '<a href="%s/c">cc</a>' % page.absolute_url(),
  ]
  self.assertEqual(ehtml_data, "".join(expected_link_list))

  # explicit base_url: links are expected to be made absolute from it
  ehtml_data = page.WebPage_exportAsSingleFile(
    format="embedded_html",
    base_url="https://hel.lo/world/dummy",
  )
  expected_link_list = [
    "<p>Hello</p>",
    '<a href="https://a.a/">aa</a>',
    '<a href="https://hel.lo/b">bb</a>',
    '<a href="https://hel.lo/world/c">cc</a>',
  ]
  self.assertEqual(ehtml_data, "".join(expected_link_list))
def test_WebPageAsMhtml_pageWithLink(self):
  """Test convert one html page with links to mhtml file"""
  # Test init part: one titled page with a scheme-relative link, an absolute
  # path link and a relative path link
  title = "Hello"
  text_content = "".join([
    "<p>Hello</p>",
    '<a href="//a.a/">aa</a>',
    '<a href="/b">bb</a>',
    '<a href="c">cc</a>',
  ])
  web_page_module = self.portal.getDefaultModule(portal_type="Web Page")
  page = web_page_module.newContent(portal_type="Web Page")
  page.edit(title=title, text_content=text_content)

  # Test part
  # no base_url given
  mhtml_data = page.WebPage_exportAsSingleFile(format="mhtml")
  message = EmailParser().parsestr(mhtml_data)
  # the message must hold exactly one part: the html document
  [htmlmessage] = message.get_payload()
  encoding_header_list = [
    name for name in htmlmessage.keys()
    if name == "Content-Transfer-Encoding"
  ]
  # should have only one content transfer encoding header
  self.assertEqual(len(encoding_header_list), 1)
  self.assertEqual(
    htmlmessage.get("Content-Transfer-Encoding"),
    "quoted-printable",
  )
  self.assertEqual(htmlmessage.get("Content-Location"), page.absolute_url())
  decoded_html = quopri.decodestring(htmlmessage.get_payload())
  portal_url = self.portal.absolute_url()
  expected_link_list = [
    "<p>Hello</p>",
    '<a href="%s//a.a/">aa</a>' % portal_url.split("/", 1)[0],
    '<a href="%s/b">bb</a>' % portal_url,
    '<a href="%s/c">cc</a>' % page.absolute_url(),
  ]
  self.assertEqual(decoded_html, "".join(expected_link_list))

  # explicit base_url
  mhtml_data = page.WebPage_exportAsSingleFile(
    format="mhtml",
    base_url="https://hel.lo/world/dummy",
  )
  message = EmailParser().parsestr(mhtml_data)
  [htmlmessage] = message.get_payload()
  self.assertEqual(htmlmessage.get("Content-Location"), "https://hel.lo/world")
  decoded_html = quopri.decodestring(htmlmessage.get_payload())
  expected_link_list = [
    "<p>Hello</p>",
    '<a href="https://a.a/">aa</a>',
    '<a href="https://hel.lo/b">bb</a>',
    '<a href="https://hel.lo/world/c">cc</a>',
  ]
  self.assertEqual(decoded_html, "".join(expected_link_list))
def test_WebPageAsEmbeddedHtml_pageWithScript(self):
"""Test convert one html page with script to embedded html file"""
# Test init part
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment