erp5_web: fix <a> links not always being made absolute on HTML embedding

- <a href="c"> was not transformed into an absolute URL. - Split WebPage_exportAsSingleFile and WebPage_extractReferredObjectDict each into two scripts, so that HTML data given as a string can be processed. - Add tests.

erp5_web: fix <a> links not always being made absolute on HTML embedding
- <a href="c"> was not transformed into an absolute URL. - Split WebPage_exportAsSingleFile and WebPage_extractReferredObjectDict each into two scripts, so that HTML data given as a string can be processed. - Add tests.
dd07ef91 · Tristan Cavelier · 6b85d617 · dd07ef91 · dd07ef91 · dd07ef91
Commit dd07ef91 authored Oct 14, 2016 by Tristan Cavelier
9 changed files
--- a/bt5/erp5_web/SkinTemplateItem/portal_skins/erp5_web/Base_convertHtmlToSingleFile.py
+++ b/bt5/erp5_web/SkinTemplateItem/portal_skins/erp5_web/Base_convertHtmlToSingleFile.py
--- a/bt5/erp5_web/SkinTemplateItem/portal_skins/erp5_web/Base_convertHtmlToSingleFile.xml
+++ b/bt5/erp5_web/SkinTemplateItem/portal_skins/erp5_web/Base_convertHtmlToSingleFile.xml
+<?xml version="1.0"?>
+<ZopeData>
+  <record id="1" aka="AAAAAAAAAAE=">
+    <pickle>
+      <global name="PythonScript" module="Products.PythonScripts.PythonScript"/>
+    </pickle>
+    <pickle>
+      <dictionary>
+        <item>
+            <key> <string>Script_magic</string> </key>
+            <value> <int>3</int> </value>
+        </item>
+        <item>
+            <key> <string>_bind_names</string> </key>
+            <value>
+              <object>
+                <klass>
+                  <global name="NameAssignments" module="Shared.DC.Scripts.Bindings"/>
+                </klass>
+                <tuple/>
+                <state>
+                  <dictionary>
+                    <item>
+                        <key> <string>_asgns</string> </key>
+                        <value>
+                          <dictionary>
+                            <item>
+                                <key> <string>name_container</string> </key>
+                                <value> <string>container</string> </value>
+                            </item>
+                            <item>
+                                <key> <string>name_context</string> </key>
+                                <value> <string>context</string> </value>
+                            </item>
+                            <item>
+                                <key> <string>name_m_self</string> </key>
+                                <value> <string>script</string> </value>
+                            </item>
+                            <item>
+                                <key> <string>name_subpath</string> </key>
+                                <value> <string>traverse_subpath</string> </value>
+                            </item>
+                          </dictionary>
+                        </value>
+                    </item>
+                  </dictionary>
+                </state>
+              </object>
+            </value>
+        </item>
+        <item>
+            <key> <string>_params</string> </key>
+            <value> <string>data, allow_script=False, format="embedded_html", base_url=None, site_object_dict=None, title=\'Untitled\'</string> </value>
+        </item>
+        <item>
+            <key> <string>id</string> </key>
+            <value> <string>Base_convertHtmlToSingleFile</string> </value>
+        </item>
+      </dictionary>
+    </pickle>
+  </record>
+</ZopeData>
--- a/bt5/erp5_web/SkinTemplateItem/portal_skins/erp5_web/Base_extractReferredObjectDictFromHtml.py
+++ b/bt5/erp5_web/SkinTemplateItem/portal_skins/erp5_web/Base_extractReferredObjectDictFromHtml.py
+"""
+Extract all objects referenced by html components.
+
+`data` is the html to parse.
+`allow_tag_list` is the white list of tags to parse.
+  Default is to allow every tag.
+`deny_tag_list` is the black list of tags not to parse.
+  Default is to deny no tag.
+`base_url` is the url to use as base url when relative urls are found;
+  when it is given, the script uses `site_object_dict` for each href.
+  (Don't forget the trailing '/': everything after the last '/' is dropped!)
+`site_object_dict` is a dict of (domain, object) used to get the object
+  corresponding to the absolute url found. By default the dict returned
+  by `context.ERP5Site_getWebSiteDomainDict()` is used.
+"""
+
+from zExceptions import Unauthorized
+portal = context.getPortalObject()
+
+# Result mapping filled by handleHref: href (utf-8 encoded) -> traversed object,
+# or None when the href could not be resolved.
+href_object_dict = {}
+# Anything that is not a list/tuple disables the white list (every tag allowed).
+if not isinstance(allow_tag_list, (list, tuple)):
+  allow_tag_list = None
+# Anything that is not a list/tuple is replaced by an empty black list.
+if not isinstance(deny_tag_list, (list, tuple)):
+  deny_tag_list = []
+
+def main(data):
+  # Parse `data` part by part and return the href -> object mapping.
+  # A byte string is decoded as utf-8 before being handed to the parser.
+  if isinstance(data, str):
+    data = data.decode("utf-8")
+  for part in context.Base_parseHtml(data):
+    handleHtmlPart(part)
+  return href_object_dict
+
+def handleHtmlTag(tag, attrs):
+  # Register every url carried by one opening tag.
+  # `attrs` is indexed as a sequence of (name, value) pairs.
+  # Tags rejected by the white list or the black list are skipped entirely,
+  # including their inline `style` attribute.
+  if allow_tag_list is not None:
+    if tag not in allow_tag_list:
+      return
+  if tag in deny_tag_list:
+    return
+  # <base href="..."> is not implemented: it should not exist in safe-html
+  #if tag == "base": and "href" in attrs:  # should not exist in safe-html
+  #  NotImplemented
+  if tag == "object":
+    # <object> references its resource through the `data` attribute
+    for i in range(len(attrs)):
+      if attrs[i][0] == "data":
+        handleHref(attrs[i][1])
+  elif tag == "style":
+    # for style tags, next data will always be the entire text until </style>
+    on_next_data[0] = handleCss
+  else:
+    # any other tag: both `src` and `href` attributes are treated as links
+    for i in range(len(attrs)):
+      if attrs[i][0] in ("src", "href"):
+        handleHref(attrs[i][1])
+  # whatever the tag, an inline `style` attribute is scanned for css urls
+  for i in range(len(attrs)):
+    if attrs[i][0] == "style":
+      handleCss(attrs[i][1])
+
+
+# One-element list used as a mutable cell: handleHtmlTag stores in it the
+# handler to apply to the next "data" part (handleCss after a <style> tag).
+# It starts with an identity function, so the first data part met is passed
+# through it and the cell is then reset to None.
+on_next_data = [lambda x: x]
+def handleHtmlPart(part):
+  # Dispatch one parsed part; `part[0]` is the part type.
+  # Opening tags go to handleHtmlTag with `part[1]` (tag) and `part[2]` (attrs).
+  part_type = part[0]
+  if part_type in ("starttag", "startendtag"):
+    return handleHtmlTag(part[1], part[2])
+  if part_type == "data":
+    # no pending handler: the text is returned untouched
+    if on_next_data[0] is None:
+      return part[1]
+    # a handler is pending: run it once on this text, then clear it
+    on_next_data[0](part[1])
+    on_next_data[0] = None
+    return None
+
+def handleHref(href):
+  # Resolve `href` to an object and record it in `href_object_dict`.
+  # handles "base_url/document_module/id"
+  # handles "base_url/R-Document.Reference"
+  # handles "base_url/R-Document.Reference/view"
+  # Values rejected by isHrefAUrl are returned as is and not recorded.
+  if not isHrefAUrl(href):
+    return href
+  # KeyError / Unauthorized raised during resolution are recorded as None,
+  # so the href still appears in the result.
+  try:
+    obj = traverseHref(href, allow_method=False)
+  except (KeyError, Unauthorized):
+    obj = None
+  # keys of the result dict are utf-8 encoded hrefs
+  href_object_dict[href.encode("utf-8")] = obj
+
+def handleCss(data):
+  # Register every url found in a piece of css text.
+  # Only parts whose first item is "url" are used; their third item
+  # (index 2) is passed to handleHref.
+  for part in context.Base_parseCssForUrl(data):
+    if part[0] == "url":
+      handleHref(part[2])
+
+def isHrefAUrl(href):
+  # Tell whether `href` should be resolved as a url.
+  # http(s) urls are always accepted.
+  if href.startswith("https://") or href.startswith("http://"):
+    return True
+  # no ":" at all: nothing that looks like a scheme, accepted
+  split = href.split(":", 1)
+  if len(split) == 1:
+    return True
+  # text before the first ":" made only of letters (e.g. "mailto") is
+  # rejected; anything else before the ":" is accepted
+  return not split[0].isalpha()
+
+def traverseHref(url, allow_method=True, allow_hash=False):
+  # Return the object reached by `url`, using restrictedTraverse from the
+  # base object computed by prepareHrefTraverse.
+  base_obj, relative_path = prepareHrefTraverse(url, allow_hash=allow_hash)
+  obj = base_obj.restrictedTraverse(relative_path)
+  if allow_method or obj is None:
+    return obj
+  # With allow_method false, a result without `getUid` is replaced by the
+  # object found one path segment above.
+  # NOTE(review): presumably this drops a trailing method name such as
+  # "view" -- confirm.
+  try:
+    obj.getUid()
+  except AttributeError:
+    obj = base_obj.restrictedTraverse("/".join(relative_path.split("/")[:-1]))
+  return obj
+
+# Default domain -> object mapping when the caller gave none.
+if site_object_dict is None:
+  site_object_dict = context.ERP5Site_getWebSiteDomainDict()
+# Root used for non-forced resolution: the result of context.getWebSiteValue()
+# when available, else the portal. The `str` fallback returns "" when called,
+# which is false, so `portal` is used.
+base_url_root_object = getattr(context, "getWebSiteValue", str)() or portal
+base_url_object = context
+
+# Resolve base_url by removing everything after the last slash
+force_base_url = False
+if base_url is not None:
+  # only http(s) base urls are accepted
+  if base_url.startswith("https://") or base_url.startswith("http://"):
+    force_base_url = True
+  else:
+    raise ValueError("invalid `base_url` argument")
+if force_base_url:
+  # root_url is the "scheme://host" prefix of base_url
+  root_url = "/".join(base_url.split("/", 3)[:3])
+  # drop the last path segment, unless base_url is exactly the root
+  if root_url != base_url:
+    base_url = "/".join(base_url.split("/")[:-1])
+else:
+  root_url = base_url_root_object.absolute_url()
+  base_url = base_url_object.absolute_url()
+
+# base_path: relative url of the context with the root object's relative url
+# prefix removed, made to start with "/" when not empty. It stays "." when
+# the context's relative url does not start with the root object's one.
+base_path = "."
+if base_url_object.getRelativeUrl().startswith(base_url_root_object.getRelativeUrl()):
+  base_path = base_url_object.getRelativeUrl()[len(base_url_root_object.getRelativeUrl()):]
+  if base_path and not base_path.startswith("/"):
+    base_path = "/" + base_path
+
+# Options passed to every Base_normalizeUrlPathname call below.
+normalize_kw = {"keep_empty": False, "keep_trailing_slash": False}
+def prepareHrefTraverse(href, allow_hash=False):
+  # Return a (base object, relative path) pair suitable for restrictedTraverse.
+  # The query string is always dropped; the fragment is dropped unless
+  # `allow_hash` is true.
+  url = href.split("?")[0]
+  if not allow_hash:
+    url = url.split("#")[0]
+  if url.startswith("https://") or url.startswith("http://") or url.startswith("//"):  # absolute url possibly on other sites
+    site_url = "/".join(url.split("/", 3)[:3])
+    domain = url.split("/", 3)[2]
+    # raises KeyError for a domain missing from site_object_dict
+    site_object = site_object_dict[domain]
+    relative_path = url[len(site_url):]
+    relative_path = (relative_path[1:] if relative_path[:1] == "/" else relative_path)
+    relative_path = context.Base_normalizeUrlPathname("/" + relative_path, **normalize_kw)[1:]
+    return site_object, str(relative_path)
+  if url.startswith("/"):  # absolute path, relative url
+    if force_base_url:
+      # rebuild an absolute url from the forced root and resolve it again
+      return prepareHrefTraverse(root_url + href, allow_hash=allow_hash)  # use site_object_dict
+    return base_url_root_object, str(context.Base_normalizeUrlPathname(url, **normalize_kw)[1:])
+  # relative path
+  if force_base_url:
+    # rebuild an absolute url from the forced base url and resolve it again
+    return prepareHrefTraverse(base_url + "/" + href, allow_hash=allow_hash)  # use site_object_dict
+  return base_url_root_object, str(context.Base_normalizeUrlPathname(base_path + "/" + url, **normalize_kw)[1:])
+
+# Script result: the href -> object mapping built from `data`.
+return main(data)
--- a/bt5/erp5_web/SkinTemplateItem/portal_skins/erp5_web/Base_extractReferredObjectDictFromHtml.xml
+++ b/bt5/erp5_web/SkinTemplateItem/portal_skins/erp5_web/Base_extractReferredObjectDictFromHtml.xml
+<?xml version="1.0"?>
+<ZopeData>
+  <record id="1" aka="AAAAAAAAAAE=">
+    <pickle>
+      <global name="PythonScript" module="Products.PythonScripts.PythonScript"/>
+    </pickle>
+    <pickle>
+      <dictionary>
+        <item>
+            <key> <string>Script_magic</string> </key>
+            <value> <int>3</int> </value>
+        </item>
+        <item>
+            <key> <string>_bind_names</string> </key>
+            <value>
+              <object>
+                <klass>
+                  <global name="NameAssignments" module="Shared.DC.Scripts.Bindings"/>
+                </klass>
+                <tuple/>
+                <state>
+                  <dictionary>
+                    <item>
+                        <key> <string>_asgns</string> </key>
+                        <value>
+                          <dictionary>
+                            <item>
+                                <key> <string>name_container</string> </key>
+                                <value> <string>container</string> </value>
+                            </item>
+                            <item>
+                                <key> <string>name_context</string> </key>
+                                <value> <string>context</string> </value>
+                            </item>
+                            <item>
+                                <key> <string>name_m_self</string> </key>
+                                <value> <string>script</string> </value>
+                            </item>
+                            <item>
+                                <key> <string>name_subpath</string> </key>
+                                <value> <string>traverse_subpath</string> </value>
+                            </item>
+                          </dictionary>
+                        </value>
+                    </item>
+                  </dictionary>
+                </state>
+              </object>
+            </value>
+        </item>
+        <item>
+            <key> <string>_params</string> </key>
+            <value> <string>data, allow_tag_list=None, deny_tag_list=None, base_url=None, site_object_dict=None</string> </value>
+        </item>
+        <item>
+            <key> <string>id</string> </key>
+            <value> <string>Base_extractReferredObjectDictFromHtml</string> </value>
+        </item>
+      </dictionary>
+    </pickle>
+  </record>
+</ZopeData>
--- a/bt5/erp5_web/SkinTemplateItem/portal_skins/erp5_web/WebPage_exportAsSingleFile.py
+++ b/bt5/erp5_web/SkinTemplateItem/portal_skins/erp5_web/WebPage_exportAsSingleFile.py
--- a/bt5/erp5_web/SkinTemplateItem/portal_skins/erp5_web/WebPage_exportAsSingleFile.xml
+++ b/bt5/erp5_web/SkinTemplateItem/portal_skins/erp5_web/WebPage_exportAsSingleFile.xml
@@ -50,7 +50,7 @@
        </item>
        <item>
            <key> <string>_params</string> </key>
-            <value> <string>REQUEST=None, allow_script=False, format="embedded_html"</string> </value>
+            <value> <string>REQUEST=None, allow_script=False, format="embedded_html", base_url=None, site_object_dict=None</string> </value>
        </item>
        <item>
            <key> <string>id</string> </key>

--- a/bt5/erp5_web/SkinTemplateItem/portal_skins/erp5_web/WebPage_extractReferredObjectDict.py
+++ b/bt5/erp5_web/SkinTemplateItem/portal_skins/erp5_web/WebPage_extractReferredObjectDict.py
-from zExceptions import Unauthorized
-portal = context.getPortalObject()
+"""
+Extract all object referenced by html components

-href_object_dict = {}
-if not isinstance(allow_tag_list, (list, tuple)):
-  allow_tag_list = None
-if not isinstance(deny_tag_list, (list, tuple)):
-  deny_tag_list = []
+see Base_extractReferredObjectDictFromHtml for documentation
+"""

-def main():
-  for part in context.Base_parseHtml(context.getTextContent("").decode("utf-8")):
-    handleHtmlPart(part)
-  return href_object_dict
-
-def handleHtmlTag(tag, attrs):
-  if allow_tag_list is not None:
-    if tag not in allow_tag_list:
-      return
-  if tag in deny_tag_list:
-    return
-  #if tag == "base": and "href" in attrs:  # should not exist in safe-html
-  #  NotImplemented
-  if tag == "object":
-    for i in range(len(attrs)):
-      if attrs[i][0] == "data":
-        handleHref(attrs[i][1])
-  elif tag == "style":
-    # for style tags, next data will always be the entire text until </style>
-    on_next_data[0] = handleCss
-  else:
-    for i in range(len(attrs)):
-      if attrs[i][0] in ("src", "href"):
-        handleHref(attrs[i][1])
-  for i in range(len(attrs)):
-    if attrs[i][0] == "style":
-      handleCss(attrs[i][1])
-
-
-on_next_data = [lambda x: x]
-def handleHtmlPart(part):
-  part_type = part[0]
-  if part_type in ("starttag", "startendtag"):
-    return handleHtmlTag(part[1], part[2])
-  if part_type == "data":
-    if on_next_data[0] is None:
-      return part[1]
-    on_next_data[0](part[1])
-    on_next_data[0] = None
-    return None
-
-def handleHref(href):
-  # handles "base_url/document_module/id"
-  # handles "base_url/R-Document.Reference"
-  # handles "base_url/R-Document.Reference/view"
-  if not isHrefAUrl(href):
-    return href
-  try:
-    obj = traverseHref(href, allow_method=False)
-  except (KeyError, Unauthorized):
-    obj = None
-  href_object_dict[href] = obj
-
-def handleCss(data):
-  for part in context.Base_parseCssForUrl(data):
-    if part[0] == "url":
-      handleHref(part[2])
-
-def isHrefAUrl(href):
-  return href.startswith("https://") or href.startswith("http://") or not href.split(":", 1)[0].isalpha()
-
-def traverseHref(url, allow_method=True, allow_hash=False):
-  base_obj, relative_path = prepareHrefTraverse(url, allow_hash=allow_hash)
-  obj = base_obj.restrictedTraverse(relative_path)
-  if allow_method or obj is None:
-    return obj
-  try:
-    obj.getUid()
-  except AttributeError:
-    obj = base_obj.restrictedTraverse("/".join(relative_path.split("/")[:-1]))
-  return obj
-
-site_object_dict = context.ERP5Site_getWebSiteDomainDict()
-base_url_root_object = getattr(context, "getWebSiteValue", str)() or portal
-base_url_object = context
-base_url = "."
-if base_url_object.getRelativeUrl().startswith(base_url_root_object.getRelativeUrl()):
-  base_url = base_url_object.getRelativeUrl()[len(base_url_root_object.getRelativeUrl()):]
-  if base_url and not base_url.startswith("/"):
-    base_url = "/" + base_url
-
-normalize_kw = {"keep_empty": False, "keep_trailing_slash": False}
-def prepareHrefTraverse(url, allow_hash=False):
-  url = url.split("?")[0]
-  if not allow_hash:
-    url = url.split("#")[0]
-  if url.startswith("https://") or url.startswith("http://") or url.startswith("//"):  # absolute url possibly on other sites
-    site_url = "/".join(url.split("/", 3)[:3])
-    domain = url.split("/", 3)[2]
-    site_object = site_object_dict[domain]
-    relative_path = url[len(site_url):]
-    relative_path = (relative_path[1:] if relative_path[:1] == "/" else relative_path)
-    relative_path = context.Base_normalizeUrlPathname("/" + relative_path, **normalize_kw)[1:]
-    return site_object, str(relative_path)
-  if url.startswith("/"):  # absolute path, relative url
-    return base_url_root_object, str(context.Base_normalizeUrlPathname(url, **normalize_kw)[1:])
-  # relative path
-  return base_url_root_object, str(context.Base_normalizeUrlPathname(base_url + "/" + url, **normalize_kw)[1:])
-
-return main()
+return context.Base_extractReferredObjectDictFromHtml(context.getTextContent(""), **kw)
--- a/bt5/erp5_web/SkinTemplateItem/portal_skins/erp5_web/WebPage_extractReferredObjectDict.xml
+++ b/bt5/erp5_web/SkinTemplateItem/portal_skins/erp5_web/WebPage_extractReferredObjectDict.xml
@@ -50,7 +50,7 @@
        </item>
        <item>
            <key> <string>_params</string> </key>
-            <value> <string>allow_tag_list=None, deny_tag_list=None</string> </value>
+            <value> <string>**kw</string> </value>
        </item>
        <item>
            <key> <string>id</string> </key>

--- a/product/ERP5/tests/testERP5WebWithDms.py
+++ b/product/ERP5/tests/testERP5WebWithDms.py
@@ -1162,6 +1162,75 @@ return True
    self.assertEqual(htmlmessage.get("Content-Location"), page.absolute_url())
    self.assertEqual(quopri.decodestring(htmlmessage.get_payload()), html_data)

+  def test_WebPageAsEmbeddedHtml_pageWithLink(self):
+    """Test converting one html page with links to an embedded html file"""
+    # Set up: a Web Page with "//host/", absolute-path and relative links
+    web_page_module = self.portal.getDefaultModule(portal_type="Web Page")
+    page = web_page_module.newContent(portal_type="Web Page")
+    page.edit(text_content="".join([
+      "<p>Hello</p>",
+      '<a href="//a.a/">aa</a>',
+      '<a href="/b">bb</a>',
+      '<a href="c">cc</a>',
+    ]))
+    # Check: without base_url, links are built from the portal and page urls
+    ehtml_data = page.WebPage_exportAsSingleFile(format="embedded_html")
+    self.assertEqual(ehtml_data, "".join([
+      "<p>Hello</p>",
+      '<a href="%s//a.a/">aa</a>' % self.portal.absolute_url().split("/", 1)[0],  # text before the first "/" of the portal url
+      '<a href="%s/b">bb</a>' % self.portal.absolute_url(),
+      '<a href="%s/c">cc</a>' % page.absolute_url(),
+    ]))
+    ehtml_data = page.WebPage_exportAsSingleFile(format="embedded_html", base_url="https://hel.lo/world/dummy")  # explicit base_url
+    self.assertEqual(ehtml_data, "".join([
+      "<p>Hello</p>",
+      '<a href="https://a.a/">aa</a>',
+      '<a href="https://hel.lo/b">bb</a>',
+      '<a href="https://hel.lo/world/c">cc</a>',
+    ]))
+
+  def test_WebPageAsMhtml_pageWithLink(self):
+    """Test converting one html page with links to an mhtml file"""
+    # Set up: a Web Page with "//host/", absolute-path and relative links
+    web_page_module = self.portal.getDefaultModule(portal_type="Web Page")
+    title = "Hello"
+    page = web_page_module.newContent(portal_type="Web Page")
+    page.edit(title=title, text_content="".join([
+      "<p>Hello</p>",
+      '<a href="//a.a/">aa</a>',
+      '<a href="/b">bb</a>',
+      '<a href="c">cc</a>',
+    ]))
+    # Check: without base_url, links are built from the portal and page urls
+    mhtml_data = page.WebPage_exportAsSingleFile(format="mhtml")
+    message = EmailParser().parsestr(mhtml_data)
+    htmlmessage, = message.get_payload()
+    self.assertEqual(  # should have only one content transfer encoding header
+      len([h for h in htmlmessage.keys() if h == "Content-Transfer-Encoding"]),
+      1,
+    )
+    self.assertEqual(
+      htmlmessage.get("Content-Transfer-Encoding"),
+      "quoted-printable",
+    )
+    self.assertEqual(htmlmessage.get("Content-Location"), page.absolute_url())
+    self.assertEqual(quopri.decodestring(htmlmessage.get_payload()), "".join([
+      "<p>Hello</p>",
+      '<a href="%s//a.a/">aa</a>' % self.portal.absolute_url().split("/", 1)[0],  # text before the first "/" of the portal url
+      '<a href="%s/b">bb</a>' % self.portal.absolute_url(),
+      '<a href="%s/c">cc</a>' % page.absolute_url(),
+    ]))
+    mhtml_data = page.WebPage_exportAsSingleFile(format="mhtml", base_url="https://hel.lo/world/dummy")  # explicit base_url
+    message = EmailParser().parsestr(mhtml_data)
+    htmlmessage, = message.get_payload()
+    self.assertEqual(htmlmessage.get("Content-Location"), "https://hel.lo/world")  # base_url without its last segment
+    self.assertEqual(quopri.decodestring(htmlmessage.get_payload()), "".join([
+      "<p>Hello</p>",
+      '<a href="https://a.a/">aa</a>',
+      '<a href="https://hel.lo/b">bb</a>',
+      '<a href="https://hel.lo/world/c">cc</a>',
+    ]))
+
  def test_WebPageAsEmbeddedHtml_pageWithScript(self):
    """Test convert one html page with script to embedded html file"""
    # Test init part