Make safe_html transforms more robust against dirty HTML documents.

- If HTMLParser fails, lxml takes over the broken HTML and repairs it. The repaired HTML is then passed back to HTMLParser, once only. git-svn-id: https://svn.erp5.org/repos/public/erp5/trunk@33407 20353a03-c40f-0410-a6d1-a30d3c3de9de

Make safe_html transforms more robust against dirty HTML documents.
- If HTMLParser fails, lxml takes over the broken HTML and repairs it. The repaired HTML is then passed back to HTMLParser, once only. git-svn-id: https://svn.erp5.org/repos/public/erp5/trunk@33407 20353a03-c40f-0410-a6d1-a30d3c3de9de
a38c59c9 · Nicolas Delaby · 112180a6 · a38c59c9
Commit a38c59c9 authored Mar 04, 2010 by Nicolas Delaby
Hide whitespace changes
Inline Side-by-side

Showing with 40 additions and 12 deletions

product/PortalTransforms/transforms/safe_html.py product/PortalTransforms/transforms/safe_html.py +40 -12

No files found.
--- a/product/PortalTransforms/transforms/safe_html.py
+++ b/product/PortalTransforms/transforms/safe_html.py
 # -*- coding: utf-8 -*-
 import logging
-from HTMLParser import HTMLParser
+from HTMLParser import HTMLParser, HTMLParseError
 import re
 from cgi import escape
 from zope.interface import implements
@@ -14,6 +14,9 @@ from Products.CMFDefault.utils import VALID_TAGS
 from Products.CMFDefault.utils import NASTY_TAGS
 from Products.PortalTransforms.utils import safeToInt
+from lxml import etree
+from lxml.etree import HTMLParser as LHTMLParser
 # tag mapping: tag -> short or long tag
 VALID_TAGS = VALID_TAGS.copy()
 NASTY_TAGS = NASTY_TAGS.copy()
@@ -256,17 +259,42 @@ class SafeHTML:
            data.setData(orig)
            return data
-        try:
+        html_string = orig
-            safe = scrubHTML(
+        allready_repaired = False
-                bodyfinder(orig),
+        while True:
-                valid=self.config.get('valid_tags', {}),
+            try:
-                nasty=self.config.get('nasty_tags', {}),
+                safe = scrubHTML(
-                remove_javascript=self.config.get('remove_javascript', True),
+                    bodyfinder(html_string),
-                raise_error=False)
+                    valid=self.config.get('valid_tags', {}),
-        except IllegalHTML, inst:
+                    nasty=self.config.get('nasty_tags', {}),
-            data.setData(msg_pat % ("Error", str(inst)))
+                    remove_javascript=self.config.get('remove_javascript', True),
-        else:
+                    raise_error=False)
-            data.setData(safe)
+            except IllegalHTML, inst:
+                data.setData(msg_pat % ("Error", str(inst)))
+                break
+            except HTMLParseError:
+                # ouch !
+                # HTMLParser is not able to parse very dirty HTML string,
+                # try to repair any broken html with help of lxml
+                if allready_repaired:
+                  raise
+                allready_repaired = True
+                encoding = kwargs.get('encoding')
+                # recover parameter is equal to True by default
+                # in lxml API. I pass the argument to improve readability
+                # of above code.
+                try:
+                    lparser = LHTMLParser(encoding=encoding, recover=True)
+                except LookupError:
+                    # Provided encoding is not known by parser, so discard it
+                    lparser = LHTMLParser(recover=True)
+                repaired_html_tree = etree.HTML(orig, parser=lparser)
+                html_string = etree.tostring(repaired_html_tree)
+                # avoid breaking now.
+                # continue into the loop with repaired html
+            else:
+                data.setData(safe)
+                break
        return data
 def register():