From fd1e8f088bf9b4efd087be304218459f5a06be7d Mon Sep 17 00:00:00 2001 From: Nicolas Delaby <nicolas@nexedi.com> Date: Tue, 4 Jan 2011 10:32:54 +0000 Subject: [PATCH] Work around a bug in HTMLParser (2.5 <= v <= 2.7) which is impossible to fix because the HTMLParser API does not accept an encoding parameter, so decoding strings on the fly cannot be ensured in all cases. Python 3 solves the problem by accepting only unicode strings. The fix consists of passing unicode content to the parser. git-svn-id: https://svn.erp5.org/repos/public/erp5/trunk@41979 20353a03-c40f-0410-a6d1-a30d3c3de9de --- product/PortalTransforms/transforms/safe_html.py | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/product/PortalTransforms/transforms/safe_html.py b/product/PortalTransforms/transforms/safe_html.py index 208ebe0568..b22bfce5b3 100644 --- a/product/PortalTransforms/transforms/safe_html.py +++ b/product/PortalTransforms/transforms/safe_html.py @@ -279,6 +279,16 @@ def scrubHTML(html, valid=VALID_TAGS, nasty=NASTY_TAGS, remove_javascript=remove_javascript, raise_error=raise_error, default_encoding=default_encoding) + # HTMLParser is affected by a known bug referenced + # by http://bugs.python.org/issue3932 + # As suggested by python developpers: + # "Python 3.0 implicitly rejects non-unicode strings" + # We try to decode strings against provided codec first + if isinstance(html, str): + try: + html = html.decode(default_encoding) + except UnicodeDecodeError: + pass parser.feed(html) parser.close() result = parser.getResult() -- 2.30.9