From fd1e8f088bf9b4efd087be304218459f5a06be7d Mon Sep 17 00:00:00 2001
From: Nicolas Delaby <nicolas@nexedi.com>
Date: Tue, 4 Jan 2011 10:32:54 +0000
Subject: [PATCH] Work around a bug in HTMLParser (2.5 <= v <= 2.7) which is
 impossible to fix because the HTMLParser API does not accept an encoding
 parameter, so decoding strings on the fly cannot be ensured in all cases.
 Python 3 solves the problem by accepting only unicode strings.

The fix consists of passing unicode content to the parser.



git-svn-id: https://svn.erp5.org/repos/public/erp5/trunk@41979 20353a03-c40f-0410-a6d1-a30d3c3de9de
---
 product/PortalTransforms/transforms/safe_html.py | 10 ++++++++++
 1 file changed, 10 insertions(+)

diff --git a/product/PortalTransforms/transforms/safe_html.py b/product/PortalTransforms/transforms/safe_html.py
index 208ebe0568..b22bfce5b3 100644
--- a/product/PortalTransforms/transforms/safe_html.py
+++ b/product/PortalTransforms/transforms/safe_html.py
@@ -279,6 +279,16 @@ def scrubHTML(html, valid=VALID_TAGS, nasty=NASTY_TAGS,
                              remove_javascript=remove_javascript,
                              raise_error=raise_error,
                              default_encoding=default_encoding)
+    # HTMLParser is affected by a known bug referenced
+    # by http://bugs.python.org/issue3932
+    # As suggested by Python developers:
+    # "Python 3.0 implicitly rejects non-unicode strings"
+    # We try to decode strings with the provided codec first
+    if isinstance(html, str):
+      try:
+        html = html.decode(default_encoding)
+      except UnicodeDecodeError:
+        pass
     parser.feed(html)
     parser.close()
     result = parser.getResult()
-- 
2.30.9