From e141325e08f02ea16041481949d50c7e953f68e4 Mon Sep 17 00:00:00 2001
From: Jean-Paul Smets <jp@nexedi.com>
Date: Sat, 14 Feb 2009 09:28:10 +0000
Subject: [PATCH] Make _stripHTML a reusable private method for all subclasses.

git-svn-id: https://svn.erp5.org/repos/public/erp5/trunk@25557 20353a03-c40f-0410-a6d1-a30d3c3de9de
---
 product/ERP5/Document/Document.py | 10 ++++++++++
 1 file changed, 10 insertions(+)

diff --git a/product/ERP5/Document/Document.py b/product/ERP5/Document/Document.py
index f6b57dac8f..b6f842f9a4 100644
--- a/product/ERP5/Document/Document.py
+++ b/product/ERP5/Document/Document.py
@@ -1271,6 +1271,13 @@ class Document(PermanentURLMixIn, XMLObject, UrlMixIn, ConversionCacheMixin, Sna
       return data
     kw['format'] = 'html'
     mime, html = self.convert(**kw)
+    return self._stripHTML(str(html))
+
+  def _stripHTML(self, html, charset=None):
+    """
+      A private method which can be reused by subclasses
+      to strip HTML content
+    """
     body_list = re.findall(self.body_parser, str(html))
     if len(body_list):
       stripped_html = body_list[0]
@@ -1279,6 +1286,9 @@ class Document(PermanentURLMixIn, XMLObject, UrlMixIn, ConversionCacheMixin, Sna
     # find charset and convert to utf-8
     charset_list = self.charset_parser.findall(str(html)) # XXX - Not efficient is datastream 
                                                           # instance but hard to do better
+    if charset and not charset_list:
+      # Use the optional charset parameter if we cannot find an encoding in the HTML
+      charset_list = [charset]
     if charset_list and charset_list[0] not in ('utf-8', 'UTF-8'):
       try:
         stripped_html = unicode(str(stripped_html), 
-- 
2.30.9