From e141325e08f02ea16041481949d50c7e953f68e4 Mon Sep 17 00:00:00 2001 From: Jean-Paul Smets <jp@nexedi.com> Date: Sat, 14 Feb 2009 09:28:10 +0000 Subject: [PATCH] Make _stripHTML a reusable private method for all subclasses. git-svn-id: https://svn.erp5.org/repos/public/erp5/trunk@25557 20353a03-c40f-0410-a6d1-a30d3c3de9de --- product/ERP5/Document/Document.py | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/product/ERP5/Document/Document.py b/product/ERP5/Document/Document.py index f6b57dac8f..b6f842f9a4 100644 --- a/product/ERP5/Document/Document.py +++ b/product/ERP5/Document/Document.py @@ -1271,6 +1271,13 @@ class Document(PermanentURLMixIn, XMLObject, UrlMixIn, ConversionCacheMixin, Sna return data kw['format'] = 'html' mime, html = self.convert(**kw) + return self._stripHTML(str(html)) + + def _stripHTML(self, html, charset=None): + """ + A private method which can be reused by subclasses + to strip HTML content + """ body_list = re.findall(self.body_parser, str(html)) if len(body_list): stripped_html = body_list[0] @@ -1279,6 +1286,9 @@ class Document(PermanentURLMixIn, XMLObject, UrlMixIn, ConversionCacheMixin, Sna # find charset and convert to utf-8 charset_list = self.charset_parser.findall(str(html)) # XXX - Not efficient is datastream # instance but hard to do better + if charset and not charset_list: + # Use the optional parameter if we cannot find the encoding in the HTML + charset_list = [charset] if charset_list and charset_list[0] not in ('utf-8', 'UTF-8'): try: stripped_html = unicode(str(stripped_html), -- 2.30.9