Commit e141325e authored by Jean-Paul Smets

Make _stripHTML a reusable private method for all subclasses.

git-svn-id: https://svn.erp5.org/repos/public/erp5/trunk@25557 20353a03-c40f-0410-a6d1-a30d3c3de9de
parent 98ef47a1
No related merge requests found
......@@ -1271,6 +1271,13 @@ class Document(PermanentURLMixIn, XMLObject, UrlMixIn, ConversionCacheMixin, Sna
return data
kw['format'] = 'html'
mime, html = self.convert(**kw)
return self._stripHTML(str(html))
def _stripHTML(self, html, charset=None):
"""
A private method which can be reused by subclasses
to strip HTML content
"""
body_list = re.findall(self.body_parser, str(html))
if len(body_list):
stripped_html = body_list[0]
......@@ -1279,6 +1286,9 @@ class Document(PermanentURLMixIn, XMLObject, UrlMixIn, ConversionCacheMixin, Sna
# find charset and convert to utf-8
charset_list = self.charset_parser.findall(str(html)) # XXX - Not efficient is datastream
# instance but hard to do better
if charset and not charset_list:
# Use the optional charset parameter if we cannot find the encoding in the HTML
charset_list = [charset]
if charset_list and charset_list[0] not in ('utf-8', 'UTF-8'):
try:
stripped_html = unicode(str(stripped_html),
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment