Commit f5d12302 authored by Nicolas Delaby

Use chardet to guess which encoding is used when the declared encoding is missing or wrong

git-svn-id: https://svn.erp5.org/repos/public/erp5/trunk@24814 20353a03-c40f-0410-a6d1-a30d3c3de9de
parent e59f0b9e
...@@ -128,12 +128,17 @@ class EmailDocument(File, TextDocument): ...@@ -128,12 +128,17 @@ class EmailDocument(File, TextDocument):
result = {} result = {}
for (name, value) in self._getMessage().items(): for (name, value) in self._getMessage().items():
for text, encoding in decode_header(value): for text, encoding in decode_header(value):
if encoding is not None:
try: try:
if encoding is not None:
text = text.decode(encoding).encode('utf-8') text = text.decode(encoding).encode('utf-8')
else:
text = text.decode().encode('utf-8')
except UnicodeDecodeError: except UnicodeDecodeError:
encoding = self._guessEncoding(text) encoding = self._guessEncoding(text)
if encoding is not None:
text = text.decode(encoding).encode('utf-8') text = text.decode(encoding).encode('utf-8')
else:
text = repr(text)
if name in result: if name in result:
result[name] = '%s %s' % (result[name], text) result[name] = '%s %s' % (result[name], text)
else: else:
...@@ -145,7 +150,6 @@ class EmailDocument(File, TextDocument): ...@@ -145,7 +150,6 @@ class EmailDocument(File, TextDocument):
""" """
Returns a list of dictionnaries for every attachment. Each dictionnary Returns a list of dictionnaries for every attachment. Each dictionnary
represents the metadata of the attachment. represents the metadata of the attachment.
**kw - support for listbox (TODO: improve it) **kw - support for listbox (TODO: improve it)
""" """
result = [] result = []
...@@ -233,6 +237,7 @@ class EmailDocument(File, TextDocument): ...@@ -233,6 +237,7 @@ class EmailDocument(File, TextDocument):
return self._baseGetTitle() return self._baseGetTitle()
else: else:
return self._baseGetTitle(default) return self._baseGetTitle(default)
message = self._getMessage()
subject = self.getContentInformation().get('Subject', '') subject = self.getContentInformation().get('Subject', '')
# Remove all newlines # Remove all newlines
if '\r' in subject: if '\r' in subject:
...@@ -288,23 +293,38 @@ class EmailDocument(File, TextDocument): ...@@ -288,23 +293,38 @@ class EmailDocument(File, TextDocument):
for part in self._getMessage().walk(): for part in self._getMessage().walk():
if part.get_content_type() == 'text/plain' and not text_result and not part.is_multipart(): if part.get_content_type() == 'text/plain' and not text_result and not part.is_multipart():
part_encoding = part.get_content_charset() part_encoding = part.get_content_charset()
if part_encoding not in (None, 'utf-8',): message_text = part.get_payload(decode=1)
if part_encoding != 'utf-8':
try: try:
text_result = part.get_payload(decode=1).decode(part_encoding).encode('utf-8') if part_encoding is not None:
text_result = message_text.decode(part_encoding).encode('utf-8')
else:
text_result = message_text.decode().encode('utf-8')
except (UnicodeDecodeError, LookupError): except (UnicodeDecodeError, LookupError):
text_result = part.get_payload(decode=1) codec = self._guessEncoding(message_text)
if codec is not None:
text_result = message_text.decode(codec).encode('utf-8')
else:
text_result = repr(message_text)
else: else:
text_result = part.get_payload(decode=1) text_result = message_text
elif part.get_content_type() == 'text/html' and not html_result and not part.is_multipart(): elif part.get_content_type() == 'text/html' and not html_result and not part.is_multipart():
part_encoding = part.get_content_charset() part_encoding = part.get_content_charset()
if part_encoding not in (None, 'utf-8',): message_text = part.get_payload(decode=1)
if part_encoding != 'utf-8':
try: try:
text_result = part.get_payload(decode=1).\ if part_encoding is not None:
decode(part_encoding).encode('utf-8') text_result = message_text.decode(part_encoding).encode('utf-8')
else:
text_result = message_text.decode().encode('utf-8')
except (UnicodeDecodeError, LookupError): except (UnicodeDecodeError, LookupError):
text_result = part.get_payload(decode=1) codec = self._guessEncoding(message_text)
if codec is not None:
text_result = message_text.decode(codec).encode('utf-8')
else:
text_result = repr(message_text)
else: else:
text_result = part.get_payload(decode=1) text_result = message_text
if default is _MARKER: if default is _MARKER:
return text_result return text_result
return text_result or default return text_result or default
...@@ -605,14 +625,11 @@ class EmailDocument(File, TextDocument): ...@@ -605,14 +625,11 @@ class EmailDocument(File, TextDocument):
Some Email Clients indicate wrong encoding Some Email Clients indicate wrong encoding
This method try to guess which encoding is used. This method try to guess which encoding is used.
""" """
from encodings.aliases import aliases
codec_list = set(aliases.values())
for codec in codec_list:
try: try:
string.decode(codec) import chardet
except (UnicodeDecodeError, IOError): except ImportError:
continue return None
return codec return chardet.detect(string).get('encoding', None)
## Compatibility layer ## Compatibility layer
#from Products.ERP5Type import Document #from Products.ERP5Type import Document
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment