Use chardet to guess which encoding is used when the declared encoding is missing or wrong

git-svn-id: https://svn.erp5.org/repos/public/erp5/trunk@24814 20353a03-c40f-0410-a6d1-a30d3c3de9de

Use chardet to guess which encoding is used when the declared encoding is missing or wrong
git-svn-id: https://svn.erp5.org/repos/public/erp5/trunk@24814 20353a03-c40f-0410-a6d1-a30d3c3de9de
f5d12302 · Nicolas Delaby · e59f0b9e · f5d12302
Commit f5d12302 authored Dec 05, 2008 by Nicolas Delaby
Hide whitespace changes
Inline Side-by-side

Showing with 39 additions and 22 deletions

product/ERP5/Document/EmailDocument.py product/ERP5/Document/EmailDocument.py +39 -22

No files found.
--- a/product/ERP5/Document/EmailDocument.py
+++ b/product/ERP5/Document/EmailDocument.py
@@ -128,12 +128,17 @@ class EmailDocument(File, TextDocument):
    result = {}
    for (name, value) in self._getMessage().items():
      for text, encoding in decode_header(value):
-        if encoding is not None:
+        try:
-          try:
+          if encoding is not None:
            text = text.decode(encoding).encode('utf-8')
-          except UnicodeDecodeError:
+          else:
-            encoding = self._guessEncoding(text)
+            text = text.decode().encode('utf-8')
+        except UnicodeDecodeError:
+          encoding = self._guessEncoding(text)
+          if encoding is not None:
            text = text.decode(encoding).encode('utf-8')
+          else:
+            text = repr(text)
        if name in result:
          result[name] = '%s %s' % (result[name], text)
        else:
@@ -145,7 +150,6 @@ class EmailDocument(File, TextDocument):
    """
    Returns a list of dictionnaries for every attachment. Each dictionnary
    represents the metadata of the attachment.
    **kw - support for listbox (TODO: improve it)
    """
    result = []
@@ -233,6 +237,7 @@ class EmailDocument(File, TextDocument):
        return self._baseGetTitle()
      else:
        return self._baseGetTitle(default)
+    message = self._getMessage()
    subject = self.getContentInformation().get('Subject', '')
    # Remove all newlines
    if '\r' in subject:
@@ -288,23 +293,38 @@ class EmailDocument(File, TextDocument):
    for part in self._getMessage().walk():
      if part.get_content_type() == 'text/plain' and not text_result and not part.is_multipart():
        part_encoding = part.get_content_charset()
-        if part_encoding not in (None, 'utf-8',):
+        message_text = part.get_payload(decode=1)
+        if part_encoding != 'utf-8':
          try:
-            text_result = part.get_payload(decode=1).decode(part_encoding).encode('utf-8')
+            if part_encoding is not None:
+              text_result = message_text.decode(part_encoding).encode('utf-8')
+            else:
+              text_result = message_text.decode().encode('utf-8')
          except (UnicodeDecodeError, LookupError):
-            text_result = part.get_payload(decode=1)
+            codec = self._guessEncoding(message_text)
+            if codec is not None:
+              text_result = message_text.decode(codec).encode('utf-8')
+            else:
+              text_result = repr(message_text)
        else:
-          text_result = part.get_payload(decode=1)
+          text_result = message_text
      elif part.get_content_type() == 'text/html' and not html_result and not part.is_multipart():
        part_encoding = part.get_content_charset()
-        if part_encoding not in (None, 'utf-8',):
+        message_text = part.get_payload(decode=1)
+        if part_encoding != 'utf-8':
          try:
-            text_result = part.get_payload(decode=1).\
+            if part_encoding is not None:
-                          decode(part_encoding).encode('utf-8')
+              text_result = message_text.decode(part_encoding).encode('utf-8')
+            else:
+              text_result = message_text.decode().encode('utf-8')
          except (UnicodeDecodeError, LookupError):
-            text_result = part.get_payload(decode=1)
+            codec = self._guessEncoding(message_text)
+            if codec is not None:
+              text_result = message_text.decode(codec).encode('utf-8')
+            else:
+              text_result = repr(message_text)
        else:
-          text_result = part.get_payload(decode=1)
+          text_result = message_text
    if default is _MARKER:
      return text_result
    return text_result or default
@@ -605,14 +625,11 @@ class EmailDocument(File, TextDocument):
    Some Email Clients indicate wrong encoding
    This method try to guess which encoding is used.
    """
-    from encodings.aliases import aliases
+    try:
-    codec_list = set(aliases.values())
+      import chardet
-    for codec in codec_list:
+    except ImportError:
-      try:
+      return None
-        string.decode(codec)
+    return chardet.detect(string).get('encoding', None)
-      except (UnicodeDecodeError, IOError):
-        continue
-      return codec
 ## Compatibility layer
 #from Products.ERP5Type import Document