[Erp5-report] r24814 - /erp5/trunk/products/ERP5/Document/EmailDocument.py

Fri Dec 5 19:11:16 CET 2008

Author: nicolas
Date: Fri Dec  5 19:11:16 2008
New Revision: 24814

URL: http://svn.erp5.org?rev=24814&view=rev
Log:
Use chardet to guess which encoding is used when the declared encoding is missing or wrong

Modified:
    erp5/trunk/products/ERP5/Document/EmailDocument.py

Modified: erp5/trunk/products/ERP5/Document/EmailDocument.py
URL: http://svn.erp5.org/erp5/trunk/products/ERP5/Document/EmailDocument.py?rev=24814&r1=24813&r2=24814&view=diff
==============================================================================

--- erp5/trunk/products/ERP5/Document/EmailDocument.py [utf8] (original)
+++ erp5/trunk/products/ERP5/Document/EmailDocument.py [utf8] Fri Dec  5 19:11:16 2008
@@ -128,12 +128,17 @@
     result = {}
     for (name, value) in self._getMessage().items():
       for text, encoding in decode_header(value):
-        if encoding is not None:
-          try:
+        try:
+          if encoding is not None:
             text = text.decode(encoding).encode('utf-8')
-          except UnicodeDecodeError:
-            encoding = self._guessEncoding(text)
+          else:
+            text = text.decode().encode('utf-8')
+        except UnicodeDecodeError:
+          encoding = self._guessEncoding(text)
+          if encoding is not None:
             text = text.decode(encoding).encode('utf-8')
+          else:
+            text = repr(text)
         if name in result:
           result[name] = '%s %s' % (result[name], text)
         else:
@@ -145,7 +150,6 @@
     """
     Returns a list of dictionnaries for every attachment. Each dictionnary
     represents the metadata of the attachment.
-    
     **kw - support for listbox (TODO: improve it)
     """
     result = []
@@ -233,6 +237,7 @@
         return self._baseGetTitle()
       else:
         return self._baseGetTitle(default)
+    message = self._getMessage()
     subject = self.getContentInformation().get('Subject', '')
     # Remove all newlines
     if '\r' in subject:
@@ -288,23 +293,38 @@
     for part in self._getMessage().walk():
       if part.get_content_type() == 'text/plain' and not text_result and not part.is_multipart():
         part_encoding = part.get_content_charset()
-        if part_encoding not in (None, 'utf-8',):
+        message_text = part.get_payload(decode=1)
+        if part_encoding != 'utf-8':
           try:
-            text_result = part.get_payload(decode=1).decode(part_encoding).encode('utf-8')
+            if part_encoding is not None:
+              text_result = message_text.decode(part_encoding).encode('utf-8')
+            else:
+              text_result = message_text.decode().encode('utf-8')
           except (UnicodeDecodeError, LookupError):
-            text_result = part.get_payload(decode=1)
+            codec = self._guessEncoding(message_text)
+            if codec is not None:
+              text_result = message_text.decode(codec).encode('utf-8')
+            else:
+              text_result = repr(message_text)
         else:
-          text_result = part.get_payload(decode=1)
+          text_result = message_text
       elif part.get_content_type() == 'text/html' and not html_result and not part.is_multipart():
         part_encoding = part.get_content_charset()
-        if part_encoding not in (None, 'utf-8',):
+        message_text = part.get_payload(decode=1)
+        if part_encoding != 'utf-8':
           try:
-            text_result = part.get_payload(decode=1).\
-                          decode(part_encoding).encode('utf-8')
+            if part_encoding is not None:
+              text_result = message_text.decode(part_encoding).encode('utf-8')
+            else:
+              text_result = message_text.decode().encode('utf-8')
           except (UnicodeDecodeError, LookupError):
-            text_result = part.get_payload(decode=1)
+            codec = self._guessEncoding(message_text)
+            if codec is not None:
+              text_result = message_text.decode(codec).encode('utf-8')
+            else:
+              text_result = repr(message_text)
         else:
-          text_result = part.get_payload(decode=1)
+          text_result = message_text
     if default is _MARKER:
       return text_result
     return text_result or default
@@ -605,14 +625,11 @@
     Some Email Clients indicate wrong encoding
     This method try to guess which encoding is used.
     """
-    from encodings.aliases import aliases
-    codec_list = set(aliases.values())
-    for codec in codec_list:
-      try:
-        string.decode(codec)
-      except (UnicodeDecodeError, IOError):
-        continue
-      return codec
+    try:
+      import chardet
+    except ImportError:
+      return None
+    return chardet.detect(string).get('encoding', None)
 
 ## Compatibility layer
 #from Products.ERP5Type import Document