[Erp5-report] r25565 - in /erp5/trunk/products/ERP5/Document: Document.py EmailDocument.py

Sat Feb 14 11:05:02 CET 2009

Author: jp
Date: Sat Feb 14 11:05:01 2009
New Revision: 25565

URL: http://svn.erp5.org?rev=25565&view=rev
Log:
Move generic HTML processing to where it belongs (i.e. the conversion-handling superclass for now, a mixin some day)

Modified:
    erp5/trunk/products/ERP5/Document/Document.py
    erp5/trunk/products/ERP5/Document/EmailDocument.py

Modified: erp5/trunk/products/ERP5/Document/Document.py
URL: http://svn.erp5.org/erp5/trunk/products/ERP5/Document/Document.py?rev=25565&r1=25564&r2=25565&view=diff
==============================================================================

--- erp5/trunk/products/ERP5/Document/Document.py [utf8] (original)
+++ erp5/trunk/products/ERP5/Document/Document.py [utf8] Sat Feb 14 11:05:01 2009
@@ -1278,14 +1278,25 @@
       A private method which can be reused by subclasses
       to strip HTML content
     """
+    def _guessEncoding(self, string):
+      """
+        Some Email Clients indicate wrong encoding
+        This method try to guess which encoding is used.
+      """
+      try:
+        import chardet
+      except ImportError:
+        return None
+      return chardet.detect(string).get('encoding', None)
+
     body_list = re.findall(self.body_parser, str(html))
     if len(body_list):
       stripped_html = body_list[0]
     else:
       stripped_html = html
     # find charset and convert to utf-8
-    charset_list = self.charset_parser.findall(str(html)) # XXX - Not efficient is datastream 
-                                                          # instance but hard to do better
+    charset_list = self.charset_parser.findall(str(html)) # XXX - Not efficient if this 
+                                         # is datastream instance but hard to do better
     if charset and not charset_list:
       # Use optional parameter is we can not find encoding in HTML
       charset_list = [charset]
@@ -1297,6 +1308,7 @@
         return str(stripped_html)
     return stripped_html
 
+
   security.declareProtected(Permissions.AccessContentsInformation, 'getContentInformation')
   def getContentInformation(self):
     """

Modified: erp5/trunk/products/ERP5/Document/EmailDocument.py
URL: http://svn.erp5.org/erp5/trunk/products/ERP5/Document/EmailDocument.py?rev=25565&r1=25564&r2=25565&view=diff
==============================================================================
--- erp5/trunk/products/ERP5/Document/EmailDocument.py [utf8] (original)
+++ erp5/trunk/products/ERP5/Document/EmailDocument.py [utf8] Sat Feb 14 11:05:01 2009
@@ -39,6 +39,8 @@
 from Products.ERP5.Document.File import File
 from Products.ERP5.Document.Document import ConversionError
 from Products.ERP5.Tool.NotificationTool import buildEmailMessage
+
+from zLOG import LOG, INFO
 
 try:
   from Products.MimetypesRegistry.common import MimeTypeException
@@ -300,7 +302,9 @@
               text_result = message_text.decode(part_encoding).encode('utf-8')
             else:
               text_result = message_text.decode().encode('utf-8')
-          except (UnicodeDecodeError, LookupError):
+          except (UnicodeDecodeError, LookupError), error_message:
+            LOG('EmailDocument.getTextContent', INFO, 
+                'Failed to decode %s TEXT message with error: %s' % (part_encoding, error_message))
             codec = self._guessEncoding(message_text)
             if codec is not None:
               try:
@@ -313,24 +317,12 @@
           text_result = message_text
       elif part.get_content_type() == 'text/html' and not html_result and not part.is_multipart():
         part_encoding = part.get_content_charset()
-        message_text = part.get_payload(decode=1)
-        if part_encoding != 'utf-8':
-          try:
-            if part_encoding is not None:
-              text_result = message_text.decode(part_encoding).encode('utf-8')
-            else:
-              text_result = message_text.decode().encode('utf-8')
-          except (UnicodeDecodeError, LookupError):
-            codec = self._guessEncoding(message_text)
-            if codec is not None:
-              try:
-                text_result = message_text.decode(codec).encode('utf-8')
-              except (UnicodeDecodeError, LookupError):
-                text_result = repr(message_text)
-            else:
-              text_result = repr(message_text)
-        else:
-          text_result = message_text
+        part_html = part.get_payload(decode=1)
+        # Invoke Document class HTML stripper
+        html_result = self._stripHTML(part_html, charset=part_encoding)
+    if html_result:
+      # Give priority to HTML
+      text_result = html_result
     if default is _MARKER:
       return text_result
     return text_result or default
@@ -399,6 +391,8 @@
     """
     For FCKEditor Compatibility, we should remove DTD,
     blank lines and some tags in html document
+
+    XXX - What is this SHIT !!!!!!!!!!!!!!!!!!!!!!!!!!
     """
     if html_text is None:
       html_text = self.getTextContent()
@@ -626,17 +620,6 @@
     """
     self.MailHost.send(message)
 
-  def _guessEncoding(self, string):
-    """
-    Some Email Clients indicate wrong encoding
-    This method try to guess which encoding is used.
-    """
-    try:
-      import chardet
-    except ImportError:
-      return None
-    return chardet.detect(string).get('encoding', None)
-
 ## Compatibility layer
 #from Products.ERP5Type import Document
 #Document.MailMessage = EmailDocument