[Erp5-report] r35217 nicolas - /erp5/trunk/products/ERP5/Document/Document.py

nobody at svn.erp5.org nobody at svn.erp5.org
Wed May 12 15:31:19 CEST 2010


Author: nicolas
Date: Wed May 12 15:31:15 2010
New Revision: 35217

URL: http://svn.erp5.org?rev=35217&view=rev
Log:
Extend the _guessEncoding method for the case where chardet does not detect
an acceptable encoding (it is reliable for HTML content only):
fall back to the `file` command (only available on the linux2 platform)
to detect the encoding used for text/plain content.


Modified:
    erp5/trunk/products/ERP5/Document/Document.py

Modified: erp5/trunk/products/ERP5/Document/Document.py
URL: http://svn.erp5.org/erp5/trunk/products/ERP5/Document/Document.py?rev=35217&r1=35216&r2=35217&view=diff
==============================================================================
--- erp5/trunk/products/ERP5/Document/Document.py [utf8] (original)
+++ erp5/trunk/products/ERP5/Document/Document.py [utf8] Wed May 12 15:31:15 2010
@@ -1172,7 +1172,8 @@
     """
     return self._stripHTML(self._asHTML(**kw))
 
-  def _guessEncoding(self, string):
+  security.declarePrivate('_guessEncoding')
+  def _guessEncoding(self, string, mime='text/html'):
     """
       Try to guess the encoding for this string.
       Returns None if no encoding can be guessed.
@@ -1180,8 +1181,24 @@
     try:
       import chardet
     except ImportError:
-      return None
-    return chardet.detect(string).get('encoding', None)
+      chardet = None
+    if chardet is not None and (mime == 'text/html'\
+                                               or os.sys.platform != 'linux2'):
+      # chardet works fine on html document and its platform independent
+      return chardet.detect(string).get('encoding', None)
+    else:
+      # file command provide better result
+      # for text/plain documents
+      # store the content into tempfile
+      file_descriptor, path = tempfile.mkstemp()
+      file_object = os.fdopen(file_descriptor, 'w')
+      file_object.write(string)
+      file_object.close()
+      # run file command against tempfile to and read encoded
+      command_result = Popen(['file', '-b', '--mime-encoding', path],
+                                                  stdout=PIPE).communicate()[0]
+      # return detected encoding
+      return command_result.strip()
 
   def _stripHTML(self, html, charset=None):
     """




More information about the Erp5-report mailing list