[Erp5-report] r14531 - /erp5/trunk/products/ERP5/Document/Document.py

nobody at svn.erp5.org nobody at svn.erp5.org
Mon May 21 17:24:37 CEST 2007


Author: bartek
Date: Mon May 21 17:24:37 2007
New Revision: 14531

URL: http://svn.erp5.org?rev=14531&view=rev
Log:
For the preview, convert the stripped HTML to UTF-8 using the charset declared in the HTML headers.

Modified:
    erp5/trunk/products/ERP5/Document/Document.py

Modified: erp5/trunk/products/ERP5/Document/Document.py
URL: http://svn.erp5.org/erp5/trunk/products/ERP5/Document/Document.py?rev=14531&r1=14530&r2=14531&view=diff
==============================================================================
--- erp5/trunk/products/ERP5/Document/Document.py (original)
+++ erp5/trunk/products/ERP5/Document/Document.py Mon May 21 17:24:37 2007
@@ -394,6 +394,7 @@
   body_parser = re.compile('<body[^>]*>(.*?)</body>', re.IGNORECASE + re.DOTALL)
   title_parser = re.compile('<title[^>]*>(.*?)</title>', re.IGNORECASE + re.DOTALL)
   base_parser = re.compile('<base[^>]*href=[\'"](.*?)[\'"][^>]*>', re.IGNORECASE + re.DOTALL)
+  charset_parser = re.compile('charset="?([a-z0-9\-]+)', re.IGNORECASE)
 
   # Declarative security
   security = ClassSecurityInfo()
@@ -1083,14 +1084,20 @@
       (without html and body tags, etc.) which can be used to inline
       a preview of the document.
     """
-    if self.hasConversion(format='stripped-html'):
+    if self.hasConversion(format='stripped-html'): # XXX this is redundant since we never set it
       mime, data = self.getConversion(format='stripped-html')
       return data
     mime, html = self.convert(format='html')
     body_list = re.findall(self.body_parser, str(html))
     if len(body_list):
-      return body_list[0]
-    return html
+      stripped_html = body_list[0]
+    else:
+      stripped_html = html
+    # find charset and convert to utf-8
+    charset_list = self.charset_parser.findall(html)
+    if charset_list:
+      stripped_html = unicode(stripped_html, charset_list[0]).encode('utf-8')
+    return stripped_html
 
   security.declareProtected(Permissions.AccessContentsInformation, 'getContentInformation')
   def getContentInformation(self):




More information about the Erp5-report mailing list