[Erp5-report] r34372 nicolas - /erp5/trunk/products/ERP5/Document/

nobody at svn.erp5.org nobody at svn.erp5.org
Thu Apr 8 12:06:49 CEST 2010


Author: nicolas
Date: Thu Apr  8 12:06:47 2010
New Revision: 34372

URL: http://svn.erp5.org?rev=34372&view=rev
Log:
Enhance charset replacement with a regular expression: named groups keep the `charset=` keyword and replace only the charset value with utf-8.

Modified:
    erp5/trunk/products/ERP5/Document/Document.py
    erp5/trunk/products/ERP5/Document/TextDocument.py

Modified: erp5/trunk/products/ERP5/Document/Document.py
URL: http://svn.erp5.org/erp5/trunk/products/ERP5/Document/Document.py?rev=34372&r1=34371&r2=34372&view=diff
==============================================================================
--- erp5/trunk/products/ERP5/Document/Document.py [utf8] (original)
+++ erp5/trunk/products/ERP5/Document/Document.py [utf8] Thu Apr  8 12:06:47 2010
@@ -490,7 +490,7 @@
   href_parser = re.compile('<a[^>]*href=[\'"](.*?)[\'"]',re.IGNORECASE)
   body_parser = re.compile('<body[^>]*>(.*?)</body>', re.IGNORECASE + re.DOTALL)
   title_parser = re.compile('<title[^>]*>(.*?)</title>', re.IGNORECASE + re.DOTALL)
-  charset_parser = re.compile('charset="?([a-z0-9\-]+)', re.IGNORECASE)
+  charset_parser = re.compile('(?P<keyword>charset="?)(?P<charset>[a-z0-9\-]+)', re.IGNORECASE)
 
   # Declarative security
   security = ClassSecurityInfo()

Modified: erp5/trunk/products/ERP5/Document/TextDocument.py
URL: http://svn.erp5.org/erp5/trunk/products/ERP5/Document/TextDocument.py?rev=34372&r1=34371&r2=34372&view=diff
==============================================================================
--- erp5/trunk/products/ERP5/Document/TextDocument.py [utf8] (original)
+++ erp5/trunk/products/ERP5/Document/TextDocument.py [utf8] Thu Apr  8 12:06:47 2010
@@ -230,9 +230,7 @@
             mime_type = 'text/x-html-safe'
             if charset is None:
               # find charset
-              charset_list = self.charset_parser.findall(text_content)
-              if charset_list:
-                charset = charset_list[0]
+              charset = self.charset_parser.search(text_content).group('charset')
             if charset and charset not in ('utf-8', 'UTF-8'):
               try:
                 text_content = text_content.decode(charset).encode('utf-8')
@@ -241,7 +239,16 @@
               else:
                 charset = 'utf-8' # Override charset if convertion succeeds
                 # change charset value in html_document as well
-                self.charset_parser.sub('utf-8', text_content)
+                def subCharset(matchobj):
+                  keyword = matchobj.group('keyword')
+                  charset = matchobj.group('charset')
+                  if not (keyword or charset):
+                    # no match, return same string
+                    return matchobj.group(0)
+                  elif keyword:
+                    # if keyword is present, replace charset just after
+                    return keyword + 'utf-8'
+                text_content = self.charset_parser.sub(subCharset, text_content)
           result = portal_transforms.convertToData(mime_type, text_content,
                                                    object=self, context=self,
                                                    filename=filename,




More information about the Erp5-report mailing list