[Erp5-report] r34372 nicolas - /erp5/trunk/products/ERP5/Document/
nobody at svn.erp5.org
nobody at svn.erp5.org
Thu Apr 8 12:06:49 CEST 2010
Author: nicolas
Date: Thu Apr 8 12:06:47 2010
New Revision: 34372
URL: http://svn.erp5.org?rev=34372&view=rev
Log:
Enhance charset replacement with regular expression.
Modified:
erp5/trunk/products/ERP5/Document/Document.py
erp5/trunk/products/ERP5/Document/TextDocument.py
Modified: erp5/trunk/products/ERP5/Document/Document.py
URL: http://svn.erp5.org/erp5/trunk/products/ERP5/Document/Document.py?rev=34372&r1=34371&r2=34372&view=diff
==============================================================================
--- erp5/trunk/products/ERP5/Document/Document.py [utf8] (original)
+++ erp5/trunk/products/ERP5/Document/Document.py [utf8] Thu Apr 8 12:06:47 2010
@@ -490,7 +490,7 @@
href_parser = re.compile('<a[^>]*href=[\'"](.*?)[\'"]',re.IGNORECASE)
body_parser = re.compile('<body[^>]*>(.*?)</body>', re.IGNORECASE + re.DOTALL)
title_parser = re.compile('<title[^>]*>(.*?)</title>', re.IGNORECASE + re.DOTALL)
- charset_parser = re.compile('charset="?([a-z0-9\-]+)', re.IGNORECASE)
+ charset_parser = re.compile('(?P<keyword>charset="?)(?P<charset>[a-z0-9\-]+)', re.IGNORECASE)
# Declarative security
security = ClassSecurityInfo()
Modified: erp5/trunk/products/ERP5/Document/TextDocument.py
URL: http://svn.erp5.org/erp5/trunk/products/ERP5/Document/TextDocument.py?rev=34372&r1=34371&r2=34372&view=diff
==============================================================================
--- erp5/trunk/products/ERP5/Document/TextDocument.py [utf8] (original)
+++ erp5/trunk/products/ERP5/Document/TextDocument.py [utf8] Thu Apr 8 12:06:47 2010
@@ -230,9 +230,7 @@
mime_type = 'text/x-html-safe'
if charset is None:
# find charset
- charset_list = self.charset_parser.findall(text_content)
- if charset_list:
- charset = charset_list[0]
+ charset = self.charset_parser.search(text_content).group('charset')
if charset and charset not in ('utf-8', 'UTF-8'):
try:
text_content = text_content.decode(charset).encode('utf-8')
@@ -241,7 +239,16 @@
else:
charset = 'utf-8' # Override charset if convertion succeeds
# change charset value in html_document as well
- self.charset_parser.sub('utf-8', text_content)
+ def subCharset(matchobj):
+ keyword = matchobj.group('keyword')
+ charset = matchobj.group('charset')
+ if not (keyword or charset):
+ # no match, return same string
+ return matchobj.group(0)
+ elif keyword:
+ # if keyword is present, replace charset just after
+ return keyword + 'utf-8'
+ text_content = self.charset_parser.sub(subCharset, text_content)
result = portal_transforms.convertToData(mime_type, text_content,
object=self, context=self,
filename=filename,
More information about the Erp5-report
mailing list