[Erp5-report] r34360 nicolas - /erp5/trunk/products/ERP5/Document/
nobody at svn.erp5.org
nobody at svn.erp5.org
Thu Apr 8 10:58:31 CEST 2010
Author: nicolas
Date: Thu Apr 8 10:58:29 2010
New Revision: 34360
URL: http://svn.erp5.org?rev=34360&view=rev
Log:
Always output safe HTML content.
* _safeHTML is removed
* The stripping is done inside convert method
* Conversion Cache is handled correctly
Modified:
erp5/trunk/products/ERP5/Document/Document.py
erp5/trunk/products/ERP5/Document/TextDocument.py
Modified: erp5/trunk/products/ERP5/Document/Document.py
URL: http://svn.erp5.org/erp5/trunk/products/ERP5/Document/Document.py?rev=34360&r1=34359&r2=34360&view=diff
==============================================================================
--- erp5/trunk/products/ERP5/Document/Document.py [utf8] (original)
+++ erp5/trunk/products/ERP5/Document/Document.py [utf8] Thu Apr 8 10:58:29 2010
@@ -490,7 +490,6 @@
href_parser = re.compile('<a[^>]*href=[\'"](.*?)[\'"]',re.IGNORECASE)
body_parser = re.compile('<body[^>]*>(.*?)</body>', re.IGNORECASE + re.DOTALL)
title_parser = re.compile('<title[^>]*>(.*?)</title>', re.IGNORECASE + re.DOTALL)
- base_parser = re.compile('<base[^>]*href=[\'"](.*?)[\'"][^>]*>', re.IGNORECASE + re.DOTALL)
charset_parser = re.compile('charset="?([a-z0-9\-]+)', re.IGNORECASE)
# Declarative security
@@ -1151,14 +1150,9 @@
"""
if not self.hasBaseData():
raise ConversionError('This document has not been processed yet.')
- try:
- # FIXME: no substitution may occur in this case.
- mime, data = self.getConversion(format='base-html')
- return data
- except KeyError:
- kw['format'] = 'html'
- mime, html = self.convert(**kw)
- return html
+ kw['format'] = 'html'
+ mime, html = self.convert(**kw)
+ return html
security.declareProtected(Permissions.View, 'asStrippedHTML')
def asStrippedHTML(self, **kw):
@@ -1167,16 +1161,7 @@
(without html and body tags, etc.) which can be used to inline
a preview of the document.
"""
- if not self.hasBaseData():
- return ''
- try:
- # FIXME: no substitution may occur in this case.
- mime, data = self.getConversion(format='stripped-html')
- return data
- except KeyError:
- kw['format'] = 'html'
- mime, html = self.convert(**kw)
- return self._stripHTML(str(html))
+ return self._stripHTML(self._asHTML(**kw))
def _guessEncoding(self, string):
"""
@@ -1199,48 +1184,7 @@
stripped_html = body_list[0]
else:
stripped_html = html
- # find charset and convert to utf-8
- charset_list = self.charset_parser.findall(str(html)) # XXX - Not efficient if this
- # is datastream instance but hard to do better
- if charset and not charset_list:
- # Use optional parameter is we can not find encoding in HTML
- charset_list = [charset]
- if charset_list and charset_list[0] not in ('utf-8', 'UTF-8'):
- try:
- stripped_html = unicode(str(stripped_html),
- charset_list[0]).encode('utf-8')
- except (UnicodeDecodeError, LookupError):
- return str(stripped_html)
return stripped_html
-
- def _safeHTML(self, html, format='text/x-html-safe', charset=None):
- """
- A private method to strip HTML content in safe mode,
- w/o emmbed javascript, forms and any external plugins imports.
- This should be used when we do not trust the user (Anonymous)
- who push data into database.
- - html: content to strip
- - format: destination format
- - charset: charset used to encode string. Take precedence
- on charset values found in html string
- """
- portal = self.getPortalObject()
- if charset is None:
- # find charset
- charset_list = self.charset_parser.findall(html)
- if charset_list:
- charset = charset_list[0]
- if charset and charset not in ('utf-8', 'UTF-8'):
- try:
- safe_html_string = html.decode(charset).encode('utf-8')
- except (UnicodeDecodeError, LookupError):
- pass
- else:
- charset = 'utf-8' # Override charset if convertion succeeds
- transform_tool = getToolByName(portal, 'portal_transforms')
- safe_html_string = transform_tool.convertToData(format, html,
- encoding=charset)
- return safe_html_string
security.declareProtected(Permissions.AccessContentsInformation, 'getContentInformation')
def getContentInformation(self):
Modified: erp5/trunk/products/ERP5/Document/TextDocument.py
URL: http://svn.erp5.org/erp5/trunk/products/ERP5/Document/TextDocument.py?rev=34360&r1=34359&r2=34360&view=diff
==============================================================================
--- erp5/trunk/products/ERP5/Document/TextDocument.py [utf8] (original)
+++ erp5/trunk/products/ERP5/Document/TextDocument.py [utf8] Thu Apr 8 10:58:29 2010
@@ -202,7 +202,8 @@
**substitution_method_parameter_dict)
security.declareProtected(Permissions.AccessContentsInformation, 'convert')
- def convert(self, format, substitution_method_parameter_dict=None, safe_substitute=True, **kw):
+ def convert(self, format, substitution_method_parameter_dict=None,
+ safe_substitute=True, charset=None, text_content=None, **kw):
"""
Convert text using portal_transforms or oood
"""
@@ -212,35 +213,55 @@
if format == 'raw':
return 'text/plain', self.getTextContent()
portal = self.getPortalObject()
- mime_type = getToolByName(portal, 'mimetypes_registry').lookupExtension('name.%s' % format)
- mime_type = str(mime_type)
+ mime_type = getToolByName(portal, 'mimetypes_registry').\
+ lookupExtension('name.%s' % format)
+ original_mime_type = mime_type = str(mime_type)
src_mimetype = self.getTextFormat(DEFAULT_TEXT_FORMAT)
if not src_mimetype.startswith('text/'):
src_mimetype = 'text/%s' % src_mimetype
- # check if document has set text_content and convert if necessary
- text_content = self.getTextContent()
+ if text_content is None:
+ # check if document has set text_content and convert if necessary
+ text_content = self.getTextContent()
if text_content:
if not self.hasConversion(format=format):
portal_transforms = getToolByName(portal, 'portal_transforms')
filename = self.getSourceReference(self.getTitleOrId())
+ if mime_type == 'text/html':
+ mime_type = 'text/x-html-safe'
+ if charset is None:
+ # find charset
+ charset_list = self.charset_parser.findall(text_content)
+ if charset_list:
+ charset = charset_list[0]
+ if charset and charset not in ('utf-8', 'UTF-8'):
+ try:
+ text_content = text_content.decode(charset).encode('utf-8')
+ except (UnicodeDecodeError, LookupError):
+ pass
+ else:
+ charset = 'utf-8' # Override charset if convertion succeeds
+ # change charset value in html_document as well
+ self.charset_parser.sub('utf-8', text_content)
result = portal_transforms.convertToData(mime_type, text_content,
object=self, context=self,
filename=filename,
- mimetype=src_mimetype)
+ mimetype=src_mimetype,
+ encoding=charset)
if result is None:
raise ConversionError('TextDocument conversion error. '
- 'portal_transforms failed to convert to %s: %r' % (mime_type, self))
- self.setConversion(result, mime_type, format=format)
+ 'portal_transforms failed to convert'\
+ 'to %s: %r' % (mime_type, self))
+ self.setConversion(result, original_mime_type, format=format)
else:
mime_type, result = self.getConversion(format=format)
if substitution_method_parameter_dict is None:
substitution_method_parameter_dict = {}
result = self._substituteTextContent(result, safe_substitute=safe_substitute,
**substitution_method_parameter_dict)
- return mime_type, result
+ return original_mime_type, result
else:
# text_content is not set, return empty string instead of None
- return mime_type, ''
+ return original_mime_type, ''
def __call__(self):
_setCacheHeaders(_ViewEmulator().__of__(self), {})
More information about the Erp5-report
mailing list