[Erp5-report] r35262 nicolas - /erp5/trunk/products/PortalTransforms/transforms/safe_html.py
nobody at svn.erp5.org
nobody at svn.erp5.org
Wed May 12 19:37:19 CEST 2010
Author: nicolas
Date: Wed May 12 19:37:19 2010
New Revision: 35262
URL: http://svn.erp5.org?rev=35262&view=rev
Log:
Add new option which enable conversion of document into new codec (default is utf-8)
and replace its charset declaration inside html conversion
Modified:
erp5/trunk/products/PortalTransforms/transforms/safe_html.py
Modified: erp5/trunk/products/PortalTransforms/transforms/safe_html.py
URL: http://svn.erp5.org/erp5/trunk/products/PortalTransforms/transforms/safe_html.py?rev=35262&r1=35261&r2=35262&view=diff
==============================================================================
--- erp5/trunk/products/PortalTransforms/transforms/safe_html.py [utf8] (original)
+++ erp5/trunk/products/PortalTransforms/transforms/safe_html.py [utf8] Wed May 12 19:37:19 2010
@@ -15,6 +15,7 @@
from lxml import etree
from lxml.etree import HTMLParser as LHTMLParser
+from lxml.html import tostring
# tag mapping: tag -> short or long tag
VALID_TAGS = VALID_TAGS.copy()
@@ -71,6 +72,19 @@
except ValueError:
return entity_value
+charset_parser = re.compile('charset="?(?P<charset>[^"]*)"?$')
+class CharsetReplacer:
+ def __init__(self, encoding):
+ self.encoding = encoding
+
+ def __call__(self, match):
+ if match is None:
+ return ''
+ charset = match.group('charset')
+ if charset != self.encoding:
+ return match.group(0).replace(charset, self.encoding)
+ return match.group(0)
+
class StrippingParser(HTMLParser):
"""Pass only allowed tags; raise exception for known-bad.
@@ -80,7 +94,8 @@
from htmlentitydefs import entitydefs # replace entitydefs from sgmllib
- def __init__(self, valid, nasty, remove_javascript, raise_error):
+ def __init__(self, valid, nasty, remove_javascript, raise_error,
+ default_encoding):
HTMLParser.__init__( self )
self.result = []
self.valid = valid
@@ -88,6 +103,8 @@
self.remove_javascript = remove_javascript
self.raise_error = raise_error
self.suppress = False
+ self.default_encoding = default_encoding
+ self.original_charset = None
def handle_data(self, data):
if self.suppress: return
@@ -117,14 +134,12 @@
def handle_starttag(self, tag, attrs):
""" Delete all tags except for legal ones.
"""
-
if self.suppress: return
if self.valid.has_key(tag):
self.result.append('<' + tag)
remove_script = getattr(self,'remove_javascript',True)
-
for k, v in attrs:
if remove_script and k.strip().lower().startswith('on'):
if not self.raise_error: continue
@@ -134,6 +149,14 @@
elif remove_script and hasScript(v):
if not self.raise_error: continue
else: raise IllegalHTML, 'Script URI "%s" not allowed.' % v
+ elif k.lower() == 'content' and self.default_encoding and\
+ self.default_encoding not in v:
+ match = charset_parser.search(v)
+ if match is not None:
+ self.original_charset = match.group('charset')
+ self.result.append(' %s="%s"' % (k,
+ charset_parser.sub(CharsetReplacer(self.default_encoding), v)
+ ,))
else:
self.result.append(' %s="%s"' % (k, v))
@@ -162,15 +185,22 @@
return ''.join(self.result)
def scrubHTML(html, valid=VALID_TAGS, nasty=NASTY_TAGS,
- remove_javascript=True, raise_error=True):
+ remove_javascript=True, raise_error=True,
+ default_encoding=None):
""" Strip illegal HTML tags from string text.
"""
+
parser = StrippingParser(valid=valid, nasty=nasty,
remove_javascript=remove_javascript,
- raise_error=raise_error)
+ raise_error=raise_error,
+ default_encoding=default_encoding)
parser.feed(html)
parser.close()
+ if parser.original_charset:
+ result = parser.getResult().decode(parser.original_charset)\
+ .encode(default_encoding)
+ return result
return parser.getResult()
class SafeHTML:
@@ -206,6 +236,7 @@
'nasty_tags': NASTY_TAGS,
'remove_javascript': 1,
'disable_transform': 0,
+ 'default_encoding': 'utf-8',
}
self.config_metadata = {
@@ -230,7 +261,11 @@
'This does not effect <script> tags. 0 to leave the attributes.'),
'disable_transform' : ("int",
'disable_transform',
- 'If 1, nothing is done.')
+ 'If 1, nothing is done.'),
+ 'default_encoding': ('string',
+ 'default_encoding',
+ 'Encoding used for html string.'\
+ ' If encoding is different, the string will be converted' ),
}
self.config.update(kwargs)
@@ -269,7 +304,8 @@
valid=self.config.get('valid_tags', {}),
nasty=self.config.get('nasty_tags', {}),
remove_javascript=self.config.get('remove_javascript', True),
- raise_error=False)
+ raise_error=False,
+ default_encoding=self.config.get('default_encoding', 'utf-8'))
except IllegalHTML, inst:
data.setData(msg_pat % ("Error", str(inst)))
break
@@ -292,7 +328,8 @@
lparser = LHTMLParser(recover=True,
remove_comments=True)
repaired_html_tree = etree.HTML(orig, parser=lparser)
- html_string = etree.tostring(repaired_html_tree)
+ html_string = tostring(repaired_html_tree,
+ include_meta_content_type=True)
# avoid breaking now.
# continue into the loop with repaired html
else:
More information about the Erp5-report
mailing list