[Erp5-report] r33407 nicolas - /erp5/trunk/products/PortalTransforms/transforms/safe_html.py
nobody at svn.erp5.org
nobody at svn.erp5.org
Thu Mar 4 18:50:38 CET 2010
Author: nicolas
Date: Thu Mar 4 18:50:37 2010
New Revision: 33407
URL: http://svn.erp5.org?rev=33407&view=rev
Log:
Make safe_html transforms more robust against dirty HTML documents.
- If HTMLParser fails to parse the document, lxml takes over the
broken HTML and repairs it. The repaired HTML is then passed back
to HTMLParser, but only once.
Modified:
erp5/trunk/products/PortalTransforms/transforms/safe_html.py
Modified: erp5/trunk/products/PortalTransforms/transforms/safe_html.py
URL: http://svn.erp5.org/erp5/trunk/products/PortalTransforms/transforms/safe_html.py?rev=33407&r1=33406&r2=33407&view=diff
==============================================================================
--- erp5/trunk/products/PortalTransforms/transforms/safe_html.py [utf8] (original)
+++ erp5/trunk/products/PortalTransforms/transforms/safe_html.py [utf8] Thu Mar 4 18:50:37 2010
@@ -1,6 +1,6 @@
# -*- coding: utf-8 -*-
import logging
-from HTMLParser import HTMLParser
+from HTMLParser import HTMLParser, HTMLParseError
import re
from cgi import escape
from zope.interface import implements
@@ -13,6 +13,9 @@
from Products.CMFDefault.utils import VALID_TAGS
from Products.CMFDefault.utils import NASTY_TAGS
from Products.PortalTransforms.utils import safeToInt
+
+from lxml import etree
+from lxml.etree import HTMLParser as LHTMLParser
# tag mapping: tag -> short or long tag
VALID_TAGS = VALID_TAGS.copy()
@@ -256,17 +259,42 @@
data.setData(orig)
return data
- try:
- safe = scrubHTML(
- bodyfinder(orig),
- valid=self.config.get('valid_tags', {}),
- nasty=self.config.get('nasty_tags', {}),
- remove_javascript=self.config.get('remove_javascript', True),
- raise_error=False)
- except IllegalHTML, inst:
- data.setData(msg_pat % ("Error", str(inst)))
- else:
- data.setData(safe)
+ html_string = orig
+ allready_repaired = False
+ while True:
+ try:
+ safe = scrubHTML(
+ bodyfinder(html_string),
+ valid=self.config.get('valid_tags', {}),
+ nasty=self.config.get('nasty_tags', {}),
+ remove_javascript=self.config.get('remove_javascript', True),
+ raise_error=False)
+ except IllegalHTML, inst:
+ data.setData(msg_pat % ("Error", str(inst)))
+ break
+ except HTMLParseError:
+ # ouch !
+ # HTMLParser is not able to parse very dirty HTML string,
+ # try to repair any broken html with help of lxml
+ if allready_repaired:
+ raise
+ allready_repaired = True
+ encoding = kwargs.get('encoding')
+ # recover parameter is equal to True by default
+ # in lxml API. I pass the argument to improve readability
+ # of above code.
+ try:
+ lparser = LHTMLParser(encoding=encoding, recover=True)
+ except LookupError:
+ # Provided encoding is not known by parser, so discard it
+ lparser = LHTMLParser(recover=True)
+ repaired_html_tree = etree.HTML(orig, parser=lparser)
+ html_string = etree.tostring(repaired_html_tree)
+ # avoid breaking now.
+ # continue into the loop with repaired html
+ else:
+ data.setData(safe)
+ break
return data
def register():
More information about the Erp5-report
mailing list