[Erp5-report] r41722 jm - /erp5/trunk/products/PortalTransforms/transforms/safe_html.py
nobody at svn.erp5.org
nobody at svn.erp5.org
Thu Dec 23 17:36:24 CET 2010
Author: jm
Date: Thu Dec 23 17:36:24 2010
New Revision: 41722
URL: http://svn.erp5.org?rev=41722&view=rev
Log:
safe_html: fix repairing with BeautifulSoup (+ some refactoring)
Modified:
erp5/trunk/products/PortalTransforms/transforms/safe_html.py
Modified: erp5/trunk/products/PortalTransforms/transforms/safe_html.py
URL: http://svn.erp5.org/erp5/trunk/products/PortalTransforms/transforms/safe_html.py?rev=41722&r1=41721&r2=41722&view=diff
==============================================================================
--- erp5/trunk/products/PortalTransforms/transforms/safe_html.py [utf8] (original)
+++ erp5/trunk/products/PortalTransforms/transforms/safe_html.py [utf8] Thu Dec 23 17:36:24 2010
@@ -205,15 +205,14 @@ class StrippingParser(HTMLParser):
elif remove_script and hasScript(v):
if not self.raise_error: continue
else: raise IllegalHTML, 'Script URI "%s" not allowed.' % v
- elif tag.lower() == 'meta' and k.lower() == 'content' and\
- self.default_encoding and self.default_encoding not in v:
- match = charset_parser.search(v)
- if match is not None:
- self.original_charset = match.group('charset')
- self.result.append(' %s="%s"' % (k,
- charset_parser.sub(CharsetReplacer(self.default_encoding), v)
- ,))
else:
+ if tag.lower() == 'meta' and k.lower() == 'content' and \
+ self.default_encoding and self.default_encoding not in v:
+ match = charset_parser.search(v)
+ if match is not None:
+ self.original_charset = match.group('charset')
+ v = charset_parser.sub(
+ CharsetReplacer(self.default_encoding), v)
self.result.append(' %s="%s"' % (k, v))
#UNUSED endTag = '</%s>' % tag
@@ -351,13 +350,11 @@ class SafeHTML:
data.setData(orig)
return data
- html_string = orig
- already_repaired = False
- one_more_bullet_with_beautifulsoup = soupfromstring is not None
+ repaired = 0
while True:
try:
- safe = scrubHTML(
- html_string,
+ orig = scrubHTML(
+ orig,
valid=self.config.get('valid_tags', {}),
nasty=self.config.get('nasty_tags', {}),
remove_javascript=self.config.get('remove_javascript', True),
@@ -368,42 +365,38 @@ class SafeHTML:
break
except HTMLParseError:
# ouch !
- # HTMLParser is not able to parse very dirty HTML string,
- # try to repair any broken html with help of lxml
- if already_repaired and not one_more_bullet_with_beautifulsoup:
- # Even lxml nor BeautifulSoup doesn't perform miracles
- # so Give up !
- raise
- elif already_repaired and one_more_bullet_with_beautifulsoup:
- # Is BeautifulSoup can perform miracles ?
- one_more_bullet_with_beautifulsoup = False
- # This function can raise the exception HTMLParseError.
- # So consider this parsing as last chance
- # to get parsable html.
- repaired_html_tree = soupfromstring(html_string)
- html_string = tostring(repaired_html_tree,
- include_meta_content_type=True,
- method='xml')
- already_repaired = True
- encoding = kwargs.get('encoding')
- # recover parameter is equal to True by default
- # in lxml API. I pass the argument to improve readability
- # of above code.
- try:
- lparser = LHTMLParser(encoding=encoding, recover=True,
- remove_comments=True)
- except LookupError:
- # Provided encoding is not known by parser, so discard it
- lparser = LHTMLParser(recover=True,
- remove_comments=True)
- repaired_html_tree = etree.HTML(orig, parser=lparser)
- html_string = tostring(repaired_html_tree,
- include_meta_content_type=True,
- method='xml')
+ # HTMLParser is not able to parse very dirty HTML string
+ if not repaired:
+ # try to repair any broken html with help of lxml
+ encoding = kwargs.get('encoding')
+ # recover parameter is equal to True by default
+ # in lxml API. I pass the argument to improve readability
+ # of above code.
+ try:
+ lparser = LHTMLParser(encoding=encoding, recover=True,
+ remove_comments=True)
+ except LookupError:
+ # Provided encoding is not known by parser so discard it
+ lparser = LHTMLParser(recover=True,
+ remove_comments=True)
+ repaired_html_tree = etree.HTML(orig, parser=lparser)
+ elif repaired > (soupfromstring is not None):
+ # Neither lxml nor BeautifulSoup worked so give up !
+ raise
+ else:
+ # Can BeautifulSoup perform miracles ?
+ # This function may raise HTMLParseError.
+ # So consider this parsing as last chance
+ # to get parsable html.
+ repaired_html_tree = soupfromstring(orig)
+ orig = tostring(repaired_html_tree,
+ include_meta_content_type=True,
+ method='xml')
+ repaired += 1
# avoid breaking now.
# continue into the loop with repaired html
else:
- data.setData(safe)
+ data.setData(orig)
break
return data
More information about the Erp5-report mailing list