[Erp5-report] r39243 nicolas - /erp5/trunk/products/PortalTransforms/transforms/safe_html.py
nobody at svn.erp5.org
nobody at svn.erp5.org
Fri Oct 15 17:46:51 CEST 2010
Author: nicolas
Date: Fri Oct 15 17:46:47 2010
New Revision: 39243
URL: http://svn.erp5.org?rev=39243&view=rev
Log:
Add one more chance to get parsable HTML content with the help of BeautifulSoup.
This patch does not introduce a hard dependency on BeautifulSoup.
Modified:
erp5/trunk/products/PortalTransforms/transforms/safe_html.py
Modified: erp5/trunk/products/PortalTransforms/transforms/safe_html.py
URL: http://svn.erp5.org/erp5/trunk/products/PortalTransforms/transforms/safe_html.py?rev=39243&r1=39242&r2=39243&view=diff
==============================================================================
--- erp5/trunk/products/PortalTransforms/transforms/safe_html.py [utf8] (original)
+++ erp5/trunk/products/PortalTransforms/transforms/safe_html.py [utf8] Fri Oct 15 17:46:47 2010
@@ -17,6 +17,11 @@ from lxml import etree
from lxml.etree import HTMLParser as LHTMLParser
from lxml.html import tostring
+try:
+ from lxml.html.soupparser import fromstring as soupfromstring
+except ImportError:
+ # Means BeautifulSoup module is not installed
+ soupfromstring = None
# tag mapping: tag -> short or long tag
VALID_TAGS = VALID_TAGS.copy()
NASTY_TAGS = NASTY_TAGS.copy()
@@ -347,6 +352,7 @@ class SafeHTML:
html_string = orig
already_repaired = False
+ one_more_bullet_with_beautifulsoup = soupfromstring is not None
while True:
try:
safe = scrubHTML(
@@ -363,8 +369,20 @@ class SafeHTML:
# ouch !
# HTMLParser is not able to parse very dirty HTML string,
# try to repair any broken html with help of lxml
- if already_repaired:
+ if already_repaired and not one_more_bullet_with_beautifulsoup:
+ # Neither lxml nor BeautifulSoup can perform miracles,
+ # so give up!
raise
+ elif already_repaired and one_more_bullet_with_beautifulsoup:
+ # Can BeautifulSoup perform miracles?
+ one_more_bullet_with_beautifulsoup = False
+ # This function can raise the HTMLParseError exception,
+ # so treat this parsing as the last chance
+ # to get parsable HTML.
+ repaired_html_tree = soupfromstring(html_string)
+ html_string = tostring(repaired_html_tree,
+ include_meta_content_type=True,
+ method='xml')
already_repaired = True
encoding = kwargs.get('encoding')
# recover parameter is equal to True by default
More information about the Erp5-report
mailing list