[Erp5-report] r31627 nicolas - /erp5/trunk/products/ERP5OOo/transforms/html_to_odt.py
nobody at svn.erp5.org
nobody at svn.erp5.org
Thu Jan 7 14:34:17 CET 2010
Author: nicolas
Date: Thu Jan 7 14:34:14 2010
New Revision: 31627
URL: http://svn.erp5.org?rev=31627&view=rev
Log:
Recover broken HTML documents, especially regarding the encoding used. Reviewed by Kazuhiko.
Modified:
erp5/trunk/products/ERP5OOo/transforms/html_to_odt.py
Modified: erp5/trunk/products/ERP5OOo/transforms/html_to_odt.py
URL: http://svn.erp5.org/erp5/trunk/products/ERP5OOo/transforms/html_to_odt.py?rev=31627&r1=31626&r2=31627&view=diff
==============================================================================
--- erp5/trunk/products/ERP5OOo/transforms/html_to_odt.py [utf8] (original)
+++ erp5/trunk/products/ERP5OOo/transforms/html_to_odt.py [utf8] Thu Jan 7 14:34:14 2010
@@ -1,7 +1,12 @@
+# -*- coding: utf-8 -*-
from Products.PortalTransforms.interfaces import itransform
from zope.interface import implements
from oood_commandtransform import OOOdCommandTransform, OOoDocumentDataStream
from zLOG import LOG
+from lxml import etree, html
+from lxml.etree import Element, SubElement
+
+html_parser = etree.HTMLParser(remove_blank_text=True, encoding='utf-8')
class HTMLToOdt:
"""Transforms HTML to odt by using oood"""
@@ -25,6 +30,19 @@
raise AttributeError(attr)
def convert(self, orig, data, cache=None, filename=None, context=None, **kwargs):
+ # Try to recover broken HTML documents, specially regarding encoding used
+ html_node = etree.XML(orig, parser=html_parser)
+ html_tree = html_node.getroottree()
+ head = html_tree.find('head')
+ if head is None:
+ # This part of code is supposed to be useless
+ # lxml.html.tostring function with include_meta_content_type
+ # parameter to True, should do the same things. But it does not.
+ head = Element('head')
+ html_node.insert(0, head)
+ SubElement(head, 'meta', **{'http-equiv': 'Content-Type', 'content': 'text/html; charset=utf-8'})
+ orig = html.tostring(html_tree, encoding='utf-8')
+
doc = OOOdCommandTransform(context, filename, orig, self.inputs[0])
doc.convert()
odt = doc.convertTo('odt')
More information about the Erp5-report
mailing list