[Erp5-report] r25557 - /erp5/trunk/products/ERP5/Document/Document.py

Sat Feb 14 10:28:11 CET 2009

Author: jp
Date: Sat Feb 14 10:28:10 2009
New Revision: 25557

URL: http://svn.erp5.org?rev=25557&view=rev
Log:
Make _stripHTML a reusable private method for all subclasses.

Modified:
    erp5/trunk/products/ERP5/Document/Document.py

Modified: erp5/trunk/products/ERP5/Document/Document.py
URL: http://svn.erp5.org/erp5/trunk/products/ERP5/Document/Document.py?rev=25557&r1=25556&r2=25557&view=diff
==============================================================================

--- erp5/trunk/products/ERP5/Document/Document.py [utf8] (original)
+++ erp5/trunk/products/ERP5/Document/Document.py [utf8] Sat Feb 14 10:28:10 2009
@@ -1271,6 +1271,13 @@
       return data
     kw['format'] = 'html'
     mime, html = self.convert(**kw)
+    return self._stripHTML(str(html))
+
+  def _stripHTML(self, html, charset=None):
+    """
+      A private method which can be reused by subclasses
+      to strip HTML content
+    """
     body_list = re.findall(self.body_parser, str(html))
     if len(body_list):
       stripped_html = body_list[0]
@@ -1279,6 +1286,9 @@
     # find charset and convert to utf-8
     charset_list = self.charset_parser.findall(str(html)) # XXX - Not efficient is datastream 
                                                           # instance but hard to do better
+    if charset and not charset_list:
+      # Use the optional parameter if we cannot find the encoding in the HTML
+      charset_list = [charset]
     if charset_list and charset_list[0] not in ('utf-8', 'UTF-8'):
       try:
         stripped_html = unicode(str(stripped_html),