[Erp5-report] r9070 - /erp5/trunk/utils/ooodoc_server/worker.py

nobody at svn.erp5.org nobody at svn.erp5.org
Mon Aug 7 16:50:13 CEST 2006


Author: bartek
Date: Mon Aug  7 16:50:06 2006
New Revision: 9070

URL: http://svn.erp5.org?rev=9070&view=rev
Log:
improved text extraction from html-calc (strip horizontal rules, add meta and title if there are any)

Modified:
    erp5/trunk/utils/ooodoc_server/worker.py

Modified: erp5/trunk/utils/ooodoc_server/worker.py
URL: http://svn.erp5.org/erp5/trunk/utils/ooodoc_server/worker.py?rev=9070&r1=9069&r2=9070&view=diff
==============================================================================
--- erp5/trunk/utils/ooodoc_server/worker.py (original)
+++ erp5/trunk/utils/ooodoc_server/worker.py Mon Aug  7 16:50:06 2006
@@ -28,6 +28,7 @@
 ##############################################################################
 
 import os,sys,pdb,time, formatter, htmllib
+sys.path.append('/etc/ooodoc')
 import config
 sys.path.append(config.unopath)
 import uno
@@ -42,7 +43,39 @@
 
 from mimemapper import mimemapper
 
-class Worker(object):
+class PlainWriter(formatter.DumbWriter):
+  '''DumbWriter variant that leaves horizontal rules out of the output'''
+
+  def send_hor_rule(self,*args,**kwargs):
+    # accept whatever arguments are passed and deliberately write nothing
+    return None
+
+class CustomHTMLParser(htmllib.HTMLParser):
+  '''Parser that passes on the content of basic meta tags and the title.
+
+  The keywords/description meta content and the page title are emitted
+  with print, i.e. to standard output, not through the formatter.
+  NOTE(review): confirm that stdout is the intended destination.
+  '''
+
+  def do_meta(self,tag):
+    '''Print the content of a keywords or description meta tag.
+
+    tag is the sequence of (name, value) attribute pairs of the meta tag.
+    '''
+    t=dict(tag)
+    if t.get('name','x').lower() in ('keywords','description'):
+      # a matching meta tag without a content attribute used to raise
+      # KeyError; skip it instead
+      content=t.get('content')
+      if content:
+        print content
+
+  def feed(self,s):
+    '''Parse the string s, then print the document title if there is one.'''
+    # delegate to the base class feed; the previous code instantiated a
+    # second parser (htmllib.HTMLParser(self,s)) and never parsed s
+    htmllib.HTMLParser.feed(self,s)
+    # the title is an attribute of the parser, not of the input string
+    if self.title:
+      print self.title
+
+
+class HTMLStripper(object):
+  '''Mixin converting an HTML file into plain text'''
+
+  def strip_html(self,source_file,dest_file):
+    '''Render the HTML in source_file as plain text into dest_file.
+
+    source_file is read completely before dest_file is opened for
+    writing, so both arguments may name the same file.
+    '''
+    fileob=open(source_file)
+    try:
+      filestring=fileob.read()
+    finally:
+      # release the handle even if reading fails
+      fileob.close()
+    fileob=open(dest_file,'w')
+    try:
+      w=PlainWriter(fileob)
+      f=formatter.AbstractFormatter(w)
+      p=CustomHTMLParser(f)
+      p.feed(filestring)
+      # force processing of any data the parser still has buffered
+      p.close()
+    finally:
+      # close the output even if parsing raises
+      fileob.close()
+
+class Worker(HTMLStripper):
 
   '''
   This class encapsulates an OOo instance providing interface
@@ -104,6 +137,30 @@
     self._loadFile(fname)
     self._generate(format)
     return mimemapper.getMimeFor(format),fileUrlToSystemPath(self.destUrl)
+
+  def getplaintext(self,fname):
+    '''Extract plain text from the document (to be used in SearchableText).
+
+    Loads fname and dispatches on the MIMEType found in self.metadata.
+    Returns a pair whose first item is always None and whose second item
+    is the path of the generated text file, or None when no text was
+    produced.
+    '''
+    self._loadFile(fname)
+    self._checkMimeType(1)
+    doctype=self.metadata['MIMEType']
+    if doctype=='application/vnd.oasis.opendocument.text':
+      self._generate('txt')
+      return None,fileUrlToSystemPath(self.destUrl)
+    elif doctype=='application/vnd.oasis.opendocument.spreadsheet':
+      # no cleaner way known: export as html (which keeps all sheets),
+      # then reduce that html to plain text in place
+      self._generate('html-calc')
+      htmlpath=fileUrlToSystemPath(self.destUrl)
+      self.strip_html(htmlpath,htmlpath)
+      return None,htmlpath
+    elif doctype in ('application/vnd.oasis.opendocument.presentation',
+                     'application/vnd.oasis.opendocument.graphics'):
+      # presentation: we do not know what to do; graphics: no plain text
+      pass
+    # reached for presentations, graphics and any other mime type
+    return None,None
 
   def setmetadata(self,fname,meta):
     '''set metadata on OOo file (fname does not change)'''




More information about the Erp5-report mailing list