[Erp5-report] r9070 - /erp5/trunk/utils/ooodoc_server/worker.py
nobody at svn.erp5.org
nobody at svn.erp5.org
Mon Aug 7 16:50:13 CEST 2006
Author: bartek
Date: Mon Aug 7 16:50:06 2006
New Revision: 9070
URL: http://svn.erp5.org?rev=9070&view=rev
Log:
Improved text extraction from html-calc: strip horizontal rules, and add the meta keywords/description content and the title if there are any.
Modified:
erp5/trunk/utils/ooodoc_server/worker.py
Modified: erp5/trunk/utils/ooodoc_server/worker.py
URL: http://svn.erp5.org/erp5/trunk/utils/ooodoc_server/worker.py?rev=9070&r1=9069&r2=9070&view=diff
==============================================================================
--- erp5/trunk/utils/ooodoc_server/worker.py (original)
+++ erp5/trunk/utils/ooodoc_server/worker.py Mon Aug 7 16:50:06 2006
@@ -28,6 +28,7 @@
##############################################################################
import os,sys,pdb,time, formatter, htmllib
+sys.path.append('/etc/ooodoc')
import config
sys.path.append(config.unopath)
import uno
@@ -42,7 +43,39 @@
from mimemapper import mimemapper
-class Worker(object):
+class PlainWriter(formatter.DumbWriter):
+ '''Writer stripping horizontal rules'''
+
+ def send_hor_rule(self,*args,**kwargs):
+ pass
+
+class CustomHTMLParser(htmllib.HTMLParser):
+ '''parser that would pass on content of basic meta tags from web pages'''
+
+ def do_meta(self,tag):
+ t=dict(tag)
+ if t.get('name','x').lower() in ('keywords','description'):
+ print t['content']
+
+ def feed(self,s):
+ htmllib.HTMLParser(self,s)
+ print s.title
+
+
+class HTMLStripper(object):
+
+ def strip_html(self,source_file,dest_file):
+ fileob=open(source_file)
+ filestring=fileob.read()
+ fileob.close()
+ fileob=open(dest_file,'w')
+ w=PlainWriter(fileob)
+ f=formatter.AbstractFormatter(w)
+ p=CustomHTMLParser(f)
+ p.feed(filestring)
+ fileob.close()
+
+class Worker(HTMLStripper):
'''
This class encapsulates an OOo instance providing interface
@@ -104,6 +137,30 @@
self._loadFile(fname)
self._generate(format)
return mimemapper.getMimeFor(format),fileUrlToSystemPath(self.destUrl)
+
+ def getplaintext(self,fname):
+ '''get plain text out of the document (to be used in SearchableText)'''
+ self._loadFile(fname)
+ self._checkMimeType(1)
+ mime=self.metadata['MIMEType']
+ if mime=='application/vnd.oasis.opendocument.text':
+ self._generate('txt')
+ return None,fileUrlToSystemPath(self.destUrl)
+ if mime=='application/vnd.oasis.opendocument.spreadsheet':
+ # for want of a cleaner solution, we save as html (to save all sheets) and
+ # convert to plain text
+ self._generate('html-calc')
+ filename=fileUrlToSystemPath(self.destUrl)
+ self.strip_html(filename,filename)
+ return None, filename
+ if mime=='application/vnd.oasis.opendocument.presentation':
+ # we do not know what to do
+ pass
+ if mime=='application/vnd.oasis.opendocument.graphics':
+ # no plain text here
+ pass
+ # we should never get to this point, anyway...
+ return None,None
def setmetadata(self,fname,meta):
'''set metadata on OOo file (fname does not change)'''
More information about the Erp5-report mailing list