[Erp5-report] r9207 - /erp5/trunk/products/ERP5OOo/Document/OOoDocument.py

nobody at svn.erp5.org nobody at svn.erp5.org
Mon Aug 14 22:25:54 CEST 2006


Author: bartek
Date: Mon Aug 14 22:25:53 2006
New Revision: 9207

URL: http://svn.erp5.org?rev=9207&view=rev
Log:
plain text extraction here; tidying up returned messages and codes; switched off redundant logging; added generic DMSFile property sheet;

Modified:
    erp5/trunk/products/ERP5OOo/Document/OOoDocument.py

Modified: erp5/trunk/products/ERP5OOo/Document/OOoDocument.py
URL: http://svn.erp5.org/erp5/trunk/products/ERP5OOo/Document/OOoDocument.py?rev=9207&r1=9206&r2=9207&view=diff
==============================================================================
--- erp5/trunk/products/ERP5OOo/Document/OOoDocument.py (original)
+++ erp5/trunk/products/ERP5OOo/Document/OOoDocument.py Mon Aug 14 22:25:53 2006
@@ -36,7 +36,7 @@
 from Products.ERP5.Document.File import File
 from Products.ERP5Type.XMLObject import XMLObject
 from DateTime import DateTime
-import xmlrpclib, base64, mimetypes
+import xmlrpclib, base64, mimetypes, re, zipfile, cStringIO
 # to overwrite WebDAV methods
 from Products.CMFDefault.File import File as CMFFile
 from Products.CMFCore.utils import getToolByName
@@ -104,6 +104,7 @@
                     , PropertySheet.Version
                     , PropertySheet.Reference
                     , PropertySheet.Document
+                    , PropertySheet.DMSFile
                     , PropertySheet.OOoDocument
                     )
 
@@ -116,10 +117,9 @@
   # XXX the above craves for a separate class, but I'm not sure how to handle
   # it in ZODB, so for now let it be
 
-  #def __init__(self,*args,**kwargs):
-    #XMLObject.__init__(self,*args,**kwargs)
-    #File.__init__(self,*args,**kwargs)
-    #self.__dav_collection__=0
+  # regexps for stripping xml from docs
+  rx_strip=re.compile('<[^>]*?>',re.DOTALL|re.MULTILINE)
+  rx_compr=re.compile('\s+')
 
   ### Content indexing methods
   security.declareProtected(Permissions.View, 'getSearchableText')
@@ -164,7 +164,7 @@
 
   def returnMessage(self,msg,code=0):
     """
-    code may be used in the future to indicate a problem
+    code > 0 indicates a problem
     we distinguish data return from message by checking if it is a tuple
     """
     m=Message(domain='ui',message=msg)
@@ -178,11 +178,11 @@
     and gets converted file as well as metadata
     """
     if force==0 and not self.isFileUploaded():
-      return self.returnMessage('OOo file is up do date')
+      return self.returnMessage('OOo file is up do date',1)
     try:
       self._convert()
     except xmlrpclib.Fault,e:
-      return self.returnMessage('Problem: %s' % str(e))
+      return self.returnMessage('Problem: %s' % str(e),2)
     self.setLastConvertTime(DateTime())
     return self.returnMessage('converted')
 
@@ -224,7 +224,7 @@
     """
     if not self.hasOOfile(): return False
     allowed=self.getTargetFormatItemList()
-    self.log('allowed',allowed)
+    #self.log('allowed',allowed)
     if allowed is None: return False
     return (format in [x[1] for x in allowed])
 
@@ -235,15 +235,15 @@
     based on the values provided by the user. This is implemented
     through the invocation of the conversion server.
     """
-    self.log('editMetadata',newmeta)
+    #self.log('editMetadata',newmeta)
     for k,v in newmeta.items():
       # OOo uses capitalized meta names
       newmeta[k.capitalize()]=v
       newmeta.pop(k)
-    self.log('newmeta',newmeta)
+    #self.log('newmeta',newmeta)
     sp=self._mkProxy()
     meta,oo_data=sp.run_setmetadata(self.getTitle(),enc(self._unpackData(self.oo_data)),newmeta)
-    self.log('res editMetadata',meta)
+    #self.log('res editMetadata',meta)
     self.oo_data=Pdata(dec(oo_data))
     self._setMetaData(meta)
     return True # XXX why return ? - why not?
@@ -256,13 +256,28 @@
     on the object. Update metadata information.
     """
     sp=self._mkProxy()
-    self.log('_convert',enc(self._unpackData(self.data))[:500])
+    #self.log('_convert',enc(self._unpackData(self.data))[:500])
     meta,oo_data=sp.run_convert(self.getOriginalFilename(),enc(self._unpackData(self.data)))
     self.oo_data=Pdata(dec(oo_data))
     # now we get text content 
-    nic,text_data=sp.run_getplaintext(self.getOriginalFilename(),enc(self._unpackData(self.oo_data)))
+    text_data=self.extractTextContent()
     self.setTextContent(dec(text_data))
     self._setMetaData(meta)
+
+  security.declareProtected(Permissions.View,'extractTextContent')
+  def extractTextContent(self):
+    """
+    Return plain text taken from content.xml inside the zip archive held in self.oo_data, with XML tags replaced by spaces and whitespace runs collapsed (the simplest way possible; per the original author it works for all ODF formats).
+    """
+    cs=cStringIO.StringIO()
+    cs.write(self._unpackData(self.oo_data))
+    z=zipfile.ZipFile(cs)
+    s=z.read('content.xml')
+    s=self.rx_strip.sub(" ",s) # replace every <...> tag with a single space
+    s=self.rx_compr.sub(" ",s) # collapse each run of whitespace into one space
+    cs.close()
+    z.close()
+    return s
 
   security.declarePrivate('_setMetaData')
   def _setMetaData(self,meta):
@@ -277,10 +292,10 @@
           could also support user fields in OOo
           (user fields are so useful actually...)
     """
-    self.log('meta',meta)
+    #self.log('meta',meta)
     for k,v in meta.items():
       meta[k]=v.encode('utf-8')
-    self.log('meta',meta)
+    #self.log('meta',meta)
     self.setTitle(meta.get('Title',''))
     self.setSubject(meta.get('Subject',''))
     self.setKeywords(meta.get('Keywords',''))
@@ -475,7 +490,7 @@
     # real version:
     sp=self._mkProxy()
     mime,file=sp.run_generate(self.getOriginalFilename(),enc(self._unpackData(self.oo_data)),format)
-    self.log('_makeFile',mime)
+    #self.log('_makeFile',mime)
     return mime,Pdata(dec(file))
 
   security.declareProtected(Permissions.View,'getCacheInfo')
@@ -484,8 +499,8 @@
     Get cache details as string (for debugging)
     """
     s='CACHE INFO:<br/><table><tr><td>format</td><td>size</td><td>time</td><td>is changed</td></tr>'
-    self.log('getCacheInfo',self.cached_time)
-    self.log('getCacheInfo',self.cached_data)
+    #self.log('getCacheInfo',self.cached_time)
+    #self.log('getCacheInfo',self.cached_data)
     for f in self.cached_time.keys():
       t=self.cached_time[f]
       data=self.cached_data.get(f)




More information about the Erp5-report mailing list