[Erp5-report] r9427 - /erp5/trunk/products/ERP5OOo/Document/

nobody at svn.erp5.org nobody at svn.erp5.org
Fri Aug 25 12:46:22 CEST 2006


Author: bartek
Date: Fri Aug 25 12:46:18 2006
New Revision: 9427

URL: http://svn.erp5.org?rev=9427&view=rev
Log:
caching moved out to mixin class; stripping headers from html representation; caching in PdfDocument;

Modified:
    erp5/trunk/products/ERP5OOo/Document/DMSFile.py
    erp5/trunk/products/ERP5OOo/Document/OOoDocument.py
    erp5/trunk/products/ERP5OOo/Document/PdfDocument.py

Modified: erp5/trunk/products/ERP5OOo/Document/DMSFile.py
URL: http://svn.erp5.org/erp5/trunk/products/ERP5OOo/Document/DMSFile.py?rev=9427&r1=9426&r2=9427&view=diff
==============================================================================
--- erp5/trunk/products/ERP5OOo/Document/DMSFile.py (original)
+++ erp5/trunk/products/ERP5OOo/Document/DMSFile.py Fri Aug 25 12:46:18 2006
@@ -35,9 +35,97 @@
 # to overwrite WebDAV methods
 from Products.CMFDefault.File import File as CMFFile
 
-import mimetypes
+import mimetypes, re
+from DateTime import DateTime
 mimetypes.init()
 
+
+rs=[]
+rs.append(re.compile('<!.*>'))
+rs.append(re.compile('<HEAD>.*</HEAD>',re.DOTALL|re.MULTILINE|re.IGNORECASE))
+rs.append(re.compile('<.?(HTML|BODY)[^>]*>',re.DOTALL|re.MULTILINE|re.IGNORECASE))
+
+def stripHtml(txt):
+  for r in rs:
+    txt=r.sub('',txt)
+  return txt
+
+
+class CachingMixin:
+  # time of generation of various formats
+  cached_time={}
+  # generated files (cache)
+  cached_data={}
+  # mime types for cached formats XXX to be refactored
+  cached_mime={}
+
+  # Declarative security
+  security = ClassSecurityInfo()
+  security.declareObjectProtected(Permissions.AccessContentsInformation)
+
+  security.declareProtected(Permissions.ModifyPortalContent,'clearCache')
+  def clearCache(self):
+    """
+    Clear cache (invoked by interaction workflow upon file upload);
+    needed here to overwrite class attributes with instance attrs
+    """
+    self.cached_time={}
+    self.cached_data={}
+    self.cached_mime={}
+
+  security.declareProtected(Permissions.View,'hasFileCache')
+  def hasFileCache(self,format):
+    """
+    Checks whether we have a version in this format
+    """
+    return self.cached_data.has_key(format)
+
+  def getCacheTime(self,format):
+    """
+    Returns when the file was produced in this format (0 if never)
+    """
+    return self.cached_time.get(format,0)
+
+  def cacheUpdate(self,format):
+      self.cached_time[format]=DateTime()
+
+  def cacheSet(self,format,mime=None,data=None):
+    if mime is not None:
+      self.cached_mime[format]=mime
+    if data is not None:
+      self.cached_data[format]=data
+
+  def cacheGet(self,format):
+    '''
+    we could be much cooler here - pass testing and updating methods to this function
+    so that it does it all by itself; this'd eliminate the need for cacheSet public method
+    '''
+    return self.cached_mime.get(format,''),self.cached_data.get(format,'')
+
+  security.declareProtected(Permissions.View,'getCacheInfo')
+  def getCacheInfo(self):
+    """
+    Get cache details as string (for debugging)
+    """
+    s='CACHE INFO:<br/><table><tr><td>format</td><td>size</td><td>time</td><td>is changed</td></tr>'
+    #self.log('getCacheInfo',self.cached_time)
+    #self.log('getCacheInfo',self.cached_data)
+    for f in self.cached_time.keys():
+      t=self.cached_time[f]
+      data=self.cached_data.get(f)
+      if data:
+        if isinstance(data,str):
+          ln=len(data)
+        else:
+          ln=0
+          while data is not None:
+            ln+=len(data.data)
+            data=data.next
+      else:
+        ln='no data!!!'
+      s+='<tr><td>%s</td><td>%s</td><td>%s</td><td>%s</td></tr>' % (f,str(ln),str(t),str(self.isFileChanged(f)))
+    s+='</table>'
+    return s
 
 class DMSFile(XMLObject,File):
   """

Modified: erp5/trunk/products/ERP5OOo/Document/OOoDocument.py
URL: http://svn.erp5.org/erp5/trunk/products/ERP5OOo/Document/OOoDocument.py?rev=9427&r1=9426&r2=9427&view=diff
==============================================================================
--- erp5/trunk/products/ERP5OOo/Document/OOoDocument.py (original)
+++ erp5/trunk/products/ERP5OOo/Document/OOoDocument.py Fri Aug 25 12:46:18 2006
@@ -35,7 +35,7 @@
 from Products.ERP5Type.Cache import CachingMethod
 from Products.ERP5.Document.File import File
 from Products.ERP5Type.XMLObject import XMLObject
-from Products.ERP5OOo.Document.DMSFile import DMSFile
+from Products.ERP5OOo.Document.DMSFile import DMSFile, CachingMixin, stripHtml
 from DateTime import DateTime
 import xmlrpclib, base64, re, zipfile, cStringIO
 # to overwrite WebDAV methods
@@ -48,7 +48,7 @@
 class ConvertionError(Exception):pass
 
 #class OOoDocument(File):
-class OOoDocument(DMSFile):
+class OOoDocument(DMSFile, CachingMixin):
   """
     A file document able to convert OOo compatible files to
     any OOo supported format, to capture metadata and to
@@ -107,30 +107,11 @@
                     , PropertySheet.OOoDocument
                     )
 
-  # time of generation of various formats
-  cached_time={}
-  # generated files (cache)
-  cached_data={}
-  # mime types for cached formats XXX to be refactored
-  cached_mime={}
-  # XXX the above craves for a separate class, but I'm not sure how to handle
-  # it in ZODB, so for now let it be
-
   # regexps for stripping xml from docs
   rx_strip=re.compile('<[^>]*?>',re.DOTALL|re.MULTILINE)
   rx_compr=re.compile('\s+')
 
   searchable_attrs=DMSFile.searchable_attrs+('text_content',)
-
-  security.declareProtected(Permissions.ModifyPortalContent,'clearCache')
-  def clearCache(self):
-    """
-    Clear cache (invoked by interaction workflow upon file upload
-    needed here to overwrite class attribute with instance attrs
-    """
-    self.cached_time={}
-    self.cached_data={}
-    self.cached_mime={}
 
   def _getServerCoordinates(self):
     """
@@ -353,7 +334,8 @@
       return self.returnMessage('no pdf format found')
     fmt=tgts[0]
     self.makeFile(fmt)
-    self.snapshot=Pdata(self._unpackData(self.cached_data[fmt]))  # XXX - use propertysheet accessors
+    #self.snapshot=Pdata(self._unpackData(self.cached_data[fmt]))
+    self.snapshot=Pdata(self._unpackData(self.cacheGet(format)[1]))
     return self.returnMessage('snapshot created')
 
   security.declareProtected(Permissions.View,'getSnapshot')
@@ -364,7 +346,7 @@
     '''getSnapshot'''
     if not self.hasSnapshot():
       self.createSnapshot()
-    return self.snapshot # XXX - use propertysheet accessors
+    return self.snapshot
 
   security.declareProtected(Permissions.ManagePortal,'deleteSnapshot')
   def deleteSnapshot(self):
@@ -380,7 +362,6 @@
     '''
     get simplified html version to display
     '''
-    # XXX use caching method
     # we have to figure out which html format to use
     tgts=[x[1] for x in self.getTargetFormatItemList() if x[1].startswith('html')]
     if len(tgts)==0:
@@ -398,7 +379,7 @@
         break
     z.close()
     cs.close()
-    return h
+    return stripHtml(h)
 
   security.declareProtected(Permissions.View,'getTargetFile')
   def getTargetFile(self,format,REQUEST=None):
@@ -409,7 +390,7 @@
       return self.returnMessage('can not convert to '+format+' for some reason')
     try:
       self.makeFile(format)
-      return self.cached_mime[format],self.cached_data[format]
+      return self.cacheGet(format)
     except ConvertionError,e:
       return self.returnMessage(str(e))
 
@@ -420,19 +401,6 @@
     """
     if not self.hasOOfile():return True
     return self.getLastUploadTime() > self.getLastConvertTime()
-
-  security.declareProtected(Permissions.View,'hasFileCache')
-  def hasFileCache(self,format):
-    """
-    Checks whether we have a version in this format
-    """
-    return self.cached_data.has_key(format)
-
-  def getCacheTime(self,format):
-    """
-    Checks when if ever was the file produced
-    """
-    return self.cached_time.get(format,0)
 
   security.declareProtected(Permissions.View,'isFileChanged')
   def isFileChanged(self,format):
@@ -467,14 +435,15 @@
       raise ConvertionError('needs conversion')
     if self.isFileChanged(format):
       try:
-        self.cached_mime[format],self.cached_data[format]=self._makeFile(format)
+        mime,data=self._makeFile(format)
+        self.cacheSet(format,mime,data)
         self._p_changed=1 # XXX not sure it is necessary
       except xmlrpclib.Fault,e:
         if REQUEST is not None:
           return self.returnMessage('Problem: %s' % str(e))
         else:
           raise ConvertionError(str(e))
-      self.cached_time[format]=DateTime()
+      self.cacheUpdate(format)
       if REQUEST is not None:
         return self.returnMessage('%s created' % format)
     else:
@@ -492,31 +461,6 @@
     kw=sp.run_generate(self.getOriginalFilename(),enc(self._unpackData(self.oo_data)),None,format)
     #self.log('_makeFile',mime)
     return kw['mime'],Pdata(dec(kw['data']))
-
-  security.declareProtected(Permissions.View,'getCacheInfo')
-  def getCacheInfo(self):
-    """
-    Get cache details as string (for debugging)
-    """
-    s='CACHE INFO:<br/><table><tr><td>format</td><td>size</td><td>time</td><td>is changed</td></tr>'
-    #self.log('getCacheInfo',self.cached_time)
-    #self.log('getCacheInfo',self.cached_data)
-    for f in self.cached_time.keys():
-      t=self.cached_time[f]
-      data=self.cached_data.get(f)
-      if data:
-        if isinstance(data,str):
-          ln=len(data)
-        else:
-          ln=0
-          while data is not None:
-            ln+=len(data.data)
-            data=data.next
-      else:
-        ln='no data!!!'
-      s+='<tr><td>%s</td><td>%s</td><td>%s</td><td>%s</td></tr>' % (f,str(ln),str(t),str(self.isFileChanged(f)))
-    s+='</table>'
-    return s
 
   # make sure to call the right edit methods
   _edit=File._edit

Modified: erp5/trunk/products/ERP5OOo/Document/PdfDocument.py
URL: http://svn.erp5.org/erp5/trunk/products/ERP5OOo/Document/PdfDocument.py?rev=9427&r1=9426&r2=9427&view=diff
==============================================================================
--- erp5/trunk/products/ERP5OOo/Document/PdfDocument.py (original)
+++ erp5/trunk/products/ERP5OOo/Document/PdfDocument.py Fri Aug 25 12:46:18 2006
@@ -30,12 +30,12 @@
 from Products.CMFCore.WorkflowCore import WorkflowMethod
 from Products.ERP5Type import Permissions, PropertySheet, Constraint, Interface
 from Products.ERP5Type.Cache import CachingMethod
-from Products.ERP5OOo.Document.DMSFile import DMSFile
+from Products.ERP5OOo.Document.DMSFile import DMSFile, CachingMixin, stripHtml
 
 import tempfile, os
 
 
-class PdfDocument(DMSFile):
+class PdfDocument(DMSFile, CachingMixin):
   """
   PdfDocument - same as file, but has its own getSearchableText method
   (converts via pdftotext)
@@ -68,9 +68,12 @@
     """
     Used by the catalog for basic full text indexing
     we get text content by using pdftotext
-    but we have to do it only once
+    but we have to do it only once after upload
+    for simplicity we check only modification_date, which means we rebuild txt and html after every edit
+    but that shouldn't hurt too much
     """
-    if hasattr(self,'data') and (force==1 or self.getTextContent() is None):
+    if hasattr(self,'data') and (force==1 or self.getCacheTime('txt')<self.getModificationDate() or self.getTextContent() is None):
+      self.log('PdfDocument','regenerating txt')
       tmp=tempfile.NamedTemporaryFile()
       tmp.write(self._unpackData(self.data))
       tmp.seek(0)
@@ -79,26 +82,31 @@
       self.setTextContent(r.read().replace('\n',' '))
       tmp.close()
       r.close()
+      self.cacheUpdate('txt')
     return DMSFile.getSearchableText(self,md)
 
   SearchableText=getSearchableText
 
-  def getHtmlRepresentation(self):
+  def getHtmlRepresentation(self, force=0):
     '''
     get simplified html version to display
     '''
-    # XXX use caching method
     if not hasattr(self,'data'):
       return 'no data'
-    tmp=tempfile.NamedTemporaryFile()
-    tmp.write(self._unpackData(self.data))
-    tmp.seek(0)
-    cmd='pdftohtml -enc UTF-8 -stdout -noframes -i %s' % tmp.name
-    r=os.popen(cmd)
-    h=r.read()
-    tmp.close()
-    r.close()
-    return h
+    if force==1 or self.getCacheTime('html')<self.getModificationDate():
+      self.log('PdfDocument','regenerating html')
+      tmp=tempfile.NamedTemporaryFile()
+      tmp.write(self._unpackData(self.data))
+      tmp.seek(0)
+      cmd='pdftohtml -enc UTF-8 -stdout -noframes -i %s' % tmp.name
+      r=os.popen(cmd)
+      h=r.read()
+      tmp.close()
+      r.close()
+      h=stripHtml(h)
+      self.cacheSet('html',data=h)
+      self.cacheUpdate('html')
+    return self.cacheGet('html')[1]
 
 # vim: syntax=python shiftwidth=2 
 




More information about the Erp5-report mailing list