[Erp5-report] r9427 - /erp5/trunk/products/ERP5OOo/Document/
nobody at svn.erp5.org
nobody at svn.erp5.org
Fri Aug 25 12:46:22 CEST 2006
Author: bartek
Date: Fri Aug 25 12:46:18 2006
New Revision: 9427
URL: http://svn.erp5.org?rev=9427&view=rev
Log:
caching moved out to mixin class; stripping headers from html representation; caching in PdfDocument;
Modified:
erp5/trunk/products/ERP5OOo/Document/DMSFile.py
erp5/trunk/products/ERP5OOo/Document/OOoDocument.py
erp5/trunk/products/ERP5OOo/Document/PdfDocument.py
Modified: erp5/trunk/products/ERP5OOo/Document/DMSFile.py
URL: http://svn.erp5.org/erp5/trunk/products/ERP5OOo/Document/DMSFile.py?rev=9427&r1=9426&r2=9427&view=diff
==============================================================================
--- erp5/trunk/products/ERP5OOo/Document/DMSFile.py (original)
+++ erp5/trunk/products/ERP5OOo/Document/DMSFile.py Fri Aug 25 12:46:18 2006
@@ -35,9 +35,97 @@
# to overwrite WebDAV methods
from Products.CMFDefault.File import File as CMFFile
-import mimetypes
+import mimetypes, re
+from DateTime import DateTime
mimetypes.init()
+
+# Patterns applied in order by stripHtml to reduce a complete html page
+# to the markup that was inside its body.
+rs=[]
+# comments go first, since they may contain '>' characters themselves
+rs.append(re.compile(r'<!--.*?-->',re.DOTALL))
+# other declarations such as <!DOCTYPE ...>; stop at the first '>' so that
+# markup following on the same line is kept
+rs.append(re.compile(r'<![^>]*>'))
+# the whole head element, whether or not the opening tag has attributes
+rs.append(re.compile(r'<HEAD\b[^>]*>.*?</HEAD\s*>',re.DOTALL|re.IGNORECASE))
+# opening and closing html/body tags only - what they enclose is kept
+rs.append(re.compile(r'</?(HTML|BODY)\b[^>]*>',re.IGNORECASE))
+
+def stripHtml(txt):
+  """
+  Return txt with comments, declarations, the head element and the
+  html/body tags removed; everything else is left untouched.
+  """
+  for r in rs:
+    txt=r.sub('',txt)
+  return txt
+
+
+class CachingMixin:
+  """
+  Mixin storing generated representations of a document, keyed by format.
+
+  For every format it keeps the generated data, its mime type and the
+  time of generation in three dictionaries. getCacheInfo additionally
+  reports isFileChanged(format) when the host class provides it.
+  """
+  # Class-level defaults. They are shared by every instance, so they are
+  # only ever read; all writes go through _cacheOwn first.
+  # time of generation of various formats
+  cached_time={}
+  # generated files (cache)
+  cached_data={}
+  # mime types for cached formats XXX to be refactored
+  cached_mime={}
+
+  # Declarative security
+  security = ClassSecurityInfo()
+  security.declareObjectProtected(Permissions.AccessContentsInformation)
+
+  security.declareProtected(Permissions.ModifyPortalContent,'clearCache')
+  def clearCache(self):
+    """
+    Clear cache (invoked by interaction workflow upon file upload);
+    needed here to overwrite class attribute with instance attrs
+    """
+    self.cached_time={}
+    self.cached_data={}
+    self.cached_mime={}
+
+  def _cacheOwn(self):
+    # Make sure the three dictionaries belong to this instance before they
+    # are written to: an attribute that is still the very dictionary
+    # defined on the class is replaced by a fresh, empty one. Without this
+    # a document whose cache was never cleared would store its data in
+    # dictionaries shared with every other document.
+    for name in ('cached_time','cached_data','cached_mime'):
+      if getattr(self,name) is getattr(self.__class__,name,None):
+        setattr(self,name,{})
+
+  security.declareProtected(Permissions.View,'hasFileCache')
+  def hasFileCache(self,format):
+    """
+    Checks whether we have a version in this format
+    """
+    return format in self.cached_data
+
+  def getCacheTime(self,format):
+    """
+    Checks when if ever was the file produced
+    (returns 0 if it never was)
+    """
+    return self.cached_time.get(format,0)
+
+  # cacheUpdate and cacheSet are documented with comments rather than
+  # docstrings on purpose: the Zope publisher refuses to publish objects
+  # without a docstring, and adding one would change that.
+  def cacheUpdate(self,format):
+    # Record the current time as the generation time of the given format.
+    self._cacheOwn()
+    self.cached_time[format]=DateTime()
+    # a dictionary modified in place is not noticed by the persistence
+    # machinery (assuming a persistent host class), so flag the object
+    # explicitly - OOoDocument.makeFile does the same
+    self._p_changed=1
+
+  def cacheSet(self,format,mime=None,data=None):
+    # Store the mime type and/or the data for the given format; an
+    # argument left as None keeps whatever was stored before.
+    if mime is None and data is None:
+      return
+    self._cacheOwn()
+    if mime is not None:
+      self.cached_mime[format]=mime
+    if data is not None:
+      self.cached_data[format]=data
+    # see cacheUpdate for why the object is flagged as changed
+    self._p_changed=1
+
+  def cacheGet(self,format):
+    '''
+    Returns a (mime, data) pair for the format; each element is an empty
+    string if nothing was stored.
+    we could be much cooler here - pass testing and updating methods to this function
+    so that it does it all by itself; this'd eliminate the need for cacheSet public method
+    '''
+    return self.cached_mime.get(format,''),self.cached_data.get(format,'')
+
+  security.declareProtected(Permissions.View,'getCacheInfo')
+  def getCacheInfo(self):
+    """
+    Get cache details as string (for debugging)
+    """
+    rows=['CACHE INFO:<br/><table><tr><td>format</td><td>size</td><td>time</td><td>is changed</td></tr>']
+    # isFileChanged is not defined by the mixin itself; report 'n/a'
+    # instead of failing when the host class does not provide it
+    is_changed=getattr(self,'isFileChanged',None)
+    for f,t in self.cached_time.items():
+      data=self.cached_data.get(f)
+      if data:
+        if isinstance(data,str):
+          ln=len(data)
+        else:
+          # not a plain string: add up the chunks of a chain of objects,
+          # each exposing .data and .next
+          ln=0
+          while data is not None:
+            ln+=len(data.data)
+            data=data.next
+      else:
+        ln='no data!!!'
+      if is_changed is None:
+        changed='n/a'
+      else:
+        changed=is_changed(f)
+      rows.append('<tr><td>%s</td><td>%s</td><td>%s</td><td>%s</td></tr>' % (f,ln,t,changed))
+    rows.append('</table>')
+    return ''.join(rows)
class DMSFile(XMLObject,File):
"""
Modified: erp5/trunk/products/ERP5OOo/Document/OOoDocument.py
URL: http://svn.erp5.org/erp5/trunk/products/ERP5OOo/Document/OOoDocument.py?rev=9427&r1=9426&r2=9427&view=diff
==============================================================================
--- erp5/trunk/products/ERP5OOo/Document/OOoDocument.py (original)
+++ erp5/trunk/products/ERP5OOo/Document/OOoDocument.py Fri Aug 25 12:46:18 2006
@@ -35,7 +35,7 @@
from Products.ERP5Type.Cache import CachingMethod
from Products.ERP5.Document.File import File
from Products.ERP5Type.XMLObject import XMLObject
-from Products.ERP5OOo.Document.DMSFile import DMSFile
+from Products.ERP5OOo.Document.DMSFile import DMSFile, CachingMixin, stripHtml
from DateTime import DateTime
import xmlrpclib, base64, re, zipfile, cStringIO
# to overwrite WebDAV methods
@@ -48,7 +48,7 @@
class ConvertionError(Exception):pass
#class OOoDocument(File):
-class OOoDocument(DMSFile):
+class OOoDocument(DMSFile, CachingMixin):
"""
A file document able to convert OOo compatible files to
any OOo supported format, to capture metadata and to
@@ -107,30 +107,11 @@
, PropertySheet.OOoDocument
)
- # time of generation of various formats
- cached_time={}
- # generated files (cache)
- cached_data={}
- # mime types for cached formats XXX to be refactored
- cached_mime={}
- # XXX the above craves for a separate class, but I'm not sure how to handle
- # it in ZODB, so for now let it be
-
# regexps for stripping xml from docs
rx_strip=re.compile('<[^>]*?>',re.DOTALL|re.MULTILINE)
rx_compr=re.compile('\s+')
searchable_attrs=DMSFile.searchable_attrs+('text_content',)
-
- security.declareProtected(Permissions.ModifyPortalContent,'clearCache')
- def clearCache(self):
- """
- Clear cache (invoked by interaction workflow upon file upload
- needed here to overwrite class attribute with instance attrs
- """
- self.cached_time={}
- self.cached_data={}
- self.cached_mime={}
def _getServerCoordinates(self):
"""
@@ -353,7 +334,8 @@
return self.returnMessage('no pdf format found')
fmt=tgts[0]
self.makeFile(fmt)
- self.snapshot=Pdata(self._unpackData(self.cached_data[fmt])) # XXX - use propertysheet accessors
+ #self.snapshot=Pdata(self._unpackData(self.cached_data[fmt]))
+ self.snapshot=Pdata(self._unpackData(self.cacheGet(format)[1]))
return self.returnMessage('snapshot created')
security.declareProtected(Permissions.View,'getSnapshot')
@@ -364,7 +346,7 @@
'''getSnapshot'''
if not self.hasSnapshot():
self.createSnapshot()
- return self.snapshot # XXX - use propertysheet accessors
+ return self.snapshot
security.declareProtected(Permissions.ManagePortal,'deleteSnapshot')
def deleteSnapshot(self):
@@ -380,7 +362,6 @@
'''
get simplified html version to display
'''
- # XXX use caching method
# we have to figure out which html format to use
tgts=[x[1] for x in self.getTargetFormatItemList() if x[1].startswith('html')]
if len(tgts)==0:
@@ -398,7 +379,7 @@
break
z.close()
cs.close()
- return h
+ return stripHtml(h)
security.declareProtected(Permissions.View,'getTargetFile')
def getTargetFile(self,format,REQUEST=None):
@@ -409,7 +390,7 @@
return self.returnMessage('can not convert to '+format+' for some reason')
try:
self.makeFile(format)
- return self.cached_mime[format],self.cached_data[format]
+ return self.cacheGet(format)
except ConvertionError,e:
return self.returnMessage(str(e))
@@ -420,19 +401,6 @@
"""
if not self.hasOOfile():return True
return self.getLastUploadTime() > self.getLastConvertTime()
-
- security.declareProtected(Permissions.View,'hasFileCache')
- def hasFileCache(self,format):
- """
- Checks whether we have a version in this format
- """
- return self.cached_data.has_key(format)
-
- def getCacheTime(self,format):
- """
- Checks when if ever was the file produced
- """
- return self.cached_time.get(format,0)
security.declareProtected(Permissions.View,'isFileChanged')
def isFileChanged(self,format):
@@ -467,14 +435,15 @@
raise ConvertionError('needs conversion')
if self.isFileChanged(format):
try:
- self.cached_mime[format],self.cached_data[format]=self._makeFile(format)
+ mime,data=self._makeFile(format)
+ self.cacheSet(format,mime,data)
self._p_changed=1 # XXX not sure it is necessary
except xmlrpclib.Fault,e:
if REQUEST is not None:
return self.returnMessage('Problem: %s' % str(e))
else:
raise ConvertionError(str(e))
- self.cached_time[format]=DateTime()
+ self.cacheUpdate(format)
if REQUEST is not None:
return self.returnMessage('%s created' % format)
else:
@@ -492,31 +461,6 @@
kw=sp.run_generate(self.getOriginalFilename(),enc(self._unpackData(self.oo_data)),None,format)
#self.log('_makeFile',mime)
return kw['mime'],Pdata(dec(kw['data']))
-
- security.declareProtected(Permissions.View,'getCacheInfo')
- def getCacheInfo(self):
- """
- Get cache details as string (for debugging)
- """
- s='CACHE INFO:<br/><table><tr><td>format</td><td>size</td><td>time</td><td>is changed</td></tr>'
- #self.log('getCacheInfo',self.cached_time)
- #self.log('getCacheInfo',self.cached_data)
- for f in self.cached_time.keys():
- t=self.cached_time[f]
- data=self.cached_data.get(f)
- if data:
- if isinstance(data,str):
- ln=len(data)
- else:
- ln=0
- while data is not None:
- ln+=len(data.data)
- data=data.next
- else:
- ln='no data!!!'
- s+='<tr><td>%s</td><td>%s</td><td>%s</td><td>%s</td></tr>' % (f,str(ln),str(t),str(self.isFileChanged(f)))
- s+='</table>'
- return s
# make sure to call the right edit methods
_edit=File._edit
Modified: erp5/trunk/products/ERP5OOo/Document/PdfDocument.py
URL: http://svn.erp5.org/erp5/trunk/products/ERP5OOo/Document/PdfDocument.py?rev=9427&r1=9426&r2=9427&view=diff
==============================================================================
--- erp5/trunk/products/ERP5OOo/Document/PdfDocument.py (original)
+++ erp5/trunk/products/ERP5OOo/Document/PdfDocument.py Fri Aug 25 12:46:18 2006
@@ -30,12 +30,12 @@
from Products.CMFCore.WorkflowCore import WorkflowMethod
from Products.ERP5Type import Permissions, PropertySheet, Constraint, Interface
from Products.ERP5Type.Cache import CachingMethod
-from Products.ERP5OOo.Document.DMSFile import DMSFile
+from Products.ERP5OOo.Document.DMSFile import DMSFile, CachingMixin, stripHtml
import tempfile, os
-class PdfDocument(DMSFile):
+class PdfDocument(DMSFile, CachingMixin):
"""
PdfDocument - same as file, but has its own getSearchableText method
(converts via pdftotext)
@@ -68,9 +68,12 @@
"""
Used by the catalog for basic full text indexing
we get text content by using pdftotext
- but we have to do it only once
+ but we have to do it only once after upload
+ for simplicity we check only modification_date, which means we rebuild txt and html after every edit
+ but that shouldn't hurt too much
"""
- if hasattr(self,'data') and (force==1 or self.getTextContent() is None):
+ if hasattr(self,'data') and (force==1 or self.getCacheTime('txt')<self.getModificationDate() or self.getTextContent() is None):
+ self.log('PdfDocument','regenerating txt')
tmp=tempfile.NamedTemporaryFile()
tmp.write(self._unpackData(self.data))
tmp.seek(0)
@@ -79,26 +82,31 @@
self.setTextContent(r.read().replace('\n',' '))
tmp.close()
r.close()
+ self.cacheUpdate('txt')
return DMSFile.getSearchableText(self,md)
SearchableText=getSearchableText
- def getHtmlRepresentation(self):
+  def getHtmlRepresentation(self, force=0):
'''
get simplified html version to display
'''
- # XXX use caching method
if not hasattr(self,'data'):
return 'no data'
- tmp=tempfile.NamedTemporaryFile()
- tmp.write(self._unpackData(self.data))
- tmp.seek(0)
- cmd='pdftohtml -enc UTF-8 -stdout -noframes -i %s' % tmp.name
- r=os.popen(cmd)
- h=r.read()
- tmp.close()
- r.close()
- return h
+    # Run pdftohtml only when forced or when the cached html is older than
+    # the last modification; otherwise the cached copy is returned as is.
+    if force==1 or self.getCacheTime('html')<self.getModificationDate():
+      self.log('PdfDocument','regenerating html')
+      tmp=tempfile.NamedTemporaryFile()
+      try:
+        tmp.write(self._unpackData(self.data))
+        # pdftohtml opens the file by name, so the data has to be written
+        # out before the command is started
+        tmp.flush()
+        cmd='pdftohtml -enc UTF-8 -stdout -noframes -i %s' % tmp.name
+        r=os.popen(cmd)
+        try:
+          h=r.read()
+        finally:
+          r.close()
+      finally:
+        # closed (and thereby removed) even if the conversion failed
+        tmp.close()
+      h=stripHtml(h)
+      self.cacheSet('html',data=h)
+      self.cacheUpdate('html')
+    return self.cacheGet('html')[1]
# vim: syntax=python shiftwidth=2
More information about the Erp5-report
mailing list