[Erp5-report] r9403 - /erp5/trunk/products/ERP5OOo/Document/
nobody at svn.erp5.org
nobody at svn.erp5.org
Thu Aug 24 16:21:10 CEST 2006
Author: bartek
Date: Thu Aug 24 16:21:04 2006
New Revision: 9403
URL: http://svn.erp5.org?rev=9403&view=rev
Log:
major refactoring; fixed snapshot generation; plain text extraction from PdfDocument (req. pdftotext);
Added:
erp5/trunk/products/ERP5OOo/Document/PdfDocument.py
erp5/trunk/products/ERP5OOo/Document/__init__.py
Modified:
erp5/trunk/products/ERP5OOo/Document/DMSFile.py
erp5/trunk/products/ERP5OOo/Document/OOoDocument.py
Modified: erp5/trunk/products/ERP5OOo/Document/DMSFile.py
URL: http://svn.erp5.org/erp5/trunk/products/ERP5OOo/Document/DMSFile.py?rev=9403&r1=9402&r2=9403&view=diff
==============================================================================
--- erp5/trunk/products/ERP5OOo/Document/DMSFile.py (original)
+++ erp5/trunk/products/ERP5OOo/Document/DMSFile.py Thu Aug 24 16:21:04 2006
@@ -43,8 +43,7 @@
"""
Special base class, different from File only in that it can contain things
(like Role Definition, for example)
- Could (perhaps should) be a parent class for OOoDocument
- Should probably be located somewhere else
+ will be merged with File when WebDAV issues are solved
"""
# CMF Type Definition
meta_type = 'ERP5 DMS File'
@@ -70,19 +69,31 @@
_edit=File._edit
edit=File.edit
+ searchable_attrs=('title','description','id','reference','version',
+ 'short_title','keywords','subject','original_filename','source_project_title')
+
### Content indexing methods
security.declareProtected(Permissions.View, 'getSearchableText')
def getSearchableText(self, md=None):
- """\
+ """
Used by the catalog for basic full text indexing
- And so we end up with a strange hybrid of File and Document
- This is the same as in OOoDocument except that no text_content here
- Some people call it 'copy-and-paste programming'
"""
- searchable_attrs=('title','description','id','reference','version',
- 'short_title','keywords','subject','original_filename','source_project_title')
- searchable_text = ' '.join(map(lambda x: self.getProperty(x) or ' ',searchable_attrs))
+ searchable_text = ' '.join(map(lambda x: self.getProperty(x) or ' ',self.searchable_attrs))
return searchable_text
+
+ security.declarePrivate('_unpackData')
+ def _unpackData(self,data):
+ """
+ Unpack Pdata into string
+ """
+ if isinstance(data,str):
+ return data
+ else:
+ data_list=[]
+ while data is not None:
+ data_list.append(data.data)
+ data=data.next
+ return ''.join(data_list)
SearchableText=getSearchableText
@@ -93,8 +104,8 @@
if fname:
content_type,enc=mimetypes.guess_type(fname)
if content_type is not None:
- self.content_type=content_type
- return content_type
+ self.content_type=content_type
+ return content_type
# BG copied from File in case
Modified: erp5/trunk/products/ERP5OOo/Document/OOoDocument.py
URL: http://svn.erp5.org/erp5/trunk/products/ERP5OOo/Document/OOoDocument.py?rev=9403&r1=9402&r2=9403&view=diff
==============================================================================
--- erp5/trunk/products/ERP5OOo/Document/OOoDocument.py (original)
+++ erp5/trunk/products/ERP5OOo/Document/OOoDocument.py Thu Aug 24 16:21:04 2006
@@ -35,21 +35,20 @@
from Products.ERP5Type.Cache import CachingMethod
from Products.ERP5.Document.File import File
from Products.ERP5Type.XMLObject import XMLObject
+from Products.ERP5OOo.Document.DMSFile import DMSFile
from DateTime import DateTime
-import xmlrpclib, base64, mimetypes, re, zipfile, cStringIO
+import xmlrpclib, base64, re, zipfile, cStringIO
# to overwrite WebDAV methods
from Products.CMFDefault.File import File as CMFFile
from Products.CMFCore.utils import getToolByName
-mimetypes.init()
-
enc=base64.encodestring
dec=base64.decodestring
class ConvertionError(Exception):pass
#class OOoDocument(File):
-class OOoDocument(XMLObject,File):
+class OOoDocument(DMSFile):
"""
A file document able to convert OOo compatible files to
any OOo supported format, to capture metadata and to
@@ -121,20 +120,7 @@
rx_strip=re.compile('<[^>]*?>',re.DOTALL|re.MULTILINE)
rx_compr=re.compile('\s+')
- ### Content indexing methods
- security.declareProtected(Permissions.View, 'getSearchableText')
- def getSearchableText(self, md=None):
- """\
- Used by the catalog for basic full text indexing
- And so we end up with a strange hybrid of File and Document
- """
- searchable_attrs=('title','description','id','text_content','reference','version',
- 'short_title','keywords','subject','original_filename','source_project_title')
- searchable_text = ' '.join(map(lambda x: self.getProperty(x) or ' ',searchable_attrs))
- return searchable_text
-
- SearchableText=getSearchableText
-
+ searchable_attrs=DMSFile.searchable_attrs+('text_content',)
security.declareProtected(Permissions.ModifyPortalContent,'clearCache')
def clearCache(self):
@@ -313,20 +299,6 @@
data=self.oo_data
return data
- security.declarePrivate('_unpackData')
- def _unpackData(self,data):
- """
- Unpack Pdata into string
- """
- if isinstance(data,str):
- return data
- else:
- data_list=[]
- while data is not None:
- data_list.append(data.data)
- data=data.next
- return ''.join(data_list)
-
security.declareProtected(Permissions.View,'hasFile')
def hasFile(self):
"""
@@ -373,8 +345,15 @@
return self.returnMessage('already has a snapshot')
raise ConvertionError('already has a snapshot')
# making snapshot
- self.makeFile('pdf')
- self.snapshot=Pdata(self._unpackData(self.cached_data['pdf'])) # XXX - use propertysheet accessors
+ # we have to figure out which pdf format to use
+ tgts=[x[1] for x in self.getTargetFormatItemList() if x[1].endswith('pdf')]
+ if len(tgts)>1:
+ return self.returnMessage('multiple pdf formats found - this shouldnt happen')
+ if len(tgts)==0:
+ return self.returnMessage('no pdf format found')
+ fmt=tgts[0]
+ self.makeFile(fmt)
+ self.snapshot=Pdata(self._unpackData(self.cached_data[fmt])) # XXX - use propertysheet accessors
return self.returnMessage('snapshot created')
security.declareProtected(Permissions.View,'getSnapshot')
@@ -515,19 +494,6 @@
s+='</table>'
return s
- # this will go out after refactoring (will be inherited from DMS File
- # and eventually from File
- security.declareProtected(Permissions.ModifyPortalContent, 'guessMimeType')
- def guessMimeType(self,fname=''):
- '''get mime type from file name'''
- if fname=='':fname=self.getOriginalFilename()
- if fname:
- content_type,enc=mimetypes.guess_type(fname)
- if content_type is not None:
- self.content_type=content_type
- return content_type
-
-
# make sure to call the right edit methods
_edit=File._edit
edit=File.edit
Added: erp5/trunk/products/ERP5OOo/Document/PdfDocument.py
URL: http://svn.erp5.org/erp5/trunk/products/ERP5OOo/Document/PdfDocument.py?rev=9403&view=auto
==============================================================================
--- erp5/trunk/products/ERP5OOo/Document/PdfDocument.py (added)
+++ erp5/trunk/products/ERP5OOo/Document/PdfDocument.py Thu Aug 24 16:21:04 2006
@@ -1,0 +1,88 @@
+
+##############################################################################
+#
+# Copyright (c) 2002-2006 Nexedi SARL and Contributors. All Rights Reserved.
+#
+# WARNING: This program as such is intended to be used by professional
+# programmers who take the whole responsibility of assessing all potential
+# consequences resulting from its eventual inadequacies and bugs.
+# End users who are looking for a ready-to-use solution with commercial
+# guarantees and support are strongly advised to contract a Free Software
+# Service Company
+#
+# This program is Free Software; you can redistribute it and/or
+# modify it under the terms of the GNU General Public License
+# as published by the Free Software Foundation; either version 2
+# of the License, or (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, write to the Free Software
+# Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
+#
+##############################################################################
+
+from AccessControl import ClassSecurityInfo
+from Products.CMFCore.WorkflowCore import WorkflowMethod
+from Products.ERP5Type import Permissions, PropertySheet, Constraint, Interface
+from Products.ERP5Type.Cache import CachingMethod
+from Products.ERP5OOo.Document.DMSFile import DMSFile
+
+import tempfile, os
+
+
+class PdfDocument(DMSFile):
+ """
+ PdfDocument - same as file, but has its own getSearchableText method
+ (converts via pdftotext)
+ """
+ # CMF Type Definition
+ meta_type = 'ERP5 Pdf Document'
+ portal_type = 'Pdf Document'
+ isPortalContent = 1
+ isRADContent = 1
+
+ # Declarative security
+ security = ClassSecurityInfo()
+ security.declareObjectProtected(Permissions.AccessContentsInformation)
+
+ # Default Properties
+ property_sheets = ( PropertySheet.Base
+ , PropertySheet.CategoryCore
+ , PropertySheet.DublinCore
+ , PropertySheet.Version
+ , PropertySheet.Reference
+ , PropertySheet.DMSFile
+ , PropertySheet.Document
+ )
+
+ searchable_attrs=DMSFile.searchable_attrs+('text_content',)
+
+ ### Content indexing methods
+ security.declareProtected(Permissions.View, 'getSearchableText')
+ def getSearchableText(self, md=None, force=0):
+ """
+ Used by the catalog for basic full text indexing
+ we get text content by using pdftotext
+ but we have to do it only once
+ """
+ if hasattr(self,'data') and (force==1 or self.getTextContent() is None):
+ tmp=tempfile.NamedTemporaryFile()
+ tmp.write(self._unpackData(self.data))
+ tmp.seek(0)
+ cmd='pdftotext -layout -enc UTF-8 -nopgbrk %s -' % tmp.name
+ r=os.popen(cmd)
+ self.setTextContent(r.read().replace('\n',' '))
+ tmp.close()
+ r.close()
+ return DMSFile.getSearchableText(self,md)
+
+ SearchableText=getSearchableText
+
+
+# vim: syntax=python shiftwidth=2
+
Added: erp5/trunk/products/ERP5OOo/Document/__init__.py
URL: http://svn.erp5.org/erp5/trunk/products/ERP5OOo/Document/__init__.py?rev=9403&view=auto
==============================================================================
(empty)
More information about the Erp5-report
mailing list