[Erp5-report] r9403 - /erp5/trunk/products/ERP5OOo/Document/

nobody at svn.erp5.org nobody at svn.erp5.org
Thu Aug 24 16:21:10 CEST 2006


Author: bartek
Date: Thu Aug 24 16:21:04 2006
New Revision: 9403

URL: http://svn.erp5.org?rev=9403&view=rev
Log:
Major refactoring; fixed snapshot generation; added plain-text extraction for PdfDocument (requires pdftotext).

Added:
    erp5/trunk/products/ERP5OOo/Document/PdfDocument.py
    erp5/trunk/products/ERP5OOo/Document/__init__.py
Modified:
    erp5/trunk/products/ERP5OOo/Document/DMSFile.py
    erp5/trunk/products/ERP5OOo/Document/OOoDocument.py

Modified: erp5/trunk/products/ERP5OOo/Document/DMSFile.py
URL: http://svn.erp5.org/erp5/trunk/products/ERP5OOo/Document/DMSFile.py?rev=9403&r1=9402&r2=9403&view=diff
==============================================================================
--- erp5/trunk/products/ERP5OOo/Document/DMSFile.py (original)
+++ erp5/trunk/products/ERP5OOo/Document/DMSFile.py Thu Aug 24 16:21:04 2006
@@ -43,8 +43,7 @@
   """
   Special base class, different from File only in that it can contain things 
   (like Role Definition, for example)
-  Could (perhaps should) be a parent class for OOoDocument
-  Should probably be located somewhere else
+  will be merged with File when WebDAV issues are solved
   """
   # CMF Type Definition
   meta_type = 'ERP5 DMS File'
@@ -70,19 +69,31 @@
   _edit=File._edit
   edit=File.edit
 
+  searchable_attrs=('title','description','id','reference','version',
+      'short_title','keywords','subject','original_filename','source_project_title')
+
   ### Content indexing methods
   security.declareProtected(Permissions.View, 'getSearchableText')
   def getSearchableText(self, md=None):
-    """\
+    """
     Used by the catalog for basic full text indexing
-    And so we end up with a strange hybrid of File and Document
-    This is the same as in OOoDocument except that no text_content here
-    Some people call it 'copy-and-paste programming'
     """
-    searchable_attrs=('title','description','id','reference','version',
-        'short_title','keywords','subject','original_filename','source_project_title')
-    searchable_text = ' '.join(map(lambda x: self.getProperty(x) or ' ',searchable_attrs))
+    searchable_text = ' '.join(map(lambda x: self.getProperty(x) or ' ',self.searchable_attrs))
     return searchable_text
+
+  security.declarePrivate('_unpackData')
+  def _unpackData(self,data):
+    """
+    Unpack Pdata into string
+    """
+    if isinstance(data,str):
+      return data
+    else:
+      data_list=[]
+      while data is not None:
+        data_list.append(data.data)
+        data=data.next
+      return ''.join(data_list)
 
   SearchableText=getSearchableText
 
@@ -93,8 +104,8 @@
     if fname:
       content_type,enc=mimetypes.guess_type(fname)
       if content_type is not None:
-	self.content_type=content_type
-	return content_type
+        self.content_type=content_type
+    return content_type
 
 
   # BG copied from File in case

Modified: erp5/trunk/products/ERP5OOo/Document/OOoDocument.py
URL: http://svn.erp5.org/erp5/trunk/products/ERP5OOo/Document/OOoDocument.py?rev=9403&r1=9402&r2=9403&view=diff
==============================================================================
--- erp5/trunk/products/ERP5OOo/Document/OOoDocument.py (original)
+++ erp5/trunk/products/ERP5OOo/Document/OOoDocument.py Thu Aug 24 16:21:04 2006
@@ -35,21 +35,20 @@
 from Products.ERP5Type.Cache import CachingMethod
 from Products.ERP5.Document.File import File
 from Products.ERP5Type.XMLObject import XMLObject
+from Products.ERP5OOo.Document.DMSFile import DMSFile
 from DateTime import DateTime
-import xmlrpclib, base64, mimetypes, re, zipfile, cStringIO
+import xmlrpclib, base64, re, zipfile, cStringIO
 # to overwrite WebDAV methods
 from Products.CMFDefault.File import File as CMFFile
 from Products.CMFCore.utils import getToolByName
 
-mimetypes.init()
-
 enc=base64.encodestring
 dec=base64.decodestring
 
 class ConvertionError(Exception):pass
 
 #class OOoDocument(File):
-class OOoDocument(XMLObject,File):
+class OOoDocument(DMSFile):
   """
     A file document able to convert OOo compatible files to
     any OOo supported format, to capture metadata and to
@@ -121,20 +120,7 @@
   rx_strip=re.compile('<[^>]*?>',re.DOTALL|re.MULTILINE)
   rx_compr=re.compile('\s+')
 
-  ### Content indexing methods
-  security.declareProtected(Permissions.View, 'getSearchableText')
-  def getSearchableText(self, md=None):
-    """\
-    Used by the catalog for basic full text indexing
-    And so we end up with a strange hybrid of File and Document
-    """
-    searchable_attrs=('title','description','id','text_content','reference','version',
-        'short_title','keywords','subject','original_filename','source_project_title')
-    searchable_text = ' '.join(map(lambda x: self.getProperty(x) or ' ',searchable_attrs))
-    return searchable_text
-
-  SearchableText=getSearchableText
-
+  searchable_attrs=DMSFile.searchable_attrs+('text_content',)
 
   security.declareProtected(Permissions.ModifyPortalContent,'clearCache')
   def clearCache(self):
@@ -313,20 +299,6 @@
     data=self.oo_data
     return data
 
-  security.declarePrivate('_unpackData')
-  def _unpackData(self,data):
-    """
-    Unpack Pdata into string
-    """
-    if isinstance(data,str):
-      return data
-    else:
-      data_list=[]
-      while data is not None:
-        data_list.append(data.data)
-        data=data.next
-      return ''.join(data_list)
-
   security.declareProtected(Permissions.View,'hasFile')
   def hasFile(self):
     """
@@ -373,8 +345,15 @@
         return self.returnMessage('already has a snapshot')
       raise ConvertionError('already has a snapshot')
     # making snapshot
-    self.makeFile('pdf')
-    self.snapshot=Pdata(self._unpackData(self.cached_data['pdf']))  # XXX - use propertysheet accessors
+    # we have to figure out which pdf format to use
+    tgts=[x[1] for x in self.getTargetFormatItemList() if x[1].endswith('pdf')]
+    if len(tgts)>1:
+      return self.returnMessage('multiple pdf formats found - this shouldnt happen')
+    if len(tgts)==0:
+      return self.returnMessage('no pdf format found')
+    fmt=tgts[0]
+    self.makeFile(fmt)
+    self.snapshot=Pdata(self._unpackData(self.cached_data[fmt]))  # XXX - use propertysheet accessors
     return self.returnMessage('snapshot created')
 
   security.declareProtected(Permissions.View,'getSnapshot')
@@ -515,19 +494,6 @@
     s+='</table>'
     return s
 
-  # this will go out after refactoring (will be inherited from DMS File
-  # and eventually from File
-  security.declareProtected(Permissions.ModifyPortalContent, 'guessMimeType')
-  def guessMimeType(self,fname=''):
-    '''get mime type from file name'''
-    if fname=='':fname=self.getOriginalFilename()
-    if fname:
-      content_type,enc=mimetypes.guess_type(fname)
-      if content_type is not None:
-	self.content_type=content_type
-	return content_type
-
-
   # make sure to call the right edit methods
   _edit=File._edit
   edit=File.edit

Added: erp5/trunk/products/ERP5OOo/Document/PdfDocument.py
URL: http://svn.erp5.org/erp5/trunk/products/ERP5OOo/Document/PdfDocument.py?rev=9403&view=auto
==============================================================================
--- erp5/trunk/products/ERP5OOo/Document/PdfDocument.py (added)
+++ erp5/trunk/products/ERP5OOo/Document/PdfDocument.py Thu Aug 24 16:21:04 2006
@@ -1,0 +1,88 @@
+
+##############################################################################
+#
+# Copyright (c) 2002-2006 Nexedi SARL and Contributors. All Rights Reserved.
+#
+# WARNING: This program as such is intended to be used by professional
+# programmers who take the whole responsibility of assessing all potential
+# consequences resulting from its eventual inadequacies and bugs.
+# End users who are looking for a ready-to-use solution with commercial
+# guarantees and support are strongly advised to contract a Free Software
+# Service Company
+#
+# This program is Free Software; you can redistribute it and/or
+# modify it under the terms of the GNU General Public License
+# as published by the Free Software Foundation; either version 2
+# of the License, or (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, write to the Free Software
+# Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.
+#
+##############################################################################
+
+from AccessControl import ClassSecurityInfo
+from Products.CMFCore.WorkflowCore import WorkflowMethod
+from Products.ERP5Type import Permissions, PropertySheet, Constraint, Interface
+from Products.ERP5Type.Cache import CachingMethod
+from Products.ERP5OOo.Document.DMSFile import DMSFile
+
+import tempfile, os
+
+
+class PdfDocument(DMSFile):
+  """
+  PdfDocument - same as file, but has its own getSearchableText method
+  (converts via pdftotext)
+  """
+  # CMF Type Definition
+  meta_type = 'ERP5 Pdf Document'
+  portal_type = 'Pdf Document'
+  isPortalContent = 1
+  isRADContent = 1
+
+  # Declarative security
+  security = ClassSecurityInfo()
+  security.declareObjectProtected(Permissions.AccessContentsInformation)
+
+  # Default Properties
+  property_sheets = ( PropertySheet.Base
+                    , PropertySheet.CategoryCore
+                    , PropertySheet.DublinCore
+                    , PropertySheet.Version
+                    , PropertySheet.Reference
+                    , PropertySheet.DMSFile
+                    , PropertySheet.Document
+                    )
+
+  searchable_attrs=DMSFile.searchable_attrs+('text_content',)
+
+  ### Content indexing methods
+  security.declareProtected(Permissions.View, 'getSearchableText')
+  def getSearchableText(self, md=None, force=0):
+    """
+    Used by the catalog for basic full text indexing
+    we get text content by using pdftotext
+    but we have to do it only once
+    """
+    if hasattr(self,'data') and (force==1 or self.getTextContent() is None):
+      tmp=tempfile.NamedTemporaryFile()
+      tmp.write(self._unpackData(self.data))
+      tmp.seek(0)
+      cmd='pdftotext -layout -enc UTF-8 -nopgbrk %s -' % tmp.name
+      r=os.popen(cmd)
+      self.setTextContent(r.read().replace('\n',' '))
+      tmp.close()
+      r.close()
+    return DMSFile.getSearchableText(self,md)
+
+  SearchableText=getSearchableText
+
+
+# vim: syntax=python shiftwidth=2 
+

Added: erp5/trunk/products/ERP5OOo/Document/__init__.py
URL: http://svn.erp5.org/erp5/trunk/products/ERP5OOo/Document/__init__.py?rev=9403&view=auto
==============================================================================
    (empty)




More information about the Erp5-report mailing list