[Erp5-report] r13628 - /erp5/trunk/products/ERP5/Document/PDFDocument.py

Mon Mar 26 13:49:36 CEST 2007

Author: jp
Date: Mon Mar 26 13:49:35 2007
New Revision: 13628

URL: http://svn.erp5.org?rev=13628&view=rev
Log:
Code review and refactoring based on Document API.

Modified:
    erp5/trunk/products/ERP5/Document/PDFDocument.py

Modified: erp5/trunk/products/ERP5/Document/PDFDocument.py
URL: http://svn.erp5.org/erp5/trunk/products/ERP5/Document/PDFDocument.py?rev=13628&r1=13627&r2=13628&view=diff
==============================================================================

--- erp5/trunk/products/ERP5/Document/PDFDocument.py (original)
+++ erp5/trunk/products/ERP5/Document/PDFDocument.py Mon Mar 26 13:49:35 2007
@@ -1,4 +1,3 @@
-
 ##############################################################################
 #
 # Copyright (c) 2002-2006 Nexedi SARL and Contributors. All Rights Reserved.
@@ -31,23 +30,20 @@
 from Products.ERP5Type import Permissions, PropertySheet, Constraint, Interface
 from Products.ERP5Type.Cache import CachingMethod
 from Products.ERP5.Document.Image import Image
-from Products.ERP5.Document.File import File, stripHtml
 from Products.ERP5.Document.Document import ConversionCacheMixin
 from Products.CMFCore.utils import getToolByName
 from zLOG import LOG
 
-import tempfile, os, glob, zipfile, cStringIO, re
+import tempfile, os, cStringIO
 
-
-class PDFDocument(File, ConversionCacheMixin):
+class PDFDocument(Image, ConversionCacheMixin):
   """
-  PdfDocument - same as file, but has its own getSearchableText method
-  (converts via pdftotext)
-  in effect it has two separate caches - from CachingMixin for txt and html
-  and for image formats from Image
+  PDFDocument is a subclass of Image which is able to
+  extract text content from a PDF file either as text
+  or as HTML.
   """
   # CMF Type Definition
-  meta_type = 'ERP5 PDF'
+  meta_type = 'ERP5 PDF Document'
   portal_type = 'PDF'
   isPortalContent = 1
   isRADContent = 1
@@ -58,17 +54,20 @@
 
   # Default Properties
   property_sheets = ( PropertySheet.Base
+                    , PropertySheet.XMLObject
                     , PropertySheet.CategoryCore
                     , PropertySheet.DublinCore
                     , PropertySheet.Version
                     , PropertySheet.Reference
                     , PropertySheet.Document
-                    , PropertySheet.TextDocument
                     , PropertySheet.Data
+                    , PropertySheet.ExternalDocument
+                    , PropertySheet.Url
+                    , PropertySheet.Periodicity
                     )
 
-
-  def index_html(self, REQUEST, RESPONSE, format=None, force=0):
+  security.declareProtected(Permissions.View, 'index_html')
+  def index_html(self, REQUEST, RESPONSE, display=None, format='', quality=75, resolution=None):
     """
       Returns data in the appropriate format (graphical)
       it is always a zip because multi-page pdfs are converted into a zip
@@ -77,126 +76,92 @@
     if format is None:
       RESPONSE.setHeader('Content-Type', 'application/pdf')
       return self._unpackData(self.data)
+    if format in ('html', 'txt', 'text'):
+      mime, data = self.convert(format)
+      RESPONSE.setHeader('Content-Length', len(data))
+      RESPONSE.setHeader('Content-Type', '%s;charset=UTF-8' % mime)
+      RESPONSE.setHeader('Accept-Ranges', 'bytes')
+      return data
+    return Image.index_html(self, REQUEST, RESPONSE, display=display,
+                            format=format, quality=quality, resolution=resolution)
+
+  # Conversion API
+  security.declareProtected(Permissions.ModifyPortalContent, 'convert')
+  def convert(self, format, **kw):
+    """
+    Implementation of conversion for PDF files
+    """
     if format == 'html':
-      RESPONSE.setHeader('Content-Type', 'text/html;charset=UTF-8')
-      return self.getHtmlRepresentation(force)
-    if format == 'txt':
-      RESPONSE.setHeader('Content-Type', 'text/plain;charset=UTF-8')
-      self._convertToText(force)
-      return self.getTextContent()
-    mime = 'image/'+format.lower()
-    if force or not self.hasConversion(format = format):
-      self.setConversion(self._makeFile(format), 'application/zip', format=format)
-    RESPONSE.setHeader('Content-Type', 'application/zip')
-    return self.getConversion(format = format)
+      if not self.hasConversion(format=format):
+        data = self._convertToHTML()
+        self.setConversion(data, mime='text/html', format=format)
+      return self.getConversion(format=format)
+    elif format in ('txt', 'text'):
+      if not self.hasConversion(format='txt'):
+        data = self._convertToText()
+        self.setConversion(data, mime='text/plain', format='txt')
+      return self.getConversion(format=format)
+    else:
+      return Image.convert(self, format, **kw)
 
-  def _makeFile(self,format):
-    tempfile.tempdir = os.path.join(os.getenv('INSTANCE_HOME'), 'tmp')
-    os.putenv('TMPDIR', '/tmp') # because if we run zope as root, we have /root/tmp here and convert goes crazy
-    if not os.path.exists(tempfile.tempdir):
-      os.mkdir(tempfile.tempdir, 0775)
-    fr = tempfile.mktemp(suffix='.pdf')
-    to = tempfile.mktemp(suffix = '.' + format)
-    file_fr = open(fr, 'w')
-    file_fr.write(self._unpackData(self.data))
-    file_fr.close()
-    cmd = 'convert %s %s' % (fr, to)
-    os.system(cmd)
-    # pack it
-    f = cStringIO.StringIO()
-    z = zipfile.ZipFile(f, 'a')
-    for fname in glob.glob(to.replace('.', '*')):
-      base = os.path.basename(fname)
-      pg = re.match('.*?(\d*)\.'+format, base).groups()
-      if pg:
-        pg = pg[0]
-        arcname = '%s/page-%s.%s' % (format, pg, format)
-      else:
-        arcname = base
-      z.write(fname, arcname)
-    z.close()
-    f.seek(0)
-    return f.read()
-
-  searchable_property_list = File.searchable_property_list + ('text_content',)
-
-  ### Content indexing methods
-  security.declareProtected(Permissions.View, 'getSearchableText')
-  def getSearchableText(self, md=None, force=0):
+  security.declareProtected(Permissions.ModifyPortalContent, 'populateContent')
+  def populateContent(self):
     """
-      Used by the catalog for basic full text indexing
-      conditionally convert pdf to text
+      Convert each page to an Image and populate the
+      PDF directory with converted images. May be useful
+      to provide online PDF reader
     """
-    self._convertToText(force)
-    return File.getSearchableText(self, md)
+    raise NotImplementedError
 
   security.declarePrivate('_convertToText')
-  def _convertToText(self, force):
+  def _convertToText(self):
     """
-      Private implementation method.
-      If we don't have txt cache or we are forced to convert, we try to do it
-      using system pdftotext utility. We set the result as text_content property.
-      We mark it in cache as done, even if we fail, so we don't keep trying if it
-      doesn't work.
+      Convert the PDF text content to text with pdftotext
     """
-    if hasattr(self, 'data') and (force == 1 or not self.hasConversion(format = 'txt')):
-      # XXX-JPS accessing attribute data is bad
-      self.log('PdfDocument', 'regenerating txt')
-      try:
-        try:
-          tmp = tempfile.NamedTemporaryFile()
-          tmp.write(self._unpackData(self.data))
-          tmp.seek(0)
-          cmd = 'pdftotext -layout -enc UTF-8 -nopgbrk %s -' % tmp.name
-          r = os.popen(cmd)
-          self.setTextContent(r.read().replace('\n', ' '))
-          tmp.close()
-          r.close()
-        except Exception, e:
-          self.log(str(e))
-          msg = 'Conversion to text failed: ' + str(e)
-        else:
-          msg = 'Converted to text'
-      finally:
-        self.processFile(comment=msg)
-        # we don't need to store it twice, just mark we have it (or rather we already tried)
-        # we try only once
-        self.setConversion('empty', format = 'txt') 
+    tmp = tempfile.NamedTemporaryFile()
+    tmp.write(self._unpackData(self.data))
+    tmp.seek(0)
+    cmd = 'pdftotext -layout -enc UTF-8 -nopgbrk %s -' % tmp.name
+    r = os.popen(cmd)
+    h = r.read()
+    tmp.close()
+    r.close()
+    return h
 
-  SearchableText=getSearchableText
+  security.declarePrivate('_convertToHTML')
+  def _convertToHTML(self):
+    """
+    Convert the PDF text content to HTML with pdftohtml
+    """
+    tmp = tempfile.NamedTemporaryFile()
+    tmp.write(self._unpackData(self.data))
+    tmp.seek(0)
+    cmd = 'pdftohtml -enc UTF-8 -stdout -noframes -i %s' % tmp.name
+    r = os.popen(cmd)
+    h = r.read()
+    tmp.close()
+    r.close()
+    h = h.replace('<BODY bgcolor="#A0A0A0"', '<BODY ') # Quick hack to remove bg color - XXX
+    return h
 
-  security.declarePrivate('_convertToBase')
-  def _convertToBase(self):
-    self._convertToText(force=1)
-
-  security.declareProtected(Permissions.View, 'getHtmlRepresentation')
-  def getHtmlRepresentation(self, force=0):
-    '''
-    get simplified html version to display
-    If we fail to convert, we set workflow message and put error message
-    as html preview so that the user knows what's going on
-    '''
-    portal_workflow = getToolByName(self, 'portal_workflow')
-    if not hasattr(self, 'data'):
-      return 'no data'
-    if force==1 or not self.hasConversion(format = 'html'):
-      try:
-        self.log('PDF', 'regenerating html')
-        tmp = tempfile.NamedTemporaryFile()
-        tmp.write(self._unpackData(self.data))
-        tmp.seek(0)
-        cmd = 'pdftohtml -enc UTF-8 -stdout -noframes -i %s' % tmp.name
-        r = os.popen(cmd)
-        h = r.read()
-        tmp.close()
-        r.close()
-        h = stripHtml(h)
-      except Exception, e:
-        msg = 'Could not convert to html: ' + str(e)
-        h = msg
-        portal_workflow.doActionFor(self, 'process', comment=msg)
-      self.setConversion(h, format = 'html')
-    return self.getConversion(format = 'html')[1]
-
-# vim: syntax=python shiftwidth=2 
-
+  security.declareProtected(Permissions.AccessContentsInformation, 'getContentInformation')
+  def getContentInformation(self):
+    """
+    Returns the information about the PDF document with
+    pdfinfo.
+    """
+    tmp = tempfile.NamedTemporaryFile()
+    tmp.write(self._unpackData(self.data))
+    tmp.seek(0)
+    cmd = 'pdfinfo -meta -box %s' % tmp.name
+    r = os.popen(cmd)
+    h = r.read()
+    tmp.close()
+    r.close()
+    result = {}
+    for line in h.splitlines():
+      item_list = line.split(':')
+      key = item_list[0].strip()
+      value = ':'.join(item_list[1:]).strip()
+      result[key] = value
+    return result