[Erp5-report] r12211 - /erp5/trunk/products/ERP5/Document/PDFDocument.py
nobody at svn.erp5.org
nobody at svn.erp5.org
Mon Jan 22 17:08:26 CET 2007
Author: bartek
Date: Mon Jan 22 17:08:21 2007
New Revision: 12211
URL: http://svn.erp5.org?rev=12211&view=rev
Log:
fixed and improved conversion and generation of txt and html formats
Modified:
erp5/trunk/products/ERP5/Document/PDFDocument.py
Modified: erp5/trunk/products/ERP5/Document/PDFDocument.py
URL: http://svn.erp5.org/erp5/trunk/products/ERP5/Document/PDFDocument.py?rev=12211&r1=12210&r2=12211&view=diff
==============================================================================
--- erp5/trunk/products/ERP5/Document/PDFDocument.py (original)
+++ erp5/trunk/products/ERP5/Document/PDFDocument.py Mon Jan 22 17:08:21 2007
@@ -33,6 +33,7 @@
from Products.ERP5.Document.Image import Image
from Products.ERP5.Document.File import File, stripHtml
from Products.ERP5.Document.Document import ConversionCacheMixin
+from Products.CMFCore.utils import getToolByName
from zLOG import LOG
import tempfile, os, glob, zipfile, cStringIO, re
@@ -62,14 +63,24 @@
, PropertySheet.Version
, PropertySheet.Reference
, PropertySheet.Document
+ , PropertySheet.TextDocument
, PropertySheet.Data
)
def index_html(self, REQUEST, RESPONSE, format, force=0):
"""
- Returns data in the appropriate format
+ Returns data in the appropriate format (graphical)
+ it is always a zip because multi-page pdfs are converted into a zip
+ file of many images
"""
+ if format == 'html':
+ RESPONSE.setHeader('Content-Type', 'text/html;charset=UTF-8')
+ return self.getHtmlRepresentation(force)
+ if format == 'txt':
+ RESPONSE.setHeader('Content-Type', 'text/plain;charset=UTF-8')
+ self._convertToText(force)
+ return self.getTextContent()
mime = 'image/'+format.lower()
if force or not self.hasConversion(format = format):
self.setConversion(self._makeFile(format), 'application/zip', format=format)
@@ -110,25 +121,45 @@
security.declareProtected(Permissions.View, 'getSearchableText')
def getSearchableText(self, md=None, force=0):
"""
- Used by the catalog for basic full text indexing
- we get text content by using pdftotext
- but we have to do it only once after uplad
- for simplicity we check only modification_date, which means we rebuild txt and html after every edit
- but that shouldn't hurt too much
+ Used by the catalog for basic full text indexing
+ conditionally convert pdf to text
"""
- if hasattr(self, 'data') and (force == 1 or not self.hasConversion(format = 'txt') or self.getTextContent() is None):
+ self._convertToText(force)
+ return File.getSearchableText(self, md)
+
+ security.declarePrivate('_convertToText')
+ def _convertToText(self, force):
+ """
+ Private implementation method.
+ If we don't have txt cache or we are forced to convert, we try to do it
+ using system pdftotext utility. We set the result as text_content property.
+ We mark it in cache as done, even if we fail, so we don't keep trying if it
+ doesn't work.
+ """
+ portal_workflow = getToolByName(self, 'portal_workflow')
+ if hasattr(self, 'data') and (force == 1 or not self.hasConversion(format = 'txt')):
# XXX-JPS accessing attribute data is bad
self.log('PdfDocument', 'regenerating txt')
- tmp = tempfile.NamedTemporaryFile()
- tmp.write(self._unpackData(self.data))
- tmp.seek(0)
- cmd = 'pdftotext -layout -enc UTF-8 -nopgbrk %s -' % tmp.name
- r = os.popen(cmd)
- self.setTextContent(r.read().replace('\n', ' '))
- tmp.close()
- r.close()
- self.setConversion('empty', format = 'txt') # we don't need to store it twice, just mark we have it
- return File.getSearchableText(self, md)
+ try:
+ try:
+ tmp = tempfile.NamedTemporaryFile()
+ tmp.write(self._unpackData(self.data))
+ tmp.seek(0)
+ cmd = 'pdftotext -layout -enc UTF-8 -nopgbrk %s -' % tmp.name
+ r = os.popen(cmd)
+ self.setTextContent(r.read().replace('\n', ' '))
+ tmp.close()
+ r.close()
+ except Exception, e:
+ self.log(str(e))
+ msg = 'Conversion to text failed: ' + str(e)
+ else:
+ msg = 'Converted to text'
+ finally:
+ portal_workflow.doActionFor(self, 'process', comment=msg)
+ # we don't need to store it twice, just mark we have it (or rather we already tried)
+ # we try only once
+ self.setConversion('empty', format = 'txt')
SearchableText=getSearchableText
@@ -136,22 +167,29 @@
def getHtmlRepresentation(self, force=0):
'''
get simplified html version to display
+ If we fail to convert, we set workflow message and put error message
+ as html preview so that the user knows what's going on
'''
+ portal_workflow = getToolByName(self, 'portal_workflow')
if not hasattr(self, 'data'):
return 'no data'
if force==1 or not self.hasConversion(format = 'html'):
- self.log('PDF', 'regenerating html')
- tmp = tempfile.NamedTemporaryFile()
- tmp.write(self._unpackData(self.data))
- tmp.seek(0)
- cmd = 'pdftohtml -enc UTF-8 -stdout -noframes -i %s' % tmp.name
- r = os.popen(cmd)
- h = r.read()
- tmp.close()
- r.close()
- h = stripHtml(h)
+ try:
+ self.log('PDF', 'regenerating html')
+ tmp = tempfile.NamedTemporaryFile()
+ tmp.write(self._unpackData(self.data))
+ tmp.seek(0)
+ cmd = 'pdftohtml -enc UTF-8 -stdout -noframes -i %s' % tmp.name
+ r = os.popen(cmd)
+ h = r.read()
+ tmp.close()
+ r.close()
+ h = stripHtml(h)
+ except Exception, e:
+ msg = 'Could not convert to html: ' + str(e)
+ h = msg
+ portal_workflow.doActionFor(self, 'process', comment=msg)
self.setConversion(h, format = 'html')
- self.updateConversion(format = 'html')
return self.getConversion(format = 'html')[1]
# vim: syntax=python shiftwidth=2
More information about the Erp5-report
mailing list