[Erp5-report] r24314 - /erp5/trunk/products/ERP5/Document/PDFDocument.py
nobody at svn.erp5.org
nobody at svn.erp5.org
Fri Oct 24 11:46:43 CEST 2008
Author: romain
Date: Fri Oct 24 11:46:36 2008
New Revision: 24314
URL: http://svn.erp5.org?rev=24314&view=rev
Log:
Allow pdf to text conversion by using portal_transforms.
Modified:
erp5/trunk/products/ERP5/Document/PDFDocument.py
Modified: erp5/trunk/products/ERP5/Document/PDFDocument.py
URL: http://svn.erp5.org/erp5/trunk/products/ERP5/Document/PDFDocument.py?rev=24314&r1=24313&r2=24314&view=diff
==============================================================================
--- erp5/trunk/products/ERP5/Document/PDFDocument.py (original)
+++ erp5/trunk/products/ERP5/Document/PDFDocument.py Fri Oct 24 11:46:36 2008
@@ -36,7 +36,7 @@
from Products.ERP5.Document.Document import ConversionCacheMixin
from Products.ERP5.Document.File import _unpackData
-from zLOG import LOG
+from zLOG import LOG, WARNING
class PDFDocument(Image, ConversionCacheMixin):
"""
@@ -136,7 +136,54 @@
h = r.read()
tmp.close()
r.close()
- return h
+
+ if h != '':
+ return h
+ else:
+ # Try to use OCR
+ # As high dpi images are required, it may take some time to convert the
+ # pdf.
+ # It may be required to use activities to fill the cache and at the end,
+ # to calculate the final result
+ text = ''
+ content_information = self.getContentInformation()
+ page_count = int(content_information.get('Pages', 0))
+ for page_number in range(page_count):
+ src_mimetype, png_data = self.convert(
+ 'png', quality=100, resolution=300,
+ frame=page_number, display='identical')
+ if not src_mimetype.endswith('png'):
+ continue
+ content = '%s' % png_data
+ mime_type = getToolByName(self, 'mimetypes_registry').\
+ lookupExtension('name.%s' % 'txt')
+ if content is not None:
+ portal_transforms = getToolByName(self, 'portal_transforms')
+ result = portal_transforms.convertToData(mime_type, content,
+ context=self,
+ filename=self.title_or_id(),
+ mimetype=src_mimetype)
+ if result is None:
+ # portal_transforms fails to convert.
+ LOG('TextDocument.convert', WARNING,
+ 'portal_transforms failed to convert to %s: %r' % (mime_type, self))
+ result = ''
+ text += result
+ return text
+
+ security.declareProtected('View', 'getSizeFromImageDisplay')
+ def getSizeFromImageDisplay(self, image_display):
+ """
+ Return the size for this image display, or None if this image display name
+ is not known. If the preference is not set, (0, 0) is returned.
+ """
+ # identical parameter can be considered as a hack, in order not to
+ # resize the image to prevent text distortion when using OCR.
+ # A cleaner API is required.
+ if image_display == 'identical':
+ return (self.getWidth(), self.getHeight())
+ else:
+ return Image.getSizeFromImageDisplay(self, image_display)
security.declarePrivate('_convertToHTML')
def _convertToHTML(self):
More information about the Erp5-report
mailing list