[Erp5-report] r35234 nicolas - /erp5/trunk/products/ERP5/Document/PDFDocument.py
nobody at svn.erp5.org
nobody at svn.erp5.org
Wed May 12 17:14:06 CEST 2010
Author: nicolas
Date: Wed May 12 17:14:04 2010
New Revision: 35234
URL: http://svn.erp5.org?rev=35234&view=rev
Log:
Use subprocess instead of os.popen for reliability
No need to inherit from CachedConvertableMixin, as Image already inherits from it.
Modified:
erp5/trunk/products/ERP5/Document/PDFDocument.py
Modified: erp5/trunk/products/ERP5/Document/PDFDocument.py
URL: http://svn.erp5.org/erp5/trunk/products/ERP5/Document/PDFDocument.py?rev=35234&r1=35233&r2=35234&view=diff
==============================================================================
--- erp5/trunk/products/ERP5/Document/PDFDocument.py [utf8] (original)
+++ erp5/trunk/products/ERP5/Document/PDFDocument.py [utf8] Wed May 12 17:14:04 2010
@@ -34,10 +34,11 @@
from Products.ERP5Type import Permissions, PropertySheet
from Products.ERP5.Document.Image import Image
-from Products.ERP5.Document.Document import ConversionError
-from Products.ERP5.mixin.cached_convertable import CachedConvertableMixin
-
-class PDFDocument(Image, CachedConvertableMixin):
+from Products.ERP5.Document.Document import ConversionError,\
+ VALID_TEXT_FORMAT_LIST
+from subprocess import Popen, PIPE
+
+class PDFDocument(Image):
"""
PDFDocument is a subclass of Image which is able to
extract text content from a PDF file either as text
@@ -107,15 +108,14 @@
if not self.data:
return ''
tmp = tempfile.NamedTemporaryFile()
- tmp.write(str(self.data))
+ tmp.write(str(self.getData()))
tmp.seek(0)
- cmd = 'pdftotext -layout -enc UTF-8 -nopgbrk %s -' % tmp.name
- r = os.popen(cmd)
- h = r.read()
+ command_result = Popen(['pdftotext', '-layout', '-enc', 'UTF-8',
+ '-nopgbrk', tmp.name, '-'],
+ stdout=PIPE).communicate()[0]
+ h = command_result
tmp.close()
- r.close()
-
- if h != '':
+ if h:
return h
else:
# Try to use OCR
@@ -189,13 +189,17 @@
tmp = tempfile.NamedTemporaryFile()
tmp.write(str(self.data))
tmp.seek(0)
- cmd = 'pdftohtml -enc UTF-8 -stdout -noframes -i %s' % tmp.name
- r = os.popen(cmd)
- h = r.read()
+ command_result = Popen(['pdftohtml', '-enc', 'UTF-8', '-stdout',
+ '-noframes', '-i', tmp.name], stdout=PIPE)\
+ .communicate()[0]
+
+ h = command_result
tmp.close()
- r.close()
- h = h.replace('<BODY bgcolor="#A0A0A0"', '<BODY ') # Quick hack to remove bg color - XXX
- h = h.replace('href="%s.html' % tmp.name.split(os.sep)[-1], 'href="asEntireHTML') # Make links relative
+ # Quick hack to remove bg color - XXX
+ h = h.replace('<BODY bgcolor="#A0A0A0"', '<BODY ')
+ # Make links relative
+ h = h.replace('href="%s.html' % tmp.name.split(os.sep)[-1],
+ 'href="asEntireHTML')
return h
security.declareProtected(Permissions.AccessContentsInformation, 'getContentInformation')
@@ -216,10 +220,9 @@
tmp.seek(0)
try:
# First, we use pdfinfo to get standard metadata
- cmd = 'pdfinfo -meta -box %s' % tmp.name
- r = os.popen(cmd)
- h = r.read()
- r.close()
+ command_result = Popen(['pdfinfo', '-meta', '-box', tmp.name],
+ stdout=PIPE).communicate()[0]
+ h = command_result
result = {}
for line in h.splitlines():
item_list = line.split(':')
@@ -228,10 +231,9 @@
result[key] = value
# Then we use pdftk to get extra metadata
- cmd = 'pdftk %s dump_data output' % tmp.name
- r = os.popen(cmd)
- h = r.read()
- r.close()
+ command_result = Popen(['pdftk', tmp.name, 'dump_data', 'output'],
+ stdout=PIPE).communicate()[0]
+ h = command_result
line_list = (line for line in h.splitlines())
while True:
try:
@@ -256,4 +258,4 @@
del self._content_information
except (AttributeError, KeyError):
pass
- Image._setFile(self, data, precondition)
+ Image._setFile(self, data, precondition=precondition)
More information about the Erp5-report
mailing list