[Erp5-report] r35853 mayoro - /erp5/trunk/products/ERP5Form/PDFParser.py
nobody at svn.erp5.org
nobody at svn.erp5.org
Tue Jun 1 12:50:04 CEST 2010
Author: mayoro
Date: Tue Jun 1 12:50:00 2010
New Revision: 35853
URL: http://svn.erp5.org?rev=35853&view=rev
Log:
Initial version of PDFParser.py for parsing PDF files
Added:
erp5/trunk/products/ERP5Form/PDFParser.py
Added: erp5/trunk/products/ERP5Form/PDFParser.py
URL: http://svn.erp5.org/erp5/trunk/products/ERP5Form/PDFParser.py?rev=35853&view=auto
==============================================================================
--- erp5/trunk/products/ERP5Form/PDFParser.py (added)
+++ erp5/trunk/products/ERP5Form/PDFParser.py [utf8] Tue Jun 1 12:50:00 2010
@@ -1,0 +1,189 @@
+# -*- coding: utf-8 -*-
+##############################################################################
+#
+# Copyright (c) 2008 Nexedi SA and Contributors. All Rights Reserved.
+# Mayoro DIAGNE <mayoro at nexedi.com>
+#
+# WARNING: This program as such is intended to be used by professional
+# programmers who take the whole responsability of assessing all potential
+# consequences resulting from its eventual inadequacies and bugs
+# End users who are looking for a ready-to-use solution with commercial
+# garantees and support are strongly adviced to contract a Free Software
+# Service Company
+#
+# This program is Free Software; you can redistribute it and/or
+# modify it under the terms of the GNU General Public License
+# as published by the Free Software Foundation; either version 2
+# of the License, or (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, write to the Free Software
+# Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
+#
+##############################################################################
+import commands
+import re
+import random
+from AccessControl import ClassSecurityInfo
+from tempfile import NamedTemporaryFile
+
+PDFTK_EXECUTABLE = "pdftk"
+
+class PDFParser:
+ """
+ PDF parser API provide methods wich allow to manipulate a pdf file allowing
+ to convert pages as images
+ """
+ security = ClassSecurityInfo()
+
+ def __init__(self, pdf_file_descriptor):
+ """
+ initialise self.data with pdf_file_descriptor if string's
+ The __init__ function can take either a filename, an open file object
+ or the content of the file
+ Initialise self.width self.height and self.pages (page count)
+ """
+ self.width = None
+ self.height = None
+ self.pages = None
+
+ if pdf_file_descriptor is None:
+ raise ValueError, "No PDF file provided, please choose a pdf form "
+
+
+ if type(pdf_file_descriptor) == 'str':
+ self.data = pdf_file_descriptor
+ elif hasattr(pdf_file_descriptor, "read"):
+ pdf_file_descriptor.seek(0)
+ self.data = pdf_file_descriptor.read()
+ pdf_file_descriptor.close()
+ else:
+ source = open(pdf_file_descriptor, "rb")
+ source.seek(0)
+ self.data = source.read()
+ source.close()
+
+ # opening new file on HDD to save PDF content
+ temp_pdf_file = NamedTemporaryFile(mode= "w+b")
+ temp_pdf_name = temp_pdf_file.name
+ # going to the begining of the input file
+ # saving content
+ temp_pdf = open(temp_pdf_name,'w')
+ # saving content
+ temp_pdf.write(self.data)
+ temp_pdf.close()
+ command_output = commands.getstatusoutput('pdfinfo %s' % \
+ temp_pdf_name)
+ if command_output[0] != 0:
+ raise ValueError, 'Error: convert command failed with the following'\
+ 'error message : \n%s' % command_output[1]
+
+ # get the pdf page size
+ rawstr = r'''
+ Page\ssize: #begining of the instersting line
+ \s* #some spaces
+ (\S+)\sx\s(\S+) #the matching pattern : width and height in pts'''
+ compile_obj = re.compile(rawstr, re.MULTILINE | re.VERBOSE)
+ match_obj = re.search(compile_obj, command_output[1])
+ width, height = match_obj.groups()
+
+ # get the pdf page_count
+ rawstr = r'''
+ Pages: #begining of the instersting line
+ \s* #some spaces
+ (\S+) #the matching pattern : width and height in pts'''
+ compile_obj = re.compile(rawstr, re.MULTILINE | re.VERBOSE)
+ match_obj = re.search(compile_obj, command_output[1])
+ page_count = match_obj.groups()[0]
+ attributes = {}
+ self.width = int(round(float(width)))
+ self.height = int(round(float(height)))
+ self.pages = int(page_count)
+
+ def getData(self):
+ """
+ Return the content of the pdf file
+ """
+ return self.data
+
+ security.declarePublic('getPageCount')
+ def getPageCount(self):
+ """
+ Return the page count of the pdf file
+ """
+ #self.getContentInformation()['Pages']
+ return self.pages
+
+ security.declarePublic('getPageWidth')
+ def getPageWidth(self):
+ """
+ Return the width of the pdf file
+ """
+ return self.width
+
+ security.declarePublic('getPageHeight')
+ def getPageHeight(self):
+ """
+ Return the page count of the pdf file
+ """
+ return self.height
+
+ security.declarePublic('getPageImage')
+ def getPageImage(self, page, format, resolution, quality):
+ """
+ Return an instance of newTempImage containing the pape page of
+ the pdf file
+ width, height: attributes in pixel (px)
+ format: jpg, png, etc...
+ resolution: resolution of produced image for exemple 600
+ quality: quality of produced image for exemple 200 raisonable quality
+ more hight is quality more time it takes to be gererated
+ """
+ from Products.ERP5Type.Document import newTempPDFDocument
+ from Products.ERP5Type.Document import newTempImage
+ temp_pdf_document_name = "tmp%s.pdf" % str(random.random()).split('.')[-1]
+ temp_pdf_document = newTempPDFDocument(self, temp_pdf_document_name)
+ temp_pdf_document.setData(self.getData())
+ display = 'xlarge'
+ mime, image_data = temp_pdf_document.convert(format = format,
+ frame = page,
+ resolution = resolution,
+ quality = quality,
+ display = display)
+ page_image = None
+ if image_data is not None:
+ page_image = newTempImage(self, "page_%s" % page)
+ page_image.setData(page_image)
+ return page_image
+
+
+ def getFlattenedPDF(self):
+ """
+ Return a flattened PDF. It's use to merge an input PDF's interactive
+ form fields with the PDF's pages
+ """
+ temp_input_file = NamedTemporaryFile(mode= "w+b")
+ temp_input_name = temp_input_file.name
+ temp_input = open(temp_input_name,'w')
+ temp_input.write(self.getData())
+ temp_input.close()
+ temp_output_file = NamedTemporaryFile(mode= "w+b")
+ temp_output_name = temp_output_file.name
+ command_output = commands.getstatusoutput('pdftk %s output %s flatten'\
+ % (temp_input_name, temp_output_name))
+ if command_output[0] != 0:
+ raise IOError, "pdftk failed with the following error %s"\
+ % command_output[1]
+ temp_output = open(temp_output_name,'rb')
+ temp_output.seek(0)
+ datas = temp_output.read()
+ temp_output.close()
+ return datas
+
+
+
More information about the Erp5-report
mailing list