[Erp5-report] r42937 seb - /erp5/trunk/products/PortalTransforms/transforms/
nobody at svn.erp5.org
nobody at svn.erp5.org
Wed Feb 2 16:04:00 CET 2011
Author: seb
Date: Wed Feb 2 16:04:00 2011
New Revision: 42937
URL: http://svn.erp5.org?rev=42937&view=rev
Log:
add transformations to get text from tiff files thanks
to tesseract
Added:
erp5/trunk/products/PortalTransforms/transforms/tiff_to_text.py
Modified:
erp5/trunk/products/PortalTransforms/transforms/__init__.py
Modified: erp5/trunk/products/PortalTransforms/transforms/__init__.py
URL: http://svn.erp5.org/erp5/trunk/products/PortalTransforms/transforms/__init__.py?rev=42937&r1=42936&r2=42937&view=diff
==============================================================================
--- erp5/trunk/products/PortalTransforms/transforms/__init__.py [utf8] (original)
+++ erp5/trunk/products/PortalTransforms/transforms/__init__.py [utf8] Wed Feb 2 16:04:00 2011
@@ -45,6 +45,7 @@ modules = [
#'textile_to_html',# textile, depends on PyTextile http://dom.eav.free.fr/python/textile-mirror-2.0.10.tar.gz
'web_intelligent_plain_text_to_html',
'html_to_web_intelligent_plain_text',
+ 'tiff_to_text', # transforms tiff images to text
]
g = globals()
Added: erp5/trunk/products/PortalTransforms/transforms/tiff_to_text.py
URL: http://svn.erp5.org/erp5/trunk/products/PortalTransforms/transforms/tiff_to_text.py?rev=42937&view=auto
==============================================================================
--- erp5/trunk/products/PortalTransforms/transforms/tiff_to_text.py (added)
+++ erp5/trunk/products/PortalTransforms/transforms/tiff_to_text.py [utf8] Wed Feb 2 16:04:00 2011
@@ -0,0 +1,67 @@
+# -*- coding: utf-8 -*-
+from Products.PortalTransforms.interfaces import ITransform
+from Products.PortalTransforms.data import datastream
+from Products.PortalTransforms.libtransforms.commandtransform \
+  import commandtransform
+import os
+import subprocess
+import tempfile
+from zope.interface import implements
+
+class tiff_to_text(commandtransform):
+  """Transform declared for 'image/tiff' input and 'text/plain' output.
+
+  The conversion is delegated to the external binary named by binaryName.
+  """
+  implements(ITransform)
+  __name__ = "tiff_to_text"
+
+  inputs = ('image/tiff',)
+  output = 'text/plain'
+  output_encoding = 'utf-8'
+
+  __version__ = '2011-02-01.01'
+
+  binaryName = "tesseract"
+  binaryArgs = "%(infile)s "
+
+  def __init__(self):
+    """Initialise the base command transform with binaryName."""
+    commandtransform.__init__(self, binary=self.binaryName)
+
+  def convert(self, data, cache, **kwargs):
+    """Run the binary on data and return the text it wrote.
+
+    data -- input handed to initialize_tmpdir as 'input.tiff'
+    cache -- accepted but not used by this method
+    kwargs -- accepted but not used by this method
+
+    Returns a datastream created as 'output.txt' and filled with the
+    content of the file produced by the binary. If no such file exists,
+    the error raised by open() propagates to the caller.
+    """
+    tmp_dir, input_file = self.initialize_tmpdir(data,
+                                                 filename='input.tiff')
+    try:
+      # The binary receives an output base name; the result is read back
+      # from that name with '.txt' appended.
+      output_file_path = os.path.join(tmp_dir, 'output')
+      # Arguments are passed as a list so no shell parses the paths.
+      subprocess.call([self.binary, input_file, output_file_path])
+      output_file = open(output_file_path + '.txt', 'r')
+      try:
+        out = output_file.read()
+      finally:
+        # Close the file even if reading fails.
+        output_file.close()
+    finally:
+      # Always clean up the temporary directory, even on failure.
+      self.cleanDir(tmp_dir)
+
+    result = datastream('output.txt')
+    result.setData(out)
+    return result
+
+def register():
+  """Return a new tiff_to_text instance."""
+  return tiff_to_text()
More information about the Erp5-report
mailing list