[Erp5-report] r42937 seb - /erp5/trunk/products/PortalTransforms/transforms/

nobody at svn.erp5.org nobody at svn.erp5.org
Wed Feb 2 16:04:00 CET 2011


Author: seb
Date: Wed Feb  2 16:04:00 2011
New Revision: 42937

URL: http://svn.erp5.org?rev=42937&view=rev
Log:
add transformations to get text from tiff files thanks
to tesseract

Added:
    erp5/trunk/products/PortalTransforms/transforms/tiff_to_text.py
Modified:
    erp5/trunk/products/PortalTransforms/transforms/__init__.py

Modified: erp5/trunk/products/PortalTransforms/transforms/__init__.py
URL: http://svn.erp5.org/erp5/trunk/products/PortalTransforms/transforms/__init__.py?rev=42937&r1=42936&r2=42937&view=diff
==============================================================================
--- erp5/trunk/products/PortalTransforms/transforms/__init__.py [utf8] (original)
+++ erp5/trunk/products/PortalTransforms/transforms/__init__.py [utf8] Wed Feb  2 16:04:00 2011
@@ -45,6 +45,7 @@ modules = [
     #'textile_to_html',# textile, depends on PyTextile http://dom.eav.free.fr/python/textile-mirror-2.0.10.tar.gz 
     'web_intelligent_plain_text_to_html',
     'html_to_web_intelligent_plain_text',
+    'tiff_to_text',   # transforms tiff images to text
     ]
 
 g = globals()

Added: erp5/trunk/products/PortalTransforms/transforms/tiff_to_text.py
URL: http://svn.erp5.org/erp5/trunk/products/PortalTransforms/transforms/tiff_to_text.py?rev=42937&view=auto
==============================================================================
--- erp5/trunk/products/PortalTransforms/transforms/tiff_to_text.py (added)
+++ erp5/trunk/products/PortalTransforms/transforms/tiff_to_text.py [utf8] Wed Feb  2 16:04:00 2011
@@ -0,0 +1,60 @@
+# -*- coding: utf-8 -*-
+from Products.PortalTransforms.interfaces import ITransform
+from Products.PortalTransforms.data import datastream
+from Products.PortalTransforms.libtransforms.commandtransform \
+    import commandtransform
+import os
+import subprocess
+import tempfile
+from zope.interface import implements
+
+# Transform that turns a TIFF image into plain text by running the
+# tesseract command-line binary on it.
+class tiff_to_text(commandtransform):
+    implements(ITransform)
+    __name__  = "tiff_to_text"
+
+    inputs   = ('image/tiff',)
+    output  = 'text/plain'
+    output_encoding = 'utf-8'
+
+    __version__ = '2011-02-01.01'
+
+    binaryName = "tesseract"
+    binaryArgs = "%(infile)s "
+
+    def __init__(self):
+        commandtransform.__init__(self, binary=self.binaryName)
+
+    # Write ``data`` to input.tiff in a temporary directory, run
+    # ``self.binary`` as "<binary> <input> <output base>" and return the
+    # content of "<output base>.txt" in a new datastream named
+    # output.txt.  ``cache`` and ``kwargs`` are not used.  The temporary
+    # directory is always removed; IOError is raised when the output
+    # file cannot be opened (e.g. the command produced nothing).
+    def convert(self, data, cache, **kwargs):
+        tmp_dir, input_file = self.initialize_tmpdir(data,
+                                     filename='input.tiff')
+        try:
+            output_file_path = os.path.join(tmp_dir, 'output')
+            # Pass the arguments as a list: no shell is involved, so paths
+            # containing spaces or shell metacharacters reach the binary
+            # unchanged.  The exit status is not checked; a run that left
+            # no output file shows up as IOError when it is opened below.
+            subprocess.call([self.binary, input_file, output_file_path])
+            output_file = open(output_file_path + '.txt', 'r')
+            try:
+                out = output_file.read()
+            finally:
+                # Close the handle even if reading fails.
+                output_file.close()
+        finally:
+            self.cleanDir(tmp_dir)
+
+        data = datastream('output.txt')
+        data.setData(out)
+        return data
+
+# Return a new tiff_to_text instance.
+def register():
+    return tiff_to_text()



More information about the Erp5-report mailing list