[Erp5-report] r34146 mame - /erp5/trunk/products/ERP5/Document/

Fri Mar 26 13:57:38 CET 2010

Author: mame
Date: Fri Mar 26 13:57:37 2010
New Revision: 34146

URL: http://svn.erp5.org?rev=34146&view=rev
Log:
Modified to revert back to revision before 34090

Modified:
    erp5/trunk/products/ERP5/Document/Document.py
    erp5/trunk/products/ERP5/Document/Image.py
    erp5/trunk/products/ERP5/Document/PDFDocument.py
    erp5/trunk/products/ERP5/Document/TextDocument.py

Modified: erp5/trunk/products/ERP5/Document/Document.py
URL: http://svn.erp5.org/erp5/trunk/products/ERP5/Document/Document.py?rev=34146&r1=34145&r2=34146&view=diff
==============================================================================

--- erp5/trunk/products/ERP5/Document/Document.py [utf8] (original)
+++ erp5/trunk/products/ERP5/Document/Document.py [utf8] Fri Mar 26 13:57:37 2010
@@ -56,12 +56,6 @@
 
 # Mixin Import
 from Products.ERP5.mixin.cached_convertable import CachedConvertableMixin
-from Products.ERP5.mixin.convertable import ConvertableMixin
-from Products.ERP5.mixin.text_convertable import TextConvertableMixin
-from Products.ERP5.mixin.base_convertable import BaseConvertableMixin
-from Products.ERP5.mixin.html_convertable import HTMLConvertableMixin
-from Products.ERP5.mixin.metadata_discoverable import MetadataDiscoverableMixin
-from Products.ERP5.mixin.document import DocumentMixin
 
 _MARKER = []
 VALID_ORDER_KEY_LIST = ('user_login', 'content', 'file_name', 'input')
@@ -312,8 +306,7 @@
     return method()
 
 
-class Document(PermanentURLMixIn, XMLObject, UrlMixIn, ConvertableMixin, TextConvertableMixin,HTMLConvertableMixin,
-          DocumentMixin, BaseConvertableMixin, MetadataDiscoverableMixin, CachedConvertableMixin, SnapshotMixin, UpdateMixIn):
+class Document(PermanentURLMixIn, XMLObject, UrlMixIn, CachedConvertableMixin, SnapshotMixin, UpdateMixIn):
   """Document is an abstract class with all methods related to document
   management in ERP5. This includes searchable text, explicit relations,
   implicit relations, metadata, versions, languages, etc.
@@ -1084,6 +1077,267 @@
     method = self._getTypeBasedMethod('finishIngestion', fallback_script_id='Document_finishIngestion')
     return method()
 
+  # Conversion methods
+  security.declareProtected(Permissions.AccessContentsInformation, 'convert')
+  def convert(self, format, **kw):
+    """
+      Main content conversion function, returns result which should
+      be returned and stored in cache.
+      format - the format specified in the form of an extension
+      string (ex. jpeg, html, text, txt, etc.)
+      **kw can be various things - e.g. resolution
+
+      Default implementation returns an empty string (html, text)
+      or raises an error.
+
+      TODO:
+      - implement guards API so that conversion to certain
+        formats require certain permission
+    """
+    if format == 'html':
+      return 'text/html', '' # XXX - Why ?
+    if format in ('text', 'txt'):
+      return 'text/plain', '' # XXX - Why ?
+    raise NotImplementedError
+
+  security.declareProtected(Permissions.View, 'asSubjectText')
+  def asSubjectText(self, **kw):
+    """
+      Converts the subject of the document to a textual representation.
+    """
+    subject = self.getSubject()
+    if not subject:
+      # XXX not sure if this fallback is a good idea.
+      subject = self.getTitle()
+    if subject is None:
+      subject = ''
+    return str(subject)
+
+  security.declareProtected(Permissions.View, 'asText')
+  def asText(self, **kw):
+    """
+      Converts the content of the document to a textual representation.
+    """
+    kw['format'] = 'txt'
+    mime, data = self.convert(**kw)
+    return str(data)
+
+  security.declareProtected(Permissions.View, 'asEntireHTML')
+  def asEntireHTML(self, **kw):
+    """
+      Returns a complete HTML representation of the document
+      (with body tags, etc.). Adds if necessary a base
+      tag so that the document can be displayed in an iframe
+      or standalone.
+
+      Actual conversion is delegated to _asHTML
+    """
+    html = self._asHTML(**kw)
+    if self.getUrlString():
+      # If a URL is defined, add the base tag
+      # if base is not defined yet.
+      html = str(html)
+      if not html.find('<base') >= 0:
+        base = '<base href="%s">' % self.getContentBaseURL()
+        html = html.replace('<head>', '<head>%s' % base)
+      self.setConversion(html, mime='text/html', format='base-html')
+    return html
+
+  security.declarePrivate('_asHTML')
+  def _asHTML(self, **kw):
+    """
+      A private method which converts to HTML. This method
+      is the one to override in subclasses.
+    """
+    if not self.hasBaseData():
+      raise ConversionError('This document has not been processed yet.')
+    try:
+      # FIXME: no substitution may occur in this case.
+      mime, data = self.getConversion(format='base-html')
+      return data
+    except KeyError:
+      kw['format'] = 'html'
+      mime, html = self.convert(**kw)
+      return html
+
+  security.declareProtected(Permissions.View, 'asStrippedHTML')
+  def asStrippedHTML(self, **kw):
+    """
+      Returns a stripped HTML representation of the document
+      (without html and body tags, etc.) which can be used to inline
+      a preview of the document.
+    """
+    if not self.hasBaseData():
+      return ''
+    try:
+      # FIXME: no substitution may occur in this case.
+      mime, data = self.getConversion(format='stripped-html')
+      return data
+    except KeyError:
+      kw['format'] = 'html'
+      mime, html = self.convert(**kw)
+      return self._stripHTML(str(html))
+
+  def _guessEncoding(self, string):
+    """
+      Try to guess the encoding for this string.
+      Returns None if no encoding can be guessed.
+    """
+    try:
+      import chardet
+    except ImportError:
+      return None
+    return chardet.detect(string).get('encoding', None)
+
+  def _stripHTML(self, html, charset=None):
+    """
+      A private method which can be reused by subclasses
+      to strip HTML content
+    """
+    body_list = re.findall(self.body_parser, str(html))
+    if len(body_list):
+      stripped_html = body_list[0]
+    else:
+      stripped_html = html
+    # find charset and convert to utf-8
+    charset_list = self.charset_parser.findall(str(html)) # XXX - Not efficient if this
+                                         # is datastream instance but hard to do better
+    if charset and not charset_list:
+      # Use the optional parameter if we cannot find the encoding in the HTML
+      charset_list = [charset]
+    if charset_list and charset_list[0] not in ('utf-8', 'UTF-8'):
+      try:
+        stripped_html = unicode(str(stripped_html),
+                                charset_list[0]).encode('utf-8')
+      except (UnicodeDecodeError, LookupError):
+        return str(stripped_html)
+    return stripped_html
+
+  def _safeHTML(self, html, format='text/x-html-safe', charset=None):
+    """
+      A private method to strip HTML content in safe mode, without
+      embedded javascript, forms or any external plugin imports.
+      Use it when the user who pushes the data (e.g. Anonymous) is not trusted.
+      - html: content to strip
+      - format: destination format
+      - charset: charset of the html string; takes precedence over
+        charset values found in the html string itself
+      Returns the output of portal_transforms' convertToData.
+    """
+    portal = self.getPortalObject()
+    if charset is None:
+      # find charset
+      charset_list = self.charset_parser.findall(html)
+      if charset_list:
+        charset = charset_list[0]
+    if charset and charset not in ('utf-8', 'UTF-8'):
+      try:
+        html = html.decode(charset).encode('utf-8') # hand the re-encoded data on
+      except (UnicodeDecodeError, LookupError):
+        pass # keep the original data together with its declared charset
+      else:
+        charset = 'utf-8' # Override charset if conversion succeeds
+    transform_tool = getToolByName(portal, 'portal_transforms')
+    safe_html_string = transform_tool.convertToData(format, html,
+                                                    encoding=charset)
+    return safe_html_string
+
+  security.declareProtected(Permissions.AccessContentsInformation, 'getContentInformation')
+  def getContentInformation(self):
+    """
+    Returns the content information from the HTML conversion.
+    The default implementation tries to build a dictionary
+    from the HTML conversion of the document and extract
+    the document title.
+    """
+    result = {}
+    html = self.asEntireHTML()
+    if not html: return result
+    title_list = re.findall(self.title_parser, str(html))
+    if title_list:
+      result['title'] = title_list[0]
+    return result
+
+  # Base format support
+  security.declareProtected(Permissions.ModifyPortalContent, 'convertToBaseFormat')
+  def convertToBaseFormat(self, **kw):
+    """
+      Converts the content of the document to a base format
+      which is later used for all conversions. This method
+      is common to all kinds of documents and handles
+      exceptions in a unified way.
+
+      Implementation is delegated to _convertToBaseFormat which
+      must be overloaded by subclasses of Document which
+      need a base format.
+
+      convertToBaseFormat is called upon file upload, document
+      ingestion by the processing_status_workflow.
+
+      NOTE: the data of the base format conversion should be stored
+      using the base_data property. Refer to Document.py propertysheet.
+      Use accessors (getBaseData, setBaseData, hasBaseData, etc.)
+    """
+    if getattr(self, 'hasData', None) is not None and not self.hasData():
+      # Empty document cannot be converted
+      return
+    try:
+      message = self._convertToBaseFormat() # Call implementation method
+      self.clearConversionCache() # Conversion cache is now invalid
+      if message is None:
+        # XXX Need to translate.
+        message = 'Converted to %s.' % self.getBaseContentType()
+      self.convertFile(comment=message) # Invoke workflow method
+    except NotImplementedError:
+      message = ''
+    return message
+
+  def _convertToBaseFormat(self):
+    """
+    """
+    raise NotImplementedError
+
+  security.declareProtected(Permissions.AccessContentsInformation,
+                            'isSupportBaseDataConversion')
+  def isSupportBaseDataConversion(self):
+    """
+    """
+    return False
+
+  def convertFile(self, **kw): # XXX - Is it really useful to define this explicitly ?
+    """
+    Workflow transition invoked when conversion occurs.
+    """
+  convertFile = WorkflowMethod(convertFile)
+
+  security.declareProtected(Permissions.AccessContentsInformation,
+                            'getMetadataMappingDict')
+  def getMetadataMappingDict(self):
+    """
+    Return a dict of metadata mapping used to update base metadata of the
+    document, or an empty dict if no type based method can be looked up.
+    """
+    try:
+      method = self._getTypeBasedMethod('getMetadataMappingDict')
+    except (KeyError, AttributeError): # a tuple is needed to catch both
+      method = None
+    if method is not None:
+      return method()
+    else:
+      return {}
+
+  security.declareProtected(Permissions.ModifyPortalContent, 'updateBaseMetadata')
+  def updateBaseMetadata(self, **kw):
+    """
+    Update the base format data with the latest properties entered
+    by the user. For example, if title is changed in ERP5 interface,
+    the base format file should be updated accordingly.
+
+    Default implementation does nothing. Refer to OOoDocument class
+    for an example of implementation.
+    """
+    pass
+
   # Transformation API
   security.declareProtected(Permissions.ModifyPortalContent, 'populateContent')
   def populateContent(self):

Modified: erp5/trunk/products/ERP5/Document/Image.py
URL: http://svn.erp5.org/erp5/trunk/products/ERP5/Document/Image.py?rev=34146&r1=34145&r2=34146&view=diff
==============================================================================
--- erp5/trunk/products/ERP5/Document/Image.py [utf8] (original)
+++ erp5/trunk/products/ERP5/Document/Image.py [utf8] Fri Mar 26 13:57:37 2010
@@ -53,8 +53,6 @@
 from zLOG import LOG, WARNING
 
 from Products.CMFCore.utils import getToolByName
-#Mixin import
-from Products.ERP5.mixin.convertable import ConvertableMixin
 
 default_displays_id_list = ('nano', 'micro', 'thumbnail',
                             'xsmall', 'small', 'medium',
@@ -62,7 +60,7 @@
 
 default_formats = ['jpg', 'jpeg', 'png', 'gif', 'pnm', 'ppm']
 
-class Image(File, OFSImage, ConvertableMixin):
+class Image(File, OFSImage):
   """
     An Image is a File which contains image data. It supports
     various conversions of format, size, resolution through
@@ -324,36 +322,11 @@
     return mime_type, result
 
   # Conversion API
-  security.declareProtected(Permissions.View, 'getAllowedTargetItemList')
-  def getAllowedTargetItemList(self):
-    import commands
-    import re
-    import os
-    new_result = []
-    filename = os.path.abspath(self.getSourceReference())
-    result = commands.getstatusoutput('convert -list format %s ' % self.getSourceReference())
-    new_list = re.split('\n',result[1])
-    allowed = []
-    for new_str in new_list:
-      test_str = new_str.lstrip()
-      pattern = re.compile(r'''([A-z]+[*]?\s+[A-z]+\s+[rw+-]+\s+[A-z]+\s+[A-z]+\D+[A-z]+)''',re.VERBOSE)
-      if re.match(pattern,test_str):
-        new_result.append(test_str)
-    
-    len_new_result = len(new_result)
-    for i in range(0,len_new_result):
-      allowed.append(list((new_result[i].split()[1].lower(),' '.join(new_result[i].split()[3:])))) 
-    return [(y, x) for x, y in allowed]
-
   security.declareProtected(Permissions.AccessContentsInformation, 'convert')
   def convert(self, format, display=None, quality=75, resolution=None, frame=None, **kw):
     """
     Implementation of conversion for Image files
     """
-    # Raise an error if the format is not permitted
-    if not self.isTargetFormatPermitted(format):
-      raise Unauthorized("User does not have enough permission to access document"
-				     " in %s format" % (format or 'original'))
     if format in ('text', 'txt', 'html', 'base_html', 'stripped-html'):
       try:
         return self.getConversion(format=format)
@@ -366,7 +339,7 @@
     if (display is not None or resolution is not None or quality != 75 or format != ''\
                             or frame is not None) and image_size:
       kw = dict(display=display, format=format, quality=quality,
-               resolution=resolution, frame=frame, image_size=image_size)
+                resolution=resolution, frame=frame, image_size=image_size)
       try:
         mime, image = self.getConversion(**kw)
       except KeyError:
@@ -396,7 +369,7 @@
     # display may be set from a cookie (?)
     image_size = self.getSizeFromImageDisplay(display)
     kw = dict(display=display, format=format, quality=quality,
-		  resolution=resolution, frame=frame, image_size=image_size)
+              resolution=resolution, frame=frame, image_size=image_size)
     _setCacheHeaders(_ViewEmulator().__of__(self), kw)
 
     if (display is not None or resolution is not None or quality != 75 or format != ''\

Modified: erp5/trunk/products/ERP5/Document/PDFDocument.py
URL: http://svn.erp5.org/erp5/trunk/products/ERP5/Document/PDFDocument.py?rev=34146&r1=34145&r2=34146&view=diff
==============================================================================
--- erp5/trunk/products/ERP5/Document/PDFDocument.py [utf8] (original)
+++ erp5/trunk/products/ERP5/Document/PDFDocument.py [utf8] Fri Mar 26 13:57:37 2010
@@ -37,13 +37,7 @@
 from Products.ERP5.Document.Document import ConversionError
 from Products.ERP5.mixin.cached_convertable import CachedConvertableMixin
 
-
-from zLOG import LOG, WARNING
-# Mixin import
-from Products.ERP5.mixin.convertable import ConvertableMixin
-
-
-class PDFDocument(Image, ConvertableMixin, CachedConvertableMixin):
+class PDFDocument(Image, CachedConvertableMixin):
   """
   PDFDocument is a subclass of Image which is able to
   extract text content from a PDF file either as text
@@ -104,11 +98,6 @@
                             resolution=resolution, frame=frame)
 
   # Conversion API
-  security.declareProtected(Permissions.View, 'getAllowedTargetItemList')
-  def getAllowedTargetItemList(self):
-    return Image.getAllowedTargetItemList(self) + \
-       [('Text', 'txt'),('Plain Text','text'), ('HTML Document', 'html')]
-  
   security.declareProtected(Permissions.AccessContentsInformation, 'convert')
   def convert(self, format, **kw):
     """

Modified: erp5/trunk/products/ERP5/Document/TextDocument.py
URL: http://svn.erp5.org/erp5/trunk/products/ERP5/Document/TextDocument.py?rev=34146&r1=34145&r2=34146&view=diff
==============================================================================
--- erp5/trunk/products/ERP5/Document/TextDocument.py [utf8] (original)
+++ erp5/trunk/products/ERP5/Document/TextDocument.py [utf8] Fri Mar 26 13:57:37 2010
@@ -43,12 +43,9 @@
 except ImportError:
   from Products.ERP5Type.patches.string import Template
 
-# Mixin import
-from Products.ERP5.mixin.convertable import ConvertableMixin
-
 DEFAULT_TEXT_FORMAT = 'text/html'
 
-class TextDocument(Document, TextContent, ConvertableMixin):
+class TextDocument(Document, TextContent):
     """
         A Document contains text which can be formatted using
         *Structured Text* or *HTML*. Text can be automatically translated
@@ -149,10 +146,6 @@
       if format is None:
         # The default is to use ERP5 Forms to render the page
         return self.view()
-      # Raise an error if the format is not permitted
-      if not self.isTargetFormatPermitted(format):
-	raise Unauthorized("User does not have enough permission to access document"
-				" in %s format" % (format or 'original'))
       mime, data = self.convert(format=format) 
       RESPONSE.setHeader('Content-Length', len(str(data))) # XXX - Not efficient 
                                                            # if datastream instance
@@ -207,25 +200,12 @@
         substitution_method_parameter_dict = {}
       return self._substituteTextContent(subject, safe_substitute=safe_substitute,
                                          **substitution_method_parameter_dict)
-    
-    security.declareProtected(Permissions.View, 'getAllowedTargetItemList')
-    def getAllowedTargetItemList(self):
-      mime_type = getToolByName(self, 'mimetypes_registry')
-      allowed=[]
-      for extension in mime_type.extensions:
-        allowed.append((mime_type.extensions[extension].name(),extension))
-
-      return [(y, x) for x, y in allowed]
-    
+
     security.declareProtected(Permissions.AccessContentsInformation, 'convert')
     def convert(self, format, substitution_method_parameter_dict=None, safe_substitute=True, **kw):
       """
         Convert text using portal_transforms or oood
       """
-      # Raise an error if the format is not permitted
-      if not self.isTargetFormatPermitted(format):
-	raise Unauthorized("User does not have enough permission to access document"
-					 " in %s format" % (format or 'original'))
       # Accelerate rendering in Web mode
       _setCacheHeaders(_ViewEmulator().__of__(self), {'format' : format})
       # Return the raw content