[Erp5-report] r29844 - in /erp5/trunk/products: ERP5/Document/ ERP5OOo/Document/

nobody at svn.erp5.org nobody at svn.erp5.org
Tue Oct 20 13:03:35 CEST 2009


Author: jp
Date: Tue Oct 20 13:03:32 2009
New Revision: 29844

URL: http://svn.erp5.org?rev=29844&view=rev
Log:
Use mixin.CachedConvertableMixin. getConvertedSize removed from the Document class (redundant with getConversionSize). Docstring restored to where it belongs (i.e. the Document class). Preparing for the removal of the document interface.

Modified:
    erp5/trunk/products/ERP5/Document/Document.py
    erp5/trunk/products/ERP5/Document/File.py
    erp5/trunk/products/ERP5/Document/PDFDocument.py
    erp5/trunk/products/ERP5OOo/Document/OOoDocument.py

Modified: erp5/trunk/products/ERP5/Document/Document.py
URL: http://svn.erp5.org/erp5/trunk/products/ERP5/Document/Document.py?rev=29844&r1=29843&r2=29844&view=diff
==============================================================================
--- erp5/trunk/products/ERP5/Document/Document.py [utf8] (original)
+++ erp5/trunk/products/ERP5/Document/Document.py [utf8] Tue Oct 20 13:03:32 2009
@@ -52,8 +52,10 @@
 import cStringIO
 import string
 from OFS.Image import Pdata
-import md5
 from Products.PythonScripts.Utility import allow_class
+
+# Mixin Import
+from Products.ERP5.mixin.cached_convertable import CachedConvertableMixin
 
 _MARKER = []
 VALID_ORDER_KEY_LIST = ('user_login', 'content', 'file_name', 'input')
@@ -112,136 +114,6 @@
 
 class NotConvertedError(Exception):pass
 allow_class(NotConvertedError)
-
-class ConversionCacheMixin:
-  """
-    This class provides a generic API to store in the ZODB
-    various converted versions of a file or of a string.
-
-    Versions are stored in dictionaries; the class stores also
-    generation time of every format and its mime-type string.
-    Format can be a string or a tuple (e.g. format, resolution).
-  """
-
-  # Declarative security
-  security = ClassSecurityInfo()
-  security.declareObjectProtected(Permissions.AccessContentsInformation)
-
-  def _getCacheFactory(self):
-    """
-    """
-    if self.isTempObject():
-      return
-    cache_tool = getToolByName(self, 'portal_caches')
-    preference_tool = getToolByName(self, 'portal_preferences')
-    cache_factory_name = preference_tool.getPreferredConversionCacheFactory('document_cache_factory')
-    cache_factory = cache_tool.getRamCacheRoot().get(cache_factory_name)
-    #XXX This conditional statement should be remove as soon as
-    #Broadcasting will be enable among all zeo clients.
-    #Interaction which update portal_caches should interact with all nodes.
-    if cache_factory is None and getattr(cache_tool, cache_factory_name, None) is not None:
-      #ram_cache_root is not up to date for current node
-      cache_tool.updateCache()
-    return cache_tool.getRamCacheRoot().get(cache_factory_name)
-
-  security.declareProtected(Permissions.ModifyPortalContent, 'clearConversionCache')
-  def clearConversionCache(self):
-    """
-    """
-    if self.isTempObject():
-      self.temp_conversion_data = {}
-      return
-    for cache_plugin in self._getCacheFactory().getCachePluginList():
-      cache_plugin.delete(self.getPath(), DEFAULT_CACHE_SCOPE)
-
-  security.declareProtected(Permissions.View, 'hasConversion')
-  def hasConversion(self, **kw):
-    """
-    If you want to get conversion cache value if exists, please write
-    the code like:
-
-      try:
-        mime, data = getConversion(**kw)
-      except KeyError:
-        ...
-
-    instead of:
-
-      if self.hasConversion(**kw):
-        mime, data = self.getConversion(**kw)
-      else:
-        ...
-
-    for better performance.
-    """
-    try:
-      self.getConversion(**kw)
-      return True
-    except KeyError:
-      return False
-
-  security.declareProtected(Permissions.ModifyPortalContent, 'setConversion')
-  def setConversion(self, data, mime=None, calculation_time=None, **kw):
-    """
-    """
-    cache_id = self.generateCacheId(**kw)
-    if self.isTempObject():
-      if getattr(aq_base(self), 'temp_conversion_data', None) is None:
-        self.temp_conversion_data = {}
-      self.temp_conversion_data[cache_id] = (mime, aq_base(data))
-      return
-    cache_factory = self._getCacheFactory()
-    cache_duration = cache_factory.cache_duration
-    if data is not None:
-      for cache_plugin in cache_factory.getCachePluginList():
-        try:
-          cache_entry = cache_plugin.get(self.getPath(), DEFAULT_CACHE_SCOPE)
-          cache_dict = cache_entry.getValue()
-        except KeyError:
-          cache_dict = {}
-        cache_dict.update({cache_id: (self.getContentMd5(), mime, aq_base(data))})
-        cache_plugin.set(self.getPath(), DEFAULT_CACHE_SCOPE,
-                         cache_dict, calculation_time=calculation_time,
-                         cache_duration=cache_duration)
-
-  security.declareProtected(Permissions.View, 'getConversion')
-  def getConversion(self, **kw):
-    """
-    """
-    cache_id = self.generateCacheId(**kw)
-    if self.isTempObject():
-      return getattr(aq_base(self), 'temp_conversion_data', {})[cache_id]
-    for cache_plugin in self._getCacheFactory().getCachePluginList():
-      cache_entry = cache_plugin.get(self.getPath(), DEFAULT_CACHE_SCOPE)
-      data_list = cache_entry.getValue().get(cache_id)
-      if data_list:
-        md5sum, mime, data = data_list
-        if md5sum != self.getContentMd5():
-          raise KeyError, 'Conversion cache key is compromised for %r' % cache_id
-        return mime, data
-    raise KeyError, 'Conversion cache key does not exists for %r' % cache_id
-
-  security.declareProtected(Permissions.View, 'getConversionSize')
-  def getConversionSize(self, **kw):
-    """
-    """
-    try:
-      return len(self.getConversion(**kw))
-    except KeyError:
-      return 0
-
-  def generateCacheId(self, **kw):
-    """Generate proper cache id based on **kw.
-    Function inspired from ERP5Type.Cache
-    """
-    return str(makeSortedTuple(kw)).translate(string.maketrans('', ''), '[]()<>\'", ')
-
-  security.declareProtected(Permissions.ModifyPortalContent, 'updateContentMd5')
-  def updateContentMd5(self):
-    """Update md5 checksum from the original file
-    """
-    data = self.getData()
-    self._setContentMd5(md5.new(data).hexdigest()) #reindex is useless
 
 class PermanentURLMixIn(ExtensibleTraversableMixIn):
   """
@@ -455,8 +327,168 @@
     return method()
 
 
-class Document(PermanentURLMixIn, XMLObject, UrlMixIn, ConversionCacheMixin, SnapshotMixin, UpdateMixIn):
-  """
+class Document(PermanentURLMixIn, XMLObject, UrlMixIn, CachedConvertableMixin, SnapshotMixin, UpdateMixIn):
+  """Document is an abstract class with all methods related to document
+  management in ERP5. This includes searchable text, explicit relations,
+  implicit relations, metadata, versions, languages, etc.
+
+  Documents may either store their content directly or cache content
+  which is retrieved from a specified URL. The second case is often
+  referred to as "External Document". Standalone "External Documents" may
+  be created by specifying a URL to the contribution tool which is in
+  charge of initiating the download process and selecting the appropriate
+  document type. Groups of "External Documents" may also be generated from
+  so-called "External Source" (refer to ExternalSource class for more
+  information).
+
+  External Documents may be downloaded once or updated at regular interval.
+  The latter can be useful to update the content of an external source.
+  Previous versions may be stored in place or kept in a separate file.
+  This feature is known as the crawling API. It is mostly implemented
+  in ContributionTool with wrappers in the Document class. It can be useful
+  to create a small search engine.
+
+  There are currently two types of Document subclasses:
+
+  * File for binary file based documents. File has subclasses such as Image,
+    OOoDocument, PDFDocument, etc. to implement specific conversion methods.
+
+  * TextDocument for text based documents. TextDocument has subclasses such
+    as Wiki to implement specific methods. 
+    TextDocument itself has a subclass (XSLTDocument) which provides
+    XSLT based analysis and transformation of XML content based on XSLT
+    templates. 
+
+  Conversion should be achieved through the convert method and other methods
+  of the conversion API (convertToBaseFormat, etc.).
+  Moreover, any Document subclass must be able to convert documents to text
+  (asText method) and HTML (asHTML method). Text is required for full text
+  indexing. HTML is required for crawling.
+
+  Instances can be created directly, or via portal_contributions tool which
+  manages document ingestion process whereby a file can be uploaded by http
+  or sent in by email or dropped in by webdav or in some other way as yet
+  unknown. The ingestion process has the following steps:
+
+  (1) portal type detection
+  (2) object creation and upload of data
+  (3) metadata discovery (optionally with conversion of data to another format)
+  (4) other possible actions to finalise the ingestion (ex. by assigning
+      a reference)
+
+  This class handles (3) and calls a ZMI script to do (4).
+
+  Metadata can be drawn from various sources:
+
+  input      -   data supplied with http request or set on the object during (2) (e.g.
+                 discovered from email text)
+  file_name  -   data which might be encoded in file name
+  user_login -   information about user who is contributing the file
+  content    -   data which might be derived from document content
+
+  If a certain property is defined in more than one source, it is set according to
+  preference order returned by a script 
+     Document_getPreferredDocumentMetadataDiscoveryOrderList
+     (or any type-based version since discovery is type dependent)
+
+  Methods for discovering metadata are:
+
+    getPropertyDictFromInput
+    getPropertyDictFromFileName
+    getPropertyDictFromUserLogin
+    getPropertyDictFromContent
+
+  Methods for processing content are implemented either in Document class
+  or in Base class:
+
+    getSearchableReferenceList (Base)
+    getSearchableText (Base)
+    index_html (overridden in Document subclasses)
+
+  Methods for handling relations are implemented either in Document class
+  or in Base class:
+
+    getImplicitSuccessorValueList (Base)
+    getImplicitPredecessorValueList (Base)
+    getImplicitSimilarValueList (Base)
+    getSimilarCloudValueList (Document)
+
+  Implicit relations consist in finding document references inside
+  searchable text (ex. INV-23456) and deducing relations from that.
+  Two customisable methods are required. One to find a list of implicit references
+  inside the content (getSearchableReferenceList) and one to convert a given
+  document reference into a list of reference strings which could be present
+  in other content (asSearchableReferenceList).
+
+  document.getSearchableReferenceList() returns
+    [
+     {'reference': 'INV-12367'},
+     {'reference': 'INV-1112', 'version': '012'},
+     {'reference': 'AB-CC-DRK', 'version':'011', 'language': 'en'}
+    ]
+
+  The Document class behaviour can be extended / customized through scripts
+  (which are type-based so can be adjusted per portal type).
+
+  * Document_getPropertyDictFromUserLogin - finds a user (by user_login or
+    from session) and returns properties which should be set on the document
+
+  * Document_getPropertyDictFromContent - analyzes document content and returns
+    properties which should be set on the document
+
+  * Base_getImplicitSuccessorValueList - finds all appropriate documents
+    referenced in the current content
+
+  * Base_getImplicitPredecessorValueList - finds document predecessors based on
+    the document coordinates (can use only complete coordinates, or also partial)
+
+  * Document_getPreferredDocumentMetadataDiscoveryOrderList - returns an order
+    in which metadata should be set/overwritten
+
+  * Document_finishIngestion - called by portal_activities after all the ingestion
+    is completed (and after document has been converted, so text_content
+    is available if the document has it)
+
+  * Document_getNewRevision - calculates revision number which should be set
+    on this document. Implementation depends on revision numbering policy which
+    can be very different. Interaction workflow should call setNewRevision method.
+
+  * Document_populateContent - analyses the document content and produces
+    subcontent based on it (ex. images, news, etc.). This script can
+    involve for example an XSLT transformation to process XML.
+
+  Subcontent: documents may include subcontent (files, images, etc.)
+  so that publication of rich content can be path independent. Subcontent
+  can also be used to help the rendering in HTML of complex documents
+  such as ODF documents.
+
+  Consistency checking:
+    Default implementation uses DocumentReferenceConstraint to check if the 
+    reference/language/version triplet is unique. Additional constraints
+    can be added if necessary.
+
+  NOTE: Document.py supports a notion of revision which is very specific.
+  The underlying concept is that, as soon as a document has a reference,
+  the association of (reference, version, language) must be unique across
+  the whole system. This means that a given document in a given version in a
+  given language is unique. The underlying idea is similar to the one in a Wiki
+  system in which each page is unique and acts as the atom of collaboration.
+  In the case of ERP5, if a team collaborates on a Text document written with
+  an offline word processor, all updates should be placed inside the same object.
+  A Contribution will thus modify an existing document, if allowed from security
+  point of view, and increase the revision number. Same goes for properties
+  (title). Each change generates a new revision.
+
+    conversion API - not same as document - XXX BAD
+    XXX make multiple interfaces
+
+  TODO:
+    - move all implementation bits to MixIn classes
+    - in the end, Document class should have zero code
+      and only serve as a quick and easy way to create 
+      new types of documents (one could even consider 
+      that this class should be trashed)
+    - 
   """
 
   meta_type = 'ERP5 Document'
@@ -467,7 +499,14 @@
   isDocument = 1
   __dav_collection__=0
 
-  zope.interface.implements( interfaces.IDocument, )
+  zope.interface.implements(interfaces.IConvertable,
+                            interfaces.ITextConvertable,
+                            interfaces.IHtmlConvertable,
+                            interfaces.ICachedConvertable,
+                            interfaces.IVersionable,
+                            interfaces.IDownloadable,
+                            interfaces.ICrawlable,
+                           )
 
   # Regular expressions
   href_parser = re.compile('<a[^>]*href=[\'"](.*?)[\'"]',re.IGNORECASE)
@@ -1051,13 +1090,13 @@
     # Try not to invoke an automatic transition here
     self._edit(**kw)
     # Finish ingestion by calling method
-    self.finishIngestion()
-    self.reindexObject()
+    self.finishIngestion() # XXX - is this really the right place ?
+    self.reindexObject() # XXX - is this really the right place ?
     # Revision merge is tightly coupled
     # to metadata discovery - refer to the documentation of mergeRevision method
-    merged_doc = self.mergeRevision()
-    merged_doc.reindexObject()
-    return merged_doc
+    merged_doc = self.mergeRevision() # XXX - is this really the right place ?
+    merged_doc.reindexObject() # XXX - is this really the right place ?
+    return merged_doc # XXX - is this really the right place ?
 
   security.declareProtected(Permissions.ModifyPortalContent, 'finishIngestion')
   def finishIngestion(self):
@@ -1091,13 +1130,6 @@
     if format in ('text', 'txt'):
       return 'text/plain', '' # XXX - Why ?
     raise NotImplementedError
-
-  def getConvertedSize(self, format):
-    """
-      Returns the size of the converted document
-    """
-    format, data = self.convert(format)
-    return len(data)
 
   security.declareProtected(Permissions.View, 'asSubjectText')
   def asSubjectText(self, **kw):
@@ -1272,7 +1304,7 @@
     """
     return False
 
-  def convertFile(self, **kw):
+  def convertFile(self, **kw): # XXX - Is it really useful to explicitly define ?
     """
     Workflow transition invoked when conversion occurs.
     """

Modified: erp5/trunk/products/ERP5/Document/File.py
URL: http://svn.erp5.org/erp5/trunk/products/ERP5/Document/File.py?rev=29844&r1=29843&r2=29844&view=diff
==============================================================================
--- erp5/trunk/products/ERP5/Document/File.py [utf8] (original)
+++ erp5/trunk/products/ERP5/Document/File.py [utf8] Tue Oct 20 13:03:32 2009
@@ -35,13 +35,15 @@
 from Products.ERP5Type import Permissions, PropertySheet, Constraint, interfaces
 from Products.ERP5Type.Cache import CachingMethod
 from Products.ERP5.Document.Document import Document
-from Products.ERP5.Document.Document import ConversionCacheMixin
 from Products.ERP5.Document.Document import ConversionError
 from Products.ERP5Type.Base import Base
 from Products.CMFDefault.File import File as CMFFile
 from zLOG import LOG
 from DateTime import DateTime
 
+# Mixin Import
+from Products.ERP5.mixin.cached_convertable import CachedConvertableMixin
+
 mimetypes.init()
 
 def _unpackData(data):
@@ -51,7 +53,7 @@
   """
   return str(data)
 
-class File(Document, CMFFile, ConversionCacheMixin):
+class File(Document, CMFFile, CachedConvertableMixin):
   """
       A File can contain raw data which can be uploaded and downloaded.
       It is the root class of Image, OOoDocument (ERP5OOo product),

Modified: erp5/trunk/products/ERP5/Document/PDFDocument.py
URL: http://svn.erp5.org/erp5/trunk/products/ERP5/Document/PDFDocument.py?rev=29844&r1=29843&r2=29844&view=diff
==============================================================================
--- erp5/trunk/products/ERP5/Document/PDFDocument.py [utf8] (original)
+++ erp5/trunk/products/ERP5/Document/PDFDocument.py [utf8] Tue Oct 20 13:03:32 2009
@@ -1,3 +1,4 @@
+# -*- coding: utf-8 -*-
 ##############################################################################
 #
 # Copyright (c) 2002-2006 Nexedi SARL and Contributors. All Rights Reserved.
@@ -34,11 +35,12 @@
 from Products.ERP5Type import Permissions, PropertySheet, Constraint, interfaces
 from Products.ERP5Type.Cache import CachingMethod
 from Products.ERP5.Document.Image import Image
-from Products.ERP5.Document.Document import ConversionCacheMixin, ConversionError
+from Products.ERP5.Document.Document import ConversionError
+from Products.ERP5.mixin.cached_convertable import CachedConvertableMixin
 
 from zLOG import LOG, WARNING
 
-class PDFDocument(Image, ConversionCacheMixin):
+class PDFDocument(Image, CachedConvertableMixin):
   """
   PDFDocument is a subclass of Image which is able to
   extract text content from a PDF file either as text

Modified: erp5/trunk/products/ERP5OOo/Document/OOoDocument.py
URL: http://svn.erp5.org/erp5/trunk/products/ERP5OOo/Document/OOoDocument.py?rev=29844&r1=29843&r2=29844&view=diff
==============================================================================
--- erp5/trunk/products/ERP5OOo/Document/OOoDocument.py [utf8] (original)
+++ erp5/trunk/products/ERP5OOo/Document/OOoDocument.py [utf8] Tue Oct 20 13:03:32 2009
@@ -47,10 +47,12 @@
 from Products.ERP5Type.UnrestrictedMethod import UnrestrictedMethod
 from Products.ERP5.Document.File import File
 from Products.ERP5.Document.Document import PermanentURLMixIn
-from Products.ERP5.Document.Document import ConversionCacheMixin
 from Products.ERP5.Document.Document import ConversionError
 from Products.ERP5.Document.Document import NotConvertedError
 from zLOG import LOG, ERROR
+
+# Mixin Import
+from Products.ERP5.mixin.cached_convertable import CachedConvertableMixin
 
 enc=base64.encodestring
 dec=base64.decodestring
@@ -80,7 +82,7 @@
     return SafeTransport.make_connection(self, h)
 
 
-class OOoDocument(PermanentURLMixIn, File, ConversionCacheMixin):
+class OOoDocument(PermanentURLMixIn, File, CachedConvertableMixin):
   """
     A file document able to convert OOo compatible files to
     any OOo supported format, to capture metadata and to




More information about the Erp5-report mailing list