[Erp5-report] r13563 - /erp5/trunk/products/ERP5/Tool/ContributionTool.py
nobody at svn.erp5.org
Thu Mar 22 15:37:27 CET 2007
Author: jp
Date: Thu Mar 22 15:37:26 2007
New Revision: 13563
URL: http://svn.erp5.org?rev=13563&view=rev
Log:
Crawling API
Modified:
erp5/trunk/products/ERP5/Tool/ContributionTool.py
Modified: erp5/trunk/products/ERP5/Tool/ContributionTool.py
URL: http://svn.erp5.org/erp5/trunk/products/ERP5/Tool/ContributionTool.py?rev=13563&r1=13562&r2=13563&view=diff
==============================================================================
--- erp5/trunk/products/ERP5/Tool/ContributionTool.py (original)
+++ erp5/trunk/products/ERP5/Tool/ContributionTool.py Thu Mar 22 15:37:26 2007
@@ -29,14 +29,15 @@
import cStringIO
import re
import string
-import urllib2
+import urllib2, urllib
from AccessControl import ClassSecurityInfo, getSecurityManager
from Globals import InitializeClass, DTMLFile
from Products.ERP5Type.Tool.BaseTool import BaseTool
from Products.ERP5Type import Permissions
from Products.ERP5 import _dtmldir
-from Products.ERP5.Document.BusinessTemplate import getChainByType
+from Products.ERP5.Document.Url import no_crawl_protocol_list, no_host_protocol_list
+
from zLOG import LOG
from DateTime import DateTime
from Acquisition import aq_base
@@ -58,16 +59,25 @@
metadata can be derived.
Configuration Scripts:
-
+
- ContributionTool_getPropertyDictFromFileName: receives file name and a
dict derived from filename by regular expression, and does any necessary
operations (e.g. mapping document type id onto a real portal_type).
+
+ Problems which are not solved:
+
+ - handling of relative links in HTML content (and possibly other
+ formats); some text rewriting is necessary.
+
"""
title = 'Contribution Tool'
id = 'portal_contributions'
meta_type = 'ERP5 Contribution Tool'
portal_type = 'Contribution Tool'
+ # Regular expressions
+ simple_normaliser = re.compile('#.*')
+
# Declarative Security
security = ClassSecurityInfo()
@@ -79,14 +89,35 @@
"""
Finds the appropriate portal type based on the file name
or if necessary the content of the document.
- """
+
+ NOTE: XXX This implementation can be greatly accelerated by
+ caching a dict which combines the results of
+ getContentTypeRegistryTypeDict and valid_portal_type_list.
+ """
+ def getContentTypeRegistryTypeDict():
+ result = {}
+ for id, pred in self.content_type_registry.listPredicates():
+ (p, type) = pred
+ result[type] = None
+ return result
+
portal_type = None
# We should only consider those portal_types which share the
- # same meta_type with the current object
+ # same constructor with the current object and which are not
+ # part of the definitions of content_type_registry. For
+ # example if content type registry has a definition for
+ # RSS feed, then there is no reason to consider this type
+ # whenever receiving some text/html content although both
+ # types share the same constructor. However, if Memo has
+ # same constructor as Text and Memo is not in content_type_registry
+ # then it should be considered.
valid_portal_type_list = []
- for pt in self.portal_types.objectValues():
- if pt.meta_type == document.meta_type:
- valid_portal_type_list.append(pt.id)
+ content_registry_type_dict = getContentTypeRegistryTypeDict()
+ portal_type_tool = self.portal_types
+ for pt in portal_type_tool.objectValues():
+ if hasattr(pt, 'factory') and pt.factory == portal_type_tool[document.getPortalType()].factory:
+ if not content_registry_type_dict.has_key(pt.id):
+ valid_portal_type_list.append(pt.id)
# Check if the filename tells which portal_type this is
portal_type_list = self.getPropertyDictFromFileName(file_name).get('portal_type', [])
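The selection rule in the hunk above can be summarised as follows. A minimal,
self-contained sketch, assuming plain dicts in place of the portal_types tool
and content_type_registry (not the actual ERP5 API):

    # Hypothetical factory mapping; in ERP5 this lives in portal_types.
    portal_type_factory = {
        'Text': 'addXMLObject',
        'Memo': 'addXMLObject',
        'RSS Feed': 'addXMLObject',
        'Image': 'addImage',
    }
    # Types already claimed by a content_type_registry predicate.
    content_registry_type_dict = {'RSS Feed': None}

    def valid_portal_types_for(document_type):
        # Keep the types which share the document's constructor, minus
        # the types which content_type_registry already handles.
        factory = portal_type_factory[document_type]
        return [pt for pt, f in portal_type_factory.items()
                if f == factory and pt not in content_registry_type_dict]

    print(valid_portal_types_for('Text'))  # ['Text', 'Memo'] (order may vary)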
@@ -111,7 +142,7 @@
if portal_type is None:
# We can not do anything anymore
- #return document.portal_type # XXX Wrong
+ #return document.portal_type # XXX Wrong or maybe right?
return None
if portal_type not in valid_portal_type_list:
@@ -123,6 +154,7 @@
security.declareProtected(Permissions.AddPortalContent, 'newContent')
def newContent(self, id=None, portal_type=None, url=None, container=None,
+ container_path=None,
discover_metadata=1, temp_object=0,
user_login=None, **kw):
"""
@@ -138,6 +170,9 @@
container -- if specified, it is possible to define
where to contribute the content. Else, ContributionTool
tries to guess.
+
+ container_path -- if specified, defines the container path
+ and has precedence over container
url -- if specified, content is download from the URL.
@@ -178,7 +213,8 @@
file = cStringIO.StringIO()
file.write(data)
file.seek(0)
- file_name = url.split('/')[-1]
+ file_name = url.split('/')[-1] or url.split('/')[-2]
+ file_name = self._encodeURL(file_name)
if hasattr(file, 'headers'):
headers = file.headers
if hasattr(headers, 'type'):
@@ -201,9 +237,9 @@
raise ValueError, "could not determine portal type"
# So we will simulate WebDAV to get an empty object
- # with PUT_factory - we provid<e the mime_type as
+ # with PUT_factory - we provide the mime_type as
# parameter
- ob = self.PUT_factory( file_name, mime_type, None )
+ ob = self.PUT_factory(file_name, mime_type, None)
# Raise an error if we could not guess the portal type
if ob is None:
@@ -214,7 +250,7 @@
document = BaseTool._getOb(self, file_name)
# Then edit the document contents (so that upload can happen)
- kw.setdefault('source_reference', file_name)
+ kw.setdefault('source_reference', file_name) # XXX redundant with discoverMetadata
document._edit(**kw)
if url: document.fromURL(url)
@@ -222,11 +258,17 @@
BaseTool._delObject(self, file_name)
# Move the document to where it belongs
- document = self._setObject(file_name, ob, user_login=user_login, container=container)
+ if container_path is not None:
+ container = self.getPortalObject().restrictedTraverse(container_path)
+ document = self._setObject(file_name, ob, user_login=user_login, container=container, id=id)
document = self._getOb(file_name) # Call _getOb to purge cache
+
+ # Notify workflows
+ document.notifyWorkflowCreated()
# Reindex it and return the document
document.reindexObject()
+ if document.getCrawlingDepth() > 0: document.activate().crawlContent()
return document
security.declareProtected( Permissions.AddPortalContent, 'newXML' )
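For context, here is a hedged sketch of how the parameters added to newContent
fit together when contributing from a URL; 'portal' and the module id are
assumptions, and crawling_depth is simply forwarded through **kw:

    # Hypothetical usage; 'portal' stands for an ERP5 portal object.
    contribution_tool = portal.portal_contributions
    document = contribution_tool.newContent(
        url='http://www.example.com/page.html',  # content downloaded from here
        container_path='web_page_module',        # takes precedence over container
        crawling_depth=2)                        # > 0 triggers crawlContent()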
@@ -273,7 +315,7 @@
return property_dict
# WebDAV virtual folder support
- def _setObject(self, name, ob, user_login=None, container=None):
+ def _setObject(self, name, ob, user_login=None, container=None, id=None):
"""
The strategy is to let NullResource.PUT do everything as
usual and at the last minute put the object in a different
@@ -321,7 +363,10 @@
module = self.getDefaultModule(ob.portal_type)
else:
module = container
- new_id = module.generateNewId()
+ if id is None:
+ new_id = module.generateNewId()
+ else:
+ new_id = id
ob.id = new_id
module._setObject(new_id, ob)
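The final hunk below adds the crawling API itself. Here is a minimal,
Zope-free model of its depth handling (the toy link graph and all names are
illustrative only):

    # Toy link graph standing in for getContentURLList().
    links = {'a': ['b', 'c'], 'b': ['c'], 'c': ['a']}

    def crawl(url, depth, seen):
        # Depth-limited crawl mirroring crawlContent's scheduling logic.
        if depth <= 0:
            return
        for linked in links.get(url, []):
            if linked not in seen:
                # Unseen URL: create it and crawl one level deeper, as
                # newContentFromURL does through activities.
                seen[linked] = depth - 1
                crawl(linked, depth - 1, seen)
            else:
                # Existing document: keep the maximum of both depths,
                # as crawlContent does with _setCrawlingDepth().
                seen[linked] = max(seen[linked], depth - 1)

    seen = {}
    crawl('a', 2, seen)
    print(seen)  # {'b': 1, 'c': 1}: 'c' is first seen at depth 0, then raised to 1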
@@ -389,4 +434,122 @@
return wrapper(object_list)
+ # Crawling methods
+ def _normaliseURL(self, url, base_url=None):
+ """
+ Returns a normalised version of the url so
+ that we do not download the same content twice.
+ URL normalisation is an important part of any crawler.
+ The current implementation is obviously simplistic.
+ Refer to http://en.wikipedia.org/wiki/Web_crawler
+ and study Harvestman for more ideas.
+ """
+ url = self.simple_normaliser.sub('', url)
+ url_split = url.split(':')
+ url_protocol = url_split[0]
+ if url_protocol in no_host_protocol_list:
+ return url
+ if base_url and len(url_split) == 1:
+ # Make relative URL absolute
+ url = '%s/%s' % (base_url, url)
+ return url
+
+ def _encodeURL(self, url):
+ """
+ Returns the URL as an ID. The ID should be chosen in such
+ a way that it is optimal with HBTreeFolder (i.e. so that
+ distribution of access time on a cluster is possible).
+
+ NOTE: an alternate approach is based on a url table
+ and catalog lookup. Is it faster? Not sure, since
+ we must insert objects into btrees anyway, and this
+ is similar in cost to accessing them.
+ """
+ url = urllib.quote(url, safe='')
+ url = url.replace('_', '__')
+ url = url.replace('%', '_')
+ return url
+
+ security.declareProtected(Permissions.AddPortalContent, 'crawlContent')
+ def crawlContent(self, content):
+ """
+ Analyses content and downloads linked pages
+
+ XXX: still missing is the conversion of local hrefs in the
+ content into something valid.
+ """
+ depth = content.getCrawlingDepth()
+ if depth <= 0:
+ # Do nothing if crawling depth is reached
+ return
+ base_url = content.getContentBaseURL()
+ url_list = map(lambda url: self._normaliseURL(url, base_url), set(content.getContentURLList()))
+ for url in set(url_list):
+ # Some url protocols should not be crawled
+ if url.split(':')[0] in no_crawl_protocol_list:
+ continue
+ #if content.getParentValue()
+ # in place of not ?
+ container = content.getParentValue()
+ # Calculate the id under which content will be stored
+ id = self._encodeURL(url)
+ # Try to access the document if it already exists
+ document = container.get(id, None)
+ if document is None:
+ # XXX - This call is not working due to missing group_method_id
+ # therefore, multiple calls happen in parallel and eventually fail
+ # (the same URL is created multiple times)
+ self.activate(activity="SQLQueue").newContentFromURL(container_path=container.getRelativeUrl(),
+ id=id, url=url, crawling_depth=depth - 1)
+ else:
+ # Update depth to the max. of the two values
+ new_depth = max(depth - 1, document.getCrawlingDepth())
+ document._setCrawlingDepth(new_depth)
+ # And activate updateContentFromURL on existing document
+ next_date = document.getNextAlarmDate()
+ document.activate(at_date=next_date).updateContentFromURL()
+
+ security.declareProtected(Permissions.AddPortalContent, 'updateContentFromURL')
+ def updateContentFromURL(self, content):
+ """
+ Updates an existing content.
+ """
+ # Step 1: download new content
+ url = content.asURL()
+ data = urllib2.urlopen(url).read()
+ file = cStringIO.StringIO()
+ file.write(data)
+ file.seek(0)
+ # Step 2: compare and update if necessary (md5)
+ # do here some md5 stuff to compare contents...
+ if 1:
+ content._edit(file=file)
+ # Step 3: convert to base format
+ content.convertToBaseFormat()
+ # Step 4: activate populate (unless interaction workflow does it)
+ content.activate().populateContent()
+ # Step 5: activate crawlContent
+ content.activate().crawlContent()
+ else:
+ # XXX
+ # We must handle the case for which content type has changed in between
+ pass
+ # Step 6: activate updateContentFromURL at next period
+ next_date = content.getNextAlarmDate()
+ content.activate(at_date=next_date).updateContentFromURL()
+
+ security.declareProtected(Permissions.AddPortalContent, 'newContentFromURL')
+ def newContentFromURL(self, **kw):
+ """
+ A wrapper method for newContent which provides extra safety
+ in case of errors (i.e. download, access, conflict, etc.).
+ The method is able to handle a certain number of exceptions
+ and can postpone itself through an activity based on
+ the type of exception (e.g. for a 404, postpone 1 day), using
+ the at_date parameter and some standard values.
+
+ NOTE: implementation needs to be done.
+ """
+ return self.newContent(**kw)
+
InitializeClass(ContributionTool)
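To illustrate the two URL helpers added above, here is a standalone sketch of
their behaviour; the class attribute is inlined and the no_host_protocol_list
values are assumed (the real ones come from Products.ERP5.Document.Url):

    import re
    import urllib

    simple_normaliser = re.compile('#.*')
    no_host_protocol_list = ['mailto', 'news', 'javascript']  # assumed values

    def normalise_url(url, base_url=None):
        url = simple_normaliser.sub('', url)       # drop any '#fragment'
        url_split = url.split(':')
        if url_split[0] in no_host_protocol_list:  # e.g. mailto: links
            return url
        if base_url and len(url_split) == 1:       # no protocol: relative link
            url = '%s/%s' % (base_url, url)
        return url

    def encode_url(url):
        url = urllib.quote(url, safe='')  # percent-encode the whole URL
        url = url.replace('_', '__')      # keep original '_' reversible
        url = url.replace('%', '_')       # '%' is not valid in a Zope id
        return url

    print(normalise_url('page.html#top', 'http://www.example.com'))
    # http://www.example.com/page.html
    print(encode_url('http://www.example.com/page.html'))
    # http_3A_2F_2Fwww.example.com_2Fpage.html

(Python 2, matching the module; on Python 3, urllib.quote lives in urllib.parse.)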
More information about the Erp5-report mailing list