[Erp5-report] r13564 - /erp5/trunk/products/ERP5/Document/Document.py
nobody at svn.erp5.org
nobody at svn.erp5.org
Thu Mar 22 15:43:22 CET 2007
Author: jp
Date: Thu Mar 22 15:43:20 2007
New Revision: 13564
URL: http://svn.erp5.org?rev=13564&view=rev
Log:
Initial crawling API
Modified:
erp5/trunk/products/ERP5/Document/Document.py
Modified: erp5/trunk/products/ERP5/Document/Document.py
URL: http://svn.erp5.org/erp5/trunk/products/ERP5/Document/Document.py?rev=13564&r1=13563&r2=13564&view=diff
==============================================================================
--- erp5/trunk/products/ERP5/Document/Document.py (original)
+++ erp5/trunk/products/ERP5/Document/Document.py Thu Mar 22 15:43:20 2007
@@ -190,10 +190,13 @@
so-called "External Source" (refer to ExternalSource class
for more information).
- External Documents may be downloaded once or at
+ External Documents may be downloaded once or updated at
regular intervals. The latter can be useful to update the content
of an external source. Previous versions may be stored
- in place or kept in a separate file.
+ in place or kept in a separate file. This feature
+ is known as the crawling API. It is mostly implemented
+ in ContributionTool with wrappers in the Document class.
+ It can be useful for creating a small search engine.
There are currently two types of Document subclasses:
@@ -212,9 +215,12 @@
Document classes which implement conversion should use
the ConversionCacheMixin class so that converted values are
stored inside ZODB and do not need to be recalculated.
-
- XXX IDEA - ISSUE: generic API for conversion.
- converted_document = document.convert(...)
+ More generally, conversion should be achieved through
+ the convert method and other methods of the conversion
+ API (convertToBaseFormat, etc.). Moreover, any Document
+ subclass must be able to convert documents to text
+ (asText method) and HTML (asHTML method). Text is required
+ for full text indexing. HTML is required for crawling.
Instances can be created directly, or via portal_contributions tool
which manages document ingestion process whereby a file can be uploaded
@@ -254,7 +260,7 @@
getSearchableReferenceList (Base)
getSearchableText (Base)
- index_html (Document)
+ index_html (overridden in Document subclasses)
Methods for handling relations are implemented either in
Document class or in Base class:
@@ -304,8 +310,14 @@
on this document. Implementation depends on revision numbering policy which
can be very different. Interaction workflow should call setNewRevision method.
+ * Document_populateContent - analyses the document content and produces
+ subcontent based on it (ex. images, news, etc.). This script can
+ involve for example an XSLT transformation to process XML.
+
Subcontent: documents may include subcontent (files, images, etc.)
- so that publication of rich content can be path independent.
+ so that publication of rich content can be path independent. Subcontent
+ can also be used to help the rendering in HTML of complex documents
+ such as ODF documents.
Consistency checking:
Default implementation uses DocumentReferenceConstraint to check if the
@@ -321,6 +333,9 @@
isDocument = 1
__dav_collection__=0
+ # Regular expressions
+ href_parser = re.compile('<a[^>]*href=[\'"](.*?)[\'"]',re.IGNORECASE)
+
# Declarative security
security = ClassSecurityInfo()
security.declareObjectProtected(Permissions.AccessContentsInformation)
@@ -332,8 +347,9 @@
, PropertySheet.DublinCore
, PropertySheet.Version
, PropertySheet.Document
+ , PropertySheet.Snapshot
+ , PropertySheet.ExternalDocument
, PropertySheet.Url
- , PropertySheet.Snapshot
, PropertySheet.Periodicity
)
@@ -623,7 +639,7 @@
"""
Returns true if no other document of the same
portal_type and reference has the same version and language
-
+
XXX should delegate to script with proxy roles
XXX-JPS revision ?
"""
@@ -805,7 +821,7 @@
### Metadata discovery and ingestion methods
security.declareProtected(Permissions.ModifyPortalContent, 'discoverMetadata')
- def discoverMetadata(self, file_name, user_login=None):
+ def discoverMetadata(self, file_name, user_login=None): # XXX Was filename always there ? at least make it optional
"""
This is the main metadata discovery function - controls the process
of discovering data from various sources. The discovery itself is
@@ -814,18 +830,19 @@
file_name - this parameter is a file name of the form "AA-BBB-CCC-223-en"
user_login - this is a login string of a person; can be None if the user is
- currently logged in, then we'll get him from session
+ currently logged in, then we'll get him from session
"""
# Get the order
# Preference is made of a sequence of 'user_login', 'content', 'file_name', 'input'
- self.setSourceReference(file_name)
+ self._setSourceReference(file_name) # XXX Who added this ???
+ # filename is often undefined....
method = self._getTypeBasedMethod('getPreferredDocumentMetadataDiscoveryOrderList',
fallback_script_id = 'Document_getPreferredDocumentMetadataDiscoveryOrderList')
order_list = list(method())
order_list.reverse()
- # Start with everything until content - build a dictionnary according to the order
+ # Start with everything until content - build a dictionary according to the order
content_index = order_list.index('content')
kw = {}
first_list = order_list[:content_index]
@@ -843,7 +860,7 @@
result = method()
if result is not None:
kw.update(result)
-
+
# Edit content
try:
del(kw['portal_type'])
@@ -953,7 +970,7 @@
return self.convert(format='html')
# Base format support
- security.declareProtected(Permissions.View, 'convertToBaseFormat')
+ security.declareProtected(Permissions.ModifyPortalContent, 'convertToBaseFormat')
def convertToBaseFormat(self, REQUEST=None):
"""
Converts the content of the document to a base format
@@ -1039,6 +1056,7 @@
pass
# Transformation API
+ security.declareProtected(Permissions.ModifyPortalContent, 'populateContent')
def populateContent(self):
"""
Populates the Document with subcontent based on the
@@ -1050,25 +1068,62 @@
individual records. Other application: populate
an HTML text document with its images, used in
conversion with convertToBaseFormat.
-
- NOTE: to be implemented as typed base method.
- """
- pass
+ """
+ try:
+ method = self._getTypeBasedMethod('populateContent')
+ except KeyError, AttributeError:
+ method = None
+ if method is not None: method()
# Crawling API
+ security.declareProtected(Permissions.AccessContentsInformation, 'getContentURLList')
def getContentURLList(self):
"""
Returns a list of URLs referenced by the content of this document.
-
- NOTE: to be implemented as typed base method or
- by subclass.
- """
- pass
-
+ Default implementation consists in analysing the document
+ converted to HTML. Subclasses may overload this method
+ if necessary. However, it is better to extend the conversion
+ methods in order to produce valid HTML, which is useful to
+ many people, rather than overload this method which is only
+ useful for crawling.
+ """
+ html_content = self.asStrippedHTML()
+ return re.findall(self.href_parser, str(html_content))
+
+ security.declareProtected(Permissions.ModifyPortalContent, 'updateContentFromURL')
def updateContentFromURL(self):
"""
Download and update content of this document from its source URL.
-
- NOTE: to be implemented here.
- """
- pass
+ Implementation is handled by ContributionTool.
+ """
+ self.portal_contributions.updateContentFromURL(self)
+
+ security.declareProtected(Permissions.ModifyPortalContent, 'crawlContent')
+ def crawlContent(self):
+ """
+ Initialises the crawling process on the current document.
+ """
+ self.portal_contributions.crawlContent(self)
+
+ security.declareProtected(Permissions.AccessContentsInformation, 'getContentBaseURL')
+ def getContentBaseURL(self):
+ """
+ Returns the content base URL based on the actual content or
+ on its URL.
+ """
+ # XXX TODO - try to retrieve base URL from content
+ # If no base_url defined, define the base URL from our URL
+ base_url = self.asURL()
+ base_url_list = base_url.split('/')
+ if len(base_url_list): base_url = '/'.join(base_url_list[:-1])
+ return base_url
+
+ # Alarm date calculation
+ security.declareProtected(Permissions.AccessContentsInformation, 'getNextAlarmDate')
+ def getNextAlarmDate(self):
+ """
+ This method is only there to have something to test.
+ Serious refactoring of Alarm, Periodicity and CalendarPeriod
+ classes is needed.
+ """
+ return DateTime() + .01
More information about the Erp5-report
mailing list