[Erp5-report] r9851 - /erp5/trunk/products/ERP5OOo/Document/ExternalDocument.py
nobody at svn.erp5.org
nobody at svn.erp5.org
Tue Sep 12 21:29:26 CEST 2006
Author: bartek
Date: Tue Sep 12 21:29:21 2006
New Revision: 9851
URL: http://svn.erp5.org?rev=9851&view=rev
Log:
a draft implementation of external data source (http only)
Added:
erp5/trunk/products/ERP5OOo/Document/ExternalDocument.py
Added: erp5/trunk/products/ERP5OOo/Document/ExternalDocument.py
URL: http://svn.erp5.org/erp5/trunk/products/ERP5OOo/Document/ExternalDocument.py?rev=9851&view=auto
==============================================================================
--- erp5/trunk/products/ERP5OOo/Document/ExternalDocument.py (added)
+++ erp5/trunk/products/ERP5OOo/Document/ExternalDocument.py Tue Sep 12 21:29:21 2006
@@ -1,0 +1,145 @@
+
+##############################################################################
+#
+# Copyright (c) 2002-2006 Nexedi SARL and Contributors. All Rights Reserved.
+#
+# WARNING: This program as such is intended to be used by professional
+# programmers who take the whole responsibility of assessing all potential
+# consequences resulting from its eventual inadequacies and bugs.
+# End users who are looking for a ready-to-use solution with commercial
+# guarantees and support are strongly advised to contract a Free Software
+# Service Company
+#
+# This program is Free Software; you can redistribute it and/or
+# modify it under the terms of the GNU General Public License
+# as published by the Free Software Foundation; either version 2
+# of the License, or (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, write to the Free Software
+# Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
+#
+##############################################################################
+
+from AccessControl import ClassSecurityInfo
+from Products.CMFCore.WorkflowCore import WorkflowMethod
+from Products.ERP5Type import Permissions, PropertySheet, Constraint, Interface
+from Products.ERP5Type.Cache import CachingMethod
+#from Products.ERP5.Document.Url import Url
+from Products.ERP5OOo.Document.DMSFile import DMSFile, stripHtml
+
+import mimetypes, re, urllib
+from DateTime import DateTime
+
# Patterns applied, in this order, by clearHtml(); every match is replaced
# by a single space.
rx = [
  # comments go first: JavaScript code inside them sometimes contains '>'
  re.compile(r'<!--.*?-->', re.DOTALL|re.MULTILINE),
  # any remaining tag
  re.compile(r'<[^>]*?>', re.DOTALL|re.MULTILINE),
  # numeric character references only; named entities are not matched
  # (open question kept from the author: how to safely clear things like
  # &raquo;?)
  re.compile(r'&#[^;]+;'),
  # collapse every run of whitespace
  re.compile(r'\s+'),
]

def clearHtml(s):
  """
    Turn a piece of HTML into plain text.

    Non-breaking-space entities and every match of the patterns in rx
    (comments, tags, numeric character references, whitespace runs) are
    each replaced by one space. The result is not stripped, so it may
    start or end with a single space.
  """
  # NOTE(review): the archived copy of this file shows a plain ' ' as the
  # replaced text, which would make the call a no-op; presumably the entity
  # '&nbsp;' was decoded by the archive - confirm against the repository.
  # Done before the patterns run so that the whitespace pattern also
  # collapses the spaces produced here.
  s = s.replace('&nbsp;', ' ')
  for r in rx:
    s = r.sub(' ', s)
  return s
+
class SpiderException(Exception):
  """
    Error raised by Opener when the remote server answers with an HTTP
    error status.

    The exception text is "<code>: <msg>". The two parts are also kept
    separately, as the attributes code and msg, so that callers do not
    have to parse the text to get the status code back.
  """

  def __init__(self, code, msg):
    """
      code -- numeric status code (formatted with %i)
      msg -- reason text that came with the code
    """
    self.code = code
    self.msg = msg
    Exception.__init__(self, "%i: %s" % (code, msg))
+
class Opener(urllib.FancyURLopener):
  """
    URL opener which turns HTTP error answers into SpiderException.
  """

  def http_error_default(self, url, fp, code, msg, headers):
    """
      Handler urllib calls for HTTP error statuses that have no dedicated
      handler: always raises SpiderException(code, msg).

      fp is the file-like object holding the error response.
    """
    # Nobody is going to read the error body, so release the connection
    # before raising instead of leaving it to garbage collection.
    if fp is not None:
      fp.close()
    raise SpiderException(code, msg)
+
class ExternalDocument(DMSFile):
  """
    A document whose text content is a cached copy of an outside source.

    Only web pages fetched over http are supported so far (see protocols).
  """
  # CMF Type Definition
  meta_type = 'ERP5 External Document'
  portal_type = 'External Document'
  isPortalContent = 1
  isRADContent = 1

  # Declarative security
  security = ClassSecurityInfo()
  security.declareObjectProtected(Permissions.AccessContentsInformation)

  # Default Properties
  property_sheets = ( PropertySheet.Base
                    , PropertySheet.CategoryCore
                    , PropertySheet.DublinCore
                    , PropertySheet.Version
                    , PropertySheet.Reference
                    , PropertySheet.DMSFile
                    , PropertySheet.Document
                    , PropertySheet.Url
                    , PropertySheet.ExternalDocument
                    )

  # supported sources, as (title, url protocol) pairs
  protocols = (('Web page', 'http'),)

  # text_content is added to the attributes listed by DMSFile
  searchable_attrs = DMSFile.searchable_attrs + ('text_content',)

  security.declareProtected(Permissions.View, 'getProtocolList')
  def getProtocolList(self):
    """
      Return the list of supported url protocols, i.e. the second item
      of every pair in protocols.
    """
    return [protocol for title, protocol in self.protocols]

  security.declareProtected(Permissions.View, 'getProtocolItemList')
  def getProtocolItemList(self):
    """
      Return protocols as is: a tuple of (title, url protocol) pairs.
    """
    return self.protocols

  # NOTE(review): declared with View like the other methods of the class,
  # although it changes the document - confirm the intended permission.
  security.declareProtected(Permissions.View, 'spiderSource')
  def spiderSource(self):
    """
      Download the page at getQualifiedUrl() and keep its text.

      The downloaded data goes through stripHtml and clearHtml and is
      stored with setTextContent. A status message is set in every case.

      Returns True when text was stored, False when the download failed
      or gave nothing back (setTextContent is not called then).

      Spidering policy questions:
      - refreshing: how often
      - what to do if site not accessible - erase text_content or keep it?
      Shall we delegate these questions to preferences?
      Or use portal_alarms?
    """
    import sys
    op = Opener()
    try:
      f = op.open(self.getQualifiedUrl())
      # reading can fail as well as opening; either way the handle must
      # not stay open
      try:
        s = f.read()
      finally:
        f.close()
    except (IOError, SpiderException):
      # sys.exc_info() instead of binding the exception in the except
      # clause: that syntax differs between Python versions
      e = sys.exc_info()[1]
      self.setStatusMessage("Tried on %s: %s" % (self._time(), str(e)))
      return False
    # size of the raw download, reported in the status message
    chars = len(s)
    if not chars:
      self.setStatusMessage("Tried on %s: got empty string" % self._time())
      return False
    # XXX here we will check encoding and convert to UTF8
    # and decode htmlentities
    s = stripHtml(s) # remove headers, doctype and the like
    s = clearHtml(s) # remove tags and entities
    self.setTextContent(s)
    self.setStatusMessage("Spidered on %s, %i chars" % (self._time(), chars))
    return True

  security.declareProtected(Permissions.View, 'getQualifiedUrl')
  def getQualifiedUrl(self):
    """
      Return "<url protocol>://<url string>"; a part which is not set
      counts as an empty string.

      this should be in the Url, not here
      otherwise why does the url have a property 'url_protocol'?
    """
    return (self.getUrlProtocol() or '')+'://'+(self.getUrlString() or '')

  def _time(self):
    """
      Return the current time as 'YYYY/MM/DD HH:MM:SS', the form used in
      the status messages.
    """
    return DateTime().strftime('%Y/%m/%d %H:%M:%S')
+
+
+# vim: syntax=python shiftwidth=2
+
More information about the Erp5-report
mailing list