[Erp5-report] r10447 - in /erp5/trunk/products/ERP5OOo: Document/ PropertySheet/
nobody at svn.erp5.org
nobody at svn.erp5.org
Sun Oct 1 21:27:54 CEST 2006
Author: bartek
Date: Sun Oct 1 21:27:52 2006
New Revision: 10447
URL: http://svn.erp5.org?rev=10447&view=rev
Log:
external web page, supports recursive spidering
Added:
erp5/trunk/products/ERP5OOo/Document/ExternalWebPage.py
Modified:
erp5/trunk/products/ERP5OOo/PropertySheet/ExternalDocument.py
Added: erp5/trunk/products/ERP5OOo/Document/ExternalWebPage.py
URL: http://svn.erp5.org/erp5/trunk/products/ERP5OOo/Document/ExternalWebPage.py?rev=10447&view=auto
==============================================================================
--- erp5/trunk/products/ERP5OOo/Document/ExternalWebPage.py (added)
+++ erp5/trunk/products/ERP5OOo/Document/ExternalWebPage.py Sun Oct 1 21:27:52 2006
@@ -1,0 +1,207 @@
+
+
+##############################################################################
+#
+# Copyright (c) 2002-2006 Nexedi SARL and Contributors. All Rights Reserved.
+#
+# WARNING: This program as such is intended to be used by professional
+# programmers who take the whole responsability of assessing all potential
+# consequences resulting from its eventual inadequacies and bugs
+# End users who are looking for a ready-to-use solution with commercial
+# garantees and support are strongly adviced to contract a Free Software
+# Service Company
+#
+# This program is Free Software; you can redistribute it and/or
+# modify it under the terms of the GNU General Public License
+# as published by the Free Software Foundation; either version 2
+# of the License, or (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, write to the Free Software
+# Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
+#
+##############################################################################
+
+from AccessControl import ClassSecurityInfo
+from Products.CMFCore.WorkflowCore import WorkflowMethod
+from Products.ERP5Type import Permissions, PropertySheet, Constraint, Interface
+from Products.ERP5OOo.Document.DMSFile import stripHtml
+from Products.ERP5OOo.Document.ExternalDocument import ExternalDocument, SpiderException
+
+import mimetypes, re, urllib
+from htmlentitydefs import name2codepoint
+
+rx=[]
+rx.append(re.compile('<!--.*?-->',re.DOTALL|re.MULTILINE)) # clear comments (sometimes JavaScript code in comments contains > chars)
+rx.append(re.compile('<[^>]*?>',re.DOTALL|re.MULTILINE)) # clear tags
+rx.append(re.compile('\s+')) # compress multiple spaces
+
+def clearHtml(s):
+ for r in rx:
+ s=r.sub(" ",s)
+ return s
+
+
+tgtencoding='utf-8'
+encodings=['iso-8859-2','iso-8859-15','windows-1250']
+rx_charset=re.compile('<meta.*charset="?([\w\d\-]*)',re.DOTALL|re.MULTILINE|re.IGNORECASE)
+
+def recode(s):
+ """
+ maybe it can be useful system-wide
+ """
+ _encodings=encodings[:] # local copy
+ _encodings.insert(0,tgtencoding) # if not declared or declared wrongly, we try
+ m=rx_charset.search(s)
+ if m and len(m.groups())>0:
+ enc=m.groups()[0].lower()
+ if enc==tgtencoding:
+ return s
+ if enc in _encodings:
+ _encodings.remove(enc)
+ _encodings.insert(0,enc) # we'll start from what we've found
+ for enc in _encodings:
+ try:
+ return s.decode(enc).encode('utf-8')
+ except UnicodeDecodeError, LookupError:
+ pass
+ raise CanNotDecode('sorry')
+
+def _convertEntities(txt,rx,mapper=None):
+ def repl(code):
+ if mapper:
+ code=mapper.get(code)
+ if code is None:
+ return ''
+ return unichr(int(code)).encode(tgtencoding)
+ res=re.split(rx,txt)
+ res[1::2]=map(repl,res[1::2]) # Isn't it beautiful? :)
+ return ''.join(res)
+
+rx_chars=re.compile('&#(\d{3});')
+rx_ents=re.compile('&(\w{1,6});')
+
+def convertEntities(txt):
+ txt=_convertEntities(txt,rx_chars)
+ txt=_convertEntities(txt,rx_ents, name2codepoint)
+ return txt
+
+class ExternalWebPage(ExternalDocument):
+ """
+ caching sources from outside
+ """
+ # CMF Type Definition
+ meta_type = 'ERP5 External Web Page'
+ portal_type = 'External Web Page'
+ isPortalContent = 1
+ isRADContent = 1
+
+ # Declarative security
+ security = ClassSecurityInfo()
+ security.declareObjectProtected(Permissions.AccessContentsInformation)
+
+ # Default Properties
+ property_sheets = ( PropertySheet.Base
+ , PropertySheet.CategoryCore
+ , PropertySheet.DublinCore
+ , PropertySheet.Version
+ , PropertySheet.Reference
+ , PropertySheet.DMSFile
+ , PropertySheet.Document
+ , PropertySheet.Url
+ , PropertySheet.ExternalDocument
+ )
+
+ def _findTopObject(self):
+ '''
+ find the top object from which the spidering begun
+ we search upwards untill we find or reach portal object
+ the top object is the one that is maintaining the dictionary
+ I think we have to do it instead of using simple acquisition
+ because we have to find a non-empty one
+ '''
+ ob=self
+ if hasattr(self,'urldict') and len(self.urldict)>0:
+ return self
+ else:
+ while 1:
+ ob=ob.aq_parent
+ if ob==self.getPortalObject():
+ return self
+ if hasattr(ob,'urldict') and len(ob.urldict)>0:
+ return ob
+
+ security.declareProtected(Permissions.ModifyPortalContent,'addUrl')
+ def addUrl(self,url):
+ '''
+ record url that has already been spidered
+ '''
+ self.urldict[url]=1
+ self._p_changed=1
+
+ security.declareProtected(Permissions.ModifyPortalContent,'checkUrl')
+ def checkUrl(self,url):
+ '''
+ check if the url has already been spidered
+ '''
+ return self.urldict.has_key(url)
+
+ security.declareProtected(Permissions.ModifyPortalContent,'resetTopObject')
+ def resetTopObject(self):
+ '''
+ reset the url dictionary
+ remember do it before you start recursive spidering
+ '''
+ self.urldict={}
+ self._p_changed=1
+
+ def _processData(self,s):
+ top=self._findTopObject()
+ # record my url in top object
+ top.addUrl(self.getQualifiedUrl())
+ # remove current subobjects
+ self.manage_delObjects([i.getId() for i in self.searchFolder(portal_type='External Web Page')])
+ if self.getOptionRecursively()>0 and self.getRecursionDepth()>0:
+ # first find links in text
+ rx=re.compile('<a[^>]*href=[\'"](.*?)[\'"]',re.IGNORECASE)
+ for ref in re.findall(rx, s):
+ if ref.startswith('javascript'):
+ continue
+ # XXX not sure where to store those already spidered
+ # for now, the only precaution against infinite loop is recursion depth
+ # select internal links
+ baseref='/'.join(self.getQualifiedUrl().split('/')[:-1])
+ if not ref.startswith('http'):
+ # complete relative paths
+ ref=baseref+'/'+ref
+ # create subobjects
+ if ref.startswith(baseref) and not top.checkUrl(ref):
+ n=self.newContent(portal_type='External Web Page')
+ # set coordinates
+ n.setUrlProtocol('http')
+ n.setUrlString(ref)
+ n.setOptionRecursively(1)
+ n.setRecursionDepth(self.getRecursionDepth()-1)
+ # copy attributes
+ for atr in self.portal_types[self.getPortalType()].getInstanceBaseCategoryList():
+ n.setProperty(atr,self.getProperty(atr))
+ n.activate(activity='SQLQueue').ExternalDocument_spiderAndSetState()
+ # process self
+ # here we check encoding and convert to UTF8
+ try:
+ s=recode(s)
+ except CanNotDecode:
+ self.setStatusMessage("Spidered on %s, %i chars, but could not decode" % (self._time(), chars))
+ return False
+ s=stripHtml(s) # remove headers, doctype and the like
+ s=clearHtml(s) # remove tags
+ s=convertEntities(s) # convert charrefs and named entities
+ return s
+
+
+# vim: filetype=python syntax=python shiftwidth=2
Modified: erp5/trunk/products/ERP5OOo/PropertySheet/ExternalDocument.py
URL: http://svn.erp5.org/erp5/trunk/products/ERP5OOo/PropertySheet/ExternalDocument.py?rev=10447&r1=10446&r2=10447&view=diff
==============================================================================
--- erp5/trunk/products/ERP5OOo/PropertySheet/ExternalDocument.py (original)
+++ erp5/trunk/products/ERP5OOo/PropertySheet/ExternalDocument.py Sun Oct 1 21:27:52 2006
@@ -7,5 +7,14 @@
'description' : 'message about status',
'type' : 'string',
'mode' : 'w' },
+ { 'id' : 'option_recursively',
+ 'description' : 'do we want recursive spidering (meaningless in some classes)',
+ 'type' : 'int',
+ 'mode' : 'w'},
+ { 'id' : 'recursion_depth',
+ 'description' : 'how deep should recursive spidering be (0 - no recursion) (meaningless in some classes)',
+ 'type' : 'int',
+ 'default' : 5,
+ 'mode' : 'w'},
)
More information about the Erp5-report
mailing list