[Erp5-report] r10487 - /erp5/trunk/products/ERP5OOo/Document/ExternalWebPage.py

nobody at svn.erp5.org nobody at svn.erp5.org
Mon Oct 2 16:25:04 CEST 2006


Author: bartek
Date: Mon Oct  2 16:25:02 2006
New Revision: 10487

URL: http://svn.erp5.org?rev=10487&view=rev
Log:
Scan only text/html content, strip #anchor fragments from links (skipping links that are only an anchor), and don't follow mailto links.

Modified:
    erp5/trunk/products/ERP5OOo/Document/ExternalWebPage.py

Modified: erp5/trunk/products/ERP5OOo/Document/ExternalWebPage.py
URL: http://svn.erp5.org/erp5/trunk/products/ERP5OOo/Document/ExternalWebPage.py?rev=10487&r1=10486&r2=10487&view=diff
==============================================================================
--- erp5/trunk/products/ERP5OOo/Document/ExternalWebPage.py (original)
+++ erp5/trunk/products/ERP5OOo/Document/ExternalWebPage.py Mon Oct  2 16:25:02 2006
@@ -160,7 +160,11 @@
     self.urldict={}
     self._p_changed=1
 
-  def _processData(self,s):
+  def _processData(self,s, inf):
+    # since this is a web page, we don't want anything else
+    # XXX we should find another way - like this, we end up with empty draft objects
+    if (inf.getmaintype(),inf.getsubtype())!=('text','html'):
+      raise SpiderException(100,'this is %s/%s' % (inf.getmaintype(),inf.getsubtype()))
     top=self._findTopObject()
     # record my url in top object
     top.addUrl(self.getQualifiedUrl())
@@ -170,11 +174,11 @@
       # first find links in text
       rx=re.compile('<a[^>]*href=[\'"](.*?)[\'"]',re.IGNORECASE)
       for ref in re.findall(rx, s):
-        if ref.startswith('javascript'):
+        # eliminate anchors and specials, select internal links
+        if ref.startswith('javascript') or ref.startswith('mailto'):
           continue
-        # XXX not sure where to store those already spidered
-        # for now, the only precaution against infinite loop is recursion depth
-        # select internal links
+        ref=re.sub('#.*','',ref)
+        if ref=='':continue
         baseref='/'.join(self.getQualifiedUrl().split('/')[:-1])
         if not ref.startswith('http'):
           # complete relative paths




More information about the Erp5-report mailing list