[Erp5-report] r10538 - /erp5/trunk/products/ERP5OOo/Document/ExternalWebPage.py
nobody at svn.erp5.org
nobody at svn.erp5.org
Wed Oct 4 17:35:31 CEST 2006
Author: bartek
Date: Wed Oct 4 17:35:29 2006
New Revision: 10538
URL: http://svn.erp5.org?rev=10538&view=rev
Log:
fixed url recording; clean multiple slashes from urls;
Modified:
erp5/trunk/products/ERP5OOo/Document/ExternalWebPage.py
Modified: erp5/trunk/products/ERP5OOo/Document/ExternalWebPage.py
URL: http://svn.erp5.org/erp5/trunk/products/ERP5OOo/Document/ExternalWebPage.py?rev=10538&r1=10537&r2=10538&view=diff
==============================================================================
--- erp5/trunk/products/ERP5OOo/Document/ExternalWebPage.py (original)
+++ erp5/trunk/products/ERP5OOo/Document/ExternalWebPage.py Wed Oct 4 17:35:29 2006
@@ -166,8 +166,6 @@
if (inf.getmaintype(),inf.getsubtype())!=('text','html'):
raise SpiderException(100,'this is %s/%s' % (inf.getmaintype(),inf.getsubtype()))
top=self._findTopObject()
- # record my url in top object
- top.addUrl(self.getQualifiedUrl())
# remove current subobjects
self.manage_delObjects([i.getId() for i in self.searchFolder(portal_type='External Web Page')])
if self.getOptionRecursively()>0 and self.getRecursionDepth()>0:
@@ -179,12 +177,18 @@
continue
ref=re.sub('#.*','',ref)
if ref=='':continue
- baseref='/'.join(self.getQualifiedUrl().split('/')[:-1])
+ #baseref='/'.join(self.getQualifiedUrl().split('/'))
+ baseref=self.getQualifiedUrl()
if not ref.startswith('http'):
# complete relative paths
ref=baseref+'/'+ref
+ # eliminate multiple slashes
+ rx=re.compile('([^:]{1})\/{2,}')
+ ref=re.sub(rx,'\1/',ref)
# create subobjects
if ref.startswith(baseref) and not top.checkUrl(ref):
+ # record my url in top object
+ top.addUrl(ref)
n=self.newContent(portal_type='External Web Page')
# set coordinates
n.setUrlProtocol('http')
More information about the Erp5-report mailing list