[Erp5-report] r41759 jm - in /erp5/trunk/products: ERP5/Tool/ ERP5Type/

nobody at svn.erp5.org nobody at svn.erp5.org
Fri Dec 24 16:52:22 CET 2010


Author: jm
Date: Fri Dec 24 16:52:21 2010
New Revision: 41759

URL: http://svn.erp5.org?rev=41759&view=rev
Log:
Make Contribution Tool accept non-conformant %-escaped URL (or unescaped URL)

This fixes TestWebCrawler.test_02_crawlWebSite

Modified:
    erp5/trunk/products/ERP5/Tool/ContributionTool.py
    erp5/trunk/products/ERP5Type/Utils.py

Modified: erp5/trunk/products/ERP5/Tool/ContributionTool.py
URL: http://svn.erp5.org/erp5/trunk/products/ERP5/Tool/ContributionTool.py?rev=41759&r1=41758&r2=41759&view=diff
==============================================================================
--- erp5/trunk/products/ERP5/Tool/ContributionTool.py [utf8] (original)
+++ erp5/trunk/products/ERP5/Tool/ContributionTool.py [utf8] Fri Dec 24 16:52:21 2010
@@ -40,6 +40,7 @@ from Products.ERP5Type.Globals import In
 from Products.CMFCore.utils import _checkPermission
 from Products.ERP5Type.Tool.BaseTool import BaseTool
 from Products.ERP5Type import Permissions
+from Products.ERP5Type.Utils import reencodeUrlEscapes
 from Products.ERP5 import _dtmldir
 from Products.ERP5.Document.Url import no_crawl_protocol_list
 from AccessControl import Unauthorized
@@ -661,10 +662,7 @@ class ContributionTool(BaseTool):
     return file_object, filename, content_type tuple
     """
     # Quote path part of url
-    url_tuple = urlparse.urlsplit(url)
-    quoted_path = urllib.quote(url_tuple[2])
-    url = urlparse.urlunsplit((url_tuple[0], url_tuple[1], quoted_path,
-                               url_tuple[3], url_tuple[4]))
+    url = reencodeUrlEscapes(url)
     # build a new file from the url
     url_file = urllib2.urlopen(urllib2.Request(url,
                                                headers={'Accept':'*/*'}))

Modified: erp5/trunk/products/ERP5Type/Utils.py
URL: http://svn.erp5.org/erp5/trunk/products/ERP5Type/Utils.py?rev=41759&r1=41758&r2=41759&view=diff
==============================================================================
--- erp5/trunk/products/ERP5Type/Utils.py [utf8] (original)
+++ erp5/trunk/products/ERP5Type/Utils.py [utf8] Fri Dec 24 16:52:21 2010
@@ -3304,3 +3304,28 @@ def guessEncodingFromText(data, content_
     raise NotImplementedError, 'No encoding detector found.'\
                                   ' You must install chardet and python-magic'
 
+_reencodeUrlEscapes_map = dict((chr(x), chr(x) in (# safe
+                                                   "!'()*-." "0123456789" "_~"
+                                                   "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
+                                                   "abcdefghijklmnopqrstuvwxyz"
+                                                   # reserved (maybe unsafe)
+                                                   "#$&+,/:;=?@[]")
+                                        and chr(x) or "%%%02X" % x)  # listed chars map to themselves, all others to '%XX' (uppercase hex)
+                               for x in xrange(256))  # lookup table with one entry per byte value 0..255
+def reencodeUrlEscapes(url):
+  """Fix a non-conformant %-escaped URL (or quote an unescaped one)
+
+  This is a Python reimplementation of the 'reencode_escapes' function of Wget 1.12
+  """
+  from string import hexdigits
+  next_part = iter(url.split('%')).next  # each call yields the next piece of text between '%' signs
+  url = [_reencodeUrlEscapes_map[c] for c in next_part()]  # text before the first '%' (whole URL if there is none)
+  try:
+    while True:
+      part = next_part()  # raises StopIteration once every piece has been consumed
+      url.append('%')  # put back the '%' that split() removed
+      if len(part) < 2 or not (part[0] in hexdigits and part[1] in hexdigits):
+        url.append('25')  # '%' not followed by two hex digits: emit '%25' so the '%' itself is escaped
+      url += [_reencodeUrlEscapes_map[c] for c in part]  # valid escapes pass through since hex digits map to themselves
+  except StopIteration:  # normal loop exit: all parts processed
+    return ''.join(url)



More information about the Erp5-report mailing list