diff --git a/scrapy/tests/test_utils_url.py b/scrapy/tests/test_utils_url.py index 1c4d95445..ffd377444 100644 --- a/scrapy/tests/test_utils_url.py +++ b/scrapy/tests/test_utils_url.py @@ -39,7 +39,7 @@ class UrlUtilsTest(unittest.TestCase): "http://www.example.com/test?p(29)url(http://www.another.net/page)") self.assertEqual(safe_url_string("http://www.example.com/Brochures_&_Paint_Cards&PageSize=200"), "http://www.example.com/Brochures_&_Paint_Cards&PageSize=200") - + def test_safe_download_url(self): self.assertEqual(safe_download_url('http://www.scrapy.org/../'), 'http://www.scrapy.org/') @@ -61,7 +61,7 @@ class UrlUtilsTest(unittest.TestCase): def test_url_query_parameter_2(self): """ This problem was seen several times in the feeds. Sometime affiliate URLs contains - nested encoded affiliate URL with direct URL as parameters. For example: + nested encoded affiliate URL with direct URL as parameters. For example: aff_url1 = 'http://www.tkqlhce.com/click-2590032-10294381?url=http%3A%2F%2Fwww.argos.co.uk%2Fwebapp%2Fwcs%2Fstores%2Fservlet%2FArgosCreateReferral%3FstoreId%3D10001%26langId%3D-1%26referrer%3DCOJUN%26params%3Dadref%253DGarden+and+DIY-%3EGarden+furniture-%3EChildren%26%2339%3Bs+garden+furniture%26referredURL%3Dhttp%3A%2F%2Fwww.argos.co.uk%2Fwebapp%2Fwcs%2Fstores%2Fservlet%2FProductDisplay%253FstoreId%253D10001%2526catalogId%253D1500001501%2526productId%253D1500357023%2526langId%253D-1' the typical code to extract needed URL from it is: aff_url2 = url_query_parameter(aff_url1, 'url') @@ -70,7 +70,7 @@ class UrlUtilsTest(unittest.TestCase): the direct URL extraction is url = url_query_parameter(aff_url2, 'referredURL') but this will not work, because aff_url2 contains ' (comma sign encoded in the feed) - and the URL extraction will fail, current workaround was made in the spider, + and the URL extraction will fail, current workaround was made in the spider, just a replace for ' to %27 """ return # FIXME: this test should pass but currently doesnt @@ 
-102,6 +102,12 @@ class UrlUtilsTest(unittest.TestCase): 'http://domain/test?arg1=v1;arg2=v2') self.assertEqual(add_or_replace_parameter("http://domain/moreInfo.asp?prodID=", 'prodID', '20'), 'http://domain/moreInfo.asp?prodID=20') + url = 'http://rmc-offers.co.uk/productlist.asp?BCat=2%2C60&CatID=60' + self.assertEqual(add_or_replace_parameter(url, 'BCat', 'newvalue', url_is_quoted=True), + 'http://rmc-offers.co.uk/productlist.asp?BCat=newvalue&CatID=60') + url = 'http://rmc-offers.co.uk/productlist.asp?BCat=2,60&CatID=60' + self.assertEqual(add_or_replace_parameter(url, 'BCat', 'newvalue'), + 'http://rmc-offers.co.uk/productlist.asp?BCat=newvalue&CatID=60') def test_url_query_cleaner(self): self.assertEqual(url_query_cleaner("product.html?id=200&foo=bar&name=wired", 'id'), @@ -126,7 +132,7 @@ class UrlUtilsTest(unittest.TestCase): "http://www.example.com/do?a=3&b=2&c=1") self.assertEqual(canonicalize_url("http://www.example.com/do?&a=1"), "http://www.example.com/do?a=1") - + # sorting by argument values self.assertEqual(canonicalize_url("http://www.example.com/do?c=3&b=5&b=2&a=50"), "http://www.example.com/do?a=50&b=2&b=5&c=3") @@ -143,7 +149,7 @@ class UrlUtilsTest(unittest.TestCase): self.assertEqual(canonicalize_url(u'http://www.example.com/do?1750,4'), 'http://www.example.com/do?1750%2C4=') - + # spaces self.assertEqual(canonicalize_url("http://www.example.com/do?q=a space&a=1"), "http://www.example.com/do?a=1&q=a+space") diff --git a/scrapy/utils/url.py b/scrapy/utils/url.py index 62f02af83..938c1097f 100644 --- a/scrapy/utils/url.py +++ b/scrapy/utils/url.py @@ -85,7 +85,8 @@ def is_url(text): def url_query_parameter(url, parameter, default=None, keep_blank_values=0): """Return the value of a url parameter, given the url and parameter name""" queryparams = cgi.parse_qs(urlparse.urlsplit(str(url))[3], keep_blank_values=keep_blank_values) - return queryparams.get(parameter, [default])[0] + result = queryparams.get(parameter, [default])[0] + return result def 
url_query_cleaner(url, parameterlist=(), sep='&', kvsep='='): """Clean url arguments leaving only those passed in the parameterlist""" @@ -97,26 +98,28 @@ def url_query_cleaner(url, parameterlist=(), sep='&', kvsep='='): base = url query = "" parameters = [] - + # unique parameters while keeping order unique = {} querylist = [] - for pair in parameters: + for pair in parameters: k = pair[0] if not unique.get(k): querylist += [pair] unique[k] = 1 - + query = sep.join([kvsep.join(pair) for pair in querylist if pair[0] in parameterlist]) return '?'.join([base, query]) - -def add_or_replace_parameter(url, name, new_value, sep='&'): + +def add_or_replace_parameter(url, name, new_value, sep='&', url_is_quoted=False): """Add or remove a parameter to a given url""" def has_querystring(url): _, _, _, query, _ = urlparse.urlsplit(url) return bool(query) parameter = url_query_parameter(url, name, keep_blank_values=1) + if url_is_quoted and parameter is not None: + parameter = urllib.quote(parameter) if parameter is None: if has_querystring(url): next_url = url + sep + name + '=' + new_value