mirror of
https://github.com/scrapy/scrapy.git
synced 2025-02-25 12:03:58 +00:00
utils.url: add_or_replace_parameter function fixed, quoted urls support and test cases added
This commit is contained in:
parent
c205f7d8e5
commit
460f690c5c
@ -39,7 +39,7 @@ class UrlUtilsTest(unittest.TestCase):
|
||||
"http://www.example.com/test?p(29)url(http://www.another.net/page)")
|
||||
self.assertEqual(safe_url_string("http://www.example.com/Brochures_&_Paint_Cards&PageSize=200"),
|
||||
"http://www.example.com/Brochures_&_Paint_Cards&PageSize=200")
|
||||
|
||||
|
||||
def test_safe_download_url(self):
|
||||
self.assertEqual(safe_download_url('http://www.scrapy.org/../'),
|
||||
'http://www.scrapy.org/')
|
||||
@ -61,7 +61,7 @@ class UrlUtilsTest(unittest.TestCase):
|
||||
def test_url_query_parameter_2(self):
|
||||
"""
|
||||
This problem was seen several times in the feeds. Sometimes affiliate URLs contain
|
||||
nested encoded affiliate URL with direct URL as parameters. For example:
|
||||
nested encoded affiliate URL with direct URL as parameters. For example:
|
||||
aff_url1 = 'http://www.tkqlhce.com/click-2590032-10294381?url=http%3A%2F%2Fwww.argos.co.uk%2Fwebapp%2Fwcs%2Fstores%2Fservlet%2FArgosCreateReferral%3FstoreId%3D10001%26langId%3D-1%26referrer%3DCOJUN%26params%3Dadref%253DGarden+and+DIY-%3EGarden+furniture-%3EChildren%26%2339%3Bs+garden+furniture%26referredURL%3Dhttp%3A%2F%2Fwww.argos.co.uk%2Fwebapp%2Fwcs%2Fstores%2Fservlet%2FProductDisplay%253FstoreId%253D10001%2526catalogId%253D1500001501%2526productId%253D1500357023%2526langId%253D-1'
|
||||
the typical code to extract needed URL from it is:
|
||||
aff_url2 = url_query_parameter(aff_url1, 'url')
|
||||
@ -70,7 +70,7 @@ class UrlUtilsTest(unittest.TestCase):
|
||||
the direct URL extraction is
|
||||
url = url_query_parameter(aff_url2, 'referredURL')
|
||||
but this will not work, because aff_url2 contains ' (comma sign encoded in the feed)
|
||||
and the URL extraction will fail, current workaround was made in the spider,
|
||||
and the URL extraction will fail, current workaround was made in the spider,
|
||||
just a replace for ' to %27
|
||||
"""
|
||||
return # FIXME: this test should pass but currently doesnt
|
||||
@ -102,6 +102,12 @@ class UrlUtilsTest(unittest.TestCase):
|
||||
'http://domain/test?arg1=v1;arg2=v2')
|
||||
self.assertEqual(add_or_replace_parameter("http://domain/moreInfo.asp?prodID=", 'prodID', '20'),
|
||||
'http://domain/moreInfo.asp?prodID=20')
|
||||
url = 'http://rmc-offers.co.uk/productlist.asp?BCat=2%2C60&CatID=60'
|
||||
self.assertEqual(add_or_replace_parameter(url, 'BCat', 'newvalue', is_quoted=True),
|
||||
'http://rmc-offers.co.uk/productlist.asp?BCat=newvalue&CatID=60')
|
||||
url = 'http://rmc-offers.co.uk/productlist.asp?BCat=2,60&CatID=60'
|
||||
self.assertEqual(add_or_replace_parameter(url, 'BCat', 'newvalue'),
|
||||
'http://rmc-offers.co.uk/productlist.asp?BCat=newvalue&CatID=60')
|
||||
|
||||
def test_url_query_cleaner(self):
|
||||
self.assertEqual(url_query_cleaner("product.html?id=200&foo=bar&name=wired", 'id'),
|
||||
@ -126,7 +132,7 @@ class UrlUtilsTest(unittest.TestCase):
|
||||
"http://www.example.com/do?a=3&b=2&c=1")
|
||||
self.assertEqual(canonicalize_url("http://www.example.com/do?&a=1"),
|
||||
"http://www.example.com/do?a=1")
|
||||
|
||||
|
||||
# sorting by argument values
|
||||
self.assertEqual(canonicalize_url("http://www.example.com/do?c=3&b=5&b=2&a=50"),
|
||||
"http://www.example.com/do?a=50&b=2&b=5&c=3")
|
||||
@ -143,7 +149,7 @@ class UrlUtilsTest(unittest.TestCase):
|
||||
|
||||
self.assertEqual(canonicalize_url(u'http://www.example.com/do?1750,4'),
|
||||
'http://www.example.com/do?1750%2C4=')
|
||||
|
||||
|
||||
# spaces
|
||||
self.assertEqual(canonicalize_url("http://www.example.com/do?q=a space&a=1"),
|
||||
"http://www.example.com/do?a=1&q=a+space")
|
||||
|
@ -85,7 +85,8 @@ def is_url(text):
|
||||
def url_query_parameter(url, parameter, default=None, keep_blank_values=0):
    """Return the value of a query-string parameter of *url*.

    :param url: the URL (coerced to ``str``) whose query string is inspected
    :param parameter: name of the query parameter to look up
    :param default: value returned when the parameter is absent
    :param keep_blank_values: when true, a parameter present with an empty
        value (e.g. ``?a=``) is returned as ``''`` instead of *default*
    :return: the first value of the parameter, or *default* if missing

    If the parameter appears several times, only its first value is returned
    (``parse_qs`` keeps them all; we take index 0).
    """
    # urlsplit(...)[3] is the query component of the URL.
    # NOTE: use urlparse.parse_qs directly -- cgi.parse_qs is a deprecated
    # alias of it and was removed from the cgi module in Python 3.
    queryparams = urlparse.parse_qs(
        urlparse.urlsplit(str(url))[3],
        keep_blank_values=keep_blank_values,
    )
    return queryparams.get(parameter, [default])[0]
|
||||
|
||||
def url_query_cleaner(url, parameterlist=(), sep='&', kvsep='='):
|
||||
"""Clean url arguments leaving only those passed in the parameterlist"""
|
||||
@ -97,26 +98,28 @@ def url_query_cleaner(url, parameterlist=(), sep='&', kvsep='='):
|
||||
base = url
|
||||
query = ""
|
||||
parameters = []
|
||||
|
||||
|
||||
# unique parameters while keeping order
|
||||
unique = {}
|
||||
querylist = []
|
||||
for pair in parameters:
|
||||
for pair in parameters:
|
||||
k = pair[0]
|
||||
if not unique.get(k):
|
||||
querylist += [pair]
|
||||
unique[k] = 1
|
||||
|
||||
|
||||
query = sep.join([kvsep.join(pair) for pair in querylist if pair[0] in parameterlist])
|
||||
return '?'.join([base, query])
|
||||
|
||||
def add_or_replace_parameter(url, name, new_value, sep='&'):
|
||||
|
||||
def add_or_replace_parameter(url, name, new_value, sep='&', url_is_quoted=False):
|
||||
"""Add or remove a parameter to a given url"""
|
||||
def has_querystring(url):
|
||||
_, _, _, query, _ = urlparse.urlsplit(url)
|
||||
return bool(query)
|
||||
|
||||
parameter = url_query_parameter(url, name, keep_blank_values=1)
|
||||
if url_is_quoted:
|
||||
parameter = urllib.quote(parameter)
|
||||
if parameter is None:
|
||||
if has_querystring(url):
|
||||
next_url = url + sep + name + '=' + new_value
|
||||
|
Loading…
x
Reference in New Issue
Block a user