mirror of
https://github.com/scrapy/scrapy.git
synced 2025-02-22 01:33:17 +00:00
moved some url utils from decobot to scrapy
--HG-- extra : convert_revision : svn%3Ab85faa78-f9eb-468e-a121-7cced6da292c%40184
This commit is contained in:
parent
9164150bed
commit
0e6562cb47
@ -7,6 +7,7 @@ import re
|
||||
import urlparse
|
||||
import urllib
|
||||
import posixpath
|
||||
import cgi
|
||||
|
||||
def url_is_from_any_domain(url, domains):
|
||||
"""Return True if the url belongs to the given domain"""
|
||||
@ -68,7 +69,7 @@ def safe_download_url(url):
|
||||
to be within the document root.
|
||||
"""
|
||||
safe_url = safe_url_string(url)
|
||||
scheme, netloc, path, query, fragment = urlparse.urlsplit(safe_url)
|
||||
scheme, netloc, path, query, _ = urlparse.urlsplit(safe_url)
|
||||
if path:
|
||||
path = _parent_dirs.sub('', posixpath.normpath(path))
|
||||
if url.endswith('/') and not path.endswith('/'):
|
||||
@ -81,3 +82,78 @@ def safe_download_url(url):
|
||||
def is_url(text):
    """Return True if *text* looks like a URL with a supported scheme.

    Only ``file``, ``http`` and ``https`` schemes are accepted.
    """
    # Require the "://" separator to actually be present: partition()
    # returns the whole string as [0] when the separator is absent, so the
    # old check wrongly accepted bare words such as "http".
    scheme, sep, _ = text.partition("://")
    return sep == "://" and scheme in ('file', 'http', 'https')
|
||||
|
||||
def url_query_parameter(url, parameter, default=None, keep_blank_values=0):
    """ Return the value of the given query parameter of *url*, or
    *default* when the parameter is missing. Blank values ("id=") are
    treated as missing unless keep_blank_values is true.

    >>> url_query_parameter("product.html?id=200&foo=bar", "id")
    '200'
    >>> url_query_parameter("product.html?id=200&foo=bar", "notthere", "mydefault")
    'mydefault'
    >>> url_query_parameter("product.html?id=", "id")
    >>> url_query_parameter("product.html?id=", "id", keep_blank_values=1)
    ''
    """
    # urlsplit() index 3 is the query string of the url.
    query = urlparse.urlsplit(str(url))[3]
    params = cgi.parse_qs(query, keep_blank_values=keep_blank_values)
    if parameter in params:
        # parse_qs maps each key to a list of values; take the first.
        return params[parameter][0]
    return default
|
||||
|
||||
def url_query_cleaner(url, parameterlist=None, sep='&', kvsep='='):
    """ Return the given url keeping only the query parameters named in
    *parameterlist* (a single name or a list/tuple of names). Duplicate
    parameters keep their first occurrence only.

    >>> url_query_cleaner("product.html?id=200&foo=bar&name=wired", 'id')
    'product.html?id=200'
    >>> url_query_cleaner("product.html?id=200&foo=bar&name=wired", ['id', 'name'])
    'product.html?id=200&name=wired'
    """
    parameterlist = parameterlist or []
    if not isinstance(parameterlist, (list, tuple)):
        parameterlist = [parameterlist]

    try:
        base, query = url.split('?', 1)
        parameters = [pair.split(kvsep, 1) for pair in query.split(sep)]
    except ValueError:
        # No '?' in the url: the 2-tuple unpack above raised. Only catch
        # that specific failure -- a bare except also swallowed
        # KeyboardInterrupt/SystemExit and hid real bugs.
        base = url
        parameters = []

    # Drop duplicate parameters, keeping the first occurrence of each key.
    seen = {}
    querylist = []
    for pair in parameters:
        key = pair[0]
        if key not in seen:
            querylist.append(pair)
            seen[key] = 1

    query = sep.join(kvsep.join(pair) for pair in querylist
                     if pair[0] in parameterlist)
    return '?'.join([base, query])
|
||||
|
||||
def _has_querystring(url):
    """Return True when *url* carries a non-empty query string."""
    # urlsplit() -> (scheme, netloc, path, query, fragment); index 3 is
    # the query component.
    return bool(urlparse.urlsplit(url)[3])
|
||||
|
||||
def add_or_replace_parameter(url, name, new_value, sep='&'):
    """Add the query parameter *name* with value *new_value* to *url*,
    replacing the current value when the parameter is already present.

    >>> url = 'http://domain/test'
    >>> add_or_replace_parameter(url, 'arg', 'v')
    'http://domain/test?arg=v'
    >>> url = 'http://domain/test?arg1=v1&arg2=v2&arg3=v3'
    >>> add_or_replace_parameter(url, 'arg4', 'v4')
    'http://domain/test?arg1=v1&arg2=v2&arg3=v3&arg4=v4'
    >>> add_or_replace_parameter(url, 'arg3', 'nv3')
    'http://domain/test?arg1=v1&arg2=v2&arg3=nv3'
    >>> url = 'http://domain/test?arg1=v1'
    >>> add_or_replace_parameter(url, 'arg2', 'v2', sep=';')
    'http://domain/test?arg1=v1;arg2=v2'
    >>> add_or_replace_parameter("http://domain/moreInfo.asp?prodID=", 'prodID', '20')
    'http://domain/moreInfo.asp?prodID=20'
    """
    parameter = url_query_parameter(url, name, keep_blank_values=1)
    if parameter is None:
        # Parameter absent: append it after '?' or the separator.
        if _has_querystring(url):
            next_url = url + sep + name + '=' + new_value
        else:
            next_url = url + '?' + name + '=' + new_value
    else:
        # Rebuild the query pair by pair instead of the old blind
        # url.replace(name+'='+parameter, ...), which rewrote every
        # matching substring -- including other parameters whose name
        # merely ends with *name* (e.g. 'xarg=v' when replacing 'arg').
        base, _, rest = url.partition('?')
        query, hashmark, fragment = rest.partition('#')
        pairs = [name + '=' + new_value if p.split('=', 1)[0] == name else p
                 for p in query.split(sep)]
        next_url = base + '?' + sep.join(pairs) + hashmark + fragment
    return next_url
|
||||
|
Loading…
x
Reference in New Issue
Block a user