mirror of https://github.com/scrapy/scrapy.git

moved some url utils from decobot to scrapy

--HG--
extra : convert_revision : svn%3Ab85faa78-f9eb-468e-a121-7cced6da292c%40184
olveyra 2008-08-27 17:37:32 +00:00
parent 9164150bed
commit 0e6562cb47


@@ -7,6 +7,7 @@ import re
 import urlparse
 import urllib
 import posixpath
+import cgi
 
 
 def url_is_from_any_domain(url, domains):
     """Return True if the url belongs to any of the given domains"""
@@ -68,7 +69,7 @@ def safe_download_url(url):
     to be within the document root.
     """
     safe_url = safe_url_string(url)
-    scheme, netloc, path, query, fragment = urlparse.urlsplit(safe_url)
+    scheme, netloc, path, query, _ = urlparse.urlsplit(safe_url)
     if path:
         path = _parent_dirs.sub('', posixpath.normpath(path))
         if url.endswith('/') and not path.endswith('/'):
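
The only functional change in this hunk renames the unused fragment slot to _. For reference, a quick sketch of the five-component result urlsplit unpacks to (the example URL is made up):

>>> import urlparse
>>> scheme, netloc, path, query, _ = urlparse.urlsplit('http://domain/a/b.html?id=1#frag')
>>> scheme, netloc, path, query
('http', 'domain', '/a/b.html', 'id=1')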
@@ -81,3 +82,78 @@ def safe_download_url(url):
 
 def is_url(text):
     return text.partition("://")[0] in ('file', 'http', 'https')
+
+def url_query_parameter(url, parameter, default=None, keep_blank_values=0):
+    """Return the value of the given query parameter in the url.
+
+    For example:
+
+    >>> url_query_parameter("product.html?id=200&foo=bar", "id")
+    '200'
+    >>> url_query_parameter("product.html?id=200&foo=bar", "notthere", "mydefault")
+    'mydefault'
+    >>> url_query_parameter("product.html?id=", "id")
+    >>> url_query_parameter("product.html?id=", "id", keep_blank_values=1)
+    ''
+    """
+    queryparams = cgi.parse_qs(urlparse.urlsplit(str(url))[3],
+                               keep_blank_values=keep_blank_values)
+    return queryparams.get(parameter, [default])[0]
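
Worth noting: cgi.parse_qs maps each parameter name to a list of values, which is why the lookup falls back to [default] and indexes [0]; for a repeated parameter only the first value is returned. A small sketch with a made-up URL:

>>> import cgi
>>> cgi.parse_qs('id=200&id=300')
{'id': ['200', '300']}
>>> url_query_parameter('product.html?id=200&id=300', 'id')
'200'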
+
+def url_query_cleaner(url, parameterlist=None, sep='&', kvsep='='):
+    """Return the given url, keeping only the query parameters listed in
+    parameterlist.
+
+    >>> url_query_cleaner("product.html?id=200&foo=bar&name=wired", 'id')
+    'product.html?id=200'
+    >>> url_query_cleaner("product.html?id=200&foo=bar&name=wired", ['id', 'name'])
+    'product.html?id=200&name=wired'
+    """
+    parameterlist = parameterlist or []
+    if not isinstance(parameterlist, (list, tuple)):
+        parameterlist = [parameterlist]
+    try:
+        base, query = url.split('?', 1)
+        parameters = [pair.split(kvsep, 1) for pair in query.split(sep)]
+    except ValueError:  # no query string in the url
+        base = url
+        query = ""
+        parameters = []
+    # drop duplicate parameters, keeping the first occurrence of each
+    unique = {}
+    querylist = []
+    for pair in parameters:
+        k = pair[0]
+        if not unique.get(k):
+            querylist += [pair]
+            unique[k] = 1
+    query = sep.join([kvsep.join(pair) for pair in querylist if pair[0] in parameterlist])
+    return '?'.join([base, query])
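
Two behaviours of this function worth calling out, sketched with a hypothetical URL: duplicate parameters are collapsed to their first occurrence, and the output keeps the url's own parameter order, not the order of parameterlist:

>>> url_query_cleaner('page.html?id=1&id=2&cat=7', ['cat', 'id'])
'page.html?id=1&cat=7'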
+
+def _has_querystring(url):
+    _, _, _, query, _ = urlparse.urlsplit(url)
+    return bool(query)
+
+def add_or_replace_parameter(url, name, new_value, sep='&'):
+    """
+    >>> url = 'http://domain/test'
+    >>> add_or_replace_parameter(url, 'arg', 'v')
+    'http://domain/test?arg=v'
+    >>> url = 'http://domain/test?arg1=v1&arg2=v2&arg3=v3'
+    >>> add_or_replace_parameter(url, 'arg4', 'v4')
+    'http://domain/test?arg1=v1&arg2=v2&arg3=v3&arg4=v4'
+    >>> add_or_replace_parameter(url, 'arg3', 'nv3')
+    'http://domain/test?arg1=v1&arg2=v2&arg3=nv3'
+    >>> url = 'http://domain/test?arg1=v1'
+    >>> add_or_replace_parameter(url, 'arg2', 'v2', sep=';')
+    'http://domain/test?arg1=v1;arg2=v2'
+    >>> add_or_replace_parameter("http://domain/moreInfo.asp?prodID=", 'prodID', '20')
+    'http://domain/moreInfo.asp?prodID=20'
+    """
+    parameter = url_query_parameter(url, name, keep_blank_values=1)
+    if parameter is None:
+        if _has_querystring(url):
+            next_url = url + sep + name + '=' + new_value
+        else:
+            next_url = url + '?' + name + '=' + new_value
+    else:
+        next_url = url.replace(name + '=' + parameter,
+                               name + '=' + new_value)
+    return next_url
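
A caveat to the replace-based branch, sketched with a hypothetical xarg parameter: str.replace substitutes every textual occurrence of name=value, so another parameter whose name ends in the same suffix and carries the same value is rewritten as well:

>>> add_or_replace_parameter('http://domain/test?xarg=v&arg=v', 'arg', 'nv')
'http://domain/test?xarg=nv&arg=nv'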