1
0
mirror of https://github.com/scrapy/scrapy.git synced 2025-02-24 16:03:49 +00:00

Merge pull request #128 from dangra/cannonicalize-missing-url-path

Handle a missing path in a URL as "/" (e.g. "http://www.example.com" is canonicalized to "http://www.example.com/").
This commit is contained in:
Daniel Graña 2012-05-03 10:27:11 -07:00
commit 43028876b5
2 changed files with 8 additions and 5 deletions

View File

@ -60,12 +60,15 @@ class UrlUtilsTest(unittest.TestCase):
def test_canonicalize_url(self): def test_canonicalize_url(self):
# simplest case # simplest case
self.assertEqual(canonicalize_url("http://www.example.com"), self.assertEqual(canonicalize_url("http://www.example.com/"),
"http://www.example.com") "http://www.example.com/")
# always return a str # always return a str
assert isinstance(canonicalize_url(u"http://www.example.com"), str) assert isinstance(canonicalize_url(u"http://www.example.com"), str)
# append missing path
self.assertEqual(canonicalize_url("http://www.example.com"),
"http://www.example.com/")
# typical usage # typical usage
self.assertEqual(canonicalize_url("http://www.example.com/do?a=1&b=2&c=3"), self.assertEqual(canonicalize_url("http://www.example.com/do?a=1&b=2&c=3"),
"http://www.example.com/do?a=1&b=2&c=3") "http://www.example.com/do?a=1&b=2&c=3")
@ -145,8 +148,8 @@ class UrlUtilsTest(unittest.TestCase):
'http://www.example.com/caf%E9-con-leche.htm') 'http://www.example.com/caf%E9-con-leche.htm')
# domains are case insensitive # domains are case insensitive
self.assertEqual(canonicalize_url("http://www.EXAMPLE.com"), self.assertEqual(canonicalize_url("http://www.EXAMPLE.com/"),
"http://www.example.com") "http://www.example.com/")
if __name__ == "__main__": if __name__ == "__main__":

View File

@ -52,7 +52,7 @@ def canonicalize_url(url, keep_blank_values=True, keep_fragments=False, \
keyvals = cgi.parse_qsl(query, keep_blank_values) keyvals = cgi.parse_qsl(query, keep_blank_values)
keyvals.sort() keyvals.sort()
query = urllib.urlencode(keyvals) query = urllib.urlencode(keyvals)
path = safe_url_string(urllib.unquote(path)) path = safe_url_string(urllib.unquote(path)) or '/'
fragment = '' if not keep_fragments else fragment fragment = '' if not keep_fragments else fragment
return urlparse.urlunparse((scheme, netloc.lower(), path, params, query, fragment)) return urlparse.urlunparse((scheme, netloc.lower(), path, params, query, fragment))