
Merge branch 'dangra/issue-24'

Daniel Graña 2013-01-30 16:55:53 -02:00
commit e5edb8ec3e
3 changed files with 30 additions and 1 deletion

View File

@@ -243,6 +243,22 @@ class SgmlLinkExtractorTestCase(unittest.TestCase):
                          [Link(url='http://example.org/foo', text=u'>\u4eac<\u4e1c',
                                fragment='', nofollow=False)])
 
+    def test_encoded_url(self):
+        body = """<html><body><div><a href="?page=2">BinB</a></body></html>"""
+        response = HtmlResponse("http://known.fm/AC%2FDC/", body=body, encoding='utf8')
+        lx = SgmlLinkExtractor()
+        self.assertEqual(lx.extract_links(response), [
+            Link(url='http://known.fm/AC%2FDC/?page=2', text=u'BinB', fragment='', nofollow=False),
+        ])
+
+    def test_encoded_url_in_restricted_xpath(self):
+        body = """<html><body><div><a href="?page=2">BinB</a></body></html>"""
+        response = HtmlResponse("http://known.fm/AC%2FDC/", body=body, encoding='utf8')
+        lx = SgmlLinkExtractor(restrict_xpaths="//div")
+        self.assertEqual(lx.extract_links(response), [
+            Link(url='http://known.fm/AC%2FDC/?page=2', text=u'BinB', fragment='', nofollow=False),
+        ])
+
     def test_deny_extensions(self):
         html = """<a href="page.html">asd</a> and <a href="photo.jpg">"""
         response = HtmlResponse("http://example.org/", body=html)
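
The two new tests cover the case this branch fixes: the response URL carries a percent-encoded slash in its path, and links built from the relative href "?page=2" must keep the %2F rather than let it decode into a real path separator. A minimal sketch of that concern, not taken from the commit, assuming only the Python 2 standard library used elsewhere in this diff (the variable names are illustrative):

    import urlparse  # Python 2 stdlib, matching the code in this commit

    base = "http://known.fm/AC%2FDC/"
    # Joining the relative "?page=2" must not decode %2F, otherwise the single
    # path segment "AC%2FDC" would turn into the two segments "AC/DC".
    joined = urlparse.urljoin(base, "?page=2")
    assert joined == "http://known.fm/AC%2FDC/?page=2"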

View File

@@ -151,6 +151,12 @@ class UrlUtilsTest(unittest.TestCase):
         self.assertEqual(canonicalize_url("http://www.EXAMPLE.com/"),
                                           "http://www.example.com/")
 
+        # quoted slash and question sign
+        self.assertEqual(canonicalize_url("http://foo.com/AC%2FDC+rocks%3f/?yeah=1"),
+                                          "http://foo.com/AC%2FDC+rocks%3F/?yeah=1")
+        self.assertEqual(canonicalize_url("http://foo.com/AC%2FDC/"),
+                                          "http://foo.com/AC%2FDC/")
+
 
 if __name__ == "__main__":
     unittest.main()

View File

@@ -52,10 +52,17 @@ def canonicalize_url(url, keep_blank_values=True, keep_fragments=False, \
     keyvals = cgi.parse_qsl(query, keep_blank_values)
     keyvals.sort()
     query = urllib.urlencode(keyvals)
-    path = safe_url_string(urllib.unquote(path)) or '/'
+    path = safe_url_string(_unquotepath(path)) or '/'
     fragment = '' if not keep_fragments else fragment
     return urlparse.urlunparse((scheme, netloc.lower(), path, params, query, fragment))
 
+
+def _unquotepath(path):
+    for reserved in ('2f', '2F', '3f', '3F'):
+        path = path.replace('%' + reserved, '%25' + reserved.upper())
+    return urllib.unquote(path)
+
+
 def parse_url(url, encoding=None):
     """Return urlparsed url from the given argument (which could be an already
     parsed url)
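
The _unquotepath helper above carries the actual fix: before unquoting the path it re-escapes the percent sign of the reserved %2F/%3F escapes (%2F becomes %252F), so that after urllib.unquote the encoded slash and question mark survive, uppercased, instead of being decoded into path and query delimiters. A small standalone sketch of that behaviour, assuming the same Python 2 urllib; the variable names are illustrative:

    import urllib  # Python 2 stdlib, as in the module above

    path = "AC%2FDC+rocks%3f"
    # Old behaviour: plain unquoting decodes the reserved characters.
    assert urllib.unquote(path) == "AC/DC+rocks?"
    # New behaviour: escape the '%' of %2f/%3f first, then unquote; the
    # reserved characters stay percent-encoded (and are uppercased).
    protected = path.replace("%2F", "%252F").replace("%3f", "%253F")
    assert urllib.unquote(protected) == "AC%2FDC+rocks%3F"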