Mirror of https://github.com/scrapy/scrapy.git, synced 2025-02-26 06:43:44 +00:00.
Merge branch 'dangra/issue-24'
This commit is contained in:
commit
e5edb8ec3e
@ -243,6 +243,22 @@ class SgmlLinkExtractorTestCase(unittest.TestCase):
|
||||
[Link(url='http://example.org/foo', text=u'>\u4eac<\u4e1c',
|
||||
fragment='', nofollow=False)])
|
||||
|
||||
def test_encoded_url(self):
    # A percent-encoded slash (%2F) in the base URL must survive when a
    # relative link is resolved against it.
    body = """<html><body><div><a href="?page=2">BinB</a></body></html>"""
    response = HtmlResponse("http://known.fm/AC%2FDC/", body=body, encoding='utf8')
    extractor = SgmlLinkExtractor()
    expected = [
        Link(url='http://known.fm/AC%2FDC/?page=2', text=u'BinB', fragment='', nofollow=False),
    ]
    self.assertEqual(extractor.extract_links(response), expected)
|
||||
|
||||
def test_encoded_url_in_restricted_xpath(self):
    # Same as test_encoded_url, but going through the restrict_xpaths
    # code path: the %2F in the base URL must still be preserved.
    body = """<html><body><div><a href="?page=2">BinB</a></body></html>"""
    response = HtmlResponse("http://known.fm/AC%2FDC/", body=body, encoding='utf8')
    extractor = SgmlLinkExtractor(restrict_xpaths="//div")
    expected = [
        Link(url='http://known.fm/AC%2FDC/?page=2', text=u'BinB', fragment='', nofollow=False),
    ]
    self.assertEqual(extractor.extract_links(response), expected)
|
||||
|
||||
def test_deny_extensions(self):
|
||||
html = """<a href="page.html">asd</a> and <a href="photo.jpg">"""
|
||||
response = HtmlResponse("http://example.org/", body=html)
|
||||
|
@ -151,6 +151,12 @@ class UrlUtilsTest(unittest.TestCase):
|
||||
self.assertEqual(canonicalize_url("http://www.EXAMPLE.com/"),
|
||||
"http://www.example.com/")
|
||||
|
||||
# quoted slash and question sign
|
||||
self.assertEqual(canonicalize_url("http://foo.com/AC%2FDC+rocks%3f/?yeah=1"),
|
||||
"http://foo.com/AC%2FDC+rocks%3F/?yeah=1")
|
||||
self.assertEqual(canonicalize_url("http://foo.com/AC%2FDC/"),
|
||||
"http://foo.com/AC%2FDC/")
|
||||
|
||||
|
||||
# Allow running this test module directly from the command line.
if __name__ == "__main__":
    unittest.main()
|
||||
|
@ -52,10 +52,17 @@ def canonicalize_url(url, keep_blank_values=True, keep_fragments=False, \
|
||||
keyvals = cgi.parse_qsl(query, keep_blank_values)
|
||||
keyvals.sort()
|
||||
query = urllib.urlencode(keyvals)
|
||||
path = safe_url_string(urllib.unquote(path)) or '/'
|
||||
path = safe_url_string(_unquotepath(path)) or '/'
|
||||
fragment = '' if not keep_fragments else fragment
|
||||
return urlparse.urlunparse((scheme, netloc.lower(), path, params, query, fragment))
|
||||
|
||||
|
||||
def _unquotepath(path):
    """Percent-decode *path* while keeping encoded slashes and question
    marks (%2F / %3F) escaped, since decoding them would change how the
    URL splits into path segments and query string.
    """
    # Double-escape the reserved escapes (normalising the hex digits to
    # upper case) so the final unquote leaves them percent-encoded.
    for escape in ('%2f', '%2F', '%3f', '%3F'):
        path = path.replace(escape, '%25' + escape[1:].upper())
    return urllib.unquote(path)
|
||||
|
||||
|
||||
def parse_url(url, encoding=None):
|
||||
"""Return urlparsed url from the given argument (which could be an already
|
||||
parsed url)
|
||||
|
Loading…
x
Reference in New Issue
Block a user