
Merge branch 'dangra/issue-24'

Daniel Graña 2013-01-30 16:55:53 -02:00
commit e5edb8ec3e
3 changed files with 30 additions and 1 deletion

View File

@@ -243,6 +243,22 @@ class SgmlLinkExtractorTestCase(unittest.TestCase):
                          [Link(url='http://example.org/foo', text=u'>\u4eac<\u4e1c',
                                fragment='', nofollow=False)])
 
+    def test_encoded_url(self):
+        body = """<html><body><div><a href="?page=2">BinB</a></body></html>"""
+        response = HtmlResponse("http://known.fm/AC%2FDC/", body=body, encoding='utf8')
+        lx = SgmlLinkExtractor()
+        self.assertEqual(lx.extract_links(response), [
+            Link(url='http://known.fm/AC%2FDC/?page=2', text=u'BinB', fragment='', nofollow=False),
+        ])
+
+    def test_encoded_url_in_restricted_xpath(self):
+        body = """<html><body><div><a href="?page=2">BinB</a></body></html>"""
+        response = HtmlResponse("http://known.fm/AC%2FDC/", body=body, encoding='utf8')
+        lx = SgmlLinkExtractor(restrict_xpaths="//div")
+        self.assertEqual(lx.extract_links(response), [
+            Link(url='http://known.fm/AC%2FDC/?page=2', text=u'BinB', fragment='', nofollow=False),
+        ])
+
     def test_deny_extensions(self):
         html = """<a href="page.html">asd</a> and <a href="photo.jpg">"""
         response = HtmlResponse("http://example.org/", body=html)
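
The two new tests cover the case this branch fixes: the response URL carries a percent-encoded slash in its path, and links built from the relative href "?page=2" must keep the %2F rather than let it decode into a real path separator. A minimal sketch of that concern, not taken from the commit, assuming only the Python 2 standard library used elsewhere in this diff (the variable names are illustrative):

    import urlparse  # Python 2 stdlib, matching the code in this commit

    base = "http://known.fm/AC%2FDC/"
    # Joining the relative "?page=2" must not decode %2F, otherwise the single
    # path segment "AC%2FDC" would turn into the two segments "AC/DC".
    joined = urlparse.urljoin(base, "?page=2")
    assert joined == "http://known.fm/AC%2FDC/?page=2"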

View File

@@ -151,6 +151,12 @@ class UrlUtilsTest(unittest.TestCase):
         self.assertEqual(canonicalize_url("http://www.EXAMPLE.com/"),
                                           "http://www.example.com/")
 
+        # quoted slash and question sign
+        self.assertEqual(canonicalize_url("http://foo.com/AC%2FDC+rocks%3f/?yeah=1"),
+                                          "http://foo.com/AC%2FDC+rocks%3F/?yeah=1")
+        self.assertEqual(canonicalize_url("http://foo.com/AC%2FDC/"),
+                                          "http://foo.com/AC%2FDC/")
+
 
 if __name__ == "__main__":
     unittest.main()

View File

@@ -52,10 +52,17 @@ def canonicalize_url(url, keep_blank_values=True, keep_fragments=False, \
     keyvals = cgi.parse_qsl(query, keep_blank_values)
     keyvals.sort()
     query = urllib.urlencode(keyvals)
-    path = safe_url_string(urllib.unquote(path)) or '/'
+    path = safe_url_string(_unquotepath(path)) or '/'
     fragment = '' if not keep_fragments else fragment
     return urlparse.urlunparse((scheme, netloc.lower(), path, params, query, fragment))
 
+
+def _unquotepath(path):
+    for reserved in ('2f', '2F', '3f', '3F'):
+        path = path.replace('%' + reserved, '%25' + reserved.upper())
+    return urllib.unquote(path)
+
+
 def parse_url(url, encoding=None):
     """Return urlparsed url from the given argument (which could be an already
     parsed url)
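
The _unquotepath helper above carries the actual fix: before unquoting the path it re-escapes the percent sign of the reserved %2F/%3F escapes (%2F becomes %252F), so that after urllib.unquote the encoded slash and question mark survive, uppercased, instead of being decoded into path and query delimiters. A small standalone sketch of that behaviour, assuming the same Python 2 urllib; the variable names are illustrative:

    import urllib  # Python 2 stdlib, as in the module above

    path = "AC%2FDC+rocks%3f"
    # Old behaviour: plain unquoting decodes the reserved characters.
    assert urllib.unquote(path) == "AC/DC+rocks?"
    # New behaviour: escape the '%' of %2f/%3f first, then unquote; the
    # reserved characters stay percent-encoded (and are uppercased).
    protected = path.replace("%2F", "%252F").replace("%3f", "%253F")
    assert urllib.unquote(protected) == "AC%2FDC+rocks%3F"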