1
0
mirror of https://github.com/scrapy/scrapy.git synced 2025-02-26 15:24:12 +00:00

Merge pull request #241 from zuhao/master

Fix url_has_any_extension bug
This commit is contained in:
Pablo Hoffman 2013-02-11 05:29:14 -08:00
commit 2df010a972
2 changed files with 10 additions and 2 deletions

View File

@@ -1,6 +1,7 @@
import unittest import unittest
from scrapy.spider import BaseSpider from scrapy.spider import BaseSpider
from scrapy.utils.url import url_is_from_any_domain, url_is_from_spider, canonicalize_url from scrapy.utils.url import url_is_from_any_domain, url_is_from_spider, canonicalize_url, url_has_any_extension
from scrapy.linkextractor import IGNORED_EXTENSIONS
__doctests__ = ['scrapy.utils.url'] __doctests__ = ['scrapy.utils.url']
@@ -157,6 +158,13 @@ class UrlUtilsTest(unittest.TestCase):
self.assertEqual(canonicalize_url("http://foo.com/AC%2FDC/"), self.assertEqual(canonicalize_url("http://foo.com/AC%2FDC/"),
"http://foo.com/AC%2FDC/") "http://foo.com/AC%2FDC/")
def test_url_has_any_extension(self):
self.assertTrue(url_has_any_extension("http://www.example.com/page.doc", IGNORED_EXTENSIONS))
self.assertTrue(url_has_any_extension("http://www.example.com/page.pdf", IGNORED_EXTENSIONS))
self.assertFalse(url_has_any_extension("http://www.example.com/page.htm", IGNORED_EXTENSIONS))
self.assertFalse(url_has_any_extension("http://www.example.com/", IGNORED_EXTENSIONS))
self.assertFalse(url_has_any_extension("http://www.example.com/page.doc.html", IGNORED_EXTENSIONS))
if __name__ == "__main__": if __name__ == "__main__":
unittest.main() unittest.main()

View File

@@ -28,7 +28,7 @@ def url_is_from_spider(url, spider):
getattr(spider, 'allowed_domains', [])) getattr(spider, 'allowed_domains', []))
def url_has_any_extension(url, extensions): def url_has_any_extension(url, extensions):
return posixpath.splitext(parse_url(url).path)[1].lower() in extensions return posixpath.splitext(parse_url(url).path)[1].replace('.', '').lower() in extensions
def canonicalize_url(url, keep_blank_values=True, keep_fragments=False, \ def canonicalize_url(url, keep_blank_values=True, keep_fragments=False, \
encoding=None): encoding=None):