1
0
mirror of https://github.com/scrapy/scrapy.git synced 2025-02-26 11:44:11 +00:00

Fix url_has_any_extension bug

This commit is contained in:
Zuhao Wan 2013-02-11 17:19:31 +08:00
parent 910effd145
commit 27ca25472f
2 changed files with 10 additions and 2 deletions

View File

@ -1,6 +1,7 @@
import unittest
from scrapy.spider import BaseSpider
from scrapy.utils.url import url_is_from_any_domain, url_is_from_spider, canonicalize_url
from scrapy.utils.url import url_is_from_any_domain, url_is_from_spider, canonicalize_url, url_has_any_extension
from scrapy.linkextractor import IGNORED_EXTENSIONS
__doctests__ = ['scrapy.utils.url']
@ -157,6 +158,13 @@ class UrlUtilsTest(unittest.TestCase):
self.assertEqual(canonicalize_url("http://foo.com/AC%2FDC/"),
"http://foo.com/AC%2FDC/")
def test_url_has_any_extension(self):
    """Check url_has_any_extension against IGNORED_EXTENSIONS.

    URLs ending in ``.doc`` or ``.pdf`` are expected to match, while
    ``.htm`` pages, extension-less paths, and URLs whose *final* suffix
    is ``.html`` (even when an earlier suffix is ``.doc``) are not.
    """
    matching_urls = (
        "http://www.example.com/page.doc",
        "http://www.example.com/page.pdf",
    )
    non_matching_urls = (
        "http://www.example.com/page.htm",
        "http://www.example.com/",
        "http://www.example.com/page.doc.html",
    )

    for candidate in matching_urls:
        self.assertTrue(url_has_any_extension(candidate, IGNORED_EXTENSIONS))

    for candidate in non_matching_urls:
        self.assertFalse(url_has_any_extension(candidate, IGNORED_EXTENSIONS))
# Run the unittest test runner when this module is executed as a script.
if __name__ == "__main__":
unittest.main()

View File

@ -28,7 +28,7 @@ def url_is_from_spider(url, spider):
getattr(spider, 'allowed_domains', []))
def url_has_any_extension(url, extensions):
    """Return True if the path of ``url`` ends with one of ``extensions``.

    Only the final suffix of the path is considered, as determined by
    ``posixpath.splitext`` (so ``/page.doc.html`` has the suffix
    ``.html``, not ``.doc``). The suffix is lower-cased before the
    comparison; entries in ``extensions`` are compared as given.

    ``extensions`` may list entries either without the leading dot
    (``'pdf'``) or with it (``'.pdf'``); both spellings are accepted so
    that callers written against either convention keep working.

    :param url: the URL to inspect; it is handed to ``parse_url`` and
        only the ``path`` attribute of the result is used.
    :param extensions: a container of extension strings supporting ``in``.
    :return: result of the membership test (a bool for ordinary containers).
    """
    suffix = posixpath.splitext(parse_url(url).path)[1].lower()
    # splitext yields '' or a suffix with a single leading dot; compare
    # both the dotted and the bare form against the caller's container.
    return suffix in extensions or suffix.lstrip('.') in extensions
def canonicalize_url(url, keep_blank_values=True, keep_fragments=False, \
encoding=None):