Fix url_has_any_extension bug
parent 910effd145
commit 27ca25472f

@@ -1,6 +1,7 @@
 import unittest
 from scrapy.spider import BaseSpider
-from scrapy.utils.url import url_is_from_any_domain, url_is_from_spider, canonicalize_url
+from scrapy.utils.url import url_is_from_any_domain, url_is_from_spider, canonicalize_url, url_has_any_extension
+from scrapy.linkextractor import IGNORED_EXTENSIONS
 
 __doctests__ = ['scrapy.utils.url']
 
@@ -157,6 +158,13 @@ class UrlUtilsTest(unittest.TestCase):
         self.assertEqual(canonicalize_url("http://foo.com/AC%2FDC/"),
                          "http://foo.com/AC%2FDC/")
 
+    def test_url_has_any_extension(self):
+        self.assertTrue(url_has_any_extension("http://www.example.com/page.doc", IGNORED_EXTENSIONS))
+        self.assertTrue(url_has_any_extension("http://www.example.com/page.pdf", IGNORED_EXTENSIONS))
+        self.assertFalse(url_has_any_extension("http://www.example.com/page.htm", IGNORED_EXTENSIONS))
+        self.assertFalse(url_has_any_extension("http://www.example.com/", IGNORED_EXTENSIONS))
+        self.assertFalse(url_has_any_extension("http://www.example.com/page.doc.html", IGNORED_EXTENSIONS))
+
 
 if __name__ == "__main__":
     unittest.main()
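
The added test pins down the intended behaviour, including its least obvious case: only the final extension of the path is considered, so "page.doc.html" splits to ".html" and is not treated as an ignored document. A minimal sketch of the same checks outside the test class, assuming an old-style Scrapy tree where scrapy.linkextractor and scrapy.utils.url expose the names imported above:

    from scrapy.linkextractor import IGNORED_EXTENSIONS
    from scrapy.utils.url import url_has_any_extension

    # Paths ending in an ignored extension such as .doc or .pdf match.
    assert url_has_any_extension("http://www.example.com/page.pdf", IGNORED_EXTENSIONS)
    # Only the last extension counts: posixpath.splitext("page.doc.html")
    # yields ".html", which is not an ignored extension.
    assert not url_has_any_extension("http://www.example.com/page.doc.html", IGNORED_EXTENSIONS)
    # URLs with no extension at all never match.
    assert not url_has_any_extension("http://www.example.com/", IGNORED_EXTENSIONS)
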
@@ -28,7 +28,7 @@ def url_is_from_spider(url, spider):
         getattr(spider, 'allowed_domains', []))
 
 def url_has_any_extension(url, extensions):
-    return posixpath.splitext(parse_url(url).path)[1].lower() in extensions
+    return posixpath.splitext(parse_url(url).path)[1].replace('.', '').lower() in extensions
 
 def canonicalize_url(url, keep_blank_values=True, keep_fragments=False, \
         encoding=None):
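
The one-line change above is the actual fix: posixpath.splitext() returns the extension with its leading dot (e.g. '.pdf'), while the replace('.', '') implies the entries of IGNORED_EXTENSIONS are bare names without the dot, so the old membership test could never match. A rough standalone sketch of old versus new behaviour, using a hypothetical two-entry list and a plain path in place of Scrapy's parse_url() call:

    import posixpath

    # Stand-in for scrapy.linkextractor.IGNORED_EXTENSIONS: bare names, no dot.
    EXTENSIONS = ['doc', 'pdf']
    path = "/files/report.PDF"

    # Old check: '.pdf' (dot kept) is never found among dot-less entries.
    old_match = posixpath.splitext(path)[1].lower() in EXTENSIONS
    # Fixed check: strip the dot and lowercase before the membership test.
    new_match = posixpath.splitext(path)[1].replace('.', '').lower() in EXTENSIONS

    assert not old_match
    assert new_match
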