mirror of
https://github.com/scrapy/scrapy.git
synced 2025-02-26 15:24:12 +00:00
Merge pull request #241 from zuhao/master
Fix url_has_any_extension bug
This commit is contained in:
commit
2df010a972
@ -1,6 +1,7 @@
|
|||||||
import unittest
|
import unittest
|
||||||
from scrapy.spider import BaseSpider
|
from scrapy.spider import BaseSpider
|
||||||
from scrapy.utils.url import url_is_from_any_domain, url_is_from_spider, canonicalize_url
|
from scrapy.utils.url import url_is_from_any_domain, url_is_from_spider, canonicalize_url, url_has_any_extension
|
||||||
|
from scrapy.linkextractor import IGNORED_EXTENSIONS
|
||||||
|
|
||||||
__doctests__ = ['scrapy.utils.url']
|
__doctests__ = ['scrapy.utils.url']
|
||||||
|
|
||||||
@ -157,6 +158,13 @@ class UrlUtilsTest(unittest.TestCase):
|
|||||||
self.assertEqual(canonicalize_url("http://foo.com/AC%2FDC/"),
|
self.assertEqual(canonicalize_url("http://foo.com/AC%2FDC/"),
|
||||||
"http://foo.com/AC%2FDC/")
|
"http://foo.com/AC%2FDC/")
|
||||||
|
|
||||||
|
def test_url_has_any_extension(self):
|
||||||
|
self.assertTrue(url_has_any_extension("http://www.example.com/page.doc", IGNORED_EXTENSIONS))
|
||||||
|
self.assertTrue(url_has_any_extension("http://www.example.com/page.pdf", IGNORED_EXTENSIONS))
|
||||||
|
self.assertFalse(url_has_any_extension("http://www.example.com/page.htm", IGNORED_EXTENSIONS))
|
||||||
|
self.assertFalse(url_has_any_extension("http://www.example.com/", IGNORED_EXTENSIONS))
|
||||||
|
self.assertFalse(url_has_any_extension("http://www.example.com/page.doc.html", IGNORED_EXTENSIONS))
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
unittest.main()
|
unittest.main()
|
||||||
|
@ -28,7 +28,7 @@ def url_is_from_spider(url, spider):
|
|||||||
getattr(spider, 'allowed_domains', []))
|
getattr(spider, 'allowed_domains', []))
|
||||||
|
|
||||||
def url_has_any_extension(url, extensions):
|
def url_has_any_extension(url, extensions):
|
||||||
return posixpath.splitext(parse_url(url).path)[1].lower() in extensions
|
return posixpath.splitext(parse_url(url).path)[1].replace('.', '').lower() in extensions
|
||||||
|
|
||||||
def canonicalize_url(url, keep_blank_values=True, keep_fragments=False, \
|
def canonicalize_url(url, keep_blank_values=True, keep_fragments=False, \
|
||||||
encoding=None):
|
encoding=None):
|
||||||
|
Loading…
x
Reference in New Issue
Block a user