Mirror of https://github.com/scrapy/scrapy.git, synced 2025-02-27 18:44:07 +00:00
allow spider allowed_domains to be set/tuple, #259

commit b48ec1dce4
parent 19d0942c74
scrapy/utils/url.py:

@@ -5,7 +5,6 @@ library.
 Some of the functions that used to be imported from this module have been moved
 to the w3lib.url module. Always import those from there instead.
 """
-
 import urlparse
 import urllib
 import cgi
@@ -13,6 +12,7 @@ import cgi
 from w3lib.url import *
 from scrapy.utils.python import unicode_to_str
 
+
 def url_is_from_any_domain(url, domains):
     """Return True if the url belongs to any of the given domains"""
     host = parse_url(url).netloc
@@ -22,15 +22,18 @@ def url_is_from_any_domain(url, domains):
     else:
         return False
 
 
 def url_is_from_spider(url, spider):
     """Return True if the url belongs to the given spider"""
-    return url_is_from_any_domain(url, [spider.name] + \
-        getattr(spider, 'allowed_domains', []))
+    return url_is_from_any_domain(url,
+        [spider.name] + list(getattr(spider, 'allowed_domains', [])))
+
 
 def url_has_any_extension(url, extensions):
     return posixpath.splitext(parse_url(url).path)[1].lower() in extensions
 
-def canonicalize_url(url, keep_blank_values=True, keep_fragments=False, \
+
+def canonicalize_url(url, keep_blank_values=True, keep_fragments=False,
         encoding=None):
     """Canonicalize the given url by applying the following procedures:
 
@@ -70,6 +73,7 @@ def parse_url(url, encoding=None):
     return url if isinstance(url, urlparse.ParseResult) else \
         urlparse.urlparse(unicode_to_str(url, encoding))
 
+
 def escape_ajax(url):
     """
     Return the crawleable url according to:
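For context (not part of the commit itself): the list() call is what makes a set or tuple work here, because Python refuses to concatenate a list with any other sequence type. A minimal sketch of the before/after behavior, using a hypothetical spider class:

    class MySpider(object):
        name = 'example.com'
        allowed_domains = ('sub.example.com', 'example.org')  # tuple, now allowed

    spider = MySpider()

    # Before this commit:
    # [spider.name] + spider.allowed_domains
    # TypeError: can only concatenate list (not "tuple") to list

    # After: the attribute is normalized to a list before concatenating
    domains = [spider.name] + list(getattr(spider, 'allowed_domains', []))
    # domains == ['example.com', 'sub.example.com', 'example.org']

list() leaves the [] fallback and any existing list semantically untouched, so spiders that already declare allowed_domains as a list behave exactly as before.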