mirror of
https://github.com/scrapy/scrapy.git
synced 2025-02-27 21:04:12 +00:00
allow spider allowed_domains to be set/tuple, #259
This commit is contained in:
parent
19d0942c74
commit
b48ec1dce4
@ -5,7 +5,6 @@ library.
|
|||||||
Some of the functions that used to be imported from this module have been moved
|
Some of the functions that used to be imported from this module have been moved
|
||||||
to the w3lib.url module. Always import those from there instead.
|
to the w3lib.url module. Always import those from there instead.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
import urlparse
|
import urlparse
|
||||||
import urllib
|
import urllib
|
||||||
import cgi
|
import cgi
|
||||||
@ -13,6 +12,7 @@ import cgi
|
|||||||
from w3lib.url import *
|
from w3lib.url import *
|
||||||
from scrapy.utils.python import unicode_to_str
|
from scrapy.utils.python import unicode_to_str
|
||||||
|
|
||||||
|
|
||||||
def url_is_from_any_domain(url, domains):
|
def url_is_from_any_domain(url, domains):
|
||||||
"""Return True if the url belongs to any of the given domains"""
|
"""Return True if the url belongs to any of the given domains"""
|
||||||
host = parse_url(url).netloc
|
host = parse_url(url).netloc
|
||||||
@ -22,15 +22,18 @@ def url_is_from_any_domain(url, domains):
|
|||||||
else:
|
else:
|
||||||
return False
|
return False
|
||||||
|
|
||||||
|
|
||||||
def url_is_from_spider(url, spider):
    """Return True if the url belongs to the given spider.

    A url matches if its host is the spider's name or any entry in the
    spider's ``allowed_domains`` attribute (or a subdomain of one).

    ``allowed_domains`` may be any iterable (list, tuple or set) — it is
    normalized with ``list()`` before concatenation, since adding a tuple
    or set directly to ``[spider.name]`` would raise TypeError (#259).
    A spider without ``allowed_domains`` defaults to the empty list.
    """
    return url_is_from_any_domain(url,
        [spider.name] + list(getattr(spider, 'allowed_domains', [])))
|
|
||||||
|
|
||||||
def url_has_any_extension(url, extensions):
    """Return True if the url path's file extension is among *extensions*.

    The extension is taken from the path component of the parsed url
    (query string and fragment excluded) and lowercased before the
    membership test, so *extensions* should contain lowercase entries
    such as ``'.html'``.
    """
    path = parse_url(url).path
    extension = posixpath.splitext(path)[1]
    return extension.lower() in extensions
|
|
||||||
def canonicalize_url(url, keep_blank_values=True, keep_fragments=False, \
|
|
||||||
|
def canonicalize_url(url, keep_blank_values=True, keep_fragments=False,
|
||||||
encoding=None):
|
encoding=None):
|
||||||
"""Canonicalize the given url by applying the following procedures:
|
"""Canonicalize the given url by applying the following procedures:
|
||||||
|
|
||||||
@ -70,6 +73,7 @@ def parse_url(url, encoding=None):
|
|||||||
return url if isinstance(url, urlparse.ParseResult) else \
|
return url if isinstance(url, urlparse.ParseResult) else \
|
||||||
urlparse.urlparse(unicode_to_str(url, encoding))
|
urlparse.urlparse(unicode_to_str(url, encoding))
|
||||||
|
|
||||||
|
|
||||||
def escape_ajax(url):
|
def escape_ajax(url):
|
||||||
"""
|
"""
|
||||||
Return the crawlable url according to:
|
Return the crawlable url according to:
|
||||||
|
Loading…
x
Reference in New Issue
Block a user