mirror of https://github.com/scrapy/scrapy.git

allow spider allowed_domains to be set/tuple, #259

This commit is contained in:
Steven Almeroth 2013-03-06 00:19:47 -06:00
parent 19d0942c74
commit b48ec1dce4
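The diff below fixes #259: url_is_from_spider concatenated [spider.name] directly with allowed_domains, and Python list concatenation raises TypeError when allowed_domains is declared as a set or tuple. Coercing the attribute with list() accepts all three forms. A minimal sketch of the failure and the fix (the spider name and domains here are made up for illustration):

    spider_name = 'example'
    allowed_domains = ('example.com', 'example.org')  # a tuple, as in #259

    # Before this commit: list + tuple raises TypeError
    try:
        domains = [spider_name] + allowed_domains
    except TypeError as exc:
        print(exc)  # can only concatenate list (not "tuple") to list

    # After this commit: list() coerces list, tuple, or set alike
    domains = [spider_name] + list(allowed_domains)
    print(domains)  # ['example', 'example.com', 'example.org']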

scrapy/utils/url.py

@@ -5,7 +5,6 @@ library.
Some of the functions that used to be imported from this module have been moved
to the w3lib.url module. Always import those from there instead.
"""
import urlparse
import urllib
import cgi
@@ -13,6 +12,7 @@ import cgi
from w3lib.url import *
from scrapy.utils.python import unicode_to_str
def url_is_from_any_domain(url, domains):
"""Return True if the url belongs to any of the given domains"""
host = parse_url(url).netloc
@@ -22,15 +22,18 @@ def url_is_from_any_domain(url, domains):
    else:
        return False
def url_is_from_spider(url, spider):
"""Return True if the url belongs to the given spider"""
return url_is_from_any_domain(url, [spider.name] + \
getattr(spider, 'allowed_domains', []))
return url_is_from_any_domain(url,
[spider.name] + list(getattr(spider, 'allowed_domains', [])))
def url_has_any_extension(url, extensions):
    return posixpath.splitext(parse_url(url).path)[1].lower() in extensions
-def canonicalize_url(url, keep_blank_values=True, keep_fragments=False, \
+def canonicalize_url(url, keep_blank_values=True, keep_fragments=False,
        encoding=None):
    """Canonicalize the given url by applying the following procedures:
@@ -70,6 +73,7 @@ def parse_url(url, encoding=None):
    return url if isinstance(url, urlparse.ParseResult) else \
        urlparse.urlparse(unicode_to_str(url, encoding))
def escape_ajax(url):
"""
Return the crawleable url according to:
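For context on the docstring cut off above: escape_ajax implements Google's AJAX crawling scheme, rewriting "hash-bang" URLs so the #! fragment travels as an _escaped_fragment_ query parameter that crawlers can fetch. A rough sketch of the expected behavior (the URLs are illustrative, not taken from this diff):

    from scrapy.utils.url import escape_ajax

    # Fragment starts with '!', so the URL is rewritten for crawlers
    print(escape_ajax('http://www.example.com/ajax.html#!key=value'))
    # -> http://www.example.com/ajax.html?_escaped_fragment_=key%3Dvalue

    # URLs without a '#!' fragment are returned unchanged
    print(escape_ajax('http://www.example.com/page.html'))
    # -> http://www.example.com/page.html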