1
0
mirror of https://github.com/scrapy/scrapy.git synced 2025-02-27 21:04:12 +00:00

allow spider allowed_domains to be set/tuple, #259

This commit is contained in:
Steven Almeroth 2013-03-06 00:19:47 -06:00
parent 19d0942c74
commit b48ec1dce4

View File

@@ -5,7 +5,6 @@ library.
Some of the functions that used to be imported from this module have been moved Some of the functions that used to be imported from this module have been moved
to the w3lib.url module. Always import those from there instead. to the w3lib.url module. Always import those from there instead.
""" """
import urlparse import urlparse
import urllib import urllib
import cgi import cgi
@@ -13,6 +12,7 @@ import cgi
from w3lib.url import * from w3lib.url import *
from scrapy.utils.python import unicode_to_str from scrapy.utils.python import unicode_to_str
def url_is_from_any_domain(url, domains): def url_is_from_any_domain(url, domains):
"""Return True if the url belongs to any of the given domains""" """Return True if the url belongs to any of the given domains"""
host = parse_url(url).netloc host = parse_url(url).netloc
@@ -22,15 +22,18 @@ def url_is_from_any_domain(url, domains):
else: else:
return False return False
def url_is_from_spider(url, spider):
    """Return True if the url belongs to the given spider.

    A url matches when it is from the spider's name or from any entry of
    the spider's ``allowed_domains`` attribute (which may be a list, set
    or tuple; it defaults to empty when absent).
    """
    # Coerce allowed_domains to a list so set/tuple values concatenate cleanly.
    extra_domains = list(getattr(spider, 'allowed_domains', []))
    candidate_domains = [spider.name] + extra_domains
    return url_is_from_any_domain(url, candidate_domains)
def url_has_any_extension(url, extensions):
    """Return True if the url path's file extension is in *extensions*.

    The extension is extracted from the parsed url's path and lowercased
    before the membership test (e.g. ``'.PDF'`` matches ``'.pdf'``).
    """
    path = parse_url(url).path
    extension = posixpath.splitext(path)[1].lower()
    return extension in extensions
def canonicalize_url(url, keep_blank_values=True, keep_fragments=False, \
def canonicalize_url(url, keep_blank_values=True, keep_fragments=False,
encoding=None): encoding=None):
"""Canonicalize the given url by applying the following procedures: """Canonicalize the given url by applying the following procedures:
@@ -70,6 +73,7 @@ def parse_url(url, encoding=None):
return url if isinstance(url, urlparse.ParseResult) else \ return url if isinstance(url, urlparse.ParseResult) else \
urlparse.urlparse(unicode_to_str(url, encoding)) urlparse.urlparse(unicode_to_str(url, encoding))
def escape_ajax(url): def escape_ajax(url):
""" """
Return the crawleable url according to: Return the crawleable url according to: