mirror of https://github.com/scrapy/scrapy.git

allow spider allowed_domains to be set/tuple, #259

This commit is contained in:
Steven Almeroth 2013-03-06 00:19:47 -06:00
parent 19d0942c74
commit b48ec1dce4
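The diff below fixes #259: url_is_from_spider concatenated [spider.name] directly with allowed_domains, and Python list concatenation raises TypeError when allowed_domains is declared as a set or tuple. Coercing the attribute with list() accepts all three forms. A minimal sketch of the failure and the fix (the spider name and domains here are made up for illustration):

    spider_name = 'example'
    allowed_domains = ('example.com', 'example.org')  # a tuple, as in #259

    # Before this commit: list + tuple raises TypeError
    try:
        domains = [spider_name] + allowed_domains
    except TypeError as exc:
        print(exc)  # can only concatenate list (not "tuple") to list

    # After this commit: list() coerces list, tuple, or set alike
    domains = [spider_name] + list(allowed_domains)
    print(domains)  # ['example', 'example.com', 'example.org']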

scrapy/utils/url.py

@@ -5,7 +5,6 @@ library.
Some of the functions that used to be imported from this module have been moved
to the w3lib.url module. Always import those from there instead.
"""
import urlparse
import urllib
import cgi
@@ -13,6 +12,7 @@ import cgi
from w3lib.url import *
from scrapy.utils.python import unicode_to_str
def url_is_from_any_domain(url, domains):
"""Return True if the url belongs to any of the given domains"""
host = parse_url(url).netloc
@@ -22,15 +22,18 @@ def url_is_from_any_domain(url, domains):
    else:
        return False
def url_is_from_spider(url, spider):
"""Return True if the url belongs to the given spider"""
return url_is_from_any_domain(url, [spider.name] + \
getattr(spider, 'allowed_domains', []))
return url_is_from_any_domain(url,
[spider.name] + list(getattr(spider, 'allowed_domains', [])))
def url_has_any_extension(url, extensions):
    return posixpath.splitext(parse_url(url).path)[1].lower() in extensions
-def canonicalize_url(url, keep_blank_values=True, keep_fragments=False, \
+def canonicalize_url(url, keep_blank_values=True, keep_fragments=False,
        encoding=None):
    """Canonicalize the given url by applying the following procedures:
@@ -70,6 +73,7 @@ def parse_url(url, encoding=None):
    return url if isinstance(url, urlparse.ParseResult) else \
        urlparse.urlparse(unicode_to_str(url, encoding))
def escape_ajax(url):
"""
Return the crawleable url according to:
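For context on the docstring cut off above: escape_ajax implements Google's AJAX crawling scheme, rewriting "hash-bang" URLs so the #! fragment travels as an _escaped_fragment_ query parameter that crawlers can fetch. A rough sketch of the expected behavior (the URLs are illustrative, not taken from this diff):

    from scrapy.utils.url import escape_ajax

    # Fragment starts with '!', so the URL is rewritten for crawlers
    print(escape_ajax('http://www.example.com/ajax.html#!key=value'))
    # -> http://www.example.com/ajax.html?_escaped_fragment_=key%3Dvalue

    # URLs without a '#!' fragment are returned unchanged
    print(escape_ajax('http://www.example.com/page.html'))
    # -> http://www.example.com/page.html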