diff --git a/docs/ref/spiders.rst b/docs/ref/spiders.rst
index e69c98bbc..950e1a361 100644
--- a/docs/ref/spiders.rst
+++ b/docs/ref/spiders.rst
@@ -46,12 +46,12 @@ method ``parse`` for each of the resulting responses.
 
     This is the method called by Scrapy when the spider is opened for scraping
     when no particular URLs are specified. If particular URLs are specified,
-    the :meth:`BaseSpider.make_request_from_url` is used instead to create the
+    the :meth:`BaseSpider.make_requests_from_url` is used instead to create the
     Requests. This method is also called only once from Scrapy, so it's safe to
     implement it as a generator.
 
-    The default implementation uses :meth:`BaseSpider.make_request_from_url` to
-    generate Requests for each url in :attr:`start_urls`.
+    The default implementation uses :meth:`BaseSpider.make_requests_from_url`
+    to generate Requests for each url in :attr:`start_urls`.
 
     If you want to change the Requests used to start scraping a domain, this is
     the method to override. For example, if you need to start by login in using
@@ -67,11 +67,13 @@ method ``parse`` for each of the resulting responses.
             # each of them, with another callback
             pass
 
-.. method:: BaseSpider.make_request_from_url(url)
+.. method:: BaseSpider.make_requests_from_url(url)
 
-    A method that receives a URL and returns an :class:`~scrapy.http.Request`
-    object to scrape that URL with this spider. This method is used to
-    construct the initial requests in the :meth:`start_requests` method.
+    A method that receives a URL and returns a :class:`~scrapy.http.Request`
+    object (or a list of :class:`~scrapy.http.Request` objects) to scrape. This
+    method is used to construct the initial requests in the
+    :meth:`start_requests` method, and is typically used to convert urls to
+    requests.
 
     Unless overridden, this method returns Requests with the :meth:`parse`
     method as their callback function, and with dont_filter parameter enabled
diff --git a/scrapy/contrib/spiders/crawl.py b/scrapy/contrib/spiders/crawl.py
index 8d4a97b08..c1ae74cf5 100644
--- a/scrapy/contrib/spiders/crawl.py
+++ b/scrapy/contrib/spiders/crawl.py
@@ -8,7 +8,7 @@ See documentation in docs/ref/spiders.rst
 import copy
 
 from scrapy.http import Request
-from scrapy.spider import BaseSpider
+from scrapy.contrib.spiders.init import InitSpider
 from scrapy.conf import settings
 
 class Rule(object):
@@ -45,7 +45,7 @@ class Rule(object):
         else:
             self.follow = follow
 
-class CrawlSpider(BaseSpider):
+class CrawlSpider(InitSpider):
     """
     Class for spiders that crawl over web pages and extract/parse their links
     given some crawling rules.
diff --git a/scrapy/contrib/spiders/feed.py b/scrapy/contrib/spiders/feed.py
index e41fa41e7..2ef9085c4 100644
--- a/scrapy/contrib/spiders/feed.py
+++ b/scrapy/contrib/spiders/feed.py
@@ -5,14 +5,14 @@ for scraping from an XML feed.
 See documentation in docs/ref/spiders.rst
 """
 
-from scrapy.spider import BaseSpider
+from scrapy.contrib.spiders.init import InitSpider
 from scrapy.item import ScrapedItem
 from scrapy.http import Request
 from scrapy.utils.iterators import xmliter, csviter
 from scrapy.xpath.selector import XmlXPathSelector, HtmlXPathSelector
 from scrapy.core.exceptions import NotConfigured, NotSupported
 
-class XMLFeedSpider(BaseSpider):
+class XMLFeedSpider(InitSpider):
     """
     This class intends to be the base class for spiders that scrape
     from XML feeds.
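Illustrative aside (not part of this patch): the docs/ref/spiders.rst hunk above documents that ``make_requests_from_url`` may now return either a single Request or a list of Requests, which the updated ``BaseSpider.start_requests`` flattens with ``arg_to_iter``. A minimal sketch of a spider overriding it under that contract; the spider name, domain and the extra print-view URL are hypothetical::

    from scrapy.spider import BaseSpider
    from scrapy.http import Request

    class ExampleSpider(BaseSpider):
        domain_name = 'example.com'
        start_urls = ['http://example.com/articles.html']

        def make_requests_from_url(self, url):
            # Returning a list is now supported: request the page itself plus
            # a hypothetical print-friendly variant, both handled by self.parse
            return [
                Request(url, callback=self.parse, dont_filter=True),
                Request(url + '?view=print', callback=self.parse, dont_filter=True),
            ]

Each Request produced this way still goes through the normal scheduling path; only the number of initial requests per start URL changes.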
@@ -88,7 +88,7 @@ class XMLFeedSpider(BaseSpider):
         for (prefix, uri) in self.namespaces:
             selector.register_namespace(prefix, uri)
 
-class CSVFeedSpider(BaseSpider):
+class CSVFeedSpider(InitSpider):
     """Spider for parsing CSV feeds.
     It receives a CSV file in a response; iterates through each of its rows,
     and calls parse_row with a dict containing each field's data.
diff --git a/scrapy/contrib/spiders/init.py b/scrapy/contrib/spiders/init.py
new file mode 100644
index 000000000..b37591ca5
--- /dev/null
+++ b/scrapy/contrib/spiders/init.py
@@ -0,0 +1,44 @@
+from scrapy.spider import BaseSpider
+
+class InitSpider(BaseSpider):
+    """Base Spider with initialization facilities"""
+
+    def __init__(self):
+        super(InitSpider, self).__init__()
+        self._postinit_reqs = []
+        self._init_complete = False
+        self._init_started = False
+
+    def make_requests_from_url(self, url):
+        req = super(InitSpider, self).make_requests_from_url(url)
+        if self._init_complete:
+            return req
+        self._postinit_reqs.append(req)
+        if not self._init_started:
+            self._init_started = True
+            return self.init_request()
+
+    def initialized(self, response=None):
+        """This method must be set as the callback of your last initialization
+        request. See self.init_request() docstring for more info.
+        """
+        self._init_complete = True
+        reqs = self._postinit_reqs[:]
+        del self._postinit_reqs
+        return reqs
+
+    def init_request(self):
+        """This method should return one initialization request, with the
+        self.initialized method as callback. When the self.initialized method
+        is called, this spider is considered initialized. If you need to perform
+        several requests for initializing your spider, you can do so by using
+        different callbacks. The only requirement is that the final callback
+        (of the last initialization request) must be self.initialized.
+
+        The default implementation calls self.initialized immediately, which
+        means that no initialization is needed. This method should be
+        overridden only when you need to perform requests to initialize your
+        spider.
+        """
+        return self.initialized()
+
diff --git a/scrapy/core/engine.py b/scrapy/core/engine.py
index 77fef119c..123a46c1a 100644
--- a/scrapy/core/engine.py
+++ b/scrapy/core/engine.py
@@ -23,7 +23,7 @@ from scrapy.item import ScrapedItem
 from scrapy.item.pipeline import ItemPipelineManager
 from scrapy.spider import spiders
 from scrapy.spider.middleware import SpiderMiddlewareManager
-from scrapy.utils.defer import chain_deferred, defer_succeed, mustbe_deferred, deferred_imap
+from scrapy.utils.defer import chain_deferred, deferred_imap
 from scrapy.utils.request import request_info
 
 class ExecutionEngine(object):
@@ -50,7 +50,6 @@ class ExecutionEngine(object):
     def __init__(self):
         self.configured = False
         self.keep_alive = False
-        self.initializing = set() # domais in intialization state
        self.cancelled = set() # domains in cancelation state
         self.debug_mode = settings.getbool('ENGINE_DEBUG')
         self.tasks = []
@@ -216,8 +215,7 @@ class ExecutionEngine(object):
         pending = self.scheduler.domain_has_pending_requests(domain)
         downloading = not self.downloader.domain_is_idle(domain)
         haspipe = not self.pipeline.domain_is_idle(domain)
-        oninit = domain in self.initializing
-        return not (pending or downloading or haspipe or oninit or scraping)
+        return not (pending or downloading or haspipe or scraping)
 
     def domain_is_open(self, domain):
         return domain in self.downloader.sites
@@ -369,57 +367,6 @@ class ExecutionEngine(object):
         dwld.addBoth(_on_complete)
         return deferred
 
-    def initialize(self, spider):
-        domain = spider.domain_name
-        if not hasattr(spider, 'init_domain'):
-            return defer_succeed(True)
-
-        def _initialize(req):
-            if isinstance(req, Request):
-                _response = None
-                def _referer(response):
-                    req.deferred.addCallback(_setreferer, response)
-                    return response
-
-                def _setreferer(result, response):
-                    if isinstance(result, Request):
-                        result.headers.setdefault('Referer', response.url)
-                    return result
-
-                def _onerror(_failure):
-                    ex = _failure.value
-                    if isinstance(ex, IgnoreRequest):
-                        log.msg(ex.message, log.DEBUG, domain=domain)
-                    else:
-                        return _failure
-
-                schd = self.schedule(req, spider)
-                schd.addCallback(_referer)
-                chain_deferred(schd, req.deferred)
-                schd.addErrback(_onerror)
-                schd.addBoth(_initialize)
-                return schd
-            return req
-
-        def _bugtrap(_failure):
-            log.msg("Bug in %s init_domain code: %s" % (domain, _failure), log.ERROR, domain=domain)
-
-        def _state(state):
-            self.initializing.remove(domain)
-            if state is True:
-                log.msg('Succeded initialization for %s' % domain, log.INFO, domain=domain)
-            else:
-                log.msg('Failed initialization for %s' % domain, log.INFO, domain=domain)
-            return state
-
-        log.msg('Started initialization for %s' % domain, log.INFO, domain=domain)
-        self.initializing.add(domain)
-        req = spider.init_domain()
-        deferred = mustbe_deferred(_initialize, req)
-        deferred.addErrback(_bugtrap)
-        deferred.addCallback(_state)
-        return deferred
-
     def open_domain(self, domain, spider=None):
         log.msg("Domain opened", domain=domain)
         spider = spider or spiders.fromdomain(domain)
@@ -431,15 +378,8 @@ class ExecutionEngine(object):
         self._scraping[domain] = set()
         signals.send_catch_log(signals.domain_open, sender=self.__class__, domain=domain, spider=spider)
 
-        # init_domain
-        dfd = self.initialize(spider)
-        def _state(state):
-            if state is True:
-                signals.send_catch_log(signals.domain_opened, sender=self.__class__, domain=domain, spider=spider)
-                self._run_starters(spider)
-            else:
-                self._domain_idle(domain)
-        dfd.addCallback(_state)
+        signals.send_catch_log(signals.domain_opened, sender=self.__class__, domain=domain, spider=spider)
+        self._run_starters(spider)
 
     def _domain_idle(self, domain):
         """Called when a domain gets idle. This function is called when there are no
diff --git a/scrapy/core/manager.py b/scrapy/core/manager.py
index 655cedcb2..a3918050f 100644
--- a/scrapy/core/manager.py
+++ b/scrapy/core/manager.py
@@ -7,7 +7,7 @@ from scrapy import log
 from scrapy.http import Request
 from scrapy.core.engine import scrapyengine
 from scrapy.spider import spiders
-from scrapy.utils.misc import load_object
+from scrapy.utils.misc import load_object, arg_to_iter
 from scrapy.utils.url import is_url
 from scrapy.conf import settings
 
@@ -126,8 +126,8 @@ class ExecutionManager(object):
         for url in urls:
             spider = spiders.fromurl(url)
             if spider:
-                req = spider.make_request_from_url(url)
-                perdomain.setdefault(spider.domain_name, []).append(req)
+                for req in arg_to_iter(spider.make_requests_from_url(url)):
+                    perdomain.setdefault(spider.domain_name, []).append(req)
             else:
                 log.msg('Could not find spider for <%s>' % url, log.ERROR)
 
diff --git a/scrapy/spider/models.py b/scrapy/spider/models.py
index 33bf3427d..33e3e6d83 100644
--- a/scrapy/spider/models.py
+++ b/scrapy/spider/models.py
@@ -8,6 +8,7 @@ from twisted.plugin import IPlugin
 
 from scrapy import log
 from scrapy.http import Request
+from scrapy.utils.misc import arg_to_iter
 
 def _valid_domain_name(obj):
     """Check the domain name specified is valid"""
@@ -39,13 +40,6 @@ class ISpider(Interface, IPlugin) :
     invariant(_valid_domain_name)
     invariant(_valid_download_delay)
 
-    def init_domain(self):
-        """This is first called to initialize domain specific quirks, like
-        session cookies or login stuff
-        """
-        pass
-
-
 class BaseSpider(object):
     """Base class for scrapy spiders. All spiders must inherit from this
     class.
@@ -64,9 +58,12 @@ class BaseSpider(object):
         log.msg(message, domain=self.domain_name, level=level)
 
     def start_requests(self):
-        return [self.make_request_from_url(url) for url in self.start_urls]
+        reqs = []
+        for url in self.start_urls:
+            reqs.extend(arg_to_iter(self.make_requests_from_url(url)))
+        return reqs
 
-    def make_request_from_url(self, url):
+    def make_requests_from_url(self, url):
         return Request(url, callback=self.parse, dont_filter=True)
 
     def parse(self, response):
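Illustrative aside (not part of this patch): a minimal sketch of a spider built on the new ``InitSpider`` hooks from scrapy/contrib/spiders/init.py. Only the ``init_request``/``initialized`` protocol comes from the patch; the spider name, domain and login URL are hypothetical::

    from scrapy.http import Request
    from scrapy.contrib.spiders.init import InitSpider

    class MembersSpider(InitSpider):
        domain_name = 'example.com'
        start_urls = ['http://example.com/members/index.html']

        def init_request(self):
            # Hypothetical session-priming request. The final callback of the
            # last initialization request must be self.initialized, which marks
            # the spider as initialized and releases the queued start requests.
            return Request('http://example.com/members/login', callback=self.initialized)

        def parse(self, response):
            # Normal parsing of the pages reached once initialization is done.
            pass

While initialization is pending, requests generated from ``start_urls`` are held in ``_postinit_reqs`` and are only handed back to the engine when ``initialized()`` runs.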