diff --git a/docs/faq.rst b/docs/faq.rst
index 953d28be6..5b1d30ec0 100644
--- a/docs/faq.rst
+++ b/docs/faq.rst
@@ -240,3 +240,8 @@ How can I see the cookies being sent and received from Scrapy?
 
 Enable the :setting:`COOKIES_DEBUG` setting.
 
+How can I manually stop a running spider?
+-----------------------------------------
+
+Raise the :exc:`~scrapy.exceptions.CloseSpider` exception from a callback. For
+more information see :exc:`~scrapy.exceptions.CloseSpider`.
diff --git a/docs/topics/exceptions.rst b/docs/topics/exceptions.rst
index d75f28fed..8a10ee796 100644
--- a/docs/topics/exceptions.rst
+++ b/docs/topics/exceptions.rst
@@ -22,6 +22,23 @@ DropItem
 
 The exception that must be raised by item pipeline stages to stop processing an
 Item. For more information see :ref:`topics-item-pipeline`.
 
+CloseSpider
+-----------
+
+.. exception:: CloseSpider(reason='cancelled')
+
+    This exception can be raised from a spider callback to request the spider
+    to be closed/stopped. Supported arguments:
+
+    :param reason: the reason for closing
+    :type reason: str
+
+For example::
+
+    def parse_page(self, response):
+        if 'Bandwidth exceeded' in response.body:
+            raise CloseSpider('bandwidth_exceeded')
+
 IgnoreRequest
 -------------
diff --git a/scrapy/core/scraper.py b/scrapy/core/scraper.py
index 0235a21bc..04d54d1a8 100644
--- a/scrapy/core/scraper.py
+++ b/scrapy/core/scraper.py
@@ -10,7 +10,7 @@ from scrapy.utils.defer import defer_result, defer_succeed, parallel, iter_errba
 from scrapy.utils.spider import iterate_spider_output
 from scrapy.utils.misc import load_object
 from scrapy.utils.signal import send_catch_log, send_catch_log_deferred
-from scrapy.exceptions import IgnoreRequest, DropItem
+from scrapy.exceptions import CloseSpider, IgnoreRequest, DropItem
 from scrapy import signals
 from scrapy.http import Request, Response
 from scrapy.item import BaseItem
@@ -143,6 +143,10 @@ class Scraper(object):
         return dfd.addCallback(iterate_spider_output)
 
     def handle_spider_error(self, _failure, request, response, spider, propagated_failure=None):
+        exc = _failure.value
+        if isinstance(exc, CloseSpider):
+            self.engine.close_spider(spider, exc.reason or 'cancelled')
+            return
         referer = request.headers.get('Referer', None)
         msg = "Spider error processing <%s> (referer: <%s>)" % \
             (request.url, referer)
diff --git a/scrapy/exceptions.py b/scrapy/exceptions.py
index e52dafc6d..43e142160 100644
--- a/scrapy/exceptions.py
+++ b/scrapy/exceptions.py
@@ -20,6 +20,12 @@ class DontCloseSpider(Exception):
     """Request the spider not to be closed yet"""
     pass
 
+class CloseSpider(Exception):
+    """Raise this from callbacks to request the spider to be closed"""
+
+    def __init__(self, reason='cancelled'):
+        self.reason = reason
+
 # Items
 
 class DropItem(Exception):
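
A usage sketch for the patch above. The spider below is illustrative (its
name, URL, and the trigger string are assumptions, not part of the change),
written against the ``BaseSpider`` API of the same era. Raising
``CloseSpider`` from any callback is caught by
``Scraper.handle_spider_error``, which asks the engine to close the spider
with the given reason instead of logging a spider error::

    from scrapy.spider import BaseSpider
    from scrapy.exceptions import CloseSpider

    class MirrorSpider(BaseSpider):
        """Hypothetical spider that stops itself when the site rate-limits us."""
        name = 'mirror'
        start_urls = ['http://www.example.com/page/1']

        def parse(self, response):
            # A CloseSpider raised here is treated as a stop request, not
            # an error: the scraper calls engine.close_spider(spider, reason).
            if 'Bandwidth exceeded' in response.body:
                raise CloseSpider('bandwidth_exceeded')
            self.log("parsed %s" % response.url)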
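
The ``reason`` string then surfaces through the ``spider_closed`` signal, so
extensions can distinguish a manual stop from a normal ``'finished'``
shutdown. A minimal sketch, assuming the era's ``pydispatch``-style signal
connection (the extension class itself is hypothetical)::

    from scrapy.xlib.pydispatch import dispatcher
    from scrapy import signals

    class CloseReasonLogger(object):
        """Hypothetical extension that logs why each spider closed."""

        def __init__(self):
            dispatcher.connect(self.spider_closed, signal=signals.spider_closed)

        def spider_closed(self, spider, reason):
            # reason is 'finished' for a normal shutdown, or the string
            # passed to CloseSpider (e.g. 'bandwidth_exceeded').
            print "%s closed: %s" % (spider.name, reason)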