
improved propagation of download errors to the spiders, and removed no-longer-needed code to simplify

Pablo Hoffman 2011-07-13 14:10:05 -03:00
parent 804c0279ec
commit 0b6c7ce9b8
3 changed files with 18 additions and 38 deletions
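
The user-visible effect of the change: a Request errback defined in a spider now receives the original download Failure (DNS error, timeout, connection lost) instead of a blanket IgnoreRequest. A hypothetical spider sketch taking advantage of that — the wiring follows scrapy's Request API of this era, but the spider, URL and method names are made up:

from scrapy.http import Request
from scrapy.spider import BaseSpider

class ErrorAwareSpider(BaseSpider):
    name = "error_aware"

    def start_requests(self):
        # errback fires if the download itself fails
        yield Request("http://example.com/", callback=self.parse,
                      errback=self.handle_error)

    def parse(self, response):
        pass

    def handle_error(self, failure):
        # failure.value is now the real download exception, not IgnoreRequest
        self.log("download failed: %s" % failure.getErrorMessage())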

scrapy/core/engine.py

@@ -13,7 +13,7 @@ from scrapy import log, signals
 from scrapy.stats import stats
 from scrapy.core.downloader import Downloader
 from scrapy.core.scraper import Scraper
-from scrapy.exceptions import IgnoreRequest, DontCloseSpider
+from scrapy.exceptions import DontCloseSpider
 from scrapy.http import Response, Request
 from scrapy.utils.misc import load_object
 from scrapy.utils.signal import send_catch_log, send_catch_log_deferred
@@ -139,7 +139,7 @@ class ExecutionEngine(object):
         # response is a Response or Failure
         d = defer.Deferred()
         d.addBoth(self.scraper.enqueue_scrape, request, spider)
-        d.addErrback(log.err, "Unhandled error on engine.crawl()", spider=spider)
+        d.addErrback(log.err, spider=spider)
         if isinstance(response, Failure):
             d.errback(response)
         else:
@@ -204,27 +204,12 @@ class ExecutionEngine(object):
                 response=response, request=request, spider=spider)
             return response

-        def _on_error(_failure):
-            """handle an error processing a page"""
-            exc = _failure.value
-            if isinstance(exc, IgnoreRequest):
-                errmsg = _failure.getErrorMessage()
-            else:
-                errmsg = str(_failure)
-            if errmsg:
-                log.msg("Error downloading <%s>: %s" % (request.url, errmsg), \
-                    level=log.ERROR, spider=spider)
-            return Failure(IgnoreRequest(str(exc)))
-
         def _on_complete(_):
             self.next_request(spider)
             return _

-        if spider not in self.downloader.sites:
-            return defer.fail(Failure(IgnoreRequest())).addBoth(_on_complete)
-
         dwld = mustbe_deferred(self.downloader.fetch, request, spider)
-        dwld.addCallbacks(_on_success, _on_error)
+        dwld.addCallback(_on_success)
         dwld.addBoth(_on_complete)
         return dwld
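
For readers less familiar with Twisted: callbacks added with addCallback() are skipped while a Deferred carries a Failure, which is exactly why dropping _on_error lets the original download Failure travel untouched into the scraper (and from there to the spider). A minimal, self-contained sketch of that mechanic — download() and on_success() are made-up stand-ins for Downloader.fetch and the _on_success above:

from twisted.internet import defer

def download(succeed):
    # stand-in for Downloader.fetch: either a page or a download failure
    if succeed:
        return defer.succeed("<html>page</html>")
    return defer.fail(ValueError("connection lost"))

def on_success(response):
    print("downloaded: %s" % response)
    return response

# Old chain: addCallbacks(_on_success, _on_error) trapped every Failure and
# re-raised it as IgnoreRequest, hiding the real exception from spiders.
# New chain: addCallback(_on_success) is simply skipped when the Deferred
# holds a Failure, so the original error reaches the next errback intact.
d = download(succeed=False)
d.addCallback(on_success)  # skipped: d already carries a Failure
d.addErrback(lambda f: print("errback got: %s" % f.getErrorMessage()))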

scrapy/core/scraper.py

@@ -10,7 +10,7 @@ from scrapy.utils.defer import defer_result, defer_succeed, parallel, iter_errback
 from scrapy.utils.spider import iterate_spider_output
 from scrapy.utils.misc import load_object
 from scrapy.utils.signal import send_catch_log, send_catch_log_deferred
-from scrapy.exceptions import CloseSpider, IgnoreRequest, DropItem
+from scrapy.exceptions import CloseSpider, DropItem
 from scrapy import signals
 from scrapy.http import Request, Response
 from scrapy.item import BaseItem
@@ -134,7 +134,7 @@ class Scraper(object):
         else:
             # FIXME: don't ignore errors in spider middleware
             dfd = self.call_spider(request_result, request, spider)
-            return dfd.addErrback(self._check_propagated_failure, \
+            return dfd.addErrback(self._log_download_errors, \
                 request_result, request, spider)

     def call_spider(self, result, request, spider):
@@ -142,15 +142,12 @@ class Scraper(object):
         dfd.addCallbacks(request.callback or spider.parse, request.errback)
         return dfd.addCallback(iterate_spider_output)

-    def handle_spider_error(self, _failure, request, response, spider, propagated_failure=None):
+    def handle_spider_error(self, _failure, request, response, spider):
         exc = _failure.value
         if isinstance(exc, CloseSpider):
             self.engine.close_spider(spider, exc.reason or 'cancelled')
             return
-        referer = request.headers.get('Referer', None)
-        msg = "Spider error processing <%s> (referer: <%s>)" % \
-            (request.url, referer)
-        log.err(_failure, msg, spider=spider)
+        log.err(_failure, "Spider error processing %s" % request, spider=spider)
         send_catch_log(signal=signals.spider_error, failure=_failure, response=response, \
             spider=spider)
         stats.inc_value("spider_exceptions/%s" % _failure.value.__class__.__name__, \
@@ -183,20 +180,15 @@ class Scraper(object):
             log.msg("Spider must return Request, BaseItem or None, got %r in %s" % \
                 (type(output).__name__, request), log.ERROR, spider=spider)

-    def _check_propagated_failure(self, spider_failure, propagated_failure, request, spider):
-        """Log and silence the bugs raised outside of spiders, but still allow
-        spiders to be notified about general failures while downloading spider
-        generated requests
+    def _log_download_errors(self, spider_failure, download_failure, request, spider):
+        """Log and silence errors that come from the engine (typically download
+        errors that got propagated thru here)
         """
-        # ignored requests are commonly propagated exceptions safes to be silenced
-        if isinstance(spider_failure.value, IgnoreRequest):
+        if spider_failure is download_failure:
+            log.msg("Error downloading %s: %s" % \
+                (request, spider_failure.getErrorMessage()), log.ERROR, spider=spider)
             return
-        elif spider_failure is propagated_failure:
-            log.err(spider_failure, 'Unhandled error propagated to spider', \
-                spider=spider)
-            return # stop propagating this error
-        else:
-            return spider_failure # exceptions raised in the spider code
+        return spider_failure

     def _itemproc_finished(self, output, item, response, spider):
         """ItemProcessor finished for the given ``item`` and returned ``output``

scrapy/logformatter.py

@@ -1,5 +1,7 @@
 import os

+from twisted.python.failure import Failure
+
 class LogFormatter(object):
     """Class for generating log messages for different actions. All methods
     must return a plain string which doesn't include the log level or the
@@ -13,7 +15,8 @@ class LogFormatter(object):
             request, referer, flags)

     def scraped(self, item, response, spider):
-        return u"Scraped from %s%s%s" % (response, os.linesep, item)
+        src = response.getErrorMessage() if isinstance(response, Failure) else response
+        return u"Scraped from %s%s%s" % (src, os.linesep, item)

     def dropped(self, item, exception, response, spider):
         return u"Dropped: %s%s%s" % (exception, os.linesep, item)