mirror of
https://github.com/scrapy/scrapy.git
synced 2025-02-25 19:04:21 +00:00
Improved propagation of download errors to spiders, and removed code that is no longer needed in order to simplify the engine and scraper
This commit is contained in:
parent
804c0279ec
commit
0b6c7ce9b8
@ -13,7 +13,7 @@ from scrapy import log, signals
|
||||
from scrapy.stats import stats
|
||||
from scrapy.core.downloader import Downloader
|
||||
from scrapy.core.scraper import Scraper
|
||||
from scrapy.exceptions import IgnoreRequest, DontCloseSpider
|
||||
from scrapy.exceptions import DontCloseSpider
|
||||
from scrapy.http import Response, Request
|
||||
from scrapy.utils.misc import load_object
|
||||
from scrapy.utils.signal import send_catch_log, send_catch_log_deferred
|
||||
@ -139,7 +139,7 @@ class ExecutionEngine(object):
|
||||
# response is a Response or Failure
|
||||
d = defer.Deferred()
|
||||
d.addBoth(self.scraper.enqueue_scrape, request, spider)
|
||||
d.addErrback(log.err, "Unhandled error on engine.crawl()", spider=spider)
|
||||
d.addErrback(log.err, spider=spider)
|
||||
if isinstance(response, Failure):
|
||||
d.errback(response)
|
||||
else:
|
||||
@ -204,27 +204,12 @@ class ExecutionEngine(object):
|
||||
response=response, request=request, spider=spider)
|
||||
return response
|
||||
|
||||
def _on_error(_failure):
|
||||
"""handle an error processing a page"""
|
||||
exc = _failure.value
|
||||
if isinstance(exc, IgnoreRequest):
|
||||
errmsg = _failure.getErrorMessage()
|
||||
else:
|
||||
errmsg = str(_failure)
|
||||
if errmsg:
|
||||
log.msg("Error downloading <%s>: %s" % (request.url, errmsg), \
|
||||
level=log.ERROR, spider=spider)
|
||||
return Failure(IgnoreRequest(str(exc)))
|
||||
|
||||
def _on_complete(_):
|
||||
self.next_request(spider)
|
||||
return _
|
||||
|
||||
if spider not in self.downloader.sites:
|
||||
return defer.fail(Failure(IgnoreRequest())).addBoth(_on_complete)
|
||||
|
||||
dwld = mustbe_deferred(self.downloader.fetch, request, spider)
|
||||
dwld.addCallbacks(_on_success, _on_error)
|
||||
dwld.addCallback(_on_success)
|
||||
dwld.addBoth(_on_complete)
|
||||
return dwld
|
||||
|
||||
|
@ -10,7 +10,7 @@ from scrapy.utils.defer import defer_result, defer_succeed, parallel, iter_errba
|
||||
from scrapy.utils.spider import iterate_spider_output
|
||||
from scrapy.utils.misc import load_object
|
||||
from scrapy.utils.signal import send_catch_log, send_catch_log_deferred
|
||||
from scrapy.exceptions import CloseSpider, IgnoreRequest, DropItem
|
||||
from scrapy.exceptions import CloseSpider, DropItem
|
||||
from scrapy import signals
|
||||
from scrapy.http import Request, Response
|
||||
from scrapy.item import BaseItem
|
||||
@ -134,7 +134,7 @@ class Scraper(object):
|
||||
else:
|
||||
# FIXME: don't ignore errors in spider middleware
|
||||
dfd = self.call_spider(request_result, request, spider)
|
||||
return dfd.addErrback(self._check_propagated_failure, \
|
||||
return dfd.addErrback(self._log_download_errors, \
|
||||
request_result, request, spider)
|
||||
|
||||
def call_spider(self, result, request, spider):
|
||||
@ -142,15 +142,12 @@ class Scraper(object):
|
||||
dfd.addCallbacks(request.callback or spider.parse, request.errback)
|
||||
return dfd.addCallback(iterate_spider_output)
|
||||
|
||||
def handle_spider_error(self, _failure, request, response, spider, propagated_failure=None):
|
||||
def handle_spider_error(self, _failure, request, response, spider):
|
||||
exc = _failure.value
|
||||
if isinstance(exc, CloseSpider):
|
||||
self.engine.close_spider(spider, exc.reason or 'cancelled')
|
||||
return
|
||||
referer = request.headers.get('Referer', None)
|
||||
msg = "Spider error processing <%s> (referer: <%s>)" % \
|
||||
(request.url, referer)
|
||||
log.err(_failure, msg, spider=spider)
|
||||
log.err(_failure, "Spider error processing %s" % request, spider=spider)
|
||||
send_catch_log(signal=signals.spider_error, failure=_failure, response=response, \
|
||||
spider=spider)
|
||||
stats.inc_value("spider_exceptions/%s" % _failure.value.__class__.__name__, \
|
||||
@ -183,20 +180,15 @@ class Scraper(object):
|
||||
log.msg("Spider must return Request, BaseItem or None, got %r in %s" % \
|
||||
(type(output).__name__, request), log.ERROR, spider=spider)
|
||||
|
||||
def _check_propagated_failure(self, spider_failure, propagated_failure, request, spider):
|
||||
"""Log and silence the bugs raised outside of spiders, but still allow
|
||||
spiders to be notified about general failures while downloading spider
|
||||
generated requests
|
||||
def _log_download_errors(self, spider_failure, download_failure, request, spider):
|
||||
"""Log and silence errors that come from the engine (typically download
|
||||
errors that got propagated thru here)
|
||||
"""
|
||||
# ignored requests are commonly propagated exceptions that are safe to silence
|
||||
if isinstance(spider_failure.value, IgnoreRequest):
|
||||
if spider_failure is download_failure:
|
||||
log.msg("Error downloading %s: %s" % \
|
||||
(request, spider_failure.getErrorMessage()), log.ERROR, spider=spider)
|
||||
return
|
||||
elif spider_failure is propagated_failure:
|
||||
log.err(spider_failure, 'Unhandled error propagated to spider', \
|
||||
spider=spider)
|
||||
return # stop propagating this error
|
||||
else:
|
||||
return spider_failure # exceptions raised in the spider code
|
||||
return spider_failure
|
||||
|
||||
def _itemproc_finished(self, output, item, response, spider):
|
||||
"""ItemProcessor finished for the given ``item`` and returned ``output``
|
||||
|
@ -1,5 +1,7 @@
|
||||
import os
|
||||
|
||||
from twisted.python.failure import Failure
|
||||
|
||||
class LogFormatter(object):
|
||||
"""Class for generating log messages for different actions. All methods
|
||||
must return a plain string which doesn't include the log level or the
|
||||
@ -13,7 +15,8 @@ class LogFormatter(object):
|
||||
request, referer, flags)
|
||||
|
||||
def scraped(self, item, response, spider):
|
||||
return u"Scraped from %s%s%s" % (response, os.linesep, item)
|
||||
src = response.getErrorMessage() if isinstance(response, Failure) else response
|
||||
return u"Scraped from %s%s%s" % (src, os.linesep, item)
|
||||
|
||||
def dropped(self, item, exception, response, spider):
|
||||
return u"Dropped: %s%s%s" % (exception, os.linesep, item)
|
||||
|
Loading…
x
Reference in New Issue
Block a user