Merge pull request #1060 from Curita/python-logging

[MRG+1] Python logging

Commit fbb1078f58
conftest.py
@@ -1,6 +1,5 @@
 import six
 import pytest
-from twisted.python import log

 collect_ignore = ["scrapy/stats.py", "scrapy/project.py"]

@@ -10,35 +9,6 @@ if six.PY3:
         if len(file_path) > 0 and file_path[0] != '#':
             collect_ignore.append(file_path)

-
-class LogObservers:
-    """Class for keeping track of log observers across test modules"""
-
-    def __init__(self):
-        self.observers = []
-
-    def add(self, logfile='test.log'):
-        fileobj = open(logfile, 'wb')
-        observer = log.FileLogObserver(fileobj)
-        log.startLoggingWithObserver(observer.emit, 0)
-        self.observers.append((fileobj, observer))
-
-    def remove(self):
-        fileobj, observer = self.observers.pop()
-        log.removeObserver(observer.emit)
-        fileobj.close()
-
-
-@pytest.fixture(scope='module')
-def log_observers():
-    return LogObservers()
-
-
-@pytest.fixture()
-def setlog(request, log_observers):
-    """Attach test.log file observer to twisted log, for trial compatibility"""
-    log_observers.add()
-    request.addfinalizer(log_observers.remove)
-
-
 @pytest.fixture()
 def chdir(tmpdir):
@@ -119,7 +119,7 @@ Built-in services
    topics/webservice

 :doc:`topics/logging`
-    Understand the simple logging facility provided by Scrapy.
+    Learn how to use Python's builtin logging on Scrapy.

 :doc:`topics/stats`
     Collect statistics about your scraping crawler.
@ -154,15 +154,14 @@ similar to this::
|
||||
2014-01-23 18:13:07-0400 [scrapy] INFO: Enabled downloader middlewares: ...
|
||||
2014-01-23 18:13:07-0400 [scrapy] INFO: Enabled spider middlewares: ...
|
||||
2014-01-23 18:13:07-0400 [scrapy] INFO: Enabled item pipelines: ...
|
||||
2014-01-23 18:13:07-0400 [dmoz] INFO: Spider opened
|
||||
2014-01-23 18:13:08-0400 [dmoz] DEBUG: Crawled (200) <GET http://www.dmoz.org/Computers/Programming/Languages/Python/Resources/> (referer: None)
|
||||
2014-01-23 18:13:09-0400 [dmoz] DEBUG: Crawled (200) <GET http://www.dmoz.org/Computers/Programming/Languages/Python/Books/> (referer: None)
|
||||
2014-01-23 18:13:09-0400 [dmoz] INFO: Closing spider (finished)
|
||||
2014-01-23 18:13:07-0400 [scrapy] INFO: Spider opened
|
||||
2014-01-23 18:13:08-0400 [scrapy] DEBUG: Crawled (200) <GET http://www.dmoz.org/Computers/Programming/Languages/Python/Resources/> (referer: None)
|
||||
2014-01-23 18:13:09-0400 [scrapy] DEBUG: Crawled (200) <GET http://www.dmoz.org/Computers/Programming/Languages/Python/Books/> (referer: None)
|
||||
2014-01-23 18:13:09-0400 [scrapy] INFO: Closing spider (finished)
|
||||
|
||||
|
||||
.. note::
|
||||
The lines containing ``[dmoz]`` correspond to the log messages for
|
||||
our spider. You can see a log line for each URL defined in ``start_urls``.
|
||||
At the end you can see a log line for each URL defined in ``start_urls``.
|
||||
Because these URLs are the starting ones, they have no referrers, which is
|
||||
shown at the end of the log line, where it says ``(referer: None)``.
|
||||
|
||||
@ -268,7 +267,7 @@ This is what the shell looks like::
|
||||
|
||||
[ ... Scrapy log here ... ]
|
||||
|
||||
2014-01-23 17:11:42-0400 [default] DEBUG: Crawled (200) <GET http://www.dmoz.org/Computers/Programming/Languages/Python/Books/> (referer: None)
|
||||
2014-01-23 17:11:42-0400 [scrapy] DEBUG: Crawled (200) <GET http://www.dmoz.org/Computers/Programming/Languages/Python/Books/> (referer: None)
|
||||
[s] Available Scrapy objects:
|
||||
[s] crawler <scrapy.crawler.Crawler object at 0x3636b50>
|
||||
[s] item {}
|
||||
@ -424,11 +423,11 @@ Spider would be like this::
|
||||
|
||||
Now crawling dmoz.org yields ``DmozItem`` objects::
|
||||
|
||||
[dmoz] DEBUG: Scraped from <200 http://www.dmoz.org/Computers/Programming/Languages/Python/Books/>
|
||||
[scrapy] DEBUG: Scraped from <200 http://www.dmoz.org/Computers/Programming/Languages/Python/Books/>
|
||||
{'desc': [u' - By David Mertz; Addison Wesley. Book in progress, full text, ASCII format. Asks for feedback. [author website, Gnosis Software, Inc.\n],
|
||||
'link': [u'http://gnosis.cx/TPiP/'],
|
||||
'title': [u'Text Processing in Python']}
|
||||
[dmoz] DEBUG: Scraped from <200 http://www.dmoz.org/Computers/Programming/Languages/Python/Books/>
|
||||
[scrapy] DEBUG: Scraped from <200 http://www.dmoz.org/Computers/Programming/Languages/Python/Books/>
|
||||
{'desc': [u' - By Sean McGrath; Prentice Hall PTR, 2000, ISBN 0130211192, has CD-ROM. Methods to build XML applications fast, Python tutorial, DOM and SAX, new Pyxie open source XML processing library. [Prentice Hall PTR]\n'],
|
||||
'link': [u'http://www.informit.com/store/product.aspx?isbn=0130211192'],
|
||||
'title': [u'XML Processing with Python']}
|
||||
|
@ -19,20 +19,20 @@ To run it use::
|
||||
You should see an output like this::
|
||||
|
||||
2013-05-16 13:08:46-0300 [scrapy] INFO: Scrapy 0.17.0 started (bot: scrapybot)
|
||||
2013-05-16 13:08:47-0300 [follow] INFO: Spider opened
|
||||
2013-05-16 13:08:47-0300 [follow] INFO: Crawled 0 pages (at 0 pages/min), scraped 0 items (at 0 items/min)
|
||||
2013-05-16 13:08:48-0300 [follow] INFO: Crawled 74 pages (at 4440 pages/min), scraped 0 items (at 0 items/min)
|
||||
2013-05-16 13:08:49-0300 [follow] INFO: Crawled 143 pages (at 4140 pages/min), scraped 0 items (at 0 items/min)
|
||||
2013-05-16 13:08:50-0300 [follow] INFO: Crawled 210 pages (at 4020 pages/min), scraped 0 items (at 0 items/min)
|
||||
2013-05-16 13:08:51-0300 [follow] INFO: Crawled 274 pages (at 3840 pages/min), scraped 0 items (at 0 items/min)
|
||||
2013-05-16 13:08:52-0300 [follow] INFO: Crawled 343 pages (at 4140 pages/min), scraped 0 items (at 0 items/min)
|
||||
2013-05-16 13:08:53-0300 [follow] INFO: Crawled 410 pages (at 4020 pages/min), scraped 0 items (at 0 items/min)
|
||||
2013-05-16 13:08:54-0300 [follow] INFO: Crawled 474 pages (at 3840 pages/min), scraped 0 items (at 0 items/min)
|
||||
2013-05-16 13:08:55-0300 [follow] INFO: Crawled 538 pages (at 3840 pages/min), scraped 0 items (at 0 items/min)
|
||||
2013-05-16 13:08:56-0300 [follow] INFO: Crawled 602 pages (at 3840 pages/min), scraped 0 items (at 0 items/min)
|
||||
2013-05-16 13:08:57-0300 [follow] INFO: Closing spider (closespider_timeout)
|
||||
2013-05-16 13:08:57-0300 [follow] INFO: Crawled 666 pages (at 3840 pages/min), scraped 0 items (at 0 items/min)
|
||||
2013-05-16 13:08:57-0300 [follow] INFO: Dumping Scrapy stats:
|
||||
2013-05-16 13:08:47-0300 [scrapy] INFO: Spider opened
|
||||
2013-05-16 13:08:47-0300 [scrapy] INFO: Crawled 0 pages (at 0 pages/min), scraped 0 items (at 0 items/min)
|
||||
2013-05-16 13:08:48-0300 [scrapy] INFO: Crawled 74 pages (at 4440 pages/min), scraped 0 items (at 0 items/min)
|
||||
2013-05-16 13:08:49-0300 [scrapy] INFO: Crawled 143 pages (at 4140 pages/min), scraped 0 items (at 0 items/min)
|
||||
2013-05-16 13:08:50-0300 [scrapy] INFO: Crawled 210 pages (at 4020 pages/min), scraped 0 items (at 0 items/min)
|
||||
2013-05-16 13:08:51-0300 [scrapy] INFO: Crawled 274 pages (at 3840 pages/min), scraped 0 items (at 0 items/min)
|
||||
2013-05-16 13:08:52-0300 [scrapy] INFO: Crawled 343 pages (at 4140 pages/min), scraped 0 items (at 0 items/min)
|
||||
2013-05-16 13:08:53-0300 [scrapy] INFO: Crawled 410 pages (at 4020 pages/min), scraped 0 items (at 0 items/min)
|
||||
2013-05-16 13:08:54-0300 [scrapy] INFO: Crawled 474 pages (at 3840 pages/min), scraped 0 items (at 0 items/min)
|
||||
2013-05-16 13:08:55-0300 [scrapy] INFO: Crawled 538 pages (at 3840 pages/min), scraped 0 items (at 0 items/min)
|
||||
2013-05-16 13:08:56-0300 [scrapy] INFO: Crawled 602 pages (at 3840 pages/min), scraped 0 items (at 0 items/min)
|
||||
2013-05-16 13:08:57-0300 [scrapy] INFO: Closing spider (closespider_timeout)
|
||||
2013-05-16 13:08:57-0300 [scrapy] INFO: Crawled 666 pages (at 3840 pages/min), scraped 0 items (at 0 items/min)
|
||||
2013-05-16 13:08:57-0300 [scrapy] INFO: Dumping Scrapy stats:
|
||||
{'downloader/request_bytes': 231508,
|
||||
'downloader/request_count': 682,
|
||||
'downloader/request_method_count/GET': 682,
|
||||
@ -49,7 +49,7 @@ You should see an output like this::
|
||||
'scheduler/enqueued': 12767,
|
||||
'scheduler/enqueued/memory': 12767,
|
||||
'start_time': datetime.datetime(2013, 5, 16, 16, 8, 47, 676539)}
|
||||
2013-05-16 13:08:57-0300 [follow] INFO: Spider closed (closespider_timeout)
|
||||
2013-05-16 13:08:57-0300 [scrapy] INFO: Spider closed (closespider_timeout)
|
||||
|
||||
That tells you that Scrapy is able to crawl about 3900 pages per minute in the
|
||||
hardware where you run it. Note that this is a very simple spider intended to
|
||||
|
@@ -132,16 +132,13 @@ Logging is another useful option for getting information about your spider run.
 Although not as convenient, it comes with the advantage that the logs will be
 available in all future runs should they be necessary again::

-    from scrapy import log
-
     def parse_details(self, response):
         item = response.meta.get('item', None)
         if item:
             # populate more `item` fields
             return item
         else:
-            self.log('No item received for %s' % response.url,
-                level=log.WARNING)
+            self.logger.warning('No item received for %s', response.url)

 For more information, check the :ref:`topics-logging` section.
@ -232,14 +232,14 @@ header) and all cookies received in responses (ie. ``Set-Cookie`` header).
|
||||
|
||||
Here's an example of a log with :setting:`COOKIES_DEBUG` enabled::
|
||||
|
||||
2011-04-06 14:35:10-0300 [diningcity] INFO: Spider opened
|
||||
2011-04-06 14:35:10-0300 [diningcity] DEBUG: Sending cookies to: <GET http://www.diningcity.com/netherlands/index.html>
|
||||
2011-04-06 14:35:10-0300 [scrapy] INFO: Spider opened
|
||||
2011-04-06 14:35:10-0300 [scrapy] DEBUG: Sending cookies to: <GET http://www.diningcity.com/netherlands/index.html>
|
||||
Cookie: clientlanguage_nl=en_EN
|
||||
2011-04-06 14:35:14-0300 [diningcity] DEBUG: Received cookies from: <200 http://www.diningcity.com/netherlands/index.html>
|
||||
2011-04-06 14:35:14-0300 [scrapy] DEBUG: Received cookies from: <200 http://www.diningcity.com/netherlands/index.html>
|
||||
Set-Cookie: JSESSIONID=B~FA4DC0C496C8762AE4F1A620EAB34F38; Path=/
|
||||
Set-Cookie: ip_isocode=US
|
||||
Set-Cookie: clientlanguage_nl=en_EN; Expires=Thu, 07-Apr-2011 21:21:34 GMT; Path=/
|
||||
2011-04-06 14:49:50-0300 [diningcity] DEBUG: Crawled (200) <GET http://www.diningcity.com/netherlands/index.html> (referer: None)
|
||||
2011-04-06 14:49:50-0300 [scrapy] DEBUG: Crawled (200) <GET http://www.diningcity.com/netherlands/index.html> (referer: None)
|
||||
[...]
|
||||
|
||||
|
||||
|
@@ -102,9 +102,12 @@ number of items will be specified through the ``MYEXT_ITEMCOUNT`` setting.

 Here is the code of such extension::

+    import logging
     from scrapy import signals
     from scrapy.exceptions import NotConfigured

+    logger = logging.getLogger(__name__)
+
     class SpiderOpenCloseLogging(object):

         def __init__(self, item_count):
@@ -133,15 +136,15 @@ Here is the code of such extension::
             return ext

         def spider_opened(self, spider):
-            spider.log("opened spider %s" % spider.name)
+            logger.info("opened spider %s", spider.name)

         def spider_closed(self, spider):
-            spider.log("closed spider %s" % spider.name)
+            logger.info("closed spider %s", spider.name)

         def item_scraped(self, item, spider):
            self.items_scraped += 1
            if self.items_scraped % self.item_count == 0:
-                spider.log("scraped %d items" % self.items_scraped)
+                logger.info("scraped %d items", self.items_scraped)


 .. _topics-extensions-ref:
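For context, an extension like this one is normally enabled and tuned from the project settings. A minimal sketch, assuming the class lives in a hypothetical ``myproject.extensions`` module (the module path and values are illustrative, not part of this diff)::

    # settings.py -- illustrative sketch only
    EXTENSIONS = {
        'myproject.extensions.SpiderOpenCloseLogging': 500,  # hypothetical path
    }
    MYEXT_ENABLED = True      # the sample extension checks this and bails out (NotConfigured) when off
    MYEXT_ITEMCOUNT = 1000    # log a line every 1000 scraped items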
@@ -4,126 +4,232 @@
 Logging
 =======

-Scrapy provides a logging facility which can be used through the
-:mod:`scrapy.log` module. The current underlying implementation uses `Twisted
-logging`_ but this may change in the future.
+.. note::
+    :mod:`scrapy.log` has been deprecated alongside its functions in favor of
+    explicit calls to Python's standard logging. Keep reading to learn more
+    about the new logging system.

-.. _Twisted logging: http://twistedmatrix.com/documents/current/core/howto/logging.html
+Scrapy uses `Python's builtin logging system
+<https://docs.python.org/2/library/logging.html>`_ for event logging. We'll
+provide some simple examples to get you started, but for more advanced
+use cases it's strongly suggested that you read its documentation thoroughly.

-The logging service must be explicitly started through the
-:func:`scrapy.log.start` function to catch the top level Scrapy's log messages.
-On top of that, each crawler has its own independent log observer
-(automatically attached when it's created) that intercepts its spider's log
-messages.
+Logging works out of the box, and can be configured to some extent with the
+Scrapy settings listed in :ref:`topics-logging-settings`.
+
+Scrapy calls :func:`scrapy.utils.log.configure_logging` to set some reasonable
+defaults and handle those settings in :ref:`topics-logging-settings` when
+running commands, so it's recommended to call it manually if you're running
+Scrapy from scripts as described in :ref:`run-from-script`.

 .. _topics-logging-levels:

 Log levels
 ==========

-Scrapy provides 5 logging levels:
+Python's builtin logging defines 5 different levels to indicate the severity of
+a given log message. Here are the standard ones, listed in decreasing order:

-1. :data:`~scrapy.log.CRITICAL` - for critical errors
-2. :data:`~scrapy.log.ERROR` - for regular errors
-3. :data:`~scrapy.log.WARNING` - for warning messages
-4. :data:`~scrapy.log.INFO` - for informational messages
-5. :data:`~scrapy.log.DEBUG` - for debugging messages
-
-How to set the log level
-========================
-
-You can set the log level using the `--loglevel/-L` command line option, or
-using the :setting:`LOG_LEVEL` setting.
+1. ``logging.CRITICAL`` - for critical errors (highest severity)
+2. ``logging.ERROR`` - for regular errors
+3. ``logging.WARNING`` - for warning messages
+4. ``logging.INFO`` - for informational messages
+5. ``logging.DEBUG`` - for debugging messages (lowest severity)

 How to log messages
 ===================

-Here's a quick example of how to log a message using the ``WARNING`` level::
+Here's a quick example of how to log a message using the ``logging.WARNING``
+level::

-    from scrapy import log
-    log.msg("This is a warning", level=log.WARNING)
+    import logging
+    logging.warning("This is a warning")
+
+There are shortcuts for issuing log messages on any of the standard 5 levels,
+and there's also a general ``logging.log`` method which takes a given level as
+an argument. If needed, the last example could be rewritten as::
+
+    import logging
+    logging.log(logging.WARNING, "This is a warning")
+
+On top of that, you can create different "loggers" to encapsulate messages (for
+example, a common practice is to create a different logger for every module).
+These loggers can be configured independently, and they allow hierarchical
+constructions.
+
+The previous examples use the root logger behind the scenes, which is a top level
+logger that all messages are propagated to (unless otherwise specified). Using
+the ``logging`` helpers is merely a shortcut for getting the root logger
+explicitly, so this is also equivalent to the previous snippets::
+
+    import logging
+    logger = logging.getLogger()
+    logger.warning("This is a warning")
+
+You can use a different logger just by getting its name with the
+``logging.getLogger`` function::
+
+    import logging
+    logger = logging.getLogger('mycustomlogger')
+    logger.warning("This is a warning")
+
+Finally, you can ensure you have a custom logger for any module you're working
+on by using the ``__name__`` variable, which is populated with the current
+module's path::
+
+    import logging
+    logger = logging.getLogger(__name__)
+    logger.warning("This is a warning")
+
+.. seealso::
+
+    Module logging, `HowTo <https://docs.python.org/2/howto/logging.html>`_
+        Basic Logging Tutorial
+
+    Module logging, `Loggers <https://docs.python.org/2/library/logging.html#logger-objects>`_
+        Further documentation on loggers

 .. _topics-logging-from-spiders:

 Logging from Spiders
 ====================

-The recommended way to log from spiders is by using the Spider
-:meth:`~scrapy.spider.Spider.log` method, which already populates the
-``spider`` argument of the :func:`scrapy.log.msg` function. The other arguments
-are passed directly to the :func:`~scrapy.log.msg` function.
+Scrapy provides a :data:`~scrapy.spider.Spider.logger` within each Spider
+instance, which can be accessed and used like this::

-scrapy.log module
-=================
+    import scrapy

-.. module:: scrapy.log
-   :synopsis: Logging facility
+    class MySpider(scrapy.Spider):

-.. function:: start(logfile=None, loglevel=None, logstdout=None)
+        name = 'myspider'
+        start_urls = ['http://scrapinghub.com']

-    Start the top level Scrapy logger. This must be called before actually
-    logging any top level messages (those logged using this module's
-    :func:`~scrapy.log.msg` function instead of the :meth:`Spider.log
-    <scrapy.spider.Spider.log>` method). Otherwise, messages logged before this
-    call will get lost.
+        def parse(self, response):
+            self.logger.info('Parse function called on %s', response.url)

-    :param logfile: the file path to use for logging output. If omitted, the
-        :setting:`LOG_FILE` setting will be used. If both are ``None``, the log
-        will be sent to standard error.
-    :type logfile: str
+That logger is created using the Spider's name, but you can use any custom
+Python logger you want. For example::

-    :param loglevel: the minimum logging level to log. Available values are:
-        :data:`CRITICAL`, :data:`ERROR`, :data:`WARNING`, :data:`INFO` and
-        :data:`DEBUG`.
+    import logging
+    import scrapy

-    :param logstdout: if ``True``, all standard output (and error) of your
-        application will be logged instead. For example if you "print 'hello'"
-        it will appear in the Scrapy log. If omitted, the :setting:`LOG_STDOUT`
-        setting will be used.
-    :type logstdout: boolean
+    logger = logging.getLogger('mycustomlogger')

-.. function:: msg(message, level=INFO, spider=None)
+    class MySpider(scrapy.Spider):

-    Log a message
+        name = 'myspider'
+        start_urls = ['http://scrapinghub.com']

-    :param message: the message to log
-    :type message: str
+        def parse(self, response):
+            logger.info('Parse function called on %s', response.url)

-    :param level: the log level for this message. See
-        :ref:`topics-logging-levels`.
+.. _topics-logging-configuration:

-    :param spider: the spider to use for logging this message. This parameter
-        should always be used when logging things related to a particular
-        spider.
-    :type spider: :class:`~scrapy.spider.Spider` object
+Logging configuration
+=====================

-.. data:: CRITICAL
+Loggers on their own don't manage how messages sent through them are displayed.
+For this task, different "handlers" can be attached to any logger instance and
+they will redirect those messages to appropriate destinations, such as the
+standard output, files, emails, etc.

-    Log level for critical errors
+By default, Scrapy sets and configures a handler for the root logger, based on
+the settings below.

-.. data:: ERROR
-
-    Log level for errors
-
-.. data:: WARNING
-
-    Log level for warnings
-
-.. data:: INFO
-
-    Log level for informational messages (recommended level for production
-    deployments)
-
-.. data:: DEBUG
-
-    Log level for debugging messages (recommended level for development)
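Before moving on to those settings: since Scrapy only sets up a handler on the root logger for you, attaching an extra handler of your own is plain standard-library work. A minimal sketch (the file name, level and logger name are arbitrary choices for illustration, not Scrapy defaults)::

    import logging

    # Write everything from the "scrapy" logger hierarchy to its own file,
    # independently of whatever handler Scrapy attached to the root logger.
    handler = logging.FileHandler('scrapy-events.log')
    handler.setLevel(logging.INFO)
    handler.setFormatter(logging.Formatter(
        '%(asctime)s [%(name)s] %(levelname)s: %(message)s'))
    logging.getLogger('scrapy').addHandler(handler)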
 .. _topics-logging-settings:

 Logging settings
-================
+----------------

 These settings can be used to configure the logging:

+* :setting:`LOG_FILE`
 * :setting:`LOG_ENABLED`
 * :setting:`LOG_ENCODING`
-* :setting:`LOG_FILE`
 * :setting:`LOG_LEVEL`
+* :setting:`LOG_FORMAT`
+* :setting:`LOG_DATEFORMAT`
 * :setting:`LOG_STDOUT`
+
+The first couple of settings define a destination for log messages. If
+:setting:`LOG_FILE` is set, messages sent through the root logger will be
+redirected to a file named :setting:`LOG_FILE` with encoding
+:setting:`LOG_ENCODING`. If it is unset and :setting:`LOG_ENABLED` is ``True``,
+log messages will be displayed on the standard error. Lastly, if
+:setting:`LOG_ENABLED` is ``False``, there won't be any visible log output.
+
+:setting:`LOG_LEVEL` determines the minimum level of severity to display; those
+messages with lower severity will be filtered out. It ranges through the
+possible levels listed in :ref:`topics-logging-levels`.
+
+:setting:`LOG_FORMAT` and :setting:`LOG_DATEFORMAT` specify formatting strings
+used as layouts for all messages. Those strings can contain any placeholders
+listed in `logging's logrecord attributes docs
+<https://docs.python.org/2/library/logging.html#logrecord-attributes>`_ and
+`datetime's strftime and strptime directives
+<https://docs.python.org/2/library/datetime.html#strftime-and-strptime-behavior>`_
+respectively.
+Command-line options
+--------------------
+
+There are command-line arguments, available for all commands, that you can use
+to override some of the Scrapy settings regarding logging.
+
+* ``--logfile FILE``
+    Overrides :setting:`LOG_FILE`
+* ``--loglevel/-L LEVEL``
+    Overrides :setting:`LOG_LEVEL`
+* ``--nolog``
+    Sets :setting:`LOG_ENABLED` to ``False``
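For instance, combining these flags in a single run (the spider name here is a placeholder)::

    scrapy crawl myspider --logfile=run.log -L WARNING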
+.. seealso::
+
+    Module `logging.handlers <https://docs.python.org/2/library/logging.handlers.html>`_
+        Further documentation on available handlers
+
+scrapy.utils.log module
+=======================
+
+.. module:: scrapy.utils.log
+   :synopsis: Logging utils
+
+.. function:: configure_logging(settings=None)
+
+    This function initializes logging defaults for Scrapy.
+
+    It's automatically called when using Scrapy commands, but needs to be
+    called explicitly when running custom scripts. In that case, its usage is
+    not strictly required, but it's recommended.
+
+    This function does:
+
+    - Route warnings and Twisted logging through Python standard logging
+    - Set a filter on the Scrapy logger for formatting Twisted failures
+    - Assign DEBUG and ERROR levels to the Scrapy and Twisted loggers
+      respectively
+
+    If `settings` is not ``None``, it will also create a root handler based on
+    the settings listed in :ref:`topics-logging-settings`.
+
+    If you plan on configuring the handlers yourself, it is still recommended
+    that you call this function, keeping `settings` as ``None``. Bear in mind
+    that there won't be any log output set by default in that case.
+
+    To get you started on manually configuring logging's output, you can use
+    `logging.basicConfig()`_ to set a basic root handler. This is an example of
+    how to redirect ``INFO`` or higher messages to a file::
+
+        import logging
+        from scrapy.utils.log import configure_logging
+
+        configure_logging()  # Note we aren't providing settings in this case
+        logging.basicConfig(filename='log.txt', format='%(levelname)s: %(message)s', level=logging.INFO)
+
+    Refer to :ref:`run-from-script` for more details about using Scrapy this
+    way.
+
+    :param settings: settings used to create and configure a handler for the
+        root logger.
+    :type settings: :class:`~scrapy.settings.Settings` object or ``None``
+
+.. _logging.basicConfig(): https://docs.python.org/2/library/logging.html#logging.basicConfig
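As a complement to the ``basicConfig`` example, here is a sketch of feeding an explicit :class:`~scrapy.settings.Settings` object to ``configure_logging`` from a standalone script; the setting values are arbitrary, and building ``Settings`` from a plain dict is an assumption of this sketch rather than something shown in the diff::

    from scrapy.settings import Settings
    from scrapy.utils.log import configure_logging

    settings = Settings({'LOG_LEVEL': 'INFO', 'LOG_FILE': 'standalone.log'})
    configure_logging(settings)  # creates a root handler based on these settings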
@@ -30,9 +30,12 @@ project as example.

     from twisted.internet import reactor
     from scrapy.crawler import CrawlerRunner
+    from scrapy.utils.log import configure_logging
     from scrapy.utils.project import get_project_settings

-    runner = CrawlerRunner(get_project_settings())
+    settings = get_project_settings()
+    configure_logging(settings)
+    runner = CrawlerRunner(settings)

     # 'followall' is the name of one of the spiders of the project.
     d = runner.crawl('followall', domain='scrapinghub.com')
|
||||
from twisted.internet import reactor
|
||||
import scrapy
|
||||
from scrapy.crawler import CrawlerRunner
|
||||
from scrapy.utils.log import configure_logging
|
||||
|
||||
class MySpider(scrapy.Spider):
|
||||
# Your spider definition
|
||||
...
|
||||
|
||||
configure_logging(settings)
|
||||
runner = CrawlerRunner({
|
||||
'USER_AGENT': 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)'
|
||||
})
|
||||
@ -85,9 +90,12 @@ Here is an example that runs multiple spiders simultaneously, using the
|
||||
|
||||
from twisted.internet import reactor, defer
|
||||
from scrapy.crawler import CrawlerRunner
|
||||
from scrapy.utils.log import configure_logging
|
||||
from scrapy.utils.project import get_project_settings
|
||||
|
||||
runner = CrawlerRunner(get_project_settings())
|
||||
settings = get_project_settings()
|
||||
configure_logging(settings)
|
||||
runner = CrawlerRunner(settings)
|
||||
dfs = set()
|
||||
for domain in ['scrapinghub.com', 'insophia.com']:
|
||||
d = runner.crawl('followall', domain=domain)
|
||||
@ -102,9 +110,12 @@ Same example but running the spiders sequentially by chaining the deferreds:
|
||||
|
||||
from twisted.internet import reactor, defer
|
||||
from scrapy.crawler import CrawlerRunner
|
||||
from scrapy.utils.log import configure_logging
|
||||
from scrapy.utils.project import get_project_settings
|
||||
|
||||
runner = CrawlerRunner(get_project_settings())
|
||||
settings = get_project_settings()
|
||||
configure_logging(settings)
|
||||
runner = CrawlerRunner(settings)
|
||||
|
||||
@defer.inlineCallbacks
|
||||
def crawl():
|
||||
|
@ -189,7 +189,7 @@ Example::
|
||||
|
||||
def parse_page2(self, response):
|
||||
# this would log http://www.example.com/some_page.html
|
||||
self.log("Visited %s" % response.url)
|
||||
self.logger.info("Visited %s", response.url)
|
||||
|
||||
In some cases you may be interested in passing arguments to those callback
|
||||
functions so you can receive the arguments later, in the second callback. You
|
||||
@ -382,7 +382,7 @@ method for this job. Here's an example spider which uses it::
|
||||
def after_login(self, response):
|
||||
# check login succeed before going on
|
||||
if "authentication failed" in response.body:
|
||||
self.log("Login failed", level=log.ERROR)
|
||||
self.logger.error("Login failed")
|
||||
return
|
||||
|
||||
# continue scraping with authenticated session...
|
||||
|
@ -612,6 +612,31 @@ Default: ``None``
|
||||
|
||||
File name to use for logging output. If None, standard error will be used.
|
||||
|
||||
.. setting:: LOG_FORMAT
|
||||
|
||||
LOG_FORMAT
|
||||
----------
|
||||
|
||||
Default: ``'%(asctime)s [%(name)s] %(levelname)s: %(message)s'``
|
||||
|
||||
String for formatting log messsages. Refer to the `Python logging documentation`_ for the whole list of available
|
||||
placeholders.
|
||||
|
||||
.. _Python logging documentation: https://docs.python.org/2/library/logging.html#logrecord-attributes
|
||||
|
||||
.. setting:: LOG_DATEFORMAT
|
||||
|
||||
LOG_DATEFORMAT
|
||||
--------------
|
||||
|
||||
Default: ``'%Y-%m-%d %H:%M:%S%z'``
|
||||
|
||||
String for formatting date/time, expansion of the ``%(asctime)s`` placeholder
|
||||
in :setting:`LOG_FORMAT`. Refer to the `Python datetime documentation`_ for the whole list of available
|
||||
directives.
|
||||
|
||||
.. _Python datetime documentation: https://docs.python.org/2/library/datetime.html#strftime-and-strptime-behavior
|
||||
|
||||
.. setting:: LOG_LEVEL
|
||||
|
||||
LOG_LEVEL
|
||||
|
@ -187,8 +187,8 @@ Here's an example of how you would call it from your spider::
|
||||
|
||||
When you run the spider, you will get something similar to this::
|
||||
|
||||
2014-01-23 17:48:31-0400 [myspider] DEBUG: Crawled (200) <GET http://example.com> (referer: None)
|
||||
2014-01-23 17:48:31-0400 [myspider] DEBUG: Crawled (200) <GET http://example.org> (referer: None)
|
||||
2014-01-23 17:48:31-0400 [scrapy] DEBUG: Crawled (200) <GET http://example.com> (referer: None)
|
||||
2014-01-23 17:48:31-0400 [scrapy] DEBUG: Crawled (200) <GET http://example.org> (referer: None)
|
||||
[s] Available Scrapy objects:
|
||||
[s] crawler <scrapy.crawler.Crawler object at 0x1e16b50>
|
||||
...
|
||||
@ -211,7 +211,7 @@ Finally you hit Ctrl-D (or Ctrl-Z in Windows) to exit the shell and resume the
|
||||
crawling::
|
||||
|
||||
>>> ^D
|
||||
2014-01-23 17:50:03-0400 [myspider] DEBUG: Crawled (200) <GET http://example.net> (referer: None)
|
||||
2014-01-23 17:50:03-0400 [scrapy] DEBUG: Crawled (200) <GET http://example.net> (referer: None)
|
||||
...
|
||||
|
||||
Note that you can't use the ``fetch`` shortcut here since the Scrapy engine is
|
||||
|
@ -112,6 +112,12 @@ scrapy.Spider
|
||||
:class:`~scrapy.settings.Settings` instance, see the
|
||||
:ref:`topics-settings` topic for a detailed introduction on this subject.
|
||||
|
||||
.. attribute:: logger
|
||||
|
||||
Python logger created with the Spider's :attr:`name`. You can use it to
|
||||
send log messages through it as described on
|
||||
:ref:`topics-logging-from-spiders`.
|
||||
|
||||
.. method:: from_crawler(crawler, \*args, \**kwargs)
|
||||
|
||||
This is the class method used by Scrapy to create your spiders.
|
||||
@ -194,9 +200,9 @@ scrapy.Spider
|
||||
|
||||
.. method:: log(message, [level, component])
|
||||
|
||||
Log a message using the :func:`scrapy.log.msg` function, automatically
|
||||
populating the spider argument with the :attr:`name` of this
|
||||
spider. For more information see :ref:`topics-logging`.
|
||||
Wrapper that sends a log message through the Spider's :attr:`logger`,
|
||||
kept for backwards compatibility. For more information see
|
||||
:ref:`topics-logging-from-spiders`.
|
||||
|
||||
.. method:: closed(reason)
|
||||
|
||||
@ -218,7 +224,7 @@ Let's see an example::
|
||||
]
|
||||
|
||||
def parse(self, response):
|
||||
self.log('A response from %s just arrived!' % response.url)
|
||||
self.logger.info('A response from %s just arrived!', response.url)
|
||||
|
||||
Return multiple Requests and items from a single callback::
|
||||
|
||||
@ -406,7 +412,7 @@ Let's now take a look at an example CrawlSpider with rules::
|
||||
)
|
||||
|
||||
def parse_item(self, response):
|
||||
self.log('Hi, this is an item page! %s' % response.url)
|
||||
self.logger.info('Hi, this is an item page! %s', response.url)
|
||||
item = scrapy.Item()
|
||||
item['id'] = response.xpath('//td[@id="item_id"]/text()').re(r'ID: (\d+)')
|
||||
item['name'] = response.xpath('//td[@id="item_name"]/text()').extract()
|
||||
@ -509,7 +515,6 @@ XMLFeedSpider example
|
||||
|
||||
These spiders are pretty easy to use, let's have a look at one example::
|
||||
|
||||
from scrapy import log
|
||||
from scrapy.contrib.spiders import XMLFeedSpider
|
||||
from myproject.items import TestItem
|
||||
|
||||
@ -521,7 +526,7 @@ These spiders are pretty easy to use, let's have a look at one example::
|
||||
itertag = 'item'
|
||||
|
||||
def parse_node(self, response, node):
|
||||
log.msg('Hi, this is a <%s> node!: %s' % (self.itertag, ''.join(node.extract())))
|
||||
self.logger.info('Hi, this is a <%s> node!: %s', self.itertag, ''.join(node.extract()))
|
||||
|
||||
item = TestItem()
|
||||
item['id'] = node.xpath('@id').extract()
|
||||
@ -570,7 +575,6 @@ CSVFeedSpider example
|
||||
Let's see an example similar to the previous one, but using a
|
||||
:class:`CSVFeedSpider`::
|
||||
|
||||
from scrapy import log
|
||||
from scrapy.contrib.spiders import CSVFeedSpider
|
||||
from myproject.items import TestItem
|
||||
|
||||
@ -583,7 +587,7 @@ Let's see an example similar to the previous one, but using a
|
||||
headers = ['id', 'name', 'description']
|
||||
|
||||
def parse_row(self, response, row):
|
||||
log.msg('Hi, this is a row!: %r' % row)
|
||||
self.logger.info('Hi, this is a row!: %r', row)
|
||||
|
||||
item = TestItem()
|
||||
item['id'] = row['id']
|
||||
|
@@ -1,5 +1,5 @@
 [pytest]
-usefixtures = chdir setlog
+usefixtures = chdir
 python_files=test_*.py __init__.py
 addopts = --doctest-modules --assert=plain
 twisted = 1
@ -1,5 +1,8 @@
|
||||
from __future__ import print_function
|
||||
import logging
|
||||
|
||||
from w3lib.url import is_url
|
||||
|
||||
from scrapy.command import ScrapyCommand
|
||||
from scrapy.http import Request
|
||||
from scrapy.item import BaseItem
|
||||
@ -7,7 +10,9 @@ from scrapy.utils import display
|
||||
from scrapy.utils.conf import arglist_to_dict
|
||||
from scrapy.utils.spider import iterate_spider_output, spidercls_for_request
|
||||
from scrapy.exceptions import UsageError
|
||||
from scrapy import log
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class Command(ScrapyCommand):
|
||||
|
||||
@ -119,9 +124,9 @@ class Command(ScrapyCommand):
|
||||
if rule.link_extractor.matches(response.url) and rule.callback:
|
||||
return rule.callback
|
||||
else:
|
||||
log.msg(format='No CrawlSpider rules found in spider %(spider)r, '
|
||||
'please specify a callback to use for parsing',
|
||||
level=log.ERROR, spider=spider.name)
|
||||
logger.error('No CrawlSpider rules found in spider %(spider)r, '
|
||||
'please specify a callback to use for parsing',
|
||||
{'spider': spider.name})
|
||||
|
||||
def set_spidercls(self, url, opts):
|
||||
spider_loader = self.crawler_process.spider_loader
|
||||
@ -129,13 +134,13 @@ class Command(ScrapyCommand):
|
||||
try:
|
||||
self.spidercls = spider_loader.load(opts.spider)
|
||||
except KeyError:
|
||||
log.msg(format='Unable to find spider: %(spider)s',
|
||||
level=log.ERROR, spider=opts.spider)
|
||||
logger.error('Unable to find spider: %(spider)s',
|
||||
{'spider': opts.spider})
|
||||
else:
|
||||
self.spidercls = spidercls_for_request(spider_loader, Request(url))
|
||||
if not self.spidercls:
|
||||
log.msg(format='Unable to find spider for: %(url)s',
|
||||
level=log.ERROR, url=url)
|
||||
logger.error('Unable to find spider for: %(url)s',
|
||||
{'url': url})
|
||||
|
||||
request = Request(url, opts.callback)
|
||||
_start_requests = lambda s: [self.prepare_request(s, request, opts)]
|
||||
@ -148,8 +153,8 @@ class Command(ScrapyCommand):
|
||||
self.crawler_process.start()
|
||||
|
||||
if not self.first_response:
|
||||
log.msg(format='No response downloaded for: %(url)s',
|
||||
level=log.ERROR, url=url)
|
||||
logger.error('No response downloaded for: %(url)s',
|
||||
{'url': url})
|
||||
|
||||
def prepare_request(self, spider, request, opts):
|
||||
def callback(response):
|
||||
@ -170,8 +175,8 @@ class Command(ScrapyCommand):
|
||||
if callable(cb_method):
|
||||
cb = cb_method
|
||||
else:
|
||||
log.msg(format='Cannot find callback %(callback)r in spider: %(spider)s',
|
||||
callback=callback, spider=spider.name, level=log.ERROR)
|
||||
logger.error('Cannot find callback %(callback)r in spider: %(spider)s',
|
||||
{'callback': callback, 'spider': spider.name})
|
||||
return
|
||||
|
||||
# parse items and requests
|
||||
|
@ -9,7 +9,6 @@ from threading import Thread
|
||||
from scrapy.command import ScrapyCommand
|
||||
from scrapy.shell import Shell
|
||||
from scrapy.http import Request
|
||||
from scrapy import log
|
||||
from scrapy.utils.spider import spidercls_for_request, DefaultSpider
|
||||
|
||||
|
||||
@ -54,7 +53,6 @@ class Command(ScrapyCommand):
|
||||
# The crawler is created this way since the Shell manually handles the
|
||||
# crawling engine, so the set up in the crawl method won't work
|
||||
crawler = self.crawler_process._create_crawler(spidercls)
|
||||
self.crawler_process._setup_crawler_logging(crawler)
|
||||
# The Shell class needs a persistent engine in the crawler
|
||||
crawler.engine = crawler._create_engine()
|
||||
crawler.engine.start()
|
||||
|
@ -6,13 +6,15 @@ See documentation in docs/topics/extensions.rst
|
||||
|
||||
import sys
|
||||
import signal
|
||||
import logging
|
||||
import traceback
|
||||
import threading
|
||||
from pdb import Pdb
|
||||
|
||||
from scrapy.utils.engine import format_engine_status
|
||||
from scrapy.utils.trackref import format_live_refs
|
||||
from scrapy import log
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class StackTraceDump(object):
|
||||
@ -31,12 +33,14 @@ class StackTraceDump(object):
|
||||
return cls(crawler)
|
||||
|
||||
def dump_stacktrace(self, signum, frame):
|
||||
stackdumps = self._thread_stacks()
|
||||
enginestatus = format_engine_status(self.crawler.engine)
|
||||
liverefs = format_live_refs()
|
||||
msg = "Dumping stack trace and engine status" \
|
||||
"\n{0}\n{1}\n{2}".format(enginestatus, liverefs, stackdumps)
|
||||
log.msg(msg)
|
||||
log_args = {
|
||||
'stackdumps': self._thread_stacks(),
|
||||
'enginestatus': format_engine_status(self.crawler.engine),
|
||||
'liverefs': format_live_refs(),
|
||||
}
|
||||
logger.info("Dumping stack trace and engine status\n"
|
||||
"%(enginestatus)s\n%(liverefs)s\n%(stackdumps)s",
|
||||
log_args, extra={'crawler': self.crawler})
|
||||
|
||||
def _thread_stacks(self):
|
||||
id2name = dict((th.ident, th.name) for th in threading.enumerate())
|
||||
|
@ -1,14 +1,19 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
from __future__ import absolute_import
|
||||
import re
|
||||
import logging
|
||||
|
||||
import six
|
||||
from w3lib import html
|
||||
from scrapy import log
|
||||
|
||||
from scrapy.exceptions import NotConfigured
|
||||
from scrapy.http import HtmlResponse
|
||||
from scrapy.utils.response import _noscript_re, _script_re
|
||||
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class AjaxCrawlMiddleware(object):
|
||||
"""
|
||||
Handle 'AJAX crawlable' pages marked as crawlable via meta tag.
|
||||
@ -46,9 +51,9 @@ class AjaxCrawlMiddleware(object):
|
||||
|
||||
# scrapy already handles #! links properly
|
||||
ajax_crawl_request = request.replace(url=request.url+'#!')
|
||||
log.msg(format="Downloading AJAX crawlable %(ajax_crawl_request)s instead of %(request)s",
|
||||
level=log.DEBUG, spider=spider,
|
||||
ajax_crawl_request=ajax_crawl_request, request=request)
|
||||
logger.debug("Downloading AJAX crawlable %(ajax_crawl_request)s instead of %(request)s",
|
||||
{'ajax_crawl_request': ajax_crawl_request, 'request': request},
|
||||
extra={'spider': spider})
|
||||
|
||||
ajax_crawl_request.meta['ajax_crawlable'] = True
|
||||
return ajax_crawl_request
|
||||
|
@ -1,11 +1,13 @@
|
||||
import os
|
||||
import six
|
||||
import logging
|
||||
from collections import defaultdict
|
||||
|
||||
from scrapy.exceptions import NotConfigured
|
||||
from scrapy.http import Response
|
||||
from scrapy.http.cookies import CookieJar
|
||||
from scrapy import log
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class CookiesMiddleware(object):
|
||||
@ -54,7 +56,7 @@ class CookiesMiddleware(object):
|
||||
if cl:
|
||||
msg = "Sending cookies to: %s" % request + os.linesep
|
||||
msg += os.linesep.join("Cookie: %s" % c for c in cl)
|
||||
log.msg(msg, spider=spider, level=log.DEBUG)
|
||||
logger.debug(msg, extra={'spider': spider})
|
||||
|
||||
def _debug_set_cookie(self, response, spider):
|
||||
if self.debug:
|
||||
@ -62,7 +64,7 @@ class CookiesMiddleware(object):
|
||||
if cl:
|
||||
msg = "Received cookies from: %s" % response + os.linesep
|
||||
msg += os.linesep.join("Set-Cookie: %s" % c for c in cl)
|
||||
log.msg(msg, spider=spider, level=log.DEBUG)
|
||||
logger.debug(msg, extra={'spider': spider})
|
||||
|
||||
def _format_cookie(self, cookie):
|
||||
# build cookie string
|
||||
|
@ -1,11 +1,12 @@
|
||||
""" This module implements the DecompressionMiddleware which tries to recognise
|
||||
and extract the potentially compressed responses that may arrive.
|
||||
and extract the potentially compressed responses that may arrive.
|
||||
"""
|
||||
|
||||
import bz2
|
||||
import gzip
|
||||
import zipfile
|
||||
import tarfile
|
||||
import logging
|
||||
from tempfile import mktemp
|
||||
|
||||
import six
|
||||
@ -15,9 +16,10 @@ try:
|
||||
except ImportError:
|
||||
from io import BytesIO
|
||||
|
||||
from scrapy import log
|
||||
from scrapy.responsetypes import responsetypes
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class DecompressionMiddleware(object):
|
||||
""" This middleware tries to recognise and extract the possibly compressed
|
||||
@ -80,7 +82,7 @@ class DecompressionMiddleware(object):
|
||||
for fmt, func in six.iteritems(self._formats):
|
||||
new_response = func(response)
|
||||
if new_response:
|
||||
log.msg(format='Decompressed response with format: %(responsefmt)s',
|
||||
level=log.DEBUG, spider=spider, responsefmt=fmt)
|
||||
logger.debug('Decompressed response with format: %(responsefmt)s',
|
||||
{'responsefmt': fmt}, extra={'spider': spider})
|
||||
return new_response
|
||||
return response
|
||||
|
@ -1,10 +1,12 @@
|
||||
import logging
|
||||
from six.moves.urllib.parse import urljoin
|
||||
|
||||
from scrapy import log
|
||||
from scrapy.http import HtmlResponse
|
||||
from scrapy.utils.response import get_meta_refresh
|
||||
from scrapy.exceptions import IgnoreRequest, NotConfigured
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class BaseRedirectMiddleware(object):
|
||||
|
||||
@ -32,13 +34,13 @@ class BaseRedirectMiddleware(object):
|
||||
[request.url]
|
||||
redirected.dont_filter = request.dont_filter
|
||||
redirected.priority = request.priority + self.priority_adjust
|
||||
log.msg(format="Redirecting (%(reason)s) to %(redirected)s from %(request)s",
|
||||
level=log.DEBUG, spider=spider, request=request,
|
||||
redirected=redirected, reason=reason)
|
||||
logger.debug("Redirecting (%(reason)s) to %(redirected)s from %(request)s",
|
||||
{'reason': reason, 'redirected': redirected, 'request': request},
|
||||
extra={'spider': spider})
|
||||
return redirected
|
||||
else:
|
||||
log.msg(format="Discarding %(request)s: max redirections reached",
|
||||
level=log.DEBUG, spider=spider, request=request)
|
||||
logger.debug("Discarding %(request)s: max redirections reached",
|
||||
{'request': request}, extra={'spider': spider})
|
||||
raise IgnoreRequest("max redirections reached")
|
||||
|
||||
def _redirect_request_using_get(self, request, redirect_url):
|
||||
|
@ -17,17 +17,19 @@ About HTTP errors to consider:
|
||||
protocol. It's included by default because it's a common code used to
|
||||
indicate server overload, which would be something we want to retry
|
||||
"""
|
||||
import logging
|
||||
|
||||
from twisted.internet import defer
|
||||
from twisted.internet.error import TimeoutError, DNSLookupError, \
|
||||
ConnectionRefusedError, ConnectionDone, ConnectError, \
|
||||
ConnectionLost, TCPTimedOutError
|
||||
|
||||
from scrapy import log
|
||||
from scrapy.exceptions import NotConfigured
|
||||
from scrapy.utils.response import response_status_message
|
||||
from scrapy.xlib.tx import ResponseFailed
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class RetryMiddleware(object):
|
||||
|
||||
@ -66,13 +68,15 @@ class RetryMiddleware(object):
|
||||
retries = request.meta.get('retry_times', 0) + 1
|
||||
|
||||
if retries <= self.max_retry_times:
|
||||
log.msg(format="Retrying %(request)s (failed %(retries)d times): %(reason)s",
|
||||
level=log.DEBUG, spider=spider, request=request, retries=retries, reason=reason)
|
||||
logger.debug("Retrying %(request)s (failed %(retries)d times): %(reason)s",
|
||||
{'request': request, 'retries': retries, 'reason': reason},
|
||||
extra={'spider': spider})
|
||||
retryreq = request.copy()
|
||||
retryreq.meta['retry_times'] = retries
|
||||
retryreq.dont_filter = True
|
||||
retryreq.priority = request.priority + self.priority_adjust
|
||||
return retryreq
|
||||
else:
|
||||
log.msg(format="Gave up retrying %(request)s (failed %(retries)d times): %(reason)s",
|
||||
level=log.DEBUG, spider=spider, request=request, retries=retries, reason=reason)
|
||||
logger.debug("Gave up retrying %(request)s (failed %(retries)d times): %(reason)s",
|
||||
{'request': request, 'retries': retries, 'reason': reason},
|
||||
extra={'spider': spider})
|
||||
|
@ -4,13 +4,16 @@ enable this middleware and enable the ROBOTSTXT_OBEY setting.
|
||||
|
||||
"""
|
||||
|
||||
import logging
|
||||
|
||||
from six.moves.urllib import robotparser
|
||||
|
||||
from scrapy import signals, log
|
||||
from scrapy.exceptions import NotConfigured, IgnoreRequest
|
||||
from scrapy.http import Request
|
||||
from scrapy.utils.httpobj import urlparse_cached
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class RobotsTxtMiddleware(object):
|
||||
DOWNLOAD_PRIORITY = 1000
|
||||
@ -32,8 +35,8 @@ class RobotsTxtMiddleware(object):
|
||||
return
|
||||
rp = self.robot_parser(request, spider)
|
||||
if rp and not rp.can_fetch(self._useragent, request.url):
|
||||
log.msg(format="Forbidden by robots.txt: %(request)s",
|
||||
level=log.DEBUG, request=request)
|
||||
logger.debug("Forbidden by robots.txt: %(request)s",
|
||||
{'request': request}, extra={'spider': spider})
|
||||
raise IgnoreRequest
|
||||
|
||||
def robot_parser(self, request, spider):
|
||||
@ -54,8 +57,9 @@ class RobotsTxtMiddleware(object):
|
||||
|
||||
def _logerror(self, failure, request, spider):
|
||||
if failure.type is not IgnoreRequest:
|
||||
log.msg(format="Error downloading %%(request)s: %s" % failure.value,
|
||||
level=log.ERROR, request=request, spider=spider)
|
||||
logger.error("Error downloading %(request)s: %(f_exception)s",
|
||||
{'request': request, 'f_exception': failure.value},
|
||||
extra={'spider': spider, 'failure': failure})
|
||||
|
||||
def _parse_robots(self, response):
|
||||
rp = robotparser.RobotFileParser(response.url)
|
||||
|
@ -4,7 +4,10 @@ Feed Exports extension
|
||||
See documentation in docs/topics/feed-exports.rst
|
||||
"""
|
||||
|
||||
import sys, os, posixpath
|
||||
import os
|
||||
import sys
|
||||
import logging
|
||||
import posixpath
|
||||
from tempfile import TemporaryFile
|
||||
from datetime import datetime
|
||||
from six.moves.urllib.parse import urlparse
|
||||
@ -14,12 +17,14 @@ from zope.interface import Interface, implementer
|
||||
from twisted.internet import defer, threads
|
||||
from w3lib.url import file_uri_to_path
|
||||
|
||||
from scrapy import log, signals
|
||||
from scrapy import signals
|
||||
from scrapy.utils.ftp import ftp_makedirs_cwd
|
||||
from scrapy.exceptions import NotConfigured
|
||||
from scrapy.utils.misc import load_object
|
||||
from scrapy.utils.python import get_func_args
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class IFeedStorage(Interface):
|
||||
"""Interface that all Feed Storages must implement"""
|
||||
@ -171,11 +176,15 @@ class FeedExporter(object):
|
||||
if not slot.itemcount and not self.store_empty:
|
||||
return
|
||||
slot.exporter.finish_exporting()
|
||||
logfmt = "%%s %s feed (%d items) in: %s" % (self.format, \
|
||||
slot.itemcount, slot.uri)
|
||||
logfmt = "%%s %(format)s feed (%(itemcount)d items) in: %(uri)s"
|
||||
log_args = {'format': self.format,
|
||||
'itemcount': slot.itemcount,
|
||||
'uri': slot.uri}
|
||||
d = defer.maybeDeferred(slot.storage.store, slot.file)
|
||||
d.addCallback(lambda _: log.msg(logfmt % "Stored", spider=spider))
|
||||
d.addErrback(log.err, logfmt % "Error storing", spider=spider)
|
||||
d.addCallback(lambda _: logger.info(logfmt % "Stored", log_args,
|
||||
extra={'spider': spider}))
|
||||
d.addErrback(lambda f: logger.error(logfmt % "Error storing", log_args,
|
||||
extra={'spider': spider, 'failure': f}))
|
||||
return d
|
||||
|
||||
def item_scraped(self, item, spider):
|
||||
@ -198,7 +207,7 @@ class FeedExporter(object):
|
||||
def _exporter_supported(self, format):
|
||||
if format in self.exporters:
|
||||
return True
|
||||
log.msg("Unknown feed format: %s" % format, log.ERROR)
|
||||
logger.error("Unknown feed format: %(format)s", {'format': format})
|
||||
|
||||
def _storage_supported(self, uri):
|
||||
scheme = urlparse(uri).scheme
|
||||
@ -207,9 +216,11 @@ class FeedExporter(object):
|
||||
self._get_storage(uri)
|
||||
return True
|
||||
except NotConfigured:
|
||||
log.msg("Disabled feed storage scheme: %s" % scheme, log.ERROR)
|
||||
logger.error("Disabled feed storage scheme: %(scheme)s",
|
||||
{'scheme': scheme})
|
||||
else:
|
||||
log.msg("Unknown feed storage scheme: %s" % scheme, log.ERROR)
|
||||
logger.error("Unknown feed storage scheme: %(scheme)s",
|
||||
{'scheme': scheme})
|
||||
|
||||
def _get_exporter(self, *args, **kwargs):
|
||||
return self.exporters[self.format](*args, **kwargs)
|
||||
|
@ -1,7 +1,11 @@
|
||||
import logging
|
||||
|
||||
from twisted.internet import task
|
||||
|
||||
from scrapy.exceptions import NotConfigured
|
||||
from scrapy import log, signals
|
||||
from scrapy import signals
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class LogStats(object):
|
||||
@ -35,9 +39,12 @@ class LogStats(object):
|
||||
irate = (items - self.itemsprev) * self.multiplier
|
||||
prate = (pages - self.pagesprev) * self.multiplier
|
||||
self.pagesprev, self.itemsprev = pages, items
|
||||
msg = "Crawled %d pages (at %d pages/min), scraped %d items (at %d items/min)" \
|
||||
% (pages, prate, items, irate)
|
||||
log.msg(msg, spider=spider)
|
||||
|
||||
msg = ("Crawled %(pages)d pages (at %(pagerate)d pages/min), "
|
||||
"scraped %(items)d items (at %(itemrate)d items/min)")
|
||||
log_args = {'pages': pages, 'pagerate': prate,
|
||||
'items': items, 'itemrate': irate}
|
||||
logger.info(msg, log_args, extra={'spider': spider})
|
||||
|
||||
def spider_closed(self, spider, reason):
|
||||
if self.task.running:
|
||||
|
@ -5,16 +5,20 @@ See documentation in docs/topics/extensions.rst
|
||||
"""
|
||||
import sys
|
||||
import socket
|
||||
import logging
|
||||
from pprint import pformat
|
||||
from importlib import import_module
|
||||
|
||||
from twisted.internet import task
|
||||
|
||||
from scrapy import signals, log
|
||||
from scrapy import signals
|
||||
from scrapy.exceptions import NotConfigured
|
||||
from scrapy.mail import MailSender
|
||||
from scrapy.utils.engine import get_engine_status
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class MemoryUsage(object):
|
||||
|
||||
def __init__(self, crawler):
|
||||
@ -74,8 +78,8 @@ class MemoryUsage(object):
|
||||
if self.get_virtual_size() > self.limit:
|
||||
self.crawler.stats.set_value('memusage/limit_reached', 1)
|
||||
mem = self.limit/1024/1024
|
||||
log.msg(format="Memory usage exceeded %(memusage)dM. Shutting down Scrapy...",
|
||||
level=log.ERROR, memusage=mem)
|
||||
logger.error("Memory usage exceeded %(memusage)dM. Shutting down Scrapy...",
|
||||
{'memusage': mem}, extra={'crawler': self.crawler})
|
||||
if self.notify_mails:
|
||||
subj = "%s terminated: memory usage exceeded %dM at %s" % \
|
||||
(self.crawler.settings['BOT_NAME'], mem, socket.gethostname())
|
||||
@ -95,8 +99,8 @@ class MemoryUsage(object):
|
||||
if self.get_virtual_size() > self.warning:
|
||||
self.crawler.stats.set_value('memusage/warning_reached', 1)
|
||||
mem = self.warning/1024/1024
|
||||
log.msg(format="Memory usage reached %(memusage)dM",
|
||||
level=log.WARNING, memusage=mem)
|
||||
logger.warning("Memory usage reached %(memusage)dM",
|
||||
{'memusage': mem}, extra={'crawler': self.crawler})
|
||||
if self.notify_mails:
|
||||
subj = "%s warning: memory usage reached %dM at %s" % \
|
||||
(self.crawler.settings['BOT_NAME'], mem, socket.gethostname())
|
||||
|
@ -9,6 +9,7 @@ import os
|
||||
import os.path
|
||||
import rfc822
|
||||
import time
|
||||
import logging
|
||||
from six.moves.urllib.parse import urlparse
|
||||
from collections import defaultdict
|
||||
import six
|
||||
@ -20,12 +21,13 @@ except ImportError:
|
||||
|
||||
from twisted.internet import defer, threads
|
||||
|
||||
from scrapy import log
|
||||
from scrapy.contrib.pipeline.media import MediaPipeline
|
||||
from scrapy.exceptions import NotConfigured, IgnoreRequest
|
||||
from scrapy.http import Request
|
||||
from scrapy.utils.misc import md5sum
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class FileException(Exception):
|
||||
"""General media error exception"""
|
||||
@ -192,9 +194,13 @@ class FilesPipeline(MediaPipeline):
|
||||
return # returning None force download
|
||||
|
||||
referer = request.headers.get('Referer')
|
||||
log.msg(format='File (uptodate): Downloaded %(medianame)s from %(request)s referred in <%(referer)s>',
|
||||
level=log.DEBUG, spider=info.spider,
|
||||
medianame=self.MEDIA_NAME, request=request, referer=referer)
|
||||
logger.debug(
|
||||
'File (uptodate): Downloaded %(medianame)s from %(request)s '
|
||||
'referred in <%(referer)s>',
|
||||
{'medianame': self.MEDIA_NAME, 'request': request,
|
||||
'referer': referer},
|
||||
extra={'spider': info.spider}
|
||||
)
|
||||
self.inc_stats(info.spider, 'uptodate')
|
||||
|
||||
checksum = result.get('checksum', None)
|
||||
@ -203,17 +209,23 @@ class FilesPipeline(MediaPipeline):
|
||||
path = self.file_path(request, info=info)
|
||||
dfd = defer.maybeDeferred(self.store.stat_file, path, info)
|
||||
dfd.addCallbacks(_onsuccess, lambda _: None)
|
||||
dfd.addErrback(log.err, self.__class__.__name__ + '.store.stat_file')
|
||||
dfd.addErrback(
|
||||
lambda f:
|
||||
logger.error(self.__class__.__name__ + '.store.stat_file',
|
||||
extra={'spider': info.spider, 'failure': f})
|
||||
)
|
||||
return dfd
|
||||
|
||||
def media_failed(self, failure, request, info):
|
||||
if not isinstance(failure.value, IgnoreRequest):
|
||||
referer = request.headers.get('Referer')
|
||||
log.msg(format='File (unknown-error): Error downloading '
|
||||
'%(medianame)s from %(request)s referred in '
|
||||
'<%(referer)s>: %(exception)s',
|
||||
level=log.WARNING, spider=info.spider, exception=failure.value,
|
||||
medianame=self.MEDIA_NAME, request=request, referer=referer)
|
||||
logger.warning(
|
||||
'File (unknown-error): Error downloading %(medianame)s from '
|
||||
'%(request)s referred in <%(referer)s>: %(exception)s',
|
||||
{'medianame': self.MEDIA_NAME, 'request': request,
|
||||
'referer': referer, 'exception': failure.value},
|
||||
extra={'spider': info.spider}
|
||||
)
|
||||
|
||||
raise FileException
|
||||
|
||||
@ -221,34 +233,51 @@ class FilesPipeline(MediaPipeline):
|
||||
referer = request.headers.get('Referer')
|
||||
|
||||
if response.status != 200:
|
||||
log.msg(format='File (code: %(status)s): Error downloading file from %(request)s referred in <%(referer)s>',
|
||||
level=log.WARNING, spider=info.spider,
|
||||
status=response.status, request=request, referer=referer)
|
||||
logger.warning(
|
||||
'File (code: %(status)s): Error downloading file from '
|
||||
'%(request)s referred in <%(referer)s>',
|
||||
{'status': response.status,
|
||||
'request': request, 'referer': referer},
|
||||
extra={'spider': info.spider}
|
||||
)
|
||||
raise FileException('download-error')
|
||||
|
||||
if not response.body:
|
||||
log.msg(format='File (empty-content): Empty file from %(request)s referred in <%(referer)s>: no-content',
|
||||
level=log.WARNING, spider=info.spider,
|
||||
request=request, referer=referer)
|
||||
logger.warning(
|
||||
'File (empty-content): Empty file from %(request)s referred '
|
||||
'in <%(referer)s>: no-content',
|
||||
{'request': request, 'referer': referer},
|
||||
extra={'spider': info.spider}
|
||||
)
|
||||
raise FileException('empty-content')
|
||||
|
||||
status = 'cached' if 'cached' in response.flags else 'downloaded'
|
||||
log.msg(format='File (%(status)s): Downloaded file from %(request)s referred in <%(referer)s>',
|
||||
level=log.DEBUG, spider=info.spider,
|
||||
status=status, request=request, referer=referer)
|
||||
logger.debug(
|
||||
'File (%(status)s): Downloaded file from %(request)s referred in '
|
||||
'<%(referer)s>',
|
||||
{'status': status, 'request': request, 'referer': referer},
|
||||
extra={'spider': info.spider}
|
||||
)
|
||||
self.inc_stats(info.spider, status)
|
||||
|
||||
try:
|
||||
path = self.file_path(request, response=response, info=info)
|
||||
checksum = self.file_downloaded(response, request, info)
|
||||
except FileException as exc:
|
||||
whyfmt = 'File (error): Error processing file from %(request)s referred in <%(referer)s>: %(errormsg)s'
|
||||
log.msg(format=whyfmt, level=log.WARNING, spider=info.spider,
|
||||
request=request, referer=referer, errormsg=str(exc))
|
||||
logger.warning(
|
||||
'File (error): Error processing file from %(request)s '
|
||||
'referred in <%(referer)s>: %(errormsg)s',
|
||||
{'request': request, 'referer': referer, 'errormsg': str(exc)},
|
||||
extra={'spider': info.spider}, exc_info=True
|
||||
)
|
||||
raise
|
||||
except Exception as exc:
|
||||
whyfmt = 'File (unknown-error): Error processing file from %(request)s referred in <%(referer)s>'
|
||||
log.err(None, whyfmt % {'request': request, 'referer': referer}, spider=info.spider)
|
||||
logger.exception(
|
||||
'File (unknown-error): Error processing file from %(request)s '
|
||||
'referred in <%(referer)s>',
|
||||
{'request': request, 'referer': referer},
|
||||
extra={'spider': info.spider}
|
||||
)
|
||||
raise FileException(str(exc))
|
||||
|
||||
return {'url': request.url, 'path': path, 'checksum': checksum}
|
||||
|
@ -1,13 +1,16 @@
|
||||
from __future__ import print_function
|
||||
|
||||
import logging
|
||||
from collections import defaultdict
|
||||
from twisted.internet.defer import Deferred, DeferredList
|
||||
from twisted.python.failure import Failure
|
||||
|
||||
from scrapy.utils.defer import mustbe_deferred, defer_result
|
||||
from scrapy import log
|
||||
from scrapy.utils.request import request_fingerprint
|
||||
from scrapy.utils.misc import arg_to_iter
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class MediaPipeline(object):
|
||||
|
||||
@ -66,7 +69,9 @@ class MediaPipeline(object):
|
||||
dfd = mustbe_deferred(self.media_to_download, request, info)
|
||||
dfd.addCallback(self._check_media_to_download, request, info)
|
||||
dfd.addBoth(self._cache_result_and_execute_waiters, fp, info)
|
||||
dfd.addErrback(log.err, spider=info.spider)
|
||||
dfd.addErrback(lambda f: logger.error(
|
||||
f.value, extra={'spider': info.spider, 'failure': f})
|
||||
)
|
||||
return dfd.addBoth(lambda _: wad) # it must return wad at last
|
||||
|
||||
def _check_media_to_download(self, result, request, info):
|
||||
@ -117,8 +122,11 @@ class MediaPipeline(object):
|
||||
def item_completed(self, results, item, info):
|
||||
"""Called per item when all media requests has been processed"""
|
||||
if self.LOG_FAILED_RESULTS:
|
||||
msg = '%s found errors processing %s' % (self.__class__.__name__, item)
|
||||
for ok, value in results:
|
||||
if not ok:
|
||||
log.err(value, msg, spider=info.spider)
|
||||
logger.error(
|
||||
'%(class)s found errors processing %(item)s',
|
||||
{'class': self.__class__.__name__, 'item': item},
|
||||
extra={'spider': info.spider, 'failure': value}
|
||||
)
|
||||
return item
|
||||
|
@ -4,9 +4,13 @@ Depth Spider Middleware
|
||||
See documentation in docs/topics/spider-middleware.rst
|
||||
"""
|
||||
|
||||
from scrapy import log
|
||||
import logging
|
||||
|
||||
from scrapy.http import Request
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class DepthMiddleware(object):
|
||||
|
||||
def __init__(self, maxdepth, stats=None, verbose_stats=False, prio=1):
|
||||
@ -31,9 +35,9 @@ class DepthMiddleware(object):
|
||||
if self.prio:
|
||||
request.priority -= depth * self.prio
|
||||
if self.maxdepth and depth > self.maxdepth:
|
||||
log.msg(format="Ignoring link (depth > %(maxdepth)d): %(requrl)s ",
|
||||
level=log.DEBUG, spider=spider,
|
||||
maxdepth=self.maxdepth, requrl=request.url)
|
||||
logger.debug("Ignoring link (depth > %(maxdepth)d): %(requrl)s ",
|
||||
{'maxdepth': self.maxdepth, 'requrl': request.url},
|
||||
extra={'spider': spider})
|
||||
return False
|
||||
elif self.stats:
|
||||
if self.verbose_stats:
|
||||
|
@ -3,8 +3,12 @@ HttpError Spider Middleware
|
||||
|
||||
See documentation in docs/topics/spider-middleware.rst
|
||||
"""
|
||||
import logging
|
||||
|
||||
from scrapy.exceptions import IgnoreRequest
|
||||
from scrapy import log
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class HttpError(IgnoreRequest):
|
||||
"""A non-200 response was filtered"""
|
||||
@ -42,10 +46,8 @@ class HttpErrorMiddleware(object):
|
||||
|
||||
def process_spider_exception(self, response, exception, spider):
|
||||
if isinstance(exception, HttpError):
|
||||
log.msg(
|
||||
format="Ignoring response %(response)r: HTTP status code is not handled or not allowed",
|
||||
level=log.DEBUG,
|
||||
spider=spider,
|
||||
response=response
|
||||
logger.debug(
|
||||
"Ignoring response %(response)r: HTTP status code is not handled or not allowed",
|
||||
{'response': response}, extra={'spider': spider},
|
||||
)
|
||||
return []
|
||||
|
@ -5,11 +5,13 @@ See documentation in docs/topics/spider-middleware.rst
|
||||
"""
|
||||
|
||||
import re
|
||||
import logging
|
||||
|
||||
from scrapy import signals
|
||||
from scrapy.http import Request
|
||||
from scrapy.utils.httpobj import urlparse_cached
|
||||
from scrapy import log
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
class OffsiteMiddleware(object):
|
||||
|
||||
@ -31,8 +33,8 @@ class OffsiteMiddleware(object):
|
||||
domain = urlparse_cached(x).hostname
|
||||
if domain and domain not in self.domains_seen:
|
||||
self.domains_seen.add(domain)
|
||||
log.msg(format="Filtered offsite request to %(domain)r: %(request)s",
|
||||
level=log.DEBUG, spider=spider, domain=domain, request=x)
|
||||
logger.debug("Filtered offsite request to %(domain)r: %(request)s",
|
||||
{'domain': domain, 'request': x}, extra={'spider': spider})
|
||||
self.stats.inc_value('offsite/domains', spider=spider)
|
||||
self.stats.inc_value('offsite/filtered', spider=spider)
|
||||
else:
|
||||
|
@ -4,10 +4,14 @@ Url Length Spider Middleware
|
||||
See documentation in docs/topics/spider-middleware.rst
|
||||
"""
|
||||
|
||||
from scrapy import log
|
||||
import logging
|
||||
|
||||
from scrapy.http import Request
|
||||
from scrapy.exceptions import NotConfigured
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class UrlLengthMiddleware(object):
|
||||
|
||||
def __init__(self, maxlength):
|
||||
@ -23,9 +27,9 @@ class UrlLengthMiddleware(object):
|
||||
def process_spider_output(self, response, result, spider):
|
||||
def _filter(request):
|
||||
if isinstance(request, Request) and len(request.url) > self.maxlength:
|
||||
log.msg(format="Ignoring link (url length > %(maxlength)d): %(url)s ",
|
||||
level=log.DEBUG, spider=spider,
|
||||
maxlength=self.maxlength, url=request.url)
|
||||
logger.debug("Ignoring link (url length > %(maxlength)d): %(url)s ",
|
||||
{'maxlength': self.maxlength, 'url': request.url},
|
||||
extra={'spider': spider})
|
||||
return False
|
||||
else:
|
||||
return True
|
||||
|
@ -1,10 +1,13 @@
|
||||
import re
|
||||
import logging
|
||||
|
||||
from scrapy.spider import Spider
|
||||
from scrapy.http import Request, XmlResponse
|
||||
from scrapy.utils.sitemap import Sitemap, sitemap_urls_from_robots
|
||||
from scrapy.utils.gz import gunzip, is_gzipped
|
||||
from scrapy import log
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class SitemapSpider(Spider):
|
||||
|
||||
@ -32,8 +35,8 @@ class SitemapSpider(Spider):
|
||||
else:
|
||||
body = self._get_sitemap_body(response)
|
||||
if body is None:
|
||||
log.msg(format="Ignoring invalid sitemap: %(response)s",
|
||||
level=log.WARNING, spider=self, response=response)
|
||||
logger.warning("Ignoring invalid sitemap: %(response)s",
|
||||
{'response': response}, extra={'spider': self})
|
||||
return
|
||||
|
||||
s = Sitemap(body)
|
||||
|
@ -1,7 +1,10 @@
|
||||
import logging
|
||||
|
||||
from scrapy.exceptions import NotConfigured
|
||||
from scrapy import signals
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class AutoThrottle(object):
|
||||
|
||||
@ -47,9 +50,17 @@ class AutoThrottle(object):
|
||||
diff = slot.delay - olddelay
|
||||
size = len(response.body)
|
||||
conc = len(slot.transferring)
|
||||
msg = "slot: %s | conc:%2d | delay:%5d ms (%+d) | latency:%5d ms | size:%6d bytes" % \
|
||||
(key, conc, slot.delay * 1000, diff * 1000, latency * 1000, size)
|
||||
spider.log(msg, level=logging.INFO)
|
||||
logger.info(
|
||||
"slot: %(slot)s | conc:%(concurrency)2d | "
|
||||
"delay:%(delay)5d ms (%(delaydiff)+d) | "
|
||||
"latency:%(latency)5d ms | size:%(size)6d bytes",
|
||||
{
|
||||
'slot': key, 'concurrency': conc,
|
||||
'delay': slot.delay * 1000, 'delaydiff': diff * 1000,
|
||||
'latency': latency * 1000, 'size': size
|
||||
},
|
||||
extra={'spider': spider}
|
||||
)
|
||||
|
||||
def _get_slot(self, request, spider):
|
||||
key = request.meta.get('download_slot')
|
||||
|
@ -1,7 +1,7 @@
|
||||
"""Download handlers for http and https schemes"""
|
||||
|
||||
import re
|
||||
|
||||
import logging
|
||||
from io import BytesIO
|
||||
from time import time
|
||||
from six.moves.urllib.parse import urldefrag
|
||||
@ -19,7 +19,9 @@ from scrapy.http import Headers
|
||||
from scrapy.responsetypes import responsetypes
|
||||
from scrapy.core.downloader.webclient import _parse
|
||||
from scrapy.utils.misc import load_object
|
||||
from scrapy import log, twisted_version
|
||||
from scrapy import twisted_version
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class HTTP11DownloadHandler(object):
|
||||
@ -237,14 +239,16 @@ class ScrapyAgent(object):
|
||||
expected_size = txresponse.length if txresponse.length != UNKNOWN_LENGTH else -1
|
||||
|
||||
if maxsize and expected_size > maxsize:
|
||||
log.msg("Expected response size (%s) larger than download max size (%s)." % (expected_size, maxsize),
|
||||
logLevel=log.ERROR)
|
||||
logger.error("Expected response size (%(size)s) larger than "
|
||||
"download max size (%(maxsize)s).",
|
||||
{'size': expected_size, 'maxsize': maxsize})
|
||||
txresponse._transport._producer.loseConnection()
|
||||
raise defer.CancelledError()
|
||||
|
||||
if warnsize and expected_size > warnsize:
|
||||
log.msg("Expected response size (%s) larger than downlod warn size (%s)." % (expected_size, warnsize),
|
||||
logLevel=log.WARNING)
|
||||
logger.warning("Expected response size (%(size)s) larger than "
|
||||
"download warn size (%(warnsize)s).",
|
||||
{'size': expected_size, 'warnsize': warnsize})
|
||||
|
||||
def _cancel(_):
|
||||
txresponse._transport._producer.loseConnection()
|
||||
@ -295,13 +299,17 @@ class _ResponseReader(protocol.Protocol):
|
||||
self._bytes_received += len(bodyBytes)
|
||||
|
||||
if self._maxsize and self._bytes_received > self._maxsize:
|
||||
log.msg("Received (%s) bytes larger than download max size (%s)." % (self._bytes_received, self._maxsize),
|
||||
logLevel=log.ERROR)
|
||||
logger.error("Received (%(bytes)s) bytes larger than download "
|
||||
"max size (%(maxsize)s).",
|
||||
{'bytes': self._bytes_received,
|
||||
'maxsize': self._maxsize})
|
||||
self._finished.cancel()
|
||||
|
||||
if self._warnsize and self._bytes_received > self._warnsize:
|
||||
log.msg("Received (%s) bytes larger than download warn size (%s)." % (self._bytes_received, self._warnsize),
|
||||
logLevel=log.WARNING)
|
||||
logger.warning("Received (%(bytes)s) bytes larger than download "
|
||||
"warn size (%(warnsize)s).",
|
||||
{'bytes': self._bytes_received,
|
||||
'warnsize': self._warnsize})
|
||||
|
||||
def connectionLost(self, reason):
|
||||
if self._finished.called:
|
||||
|
@ -4,17 +4,21 @@ This is the Scrapy engine which controls the Scheduler, Downloader and Spiders.
|
||||
For more information see docs/topics/architecture.rst
|
||||
|
||||
"""
|
||||
import logging
|
||||
from time import time
|
||||
|
||||
from twisted.internet import defer
|
||||
from twisted.python.failure import Failure
|
||||
|
||||
from scrapy import log, signals
|
||||
from scrapy import signals
|
||||
from scrapy.core.scraper import Scraper
|
||||
from scrapy.exceptions import DontCloseSpider
|
||||
from scrapy.http import Response, Request
|
||||
from scrapy.utils.misc import load_object
|
||||
from scrapy.utils.reactor import CallLaterOnce
|
||||
from scrapy.utils.log import logformatter_adapter
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class Slot(object):
|
||||
@ -106,10 +110,10 @@ class ExecutionEngine(object):
|
||||
request = next(slot.start_requests)
|
||||
except StopIteration:
|
||||
slot.start_requests = None
|
||||
except Exception as exc:
|
||||
except Exception:
|
||||
slot.start_requests = None
|
||||
log.err(None, 'Obtaining request from start requests', \
|
||||
spider=spider)
|
||||
logger.exception('Error while obtaining start requests',
|
||||
extra={'spider': spider})
|
||||
else:
|
||||
self.crawl(request, spider)
|
||||
|
||||
@ -130,11 +134,14 @@ class ExecutionEngine(object):
|
||||
return
|
||||
d = self._download(request, spider)
|
||||
d.addBoth(self._handle_downloader_output, request, spider)
|
||||
d.addErrback(log.msg, spider=spider)
|
||||
d.addErrback(lambda f: logger.info('Error while handling downloader output',
|
||||
extra={'spider': spider, 'failure': f}))
|
||||
d.addBoth(lambda _: slot.remove_request(request))
|
||||
d.addErrback(log.msg, spider=spider)
|
||||
d.addErrback(lambda f: logger.info('Error while removing request from slot',
|
||||
extra={'spider': spider, 'failure': f}))
|
||||
d.addBoth(lambda _: slot.nextcall.schedule())
|
||||
d.addErrback(log.msg, spider=spider)
|
||||
d.addErrback(lambda f: logger.info('Error while scheduling new request',
|
||||
extra={'spider': spider, 'failure': f}))
|
||||
return d
|
||||
|
||||
def _handle_downloader_output(self, response, request, spider):
|
||||
@ -145,7 +152,8 @@ class ExecutionEngine(object):
|
||||
return
|
||||
# response is a Response or Failure
|
||||
d = self.scraper.enqueue_scrape(response, request, spider)
|
||||
d.addErrback(log.err, spider=spider)
|
||||
d.addErrback(lambda f: logger.error('Error while enqueuing downloader output',
|
||||
extra={'spider': spider, 'failure': f}))
|
||||
return d
|
||||
|
||||
def spider_is_idle(self, spider):
|
||||
@ -197,7 +205,7 @@ class ExecutionEngine(object):
|
||||
if isinstance(response, Response):
|
||||
response.request = request # tie request to response received
|
||||
logkws = self.logformatter.crawled(request, response, spider)
|
||||
log.msg(spider=spider, **logkws)
|
||||
logger.log(*logformatter_adapter(logkws), extra={'spider': spider})
|
||||
self.signals.send_catch_log(signal=signals.response_received, \
|
||||
response=response, request=request, spider=spider)
|
||||
return response
|
||||
@ -215,7 +223,7 @@ class ExecutionEngine(object):
|
||||
def open_spider(self, spider, start_requests=(), close_if_idle=True):
|
||||
assert self.has_capacity(), "No free spider slot when opening %r" % \
|
||||
spider.name
|
||||
log.msg("Spider opened", spider=spider)
|
||||
logger.info("Spider opened", extra={'spider': spider})
|
||||
nextcall = CallLaterOnce(self._next_request, spider)
|
||||
scheduler = self.scheduler_cls.from_crawler(self.crawler)
|
||||
start_requests = yield self.scraper.spidermw.process_start_requests(start_requests, spider)
|
||||
@ -252,33 +260,42 @@ class ExecutionEngine(object):
|
||||
slot = self.slot
|
||||
if slot.closing:
|
||||
return slot.closing
|
||||
log.msg(format="Closing spider (%(reason)s)", reason=reason, spider=spider)
|
||||
logger.info("Closing spider (%(reason)s)",
|
||||
{'reason': reason},
|
||||
extra={'spider': spider})
|
||||
|
||||
dfd = slot.close()
|
||||
|
||||
def log_failure(msg):
|
||||
def errback(failure):
|
||||
logger.error(msg, extra={'spider': spider, 'failure': failure})
|
||||
return errback
|
||||
|
||||
dfd.addBoth(lambda _: self.downloader.close())
|
||||
dfd.addErrback(log.err, spider=spider)
|
||||
dfd.addErrback(log_failure('Downloader close failure'))
|
||||
|
||||
dfd.addBoth(lambda _: self.scraper.close_spider(spider))
|
||||
dfd.addErrback(log.err, spider=spider)
|
||||
dfd.addErrback(log_failure('Scraper close failure'))
|
||||
|
||||
dfd.addBoth(lambda _: slot.scheduler.close(reason))
|
||||
dfd.addErrback(log.err, spider=spider)
|
||||
dfd.addErrback(log_failure('Scheduler close failure'))
|
||||
|
||||
dfd.addBoth(lambda _: self.signals.send_catch_log_deferred(
|
||||
signal=signals.spider_closed, spider=spider, reason=reason))
|
||||
dfd.addErrback(log.err, spider=spider)
|
||||
dfd.addErrback(log_failure('Error while sending spider_close signal'))
|
||||
|
||||
dfd.addBoth(lambda _: self.crawler.stats.close_spider(spider, reason=reason))
|
||||
dfd.addErrback(log.err, spider=spider)
|
||||
dfd.addErrback(log_failure('Stats close failure'))
|
||||
|
||||
dfd.addBoth(lambda _: log.msg(format="Spider closed (%(reason)s)", reason=reason, spider=spider))
|
||||
dfd.addBoth(lambda _: logger.info("Spider closed (%(reason)s)",
|
||||
{'reason': reason},
|
||||
extra={'spider': spider}))
|
||||
|
||||
dfd.addBoth(lambda _: setattr(self, 'slot', None))
|
||||
dfd.addErrback(log.err, spider=spider)
|
||||
dfd.addErrback(log_failure('Error while unassigning slot'))
|
||||
|
||||
dfd.addBoth(lambda _: setattr(self, 'spider', None))
|
||||
dfd.addErrback(log.err, spider=spider)
|
||||
dfd.addErrback(log_failure('Error while unassigning spider'))
|
||||
|
||||
dfd.addBoth(lambda _: self._spider_closed_callback(spider))
|
||||
|
||||
|
@ -1,12 +1,15 @@
|
||||
import os
|
||||
import json
|
||||
import logging
|
||||
from os.path import join, exists
|
||||
|
||||
from queuelib import PriorityQueue
|
||||
from scrapy.utils.reqser import request_to_dict, request_from_dict
|
||||
from scrapy.utils.misc import load_object
|
||||
from scrapy.utils.job import job_dir
|
||||
from scrapy import log
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class Scheduler(object):
|
||||
|
||||
@ -80,9 +83,9 @@ class Scheduler(object):
|
||||
self.dqs.push(reqd, -request.priority)
|
||||
except ValueError as e: # non serializable request
|
||||
if self.logunser:
|
||||
log.msg(format="Unable to serialize request: %(request)s - reason: %(reason)s",
|
||||
level=log.ERROR, spider=self.spider,
|
||||
request=request, reason=e)
|
||||
logger.exception("Unable to serialize request: %(request)s - reason: %(reason)s",
|
||||
{'request': request, 'reason': e},
|
||||
extra={'spider': self.spider})
|
||||
return
|
||||
else:
|
||||
return True
|
||||
@ -111,8 +114,8 @@ class Scheduler(object):
|
||||
prios = ()
|
||||
q = PriorityQueue(self._newdq, startprios=prios)
|
||||
if q:
|
||||
log.msg(format="Resuming crawl (%(queuesize)d requests scheduled)",
|
||||
spider=self.spider, queuesize=len(q))
|
||||
logger.info("Resuming crawl (%(queuesize)d requests scheduled)",
|
||||
{'queuesize': len(q)}, extra={'spider': self.spider})
|
||||
return q
|
||||
|
||||
def _dqdir(self, jobdir):
|
||||
|
@ -1,6 +1,7 @@
|
||||
"""This module implements the Scraper component which parses responses and
|
||||
extracts information from them"""
|
||||
|
||||
import logging
|
||||
from collections import deque
|
||||
|
||||
from twisted.python.failure import Failure
|
||||
@ -9,12 +10,14 @@ from twisted.internet import defer
|
||||
from scrapy.utils.defer import defer_result, defer_succeed, parallel, iter_errback
|
||||
from scrapy.utils.spider import iterate_spider_output
|
||||
from scrapy.utils.misc import load_object
|
||||
from scrapy.utils.log import logformatter_adapter
|
||||
from scrapy.exceptions import CloseSpider, DropItem, IgnoreRequest
|
||||
from scrapy import signals
|
||||
from scrapy.http import Request, Response
|
||||
from scrapy.item import BaseItem
|
||||
from scrapy.core.spidermw import SpiderMiddlewareManager
|
||||
from scrapy import log
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class Slot(object):
|
||||
@ -102,7 +105,9 @@ class Scraper(object):
|
||||
return _
|
||||
dfd.addBoth(finish_scraping)
|
||||
dfd.addErrback(
|
||||
log.err, 'Scraper bug processing %s' % request, spider=spider)
|
||||
lambda f: logger.error('Scraper bug processing %(request)s',
|
||||
{'request': request},
|
||||
extra={'spider': spider, 'failure': f}))
|
||||
self._scrape_next(spider, slot)
|
||||
return dfd
|
||||
|
||||
@ -145,10 +150,10 @@ class Scraper(object):
|
||||
self.crawler.engine.close_spider(spider, exc.reason or 'cancelled')
|
||||
return
|
||||
referer = request.headers.get('Referer')
|
||||
log.err(
|
||||
_failure,
|
||||
"Spider error processing %s (referer: %s)" % (request, referer),
|
||||
spider=spider
|
||||
logger.error(
|
||||
"Spider error processing %(request)s (referer: %(referer)s)",
|
||||
{'request': request, 'referer': referer},
|
||||
extra={'spider': spider, 'failure': _failure}
|
||||
)
|
||||
self.signals.send_catch_log(
|
||||
signal=signals.spider_error,
|
||||
@ -183,9 +188,10 @@ class Scraper(object):
|
||||
pass
|
||||
else:
|
||||
typename = type(output).__name__
|
||||
log.msg(format='Spider must return Request, BaseItem, dict or None, '
|
||||
'got %(typename)r in %(request)s',
|
||||
level=log.ERROR, spider=spider, request=request, typename=typename)
|
||||
logger.error('Spider must return Request, BaseItem, dict or None, '
|
||||
'got %(typename)r in %(request)s',
|
||||
{'request': request, 'typename': typename},
|
||||
extra={'spider': spider})
|
||||
|
||||
def _log_download_errors(self, spider_failure, download_failure, request, spider):
|
||||
"""Log and silence errors that come from the engine (typically download
|
||||
@ -194,14 +200,15 @@ class Scraper(object):
|
||||
if (isinstance(download_failure, Failure) and
|
||||
not download_failure.check(IgnoreRequest)):
|
||||
if download_failure.frames:
|
||||
log.err(download_failure, 'Error downloading %s' % request,
|
||||
spider=spider)
|
||||
logger.error('Error downloading %(request)s',
|
||||
{'request': request},
|
||||
extra={'spider': spider, 'failure': download_failure})
|
||||
else:
|
||||
errmsg = download_failure.getErrorMessage()
|
||||
if errmsg:
|
||||
log.msg(format='Error downloading %(request)s: %(errmsg)s',
|
||||
level=log.ERROR, spider=spider, request=request,
|
||||
errmsg=errmsg)
|
||||
logger.error('Error downloading %(request)s: %(errmsg)s',
|
||||
{'request': request, 'errmsg': errmsg},
|
||||
extra={'spider': spider})
|
||||
|
||||
if spider_failure is not download_failure:
|
||||
return spider_failure
|
||||
@ -214,15 +221,16 @@ class Scraper(object):
|
||||
ex = output.value
|
||||
if isinstance(ex, DropItem):
|
||||
logkws = self.logformatter.dropped(item, ex, response, spider)
|
||||
log.msg(spider=spider, **logkws)
|
||||
logger.log(*logformatter_adapter(logkws), extra={'spider': spider})
|
||||
return self.signals.send_catch_log_deferred(
|
||||
signal=signals.item_dropped, item=item, response=response,
|
||||
spider=spider, exception=output.value)
|
||||
else:
|
||||
log.err(output, 'Error processing %s' % item, spider=spider)
|
||||
logger.error('Error processing %(item)s', {'item': item},
|
||||
extra={'spider': spider, 'failure': output})
|
||||
else:
|
||||
logkws = self.logformatter.scraped(output, response, spider)
|
||||
log.msg(spider=spider, **logkws)
|
||||
logger.log(*logformatter_adapter(logkws), extra={'spider': spider})
|
||||
return self.signals.send_catch_log_deferred(
|
||||
signal=signals.item_scraped, item=output, response=response,
|
||||
spider=spider)
|
||||
|
@ -1,5 +1,6 @@
|
||||
import six
|
||||
import signal
|
||||
import logging
|
||||
import warnings
|
||||
|
||||
from twisted.internet import reactor, defer
|
||||
@ -14,7 +15,10 @@ from scrapy.signalmanager import SignalManager
|
||||
from scrapy.exceptions import ScrapyDeprecationWarning
|
||||
from scrapy.utils.ossignal import install_shutdown_handlers, signal_names
|
||||
from scrapy.utils.misc import load_object
|
||||
from scrapy import log, signals
|
||||
from scrapy.utils.log import LogCounterHandler, configure_logging, log_scrapy_info
|
||||
from scrapy import signals
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class Crawler(object):
|
||||
@ -28,6 +32,12 @@ class Crawler(object):
|
||||
|
||||
self.signals = SignalManager(self)
|
||||
self.stats = load_object(self.settings['STATS_CLASS'])(self)
|
||||
|
||||
handler = LogCounterHandler(self, level=settings.get('LOG_LEVEL'))
|
||||
logging.root.addHandler(handler)
|
||||
self.signals.connect(lambda: logging.root.removeHandler(handler),
|
||||
signals.engine_stopped)
|
||||
|
||||
lf_cls = load_object(self.settings['LOG_FORMATTER'])
|
||||
self.logformatter = lf_cls.from_crawler(self)
|
||||
self.extensions = ExtensionManager.from_crawler(self)
|
||||
@ -99,7 +109,6 @@ class CrawlerRunner(object):
|
||||
crawler = crawler_or_spidercls
|
||||
if not isinstance(crawler_or_spidercls, Crawler):
|
||||
crawler = self._create_crawler(crawler_or_spidercls)
|
||||
self._setup_crawler_logging(crawler)
|
||||
|
||||
self.crawlers.add(crawler)
|
||||
d = crawler.crawl(*args, **kwargs)
|
||||
@ -117,11 +126,6 @@ class CrawlerRunner(object):
|
||||
spidercls = self.spider_loader.load(spidercls)
|
||||
return Crawler(spidercls, self.settings)
|
||||
|
||||
def _setup_crawler_logging(self, crawler):
|
||||
log_observer = log.start_from_crawler(crawler)
|
||||
if log_observer:
|
||||
crawler.signals.connect(log_observer.stop, signals.engine_stopped)
|
||||
|
||||
def stop(self):
|
||||
return defer.DeferredList([c.stop() for c in list(self.crawlers)])
|
||||
|
||||
@ -139,22 +143,21 @@ class CrawlerProcess(CrawlerRunner):
|
||||
super(CrawlerProcess, self).__init__(settings)
|
||||
install_shutdown_handlers(self._signal_shutdown)
|
||||
self.stopping = False
|
||||
self.log_observer = log.start_from_settings(self.settings)
|
||||
log.scrapy_info(settings)
|
||||
configure_logging(settings)
|
||||
log_scrapy_info(settings)
|
||||
|
||||
def _signal_shutdown(self, signum, _):
|
||||
install_shutdown_handlers(self._signal_kill)
|
||||
signame = signal_names[signum]
|
||||
log.msg(format="Received %(signame)s, shutting down gracefully. Send again to force ",
|
||||
level=log.INFO, signame=signame)
|
||||
logger.info("Received %(signame)s, shutting down gracefully. Send again to force ",
|
||||
{'signame': signame})
|
||||
reactor.callFromThread(self.stop)
|
||||
|
||||
def _signal_kill(self, signum, _):
|
||||
install_shutdown_handlers(signal.SIG_IGN)
|
||||
signame = signal_names[signum]
|
||||
log.msg(format='Received %(signame)s twice, forcing unclean shutdown',
|
||||
level=log.INFO, signame=signame)
|
||||
self._stop_logging()
|
||||
logger.info('Received %(signame)s twice, forcing unclean shutdown',
|
||||
{'signame': signame})
|
||||
reactor.callFromThread(self._stop_reactor)
|
||||
|
||||
def start(self, stop_after_crawl=True):
|
||||
@ -173,10 +176,6 @@ class CrawlerProcess(CrawlerRunner):
|
||||
reactor.addSystemEventTrigger('before', 'shutdown', self.stop)
|
||||
reactor.run(installSignalHandlers=False) # blocking call
|
||||
|
||||
def _stop_logging(self):
|
||||
if self.log_observer:
|
||||
self.log_observer.stop()
|
||||
|
||||
def _stop_reactor(self, _=None):
|
||||
try:
|
||||
reactor.stop()
|
||||
|
@ -1,7 +1,7 @@
|
||||
from __future__ import print_function
|
||||
import os
|
||||
import logging
|
||||
|
||||
from scrapy import log
|
||||
from scrapy.utils.job import job_dir
|
||||
from scrapy.utils.request import request_fingerprint
|
||||
|
||||
@ -33,6 +33,7 @@ class RFPDupeFilter(BaseDupeFilter):
|
||||
self.fingerprints = set()
|
||||
self.logdupes = True
|
||||
self.debug = debug
|
||||
self.logger = logging.getLogger(__name__)
|
||||
if path:
|
||||
self.file = open(os.path.join(path, 'requests.seen'), 'a+')
|
||||
self.fingerprints.update(x.rstrip() for x in self.file)
|
||||
@ -59,13 +60,13 @@ class RFPDupeFilter(BaseDupeFilter):
|
||||
|
||||
def log(self, request, spider):
|
||||
if self.debug:
|
||||
fmt = "Filtered duplicate request: %(request)s"
|
||||
log.msg(format=fmt, request=request, level=log.DEBUG, spider=spider)
|
||||
msg = "Filtered duplicate request: %(request)s"
|
||||
self.logger.debug(msg, {'request': request}, extra={'spider': spider})
|
||||
elif self.logdupes:
|
||||
fmt = ("Filtered duplicate request: %(request)s"
|
||||
msg = ("Filtered duplicate request: %(request)s"
|
||||
" - no more duplicates will be shown"
|
||||
" (see DUPEFILTER_DEBUG to show all duplicates)")
|
||||
log.msg(format=fmt, request=request, level=log.DEBUG, spider=spider)
|
||||
self.logger.debug(msg, {'request': request}, extra={'spider': spider})
|
||||
self.logdupes = False
|
||||
|
||||
spider.crawler.stats.inc_value('dupefilter/filtered', spider=spider)
|
||||
|
scrapy/log.py
@ -1,19 +1,24 @@
|
||||
"""
|
||||
Scrapy logging facility
|
||||
This module is kept to provide a helpful warning about its removal.
|
||||
"""
|
||||
|
||||
See documentation in docs/topics/logging.rst
|
||||
"""
|
||||
import sys
|
||||
import logging
|
||||
import warnings
|
||||
|
||||
from twisted.python import log
|
||||
from twisted.python.failure import Failure
|
||||
|
||||
import scrapy
|
||||
from scrapy.utils.python import unicode_to_str
|
||||
from scrapy.settings import overridden_settings
|
||||
from scrapy.exceptions import ScrapyDeprecationWarning
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
warnings.warn("Module `scrapy.log` has been deprecated, Scrapy now relies on "
|
||||
"the builtin Python library for logging. Read the updated "
|
||||
"logging entry in the documentation to learn more.",
|
||||
ScrapyDeprecationWarning, stacklevel=2)
|
||||
|
||||
|
||||
# Imports kept for backwards-compatibility
|
||||
|
||||
# Logging levels
|
||||
DEBUG = logging.DEBUG
|
||||
INFO = logging.INFO
|
||||
WARNING = logging.WARNING
|
||||
@ -21,141 +26,26 @@ ERROR = logging.ERROR
|
||||
CRITICAL = logging.CRITICAL
|
||||
SILENT = CRITICAL + 1
|
||||
|
||||
level_names = {
|
||||
logging.DEBUG: "DEBUG",
|
||||
logging.INFO: "INFO",
|
||||
logging.WARNING: "WARNING",
|
||||
logging.ERROR: "ERROR",
|
||||
logging.CRITICAL: "CRITICAL",
|
||||
SILENT: "SILENT",
|
||||
}
|
||||
|
||||
class ScrapyFileLogObserver(log.FileLogObserver):
|
||||
def msg(message=None, _level=logging.INFO, **kw):
|
||||
warnings.warn('log.msg has been deprecated, create a python logger and '
|
||||
'log through it instead',
|
||||
ScrapyDeprecationWarning, stacklevel=2)
|
||||
|
||||
def __init__(self, f, level=INFO, encoding='utf-8', crawler=None):
|
||||
self.level = level
|
||||
self.encoding = encoding
|
||||
self.crawler = crawler
|
||||
if crawler:
|
||||
self.emit = self._emit_with_crawler
|
||||
else:
|
||||
self.emit = self._emit
|
||||
log.FileLogObserver.__init__(self, f)
|
||||
level = kw.pop('level', _level)
|
||||
message = kw.pop('format', message)
|
||||
# NOTE: logger.log doesn't handle well passing empty dictionaries with format
|
||||
# arguments because of some weird use-case:
|
||||
# https://hg.python.org/cpython/file/648dcafa7e5f/Lib/logging/__init__.py#l269
|
||||
logger.log(level, message, *[kw] if kw else [])
|
||||
|
||||
def _emit(self, eventDict):
|
||||
ev = _adapt_eventdict(eventDict, self.level, self.encoding,
|
||||
self.crawler)
|
||||
if ev is not None:
|
||||
log.FileLogObserver.emit(self, ev)
|
||||
return ev
|
||||
|
||||
def _emit_with_crawler(self, eventDict):
|
||||
ev = self._emit(eventDict)
|
||||
if ev:
|
||||
level = ev['logLevel']
|
||||
sname = 'log_count/%s' % level_names.get(level, level)
|
||||
self.crawler.stats.inc_value(sname)
|
||||
|
||||
def _adapt_eventdict(eventDict, log_level=INFO, encoding='utf-8',
|
||||
crawler=None, prepend_level=True):
|
||||
"""Adapt Twisted log eventDict making it suitable for logging with a Scrapy
|
||||
log observer. It may return None to indicate that the event should be
|
||||
ignored by a Scrapy log observer.
|
||||
|
||||
`log_level` is the minimum level being logged, and `encoding` is the log
|
||||
encoding.
|
||||
"""
|
||||
ev = eventDict.copy()
|
||||
if ev['isError']:
|
||||
ev.setdefault('logLevel', ERROR)
|
||||
|
||||
# ignore non-error messages from outside scrapy
|
||||
if ev.get('system') != 'scrapy' and not ev['isError']:
|
||||
return
|
||||
|
||||
level = ev.get('logLevel')
|
||||
if level < log_level:
|
||||
return
|
||||
|
||||
spider = ev.get('spider')
|
||||
if spider:
|
||||
ev['system'] = unicode_to_str(spider.name, encoding)
|
||||
if crawler and (not spider or spider.crawler is not crawler):
|
||||
# ignore events not triggered by own spiders in crawlers' observers
|
||||
return
|
||||
if not crawler and spider:
|
||||
# ignore spiders' events in observers without crawler
|
||||
return
|
||||
|
||||
lvlname = level_names.get(level, 'NOLEVEL')
|
||||
message = ev.get('message')
|
||||
if message:
|
||||
message = [unicode_to_str(x, encoding) for x in message]
|
||||
if prepend_level:
|
||||
message[0] = "%s: %s" % (lvlname, message[0])
|
||||
ev['message'] = message
|
||||
|
||||
why = ev.get('why')
|
||||
if why:
|
||||
why = unicode_to_str(why, encoding)
|
||||
if prepend_level:
|
||||
why = "%s: %s" % (lvlname, why)
|
||||
ev['why'] = why
|
||||
|
||||
fmt = ev.get('format')
|
||||
if fmt:
|
||||
fmt = unicode_to_str(fmt, encoding)
|
||||
if prepend_level:
|
||||
fmt = "%s: %s" % (lvlname, fmt)
|
||||
ev['format'] = fmt
|
||||
|
||||
return ev
|
||||
|
||||
def _get_log_level(level_name_or_id):
|
||||
if isinstance(level_name_or_id, int):
|
||||
return level_name_or_id
|
||||
elif isinstance(level_name_or_id, basestring):
|
||||
return globals()[level_name_or_id]
|
||||
else:
|
||||
raise ValueError("Unknown log level: %r" % level_name_or_id)
|
||||
|
||||
def start(logfile=None, loglevel='INFO', logstdout=True, logencoding='utf-8', crawler=None):
|
||||
loglevel = _get_log_level(loglevel)
|
||||
file = open(logfile, 'a') if logfile else sys.stderr
|
||||
log_observer = ScrapyFileLogObserver(file, loglevel, logencoding, crawler)
|
||||
_oldshowwarning = warnings.showwarning
|
||||
log.startLoggingWithObserver(log_observer.emit, setStdout=logstdout)
|
||||
# restore warnings, wrongly silenced by Twisted
|
||||
warnings.showwarning = _oldshowwarning
|
||||
return log_observer
|
||||
|
||||
def msg(message=None, _level=INFO, **kw):
|
||||
kw['logLevel'] = kw.pop('level', _level)
|
||||
kw.setdefault('system', 'scrapy')
|
||||
if message is None:
|
||||
log.msg(**kw)
|
||||
else:
|
||||
log.msg(message, **kw)
|
||||
|
||||
def err(_stuff=None, _why=None, **kw):
|
||||
kw['logLevel'] = kw.pop('level', ERROR)
|
||||
kw.setdefault('system', 'scrapy')
|
||||
log.err(_stuff, _why, **kw)
|
||||
warnings.warn('log.err has been deprecated, create a python logger and '
|
||||
'use its error method instead',
|
||||
ScrapyDeprecationWarning, stacklevel=2)
|
||||
|
||||
def start_from_settings(settings, crawler=None):
|
||||
if settings.getbool('LOG_ENABLED'):
|
||||
return start(settings['LOG_FILE'], settings['LOG_LEVEL'], settings['LOG_STDOUT'],
|
||||
settings['LOG_ENCODING'], crawler)
|
||||
|
||||
def scrapy_info(settings):
|
||||
msg("Scrapy %s started (bot: %s)" % (scrapy.__version__,
|
||||
settings['BOT_NAME']))
|
||||
|
||||
msg("Optional features available: %s" % ", ".join(scrapy.optional_features),
|
||||
level=INFO)
|
||||
|
||||
d = dict(overridden_settings(settings))
|
||||
msg(format="Overridden settings: %(settings)r", settings=d, level=INFO)
|
||||
|
||||
def start_from_crawler(crawler):
|
||||
return start_from_settings(crawler.settings, crawler)
|
||||
level = kw.pop('level', logging.ERROR)
|
||||
failure = kw.pop('failure', _stuff) or Failure()
|
||||
message = kw.pop('why', _why) or failure.value
|
||||
logger.log(level, message, *[kw] if kw else [], extra={'failure': failure})
|
||||
|
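
With ``scrapy.log`` reduced to this deprecation shim, the replacement for the
old ``log.msg``/``log.err`` calls is plain stdlib logging, following the same
shape as the rest of this diff. A before/after sketch (the message and the
``count`` variable are illustrative)::

    # before
    from scrapy import log
    log.msg("Parsed %d items" % count, level=log.INFO, spider=spider)

    # after
    import logging
    logger = logging.getLogger(__name__)
    logger.info("Parsed %(count)d items", {'count': count},
                extra={'spider': spider})
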
@ -1,46 +1,67 @@
import os
import logging

from twisted.python.failure import Failure

from scrapy import log

SCRAPEDMSG = u"Scraped from %(src)s" + os.linesep + "%(item)s"
DROPPEDMSG = u"Dropped: %(exception)s" + os.linesep + "%(item)s"
CRAWLEDMSG = u"Crawled (%(status)s) %(request)s (referer: %(referer)s)%(flags)s"

SCRAPEDFMT = u"Scraped from %(src)s" + os.linesep + "%(item)s"
DROPPEDFMT = u"Dropped: %(exception)s" + os.linesep + "%(item)s"
CRAWLEDFMT = u"Crawled (%(status)s) %(request)s (referer: %(referer)s)%(flags)s"

class LogFormatter(object):
    """Class for generating log messages for different actions. All methods
    must return a plain string which doesn't include the log level or the
    timestamp
    """Class for generating log messages for different actions.

    All methods must return a dictionary listing the parameters `level`, `msg`
    and `args` which are going to be used for constructing the log message when
    calling logging.log.

    Dictionary keys for the method outputs:
        * `level` should be the log level for that action, you can use those
          from the python logging library: logging.DEBUG, logging.INFO,
          logging.WARNING, logging.ERROR and logging.CRITICAL.

        * `msg` should be a string that can contain different formatting
          placeholders. This string, formatted with the provided `args`, is going
          to be the log message for that action.

        * `args` should be a tuple or dict with the formatting placeholders for
          `msg`. The final log message is computed as output['msg'] %
          output['args'].
    """

    def crawled(self, request, response, spider):
        flags = ' %s' % str(response.flags) if response.flags else ''
        return {
            'level': log.DEBUG,
            'format': CRAWLEDFMT,
            'status': response.status,
            'request': request,
            'referer': request.headers.get('Referer'),
            'flags': flags,
            'level': logging.DEBUG,
            'msg': CRAWLEDMSG,
            'args': {
                'status': response.status,
                'request': request,
                'referer': request.headers.get('Referer'),
                'flags': flags,
            }
        }

    def scraped(self, item, response, spider):
        src = response.getErrorMessage() if isinstance(response, Failure) else response
        return {
            'level': log.DEBUG,
            'format': SCRAPEDFMT,
            'src': src,
            'item': item,
            'level': logging.DEBUG,
            'msg': SCRAPEDMSG,
            'args': {
                'src': src,
                'item': item,
            }
        }

    def dropped(self, item, exception, response, spider):
        return {
            'level': log.WARNING,
            'format': DROPPEDFMT,
            'exception': exception,
            'item': item,
            'level': logging.WARNING,
            'msg': DROPPEDMSG,
            'args': {
                'exception': exception,
                'item': item,
            }
        }

    @classmethod
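
Under the new contract a formatter method just builds that dictionary, so
customising a message is a matter of returning different ``level``/``msg``/
``args`` values and pointing the ``LOG_FORMATTER`` setting at the subclass. A
sketch (the subclass is illustrative, not part of this diff)::

    import logging
    from scrapy.logformatter import LogFormatter

    class QuietDropFormatter(LogFormatter):
        def dropped(self, item, exception, response, spider):
            # demote dropped-item noise from WARNING to DEBUG
            return {
                'level': logging.DEBUG,
                'msg': u"Dropped: %(exception)s",
                'args': {'exception': exception, 'item': item},
            }
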
@ -3,6 +3,8 @@ Mail sending helpers
|
||||
|
||||
See documentation in docs/topics/email.rst
|
||||
"""
|
||||
import logging
|
||||
|
||||
from six.moves import cStringIO as StringIO
|
||||
import six
|
||||
|
||||
@ -20,7 +22,8 @@ else:
|
||||
from twisted.internet import defer, reactor, ssl
|
||||
from twisted.mail.smtp import ESMTPSenderFactory
|
||||
|
||||
from scrapy import log
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class MailSender(object):
|
||||
|
||||
@ -71,8 +74,10 @@ class MailSender(object):
|
||||
_callback(to=to, subject=subject, body=body, cc=cc, attach=attachs, msg=msg)
|
||||
|
||||
if self.debug:
|
||||
log.msg(format='Debug mail sent OK: To=%(mailto)s Cc=%(mailcc)s Subject="%(mailsubject)s" Attachs=%(mailattachs)d',
|
||||
level=log.DEBUG, mailto=to, mailcc=cc, mailsubject=subject, mailattachs=len(attachs))
|
||||
logger.debug('Debug mail sent OK: To=%(mailto)s Cc=%(mailcc)s '
|
||||
'Subject="%(mailsubject)s" Attachs=%(mailattachs)d',
|
||||
{'mailto': to, 'mailcc': cc, 'mailsubject': subject,
|
||||
'mailattachs': len(attachs)})
|
||||
return
|
||||
|
||||
dfd = self._sendmail(rcpts, msg.as_string())
|
||||
@ -83,17 +88,18 @@ class MailSender(object):
|
||||
return dfd
|
||||
|
||||
def _sent_ok(self, result, to, cc, subject, nattachs):
|
||||
log.msg(format='Mail sent OK: To=%(mailto)s Cc=%(mailcc)s '
|
||||
'Subject="%(mailsubject)s" Attachs=%(mailattachs)d',
|
||||
mailto=to, mailcc=cc, mailsubject=subject, mailattachs=nattachs)
|
||||
logger.info('Mail sent OK: To=%(mailto)s Cc=%(mailcc)s '
|
||||
'Subject="%(mailsubject)s" Attachs=%(mailattachs)d',
|
||||
{'mailto': to, 'mailcc': cc, 'mailsubject': subject,
|
||||
'mailattachs': nattachs})
|
||||
|
||||
def _sent_failed(self, failure, to, cc, subject, nattachs):
|
||||
errstr = str(failure.value)
|
||||
log.msg(format='Unable to send mail: To=%(mailto)s Cc=%(mailcc)s '
|
||||
'Subject="%(mailsubject)s" Attachs=%(mailattachs)d'
|
||||
'- %(mailerr)s',
|
||||
level=log.ERROR, mailto=to, mailcc=cc, mailsubject=subject,
|
||||
mailattachs=nattachs, mailerr=errstr)
|
||||
logger.error('Unable to send mail: To=%(mailto)s Cc=%(mailcc)s '
|
||||
'Subject="%(mailsubject)s" Attachs=%(mailattachs)d'
|
||||
'- %(mailerr)s',
|
||||
{'mailto': to, 'mailcc': cc, 'mailsubject': subject,
|
||||
'mailattachs': nattachs, 'mailerr': errstr})
|
||||
|
||||
def _sendmail(self, to_addrs, msg):
|
||||
msg = StringIO(msg)
|
||||
|
@ -1,10 +1,13 @@
|
||||
import logging
|
||||
from collections import defaultdict
|
||||
|
||||
from scrapy import log
|
||||
from scrapy.exceptions import NotConfigured
|
||||
from scrapy.utils.misc import load_object
|
||||
from scrapy.utils.defer import process_parallel, process_chain, process_chain_both
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class MiddlewareManager(object):
|
||||
"""Base class for implementing middleware managers"""
|
||||
|
||||
@ -37,12 +40,15 @@ class MiddlewareManager(object):
|
||||
except NotConfigured as e:
|
||||
if e.args:
|
||||
clsname = clspath.split('.')[-1]
|
||||
log.msg(format="Disabled %(clsname)s: %(eargs)s",
|
||||
level=log.WARNING, clsname=clsname, eargs=e.args[0])
|
||||
logger.warning("Disabled %(clsname)s: %(eargs)s",
|
||||
{'clsname': clsname, 'eargs': e.args[0]},
|
||||
extra={'crawler': crawler})
|
||||
|
||||
enabled = [x.__class__.__name__ for x in middlewares]
|
||||
log.msg(format="Enabled %(componentname)ss: %(enabledlist)s", level=log.INFO,
|
||||
componentname=cls.component_name, enabledlist=', '.join(enabled))
|
||||
logger.info("Enabled %(componentname)ss: %(enabledlist)s",
|
||||
{'componentname': cls.component_name,
|
||||
'enabledlist': ', '.join(enabled)},
|
||||
extra={'crawler': crawler})
|
||||
return cls(*middlewares)
|
||||
|
||||
@classmethod
|
||||
|
@ -167,6 +167,8 @@ ITEM_PIPELINES_BASE = {}
|
||||
LOG_ENABLED = True
|
||||
LOG_ENCODING = 'utf-8'
|
||||
LOG_FORMATTER = 'scrapy.logformatter.LogFormatter'
|
||||
LOG_FORMAT = '%(asctime)s [%(name)s] %(levelname)s: %(message)s'
|
||||
LOG_DATEFORMAT = '%Y-%m-%d %H:%M:%S%z'
|
||||
LOG_STDOUT = False
|
||||
LOG_LEVEL = 'DEBUG'
|
||||
LOG_FILE = None
|
||||
|
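
``LOG_FORMAT`` and ``LOG_DATEFORMAT`` are the new knobs here; together with
the existing settings they describe the single root handler that
``scrapy.utils.log.configure_logging`` installs. An illustrative per-project
override (values are examples only)::

    # settings.py
    LOG_LEVEL = 'INFO'
    LOG_FILE = 'crawl.log'
    LOG_FORMAT = '%(asctime)s [%(name)s] %(levelname)s: %(message)s'
    LOG_DATEFORMAT = '%Y-%m-%d %H:%M:%S'
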
@ -3,9 +3,9 @@ Base class for Scrapy spiders

See documentation in docs/topics/spiders.rst
"""
import logging
import warnings

from scrapy import log
from scrapy import signals
from scrapy.http import Request
from scrapy.utils.trackref import object_ref
@ -31,11 +31,19 @@ class Spider(object_ref):
        if not hasattr(self, 'start_urls'):
            self.start_urls = []

    def log(self, message, level=log.DEBUG, **kw):
        """Log the given messages at the given log level. Always use this
        method to send log messages from your spider
    @property
    def logger(self):
        logger = logging.getLogger(self.name)
        return logging.LoggerAdapter(logger, {'spider': self})

    def log(self, message, level=logging.DEBUG, **kw):
        """Log the given message at the given log level

        This helper wraps a log call to the logger within the spider, but you
        can use it directly (e.g. Spider.logger.info('msg')) or use any other
        Python logger too.
        """
        log.msg(message, spider=self, level=level, **kw)
        self.logger.log(level, message, **kw)

    @classmethod
    def from_crawler(cls, crawler, *args, **kwargs):
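
Both entry points now end in the same place: the ``logger`` property wraps a
per-spider logger in a ``LoggerAdapter`` that injects ``{'spider': self}``,
and the legacy ``log()`` helper simply delegates to it. A short usage
sketch::

    from scrapy.spider import Spider

    class ExampleSpider(Spider):
        name = 'example'
        start_urls = ['http://www.example.com/']

        def parse(self, response):
            # equivalent to self.log('...', level=logging.INFO)
            self.logger.info('Parsed %(url)s', {'url': response.url})
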
@ -2,8 +2,10 @@
|
||||
Scrapy extension for collecting scraping stats
|
||||
"""
|
||||
import pprint
|
||||
import logging
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
from scrapy import log
|
||||
|
||||
class StatsCollector(object):
|
||||
|
||||
@ -41,8 +43,8 @@ class StatsCollector(object):
|
||||
|
||||
def close_spider(self, spider, reason):
|
||||
if self._dump:
|
||||
log.msg("Dumping Scrapy stats:\n" + pprint.pformat(self._stats), \
|
||||
spider=spider)
|
||||
logger.info("Dumping Scrapy stats:\n" + pprint.pformat(self._stats),
|
||||
extra={'spider': spider})
|
||||
self._persist_stats(self._stats, spider)
|
||||
|
||||
def _persist_stats(self, stats, spider):
|
||||
|
@ -5,6 +5,7 @@ See documentation in docs/topics/telnetconsole.rst
|
||||
"""
|
||||
|
||||
import pprint
|
||||
import logging
|
||||
|
||||
from twisted.internet import protocol
|
||||
try:
|
||||
@ -15,7 +16,7 @@ except ImportError:
|
||||
TWISTED_CONCH_AVAILABLE = False
|
||||
|
||||
from scrapy.exceptions import NotConfigured
|
||||
from scrapy import log, signals
|
||||
from scrapy import signals
|
||||
from scrapy.utils.trackref import print_live_refs
|
||||
from scrapy.utils.engine import print_engine_status
|
||||
from scrapy.utils.reactor import listen_tcp
|
||||
@ -26,6 +27,8 @@ try:
|
||||
except ImportError:
|
||||
hpy = None
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
# signal to update telnet variables
|
||||
# args: telnet_vars
|
||||
update_telnet_vars = object()
|
||||
@ -52,8 +55,9 @@ class TelnetConsole(protocol.ServerFactory):
|
||||
def start_listening(self):
|
||||
self.port = listen_tcp(self.portrange, self.host, self)
|
||||
h = self.port.getHost()
|
||||
log.msg(format="Telnet console listening on %(host)s:%(port)d",
|
||||
level=log.DEBUG, host=h.host, port=h.port)
|
||||
logger.debug("Telnet console listening on %(host)s:%(port)d",
|
||||
{'host': h.host, 'port': h.port},
|
||||
extra={'crawler': self.crawler})
|
||||
|
||||
def stop_listening(self):
|
||||
self.port.stopListening()
|
||||
|
@ -1,15 +1,20 @@
|
||||
import re, csv, six
|
||||
import re
|
||||
import csv
|
||||
import logging
|
||||
|
||||
try:
|
||||
from cStringIO import StringIO as BytesIO
|
||||
except ImportError:
|
||||
from io import BytesIO
|
||||
|
||||
import six
|
||||
|
||||
from scrapy.http import TextResponse, Response
|
||||
from scrapy.selector import Selector
|
||||
from scrapy import log
|
||||
from scrapy.utils.python import re_rsearch, str_to_unicode
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
def xmliter(obj, nodename):
|
||||
"""Return a iterator of Selector's over all nodes of a XML document,
|
||||
@ -108,8 +113,10 @@ def csviter(obj, delimiter=None, headers=None, encoding=None, quotechar=None):
|
||||
while True:
|
||||
row = _getrow(csv_r)
|
||||
if len(row) != len(headers):
|
||||
log.msg(format="ignoring row %(csvlnum)d (length: %(csvrow)d, should be: %(csvheader)d)",
|
||||
level=log.WARNING, csvlnum=csv_r.line_num, csvrow=len(row), csvheader=len(headers))
|
||||
logger.warning("ignoring row %(csvlnum)d (length: %(csvrow)d, "
|
||||
"should be: %(csvheader)d)",
|
||||
{'csvlnum': csv_r.line_num, 'csvrow': len(row),
|
||||
'csvheader': len(headers)})
|
||||
continue
|
||||
else:
|
||||
yield dict(zip(headers, row))
|
||||
|
scrapy/utils/log.py (new file)
@ -0,0 +1,181 @@
# -*- coding: utf-8 -*-

import os
import sys
import logging
import warnings
from logging.config import dictConfig

from twisted.python.failure import Failure
from twisted.python import log as twisted_log

import scrapy
from scrapy.settings import overridden_settings
from scrapy.exceptions import ScrapyDeprecationWarning

logger = logging.getLogger(__name__)


class FailureFormatter(logging.Filter):
    """Extract exc_info from Failure instances provided as contextual data

    This filter mimics Twisted log.err formatting for its first `_stuff`
    argument, which means that reprs of non Failure objects are appended to the
    log messages.
    """

    def filter(self, record):
        failure = record.__dict__.get('failure')
        if failure:
            if isinstance(failure, Failure):
                record.exc_info = (failure.type, failure.value, failure.tb)
            else:
                record.msg += os.linesep + repr(failure)
        return True
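
This keeps the ergonomics of ``log.err``: call sites attach a Twisted
``Failure`` as contextual data and the filter promotes it to ``exc_info`` so
the handler renders the traceback. A minimal sketch of such a call site (the
``request``/``spider`` names are illustrative; ``failure`` is the Failure
received in a Deferred errback)::

    import logging

    logger = logging.getLogger('scrapy')  # DEFAULT_LOGGING below puts the filter here

    def errback(failure, request, spider):
        logger.error('Error downloading %(request)s', {'request': request},
                     extra={'spider': spider, 'failure': failure})
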

class TopLevelFormatter(logging.Filter):
    """Keep only top level loggers's name (direct children from root) from
    records.

    This filter will replace Scrapy loggers' names with 'scrapy'. This mimics
    the old Scrapy log behaviour and helps shortening long names.

    Since it can't be set for just one logger (it won't propagate for its
    children), it's going to be set in the root handler, with a parametrized
    `loggers` list where it should act.
    """

    def __init__(self, loggers=None):
        self.loggers = loggers or []

    def filter(self, record):
        if any(record.name.startswith(l + '.') for l in self.loggers):
            record.name = record.name.split('.', 1)[0]
        return True


DEFAULT_LOGGING = {
    'version': 1,
    'disable_existing_loggers': False,
    'filters': {
        'failure_formatter': {
            '()': 'scrapy.utils.log.FailureFormatter',
        },
    },
    'loggers': {
        'scrapy': {
            'level': 'DEBUG',
            'filters': ['failure_formatter'],
        },
        'twisted': {
            'level': 'ERROR',
        },
    }
}

def configure_logging(settings=None):
    """Initialize and configure default loggers

    This function does:
      - Route warnings and twisted logging through Python standard logging
      - Set FailureFormatter filter on Scrapy logger
      - Assign DEBUG and ERROR level to Scrapy and Twisted loggers respectively
      - Create a handler for the root logger according to given settings
    """
    if not sys.warnoptions:
        # Route warnings through python logging
        logging.captureWarnings(True)

    observer = twisted_log.PythonLoggingObserver('twisted')
    observer.start()

    dictConfig(DEFAULT_LOGGING)
    if settings:
        logging.root.setLevel(logging.NOTSET)

        if settings.getbool('LOG_STDOUT'):
            sys.stdout = StreamLogger(logging.getLogger('stdout'))

        # Set up the default log handler
        filename = settings.get('LOG_FILE')
        if filename:
            encoding = settings.get('LOG_ENCODING')
            handler = logging.FileHandler(filename, encoding=encoding)
        elif settings.getbool('LOG_ENABLED'):
            handler = logging.StreamHandler()
        else:
            handler = logging.NullHandler()

        formatter = logging.Formatter(
            fmt=settings.get('LOG_FORMAT'),
            datefmt=settings.get('LOG_DATEFORMAT')
        )
        handler.setFormatter(formatter)
        handler.setLevel(settings.get('LOG_LEVEL'))
        handler.addFilter(TopLevelFormatter(['scrapy']))
        logging.root.addHandler(handler)
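
For code that starts a crawl without the ``scrapy`` command (e.g. scripts
built on ``CrawlerRunner``), this is the function to call before running the
reactor. A rough usage sketch; the project spider import is illustrative::

    from twisted.internet import reactor
    from scrapy.crawler import CrawlerRunner
    from scrapy.utils.log import configure_logging
    from scrapy.utils.project import get_project_settings
    from myproject.spiders.example import ExampleSpider  # illustrative

    settings = get_project_settings()
    configure_logging(settings)  # install the root handler per LOG_* settings

    runner = CrawlerRunner(settings)
    d = runner.crawl(ExampleSpider)
    d.addBoth(lambda _: reactor.stop())
    reactor.run()
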

def log_scrapy_info(settings):
    logger.info("Scrapy %(version)s started (bot: %(bot)s)",
                {'version': scrapy.__version__, 'bot': settings['BOT_NAME']})

    logger.info("Optional features available: %(features)s",
                {'features': ", ".join(scrapy.optional_features)})

    d = dict(overridden_settings(settings))
    logger.info("Overridden settings: %(settings)r", {'settings': d})


class StreamLogger(object):
    """Fake file-like stream object that redirects writes to a logger instance

    Taken from:
        http://www.electricmonk.nl/log/2011/08/14/redirect-stdout-and-stderr-to-a-logger-in-python/
    """
    def __init__(self, logger, log_level=logging.INFO):
        self.logger = logger
        self.log_level = log_level
        self.linebuf = ''

    def write(self, buf):
        for line in buf.rstrip().splitlines():
            self.logger.log(self.log_level, line.rstrip())
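
``StreamLogger`` is what backs the ``LOG_STDOUT`` setting handled in
``configure_logging`` above: ``sys.stdout`` gets replaced so stray ``print``
output ends up in the log rather than on the console. The same trick works on
its own::

    import sys
    import logging
    from scrapy.utils.log import StreamLogger

    logging.basicConfig(level=logging.INFO)  # handlers go to stderr, so no loop
    sys.stdout = StreamLogger(logging.getLogger('stdout'))
    print('this line becomes an INFO record on the "stdout" logger')
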

class LogCounterHandler(logging.Handler):
    """Record log levels count into a crawler stats"""

    def __init__(self, crawler, *args, **kwargs):
        super(LogCounterHandler, self).__init__(*args, **kwargs)
        self.crawler = crawler

    def emit(self, record):
        sname = 'log_count/{}'.format(record.levelname)
        self.crawler.stats.inc_value(sname)


def logformatter_adapter(logkws):
    """
    Helper that takes the dictionary output from the methods in LogFormatter
    and adapts it into a tuple of positional arguments for logger.log calls,
    handling backward compatibility as well.
    """
    if not {'level', 'msg', 'args'} <= set(logkws):
        warnings.warn('Missing keys in LogFormatter method',
                      ScrapyDeprecationWarning)

    if 'format' in logkws:
        warnings.warn('`format` key in LogFormatter methods has been '
                      'deprecated, use `msg` instead',
                      ScrapyDeprecationWarning)

    level = logkws.get('level', logging.INFO)
    message = logkws.get('format', logkws.get('msg'))
    # NOTE: This also handles 'args' being an empty dict, that case doesn't
    # play well in logger.log calls
    args = logkws if not logkws.get('args') else logkws['args']

    return (level, message, args)
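
This adapter is how the engine and scraper consume ``LogFormatter`` output in
this diff; every call site follows the same shape::

    logkws = self.logformatter.crawled(request, response, spider)
    logger.log(*logformatter_adapter(logkws), extra={'spider': spider})
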
@@ -1,5 +1,7 @@
"""Helper functions for working with signals"""

import logging

from twisted.internet.defer import maybeDeferred, DeferredList, Deferred
from twisted.python.failure import Failure

@@ -7,7 +9,8 @@ from scrapy.xlib.pydispatch.dispatcher import Any, Anonymous, liveReceivers, \
    getAllReceivers, disconnect
from scrapy.xlib.pydispatch.robustapply import robustApply

from scrapy import log
logger = logging.getLogger(__name__)


def send_catch_log(signal=Any, sender=Anonymous, *arguments, **named):
    """Like pydispatcher.robust.sendRobust but it also logs errors and returns
@@ -21,14 +24,14 @@ def send_catch_log(signal=Any, sender=Anonymous, *arguments, **named):
            response = robustApply(receiver, signal=signal, sender=sender,
                                   *arguments, **named)
            if isinstance(response, Deferred):
                log.msg(format="Cannot return deferreds from signal handler: %(receiver)s",
                        level=log.ERROR, spider=spider, receiver=receiver)
                logger.error("Cannot return deferreds from signal handler: %(receiver)s",
                             {'receiver': receiver}, extra={'spider': spider})
        except dont_log:
            result = Failure()
        except Exception:
            result = Failure()
            log.err(result, "Error caught on signal handler: %s" % receiver, \
                    spider=spider)
            logger.exception("Error caught on signal handler: %(receiver)s",
                             {'receiver': receiver}, extra={'spider': spider})
        else:
            result = response
        responses.append((receiver, result))
@@ -41,8 +44,9 @@ def send_catch_log_deferred(signal=Any, sender=Anonymous, *arguments, **named):
    """
    def logerror(failure, recv):
        if dont_log is None or not isinstance(failure.value, dont_log):
            log.err(failure, "Error caught on signal handler: %s" % recv, \
                    spider=spider)
            logger.error("Error caught on signal handler: %(receiver)s",
                         {'receiver': recv},
                         extra={'spider': spider, 'failure': failure})
        return failure

    dont_log = named.pop('dont_log', None)
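The hunks above illustrate the general porting pattern of this branch: a Twisted-style log.msg(format=..., level=..., spider=...) call becomes a stdlib logger call that passes the format arguments as a dict and the spider through extra. A minimal before/after sketch, where spider and receiver are just placeholder names:

# before: scrapy.log wrapper around twisted.python.log
log.msg(format="Cannot return deferreds from signal handler: %(receiver)s",
        level=log.ERROR, spider=spider, receiver=receiver)

# after: plain stdlib logging; %-style args go in a dict, context goes in `extra`
logger.error("Cannot return deferreds from signal handler: %(receiver)s",
             {'receiver': receiver}, extra={'spider': spider})

# failures that used to go through log.err() either use logger.exception()
# inside an except block, or attach the Failure via `extra` for later formatting
logger.exception("Error caught on signal handler: %(receiver)s",
                 {'receiver': receiver}, extra={'spider': spider})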
@@ -1,11 +1,13 @@
import logging
import inspect

import six

from scrapy import log
from scrapy.spider import Spider
from scrapy.utils.misc import arg_to_iter

logger = logging.getLogger(__name__)


def iterate_spider_output(result):
    return arg_to_iter(result)
@@ -43,12 +45,12 @@ def spidercls_for_request(spider_loader, request, default_spidercls=None,
        return spider_loader.load(snames[0])

    if len(snames) > 1 and log_multiple:
        log.msg(format='More than one spider can handle: %(request)s - %(snames)s',
                level=log.ERROR, request=request, snames=', '.join(snames))
        logger.error('More than one spider can handle: %(request)s - %(snames)s',
                     {'request': request, 'snames': ', '.join(snames)})

    if len(snames) == 0 and log_none:
        log.msg(format='Unable to find spider that handles: %(request)s',
                level=log.ERROR, request=request)
        logger.error('Unable to find spider that handles: %(request)s',
                     {'request': request})

    return default_spidercls
@@ -46,19 +46,6 @@ def get_testenv():
    env['PYTHONPATH'] = get_pythonpath()
    return env

def get_testlog():
    """Get Scrapy log of current test, ignoring the rest"""
    with open("test.log", "rb") as fp:
        loglines = fp.readlines()

    thistest = []
    for line in loglines[::-1]:
        thistest.append(line)
        if "[-] -->" in line:
            break
    return "".join(thistest[::-1])


def assert_samelines(testcase, text1, text2, msg=None):
    """Asserts text1 and text2 have the same lines, ignoring differences in
    line endings between platforms
@@ -31,7 +31,6 @@ tests/test_http_cookies.py
tests/test_http_request.py
tests/test_http_response.py
tests/test_logformatter.py
tests/test_log.py
tests/test_mail.py
tests/test_middleware.py
tests/test_pipeline_files.py
@@ -59,6 +58,7 @@ tests/test_stats.py
tests/test_utils_defer.py
tests/test_utils_iterators.py
tests/test_utils_jsonrpc.py
tests/test_utils_log.py
tests/test_utils_python.py
tests/test_utils_reqser.py
tests/test_utils_request.py
@@ -4,3 +4,4 @@ mitmproxy==0.10.1
netlib==0.10.1
pytest-twisted
jmespath
testfixtures
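testfixtures is added here because the test suite below replaces the old test.log/get_testlog() machinery with testfixtures.LogCapture. A tiny self-contained example of the pattern the tests rely on; the logger name and message are arbitrary:

import logging
from testfixtures import LogCapture

logger = logging.getLogger('scrapy')

with LogCapture() as l:                 # optionally LogCapture('scrapy', level=logging.ERROR)
    logger.info('Got response 200')

l.check(('scrapy', 'INFO', 'Got response 200'))   # exact (name, level, message) tuples
assert str(l).count('Got response 200') == 1      # or grep the rendered text
assert l.records[0].levelname == 'INFO'           # or inspect the raw LogRecords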
@@ -74,7 +74,7 @@ class SimpleSpider(MetaSpider):
        self.start_urls = [url]

    def parse(self, response):
        self.log("Got response %d" % response.status)
        self.logger.info("Got response %d" % response.status)


class ItemSpider(FollowAllSpider):
@@ -39,7 +39,6 @@ class TestCloseSpider(TestCase):
        close_on = 5
        crawler = get_crawler(ErrorSpider, {'CLOSESPIDER_ERRORCOUNT': close_on})
        yield crawler.crawl(total=1000000)
        self.flushLoggedErrors(crawler.spider.exception_cls)
        reason = crawler.spider.meta['close_reason']
        self.assertEqual(reason, 'closespider_errorcount')
        key = 'spider_exceptions/{name}'\
@@ -137,21 +137,20 @@ class RunSpiderCommandTest(CommandTest):
        with open(fname, 'w') as f:
            f.write("""
import scrapy
from scrapy import log

class MySpider(scrapy.Spider):
    name = 'myspider'

    def start_requests(self):
        self.log("It Works!")
        self.logger.debug("It Works!")
        return []
""")
        p = self.proc('runspider', fname)
        log = p.stderr.read()
        self.assertIn("[myspider] DEBUG: It Works!", log)
        self.assertIn("[myspider] INFO: Spider opened", log)
        self.assertIn("[myspider] INFO: Closing spider (finished)", log)
        self.assertIn("[myspider] INFO: Spider closed (finished)", log)
        self.assertIn("DEBUG: It Works!", log)
        self.assertIn("INFO: Spider opened", log)
        self.assertIn("INFO: Closing spider (finished)", log)
        self.assertIn("INFO: Spider closed (finished)", log)

    def test_runspider_no_spider_found(self):
        tmpdir = self.mktemp()
@@ -159,7 +158,6 @@ class MySpider(scrapy.Spider):
        fname = abspath(join(tmpdir, 'myspider.py'))
        with open(fname, 'w') as f:
            f.write("""
from scrapy import log
from scrapy.spider import Spider
""")
        p = self.proc('runspider', fname)
@@ -192,7 +190,6 @@ class ParseCommandTest(ProcessTest, SiteTest, CommandTest):
        fname = abspath(join(self.proj_mod_path, 'spiders', 'myspider.py'))
        with open(fname, 'w') as f:
            f.write("""
from scrapy import log
import scrapy

class MySpider(scrapy.Spider):
@@ -200,20 +197,20 @@ class MySpider(scrapy.Spider):

    def parse(self, response):
        if getattr(self, 'test_arg', None):
            self.log('It Works!')
            self.logger.debug('It Works!')
        return [scrapy.Item(), dict(foo='bar')]
""".format(self.spider_name))

        fname = abspath(join(self.proj_mod_path, 'pipelines.py'))
        with open(fname, 'w') as f:
            f.write("""
from scrapy import log
import logging

class MyPipeline(object):
    component_name = 'my_pipeline'

    def process_item(self, item, spider):
        log.msg('It Works!')
        logging.info('It Works!')
        return item
""")

@@ -229,7 +226,7 @@ ITEM_PIPELINES = {'%s.pipelines.MyPipeline': 1}
            '-a', 'test_arg=1',
            '-c', 'parse',
            self.url('/html')])
        self.assertIn("[parse_spider] DEBUG: It Works!", stderr)
        self.assertIn("DEBUG: It Works!", stderr)

    @defer.inlineCallbacks
    def test_pipelines(self):
@@ -237,7 +234,7 @@ ITEM_PIPELINES = {'%s.pipelines.MyPipeline': 1}
            '--pipelines',
            '-c', 'parse',
            self.url('/html')])
        self.assertIn("[scrapy] INFO: It Works!", stderr)
        self.assertIn("INFO: It Works!", stderr)

    @defer.inlineCallbacks
    def test_parse_items(self):
@@ -1,8 +1,12 @@
import json
import socket
import logging

from testfixtures import LogCapture
from twisted.internet import defer
from twisted.trial.unittest import TestCase
from scrapy.utils.test import get_crawler, get_testlog

from scrapy.utils.test import get_crawler
from tests import mock
from tests.spiders import FollowAllSpider, DelaySpider, SimpleSpider, \
    BrokenStartRequestsSpider, SingleRequestSpider, DuplicateStartRequestsSpider
@@ -72,36 +76,47 @@ class CrawlTestCase(TestCase):
    @defer.inlineCallbacks
    def test_retry_503(self):
        crawler = get_crawler(SimpleSpider)
        yield crawler.crawl("http://localhost:8998/status?n=503")
        self._assert_retried()
        with LogCapture() as l:
            yield crawler.crawl("http://localhost:8998/status?n=503")
        self._assert_retried(l)

    @defer.inlineCallbacks
    def test_retry_conn_failed(self):
        crawler = get_crawler(SimpleSpider)
        yield crawler.crawl("http://localhost:65432/status?n=503")
        self._assert_retried()
        with LogCapture() as l:
            yield crawler.crawl("http://localhost:65432/status?n=503")
        self._assert_retried(l)

    @defer.inlineCallbacks
    def test_retry_dns_error(self):
        with mock.patch('socket.gethostbyname',
                        side_effect=socket.gaierror(-5, 'No address associated with hostname')):
            crawler = get_crawler(SimpleSpider)
            yield crawler.crawl("http://example.com/")
            self._assert_retried()
            with LogCapture() as l:
                yield crawler.crawl("http://example.com/")
            self._assert_retried(l)

    @defer.inlineCallbacks
    def test_start_requests_bug_before_yield(self):
        crawler = get_crawler(BrokenStartRequestsSpider)
        yield crawler.crawl(fail_before_yield=1)
        errors = self.flushLoggedErrors(ZeroDivisionError)
        self.assertEqual(len(errors), 1)
        with LogCapture('scrapy', level=logging.ERROR) as l:
            crawler = get_crawler(BrokenStartRequestsSpider)
            yield crawler.crawl(fail_before_yield=1)

        self.assertEqual(len(l.records), 1)
        record = l.records[0]
        self.assertIsNotNone(record.exc_info)
        self.assertIs(record.exc_info[0], ZeroDivisionError)

    @defer.inlineCallbacks
    def test_start_requests_bug_yielding(self):
        crawler = get_crawler(BrokenStartRequestsSpider)
        yield crawler.crawl(fail_yielding=1)
        errors = self.flushLoggedErrors(ZeroDivisionError)
        self.assertEqual(len(errors), 1)
        with LogCapture('scrapy', level=logging.ERROR) as l:
            crawler = get_crawler(BrokenStartRequestsSpider)
            yield crawler.crawl(fail_yielding=1)

        self.assertEqual(len(l.records), 1)
        record = l.records[0]
        self.assertIsNotNone(record.exc_info)
        self.assertIs(record.exc_info[0], ZeroDivisionError)

    @defer.inlineCallbacks
    def test_start_requests_lazyness(self):
@@ -145,28 +160,29 @@ foo body
with multiples lines
'''})
        crawler = get_crawler(SimpleSpider)
        yield crawler.crawl("http://localhost:8998/raw?{0}".format(query))
        log = get_testlog()
        self.assertEqual(log.count("Got response 200"), 1)
        with LogCapture() as l:
            yield crawler.crawl("http://localhost:8998/raw?{0}".format(query))
        self.assertEqual(str(l).count("Got response 200"), 1)

    @defer.inlineCallbacks
    def test_retry_conn_lost(self):
        # connection lost after receiving data
        crawler = get_crawler(SimpleSpider)
        yield crawler.crawl("http://localhost:8998/drop?abort=0")
        self._assert_retried()
        with LogCapture() as l:
            yield crawler.crawl("http://localhost:8998/drop?abort=0")
        self._assert_retried(l)

    @defer.inlineCallbacks
    def test_retry_conn_aborted(self):
        # connection lost before receiving data
        crawler = get_crawler(SimpleSpider)
        yield crawler.crawl("http://localhost:8998/drop?abort=1")
        self._assert_retried()
        with LogCapture() as l:
            yield crawler.crawl("http://localhost:8998/drop?abort=1")
        self._assert_retried(l)

    def _assert_retried(self):
        log = get_testlog()
        self.assertEqual(log.count("Retrying"), 2)
        self.assertEqual(log.count("Gave up retrying"), 1)
    def _assert_retried(self, log):
        self.assertEqual(str(log).count("Retrying"), 2)
        self.assertEqual(str(log).count("Gave up retrying"), 1)

    @defer.inlineCallbacks
    def test_referer_header(self):
@@ -1,169 +0,0 @@
from io import BytesIO

from twisted.python import log as txlog, failure
from twisted.trial import unittest

from scrapy import log
from scrapy.spider import Spider
from scrapy.settings import default_settings
from scrapy.utils.test import get_crawler

class LogTest(unittest.TestCase):

    def test_get_log_level(self):
        default_log_level = getattr(log, default_settings.LOG_LEVEL)
        self.assertEqual(log._get_log_level('WARNING'), log.WARNING)
        self.assertEqual(log._get_log_level(log.WARNING), log.WARNING)
        self.assertRaises(ValueError, log._get_log_level, object())

class ScrapyFileLogObserverTest(unittest.TestCase):

    level = log.INFO
    encoding = 'utf-8'

    def setUp(self):
        self.f = BytesIO()
        self.log_observer = log.ScrapyFileLogObserver(self.f, self.level, self.encoding)
        self.log_observer.start()

    def tearDown(self):
        self.flushLoggedErrors()
        self.log_observer.stop()

    def logged(self):
        return self.f.getvalue().strip()[25:]

    def first_log_line(self):
        logged = self.logged()
        return logged.splitlines()[0] if logged else ''

    def test_msg_basic(self):
        log.msg("Hello")
        self.assertEqual(self.logged(), "[scrapy] INFO: Hello")

    def test_msg_ignore_spider(self):
        spider = Spider("myspider")
        log.msg("Hello", spider=spider)
        self.failIf(self.logged())

    def test_msg_level1(self):
        log.msg("Hello", level=log.WARNING)
        self.assertEqual(self.logged(), "[scrapy] WARNING: Hello")

    def test_msg_level2(self):
        log.msg("Hello", log.WARNING)
        self.assertEqual(self.logged(), "[scrapy] WARNING: Hello")

    def test_msg_wrong_level(self):
        log.msg("Hello", level=9999)
        self.assertEqual(self.logged(), "[scrapy] NOLEVEL: Hello")

    def test_msg_encoding(self):
        log.msg(u"Price: \xa3100")
        self.assertEqual(self.logged(), "[scrapy] INFO: Price: \xc2\xa3100")

    def test_msg_ignore_level(self):
        log.msg("Hello", level=log.DEBUG)
        log.msg("World", level=log.INFO)
        self.assertEqual(self.logged(), "[scrapy] INFO: World")

    def test_msg_ignore_system(self):
        txlog.msg("Hello")
        self.failIf(self.logged())

    def test_msg_ignore_system_err(self):
        txlog.msg("Hello")
        self.failIf(self.logged())

    def test_err_noargs(self):
        try:
            a = 1/0
        except:
            log.err()
        self.assertIn('Traceback', self.logged())
        self.assertIn('ZeroDivisionError', self.logged())

    def test_err_why(self):
        log.err(TypeError("bad type"), "Wrong type")
        self.assertEqual(self.first_log_line(), "[scrapy] ERROR: Wrong type")
        self.assertIn('TypeError', self.logged())
        self.assertIn('bad type', self.logged())

    def test_error_outside_scrapy(self):
        """Scrapy logger should still print outside errors"""
        txlog.err(TypeError("bad type"), "Wrong type")
        self.assertEqual(self.first_log_line(), "[-] ERROR: Wrong type")
        self.assertIn('TypeError', self.logged())
        self.assertIn('bad type', self.logged())

    # this test fails in twisted trial observer, not in scrapy observer
    # def test_err_why_encoding(self):
    #     log.err(TypeError("bad type"), u"\xa3")
    #     self.assertEqual(self.first_log_line(), "[scrapy] ERROR: \xc2\xa3")

    def test_err_exc(self):
        log.err(TypeError("bad type"))
        self.assertIn('Unhandled Error', self.logged())
        self.assertIn('TypeError', self.logged())
        self.assertIn('bad type', self.logged())

    def test_err_failure(self):
        log.err(failure.Failure(TypeError("bad type")))
        self.assertIn('Unhandled Error', self.logged())
        self.assertIn('TypeError', self.logged())
        self.assertIn('bad type', self.logged())


class Latin1ScrapyFileLogObserverTest(ScrapyFileLogObserverTest):

    encoding = 'latin-1'

    def test_msg_encoding(self):
        log.msg(u"Price: \xa3100")
        logged = self.f.getvalue().strip()[25:]
        self.assertEqual(self.logged(), "[scrapy] INFO: Price: \xa3100")

    # this test fails in twisted trial observer, not in scrapy observer
    # def test_err_why_encoding(self):
    #     log.err(TypeError("bad type"), u"\xa3")
    #     self.assertEqual(self.first_log_line(), "[scrapy] ERROR: \xa3")


class CrawlerScrapyFileLogObserverTest(unittest.TestCase):

    def setUp(self):
        self.f = BytesIO()
        self.crawler = get_crawler(Spider)
        self.spider = self.crawler.spider = self.crawler._create_spider('test')
        self.log_observer = log.ScrapyFileLogObserver(self.f, log.INFO,
                                                      'utf-8', self.crawler)
        self.log_observer.start()

    def tearDown(self):
        self.flushLoggedErrors()
        self.log_observer.stop()

    def logged(self):
        return self.f.getvalue().strip()[25:]

    def test_msg_basic(self):
        log.msg("Hello", spider=self.spider)
        self.assertEqual(self.logged(), "[test] INFO: Hello")

    def test_msg_ignore_scrapy_channel(self):
        log.msg("Hello")
        self.failIf(self.logged())

    def test_msg_ignore_another_crawler(self):
        crawler = get_crawler(Spider)
        log.msg("Hello", spider=crawler._create_spider('test'))
        self.failIf(self.logged())

    def test_msg_stats_log(self):
        assert self.crawler.stats.get_value('log_count/INFO', 0) == 0
        log.msg("Hello", spider=self.spider)
        self.assertEqual(self.crawler.stats.get_value('log_count/INFO'), 1)


if __name__ == "__main__":
    unittest.main()
@@ -24,14 +24,14 @@ class LoggingContribTest(unittest.TestCase):
        req = Request("http://www.example.com")
        res = Response("http://www.example.com")
        logkws = self.formatter.crawled(req, res, self.spider)
        logline = logkws['format'] % logkws
        logline = logkws['msg'] % logkws['args']
        self.assertEqual(logline,
            "Crawled (200) <GET http://www.example.com> (referer: None)")

        req = Request("http://www.example.com", headers={'referer': 'http://example.com'})
        res = Response("http://www.example.com", flags=['cached'])
        logkws = self.formatter.crawled(req, res, self.spider)
        logline = logkws['format'] % logkws
        logline = logkws['msg'] % logkws['args']
        self.assertEqual(logline,
            "Crawled (200) <GET http://www.example.com> (referer: http://example.com) ['cached']")

@@ -40,7 +40,7 @@ class LoggingContribTest(unittest.TestCase):
        exception = Exception(u"\u2018")
        response = Response("http://www.example.com")
        logkws = self.formatter.dropped(item, exception, response, self.spider)
        logline = logkws['format'] % logkws
        logline = logkws['msg'] % logkws['args']
        lines = logline.splitlines()
        assert all(isinstance(x, unicode) for x in lines)
        self.assertEqual(lines, [u"Dropped: \u2018", '{}'])
@@ -50,7 +50,7 @@ class LoggingContribTest(unittest.TestCase):
        item['name'] = u'\xa3'
        response = Response("http://www.example.com")
        logkws = self.formatter.scraped(item, response, self.spider)
        logline = logkws['format'] % logkws
        logline = logkws['msg'] % logkws['args']
        lines = logline.splitlines()
        assert all(isinstance(x, unicode) for x in lines)
        self.assertEqual(lines, [u"Scraped from <200 http://www.example.com>", u'name: \xa3'])
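A note on how these dicts are meant to be consumed now that the old `format` key is deprecated in favour of `msg`/`args`: presumably a call site pairs them with the logformatter_adapter() helper added above, along the lines of the sketch below. The formatter, request, response, spider and logger names are placeholders, not code from this patch.

logkws = formatter.crawled(request, response, spider)    # returns {'level': ..., 'msg': ..., 'args': ...}
level, msg, args = logformatter_adapter(logkws)
logger.log(level, msg, args, extra={'spider': spider})   # a single dict argument feeds %(name)s templates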
@@ -1,9 +1,9 @@
from __future__ import print_function
from testfixtures import LogCapture
from twisted.trial import unittest
from twisted.python.failure import Failure
from twisted.internet import reactor
from twisted.internet.defer import Deferred, inlineCallbacks
from twisted.python import log as txlog

from scrapy.http import Request, Response
from scrapy.spider import Spider
@@ -11,7 +11,6 @@ from scrapy.utils.request import request_fingerprint
from scrapy.contrib.pipeline.media import MediaPipeline
from scrapy.utils.signal import disconnect_all
from scrapy import signals
from scrapy import log


def _mocked_download_func(request, info):
@@ -60,26 +59,21 @@ class BaseMediaPipelineTestCase(unittest.TestCase):
        fail = Failure(Exception())
        results = [(True, 1), (False, fail)]

        events = []
        txlog.addObserver(events.append)
        new_item = self.pipe.item_completed(results, item, self.info)
        txlog.removeObserver(events.append)
        self.flushLoggedErrors()
        with LogCapture() as l:
            new_item = self.pipe.item_completed(results, item, self.info)

        assert new_item is item
        assert len(events) == 1
        assert events[0]['logLevel'] == log.ERROR
        assert events[0]['failure'] is fail
        assert len(l.records) == 1
        record = l.records[0]
        assert record.levelname == 'ERROR'
        assert record.failure is fail

        # disable failure logging and check again
        self.pipe.LOG_FAILED_RESULTS = False
        events = []
        txlog.addObserver(events.append)
        new_item = self.pipe.item_completed(results, item, self.info)
        txlog.removeObserver(events.append)
        self.flushLoggedErrors()
        with LogCapture() as l:
            new_item = self.pipe.item_completed(results, item, self.info)
        assert new_item is item
        assert len(events) == 0
        assert len(l.records) == 0

    @inlineCallbacks
    def test_default_process_item(self):
@@ -5,10 +5,11 @@ import time
from threading import Thread
from libmproxy import controller, proxy
from netlib import http_auth
from testfixtures import LogCapture

from twisted.internet import defer
from twisted.trial.unittest import TestCase
from scrapy.utils.test import get_testlog, get_crawler
from scrapy.utils.test import get_crawler
from scrapy.http import Request
from tests.spiders import SimpleSpider, SingleRequestSpider
from tests.mockserver import MockServer
@@ -50,39 +51,44 @@ class ProxyConnectTestCase(TestCase):
    @defer.inlineCallbacks
    def test_https_connect_tunnel(self):
        crawler = get_crawler(SimpleSpider)
        yield crawler.crawl("https://localhost:8999/status?n=200")
        self._assert_got_response_code(200)
        with LogCapture() as l:
            yield crawler.crawl("https://localhost:8999/status?n=200")
        self._assert_got_response_code(200, l)

    @defer.inlineCallbacks
    def test_https_noconnect(self):
        os.environ['https_proxy'] = 'http://scrapy:scrapy@localhost:8888?noconnect'
        crawler = get_crawler(SimpleSpider)
        yield crawler.crawl("https://localhost:8999/status?n=200")
        self._assert_got_response_code(200)
        with LogCapture() as l:
            yield crawler.crawl("https://localhost:8999/status?n=200")
        self._assert_got_response_code(200, l)
        os.environ['https_proxy'] = 'http://scrapy:scrapy@localhost:8888'

    @defer.inlineCallbacks
    def test_https_connect_tunnel_error(self):
        crawler = get_crawler(SimpleSpider)
        yield crawler.crawl("https://localhost:99999/status?n=200")
        self._assert_got_tunnel_error()
        with LogCapture() as l:
            yield crawler.crawl("https://localhost:99999/status?n=200")
        self._assert_got_tunnel_error(l)

    @defer.inlineCallbacks
    def test_https_tunnel_auth_error(self):
        os.environ['https_proxy'] = 'http://wrong:wronger@localhost:8888'
        crawler = get_crawler(SimpleSpider)
        yield crawler.crawl("https://localhost:8999/status?n=200")
        with LogCapture() as l:
            yield crawler.crawl("https://localhost:8999/status?n=200")
        # The proxy returns a 407 error code but it does not reach the client;
        # he just sees a TunnelError.
        self._assert_got_tunnel_error()
        self._assert_got_tunnel_error(l)
        os.environ['https_proxy'] = 'http://scrapy:scrapy@localhost:8888'

    @defer.inlineCallbacks
    def test_https_tunnel_without_leak_proxy_authorization_header(self):
        request = Request("https://localhost:8999/echo")
        crawler = get_crawler(SingleRequestSpider)
        yield crawler.crawl(seed=request)
        self._assert_got_response_code(200)
        with LogCapture() as l:
            yield crawler.crawl(seed=request)
        self._assert_got_response_code(200, l)
        echo = json.loads(crawler.spider.meta['responses'][0].body)
        self.assertTrue('Proxy-Authorization' not in echo['headers'])

@@ -90,13 +96,12 @@ class ProxyConnectTestCase(TestCase):
    def test_https_noconnect_auth_error(self):
        os.environ['https_proxy'] = 'http://wrong:wronger@localhost:8888?noconnect'
        crawler = get_crawler(SimpleSpider)
        yield crawler.crawl("https://localhost:8999/status?n=200")
        self._assert_got_response_code(407)
        with LogCapture() as l:
            yield crawler.crawl("https://localhost:8999/status?n=200")
        self._assert_got_response_code(407, l)

    def _assert_got_response_code(self, code):
        log = get_testlog()
        self.assertEqual(log.count('Crawled (%d)' % code), 1)
    def _assert_got_response_code(self, code, log):
        self.assertEqual(str(log).count('Crawled (%d)' % code), 1)

    def _assert_got_tunnel_error(self):
        log = get_testlog()
        self.assertEqual(log.count('TunnelError'), 1)
    def _assert_got_tunnel_error(self, log):
        self.assertEqual(str(log).count('TunnelError'), 1)
@@ -2,6 +2,8 @@ import gzip
import inspect
import warnings
from io import BytesIO

from testfixtures import LogCapture
from twisted.trial import unittest

from scrapy import signals
@@ -102,6 +104,23 @@ class SpiderTest(unittest.TestCase):
        self.assertEqual(settings.get('TEST2'), 'spider')
        self.assertEqual(settings.get('TEST3'), 'project')

    def test_logger(self):
        spider = self.spider_class('example.com')
        with LogCapture() as l:
            spider.logger.info('test log msg')
            l.check(('example.com', 'INFO', 'test log msg'))

        record = l.records[0]
        self.assertIn('spider', record.__dict__)
        self.assertIs(record.spider, spider)

    def test_log(self):
        spider = self.spider_class('example.com')
        with mock.patch('scrapy.spider.Spider.logger') as mock_logger:
            spider.log('test log msg', 'INFO')
        mock_logger.log.assert_called_once_with('INFO', 'test log msg')


class InitSpiderTest(SpiderTest):

    spider_class = InitSpider
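test_logger above pins down two behaviours: records emitted through spider.logger carry the spider's name as the logger name, and the spider object itself rides along on the record. One way to get that with stdlib logging is a LoggerAdapter that injects the spider into extra; the class below is only a sketch of the idea under that assumption, not necessarily Scrapy's exact implementation.

import logging

class SpiderLoggerSketch(object):
    """Illustrative mixin: expose a per-spider logger as a property."""

    @property
    def logger(self):
        # logger named after the spider; every record gets record.spider set via `extra`
        logger = logging.getLogger(self.name)
        return logging.LoggerAdapter(logger, {'spider': self})

    def log(self, message, level=logging.DEBUG, **kw):
        # kept for backward compatibility; delegates to the adapter above
        self.logger.log(level, message, **kw)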
@@ -1,9 +1,10 @@
from unittest import TestCase

from testfixtures import LogCapture
from twisted.trial.unittest import TestCase as TrialTestCase
from twisted.internet import defer

from scrapy.utils.test import get_crawler, get_testlog
from scrapy.utils.test import get_crawler
from tests.mockserver import MockServer
from scrapy.http import Response, Request
from scrapy.spider import Spider
@@ -174,14 +175,13 @@ class TestHttpErrorMiddlewareIntegrational(TrialTestCase):
    @defer.inlineCallbacks
    def test_logging(self):
        crawler = get_crawler(_HttpErrorSpider)
        yield crawler.crawl(bypass_status_codes={402})
        # print(get_testlog())
        with LogCapture() as log:
            yield crawler.crawl(bypass_status_codes={402})
        self.assertEqual(crawler.spider.parsed, {'200', '402'})
        self.assertEqual(crawler.spider.skipped, {'402'})
        self.assertEqual(crawler.spider.failed, {'404', '500'})

        log = get_testlog()
        self.assertIn('Ignoring response <404', log)
        self.assertIn('Ignoring response <500', log)
        self.assertNotIn('Ignoring response <200', log)
        self.assertNotIn('Ignoring response <402', log)
        self.assertIn('Ignoring response <404', str(log))
        self.assertIn('Ignoring response <500', str(log))
        self.assertNotIn('Ignoring response <200', str(log))
        self.assertNotIn('Ignoring response <402', str(log))
@@ -74,7 +74,6 @@ class DeferUtilsTest(unittest.TestCase):
    def test_process_parallel_failure(self):
        d = process_parallel([cb1, cb_fail, cb3], 'res', 'v1', 'v2')
        self.failUnlessFailure(d, TypeError)
        self.flushLoggedErrors()
        return d
132
tests/test_utils_log.py
Normal file
@@ -0,0 +1,132 @@
# -*- coding: utf-8 -*-
from __future__ import print_function
import os
import sys
import logging
import unittest

from testfixtures import LogCapture
from twisted.python.failure import Failure

from scrapy.utils.log import (FailureFormatter, TopLevelFormatter,
                              LogCounterHandler, StreamLogger)
from scrapy.utils.test import get_crawler


class FailureFormatterTest(unittest.TestCase):

    def setUp(self):
        self.logger = logging.getLogger('test')
        self.filter = FailureFormatter()
        self.logger.addFilter(self.filter)

    def tearDown(self):
        self.logger.removeFilter(self.filter)

    def test_failure_format(self):
        with LogCapture() as l:
            try:
                0/0
            except ZeroDivisionError:
                self.logger.exception('test log msg')
                failure = Failure()

            self.logger.error('test log msg', extra={'failure': failure})

        self.assertEqual(len(l.records), 2)
        exc_record, failure_record = l.records
        self.assertTupleEqual(failure_record.exc_info, exc_record.exc_info)

        formatter = logging.Formatter()
        self.assertMultiLineEqual(formatter.format(failure_record),
                                  formatter.format(exc_record))

    def test_non_failure_format(self):
        with LogCapture() as l:
            self.logger.error('test log msg', extra={'failure': 3})

        self.assertEqual(len(l.records), 1)
        self.assertMultiLineEqual(l.records[0].getMessage(),
                                  'test log msg' + os.linesep + '3')


class TopLevelFormatterTest(unittest.TestCase):

    def setUp(self):
        self.handler = LogCapture()
        self.handler.addFilter(TopLevelFormatter(['test']))

    def test_top_level_logger(self):
        logger = logging.getLogger('test')
        with self.handler as l:
            logger.warning('test log msg')

        l.check(('test', 'WARNING', 'test log msg'))

    def test_children_logger(self):
        logger = logging.getLogger('test.test1')
        with self.handler as l:
            logger.warning('test log msg')

        l.check(('test', 'WARNING', 'test log msg'))

    def test_overlapping_name_logger(self):
        logger = logging.getLogger('test2')
        with self.handler as l:
            logger.warning('test log msg')

        l.check(('test2', 'WARNING', 'test log msg'))

    def test_different_name_logger(self):
        logger = logging.getLogger('different')
        with self.handler as l:
            logger.warning('test log msg')

        l.check(('different', 'WARNING', 'test log msg'))


class LogCounterHandlerTest(unittest.TestCase):

    def setUp(self):
        self.logger = logging.getLogger('test')
        self.logger.setLevel(logging.NOTSET)
        self.logger.propagate = False
        self.crawler = get_crawler(settings_dict={'LOG_LEVEL': 'WARNING'})
        self.handler = LogCounterHandler(self.crawler)
        self.logger.addHandler(self.handler)

    def tearDown(self):
        self.logger.propagate = True
        self.logger.removeHandler(self.handler)

    def test_init(self):
        self.assertIsNone(self.crawler.stats.get_value('log_count/DEBUG'))
        self.assertIsNone(self.crawler.stats.get_value('log_count/INFO'))
        self.assertIsNone(self.crawler.stats.get_value('log_count/WARNING'))
        self.assertIsNone(self.crawler.stats.get_value('log_count/ERROR'))
        self.assertIsNone(self.crawler.stats.get_value('log_count/CRITICAL'))

    def test_accepted_level(self):
        self.logger.error('test log msg')
        self.assertEqual(self.crawler.stats.get_value('log_count/ERROR'), 1)

    def test_filtered_out_level(self):
        self.logger.debug('test log msg')
        self.assertIsNone(self.crawler.stats.get_value('log_count/INFO'))


class StreamLoggerTest(unittest.TestCase):

    def setUp(self):
        self.stdout = sys.stdout
        logger = logging.getLogger('test')
        logger.setLevel(logging.WARNING)
        sys.stdout = StreamLogger(logger, logging.ERROR)

    def tearDown(self):
        sys.stdout = self.stdout

    def test_redirect(self):
        with LogCapture() as l:
            print('test log msg')
        l.check(('test', 'ERROR', 'test log msg'))
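TopLevelFormatterTest above pins down the observable behaviour: records from 'test' and 'test.test1' surface under the top-level name 'test', while 'test2' and 'different' are left alone. A filter along the lines below would satisfy those assertions; it is a guess at the shape of the class imported from scrapy.utils.log, not a copy of it.

import logging

class TopLevelFormatterSketch(logging.Filter):
    """Collapse child logger names ('test.test1') to their top-level prefix ('test')."""

    def __init__(self, loggers=None):
        self.loggers = loggers or []

    def filter(self, record):
        for name in self.loggers:
            if record.name == name or record.name.startswith(name + '.'):
                record.name = name
        return True   # never drops records, only rewrites the name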
@@ -1,11 +1,11 @@
from testfixtures import LogCapture
from twisted.trial import unittest
from twisted.python import log as txlog
from twisted.python.failure import Failure
from twisted.internet import defer, reactor

from scrapy.xlib.pydispatch import dispatcher
from scrapy.utils.signal import send_catch_log, send_catch_log_deferred
from scrapy import log


class SendCatchLogTest(unittest.TestCase):

@@ -14,26 +14,24 @@ class SendCatchLogTest(unittest.TestCase):
        test_signal = object()
        handlers_called = set()

        def log_received(event):
            handlers_called.add(log_received)
            assert "error_handler" in event['message'][0]
            assert event['logLevel'] == log.ERROR

        txlog.addObserver(log_received)
        dispatcher.connect(self.error_handler, signal=test_signal)
        dispatcher.connect(self.ok_handler, signal=test_signal)
        result = yield defer.maybeDeferred(self._get_result, test_signal, arg='test', \
            handlers_called=handlers_called)
        with LogCapture() as l:
            result = yield defer.maybeDeferred(
                self._get_result, test_signal, arg='test',
                handlers_called=handlers_called
            )

        assert self.error_handler in handlers_called
        assert self.ok_handler in handlers_called
        assert log_received in handlers_called
        self.assertEqual(len(l.records), 1)
        record = l.records[0]
        self.assertIn('error_handler', record.getMessage())
        self.assertEqual(record.levelname, 'ERROR')
        self.assertEqual(result[0][0], self.error_handler)
        self.assert_(isinstance(result[0][1], Failure))
        self.assertEqual(result[1], (self.ok_handler, "OK"))

        txlog.removeObserver(log_received)
        self.flushLoggedErrors()
        dispatcher.disconnect(self.error_handler, signal=test_signal)
        dispatcher.disconnect(self.ok_handler, signal=test_signal)

@@ -73,12 +71,9 @@ class SendCatchLogTest2(unittest.TestCase):
    def test_error_logged_if_deferred_not_supported(self):
        test_signal = object()
        test_handler = lambda: defer.Deferred()
        log_events = []
        txlog.addObserver(log_events.append)
        dispatcher.connect(test_handler, test_signal)
        send_catch_log(test_signal)
        self.assertTrue(log_events)
        self.assertIn("Cannot return deferreds from signal handler", str(log_events))
        txlog.removeObserver(log_events.append)
        self.flushLoggedErrors()
        with LogCapture() as l:
            send_catch_log(test_signal)
        self.assertEqual(len(l.records), 1)
        self.assertIn("Cannot return deferreds from signal handler", str(l))
        dispatcher.disconnect(test_handler, test_signal)