
Merge pull request #1060 from Curita/python-logging

[MRG+1] Python logging
Mikhail Korobov 2015-04-29 23:20:34 +05:00
commit fbb1078f58
69 changed files with 1224 additions and 853 deletions

View File

@ -1,6 +1,5 @@
import six
import pytest
from twisted.python import log
collect_ignore = ["scrapy/stats.py", "scrapy/project.py"]
@ -10,35 +9,6 @@ if six.PY3:
if len(file_path) > 0 and file_path[0] != '#':
collect_ignore.append(file_path)
class LogObservers:
"""Class for keeping track of log observers across test modules"""
def __init__(self):
self.observers = []
def add(self, logfile='test.log'):
fileobj = open(logfile, 'wb')
observer = log.FileLogObserver(fileobj)
log.startLoggingWithObserver(observer.emit, 0)
self.observers.append((fileobj, observer))
def remove(self):
fileobj, observer = self.observers.pop()
log.removeObserver(observer.emit)
fileobj.close()
@pytest.fixture(scope='module')
def log_observers():
return LogObservers()
@pytest.fixture()
def setlog(request, log_observers):
"""Attach test.log file observer to twisted log, for trial compatibility"""
log_observers.add()
request.addfinalizer(log_observers.remove)
@pytest.fixture()
def chdir(tmpdir):

View File

@ -119,7 +119,7 @@ Built-in services
topics/webservice
:doc:`topics/logging`
Understand the simple logging facility provided by Scrapy.
Learn how to use Python's builtin logging with Scrapy.
:doc:`topics/stats`
Collect statistics about your scraping crawler.

View File

@ -154,15 +154,14 @@ similar to this::
2014-01-23 18:13:07-0400 [scrapy] INFO: Enabled downloader middlewares: ...
2014-01-23 18:13:07-0400 [scrapy] INFO: Enabled spider middlewares: ...
2014-01-23 18:13:07-0400 [scrapy] INFO: Enabled item pipelines: ...
2014-01-23 18:13:07-0400 [dmoz] INFO: Spider opened
2014-01-23 18:13:08-0400 [dmoz] DEBUG: Crawled (200) <GET http://www.dmoz.org/Computers/Programming/Languages/Python/Resources/> (referer: None)
2014-01-23 18:13:09-0400 [dmoz] DEBUG: Crawled (200) <GET http://www.dmoz.org/Computers/Programming/Languages/Python/Books/> (referer: None)
2014-01-23 18:13:09-0400 [dmoz] INFO: Closing spider (finished)
2014-01-23 18:13:07-0400 [scrapy] INFO: Spider opened
2014-01-23 18:13:08-0400 [scrapy] DEBUG: Crawled (200) <GET http://www.dmoz.org/Computers/Programming/Languages/Python/Resources/> (referer: None)
2014-01-23 18:13:09-0400 [scrapy] DEBUG: Crawled (200) <GET http://www.dmoz.org/Computers/Programming/Languages/Python/Books/> (referer: None)
2014-01-23 18:13:09-0400 [scrapy] INFO: Closing spider (finished)
.. note::
The lines containing ``[dmoz]`` correspond to the log messages for
our spider. You can see a log line for each URL defined in ``start_urls``.
At the end you can see a log line for each URL defined in ``start_urls``.
Because these URLs are the starting ones, they have no referrers, which is
shown at the end of the log line, where it says ``(referer: None)``.
@ -268,7 +267,7 @@ This is what the shell looks like::
[ ... Scrapy log here ... ]
2014-01-23 17:11:42-0400 [default] DEBUG: Crawled (200) <GET http://www.dmoz.org/Computers/Programming/Languages/Python/Books/> (referer: None)
2014-01-23 17:11:42-0400 [scrapy] DEBUG: Crawled (200) <GET http://www.dmoz.org/Computers/Programming/Languages/Python/Books/> (referer: None)
[s] Available Scrapy objects:
[s] crawler <scrapy.crawler.Crawler object at 0x3636b50>
[s] item {}
@ -424,11 +423,11 @@ Spider would be like this::
Now crawling dmoz.org yields ``DmozItem`` objects::
[dmoz] DEBUG: Scraped from <200 http://www.dmoz.org/Computers/Programming/Languages/Python/Books/>
[scrapy] DEBUG: Scraped from <200 http://www.dmoz.org/Computers/Programming/Languages/Python/Books/>
{'desc': [u' - By David Mertz; Addison Wesley. Book in progress, full text, ASCII format. Asks for feedback. [author website, Gnosis Software, Inc.\n'],
'link': [u'http://gnosis.cx/TPiP/'],
'title': [u'Text Processing in Python']}
[dmoz] DEBUG: Scraped from <200 http://www.dmoz.org/Computers/Programming/Languages/Python/Books/>
[scrapy] DEBUG: Scraped from <200 http://www.dmoz.org/Computers/Programming/Languages/Python/Books/>
{'desc': [u' - By Sean McGrath; Prentice Hall PTR, 2000, ISBN 0130211192, has CD-ROM. Methods to build XML applications fast, Python tutorial, DOM and SAX, new Pyxie open source XML processing library. [Prentice Hall PTR]\n'],
'link': [u'http://www.informit.com/store/product.aspx?isbn=0130211192'],
'title': [u'XML Processing with Python']}

View File

@ -19,20 +19,20 @@ To run it use::
You should see an output like this::
2013-05-16 13:08:46-0300 [scrapy] INFO: Scrapy 0.17.0 started (bot: scrapybot)
2013-05-16 13:08:47-0300 [follow] INFO: Spider opened
2013-05-16 13:08:47-0300 [follow] INFO: Crawled 0 pages (at 0 pages/min), scraped 0 items (at 0 items/min)
2013-05-16 13:08:48-0300 [follow] INFO: Crawled 74 pages (at 4440 pages/min), scraped 0 items (at 0 items/min)
2013-05-16 13:08:49-0300 [follow] INFO: Crawled 143 pages (at 4140 pages/min), scraped 0 items (at 0 items/min)
2013-05-16 13:08:50-0300 [follow] INFO: Crawled 210 pages (at 4020 pages/min), scraped 0 items (at 0 items/min)
2013-05-16 13:08:51-0300 [follow] INFO: Crawled 274 pages (at 3840 pages/min), scraped 0 items (at 0 items/min)
2013-05-16 13:08:52-0300 [follow] INFO: Crawled 343 pages (at 4140 pages/min), scraped 0 items (at 0 items/min)
2013-05-16 13:08:53-0300 [follow] INFO: Crawled 410 pages (at 4020 pages/min), scraped 0 items (at 0 items/min)
2013-05-16 13:08:54-0300 [follow] INFO: Crawled 474 pages (at 3840 pages/min), scraped 0 items (at 0 items/min)
2013-05-16 13:08:55-0300 [follow] INFO: Crawled 538 pages (at 3840 pages/min), scraped 0 items (at 0 items/min)
2013-05-16 13:08:56-0300 [follow] INFO: Crawled 602 pages (at 3840 pages/min), scraped 0 items (at 0 items/min)
2013-05-16 13:08:57-0300 [follow] INFO: Closing spider (closespider_timeout)
2013-05-16 13:08:57-0300 [follow] INFO: Crawled 666 pages (at 3840 pages/min), scraped 0 items (at 0 items/min)
2013-05-16 13:08:57-0300 [follow] INFO: Dumping Scrapy stats:
2013-05-16 13:08:47-0300 [scrapy] INFO: Spider opened
2013-05-16 13:08:47-0300 [scrapy] INFO: Crawled 0 pages (at 0 pages/min), scraped 0 items (at 0 items/min)
2013-05-16 13:08:48-0300 [scrapy] INFO: Crawled 74 pages (at 4440 pages/min), scraped 0 items (at 0 items/min)
2013-05-16 13:08:49-0300 [scrapy] INFO: Crawled 143 pages (at 4140 pages/min), scraped 0 items (at 0 items/min)
2013-05-16 13:08:50-0300 [scrapy] INFO: Crawled 210 pages (at 4020 pages/min), scraped 0 items (at 0 items/min)
2013-05-16 13:08:51-0300 [scrapy] INFO: Crawled 274 pages (at 3840 pages/min), scraped 0 items (at 0 items/min)
2013-05-16 13:08:52-0300 [scrapy] INFO: Crawled 343 pages (at 4140 pages/min), scraped 0 items (at 0 items/min)
2013-05-16 13:08:53-0300 [scrapy] INFO: Crawled 410 pages (at 4020 pages/min), scraped 0 items (at 0 items/min)
2013-05-16 13:08:54-0300 [scrapy] INFO: Crawled 474 pages (at 3840 pages/min), scraped 0 items (at 0 items/min)
2013-05-16 13:08:55-0300 [scrapy] INFO: Crawled 538 pages (at 3840 pages/min), scraped 0 items (at 0 items/min)
2013-05-16 13:08:56-0300 [scrapy] INFO: Crawled 602 pages (at 3840 pages/min), scraped 0 items (at 0 items/min)
2013-05-16 13:08:57-0300 [scrapy] INFO: Closing spider (closespider_timeout)
2013-05-16 13:08:57-0300 [scrapy] INFO: Crawled 666 pages (at 3840 pages/min), scraped 0 items (at 0 items/min)
2013-05-16 13:08:57-0300 [scrapy] INFO: Dumping Scrapy stats:
{'downloader/request_bytes': 231508,
'downloader/request_count': 682,
'downloader/request_method_count/GET': 682,
@ -49,7 +49,7 @@ You should see an output like this::
'scheduler/enqueued': 12767,
'scheduler/enqueued/memory': 12767,
'start_time': datetime.datetime(2013, 5, 16, 16, 8, 47, 676539)}
2013-05-16 13:08:57-0300 [follow] INFO: Spider closed (closespider_timeout)
2013-05-16 13:08:57-0300 [scrapy] INFO: Spider closed (closespider_timeout)
That tells you that Scrapy is able to crawl about 3900 pages per minute on the
hardware where you run it. Note that this is a very simple spider intended to

View File

@ -132,16 +132,13 @@ Logging is another useful option for getting information about your spider run.
Although not as convenient, it comes with the advantage that the logs will be
available in all future runs should they be necessary again::
from scrapy import log
def parse_details(self, response):
item = response.meta.get('item', None)
if item:
# populate more `item` fields
return item
else:
self.log('No item received for %s' % response.url,
level=log.WARNING)
self.logger.warning('No item received for %s', response.url)
For more information, check the :ref:`topics-logging` section.

View File

@ -232,14 +232,14 @@ header) and all cookies received in responses (ie. ``Set-Cookie`` header).
Here's an example of a log with :setting:`COOKIES_DEBUG` enabled::
2011-04-06 14:35:10-0300 [diningcity] INFO: Spider opened
2011-04-06 14:35:10-0300 [diningcity] DEBUG: Sending cookies to: <GET http://www.diningcity.com/netherlands/index.html>
2011-04-06 14:35:10-0300 [scrapy] INFO: Spider opened
2011-04-06 14:35:10-0300 [scrapy] DEBUG: Sending cookies to: <GET http://www.diningcity.com/netherlands/index.html>
Cookie: clientlanguage_nl=en_EN
2011-04-06 14:35:14-0300 [diningcity] DEBUG: Received cookies from: <200 http://www.diningcity.com/netherlands/index.html>
2011-04-06 14:35:14-0300 [scrapy] DEBUG: Received cookies from: <200 http://www.diningcity.com/netherlands/index.html>
Set-Cookie: JSESSIONID=B~FA4DC0C496C8762AE4F1A620EAB34F38; Path=/
Set-Cookie: ip_isocode=US
Set-Cookie: clientlanguage_nl=en_EN; Expires=Thu, 07-Apr-2011 21:21:34 GMT; Path=/
2011-04-06 14:49:50-0300 [diningcity] DEBUG: Crawled (200) <GET http://www.diningcity.com/netherlands/index.html> (referer: None)
2011-04-06 14:49:50-0300 [scrapy] DEBUG: Crawled (200) <GET http://www.diningcity.com/netherlands/index.html> (referer: None)
[...]

View File

@ -102,9 +102,12 @@ number of items will be specified through the ``MYEXT_ITEMCOUNT`` setting.
Here is the code of such extension::
import logging
from scrapy import signals
from scrapy.exceptions import NotConfigured
logger = logging.getLogger(__name__)
class SpiderOpenCloseLogging(object):
def __init__(self, item_count):
@ -133,15 +136,15 @@ Here is the code of such extension::
return ext
def spider_opened(self, spider):
spider.log("opened spider %s" % spider.name)
logger.info("opened spider %s", spider.name)
def spider_closed(self, spider):
spider.log("closed spider %s" % spider.name)
logger.info("closed spider %s", spider.name)
def item_scraped(self, item, spider):
self.items_scraped += 1
if self.items_scraped % self.item_count == 0:
spider.log("scraped %d items" % self.items_scraped)
logger.info("scraped %d items", self.items_scraped)
.. _topics-extensions-ref:

View File

@ -4,126 +4,232 @@
Logging
=======
Scrapy provides a logging facility which can be used through the
:mod:`scrapy.log` module. The current underlying implementation uses `Twisted
logging`_ but this may change in the future.
.. note::
:mod:`scrapy.log` has been deprecated alongside its functions in favor of
explicit calls to Python's standard logging. Keep reading to learn more
about the new logging system.
.. _Twisted logging: http://twistedmatrix.com/documents/current/core/howto/logging.html
Scrapy uses `Python's builtin logging system
<https://docs.python.org/2/library/logging.html>`_ for event logging. We'll
provide some simple examples to get you started, but for more advanced
use cases it's strongly suggested to read its documentation thoroughly.
The logging service must be explicitly started through the
:func:`scrapy.log.start` function to catch the top level Scrapy's log messages.
On top of that, each crawler has its own independent log observer
(automatically attached when it's created) that intercepts its spider's log
messages.
Logging works out of the box, and can be configured to some extent with the
Scrapy settings listed in :ref:`topics-logging-settings`.
Scrapy calls :func:`scrapy.utils.log.configure_logging` to set some reasonable
defaults and handle the settings listed in :ref:`topics-logging-settings` when
running commands, so it's recommended to call it manually if you're running
Scrapy from scripts as described in :ref:`run-from-script`.
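For instance, a minimal sketch of what that explicit call could look like at the
top of a standalone script (using the project settings helper shown elsewhere in
this commit; adapt as needed)::

    from scrapy.utils.log import configure_logging
    from scrapy.utils.project import get_project_settings

    # Apply the LOG_* settings (LOG_LEVEL, LOG_FILE, ...) before any crawling starts
    settings = get_project_settings()
    configure_logging(settings)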
.. _topics-logging-levels:
Log levels
==========
Scrapy provides 5 logging levels:
Python's builtin logging defines 5 different levels to indicate the severity of
a given log message. Here are the standard ones, listed in decreasing order of severity:
1. :data:`~scrapy.log.CRITICAL` - for critical errors
2. :data:`~scrapy.log.ERROR` - for regular errors
3. :data:`~scrapy.log.WARNING` - for warning messages
4. :data:`~scrapy.log.INFO` - for informational messages
5. :data:`~scrapy.log.DEBUG` - for debugging messages
How to set the log level
========================
You can set the log level using the `--loglevel/-L` command line option, or
using the :setting:`LOG_LEVEL` setting.
1. ``logging.CRITICAL`` - for critical errors (highest severity)
2. ``logging.ERROR`` - for regular errors
3. ``logging.WARNING`` - for warning messages
4. ``logging.INFO`` - for informational messages
5. ``logging.DEBUG`` - for debugging messages (lowest severity)
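These levels also act as thresholds: a logger only processes messages at or
above its effective level. A minimal standard-library sketch of that filtering
(the ``basicConfig`` call is only there to make the example self-contained)::

    import logging

    logging.basicConfig(level=logging.INFO)  # emit INFO and above
    logging.debug("dropped: DEBUG is below the INFO threshold")
    logging.warning("shown: WARNING is above INFO")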
How to log messages
===================
Here's a quick example of how to log a message using the ``WARNING`` level::
Here's a quick example of how to log a message using the ``logging.WARNING``
level::
from scrapy import log
log.msg("This is a warning", level=log.WARNING)
import logging
logging.warning("This is a warning")
There are shortcuts for issuing log messages at any of the five standard levels,
and there's also a general ``logging.log`` method which takes a level as an
argument. If needed, the last example could be rewritten as::
import logging
logging.log(logging.WARNING, "This is a warning")
On top of that, you can create different "loggers" to encapsulate messages (for
example, a common practice is to create a different logger for every module).
These loggers can be configured independently, and they support hierarchical
structures.
The previous examples use the root logger behind the scenes, which is a top-level
logger to which all messages are propagated (unless otherwise specified). Using
the ``logging`` helpers is merely a shortcut for getting the root logger
explicitly, so this is also equivalent to the previous snippets::
import logging
logger = logging.getLogger()
logger.warning("This is a warning")
You can use a different logger just by retrieving it by name with the
``logging.getLogger`` function::
import logging
logger = logging.getLogger('mycustomlogger')
logger.warning("This is a warning")
Finally, you can ensure you have a custom logger for any module you're working on
by using the ``__name__`` variable, which is populated with the current module's
path::
import logging
logger = logging.getLogger(__name__)
logger.warning("This is a warning")
.. seealso::
Module logging, `HowTo <https://docs.python.org/2/howto/logging.html>`_
Basic Logging Tutorial
Module logging, `Loggers <https://docs.python.org/2/library/logging.html#logger-objects>`_
Further documentation on loggers
.. _topics-logging-from-spiders:
Logging from Spiders
====================
The recommended way to log from spiders is by using the Spider
:meth:`~scrapy.spider.Spider.log` method, which already populates the
``spider`` argument of the :func:`scrapy.log.msg` function. The other arguments
are passed directly to the :func:`~scrapy.log.msg` function.
Scrapy provides a :data:`~scrapy.spider.Spider.logger` within each Spider
instance, which can be accessed and used like this::
scrapy.log module
=================
import scrapy
.. module:: scrapy.log
:synopsis: Logging facility
class MySpider(scrapy.Spider):
.. function:: start(logfile=None, loglevel=None, logstdout=None)
name = 'myspider'
start_urls = ['http://scrapinghub.com']
Start the top level Scrapy logger. This must be called before actually
logging any top level messages (those logged using this module's
:func:`~scrapy.log.msg` function instead of the :meth:`Spider.log
<scrapy.spider.Spider.log>` method). Otherwise, messages logged before this
call will get lost.
def parse(self, response):
self.logger.info('Parse function called on %s', response.url)
:param logfile: the file path to use for logging output. If omitted, the
:setting:`LOG_FILE` setting will be used. If both are ``None``, the log
will be sent to standard error.
:type logfile: str
That logger is created using the Spider's name, but you can use any custom
Python logger you want. For example::
:param loglevel: the minimum logging level to log. Available values are:
:data:`CRITICAL`, :data:`ERROR`, :data:`WARNING`, :data:`INFO` and
:data:`DEBUG`.
import logging
import scrapy
:param logstdout: if ``True``, all standard output (and error) of your
application will be logged instead. For example if you "print 'hello'"
it will appear in the Scrapy log. If omitted, the :setting:`LOG_STDOUT`
setting will be used.
:type logstdout: boolean
logger = logging.getLogger('mycustomlogger')
.. function:: msg(message, level=INFO, spider=None)
class MySpider(scrapy.Spider):
Log a message
name = 'myspider'
start_urls = ['http://scrapinghub.com']
:param message: the message to log
:type message: str
def parse(self, response):
logger.info('Parse function called on %s', response.url)
:param level: the log level for this message. See
:ref:`topics-logging-levels`.
.. _topics-logging-configuration:
:param spider: the spider to use for logging this message. This parameter
should always be used when logging things related to a particular
spider.
:type spider: :class:`~scrapy.spider.Spider` object
Logging configuration
=====================
.. data:: CRITICAL
Loggers on their own don't manage how messages sent through them are displayed.
For this task, different "handlers" can be attached to any logger instance and
they will redirect those messages to appropriate destinations, such as the
standard output, files, emails, etc.
Log level for critical errors
By default, Scrapy sets and configures a handler for the root logger, based on
the settings below.
.. data:: ERROR
Log level for errors
.. data:: WARNING
Log level for warnings
.. data:: INFO
Log level for informational messages (recommended level for production
deployments)
.. data:: DEBUG
Log level for debugging messages (recommended level for development)
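As a rough sketch of the handler mechanism mentioned above (plain standard
library code, not Scrapy-specific; the file name is arbitrary)::

    import logging

    logger = logging.getLogger('mycustomlogger')
    handler = logging.FileHandler('mycustomlogger.log')  # write records to a file
    handler.setFormatter(
        logging.Formatter('%(asctime)s [%(name)s] %(levelname)s: %(message)s'))
    logger.addHandler(handler)
    logger.warning("This message ends up in mycustomlogger.log")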
.. _topics-logging-settings:
Logging settings
================
----------------
These settings can be used to configure the logging:
* :setting:`LOG_FILE`
* :setting:`LOG_ENABLED`
* :setting:`LOG_ENCODING`
* :setting:`LOG_FILE`
* :setting:`LOG_LEVEL`
* :setting:`LOG_FORMAT`
* :setting:`LOG_DATEFORMAT`
* :setting:`LOG_STDOUT`
The first few settings define a destination for the log messages. If
:setting:`LOG_FILE` is set, messages sent through the root logger will be
redirected to a file named :setting:`LOG_FILE` with encoding
:setting:`LOG_ENCODING`. If it is unset and :setting:`LOG_ENABLED` is ``True``, log
messages will be displayed on standard error. Lastly, if
:setting:`LOG_ENABLED` is ``False``, there won't be any visible log output.
:setting:`LOG_LEVEL` determines the minimum level of severity to display;
messages with lower severity will be filtered out. It can take any of the
levels listed in :ref:`topics-logging-levels`.
:setting:`LOG_FORMAT` and :setting:`LOG_DATEFORMAT` specify formatting strings
used as layouts for all messages. Those strings can contain any placeholders
listed in `logging's logrecord attributes docs
<https://docs.python.org/2/library/logging.html#logrecord-attributes>`_ and
`datetime's strftime and strptime directives
<https://docs.python.org/2/library/datetime.html#strftime-and-strptime-behavior>`_
respectively.
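For example, a hypothetical project could override these settings in its
``settings.py`` (the values below are illustrative, not the defaults)::

    # settings.py
    LOG_LEVEL = 'INFO'
    LOG_FORMAT = '%(asctime)s %(levelname)s: %(message)s'
    LOG_DATEFORMAT = '%Y-%m-%d %H:%M:%S'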
Command-line options
--------------------
There are command-line arguments, available for all commands, that you can use
to override some of the Scrapy settings regarding logging.
* ``--logfile FILE``
Overrides :setting:`LOG_FILE`
* ``--loglevel/-L LEVEL``
Overrides :setting:`LOG_LEVEL`
* ``--nolog``
Sets :setting:`LOG_ENABLED` to ``False``
.. seealso::
Module `logging.handlers <https://docs.python.org/2/library/logging.handlers.html>`_
Further documentation on available handlers
scrapy.utils.log module
=======================
.. module:: scrapy.utils.log
:synopsis: Logging utils
.. function:: configure_logging(settings=None)
This function initializes logging defaults for Scrapy.
It's automatically called when using Scrapy commands, but it is not when
running custom scripts. In that case, calling it explicitly isn't strictly
required, but it is recommended.
This function does:
- Route warnings and Twisted logging through Python's standard logging
- Set a filter on the Scrapy logger for formatting Twisted failures
- Assign DEBUG and ERROR levels to the Scrapy and Twisted loggers
respectively
If `settings` is not ``None``, it will also create a root handler based on
the settings listed in :ref:`topics-logging-settings`.
If you plan on configuring the handlers yourself, it is still recommended that
you call this function, keeping `settings` as ``None``. Bear in mind that there
won't be any log output set by default in that case.
To get you started on manually configuring logging's output, you can use
`logging.basicConfig()`_ to set a basic root handler. This is an example of
how to redirect messages of level ``INFO`` or higher to a file::
import logging
from scrapy.utils.log import configure_logging
configure_logging() # Note we aren't providing settings in this case
logging.basicConfig(filename='log.txt', format='%(levelname)s: %(message)s', level=logging.INFO)
Refer to :ref:`run-from-script` for more details about using Scrapy this
way.
:param settings: settings used to create and configure a handler for the
root logger.
:type settings: :class:`~scrapy.settings.Settings` object or ``None``
.. _logging.basicConfig(): https://docs.python.org/2/library/logging.html#logging.basicConfig

View File

@ -30,9 +30,12 @@ project as example.
from twisted.internet import reactor
from scrapy.crawler import CrawlerRunner
from scrapy.utils.log import configure_logging
from scrapy.utils.project import get_project_settings
runner = CrawlerRunner(get_project_settings())
settings = get_project_settings()
configure_logging(settings)
runner = CrawlerRunner(settings)
# 'followall' is the name of one of the spiders of the project.
d = runner.crawl('followall', domain='scrapinghub.com')
@ -54,11 +57,13 @@ the spider class as first argument in the :meth:`CrawlerRunner.crawl
from twisted.internet import reactor
import scrapy
from scrapy.crawler import CrawlerRunner
from scrapy.utils.log import configure_logging
class MySpider(scrapy.Spider):
# Your spider definition
...
configure_logging(settings)
runner = CrawlerRunner({
'USER_AGENT': 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)'
})
@ -85,9 +90,12 @@ Here is an example that runs multiple spiders simultaneously, using the
from twisted.internet import reactor, defer
from scrapy.crawler import CrawlerRunner
from scrapy.utils.log import configure_logging
from scrapy.utils.project import get_project_settings
runner = CrawlerRunner(get_project_settings())
settings = get_project_settings()
configure_logging(settings)
runner = CrawlerRunner(settings)
dfs = set()
for domain in ['scrapinghub.com', 'insophia.com']:
d = runner.crawl('followall', domain=domain)
@ -102,9 +110,12 @@ Same example but running the spiders sequentially by chaining the deferreds:
from twisted.internet import reactor, defer
from scrapy.crawler import CrawlerRunner
from scrapy.utils.log import configure_logging
from scrapy.utils.project import get_project_settings
runner = CrawlerRunner(get_project_settings())
settings = get_project_settings()
configure_logging(settings)
runner = CrawlerRunner(settings)
@defer.inlineCallbacks
def crawl():
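
For reference, here is a self-contained sketch assembling the pieces shown in
this file's hunks: configure logging, run a single spider with ``CrawlerRunner``
and stop the reactor when the crawl finishes (the ``reactor.stop`` wiring
follows the usual Twisted pattern and is assumed, since those lines fall outside
the hunks above)::

    from twisted.internet import reactor
    from scrapy.crawler import CrawlerRunner
    from scrapy.utils.log import configure_logging
    from scrapy.utils.project import get_project_settings

    settings = get_project_settings()
    configure_logging(settings)  # set up logging before starting any crawl
    runner = CrawlerRunner(settings)

    # 'followall' is the name of one of the spiders of the project.
    d = runner.crawl('followall', domain='scrapinghub.com')
    d.addBoth(lambda _: reactor.stop())  # stop the reactor once the crawl is done
    reactor.run()  # the script blocks here until the crawl finishes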

View File

@ -189,7 +189,7 @@ Example::
def parse_page2(self, response):
# this would log http://www.example.com/some_page.html
self.log("Visited %s" % response.url)
self.logger.info("Visited %s", response.url)
In some cases you may be interested in passing arguments to those callback
functions so you can receive the arguments later, in the second callback. You
@ -382,7 +382,7 @@ method for this job. Here's an example spider which uses it::
def after_login(self, response):
# check login succeed before going on
if "authentication failed" in response.body:
self.log("Login failed", level=log.ERROR)
self.logger.error("Login failed")
return
# continue scraping with authenticated session...

View File

@ -612,6 +612,31 @@ Default: ``None``
File name to use for logging output. If None, standard error will be used.
.. setting:: LOG_FORMAT
LOG_FORMAT
----------
Default: ``'%(asctime)s [%(name)s] %(levelname)s: %(message)s'``
String for formatting log messages. Refer to the `Python logging documentation`_ for the whole list of available
placeholders.
.. _Python logging documentation: https://docs.python.org/2/library/logging.html#logrecord-attributes
.. setting:: LOG_DATEFORMAT
LOG_DATEFORMAT
--------------
Default: ``'%Y-%m-%d %H:%M:%S%z'``
String for formatting date/time, expansion of the ``%(asctime)s`` placeholder
in :setting:`LOG_FORMAT`. Refer to the `Python datetime documentation`_ for the whole list of available
directives.
.. _Python datetime documentation: https://docs.python.org/2/library/datetime.html#strftime-and-strptime-behavior
.. setting:: LOG_LEVEL
LOG_LEVEL

View File

@ -187,8 +187,8 @@ Here's an example of how you would call it from your spider::
When you run the spider, you will get something similar to this::
2014-01-23 17:48:31-0400 [myspider] DEBUG: Crawled (200) <GET http://example.com> (referer: None)
2014-01-23 17:48:31-0400 [myspider] DEBUG: Crawled (200) <GET http://example.org> (referer: None)
2014-01-23 17:48:31-0400 [scrapy] DEBUG: Crawled (200) <GET http://example.com> (referer: None)
2014-01-23 17:48:31-0400 [scrapy] DEBUG: Crawled (200) <GET http://example.org> (referer: None)
[s] Available Scrapy objects:
[s] crawler <scrapy.crawler.Crawler object at 0x1e16b50>
...
@ -211,7 +211,7 @@ Finally you hit Ctrl-D (or Ctrl-Z in Windows) to exit the shell and resume the
crawling::
>>> ^D
2014-01-23 17:50:03-0400 [myspider] DEBUG: Crawled (200) <GET http://example.net> (referer: None)
2014-01-23 17:50:03-0400 [scrapy] DEBUG: Crawled (200) <GET http://example.net> (referer: None)
...
Note that you can't use the ``fetch`` shortcut here since the Scrapy engine is

View File

@ -112,6 +112,12 @@ scrapy.Spider
:class:`~scrapy.settings.Settings` instance, see the
:ref:`topics-settings` topic for a detailed introduction on this subject.
.. attribute:: logger
Python logger created with the Spider's :attr:`name`. You can use it to
send log messages as described in
:ref:`topics-logging-from-spiders`.
.. method:: from_crawler(crawler, \*args, \**kwargs)
This is the class method used by Scrapy to create your spiders.
@ -194,9 +200,9 @@ scrapy.Spider
.. method:: log(message, [level, component])
Log a message using the :func:`scrapy.log.msg` function, automatically
populating the spider argument with the :attr:`name` of this
spider. For more information see :ref:`topics-logging`.
Wrapper that sends a log message through the Spider's :attr:`logger`,
kept for backwards compatibility. For more information see
:ref:`topics-logging-from-spiders`.
.. method:: closed(reason)
@ -218,7 +224,7 @@ Let's see an example::
]
def parse(self, response):
self.log('A response from %s just arrived!' % response.url)
self.logger.info('A response from %s just arrived!', response.url)
Return multiple Requests and items from a single callback::
@ -406,7 +412,7 @@ Let's now take a look at an example CrawlSpider with rules::
)
def parse_item(self, response):
self.log('Hi, this is an item page! %s' % response.url)
self.logger.info('Hi, this is an item page! %s', response.url)
item = scrapy.Item()
item['id'] = response.xpath('//td[@id="item_id"]/text()').re(r'ID: (\d+)')
item['name'] = response.xpath('//td[@id="item_name"]/text()').extract()
@ -509,7 +515,6 @@ XMLFeedSpider example
These spiders are pretty easy to use, let's have a look at one example::
from scrapy import log
from scrapy.contrib.spiders import XMLFeedSpider
from myproject.items import TestItem
@ -521,7 +526,7 @@ These spiders are pretty easy to use, let's have a look at one example::
itertag = 'item'
def parse_node(self, response, node):
log.msg('Hi, this is a <%s> node!: %s' % (self.itertag, ''.join(node.extract())))
self.logger.info('Hi, this is a <%s> node!: %s', self.itertag, ''.join(node.extract()))
item = TestItem()
item['id'] = node.xpath('@id').extract()
@ -570,7 +575,6 @@ CSVFeedSpider example
Let's see an example similar to the previous one, but using a
:class:`CSVFeedSpider`::
from scrapy import log
from scrapy.contrib.spiders import CSVFeedSpider
from myproject.items import TestItem
@ -583,7 +587,7 @@ Let's see an example similar to the previous one, but using a
headers = ['id', 'name', 'description']
def parse_row(self, response, row):
log.msg('Hi, this is a row!: %r' % row)
self.logger.info('Hi, this is a row!: %r', row)
item = TestItem()
item['id'] = row['id']

View File

@ -1,5 +1,5 @@
[pytest]
usefixtures = chdir setlog
usefixtures = chdir
python_files=test_*.py __init__.py
addopts = --doctest-modules --assert=plain
twisted = 1

View File

@ -1,5 +1,8 @@
from __future__ import print_function
import logging
from w3lib.url import is_url
from scrapy.command import ScrapyCommand
from scrapy.http import Request
from scrapy.item import BaseItem
@ -7,7 +10,9 @@ from scrapy.utils import display
from scrapy.utils.conf import arglist_to_dict
from scrapy.utils.spider import iterate_spider_output, spidercls_for_request
from scrapy.exceptions import UsageError
from scrapy import log
logger = logging.getLogger(__name__)
class Command(ScrapyCommand):
@ -119,9 +124,9 @@ class Command(ScrapyCommand):
if rule.link_extractor.matches(response.url) and rule.callback:
return rule.callback
else:
log.msg(format='No CrawlSpider rules found in spider %(spider)r, '
'please specify a callback to use for parsing',
level=log.ERROR, spider=spider.name)
logger.error('No CrawlSpider rules found in spider %(spider)r, '
'please specify a callback to use for parsing',
{'spider': spider.name})
def set_spidercls(self, url, opts):
spider_loader = self.crawler_process.spider_loader
@ -129,13 +134,13 @@ class Command(ScrapyCommand):
try:
self.spidercls = spider_loader.load(opts.spider)
except KeyError:
log.msg(format='Unable to find spider: %(spider)s',
level=log.ERROR, spider=opts.spider)
logger.error('Unable to find spider: %(spider)s',
{'spider': opts.spider})
else:
self.spidercls = spidercls_for_request(spider_loader, Request(url))
if not self.spidercls:
log.msg(format='Unable to find spider for: %(url)s',
level=log.ERROR, url=url)
logger.error('Unable to find spider for: %(url)s',
{'url': url})
request = Request(url, opts.callback)
_start_requests = lambda s: [self.prepare_request(s, request, opts)]
@ -148,8 +153,8 @@ class Command(ScrapyCommand):
self.crawler_process.start()
if not self.first_response:
log.msg(format='No response downloaded for: %(url)s',
level=log.ERROR, url=url)
logger.error('No response downloaded for: %(url)s',
{'url': url})
def prepare_request(self, spider, request, opts):
def callback(response):
@ -170,8 +175,8 @@ class Command(ScrapyCommand):
if callable(cb_method):
cb = cb_method
else:
log.msg(format='Cannot find callback %(callback)r in spider: %(spider)s',
callback=callback, spider=spider.name, level=log.ERROR)
logger.error('Cannot find callback %(callback)r in spider: %(spider)s',
{'callback': callback, 'spider': spider.name})
return
# parse items and requests

View File

@ -9,7 +9,6 @@ from threading import Thread
from scrapy.command import ScrapyCommand
from scrapy.shell import Shell
from scrapy.http import Request
from scrapy import log
from scrapy.utils.spider import spidercls_for_request, DefaultSpider
@ -54,7 +53,6 @@ class Command(ScrapyCommand):
# The crawler is created this way since the Shell manually handles the
# crawling engine, so the set up in the crawl method won't work
crawler = self.crawler_process._create_crawler(spidercls)
self.crawler_process._setup_crawler_logging(crawler)
# The Shell class needs a persistent engine in the crawler
crawler.engine = crawler._create_engine()
crawler.engine.start()

View File

@ -6,13 +6,15 @@ See documentation in docs/topics/extensions.rst
import sys
import signal
import logging
import traceback
import threading
from pdb import Pdb
from scrapy.utils.engine import format_engine_status
from scrapy.utils.trackref import format_live_refs
from scrapy import log
logger = logging.getLogger(__name__)
class StackTraceDump(object):
@ -31,12 +33,14 @@ class StackTraceDump(object):
return cls(crawler)
def dump_stacktrace(self, signum, frame):
stackdumps = self._thread_stacks()
enginestatus = format_engine_status(self.crawler.engine)
liverefs = format_live_refs()
msg = "Dumping stack trace and engine status" \
"\n{0}\n{1}\n{2}".format(enginestatus, liverefs, stackdumps)
log.msg(msg)
log_args = {
'stackdumps': self._thread_stacks(),
'enginestatus': format_engine_status(self.crawler.engine),
'liverefs': format_live_refs(),
}
logger.info("Dumping stack trace and engine status\n"
"%(enginestatus)s\n%(liverefs)s\n%(stackdumps)s",
log_args, extra={'crawler': self.crawler})
def _thread_stacks(self):
id2name = dict((th.ident, th.name) for th in threading.enumerate())

View File

@ -1,14 +1,19 @@
# -*- coding: utf-8 -*-
from __future__ import absolute_import
import re
import logging
import six
from w3lib import html
from scrapy import log
from scrapy.exceptions import NotConfigured
from scrapy.http import HtmlResponse
from scrapy.utils.response import _noscript_re, _script_re
logger = logging.getLogger(__name__)
class AjaxCrawlMiddleware(object):
"""
Handle 'AJAX crawlable' pages marked as crawlable via meta tag.
@ -46,9 +51,9 @@ class AjaxCrawlMiddleware(object):
# scrapy already handles #! links properly
ajax_crawl_request = request.replace(url=request.url+'#!')
log.msg(format="Downloading AJAX crawlable %(ajax_crawl_request)s instead of %(request)s",
level=log.DEBUG, spider=spider,
ajax_crawl_request=ajax_crawl_request, request=request)
logger.debug("Downloading AJAX crawlable %(ajax_crawl_request)s instead of %(request)s",
{'ajax_crawl_request': ajax_crawl_request, 'request': request},
extra={'spider': spider})
ajax_crawl_request.meta['ajax_crawlable'] = True
return ajax_crawl_request

View File

@ -1,11 +1,13 @@
import os
import six
import logging
from collections import defaultdict
from scrapy.exceptions import NotConfigured
from scrapy.http import Response
from scrapy.http.cookies import CookieJar
from scrapy import log
logger = logging.getLogger(__name__)
class CookiesMiddleware(object):
@ -54,7 +56,7 @@ class CookiesMiddleware(object):
if cl:
msg = "Sending cookies to: %s" % request + os.linesep
msg += os.linesep.join("Cookie: %s" % c for c in cl)
log.msg(msg, spider=spider, level=log.DEBUG)
logger.debug(msg, extra={'spider': spider})
def _debug_set_cookie(self, response, spider):
if self.debug:
@ -62,7 +64,7 @@ class CookiesMiddleware(object):
if cl:
msg = "Received cookies from: %s" % response + os.linesep
msg += os.linesep.join("Set-Cookie: %s" % c for c in cl)
log.msg(msg, spider=spider, level=log.DEBUG)
logger.debug(msg, extra={'spider': spider})
def _format_cookie(self, cookie):
# build cookie string

View File

@ -1,11 +1,12 @@
""" This module implements the DecompressionMiddleware which tries to recognise
and extract the potentially compressed responses that may arrive.
and extract the potentially compressed responses that may arrive.
"""
import bz2
import gzip
import zipfile
import tarfile
import logging
from tempfile import mktemp
import six
@ -15,9 +16,10 @@ try:
except ImportError:
from io import BytesIO
from scrapy import log
from scrapy.responsetypes import responsetypes
logger = logging.getLogger(__name__)
class DecompressionMiddleware(object):
""" This middleware tries to recognise and extract the possibly compressed
@ -80,7 +82,7 @@ class DecompressionMiddleware(object):
for fmt, func in six.iteritems(self._formats):
new_response = func(response)
if new_response:
log.msg(format='Decompressed response with format: %(responsefmt)s',
level=log.DEBUG, spider=spider, responsefmt=fmt)
logger.debug('Decompressed response with format: %(responsefmt)s',
{'responsefmt': fmt}, extra={'spider': spider})
return new_response
return response

View File

@ -1,10 +1,12 @@
import logging
from six.moves.urllib.parse import urljoin
from scrapy import log
from scrapy.http import HtmlResponse
from scrapy.utils.response import get_meta_refresh
from scrapy.exceptions import IgnoreRequest, NotConfigured
logger = logging.getLogger(__name__)
class BaseRedirectMiddleware(object):
@ -32,13 +34,13 @@ class BaseRedirectMiddleware(object):
[request.url]
redirected.dont_filter = request.dont_filter
redirected.priority = request.priority + self.priority_adjust
log.msg(format="Redirecting (%(reason)s) to %(redirected)s from %(request)s",
level=log.DEBUG, spider=spider, request=request,
redirected=redirected, reason=reason)
logger.debug("Redirecting (%(reason)s) to %(redirected)s from %(request)s",
{'reason': reason, 'redirected': redirected, 'request': request},
extra={'spider': spider})
return redirected
else:
log.msg(format="Discarding %(request)s: max redirections reached",
level=log.DEBUG, spider=spider, request=request)
logger.debug("Discarding %(request)s: max redirections reached",
{'request': request}, extra={'spider': spider})
raise IgnoreRequest("max redirections reached")
def _redirect_request_using_get(self, request, redirect_url):

View File

@ -17,17 +17,19 @@ About HTTP errors to consider:
protocol. It's included by default because it's a common code used to
indicate server overload, which would be something we want to retry
"""
import logging
from twisted.internet import defer
from twisted.internet.error import TimeoutError, DNSLookupError, \
ConnectionRefusedError, ConnectionDone, ConnectError, \
ConnectionLost, TCPTimedOutError
from scrapy import log
from scrapy.exceptions import NotConfigured
from scrapy.utils.response import response_status_message
from scrapy.xlib.tx import ResponseFailed
logger = logging.getLogger(__name__)
class RetryMiddleware(object):
@ -66,13 +68,15 @@ class RetryMiddleware(object):
retries = request.meta.get('retry_times', 0) + 1
if retries <= self.max_retry_times:
log.msg(format="Retrying %(request)s (failed %(retries)d times): %(reason)s",
level=log.DEBUG, spider=spider, request=request, retries=retries, reason=reason)
logger.debug("Retrying %(request)s (failed %(retries)d times): %(reason)s",
{'request': request, 'retries': retries, 'reason': reason},
extra={'spider': spider})
retryreq = request.copy()
retryreq.meta['retry_times'] = retries
retryreq.dont_filter = True
retryreq.priority = request.priority + self.priority_adjust
return retryreq
else:
log.msg(format="Gave up retrying %(request)s (failed %(retries)d times): %(reason)s",
level=log.DEBUG, spider=spider, request=request, retries=retries, reason=reason)
logger.debug("Gave up retrying %(request)s (failed %(retries)d times): %(reason)s",
{'request': request, 'retries': retries, 'reason': reason},
extra={'spider': spider})

View File

@ -4,13 +4,16 @@ enable this middleware and enable the ROBOTSTXT_OBEY setting.
"""
import logging
from six.moves.urllib import robotparser
from scrapy import signals, log
from scrapy.exceptions import NotConfigured, IgnoreRequest
from scrapy.http import Request
from scrapy.utils.httpobj import urlparse_cached
logger = logging.getLogger(__name__)
class RobotsTxtMiddleware(object):
DOWNLOAD_PRIORITY = 1000
@ -32,8 +35,8 @@ class RobotsTxtMiddleware(object):
return
rp = self.robot_parser(request, spider)
if rp and not rp.can_fetch(self._useragent, request.url):
log.msg(format="Forbidden by robots.txt: %(request)s",
level=log.DEBUG, request=request)
logger.debug("Forbidden by robots.txt: %(request)s",
{'request': request}, extra={'spider': spider})
raise IgnoreRequest
def robot_parser(self, request, spider):
@ -54,8 +57,9 @@ class RobotsTxtMiddleware(object):
def _logerror(self, failure, request, spider):
if failure.type is not IgnoreRequest:
log.msg(format="Error downloading %%(request)s: %s" % failure.value,
level=log.ERROR, request=request, spider=spider)
logger.error("Error downloading %(request)s: %(f_exception)s",
{'request': request, 'f_exception': failure.value},
extra={'spider': spider, 'failure': failure})
def _parse_robots(self, response):
rp = robotparser.RobotFileParser(response.url)

View File

@ -4,7 +4,10 @@ Feed Exports extension
See documentation in docs/topics/feed-exports.rst
"""
import sys, os, posixpath
import os
import sys
import logging
import posixpath
from tempfile import TemporaryFile
from datetime import datetime
from six.moves.urllib.parse import urlparse
@ -14,12 +17,14 @@ from zope.interface import Interface, implementer
from twisted.internet import defer, threads
from w3lib.url import file_uri_to_path
from scrapy import log, signals
from scrapy import signals
from scrapy.utils.ftp import ftp_makedirs_cwd
from scrapy.exceptions import NotConfigured
from scrapy.utils.misc import load_object
from scrapy.utils.python import get_func_args
logger = logging.getLogger(__name__)
class IFeedStorage(Interface):
"""Interface that all Feed Storages must implement"""
@ -171,11 +176,15 @@ class FeedExporter(object):
if not slot.itemcount and not self.store_empty:
return
slot.exporter.finish_exporting()
logfmt = "%%s %s feed (%d items) in: %s" % (self.format, \
slot.itemcount, slot.uri)
logfmt = "%%s %(format)s feed (%(itemcount)d items) in: %(uri)s"
log_args = {'format': self.format,
'itemcount': slot.itemcount,
'uri': slot.uri}
d = defer.maybeDeferred(slot.storage.store, slot.file)
d.addCallback(lambda _: log.msg(logfmt % "Stored", spider=spider))
d.addErrback(log.err, logfmt % "Error storing", spider=spider)
d.addCallback(lambda _: logger.info(logfmt % "Stored", log_args,
extra={'spider': spider}))
d.addErrback(lambda f: logger.error(logfmt % "Error storing", log_args,
extra={'spider': spider, 'failure': f}))
return d
def item_scraped(self, item, spider):
@ -198,7 +207,7 @@ class FeedExporter(object):
def _exporter_supported(self, format):
if format in self.exporters:
return True
log.msg("Unknown feed format: %s" % format, log.ERROR)
logger.error("Unknown feed format: %(format)s", {'format': format})
def _storage_supported(self, uri):
scheme = urlparse(uri).scheme
@ -207,9 +216,11 @@ class FeedExporter(object):
self._get_storage(uri)
return True
except NotConfigured:
log.msg("Disabled feed storage scheme: %s" % scheme, log.ERROR)
logger.error("Disabled feed storage scheme: %(scheme)s",
{'scheme': scheme})
else:
log.msg("Unknown feed storage scheme: %s" % scheme, log.ERROR)
logger.error("Unknown feed storage scheme: %(scheme)s",
{'scheme': scheme})
def _get_exporter(self, *args, **kwargs):
return self.exporters[self.format](*args, **kwargs)

View File

@ -1,7 +1,11 @@
import logging
from twisted.internet import task
from scrapy.exceptions import NotConfigured
from scrapy import log, signals
from scrapy import signals
logger = logging.getLogger(__name__)
class LogStats(object):
@ -35,9 +39,12 @@ class LogStats(object):
irate = (items - self.itemsprev) * self.multiplier
prate = (pages - self.pagesprev) * self.multiplier
self.pagesprev, self.itemsprev = pages, items
msg = "Crawled %d pages (at %d pages/min), scraped %d items (at %d items/min)" \
% (pages, prate, items, irate)
log.msg(msg, spider=spider)
msg = ("Crawled %(pages)d pages (at %(pagerate)d pages/min), "
"scraped %(items)d items (at %(itemrate)d items/min)")
log_args = {'pages': pages, 'pagerate': prate,
'items': items, 'itemrate': irate}
logger.info(msg, log_args, extra={'spider': spider})
def spider_closed(self, spider, reason):
if self.task.running:

View File

@ -5,16 +5,20 @@ See documentation in docs/topics/extensions.rst
"""
import sys
import socket
import logging
from pprint import pformat
from importlib import import_module
from twisted.internet import task
from scrapy import signals, log
from scrapy import signals
from scrapy.exceptions import NotConfigured
from scrapy.mail import MailSender
from scrapy.utils.engine import get_engine_status
logger = logging.getLogger(__name__)
class MemoryUsage(object):
def __init__(self, crawler):
@ -74,8 +78,8 @@ class MemoryUsage(object):
if self.get_virtual_size() > self.limit:
self.crawler.stats.set_value('memusage/limit_reached', 1)
mem = self.limit/1024/1024
log.msg(format="Memory usage exceeded %(memusage)dM. Shutting down Scrapy...",
level=log.ERROR, memusage=mem)
logger.error("Memory usage exceeded %(memusage)dM. Shutting down Scrapy...",
{'memusage': mem}, extra={'crawler': self.crawler})
if self.notify_mails:
subj = "%s terminated: memory usage exceeded %dM at %s" % \
(self.crawler.settings['BOT_NAME'], mem, socket.gethostname())
@ -95,8 +99,8 @@ class MemoryUsage(object):
if self.get_virtual_size() > self.warning:
self.crawler.stats.set_value('memusage/warning_reached', 1)
mem = self.warning/1024/1024
log.msg(format="Memory usage reached %(memusage)dM",
level=log.WARNING, memusage=mem)
logger.warning("Memory usage reached %(memusage)dM",
{'memusage': mem}, extra={'crawler': self.crawler})
if self.notify_mails:
subj = "%s warning: memory usage reached %dM at %s" % \
(self.crawler.settings['BOT_NAME'], mem, socket.gethostname())

View File

@ -9,6 +9,7 @@ import os
import os.path
import rfc822
import time
import logging
from six.moves.urllib.parse import urlparse
from collections import defaultdict
import six
@ -20,12 +21,13 @@ except ImportError:
from twisted.internet import defer, threads
from scrapy import log
from scrapy.contrib.pipeline.media import MediaPipeline
from scrapy.exceptions import NotConfigured, IgnoreRequest
from scrapy.http import Request
from scrapy.utils.misc import md5sum
logger = logging.getLogger(__name__)
class FileException(Exception):
"""General media error exception"""
@ -192,9 +194,13 @@ class FilesPipeline(MediaPipeline):
return # returning None force download
referer = request.headers.get('Referer')
log.msg(format='File (uptodate): Downloaded %(medianame)s from %(request)s referred in <%(referer)s>',
level=log.DEBUG, spider=info.spider,
medianame=self.MEDIA_NAME, request=request, referer=referer)
logger.debug(
'File (uptodate): Downloaded %(medianame)s from %(request)s '
'referred in <%(referer)s>',
{'medianame': self.MEDIA_NAME, 'request': request,
'referer': referer},
extra={'spider': info.spider}
)
self.inc_stats(info.spider, 'uptodate')
checksum = result.get('checksum', None)
@ -203,17 +209,23 @@ class FilesPipeline(MediaPipeline):
path = self.file_path(request, info=info)
dfd = defer.maybeDeferred(self.store.stat_file, path, info)
dfd.addCallbacks(_onsuccess, lambda _: None)
dfd.addErrback(log.err, self.__class__.__name__ + '.store.stat_file')
dfd.addErrback(
lambda f:
logger.error(self.__class__.__name__ + '.store.stat_file',
extra={'spider': info.spider, 'failure': f})
)
return dfd
def media_failed(self, failure, request, info):
if not isinstance(failure.value, IgnoreRequest):
referer = request.headers.get('Referer')
log.msg(format='File (unknown-error): Error downloading '
'%(medianame)s from %(request)s referred in '
'<%(referer)s>: %(exception)s',
level=log.WARNING, spider=info.spider, exception=failure.value,
medianame=self.MEDIA_NAME, request=request, referer=referer)
logger.warning(
'File (unknown-error): Error downloading %(medianame)s from '
'%(request)s referred in <%(referer)s>: %(exception)s',
{'medianame': self.MEDIA_NAME, 'request': request,
'referer': referer, 'exception': failure.value},
extra={'spider': info.spider}
)
raise FileException
@ -221,34 +233,51 @@ class FilesPipeline(MediaPipeline):
referer = request.headers.get('Referer')
if response.status != 200:
log.msg(format='File (code: %(status)s): Error downloading file from %(request)s referred in <%(referer)s>',
level=log.WARNING, spider=info.spider,
status=response.status, request=request, referer=referer)
logger.warning(
'File (code: %(status)s): Error downloading file from '
'%(request)s referred in <%(referer)s>',
{'status': response.status,
'request': request, 'referer': referer},
extra={'spider': info.spider}
)
raise FileException('download-error')
if not response.body:
log.msg(format='File (empty-content): Empty file from %(request)s referred in <%(referer)s>: no-content',
level=log.WARNING, spider=info.spider,
request=request, referer=referer)
logger.warning(
'File (empty-content): Empty file from %(request)s referred '
'in <%(referer)s>: no-content',
{'request': request, 'referer': referer},
extra={'spider': info.spider}
)
raise FileException('empty-content')
status = 'cached' if 'cached' in response.flags else 'downloaded'
log.msg(format='File (%(status)s): Downloaded file from %(request)s referred in <%(referer)s>',
level=log.DEBUG, spider=info.spider,
status=status, request=request, referer=referer)
logger.debug(
'File (%(status)s): Downloaded file from %(request)s referred in '
'<%(referer)s>',
{'status': status, 'request': request, 'referer': referer},
extra={'spider': info.spider}
)
self.inc_stats(info.spider, status)
try:
path = self.file_path(request, response=response, info=info)
checksum = self.file_downloaded(response, request, info)
except FileException as exc:
whyfmt = 'File (error): Error processing file from %(request)s referred in <%(referer)s>: %(errormsg)s'
log.msg(format=whyfmt, level=log.WARNING, spider=info.spider,
request=request, referer=referer, errormsg=str(exc))
logger.warning(
'File (error): Error processing file from %(request)s '
'referred in <%(referer)s>: %(errormsg)s',
{'request': request, 'referer': referer, 'errormsg': str(exc)},
extra={'spider': info.spider}, exc_info=True
)
raise
except Exception as exc:
whyfmt = 'File (unknown-error): Error processing file from %(request)s referred in <%(referer)s>'
log.err(None, whyfmt % {'request': request, 'referer': referer}, spider=info.spider)
logger.exception(
'File (unknown-error): Error processing file from %(request)s '
'referred in <%(referer)s>',
{'request': request, 'referer': referer},
extra={'spider': info.spider}
)
raise FileException(str(exc))
return {'url': request.url, 'path': path, 'checksum': checksum}

View File

@ -1,13 +1,16 @@
from __future__ import print_function
import logging
from collections import defaultdict
from twisted.internet.defer import Deferred, DeferredList
from twisted.python.failure import Failure
from scrapy.utils.defer import mustbe_deferred, defer_result
from scrapy import log
from scrapy.utils.request import request_fingerprint
from scrapy.utils.misc import arg_to_iter
logger = logging.getLogger(__name__)
class MediaPipeline(object):
@ -66,7 +69,9 @@ class MediaPipeline(object):
dfd = mustbe_deferred(self.media_to_download, request, info)
dfd.addCallback(self._check_media_to_download, request, info)
dfd.addBoth(self._cache_result_and_execute_waiters, fp, info)
dfd.addErrback(log.err, spider=info.spider)
dfd.addErrback(lambda f: logger.error(
f.value, extra={'spider': info.spider, 'failure': f})
)
return dfd.addBoth(lambda _: wad) # it must return wad at last
def _check_media_to_download(self, result, request, info):
@ -117,8 +122,11 @@ class MediaPipeline(object):
def item_completed(self, results, item, info):
"""Called per item when all media requests has been processed"""
if self.LOG_FAILED_RESULTS:
msg = '%s found errors processing %s' % (self.__class__.__name__, item)
for ok, value in results:
if not ok:
log.err(value, msg, spider=info.spider)
logger.error(
'%(class)s found errors processing %(item)s',
{'class': self.__class__.__name__, 'item': item},
extra={'spider': info.spider, 'failure': value}
)
return item

View File

@ -4,9 +4,13 @@ Depth Spider Middleware
See documentation in docs/topics/spider-middleware.rst
"""
from scrapy import log
import logging
from scrapy.http import Request
logger = logging.getLogger(__name__)
class DepthMiddleware(object):
def __init__(self, maxdepth, stats=None, verbose_stats=False, prio=1):
@ -31,9 +35,9 @@ class DepthMiddleware(object):
if self.prio:
request.priority -= depth * self.prio
if self.maxdepth and depth > self.maxdepth:
log.msg(format="Ignoring link (depth > %(maxdepth)d): %(requrl)s ",
level=log.DEBUG, spider=spider,
maxdepth=self.maxdepth, requrl=request.url)
logger.debug("Ignoring link (depth > %(maxdepth)d): %(requrl)s ",
{'maxdepth': self.maxdepth, 'requrl': request.url},
extra={'spider': spider})
return False
elif self.stats:
if self.verbose_stats:

View File

@ -3,8 +3,12 @@ HttpError Spider Middleware
See documentation in docs/topics/spider-middleware.rst
"""
import logging
from scrapy.exceptions import IgnoreRequest
from scrapy import log
logger = logging.getLogger(__name__)
class HttpError(IgnoreRequest):
"""A non-200 response was filtered"""
@ -42,10 +46,8 @@ class HttpErrorMiddleware(object):
def process_spider_exception(self, response, exception, spider):
if isinstance(exception, HttpError):
log.msg(
format="Ignoring response %(response)r: HTTP status code is not handled or not allowed",
level=log.DEBUG,
spider=spider,
response=response
logger.debug(
"Ignoring response %(response)r: HTTP status code is not handled or not allowed",
{'response': response}, extra={'spider': spider},
)
return []

View File

@ -5,11 +5,13 @@ See documentation in docs/topics/spider-middleware.rst
"""
import re
import logging
from scrapy import signals
from scrapy.http import Request
from scrapy.utils.httpobj import urlparse_cached
from scrapy import log
logger = logging.getLogger(__name__)
class OffsiteMiddleware(object):
@ -31,8 +33,8 @@ class OffsiteMiddleware(object):
domain = urlparse_cached(x).hostname
if domain and domain not in self.domains_seen:
self.domains_seen.add(domain)
log.msg(format="Filtered offsite request to %(domain)r: %(request)s",
level=log.DEBUG, spider=spider, domain=domain, request=x)
logger.debug("Filtered offsite request to %(domain)r: %(request)s",
{'domain': domain, 'request': x}, extra={'spider': spider})
self.stats.inc_value('offsite/domains', spider=spider)
self.stats.inc_value('offsite/filtered', spider=spider)
else:

View File

@ -4,10 +4,14 @@ Url Length Spider Middleware
See documentation in docs/topics/spider-middleware.rst
"""
from scrapy import log
import logging
from scrapy.http import Request
from scrapy.exceptions import NotConfigured
logger = logging.getLogger(__name__)
class UrlLengthMiddleware(object):
def __init__(self, maxlength):
@ -23,9 +27,9 @@ class UrlLengthMiddleware(object):
def process_spider_output(self, response, result, spider):
def _filter(request):
if isinstance(request, Request) and len(request.url) > self.maxlength:
log.msg(format="Ignoring link (url length > %(maxlength)d): %(url)s ",
level=log.DEBUG, spider=spider,
maxlength=self.maxlength, url=request.url)
logger.debug("Ignoring link (url length > %(maxlength)d): %(url)s ",
{'maxlength': self.maxlength, 'url': request.url},
extra={'spider': spider})
return False
else:
return True

View File

@ -1,10 +1,13 @@
import re
import logging
from scrapy.spider import Spider
from scrapy.http import Request, XmlResponse
from scrapy.utils.sitemap import Sitemap, sitemap_urls_from_robots
from scrapy.utils.gz import gunzip, is_gzipped
from scrapy import log
logger = logging.getLogger(__name__)
class SitemapSpider(Spider):
@ -32,8 +35,8 @@ class SitemapSpider(Spider):
else:
body = self._get_sitemap_body(response)
if body is None:
log.msg(format="Ignoring invalid sitemap: %(response)s",
level=log.WARNING, spider=self, response=response)
logger.warning("Ignoring invalid sitemap: %(response)s",
{'response': response}, extra={'spider': self})
return
s = Sitemap(body)

View File

@ -1,7 +1,10 @@
import logging
from scrapy.exceptions import NotConfigured
from scrapy import signals
logger = logging.getLogger(__name__)
class AutoThrottle(object):
@ -47,9 +50,17 @@ class AutoThrottle(object):
diff = slot.delay - olddelay
size = len(response.body)
conc = len(slot.transferring)
msg = "slot: %s | conc:%2d | delay:%5d ms (%+d) | latency:%5d ms | size:%6d bytes" % \
(key, conc, slot.delay * 1000, diff * 1000, latency * 1000, size)
spider.log(msg, level=logging.INFO)
logger.info(
"slot: %(slot)s | conc:%(concurrency)2d | "
"delay:%(delay)5d ms (%(delaydiff)+d) | "
"latency:%(latency)5d ms | size:%(size)6d bytes",
{
'slot': key, 'concurrency': conc,
'delay': slot.delay * 1000, 'delaydiff': diff * 1000,
'latency': latency * 1000, 'size': size
},
extra={'spider': spider}
)
def _get_slot(self, request, spider):
key = request.meta.get('download_slot')

View File

@ -1,7 +1,7 @@
"""Download handlers for http and https schemes"""
import re
import logging
from io import BytesIO
from time import time
from six.moves.urllib.parse import urldefrag
@ -19,7 +19,9 @@ from scrapy.http import Headers
from scrapy.responsetypes import responsetypes
from scrapy.core.downloader.webclient import _parse
from scrapy.utils.misc import load_object
from scrapy import log, twisted_version
from scrapy import twisted_version
logger = logging.getLogger(__name__)
class HTTP11DownloadHandler(object):
@ -237,14 +239,16 @@ class ScrapyAgent(object):
expected_size = txresponse.length if txresponse.length != UNKNOWN_LENGTH else -1
if maxsize and expected_size > maxsize:
log.msg("Expected response size (%s) larger than download max size (%s)." % (expected_size, maxsize),
logLevel=log.ERROR)
logger.error("Expected response size (%(size)s) larger than "
"download max size (%(maxsize)s).",
{'size': expected_size, 'maxsize': maxsize})
txresponse._transport._producer.loseConnection()
raise defer.CancelledError()
if warnsize and expected_size > warnsize:
log.msg("Expected response size (%s) larger than downlod warn size (%s)." % (expected_size, warnsize),
logLevel=log.WARNING)
logger.warning("Expected response size (%(size)s) larger than "
"download warn size (%(warnsize)s).",
{'size': expected_size, 'warnsize': warnsize})
def _cancel(_):
txresponse._transport._producer.loseConnection()
@ -295,13 +299,17 @@ class _ResponseReader(protocol.Protocol):
self._bytes_received += len(bodyBytes)
if self._maxsize and self._bytes_received > self._maxsize:
log.msg("Received (%s) bytes larger than download max size (%s)." % (self._bytes_received, self._maxsize),
logLevel=log.ERROR)
logger.error("Received (%(bytes)s) bytes larger than download "
"max size (%(maxsize)s).",
{'bytes': self._bytes_received,
'maxsize': self._maxsize})
self._finished.cancel()
if self._warnsize and self._bytes_received > self._warnsize:
log.msg("Received (%s) bytes larger than download warn size (%s)." % (self._bytes_received, self._warnsize),
logLevel=log.WARNING)
logger.warning("Received (%(bytes)s) bytes larger than download "
"warn size (%(warnsize)s).",
{'bytes': self._bytes_received,
'warnsize': self._warnsize})
def connectionLost(self, reason):
if self._finished.called:

View File

@ -4,17 +4,21 @@ This is the Scrapy engine which controls the Scheduler, Downloader and Spiders.
For more information see docs/topics/architecture.rst
"""
import logging
from time import time
from twisted.internet import defer
from twisted.python.failure import Failure
from scrapy import log, signals
from scrapy import signals
from scrapy.core.scraper import Scraper
from scrapy.exceptions import DontCloseSpider
from scrapy.http import Response, Request
from scrapy.utils.misc import load_object
from scrapy.utils.reactor import CallLaterOnce
from scrapy.utils.log import logformatter_adapter
logger = logging.getLogger(__name__)
class Slot(object):
@ -106,10 +110,10 @@ class ExecutionEngine(object):
request = next(slot.start_requests)
except StopIteration:
slot.start_requests = None
except Exception as exc:
except Exception:
slot.start_requests = None
log.err(None, 'Obtaining request from start requests', \
spider=spider)
logger.exception('Error while obtaining start requests',
extra={'spider': spider})
else:
self.crawl(request, spider)
@ -130,11 +134,14 @@ class ExecutionEngine(object):
return
d = self._download(request, spider)
d.addBoth(self._handle_downloader_output, request, spider)
d.addErrback(log.msg, spider=spider)
d.addErrback(lambda f: logger.info('Error while handling downloader output',
extra={'spider': spider, 'failure': f}))
d.addBoth(lambda _: slot.remove_request(request))
d.addErrback(log.msg, spider=spider)
d.addErrback(lambda f: logger.info('Error while removing request from slot',
extra={'spider': spider, 'failure': f}))
d.addBoth(lambda _: slot.nextcall.schedule())
d.addErrback(log.msg, spider=spider)
d.addErrback(lambda f: logger.info('Error while scheduling new request',
extra={'spider': spider, 'failure': f}))
return d
def _handle_downloader_output(self, response, request, spider):
@ -145,7 +152,8 @@ class ExecutionEngine(object):
return
# response is a Response or Failure
d = self.scraper.enqueue_scrape(response, request, spider)
d.addErrback(log.err, spider=spider)
d.addErrback(lambda f: logger.error('Error while enqueuing downloader output',
extra={'spider': spider, 'failure': f}))
return d
def spider_is_idle(self, spider):
@ -197,7 +205,7 @@ class ExecutionEngine(object):
if isinstance(response, Response):
response.request = request # tie request to response received
logkws = self.logformatter.crawled(request, response, spider)
log.msg(spider=spider, **logkws)
logger.log(*logformatter_adapter(logkws), extra={'spider': spider})
self.signals.send_catch_log(signal=signals.response_received, \
response=response, request=request, spider=spider)
return response
@ -215,7 +223,7 @@ class ExecutionEngine(object):
def open_spider(self, spider, start_requests=(), close_if_idle=True):
assert self.has_capacity(), "No free spider slot when opening %r" % \
spider.name
log.msg("Spider opened", spider=spider)
logger.info("Spider opened", extra={'spider': spider})
nextcall = CallLaterOnce(self._next_request, spider)
scheduler = self.scheduler_cls.from_crawler(self.crawler)
start_requests = yield self.scraper.spidermw.process_start_requests(start_requests, spider)
@ -252,33 +260,42 @@ class ExecutionEngine(object):
slot = self.slot
if slot.closing:
return slot.closing
log.msg(format="Closing spider (%(reason)s)", reason=reason, spider=spider)
logger.info("Closing spider (%(reason)s)",
{'reason': reason},
extra={'spider': spider})
dfd = slot.close()
def log_failure(msg):
def errback(failure):
logger.error(msg, extra={'spider': spider, 'failure': failure})
return errback
dfd.addBoth(lambda _: self.downloader.close())
dfd.addErrback(log.err, spider=spider)
dfd.addErrback(log_failure('Downloader close failure'))
dfd.addBoth(lambda _: self.scraper.close_spider(spider))
dfd.addErrback(log.err, spider=spider)
dfd.addErrback(log_failure('Scraper close failure'))
dfd.addBoth(lambda _: slot.scheduler.close(reason))
dfd.addErrback(log.err, spider=spider)
dfd.addErrback(log_failure('Scheduler close failure'))
dfd.addBoth(lambda _: self.signals.send_catch_log_deferred(
signal=signals.spider_closed, spider=spider, reason=reason))
dfd.addErrback(log.err, spider=spider)
dfd.addErrback(log_failure('Error while sending spider_close signal'))
dfd.addBoth(lambda _: self.crawler.stats.close_spider(spider, reason=reason))
dfd.addErrback(log.err, spider=spider)
dfd.addErrback(log_failure('Stats close failure'))
dfd.addBoth(lambda _: log.msg(format="Spider closed (%(reason)s)", reason=reason, spider=spider))
dfd.addBoth(lambda _: logger.info("Spider closed (%(reason)s)",
{'reason': reason},
extra={'spider': spider}))
dfd.addBoth(lambda _: setattr(self, 'slot', None))
dfd.addErrback(log.err, spider=spider)
dfd.addErrback(log_failure('Error while unassigning slot'))
dfd.addBoth(lambda _: setattr(self, 'spider', None))
dfd.addErrback(log.err, spider=spider)
dfd.addErrback(log_failure('Error while unassigning spider'))
dfd.addBoth(lambda _: self._spider_closed_callback(spider))
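
The errbacks above replace log.err/log.msg with logger calls that carry the Twisted Failure in `extra`, where the FailureFormatter filter added later in this commit (scrapy/utils/log.py) can promote it to exc_info. A rough sketch of that errback style, with a made-up failing function and logger name:

import logging
from twisted.internet import defer

logging.basicConfig(level=logging.DEBUG)
logger = logging.getLogger('scrapy.demo')

def boom():
    raise RuntimeError('downloader blew up')

d = defer.maybeDeferred(boom)
# Same shape as the errbacks above: log the problem and hand the Failure over
# in `extra`; returning None from the errback also silences it, as the engine does.
d.addErrback(lambda f: logger.info('Error while handling downloader output',
                                   extra={'failure': f}))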

View File

@ -1,12 +1,15 @@
import os
import json
import logging
from os.path import join, exists
from queuelib import PriorityQueue
from scrapy.utils.reqser import request_to_dict, request_from_dict
from scrapy.utils.misc import load_object
from scrapy.utils.job import job_dir
from scrapy import log
logger = logging.getLogger(__name__)
class Scheduler(object):
@ -80,9 +83,9 @@ class Scheduler(object):
self.dqs.push(reqd, -request.priority)
except ValueError as e: # non serializable request
if self.logunser:
log.msg(format="Unable to serialize request: %(request)s - reason: %(reason)s",
level=log.ERROR, spider=self.spider,
request=request, reason=e)
logger.exception("Unable to serialize request: %(request)s - reason: %(reason)s",
{'request': request, 'reason': e},
extra={'spider': self.spider})
return
else:
return True
@ -111,8 +114,8 @@ class Scheduler(object):
prios = ()
q = PriorityQueue(self._newdq, startprios=prios)
if q:
log.msg(format="Resuming crawl (%(queuesize)d requests scheduled)",
spider=self.spider, queuesize=len(q))
logger.info("Resuming crawl (%(queuesize)d requests scheduled)",
{'queuesize': len(q)}, extra={'spider': self.spider})
return q
def _dqdir(self, jobdir):

View File

@ -1,6 +1,7 @@
"""This module implements the Scraper component which parses responses and
extracts information from them"""
import logging
from collections import deque
from twisted.python.failure import Failure
@ -9,12 +10,14 @@ from twisted.internet import defer
from scrapy.utils.defer import defer_result, defer_succeed, parallel, iter_errback
from scrapy.utils.spider import iterate_spider_output
from scrapy.utils.misc import load_object
from scrapy.utils.log import logformatter_adapter
from scrapy.exceptions import CloseSpider, DropItem, IgnoreRequest
from scrapy import signals
from scrapy.http import Request, Response
from scrapy.item import BaseItem
from scrapy.core.spidermw import SpiderMiddlewareManager
from scrapy import log
logger = logging.getLogger(__name__)
class Slot(object):
@ -102,7 +105,9 @@ class Scraper(object):
return _
dfd.addBoth(finish_scraping)
dfd.addErrback(
log.err, 'Scraper bug processing %s' % request, spider=spider)
lambda f: logger.error('Scraper bug processing %(request)s',
{'request': request},
extra={'spider': spider, 'failure': f}))
self._scrape_next(spider, slot)
return dfd
@ -145,10 +150,10 @@ class Scraper(object):
self.crawler.engine.close_spider(spider, exc.reason or 'cancelled')
return
referer = request.headers.get('Referer')
log.err(
_failure,
"Spider error processing %s (referer: %s)" % (request, referer),
spider=spider
logger.error(
"Spider error processing %(request)s (referer: %(referer)s)",
{'request': request, 'referer': referer},
extra={'spider': spider, 'failure': _failure}
)
self.signals.send_catch_log(
signal=signals.spider_error,
@ -183,9 +188,10 @@ class Scraper(object):
pass
else:
typename = type(output).__name__
log.msg(format='Spider must return Request, BaseItem, dict or None, '
'got %(typename)r in %(request)s',
level=log.ERROR, spider=spider, request=request, typename=typename)
logger.error('Spider must return Request, BaseItem, dict or None, '
'got %(typename)r in %(request)s',
{'request': request, 'typename': typename},
extra={'spider': spider})
def _log_download_errors(self, spider_failure, download_failure, request, spider):
"""Log and silence errors that come from the engine (typically download
@ -194,14 +200,15 @@ class Scraper(object):
if (isinstance(download_failure, Failure) and
not download_failure.check(IgnoreRequest)):
if download_failure.frames:
log.err(download_failure, 'Error downloading %s' % request,
spider=spider)
logger.error('Error downloading %(request)s',
{'request': request},
extra={'spider': spider, 'failure': download_failure})
else:
errmsg = download_failure.getErrorMessage()
if errmsg:
log.msg(format='Error downloading %(request)s: %(errmsg)s',
level=log.ERROR, spider=spider, request=request,
errmsg=errmsg)
logger.error('Error downloading %(request)s: %(errmsg)s',
{'request': request, 'errmsg': errmsg},
extra={'spider': spider})
if spider_failure is not download_failure:
return spider_failure
@ -214,15 +221,16 @@ class Scraper(object):
ex = output.value
if isinstance(ex, DropItem):
logkws = self.logformatter.dropped(item, ex, response, spider)
log.msg(spider=spider, **logkws)
logger.log(*logformatter_adapter(logkws), extra={'spider': spider})
return self.signals.send_catch_log_deferred(
signal=signals.item_dropped, item=item, response=response,
spider=spider, exception=output.value)
else:
log.err(output, 'Error processing %s' % item, spider=spider)
logger.error('Error processing %(item)s', {'item': item},
extra={'spider': spider, 'failure': output})
else:
logkws = self.logformatter.scraped(output, response, spider)
log.msg(spider=spider, **logkws)
logger.log(*logformatter_adapter(logkws), extra={'spider': spider})
return self.signals.send_catch_log_deferred(
signal=signals.item_scraped, item=output, response=response,
spider=spider)

View File

@ -1,5 +1,6 @@
import six
import signal
import logging
import warnings
from twisted.internet import reactor, defer
@ -14,7 +15,10 @@ from scrapy.signalmanager import SignalManager
from scrapy.exceptions import ScrapyDeprecationWarning
from scrapy.utils.ossignal import install_shutdown_handlers, signal_names
from scrapy.utils.misc import load_object
from scrapy import log, signals
from scrapy.utils.log import LogCounterHandler, configure_logging, log_scrapy_info
from scrapy import signals
logger = logging.getLogger(__name__)
class Crawler(object):
@ -28,6 +32,12 @@ class Crawler(object):
self.signals = SignalManager(self)
self.stats = load_object(self.settings['STATS_CLASS'])(self)
handler = LogCounterHandler(self, level=settings.get('LOG_LEVEL'))
logging.root.addHandler(handler)
self.signals.connect(lambda: logging.root.removeHandler(handler),
signals.engine_stopped)
lf_cls = load_object(self.settings['LOG_FORMATTER'])
self.logformatter = lf_cls.from_crawler(self)
self.extensions = ExtensionManager.from_crawler(self)
@ -99,7 +109,6 @@ class CrawlerRunner(object):
crawler = crawler_or_spidercls
if not isinstance(crawler_or_spidercls, Crawler):
crawler = self._create_crawler(crawler_or_spidercls)
self._setup_crawler_logging(crawler)
self.crawlers.add(crawler)
d = crawler.crawl(*args, **kwargs)
@ -117,11 +126,6 @@ class CrawlerRunner(object):
spidercls = self.spider_loader.load(spidercls)
return Crawler(spidercls, self.settings)
def _setup_crawler_logging(self, crawler):
log_observer = log.start_from_crawler(crawler)
if log_observer:
crawler.signals.connect(log_observer.stop, signals.engine_stopped)
def stop(self):
return defer.DeferredList([c.stop() for c in list(self.crawlers)])
@ -139,22 +143,21 @@ class CrawlerProcess(CrawlerRunner):
super(CrawlerProcess, self).__init__(settings)
install_shutdown_handlers(self._signal_shutdown)
self.stopping = False
self.log_observer = log.start_from_settings(self.settings)
log.scrapy_info(settings)
configure_logging(settings)
log_scrapy_info(settings)
def _signal_shutdown(self, signum, _):
install_shutdown_handlers(self._signal_kill)
signame = signal_names[signum]
log.msg(format="Received %(signame)s, shutting down gracefully. Send again to force ",
level=log.INFO, signame=signame)
logger.info("Received %(signame)s, shutting down gracefully. Send again to force ",
{'signame': signame})
reactor.callFromThread(self.stop)
def _signal_kill(self, signum, _):
install_shutdown_handlers(signal.SIG_IGN)
signame = signal_names[signum]
log.msg(format='Received %(signame)s twice, forcing unclean shutdown',
level=log.INFO, signame=signame)
self._stop_logging()
logger.info('Received %(signame)s twice, forcing unclean shutdown',
{'signame': signame})
reactor.callFromThread(self._stop_reactor)
def start(self, stop_after_crawl=True):
@ -173,10 +176,6 @@ class CrawlerProcess(CrawlerRunner):
reactor.addSystemEventTrigger('before', 'shutdown', self.stop)
reactor.run(installSignalHandlers=False) # blocking call
def _stop_logging(self):
if self.log_observer:
self.log_observer.stop()
def _stop_reactor(self, _=None):
try:
reactor.stop()

View File

@ -1,7 +1,7 @@
from __future__ import print_function
import os
import logging
from scrapy import log
from scrapy.utils.job import job_dir
from scrapy.utils.request import request_fingerprint
@ -33,6 +33,7 @@ class RFPDupeFilter(BaseDupeFilter):
self.fingerprints = set()
self.logdupes = True
self.debug = debug
self.logger = logging.getLogger(__name__)
if path:
self.file = open(os.path.join(path, 'requests.seen'), 'a+')
self.fingerprints.update(x.rstrip() for x in self.file)
@ -59,13 +60,13 @@ class RFPDupeFilter(BaseDupeFilter):
def log(self, request, spider):
if self.debug:
fmt = "Filtered duplicate request: %(request)s"
log.msg(format=fmt, request=request, level=log.DEBUG, spider=spider)
msg = "Filtered duplicate request: %(request)s"
self.logger.debug(msg, {'request': request}, extra={'spider': spider})
elif self.logdupes:
fmt = ("Filtered duplicate request: %(request)s"
msg = ("Filtered duplicate request: %(request)s"
" - no more duplicates will be shown"
" (see DUPEFILTER_DEBUG to show all duplicates)")
log.msg(format=fmt, request=request, level=log.DEBUG, spider=spider)
self.logger.debug(msg, {'request': request}, extra={'spider': spider})
self.logdupes = False
spider.crawler.stats.inc_value('dupefilter/filtered', spider=spider)

View File

@ -1,19 +1,24 @@
"""
Scrapy logging facility
This module is kept to provide a helpful warning about its removal.
"""
See documentation in docs/topics/logging.rst
"""
import sys
import logging
import warnings
from twisted.python import log
from twisted.python.failure import Failure
import scrapy
from scrapy.utils.python import unicode_to_str
from scrapy.settings import overridden_settings
from scrapy.exceptions import ScrapyDeprecationWarning
logger = logging.getLogger(__name__)
warnings.warn("Module `scrapy.log` has been deprecated, Scrapy now relies on "
"the builtin Python library for logging. Read the updated "
"logging entry in the documentation to learn more.",
ScrapyDeprecationWarning, stacklevel=2)
# Imports kept for backwards-compatibility
# Logging levels
DEBUG = logging.DEBUG
INFO = logging.INFO
WARNING = logging.WARNING
@ -21,141 +26,26 @@ ERROR = logging.ERROR
CRITICAL = logging.CRITICAL
SILENT = CRITICAL + 1
level_names = {
logging.DEBUG: "DEBUG",
logging.INFO: "INFO",
logging.WARNING: "WARNING",
logging.ERROR: "ERROR",
logging.CRITICAL: "CRITICAL",
SILENT: "SILENT",
}
class ScrapyFileLogObserver(log.FileLogObserver):
def msg(message=None, _level=logging.INFO, **kw):
warnings.warn('log.msg has been deprecated, create a python logger and '
'log through it instead',
ScrapyDeprecationWarning, stacklevel=2)
def __init__(self, f, level=INFO, encoding='utf-8', crawler=None):
self.level = level
self.encoding = encoding
self.crawler = crawler
if crawler:
self.emit = self._emit_with_crawler
else:
self.emit = self._emit
log.FileLogObserver.__init__(self, f)
level = kw.pop('level', _level)
message = kw.pop('format', message)
# NOTE: logger.log doesn't handle well passing empty dictionaries with format
# arguments because of some weird use-case:
# https://hg.python.org/cpython/file/648dcafa7e5f/Lib/logging/__init__.py#l269
logger.log(level, message, *[kw] if kw else [])
def _emit(self, eventDict):
ev = _adapt_eventdict(eventDict, self.level, self.encoding,
self.crawler)
if ev is not None:
log.FileLogObserver.emit(self, ev)
return ev
def _emit_with_crawler(self, eventDict):
ev = self._emit(eventDict)
if ev:
level = ev['logLevel']
sname = 'log_count/%s' % level_names.get(level, level)
self.crawler.stats.inc_value(sname)
def _adapt_eventdict(eventDict, log_level=INFO, encoding='utf-8',
crawler=None, prepend_level=True):
"""Adapt Twisted log eventDict making it suitable for logging with a Scrapy
log observer. It may return None to indicate that the event should be
ignored by a Scrapy log observer.
`log_level` is the minimum level being logged, and `encoding` is the log
encoding.
"""
ev = eventDict.copy()
if ev['isError']:
ev.setdefault('logLevel', ERROR)
# ignore non-error messages from outside scrapy
if ev.get('system') != 'scrapy' and not ev['isError']:
return
level = ev.get('logLevel')
if level < log_level:
return
spider = ev.get('spider')
if spider:
ev['system'] = unicode_to_str(spider.name, encoding)
if crawler and (not spider or spider.crawler is not crawler):
# ignore events not triggered by own spiders in crawlers' observers
return
if not crawler and spider:
# ignore spiders' events in observers without crawler
return
lvlname = level_names.get(level, 'NOLEVEL')
message = ev.get('message')
if message:
message = [unicode_to_str(x, encoding) for x in message]
if prepend_level:
message[0] = "%s: %s" % (lvlname, message[0])
ev['message'] = message
why = ev.get('why')
if why:
why = unicode_to_str(why, encoding)
if prepend_level:
why = "%s: %s" % (lvlname, why)
ev['why'] = why
fmt = ev.get('format')
if fmt:
fmt = unicode_to_str(fmt, encoding)
if prepend_level:
fmt = "%s: %s" % (lvlname, fmt)
ev['format'] = fmt
return ev
def _get_log_level(level_name_or_id):
if isinstance(level_name_or_id, int):
return level_name_or_id
elif isinstance(level_name_or_id, basestring):
return globals()[level_name_or_id]
else:
raise ValueError("Unknown log level: %r" % level_name_or_id)
def start(logfile=None, loglevel='INFO', logstdout=True, logencoding='utf-8', crawler=None):
loglevel = _get_log_level(loglevel)
file = open(logfile, 'a') if logfile else sys.stderr
log_observer = ScrapyFileLogObserver(file, loglevel, logencoding, crawler)
_oldshowwarning = warnings.showwarning
log.startLoggingWithObserver(log_observer.emit, setStdout=logstdout)
# restore warnings, wrongly silenced by Twisted
warnings.showwarning = _oldshowwarning
return log_observer
def msg(message=None, _level=INFO, **kw):
kw['logLevel'] = kw.pop('level', _level)
kw.setdefault('system', 'scrapy')
if message is None:
log.msg(**kw)
else:
log.msg(message, **kw)
def err(_stuff=None, _why=None, **kw):
kw['logLevel'] = kw.pop('level', ERROR)
kw.setdefault('system', 'scrapy')
log.err(_stuff, _why, **kw)
warnings.warn('log.err has been deprecated, create a python logger and '
'use its error method instead',
ScrapyDeprecationWarning, stacklevel=2)
def start_from_settings(settings, crawler=None):
if settings.getbool('LOG_ENABLED'):
return start(settings['LOG_FILE'], settings['LOG_LEVEL'], settings['LOG_STDOUT'],
settings['LOG_ENCODING'], crawler)
def scrapy_info(settings):
msg("Scrapy %s started (bot: %s)" % (scrapy.__version__,
settings['BOT_NAME']))
msg("Optional features available: %s" % ", ".join(scrapy.optional_features),
level=INFO)
d = dict(overridden_settings(settings))
msg(format="Overridden settings: %(settings)r", settings=d, level=INFO)
def start_from_crawler(crawler):
return start_from_settings(crawler.settings, crawler)
level = kw.pop('level', logging.ERROR)
failure = kw.pop('failure', _stuff) or Failure()
message = kw.pop('why', _why) or failure.value
logger.log(level, message, *[kw] if kw else [], extra={'failure': failure})
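
scrapy/log.py is now only a deprecation shim: importing it warns, and msg()/err() warn again before forwarding to a stdlib logger. A sketch of the migration it nudges callers toward (nothing here is Scrapy-specific; the logger name simply comes from the importing module):

# Before (still works, but emits ScrapyDeprecationWarning):
#     from scrapy import log
#     log.msg("Spider opened", level=log.INFO)

# After: plain Python logging.
import logging

logger = logging.getLogger(__name__)
logger.info("Spider opened")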

View File

@ -1,46 +1,67 @@
import os
import logging
from twisted.python.failure import Failure
from scrapy import log
SCRAPEDMSG = u"Scraped from %(src)s" + os.linesep + "%(item)s"
DROPPEDMSG = u"Dropped: %(exception)s" + os.linesep + "%(item)s"
CRAWLEDMSG = u"Crawled (%(status)s) %(request)s (referer: %(referer)s)%(flags)s"
SCRAPEDFMT = u"Scraped from %(src)s" + os.linesep + "%(item)s"
DROPPEDFMT = u"Dropped: %(exception)s" + os.linesep + "%(item)s"
CRAWLEDFMT = u"Crawled (%(status)s) %(request)s (referer: %(referer)s)%(flags)s"
class LogFormatter(object):
"""Class for generating log messages for different actions. All methods
must return a plain string which doesn't include the log level or the
timestamp
"""Class for generating log messages for different actions.
All methods must return a dictionary listing the parameters `level`, `msg`
and `args` which are going to be used for constructing the log message when
calling logging.log.
Dictionary keys for the method outputs:
* `level` should be the log level for that action; you can use the levels
from the python logging library: logging.DEBUG, logging.INFO,
logging.WARNING, logging.ERROR and logging.CRITICAL.
* `msg` should be a string that can contain different formatting
placeholders. This string, formatted with the provided `args`, is going
to be the log message for that action.
* `args` should be a tuple or dict with the formatting placeholders for
`msg`. The final log message is computed as output['msg'] %
output['args'].
"""
def crawled(self, request, response, spider):
flags = ' %s' % str(response.flags) if response.flags else ''
return {
'level': log.DEBUG,
'format': CRAWLEDFMT,
'status': response.status,
'request': request,
'referer': request.headers.get('Referer'),
'flags': flags,
'level': logging.DEBUG,
'msg': CRAWLEDMSG,
'args': {
'status': response.status,
'request': request,
'referer': request.headers.get('Referer'),
'flags': flags,
}
}
def scraped(self, item, response, spider):
src = response.getErrorMessage() if isinstance(response, Failure) else response
return {
'level': log.DEBUG,
'format': SCRAPEDFMT,
'src': src,
'item': item,
'level': logging.DEBUG,
'msg': SCRAPEDMSG,
'args': {
'src': src,
'item': item,
}
}
def dropped(self, item, exception, response, spider):
return {
'level': log.WARNING,
'format': DROPPEDFMT,
'exception': exception,
'item': item,
'level': logging.WARNING,
'msg': DROPPEDMSG,
'args': {
'exception': exception,
'item': item,
}
}
@classmethod

View File

@ -3,6 +3,8 @@ Mail sending helpers
See documentation in docs/topics/email.rst
"""
import logging
from six.moves import cStringIO as StringIO
import six
@ -20,7 +22,8 @@ else:
from twisted.internet import defer, reactor, ssl
from twisted.mail.smtp import ESMTPSenderFactory
from scrapy import log
logger = logging.getLogger(__name__)
class MailSender(object):
@ -71,8 +74,10 @@ class MailSender(object):
_callback(to=to, subject=subject, body=body, cc=cc, attach=attachs, msg=msg)
if self.debug:
log.msg(format='Debug mail sent OK: To=%(mailto)s Cc=%(mailcc)s Subject="%(mailsubject)s" Attachs=%(mailattachs)d',
level=log.DEBUG, mailto=to, mailcc=cc, mailsubject=subject, mailattachs=len(attachs))
logger.debug('Debug mail sent OK: To=%(mailto)s Cc=%(mailcc)s '
'Subject="%(mailsubject)s" Attachs=%(mailattachs)d',
{'mailto': to, 'mailcc': cc, 'mailsubject': subject,
'mailattachs': len(attachs)})
return
dfd = self._sendmail(rcpts, msg.as_string())
@ -83,17 +88,18 @@ class MailSender(object):
return dfd
def _sent_ok(self, result, to, cc, subject, nattachs):
log.msg(format='Mail sent OK: To=%(mailto)s Cc=%(mailcc)s '
'Subject="%(mailsubject)s" Attachs=%(mailattachs)d',
mailto=to, mailcc=cc, mailsubject=subject, mailattachs=nattachs)
logger.info('Mail sent OK: To=%(mailto)s Cc=%(mailcc)s '
'Subject="%(mailsubject)s" Attachs=%(mailattachs)d',
{'mailto': to, 'mailcc': cc, 'mailsubject': subject,
'mailattachs': nattachs})
def _sent_failed(self, failure, to, cc, subject, nattachs):
errstr = str(failure.value)
log.msg(format='Unable to send mail: To=%(mailto)s Cc=%(mailcc)s '
'Subject="%(mailsubject)s" Attachs=%(mailattachs)d'
'- %(mailerr)s',
level=log.ERROR, mailto=to, mailcc=cc, mailsubject=subject,
mailattachs=nattachs, mailerr=errstr)
logger.error('Unable to send mail: To=%(mailto)s Cc=%(mailcc)s '
'Subject="%(mailsubject)s" Attachs=%(mailattachs)d'
'- %(mailerr)s',
{'mailto': to, 'mailcc': cc, 'mailsubject': subject,
'mailattachs': nattachs, 'mailerr': errstr})
def _sendmail(self, to_addrs, msg):
msg = StringIO(msg)

View File

@ -1,10 +1,13 @@
import logging
from collections import defaultdict
from scrapy import log
from scrapy.exceptions import NotConfigured
from scrapy.utils.misc import load_object
from scrapy.utils.defer import process_parallel, process_chain, process_chain_both
logger = logging.getLogger(__name__)
class MiddlewareManager(object):
"""Base class for implementing middleware managers"""
@ -37,12 +40,15 @@ class MiddlewareManager(object):
except NotConfigured as e:
if e.args:
clsname = clspath.split('.')[-1]
log.msg(format="Disabled %(clsname)s: %(eargs)s",
level=log.WARNING, clsname=clsname, eargs=e.args[0])
logger.warning("Disabled %(clsname)s: %(eargs)s",
{'clsname': clsname, 'eargs': e.args[0]},
extra={'crawler': crawler})
enabled = [x.__class__.__name__ for x in middlewares]
log.msg(format="Enabled %(componentname)ss: %(enabledlist)s", level=log.INFO,
componentname=cls.component_name, enabledlist=', '.join(enabled))
logger.info("Enabled %(componentname)ss: %(enabledlist)s",
{'componentname': cls.component_name,
'enabledlist': ', '.join(enabled)},
extra={'crawler': crawler})
return cls(*middlewares)
@classmethod

View File

@ -167,6 +167,8 @@ ITEM_PIPELINES_BASE = {}
LOG_ENABLED = True
LOG_ENCODING = 'utf-8'
LOG_FORMATTER = 'scrapy.logformatter.LogFormatter'
LOG_FORMAT = '%(asctime)s [%(name)s] %(levelname)s: %(message)s'
LOG_DATEFORMAT = '%Y-%m-%d %H:%M:%S%z'
LOG_STDOUT = False
LOG_LEVEL = 'DEBUG'
LOG_FILE = None
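
LOG_FORMAT and LOG_DATEFORMAT are new settings; together with the existing LOG_* options they feed the root handler built by scrapy.utils.log.configure_logging (added below). A sketch of overriding them in a project's settings.py, with purely illustrative values and a hypothetical crawl.log filename:

# settings.py
LOG_LEVEL = 'INFO'
LOG_FILE = 'crawl.log'       # None (the default) keeps logging to stderr
LOG_FORMAT = '%(asctime)s [%(name)s] %(levelname)s: %(message)s'
LOG_DATEFORMAT = '%Y-%m-%d %H:%M:%S'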

View File

@ -3,9 +3,9 @@ Base class for Scrapy spiders
See documentation in docs/topics/spiders.rst
"""
import logging
import warnings
from scrapy import log
from scrapy import signals
from scrapy.http import Request
from scrapy.utils.trackref import object_ref
@ -31,11 +31,19 @@ class Spider(object_ref):
if not hasattr(self, 'start_urls'):
self.start_urls = []
def log(self, message, level=log.DEBUG, **kw):
"""Log the given messages at the given log level. Always use this
method to send log messages from your spider
@property
def logger(self):
logger = logging.getLogger(self.name)
return logging.LoggerAdapter(logger, {'spider': self})
def log(self, message, level=logging.DEBUG, **kw):
"""Log the given message at the given log level
This helper wraps a call to the spider's logger; you can also use that
logger directly (e.g. Spider.logger.info('msg')) or use any other
Python logger too.
"""
log.msg(message, spider=self, level=level, **kw)
self.logger.log(level, message, **kw)
@classmethod
def from_crawler(cls, crawler, *args, **kwargs):
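
Spiders now expose a `logger` property, a LoggerAdapter named after the spider that injects the spider into `extra`, and Spider.log() becomes a thin wrapper around it. A sketch showing both styles inside a spider; the spider name and URL are placeholders:

import logging
import scrapy

class ExampleSpider(scrapy.Spider):
    name = 'example'
    start_urls = ['http://www.example.com/']

    def parse(self, response):
        # Preferred: the per-spider logger added above.
        self.logger.info('Parsed %s', response.url)
        # Backwards-compatible wrapper, now delegating to self.logger:
        self.log('Parsed %s' % response.url, level=logging.INFO)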

View File

@ -2,8 +2,10 @@
Scrapy extension for collecting scraping stats
"""
import pprint
import logging
logger = logging.getLogger(__name__)
from scrapy import log
class StatsCollector(object):
@ -41,8 +43,8 @@ class StatsCollector(object):
def close_spider(self, spider, reason):
if self._dump:
log.msg("Dumping Scrapy stats:\n" + pprint.pformat(self._stats), \
spider=spider)
logger.info("Dumping Scrapy stats:\n" + pprint.pformat(self._stats),
extra={'spider': spider})
self._persist_stats(self._stats, spider)
def _persist_stats(self, stats, spider):

View File

@ -5,6 +5,7 @@ See documentation in docs/topics/telnetconsole.rst
"""
import pprint
import logging
from twisted.internet import protocol
try:
@ -15,7 +16,7 @@ except ImportError:
TWISTED_CONCH_AVAILABLE = False
from scrapy.exceptions import NotConfigured
from scrapy import log, signals
from scrapy import signals
from scrapy.utils.trackref import print_live_refs
from scrapy.utils.engine import print_engine_status
from scrapy.utils.reactor import listen_tcp
@ -26,6 +27,8 @@ try:
except ImportError:
hpy = None
logger = logging.getLogger(__name__)
# signal to update telnet variables
# args: telnet_vars
update_telnet_vars = object()
@ -52,8 +55,9 @@ class TelnetConsole(protocol.ServerFactory):
def start_listening(self):
self.port = listen_tcp(self.portrange, self.host, self)
h = self.port.getHost()
log.msg(format="Telnet console listening on %(host)s:%(port)d",
level=log.DEBUG, host=h.host, port=h.port)
logger.debug("Telnet console listening on %(host)s:%(port)d",
{'host': h.host, 'port': h.port},
extra={'crawler': self.crawler})
def stop_listening(self):
self.port.stopListening()

View File

@ -1,15 +1,20 @@
import re, csv, six
import re
import csv
import logging
try:
from cStringIO import StringIO as BytesIO
except ImportError:
from io import BytesIO
import six
from scrapy.http import TextResponse, Response
from scrapy.selector import Selector
from scrapy import log
from scrapy.utils.python import re_rsearch, str_to_unicode
logger = logging.getLogger(__name__)
def xmliter(obj, nodename):
"""Return a iterator of Selector's over all nodes of a XML document,
@ -108,8 +113,10 @@ def csviter(obj, delimiter=None, headers=None, encoding=None, quotechar=None):
while True:
row = _getrow(csv_r)
if len(row) != len(headers):
log.msg(format="ignoring row %(csvlnum)d (length: %(csvrow)d, should be: %(csvheader)d)",
level=log.WARNING, csvlnum=csv_r.line_num, csvrow=len(row), csvheader=len(headers))
logger.warning("ignoring row %(csvlnum)d (length: %(csvrow)d, "
"should be: %(csvheader)d)",
{'csvlnum': csv_r.line_num, 'csvrow': len(row),
'csvheader': len(headers)})
continue
else:
yield dict(zip(headers, row))

scrapy/utils/log.py (new file, 181 lines)
View File

@ -0,0 +1,181 @@
# -*- coding: utf-8 -*-
import os
import sys
import logging
import warnings
from logging.config import dictConfig
from twisted.python.failure import Failure
from twisted.python import log as twisted_log
import scrapy
from scrapy.settings import overridden_settings
from scrapy.exceptions import ScrapyDeprecationWarning
logger = logging.getLogger(__name__)
class FailureFormatter(logging.Filter):
"""Extract exc_info from Failure instances provided as contextual data
This filter mimics Twisted log.err formatting for its first `_stuff`
argument, which means that reprs of non-Failure objects are appended to the
log messages.
"""
def filter(self, record):
failure = record.__dict__.get('failure')
if failure:
if isinstance(failure, Failure):
record.exc_info = (failure.type, failure.value, failure.tb)
else:
record.msg += os.linesep + repr(failure)
return True
class TopLevelFormatter(logging.Filter):
"""Keep only top level loggers's name (direct children from root) from
records.
This filter will replace Scrapy loggers' names with 'scrapy'. This mimics
the old Scrapy log behaviour and helps shortening long names.
Since it can't be set for just one logger (it won't propagate for its
children), it's going to be set in the root handler, with a parametrized
`loggers` list where it should act.
"""
def __init__(self, loggers=None):
self.loggers = loggers or []
def filter(self, record):
if any(record.name.startswith(l + '.') for l in self.loggers):
record.name = record.name.split('.', 1)[0]
return True
DEFAULT_LOGGING = {
'version': 1,
'disable_existing_loggers': False,
'filters': {
'failure_formatter': {
'()': 'scrapy.utils.log.FailureFormatter',
},
},
'loggers': {
'scrapy': {
'level': 'DEBUG',
'filters': ['failure_formatter'],
},
'twisted': {
'level': 'ERROR',
},
}
}
def configure_logging(settings=None):
"""Initialize and configure default loggers
This function does:
- Route warnings and Twisted logging through Python standard logging
- Set the FailureFormatter filter on the Scrapy logger
- Assign DEBUG and ERROR levels to the Scrapy and Twisted loggers respectively
- Create a handler for the root logger according to given settings
"""
if not sys.warnoptions:
# Route warnings through python logging
logging.captureWarnings(True)
observer = twisted_log.PythonLoggingObserver('twisted')
observer.start()
dictConfig(DEFAULT_LOGGING)
if settings:
logging.root.setLevel(logging.NOTSET)
if settings.getbool('LOG_STDOUT'):
sys.stdout = StreamLogger(logging.getLogger('stdout'))
# Set up the default log handler
filename = settings.get('LOG_FILE')
if filename:
encoding = settings.get('LOG_ENCODING')
handler = logging.FileHandler(filename, encoding=encoding)
elif settings.getbool('LOG_ENABLED'):
handler = logging.StreamHandler()
else:
handler = logging.NullHandler()
formatter = logging.Formatter(
fmt=settings.get('LOG_FORMAT'),
datefmt=settings.get('LOG_DATEFORMAT')
)
handler.setFormatter(formatter)
handler.setLevel(settings.get('LOG_LEVEL'))
handler.addFilter(TopLevelFormatter(['scrapy']))
logging.root.addHandler(handler)
def log_scrapy_info(settings):
logger.info("Scrapy %(version)s started (bot: %(bot)s)",
{'version': scrapy.__version__, 'bot': settings['BOT_NAME']})
logger.info("Optional features available: %(features)s",
{'features': ", ".join(scrapy.optional_features)})
d = dict(overridden_settings(settings))
logger.info("Overridden settings: %(settings)r", {'settings': d})
class StreamLogger(object):
"""Fake file-like stream object that redirects writes to a logger instance
Taken from:
http://www.electricmonk.nl/log/2011/08/14/redirect-stdout-and-stderr-to-a-logger-in-python/
"""
def __init__(self, logger, log_level=logging.INFO):
self.logger = logger
self.log_level = log_level
self.linebuf = ''
def write(self, buf):
for line in buf.rstrip().splitlines():
self.logger.log(self.log_level, line.rstrip())
class LogCounterHandler(logging.Handler):
"""Record log levels count into a crawler stats"""
def __init__(self, crawler, *args, **kwargs):
super(LogCounterHandler, self).__init__(*args, **kwargs)
self.crawler = crawler
def emit(self, record):
sname = 'log_count/{}'.format(record.levelname)
self.crawler.stats.inc_value(sname)
def logformatter_adapter(logkws):
"""
Helper that takes the dictionary output from the methods in LogFormatter
and adapts it into a tuple of positional arguments for logger.log calls,
handling backward compatibility as well.
"""
if not {'level', 'msg', 'args'} <= set(logkws):
warnings.warn('Missing keys in LogFormatter method',
ScrapyDeprecationWarning)
if 'format' in logkws:
warnings.warn('`format` key in LogFormatter methods has been '
'deprecated, use `msg` instead',
ScrapyDeprecationWarning)
level = logkws.get('level', logging.INFO)
message = logkws.get('format', logkws.get('msg'))
# NOTE: This also handles 'args' being an empty dict, that case doesn't
# play well in logger.log calls
args = logkws if not logkws.get('args') else logkws['args']
return (level, message, args)
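
configure_logging() is the entry point for running Scrapy outside the scrapy command: called without a settings object it only routes warnings and Twisted logging into stdlib logging and applies the default filters and levels, leaving the root handler to the caller. A rough sketch for a standalone script, with a format string that mirrors the new LOG_FORMAT default:

import logging
from scrapy.utils.log import configure_logging

configure_logging()  # no settings given, so no root handler is installed
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s [%(name)s] %(levelname)s: %(message)s',
)
logging.getLogger('scrapy').info('stdlib logging is now in charge')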

View File

@ -1,5 +1,7 @@
"""Helper functions for working with signals"""
import logging
from twisted.internet.defer import maybeDeferred, DeferredList, Deferred
from twisted.python.failure import Failure
@ -7,7 +9,8 @@ from scrapy.xlib.pydispatch.dispatcher import Any, Anonymous, liveReceivers, \
getAllReceivers, disconnect
from scrapy.xlib.pydispatch.robustapply import robustApply
from scrapy import log
logger = logging.getLogger(__name__)
def send_catch_log(signal=Any, sender=Anonymous, *arguments, **named):
"""Like pydispatcher.robust.sendRobust but it also logs errors and returns
@ -21,14 +24,14 @@ def send_catch_log(signal=Any, sender=Anonymous, *arguments, **named):
response = robustApply(receiver, signal=signal, sender=sender,
*arguments, **named)
if isinstance(response, Deferred):
log.msg(format="Cannot return deferreds from signal handler: %(receiver)s",
level=log.ERROR, spider=spider, receiver=receiver)
logger.error("Cannot return deferreds from signal handler: %(receiver)s",
{'receiver': receiver}, extra={'spider': spider})
except dont_log:
result = Failure()
except Exception:
result = Failure()
log.err(result, "Error caught on signal handler: %s" % receiver, \
spider=spider)
logger.exception("Error caught on signal handler: %(receiver)s",
{'receiver': receiver}, extra={'spider': spider})
else:
result = response
responses.append((receiver, result))
@ -41,8 +44,9 @@ def send_catch_log_deferred(signal=Any, sender=Anonymous, *arguments, **named):
"""
def logerror(failure, recv):
if dont_log is None or not isinstance(failure.value, dont_log):
log.err(failure, "Error caught on signal handler: %s" % recv, \
spider=spider)
logger.error("Error caught on signal handler: %(receiver)s",
{'receiver': recv},
extra={'spider': spider, 'failure': failure})
return failure
dont_log = named.pop('dont_log', None)

View File

@ -1,11 +1,13 @@
import logging
import inspect
import six
from scrapy import log
from scrapy.spider import Spider
from scrapy.utils.misc import arg_to_iter
logger = logging.getLogger(__name__)
def iterate_spider_output(result):
return arg_to_iter(result)
@ -43,12 +45,12 @@ def spidercls_for_request(spider_loader, request, default_spidercls=None,
return spider_loader.load(snames[0])
if len(snames) > 1 and log_multiple:
log.msg(format='More than one spider can handle: %(request)s - %(snames)s',
level=log.ERROR, request=request, snames=', '.join(snames))
logger.error('More than one spider can handle: %(request)s - %(snames)s',
{'request': request, 'snames': ', '.join(snames)})
if len(snames) == 0 and log_none:
log.msg(format='Unable to find spider that handles: %(request)s',
level=log.ERROR, request=request)
logger.error('Unable to find spider that handles: %(request)s',
{'request': request})
return default_spidercls

View File

@ -46,19 +46,6 @@ def get_testenv():
env['PYTHONPATH'] = get_pythonpath()
return env
def get_testlog():
"""Get Scrapy log of current test, ignoring the rest"""
with open("test.log", "rb") as fp:
loglines = fp.readlines()
thistest = []
for line in loglines[::-1]:
thistest.append(line)
if "[-] -->" in line:
break
return "".join(thistest[::-1])
def assert_samelines(testcase, text1, text2, msg=None):
"""Asserts text1 and text2 have the same lines, ignoring differences in
line endings between platforms

View File

@ -31,7 +31,6 @@ tests/test_http_cookies.py
tests/test_http_request.py
tests/test_http_response.py
tests/test_logformatter.py
tests/test_log.py
tests/test_mail.py
tests/test_middleware.py
tests/test_pipeline_files.py
@ -59,6 +58,7 @@ tests/test_stats.py
tests/test_utils_defer.py
tests/test_utils_iterators.py
tests/test_utils_jsonrpc.py
tests/test_utils_log.py
tests/test_utils_python.py
tests/test_utils_reqser.py
tests/test_utils_request.py

View File

@ -4,3 +4,4 @@ mitmproxy==0.10.1
netlib==0.10.1
pytest-twisted
jmespath
testfixtures

View File

@ -74,7 +74,7 @@ class SimpleSpider(MetaSpider):
self.start_urls = [url]
def parse(self, response):
self.log("Got response %d" % response.status)
self.logger.info("Got response %d" % response.status)
class ItemSpider(FollowAllSpider):

View File

@ -39,7 +39,6 @@ class TestCloseSpider(TestCase):
close_on = 5
crawler = get_crawler(ErrorSpider, {'CLOSESPIDER_ERRORCOUNT': close_on})
yield crawler.crawl(total=1000000)
self.flushLoggedErrors(crawler.spider.exception_cls)
reason = crawler.spider.meta['close_reason']
self.assertEqual(reason, 'closespider_errorcount')
key = 'spider_exceptions/{name}'\

View File

@ -137,21 +137,20 @@ class RunSpiderCommandTest(CommandTest):
with open(fname, 'w') as f:
f.write("""
import scrapy
from scrapy import log
class MySpider(scrapy.Spider):
name = 'myspider'
def start_requests(self):
self.log("It Works!")
self.logger.debug("It Works!")
return []
""")
p = self.proc('runspider', fname)
log = p.stderr.read()
self.assertIn("[myspider] DEBUG: It Works!", log)
self.assertIn("[myspider] INFO: Spider opened", log)
self.assertIn("[myspider] INFO: Closing spider (finished)", log)
self.assertIn("[myspider] INFO: Spider closed (finished)", log)
self.assertIn("DEBUG: It Works!", log)
self.assertIn("INFO: Spider opened", log)
self.assertIn("INFO: Closing spider (finished)", log)
self.assertIn("INFO: Spider closed (finished)", log)
def test_runspider_no_spider_found(self):
tmpdir = self.mktemp()
@ -159,7 +158,6 @@ class MySpider(scrapy.Spider):
fname = abspath(join(tmpdir, 'myspider.py'))
with open(fname, 'w') as f:
f.write("""
from scrapy import log
from scrapy.spider import Spider
""")
p = self.proc('runspider', fname)
@ -192,7 +190,6 @@ class ParseCommandTest(ProcessTest, SiteTest, CommandTest):
fname = abspath(join(self.proj_mod_path, 'spiders', 'myspider.py'))
with open(fname, 'w') as f:
f.write("""
from scrapy import log
import scrapy
class MySpider(scrapy.Spider):
@ -200,20 +197,20 @@ class MySpider(scrapy.Spider):
def parse(self, response):
if getattr(self, 'test_arg', None):
self.log('It Works!')
self.logger.debug('It Works!')
return [scrapy.Item(), dict(foo='bar')]
""".format(self.spider_name))
fname = abspath(join(self.proj_mod_path, 'pipelines.py'))
with open(fname, 'w') as f:
f.write("""
from scrapy import log
import logging
class MyPipeline(object):
component_name = 'my_pipeline'
def process_item(self, item, spider):
log.msg('It Works!')
logging.info('It Works!')
return item
""")
@ -229,7 +226,7 @@ ITEM_PIPELINES = {'%s.pipelines.MyPipeline': 1}
'-a', 'test_arg=1',
'-c', 'parse',
self.url('/html')])
self.assertIn("[parse_spider] DEBUG: It Works!", stderr)
self.assertIn("DEBUG: It Works!", stderr)
@defer.inlineCallbacks
def test_pipelines(self):
@ -237,7 +234,7 @@ ITEM_PIPELINES = {'%s.pipelines.MyPipeline': 1}
'--pipelines',
'-c', 'parse',
self.url('/html')])
self.assertIn("[scrapy] INFO: It Works!", stderr)
self.assertIn("INFO: It Works!", stderr)
@defer.inlineCallbacks
def test_parse_items(self):

View File

@ -1,8 +1,12 @@
import json
import socket
import logging
from testfixtures import LogCapture
from twisted.internet import defer
from twisted.trial.unittest import TestCase
from scrapy.utils.test import get_crawler, get_testlog
from scrapy.utils.test import get_crawler
from tests import mock
from tests.spiders import FollowAllSpider, DelaySpider, SimpleSpider, \
BrokenStartRequestsSpider, SingleRequestSpider, DuplicateStartRequestsSpider
@ -72,36 +76,47 @@ class CrawlTestCase(TestCase):
@defer.inlineCallbacks
def test_retry_503(self):
crawler = get_crawler(SimpleSpider)
yield crawler.crawl("http://localhost:8998/status?n=503")
self._assert_retried()
with LogCapture() as l:
yield crawler.crawl("http://localhost:8998/status?n=503")
self._assert_retried(l)
@defer.inlineCallbacks
def test_retry_conn_failed(self):
crawler = get_crawler(SimpleSpider)
yield crawler.crawl("http://localhost:65432/status?n=503")
self._assert_retried()
with LogCapture() as l:
yield crawler.crawl("http://localhost:65432/status?n=503")
self._assert_retried(l)
@defer.inlineCallbacks
def test_retry_dns_error(self):
with mock.patch('socket.gethostbyname',
side_effect=socket.gaierror(-5, 'No address associated with hostname')):
crawler = get_crawler(SimpleSpider)
yield crawler.crawl("http://example.com/")
self._assert_retried()
with LogCapture() as l:
yield crawler.crawl("http://example.com/")
self._assert_retried(l)
@defer.inlineCallbacks
def test_start_requests_bug_before_yield(self):
crawler = get_crawler(BrokenStartRequestsSpider)
yield crawler.crawl(fail_before_yield=1)
errors = self.flushLoggedErrors(ZeroDivisionError)
self.assertEqual(len(errors), 1)
with LogCapture('scrapy', level=logging.ERROR) as l:
crawler = get_crawler(BrokenStartRequestsSpider)
yield crawler.crawl(fail_before_yield=1)
self.assertEqual(len(l.records), 1)
record = l.records[0]
self.assertIsNotNone(record.exc_info)
self.assertIs(record.exc_info[0], ZeroDivisionError)
@defer.inlineCallbacks
def test_start_requests_bug_yielding(self):
crawler = get_crawler(BrokenStartRequestsSpider)
yield crawler.crawl(fail_yielding=1)
errors = self.flushLoggedErrors(ZeroDivisionError)
self.assertEqual(len(errors), 1)
with LogCapture('scrapy', level=logging.ERROR) as l:
crawler = get_crawler(BrokenStartRequestsSpider)
yield crawler.crawl(fail_yielding=1)
self.assertEqual(len(l.records), 1)
record = l.records[0]
self.assertIsNotNone(record.exc_info)
self.assertIs(record.exc_info[0], ZeroDivisionError)
@defer.inlineCallbacks
def test_start_requests_lazyness(self):
@ -145,28 +160,29 @@ foo body
with multiples lines
'''})
crawler = get_crawler(SimpleSpider)
yield crawler.crawl("http://localhost:8998/raw?{0}".format(query))
log = get_testlog()
self.assertEqual(log.count("Got response 200"), 1)
with LogCapture() as l:
yield crawler.crawl("http://localhost:8998/raw?{0}".format(query))
self.assertEqual(str(l).count("Got response 200"), 1)
@defer.inlineCallbacks
def test_retry_conn_lost(self):
# connection lost after receiving data
crawler = get_crawler(SimpleSpider)
yield crawler.crawl("http://localhost:8998/drop?abort=0")
self._assert_retried()
with LogCapture() as l:
yield crawler.crawl("http://localhost:8998/drop?abort=0")
self._assert_retried(l)
@defer.inlineCallbacks
def test_retry_conn_aborted(self):
# connection lost before receiving data
crawler = get_crawler(SimpleSpider)
yield crawler.crawl("http://localhost:8998/drop?abort=1")
self._assert_retried()
with LogCapture() as l:
yield crawler.crawl("http://localhost:8998/drop?abort=1")
self._assert_retried(l)
def _assert_retried(self):
log = get_testlog()
self.assertEqual(log.count("Retrying"), 2)
self.assertEqual(log.count("Gave up retrying"), 1)
def _assert_retried(self, log):
self.assertEqual(str(log).count("Retrying"), 2)
self.assertEqual(str(log).count("Gave up retrying"), 1)
@defer.inlineCallbacks
def test_referer_header(self):
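
The tests now capture output with testfixtures.LogCapture instead of the old test.log observer; str(l) yields the captured text and l.records the raw LogRecords. A minimal sketch of the same pattern outside the suite, with an arbitrary logger name and message:

import logging
from testfixtures import LogCapture

with LogCapture('scrapy', level=logging.ERROR) as l:
    logging.getLogger('scrapy.core.engine').error('boom')

l.check(('scrapy.core.engine', 'ERROR', 'boom'))
assert len(l.records) == 1 and 'boom' in str(l)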

View File

@ -1,169 +0,0 @@
from io import BytesIO
from twisted.python import log as txlog, failure
from twisted.trial import unittest
from scrapy import log
from scrapy.spider import Spider
from scrapy.settings import default_settings
from scrapy.utils.test import get_crawler
class LogTest(unittest.TestCase):
def test_get_log_level(self):
default_log_level = getattr(log, default_settings.LOG_LEVEL)
self.assertEqual(log._get_log_level('WARNING'), log.WARNING)
self.assertEqual(log._get_log_level(log.WARNING), log.WARNING)
self.assertRaises(ValueError, log._get_log_level, object())
class ScrapyFileLogObserverTest(unittest.TestCase):
level = log.INFO
encoding = 'utf-8'
def setUp(self):
self.f = BytesIO()
self.log_observer = log.ScrapyFileLogObserver(self.f, self.level, self.encoding)
self.log_observer.start()
def tearDown(self):
self.flushLoggedErrors()
self.log_observer.stop()
def logged(self):
return self.f.getvalue().strip()[25:]
def first_log_line(self):
logged = self.logged()
return logged.splitlines()[0] if logged else ''
def test_msg_basic(self):
log.msg("Hello")
self.assertEqual(self.logged(), "[scrapy] INFO: Hello")
def test_msg_ignore_spider(self):
spider = Spider("myspider")
log.msg("Hello", spider=spider)
self.failIf(self.logged())
def test_msg_level1(self):
log.msg("Hello", level=log.WARNING)
self.assertEqual(self.logged(), "[scrapy] WARNING: Hello")
def test_msg_level2(self):
log.msg("Hello", log.WARNING)
self.assertEqual(self.logged(), "[scrapy] WARNING: Hello")
def test_msg_wrong_level(self):
log.msg("Hello", level=9999)
self.assertEqual(self.logged(), "[scrapy] NOLEVEL: Hello")
def test_msg_encoding(self):
log.msg(u"Price: \xa3100")
self.assertEqual(self.logged(), "[scrapy] INFO: Price: \xc2\xa3100")
def test_msg_ignore_level(self):
log.msg("Hello", level=log.DEBUG)
log.msg("World", level=log.INFO)
self.assertEqual(self.logged(), "[scrapy] INFO: World")
def test_msg_ignore_system(self):
txlog.msg("Hello")
self.failIf(self.logged())
def test_msg_ignore_system_err(self):
txlog.msg("Hello")
self.failIf(self.logged())
def test_err_noargs(self):
try:
a = 1/0
except:
log.err()
self.assertIn('Traceback', self.logged())
self.assertIn('ZeroDivisionError', self.logged())
def test_err_why(self):
log.err(TypeError("bad type"), "Wrong type")
self.assertEqual(self.first_log_line(), "[scrapy] ERROR: Wrong type")
self.assertIn('TypeError', self.logged())
self.assertIn('bad type', self.logged())
def test_error_outside_scrapy(self):
"""Scrapy logger should still print outside errors"""
txlog.err(TypeError("bad type"), "Wrong type")
self.assertEqual(self.first_log_line(), "[-] ERROR: Wrong type")
self.assertIn('TypeError', self.logged())
self.assertIn('bad type', self.logged())
# this test fails in twisted trial observer, not in scrapy observer
# def test_err_why_encoding(self):
# log.err(TypeError("bad type"), u"\xa3")
# self.assertEqual(self.first_log_line(), "[scrapy] ERROR: \xc2\xa3")
def test_err_exc(self):
log.err(TypeError("bad type"))
self.assertIn('Unhandled Error', self.logged())
self.assertIn('TypeError', self.logged())
self.assertIn('bad type', self.logged())
def test_err_failure(self):
log.err(failure.Failure(TypeError("bad type")))
self.assertIn('Unhandled Error', self.logged())
self.assertIn('TypeError', self.logged())
self.assertIn('bad type', self.logged())
class Latin1ScrapyFileLogObserverTest(ScrapyFileLogObserverTest):
encoding = 'latin-1'
def test_msg_encoding(self):
log.msg(u"Price: \xa3100")
logged = self.f.getvalue().strip()[25:]
self.assertEqual(self.logged(), "[scrapy] INFO: Price: \xa3100")
# this test fails in twisted trial observer, not in scrapy observer
# def test_err_why_encoding(self):
# log.err(TypeError("bad type"), u"\xa3")
# self.assertEqual(self.first_log_line(), "[scrapy] ERROR: \xa3")
class CrawlerScrapyFileLogObserverTest(unittest.TestCase):
def setUp(self):
self.f = BytesIO()
self.crawler = get_crawler(Spider)
self.spider = self.crawler.spider = self.crawler._create_spider('test')
self.log_observer = log.ScrapyFileLogObserver(self.f, log.INFO,
'utf-8', self.crawler)
self.log_observer.start()
def tearDown(self):
self.flushLoggedErrors()
self.log_observer.stop()
def logged(self):
return self.f.getvalue().strip()[25:]
def test_msg_basic(self):
log.msg("Hello", spider=self.spider)
self.assertEqual(self.logged(), "[test] INFO: Hello")
def test_msg_ignore_scrapy_channel(self):
log.msg("Hello")
self.failIf(self.logged())
def test_msg_ignore_another_crawler(self):
crawler = get_crawler(Spider)
log.msg("Hello", spider=crawler._create_spider('test'))
self.failIf(self.logged())
def test_msg_stats_log(self):
assert self.crawler.stats.get_value('log_count/INFO', 0) == 0
log.msg("Hello", spider=self.spider)
self.assertEqual(self.crawler.stats.get_value('log_count/INFO'), 1)
if __name__ == "__main__":
unittest.main()

View File

@ -24,14 +24,14 @@ class LoggingContribTest(unittest.TestCase):
req = Request("http://www.example.com")
res = Response("http://www.example.com")
logkws = self.formatter.crawled(req, res, self.spider)
logline = logkws['format'] % logkws
logline = logkws['msg'] % logkws['args']
self.assertEqual(logline,
"Crawled (200) <GET http://www.example.com> (referer: None)")
req = Request("http://www.example.com", headers={'referer': 'http://example.com'})
res = Response("http://www.example.com", flags=['cached'])
logkws = self.formatter.crawled(req, res, self.spider)
logline = logkws['format'] % logkws
logline = logkws['msg'] % logkws['args']
self.assertEqual(logline,
"Crawled (200) <GET http://www.example.com> (referer: http://example.com) ['cached']")
@ -40,7 +40,7 @@ class LoggingContribTest(unittest.TestCase):
exception = Exception(u"\u2018")
response = Response("http://www.example.com")
logkws = self.formatter.dropped(item, exception, response, self.spider)
logline = logkws['format'] % logkws
logline = logkws['msg'] % logkws['args']
lines = logline.splitlines()
assert all(isinstance(x, unicode) for x in lines)
self.assertEqual(lines, [u"Dropped: \u2018", '{}'])
@ -50,7 +50,7 @@ class LoggingContribTest(unittest.TestCase):
item['name'] = u'\xa3'
response = Response("http://www.example.com")
logkws = self.formatter.scraped(item, response, self.spider)
logline = logkws['format'] % logkws
logline = logkws['msg'] % logkws['args']
lines = logline.splitlines()
assert all(isinstance(x, unicode) for x in lines)
self.assertEqual(lines, [u"Scraped from <200 http://www.example.com>", u'name: \xa3'])

View File

@ -1,9 +1,9 @@
from __future__ import print_function
from testfixtures import LogCapture
from twisted.trial import unittest
from twisted.python.failure import Failure
from twisted.internet import reactor
from twisted.internet.defer import Deferred, inlineCallbacks
from twisted.python import log as txlog
from scrapy.http import Request, Response
from scrapy.spider import Spider
@ -11,7 +11,6 @@ from scrapy.utils.request import request_fingerprint
from scrapy.contrib.pipeline.media import MediaPipeline
from scrapy.utils.signal import disconnect_all
from scrapy import signals
from scrapy import log
def _mocked_download_func(request, info):
@ -60,26 +59,21 @@ class BaseMediaPipelineTestCase(unittest.TestCase):
fail = Failure(Exception())
results = [(True, 1), (False, fail)]
events = []
txlog.addObserver(events.append)
new_item = self.pipe.item_completed(results, item, self.info)
txlog.removeObserver(events.append)
self.flushLoggedErrors()
with LogCapture() as l:
new_item = self.pipe.item_completed(results, item, self.info)
assert new_item is item
assert len(events) == 1
assert events[0]['logLevel'] == log.ERROR
assert events[0]['failure'] is fail
assert len(l.records) == 1
record = l.records[0]
assert record.levelname == 'ERROR'
assert record.failure is fail
# disable failure logging and check again
self.pipe.LOG_FAILED_RESULTS = False
events = []
txlog.addObserver(events.append)
new_item = self.pipe.item_completed(results, item, self.info)
txlog.removeObserver(events.append)
self.flushLoggedErrors()
with LogCapture() as l:
new_item = self.pipe.item_completed(results, item, self.info)
assert new_item is item
assert len(events) == 0
assert len(l.records) == 0
@inlineCallbacks
def test_default_process_item(self):

View File

@ -5,10 +5,11 @@ import time
from threading import Thread
from libmproxy import controller, proxy
from netlib import http_auth
from testfixtures import LogCapture
from twisted.internet import defer
from twisted.trial.unittest import TestCase
from scrapy.utils.test import get_testlog, get_crawler
from scrapy.utils.test import get_crawler
from scrapy.http import Request
from tests.spiders import SimpleSpider, SingleRequestSpider
from tests.mockserver import MockServer
@ -50,39 +51,44 @@ class ProxyConnectTestCase(TestCase):
@defer.inlineCallbacks
def test_https_connect_tunnel(self):
crawler = get_crawler(SimpleSpider)
yield crawler.crawl("https://localhost:8999/status?n=200")
self._assert_got_response_code(200)
with LogCapture() as l:
yield crawler.crawl("https://localhost:8999/status?n=200")
self._assert_got_response_code(200, l)
@defer.inlineCallbacks
def test_https_noconnect(self):
os.environ['https_proxy'] = 'http://scrapy:scrapy@localhost:8888?noconnect'
crawler = get_crawler(SimpleSpider)
yield crawler.crawl("https://localhost:8999/status?n=200")
self._assert_got_response_code(200)
with LogCapture() as l:
yield crawler.crawl("https://localhost:8999/status?n=200")
self._assert_got_response_code(200, l)
os.environ['https_proxy'] = 'http://scrapy:scrapy@localhost:8888'
@defer.inlineCallbacks
def test_https_connect_tunnel_error(self):
crawler = get_crawler(SimpleSpider)
yield crawler.crawl("https://localhost:99999/status?n=200")
self._assert_got_tunnel_error()
with LogCapture() as l:
yield crawler.crawl("https://localhost:99999/status?n=200")
self._assert_got_tunnel_error(l)
@defer.inlineCallbacks
def test_https_tunnel_auth_error(self):
os.environ['https_proxy'] = 'http://wrong:wronger@localhost:8888'
crawler = get_crawler(SimpleSpider)
yield crawler.crawl("https://localhost:8999/status?n=200")
with LogCapture() as l:
yield crawler.crawl("https://localhost:8999/status?n=200")
# The proxy returns a 407 error code, but it does not reach the client;
# the client just sees a TunnelError.
self._assert_got_tunnel_error()
self._assert_got_tunnel_error(l)
os.environ['https_proxy'] = 'http://scrapy:scrapy@localhost:8888'
@defer.inlineCallbacks
def test_https_tunnel_without_leak_proxy_authorization_header(self):
request = Request("https://localhost:8999/echo")
crawler = get_crawler(SingleRequestSpider)
yield crawler.crawl(seed=request)
self._assert_got_response_code(200)
with LogCapture() as l:
yield crawler.crawl(seed=request)
self._assert_got_response_code(200, l)
echo = json.loads(crawler.spider.meta['responses'][0].body)
self.assertTrue('Proxy-Authorization' not in echo['headers'])
@ -90,13 +96,12 @@ class ProxyConnectTestCase(TestCase):
def test_https_noconnect_auth_error(self):
os.environ['https_proxy'] = 'http://wrong:wronger@localhost:8888?noconnect'
crawler = get_crawler(SimpleSpider)
yield crawler.crawl("https://localhost:8999/status?n=200")
self._assert_got_response_code(407)
with LogCapture() as l:
yield crawler.crawl("https://localhost:8999/status?n=200")
self._assert_got_response_code(407, l)
def _assert_got_response_code(self, code):
log = get_testlog()
self.assertEqual(log.count('Crawled (%d)' % code), 1)
def _assert_got_response_code(self, code, log):
self.assertEqual(str(log).count('Crawled (%d)' % code), 1)
def _assert_got_tunnel_error(self):
log = get_testlog()
self.assertEqual(log.count('TunnelError'), 1)
def _assert_got_tunnel_error(self, log):
self.assertEqual(str(log).count('TunnelError'), 1)
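The rewritten helpers above lean on ``str()`` of a ``LogCapture`` instance, which renders every captured record into one string; counting a substring is then a cheap way to assert how many times a given message was logged. A minimal illustration with an arbitrary logger and messages::

    import logging

    from testfixtures import LogCapture

    logger = logging.getLogger('demo')

    with LogCapture() as l:
        logger.info('Crawled (200) <GET http://example.com/a>')
        logger.info('Crawled (200) <GET http://example.com/b>')

    # two records mention the 200 status, none mention a TunnelError
    assert str(l).count('Crawled (200)') == 2
    assert 'TunnelError' not in str(l)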

View File

@ -2,6 +2,8 @@ import gzip
import inspect
import warnings
from io import BytesIO
from testfixtures import LogCapture
from twisted.trial import unittest
from scrapy import signals
@ -102,6 +104,23 @@ class SpiderTest(unittest.TestCase):
self.assertEqual(settings.get('TEST2'), 'spider')
self.assertEqual(settings.get('TEST3'), 'project')
def test_logger(self):
spider = self.spider_class('example.com')
with LogCapture() as l:
spider.logger.info('test log msg')
l.check(('example.com', 'INFO', 'test log msg'))
record = l.records[0]
self.assertIn('spider', record.__dict__)
self.assertIs(record.spider, spider)
def test_log(self):
spider = self.spider_class('example.com')
with mock.patch('scrapy.spider.Spider.logger') as mock_logger:
spider.log('test log msg', 'INFO')
mock_logger.log.assert_called_once_with('INFO', 'test log msg')
class InitSpiderTest(SpiderTest):
spider_class = InitSpider
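The new ``test_logger`` case above implies that each spider exposes a logger named after the spider, with the spider object itself riding along on every record. One way to get that behaviour is a ``LoggerAdapter`` property; the following is only a sketch consistent with the test's expectations, not necessarily the exact implementation introduced by this change::

    import logging

    class Spider(object):

        name = None

        def __init__(self, name=None):
            if name is not None:
                self.name = name

        @property
        def logger(self):
            # logger named after the spider; ``extra`` puts the spider on each record
            logger = logging.getLogger(self.name)
            return logging.LoggerAdapter(logger, {'spider': self})

        def log(self, message, level=logging.DEBUG, **kw):
            # backward-compatible helper delegating to ``self.logger``
            # (matches the ``mock_logger.log.assert_called_once_with`` check above)
            self.logger.log(level, message, **kw)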

View File

@ -1,9 +1,10 @@
from unittest import TestCase
from testfixtures import LogCapture
from twisted.trial.unittest import TestCase as TrialTestCase
from twisted.internet import defer
from scrapy.utils.test import get_crawler, get_testlog
from scrapy.utils.test import get_crawler
from tests.mockserver import MockServer
from scrapy.http import Response, Request
from scrapy.spider import Spider
@ -174,14 +175,13 @@ class TestHttpErrorMiddlewareIntegrational(TrialTestCase):
@defer.inlineCallbacks
def test_logging(self):
crawler = get_crawler(_HttpErrorSpider)
yield crawler.crawl(bypass_status_codes={402})
# print(get_testlog())
with LogCapture() as log:
yield crawler.crawl(bypass_status_codes={402})
self.assertEqual(crawler.spider.parsed, {'200', '402'})
self.assertEqual(crawler.spider.skipped, {'402'})
self.assertEqual(crawler.spider.failed, {'404', '500'})
log = get_testlog()
self.assertIn('Ignoring response <404', log)
self.assertIn('Ignoring response <500', log)
self.assertNotIn('Ignoring response <200', log)
self.assertNotIn('Ignoring response <402', log)
self.assertIn('Ignoring response <404', str(log))
self.assertIn('Ignoring response <500', str(log))
self.assertNotIn('Ignoring response <200', str(log))
self.assertNotIn('Ignoring response <402', str(log))

View File

@ -74,7 +74,6 @@ class DeferUtilsTest(unittest.TestCase):
def test_process_parallel_failure(self):
d = process_parallel([cb1, cb_fail, cb3], 'res', 'v1', 'v2')
self.failUnlessFailure(d, TypeError)
self.flushLoggedErrors()
return d

tests/test_utils_log.py Normal file (132 lines added)
View File

@ -0,0 +1,132 @@
# -*- coding: utf-8 -*-
from __future__ import print_function
import os
import sys
import logging
import unittest
from testfixtures import LogCapture
from twisted.python.failure import Failure
from scrapy.utils.log import (FailureFormatter, TopLevelFormatter,
LogCounterHandler, StreamLogger)
from scrapy.utils.test import get_crawler
class FailureFormatterTest(unittest.TestCase):
def setUp(self):
self.logger = logging.getLogger('test')
self.filter = FailureFormatter()
self.logger.addFilter(self.filter)
def tearDown(self):
self.logger.removeFilter(self.filter)
def test_failure_format(self):
with LogCapture() as l:
try:
0/0
except ZeroDivisionError:
self.logger.exception('test log msg')
failure = Failure()
self.logger.error('test log msg', extra={'failure': failure})
self.assertEqual(len(l.records), 2)
exc_record, failure_record = l.records
self.assertTupleEqual(failure_record.exc_info, exc_record.exc_info)
formatter = logging.Formatter()
self.assertMultiLineEqual(formatter.format(failure_record),
formatter.format(exc_record))
def test_non_failure_format(self):
with LogCapture() as l:
self.logger.error('test log msg', extra={'failure': 3})
self.assertEqual(len(l.records), 1)
self.assertMultiLineEqual(l.records[0].getMessage(),
'test log msg' + os.linesep + '3')
class TopLevelFormatterTest(unittest.TestCase):
def setUp(self):
self.handler = LogCapture()
self.handler.addFilter(TopLevelFormatter(['test']))
def test_top_level_logger(self):
logger = logging.getLogger('test')
with self.handler as l:
logger.warning('test log msg')
l.check(('test', 'WARNING', 'test log msg'))
def test_children_logger(self):
logger = logging.getLogger('test.test1')
with self.handler as l:
logger.warning('test log msg')
l.check(('test', 'WARNING', 'test log msg'))
def test_overlapping_name_logger(self):
logger = logging.getLogger('test2')
with self.handler as l:
logger.warning('test log msg')
l.check(('test2', 'WARNING', 'test log msg'))
def test_different_name_logger(self):
logger = logging.getLogger('different')
with self.handler as l:
logger.warning('test log msg')
l.check(('different', 'WARNING', 'test log msg'))
class LogCounterHandlerTest(unittest.TestCase):
def setUp(self):
self.logger = logging.getLogger('test')
self.logger.setLevel(logging.NOTSET)
self.logger.propagate = False
self.crawler = get_crawler(settings_dict={'LOG_LEVEL': 'WARNING'})
self.handler = LogCounterHandler(self.crawler)
self.logger.addHandler(self.handler)
def tearDown(self):
self.logger.propagate = True
self.logger.removeHandler(self.handler)
def test_init(self):
self.assertIsNone(self.crawler.stats.get_value('log_count/DEBUG'))
self.assertIsNone(self.crawler.stats.get_value('log_count/INFO'))
self.assertIsNone(self.crawler.stats.get_value('log_count/WARNING'))
self.assertIsNone(self.crawler.stats.get_value('log_count/ERROR'))
self.assertIsNone(self.crawler.stats.get_value('log_count/CRITICAL'))
def test_accepted_level(self):
self.logger.error('test log msg')
self.assertEqual(self.crawler.stats.get_value('log_count/ERROR'), 1)
def test_filtered_out_level(self):
self.logger.debug('test log msg')
self.assertIsNone(self.crawler.stats.get_value('log_count/INFO'))
class StreamLoggerTest(unittest.TestCase):
def setUp(self):
self.stdout = sys.stdout
logger = logging.getLogger('test')
logger.setLevel(logging.WARNING)
sys.stdout = StreamLogger(logger, logging.ERROR)
def tearDown(self):
sys.stdout = self.stdout
def test_redirect(self):
with LogCapture() as l:
print('test log msg')
l.check(('test', 'ERROR', 'test log msg'))
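``StreamLoggerTest`` above swaps ``sys.stdout`` for a ``StreamLogger``, i.e. a file-like wrapper that forwards anything written to it to a logger at a fixed level. A minimal sketch of such a wrapper, written to satisfy the ``test_redirect`` expectations rather than to mirror the exact Scrapy class::

    import logging
    import sys

    class StreamLogger(object):
        """File-like object that redirects writes to a logger."""

        def __init__(self, logger, log_level=logging.INFO):
            self.logger = logger
            self.log_level = log_level

        def write(self, buf):
            # ignore bare newlines; log each non-empty line as one record
            for line in buf.rstrip().splitlines():
                self.logger.log(self.log_level, line.rstrip())

        def flush(self):
            pass

    # usage mirroring the test: anything printed ends up logged at ERROR
    sys.stdout = StreamLogger(logging.getLogger('test'), logging.ERROR)
    print('test log msg')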

View File

@ -1,11 +1,11 @@
from testfixtures import LogCapture
from twisted.trial import unittest
from twisted.python import log as txlog
from twisted.python.failure import Failure
from twisted.internet import defer, reactor
from scrapy.xlib.pydispatch import dispatcher
from scrapy.utils.signal import send_catch_log, send_catch_log_deferred
from scrapy import log
class SendCatchLogTest(unittest.TestCase):
@ -14,26 +14,24 @@ class SendCatchLogTest(unittest.TestCase):
test_signal = object()
handlers_called = set()
def log_received(event):
handlers_called.add(log_received)
assert "error_handler" in event['message'][0]
assert event['logLevel'] == log.ERROR
txlog.addObserver(log_received)
dispatcher.connect(self.error_handler, signal=test_signal)
dispatcher.connect(self.ok_handler, signal=test_signal)
result = yield defer.maybeDeferred(self._get_result, test_signal, arg='test', \
handlers_called=handlers_called)
with LogCapture() as l:
result = yield defer.maybeDeferred(
self._get_result, test_signal, arg='test',
handlers_called=handlers_called
)
assert self.error_handler in handlers_called
assert self.ok_handler in handlers_called
assert log_received in handlers_called
self.assertEqual(len(l.records), 1)
record = l.records[0]
self.assertIn('error_handler', record.getMessage())
self.assertEqual(record.levelname, 'ERROR')
self.assertEqual(result[0][0], self.error_handler)
self.assert_(isinstance(result[0][1], Failure))
self.assertEqual(result[1], (self.ok_handler, "OK"))
txlog.removeObserver(log_received)
self.flushLoggedErrors()
dispatcher.disconnect(self.error_handler, signal=test_signal)
dispatcher.disconnect(self.ok_handler, signal=test_signal)
@ -73,12 +71,9 @@ class SendCatchLogTest2(unittest.TestCase):
def test_error_logged_if_deferred_not_supported(self):
test_signal = object()
test_handler = lambda: defer.Deferred()
log_events = []
txlog.addObserver(log_events.append)
dispatcher.connect(test_handler, test_signal)
send_catch_log(test_signal)
self.assertTrue(log_events)
self.assertIn("Cannot return deferreds from signal handler", str(log_events))
txlog.removeObserver(log_events.append)
self.flushLoggedErrors()
with LogCapture() as l:
send_catch_log(test_signal)
self.assertEqual(len(l.records), 1)
self.assertIn("Cannot return deferreds from signal handler", str(l))
dispatcher.disconnect(test_handler, test_signal)

View File

@ -51,6 +51,7 @@ deps =
# tests requirements
pytest>=2.6.0
pytest-twisted
testfixtures
[testenv:py34]
basepython = python3.4