
Merge remote-tracking branch 'origin/master' into callback_kwargs

Eugenio Lacuesta 2019-06-26 11:03:31 -03:00
commit 428309ba1a
61 changed files with 2194 additions and 189 deletions


@ -12,7 +12,8 @@ branches:
install:
  - "SET PATH=%PYTHON%;%PYTHON%\\Scripts;%PATH%"
-  - "SET TOX_TESTENV_PASSENV=HOME USERPROFILE HOMEPATH HOMEDRIVE"
+  - "SET PYTHONPATH=%APPVEYOR_BUILD_FOLDER%"
+  - "SET TOX_TESTENV_PASSENV=HOME HOMEDRIVE HOMEPATH PYTHONPATH USERPROFILE"
  - "pip install -U tox"
build: false


@ -82,6 +82,9 @@ pydoc-topics: build
@echo "Building finished; now copy build/pydoc-topics/pydoc_topics.py " \ @echo "Building finished; now copy build/pydoc-topics/pydoc_topics.py " \
"into the Lib/ directory" "into the Lib/ directory"
coverage: BUILDER = coverage
coverage: build
htmlview: html htmlview: html
$(PYTHON) -c "import webbrowser, os; webbrowser.open('file://' + \ $(PYTHON) -c "import webbrowser, os; webbrowser.open('file://' + \
os.path.realpath('build/html/index.html'))" os.path.realpath('build/html/index.html'))"


@ -28,7 +28,8 @@ sys.path.insert(0, path.dirname(path.dirname(__file__)))
# coming with Sphinx (named 'sphinx.ext.*') or your custom ones.
extensions = [
    'scrapydocs',
-    'sphinx.ext.autodoc'
+    'sphinx.ext.autodoc',
+    'sphinx.ext.coverage',
]
# Add any paths that contain templates here, relative to this directory.
@ -218,3 +219,22 @@ linkcheck_ignore = [
    'http://localhost:\d+', 'http://hg.scrapy.org',
    'http://directory.google.com/'
]
# Options for the Coverage extension
# ----------------------------------
coverage_ignore_pyobjects = [
# Contracts add_pre_hook and add_post_hook are not documented because
# they should be transparent to contract developers, for whom pre_hook and
# post_hook should be the actual concern.
r'\bContract\.add_(pre|post)_hook$',
# ContractsManager is an internal class, developers are not expected to
# interact with it directly in any way.
r'\bContractsManager\b$',
# For default contracts we only want to document their general purpose in
# their constructor, the methods they reimplement to achieve that purpose
# should be irrelevant to developers using those contracts.
r'\w+Contract\.(adjust_request_args|(pre|post)_process)$',
]


@ -99,6 +99,15 @@ Well-written patches should:
  the documentation changes in the same patch. See `Documentation policies`_
  below.
* if you're adding a private API, please add a regular expression to the
``coverage_ignore_pyobjects`` variable of ``docs/conf.py`` to exclude the new
private API from documentation coverage checks.
To see if your private API is skipped properly, generate a documentation
coverage report as follows::
tox -e docs-coverage
.. _submitting-patches:

Submitting patches
@ -167,8 +176,9 @@ Documentation policies
For reference documentation of API members (classes, methods, etc.) use
docstrings and make sure that the Sphinx documentation uses the autodoc_
-extension to pull the docstrings. API reference documentation should be
-IDE-friendly: short, to the point, and it may provide short examples.
+extension to pull the docstrings. API reference documentation should follow
+docstring conventions (`PEP 257`_) and be IDE-friendly: short, to the point,
+and it may provide short examples.

Other types of documentation, such as tutorials or topics, should be covered in
files within the ``docs/`` directory. This includes documentation that is
@ -205,6 +215,29 @@ To run a specific test (say ``tests/test_loader.py``) use:
``tox -- tests/test_loader.py``
To run the tests on a specific tox_ environment, use ``-e <name>`` with an
environment name from ``tox.ini``. For example, to run the tests with Python
3.6 use::
tox -e py36
You can also specify a comma-separated list of environments, and use `tox's
parallel mode`_ to run the tests on multiple environments in parallel::
tox -e py27,py36 -p auto
To pass command-line options to pytest_, add them after ``--`` in your call to
tox_. Using ``--`` overrides the default positional arguments defined in
``tox.ini``, so you must include those default positional arguments
(``scrapy tests``) after ``--`` as well::
tox -- scrapy tests -x # stop after first failure
You can also use the `pytest-xdist`_ plugin. For example, to run all tests on
the Python 3.6 tox_ environment using all your CPU cores::
tox -e py36 -- scrapy tests -n auto
To see coverage report install `coverage`_ (``pip install coverage``) and run:

``coverage report``
@ -237,5 +270,9 @@ And their unit-tests are in::
.. _AUTHORS: https://github.com/scrapy/scrapy/blob/master/AUTHORS
.. _tests/: https://github.com/scrapy/scrapy/tree/master/tests
.. _open issues: https://github.com/scrapy/scrapy/issues
-.. _pull request: https://help.github.com/send-pull-requests/
+.. _PEP 257: https://www.python.org/dev/peps/pep-0257/
+.. _pull request: https://help.github.com/en/articles/creating-a-pull-request
+.. _pytest: https://docs.pytest.org/en/latest/usage.html
+.. _pytest-xdist: https://docs.pytest.org/en/3.0.0/xdist.html
.. _tox: https://pypi.python.org/pypi/tox
+.. _tox's parallel mode: https://tox.readthedocs.io/en/latest/example/basic.html#parallel-mode


@ -158,6 +158,7 @@ Solving specific problems
   topics/practices
   topics/broad-crawls
   topics/developer-tools
   topics/dynamic-content
   topics/leaks
   topics/media-pipeline
   topics/deploy
@ -183,6 +184,9 @@ Solving specific problems
:doc:`topics/developer-tools`
    Learn how to scrape with your browser's developer tools.

:doc:`topics/dynamic-content`
    Read webpage data that is loaded dynamically.

:doc:`topics/leaks`
    Learn how to find and get rid of memory leaks in your crawler.


@ -205,7 +205,7 @@ Extracting data
---------------

The best way to learn how to extract data with Scrapy is trying selectors
-using the shell :ref:`Scrapy shell <topics-shell>`. Run::
+using the :ref:`Scrapy shell <topics-shell>`. Run::

    scrapy shell 'http://quotes.toscrape.com/page/1/'
@ -296,8 +296,8 @@ expressions`_::
In order to find the proper CSS selectors to use, you might find useful opening
the response page from the shell in your web browser using ``view(response)``.
-You can use your browser developer tools to inspect the HTML and come up
-with a selector (see section about :ref:`topics-developer-tools`).
+You can use your browser's developer tools to inspect the HTML and come up
+with a selector (see :ref:`topics-developer-tools`).

`Selector Gadget`_ is also a nice tool to quickly find CSS selector for
visually selected elements, which works in many browsers.
@ -379,11 +379,11 @@ variable, so that we can run our CSS selectors directly on a particular quote::
    >>> quote = response.css("div.quote")[0]

-Now, let's extract ``title``, ``author`` and the ``tags`` from that quote
+Now, let's extract ``text``, ``author`` and the ``tags`` from that quote
using the ``quote`` object we just created::

-    >>> title = quote.css("span.text::text").get()
-    >>> title
+    >>> text = quote.css("span.text::text").get()
+    >>> text
    '“The world as we have created it is a process of our thinking. It cannot be changed without changing our thinking.”'
    >>> author = quote.css("small.author::text").get()
    >>> author
@ -511,7 +511,7 @@ We can try extracting it in the shell::
    '<a href="/page/2/">Next <span aria-hidden="true">→</span></a>'

This gets the anchor element, but we want the attribute ``href``. For that,
-Scrapy supports a CSS extension that let's you select the attribute contents,
+Scrapy supports a CSS extension that lets you select the attribute contents,
like this::

    >>> response.css('li.next a::attr(href)').get()


@ -1,2 +1,2 @@
-Sphinx>=1.6
+Sphinx>=2.1
sphinx_rtd_theme


@ -99,6 +99,8 @@ how you :ref:`configure the downloader middlewares
        Returns a deferred that is fired when the crawl is finished.

    .. automethod:: stop

.. autoclass:: CrawlerRunner
   :members:
@ -154,7 +156,7 @@ Settings API
SpiderLoader API
================

-.. module:: scrapy.loader
+.. module:: scrapy.spiderloader
   :synopsis: The spider loader

.. class:: SpiderLoader


@ -39,6 +39,17 @@ you need to keep in mind when using Scrapy for doing broad crawls, along with
concrete suggestions of Scrapy settings to tune in order to achieve an
efficient broad crawl.
Use the right :setting:`SCHEDULER_PRIORITY_QUEUE`
=================================================
Scrapy's default scheduler priority queue is ``'scrapy.pqueues.ScrapyPriorityQueue'``.
It works best during single-domain crawls. It does not work well with crawling
many different domains in parallel.

To apply the recommended priority queue use::

    SCHEDULER_PRIORITY_QUEUE = 'scrapy.pqueues.DownloaderAwarePriorityQueue'
Increase concurrency
====================


@ -120,3 +120,23 @@ get the failures pretty printed::
            for header in self.args:
                if header not in response.headers:
                    raise ContractFail('X-CustomHeader not present')
Detecting check runs
====================
When ``scrapy check`` is running, the ``SCRAPY_CHECK`` environment variable is
set to the ``true`` string. You can use `os.environ`_ to perform any change to
your spiders or your settings when ``scrapy check`` is used::
    import os
    import scrapy

    class ExampleSpider(scrapy.Spider):
        name = 'example'

        def __init__(self):
            if os.environ.get('SCRAPY_CHECK'):
                pass  # Do some scraper adjustments when a check is running
.. _os.environ: https://docs.python.org/3/library/os.html#os.environ


@ -805,6 +805,7 @@ The :class:`MetaRefreshMiddleware` can be configured through the following
settings (see the settings documentation for more info):

* :setting:`METAREFRESH_ENABLED`
* :setting:`METAREFRESH_IGNORE_TAGS`
* :setting:`METAREFRESH_MAXDELAY`

This middleware obey :setting:`REDIRECT_MAX_TIMES` setting, :reqmeta:`dont_redirect`,
@ -826,6 +827,15 @@ Default: ``True``
Whether the Meta Refresh middleware will be enabled.
.. setting:: METAREFRESH_IGNORE_TAGS
METAREFRESH_IGNORE_TAGS
^^^^^^^^^^^^^^^^^^^^^^^
Default: ``['script', 'noscript']``
Meta tags within these tags are ignored.
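For example, a project that also wants to honour meta refresh tags found inside
``<noscript>`` elements could override the default in its ``settings.py`` (the
values below are purely illustrative, not recommended defaults)::

    METAREFRESH_ENABLED = True
    METAREFRESH_IGNORE_TAGS = ['script']  # keep ignoring <script>, stop ignoring <noscript>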
.. setting:: METAREFRESH_MAXDELAY

METAREFRESH_MAXDELAY


@ -0,0 +1,246 @@
.. _topics-dynamic-content:
====================================
Selecting dynamically-loaded content
====================================
Some webpages show the desired data when you load them in a web browser.
However, when you download them using Scrapy, you cannot reach the desired data
using :ref:`selectors <topics-selectors>`.
When this happens, the recommended approach is to
:ref:`find the data source <topics-finding-data-source>` and extract the data
from it.
If you fail to do that, and you can nonetheless access the desired data through
the :ref:`DOM <topics-livedom>` from your web browser, see
:ref:`topics-javascript-rendering`.
.. _topics-finding-data-source:
Finding the data source
=======================
To extract the desired data, you must first find its source location.
If the data is in a non-text-based format, such as an image or a PDF document,
use the :ref:`network tool <topics-network-tool>` of your web browser to find
the corresponding request, and :ref:`reproduce it
<topics-reproducing-requests>`.
If your web browser lets you select the desired data as text, the data may be
defined in embedded JavaScript code, or loaded from an external resource in a
text-based format.
In that case, you can use a tool like wgrep_ to find the URL of that resource.
If the data turns out to come from the original URL itself, you must
:ref:`inspect the source code of the webpage <topics-inspecting-source>` to
determine where the data is located.
If the data comes from a different URL, you will need to :ref:`reproduce the
corresponding request <topics-reproducing-requests>`.
.. _topics-inspecting-source:
Inspecting the source code of a webpage
=======================================
Sometimes you need to inspect the source code of a webpage (not the
:ref:`DOM <topics-livedom>`) to determine where some desired data is located.
Use Scrapy's :command:`fetch` command to download the webpage contents as seen
by Scrapy::
scrapy fetch --nolog https://example.com > response.html
If the desired data is in embedded JavaScript code within a ``<script/>``
element, see :ref:`topics-parsing-javascript`.
If you cannot find the desired data, first make sure it's not just Scrapy:
download the webpage with an HTTP client like curl_ or wget_ and see if the
information can be found in the response they get.
If they get a response with the desired data, modify your Scrapy
:class:`~scrapy.http.Request` to match that of the other HTTP client. For
example, try using the same user-agent string (:setting:`USER_AGENT`) or the
same :attr:`~scrapy.http.Request.headers`.
If they also get a response without the desired data, you'll need to take
steps to make your request more similar to that of the web browser. See
:ref:`topics-reproducing-requests`.
.. _topics-reproducing-requests:
Reproducing requests
====================
Sometimes we need to reproduce a request the way our web browser performs it.
Use the :ref:`network tool <topics-network-tool>` of your web browser to see
how your web browser performs the desired request, and try to reproduce that
request with Scrapy.
It might be enough to yield a :class:`~scrapy.http.Request` with the same HTTP
method and URL. However, you may also need to reproduce the body, headers and
form parameters (see :class:`~scrapy.http.FormRequest`) of that request.
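As a minimal sketch of such a reproduction (the URL, headers and form fields
below are placeholders; copy the real values from your browser's network
tool)::

    import scrapy

    class ReproduceSpider(scrapy.Spider):
        name = 'reproduce'  # illustrative spider name

        def start_requests(self):
            # Values copied from the browser's network tool (placeholders here)
            yield scrapy.FormRequest(
                url='https://example.com/api/search',
                headers={
                    'User-Agent': 'Mozilla/5.0 ...',
                    'Accept': 'application/json',
                },
                formdata={'query': 'foo', 'page': '1'},
                callback=self.parse_api,
            )

        def parse_api(self, response):
            # Once this matches what the browser receives, extract the data
            yield {'raw': response.text}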
Once you get the expected response, you can :ref:`extract the desired data from
it <topics-handling-response-formats>`.
You can reproduce any request with Scrapy. However, sometimes reproducing all
necessary requests may not seem efficient in developer time. If that is your
case, and crawling speed is not a major concern for you, you can alternatively
consider :ref:`JavaScript pre-rendering <topics-javascript-rendering>`.
If you get the expected response `sometimes`, but not always, the issue is
probably not your request, but the target server. The target server might be
buggy, overloaded, or :ref:`banning <bans>` some of your requests.
.. _topics-handling-response-formats:
Handling different response formats
===================================
Once you have a response with the desired data, how you extract the desired
data from it depends on the type of response:
- If the response is HTML or XML, use :ref:`selectors
<topics-selectors>` as usual.
- If the response is JSON, use `json.loads`_ to load the desired data from
:attr:`response.text <scrapy.http.TextResponse.text>`::
data = json.loads(response.text)
If the desired data is inside HTML or XML code embedded within JSON data,
you can load that HTML or XML code into a
:class:`~scrapy.selector.Selector` and then
:ref:`use it <topics-selectors>` as usual::
selector = Selector(data['html'])
- If the response is JavaScript, or HTML with a ``<script/>`` element
containing the desired data, see :ref:`topics-parsing-javascript`.
- If the response is CSS, use a `regular expression`_ to extract the desired
data from :attr:`response.text <scrapy.http.TextResponse.text>`.
.. _topics-parsing-images:
- If the response is an image or another format based on images (e.g. PDF),
read the response as bytes from
:attr:`response.body <scrapy.http.TextResponse.body>` and use an OCR
solution to extract the desired data as text.
For example, you can use pytesseract_. To read a table from a PDF,
`tabula-py`_ may be a better choice.
- If the response is SVG, or HTML with embedded SVG containing the desired
data, you may be able to extract the desired data using
:ref:`selectors <topics-selectors>`, since SVG is based on XML.
Otherwise, you might need to convert the SVG code into a raster image, and
:ref:`handle that raster image <topics-parsing-images>`.
.. _topics-parsing-javascript:
Parsing JavaScript code
=======================
If the desired data is hardcoded in JavaScript, you first need to get the
JavaScript code:
- If the JavaScript code is in a JavaScript file, simply read
:attr:`response.text <scrapy.http.TextResponse.text>`.
- If the JavaScript code is within a ``<script/>`` element of an HTML page,
use :ref:`selectors <topics-selectors>` to extract the text within that
``<script/>`` element.
Once you have a string with the JavaScript code, you can extract the desired
data from it:
- You might be able to use a `regular expression`_ to extract the desired
data in JSON format, which you can then parse with `json.loads`_.
For example, if the JavaScript code contains a separate line like
``var data = {"field": "value"};`` you can extract that data as follows::
>>> pattern = r'\bvar\s+data\s*=\s*(\{.*?\})\s*;\s*\n'
>>> json_data = response.css('script::text').re_first(pattern)
>>> json.loads(json_data)
{'field': 'value'}
- Otherwise, use js2xml_ to convert the JavaScript code into an XML document
that you can parse using :ref:`selectors <topics-selectors>`.
For example, if the JavaScript code contains
``var data = {field: "value"};`` you can extract that data as follows::
>>> import js2xml
>>> import lxml.etree
>>> from parsel import Selector
>>> javascript = response.css('script::text').get()
>>> xml = lxml.etree.tostring(js2xml.parse(javascript), encoding='unicode')
>>> selector = Selector(text=xml)
>>> selector.css('var[name="data"]').get()
'<var name="data"><object><property name="field"><string>value</string></property></object></var>'
.. _topics-javascript-rendering:
Pre-rendering JavaScript
========================
On webpages that fetch data from additional requests, reproducing those
requests that contain the desired data is the preferred approach. The effort is
often worth the result: structured, complete data with minimum parsing time and
network transfer.
However, sometimes it can be really hard to reproduce certain requests. Or you
may need something that no request can give you, such as a screenshot of a
webpage as seen in a web browser.
In these cases use the Splash_ JavaScript-rendering service, along with
`scrapy-splash`_ for seamless integration.
Splash returns as HTML the :ref:`DOM <topics-livedom>` of a webpage, so that
you can parse it with :ref:`selectors <topics-selectors>`. It provides great
flexibility through configuration_ or scripting_.
If you need something beyond what Splash offers, such as interacting with the
DOM on-the-fly from Python code instead of using a previously-written script,
or handling multiple web browser windows, you might need to
:ref:`use a headless browser <topics-headless-browsing>` instead.
.. _configuration: https://splash.readthedocs.io/en/stable/api.html
.. _scripting: https://splash.readthedocs.io/en/stable/scripting-tutorial.html
.. _topics-headless-browsing:
Using a headless browser
========================
A `headless browser`_ is a special web browser that provides an API for
automation.
The easiest way to use a headless browser with Scrapy is to use Selenium_,
along with `scrapy-selenium`_ for seamless integration.
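As a rough sketch of this route with plain Selenium (independent of
scrapy-selenium, and assuming a Firefox driver is available), the rendered DOM
can be handed to a regular Scrapy selector::

    from selenium import webdriver
    from scrapy.selector import Selector

    options = webdriver.FirefoxOptions()
    options.headless = True   # run the browser without a visible window
    driver = webdriver.Firefox(options=options)
    try:
        driver.get('https://example.com')             # placeholder URL
        selector = Selector(text=driver.page_source)  # DOM after JavaScript ran
        titles = selector.css('h1::text').getall()
    finally:
        driver.quit()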
.. _AJAX: https://en.wikipedia.org/wiki/Ajax_%28programming%29
.. _CSS: https://en.wikipedia.org/wiki/Cascading_Style_Sheets
.. _curl: https://curl.haxx.se/
.. _headless browser: https://en.wikipedia.org/wiki/Headless_browser
.. _JavaScript: https://en.wikipedia.org/wiki/JavaScript
.. _js2xml: https://github.com/scrapinghub/js2xml
.. _json.loads: https://docs.python.org/library/json.html#json.loads
.. _pytesseract: https://github.com/madmaze/pytesseract
.. _regular expression: https://docs.python.org/library/re.html
.. _scrapy-selenium: https://github.com/clemfromspace/scrapy-selenium
.. _scrapy-splash: https://github.com/scrapy-plugins/scrapy-splash
.. _Selenium: https://www.seleniumhq.org/
.. _Splash: https://github.com/scrapinghub/splash
.. _tabula-py: https://github.com/chezou/tabula-py
.. _wget: https://www.gnu.org/software/wget/
.. _wgrep: https://github.com/stav/wgrep


@ -238,9 +238,10 @@ scrapy.utils.log module
.. autofunction:: configure_logging

-    ``configure_logging`` is automatically called when using Scrapy commands,
-    but needs to be called explicitly when running custom scripts. In that
-    case, its usage is not required but it's recommended.
+    ``configure_logging`` is automatically called when using Scrapy commands
+    or :class:`~scrapy.crawler.CrawlerProcess`, but needs to be called explicitly
+    when running custom scripts using :class:`~scrapy.crawler.CrawlerRunner`.
+    In that case, its usage is not required but it's recommended.

    If you plan on configuring the handlers yourself is still recommended you
    call this function, passing ``install_root_handler=False``. Bear in mind
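A minimal stand-alone script built on :class:`~scrapy.crawler.CrawlerRunner`
would therefore call it by hand; the sketch below is only an illustration and
assumes ``MySpider`` is defined elsewhere in your project::

    from twisted.internet import reactor
    from scrapy.crawler import CrawlerRunner
    from scrapy.utils.log import configure_logging

    configure_logging({'LOG_FORMAT': '%(levelname)s: %(message)s'})
    runner = CrawlerRunner()
    d = runner.crawl(MySpider)           # MySpider defined elsewhere
    d.addBoth(lambda _: reactor.stop())  # stop the reactor when the crawl ends
    reactor.run()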


@ -897,6 +897,16 @@ Default: ``False``
If ``True``, the logs will just contain the root path. If it is set to ``False``
then it displays the component responsible for the log output
.. setting:: LOGSTATS_INTERVAL
LOGSTATS_INTERVAL
-----------------
Default: ``60.0``
The interval (in seconds) between each logging printout of the stats
by :class:`~extensions.logstats.LogStats`.
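For example, to log the periodic stats line twice as often as the default, a
project could set an illustrative value in its ``settings.py``::

    LOGSTATS_INTERVAL = 30.0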
.. setting:: MEMDEBUG_ENABLED

MEMDEBUG_ENABLED
@ -1155,9 +1165,14 @@ Type of in-memory queue used by scheduler. Other available type is:
SCHEDULER_PRIORITY_QUEUE
------------------------

-Default: ``'queuelib.PriorityQueue'``
+Default: ``'scrapy.pqueues.ScrapyPriorityQueue'``

-Type of priority queue used by scheduler.
+Type of priority queue used by the scheduler. Another available type is
+``scrapy.pqueues.DownloaderAwarePriorityQueue``.

``scrapy.pqueues.DownloaderAwarePriorityQueue`` works better than
``scrapy.pqueues.ScrapyPriorityQueue`` when you crawl many different
domains in parallel. But currently ``scrapy.pqueues.DownloaderAwarePriorityQueue``
does not work together with :setting:`CONCURRENT_REQUESTS_PER_IP`.
.. setting:: SPIDER_CONTRACTS .. setting:: SPIDER_CONTRACTS


@ -82,7 +82,8 @@ object gives you access, for example, to the :ref:`settings <topics-settings>`.
        If it raises an exception, Scrapy won't bother calling any other spider
        middleware :meth:`process_spider_input` and will call the request
-       errback. The output of the errback is chained back in the other
+       errback if there is one, otherwise it will start the :meth:`process_spider_exception`
+       chain. The output of the errback is chained back in the other
        direction for :meth:`process_spider_output` to process it, or
        :meth:`process_spider_exception` if it raised an exception.
@ -116,8 +117,8 @@ object gives you access, for example, to the :ref:`settings <topics-settings>`.
    .. method:: process_spider_exception(response, exception, spider)

-       This method is called when a spider or :meth:`process_spider_input`
-       method (from other spider middleware) raises an exception.
+       This method is called when a spider or :meth:`process_spider_output`
+       method (from a previous spider middleware) raises an exception.

        :meth:`process_spider_exception` should return either ``None`` or an
        iterable of :class:`~scrapy.http.Request`, dict or
@ -129,7 +130,8 @@ object gives you access, for example, to the :ref:`settings <topics-settings>`.
        exception reaches the engine (where it's logged and discarded).

        If it returns an iterable the :meth:`process_spider_output` pipeline
-       kicks in, and no other :meth:`process_spider_exception` will be called.
+       kicks in, starting from the next spider middleware, and no other
+       :meth:`process_spider_exception` will be called.

        :param response: the response being processed when the exception was
            raised
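As an illustrative sketch of the recovery behaviour described above (the
middleware name and the chosen exception type are made up for the example), a
middleware can turn an exception into a replacement iterable::

    class RecoverBrokenPageMiddleware(object):
        """Hypothetical middleware: swallow parsing errors and emit a marker item."""

        def process_spider_exception(self, response, exception, spider):
            if isinstance(exception, ValueError):
                spider.logger.warning('Recovering from %r on %s', exception, response.url)
                # Returning an iterable hands control to the remaining
                # process_spider_output chain instead of propagating the failure
                return [{'url': response.url, 'parse_error': str(exception)}]
            return None  # let other middlewares (or the engine) handle it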


@ -402,10 +402,12 @@ Crawling rules
    of links extracted from each response using the specified ``link_extractor``.
    This is mainly used for filtering purposes.

-   ``process_request`` is a callable, or a string (in which case a method from
-   the spider object with that name will be used) which will be called with
-   every request extracted by this rule, and must return a request or None (to
-   filter out the request).
+   ``process_request`` is a callable (or a string, in which case a method from
+   the spider object with that name will be used) which will be called for every
+   :class:`~scrapy.http.Request` extracted by this rule. This callable should
+   take said request as first argument and the :class:`~scrapy.http.Response`
+   from which the request originated as second argument. It must return a
+   ``Request`` object or ``None`` (to filter out the request).
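A short sketch of a rule using the new two-argument callable (all names and URL
patterns below are illustrative only)::

    from scrapy.linkextractors import LinkExtractor
    from scrapy.spiders import CrawlSpider, Rule

    class ExampleCrawlSpider(CrawlSpider):
        name = 'example_crawl'                  # illustrative
        start_urls = ['https://example.com']

        def tag_origin(self, request, response):
            # Drop unwanted requests, or annotate them with their origin page
            if 'logout' in request.url:
                return None                     # filter the request out
            request.meta['origin_url'] = response.url
            return request

        rules = (
            Rule(LinkExtractor(allow=r'/items/'), callback='parse_item',
                 process_request='tag_origin'),
        )

        def parse_item(self, response):
            yield {'url': response.url, 'origin': response.meta.get('origin_url')}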
CrawlSpider example
~~~~~~~~~~~~~~~~~~~
@ -655,7 +657,7 @@ SitemapSpider
    .. attribute:: sitemap_follow

-       A list of regexes of sitemap that should be followed. This is is only
+       A list of regexes of sitemap that should be followed. This is only
        for sites that use `Sitemap index files`_ that point to other sitemap
        files.


@ -75,8 +75,7 @@ available in Scrapy which extend the basic Stats Collector. You can select
which Stats Collector to use through the :setting:`STATS_CLASS` setting. The
default Stats Collector used is the :class:`MemoryStatsCollector`.

-.. module:: scrapy.statscollectors
-   :synopsis: Stats Collectors
+.. currentmodule:: scrapy.statscollectors

MemoryStatsCollector
--------------------


@ -1,12 +1,11 @@
.. currentmodule:: scrapy.extensions.telnet

.. _topics-telnetconsole:

==============
Telnet Console
==============

-.. module:: scrapy.extensions.telnet
-   :synopsis: The Telnet Console

Scrapy comes with a built-in telnet console for inspecting and controlling a
Scrapy running process. The telnet console is just a regular python shell
running inside the Scrapy process, so you can do literally anything from it.
@ -45,7 +44,7 @@ the console you need to type::
    >>>

By default Username is ``scrapy`` and Password is autogenerated. The
-autogenerated Password can be seen on scrapy logs like the example bellow::
+autogenerated Password can be seen on scrapy logs like the example below::

    2018-10-16 14:35:21 [scrapy.extensions.telnet] INFO: Telnet Password: 16f92501e8a59326


@ -6,7 +6,7 @@ from unittest import TextTestRunner, TextTestResult as _TextTestResult
from scrapy.commands import ScrapyCommand
from scrapy.contracts import ContractsManager
-from scrapy.utils.misc import load_object
+from scrapy.utils.misc import load_object, set_environ
from scrapy.utils.conf import build_component_list
@ -68,6 +68,7 @@ class Command(ScrapyCommand):
        spider_loader = self.crawler_process.spider_loader

+       with set_environ(SCRAPY_CHECK='true'):
            for spidername in args or spider_loader.list():
                spidercls = spider_loader.load(spidername)
                spidercls.start_requests = lambda s: conman.from_spider(s, result)


@ -94,7 +94,7 @@ class ContractsManager(object):
            try:
                output = cb(response)
                output = list(iterate_spider_output(output))
-           except:
+           except Exception:
                case = _create_testcase(method, 'callback')
                results.addError(case, sys.exc_info())


@ -75,6 +75,8 @@ def _get_concurrency_delay(concurrency, spider, settings):
class Downloader(object):

+   DOWNLOAD_SLOT = 'download_slot'

    def __init__(self, crawler):
        self.settings = crawler.settings
        self.signals = crawler.signals
@ -111,8 +113,8 @@ class Downloader(object):
        return key, self.slots[key]

    def _get_slot_key(self, request, spider):
-       if 'download_slot' in request.meta:
-           return request.meta['download_slot']
+       if self.DOWNLOAD_SLOT in request.meta:
+           return request.meta[self.DOWNLOAD_SLOT]
        key = urlparse_cached(request).hostname or ''
        if self.ip_concurrency:
@ -122,7 +124,7 @@ class Downloader(object):
    def _enqueue_request(self, request, spider):
        key, slot = self._get_slot(request, spider)
-       request.meta['download_slot'] = key
+       request.meta[self.DOWNLOAD_SLOT] = key

        def _deactivate(response):
            slot.active.remove(request)


@ -7,6 +7,7 @@ import six
from twisted.internet import defer

+from scrapy.exceptions import _InvalidOutput
from scrapy.http import Request, Response
from scrapy.middleware import MiddlewareManager
from scrapy.utils.defer import mustbe_deferred
@ -35,9 +36,9 @@ class DownloaderMiddlewareManager(MiddlewareManager):
        def process_request(request):
            for method in self.methods['process_request']:
                response = yield method(request=request, spider=spider)
-               assert response is None or isinstance(response, (Response, Request)), \
-                       'Middleware %s.process_request must return None, Response or Request, got %s' % \
-                       (six.get_method_self(method).__class__.__name__, response.__class__.__name__)
+               if response is not None and not isinstance(response, (Response, Request)):
+                   raise _InvalidOutput('Middleware %s.process_request must return None, Response or Request, got %s' % \
+                                        (six.get_method_self(method).__class__.__name__, response.__class__.__name__))
                if response:
                    defer.returnValue(response)
            defer.returnValue((yield download_func(request=request, spider=spider)))
@ -49,11 +50,10 @@ class DownloaderMiddlewareManager(MiddlewareManager):
                defer.returnValue(response)
            for method in self.methods['process_response']:
-               response = yield method(request=request, response=response,
-                                       spider=spider)
-               assert isinstance(response, (Response, Request)), \
-                   'Middleware %s.process_response must return Response or Request, got %s' % \
-                   (six.get_method_self(method).__class__.__name__, type(response))
+               response = yield method(request=request, response=response, spider=spider)
+               if not isinstance(response, (Response, Request)):
+                   raise _InvalidOutput('Middleware %s.process_response must return Response or Request, got %s' % \
+                                        (six.get_method_self(method).__class__.__name__, type(response)))
                if isinstance(response, Request):
                    defer.returnValue(response)
            defer.returnValue(response)
@ -62,11 +62,10 @@ class DownloaderMiddlewareManager(MiddlewareManager):
        def process_exception(_failure):
            exception = _failure.value
            for method in self.methods['process_exception']:
-               response = yield method(request=request, exception=exception,
-                                       spider=spider)
-               assert response is None or isinstance(response, (Response, Request)), \
-                   'Middleware %s.process_exception must return None, Response or Request, got %s' % \
-                   (six.get_method_self(method).__class__.__name__, type(response))
+               response = yield method(request=request, exception=exception, spider=spider)
+               if response is not None and not isinstance(response, (Response, Request)):
+                   raise _InvalidOutput('Middleware %s.process_exception must return None, Response or Request, got %s' % \
+                                        (six.get_method_self(method).__class__.__name__, type(response)))
                if response:
                    defer.returnValue(response)
            defer.returnValue(_failure)


@ -1,19 +1,46 @@
import os
import json
import logging
+import warnings
from os.path import join, exists

-from scrapy.utils.reqser import request_to_dict, request_from_dict
+from queuelib import PriorityQueue

from scrapy.utils.misc import load_object, create_instance
from scrapy.utils.job import job_dir
+from scrapy.utils.deprecate import ScrapyDeprecationWarning

logger = logging.getLogger(__name__)


class Scheduler(object):
+   """
+   Scrapy Scheduler. It allows to enqueue requests and then get
+   a next request to download. Scheduler is also handling duplication
+   filtering, via dupefilter.
+
+   Prioritization and queueing is not performed by the Scheduler.
+   User sets ``priority`` field for each Request, and a PriorityQueue
+   (defined by :setting:`SCHEDULER_PRIORITY_QUEUE`) uses these priorities
+   to dequeue requests in a desired order.
+
+   Scheduler uses two PriorityQueue instances, configured to work in-memory
+   and on-disk (optional). When on-disk queue is present, it is used by
+   default, and an in-memory queue is used as a fallback for cases where
+   a disk queue can't handle a request (can't serialize it).
+
+   :setting:`SCHEDULER_MEMORY_QUEUE` and
+   :setting:`SCHEDULER_DISK_QUEUE` allow to specify lower-level queue classes
+   which PriorityQueue instances would be instantiated with, to keep requests
+   on disk and in memory respectively.
+
+   Overall, Scheduler is an object which holds several PriorityQueue instances
+   (in-memory and on-disk) and implements fallback logic for them.
+   Also, it handles dupefilters.
+   """
    def __init__(self, dupefilter, jobdir=None, dqclass=None, mqclass=None,
-                logunser=False, stats=None, pqclass=None):
+                logunser=False, stats=None, pqclass=None, crawler=None):
        self.df = dupefilter
        self.dqdir = self._dqdir(jobdir)
        self.pqclass = pqclass
@ -21,6 +48,7 @@ class Scheduler(object):
        self.mqclass = mqclass
        self.logunser = logunser
        self.stats = stats
+       self.crawler = crawler

    @classmethod
    def from_crawler(cls, crawler):
@ -28,26 +56,35 @@ class Scheduler(object):
        dupefilter_cls = load_object(settings['DUPEFILTER_CLASS'])
        dupefilter = create_instance(dupefilter_cls, settings, crawler)
        pqclass = load_object(settings['SCHEDULER_PRIORITY_QUEUE'])
+       if pqclass is PriorityQueue:
+           warnings.warn("SCHEDULER_PRIORITY_QUEUE='queuelib.PriorityQueue'"
+                         " is no longer supported because of API changes; "
+                         "please use 'scrapy.pqueues.ScrapyPriorityQueue'",
+                         ScrapyDeprecationWarning)
+           from scrapy.pqueues import ScrapyPriorityQueue
+           pqclass = ScrapyPriorityQueue
+
        dqclass = load_object(settings['SCHEDULER_DISK_QUEUE'])
        mqclass = load_object(settings['SCHEDULER_MEMORY_QUEUE'])
-       logunser = settings.getbool('LOG_UNSERIALIZABLE_REQUESTS', settings.getbool('SCHEDULER_DEBUG'))
+       logunser = settings.getbool('LOG_UNSERIALIZABLE_REQUESTS',
+                                   settings.getbool('SCHEDULER_DEBUG'))
        return cls(dupefilter, jobdir=job_dir(settings), logunser=logunser,
-                  stats=crawler.stats, pqclass=pqclass, dqclass=dqclass, mqclass=mqclass)
+                  stats=crawler.stats, pqclass=pqclass, dqclass=dqclass,
+                  mqclass=mqclass, crawler=crawler)

    def has_pending_requests(self):
        return len(self) > 0

    def open(self, spider):
        self.spider = spider
-       self.mqs = self.pqclass(self._newmq)
+       self.mqs = self._mq()
        self.dqs = self._dq() if self.dqdir else None
        return self.df.open()

    def close(self, reason):
        if self.dqs:
-           prios = self.dqs.close()
-           with open(join(self.dqdir, 'active.json'), 'w') as f:
-               json.dump(prios, f)
+           state = self.dqs.close()
+           self._write_dqs_state(self.dqdir, state)
        return self.df.close(reason)

    def enqueue_request(self, request):
@ -82,8 +119,7 @@ class Scheduler(object):
        if self.dqs is None:
            return
        try:
-           reqd = request_to_dict(request, self.spider)
-           self.dqs.push(reqd, -request.priority)
+           self.dqs.push(request, -request.priority)
        except ValueError as e:  # non serializable request
            if self.logunser:
                msg = ("Unable to serialize request: %(request)s - reason:"
@ -103,32 +139,51 @@ class Scheduler(object):
    def _dqpop(self):
        if self.dqs:
-           d = self.dqs.pop()
-           if d:
-               return request_from_dict(d, self.spider)
+           return self.dqs.pop()

    def _newmq(self, priority):
+       """ Factory for creating memory queues. """
        return self.mqclass()

    def _newdq(self, priority):
-       return self.dqclass(join(self.dqdir, 'p%s' % priority))
+       """ Factory for creating disk queues. """
+       path = join(self.dqdir, 'p%s' % (priority, ))
+       return self.dqclass(path)

+   def _mq(self):
+       """ Create a new priority queue instance, with in-memory storage """
+       return create_instance(self.pqclass, None, self.crawler, self._newmq,
+                              serialize=False)

    def _dq(self):
-       activef = join(self.dqdir, 'active.json')
-       if exists(activef):
-           with open(activef) as f:
-               prios = json.load(f)
-       else:
-           prios = ()
-       q = self.pqclass(self._newdq, startprios=prios)
+       """ Create a new priority queue instance, with disk storage """
+       state = self._read_dqs_state(self.dqdir)
+       q = create_instance(self.pqclass,
+                           None,
+                           self.crawler,
+                           self._newdq,
+                           state,
+                           serialize=True)
        if q:
            logger.info("Resuming crawl (%(queuesize)d requests scheduled)",
                        {'queuesize': len(q)}, extra={'spider': self.spider})
        return q

    def _dqdir(self, jobdir):
+       """ Return a folder name to keep disk queue state at """
        if jobdir:
            dqdir = join(jobdir, 'requests.queue')
            if not exists(dqdir):
                os.makedirs(dqdir)
            return dqdir

+   def _read_dqs_state(self, dqdir):
+       path = join(dqdir, 'active.json')
+       if not exists(path):
+           return ()
+       with open(path) as f:
+           return json.load(f)
+
+   def _write_dqs_state(self, dqdir, state):
+       with open(join(dqdir, 'active.json'), 'w') as f:
+           json.dump(state, f)


@ -135,7 +135,6 @@ class Scraper(object):
            return self.spidermw.scrape_response(
                self.call_spider, request_result, request, spider)
        else:
-           # FIXME: don't ignore errors in spider middleware
            dfd = self.call_spider(request_result, request, spider)
        return dfd.addErrback(
            self._log_download_errors, request_result, request, spider)


@ -3,15 +3,21 @@ Spider Middleware manager
See documentation in docs/topics/spider-middleware.rst
"""
+from itertools import chain, islice

import six

from twisted.python.failure import Failure

+from scrapy.exceptions import _InvalidOutput
from scrapy.middleware import MiddlewareManager
from scrapy.utils.defer import mustbe_deferred
from scrapy.utils.conf import build_component_list
+from scrapy.utils.python import MutableChain


def _isiterable(possible_iterator):
    return hasattr(possible_iterator, '__iter__')


class SpiderMiddlewareManager(MiddlewareManager):

    component_name = 'spider middleware'
@ -24,12 +30,10 @@ class SpiderMiddlewareManager(MiddlewareManager):
        super(SpiderMiddlewareManager, self)._add_middleware(mw)
        if hasattr(mw, 'process_spider_input'):
            self.methods['process_spider_input'].append(mw.process_spider_input)
-       if hasattr(mw, 'process_spider_output'):
-           self.methods['process_spider_output'].appendleft(mw.process_spider_output)
-       if hasattr(mw, 'process_spider_exception'):
-           self.methods['process_spider_exception'].appendleft(mw.process_spider_exception)
        if hasattr(mw, 'process_start_requests'):
            self.methods['process_start_requests'].appendleft(mw.process_start_requests)
+       self.methods['process_spider_output'].appendleft(getattr(mw, 'process_spider_output', None))
+       self.methods['process_spider_exception'].appendleft(getattr(mw, 'process_spider_exception', None))

    def scrape_response(self, scrape_func, response, request, spider):
        fname = lambda f:'%s.%s' % (
@ -40,36 +44,73 @@ class SpiderMiddlewareManager(MiddlewareManager):
            for method in self.methods['process_spider_input']:
                try:
                    result = method(response=response, spider=spider)
-                   assert result is None, \
-                           'Middleware %s must returns None or ' \
-                           'raise an exception, got %s ' \
-                           % (fname(method), type(result))
-               except:
+                   if result is not None:
+                       raise _InvalidOutput('Middleware {} must return None or raise an exception, got {}' \
+                                            .format(fname(method), type(result)))
+               except _InvalidOutput:
+                   raise
+               except Exception:
                    return scrape_func(Failure(), request, spider)
            return scrape_func(response, request, spider)

-       def process_spider_exception(_failure):
+       def process_spider_exception(_failure, start_index=0):
            exception = _failure.value
-           for method in self.methods['process_spider_exception']:
+           # don't handle _InvalidOutput exception
+           if isinstance(exception, _InvalidOutput):
+               return _failure
+           method_list = islice(self.methods['process_spider_exception'], start_index, None)
+           for method_index, method in enumerate(method_list, start=start_index):
+               if method is None:
+                   continue
                result = method(response=response, exception=exception, spider=spider)
-               assert result is None or _isiterable(result), \
-                   'Middleware %s must returns None, or an iterable object, got %s ' % \
-                   (fname(method), type(result))
-               if result is not None:
-                   return result
+               if _isiterable(result):
+                   # stop exception handling by handing control over to the
+                   # process_spider_output chain if an iterable has been returned
+                   return process_spider_output(result, method_index+1)
+               elif result is None:
+                   continue
+               else:
+                   raise _InvalidOutput('Middleware {} must return None or an iterable, got {}' \
+                                        .format(fname(method), type(result)))
            return _failure

-       def process_spider_output(result):
-           for method in self.methods['process_spider_output']:
+       def process_spider_output(result, start_index=0):
+           # items in this iterable do not need to go through the process_spider_output
+           # chain, they went through it already from the process_spider_exception method
+           recovered = MutableChain()
+
+           def evaluate_iterable(iterable, index):
+               try:
+                   for r in iterable:
+                       yield r
+               except Exception as ex:
+                   exception_result = process_spider_exception(Failure(ex), index+1)
+                   if isinstance(exception_result, Failure):
+                       raise
+                   recovered.extend(exception_result)
+
+           method_list = islice(self.methods['process_spider_output'], start_index, None)
+           for method_index, method in enumerate(method_list, start=start_index):
+               if method is None:
+                   continue
+               # the following might fail directly if the output value is not a generator
+               try:
                    result = method(response=response, result=result, spider=spider)
-               assert _isiterable(result), \
-                   'Middleware %s must returns an iterable object, got %s ' % \
-                   (fname(method), type(result))
-           return result
+               except Exception as ex:
+                   exception_result = process_spider_exception(Failure(ex), method_index+1)
+                   if isinstance(exception_result, Failure):
+                       raise
+                   return exception_result
+               if _isiterable(result):
+                   result = evaluate_iterable(result, method_index)
+               else:
+                   raise _InvalidOutput('Middleware {} must return an iterable, got {}' \
+                                        .format(fname(method), type(result)))
+           return chain(result, recovered)

        dfd = mustbe_deferred(process_spider_input, response)
-       dfd.addErrback(process_spider_exception)
-       dfd.addCallback(process_spider_output)
+       dfd.addCallbacks(callback=process_spider_output, errback=process_spider_exception)
        return dfd

    def process_start_requests(self, start_requests, spider):


@ -111,6 +111,8 @@ class Crawler(object):
    @defer.inlineCallbacks
    def stop(self):
+       """Starts a graceful stop of the crawler and returns a deferred that is
+       fired when the crawler is stopped."""
        if self.crawling:
            self.crawling = False
            yield defer.maybeDeferred(self.engine.stop)


@ -88,6 +88,7 @@ class MetaRefreshMiddleware(BaseRedirectMiddleware):
    def __init__(self, settings):
        super(MetaRefreshMiddleware, self).__init__(settings)
+       self._ignore_tags = settings.getlist('METAREFRESH_IGNORE_TAGS')
        self._maxdelay = settings.getint('REDIRECT_MAX_METAREFRESH_DELAY',
                                         settings.getint('METAREFRESH_MAXDELAY'))
@ -96,7 +97,8 @@ class MetaRefreshMiddleware(BaseRedirectMiddleware):
                not isinstance(response, HtmlResponse):
            return response
-       interval, url = get_meta_refresh(response)
+       interval, url = get_meta_refresh(response,
+                                        ignore_tags=self._ignore_tags)
        if url and interval < self._maxdelay:
            redirected = self._redirect_request_using_get(request, url)
            return self._redirect(redirected, request, spider, 'meta refresh')


@ -11,6 +11,13 @@ class NotConfigured(Exception):
"""Indicates a missing configuration situation""" """Indicates a missing configuration situation"""
pass pass
class _InvalidOutput(TypeError):
"""
Indicates an invalid value has been returned by a middleware's processing method.
Internal and undocumented, it should not be raised or caught by user code.
"""
pass
# HTTP and crawling # HTTP and crawling
class IgnoreRequest(Exception): class IgnoreRequest(Exception):


@ -24,7 +24,11 @@ class CoreStats(object):
        self.stats.set_value('start_time', datetime.datetime.utcnow(), spider=spider)

    def spider_closed(self, spider, reason):
-       self.stats.set_value('finish_time', datetime.datetime.utcnow(), spider=spider)
+       finish_time = datetime.datetime.utcnow()
+       elapsed_time = finish_time - self.stats.get_value('start_time')
+       elapsed_time_seconds = elapsed_time.total_seconds()
+       self.stats.set_value('elapsed_time_seconds', elapsed_time_seconds, spider=spider)
+       self.stats.set_value('finish_time', finish_time, spider=spider)
        self.stats.set_value('finish_reason', reason, spider=spider)

    def item_scraped(self, item, spider):
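With this change the stats collector exposes the crawl duration next to
``finish_time``; as an illustration only (``MySpider`` stands in for your own
spider class), the value can be read after a crawl finishes::

    from scrapy.crawler import CrawlerProcess

    process = CrawlerProcess()
    crawler = process.create_crawler(MySpider)
    process.crawl(crawler)
    process.start()
    print(crawler.stats.get_value('elapsed_time_seconds'))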


@ -31,7 +31,7 @@ class DummyPolicy(object):
    def should_cache_response(self, response, request):
        return response.status not in self.ignore_http_codes

-   def is_cached_response_fresh(self, response, request):
+   def is_cached_response_fresh(self, cachedresponse, request):
        return True

    def is_cached_response_valid(self, cachedresponse, response, request):
@ -70,7 +70,7 @@ class RFC2616Policy(object):
        return True

    def should_cache_response(self, response, request):
-       # What is cacheable - https://www.w3.org/Protocols/rfc2616/rfc2616-sec13.html#sec14.9.1
+       # What is cacheable - https://www.w3.org/Protocols/rfc2616/rfc2616-sec14.html#sec14.9.1
        # Response cacheability - https://www.w3.org/Protocols/rfc2616/rfc2616-sec13.html#sec13.4
        # Status code 206 is not included because cache can not deal with partial contents
        cc = self._parse_cachecontrol(response)


@ -35,6 +35,10 @@ class ItemLoader(object):
        self.parent = parent
        self._local_item = context['item'] = item
        self._local_values = defaultdict(list)
+       # Preprocess values if item built from dict
+       # Values need to be added to item._values if added them from dict (not with add_values)
+       for field_name, value in item.items():
+           self._values[field_name] = self._process_input_value(field_name, value)

    @property
    def _values(self):


@ -3,7 +3,7 @@ from __future__ import print_function
import functools
import logging
from collections import defaultdict

-from twisted.internet.defer import Deferred, DeferredList
+from twisted.internet.defer import Deferred, DeferredList, _DefGen_Return
from twisted.python.failure import Failure

from scrapy.settings import Settings
@ -139,6 +139,30 @@ class MediaPipeline(object):
            result.cleanFailure()
            result.frames = []
            result.stack = None
# This code fixes a memory leak by avoiding to keep references to
# the Request and Response objects on the Media Pipeline cache.
#
# Twisted inline callbacks pass return values using the function
# twisted.internet.defer.returnValue, which encapsulates the return
# value inside a _DefGen_Return base exception.
#
# What happens when the media_downloaded callback raises another
# exception, for example a FileException('download-error') when
# the Response status code is not 200 OK, is that it stores the
# _DefGen_Return exception on the FileException context.
#
# To avoid keeping references to the Response and therefore Request
# objects on the Media Pipeline cache, we should wipe the context of
# the exception encapsulated by the Twisted Failure when its a
# _DefGen_Return instance.
#
# This problem does not occur in Python 2.7 since we don't have
# Exception Chaining (https://www.python.org/dev/peps/pep-3134/).
context = getattr(result.value, '__context__', None)
if isinstance(context, _DefGen_Return):
setattr(result.value, '__context__', None)
info.downloading.remove(fp)
info.downloaded[fp] = result  # cache result
for wad in info.waiting.pop(fp):

scrapy/pqueues.py (new file, 193 lines)

@ -0,0 +1,193 @@
import hashlib
import logging
from collections import namedtuple
from queuelib import PriorityQueue
from scrapy.utils.reqser import request_to_dict, request_from_dict
logger = logging.getLogger(__name__)
def _path_safe(text):
"""
Return a filesystem-safe version of a string ``text``
>>> _path_safe('simple.org').startswith('simple.org')
True
>>> _path_safe('dash-underscore_.org').startswith('dash-underscore_.org')
True
>>> _path_safe('some@symbol?').startswith('some_symbol_')
True
"""
pathable_slot = "".join([c if c.isalnum() or c in '-._' else '_'
for c in text])
# as we replace some letters we can get collision for different slots
# and we add a unique part
unique_slot = hashlib.md5(text.encode('utf8')).hexdigest()
return '-'.join([pathable_slot, unique_slot])
class _Priority(namedtuple("_Priority", ["priority", "slot"])):
""" Slot-specific priority. It is a hack - ``(priority, slot)`` tuple
which can be used instead of int priorities in queues:
* they are ordered in the same way - order is still by priority value,
min(prios) works;
* str(p) representation is guaranteed to be different when slots
are different - this is important because str(p) is used to create
queue files on disk;
* they have readable str(p) representation which is safe
to use as a file name.
"""
__slots__ = ()
def __str__(self):
return '%s_%s' % (self.priority, _path_safe(str(self.slot)))
class _SlotPriorityQueues(object):
""" Container for multiple priority queues. """
def __init__(self, pqfactory, slot_startprios=None):
"""
``pqfactory`` is a factory for creating new PriorityQueues.
It must be a function which accepts a single optional ``startprios``
argument, with a list of priorities to create queues for.
``slot_startprios`` is a ``{slot: startprios}`` dict.
"""
self.pqfactory = pqfactory
self.pqueues = {} # slot -> priority queue
for slot, startprios in (slot_startprios or {}).items():
self.pqueues[slot] = self.pqfactory(startprios)
def pop_slot(self, slot):
""" Pop an object from a priority queue for this slot """
queue = self.pqueues[slot]
request = queue.pop()
if len(queue) == 0:
del self.pqueues[slot]
return request
def push_slot(self, slot, obj, priority):
""" Push an object to a priority queue for this slot """
if slot not in self.pqueues:
self.pqueues[slot] = self.pqfactory()
queue = self.pqueues[slot]
queue.push(obj, priority)
def close(self):
active = {slot: queue.close()
for slot, queue in self.pqueues.items()}
self.pqueues.clear()
return active
def __len__(self):
return sum(len(x) for x in self.pqueues.values()) if self.pqueues else 0
def __contains__(self, slot):
return slot in self.pqueues
class ScrapyPriorityQueue(PriorityQueue):
"""
PriorityQueue which works with scrapy.Request instances and
can optionally convert them to/from dicts before/after putting to a queue.
"""
def __init__(self, crawler, qfactory, startprios=(), serialize=False):
super(ScrapyPriorityQueue, self).__init__(qfactory, startprios)
self.serialize = serialize
self.spider = crawler.spider
@classmethod
def from_crawler(cls, crawler, qfactory, startprios=(), serialize=False):
return cls(crawler, qfactory, startprios, serialize)
def push(self, request, priority=0):
if self.serialize:
request = request_to_dict(request, self.spider)
super(ScrapyPriorityQueue, self).push(request, priority)
def pop(self):
request = super(ScrapyPriorityQueue, self).pop()
if request and self.serialize:
request = request_from_dict(request, self.spider)
return request
class DownloaderInterface(object):
def __init__(self, crawler):
self.downloader = crawler.engine.downloader
def stats(self, possible_slots):
return [(self._active_downloads(slot), slot)
for slot in possible_slots]
def get_slot_key(self, request):
return self.downloader._get_slot_key(request, None)
def _active_downloads(self, slot):
""" Return a number of requests in a Downloader for a given slot """
if slot not in self.downloader.slots:
return 0
return len(self.downloader.slots[slot].active)
class DownloaderAwarePriorityQueue(object):
""" PriorityQueue which takes Downlaoder activity in account:
domains (slots) with the least amount of active downloads are dequeued
first.
"""
@classmethod
def from_crawler(cls, crawler, qfactory, slot_startprios=None, serialize=False):
return cls(crawler, qfactory, slot_startprios, serialize)
def __init__(self, crawler, qfactory, slot_startprios=None, serialize=False):
if crawler.settings.getint('CONCURRENT_REQUESTS_PER_IP') != 0:
raise ValueError('"%s" does not support CONCURRENT_REQUESTS_PER_IP'
% (self.__class__,))
if slot_startprios and not isinstance(slot_startprios, dict):
raise ValueError("DownloaderAwarePriorityQueue accepts "
"``slot_startprios`` as a dict; %r instance "
"is passed. Most likely, it means the state is"
"created by an incompatible priority queue. "
"Only a crawl started with the same priority "
"queue class can be resumed." %
slot_startprios.__class__)
slot_startprios = {
slot: [_Priority(p, slot) for p in startprios]
for slot, startprios in (slot_startprios or {}).items()}
def pqfactory(startprios=()):
return ScrapyPriorityQueue(crawler, qfactory, startprios, serialize)
self._slot_pqueues = _SlotPriorityQueues(pqfactory, slot_startprios)
self.serialize = serialize
self._downloader_interface = DownloaderInterface(crawler)
def pop(self):
stats = self._downloader_interface.stats(self._slot_pqueues.pqueues)
if not stats:
return
slot = min(stats)[1]
request = self._slot_pqueues.pop_slot(slot)
return request
def push(self, request, priority):
slot = self._downloader_interface.get_slot_key(request)
priority_slot = _Priority(priority=priority, slot=slot)
self._slot_pqueues.push_slot(slot, request, priority_slot)
def close(self):
active = self._slot_pqueues.close()
return {slot: [p.priority for p in startprios]
for slot, startprios in active.items()}
def __len__(self):
return len(self._slot_pqueues)
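For orientation, a minimal sketch of how the pieces above are meant to be used; the example values are illustrative only, and the last line simply points the scheduler setting at the class defined above.
# Illustrative only - behaviour of the helpers above, assuming this module
# ships as scrapy.pqueues.
from scrapy.pqueues import _path_safe, _Priority
print(_path_safe('some@symbol?'))                      # 'some_symbol_' + md5 suffix
print(str(_Priority(priority=0, slot='example.com')))  # '0_example.com-' + md5 suffix
# settings.py sketch: opt into the downloader-aware queue. Note that
# DownloaderAwarePriorityQueue raises ValueError when
# CONCURRENT_REQUESTS_PER_IP is non-zero (see its __init__ above).
SCHEDULER_PRIORITY_QUEUE = 'scrapy.pqueues.DownloaderAwarePriorityQueue'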


@ -221,6 +221,7 @@ MEMUSAGE_NOTIFY_MAIL = []
MEMUSAGE_WARNING_MB = 0
METAREFRESH_ENABLED = True
METAREFRESH_IGNORE_TAGS = ['script', 'noscript']
METAREFRESH_MAXDELAY = 100
NEWSPIDER_MODULE = ''
@ -238,7 +239,7 @@ REFERRER_POLICY = 'scrapy.spidermiddlewares.referer.DefaultReferrerPolicy'
RETRY_ENABLED = True
RETRY_TIMES = 2  # initial response + 2 retries = 3 requests
- RETRY_HTTP_CODES = [500, 502, 503, 504, 522, 524, 408]
+ RETRY_HTTP_CODES = [500, 502, 503, 504, 522, 524, 408, 429]
RETRY_PRIORITY_ADJUST = -1
ROBOTSTXT_OBEY = False
@ -246,7 +247,7 @@ ROBOTSTXT_OBEY = False
SCHEDULER = 'scrapy.core.scheduler.Scheduler'
SCHEDULER_DISK_QUEUE = 'scrapy.squeues.PickleLifoDiskQueue'
SCHEDULER_MEMORY_QUEUE = 'scrapy.squeues.LifoMemoryQueue'
- SCHEDULER_PRIORITY_QUEUE = 'queuelib.PriorityQueue'
+ SCHEDULER_PRIORITY_QUEUE = 'scrapy.pqueues.ScrapyPriorityQueue'
SPIDER_LOADER_CLASS = 'scrapy.spiderloader.SpiderLoader'
SPIDER_LOADER_WARN_ONLY = False


@ -6,29 +6,55 @@ See documentation in docs/topics/spiders.rst
""" """
import copy import copy
import warnings
import six import six
from scrapy.exceptions import ScrapyDeprecationWarning
from scrapy.http import Request, HtmlResponse from scrapy.http import Request, HtmlResponse
from scrapy.utils.spider import iterate_spider_output from scrapy.utils.spider import iterate_spider_output
from scrapy.utils.python import get_func_args
from scrapy.spiders import Spider from scrapy.spiders import Spider
def identity(x): def _identity(request, response):
return x return request
def _get_method(method, spider):
if callable(method):
return method
elif isinstance(method, six.string_types):
return getattr(spider, method, None)
class Rule(object):
- def __init__(self, link_extractor, callback=None, cb_kwargs=None, follow=None, process_links=None, process_request=identity):
+ def __init__(self, link_extractor, callback=None, cb_kwargs=None, follow=None, process_links=None, process_request=None):
self.link_extractor = link_extractor
self.callback = callback
self.cb_kwargs = cb_kwargs or {}
self.process_links = process_links
- self.process_request = process_request
+ self.process_request = process_request or _identity
- if follow is None:
- self.follow = False if callback else True
- else:
- self.follow = follow
+ self.process_request_argcount = None
+ self.follow = follow if follow is not None else not callback
+ def _compile(self, spider):
self.callback = _get_method(self.callback, spider)
self.process_links = _get_method(self.process_links, spider)
self.process_request = _get_method(self.process_request, spider)
self.process_request_argcount = len(get_func_args(self.process_request))
if self.process_request_argcount == 1:
msg = 'Rule.process_request should accept two arguments (request, response), accepting only one is deprecated'
warnings.warn(msg, category=ScrapyDeprecationWarning, stacklevel=2)
def _process_request(self, request, response):
"""
Wrapper around the request processing function to maintain backward
compatibility with functions that do not take a Response object
"""
args = [request] if self.process_request_argcount == 1 else [request, response]
return self.process_request(*args)
class CrawlSpider(Spider):
@ -64,8 +90,8 @@ class CrawlSpider(Spider):
links = rule.process_links(links)
for link in links:
seen.add(link)
- r = self._build_request(n, link)
+ request = self._build_request(n, link)
- yield rule.process_request(r)
+ yield rule._process_request(request, response)
def _response_downloaded(self, response):
rule = self._rules[response.meta['rule']]
@ -83,17 +109,9 @@ class CrawlSpider(Spider):
yield request_or_item
def _compile_rules(self):
- def get_method(method):
- if callable(method):
- return method
- elif isinstance(method, six.string_types):
- return getattr(self, method, None)
self._rules = [copy.copy(r) for r in self.rules]
for rule in self._rules:
- rule.callback = get_method(rule.callback)
- rule.process_links = get_method(rule.process_links)
- rule.process_request = get_method(rule.process_request)
+ rule._compile(self)
@classmethod
def from_crawler(cls, crawler, *args, **kwargs):
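To illustrate the new Rule.process_request contract, a minimal sketch; the spider, URLs and method names are hypothetical, only the two-argument signature and the deprecation of one-argument callables come from this hunk.
from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor

class ExampleSpider(CrawlSpider):          # hypothetical spider, not part of this diff
    name = 'example'
    start_urls = ['http://example.org']
    rules = (
        Rule(LinkExtractor(), callback='parse_item',
             process_request='tag_with_source'),
    )

    def tag_with_source(self, request, response):
        # the new second argument is the response whose links produced `request`;
        # a single-argument callable still works but warns ScrapyDeprecationWarning
        request.meta['source_url'] = response.url
        return request

    def parse_item(self, response):
        yield {'url': response.url, 'source': response.meta.get('source_url')}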


@ -7,6 +7,7 @@ from six.moves import cPickle as pickle
from queuelib import queue from queuelib import queue
def _serializable_queue(queue_class, serialize, deserialize): def _serializable_queue(queue_class, serialize, deserialize):
class SerializableQueue(queue_class): class SerializableQueue(queue_class):
@ -22,6 +23,7 @@ def _serializable_queue(queue_class, serialize, deserialize):
return SerializableQueue return SerializableQueue
def _pickle_serialize(obj): def _pickle_serialize(obj):
try: try:
return pickle.dumps(obj, protocol=2) return pickle.dumps(obj, protocol=2)
@ -31,13 +33,14 @@ def _pickle_serialize(obj):
except (pickle.PicklingError, AttributeError, TypeError) as e: except (pickle.PicklingError, AttributeError, TypeError) as e:
raise ValueError(str(e)) raise ValueError(str(e))
- PickleFifoDiskQueue = _serializable_queue(queue.FifoDiskQueue, \
+ PickleFifoDiskQueue = _serializable_queue(queue.FifoDiskQueue,
_pickle_serialize, pickle.loads)
- PickleLifoDiskQueue = _serializable_queue(queue.LifoDiskQueue, \
+ PickleLifoDiskQueue = _serializable_queue(queue.LifoDiskQueue,
_pickle_serialize, pickle.loads)
- MarshalFifoDiskQueue = _serializable_queue(queue.FifoDiskQueue, \
+ MarshalFifoDiskQueue = _serializable_queue(queue.FifoDiskQueue,
marshal.dumps, marshal.loads)
- MarshalLifoDiskQueue = _serializable_queue(queue.LifoDiskQueue, \
+ MarshalLifoDiskQueue = _serializable_queue(queue.LifoDiskQueue,
marshal.dumps, marshal.loads)
FifoMemoryQueue = queue.FifoMemoryQueue FifoMemoryQueue = queue.FifoMemoryQueue
LifoMemoryQueue = queue.LifoMemoryQueue LifoMemoryQueue = queue.LifoMemoryQueue


@ -39,7 +39,7 @@ class ${ProjectName}SpiderMiddleware(object):
# Called when a spider or process_spider_input() method
# (from other spider middleware) raises an exception.
- # Should return either None or an iterable of Response, dict
+ # Should return either None or an iterable of Request, dict
# or Item objects.
pass


@ -48,7 +48,7 @@ def mustbe_deferred(f, *args, **kw):
# exception in Scrapy - see #125 # exception in Scrapy - see #125
except IgnoreRequest as e: except IgnoreRequest as e:
return defer_fail(failure.Failure(e)) return defer_fail(failure.Failure(e))
- except:
+ except Exception:
return defer_fail(failure.Failure()) return defer_fail(failure.Failure())
else: else:
return defer_result(result) return defer_result(result)
@ -102,5 +102,5 @@ def iter_errback(iterable, errback, *a, **kw):
yield next(it) yield next(it)
except StopIteration: except StopIteration:
break break
- except:
+ except Exception:
errback(failure.Failure(), *a, **kw) errback(failure.Failure(), *a, **kw)


@ -9,6 +9,9 @@ from gzip import GzipFile
import six import six
import re import re
from scrapy.utils.decorators import deprecated
# - Python>=3.5 GzipFile's read() has issues returning leftover # - Python>=3.5 GzipFile's read() has issues returning leftover
# uncompressed data when input is corrupted # uncompressed data when input is corrupted
# (regression or bug-fix compared to Python 3.4) # (regression or bug-fix compared to Python 3.4)
@ -53,6 +56,7 @@ def gunzip(data):
_is_gzipped = re.compile(br'^application/(x-)?gzip\b', re.I).search _is_gzipped = re.compile(br'^application/(x-)?gzip\b', re.I).search
_is_octetstream = re.compile(br'^(application|binary)/octet-stream\b', re.I).search _is_octetstream = re.compile(br'^(application|binary)/octet-stream\b', re.I).search
@deprecated
def is_gzipped(response): def is_gzipped(response):
"""Return True if the response is gzipped, or False otherwise""" """Return True if the response is gzipped, or False otherwise"""
ctype = response.headers.get('Content-Type', b'') ctype = response.headers.get('Content-Type', b'')


@ -1,6 +1,8 @@
"""Helper functions which don't fit anywhere else""" """Helper functions which don't fit anywhere else"""
import os
import re import re
import hashlib import hashlib
from contextlib import contextmanager
from importlib import import_module from importlib import import_module
from pkgutil import iter_modules from pkgutil import iter_modules
@ -86,7 +88,7 @@ def extract_regex(regex, text, encoding='utf-8'):
try: try:
strings = [regex.search(text).group('extract')] # named group strings = [regex.search(text).group('extract')] # named group
except: except Exception:
strings = regex.findall(text) # full regex or numbered groups strings = regex.findall(text) # full regex or numbered groups
strings = flatten(strings) strings = flatten(strings)
@ -142,3 +144,21 @@ def create_instance(objcls, settings, crawler, *args, **kwargs):
return objcls.from_settings(settings, *args, **kwargs) return objcls.from_settings(settings, *args, **kwargs)
else: else:
return objcls(*args, **kwargs) return objcls(*args, **kwargs)
@contextmanager
def set_environ(**kwargs):
"""Temporarily set environment variables inside the context manager and
fully restore previous environment afterwards
"""
original_env = {k: os.environ.get(k) for k in kwargs}
os.environ.update(kwargs)
try:
yield
finally:
for k, v in original_env.items():
if v is None:
del os.environ[k]
else:
os.environ[k] = v
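A brief usage sketch of the set_environ helper added above; the variable name is made up, and the import assumes this hunk belongs to scrapy.utils.misc.
import os
from scrapy.utils.misc import set_environ

# assumes EXAMPLE_FLAG is not already set in the surrounding environment
with set_environ(EXAMPLE_FLAG='1'):
    assert os.environ['EXAMPLE_FLAG'] == '1'
assert 'EXAMPLE_FLAG' not in os.environ   # previous (unset) state is restored on exit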


@ -9,6 +9,7 @@ import weakref
import errno
import six
from functools import partial, wraps
from itertools import chain
import sys
from scrapy.utils.decorators import deprecated
@ -387,3 +388,22 @@ if hasattr(sys, "pypy_version_info"):
else:
def garbage_collect():
gc.collect()
class MutableChain(object):
"""
Thin wrapper around itertools.chain, allowing to add iterables "in-place"
"""
def __init__(self, *args):
self.data = chain(*args)
def extend(self, *iterables):
self.data = chain(self.data, *iterables)
def __iter__(self):
return self.data.__iter__()
def __next__(self):
return next(self.data)
next = __next__
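A short sketch of how MutableChain is meant to be used; the values are illustrative, the class path follows from the hunk above being scrapy/utils/python.py.
from scrapy.utils.python import MutableChain

chain = MutableChain([1, 2], [3])
chain.extend([4, 5])                  # more iterables can be appended "in-place"
assert list(chain) == [1, 2, 3, 4, 5]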


@ -70,6 +70,20 @@ def request_from_dict(d, spider=None):
) )
def _is_private_method(name):
return name.startswith('__') and not name.endswith('__')
def _mangle_private_name(obj, func, name):
qualname = getattr(func, '__qualname__', None)
if qualname is None:
classname = obj.__class__.__name__.lstrip('_')
return '_%s%s' % (classname, name)
else:
splits = qualname.split('.')
return '_%s%s' % (splits[-2], splits[-1])
def _find_method(obj, func): def _find_method(obj, func):
if obj: if obj:
try: try:
@ -78,7 +92,10 @@ def _find_method(obj, func):
pass
else:
if func_self is obj:
- return six.get_method_function(func).__name__
+ name = six.get_method_function(func).__name__
if _is_private_method(name):
return _mangle_private_name(obj, func, name)
return name
raise ValueError("Function %s is not a method of: %s" % (func, obj))


@ -31,12 +31,12 @@ def get_base_url(response):
_metaref_cache = weakref.WeakKeyDictionary()
- def get_meta_refresh(response):
+ def get_meta_refresh(response, ignore_tags=('script', 'noscript')):
"""Parse the http-equiv refrsh parameter from the given response"""
if response not in _metaref_cache:
text = response.text[0:4096]
_metaref_cache[response] = html.get_meta_refresh(text, response.url,
- response.encoding, ignore_tags=('script', 'noscript'))
+ response.encoding, ignore_tags=ignore_tags)
return _metaref_cache[response]
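For context, the METAREFRESH_IGNORE_TAGS setting introduced in this commit feeds the new ignore_tags argument; a sketch of how a project would relax it (placing it in a settings file is an assumption, the setting name and default come from this diff).
# settings.py sketch: the default ['script', 'noscript'] skips meta refreshes
# inside those tags; an empty list makes MetaRefreshMiddleware follow them,
# as exercised by test_ignore_tags_empty_list further down in this diff.
METAREFRESH_IGNORE_TAGS = []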


@ -65,7 +65,8 @@ setup(
], ],
python_requires='>=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*', python_requires='>=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*',
install_requires=[ install_requires=[
- 'Twisted>=13.1.0',
+ 'Twisted>=13.1.0;python_version!="3.4"',
+ 'Twisted>=13.1.0,<=19.2.0;python_version=="3.4"',
'w3lib>=1.17.0', 'w3lib>=1.17.0',
'queuelib', 'queuelib',
'lxml', 'lxml',


@ -177,7 +177,7 @@ class Root(Resource):
try: try:
from tests import tests_datadir from tests import tests_datadir
self.putChild(b"files", File(os.path.join(tests_datadir, 'test_site/files/'))) self.putChild(b"files", File(os.path.join(tests_datadir, 'test_site/files/')))
- except:
+ except Exception:
pass pass
self.putChild(b"redirect-to", RedirectTo()) self.putChild(b"redirect-to", RedirectTo())


@ -2,9 +2,10 @@
mock mock
mitmproxy==0.10.1 mitmproxy==0.10.1
netlib==0.10.1 netlib==0.10.1
- pytest==2.9.2
+ pytest
pytest-cov
pytest-twisted
- pytest-cov==2.2.1
+ pytest-xdist
jmespath jmespath
brotlipy brotlipy
testfixtures testfixtures


@ -1,6 +1,7 @@
- pytest==3.6.3
+ pytest
pytest-cov
pytest-twisted
- pytest-cov==2.5.1
+ pytest-xdist
testfixtures testfixtures
jmespath jmespath
leveldb; sys_platform != "win32" leveldb; sys_platform != "win32"


@ -53,9 +53,5 @@ class TestCloseSpider(TestCase):
yield crawler.crawl(total=1000000, mockserver=self.mockserver) yield crawler.crawl(total=1000000, mockserver=self.mockserver)
reason = crawler.spider.meta['close_reason'] reason = crawler.spider.meta['close_reason']
self.assertEqual(reason, 'closespider_timeout') self.assertEqual(reason, 'closespider_timeout')
- stats = crawler.stats
- start = stats.get_value('start_time')
- stop = stats.get_value('finish_time')
- diff = stop - start
- total_seconds = diff.seconds + diff.microseconds
+ total_seconds = crawler.stats.get_value('elapsed_time_seconds')
self.assertTrue(total_seconds >= close_on)


@ -1,5 +1,4 @@
import logging
- import tempfile
import warnings
from twisted.internet import defer
@ -38,7 +37,11 @@ class CrawlerTestCase(BaseCrawlerTest):
self.assertIsInstance(spiders, sl_cls)
self.crawler.spiders
- self.assertEqual(len(w), 1, "Warn deprecated access only once")
+ is_one_warning = len(w) == 1
if not is_one_warning:
for warning in w:
print(warning)
self.assertTrue(is_one_warning, "Warn deprecated access only once")
def test_populate_spidercls_settings(self): def test_populate_spidercls_settings(self):
spider_settings = {'TEST1': 'spider', 'TEST2': 'spider'} spider_settings = {'TEST1': 'spider', 'TEST2': 'spider'}
@ -179,8 +182,12 @@ class CrawlerRunnerTestCase(BaseCrawlerTest):
'SPIDER_MANAGER_CLASS': 'tests.test_crawler.CustomSpiderLoader' 'SPIDER_MANAGER_CLASS': 'tests.test_crawler.CustomSpiderLoader'
}) })
self.assertIsInstance(runner.spider_loader, CustomSpiderLoader) self.assertIsInstance(runner.spider_loader, CustomSpiderLoader)
- self.assertEqual(len(w), 1)
+ is_one_warning = len(w) == 1
if not is_one_warning:
for warning in w:
print(warning)
self.assertIn('Please use SPIDER_LOADER_CLASS', str(w[0].message)) self.assertIn('Please use SPIDER_LOADER_CLASS', str(w[0].message))
self.assertTrue(is_one_warning)
def test_crawl_rejects_spider_objects(self): def test_crawl_rejects_spider_objects(self):
with raises(ValueError): with raises(ValueError):


@ -3,6 +3,7 @@ from twisted.python.failure import Failure
from scrapy.http import Request, Response
from scrapy.spiders import Spider
from scrapy.exceptions import _InvalidOutput
from scrapy.core.downloader.middleware import DownloaderMiddlewareManager
from scrapy.utils.test import get_crawler
from scrapy.utils.python import to_bytes
@ -115,3 +116,63 @@ class ResponseFromProcessRequestTest(ManagerTestCase):
self.assertIs(results[0], resp) self.assertIs(results[0], resp)
self.assertFalse(download_func.called) self.assertFalse(download_func.called)
class ProcessRequestInvalidOutput(ManagerTestCase):
"""Invalid return value for process_request method should raise an exception"""
def test_invalid_process_request(self):
req = Request('http://example.com/index.html')
class InvalidProcessRequestMiddleware:
def process_request(self, request, spider):
return 1
self.mwman._add_middleware(InvalidProcessRequestMiddleware())
download_func = mock.MagicMock()
dfd = self.mwman.download(download_func, req, self.spider)
results = []
dfd.addBoth(results.append)
self.assertIsInstance(results[0], Failure)
self.assertIsInstance(results[0].value, _InvalidOutput)
class ProcessResponseInvalidOutput(ManagerTestCase):
"""Invalid return value for process_response method should raise an exception"""
def test_invalid_process_response(self):
req = Request('http://example.com/index.html')
class InvalidProcessResponseMiddleware:
def process_response(self, request, response, spider):
return 1
self.mwman._add_middleware(InvalidProcessResponseMiddleware())
download_func = mock.MagicMock()
dfd = self.mwman.download(download_func, req, self.spider)
results = []
dfd.addBoth(results.append)
self.assertIsInstance(results[0], Failure)
self.assertIsInstance(results[0].value, _InvalidOutput)
class ProcessExceptionInvalidOutput(ManagerTestCase):
"""Invalid return value for process_exception method should raise an exception"""
def test_invalid_process_exception(self):
req = Request('http://example.com/index.html')
class InvalidProcessExceptionMiddleware:
def process_request(self, request, spider):
raise Exception()
def process_exception(self, request, exception, spider):
return 1
self.mwman._add_middleware(InvalidProcessExceptionMiddleware())
download_func = mock.MagicMock()
dfd = self.mwman.download(download_func, req, self.spider)
results = []
dfd.addBoth(results.append)
self.assertIsInstance(results[0], Failure)
self.assertIsInstance(results[0].value, _InvalidOutput)


@ -279,5 +279,24 @@ class MetaRefreshMiddlewareTest(unittest.TestCase):
self.assertEqual(req2.meta['redirect_reasons'], ['meta refresh'])
self.assertEqual(req3.meta['redirect_reasons'], ['meta refresh', 'meta refresh'])
def test_ignore_tags_default(self):
req = Request(url='http://example.org')
body = ('''<noscript><meta http-equiv="refresh" '''
'''content="0;URL='http://example.org/newpage'"></noscript>''')
rsp = HtmlResponse(req.url, body=body.encode())
response = self.mw.process_response(req, rsp, self.spider)
assert isinstance(response, Response)
def test_ignore_tags_empty_list(self):
crawler = get_crawler(Spider, {'METAREFRESH_IGNORE_TAGS': []})
mw = MetaRefreshMiddleware.from_crawler(crawler)
req = Request(url='http://example.org')
body = ('''<noscript><meta http-equiv="refresh" '''
'''content="0;URL='http://example.org/newpage'"></noscript>''')
rsp = HtmlResponse(req.url, body=body.encode())
req2 = mw.process_response(req, rsp, self.spider)
assert isinstance(req2, Request)
self.assertEqual(req2.url, 'http://example.org/newpage')
if __name__ == "__main__":
unittest.main()


@ -419,6 +419,43 @@ class BasicItemLoaderTest(unittest.TestCase):
self.assertEqual(item['url'], u'rabbit.hole')
self.assertEqual(item['summary'], u'rabbithole')
def test_create_item_from_dict(self):
class TestItem(Item):
title = Field()
class TestItemLoader(ItemLoader):
default_item_class = TestItem
input_item = {'title': 'Test item title 1'}
il = TestItemLoader(item=input_item)
# Getting output value mustn't remove value from item
self.assertEqual(il.load_item(), {
'title': 'Test item title 1',
})
self.assertEqual(il.get_output_value('title'), 'Test item title 1')
self.assertEqual(il.load_item(), {
'title': 'Test item title 1',
})
input_item = {'title': 'Test item title 2'}
il = TestItemLoader(item=input_item)
# Values from dict must be added to item _values
self.assertEqual(il._values.get('title'), 'Test item title 2')
input_item = {'title': [u'Test item title 3', u'Test item 4']}
il = TestItemLoader(item=input_item)
# Same rules must work for lists
self.assertEqual(il._values.get('title'),
[u'Test item title 3', u'Test item 4'])
self.assertEqual(il.load_item(), {
'title': [u'Test item title 3', u'Test item 4'],
})
self.assertEqual(il.get_output_value('title'),
[u'Test item title 3', u'Test item 4'])
self.assertEqual(il.load_item(), {
'title': [u'Test item title 3', u'Test item 4'],
})
class ProcessorsTest(unittest.TestCase):


@ -1,15 +1,19 @@
from __future__ import print_function from __future__ import print_function
import sys
from testfixtures import LogCapture from testfixtures import LogCapture
from twisted.trial import unittest from twisted.trial import unittest
from twisted.python.failure import Failure from twisted.python.failure import Failure
from twisted.internet import reactor from twisted.internet import reactor
- from twisted.internet.defer import Deferred, inlineCallbacks
+ from twisted.internet.defer import Deferred, inlineCallbacks, returnValue
from scrapy.http import Request, Response from scrapy.http import Request, Response
from scrapy.settings import Settings from scrapy.settings import Settings
from scrapy.spiders import Spider from scrapy.spiders import Spider
from scrapy.utils.request import request_fingerprint from scrapy.utils.request import request_fingerprint
from scrapy.pipelines.media import MediaPipeline from scrapy.pipelines.media import MediaPipeline
from scrapy.pipelines.files import FileException
from scrapy.utils.log import failure_to_exc_info from scrapy.utils.log import failure_to_exc_info
from scrapy.utils.signal import disconnect_all from scrapy.utils.signal import disconnect_all
from scrapy import signals from scrapy import signals
@ -90,6 +94,77 @@ class BaseMediaPipelineTestCase(unittest.TestCase):
self.pipe._modify_media_request(request) self.pipe._modify_media_request(request)
assert request.meta == {'handle_httpstatus_all': True} assert request.meta == {'handle_httpstatus_all': True}
def test_should_remove_req_res_references_before_caching_the_results(self):
"""Regression test case to prevent a memory leak in the Media Pipeline.
The memory leak is triggered when an exception is raised when a Response
scheduled by the Media Pipeline is being returned. For example, when a
FileException('download-error') is raised because the Response status
code is not 200 OK.
It happens because we are keeping a reference to the Response object
inside the FileException context. This is caused by the way Twisted
return values from inline callbacks. It raises a custom exception
encapsulating the original return value.
The solution is to remove the exception context when this context is a
_DefGen_Return instance, the BaseException used by Twisted to pass the
returned value from those inline callbacks.
Maybe there's a better and more reliable way to test the case described
here, but it would be more complicated and involve running - or at least
mocking - some async steps from the Media Pipeline. The current test
case is simple and detects the problem very fast. On the other hand, it
would not detect another kind of leak happening due to old object
references being kept inside the Media Pipeline cache.
This problem does not occur in Python 2.7 since we don't have Exception
Chaining (https://www.python.org/dev/peps/pep-3134/).
"""
# Create sample pair of Request and Response objects
request = Request('http://url')
response = Response('http://url', body=b'', request=request)
# Simulate the Media Pipeline behavior to produce a Twisted Failure
try:
# Simulate a Twisted inline callback returning a Response
# The returnValue method raises an exception encapsulating the value
returnValue(response)
except BaseException as exc:
def_gen_return_exc = exc
try:
# Simulate the media_downloaded callback raising a FileException
# This usually happens when the status code is not 200 OK
raise FileException('download-error')
except Exception as exc:
file_exc = exc
# Simulate Twisted capturing the FileException
# It encapsulates the exception inside a Twisted Failure
failure = Failure(file_exc)
# The Failure should encapsulate a FileException ...
self.assertEqual(failure.value, file_exc)
# ... and if we're running on Python 3 ...
if sys.version_info.major >= 3:
# ... it should have the returnValue exception set as its context
self.assertEqual(failure.value.__context__, def_gen_return_exc)
# Let's calculate the request fingerprint and fake some runtime data...
fp = request_fingerprint(request)
info = self.pipe.spiderinfo
info.downloading.add(fp)
info.waiting[fp] = []
# When calling the method that caches the Request's result ...
self.pipe._cache_result_and_execute_waiters(failure, fp, info)
# ... it should store the Twisted Failure ...
self.assertEqual(info.downloaded[fp], failure)
# ... encapsulating the original FileException ...
self.assertEqual(info.downloaded[fp].value, file_exc)
# ... but it should not store the returnValue exception on its context
context = getattr(info.downloaded[fp].value, '__context__', None)
self.assertIsNone(context)
class MockedMediaPipeline(MediaPipeline): class MockedMediaPipeline(MediaPipeline):

tests/test_scheduler.py (new file, 342 lines)

@ -0,0 +1,342 @@
import shutil
import tempfile
import unittest
import collections
from twisted.internet import defer
from twisted.trial.unittest import TestCase
from scrapy.crawler import Crawler
from scrapy.core.downloader import Downloader
from scrapy.core.scheduler import Scheduler
from scrapy.http import Request
from scrapy.spiders import Spider
from scrapy.utils.httpobj import urlparse_cached
from scrapy.utils.test import get_crawler
from tests.mockserver import MockServer
MockEngine = collections.namedtuple('MockEngine', ['downloader'])
MockSlot = collections.namedtuple('MockSlot', ['active'])
class MockDownloader(object):
def __init__(self):
self.slots = dict()
def _get_slot_key(self, request, spider):
if Downloader.DOWNLOAD_SLOT in request.meta:
return request.meta[Downloader.DOWNLOAD_SLOT]
return urlparse_cached(request).hostname or ''
def increment(self, slot_key):
slot = self.slots.setdefault(slot_key, MockSlot(active=list()))
slot.active.append(1)
def decrement(self, slot_key):
slot = self.slots.get(slot_key)
slot.active.pop()
def close(self):
pass
class MockCrawler(Crawler):
def __init__(self, priority_queue_cls, jobdir):
settings = dict(
LOG_UNSERIALIZABLE_REQUESTS=False,
SCHEDULER_DISK_QUEUE='scrapy.squeues.PickleLifoDiskQueue',
SCHEDULER_MEMORY_QUEUE='scrapy.squeues.LifoMemoryQueue',
SCHEDULER_PRIORITY_QUEUE=priority_queue_cls,
JOBDIR=jobdir,
DUPEFILTER_CLASS='scrapy.dupefilters.BaseDupeFilter'
)
super(MockCrawler, self).__init__(Spider, settings)
self.engine = MockEngine(downloader=MockDownloader())
class SchedulerHandler(object):
priority_queue_cls = None
jobdir = None
def create_scheduler(self):
self.mock_crawler = MockCrawler(self.priority_queue_cls, self.jobdir)
self.scheduler = Scheduler.from_crawler(self.mock_crawler)
self.spider = Spider(name='spider')
self.scheduler.open(self.spider)
def close_scheduler(self):
self.scheduler.close('finished')
self.mock_crawler.stop()
self.mock_crawler.engine.downloader.close()
def setUp(self):
self.create_scheduler()
def tearDown(self):
self.close_scheduler()
_PRIORITIES = [("http://foo.com/a", -2),
("http://foo.com/d", 1),
("http://foo.com/b", -1),
("http://foo.com/c", 0),
("http://foo.com/e", 2)]
_URLS = {"http://foo.com/a", "http://foo.com/b", "http://foo.com/c"}
class BaseSchedulerInMemoryTester(SchedulerHandler):
def test_length(self):
self.assertFalse(self.scheduler.has_pending_requests())
self.assertEqual(len(self.scheduler), 0)
for url in _URLS:
self.scheduler.enqueue_request(Request(url))
self.assertTrue(self.scheduler.has_pending_requests())
self.assertEqual(len(self.scheduler), len(_URLS))
def test_dequeue(self):
for url in _URLS:
self.scheduler.enqueue_request(Request(url))
urls = set()
while self.scheduler.has_pending_requests():
urls.add(self.scheduler.next_request().url)
self.assertEqual(urls, _URLS)
def test_dequeue_priorities(self):
for url, priority in _PRIORITIES:
self.scheduler.enqueue_request(Request(url, priority=priority))
priorities = list()
while self.scheduler.has_pending_requests():
priorities.append(self.scheduler.next_request().priority)
self.assertEqual(priorities,
sorted([x[1] for x in _PRIORITIES], key=lambda x: -x))
class BaseSchedulerOnDiskTester(SchedulerHandler):
def setUp(self):
self.jobdir = tempfile.mkdtemp()
self.create_scheduler()
def tearDown(self):
self.close_scheduler()
shutil.rmtree(self.jobdir)
self.jobdir = None
def test_length(self):
self.assertFalse(self.scheduler.has_pending_requests())
self.assertEqual(len(self.scheduler), 0)
for url in _URLS:
self.scheduler.enqueue_request(Request(url))
self.close_scheduler()
self.create_scheduler()
self.assertTrue(self.scheduler.has_pending_requests())
self.assertEqual(len(self.scheduler), len(_URLS))
def test_dequeue(self):
for url in _URLS:
self.scheduler.enqueue_request(Request(url))
self.close_scheduler()
self.create_scheduler()
urls = set()
while self.scheduler.has_pending_requests():
urls.add(self.scheduler.next_request().url)
self.assertEqual(urls, _URLS)
def test_dequeue_priorities(self):
for url, priority in _PRIORITIES:
self.scheduler.enqueue_request(Request(url, priority=priority))
self.close_scheduler()
self.create_scheduler()
priorities = list()
while self.scheduler.has_pending_requests():
priorities.append(self.scheduler.next_request().priority)
self.assertEqual(priorities,
sorted([x[1] for x in _PRIORITIES], key=lambda x: -x))
class TestSchedulerInMemory(BaseSchedulerInMemoryTester, unittest.TestCase):
priority_queue_cls = 'scrapy.pqueues.ScrapyPriorityQueue'
class TestSchedulerOnDisk(BaseSchedulerOnDiskTester, unittest.TestCase):
priority_queue_cls = 'scrapy.pqueues.ScrapyPriorityQueue'
_URLS_WITH_SLOTS = [("http://foo.com/a", 'a'),
("http://foo.com/b", 'a'),
("http://foo.com/c", 'b'),
("http://foo.com/d", 'b'),
("http://foo.com/e", 'c'),
("http://foo.com/f", 'c')]
class TestMigration(unittest.TestCase):
def setUp(self):
self.tmpdir = tempfile.mkdtemp()
def tearDown(self):
shutil.rmtree(self.tmpdir)
def _migration(self, tmp_dir):
prev_scheduler_handler = SchedulerHandler()
prev_scheduler_handler.priority_queue_cls = 'scrapy.pqueues.ScrapyPriorityQueue'
prev_scheduler_handler.jobdir = tmp_dir
prev_scheduler_handler.create_scheduler()
for url in _URLS:
prev_scheduler_handler.scheduler.enqueue_request(Request(url))
prev_scheduler_handler.close_scheduler()
next_scheduler_handler = SchedulerHandler()
next_scheduler_handler.priority_queue_cls = 'scrapy.pqueues.DownloaderAwarePriorityQueue'
next_scheduler_handler.jobdir = tmp_dir
next_scheduler_handler.create_scheduler()
def test_migration(self):
with self.assertRaises(ValueError):
self._migration(self.tmpdir)
def _is_scheduling_fair(enqueued_slots, dequeued_slots):
"""
We enqueued same number of requests for every slot.
Assert correct order, e.g.
>>> enqueued = ['a', 'b', 'c'] * 2
>>> correct = ['a', 'c', 'b', 'b', 'a', 'c']
>>> incorrect = ['a', 'a', 'b', 'c', 'c', 'b']
>>> _is_scheduling_fair(enqueued, correct)
True
>>> _is_scheduling_fair(enqueued, incorrect)
False
"""
if len(dequeued_slots) != len(enqueued_slots):
return False
slots_number = len(set(enqueued_slots))
for i in range(0, len(dequeued_slots), slots_number):
part = dequeued_slots[i:i + slots_number]
if len(part) != len(set(part)):
return False
return True
class DownloaderAwareSchedulerTestMixin(object):
priority_queue_cls = 'scrapy.pqueues.DownloaderAwarePriorityQueue'
reopen = False
def test_logic(self):
for url, slot in _URLS_WITH_SLOTS:
request = Request(url)
request.meta[Downloader.DOWNLOAD_SLOT] = slot
self.scheduler.enqueue_request(request)
if self.reopen:
self.close_scheduler()
self.create_scheduler()
dequeued_slots = list()
requests = []
downloader = self.mock_crawler.engine.downloader
while self.scheduler.has_pending_requests():
request = self.scheduler.next_request()
# pylint: disable=protected-access
slot = downloader._get_slot_key(request, None)
dequeued_slots.append(slot)
downloader.increment(slot)
requests.append(request)
for request in requests:
# pylint: disable=protected-access
slot = downloader._get_slot_key(request, None)
downloader.decrement(slot)
self.assertTrue(_is_scheduling_fair(list(s for u, s in _URLS_WITH_SLOTS),
dequeued_slots))
self.assertEqual(sum(len(s.active) for s in downloader.slots.values()), 0)
class TestSchedulerWithDownloaderAwareInMemory(DownloaderAwareSchedulerTestMixin,
BaseSchedulerInMemoryTester,
unittest.TestCase):
pass
class TestSchedulerWithDownloaderAwareOnDisk(DownloaderAwareSchedulerTestMixin,
BaseSchedulerOnDiskTester,
unittest.TestCase):
reopen = True
class StartUrlsSpider(Spider):
def __init__(self, start_urls):
self.start_urls = start_urls
super(StartUrlsSpider, self).__init__(start_urls)
def parse(self, response):
pass
class TestIntegrationWithDownloaderAwareInMemory(TestCase):
def setUp(self):
self.crawler = get_crawler(
StartUrlsSpider,
{'SCHEDULER_PRIORITY_QUEUE': 'scrapy.pqueues.DownloaderAwarePriorityQueue',
'DUPEFILTER_CLASS': 'scrapy.dupefilters.BaseDupeFilter'}
)
@defer.inlineCallbacks
def tearDown(self):
yield self.crawler.stop()
@defer.inlineCallbacks
def test_integration_downloader_aware_priority_queue(self):
with MockServer() as mockserver:
url = mockserver.url("/status?n=200", is_secure=False)
start_urls = [url] * 6
yield self.crawler.crawl(start_urls)
self.assertEqual(self.crawler.stats.get_value('downloader/response_count'),
len(start_urls))
class TestIncompatibility(unittest.TestCase):
def _incompatible(self):
settings = dict(
SCHEDULER_PRIORITY_QUEUE='scrapy.pqueues.DownloaderAwarePriorityQueue',
CONCURRENT_REQUESTS_PER_IP=1
)
crawler = Crawler(Spider, settings)
scheduler = Scheduler.from_crawler(crawler)
spider = Spider(name='spider')
scheduler.open(spider)
def test_incompatibility(self):
with self.assertRaises(ValueError):
self._incompatible()


@ -105,11 +105,11 @@ class SpiderTest(unittest.TestCase):
def test_logger(self): def test_logger(self):
spider = self.spider_class('example.com') spider = self.spider_class('example.com')
- with LogCapture() as l:
+ with LogCapture() as lc:
spider.logger.info('test log msg')
- l.check(('example.com', 'INFO', 'test log msg'))
+ lc.check(('example.com', 'INFO', 'test log msg'))
- record = l.records[0]
+ record = lc.records[0]
self.assertIn('spider', record.__dict__) self.assertIn('spider', record.__dict__)
self.assertIs(record.spider, spider) self.assertIs(record.spider, spider)
@ -190,8 +190,7 @@ class CrawlSpiderTest(SpiderTest):
def test_process_links(self): def test_process_links(self):
- response = HtmlResponse("http://example.org/somepage/index.html",
- body=self.test_body)
+ response = HtmlResponse("http://example.org/somepage/index.html", body=self.test_body)
class _CrawlSpider(self.spider_class): class _CrawlSpider(self.spider_class):
name = "test" name = "test"
@ -214,8 +213,7 @@ class CrawlSpiderTest(SpiderTest):
def test_process_links_filter(self): def test_process_links_filter(self):
- response = HtmlResponse("http://example.org/somepage/index.html",
- body=self.test_body)
+ response = HtmlResponse("http://example.org/somepage/index.html", body=self.test_body)
class _CrawlSpider(self.spider_class): class _CrawlSpider(self.spider_class):
import re import re
@ -226,6 +224,7 @@ class CrawlSpiderTest(SpiderTest):
Rule(LinkExtractor(), process_links="filter_process_links"), Rule(LinkExtractor(), process_links="filter_process_links"),
) )
_test_regex = re.compile('nofollow') _test_regex = re.compile('nofollow')
def filter_process_links(self, links): def filter_process_links(self, links):
return [link for link in links return [link for link in links
if not self._test_regex.search(link.url)] if not self._test_regex.search(link.url)]
@ -240,8 +239,7 @@ class CrawlSpiderTest(SpiderTest):
def test_process_links_generator(self): def test_process_links_generator(self):
- response = HtmlResponse("http://example.org/somepage/index.html",
- body=self.test_body)
+ response = HtmlResponse("http://example.org/somepage/index.html", body=self.test_body)
class _CrawlSpider(self.spider_class): class _CrawlSpider(self.spider_class):
name = "test" name = "test"
@ -263,6 +261,110 @@ class CrawlSpiderTest(SpiderTest):
'http://example.org/about.html', 'http://example.org/about.html',
'http://example.org/nofollow.html']) 'http://example.org/nofollow.html'])
def test_process_request(self):
response = HtmlResponse("http://example.org/somepage/index.html", body=self.test_body)
def process_request_change_domain(request):
return request.replace(url=request.url.replace('.org', '.com'))
class _CrawlSpider(self.spider_class):
name = "test"
allowed_domains = ['example.org']
rules = (
Rule(LinkExtractor(), process_request=process_request_change_domain),
)
with warnings.catch_warnings(record=True) as cw:
spider = _CrawlSpider()
output = list(spider._requests_to_follow(response))
self.assertEqual(len(output), 3)
self.assertTrue(all(map(lambda r: isinstance(r, Request), output)))
self.assertEqual([r.url for r in output],
['http://example.com/somepage/item/12.html',
'http://example.com/about.html',
'http://example.com/nofollow.html'])
self.assertEqual(len(cw), 1)
self.assertEqual(cw[0].category, ScrapyDeprecationWarning)
def test_process_request_with_response(self):
response = HtmlResponse("http://example.org/somepage/index.html", body=self.test_body)
def process_request_meta_response_class(request, response):
request.meta['response_class'] = response.__class__.__name__
return request
class _CrawlSpider(self.spider_class):
name = "test"
allowed_domains = ['example.org']
rules = (
Rule(LinkExtractor(), process_request=process_request_meta_response_class),
)
spider = _CrawlSpider()
output = list(spider._requests_to_follow(response))
self.assertEqual(len(output), 3)
self.assertTrue(all(map(lambda r: isinstance(r, Request), output)))
self.assertEqual([r.url for r in output],
['http://example.org/somepage/item/12.html',
'http://example.org/about.html',
'http://example.org/nofollow.html'])
self.assertEqual([r.meta['response_class'] for r in output],
['HtmlResponse', 'HtmlResponse', 'HtmlResponse'])
def test_process_request_instance_method(self):
response = HtmlResponse("http://example.org/somepage/index.html", body=self.test_body)
class _CrawlSpider(self.spider_class):
name = "test"
allowed_domains = ['example.org']
rules = (
Rule(LinkExtractor(), process_request='process_request_upper'),
)
def process_request_upper(self, request):
return request.replace(url=request.url.upper())
with warnings.catch_warnings(record=True) as cw:
spider = _CrawlSpider()
output = list(spider._requests_to_follow(response))
self.assertEqual(len(output), 3)
self.assertTrue(all(map(lambda r: isinstance(r, Request), output)))
self.assertEqual([r.url for r in output],
['http://EXAMPLE.ORG/SOMEPAGE/ITEM/12.HTML',
'http://EXAMPLE.ORG/ABOUT.HTML',
'http://EXAMPLE.ORG/NOFOLLOW.HTML'])
self.assertEqual(len(cw), 1)
self.assertEqual(cw[0].category, ScrapyDeprecationWarning)
def test_process_request_instance_method_with_response(self):
response = HtmlResponse("http://example.org/somepage/index.html", body=self.test_body)
class _CrawlSpider(self.spider_class):
name = "test"
allowed_domains = ['example.org']
rules = (
Rule(LinkExtractor(), process_request='process_request_meta_response_class'),
)
def process_request_meta_response_class(self, request, response):
request.meta['response_class'] = response.__class__.__name__
return request
spider = _CrawlSpider()
output = list(spider._requests_to_follow(response))
self.assertEqual(len(output), 3)
self.assertTrue(all(map(lambda r: isinstance(r, Request), output)))
self.assertEqual([r.url for r in output],
['http://example.org/somepage/item/12.html',
'http://example.org/about.html',
'http://example.org/nofollow.html'])
self.assertEqual([r.meta['response_class'] for r in output],
['HtmlResponse', 'HtmlResponse', 'HtmlResponse'])
def test_follow_links_attribute_population(self): def test_follow_links_attribute_population(self):
crawler = get_crawler() crawler = get_crawler()
spider = self.spider_class.from_crawler(crawler, 'example.com') spider = self.spider_class.from_crawler(crawler, 'example.com')


@ -0,0 +1,102 @@
from twisted.trial.unittest import TestCase
from twisted.python.failure import Failure
from scrapy.spiders import Spider
from scrapy.http import Request, Response
from scrapy.exceptions import _InvalidOutput
from scrapy.utils.test import get_crawler
from scrapy.core.spidermw import SpiderMiddlewareManager
from tests import mock
class SpiderMiddlewareTestCase(TestCase):
def setUp(self):
self.request = Request('http://example.com/index.html')
self.response = Response(self.request.url, request=self.request)
self.crawler = get_crawler(Spider)
self.spider = self.crawler._create_spider('foo')
self.mwman = SpiderMiddlewareManager.from_crawler(self.crawler)
def _scrape_response(self):
"""Execute spider mw manager's scrape_response method and return the result.
Raise exception in case of failure.
"""
scrape_func = mock.MagicMock()
dfd = self.mwman.scrape_response(scrape_func, self.response, self.request, self.spider)
# catch deferred result and return the value
results = []
dfd.addBoth(results.append)
self._wait(dfd)
ret = results[0]
return ret
class ProcessSpiderInputInvalidOutput(SpiderMiddlewareTestCase):
"""Invalid return value for process_spider_input method"""
def test_invalid_process_spider_input(self):
class InvalidProcessSpiderInputMiddleware:
def process_spider_input(self, response, spider):
return 1
self.mwman._add_middleware(InvalidProcessSpiderInputMiddleware())
result = self._scrape_response()
self.assertIsInstance(result, Failure)
self.assertIsInstance(result.value, _InvalidOutput)
class ProcessSpiderOutputInvalidOutput(SpiderMiddlewareTestCase):
"""Invalid return value for process_spider_output method"""
def test_invalid_process_spider_output(self):
class InvalidProcessSpiderOutputMiddleware:
def process_spider_output(self, response, result, spider):
return 1
self.mwman._add_middleware(InvalidProcessSpiderOutputMiddleware())
result = self._scrape_response()
self.assertIsInstance(result, Failure)
self.assertIsInstance(result.value, _InvalidOutput)
class ProcessSpiderExceptionInvalidOutput(SpiderMiddlewareTestCase):
"""Invalid return value for process_spider_exception method"""
def test_invalid_process_spider_exception(self):
class InvalidProcessSpiderOutputExceptionMiddleware:
def process_spider_exception(self, response, exception, spider):
return 1
class RaiseExceptionProcessSpiderOutputMiddleware:
def process_spider_output(self, response, result, spider):
raise Exception()
self.mwman._add_middleware(InvalidProcessSpiderOutputExceptionMiddleware())
self.mwman._add_middleware(RaiseExceptionProcessSpiderOutputMiddleware())
result = self._scrape_response()
self.assertIsInstance(result, Failure)
self.assertIsInstance(result.value, _InvalidOutput)
class ProcessSpiderExceptionReRaise(SpiderMiddlewareTestCase):
"""Re raise the exception by returning None"""
def test_process_spider_exception_return_none(self):
class ProcessSpiderExceptionReturnNoneMiddleware:
def process_spider_exception(self, response, exception, spider):
return None
class RaiseExceptionProcessSpiderOutputMiddleware:
def process_spider_output(self, response, result, spider):
1/0
self.mwman._add_middleware(ProcessSpiderExceptionReturnNoneMiddleware())
self.mwman._add_middleware(RaiseExceptionProcessSpiderOutputMiddleware())
result = self._scrape_response()
self.assertIsInstance(result, Failure)
self.assertIsInstance(result.value, ZeroDivisionError)


@ -0,0 +1,380 @@
from testfixtures import LogCapture
from twisted.trial.unittest import TestCase
from twisted.internet import defer
from scrapy import Spider, Request
from scrapy.utils.test import get_crawler
from tests.mockserver import MockServer
from tests.spiders import MockServerSpider
class LogExceptionMiddleware:
def process_spider_exception(self, response, exception, spider):
spider.logger.info('Middleware: %s exception caught', exception.__class__.__name__)
return None
# ================================================================================
# (0) recover from an exception on a spider callback
class RecoverySpider(Spider):
name = 'RecoverySpider'
custom_settings = {
'SPIDER_MIDDLEWARES': {
__name__ + '.RecoveryMiddleware': 10,
},
}
def start_requests(self):
yield Request(self.mockserver.url('/status?n=200'))
def parse(self, response):
yield {'test': 1}
self.logger.info('DONT_FAIL: %s', response.meta.get('dont_fail'))
if not response.meta.get('dont_fail'):
raise TabError()
class RecoveryMiddleware:
def process_spider_exception(self, response, exception, spider):
spider.logger.info('Middleware: %s exception caught', exception.__class__.__name__)
return [
{'from': 'process_spider_exception'},
Request(response.url, meta={'dont_fail': True}, dont_filter=True),
]
# ================================================================================
# (1) exceptions from a spider middleware's process_spider_input method
class FailProcessSpiderInputMiddleware:
def process_spider_input(self, response, spider):
spider.logger.info('Middleware: will raise IndexError')
raise IndexError()
class ProcessSpiderInputSpiderWithoutErrback(Spider):
    name = 'ProcessSpiderInputSpiderWithoutErrback'
    custom_settings = {
        'SPIDER_MIDDLEWARES': {
            # spider
            __name__ + '.LogExceptionMiddleware': 10,
            __name__ + '.FailProcessSpiderInputMiddleware': 8,
            __name__ + '.LogExceptionMiddleware': 6,
            # engine
        }
    }

    def start_requests(self):
        yield Request(url=self.mockserver.url('/status?n=200'), callback=self.parse)

    def parse(self, response):
        return {'from': 'callback'}


class ProcessSpiderInputSpiderWithErrback(ProcessSpiderInputSpiderWithoutErrback):
    name = 'ProcessSpiderInputSpiderWithErrback'

    def start_requests(self):
        yield Request(url=self.mockserver.url('/status?n=200'), callback=self.parse, errback=self.errback)

    def errback(self, failure):
        self.logger.info('Got a Failure on the Request errback')
        return {'from': 'errback'}


# ================================================================================
# (2) exceptions from a spider callback (generator)
class GeneratorCallbackSpider(Spider):
    name = 'GeneratorCallbackSpider'
    custom_settings = {
        'SPIDER_MIDDLEWARES': {
            __name__ + '.LogExceptionMiddleware': 10,
        },
    }

    def start_requests(self):
        yield Request(self.mockserver.url('/status?n=200'))

    def parse(self, response):
        yield {'test': 1}
        yield {'test': 2}
        raise ImportError()


# ================================================================================
# (3) exceptions from a spider callback (not a generator)
class NotGeneratorCallbackSpider(Spider):
    name = 'NotGeneratorCallbackSpider'
    custom_settings = {
        'SPIDER_MIDDLEWARES': {
            __name__ + '.LogExceptionMiddleware': 10,
        },
    }

    def start_requests(self):
        yield Request(self.mockserver.url('/status?n=200'))

    def parse(self, response):
        return [{'test': 1}, {'test': 1/0}]


# ================================================================================
# (4) exceptions from a middleware process_spider_output method (generator)
class GeneratorOutputChainSpider(Spider):
    name = 'GeneratorOutputChainSpider'
    custom_settings = {
        'SPIDER_MIDDLEWARES': {
            __name__ + '.GeneratorFailMiddleware': 10,
            __name__ + '.GeneratorDoNothingAfterFailureMiddleware': 8,
            __name__ + '.GeneratorRecoverMiddleware': 5,
            __name__ + '.GeneratorDoNothingAfterRecoveryMiddleware': 3,
        },
    }

    def start_requests(self):
        yield Request(self.mockserver.url('/status?n=200'))

    def parse(self, response):
        yield {'processed': ['parse-first-item']}
        yield {'processed': ['parse-second-item']}


class _GeneratorDoNothingMiddleware:
    def process_spider_output(self, response, result, spider):
        for r in result:
            r['processed'].append('{}.process_spider_output'.format(self.__class__.__name__))
            yield r

    def process_spider_exception(self, response, exception, spider):
        method = '{}.process_spider_exception'.format(self.__class__.__name__)
        spider.logger.info('%s: %s caught', method, exception.__class__.__name__)
        return None


class GeneratorFailMiddleware:
    def process_spider_output(self, response, result, spider):
        for r in result:
            r['processed'].append('{}.process_spider_output'.format(self.__class__.__name__))
            yield r
        raise LookupError()

    def process_spider_exception(self, response, exception, spider):
        method = '{}.process_spider_exception'.format(self.__class__.__name__)
        spider.logger.info('%s: %s caught', method, exception.__class__.__name__)
        yield {'processed': [method]}


class GeneratorDoNothingAfterFailureMiddleware(_GeneratorDoNothingMiddleware):
    pass


class GeneratorRecoverMiddleware:
    def process_spider_output(self, response, result, spider):
        for r in result:
            r['processed'].append('{}.process_spider_output'.format(self.__class__.__name__))
            yield r

    def process_spider_exception(self, response, exception, spider):
        method = '{}.process_spider_exception'.format(self.__class__.__name__)
        spider.logger.info('%s: %s caught', method, exception.__class__.__name__)
        yield {'processed': [method]}


class GeneratorDoNothingAfterRecoveryMiddleware(_GeneratorDoNothingMiddleware):
    pass


# ================================================================================
# (5) exceptions from a middleware process_spider_output method (not generator)
class NotGeneratorOutputChainSpider(Spider):
    name = 'NotGeneratorOutputChainSpider'
    custom_settings = {
        'SPIDER_MIDDLEWARES': {
            __name__ + '.NotGeneratorFailMiddleware': 10,
            __name__ + '.NotGeneratorDoNothingAfterFailureMiddleware': 8,
            __name__ + '.NotGeneratorRecoverMiddleware': 5,
            __name__ + '.NotGeneratorDoNothingAfterRecoveryMiddleware': 3,
        },
    }

    def start_requests(self):
        return [Request(self.mockserver.url('/status?n=200'))]

    def parse(self, response):
        return [{'processed': ['parse-first-item']}, {'processed': ['parse-second-item']}]


class _NotGeneratorDoNothingMiddleware:
    def process_spider_output(self, response, result, spider):
        out = []
        for r in result:
            r['processed'].append('{}.process_spider_output'.format(self.__class__.__name__))
            out.append(r)
        return out

    def process_spider_exception(self, response, exception, spider):
        method = '{}.process_spider_exception'.format(self.__class__.__name__)
        spider.logger.info('%s: %s caught', method, exception.__class__.__name__)
        return None


class NotGeneratorFailMiddleware:
    def process_spider_output(self, response, result, spider):
        out = []
        for r in result:
            r['processed'].append('{}.process_spider_output'.format(self.__class__.__name__))
            out.append(r)
        raise ReferenceError()
        return out

    def process_spider_exception(self, response, exception, spider):
        method = '{}.process_spider_exception'.format(self.__class__.__name__)
        spider.logger.info('%s: %s caught', method, exception.__class__.__name__)
        return [{'processed': [method]}]


class NotGeneratorDoNothingAfterFailureMiddleware(_NotGeneratorDoNothingMiddleware):
    pass


class NotGeneratorRecoverMiddleware:
    def process_spider_output(self, response, result, spider):
        out = []
        for r in result:
            r['processed'].append('{}.process_spider_output'.format(self.__class__.__name__))
            out.append(r)
        return out

    def process_spider_exception(self, response, exception, spider):
        method = '{}.process_spider_exception'.format(self.__class__.__name__)
        spider.logger.info('%s: %s caught', method, exception.__class__.__name__)
        return [{'processed': [method]}]


class NotGeneratorDoNothingAfterRecoveryMiddleware(_NotGeneratorDoNothingMiddleware):
    pass


# ================================================================================
class TestSpiderMiddleware(TestCase):
    @classmethod
    def setUpClass(cls):
        cls.mockserver = MockServer()
        cls.mockserver.__enter__()

    @classmethod
    def tearDownClass(cls):
        cls.mockserver.__exit__(None, None, None)

    @defer.inlineCallbacks
    def crawl_log(self, spider):
        crawler = get_crawler(spider)
        with LogCapture() as log:
            yield crawler.crawl(mockserver=self.mockserver)
        defer.returnValue(log)
    @defer.inlineCallbacks
    def test_recovery(self):
        """
        (0) Recover from an exception in a spider callback. The final item count should be 3
        (one yielded from the callback method before the exception is raised, one directly
        from the recovery middleware and one from the spider when processing the request that
        was enqueued from the recovery middleware)
        """
        log = yield self.crawl_log(RecoverySpider)
        self.assertIn("Middleware: TabError exception caught", str(log))
        self.assertEqual(str(log).count("Middleware: TabError exception caught"), 1)
        self.assertIn("'item_scraped_count': 3", str(log))

    @defer.inlineCallbacks
    def test_process_spider_input_without_errback(self):
        """
        (1.1) An exception from the process_spider_input chain should be caught by the
        process_spider_exception chain from the start if the Request has no errback
        """
        log1 = yield self.crawl_log(ProcessSpiderInputSpiderWithoutErrback)
        self.assertIn("Middleware: will raise IndexError", str(log1))
        self.assertIn("Middleware: IndexError exception caught", str(log1))

    @defer.inlineCallbacks
    def test_process_spider_input_with_errback(self):
        """
        (1.2) An exception from the process_spider_input chain should not be caught by the
        process_spider_exception chain if the Request has an errback
        """
        log1 = yield self.crawl_log(ProcessSpiderInputSpiderWithErrback)
        self.assertNotIn("Middleware: IndexError exception caught", str(log1))
        self.assertIn("Middleware: will raise IndexError", str(log1))
        self.assertIn("Got a Failure on the Request errback", str(log1))
        self.assertIn("{'from': 'errback'}", str(log1))
        self.assertNotIn("{'from': 'callback'}", str(log1))
        self.assertIn("'item_scraped_count': 1", str(log1))

    @defer.inlineCallbacks
    def test_generator_callback(self):
        """
        (2) An exception from a spider callback (returning a generator) should
        be caught by the process_spider_exception chain. Items yielded before the
        exception is raised should be processed normally.
        """
        log2 = yield self.crawl_log(GeneratorCallbackSpider)
        self.assertIn("Middleware: ImportError exception caught", str(log2))
        self.assertIn("'item_scraped_count': 2", str(log2))

    @defer.inlineCallbacks
    def test_not_a_generator_callback(self):
        """
        (3) An exception from a spider callback (returning a list) should
        be caught by the process_spider_exception chain. No items should be processed.
        """
        log3 = yield self.crawl_log(NotGeneratorCallbackSpider)
        self.assertIn("Middleware: ZeroDivisionError exception caught", str(log3))
        self.assertNotIn("item_scraped_count", str(log3))

    @defer.inlineCallbacks
    def test_generator_output_chain(self):
        """
        (4) An exception from a middleware's process_spider_output method should be sent
        to the process_spider_exception method from the next middleware in the chain.
        The result of the recovery by the process_spider_exception method should be handled
        by the process_spider_output method from the next middleware.
        The final item count should be 2 (one from the spider callback and one from the
        process_spider_exception chain)
        """
        log4 = yield self.crawl_log(GeneratorOutputChainSpider)
        self.assertIn("'item_scraped_count': 2", str(log4))
        self.assertIn("GeneratorRecoverMiddleware.process_spider_exception: LookupError caught", str(log4))
        self.assertIn("GeneratorDoNothingAfterFailureMiddleware.process_spider_exception: LookupError caught", str(log4))
        self.assertNotIn("GeneratorFailMiddleware.process_spider_exception: LookupError caught", str(log4))
        self.assertNotIn("GeneratorDoNothingAfterRecoveryMiddleware.process_spider_exception: LookupError caught", str(log4))
        item_from_callback = {'processed': [
            'parse-first-item',
            'GeneratorFailMiddleware.process_spider_output',
            'GeneratorDoNothingAfterFailureMiddleware.process_spider_output',
            'GeneratorRecoverMiddleware.process_spider_output',
            'GeneratorDoNothingAfterRecoveryMiddleware.process_spider_output']}
        item_recovered = {'processed': [
            'GeneratorRecoverMiddleware.process_spider_exception',
            'GeneratorDoNothingAfterRecoveryMiddleware.process_spider_output']}
        self.assertIn(str(item_from_callback), str(log4))
        self.assertIn(str(item_recovered), str(log4))
        self.assertNotIn('parse-second-item', str(log4))

    @defer.inlineCallbacks
    def test_not_a_generator_output_chain(self):
        """
        (5) An exception from a middleware's process_spider_output method should be sent
        to the process_spider_exception method from the next middleware in the chain.
        The result of the recovery by the process_spider_exception method should be handled
        by the process_spider_output method from the next middleware.
        The final item count should be 1 (from the process_spider_exception chain, the items
        from the spider callback are lost)
        """
        log5 = yield self.crawl_log(NotGeneratorOutputChainSpider)
        self.assertIn("'item_scraped_count': 1", str(log5))
self.assertIn("GeneratorRecoverMiddleware.process_spider_exception: ReferenceError caught", str(log5))
self.assertIn("GeneratorDoNothingAfterFailureMiddleware.process_spider_exception: ReferenceError caught", str(log5))
self.assertNotIn("GeneratorFailMiddleware.process_spider_exception: ReferenceError caught", str(log5))
self.assertNotIn("GeneratorDoNothingAfterRecoveryMiddleware.process_spider_exception: ReferenceError caught", str(log5))
        item_recovered = {'processed': [
            'NotGeneratorRecoverMiddleware.process_spider_exception',
            'NotGeneratorDoNothingAfterRecoveryMiddleware.process_spider_output']}
        self.assertIn(str(item_recovered), str(log5))
        self.assertNotIn('parse-first-item', str(log5))
        self.assertNotIn('parse-second-item', str(log5))
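
The contract these spiders and assertions exercise is that an exception raised in a callback, or in a later process_spider_output step, is offered to the process_spider_exception methods of the following middlewares in the chain, and the first one that returns or yields an iterable (rather than None) stops propagation and feeds its output back into the remaining chain. As a rough illustration only (not part of this changeset; the class name and log messages are made up), such a recovery middleware could look like this:

import logging

logger = logging.getLogger(__name__)


class RecoverOnExceptionMiddleware(object):

    def process_spider_output(self, response, result, spider):
        # Pass results through unchanged; an exception raised while this
        # generator is consumed is handled by process_spider_exception of the
        # middlewares that come after this one in the chain.
        for item_or_request in result:
            yield item_or_request

    def process_spider_exception(self, response, exception, spider):
        # Yielding an iterable (instead of returning None) stops the exception
        # from propagating further and resumes normal output processing.
        logger.info('Recovered from %s while processing %s',
                    exception.__class__.__name__, response.url)
        yield {'recovered_from': exception.__class__.__name__}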

View File

@@ -3,12 +3,13 @@ import os
 import unittest
 from scrapy.item import Item, Field
-from scrapy.utils.misc import arg_to_iter, create_instance, load_object, walk_modules
+from scrapy.utils.misc import arg_to_iter, create_instance, load_object, set_environ, walk_modules
 from tests import mock
 __doctests__ = ['scrapy.utils.misc']
 class UtilsMiscTestCase(unittest.TestCase):
     def test_load_object(self):
@@ -130,5 +131,18 @@ class UtilsMiscTestCase(unittest.TestCase):
         with self.assertRaises(ValueError):
             create_instance(m, None, None)
+    def test_set_environ(self):
+        assert os.environ.get('some_test_environ') is None
+        with set_environ(some_test_environ='test_value'):
+            assert os.environ.get('some_test_environ') == 'test_value'
+        assert os.environ.get('some_test_environ') is None
+
+        os.environ['some_test_environ'] = 'test'
+        assert os.environ.get('some_test_environ') == 'test'
+        with set_environ(some_test_environ='test_value'):
+            assert os.environ.get('some_test_environ') == 'test_value'
+        assert os.environ.get('some_test_environ') == 'test'
+
 if __name__ == "__main__":
     unittest.main()
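
The new test pins down the expected contract of set_environ: variables are set inside the with block and, on exit, either removed (if they were previously unset) or restored to their previous value. A minimal sketch of a helper with that contract, for illustration only (the actual scrapy.utils.misc implementation may differ), could be:

import os
from contextlib import contextmanager


@contextmanager
def set_environ(**kwargs):
    """Temporarily set environment variables, restoring the previous state on exit."""
    original = {key: os.environ.get(key) for key in kwargs}
    os.environ.update(kwargs)
    try:
        yield
    finally:
        for key, value in original.items():
            if value is None:
                os.environ.pop(key, None)  # the variable was not set before
            else:
                os.environ[key] = value    # restore the previous value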

View File

@@ -9,11 +9,23 @@ import six
 from scrapy.utils.python import (
     memoizemethod_noargs, binary_is_text, equal_attributes,
     WeakKeyCache, stringify_dict, get_func_args, to_bytes, to_unicode,
-    without_none_values)
+    without_none_values, MutableChain)
 __doctests__ = ['scrapy.utils.python']
+
+class MutableChainTest(unittest.TestCase):
+    def test_mutablechain(self):
+        m = MutableChain(range(2), [2, 3], (4, 5))
+        m.extend(range(6, 7))
+        m.extend([7, 8])
+        m.extend([9, 10], (11, 12))
+        self.assertEqual(next(m), 0)
+        self.assertEqual(m.next(), 1)
+        self.assertEqual(m.__next__(), 2)
+        self.assertEqual(list(m), list(range(3, 13)))
+
 class ToUnicodeTest(unittest.TestCase):
     def test_converting_an_utf8_encoded_string_to_unicode(self):
         self.assertEqual(to_unicode(b'lel\xc3\xb1e'), u'lel\xf1e')
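
The behaviour expected here is an iterator over several chained iterables that can keep being extended while it is consumed, and that supports both next(m) and the Python 2 style m.next(). One way to sketch such an object (illustrative only; the real scrapy.utils.python.MutableChain may be implemented differently):

from itertools import chain


class MutableChain(object):
    """Thin wrapper around itertools.chain that allows appending more
    iterables even after iteration has started."""

    def __init__(self, *args):
        self.data = chain(*args)

    def extend(self, *iterables):
        # Re-chain: items still pending in the old chain come first,
        # then the newly added iterables.
        self.data = chain(self.data, *iterables)

    def __iter__(self):
        return self.data.__iter__()

    def __next__(self):
        return next(self.data)

    next = __next__  # Python 2 compatibility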

View File

@@ -1,9 +1,12 @@
 # -*- coding: utf-8 -*-
 import unittest
+import sys
+
+import six
 from scrapy.http import Request, FormRequest
 from scrapy.spiders import Spider
-from scrapy.utils.reqser import request_to_dict, request_from_dict
+from scrapy.utils.reqser import request_to_dict, request_from_dict, _is_private_method, _mangle_private_name
 class RequestSerializationTest(unittest.TestCase):
@@ -70,6 +73,56 @@ class RequestSerializationTest(unittest.TestCase):
                     errback=self.spider.handle_error)
         self._assert_serializes_ok(r, spider=self.spider)
+    def test_private_callback_serialization(self):
+        r = Request("http://www.example.com",
+                    callback=self.spider._TestSpider__parse_item_private,
+                    errback=self.spider.handle_error)
+        self._assert_serializes_ok(r, spider=self.spider)
+
+    def test_mixin_private_callback_serialization(self):
+        if sys.version_info[0] < 3:
+            return
+        r = Request("http://www.example.com",
+                    callback=self.spider._TestSpiderMixin__mixin_callback,
+                    errback=self.spider.handle_error)
+        self._assert_serializes_ok(r, spider=self.spider)
+
+    def test_private_callback_name_matching(self):
+        self.assertTrue(_is_private_method('__a'))
+        self.assertTrue(_is_private_method('__a_'))
+        self.assertTrue(_is_private_method('__a_a'))
+        self.assertTrue(_is_private_method('__a_a_'))
+        self.assertTrue(_is_private_method('__a__a'))
+        self.assertTrue(_is_private_method('__a__a_'))
+        self.assertTrue(_is_private_method('__a___a'))
+        self.assertTrue(_is_private_method('__a___a_'))
+        self.assertTrue(_is_private_method('___a'))
+        self.assertTrue(_is_private_method('___a_'))
+        self.assertTrue(_is_private_method('___a_a'))
+        self.assertTrue(_is_private_method('___a_a_'))
+        self.assertTrue(_is_private_method('____a_a_'))
+        self.assertFalse(_is_private_method('_a'))
+        self.assertFalse(_is_private_method('_a_'))
+        self.assertFalse(_is_private_method('__a__'))
+        self.assertFalse(_is_private_method('__'))
+        self.assertFalse(_is_private_method('___'))
+        self.assertFalse(_is_private_method('____'))
+
+    def _assert_mangles_to(self, obj, name):
+        func = getattr(obj, name)
+        self.assertEqual(
+            _mangle_private_name(obj, func, func.__name__),
+            name
+        )
+
+    def test_private_name_mangling(self):
+        self._assert_mangles_to(
+            self.spider, '_TestSpider__parse_item_private')
+        if sys.version_info[0] >= 3:
+            self._assert_mangles_to(
+                self.spider, '_TestSpiderMixin__mixin_callback')
+
     def test_unserializable_callback1(self):
         r = Request("http://www.example.com", callback=lambda x: x)
         self.assertRaises(ValueError, request_to_dict, r)
@@ -80,7 +133,12 @@ class RequestSerializationTest(unittest.TestCase):
         self.assertRaises(ValueError, request_to_dict, r)
-class TestSpider(Spider):
+class TestSpiderMixin(object):
+    def __mixin_callback(self, response):
+        pass
+
+
+class TestSpider(Spider, TestSpiderMixin):
     name = 'test'
     def parse_item(self, response):
@@ -89,6 +147,9 @@ class TestSpider(Spider):
     def handle_error(self, failure):
         pass
+    def __parse_item_private(self, response):
+        pass
+
 class CustomRequest(Request):
     pass
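
Background for the two helpers being tested: inside a class body, Python mangles a name such as __parse_item_private into _TestSpider__parse_item_private, so serializing a private callback by name means detecting that pattern and rebuilding the mangled attribute name from the defining class. The mixin cases above run only on Python 3, where __qualname__ makes the defining class recoverable. Helpers consistent with these tests might look roughly like this (a sketch, not the exact code added by this commit):

def _is_private_method(name):
    # Mangling applies to names with two or more leading underscores and at
    # most one trailing underscore; dunder names such as __len__ are untouched.
    return name.startswith('__') and not name.endswith('__')


def _mangle_private_name(obj, func, name):
    qualname = getattr(func, '__qualname__', None)  # available on Python 3 only
    if qualname:
        # e.g. 'TestSpiderMixin.__mixin_callback' -> 'TestSpiderMixin'
        classname = qualname.split('.')[-2]
    else:
        # Python 2 fallback: assume the method is defined on obj's own class.
        classname = obj.__class__.__name__
    # Mangling strips leading underscores from the class name.
    return '_%s%s' % (classname.lstrip('_'), name)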

View File

@@ -105,6 +105,12 @@ deps = {[docs]deps}
 commands =
     sphinx-build -W -b html . {envtmpdir}/html
+[testenv:docs-coverage]
+changedir = {[docs]changedir}
+deps = {[docs]deps}
+commands =
+    sphinx-build -b coverage . {envtmpdir}/coverage
+
 [testenv:docs-links]
 changedir = {[docs]changedir}
 deps = {[docs]deps}