Merge remote-tracking branch 'origin/master' into callback_kwargs
commit 428309ba1a
@@ -12,7 +12,8 @@ branches:
 
 install:
   - "SET PATH=%PYTHON%;%PYTHON%\\Scripts;%PATH%"
-  - "SET TOX_TESTENV_PASSENV=HOME USERPROFILE HOMEPATH HOMEDRIVE"
+  - "SET PYTHONPATH=%APPVEYOR_BUILD_FOLDER%"
+  - "SET TOX_TESTENV_PASSENV=HOME HOMEDRIVE HOMEPATH PYTHONPATH USERPROFILE"
   - "pip install -U tox"
 
 build: false
@@ -82,6 +82,9 @@ pydoc-topics: build
     @echo "Building finished; now copy build/pydoc-topics/pydoc_topics.py " \
          "into the Lib/ directory"
 
+coverage: BUILDER = coverage
+coverage: build
+
 htmlview: html
     $(PYTHON) -c "import webbrowser, os; webbrowser.open('file://' + \
          os.path.realpath('build/html/index.html'))"
docs/conf.py
@@ -28,7 +28,8 @@ sys.path.insert(0, path.dirname(path.dirname(__file__)))
 # coming with Sphinx (named 'sphinx.ext.*') or your custom ones.
 extensions = [
     'scrapydocs',
-    'sphinx.ext.autodoc'
+    'sphinx.ext.autodoc',
+    'sphinx.ext.coverage',
 ]
 
 # Add any paths that contain templates here, relative to this directory.
@@ -218,3 +219,22 @@ linkcheck_ignore = [
     'http://localhost:\d+', 'http://hg.scrapy.org',
     'http://directory.google.com/'
 ]
+
+
+# Options for the Coverage extension
+# ----------------------------------
+coverage_ignore_pyobjects = [
+    # Contract’s add_pre_hook and add_post_hook are not documented because
+    # they should be transparent to contract developers, for whom pre_hook and
+    # post_hook should be the actual concern.
+    r'\bContract\.add_(pre|post)_hook$',
+
+    # ContractsManager is an internal class, developers are not expected to
+    # interact with it directly in any way.
+    r'\bContractsManager\b$',
+
+    # For default contracts we only want to document their general purpose in
+    # their constructor, the methods they reimplement to achieve that purpose
+    # should be irrelevant to developers using those contracts.
+    r'\w+Contract\.(adjust_request_args|(pre|post)_process)$',
+]
@@ -99,6 +99,15 @@ Well-written patches should:
   the documentation changes in the same patch. See `Documentation policies`_
   below.
 
+* if you're adding a private API, please add a regular expression to the
+  ``coverage_ignore_pyobjects`` variable of ``docs/conf.py`` to exclude the new
+  private API from documentation coverage checks.
+
+  To see if your private API is skipped properly, generate a documentation
+  coverage report as follows::
+
+      tox -e docs-coverage
+
 .. _submitting-patches:
 
 Submitting patches
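
Illustrative sketch (not part of the diff above): a private helper could be excluded from the documentation coverage report with an entry along these lines in ``docs/conf.py``; the class and method names here are purely hypothetical::

    coverage_ignore_pyobjects = [
        # Hypothetical private helper, hidden from documentation coverage checks
        r'\bMySpiderHelper\._parse_raw_feed$',
    ]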
@@ -167,8 +176,9 @@ Documentation policies
 
 For reference documentation of API members (classes, methods, etc.) use
 docstrings and make sure that the Sphinx documentation uses the autodoc_
-extension to pull the docstrings. API reference documentation should be
-IDE-friendly: short, to the point, and it may provide short examples.
+extension to pull the docstrings. API reference documentation should follow
+docstring conventions (`PEP 257`_) and be IDE-friendly: short, to the point,
+and it may provide short examples.
 
 Other types of documentation, such as tutorials or topics, should be covered in
 files within the ``docs/`` directory. This includes documentation that is
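
Illustrative sketch (not part of the diff above) of the docstring style the new policy describes, following `PEP 257`_: a one-line summary, a blank line, and a short example; the method shown is only an assumed, simplified signature::

    def get(self, default=None):
        """Return the first extracted value, or *default* if there is none.

        >>> Selector(text='<a>x</a>').css('a::text').get()
        'x'
        """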
@@ -205,6 +215,29 @@ To run a specific test (say ``tests/test_loader.py``) use:
 
 ``tox -- tests/test_loader.py``
 
+To run the tests on a specific tox_ environment, use ``-e <name>`` with an
+environment name from ``tox.ini``. For example, to run the tests with Python
+3.6 use::
+
+    tox -e py36
+
+You can also specify a comma-separated list of environmets, and use `tox’s
+parallel mode`_ to run the tests on multiple environments in parallel::
+
+    tox -e py27,py36 -p auto
+
+To pass command-line options to pytest_, add them after ``--`` in your call to
+tox_. Using ``--`` overrides the default positional arguments defined in
+``tox.ini``, so you must include those default positional arguments
+(``scrapy tests``) after ``--`` as well::
+
+    tox -- scrapy tests -x  # stop after first failure
+
+You can also use the `pytest-xdist`_ plugin. For example, to run all tests on
+the Python 3.6 tox_ environment using all your CPU cores::
+
+    tox -e py36 -- scrapy tests -n auto
+
 To see coverage report install `coverage`_ (``pip install coverage``) and run:
 
 ``coverage report``
@ -237,5 +270,9 @@ And their unit-tests are in::
|
|||||||
.. _AUTHORS: https://github.com/scrapy/scrapy/blob/master/AUTHORS
|
.. _AUTHORS: https://github.com/scrapy/scrapy/blob/master/AUTHORS
|
||||||
.. _tests/: https://github.com/scrapy/scrapy/tree/master/tests
|
.. _tests/: https://github.com/scrapy/scrapy/tree/master/tests
|
||||||
.. _open issues: https://github.com/scrapy/scrapy/issues
|
.. _open issues: https://github.com/scrapy/scrapy/issues
|
||||||
.. _pull request: https://help.github.com/send-pull-requests/
|
.. _PEP 257: https://www.python.org/dev/peps/pep-0257/
|
||||||
|
.. _pull request: https://help.github.com/en/articles/creating-a-pull-request
|
||||||
|
.. _pytest: https://docs.pytest.org/en/latest/usage.html
|
||||||
|
.. _pytest-xdist: https://docs.pytest.org/en/3.0.0/xdist.html
|
||||||
.. _tox: https://pypi.python.org/pypi/tox
|
.. _tox: https://pypi.python.org/pypi/tox
|
||||||
|
.. _tox’s parallel mode: https://tox.readthedocs.io/en/latest/example/basic.html#parallel-mode
|
||||||
|
@@ -158,6 +158,7 @@ Solving specific problems
    topics/practices
    topics/broad-crawls
    topics/developer-tools
+   topics/dynamic-content
    topics/leaks
    topics/media-pipeline
    topics/deploy
@@ -183,6 +184,9 @@ Solving specific problems
 :doc:`topics/developer-tools`
     Learn how to scrape with your browser's developer tools.
 
+:doc:`topics/dynamic-content`
+    Read webpage data that is loaded dynamically.
+
 :doc:`topics/leaks`
     Learn how to find and get rid of memory leaks in your crawler.
 
@@ -205,7 +205,7 @@ Extracting data
 ---------------
 
 The best way to learn how to extract data with Scrapy is trying selectors
-using the shell :ref:`Scrapy shell <topics-shell>`. Run::
+using the :ref:`Scrapy shell <topics-shell>`. Run::
 
     scrapy shell 'http://quotes.toscrape.com/page/1/'
 
@@ -296,8 +296,8 @@ expressions`_::
 
 In order to find the proper CSS selectors to use, you might find useful opening
 the response page from the shell in your web browser using ``view(response)``.
-You can use your browser developer tools to inspect the HTML and come up
-with a selector (see section about :ref:`topics-developer-tools`).
+You can use your browser's developer tools to inspect the HTML and come up
+with a selector (see :ref:`topics-developer-tools`).
 
 `Selector Gadget`_ is also a nice tool to quickly find CSS selector for
 visually selected elements, which works in many browsers.
@@ -379,11 +379,11 @@ variable, so that we can run our CSS selectors directly on a particular quote::
 
     >>> quote = response.css("div.quote")[0]
 
-Now, let's extract ``title``, ``author`` and the ``tags`` from that quote
+Now, let's extract ``text``, ``author`` and the ``tags`` from that quote
 using the ``quote`` object we just created::
 
-    >>> title = quote.css("span.text::text").get()
-    >>> title
+    >>> text = quote.css("span.text::text").get()
+    >>> text
     '“The world as we have created it is a process of our thinking. It cannot be changed without changing our thinking.”'
     >>> author = quote.css("small.author::text").get()
    >>> author
@@ -511,7 +511,7 @@ We can try extracting it in the shell::
     '<a href="/page/2/">Next <span aria-hidden="true">→</span></a>'
 
 This gets the anchor element, but we want the attribute ``href``. For that,
-Scrapy supports a CSS extension that let's you select the attribute contents,
+Scrapy supports a CSS extension that lets you select the attribute contents,
 like this::
 
     >>> response.css('li.next a::attr(href)').get()
@@ -1,2 +1,2 @@
-Sphinx>=1.6
+Sphinx>=2.1
 sphinx_rtd_theme
@@ -99,6 +99,8 @@ how you :ref:`configure the downloader middlewares
 
         Returns a deferred that is fired when the crawl is finished.
 
+    .. automethod:: stop
+
 .. autoclass:: CrawlerRunner
    :members:
 
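
Illustrative sketch (not part of the diff above) of the ``CrawlerProcess`` API documented in this file; ``MySpider`` is a placeholder spider class, not something defined in this commit::

    from scrapy.crawler import CrawlerProcess

    process = CrawlerProcess(settings={'LOG_LEVEL': 'INFO'})
    process.crawl(MySpider)
    process.start()  # blocks until the crawl finishes; process.stop() ends it early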
@@ -154,7 +156,7 @@ Settings API
 SpiderLoader API
 ================
 
-.. module:: scrapy.loader
+.. module:: scrapy.spiderloader
    :synopsis: The spider loader
 
 .. class:: SpiderLoader
@@ -39,6 +39,17 @@ you need to keep in mind when using Scrapy for doing broad crawls, along with
 concrete suggestions of Scrapy settings to tune in order to achieve an
 efficient broad crawl.
 
+Use the right :setting:`SCHEDULER_PRIORITY_QUEUE`
+=================================================
+
+Scrapy’s default scheduler priority queue is ``'scrapy.pqueues.ScrapyPriorityQueue'``.
+It works best during single-domain crawl. It does not work well with crawling
+many different domains in parallel
+
+To apply the recommended priority queue use::
+
+    SCHEDULER_PRIORITY_QUEUE = 'scrapy.pqueues.DownloaderAwarePriorityQueue'
+
 Increase concurrency
 ====================
 
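
Illustrative sketch (not part of the diff above): the recommended value goes into the project settings module, for example::

    # settings.py (assumed project settings module)
    SCHEDULER_PRIORITY_QUEUE = 'scrapy.pqueues.DownloaderAwarePriorityQueue'
    CONCURRENT_REQUESTS = 100  # typical broad-crawl tuning; adjust to your resources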
@@ -120,3 +120,23 @@ get the failures pretty printed::
         for header in self.args:
             if header not in response.headers:
                 raise ContractFail('X-CustomHeader not present')
+
+
+Detecting check runs
+====================
+
+When ``scrapy check`` is running, the ``SCRAPY_CHECK`` environment variable is
+set to the ``true`` string. You can use `os.environ`_ to perform any change to
+your spiders or your settings when ``scrapy check`` is used::
+
+    import os
+    import scrapy
+
+    class ExampleSpider(scrapy.Spider):
+        name = 'example'
+
+        def __init__(self):
+            if os.environ.get('SCRAPY_CHECK'):
+                pass  # Do some scraper adjustments when a check is running
+
+.. _os.environ: https://docs.python.org/3/library/os.html#os.environ
@@ -805,6 +805,7 @@ The :class:`MetaRefreshMiddleware` can be configured through the following
 settings (see the settings documentation for more info):
 
 * :setting:`METAREFRESH_ENABLED`
+* :setting:`METAREFRESH_IGNORE_TAGS`
 * :setting:`METAREFRESH_MAXDELAY`
 
 This middleware obey :setting:`REDIRECT_MAX_TIMES` setting, :reqmeta:`dont_redirect`,
@@ -826,6 +827,15 @@ Default: ``True``
 
 Whether the Meta Refresh middleware will be enabled.
 
+.. setting:: METAREFRESH_IGNORE_TAGS
+
+METAREFRESH_IGNORE_TAGS
+^^^^^^^^^^^^^^^^^^^^^^^
+
+Default: ``['script', 'noscript']``
+
+Meta tags within these tags are ignored.
+
 .. setting:: METAREFRESH_MAXDELAY
 
 METAREFRESH_MAXDELAY
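
Illustrative sketch (not part of the diff above): overriding the new setting with an empty list makes the middleware honour meta refresh tags found anywhere in the document::

    # settings.py (assumed project settings module)
    METAREFRESH_IGNORE_TAGS = []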
docs/topics/dynamic-content.rst (new file, 246 lines)
@@ -0,0 +1,246 @@
+.. _topics-dynamic-content:
+
+====================================
+Selecting dynamically-loaded content
+====================================
+
+Some webpages show the desired data when you load them in a web browser.
+However, when you download them using Scrapy, you cannot reach the desired data
+using :ref:`selectors <topics-selectors>`.
+
+When this happens, the recommended approach is to
+:ref:`find the data source <topics-finding-data-source>` and extract the data
+from it.
+
+If you fail to do that, and you can nonetheless access the desired data through
+the :ref:`DOM <topics-livedom>` from your web browser, see
+:ref:`topics-javascript-rendering`.
+
+.. _topics-finding-data-source:
+
+Finding the data source
+=======================
+
+To extract the desired data, you must first find its source location.
+
+If the data is in a non-text-based format, such as an image or a PDF document,
+use the :ref:`network tool <topics-network-tool>` of your web browser to find
+the corresponding request, and :ref:`reproduce it
+<topics-reproducing-requests>`.
+
+If your web browser lets you select the desired data as text, the data may be
+defined in embedded JavaScript code, or loaded from an external resource in a
+text-based format.
+
+In that case, you can use a tool like wgrep_ to find the URL of that resource.
+
+If the data turns out to come from the original URL itself, you must
+:ref:`inspect the source code of the webpage <topics-inspecting-source>` to
+determine where the data is located.
+
+If the data comes from a different URL, you will need to :ref:`reproduce the
+corresponding request <topics-reproducing-requests>`.
+
+.. _topics-inspecting-source:
+
+Inspecting the source code of a webpage
+=======================================
+
+Sometimes you need to inspect the source code of a webpage (not the
+:ref:`DOM <topics-livedom>`) to determine where some desired data is located.
+
+Use Scrapy’s :command:`fetch` command to download the webpage contents as seen
+by Scrapy::
+
+    scrapy fetch --nolog https://example.com > response.html
+
+If the desired data is in embedded JavaScript code within a ``<script/>``
+element, see :ref:`topics-parsing-javascript`.
+
+If you cannot find the desired data, first make sure it’s not just Scrapy:
+download the webpage with an HTTP client like curl_ or wget_ and see if the
+information can be found in the response they get.
+
+If they get a response with the desired data, modify your Scrapy
+:class:`~scrapy.http.Request` to match that of the other HTTP client. For
+example, try using the same user-agent string (:setting:`USER_AGENT`) or the
+same :attr:`~scrapy.http.Request.headers`.
+
+If they also get a response without the desired data, you’ll need to take
+steps to make your request more similar to that of the web browser. See
+:ref:`topics-reproducing-requests`.
+
+.. _topics-reproducing-requests:
+
+Reproducing requests
+====================
+
+Sometimes we need to reproduce a request the way our web browser performs it.
+
+Use the :ref:`network tool <topics-network-tool>` of your web browser to see
+how your web browser performs the desired request, and try to reproduce that
+request with Scrapy.
+
+It might be enough to yield a :class:`~scrapy.http.Request` with the same HTTP
+method and URL. However, you may also need to reproduce the body, headers and
+form parameters (see :class:`~scrapy.http.FormRequest`) of that request.
+
+Once you get the expected response, you can :ref:`extract the desired data from
+it <topics-handling-response-formats>`.
+
+You can reproduce any request with Scrapy. However, some times reproducing all
+necessary requests may not seem efficient in developer time. If that is your
+case, and crawling speed is not a major concern for you, you can alternatively
+consider :ref:`JavaScript pre-rendering <topics-javascript-rendering>`.
+
+If you get the expected response `sometimes`, but not always, the issue is
+probably not your request, but the target server. The target server might be
+buggy, overloaded, or :ref:`banning <bans>` some of your requests.
+
+.. _topics-handling-response-formats:
+
+Handling different response formats
+===================================
+
+Once you have a response with the desired data, how you extract the desired
+data from it depends on the type of response:
+
+- If the response is HTML or XML, use :ref:`selectors
+  <topics-selectors>` as usual.
+
+- If the response is JSON, use `json.loads`_ to load the desired data from
+  :attr:`response.text <scrapy.http.TextResponse.text>`::
+
+      data = json.loads(response.text)
+
+  If the desired data is inside HTML or XML code embedded within JSON data,
+  you can load that HTML or XML code into a
+  :class:`~scrapy.selector.Selector` and then
+  :ref:`use it <topics-selectors>` as usual::
+
+      selector = Selector(data['html'])
+
+- If the response is JavaScript, or HTML with a ``<script/>`` element
+  containing the desired data, see :ref:`topics-parsing-javascript`.
+
+- If the response is CSS, use a `regular expression`_ to extract the desired
+  data from :attr:`response.text <scrapy.http.TextResponse.text>`.
+
+.. _topics-parsing-images:
+
+- If the response is an image or another format based on images (e.g. PDF),
+  read the response as bytes from
+  :attr:`response.body <scrapy.http.TextResponse.body>` and use an OCR
+  solution to extract the desired data as text.
+
+  For example, you can use pytesseract_. To read a table from a PDF,
+  `tabula-py`_ may be a better choice.
+
+- If the response is SVG, or HTML with embedded SVG containing the desired
+  data, you may be able to extract the desired data using
+  :ref:`selectors <topics-selectors>`, since SVG is based on XML.
+
+  Otherwise, you might need to convert the SVG code into a raster image, and
+  :ref:`handle that raster image <topics-parsing-images>`.
+
+.. _topics-parsing-javascript:
+
+Parsing JavaScript code
+=======================
+
+If the desired data is hardcoded in JavaScript, you first need to get the
+JavaScript code:
+
+- If the JavaScript code is in a JavaScript file, simply read
+  :attr:`response.text <scrapy.http.TextResponse.text>`.
+
+- If the JavaScript code is within a ``<script/>`` element of an HTML page,
+  use :ref:`selectors <topics-selectors>` to extract the text within that
+  ``<script/>`` element.
+
+Once you have a string with the JavaScript code, you can extract the desired
+data from it:
+
+- You might be able to use a `regular expression`_ to extract the desired
+  data in JSON format, which you can then parse with `json.loads`_.
+
+  For example, if the JavaScript code contains a separate line like
+  ``var data = {"field": "value"};`` you can extract that data as follows::
+
+      >>> pattern = r'\bvar\s+data\s*=\s*(\{.*?\})\s*;\s*\n'
+      >>> json_data = response.css('script::text').re_first(pattern)
+      >>> json.loads(json_data)
+      {'field': 'value'}
+
+- Otherwise, use js2xml_ to convert the JavaScript code into an XML document
+  that you can parse using :ref:`selectors <topics-selectors>`.
+
+  For example, if the JavaScript code contains
+  ``var data = {field: "value"};`` you can extract that data as follows::
+
+      >>> import js2xml
+      >>> import lxml.etree
+      >>> from parsel import Selector
+      >>> javascript = response.css('script::text').get()
+      >>> xml = lxml.etree.tostring(js2xml.parse(javascript), encoding='unicode')
+      >>> selector = Selector(text=xml)
+      >>> selector.css('var[name="data"]').get()
+      '<var name="data"><object><property name="field"><string>value</string></property></object></var>'
+
+.. _topics-javascript-rendering:
+
+Pre-rendering JavaScript
+========================
+
+On webpages that fetch data from additional requests, reproducing those
+requests that contain the desired data is the preferred approach. The effort is
+often worth the result: structured, complete data with minimum parsing time and
+network transfer.
+
+However, sometimes it can be really hard to reproduce certain requests. Or you
+may need something that no request can give you, such as a screenshot of a
+webpage as seen in a web browser.
+
+In these cases use the Splash_ JavaScript-rendering service, along with
+`scrapy-splash`_ for seamless integration.
+
+Splash returns as HTML the :ref:`DOM <topics-livedom>` of a webpage, so that
+you can parse it with :ref:`selectors <topics-selectors>`. It provides great
+flexibility through configuration_ or scripting_.
+
+If you need something beyond what Splash offers, such as interacting with the
+DOM on-the-fly from Python code instead of using a previously-written script,
+or handling multiple web browser windows, you might need to
+:ref:`use a headless browser <topics-headless-browsing>` instead.
+
+.. _configuration: https://splash.readthedocs.io/en/stable/api.html
+.. _scripting: https://splash.readthedocs.io/en/stable/scripting-tutorial.html
+
+.. _topics-headless-browsing:
+
+Using a headless browser
+========================
+
+A `headless browser`_ is a special web browser that provides an API for
+automation.
+
+The easiest way to use a headless browser with Scrapy is to use Selenium_,
+along with `scrapy-selenium`_ for seamless integration.
+
+
+.. _AJAX: https://en.wikipedia.org/wiki/Ajax_%28programming%29
+.. _CSS: https://en.wikipedia.org/wiki/Cascading_Style_Sheets
+.. _curl: https://curl.haxx.se/
+.. _headless browser: https://en.wikipedia.org/wiki/Headless_browser
+.. _JavaScript: https://en.wikipedia.org/wiki/JavaScript
+.. _js2xml: https://github.com/scrapinghub/js2xml
+.. _json.loads: https://docs.python.org/library/json.html#json.loads
+.. _pytesseract: https://github.com/madmaze/pytesseract
+.. _regular expression: https://docs.python.org/library/re.html
+.. _scrapy-selenium: https://github.com/clemfromspace/scrapy-selenium
+.. _scrapy-splash: https://github.com/scrapy-plugins/scrapy-splash
+.. _Selenium: https://www.seleniumhq.org/
+.. _Splash: https://github.com/scrapinghub/splash
+.. _tabula-py: https://github.com/chezou/tabula-py
+.. _wget: https://www.gnu.org/software/wget/
+.. _wgrep: https://github.com/stav/wgrep
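
Illustrative sketch (not part of the diff above) of the Splash integration the new document mentions, using the public ``scrapy_splash`` API; the target URL is an example and the extra project configuration (``SPLASH_URL`` and the scrapy-splash middlewares) is assumed to already be in place::

    import scrapy
    from scrapy_splash import SplashRequest

    class QuotesJSSpider(scrapy.Spider):
        name = 'quotes_js'  # placeholder spider

        def start_requests(self):
            # Render the page in Splash before it reaches the spider callback
            yield SplashRequest('http://quotes.toscrape.com/js/',
                                callback=self.parse, args={'wait': 1})

        def parse(self, response):
            for text in response.css('div.quote span.text::text').getall():
                yield {'text': text}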
@@ -238,9 +238,10 @@ scrapy.utils.log module
 
 .. autofunction:: configure_logging
 
-    ``configure_logging`` is automatically called when using Scrapy commands,
-    but needs to be called explicitly when running custom scripts. In that
-    case, its usage is not required but it's recommended.
+    ``configure_logging`` is automatically called when using Scrapy commands
+    or :class:`~scrapy.crawler.CrawlerProcess`, but needs to be called explicitly
+    when running custom scripts using :class:`~scrapy.crawler.CrawlerRunner`.
+    In that case, its usage is not required but it's recommended.
 
     If you plan on configuring the handlers yourself is still recommended you
     call this function, passing ``install_root_handler=False``. Bear in mind
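
Illustrative sketch (not part of the diff above) of the ``CrawlerRunner`` case described in the reworded paragraph; ``MySpider`` is a placeholder spider class::

    from twisted.internet import reactor
    from scrapy.crawler import CrawlerRunner
    from scrapy.utils.log import configure_logging

    configure_logging({'LOG_FORMAT': '%(levelname)s: %(message)s'})
    runner = CrawlerRunner()
    d = runner.crawl(MySpider)
    d.addBoth(lambda _: reactor.stop())
    reactor.run()  # the script blocks here until the crawl is finished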
@@ -897,6 +897,16 @@ Default: ``False``
 If ``True``, the logs will just contain the root path. If it is set to ``False``
 then it displays the component responsible for the log output
 
+.. setting:: LOGSTATS_INTERVAL
+
+LOGSTATS_INTERVAL
+-----------------
+
+Default: ``60.0``
+
+The interval (in seconds) between each logging printout of the stats
+by :class:`~extensions.logstats.LogStats`.
+
 .. setting:: MEMDEBUG_ENABLED
 
 MEMDEBUG_ENABLED
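
Illustrative sketch (not part of the diff above): halving the stats logging interval in a project's settings module::

    LOGSTATS_INTERVAL = 30.0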
@@ -1155,9 +1165,14 @@ Type of in-memory queue used by scheduler. Other available type is:
 
 SCHEDULER_PRIORITY_QUEUE
 ------------------------
-Default: ``'queuelib.PriorityQueue'``
+Default: ``'scrapy.pqueues.ScrapyPriorityQueue'``
 
-Type of priority queue used by scheduler.
+Type of priority queue used by the scheduler. Another available type is
+``scrapy.pqueues.DownloaderAwarePriorityQueue``.
+``scrapy.pqueues.DownloaderAwarePriorityQueue`` works better than
+``scrapy.pqueues.ScrapyPriorityQueue`` when you crawl many different
+domains in parallel. But currently ``scrapy.pqueues.DownloaderAwarePriorityQueue``
+does not work together with :setting:`CONCURRENT_REQUESTS_PER_IP`.
 
 .. setting:: SPIDER_CONTRACTS
 
@@ -82,7 +82,8 @@ object gives you access, for example, to the :ref:`settings <topics-settings>`.
 
         If it raises an exception, Scrapy won't bother calling any other spider
         middleware :meth:`process_spider_input` and will call the request
-        errback. The output of the errback is chained back in the other
+        errback if there is one, otherwise it will start the :meth:`process_spider_exception`
+        chain. The output of the errback is chained back in the other
         direction for :meth:`process_spider_output` to process it, or
         :meth:`process_spider_exception` if it raised an exception.
 
@@ -116,8 +117,8 @@ object gives you access, for example, to the :ref:`settings <topics-settings>`.
 
     .. method:: process_spider_exception(response, exception, spider)
 
-        This method is called when a spider or :meth:`process_spider_input`
-        method (from other spider middleware) raises an exception.
+        This method is called when a spider or :meth:`process_spider_output`
+        method (from a previous spider middleware) raises an exception.
 
         :meth:`process_spider_exception` should return either ``None`` or an
         iterable of :class:`~scrapy.http.Request`, dict or
@@ -129,7 +130,8 @@ object gives you access, for example, to the :ref:`settings <topics-settings>`.
         exception reaches the engine (where it's logged and discarded).
 
         If it returns an iterable the :meth:`process_spider_output` pipeline
-        kicks in, and no other :meth:`process_spider_exception` will be called.
+        kicks in, starting from the next spider middleware, and no other
+        :meth:`process_spider_exception` will be called.
 
         :param response: the response being processed when the exception was
            raised
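
Illustrative sketch (not part of the diff above) of a spider middleware that relies on the behaviour documented here, returning an iterable from ``process_spider_exception`` to recover from a broken callback instead of dropping the response; the class name is hypothetical::

    import logging

    class RecoverExceptionsMiddleware(object):
        """Hypothetical middleware: log callback errors and keep the crawl going."""

        def process_spider_exception(self, response, exception, spider):
            logging.getLogger(__name__).warning(
                'Callback failed for %s: %s', response.url, exception)
            # Returning an iterable (here, an empty one) hands control over to
            # the process_spider_output chain of the following middlewares.
            return []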
@@ -402,10 +402,12 @@ Crawling rules
 of links extracted from each response using the specified ``link_extractor``.
 This is mainly used for filtering purposes.
 
-``process_request`` is a callable, or a string (in which case a method from
-the spider object with that name will be used) which will be called with
-every request extracted by this rule, and must return a request or None (to
-filter out the request).
+``process_request`` is a callable (or a string, in which case a method from
+the spider object with that name will be used) which will be called for every
+:class:`~scrapy.http.Request` extracted by this rule. This callable should
+take said request as first argument and the :class:`~scrapy.http.Response`
+from which the request originated as second argument. It must return a
+``Request`` object or ``None`` (to filter out the request).
 
 CrawlSpider example
 ~~~~~~~~~~~~~~~~~~~
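
Illustrative sketch (not part of the diff above) of the two-argument ``process_request`` callable described in the updated paragraph; the domain, URL pattern and meta key are placeholders::

    from scrapy.linkextractors import LinkExtractor
    from scrapy.spiders import CrawlSpider, Rule

    def add_origin_meta(request, response):
        # Called for every Request extracted by the rule; returning None drops it.
        request.meta['origin_url'] = response.url
        return request

    class ExampleCrawlSpider(CrawlSpider):
        name = 'example_crawl'
        start_urls = ['http://www.example.com']
        rules = (
            Rule(LinkExtractor(allow=r'/items/'), callback='parse_item',
                 process_request=add_origin_meta),
        )

        def parse_item(self, response):
            yield {'url': response.url, 'origin': response.meta.get('origin_url')}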
@@ -655,7 +657,7 @@ SitemapSpider
 
     .. attribute:: sitemap_follow
 
-        A list of regexes of sitemap that should be followed. This is is only
+        A list of regexes of sitemap that should be followed. This is only
         for sites that use `Sitemap index files`_ that point to other sitemap
         files.
 
@@ -75,8 +75,7 @@ available in Scrapy which extend the basic Stats Collector. You can select
 which Stats Collector to use through the :setting:`STATS_CLASS` setting. The
 default Stats Collector used is the :class:`MemoryStatsCollector`.
 
-.. module:: scrapy.statscollectors
-   :synopsis: Stats Collectors
+.. currentmodule:: scrapy.statscollectors
 
 MemoryStatsCollector
 --------------------
@@ -1,12 +1,11 @@
+.. currentmodule:: scrapy.extensions.telnet
+
 .. _topics-telnetconsole:
 
 ==============
 Telnet Console
 ==============
 
-.. module:: scrapy.extensions.telnet
-   :synopsis: The Telnet Console
-
 Scrapy comes with a built-in telnet console for inspecting and controlling a
 Scrapy running process. The telnet console is just a regular python shell
 running inside the Scrapy process, so you can do literally anything from it.
@@ -45,7 +44,7 @@ the console you need to type::
     >>>
 
 By default Username is ``scrapy`` and Password is autogenerated. The
-autogenerated Password can be seen on scrapy logs like the example bellow::
+autogenerated Password can be seen on scrapy logs like the example below::
 
     2018-10-16 14:35:21 [scrapy.extensions.telnet] INFO: Telnet Password: 16f92501e8a59326
 
@@ -6,7 +6,7 @@ from unittest import TextTestRunner, TextTestResult as _TextTestResult
 
 from scrapy.commands import ScrapyCommand
 from scrapy.contracts import ContractsManager
-from scrapy.utils.misc import load_object
+from scrapy.utils.misc import load_object, set_environ
 from scrapy.utils.conf import build_component_list
 
 
@@ -68,6 +68,7 @@ class Command(ScrapyCommand):
 
         spider_loader = self.crawler_process.spider_loader
 
-        for spidername in args or spider_loader.list():
-            spidercls = spider_loader.load(spidername)
-            spidercls.start_requests = lambda s: conman.from_spider(s, result)
+        with set_environ(SCRAPY_CHECK='true'):
+            for spidername in args or spider_loader.list():
+                spidercls = spider_loader.load(spidername)
+                spidercls.start_requests = lambda s: conman.from_spider(s, result)
@@ -94,7 +94,7 @@ class ContractsManager(object):
         try:
             output = cb(response)
             output = list(iterate_spider_output(output))
-        except:
+        except Exception:
             case = _create_testcase(method, 'callback')
             results.addError(case, sys.exc_info())
 
@@ -75,6 +75,8 @@ def _get_concurrency_delay(concurrency, spider, settings):
 
 class Downloader(object):
 
+    DOWNLOAD_SLOT = 'download_slot'
+
     def __init__(self, crawler):
         self.settings = crawler.settings
         self.signals = crawler.signals
@@ -111,8 +113,8 @@ class Downloader(object):
             return key, self.slots[key]
 
     def _get_slot_key(self, request, spider):
-        if 'download_slot' in request.meta:
-            return request.meta['download_slot']
+        if self.DOWNLOAD_SLOT in request.meta:
+            return request.meta[self.DOWNLOAD_SLOT]
 
         key = urlparse_cached(request).hostname or ''
         if self.ip_concurrency:
@@ -122,7 +124,7 @@ class Downloader(object):
 
     def _enqueue_request(self, request, spider):
         key, slot = self._get_slot(request, spider)
-        request.meta['download_slot'] = key
+        request.meta[self.DOWNLOAD_SLOT] = key
 
         def _deactivate(response):
             slot.active.remove(request)
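
Illustrative sketch (not part of the diff above) of the ``download_slot`` request meta key that the new ``DOWNLOAD_SLOT`` constant refers to, pinning requests to a single concurrency slot regardless of their hostname; the URLs and slot name are placeholders::

    import scrapy

    class SingleSlotSpider(scrapy.Spider):
        name = 'single_slot'  # placeholder spider

        def start_requests(self):
            for url in ['http://example.com/a', 'http://example.org/b']:
                # Both requests share one slot, so they also share its
                # concurrency and delay settings.
                yield scrapy.Request(url, meta={'download_slot': 'my-slot'},
                                     callback=self.parse)

        def parse(self, response):
            yield {'url': response.url}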
@@ -7,6 +7,7 @@ import six
 
 from twisted.internet import defer
 
+from scrapy.exceptions import _InvalidOutput
 from scrapy.http import Request, Response
 from scrapy.middleware import MiddlewareManager
 from scrapy.utils.defer import mustbe_deferred
@@ -35,9 +36,9 @@ class DownloaderMiddlewareManager(MiddlewareManager):
         def process_request(request):
             for method in self.methods['process_request']:
                 response = yield method(request=request, spider=spider)
-                assert response is None or isinstance(response, (Response, Request)), \
-                        'Middleware %s.process_request must return None, Response or Request, got %s' % \
-                        (six.get_method_self(method).__class__.__name__, response.__class__.__name__)
+                if response is not None and not isinstance(response, (Response, Request)):
+                    raise _InvalidOutput('Middleware %s.process_request must return None, Response or Request, got %s' % \
+                        (six.get_method_self(method).__class__.__name__, response.__class__.__name__))
                 if response:
                     defer.returnValue(response)
             defer.returnValue((yield download_func(request=request, spider=spider)))
@@ -49,11 +50,10 @@ class DownloaderMiddlewareManager(MiddlewareManager):
                 defer.returnValue(response)
 
             for method in self.methods['process_response']:
-                response = yield method(request=request, response=response,
-                                        spider=spider)
-                assert isinstance(response, (Response, Request)), \
-                    'Middleware %s.process_response must return Response or Request, got %s' % \
-                    (six.get_method_self(method).__class__.__name__, type(response))
+                response = yield method(request=request, response=response, spider=spider)
+                if not isinstance(response, (Response, Request)):
+                    raise _InvalidOutput('Middleware %s.process_response must return Response or Request, got %s' % \
+                        (six.get_method_self(method).__class__.__name__, type(response)))
                 if isinstance(response, Request):
                     defer.returnValue(response)
             defer.returnValue(response)
@@ -62,11 +62,10 @@ class DownloaderMiddlewareManager(MiddlewareManager):
         def process_exception(_failure):
             exception = _failure.value
             for method in self.methods['process_exception']:
-                response = yield method(request=request, exception=exception,
-                                        spider=spider)
-                assert response is None or isinstance(response, (Response, Request)), \
-                    'Middleware %s.process_exception must return None, Response or Request, got %s' % \
-                    (six.get_method_self(method).__class__.__name__, type(response))
+                response = yield method(request=request, exception=exception, spider=spider)
+                if response is not None and not isinstance(response, (Response, Request)):
+                    raise _InvalidOutput('Middleware %s.process_exception must return None, Response or Request, got %s' % \
+                        (six.get_method_self(method).__class__.__name__, type(response)))
                 if response:
                     defer.returnValue(response)
             defer.returnValue(_failure)
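
Illustrative sketch (not part of the diff above) of a downloader middleware whose return values satisfy the checks that now raise ``_InvalidOutput`` instead of using ``assert``; the class name and meta key are hypothetical::

    from scrapy.http import HtmlResponse

    class CannedResponseMiddleware(object):
        """Hypothetical middleware: short-circuit marked requests with a stub response."""

        def process_request(self, request, spider):
            if request.meta.get('use_canned_response'):
                # Returning a Response skips the download; returning None continues it.
                return HtmlResponse(url=request.url, body=b'<html></html>',
                                    encoding='utf-8')
            return None

        def process_response(self, request, response, spider):
            # Must return a Response or a Request object.
            return response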
@@ -1,19 +1,46 @@
 import os
 import json
 import logging
+import warnings
 from os.path import join, exists
 
-from scrapy.utils.reqser import request_to_dict, request_from_dict
+from queuelib import PriorityQueue
+
 from scrapy.utils.misc import load_object, create_instance
 from scrapy.utils.job import job_dir
+from scrapy.utils.deprecate import ScrapyDeprecationWarning
+
 
 logger = logging.getLogger(__name__)
 
 
 class Scheduler(object):
+    """
+    Scrapy Scheduler. It allows to enqueue requests and then get
+    a next request to download. Scheduler is also handling duplication
+    filtering, via dupefilter.
+
+    Prioritization and queueing is not performed by the Scheduler.
+    User sets ``priority`` field for each Request, and a PriorityQueue
+    (defined by :setting:`SCHEDULER_PRIORITY_QUEUE`) uses these priorities
+    to dequeue requests in a desired order.
+
+    Scheduler uses two PriorityQueue instances, configured to work in-memory
+    and on-disk (optional). When on-disk queue is present, it is used by
+    default, and an in-memory queue is used as a fallback for cases where
+    a disk queue can't handle a request (can't serialize it).
+
+    :setting:`SCHEDULER_MEMORY_QUEUE` and
+    :setting:`SCHEDULER_DISK_QUEUE` allow to specify lower-level queue classes
+    which PriorityQueue instances would be instantiated with, to keep requests
+    on disk and in memory respectively.
+
+    Overall, Scheduler is an object which holds several PriorityQueue instances
+    (in-memory and on-disk) and implements fallback logic for them.
+    Also, it handles dupefilters.
+    """
     def __init__(self, dupefilter, jobdir=None, dqclass=None, mqclass=None,
-                 logunser=False, stats=None, pqclass=None):
+                 logunser=False, stats=None, pqclass=None, crawler=None):
         self.df = dupefilter
         self.dqdir = self._dqdir(jobdir)
         self.pqclass = pqclass
@@ -21,6 +48,7 @@ class Scheduler(object):
         self.mqclass = mqclass
         self.logunser = logunser
         self.stats = stats
+        self.crawler = crawler
 
     @classmethod
     def from_crawler(cls, crawler):
@@ -28,26 +56,35 @@ class Scheduler(object):
         dupefilter_cls = load_object(settings['DUPEFILTER_CLASS'])
         dupefilter = create_instance(dupefilter_cls, settings, crawler)
         pqclass = load_object(settings['SCHEDULER_PRIORITY_QUEUE'])
+        if pqclass is PriorityQueue:
+            warnings.warn("SCHEDULER_PRIORITY_QUEUE='queuelib.PriorityQueue'"
+                          " is no longer supported because of API changes; "
+                          "please use 'scrapy.pqueues.ScrapyPriorityQueue'",
+                          ScrapyDeprecationWarning)
+            from scrapy.pqueues import ScrapyPriorityQueue
+            pqclass = ScrapyPriorityQueue
+
         dqclass = load_object(settings['SCHEDULER_DISK_QUEUE'])
         mqclass = load_object(settings['SCHEDULER_MEMORY_QUEUE'])
-        logunser = settings.getbool('LOG_UNSERIALIZABLE_REQUESTS', settings.getbool('SCHEDULER_DEBUG'))
+        logunser = settings.getbool('LOG_UNSERIALIZABLE_REQUESTS',
+                                    settings.getbool('SCHEDULER_DEBUG'))
         return cls(dupefilter, jobdir=job_dir(settings), logunser=logunser,
-                   stats=crawler.stats, pqclass=pqclass, dqclass=dqclass, mqclass=mqclass)
+                   stats=crawler.stats, pqclass=pqclass, dqclass=dqclass,
+                   mqclass=mqclass, crawler=crawler)
 
     def has_pending_requests(self):
         return len(self) > 0
 
     def open(self, spider):
         self.spider = spider
-        self.mqs = self.pqclass(self._newmq)
+        self.mqs = self._mq()
         self.dqs = self._dq() if self.dqdir else None
         return self.df.open()
 
     def close(self, reason):
         if self.dqs:
-            prios = self.dqs.close()
-            with open(join(self.dqdir, 'active.json'), 'w') as f:
-                json.dump(prios, f)
+            state = self.dqs.close()
+            self._write_dqs_state(self.dqdir, state)
         return self.df.close(reason)
 
     def enqueue_request(self, request):
@@ -82,8 +119,7 @@ class Scheduler(object):
         if self.dqs is None:
             return
         try:
-            reqd = request_to_dict(request, self.spider)
-            self.dqs.push(reqd, -request.priority)
+            self.dqs.push(request, -request.priority)
        except ValueError as e:  # non serializable request
             if self.logunser:
                 msg = ("Unable to serialize request: %(request)s - reason:"
@@ -103,32 +139,51 @@ class Scheduler(object):
 
     def _dqpop(self):
         if self.dqs:
-            d = self.dqs.pop()
-            if d:
-                return request_from_dict(d, self.spider)
+            return self.dqs.pop()
 
     def _newmq(self, priority):
+        """ Factory for creating memory queues. """
         return self.mqclass()
 
     def _newdq(self, priority):
-        return self.dqclass(join(self.dqdir, 'p%s' % priority))
+        """ Factory for creating disk queues. """
+        path = join(self.dqdir, 'p%s' % (priority, ))
+        return self.dqclass(path)
+
+    def _mq(self):
+        """ Create a new priority queue instance, with in-memory storage """
+        return create_instance(self.pqclass, None, self.crawler, self._newmq,
+                               serialize=False)
 
     def _dq(self):
-        activef = join(self.dqdir, 'active.json')
-        if exists(activef):
-            with open(activef) as f:
-                prios = json.load(f)
-        else:
-            prios = ()
-        q = self.pqclass(self._newdq, startprios=prios)
+        """ Create a new priority queue instance, with disk storage """
+        state = self._read_dqs_state(self.dqdir)
+        q = create_instance(self.pqclass,
+                            None,
+                            self.crawler,
+                            self._newdq,
+                            state,
+                            serialize=True)
         if q:
             logger.info("Resuming crawl (%(queuesize)d requests scheduled)",
                         {'queuesize': len(q)}, extra={'spider': self.spider})
         return q
 
     def _dqdir(self, jobdir):
+        """ Return a folder name to keep disk queue state at """
         if jobdir:
             dqdir = join(jobdir, 'requests.queue')
             if not exists(dqdir):
                 os.makedirs(dqdir)
             return dqdir
+
+    def _read_dqs_state(self, dqdir):
+        path = join(dqdir, 'active.json')
+        if not exists(path):
+            return ()
+        with open(path) as f:
+            return json.load(f)
+
+    def _write_dqs_state(self, dqdir, state):
+        with open(join(dqdir, 'active.json'), 'w') as f:
+            json.dump(state, f)
@@ -135,7 +135,6 @@ class Scraper(object):
             return self.spidermw.scrape_response(
                 self.call_spider, request_result, request, spider)
         else:
-            # FIXME: don't ignore errors in spider middleware
             dfd = self.call_spider(request_result, request, spider)
             return dfd.addErrback(
                 self._log_download_errors, request_result, request, spider)
@@ -3,15 +3,21 @@ Spider Middleware manager
 
 See documentation in docs/topics/spider-middleware.rst
 """
+from itertools import chain, islice
+
 import six
 from twisted.python.failure import Failure
 
+from scrapy.exceptions import _InvalidOutput
 from scrapy.middleware import MiddlewareManager
 from scrapy.utils.defer import mustbe_deferred
 from scrapy.utils.conf import build_component_list
+from scrapy.utils.python import MutableChain
 
 
 def _isiterable(possible_iterator):
     return hasattr(possible_iterator, '__iter__')
 
 
 class SpiderMiddlewareManager(MiddlewareManager):
 
     component_name = 'spider middleware'
@@ -24,12 +30,10 @@ class SpiderMiddlewareManager(MiddlewareManager):
         super(SpiderMiddlewareManager, self)._add_middleware(mw)
         if hasattr(mw, 'process_spider_input'):
             self.methods['process_spider_input'].append(mw.process_spider_input)
-        if hasattr(mw, 'process_spider_output'):
-            self.methods['process_spider_output'].appendleft(mw.process_spider_output)
-        if hasattr(mw, 'process_spider_exception'):
-            self.methods['process_spider_exception'].appendleft(mw.process_spider_exception)
         if hasattr(mw, 'process_start_requests'):
             self.methods['process_start_requests'].appendleft(mw.process_start_requests)
+        self.methods['process_spider_output'].appendleft(getattr(mw, 'process_spider_output', None))
+        self.methods['process_spider_exception'].appendleft(getattr(mw, 'process_spider_exception', None))
 
     def scrape_response(self, scrape_func, response, request, spider):
         fname = lambda f:'%s.%s' % (
@@ -40,36 +44,73 @@ class SpiderMiddlewareManager(MiddlewareManager):
             for method in self.methods['process_spider_input']:
                 try:
                     result = method(response=response, spider=spider)
-                    assert result is None, \
-                            'Middleware %s must returns None or ' \
-                            'raise an exception, got %s ' \
-                            % (fname(method), type(result))
-                except:
+                    if result is not None:
+                        raise _InvalidOutput('Middleware {} must return None or raise an exception, got {}' \
+                                             .format(fname(method), type(result)))
+                except _InvalidOutput:
+                    raise
+                except Exception:
                     return scrape_func(Failure(), request, spider)
             return scrape_func(response, request, spider)
 
-        def process_spider_exception(_failure):
+        def process_spider_exception(_failure, start_index=0):
             exception = _failure.value
-            for method in self.methods['process_spider_exception']:
+            # don't handle _InvalidOutput exception
+            if isinstance(exception, _InvalidOutput):
+                return _failure
+            method_list = islice(self.methods['process_spider_exception'], start_index, None)
+            for method_index, method in enumerate(method_list, start=start_index):
+                if method is None:
+                    continue
                 result = method(response=response, exception=exception, spider=spider)
-                assert result is None or _isiterable(result), \
-                    'Middleware %s must returns None, or an iterable object, got %s ' % \
-                    (fname(method), type(result))
-                if result is not None:
-                    return result
+                if _isiterable(result):
+                    # stop exception handling by handing control over to the
+                    # process_spider_output chain if an iterable has been returned
+                    return process_spider_output(result, method_index+1)
+                elif result is None:
+                    continue
+                else:
+                    raise _InvalidOutput('Middleware {} must return None or an iterable, got {}' \
+                                         .format(fname(method), type(result)))
             return _failure
 
-        def process_spider_output(result):
-            for method in self.methods['process_spider_output']:
+        def process_spider_output(result, start_index=0):
+            # items in this iterable do not need to go through the process_spider_output
+            # chain, they went through it already from the process_spider_exception method
+            recovered = MutableChain()
+
+            def evaluate_iterable(iterable, index):
+                try:
+                    for r in iterable:
+                        yield r
+                except Exception as ex:
+                    exception_result = process_spider_exception(Failure(ex), index+1)
+                    if isinstance(exception_result, Failure):
+                        raise
+                    recovered.extend(exception_result)
+
+            method_list = islice(self.methods['process_spider_output'], start_index, None)
+            for method_index, method in enumerate(method_list, start=start_index):
+                if method is None:
+                    continue
+                # the following might fail directly if the output value is not a generator
+                try:
                     result = method(response=response, result=result, spider=spider)
-                assert _isiterable(result), \
-                    'Middleware %s must returns an iterable object, got %s ' % \
+                except Exception as ex:
+                    exception_result = process_spider_exception(Failure(ex), method_index+1)
|
||||||
(fname(method), type(result))
|
if isinstance(exception_result, Failure):
|
||||||
return result
|
raise
|
||||||
|
return exception_result
|
||||||
|
if _isiterable(result):
|
||||||
|
result = evaluate_iterable(result, method_index)
|
||||||
|
else:
|
||||||
|
raise _InvalidOutput('Middleware {} must return an iterable, got {}' \
|
||||||
|
.format(fname(method), type(result)))
|
||||||
|
|
||||||
|
return chain(result, recovered)
|
||||||
|
|
||||||
dfd = mustbe_deferred(process_spider_input, response)
|
dfd = mustbe_deferred(process_spider_input, response)
|
||||||
dfd.addErrback(process_spider_exception)
|
dfd.addCallbacks(callback=process_spider_output, errback=process_spider_exception)
|
||||||
dfd.addCallback(process_spider_output)
|
|
||||||
return dfd
|
return dfd
|
||||||
|
|
||||||
def process_start_requests(self, start_requests, spider):
|
def process_start_requests(self, start_requests, spider):
|
||||||
|
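
The rewritten scrape_response above lets a spider middleware recover from an exception raised while the spider output is being consumed: if process_spider_exception returns an iterable, that iterable is handed to the remaining process_spider_output chain instead of propagating the failure. A minimal sketch of such a middleware (class and setting values below are illustrative, not part of this commit):

    # Hypothetical spider middleware relying on the behaviour above.
    class RecoverFromCallbackErrors(object):

        def process_spider_exception(self, response, exception, spider):
            # Returning an iterable (rather than None) stops exception handling;
            # these objects continue through the remaining process_spider_output
            # methods and reach the engine as normal results.
            yield {'url': response.url, 'error': repr(exception)}

    # Enabled like any other spider middleware, e.g. in the project settings:
    # SPIDER_MIDDLEWARES = {'myproject.middlewares.RecoverFromCallbackErrors': 543}
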
@@ -111,6 +111,8 @@ class Crawler(object):
 
     @defer.inlineCallbacks
     def stop(self):
+        """Starts a graceful stop of the crawler and returns a deferred that is
+        fired when the crawler is stopped."""
         if self.crawling:
             self.crawling = False
             yield defer.maybeDeferred(self.engine.stop)
@@ -88,6 +88,7 @@ class MetaRefreshMiddleware(BaseRedirectMiddleware):
 
     def __init__(self, settings):
         super(MetaRefreshMiddleware, self).__init__(settings)
+        self._ignore_tags = settings.getlist('METAREFRESH_IGNORE_TAGS')
         self._maxdelay = settings.getint('REDIRECT_MAX_METAREFRESH_DELAY',
                                          settings.getint('METAREFRESH_MAXDELAY'))
 
@@ -96,7 +97,8 @@ class MetaRefreshMiddleware(BaseRedirectMiddleware):
                 not isinstance(response, HtmlResponse):
             return response
 
-        interval, url = get_meta_refresh(response)
+        interval, url = get_meta_refresh(response,
+                                         ignore_tags=self._ignore_tags)
         if url and interval < self._maxdelay:
             redirected = self._redirect_request_using_get(request, url)
             return self._redirect(redirected, request, spider, 'meta refresh')
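
The METAREFRESH_IGNORE_TAGS setting read above defaults to ['script', 'noscript']; setting it to an empty list makes the middleware honour meta refresh tags found inside those elements as well. An illustrative settings fragment (example values only):

    # settings.py fragment (example values)
    METAREFRESH_ENABLED = True
    METAREFRESH_IGNORE_TAGS = []  # default: ['script', 'noscript']
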
@@ -11,6 +11,13 @@ class NotConfigured(Exception):
     """Indicates a missing configuration situation"""
     pass
 
+
+class _InvalidOutput(TypeError):
+    """
+    Indicates an invalid value has been returned by a middleware's processing method.
+    Internal and undocumented, it should not be raised or caught by user code.
+    """
+    pass
+
 # HTTP and crawling
 
 class IgnoreRequest(Exception):
@@ -24,7 +24,11 @@ class CoreStats(object):
         self.stats.set_value('start_time', datetime.datetime.utcnow(), spider=spider)
 
     def spider_closed(self, spider, reason):
-        self.stats.set_value('finish_time', datetime.datetime.utcnow(), spider=spider)
+        finish_time = datetime.datetime.utcnow()
+        elapsed_time = finish_time - self.stats.get_value('start_time')
+        elapsed_time_seconds = elapsed_time.total_seconds()
+        self.stats.set_value('elapsed_time_seconds', elapsed_time_seconds, spider=spider)
+        self.stats.set_value('finish_time', finish_time, spider=spider)
         self.stats.set_value('finish_reason', reason, spider=spider)
 
     def item_scraped(self, item, spider):
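
The new elapsed_time_seconds value is a regular crawl stat, so it can be read from crawler.stats once the crawl finishes. A small sketch, assuming a spider class named ExampleSpider defined elsewhere:

    from scrapy.crawler import CrawlerProcess

    process = CrawlerProcess(settings={'LOG_LEVEL': 'INFO'})
    crawler = process.create_crawler(ExampleSpider)  # ExampleSpider is assumed
    process.crawl(crawler)
    process.start()
    print(crawler.stats.get_value('finish_reason'),
          crawler.stats.get_value('elapsed_time_seconds'))
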
@@ -31,7 +31,7 @@ class DummyPolicy(object):
     def should_cache_response(self, response, request):
         return response.status not in self.ignore_http_codes
 
-    def is_cached_response_fresh(self, response, request):
+    def is_cached_response_fresh(self, cachedresponse, request):
         return True
 
     def is_cached_response_valid(self, cachedresponse, response, request):
@@ -70,7 +70,7 @@ class RFC2616Policy(object):
         return True
 
     def should_cache_response(self, response, request):
-        # What is cacheable - https://www.w3.org/Protocols/rfc2616/rfc2616-sec13.html#sec14.9.1
+        # What is cacheable - https://www.w3.org/Protocols/rfc2616/rfc2616-sec14.html#sec14.9.1
         # Response cacheability - https://www.w3.org/Protocols/rfc2616/rfc2616-sec13.html#sec13.4
         # Status code 206 is not included because cache can not deal with partial contents
         cc = self._parse_cachecontrol(response)
@@ -35,6 +35,10 @@ class ItemLoader(object):
         self.parent = parent
         self._local_item = context['item'] = item
         self._local_values = defaultdict(list)
+        # Preprocess values if item built from dict
+        # Values need to be added to item._values if added them from dict (not with add_values)
+        for field_name, value in item.items():
+            self._values[field_name] = self._process_input_value(field_name, value)
 
     @property
     def _values(self):
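
With the loop added above, values already present on an item (or dict) passed to the loader go through the field's input processors and become part of the loader's internal values, so they survive later add_value and get_output_value calls. A short sketch, assuming these hypothetical item and processor choices:

    import scrapy
    from scrapy.loader import ItemLoader
    from scrapy.loader.processors import MapCompose, TakeFirst

    class Product(scrapy.Item):
        name = scrapy.Field(input_processor=MapCompose(str.strip),
                            output_processor=TakeFirst())

    # The seeded value is preprocessed exactly like one passed to add_value().
    loader = ItemLoader(item=Product({'name': '  Widget  '}))
    loader.add_value('name', 'fallback')
    print(loader.load_item())  # {'name': 'Widget'}
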
@@ -3,7 +3,7 @@ from __future__ import print_function
 import functools
 import logging
 from collections import defaultdict
-from twisted.internet.defer import Deferred, DeferredList
+from twisted.internet.defer import Deferred, DeferredList, _DefGen_Return
 from twisted.python.failure import Failure
 
 from scrapy.settings import Settings
@@ -139,6 +139,30 @@ class MediaPipeline(object):
             result.cleanFailure()
             result.frames = []
             result.stack = None
+
+            # This code fixes a memory leak by avoiding to keep references to
+            # the Request and Response objects on the Media Pipeline cache.
+            #
+            # Twisted inline callbacks pass return values using the function
+            # twisted.internet.defer.returnValue, which encapsulates the return
+            # value inside a _DefGen_Return base exception.
+            #
+            # What happens when the media_downloaded callback raises another
+            # exception, for example a FileException('download-error') when
+            # the Response status code is not 200 OK, is that it stores the
+            # _DefGen_Return exception on the FileException context.
+            #
+            # To avoid keeping references to the Response and therefore Request
+            # objects on the Media Pipeline cache, we should wipe the context of
+            # the exception encapsulated by the Twisted Failure when its a
+            # _DefGen_Return instance.
+            #
+            # This problem does not occur in Python 2.7 since we don't have
+            # Exception Chaining (https://www.python.org/dev/peps/pep-3134/).
+            context = getattr(result.value, '__context__', None)
+            if isinstance(context, _DefGen_Return):
+                setattr(result.value, '__context__', None)
+
         info.downloading.remove(fp)
         info.downloaded[fp] = result  # cache result
         for wad in info.waiting.pop(fp):
|
193
scrapy/pqueues.py
Normal file
193
scrapy/pqueues.py
Normal file
@ -0,0 +1,193 @@
|
|||||||
|
import hashlib
|
||||||
|
import logging
|
||||||
|
from collections import namedtuple
|
||||||
|
|
||||||
|
from queuelib import PriorityQueue
|
||||||
|
|
||||||
|
from scrapy.utils.reqser import request_to_dict, request_from_dict
|
||||||
|
|
||||||
|
|
||||||
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
|
||||||
|
def _path_safe(text):
|
||||||
|
"""
|
||||||
|
Return a filesystem-safe version of a string ``text``
|
||||||
|
|
||||||
|
>>> _path_safe('simple.org').startswith('simple.org')
|
||||||
|
True
|
||||||
|
>>> _path_safe('dash-underscore_.org').startswith('dash-underscore_.org')
|
||||||
|
True
|
||||||
|
>>> _path_safe('some@symbol?').startswith('some_symbol_')
|
||||||
|
True
|
||||||
|
"""
|
||||||
|
pathable_slot = "".join([c if c.isalnum() or c in '-._' else '_'
|
||||||
|
for c in text])
|
||||||
|
# as we replace some letters we can get collision for different slots
|
||||||
|
# add we add unique part
|
||||||
|
unique_slot = hashlib.md5(text.encode('utf8')).hexdigest()
|
||||||
|
return '-'.join([pathable_slot, unique_slot])
|
||||||
|
|
||||||
|
|
||||||
|
class _Priority(namedtuple("_Priority", ["priority", "slot"])):
|
||||||
|
""" Slot-specific priority. It is a hack - ``(priority, slot)`` tuple
|
||||||
|
which can be used instead of int priorities in queues:
|
||||||
|
|
||||||
|
* they are ordered in the same way - order is still by priority value,
|
||||||
|
min(prios) works;
|
||||||
|
* str(p) representation is guaranteed to be different when slots
|
||||||
|
are different - this is important because str(p) is used to create
|
||||||
|
queue files on disk;
|
||||||
|
* they have readable str(p) representation which is safe
|
||||||
|
to use as a file name.
|
||||||
|
"""
|
||||||
|
__slots__ = ()
|
||||||
|
|
||||||
|
def __str__(self):
|
||||||
|
return '%s_%s' % (self.priority, _path_safe(str(self.slot)))
|
||||||
|
|
||||||
|
|
||||||
|
class _SlotPriorityQueues(object):
|
||||||
|
""" Container for multiple priority queues. """
|
||||||
|
def __init__(self, pqfactory, slot_startprios=None):
|
||||||
|
"""
|
||||||
|
``pqfactory`` is a factory for creating new PriorityQueues.
|
||||||
|
It must be a function which accepts a single optional ``startprios``
|
||||||
|
argument, with a list of priorities to create queues for.
|
||||||
|
|
||||||
|
``slot_startprios`` is a ``{slot: startprios}`` dict.
|
||||||
|
"""
|
||||||
|
self.pqfactory = pqfactory
|
||||||
|
self.pqueues = {} # slot -> priority queue
|
||||||
|
for slot, startprios in (slot_startprios or {}).items():
|
||||||
|
self.pqueues[slot] = self.pqfactory(startprios)
|
||||||
|
|
||||||
|
def pop_slot(self, slot):
|
||||||
|
""" Pop an object from a priority queue for this slot """
|
||||||
|
queue = self.pqueues[slot]
|
||||||
|
request = queue.pop()
|
||||||
|
if len(queue) == 0:
|
||||||
|
del self.pqueues[slot]
|
||||||
|
return request
|
||||||
|
|
||||||
|
def push_slot(self, slot, obj, priority):
|
||||||
|
""" Push an object to a priority queue for this slot """
|
||||||
|
if slot not in self.pqueues:
|
||||||
|
self.pqueues[slot] = self.pqfactory()
|
||||||
|
queue = self.pqueues[slot]
|
||||||
|
queue.push(obj, priority)
|
||||||
|
|
||||||
|
def close(self):
|
||||||
|
active = {slot: queue.close()
|
||||||
|
for slot, queue in self.pqueues.items()}
|
||||||
|
self.pqueues.clear()
|
||||||
|
return active
|
||||||
|
|
||||||
|
def __len__(self):
|
||||||
|
return sum(len(x) for x in self.pqueues.values()) if self.pqueues else 0
|
||||||
|
|
||||||
|
def __contains__(self, slot):
|
||||||
|
return slot in self.pqueues
|
||||||
|
|
||||||
|
|
||||||
|
class ScrapyPriorityQueue(PriorityQueue):
|
||||||
|
"""
|
||||||
|
PriorityQueue which works with scrapy.Request instances and
|
||||||
|
can optionally convert them to/from dicts before/after putting to a queue.
|
||||||
|
"""
|
||||||
|
def __init__(self, crawler, qfactory, startprios=(), serialize=False):
|
||||||
|
super(ScrapyPriorityQueue, self).__init__(qfactory, startprios)
|
||||||
|
self.serialize = serialize
|
||||||
|
self.spider = crawler.spider
|
||||||
|
|
||||||
|
@classmethod
|
||||||
|
def from_crawler(cls, crawler, qfactory, startprios=(), serialize=False):
|
||||||
|
return cls(crawler, qfactory, startprios, serialize)
|
||||||
|
|
||||||
|
def push(self, request, priority=0):
|
||||||
|
if self.serialize:
|
||||||
|
request = request_to_dict(request, self.spider)
|
||||||
|
super(ScrapyPriorityQueue, self).push(request, priority)
|
||||||
|
|
||||||
|
def pop(self):
|
||||||
|
request = super(ScrapyPriorityQueue, self).pop()
|
||||||
|
if request and self.serialize:
|
||||||
|
request = request_from_dict(request, self.spider)
|
||||||
|
return request
|
||||||
|
|
||||||
|
|
||||||
|
class DownloaderInterface(object):
|
||||||
|
|
||||||
|
def __init__(self, crawler):
|
||||||
|
self.downloader = crawler.engine.downloader
|
||||||
|
|
||||||
|
def stats(self, possible_slots):
|
||||||
|
return [(self._active_downloads(slot), slot)
|
||||||
|
for slot in possible_slots]
|
||||||
|
|
||||||
|
def get_slot_key(self, request):
|
||||||
|
return self.downloader._get_slot_key(request, None)
|
||||||
|
|
||||||
|
def _active_downloads(self, slot):
|
||||||
|
""" Return a number of requests in a Downloader for a given slot """
|
||||||
|
if slot not in self.downloader.slots:
|
||||||
|
return 0
|
||||||
|
return len(self.downloader.slots[slot].active)
|
||||||
|
|
||||||
|
|
||||||
|
class DownloaderAwarePriorityQueue(object):
|
||||||
|
""" PriorityQueue which takes Downlaoder activity in account:
|
||||||
|
domains (slots) with the least amount of active downloads are dequeued
|
||||||
|
first.
|
||||||
|
"""
|
||||||
|
|
||||||
|
@classmethod
|
||||||
|
def from_crawler(cls, crawler, qfactory, slot_startprios=None, serialize=False):
|
||||||
|
return cls(crawler, qfactory, slot_startprios, serialize)
|
||||||
|
|
||||||
|
def __init__(self, crawler, qfactory, slot_startprios=None, serialize=False):
|
||||||
|
if crawler.settings.getint('CONCURRENT_REQUESTS_PER_IP') != 0:
|
||||||
|
raise ValueError('"%s" does not support CONCURRENT_REQUESTS_PER_IP'
|
||||||
|
% (self.__class__,))
|
||||||
|
|
||||||
|
if slot_startprios and not isinstance(slot_startprios, dict):
|
||||||
|
raise ValueError("DownloaderAwarePriorityQueue accepts "
|
||||||
|
"``slot_startprios`` as a dict; %r instance "
|
||||||
|
"is passed. Most likely, it means the state is"
|
||||||
|
"created by an incompatible priority queue. "
|
||||||
|
"Only a crawl started with the same priority "
|
||||||
|
"queue class can be resumed." %
|
||||||
|
slot_startprios.__class__)
|
||||||
|
|
||||||
|
slot_startprios = {
|
||||||
|
slot: [_Priority(p, slot) for p in startprios]
|
||||||
|
for slot, startprios in (slot_startprios or {}).items()}
|
||||||
|
|
||||||
|
def pqfactory(startprios=()):
|
||||||
|
return ScrapyPriorityQueue(crawler, qfactory, startprios, serialize)
|
||||||
|
self._slot_pqueues = _SlotPriorityQueues(pqfactory, slot_startprios)
|
||||||
|
self.serialize = serialize
|
||||||
|
self._downloader_interface = DownloaderInterface(crawler)
|
||||||
|
|
||||||
|
def pop(self):
|
||||||
|
stats = self._downloader_interface.stats(self._slot_pqueues.pqueues)
|
||||||
|
|
||||||
|
if not stats:
|
||||||
|
return
|
||||||
|
|
||||||
|
slot = min(stats)[1]
|
||||||
|
request = self._slot_pqueues.pop_slot(slot)
|
||||||
|
return request
|
||||||
|
|
||||||
|
def push(self, request, priority):
|
||||||
|
slot = self._downloader_interface.get_slot_key(request)
|
||||||
|
priority_slot = _Priority(priority=priority, slot=slot)
|
||||||
|
self._slot_pqueues.push_slot(slot, request, priority_slot)
|
||||||
|
|
||||||
|
def close(self):
|
||||||
|
active = self._slot_pqueues.close()
|
||||||
|
return {slot: [p.priority for p in startprios]
|
||||||
|
for slot, startprios in active.items()}
|
||||||
|
|
||||||
|
def __len__(self):
|
||||||
|
return len(self._slot_pqueues)
|
@@ -221,6 +221,7 @@ MEMUSAGE_NOTIFY_MAIL = []
 MEMUSAGE_WARNING_MB = 0
 
 METAREFRESH_ENABLED = True
+METAREFRESH_IGNORE_TAGS = ['script', 'noscript']
 METAREFRESH_MAXDELAY = 100
 
 NEWSPIDER_MODULE = ''
@@ -238,7 +239,7 @@ REFERRER_POLICY = 'scrapy.spidermiddlewares.referer.DefaultReferrerPolicy'
 
 RETRY_ENABLED = True
 RETRY_TIMES = 2  # initial response + 2 retries = 3 requests
-RETRY_HTTP_CODES = [500, 502, 503, 504, 522, 524, 408]
+RETRY_HTTP_CODES = [500, 502, 503, 504, 522, 524, 408, 429]
 RETRY_PRIORITY_ADJUST = -1
 
 ROBOTSTXT_OBEY = False
@@ -246,7 +247,7 @@ ROBOTSTXT_OBEY = False
 SCHEDULER = 'scrapy.core.scheduler.Scheduler'
 SCHEDULER_DISK_QUEUE = 'scrapy.squeues.PickleLifoDiskQueue'
 SCHEDULER_MEMORY_QUEUE = 'scrapy.squeues.LifoMemoryQueue'
-SCHEDULER_PRIORITY_QUEUE = 'queuelib.PriorityQueue'
+SCHEDULER_PRIORITY_QUEUE = 'scrapy.pqueues.ScrapyPriorityQueue'
 
 SPIDER_LOADER_CLASS = 'scrapy.spiderloader.SpiderLoader'
 SPIDER_LOADER_WARN_ONLY = False
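
The changed defaults above can be tuned per project; in particular, the new scrapy.pqueues queues introduced by this commit can be swapped via SCHEDULER_PRIORITY_QUEUE. An illustrative settings.py fragment (example values, not additional defaults):

    # Restore the previous retry list if 429 responses should not be retried.
    RETRY_HTTP_CODES = [500, 502, 503, 504, 522, 524, 408]

    # Follow <meta refresh> even inside <script>/<noscript> blocks.
    METAREFRESH_IGNORE_TAGS = []

    # For broad crawls over many domains, the downloader-aware queue from
    # scrapy/pqueues.py can be selected instead of the new default.
    SCHEDULER_PRIORITY_QUEUE = 'scrapy.pqueues.DownloaderAwarePriorityQueue'
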
@@ -6,29 +6,55 @@ See documentation in docs/topics/spiders.rst
 """
 
 import copy
+import warnings
 
 import six
 
+from scrapy.exceptions import ScrapyDeprecationWarning
 from scrapy.http import Request, HtmlResponse
 from scrapy.utils.spider import iterate_spider_output
+from scrapy.utils.python import get_func_args
 from scrapy.spiders import Spider
 
 
-def identity(x):
-    return x
+def _identity(request, response):
+    return request
+
+
+def _get_method(method, spider):
+    if callable(method):
+        return method
+    elif isinstance(method, six.string_types):
+        return getattr(spider, method, None)
 
 
 class Rule(object):
 
-    def __init__(self, link_extractor, callback=None, cb_kwargs=None, follow=None, process_links=None, process_request=identity):
+    def __init__(self, link_extractor, callback=None, cb_kwargs=None, follow=None, process_links=None, process_request=None):
         self.link_extractor = link_extractor
         self.callback = callback
         self.cb_kwargs = cb_kwargs or {}
         self.process_links = process_links
-        self.process_request = process_request
-        if follow is None:
-            self.follow = False if callback else True
-        else:
-            self.follow = follow
+        self.process_request = process_request or _identity
+        self.process_request_argcount = None
+        self.follow = follow if follow is not None else not callback
+
+    def _compile(self, spider):
+        self.callback = _get_method(self.callback, spider)
+        self.process_links = _get_method(self.process_links, spider)
+        self.process_request = _get_method(self.process_request, spider)
+        self.process_request_argcount = len(get_func_args(self.process_request))
+        if self.process_request_argcount == 1:
+            msg = 'Rule.process_request should accept two arguments (request, response), accepting only one is deprecated'
+            warnings.warn(msg, category=ScrapyDeprecationWarning, stacklevel=2)
+
+    def _process_request(self, request, response):
+        """
+        Wrapper around the request processing function to maintain backward
+        compatibility with functions that do not take a Response object
+        """
+        args = [request] if self.process_request_argcount == 1 else [request, response]
+        return self.process_request(*args)
 
 
 class CrawlSpider(Spider):
@@ -64,8 +90,8 @@ class CrawlSpider(Spider):
                 links = rule.process_links(links)
             for link in links:
                 seen.add(link)
-                r = self._build_request(n, link)
-                yield rule.process_request(r)
+                request = self._build_request(n, link)
+                yield rule._process_request(request, response)
 
     def _response_downloaded(self, response):
         rule = self._rules[response.meta['rule']]
@@ -83,17 +109,9 @@ class CrawlSpider(Spider):
             yield request_or_item
 
     def _compile_rules(self):
-        def get_method(method):
-            if callable(method):
-                return method
-            elif isinstance(method, six.string_types):
-                return getattr(self, method, None)
-
         self._rules = [copy.copy(r) for r in self.rules]
         for rule in self._rules:
-            rule.callback = get_method(rule.callback)
-            rule.process_links = get_method(rule.process_links)
-            rule.process_request = get_method(rule.process_request)
+            rule._compile(self)
 
     @classmethod
     def from_crawler(cls, crawler, *args, **kwargs):
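
Rule.process_request callables now receive the response the links were extracted from in addition to the request; single-argument callables still work but trigger the deprecation warning added above. A minimal sketch of the new two-argument form (spider name and URL patterns are placeholders):

    from scrapy.spiders import CrawlSpider, Rule
    from scrapy.linkextractors import LinkExtractor

    def tag_with_origin(request, response):
        # New-style process_request: the originating response is available.
        request.meta['found_on'] = response.url
        return request

    class ExampleSpider(CrawlSpider):
        name = 'example'
        start_urls = ['http://example.com/']
        rules = (
            Rule(LinkExtractor(allow=r'/items/'), callback='parse_item',
                 process_request=tag_with_origin, follow=True),
        )

        def parse_item(self, response):
            yield {'url': response.url, 'found_on': response.meta.get('found_on')}
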
@ -7,6 +7,7 @@ from six.moves import cPickle as pickle
|
|||||||
|
|
||||||
from queuelib import queue
|
from queuelib import queue
|
||||||
|
|
||||||
|
|
||||||
def _serializable_queue(queue_class, serialize, deserialize):
|
def _serializable_queue(queue_class, serialize, deserialize):
|
||||||
|
|
||||||
class SerializableQueue(queue_class):
|
class SerializableQueue(queue_class):
|
||||||
@ -22,6 +23,7 @@ def _serializable_queue(queue_class, serialize, deserialize):
|
|||||||
|
|
||||||
return SerializableQueue
|
return SerializableQueue
|
||||||
|
|
||||||
|
|
||||||
def _pickle_serialize(obj):
|
def _pickle_serialize(obj):
|
||||||
try:
|
try:
|
||||||
return pickle.dumps(obj, protocol=2)
|
return pickle.dumps(obj, protocol=2)
|
||||||
@ -31,13 +33,14 @@ def _pickle_serialize(obj):
|
|||||||
except (pickle.PicklingError, AttributeError, TypeError) as e:
|
except (pickle.PicklingError, AttributeError, TypeError) as e:
|
||||||
raise ValueError(str(e))
|
raise ValueError(str(e))
|
||||||
|
|
||||||
PickleFifoDiskQueue = _serializable_queue(queue.FifoDiskQueue, \
|
|
||||||
|
PickleFifoDiskQueue = _serializable_queue(queue.FifoDiskQueue,
|
||||||
_pickle_serialize, pickle.loads)
|
_pickle_serialize, pickle.loads)
|
||||||
PickleLifoDiskQueue = _serializable_queue(queue.LifoDiskQueue, \
|
PickleLifoDiskQueue = _serializable_queue(queue.LifoDiskQueue,
|
||||||
_pickle_serialize, pickle.loads)
|
_pickle_serialize, pickle.loads)
|
||||||
MarshalFifoDiskQueue = _serializable_queue(queue.FifoDiskQueue, \
|
MarshalFifoDiskQueue = _serializable_queue(queue.FifoDiskQueue,
|
||||||
marshal.dumps, marshal.loads)
|
marshal.dumps, marshal.loads)
|
||||||
MarshalLifoDiskQueue = _serializable_queue(queue.LifoDiskQueue, \
|
MarshalLifoDiskQueue = _serializable_queue(queue.LifoDiskQueue,
|
||||||
marshal.dumps, marshal.loads)
|
marshal.dumps, marshal.loads)
|
||||||
FifoMemoryQueue = queue.FifoMemoryQueue
|
FifoMemoryQueue = queue.FifoMemoryQueue
|
||||||
LifoMemoryQueue = queue.LifoMemoryQueue
|
LifoMemoryQueue = queue.LifoMemoryQueue
|
||||||
|
@ -39,7 +39,7 @@ class ${ProjectName}SpiderMiddleware(object):
|
|||||||
# Called when a spider or process_spider_input() method
|
# Called when a spider or process_spider_input() method
|
||||||
# (from other spider middleware) raises an exception.
|
# (from other spider middleware) raises an exception.
|
||||||
|
|
||||||
# Should return either None or an iterable of Response, dict
|
# Should return either None or an iterable of Request, dict
|
||||||
# or Item objects.
|
# or Item objects.
|
||||||
pass
|
pass
|
||||||
|
|
||||||
|
@ -48,7 +48,7 @@ def mustbe_deferred(f, *args, **kw):
|
|||||||
# exception in Scrapy - see #125
|
# exception in Scrapy - see #125
|
||||||
except IgnoreRequest as e:
|
except IgnoreRequest as e:
|
||||||
return defer_fail(failure.Failure(e))
|
return defer_fail(failure.Failure(e))
|
||||||
except:
|
except Exception:
|
||||||
return defer_fail(failure.Failure())
|
return defer_fail(failure.Failure())
|
||||||
else:
|
else:
|
||||||
return defer_result(result)
|
return defer_result(result)
|
||||||
@ -102,5 +102,5 @@ def iter_errback(iterable, errback, *a, **kw):
|
|||||||
yield next(it)
|
yield next(it)
|
||||||
except StopIteration:
|
except StopIteration:
|
||||||
break
|
break
|
||||||
except:
|
except Exception:
|
||||||
errback(failure.Failure(), *a, **kw)
|
errback(failure.Failure(), *a, **kw)
|
||||||
|
@ -9,6 +9,9 @@ from gzip import GzipFile
|
|||||||
import six
|
import six
|
||||||
import re
|
import re
|
||||||
|
|
||||||
|
from scrapy.utils.decorators import deprecated
|
||||||
|
|
||||||
|
|
||||||
# - Python>=3.5 GzipFile's read() has issues returning leftover
|
# - Python>=3.5 GzipFile's read() has issues returning leftover
|
||||||
# uncompressed data when input is corrupted
|
# uncompressed data when input is corrupted
|
||||||
# (regression or bug-fix compared to Python 3.4)
|
# (regression or bug-fix compared to Python 3.4)
|
||||||
@ -53,6 +56,7 @@ def gunzip(data):
|
|||||||
_is_gzipped = re.compile(br'^application/(x-)?gzip\b', re.I).search
|
_is_gzipped = re.compile(br'^application/(x-)?gzip\b', re.I).search
|
||||||
_is_octetstream = re.compile(br'^(application|binary)/octet-stream\b', re.I).search
|
_is_octetstream = re.compile(br'^(application|binary)/octet-stream\b', re.I).search
|
||||||
|
|
||||||
|
@deprecated
|
||||||
def is_gzipped(response):
|
def is_gzipped(response):
|
||||||
"""Return True if the response is gzipped, or False otherwise"""
|
"""Return True if the response is gzipped, or False otherwise"""
|
||||||
ctype = response.headers.get('Content-Type', b'')
|
ctype = response.headers.get('Content-Type', b'')
|
||||||
|
@ -1,6 +1,8 @@
|
|||||||
"""Helper functions which don't fit anywhere else"""
|
"""Helper functions which don't fit anywhere else"""
|
||||||
|
import os
|
||||||
import re
|
import re
|
||||||
import hashlib
|
import hashlib
|
||||||
|
from contextlib import contextmanager
|
||||||
from importlib import import_module
|
from importlib import import_module
|
||||||
from pkgutil import iter_modules
|
from pkgutil import iter_modules
|
||||||
|
|
||||||
@ -86,7 +88,7 @@ def extract_regex(regex, text, encoding='utf-8'):
|
|||||||
|
|
||||||
try:
|
try:
|
||||||
strings = [regex.search(text).group('extract')] # named group
|
strings = [regex.search(text).group('extract')] # named group
|
||||||
except:
|
except Exception:
|
||||||
strings = regex.findall(text) # full regex or numbered groups
|
strings = regex.findall(text) # full regex or numbered groups
|
||||||
strings = flatten(strings)
|
strings = flatten(strings)
|
||||||
|
|
||||||
@ -142,3 +144,21 @@ def create_instance(objcls, settings, crawler, *args, **kwargs):
|
|||||||
return objcls.from_settings(settings, *args, **kwargs)
|
return objcls.from_settings(settings, *args, **kwargs)
|
||||||
else:
|
else:
|
||||||
return objcls(*args, **kwargs)
|
return objcls(*args, **kwargs)
|
||||||
|
|
||||||
|
|
||||||
|
@contextmanager
|
||||||
|
def set_environ(**kwargs):
|
||||||
|
"""Temporarily set environment variables inside the context manager and
|
||||||
|
fully restore previous environment afterwards
|
||||||
|
"""
|
||||||
|
|
||||||
|
original_env = {k: os.environ.get(k) for k in kwargs}
|
||||||
|
os.environ.update(kwargs)
|
||||||
|
try:
|
||||||
|
yield
|
||||||
|
finally:
|
||||||
|
for k, v in original_env.items():
|
||||||
|
if v is None:
|
||||||
|
del os.environ[k]
|
||||||
|
else:
|
||||||
|
os.environ[k] = v
|
||||||
|
@ -9,6 +9,7 @@ import weakref
|
|||||||
import errno
|
import errno
|
||||||
import six
|
import six
|
||||||
from functools import partial, wraps
|
from functools import partial, wraps
|
||||||
|
from itertools import chain
|
||||||
import sys
|
import sys
|
||||||
|
|
||||||
from scrapy.utils.decorators import deprecated
|
from scrapy.utils.decorators import deprecated
|
||||||
@ -387,3 +388,22 @@ if hasattr(sys, "pypy_version_info"):
|
|||||||
else:
|
else:
|
||||||
def garbage_collect():
|
def garbage_collect():
|
||||||
gc.collect()
|
gc.collect()
|
||||||
|
|
||||||
|
|
||||||
|
class MutableChain(object):
|
||||||
|
"""
|
||||||
|
Thin wrapper around itertools.chain, allowing to add iterables "in-place"
|
||||||
|
"""
|
||||||
|
def __init__(self, *args):
|
||||||
|
self.data = chain(*args)
|
||||||
|
|
||||||
|
def extend(self, *iterables):
|
||||||
|
self.data = chain(self.data, *iterables)
|
||||||
|
|
||||||
|
def __iter__(self):
|
||||||
|
return self.data.__iter__()
|
||||||
|
|
||||||
|
def __next__(self):
|
||||||
|
return next(self.data)
|
||||||
|
|
||||||
|
next = __next__
|
||||||
|
@ -70,6 +70,20 @@ def request_from_dict(d, spider=None):
|
|||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
|
def _is_private_method(name):
|
||||||
|
return name.startswith('__') and not name.endswith('__')
|
||||||
|
|
||||||
|
|
||||||
|
def _mangle_private_name(obj, func, name):
|
||||||
|
qualname = getattr(func, '__qualname__', None)
|
||||||
|
if qualname is None:
|
||||||
|
classname = obj.__class__.__name__.lstrip('_')
|
||||||
|
return '_%s%s' % (classname, name)
|
||||||
|
else:
|
||||||
|
splits = qualname.split('.')
|
||||||
|
return '_%s%s' % (splits[-2], splits[-1])
|
||||||
|
|
||||||
|
|
||||||
def _find_method(obj, func):
|
def _find_method(obj, func):
|
||||||
if obj:
|
if obj:
|
||||||
try:
|
try:
|
||||||
@ -78,7 +92,10 @@ def _find_method(obj, func):
|
|||||||
pass
|
pass
|
||||||
else:
|
else:
|
||||||
if func_self is obj:
|
if func_self is obj:
|
||||||
return six.get_method_function(func).__name__
|
name = six.get_method_function(func).__name__
|
||||||
|
if _is_private_method(name):
|
||||||
|
return _mangle_private_name(obj, func, name)
|
||||||
|
return name
|
||||||
raise ValueError("Function %s is not a method of: %s" % (func, obj))
|
raise ValueError("Function %s is not a method of: %s" % (func, obj))
|
||||||
|
|
||||||
|
|
||||||
|
@ -31,12 +31,12 @@ def get_base_url(response):
|
|||||||
|
|
||||||
|
|
||||||
_metaref_cache = weakref.WeakKeyDictionary()
|
_metaref_cache = weakref.WeakKeyDictionary()
|
||||||
def get_meta_refresh(response):
|
def get_meta_refresh(response, ignore_tags=('script', 'noscript')):
|
||||||
"""Parse the http-equiv refrsh parameter from the given response"""
|
"""Parse the http-equiv refrsh parameter from the given response"""
|
||||||
if response not in _metaref_cache:
|
if response not in _metaref_cache:
|
||||||
text = response.text[0:4096]
|
text = response.text[0:4096]
|
||||||
_metaref_cache[response] = html.get_meta_refresh(text, response.url,
|
_metaref_cache[response] = html.get_meta_refresh(text, response.url,
|
||||||
response.encoding, ignore_tags=('script', 'noscript'))
|
response.encoding, ignore_tags=ignore_tags)
|
||||||
return _metaref_cache[response]
|
return _metaref_cache[response]
|
||||||
|
|
||||||
|
|
||||||
|
3
setup.py
3
setup.py
@ -65,7 +65,8 @@ setup(
|
|||||||
],
|
],
|
||||||
python_requires='>=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*',
|
python_requires='>=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*',
|
||||||
install_requires=[
|
install_requires=[
|
||||||
'Twisted>=13.1.0',
|
'Twisted>=13.1.0;python_version!="3.4"',
|
||||||
|
'Twisted>=13.1.0,<=19.2.0;python_version=="3.4"',
|
||||||
'w3lib>=1.17.0',
|
'w3lib>=1.17.0',
|
||||||
'queuelib',
|
'queuelib',
|
||||||
'lxml',
|
'lxml',
|
||||||
|
@ -177,7 +177,7 @@ class Root(Resource):
|
|||||||
try:
|
try:
|
||||||
from tests import tests_datadir
|
from tests import tests_datadir
|
||||||
self.putChild(b"files", File(os.path.join(tests_datadir, 'test_site/files/')))
|
self.putChild(b"files", File(os.path.join(tests_datadir, 'test_site/files/')))
|
||||||
except:
|
except Exception:
|
||||||
pass
|
pass
|
||||||
self.putChild(b"redirect-to", RedirectTo())
|
self.putChild(b"redirect-to", RedirectTo())
|
||||||
|
|
||||||
|
@ -2,9 +2,10 @@
|
|||||||
mock
|
mock
|
||||||
mitmproxy==0.10.1
|
mitmproxy==0.10.1
|
||||||
netlib==0.10.1
|
netlib==0.10.1
|
||||||
pytest==2.9.2
|
pytest
|
||||||
|
pytest-cov
|
||||||
pytest-twisted
|
pytest-twisted
|
||||||
pytest-cov==2.2.1
|
pytest-xdist
|
||||||
jmespath
|
jmespath
|
||||||
brotlipy
|
brotlipy
|
||||||
testfixtures
|
testfixtures
|
||||||
|
@ -1,6 +1,7 @@
|
|||||||
pytest==3.6.3
|
pytest
|
||||||
|
pytest-cov
|
||||||
pytest-twisted
|
pytest-twisted
|
||||||
pytest-cov==2.5.1
|
pytest-xdist
|
||||||
testfixtures
|
testfixtures
|
||||||
jmespath
|
jmespath
|
||||||
leveldb; sys_platform != "win32"
|
leveldb; sys_platform != "win32"
|
||||||
|
@ -53,9 +53,5 @@ class TestCloseSpider(TestCase):
|
|||||||
yield crawler.crawl(total=1000000, mockserver=self.mockserver)
|
yield crawler.crawl(total=1000000, mockserver=self.mockserver)
|
||||||
reason = crawler.spider.meta['close_reason']
|
reason = crawler.spider.meta['close_reason']
|
||||||
self.assertEqual(reason, 'closespider_timeout')
|
self.assertEqual(reason, 'closespider_timeout')
|
||||||
stats = crawler.stats
|
total_seconds = crawler.stats.get_value('elapsed_time_seconds')
|
||||||
start = stats.get_value('start_time')
|
|
||||||
stop = stats.get_value('finish_time')
|
|
||||||
diff = stop - start
|
|
||||||
total_seconds = diff.seconds + diff.microseconds
|
|
||||||
self.assertTrue(total_seconds >= close_on)
|
self.assertTrue(total_seconds >= close_on)
|
||||||
|
@ -1,5 +1,4 @@
|
|||||||
import logging
|
import logging
|
||||||
import tempfile
|
|
||||||
import warnings
|
import warnings
|
||||||
|
|
||||||
from twisted.internet import defer
|
from twisted.internet import defer
|
||||||
@ -38,7 +37,11 @@ class CrawlerTestCase(BaseCrawlerTest):
|
|||||||
self.assertIsInstance(spiders, sl_cls)
|
self.assertIsInstance(spiders, sl_cls)
|
||||||
|
|
||||||
self.crawler.spiders
|
self.crawler.spiders
|
||||||
self.assertEqual(len(w), 1, "Warn deprecated access only once")
|
is_one_warning = len(w) == 1
|
||||||
|
if not is_one_warning:
|
||||||
|
for warning in w:
|
||||||
|
print(warning)
|
||||||
|
self.assertTrue(is_one_warning, "Warn deprecated access only once")
|
||||||
|
|
||||||
def test_populate_spidercls_settings(self):
|
def test_populate_spidercls_settings(self):
|
||||||
spider_settings = {'TEST1': 'spider', 'TEST2': 'spider'}
|
spider_settings = {'TEST1': 'spider', 'TEST2': 'spider'}
|
||||||
@ -179,8 +182,12 @@ class CrawlerRunnerTestCase(BaseCrawlerTest):
|
|||||||
'SPIDER_MANAGER_CLASS': 'tests.test_crawler.CustomSpiderLoader'
|
'SPIDER_MANAGER_CLASS': 'tests.test_crawler.CustomSpiderLoader'
|
||||||
})
|
})
|
||||||
self.assertIsInstance(runner.spider_loader, CustomSpiderLoader)
|
self.assertIsInstance(runner.spider_loader, CustomSpiderLoader)
|
||||||
self.assertEqual(len(w), 1)
|
is_one_warning = len(w) == 1
|
||||||
|
if not is_one_warning:
|
||||||
|
for warning in w:
|
||||||
|
print(warning)
|
||||||
self.assertIn('Please use SPIDER_LOADER_CLASS', str(w[0].message))
|
self.assertIn('Please use SPIDER_LOADER_CLASS', str(w[0].message))
|
||||||
|
self.assertTrue(is_one_warning)
|
||||||
|
|
||||||
def test_crawl_rejects_spider_objects(self):
|
def test_crawl_rejects_spider_objects(self):
|
||||||
with raises(ValueError):
|
with raises(ValueError):
|
||||||
|
@ -3,6 +3,7 @@ from twisted.python.failure import Failure
|
|||||||
|
|
||||||
from scrapy.http import Request, Response
|
from scrapy.http import Request, Response
|
||||||
from scrapy.spiders import Spider
|
from scrapy.spiders import Spider
|
||||||
|
from scrapy.exceptions import _InvalidOutput
|
||||||
from scrapy.core.downloader.middleware import DownloaderMiddlewareManager
|
from scrapy.core.downloader.middleware import DownloaderMiddlewareManager
|
||||||
from scrapy.utils.test import get_crawler
|
from scrapy.utils.test import get_crawler
|
||||||
from scrapy.utils.python import to_bytes
|
from scrapy.utils.python import to_bytes
|
||||||
@ -115,3 +116,63 @@ class ResponseFromProcessRequestTest(ManagerTestCase):
|
|||||||
|
|
||||||
self.assertIs(results[0], resp)
|
self.assertIs(results[0], resp)
|
||||||
self.assertFalse(download_func.called)
|
self.assertFalse(download_func.called)
|
||||||
|
|
||||||
|
|
||||||
|
class ProcessRequestInvalidOutput(ManagerTestCase):
|
||||||
|
"""Invalid return value for process_request method should raise an exception"""
|
||||||
|
|
||||||
|
def test_invalid_process_request(self):
|
||||||
|
req = Request('http://example.com/index.html')
|
||||||
|
|
||||||
|
class InvalidProcessRequestMiddleware:
|
||||||
|
def process_request(self, request, spider):
|
||||||
|
return 1
|
||||||
|
|
||||||
|
self.mwman._add_middleware(InvalidProcessRequestMiddleware())
|
||||||
|
download_func = mock.MagicMock()
|
||||||
|
dfd = self.mwman.download(download_func, req, self.spider)
|
||||||
|
results = []
|
||||||
|
dfd.addBoth(results.append)
|
||||||
|
self.assertIsInstance(results[0], Failure)
|
||||||
|
self.assertIsInstance(results[0].value, _InvalidOutput)
|
||||||
|
|
||||||
|
|
||||||
|
class ProcessResponseInvalidOutput(ManagerTestCase):
|
||||||
|
"""Invalid return value for process_response method should raise an exception"""
|
||||||
|
|
||||||
|
def test_invalid_process_response(self):
|
||||||
|
req = Request('http://example.com/index.html')
|
||||||
|
|
||||||
|
class InvalidProcessResponseMiddleware:
|
||||||
|
def process_response(self, request, response, spider):
|
||||||
|
return 1
|
||||||
|
|
||||||
|
self.mwman._add_middleware(InvalidProcessResponseMiddleware())
|
||||||
|
download_func = mock.MagicMock()
|
||||||
|
dfd = self.mwman.download(download_func, req, self.spider)
|
||||||
|
results = []
|
||||||
|
dfd.addBoth(results.append)
|
||||||
|
self.assertIsInstance(results[0], Failure)
|
||||||
|
self.assertIsInstance(results[0].value, _InvalidOutput)
|
||||||
|
|
||||||
|
|
||||||
|
class ProcessExceptionInvalidOutput(ManagerTestCase):
|
||||||
|
"""Invalid return value for process_exception method should raise an exception"""
|
||||||
|
|
||||||
|
def test_invalid_process_exception(self):
|
||||||
|
req = Request('http://example.com/index.html')
|
||||||
|
|
||||||
|
class InvalidProcessExceptionMiddleware:
|
||||||
|
def process_request(self, request, spider):
|
||||||
|
raise Exception()
|
||||||
|
|
||||||
|
def process_exception(self, request, exception, spider):
|
||||||
|
return 1
|
||||||
|
|
||||||
|
self.mwman._add_middleware(InvalidProcessExceptionMiddleware())
|
||||||
|
download_func = mock.MagicMock()
|
||||||
|
dfd = self.mwman.download(download_func, req, self.spider)
|
||||||
|
results = []
|
||||||
|
dfd.addBoth(results.append)
|
||||||
|
self.assertIsInstance(results[0], Failure)
|
||||||
|
self.assertIsInstance(results[0].value, _InvalidOutput)
|
||||||
|
@ -279,5 +279,24 @@ class MetaRefreshMiddlewareTest(unittest.TestCase):
|
|||||||
self.assertEqual(req2.meta['redirect_reasons'], ['meta refresh'])
|
self.assertEqual(req2.meta['redirect_reasons'], ['meta refresh'])
|
||||||
self.assertEqual(req3.meta['redirect_reasons'], ['meta refresh', 'meta refresh'])
|
self.assertEqual(req3.meta['redirect_reasons'], ['meta refresh', 'meta refresh'])
|
||||||
|
|
||||||
|
def test_ignore_tags_default(self):
|
||||||
|
req = Request(url='http://example.org')
|
||||||
|
body = ('''<noscript><meta http-equiv="refresh" '''
|
||||||
|
'''content="0;URL='http://example.org/newpage'"></noscript>''')
|
||||||
|
rsp = HtmlResponse(req.url, body=body.encode())
|
||||||
|
response = self.mw.process_response(req, rsp, self.spider)
|
||||||
|
assert isinstance(response, Response)
|
||||||
|
|
||||||
|
def test_ignore_tags_empty_list(self):
|
||||||
|
crawler = get_crawler(Spider, {'METAREFRESH_IGNORE_TAGS': []})
|
||||||
|
mw = MetaRefreshMiddleware.from_crawler(crawler)
|
||||||
|
req = Request(url='http://example.org')
|
||||||
|
body = ('''<noscript><meta http-equiv="refresh" '''
|
||||||
|
'''content="0;URL='http://example.org/newpage'"></noscript>''')
|
||||||
|
rsp = HtmlResponse(req.url, body=body.encode())
|
||||||
|
req2 = mw.process_response(req, rsp, self.spider)
|
||||||
|
assert isinstance(req2, Request)
|
||||||
|
self.assertEqual(req2.url, 'http://example.org/newpage')
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
unittest.main()
|
unittest.main()
|
||||||
|
@ -419,6 +419,43 @@ class BasicItemLoaderTest(unittest.TestCase):
|
|||||||
self.assertEqual(item['url'], u'rabbit.hole')
|
self.assertEqual(item['url'], u'rabbit.hole')
|
||||||
self.assertEqual(item['summary'], u'rabbithole')
|
self.assertEqual(item['summary'], u'rabbithole')
|
||||||
|
|
||||||
|
def test_create_item_from_dict(self):
|
||||||
|
class TestItem(Item):
|
||||||
|
title = Field()
|
||||||
|
|
||||||
|
class TestItemLoader(ItemLoader):
|
||||||
|
default_item_class = TestItem
|
||||||
|
|
||||||
|
input_item = {'title': 'Test item title 1'}
|
||||||
|
il = TestItemLoader(item=input_item)
|
||||||
|
# Getting output value mustn't remove value from item
|
||||||
|
self.assertEqual(il.load_item(), {
|
||||||
|
'title': 'Test item title 1',
|
||||||
|
})
|
||||||
|
self.assertEqual(il.get_output_value('title'), 'Test item title 1')
|
||||||
|
self.assertEqual(il.load_item(), {
|
||||||
|
'title': 'Test item title 1',
|
||||||
|
})
|
||||||
|
|
||||||
|
input_item = {'title': 'Test item title 2'}
|
||||||
|
il = TestItemLoader(item=input_item)
|
||||||
|
# Values from dict must be added to item _values
|
||||||
|
self.assertEqual(il._values.get('title'), 'Test item title 2')
|
||||||
|
|
||||||
|
input_item = {'title': [u'Test item title 3', u'Test item 4']}
|
||||||
|
il = TestItemLoader(item=input_item)
|
||||||
|
# Same rules must work for lists
|
||||||
|
self.assertEqual(il._values.get('title'),
|
||||||
|
[u'Test item title 3', u'Test item 4'])
|
||||||
|
self.assertEqual(il.load_item(), {
|
||||||
|
'title': [u'Test item title 3', u'Test item 4'],
|
||||||
|
})
|
||||||
|
self.assertEqual(il.get_output_value('title'),
|
||||||
|
[u'Test item title 3', u'Test item 4'])
|
||||||
|
self.assertEqual(il.load_item(), {
|
||||||
|
'title': [u'Test item title 3', u'Test item 4'],
|
||||||
|
})
|
||||||
|
|
||||||
|
|
||||||
class ProcessorsTest(unittest.TestCase):
|
class ProcessorsTest(unittest.TestCase):
|
||||||
|
|
||||||
|
@ -1,15 +1,19 @@
|
|||||||
from __future__ import print_function
|
from __future__ import print_function
|
||||||
|
|
||||||
|
import sys
|
||||||
|
|
||||||
from testfixtures import LogCapture
|
from testfixtures import LogCapture
|
||||||
from twisted.trial import unittest
|
from twisted.trial import unittest
|
||||||
from twisted.python.failure import Failure
|
from twisted.python.failure import Failure
|
||||||
from twisted.internet import reactor
|
from twisted.internet import reactor
|
||||||
from twisted.internet.defer import Deferred, inlineCallbacks
|
from twisted.internet.defer import Deferred, inlineCallbacks, returnValue
|
||||||
|
|
||||||
from scrapy.http import Request, Response
|
from scrapy.http import Request, Response
|
||||||
from scrapy.settings import Settings
|
from scrapy.settings import Settings
|
||||||
from scrapy.spiders import Spider
|
from scrapy.spiders import Spider
|
||||||
from scrapy.utils.request import request_fingerprint
|
from scrapy.utils.request import request_fingerprint
|
||||||
from scrapy.pipelines.media import MediaPipeline
|
from scrapy.pipelines.media import MediaPipeline
|
||||||
|
from scrapy.pipelines.files import FileException
|
||||||
from scrapy.utils.log import failure_to_exc_info
|
from scrapy.utils.log import failure_to_exc_info
|
||||||
from scrapy.utils.signal import disconnect_all
|
from scrapy.utils.signal import disconnect_all
|
||||||
from scrapy import signals
|
from scrapy import signals
|
||||||
@ -90,6 +94,77 @@ class BaseMediaPipelineTestCase(unittest.TestCase):
|
|||||||
self.pipe._modify_media_request(request)
|
self.pipe._modify_media_request(request)
|
||||||
assert request.meta == {'handle_httpstatus_all': True}
|
assert request.meta == {'handle_httpstatus_all': True}
|
||||||
|
|
||||||
|
def test_should_remove_req_res_references_before_caching_the_results(self):
|
||||||
|
"""Regression test case to prevent a memory leak in the Media Pipeline.
|
||||||
|
|
||||||
|
The memory leak is triggered when an exception is raised when a Response
|
||||||
|
scheduled by the Media Pipeline is being returned. For example, when a
|
||||||
|
FileException('download-error') is raised because the Response status
|
||||||
|
code is not 200 OK.
|
||||||
|
|
||||||
|
It happens because we are keeping a reference to the Response object
|
||||||
|
inside the FileException context. This is caused by the way Twisted
|
||||||
|
return values from inline callbacks. It raises a custom exception
|
||||||
|
encapsulating the original return value.
|
||||||
|
|
||||||
|
The solution is to remove the exception context when this context is a
|
||||||
|
_DefGen_Return instance, the BaseException used by Twisted to pass the
|
||||||
|
returned value from those inline callbacks.
|
||||||
|
|
||||||
|
Maybe there's a better and more reliable way to test the case described
|
||||||
|
here, but it would be more complicated and involve running - or at least
|
||||||
|
mocking - some async steps from the Media Pipeline. The current test
|
||||||
|
case is simple and detects the problem very fast. On the other hand, it
|
||||||
|
would not detect another kind of leak happening due to old object
|
||||||
|
references being kept inside the Media Pipeline cache.
|
||||||
|
|
||||||
|
This problem does not occur in Python 2.7 since we don't have Exception
|
||||||
|
Chaining (https://www.python.org/dev/peps/pep-3134/).
|
||||||
|
"""
|
||||||
|
# Create sample pair of Request and Response objects
|
||||||
|
request = Request('http://url')
|
||||||
|
response = Response('http://url', body=b'', request=request)
|
||||||
|
|
||||||
|
# Simulate the Media Pipeline behavior to produce a Twisted Failure
|
||||||
|
try:
|
||||||
|
# Simulate a Twisted inline callback returning a Response
|
||||||
|
# The returnValue method raises an exception encapsulating the value
|
||||||
|
returnValue(response)
|
||||||
|
except BaseException as exc:
|
||||||
|
def_gen_return_exc = exc
|
||||||
|
try:
|
||||||
|
# Simulate the media_downloaded callback raising a FileException
|
||||||
|
# This usually happens when the status code is not 200 OK
|
||||||
|
raise FileException('download-error')
|
||||||
|
except Exception as exc:
|
||||||
|
file_exc = exc
|
||||||
|
# Simulate Twisted capturing the FileException
|
||||||
|
# It encapsulates the exception inside a Twisted Failure
|
||||||
|
failure = Failure(file_exc)
|
||||||
|
|
||||||
|
# The Failure should encapsulate a FileException ...
|
||||||
|
self.assertEqual(failure.value, file_exc)
|
||||||
|
# ... and if we're running on Python 3 ...
|
||||||
|
if sys.version_info.major >= 3:
|
||||||
|
# ... it should have the returnValue exception set as its context
|
||||||
|
self.assertEqual(failure.value.__context__, def_gen_return_exc)
|
||||||
|
|
||||||
|
# Let's calculate the request fingerprint and fake some runtime data...
|
||||||
|
fp = request_fingerprint(request)
|
||||||
|
info = self.pipe.spiderinfo
|
||||||
|
info.downloading.add(fp)
|
||||||
|
info.waiting[fp] = []
|
||||||
|
|
||||||
|
# When calling the method that caches the Request's result ...
|
||||||
|
self.pipe._cache_result_and_execute_waiters(failure, fp, info)
|
||||||
|
# ... it should store the Twisted Failure ...
|
||||||
|
self.assertEqual(info.downloaded[fp], failure)
|
||||||
|
# ... encapsulating the original FileException ...
|
||||||
|
self.assertEqual(info.downloaded[fp].value, file_exc)
|
||||||
|
# ... but it should not store the returnValue exception on its context
|
||||||
|
context = getattr(info.downloaded[fp].value, '__context__', None)
|
||||||
|
self.assertIsNone(context)
|
||||||
|
|
||||||
|
|
||||||
class MockedMediaPipeline(MediaPipeline):
|
class MockedMediaPipeline(MediaPipeline):
|
||||||
|
|
||||||
|
342
tests/test_scheduler.py
Normal file
@ -0,0 +1,342 @@
import shutil
import tempfile
import unittest
import collections

from twisted.internet import defer
from twisted.trial.unittest import TestCase

from scrapy.crawler import Crawler
from scrapy.core.downloader import Downloader
from scrapy.core.scheduler import Scheduler
from scrapy.http import Request
from scrapy.spiders import Spider
from scrapy.utils.httpobj import urlparse_cached
from scrapy.utils.test import get_crawler
from tests.mockserver import MockServer


MockEngine = collections.namedtuple('MockEngine', ['downloader'])
MockSlot = collections.namedtuple('MockSlot', ['active'])


class MockDownloader(object):
    def __init__(self):
        self.slots = dict()

    def _get_slot_key(self, request, spider):
        if Downloader.DOWNLOAD_SLOT in request.meta:
            return request.meta[Downloader.DOWNLOAD_SLOT]

        return urlparse_cached(request).hostname or ''

    def increment(self, slot_key):
        slot = self.slots.setdefault(slot_key, MockSlot(active=list()))
        slot.active.append(1)

    def decrement(self, slot_key):
        slot = self.slots.get(slot_key)
        slot.active.pop()

    def close(self):
        pass


class MockCrawler(Crawler):
    def __init__(self, priority_queue_cls, jobdir):
        settings = dict(
            LOG_UNSERIALIZABLE_REQUESTS=False,
            SCHEDULER_DISK_QUEUE='scrapy.squeues.PickleLifoDiskQueue',
            SCHEDULER_MEMORY_QUEUE='scrapy.squeues.LifoMemoryQueue',
            SCHEDULER_PRIORITY_QUEUE=priority_queue_cls,
            JOBDIR=jobdir,
            DUPEFILTER_CLASS='scrapy.dupefilters.BaseDupeFilter'
        )
        super(MockCrawler, self).__init__(Spider, settings)
        self.engine = MockEngine(downloader=MockDownloader())


class SchedulerHandler(object):
    priority_queue_cls = None
    jobdir = None

    def create_scheduler(self):
        self.mock_crawler = MockCrawler(self.priority_queue_cls, self.jobdir)
        self.scheduler = Scheduler.from_crawler(self.mock_crawler)
        self.spider = Spider(name='spider')
        self.scheduler.open(self.spider)

    def close_scheduler(self):
        self.scheduler.close('finished')
        self.mock_crawler.stop()
        self.mock_crawler.engine.downloader.close()

    def setUp(self):
        self.create_scheduler()

    def tearDown(self):
        self.close_scheduler()


_PRIORITIES = [("http://foo.com/a", -2),
               ("http://foo.com/d", 1),
               ("http://foo.com/b", -1),
               ("http://foo.com/c", 0),
               ("http://foo.com/e", 2)]


_URLS = {"http://foo.com/a", "http://foo.com/b", "http://foo.com/c"}


class BaseSchedulerInMemoryTester(SchedulerHandler):
    def test_length(self):
        self.assertFalse(self.scheduler.has_pending_requests())
        self.assertEqual(len(self.scheduler), 0)

        for url in _URLS:
            self.scheduler.enqueue_request(Request(url))

        self.assertTrue(self.scheduler.has_pending_requests())
        self.assertEqual(len(self.scheduler), len(_URLS))

    def test_dequeue(self):
        for url in _URLS:
            self.scheduler.enqueue_request(Request(url))

        urls = set()
        while self.scheduler.has_pending_requests():
            urls.add(self.scheduler.next_request().url)

        self.assertEqual(urls, _URLS)

    def test_dequeue_priorities(self):
        for url, priority in _PRIORITIES:
            self.scheduler.enqueue_request(Request(url, priority=priority))

        priorities = list()
        while self.scheduler.has_pending_requests():
            priorities.append(self.scheduler.next_request().priority)

        self.assertEqual(priorities,
                         sorted([x[1] for x in _PRIORITIES], key=lambda x: -x))


class BaseSchedulerOnDiskTester(SchedulerHandler):

    def setUp(self):
        self.jobdir = tempfile.mkdtemp()
        self.create_scheduler()

    def tearDown(self):
        self.close_scheduler()

        shutil.rmtree(self.jobdir)
        self.jobdir = None

    def test_length(self):
        self.assertFalse(self.scheduler.has_pending_requests())
        self.assertEqual(len(self.scheduler), 0)

        for url in _URLS:
            self.scheduler.enqueue_request(Request(url))

        self.close_scheduler()
        self.create_scheduler()

        self.assertTrue(self.scheduler.has_pending_requests())
        self.assertEqual(len(self.scheduler), len(_URLS))

    def test_dequeue(self):
        for url in _URLS:
            self.scheduler.enqueue_request(Request(url))

        self.close_scheduler()
        self.create_scheduler()

        urls = set()
        while self.scheduler.has_pending_requests():
            urls.add(self.scheduler.next_request().url)

        self.assertEqual(urls, _URLS)

    def test_dequeue_priorities(self):
        for url, priority in _PRIORITIES:
            self.scheduler.enqueue_request(Request(url, priority=priority))

        self.close_scheduler()
        self.create_scheduler()

        priorities = list()
        while self.scheduler.has_pending_requests():
            priorities.append(self.scheduler.next_request().priority)

        self.assertEqual(priorities,
                         sorted([x[1] for x in _PRIORITIES], key=lambda x: -x))


class TestSchedulerInMemory(BaseSchedulerInMemoryTester, unittest.TestCase):
    priority_queue_cls = 'scrapy.pqueues.ScrapyPriorityQueue'


class TestSchedulerOnDisk(BaseSchedulerOnDiskTester, unittest.TestCase):
    priority_queue_cls = 'scrapy.pqueues.ScrapyPriorityQueue'


_URLS_WITH_SLOTS = [("http://foo.com/a", 'a'),
                    ("http://foo.com/b", 'a'),
                    ("http://foo.com/c", 'b'),
                    ("http://foo.com/d", 'b'),
                    ("http://foo.com/e", 'c'),
                    ("http://foo.com/f", 'c')]


class TestMigration(unittest.TestCase):

    def setUp(self):
        self.tmpdir = tempfile.mkdtemp()

    def tearDown(self):
        shutil.rmtree(self.tmpdir)

    def _migration(self, tmp_dir):
        prev_scheduler_handler = SchedulerHandler()
        prev_scheduler_handler.priority_queue_cls = 'scrapy.pqueues.ScrapyPriorityQueue'
        prev_scheduler_handler.jobdir = tmp_dir

        prev_scheduler_handler.create_scheduler()
        for url in _URLS:
            prev_scheduler_handler.scheduler.enqueue_request(Request(url))
        prev_scheduler_handler.close_scheduler()

        next_scheduler_handler = SchedulerHandler()
        next_scheduler_handler.priority_queue_cls = 'scrapy.pqueues.DownloaderAwarePriorityQueue'
        next_scheduler_handler.jobdir = tmp_dir

        next_scheduler_handler.create_scheduler()

    def test_migration(self):
        with self.assertRaises(ValueError):
            self._migration(self.tmpdir)


def _is_scheduling_fair(enqueued_slots, dequeued_slots):
    """
    We enqueued same number of requests for every slot.
    Assert correct order, e.g.

    >>> enqueued = ['a', 'b', 'c'] * 2
    >>> correct = ['a', 'c', 'b', 'b', 'a', 'c']
    >>> incorrect = ['a', 'a', 'b', 'c', 'c', 'b']
    >>> _is_scheduling_fair(enqueued, correct)
    True
    >>> _is_scheduling_fair(enqueued, incorrect)
    False
    """
    if len(dequeued_slots) != len(enqueued_slots):
        return False

    slots_number = len(set(enqueued_slots))
    for i in range(0, len(dequeued_slots), slots_number):
        part = dequeued_slots[i:i + slots_number]
        if len(part) != len(set(part)):
            return False

    return True


class DownloaderAwareSchedulerTestMixin(object):
    priority_queue_cls = 'scrapy.pqueues.DownloaderAwarePriorityQueue'
    reopen = False

    def test_logic(self):
        for url, slot in _URLS_WITH_SLOTS:
            request = Request(url)
            request.meta[Downloader.DOWNLOAD_SLOT] = slot
            self.scheduler.enqueue_request(request)

        if self.reopen:
            self.close_scheduler()
            self.create_scheduler()

        dequeued_slots = list()
        requests = []
        downloader = self.mock_crawler.engine.downloader
        while self.scheduler.has_pending_requests():
            request = self.scheduler.next_request()
            # pylint: disable=protected-access
            slot = downloader._get_slot_key(request, None)
            dequeued_slots.append(slot)
            downloader.increment(slot)
            requests.append(request)

        for request in requests:
            # pylint: disable=protected-access
            slot = downloader._get_slot_key(request, None)
            downloader.decrement(slot)

        self.assertTrue(_is_scheduling_fair(list(s for u, s in _URLS_WITH_SLOTS),
                                            dequeued_slots))
        self.assertEqual(sum(len(s.active) for s in downloader.slots.values()), 0)


class TestSchedulerWithDownloaderAwareInMemory(DownloaderAwareSchedulerTestMixin,
                                               BaseSchedulerInMemoryTester,
                                               unittest.TestCase):
    pass


class TestSchedulerWithDownloaderAwareOnDisk(DownloaderAwareSchedulerTestMixin,
                                             BaseSchedulerOnDiskTester,
                                             unittest.TestCase):
    reopen = True


class StartUrlsSpider(Spider):

    def __init__(self, start_urls):
        self.start_urls = start_urls
        super(StartUrlsSpider, self).__init__(start_urls)

    def parse(self, response):
        pass


class TestIntegrationWithDownloaderAwareInMemory(TestCase):
    def setUp(self):
        self.crawler = get_crawler(
            StartUrlsSpider,
            {'SCHEDULER_PRIORITY_QUEUE': 'scrapy.pqueues.DownloaderAwarePriorityQueue',
             'DUPEFILTER_CLASS': 'scrapy.dupefilters.BaseDupeFilter'}
        )

    @defer.inlineCallbacks
    def tearDown(self):
        yield self.crawler.stop()

    @defer.inlineCallbacks
    def test_integration_downloader_aware_priority_queue(self):
        with MockServer() as mockserver:
            url = mockserver.url("/status?n=200", is_secure=False)
            start_urls = [url] * 6
            yield self.crawler.crawl(start_urls)
            self.assertEqual(self.crawler.stats.get_value('downloader/response_count'),
                             len(start_urls))


class TestIncompatibility(unittest.TestCase):

    def _incompatible(self):
        settings = dict(
            SCHEDULER_PRIORITY_QUEUE='scrapy.pqueues.DownloaderAwarePriorityQueue',
            CONCURRENT_REQUESTS_PER_IP=1
        )
        crawler = Crawler(Spider, settings)
        scheduler = Scheduler.from_crawler(crawler)
        spider = Spider(name='spider')
        scheduler.open(spider)

    def test_incompatibility(self):
        with self.assertRaises(ValueError):
            self._incompatible()
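The scheduler tests above hinge on the SCHEDULER_PRIORITY_QUEUE setting. As a rough usage sketch, grounded only in the class paths and settings names that appear in these tests (the settings module itself is illustrative), enabling the downloader-aware queue looks like this; the TestIncompatibility case suggests it cannot be combined with per-IP concurrency limits:

# Illustrative project settings.py; the queue class path is the one used in the tests above.
SCHEDULER_PRIORITY_QUEUE = 'scrapy.pqueues.DownloaderAwarePriorityQueue'

# Per the TestIncompatibility case, combining the downloader-aware queue with
# CONCURRENT_REQUESTS_PER_IP raises ValueError, so that setting should stay unset here.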
@ -105,11 +105,11 @@ class SpiderTest(unittest.TestCase):
     def test_logger(self):
         spider = self.spider_class('example.com')
-        with LogCapture() as l:
+        with LogCapture() as lc:
             spider.logger.info('test log msg')
-            l.check(('example.com', 'INFO', 'test log msg'))
+            lc.check(('example.com', 'INFO', 'test log msg'))
 
-        record = l.records[0]
+        record = lc.records[0]
         self.assertIn('spider', record.__dict__)
         self.assertIs(record.spider, spider)
 
@ -190,8 +190,7 @@ class CrawlSpiderTest(SpiderTest):
     def test_process_links(self):
 
-        response = HtmlResponse("http://example.org/somepage/index.html",
-                                body=self.test_body)
+        response = HtmlResponse("http://example.org/somepage/index.html", body=self.test_body)
 
         class _CrawlSpider(self.spider_class):
             name = "test"
@ -214,8 +213,7 @@ class CrawlSpiderTest(SpiderTest):
     def test_process_links_filter(self):
 
-        response = HtmlResponse("http://example.org/somepage/index.html",
-                                body=self.test_body)
+        response = HtmlResponse("http://example.org/somepage/index.html", body=self.test_body)
 
         class _CrawlSpider(self.spider_class):
             import re
@ -226,6 +224,7 @@ class CrawlSpiderTest(SpiderTest):
                 Rule(LinkExtractor(), process_links="filter_process_links"),
             )
             _test_regex = re.compile('nofollow')
 
             def filter_process_links(self, links):
                 return [link for link in links
                         if not self._test_regex.search(link.url)]
@ -240,8 +239,7 @@ class CrawlSpiderTest(SpiderTest):
     def test_process_links_generator(self):
 
-        response = HtmlResponse("http://example.org/somepage/index.html",
-                                body=self.test_body)
+        response = HtmlResponse("http://example.org/somepage/index.html", body=self.test_body)
 
         class _CrawlSpider(self.spider_class):
             name = "test"
@ -263,6 +261,110 @@ class CrawlSpiderTest(SpiderTest):
                           'http://example.org/about.html',
                           'http://example.org/nofollow.html'])
 
+    def test_process_request(self):
+
+        response = HtmlResponse("http://example.org/somepage/index.html", body=self.test_body)
+
+        def process_request_change_domain(request):
+            return request.replace(url=request.url.replace('.org', '.com'))
+
+        class _CrawlSpider(self.spider_class):
+            name = "test"
+            allowed_domains = ['example.org']
+            rules = (
+                Rule(LinkExtractor(), process_request=process_request_change_domain),
+            )
+
+        with warnings.catch_warnings(record=True) as cw:
+            spider = _CrawlSpider()
+            output = list(spider._requests_to_follow(response))
+            self.assertEqual(len(output), 3)
+            self.assertTrue(all(map(lambda r: isinstance(r, Request), output)))
+            self.assertEqual([r.url for r in output],
+                             ['http://example.com/somepage/item/12.html',
+                              'http://example.com/about.html',
+                              'http://example.com/nofollow.html'])
+            self.assertEqual(len(cw), 1)
+            self.assertEqual(cw[0].category, ScrapyDeprecationWarning)
+
+    def test_process_request_with_response(self):
+
+        response = HtmlResponse("http://example.org/somepage/index.html", body=self.test_body)
+
+        def process_request_meta_response_class(request, response):
+            request.meta['response_class'] = response.__class__.__name__
+            return request
+
+        class _CrawlSpider(self.spider_class):
+            name = "test"
+            allowed_domains = ['example.org']
+            rules = (
+                Rule(LinkExtractor(), process_request=process_request_meta_response_class),
+            )
+
+        spider = _CrawlSpider()
+        output = list(spider._requests_to_follow(response))
+        self.assertEqual(len(output), 3)
+        self.assertTrue(all(map(lambda r: isinstance(r, Request), output)))
+        self.assertEqual([r.url for r in output],
+                         ['http://example.org/somepage/item/12.html',
+                          'http://example.org/about.html',
+                          'http://example.org/nofollow.html'])
+        self.assertEqual([r.meta['response_class'] for r in output],
+                         ['HtmlResponse', 'HtmlResponse', 'HtmlResponse'])
+
+    def test_process_request_instance_method(self):
+
+        response = HtmlResponse("http://example.org/somepage/index.html", body=self.test_body)
+
+        class _CrawlSpider(self.spider_class):
+            name = "test"
+            allowed_domains = ['example.org']
+            rules = (
+                Rule(LinkExtractor(), process_request='process_request_upper'),
+            )
+
+            def process_request_upper(self, request):
+                return request.replace(url=request.url.upper())
+
+        with warnings.catch_warnings(record=True) as cw:
+            spider = _CrawlSpider()
+            output = list(spider._requests_to_follow(response))
+            self.assertEqual(len(output), 3)
+            self.assertTrue(all(map(lambda r: isinstance(r, Request), output)))
+            self.assertEqual([r.url for r in output],
+                             ['http://EXAMPLE.ORG/SOMEPAGE/ITEM/12.HTML',
+                              'http://EXAMPLE.ORG/ABOUT.HTML',
+                              'http://EXAMPLE.ORG/NOFOLLOW.HTML'])
+            self.assertEqual(len(cw), 1)
+            self.assertEqual(cw[0].category, ScrapyDeprecationWarning)
+
+    def test_process_request_instance_method_with_response(self):
+
+        response = HtmlResponse("http://example.org/somepage/index.html", body=self.test_body)
+
+        class _CrawlSpider(self.spider_class):
+            name = "test"
+            allowed_domains = ['example.org']
+            rules = (
+                Rule(LinkExtractor(), process_request='process_request_meta_response_class'),
+            )
+
+            def process_request_meta_response_class(self, request, response):
+                request.meta['response_class'] = response.__class__.__name__
+                return request
+
+        spider = _CrawlSpider()
+        output = list(spider._requests_to_follow(response))
+        self.assertEqual(len(output), 3)
+        self.assertTrue(all(map(lambda r: isinstance(r, Request), output)))
+        self.assertEqual([r.url for r in output],
+                         ['http://example.org/somepage/item/12.html',
+                          'http://example.org/about.html',
+                          'http://example.org/nofollow.html'])
+        self.assertEqual([r.meta['response_class'] for r in output],
+                         ['HtmlResponse', 'HtmlResponse', 'HtmlResponse'])
+
     def test_follow_links_attribute_population(self):
         crawler = get_crawler()
         spider = self.spider_class.from_crawler(crawler, 'example.com')
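The new CrawlSpider tests above exercise Rule(..., process_request=...) callables that accept both the request and the response it was extracted from, and show that single-argument callables now emit ScrapyDeprecationWarning. A minimal sketch of the new-style hook (the spider name, start URL and meta key are made up for illustration):

from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule


def tag_origin(request, response):
    # record which page the link was extracted from
    request.meta['origin_url'] = response.url
    return request


class ExampleCrawlSpider(CrawlSpider):
    name = 'example'                      # hypothetical
    start_urls = ['http://example.org/']  # hypothetical
    rules = (
        Rule(LinkExtractor(), process_request=tag_origin),
    )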
102
tests/test_spidermiddleware.py
Normal file
@ -0,0 +1,102 @@
from twisted.trial.unittest import TestCase
from twisted.python.failure import Failure

from scrapy.spiders import Spider
from scrapy.http import Request, Response
from scrapy.exceptions import _InvalidOutput
from scrapy.utils.test import get_crawler
from scrapy.core.spidermw import SpiderMiddlewareManager
from tests import mock


class SpiderMiddlewareTestCase(TestCase):

    def setUp(self):
        self.request = Request('http://example.com/index.html')
        self.response = Response(self.request.url, request=self.request)
        self.crawler = get_crawler(Spider)
        self.spider = self.crawler._create_spider('foo')
        self.mwman = SpiderMiddlewareManager.from_crawler(self.crawler)

    def _scrape_response(self):
        """Execute spider mw manager's scrape_response method and return the result.
        Raise exception in case of failure.
        """
        scrape_func = mock.MagicMock()
        dfd = self.mwman.scrape_response(scrape_func, self.response, self.request, self.spider)
        # catch deferred result and return the value
        results = []
        dfd.addBoth(results.append)
        self._wait(dfd)
        ret = results[0]
        return ret


class ProcessSpiderInputInvalidOutput(SpiderMiddlewareTestCase):
    """Invalid return value for process_spider_input method"""

    def test_invalid_process_spider_input(self):

        class InvalidProcessSpiderInputMiddleware:
            def process_spider_input(self, response, spider):
                return 1

        self.mwman._add_middleware(InvalidProcessSpiderInputMiddleware())
        result = self._scrape_response()
        self.assertIsInstance(result, Failure)
        self.assertIsInstance(result.value, _InvalidOutput)


class ProcessSpiderOutputInvalidOutput(SpiderMiddlewareTestCase):
    """Invalid return value for process_spider_output method"""

    def test_invalid_process_spider_output(self):

        class InvalidProcessSpiderOutputMiddleware:
            def process_spider_output(self, response, result, spider):
                return 1

        self.mwman._add_middleware(InvalidProcessSpiderOutputMiddleware())
        result = self._scrape_response()
        self.assertIsInstance(result, Failure)
        self.assertIsInstance(result.value, _InvalidOutput)


class ProcessSpiderExceptionInvalidOutput(SpiderMiddlewareTestCase):
    """Invalid return value for process_spider_exception method"""

    def test_invalid_process_spider_exception(self):

        class InvalidProcessSpiderOutputExceptionMiddleware:
            def process_spider_exception(self, response, exception, spider):
                return 1

        class RaiseExceptionProcessSpiderOutputMiddleware:
            def process_spider_output(self, response, result, spider):
                raise Exception()

        self.mwman._add_middleware(InvalidProcessSpiderOutputExceptionMiddleware())
        self.mwman._add_middleware(RaiseExceptionProcessSpiderOutputMiddleware())
        result = self._scrape_response()
        self.assertIsInstance(result, Failure)
        self.assertIsInstance(result.value, _InvalidOutput)


class ProcessSpiderExceptionReRaise(SpiderMiddlewareTestCase):
    """Re raise the exception by returning None"""

    def test_process_spider_exception_return_none(self):

        class ProcessSpiderExceptionReturnNoneMiddleware:
            def process_spider_exception(self, response, exception, spider):
                return None

        class RaiseExceptionProcessSpiderOutputMiddleware:
            def process_spider_output(self, response, result, spider):
                1/0

        self.mwman._add_middleware(ProcessSpiderExceptionReturnNoneMiddleware())
        self.mwman._add_middleware(RaiseExceptionProcessSpiderOutputMiddleware())
        result = self._scrape_response()
        self.assertIsInstance(result, Failure)
        self.assertIsInstance(result.value, ZeroDivisionError)
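For contrast with the deliberately invalid middlewares above, a well-formed spider middleware returns None from process_spider_input and an iterable from process_spider_output; process_spider_exception may return None (to pass the exception along) or an iterable. The tests above check that anything else surfaces as an _InvalidOutput failure. A minimal sketch (the class name and pass-through logic are illustrative, not part of the patch):

class WellFormedSpiderMiddleware:
    def process_spider_input(self, response, spider):
        return None  # or raise an exception to trigger the errback/exception chain

    def process_spider_output(self, response, result, spider):
        for item_or_request in result:
            yield item_or_request  # pass results through unchanged

    def process_spider_exception(self, response, exception, spider):
        return None  # defer to the next middleware's process_spider_exception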
380
tests/test_spidermiddleware_output_chain.py
Normal file
@ -0,0 +1,380 @@
from testfixtures import LogCapture
from twisted.trial.unittest import TestCase
from twisted.internet import defer

from scrapy import Spider, Request
from scrapy.utils.test import get_crawler
from tests.mockserver import MockServer
from tests.spiders import MockServerSpider


class LogExceptionMiddleware:
    def process_spider_exception(self, response, exception, spider):
        spider.logger.info('Middleware: %s exception caught', exception.__class__.__name__)
        return None


# ================================================================================
# (0) recover from an exception on a spider callback
class RecoverySpider(Spider):
    name = 'RecoverySpider'
    custom_settings = {
        'SPIDER_MIDDLEWARES': {
            __name__ + '.RecoveryMiddleware': 10,
        },
    }

    def start_requests(self):
        yield Request(self.mockserver.url('/status?n=200'))

    def parse(self, response):
        yield {'test': 1}
        self.logger.info('DONT_FAIL: %s', response.meta.get('dont_fail'))
        if not response.meta.get('dont_fail'):
            raise TabError()


class RecoveryMiddleware:
    def process_spider_exception(self, response, exception, spider):
        spider.logger.info('Middleware: %s exception caught', exception.__class__.__name__)
        return [
            {'from': 'process_spider_exception'},
            Request(response.url, meta={'dont_fail': True}, dont_filter=True),
        ]


# ================================================================================
# (1) exceptions from a spider middleware's process_spider_input method
class FailProcessSpiderInputMiddleware:
    def process_spider_input(self, response, spider):
        spider.logger.info('Middleware: will raise IndexError')
        raise IndexError()


class ProcessSpiderInputSpiderWithoutErrback(Spider):
    name = 'ProcessSpiderInputSpiderWithoutErrback'
    custom_settings = {
        'SPIDER_MIDDLEWARES': {
            # spider
            __name__ + '.LogExceptionMiddleware': 10,
            __name__ + '.FailProcessSpiderInputMiddleware': 8,
            __name__ + '.LogExceptionMiddleware': 6,
            # engine
        }
    }

    def start_requests(self):
        yield Request(url=self.mockserver.url('/status?n=200'), callback=self.parse)

    def parse(self, response):
        return {'from': 'callback'}


class ProcessSpiderInputSpiderWithErrback(ProcessSpiderInputSpiderWithoutErrback):
    name = 'ProcessSpiderInputSpiderWithErrback'

    def start_requests(self):
        yield Request(url=self.mockserver.url('/status?n=200'), callback=self.parse, errback=self.errback)

    def errback(self, failure):
        self.logger.info('Got a Failure on the Request errback')
        return {'from': 'errback'}


# ================================================================================
# (2) exceptions from a spider callback (generator)
class GeneratorCallbackSpider(Spider):
    name = 'GeneratorCallbackSpider'
    custom_settings = {
        'SPIDER_MIDDLEWARES': {
            __name__ + '.LogExceptionMiddleware': 10,
        },
    }

    def start_requests(self):
        yield Request(self.mockserver.url('/status?n=200'))

    def parse(self, response):
        yield {'test': 1}
        yield {'test': 2}
        raise ImportError()


# ================================================================================
# (3) exceptions from a spider callback (not a generator)
class NotGeneratorCallbackSpider(Spider):
    name = 'NotGeneratorCallbackSpider'
    custom_settings = {
        'SPIDER_MIDDLEWARES': {
            __name__ + '.LogExceptionMiddleware': 10,
        },
    }

    def start_requests(self):
        yield Request(self.mockserver.url('/status?n=200'))

    def parse(self, response):
        return [{'test': 1}, {'test': 1/0}]


# ================================================================================
# (4) exceptions from a middleware process_spider_output method (generator)
class GeneratorOutputChainSpider(Spider):
    name = 'GeneratorOutputChainSpider'
    custom_settings = {
        'SPIDER_MIDDLEWARES': {
            __name__ + '.GeneratorFailMiddleware': 10,
            __name__ + '.GeneratorDoNothingAfterFailureMiddleware': 8,
            __name__ + '.GeneratorRecoverMiddleware': 5,
            __name__ + '.GeneratorDoNothingAfterRecoveryMiddleware': 3,
        },
    }

    def start_requests(self):
        yield Request(self.mockserver.url('/status?n=200'))

    def parse(self, response):
        yield {'processed': ['parse-first-item']}
        yield {'processed': ['parse-second-item']}


class _GeneratorDoNothingMiddleware:
    def process_spider_output(self, response, result, spider):
        for r in result:
            r['processed'].append('{}.process_spider_output'.format(self.__class__.__name__))
            yield r

    def process_spider_exception(self, response, exception, spider):
        method = '{}.process_spider_exception'.format(self.__class__.__name__)
        spider.logger.info('%s: %s caught', method, exception.__class__.__name__)
        return None


class GeneratorFailMiddleware:
    def process_spider_output(self, response, result, spider):
        for r in result:
            r['processed'].append('{}.process_spider_output'.format(self.__class__.__name__))
            yield r
            raise LookupError()

    def process_spider_exception(self, response, exception, spider):
        method = '{}.process_spider_exception'.format(self.__class__.__name__)
        spider.logger.info('%s: %s caught', method, exception.__class__.__name__)
        yield {'processed': [method]}


class GeneratorDoNothingAfterFailureMiddleware(_GeneratorDoNothingMiddleware):
    pass


class GeneratorRecoverMiddleware:
    def process_spider_output(self, response, result, spider):
        for r in result:
            r['processed'].append('{}.process_spider_output'.format(self.__class__.__name__))
            yield r

    def process_spider_exception(self, response, exception, spider):
        method = '{}.process_spider_exception'.format(self.__class__.__name__)
        spider.logger.info('%s: %s caught', method, exception.__class__.__name__)
        yield {'processed': [method]}


class GeneratorDoNothingAfterRecoveryMiddleware(_GeneratorDoNothingMiddleware):
    pass


# ================================================================================
# (5) exceptions from a middleware process_spider_output method (not generator)
class NotGeneratorOutputChainSpider(Spider):
    name = 'NotGeneratorOutputChainSpider'
    custom_settings = {
        'SPIDER_MIDDLEWARES': {
            __name__ + '.NotGeneratorFailMiddleware': 10,
            __name__ + '.NotGeneratorDoNothingAfterFailureMiddleware': 8,
            __name__ + '.NotGeneratorRecoverMiddleware': 5,
            __name__ + '.NotGeneratorDoNothingAfterRecoveryMiddleware': 3,
        },
    }

    def start_requests(self):
        return [Request(self.mockserver.url('/status?n=200'))]

    def parse(self, response):
        return [{'processed': ['parse-first-item']}, {'processed': ['parse-second-item']}]


class _NotGeneratorDoNothingMiddleware:
    def process_spider_output(self, response, result, spider):
        out = []
        for r in result:
            r['processed'].append('{}.process_spider_output'.format(self.__class__.__name__))
            out.append(r)
        return out

    def process_spider_exception(self, response, exception, spider):
        method = '{}.process_spider_exception'.format(self.__class__.__name__)
        spider.logger.info('%s: %s caught', method, exception.__class__.__name__)
        return None


class NotGeneratorFailMiddleware:
    def process_spider_output(self, response, result, spider):
        out = []
        for r in result:
            r['processed'].append('{}.process_spider_output'.format(self.__class__.__name__))
            out.append(r)
        raise ReferenceError()
        return out

    def process_spider_exception(self, response, exception, spider):
        method = '{}.process_spider_exception'.format(self.__class__.__name__)
        spider.logger.info('%s: %s caught', method, exception.__class__.__name__)
        return [{'processed': [method]}]


class NotGeneratorDoNothingAfterFailureMiddleware(_NotGeneratorDoNothingMiddleware):
    pass


class NotGeneratorRecoverMiddleware:
    def process_spider_output(self, response, result, spider):
        out = []
        for r in result:
            r['processed'].append('{}.process_spider_output'.format(self.__class__.__name__))
            out.append(r)
        return out

    def process_spider_exception(self, response, exception, spider):
        method = '{}.process_spider_exception'.format(self.__class__.__name__)
        spider.logger.info('%s: %s caught', method, exception.__class__.__name__)
        return [{'processed': [method]}]


class NotGeneratorDoNothingAfterRecoveryMiddleware(_NotGeneratorDoNothingMiddleware):
    pass


# ================================================================================
class TestSpiderMiddleware(TestCase):
    @classmethod
    def setUpClass(cls):
        cls.mockserver = MockServer()
        cls.mockserver.__enter__()

    @classmethod
    def tearDownClass(cls):
        cls.mockserver.__exit__(None, None, None)

    @defer.inlineCallbacks
    def crawl_log(self, spider):
        crawler = get_crawler(spider)
        with LogCapture() as log:
            yield crawler.crawl(mockserver=self.mockserver)
        raise defer.returnValue(log)

    @defer.inlineCallbacks
    def test_recovery(self):
        """
        (0) Recover from an exception in a spider callback. The final item count should be 3
        (one yielded from the callback method before the exception is raised, one directly
        from the recovery middleware and one from the spider when processing the request that
        was enqueued from the recovery middleware)
        """
        log = yield self.crawl_log(RecoverySpider)
        self.assertIn("Middleware: TabError exception caught", str(log))
        self.assertEqual(str(log).count("Middleware: TabError exception caught"), 1)
        self.assertIn("'item_scraped_count': 3", str(log))

    @defer.inlineCallbacks
    def test_process_spider_input_without_errback(self):
        """
        (1.1) An exception from the process_spider_input chain should be caught by the
        process_spider_exception chain from the start if the Request has no errback
        """
        log1 = yield self.crawl_log(ProcessSpiderInputSpiderWithoutErrback)
        self.assertIn("Middleware: will raise IndexError", str(log1))
        self.assertIn("Middleware: IndexError exception caught", str(log1))

    @defer.inlineCallbacks
    def test_process_spider_input_with_errback(self):
        """
        (1.2) An exception from the process_spider_input chain should not be caught by the
        process_spider_exception chain if the Request has an errback
        """
        log1 = yield self.crawl_log(ProcessSpiderInputSpiderWithErrback)
        self.assertNotIn("Middleware: IndexError exception caught", str(log1))
        self.assertIn("Middleware: will raise IndexError", str(log1))
        self.assertIn("Got a Failure on the Request errback", str(log1))
        self.assertIn("{'from': 'errback'}", str(log1))
        self.assertNotIn("{'from': 'callback'}", str(log1))
        self.assertIn("'item_scraped_count': 1", str(log1))

    @defer.inlineCallbacks
    def test_generator_callback(self):
        """
        (2) An exception from a spider callback (returning a generator) should
        be caught by the process_spider_exception chain. Items yielded before the
        exception is raised should be processed normally.
        """
        log2 = yield self.crawl_log(GeneratorCallbackSpider)
        self.assertIn("Middleware: ImportError exception caught", str(log2))
        self.assertIn("'item_scraped_count': 2", str(log2))

    @defer.inlineCallbacks
    def test_not_a_generator_callback(self):
        """
        (3) An exception from a spider callback (returning a list) should
        be caught by the process_spider_exception chain. No items should be processed.
        """
        log3 = yield self.crawl_log(NotGeneratorCallbackSpider)
        self.assertIn("Middleware: ZeroDivisionError exception caught", str(log3))
        self.assertNotIn("item_scraped_count", str(log3))

    @defer.inlineCallbacks
    def test_generator_output_chain(self):
        """
        (4) An exception from a middleware's process_spider_output method should be sent
        to the process_spider_exception method from the next middleware in the chain.
        The result of the recovery by the process_spider_exception method should be handled
        by the process_spider_output method from the next middleware.
        The final item count should be 2 (one from the spider callback and one from the
        process_spider_exception chain)
        """
        log4 = yield self.crawl_log(GeneratorOutputChainSpider)
        self.assertIn("'item_scraped_count': 2", str(log4))
        self.assertIn("GeneratorRecoverMiddleware.process_spider_exception: LookupError caught", str(log4))
        self.assertIn("GeneratorDoNothingAfterFailureMiddleware.process_spider_exception: LookupError caught", str(log4))
        self.assertNotIn("GeneratorFailMiddleware.process_spider_exception: LookupError caught", str(log4))
        self.assertNotIn("GeneratorDoNothingAfterRecoveryMiddleware.process_spider_exception: LookupError caught", str(log4))
        item_from_callback = {'processed': [
            'parse-first-item',
            'GeneratorFailMiddleware.process_spider_output',
            'GeneratorDoNothingAfterFailureMiddleware.process_spider_output',
            'GeneratorRecoverMiddleware.process_spider_output',
            'GeneratorDoNothingAfterRecoveryMiddleware.process_spider_output']}
        item_recovered = {'processed': [
            'GeneratorRecoverMiddleware.process_spider_exception',
            'GeneratorDoNothingAfterRecoveryMiddleware.process_spider_output']}
        self.assertIn(str(item_from_callback), str(log4))
        self.assertIn(str(item_recovered), str(log4))
        self.assertNotIn('parse-second-item', str(log4))

    @defer.inlineCallbacks
    def test_not_a_generator_output_chain(self):
        """
        (5) An exception from a middleware's process_spider_output method should be sent
        to the process_spider_exception method from the next middleware in the chain.
        The result of the recovery by the process_spider_exception method should be handled
        by the process_spider_output method from the next middleware.
        The final item count should be 1 (from the process_spider_exception chain, the items
        from the spider callback are lost)
        """
        log5 = yield self.crawl_log(NotGeneratorOutputChainSpider)
        self.assertIn("'item_scraped_count': 1", str(log5))
        self.assertIn("GeneratorRecoverMiddleware.process_spider_exception: ReferenceError caught", str(log5))
        self.assertIn("GeneratorDoNothingAfterFailureMiddleware.process_spider_exception: ReferenceError caught", str(log5))
        self.assertNotIn("GeneratorFailMiddleware.process_spider_exception: ReferenceError caught", str(log5))
        self.assertNotIn("GeneratorDoNothingAfterRecoveryMiddleware.process_spider_exception: ReferenceError caught", str(log5))
        item_recovered = {'processed': [
            'NotGeneratorRecoverMiddleware.process_spider_exception',
            'NotGeneratorDoNothingAfterRecoveryMiddleware.process_spider_output']}
        self.assertIn(str(item_recovered), str(log5))
        self.assertNotIn('parse-first-item', str(log5))
        self.assertNotIn('parse-second-item', str(log5))
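The chain tests above rely on the recovery pattern where a process_spider_exception method returns an iterable; that iterable is then handed to the remaining process_spider_output methods in the chain. A condensed sketch of the pattern, modeled on RecoveryMiddleware above (the middleware name and item fields are illustrative):

from scrapy import Request


class ErrorRecoveryMiddleware:
    def process_spider_exception(self, response, exception, spider):
        spider.logger.info('Recovering from %s', exception.__class__.__name__)
        return [
            {'error': exception.__class__.__name__, 'url': response.url},
            Request(response.url, dont_filter=True),  # optionally retry the page
        ]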
@ -3,12 +3,13 @@ import os
 import unittest
 
 from scrapy.item import Item, Field
-from scrapy.utils.misc import arg_to_iter, create_instance, load_object, walk_modules
+from scrapy.utils.misc import arg_to_iter, create_instance, load_object, set_environ, walk_modules
 
 from tests import mock
 
 __doctests__ = ['scrapy.utils.misc']
 
 
 class UtilsMiscTestCase(unittest.TestCase):
 
     def test_load_object(self):
@ -130,5 +131,18 @@ class UtilsMiscTestCase(unittest.TestCase):
         with self.assertRaises(ValueError):
             create_instance(m, None, None)
 
+    def test_set_environ(self):
+        assert os.environ.get('some_test_environ') is None
+        with set_environ(some_test_environ='test_value'):
+            assert os.environ.get('some_test_environ') == 'test_value'
+        assert os.environ.get('some_test_environ') is None
+
+        os.environ['some_test_environ'] = 'test'
+        assert os.environ.get('some_test_environ') == 'test'
+        with set_environ(some_test_environ='test_value'):
+            assert os.environ.get('some_test_environ') == 'test_value'
+        assert os.environ.get('some_test_environ') == 'test'
+
 
 if __name__ == "__main__":
     unittest.main()
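test_set_environ above pins down the behaviour of the new set_environ helper: the given variables hold for the duration of the with block, and the previous value (or absence) is restored afterwards. A rough sketch of such a helper, for illustration only and not necessarily Scrapy's actual implementation:

import os
from contextlib import contextmanager


@contextmanager
def set_environ(**kwargs):
    # remember the current values (None means "was not set")
    original = {k: os.environ.get(k) for k in kwargs}
    os.environ.update(kwargs)
    try:
        yield
    finally:
        for k, v in original.items():
            if v is None:
                os.environ.pop(k, None)
            else:
                os.environ[k] = v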
@ -9,11 +9,23 @@ import six
 from scrapy.utils.python import (
     memoizemethod_noargs, binary_is_text, equal_attributes,
     WeakKeyCache, stringify_dict, get_func_args, to_bytes, to_unicode,
-    without_none_values)
+    without_none_values, MutableChain)
 
 __doctests__ = ['scrapy.utils.python']
 
 
+class MutableChainTest(unittest.TestCase):
+    def test_mutablechain(self):
+        m = MutableChain(range(2), [2, 3], (4, 5))
+        m.extend(range(6, 7))
+        m.extend([7, 8])
+        m.extend([9, 10], (11, 12))
+        self.assertEqual(next(m), 0)
+        self.assertEqual(m.next(), 1)
+        self.assertEqual(m.__next__(), 2)
+        self.assertEqual(list(m), list(range(3, 13)))
+
+
 class ToUnicodeTest(unittest.TestCase):
     def test_converting_an_utf8_encoded_string_to_unicode(self):
         self.assertEqual(to_unicode(b'lel\xc3\xb1e'), u'lel\xf1e')
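The MutableChain test above describes an iterator over several iterables that can still be extended while it is being consumed. A behavioural sketch consistent with that test, again not claimed to be Scrapy's actual implementation:

import itertools


class MutableChain:
    """Iterate over several iterables; more iterables can be appended mid-iteration."""

    def __init__(self, *iterables):
        self.data = itertools.chain(*iterables)

    def extend(self, *iterables):
        # re-wrap the remaining items together with the newly appended iterables
        self.data = itertools.chain(self.data, *iterables)

    def __iter__(self):
        return self

    def __next__(self):
        return next(self.data)

    next = __next__  # Python 2 alias, matching the m.next() call in the test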
@ -1,9 +1,12 @@
 # -*- coding: utf-8 -*-
 import unittest
+import sys
+
+import six
 
 from scrapy.http import Request, FormRequest
 from scrapy.spiders import Spider
-from scrapy.utils.reqser import request_to_dict, request_from_dict
+from scrapy.utils.reqser import request_to_dict, request_from_dict, _is_private_method, _mangle_private_name
 
 
 class RequestSerializationTest(unittest.TestCase):
@ -70,6 +73,56 @@ class RequestSerializationTest(unittest.TestCase):
                     errback=self.spider.handle_error)
         self._assert_serializes_ok(r, spider=self.spider)
 
+    def test_private_callback_serialization(self):
+        r = Request("http://www.example.com",
+                    callback=self.spider._TestSpider__parse_item_private,
+                    errback=self.spider.handle_error)
+        self._assert_serializes_ok(r, spider=self.spider)
+
+    def test_mixin_private_callback_serialization(self):
+        if sys.version_info[0] < 3:
+            return
+        r = Request("http://www.example.com",
+                    callback=self.spider._TestSpiderMixin__mixin_callback,
+                    errback=self.spider.handle_error)
+        self._assert_serializes_ok(r, spider=self.spider)
+
+    def test_private_callback_name_matching(self):
+        self.assertTrue(_is_private_method('__a'))
+        self.assertTrue(_is_private_method('__a_'))
+        self.assertTrue(_is_private_method('__a_a'))
+        self.assertTrue(_is_private_method('__a_a_'))
+        self.assertTrue(_is_private_method('__a__a'))
+        self.assertTrue(_is_private_method('__a__a_'))
+        self.assertTrue(_is_private_method('__a___a'))
+        self.assertTrue(_is_private_method('__a___a_'))
+        self.assertTrue(_is_private_method('___a'))
+        self.assertTrue(_is_private_method('___a_'))
+        self.assertTrue(_is_private_method('___a_a'))
+        self.assertTrue(_is_private_method('___a_a_'))
+        self.assertTrue(_is_private_method('____a_a_'))
+
+        self.assertFalse(_is_private_method('_a'))
+        self.assertFalse(_is_private_method('_a_'))
+        self.assertFalse(_is_private_method('__a__'))
+        self.assertFalse(_is_private_method('__'))
+        self.assertFalse(_is_private_method('___'))
+        self.assertFalse(_is_private_method('____'))
+
+    def _assert_mangles_to(self, obj, name):
+        func = getattr(obj, name)
+        self.assertEqual(
+            _mangle_private_name(obj, func, func.__name__),
+            name
+        )
+
+    def test_private_name_mangling(self):
+        self._assert_mangles_to(
+            self.spider, '_TestSpider__parse_item_private')
+        if sys.version_info[0] >= 3:
+            self._assert_mangles_to(
+                self.spider, '_TestSpiderMixin__mixin_callback')
+
     def test_unserializable_callback1(self):
         r = Request("http://www.example.com", callback=lambda x: x)
         self.assertRaises(ValueError, request_to_dict, r)
@ -80,7 +133,12 @@ class RequestSerializationTest(unittest.TestCase):
         self.assertRaises(ValueError, request_to_dict, r)
 
 
-class TestSpider(Spider):
+class TestSpiderMixin(object):
+    def __mixin_callback(self, response):
+        pass
+
+
+class TestSpider(Spider, TestSpiderMixin):
     name = 'test'
 
     def parse_item(self, response):
@ -89,6 +147,9 @@ class TestSpider(Spider):
     def handle_error(self, failure):
         pass
 
+    def __parse_item_private(self, response):
+        pass
+
 
 class CustomRequest(Request):
     pass
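The new reqser tests lean on Python's private-name mangling: a double-underscore method defined on TestSpider is stored as _TestSpider__parse_item_private, and _mangle_private_name must reproduce exactly that name when serializing the callback. A short illustration with made-up names:

# Made-up class; demonstrates the name mangling the tests above depend on.
class MySpider:
    def __private_callback(self, response):  # stored as _MySpider__private_callback
        return response


spider = MySpider()
assert hasattr(spider, '_MySpider__private_callback')
assert not hasattr(spider, '__private_callback')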
6
tox.ini
@ -105,6 +105,12 @@ deps = {[docs]deps}
 commands =
     sphinx-build -W -b html . {envtmpdir}/html
 
+[testenv:docs-coverage]
+changedir = {[docs]changedir}
+deps = {[docs]deps}
+commands =
+    sphinx-build -b coverage . {envtmpdir}/coverage
+
 [testenv:docs-links]
 changedir = {[docs]changedir}
 deps = {[docs]deps}