Merge remote-tracking branch 'origin/master' into callback_kwargs
commit 428309ba1a
@ -12,7 +12,8 @@ branches:
|
||||
|
||||
install:
|
||||
- "SET PATH=%PYTHON%;%PYTHON%\\Scripts;%PATH%"
|
||||
- "SET TOX_TESTENV_PASSENV=HOME USERPROFILE HOMEPATH HOMEDRIVE"
|
||||
- "SET PYTHONPATH=%APPVEYOR_BUILD_FOLDER%"
|
||||
- "SET TOX_TESTENV_PASSENV=HOME HOMEDRIVE HOMEPATH PYTHONPATH USERPROFILE"
|
||||
- "pip install -U tox"
|
||||
|
||||
build: false
|
||||
|
@ -82,6 +82,9 @@ pydoc-topics: build
|
||||
@echo "Building finished; now copy build/pydoc-topics/pydoc_topics.py " \
|
||||
"into the Lib/ directory"
|
||||
|
||||
coverage: BUILDER = coverage
|
||||
coverage: build
|
||||
|
||||
htmlview: html
|
||||
$(PYTHON) -c "import webbrowser, os; webbrowser.open('file://' + \
|
||||
os.path.realpath('build/html/index.html'))"
|
||||
|
22
docs/conf.py
@ -28,7 +28,8 @@ sys.path.insert(0, path.dirname(path.dirname(__file__)))
|
||||
# coming with Sphinx (named 'sphinx.ext.*') or your custom ones.
|
||||
extensions = [
|
||||
'scrapydocs',
|
||||
'sphinx.ext.autodoc'
|
||||
'sphinx.ext.autodoc',
|
||||
'sphinx.ext.coverage',
|
||||
]
|
||||
|
||||
# Add any paths that contain templates here, relative to this directory.
|
||||
@ -218,3 +219,22 @@ linkcheck_ignore = [
|
||||
'http://localhost:\d+', 'http://hg.scrapy.org',
|
||||
'http://directory.google.com/'
|
||||
]
|
||||
|
||||
|
||||
# Options for the Coverage extension
|
||||
# ----------------------------------
|
||||
coverage_ignore_pyobjects = [
|
||||
# Contract’s add_pre_hook and add_post_hook are not documented because
|
||||
# they should be transparent to contract developers, for whom pre_hook and
|
||||
# post_hook should be the actual concern.
|
||||
r'\bContract\.add_(pre|post)_hook$',
|
||||
|
||||
# ContractsManager is an internal class, developers are not expected to
|
||||
# interact with it directly in any way.
|
||||
r'\bContractsManager\b$',
|
||||
|
||||
# For default contracts we only want to document their general purpose in
|
||||
# their constructor, the methods they reimplement to achieve that purpose
|
||||
# should be irrelevant to developers using those contracts.
|
||||
r'\w+Contract\.(adjust_request_args|(pre|post)_process)$',
|
||||
]
|
||||
|
@ -99,6 +99,15 @@ Well-written patches should:
|
||||
the documentation changes in the same patch. See `Documentation policies`_
|
||||
below.
|
||||
|
||||
* if you're adding a private API, please add a regular expression to the
|
||||
``coverage_ignore_pyobjects`` variable of ``docs/conf.py`` to exclude the new
|
||||
private API from documentation coverage checks.
|
||||
|
||||
To see if your private API is skipped properly, generate a documentation
|
||||
coverage report as follows::
|
||||
|
||||
tox -e docs-coverage
|
||||
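For example, if the private API you added were a hypothetical
``MyComponent._helper`` method, the corresponding ``docs/conf.py`` entry could
look like this (a sketch; adjust the pattern to your actual API)::

    coverage_ignore_pyobjects = [
        # ... existing patterns ...
        r'\bMyComponent\._helper$',
    ]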
|
||||
.. _submitting-patches:
|
||||
|
||||
Submitting patches
|
||||
@ -167,8 +176,9 @@ Documentation policies
|
||||
|
||||
For reference documentation of API members (classes, methods, etc.) use
|
||||
docstrings and make sure that the Sphinx documentation uses the autodoc_
|
||||
extension to pull the docstrings. API reference documentation should be
|
||||
IDE-friendly: short, to the point, and it may provide short examples.
|
||||
extension to pull the docstrings. API reference documentation should follow
|
||||
docstring conventions (`PEP 257`_) and be IDE-friendly: short, to the point,
|
||||
and it may provide short examples.
|
||||
|
||||
Other types of documentation, such as tutorials or topics, should be covered in
|
||||
files within the ``docs/`` directory. This includes documentation that is
|
||||
@ -205,6 +215,29 @@ To run a specific test (say ``tests/test_loader.py``) use:
|
||||
|
||||
``tox -- tests/test_loader.py``
|
||||
|
||||
To run the tests on a specific tox_ environment, use ``-e <name>`` with an
|
||||
environment name from ``tox.ini``. For example, to run the tests with Python
|
||||
3.6 use::
|
||||
|
||||
tox -e py36
|
||||
|
||||
You can also specify a comma-separated list of environments, and use `tox’s
|
||||
parallel mode`_ to run the tests on multiple environments in parallel::
|
||||
|
||||
tox -e py27,py36 -p auto
|
||||
|
||||
To pass command-line options to pytest_, add them after ``--`` in your call to
|
||||
tox_. Using ``--`` overrides the default positional arguments defined in
|
||||
``tox.ini``, so you must include those default positional arguments
|
||||
(``scrapy tests``) after ``--`` as well::
|
||||
|
||||
tox -- scrapy tests -x # stop after first failure
|
||||
|
||||
You can also use the `pytest-xdist`_ plugin. For example, to run all tests on
|
||||
the Python 3.6 tox_ environment using all your CPU cores::
|
||||
|
||||
tox -e py36 -- scrapy tests -n auto
|
||||
|
||||
To see the coverage report, install `coverage`_ (``pip install coverage``) and run:
|
||||
|
||||
``coverage report``
|
||||
@ -237,5 +270,9 @@ And their unit-tests are in::
|
||||
.. _AUTHORS: https://github.com/scrapy/scrapy/blob/master/AUTHORS
|
||||
.. _tests/: https://github.com/scrapy/scrapy/tree/master/tests
|
||||
.. _open issues: https://github.com/scrapy/scrapy/issues
|
||||
.. _pull request: https://help.github.com/send-pull-requests/
|
||||
.. _PEP 257: https://www.python.org/dev/peps/pep-0257/
|
||||
.. _pull request: https://help.github.com/en/articles/creating-a-pull-request
|
||||
.. _pytest: https://docs.pytest.org/en/latest/usage.html
|
||||
.. _pytest-xdist: https://docs.pytest.org/en/3.0.0/xdist.html
|
||||
.. _tox: https://pypi.python.org/pypi/tox
|
||||
.. _tox’s parallel mode: https://tox.readthedocs.io/en/latest/example/basic.html#parallel-mode
|
||||
|
@ -158,6 +158,7 @@ Solving specific problems
|
||||
topics/practices
|
||||
topics/broad-crawls
|
||||
topics/developer-tools
|
||||
topics/dynamic-content
|
||||
topics/leaks
|
||||
topics/media-pipeline
|
||||
topics/deploy
|
||||
@ -183,6 +184,9 @@ Solving specific problems
|
||||
:doc:`topics/developer-tools`
|
||||
Learn how to scrape with your browser's developer tools.
|
||||
|
||||
:doc:`topics/dynamic-content`
|
||||
Read webpage data that is loaded dynamically.
|
||||
|
||||
:doc:`topics/leaks`
|
||||
Learn how to find and get rid of memory leaks in your crawler.
|
||||
|
||||
|
@ -205,7 +205,7 @@ Extracting data
|
||||
---------------
|
||||
|
||||
The best way to learn how to extract data with Scrapy is trying selectors
|
||||
using the shell :ref:`Scrapy shell <topics-shell>`. Run::
|
||||
using the :ref:`Scrapy shell <topics-shell>`. Run::
|
||||
|
||||
scrapy shell 'http://quotes.toscrape.com/page/1/'
|
||||
|
||||
@ -296,8 +296,8 @@ expressions`_::
|
||||
|
||||
In order to find the proper CSS selectors to use, you might find it useful to open
the response page from the shell in your web browser using ``view(response)``.
|
||||
You can use your browser developer tools to inspect the HTML and come up
|
||||
with a selector (see section about :ref:`topics-developer-tools`).
|
||||
You can use your browser's developer tools to inspect the HTML and come up
|
||||
with a selector (see :ref:`topics-developer-tools`).
|
||||
|
||||
`Selector Gadget`_ is also a nice tool to quickly find CSS selectors for
visually selected elements, which works in many browsers.
|
||||
@ -379,11 +379,11 @@ variable, so that we can run our CSS selectors directly on a particular quote::
|
||||
|
||||
>>> quote = response.css("div.quote")[0]
|
||||
|
||||
Now, let's extract ``title``, ``author`` and the ``tags`` from that quote
|
||||
Now, let's extract ``text``, ``author`` and the ``tags`` from that quote
|
||||
using the ``quote`` object we just created::
|
||||
|
||||
>>> title = quote.css("span.text::text").get()
|
||||
>>> title
|
||||
>>> text = quote.css("span.text::text").get()
|
||||
>>> text
|
||||
'“The world as we have created it is a process of our thinking. It cannot be changed without changing our thinking.”'
|
||||
>>> author = quote.css("small.author::text").get()
|
||||
>>> author
|
||||
@ -511,7 +511,7 @@ We can try extracting it in the shell::
|
||||
'<a href="/page/2/">Next <span aria-hidden="true">→</span></a>'
|
||||
|
||||
This gets the anchor element, but we want the attribute ``href``. For that,
|
||||
Scrapy supports a CSS extension that let's you select the attribute contents,
|
||||
Scrapy supports a CSS extension that lets you select the attribute contents,
|
||||
like this::
|
||||
|
||||
>>> response.css('li.next a::attr(href)').get()
|
||||
|
@ -1,2 +1,2 @@
|
||||
Sphinx>=1.6
|
||||
Sphinx>=2.1
|
||||
sphinx_rtd_theme
|
@ -99,6 +99,8 @@ how you :ref:`configure the downloader middlewares
|
||||
|
||||
Returns a deferred that is fired when the crawl is finished.
|
||||
|
||||
.. automethod:: stop
|
||||
|
||||
.. autoclass:: CrawlerRunner
|
||||
:members:
|
||||
|
||||
@ -154,7 +156,7 @@ Settings API
|
||||
SpiderLoader API
|
||||
================
|
||||
|
||||
.. module:: scrapy.loader
|
||||
.. module:: scrapy.spiderloader
|
||||
:synopsis: The spider loader
|
||||
|
||||
.. class:: SpiderLoader
|
||||
|
@ -39,6 +39,17 @@ you need to keep in mind when using Scrapy for doing broad crawls, along with
|
||||
concrete suggestions of Scrapy settings to tune in order to achieve an
|
||||
efficient broad crawl.
|
||||
|
||||
Use the right :setting:`SCHEDULER_PRIORITY_QUEUE`
|
||||
=================================================
|
||||
|
||||
Scrapy’s default scheduler priority queue is ``'scrapy.pqueues.ScrapyPriorityQueue'``.
|
||||
It works best during single-domain crawls. It does not work well when crawling
many different domains in parallel.

To apply the recommended priority queue, use::
|
||||
|
||||
SCHEDULER_PRIORITY_QUEUE = 'scrapy.pqueues.DownloaderAwarePriorityQueue'
|
||||
|
||||
Increase concurrency
|
||||
====================
|
||||
|
||||
@ -85,7 +96,7 @@ When doing broad crawls you are often only interested in the crawl rates you
|
||||
get and any errors found. These stats are reported by Scrapy when using the
|
||||
``INFO`` log level. In order to save CPU (and log storage requirements) you
|
||||
should not use ``DEBUG`` log level when performing large broad crawls in
|
||||
production. Using ``DEBUG`` level when developing your (broad) crawler may be
|
||||
production. Using ``DEBUG`` level when developing your (broad) crawler may be
|
||||
fine though.
|
||||
|
||||
To set the log level use::
|
||||
|
@ -120,3 +120,23 @@ get the failures pretty printed::
|
||||
for header in self.args:
|
||||
if header not in response.headers:
|
||||
raise ContractFail('X-CustomHeader not present')
|
||||
|
||||
|
||||
Detecting check runs
|
||||
====================
|
||||
|
||||
When ``scrapy check`` is running, the ``SCRAPY_CHECK`` environment variable is
|
||||
set to the ``true`` string. You can use `os.environ`_ to perform any change to
|
||||
your spiders or your settings when ``scrapy check`` is used::
|
||||
|
||||
import os
|
||||
import scrapy
|
||||
|
||||
class ExampleSpider(scrapy.Spider):
|
||||
name = 'example'
|
||||
|
||||
def __init__(self):
|
||||
if os.environ.get('SCRAPY_CHECK'):
|
||||
pass # Do some scraper adjustments when a check is running
|
||||
|
||||
.. _os.environ: https://docs.python.org/3/library/os.html#os.environ
|
||||
|
@ -805,6 +805,7 @@ The :class:`MetaRefreshMiddleware` can be configured through the following
|
||||
settings (see the settings documentation for more info):
|
||||
|
||||
* :setting:`METAREFRESH_ENABLED`
|
||||
* :setting:`METAREFRESH_IGNORE_TAGS`
|
||||
* :setting:`METAREFRESH_MAXDELAY`
|
||||
|
||||
This middleware obey :setting:`REDIRECT_MAX_TIMES` setting, :reqmeta:`dont_redirect`,
|
||||
@ -826,6 +827,15 @@ Default: ``True``
|
||||
|
||||
Whether the Meta Refresh middleware will be enabled.
|
||||
|
||||
.. setting:: METAREFRESH_IGNORE_TAGS
|
||||
|
||||
METAREFRESH_IGNORE_TAGS
|
||||
^^^^^^^^^^^^^^^^^^^^^^^
|
||||
|
||||
Default: ``['script', 'noscript']``
|
||||
|
||||
Meta tags within these tags are ignored.
|
||||
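For example, to also ignore ``<meta>`` tags found inside ``<iframe>`` elements
(a hypothetical addition to the default list), you could set this in your
``settings.py``::

    METAREFRESH_IGNORE_TAGS = ['script', 'noscript', 'iframe']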
|
||||
.. setting:: METAREFRESH_MAXDELAY
|
||||
|
||||
METAREFRESH_MAXDELAY
|
||||
|
246
docs/topics/dynamic-content.rst
Normal file
@ -0,0 +1,246 @@
|
||||
.. _topics-dynamic-content:
|
||||
|
||||
====================================
|
||||
Selecting dynamically-loaded content
|
||||
====================================
|
||||
|
||||
Some webpages show the desired data when you load them in a web browser.
|
||||
However, when you download them using Scrapy, you cannot reach the desired data
|
||||
using :ref:`selectors <topics-selectors>`.
|
||||
|
||||
When this happens, the recommended approach is to
|
||||
:ref:`find the data source <topics-finding-data-source>` and extract the data
|
||||
from it.
|
||||
|
||||
If you fail to do that, and you can nonetheless access the desired data through
|
||||
the :ref:`DOM <topics-livedom>` from your web browser, see
|
||||
:ref:`topics-javascript-rendering`.
|
||||
|
||||
.. _topics-finding-data-source:
|
||||
|
||||
Finding the data source
|
||||
=======================
|
||||
|
||||
To extract the desired data, you must first find its source location.
|
||||
|
||||
If the data is in a non-text-based format, such as an image or a PDF document,
|
||||
use the :ref:`network tool <topics-network-tool>` of your web browser to find
|
||||
the corresponding request, and :ref:`reproduce it
|
||||
<topics-reproducing-requests>`.
|
||||
|
||||
If your web browser lets you select the desired data as text, the data may be
|
||||
defined in embedded JavaScript code, or loaded from an external resource in a
|
||||
text-based format.
|
||||
|
||||
In that case, you can use a tool like wgrep_ to find the URL of that resource.
|
||||
|
||||
If the data turns out to come from the original URL itself, you must
|
||||
:ref:`inspect the source code of the webpage <topics-inspecting-source>` to
|
||||
determine where the data is located.
|
||||
|
||||
If the data comes from a different URL, you will need to :ref:`reproduce the
|
||||
corresponding request <topics-reproducing-requests>`.
|
||||
|
||||
.. _topics-inspecting-source:
|
||||
|
||||
Inspecting the source code of a webpage
|
||||
=======================================
|
||||
|
||||
Sometimes you need to inspect the source code of a webpage (not the
|
||||
:ref:`DOM <topics-livedom>`) to determine where some desired data is located.
|
||||
|
||||
Use Scrapy’s :command:`fetch` command to download the webpage contents as seen
|
||||
by Scrapy::
|
||||
|
||||
scrapy fetch --nolog https://example.com > response.html
|
||||
|
||||
If the desired data is in embedded JavaScript code within a ``<script/>``
|
||||
element, see :ref:`topics-parsing-javascript`.
|
||||
|
||||
If you cannot find the desired data, first make sure it’s not just Scrapy:
|
||||
download the webpage with an HTTP client like curl_ or wget_ and see if the
|
||||
information can be found in the response they get.
|
||||
|
||||
If they get a response with the desired data, modify your Scrapy
|
||||
:class:`~scrapy.http.Request` to match that of the other HTTP client. For
|
||||
example, try using the same user-agent string (:setting:`USER_AGENT`) or the
|
||||
same :attr:`~scrapy.http.Request.headers`.
|
||||
|
||||
If they also get a response without the desired data, you’ll need to take
|
||||
steps to make your request more similar to that of the web browser. See
|
||||
:ref:`topics-reproducing-requests`.
|
||||
|
||||
.. _topics-reproducing-requests:
|
||||
|
||||
Reproducing requests
|
||||
====================
|
||||
|
||||
Sometimes we need to reproduce a request the way our web browser performs it.
|
||||
|
||||
Use the :ref:`network tool <topics-network-tool>` of your web browser to see
|
||||
how your web browser performs the desired request, and try to reproduce that
|
||||
request with Scrapy.
|
||||
|
||||
It might be enough to yield a :class:`~scrapy.http.Request` with the same HTTP
|
||||
method and URL. However, you may also need to reproduce the body, headers and
|
||||
form parameters (see :class:`~scrapy.http.FormRequest`) of that request.
|
||||
|
||||
Once you get the expected response, you can :ref:`extract the desired data from
|
||||
it <topics-handling-response-formats>`.
|
||||
|
||||
You can reproduce any request with Scrapy. However, sometimes reproducing all
necessary requests may not be efficient in developer time. If that is your
|
||||
case, and crawling speed is not a major concern for you, you can alternatively
|
||||
consider :ref:`JavaScript pre-rendering <topics-javascript-rendering>`.
|
||||
|
||||
If you get the expected response *sometimes*, but not always, the issue is
|
||||
probably not your request, but the target server. The target server might be
|
||||
buggy, overloaded, or :ref:`banning <bans>` some of your requests.
|
||||
|
||||
.. _topics-handling-response-formats:
|
||||
|
||||
Handling different response formats
|
||||
===================================
|
||||
|
||||
Once you have a response with the desired data, how you extract the desired
|
||||
data from it depends on the type of response:
|
||||
|
||||
- If the response is HTML or XML, use :ref:`selectors
|
||||
<topics-selectors>` as usual.
|
||||
|
||||
- If the response is JSON, use `json.loads`_ to load the desired data from
|
||||
:attr:`response.text <scrapy.http.TextResponse.text>`::
|
||||
|
||||
data = json.loads(response.text)
|
||||
|
||||
If the desired data is inside HTML or XML code embedded within JSON data,
|
||||
you can load that HTML or XML code into a
|
||||
:class:`~scrapy.selector.Selector` and then
|
||||
:ref:`use it <topics-selectors>` as usual::
|
||||
|
||||
selector = Selector(text=data['html'])
|
||||
|
||||
- If the response is JavaScript, or HTML with a ``<script/>`` element
|
||||
containing the desired data, see :ref:`topics-parsing-javascript`.
|
||||
|
||||
- If the response is CSS, use a `regular expression`_ to extract the desired
|
||||
data from :attr:`response.text <scrapy.http.TextResponse.text>`.
|
||||
|
||||
.. _topics-parsing-images:
|
||||
|
||||
- If the response is an image or another format based on images (e.g. PDF),
|
||||
read the response as bytes from
|
||||
:attr:`response.body <scrapy.http.TextResponse.body>` and use an OCR
|
||||
solution to extract the desired data as text.
|
||||
|
||||
For example, you can use pytesseract_ (see the sketch after this list). To
read a table from a PDF, `tabula-py`_ may be a better choice.
|
||||
|
||||
- If the response is SVG, or HTML with embedded SVG containing the desired
|
||||
data, you may be able to extract the desired data using
|
||||
:ref:`selectors <topics-selectors>`, since SVG is based on XML.
|
||||
|
||||
Otherwise, you might need to convert the SVG code into a raster image, and
|
||||
:ref:`handle that raster image <topics-parsing-images>`.
|
||||
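As a sketch of the OCR approach mentioned in the list above, assuming an image
response and that pytesseract_ and Pillow are installed, a spider callback
could do::

    import io

    import pytesseract
    from PIL import Image

    def parse(self, response):
        # response.body holds the raw image bytes downloaded by Scrapy.
        image = Image.open(io.BytesIO(response.body))
        text = pytesseract.image_to_string(image)
        self.logger.info('Extracted text: %s', text)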
|
||||
.. _topics-parsing-javascript:
|
||||
|
||||
Parsing JavaScript code
|
||||
=======================
|
||||
|
||||
If the desired data is hardcoded in JavaScript, you first need to get the
|
||||
JavaScript code:
|
||||
|
||||
- If the JavaScript code is in a JavaScript file, simply read
|
||||
:attr:`response.text <scrapy.http.TextResponse.text>`.
|
||||
|
||||
- If the JavaScript code is within a ``<script/>`` element of an HTML page,
|
||||
use :ref:`selectors <topics-selectors>` to extract the text within that
|
||||
``<script/>`` element.
|
||||
|
||||
Once you have a string with the JavaScript code, you can extract the desired
|
||||
data from it:
|
||||
|
||||
- You might be able to use a `regular expression`_ to extract the desired
|
||||
data in JSON format, which you can then parse with `json.loads`_.
|
||||
|
||||
For example, if the JavaScript code contains a separate line like
|
||||
``var data = {"field": "value"};`` you can extract that data as follows::
|
||||
|
||||
>>> pattern = r'\bvar\s+data\s*=\s*(\{.*?\})\s*;\s*\n'
|
||||
>>> json_data = response.css('script::text').re_first(pattern)
|
||||
>>> json.loads(json_data)
|
||||
{'field': 'value'}
|
||||
|
||||
- Otherwise, use js2xml_ to convert the JavaScript code into an XML document
|
||||
that you can parse using :ref:`selectors <topics-selectors>`.
|
||||
|
||||
For example, if the JavaScript code contains
|
||||
``var data = {field: "value"};`` you can extract that data as follows::
|
||||
|
||||
>>> import js2xml
|
||||
>>> import lxml.etree
|
||||
>>> from parsel import Selector
|
||||
>>> javascript = response.css('script::text').get()
|
||||
>>> xml = lxml.etree.tostring(js2xml.parse(javascript), encoding='unicode')
|
||||
>>> selector = Selector(text=xml)
|
||||
>>> selector.css('var[name="data"]').get()
|
||||
'<var name="data"><object><property name="field"><string>value</string></property></object></var>'
|
||||
|
||||
.. _topics-javascript-rendering:
|
||||
|
||||
Pre-rendering JavaScript
|
||||
========================
|
||||
|
||||
On webpages that fetch data from additional requests, reproducing those
|
||||
requests that contain the desired data is the preferred approach. The effort is
|
||||
often worth the result: structured, complete data with minimum parsing time and
|
||||
network transfer.
|
||||
|
||||
However, sometimes it can be really hard to reproduce certain requests. Or you
|
||||
may need something that no request can give you, such as a screenshot of a
|
||||
webpage as seen in a web browser.
|
||||
|
||||
In these cases use the Splash_ JavaScript-rendering service, along with
|
||||
`scrapy-splash`_ for seamless integration.
|
||||
|
||||
Splash returns as HTML the :ref:`DOM <topics-livedom>` of a webpage, so that
|
||||
you can parse it with :ref:`selectors <topics-selectors>`. It provides great
|
||||
flexibility through configuration_ or scripting_.
|
||||
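A minimal sketch, assuming Splash is running and `scrapy-splash`_ is configured
in your settings as described in its README (``SPLASH_URL`` plus the
scrapy-splash middlewares)::

    from scrapy_splash import SplashRequest

    def start_requests(self):
        # Wait a couple of seconds so the page's JavaScript has time to run.
        yield SplashRequest('https://example.com', self.parse, args={'wait': 2})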
|
||||
If you need something beyond what Splash offers, such as interacting with the
|
||||
DOM on-the-fly from Python code instead of using a previously-written script,
|
||||
or handling multiple web browser windows, you might need to
|
||||
:ref:`use a headless browser <topics-headless-browsing>` instead.
|
||||
|
||||
.. _configuration: https://splash.readthedocs.io/en/stable/api.html
|
||||
.. _scripting: https://splash.readthedocs.io/en/stable/scripting-tutorial.html
|
||||
|
||||
.. _topics-headless-browsing:
|
||||
|
||||
Using a headless browser
|
||||
========================
|
||||
|
||||
A `headless browser`_ is a special web browser that provides an API for
|
||||
automation.
|
||||
|
||||
The easiest way to use a headless browser with Scrapy is to use Selenium_,
|
||||
along with `scrapy-selenium`_ for seamless integration.
|
||||
|
||||
|
||||
.. _AJAX: https://en.wikipedia.org/wiki/Ajax_%28programming%29
|
||||
.. _CSS: https://en.wikipedia.org/wiki/Cascading_Style_Sheets
|
||||
.. _curl: https://curl.haxx.se/
|
||||
.. _headless browser: https://en.wikipedia.org/wiki/Headless_browser
|
||||
.. _JavaScript: https://en.wikipedia.org/wiki/JavaScript
|
||||
.. _js2xml: https://github.com/scrapinghub/js2xml
|
||||
.. _json.loads: https://docs.python.org/library/json.html#json.loads
|
||||
.. _pytesseract: https://github.com/madmaze/pytesseract
|
||||
.. _regular expression: https://docs.python.org/library/re.html
|
||||
.. _scrapy-selenium: https://github.com/clemfromspace/scrapy-selenium
|
||||
.. _scrapy-splash: https://github.com/scrapy-plugins/scrapy-splash
|
||||
.. _Selenium: https://www.seleniumhq.org/
|
||||
.. _Splash: https://github.com/scrapinghub/splash
|
||||
.. _tabula-py: https://github.com/chezou/tabula-py
|
||||
.. _wget: https://www.gnu.org/software/wget/
|
||||
.. _wgrep: https://github.com/stav/wgrep
|
@ -238,9 +238,10 @@ scrapy.utils.log module
|
||||
|
||||
.. autofunction:: configure_logging
|
||||
|
||||
``configure_logging`` is automatically called when using Scrapy commands,
|
||||
but needs to be called explicitly when running custom scripts. In that
|
||||
case, its usage is not required but it's recommended.
|
||||
``configure_logging`` is automatically called when using Scrapy commands
|
||||
or :class:`~scrapy.crawler.CrawlerProcess`, but needs to be called explicitly
|
||||
when running custom scripts using :class:`~scrapy.crawler.CrawlerRunner`.
|
||||
In that case, its usage is not required but it's recommended.
|
||||
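For example, a minimal stand-alone script using
:class:`~scrapy.crawler.CrawlerRunner` could call it as follows (the spider is
a placeholder)::

    import scrapy
    from scrapy.crawler import CrawlerRunner
    from scrapy.utils.log import configure_logging
    from twisted.internet import reactor

    class MySpider(scrapy.Spider):
        name = 'example'
        start_urls = ['https://example.com']

        def parse(self, response):
            self.logger.info('Visited %s', response.url)

    configure_logging({'LOG_FORMAT': '%(levelname)s: %(message)s'})
    runner = CrawlerRunner()
    d = runner.crawl(MySpider)
    d.addBoth(lambda _: reactor.stop())
    reactor.run()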
|
||||
If you plan on configuring the handlers yourself, it is still recommended that
you call this function, passing ``install_root_handler=False``. Bear in mind
|
||||
|
@ -897,6 +897,16 @@ Default: ``False``
|
||||
If ``True``, the logs will just contain the root path. If it is set to ``False``
then it displays the component responsible for the log output.
|
||||
|
||||
.. setting:: LOGSTATS_INTERVAL
|
||||
|
||||
LOGSTATS_INTERVAL
|
||||
-----------------
|
||||
|
||||
Default: ``60.0``
|
||||
|
||||
The interval (in seconds) between each logging printout of the stats
|
||||
by :class:`~scrapy.extensions.logstats.LogStats`.
|
||||
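For example, to log the stats every five minutes instead of every minute::

    LOGSTATS_INTERVAL = 300.0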
|
||||
.. setting:: MEMDEBUG_ENABLED
|
||||
|
||||
MEMDEBUG_ENABLED
|
||||
@ -1155,9 +1165,14 @@ Type of in-memory queue used by scheduler. Other available type is:
|
||||
|
||||
SCHEDULER_PRIORITY_QUEUE
|
||||
------------------------
|
||||
Default: ``'queuelib.PriorityQueue'``
|
||||
Default: ``'scrapy.pqueues.ScrapyPriorityQueue'``
|
||||
|
||||
Type of priority queue used by scheduler.
|
||||
Type of priority queue used by the scheduler. Another available type is
|
||||
``scrapy.pqueues.DownloaderAwarePriorityQueue``.
|
||||
``scrapy.pqueues.DownloaderAwarePriorityQueue`` works better than
|
||||
``scrapy.pqueues.ScrapyPriorityQueue`` when you crawl many different
|
||||
domains in parallel. But currently ``scrapy.pqueues.DownloaderAwarePriorityQueue``
|
||||
does not work together with :setting:`CONCURRENT_REQUESTS_PER_IP`.
|
||||
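For example, to switch to the downloader-aware queue for a broad, multi-domain
crawl, set this in your ``settings.py``::

    SCHEDULER_PRIORITY_QUEUE = 'scrapy.pqueues.DownloaderAwarePriorityQueue'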
|
||||
.. setting:: SPIDER_CONTRACTS
|
||||
|
||||
|
@ -82,7 +82,8 @@ object gives you access, for example, to the :ref:`settings <topics-settings>`.
|
||||
|
||||
If it raises an exception, Scrapy won't bother calling any other spider
|
||||
middleware :meth:`process_spider_input` and will call the request
|
||||
errback. The output of the errback is chained back in the other
|
||||
errback if there is one, otherwise it will start the :meth:`process_spider_exception`
|
||||
chain. The output of the errback is chained back in the other
|
||||
direction for :meth:`process_spider_output` to process it, or
|
||||
:meth:`process_spider_exception` if it raised an exception.
|
||||
|
||||
@ -116,8 +117,8 @@ object gives you access, for example, to the :ref:`settings <topics-settings>`.
|
||||
|
||||
.. method:: process_spider_exception(response, exception, spider)
|
||||
|
||||
This method is called when a spider or :meth:`process_spider_input`
|
||||
method (from other spider middleware) raises an exception.
|
||||
This method is called when a spider or :meth:`process_spider_output`
|
||||
method (from a previous spider middleware) raises an exception.
|
||||
|
||||
:meth:`process_spider_exception` should return either ``None`` or an
|
||||
iterable of :class:`~scrapy.http.Request`, dict or
|
||||
@ -129,7 +130,8 @@ object gives you access, for example, to the :ref:`settings <topics-settings>`.
|
||||
exception reaches the engine (where it's logged and discarded).
|
||||
|
||||
If it returns an iterable the :meth:`process_spider_output` pipeline
|
||||
kicks in, and no other :meth:`process_spider_exception` will be called.
|
||||
kicks in, starting from the next spider middleware, and no other
|
||||
:meth:`process_spider_exception` will be called.
|
||||
|
||||
:param response: the response being processed when the exception was
|
||||
raised
|
||||
|
@ -402,10 +402,12 @@ Crawling rules
|
||||
of links extracted from each response using the specified ``link_extractor``.
|
||||
This is mainly used for filtering purposes.
|
||||
|
||||
``process_request`` is a callable, or a string (in which case a method from
|
||||
the spider object with that name will be used) which will be called with
|
||||
every request extracted by this rule, and must return a request or None (to
|
||||
filter out the request).
|
||||
``process_request`` is a callable (or a string, in which case a method from
|
||||
the spider object with that name will be used) which will be called for every
|
||||
:class:`~scrapy.http.Request` extracted by this rule. This callable should
|
||||
take said request as first argument and the :class:`~scrapy.http.Response`
|
||||
from which the request originated as second argument. It must return a
|
||||
``Request`` object or ``None`` (to filter out the request).
|
||||
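For example, a spider method used as ``process_request`` could look like this
(a sketch; the domain and meta key are illustrative)::

    def filter_and_tag(self, request, response):
        # Drop off-site links and record the page where each link was found.
        if 'example.com' not in request.url:
            return None
        request.meta['found_on'] = response.url
        return request

It would then be referenced from a rule as
``Rule(LinkExtractor(), callback='parse_item', process_request='filter_and_tag')``.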
|
||||
CrawlSpider example
|
||||
~~~~~~~~~~~~~~~~~~~
|
||||
@ -655,7 +657,7 @@ SitemapSpider
|
||||
|
||||
.. attribute:: sitemap_follow
|
||||
|
||||
A list of regexes of sitemap that should be followed. This is is only
|
||||
A list of regexes of sitemap that should be followed. This is only
|
||||
for sites that use `Sitemap index files`_ that point to other sitemap
|
||||
files.
|
||||
|
||||
|
@ -75,8 +75,7 @@ available in Scrapy which extend the basic Stats Collector. You can select
|
||||
which Stats Collector to use through the :setting:`STATS_CLASS` setting. The
|
||||
default Stats Collector used is the :class:`MemoryStatsCollector`.
|
||||
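For example, to disable stats collection entirely you can switch to the dummy
collector shipped with Scrapy (a sketch for ``settings.py``)::

    STATS_CLASS = 'scrapy.statscollectors.DummyStatsCollector'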
|
||||
.. module:: scrapy.statscollectors
|
||||
:synopsis: Stats Collectors
|
||||
.. currentmodule:: scrapy.statscollectors
|
||||
|
||||
MemoryStatsCollector
|
||||
--------------------
|
||||
|
@ -1,12 +1,11 @@
|
||||
.. currentmodule:: scrapy.extensions.telnet
|
||||
|
||||
.. _topics-telnetconsole:
|
||||
|
||||
==============
|
||||
Telnet Console
|
||||
==============
|
||||
|
||||
.. module:: scrapy.extensions.telnet
|
||||
:synopsis: The Telnet Console
|
||||
|
||||
Scrapy comes with a built-in telnet console for inspecting and controlling a
running Scrapy process. The telnet console is just a regular Python shell
|
||||
running inside the Scrapy process, so you can do literally anything from it.
|
||||
@ -45,7 +44,7 @@ the console you need to type::
|
||||
>>>
|
||||
|
||||
By default Username is ``scrapy`` and Password is autogenerated. The
|
||||
autogenerated Password can be seen on scrapy logs like the example bellow::
|
||||
autogenerated Password can be seen on scrapy logs like the example below::
|
||||
|
||||
2018-10-16 14:35:21 [scrapy.extensions.telnet] INFO: Telnet Password: 16f92501e8a59326
|
||||
|
||||
|
@ -6,7 +6,7 @@ from unittest import TextTestRunner, TextTestResult as _TextTestResult
|
||||
|
||||
from scrapy.commands import ScrapyCommand
|
||||
from scrapy.contracts import ContractsManager
|
||||
from scrapy.utils.misc import load_object
|
||||
from scrapy.utils.misc import load_object, set_environ
|
||||
from scrapy.utils.conf import build_component_list
|
||||
|
||||
|
||||
@ -68,16 +68,17 @@ class Command(ScrapyCommand):
|
||||
|
||||
spider_loader = self.crawler_process.spider_loader
|
||||
|
||||
for spidername in args or spider_loader.list():
|
||||
spidercls = spider_loader.load(spidername)
|
||||
spidercls.start_requests = lambda s: conman.from_spider(s, result)
|
||||
with set_environ(SCRAPY_CHECK='true'):
|
||||
for spidername in args or spider_loader.list():
|
||||
spidercls = spider_loader.load(spidername)
|
||||
spidercls.start_requests = lambda s: conman.from_spider(s, result)
|
||||
|
||||
tested_methods = conman.tested_methods_from_spidercls(spidercls)
|
||||
if opts.list:
|
||||
for method in tested_methods:
|
||||
contract_reqs[spidercls.name].append(method)
|
||||
elif tested_methods:
|
||||
self.crawler_process.crawl(spidercls)
|
||||
tested_methods = conman.tested_methods_from_spidercls(spidercls)
|
||||
if opts.list:
|
||||
for method in tested_methods:
|
||||
contract_reqs[spidercls.name].append(method)
|
||||
elif tested_methods:
|
||||
self.crawler_process.crawl(spidercls)
|
||||
|
||||
# start checks
|
||||
if opts.list:
|
||||
|
@ -94,7 +94,7 @@ class ContractsManager(object):
|
||||
try:
|
||||
output = cb(response)
|
||||
output = list(iterate_spider_output(output))
|
||||
except:
|
||||
except Exception:
|
||||
case = _create_testcase(method, 'callback')
|
||||
results.addError(case, sys.exc_info())
|
||||
|
||||
|
@ -75,6 +75,8 @@ def _get_concurrency_delay(concurrency, spider, settings):
|
||||
|
||||
class Downloader(object):
|
||||
|
||||
DOWNLOAD_SLOT = 'download_slot'
|
||||
|
||||
def __init__(self, crawler):
|
||||
self.settings = crawler.settings
|
||||
self.signals = crawler.signals
|
||||
@ -111,8 +113,8 @@ class Downloader(object):
|
||||
return key, self.slots[key]
|
||||
|
||||
def _get_slot_key(self, request, spider):
|
||||
if 'download_slot' in request.meta:
|
||||
return request.meta['download_slot']
|
||||
if self.DOWNLOAD_SLOT in request.meta:
|
||||
return request.meta[self.DOWNLOAD_SLOT]
|
||||
|
||||
key = urlparse_cached(request).hostname or ''
|
||||
if self.ip_concurrency:
|
||||
@ -122,7 +124,7 @@ class Downloader(object):
|
||||
|
||||
def _enqueue_request(self, request, spider):
|
||||
key, slot = self._get_slot(request, spider)
|
||||
request.meta['download_slot'] = key
|
||||
request.meta[self.DOWNLOAD_SLOT] = key
|
||||
|
||||
def _deactivate(response):
|
||||
slot.active.remove(request)
|
||||
|
@ -7,6 +7,7 @@ import six
|
||||
|
||||
from twisted.internet import defer
|
||||
|
||||
from scrapy.exceptions import _InvalidOutput
|
||||
from scrapy.http import Request, Response
|
||||
from scrapy.middleware import MiddlewareManager
|
||||
from scrapy.utils.defer import mustbe_deferred
|
||||
@ -35,12 +36,12 @@ class DownloaderMiddlewareManager(MiddlewareManager):
|
||||
def process_request(request):
|
||||
for method in self.methods['process_request']:
|
||||
response = yield method(request=request, spider=spider)
|
||||
assert response is None or isinstance(response, (Response, Request)), \
|
||||
'Middleware %s.process_request must return None, Response or Request, got %s' % \
|
||||
(six.get_method_self(method).__class__.__name__, response.__class__.__name__)
|
||||
if response is not None and not isinstance(response, (Response, Request)):
|
||||
raise _InvalidOutput('Middleware %s.process_request must return None, Response or Request, got %s' % \
|
||||
(six.get_method_self(method).__class__.__name__, response.__class__.__name__))
|
||||
if response:
|
||||
defer.returnValue(response)
|
||||
defer.returnValue((yield download_func(request=request,spider=spider)))
|
||||
defer.returnValue((yield download_func(request=request, spider=spider)))
|
||||
|
||||
@defer.inlineCallbacks
|
||||
def process_response(response):
|
||||
@ -49,11 +50,10 @@ class DownloaderMiddlewareManager(MiddlewareManager):
|
||||
defer.returnValue(response)
|
||||
|
||||
for method in self.methods['process_response']:
|
||||
response = yield method(request=request, response=response,
|
||||
spider=spider)
|
||||
assert isinstance(response, (Response, Request)), \
|
||||
'Middleware %s.process_response must return Response or Request, got %s' % \
|
||||
(six.get_method_self(method).__class__.__name__, type(response))
|
||||
response = yield method(request=request, response=response, spider=spider)
|
||||
if not isinstance(response, (Response, Request)):
|
||||
raise _InvalidOutput('Middleware %s.process_response must return Response or Request, got %s' % \
|
||||
(six.get_method_self(method).__class__.__name__, type(response)))
|
||||
if isinstance(response, Request):
|
||||
defer.returnValue(response)
|
||||
defer.returnValue(response)
|
||||
@ -62,11 +62,10 @@ class DownloaderMiddlewareManager(MiddlewareManager):
|
||||
def process_exception(_failure):
|
||||
exception = _failure.value
|
||||
for method in self.methods['process_exception']:
|
||||
response = yield method(request=request, exception=exception,
|
||||
spider=spider)
|
||||
assert response is None or isinstance(response, (Response, Request)), \
|
||||
'Middleware %s.process_exception must return None, Response or Request, got %s' % \
|
||||
(six.get_method_self(method).__class__.__name__, type(response))
|
||||
response = yield method(request=request, exception=exception, spider=spider)
|
||||
if response is not None and not isinstance(response, (Response, Request)):
|
||||
raise _InvalidOutput('Middleware %s.process_exception must return None, Response or Request, got %s' % \
|
||||
(six.get_method_self(method).__class__.__name__, type(response)))
|
||||
if response:
|
||||
defer.returnValue(response)
|
||||
defer.returnValue(_failure)
|
||||
|
@ -1,19 +1,46 @@
|
||||
import os
|
||||
import json
|
||||
import logging
|
||||
import warnings
|
||||
from os.path import join, exists
|
||||
|
||||
from scrapy.utils.reqser import request_to_dict, request_from_dict
|
||||
from queuelib import PriorityQueue
|
||||
|
||||
from scrapy.utils.misc import load_object, create_instance
|
||||
from scrapy.utils.job import job_dir
|
||||
from scrapy.utils.deprecate import ScrapyDeprecationWarning
|
||||
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class Scheduler(object):
|
||||
"""
|
||||
Scrapy Scheduler. It allows enqueuing requests and then getting
the next request to download. The Scheduler also handles duplicate
filtering, via the dupefilter.

Prioritization and queueing are not performed by the Scheduler.
The user sets the ``priority`` field for each Request, and a PriorityQueue
(defined by :setting:`SCHEDULER_PRIORITY_QUEUE`) uses these priorities
to dequeue requests in the desired order.

The Scheduler uses two PriorityQueue instances, configured to work in memory
and on disk (optional). When the on-disk queue is present, it is used by
default, and the in-memory queue is used as a fallback for cases where
the disk queue can't handle a request (can't serialize it).

:setting:`SCHEDULER_MEMORY_QUEUE` and
:setting:`SCHEDULER_DISK_QUEUE` allow specifying the lower-level queue classes
that PriorityQueue instances are instantiated with, to keep requests
on disk and in memory respectively.

Overall, the Scheduler is an object which holds several PriorityQueue instances
(in-memory and on-disk) and implements fallback logic for them.
It also handles the dupefilter.
|
||||
"""
|
||||
def __init__(self, dupefilter, jobdir=None, dqclass=None, mqclass=None,
|
||||
logunser=False, stats=None, pqclass=None):
|
||||
logunser=False, stats=None, pqclass=None, crawler=None):
|
||||
self.df = dupefilter
|
||||
self.dqdir = self._dqdir(jobdir)
|
||||
self.pqclass = pqclass
|
||||
@ -21,6 +48,7 @@ class Scheduler(object):
|
||||
self.mqclass = mqclass
|
||||
self.logunser = logunser
|
||||
self.stats = stats
|
||||
self.crawler = crawler
|
||||
|
||||
@classmethod
|
||||
def from_crawler(cls, crawler):
|
||||
@ -28,26 +56,35 @@ class Scheduler(object):
|
||||
dupefilter_cls = load_object(settings['DUPEFILTER_CLASS'])
|
||||
dupefilter = create_instance(dupefilter_cls, settings, crawler)
|
||||
pqclass = load_object(settings['SCHEDULER_PRIORITY_QUEUE'])
|
||||
if pqclass is PriorityQueue:
|
||||
warnings.warn("SCHEDULER_PRIORITY_QUEUE='queuelib.PriorityQueue'"
|
||||
" is no longer supported because of API changes; "
|
||||
"please use 'scrapy.pqueues.ScrapyPriorityQueue'",
|
||||
ScrapyDeprecationWarning)
|
||||
from scrapy.pqueues import ScrapyPriorityQueue
|
||||
pqclass = ScrapyPriorityQueue
|
||||
|
||||
dqclass = load_object(settings['SCHEDULER_DISK_QUEUE'])
|
||||
mqclass = load_object(settings['SCHEDULER_MEMORY_QUEUE'])
|
||||
logunser = settings.getbool('LOG_UNSERIALIZABLE_REQUESTS', settings.getbool('SCHEDULER_DEBUG'))
|
||||
logunser = settings.getbool('LOG_UNSERIALIZABLE_REQUESTS',
|
||||
settings.getbool('SCHEDULER_DEBUG'))
|
||||
return cls(dupefilter, jobdir=job_dir(settings), logunser=logunser,
|
||||
stats=crawler.stats, pqclass=pqclass, dqclass=dqclass, mqclass=mqclass)
|
||||
stats=crawler.stats, pqclass=pqclass, dqclass=dqclass,
|
||||
mqclass=mqclass, crawler=crawler)
|
||||
|
||||
def has_pending_requests(self):
|
||||
return len(self) > 0
|
||||
|
||||
def open(self, spider):
|
||||
self.spider = spider
|
||||
self.mqs = self.pqclass(self._newmq)
|
||||
self.mqs = self._mq()
|
||||
self.dqs = self._dq() if self.dqdir else None
|
||||
return self.df.open()
|
||||
|
||||
def close(self, reason):
|
||||
if self.dqs:
|
||||
prios = self.dqs.close()
|
||||
with open(join(self.dqdir, 'active.json'), 'w') as f:
|
||||
json.dump(prios, f)
|
||||
state = self.dqs.close()
|
||||
self._write_dqs_state(self.dqdir, state)
|
||||
return self.df.close(reason)
|
||||
|
||||
def enqueue_request(self, request):
|
||||
@ -82,8 +119,7 @@ class Scheduler(object):
|
||||
if self.dqs is None:
|
||||
return
|
||||
try:
|
||||
reqd = request_to_dict(request, self.spider)
|
||||
self.dqs.push(reqd, -request.priority)
|
||||
self.dqs.push(request, -request.priority)
|
||||
except ValueError as e: # non serializable request
|
||||
if self.logunser:
|
||||
msg = ("Unable to serialize request: %(request)s - reason:"
|
||||
@ -103,32 +139,51 @@ class Scheduler(object):
|
||||
|
||||
def _dqpop(self):
|
||||
if self.dqs:
|
||||
d = self.dqs.pop()
|
||||
if d:
|
||||
return request_from_dict(d, self.spider)
|
||||
return self.dqs.pop()
|
||||
|
||||
def _newmq(self, priority):
|
||||
""" Factory for creating memory queues. """
|
||||
return self.mqclass()
|
||||
|
||||
def _newdq(self, priority):
|
||||
return self.dqclass(join(self.dqdir, 'p%s' % priority))
|
||||
""" Factory for creating disk queues. """
|
||||
path = join(self.dqdir, 'p%s' % (priority, ))
|
||||
return self.dqclass(path)
|
||||
|
||||
def _mq(self):
|
||||
""" Create a new priority queue instance, with in-memory storage """
|
||||
return create_instance(self.pqclass, None, self.crawler, self._newmq,
|
||||
serialize=False)
|
||||
|
||||
def _dq(self):
|
||||
activef = join(self.dqdir, 'active.json')
|
||||
if exists(activef):
|
||||
with open(activef) as f:
|
||||
prios = json.load(f)
|
||||
else:
|
||||
prios = ()
|
||||
q = self.pqclass(self._newdq, startprios=prios)
|
||||
""" Create a new priority queue instance, with disk storage """
|
||||
state = self._read_dqs_state(self.dqdir)
|
||||
q = create_instance(self.pqclass,
|
||||
None,
|
||||
self.crawler,
|
||||
self._newdq,
|
||||
state,
|
||||
serialize=True)
|
||||
if q:
|
||||
logger.info("Resuming crawl (%(queuesize)d requests scheduled)",
|
||||
{'queuesize': len(q)}, extra={'spider': self.spider})
|
||||
return q
|
||||
|
||||
def _dqdir(self, jobdir):
|
||||
""" Return a folder name to keep disk queue state at """
|
||||
if jobdir:
|
||||
dqdir = join(jobdir, 'requests.queue')
|
||||
if not exists(dqdir):
|
||||
os.makedirs(dqdir)
|
||||
return dqdir
|
||||
|
||||
def _read_dqs_state(self, dqdir):
|
||||
path = join(dqdir, 'active.json')
|
||||
if not exists(path):
|
||||
return ()
|
||||
with open(path) as f:
|
||||
return json.load(f)
|
||||
|
||||
def _write_dqs_state(self, dqdir, state):
|
||||
with open(join(dqdir, 'active.json'), 'w') as f:
|
||||
json.dump(state, f)
|
||||
|
@ -135,7 +135,6 @@ class Scraper(object):
|
||||
return self.spidermw.scrape_response(
|
||||
self.call_spider, request_result, request, spider)
|
||||
else:
|
||||
# FIXME: don't ignore errors in spider middleware
|
||||
dfd = self.call_spider(request_result, request, spider)
|
||||
return dfd.addErrback(
|
||||
self._log_download_errors, request_result, request, spider)
|
||||
|
@ -3,15 +3,21 @@ Spider Middleware manager
|
||||
|
||||
See documentation in docs/topics/spider-middleware.rst
|
||||
"""
|
||||
from itertools import chain, islice
|
||||
|
||||
import six
|
||||
from twisted.python.failure import Failure
|
||||
from scrapy.exceptions import _InvalidOutput
|
||||
from scrapy.middleware import MiddlewareManager
|
||||
from scrapy.utils.defer import mustbe_deferred
|
||||
from scrapy.utils.conf import build_component_list
|
||||
from scrapy.utils.python import MutableChain
|
||||
|
||||
|
||||
def _isiterable(possible_iterator):
|
||||
return hasattr(possible_iterator, '__iter__')
|
||||
|
||||
|
||||
class SpiderMiddlewareManager(MiddlewareManager):
|
||||
|
||||
component_name = 'spider middleware'
|
||||
@ -24,12 +30,10 @@ class SpiderMiddlewareManager(MiddlewareManager):
|
||||
super(SpiderMiddlewareManager, self)._add_middleware(mw)
|
||||
if hasattr(mw, 'process_spider_input'):
|
||||
self.methods['process_spider_input'].append(mw.process_spider_input)
|
||||
if hasattr(mw, 'process_spider_output'):
|
||||
self.methods['process_spider_output'].appendleft(mw.process_spider_output)
|
||||
if hasattr(mw, 'process_spider_exception'):
|
||||
self.methods['process_spider_exception'].appendleft(mw.process_spider_exception)
|
||||
if hasattr(mw, 'process_start_requests'):
|
||||
self.methods['process_start_requests'].appendleft(mw.process_start_requests)
|
||||
self.methods['process_spider_output'].appendleft(getattr(mw, 'process_spider_output', None))
|
||||
self.methods['process_spider_exception'].appendleft(getattr(mw, 'process_spider_exception', None))
|
||||
|
||||
def scrape_response(self, scrape_func, response, request, spider):
|
||||
fname = lambda f:'%s.%s' % (
|
||||
@ -40,36 +44,73 @@ class SpiderMiddlewareManager(MiddlewareManager):
|
||||
for method in self.methods['process_spider_input']:
|
||||
try:
|
||||
result = method(response=response, spider=spider)
|
||||
assert result is None, \
|
||||
'Middleware %s must returns None or ' \
|
||||
'raise an exception, got %s ' \
|
||||
% (fname(method), type(result))
|
||||
except:
|
||||
if result is not None:
|
||||
raise _InvalidOutput('Middleware {} must return None or raise an exception, got {}' \
|
||||
.format(fname(method), type(result)))
|
||||
except _InvalidOutput:
|
||||
raise
|
||||
except Exception:
|
||||
return scrape_func(Failure(), request, spider)
|
||||
return scrape_func(response, request, spider)
|
||||
|
||||
def process_spider_exception(_failure):
|
||||
def process_spider_exception(_failure, start_index=0):
|
||||
exception = _failure.value
|
||||
for method in self.methods['process_spider_exception']:
|
||||
# don't handle _InvalidOutput exception
|
||||
if isinstance(exception, _InvalidOutput):
|
||||
return _failure
|
||||
method_list = islice(self.methods['process_spider_exception'], start_index, None)
|
||||
for method_index, method in enumerate(method_list, start=start_index):
|
||||
if method is None:
|
||||
continue
|
||||
result = method(response=response, exception=exception, spider=spider)
|
||||
assert result is None or _isiterable(result), \
|
||||
'Middleware %s must returns None, or an iterable object, got %s ' % \
|
||||
(fname(method), type(result))
|
||||
if result is not None:
|
||||
return result
|
||||
if _isiterable(result):
|
||||
# stop exception handling by handing control over to the
|
||||
# process_spider_output chain if an iterable has been returned
|
||||
return process_spider_output(result, method_index+1)
|
||||
elif result is None:
|
||||
continue
|
||||
else:
|
||||
raise _InvalidOutput('Middleware {} must return None or an iterable, got {}' \
|
||||
.format(fname(method), type(result)))
|
||||
return _failure
|
||||
|
||||
def process_spider_output(result):
|
||||
for method in self.methods['process_spider_output']:
|
||||
result = method(response=response, result=result, spider=spider)
|
||||
assert _isiterable(result), \
|
||||
'Middleware %s must returns an iterable object, got %s ' % \
|
||||
(fname(method), type(result))
|
||||
return result
|
||||
def process_spider_output(result, start_index=0):
|
||||
# items in this iterable do not need to go through the process_spider_output
|
||||
# chain, they went through it already from the process_spider_exception method
|
||||
recovered = MutableChain()
|
||||
|
||||
def evaluate_iterable(iterable, index):
|
||||
try:
|
||||
for r in iterable:
|
||||
yield r
|
||||
except Exception as ex:
|
||||
exception_result = process_spider_exception(Failure(ex), index+1)
|
||||
if isinstance(exception_result, Failure):
|
||||
raise
|
||||
recovered.extend(exception_result)
|
||||
|
||||
method_list = islice(self.methods['process_spider_output'], start_index, None)
|
||||
for method_index, method in enumerate(method_list, start=start_index):
|
||||
if method is None:
|
||||
continue
|
||||
# the following might fail directly if the output value is not a generator
|
||||
try:
|
||||
result = method(response=response, result=result, spider=spider)
|
||||
except Exception as ex:
|
||||
exception_result = process_spider_exception(Failure(ex), method_index+1)
|
||||
if isinstance(exception_result, Failure):
|
||||
raise
|
||||
return exception_result
|
||||
if _isiterable(result):
|
||||
result = evaluate_iterable(result, method_index)
|
||||
else:
|
||||
raise _InvalidOutput('Middleware {} must return an iterable, got {}' \
|
||||
.format(fname(method), type(result)))
|
||||
|
||||
return chain(result, recovered)
|
||||
|
||||
dfd = mustbe_deferred(process_spider_input, response)
|
||||
dfd.addErrback(process_spider_exception)
|
||||
dfd.addCallback(process_spider_output)
|
||||
dfd.addCallbacks(callback=process_spider_output, errback=process_spider_exception)
|
||||
return dfd
|
||||
|
||||
def process_start_requests(self, start_requests, spider):
|
||||
|
@ -111,6 +111,8 @@ class Crawler(object):
|
||||
|
||||
@defer.inlineCallbacks
|
||||
def stop(self):
|
||||
"""Starts a graceful stop of the crawler and returns a deferred that is
|
||||
fired when the crawler is stopped."""
|
||||
if self.crawling:
|
||||
self.crawling = False
|
||||
yield defer.maybeDeferred(self.engine.stop)
|
||||
|
@ -88,6 +88,7 @@ class MetaRefreshMiddleware(BaseRedirectMiddleware):
|
||||
|
||||
def __init__(self, settings):
|
||||
super(MetaRefreshMiddleware, self).__init__(settings)
|
||||
self._ignore_tags = settings.getlist('METAREFRESH_IGNORE_TAGS')
|
||||
self._maxdelay = settings.getint('REDIRECT_MAX_METAREFRESH_DELAY',
|
||||
settings.getint('METAREFRESH_MAXDELAY'))
|
||||
|
||||
@ -96,7 +97,8 @@ class MetaRefreshMiddleware(BaseRedirectMiddleware):
|
||||
not isinstance(response, HtmlResponse):
|
||||
return response
|
||||
|
||||
interval, url = get_meta_refresh(response)
|
||||
interval, url = get_meta_refresh(response,
|
||||
ignore_tags=self._ignore_tags)
|
||||
if url and interval < self._maxdelay:
|
||||
redirected = self._redirect_request_using_get(request, url)
|
||||
return self._redirect(redirected, request, spider, 'meta refresh')
|
||||
|
@ -11,6 +11,13 @@ class NotConfigured(Exception):
|
||||
"""Indicates a missing configuration situation"""
|
||||
pass
|
||||
|
||||
class _InvalidOutput(TypeError):
|
||||
"""
|
||||
Indicates an invalid value has been returned by a middleware's processing method.
|
||||
Internal and undocumented, it should not be raised or caught by user code.
|
||||
"""
|
||||
pass
|
||||
|
||||
# HTTP and crawling
|
||||
|
||||
class IgnoreRequest(Exception):
|
||||
|
@ -24,7 +24,11 @@ class CoreStats(object):
|
||||
self.stats.set_value('start_time', datetime.datetime.utcnow(), spider=spider)
|
||||
|
||||
def spider_closed(self, spider, reason):
|
||||
self.stats.set_value('finish_time', datetime.datetime.utcnow(), spider=spider)
|
||||
finish_time = datetime.datetime.utcnow()
|
||||
elapsed_time = finish_time - self.stats.get_value('start_time')
|
||||
elapsed_time_seconds = elapsed_time.total_seconds()
|
||||
self.stats.set_value('elapsed_time_seconds', elapsed_time_seconds, spider=spider)
|
||||
self.stats.set_value('finish_time', finish_time, spider=spider)
|
||||
self.stats.set_value('finish_reason', reason, spider=spider)
|
||||
|
||||
def item_scraped(self, item, spider):
|
||||
|
@ -31,7 +31,7 @@ class DummyPolicy(object):
|
||||
def should_cache_response(self, response, request):
|
||||
return response.status not in self.ignore_http_codes
|
||||
|
||||
def is_cached_response_fresh(self, response, request):
|
||||
def is_cached_response_fresh(self, cachedresponse, request):
|
||||
return True
|
||||
|
||||
def is_cached_response_valid(self, cachedresponse, response, request):
|
||||
@ -70,7 +70,7 @@ class RFC2616Policy(object):
|
||||
return True
|
||||
|
||||
def should_cache_response(self, response, request):
|
||||
# What is cacheable - https://www.w3.org/Protocols/rfc2616/rfc2616-sec13.html#sec14.9.1
|
||||
# What is cacheable - https://www.w3.org/Protocols/rfc2616/rfc2616-sec14.html#sec14.9.1
|
||||
# Response cacheability - https://www.w3.org/Protocols/rfc2616/rfc2616-sec13.html#sec13.4
|
||||
# Status code 206 is not included because cache can not deal with partial contents
|
||||
cc = self._parse_cachecontrol(response)
|
||||
|
@ -35,6 +35,10 @@ class ItemLoader(object):
|
||||
self.parent = parent
|
||||
self._local_item = context['item'] = item
|
||||
self._local_values = defaultdict(list)
|
||||
# Preprocess values if item built from dict
|
||||
# Values need to be added to item._values if they come from the item dict (not from add_value calls)
|
||||
for field_name, value in item.items():
|
||||
self._values[field_name] = self._process_input_value(field_name, value)
|
||||
|
||||
@property
|
||||
def _values(self):
|
||||
|
@ -3,7 +3,7 @@ from __future__ import print_function
|
||||
import functools
|
||||
import logging
|
||||
from collections import defaultdict
|
||||
from twisted.internet.defer import Deferred, DeferredList
|
||||
from twisted.internet.defer import Deferred, DeferredList, _DefGen_Return
|
||||
from twisted.python.failure import Failure
|
||||
|
||||
from scrapy.settings import Settings
|
||||
@ -139,6 +139,30 @@ class MediaPipeline(object):
|
||||
result.cleanFailure()
|
||||
result.frames = []
|
||||
result.stack = None
|
||||
|
||||
# This code fixes a memory leak by avoiding keeping references to
|
||||
# the Request and Response objects on the Media Pipeline cache.
|
||||
#
|
||||
# Twisted inline callbacks pass return values using the function
|
||||
# twisted.internet.defer.returnValue, which encapsulates the return
|
||||
# value inside a _DefGen_Return base exception.
|
||||
#
|
||||
# What happens when the media_downloaded callback raises another
|
||||
# exception, for example a FileException('download-error') when
|
||||
# the Response status code is not 200 OK, is that it stores the
|
||||
# _DefGen_Return exception on the FileException context.
|
||||
#
|
||||
# To avoid keeping references to the Response and therefore Request
|
||||
# objects on the Media Pipeline cache, we should wipe the context of
|
||||
# the exception encapsulated by the Twisted Failure when it's a
|
||||
# _DefGen_Return instance.
|
||||
#
|
||||
# This problem does not occur in Python 2.7 since we don't have
|
||||
# Exception Chaining (https://www.python.org/dev/peps/pep-3134/).
|
||||
context = getattr(result.value, '__context__', None)
|
||||
if isinstance(context, _DefGen_Return):
|
||||
setattr(result.value, '__context__', None)
|
||||
|
||||
info.downloading.remove(fp)
|
||||
info.downloaded[fp] = result # cache result
|
||||
for wad in info.waiting.pop(fp):
|
||||
|
193
scrapy/pqueues.py
Normal file
@ -0,0 +1,193 @@
|
||||
import hashlib
|
||||
import logging
|
||||
from collections import namedtuple
|
||||
|
||||
from queuelib import PriorityQueue
|
||||
|
||||
from scrapy.utils.reqser import request_to_dict, request_from_dict
|
||||
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
def _path_safe(text):
|
||||
"""
|
||||
Return a filesystem-safe version of a string ``text``
|
||||
|
||||
>>> _path_safe('simple.org').startswith('simple.org')
|
||||
True
|
||||
>>> _path_safe('dash-underscore_.org').startswith('dash-underscore_.org')
|
||||
True
|
||||
>>> _path_safe('some@symbol?').startswith('some_symbol_')
|
||||
True
|
||||
"""
|
||||
pathable_slot = "".join([c if c.isalnum() or c in '-._' else '_'
|
||||
for c in text])
|
||||
# as we replace some letters we can get collisions for different slots,
# so we add a unique part
|
||||
unique_slot = hashlib.md5(text.encode('utf8')).hexdigest()
|
||||
return '-'.join([pathable_slot, unique_slot])
|
||||
|
||||
|
||||
class _Priority(namedtuple("_Priority", ["priority", "slot"])):
|
||||
""" Slot-specific priority. It is a hack - ``(priority, slot)`` tuple
|
||||
which can be used instead of int priorities in queues:
|
||||
|
||||
* they are ordered in the same way - order is still by priority value,
|
||||
min(prios) works;
|
||||
* str(p) representation is guaranteed to be different when slots
|
||||
are different - this is important because str(p) is used to create
|
||||
queue files on disk;
|
||||
* they have readable str(p) representation which is safe
|
||||
to use as a file name.
|
||||
"""
|
||||
__slots__ = ()
|
||||
|
||||
def __str__(self):
|
||||
return '%s_%s' % (self.priority, _path_safe(str(self.slot)))
|
||||
|
||||
|
||||
class _SlotPriorityQueues(object):
|
||||
""" Container for multiple priority queues. """
|
||||
def __init__(self, pqfactory, slot_startprios=None):
|
||||
"""
|
||||
``pqfactory`` is a factory for creating new PriorityQueues.
|
||||
It must be a function which accepts a single optional ``startprios``
|
||||
argument, with a list of priorities to create queues for.
|
||||
|
||||
``slot_startprios`` is a ``{slot: startprios}`` dict.
|
||||
"""
|
||||
self.pqfactory = pqfactory
|
||||
self.pqueues = {} # slot -> priority queue
|
||||
for slot, startprios in (slot_startprios or {}).items():
|
||||
self.pqueues[slot] = self.pqfactory(startprios)
|
||||
|
||||
def pop_slot(self, slot):
|
||||
""" Pop an object from a priority queue for this slot """
|
||||
queue = self.pqueues[slot]
|
||||
request = queue.pop()
|
||||
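# forget queues that become empty, so that only slots with pending
# requests are kept in self.pqueues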
if len(queue) == 0:
|
||||
del self.pqueues[slot]
|
||||
return request
|
||||
|
||||
def push_slot(self, slot, obj, priority):
|
||||
""" Push an object to a priority queue for this slot """
|
||||
if slot not in self.pqueues:
|
||||
self.pqueues[slot] = self.pqfactory()
|
||||
queue = self.pqueues[slot]
|
||||
queue.push(obj, priority)
|
||||
|
||||
def close(self):
|
||||
active = {slot: queue.close()
|
||||
for slot, queue in self.pqueues.items()}
|
||||
self.pqueues.clear()
|
||||
return active
|
||||
|
||||
def __len__(self):
|
||||
return sum(len(x) for x in self.pqueues.values()) if self.pqueues else 0
|
||||
|
||||
def __contains__(self, slot):
|
||||
return slot in self.pqueues
|
||||
|
||||
|
||||
class ScrapyPriorityQueue(PriorityQueue):
|
||||
"""
|
||||
PriorityQueue which works with scrapy.Request instances and
|
||||
can optionally convert them to/from dicts before/after putting to a queue.
|
||||
"""
|
||||
def __init__(self, crawler, qfactory, startprios=(), serialize=False):
|
||||
super(ScrapyPriorityQueue, self).__init__(qfactory, startprios)
|
||||
self.serialize = serialize
|
||||
self.spider = crawler.spider
|
||||
|
||||
@classmethod
|
||||
def from_crawler(cls, crawler, qfactory, startprios=(), serialize=False):
|
||||
return cls(crawler, qfactory, startprios, serialize)
|
||||
|
||||
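# When ``serialize`` is True, requests are converted to plain dicts on push
# and rebuilt on pop, so pickle/marshal-based disk queues can store them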
def push(self, request, priority=0):
|
||||
if self.serialize:
|
||||
request = request_to_dict(request, self.spider)
|
||||
super(ScrapyPriorityQueue, self).push(request, priority)
|
||||
|
||||
def pop(self):
|
||||
request = super(ScrapyPriorityQueue, self).pop()
|
||||
if request and self.serialize:
|
||||
request = request_from_dict(request, self.spider)
|
||||
return request
|
||||
|
||||
|
||||
class DownloaderInterface(object):
|
||||
|
||||
def __init__(self, crawler):
|
||||
self.downloader = crawler.engine.downloader
|
||||
|
||||
def stats(self, possible_slots):
|
||||
return [(self._active_downloads(slot), slot)
|
||||
for slot in possible_slots]
|
||||
|
||||
def get_slot_key(self, request):
|
||||
return self.downloader._get_slot_key(request, None)
|
||||
|
||||
def _active_downloads(self, slot):
|
||||
""" Return a number of requests in a Downloader for a given slot """
|
||||
if slot not in self.downloader.slots:
|
||||
return 0
|
||||
return len(self.downloader.slots[slot].active)
|
||||
|
||||
|
||||
class DownloaderAwarePriorityQueue(object):
|
||||
""" PriorityQueue which takes Downlaoder activity in account:
|
||||
domains (slots) with the least amount of active downloads are dequeued
|
||||
first.
|
||||
"""
|
||||
|
||||
@classmethod
|
||||
def from_crawler(cls, crawler, qfactory, slot_startprios=None, serialize=False):
|
||||
return cls(crawler, qfactory, slot_startprios, serialize)
|
||||
|
||||
def __init__(self, crawler, qfactory, slot_startprios=None, serialize=False):
|
||||
if crawler.settings.getint('CONCURRENT_REQUESTS_PER_IP') != 0:
|
||||
raise ValueError('"%s" does not support CONCURRENT_REQUESTS_PER_IP'
|
||||
% (self.__class__,))
|
||||
|
||||
if slot_startprios and not isinstance(slot_startprios, dict):
|
||||
raise ValueError("DownloaderAwarePriorityQueue accepts "
|
||||
"``slot_startprios`` as a dict; %r instance "
|
||||
"is passed. Most likely, it means the state is"
|
||||
"created by an incompatible priority queue. "
|
||||
"Only a crawl started with the same priority "
|
||||
"queue class can be resumed." %
|
||||
slot_startprios.__class__)
|
||||
|
||||
slot_startprios = {
|
||||
slot: [_Priority(p, slot) for p in startprios]
|
||||
for slot, startprios in (slot_startprios or {}).items()}
|
||||
|
||||
def pqfactory(startprios=()):
|
||||
return ScrapyPriorityQueue(crawler, qfactory, startprios, serialize)
|
||||
self._slot_pqueues = _SlotPriorityQueues(pqfactory, slot_startprios)
|
||||
self.serialize = serialize
|
||||
self._downloader_interface = DownloaderInterface(crawler)
|
||||
|
||||
def pop(self):
|
||||
stats = self._downloader_interface.stats(self._slot_pqueues.pqueues)
|
||||
|
||||
if not stats:
|
||||
return
|
||||
|
||||
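# stats is a list of (active_request_count, slot) tuples, so min()
# selects the slot with the fewest requests currently downloading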
slot = min(stats)[1]
|
||||
request = self._slot_pqueues.pop_slot(slot)
|
||||
return request
|
||||
|
||||
def push(self, request, priority):
|
||||
slot = self._downloader_interface.get_slot_key(request)
|
||||
priority_slot = _Priority(priority=priority, slot=slot)
|
||||
self._slot_pqueues.push_slot(slot, request, priority_slot)
|
||||
|
||||
def close(self):
|
||||
active = self._slot_pqueues.close()
|
||||
return {slot: [p.priority for p in startprios]
|
||||
for slot, startprios in active.items()}
|
||||
|
||||
def __len__(self):
|
||||
return len(self._slot_pqueues)
|
@ -221,6 +221,7 @@ MEMUSAGE_NOTIFY_MAIL = []
|
||||
MEMUSAGE_WARNING_MB = 0
|
||||
|
||||
METAREFRESH_ENABLED = True
|
||||
METAREFRESH_IGNORE_TAGS = ['script', 'noscript']
|
||||
METAREFRESH_MAXDELAY = 100
|
||||
|
||||
NEWSPIDER_MODULE = ''
|
||||
@ -238,7 +239,7 @@ REFERRER_POLICY = 'scrapy.spidermiddlewares.referer.DefaultReferrerPolicy'
|
||||
|
||||
RETRY_ENABLED = True
|
||||
RETRY_TIMES = 2 # initial response + 2 retries = 3 requests
|
||||
RETRY_HTTP_CODES = [500, 502, 503, 504, 522, 524, 408]
|
||||
RETRY_HTTP_CODES = [500, 502, 503, 504, 522, 524, 408, 429]
|
||||
RETRY_PRIORITY_ADJUST = -1
|
||||
|
||||
ROBOTSTXT_OBEY = False
|
||||
@ -246,7 +247,7 @@ ROBOTSTXT_OBEY = False
|
||||
SCHEDULER = 'scrapy.core.scheduler.Scheduler'
|
||||
SCHEDULER_DISK_QUEUE = 'scrapy.squeues.PickleLifoDiskQueue'
|
||||
SCHEDULER_MEMORY_QUEUE = 'scrapy.squeues.LifoMemoryQueue'
|
||||
SCHEDULER_PRIORITY_QUEUE = 'queuelib.PriorityQueue'
|
||||
SCHEDULER_PRIORITY_QUEUE = 'scrapy.pqueues.ScrapyPriorityQueue'
|
||||
|
||||
SPIDER_LOADER_CLASS = 'scrapy.spiderloader.SpiderLoader'
|
||||
SPIDER_LOADER_WARN_ONLY = False
|
||||
|
@ -6,29 +6,55 @@ See documentation in docs/topics/spiders.rst
|
||||
"""
|
||||
|
||||
import copy
|
||||
import warnings
|
||||
|
||||
import six
|
||||
|
||||
from scrapy.exceptions import ScrapyDeprecationWarning
|
||||
from scrapy.http import Request, HtmlResponse
|
||||
from scrapy.utils.spider import iterate_spider_output
|
||||
from scrapy.utils.python import get_func_args
|
||||
from scrapy.spiders import Spider
|
||||
|
||||
|
||||
def identity(x):
|
||||
return x
|
||||
def _identity(request, response):
|
||||
return request
|
||||
|
||||
|
||||
def _get_method(method, spider):
|
||||
if callable(method):
|
||||
return method
|
||||
elif isinstance(method, six.string_types):
|
||||
return getattr(spider, method, None)
|
||||
|
||||
|
||||
class Rule(object):
|
||||
|
||||
def __init__(self, link_extractor, callback=None, cb_kwargs=None, follow=None, process_links=None, process_request=identity):
|
||||
def __init__(self, link_extractor, callback=None, cb_kwargs=None, follow=None, process_links=None, process_request=None):
|
||||
self.link_extractor = link_extractor
|
||||
self.callback = callback
|
||||
self.cb_kwargs = cb_kwargs or {}
|
||||
self.process_links = process_links
|
||||
self.process_request = process_request
|
||||
if follow is None:
|
||||
self.follow = False if callback else True
|
||||
else:
|
||||
self.follow = follow
|
||||
self.process_request = process_request or _identity
|
||||
self.process_request_argcount = None
|
||||
self.follow = follow if follow is not None else not callback
|
||||
|
||||
def _compile(self, spider):
|
||||
self.callback = _get_method(self.callback, spider)
|
||||
self.process_links = _get_method(self.process_links, spider)
|
||||
self.process_request = _get_method(self.process_request, spider)
|
||||
self.process_request_argcount = len(get_func_args(self.process_request))
|
||||
if self.process_request_argcount == 1:
|
||||
msg = 'Rule.process_request should accept two arguments (request, response), accepting only one is deprecated'
|
||||
warnings.warn(msg, category=ScrapyDeprecationWarning, stacklevel=2)
|
||||
|
||||
def _process_request(self, request, response):
|
||||
"""
|
||||
Wrapper around the request processing function to maintain backward
|
||||
compatibility with functions that do not take a Response object
|
||||
"""
|
||||
args = [request] if self.process_request_argcount == 1 else [request, response]
|
||||
return self.process_request(*args)
|
||||
|
||||
|
||||
class CrawlSpider(Spider):
|
||||
@ -64,8 +90,8 @@ class CrawlSpider(Spider):
|
||||
links = rule.process_links(links)
|
||||
for link in links:
|
||||
seen.add(link)
|
||||
r = self._build_request(n, link)
|
||||
yield rule.process_request(r)
|
||||
request = self._build_request(n, link)
|
||||
yield rule._process_request(request, response)
|
||||
|
||||
def _response_downloaded(self, response):
|
||||
rule = self._rules[response.meta['rule']]
|
||||
@ -83,17 +109,9 @@ class CrawlSpider(Spider):
|
||||
yield request_or_item
|
||||
|
||||
def _compile_rules(self):
|
||||
def get_method(method):
|
||||
if callable(method):
|
||||
return method
|
||||
elif isinstance(method, six.string_types):
|
||||
return getattr(self, method, None)
|
||||
|
||||
self._rules = [copy.copy(r) for r in self.rules]
|
||||
for rule in self._rules:
|
||||
rule.callback = get_method(rule.callback)
|
||||
rule.process_links = get_method(rule.process_links)
|
||||
rule.process_request = get_method(rule.process_request)
|
||||
rule._compile(self)
|
||||
|
||||
@classmethod
|
||||
def from_crawler(cls, crawler, *args, **kwargs):
|
||||
|
@ -7,6 +7,7 @@ from six.moves import cPickle as pickle
|
||||
|
||||
from queuelib import queue
|
||||
|
||||
|
||||
def _serializable_queue(queue_class, serialize, deserialize):
|
||||
|
||||
class SerializableQueue(queue_class):
|
||||
@ -22,6 +23,7 @@ def _serializable_queue(queue_class, serialize, deserialize):
|
||||
|
||||
return SerializableQueue
|
||||
|
||||
|
||||
def _pickle_serialize(obj):
|
||||
try:
|
||||
return pickle.dumps(obj, protocol=2)
|
||||
@ -31,13 +33,14 @@ def _pickle_serialize(obj):
|
||||
except (pickle.PicklingError, AttributeError, TypeError) as e:
|
||||
raise ValueError(str(e))
|
||||
|
||||
PickleFifoDiskQueue = _serializable_queue(queue.FifoDiskQueue, \
|
||||
|
||||
PickleFifoDiskQueue = _serializable_queue(queue.FifoDiskQueue,
|
||||
_pickle_serialize, pickle.loads)
|
||||
PickleLifoDiskQueue = _serializable_queue(queue.LifoDiskQueue, \
|
||||
PickleLifoDiskQueue = _serializable_queue(queue.LifoDiskQueue,
|
||||
_pickle_serialize, pickle.loads)
|
||||
MarshalFifoDiskQueue = _serializable_queue(queue.FifoDiskQueue, \
|
||||
MarshalFifoDiskQueue = _serializable_queue(queue.FifoDiskQueue,
|
||||
marshal.dumps, marshal.loads)
|
||||
MarshalLifoDiskQueue = _serializable_queue(queue.LifoDiskQueue, \
|
||||
MarshalLifoDiskQueue = _serializable_queue(queue.LifoDiskQueue,
|
||||
marshal.dumps, marshal.loads)
|
||||
FifoMemoryQueue = queue.FifoMemoryQueue
|
||||
LifoMemoryQueue = queue.LifoMemoryQueue
|
||||
|
@ -39,7 +39,7 @@ class ${ProjectName}SpiderMiddleware(object):
|
||||
# Called when a spider or process_spider_input() method
|
||||
# (from other spider middleware) raises an exception.
|
||||
|
||||
# Should return either None or an iterable of Response, dict
|
||||
# Should return either None or an iterable of Request, dict
|
||||
# or Item objects.
|
||||
pass
|
||||
|
||||
|
@ -48,7 +48,7 @@ def mustbe_deferred(f, *args, **kw):
|
||||
# exception in Scrapy - see #125
|
||||
except IgnoreRequest as e:
|
||||
return defer_fail(failure.Failure(e))
|
||||
except:
|
||||
except Exception:
|
||||
return defer_fail(failure.Failure())
|
||||
else:
|
||||
return defer_result(result)
|
||||
@ -102,5 +102,5 @@ def iter_errback(iterable, errback, *a, **kw):
|
||||
yield next(it)
|
||||
except StopIteration:
|
||||
break
|
||||
except:
|
||||
except Exception:
|
||||
errback(failure.Failure(), *a, **kw)
|
||||
|
@ -9,6 +9,9 @@ from gzip import GzipFile
|
||||
import six
|
||||
import re
|
||||
|
||||
from scrapy.utils.decorators import deprecated
|
||||
|
||||
|
||||
# - Python>=3.5 GzipFile's read() has issues returning leftover
|
||||
# uncompressed data when input is corrupted
|
||||
# (regression or bug-fix compared to Python 3.4)
|
||||
@ -53,6 +56,7 @@ def gunzip(data):
|
||||
_is_gzipped = re.compile(br'^application/(x-)?gzip\b', re.I).search
|
||||
_is_octetstream = re.compile(br'^(application|binary)/octet-stream\b', re.I).search
|
||||
|
||||
@deprecated
|
||||
def is_gzipped(response):
|
||||
"""Return True if the response is gzipped, or False otherwise"""
|
||||
ctype = response.headers.get('Content-Type', b'')
|
||||
|
@ -1,6 +1,8 @@
|
||||
"""Helper functions which don't fit anywhere else"""
|
||||
import os
|
||||
import re
|
||||
import hashlib
|
||||
from contextlib import contextmanager
|
||||
from importlib import import_module
|
||||
from pkgutil import iter_modules
|
||||
|
||||
@ -86,7 +88,7 @@ def extract_regex(regex, text, encoding='utf-8'):
|
||||
|
||||
try:
|
||||
strings = [regex.search(text).group('extract')] # named group
|
||||
except:
|
||||
except Exception:
|
||||
strings = regex.findall(text) # full regex or numbered groups
|
||||
strings = flatten(strings)
|
||||
|
||||
@ -142,3 +144,21 @@ def create_instance(objcls, settings, crawler, *args, **kwargs):
|
||||
return objcls.from_settings(settings, *args, **kwargs)
|
||||
else:
|
||||
return objcls(*args, **kwargs)
|
||||
|
||||
|
||||
@contextmanager
|
||||
def set_environ(**kwargs):
|
||||
"""Temporarily set environment variables inside the context manager and
|
||||
fully restore previous environment afterwards
|
||||
"""
|
||||
|
||||
original_env = {k: os.environ.get(k) for k in kwargs}
|
||||
os.environ.update(kwargs)
|
||||
try:
|
||||
yield
|
||||
finally:
|
||||
for k, v in original_env.items():
|
||||
if v is None:
|
||||
del os.environ[k]
|
||||
else:
|
||||
os.environ[k] = v
|
||||
|
@ -9,6 +9,7 @@ import weakref
|
||||
import errno
|
||||
import six
|
||||
from functools import partial, wraps
|
||||
from itertools import chain
|
||||
import sys
|
||||
|
||||
from scrapy.utils.decorators import deprecated
|
||||
@ -387,3 +388,22 @@ if hasattr(sys, "pypy_version_info"):
|
||||
else:
|
||||
def garbage_collect():
|
||||
gc.collect()
|
||||
|
||||
|
||||
class MutableChain(object):
|
||||
"""
|
||||
Thin wrapper around itertools.chain, allowing iterables to be added "in-place"
|
||||
"""
|
||||
def __init__(self, *args):
|
||||
self.data = chain(*args)
|
||||
|
||||
def extend(self, *iterables):
|
||||
self.data = chain(self.data, *iterables)
|
||||
|
||||
def __iter__(self):
|
||||
return self.data.__iter__()
|
||||
|
||||
def __next__(self):
|
||||
return next(self.data)
|
||||
|
||||
next = __next__
|
||||
|
@ -70,6 +70,20 @@ def request_from_dict(d, spider=None):
|
||||
)
|
||||
|
||||
|
||||
def _is_private_method(name):
|
||||
return name.startswith('__') and not name.endswith('__')
|
||||
|
||||
|
||||
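# Reproduce Python's class-private name mangling (__name -> _ClassName__name)
# so that callbacks defined as private methods can be serialized by name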
def _mangle_private_name(obj, func, name):
|
||||
qualname = getattr(func, '__qualname__', None)
|
||||
if qualname is None:
|
||||
classname = obj.__class__.__name__.lstrip('_')
|
||||
return '_%s%s' % (classname, name)
|
||||
else:
|
||||
splits = qualname.split('.')
|
||||
return '_%s%s' % (splits[-2], splits[-1])
|
||||
|
||||
|
||||
def _find_method(obj, func):
|
||||
if obj:
|
||||
try:
|
||||
@ -78,7 +92,10 @@ def _find_method(obj, func):
|
||||
pass
|
||||
else:
|
||||
if func_self is obj:
|
||||
return six.get_method_function(func).__name__
|
||||
name = six.get_method_function(func).__name__
|
||||
if _is_private_method(name):
|
||||
return _mangle_private_name(obj, func, name)
|
||||
return name
|
||||
raise ValueError("Function %s is not a method of: %s" % (func, obj))
|
||||
|
||||
|
||||
|
@ -31,12 +31,12 @@ def get_base_url(response):
|
||||
|
||||
|
||||
_metaref_cache = weakref.WeakKeyDictionary()
|
||||
def get_meta_refresh(response):
|
||||
def get_meta_refresh(response, ignore_tags=('script', 'noscript')):
|
||||
"""Parse the http-equiv refrsh parameter from the given response"""
|
||||
if response not in _metaref_cache:
|
||||
text = response.text[0:4096]
|
||||
_metaref_cache[response] = html.get_meta_refresh(text, response.url,
|
||||
response.encoding, ignore_tags=('script', 'noscript'))
|
||||
response.encoding, ignore_tags=ignore_tags)
|
||||
return _metaref_cache[response]
|
||||
|
||||
|
||||
|
3
setup.py
@ -65,7 +65,8 @@ setup(
|
||||
],
|
||||
python_requires='>=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*',
|
||||
install_requires=[
|
||||
'Twisted>=13.1.0',
|
||||
'Twisted>=13.1.0;python_version!="3.4"',
|
||||
'Twisted>=13.1.0,<=19.2.0;python_version=="3.4"',
|
||||
'w3lib>=1.17.0',
|
||||
'queuelib',
|
||||
'lxml',
|
||||
|
@ -177,7 +177,7 @@ class Root(Resource):
|
||||
try:
|
||||
from tests import tests_datadir
|
||||
self.putChild(b"files", File(os.path.join(tests_datadir, 'test_site/files/')))
|
||||
except:
|
||||
except Exception:
|
||||
pass
|
||||
self.putChild(b"redirect-to", RedirectTo())
|
||||
|
||||
|
@ -2,9 +2,10 @@
|
||||
mock
|
||||
mitmproxy==0.10.1
|
||||
netlib==0.10.1
|
||||
pytest==2.9.2
|
||||
pytest
|
||||
pytest-cov
|
||||
pytest-twisted
|
||||
pytest-cov==2.2.1
|
||||
pytest-xdist
|
||||
jmespath
|
||||
brotlipy
|
||||
testfixtures
|
||||
|
@ -1,6 +1,7 @@
|
||||
pytest==3.6.3
|
||||
pytest
|
||||
pytest-cov
|
||||
pytest-twisted
|
||||
pytest-cov==2.5.1
|
||||
pytest-xdist
|
||||
testfixtures
|
||||
jmespath
|
||||
leveldb; sys_platform != "win32"
|
||||
|
@ -53,9 +53,5 @@ class TestCloseSpider(TestCase):
|
||||
yield crawler.crawl(total=1000000, mockserver=self.mockserver)
|
||||
reason = crawler.spider.meta['close_reason']
|
||||
self.assertEqual(reason, 'closespider_timeout')
|
||||
stats = crawler.stats
|
||||
start = stats.get_value('start_time')
|
||||
stop = stats.get_value('finish_time')
|
||||
diff = stop - start
|
||||
total_seconds = diff.seconds + diff.microseconds
|
||||
total_seconds = crawler.stats.get_value('elapsed_time_seconds')
|
||||
self.assertTrue(total_seconds >= close_on)
|
||||
|
@ -1,5 +1,4 @@
|
||||
import logging
|
||||
import tempfile
|
||||
import warnings
|
||||
|
||||
from twisted.internet import defer
|
||||
@ -38,7 +37,11 @@ class CrawlerTestCase(BaseCrawlerTest):
|
||||
self.assertIsInstance(spiders, sl_cls)
|
||||
|
||||
self.crawler.spiders
|
||||
self.assertEqual(len(w), 1, "Warn deprecated access only once")
|
||||
is_one_warning = len(w) == 1
|
||||
if not is_one_warning:
|
||||
for warning in w:
|
||||
print(warning)
|
||||
self.assertTrue(is_one_warning, "Warn deprecated access only once")
|
||||
|
||||
def test_populate_spidercls_settings(self):
|
||||
spider_settings = {'TEST1': 'spider', 'TEST2': 'spider'}
|
||||
@ -179,8 +182,12 @@ class CrawlerRunnerTestCase(BaseCrawlerTest):
|
||||
'SPIDER_MANAGER_CLASS': 'tests.test_crawler.CustomSpiderLoader'
|
||||
})
|
||||
self.assertIsInstance(runner.spider_loader, CustomSpiderLoader)
|
||||
self.assertEqual(len(w), 1)
|
||||
is_one_warning = len(w) == 1
|
||||
if not is_one_warning:
|
||||
for warning in w:
|
||||
print(warning)
|
||||
self.assertIn('Please use SPIDER_LOADER_CLASS', str(w[0].message))
|
||||
self.assertTrue(is_one_warning)
|
||||
|
||||
def test_crawl_rejects_spider_objects(self):
|
||||
with raises(ValueError):
|
||||
|
@ -3,6 +3,7 @@ from twisted.python.failure import Failure
|
||||
|
||||
from scrapy.http import Request, Response
|
||||
from scrapy.spiders import Spider
|
||||
from scrapy.exceptions import _InvalidOutput
|
||||
from scrapy.core.downloader.middleware import DownloaderMiddlewareManager
|
||||
from scrapy.utils.test import get_crawler
|
||||
from scrapy.utils.python import to_bytes
|
||||
@ -115,3 +116,63 @@ class ResponseFromProcessRequestTest(ManagerTestCase):
|
||||
|
||||
self.assertIs(results[0], resp)
|
||||
self.assertFalse(download_func.called)
|
||||
|
||||
|
||||
class ProcessRequestInvalidOutput(ManagerTestCase):
|
||||
"""Invalid return value for process_request method should raise an exception"""
|
||||
|
||||
def test_invalid_process_request(self):
|
||||
req = Request('http://example.com/index.html')
|
||||
|
||||
class InvalidProcessRequestMiddleware:
|
||||
def process_request(self, request, spider):
|
||||
return 1
|
||||
|
||||
self.mwman._add_middleware(InvalidProcessRequestMiddleware())
|
||||
download_func = mock.MagicMock()
|
||||
dfd = self.mwman.download(download_func, req, self.spider)
|
||||
results = []
|
||||
dfd.addBoth(results.append)
|
||||
self.assertIsInstance(results[0], Failure)
|
||||
self.assertIsInstance(results[0].value, _InvalidOutput)
|
||||
|
||||
|
||||
class ProcessResponseInvalidOutput(ManagerTestCase):
|
||||
"""Invalid return value for process_response method should raise an exception"""
|
||||
|
||||
def test_invalid_process_response(self):
|
||||
req = Request('http://example.com/index.html')
|
||||
|
||||
class InvalidProcessResponseMiddleware:
|
||||
def process_response(self, request, response, spider):
|
||||
return 1
|
||||
|
||||
self.mwman._add_middleware(InvalidProcessResponseMiddleware())
|
||||
download_func = mock.MagicMock()
|
||||
dfd = self.mwman.download(download_func, req, self.spider)
|
||||
results = []
|
||||
dfd.addBoth(results.append)
|
||||
self.assertIsInstance(results[0], Failure)
|
||||
self.assertIsInstance(results[0].value, _InvalidOutput)
|
||||
|
||||
|
||||
class ProcessExceptionInvalidOutput(ManagerTestCase):
|
||||
"""Invalid return value for process_exception method should raise an exception"""
|
||||
|
||||
def test_invalid_process_exception(self):
|
||||
req = Request('http://example.com/index.html')
|
||||
|
||||
class InvalidProcessExceptionMiddleware:
|
||||
def process_request(self, request, spider):
|
||||
raise Exception()
|
||||
|
||||
def process_exception(self, request, exception, spider):
|
||||
return 1
|
||||
|
||||
self.mwman._add_middleware(InvalidProcessExceptionMiddleware())
|
||||
download_func = mock.MagicMock()
|
||||
dfd = self.mwman.download(download_func, req, self.spider)
|
||||
results = []
|
||||
dfd.addBoth(results.append)
|
||||
self.assertIsInstance(results[0], Failure)
|
||||
self.assertIsInstance(results[0].value, _InvalidOutput)
|
||||
|
@ -279,5 +279,24 @@ class MetaRefreshMiddlewareTest(unittest.TestCase):
|
||||
self.assertEqual(req2.meta['redirect_reasons'], ['meta refresh'])
|
||||
self.assertEqual(req3.meta['redirect_reasons'], ['meta refresh', 'meta refresh'])
|
||||
|
||||
def test_ignore_tags_default(self):
|
||||
req = Request(url='http://example.org')
|
||||
body = ('''<noscript><meta http-equiv="refresh" '''
|
||||
'''content="0;URL='http://example.org/newpage'"></noscript>''')
|
||||
rsp = HtmlResponse(req.url, body=body.encode())
|
||||
response = self.mw.process_response(req, rsp, self.spider)
|
||||
assert isinstance(response, Response)
|
||||
|
||||
def test_ignore_tags_empty_list(self):
|
||||
crawler = get_crawler(Spider, {'METAREFRESH_IGNORE_TAGS': []})
|
||||
mw = MetaRefreshMiddleware.from_crawler(crawler)
|
||||
req = Request(url='http://example.org')
|
||||
body = ('''<noscript><meta http-equiv="refresh" '''
|
||||
'''content="0;URL='http://example.org/newpage'"></noscript>''')
|
||||
rsp = HtmlResponse(req.url, body=body.encode())
|
||||
req2 = mw.process_response(req, rsp, self.spider)
|
||||
assert isinstance(req2, Request)
|
||||
self.assertEqual(req2.url, 'http://example.org/newpage')
|
||||
|
||||
if __name__ == "__main__":
|
||||
unittest.main()
|
||||
|
@ -419,6 +419,43 @@ class BasicItemLoaderTest(unittest.TestCase):
|
||||
self.assertEqual(item['url'], u'rabbit.hole')
|
||||
self.assertEqual(item['summary'], u'rabbithole')
|
||||
|
||||
def test_create_item_from_dict(self):
|
||||
class TestItem(Item):
|
||||
title = Field()
|
||||
|
||||
class TestItemLoader(ItemLoader):
|
||||
default_item_class = TestItem
|
||||
|
||||
input_item = {'title': 'Test item title 1'}
|
||||
il = TestItemLoader(item=input_item)
|
||||
# Getting output value mustn't remove value from item
|
||||
self.assertEqual(il.load_item(), {
|
||||
'title': 'Test item title 1',
|
||||
})
|
||||
self.assertEqual(il.get_output_value('title'), 'Test item title 1')
|
||||
self.assertEqual(il.load_item(), {
|
||||
'title': 'Test item title 1',
|
||||
})
|
||||
|
||||
input_item = {'title': 'Test item title 2'}
|
||||
il = TestItemLoader(item=input_item)
|
||||
# Values from dict must be added to item _values
|
||||
self.assertEqual(il._values.get('title'), 'Test item title 2')
|
||||
|
||||
input_item = {'title': [u'Test item title 3', u'Test item 4']}
|
||||
il = TestItemLoader(item=input_item)
|
||||
# Same rules must work for lists
|
||||
self.assertEqual(il._values.get('title'),
|
||||
[u'Test item title 3', u'Test item 4'])
|
||||
self.assertEqual(il.load_item(), {
|
||||
'title': [u'Test item title 3', u'Test item 4'],
|
||||
})
|
||||
self.assertEqual(il.get_output_value('title'),
|
||||
[u'Test item title 3', u'Test item 4'])
|
||||
self.assertEqual(il.load_item(), {
|
||||
'title': [u'Test item title 3', u'Test item 4'],
|
||||
})
|
||||
|
||||
|
||||
class ProcessorsTest(unittest.TestCase):
|
||||
|
||||
|
@ -1,15 +1,19 @@
|
||||
from __future__ import print_function
|
||||
|
||||
import sys
|
||||
|
||||
from testfixtures import LogCapture
|
||||
from twisted.trial import unittest
|
||||
from twisted.python.failure import Failure
|
||||
from twisted.internet import reactor
|
||||
from twisted.internet.defer import Deferred, inlineCallbacks
|
||||
from twisted.internet.defer import Deferred, inlineCallbacks, returnValue
|
||||
|
||||
from scrapy.http import Request, Response
|
||||
from scrapy.settings import Settings
|
||||
from scrapy.spiders import Spider
|
||||
from scrapy.utils.request import request_fingerprint
|
||||
from scrapy.pipelines.media import MediaPipeline
|
||||
from scrapy.pipelines.files import FileException
|
||||
from scrapy.utils.log import failure_to_exc_info
|
||||
from scrapy.utils.signal import disconnect_all
|
||||
from scrapy import signals
|
||||
@ -90,6 +94,77 @@ class BaseMediaPipelineTestCase(unittest.TestCase):
|
||||
self.pipe._modify_media_request(request)
|
||||
assert request.meta == {'handle_httpstatus_all': True}
|
||||
|
||||
def test_should_remove_req_res_references_before_caching_the_results(self):
|
||||
"""Regression test case to prevent a memory leak in the Media Pipeline.
|
||||
|
||||
The memory leak is triggered when an exception is raised while a Response
|
||||
scheduled by the Media Pipeline is being returned. For example, when a
|
||||
FileException('download-error') is raised because the Response status
|
||||
code is not 200 OK.
|
||||
|
||||
It happens because we are keeping a reference to the Response object
|
||||
inside the FileException context. This is caused by the way Twisted
|
||||
returns values from inline callbacks. It raises a custom exception
|
||||
encapsulating the original return value.
|
||||
|
||||
The solution is to remove the exception context when this context is a
|
||||
_DefGen_Return instance, the BaseException used by Twisted to pass the
|
||||
returned value from those inline callbacks.
|
||||
|
||||
Maybe there's a better and more reliable way to test the case described
|
||||
here, but it would be more complicated and involve running - or at least
|
||||
mocking - some async steps from the Media Pipeline. The current test
|
||||
case is simple and detects the problem very fast. On the other hand, it
|
||||
would not detect another kind of leak happening due to old object
|
||||
references being kept inside the Media Pipeline cache.
|
||||
|
||||
This problem does not occur in Python 2.7 since we don't have Exception
|
||||
Chaining (https://www.python.org/dev/peps/pep-3134/).
|
||||
"""
|
||||
# Create sample pair of Request and Response objects
|
||||
request = Request('http://url')
|
||||
response = Response('http://url', body=b'', request=request)
|
||||
|
||||
# Simulate the Media Pipeline behavior to produce a Twisted Failure
|
||||
try:
|
||||
# Simulate a Twisted inline callback returning a Response
|
||||
# The returnValue method raises an exception encapsulating the value
|
||||
returnValue(response)
|
||||
except BaseException as exc:
|
||||
def_gen_return_exc = exc
|
||||
try:
|
||||
# Simulate the media_downloaded callback raising a FileException
|
||||
# This usually happens when the status code is not 200 OK
|
||||
raise FileException('download-error')
|
||||
except Exception as exc:
|
||||
file_exc = exc
|
||||
# Simulate Twisted capturing the FileException
|
||||
# It encapsulates the exception inside a Twisted Failure
|
||||
failure = Failure(file_exc)
|
||||
|
||||
# The Failure should encapsulate a FileException ...
|
||||
self.assertEqual(failure.value, file_exc)
|
||||
# ... and if we're running on Python 3 ...
|
||||
if sys.version_info.major >= 3:
|
||||
# ... it should have the returnValue exception set as its context
|
||||
self.assertEqual(failure.value.__context__, def_gen_return_exc)
|
||||
|
||||
# Let's calculate the request fingerprint and fake some runtime data...
|
||||
fp = request_fingerprint(request)
|
||||
info = self.pipe.spiderinfo
|
||||
info.downloading.add(fp)
|
||||
info.waiting[fp] = []
|
||||
|
||||
# When calling the method that caches the Request's result ...
|
||||
self.pipe._cache_result_and_execute_waiters(failure, fp, info)
|
||||
# ... it should store the Twisted Failure ...
|
||||
self.assertEqual(info.downloaded[fp], failure)
|
||||
# ... encapsulating the original FileException ...
|
||||
self.assertEqual(info.downloaded[fp].value, file_exc)
|
||||
# ... but it should not store the returnValue exception on its context
|
||||
context = getattr(info.downloaded[fp].value, '__context__', None)
|
||||
self.assertIsNone(context)
|
||||
|
||||
|
||||
class MockedMediaPipeline(MediaPipeline):
|
||||
|
||||
|
342
tests/test_scheduler.py
Normal file
@ -0,0 +1,342 @@
|
||||
import shutil
|
||||
import tempfile
|
||||
import unittest
|
||||
import collections
|
||||
|
||||
from twisted.internet import defer
|
||||
from twisted.trial.unittest import TestCase
|
||||
|
||||
from scrapy.crawler import Crawler
|
||||
from scrapy.core.downloader import Downloader
|
||||
from scrapy.core.scheduler import Scheduler
|
||||
from scrapy.http import Request
|
||||
from scrapy.spiders import Spider
|
||||
from scrapy.utils.httpobj import urlparse_cached
|
||||
from scrapy.utils.test import get_crawler
|
||||
from tests.mockserver import MockServer
|
||||
|
||||
|
||||
MockEngine = collections.namedtuple('MockEngine', ['downloader'])
|
||||
MockSlot = collections.namedtuple('MockSlot', ['active'])
|
||||
|
||||
|
||||
class MockDownloader(object):
|
||||
def __init__(self):
|
||||
self.slots = dict()
|
||||
|
||||
def _get_slot_key(self, request, spider):
|
||||
if Downloader.DOWNLOAD_SLOT in request.meta:
|
||||
return request.meta[Downloader.DOWNLOAD_SLOT]
|
||||
|
||||
return urlparse_cached(request).hostname or ''
|
||||
|
||||
def increment(self, slot_key):
|
||||
slot = self.slots.setdefault(slot_key, MockSlot(active=list()))
|
||||
slot.active.append(1)
|
||||
|
||||
def decrement(self, slot_key):
|
||||
slot = self.slots.get(slot_key)
|
||||
slot.active.pop()
|
||||
|
||||
def close(self):
|
||||
pass
|
||||
|
||||
|
||||
class MockCrawler(Crawler):
|
||||
def __init__(self, priority_queue_cls, jobdir):
|
||||
|
||||
settings = dict(
|
||||
LOG_UNSERIALIZABLE_REQUESTS=False,
|
||||
SCHEDULER_DISK_QUEUE='scrapy.squeues.PickleLifoDiskQueue',
|
||||
SCHEDULER_MEMORY_QUEUE='scrapy.squeues.LifoMemoryQueue',
|
||||
SCHEDULER_PRIORITY_QUEUE=priority_queue_cls,
|
||||
JOBDIR=jobdir,
|
||||
DUPEFILTER_CLASS='scrapy.dupefilters.BaseDupeFilter'
|
||||
)
|
||||
super(MockCrawler, self).__init__(Spider, settings)
|
||||
self.engine = MockEngine(downloader=MockDownloader())
|
||||
|
||||
|
||||
class SchedulerHandler(object):
|
||||
priority_queue_cls = None
|
||||
jobdir = None
|
||||
|
||||
def create_scheduler(self):
|
||||
self.mock_crawler = MockCrawler(self.priority_queue_cls, self.jobdir)
|
||||
self.scheduler = Scheduler.from_crawler(self.mock_crawler)
|
||||
self.spider = Spider(name='spider')
|
||||
self.scheduler.open(self.spider)
|
||||
|
||||
def close_scheduler(self):
|
||||
self.scheduler.close('finished')
|
||||
self.mock_crawler.stop()
|
||||
self.mock_crawler.engine.downloader.close()
|
||||
|
||||
def setUp(self):
|
||||
self.create_scheduler()
|
||||
|
||||
def tearDown(self):
|
||||
self.close_scheduler()
|
||||
|
||||
|
||||
_PRIORITIES = [("http://foo.com/a", -2),
|
||||
("http://foo.com/d", 1),
|
||||
("http://foo.com/b", -1),
|
||||
("http://foo.com/c", 0),
|
||||
("http://foo.com/e", 2)]
|
||||
|
||||
|
||||
_URLS = {"http://foo.com/a", "http://foo.com/b", "http://foo.com/c"}
|
||||
|
||||
|
||||
class BaseSchedulerInMemoryTester(SchedulerHandler):
|
||||
def test_length(self):
|
||||
self.assertFalse(self.scheduler.has_pending_requests())
|
||||
self.assertEqual(len(self.scheduler), 0)
|
||||
|
||||
for url in _URLS:
|
||||
self.scheduler.enqueue_request(Request(url))
|
||||
|
||||
self.assertTrue(self.scheduler.has_pending_requests())
|
||||
self.assertEqual(len(self.scheduler), len(_URLS))
|
||||
|
||||
def test_dequeue(self):
|
||||
for url in _URLS:
|
||||
self.scheduler.enqueue_request(Request(url))
|
||||
|
||||
urls = set()
|
||||
while self.scheduler.has_pending_requests():
|
||||
urls.add(self.scheduler.next_request().url)
|
||||
|
||||
self.assertEqual(urls, _URLS)
|
||||
|
||||
def test_dequeue_priorities(self):
|
||||
for url, priority in _PRIORITIES:
|
||||
self.scheduler.enqueue_request(Request(url, priority=priority))
|
||||
|
||||
priorities = list()
|
||||
while self.scheduler.has_pending_requests():
|
||||
priorities.append(self.scheduler.next_request().priority)
|
||||
|
||||
self.assertEqual(priorities,
|
||||
sorted([x[1] for x in _PRIORITIES], key=lambda x: -x))
|
||||
|
||||
|
||||
class BaseSchedulerOnDiskTester(SchedulerHandler):
|
||||
|
||||
def setUp(self):
|
||||
self.jobdir = tempfile.mkdtemp()
|
||||
self.create_scheduler()
|
||||
|
||||
def tearDown(self):
|
||||
self.close_scheduler()
|
||||
|
||||
shutil.rmtree(self.jobdir)
|
||||
self.jobdir = None
|
||||
|
||||
def test_length(self):
|
||||
self.assertFalse(self.scheduler.has_pending_requests())
|
||||
self.assertEqual(len(self.scheduler), 0)
|
||||
|
||||
for url in _URLS:
|
||||
self.scheduler.enqueue_request(Request(url))
|
||||
|
||||
self.close_scheduler()
|
||||
self.create_scheduler()
|
||||
|
||||
self.assertTrue(self.scheduler.has_pending_requests())
|
||||
self.assertEqual(len(self.scheduler), len(_URLS))
|
||||
|
||||
def test_dequeue(self):
|
||||
for url in _URLS:
|
||||
self.scheduler.enqueue_request(Request(url))
|
||||
|
||||
self.close_scheduler()
|
||||
self.create_scheduler()
|
||||
|
||||
urls = set()
|
||||
while self.scheduler.has_pending_requests():
|
||||
urls.add(self.scheduler.next_request().url)
|
||||
|
||||
self.assertEqual(urls, _URLS)
|
||||
|
||||
def test_dequeue_priorities(self):
|
||||
for url, priority in _PRIORITIES:
|
||||
self.scheduler.enqueue_request(Request(url, priority=priority))
|
||||
|
||||
self.close_scheduler()
|
||||
self.create_scheduler()
|
||||
|
||||
priorities = list()
|
||||
while self.scheduler.has_pending_requests():
|
||||
priorities.append(self.scheduler.next_request().priority)
|
||||
|
||||
self.assertEqual(priorities,
|
||||
sorted([x[1] for x in _PRIORITIES], key=lambda x: -x))
|
||||
|
||||
|
||||
class TestSchedulerInMemory(BaseSchedulerInMemoryTester, unittest.TestCase):
|
||||
priority_queue_cls = 'scrapy.pqueues.ScrapyPriorityQueue'
|
||||
|
||||
|
||||
class TestSchedulerOnDisk(BaseSchedulerOnDiskTester, unittest.TestCase):
|
||||
priority_queue_cls = 'scrapy.pqueues.ScrapyPriorityQueue'
|
||||
|
||||
|
||||
_URLS_WITH_SLOTS = [("http://foo.com/a", 'a'),
|
||||
("http://foo.com/b", 'a'),
|
||||
("http://foo.com/c", 'b'),
|
||||
("http://foo.com/d", 'b'),
|
||||
("http://foo.com/e", 'c'),
|
||||
("http://foo.com/f", 'c')]
|
||||
|
||||
|
||||
class TestMigration(unittest.TestCase):
|
||||
|
||||
def setUp(self):
|
||||
self.tmpdir = tempfile.mkdtemp()
|
||||
|
||||
def tearDown(self):
|
||||
shutil.rmtree(self.tmpdir)
|
||||
|
||||
def _migration(self, tmp_dir):
|
||||
prev_scheduler_handler = SchedulerHandler()
|
||||
prev_scheduler_handler.priority_queue_cls = 'scrapy.pqueues.ScrapyPriorityQueue'
|
||||
prev_scheduler_handler.jobdir = tmp_dir
|
||||
|
||||
prev_scheduler_handler.create_scheduler()
|
||||
for url in _URLS:
|
||||
prev_scheduler_handler.scheduler.enqueue_request(Request(url))
|
||||
prev_scheduler_handler.close_scheduler()
|
||||
|
||||
next_scheduler_handler = SchedulerHandler()
|
||||
next_scheduler_handler.priority_queue_cls = 'scrapy.pqueues.DownloaderAwarePriorityQueue'
|
||||
next_scheduler_handler.jobdir = tmp_dir
|
||||
|
||||
next_scheduler_handler.create_scheduler()
|
||||
|
||||
def test_migration(self):
|
||||
with self.assertRaises(ValueError):
|
||||
self._migration(self.tmpdir)
|
||||
|
||||
|
||||
def _is_scheduling_fair(enqueued_slots, dequeued_slots):
|
||||
"""
|
||||
We enqueued the same number of requests for every slot.
Assert that the slots are dequeued in a fair order, e.g.
|
||||
|
||||
>>> enqueued = ['a', 'b', 'c'] * 2
|
||||
>>> correct = ['a', 'c', 'b', 'b', 'a', 'c']
|
||||
>>> incorrect = ['a', 'a', 'b', 'c', 'c', 'b']
|
||||
>>> _is_scheduling_fair(enqueued, correct)
|
||||
True
|
||||
>>> _is_scheduling_fair(enqueued, incorrect)
|
||||
False
|
||||
"""
|
||||
if len(dequeued_slots) != len(enqueued_slots):
|
||||
return False
|
||||
|
||||
slots_number = len(set(enqueued_slots))
|
||||
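# fair scheduling: every consecutive block of ``slots_number`` dequeues
# must not repeat a slot, i.e. each slot gets a turn before any repeats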
for i in range(0, len(dequeued_slots), slots_number):
|
||||
part = dequeued_slots[i:i + slots_number]
|
||||
if len(part) != len(set(part)):
|
||||
return False
|
||||
|
||||
return True
|
||||
|
||||
|
||||
class DownloaderAwareSchedulerTestMixin(object):
|
||||
priority_queue_cls = 'scrapy.pqueues.DownloaderAwarePriorityQueue'
|
||||
reopen = False
|
||||
|
||||
def test_logic(self):
|
||||
for url, slot in _URLS_WITH_SLOTS:
|
||||
request = Request(url)
|
||||
request.meta[Downloader.DOWNLOAD_SLOT] = slot
|
||||
self.scheduler.enqueue_request(request)
|
||||
|
||||
if self.reopen:
|
||||
self.close_scheduler()
|
||||
self.create_scheduler()
|
||||
|
||||
dequeued_slots = list()
|
||||
requests = []
|
||||
downloader = self.mock_crawler.engine.downloader
|
||||
while self.scheduler.has_pending_requests():
|
||||
request = self.scheduler.next_request()
|
||||
# pylint: disable=protected-access
|
||||
slot = downloader._get_slot_key(request, None)
|
||||
dequeued_slots.append(slot)
|
||||
downloader.increment(slot)
|
||||
requests.append(request)
|
||||
|
||||
for request in requests:
|
||||
# pylint: disable=protected-access
|
||||
slot = downloader._get_slot_key(request, None)
|
||||
downloader.decrement(slot)
|
||||
|
||||
self.assertTrue(_is_scheduling_fair(list(s for u, s in _URLS_WITH_SLOTS),
|
||||
dequeued_slots))
|
||||
self.assertEqual(sum(len(s.active) for s in downloader.slots.values()), 0)
|
||||
|
||||
|
||||
class TestSchedulerWithDownloaderAwareInMemory(DownloaderAwareSchedulerTestMixin,
|
||||
BaseSchedulerInMemoryTester,
|
||||
unittest.TestCase):
|
||||
pass
|
||||
|
||||
|
||||
class TestSchedulerWithDownloaderAwareOnDisk(DownloaderAwareSchedulerTestMixin,
|
||||
BaseSchedulerOnDiskTester,
|
||||
unittest.TestCase):
|
||||
reopen = True
|
||||
|
||||
|
||||
class StartUrlsSpider(Spider):
|
||||
|
||||
def __init__(self, start_urls):
|
||||
self.start_urls = start_urls
|
||||
super(StartUrlsSpider, self).__init__(start_urls)
|
||||
|
||||
def parse(self, response):
|
||||
pass
|
||||
|
||||
|
||||
class TestIntegrationWithDownloaderAwareInMemory(TestCase):
|
||||
def setUp(self):
|
||||
self.crawler = get_crawler(
|
||||
StartUrlsSpider,
|
||||
{'SCHEDULER_PRIORITY_QUEUE': 'scrapy.pqueues.DownloaderAwarePriorityQueue',
|
||||
'DUPEFILTER_CLASS': 'scrapy.dupefilters.BaseDupeFilter'}
|
||||
)
|
||||
|
||||
@defer.inlineCallbacks
|
||||
def tearDown(self):
|
||||
yield self.crawler.stop()
|
||||
|
||||
@defer.inlineCallbacks
|
||||
def test_integration_downloader_aware_priority_queue(self):
|
||||
with MockServer() as mockserver:
|
||||
|
||||
url = mockserver.url("/status?n=200", is_secure=False)
|
||||
start_urls = [url] * 6
|
||||
yield self.crawler.crawl(start_urls)
|
||||
self.assertEqual(self.crawler.stats.get_value('downloader/response_count'),
|
||||
len(start_urls))
|
||||
|
||||
|
||||
class TestIncompatibility(unittest.TestCase):
|
||||
|
||||
def _incompatible(self):
|
||||
settings = dict(
|
||||
SCHEDULER_PRIORITY_QUEUE='scrapy.pqueues.DownloaderAwarePriorityQueue',
|
||||
CONCURRENT_REQUESTS_PER_IP=1
|
||||
)
|
||||
crawler = Crawler(Spider, settings)
|
||||
scheduler = Scheduler.from_crawler(crawler)
|
||||
spider = Spider(name='spider')
|
||||
scheduler.open(spider)
|
||||
|
||||
def test_incompatibility(self):
|
||||
with self.assertRaises(ValueError):
|
||||
self._incompatible()
|
@ -105,11 +105,11 @@ class SpiderTest(unittest.TestCase):
|
||||
|
||||
def test_logger(self):
|
||||
spider = self.spider_class('example.com')
|
||||
with LogCapture() as l:
|
||||
with LogCapture() as lc:
|
||||
spider.logger.info('test log msg')
|
||||
l.check(('example.com', 'INFO', 'test log msg'))
|
||||
lc.check(('example.com', 'INFO', 'test log msg'))
|
||||
|
||||
record = l.records[0]
|
||||
record = lc.records[0]
|
||||
self.assertIn('spider', record.__dict__)
|
||||
self.assertIs(record.spider, spider)
|
||||
|
||||
@ -190,12 +190,11 @@ class CrawlSpiderTest(SpiderTest):
|
||||
|
||||
def test_process_links(self):
|
||||
|
||||
response = HtmlResponse("http://example.org/somepage/index.html",
|
||||
body=self.test_body)
|
||||
response = HtmlResponse("http://example.org/somepage/index.html", body=self.test_body)
|
||||
|
||||
class _CrawlSpider(self.spider_class):
|
||||
name="test"
|
||||
allowed_domains=['example.org']
|
||||
name = "test"
|
||||
allowed_domains = ['example.org']
|
||||
rules = (
|
||||
Rule(LinkExtractor(), process_links="dummy_process_links"),
|
||||
)
|
||||
@ -208,24 +207,24 @@ class CrawlSpiderTest(SpiderTest):
|
||||
self.assertEqual(len(output), 3)
|
||||
self.assertTrue(all(map(lambda r: isinstance(r, Request), output)))
|
||||
self.assertEqual([r.url for r in output],
|
||||
['http://example.org/somepage/item/12.html',
|
||||
'http://example.org/about.html',
|
||||
'http://example.org/nofollow.html'])
|
||||
['http://example.org/somepage/item/12.html',
|
||||
'http://example.org/about.html',
|
||||
'http://example.org/nofollow.html'])
|
||||
|
||||
def test_process_links_filter(self):
|
||||
|
||||
response = HtmlResponse("http://example.org/somepage/index.html",
|
||||
body=self.test_body)
|
||||
response = HtmlResponse("http://example.org/somepage/index.html", body=self.test_body)
|
||||
|
||||
class _CrawlSpider(self.spider_class):
|
||||
import re
|
||||
|
||||
name="test"
|
||||
allowed_domains=['example.org']
|
||||
name = "test"
|
||||
allowed_domains = ['example.org']
|
||||
rules = (
|
||||
Rule(LinkExtractor(), process_links="filter_process_links"),
|
||||
)
|
||||
_test_regex = re.compile('nofollow')
|
||||
|
||||
def filter_process_links(self, links):
|
||||
return [link for link in links
|
||||
if not self._test_regex.search(link.url)]
|
||||
@ -235,17 +234,16 @@ class CrawlSpiderTest(SpiderTest):
|
||||
self.assertEqual(len(output), 2)
|
||||
self.assertTrue(all(map(lambda r: isinstance(r, Request), output)))
|
||||
self.assertEqual([r.url for r in output],
|
||||
['http://example.org/somepage/item/12.html',
|
||||
'http://example.org/about.html'])
|
||||
['http://example.org/somepage/item/12.html',
|
||||
'http://example.org/about.html'])
|
||||
|
||||
def test_process_links_generator(self):
|
||||
|
||||
response = HtmlResponse("http://example.org/somepage/index.html",
|
||||
body=self.test_body)
|
||||
response = HtmlResponse("http://example.org/somepage/index.html", body=self.test_body)
|
||||
|
||||
class _CrawlSpider(self.spider_class):
|
||||
name="test"
|
||||
allowed_domains=['example.org']
|
||||
name = "test"
|
||||
allowed_domains = ['example.org']
|
||||
rules = (
|
||||
Rule(LinkExtractor(), process_links="dummy_process_links"),
|
||||
)
|
||||
@ -259,9 +257,113 @@ class CrawlSpiderTest(SpiderTest):
|
||||
self.assertEqual(len(output), 3)
|
||||
self.assertTrue(all(map(lambda r: isinstance(r, Request), output)))
|
||||
self.assertEqual([r.url for r in output],
|
||||
['http://example.org/somepage/item/12.html',
|
||||
'http://example.org/about.html',
|
||||
'http://example.org/nofollow.html'])
|
||||
['http://example.org/somepage/item/12.html',
|
||||
'http://example.org/about.html',
|
||||
'http://example.org/nofollow.html'])
|
||||
|
||||
def test_process_request(self):
|
||||
|
||||
response = HtmlResponse("http://example.org/somepage/index.html", body=self.test_body)
|
||||
|
||||
def process_request_change_domain(request):
|
||||
return request.replace(url=request.url.replace('.org', '.com'))
|
||||
|
||||
class _CrawlSpider(self.spider_class):
|
||||
name = "test"
|
||||
allowed_domains = ['example.org']
|
||||
rules = (
|
||||
Rule(LinkExtractor(), process_request=process_request_change_domain),
|
||||
)
|
||||
|
||||
with warnings.catch_warnings(record=True) as cw:
|
||||
spider = _CrawlSpider()
|
||||
output = list(spider._requests_to_follow(response))
|
||||
self.assertEqual(len(output), 3)
|
||||
self.assertTrue(all(map(lambda r: isinstance(r, Request), output)))
|
||||
self.assertEqual([r.url for r in output],
|
||||
['http://example.com/somepage/item/12.html',
|
||||
'http://example.com/about.html',
|
||||
'http://example.com/nofollow.html'])
|
||||
self.assertEqual(len(cw), 1)
|
||||
self.assertEqual(cw[0].category, ScrapyDeprecationWarning)
|
||||
|
||||
def test_process_request_with_response(self):
|
||||
|
||||
response = HtmlResponse("http://example.org/somepage/index.html", body=self.test_body)
|
||||
|
||||
def process_request_meta_response_class(request, response):
|
||||
request.meta['response_class'] = response.__class__.__name__
|
||||
return request
|
||||
|
||||
class _CrawlSpider(self.spider_class):
|
||||
name = "test"
|
||||
allowed_domains = ['example.org']
|
||||
rules = (
|
||||
Rule(LinkExtractor(), process_request=process_request_meta_response_class),
|
||||
)
|
||||
|
||||
spider = _CrawlSpider()
|
||||
output = list(spider._requests_to_follow(response))
|
||||
self.assertEqual(len(output), 3)
|
||||
self.assertTrue(all(map(lambda r: isinstance(r, Request), output)))
|
||||
self.assertEqual([r.url for r in output],
|
||||
['http://example.org/somepage/item/12.html',
|
||||
'http://example.org/about.html',
|
||||
'http://example.org/nofollow.html'])
|
||||
self.assertEqual([r.meta['response_class'] for r in output],
|
||||
['HtmlResponse', 'HtmlResponse', 'HtmlResponse'])
|
||||
|
||||
def test_process_request_instance_method(self):
|
||||
|
||||
response = HtmlResponse("http://example.org/somepage/index.html", body=self.test_body)
|
||||
|
||||
class _CrawlSpider(self.spider_class):
|
||||
name = "test"
|
||||
allowed_domains = ['example.org']
|
||||
rules = (
|
||||
Rule(LinkExtractor(), process_request='process_request_upper'),
|
||||
)
|
||||
|
||||
def process_request_upper(self, request):
|
||||
return request.replace(url=request.url.upper())
|
||||
|
||||
with warnings.catch_warnings(record=True) as cw:
|
||||
spider = _CrawlSpider()
|
||||
output = list(spider._requests_to_follow(response))
|
||||
self.assertEqual(len(output), 3)
|
||||
self.assertTrue(all(map(lambda r: isinstance(r, Request), output)))
|
||||
self.assertEqual([r.url for r in output],
|
||||
['http://EXAMPLE.ORG/SOMEPAGE/ITEM/12.HTML',
|
||||
'http://EXAMPLE.ORG/ABOUT.HTML',
|
||||
'http://EXAMPLE.ORG/NOFOLLOW.HTML'])
|
||||
self.assertEqual(len(cw), 1)
|
||||
self.assertEqual(cw[0].category, ScrapyDeprecationWarning)
|
||||
|
||||
def test_process_request_instance_method_with_response(self):
|
||||
|
||||
response = HtmlResponse("http://example.org/somepage/index.html", body=self.test_body)
|
||||
|
||||
class _CrawlSpider(self.spider_class):
|
||||
name = "test"
|
||||
allowed_domains = ['example.org']
|
||||
rules = (
|
||||
Rule(LinkExtractor(), process_request='process_request_meta_response_class'),
|
||||
)
|
||||
|
||||
def process_request_meta_response_class(self, request, response):
|
||||
request.meta['response_class'] = response.__class__.__name__
|
||||
return request
|
||||
|
||||
spider = _CrawlSpider()
|
||||
output = list(spider._requests_to_follow(response))
|
||||
self.assertEqual(len(output), 3)
|
||||
self.assertTrue(all(map(lambda r: isinstance(r, Request), output)))
|
||||
self.assertEqual([r.url for r in output],
|
||||
['http://example.org/somepage/item/12.html',
|
||||
'http://example.org/about.html',
|
||||
'http://example.org/nofollow.html'])
|
||||
self.assertEqual([r.meta['response_class'] for r in output],
|
||||
['HtmlResponse', 'HtmlResponse', 'HtmlResponse'])
|
||||
|
||||
def test_follow_links_attribute_population(self):
|
||||
crawler = get_crawler()
|
||||
|
102
tests/test_spidermiddleware.py
Normal file
@ -0,0 +1,102 @@
|
||||
from twisted.trial.unittest import TestCase
|
||||
from twisted.python.failure import Failure
|
||||
|
||||
from scrapy.spiders import Spider
|
||||
from scrapy.http import Request, Response
|
||||
from scrapy.exceptions import _InvalidOutput
|
||||
from scrapy.utils.test import get_crawler
|
||||
from scrapy.core.spidermw import SpiderMiddlewareManager
|
||||
from tests import mock
|
||||
|
||||
|
||||
class SpiderMiddlewareTestCase(TestCase):
|
||||
|
||||
def setUp(self):
|
||||
self.request = Request('http://example.com/index.html')
|
||||
self.response = Response(self.request.url, request=self.request)
|
||||
self.crawler = get_crawler(Spider)
|
||||
self.spider = self.crawler._create_spider('foo')
|
||||
self.mwman = SpiderMiddlewareManager.from_crawler(self.crawler)
|
||||
|
||||
def _scrape_response(self):
|
||||
"""Execute spider mw manager's scrape_response method and return the result.
|
||||
Raise exception in case of failure.
|
||||
"""
|
||||
scrape_func = mock.MagicMock()
|
||||
dfd = self.mwman.scrape_response(scrape_func, self.response, self.request, self.spider)
|
||||
# catch deferred result and return the value
|
||||
results = []
|
||||
dfd.addBoth(results.append)
|
||||
self._wait(dfd)
|
||||
ret = results[0]
|
||||
return ret
|
||||
|
||||
|
||||
class ProcessSpiderInputInvalidOutput(SpiderMiddlewareTestCase):
|
||||
"""Invalid return value for process_spider_input method"""
|
||||
|
||||
def test_invalid_process_spider_input(self):
|
||||
|
||||
class InvalidProcessSpiderInputMiddleware:
|
||||
def process_spider_input(self, response, spider):
|
||||
return 1
|
||||
|
||||
self.mwman._add_middleware(InvalidProcessSpiderInputMiddleware())
|
||||
result = self._scrape_response()
|
||||
self.assertIsInstance(result, Failure)
|
||||
self.assertIsInstance(result.value, _InvalidOutput)
|
||||
|
||||
|
||||
class ProcessSpiderOutputInvalidOutput(SpiderMiddlewareTestCase):
|
||||
"""Invalid return value for process_spider_output method"""
|
||||
|
||||
def test_invalid_process_spider_output(self):
|
||||
|
||||
class InvalidProcessSpiderOutputMiddleware:
|
||||
def process_spider_output(self, response, result, spider):
|
||||
return 1
|
||||
|
||||
self.mwman._add_middleware(InvalidProcessSpiderOutputMiddleware())
|
||||
result = self._scrape_response()
|
||||
self.assertIsInstance(result, Failure)
|
||||
self.assertIsInstance(result.value, _InvalidOutput)
|
||||
|
||||
|
||||
class ProcessSpiderExceptionInvalidOutput(SpiderMiddlewareTestCase):
|
||||
"""Invalid return value for process_spider_exception method"""
|
||||
|
||||
def test_invalid_process_spider_exception(self):
|
||||
|
||||
class InvalidProcessSpiderOutputExceptionMiddleware:
|
||||
def process_spider_exception(self, response, exception, spider):
|
||||
return 1
|
||||
|
||||
class RaiseExceptionProcessSpiderOutputMiddleware:
|
||||
def process_spider_output(self, response, result, spider):
|
||||
raise Exception()
|
||||
|
||||
self.mwman._add_middleware(InvalidProcessSpiderOutputExceptionMiddleware())
|
||||
self.mwman._add_middleware(RaiseExceptionProcessSpiderOutputMiddleware())
|
||||
result = self._scrape_response()
|
||||
self.assertIsInstance(result, Failure)
|
||||
self.assertIsInstance(result.value, _InvalidOutput)
|
||||
|
||||
|
||||
class ProcessSpiderExceptionReRaise(SpiderMiddlewareTestCase):
|
||||
"""Re raise the exception by returning None"""
|
||||
|
||||
def test_process_spider_exception_return_none(self):
|
||||
|
||||
class ProcessSpiderExceptionReturnNoneMiddleware:
|
||||
def process_spider_exception(self, response, exception, spider):
|
||||
return None
|
||||
|
||||
class RaiseExceptionProcessSpiderOutputMiddleware:
|
||||
def process_spider_output(self, response, result, spider):
|
||||
1/0
|
||||
|
||||
self.mwman._add_middleware(ProcessSpiderExceptionReturnNoneMiddleware())
|
||||
self.mwman._add_middleware(RaiseExceptionProcessSpiderOutputMiddleware())
|
||||
result = self._scrape_response()
|
||||
self.assertIsInstance(result, Failure)
|
||||
self.assertIsInstance(result.value, ZeroDivisionError)
|
380
tests/test_spidermiddleware_output_chain.py
Normal file
@ -0,0 +1,380 @@
from testfixtures import LogCapture
from twisted.trial.unittest import TestCase
from twisted.internet import defer

from scrapy import Spider, Request
from scrapy.utils.test import get_crawler
from tests.mockserver import MockServer
from tests.spiders import MockServerSpider


class LogExceptionMiddleware:
    def process_spider_exception(self, response, exception, spider):
        spider.logger.info('Middleware: %s exception caught', exception.__class__.__name__)
        return None


# ================================================================================
# (0) recover from an exception on a spider callback
class RecoverySpider(Spider):
    name = 'RecoverySpider'
    custom_settings = {
        'SPIDER_MIDDLEWARES': {
            __name__ + '.RecoveryMiddleware': 10,
        },
    }

    def start_requests(self):
        yield Request(self.mockserver.url('/status?n=200'))

    def parse(self, response):
        yield {'test': 1}
        self.logger.info('DONT_FAIL: %s', response.meta.get('dont_fail'))
        if not response.meta.get('dont_fail'):
            raise TabError()


class RecoveryMiddleware:
    def process_spider_exception(self, response, exception, spider):
        spider.logger.info('Middleware: %s exception caught', exception.__class__.__name__)
        return [
            {'from': 'process_spider_exception'},
            Request(response.url, meta={'dont_fail': True}, dont_filter=True),
        ]


# ================================================================================
# (1) exceptions from a spider middleware's process_spider_input method
class FailProcessSpiderInputMiddleware:
    def process_spider_input(self, response, spider):
        spider.logger.info('Middleware: will raise IndexError')
        raise IndexError()


class ProcessSpiderInputSpiderWithoutErrback(Spider):
    name = 'ProcessSpiderInputSpiderWithoutErrback'
    custom_settings = {
        'SPIDER_MIDDLEWARES': {
            # spider
            __name__ + '.LogExceptionMiddleware': 10,
            __name__ + '.FailProcessSpiderInputMiddleware': 8,
            __name__ + '.LogExceptionMiddleware': 6,
            # engine
        }
    }

    def start_requests(self):
        yield Request(url=self.mockserver.url('/status?n=200'), callback=self.parse)

    def parse(self, response):
        return {'from': 'callback'}


class ProcessSpiderInputSpiderWithErrback(ProcessSpiderInputSpiderWithoutErrback):
    name = 'ProcessSpiderInputSpiderWithErrback'

    def start_requests(self):
        yield Request(url=self.mockserver.url('/status?n=200'), callback=self.parse, errback=self.errback)

    def errback(self, failure):
        self.logger.info('Got a Failure on the Request errback')
        return {'from': 'errback'}
# ================================================================================
# (2) exceptions from a spider callback (generator)
class GeneratorCallbackSpider(Spider):
    name = 'GeneratorCallbackSpider'
    custom_settings = {
        'SPIDER_MIDDLEWARES': {
            __name__ + '.LogExceptionMiddleware': 10,
        },
    }

    def start_requests(self):
        yield Request(self.mockserver.url('/status?n=200'))

    def parse(self, response):
        yield {'test': 1}
        yield {'test': 2}
        raise ImportError()


# ================================================================================
# (3) exceptions from a spider callback (not a generator)
class NotGeneratorCallbackSpider(Spider):
    name = 'NotGeneratorCallbackSpider'
    custom_settings = {
        'SPIDER_MIDDLEWARES': {
            __name__ + '.LogExceptionMiddleware': 10,
        },
    }

    def start_requests(self):
        yield Request(self.mockserver.url('/status?n=200'))

    def parse(self, response):
        return [{'test': 1}, {'test': 1/0}]


# ================================================================================
# (4) exceptions from a middleware process_spider_output method (generator)
class GeneratorOutputChainSpider(Spider):
    name = 'GeneratorOutputChainSpider'
    custom_settings = {
        'SPIDER_MIDDLEWARES': {
            __name__ + '.GeneratorFailMiddleware': 10,
            __name__ + '.GeneratorDoNothingAfterFailureMiddleware': 8,
            __name__ + '.GeneratorRecoverMiddleware': 5,
            __name__ + '.GeneratorDoNothingAfterRecoveryMiddleware': 3,
        },
    }

    def start_requests(self):
        yield Request(self.mockserver.url('/status?n=200'))

    def parse(self, response):
        yield {'processed': ['parse-first-item']}
        yield {'processed': ['parse-second-item']}


class _GeneratorDoNothingMiddleware:
    def process_spider_output(self, response, result, spider):
        for r in result:
            r['processed'].append('{}.process_spider_output'.format(self.__class__.__name__))
            yield r

    def process_spider_exception(self, response, exception, spider):
        method = '{}.process_spider_exception'.format(self.__class__.__name__)
        spider.logger.info('%s: %s caught', method, exception.__class__.__name__)
        return None


class GeneratorFailMiddleware:
    def process_spider_output(self, response, result, spider):
        for r in result:
            r['processed'].append('{}.process_spider_output'.format(self.__class__.__name__))
            yield r
            raise LookupError()

    def process_spider_exception(self, response, exception, spider):
        method = '{}.process_spider_exception'.format(self.__class__.__name__)
        spider.logger.info('%s: %s caught', method, exception.__class__.__name__)
        yield {'processed': [method]}


class GeneratorDoNothingAfterFailureMiddleware(_GeneratorDoNothingMiddleware):
    pass


class GeneratorRecoverMiddleware:
    def process_spider_output(self, response, result, spider):
        for r in result:
            r['processed'].append('{}.process_spider_output'.format(self.__class__.__name__))
            yield r

    def process_spider_exception(self, response, exception, spider):
        method = '{}.process_spider_exception'.format(self.__class__.__name__)
        spider.logger.info('%s: %s caught', method, exception.__class__.__name__)
        yield {'processed': [method]}


class GeneratorDoNothingAfterRecoveryMiddleware(_GeneratorDoNothingMiddleware):
    pass


# ================================================================================
# (5) exceptions from a middleware process_spider_output method (not generator)
class NotGeneratorOutputChainSpider(Spider):
    name = 'NotGeneratorOutputChainSpider'
    custom_settings = {
        'SPIDER_MIDDLEWARES': {
            __name__ + '.NotGeneratorFailMiddleware': 10,
            __name__ + '.NotGeneratorDoNothingAfterFailureMiddleware': 8,
            __name__ + '.NotGeneratorRecoverMiddleware': 5,
            __name__ + '.NotGeneratorDoNothingAfterRecoveryMiddleware': 3,
        },
    }

    def start_requests(self):
        return [Request(self.mockserver.url('/status?n=200'))]

    def parse(self, response):
        return [{'processed': ['parse-first-item']}, {'processed': ['parse-second-item']}]


class _NotGeneratorDoNothingMiddleware:
    def process_spider_output(self, response, result, spider):
        out = []
        for r in result:
            r['processed'].append('{}.process_spider_output'.format(self.__class__.__name__))
            out.append(r)
        return out

    def process_spider_exception(self, response, exception, spider):
        method = '{}.process_spider_exception'.format(self.__class__.__name__)
        spider.logger.info('%s: %s caught', method, exception.__class__.__name__)
        return None


class NotGeneratorFailMiddleware:
    def process_spider_output(self, response, result, spider):
        out = []
        for r in result:
            r['processed'].append('{}.process_spider_output'.format(self.__class__.__name__))
            out.append(r)
        raise ReferenceError()
        return out

    def process_spider_exception(self, response, exception, spider):
        method = '{}.process_spider_exception'.format(self.__class__.__name__)
        spider.logger.info('%s: %s caught', method, exception.__class__.__name__)
        return [{'processed': [method]}]


class NotGeneratorDoNothingAfterFailureMiddleware(_NotGeneratorDoNothingMiddleware):
    pass


class NotGeneratorRecoverMiddleware:
    def process_spider_output(self, response, result, spider):
        out = []
        for r in result:
            r['processed'].append('{}.process_spider_output'.format(self.__class__.__name__))
            out.append(r)
        return out

    def process_spider_exception(self, response, exception, spider):
        method = '{}.process_spider_exception'.format(self.__class__.__name__)
        spider.logger.info('%s: %s caught', method, exception.__class__.__name__)
        return [{'processed': [method]}]


class NotGeneratorDoNothingAfterRecoveryMiddleware(_NotGeneratorDoNothingMiddleware):
    pass
# ================================================================================
class TestSpiderMiddleware(TestCase):
    @classmethod
    def setUpClass(cls):
        cls.mockserver = MockServer()
        cls.mockserver.__enter__()

    @classmethod
    def tearDownClass(cls):
        cls.mockserver.__exit__(None, None, None)

    @defer.inlineCallbacks
    def crawl_log(self, spider):
        crawler = get_crawler(spider)
        with LogCapture() as log:
            yield crawler.crawl(mockserver=self.mockserver)
        defer.returnValue(log)

    @defer.inlineCallbacks
    def test_recovery(self):
        """
        (0) Recover from an exception in a spider callback. The final item count should be 3
        (one yielded from the callback method before the exception is raised, one directly
        from the recovery middleware and one from the spider when processing the request that
        was enqueued from the recovery middleware)
        """
        log = yield self.crawl_log(RecoverySpider)
        self.assertIn("Middleware: TabError exception caught", str(log))
        self.assertEqual(str(log).count("Middleware: TabError exception caught"), 1)
        self.assertIn("'item_scraped_count': 3", str(log))

    @defer.inlineCallbacks
    def test_process_spider_input_without_errback(self):
        """
        (1.1) An exception from the process_spider_input chain should be caught by the
        process_spider_exception chain from the start if the Request has no errback
        """
        log1 = yield self.crawl_log(ProcessSpiderInputSpiderWithoutErrback)
        self.assertIn("Middleware: will raise IndexError", str(log1))
        self.assertIn("Middleware: IndexError exception caught", str(log1))

    @defer.inlineCallbacks
    def test_process_spider_input_with_errback(self):
        """
        (1.2) An exception from the process_spider_input chain should not be caught by the
        process_spider_exception chain if the Request has an errback
        """
        log1 = yield self.crawl_log(ProcessSpiderInputSpiderWithErrback)
        self.assertNotIn("Middleware: IndexError exception caught", str(log1))
        self.assertIn("Middleware: will raise IndexError", str(log1))
        self.assertIn("Got a Failure on the Request errback", str(log1))
        self.assertIn("{'from': 'errback'}", str(log1))
        self.assertNotIn("{'from': 'callback'}", str(log1))
        self.assertIn("'item_scraped_count': 1", str(log1))

    @defer.inlineCallbacks
    def test_generator_callback(self):
        """
        (2) An exception from a spider callback (returning a generator) should
        be caught by the process_spider_exception chain. Items yielded before the
        exception is raised should be processed normally.
        """
        log2 = yield self.crawl_log(GeneratorCallbackSpider)
        self.assertIn("Middleware: ImportError exception caught", str(log2))
        self.assertIn("'item_scraped_count': 2", str(log2))

    @defer.inlineCallbacks
    def test_not_a_generator_callback(self):
        """
        (3) An exception from a spider callback (returning a list) should
        be caught by the process_spider_exception chain. No items should be processed.
        """
        log3 = yield self.crawl_log(NotGeneratorCallbackSpider)
        self.assertIn("Middleware: ZeroDivisionError exception caught", str(log3))
        self.assertNotIn("item_scraped_count", str(log3))

    @defer.inlineCallbacks
    def test_generator_output_chain(self):
        """
        (4) An exception from a middleware's process_spider_output method should be sent
        to the process_spider_exception method from the next middleware in the chain.
        The result of the recovery by the process_spider_exception method should be handled
        by the process_spider_output method from the next middleware.
        The final item count should be 2 (one from the spider callback and one from the
        process_spider_exception chain)
        """
        log4 = yield self.crawl_log(GeneratorOutputChainSpider)
        self.assertIn("'item_scraped_count': 2", str(log4))
        self.assertIn("GeneratorRecoverMiddleware.process_spider_exception: LookupError caught", str(log4))
        self.assertIn("GeneratorDoNothingAfterFailureMiddleware.process_spider_exception: LookupError caught", str(log4))
        self.assertNotIn("GeneratorFailMiddleware.process_spider_exception: LookupError caught", str(log4))
        self.assertNotIn("GeneratorDoNothingAfterRecoveryMiddleware.process_spider_exception: LookupError caught", str(log4))
        item_from_callback = {'processed': [
            'parse-first-item',
            'GeneratorFailMiddleware.process_spider_output',
            'GeneratorDoNothingAfterFailureMiddleware.process_spider_output',
            'GeneratorRecoverMiddleware.process_spider_output',
            'GeneratorDoNothingAfterRecoveryMiddleware.process_spider_output']}
        item_recovered = {'processed': [
            'GeneratorRecoverMiddleware.process_spider_exception',
            'GeneratorDoNothingAfterRecoveryMiddleware.process_spider_output']}
        self.assertIn(str(item_from_callback), str(log4))
        self.assertIn(str(item_recovered), str(log4))
        self.assertNotIn('parse-second-item', str(log4))

    @defer.inlineCallbacks
    def test_not_a_generator_output_chain(self):
        """
        (5) An exception from a middleware's process_spider_output method should be sent
        to the process_spider_exception method from the next middleware in the chain.
        The result of the recovery by the process_spider_exception method should be handled
        by the process_spider_output method from the next middleware.
        The final item count should be 1 (from the process_spider_exception chain, the items
        from the spider callback are lost)
        """
        log5 = yield self.crawl_log(NotGeneratorOutputChainSpider)
        self.assertIn("'item_scraped_count': 1", str(log5))
        self.assertIn("NotGeneratorRecoverMiddleware.process_spider_exception: ReferenceError caught", str(log5))
        self.assertIn("NotGeneratorDoNothingAfterFailureMiddleware.process_spider_exception: ReferenceError caught", str(log5))
        self.assertNotIn("NotGeneratorFailMiddleware.process_spider_exception: ReferenceError caught", str(log5))
        self.assertNotIn("NotGeneratorDoNothingAfterRecoveryMiddleware.process_spider_exception: ReferenceError caught", str(log5))
        item_recovered = {'processed': [
            'NotGeneratorRecoverMiddleware.process_spider_exception',
            'NotGeneratorDoNothingAfterRecoveryMiddleware.process_spider_output']}
        self.assertIn(str(item_recovered), str(log5))
        self.assertNotIn('parse-first-item', str(log5))
        self.assertNotIn('parse-second-item', str(log5))
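Taken together, the tests above pin down the contract a user-written spider middleware can rely on: an exception raised in a spider callback or in a later middleware's process_spider_output is offered to the process_spider_exception hooks of the remaining middlewares, and returning an iterable from that hook stops the exception and resumes the output chain. As a rough illustration of how a project might use that contract (the class name, settings entry and item fields below are hypothetical, not part of this patch):

class ErrorItemMiddleware:
    # Hypothetical example: turn unhandled callback errors into placeholder
    # items instead of silently losing the response. Enabled via e.g.
    # SPIDER_MIDDLEWARES = {'myproject.middlewares.ErrorItemMiddleware': 10}

    def process_spider_exception(self, response, exception, spider):
        spider.logger.warning('Recovering from %s while parsing %s',
                              exception.__class__.__name__, response.url)
        # Returning an iterable (rather than None) marks the exception as
        # handled and feeds these results to the remaining middlewares.
        return [{'error': exception.__class__.__name__, 'url': response.url}]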
@ -3,12 +3,13 @@ import os
import unittest

from scrapy.item import Item, Field
from scrapy.utils.misc import arg_to_iter, create_instance, load_object, walk_modules
from scrapy.utils.misc import arg_to_iter, create_instance, load_object, set_environ, walk_modules

from tests import mock

__doctests__ = ['scrapy.utils.misc']


class UtilsMiscTestCase(unittest.TestCase):

    def test_load_object(self):
@ -130,5 +131,18 @@ class UtilsMiscTestCase(unittest.TestCase):
        with self.assertRaises(ValueError):
            create_instance(m, None, None)

    def test_set_environ(self):
        assert os.environ.get('some_test_environ') is None
        with set_environ(some_test_environ='test_value'):
            assert os.environ.get('some_test_environ') == 'test_value'
        assert os.environ.get('some_test_environ') is None

        os.environ['some_test_environ'] = 'test'
        assert os.environ.get('some_test_environ') == 'test'
        with set_environ(some_test_environ='test_value'):
            assert os.environ.get('some_test_environ') == 'test_value'
        assert os.environ.get('some_test_environ') == 'test'


if __name__ == "__main__":
    unittest.main()
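For context, set_environ is a context manager in scrapy.utils.misc that applies environment-variable overrides only for the duration of the with block and then restores the previous state, which is exactly what the test above exercises. A minimal usage sketch (the variable name is made up for illustration):

import os

from scrapy.utils.misc import set_environ

assert 'EXAMPLE_VAR' not in os.environ  # assumption for this sketch
with set_environ(EXAMPLE_VAR='1'):
    assert os.environ['EXAMPLE_VAR'] == '1'
assert 'EXAMPLE_VAR' not in os.environ  # previous state restored on exit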
@ -9,11 +9,23 @@ import six
from scrapy.utils.python import (
    memoizemethod_noargs, binary_is_text, equal_attributes,
    WeakKeyCache, stringify_dict, get_func_args, to_bytes, to_unicode,
    without_none_values)
    without_none_values, MutableChain)

__doctests__ = ['scrapy.utils.python']


class MutableChainTest(unittest.TestCase):
    def test_mutablechain(self):
        m = MutableChain(range(2), [2, 3], (4, 5))
        m.extend(range(6, 7))
        m.extend([7, 8])
        m.extend([9, 10], (11, 12))
        self.assertEqual(next(m), 0)
        self.assertEqual(m.next(), 1)
        self.assertEqual(m.__next__(), 2)
        self.assertEqual(list(m), list(range(3, 13)))


class ToUnicodeTest(unittest.TestCase):
    def test_converting_an_utf8_encoded_string_to_unicode(self):
        self.assertEqual(to_unicode(b'lel\xc3\xb1e'), u'lel\xf1e')
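MutableChain, added to scrapy.utils.python, behaves like itertools.chain but also accepts further iterables via extend() while it is being consumed; it appears to be the building block that lets results recovered by process_spider_exception be appended to the spider output chain. A short illustrative sketch:

from scrapy.utils.python import MutableChain

chain = MutableChain([1, 2], (3,))
assert next(chain) == 1
chain.extend([4, 5])  # more iterables can be appended after iteration has started
assert list(chain) == [2, 3, 4, 5]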
@ -1,9 +1,12 @@
# -*- coding: utf-8 -*-
import unittest
import sys

import six

from scrapy.http import Request, FormRequest
from scrapy.spiders import Spider
from scrapy.utils.reqser import request_to_dict, request_from_dict
from scrapy.utils.reqser import request_to_dict, request_from_dict, _is_private_method, _mangle_private_name


class RequestSerializationTest(unittest.TestCase):
@ -70,6 +73,56 @@ class RequestSerializationTest(unittest.TestCase):
                    errback=self.spider.handle_error)
        self._assert_serializes_ok(r, spider=self.spider)

    def test_private_callback_serialization(self):
        r = Request("http://www.example.com",
                    callback=self.spider._TestSpider__parse_item_private,
                    errback=self.spider.handle_error)
        self._assert_serializes_ok(r, spider=self.spider)

    def test_mixin_private_callback_serialization(self):
        if sys.version_info[0] < 3:
            return
        r = Request("http://www.example.com",
                    callback=self.spider._TestSpiderMixin__mixin_callback,
                    errback=self.spider.handle_error)
        self._assert_serializes_ok(r, spider=self.spider)

    def test_private_callback_name_matching(self):
        self.assertTrue(_is_private_method('__a'))
        self.assertTrue(_is_private_method('__a_'))
        self.assertTrue(_is_private_method('__a_a'))
        self.assertTrue(_is_private_method('__a_a_'))
        self.assertTrue(_is_private_method('__a__a'))
        self.assertTrue(_is_private_method('__a__a_'))
        self.assertTrue(_is_private_method('__a___a'))
        self.assertTrue(_is_private_method('__a___a_'))
        self.assertTrue(_is_private_method('___a'))
        self.assertTrue(_is_private_method('___a_'))
        self.assertTrue(_is_private_method('___a_a'))
        self.assertTrue(_is_private_method('___a_a_'))
        self.assertTrue(_is_private_method('____a_a_'))

        self.assertFalse(_is_private_method('_a'))
        self.assertFalse(_is_private_method('_a_'))
        self.assertFalse(_is_private_method('__a__'))
        self.assertFalse(_is_private_method('__'))
        self.assertFalse(_is_private_method('___'))
        self.assertFalse(_is_private_method('____'))

    def _assert_mangles_to(self, obj, name):
        func = getattr(obj, name)
        self.assertEqual(
            _mangle_private_name(obj, func, func.__name__),
            name
        )

    def test_private_name_mangling(self):
        self._assert_mangles_to(
            self.spider, '_TestSpider__parse_item_private')
        if sys.version_info[0] >= 3:
            self._assert_mangles_to(
                self.spider, '_TestSpiderMixin__mixin_callback')

    def test_unserializable_callback1(self):
        r = Request("http://www.example.com", callback=lambda x: x)
        self.assertRaises(ValueError, request_to_dict, r)
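The _is_private_method and _mangle_private_name helpers deal with Python's name mangling: an identifier with at least two leading underscores and at most one trailing underscore, when defined inside a class body, is stored as _ClassName__name, so a private callback has to be looked up under its mangled name when a request is deserialized. A quick refresher, independent of Scrapy (class and method names are illustrative):

class ExampleSpider:
    def __parse_secret(self, response):  # stored as _ExampleSpider__parse_secret
        pass

assert '_ExampleSpider__parse_secret' in ExampleSpider.__dict__
assert '__parse_secret' not in ExampleSpider.__dict__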
@ -80,7 +133,12 @@ class RequestSerializationTest(unittest.TestCase):
        self.assertRaises(ValueError, request_to_dict, r)


class TestSpider(Spider):
class TestSpiderMixin(object):
    def __mixin_callback(self, response):
        pass


class TestSpider(Spider, TestSpiderMixin):
    name = 'test'

    def parse_item(self, response):
@ -89,6 +147,9 @@ class TestSpider(Spider):
    def handle_error(self, failure):
        pass

    def __parse_item_private(self, response):
        pass


class CustomRequest(Request):
    pass
6
tox.ini
@ -105,6 +105,12 @@ deps = {[docs]deps}
commands =
    sphinx-build -W -b html . {envtmpdir}/html

[testenv:docs-coverage]
changedir = {[docs]changedir}
deps = {[docs]deps}
commands =
    sphinx-build -b coverage . {envtmpdir}/coverage

[testenv:docs-links]
changedir = {[docs]changedir}
deps = {[docs]deps}