Merge remote-tracking branch 'origin/master' into callback_kwargs
commit 428309ba1a
@@ -12,7 +12,8 @@ branches:
 
 install:
   - "SET PATH=%PYTHON%;%PYTHON%\\Scripts;%PATH%"
-  - "SET TOX_TESTENV_PASSENV=HOME USERPROFILE HOMEPATH HOMEDRIVE"
+  - "SET PYTHONPATH=%APPVEYOR_BUILD_FOLDER%"
+  - "SET TOX_TESTENV_PASSENV=HOME HOMEDRIVE HOMEPATH PYTHONPATH USERPROFILE"
   - "pip install -U tox"
 
 build: false
@@ -82,6 +82,9 @@ pydoc-topics: build
     @echo "Building finished; now copy build/pydoc-topics/pydoc_topics.py " \
          "into the Lib/ directory"
 
+coverage: BUILDER = coverage
+coverage: build
+
 htmlview: html
     $(PYTHON) -c "import webbrowser, os; webbrowser.open('file://' + \
          os.path.realpath('build/html/index.html'))"
docs/conf.py
@@ -28,7 +28,8 @@ sys.path.insert(0, path.dirname(path.dirname(__file__)))
 # coming with Sphinx (named 'sphinx.ext.*') or your custom ones.
 extensions = [
     'scrapydocs',
-    'sphinx.ext.autodoc'
+    'sphinx.ext.autodoc',
+    'sphinx.ext.coverage',
 ]
 
 # Add any paths that contain templates here, relative to this directory.
@@ -218,3 +219,22 @@ linkcheck_ignore = [
     'http://localhost:\d+', 'http://hg.scrapy.org',
     'http://directory.google.com/'
 ]
+
+
+# Options for the Coverage extension
+# ----------------------------------
+coverage_ignore_pyobjects = [
+    # Contract’s add_pre_hook and add_post_hook are not documented because
+    # they should be transparent to contract developers, for whom pre_hook and
+    # post_hook should be the actual concern.
+    r'\bContract\.add_(pre|post)_hook$',
+
+    # ContractsManager is an internal class, developers are not expected to
+    # interact with it directly in any way.
+    r'\bContractsManager\b$',
+
+    # For default contracts we only want to document their general purpose in
+    # their constructor, the methods they reimplement to achieve that purpose
+    # should be irrelevant to developers using those contracts.
+    r'\w+Contract\.(adjust_request_args|(pre|post)_process)$',
+]
@@ -99,6 +99,15 @@ Well-written patches should:
   the documentation changes in the same patch. See `Documentation policies`_
   below.
 
+* if you're adding a private API, please add a regular expression to the
+  ``coverage_ignore_pyobjects`` variable of ``docs/conf.py`` to exclude the new
+  private API from documentation coverage checks.
+
+  To see if your private API is skipped properly, generate a documentation
+  coverage report as follows::
+
+      tox -e docs-coverage
+
 .. _submitting-patches:
 
 Submitting patches
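
Illustrative sketch (not part of the diff above): a private helper could be excluded from the documentation coverage report with an entry along these lines in ``docs/conf.py``; the class and method names here are purely hypothetical::

    coverage_ignore_pyobjects = [
        # Hypothetical private helper, hidden from documentation coverage checks
        r'\bMySpiderHelper\._parse_raw_feed$',
    ]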
@@ -167,8 +176,9 @@ Documentation policies
 
 For reference documentation of API members (classes, methods, etc.) use
 docstrings and make sure that the Sphinx documentation uses the autodoc_
-extension to pull the docstrings. API reference documentation should be
-IDE-friendly: short, to the point, and it may provide short examples.
+extension to pull the docstrings. API reference documentation should follow
+docstring conventions (`PEP 257`_) and be IDE-friendly: short, to the point,
+and it may provide short examples.
 
 Other types of documentation, such as tutorials or topics, should be covered in
 files within the ``docs/`` directory. This includes documentation that is
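
Illustrative sketch (not part of the diff above) of the docstring style the new policy describes, following `PEP 257`_: a one-line summary, a blank line, and a short example; the method shown is only an assumed, simplified signature::

    def get(self, default=None):
        """Return the first extracted value, or *default* if there is none.

        >>> Selector(text='<a>x</a>').css('a::text').get()
        'x'
        """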
@@ -205,6 +215,29 @@ To run a specific test (say ``tests/test_loader.py``) use:
 
 ``tox -- tests/test_loader.py``
 
+To run the tests on a specific tox_ environment, use ``-e <name>`` with an
+environment name from ``tox.ini``. For example, to run the tests with Python
+3.6 use::
+
+    tox -e py36
+
+You can also specify a comma-separated list of environmets, and use `tox’s
+parallel mode`_ to run the tests on multiple environments in parallel::
+
+    tox -e py27,py36 -p auto
+
+To pass command-line options to pytest_, add them after ``--`` in your call to
+tox_. Using ``--`` overrides the default positional arguments defined in
+``tox.ini``, so you must include those default positional arguments
+(``scrapy tests``) after ``--`` as well::
+
+    tox -- scrapy tests -x  # stop after first failure
+
+You can also use the `pytest-xdist`_ plugin. For example, to run all tests on
+the Python 3.6 tox_ environment using all your CPU cores::
+
+    tox -e py36 -- scrapy tests -n auto
+
 To see coverage report install `coverage`_ (``pip install coverage``) and run:
 
 ``coverage report``
@ -237,5 +270,9 @@ And their unit-tests are in::
|
|||||||
.. _AUTHORS: https://github.com/scrapy/scrapy/blob/master/AUTHORS
|
.. _AUTHORS: https://github.com/scrapy/scrapy/blob/master/AUTHORS
|
||||||
.. _tests/: https://github.com/scrapy/scrapy/tree/master/tests
|
.. _tests/: https://github.com/scrapy/scrapy/tree/master/tests
|
||||||
.. _open issues: https://github.com/scrapy/scrapy/issues
|
.. _open issues: https://github.com/scrapy/scrapy/issues
|
||||||
.. _pull request: https://help.github.com/send-pull-requests/
|
.. _PEP 257: https://www.python.org/dev/peps/pep-0257/
|
||||||
|
.. _pull request: https://help.github.com/en/articles/creating-a-pull-request
|
||||||
|
.. _pytest: https://docs.pytest.org/en/latest/usage.html
|
||||||
|
.. _pytest-xdist: https://docs.pytest.org/en/3.0.0/xdist.html
|
||||||
.. _tox: https://pypi.python.org/pypi/tox
|
.. _tox: https://pypi.python.org/pypi/tox
|
||||||
|
.. _tox’s parallel mode: https://tox.readthedocs.io/en/latest/example/basic.html#parallel-mode
|
||||||
|
@@ -158,6 +158,7 @@ Solving specific problems
    topics/practices
    topics/broad-crawls
    topics/developer-tools
+   topics/dynamic-content
    topics/leaks
    topics/media-pipeline
    topics/deploy
@@ -183,6 +184,9 @@ Solving specific problems
 :doc:`topics/developer-tools`
     Learn how to scrape with your browser's developer tools.
 
+:doc:`topics/dynamic-content`
+    Read webpage data that is loaded dynamically.
+
 :doc:`topics/leaks`
     Learn how to find and get rid of memory leaks in your crawler.
 
@@ -205,7 +205,7 @@ Extracting data
 ---------------
 
 The best way to learn how to extract data with Scrapy is trying selectors
-using the shell :ref:`Scrapy shell <topics-shell>`. Run::
+using the :ref:`Scrapy shell <topics-shell>`. Run::
 
     scrapy shell 'http://quotes.toscrape.com/page/1/'
 
@@ -296,8 +296,8 @@ expressions`_::
 
 In order to find the proper CSS selectors to use, you might find useful opening
 the response page from the shell in your web browser using ``view(response)``.
-You can use your browser developer tools to inspect the HTML and come up
-with a selector (see section about :ref:`topics-developer-tools`).
+You can use your browser's developer tools to inspect the HTML and come up
+with a selector (see :ref:`topics-developer-tools`).
 
 `Selector Gadget`_ is also a nice tool to quickly find CSS selector for
 visually selected elements, which works in many browsers.
@@ -379,11 +379,11 @@ variable, so that we can run our CSS selectors directly on a particular quote::
 
     >>> quote = response.css("div.quote")[0]
 
-Now, let's extract ``title``, ``author`` and the ``tags`` from that quote
+Now, let's extract ``text``, ``author`` and the ``tags`` from that quote
 using the ``quote`` object we just created::
 
-    >>> title = quote.css("span.text::text").get()
-    >>> title
+    >>> text = quote.css("span.text::text").get()
+    >>> text
     '“The world as we have created it is a process of our thinking. It cannot be changed without changing our thinking.”'
     >>> author = quote.css("small.author::text").get()
    >>> author
@@ -511,7 +511,7 @@ We can try extracting it in the shell::
     '<a href="/page/2/">Next <span aria-hidden="true">→</span></a>'
 
 This gets the anchor element, but we want the attribute ``href``. For that,
-Scrapy supports a CSS extension that let's you select the attribute contents,
+Scrapy supports a CSS extension that lets you select the attribute contents,
 like this::
 
     >>> response.css('li.next a::attr(href)').get()
@@ -1,2 +1,2 @@
-Sphinx>=1.6
+Sphinx>=2.1
 sphinx_rtd_theme
@@ -99,6 +99,8 @@ how you :ref:`configure the downloader middlewares
 
         Returns a deferred that is fired when the crawl is finished.
 
+    .. automethod:: stop
+
 .. autoclass:: CrawlerRunner
    :members:
 
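
Illustrative sketch (not part of the diff above) of the ``CrawlerProcess`` API documented in this file; ``MySpider`` is a placeholder spider class, not something defined in this commit::

    from scrapy.crawler import CrawlerProcess

    process = CrawlerProcess(settings={'LOG_LEVEL': 'INFO'})
    process.crawl(MySpider)
    process.start()  # blocks until the crawl finishes; process.stop() ends it early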
@@ -154,7 +156,7 @@ Settings API
 SpiderLoader API
 ================
 
-.. module:: scrapy.loader
+.. module:: scrapy.spiderloader
    :synopsis: The spider loader
 
 .. class:: SpiderLoader
@@ -39,6 +39,17 @@ you need to keep in mind when using Scrapy for doing broad crawls, along with
 concrete suggestions of Scrapy settings to tune in order to achieve an
 efficient broad crawl.
 
+Use the right :setting:`SCHEDULER_PRIORITY_QUEUE`
+=================================================
+
+Scrapy’s default scheduler priority queue is ``'scrapy.pqueues.ScrapyPriorityQueue'``.
+It works best during single-domain crawl. It does not work well with crawling
+many different domains in parallel
+
+To apply the recommended priority queue use::
+
+    SCHEDULER_PRIORITY_QUEUE = 'scrapy.pqueues.DownloaderAwarePriorityQueue'
+
 Increase concurrency
 ====================
 
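
Illustrative sketch (not part of the diff above): the recommended value goes into the project settings module, for example::

    # settings.py (assumed project settings module)
    SCHEDULER_PRIORITY_QUEUE = 'scrapy.pqueues.DownloaderAwarePriorityQueue'
    CONCURRENT_REQUESTS = 100  # typical broad-crawl tuning; adjust to your resources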
@@ -120,3 +120,23 @@ get the failures pretty printed::
         for header in self.args:
             if header not in response.headers:
                 raise ContractFail('X-CustomHeader not present')
+
+
+Detecting check runs
+====================
+
+When ``scrapy check`` is running, the ``SCRAPY_CHECK`` environment variable is
+set to the ``true`` string. You can use `os.environ`_ to perform any change to
+your spiders or your settings when ``scrapy check`` is used::
+
+    import os
+    import scrapy
+
+    class ExampleSpider(scrapy.Spider):
+        name = 'example'
+
+        def __init__(self):
+            if os.environ.get('SCRAPY_CHECK'):
+                pass  # Do some scraper adjustments when a check is running
+
+.. _os.environ: https://docs.python.org/3/library/os.html#os.environ
@@ -805,6 +805,7 @@ The :class:`MetaRefreshMiddleware` can be configured through the following
 settings (see the settings documentation for more info):
 
 * :setting:`METAREFRESH_ENABLED`
+* :setting:`METAREFRESH_IGNORE_TAGS`
 * :setting:`METAREFRESH_MAXDELAY`
 
 This middleware obey :setting:`REDIRECT_MAX_TIMES` setting, :reqmeta:`dont_redirect`,
@@ -826,6 +827,15 @@ Default: ``True``
 
 Whether the Meta Refresh middleware will be enabled.
 
+.. setting:: METAREFRESH_IGNORE_TAGS
+
+METAREFRESH_IGNORE_TAGS
+^^^^^^^^^^^^^^^^^^^^^^^
+
+Default: ``['script', 'noscript']``
+
+Meta tags within these tags are ignored.
+
 .. setting:: METAREFRESH_MAXDELAY
 
 METAREFRESH_MAXDELAY
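
Illustrative sketch (not part of the diff above): overriding the new setting with an empty list makes the middleware honour meta refresh tags found anywhere in the document::

    # settings.py (assumed project settings module)
    METAREFRESH_IGNORE_TAGS = []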
docs/topics/dynamic-content.rst (new file, 246 lines)
@@ -0,0 +1,246 @@
+.. _topics-dynamic-content:
+
+====================================
+Selecting dynamically-loaded content
+====================================
+
+Some webpages show the desired data when you load them in a web browser.
+However, when you download them using Scrapy, you cannot reach the desired data
+using :ref:`selectors <topics-selectors>`.
+
+When this happens, the recommended approach is to
+:ref:`find the data source <topics-finding-data-source>` and extract the data
+from it.
+
+If you fail to do that, and you can nonetheless access the desired data through
+the :ref:`DOM <topics-livedom>` from your web browser, see
+:ref:`topics-javascript-rendering`.
+
+.. _topics-finding-data-source:
+
+Finding the data source
+=======================
+
+To extract the desired data, you must first find its source location.
+
+If the data is in a non-text-based format, such as an image or a PDF document,
+use the :ref:`network tool <topics-network-tool>` of your web browser to find
+the corresponding request, and :ref:`reproduce it
+<topics-reproducing-requests>`.
+
+If your web browser lets you select the desired data as text, the data may be
+defined in embedded JavaScript code, or loaded from an external resource in a
+text-based format.
+
+In that case, you can use a tool like wgrep_ to find the URL of that resource.
+
+If the data turns out to come from the original URL itself, you must
+:ref:`inspect the source code of the webpage <topics-inspecting-source>` to
+determine where the data is located.
+
+If the data comes from a different URL, you will need to :ref:`reproduce the
+corresponding request <topics-reproducing-requests>`.
+
+.. _topics-inspecting-source:
+
+Inspecting the source code of a webpage
+=======================================
+
+Sometimes you need to inspect the source code of a webpage (not the
+:ref:`DOM <topics-livedom>`) to determine where some desired data is located.
+
+Use Scrapy’s :command:`fetch` command to download the webpage contents as seen
+by Scrapy::
+
+    scrapy fetch --nolog https://example.com > response.html
+
+If the desired data is in embedded JavaScript code within a ``<script/>``
+element, see :ref:`topics-parsing-javascript`.
+
+If you cannot find the desired data, first make sure it’s not just Scrapy:
+download the webpage with an HTTP client like curl_ or wget_ and see if the
+information can be found in the response they get.
+
+If they get a response with the desired data, modify your Scrapy
+:class:`~scrapy.http.Request` to match that of the other HTTP client. For
+example, try using the same user-agent string (:setting:`USER_AGENT`) or the
+same :attr:`~scrapy.http.Request.headers`.
+
+If they also get a response without the desired data, you’ll need to take
+steps to make your request more similar to that of the web browser. See
+:ref:`topics-reproducing-requests`.
+
+.. _topics-reproducing-requests:
+
+Reproducing requests
+====================
+
+Sometimes we need to reproduce a request the way our web browser performs it.
+
+Use the :ref:`network tool <topics-network-tool>` of your web browser to see
+how your web browser performs the desired request, and try to reproduce that
+request with Scrapy.
+
+It might be enough to yield a :class:`~scrapy.http.Request` with the same HTTP
+method and URL. However, you may also need to reproduce the body, headers and
+form parameters (see :class:`~scrapy.http.FormRequest`) of that request.
+
+Once you get the expected response, you can :ref:`extract the desired data from
+it <topics-handling-response-formats>`.
+
+You can reproduce any request with Scrapy. However, some times reproducing all
+necessary requests may not seem efficient in developer time. If that is your
+case, and crawling speed is not a major concern for you, you can alternatively
+consider :ref:`JavaScript pre-rendering <topics-javascript-rendering>`.
+
+If you get the expected response `sometimes`, but not always, the issue is
+probably not your request, but the target server. The target server might be
+buggy, overloaded, or :ref:`banning <bans>` some of your requests.
+
+.. _topics-handling-response-formats:
+
+Handling different response formats
+===================================
+
+Once you have a response with the desired data, how you extract the desired
+data from it depends on the type of response:
+
+- If the response is HTML or XML, use :ref:`selectors
+  <topics-selectors>` as usual.
+
+- If the response is JSON, use `json.loads`_ to load the desired data from
+  :attr:`response.text <scrapy.http.TextResponse.text>`::
+
+      data = json.loads(response.text)
+
+  If the desired data is inside HTML or XML code embedded within JSON data,
+  you can load that HTML or XML code into a
+  :class:`~scrapy.selector.Selector` and then
+  :ref:`use it <topics-selectors>` as usual::
+
+      selector = Selector(data['html'])
+
+- If the response is JavaScript, or HTML with a ``<script/>`` element
+  containing the desired data, see :ref:`topics-parsing-javascript`.
+
+- If the response is CSS, use a `regular expression`_ to extract the desired
+  data from :attr:`response.text <scrapy.http.TextResponse.text>`.
+
+.. _topics-parsing-images:
+
+- If the response is an image or another format based on images (e.g. PDF),
+  read the response as bytes from
+  :attr:`response.body <scrapy.http.TextResponse.body>` and use an OCR
+  solution to extract the desired data as text.
+
+  For example, you can use pytesseract_. To read a table from a PDF,
+  `tabula-py`_ may be a better choice.
+
+- If the response is SVG, or HTML with embedded SVG containing the desired
+  data, you may be able to extract the desired data using
+  :ref:`selectors <topics-selectors>`, since SVG is based on XML.
+
+  Otherwise, you might need to convert the SVG code into a raster image, and
+  :ref:`handle that raster image <topics-parsing-images>`.
+
+.. _topics-parsing-javascript:
+
+Parsing JavaScript code
+=======================
+
+If the desired data is hardcoded in JavaScript, you first need to get the
+JavaScript code:
+
+- If the JavaScript code is in a JavaScript file, simply read
+  :attr:`response.text <scrapy.http.TextResponse.text>`.
+
+- If the JavaScript code is within a ``<script/>`` element of an HTML page,
+  use :ref:`selectors <topics-selectors>` to extract the text within that
+  ``<script/>`` element.
+
+Once you have a string with the JavaScript code, you can extract the desired
+data from it:
+
+- You might be able to use a `regular expression`_ to extract the desired
+  data in JSON format, which you can then parse with `json.loads`_.
+
+  For example, if the JavaScript code contains a separate line like
+  ``var data = {"field": "value"};`` you can extract that data as follows::
+
+      >>> pattern = r'\bvar\s+data\s*=\s*(\{.*?\})\s*;\s*\n'
+      >>> json_data = response.css('script::text').re_first(pattern)
+      >>> json.loads(json_data)
+      {'field': 'value'}
+
+- Otherwise, use js2xml_ to convert the JavaScript code into an XML document
+  that you can parse using :ref:`selectors <topics-selectors>`.
+
+  For example, if the JavaScript code contains
+  ``var data = {field: "value"};`` you can extract that data as follows::
+
+      >>> import js2xml
+      >>> import lxml.etree
+      >>> from parsel import Selector
+      >>> javascript = response.css('script::text').get()
+      >>> xml = lxml.etree.tostring(js2xml.parse(javascript), encoding='unicode')
+      >>> selector = Selector(text=xml)
+      >>> selector.css('var[name="data"]').get()
+      '<var name="data"><object><property name="field"><string>value</string></property></object></var>'
+
+.. _topics-javascript-rendering:
+
+Pre-rendering JavaScript
+========================
+
+On webpages that fetch data from additional requests, reproducing those
+requests that contain the desired data is the preferred approach. The effort is
+often worth the result: structured, complete data with minimum parsing time and
+network transfer.
+
+However, sometimes it can be really hard to reproduce certain requests. Or you
+may need something that no request can give you, such as a screenshot of a
+webpage as seen in a web browser.
+
+In these cases use the Splash_ JavaScript-rendering service, along with
+`scrapy-splash`_ for seamless integration.
+
+Splash returns as HTML the :ref:`DOM <topics-livedom>` of a webpage, so that
+you can parse it with :ref:`selectors <topics-selectors>`. It provides great
+flexibility through configuration_ or scripting_.
+
+If you need something beyond what Splash offers, such as interacting with the
+DOM on-the-fly from Python code instead of using a previously-written script,
+or handling multiple web browser windows, you might need to
+:ref:`use a headless browser <topics-headless-browsing>` instead.
+
+.. _configuration: https://splash.readthedocs.io/en/stable/api.html
+.. _scripting: https://splash.readthedocs.io/en/stable/scripting-tutorial.html
+
+.. _topics-headless-browsing:
+
+Using a headless browser
+========================
+
+A `headless browser`_ is a special web browser that provides an API for
+automation.
+
+The easiest way to use a headless browser with Scrapy is to use Selenium_,
+along with `scrapy-selenium`_ for seamless integration.
+
+
+.. _AJAX: https://en.wikipedia.org/wiki/Ajax_%28programming%29
+.. _CSS: https://en.wikipedia.org/wiki/Cascading_Style_Sheets
+.. _curl: https://curl.haxx.se/
+.. _headless browser: https://en.wikipedia.org/wiki/Headless_browser
+.. _JavaScript: https://en.wikipedia.org/wiki/JavaScript
+.. _js2xml: https://github.com/scrapinghub/js2xml
+.. _json.loads: https://docs.python.org/library/json.html#json.loads
+.. _pytesseract: https://github.com/madmaze/pytesseract
+.. _regular expression: https://docs.python.org/library/re.html
+.. _scrapy-selenium: https://github.com/clemfromspace/scrapy-selenium
+.. _scrapy-splash: https://github.com/scrapy-plugins/scrapy-splash
+.. _Selenium: https://www.seleniumhq.org/
+.. _Splash: https://github.com/scrapinghub/splash
+.. _tabula-py: https://github.com/chezou/tabula-py
+.. _wget: https://www.gnu.org/software/wget/
+.. _wgrep: https://github.com/stav/wgrep
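
Illustrative sketch (not part of the diff above) of the Splash integration the new document mentions, using the public ``scrapy_splash`` API; the target URL is an example and the extra project configuration (``SPLASH_URL`` and the scrapy-splash middlewares) is assumed to already be in place::

    import scrapy
    from scrapy_splash import SplashRequest

    class QuotesJSSpider(scrapy.Spider):
        name = 'quotes_js'  # placeholder spider

        def start_requests(self):
            # Render the page in Splash before it reaches the spider callback
            yield SplashRequest('http://quotes.toscrape.com/js/',
                                callback=self.parse, args={'wait': 1})

        def parse(self, response):
            for text in response.css('div.quote span.text::text').getall():
                yield {'text': text}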
@@ -238,9 +238,10 @@ scrapy.utils.log module
 
 .. autofunction:: configure_logging
 
-    ``configure_logging`` is automatically called when using Scrapy commands,
-    but needs to be called explicitly when running custom scripts. In that
-    case, its usage is not required but it's recommended.
+    ``configure_logging`` is automatically called when using Scrapy commands
+    or :class:`~scrapy.crawler.CrawlerProcess`, but needs to be called explicitly
+    when running custom scripts using :class:`~scrapy.crawler.CrawlerRunner`.
+    In that case, its usage is not required but it's recommended.
 
     If you plan on configuring the handlers yourself is still recommended you
     call this function, passing ``install_root_handler=False``. Bear in mind
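
Illustrative sketch (not part of the diff above) of the ``CrawlerRunner`` case described in the reworded paragraph; ``MySpider`` is a placeholder spider class::

    from twisted.internet import reactor
    from scrapy.crawler import CrawlerRunner
    from scrapy.utils.log import configure_logging

    configure_logging({'LOG_FORMAT': '%(levelname)s: %(message)s'})
    runner = CrawlerRunner()
    d = runner.crawl(MySpider)
    d.addBoth(lambda _: reactor.stop())
    reactor.run()  # the script blocks here until the crawl is finished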
@@ -897,6 +897,16 @@ Default: ``False``
 If ``True``, the logs will just contain the root path. If it is set to ``False``
 then it displays the component responsible for the log output
 
+.. setting:: LOGSTATS_INTERVAL
+
+LOGSTATS_INTERVAL
+-----------------
+
+Default: ``60.0``
+
+The interval (in seconds) between each logging printout of the stats
+by :class:`~extensions.logstats.LogStats`.
+
 .. setting:: MEMDEBUG_ENABLED
 
 MEMDEBUG_ENABLED
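
Illustrative sketch (not part of the diff above): halving the stats logging interval in a project's settings module::

    LOGSTATS_INTERVAL = 30.0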
@@ -1155,9 +1165,14 @@ Type of in-memory queue used by scheduler. Other available type is:
 
 SCHEDULER_PRIORITY_QUEUE
 ------------------------
-Default: ``'queuelib.PriorityQueue'``
+Default: ``'scrapy.pqueues.ScrapyPriorityQueue'``
 
-Type of priority queue used by scheduler.
+Type of priority queue used by the scheduler. Another available type is
+``scrapy.pqueues.DownloaderAwarePriorityQueue``.
+``scrapy.pqueues.DownloaderAwarePriorityQueue`` works better than
+``scrapy.pqueues.ScrapyPriorityQueue`` when you crawl many different
+domains in parallel. But currently ``scrapy.pqueues.DownloaderAwarePriorityQueue``
+does not work together with :setting:`CONCURRENT_REQUESTS_PER_IP`.
 
 .. setting:: SPIDER_CONTRACTS
 
@@ -82,7 +82,8 @@ object gives you access, for example, to the :ref:`settings <topics-settings>`.
 
         If it raises an exception, Scrapy won't bother calling any other spider
         middleware :meth:`process_spider_input` and will call the request
-        errback. The output of the errback is chained back in the other
+        errback if there is one, otherwise it will start the :meth:`process_spider_exception`
+        chain. The output of the errback is chained back in the other
         direction for :meth:`process_spider_output` to process it, or
         :meth:`process_spider_exception` if it raised an exception.
 
@@ -116,8 +117,8 @@ object gives you access, for example, to the :ref:`settings <topics-settings>`.
 
     .. method:: process_spider_exception(response, exception, spider)
 
-        This method is called when a spider or :meth:`process_spider_input`
-        method (from other spider middleware) raises an exception.
+        This method is called when a spider or :meth:`process_spider_output`
+        method (from a previous spider middleware) raises an exception.
 
         :meth:`process_spider_exception` should return either ``None`` or an
         iterable of :class:`~scrapy.http.Request`, dict or
@@ -129,7 +130,8 @@ object gives you access, for example, to the :ref:`settings <topics-settings>`.
         exception reaches the engine (where it's logged and discarded).
 
         If it returns an iterable the :meth:`process_spider_output` pipeline
-        kicks in, and no other :meth:`process_spider_exception` will be called.
+        kicks in, starting from the next spider middleware, and no other
+        :meth:`process_spider_exception` will be called.
 
         :param response: the response being processed when the exception was
            raised
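
Illustrative sketch (not part of the diff above) of a spider middleware that relies on the behaviour documented here, returning an iterable from ``process_spider_exception`` to recover from a broken callback instead of dropping the response; the class name is hypothetical::

    import logging

    class RecoverExceptionsMiddleware(object):
        """Hypothetical middleware: log callback errors and keep the crawl going."""

        def process_spider_exception(self, response, exception, spider):
            logging.getLogger(__name__).warning(
                'Callback failed for %s: %s', response.url, exception)
            # Returning an iterable (here, an empty one) hands control over to
            # the process_spider_output chain of the following middlewares.
            return []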
@@ -402,10 +402,12 @@ Crawling rules
 of links extracted from each response using the specified ``link_extractor``.
 This is mainly used for filtering purposes.
 
-``process_request`` is a callable, or a string (in which case a method from
-the spider object with that name will be used) which will be called with
-every request extracted by this rule, and must return a request or None (to
-filter out the request).
+``process_request`` is a callable (or a string, in which case a method from
+the spider object with that name will be used) which will be called for every
+:class:`~scrapy.http.Request` extracted by this rule. This callable should
+take said request as first argument and the :class:`~scrapy.http.Response`
+from which the request originated as second argument. It must return a
+``Request`` object or ``None`` (to filter out the request).
 
 CrawlSpider example
 ~~~~~~~~~~~~~~~~~~~
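
Illustrative sketch (not part of the diff above) of the two-argument ``process_request`` callable described in the updated paragraph; the domain, URL pattern and meta key are placeholders::

    from scrapy.linkextractors import LinkExtractor
    from scrapy.spiders import CrawlSpider, Rule

    def add_origin_meta(request, response):
        # Called for every Request extracted by the rule; returning None drops it.
        request.meta['origin_url'] = response.url
        return request

    class ExampleCrawlSpider(CrawlSpider):
        name = 'example_crawl'
        start_urls = ['http://www.example.com']
        rules = (
            Rule(LinkExtractor(allow=r'/items/'), callback='parse_item',
                 process_request=add_origin_meta),
        )

        def parse_item(self, response):
            yield {'url': response.url, 'origin': response.meta.get('origin_url')}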
@@ -655,7 +657,7 @@ SitemapSpider
 
     .. attribute:: sitemap_follow
 
-        A list of regexes of sitemap that should be followed. This is is only
+        A list of regexes of sitemap that should be followed. This is only
         for sites that use `Sitemap index files`_ that point to other sitemap
         files.
 
@@ -75,8 +75,7 @@ available in Scrapy which extend the basic Stats Collector. You can select
 which Stats Collector to use through the :setting:`STATS_CLASS` setting. The
 default Stats Collector used is the :class:`MemoryStatsCollector`.
 
-.. module:: scrapy.statscollectors
-   :synopsis: Stats Collectors
+.. currentmodule:: scrapy.statscollectors
 
 MemoryStatsCollector
 --------------------
@@ -1,12 +1,11 @@
+.. currentmodule:: scrapy.extensions.telnet
+
 .. _topics-telnetconsole:
 
 ==============
 Telnet Console
 ==============
 
-.. module:: scrapy.extensions.telnet
-   :synopsis: The Telnet Console
-
 Scrapy comes with a built-in telnet console for inspecting and controlling a
 Scrapy running process. The telnet console is just a regular python shell
 running inside the Scrapy process, so you can do literally anything from it.
@@ -45,7 +44,7 @@ the console you need to type::
     >>>
 
 By default Username is ``scrapy`` and Password is autogenerated. The
-autogenerated Password can be seen on scrapy logs like the example bellow::
+autogenerated Password can be seen on scrapy logs like the example below::
 
     2018-10-16 14:35:21 [scrapy.extensions.telnet] INFO: Telnet Password: 16f92501e8a59326
 
@@ -6,7 +6,7 @@ from unittest import TextTestRunner, TextTestResult as _TextTestResult
 
 from scrapy.commands import ScrapyCommand
 from scrapy.contracts import ContractsManager
-from scrapy.utils.misc import load_object
+from scrapy.utils.misc import load_object, set_environ
 from scrapy.utils.conf import build_component_list
 
 
@@ -68,6 +68,7 @@ class Command(ScrapyCommand):
 
         spider_loader = self.crawler_process.spider_loader
 
-        for spidername in args or spider_loader.list():
-            spidercls = spider_loader.load(spidername)
-            spidercls.start_requests = lambda s: conman.from_spider(s, result)
+        with set_environ(SCRAPY_CHECK='true'):
+            for spidername in args or spider_loader.list():
+                spidercls = spider_loader.load(spidername)
+                spidercls.start_requests = lambda s: conman.from_spider(s, result)
@@ -94,7 +94,7 @@ class ContractsManager(object):
         try:
             output = cb(response)
             output = list(iterate_spider_output(output))
-        except:
+        except Exception:
             case = _create_testcase(method, 'callback')
             results.addError(case, sys.exc_info())
 
@@ -75,6 +75,8 @@ def _get_concurrency_delay(concurrency, spider, settings):
 
 class Downloader(object):
 
+    DOWNLOAD_SLOT = 'download_slot'
+
     def __init__(self, crawler):
         self.settings = crawler.settings
         self.signals = crawler.signals
@@ -111,8 +113,8 @@ class Downloader(object):
             return key, self.slots[key]
 
     def _get_slot_key(self, request, spider):
-        if 'download_slot' in request.meta:
-            return request.meta['download_slot']
+        if self.DOWNLOAD_SLOT in request.meta:
+            return request.meta[self.DOWNLOAD_SLOT]
 
         key = urlparse_cached(request).hostname or ''
         if self.ip_concurrency:
@@ -122,7 +124,7 @@ class Downloader(object):
 
     def _enqueue_request(self, request, spider):
         key, slot = self._get_slot(request, spider)
-        request.meta['download_slot'] = key
+        request.meta[self.DOWNLOAD_SLOT] = key
 
         def _deactivate(response):
             slot.active.remove(request)
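
Illustrative sketch (not part of the diff above) of the ``download_slot`` request meta key that the new ``DOWNLOAD_SLOT`` constant refers to, pinning requests to a single concurrency slot regardless of their hostname; the URLs and slot name are placeholders::

    import scrapy

    class SingleSlotSpider(scrapy.Spider):
        name = 'single_slot'  # placeholder spider

        def start_requests(self):
            for url in ['http://example.com/a', 'http://example.org/b']:
                # Both requests share one slot, so they also share its
                # concurrency and delay settings.
                yield scrapy.Request(url, meta={'download_slot': 'my-slot'},
                                     callback=self.parse)

        def parse(self, response):
            yield {'url': response.url}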
@@ -7,6 +7,7 @@ import six
 
 from twisted.internet import defer
 
+from scrapy.exceptions import _InvalidOutput
 from scrapy.http import Request, Response
 from scrapy.middleware import MiddlewareManager
 from scrapy.utils.defer import mustbe_deferred
@@ -35,9 +36,9 @@ class DownloaderMiddlewareManager(MiddlewareManager):
         def process_request(request):
             for method in self.methods['process_request']:
                 response = yield method(request=request, spider=spider)
-                assert response is None or isinstance(response, (Response, Request)), \
-                        'Middleware %s.process_request must return None, Response or Request, got %s' % \
-                        (six.get_method_self(method).__class__.__name__, response.__class__.__name__)
+                if response is not None and not isinstance(response, (Response, Request)):
+                    raise _InvalidOutput('Middleware %s.process_request must return None, Response or Request, got %s' % \
+                        (six.get_method_self(method).__class__.__name__, response.__class__.__name__))
                 if response:
                     defer.returnValue(response)
             defer.returnValue((yield download_func(request=request, spider=spider)))
@@ -49,11 +50,10 @@ class DownloaderMiddlewareManager(MiddlewareManager):
                 defer.returnValue(response)
 
             for method in self.methods['process_response']:
-                response = yield method(request=request, response=response,
-                                        spider=spider)
-                assert isinstance(response, (Response, Request)), \
-                    'Middleware %s.process_response must return Response or Request, got %s' % \
-                    (six.get_method_self(method).__class__.__name__, type(response))
+                response = yield method(request=request, response=response, spider=spider)
+                if not isinstance(response, (Response, Request)):
+                    raise _InvalidOutput('Middleware %s.process_response must return Response or Request, got %s' % \
+                        (six.get_method_self(method).__class__.__name__, type(response)))
                 if isinstance(response, Request):
                     defer.returnValue(response)
             defer.returnValue(response)
@@ -62,11 +62,10 @@ class DownloaderMiddlewareManager(MiddlewareManager):
         def process_exception(_failure):
             exception = _failure.value
             for method in self.methods['process_exception']:
-                response = yield method(request=request, exception=exception,
-                                        spider=spider)
-                assert response is None or isinstance(response, (Response, Request)), \
-                    'Middleware %s.process_exception must return None, Response or Request, got %s' % \
-                    (six.get_method_self(method).__class__.__name__, type(response))
+                response = yield method(request=request, exception=exception, spider=spider)
+                if response is not None and not isinstance(response, (Response, Request)):
+                    raise _InvalidOutput('Middleware %s.process_exception must return None, Response or Request, got %s' % \
+                        (six.get_method_self(method).__class__.__name__, type(response)))
                 if response:
                     defer.returnValue(response)
             defer.returnValue(_failure)
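
Illustrative sketch (not part of the diff above) of a downloader middleware whose return values satisfy the checks that now raise ``_InvalidOutput`` instead of using ``assert``; the class name and meta key are hypothetical::

    from scrapy.http import HtmlResponse

    class CannedResponseMiddleware(object):
        """Hypothetical middleware: short-circuit marked requests with a stub response."""

        def process_request(self, request, spider):
            if request.meta.get('use_canned_response'):
                # Returning a Response skips the download; returning None continues it.
                return HtmlResponse(url=request.url, body=b'<html></html>',
                                    encoding='utf-8')
            return None

        def process_response(self, request, response, spider):
            # Must return a Response or a Request object.
            return response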
@@ -1,19 +1,46 @@
 import os
 import json
 import logging
+import warnings
 from os.path import join, exists
 
-from scrapy.utils.reqser import request_to_dict, request_from_dict
+from queuelib import PriorityQueue
+
 from scrapy.utils.misc import load_object, create_instance
 from scrapy.utils.job import job_dir
+from scrapy.utils.deprecate import ScrapyDeprecationWarning
+
 
 logger = logging.getLogger(__name__)
 
 
 class Scheduler(object):
+    """
+    Scrapy Scheduler. It allows to enqueue requests and then get
+    a next request to download. Scheduler is also handling duplication
+    filtering, via dupefilter.
+
+    Prioritization and queueing is not performed by the Scheduler.
+    User sets ``priority`` field for each Request, and a PriorityQueue
+    (defined by :setting:`SCHEDULER_PRIORITY_QUEUE`) uses these priorities
+    to dequeue requests in a desired order.
+
+    Scheduler uses two PriorityQueue instances, configured to work in-memory
+    and on-disk (optional). When on-disk queue is present, it is used by
+    default, and an in-memory queue is used as a fallback for cases where
+    a disk queue can't handle a request (can't serialize it).
+
+    :setting:`SCHEDULER_MEMORY_QUEUE` and
+    :setting:`SCHEDULER_DISK_QUEUE` allow to specify lower-level queue classes
+    which PriorityQueue instances would be instantiated with, to keep requests
+    on disk and in memory respectively.
+
+    Overall, Scheduler is an object which holds several PriorityQueue instances
+    (in-memory and on-disk) and implements fallback logic for them.
+    Also, it handles dupefilters.
+    """
     def __init__(self, dupefilter, jobdir=None, dqclass=None, mqclass=None,
-                 logunser=False, stats=None, pqclass=None):
+                 logunser=False, stats=None, pqclass=None, crawler=None):
         self.df = dupefilter
         self.dqdir = self._dqdir(jobdir)
         self.pqclass = pqclass
@@ -21,6 +48,7 @@ class Scheduler(object):
         self.mqclass = mqclass
         self.logunser = logunser
         self.stats = stats
+        self.crawler = crawler
 
     @classmethod
     def from_crawler(cls, crawler):
@@ -28,26 +56,35 @@ class Scheduler(object):
         dupefilter_cls = load_object(settings['DUPEFILTER_CLASS'])
         dupefilter = create_instance(dupefilter_cls, settings, crawler)
         pqclass = load_object(settings['SCHEDULER_PRIORITY_QUEUE'])
+        if pqclass is PriorityQueue:
+            warnings.warn("SCHEDULER_PRIORITY_QUEUE='queuelib.PriorityQueue'"
+                          " is no longer supported because of API changes; "
+                          "please use 'scrapy.pqueues.ScrapyPriorityQueue'",
+                          ScrapyDeprecationWarning)
+            from scrapy.pqueues import ScrapyPriorityQueue
+            pqclass = ScrapyPriorityQueue
+
         dqclass = load_object(settings['SCHEDULER_DISK_QUEUE'])
         mqclass = load_object(settings['SCHEDULER_MEMORY_QUEUE'])
-        logunser = settings.getbool('LOG_UNSERIALIZABLE_REQUESTS', settings.getbool('SCHEDULER_DEBUG'))
+        logunser = settings.getbool('LOG_UNSERIALIZABLE_REQUESTS',
+                                    settings.getbool('SCHEDULER_DEBUG'))
         return cls(dupefilter, jobdir=job_dir(settings), logunser=logunser,
-                   stats=crawler.stats, pqclass=pqclass, dqclass=dqclass, mqclass=mqclass)
+                   stats=crawler.stats, pqclass=pqclass, dqclass=dqclass,
+                   mqclass=mqclass, crawler=crawler)
 
     def has_pending_requests(self):
         return len(self) > 0
 
     def open(self, spider):
         self.spider = spider
-        self.mqs = self.pqclass(self._newmq)
+        self.mqs = self._mq()
         self.dqs = self._dq() if self.dqdir else None
         return self.df.open()
 
     def close(self, reason):
         if self.dqs:
-            prios = self.dqs.close()
-            with open(join(self.dqdir, 'active.json'), 'w') as f:
-                json.dump(prios, f)
+            state = self.dqs.close()
+            self._write_dqs_state(self.dqdir, state)
         return self.df.close(reason)
 
     def enqueue_request(self, request):
@@ -82,8 +119,7 @@ class Scheduler(object):
         if self.dqs is None:
             return
         try:
-            reqd = request_to_dict(request, self.spider)
-            self.dqs.push(reqd, -request.priority)
+            self.dqs.push(request, -request.priority)
        except ValueError as e:  # non serializable request
             if self.logunser:
                 msg = ("Unable to serialize request: %(request)s - reason:"
@@ -103,32 +139,51 @@ class Scheduler(object):
 
     def _dqpop(self):
         if self.dqs:
-            d = self.dqs.pop()
-            if d:
-                return request_from_dict(d, self.spider)
+            return self.dqs.pop()
 
     def _newmq(self, priority):
+        """ Factory for creating memory queues. """
         return self.mqclass()
 
     def _newdq(self, priority):
-        return self.dqclass(join(self.dqdir, 'p%s' % priority))
+        """ Factory for creating disk queues. """
+        path = join(self.dqdir, 'p%s' % (priority, ))
+        return self.dqclass(path)
+
+    def _mq(self):
+        """ Create a new priority queue instance, with in-memory storage """
+        return create_instance(self.pqclass, None, self.crawler, self._newmq,
+                               serialize=False)
 
     def _dq(self):
-        activef = join(self.dqdir, 'active.json')
-        if exists(activef):
-            with open(activef) as f:
-                prios = json.load(f)
-        else:
-            prios = ()
-        q = self.pqclass(self._newdq, startprios=prios)
+        """ Create a new priority queue instance, with disk storage """
+        state = self._read_dqs_state(self.dqdir)
+        q = create_instance(self.pqclass,
+                            None,
+                            self.crawler,
+                            self._newdq,
+                            state,
+                            serialize=True)
         if q:
             logger.info("Resuming crawl (%(queuesize)d requests scheduled)",
                         {'queuesize': len(q)}, extra={'spider': self.spider})
         return q
 
     def _dqdir(self, jobdir):
+        """ Return a folder name to keep disk queue state at """
         if jobdir:
             dqdir = join(jobdir, 'requests.queue')
             if not exists(dqdir):
                 os.makedirs(dqdir)
             return dqdir
+
+    def _read_dqs_state(self, dqdir):
+        path = join(dqdir, 'active.json')
+        if not exists(path):
+            return ()
+        with open(path) as f:
+            return json.load(f)
+
+    def _write_dqs_state(self, dqdir, state):
+        with open(join(dqdir, 'active.json'), 'w') as f:
+            json.dump(state, f)
@@ -135,7 +135,6 @@ class Scraper(object):
             return self.spidermw.scrape_response(
                 self.call_spider, request_result, request, spider)
         else:
-            # FIXME: don't ignore errors in spider middleware
             dfd = self.call_spider(request_result, request, spider)
             return dfd.addErrback(
                 self._log_download_errors, request_result, request, spider)
@@ -3,15 +3,21 @@ Spider Middleware manager
 
 See documentation in docs/topics/spider-middleware.rst
 """
+from itertools import chain, islice
+
 import six
 from twisted.python.failure import Failure
 
+from scrapy.exceptions import _InvalidOutput
 from scrapy.middleware import MiddlewareManager
 from scrapy.utils.defer import mustbe_deferred
 from scrapy.utils.conf import build_component_list
+from scrapy.utils.python import MutableChain
 
 
 def _isiterable(possible_iterator):
     return hasattr(possible_iterator, '__iter__')
 
 
 class SpiderMiddlewareManager(MiddlewareManager):
 
     component_name = 'spider middleware'
@@ -24,12 +30,10 @@ class SpiderMiddlewareManager(MiddlewareManager):
         super(SpiderMiddlewareManager, self)._add_middleware(mw)
         if hasattr(mw, 'process_spider_input'):
             self.methods['process_spider_input'].append(mw.process_spider_input)
-        if hasattr(mw, 'process_spider_output'):
-            self.methods['process_spider_output'].appendleft(mw.process_spider_output)
-        if hasattr(mw, 'process_spider_exception'):
-            self.methods['process_spider_exception'].appendleft(mw.process_spider_exception)
         if hasattr(mw, 'process_start_requests'):
             self.methods['process_start_requests'].appendleft(mw.process_start_requests)
+        self.methods['process_spider_output'].appendleft(getattr(mw, 'process_spider_output', None))
+        self.methods['process_spider_exception'].appendleft(getattr(mw, 'process_spider_exception', None))
 
     def scrape_response(self, scrape_func, response, request, spider):
         fname = lambda f:'%s.%s' % (
@@ -40,36 +44,73 @@ class SpiderMiddlewareManager(MiddlewareManager):
             for method in self.methods['process_spider_input']:
                 try:
                     result = method(response=response, spider=spider)
-                    assert result is None, \
-                            'Middleware %s must returns None or ' \
-                            'raise an exception, got %s ' \
-                            % (fname(method), type(result))
-                except:
+                    if result is not None:
+                        raise _InvalidOutput('Middleware {} must return None or raise an exception, got {}' \
+                                             .format(fname(method), type(result)))
+                except _InvalidOutput:
+                    raise
+                except Exception:
                     return scrape_func(Failure(), request, spider)
             return scrape_func(response, request, spider)
 
-        def process_spider_exception(_failure):
+        def process_spider_exception(_failure, start_index=0):
             exception = _failure.value
-            for method in self.methods['process_spider_exception']:
+            # don't handle _InvalidOutput exception
+            if isinstance(exception, _InvalidOutput):
+                return _failure
+            method_list = islice(self.methods['process_spider_exception'], start_index, None)
+            for method_index, method in enumerate(method_list, start=start_index):
+                if method is None:
+                    continue
                 result = method(response=response, exception=exception, spider=spider)
-                assert result is None or _isiterable(result), \
-                    'Middleware %s must returns None, or an iterable object, got %s ' % \
-                    (fname(method), type(result))
-                if result is not None:
-                    return result
+                if _isiterable(result):
+                    # stop exception handling by handing control over to the
+                    # process_spider_output chain if an iterable has been returned
+                    return process_spider_output(result, method_index+1)
+                elif result is None:
+                    continue
+                else:
+                    raise _InvalidOutput('Middleware {} must return None or an iterable, got {}' \
+                                         .format(fname(method), type(result)))
             return _failure
 
-        def process_spider_output(result):
-            for method in self.methods['process_spider_output']:
+        def process_spider_output(result, start_index=0):
+            # items in this iterable do not need to go through the process_spider_output
+            # chain, they went through it already from the process_spider_exception method
+            recovered = MutableChain()
+
+            def evaluate_iterable(iterable, index):
+                try:
+                    for r in iterable:
+                        yield r
+                except Exception as ex:
+                    exception_result = process_spider_exception(Failure(ex), index+1)
+                    if isinstance(exception_result, Failure):
+                        raise
+                    recovered.extend(exception_result)
+
+            method_list = islice(self.methods['process_spider_output'], start_index, None)
+            for method_index, method in enumerate(method_list, start=start_index):
+                if method is None:
+                    continue
+                # the following might fail directly if the output value is not a generator
+                try:
                     result = method(response=response, result=result, spider=spider)
-                assert _isiterable(result), \
-                    'Middleware %s must returns an iterable object, got %s ' % \
+                except Exception as ex:
+                    exception_result = process_spider_exception(Failure(ex), method_index+1)
|
||||||
(fname(method), type(result))
|
if isinstance(exception_result, Failure):
|
||||||
return result
|
raise
|
||||||
|
return exception_result
|
||||||
|
if _isiterable(result):
|
||||||
|
result = evaluate_iterable(result, method_index)
|
||||||
|
else:
|
||||||
|
raise _InvalidOutput('Middleware {} must return an iterable, got {}' \
|
||||||
|
.format(fname(method), type(result)))
|
||||||
|
|
||||||
|
return chain(result, recovered)
|
||||||
|
|
||||||
dfd = mustbe_deferred(process_spider_input, response)
|
dfd = mustbe_deferred(process_spider_input, response)
|
||||||
dfd.addErrback(process_spider_exception)
|
dfd.addCallbacks(callback=process_spider_output, errback=process_spider_exception)
|
||||||
dfd.addCallback(process_spider_output)
|
|
||||||
return dfd
|
return dfd
|
||||||
|
|
||||||
def process_start_requests(self, start_requests, spider):
|
def process_start_requests(self, start_requests, spider):
|
||||||
|
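
The rewritten scrape_response above lets a spider middleware recover from an exception raised while the spider output is being consumed: if process_spider_exception returns an iterable, that iterable is handed to the remaining process_spider_output chain instead of propagating the failure. A minimal sketch of such a middleware (class and setting values below are illustrative, not part of this commit):

    # Hypothetical spider middleware relying on the behaviour above.
    class RecoverFromCallbackErrors(object):

        def process_spider_exception(self, response, exception, spider):
            # Returning an iterable (rather than None) stops exception handling;
            # these objects continue through the remaining process_spider_output
            # methods and reach the engine as normal results.
            yield {'url': response.url, 'error': repr(exception)}

    # Enabled like any other spider middleware, e.g. in the project settings:
    # SPIDER_MIDDLEWARES = {'myproject.middlewares.RecoverFromCallbackErrors': 543}
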
@@ -111,6 +111,8 @@ class Crawler(object):
 
     @defer.inlineCallbacks
     def stop(self):
+        """Starts a graceful stop of the crawler and returns a deferred that is
+        fired when the crawler is stopped."""
         if self.crawling:
             self.crawling = False
             yield defer.maybeDeferred(self.engine.stop)
@@ -88,6 +88,7 @@ class MetaRefreshMiddleware(BaseRedirectMiddleware):
 
     def __init__(self, settings):
         super(MetaRefreshMiddleware, self).__init__(settings)
+        self._ignore_tags = settings.getlist('METAREFRESH_IGNORE_TAGS')
         self._maxdelay = settings.getint('REDIRECT_MAX_METAREFRESH_DELAY',
                                          settings.getint('METAREFRESH_MAXDELAY'))
 
@@ -96,7 +97,8 @@ class MetaRefreshMiddleware(BaseRedirectMiddleware):
                 not isinstance(response, HtmlResponse):
             return response
 
-        interval, url = get_meta_refresh(response)
+        interval, url = get_meta_refresh(response,
+                                         ignore_tags=self._ignore_tags)
         if url and interval < self._maxdelay:
             redirected = self._redirect_request_using_get(request, url)
             return self._redirect(redirected, request, spider, 'meta refresh')
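
The METAREFRESH_IGNORE_TAGS setting read above defaults to ['script', 'noscript']; setting it to an empty list makes the middleware honour meta refresh tags found inside those elements as well. An illustrative settings fragment (example values only):

    # settings.py fragment (example values)
    METAREFRESH_ENABLED = True
    METAREFRESH_IGNORE_TAGS = []  # default: ['script', 'noscript']
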
@@ -11,6 +11,13 @@ class NotConfigured(Exception):
     """Indicates a missing configuration situation"""
     pass
 
+
+class _InvalidOutput(TypeError):
+    """
+    Indicates an invalid value has been returned by a middleware's processing method.
+    Internal and undocumented, it should not be raised or caught by user code.
+    """
+    pass
+
 # HTTP and crawling
 
 class IgnoreRequest(Exception):
@@ -24,7 +24,11 @@ class CoreStats(object):
         self.stats.set_value('start_time', datetime.datetime.utcnow(), spider=spider)
 
     def spider_closed(self, spider, reason):
-        self.stats.set_value('finish_time', datetime.datetime.utcnow(), spider=spider)
+        finish_time = datetime.datetime.utcnow()
+        elapsed_time = finish_time - self.stats.get_value('start_time')
+        elapsed_time_seconds = elapsed_time.total_seconds()
+        self.stats.set_value('elapsed_time_seconds', elapsed_time_seconds, spider=spider)
+        self.stats.set_value('finish_time', finish_time, spider=spider)
         self.stats.set_value('finish_reason', reason, spider=spider)
 
     def item_scraped(self, item, spider):
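
The new elapsed_time_seconds value is a regular crawl stat, so it can be read from crawler.stats once the crawl finishes. A small sketch, assuming a spider class named ExampleSpider defined elsewhere:

    from scrapy.crawler import CrawlerProcess

    process = CrawlerProcess(settings={'LOG_LEVEL': 'INFO'})
    crawler = process.create_crawler(ExampleSpider)  # ExampleSpider is assumed
    process.crawl(crawler)
    process.start()
    print(crawler.stats.get_value('finish_reason'),
          crawler.stats.get_value('elapsed_time_seconds'))
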
@@ -31,7 +31,7 @@ class DummyPolicy(object):
     def should_cache_response(self, response, request):
         return response.status not in self.ignore_http_codes
 
-    def is_cached_response_fresh(self, response, request):
+    def is_cached_response_fresh(self, cachedresponse, request):
         return True
 
     def is_cached_response_valid(self, cachedresponse, response, request):
@@ -70,7 +70,7 @@ class RFC2616Policy(object):
         return True
 
     def should_cache_response(self, response, request):
-        # What is cacheable - https://www.w3.org/Protocols/rfc2616/rfc2616-sec13.html#sec14.9.1
+        # What is cacheable - https://www.w3.org/Protocols/rfc2616/rfc2616-sec14.html#sec14.9.1
         # Response cacheability - https://www.w3.org/Protocols/rfc2616/rfc2616-sec13.html#sec13.4
         # Status code 206 is not included because cache can not deal with partial contents
         cc = self._parse_cachecontrol(response)
@@ -35,6 +35,10 @@ class ItemLoader(object):
         self.parent = parent
         self._local_item = context['item'] = item
         self._local_values = defaultdict(list)
+        # Preprocess values if item built from dict
+        # Values need to be added to item._values if added them from dict (not with add_values)
+        for field_name, value in item.items():
+            self._values[field_name] = self._process_input_value(field_name, value)
 
     @property
     def _values(self):
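
With the loop added above, values already present on an item (or dict) passed to the loader go through the field's input processors and become part of the loader's internal values, so they survive later add_value and get_output_value calls. A short sketch, assuming these hypothetical item and processor choices:

    import scrapy
    from scrapy.loader import ItemLoader
    from scrapy.loader.processors import MapCompose, TakeFirst

    class Product(scrapy.Item):
        name = scrapy.Field(input_processor=MapCompose(str.strip),
                            output_processor=TakeFirst())

    # The seeded value is preprocessed exactly like one passed to add_value().
    loader = ItemLoader(item=Product({'name': '  Widget  '}))
    loader.add_value('name', 'fallback')
    print(loader.load_item())  # {'name': 'Widget'}
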
@@ -3,7 +3,7 @@ from __future__ import print_function
 import functools
 import logging
 from collections import defaultdict
-from twisted.internet.defer import Deferred, DeferredList
+from twisted.internet.defer import Deferred, DeferredList, _DefGen_Return
 from twisted.python.failure import Failure
 
 from scrapy.settings import Settings
@@ -139,6 +139,30 @@ class MediaPipeline(object):
             result.cleanFailure()
             result.frames = []
             result.stack = None
+
+            # This code fixes a memory leak by avoiding to keep references to
+            # the Request and Response objects on the Media Pipeline cache.
+            #
+            # Twisted inline callbacks pass return values using the function
+            # twisted.internet.defer.returnValue, which encapsulates the return
+            # value inside a _DefGen_Return base exception.
+            #
+            # What happens when the media_downloaded callback raises another
+            # exception, for example a FileException('download-error') when
+            # the Response status code is not 200 OK, is that it stores the
+            # _DefGen_Return exception on the FileException context.
+            #
+            # To avoid keeping references to the Response and therefore Request
+            # objects on the Media Pipeline cache, we should wipe the context of
+            # the exception encapsulated by the Twisted Failure when its a
+            # _DefGen_Return instance.
+            #
+            # This problem does not occur in Python 2.7 since we don't have
+            # Exception Chaining (https://www.python.org/dev/peps/pep-3134/).
+            context = getattr(result.value, '__context__', None)
+            if isinstance(context, _DefGen_Return):
+                setattr(result.value, '__context__', None)
+
         info.downloading.remove(fp)
         info.downloaded[fp] = result  # cache result
         for wad in info.waiting.pop(fp):
|
193
scrapy/pqueues.py
Normal file
193
scrapy/pqueues.py
Normal file
@ -0,0 +1,193 @@
|
|||||||
|
import hashlib
|
||||||
|
import logging
|
||||||
|
from collections import namedtuple
|
||||||
|
|
||||||
|
from queuelib import PriorityQueue
|
||||||
|
|
||||||
|
from scrapy.utils.reqser import request_to_dict, request_from_dict
|
||||||
|
|
||||||
|
|
||||||
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
|
||||||
|
def _path_safe(text):
|
||||||
|
"""
|
||||||
|
Return a filesystem-safe version of a string ``text``
|
||||||
|
|
||||||
|
>>> _path_safe('simple.org').startswith('simple.org')
|
||||||
|
True
|
||||||
|
>>> _path_safe('dash-underscore_.org').startswith('dash-underscore_.org')
|
||||||
|
True
|
||||||
|
>>> _path_safe('some@symbol?').startswith('some_symbol_')
|
||||||
|
True
|
||||||
|
"""
|
||||||
|
pathable_slot = "".join([c if c.isalnum() or c in '-._' else '_'
|
||||||
|
for c in text])
|
||||||
|
# as we replace some letters we can get collision for different slots
|
||||||
|
# add we add unique part
|
||||||
|
unique_slot = hashlib.md5(text.encode('utf8')).hexdigest()
|
||||||
|
return '-'.join([pathable_slot, unique_slot])
|
||||||
|
|
||||||
|
|
||||||
|
class _Priority(namedtuple("_Priority", ["priority", "slot"])):
|
||||||
|
""" Slot-specific priority. It is a hack - ``(priority, slot)`` tuple
|
||||||
|
which can be used instead of int priorities in queues:
|
||||||
|
|
||||||
|
* they are ordered in the same way - order is still by priority value,
|
||||||
|
min(prios) works;
|
||||||
|
* str(p) representation is guaranteed to be different when slots
|
||||||
|
are different - this is important because str(p) is used to create
|
||||||
|
queue files on disk;
|
||||||
|
* they have readable str(p) representation which is safe
|
||||||
|
to use as a file name.
|
||||||
|
"""
|
||||||
|
__slots__ = ()
|
||||||
|
|
||||||
|
def __str__(self):
|
||||||
|
return '%s_%s' % (self.priority, _path_safe(str(self.slot)))
|
||||||
|
|
||||||
|
|
||||||
|
class _SlotPriorityQueues(object):
|
||||||
|
""" Container for multiple priority queues. """
|
||||||
|
def __init__(self, pqfactory, slot_startprios=None):
|
||||||
|
"""
|
||||||
|
``pqfactory`` is a factory for creating new PriorityQueues.
|
||||||
|
It must be a function which accepts a single optional ``startprios``
|
||||||
|
argument, with a list of priorities to create queues for.
|
||||||
|
|
||||||
|
``slot_startprios`` is a ``{slot: startprios}`` dict.
|
||||||
|
"""
|
||||||
|
self.pqfactory = pqfactory
|
||||||
|
self.pqueues = {} # slot -> priority queue
|
||||||
|
for slot, startprios in (slot_startprios or {}).items():
|
||||||
|
self.pqueues[slot] = self.pqfactory(startprios)
|
||||||
|
|
||||||
|
def pop_slot(self, slot):
|
||||||
|
""" Pop an object from a priority queue for this slot """
|
||||||
|
queue = self.pqueues[slot]
|
||||||
|
request = queue.pop()
|
||||||
|
if len(queue) == 0:
|
||||||
|
del self.pqueues[slot]
|
||||||
|
return request
|
||||||
|
|
||||||
|
def push_slot(self, slot, obj, priority):
|
||||||
|
""" Push an object to a priority queue for this slot """
|
||||||
|
if slot not in self.pqueues:
|
||||||
|
self.pqueues[slot] = self.pqfactory()
|
||||||
|
queue = self.pqueues[slot]
|
||||||
|
queue.push(obj, priority)
|
||||||
|
|
||||||
|
def close(self):
|
||||||
|
active = {slot: queue.close()
|
||||||
|
for slot, queue in self.pqueues.items()}
|
||||||
|
self.pqueues.clear()
|
||||||
|
return active
|
||||||
|
|
||||||
|
def __len__(self):
|
||||||
|
return sum(len(x) for x in self.pqueues.values()) if self.pqueues else 0
|
||||||
|
|
||||||
|
def __contains__(self, slot):
|
||||||
|
return slot in self.pqueues
|
||||||
|
|
||||||
|
|
||||||
|
class ScrapyPriorityQueue(PriorityQueue):
|
||||||
|
"""
|
||||||
|
PriorityQueue which works with scrapy.Request instances and
|
||||||
|
can optionally convert them to/from dicts before/after putting to a queue.
|
||||||
|
"""
|
||||||
|
def __init__(self, crawler, qfactory, startprios=(), serialize=False):
|
||||||
|
super(ScrapyPriorityQueue, self).__init__(qfactory, startprios)
|
||||||
|
self.serialize = serialize
|
||||||
|
self.spider = crawler.spider
|
||||||
|
|
||||||
|
@classmethod
|
||||||
|
def from_crawler(cls, crawler, qfactory, startprios=(), serialize=False):
|
||||||
|
return cls(crawler, qfactory, startprios, serialize)
|
||||||
|
|
||||||
|
def push(self, request, priority=0):
|
||||||
|
if self.serialize:
|
||||||
|
request = request_to_dict(request, self.spider)
|
||||||
|
super(ScrapyPriorityQueue, self).push(request, priority)
|
||||||
|
|
||||||
|
def pop(self):
|
||||||
|
request = super(ScrapyPriorityQueue, self).pop()
|
||||||
|
if request and self.serialize:
|
||||||
|
request = request_from_dict(request, self.spider)
|
||||||
|
return request
|
||||||
|
|
||||||
|
|
||||||
|
class DownloaderInterface(object):
|
||||||
|
|
||||||
|
def __init__(self, crawler):
|
||||||
|
self.downloader = crawler.engine.downloader
|
||||||
|
|
||||||
|
def stats(self, possible_slots):
|
||||||
|
return [(self._active_downloads(slot), slot)
|
||||||
|
for slot in possible_slots]
|
||||||
|
|
||||||
|
def get_slot_key(self, request):
|
||||||
|
return self.downloader._get_slot_key(request, None)
|
||||||
|
|
||||||
|
def _active_downloads(self, slot):
|
||||||
|
""" Return a number of requests in a Downloader for a given slot """
|
||||||
|
if slot not in self.downloader.slots:
|
||||||
|
return 0
|
||||||
|
return len(self.downloader.slots[slot].active)
|
||||||
|
|
||||||
|
|
||||||
|
class DownloaderAwarePriorityQueue(object):
|
||||||
|
""" PriorityQueue which takes Downlaoder activity in account:
|
||||||
|
domains (slots) with the least amount of active downloads are dequeued
|
||||||
|
first.
|
||||||
|
"""
|
||||||
|
|
||||||
|
@classmethod
|
||||||
|
def from_crawler(cls, crawler, qfactory, slot_startprios=None, serialize=False):
|
||||||
|
return cls(crawler, qfactory, slot_startprios, serialize)
|
||||||
|
|
||||||
|
def __init__(self, crawler, qfactory, slot_startprios=None, serialize=False):
|
||||||
|
if crawler.settings.getint('CONCURRENT_REQUESTS_PER_IP') != 0:
|
||||||
|
raise ValueError('"%s" does not support CONCURRENT_REQUESTS_PER_IP'
|
||||||
|
% (self.__class__,))
|
||||||
|
|
||||||
|
if slot_startprios and not isinstance(slot_startprios, dict):
|
||||||
|
raise ValueError("DownloaderAwarePriorityQueue accepts "
|
||||||
|
"``slot_startprios`` as a dict; %r instance "
|
||||||
|
"is passed. Most likely, it means the state is"
|
||||||
|
"created by an incompatible priority queue. "
|
||||||
|
"Only a crawl started with the same priority "
|
||||||
|
"queue class can be resumed." %
|
||||||
|
slot_startprios.__class__)
|
||||||
|
|
||||||
|
slot_startprios = {
|
||||||
|
slot: [_Priority(p, slot) for p in startprios]
|
||||||
|
for slot, startprios in (slot_startprios or {}).items()}
|
||||||
|
|
||||||
|
def pqfactory(startprios=()):
|
||||||
|
return ScrapyPriorityQueue(crawler, qfactory, startprios, serialize)
|
||||||
|
self._slot_pqueues = _SlotPriorityQueues(pqfactory, slot_startprios)
|
||||||
|
self.serialize = serialize
|
||||||
|
self._downloader_interface = DownloaderInterface(crawler)
|
||||||
|
|
||||||
|
def pop(self):
|
||||||
|
stats = self._downloader_interface.stats(self._slot_pqueues.pqueues)
|
||||||
|
|
||||||
|
if not stats:
|
||||||
|
return
|
||||||
|
|
||||||
|
slot = min(stats)[1]
|
||||||
|
request = self._slot_pqueues.pop_slot(slot)
|
||||||
|
return request
|
||||||
|
|
||||||
|
def push(self, request, priority):
|
||||||
|
slot = self._downloader_interface.get_slot_key(request)
|
||||||
|
priority_slot = _Priority(priority=priority, slot=slot)
|
||||||
|
self._slot_pqueues.push_slot(slot, request, priority_slot)
|
||||||
|
|
||||||
|
def close(self):
|
||||||
|
active = self._slot_pqueues.close()
|
||||||
|
return {slot: [p.priority for p in startprios]
|
||||||
|
for slot, startprios in active.items()}
|
||||||
|
|
||||||
|
def __len__(self):
|
||||||
|
return len(self._slot_pqueues)
|
@@ -221,6 +221,7 @@ MEMUSAGE_NOTIFY_MAIL = []
 MEMUSAGE_WARNING_MB = 0
 
 METAREFRESH_ENABLED = True
+METAREFRESH_IGNORE_TAGS = ['script', 'noscript']
 METAREFRESH_MAXDELAY = 100
 
 NEWSPIDER_MODULE = ''
@@ -238,7 +239,7 @@ REFERRER_POLICY = 'scrapy.spidermiddlewares.referer.DefaultReferrerPolicy'
 
 RETRY_ENABLED = True
 RETRY_TIMES = 2  # initial response + 2 retries = 3 requests
-RETRY_HTTP_CODES = [500, 502, 503, 504, 522, 524, 408]
+RETRY_HTTP_CODES = [500, 502, 503, 504, 522, 524, 408, 429]
 RETRY_PRIORITY_ADJUST = -1
 
 ROBOTSTXT_OBEY = False
@@ -246,7 +247,7 @@ ROBOTSTXT_OBEY = False
 SCHEDULER = 'scrapy.core.scheduler.Scheduler'
 SCHEDULER_DISK_QUEUE = 'scrapy.squeues.PickleLifoDiskQueue'
 SCHEDULER_MEMORY_QUEUE = 'scrapy.squeues.LifoMemoryQueue'
-SCHEDULER_PRIORITY_QUEUE = 'queuelib.PriorityQueue'
+SCHEDULER_PRIORITY_QUEUE = 'scrapy.pqueues.ScrapyPriorityQueue'
 
 SPIDER_LOADER_CLASS = 'scrapy.spiderloader.SpiderLoader'
 SPIDER_LOADER_WARN_ONLY = False
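
The changed defaults above can be tuned per project; in particular, the new scrapy.pqueues queues introduced by this commit can be swapped via SCHEDULER_PRIORITY_QUEUE. An illustrative settings.py fragment (example values, not additional defaults):

    # Restore the previous retry list if 429 responses should not be retried.
    RETRY_HTTP_CODES = [500, 502, 503, 504, 522, 524, 408]

    # Follow <meta refresh> even inside <script>/<noscript> blocks.
    METAREFRESH_IGNORE_TAGS = []

    # For broad crawls over many domains, the downloader-aware queue from
    # scrapy/pqueues.py can be selected instead of the new default.
    SCHEDULER_PRIORITY_QUEUE = 'scrapy.pqueues.DownloaderAwarePriorityQueue'
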
@@ -6,29 +6,55 @@ See documentation in docs/topics/spiders.rst
 """
 
 import copy
+import warnings
 
 import six
 
+from scrapy.exceptions import ScrapyDeprecationWarning
 from scrapy.http import Request, HtmlResponse
 from scrapy.utils.spider import iterate_spider_output
+from scrapy.utils.python import get_func_args
 from scrapy.spiders import Spider
 
 
-def identity(x):
-    return x
+def _identity(request, response):
+    return request
+
+
+def _get_method(method, spider):
+    if callable(method):
+        return method
+    elif isinstance(method, six.string_types):
+        return getattr(spider, method, None)
 
 
 class Rule(object):
 
-    def __init__(self, link_extractor, callback=None, cb_kwargs=None, follow=None, process_links=None, process_request=identity):
+    def __init__(self, link_extractor, callback=None, cb_kwargs=None, follow=None, process_links=None, process_request=None):
         self.link_extractor = link_extractor
         self.callback = callback
         self.cb_kwargs = cb_kwargs or {}
         self.process_links = process_links
-        self.process_request = process_request
-        if follow is None:
-            self.follow = False if callback else True
-        else:
-            self.follow = follow
+        self.process_request = process_request or _identity
+        self.process_request_argcount = None
+        self.follow = follow if follow is not None else not callback
+
+    def _compile(self, spider):
+        self.callback = _get_method(self.callback, spider)
+        self.process_links = _get_method(self.process_links, spider)
+        self.process_request = _get_method(self.process_request, spider)
+        self.process_request_argcount = len(get_func_args(self.process_request))
+        if self.process_request_argcount == 1:
+            msg = 'Rule.process_request should accept two arguments (request, response), accepting only one is deprecated'
+            warnings.warn(msg, category=ScrapyDeprecationWarning, stacklevel=2)
+
+    def _process_request(self, request, response):
+        """
+        Wrapper around the request processing function to maintain backward
+        compatibility with functions that do not take a Response object
+        """
+        args = [request] if self.process_request_argcount == 1 else [request, response]
+        return self.process_request(*args)
 
 
 class CrawlSpider(Spider):
@@ -64,8 +90,8 @@ class CrawlSpider(Spider):
                 links = rule.process_links(links)
             for link in links:
                 seen.add(link)
-                r = self._build_request(n, link)
-                yield rule.process_request(r)
+                request = self._build_request(n, link)
+                yield rule._process_request(request, response)
 
     def _response_downloaded(self, response):
         rule = self._rules[response.meta['rule']]
@@ -83,17 +109,9 @@ class CrawlSpider(Spider):
             yield request_or_item
 
     def _compile_rules(self):
-        def get_method(method):
-            if callable(method):
-                return method
-            elif isinstance(method, six.string_types):
-                return getattr(self, method, None)
-
         self._rules = [copy.copy(r) for r in self.rules]
         for rule in self._rules:
-            rule.callback = get_method(rule.callback)
-            rule.process_links = get_method(rule.process_links)
-            rule.process_request = get_method(rule.process_request)
+            rule._compile(self)
 
     @classmethod
     def from_crawler(cls, crawler, *args, **kwargs):
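
Rule.process_request callables now receive the response the links were extracted from in addition to the request; single-argument callables still work but trigger the deprecation warning added above. A minimal sketch of the new two-argument form (spider name and URL patterns are placeholders):

    from scrapy.spiders import CrawlSpider, Rule
    from scrapy.linkextractors import LinkExtractor

    def tag_with_origin(request, response):
        # New-style process_request: the originating response is available.
        request.meta['found_on'] = response.url
        return request

    class ExampleSpider(CrawlSpider):
        name = 'example'
        start_urls = ['http://example.com/']
        rules = (
            Rule(LinkExtractor(allow=r'/items/'), callback='parse_item',
                 process_request=tag_with_origin, follow=True),
        )

        def parse_item(self, response):
            yield {'url': response.url, 'found_on': response.meta.get('found_on')}
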
@ -7,6 +7,7 @@ from six.moves import cPickle as pickle
|
|||||||
|
|
||||||
from queuelib import queue
|
from queuelib import queue
|
||||||
|
|
||||||
|
|
||||||
def _serializable_queue(queue_class, serialize, deserialize):
|
def _serializable_queue(queue_class, serialize, deserialize):
|
||||||
|
|
||||||
class SerializableQueue(queue_class):
|
class SerializableQueue(queue_class):
|
||||||
@ -22,6 +23,7 @@ def _serializable_queue(queue_class, serialize, deserialize):
|
|||||||
|
|
||||||
return SerializableQueue
|
return SerializableQueue
|
||||||
|
|
||||||
|
|
||||||
def _pickle_serialize(obj):
|
def _pickle_serialize(obj):
|
||||||
try:
|
try:
|
||||||
return pickle.dumps(obj, protocol=2)
|
return pickle.dumps(obj, protocol=2)
|
||||||
@ -31,13 +33,14 @@ def _pickle_serialize(obj):
|
|||||||
except (pickle.PicklingError, AttributeError, TypeError) as e:
|
except (pickle.PicklingError, AttributeError, TypeError) as e:
|
||||||
raise ValueError(str(e))
|
raise ValueError(str(e))
|
||||||
|
|
||||||
PickleFifoDiskQueue = _serializable_queue(queue.FifoDiskQueue, \
|
|
||||||
|
PickleFifoDiskQueue = _serializable_queue(queue.FifoDiskQueue,
|
||||||
_pickle_serialize, pickle.loads)
|
_pickle_serialize, pickle.loads)
|
||||||
PickleLifoDiskQueue = _serializable_queue(queue.LifoDiskQueue, \
|
PickleLifoDiskQueue = _serializable_queue(queue.LifoDiskQueue,
|
||||||
_pickle_serialize, pickle.loads)
|
_pickle_serialize, pickle.loads)
|
||||||
MarshalFifoDiskQueue = _serializable_queue(queue.FifoDiskQueue, \
|
MarshalFifoDiskQueue = _serializable_queue(queue.FifoDiskQueue,
|
||||||
marshal.dumps, marshal.loads)
|
marshal.dumps, marshal.loads)
|
||||||
MarshalLifoDiskQueue = _serializable_queue(queue.LifoDiskQueue, \
|
MarshalLifoDiskQueue = _serializable_queue(queue.LifoDiskQueue,
|
||||||
marshal.dumps, marshal.loads)
|
marshal.dumps, marshal.loads)
|
||||||
FifoMemoryQueue = queue.FifoMemoryQueue
|
FifoMemoryQueue = queue.FifoMemoryQueue
|
||||||
LifoMemoryQueue = queue.LifoMemoryQueue
|
LifoMemoryQueue = queue.LifoMemoryQueue
|
||||||
|
@ -39,7 +39,7 @@ class ${ProjectName}SpiderMiddleware(object):
|
|||||||
# Called when a spider or process_spider_input() method
|
# Called when a spider or process_spider_input() method
|
||||||
# (from other spider middleware) raises an exception.
|
# (from other spider middleware) raises an exception.
|
||||||
|
|
||||||
# Should return either None or an iterable of Response, dict
|
# Should return either None or an iterable of Request, dict
|
||||||
# or Item objects.
|
# or Item objects.
|
||||||
pass
|
pass
|
||||||
|
|
||||||
|
@ -48,7 +48,7 @@ def mustbe_deferred(f, *args, **kw):
|
|||||||
# exception in Scrapy - see #125
|
# exception in Scrapy - see #125
|
||||||
except IgnoreRequest as e:
|
except IgnoreRequest as e:
|
||||||
return defer_fail(failure.Failure(e))
|
return defer_fail(failure.Failure(e))
|
||||||
except:
|
except Exception:
|
||||||
return defer_fail(failure.Failure())
|
return defer_fail(failure.Failure())
|
||||||
else:
|
else:
|
||||||
return defer_result(result)
|
return defer_result(result)
|
||||||
@ -102,5 +102,5 @@ def iter_errback(iterable, errback, *a, **kw):
|
|||||||
yield next(it)
|
yield next(it)
|
||||||
except StopIteration:
|
except StopIteration:
|
||||||
break
|
break
|
||||||
except:
|
except Exception:
|
||||||
errback(failure.Failure(), *a, **kw)
|
errback(failure.Failure(), *a, **kw)
|
||||||
|
@ -9,6 +9,9 @@ from gzip import GzipFile
|
|||||||
import six
|
import six
|
||||||
import re
|
import re
|
||||||
|
|
||||||
|
from scrapy.utils.decorators import deprecated
|
||||||
|
|
||||||
|
|
||||||
# - Python>=3.5 GzipFile's read() has issues returning leftover
|
# - Python>=3.5 GzipFile's read() has issues returning leftover
|
||||||
# uncompressed data when input is corrupted
|
# uncompressed data when input is corrupted
|
||||||
# (regression or bug-fix compared to Python 3.4)
|
# (regression or bug-fix compared to Python 3.4)
|
||||||
@ -53,6 +56,7 @@ def gunzip(data):
|
|||||||
_is_gzipped = re.compile(br'^application/(x-)?gzip\b', re.I).search
|
_is_gzipped = re.compile(br'^application/(x-)?gzip\b', re.I).search
|
||||||
_is_octetstream = re.compile(br'^(application|binary)/octet-stream\b', re.I).search
|
_is_octetstream = re.compile(br'^(application|binary)/octet-stream\b', re.I).search
|
||||||
|
|
||||||
|
@deprecated
|
||||||
def is_gzipped(response):
|
def is_gzipped(response):
|
||||||
"""Return True if the response is gzipped, or False otherwise"""
|
"""Return True if the response is gzipped, or False otherwise"""
|
||||||
ctype = response.headers.get('Content-Type', b'')
|
ctype = response.headers.get('Content-Type', b'')
|
||||||
|
@ -1,6 +1,8 @@
|
|||||||
"""Helper functions which don't fit anywhere else"""
|
"""Helper functions which don't fit anywhere else"""
|
||||||
|
import os
|
||||||
import re
|
import re
|
||||||
import hashlib
|
import hashlib
|
||||||
|
from contextlib import contextmanager
|
||||||
from importlib import import_module
|
from importlib import import_module
|
||||||
from pkgutil import iter_modules
|
from pkgutil import iter_modules
|
||||||
|
|
||||||
@ -86,7 +88,7 @@ def extract_regex(regex, text, encoding='utf-8'):
|
|||||||
|
|
||||||
try:
|
try:
|
||||||
strings = [regex.search(text).group('extract')] # named group
|
strings = [regex.search(text).group('extract')] # named group
|
||||||
except:
|
except Exception:
|
||||||
strings = regex.findall(text) # full regex or numbered groups
|
strings = regex.findall(text) # full regex or numbered groups
|
||||||
strings = flatten(strings)
|
strings = flatten(strings)
|
||||||
|
|
||||||
@ -142,3 +144,21 @@ def create_instance(objcls, settings, crawler, *args, **kwargs):
|
|||||||
return objcls.from_settings(settings, *args, **kwargs)
|
return objcls.from_settings(settings, *args, **kwargs)
|
||||||
else:
|
else:
|
||||||
return objcls(*args, **kwargs)
|
return objcls(*args, **kwargs)
|
||||||
|
|
||||||
|
|
||||||
|
@contextmanager
|
||||||
|
def set_environ(**kwargs):
|
||||||
|
"""Temporarily set environment variables inside the context manager and
|
||||||
|
fully restore previous environment afterwards
|
||||||
|
"""
|
||||||
|
|
||||||
|
original_env = {k: os.environ.get(k) for k in kwargs}
|
||||||
|
os.environ.update(kwargs)
|
||||||
|
try:
|
||||||
|
yield
|
||||||
|
finally:
|
||||||
|
for k, v in original_env.items():
|
||||||
|
if v is None:
|
||||||
|
del os.environ[k]
|
||||||
|
else:
|
||||||
|
os.environ[k] = v
|
||||||
|
@ -9,6 +9,7 @@ import weakref
|
|||||||
import errno
|
import errno
|
||||||
import six
|
import six
|
||||||
from functools import partial, wraps
|
from functools import partial, wraps
|
||||||
|
from itertools import chain
|
||||||
import sys
|
import sys
|
||||||
|
|
||||||
from scrapy.utils.decorators import deprecated
|
from scrapy.utils.decorators import deprecated
|
||||||
@ -387,3 +388,22 @@ if hasattr(sys, "pypy_version_info"):
|
|||||||
else:
|
else:
|
||||||
def garbage_collect():
|
def garbage_collect():
|
||||||
gc.collect()
|
gc.collect()
|
||||||
|
|
||||||
|
|
||||||
|
class MutableChain(object):
|
||||||
|
"""
|
||||||
|
Thin wrapper around itertools.chain, allowing to add iterables "in-place"
|
||||||
|
"""
|
||||||
|
def __init__(self, *args):
|
||||||
|
self.data = chain(*args)
|
||||||
|
|
||||||
|
def extend(self, *iterables):
|
||||||
|
self.data = chain(self.data, *iterables)
|
||||||
|
|
||||||
|
def __iter__(self):
|
||||||
|
return self.data.__iter__()
|
||||||
|
|
||||||
|
def __next__(self):
|
||||||
|
return next(self.data)
|
||||||
|
|
||||||
|
next = __next__
|
||||||
|
@ -70,6 +70,20 @@ def request_from_dict(d, spider=None):
|
|||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
|
def _is_private_method(name):
|
||||||
|
return name.startswith('__') and not name.endswith('__')
|
||||||
|
|
||||||
|
|
||||||
|
def _mangle_private_name(obj, func, name):
|
||||||
|
qualname = getattr(func, '__qualname__', None)
|
||||||
|
if qualname is None:
|
||||||
|
classname = obj.__class__.__name__.lstrip('_')
|
||||||
|
return '_%s%s' % (classname, name)
|
||||||
|
else:
|
||||||
|
splits = qualname.split('.')
|
||||||
|
return '_%s%s' % (splits[-2], splits[-1])
|
||||||
|
|
||||||
|
|
||||||
def _find_method(obj, func):
|
def _find_method(obj, func):
|
||||||
if obj:
|
if obj:
|
||||||
try:
|
try:
|
||||||
@ -78,7 +92,10 @@ def _find_method(obj, func):
|
|||||||
pass
|
pass
|
||||||
else:
|
else:
|
||||||
if func_self is obj:
|
if func_self is obj:
|
||||||
return six.get_method_function(func).__name__
|
name = six.get_method_function(func).__name__
|
||||||
|
if _is_private_method(name):
|
||||||
|
return _mangle_private_name(obj, func, name)
|
||||||
|
return name
|
||||||
raise ValueError("Function %s is not a method of: %s" % (func, obj))
|
raise ValueError("Function %s is not a method of: %s" % (func, obj))
|
||||||
|
|
||||||
|
|
||||||
|
@ -31,12 +31,12 @@ def get_base_url(response):
|
|||||||
|
|
||||||
|
|
||||||
_metaref_cache = weakref.WeakKeyDictionary()
|
_metaref_cache = weakref.WeakKeyDictionary()
|
||||||
def get_meta_refresh(response):
|
def get_meta_refresh(response, ignore_tags=('script', 'noscript')):
|
||||||
"""Parse the http-equiv refrsh parameter from the given response"""
|
"""Parse the http-equiv refrsh parameter from the given response"""
|
||||||
if response not in _metaref_cache:
|
if response not in _metaref_cache:
|
||||||
text = response.text[0:4096]
|
text = response.text[0:4096]
|
||||||
_metaref_cache[response] = html.get_meta_refresh(text, response.url,
|
_metaref_cache[response] = html.get_meta_refresh(text, response.url,
|
||||||
response.encoding, ignore_tags=('script', 'noscript'))
|
response.encoding, ignore_tags=ignore_tags)
|
||||||
return _metaref_cache[response]
|
return _metaref_cache[response]
|
||||||
|
|
||||||
|
|
||||||
|
3
setup.py
3
setup.py
@ -65,7 +65,8 @@ setup(
|
|||||||
],
|
],
|
||||||
python_requires='>=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*',
|
python_requires='>=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*',
|
||||||
install_requires=[
|
install_requires=[
|
||||||
'Twisted>=13.1.0',
|
'Twisted>=13.1.0;python_version!="3.4"',
|
||||||
|
'Twisted>=13.1.0,<=19.2.0;python_version=="3.4"',
|
||||||
'w3lib>=1.17.0',
|
'w3lib>=1.17.0',
|
||||||
'queuelib',
|
'queuelib',
|
||||||
'lxml',
|
'lxml',
|
||||||
|
@ -177,7 +177,7 @@ class Root(Resource):
|
|||||||
try:
|
try:
|
||||||
from tests import tests_datadir
|
from tests import tests_datadir
|
||||||
self.putChild(b"files", File(os.path.join(tests_datadir, 'test_site/files/')))
|
self.putChild(b"files", File(os.path.join(tests_datadir, 'test_site/files/')))
|
||||||
except:
|
except Exception:
|
||||||
pass
|
pass
|
||||||
self.putChild(b"redirect-to", RedirectTo())
|
self.putChild(b"redirect-to", RedirectTo())
|
||||||
|
|
||||||
|
@ -2,9 +2,10 @@
|
|||||||
mock
|
mock
|
||||||
mitmproxy==0.10.1
|
mitmproxy==0.10.1
|
||||||
netlib==0.10.1
|
netlib==0.10.1
|
||||||
pytest==2.9.2
|
pytest
|
||||||
|
pytest-cov
|
||||||
pytest-twisted
|
pytest-twisted
|
||||||
pytest-cov==2.2.1
|
pytest-xdist
|
||||||
jmespath
|
jmespath
|
||||||
brotlipy
|
brotlipy
|
||||||
testfixtures
|
testfixtures
|
||||||
|
@ -1,6 +1,7 @@
|
|||||||
pytest==3.6.3
|
pytest
|
||||||
|
pytest-cov
|
||||||
pytest-twisted
|
pytest-twisted
|
||||||
pytest-cov==2.5.1
|
pytest-xdist
|
||||||
testfixtures
|
testfixtures
|
||||||
jmespath
|
jmespath
|
||||||
leveldb; sys_platform != "win32"
|
leveldb; sys_platform != "win32"
|
||||||
|
@ -53,9 +53,5 @@ class TestCloseSpider(TestCase):
|
|||||||
yield crawler.crawl(total=1000000, mockserver=self.mockserver)
|
yield crawler.crawl(total=1000000, mockserver=self.mockserver)
|
||||||
reason = crawler.spider.meta['close_reason']
|
reason = crawler.spider.meta['close_reason']
|
||||||
self.assertEqual(reason, 'closespider_timeout')
|
self.assertEqual(reason, 'closespider_timeout')
|
||||||
stats = crawler.stats
|
total_seconds = crawler.stats.get_value('elapsed_time_seconds')
|
||||||
start = stats.get_value('start_time')
|
|
||||||
stop = stats.get_value('finish_time')
|
|
||||||
diff = stop - start
|
|
||||||
total_seconds = diff.seconds + diff.microseconds
|
|
||||||
self.assertTrue(total_seconds >= close_on)
|
self.assertTrue(total_seconds >= close_on)
|
||||||
|
@ -1,5 +1,4 @@
|
|||||||
import logging
|
import logging
|
||||||
import tempfile
|
|
||||||
import warnings
|
import warnings
|
||||||
|
|
||||||
from twisted.internet import defer
|
from twisted.internet import defer
|
||||||
@ -38,7 +37,11 @@ class CrawlerTestCase(BaseCrawlerTest):
|
|||||||
self.assertIsInstance(spiders, sl_cls)
|
self.assertIsInstance(spiders, sl_cls)
|
||||||
|
|
||||||
self.crawler.spiders
|
self.crawler.spiders
|
||||||
self.assertEqual(len(w), 1, "Warn deprecated access only once")
|
is_one_warning = len(w) == 1
|
||||||
|
if not is_one_warning:
|
||||||
|
for warning in w:
|
||||||
|
print(warning)
|
||||||
|
self.assertTrue(is_one_warning, "Warn deprecated access only once")
|
||||||
|
|
||||||
def test_populate_spidercls_settings(self):
|
def test_populate_spidercls_settings(self):
|
||||||
spider_settings = {'TEST1': 'spider', 'TEST2': 'spider'}
|
spider_settings = {'TEST1': 'spider', 'TEST2': 'spider'}
|
||||||
@ -179,8 +182,12 @@ class CrawlerRunnerTestCase(BaseCrawlerTest):
|
|||||||
'SPIDER_MANAGER_CLASS': 'tests.test_crawler.CustomSpiderLoader'
|
'SPIDER_MANAGER_CLASS': 'tests.test_crawler.CustomSpiderLoader'
|
||||||
})
|
})
|
||||||
self.assertIsInstance(runner.spider_loader, CustomSpiderLoader)
|
self.assertIsInstance(runner.spider_loader, CustomSpiderLoader)
|
||||||
self.assertEqual(len(w), 1)
|
is_one_warning = len(w) == 1
|
||||||
|
if not is_one_warning:
|
||||||
|
for warning in w:
|
||||||
|
print(warning)
|
||||||
self.assertIn('Please use SPIDER_LOADER_CLASS', str(w[0].message))
|
self.assertIn('Please use SPIDER_LOADER_CLASS', str(w[0].message))
|
||||||
|
self.assertTrue(is_one_warning)
|
||||||
|
|
||||||
def test_crawl_rejects_spider_objects(self):
|
def test_crawl_rejects_spider_objects(self):
|
||||||
with raises(ValueError):
|
with raises(ValueError):
|
||||||
|
@ -3,6 +3,7 @@ from twisted.python.failure import Failure
|
|||||||
|
|
||||||
from scrapy.http import Request, Response
|
from scrapy.http import Request, Response
|
||||||
from scrapy.spiders import Spider
|
from scrapy.spiders import Spider
|
||||||
|
from scrapy.exceptions import _InvalidOutput
|
||||||
from scrapy.core.downloader.middleware import DownloaderMiddlewareManager
|
from scrapy.core.downloader.middleware import DownloaderMiddlewareManager
|
||||||
from scrapy.utils.test import get_crawler
|
from scrapy.utils.test import get_crawler
|
||||||
from scrapy.utils.python import to_bytes
|
from scrapy.utils.python import to_bytes
|
||||||
@ -115,3 +116,63 @@ class ResponseFromProcessRequestTest(ManagerTestCase):
|
|||||||
|
|
||||||
self.assertIs(results[0], resp)
|
self.assertIs(results[0], resp)
|
||||||
self.assertFalse(download_func.called)
|
self.assertFalse(download_func.called)
|
||||||
|
|
||||||
|
|
||||||
|
class ProcessRequestInvalidOutput(ManagerTestCase):
|
||||||
|
"""Invalid return value for process_request method should raise an exception"""
|
||||||
|
|
||||||
|
def test_invalid_process_request(self):
|
||||||
|
req = Request('http://example.com/index.html')
|
||||||
|
|
||||||
|
class InvalidProcessRequestMiddleware:
|
||||||
|
def process_request(self, request, spider):
|
||||||
|
return 1
|
||||||
|
|
||||||
|
self.mwman._add_middleware(InvalidProcessRequestMiddleware())
|
||||||
|
download_func = mock.MagicMock()
|
||||||
|
dfd = self.mwman.download(download_func, req, self.spider)
|
||||||
|
results = []
|
||||||
|
dfd.addBoth(results.append)
|
||||||
|
self.assertIsInstance(results[0], Failure)
|
||||||
|
self.assertIsInstance(results[0].value, _InvalidOutput)
|
||||||
|
|
||||||
|
|
||||||
|
class ProcessResponseInvalidOutput(ManagerTestCase):
|
||||||
|
"""Invalid return value for process_response method should raise an exception"""
|
||||||
|
|
||||||
|
def test_invalid_process_response(self):
|
||||||
|
req = Request('http://example.com/index.html')
|
||||||
|
|
||||||
|
class InvalidProcessResponseMiddleware:
|
||||||
|
def process_response(self, request, response, spider):
|
||||||
|
return 1
|
||||||
|
|
||||||
|
self.mwman._add_middleware(InvalidProcessResponseMiddleware())
|
||||||
|
download_func = mock.MagicMock()
|
||||||
|
dfd = self.mwman.download(download_func, req, self.spider)
|
||||||
|
results = []
|
||||||
|
dfd.addBoth(results.append)
|
||||||
|
self.assertIsInstance(results[0], Failure)
|
||||||
|
self.assertIsInstance(results[0].value, _InvalidOutput)
|
||||||
|
|
||||||
|
|
||||||
|
class ProcessExceptionInvalidOutput(ManagerTestCase):
|
||||||
|
"""Invalid return value for process_exception method should raise an exception"""
|
||||||
|
|
||||||
|
def test_invalid_process_exception(self):
|
||||||
|
req = Request('http://example.com/index.html')
|
||||||
|
|
||||||
|
class InvalidProcessExceptionMiddleware:
|
||||||
|
def process_request(self, request, spider):
|
||||||
|
raise Exception()
|
||||||
|
|
||||||
|
def process_exception(self, request, exception, spider):
|
||||||
|
return 1
|
||||||
|
|
||||||
|
self.mwman._add_middleware(InvalidProcessExceptionMiddleware())
|
||||||
|
download_func = mock.MagicMock()
|
||||||
|
dfd = self.mwman.download(download_func, req, self.spider)
|
||||||
|
results = []
|
||||||
|
dfd.addBoth(results.append)
|
||||||
|
self.assertIsInstance(results[0], Failure)
|
||||||
|
self.assertIsInstance(results[0].value, _InvalidOutput)
|
||||||
|
@ -279,5 +279,24 @@ class MetaRefreshMiddlewareTest(unittest.TestCase):
|
|||||||
self.assertEqual(req2.meta['redirect_reasons'], ['meta refresh'])
|
self.assertEqual(req2.meta['redirect_reasons'], ['meta refresh'])
|
||||||
self.assertEqual(req3.meta['redirect_reasons'], ['meta refresh', 'meta refresh'])
|
self.assertEqual(req3.meta['redirect_reasons'], ['meta refresh', 'meta refresh'])
|
||||||
|
|
||||||
|
def test_ignore_tags_default(self):
|
||||||
|
req = Request(url='http://example.org')
|
||||||
|
body = ('''<noscript><meta http-equiv="refresh" '''
|
||||||
|
'''content="0;URL='http://example.org/newpage'"></noscript>''')
|
||||||
|
rsp = HtmlResponse(req.url, body=body.encode())
|
||||||
|
response = self.mw.process_response(req, rsp, self.spider)
|
||||||
|
assert isinstance(response, Response)
|
||||||
|
|
||||||
|
def test_ignore_tags_empty_list(self):
|
||||||
|
crawler = get_crawler(Spider, {'METAREFRESH_IGNORE_TAGS': []})
|
||||||
|
mw = MetaRefreshMiddleware.from_crawler(crawler)
|
||||||
|
req = Request(url='http://example.org')
|
||||||
|
body = ('''<noscript><meta http-equiv="refresh" '''
|
||||||
|
'''content="0;URL='http://example.org/newpage'"></noscript>''')
|
||||||
|
rsp = HtmlResponse(req.url, body=body.encode())
|
||||||
|
req2 = mw.process_response(req, rsp, self.spider)
|
||||||
|
assert isinstance(req2, Request)
|
||||||
|
self.assertEqual(req2.url, 'http://example.org/newpage')
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
unittest.main()
|
unittest.main()
|
||||||
|
@ -419,6 +419,43 @@ class BasicItemLoaderTest(unittest.TestCase):
|
|||||||
self.assertEqual(item['url'], u'rabbit.hole')
|
self.assertEqual(item['url'], u'rabbit.hole')
|
||||||
self.assertEqual(item['summary'], u'rabbithole')
|
self.assertEqual(item['summary'], u'rabbithole')
|
||||||
|
|
||||||
|
def test_create_item_from_dict(self):
|
||||||
|
class TestItem(Item):
|
||||||
|
title = Field()
|
||||||
|
|
||||||
|
class TestItemLoader(ItemLoader):
|
||||||
|
default_item_class = TestItem
|
||||||
|
|
||||||
|
input_item = {'title': 'Test item title 1'}
|
||||||
|
il = TestItemLoader(item=input_item)
|
||||||
|
# Getting output value mustn't remove value from item
|
||||||
|
self.assertEqual(il.load_item(), {
|
||||||
|
'title': 'Test item title 1',
|
||||||
|
})
|
||||||
|
self.assertEqual(il.get_output_value('title'), 'Test item title 1')
|
||||||
|
self.assertEqual(il.load_item(), {
|
||||||
|
'title': 'Test item title 1',
|
||||||
|
})
|
||||||
|
|
||||||
|
input_item = {'title': 'Test item title 2'}
|
||||||
|
il = TestItemLoader(item=input_item)
|
||||||
|
# Values from dict must be added to item _values
|
||||||
|
self.assertEqual(il._values.get('title'), 'Test item title 2')
|
||||||
|
|
||||||
|
input_item = {'title': [u'Test item title 3', u'Test item 4']}
|
||||||
|
il = TestItemLoader(item=input_item)
|
||||||
|
# Same rules must work for lists
|
||||||
|
self.assertEqual(il._values.get('title'),
|
||||||
|
[u'Test item title 3', u'Test item 4'])
|
||||||
|
self.assertEqual(il.load_item(), {
|
||||||
|
'title': [u'Test item title 3', u'Test item 4'],
|
||||||
|
})
|
||||||
|
self.assertEqual(il.get_output_value('title'),
|
||||||
|
[u'Test item title 3', u'Test item 4'])
|
||||||
|
self.assertEqual(il.load_item(), {
|
||||||
|
'title': [u'Test item title 3', u'Test item 4'],
|
||||||
|
})
|
||||||
|
|
||||||
|
|
||||||
class ProcessorsTest(unittest.TestCase):
|
class ProcessorsTest(unittest.TestCase):
|
||||||
|
|
||||||
|
@ -1,15 +1,19 @@
|
|||||||
from __future__ import print_function
|
from __future__ import print_function
|
||||||
|
|
||||||
|
import sys
|
||||||
|
|
||||||
from testfixtures import LogCapture
|
from testfixtures import LogCapture
|
||||||
from twisted.trial import unittest
|
from twisted.trial import unittest
|
||||||
from twisted.python.failure import Failure
|
from twisted.python.failure import Failure
|
||||||
from twisted.internet import reactor
|
from twisted.internet import reactor
|
||||||
from twisted.internet.defer import Deferred, inlineCallbacks
|
from twisted.internet.defer import Deferred, inlineCallbacks, returnValue
|
||||||
|
|
||||||
from scrapy.http import Request, Response
|
from scrapy.http import Request, Response
|
||||||
from scrapy.settings import Settings
|
from scrapy.settings import Settings
|
||||||
from scrapy.spiders import Spider
|
from scrapy.spiders import Spider
|
||||||
from scrapy.utils.request import request_fingerprint
|
from scrapy.utils.request import request_fingerprint
|
||||||
from scrapy.pipelines.media import MediaPipeline
|
from scrapy.pipelines.media import MediaPipeline
|
||||||
|
from scrapy.pipelines.files import FileException
|
||||||
from scrapy.utils.log import failure_to_exc_info
|
from scrapy.utils.log import failure_to_exc_info
|
||||||
from scrapy.utils.signal import disconnect_all
|
from scrapy.utils.signal import disconnect_all
|
||||||
from scrapy import signals
|
from scrapy import signals
|
||||||
@ -90,6 +94,77 @@ class BaseMediaPipelineTestCase(unittest.TestCase):
|
|||||||
self.pipe._modify_media_request(request)
|
self.pipe._modify_media_request(request)
|
||||||
assert request.meta == {'handle_httpstatus_all': True}
|
assert request.meta == {'handle_httpstatus_all': True}
|
||||||
|
|
||||||
|
def test_should_remove_req_res_references_before_caching_the_results(self):
|
||||||
|
"""Regression test case to prevent a memory leak in the Media Pipeline.
|
||||||
|
|
||||||
|
The memory leak is triggered when an exception is raised when a Response
|
||||||
|
scheduled by the Media Pipeline is being returned. For example, when a
|
||||||
|
FileException('download-error') is raised because the Response status
|
||||||
|
code is not 200 OK.
|
||||||
|
|
||||||
|
It happens because we are keeping a reference to the Response object
|
||||||
|
inside the FileException context. This is caused by the way Twisted
|
||||||
|
return values from inline callbacks. It raises a custom exception
|
||||||
|
encapsulating the original return value.
|
||||||
|
|
||||||
|
The solution is to remove the exception context when this context is a
|
||||||
|
_DefGen_Return instance, the BaseException used by Twisted to pass the
|
||||||
|
returned value from those inline callbacks.
|
||||||
|
|
||||||
|
Maybe there's a better and more reliable way to test the case described
|
||||||
|
here, but it would be more complicated and involve running - or at least
|
||||||
|
mocking - some async steps from the Media Pipeline. The current test
|
||||||
|
case is simple and detects the problem very fast. On the other hand, it
|
||||||
|
would not detect another kind of leak happening due to old object
|
||||||
|
references being kept inside the Media Pipeline cache.
|
||||||
|
|
||||||
|
This problem does not occur in Python 2.7 since we don't have Exception
|
||||||
|
Chaining (https://www.python.org/dev/peps/pep-3134/).
|
||||||
|
"""
|
||||||
|
# Create sample pair of Request and Response objects
|
||||||
|
request = Request('http://url')
|
||||||
|
response = Response('http://url', body=b'', request=request)
|
||||||
|
|
||||||
|
# Simulate the Media Pipeline behavior to produce a Twisted Failure
|
||||||
|
try:
|
||||||
|
# Simulate a Twisted inline callback returning a Response
|
||||||
|
# The returnValue method raises an exception encapsulating the value
|
||||||
|
returnValue(response)
|
||||||
|
except BaseException as exc:
|
||||||
|
def_gen_return_exc = exc
|
||||||
|
try:
|
||||||
|
# Simulate the media_downloaded callback raising a FileException
|
||||||
|
# This usually happens when the status code is not 200 OK
|
||||||
|
raise FileException('download-error')
|
||||||
|
except Exception as exc:
|
||||||
|
file_exc = exc
|
||||||
|
# Simulate Twisted capturing the FileException
|
||||||
|
# It encapsulates the exception inside a Twisted Failure
|
||||||
|
failure = Failure(file_exc)
|
||||||
|
|
||||||
|
# The Failure should encapsulate a FileException ...
|
||||||
|
self.assertEqual(failure.value, file_exc)
|
||||||
|
# ... and if we're running on Python 3 ...
|
||||||
|
if sys.version_info.major >= 3:
|
||||||
|
# ... it should have the returnValue exception set as its context
|
||||||
|
self.assertEqual(failure.value.__context__, def_gen_return_exc)
|
||||||
|
|
||||||
|
# Let's calculate the request fingerprint and fake some runtime data...
|
||||||
|
fp = request_fingerprint(request)
|
||||||
|
info = self.pipe.spiderinfo
|
||||||
|
info.downloading.add(fp)
|
||||||
|
info.waiting[fp] = []
|
||||||
|
|
||||||
|
# When calling the method that caches the Request's result ...
|
||||||
|
self.pipe._cache_result_and_execute_waiters(failure, fp, info)
|
||||||
|
# ... it should store the Twisted Failure ...
|
||||||
|
self.assertEqual(info.downloaded[fp], failure)
|
||||||
|
# ... encapsulating the original FileException ...
|
||||||
|
self.assertEqual(info.downloaded[fp].value, file_exc)
|
||||||
|
# ... but it should not store the returnValue exception on its context
|
||||||
|
context = getattr(info.downloaded[fp].value, '__context__', None)
|
||||||
|
self.assertIsNone(context)
|
||||||
|
|
||||||
|
|
||||||
class MockedMediaPipeline(MediaPipeline):
|
class MockedMediaPipeline(MediaPipeline):
|
||||||
|
|
||||||
|
342
tests/test_scheduler.py
Normal file
@ -0,0 +1,342 @@
import shutil
import tempfile
import unittest
import collections

from twisted.internet import defer
from twisted.trial.unittest import TestCase

from scrapy.crawler import Crawler
from scrapy.core.downloader import Downloader
from scrapy.core.scheduler import Scheduler
from scrapy.http import Request
from scrapy.spiders import Spider
from scrapy.utils.httpobj import urlparse_cached
from scrapy.utils.test import get_crawler
from tests.mockserver import MockServer


MockEngine = collections.namedtuple('MockEngine', ['downloader'])
MockSlot = collections.namedtuple('MockSlot', ['active'])


class MockDownloader(object):
    def __init__(self):
        self.slots = dict()

    def _get_slot_key(self, request, spider):
        if Downloader.DOWNLOAD_SLOT in request.meta:
            return request.meta[Downloader.DOWNLOAD_SLOT]

        return urlparse_cached(request).hostname or ''

    def increment(self, slot_key):
        slot = self.slots.setdefault(slot_key, MockSlot(active=list()))
        slot.active.append(1)

    def decrement(self, slot_key):
        slot = self.slots.get(slot_key)
        slot.active.pop()

    def close(self):
        pass


class MockCrawler(Crawler):
    def __init__(self, priority_queue_cls, jobdir):
        settings = dict(
            LOG_UNSERIALIZABLE_REQUESTS=False,
            SCHEDULER_DISK_QUEUE='scrapy.squeues.PickleLifoDiskQueue',
            SCHEDULER_MEMORY_QUEUE='scrapy.squeues.LifoMemoryQueue',
            SCHEDULER_PRIORITY_QUEUE=priority_queue_cls,
            JOBDIR=jobdir,
            DUPEFILTER_CLASS='scrapy.dupefilters.BaseDupeFilter'
        )
        super(MockCrawler, self).__init__(Spider, settings)
        self.engine = MockEngine(downloader=MockDownloader())


class SchedulerHandler(object):
    priority_queue_cls = None
    jobdir = None

    def create_scheduler(self):
        self.mock_crawler = MockCrawler(self.priority_queue_cls, self.jobdir)
        self.scheduler = Scheduler.from_crawler(self.mock_crawler)
        self.spider = Spider(name='spider')
        self.scheduler.open(self.spider)

    def close_scheduler(self):
        self.scheduler.close('finished')
        self.mock_crawler.stop()
        self.mock_crawler.engine.downloader.close()

    def setUp(self):
        self.create_scheduler()

    def tearDown(self):
        self.close_scheduler()


_PRIORITIES = [("http://foo.com/a", -2),
               ("http://foo.com/d", 1),
               ("http://foo.com/b", -1),
               ("http://foo.com/c", 0),
               ("http://foo.com/e", 2)]


_URLS = {"http://foo.com/a", "http://foo.com/b", "http://foo.com/c"}


class BaseSchedulerInMemoryTester(SchedulerHandler):
    def test_length(self):
        self.assertFalse(self.scheduler.has_pending_requests())
        self.assertEqual(len(self.scheduler), 0)

        for url in _URLS:
            self.scheduler.enqueue_request(Request(url))

        self.assertTrue(self.scheduler.has_pending_requests())
        self.assertEqual(len(self.scheduler), len(_URLS))

    def test_dequeue(self):
        for url in _URLS:
            self.scheduler.enqueue_request(Request(url))

        urls = set()
        while self.scheduler.has_pending_requests():
            urls.add(self.scheduler.next_request().url)

        self.assertEqual(urls, _URLS)

    def test_dequeue_priorities(self):
        for url, priority in _PRIORITIES:
            self.scheduler.enqueue_request(Request(url, priority=priority))

        priorities = list()
        while self.scheduler.has_pending_requests():
            priorities.append(self.scheduler.next_request().priority)

        self.assertEqual(priorities,
                         sorted([x[1] for x in _PRIORITIES], key=lambda x: -x))


class BaseSchedulerOnDiskTester(SchedulerHandler):

    def setUp(self):
        self.jobdir = tempfile.mkdtemp()
        self.create_scheduler()

    def tearDown(self):
        self.close_scheduler()

        shutil.rmtree(self.jobdir)
        self.jobdir = None

    def test_length(self):
        self.assertFalse(self.scheduler.has_pending_requests())
        self.assertEqual(len(self.scheduler), 0)

        for url in _URLS:
            self.scheduler.enqueue_request(Request(url))

        self.close_scheduler()
        self.create_scheduler()

        self.assertTrue(self.scheduler.has_pending_requests())
        self.assertEqual(len(self.scheduler), len(_URLS))

    def test_dequeue(self):
        for url in _URLS:
            self.scheduler.enqueue_request(Request(url))

        self.close_scheduler()
        self.create_scheduler()

        urls = set()
        while self.scheduler.has_pending_requests():
            urls.add(self.scheduler.next_request().url)

        self.assertEqual(urls, _URLS)

    def test_dequeue_priorities(self):
        for url, priority in _PRIORITIES:
            self.scheduler.enqueue_request(Request(url, priority=priority))

        self.close_scheduler()
        self.create_scheduler()

        priorities = list()
        while self.scheduler.has_pending_requests():
            priorities.append(self.scheduler.next_request().priority)

        self.assertEqual(priorities,
                         sorted([x[1] for x in _PRIORITIES], key=lambda x: -x))


class TestSchedulerInMemory(BaseSchedulerInMemoryTester, unittest.TestCase):
    priority_queue_cls = 'scrapy.pqueues.ScrapyPriorityQueue'


class TestSchedulerOnDisk(BaseSchedulerOnDiskTester, unittest.TestCase):
    priority_queue_cls = 'scrapy.pqueues.ScrapyPriorityQueue'


_URLS_WITH_SLOTS = [("http://foo.com/a", 'a'),
                    ("http://foo.com/b", 'a'),
                    ("http://foo.com/c", 'b'),
                    ("http://foo.com/d", 'b'),
                    ("http://foo.com/e", 'c'),
                    ("http://foo.com/f", 'c')]


class TestMigration(unittest.TestCase):

    def setUp(self):
        self.tmpdir = tempfile.mkdtemp()

    def tearDown(self):
        shutil.rmtree(self.tmpdir)

    def _migration(self, tmp_dir):
        prev_scheduler_handler = SchedulerHandler()
        prev_scheduler_handler.priority_queue_cls = 'scrapy.pqueues.ScrapyPriorityQueue'
        prev_scheduler_handler.jobdir = tmp_dir

        prev_scheduler_handler.create_scheduler()
        for url in _URLS:
            prev_scheduler_handler.scheduler.enqueue_request(Request(url))
        prev_scheduler_handler.close_scheduler()

        next_scheduler_handler = SchedulerHandler()
        next_scheduler_handler.priority_queue_cls = 'scrapy.pqueues.DownloaderAwarePriorityQueue'
        next_scheduler_handler.jobdir = tmp_dir

        next_scheduler_handler.create_scheduler()

    def test_migration(self):
        with self.assertRaises(ValueError):
            self._migration(self.tmpdir)


def _is_scheduling_fair(enqueued_slots, dequeued_slots):
    """
    We enqueued same number of requests for every slot.
    Assert correct order, e.g.

    >>> enqueued = ['a', 'b', 'c'] * 2
    >>> correct = ['a', 'c', 'b', 'b', 'a', 'c']
    >>> incorrect = ['a', 'a', 'b', 'c', 'c', 'b']
    >>> _is_scheduling_fair(enqueued, correct)
    True
    >>> _is_scheduling_fair(enqueued, incorrect)
    False
    """
    if len(dequeued_slots) != len(enqueued_slots):
        return False

    slots_number = len(set(enqueued_slots))
    for i in range(0, len(dequeued_slots), slots_number):
        part = dequeued_slots[i:i + slots_number]
        if len(part) != len(set(part)):
            return False

    return True


class DownloaderAwareSchedulerTestMixin(object):
    priority_queue_cls = 'scrapy.pqueues.DownloaderAwarePriorityQueue'
    reopen = False

    def test_logic(self):
        for url, slot in _URLS_WITH_SLOTS:
            request = Request(url)
            request.meta[Downloader.DOWNLOAD_SLOT] = slot
            self.scheduler.enqueue_request(request)

        if self.reopen:
            self.close_scheduler()
            self.create_scheduler()

        dequeued_slots = list()
        requests = []
        downloader = self.mock_crawler.engine.downloader
        while self.scheduler.has_pending_requests():
            request = self.scheduler.next_request()
            # pylint: disable=protected-access
            slot = downloader._get_slot_key(request, None)
            dequeued_slots.append(slot)
            downloader.increment(slot)
            requests.append(request)

        for request in requests:
            # pylint: disable=protected-access
            slot = downloader._get_slot_key(request, None)
            downloader.decrement(slot)

        self.assertTrue(_is_scheduling_fair(list(s for u, s in _URLS_WITH_SLOTS),
                                            dequeued_slots))
        self.assertEqual(sum(len(s.active) for s in downloader.slots.values()), 0)


class TestSchedulerWithDownloaderAwareInMemory(DownloaderAwareSchedulerTestMixin,
                                               BaseSchedulerInMemoryTester,
                                               unittest.TestCase):
    pass


class TestSchedulerWithDownloaderAwareOnDisk(DownloaderAwareSchedulerTestMixin,
                                             BaseSchedulerOnDiskTester,
                                             unittest.TestCase):
    reopen = True


class StartUrlsSpider(Spider):

    def __init__(self, start_urls):
        self.start_urls = start_urls
        super(StartUrlsSpider, self).__init__(start_urls)

    def parse(self, response):
        pass


class TestIntegrationWithDownloaderAwareInMemory(TestCase):
    def setUp(self):
        self.crawler = get_crawler(
            StartUrlsSpider,
            {'SCHEDULER_PRIORITY_QUEUE': 'scrapy.pqueues.DownloaderAwarePriorityQueue',
             'DUPEFILTER_CLASS': 'scrapy.dupefilters.BaseDupeFilter'}
        )

    @defer.inlineCallbacks
    def tearDown(self):
        yield self.crawler.stop()

    @defer.inlineCallbacks
    def test_integration_downloader_aware_priority_queue(self):
        with MockServer() as mockserver:
            url = mockserver.url("/status?n=200", is_secure=False)
            start_urls = [url] * 6
            yield self.crawler.crawl(start_urls)
            self.assertEqual(self.crawler.stats.get_value('downloader/response_count'),
                             len(start_urls))


class TestIncompatibility(unittest.TestCase):

    def _incompatible(self):
        settings = dict(
            SCHEDULER_PRIORITY_QUEUE='scrapy.pqueues.DownloaderAwarePriorityQueue',
            CONCURRENT_REQUESTS_PER_IP=1
        )
        crawler = Crawler(Spider, settings)
        scheduler = Scheduler.from_crawler(crawler)
        spider = Spider(name='spider')
        scheduler.open(spider)

    def test_incompatibility(self):
        with self.assertRaises(ValueError):
            self._incompatible()
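The scheduler tests above hinge on the SCHEDULER_PRIORITY_QUEUE setting. As a rough usage sketch, grounded only in the class paths and settings names that appear in these tests (the settings module itself is illustrative), enabling the downloader-aware queue looks like this; the TestIncompatibility case suggests it cannot be combined with per-IP concurrency limits:

# Illustrative project settings.py; the queue class path is the one used in the tests above.
SCHEDULER_PRIORITY_QUEUE = 'scrapy.pqueues.DownloaderAwarePriorityQueue'

# Per the TestIncompatibility case, combining the downloader-aware queue with
# CONCURRENT_REQUESTS_PER_IP raises ValueError, so that setting should stay unset here.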
@ -105,11 +105,11 @@ class SpiderTest(unittest.TestCase):
     def test_logger(self):
         spider = self.spider_class('example.com')
-        with LogCapture() as l:
+        with LogCapture() as lc:
             spider.logger.info('test log msg')
-            l.check(('example.com', 'INFO', 'test log msg'))
+            lc.check(('example.com', 'INFO', 'test log msg'))
 
-        record = l.records[0]
+        record = lc.records[0]
         self.assertIn('spider', record.__dict__)
         self.assertIs(record.spider, spider)
 
@ -190,8 +190,7 @@ class CrawlSpiderTest(SpiderTest):
     def test_process_links(self):
 
-        response = HtmlResponse("http://example.org/somepage/index.html",
-                                body=self.test_body)
+        response = HtmlResponse("http://example.org/somepage/index.html", body=self.test_body)
 
         class _CrawlSpider(self.spider_class):
             name = "test"
@ -214,8 +213,7 @@ class CrawlSpiderTest(SpiderTest):
     def test_process_links_filter(self):
 
-        response = HtmlResponse("http://example.org/somepage/index.html",
-                                body=self.test_body)
+        response = HtmlResponse("http://example.org/somepage/index.html", body=self.test_body)
 
         class _CrawlSpider(self.spider_class):
             import re
@ -226,6 +224,7 @@ class CrawlSpiderTest(SpiderTest):
                 Rule(LinkExtractor(), process_links="filter_process_links"),
             )
             _test_regex = re.compile('nofollow')
 
             def filter_process_links(self, links):
                 return [link for link in links
                         if not self._test_regex.search(link.url)]
@ -240,8 +239,7 @@ class CrawlSpiderTest(SpiderTest):
     def test_process_links_generator(self):
 
-        response = HtmlResponse("http://example.org/somepage/index.html",
-                                body=self.test_body)
+        response = HtmlResponse("http://example.org/somepage/index.html", body=self.test_body)
 
         class _CrawlSpider(self.spider_class):
             name = "test"
@ -263,6 +261,110 @@ class CrawlSpiderTest(SpiderTest):
                           'http://example.org/about.html',
                           'http://example.org/nofollow.html'])
 
+    def test_process_request(self):
+
+        response = HtmlResponse("http://example.org/somepage/index.html", body=self.test_body)
+
+        def process_request_change_domain(request):
+            return request.replace(url=request.url.replace('.org', '.com'))
+
+        class _CrawlSpider(self.spider_class):
+            name = "test"
+            allowed_domains = ['example.org']
+            rules = (
+                Rule(LinkExtractor(), process_request=process_request_change_domain),
+            )
+
+        with warnings.catch_warnings(record=True) as cw:
+            spider = _CrawlSpider()
+            output = list(spider._requests_to_follow(response))
+            self.assertEqual(len(output), 3)
+            self.assertTrue(all(map(lambda r: isinstance(r, Request), output)))
+            self.assertEqual([r.url for r in output],
+                             ['http://example.com/somepage/item/12.html',
+                              'http://example.com/about.html',
+                              'http://example.com/nofollow.html'])
+            self.assertEqual(len(cw), 1)
+            self.assertEqual(cw[0].category, ScrapyDeprecationWarning)
+
+    def test_process_request_with_response(self):
+
+        response = HtmlResponse("http://example.org/somepage/index.html", body=self.test_body)
+
+        def process_request_meta_response_class(request, response):
+            request.meta['response_class'] = response.__class__.__name__
+            return request
+
+        class _CrawlSpider(self.spider_class):
+            name = "test"
+            allowed_domains = ['example.org']
+            rules = (
+                Rule(LinkExtractor(), process_request=process_request_meta_response_class),
+            )
+
+        spider = _CrawlSpider()
+        output = list(spider._requests_to_follow(response))
+        self.assertEqual(len(output), 3)
+        self.assertTrue(all(map(lambda r: isinstance(r, Request), output)))
+        self.assertEqual([r.url for r in output],
+                         ['http://example.org/somepage/item/12.html',
+                          'http://example.org/about.html',
+                          'http://example.org/nofollow.html'])
+        self.assertEqual([r.meta['response_class'] for r in output],
+                         ['HtmlResponse', 'HtmlResponse', 'HtmlResponse'])
+
+    def test_process_request_instance_method(self):
+
+        response = HtmlResponse("http://example.org/somepage/index.html", body=self.test_body)
+
+        class _CrawlSpider(self.spider_class):
+            name = "test"
+            allowed_domains = ['example.org']
+            rules = (
+                Rule(LinkExtractor(), process_request='process_request_upper'),
+            )
+
+            def process_request_upper(self, request):
+                return request.replace(url=request.url.upper())
+
+        with warnings.catch_warnings(record=True) as cw:
+            spider = _CrawlSpider()
+            output = list(spider._requests_to_follow(response))
+            self.assertEqual(len(output), 3)
+            self.assertTrue(all(map(lambda r: isinstance(r, Request), output)))
+            self.assertEqual([r.url for r in output],
+                             ['http://EXAMPLE.ORG/SOMEPAGE/ITEM/12.HTML',
+                              'http://EXAMPLE.ORG/ABOUT.HTML',
+                              'http://EXAMPLE.ORG/NOFOLLOW.HTML'])
+            self.assertEqual(len(cw), 1)
+            self.assertEqual(cw[0].category, ScrapyDeprecationWarning)
+
+    def test_process_request_instance_method_with_response(self):
+
+        response = HtmlResponse("http://example.org/somepage/index.html", body=self.test_body)
+
+        class _CrawlSpider(self.spider_class):
+            name = "test"
+            allowed_domains = ['example.org']
+            rules = (
+                Rule(LinkExtractor(), process_request='process_request_meta_response_class'),
+            )
+
+            def process_request_meta_response_class(self, request, response):
+                request.meta['response_class'] = response.__class__.__name__
+                return request
+
+        spider = _CrawlSpider()
+        output = list(spider._requests_to_follow(response))
+        self.assertEqual(len(output), 3)
+        self.assertTrue(all(map(lambda r: isinstance(r, Request), output)))
+        self.assertEqual([r.url for r in output],
+                         ['http://example.org/somepage/item/12.html',
+                          'http://example.org/about.html',
+                          'http://example.org/nofollow.html'])
+        self.assertEqual([r.meta['response_class'] for r in output],
+                         ['HtmlResponse', 'HtmlResponse', 'HtmlResponse'])
+
     def test_follow_links_attribute_population(self):
         crawler = get_crawler()
         spider = self.spider_class.from_crawler(crawler, 'example.com')
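The new CrawlSpider tests above exercise Rule(..., process_request=...) callables that accept both the request and the response it was extracted from, and show that single-argument callables now emit ScrapyDeprecationWarning. A minimal sketch of the new-style hook (the spider name, start URL and meta key are made up for illustration):

from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule


def tag_origin(request, response):
    # record which page the link was extracted from
    request.meta['origin_url'] = response.url
    return request


class ExampleCrawlSpider(CrawlSpider):
    name = 'example'                      # hypothetical
    start_urls = ['http://example.org/']  # hypothetical
    rules = (
        Rule(LinkExtractor(), process_request=tag_origin),
    )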
102
tests/test_spidermiddleware.py
Normal file
@ -0,0 +1,102 @@
from twisted.trial.unittest import TestCase
from twisted.python.failure import Failure

from scrapy.spiders import Spider
from scrapy.http import Request, Response
from scrapy.exceptions import _InvalidOutput
from scrapy.utils.test import get_crawler
from scrapy.core.spidermw import SpiderMiddlewareManager
from tests import mock


class SpiderMiddlewareTestCase(TestCase):

    def setUp(self):
        self.request = Request('http://example.com/index.html')
        self.response = Response(self.request.url, request=self.request)
        self.crawler = get_crawler(Spider)
        self.spider = self.crawler._create_spider('foo')
        self.mwman = SpiderMiddlewareManager.from_crawler(self.crawler)

    def _scrape_response(self):
        """Execute spider mw manager's scrape_response method and return the result.
        Raise exception in case of failure.
        """
        scrape_func = mock.MagicMock()
        dfd = self.mwman.scrape_response(scrape_func, self.response, self.request, self.spider)
        # catch deferred result and return the value
        results = []
        dfd.addBoth(results.append)
        self._wait(dfd)
        ret = results[0]
        return ret


class ProcessSpiderInputInvalidOutput(SpiderMiddlewareTestCase):
    """Invalid return value for process_spider_input method"""

    def test_invalid_process_spider_input(self):

        class InvalidProcessSpiderInputMiddleware:
            def process_spider_input(self, response, spider):
                return 1

        self.mwman._add_middleware(InvalidProcessSpiderInputMiddleware())
        result = self._scrape_response()
        self.assertIsInstance(result, Failure)
        self.assertIsInstance(result.value, _InvalidOutput)


class ProcessSpiderOutputInvalidOutput(SpiderMiddlewareTestCase):
    """Invalid return value for process_spider_output method"""

    def test_invalid_process_spider_output(self):

        class InvalidProcessSpiderOutputMiddleware:
            def process_spider_output(self, response, result, spider):
                return 1

        self.mwman._add_middleware(InvalidProcessSpiderOutputMiddleware())
        result = self._scrape_response()
        self.assertIsInstance(result, Failure)
        self.assertIsInstance(result.value, _InvalidOutput)


class ProcessSpiderExceptionInvalidOutput(SpiderMiddlewareTestCase):
    """Invalid return value for process_spider_exception method"""

    def test_invalid_process_spider_exception(self):

        class InvalidProcessSpiderOutputExceptionMiddleware:
            def process_spider_exception(self, response, exception, spider):
                return 1

        class RaiseExceptionProcessSpiderOutputMiddleware:
            def process_spider_output(self, response, result, spider):
                raise Exception()

        self.mwman._add_middleware(InvalidProcessSpiderOutputExceptionMiddleware())
        self.mwman._add_middleware(RaiseExceptionProcessSpiderOutputMiddleware())
        result = self._scrape_response()
        self.assertIsInstance(result, Failure)
        self.assertIsInstance(result.value, _InvalidOutput)


class ProcessSpiderExceptionReRaise(SpiderMiddlewareTestCase):
    """Re raise the exception by returning None"""

    def test_process_spider_exception_return_none(self):

        class ProcessSpiderExceptionReturnNoneMiddleware:
            def process_spider_exception(self, response, exception, spider):
                return None

        class RaiseExceptionProcessSpiderOutputMiddleware:
            def process_spider_output(self, response, result, spider):
                1/0

        self.mwman._add_middleware(ProcessSpiderExceptionReturnNoneMiddleware())
        self.mwman._add_middleware(RaiseExceptionProcessSpiderOutputMiddleware())
        result = self._scrape_response()
        self.assertIsInstance(result, Failure)
        self.assertIsInstance(result.value, ZeroDivisionError)
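For contrast with the deliberately invalid middlewares above, a well-formed spider middleware returns None from process_spider_input and an iterable from process_spider_output; process_spider_exception may return None (to pass the exception along) or an iterable. The tests above check that anything else surfaces as an _InvalidOutput failure. A minimal sketch (the class name and pass-through logic are illustrative, not part of the patch):

class WellFormedSpiderMiddleware:
    def process_spider_input(self, response, spider):
        return None  # or raise an exception to trigger the errback/exception chain

    def process_spider_output(self, response, result, spider):
        for item_or_request in result:
            yield item_or_request  # pass results through unchanged

    def process_spider_exception(self, response, exception, spider):
        return None  # defer to the next middleware's process_spider_exception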
380
tests/test_spidermiddleware_output_chain.py
Normal file
@ -0,0 +1,380 @@
from testfixtures import LogCapture
from twisted.trial.unittest import TestCase
from twisted.internet import defer

from scrapy import Spider, Request
from scrapy.utils.test import get_crawler
from tests.mockserver import MockServer
from tests.spiders import MockServerSpider


class LogExceptionMiddleware:
    def process_spider_exception(self, response, exception, spider):
        spider.logger.info('Middleware: %s exception caught', exception.__class__.__name__)
        return None


# ================================================================================
# (0) recover from an exception on a spider callback
class RecoverySpider(Spider):
    name = 'RecoverySpider'
    custom_settings = {
        'SPIDER_MIDDLEWARES': {
            __name__ + '.RecoveryMiddleware': 10,
        },
    }

    def start_requests(self):
        yield Request(self.mockserver.url('/status?n=200'))

    def parse(self, response):
        yield {'test': 1}
        self.logger.info('DONT_FAIL: %s', response.meta.get('dont_fail'))
        if not response.meta.get('dont_fail'):
            raise TabError()


class RecoveryMiddleware:
    def process_spider_exception(self, response, exception, spider):
        spider.logger.info('Middleware: %s exception caught', exception.__class__.__name__)
        return [
            {'from': 'process_spider_exception'},
            Request(response.url, meta={'dont_fail': True}, dont_filter=True),
        ]


# ================================================================================
# (1) exceptions from a spider middleware's process_spider_input method
class FailProcessSpiderInputMiddleware:
    def process_spider_input(self, response, spider):
        spider.logger.info('Middleware: will raise IndexError')
        raise IndexError()


class ProcessSpiderInputSpiderWithoutErrback(Spider):
    name = 'ProcessSpiderInputSpiderWithoutErrback'
    custom_settings = {
        'SPIDER_MIDDLEWARES': {
            # spider
            __name__ + '.LogExceptionMiddleware': 10,
            __name__ + '.FailProcessSpiderInputMiddleware': 8,
            __name__ + '.LogExceptionMiddleware': 6,
            # engine
        }
    }

    def start_requests(self):
        yield Request(url=self.mockserver.url('/status?n=200'), callback=self.parse)

    def parse(self, response):
        return {'from': 'callback'}


class ProcessSpiderInputSpiderWithErrback(ProcessSpiderInputSpiderWithoutErrback):
    name = 'ProcessSpiderInputSpiderWithErrback'

    def start_requests(self):
        yield Request(url=self.mockserver.url('/status?n=200'), callback=self.parse, errback=self.errback)

    def errback(self, failure):
        self.logger.info('Got a Failure on the Request errback')
        return {'from': 'errback'}


# ================================================================================
# (2) exceptions from a spider callback (generator)
class GeneratorCallbackSpider(Spider):
    name = 'GeneratorCallbackSpider'
    custom_settings = {
        'SPIDER_MIDDLEWARES': {
            __name__ + '.LogExceptionMiddleware': 10,
        },
    }

    def start_requests(self):
        yield Request(self.mockserver.url('/status?n=200'))

    def parse(self, response):
        yield {'test': 1}
        yield {'test': 2}
        raise ImportError()


# ================================================================================
# (3) exceptions from a spider callback (not a generator)
class NotGeneratorCallbackSpider(Spider):
    name = 'NotGeneratorCallbackSpider'
    custom_settings = {
        'SPIDER_MIDDLEWARES': {
            __name__ + '.LogExceptionMiddleware': 10,
        },
    }

    def start_requests(self):
        yield Request(self.mockserver.url('/status?n=200'))

    def parse(self, response):
        return [{'test': 1}, {'test': 1/0}]


# ================================================================================
# (4) exceptions from a middleware process_spider_output method (generator)
class GeneratorOutputChainSpider(Spider):
    name = 'GeneratorOutputChainSpider'
    custom_settings = {
        'SPIDER_MIDDLEWARES': {
            __name__ + '.GeneratorFailMiddleware': 10,
            __name__ + '.GeneratorDoNothingAfterFailureMiddleware': 8,
            __name__ + '.GeneratorRecoverMiddleware': 5,
            __name__ + '.GeneratorDoNothingAfterRecoveryMiddleware': 3,
        },
    }

    def start_requests(self):
        yield Request(self.mockserver.url('/status?n=200'))

    def parse(self, response):
        yield {'processed': ['parse-first-item']}
        yield {'processed': ['parse-second-item']}


class _GeneratorDoNothingMiddleware:
    def process_spider_output(self, response, result, spider):
        for r in result:
            r['processed'].append('{}.process_spider_output'.format(self.__class__.__name__))
            yield r

    def process_spider_exception(self, response, exception, spider):
        method = '{}.process_spider_exception'.format(self.__class__.__name__)
        spider.logger.info('%s: %s caught', method, exception.__class__.__name__)
        return None


class GeneratorFailMiddleware:
    def process_spider_output(self, response, result, spider):
        for r in result:
            r['processed'].append('{}.process_spider_output'.format(self.__class__.__name__))
            yield r
            raise LookupError()

    def process_spider_exception(self, response, exception, spider):
        method = '{}.process_spider_exception'.format(self.__class__.__name__)
        spider.logger.info('%s: %s caught', method, exception.__class__.__name__)
        yield {'processed': [method]}


class GeneratorDoNothingAfterFailureMiddleware(_GeneratorDoNothingMiddleware):
    pass


class GeneratorRecoverMiddleware:
    def process_spider_output(self, response, result, spider):
        for r in result:
            r['processed'].append('{}.process_spider_output'.format(self.__class__.__name__))
            yield r

    def process_spider_exception(self, response, exception, spider):
        method = '{}.process_spider_exception'.format(self.__class__.__name__)
        spider.logger.info('%s: %s caught', method, exception.__class__.__name__)
        yield {'processed': [method]}


class GeneratorDoNothingAfterRecoveryMiddleware(_GeneratorDoNothingMiddleware):
    pass


# ================================================================================
# (5) exceptions from a middleware process_spider_output method (not generator)
class NotGeneratorOutputChainSpider(Spider):
    name = 'NotGeneratorOutputChainSpider'
    custom_settings = {
        'SPIDER_MIDDLEWARES': {
            __name__ + '.NotGeneratorFailMiddleware': 10,
            __name__ + '.NotGeneratorDoNothingAfterFailureMiddleware': 8,
            __name__ + '.NotGeneratorRecoverMiddleware': 5,
            __name__ + '.NotGeneratorDoNothingAfterRecoveryMiddleware': 3,
        },
    }

    def start_requests(self):
        return [Request(self.mockserver.url('/status?n=200'))]

    def parse(self, response):
        return [{'processed': ['parse-first-item']}, {'processed': ['parse-second-item']}]


class _NotGeneratorDoNothingMiddleware:
    def process_spider_output(self, response, result, spider):
        out = []
        for r in result:
            r['processed'].append('{}.process_spider_output'.format(self.__class__.__name__))
            out.append(r)
        return out

    def process_spider_exception(self, response, exception, spider):
        method = '{}.process_spider_exception'.format(self.__class__.__name__)
        spider.logger.info('%s: %s caught', method, exception.__class__.__name__)
        return None


class NotGeneratorFailMiddleware:
    def process_spider_output(self, response, result, spider):
        out = []
        for r in result:
            r['processed'].append('{}.process_spider_output'.format(self.__class__.__name__))
            out.append(r)
        raise ReferenceError()
        return out

    def process_spider_exception(self, response, exception, spider):
        method = '{}.process_spider_exception'.format(self.__class__.__name__)
        spider.logger.info('%s: %s caught', method, exception.__class__.__name__)
        return [{'processed': [method]}]


class NotGeneratorDoNothingAfterFailureMiddleware(_NotGeneratorDoNothingMiddleware):
    pass


class NotGeneratorRecoverMiddleware:
    def process_spider_output(self, response, result, spider):
        out = []
        for r in result:
            r['processed'].append('{}.process_spider_output'.format(self.__class__.__name__))
            out.append(r)
        return out

    def process_spider_exception(self, response, exception, spider):
        method = '{}.process_spider_exception'.format(self.__class__.__name__)
        spider.logger.info('%s: %s caught', method, exception.__class__.__name__)
        return [{'processed': [method]}]


class NotGeneratorDoNothingAfterRecoveryMiddleware(_NotGeneratorDoNothingMiddleware):
    pass


# ================================================================================
class TestSpiderMiddleware(TestCase):
    @classmethod
    def setUpClass(cls):
        cls.mockserver = MockServer()
        cls.mockserver.__enter__()

    @classmethod
    def tearDownClass(cls):
        cls.mockserver.__exit__(None, None, None)

    @defer.inlineCallbacks
    def crawl_log(self, spider):
        crawler = get_crawler(spider)
        with LogCapture() as log:
            yield crawler.crawl(mockserver=self.mockserver)
        raise defer.returnValue(log)

    @defer.inlineCallbacks
    def test_recovery(self):
        """
        (0) Recover from an exception in a spider callback. The final item count should be 3
        (one yielded from the callback method before the exception is raised, one directly
        from the recovery middleware and one from the spider when processing the request that
        was enqueued from the recovery middleware)
        """
        log = yield self.crawl_log(RecoverySpider)
        self.assertIn("Middleware: TabError exception caught", str(log))
        self.assertEqual(str(log).count("Middleware: TabError exception caught"), 1)
        self.assertIn("'item_scraped_count': 3", str(log))

    @defer.inlineCallbacks
    def test_process_spider_input_without_errback(self):
        """
        (1.1) An exception from the process_spider_input chain should be caught by the
        process_spider_exception chain from the start if the Request has no errback
        """
        log1 = yield self.crawl_log(ProcessSpiderInputSpiderWithoutErrback)
        self.assertIn("Middleware: will raise IndexError", str(log1))
        self.assertIn("Middleware: IndexError exception caught", str(log1))

    @defer.inlineCallbacks
    def test_process_spider_input_with_errback(self):
        """
        (1.2) An exception from the process_spider_input chain should not be caught by the
        process_spider_exception chain if the Request has an errback
        """
        log1 = yield self.crawl_log(ProcessSpiderInputSpiderWithErrback)
        self.assertNotIn("Middleware: IndexError exception caught", str(log1))
        self.assertIn("Middleware: will raise IndexError", str(log1))
        self.assertIn("Got a Failure on the Request errback", str(log1))
        self.assertIn("{'from': 'errback'}", str(log1))
        self.assertNotIn("{'from': 'callback'}", str(log1))
        self.assertIn("'item_scraped_count': 1", str(log1))

    @defer.inlineCallbacks
    def test_generator_callback(self):
        """
        (2) An exception from a spider callback (returning a generator) should
        be caught by the process_spider_exception chain. Items yielded before the
        exception is raised should be processed normally.
        """
        log2 = yield self.crawl_log(GeneratorCallbackSpider)
        self.assertIn("Middleware: ImportError exception caught", str(log2))
        self.assertIn("'item_scraped_count': 2", str(log2))

    @defer.inlineCallbacks
    def test_not_a_generator_callback(self):
        """
        (3) An exception from a spider callback (returning a list) should
        be caught by the process_spider_exception chain. No items should be processed.
        """
        log3 = yield self.crawl_log(NotGeneratorCallbackSpider)
        self.assertIn("Middleware: ZeroDivisionError exception caught", str(log3))
        self.assertNotIn("item_scraped_count", str(log3))

    @defer.inlineCallbacks
    def test_generator_output_chain(self):
        """
        (4) An exception from a middleware's process_spider_output method should be sent
        to the process_spider_exception method from the next middleware in the chain.
        The result of the recovery by the process_spider_exception method should be handled
        by the process_spider_output method from the next middleware.
        The final item count should be 2 (one from the spider callback and one from the
        process_spider_exception chain)
        """
        log4 = yield self.crawl_log(GeneratorOutputChainSpider)
        self.assertIn("'item_scraped_count': 2", str(log4))
        self.assertIn("GeneratorRecoverMiddleware.process_spider_exception: LookupError caught", str(log4))
        self.assertIn("GeneratorDoNothingAfterFailureMiddleware.process_spider_exception: LookupError caught", str(log4))
        self.assertNotIn("GeneratorFailMiddleware.process_spider_exception: LookupError caught", str(log4))
        self.assertNotIn("GeneratorDoNothingAfterRecoveryMiddleware.process_spider_exception: LookupError caught", str(log4))
        item_from_callback = {'processed': [
            'parse-first-item',
            'GeneratorFailMiddleware.process_spider_output',
            'GeneratorDoNothingAfterFailureMiddleware.process_spider_output',
            'GeneratorRecoverMiddleware.process_spider_output',
            'GeneratorDoNothingAfterRecoveryMiddleware.process_spider_output']}
        item_recovered = {'processed': [
            'GeneratorRecoverMiddleware.process_spider_exception',
            'GeneratorDoNothingAfterRecoveryMiddleware.process_spider_output']}
        self.assertIn(str(item_from_callback), str(log4))
        self.assertIn(str(item_recovered), str(log4))
        self.assertNotIn('parse-second-item', str(log4))

    @defer.inlineCallbacks
    def test_not_a_generator_output_chain(self):
        """
        (5) An exception from a middleware's process_spider_output method should be sent
        to the process_spider_exception method from the next middleware in the chain.
        The result of the recovery by the process_spider_exception method should be handled
        by the process_spider_output method from the next middleware.
        The final item count should be 1 (from the process_spider_exception chain, the items
        from the spider callback are lost)
        """
        log5 = yield self.crawl_log(NotGeneratorOutputChainSpider)
        self.assertIn("'item_scraped_count': 1", str(log5))
        self.assertIn("GeneratorRecoverMiddleware.process_spider_exception: ReferenceError caught", str(log5))
        self.assertIn("GeneratorDoNothingAfterFailureMiddleware.process_spider_exception: ReferenceError caught", str(log5))
        self.assertNotIn("GeneratorFailMiddleware.process_spider_exception: ReferenceError caught", str(log5))
        self.assertNotIn("GeneratorDoNothingAfterRecoveryMiddleware.process_spider_exception: ReferenceError caught", str(log5))
        item_recovered = {'processed': [
            'NotGeneratorRecoverMiddleware.process_spider_exception',
            'NotGeneratorDoNothingAfterRecoveryMiddleware.process_spider_output']}
        self.assertIn(str(item_recovered), str(log5))
        self.assertNotIn('parse-first-item', str(log5))
        self.assertNotIn('parse-second-item', str(log5))
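The chain tests above rely on the recovery pattern where a process_spider_exception method returns an iterable; that iterable is then handed to the remaining process_spider_output methods in the chain. A condensed sketch of the pattern, modeled on RecoveryMiddleware above (the middleware name and item fields are illustrative):

from scrapy import Request


class ErrorRecoveryMiddleware:
    def process_spider_exception(self, response, exception, spider):
        spider.logger.info('Recovering from %s', exception.__class__.__name__)
        return [
            {'error': exception.__class__.__name__, 'url': response.url},
            Request(response.url, dont_filter=True),  # optionally retry the page
        ]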
@ -3,12 +3,13 @@ import os
 import unittest
 
 from scrapy.item import Item, Field
-from scrapy.utils.misc import arg_to_iter, create_instance, load_object, walk_modules
+from scrapy.utils.misc import arg_to_iter, create_instance, load_object, set_environ, walk_modules
 
 from tests import mock
 
 __doctests__ = ['scrapy.utils.misc']
 
 
 class UtilsMiscTestCase(unittest.TestCase):
 
     def test_load_object(self):
@ -130,5 +131,18 @@ class UtilsMiscTestCase(unittest.TestCase):
         with self.assertRaises(ValueError):
             create_instance(m, None, None)
 
+    def test_set_environ(self):
+        assert os.environ.get('some_test_environ') is None
+        with set_environ(some_test_environ='test_value'):
+            assert os.environ.get('some_test_environ') == 'test_value'
+        assert os.environ.get('some_test_environ') is None
+
+        os.environ['some_test_environ'] = 'test'
+        assert os.environ.get('some_test_environ') == 'test'
+        with set_environ(some_test_environ='test_value'):
+            assert os.environ.get('some_test_environ') == 'test_value'
+        assert os.environ.get('some_test_environ') == 'test'
+
 
 if __name__ == "__main__":
     unittest.main()
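test_set_environ above pins down the behaviour of the new set_environ helper: the given variables hold for the duration of the with block, and the previous value (or absence) is restored afterwards. A rough sketch of such a helper, for illustration only and not necessarily Scrapy's actual implementation:

import os
from contextlib import contextmanager


@contextmanager
def set_environ(**kwargs):
    # remember the current values (None means "was not set")
    original = {k: os.environ.get(k) for k in kwargs}
    os.environ.update(kwargs)
    try:
        yield
    finally:
        for k, v in original.items():
            if v is None:
                os.environ.pop(k, None)
            else:
                os.environ[k] = v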
@ -9,11 +9,23 @@ import six
 from scrapy.utils.python import (
     memoizemethod_noargs, binary_is_text, equal_attributes,
     WeakKeyCache, stringify_dict, get_func_args, to_bytes, to_unicode,
-    without_none_values)
+    without_none_values, MutableChain)
 
 __doctests__ = ['scrapy.utils.python']
 
 
+class MutableChainTest(unittest.TestCase):
+    def test_mutablechain(self):
+        m = MutableChain(range(2), [2, 3], (4, 5))
+        m.extend(range(6, 7))
+        m.extend([7, 8])
+        m.extend([9, 10], (11, 12))
+        self.assertEqual(next(m), 0)
+        self.assertEqual(m.next(), 1)
+        self.assertEqual(m.__next__(), 2)
+        self.assertEqual(list(m), list(range(3, 13)))
+
+
 class ToUnicodeTest(unittest.TestCase):
     def test_converting_an_utf8_encoded_string_to_unicode(self):
         self.assertEqual(to_unicode(b'lel\xc3\xb1e'), u'lel\xf1e')
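The MutableChain test above describes an iterator over several iterables that can still be extended while it is being consumed. A behavioural sketch consistent with that test, again not claimed to be Scrapy's actual implementation:

import itertools


class MutableChain:
    """Iterate over several iterables; more iterables can be appended mid-iteration."""

    def __init__(self, *iterables):
        self.data = itertools.chain(*iterables)

    def extend(self, *iterables):
        # re-wrap the remaining items together with the newly appended iterables
        self.data = itertools.chain(self.data, *iterables)

    def __iter__(self):
        return self

    def __next__(self):
        return next(self.data)

    next = __next__  # Python 2 alias, matching the m.next() call in the test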
@ -1,9 +1,12 @@
 # -*- coding: utf-8 -*-
 import unittest
+import sys
+
+import six
 
 from scrapy.http import Request, FormRequest
 from scrapy.spiders import Spider
-from scrapy.utils.reqser import request_to_dict, request_from_dict
+from scrapy.utils.reqser import request_to_dict, request_from_dict, _is_private_method, _mangle_private_name
 
 
 class RequestSerializationTest(unittest.TestCase):
@ -70,6 +73,56 @@ class RequestSerializationTest(unittest.TestCase):
                     errback=self.spider.handle_error)
         self._assert_serializes_ok(r, spider=self.spider)
 
+    def test_private_callback_serialization(self):
+        r = Request("http://www.example.com",
+                    callback=self.spider._TestSpider__parse_item_private,
+                    errback=self.spider.handle_error)
+        self._assert_serializes_ok(r, spider=self.spider)
+
+    def test_mixin_private_callback_serialization(self):
+        if sys.version_info[0] < 3:
+            return
+        r = Request("http://www.example.com",
+                    callback=self.spider._TestSpiderMixin__mixin_callback,
+                    errback=self.spider.handle_error)
+        self._assert_serializes_ok(r, spider=self.spider)
+
+    def test_private_callback_name_matching(self):
+        self.assertTrue(_is_private_method('__a'))
+        self.assertTrue(_is_private_method('__a_'))
+        self.assertTrue(_is_private_method('__a_a'))
+        self.assertTrue(_is_private_method('__a_a_'))
+        self.assertTrue(_is_private_method('__a__a'))
+        self.assertTrue(_is_private_method('__a__a_'))
+        self.assertTrue(_is_private_method('__a___a'))
+        self.assertTrue(_is_private_method('__a___a_'))
+        self.assertTrue(_is_private_method('___a'))
+        self.assertTrue(_is_private_method('___a_'))
+        self.assertTrue(_is_private_method('___a_a'))
+        self.assertTrue(_is_private_method('___a_a_'))
+        self.assertTrue(_is_private_method('____a_a_'))
+
+        self.assertFalse(_is_private_method('_a'))
+        self.assertFalse(_is_private_method('_a_'))
+        self.assertFalse(_is_private_method('__a__'))
+        self.assertFalse(_is_private_method('__'))
+        self.assertFalse(_is_private_method('___'))
+        self.assertFalse(_is_private_method('____'))
+
+    def _assert_mangles_to(self, obj, name):
+        func = getattr(obj, name)
+        self.assertEqual(
+            _mangle_private_name(obj, func, func.__name__),
+            name
+        )
+
+    def test_private_name_mangling(self):
+        self._assert_mangles_to(
+            self.spider, '_TestSpider__parse_item_private')
+        if sys.version_info[0] >= 3:
+            self._assert_mangles_to(
+                self.spider, '_TestSpiderMixin__mixin_callback')
+
     def test_unserializable_callback1(self):
         r = Request("http://www.example.com", callback=lambda x: x)
         self.assertRaises(ValueError, request_to_dict, r)
@ -80,7 +133,12 @@ class RequestSerializationTest(unittest.TestCase):
         self.assertRaises(ValueError, request_to_dict, r)
 
 
-class TestSpider(Spider):
+class TestSpiderMixin(object):
+    def __mixin_callback(self, response):
+        pass
+
+
+class TestSpider(Spider, TestSpiderMixin):
     name = 'test'
 
     def parse_item(self, response):
@ -89,6 +147,9 @@ class TestSpider(Spider):
     def handle_error(self, failure):
         pass
 
+    def __parse_item_private(self, response):
+        pass
+
 
 class CustomRequest(Request):
     pass
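The new reqser tests lean on Python's private-name mangling: a double-underscore method defined on TestSpider is stored as _TestSpider__parse_item_private, and _mangle_private_name must reproduce exactly that name when serializing the callback. A short illustration with made-up names:

# Made-up class; demonstrates the name mangling the tests above depend on.
class MySpider:
    def __private_callback(self, response):  # stored as _MySpider__private_callback
        return response


spider = MySpider()
assert hasattr(spider, '_MySpider__private_callback')
assert not hasattr(spider, '__private_callback')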
6
tox.ini
@ -105,6 +105,12 @@ deps = {[docs]deps}
 commands =
     sphinx-build -W -b html . {envtmpdir}/html
 
+[testenv:docs-coverage]
+changedir = {[docs]changedir}
+deps = {[docs]deps}
+commands =
+    sphinx-build -b coverage . {envtmpdir}/coverage
+
 [testenv:docs-links]
 changedir = {[docs]changedir}
 deps = {[docs]deps}