Merge remote-tracking branch 'origin/master' into callback_kwargs
commit 428309ba1a
@ -12,7 +12,8 @@ branches:
|
||||
|
||||
install:
|
||||
- "SET PATH=%PYTHON%;%PYTHON%\\Scripts;%PATH%"
|
||||
- "SET TOX_TESTENV_PASSENV=HOME USERPROFILE HOMEPATH HOMEDRIVE"
|
||||
- "SET PYTHONPATH=%APPVEYOR_BUILD_FOLDER%"
|
||||
- "SET TOX_TESTENV_PASSENV=HOME HOMEDRIVE HOMEPATH PYTHONPATH USERPROFILE"
|
||||
- "pip install -U tox"
|
||||
|
||||
build: false
|
||||
|
@ -82,6 +82,9 @@ pydoc-topics: build
|
||||
@echo "Building finished; now copy build/pydoc-topics/pydoc_topics.py " \
|
||||
"into the Lib/ directory"
|
||||
|
||||
coverage: BUILDER = coverage
|
||||
coverage: build
|
||||
|
||||
htmlview: html
|
||||
$(PYTHON) -c "import webbrowser, os; webbrowser.open('file://' + \
|
||||
os.path.realpath('build/html/index.html'))"
|
||||
|
22
docs/conf.py
@ -28,7 +28,8 @@ sys.path.insert(0, path.dirname(path.dirname(__file__)))
|
||||
# coming with Sphinx (named 'sphinx.ext.*') or your custom ones.
|
||||
extensions = [
|
||||
'scrapydocs',
|
||||
'sphinx.ext.autodoc'
|
||||
'sphinx.ext.autodoc',
|
||||
'sphinx.ext.coverage',
|
||||
]
|
||||
|
||||
# Add any paths that contain templates here, relative to this directory.
|
||||
@ -218,3 +219,22 @@ linkcheck_ignore = [
|
||||
'http://localhost:\d+', 'http://hg.scrapy.org',
|
||||
'http://directory.google.com/'
|
||||
]
|
||||
|
||||
|
||||
# Options for the Coverage extension
|
||||
# ----------------------------------
|
||||
coverage_ignore_pyobjects = [
|
||||
# Contract’s add_pre_hook and add_post_hook are not documented because
|
||||
# they should be transparent to contract developers, for whom pre_hook and
|
||||
# post_hook should be the actual concern.
|
||||
r'\bContract\.add_(pre|post)_hook$',
|
||||
|
||||
# ContractsManager is an internal class, developers are not expected to
|
||||
# interact with it directly in any way.
|
||||
r'\bContractsManager\b$',
|
||||
|
||||
# For default contracts we only want to document their general purpose in
|
||||
# their constructor, the methods they reimplement to achieve that purpose
|
||||
# should be irrelevant to developers using those contracts.
|
||||
r'\w+Contract\.(adjust_request_args|(pre|post)_process)$',
|
||||
]
|
||||
|
@ -99,6 +99,15 @@ Well-written patches should:
|
||||
the documentation changes in the same patch. See `Documentation policies`_
|
||||
below.
|
||||
|
||||
* if you're adding a private API, please add a regular expression to the
|
||||
``coverage_ignore_pyobjects`` variable of ``docs/conf.py`` to exclude the new
|
||||
private API from documentation coverage checks.
|
||||
|
||||
To see if your private API is skipped properly, generate a documentation
|
||||
coverage report as follows::
|
||||
|
||||
tox -e docs-coverage
|
||||
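For example, if the private API you added were a hypothetical
``MyComponent._helper`` method, the corresponding ``docs/conf.py`` entry could
look like this (a sketch; adjust the pattern to your actual API)::

    coverage_ignore_pyobjects = [
        # ... existing patterns ...
        r'\bMyComponent\._helper$',
    ]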
|
||||
.. _submitting-patches:
|
||||
|
||||
Submitting patches
|
||||
@ -167,8 +176,9 @@ Documentation policies
|
||||
|
||||
For reference documentation of API members (classes, methods, etc.) use
|
||||
docstrings and make sure that the Sphinx documentation uses the autodoc_
|
||||
extension to pull the docstrings. API reference documentation should be
|
||||
IDE-friendly: short, to the point, and it may provide short examples.
|
||||
extension to pull the docstrings. API reference documentation should follow
|
||||
docstring conventions (`PEP 257`_) and be IDE-friendly: short, to the point,
|
||||
and it may provide short examples.
|
||||
|
||||
Other types of documentation, such as tutorials or topics, should be covered in
|
||||
files within the ``docs/`` directory. This includes documentation that is
|
||||
@ -205,6 +215,29 @@ To run a specific test (say ``tests/test_loader.py``) use:
|
||||
|
||||
``tox -- tests/test_loader.py``
|
||||
|
||||
To run the tests on a specific tox_ environment, use ``-e <name>`` with an
|
||||
environment name from ``tox.ini``. For example, to run the tests with Python
|
||||
3.6 use::
|
||||
|
||||
tox -e py36
|
||||
|
||||
You can also specify a comma-separated list of environments, and use `tox’s
|
||||
parallel mode`_ to run the tests on multiple environments in parallel::
|
||||
|
||||
tox -e py27,py36 -p auto
|
||||
|
||||
To pass command-line options to pytest_, add them after ``--`` in your call to
|
||||
tox_. Using ``--`` overrides the default positional arguments defined in
|
||||
``tox.ini``, so you must include those default positional arguments
|
||||
(``scrapy tests``) after ``--`` as well::
|
||||
|
||||
tox -- scrapy tests -x # stop after first failure
|
||||
|
||||
You can also use the `pytest-xdist`_ plugin. For example, to run all tests on
|
||||
the Python 3.6 tox_ environment using all your CPU cores::
|
||||
|
||||
tox -e py36 -- scrapy tests -n auto
|
||||
|
||||
To see the coverage report, install `coverage`_ (``pip install coverage``) and run:
|
||||
|
||||
``coverage report``
|
||||
@ -237,5 +270,9 @@ And their unit-tests are in::
|
||||
.. _AUTHORS: https://github.com/scrapy/scrapy/blob/master/AUTHORS
|
||||
.. _tests/: https://github.com/scrapy/scrapy/tree/master/tests
|
||||
.. _open issues: https://github.com/scrapy/scrapy/issues
|
||||
.. _pull request: https://help.github.com/send-pull-requests/
|
||||
.. _PEP 257: https://www.python.org/dev/peps/pep-0257/
|
||||
.. _pull request: https://help.github.com/en/articles/creating-a-pull-request
|
||||
.. _pytest: https://docs.pytest.org/en/latest/usage.html
|
||||
.. _pytest-xdist: https://docs.pytest.org/en/3.0.0/xdist.html
|
||||
.. _tox: https://pypi.python.org/pypi/tox
|
||||
.. _tox’s parallel mode: https://tox.readthedocs.io/en/latest/example/basic.html#parallel-mode
|
||||
|
@ -158,6 +158,7 @@ Solving specific problems
|
||||
topics/practices
|
||||
topics/broad-crawls
|
||||
topics/developer-tools
|
||||
topics/dynamic-content
|
||||
topics/leaks
|
||||
topics/media-pipeline
|
||||
topics/deploy
|
||||
@ -183,6 +184,9 @@ Solving specific problems
|
||||
:doc:`topics/developer-tools`
|
||||
Learn how to scrape with your browser's developer tools.
|
||||
|
||||
:doc:`topics/dynamic-content`
|
||||
Read webpage data that is loaded dynamically.
|
||||
|
||||
:doc:`topics/leaks`
|
||||
Learn how to find and get rid of memory leaks in your crawler.
|
||||
|
||||
|
@ -205,7 +205,7 @@ Extracting data
|
||||
---------------
|
||||
|
||||
The best way to learn how to extract data with Scrapy is trying selectors
|
||||
using the shell :ref:`Scrapy shell <topics-shell>`. Run::
|
||||
using the :ref:`Scrapy shell <topics-shell>`. Run::
|
||||
|
||||
scrapy shell 'http://quotes.toscrape.com/page/1/'
|
||||
|
||||
@ -296,8 +296,8 @@ expressions`_::
|
||||
|
||||
In order to find the proper CSS selectors to use, you might find it useful to open
the response page from the shell in your web browser using ``view(response)``.
|
||||
You can use your browser developer tools to inspect the HTML and come up
|
||||
with a selector (see section about :ref:`topics-developer-tools`).
|
||||
You can use your browser's developer tools to inspect the HTML and come up
|
||||
with a selector (see :ref:`topics-developer-tools`).
|
||||
|
||||
`Selector Gadget`_ is also a nice tool to quickly find CSS selectors for
visually selected elements, which works in many browsers.
|
||||
@ -379,11 +379,11 @@ variable, so that we can run our CSS selectors directly on a particular quote::
|
||||
|
||||
>>> quote = response.css("div.quote")[0]
|
||||
|
||||
Now, let's extract ``title``, ``author`` and the ``tags`` from that quote
|
||||
Now, let's extract ``text``, ``author`` and the ``tags`` from that quote
|
||||
using the ``quote`` object we just created::
|
||||
|
||||
>>> title = quote.css("span.text::text").get()
|
||||
>>> title
|
||||
>>> text = quote.css("span.text::text").get()
|
||||
>>> text
|
||||
'“The world as we have created it is a process of our thinking. It cannot be changed without changing our thinking.”'
|
||||
>>> author = quote.css("small.author::text").get()
|
||||
>>> author
|
||||
@ -511,7 +511,7 @@ We can try extracting it in the shell::
|
||||
'<a href="/page/2/">Next <span aria-hidden="true">→</span></a>'
|
||||
|
||||
This gets the anchor element, but we want the attribute ``href``. For that,
|
||||
Scrapy supports a CSS extension that let's you select the attribute contents,
|
||||
Scrapy supports a CSS extension that lets you select the attribute contents,
|
||||
like this::
|
||||
|
||||
>>> response.css('li.next a::attr(href)').get()
|
||||
|
@ -1,2 +1,2 @@
|
||||
Sphinx>=1.6
|
||||
Sphinx>=2.1
|
||||
sphinx_rtd_theme
|
@ -99,6 +99,8 @@ how you :ref:`configure the downloader middlewares
|
||||
|
||||
Returns a deferred that is fired when the crawl is finished.
|
||||
|
||||
.. automethod:: stop
|
||||
|
||||
.. autoclass:: CrawlerRunner
|
||||
:members:
|
||||
|
||||
@ -154,7 +156,7 @@ Settings API
|
||||
SpiderLoader API
|
||||
================
|
||||
|
||||
.. module:: scrapy.loader
|
||||
.. module:: scrapy.spiderloader
|
||||
:synopsis: The spider loader
|
||||
|
||||
.. class:: SpiderLoader
|
||||
|
@ -39,6 +39,17 @@ you need to keep in mind when using Scrapy for doing broad crawls, along with
|
||||
concrete suggestions of Scrapy settings to tune in order to achieve an
|
||||
efficient broad crawl.
|
||||
|
||||
Use the right :setting:`SCHEDULER_PRIORITY_QUEUE`
|
||||
=================================================
|
||||
|
||||
Scrapy’s default scheduler priority queue is ``'scrapy.pqueues.ScrapyPriorityQueue'``.
|
||||
It works best during single-domain crawls. It does not work well when crawling
many different domains in parallel.

To apply the recommended priority queue, use::
|
||||
|
||||
SCHEDULER_PRIORITY_QUEUE = 'scrapy.pqueues.DownloaderAwarePriorityQueue'
|
||||
|
||||
Increase concurrency
|
||||
====================
|
||||
|
||||
@ -85,7 +96,7 @@ When doing broad crawls you are often only interested in the crawl rates you
|
||||
get and any errors found. These stats are reported by Scrapy when using the
|
||||
``INFO`` log level. In order to save CPU (and log storage requirements) you
|
||||
should not use ``DEBUG`` log level when performing large broad crawls in
|
||||
production. Using ``DEBUG`` level when developing your (broad) crawler may be
|
||||
production. Using ``DEBUG`` level when developing your (broad) crawler may be
|
||||
fine though.
|
||||
|
||||
To set the log level use::
|
||||
|
@ -120,3 +120,23 @@ get the failures pretty printed::
|
||||
for header in self.args:
|
||||
if header not in response.headers:
|
||||
raise ContractFail('X-CustomHeader not present')
|
||||
|
||||
|
||||
Detecting check runs
|
||||
====================
|
||||
|
||||
When ``scrapy check`` is running, the ``SCRAPY_CHECK`` environment variable is
|
||||
set to the ``true`` string. You can use `os.environ`_ to perform any change to
|
||||
your spiders or your settings when ``scrapy check`` is used::
|
||||
|
||||
import os
|
||||
import scrapy
|
||||
|
||||
class ExampleSpider(scrapy.Spider):
|
||||
name = 'example'
|
||||
|
||||
def __init__(self):
|
||||
if os.environ.get('SCRAPY_CHECK'):
|
||||
pass # Do some scraper adjustments when a check is running
|
||||
|
||||
.. _os.environ: https://docs.python.org/3/library/os.html#os.environ
|
||||
|
@ -805,6 +805,7 @@ The :class:`MetaRefreshMiddleware` can be configured through the following
|
||||
settings (see the settings documentation for more info):
|
||||
|
||||
* :setting:`METAREFRESH_ENABLED`
|
||||
* :setting:`METAREFRESH_IGNORE_TAGS`
|
||||
* :setting:`METAREFRESH_MAXDELAY`
|
||||
|
||||
This middleware obey :setting:`REDIRECT_MAX_TIMES` setting, :reqmeta:`dont_redirect`,
|
||||
@ -826,6 +827,15 @@ Default: ``True``
|
||||
|
||||
Whether the Meta Refresh middleware will be enabled.
|
||||
|
||||
.. setting:: METAREFRESH_IGNORE_TAGS
|
||||
|
||||
METAREFRESH_IGNORE_TAGS
|
||||
^^^^^^^^^^^^^^^^^^^^^^^
|
||||
|
||||
Default: ``['script', 'noscript']``
|
||||
|
||||
Meta tags within these tags are ignored.
|
||||
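For example, to also ignore ``<meta>`` tags found inside ``<iframe>`` elements
(a hypothetical addition to the default list), you could set this in your
``settings.py``::

    METAREFRESH_IGNORE_TAGS = ['script', 'noscript', 'iframe']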
|
||||
.. setting:: METAREFRESH_MAXDELAY
|
||||
|
||||
METAREFRESH_MAXDELAY
|
||||
|
246
docs/topics/dynamic-content.rst
Normal file
@ -0,0 +1,246 @@
|
||||
.. _topics-dynamic-content:
|
||||
|
||||
====================================
|
||||
Selecting dynamically-loaded content
|
||||
====================================
|
||||
|
||||
Some webpages show the desired data when you load them in a web browser.
|
||||
However, when you download them using Scrapy, you cannot reach the desired data
|
||||
using :ref:`selectors <topics-selectors>`.
|
||||
|
||||
When this happens, the recommended approach is to
|
||||
:ref:`find the data source <topics-finding-data-source>` and extract the data
|
||||
from it.
|
||||
|
||||
If you fail to do that, and you can nonetheless access the desired data through
|
||||
the :ref:`DOM <topics-livedom>` from your web browser, see
|
||||
:ref:`topics-javascript-rendering`.
|
||||
|
||||
.. _topics-finding-data-source:
|
||||
|
||||
Finding the data source
|
||||
=======================
|
||||
|
||||
To extract the desired data, you must first find its source location.
|
||||
|
||||
If the data is in a non-text-based format, such as an image or a PDF document,
|
||||
use the :ref:`network tool <topics-network-tool>` of your web browser to find
|
||||
the corresponding request, and :ref:`reproduce it
|
||||
<topics-reproducing-requests>`.
|
||||
|
||||
If your web browser lets you select the desired data as text, the data may be
|
||||
defined in embedded JavaScript code, or loaded from an external resource in a
|
||||
text-based format.
|
||||
|
||||
In that case, you can use a tool like wgrep_ to find the URL of that resource.
|
||||
|
||||
If the data turns out to come from the original URL itself, you must
|
||||
:ref:`inspect the source code of the webpage <topics-inspecting-source>` to
|
||||
determine where the data is located.
|
||||
|
||||
If the data comes from a different URL, you will need to :ref:`reproduce the
|
||||
corresponding request <topics-reproducing-requests>`.
|
||||
|
||||
.. _topics-inspecting-source:
|
||||
|
||||
Inspecting the source code of a webpage
|
||||
=======================================
|
||||
|
||||
Sometimes you need to inspect the source code of a webpage (not the
|
||||
:ref:`DOM <topics-livedom>`) to determine where some desired data is located.
|
||||
|
||||
Use Scrapy’s :command:`fetch` command to download the webpage contents as seen
|
||||
by Scrapy::
|
||||
|
||||
scrapy fetch --nolog https://example.com > response.html
|
||||
|
||||
If the desired data is in embedded JavaScript code within a ``<script/>``
|
||||
element, see :ref:`topics-parsing-javascript`.
|
||||
|
||||
If you cannot find the desired data, first make sure it’s not just Scrapy:
|
||||
download the webpage with an HTTP client like curl_ or wget_ and see if the
|
||||
information can be found in the response they get.
|
||||
|
||||
If they get a response with the desired data, modify your Scrapy
|
||||
:class:`~scrapy.http.Request` to match that of the other HTTP client. For
|
||||
example, try using the same user-agent string (:setting:`USER_AGENT`) or the
|
||||
same :attr:`~scrapy.http.Request.headers`.
|
||||
|
||||
If they also get a response without the desired data, you’ll need to take
|
||||
steps to make your request more similar to that of the web browser. See
|
||||
:ref:`topics-reproducing-requests`.
|
||||
|
||||
.. _topics-reproducing-requests:
|
||||
|
||||
Reproducing requests
|
||||
====================
|
||||
|
||||
Sometimes we need to reproduce a request the way our web browser performs it.
|
||||
|
||||
Use the :ref:`network tool <topics-network-tool>` of your web browser to see
|
||||
how your web browser performs the desired request, and try to reproduce that
|
||||
request with Scrapy.
|
||||
|
||||
It might be enough to yield a :class:`~scrapy.http.Request` with the same HTTP
|
||||
method and URL. However, you may also need to reproduce the body, headers and
|
||||
form parameters (see :class:`~scrapy.http.FormRequest`) of that request.
|
||||
|
||||
Once you get the expected response, you can :ref:`extract the desired data from
|
||||
it <topics-handling-response-formats>`.
|
||||
|
||||
You can reproduce any request with Scrapy. However, sometimes reproducing all
necessary requests may not be efficient in developer time. If that is your
|
||||
case, and crawling speed is not a major concern for you, you can alternatively
|
||||
consider :ref:`JavaScript pre-rendering <topics-javascript-rendering>`.
|
||||
|
||||
If you get the expected response *sometimes*, but not always, the issue is
|
||||
probably not your request, but the target server. The target server might be
|
||||
buggy, overloaded, or :ref:`banning <bans>` some of your requests.
|
||||
|
||||
.. _topics-handling-response-formats:
|
||||
|
||||
Handling different response formats
|
||||
===================================
|
||||
|
||||
Once you have a response with the desired data, how you extract the desired
|
||||
data from it depends on the type of response:
|
||||
|
||||
- If the response is HTML or XML, use :ref:`selectors
|
||||
<topics-selectors>` as usual.
|
||||
|
||||
- If the response is JSON, use `json.loads`_ to load the desired data from
|
||||
:attr:`response.text <scrapy.http.TextResponse.text>`::
|
||||
|
||||
data = json.loads(response.text)
|
||||
|
||||
If the desired data is inside HTML or XML code embedded within JSON data,
|
||||
you can load that HTML or XML code into a
|
||||
:class:`~scrapy.selector.Selector` and then
|
||||
:ref:`use it <topics-selectors>` as usual::
|
||||
|
||||
selector = Selector(text=data['html'])
|
||||
|
||||
- If the response is JavaScript, or HTML with a ``<script/>`` element
|
||||
containing the desired data, see :ref:`topics-parsing-javascript`.
|
||||
|
||||
- If the response is CSS, use a `regular expression`_ to extract the desired
|
||||
data from :attr:`response.text <scrapy.http.TextResponse.text>`.
|
||||
|
||||
.. _topics-parsing-images:
|
||||
|
||||
- If the response is an image or another format based on images (e.g. PDF),
|
||||
read the response as bytes from
|
||||
:attr:`response.body <scrapy.http.TextResponse.body>` and use an OCR
|
||||
solution to extract the desired data as text.
|
||||
|
||||
For example, you can use pytesseract_ (see the sketch after this list). To
read a table from a PDF, `tabula-py`_ may be a better choice.
|
||||
|
||||
- If the response is SVG, or HTML with embedded SVG containing the desired
|
||||
data, you may be able to extract the desired data using
|
||||
:ref:`selectors <topics-selectors>`, since SVG is based on XML.
|
||||
|
||||
Otherwise, you might need to convert the SVG code into a raster image, and
|
||||
:ref:`handle that raster image <topics-parsing-images>`.
|
||||
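As a sketch of the OCR approach mentioned in the list above, assuming an image
response and that pytesseract_ and Pillow are installed, a spider callback
could do::

    import io

    import pytesseract
    from PIL import Image

    def parse(self, response):
        # response.body holds the raw image bytes downloaded by Scrapy.
        image = Image.open(io.BytesIO(response.body))
        text = pytesseract.image_to_string(image)
        self.logger.info('Extracted text: %s', text)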
|
||||
.. _topics-parsing-javascript:
|
||||
|
||||
Parsing JavaScript code
|
||||
=======================
|
||||
|
||||
If the desired data is hardcoded in JavaScript, you first need to get the
|
||||
JavaScript code:
|
||||
|
||||
- If the JavaScript code is in a JavaScript file, simply read
|
||||
:attr:`response.text <scrapy.http.TextResponse.text>`.
|
||||
|
||||
- If the JavaScript code is within a ``<script/>`` element of an HTML page,
|
||||
use :ref:`selectors <topics-selectors>` to extract the text within that
|
||||
``<script/>`` element.
|
||||
|
||||
Once you have a string with the JavaScript code, you can extract the desired
|
||||
data from it:
|
||||
|
||||
- You might be able to use a `regular expression`_ to extract the desired
|
||||
data in JSON format, which you can then parse with `json.loads`_.
|
||||
|
||||
For example, if the JavaScript code contains a separate line like
|
||||
``var data = {"field": "value"};`` you can extract that data as follows::
|
||||
|
||||
>>> pattern = r'\bvar\s+data\s*=\s*(\{.*?\})\s*;\s*\n'
|
||||
>>> json_data = response.css('script::text').re_first(pattern)
|
||||
>>> json.loads(json_data)
|
||||
{'field': 'value'}
|
||||
|
||||
- Otherwise, use js2xml_ to convert the JavaScript code into an XML document
|
||||
that you can parse using :ref:`selectors <topics-selectors>`.
|
||||
|
||||
For example, if the JavaScript code contains
|
||||
``var data = {field: "value"};`` you can extract that data as follows::
|
||||
|
||||
>>> import js2xml
|
||||
>>> import lxml.etree
|
||||
>>> from parsel import Selector
|
||||
>>> javascript = response.css('script::text').get()
|
||||
>>> xml = lxml.etree.tostring(js2xml.parse(javascript), encoding='unicode')
|
||||
>>> selector = Selector(text=xml)
|
||||
>>> selector.css('var[name="data"]').get()
|
||||
'<var name="data"><object><property name="field"><string>value</string></property></object></var>'
|
||||
|
||||
.. _topics-javascript-rendering:
|
||||
|
||||
Pre-rendering JavaScript
|
||||
========================
|
||||
|
||||
On webpages that fetch data from additional requests, reproducing those
|
||||
requests that contain the desired data is the preferred approach. The effort is
|
||||
often worth the result: structured, complete data with minimum parsing time and
|
||||
network transfer.
|
||||
|
||||
However, sometimes it can be really hard to reproduce certain requests. Or you
|
||||
may need something that no request can give you, such as a screenshot of a
|
||||
webpage as seen in a web browser.
|
||||
|
||||
In these cases use the Splash_ JavaScript-rendering service, along with
|
||||
`scrapy-splash`_ for seamless integration.
|
||||
|
||||
Splash returns as HTML the :ref:`DOM <topics-livedom>` of a webpage, so that
|
||||
you can parse it with :ref:`selectors <topics-selectors>`. It provides great
|
||||
flexibility through configuration_ or scripting_.
|
||||
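A minimal sketch, assuming Splash is running and `scrapy-splash`_ is configured
in your settings as described in its README (``SPLASH_URL`` plus the
scrapy-splash middlewares)::

    from scrapy_splash import SplashRequest

    def start_requests(self):
        # Wait a couple of seconds so the page's JavaScript has time to run.
        yield SplashRequest('https://example.com', self.parse, args={'wait': 2})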
|
||||
If you need something beyond what Splash offers, such as interacting with the
|
||||
DOM on-the-fly from Python code instead of using a previously-written script,
|
||||
or handling multiple web browser windows, you might need to
|
||||
:ref:`use a headless browser <topics-headless-browsing>` instead.
|
||||
|
||||
.. _configuration: https://splash.readthedocs.io/en/stable/api.html
|
||||
.. _scripting: https://splash.readthedocs.io/en/stable/scripting-tutorial.html
|
||||
|
||||
.. _topics-headless-browsing:
|
||||
|
||||
Using a headless browser
|
||||
========================
|
||||
|
||||
A `headless browser`_ is a special web browser that provides an API for
|
||||
automation.
|
||||
|
||||
The easiest way to use a headless browser with Scrapy is to use Selenium_,
|
||||
along with `scrapy-selenium`_ for seamless integration.
|
||||
|
||||
|
||||
.. _AJAX: https://en.wikipedia.org/wiki/Ajax_%28programming%29
|
||||
.. _CSS: https://en.wikipedia.org/wiki/Cascading_Style_Sheets
|
||||
.. _curl: https://curl.haxx.se/
|
||||
.. _headless browser: https://en.wikipedia.org/wiki/Headless_browser
|
||||
.. _JavaScript: https://en.wikipedia.org/wiki/JavaScript
|
||||
.. _js2xml: https://github.com/scrapinghub/js2xml
|
||||
.. _json.loads: https://docs.python.org/library/json.html#json.loads
|
||||
.. _pytesseract: https://github.com/madmaze/pytesseract
|
||||
.. _regular expression: https://docs.python.org/library/re.html
|
||||
.. _scrapy-selenium: https://github.com/clemfromspace/scrapy-selenium
|
||||
.. _scrapy-splash: https://github.com/scrapy-plugins/scrapy-splash
|
||||
.. _Selenium: https://www.seleniumhq.org/
|
||||
.. _Splash: https://github.com/scrapinghub/splash
|
||||
.. _tabula-py: https://github.com/chezou/tabula-py
|
||||
.. _wget: https://www.gnu.org/software/wget/
|
||||
.. _wgrep: https://github.com/stav/wgrep
|
@ -238,9 +238,10 @@ scrapy.utils.log module
|
||||
|
||||
.. autofunction:: configure_logging
|
||||
|
||||
``configure_logging`` is automatically called when using Scrapy commands,
|
||||
but needs to be called explicitly when running custom scripts. In that
|
||||
case, its usage is not required but it's recommended.
|
||||
``configure_logging`` is automatically called when using Scrapy commands
|
||||
or :class:`~scrapy.crawler.CrawlerProcess`, but needs to be called explicitly
|
||||
when running custom scripts using :class:`~scrapy.crawler.CrawlerRunner`.
|
||||
In that case, its usage is not required but it's recommended.
|
||||
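For example, a minimal stand-alone script using
:class:`~scrapy.crawler.CrawlerRunner` could call it as follows (the spider is
a placeholder)::

    import scrapy
    from scrapy.crawler import CrawlerRunner
    from scrapy.utils.log import configure_logging
    from twisted.internet import reactor

    class MySpider(scrapy.Spider):
        name = 'example'
        start_urls = ['https://example.com']

        def parse(self, response):
            self.logger.info('Visited %s', response.url)

    configure_logging({'LOG_FORMAT': '%(levelname)s: %(message)s'})
    runner = CrawlerRunner()
    d = runner.crawl(MySpider)
    d.addBoth(lambda _: reactor.stop())
    reactor.run()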
|
||||
If you plan on configuring the handlers yourself, it is still recommended that
you call this function, passing ``install_root_handler=False``. Bear in mind
|
||||
|
@ -897,6 +897,16 @@ Default: ``False``
|
||||
If ``True``, the logs will just contain the root path. If it is set to ``False``
then it displays the component responsible for the log output.
|
||||
|
||||
.. setting:: LOGSTATS_INTERVAL
|
||||
|
||||
LOGSTATS_INTERVAL
|
||||
-----------------
|
||||
|
||||
Default: ``60.0``
|
||||
|
||||
The interval (in seconds) between each logging printout of the stats
|
||||
by :class:`~scrapy.extensions.logstats.LogStats`.
|
||||
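For example, to log the stats every five minutes instead of every minute::

    LOGSTATS_INTERVAL = 300.0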
|
||||
.. setting:: MEMDEBUG_ENABLED
|
||||
|
||||
MEMDEBUG_ENABLED
|
||||
@ -1155,9 +1165,14 @@ Type of in-memory queue used by scheduler. Other available type is:
|
||||
|
||||
SCHEDULER_PRIORITY_QUEUE
|
||||
------------------------
|
||||
Default: ``'queuelib.PriorityQueue'``
|
||||
Default: ``'scrapy.pqueues.ScrapyPriorityQueue'``
|
||||
|
||||
Type of priority queue used by scheduler.
|
||||
Type of priority queue used by the scheduler. Another available type is
|
||||
``scrapy.pqueues.DownloaderAwarePriorityQueue``.
|
||||
``scrapy.pqueues.DownloaderAwarePriorityQueue`` works better than
|
||||
``scrapy.pqueues.ScrapyPriorityQueue`` when you crawl many different
|
||||
domains in parallel. But currently ``scrapy.pqueues.DownloaderAwarePriorityQueue``
|
||||
does not work together with :setting:`CONCURRENT_REQUESTS_PER_IP`.
|
||||
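For example, to switch to the downloader-aware queue for a broad, multi-domain
crawl, set this in your ``settings.py``::

    SCHEDULER_PRIORITY_QUEUE = 'scrapy.pqueues.DownloaderAwarePriorityQueue'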
|
||||
.. setting:: SPIDER_CONTRACTS
|
||||
|
||||
|
@ -82,7 +82,8 @@ object gives you access, for example, to the :ref:`settings <topics-settings>`.
|
||||
|
||||
If it raises an exception, Scrapy won't bother calling any other spider
|
||||
middleware :meth:`process_spider_input` and will call the request
|
||||
errback. The output of the errback is chained back in the other
|
||||
errback if there is one, otherwise it will start the :meth:`process_spider_exception`
|
||||
chain. The output of the errback is chained back in the other
|
||||
direction for :meth:`process_spider_output` to process it, or
|
||||
:meth:`process_spider_exception` if it raised an exception.
|
||||
|
||||
@ -116,8 +117,8 @@ object gives you access, for example, to the :ref:`settings <topics-settings>`.
|
||||
|
||||
.. method:: process_spider_exception(response, exception, spider)
|
||||
|
||||
This method is called when a spider or :meth:`process_spider_input`
|
||||
method (from other spider middleware) raises an exception.
|
||||
This method is called when a spider or :meth:`process_spider_output`
|
||||
method (from a previous spider middleware) raises an exception.
|
||||
|
||||
:meth:`process_spider_exception` should return either ``None`` or an
|
||||
iterable of :class:`~scrapy.http.Request`, dict or
|
||||
@ -129,7 +130,8 @@ object gives you access, for example, to the :ref:`settings <topics-settings>`.
|
||||
exception reaches the engine (where it's logged and discarded).
|
||||
|
||||
If it returns an iterable the :meth:`process_spider_output` pipeline
|
||||
kicks in, and no other :meth:`process_spider_exception` will be called.
|
||||
kicks in, starting from the next spider middleware, and no other
|
||||
:meth:`process_spider_exception` will be called.
|
||||
|
||||
:param response: the response being processed when the exception was
|
||||
raised
|
||||
|
@ -402,10 +402,12 @@ Crawling rules
|
||||
of links extracted from each response using the specified ``link_extractor``.
|
||||
This is mainly used for filtering purposes.
|
||||
|
||||
``process_request`` is a callable, or a string (in which case a method from
|
||||
the spider object with that name will be used) which will be called with
|
||||
every request extracted by this rule, and must return a request or None (to
|
||||
filter out the request).
|
||||
``process_request`` is a callable (or a string, in which case a method from
|
||||
the spider object with that name will be used) which will be called for every
|
||||
:class:`~scrapy.http.Request` extracted by this rule. This callable should
|
||||
take said request as first argument and the :class:`~scrapy.http.Response`
|
||||
from which the request originated as second argument. It must return a
|
||||
``Request`` object or ``None`` (to filter out the request).
|
||||
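For example, a spider method used as ``process_request`` could look like this
(a sketch; the domain and meta key are illustrative)::

    def filter_and_tag(self, request, response):
        # Drop off-site links and record the page where each link was found.
        if 'example.com' not in request.url:
            return None
        request.meta['found_on'] = response.url
        return request

It would then be referenced from a rule as
``Rule(LinkExtractor(), callback='parse_item', process_request='filter_and_tag')``.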
|
||||
CrawlSpider example
|
||||
~~~~~~~~~~~~~~~~~~~
|
||||
@ -655,7 +657,7 @@ SitemapSpider
|
||||
|
||||
.. attribute:: sitemap_follow
|
||||
|
||||
A list of regexes of sitemap that should be followed. This is is only
|
||||
A list of regexes of sitemap that should be followed. This is only
|
||||
for sites that use `Sitemap index files`_ that point to other sitemap
|
||||
files.
|
||||
|
||||
|
@ -75,8 +75,7 @@ available in Scrapy which extend the basic Stats Collector. You can select
|
||||
which Stats Collector to use through the :setting:`STATS_CLASS` setting. The
|
||||
default Stats Collector used is the :class:`MemoryStatsCollector`.
|
||||
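For example, to disable stats collection entirely you can switch to the dummy
collector shipped with Scrapy (a sketch for ``settings.py``)::

    STATS_CLASS = 'scrapy.statscollectors.DummyStatsCollector'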
|
||||
.. module:: scrapy.statscollectors
|
||||
:synopsis: Stats Collectors
|
||||
.. currentmodule:: scrapy.statscollectors
|
||||
|
||||
MemoryStatsCollector
|
||||
--------------------
|
||||
|
@ -1,12 +1,11 @@
|
||||
.. currentmodule:: scrapy.extensions.telnet
|
||||
|
||||
.. _topics-telnetconsole:
|
||||
|
||||
==============
|
||||
Telnet Console
|
||||
==============
|
||||
|
||||
.. module:: scrapy.extensions.telnet
|
||||
:synopsis: The Telnet Console
|
||||
|
||||
Scrapy comes with a built-in telnet console for inspecting and controlling a
running Scrapy process. The telnet console is just a regular Python shell
|
||||
running inside the Scrapy process, so you can do literally anything from it.
|
||||
@ -45,7 +44,7 @@ the console you need to type::
|
||||
>>>
|
||||
|
||||
By default Username is ``scrapy`` and Password is autogenerated. The
|
||||
autogenerated Password can be seen on scrapy logs like the example bellow::
|
||||
autogenerated Password can be seen on scrapy logs like the example below::
|
||||
|
||||
2018-10-16 14:35:21 [scrapy.extensions.telnet] INFO: Telnet Password: 16f92501e8a59326
|
||||
|
||||
|
@ -6,7 +6,7 @@ from unittest import TextTestRunner, TextTestResult as _TextTestResult
|
||||
|
||||
from scrapy.commands import ScrapyCommand
|
||||
from scrapy.contracts import ContractsManager
|
||||
from scrapy.utils.misc import load_object
|
||||
from scrapy.utils.misc import load_object, set_environ
|
||||
from scrapy.utils.conf import build_component_list
|
||||
|
||||
|
||||
@ -68,16 +68,17 @@ class Command(ScrapyCommand):
|
||||
|
||||
spider_loader = self.crawler_process.spider_loader
|
||||
|
||||
for spidername in args or spider_loader.list():
|
||||
spidercls = spider_loader.load(spidername)
|
||||
spidercls.start_requests = lambda s: conman.from_spider(s, result)
|
||||
with set_environ(SCRAPY_CHECK='true'):
|
||||
for spidername in args or spider_loader.list():
|
||||
spidercls = spider_loader.load(spidername)
|
||||
spidercls.start_requests = lambda s: conman.from_spider(s, result)
|
||||
|
||||
tested_methods = conman.tested_methods_from_spidercls(spidercls)
|
||||
if opts.list:
|
||||
for method in tested_methods:
|
||||
contract_reqs[spidercls.name].append(method)
|
||||
elif tested_methods:
|
||||
self.crawler_process.crawl(spidercls)
|
||||
tested_methods = conman.tested_methods_from_spidercls(spidercls)
|
||||
if opts.list:
|
||||
for method in tested_methods:
|
||||
contract_reqs[spidercls.name].append(method)
|
||||
elif tested_methods:
|
||||
self.crawler_process.crawl(spidercls)
|
||||
|
||||
# start checks
|
||||
if opts.list:
|
||||
|
@ -94,7 +94,7 @@ class ContractsManager(object):
|
||||
try:
|
||||
output = cb(response)
|
||||
output = list(iterate_spider_output(output))
|
||||
except:
|
||||
except Exception:
|
||||
case = _create_testcase(method, 'callback')
|
||||
results.addError(case, sys.exc_info())
|
||||
|
||||
|
@ -75,6 +75,8 @@ def _get_concurrency_delay(concurrency, spider, settings):
|
||||
|
||||
class Downloader(object):
|
||||
|
||||
DOWNLOAD_SLOT = 'download_slot'
|
||||
|
||||
def __init__(self, crawler):
|
||||
self.settings = crawler.settings
|
||||
self.signals = crawler.signals
|
||||
@ -111,8 +113,8 @@ class Downloader(object):
|
||||
return key, self.slots[key]
|
||||
|
||||
def _get_slot_key(self, request, spider):
|
||||
if 'download_slot' in request.meta:
|
||||
return request.meta['download_slot']
|
||||
if self.DOWNLOAD_SLOT in request.meta:
|
||||
return request.meta[self.DOWNLOAD_SLOT]
|
||||
|
||||
key = urlparse_cached(request).hostname or ''
|
||||
if self.ip_concurrency:
|
||||
@ -122,7 +124,7 @@ class Downloader(object):
|
||||
|
||||
def _enqueue_request(self, request, spider):
|
||||
key, slot = self._get_slot(request, spider)
|
||||
request.meta['download_slot'] = key
|
||||
request.meta[self.DOWNLOAD_SLOT] = key
|
||||
|
||||
def _deactivate(response):
|
||||
slot.active.remove(request)
|
||||
|
@ -7,6 +7,7 @@ import six
|
||||
|
||||
from twisted.internet import defer
|
||||
|
||||
from scrapy.exceptions import _InvalidOutput
|
||||
from scrapy.http import Request, Response
|
||||
from scrapy.middleware import MiddlewareManager
|
||||
from scrapy.utils.defer import mustbe_deferred
|
||||
@ -35,12 +36,12 @@ class DownloaderMiddlewareManager(MiddlewareManager):
|
||||
def process_request(request):
|
||||
for method in self.methods['process_request']:
|
||||
response = yield method(request=request, spider=spider)
|
||||
assert response is None or isinstance(response, (Response, Request)), \
|
||||
'Middleware %s.process_request must return None, Response or Request, got %s' % \
|
||||
(six.get_method_self(method).__class__.__name__, response.__class__.__name__)
|
||||
if response is not None and not isinstance(response, (Response, Request)):
|
||||
raise _InvalidOutput('Middleware %s.process_request must return None, Response or Request, got %s' % \
|
||||
(six.get_method_self(method).__class__.__name__, response.__class__.__name__))
|
||||
if response:
|
||||
defer.returnValue(response)
|
||||
defer.returnValue((yield download_func(request=request,spider=spider)))
|
||||
defer.returnValue((yield download_func(request=request, spider=spider)))
|
||||
|
||||
@defer.inlineCallbacks
|
||||
def process_response(response):
|
||||
@ -49,11 +50,10 @@ class DownloaderMiddlewareManager(MiddlewareManager):
|
||||
defer.returnValue(response)
|
||||
|
||||
for method in self.methods['process_response']:
|
||||
response = yield method(request=request, response=response,
|
||||
spider=spider)
|
||||
assert isinstance(response, (Response, Request)), \
|
||||
'Middleware %s.process_response must return Response or Request, got %s' % \
|
||||
(six.get_method_self(method).__class__.__name__, type(response))
|
||||
response = yield method(request=request, response=response, spider=spider)
|
||||
if not isinstance(response, (Response, Request)):
|
||||
raise _InvalidOutput('Middleware %s.process_response must return Response or Request, got %s' % \
|
||||
(six.get_method_self(method).__class__.__name__, type(response)))
|
||||
if isinstance(response, Request):
|
||||
defer.returnValue(response)
|
||||
defer.returnValue(response)
|
||||
@ -62,11 +62,10 @@ class DownloaderMiddlewareManager(MiddlewareManager):
|
||||
def process_exception(_failure):
|
||||
exception = _failure.value
|
||||
for method in self.methods['process_exception']:
|
||||
response = yield method(request=request, exception=exception,
|
||||
spider=spider)
|
||||
assert response is None or isinstance(response, (Response, Request)), \
|
||||
'Middleware %s.process_exception must return None, Response or Request, got %s' % \
|
||||
(six.get_method_self(method).__class__.__name__, type(response))
|
||||
response = yield method(request=request, exception=exception, spider=spider)
|
||||
if response is not None and not isinstance(response, (Response, Request)):
|
||||
raise _InvalidOutput('Middleware %s.process_exception must return None, Response or Request, got %s' % \
|
||||
(six.get_method_self(method).__class__.__name__, type(response)))
|
||||
if response:
|
||||
defer.returnValue(response)
|
||||
defer.returnValue(_failure)
|
||||
|
@ -1,19 +1,46 @@
|
||||
import os
|
||||
import json
|
||||
import logging
|
||||
import warnings
|
||||
from os.path import join, exists
|
||||
|
||||
from scrapy.utils.reqser import request_to_dict, request_from_dict
|
||||
from queuelib import PriorityQueue
|
||||
|
||||
from scrapy.utils.misc import load_object, create_instance
|
||||
from scrapy.utils.job import job_dir
|
||||
from scrapy.utils.deprecate import ScrapyDeprecationWarning
|
||||
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class Scheduler(object):
|
||||
"""
|
||||
Scrapy Scheduler. It allows enqueuing requests and then getting
the next request to download. The Scheduler also handles duplicate
filtering, via the dupefilter.

Prioritization and queueing are not performed by the Scheduler.
The user sets the ``priority`` field for each Request, and a PriorityQueue
(defined by :setting:`SCHEDULER_PRIORITY_QUEUE`) uses these priorities
to dequeue requests in the desired order.

The Scheduler uses two PriorityQueue instances, configured to work in memory
and on disk (optional). When the on-disk queue is present, it is used by
default, and the in-memory queue is used as a fallback for cases where
the disk queue can't handle a request (can't serialize it).

:setting:`SCHEDULER_MEMORY_QUEUE` and
:setting:`SCHEDULER_DISK_QUEUE` allow specifying the lower-level queue classes
that PriorityQueue instances are instantiated with, to keep requests
on disk and in memory respectively.

Overall, the Scheduler is an object which holds several PriorityQueue instances
(in-memory and on-disk) and implements fallback logic for them.
It also handles the dupefilter.
|
||||
"""
|
||||
def __init__(self, dupefilter, jobdir=None, dqclass=None, mqclass=None,
|
||||
logunser=False, stats=None, pqclass=None):
|
||||
logunser=False, stats=None, pqclass=None, crawler=None):
|
||||
self.df = dupefilter
|
||||
self.dqdir = self._dqdir(jobdir)
|
||||
self.pqclass = pqclass
|
||||
@ -21,6 +48,7 @@ class Scheduler(object):
|
||||
self.mqclass = mqclass
|
||||
self.logunser = logunser
|
||||
self.stats = stats
|
||||
self.crawler = crawler
|
||||
|
||||
@classmethod
|
||||
def from_crawler(cls, crawler):
|
||||
@ -28,26 +56,35 @@ class Scheduler(object):
|
||||
dupefilter_cls = load_object(settings['DUPEFILTER_CLASS'])
|
||||
dupefilter = create_instance(dupefilter_cls, settings, crawler)
|
||||
pqclass = load_object(settings['SCHEDULER_PRIORITY_QUEUE'])
|
||||
if pqclass is PriorityQueue:
|
||||
warnings.warn("SCHEDULER_PRIORITY_QUEUE='queuelib.PriorityQueue'"
|
||||
" is no longer supported because of API changes; "
|
||||
"please use 'scrapy.pqueues.ScrapyPriorityQueue'",
|
||||
ScrapyDeprecationWarning)
|
||||
from scrapy.pqueues import ScrapyPriorityQueue
|
||||
pqclass = ScrapyPriorityQueue
|
||||
|
||||
dqclass = load_object(settings['SCHEDULER_DISK_QUEUE'])
|
||||
mqclass = load_object(settings['SCHEDULER_MEMORY_QUEUE'])
|
||||
logunser = settings.getbool('LOG_UNSERIALIZABLE_REQUESTS', settings.getbool('SCHEDULER_DEBUG'))
|
||||
logunser = settings.getbool('LOG_UNSERIALIZABLE_REQUESTS',
|
||||
settings.getbool('SCHEDULER_DEBUG'))
|
||||
return cls(dupefilter, jobdir=job_dir(settings), logunser=logunser,
|
||||
stats=crawler.stats, pqclass=pqclass, dqclass=dqclass, mqclass=mqclass)
|
||||
stats=crawler.stats, pqclass=pqclass, dqclass=dqclass,
|
||||
mqclass=mqclass, crawler=crawler)
|
||||
|
||||
def has_pending_requests(self):
|
||||
return len(self) > 0
|
||||
|
||||
def open(self, spider):
|
||||
self.spider = spider
|
||||
self.mqs = self.pqclass(self._newmq)
|
||||
self.mqs = self._mq()
|
||||
self.dqs = self._dq() if self.dqdir else None
|
||||
return self.df.open()
|
||||
|
||||
def close(self, reason):
|
||||
if self.dqs:
|
||||
prios = self.dqs.close()
|
||||
with open(join(self.dqdir, 'active.json'), 'w') as f:
|
||||
json.dump(prios, f)
|
||||
state = self.dqs.close()
|
||||
self._write_dqs_state(self.dqdir, state)
|
||||
return self.df.close(reason)
|
||||
|
||||
def enqueue_request(self, request):
|
||||
@ -82,8 +119,7 @@ class Scheduler(object):
|
||||
if self.dqs is None:
|
||||
return
|
||||
try:
|
||||
reqd = request_to_dict(request, self.spider)
|
||||
self.dqs.push(reqd, -request.priority)
|
||||
self.dqs.push(request, -request.priority)
|
||||
except ValueError as e: # non serializable request
|
||||
if self.logunser:
|
||||
msg = ("Unable to serialize request: %(request)s - reason:"
|
||||
@ -103,32 +139,51 @@ class Scheduler(object):
|
||||
|
||||
def _dqpop(self):
|
||||
if self.dqs:
|
||||
d = self.dqs.pop()
|
||||
if d:
|
||||
return request_from_dict(d, self.spider)
|
||||
return self.dqs.pop()
|
||||
|
||||
def _newmq(self, priority):
|
||||
""" Factory for creating memory queues. """
|
||||
return self.mqclass()
|
||||
|
||||
def _newdq(self, priority):
|
||||
return self.dqclass(join(self.dqdir, 'p%s' % priority))
|
||||
""" Factory for creating disk queues. """
|
||||
path = join(self.dqdir, 'p%s' % (priority, ))
|
||||
return self.dqclass(path)
|
||||
|
||||
def _mq(self):
|
||||
""" Create a new priority queue instance, with in-memory storage """
|
||||
return create_instance(self.pqclass, None, self.crawler, self._newmq,
|
||||
serialize=False)
|
||||
|
||||
def _dq(self):
|
||||
activef = join(self.dqdir, 'active.json')
|
||||
if exists(activef):
|
||||
with open(activef) as f:
|
||||
prios = json.load(f)
|
||||
else:
|
||||
prios = ()
|
||||
q = self.pqclass(self._newdq, startprios=prios)
|
||||
""" Create a new priority queue instance, with disk storage """
|
||||
state = self._read_dqs_state(self.dqdir)
|
||||
q = create_instance(self.pqclass,
|
||||
None,
|
||||
self.crawler,
|
||||
self._newdq,
|
||||
state,
|
||||
serialize=True)
|
||||
if q:
|
||||
logger.info("Resuming crawl (%(queuesize)d requests scheduled)",
|
||||
{'queuesize': len(q)}, extra={'spider': self.spider})
|
||||
return q
|
||||
|
||||
def _dqdir(self, jobdir):
|
||||
""" Return a folder name to keep disk queue state at """
|
||||
if jobdir:
|
||||
dqdir = join(jobdir, 'requests.queue')
|
||||
if not exists(dqdir):
|
||||
os.makedirs(dqdir)
|
||||
return dqdir
|
||||
|
||||
def _read_dqs_state(self, dqdir):
|
||||
path = join(dqdir, 'active.json')
|
||||
if not exists(path):
|
||||
return ()
|
||||
with open(path) as f:
|
||||
return json.load(f)
|
||||
|
||||
def _write_dqs_state(self, dqdir, state):
|
||||
with open(join(dqdir, 'active.json'), 'w') as f:
|
||||
json.dump(state, f)
|
||||
|
@ -135,7 +135,6 @@ class Scraper(object):
|
||||
return self.spidermw.scrape_response(
|
||||
self.call_spider, request_result, request, spider)
|
||||
else:
|
||||
# FIXME: don't ignore errors in spider middleware
|
||||
dfd = self.call_spider(request_result, request, spider)
|
||||
return dfd.addErrback(
|
||||
self._log_download_errors, request_result, request, spider)
|
||||
|
@ -3,15 +3,21 @@ Spider Middleware manager
|
||||
|
||||
See documentation in docs/topics/spider-middleware.rst
|
||||
"""
|
||||
from itertools import chain, islice
|
||||
|
||||
import six
|
||||
from twisted.python.failure import Failure
|
||||
from scrapy.exceptions import _InvalidOutput
|
||||
from scrapy.middleware import MiddlewareManager
|
||||
from scrapy.utils.defer import mustbe_deferred
|
||||
from scrapy.utils.conf import build_component_list
|
||||
from scrapy.utils.python import MutableChain
|
||||
|
||||
|
||||
def _isiterable(possible_iterator):
|
||||
return hasattr(possible_iterator, '__iter__')
|
||||
|
||||
|
||||
class SpiderMiddlewareManager(MiddlewareManager):
|
||||
|
||||
component_name = 'spider middleware'
|
||||
@ -24,12 +30,10 @@ class SpiderMiddlewareManager(MiddlewareManager):
|
||||
super(SpiderMiddlewareManager, self)._add_middleware(mw)
|
||||
if hasattr(mw, 'process_spider_input'):
|
||||
self.methods['process_spider_input'].append(mw.process_spider_input)
|
||||
if hasattr(mw, 'process_spider_output'):
|
||||
self.methods['process_spider_output'].appendleft(mw.process_spider_output)
|
||||
if hasattr(mw, 'process_spider_exception'):
|
||||
self.methods['process_spider_exception'].appendleft(mw.process_spider_exception)
|
||||
if hasattr(mw, 'process_start_requests'):
|
||||
self.methods['process_start_requests'].appendleft(mw.process_start_requests)
|
||||
self.methods['process_spider_output'].appendleft(getattr(mw, 'process_spider_output', None))
|
||||
self.methods['process_spider_exception'].appendleft(getattr(mw, 'process_spider_exception', None))
|
||||
|
||||
def scrape_response(self, scrape_func, response, request, spider):
|
||||
fname = lambda f:'%s.%s' % (
|
||||
@ -40,36 +44,73 @@ class SpiderMiddlewareManager(MiddlewareManager):
|
||||
for method in self.methods['process_spider_input']:
|
||||
try:
|
||||
result = method(response=response, spider=spider)
|
||||
assert result is None, \
|
||||
'Middleware %s must returns None or ' \
|
||||
'raise an exception, got %s ' \
|
||||
% (fname(method), type(result))
|
||||
except:
|
||||
if result is not None:
|
||||
raise _InvalidOutput('Middleware {} must return None or raise an exception, got {}' \
|
||||
.format(fname(method), type(result)))
|
||||
except _InvalidOutput:
|
||||
raise
|
||||
except Exception:
|
||||
return scrape_func(Failure(), request, spider)
|
||||
return scrape_func(response, request, spider)
|
||||
|
||||
def process_spider_exception(_failure):
|
||||
def process_spider_exception(_failure, start_index=0):
|
||||
exception = _failure.value
|
||||
for method in self.methods['process_spider_exception']:
|
||||
# don't handle _InvalidOutput exception
|
||||
if isinstance(exception, _InvalidOutput):
|
||||
return _failure
|
||||
method_list = islice(self.methods['process_spider_exception'], start_index, None)
|
||||
for method_index, method in enumerate(method_list, start=start_index):
|
||||
if method is None:
|
||||
continue
|
||||
result = method(response=response, exception=exception, spider=spider)
|
||||
assert result is None or _isiterable(result), \
|
||||
'Middleware %s must returns None, or an iterable object, got %s ' % \
|
||||
(fname(method), type(result))
|
||||
if result is not None:
|
||||
return result
|
||||
if _isiterable(result):
|
||||
# stop exception handling by handing control over to the
|
||||
# process_spider_output chain if an iterable has been returned
|
||||
return process_spider_output(result, method_index+1)
|
||||
elif result is None:
|
||||
continue
|
||||
else:
|
||||
raise _InvalidOutput('Middleware {} must return None or an iterable, got {}' \
|
||||
.format(fname(method), type(result)))
|
||||
return _failure
|
||||
|
||||
def process_spider_output(result):
|
||||
for method in self.methods['process_spider_output']:
|
||||
result = method(response=response, result=result, spider=spider)
|
||||
assert _isiterable(result), \
|
||||
'Middleware %s must returns an iterable object, got %s ' % \
|
||||
(fname(method), type(result))
|
||||
return result
|
||||
def process_spider_output(result, start_index=0):
|
||||
# items in this iterable do not need to go through the process_spider_output
|
||||
# chain, they went through it already from the process_spider_exception method
|
||||
recovered = MutableChain()
|
||||
|
||||
def evaluate_iterable(iterable, index):
|
||||
try:
|
||||
for r in iterable:
|
||||
yield r
|
||||
except Exception as ex:
|
||||
exception_result = process_spider_exception(Failure(ex), index+1)
|
||||
if isinstance(exception_result, Failure):
|
||||
raise
|
||||
recovered.extend(exception_result)
|
||||
|
||||
method_list = islice(self.methods['process_spider_output'], start_index, None)
|
||||
for method_index, method in enumerate(method_list, start=start_index):
|
||||
if method is None:
|
||||
continue
|
||||
# the following might fail directly if the output value is not a generator
|
||||
try:
|
||||
result = method(response=response, result=result, spider=spider)
|
||||
except Exception as ex:
|
||||
exception_result = process_spider_exception(Failure(ex), method_index+1)
|
||||
if isinstance(exception_result, Failure):
|
||||
raise
|
||||
return exception_result
|
||||
if _isiterable(result):
|
||||
result = evaluate_iterable(result, method_index)
|
||||
else:
|
||||
raise _InvalidOutput('Middleware {} must return an iterable, got {}' \
|
||||
.format(fname(method), type(result)))
|
||||
|
||||
return chain(result, recovered)
|
||||
|
||||
dfd = mustbe_deferred(process_spider_input, response)
|
||||
dfd.addErrback(process_spider_exception)
|
||||
dfd.addCallback(process_spider_output)
|
||||
dfd.addCallbacks(callback=process_spider_output, errback=process_spider_exception)
|
||||
return dfd
|
||||
|
||||
def process_start_requests(self, start_requests, spider):
|
||||
|
@ -111,6 +111,8 @@ class Crawler(object):
|
||||
|
||||
@defer.inlineCallbacks
|
||||
def stop(self):
|
||||
"""Starts a graceful stop of the crawler and returns a deferred that is
|
||||
fired when the crawler is stopped."""
|
||||
if self.crawling:
|
||||
self.crawling = False
|
||||
yield defer.maybeDeferred(self.engine.stop)
|
||||
|
@ -88,6 +88,7 @@ class MetaRefreshMiddleware(BaseRedirectMiddleware):
|
||||
|
||||
def __init__(self, settings):
|
||||
super(MetaRefreshMiddleware, self).__init__(settings)
|
||||
self._ignore_tags = settings.getlist('METAREFRESH_IGNORE_TAGS')
|
||||
self._maxdelay = settings.getint('REDIRECT_MAX_METAREFRESH_DELAY',
|
||||
settings.getint('METAREFRESH_MAXDELAY'))
|
||||
|
||||
@ -96,7 +97,8 @@ class MetaRefreshMiddleware(BaseRedirectMiddleware):
|
||||
not isinstance(response, HtmlResponse):
|
||||
return response
|
||||
|
||||
interval, url = get_meta_refresh(response)
|
||||
interval, url = get_meta_refresh(response,
|
||||
ignore_tags=self._ignore_tags)
|
||||
if url and interval < self._maxdelay:
|
||||
redirected = self._redirect_request_using_get(request, url)
|
||||
return self._redirect(redirected, request, spider, 'meta refresh')
|
||||
|
@ -11,6 +11,13 @@ class NotConfigured(Exception):
|
||||
"""Indicates a missing configuration situation"""
|
||||
pass
|
||||
|
||||
class _InvalidOutput(TypeError):
|
||||
"""
|
||||
Indicates an invalid value has been returned by a middleware's processing method.
|
||||
Internal and undocumented, it should not be raised or caught by user code.
|
||||
"""
|
||||
pass
|
||||
|
||||
# HTTP and crawling
|
||||
|
||||
class IgnoreRequest(Exception):
|
||||
|
@ -24,7 +24,11 @@ class CoreStats(object):
|
||||
self.stats.set_value('start_time', datetime.datetime.utcnow(), spider=spider)
|
||||
|
||||
def spider_closed(self, spider, reason):
|
||||
self.stats.set_value('finish_time', datetime.datetime.utcnow(), spider=spider)
|
||||
finish_time = datetime.datetime.utcnow()
|
||||
elapsed_time = finish_time - self.stats.get_value('start_time')
|
||||
elapsed_time_seconds = elapsed_time.total_seconds()
|
||||
self.stats.set_value('elapsed_time_seconds', elapsed_time_seconds, spider=spider)
|
||||
self.stats.set_value('finish_time', finish_time, spider=spider)
|
||||
self.stats.set_value('finish_reason', reason, spider=spider)
|
||||
|
||||
def item_scraped(self, item, spider):
|
||||
|
@ -31,7 +31,7 @@ class DummyPolicy(object):
|
||||
def should_cache_response(self, response, request):
|
||||
return response.status not in self.ignore_http_codes
|
||||
|
||||
def is_cached_response_fresh(self, response, request):
|
||||
def is_cached_response_fresh(self, cachedresponse, request):
|
||||
return True
|
||||
|
||||
def is_cached_response_valid(self, cachedresponse, response, request):
|
||||
@ -70,7 +70,7 @@ class RFC2616Policy(object):
|
||||
return True
|
||||
|
||||
def should_cache_response(self, response, request):
|
||||
# What is cacheable - https://www.w3.org/Protocols/rfc2616/rfc2616-sec13.html#sec14.9.1
|
||||
# What is cacheable - https://www.w3.org/Protocols/rfc2616/rfc2616-sec14.html#sec14.9.1
|
||||
# Response cacheability - https://www.w3.org/Protocols/rfc2616/rfc2616-sec13.html#sec13.4
|
||||
# Status code 206 is not included because cache can not deal with partial contents
|
||||
cc = self._parse_cachecontrol(response)
|
||||
|
@ -35,6 +35,10 @@ class ItemLoader(object):
|
||||
self.parent = parent
|
||||
self._local_item = context['item'] = item
|
||||
self._local_values = defaultdict(list)
|
||||
# Preprocess values if item built from dict
|
||||
# Values need to be added to item._values if they come from the item dict (not from add_value calls)
|
||||
for field_name, value in item.items():
|
||||
self._values[field_name] = self._process_input_value(field_name, value)
|
||||
|
||||
@property
|
||||
def _values(self):
|
||||
|
@ -3,7 +3,7 @@ from __future__ import print_function
|
||||
import functools
|
||||
import logging
|
||||
from collections import defaultdict
|
||||
from twisted.internet.defer import Deferred, DeferredList
|
||||
from twisted.internet.defer import Deferred, DeferredList, _DefGen_Return
|
||||
from twisted.python.failure import Failure
|
||||
|
||||
from scrapy.settings import Settings
|
||||
@ -139,6 +139,30 @@ class MediaPipeline(object):
|
||||
result.cleanFailure()
|
||||
result.frames = []
|
||||
result.stack = None
|
||||
|
||||
# This code fixes a memory leak by avoiding keeping references to
|
||||
# the Request and Response objects on the Media Pipeline cache.
|
||||
#
|
||||
# Twisted inline callbacks pass return values using the function
|
||||
# twisted.internet.defer.returnValue, which encapsulates the return
|
||||
# value inside a _DefGen_Return base exception.
|
||||
#
|
||||
# What happens when the media_downloaded callback raises another
|
||||
# exception, for example a FileException('download-error') when
|
||||
# the Response status code is not 200 OK, is that it stores the
|
||||
# _DefGen_Return exception on the FileException context.
|
||||
#
|
||||
# To avoid keeping references to the Response and therefore Request
|
||||
# objects on the Media Pipeline cache, we should wipe the context of
|
||||
# the exception encapsulated by the Twisted Failure when it's a
|
||||
# _DefGen_Return instance.
|
||||
#
|
||||
# This problem does not occur in Python 2.7 since we don't have
|
||||
# Exception Chaining (https://www.python.org/dev/peps/pep-3134/).
|
||||
context = getattr(result.value, '__context__', None)
|
||||
if isinstance(context, _DefGen_Return):
|
||||
setattr(result.value, '__context__', None)
|
||||
|
||||
info.downloading.remove(fp)
|
||||
info.downloaded[fp] = result # cache result
|
||||
for wad in info.waiting.pop(fp):
|
||||
|
193
scrapy/pqueues.py
Normal file
@ -0,0 +1,193 @@
|
||||
import hashlib
|
||||
import logging
|
||||
from collections import namedtuple
|
||||
|
||||
from queuelib import PriorityQueue
|
||||
|
||||
from scrapy.utils.reqser import request_to_dict, request_from_dict
|
||||
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
def _path_safe(text):
|
||||
"""
|
||||
Return a filesystem-safe version of a string ``text``
|
||||
|
||||
>>> _path_safe('simple.org').startswith('simple.org')
|
||||
True
|
||||
>>> _path_safe('dash-underscore_.org').startswith('dash-underscore_.org')
|
||||
True
|
||||
>>> _path_safe('some@symbol?').startswith('some_symbol_')
|
||||
True
|
||||
"""
|
||||
pathable_slot = "".join([c if c.isalnum() or c in '-._' else '_'
|
||||
for c in text])
|
||||
# as we replace some letters we can get collisions for different slots,
# so we add a unique part
|
||||
unique_slot = hashlib.md5(text.encode('utf8')).hexdigest()
|
||||
return '-'.join([pathable_slot, unique_slot])
|
||||
|
||||
|
||||
class _Priority(namedtuple("_Priority", ["priority", "slot"])):
|
||||
""" Slot-specific priority. It is a hack - ``(priority, slot)`` tuple
|
||||
which can be used instead of int priorities in queues:
|
||||
|
||||
* they are ordered in the same way - order is still by priority value,
|
||||
min(prios) works;
|
||||
* str(p) representation is guaranteed to be different when slots
|
||||
are different - this is important because str(p) is used to create
|
||||
queue files on disk;
|
||||
* they have readable str(p) representation which is safe
|
||||
to use as a file name.
|
||||
"""
|
||||
__slots__ = ()
|
||||
|
||||
def __str__(self):
|
||||
return '%s_%s' % (self.priority, _path_safe(str(self.slot)))
|
||||
|
||||
|
||||
class _SlotPriorityQueues(object):
|
||||
""" Container for multiple priority queues. """
|
||||
def __init__(self, pqfactory, slot_startprios=None):
|
||||
"""
|
||||
``pqfactory`` is a factory for creating new PriorityQueues.
|
||||
It must be a function which accepts a single optional ``startprios``
|
||||
argument, with a list of priorities to create queues for.
|
||||
|
||||
``slot_startprios`` is a ``{slot: startprios}`` dict.
|
||||
"""
|
||||
self.pqfactory = pqfactory
|
||||
self.pqueues = {} # slot -> priority queue
|
||||
for slot, startprios in (slot_startprios or {}).items():
|
||||
self.pqueues[slot] = self.pqfactory(startprios)
|
||||
|
||||
def pop_slot(self, slot):
|
||||
""" Pop an object from a priority queue for this slot """
|
||||
queue = self.pqueues[slot]
|
||||
request = queue.pop()
|
||||
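# forget queues that become empty, so that only slots with pending
# requests are kept in self.pqueues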
if len(queue) == 0:
|
||||
del self.pqueues[slot]
|
||||
return request
|
||||
|
||||
def push_slot(self, slot, obj, priority):
|
||||
""" Push an object to a priority queue for this slot """
|
||||
if slot not in self.pqueues:
|
||||
self.pqueues[slot] = self.pqfactory()
|
||||
queue = self.pqueues[slot]
|
||||
queue.push(obj, priority)
|
||||
|
||||
def close(self):
|
||||
active = {slot: queue.close()
|
||||
for slot, queue in self.pqueues.items()}
|
||||
self.pqueues.clear()
|
||||
return active
|
||||
|
||||
def __len__(self):
|
||||
return sum(len(x) for x in self.pqueues.values()) if self.pqueues else 0
|
||||
|
||||
def __contains__(self, slot):
|
||||
return slot in self.pqueues
|
||||
|
||||
|
||||
class ScrapyPriorityQueue(PriorityQueue):
|
||||
"""
|
||||
PriorityQueue which works with scrapy.Request instances and
|
||||
can optionally convert them to/from dicts before/after putting to a queue.
|
||||
"""
|
||||
def __init__(self, crawler, qfactory, startprios=(), serialize=False):
|
||||
super(ScrapyPriorityQueue, self).__init__(qfactory, startprios)
|
||||
self.serialize = serialize
|
||||
self.spider = crawler.spider
|
||||
|
||||
@classmethod
|
||||
def from_crawler(cls, crawler, qfactory, startprios=(), serialize=False):
|
||||
return cls(crawler, qfactory, startprios, serialize)
|
||||
|
||||
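# When ``serialize`` is True, requests are converted to plain dicts on push
# and rebuilt on pop, so pickle/marshal-based disk queues can store them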
def push(self, request, priority=0):
|
||||
if self.serialize:
|
||||
request = request_to_dict(request, self.spider)
|
||||
super(ScrapyPriorityQueue, self).push(request, priority)
|
||||
|
||||
def pop(self):
|
||||
request = super(ScrapyPriorityQueue, self).pop()
|
||||
if request and self.serialize:
|
||||
request = request_from_dict(request, self.spider)
|
||||
return request
|
||||
|
||||
|
||||
class DownloaderInterface(object):
|
||||
|
||||
def __init__(self, crawler):
|
||||
self.downloader = crawler.engine.downloader
|
||||
|
||||
def stats(self, possible_slots):
|
||||
return [(self._active_downloads(slot), slot)
|
||||
for slot in possible_slots]
|
||||
|
||||
def get_slot_key(self, request):
|
||||
return self.downloader._get_slot_key(request, None)
|
||||
|
||||
def _active_downloads(self, slot):
|
||||
""" Return a number of requests in a Downloader for a given slot """
|
||||
if slot not in self.downloader.slots:
|
||||
return 0
|
||||
return len(self.downloader.slots[slot].active)
|
||||
|
||||
|
||||
class DownloaderAwarePriorityQueue(object):
|
||||
""" PriorityQueue which takes Downlaoder activity in account:
|
||||
domains (slots) with the least amount of active downloads are dequeued
|
||||
first.
|
||||
"""
|
||||
|
||||
@classmethod
|
||||
def from_crawler(cls, crawler, qfactory, slot_startprios=None, serialize=False):
|
||||
return cls(crawler, qfactory, slot_startprios, serialize)
|
||||
|
||||
def __init__(self, crawler, qfactory, slot_startprios=None, serialize=False):
|
||||
if crawler.settings.getint('CONCURRENT_REQUESTS_PER_IP') != 0:
|
||||
raise ValueError('"%s" does not support CONCURRENT_REQUESTS_PER_IP'
|
||||
% (self.__class__,))
|
||||
|
||||
if slot_startprios and not isinstance(slot_startprios, dict):
|
||||
raise ValueError("DownloaderAwarePriorityQueue accepts "
|
||||
"``slot_startprios`` as a dict; %r instance "
|
||||
"is passed. Most likely, it means the state is"
|
||||
"created by an incompatible priority queue. "
|
||||
"Only a crawl started with the same priority "
|
||||
"queue class can be resumed." %
|
||||
slot_startprios.__class__)
|
||||
|
||||
slot_startprios = {
|
||||
slot: [_Priority(p, slot) for p in startprios]
|
||||
for slot, startprios in (slot_startprios or {}).items()}
|
||||
|
||||
def pqfactory(startprios=()):
|
||||
return ScrapyPriorityQueue(crawler, qfactory, startprios, serialize)
|
||||
self._slot_pqueues = _SlotPriorityQueues(pqfactory, slot_startprios)
|
||||
self.serialize = serialize
|
||||
self._downloader_interface = DownloaderInterface(crawler)
|
||||
|
||||
def pop(self):
|
||||
stats = self._downloader_interface.stats(self._slot_pqueues.pqueues)
|
||||
|
||||
if not stats:
|
||||
return
|
||||
|
||||
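# stats is a list of (active_request_count, slot) tuples, so min()
# selects the slot with the fewest requests currently downloading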
slot = min(stats)[1]
|
||||
request = self._slot_pqueues.pop_slot(slot)
|
||||
return request
|
||||
|
||||
def push(self, request, priority):
|
||||
slot = self._downloader_interface.get_slot_key(request)
|
||||
priority_slot = _Priority(priority=priority, slot=slot)
|
||||
self._slot_pqueues.push_slot(slot, request, priority_slot)
|
||||
|
||||
def close(self):
|
||||
active = self._slot_pqueues.close()
|
||||
return {slot: [p.priority for p in startprios]
|
||||
for slot, startprios in active.items()}
|
||||
|
||||
def __len__(self):
|
||||
return len(self._slot_pqueues)
|
@ -221,6 +221,7 @@ MEMUSAGE_NOTIFY_MAIL = []
|
||||
MEMUSAGE_WARNING_MB = 0
|
||||
|
||||
METAREFRESH_ENABLED = True
|
||||
METAREFRESH_IGNORE_TAGS = ['script', 'noscript']
|
||||
METAREFRESH_MAXDELAY = 100
|
||||
|
||||
NEWSPIDER_MODULE = ''
|
||||
@ -238,7 +239,7 @@ REFERRER_POLICY = 'scrapy.spidermiddlewares.referer.DefaultReferrerPolicy'
|
||||
|
||||
RETRY_ENABLED = True
|
||||
RETRY_TIMES = 2 # initial response + 2 retries = 3 requests
|
||||
RETRY_HTTP_CODES = [500, 502, 503, 504, 522, 524, 408]
|
||||
RETRY_HTTP_CODES = [500, 502, 503, 504, 522, 524, 408, 429]
|
||||
RETRY_PRIORITY_ADJUST = -1
|
||||
|
||||
ROBOTSTXT_OBEY = False
|
||||
@ -246,7 +247,7 @@ ROBOTSTXT_OBEY = False
|
||||
SCHEDULER = 'scrapy.core.scheduler.Scheduler'
|
||||
SCHEDULER_DISK_QUEUE = 'scrapy.squeues.PickleLifoDiskQueue'
|
||||
SCHEDULER_MEMORY_QUEUE = 'scrapy.squeues.LifoMemoryQueue'
|
||||
SCHEDULER_PRIORITY_QUEUE = 'queuelib.PriorityQueue'
|
||||
SCHEDULER_PRIORITY_QUEUE = 'scrapy.pqueues.ScrapyPriorityQueue'
|
||||
|
||||
SPIDER_LOADER_CLASS = 'scrapy.spiderloader.SpiderLoader'
|
||||
SPIDER_LOADER_WARN_ONLY = False
|
||||
|
@ -6,29 +6,55 @@ See documentation in docs/topics/spiders.rst
|
||||
"""
|
||||
|
||||
import copy
|
||||
import warnings
|
||||
|
||||
import six
|
||||
|
||||
from scrapy.exceptions import ScrapyDeprecationWarning
|
||||
from scrapy.http import Request, HtmlResponse
|
||||
from scrapy.utils.spider import iterate_spider_output
|
||||
from scrapy.utils.python import get_func_args
|
||||
from scrapy.spiders import Spider
|
||||
|
||||
|
||||
def identity(x):
|
||||
return x
|
||||
def _identity(request, response):
|
||||
return request
|
||||
|
||||
|
||||
def _get_method(method, spider):
|
||||
if callable(method):
|
||||
return method
|
||||
elif isinstance(method, six.string_types):
|
||||
return getattr(spider, method, None)
|
||||
|
||||
|
||||
class Rule(object):
|
||||
|
||||
def __init__(self, link_extractor, callback=None, cb_kwargs=None, follow=None, process_links=None, process_request=identity):
|
||||
def __init__(self, link_extractor, callback=None, cb_kwargs=None, follow=None, process_links=None, process_request=None):
|
||||
self.link_extractor = link_extractor
|
||||
self.callback = callback
|
||||
self.cb_kwargs = cb_kwargs or {}
|
||||
self.process_links = process_links
|
||||
self.process_request = process_request
|
||||
if follow is None:
|
||||
self.follow = False if callback else True
|
||||
else:
|
||||
self.follow = follow
|
||||
self.process_request = process_request or _identity
|
||||
self.process_request_argcount = None
|
||||
self.follow = follow if follow is not None else not callback
|
||||
|
||||
def _compile(self, spider):
|
||||
self.callback = _get_method(self.callback, spider)
|
||||
self.process_links = _get_method(self.process_links, spider)
|
||||
self.process_request = _get_method(self.process_request, spider)
|
||||
self.process_request_argcount = len(get_func_args(self.process_request))
|
||||
if self.process_request_argcount == 1:
|
||||
msg = 'Rule.process_request should accept two arguments (request, response), accepting only one is deprecated'
|
||||
warnings.warn(msg, category=ScrapyDeprecationWarning, stacklevel=2)
|
||||
|
||||
def _process_request(self, request, response):
|
||||
"""
|
||||
Wrapper around the request processing function to maintain backward
|
||||
compatibility with functions that do not take a Response object
|
||||
"""
|
||||
args = [request] if self.process_request_argcount == 1 else [request, response]
|
||||
return self.process_request(*args)
|
||||
|
||||
|
||||
class CrawlSpider(Spider):
|
||||
@ -64,8 +90,8 @@ class CrawlSpider(Spider):
|
||||
links = rule.process_links(links)
|
||||
for link in links:
|
||||
seen.add(link)
|
||||
r = self._build_request(n, link)
|
||||
yield rule.process_request(r)
|
||||
request = self._build_request(n, link)
|
||||
yield rule._process_request(request, response)
|
||||
|
||||
def _response_downloaded(self, response):
|
||||
rule = self._rules[response.meta['rule']]
|
||||
@ -83,17 +109,9 @@ class CrawlSpider(Spider):
|
||||
yield request_or_item
|
||||
|
||||
def _compile_rules(self):
|
||||
def get_method(method):
|
||||
if callable(method):
|
||||
return method
|
||||
elif isinstance(method, six.string_types):
|
||||
return getattr(self, method, None)
|
||||
|
||||
self._rules = [copy.copy(r) for r in self.rules]
|
||||
for rule in self._rules:
|
||||
rule.callback = get_method(rule.callback)
|
||||
rule.process_links = get_method(rule.process_links)
|
||||
rule.process_request = get_method(rule.process_request)
|
||||
rule._compile(self)
|
||||
|
||||
@classmethod
|
||||
def from_crawler(cls, crawler, *args, **kwargs):
|
||||
|
@ -7,6 +7,7 @@ from six.moves import cPickle as pickle
|
||||
|
||||
from queuelib import queue
|
||||
|
||||
|
||||
def _serializable_queue(queue_class, serialize, deserialize):
|
||||
|
||||
class SerializableQueue(queue_class):
|
||||
@ -22,6 +23,7 @@ def _serializable_queue(queue_class, serialize, deserialize):
|
||||
|
||||
return SerializableQueue
|
||||
|
||||
|
||||
def _pickle_serialize(obj):
|
||||
try:
|
||||
return pickle.dumps(obj, protocol=2)
|
||||
@ -31,13 +33,14 @@ def _pickle_serialize(obj):
|
||||
except (pickle.PicklingError, AttributeError, TypeError) as e:
|
||||
raise ValueError(str(e))
|
||||
|
||||
PickleFifoDiskQueue = _serializable_queue(queue.FifoDiskQueue, \
|
||||
|
||||
PickleFifoDiskQueue = _serializable_queue(queue.FifoDiskQueue,
|
||||
_pickle_serialize, pickle.loads)
|
||||
PickleLifoDiskQueue = _serializable_queue(queue.LifoDiskQueue, \
|
||||
PickleLifoDiskQueue = _serializable_queue(queue.LifoDiskQueue,
|
||||
_pickle_serialize, pickle.loads)
|
||||
MarshalFifoDiskQueue = _serializable_queue(queue.FifoDiskQueue, \
|
||||
MarshalFifoDiskQueue = _serializable_queue(queue.FifoDiskQueue,
|
||||
marshal.dumps, marshal.loads)
|
||||
MarshalLifoDiskQueue = _serializable_queue(queue.LifoDiskQueue, \
|
||||
MarshalLifoDiskQueue = _serializable_queue(queue.LifoDiskQueue,
|
||||
marshal.dumps, marshal.loads)
|
||||
FifoMemoryQueue = queue.FifoMemoryQueue
|
||||
LifoMemoryQueue = queue.LifoMemoryQueue
|
||||
|
@ -39,7 +39,7 @@ class ${ProjectName}SpiderMiddleware(object):
|
||||
# Called when a spider or process_spider_input() method
|
||||
# (from other spider middleware) raises an exception.
|
||||
|
||||
# Should return either None or an iterable of Response, dict
|
||||
# Should return either None or an iterable of Request, dict
|
||||
# or Item objects.
|
||||
pass
|
||||
|
||||
|
@ -48,7 +48,7 @@ def mustbe_deferred(f, *args, **kw):
|
||||
# exception in Scrapy - see #125
|
||||
except IgnoreRequest as e:
|
||||
return defer_fail(failure.Failure(e))
|
||||
except:
|
||||
except Exception:
|
||||
return defer_fail(failure.Failure())
|
||||
else:
|
||||
return defer_result(result)
|
||||
@ -102,5 +102,5 @@ def iter_errback(iterable, errback, *a, **kw):
|
||||
yield next(it)
|
||||
except StopIteration:
|
||||
break
|
||||
except:
|
||||
except Exception:
|
||||
errback(failure.Failure(), *a, **kw)
|
||||
|
@ -9,6 +9,9 @@ from gzip import GzipFile
|
||||
import six
|
||||
import re
|
||||
|
||||
from scrapy.utils.decorators import deprecated
|
||||
|
||||
|
||||
# - Python>=3.5 GzipFile's read() has issues returning leftover
|
||||
# uncompressed data when input is corrupted
|
||||
# (regression or bug-fix compared to Python 3.4)
|
||||
@ -53,6 +56,7 @@ def gunzip(data):
|
||||
_is_gzipped = re.compile(br'^application/(x-)?gzip\b', re.I).search
|
||||
_is_octetstream = re.compile(br'^(application|binary)/octet-stream\b', re.I).search
|
||||
|
||||
@deprecated
|
||||
def is_gzipped(response):
|
||||
"""Return True if the response is gzipped, or False otherwise"""
|
||||
ctype = response.headers.get('Content-Type', b'')
|
||||
|
@ -1,6 +1,8 @@
|
||||
"""Helper functions which don't fit anywhere else"""
|
||||
import os
|
||||
import re
|
||||
import hashlib
|
||||
from contextlib import contextmanager
|
||||
from importlib import import_module
|
||||
from pkgutil import iter_modules
|
||||
|
||||
@ -86,7 +88,7 @@ def extract_regex(regex, text, encoding='utf-8'):
|
||||
|
||||
try:
|
||||
strings = [regex.search(text).group('extract')] # named group
|
||||
except:
|
||||
except Exception:
|
||||
strings = regex.findall(text) # full regex or numbered groups
|
||||
strings = flatten(strings)
|
||||
|
||||
@ -142,3 +144,21 @@ def create_instance(objcls, settings, crawler, *args, **kwargs):
|
||||
return objcls.from_settings(settings, *args, **kwargs)
|
||||
else:
|
||||
return objcls(*args, **kwargs)
|
||||
|
||||
|
||||
@contextmanager
|
||||
def set_environ(**kwargs):
|
||||
"""Temporarily set environment variables inside the context manager and
|
||||
fully restore previous environment afterwards
|
||||
"""
|
||||
|
||||
original_env = {k: os.environ.get(k) for k in kwargs}
|
||||
os.environ.update(kwargs)
|
||||
try:
|
||||
yield
|
||||
finally:
|
||||
for k, v in original_env.items():
|
||||
if v is None:
|
||||
del os.environ[k]
|
||||
else:
|
||||
os.environ[k] = v
|
||||
|
@ -9,6 +9,7 @@ import weakref
|
||||
import errno
|
||||
import six
|
||||
from functools import partial, wraps
|
||||
from itertools import chain
|
||||
import sys
|
||||
|
||||
from scrapy.utils.decorators import deprecated
|
||||
@ -387,3 +388,22 @@ if hasattr(sys, "pypy_version_info"):
|
||||
else:
|
||||
def garbage_collect():
|
||||
gc.collect()
|
||||
|
||||
|
||||
class MutableChain(object):
|
||||
"""
|
||||
Thin wrapper around itertools.chain, allowing iterables to be added "in-place"
|
||||
"""
|
||||
def __init__(self, *args):
|
||||
self.data = chain(*args)
|
||||
|
||||
def extend(self, *iterables):
|
||||
self.data = chain(self.data, *iterables)
|
||||
|
||||
def __iter__(self):
|
||||
return self.data.__iter__()
|
||||
|
||||
def __next__(self):
|
||||
return next(self.data)
|
||||
|
||||
next = __next__
|
||||
|
@ -70,6 +70,20 @@ def request_from_dict(d, spider=None):
|
||||
)
|
||||
|
||||
|
||||
def _is_private_method(name):
|
||||
return name.startswith('__') and not name.endswith('__')
|
||||
|
||||
|
||||
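# Reproduce Python's class-private name mangling (__name -> _ClassName__name)
# so that callbacks defined as private methods can be serialized by name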
def _mangle_private_name(obj, func, name):
|
||||
qualname = getattr(func, '__qualname__', None)
|
||||
if qualname is None:
|
||||
classname = obj.__class__.__name__.lstrip('_')
|
||||
return '_%s%s' % (classname, name)
|
||||
else:
|
||||
splits = qualname.split('.')
|
||||
return '_%s%s' % (splits[-2], splits[-1])
|
||||
|
||||
|
||||
def _find_method(obj, func):
|
||||
if obj:
|
||||
try:
|
||||
@ -78,7 +92,10 @@ def _find_method(obj, func):
|
||||
pass
|
||||
else:
|
||||
if func_self is obj:
|
||||
return six.get_method_function(func).__name__
|
||||
name = six.get_method_function(func).__name__
|
||||
if _is_private_method(name):
|
||||
return _mangle_private_name(obj, func, name)
|
||||
return name
|
||||
raise ValueError("Function %s is not a method of: %s" % (func, obj))
|
||||
|
||||
|
||||
|
@ -31,12 +31,12 @@ def get_base_url(response):
|
||||
|
||||
|
||||
_metaref_cache = weakref.WeakKeyDictionary()
|
||||
def get_meta_refresh(response):
|
||||
def get_meta_refresh(response, ignore_tags=('script', 'noscript')):
|
||||
"""Parse the http-equiv refrsh parameter from the given response"""
|
||||
if response not in _metaref_cache:
|
||||
text = response.text[0:4096]
|
||||
_metaref_cache[response] = html.get_meta_refresh(text, response.url,
|
||||
response.encoding, ignore_tags=('script', 'noscript'))
|
||||
response.encoding, ignore_tags=ignore_tags)
|
||||
return _metaref_cache[response]
|
||||
|
||||
|
||||
|
3
setup.py
@ -65,7 +65,8 @@ setup(
|
||||
],
|
||||
python_requires='>=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*',
|
||||
install_requires=[
|
||||
'Twisted>=13.1.0',
|
||||
'Twisted>=13.1.0;python_version!="3.4"',
|
||||
'Twisted>=13.1.0,<=19.2.0;python_version=="3.4"',
|
||||
'w3lib>=1.17.0',
|
||||
'queuelib',
|
||||
'lxml',
|
||||
|
@ -177,7 +177,7 @@ class Root(Resource):
|
||||
try:
|
||||
from tests import tests_datadir
|
||||
self.putChild(b"files", File(os.path.join(tests_datadir, 'test_site/files/')))
|
||||
except:
|
||||
except Exception:
|
||||
pass
|
||||
self.putChild(b"redirect-to", RedirectTo())
|
||||
|
||||
|
@ -2,9 +2,10 @@
|
||||
mock
|
||||
mitmproxy==0.10.1
|
||||
netlib==0.10.1
|
||||
pytest==2.9.2
|
||||
pytest
|
||||
pytest-cov
|
||||
pytest-twisted
|
||||
pytest-cov==2.2.1
|
||||
pytest-xdist
|
||||
jmespath
|
||||
brotlipy
|
||||
testfixtures
|
||||
|
@ -1,6 +1,7 @@
|
||||
pytest==3.6.3
|
||||
pytest
|
||||
pytest-cov
|
||||
pytest-twisted
|
||||
pytest-cov==2.5.1
|
||||
pytest-xdist
|
||||
testfixtures
|
||||
jmespath
|
||||
leveldb; sys_platform != "win32"
|
||||
|
@ -53,9 +53,5 @@ class TestCloseSpider(TestCase):
|
||||
yield crawler.crawl(total=1000000, mockserver=self.mockserver)
|
||||
reason = crawler.spider.meta['close_reason']
|
||||
self.assertEqual(reason, 'closespider_timeout')
|
||||
stats = crawler.stats
|
||||
start = stats.get_value('start_time')
|
||||
stop = stats.get_value('finish_time')
|
||||
diff = stop - start
|
||||
total_seconds = diff.seconds + diff.microseconds
|
||||
total_seconds = crawler.stats.get_value('elapsed_time_seconds')
|
||||
self.assertTrue(total_seconds >= close_on)
|
||||
|
@ -1,5 +1,4 @@
|
||||
import logging
|
||||
import tempfile
|
||||
import warnings
|
||||
|
||||
from twisted.internet import defer
|
||||
@ -38,7 +37,11 @@ class CrawlerTestCase(BaseCrawlerTest):
|
||||
self.assertIsInstance(spiders, sl_cls)
|
||||
|
||||
self.crawler.spiders
|
||||
self.assertEqual(len(w), 1, "Warn deprecated access only once")
|
||||
is_one_warning = len(w) == 1
|
||||
if not is_one_warning:
|
||||
for warning in w:
|
||||
print(warning)
|
||||
self.assertTrue(is_one_warning, "Warn deprecated access only once")
|
||||
|
||||
def test_populate_spidercls_settings(self):
|
||||
spider_settings = {'TEST1': 'spider', 'TEST2': 'spider'}
|
||||
@ -179,8 +182,12 @@ class CrawlerRunnerTestCase(BaseCrawlerTest):
|
||||
'SPIDER_MANAGER_CLASS': 'tests.test_crawler.CustomSpiderLoader'
|
||||
})
|
||||
self.assertIsInstance(runner.spider_loader, CustomSpiderLoader)
|
||||
self.assertEqual(len(w), 1)
|
||||
is_one_warning = len(w) == 1
|
||||
if not is_one_warning:
|
||||
for warning in w:
|
||||
print(warning)
|
||||
self.assertIn('Please use SPIDER_LOADER_CLASS', str(w[0].message))
|
||||
self.assertTrue(is_one_warning)
|
||||
|
||||
def test_crawl_rejects_spider_objects(self):
|
||||
with raises(ValueError):
|
||||
|
@ -3,6 +3,7 @@ from twisted.python.failure import Failure
|
||||
|
||||
from scrapy.http import Request, Response
|
||||
from scrapy.spiders import Spider
|
||||
from scrapy.exceptions import _InvalidOutput
|
||||
from scrapy.core.downloader.middleware import DownloaderMiddlewareManager
|
||||
from scrapy.utils.test import get_crawler
|
||||
from scrapy.utils.python import to_bytes
|
||||
@ -115,3 +116,63 @@ class ResponseFromProcessRequestTest(ManagerTestCase):
|
||||
|
||||
self.assertIs(results[0], resp)
|
||||
self.assertFalse(download_func.called)
|
||||
|
||||
|
||||
class ProcessRequestInvalidOutput(ManagerTestCase):
|
||||
"""Invalid return value for process_request method should raise an exception"""
|
||||
|
||||
def test_invalid_process_request(self):
|
||||
req = Request('http://example.com/index.html')
|
||||
|
||||
class InvalidProcessRequestMiddleware:
|
||||
def process_request(self, request, spider):
|
||||
return 1
|
||||
|
||||
self.mwman._add_middleware(InvalidProcessRequestMiddleware())
|
||||
download_func = mock.MagicMock()
|
||||
dfd = self.mwman.download(download_func, req, self.spider)
|
||||
results = []
|
||||
dfd.addBoth(results.append)
|
||||
self.assertIsInstance(results[0], Failure)
|
||||
self.assertIsInstance(results[0].value, _InvalidOutput)
|
||||
|
||||
|
||||
class ProcessResponseInvalidOutput(ManagerTestCase):
|
||||
"""Invalid return value for process_response method should raise an exception"""
|
||||
|
||||
def test_invalid_process_response(self):
|
||||
req = Request('http://example.com/index.html')
|
||||
|
||||
class InvalidProcessResponseMiddleware:
|
||||
def process_response(self, request, response, spider):
|
||||
return 1
|
||||
|
||||
self.mwman._add_middleware(InvalidProcessResponseMiddleware())
|
||||
download_func = mock.MagicMock()
|
||||
dfd = self.mwman.download(download_func, req, self.spider)
|
||||
results = []
|
||||
dfd.addBoth(results.append)
|
||||
self.assertIsInstance(results[0], Failure)
|
||||
self.assertIsInstance(results[0].value, _InvalidOutput)
|
||||
|
||||
|
||||
class ProcessExceptionInvalidOutput(ManagerTestCase):
|
||||
"""Invalid return value for process_exception method should raise an exception"""
|
||||
|
||||
def test_invalid_process_exception(self):
|
||||
req = Request('http://example.com/index.html')
|
||||
|
||||
class InvalidProcessExceptionMiddleware:
|
||||
def process_request(self, request, spider):
|
||||
raise Exception()
|
||||
|
||||
def process_exception(self, request, exception, spider):
|
||||
return 1
|
||||
|
||||
self.mwman._add_middleware(InvalidProcessExceptionMiddleware())
|
||||
download_func = mock.MagicMock()
|
||||
dfd = self.mwman.download(download_func, req, self.spider)
|
||||
results = []
|
||||
dfd.addBoth(results.append)
|
||||
self.assertIsInstance(results[0], Failure)
|
||||
self.assertIsInstance(results[0].value, _InvalidOutput)
|
||||
|
@ -279,5 +279,24 @@ class MetaRefreshMiddlewareTest(unittest.TestCase):
|
||||
self.assertEqual(req2.meta['redirect_reasons'], ['meta refresh'])
|
||||
self.assertEqual(req3.meta['redirect_reasons'], ['meta refresh', 'meta refresh'])
|
||||
|
||||
def test_ignore_tags_default(self):
|
||||
req = Request(url='http://example.org')
|
||||
body = ('''<noscript><meta http-equiv="refresh" '''
|
||||
'''content="0;URL='http://example.org/newpage'"></noscript>''')
|
||||
rsp = HtmlResponse(req.url, body=body.encode())
|
||||
response = self.mw.process_response(req, rsp, self.spider)
|
||||
assert isinstance(response, Response)
|
||||
|
||||
def test_ignore_tags_empty_list(self):
|
||||
crawler = get_crawler(Spider, {'METAREFRESH_IGNORE_TAGS': []})
|
||||
mw = MetaRefreshMiddleware.from_crawler(crawler)
|
||||
req = Request(url='http://example.org')
|
||||
body = ('''<noscript><meta http-equiv="refresh" '''
|
||||
'''content="0;URL='http://example.org/newpage'"></noscript>''')
|
||||
rsp = HtmlResponse(req.url, body=body.encode())
|
||||
req2 = mw.process_response(req, rsp, self.spider)
|
||||
assert isinstance(req2, Request)
|
||||
self.assertEqual(req2.url, 'http://example.org/newpage')
|
||||
|
||||
if __name__ == "__main__":
|
||||
unittest.main()
|
||||
|
@ -419,6 +419,43 @@ class BasicItemLoaderTest(unittest.TestCase):
|
||||
self.assertEqual(item['url'], u'rabbit.hole')
|
||||
self.assertEqual(item['summary'], u'rabbithole')
|
||||
|
||||
def test_create_item_from_dict(self):
|
||||
class TestItem(Item):
|
||||
title = Field()
|
||||
|
||||
class TestItemLoader(ItemLoader):
|
||||
default_item_class = TestItem
|
||||
|
||||
input_item = {'title': 'Test item title 1'}
|
||||
il = TestItemLoader(item=input_item)
|
||||
# Getting output value mustn't remove value from item
|
||||
self.assertEqual(il.load_item(), {
|
||||
'title': 'Test item title 1',
|
||||
})
|
||||
self.assertEqual(il.get_output_value('title'), 'Test item title 1')
|
||||
self.assertEqual(il.load_item(), {
|
||||
'title': 'Test item title 1',
|
||||
})
|
||||
|
||||
input_item = {'title': 'Test item title 2'}
|
||||
il = TestItemLoader(item=input_item)
|
||||
# Values from dict must be added to item _values
|
||||
self.assertEqual(il._values.get('title'), 'Test item title 2')
|
||||
|
||||
input_item = {'title': [u'Test item title 3', u'Test item 4']}
|
||||
il = TestItemLoader(item=input_item)
|
||||
# Same rules must work for lists
|
||||
self.assertEqual(il._values.get('title'),
|
||||
[u'Test item title 3', u'Test item 4'])
|
||||
self.assertEqual(il.load_item(), {
|
||||
'title': [u'Test item title 3', u'Test item 4'],
|
||||
})
|
||||
self.assertEqual(il.get_output_value('title'),
|
||||
[u'Test item title 3', u'Test item 4'])
|
||||
self.assertEqual(il.load_item(), {
|
||||
'title': [u'Test item title 3', u'Test item 4'],
|
||||
})
|
||||
|
||||
|
||||
class ProcessorsTest(unittest.TestCase):
|
||||
|
||||
|
@ -1,15 +1,19 @@
|
||||
from __future__ import print_function
|
||||
|
||||
import sys
|
||||
|
||||
from testfixtures import LogCapture
|
||||
from twisted.trial import unittest
|
||||
from twisted.python.failure import Failure
|
||||
from twisted.internet import reactor
|
||||
from twisted.internet.defer import Deferred, inlineCallbacks
|
||||
from twisted.internet.defer import Deferred, inlineCallbacks, returnValue
|
||||
|
||||
from scrapy.http import Request, Response
|
||||
from scrapy.settings import Settings
|
||||
from scrapy.spiders import Spider
|
||||
from scrapy.utils.request import request_fingerprint
|
||||
from scrapy.pipelines.media import MediaPipeline
|
||||
from scrapy.pipelines.files import FileException
|
||||
from scrapy.utils.log import failure_to_exc_info
|
||||
from scrapy.utils.signal import disconnect_all
|
||||
from scrapy import signals
|
||||
@ -90,6 +94,77 @@ class BaseMediaPipelineTestCase(unittest.TestCase):
|
||||
self.pipe._modify_media_request(request)
|
||||
assert request.meta == {'handle_httpstatus_all': True}
|
||||
|
||||
def test_should_remove_req_res_references_before_caching_the_results(self):
|
||||
"""Regression test case to prevent a memory leak in the Media Pipeline.
|
||||
|
||||
The memory leak is triggered when an exception is raised while a Response
|
||||
scheduled by the Media Pipeline is being returned. For example, when a
|
||||
FileException('download-error') is raised because the Response status
|
||||
code is not 200 OK.
|
||||
|
||||
It happens because we are keeping a reference to the Response object
|
||||
inside the FileException context. This is caused by the way Twisted
|
||||
returns values from inline callbacks. It raises a custom exception
|
||||
encapsulating the original return value.
|
||||
|
||||
The solution is to remove the exception context when this context is a
|
||||
_DefGen_Return instance, the BaseException used by Twisted to pass the
|
||||
returned value from those inline callbacks.
|
||||
|
||||
Maybe there's a better and more reliable way to test the case described
|
||||
here, but it would be more complicated and involve running - or at least
|
||||
mocking - some async steps from the Media Pipeline. The current test
|
||||
case is simple and detects the problem very fast. On the other hand, it
|
||||
would not detect another kind of leak happening due to old object
|
||||
references being kept inside the Media Pipeline cache.
|
||||
|
||||
This problem does not occur in Python 2.7 since we don't have Exception
|
||||
Chaining (https://www.python.org/dev/peps/pep-3134/).
|
||||
"""
|
||||
# Create sample pair of Request and Response objects
|
||||
request = Request('http://url')
|
||||
response = Response('http://url', body=b'', request=request)
|
||||
|
||||
# Simulate the Media Pipeline behavior to produce a Twisted Failure
|
||||
try:
|
||||
# Simulate a Twisted inline callback returning a Response
|
||||
# The returnValue method raises an exception encapsulating the value
|
||||
returnValue(response)
|
||||
except BaseException as exc:
|
||||
def_gen_return_exc = exc
|
||||
try:
|
||||
# Simulate the media_downloaded callback raising a FileException
|
||||
# This usually happens when the status code is not 200 OK
|
||||
raise FileException('download-error')
|
||||
except Exception as exc:
|
||||
file_exc = exc
|
||||
# Simulate Twisted capturing the FileException
|
||||
# It encapsulates the exception inside a Twisted Failure
|
||||
failure = Failure(file_exc)
|
||||
|
||||
# The Failure should encapsulate a FileException ...
|
||||
self.assertEqual(failure.value, file_exc)
|
||||
# ... and if we're running on Python 3 ...
|
||||
if sys.version_info.major >= 3:
|
||||
# ... it should have the returnValue exception set as its context
|
||||
self.assertEqual(failure.value.__context__, def_gen_return_exc)
|
||||
|
||||
# Let's calculate the request fingerprint and fake some runtime data...
|
||||
fp = request_fingerprint(request)
|
||||
info = self.pipe.spiderinfo
|
||||
info.downloading.add(fp)
|
||||
info.waiting[fp] = []
|
||||
|
||||
# When calling the method that caches the Request's result ...
|
||||
self.pipe._cache_result_and_execute_waiters(failure, fp, info)
|
||||
# ... it should store the Twisted Failure ...
|
||||
self.assertEqual(info.downloaded[fp], failure)
|
||||
# ... encapsulating the original FileException ...
|
||||
self.assertEqual(info.downloaded[fp].value, file_exc)
|
||||
# ... but it should not store the returnValue exception on its context
|
||||
context = getattr(info.downloaded[fp].value, '__context__', None)
|
||||
self.assertIsNone(context)
|
||||
|
||||
|
||||
class MockedMediaPipeline(MediaPipeline):
|
||||
|
||||
|
342
tests/test_scheduler.py
Normal file
@ -0,0 +1,342 @@
|
||||
import shutil
|
||||
import tempfile
|
||||
import unittest
|
||||
import collections
|
||||
|
||||
from twisted.internet import defer
|
||||
from twisted.trial.unittest import TestCase
|
||||
|
||||
from scrapy.crawler import Crawler
|
||||
from scrapy.core.downloader import Downloader
|
||||
from scrapy.core.scheduler import Scheduler
|
||||
from scrapy.http import Request
|
||||
from scrapy.spiders import Spider
|
||||
from scrapy.utils.httpobj import urlparse_cached
|
||||
from scrapy.utils.test import get_crawler
|
||||
from tests.mockserver import MockServer
|
||||
|
||||
|
||||
MockEngine = collections.namedtuple('MockEngine', ['downloader'])
|
||||
MockSlot = collections.namedtuple('MockSlot', ['active'])
|
||||
|
||||
|
||||
class MockDownloader(object):
|
||||
def __init__(self):
|
||||
self.slots = dict()
|
||||
|
||||
def _get_slot_key(self, request, spider):
|
||||
if Downloader.DOWNLOAD_SLOT in request.meta:
|
||||
return request.meta[Downloader.DOWNLOAD_SLOT]
|
||||
|
||||
return urlparse_cached(request).hostname or ''
|
||||
|
||||
def increment(self, slot_key):
|
||||
slot = self.slots.setdefault(slot_key, MockSlot(active=list()))
|
||||
slot.active.append(1)
|
||||
|
||||
def decrement(self, slot_key):
|
||||
slot = self.slots.get(slot_key)
|
||||
slot.active.pop()
|
||||
|
||||
def close(self):
|
||||
pass
|
||||
|
||||
|
||||
class MockCrawler(Crawler):
|
||||
def __init__(self, priority_queue_cls, jobdir):
|
||||
|
||||
settings = dict(
|
||||
LOG_UNSERIALIZABLE_REQUESTS=False,
|
||||
SCHEDULER_DISK_QUEUE='scrapy.squeues.PickleLifoDiskQueue',
|
||||
SCHEDULER_MEMORY_QUEUE='scrapy.squeues.LifoMemoryQueue',
|
||||
SCHEDULER_PRIORITY_QUEUE=priority_queue_cls,
|
||||
JOBDIR=jobdir,
|
||||
DUPEFILTER_CLASS='scrapy.dupefilters.BaseDupeFilter'
|
||||
)
|
||||
super(MockCrawler, self).__init__(Spider, settings)
|
||||
self.engine = MockEngine(downloader=MockDownloader())
|
||||
|
||||
|
||||
class SchedulerHandler(object):
|
||||
priority_queue_cls = None
|
||||
jobdir = None
|
||||
|
||||
def create_scheduler(self):
|
||||
self.mock_crawler = MockCrawler(self.priority_queue_cls, self.jobdir)
|
||||
self.scheduler = Scheduler.from_crawler(self.mock_crawler)
|
||||
self.spider = Spider(name='spider')
|
||||
self.scheduler.open(self.spider)
|
||||
|
||||
def close_scheduler(self):
|
||||
self.scheduler.close('finished')
|
||||
self.mock_crawler.stop()
|
||||
self.mock_crawler.engine.downloader.close()
|
||||
|
||||
def setUp(self):
|
||||
self.create_scheduler()
|
||||
|
||||
def tearDown(self):
|
||||
self.close_scheduler()
|
||||
|
||||
|
||||
_PRIORITIES = [("http://foo.com/a", -2),
|
||||
("http://foo.com/d", 1),
|
||||
("http://foo.com/b", -1),
|
||||
("http://foo.com/c", 0),
|
||||
("http://foo.com/e", 2)]
|
||||
|
||||
|
||||
_URLS = {"http://foo.com/a", "http://foo.com/b", "http://foo.com/c"}
|
||||
|
||||
|
||||
class BaseSchedulerInMemoryTester(SchedulerHandler):
|
||||
def test_length(self):
|
||||
self.assertFalse(self.scheduler.has_pending_requests())
|
||||
self.assertEqual(len(self.scheduler), 0)
|
||||
|
||||
for url in _URLS:
|
||||
self.scheduler.enqueue_request(Request(url))
|
||||
|
||||
self.assertTrue(self.scheduler.has_pending_requests())
|
||||
self.assertEqual(len(self.scheduler), len(_URLS))
|
||||
|
||||
def test_dequeue(self):
|
||||
for url in _URLS:
|
||||
self.scheduler.enqueue_request(Request(url))
|
||||
|
||||
urls = set()
|
||||
while self.scheduler.has_pending_requests():
|
||||
urls.add(self.scheduler.next_request().url)
|
||||
|
||||
self.assertEqual(urls, _URLS)
|
||||
|
||||
def test_dequeue_priorities(self):
|
||||
for url, priority in _PRIORITIES:
|
||||
self.scheduler.enqueue_request(Request(url, priority=priority))
|
||||
|
||||
priorities = list()
|
||||
while self.scheduler.has_pending_requests():
|
||||
priorities.append(self.scheduler.next_request().priority)
|
||||
|
||||
self.assertEqual(priorities,
|
||||
sorted([x[1] for x in _PRIORITIES], key=lambda x: -x))
|
||||
|
||||
|
||||
class BaseSchedulerOnDiskTester(SchedulerHandler):
|
||||
|
||||
def setUp(self):
|
||||
self.jobdir = tempfile.mkdtemp()
|
||||
self.create_scheduler()
|
||||
|
||||
def tearDown(self):
|
||||
self.close_scheduler()
|
||||
|
||||
shutil.rmtree(self.jobdir)
|
||||
self.jobdir = None
|
||||
|
||||
def test_length(self):
|
||||
self.assertFalse(self.scheduler.has_pending_requests())
|
||||
self.assertEqual(len(self.scheduler), 0)
|
||||
|
||||
for url in _URLS:
|
||||
self.scheduler.enqueue_request(Request(url))
|
||||
|
||||
self.close_scheduler()
|
||||
self.create_scheduler()
|
||||
|
||||
self.assertTrue(self.scheduler.has_pending_requests())
|
||||
self.assertEqual(len(self.scheduler), len(_URLS))
|
||||
|
||||
def test_dequeue(self):
|
||||
for url in _URLS:
|
||||
self.scheduler.enqueue_request(Request(url))
|
||||
|
||||
self.close_scheduler()
|
||||
self.create_scheduler()
|
||||
|
||||
urls = set()
|
||||
while self.scheduler.has_pending_requests():
|
||||
urls.add(self.scheduler.next_request().url)
|
||||
|
||||
self.assertEqual(urls, _URLS)
|
||||
|
||||
def test_dequeue_priorities(self):
|
||||
for url, priority in _PRIORITIES:
|
||||
self.scheduler.enqueue_request(Request(url, priority=priority))
|
||||
|
||||
self.close_scheduler()
|
||||
self.create_scheduler()
|
||||
|
||||
priorities = list()
|
||||
while self.scheduler.has_pending_requests():
|
||||
priorities.append(self.scheduler.next_request().priority)
|
||||
|
||||
self.assertEqual(priorities,
|
||||
sorted([x[1] for x in _PRIORITIES], key=lambda x: -x))
|
||||
|
||||
|
||||
class TestSchedulerInMemory(BaseSchedulerInMemoryTester, unittest.TestCase):
|
||||
priority_queue_cls = 'scrapy.pqueues.ScrapyPriorityQueue'
|
||||
|
||||
|
||||
class TestSchedulerOnDisk(BaseSchedulerOnDiskTester, unittest.TestCase):
|
||||
priority_queue_cls = 'scrapy.pqueues.ScrapyPriorityQueue'
|
||||
|
||||
|
||||
_URLS_WITH_SLOTS = [("http://foo.com/a", 'a'),
|
||||
("http://foo.com/b", 'a'),
|
||||
("http://foo.com/c", 'b'),
|
||||
("http://foo.com/d", 'b'),
|
||||
("http://foo.com/e", 'c'),
|
||||
("http://foo.com/f", 'c')]
|
||||
|
||||
|
||||
class TestMigration(unittest.TestCase):
|
||||
|
||||
def setUp(self):
|
||||
self.tmpdir = tempfile.mkdtemp()
|
||||
|
||||
def tearDown(self):
|
||||
shutil.rmtree(self.tmpdir)
|
||||
|
||||
def _migration(self, tmp_dir):
|
||||
prev_scheduler_handler = SchedulerHandler()
|
||||
prev_scheduler_handler.priority_queue_cls = 'scrapy.pqueues.ScrapyPriorityQueue'
|
||||
prev_scheduler_handler.jobdir = tmp_dir
|
||||
|
||||
prev_scheduler_handler.create_scheduler()
|
||||
for url in _URLS:
|
||||
prev_scheduler_handler.scheduler.enqueue_request(Request(url))
|
||||
prev_scheduler_handler.close_scheduler()
|
||||
|
||||
next_scheduler_handler = SchedulerHandler()
|
||||
next_scheduler_handler.priority_queue_cls = 'scrapy.pqueues.DownloaderAwarePriorityQueue'
|
||||
next_scheduler_handler.jobdir = tmp_dir
|
||||
|
||||
next_scheduler_handler.create_scheduler()
|
||||
|
||||
def test_migration(self):
|
||||
with self.assertRaises(ValueError):
|
||||
self._migration(self.tmpdir)
|
||||
|
||||
|
||||
def _is_scheduling_fair(enqueued_slots, dequeued_slots):
|
||||
"""
|
||||
We enqueued the same number of requests for every slot.
Assert that the slots are dequeued in a fair order, e.g.
|
||||
|
||||
>>> enqueued = ['a', 'b', 'c'] * 2
|
||||
>>> correct = ['a', 'c', 'b', 'b', 'a', 'c']
|
||||
>>> incorrect = ['a', 'a', 'b', 'c', 'c', 'b']
|
||||
>>> _is_scheduling_fair(enqueued, correct)
|
||||
True
|
||||
>>> _is_scheduling_fair(enqueued, incorrect)
|
||||
False
|
||||
"""
|
||||
if len(dequeued_slots) != len(enqueued_slots):
|
||||
return False
|
||||
|
||||
slots_number = len(set(enqueued_slots))
|
||||
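# fair scheduling: every consecutive block of ``slots_number`` dequeues
# must not repeat a slot, i.e. each slot gets a turn before any repeats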
for i in range(0, len(dequeued_slots), slots_number):
|
||||
part = dequeued_slots[i:i + slots_number]
|
||||
if len(part) != len(set(part)):
|
||||
return False
|
||||
|
||||
return True
|
||||
|
||||
|
||||
class DownloaderAwareSchedulerTestMixin(object):
|
||||
priority_queue_cls = 'scrapy.pqueues.DownloaderAwarePriorityQueue'
|
||||
reopen = False
|
||||
|
||||
def test_logic(self):
|
||||
for url, slot in _URLS_WITH_SLOTS:
|
||||
request = Request(url)
|
||||
request.meta[Downloader.DOWNLOAD_SLOT] = slot
|
||||
self.scheduler.enqueue_request(request)
|
||||
|
||||
if self.reopen:
|
||||
self.close_scheduler()
|
||||
self.create_scheduler()
|
||||
|
||||
dequeued_slots = list()
|
||||
requests = []
|
||||
downloader = self.mock_crawler.engine.downloader
|
||||
while self.scheduler.has_pending_requests():
|
||||
request = self.scheduler.next_request()
|
||||
# pylint: disable=protected-access
|
||||
slot = downloader._get_slot_key(request, None)
|
||||
dequeued_slots.append(slot)
|
||||
downloader.increment(slot)
|
||||
requests.append(request)
|
||||
|
||||
for request in requests:
|
||||
# pylint: disable=protected-access
|
||||
slot = downloader._get_slot_key(request, None)
|
||||
downloader.decrement(slot)
|
||||
|
||||
self.assertTrue(_is_scheduling_fair(list(s for u, s in _URLS_WITH_SLOTS),
|
||||
dequeued_slots))
|
||||
self.assertEqual(sum(len(s.active) for s in downloader.slots.values()), 0)
|
||||
|
||||
|
||||
class TestSchedulerWithDownloaderAwareInMemory(DownloaderAwareSchedulerTestMixin,
|
||||
BaseSchedulerInMemoryTester,
|
||||
unittest.TestCase):
|
||||
pass
|
||||
|
||||
|
||||
class TestSchedulerWithDownloaderAwareOnDisk(DownloaderAwareSchedulerTestMixin,
|
||||
BaseSchedulerOnDiskTester,
|
||||
unittest.TestCase):
|
||||
reopen = True
|
||||
|
||||
|
||||
class StartUrlsSpider(Spider):
|
||||
|
||||
def __init__(self, start_urls):
|
||||
self.start_urls = start_urls
|
||||
super(StartUrlsSpider, self).__init__(start_urls)
|
||||
|
||||
def parse(self, response):
|
||||
pass
|
||||
|
||||
|
||||
class TestIntegrationWithDownloaderAwareInMemory(TestCase):
|
||||
def setUp(self):
|
||||
self.crawler = get_crawler(
|
||||
StartUrlsSpider,
|
||||
{'SCHEDULER_PRIORITY_QUEUE': 'scrapy.pqueues.DownloaderAwarePriorityQueue',
|
||||
'DUPEFILTER_CLASS': 'scrapy.dupefilters.BaseDupeFilter'}
|
||||
)
|
||||
|
||||
@defer.inlineCallbacks
|
||||
def tearDown(self):
|
||||
yield self.crawler.stop()
|
||||
|
||||
@defer.inlineCallbacks
|
||||
def test_integration_downloader_aware_priority_queue(self):
|
||||
with MockServer() as mockserver:
|
||||
|
||||
url = mockserver.url("/status?n=200", is_secure=False)
|
||||
start_urls = [url] * 6
|
||||
yield self.crawler.crawl(start_urls)
|
||||
self.assertEqual(self.crawler.stats.get_value('downloader/response_count'),
|
||||
len(start_urls))
|
||||
|
||||
|
||||
class TestIncompatibility(unittest.TestCase):
|
||||
|
||||
def _incompatible(self):
|
||||
settings = dict(
|
||||
SCHEDULER_PRIORITY_QUEUE='scrapy.pqueues.DownloaderAwarePriorityQueue',
|
||||
CONCURRENT_REQUESTS_PER_IP=1
|
||||
)
|
||||
crawler = Crawler(Spider, settings)
|
||||
scheduler = Scheduler.from_crawler(crawler)
|
||||
spider = Spider(name='spider')
|
||||
scheduler.open(spider)
|
||||
|
||||
def test_incompatibility(self):
|
||||
with self.assertRaises(ValueError):
|
||||
self._incompatible()
|
@ -105,11 +105,11 @@ class SpiderTest(unittest.TestCase):
|
||||
|
||||
def test_logger(self):
|
||||
spider = self.spider_class('example.com')
|
||||
with LogCapture() as l:
|
||||
with LogCapture() as lc:
|
||||
spider.logger.info('test log msg')
|
||||
l.check(('example.com', 'INFO', 'test log msg'))
|
||||
lc.check(('example.com', 'INFO', 'test log msg'))
|
||||
|
||||
record = l.records[0]
|
||||
record = lc.records[0]
|
||||
self.assertIn('spider', record.__dict__)
|
||||
self.assertIs(record.spider, spider)
|
||||
|
||||
@ -190,12 +190,11 @@ class CrawlSpiderTest(SpiderTest):
|
||||
|
||||
def test_process_links(self):
|
||||
|
||||
response = HtmlResponse("http://example.org/somepage/index.html",
|
||||
body=self.test_body)
|
||||
response = HtmlResponse("http://example.org/somepage/index.html", body=self.test_body)
|
||||
|
||||
class _CrawlSpider(self.spider_class):
|
||||
name="test"
|
||||
allowed_domains=['example.org']
|
||||
name = "test"
|
||||
allowed_domains = ['example.org']
|
||||
rules = (
|
||||
Rule(LinkExtractor(), process_links="dummy_process_links"),
|
||||
)
|
||||
@ -208,24 +207,24 @@ class CrawlSpiderTest(SpiderTest):
|
||||
self.assertEqual(len(output), 3)
|
||||
self.assertTrue(all(map(lambda r: isinstance(r, Request), output)))
|
||||
self.assertEqual([r.url for r in output],
|
||||
['http://example.org/somepage/item/12.html',
|
||||
'http://example.org/about.html',
|
||||
'http://example.org/nofollow.html'])
|
||||
['http://example.org/somepage/item/12.html',
|
||||
'http://example.org/about.html',
|
||||
'http://example.org/nofollow.html'])
|
||||
|
||||
def test_process_links_filter(self):
|
||||
|
||||
response = HtmlResponse("http://example.org/somepage/index.html",
|
||||
body=self.test_body)
|
||||
response = HtmlResponse("http://example.org/somepage/index.html", body=self.test_body)
|
||||
|
||||
class _CrawlSpider(self.spider_class):
|
||||
import re
|
||||
|
||||
name="test"
|
||||
allowed_domains=['example.org']
|
||||
name = "test"
|
||||
allowed_domains = ['example.org']
|
||||
rules = (
|
||||
Rule(LinkExtractor(), process_links="filter_process_links"),
|
||||
)
|
||||
_test_regex = re.compile('nofollow')
|
||||
|
||||
def filter_process_links(self, links):
|
||||
return [link for link in links
|
||||
if not self._test_regex.search(link.url)]
|
||||
@ -235,17 +234,16 @@ class CrawlSpiderTest(SpiderTest):
|
||||
self.assertEqual(len(output), 2)
|
||||
self.assertTrue(all(map(lambda r: isinstance(r, Request), output)))
|
||||
self.assertEqual([r.url for r in output],
|
||||
['http://example.org/somepage/item/12.html',
|
||||
'http://example.org/about.html'])
|
||||
['http://example.org/somepage/item/12.html',
|
||||
'http://example.org/about.html'])
|
||||
|
||||
def test_process_links_generator(self):
|
||||
|
||||
response = HtmlResponse("http://example.org/somepage/index.html",
|
||||
body=self.test_body)
|
||||
response = HtmlResponse("http://example.org/somepage/index.html", body=self.test_body)
|
||||
|
||||
class _CrawlSpider(self.spider_class):
|
||||
name="test"
|
||||
allowed_domains=['example.org']
|
||||
name = "test"
|
||||
allowed_domains = ['example.org']
|
||||
rules = (
|
||||
Rule(LinkExtractor(), process_links="dummy_process_links"),
|
||||
)
|
||||
@ -259,9 +257,113 @@ class CrawlSpiderTest(SpiderTest):
|
||||
self.assertEqual(len(output), 3)
|
||||
self.assertTrue(all(map(lambda r: isinstance(r, Request), output)))
|
||||
self.assertEqual([r.url for r in output],
|
||||
['http://example.org/somepage/item/12.html',
|
||||
'http://example.org/about.html',
|
||||
'http://example.org/nofollow.html'])
|
||||
['http://example.org/somepage/item/12.html',
|
||||
'http://example.org/about.html',
|
||||
'http://example.org/nofollow.html'])
|
||||
|
||||
def test_process_request(self):
|
||||
|
||||
response = HtmlResponse("http://example.org/somepage/index.html", body=self.test_body)
|
||||
|
||||
def process_request_change_domain(request):
|
||||
return request.replace(url=request.url.replace('.org', '.com'))
|
||||
|
||||
class _CrawlSpider(self.spider_class):
|
||||
name = "test"
|
||||
allowed_domains = ['example.org']
|
||||
rules = (
|
||||
Rule(LinkExtractor(), process_request=process_request_change_domain),
|
||||
)
|
||||
|
||||
with warnings.catch_warnings(record=True) as cw:
|
||||
spider = _CrawlSpider()
|
||||
output = list(spider._requests_to_follow(response))
|
||||
self.assertEqual(len(output), 3)
|
||||
self.assertTrue(all(map(lambda r: isinstance(r, Request), output)))
|
||||
self.assertEqual([r.url for r in output],
|
||||
['http://example.com/somepage/item/12.html',
|
||||
'http://example.com/about.html',
|
||||
'http://example.com/nofollow.html'])
|
||||
self.assertEqual(len(cw), 1)
|
||||
self.assertEqual(cw[0].category, ScrapyDeprecationWarning)
|
||||
|
||||
def test_process_request_with_response(self):
|
||||
|
||||
response = HtmlResponse("http://example.org/somepage/index.html", body=self.test_body)
|
||||
|
||||
def process_request_meta_response_class(request, response):
|
||||
request.meta['response_class'] = response.__class__.__name__
|
||||
return request
|
||||
|
||||
class _CrawlSpider(self.spider_class):
|
||||
name = "test"
|
||||
allowed_domains = ['example.org']
|
||||
rules = (
|
||||
Rule(LinkExtractor(), process_request=process_request_meta_response_class),
|
||||
)
|
||||
|
||||
spider = _CrawlSpider()
|
||||
output = list(spider._requests_to_follow(response))
|
||||
self.assertEqual(len(output), 3)
|
||||
self.assertTrue(all(map(lambda r: isinstance(r, Request), output)))
|
||||
self.assertEqual([r.url for r in output],
|
||||
['http://example.org/somepage/item/12.html',
|
||||
'http://example.org/about.html',
|
||||
'http://example.org/nofollow.html'])
|
||||
self.assertEqual([r.meta['response_class'] for r in output],
|
||||
['HtmlResponse', 'HtmlResponse', 'HtmlResponse'])
|
||||
|
||||
def test_process_request_instance_method(self):
|
||||
|
||||
response = HtmlResponse("http://example.org/somepage/index.html", body=self.test_body)
|
||||
|
||||
class _CrawlSpider(self.spider_class):
|
||||
name = "test"
|
||||
allowed_domains = ['example.org']
|
||||
rules = (
|
||||
Rule(LinkExtractor(), process_request='process_request_upper'),
|
||||
)
|
||||
|
||||
def process_request_upper(self, request):
|
||||
return request.replace(url=request.url.upper())
|
||||
|
||||
with warnings.catch_warnings(record=True) as cw:
|
||||
spider = _CrawlSpider()
|
||||
output = list(spider._requests_to_follow(response))
|
||||
self.assertEqual(len(output), 3)
|
||||
self.assertTrue(all(map(lambda r: isinstance(r, Request), output)))
|
||||
self.assertEqual([r.url for r in output],
|
||||
['http://EXAMPLE.ORG/SOMEPAGE/ITEM/12.HTML',
|
||||
'http://EXAMPLE.ORG/ABOUT.HTML',
|
||||
'http://EXAMPLE.ORG/NOFOLLOW.HTML'])
|
||||
self.assertEqual(len(cw), 1)
|
||||
self.assertEqual(cw[0].category, ScrapyDeprecationWarning)
|
||||
|
||||
def test_process_request_instance_method_with_response(self):
|
||||
|
||||
response = HtmlResponse("http://example.org/somepage/index.html", body=self.test_body)
|
||||
|
||||
class _CrawlSpider(self.spider_class):
|
||||
name = "test"
|
||||
allowed_domains = ['example.org']
|
||||
rules = (
|
||||
Rule(LinkExtractor(), process_request='process_request_meta_response_class'),
|
||||
)
|
||||
|
||||
def process_request_meta_response_class(self, request, response):
|
||||
request.meta['response_class'] = response.__class__.__name__
|
||||
return request
|
||||
|
||||
spider = _CrawlSpider()
|
||||
output = list(spider._requests_to_follow(response))
|
||||
self.assertEqual(len(output), 3)
|
||||
self.assertTrue(all(map(lambda r: isinstance(r, Request), output)))
|
||||
self.assertEqual([r.url for r in output],
|
||||
['http://example.org/somepage/item/12.html',
|
||||
'http://example.org/about.html',
|
||||
'http://example.org/nofollow.html'])
|
||||
self.assertEqual([r.meta['response_class'] for r in output],
|
||||
['HtmlResponse', 'HtmlResponse', 'HtmlResponse'])
|
||||
|
||||
def test_follow_links_attribute_population(self):
|
||||
crawler = get_crawler()
|
||||
|
102
tests/test_spidermiddleware.py
Normal file
@ -0,0 +1,102 @@
|
||||
from twisted.trial.unittest import TestCase
|
||||
from twisted.python.failure import Failure
|
||||
|
||||
from scrapy.spiders import Spider
|
||||
from scrapy.http import Request, Response
|
||||
from scrapy.exceptions import _InvalidOutput
|
||||
from scrapy.utils.test import get_crawler
|
||||
from scrapy.core.spidermw import SpiderMiddlewareManager
|
||||
from tests import mock
|
||||
|
||||
|
||||
class SpiderMiddlewareTestCase(TestCase):
|
||||
|
||||
def setUp(self):
|
||||
self.request = Request('http://example.com/index.html')
|
||||
self.response = Response(self.request.url, request=self.request)
|
||||
self.crawler = get_crawler(Spider)
|
||||
self.spider = self.crawler._create_spider('foo')
|
||||
self.mwman = SpiderMiddlewareManager.from_crawler(self.crawler)
|
||||
|
||||
def _scrape_response(self):
|
||||
"""Execute spider mw manager's scrape_response method and return the result.
|
||||
Raise exception in case of failure.
|
||||
"""
|
||||
scrape_func = mock.MagicMock()
|
||||
dfd = self.mwman.scrape_response(scrape_func, self.response, self.request, self.spider)
|
||||
# catch deferred result and return the value
|
||||
results = []
|
||||
dfd.addBoth(results.append)
|
||||
self._wait(dfd)
|
||||
ret = results[0]
|
||||
return ret
|
||||
|
||||
|
||||
class ProcessSpiderInputInvalidOutput(SpiderMiddlewareTestCase):
|
||||
"""Invalid return value for process_spider_input method"""
|
||||
|
||||
def test_invalid_process_spider_input(self):
|
||||
|
||||
class InvalidProcessSpiderInputMiddleware:
|
||||
def process_spider_input(self, response, spider):
|
||||
return 1
|
||||
|
||||
self.mwman._add_middleware(InvalidProcessSpiderInputMiddleware())
|
||||
result = self._scrape_response()
|
||||
self.assertIsInstance(result, Failure)
|
||||
self.assertIsInstance(result.value, _InvalidOutput)
|
||||
|
||||
|
||||
class ProcessSpiderOutputInvalidOutput(SpiderMiddlewareTestCase):
|
||||
"""Invalid return value for process_spider_output method"""
|
||||
|
||||
def test_invalid_process_spider_output(self):
|
||||
|
||||
class InvalidProcessSpiderOutputMiddleware:
|
||||
def process_spider_output(self, response, result, spider):
|
||||
return 1
|
||||
|
||||
self.mwman._add_middleware(InvalidProcessSpiderOutputMiddleware())
|
||||
result = self._scrape_response()
|
||||
self.assertIsInstance(result, Failure)
|
||||
self.assertIsInstance(result.value, _InvalidOutput)
|
||||
|
||||
|
||||
class ProcessSpiderExceptionInvalidOutput(SpiderMiddlewareTestCase):
|
||||
"""Invalid return value for process_spider_exception method"""
|
||||
|
||||
def test_invalid_process_spider_exception(self):
|
||||
|
||||
class InvalidProcessSpiderOutputExceptionMiddleware:
|
||||
def process_spider_exception(self, response, exception, spider):
|
||||
return 1
|
||||
|
||||
class RaiseExceptionProcessSpiderOutputMiddleware:
|
||||
def process_spider_output(self, response, result, spider):
|
||||
raise Exception()
|
||||
|
||||
self.mwman._add_middleware(InvalidProcessSpiderOutputExceptionMiddleware())
|
||||
self.mwman._add_middleware(RaiseExceptionProcessSpiderOutputMiddleware())
|
||||
result = self._scrape_response()
|
||||
self.assertIsInstance(result, Failure)
|
||||
self.assertIsInstance(result.value, _InvalidOutput)
|
||||
|
||||
|
||||
class ProcessSpiderExceptionReRaise(SpiderMiddlewareTestCase):
|
||||
"""Re raise the exception by returning None"""
|
||||
|
||||
def test_process_spider_exception_return_none(self):
|
||||
|
||||
class ProcessSpiderExceptionReturnNoneMiddleware:
|
||||
def process_spider_exception(self, response, exception, spider):
|
||||
return None
|
||||
|
||||
class RaiseExceptionProcessSpiderOutputMiddleware:
|
||||
def process_spider_output(self, response, result, spider):
|
||||
1/0
|
||||
|
||||
self.mwman._add_middleware(ProcessSpiderExceptionReturnNoneMiddleware())
|
||||
self.mwman._add_middleware(RaiseExceptionProcessSpiderOutputMiddleware())
|
||||
result = self._scrape_response()
|
||||
self.assertIsInstance(result, Failure)
|
||||
self.assertIsInstance(result.value, ZeroDivisionError)
|
380
tests/test_spidermiddleware_output_chain.py
Normal file
@ -0,0 +1,380 @@
from testfixtures import LogCapture
from twisted.trial.unittest import TestCase
from twisted.internet import defer

from scrapy import Spider, Request
from scrapy.utils.test import get_crawler
from tests.mockserver import MockServer
from tests.spiders import MockServerSpider


class LogExceptionMiddleware:
    def process_spider_exception(self, response, exception, spider):
        spider.logger.info('Middleware: %s exception caught', exception.__class__.__name__)
        return None


# ================================================================================
# (0) recover from an exception on a spider callback
class RecoverySpider(Spider):
    name = 'RecoverySpider'
    custom_settings = {
        'SPIDER_MIDDLEWARES': {
            __name__ + '.RecoveryMiddleware': 10,
        },
    }

    def start_requests(self):
        yield Request(self.mockserver.url('/status?n=200'))

    def parse(self, response):
        yield {'test': 1}
        self.logger.info('DONT_FAIL: %s', response.meta.get('dont_fail'))
        if not response.meta.get('dont_fail'):
            raise TabError()


class RecoveryMiddleware:
    def process_spider_exception(self, response, exception, spider):
        spider.logger.info('Middleware: %s exception caught', exception.__class__.__name__)
        return [
            {'from': 'process_spider_exception'},
            Request(response.url, meta={'dont_fail': True}, dont_filter=True),
        ]


# ================================================================================
# (1) exceptions from a spider middleware's process_spider_input method
class FailProcessSpiderInputMiddleware:
    def process_spider_input(self, response, spider):
        spider.logger.info('Middleware: will raise IndexError')
        raise IndexError()


class ProcessSpiderInputSpiderWithoutErrback(Spider):
    name = 'ProcessSpiderInputSpiderWithoutErrback'
    custom_settings = {
        'SPIDER_MIDDLEWARES': {
            # spider
            __name__ + '.LogExceptionMiddleware': 10,
            __name__ + '.FailProcessSpiderInputMiddleware': 8,
            __name__ + '.LogExceptionMiddleware': 6,
            # engine
        }
    }

    def start_requests(self):
        yield Request(url=self.mockserver.url('/status?n=200'), callback=self.parse)

    def parse(self, response):
        return {'from': 'callback'}


class ProcessSpiderInputSpiderWithErrback(ProcessSpiderInputSpiderWithoutErrback):
    name = 'ProcessSpiderInputSpiderWithErrback'

    def start_requests(self):
        yield Request(url=self.mockserver.url('/status?n=200'), callback=self.parse, errback=self.errback)

    def errback(self, failure):
        self.logger.info('Got a Failure on the Request errback')
        return {'from': 'errback'}
# ================================================================================
# (2) exceptions from a spider callback (generator)
class GeneratorCallbackSpider(Spider):
    name = 'GeneratorCallbackSpider'
    custom_settings = {
        'SPIDER_MIDDLEWARES': {
            __name__ + '.LogExceptionMiddleware': 10,
        },
    }

    def start_requests(self):
        yield Request(self.mockserver.url('/status?n=200'))

    def parse(self, response):
        yield {'test': 1}
        yield {'test': 2}
        raise ImportError()


# ================================================================================
# (3) exceptions from a spider callback (not a generator)
class NotGeneratorCallbackSpider(Spider):
    name = 'NotGeneratorCallbackSpider'
    custom_settings = {
        'SPIDER_MIDDLEWARES': {
            __name__ + '.LogExceptionMiddleware': 10,
        },
    }

    def start_requests(self):
        yield Request(self.mockserver.url('/status?n=200'))

    def parse(self, response):
        return [{'test': 1}, {'test': 1/0}]


# ================================================================================
# (4) exceptions from a middleware process_spider_output method (generator)
class GeneratorOutputChainSpider(Spider):
    name = 'GeneratorOutputChainSpider'
    custom_settings = {
        'SPIDER_MIDDLEWARES': {
            __name__ + '.GeneratorFailMiddleware': 10,
            __name__ + '.GeneratorDoNothingAfterFailureMiddleware': 8,
            __name__ + '.GeneratorRecoverMiddleware': 5,
            __name__ + '.GeneratorDoNothingAfterRecoveryMiddleware': 3,
        },
    }

    def start_requests(self):
        yield Request(self.mockserver.url('/status?n=200'))

    def parse(self, response):
        yield {'processed': ['parse-first-item']}
        yield {'processed': ['parse-second-item']}


class _GeneratorDoNothingMiddleware:
    def process_spider_output(self, response, result, spider):
        for r in result:
            r['processed'].append('{}.process_spider_output'.format(self.__class__.__name__))
            yield r

    def process_spider_exception(self, response, exception, spider):
        method = '{}.process_spider_exception'.format(self.__class__.__name__)
        spider.logger.info('%s: %s caught', method, exception.__class__.__name__)
        return None


class GeneratorFailMiddleware:
    def process_spider_output(self, response, result, spider):
        for r in result:
            r['processed'].append('{}.process_spider_output'.format(self.__class__.__name__))
            yield r
            raise LookupError()

    def process_spider_exception(self, response, exception, spider):
        method = '{}.process_spider_exception'.format(self.__class__.__name__)
        spider.logger.info('%s: %s caught', method, exception.__class__.__name__)
        yield {'processed': [method]}


class GeneratorDoNothingAfterFailureMiddleware(_GeneratorDoNothingMiddleware):
    pass


class GeneratorRecoverMiddleware:
    def process_spider_output(self, response, result, spider):
        for r in result:
            r['processed'].append('{}.process_spider_output'.format(self.__class__.__name__))
            yield r

    def process_spider_exception(self, response, exception, spider):
        method = '{}.process_spider_exception'.format(self.__class__.__name__)
        spider.logger.info('%s: %s caught', method, exception.__class__.__name__)
        yield {'processed': [method]}


class GeneratorDoNothingAfterRecoveryMiddleware(_GeneratorDoNothingMiddleware):
    pass


# ================================================================================
# (5) exceptions from a middleware process_spider_output method (not generator)
class NotGeneratorOutputChainSpider(Spider):
    name = 'NotGeneratorOutputChainSpider'
    custom_settings = {
        'SPIDER_MIDDLEWARES': {
            __name__ + '.NotGeneratorFailMiddleware': 10,
            __name__ + '.NotGeneratorDoNothingAfterFailureMiddleware': 8,
            __name__ + '.NotGeneratorRecoverMiddleware': 5,
            __name__ + '.NotGeneratorDoNothingAfterRecoveryMiddleware': 3,
        },
    }

    def start_requests(self):
        return [Request(self.mockserver.url('/status?n=200'))]

    def parse(self, response):
        return [{'processed': ['parse-first-item']}, {'processed': ['parse-second-item']}]


class _NotGeneratorDoNothingMiddleware:
    def process_spider_output(self, response, result, spider):
        out = []
        for r in result:
            r['processed'].append('{}.process_spider_output'.format(self.__class__.__name__))
            out.append(r)
        return out

    def process_spider_exception(self, response, exception, spider):
        method = '{}.process_spider_exception'.format(self.__class__.__name__)
        spider.logger.info('%s: %s caught', method, exception.__class__.__name__)
        return None


class NotGeneratorFailMiddleware:
    def process_spider_output(self, response, result, spider):
        out = []
        for r in result:
            r['processed'].append('{}.process_spider_output'.format(self.__class__.__name__))
            out.append(r)
        raise ReferenceError()
        return out

    def process_spider_exception(self, response, exception, spider):
        method = '{}.process_spider_exception'.format(self.__class__.__name__)
        spider.logger.info('%s: %s caught', method, exception.__class__.__name__)
        return [{'processed': [method]}]


class NotGeneratorDoNothingAfterFailureMiddleware(_NotGeneratorDoNothingMiddleware):
    pass


class NotGeneratorRecoverMiddleware:
    def process_spider_output(self, response, result, spider):
        out = []
        for r in result:
            r['processed'].append('{}.process_spider_output'.format(self.__class__.__name__))
            out.append(r)
        return out

    def process_spider_exception(self, response, exception, spider):
        method = '{}.process_spider_exception'.format(self.__class__.__name__)
        spider.logger.info('%s: %s caught', method, exception.__class__.__name__)
        return [{'processed': [method]}]


class NotGeneratorDoNothingAfterRecoveryMiddleware(_NotGeneratorDoNothingMiddleware):
    pass
# ================================================================================
class TestSpiderMiddleware(TestCase):
    @classmethod
    def setUpClass(cls):
        cls.mockserver = MockServer()
        cls.mockserver.__enter__()

    @classmethod
    def tearDownClass(cls):
        cls.mockserver.__exit__(None, None, None)

    @defer.inlineCallbacks
    def crawl_log(self, spider):
        crawler = get_crawler(spider)
        with LogCapture() as log:
            yield crawler.crawl(mockserver=self.mockserver)
        defer.returnValue(log)

    @defer.inlineCallbacks
    def test_recovery(self):
        """
        (0) Recover from an exception in a spider callback. The final item count should be 3
        (one yielded from the callback method before the exception is raised, one directly
        from the recovery middleware and one from the spider when processing the request that
        was enqueued from the recovery middleware)
        """
        log = yield self.crawl_log(RecoverySpider)
        self.assertIn("Middleware: TabError exception caught", str(log))
        self.assertEqual(str(log).count("Middleware: TabError exception caught"), 1)
        self.assertIn("'item_scraped_count': 3", str(log))

    @defer.inlineCallbacks
    def test_process_spider_input_without_errback(self):
        """
        (1.1) An exception from the process_spider_input chain should be caught by the
        process_spider_exception chain from the start if the Request has no errback
        """
        log1 = yield self.crawl_log(ProcessSpiderInputSpiderWithoutErrback)
        self.assertIn("Middleware: will raise IndexError", str(log1))
        self.assertIn("Middleware: IndexError exception caught", str(log1))

    @defer.inlineCallbacks
    def test_process_spider_input_with_errback(self):
        """
        (1.2) An exception from the process_spider_input chain should not be caught by the
        process_spider_exception chain if the Request has an errback
        """
        log1 = yield self.crawl_log(ProcessSpiderInputSpiderWithErrback)
        self.assertNotIn("Middleware: IndexError exception caught", str(log1))
        self.assertIn("Middleware: will raise IndexError", str(log1))
        self.assertIn("Got a Failure on the Request errback", str(log1))
        self.assertIn("{'from': 'errback'}", str(log1))
        self.assertNotIn("{'from': 'callback'}", str(log1))
        self.assertIn("'item_scraped_count': 1", str(log1))

    @defer.inlineCallbacks
    def test_generator_callback(self):
        """
        (2) An exception from a spider callback (returning a generator) should
        be caught by the process_spider_exception chain. Items yielded before the
        exception is raised should be processed normally.
        """
        log2 = yield self.crawl_log(GeneratorCallbackSpider)
        self.assertIn("Middleware: ImportError exception caught", str(log2))
        self.assertIn("'item_scraped_count': 2", str(log2))

    @defer.inlineCallbacks
    def test_not_a_generator_callback(self):
        """
        (3) An exception from a spider callback (returning a list) should
        be caught by the process_spider_exception chain. No items should be processed.
        """
        log3 = yield self.crawl_log(NotGeneratorCallbackSpider)
        self.assertIn("Middleware: ZeroDivisionError exception caught", str(log3))
        self.assertNotIn("item_scraped_count", str(log3))

    @defer.inlineCallbacks
    def test_generator_output_chain(self):
        """
        (4) An exception from a middleware's process_spider_output method should be sent
        to the process_spider_exception method from the next middleware in the chain.
        The result of the recovery by the process_spider_exception method should be handled
        by the process_spider_output method from the next middleware.
        The final item count should be 2 (one from the spider callback and one from the
        process_spider_exception chain)
        """
        log4 = yield self.crawl_log(GeneratorOutputChainSpider)
        self.assertIn("'item_scraped_count': 2", str(log4))
        self.assertIn("GeneratorRecoverMiddleware.process_spider_exception: LookupError caught", str(log4))
        self.assertIn("GeneratorDoNothingAfterFailureMiddleware.process_spider_exception: LookupError caught", str(log4))
        self.assertNotIn("GeneratorFailMiddleware.process_spider_exception: LookupError caught", str(log4))
        self.assertNotIn("GeneratorDoNothingAfterRecoveryMiddleware.process_spider_exception: LookupError caught", str(log4))
        item_from_callback = {'processed': [
            'parse-first-item',
            'GeneratorFailMiddleware.process_spider_output',
            'GeneratorDoNothingAfterFailureMiddleware.process_spider_output',
            'GeneratorRecoverMiddleware.process_spider_output',
            'GeneratorDoNothingAfterRecoveryMiddleware.process_spider_output']}
        item_recovered = {'processed': [
            'GeneratorRecoverMiddleware.process_spider_exception',
            'GeneratorDoNothingAfterRecoveryMiddleware.process_spider_output']}
        self.assertIn(str(item_from_callback), str(log4))
        self.assertIn(str(item_recovered), str(log4))
        self.assertNotIn('parse-second-item', str(log4))

    @defer.inlineCallbacks
    def test_not_a_generator_output_chain(self):
        """
        (5) An exception from a middleware's process_spider_output method should be sent
        to the process_spider_exception method from the next middleware in the chain.
        The result of the recovery by the process_spider_exception method should be handled
        by the process_spider_output method from the next middleware.
        The final item count should be 1 (from the process_spider_exception chain, the items
        from the spider callback are lost)
        """
        log5 = yield self.crawl_log(NotGeneratorOutputChainSpider)
        self.assertIn("'item_scraped_count': 1", str(log5))
        self.assertIn("NotGeneratorRecoverMiddleware.process_spider_exception: ReferenceError caught", str(log5))
        self.assertIn("NotGeneratorDoNothingAfterFailureMiddleware.process_spider_exception: ReferenceError caught", str(log5))
        self.assertNotIn("NotGeneratorFailMiddleware.process_spider_exception: ReferenceError caught", str(log5))
        self.assertNotIn("NotGeneratorDoNothingAfterRecoveryMiddleware.process_spider_exception: ReferenceError caught", str(log5))
        item_recovered = {'processed': [
            'NotGeneratorRecoverMiddleware.process_spider_exception',
            'NotGeneratorDoNothingAfterRecoveryMiddleware.process_spider_output']}
        self.assertIn(str(item_recovered), str(log5))
        self.assertNotIn('parse-first-item', str(log5))
        self.assertNotIn('parse-second-item', str(log5))
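Taken together, the tests above pin down the contract a user-written spider middleware can rely on: an exception raised in a spider callback or in a later middleware's process_spider_output is offered to the process_spider_exception hooks of the remaining middlewares, and returning an iterable from that hook stops the exception and resumes the output chain. As a rough illustration of how a project might use that contract (the class name, settings entry and item fields below are hypothetical, not part of this patch):

class ErrorItemMiddleware:
    # Hypothetical example: turn unhandled callback errors into placeholder
    # items instead of silently losing the response. Enabled via e.g.
    # SPIDER_MIDDLEWARES = {'myproject.middlewares.ErrorItemMiddleware': 10}

    def process_spider_exception(self, response, exception, spider):
        spider.logger.warning('Recovering from %s while parsing %s',
                              exception.__class__.__name__, response.url)
        # Returning an iterable (rather than None) marks the exception as
        # handled and feeds these results to the remaining middlewares.
        return [{'error': exception.__class__.__name__, 'url': response.url}]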
@ -3,12 +3,13 @@ import os
import unittest

from scrapy.item import Item, Field
from scrapy.utils.misc import arg_to_iter, create_instance, load_object, walk_modules
from scrapy.utils.misc import arg_to_iter, create_instance, load_object, set_environ, walk_modules

from tests import mock

__doctests__ = ['scrapy.utils.misc']


class UtilsMiscTestCase(unittest.TestCase):

    def test_load_object(self):
@ -130,5 +131,18 @@ class UtilsMiscTestCase(unittest.TestCase):
        with self.assertRaises(ValueError):
            create_instance(m, None, None)

    def test_set_environ(self):
        assert os.environ.get('some_test_environ') is None
        with set_environ(some_test_environ='test_value'):
            assert os.environ.get('some_test_environ') == 'test_value'
        assert os.environ.get('some_test_environ') is None

        os.environ['some_test_environ'] = 'test'
        assert os.environ.get('some_test_environ') == 'test'
        with set_environ(some_test_environ='test_value'):
            assert os.environ.get('some_test_environ') == 'test_value'
        assert os.environ.get('some_test_environ') == 'test'


if __name__ == "__main__":
    unittest.main()
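For context, set_environ is a context manager in scrapy.utils.misc that applies environment-variable overrides only for the duration of the with block and then restores the previous state, which is exactly what the test above exercises. A minimal usage sketch (the variable name is made up for illustration):

import os

from scrapy.utils.misc import set_environ

assert 'EXAMPLE_VAR' not in os.environ  # assumption for this sketch
with set_environ(EXAMPLE_VAR='1'):
    assert os.environ['EXAMPLE_VAR'] == '1'
assert 'EXAMPLE_VAR' not in os.environ  # previous state restored on exit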
@ -9,11 +9,23 @@ import six
from scrapy.utils.python import (
    memoizemethod_noargs, binary_is_text, equal_attributes,
    WeakKeyCache, stringify_dict, get_func_args, to_bytes, to_unicode,
    without_none_values)
    without_none_values, MutableChain)

__doctests__ = ['scrapy.utils.python']


class MutableChainTest(unittest.TestCase):
    def test_mutablechain(self):
        m = MutableChain(range(2), [2, 3], (4, 5))
        m.extend(range(6, 7))
        m.extend([7, 8])
        m.extend([9, 10], (11, 12))
        self.assertEqual(next(m), 0)
        self.assertEqual(m.next(), 1)
        self.assertEqual(m.__next__(), 2)
        self.assertEqual(list(m), list(range(3, 13)))


class ToUnicodeTest(unittest.TestCase):
    def test_converting_an_utf8_encoded_string_to_unicode(self):
        self.assertEqual(to_unicode(b'lel\xc3\xb1e'), u'lel\xf1e')
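MutableChain, added to scrapy.utils.python, behaves like itertools.chain but also accepts further iterables via extend() while it is being consumed; it appears to be the building block that lets results recovered by process_spider_exception be appended to the spider output chain. A short illustrative sketch:

from scrapy.utils.python import MutableChain

chain = MutableChain([1, 2], (3,))
assert next(chain) == 1
chain.extend([4, 5])  # more iterables can be appended after iteration has started
assert list(chain) == [2, 3, 4, 5]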
@ -1,9 +1,12 @@
# -*- coding: utf-8 -*-
import unittest
import sys

import six

from scrapy.http import Request, FormRequest
from scrapy.spiders import Spider
from scrapy.utils.reqser import request_to_dict, request_from_dict
from scrapy.utils.reqser import request_to_dict, request_from_dict, _is_private_method, _mangle_private_name


class RequestSerializationTest(unittest.TestCase):
@ -70,6 +73,56 @@ class RequestSerializationTest(unittest.TestCase):
                    errback=self.spider.handle_error)
        self._assert_serializes_ok(r, spider=self.spider)

    def test_private_callback_serialization(self):
        r = Request("http://www.example.com",
                    callback=self.spider._TestSpider__parse_item_private,
                    errback=self.spider.handle_error)
        self._assert_serializes_ok(r, spider=self.spider)

    def test_mixin_private_callback_serialization(self):
        if sys.version_info[0] < 3:
            return
        r = Request("http://www.example.com",
                    callback=self.spider._TestSpiderMixin__mixin_callback,
                    errback=self.spider.handle_error)
        self._assert_serializes_ok(r, spider=self.spider)

    def test_private_callback_name_matching(self):
        self.assertTrue(_is_private_method('__a'))
        self.assertTrue(_is_private_method('__a_'))
        self.assertTrue(_is_private_method('__a_a'))
        self.assertTrue(_is_private_method('__a_a_'))
        self.assertTrue(_is_private_method('__a__a'))
        self.assertTrue(_is_private_method('__a__a_'))
        self.assertTrue(_is_private_method('__a___a'))
        self.assertTrue(_is_private_method('__a___a_'))
        self.assertTrue(_is_private_method('___a'))
        self.assertTrue(_is_private_method('___a_'))
        self.assertTrue(_is_private_method('___a_a'))
        self.assertTrue(_is_private_method('___a_a_'))
        self.assertTrue(_is_private_method('____a_a_'))

        self.assertFalse(_is_private_method('_a'))
        self.assertFalse(_is_private_method('_a_'))
        self.assertFalse(_is_private_method('__a__'))
        self.assertFalse(_is_private_method('__'))
        self.assertFalse(_is_private_method('___'))
        self.assertFalse(_is_private_method('____'))

    def _assert_mangles_to(self, obj, name):
        func = getattr(obj, name)
        self.assertEqual(
            _mangle_private_name(obj, func, func.__name__),
            name
        )

    def test_private_name_mangling(self):
        self._assert_mangles_to(
            self.spider, '_TestSpider__parse_item_private')
        if sys.version_info[0] >= 3:
            self._assert_mangles_to(
                self.spider, '_TestSpiderMixin__mixin_callback')

    def test_unserializable_callback1(self):
        r = Request("http://www.example.com", callback=lambda x: x)
        self.assertRaises(ValueError, request_to_dict, r)
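The _is_private_method and _mangle_private_name helpers deal with Python's name mangling: an identifier with at least two leading underscores and at most one trailing underscore, when defined inside a class body, is stored as _ClassName__name, so a private callback has to be looked up under its mangled name when a request is deserialized. A quick refresher, independent of Scrapy (class and method names are illustrative):

class ExampleSpider:
    def __parse_secret(self, response):  # stored as _ExampleSpider__parse_secret
        pass

assert '_ExampleSpider__parse_secret' in ExampleSpider.__dict__
assert '__parse_secret' not in ExampleSpider.__dict__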
@ -80,7 +133,12 @@ class RequestSerializationTest(unittest.TestCase):
        self.assertRaises(ValueError, request_to_dict, r)


class TestSpider(Spider):
class TestSpiderMixin(object):
    def __mixin_callback(self, response):
        pass


class TestSpider(Spider, TestSpiderMixin):
    name = 'test'

    def parse_item(self, response):
@ -89,6 +147,9 @@ class TestSpider(Spider):
    def handle_error(self, failure):
        pass

    def __parse_item_private(self, response):
        pass


class CustomRequest(Request):
    pass
6
tox.ini
@ -105,6 +105,12 @@ deps = {[docs]deps}
commands =
    sphinx-build -W -b html . {envtmpdir}/html

[testenv:docs-coverage]
changedir = {[docs]changedir}
deps = {[docs]deps}
commands =
    sphinx-build -b coverage . {envtmpdir}/coverage

[testenv:docs-links]
changedir = {[docs]changedir}
deps = {[docs]deps}