Mirror of https://github.com/scrapy/scrapy.git

Merge branch 'master' into azure-pipelines

Commit 6e58da1dcd by Adrián Chaves, 2020-07-02 17:49:42 +02:00 (committed via GitHub)
210 changed files with 4961 additions and 2528 deletions


@@ -1,5 +1,5 @@
 [bumpversion]
-current_version = 2.0.0
+current_version = 2.2.0
 commit = True
 tag = True
 tag_name = {new_version}

.gitignore

@@ -15,6 +15,7 @@ htmlcov/
 .pytest_cache/
 .coverage.*
 .cache/
+.mypy_cache/
 # Windows
 Thumbs.db


@@ -1,4 +1,5 @@
 version: 2
+formats: all
 sphinx:
   configuration: docs/conf.py
   fail_on_warning: true


@@ -11,25 +11,35 @@ matrix:
       python: 3.8
     - env: TOXENV=flake8
       python: 3.8
-    - env: TOXENV=pypy3
-    - env: TOXENV=py35
-      python: 3.5
-    - env: TOXENV=pinned
-      python: 3.5
-    - env: TOXENV=py35-asyncio
-      python: 3.5.2
-    - env: TOXENV=py36
-      python: 3.6
-    - env: TOXENV=py37
-      python: 3.7
-    - env: TOXENV=py38
-      python: 3.8
-    - env: TOXENV=extra-deps
-      python: 3.8
-    - env: TOXENV=py38-asyncio
+    - env: TOXENV=pylint
       python: 3.8
     - env: TOXENV=docs
       python: 3.7  # Keep in sync with .readthedocs.yml
+    - env: TOXENV=typing
+      python: 3.8
+    - env: TOXENV=pypy3
+    - env: TOXENV=pinned
+      python: 3.5.2
+    - env: TOXENV=asyncio
+      python: 3.5.2  # We use additional code to support 3.5.3 and earlier
+    - env: TOXENV=py
+      python: 3.5
+    - env: TOXENV=asyncio
+      python: 3.5  # We use specific code to support >= 3.5.4, < 3.6
+    - env: TOXENV=py
+      python: 3.6
+    - env: TOXENV=py
+      python: 3.7
+    - env: TOXENV=py PYPI_RELEASE_JOB=true
+      python: 3.8
+      dist: bionic
+    - env: TOXENV=extra-deps
+      python: 3.8
+      dist: bionic
+    - env: TOXENV=asyncio
+      python: 3.8
+      dist: bionic
 install:
   - |
     if [ "$TOXENV" = "pypy3" ]; then
@@ -62,4 +72,4 @@ deploy:
   on:
     tags: true
     repo: scrapy/scrapy
-    condition: "$TOXENV == py37 && $TRAVIS_TAG =~ ^[0-9]+[.][0-9]+[.][0-9]+(rc[0-9]+|[.]dev[0-9]+)?$"
+    condition: "$PYPI_RELEASE_JOB == true && $TRAVIS_TAG =~ ^[0-9]+[.][0-9]+[.][0-9]+(rc[0-9]+|[.]dev[0-9]+)?$"


@@ -40,7 +40,7 @@ including a list of features.
 Requirements
 ============

-* Python 3.5+
+* Python 3.5.2+
 * Works on Linux, Windows, macOS, BSD

 Install


@@ -12,6 +12,8 @@ collect_ignore = [
     "scrapy/utils/testsite.py",
     # contains scripts to be run by tests/test_crawler.py::CrawlerProcessSubprocess
     *_py_files("tests/CrawlerProcess"),
+    # contains scripts to be run by tests/test_crawler.py::CrawlerRunnerSubprocess
+    *_py_files("tests/CrawlerRunner"),
     # Py36-only parts of respective tests
     *_py_files("tests/py36"),
 ]


@@ -57,3 +57,12 @@ There is a way to recreate the doc automatically when you make changes, you
 need to install watchdog (``pip install watchdog``) and then use::

     make watch
+
+Alternative method using tox
+----------------------------
+
+To compile the documentation to HTML run the following command::
+
+    tox -e docs
+
+Documentation will be generated (in HTML format) inside the ``.tox/docs/tmp/html`` dir.


@@ -1,5 +1,3 @@
-# -*- coding: utf-8 -*-
-#
 # Scrapy documentation build configuration file, created by
 # sphinx-quickstart on Mon Nov 24 12:02:52 2008.
 #
@@ -102,6 +100,9 @@ exclude_trees = ['.build']
 # The name of the Pygments (syntax highlighting) style to use.
 pygments_style = 'sphinx'

+# List of Sphinx warnings that will not be raised
+suppress_warnings = ['epub.unknown_project_files']
+
 # Options for HTML output
 # -----------------------
@@ -280,6 +281,7 @@ coverage_ignore_pyobjects = [
 # -------------------------------------

 intersphinx_mapping = {
+    'attrs': ('https://www.attrs.org/en/stable/', None),
     'coverage': ('https://coverage.readthedocs.io/en/stable', None),
     'cssselect': ('https://cssselect.readthedocs.io/en/latest', None),
     'pytest': ('https://docs.pytest.org/en/latest', None),
@@ -295,3 +297,11 @@ intersphinx_mapping = {
 # ------------------------------------

 hoverxref_auto_ref = True
+hoverxref_role_types = {
+    "class": "tooltip",
+    "confval": "tooltip",
+    "hoverxref": "tooltip",
+    "mod": "tooltip",
+    "ref": "tooltip",
+}
+hoverxref_roles = ['command', 'reqmeta', 'setting', 'signal']


@@ -155,6 +155,9 @@ Finally, try to keep aesthetic changes (:pep:`8` compliance, unused imports
 removal, etc) in separate commits from functional changes. This will make pull
 requests easier to review and more likely to get merged.

+.. _coding-style:
+
 Coding style
 ============
@@ -163,7 +166,7 @@ Scrapy:

 * Unless otherwise specified, follow :pep:`8`.

-* It's OK to use lines longer than 80 chars if it improves the code
+* It's OK to use lines longer than 79 chars if it improves the code
   readability.

 * Don't put your name in the code you contribute; git provides enough


@@ -69,7 +69,7 @@ Here's an example spider using BeautifulSoup API, with ``lxml`` as the HTML pars
 What Python versions does Scrapy support?
 -----------------------------------------

-Scrapy is supported under Python 3.5+
+Scrapy is supported under Python 3.5.2+
 under CPython (default Python implementation) and PyPy (starting with PyPy 5.9).
 Python 3 support was added in Scrapy 1.1.
 PyPy support was added in Scrapy 1.4, PyPy3 support was added in Scrapy 1.5.
@@ -342,15 +342,15 @@ method for this purpose. For example::

     from copy import deepcopy

-    from scrapy.item import BaseItem
+    from itemadapter import is_item, ItemAdapter

     class MultiplyItemsMiddleware:

         def process_spider_output(self, response, result, spider):
             for item in result:
-                if isinstance(item, (BaseItem, dict)):
-                    for _ in range(item['multiply_by']):
+                if is_item(item):
+                    adapter = ItemAdapter(item)
+                    for _ in range(adapter['multiply_by']):
                         yield deepcopy(item)

 Does Scrapy support IPv6 addresses?
@@ -371,6 +371,19 @@ Twisted reactor is :class:`twisted.internet.selectreactor.SelectReactor`. Switch
 different reactor is possible by using the :setting:`TWISTED_REACTOR` setting.
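For example, switching to the asyncio-based reactor is a one-line settings change; this is a sketch, with the class path below being the asyncio reactor shipped with Twisted::

    # settings.py
    TWISTED_REACTOR = "twisted.internet.asyncioreactor.AsyncioSelectorReactor"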
.. _faq-stop-response-download:
How can I cancel the download of a given response?
--------------------------------------------------
In some situations, it might be useful to stop the download of a certain response.
For instance, if you only need the first part of a large response and you would like
to save resources by avoiding the download of the whole body.
In that case, you could attach a handler to the :class:`~scrapy.signals.bytes_received`
signal and raise a :exc:`~scrapy.exceptions.StopDownload` exception. Please refer to
the :ref:`topics-stop-response-download` topic for additional information and examples.
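A minimal sketch of that approach (the spider name and URL are placeholders): connect a handler in ``from_crawler`` and raise the exception once enough data has arrived; with ``fail=False`` the partial response is delivered to the regular callback::

    import scrapy
    from scrapy import signals
    from scrapy.exceptions import StopDownload

    class FirstChunkSpider(scrapy.Spider):
        name = "first_chunk"
        start_urls = ["https://example.com"]

        @classmethod
        def from_crawler(cls, crawler, *args, **kwargs):
            spider = super().from_crawler(crawler, *args, **kwargs)
            crawler.signals.connect(spider.on_bytes_received,
                                    signal=signals.bytes_received)
            return spider

        def on_bytes_received(self, data, request, spider):
            # Stop after the first chunk of body data has been received
            raise StopDownload(fail=False)

        def parse(self, response):
            # response.body holds only the bytes received so far
            self.logger.info("Got %d bytes", len(response.body))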
.. _has been reported: https://github.com/scrapy/scrapy/issues/2905
.. _user agents: https://en.wikipedia.org/wiki/User_agent
.. _LIFO: https://en.wikipedia.org/wiki/Stack_(abstract_data_type)


@@ -7,7 +7,7 @@ Installation guide
 Installing Scrapy
 =================

-Scrapy runs on Python 3.5 or above under CPython (default Python
+Scrapy runs on Python 3.5.2 or above under CPython (default Python
 implementation) and PyPy (starting with PyPy 5.9).

 If you're using `Anaconda`_ or `Miniconda`_, you can install the package from


@@ -25,16 +25,16 @@ Scrapy.
 If you're already familiar with other languages, and want to learn Python quickly, the `Python Tutorial`_ is a good resource.

 If you're new to programming and want to start with Python, the following books
 may be useful to you:

 * `Automate the Boring Stuff With Python`_
 * `How To Think Like a Computer Scientist`_
 * `Learn Python 3 The Hard Way`_

 You can also take a look at `this list of Python resources for non-programmers`_,
 as well as the `suggested resources in the learnpython-subreddit`_.

 .. _Python: https://www.python.org/
 .. _this list of Python resources for non-programmers: https://wiki.python.org/moin/BeginnersGuide/NonProgrammers
@@ -62,7 +62,7 @@ This will create a ``tutorial`` directory with the following contents::
         __init__.py
         items.py          # project items definition file
         middlewares.py    # project middlewares file
         pipelines.py      # project pipelines file
@@ -287,8 +287,8 @@ to be scraped, you can at least get **some** data.
 Besides the :meth:`~scrapy.selector.SelectorList.getall` and
 :meth:`~scrapy.selector.SelectorList.get` methods, you can also use
-the :meth:`~scrapy.selector.SelectorList.re` method to extract using `regular
-expressions`_:
+the :meth:`~scrapy.selector.SelectorList.re` method to extract using
+:doc:`regular expressions <library/re>`:

 >>> response.css('title::text').re(r'Quotes.*')
 ['Quotes to Scrape']
@@ -305,7 +305,6 @@ with a selector (see :ref:`topics-developer-tools`).
 `Selector Gadget`_ is also a nice tool to quickly find CSS selector for
 visually selected elements, which works in many browsers.

-.. _regular expressions: https://docs.python.org/3/library/re.html
 .. _Selector Gadget: https://selectorgadget.com/


@@ -3,6 +3,348 @@
Release notes
=============
.. _release-2.2.0:
Scrapy 2.2.0 (2020-06-24)
-------------------------
Highlights:
* Python 3.5.2+ is required now
* :ref:`dataclass objects <dataclass-items>` and
:ref:`attrs objects <attrs-items>` are now valid :ref:`item types
<item-types>`
* New :meth:`TextResponse.json <scrapy.http.TextResponse.json>` method
* New :signal:`bytes_received` signal that allows canceling response download
* :class:`~scrapy.downloadermiddlewares.cookies.CookiesMiddleware` fixes
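For instance, the new ``TextResponse.json`` highlight above removes the need to call :func:`json.loads` yourself; a sketch with a made-up endpoint and field names::

    def parse(self, response):
        data = response.json()  # shortcut for json.loads(response.text)
        for result in data.get("results", []):
            yield {"name": result.get("name")}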
Backward-incompatible changes
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
* Support for Python 3.5.0 and 3.5.1 has been dropped; Scrapy now refuses to
run with a Python version lower than 3.5.2, which introduced
:class:`typing.Type` (:issue:`4615`)
Deprecations
~~~~~~~~~~~~
* :meth:`TextResponse.body_as_unicode
<scrapy.http.TextResponse.body_as_unicode>` is now deprecated, use
:attr:`TextResponse.text <scrapy.http.TextResponse.text>` instead
(:issue:`4546`, :issue:`4555`, :issue:`4579`)
* :class:`scrapy.item.BaseItem` is now deprecated, use
:class:`scrapy.item.Item` instead (:issue:`4534`)
New features
~~~~~~~~~~~~
* :ref:`dataclass objects <dataclass-items>` and
:ref:`attrs objects <attrs-items>` are now valid :ref:`item types
<item-types>`, and a new itemadapter_ library makes it easy to
write code that :ref:`supports any item type <supporting-item-types>`
(:issue:`2749`, :issue:`2807`, :issue:`3761`, :issue:`3881`, :issue:`4642`)
* A new :meth:`TextResponse.json <scrapy.http.TextResponse.json>` method
allows you to deserialize JSON responses (:issue:`2444`, :issue:`4460`,
:issue:`4574`)
* A new :signal:`bytes_received` signal allows monitoring response download
progress and :ref:`stopping downloads <topics-stop-response-download>`
(:issue:`4205`, :issue:`4559`)
* The dictionaries in the result list of a :ref:`media pipeline
<topics-media-pipeline>` now include a new key, ``status``, which indicates
if the file was downloaded or, if the file was not downloaded, why it was
not downloaded; see :meth:`FilesPipeline.get_media_requests
<scrapy.pipelines.files.FilesPipeline.get_media_requests>` for more
information (:issue:`2893`, :issue:`4486`)
* When using :ref:`Google Cloud Storage <media-pipeline-gcs>` for
a :ref:`media pipeline <topics-media-pipeline>`, a warning is now logged if
the configured credentials do not grant the required permissions
(:issue:`4346`, :issue:`4508`)
* :ref:`Link extractors <topics-link-extractors>` are now serializable,
as long as you do not use :ref:`lambdas <lambda>` for parameters; for
example, you can now pass link extractors in :attr:`Request.cb_kwargs
<scrapy.http.Request.cb_kwargs>` or
:attr:`Request.meta <scrapy.http.Request.meta>` when :ref:`persisting
scheduled requests <topics-jobs>` (:issue:`4554`)
* Upgraded the :ref:`pickle protocol <pickle-protocols>` that Scrapy uses
from protocol 2 to protocol 4, improving serialization capabilities and
performance (:issue:`4135`, :issue:`4541`)
* :func:`scrapy.utils.misc.create_instance` now raises a :exc:`TypeError`
exception if the resulting instance is ``None`` (:issue:`4528`,
:issue:`4532`)
.. _itemadapter: https://github.com/scrapy/itemadapter
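As a sketch of the new item types mentioned above (class and field names are illustrative), a :mod:`dataclasses` object can be yielded like any other item and handled generically through itemadapter_::

    from dataclasses import dataclass

    from itemadapter import ItemAdapter

    @dataclass
    class BookItem:
        title: str
        price: float

    class RoundPricePipeline:
        def process_item(self, item, spider):
            # ItemAdapter works the same for dicts, Item subclasses,
            # dataclass objects and attrs objects
            adapter = ItemAdapter(item)
            adapter["price"] = round(adapter["price"], 2)
            return item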
Bug fixes
~~~~~~~~~
* :class:`~scrapy.downloadermiddlewares.cookies.CookiesMiddleware` no longer
discards cookies defined in :attr:`Request.headers
<scrapy.http.Request.headers>` (:issue:`1992`, :issue:`2400`)
* :class:`~scrapy.downloadermiddlewares.cookies.CookiesMiddleware` no longer
re-encodes cookies defined as :class:`bytes` in the ``cookies`` parameter
of the ``__init__`` method of :class:`~scrapy.http.Request`
(:issue:`2400`, :issue:`3575`)
* When :setting:`FEEDS` defines multiple URIs, :setting:`FEED_STORE_EMPTY` is
``False`` and the crawl yields no items, Scrapy no longer stops feed
exports after the first URI (:issue:`4621`, :issue:`4626`)
* :class:`~scrapy.spiders.Spider` callbacks defined using :doc:`coroutine
syntax <topics/coroutines>` no longer need to return an iterable, and may
instead return a :class:`~scrapy.http.Request` object, an
:ref:`item <topics-items>`, or ``None`` (:issue:`4609`)
* The :command:`startproject` command now ensures that the generated project
folders and files have the right permissions (:issue:`4604`)
* Fix a :exc:`KeyError` exception being sometimes raised from
:class:`scrapy.utils.datatypes.LocalWeakReferencedCache` (:issue:`4597`,
:issue:`4599`)
* When :setting:`FEEDS` defines multiple URIs, log messages about items being
stored now contain information from the corresponding feed, instead of
always containing information about only one of the feeds (:issue:`4619`,
:issue:`4629`)
Documentation
~~~~~~~~~~~~~
* Added a new section about :ref:`accessing cb_kwargs from errbacks
<errback-cb_kwargs>` (:issue:`4598`, :issue:`4634`)
* Covered chompjs_ in :ref:`topics-parsing-javascript` (:issue:`4556`,
:issue:`4562`)
* Removed from :doc:`topics/coroutines` the warning about the API being
experimental (:issue:`4511`, :issue:`4513`)
* Removed references to unsupported versions of :doc:`Twisted
<twisted:index>` (:issue:`4533`)
* Updated the description of the :ref:`screenshot pipeline example
<ScreenshotPipeline>`, which now uses :doc:`coroutine syntax
<topics/coroutines>` instead of returning a
:class:`~twisted.internet.defer.Deferred` (:issue:`4514`, :issue:`4593`)
* Removed a misleading import line from the
:func:`scrapy.utils.log.configure_logging` code example (:issue:`4510`,
:issue:`4587`)
* The display-on-hover behavior of internal documentation references now also
covers links to :ref:`commands <topics-commands>`, :attr:`Request.meta
<scrapy.http.Request.meta>` keys, :ref:`settings <topics-settings>` and
:ref:`signals <topics-signals>` (:issue:`4495`, :issue:`4563`)
* It is again possible to download the documentation for offline reading
(:issue:`4578`, :issue:`4585`)
* Removed backslashes preceding ``*args`` and ``**kwargs`` in some function
and method signatures (:issue:`4592`, :issue:`4596`)
.. _chompjs: https://github.com/Nykakin/chompjs
Quality assurance
~~~~~~~~~~~~~~~~~
* Adjusted the code base further to our :ref:`style guidelines
<coding-style>` (:issue:`4237`, :issue:`4525`, :issue:`4538`,
:issue:`4539`, :issue:`4540`, :issue:`4542`, :issue:`4543`, :issue:`4544`,
:issue:`4545`, :issue:`4557`, :issue:`4558`, :issue:`4566`, :issue:`4568`,
:issue:`4572`)
* Removed remnants of Python 2 support (:issue:`4550`, :issue:`4553`,
:issue:`4568`)
* Improved code sharing between the :command:`crawl` and :command:`runspider`
commands (:issue:`4548`, :issue:`4552`)
* Replaced ``chain(*iterable)`` with ``chain.from_iterable(iterable)``
(:issue:`4635`)
* You may now run the :mod:`asyncio` tests with Tox on any Python version
(:issue:`4521`)
* Updated test requirements to reflect an incompatibility with pytest 5.4 and
5.4.1 (:issue:`4588`)
* Improved :class:`~scrapy.spiderloader.SpiderLoader` test coverage for
scenarios involving duplicate spider names (:issue:`4549`, :issue:`4560`)
* Configured Travis CI to also run the tests with Python 3.5.2
(:issue:`4518`, :issue:`4615`)
* Added a `Pylint <https://www.pylint.org/>`_ job to Travis CI
(:issue:`3727`)
* Added a `Mypy <http://mypy-lang.org/>`_ job to Travis CI (:issue:`4637`)
* Made use of set literals in tests (:issue:`4573`)
* Cleaned up the Travis CI configuration (:issue:`4517`, :issue:`4519`,
:issue:`4522`, :issue:`4537`)
.. _release-2.1.0:
Scrapy 2.1.0 (2020-04-24)
-------------------------
Highlights:
* New :setting:`FEEDS` setting to export to multiple feeds
* New :attr:`Response.ip_address <scrapy.http.Response.ip_address>` attribute
Backward-incompatible changes
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
* :exc:`AssertionError` exceptions triggered by :ref:`assert <assert>`
statements have been replaced by new exception types, to support running
Python in optimized mode (see :option:`-O`) without changing Scrapy's
behavior in any unexpected ways.
If you catch an :exc:`AssertionError` exception from Scrapy, update your
code to catch the corresponding new exception.
(:issue:`4440`)
Deprecation removals
~~~~~~~~~~~~~~~~~~~~
* The ``LOG_UNSERIALIZABLE_REQUESTS`` setting is no longer supported, use
:setting:`SCHEDULER_DEBUG` instead (:issue:`4385`)
* The ``REDIRECT_MAX_METAREFRESH_DELAY`` setting is no longer supported, use
:setting:`METAREFRESH_MAXDELAY` instead (:issue:`4385`)
* The :class:`~scrapy.downloadermiddlewares.chunked.ChunkedTransferMiddleware`
middleware has been removed, including the entire
:class:`scrapy.downloadermiddlewares.chunked` module; chunked transfers
work out of the box (:issue:`4431`)
* The ``spiders`` property has been removed from
:class:`~scrapy.crawler.Crawler`, use :class:`CrawlerRunner.spider_loader
<scrapy.crawler.CrawlerRunner.spider_loader>` or instantiate
:setting:`SPIDER_LOADER_CLASS` with your settings instead (:issue:`4398`)
* The ``MultiValueDict``, ``MultiValueDictKeyError``, and ``SiteNode``
classes have been removed from :mod:`scrapy.utils.datatypes`
(:issue:`4400`)
Deprecations
~~~~~~~~~~~~
* The ``FEED_FORMAT`` and ``FEED_URI`` settings have been deprecated in
favor of the new :setting:`FEEDS` setting (:issue:`1336`, :issue:`3858`,
:issue:`4507`)
New features
~~~~~~~~~~~~
* A new setting, :setting:`FEEDS`, allows configuring multiple output feeds
with different settings each (:issue:`1336`, :issue:`3858`, :issue:`4507`)
* The :command:`crawl` and :command:`runspider` commands now support multiple
``-o`` parameters (:issue:`1336`, :issue:`3858`, :issue:`4507`)
* The :command:`crawl` and :command:`runspider` commands now support
specifying an output format by appending ``:<format>`` to the output file
(:issue:`1336`, :issue:`3858`, :issue:`4507`)
* The new :attr:`Response.ip_address <scrapy.http.Response.ip_address>`
attribute gives access to the IP address that originated a response
(:issue:`3903`, :issue:`3940`)
* A warning is now issued when a value in
:attr:`~scrapy.spiders.Spider.allowed_domains` includes a port
(:issue:`50`, :issue:`3198`, :issue:`4413`)
* Zsh completion now excludes used option aliases from the completion list
(:issue:`4438`)
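A sketch of the multi-feed configuration described above (file names, formats and the spider name are arbitrary)::

    # settings.py
    FEEDS = {
        "items.json": {"format": "json", "encoding": "utf8"},
        "items.csv": {"format": "csv"},
    }

or, from the command line, using the new support for multiple ``-o`` parameters::

    scrapy crawl myspider -o items.json -o items.csv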
Bug fixes
~~~~~~~~~
* :ref:`Request serialization <request-serialization>` no longer breaks for
callbacks that are spider attributes which are assigned a function with a
different name (:issue:`4500`)
* ``None`` values in :attr:`~scrapy.spiders.Spider.allowed_domains` no longer
cause a :exc:`TypeError` exception (:issue:`4410`)
* Zsh completion no longer allows options after arguments (:issue:`4438`)
* zope.interface 5.0.0 and later versions are now supported
(:issue:`4447`, :issue:`4448`)
* :meth:`Spider.make_requests_from_url
<scrapy.spiders.Spider.make_requests_from_url>`, deprecated in Scrapy
1.4.0, now issues a warning when used (:issue:`4412`)
Documentation
~~~~~~~~~~~~~
* Improved the documentation about signals that allow their handlers to
return a :class:`~twisted.internet.defer.Deferred` (:issue:`4295`,
:issue:`4390`)
* Our PyPI entry now includes links for our documentation, our source code
repository and our issue tracker (:issue:`4456`)
* Covered the `curl2scrapy <https://michael-shub.github.io/curl2scrapy/>`_
service in the documentation (:issue:`4206`, :issue:`4455`)
* Removed references to the Guppy library, which only works in Python 2
(:issue:`4285`, :issue:`4343`)
* Extended use of InterSphinx to link to Python 3 documentation
(:issue:`4444`, :issue:`4445`)
* Added support for Sphinx 3.0 and later (:issue:`4475`, :issue:`4480`,
:issue:`4496`, :issue:`4503`)
Quality assurance
~~~~~~~~~~~~~~~~~
* Removed warnings about using old, removed settings (:issue:`4404`)
* Removed a warning about importing
:class:`~twisted.internet.testing.StringTransport` from
``twisted.test.proto_helpers`` in Twisted 19.7.0 or newer (:issue:`4409`)
* Removed outdated Debian package build files (:issue:`4384`)
* Removed :class:`object` usage as a base class (:issue:`4430`)
* Removed code that added support for old versions of Twisted that we no
longer support (:issue:`4472`)
* Fixed code style issues (:issue:`4468`, :issue:`4469`, :issue:`4471`,
:issue:`4481`)
* Removed :func:`twisted.internet.defer.returnValue` calls (:issue:`4443`,
:issue:`4446`, :issue:`4489`)
.. _release-2.0.1:

Scrapy 2.0.1 (2020-03-18)


@@ -1,4 +1,4 @@
-Sphinx>=2.1
-sphinx-hoverxref
-sphinx-notfound-page
-sphinx_rtd_theme
+Sphinx>=3.0
+sphinx-hoverxref>=0.2b1
+sphinx-notfound-page>=0.4
+sphinx_rtd_theme>=0.4


@@ -91,7 +91,7 @@ how you :ref:`configure the downloader middlewares
 provided while constructing the crawler, and it is created after the
 arguments given in the :meth:`crawl` method.

-   .. method:: crawl(\*args, \**kwargs)
+   .. method:: crawl(*args, **kwargs)

      Starts the crawler by instantiating its spider class with the given
      ``args`` and ``kwargs`` arguments, while setting the execution engine in


@@ -104,7 +104,7 @@ Spiders
 -------

 Spiders are custom classes written by Scrapy users to parse responses and
-extract items (aka scraped items) from them or additional requests to
+extract :ref:`items <topics-items>` from them or additional requests to
 follow. For more information see :ref:`topics-spiders`.

 .. _component-pipelines:


@@ -78,7 +78,7 @@ override three methods:
 .. module:: scrapy.contracts

-.. class:: Contract(method, \*args)
+.. class:: Contract(method, *args)

    :param method: callback function to which the contract is associated
    :type method: function
@@ -136,7 +136,7 @@ Detecting check runs
 ====================

 When ``scrapy check`` is running, the ``SCRAPY_CHECK`` environment variable is
-set to the ``true`` string. You can use `os.environ`_ to perform any change to
+set to the ``true`` string. You can use :data:`os.environ` to perform any change to
 your spiders or your settings when ``scrapy check`` is used::

     import os
@@ -148,5 +148,3 @@ your spiders or your settings when ``scrapy check`` is used::
         def __init__(self):
             if os.environ.get('SCRAPY_CHECK'):
                 pass  # Do some scraper adjustments when a check is running
-
-.. _os.environ: https://docs.python.org/3/library/os.html#os.environ


@@ -7,10 +7,6 @@ Coroutines
 Scrapy has :ref:`partial support <coroutine-support>` for the
 :ref:`coroutine syntax <async>`.

-.. warning:: :mod:`asyncio` support in Scrapy is experimental. Future Scrapy
-    versions may introduce related API and behavior changes without a
-    deprecation period or warning.
-
 .. _coroutine-support:

 Supported callables
@@ -57,27 +53,34 @@ There are several use cases for coroutines in Scrapy. Code that would
 return Deferreds when written for previous Scrapy versions, such as downloader
 middlewares and signal handlers, can be rewritten to be shorter and cleaner::

+    from itemadapter import ItemAdapter
+
     class DbPipeline:
         def _update_item(self, data, item):
-            item['field'] = data
+            adapter = ItemAdapter(item)
+            adapter['field'] = data
             return item

         def process_item(self, item, spider):
-            dfd = db.get_some_data(item['id'])
+            adapter = ItemAdapter(item)
+            dfd = db.get_some_data(adapter['id'])
             dfd.addCallback(self._update_item, item)
             return dfd

 becomes::

+    from itemadapter import ItemAdapter
+
     class DbPipeline:
         async def process_item(self, item, spider):
-            item['field'] = await db.get_some_data(item['id'])
+            adapter = ItemAdapter(item)
+            adapter['field'] = await db.get_some_data(adapter['id'])
             return item

 Coroutines may be used to call asynchronous code. This includes other
 coroutines, functions that return Deferreds and functions that return
-`awaitable objects`_ such as :class:`~asyncio.Future`. This means you can use
-many useful Python libraries providing such code::
+:term:`awaitable objects <awaitable>` such as :class:`~asyncio.Future`.
+This means you can use many useful Python libraries providing such code::

     class MySpider(Spider):
         # ...
@@ -107,4 +110,3 @@ Common use cases for asynchronous code include:
 :ref:`the screenshot pipeline example<ScreenshotPipeline>`).

 .. _aio-libs: https://github.com/aio-libs
-.. _awaitable objects: https://docs.python.org/3/glossary.html#term-awaitable


@@ -292,6 +292,9 @@ Alternatively, if you want to know the arguments needed to recreate that
 request you can use the :func:`scrapy.utils.curl.curl_to_request_kwargs`
 function to get a dictionary with the equivalent arguments.

+Note that to translate a cURL command into a Scrapy request,
+you may use `curl2scrapy <https://michael-shub.github.io/curl2scrapy/>`_.
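As a sketch of the function mentioned above (the cURL command is a placeholder)::

    from scrapy import Request
    from scrapy.utils.curl import curl_to_request_kwargs

    curl_command = "curl 'https://example.com/api' -H 'Accept: application/json'"
    request = Request(**curl_to_request_kwargs(curl_command))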

 As you can see, with a few inspections in the `Network`-tool we
 were able to easily replicate the dynamic requests of the scrolling
 functionality of the page. Crawling dynamic pages can be quite


@@ -202,6 +202,11 @@ CookiesMiddleware
 sends them back on subsequent requests (from that spider), just like web
 browsers do.

+.. caution:: When non-UTF8 encoded byte sequences are passed to a
+    :class:`~scrapy.http.Request`, the ``CookiesMiddleware`` will log
+    a warning. Refer to :ref:`topics-logging-advanced-customization`
+    to customize the logging behaviour.
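For instance, assuming the middleware logs through its module-level logger (the logger name below is an assumption based on Scrapy's usual module-based logger naming), the warning can be silenced with the standard :mod:`logging` machinery::

    import logging

    logging.getLogger("scrapy.downloadermiddlewares.cookies").setLevel(logging.ERROR)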

 The following settings can be used to configure the cookie middleware:

 * :setting:`COOKIES_ENABLED`
@@ -739,7 +744,7 @@ HttpProxyMiddleware
 This middleware sets the HTTP proxy to use for requests, by setting the
 ``proxy`` meta value for :class:`~scrapy.http.Request` objects.

-Like the Python standard library modules `urllib`_ and `urllib2`_, it obeys
+Like the Python standard library module :mod:`urllib.request`, it obeys
 the following environment variables:

 * ``http_proxy``
@@ -751,9 +756,6 @@ HttpProxyMiddleware
 Keep in mind this value will take precedence over ``http_proxy``/``https_proxy``
 environment variables, and it will also ignore ``no_proxy`` environment variable.

-.. _urllib: https://docs.python.org/2/library/urllib.html
-.. _urllib2: https://docs.python.org/2/library/urllib2.html
-
 RedirectMiddleware
 ------------------
@@ -829,6 +831,7 @@ REDIRECT_MAX_TIMES
 Default: ``20``

 The maximum number of redirections that will be followed for a single request.
+After this maximum, the request's response is returned as is.

 MetaRefreshMiddleware
 ---------------------
@@ -1036,8 +1039,7 @@ Scrapy uses this parser by default.
 RobotFileParser
 ~~~~~~~~~~~~~~~

-Based on `RobotFileParser
-<https://docs.python.org/3.7/library/urllib.robotparser.html>`_:
+Based on :class:`~urllib.robotparser.RobotFileParser`:

 * is Python's built-in robots.txt_ parser


@@ -104,6 +104,9 @@ If you get the expected response `sometimes`, but not always, the issue is
 probably not your request, but the target server. The target server might be
 buggy, overloaded, or :ref:`banning <bans>` some of your requests.

+Note that to translate a cURL command into a Scrapy request,
+you may use `curl2scrapy <https://michael-shub.github.io/curl2scrapy/>`_.
+
 .. _topics-handling-response-formats:

 Handling different response formats
@@ -115,7 +118,7 @@ data from it depends on the type of response:
 - If the response is HTML or XML, use :ref:`selectors
   <topics-selectors>` as usual.

-- If the response is JSON, use `json.loads`_ to load the desired data from
+- If the response is JSON, use :func:`json.loads` to load the desired data from
   :attr:`response.text <scrapy.http.TextResponse.text>`::

       data = json.loads(response.text)
@@ -130,8 +133,9 @@ data from it depends on the type of response:
 - If the response is JavaScript, or HTML with a ``<script/>`` element
   containing the desired data, see :ref:`topics-parsing-javascript`.

-- If the response is CSS, use a `regular expression`_ to extract the desired
-  data from :attr:`response.text <scrapy.http.TextResponse.text>`.
+- If the response is CSS, use a :doc:`regular expression <library/re>` to
+  extract the desired data from
+  :attr:`response.text <scrapy.http.TextResponse.text>`.

 .. _topics-parsing-images:
@@ -168,8 +172,9 @@ JavaScript code:
 Once you have a string with the JavaScript code, you can extract the desired
 data from it:

-- You might be able to use a `regular expression`_ to extract the desired
-  data in JSON format, which you can then parse with `json.loads`_.
+- You might be able to use a :doc:`regular expression <library/re>` to
+  extract the desired data in JSON format, which you can then parse with
+  :func:`json.loads`.

   For example, if the JavaScript code contains a separate line like
   ``var data = {"field": "value"};`` you can extract that data as follows:
@@ -179,6 +184,18 @@ data from it:
   >>> json.loads(json_data)
   {'field': 'value'}

+- chompjs_ provides an API to parse JavaScript objects into a :class:`dict`.
+  For example, if the JavaScript code contains
+  ``var data = {field: "value", secondField: "second value"};``
+  you can extract that data as follows:
+
+  >>> import chompjs
+  >>> javascript = response.css('script::text').get()
+  >>> data = chompjs.parse_js_object(javascript)
+  >>> data
+  {'field': 'value', 'secondField': 'second value'}
+
 - Otherwise, use js2xml_ to convert the JavaScript code into an XML document
   that you can parse using :ref:`selectors <topics-selectors>`.
@@ -236,14 +253,13 @@ along with `scrapy-selenium`_ for seamless integration.
 .. _AJAX: https://en.wikipedia.org/wiki/Ajax_%28programming%29
+.. _chompjs: https://github.com/Nykakin/chompjs
 .. _CSS: https://en.wikipedia.org/wiki/Cascading_Style_Sheets
 .. _curl: https://curl.haxx.se/
 .. _headless browser: https://en.wikipedia.org/wiki/Headless_browser
 .. _JavaScript: https://en.wikipedia.org/wiki/JavaScript
 .. _js2xml: https://github.com/scrapinghub/js2xml
-.. _json.loads: https://docs.python.org/3/library/json.html#json.loads
 .. _pytesseract: https://github.com/madmaze/pytesseract
-.. _regular expression: https://docs.python.org/3/library/re.html
 .. _scrapy-selenium: https://github.com/clemfromspace/scrapy-selenium
 .. _scrapy-splash: https://github.com/scrapy-plugins/scrapy-splash
 .. _Selenium: https://www.selenium.dev/


@@ -7,7 +7,7 @@ Sending e-mail
 .. module:: scrapy.mail
    :synopsis: Email sending facility

-Although Python makes sending e-mails relatively easy via the `smtplib`_
+Although Python makes sending e-mails relatively easy via the :mod:`smtplib`
 library, Scrapy provides its own facility for sending e-mails which is very
 easy to use and it's implemented using :doc:`Twisted non-blocking IO
 <twisted:core/howto/defer-intro>`, to avoid interfering with the non-blocking
@@ -15,8 +15,6 @@ IO of the crawler. It also provides a simple API for sending attachments and
 it's very easy to configure, with a few :ref:`settings
 <topics-email-settings>`.

-.. _smtplib: https://docs.python.org/2/library/smtplib.html
-
 Quick example
 =============


@@ -14,13 +14,6 @@ Built-in Exceptions reference
 Here's a list of all exceptions included in Scrapy and their usage.

-DropItem
---------
-
-.. exception:: DropItem
-
-The exception that must be raised by item pipeline stages to stop processing an
-Item. For more information see :ref:`topics-item-pipeline`.
-
 CloseSpider
 -----------
@@ -47,6 +40,14 @@ DontCloseSpider
 This exception can be raised in a :signal:`spider_idle` signal handler to
 prevent the spider from being closed.

+DropItem
+--------
+
+.. exception:: DropItem
+
+The exception that must be raised by item pipeline stages to stop processing an
+Item. For more information see :ref:`topics-item-pipeline`.
+
 IgnoreRequest
 -------------
@@ -77,3 +78,37 @@ NotSupported
 This exception is raised to indicate an unsupported feature.
StopDownload
-------------
.. versionadded:: 2.2
.. exception:: StopDownload(fail=True)
Raised from a :class:`~scrapy.signals.bytes_received` signal handler to
indicate that no further bytes should be downloaded for a response.
The ``fail`` boolean parameter controls which method will handle the resulting
response:
* If ``fail=True`` (default), the request errback is called. The response object is
available as the ``response`` attribute of the ``StopDownload`` exception,
which is in turn stored as the ``value`` attribute of the received
:class:`~twisted.python.failure.Failure` object. This means that in an errback
defined as ``def errback(self, failure)``, the response can be accessed through
``failure.value.response``.
* If ``fail=False``, the request callback is called instead.
In both cases, the response could have its body truncated: the body contains
all bytes received up until the exception is raised, including the bytes
received in the signal handler that raises the exception. Also, the response
object is marked with ``"download_stopped"`` in its :attr:`Response.flags`
attribute.
.. note:: ``fail`` is a keyword-only parameter, i.e. raising
``StopDownload(False)`` or ``StopDownload(True)`` will raise
a :class:`TypeError`.
See the documentation for the :class:`~scrapy.signals.bytes_received` signal
and the :ref:`topics-stop-response-download` topic for additional information and examples.
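A sketch of the ``fail=True`` (default) case described above, with the partial response recovered in the errback via ``failure.value.response`` (spider name and URL are placeholders)::

    import scrapy
    from scrapy import signals
    from scrapy.exceptions import StopDownload

    class PartialBodySpider(scrapy.Spider):
        name = "partial_body"

        @classmethod
        def from_crawler(cls, crawler, *args, **kwargs):
            spider = super().from_crawler(crawler, *args, **kwargs)
            crawler.signals.connect(spider.on_bytes_received,
                                    signal=signals.bytes_received)
            return spider

        def start_requests(self):
            yield scrapy.Request("https://example.com",
                                 callback=self.parse,
                                 errback=self.on_stopped)

        def on_bytes_received(self, data, request, spider):
            raise StopDownload(fail=True)  # default: route the response to the errback

        def on_stopped(self, failure):
            response = failure.value.response  # body is truncated to the bytes received
            self.logger.info("Stopped %s after %d bytes",
                             response.url, len(response.body))

        def parse(self, response):
            pass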


@@ -40,6 +40,7 @@ Here you can see an :doc:`Item Pipeline <item-pipeline>` which uses multiple
 Item Exporters to group scraped items to different files according to the
 value of one of their fields::

+    from itemadapter import ItemAdapter
     from scrapy.exporters import XmlItemExporter

     class PerYearXmlExportPipeline:
@@ -53,7 +54,8 @@ value of one of their fields::
             exporter.finish_exporting()

         def _exporter_for_item(self, item):
-            year = item['year']
+            adapter = ItemAdapter(item)
+            year = adapter['year']
             if year not in self.year_to_exporter:
                 f = open('{}.xml'.format(year), 'wb')
                 exporter = XmlItemExporter(f)
@@ -167,9 +169,10 @@ BaseItemExporter
       value unchanged except for ``unicode`` values which are encoded to
       ``str`` using the encoding declared in the :attr:`encoding` attribute.

-      :param field: the field being serialized. If a raw dict is being
-          exported (not :class:`~.Item`) *field* value is an empty dict.
-      :type field: :class:`~scrapy.item.Field` object or an empty dict
+      :param field: the field being serialized. If the source :ref:`item object
+          <item-types>` does not define field metadata, *field* is an empty
+          :class:`dict`.
+      :type field: :class:`~scrapy.item.Field` object or a :class:`dict` instance

       :param name: the name of the field being serialized
       :type name: str
@@ -192,14 +195,17 @@ BaseItemExporter
    .. attribute:: fields_to_export

-      A list with the name of the fields that will be exported, or None if you
-      want to export all fields. Defaults to None.
+      A list with the name of the fields that will be exported, or ``None`` if
+      you want to export all fields. Defaults to ``None``.

       Some exporters (like :class:`CsvItemExporter`) respect the order of the
       fields defined in this attribute.

-      Some exporters may require fields_to_export list in order to export the
-      data properly when spiders return dicts (not :class:`~Item` instances).
+      When using :ref:`item objects <item-types>` that do not expose all their
+      possible fields, exporters that do not support exporting a different
+      subset of fields per item will only export the fields found in the first
+      item exported. Use ``fields_to_export`` to define all the fields to be
+      exported.
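For instance, a minimal sketch (field names and file name are illustrative) that forces a fixed set of CSV columns regardless of which fields each item defines::

    from scrapy.exporters import CsvItemExporter

    with open("products.csv", "wb") as f:
        exporter = CsvItemExporter(f, fields_to_export=["name", "price"])
        exporter.start_exporting()
        exporter.export_item({"name": "Color TV", "price": 1200})
        exporter.export_item({"name": "DVD player"})  # missing 'price' is left empty
        exporter.finish_exporting()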

    .. attribute:: export_empty_fields
@@ -236,9 +242,9 @@ PythonItemExporter
 XmlItemExporter
 ---------------

-.. class:: XmlItemExporter(file, item_element='item', root_element='items', \**kwargs)
+.. class:: XmlItemExporter(file, item_element='item', root_element='items', **kwargs)

-   Exports Items in XML format to the specified file object.
+   Exports items in XML format to the specified file object.

    :param file: the file-like object to use for exporting the data. Its ``write`` method should
       accept ``bytes`` (a disk file opened in binary mode, a ``io.BytesIO`` object, etc)
@@ -290,9 +296,9 @@ XmlItemExporter
 CsvItemExporter
 ---------------

-.. class:: CsvItemExporter(file, include_headers_line=True, join_multivalued=',', \**kwargs)
+.. class:: CsvItemExporter(file, include_headers_line=True, join_multivalued=',', **kwargs)

-   Exports Items in CSV format to the given file-like object. If the
+   Exports items in CSV format to the given file-like object. If the
    :attr:`fields_to_export` attribute is set, it will be used to define the
    CSV columns and their order. The :attr:`export_empty_fields` attribute has
    no effect on this exporter.
@@ -311,7 +317,7 @@ CsvItemExporter
    The additional keyword arguments of this ``__init__`` method are passed to the
    :class:`BaseItemExporter` ``__init__`` method, and the leftover arguments to the
-   `csv.writer`_ ``__init__`` method, so you can use any ``csv.writer`` ``__init__`` method
+   :func:`csv.writer` function, so you can use any :func:`csv.writer` function
    argument to customize this exporter.

    A typical output of this exporter would be::
@@ -320,14 +326,12 @@ CsvItemExporter
       Color TV,1200
       DVD player,200

-.. _csv.writer: https://docs.python.org/2/library/csv.html#csv.writer
-
 PickleItemExporter
 ------------------

-.. class:: PickleItemExporter(file, protocol=0, \**kwargs)
+.. class:: PickleItemExporter(file, protocol=0, **kwargs)

-   Exports Items in pickle format to the given file-like object.
+   Exports items in pickle format to the given file-like object.

    :param file: the file-like object to use for exporting the data. Its ``write`` method should
       accept ``bytes`` (a disk file opened in binary mode, a ``io.BytesIO`` object, etc)
@@ -335,21 +339,19 @@ PickleItemExporter
    :param protocol: The pickle protocol to use.
    :type protocol: int

-   For more information, refer to the `pickle module documentation`_.
+   For more information, see :mod:`pickle`.

    The additional keyword arguments of this ``__init__`` method are passed to the
    :class:`BaseItemExporter` ``__init__`` method.

    Pickle isn't a human readable format, so no output examples are provided.

-.. _pickle module documentation: https://docs.python.org/2/library/pickle.html
-
 PprintItemExporter
 ------------------

-.. class:: PprintItemExporter(file, \**kwargs)
+.. class:: PprintItemExporter(file, **kwargs)

-   Exports Items in pretty print format to the specified file object.
+   Exports items in pretty print format to the specified file object.

    :param file: the file-like object to use for exporting the data. Its ``write`` method should
       accept ``bytes`` (a disk file opened in binary mode, a ``io.BytesIO`` object, etc)
@@ -367,13 +369,13 @@ PprintItemExporter
 JsonItemExporter
 ----------------

-.. class:: JsonItemExporter(file, \**kwargs)
+.. class:: JsonItemExporter(file, **kwargs)

-   Exports Items in JSON format to the specified file-like object, writing all
+   Exports items in JSON format to the specified file-like object, writing all
    objects as a list of objects. The additional ``__init__`` method arguments are
    passed to the :class:`BaseItemExporter` ``__init__`` method, and the leftover
-   arguments to the `JSONEncoder`_ ``__init__`` method, so you can use any
-   `JSONEncoder`_ ``__init__`` method argument to customize this exporter.
+   arguments to the :class:`~json.JSONEncoder` ``__init__`` method, so you can use any
+   :class:`~json.JSONEncoder` ``__init__`` method argument to customize this exporter.

    :param file: the file-like object to use for exporting the data. Its ``write`` method should
       accept ``bytes`` (a disk file opened in binary mode, a ``io.BytesIO`` object, etc)
@@ -393,18 +395,16 @@ JsonItemExporter
    stream-friendly format, consider using :class:`JsonLinesItemExporter`
    instead, or splitting the output in multiple chunks.

-.. _JSONEncoder: https://docs.python.org/2/library/json.html#json.JSONEncoder
-
 JsonLinesItemExporter
 ---------------------

-.. class:: JsonLinesItemExporter(file, \**kwargs)
+.. class:: JsonLinesItemExporter(file, **kwargs)

-   Exports Items in JSON format to the specified file-like object, writing one
+   Exports items in JSON format to the specified file-like object, writing one
    JSON-encoded item per line. The additional ``__init__`` method arguments are passed
    to the :class:`BaseItemExporter` ``__init__`` method, and the leftover arguments to
-   the `JSONEncoder`_ ``__init__`` method, so you can use any `JSONEncoder`_
-   ``__init__`` method argument to customize this exporter.
+   the :class:`~json.JSONEncoder` ``__init__`` method, so you can use any
+   :class:`~json.JSONEncoder` ``__init__`` method argument to customize this exporter.

    :param file: the file-like object to use for exporting the data. Its ``write`` method should
       accept ``bytes`` (a disk file opened in binary mode, a ``io.BytesIO`` object, etc)
@@ -417,8 +417,6 @@ JsonLinesItemExporter
    Unlike the one produced by :class:`JsonItemExporter`, the format produced by
    this exporter is well suited for serializing large amounts of data.

-.. _JSONEncoder: https://docs.python.org/2/library/json.html#json.JSONEncoder
-
 MarshalItemExporter
 -------------------


@@ -364,7 +364,7 @@ Debugger extension
 .. class:: Debugger

-   Invokes a `Python debugger`_ inside a running Scrapy process when a `SIGUSR2`_
+   Invokes a :doc:`Python debugger <library/pdb>` inside a running Scrapy process when a `SIGUSR2`_
    signal is received. After the debugger is exited, the Scrapy process continues
    running normally.
@@ -372,5 +372,4 @@ For more info see `Debugging in Python`_.
 This extension only works on POSIX-compliant platforms (i.e. not Windows).

-.. _Python debugger: https://docs.python.org/2/library/pdb.html
 .. _Debugging in Python: https://pythonconquerstheuniverse.wordpress.com/2009/09/10/debugging-in-python/


@@ -298,8 +298,8 @@ Example: ``FEED_EXPORT_FIELDS = ["foo", "bar", "baz"]``.
 Use FEED_EXPORT_FIELDS option to define fields to export and their order.

-When FEED_EXPORT_FIELDS is empty or None (default), Scrapy uses fields
-defined in dicts or :class:`~.Item` subclasses a spider is yielding.
+When FEED_EXPORT_FIELDS is empty or None (default), Scrapy uses the fields
+defined in :ref:`item objects <topics-items>` yielded by your spider.

 If an exporter requires a fixed set of fields (this is the case for
 :ref:`CSV <topics-feed-format-csv>` export format) and FEED_EXPORT_FIELDS


@@ -27,15 +27,19 @@ Each item pipeline component is a Python class that must implement the following
 .. method:: process_item(self, item, spider)

-   This method is called for every item pipeline component. :meth:`process_item`
-   must either: return a dict with data, return an :class:`~scrapy.item.Item`
-   (or any descendant class) object, return a
-   :class:`~twisted.internet.defer.Deferred` or raise
-   :exc:`~scrapy.exceptions.DropItem` exception. Dropped items are no longer
-   processed by further pipeline components.
-
-   :param item: the item scraped
-   :type item: :class:`~scrapy.item.Item` object or a dict
+   This method is called for every item pipeline component.
+
+   `item` is an :ref:`item object <item-types>`, see
+   :ref:`supporting-item-types`.
+
+   :meth:`process_item` must either: return an :ref:`item object <item-types>`,
+   return a :class:`~twisted.internet.defer.Deferred` or raise a
+   :exc:`~scrapy.exceptions.DropItem` exception.
+
+   Dropped items are no longer processed by further pipeline components.
+
+   :param item: the scraped item
+   :type item: :ref:`item object <item-types>`

    :param spider: the spider which scraped the item
    :type spider: :class:`~scrapy.spiders.Spider` object
@@ -79,16 +83,17 @@ Let's take a look at the following hypothetical pipeline that adjusts the
 (``price_excludes_vat`` attribute), and drops those items which don't
 contain a price::

+    from itemadapter import ItemAdapter
     from scrapy.exceptions import DropItem

     class PricePipeline:

         vat_factor = 1.15

         def process_item(self, item, spider):
-            if item.get('price'):
-                if item.get('price_excludes_vat'):
-                    item['price'] = item['price'] * self.vat_factor
+            adapter = ItemAdapter(item)
+            if adapter.get('price'):
+                if adapter.get('price_excludes_vat'):
+                    adapter['price'] = adapter['price'] * self.vat_factor
                 return item
             else:
                 raise DropItem("Missing price in %s" % item)
@ -103,6 +108,8 @@ format::
import json import json
from itemadapter import ItemAdapter
class JsonWriterPipeline: class JsonWriterPipeline:
def open_spider(self, spider): def open_spider(self, spider):
@ -112,7 +119,7 @@ format::
self.file.close() self.file.close()
def process_item(self, item, spider): def process_item(self, item, spider):
line = json.dumps(dict(item)) + "\n" line = json.dumps(ItemAdapter(item).asdict()) + "\n"
self.file.write(line) self.file.write(line)
return item return item
@ -131,6 +138,7 @@ The main point of this example is to show how to use :meth:`from_crawler`
method and how to clean up the resources properly:: method and how to clean up the resources properly::
import pymongo import pymongo
from itemadapter import ItemAdapter
class MongoPipeline: class MongoPipeline:
@ -155,7 +163,7 @@ method and how to clean up the resources properly.::
self.client.close() self.client.close()
def process_item(self, item, spider): def process_item(self, item, spider):
self.db[self.collection_name].insert_one(dict(item)) self.db[self.collection_name].insert_one(ItemAdapter(item).asdict())
return item return item
.. _MongoDB: https://www.mongodb.com/ .. _MongoDB: https://www.mongodb.com/
@ -167,18 +175,21 @@ method and how to clean up the resources properly.::
Take screenshot of item Take screenshot of item
----------------------- -----------------------
This example demonstrates how to return a This example demonstrates how to use :doc:`coroutine syntax <coroutines>` in
:class:`~twisted.internet.defer.Deferred` from the :meth:`process_item` method. the :meth:`process_item` method.
It uses Splash_ to render screenshot of item url. Pipeline
makes request to locally running instance of Splash_. After request is downloaded, This item pipeline makes a request to a locally-running instance of Splash_ to
it saves the screenshot to a file and adds filename to the item. render a screenshot of the item URL. After the request response is downloaded,
the item pipeline saves the screenshot to a file and adds the filename to the
item.
:: ::
import scrapy
import hashlib import hashlib
from urllib.parse import quote from urllib.parse import quote
import scrapy
from itemadapter import ItemAdapter
class ScreenshotPipeline: class ScreenshotPipeline:
"""Pipeline that uses Splash to render screenshot of """Pipeline that uses Splash to render screenshot of
@ -187,7 +198,8 @@ it saves the screenshot to a file and adds filename to the item.
SPLASH_URL = "http://localhost:8050/render.png?url={}" SPLASH_URL = "http://localhost:8050/render.png?url={}"
async def process_item(self, item, spider): async def process_item(self, item, spider):
encoded_item_url = quote(item["url"]) adapter = ItemAdapter(item)
encoded_item_url = quote(adapter["url"])
screenshot_url = self.SPLASH_URL.format(encoded_item_url) screenshot_url = self.SPLASH_URL.format(encoded_item_url)
request = scrapy.Request(screenshot_url) request = scrapy.Request(screenshot_url)
response = await spider.crawler.engine.download(request, spider) response = await spider.crawler.engine.download(request, spider)
@ -197,14 +209,14 @@ it saves the screenshot to a file and adds filename to the item.
return item return item
# Save screenshot to file, filename will be hash of url. # Save screenshot to file, filename will be hash of url.
url = item["url"] url = adapter["url"]
url_hash = hashlib.md5(url.encode("utf8")).hexdigest() url_hash = hashlib.md5(url.encode("utf8")).hexdigest()
filename = "{}.png".format(url_hash) filename = "{}.png".format(url_hash)
with open(filename, "wb") as f: with open(filename, "wb") as f:
f.write(response.body) f.write(response.body)
# Store filename in item. # Store filename in item.
item["screenshot_filename"] = filename adapter["screenshot_filename"] = filename
return item return item
.. _Splash: https://splash.readthedocs.io/en/stable/ .. _Splash: https://splash.readthedocs.io/en/stable/
@ -217,6 +229,7 @@ already processed. Let's say that our items have a unique id, but our spider
returns multiple items with the same id:: returns multiple items with the same id::
from itemadapter import ItemAdapter
from scrapy.exceptions import DropItem from scrapy.exceptions import DropItem
class DuplicatesPipeline: class DuplicatesPipeline:
@ -225,10 +238,11 @@ returns multiples items with the same id::
self.ids_seen = set() self.ids_seen = set()
def process_item(self, item, spider): def process_item(self, item, spider):
if item['id'] in self.ids_seen: adapter = ItemAdapter(item)
raise DropItem("Duplicate item found: %s" % item) if adapter['id'] in self.ids_seen:
raise DropItem("Duplicate item found: %r" % item)
else: else:
self.ids_seen.add(item['id']) self.ids_seen.add(adapter['id'])
return item return item
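To actually run any of the pipelines shown in this chapter, they have to be enabled through the :setting:`ITEM_PIPELINES` setting; a minimal sketch (the module path is hypothetical)::

    # settings.py -- the module path is hypothetical
    ITEM_PIPELINES = {
        'myproject.pipelines.DuplicatesPipeline': 300,
    }

The integer value determines the order in which enabled pipelines run, with lower values running first.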

View File

@ -8,31 +8,155 @@ Items
:synopsis: Item and Field classes :synopsis: Item and Field classes
The main goal in scraping is to extract structured data from unstructured The main goal in scraping is to extract structured data from unstructured
sources, typically, web pages. Scrapy spiders can return the extracted data sources, typically, web pages. :ref:`Spiders <topics-spiders>` may return the
as Python dicts. While convenient and familiar, Python dicts lack structure: extracted data as `items`, Python objects that define key-value pairs.
it is easy to make a typo in a field name or return inconsistent data,
especially in a larger project with many spiders.
To define common output data format Scrapy provides the :class:`Item` class. Scrapy supports :ref:`multiple types of items <item-types>`. When you create an
:class:`Item` objects are simple containers used to collect the scraped data. item, you may use whichever type of item you want. When you write code that
They provide a `dictionary-like`_ API with a convenient syntax for declaring receives an item, your code should :ref:`work for any item type
their available fields. <supporting-item-types>`.
Various Scrapy components use extra information provided by Items: .. _item-types:
exporters look at declared fields to figure out columns to export,
serialization can be customized using Item fields metadata, :mod:`trackref`
tracks Item instances to help find memory leaks
(see :ref:`topics-leaks-trackrefs`), etc.
.. _dictionary-like: https://docs.python.org/2/library/stdtypes.html#dict Item Types
==========
Scrapy supports the following types of items, via the `itemadapter`_ library:
:ref:`dictionaries <dict-items>`, :ref:`Item objects <item-objects>`,
:ref:`dataclass objects <dataclass-items>`, and :ref:`attrs objects <attrs-items>`.
.. _itemadapter: https://github.com/scrapy/itemadapter
.. _dict-items:
Dictionaries
------------
As an item type, :class:`dict` is convenient and familiar.
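For instance, a spider callback can simply yield plain dictionaries (a minimal sketch; the field names are arbitrary)::

    def parse(self, response):
        # any mapping of field names to values works as a dict item
        yield {
            "name": response.css("h1::text").get(),
            "url": response.url,
        }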
.. _item-objects:
Item objects
------------
:class:`Item` provides a :class:`dict`-like API plus additional features that
make it the most feature-complete item type:
.. class:: Item([arg])
:class:`Item` objects replicate the standard :class:`dict` API, including
its ``__init__`` method.
:class:`Item` allows defining field names, so that:
- :class:`KeyError` is raised when using undefined field names (i.e.
prevents typos going unnoticed)
- :ref:`Item exporters <topics-exporters>` can export all fields by
default even if the first scraped object does not have values for all
of them
:class:`Item` also allows defining field metadata, which can be used to
:ref:`customize serialization <topics-exporters-field-serialization>`.
:mod:`trackref` tracks :class:`Item` objects to help find memory leaks
(see :ref:`topics-leaks-trackrefs`).
:class:`Item` objects also provide the following additional API members:
.. automethod:: copy
.. automethod:: deepcopy
.. attribute:: fields
A dictionary containing *all declared fields* for this Item, not only
those populated. The keys are the field names and the values are the
:class:`Field` objects used in the :ref:`Item declaration
<topics-items-declaring>`.
Example::
from scrapy.item import Item, Field
class CustomItem(Item):
one_field = Field()
another_field = Field()
.. _dataclass-items:
Dataclass objects
-----------------
.. versionadded:: 2.2
:func:`~dataclasses.dataclass` allows defining item classes with field names,
so that :ref:`item exporters <topics-exporters>` can export all fields by
default even if the first scraped object does not have values for all of them.
Additionally, ``dataclass`` items also allow you to:
* define the type and default value of each defined field.
* define custom field metadata through :func:`dataclasses.field`, which can be used to
:ref:`customize serialization <topics-exporters-field-serialization>`.
They work natively in Python 3.7 or later, or using the `dataclasses
backport`_ in Python 3.6.
.. _dataclasses backport: https://pypi.org/project/dataclasses/
Example::
from dataclasses import dataclass
@dataclass
class CustomItem:
one_field: str
another_field: int
.. note:: Field types are not enforced at run time.
.. _attrs-items:
attr.s objects
--------------
.. versionadded:: 2.2
:func:`attr.s` allows defining item classes with field names,
so that :ref:`item exporters <topics-exporters>` can export all fields by
default even if the first scraped object does not have values for all of them.
Additionally, ``attr.s`` items also allow you to:
* define the type and default value of each defined field.
* define custom field :ref:`metadata <attrs:metadata>`, which can be used to
:ref:`customize serialization <topics-exporters-field-serialization>`.
In order to use this type, the :doc:`attrs package <attrs:index>` needs to be installed.
Example::
import attr
@attr.s
class CustomItem:
one_field = attr.ib()
another_field = attr.ib()
Working with Item objects
=========================
.. _topics-items-declaring: .. _topics-items-declaring:
Declaring Items Declaring Item subclasses
=============== -------------------------
Items are declared using a simple class definition syntax and :class:`Field` Item subclasses are declared using a simple class definition syntax and
objects. Here is an example:: :class:`Field` objects. Here is an example::
import scrapy import scrapy
@ -50,10 +174,11 @@ objects. Here is an example::
.. _Django: https://www.djangoproject.com/ .. _Django: https://www.djangoproject.com/
.. _Django Models: https://docs.djangoproject.com/en/dev/topics/db/models/ .. _Django Models: https://docs.djangoproject.com/en/dev/topics/db/models/
.. _topics-items-fields: .. _topics-items-fields:
Item Fields Declaring fields
=========== ----------------
:class:`Field` objects are used to specify metadata for each field. For :class:`Field` objects are used to specify metadata for each field. For
example, the serializer function for the ``last_updated`` field illustrated in example, the serializer function for the ``last_updated`` field illustrated in
@ -74,15 +199,31 @@ It's important to note that the :class:`Field` objects used to declare the item
do not stay assigned as class attributes. Instead, they can be accessed through do not stay assigned as class attributes. Instead, they can be accessed through
the :attr:`Item.fields` attribute. the :attr:`Item.fields` attribute.
Working with Items .. class:: Field([arg])
==================
The :class:`Field` class is just an alias to the built-in :class:`dict` class and
doesn't provide any extra functionality or attributes. In other words,
:class:`Field` objects are plain-old Python dicts. A separate class is used
to support the :ref:`item declaration syntax <topics-items-declaring>`
based on class attributes.
.. note:: Field metadata can also be declared for ``dataclass`` and ``attrs``
items. Please refer to the documentation for `dataclasses.field`_ and
`attr.ib`_ for additional information.
.. _dataclasses.field: https://docs.python.org/3/library/dataclasses.html#dataclasses.field
.. _attr.ib: https://www.attrs.org/en/stable/api.html#attr.ib
Working with Item objects
-------------------------
Here are some examples of common tasks performed with items, using the Here are some examples of common tasks performed with items, using the
``Product`` item :ref:`declared above <topics-items-declaring>`. You will ``Product`` item :ref:`declared above <topics-items-declaring>`. You will
notice the API is very similar to the `dict API`_. notice the API is very similar to the :class:`dict` API.
Creating items Creating items
-------------- ''''''''''''''
>>> product = Product(name='Desktop PC', price=1000) >>> product = Product(name='Desktop PC', price=1000)
>>> print(product) >>> print(product)
@ -90,7 +231,7 @@ Product(name='Desktop PC', price=1000)
Getting field values Getting field values
-------------------- ''''''''''''''''''''
>>> product['name'] >>> product['name']
Desktop PC Desktop PC
@ -130,7 +271,7 @@ False
Setting field values Setting field values
-------------------- ''''''''''''''''''''
>>> product['last_updated'] = 'today' >>> product['last_updated'] = 'today'
>>> product['last_updated'] >>> product['last_updated']
@ -143,9 +284,9 @@ KeyError: 'Product does not support field: lala'
Accessing all populated values Accessing all populated values
------------------------------ ''''''''''''''''''''''''''''''
To access all populated values, just use the typical `dict API`_: To access all populated values, just use the typical :class:`dict` API:
>>> product.keys() >>> product.keys()
['price', 'name'] ['price', 'name']
@ -157,16 +298,14 @@ To access all populated values, just use the typical `dict API`_:
.. _copying-items: .. _copying-items:
Copying items Copying items
------------- '''''''''''''
To copy an item, you must first decide whether you want a shallow copy or a To copy an item, you must first decide whether you want a shallow copy or a
deep copy. deep copy.
If your item contains mutable_ values like lists or dictionaries, a shallow If your item contains :term:`mutable` values like lists or dictionaries,
copy will keep references to the same mutable values across all different a shallow copy will keep references to the same mutable values across all
copies. different copies.
.. _mutable: https://docs.python.org/3/glossary.html#term-mutable
For example, if you have an item with a list of tags, and you create a shallow For example, if you have an item with a list of tags, and you create a shallow
copy of that item, both the original item and the copy have the same list of copy of that item, both the original item and the copy have the same list of
@ -175,9 +314,7 @@ other item as well.
If that is not the desired behavior, use a deep copy instead. If that is not the desired behavior, use a deep copy instead.
See the `documentation of the copy module`_ for more information. See :mod:`copy` for more information.
.. _documentation of the copy module: https://docs.python.org/3/library/copy.html
To create a shallow copy of an item, you can either call To create a shallow copy of an item, you can either call
:meth:`~scrapy.item.Item.copy` on an existing item :meth:`~scrapy.item.Item.copy` on an existing item
@ -189,7 +326,7 @@ To create a deep copy, call :meth:`~scrapy.item.Item.deepcopy` instead
Other common tasks Other common tasks
------------------ ''''''''''''''''''
Creating dicts from items: Creating dicts from items:
@ -207,8 +344,8 @@ Traceback (most recent call last):
KeyError: 'Product does not support field: lala' KeyError: 'Product does not support field: lala'
Extending Items Extending Item subclasses
=============== -------------------------
You can extend Items (to add more fields or to change some metadata for some You can extend Items (to add more fields or to change some metadata for some
fields) by declaring a subclass of your original Item. fields) by declaring a subclass of your original Item.
@ -228,46 +365,25 @@ appending more values, or changing existing values, like this::
That adds (or replaces) the ``serializer`` metadata key for the ``name`` field, That adds (or replaces) the ``serializer`` metadata key for the ``name`` field,
keeping all the previously existing metadata values. keeping all the previously existing metadata values.
Item objects
============
.. class:: Item([arg]) .. _supporting-item-types:
Return a new Item optionally initialized from the given argument. Supporting All Item Types
=========================
Items replicate the standard `dict API`_, including its ``__init__`` method, and In code that receives an item, such as methods of :ref:`item pipelines
also provide the following additional API members: <topics-item-pipeline>` or :ref:`spider middlewares
<topics-spider-middleware>`, it is a good practice to use the
:class:`~itemadapter.ItemAdapter` class and the
:func:`~itemadapter.is_item` function to write code that works for
any :ref:`supported item type <item-types>`:
.. automethod:: copy .. autoclass:: itemadapter.ItemAdapter
.. automethod:: deepcopy .. autofunction:: itemadapter.is_item
.. attribute:: fields
A dictionary containing *all declared fields* for this Item, not only
those populated. The keys are the field names and the values are the
:class:`Field` objects used in the :ref:`Item declaration
<topics-items-declaring>`.
.. _dict API: https://docs.python.org/2/library/stdtypes.html#dict
Field objects
=============
.. class:: Field([arg])
The :class:`Field` class is just an alias to the built-in `dict`_ class and
doesn't provide any extra functionality or attributes. In other words,
:class:`Field` objects are plain-old Python dicts. A separate class is used
to support the :ref:`item declaration syntax <topics-items-declaring>`
based on class attributes.
.. _dict: https://docs.python.org/2/library/stdtypes.html#dict
Other classes related to Item Other classes related to items
============================= ==============================
.. autoclass:: BaseItem
.. autoclass:: ItemMeta .. autoclass:: ItemMeta
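As a sketch of how the :class:`~itemadapter.ItemAdapter` and :func:`~itemadapter.is_item` helpers above might be used in code that has to accept any item type (the helper function is invented for illustration)::

    from itemadapter import ItemAdapter, is_item

    def describe_item(obj):
        """Hypothetical helper that accepts any supported item type."""
        if not is_item(obj):
            raise TypeError("not an item: %r" % obj)
        adapter = ItemAdapter(obj)
        return ", ".join("%s=%r" % (field, value) for field, value in adapter.items())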

View File

@ -4,7 +4,7 @@
Debugging memory leaks Debugging memory leaks
====================== ======================
In Scrapy, objects such as Requests, Responses and Items have a finite In Scrapy, objects such as requests, responses and items have a finite
lifetime: they are created, used for a while, and finally destroyed. lifetime: they are created, used for a while, and finally destroyed.
From all those objects, the Request is probably the one with the longest From all those objects, the Request is probably the one with the longest
@ -61,8 +61,8 @@ Debugging memory leaks with ``trackref``
======================================== ========================================
:mod:`trackref` is a module provided by Scrapy to debug the most common cases of :mod:`trackref` is a module provided by Scrapy to debug the most common cases of
memory leaks. It basically tracks the references to all live Requests, memory leaks. It basically tracks the references to all live Request,
Responses, Item and Selector objects. Response, Item, Spider and Selector objects.
You can enter the telnet console and inspect how many objects (of the classes You can enter the telnet console and inspect how many objects (of the classes
mentioned above) are currently alive using the ``prefs()`` function which is an mentioned above) are currently alive using the ``prefs()`` function which is an
@ -200,11 +200,10 @@ Debugging memory leaks with muppy
``trackref`` provides a very convenient mechanism for tracking down memory ``trackref`` provides a very convenient mechanism for tracking down memory
leaks, but it only keeps track of the objects that are more likely to cause leaks, but it only keeps track of the objects that are more likely to cause
memory leaks (Requests, Responses, Items, and Selectors). However, there are memory leaks. However, there are other cases where the memory leaks could come
other cases where the memory leaks could come from other (more or less obscure) from other (more or less obscure) objects. If this is your case, and you can't
objects. If this is your case, and you can't find your leaks using ``trackref``, find your leaks using ``trackref``, you still have another resource: the muppy
you still have another resource: the muppy library. library.
You can use muppy from `Pympler`_. You can use muppy from `Pympler`_.

View File

@ -7,13 +7,12 @@ Item Loaders
.. module:: scrapy.loader .. module:: scrapy.loader
:synopsis: Item Loader class :synopsis: Item Loader class
Item Loaders provide a convenient mechanism for populating scraped :ref:`Items Item Loaders provide a convenient mechanism for populating scraped :ref:`items
<topics-items>`. Even though Items can be populated using their own <topics-items>`. Even though items can be populated directly, Item Loaders provide a
dictionary-like API, Item Loaders provide a much more convenient API for much more convenient API for populating them from a scraping process, by automating
populating them from a scraping process, by automating some common tasks like some common tasks like parsing the raw extracted data before assigning it.
parsing the raw extracted data before assigning it.
In other words, :ref:`Items <topics-items>` provide the *container* of In other words, :ref:`items <topics-items>` provide the *container* of
scraped data, while Item Loaders provide the mechanism for *populating* that scraped data, while Item Loaders provide the mechanism for *populating* that
container. container.
@ -25,10 +24,10 @@ Using Item Loaders to populate items
==================================== ====================================
To use an Item Loader, you must first instantiate it. You can either To use an Item Loader, you must first instantiate it. You can either
instantiate it with a dict-like object (e.g. Item or dict) or without one, in instantiate it with an :ref:`item object <topics-items>` or without one, in which
which case an Item is automatically instantiated in the Item Loader ``__init__`` method case an :ref:`item object <topics-items>` is automatically created in the
using the Item class specified in the :attr:`ItemLoader.default_item_class` Item Loader ``__init__`` method using the :ref:`item <topics-items>` class
attribute. specified in the :attr:`ItemLoader.default_item_class` attribute.
Then, you start collecting values into the Item Loader, typically using Then, you start collecting values into the Item Loader, typically using
:ref:`Selectors <topics-selectors>`. You can add more than one value to :ref:`Selectors <topics-selectors>`. You can add more than one value to
@ -77,6 +76,31 @@ called which actually returns the item populated with the data
previously extracted and collected with the :meth:`~ItemLoader.add_xpath`, previously extracted and collected with the :meth:`~ItemLoader.add_xpath`,
:meth:`~ItemLoader.add_css`, and :meth:`~ItemLoader.add_value` calls. :meth:`~ItemLoader.add_css`, and :meth:`~ItemLoader.add_value` calls.
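As a rough sketch of that flow (the selectors are illustrative and ``Product`` is assumed to be an item class with the fields used below)::

    from scrapy.loader import ItemLoader
    from myproject.items import Product  # hypothetical item class

    def parse(self, response):
        loader = ItemLoader(item=Product(), response=response)
        loader.add_css('name', 'h1::text')
        loader.add_xpath('price', '//p[@id="price"]/text()')
        loader.add_value('last_updated', 'today')
        return loader.load_item()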
.. _topics-loaders-dataclass:
Working with dataclass items
============================
By default, :ref:`dataclass items <dataclass-items>` require all fields to be
passed when created. This could be an issue when using dataclass items with
item loaders: unless a pre-populated item is passed to the loader, fields
will be populated incrementally using the loader's :meth:`~ItemLoader.add_xpath`,
:meth:`~ItemLoader.add_css` and :meth:`~ItemLoader.add_value` methods.
One approach to overcome this is to define items using the
:func:`~dataclasses.field` function, with a ``default`` argument::
from dataclasses import dataclass, field
from typing import Optional
@dataclass
class InventoryItem:
name: Optional[str] = field(default=None)
price: Optional[float] = field(default=None)
stock: Optional[int] = field(default=None)
.. _topics-loaders-processors: .. _topics-loaders-processors:
Input and Output processors Input and Output processors
@ -88,7 +112,7 @@ received (through the :meth:`~ItemLoader.add_xpath`, :meth:`~ItemLoader.add_css`
:meth:`~ItemLoader.add_value` methods) and the result of the input processor is :meth:`~ItemLoader.add_value` methods) and the result of the input processor is
collected and kept inside the ItemLoader. After collecting all data, the collected and kept inside the ItemLoader. After collecting all data, the
:meth:`ItemLoader.load_item` method is called to populate and get the populated :meth:`ItemLoader.load_item` method is called to populate and get the populated
:class:`~scrapy.item.Item` object. That's when the output processor is :ref:`item object <topics-items>`. That's when the output processor is
called with the data previously collected (and processed using the input called with the data previously collected (and processed using the input
processor). The result of the output processor is the final value that gets processor). The result of the output processor is the final value that gets
assigned to the item. assigned to the item.
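A minimal sketch of how this is usually declared, using two of the built-in processors described later in this chapter (the ``name`` field is hypothetical)::

    from scrapy.loader import ItemLoader
    from scrapy.loader.processors import MapCompose, TakeFirst

    class ProductLoader(ItemLoader):
        # input processor: applied to each extracted value as it is collected
        name_in = MapCompose(str.strip)
        # output processor: applied to the collected values by load_item()
        name_out = TakeFirst()

With this declaration, every value added for ``name`` is stripped on input, and ``load_item()`` assigns the first collected value to the item.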
@ -153,12 +177,10 @@ Last, but not least, Scrapy comes with some :ref:`commonly used processors
<topics-loaders-available-processors>` built-in for convenience. <topics-loaders-available-processors>` built-in for convenience.
Declaring Item Loaders Declaring Item Loaders
====================== ======================
Item Loaders are declared like Items, by using a class definition syntax. Here Item Loaders are declared using a class definition syntax. Here is an example::
is an example::
from scrapy.loader import ItemLoader from scrapy.loader import ItemLoader
from scrapy.loader.processors import TakeFirst, MapCompose, Join from scrapy.loader.processors import TakeFirst, MapCompose, Join
@ -273,11 +295,11 @@ There are several ways to modify Item Loader context values:
ItemLoader objects ItemLoader objects
================== ==================
.. class:: ItemLoader([item, selector, response], \**kwargs) .. class:: ItemLoader([item, selector, response], **kwargs)
Return a new Item Loader for populating the given Item. If no item is Return a new Item Loader for populating the given :ref:`item object
given, one is instantiated automatically using the class in <topics-items>`. If no item object is given, one is instantiated
:attr:`default_item_class`. automatically using the class in :attr:`default_item_class`.
When instantiated with a ``selector`` or a ``response`` parameters When instantiated with a ``selector`` or a ``response`` parameters
the :class:`ItemLoader` class provides convenient mechanisms for extracting the :class:`ItemLoader` class provides convenient mechanisms for extracting
@ -286,7 +308,7 @@ ItemLoader objects
:param item: The item instance to populate using subsequent calls to :param item: The item instance to populate using subsequent calls to
:meth:`~ItemLoader.add_xpath`, :meth:`~ItemLoader.add_css`, :meth:`~ItemLoader.add_xpath`, :meth:`~ItemLoader.add_css`,
or :meth:`~ItemLoader.add_value`. or :meth:`~ItemLoader.add_value`.
:type item: :class:`~scrapy.item.Item` object :type item: :ref:`item object <topics-items>`
:param selector: The selector to extract data from, when using the :param selector: The selector to extract data from, when using the
:meth:`add_xpath` (resp. :meth:`add_css`) or :meth:`replace_xpath` :meth:`add_xpath` (resp. :meth:`add_css`) or :meth:`replace_xpath`
@ -303,7 +325,7 @@ ItemLoader objects
:class:`ItemLoader` instances have the following methods: :class:`ItemLoader` instances have the following methods:
.. method:: get_value(value, \*processors, \**kwargs) .. method:: get_value(value, *processors, **kwargs)
Process the given ``value`` by the given ``processors`` and keyword Process the given ``value`` by the given ``processors`` and keyword
arguments. arguments.
@ -321,7 +343,7 @@ ItemLoader objects
>>> loader.get_value(u'name: foo', TakeFirst(), str.upper, re='name: (.+)') >>> loader.get_value(u'name: foo', TakeFirst(), str.upper, re='name: (.+)')
'FOO' 'FOO'
.. method:: add_value(field_name, value, \*processors, \**kwargs) .. method:: add_value(field_name, value, *processors, **kwargs)
Process and then add the given ``value`` for the given field. Process and then add the given ``value`` for the given field.
@ -343,11 +365,11 @@ ItemLoader objects
loader.add_value('name', u'name: foo', TakeFirst(), re='name: (.+)') loader.add_value('name', u'name: foo', TakeFirst(), re='name: (.+)')
loader.add_value(None, {'name': u'foo', 'sex': u'male'}) loader.add_value(None, {'name': u'foo', 'sex': u'male'})
.. method:: replace_value(field_name, value, \*processors, \**kwargs) .. method:: replace_value(field_name, value, *processors, **kwargs)
Similar to :meth:`add_value` but replaces the collected data with the Similar to :meth:`add_value` but replaces the collected data with the
new value instead of adding it. new value instead of adding it.
.. method:: get_xpath(xpath, \*processors, \**kwargs) .. method:: get_xpath(xpath, *processors, **kwargs)
Similar to :meth:`ItemLoader.get_value` but receives an XPath instead of a Similar to :meth:`ItemLoader.get_value` but receives an XPath instead of a
value, which is used to extract a list of unicode strings from the value, which is used to extract a list of unicode strings from the
@ -367,7 +389,7 @@ ItemLoader objects
# HTML snippet: <p id="price">the price is $1200</p> # HTML snippet: <p id="price">the price is $1200</p>
loader.get_xpath('//p[@id="price"]', TakeFirst(), re='the price is (.*)') loader.get_xpath('//p[@id="price"]', TakeFirst(), re='the price is (.*)')
.. method:: add_xpath(field_name, xpath, \*processors, \**kwargs) .. method:: add_xpath(field_name, xpath, *processors, **kwargs)
Similar to :meth:`ItemLoader.add_value` but receives an XPath instead of a Similar to :meth:`ItemLoader.add_value` but receives an XPath instead of a
value, which is used to extract a list of unicode strings from the value, which is used to extract a list of unicode strings from the
@ -385,12 +407,12 @@ ItemLoader objects
# HTML snippet: <p id="price">the price is $1200</p> # HTML snippet: <p id="price">the price is $1200</p>
loader.add_xpath('price', '//p[@id="price"]', re='the price is (.*)') loader.add_xpath('price', '//p[@id="price"]', re='the price is (.*)')
.. method:: replace_xpath(field_name, xpath, \*processors, \**kwargs) .. method:: replace_xpath(field_name, xpath, *processors, **kwargs)
Similar to :meth:`add_xpath` but replaces collected data instead of Similar to :meth:`add_xpath` but replaces collected data instead of
adding it. adding it.
.. method:: get_css(css, \*processors, \**kwargs) .. method:: get_css(css, *processors, **kwargs)
Similar to :meth:`ItemLoader.get_value` but receives a CSS selector Similar to :meth:`ItemLoader.get_value` but receives a CSS selector
instead of a value, which is used to extract a list of unicode strings instead of a value, which is used to extract a list of unicode strings
@ -410,7 +432,7 @@ ItemLoader objects
# HTML snippet: <p id="price">the price is $1200</p> # HTML snippet: <p id="price">the price is $1200</p>
loader.get_css('p#price', TakeFirst(), re='the price is (.*)') loader.get_css('p#price', TakeFirst(), re='the price is (.*)')
.. method:: add_css(field_name, css, \*processors, \**kwargs) .. method:: add_css(field_name, css, *processors, **kwargs)
Similar to :meth:`ItemLoader.add_value` but receives a CSS selector Similar to :meth:`ItemLoader.add_value` but receives a CSS selector
instead of a value, which is used to extract a list of unicode strings instead of a value, which is used to extract a list of unicode strings
@ -428,7 +450,7 @@ ItemLoader objects
# HTML snippet: <p id="price">the price is $1200</p> # HTML snippet: <p id="price">the price is $1200</p>
loader.add_css('price', 'p#price', re='the price is (.*)') loader.add_css('price', 'p#price', re='the price is (.*)')
.. method:: replace_css(field_name, css, \*processors, \**kwargs) .. method:: replace_css(field_name, css, *processors, **kwargs)
Similar to :meth:`add_css` but replaces collected data instead of Similar to :meth:`add_css` but replaces collected data instead of
adding it. adding it.
@ -444,17 +466,19 @@ ItemLoader objects
Create a nested loader with an xpath selector. Create a nested loader with an xpath selector.
The supplied selector is applied relative to selector associated The supplied selector is applied relative to selector associated
with this :class:`ItemLoader`. The nested loader shares the :class:`Item` with this :class:`ItemLoader`. The nested loader shares the :ref:`item
with the parent :class:`ItemLoader` so calls to :meth:`add_xpath`, object <topics-items>` with the parent :class:`ItemLoader` so calls to
:meth:`add_value`, :meth:`replace_value`, etc. will behave as expected. :meth:`add_xpath`, :meth:`add_value`, :meth:`replace_value`, etc. will
behave as expected.
.. method:: nested_css(css) .. method:: nested_css(css)
Create a nested loader with a css selector. Create a nested loader with a css selector.
The supplied selector is applied relative to selector associated The supplied selector is applied relative to selector associated
with this :class:`ItemLoader`. The nested loader shares the :class:`Item` with this :class:`ItemLoader`. The nested loader shares the :ref:`item
with the parent :class:`ItemLoader` so calls to :meth:`add_xpath`, object <topics-items>` with the parent :class:`ItemLoader` so calls to
:meth:`add_value`, :meth:`replace_value`, etc. will behave as expected. :meth:`add_xpath`, :meth:`add_value`, :meth:`replace_value`, etc. will
behave as expected.
.. method:: get_collected_values(field_name) .. method:: get_collected_values(field_name)
@ -477,7 +501,7 @@ ItemLoader objects
.. attribute:: item .. attribute:: item
The :class:`~scrapy.item.Item` object being parsed by this Item Loader. The :ref:`item object <topics-items>` being parsed by this Item Loader.
This is mostly used as a property so when attempting to override this This is mostly used as a property so when attempting to override this
value, you may want to check out :attr:`default_item_class` first. value, you may want to check out :attr:`default_item_class` first.
@ -488,8 +512,8 @@ ItemLoader objects
.. attribute:: default_item_class .. attribute:: default_item_class
An Item class (or factory), used to instantiate items when not given in An :ref:`item object <topics-items>` class or factory, used to
the ``__init__`` method. instantiate items when not given in the ``__init__`` method.
.. attribute:: default_input_processor .. attribute:: default_input_processor
@ -678,7 +702,7 @@ Here is a list of all built-in processors:
>>> proc(['one', 'two', 'three']) >>> proc(['one', 'two', 'three'])
'one<br>two<br>three' 'one<br>two<br>three'
.. class:: Compose(\*functions, \**default_loader_context) .. class:: Compose(*functions, **default_loader_context)
A processor which is constructed from the composition of the given A processor which is constructed from the composition of the given
functions. This means that each input value of this processor is passed to functions. This means that each input value of this processor is passed to
@ -706,7 +730,7 @@ Here is a list of all built-in processors:
active Loader context accessible through the :meth:`ItemLoader.context` active Loader context accessible through the :meth:`ItemLoader.context`
attribute. attribute.
.. class:: MapCompose(\*functions, \**default_loader_context) .. class:: MapCompose(*functions, **default_loader_context)
A processor which is constructed from the composition of the given A processor which is constructed from the composition of the given
functions, similar to the :class:`Compose` processor. The difference with functions, similar to the :class:`Compose` processor. The difference with

View File

@ -9,8 +9,7 @@ Logging
explicit calls to the Python standard logging. Keep reading to learn more explicit calls to the Python standard logging. Keep reading to learn more
about the new logging system. about the new logging system.
Scrapy uses `Python's builtin logging system Scrapy uses :mod:`logging` for event logging. We'll
<https://docs.python.org/3/library/logging.html>`_ for event logging. We'll
provide some simple examples to get you started, but for more advanced provide some simple examples to get you started, but for more advanced
use-cases it's strongly suggested to read thoroughly its documentation. use-cases it's strongly suggested to read thoroughly its documentation.
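For instance, the standard library logger can be used directly, with no Scrapy-specific setup (a minimal sketch)::

    import logging

    logger = logging.getLogger(__name__)
    logger.warning("This is a warning")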
@ -83,10 +82,10 @@ path::
.. seealso:: .. seealso::
Module logging, `HowTo <https://docs.python.org/2/howto/logging.html>`_ Module logging, :doc:`HowTo <howto/logging>`
Basic Logging Tutorial Basic Logging Tutorial
Module logging, `Loggers <https://docs.python.org/2/library/logging.html#logger-objects>`_ Module logging, :ref:`Loggers <logger>`
Further documentation on loggers Further documentation on loggers
.. _topics-logging-from-spiders: .. _topics-logging-from-spiders:
@ -165,14 +164,12 @@ possible levels listed in :ref:`topics-logging-levels`.
:setting:`LOG_FORMAT` and :setting:`LOG_DATEFORMAT` specify formatting strings :setting:`LOG_FORMAT` and :setting:`LOG_DATEFORMAT` specify formatting strings
used as layouts for all messages. Those strings can contain any placeholders used as layouts for all messages. Those strings can contain any placeholders
listed in `logging's logrecord attributes docs listed in :ref:`logging's logrecord attributes docs <logrecord-attributes>` and
<https://docs.python.org/2/library/logging.html#logrecord-attributes>`_ and :ref:`datetime's strftime and strptime directives <strftime-strptime-behavior>`
`datetime's strftime and strptime directives
<https://docs.python.org/2/library/datetime.html#strftime-and-strptime-behavior>`_
respectively. respectively.
If :setting:`LOG_SHORT_NAMES` is set, then the logs will not display the Scrapy If :setting:`LOG_SHORT_NAMES` is set, then the logs will not display the Scrapy
component that prints the log. It is unset by default, hence logs contain the component that prints the log. It is unset by default, hence logs contain the
Scrapy component responsible for that log output. Scrapy component responsible for that log output.
Command-line options Command-line options
@ -190,7 +187,7 @@ to override some of the Scrapy settings regarding logging.
.. seealso:: .. seealso::
Module `logging.handlers <https://docs.python.org/2/library/logging.handlers.html>`_ Module :mod:`logging.handlers`
Further documentation on available handlers Further documentation on available handlers
.. _custom-log-formats: .. _custom-log-formats:
@ -201,10 +198,13 @@ Custom Log Formats
A custom log format can be set for different actions by extending A custom log format can be set for different actions by extending
:class:`~scrapy.logformatter.LogFormatter` class and making :class:`~scrapy.logformatter.LogFormatter` class and making
:setting:`LOG_FORMATTER` point to your new class. :setting:`LOG_FORMATTER` point to your new class.
.. autoclass:: scrapy.logformatter.LogFormatter .. autoclass:: scrapy.logformatter.LogFormatter
:members: :members:
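As a sketch of the kind of override this enables (adapted to taste; the class name is arbitrary), a formatter can, for example, report dropped items at ``INFO`` instead of the default ``WARNING``::

    import logging

    from scrapy import logformatter

    class PoliteLogFormatter(logformatter.LogFormatter):
        def dropped(self, item, exception, response, spider):
            # lower the log level used for dropped items
            return {
                'level': logging.INFO,
                'msg': logformatter.DROPPEDMSG,
                'args': {
                    'exception': exception,
                    'item': item,
                },
            }

Pointing :setting:`LOG_FORMATTER` to such a class (e.g. ``'myproject.logformatters.PoliteLogFormatter'``, a hypothetical path) activates it.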
.. _topics-logging-advanced-customization:
Advanced customization Advanced customization
---------------------- ----------------------
@ -256,16 +256,15 @@ scrapy.utils.log module
In that case, its usage is not required but it's recommended. In that case, its usage is not required but it's recommended.
Another option when running custom scripts is to manually configure the logging. Another option when running custom scripts is to manually configure the logging.
To do this you can use `logging.basicConfig()`_ to set a basic root handler. To do this you can use :func:`logging.basicConfig` to set a basic root handler.
Note that :class:`~scrapy.crawler.CrawlerProcess` automatically calls ``configure_logging``, Note that :class:`~scrapy.crawler.CrawlerProcess` automatically calls ``configure_logging``,
so it is recommended to only use `logging.basicConfig()`_ together with so it is recommended to only use :func:`logging.basicConfig` together with
:class:`~scrapy.crawler.CrawlerRunner`. :class:`~scrapy.crawler.CrawlerRunner`.
This is an example on how to redirect ``INFO`` or higher messages to a file:: This is an example on how to redirect ``INFO`` or higher messages to a file::
import logging import logging
from scrapy.utils.log import configure_logging
logging.basicConfig( logging.basicConfig(
filename='log.txt', filename='log.txt',
@ -275,7 +274,3 @@ scrapy.utils.log module
Refer to :ref:`run-from-script` for more details about using Scrapy this Refer to :ref:`run-from-script` for more details about using Scrapy this
way. way.
.. _logging.basicConfig(): https://docs.python.org/2/library/logging.html#logging.basicConfig

View File

@ -50,7 +50,7 @@ this:
4. When the files are downloaded, another field (``files``) will be populated 4. When the files are downloaded, another field (``files``) will be populated
with the results. This field will contain a list of dicts with information with the results. This field will contain a list of dicts with information
about the downloaded files, such as the downloaded path, the original about the downloaded files, such as the downloaded path, the original
scraped url (taken from the ``file_urls`` field) , and the file checksum. scraped url (taken from the ``file_urls`` field), the file checksum and the file status.
The files in the list of the ``files`` field will retain the same order of The files in the list of the ``files`` field will retain the same order of
the original ``file_urls`` field. If some file failed downloading, an the original ``file_urls`` field. If some file failed downloading, an
error will be logged and the file won't be present in the ``files`` field. error will be logged and the file won't be present in the ``files`` field.
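As an illustration (all values below are made up), an item that went through the Files Pipeline might end up looking like this::

    {
        'file_urls': ['http://www.example.com/files/product1.pdf'],
        'files': [
            {
                'url': 'http://www.example.com/files/product1.pdf',
                'path': 'full/0a79c461a4062ac383dc4fade7bc09f1384a3910.pdf',
                'checksum': '2b00042f7481c7b056c4b410d28f33cf',
                'status': 'downloaded',
            },
        ],
    }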
@ -156,7 +156,7 @@ following forms::
ftp://username:password@address:port/path ftp://username:password@address:port/path
ftp://address:port/path ftp://address:port/path
If ``username`` and ``password`` are not provided, they are taken from the :setting:`FTP_USER` and If ``username`` and ``password`` are not provided, they are taken from the :setting:`FTP_USER` and
:setting:`FTP_PASSWORD` settings respectively. :setting:`FTP_PASSWORD` settings respectively.
@ -201,6 +201,9 @@ For self-hosting you also might feel the need not to use SSL and not to verify S
.. _s3.scality: https://s3.scality.com/ .. _s3.scality: https://s3.scality.com/
.. _canned ACLs: https://docs.aws.amazon.com/AmazonS3/latest/dev/acl-overview.html#canned-acl .. _canned ACLs: https://docs.aws.amazon.com/AmazonS3/latest/dev/acl-overview.html#canned-acl
.. _media-pipeline-gcs:
Google Cloud Storage Google Cloud Storage
--------------------- ---------------------
@ -243,20 +246,22 @@ Usage example
.. setting:: IMAGES_URLS_FIELD .. setting:: IMAGES_URLS_FIELD
.. setting:: IMAGES_RESULT_FIELD .. setting:: IMAGES_RESULT_FIELD
In order to use a media pipeline first, :ref:`enable it In order to use a media pipeline, first :ref:`enable it
<topics-media-pipeline-enabling>`. <topics-media-pipeline-enabling>`.
Then, if a spider returns a dict with the URLs key (``file_urls`` or Then, if a spider returns an :ref:`item object <topics-items>` with the URLs
``image_urls``, for the Files or Images Pipeline respectively), the pipeline will field (``file_urls`` or ``image_urls``, for the Files or Images Pipeline
put the results under respective key (``files`` or ``images``). respectively), the pipeline will put the results under the respective field
(``files`` or ``images``).
If you prefer to use :class:`~.Item`, then define a custom item with the When using :ref:`item types <item-types>` for which fields are defined beforehand,
necessary fields, like in this example for Images Pipeline:: you must define both the URLs field and the results field. For example, when
using the images pipeline, items must define both the ``image_urls`` and the
``images`` field. For instance, using the :class:`~scrapy.item.Item` class::
import scrapy import scrapy
class MyItem(scrapy.Item): class MyItem(scrapy.Item):
# ... other item fields ... # ... other item fields ...
image_urls = scrapy.Field() image_urls = scrapy.Field()
images = scrapy.Field() images = scrapy.Field()
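To complete the sketch, the pipeline also has to be enabled and pointed at a storage location in the settings (the path below is a placeholder)::

    # settings.py -- the storage path is a placeholder
    ITEM_PIPELINES = {'scrapy.pipelines.images.ImagesPipeline': 1}
    IMAGES_STORE = '/path/to/valid/dir'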
@ -445,8 +450,11 @@ See here the methods that you can override in your custom Files Pipeline:
:meth:`~get_media_requests` method and return a Request for each :meth:`~get_media_requests` method and return a Request for each
file URL:: file URL::
from itemadapter import ItemAdapter
def get_media_requests(self, item, info): def get_media_requests(self, item, info):
for file_url in item['file_urls']: adapter = ItemAdapter(item)
for file_url in adapter['file_urls']:
yield scrapy.Request(file_url) yield scrapy.Request(file_url)
Those requests will be processed by the pipeline and, when they have finished Those requests will be processed by the pipeline and, when they have finished
@ -470,6 +478,18 @@ See here the methods that you can override in your custom Files Pipeline:
* ``checksum`` - a `MD5 hash`_ of the image contents * ``checksum`` - a `MD5 hash`_ of the image contents
* ``status`` - the file status indication.
.. versionadded:: 2.2
It can be one of the following:
* ``downloaded`` - file was downloaded.
* ``uptodate`` - file was not downloaded, as it was downloaded recently,
according to the file expiration policy.
* ``cached`` - file was already scheduled for download, by another item
sharing the same file.
The list of tuples received by :meth:`~item_completed` is The list of tuples received by :meth:`~item_completed` is
guaranteed to retain the same order of the requests returned from the guaranteed to retain the same order of the requests returned from the
:meth:`~get_media_requests` method. :meth:`~get_media_requests` method.
@ -479,7 +499,8 @@ See here the methods that you can override in your custom Files Pipeline:
[(True, [(True,
{'checksum': '2b00042f7481c7b056c4b410d28f33cf', {'checksum': '2b00042f7481c7b056c4b410d28f33cf',
'path': 'full/0a79c461a4062ac383dc4fade7bc09f1384a3910.jpg', 'path': 'full/0a79c461a4062ac383dc4fade7bc09f1384a3910.jpg',
'url': 'http://www.example.com/files/product1.pdf'}), 'url': 'http://www.example.com/files/product1.pdf',
'status': 'downloaded'}),
(False, (False,
Failure(...))] Failure(...))]
@ -500,13 +521,15 @@ See here the methods that you can override in your custom Files Pipeline:
store the downloaded file paths (passed in results) in the ``file_paths`` store the downloaded file paths (passed in results) in the ``file_paths``
item field, and we drop the item if it doesn't contain any files:: item field, and we drop the item if it doesn't contain any files::
from itemadapter import ItemAdapter
from scrapy.exceptions import DropItem from scrapy.exceptions import DropItem
def item_completed(self, results, item, info): def item_completed(self, results, item, info):
file_paths = [x['path'] for ok, x in results if ok] file_paths = [x['path'] for ok, x in results if ok]
if not file_paths: if not file_paths:
raise DropItem("Item contains no files") raise DropItem("Item contains no files")
item['file_paths'] = file_paths adapter = ItemAdapter(item)
adapter['file_paths'] = file_paths
return item return item
By default, the :meth:`item_completed` method returns the item. By default, the :meth:`item_completed` method returns the item.
@ -580,8 +603,9 @@ Here is a full example of the Images Pipeline whose methods are exemplified
above:: above::
import scrapy import scrapy
from scrapy.pipelines.images import ImagesPipeline from itemadapter import ItemAdapter
from scrapy.exceptions import DropItem from scrapy.exceptions import DropItem
from scrapy.pipelines.images import ImagesPipeline
class MyImagesPipeline(ImagesPipeline): class MyImagesPipeline(ImagesPipeline):
@ -593,7 +617,8 @@ above::
image_paths = [x['path'] for ok, x in results if ok] image_paths = [x['path'] for ok, x in results if ok]
if not image_paths: if not image_paths:
raise DropItem("Item contains no images") raise DropItem("Item contains no images")
item['image_paths'] = image_paths adapter = ItemAdapter(item)
adapter['image_paths'] = image_paths
return item return item

View File

@ -35,8 +35,9 @@ Here's an example showing how to run a single spider with it.
... ...
process = CrawlerProcess(settings={ process = CrawlerProcess(settings={
'FEED_FORMAT': 'json', "FEEDS": {
'FEED_URI': 'items.json' "items.json": {"format": "json"},
},
}) })
process.crawl(MySpider) process.crawl(MySpider)

View File

@ -36,7 +36,7 @@ Request objects
:type url: string :type url: string
:param callback: the function that will be called with the response of this :param callback: the function that will be called with the response of this
request (once its downloaded) as its first parameter. For more information request (once it's downloaded) as its first parameter. For more information
see :ref:`topics-request-response-ref-request-callback-arguments` below. see :ref:`topics-request-response-ref-request-callback-arguments` below.
If a Request doesn't specify a callback, the spider's If a Request doesn't specify a callback, the spider's
:meth:`~scrapy.spiders.Spider.parse` method will be used. :meth:`~scrapy.spiders.Spider.parse` method will be used.
@ -174,9 +174,9 @@ Request objects
See :ref:`topics-request-meta` for a list of special meta keys See :ref:`topics-request-meta` for a list of special meta keys
recognized by Scrapy. recognized by Scrapy.
This dict is `shallow copied`_ when the request is cloned using the This dict is :doc:`shallow copied <library/copy>` when the request is
``copy()`` or ``replace()`` methods, and can also be accessed, in your cloned using the ``copy()`` or ``replace()`` methods, and can also be
spider, from the ``response.meta`` attribute. accessed, in your spider, from the ``response.meta`` attribute.
.. attribute:: Request.cb_kwargs .. attribute:: Request.cb_kwargs
@ -185,11 +185,13 @@ Request objects
for new Requests, which means by default callbacks only get a :class:`Response` for new Requests, which means by default callbacks only get a :class:`Response`
object as argument. object as argument.
This dict is `shallow copied`_ when the request is cloned using the This dict is :doc:`shallow copied <library/copy>` when the request is
``copy()`` or ``replace()`` methods, and can also be accessed, in your cloned using the ``copy()`` or ``replace()`` methods, and can also be
spider, from the ``response.cb_kwargs`` attribute. accessed, in your spider, from the ``response.cb_kwargs`` attribute.
.. _shallow copied: https://docs.python.org/2/library/copy.html In case of a failure to process the request, this dict can be accessed as
``failure.request.cb_kwargs`` in the request's errback. For more information,
see :ref:`errback-cb_kwargs`.
.. method:: Request.copy() .. method:: Request.copy()
@ -314,6 +316,31 @@ errors if needed::
request = failure.request request = failure.request
self.logger.error('TimeoutError on %s', request.url) self.logger.error('TimeoutError on %s', request.url)
.. _errback-cb_kwargs:
Accessing additional data in errback functions
----------------------------------------------
In case of a failure to process the request, you may be interested in
accessing arguments to the callback functions so you can process further
based on the arguments in the errback. The following example shows how to
achieve this by using ``Failure.request.cb_kwargs``::
def parse(self, response):
request = scrapy.Request('http://www.example.com/index.html',
callback=self.parse_page2,
errback=self.errback_page2,
cb_kwargs=dict(main_url=response.url))
yield request
def parse_page2(self, response, main_url):
pass
def errback_page2(self, failure):
yield dict(
main_url=failure.request.cb_kwargs['main_url'],
)
.. _topics-request-meta: .. _topics-request-meta:
Request.meta special keys Request.meta special keys
@ -387,6 +414,51 @@ The meta key is used set retry times per request. When initialized, the
:reqmeta:`max_retry_times` meta key takes higher precedence over the :reqmeta:`max_retry_times` meta key takes higher precedence over the
:setting:`RETRY_TIMES` setting. :setting:`RETRY_TIMES` setting.
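For example (a sketch; the URL is a placeholder), the per-request limit can be set through ``Request.meta``::

    yield scrapy.Request(
        'http://www.example.com/some-page',
        meta={'max_retry_times': 10},
    )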
.. _topics-stop-response-download:
Stopping the download of a Response
===================================
Raising a :exc:`~scrapy.exceptions.StopDownload` exception from a
:class:`~scrapy.signals.bytes_received` signal handler will stop the
download of a given response. See the following example::
import scrapy
class StopSpider(scrapy.Spider):
name = "stop"
start_urls = ["https://docs.scrapy.org/en/latest/"]
@classmethod
def from_crawler(cls, crawler):
spider = super().from_crawler(crawler)
crawler.signals.connect(spider.on_bytes_received, signal=scrapy.signals.bytes_received)
return spider
def parse(self, response):
# 'last_chars' show that the full response was not downloaded
yield {"len": len(response.text), "last_chars": response.text[-40:]}
def on_bytes_received(self, data, request, spider):
raise scrapy.exceptions.StopDownload(fail=False)
which produces the following output::
2020-05-19 17:26:12 [scrapy.core.engine] INFO: Spider opened
2020-05-19 17:26:12 [scrapy.extensions.logstats] INFO: Crawled 0 pages (at 0 pages/min), scraped 0 items (at 0 items/min)
2020-05-19 17:26:13 [scrapy.core.downloader.handlers.http11] DEBUG: Download stopped for <GET https://docs.scrapy.org/en/latest/> from signal handler StopSpider.on_bytes_received
2020-05-19 17:26:13 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://docs.scrapy.org/en/latest/> (referer: None) ['download_stopped']
2020-05-19 17:26:13 [scrapy.core.scraper] DEBUG: Scraped from <200 https://docs.scrapy.org/en/latest/>
{'len': 279, 'last_chars': 'dth, initial-scale=1.0">\n \n <title>Scr'}
2020-05-19 17:26:13 [scrapy.core.engine] INFO: Closing spider (finished)
By default, resulting responses are handled by their corresponding errbacks. To
call their callback instead, like in this example, pass ``fail=False`` to the
:exc:`~scrapy.exceptions.StopDownload` exception.
.. _topics-request-response-ref-request-subclasses: .. _topics-request-response-ref-request-subclasses:
Request subclasses Request subclasses
@ -566,12 +638,10 @@ dealing with JSON requests.
set to ``'POST'`` automatically. set to ``'POST'`` automatically.
:type data: JSON serializable object :type data: JSON serializable object
:param dumps_kwargs: Parameters that will be passed to underlying `json.dumps`_ method which is used to serialize :param dumps_kwargs: Parameters that will be passed to underlying :func:`json.dumps` method which is used to serialize
data into JSON format. data into JSON format.
:type dumps_kwargs: dict :type dumps_kwargs: dict
.. _json.dumps: https://docs.python.org/3/library/json.html#json.dumps
JsonRequest usage example JsonRequest usage example
------------------------- -------------------------
@ -620,6 +690,12 @@ Response objects
:param certificate: an object representing the server's SSL certificate. :param certificate: an object representing the server's SSL certificate.
:type certificate: twisted.internet.ssl.Certificate :type certificate: twisted.internet.ssl.Certificate
:param ip_address: The IP address of the server from which the Response originated.
:type ip_address: :class:`ipaddress.IPv4Address` or :class:`ipaddress.IPv6Address`
.. versionadded:: 2.1.0
The ``ip_address`` parameter.
.. attribute:: Response.url .. attribute:: Response.url
A string containing the URL of the response. A string containing the URL of the response.
@ -706,9 +782,19 @@ Response objects
A :class:`twisted.internet.ssl.Certificate` object representing A :class:`twisted.internet.ssl.Certificate` object representing
the server's SSL certificate. the server's SSL certificate.
Only populated for ``https`` responses, ``None`` otherwise. Only populated for ``https`` responses, ``None`` otherwise.
.. attribute:: Response.ip_address
.. versionadded:: 2.1.0
The IP address of the server from which the Response originated.
This attribute is currently only populated by the HTTP 1.1 download
handler, i.e. for ``http(s)`` responses. For other handlers,
:attr:`ip_address` is always ``None``.
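For example (a minimal sketch inside a spider callback)::

    def parse(self, response):
        if response.ip_address is not None:
            self.logger.info("Response served from %s", response.ip_address)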
.. method:: Response.copy() .. method:: Response.copy()
Returns a new Response which is a copy of this Response. Returns a new Response which is a copy of this Response.
@ -724,18 +810,16 @@ Response objects
Constructs an absolute url by combining the Response's :attr:`url` with Constructs an absolute url by combining the Response's :attr:`url` with
a possible relative url. a possible relative url.
This is a wrapper over `urlparse.urljoin`_, it's merely an alias for This is a wrapper over :func:`~urllib.parse.urljoin`, it's merely an alias for
making this call:: making this call::
urlparse.urljoin(response.url, url) urllib.parse.urljoin(response.url, url)
.. automethod:: Response.follow .. automethod:: Response.follow
.. automethod:: Response.follow_all .. automethod:: Response.follow_all
.. _urlparse.urljoin: https://docs.python.org/2/library/urlparse.html#urlparse.urljoin
.. _topics-request-response-ref-response-subclasses: .. _topics-request-response-ref-response-subclasses:
Response subclasses Response subclasses
@ -824,10 +908,10 @@ TextResponse objects
.. automethod:: TextResponse.follow_all .. automethod:: TextResponse.follow_all
.. method:: TextResponse.body_as_unicode() .. automethod:: TextResponse.json()
The same as :attr:`text`, but available as a method. This method is Returns a Python object from deserialized JSON document.
kept for backward compatibility; please prefer ``response.text``. The result is cached after the first call.
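A short sketch of the method in use; the endpoint and payload field names are made up, and the call only works when the response body is valid JSON::

    import scrapy


    class ApiSpider(scrapy.Spider):
        name = 'api'  # hypothetical spider name
        start_urls = ['https://example.com/api/items']  # placeholder JSON endpoint

        def parse(self, response):
            data = response.json()  # parsed once, cached on later calls
            for item in data.get('items', []):  # assumed payload structure
                yield {'id': item['id'], 'name': item['name']}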
HtmlResponse objects HtmlResponse objects

View File

@ -14,7 +14,7 @@ achieve this, such as:
drawback: it's slow. drawback: it's slow.
* `lxml`_ is an XML parsing library (which also parses HTML) with a pythonic * `lxml`_ is an XML parsing library (which also parses HTML) with a pythonic
API based on `ElementTree`_. (lxml is not part of the Python standard API based on :mod:`~xml.etree.ElementTree`. (lxml is not part of the Python standard
library.) library.)
Scrapy comes with its own mechanism for extracting data. They're called Scrapy comes with its own mechanism for extracting data. They're called
@ -36,7 +36,6 @@ defines selectors to associate those styles with specific HTML elements.
.. _BeautifulSoup: https://www.crummy.com/software/BeautifulSoup/ .. _BeautifulSoup: https://www.crummy.com/software/BeautifulSoup/
.. _lxml: https://lxml.de/ .. _lxml: https://lxml.de/
.. _ElementTree: https://docs.python.org/2/library/xml.etree.elementtree.html
.. _XPath: https://www.w3.org/TR/xpath/all/ .. _XPath: https://www.w3.org/TR/xpath/all/
.. _CSS: https://www.w3.org/TR/selectors .. _CSS: https://www.w3.org/TR/selectors
.. _parsel: https://parsel.readthedocs.io/en/latest/ .. _parsel: https://parsel.readthedocs.io/en/latest/

View File

@ -26,9 +26,7 @@ do this by using an environment variable, ``SCRAPY_SETTINGS_MODULE``.
The value of ``SCRAPY_SETTINGS_MODULE`` should be in Python path syntax, e.g. The value of ``SCRAPY_SETTINGS_MODULE`` should be in Python path syntax, e.g.
``myproject.settings``. Note that the settings module should be on the ``myproject.settings``. Note that the settings module should be on the
Python `import search path`_. Python :ref:`import search path <tut-searchpath>`.
.. _import search path: https://docs.python.org/2/tutorial/modules.html#the-module-search-path
.. _populating-settings: .. _populating-settings:
@ -238,8 +236,8 @@ CONCURRENT_ITEMS
Default: ``100`` Default: ``100``
Maximum number of concurrent items (per response) to process in parallel in the Maximum number of concurrent items (per response) to process in parallel in
Item Processor (also known as the :ref:`Item Pipeline <topics-item-pipeline>`). :ref:`item pipelines <topics-item-pipeline>`.
.. setting:: CONCURRENT_REQUESTS .. setting:: CONCURRENT_REQUESTS
@ -422,10 +420,9 @@ connections (for ``HTTP10DownloadHandler``).
.. note:: .. note::
HTTP/1.0 is rarely used nowadays so you can safely ignore this setting, HTTP/1.0 is rarely used nowadays so you can safely ignore this setting,
unless you use Twisted<11.1, or if you really want to use HTTP/1.0 unless you really want to use HTTP/1.0 and override
and override :setting:`DOWNLOAD_HANDLERS_BASE` for ``http(s)`` scheme :setting:`DOWNLOAD_HANDLERS` for ``http(s)`` scheme accordingly,
accordingly, i.e. to i.e. to ``'scrapy.core.downloader.handlers.http.HTTP10DownloadHandler'``.
``'scrapy.core.downloader.handlers.http.HTTP10DownloadHandler'``.
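For reference, that override would look roughly like this in a project's ``settings.py`` (a sketch using the handler path quoted above)::

    # settings.py (sketch): force the legacy HTTP/1.0 handler for http(s)
    DOWNLOAD_HANDLERS = {
        'http': 'scrapy.core.downloader.handlers.http.HTTP10DownloadHandler',
        'https': 'scrapy.core.downloader.handlers.http.HTTP10DownloadHandler',
    }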
.. setting:: DOWNLOADER_CLIENTCONTEXTFACTORY .. setting:: DOWNLOADER_CLIENTCONTEXTFACTORY
@ -449,7 +446,6 @@ or even enable client-side authentication (and various other things).
Scrapy also has another context factory class that you can set, Scrapy also has another context factory class that you can set,
``'scrapy.core.downloader.contextfactory.BrowserLikeContextFactory'``, ``'scrapy.core.downloader.contextfactory.BrowserLikeContextFactory'``,
which uses the platform's certificates to validate remote endpoints. which uses the platform's certificates to validate remote endpoints.
**This is only available if you use Twisted>=14.0.**
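Enabling it is a single setting, sketched here with the class path quoted above::

    # settings.py (sketch): validate remote endpoints with the platform's certificates
    DOWNLOADER_CLIENTCONTEXTFACTORY = 'scrapy.core.downloader.contextfactory.BrowserLikeContextFactory'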
If you do use a custom ContextFactory, make sure its ``__init__`` method If you do use a custom ContextFactory, make sure its ``__init__`` method
accepts a ``method`` parameter (this is the ``OpenSSL.SSL`` method mapping accepts a ``method`` parameter (this is the ``OpenSSL.SSL`` method mapping
@ -473,7 +469,7 @@ necessary to access certain HTTPS websites: for example, you may need to use
``'DEFAULT:!DH'`` for a website with weak DH parameters or enable a ``'DEFAULT:!DH'`` for a website with weak DH parameters or enable a
specific cipher that is not included in ``DEFAULT`` if a website requires it. specific cipher that is not included in ``DEFAULT`` if a website requires it.
.. _OpenSSL cipher list format: https://www.openssl.org/docs/manmaster/man1/ciphers.html#CIPHER-LIST-FORMAT .. _OpenSSL cipher list format: https://www.openssl.org/docs/manmaster/man1/openssl-ciphers.html#CIPHER-LIST-FORMAT
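For example, the adjustment mentioned above is a plain settings entry (a sketch using the value quoted in the text)::

    # settings.py (sketch): accept weak DH parameters on a specific target site
    DOWNLOADER_CLIENT_TLS_CIPHERS = 'DEFAULT:!DH'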
.. setting:: DOWNLOADER_CLIENT_TLS_METHOD .. setting:: DOWNLOADER_CLIENT_TLS_METHOD
@ -496,10 +492,6 @@ This setting must be one of these string values:
- ``'TLSv1.2'``: forces TLS version 1.2 - ``'TLSv1.2'``: forces TLS version 1.2
- ``'SSLv3'``: forces SSL version 3 (**not recommended**) - ``'SSLv3'``: forces SSL version 3 (**not recommended**)
.. note::
We recommend that you use PyOpenSSL>=0.13 and Twisted>=0.13
or above (Twisted>=14.0 if you can).
.. setting:: DOWNLOADER_CLIENT_TLS_VERBOSE_LOGGING .. setting:: DOWNLOADER_CLIENT_TLS_VERBOSE_LOGGING
@ -662,8 +654,6 @@ If you want to disable it set to 0.
spider attribute and per-request using :reqmeta:`download_maxsize` spider attribute and per-request using :reqmeta:`download_maxsize`
Request.meta key. Request.meta key.
This feature needs Twisted >= 11.1.
.. setting:: DOWNLOAD_WARNSIZE .. setting:: DOWNLOAD_WARNSIZE
DOWNLOAD_WARNSIZE DOWNLOAD_WARNSIZE
@ -681,8 +671,6 @@ If you want to disable it set to 0.
spider attribute and per-request using :reqmeta:`download_warnsize` spider attribute and per-request using :reqmeta:`download_warnsize`
Request.meta key. Request.meta key.
This feature needs Twisted >= 11.1.
.. setting:: DOWNLOAD_FAIL_ON_DATALOSS .. setting:: DOWNLOAD_FAIL_ON_DATALOSS
DOWNLOAD_FAIL_ON_DATALOSS DOWNLOAD_FAIL_ON_DATALOSS
@ -899,10 +887,9 @@ LOG_FORMAT
Default: ``'%(asctime)s [%(name)s] %(levelname)s: %(message)s'`` Default: ``'%(asctime)s [%(name)s] %(levelname)s: %(message)s'``
String for formatting log messages. Refer to the `Python logging documentation`_ for the whole list of available String for formatting log messages. Refer to the
placeholders. :ref:`Python logging documentation <logrecord-attributes>` for the whole
list of available placeholders.
.. _Python logging documentation: https://docs.python.org/2/library/logging.html#logrecord-attributes
.. setting:: LOG_DATEFORMAT .. setting:: LOG_DATEFORMAT
@ -912,10 +899,9 @@ LOG_DATEFORMAT
Default: ``'%Y-%m-%d %H:%M:%S'`` Default: ``'%Y-%m-%d %H:%M:%S'``
String for formatting date/time, expansion of the ``%(asctime)s`` placeholder String for formatting date/time, expansion of the ``%(asctime)s`` placeholder
in :setting:`LOG_FORMAT`. Refer to the `Python datetime documentation`_ for the whole list of available in :setting:`LOG_FORMAT`. Refer to the
directives. :ref:`Python datetime documentation <strftime-strptime-behavior>` for the
whole list of available directives.
.. _Python datetime documentation: https://docs.python.org/2/library/datetime.html#strftime-and-strptime-behavior
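As an illustration, both logging settings are plain strings; this sketch simply spells out the documented defaults::

    # settings.py (sketch): the documented defaults, spelled out explicitly
    LOG_FORMAT = '%(asctime)s [%(name)s] %(levelname)s: %(message)s'
    LOG_DATEFORMAT = '%Y-%m-%d %H:%M:%S'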
.. setting:: LOG_FORMATTER .. setting:: LOG_FORMATTER
@ -1116,17 +1102,6 @@ multi-purpose thread pool used by various Scrapy components. Threaded
DNS Resolver, BlockingFeedStorage, S3FilesStore just to name a few. Increase DNS Resolver, BlockingFeedStorage, S3FilesStore just to name a few. Increase
this value if you're experiencing problems with insufficient blocking IO. this value if you're experiencing problems with insufficient blocking IO.
.. setting:: REDIRECT_MAX_TIMES
REDIRECT_MAX_TIMES
------------------
Default: ``20``
Defines the maximum times a request can be redirected. After this maximum the
request's response is returned as is. We used Firefox default value for the
same task.
.. setting:: REDIRECT_PRIORITY_ADJUST .. setting:: REDIRECT_PRIORITY_ADJUST
REDIRECT_PRIORITY_ADJUST REDIRECT_PRIORITY_ADJUST
@ -1422,17 +1397,6 @@ Default: ``True``
A boolean which specifies if the :ref:`telnet console <topics-telnetconsole>` A boolean which specifies if the :ref:`telnet console <topics-telnetconsole>`
will be enabled (provided its extension is also enabled). will be enabled (provided its extension is also enabled).
.. setting:: TELNETCONSOLE_PORT
TELNETCONSOLE_PORT
------------------
Default: ``[6023, 6073]``
The port range to use for the telnet console. If set to ``None`` or ``0``, a
dynamically assigned port is used. For more info see
:ref:`topics-telnetconsole`.
.. setting:: TEMPLATES_DIR .. setting:: TEMPLATES_DIR
TEMPLATES_DIR TEMPLATES_DIR

View File

@ -156,6 +156,17 @@ First, we launch the shell::
scrapy shell 'https://scrapy.org' --nolog scrapy shell 'https://scrapy.org' --nolog
.. note::
Remember to always enclose URLs in quotes when running the Scrapy shell from
the command line, otherwise URLs containing arguments (i.e. the ``&`` character)
will not work.
On Windows, use double quotes instead::
scrapy shell "https://scrapy.org" --nolog
Then, the shell fetches the URL (using the Scrapy downloader) and prints the Then, the shell fetches the URL (using the Scrapy downloader) and prints the
list of available objects and useful shortcuts (you'll notice that these lines list of available objects and useful shortcuts (you'll notice that these lines
all start with the ``[s]`` prefix):: all start with the ``[s]`` prefix)::

View File

@ -16,8 +16,7 @@ deliver the arguments that the handler receives.
You can connect to signals (or send your own) through the You can connect to signals (or send your own) through the
:ref:`topics-api-signals`. :ref:`topics-api-signals`.
Here is a simple example showing how you can catch signals and perform some action: Here is a simple example showing how you can catch signals and perform some action::
::
from scrapy import signals from scrapy import signals
from scrapy import Spider from scrapy import Spider
@ -52,9 +51,45 @@ Deferred signal handlers
======================== ========================
Some signals support returning :class:`~twisted.internet.defer.Deferred` Some signals support returning :class:`~twisted.internet.defer.Deferred`
objects from their handlers, see the :ref:`topics-signals-ref` below to know objects from their handlers, allowing you to run asynchronous code that
which ones. does not block Scrapy. If a signal handler returns a
:class:`~twisted.internet.defer.Deferred`, Scrapy waits for that
:class:`~twisted.internet.defer.Deferred` to fire.
Let's take an example::
    import json

    import scrapy
    import treq
    from scrapy import signals


    class SignalSpider(scrapy.Spider):
        name = 'signals'
        start_urls = ['http://quotes.toscrape.com/page/1/']

        @classmethod
        def from_crawler(cls, crawler, *args, **kwargs):
            spider = super(SignalSpider, cls).from_crawler(crawler, *args, **kwargs)
            crawler.signals.connect(spider.item_scraped, signal=signals.item_scraped)
            return spider

        def item_scraped(self, item):
            # Send the scraped item to the server
            d = treq.post(
                'http://example.com/post',
                json.dumps(item).encode('ascii'),
                headers={b'Content-Type': [b'application/json']}
            )

            # The next item will be scraped only after
            # deferred (d) is fired
            return d

        def parse(self, response):
            for quote in response.css('div.quote'):
                yield {
                    'text': quote.css('span.text::text').get(),
                    'author': quote.css('small.author::text').get(),
                    'tags': quote.css('div.tags a.tag::text').getall(),
                }
See the :ref:`topics-signals-ref` below to know which signals support
:class:`~twisted.internet.defer.Deferred`.
.. _topics-signals-ref: .. _topics-signals-ref:
@ -66,22 +101,25 @@ Built-in signals reference
Here's the list of Scrapy built-in signals and their meaning. Here's the list of Scrapy built-in signals and their meaning.
engine_started Engine signals
-------------- --------------
engine_started
~~~~~~~~~~~~~~
.. signal:: engine_started .. signal:: engine_started
.. function:: engine_started() .. function:: engine_started()
Sent when the Scrapy engine has started crawling. Sent when the Scrapy engine has started crawling.
This signal supports returning deferreds from their handlers. This signal supports returning deferreds from its handlers.
.. note:: This signal may be fired *after* the :signal:`spider_opened` signal, .. note:: This signal may be fired *after* the :signal:`spider_opened` signal,
depending on how the spider was started. So **don't** rely on this signal depending on how the spider was started. So **don't** rely on this signal
getting fired before :signal:`spider_opened`. getting fired before :signal:`spider_opened`.
engine_stopped engine_stopped
-------------- ~~~~~~~~~~~~~~
.. signal:: engine_stopped .. signal:: engine_stopped
.. function:: engine_stopped() .. function:: engine_stopped()
@ -89,10 +127,21 @@ engine_stopped
Sent when the Scrapy engine is stopped (for example, when a crawling Sent when the Scrapy engine is stopped (for example, when a crawling
process has finished). process has finished).
This signal supports returning deferreds from their handlers. This signal supports returning deferreds from its handlers.
Item signals
------------
.. note::
As at most :setting:`CONCURRENT_ITEMS` items are processed in
parallel, many deferreds are fired together using
:class:`~twisted.internet.defer.DeferredList`. Hence the next
batch waits for the :class:`~twisted.internet.defer.DeferredList`
to fire and then runs the respective item signal handler for
the next batch of scraped items.
item_scraped item_scraped
------------ ~~~~~~~~~~~~
.. signal:: item_scraped .. signal:: item_scraped
.. function:: item_scraped(item, response, spider) .. function:: item_scraped(item, response, spider)
@ -100,10 +149,10 @@ item_scraped
Sent when an item has been scraped, after it has passed all the Sent when an item has been scraped, after it has passed all the
:ref:`topics-item-pipeline` stages (without being dropped). :ref:`topics-item-pipeline` stages (without being dropped).
This signal supports returning deferreds from their handlers. This signal supports returning deferreds from its handlers.
:param item: the item scraped :param item: the scraped item
:type item: dict or :class:`~scrapy.item.Item` object :type item: :ref:`item object <item-types>`
:param spider: the spider which scraped the item :param spider: the spider which scraped the item
:type spider: :class:`~scrapy.spiders.Spider` object :type spider: :class:`~scrapy.spiders.Spider` object
@ -112,7 +161,7 @@ item_scraped
:type response: :class:`~scrapy.http.Response` object :type response: :class:`~scrapy.http.Response` object
item_dropped item_dropped
------------ ~~~~~~~~~~~~
.. signal:: item_dropped .. signal:: item_dropped
.. function:: item_dropped(item, response, exception, spider) .. function:: item_dropped(item, response, exception, spider)
@ -120,10 +169,10 @@ item_dropped
Sent after an item has been dropped from the :ref:`topics-item-pipeline` Sent after an item has been dropped from the :ref:`topics-item-pipeline`
when some stage raised a :exc:`~scrapy.exceptions.DropItem` exception. when some stage raised a :exc:`~scrapy.exceptions.DropItem` exception.
This signal supports returning deferreds from their handlers. This signal supports returning deferreds from its handlers.
:param item: the item dropped from the :ref:`topics-item-pipeline` :param item: the item dropped from the :ref:`topics-item-pipeline`
:type item: dict or :class:`~scrapy.item.Item` object :type item: :ref:`item object <item-types>`
:param spider: the spider which scraped the item :param spider: the spider which scraped the item
:type spider: :class:`~scrapy.spiders.Spider` object :type spider: :class:`~scrapy.spiders.Spider` object
@ -137,7 +186,7 @@ item_dropped
:type exception: :exc:`~scrapy.exceptions.DropItem` exception :type exception: :exc:`~scrapy.exceptions.DropItem` exception
item_error item_error
------------ ~~~~~~~~~~
.. signal:: item_error .. signal:: item_error
.. function:: item_error(item, response, spider, failure) .. function:: item_error(item, response, spider, failure)
@ -145,10 +194,10 @@ item_error
Sent when a :ref:`topics-item-pipeline` generates an error (i.e. raises Sent when a :ref:`topics-item-pipeline` generates an error (i.e. raises
an exception), except :exc:`~scrapy.exceptions.DropItem` exception. an exception), except :exc:`~scrapy.exceptions.DropItem` exception.
This signal supports returning deferreds from their handlers. This signal supports returning deferreds from its handlers.
:param item: the item dropped from the :ref:`topics-item-pipeline` :param item: the item that caused the error in the :ref:`topics-item-pipeline`
:type item: dict or :class:`~scrapy.item.Item` object :type item: :ref:`item object <item-types>`
:param response: the response being processed when the exception was raised :param response: the response being processed when the exception was raised
:type response: :class:`~scrapy.http.Response` object :type response: :class:`~scrapy.http.Response` object
@ -159,8 +208,11 @@ item_error
:param failure: the exception raised :param failure: the exception raised
:type failure: twisted.python.failure.Failure :type failure: twisted.python.failure.Failure
Spider signals
--------------
spider_closed spider_closed
------------- ~~~~~~~~~~~~~
.. signal:: spider_closed .. signal:: spider_closed
.. function:: spider_closed(spider, reason) .. function:: spider_closed(spider, reason)
@ -168,7 +220,7 @@ spider_closed
Sent after a spider has been closed. This can be used to release per-spider Sent after a spider has been closed. This can be used to release per-spider
resources reserved on :signal:`spider_opened`. resources reserved on :signal:`spider_opened`.
This signal supports returning deferreds from their handlers. This signal supports returning deferreds from its handlers.
:param spider: the spider which has been closed :param spider: the spider which has been closed
:type spider: :class:`~scrapy.spiders.Spider` object :type spider: :class:`~scrapy.spiders.Spider` object
@ -183,7 +235,7 @@ spider_closed
:type reason: str :type reason: str
spider_opened spider_opened
------------- ~~~~~~~~~~~~~
.. signal:: spider_opened .. signal:: spider_opened
.. function:: spider_opened(spider) .. function:: spider_opened(spider)
@ -192,13 +244,13 @@ spider_opened
reserve per-spider resources, but can be used for any task that needs to be reserve per-spider resources, but can be used for any task that needs to be
performed when a spider is opened. performed when a spider is opened.
This signal supports returning deferreds from their handlers. This signal supports returning deferreds from its handlers.
:param spider: the spider which has been opened :param spider: the spider which has been opened
:type spider: :class:`~scrapy.spiders.Spider` object :type spider: :class:`~scrapy.spiders.Spider` object
spider_idle spider_idle
----------- ~~~~~~~~~~~
.. signal:: spider_idle .. signal:: spider_idle
.. function:: spider_idle(spider) .. function:: spider_idle(spider)
@ -216,7 +268,7 @@ spider_idle
You may raise a :exc:`~scrapy.exceptions.DontCloseSpider` exception to You may raise a :exc:`~scrapy.exceptions.DontCloseSpider` exception to
prevent the spider from being closed. prevent the spider from being closed.
This signal does not support returning deferreds from their handlers. This signal does not support returning deferreds from its handlers.
:param spider: the spider which has gone idle :param spider: the spider which has gone idle
:type spider: :class:`~scrapy.spiders.Spider` object :type spider: :class:`~scrapy.spiders.Spider` object
@ -228,14 +280,14 @@ spider_idle
due to duplication). due to duplication).
spider_error spider_error
------------ ~~~~~~~~~~~~
.. signal:: spider_error .. signal:: spider_error
.. function:: spider_error(failure, response, spider) .. function:: spider_error(failure, response, spider)
Sent when a spider callback generates an error (i.e. raises an exception). Sent when a spider callback generates an error (i.e. raises an exception).
This signal does not support returning deferreds from their handlers. This signal does not support returning deferreds from its handlers.
:param failure: the exception raised :param failure: the exception raised
:type failure: twisted.python.failure.Failure :type failure: twisted.python.failure.Failure
@ -246,8 +298,11 @@ spider_error
:param spider: the spider which raised the exception :param spider: the spider which raised the exception
:type spider: :class:`~scrapy.spiders.Spider` object :type spider: :class:`~scrapy.spiders.Spider` object
Request signals
---------------
request_scheduled request_scheduled
----------------- ~~~~~~~~~~~~~~~~~
.. signal:: request_scheduled .. signal:: request_scheduled
.. function:: request_scheduled(request, spider) .. function:: request_scheduled(request, spider)
@ -255,7 +310,7 @@ request_scheduled
Sent when the engine schedules a :class:`~scrapy.http.Request`, to be Sent when the engine schedules a :class:`~scrapy.http.Request`, to be
downloaded later. downloaded later.
The signal does not support returning deferreds from their handlers. This signal does not support returning deferreds from its handlers.
:param request: the request that reached the scheduler :param request: the request that reached the scheduler
:type request: :class:`~scrapy.http.Request` object :type request: :class:`~scrapy.http.Request` object
@ -264,7 +319,7 @@ request_scheduled
:type spider: :class:`~scrapy.spiders.Spider` object :type spider: :class:`~scrapy.spiders.Spider` object
request_dropped request_dropped
--------------- ~~~~~~~~~~~~~~~
.. signal:: request_dropped .. signal:: request_dropped
.. function:: request_dropped(request, spider) .. function:: request_dropped(request, spider)
@ -272,7 +327,7 @@ request_dropped
Sent when a :class:`~scrapy.http.Request`, scheduled by the engine to be Sent when a :class:`~scrapy.http.Request`, scheduled by the engine to be
downloaded later, is rejected by the scheduler. downloaded later, is rejected by the scheduler.
The signal does not support returning deferreds from their handlers. This signal does not support returning deferreds from its handlers.
:param request: the request that reached the scheduler :param request: the request that reached the scheduler
:type request: :class:`~scrapy.http.Request` object :type request: :class:`~scrapy.http.Request` object
@ -281,14 +336,14 @@ request_dropped
:type spider: :class:`~scrapy.spiders.Spider` object :type spider: :class:`~scrapy.spiders.Spider` object
request_reached_downloader request_reached_downloader
--------------------------- ~~~~~~~~~~~~~~~~~~~~~~~~~~
.. signal:: request_reached_downloader .. signal:: request_reached_downloader
.. function:: request_reached_downloader(request, spider) .. function:: request_reached_downloader(request, spider)
Sent when a :class:`~scrapy.http.Request` reached downloader. Sent when a :class:`~scrapy.http.Request` reached downloader.
The signal does not support returning deferreds from their handlers. This signal does not support returning deferreds from its handlers.
:param request: the request that reached downloader :param request: the request that reached downloader
:type request: :class:`~scrapy.http.Request` object :type request: :class:`~scrapy.http.Request` object
@ -297,7 +352,7 @@ request_reached_downloader
:type spider: :class:`~scrapy.spiders.Spider` object :type spider: :class:`~scrapy.spiders.Spider` object
request_left_downloader request_left_downloader
----------------------- ~~~~~~~~~~~~~~~~~~~~~~~
.. signal:: request_left_downloader .. signal:: request_left_downloader
.. function:: request_left_downloader(request, spider) .. function:: request_left_downloader(request, spider)
@ -315,8 +370,41 @@ request_left_downloader
:param spider: the spider that yielded the request :param spider: the spider that yielded the request
:type spider: :class:`~scrapy.spiders.Spider` object :type spider: :class:`~scrapy.spiders.Spider` object
bytes_received
~~~~~~~~~~~~~~
.. versionadded:: 2.2
.. signal:: bytes_received
.. function:: bytes_received(data, request, spider)
Sent by the HTTP 1.1 and S3 download handlers when a group of bytes is
received for a specific request. This signal might be fired multiple
times for the same request, with partial data each time. For instance,
a possible scenario for a 25 kb response would be two signals fired
with 10 kb of data, and a final one with 5 kb of data.
This signal does not support returning deferreds from its handlers.
:param data: the data received by the download handler
:type data: :class:`bytes` object
:param request: the request that generated the download
:type request: :class:`~scrapy.http.Request` object
:param spider: the spider associated with the response
:type spider: :class:`~scrapy.spiders.Spider` object
.. note:: Handlers of this signal can stop the download of a response while it
is in progress by raising the :exc:`~scrapy.exceptions.StopDownload`
exception. Please refer to the :ref:`topics-stop-response-download` topic
for additional information and examples.
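To make the handler signature concrete, here is a hedged sketch of an extension that tallies bytes as they arrive; the class name is made up, and like any extension it would still need to be enabled through the ``EXTENSIONS`` setting::

    from collections import defaultdict

    from scrapy import signals


    class BytesReceivedLogger:
        """Hypothetical extension: log the running byte count per request."""

        def __init__(self):
            self.received = defaultdict(int)

        @classmethod
        def from_crawler(cls, crawler):
            ext = cls()
            crawler.signals.connect(ext.bytes_received, signal=signals.bytes_received)
            return ext

        def bytes_received(self, data, request, spider):
            self.received[request.url] += len(data)
            spider.logger.debug('%s: %d bytes so far', request.url, self.received[request.url])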
Response signals
----------------
response_received response_received
----------------- ~~~~~~~~~~~~~~~~~
.. signal:: response_received .. signal:: response_received
.. function:: response_received(response, request, spider) .. function:: response_received(response, request, spider)
@ -324,7 +412,7 @@ response_received
Sent when the engine receives a new :class:`~scrapy.http.Response` from the Sent when the engine receives a new :class:`~scrapy.http.Response` from the
downloader. downloader.
This signal does not support returning deferreds from their handlers. This signal does not support returning deferreds from its handlers.
:param response: the response received :param response: the response received
:type response: :class:`~scrapy.http.Response` object :type response: :class:`~scrapy.http.Response` object
@ -336,14 +424,14 @@ response_received
:type spider: :class:`~scrapy.spiders.Spider` object :type spider: :class:`~scrapy.spiders.Spider` object
response_downloaded response_downloaded
------------------- ~~~~~~~~~~~~~~~~~~~
.. signal:: response_downloaded .. signal:: response_downloaded
.. function:: response_downloaded(response, request, spider) .. function:: response_downloaded(response, request, spider)
Sent by the downloader right after an ``HTTPResponse`` is downloaded. Sent by the downloader right after an ``HTTPResponse`` is downloaded.
This signal does not support returning deferreds from their handlers. This signal does not support returning deferreds from its handlers.
:param response: the response downloaded :param response: the response downloaded
:type response: :class:`~scrapy.http.Response` object :type response: :class:`~scrapy.http.Response` object

View File

@ -102,29 +102,28 @@ object gives you access, for example, to the :ref:`settings <topics-settings>`.
it has processed the response. it has processed the response.
:meth:`process_spider_output` must return an iterable of :meth:`process_spider_output` must return an iterable of
:class:`~scrapy.http.Request`, dict or :class:`~scrapy.item.Item` :class:`~scrapy.http.Request` objects and :ref:`item object
objects. <topics-items>`.
:param response: the response which generated this output from the :param response: the response which generated this output from the
spider spider
:type response: :class:`~scrapy.http.Response` object :type response: :class:`~scrapy.http.Response` object
:param result: the result returned by the spider :param result: the result returned by the spider
:type result: an iterable of :class:`~scrapy.http.Request`, dict :type result: an iterable of :class:`~scrapy.http.Request` objects and
or :class:`~scrapy.item.Item` objects :ref:`item object <topics-items>`
:param spider: the spider whose result is being processed :param spider: the spider whose result is being processed
:type spider: :class:`~scrapy.spiders.Spider` object :type spider: :class:`~scrapy.spiders.Spider` object
.. method:: process_spider_exception(response, exception, spider) .. method:: process_spider_exception(response, exception, spider)
This method is called when a spider or :meth:`process_spider_output` This method is called when a spider or :meth:`process_spider_output`
method (from a previous spider middleware) raises an exception. method (from a previous spider middleware) raises an exception.
:meth:`process_spider_exception` should return either ``None`` or an :meth:`process_spider_exception` should return either ``None`` or an
iterable of :class:`~scrapy.http.Request`, dict or iterable of :class:`~scrapy.http.Request` objects and :ref:`item object
:class:`~scrapy.item.Item` objects. <topics-items>`.
If it returns ``None``, Scrapy will continue processing this exception, If it returns ``None``, Scrapy will continue processing this exception,
executing any other :meth:`process_spider_exception` in the following executing any other :meth:`process_spider_exception` in the following
@ -140,7 +139,7 @@ object gives you access, for example, to the :ref:`settings <topics-settings>`.
:type response: :class:`~scrapy.http.Response` object :type response: :class:`~scrapy.http.Response` object
:param exception: the exception raised :param exception: the exception raised
:type exception: `Exception`_ object :type exception: :exc:`Exception` object
:param spider: the spider which raised the exception :param spider: the spider which raised the exception
:type spider: :class:`~scrapy.spiders.Spider` object :type spider: :class:`~scrapy.spiders.Spider` object
@ -173,20 +172,16 @@ object gives you access, for example, to the :ref:`settings <topics-settings>`.
:type spider: :class:`~scrapy.spiders.Spider` object :type spider: :class:`~scrapy.spiders.Spider` object
.. method:: from_crawler(cls, crawler) .. method:: from_crawler(cls, crawler)
If present, this classmethod is called to create a middleware instance If present, this classmethod is called to create a middleware instance
from a :class:`~scrapy.crawler.Crawler`. It must return a new instance from a :class:`~scrapy.crawler.Crawler`. It must return a new instance
of the middleware. Crawler object provides access to all Scrapy core of the middleware. Crawler object provides access to all Scrapy core
components like settings and signals; it is a way for middleware to components like settings and signals; it is a way for middleware to
access them and hook its functionality into Scrapy. access them and hook its functionality into Scrapy.
:param crawler: crawler that uses this middleware :param crawler: crawler that uses this middleware
:type crawler: :class:`~scrapy.crawler.Crawler` object :type crawler: :class:`~scrapy.crawler.Crawler` object
.. _Exception: https://docs.python.org/2/library/exceptions.html#exceptions.Exception
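Putting the hooks above together, a hedged sketch of a trivial spider middleware could look like this; the class name is made up, and it would be enabled through the ``SPIDER_MIDDLEWARES`` setting::

    import logging

    logger = logging.getLogger(__name__)


    class OutputCountMiddleware:
        """Hypothetical middleware: log how many results each callback yields."""

        @classmethod
        def from_crawler(cls, crawler):
            # Settings, signals and stats are reachable through the crawler here
            return cls()

        def process_spider_output(self, response, result, spider):
            count = 0
            for element in result:
                count += 1
                yield element  # requests and items pass through unchanged
            logger.debug('%s: %d results from %s', spider.name, count, response.url)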
.. _topics-spider-middleware-ref: .. _topics-spider-middleware-ref:
Built-in spider middleware reference Built-in spider middleware reference

View File

@ -23,8 +23,8 @@ For spiders, the scraping cycle goes through something like this:
:attr:`~scrapy.spiders.Spider.parse` method as callback function for the :attr:`~scrapy.spiders.Spider.parse` method as callback function for the
Requests. Requests.
2. In the callback function, you parse the response (web page) and return either 2. In the callback function, you parse the response (web page) and return
dicts with extracted data, :class:`~scrapy.item.Item` objects, :ref:`item objects <topics-items>`,
:class:`~scrapy.http.Request` objects, or an iterable of these objects. :class:`~scrapy.http.Request` objects, or an iterable of these objects.
Those Requests will also contain a callback (maybe Those Requests will also contain a callback (maybe
the same) and will then be downloaded by Scrapy and then their the same) and will then be downloaded by Scrapy and then their
@ -121,7 +121,7 @@ scrapy.Spider
send log messages through it as described on send log messages through it as described on
:ref:`topics-logging-from-spiders`. :ref:`topics-logging-from-spiders`.
.. method:: from_crawler(crawler, \*args, \**kwargs) .. method:: from_crawler(crawler, *args, **kwargs)
This is the class method used by Scrapy to create your spiders. This is the class method used by Scrapy to create your spiders.
@ -179,8 +179,8 @@ scrapy.Spider
the same requirements as the :class:`Spider` class. the same requirements as the :class:`Spider` class.
This method, as well as any other Request callback, must return an This method, as well as any other Request callback, must return an
iterable of :class:`~scrapy.http.Request` and/or iterable of :class:`~scrapy.http.Request` and/or :ref:`item objects
dicts or :class:`~scrapy.item.Item` objects. <topics-items>`.
:param response: the response to parse :param response: the response to parse
:type response: :class:`~scrapy.http.Response` :type response: :class:`~scrapy.http.Response`
@ -234,7 +234,7 @@ Return multiple Requests and items from a single callback::
yield scrapy.Request(response.urljoin(href), self.parse) yield scrapy.Request(response.urljoin(href), self.parse)
Instead of :attr:`~.start_urls` you can use :meth:`~.start_requests` directly; Instead of :attr:`~.start_urls` you can use :meth:`~.start_requests` directly;
to give data more structure you can use :ref:`topics-items`:: to give data more structure you can use :class:`~scrapy.item.Item` objects::
import scrapy import scrapy
from myproject.items import MyItem from myproject.items import MyItem
@ -298,9 +298,7 @@ Keep in mind that spider arguments are only strings.
The spider will not do any parsing on its own. The spider will not do any parsing on its own.
If you were to set the ``start_urls`` attribute from the command line, If you were to set the ``start_urls`` attribute from the command line,
you would have to parse it on your own into a list you would have to parse it on your own into a list
using something like using something like :func:`ast.literal_eval` or :func:`json.loads`
`ast.literal_eval <https://docs.python.org/3/library/ast.html#ast.literal_eval>`_
or `json.loads <https://docs.python.org/3/library/json.html#json.loads>`_
and then set it as an attribute. and then set it as an attribute.
Otherwise, you would cause iteration over a ``start_urls`` string Otherwise, you would cause iteration over a ``start_urls`` string
(a very common python pitfall) (a very common python pitfall)
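A hedged sketch of that parsing approach, with a made-up spider name and argument format::

    import json

    import scrapy


    class ArgsSpider(scrapy.Spider):
        name = 'args'  # hypothetical spider name

        def __init__(self, start_urls=None, *args, **kwargs):
            super().__init__(*args, **kwargs)
            if start_urls:
                # e.g. scrapy crawl args -a start_urls='["https://example.com"]'
                self.start_urls = json.loads(start_urls)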
@ -366,7 +364,7 @@ CrawlSpider
This method is called for the start_urls responses. It allows parsing This method is called for the start_urls responses. It allows parsing
the initial responses and must return either an the initial responses and must return either an
:class:`~scrapy.item.Item` object, a :class:`~scrapy.http.Request` :ref:`item object <topics-items>`, a :class:`~scrapy.http.Request`
object, or an iterable containing any of them. object, or an iterable containing any of them.
Crawling rules Crawling rules
@ -385,7 +383,7 @@ Crawling rules
object with that name will be used) to be called for each link extracted with object with that name will be used) to be called for each link extracted with
the specified link extractor. This callback receives a :class:`~scrapy.http.Response` the specified link extractor. This callback receives a :class:`~scrapy.http.Response`
as its first argument and must return either a single instance or an iterable of as its first argument and must return either a single instance or an iterable of
:class:`~scrapy.item.Item`, ``dict`` and/or :class:`~scrapy.http.Request` objects :ref:`item objects <topics-items>` and/or :class:`~scrapy.http.Request` objects
(or any subclass of them). As mentioned above, the received :class:`~scrapy.http.Response` (or any subclass of them). As mentioned above, the received :class:`~scrapy.http.Response`
object will contain the text of the link that produced the :class:`~scrapy.http.Request` object will contain the text of the link that produced the :class:`~scrapy.http.Request`
in its ``meta`` dictionary (under the ``link_text`` key) in its ``meta`` dictionary (under the ``link_text`` key)
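As an illustration of such a callback (the domain, allow pattern and field names are placeholders)::

    from scrapy.linkextractors import LinkExtractor
    from scrapy.spiders import CrawlSpider, Rule


    class ItemLinksSpider(CrawlSpider):
        name = 'item_links'  # hypothetical spider name
        start_urls = ['https://example.com']  # placeholder URL

        rules = (
            Rule(LinkExtractor(allow=r'/items/'), callback='parse_item'),
        )

        def parse_item(self, response):
            yield {
                'url': response.url,
                # Text of the link that produced this request, stored in meta
                'link_text': response.meta.get('link_text'),
            }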
@ -533,7 +531,7 @@ XMLFeedSpider
(``itertag``). Receives the response and an (``itertag``). Receives the response and an
:class:`~scrapy.selector.Selector` for each node. Overriding this :class:`~scrapy.selector.Selector` for each node. Overriding this
method is mandatory. Otherwise, your spider won't work. This method method is mandatory. Otherwise, your spider won't work. This method
must return either a :class:`~scrapy.item.Item` object, a must return an :ref:`item object <topics-items>`, a
:class:`~scrapy.http.Request` object, or an iterable containing any of :class:`~scrapy.http.Request` object, or an iterable containing any of
them. them.
@ -543,7 +541,7 @@ XMLFeedSpider
spider, and it's intended to perform any final processing required spider, and it's intended to perform any final processing required
before returning the results to the framework core, for example setting the before returning the results to the framework core, for example setting the
item IDs. It receives a list of results and the response which originated item IDs. It receives a list of results and the response which originated
those results. It must return a list of results (Items or Requests). those results. It must return a list of results (items or requests).
XMLFeedSpider example XMLFeedSpider example

View File

@ -40,10 +40,10 @@ the console you need to type::
Connected to localhost. Connected to localhost.
Escape character is '^]'. Escape character is '^]'.
Username: Username:
Password: Password:
>>> >>>
By default Username is ``scrapy`` and Password is autogenerated. The By default Username is ``scrapy`` and Password is autogenerated. The
autogenerated Password can be seen on Scrapy logs like the example below:: autogenerated Password can be seen on Scrapy logs like the example below::
2018-10-16 14:35:21 [scrapy.extensions.telnet] INFO: Telnet Password: 16f92501e8a59326 2018-10-16 14:35:21 [scrapy.extensions.telnet] INFO: Telnet Password: 16f92501e8a59326
@ -63,7 +63,7 @@ Available variables in the telnet console
========================================= =========================================
The telnet console is like a regular Python shell running inside the Scrapy The telnet console is like a regular Python shell running inside the Scrapy
process, so you can do anything from it including importing new modules, etc. process, so you can do anything from it including importing new modules, etc.
However, the telnet console comes with some default variables defined for However, the telnet console comes with some default variables defined for
convenience: convenience:
@ -89,13 +89,11 @@ convenience:
+----------------+-------------------------------------------------------------------+ +----------------+-------------------------------------------------------------------+
| ``prefs`` | for memory debugging (see :ref:`topics-leaks`) | | ``prefs`` | for memory debugging (see :ref:`topics-leaks`) |
+----------------+-------------------------------------------------------------------+ +----------------+-------------------------------------------------------------------+
| ``p`` | a shortcut to the `pprint.pprint`_ function | | ``p`` | a shortcut to the :func:`pprint.pprint` function |
+----------------+-------------------------------------------------------------------+ +----------------+-------------------------------------------------------------------+
| ``hpy`` | for memory debugging (see :ref:`topics-leaks`) | | ``hpy`` | for memory debugging (see :ref:`topics-leaks`) |
+----------------+-------------------------------------------------------------------+ +----------------+-------------------------------------------------------------------+
.. _pprint.pprint: https://docs.python.org/library/pprint.html#pprint.pprint
Telnet console usage examples Telnet console usage examples
============================= =============================
@ -208,4 +206,3 @@ Default: ``None``
The password used for the telnet console, default behaviour is to have it The password used for the telnet console, default behaviour is to have it
autogenerated autogenerated

View File

@ -14,50 +14,57 @@ Author: dufferzafar
import re import re
# Used for remembering the file (and its contents)
# so we don't have to open the same file again.
_filename = None
_contents = None
# A regex that matches standard linkcheck output lines def main():
line_re = re.compile(u'(.*)\:\d+\:\s\[(.*)\]\s(?:(.*)\sto\s(.*)|(.*))')
# Read lines from the linkcheck output file # Used for remembering the file (and its contents)
try: # so we don't have to open the same file again.
with open("build/linkcheck/output.txt") as out: _filename = None
output_lines = out.readlines() _contents = None
except IOError:
print("linkcheck output not found; please run linkcheck first.")
exit(1)
# For every line, fix the respective file # A regex that matches standard linkcheck output lines
for line in output_lines: line_re = re.compile(u'(.*)\:\d+\:\s\[(.*)\]\s(?:(.*)\sto\s(.*)|(.*))')
match = re.match(line_re, line)
if match: # Read lines from the linkcheck output file
newfilename = match.group(1) try:
errortype = match.group(2) with open("build/linkcheck/output.txt") as out:
output_lines = out.readlines()
except IOError:
print("linkcheck output not found; please run linkcheck first.")
exit(1)
# Broken links can't be fixed and # For every line, fix the respective file
# I am not sure what do with the local ones. for line in output_lines:
if errortype.lower() in ["broken", "local"]: match = re.match(line_re, line)
print("Not Fixed: " + line)
if match:
newfilename = match.group(1)
errortype = match.group(2)
# Broken links can't be fixed and
# I am not sure what do with the local ones.
if errortype.lower() in ["broken", "local"]:
print("Not Fixed: " + line)
else:
# If this is a new file
if newfilename != _filename:
# Update the previous file
if _filename:
with open(_filename, "w") as _file:
_file.write(_contents)
_filename = newfilename
# Read the new file to memory
with open(_filename) as _file:
_contents = _file.read()
_contents = _contents.replace(match.group(3), match.group(4))
else: else:
# If this is a new file # We don't understand what the current line means!
if newfilename != _filename: print("Not Understood: " + line)
# Update the previous file
if _filename:
with open(_filename, "w") as _file:
_file.write(_contents)
_filename = newfilename if __name__ == '__main__':
main()
# Read the new file to memory
with open(_filename) as _file:
_contents = _file.read()
_contents = _contents.replace(match.group(3), match.group(4))
else:
# We don't understand what the current line means!
print("Not Understood: " + line)

113
pylintrc Normal file
View File

@ -0,0 +1,113 @@
[MASTER]
persistent=no
jobs=1 # >1 hides results
[MESSAGES CONTROL]
disable=abstract-method,
anomalous-backslash-in-string,
arguments-differ,
attribute-defined-outside-init,
bad-classmethod-argument,
bad-continuation,
bad-indentation,
bad-mcs-classmethod-argument,
bad-super-call,
bad-whitespace,
bare-except,
blacklisted-name,
broad-except,
c-extension-no-member,
catching-non-exception,
cell-var-from-loop,
comparison-with-callable,
consider-iterating-dictionary,
consider-using-in,
consider-using-set-comprehension,
consider-using-sys-exit,
cyclic-import,
dangerous-default-value,
deprecated-method,
deprecated-module,
duplicate-code, # https://github.com/PyCQA/pylint/issues/214
eval-used,
expression-not-assigned,
fixme,
function-redefined,
global-statement,
import-error,
import-outside-toplevel,
import-self,
inconsistent-return-statements,
inherit-non-class,
invalid-name,
invalid-overridden-method,
isinstance-second-argument-not-valid-type,
keyword-arg-before-vararg,
line-too-long,
logging-format-interpolation,
logging-not-lazy,
lost-exception,
method-hidden,
misplaced-comparison-constant,
missing-docstring,
missing-final-newline,
multiple-imports,
multiple-statements,
no-else-continue,
no-else-raise,
no-else-return,
no-init,
no-member,
no-method-argument,
no-name-in-module,
no-self-argument,
no-self-use,
no-value-for-parameter,
not-an-iterable,
not-callable,
pointless-statement,
pointless-string-statement,
protected-access,
redefined-argument-from-local,
redefined-builtin,
redefined-outer-name,
reimported,
signature-differs,
singleton-comparison,
super-init-not-called,
superfluous-parens,
too-few-public-methods,
too-many-ancestors,
too-many-arguments,
too-many-branches,
too-many-format-args,
too-many-function-args,
too-many-instance-attributes,
too-many-lines,
too-many-locals,
too-many-public-methods,
too-many-return-statements,
trailing-newlines,
trailing-whitespace,
unbalanced-tuple-unpacking,
undefined-variable,
undefined-loop-variable,
unexpected-special-method-signature,
ungrouped-imports,
unidiomatic-typecheck,
unnecessary-comprehension,
unnecessary-lambda,
unnecessary-pass,
unreachable,
unsubscriptable-object,
unused-argument,
unused-import,
unused-variable,
unused-wildcard-import,
used-before-assignment,
useless-object-inheritance, # Required for Python 2 support
useless-return,
useless-super-delegation,
wildcard-import,
wrong-import-order,
wrong-import-position

View File

@ -20,232 +20,23 @@ addopts =
twisted = 1 twisted = 1
markers = markers =
only_asyncio: marks tests as only enabled when --reactor=asyncio is passed only_asyncio: marks tests as only enabled when --reactor=asyncio is passed
flake8-max-line-length = 119
flake8-ignore = flake8-ignore =
W503 W503
# Files that are only meant to provide top-level imports are expected not
# to use any of their imports: # Exclude files that are meant to provide top-level imports
# E402: Module level import not at top of file
# F401: Module imported but unused
scrapy/__init__.py E402
scrapy/core/downloader/handlers/http.py F401 scrapy/core/downloader/handlers/http.py F401
scrapy/http/__init__.py F401 scrapy/http/__init__.py F401
scrapy/linkextractors/__init__.py E402 F401
scrapy/selector/__init__.py F401
scrapy/spiders/__init__.py E402 F401
# Issues pending a review: # Issues pending a review:
# extras
extras/qps-bench-server.py E501
extras/qpsclient.py E501 E501
# scrapy/commands
scrapy/commands/__init__.py E128 E501
scrapy/commands/check.py E501
scrapy/commands/crawl.py E501
scrapy/commands/edit.py E501
scrapy/commands/fetch.py E401 E501 E128 E731
scrapy/commands/genspider.py E128 E501 E502
scrapy/commands/parse.py E128 E501 E731
scrapy/commands/runspider.py E501
scrapy/commands/settings.py E128
scrapy/commands/shell.py E128 E501 E502
scrapy/commands/startproject.py E127 E501 E128
scrapy/commands/version.py E501 E128
# scrapy/contracts
scrapy/contracts/__init__.py E501 W504
scrapy/contracts/default.py E128
# scrapy/core
scrapy/core/engine.py E501 E128 E127 E502
scrapy/core/scheduler.py E501
scrapy/core/scraper.py E501 E128 W504
scrapy/core/spidermw.py E501 E731 E126
scrapy/core/downloader/__init__.py E501
scrapy/core/downloader/contextfactory.py E501 E128 E126
scrapy/core/downloader/middleware.py E501 E502
scrapy/core/downloader/tls.py E501 E241
scrapy/core/downloader/webclient.py E731 E501 E128 E126
scrapy/core/downloader/handlers/__init__.py E501
scrapy/core/downloader/handlers/ftp.py E501 E128 E127
scrapy/core/downloader/handlers/http10.py E501
scrapy/core/downloader/handlers/http11.py E501
scrapy/core/downloader/handlers/s3.py E501 E128 E126
# scrapy/downloadermiddlewares
scrapy/downloadermiddlewares/ajaxcrawl.py E501
scrapy/downloadermiddlewares/decompression.py E501
scrapy/downloadermiddlewares/defaultheaders.py E501
scrapy/downloadermiddlewares/httpcache.py E501 E126
scrapy/downloadermiddlewares/httpcompression.py E501 E128
scrapy/downloadermiddlewares/httpproxy.py E501
scrapy/downloadermiddlewares/redirect.py E501 W504
scrapy/downloadermiddlewares/retry.py E501 E126
scrapy/downloadermiddlewares/robotstxt.py E501
scrapy/downloadermiddlewares/stats.py E501
# scrapy/extensions
scrapy/extensions/closespider.py E501 E128 E123
scrapy/extensions/corestats.py E501
scrapy/extensions/feedexport.py E128 E501
scrapy/extensions/httpcache.py E128 E501
scrapy/extensions/memdebug.py E501
scrapy/extensions/spiderstate.py E501
scrapy/extensions/telnet.py E501 W504
scrapy/extensions/throttle.py E501
# scrapy/http
scrapy/http/common.py E501
scrapy/http/cookies.py E501
scrapy/http/request/__init__.py E501
scrapy/http/request/form.py E501 E123
scrapy/http/request/json_request.py E501
scrapy/http/response/__init__.py E501 E128
scrapy/http/response/text.py E501 E128 E124
# scrapy/linkextractors
scrapy/linkextractors/__init__.py E731 E501 E402 W504
scrapy/linkextractors/lxmlhtml.py E501 E731
# scrapy/loader
scrapy/loader/__init__.py E501 E128
scrapy/loader/processors.py E501
# scrapy/pipelines
scrapy/pipelines/__init__.py E501
scrapy/pipelines/files.py E116 E501 E266
scrapy/pipelines/images.py E265 E501
scrapy/pipelines/media.py E125 E501 E266
# scrapy/selector
scrapy/selector/__init__.py F403
scrapy/selector/unified.py E501 E111
# scrapy/settings
scrapy/settings/__init__.py E501
scrapy/settings/default_settings.py E501 E114 E116
scrapy/settings/deprecated.py E501
# scrapy/spidermiddlewares
scrapy/spidermiddlewares/httperror.py E501
scrapy/spidermiddlewares/offsite.py E501
scrapy/spidermiddlewares/referer.py E501 E129 W504
scrapy/spidermiddlewares/urllength.py E501
# scrapy/spiders
scrapy/spiders/__init__.py E501 E402
scrapy/spiders/crawl.py E501
scrapy/spiders/feed.py E501
scrapy/spiders/sitemap.py E501
# scrapy/utils
scrapy/utils/asyncio.py E501
scrapy/utils/benchserver.py E501
scrapy/utils/conf.py E402 E501
scrapy/utils/datatypes.py E501
scrapy/utils/decorators.py E501
scrapy/utils/defer.py E501 E128
scrapy/utils/deprecate.py E128 E501 E127 E502
scrapy/utils/gz.py E501 W504
scrapy/utils/http.py F403 scrapy/utils/http.py F403
scrapy/utils/httpobj.py E501
scrapy/utils/iterators.py E501
scrapy/utils/log.py E128 E501
scrapy/utils/markup.py F403 scrapy/utils/markup.py F403
scrapy/utils/misc.py E501
scrapy/utils/multipart.py F403 scrapy/utils/multipart.py F403
scrapy/utils/project.py E501 scrapy/utils/url.py F403 F405
scrapy/utils/python.py E501 tests/test_loader.py E741
scrapy/utils/reactor.py E501
scrapy/utils/reqser.py E501
scrapy/utils/request.py E127 E501
scrapy/utils/response.py E501 E128
scrapy/utils/signal.py E501 E128
scrapy/utils/sitemap.py E501
scrapy/utils/spider.py E501
scrapy/utils/ssl.py E501
scrapy/utils/test.py E501
scrapy/utils/url.py E501 F403 E128 F405
# scrapy
scrapy/__init__.py E402 E501
scrapy/cmdline.py E501
scrapy/crawler.py E501
scrapy/dupefilters.py E501 E202
scrapy/exceptions.py E501
scrapy/exporters.py E501
scrapy/interfaces.py E501
scrapy/item.py E501 E128
scrapy/link.py E501
scrapy/logformatter.py E501
scrapy/mail.py E402 E128 E501 E502
scrapy/middleware.py E128 E501
scrapy/pqueues.py E501
scrapy/resolver.py E501
scrapy/responsetypes.py E128 E501
scrapy/robotstxt.py E501
scrapy/shell.py E501
scrapy/signalmanager.py E501
scrapy/spiderloader.py F841 E501 E126
scrapy/squeues.py E128
scrapy/statscollectors.py E501
# tests
tests/__init__.py E402 E501
tests/mockserver.py E401 E501 E126 E123
tests/pipelines.py F841
tests/spiders.py E501 E127
tests/test_closespider.py E501 E127
tests/test_command_fetch.py E501
tests/test_command_parse.py E501 E128
tests/test_command_shell.py E501 E128
tests/test_commands.py E128 E501
tests/test_contracts.py E501 E128
tests/test_crawl.py E501 E741 E265
tests/test_crawler.py F841 E501
tests/test_dependencies.py F841 E501
tests/test_downloader_handlers.py E124 E127 E128 E265 E501 E126 E123
tests/test_downloadermiddleware.py E501
tests/test_downloadermiddleware_ajaxcrawlable.py E501
tests/test_downloadermiddleware_cookies.py E731 E741 E501 E128 E265 E126
tests/test_downloadermiddleware_decompression.py E127
tests/test_downloadermiddleware_defaultheaders.py E501
tests/test_downloadermiddleware_downloadtimeout.py E501
tests/test_downloadermiddleware_httpcache.py E501
tests/test_downloadermiddleware_httpcompression.py E501 E126 E123
tests/test_downloadermiddleware_httpproxy.py E501 E128
tests/test_downloadermiddleware_redirect.py E501 E128 E127
tests/test_downloadermiddleware_retry.py E501 E128 E126
tests/test_downloadermiddleware_robotstxt.py E501
tests/test_downloadermiddleware_stats.py E501
tests/test_dupefilters.py E501 E741 E128 E124
tests/test_engine.py E401 E501 E128
tests/test_exporters.py E501 E731 E128 E124
tests/test_extension_telnet.py F841
tests/test_feedexport.py E501 F841 E241
tests/test_http_cookies.py E501
tests/test_http_headers.py E501
tests/test_http_request.py E402 E501 E127 E128 E128 E126 E123
tests/test_http_response.py E501 E128 E265
tests/test_item.py E128 F841
tests/test_link.py E501
tests/test_linkextractors.py E501 E128 E124
tests/test_loader.py E501 E731 E741 E128 E117 E241
tests/test_logformatter.py E128 E501 E122
tests/test_mail.py E128 E501
tests/test_middleware.py E501 E128
tests/test_pipeline_crawl.py E501 E128 E126
tests/test_pipeline_files.py E501
tests/test_pipeline_images.py F841 E501
tests/test_pipeline_media.py E501 E741 E731 E128 E502
tests/test_proxy_connect.py E501 E741
tests/test_request_cb_kwargs.py E501
tests/test_responsetypes.py E501
tests/test_robotstxt_interface.py E501 E501
tests/test_scheduler.py E501 E126 E123
tests/test_selector.py E501 E127
tests/test_spider.py E501
tests/test_spidermiddleware.py E501
tests/test_spidermiddleware_httperror.py E128 E501 E127 E121
tests/test_spidermiddleware_offsite.py E501 E128 E111
tests/test_spidermiddleware_output_chain.py E501
tests/test_spidermiddleware_referer.py E501 F841 E125 E201 E124 E501 E241 E121
tests/test_squeues.py E501 E741
tests/test_utils_asyncio.py E501
tests/test_utils_conf.py E501 E128
tests/test_utils_curl.py E501
tests/test_utils_datatypes.py E402 E501
tests/test_utils_defer.py E501 F841
tests/test_utils_deprecate.py F841 E501
tests/test_utils_http.py E501 E128 W504
tests/test_utils_iterators.py E501 E128 E129 E241
tests/test_utils_log.py E741
tests/test_utils_python.py E501 E731
tests/test_utils_reqser.py E501 E128
tests/test_utils_request.py E501 E128
tests/test_utils_response.py E501
tests/test_utils_signal.py E741 F841 E731
tests/test_utils_sitemap.py E128 E501 E124
tests/test_utils_url.py E501 E127 E125 E501 E241 E126 E123
tests/test_webclient.py E501 E128 E122 E402 E241 E123 E126
tests/test_cmdline/__init__.py E501
tests/test_settings/__init__.py E501 E128
tests/test_spiderloader/__init__.py E128 E501
tests/test_utils_misc/__init__.py E501

View File

@ -1 +1 @@
2.0.0 2.2.0

View File

@ -2,33 +2,11 @@
Scrapy - a web crawling and web scraping framework written for Python Scrapy - a web crawling and web scraping framework written for Python
""" """
__all__ = ['__version__', 'version_info', 'twisted_version',
'Spider', 'Request', 'FormRequest', 'Selector', 'Item', 'Field']
# Scrapy version
import pkgutil import pkgutil
__version__ = pkgutil.get_data(__package__, 'VERSION').decode('ascii').strip()
version_info = tuple(int(v) if v.isdigit() else v
for v in __version__.split('.'))
del pkgutil
# Check minimum required Python version
import sys import sys
if sys.version_info < (3, 5):
print("Scrapy %s requires Python 3.5" % __version__)
sys.exit(1)
# Ignore noisy twisted deprecation warnings
import warnings import warnings
warnings.filterwarnings('ignore', category=DeprecationWarning, module='twisted')
del warnings
# Apply monkey patches to fix issues in external libraries
from scrapy import _monkeypatches
del _monkeypatches
from twisted import version as _txv from twisted import version as _txv
twisted_version = (_txv.major, _txv.minor, _txv.micro)
# Declare top-level shortcuts # Declare top-level shortcuts
from scrapy.spiders import Spider from scrapy.spiders import Spider
@ -36,4 +14,29 @@ from scrapy.http import Request, FormRequest
from scrapy.selector import Selector from scrapy.selector import Selector
from scrapy.item import Item, Field from scrapy.item import Item, Field
__all__ = [
'__version__', 'version_info', 'twisted_version', 'Spider',
'Request', 'FormRequest', 'Selector', 'Item', 'Field',
]
# Scrapy and Twisted versions
__version__ = pkgutil.get_data(__package__, 'VERSION').decode('ascii').strip()
version_info = tuple(int(v) if v.isdigit() else v for v in __version__.split('.'))
twisted_version = (_txv.major, _txv.minor, _txv.micro)
# Check minimum required Python version
if sys.version_info < (3, 5, 2):
print("Scrapy %s requires Python 3.5.2" % __version__)
sys.exit(1)
# Ignore noisy twisted deprecation warnings
warnings.filterwarnings('ignore', category=DeprecationWarning, module='twisted')
del pkgutil
del sys del sys
del warnings

View File

@ -1,11 +0,0 @@
import copyreg
# Undo what Twisted's perspective broker adds to pickle register
# to prevent bugs like Twisted#7989 while serializing requests
import twisted.persisted.styles # NOQA
# Remove only entries with twisted serializers for non-twisted types.
for k, v in frozenset(copyreg.dispatch_table.items()):
if not str(getattr(k, '__module__', '')).startswith('twisted') \
and str(getattr(v, '__module__', '')).startswith('twisted'):
copyreg.dispatch_table.pop(k)

View File

@ -165,6 +165,7 @@ if __name__ == '__main__':
try: try:
execute() execute()
finally: finally:
# Twisted prints errors in DebugInfo.__del__, but PyPy does not run gc.collect() # Twisted prints errors in DebugInfo.__del__, but PyPy does not run gc.collect() on exit:
# on exit: http://doc.pypy.org/en/latest/cpython_differences.html?highlight=gc.collect#differences-related-to-garbage-collection-strategies # http://doc.pypy.org/en/latest/cpython_differences.html
# ?highlight=gc.collect#differences-related-to-garbage-collection-strategies
garbage_collect() garbage_collect()

View File

@ -5,7 +5,7 @@ import os
from optparse import OptionGroup from optparse import OptionGroup
from twisted.python import failure from twisted.python import failure
from scrapy.utils.conf import arglist_to_dict from scrapy.utils.conf import arglist_to_dict, feed_process_params_from_cli
from scrapy.exceptions import UsageError from scrapy.exceptions import UsageError
@ -23,7 +23,8 @@ class ScrapyCommand:
self.settings = None # set in scrapy.cmdline self.settings = None # set in scrapy.cmdline
def set_crawler(self, crawler): def set_crawler(self, crawler):
assert not hasattr(self, '_crawler'), "crawler already set" if hasattr(self, '_crawler'):
raise RuntimeError("crawler already set")
self._crawler = crawler self._crawler = crawler
def syntax(self): def syntax(self):
@ -58,17 +59,17 @@ class ScrapyCommand:
""" """
group = OptionGroup(parser, "Global Options") group = OptionGroup(parser, "Global Options")
group.add_option("--logfile", metavar="FILE", group.add_option("--logfile", metavar="FILE",
help="log file. if omitted stderr will be used") help="log file. if omitted stderr will be used")
group.add_option("-L", "--loglevel", metavar="LEVEL", default=None, group.add_option("-L", "--loglevel", metavar="LEVEL", default=None,
help="log level (default: %s)" % self.settings['LOG_LEVEL']) help="log level (default: %s)" % self.settings['LOG_LEVEL'])
group.add_option("--nolog", action="store_true", group.add_option("--nolog", action="store_true",
help="disable logging completely") help="disable logging completely")
group.add_option("--profile", metavar="FILE", default=None, group.add_option("--profile", metavar="FILE", default=None,
help="write python cProfile stats to FILE") help="write python cProfile stats to FILE")
group.add_option("--pidfile", metavar="FILE", group.add_option("--pidfile", metavar="FILE",
help="write process ID to FILE") help="write process ID to FILE")
group.add_option("-s", "--set", action="append", default=[], metavar="NAME=VALUE", group.add_option("-s", "--set", action="append", default=[], metavar="NAME=VALUE",
help="set/override setting (may be repeated)") help="set/override setting (may be repeated)")
group.add_option("--pdb", action="store_true", help="enable pdb on failure") group.add_option("--pdb", action="store_true", help="enable pdb on failure")
parser.add_option_group(group) parser.add_option_group(group)
@ -103,3 +104,27 @@ class ScrapyCommand:
Entry point for running commands Entry point for running commands
""" """
raise NotImplementedError raise NotImplementedError
class BaseRunSpiderCommand(ScrapyCommand):
"""
Common class used to share functionality between the crawl and runspider commands
"""
def add_options(self, parser):
ScrapyCommand.add_options(self, parser)
parser.add_option("-a", dest="spargs", action="append", default=[], metavar="NAME=VALUE",
help="set spider argument (may be repeated)")
parser.add_option("-o", "--output", metavar="FILE", action="append",
help="dump scraped items into FILE (use - for stdout)")
parser.add_option("-t", "--output-format", metavar="FORMAT",
help="format to use for dumping items with -o")
def process_options(self, args, opts):
ScrapyCommand.process_options(self, args, opts)
try:
opts.spargs = arglist_to_dict(opts.spargs)
except ValueError:
raise UsageError("Invalid -a value, use -a NAME=VALUE", print_help=False)
if opts.output:
feeds = feed_process_params_from_cli(self.settings, opts.output, opts.output_format)
self.settings.set('FEEDS', feeds, priority='cmdline')
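
The new BaseRunSpiderCommand centralizes the -a/-o/-t handling that the crawl and runspider commands previously duplicated. A minimal sketch of a hypothetical project command reusing it; the module path and the "mybot" spider name are assumptions, not part of this changeset:

    # myproject/commands/crawlmybot.py (hypothetical location)
    from scrapy.commands import BaseRunSpiderCommand
    from scrapy.exceptions import UsageError


    class Command(BaseRunSpiderCommand):
        requires_project = True

        def short_desc(self):
            return "Run the 'mybot' spider with the shared -a/-o/-t options"

        def run(self, args, opts):
            if args:
                raise UsageError("This command takes no positional arguments")
            # opts.spargs and the FEEDS setting were already filled in by
            # BaseRunSpiderCommand.process_options()
            self.crawler_process.crawl("mybot", **opts.spargs)
            self.crawler_process.start()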

View File

@ -1,9 +1,8 @@
from scrapy.commands import ScrapyCommand from scrapy.commands import BaseRunSpiderCommand
from scrapy.utils.conf import arglist_to_dict, feed_process_params_from_cli
from scrapy.exceptions import UsageError from scrapy.exceptions import UsageError
class Command(ScrapyCommand): class Command(BaseRunSpiderCommand):
requires_project = True requires_project = True
@ -13,25 +12,6 @@ class Command(ScrapyCommand):
def short_desc(self): def short_desc(self):
return "Run a spider" return "Run a spider"
def add_options(self, parser):
ScrapyCommand.add_options(self, parser)
parser.add_option("-a", dest="spargs", action="append", default=[], metavar="NAME=VALUE",
help="set spider argument (may be repeated)")
parser.add_option("-o", "--output", metavar="FILE", action="append",
help="dump scraped items into FILE (use - for stdout)")
parser.add_option("-t", "--output-format", metavar="FORMAT",
help="format to use for dumping items with -o")
def process_options(self, args, opts):
ScrapyCommand.process_options(self, args, opts)
try:
opts.spargs = arglist_to_dict(opts.spargs)
except ValueError:
raise UsageError("Invalid -a value, use -a NAME=VALUE", print_help=False)
if opts.output:
feeds = feed_process_params_from_cli(self.settings, opts.output, opts.output_format)
self.settings.set('FEEDS', feeds, priority='cmdline')
def run(self, args, opts): def run(self, args, opts):
if len(args) < 1: if len(args) < 1:
raise UsageError() raise UsageError()

View File

@ -27,8 +27,8 @@ class Command(ScrapyCommand):
parser.add_option("--spider", dest="spider", help="use this spider") parser.add_option("--spider", dest="spider", help="use this spider")
parser.add_option("--headers", dest="headers", action="store_true", parser.add_option("--headers", dest="headers", action="store_true",
help="print response HTTP headers instead of body") help="print response HTTP headers instead of body")
parser.add_option("--no-redirect", dest="no_redirect", action="store_true", parser.add_option("--no-redirect", dest="no_redirect", action="store_true", default=False,
default=False, help="do not handle HTTP 3xx status codes and print response as-is") help="do not handle HTTP 3xx status codes and print response as-is")
def _print_headers(self, headers, prefix): def _print_headers(self, headers, prefix):
for key, values in headers.items(): for key, values in headers.items():
@ -49,8 +49,8 @@ class Command(ScrapyCommand):
def run(self, args, opts): def run(self, args, opts):
if len(args) != 1 or not is_url(args[0]): if len(args) != 1 or not is_url(args[0]):
raise UsageError() raise UsageError()
cb = lambda x: self._print_response(x, opts) request = Request(args[0], callback=self._print_response,
request = Request(args[0], callback=cb, dont_filter=True) cb_kwargs={"opts": opts}, dont_filter=True)
# by default, let the framework handle redirects, # by default, let the framework handle redirects,
# i.e. command handles all codes except 3xx # i.e. command handles all codes except 3xx
if not opts.no_redirect: if not opts.no_redirect:
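
The fetch command now forwards its options through cb_kwargs instead of wrapping the callback in a lambda; the same Request feature is available in spiders. A brief sketch, with placeholder spider name and URLs:

    import scrapy


    class CbKwargsExampleSpider(scrapy.Spider):
        name = "cb_kwargs_example"            # placeholder name
        start_urls = ["https://example.com"]  # placeholder URL

        def parse(self, response):
            # Extra keyword arguments travel with the request and are passed
            # to the callback, so no lambda or functools.partial is needed.
            yield scrapy.Request(
                "https://example.com/page",
                callback=self.parse_page,
                cb_kwargs={"source": response.url},
            )

        def parse_page(self, response, source):
            yield {"url": response.url, "referrer": source}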

View File

@ -36,15 +36,15 @@ class Command(ScrapyCommand):
def add_options(self, parser): def add_options(self, parser):
ScrapyCommand.add_options(self, parser) ScrapyCommand.add_options(self, parser)
parser.add_option("-l", "--list", dest="list", action="store_true", parser.add_option("-l", "--list", dest="list", action="store_true",
help="List available templates") help="List available templates")
parser.add_option("-e", "--edit", dest="edit", action="store_true", parser.add_option("-e", "--edit", dest="edit", action="store_true",
help="Edit spider after creating it") help="Edit spider after creating it")
parser.add_option("-d", "--dump", dest="dump", metavar="TEMPLATE", parser.add_option("-d", "--dump", dest="dump", metavar="TEMPLATE",
help="Dump template to standard output") help="Dump template to standard output")
parser.add_option("-t", "--template", dest="template", default="basic", parser.add_option("-t", "--template", dest="template", default="basic",
help="Uses a custom template.") help="Uses a custom template.")
parser.add_option("--force", dest="force", action="store_true", parser.add_option("--force", dest="force", action="store_true",
help="If the spider already exists, overwrite it with the template") help="If the spider already exists, overwrite it with the template")
def run(self, args, opts): def run(self, args, opts):
if opts.list: if opts.list:
@ -90,8 +90,7 @@ class Command(ScrapyCommand):
'module': module, 'module': module,
'name': name, 'name': name,
'domain': domain, 'domain': domain,
'classname': '%sSpider' % ''.join(s.capitalize() \ 'classname': '%sSpider' % ''.join(s.capitalize() for s in module.split('_'))
for s in module.split('_'))
} }
if self.settings.get('NEWSPIDER_MODULE'): if self.settings.get('NEWSPIDER_MODULE'):
spiders_module = import_module(self.settings['NEWSPIDER_MODULE']) spiders_module = import_module(self.settings['NEWSPIDER_MODULE'])
@ -102,8 +101,8 @@ class Command(ScrapyCommand):
spider_file = "%s.py" % join(spiders_dir, module) spider_file = "%s.py" % join(spiders_dir, module)
shutil.copyfile(template_file, spider_file) shutil.copyfile(template_file, spider_file)
render_templatefile(spider_file, **tvars) render_templatefile(spider_file, **tvars)
print("Created spider %r using template %r " % (name, \ print("Created spider %r using template %r "
template_name), end=('' if spiders_module else '\n')) % (name, template_name), end=('' if spiders_module else '\n'))
if spiders_module: if spiders_module:
print("in module:\n %s.%s" % (spiders_module.__name__, module)) print("in module:\n %s.%s" % (spiders_module.__name__, module))

View File

@ -1,11 +1,11 @@
import json import json
import logging import logging
from itemadapter import is_item, ItemAdapter
from w3lib.url import is_url from w3lib.url import is_url
from scrapy.commands import ScrapyCommand from scrapy.commands import ScrapyCommand
from scrapy.http import Request from scrapy.http import Request
from scrapy.item import BaseItem
from scrapy.utils import display from scrapy.utils import display
from scrapy.utils.conf import arglist_to_dict from scrapy.utils.conf import arglist_to_dict
from scrapy.utils.spider import iterate_spider_output, spidercls_for_request from scrapy.utils.spider import iterate_spider_output, spidercls_for_request
@ -33,29 +33,29 @@ class Command(ScrapyCommand):
def add_options(self, parser): def add_options(self, parser):
ScrapyCommand.add_options(self, parser) ScrapyCommand.add_options(self, parser)
parser.add_option("--spider", dest="spider", default=None, parser.add_option("--spider", dest="spider", default=None,
help="use this spider without looking for one") help="use this spider without looking for one")
parser.add_option("-a", dest="spargs", action="append", default=[], metavar="NAME=VALUE", parser.add_option("-a", dest="spargs", action="append", default=[], metavar="NAME=VALUE",
help="set spider argument (may be repeated)") help="set spider argument (may be repeated)")
parser.add_option("--pipelines", action="store_true", parser.add_option("--pipelines", action="store_true",
help="process items through pipelines") help="process items through pipelines")
parser.add_option("--nolinks", dest="nolinks", action="store_true", parser.add_option("--nolinks", dest="nolinks", action="store_true",
help="don't show links to follow (extracted requests)") help="don't show links to follow (extracted requests)")
parser.add_option("--noitems", dest="noitems", action="store_true", parser.add_option("--noitems", dest="noitems", action="store_true",
help="don't show scraped items") help="don't show scraped items")
parser.add_option("--nocolour", dest="nocolour", action="store_true", parser.add_option("--nocolour", dest="nocolour", action="store_true",
help="avoid using pygments to colorize the output") help="avoid using pygments to colorize the output")
parser.add_option("-r", "--rules", dest="rules", action="store_true", parser.add_option("-r", "--rules", dest="rules", action="store_true",
help="use CrawlSpider rules to discover the callback") help="use CrawlSpider rules to discover the callback")
parser.add_option("-c", "--callback", dest="callback", parser.add_option("-c", "--callback", dest="callback",
help="use this callback for parsing, instead looking for a callback") help="use this callback for parsing, instead looking for a callback")
parser.add_option("-m", "--meta", dest="meta", parser.add_option("-m", "--meta", dest="meta",
help="inject extra meta into the Request, it must be a valid raw json string") help="inject extra meta into the Request, it must be a valid raw json string")
parser.add_option("--cbkwargs", dest="cbkwargs", parser.add_option("--cbkwargs", dest="cbkwargs",
help="inject extra callback kwargs into the Request, it must be a valid raw json string") help="inject extra callback kwargs into the Request, it must be a valid raw json string")
parser.add_option("-d", "--depth", dest="depth", type="int", default=1, parser.add_option("-d", "--depth", dest="depth", type="int", default=1,
help="maximum depth for parsing requests [default: %default]") help="maximum depth for parsing requests [default: %default]")
parser.add_option("-v", "--verbose", dest="verbose", action="store_true", parser.add_option("-v", "--verbose", dest="verbose", action="store_true",
help="print each depth level one by one") help="print each depth level one by one")
@property @property
def max_level(self): def max_level(self):
@ -81,7 +81,7 @@ class Command(ScrapyCommand):
items = self.items.get(lvl, []) items = self.items.get(lvl, [])
print("# Scraped Items ", "-" * 60) print("# Scraped Items ", "-" * 60)
display.pprint([dict(x) for x in items], colorize=colour) display.pprint([ItemAdapter(x).asdict() for x in items], colorize=colour)
def print_requests(self, lvl=None, colour=True): def print_requests(self, lvl=None, colour=True):
if lvl is None: if lvl is None:
@ -117,7 +117,7 @@ class Command(ScrapyCommand):
items, requests = [], [] items, requests = [], []
for x in iterate_spider_output(callback(response, **cb_kwargs)): for x in iterate_spider_output(callback(response, **cb_kwargs)):
if isinstance(x, (BaseItem, dict)): if is_item(x):
items.append(x) items.append(x)
elif isinstance(x, Request): elif isinstance(x, Request):
requests.append(x) requests.append(x)
@ -146,9 +146,8 @@ class Command(ScrapyCommand):
if not self.spidercls: if not self.spidercls:
logger.error('Unable to find spider for: %(url)s', {'url': url}) logger.error('Unable to find spider for: %(url)s', {'url': url})
# Request requires callback argument as callable or None, not string def _start_requests(spider):
request = Request(url, None) yield self.prepare_request(spider, Request(url), opts)
_start_requests = lambda s: [self.prepare_request(s, request, opts)]
self.spidercls.start_requests = _start_requests self.spidercls.start_requests = _start_requests
def start_parsing(self, url, opts): def start_parsing(self, url, opts):
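
The parse command now relies on itemadapter instead of BaseItem/dict checks. A short sketch of what those helpers do; the dataclass is only an illustration:

    from dataclasses import dataclass

    from itemadapter import ItemAdapter, is_item


    @dataclass
    class Book:                # illustrative item type
        title: str
        price: float


    book = Book(title="1984", price=9.99)
    assert is_item(book) and is_item({"title": "1984"})   # dicts, Items, dataclasses, attrs
    adapter = ItemAdapter(book)
    assert adapter["title"] == "1984"
    assert adapter.asdict() == {"title": "1984", "price": 9.99}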

View File

@ -3,9 +3,8 @@ import os
from importlib import import_module from importlib import import_module
from scrapy.utils.spider import iter_spider_classes from scrapy.utils.spider import iter_spider_classes
from scrapy.commands import ScrapyCommand
from scrapy.exceptions import UsageError from scrapy.exceptions import UsageError
from scrapy.utils.conf import arglist_to_dict, feed_process_params_from_cli from scrapy.commands import BaseRunSpiderCommand
def _import_file(filepath): def _import_file(filepath):
@ -24,7 +23,7 @@ def _import_file(filepath):
return module return module
class Command(ScrapyCommand): class Command(BaseRunSpiderCommand):
requires_project = False requires_project = False
default_settings = {'SPIDER_LOADER_WARN_ONLY': True} default_settings = {'SPIDER_LOADER_WARN_ONLY': True}
@ -38,25 +37,6 @@ class Command(ScrapyCommand):
def long_desc(self): def long_desc(self):
return "Run the spider defined in the given file" return "Run the spider defined in the given file"
def add_options(self, parser):
ScrapyCommand.add_options(self, parser)
parser.add_option("-a", dest="spargs", action="append", default=[], metavar="NAME=VALUE",
help="set spider argument (may be repeated)")
parser.add_option("-o", "--output", metavar="FILE", action="append",
help="dump scraped items into FILE (use - for stdout)")
parser.add_option("-t", "--output-format", metavar="FORMAT",
help="format to use for dumping items with -o")
def process_options(self, args, opts):
ScrapyCommand.process_options(self, args, opts)
try:
opts.spargs = arglist_to_dict(opts.spargs)
except ValueError:
raise UsageError("Invalid -a value, use -a NAME=VALUE", print_help=False)
if opts.output:
feeds = feed_process_params_from_cli(self.settings, opts.output, opts.output_format)
self.settings.set('FEEDS', feeds, priority='cmdline')
def run(self, args, opts): def run(self, args, opts):
if len(args) != 1: if len(args) != 1:
raise UsageError() raise UsageError()

View File

@ -19,15 +19,15 @@ class Command(ScrapyCommand):
def add_options(self, parser): def add_options(self, parser):
ScrapyCommand.add_options(self, parser) ScrapyCommand.add_options(self, parser)
parser.add_option("--get", dest="get", metavar="SETTING", parser.add_option("--get", dest="get", metavar="SETTING",
help="print raw setting value") help="print raw setting value")
parser.add_option("--getbool", dest="getbool", metavar="SETTING", parser.add_option("--getbool", dest="getbool", metavar="SETTING",
help="print setting value, interpreted as a boolean") help="print setting value, interpreted as a boolean")
parser.add_option("--getint", dest="getint", metavar="SETTING", parser.add_option("--getint", dest="getint", metavar="SETTING",
help="print setting value, interpreted as an integer") help="print setting value, interpreted as an integer")
parser.add_option("--getfloat", dest="getfloat", metavar="SETTING", parser.add_option("--getfloat", dest="getfloat", metavar="SETTING",
help="print setting value, interpreted as a float") help="print setting value, interpreted as a float")
parser.add_option("--getlist", dest="getlist", metavar="SETTING", parser.add_option("--getlist", dest="getlist", metavar="SETTING",
help="print setting value, interpreted as a list") help="print setting value, interpreted as a list")
def run(self, args, opts): def run(self, args, opts):
settings = self.crawler_process.settings settings = self.crawler_process.settings

View File

@ -34,11 +34,11 @@ class Command(ScrapyCommand):
def add_options(self, parser): def add_options(self, parser):
ScrapyCommand.add_options(self, parser) ScrapyCommand.add_options(self, parser)
parser.add_option("-c", dest="code", parser.add_option("-c", dest="code",
help="evaluate the code in the shell, print the result and exit") help="evaluate the code in the shell, print the result and exit")
parser.add_option("--spider", dest="spider", parser.add_option("--spider", dest="spider",
help="use this spider") help="use this spider")
parser.add_option("--no-redirect", dest="no_redirect", action="store_true", \ parser.add_option("--no-redirect", dest="no_redirect", action="store_true", default=False,
default=False, help="do not handle HTTP 3xx status codes and print response as-is") help="do not handle HTTP 3xx status codes and print response as-is")
def update_vars(self, vars): def update_vars(self, vars):
"""You can use this function to update the Scrapy objects that will be """You can use this function to update the Scrapy objects that will be

View File

@ -1,5 +1,6 @@
import re import re
import os import os
import stat
import string import string
from importlib import import_module from importlib import import_module
from os.path import join, exists, abspath from os.path import join, exists, abspath
@ -78,6 +79,29 @@ class Command(ScrapyCommand):
else: else:
copy2(srcname, dstname) copy2(srcname, dstname)
copystat(src, dst) copystat(src, dst)
self._set_rw_permissions(dst)
def _set_rw_permissions(self, path):
"""
Sets permissions of a directory tree to +rw and +rwx for folders.
This is necessary if the start template files come without write
permissions.
"""
mode_rw = (stat.S_IRUSR
| stat.S_IWUSR
| stat.S_IRGRP
| stat.S_IROTH)
mode_x = (stat.S_IXUSR
| stat.S_IXGRP
| stat.S_IXOTH)
os.chmod(path, mode_rw | mode_x)
for root, dirs, files in os.walk(path):
for dir in dirs:
os.chmod(join(root, dir), mode_rw | mode_x)
for file in files:
os.chmod(join(root, file), mode_rw)
def run(self, args, opts): def run(self, args, opts):
if len(args) not in (1, 2): if len(args) not in (1, 2):
@ -102,10 +126,8 @@ class Command(ScrapyCommand):
move(join(project_dir, 'module'), join(project_dir, project_name)) move(join(project_dir, 'module'), join(project_dir, project_name))
for paths in TEMPLATES_TO_RENDER: for paths in TEMPLATES_TO_RENDER:
path = join(*paths) path = join(*paths)
tplfile = join(project_dir, tplfile = join(project_dir, string.Template(path).substitute(project_name=project_name))
string.Template(path).substitute(project_name=project_name)) render_templatefile(tplfile, project_name=project_name, ProjectName=string_camelcase(project_name))
render_templatefile(tplfile, project_name=project_name,
ProjectName=string_camelcase(project_name))
print("New Scrapy project '%s', using template directory '%s', " print("New Scrapy project '%s', using template directory '%s', "
"created in:" % (project_name, self.templates_dir)) "created in:" % (project_name, self.templates_dir))
print(" %s\n" % abspath(project_dir)) print(" %s\n" % abspath(project_dir))

View File

@ -17,7 +17,7 @@ class Command(ScrapyCommand):
def add_options(self, parser): def add_options(self, parser):
ScrapyCommand.add_options(self, parser) ScrapyCommand.add_options(self, parser)
parser.add_option("--verbose", "-v", dest="verbose", action="store_true", parser.add_option("--verbose", "-v", dest="verbose", action="store_true",
help="also display twisted/python/platform info (useful for bug reports)") help="also display twisted/python/platform info (useful for bug reports)")
def run(self, args, opts): def run(self, args, opts):
if opts.verbose: if opts.verbose:

View File

@ -17,10 +17,10 @@ class ContractsManager:
self.contracts[contract.name] = contract self.contracts[contract.name] = contract
def tested_methods_from_spidercls(self, spidercls): def tested_methods_from_spidercls(self, spidercls):
is_method = re.compile(r"^\s*@", re.MULTILINE).search
methods = [] methods = []
for key, value in getmembers(spidercls): for key, value in getmembers(spidercls):
if (callable(value) and value.__doc__ and if callable(value) and value.__doc__ and is_method(value.__doc__):
re.search(r'^\s*@', value.__doc__, re.MULTILINE)):
methods.append(key) methods.append(key)
return methods return methods

View File

@ -1,10 +1,10 @@
import json import json
from scrapy.item import BaseItem from itemadapter import is_item, ItemAdapter
from scrapy.http import Request
from scrapy.exceptions import ContractFail
from scrapy.contracts import Contract from scrapy.contracts import Contract
from scrapy.exceptions import ContractFail
from scrapy.http import Request
# contracts # contracts
@ -48,19 +48,23 @@ class ReturnsContract(Contract):
""" """
name = 'returns' name = 'returns'
objects = { object_type_verifiers = {
'request': Request, 'request': lambda x: isinstance(x, Request),
'requests': Request, 'requests': lambda x: isinstance(x, Request),
'item': (BaseItem, dict), 'item': is_item,
'items': (BaseItem, dict), 'items': is_item,
} }
def __init__(self, *args, **kwargs): def __init__(self, *args, **kwargs):
super(ReturnsContract, self).__init__(*args, **kwargs) super(ReturnsContract, self).__init__(*args, **kwargs)
assert len(self.args) in [1, 2, 3] if len(self.args) not in [1, 2, 3]:
raise ValueError(
"Incorrect argument quantity: expected 1, 2 or 3, got %i"
% len(self.args)
)
self.obj_name = self.args[0] or None self.obj_name = self.args[0] or None
self.obj_type = self.objects[self.obj_name] self.obj_type_verifier = self.object_type_verifiers[self.obj_name]
try: try:
self.min_bound = int(self.args[1]) self.min_bound = int(self.args[1])
@ -75,7 +79,7 @@ class ReturnsContract(Contract):
def post_process(self, output): def post_process(self, output):
occurrences = 0 occurrences = 0
for x in output: for x in output:
if isinstance(x, self.obj_type): if self.obj_type_verifier(x):
occurrences += 1 occurrences += 1
assertion = (self.min_bound <= occurrences <= self.max_bound) assertion = (self.min_bound <= occurrences <= self.max_bound)
@ -99,8 +103,8 @@ class ScrapesContract(Contract):
def post_process(self, output): def post_process(self, output):
for x in output: for x in output:
if isinstance(x, (BaseItem, dict)): if is_item(x):
missing = [arg for arg in self.args if arg not in x] missing = [arg for arg in self.args if arg not in ItemAdapter(x)]
if missing: if missing:
raise ContractFail( missing_str = ", ".join(missing)
"Missing fields: %s" % ", ".join(missing)) raise ContractFail("Missing fields: %s" % missing_str)

View File

@ -173,7 +173,7 @@ class Downloader:
return response return response
dfd.addCallback(_downloaded) dfd.addCallback(_downloaded)
# 3. After response arrives, remove the request from transferring # 3. After response arrives, remove the request from transferring
# state to free up the transferring slot so it can be used by the # state to free up the transferring slot so it can be used by the
# following requests (perhaps those which came from the downloader # following requests (perhaps those which came from the downloader
# middleware itself) # middleware itself)

View File

@ -46,11 +46,12 @@ class ScrapyClientContextFactory(BrowserLikePolicyForHTTPS):
# #
# * getattr() for `_ssl_method` attribute for context factories # * getattr() for `_ssl_method` attribute for context factories
# not calling super(..., self).__init__ # not calling super(..., self).__init__
return CertificateOptions(verify=False, return CertificateOptions(
method=getattr(self, 'method', verify=False,
getattr(self, '_ssl_method', None)), method=getattr(self, 'method', getattr(self, '_ssl_method', None)),
fixBrokenPeers=True, fixBrokenPeers=True,
acceptableCiphers=self.tls_ciphers) acceptableCiphers=self.tls_ciphers,
)
# kept for old-style HTTP/1.0 downloader context twisted calls, # kept for old-style HTTP/1.0 downloader context twisted calls,
# e.g. connectSSL() # e.g. connectSSL()
@ -86,8 +87,8 @@ class BrowserLikeContextFactory(ScrapyClientContextFactory):
# #
# This means that a website like https://www.cacert.org will be rejected # This means that a website like https://www.cacert.org will be rejected
# by default, since CAcert.org CA certificate is seldom shipped. # by default, since CAcert.org CA certificate is seldom shipped.
return optionsForClientTLS(hostname.decode("ascii"), return optionsForClientTLS(
trustRoot=platformTrust(), hostname=hostname.decode("ascii"),
extraCertificateOptions={ trustRoot=platformTrust(),
'method': self._ssl_method, extraCertificateOptions={'method': self._ssl_method},
}) )

View File

@ -86,19 +86,19 @@ class FTPDownloadHandler:
password = request.meta.get("ftp_password", self.default_password) password = request.meta.get("ftp_password", self.default_password)
passive_mode = 1 if bool(request.meta.get("ftp_passive", passive_mode = 1 if bool(request.meta.get("ftp_passive",
self.passive_mode)) else 0 self.passive_mode)) else 0
creator = ClientCreator(reactor, FTPClient, user, password, creator = ClientCreator(reactor, FTPClient, user, password, passive=passive_mode)
passive=passive_mode) dfd = creator.connectTCP(parsed_url.hostname, parsed_url.port or 21)
return creator.connectTCP(parsed_url.hostname, parsed_url.port or 21).addCallback(self.gotClient, return dfd.addCallback(self.gotClient, request, unquote(parsed_url.path))
request, unquote(parsed_url.path))
def gotClient(self, client, request, filepath): def gotClient(self, client, request, filepath):
self.client = client self.client = client
protocol = ReceivedDataProtocol(request.meta.get("ftp_local_filename")) protocol = ReceivedDataProtocol(request.meta.get("ftp_local_filename"))
return client.retrieveFile(filepath, protocol)\ return client.retrieveFile(filepath, protocol).addCallbacks(
.addCallbacks(callback=self._build_response, callback=self._build_response,
callbackArgs=(request, protocol), callbackArgs=(request, protocol),
errback=self._failed, errback=self._failed,
errbackArgs=(request,)) errbackArgs=(request,),
)
def _build_response(self, result, request, protocol): def _build_response(self, result, request, protocol):
self.result = result self.result = result

View File

@ -1,5 +1,6 @@
"""Download handlers for http and https schemes""" """Download handlers for http and https schemes"""
import ipaddress
import logging import logging
import re import re
import warnings import warnings
@ -11,15 +12,17 @@ from urllib.parse import urldefrag
from twisted.internet import defer, protocol, ssl from twisted.internet import defer, protocol, ssl
from twisted.internet.endpoints import TCP4ClientEndpoint from twisted.internet.endpoints import TCP4ClientEndpoint
from twisted.internet.error import TimeoutError from twisted.internet.error import TimeoutError
from twisted.python.failure import Failure
from twisted.web.client import Agent, HTTPConnectionPool, ResponseDone, ResponseFailed, URI from twisted.web.client import Agent, HTTPConnectionPool, ResponseDone, ResponseFailed, URI
from twisted.web.http import _DataLoss, PotentialDataLoss from twisted.web.http import _DataLoss, PotentialDataLoss
from twisted.web.http_headers import Headers as TxHeaders from twisted.web.http_headers import Headers as TxHeaders
from twisted.web.iweb import IBodyProducer, UNKNOWN_LENGTH from twisted.web.iweb import IBodyProducer, UNKNOWN_LENGTH
from zope.interface import implementer from zope.interface import implementer
from scrapy import signals
from scrapy.core.downloader.tls import openssl_methods from scrapy.core.downloader.tls import openssl_methods
from scrapy.core.downloader.webclient import _parse from scrapy.core.downloader.webclient import _parse
from scrapy.exceptions import ScrapyDeprecationWarning from scrapy.exceptions import ScrapyDeprecationWarning, StopDownload
from scrapy.http import Headers from scrapy.http import Headers
from scrapy.responsetypes import responsetypes from scrapy.responsetypes import responsetypes
from scrapy.utils.misc import create_instance, load_object from scrapy.utils.misc import create_instance, load_object
@ -33,6 +36,8 @@ class HTTP11DownloadHandler:
lazy = False lazy = False
def __init__(self, settings, crawler=None): def __init__(self, settings, crawler=None):
self._crawler = crawler
from twisted.internet import reactor from twisted.internet import reactor
self._pool = HTTPConnectionPool(reactor, persistent=True) self._pool = HTTPConnectionPool(reactor, persistent=True)
self._pool.maxPersistentPerHost = settings.getint('CONCURRENT_REQUESTS_PER_DOMAIN') self._pool.maxPersistentPerHost = settings.getint('CONCURRENT_REQUESTS_PER_DOMAIN')
@ -78,6 +83,7 @@ class HTTP11DownloadHandler:
maxsize=getattr(spider, 'download_maxsize', self._default_maxsize), maxsize=getattr(spider, 'download_maxsize', self._default_maxsize),
warnsize=getattr(spider, 'download_warnsize', self._default_warnsize), warnsize=getattr(spider, 'download_warnsize', self._default_warnsize),
fail_on_dataloss=self._fail_on_dataloss, fail_on_dataloss=self._fail_on_dataloss,
crawler=self._crawler,
) )
return agent.download_request(request) return agent.download_request(request)
@ -275,7 +281,7 @@ class ScrapyAgent:
_TunnelingAgent = TunnelingAgent _TunnelingAgent = TunnelingAgent
def __init__(self, contextFactory=None, connectTimeout=10, bindAddress=None, pool=None, def __init__(self, contextFactory=None, connectTimeout=10, bindAddress=None, pool=None,
maxsize=0, warnsize=0, fail_on_dataloss=True): maxsize=0, warnsize=0, fail_on_dataloss=True, crawler=None):
self._contextFactory = contextFactory self._contextFactory = contextFactory
self._connectTimeout = connectTimeout self._connectTimeout = connectTimeout
self._bindAddress = bindAddress self._bindAddress = bindAddress
@ -284,6 +290,7 @@ class ScrapyAgent:
self._warnsize = warnsize self._warnsize = warnsize
self._fail_on_dataloss = fail_on_dataloss self._fail_on_dataloss = fail_on_dataloss
self._txresponse = None self._txresponse = None
self._crawler = crawler
def _get_agent(self, request, timeout): def _get_agent(self, request, timeout):
from twisted.internet import reactor from twisted.internet import reactor
@ -341,20 +348,6 @@ class ScrapyAgent:
headers.removeHeader(b'Proxy-Authorization') headers.removeHeader(b'Proxy-Authorization')
if request.body: if request.body:
bodyproducer = _RequestBodyProducer(request.body) bodyproducer = _RequestBodyProducer(request.body)
elif method == b'POST':
# Setting Content-Length: 0 even for POST requests is not a
# MUST per HTTP RFCs, but it's common behavior, and some
# servers require this, otherwise returning HTTP 411 Length required
#
# RFC 7230#section-3.3.2:
# "a Content-Length header field is normally sent in a POST
# request even when the value is 0 (indicating an empty payload body)."
#
# Twisted < 17 will not add "Content-Length: 0" by itself;
# Twisted >= 17 fixes this;
# Using a producer with an empty-string sends `0` as Content-Length
# for all versions of Twisted.
bodyproducer = _RequestBodyProducer(b'')
else: else:
bodyproducer = None bodyproducer = None
start_time = time() start_time = time()
@ -387,7 +380,13 @@ class ScrapyAgent:
def _cb_bodyready(self, txresponse, request): def _cb_bodyready(self, txresponse, request):
# deliverBody hangs for responses without body # deliverBody hangs for responses without body
if txresponse.length == 0: if txresponse.length == 0:
return txresponse, b'', None, None return {
"txresponse": txresponse,
"body": b"",
"flags": None,
"certificate": None,
"ip_address": None,
}
maxsize = request.meta.get('download_maxsize', self._maxsize) maxsize = request.meta.get('download_maxsize', self._maxsize)
warnsize = request.meta.get('download_warnsize', self._warnsize) warnsize = request.meta.get('download_warnsize', self._warnsize)
@ -414,7 +413,15 @@ class ScrapyAgent:
d = defer.Deferred(_cancel) d = defer.Deferred(_cancel)
txresponse.deliverBody( txresponse.deliverBody(
_ResponseReader(d, txresponse, request, maxsize, warnsize, fail_on_dataloss) _ResponseReader(
finished=d,
txresponse=txresponse,
request=request,
maxsize=maxsize,
warnsize=warnsize,
fail_on_dataloss=fail_on_dataloss,
crawler=self._crawler,
)
) )
# save response for timeouts # save response for timeouts
@ -423,12 +430,21 @@ class ScrapyAgent:
return d return d
def _cb_bodydone(self, result, request, url): def _cb_bodydone(self, result, request, url):
txresponse, body, flags, certificate = result headers = Headers(result["txresponse"].headers.getAllRawHeaders())
status = int(txresponse.code) respcls = responsetypes.from_args(headers=headers, url=url, body=result["body"])
headers = Headers(txresponse.headers.getAllRawHeaders()) response = respcls(
respcls = responsetypes.from_args(headers=headers, url=url, body=body) url=url,
return respcls(url=url, status=status, headers=headers, body=body, status=int(result["txresponse"].code),
flags=flags, certificate=certificate) headers=headers,
body=result["body"],
flags=result["flags"],
certificate=result["certificate"],
ip_address=result["ip_address"],
)
if result.get("failure"):
result["failure"].value.response = response
return result["failure"]
return response
@implementer(IBodyProducer) @implementer(IBodyProducer)
@ -451,7 +467,7 @@ class _RequestBodyProducer:
class _ResponseReader(protocol.Protocol): class _ResponseReader(protocol.Protocol):
def __init__(self, finished, txresponse, request, maxsize, warnsize, fail_on_dataloss): def __init__(self, finished, txresponse, request, maxsize, warnsize, fail_on_dataloss, crawler):
self._finished = finished self._finished = finished
self._txresponse = txresponse self._txresponse = txresponse
self._request = request self._request = request
@ -463,12 +479,27 @@ class _ResponseReader(protocol.Protocol):
self._reached_warnsize = False self._reached_warnsize = False
self._bytes_received = 0 self._bytes_received = 0
self._certificate = None self._certificate = None
self._ip_address = None
self._crawler = crawler
def _finish_response(self, flags=None, failure=None):
self._finished.callback({
"txresponse": self._txresponse,
"body": self._bodybuf.getvalue(),
"flags": flags,
"certificate": self._certificate,
"ip_address": self._ip_address,
"failure": failure,
})
def connectionMade(self): def connectionMade(self):
if self._certificate is None: if self._certificate is None:
with suppress(AttributeError): with suppress(AttributeError):
self._certificate = ssl.Certificate(self.transport._producer.getPeerCertificate()) self._certificate = ssl.Certificate(self.transport._producer.getPeerCertificate())
if self._ip_address is None:
self._ip_address = ipaddress.ip_address(self.transport._producer.getPeer().host)
def dataReceived(self, bodyBytes): def dataReceived(self, bodyBytes):
# This may be called several times after cancel was called with buffered data. # This may be called several times after cancel was called with buffered data.
if self._finished.called: if self._finished.called:
@ -477,6 +508,20 @@ class _ResponseReader(protocol.Protocol):
self._bodybuf.write(bodyBytes) self._bodybuf.write(bodyBytes)
self._bytes_received += len(bodyBytes) self._bytes_received += len(bodyBytes)
bytes_received_result = self._crawler.signals.send_catch_log(
signal=signals.bytes_received,
data=bodyBytes,
request=self._request,
spider=self._crawler.spider,
)
for handler, result in bytes_received_result:
if isinstance(result, Failure) and isinstance(result.value, StopDownload):
logger.debug("Download stopped for %(request)s from signal handler %(handler)s",
{"request": self._request, "handler": handler.__qualname__})
self.transport._producer.loseConnection()
failure = result if result.value.fail else None
self._finish_response(flags=["download_stopped"], failure=failure)
if self._maxsize and self._bytes_received > self._maxsize: if self._maxsize and self._bytes_received > self._maxsize:
logger.error("Received (%(bytes)s) bytes larger than download " logger.error("Received (%(bytes)s) bytes larger than download "
"max size (%(maxsize)s) in request %(request)s.", "max size (%(maxsize)s) in request %(request)s.",
@ -498,18 +543,17 @@ class _ResponseReader(protocol.Protocol):
if self._finished.called: if self._finished.called:
return return
body = self._bodybuf.getvalue()
if reason.check(ResponseDone): if reason.check(ResponseDone):
self._finished.callback((self._txresponse, body, None, self._certificate)) self._finish_response()
return return
if reason.check(PotentialDataLoss): if reason.check(PotentialDataLoss):
self._finished.callback((self._txresponse, body, ['partial'], self._certificate)) self._finish_response(flags=["partial"])
return return
if reason.check(ResponseFailed) and any(r.check(_DataLoss) for r in reason.value.reasons): if reason.check(ResponseFailed) and any(r.check(_DataLoss) for r in reason.value.reasons):
if not self._fail_on_dataloss: if not self._fail_on_dataloss:
self._finished.callback((self._txresponse, body, ['dataloss'], self._certificate)) self._finish_response(flags=["dataloss"])
return return
elif not self._fail_on_dataloss_warned: elif not self._fail_on_dataloss_warned:
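
The handler changes above wire up the new bytes_received signal and StopDownload exception. A minimal sketch (hypothetical spider and URL) of stopping a download after the first received chunk while keeping the partial body:

    import scrapy
    from scrapy import signals
    from scrapy.exceptions import StopDownload


    class HeadOnlySpider(scrapy.Spider):
        name = "head_only_example"                        # hypothetical
        start_urls = ["https://example.com/large-page"]   # hypothetical

        @classmethod
        def from_crawler(cls, crawler, *args, **kwargs):
            spider = super().from_crawler(crawler, *args, **kwargs)
            crawler.signals.connect(spider.on_bytes_received,
                                    signal=signals.bytes_received)
            return spider

        def on_bytes_received(self, data, request, spider):
            # fail=False routes the partially downloaded response to the
            # normal callback instead of the errback.
            raise StopDownload(fail=False)

        def parse(self, response):
            self.logger.info("Got %d bytes before stopping", len(response.body))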

View File

@ -100,11 +100,12 @@ class S3DownloadHandler:
url=url, headers=awsrequest.headers.items()) url=url, headers=awsrequest.headers.items())
else: else:
signed_headers = self.conn.make_request( signed_headers = self.conn.make_request(
method=request.method, method=request.method,
bucket=bucket, bucket=bucket,
key=unquote(p.path), key=unquote(p.path),
query_args=unquote(p.query), query_args=unquote(p.query),
headers=request.headers, headers=request.headers,
data=request.body) data=request.body,
)
request = request.replace(url=url, headers=signed_headers) request = request.replace(url=url, headers=signed_headers)
return self._download_http(request, spider) return self._download_http(request, spider)

View File

@ -35,38 +35,45 @@ class DownloaderMiddlewareManager(MiddlewareManager):
for method in self.methods['process_request']: for method in self.methods['process_request']:
response = yield deferred_from_coro(method(request=request, spider=spider)) response = yield deferred_from_coro(method(request=request, spider=spider))
if response is not None and not isinstance(response, (Response, Request)): if response is not None and not isinstance(response, (Response, Request)):
raise _InvalidOutput('Middleware %s.process_request must return None, Response or Request, got %s' % \ raise _InvalidOutput(
(method.__self__.__class__.__name__, response.__class__.__name__)) "Middleware %s.process_request must return None, Response or Request, got %s"
% (method.__self__.__class__.__name__, response.__class__.__name__)
)
if response: if response:
defer.returnValue(response) return response
defer.returnValue((yield download_func(request=request, spider=spider))) return (yield download_func(request=request, spider=spider))
@defer.inlineCallbacks @defer.inlineCallbacks
def process_response(response): def process_response(response):
assert response is not None, 'Received None in process_response' if response is None:
if isinstance(response, Request): raise TypeError("Received None in process_response")
defer.returnValue(response) elif isinstance(response, Request):
return response
for method in self.methods['process_response']: for method in self.methods['process_response']:
response = yield deferred_from_coro(method(request=request, response=response, spider=spider)) response = yield deferred_from_coro(method(request=request, response=response, spider=spider))
if not isinstance(response, (Response, Request)): if not isinstance(response, (Response, Request)):
raise _InvalidOutput('Middleware %s.process_response must return Response or Request, got %s' % \ raise _InvalidOutput(
(method.__self__.__class__.__name__, type(response))) "Middleware %s.process_response must return Response or Request, got %s"
% (method.__self__.__class__.__name__, type(response))
)
if isinstance(response, Request): if isinstance(response, Request):
defer.returnValue(response) return response
defer.returnValue(response) return response
@defer.inlineCallbacks @defer.inlineCallbacks
def process_exception(_failure): def process_exception(failure):
exception = _failure.value exception = failure.value
for method in self.methods['process_exception']: for method in self.methods['process_exception']:
response = yield deferred_from_coro(method(request=request, exception=exception, spider=spider)) response = yield deferred_from_coro(method(request=request, exception=exception, spider=spider))
if response is not None and not isinstance(response, (Response, Request)): if response is not None and not isinstance(response, (Response, Request)):
raise _InvalidOutput('Middleware %s.process_exception must return None, Response or Request, got %s' % \ raise _InvalidOutput(
(method.__self__.__class__.__name__, type(response))) "Middleware %s.process_exception must return None, Response or Request, got %s"
% (method.__self__.__class__.__name__, type(response))
)
if response: if response:
defer.returnValue(response) return response
defer.returnValue(_failure) return failure
deferred = mustbe_deferred(process_request, request) deferred = mustbe_deferred(process_request, request)
deferred.addErrback(process_exception) deferred.addErrback(process_exception)
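
The middleware-manager changes also replace defer.returnValue() with plain return statements, which Python 3 generators allow under @defer.inlineCallbacks. A small sketch of the two equivalent styles (function names are illustrative):

    from twisted.internet import defer


    @defer.inlineCallbacks
    def old_style(download_func, request, spider):
        response = yield download_func(request=request, spider=spider)
        defer.returnValue(response)   # pre-change style

    @defer.inlineCallbacks
    def new_style(download_func, request, spider):
        response = yield download_func(request=request, spider=spider)
        return response               # style adopted in this change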

View File

@ -20,8 +20,8 @@ METHOD_TLSv12 = 'TLSv1.2'
openssl_methods = { openssl_methods = {
METHOD_TLS: SSL.SSLv23_METHOD, # protocol negotiation (recommended) METHOD_TLS: SSL.SSLv23_METHOD, # protocol negotiation (recommended)
METHOD_SSLv3: SSL.SSLv3_METHOD, # SSL 3 (NOT recommended) METHOD_SSLv3: SSL.SSLv3_METHOD, # SSL 3 (NOT recommended)
METHOD_TLSv10: SSL.TLSv1_METHOD, # TLS 1.0 only METHOD_TLSv10: SSL.TLSv1_METHOD, # TLS 1.0 only
METHOD_TLSv11: getattr(SSL, 'TLSv1_1_METHOD', 5), # TLS 1.1 only METHOD_TLSv11: getattr(SSL, 'TLSv1_1_METHOD', 5), # TLS 1.1 only
METHOD_TLSv12: getattr(SSL, 'TLSv1_2_METHOD', 6), # TLS 1.2 only METHOD_TLSv12: getattr(SSL, 'TLSv1_2_METHOD', 6), # TLS 1.2 only

View File

@ -14,13 +14,12 @@ from scrapy.responsetypes import responsetypes
def _parsed_url_args(parsed): def _parsed_url_args(parsed):
# Assume parsed is urlparse-d from Request.url, # Assume parsed is urlparse-d from Request.url,
# which was passed via safe_url_string and is ascii-only. # which was passed via safe_url_string and is ascii-only.
b = lambda s: to_bytes(s, encoding='ascii')
path = urlunparse(('', '', parsed.path or '/', parsed.params, parsed.query, '')) path = urlunparse(('', '', parsed.path or '/', parsed.params, parsed.query, ''))
path = b(path) path = to_bytes(path, encoding="ascii")
host = b(parsed.hostname) host = to_bytes(parsed.hostname, encoding="ascii")
port = parsed.port port = parsed.port
scheme = b(parsed.scheme) scheme = to_bytes(parsed.scheme, encoding="ascii")
netloc = b(parsed.netloc) netloc = to_bytes(parsed.netloc, encoding="ascii")
if port is None: if port is None:
port = 443 if scheme == b'https' else 80 port = 443 if scheme == b'https' else 80
return scheme, netloc, host, port, path return scheme, netloc, host, port, path
@ -89,8 +88,8 @@ class ScrapyHTTPPageGetter(HTTPClient):
self.transport.stopProducing() self.transport.stopProducing()
self.factory.noPage( self.factory.noPage(
defer.TimeoutError("Getting %s took longer than %s seconds." % defer.TimeoutError("Getting %s took longer than %s seconds."
(self.factory.url, self.factory.timeout))) % (self.factory.url, self.factory.timeout)))
class ScrapyHTTPClientFactory(HTTPClientFactory): class ScrapyHTTPClientFactory(HTTPClientFactory):

View File

@ -73,7 +73,8 @@ class ExecutionEngine:
@defer.inlineCallbacks @defer.inlineCallbacks
def start(self): def start(self):
"""Start the execution engine""" """Start the execution engine"""
assert not self.running, "Engine already running" if self.running:
raise RuntimeError("Engine already running")
self.start_time = time() self.start_time = time()
yield self.signals.send_catch_log_deferred(signal=signals.engine_started) yield self.signals.send_catch_log_deferred(signal=signals.engine_started)
self.running = True self.running = True
@ -82,7 +83,8 @@ class ExecutionEngine:
def stop(self): def stop(self):
"""Stop the execution engine gracefully""" """Stop the execution engine gracefully"""
assert self.running, "Engine not running" if not self.running:
raise RuntimeError("Engine not running")
self.running = False self.running = False
dfd = self._close_all_spiders() dfd = self._close_all_spiders()
return dfd.addBoth(lambda _: self._finish_stopping_engine()) return dfd.addBoth(lambda _: self._finish_stopping_engine())
@ -165,7 +167,11 @@ class ExecutionEngine:
return d return d
def _handle_downloader_output(self, response, request, spider): def _handle_downloader_output(self, response, request, spider):
assert isinstance(response, (Request, Response, Failure)), response if not isinstance(response, (Request, Response, Failure)):
raise TypeError(
"Incorrect type: expected Request, Response or Failure, got %s: %r"
% (type(response), response)
)
# downloader middleware can return requests (for example, redirects) # downloader middleware can return requests (for example, redirects)
if isinstance(response, Request): if isinstance(response, Request):
self.crawl(response, spider) self.crawl(response, spider)
@ -205,17 +211,15 @@ class ExecutionEngine:
return not bool(self.slot) return not bool(self.slot)
def crawl(self, request, spider): def crawl(self, request, spider):
assert spider in self.open_spiders, \ if spider not in self.open_spiders:
"Spider %r not opened when crawling: %s" % (spider.name, request) raise RuntimeError("Spider %r not opened when crawling: %s" % (spider.name, request))
self.schedule(request, spider) self.schedule(request, spider)
self.slot.nextcall.schedule() self.slot.nextcall.schedule()
def schedule(self, request, spider): def schedule(self, request, spider):
self.signals.send_catch_log(signal=signals.request_scheduled, self.signals.send_catch_log(signals.request_scheduled, request=request, spider=spider)
request=request, spider=spider)
if not self.slot.scheduler.enqueue_request(request): if not self.slot.scheduler.enqueue_request(request):
self.signals.send_catch_log(signal=signals.request_dropped, self.signals.send_catch_log(signals.request_dropped, request=request, spider=spider)
request=request, spider=spider)
def download(self, request, spider): def download(self, request, spider):
d = self._download(request, spider) d = self._download(request, spider)
@ -224,22 +228,25 @@ class ExecutionEngine:
def _downloaded(self, response, slot, request, spider): def _downloaded(self, response, slot, request, spider):
slot.remove_request(request) slot.remove_request(request)
return self.download(response, spider) \ return self.download(response, spider) if isinstance(response, Request) else response
if isinstance(response, Request) else response
def _download(self, request, spider): def _download(self, request, spider):
slot = self.slot slot = self.slot
slot.add_request(request) slot.add_request(request)
def _on_success(response): def _on_success(response):
assert isinstance(response, (Response, Request)) if not isinstance(response, (Response, Request)):
raise TypeError(
"Incorrect type: expected Response or Request, got %s: %r"
% (type(response), response)
)
if isinstance(response, Response): if isinstance(response, Response):
response.request = request # tie request to response received response.request = request # tie request to response received
logkws = self.logformatter.crawled(request, response, spider) logkws = self.logformatter.crawled(request, response, spider)
if logkws is not None: if logkws is not None:
logger.log(*logformatter_adapter(logkws), extra={'spider': spider}) logger.log(*logformatter_adapter(logkws), extra={'spider': spider})
self.signals.send_catch_log(signal=signals.response_received, self.signals.send_catch_log(signals.response_received,
response=response, request=request, spider=spider) response=response, request=request, spider=spider)
return response return response
def _on_complete(_): def _on_complete(_):
@ -253,8 +260,8 @@ class ExecutionEngine:
@defer.inlineCallbacks @defer.inlineCallbacks
def open_spider(self, spider, start_requests=(), close_if_idle=True): def open_spider(self, spider, start_requests=(), close_if_idle=True):
assert self.has_capacity(), "No free spider slot when opening %r" % \ if not self.has_capacity():
spider.name raise RuntimeError("No free spider slot when opening %r" % spider.name)
logger.info("Spider opened", extra={'spider': spider}) logger.info("Spider opened", extra={'spider': spider})
nextcall = CallLaterOnce(self._next_request, spider) nextcall = CallLaterOnce(self._next_request, spider)
scheduler = self.scheduler_cls.from_crawler(self.crawler) scheduler = self.scheduler_cls.from_crawler(self.crawler)
@ -277,10 +284,8 @@ class ExecutionEngine:
next loop and this function is guaranteed to be called (at least) once next loop and this function is guaranteed to be called (at least) once
again for this spider. again for this spider.
""" """
res = self.signals.send_catch_log(signal=signals.spider_idle, \ res = self.signals.send_catch_log(signals.spider_idle, spider=spider, dont_log=DontCloseSpider)
spider=spider, dont_log=DontCloseSpider) if any(isinstance(x, Failure) and isinstance(x.value, DontCloseSpider) for _, x in res):
if any(isinstance(x, Failure) and isinstance(x.value, DontCloseSpider) \
for _, x in res):
return return
if self.spider_is_idle(spider): if self.spider_is_idle(spider):

View File

@ -4,18 +4,18 @@ extracts information from them"""
import logging import logging
from collections import deque from collections import deque
from twisted.python.failure import Failure from itemadapter import is_item
from twisted.internet import defer from twisted.internet import defer
from twisted.python.failure import Failure
from scrapy.utils.defer import defer_result, defer_succeed, parallel, iter_errback
from scrapy.utils.spider import iterate_spider_output
from scrapy.utils.misc import load_object, warn_on_generator_with_return_value
from scrapy.utils.log import logformatter_adapter, failure_to_exc_info
from scrapy.exceptions import CloseSpider, DropItem, IgnoreRequest
from scrapy import signals from scrapy import signals
from scrapy.http import Request, Response
from scrapy.item import BaseItem
from scrapy.core.spidermw import SpiderMiddlewareManager from scrapy.core.spidermw import SpiderMiddlewareManager
from scrapy.exceptions import CloseSpider, DropItem, IgnoreRequest
from scrapy.http import Request, Response
from scrapy.utils.defer import defer_result, defer_succeed, iter_errback, parallel
from scrapy.utils.log import failure_to_exc_info, logformatter_adapter
from scrapy.utils.misc import load_object, warn_on_generator_with_return_value
from scrapy.utils.spider import iterate_spider_output
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)
@ -123,7 +123,11 @@ class Scraper:
def _scrape(self, response, request, spider): def _scrape(self, response, request, spider):
"""Handle the downloaded response or failure through the spider """Handle the downloaded response or failure through the spider
callback/errback""" callback/errback"""
assert isinstance(response, (Response, Failure)) if not isinstance(response, (Response, Failure)):
raise TypeError(
"Incorrect type: expected Response or Failure, got %s: %r"
% (type(response), response)
)
dfd = self._scrape2(response, request, spider) # returns spider's processed output dfd = self._scrape2(response, request, spider) # returns spider's processed output
dfd.addErrback(self.handle_spider_error, request, response, spider) dfd.addErrback(self.handle_spider_error, request, response, spider)
@ -187,7 +191,7 @@ class Scraper:
""" """
if isinstance(output, Request): if isinstance(output, Request):
self.crawler.engine.crawl(request=output, spider=spider) self.crawler.engine.crawl(request=output, spider=spider)
elif isinstance(output, (BaseItem, dict)): elif is_item(output):
self.slot.itemproc_size += 1 self.slot.itemproc_size += 1
dfd = self.itemproc.process_item(output, spider) dfd = self.itemproc.process_item(output, spider)
dfd.addBoth(self._itemproc_finished, output, response, spider) dfd.addBoth(self._itemproc_finished, output, response, spider)
@ -196,10 +200,11 @@ class Scraper:
pass pass
else: else:
typename = type(output).__name__ typename = type(output).__name__
logger.error('Spider must return Request, BaseItem, dict or None, ' logger.error(
'got %(typename)r in %(request)s', 'Spider must return request, item, or None, got %(typename)r in %(request)s',
{'request': request, 'typename': typename}, {'request': request, 'typename': typename},
extra={'spider': spider}) extra={'spider': spider},
)
def _log_download_errors(self, spider_failure, download_failure, request, spider): def _log_download_errors(self, spider_failure, download_failure, request, spider):
"""Log and silence errors that come from the engine (typically download """Log and silence errors that come from the engine (typically download

View File

@ -19,7 +19,7 @@ def _isiterable(possible_iterator):
def _fname(f): def _fname(f):
return "%s.%s".format( return "{}.{}".format(
f.__self__.__class__.__name__, f.__self__.__class__.__name__,
f.__func__.__name__ f.__func__.__name__
) )
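
The one-line change above fixes a real bug: str.format() ignores %-style placeholders, so the old expression always returned the literal template. A quick demonstration with illustrative argument values:

    assert "%s.%s".format("SpiderMiddleware", "process_spider_input") == "%s.%s"
    assert "{}.{}".format("SpiderMiddleware", "process_spider_input") == \
        "SpiderMiddleware.process_spider_input"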

View File

@ -78,7 +78,8 @@ class Crawler:
@defer.inlineCallbacks @defer.inlineCallbacks
def crawl(self, *args, **kwargs): def crawl(self, *args, **kwargs):
assert not self.crawling, "Crawling already taking place" if self.crawling:
raise RuntimeError("Crawling already taking place")
self.crawling = True self.crawling = True
try: try:

View File

@ -1,4 +1,3 @@
# -*- coding: utf-8 -*-
import re import re
import logging import logging

View File

@@ -29,8 +29,7 @@ class CookiesMiddleware:
        cookiejarkey = request.meta.get("cookiejar")
        jar = self.jars[cookiejarkey]
-       cookies = self._get_request_cookies(jar, request)
-       for cookie in cookies:
+       for cookie in self._get_request_cookies(jar, request):
            jar.set_cookie_if_ok(cookie, request)

        # set Cookie header

@@ -68,28 +67,65 @@ class CookiesMiddleware:
        msg = "Received cookies from: {}\n{}".format(response, cookies)
        logger.debug(msg, extra={'spider': spider})

-   def _format_cookie(self, cookie):
-       # build cookie string
-       cookie_str = '%s=%s' % (cookie['name'], cookie['value'])
-
-       if cookie.get('path', None):
-           cookie_str += '; Path=%s' % cookie['path']
-       if cookie.get('domain', None):
-           cookie_str += '; Domain=%s' % cookie['domain']
-
+   def _format_cookie(self, cookie, request):
+       """
+       Given a dict consisting of cookie components, return its string representation.
+       Decode from bytes if necessary.
+       """
+       decoded = {}
+       for key in ("name", "value", "path", "domain"):
+           if not cookie.get(key):
+               if key in ("name", "value"):
+                   msg = "Invalid cookie found in request {}: {} ('{}' is missing)"
+                   logger.warning(msg.format(request, cookie, key))
+                   return
+               continue
+           if isinstance(cookie[key], str):
+               decoded[key] = cookie[key]
+           else:
+               try:
+                   decoded[key] = cookie[key].decode("utf8")
+               except UnicodeDecodeError:
+                   logger.warning("Non UTF-8 encoded cookie found in request %s: %s",
+                                  request, cookie)
+                   decoded[key] = cookie[key].decode("latin1", errors="replace")
+
+       cookie_str = "{}={}".format(decoded.pop("name"), decoded.pop("value"))
+       for key, value in decoded.items():  # path, domain
+           cookie_str += "; {}={}".format(key.capitalize(), value)
        return cookie_str

    def _get_request_cookies(self, jar, request):
-       if isinstance(request.cookies, dict):
-           cookie_list = [
-               {'name': k, 'value': v}
-               for k, v in request.cookies.items()
-           ]
-       else:
-           cookie_list = request.cookies
-
-       cookies = [self._format_cookie(x) for x in cookie_list]
-       headers = {'Set-Cookie': cookies}
-       response = Response(request.url, headers=headers)
-       return jar.make_cookies(response, request)
+       """
+       Extract cookies from a Request. Values from the `Request.cookies` attribute
+       take precedence over values from the `Cookie` request header.
+       """
+       def get_cookies_from_header(jar, request):
+           cookie_header = request.headers.get("Cookie")
+           if not cookie_header:
+               return []
+           cookie_gen_bytes = (s.strip() for s in cookie_header.split(b";"))
+           cookie_list_unicode = []
+           for cookie_bytes in cookie_gen_bytes:
+               try:
+                   cookie_unicode = cookie_bytes.decode("utf8")
+               except UnicodeDecodeError:
+                   logger.warning("Non UTF-8 encoded cookie found in request %s: %s",
+                                  request, cookie_bytes)
+                   cookie_unicode = cookie_bytes.decode("latin1", errors="replace")
+               cookie_list_unicode.append(cookie_unicode)
+           response = Response(request.url, headers={"Set-Cookie": cookie_list_unicode})
+           return jar.make_cookies(response, request)
+
+       def get_cookies_from_attribute(jar, request):
+           if not request.cookies:
+               return []
+           elif isinstance(request.cookies, dict):
+               cookies = ({"name": k, "value": v} for k, v in request.cookies.items())
+           else:
+               cookies = request.cookies
+           formatted = filter(None, (self._format_cookie(c, request) for c in cookies))
+           response = Response(request.url, headers={"Set-Cookie": formatted})
+           return jar.make_cookies(response, request)
+
+       return get_cookies_from_header(jar, request) + get_cookies_from_attribute(jar, request)
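Note (illustrative, not part of the diff): the middleware above formats whatever a spider passes through the long-standing Request cookies API. A rough sketch of that usage, with a made-up spider name and URL:

    import scrapy

    class CookieDemoSpider(scrapy.Spider):
        name = "cookie_demo"

        def start_requests(self):
            # dict form; a list of {"name": ..., "value": ...} dicts is also accepted
            yield scrapy.Request(
                "https://example.com",
                cookies={"currency": "USD", "country": "UY"},
                callback=self.parse,
            )

        def parse(self, response):
            self.logger.info("Got %s", response.url)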
@@ -60,11 +60,14 @@ class RedirectMiddleware(BaseRedirectMiddleware):
    Handle redirection of requests based on response status
    and meta-refresh html tag.
    """

    def process_response(self, request, response, spider):
-       if (request.meta.get('dont_redirect', False) or
-               response.status in getattr(spider, 'handle_httpstatus_list', []) or
-               response.status in request.meta.get('handle_httpstatus_list', []) or
-               request.meta.get('handle_httpstatus_all', False)):
+       if (
+           request.meta.get('dont_redirect', False)
+           or response.status in getattr(spider, 'handle_httpstatus_list', [])
+           or response.status in request.meta.get('handle_httpstatus_list', [])
+           or request.meta.get('handle_httpstatus_all', False)
+       ):
            return response

        allowed_status = (301, 302, 303, 307, 308)
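Note (illustrative, not part of the diff): the condition above is driven by well-known request meta keys; a minimal sketch of opting a single request out of redirect handling (URL and callback name are made up):

    yield scrapy.Request(
        "https://example.com/old-path",
        meta={"dont_redirect": True, "handle_httpstatus_list": [301, 302]},
        callback=self.parse_redirect_response,
    )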
@@ -12,9 +12,15 @@ once the spider has finished crawling all regular (non failed) pages.
import logging

from twisted.internet import defer
-from twisted.internet.error import TimeoutError, DNSLookupError, \
-    ConnectionRefusedError, ConnectionDone, ConnectError, \
-    ConnectionLost, TCPTimedOutError
+from twisted.internet.error import (
+    ConnectError,
+    ConnectionDone,
+    ConnectionLost,
+    ConnectionRefusedError,
+    DNSLookupError,
+    TCPTimedOutError,
+    TimeoutError,
+)
from twisted.web.client import ResponseFailed

from scrapy.exceptions import NotConfigured
@@ -61,7 +61,7 @@ class RFPDupeFilter(BaseDupeFilter):
    def log(self, request, spider):
        if self.debug:
            msg = "Filtered duplicate request: %(request)s (referer: %(referer)s)"
-           args = {'request': request, 'referer': referer_str(request) }
+           args = {'request': request, 'referer': referer_str(request)}
            self.logger.debug(msg, args, extra={'spider': spider})
        elif self.logdupes:
            msg = ("Filtered duplicate request: %(request)s"
@@ -41,6 +41,18 @@ class CloseSpider(Exception):
        self.reason = reason


+class StopDownload(Exception):
+    """
+    Stop the download of the body for a given response.
+    The 'fail' boolean parameter indicates whether or not the resulting partial response
+    should be handled by the request errback. Note that 'fail' is a keyword-only argument.
+    """
+
+    def __init__(self, *, fail=True):
+        super().__init__()
+        self.fail = fail
+
+
# Items

@@ -59,6 +71,7 @@ class NotSupported(Exception):

class UsageError(Exception):
    """To indicate a command-line usage error"""
+
    def __init__(self, *a, **kw):
        self.print_help = kw.pop('print_help', True)
        super(UsageError, self).__init__(*a, **kw)
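Note (illustrative, not part of the diff): a rough sketch of how the new StopDownload exception can be raised from a handler connected to the bytes_received signal introduced alongside it; the spider name and URL are made up, and the handler signature is assumed to be (data, request, spider):

    import scrapy
    from scrapy import signals
    from scrapy.exceptions import StopDownload

    class HeaderOnlySpider(scrapy.Spider):
        name = "header_only"
        start_urls = ["https://example.com"]

        @classmethod
        def from_crawler(cls, crawler, *args, **kwargs):
            spider = super().from_crawler(crawler, *args, **kwargs)
            crawler.signals.connect(spider.on_bytes_received, signal=signals.bytes_received)
            return spider

        def on_bytes_received(self, data, request, spider):
            # stop after the first chunk; fail=False sends the partial response to the callback
            raise StopDownload(fail=False)

        def parse(self, response):
            self.logger.info("Got %d bytes from %s", len(response.body), response.url)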
@@ -4,16 +4,18 @@ Item Exporters are used to export/serialize items into different formats.

import csv
import io
-import pprint
import marshal
-import warnings
import pickle
+import pprint
+import warnings
from xml.sax.saxutils import XMLGenerator

-from scrapy.utils.serialize import ScrapyJSONEncoder
-from scrapy.utils.python import to_bytes, to_unicode, is_listlike
-from scrapy.item import BaseItem
+from itemadapter import is_item, ItemAdapter
+
from scrapy.exceptions import ScrapyDeprecationWarning
+from scrapy.item import _BaseItem
+from scrapy.utils.python import is_listlike, to_bytes, to_unicode
+from scrapy.utils.serialize import ScrapyJSONEncoder

__all__ = ['BaseItemExporter', 'PprintItemExporter', 'PickleItemExporter',

@@ -56,11 +58,14 @@ class BaseItemExporter:
        """Return the fields to export as an iterable of tuples
        (name, serialized_value)
        """
+       item = ItemAdapter(item)
        if include_empty is None:
            include_empty = self.export_empty_fields
        if self.fields_to_export is None:
-           if include_empty and not isinstance(item, dict):
-               field_iter = item.fields.keys()
+           if include_empty:
+               field_iter = item.field_names()
            else:
                field_iter = item.keys()
        else:

@@ -71,8 +76,8 @@ class BaseItemExporter:

        for field_name in field_iter:
            if field_name in item:
-               field = {} if isinstance(item, dict) else item.fields[field_name]
-               value = self.serialize_field(field, field_name, item[field_name])
+               field_meta = item.get_field_meta(field_name)
+               value = self.serialize_field(field_meta, field_name, item[field_name])
            else:
                value = default_value

@@ -250,7 +255,7 @@ class CsvItemExporter(BaseItemExporter):

class PickleItemExporter(BaseItemExporter):

-   def __init__(self, file, protocol=2, **kwargs):
+   def __init__(self, file, protocol=4, **kwargs):
        super().__init__(**kwargs)
        self.file = file
        self.protocol = protocol

@@ -297,6 +302,7 @@ class PythonItemExporter(BaseItemExporter):

    .. _msgpack: https://pypi.org/project/msgpack/
    """
+
    def _configure(self, options, dont_fail=False):
        self.binary = options.pop('binary', True)
        super(PythonItemExporter, self)._configure(options, dont_fail)

@@ -312,24 +318,24 @@ class PythonItemExporter(BaseItemExporter):
        return serializer(value)

    def _serialize_value(self, value):
-       if isinstance(value, BaseItem):
+       if isinstance(value, _BaseItem):
            return self.export_item(value)
-       if isinstance(value, dict):
-           return dict(self._serialize_dict(value))
-       if is_listlike(value):
+       elif is_item(value):
+           return dict(self._serialize_item(value))
+       elif is_listlike(value):
            return [self._serialize_value(v) for v in value]
        encode_func = to_bytes if self.binary else to_unicode
        if isinstance(value, (str, bytes)):
            return encode_func(value, encoding=self.encoding)
        return value

-   def _serialize_dict(self, value):
-       for key, val in value.items():
+   def _serialize_item(self, item):
+       for key, value in ItemAdapter(item).items():
            key = to_bytes(key) if self.binary else key
-           yield key, self._serialize_value(val)
+           yield key, self._serialize_value(value)

    def export_item(self, item):
        result = dict(self._get_serialized_fields(item))
        if self.binary:
-           result = dict(self._serialize_dict(result))
+           result = dict(self._serialize_item(result))
        return result
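Note (illustrative, not part of the diff): the exporters now go through the itemadapter package instead of checking for dict/BaseItem directly. A rough sketch of that API, with made-up field values:

    from itemadapter import ItemAdapter, is_item

    item = {"name": "Widget", "price": 10}    # plain dicts still count as items
    assert is_item(item)

    adapter = ItemAdapter(item)
    print(list(adapter.field_names()))        # ['name', 'price']
    print(adapter.get_field_meta("name"))     # empty mapping for dicts; Field metadata for Item subclasses
    print(dict(adapter.items()))              # {'name': 'Widget', 'price': 10}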
@@ -20,7 +20,7 @@ class CloseSpider:
            'itemcount': crawler.settings.getint('CLOSESPIDER_ITEMCOUNT'),
            'pagecount': crawler.settings.getint('CLOSESPIDER_PAGECOUNT'),
            'errorcount': crawler.settings.getint('CLOSESPIDER_ERRORCOUNT'),
        }

        if not any(self.close_on.values()):
            raise NotConfigured
@@ -270,18 +270,29 @@ class FeedExporter:
            if not slot.itemcount and not slot.store_empty:
                # We need to call slot.storage.store nonetheless to get the file
                # properly closed.
-               return defer.maybeDeferred(slot.storage.store, slot.file)
+               d = defer.maybeDeferred(slot.storage.store, slot.file)
+               deferred_list.append(d)
+               continue
            slot.finish_exporting()
            logfmt = "%s %%(format)s feed (%%(itemcount)d items) in: %%(uri)s"
            log_args = {'format': slot.format,
                        'itemcount': slot.itemcount,
                        'uri': slot.uri}
            d = defer.maybeDeferred(slot.storage.store, slot.file)
-           d.addCallback(lambda _: logger.info(logfmt % "Stored", log_args,
-                                               extra={'spider': spider}))
-           d.addErrback(lambda f: logger.error(logfmt % "Error storing", log_args,
-                                               exc_info=failure_to_exc_info(f),
-                                               extra={'spider': spider}))
+
+           # Use `largs=log_args` to copy log_args into function's scope
+           # instead of using `log_args` from the outer scope
+           d.addCallback(
+               lambda _, largs=log_args: logger.info(
+                   logfmt % "Stored", largs, extra={'spider': spider}
+               )
+           )
+           d.addErrback(
+               lambda f, largs=log_args: logger.error(
+                   logfmt % "Error storing", largs,
+                   exc_info=failure_to_exc_info(f), extra={'spider': spider}
+               )
+           )
            deferred_list.append(d)
        return defer.DeferredList(deferred_list) if deferred_list else None
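Note (illustrative, not part of the diff): the `largs=log_args` default argument works around Python's late-binding closures, which would otherwise make every lambda in the loop log the last slot's arguments. A standalone illustration:

    # Closures look up loop variables when called, not when defined ...
    funcs = [lambda: i for i in range(3)]
    print([f() for f in funcs])        # [2, 2, 2]

    # ... while a default argument captures the current value at definition time
    funcs = [lambda i=i: i for i in range(3)]
    print([f() for f in funcs])        # [0, 1, 2]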
@@ -46,9 +46,10 @@ class RFC2616Policy:
    def __init__(self, settings):
        self.always_store = settings.getbool('HTTPCACHE_ALWAYS_STORE')
        self.ignore_schemes = settings.getlist('HTTPCACHE_IGNORE_SCHEMES')
-       self.ignore_response_cache_controls = [to_bytes(cc) for cc in
-                                              settings.getlist('HTTPCACHE_IGNORE_RESPONSE_CACHE_CONTROLS')]
        self._cc_parsed = WeakKeyDictionary()
+       self.ignore_response_cache_controls = [
+           to_bytes(cc) for cc in settings.getlist('HTTPCACHE_IGNORE_RESPONSE_CACHE_CONTROLS')
+       ]

    def _parse_cachecontrol(self, r):
        if r not in self._cc_parsed:

@@ -250,7 +251,7 @@ class DbmCacheStorage:
            'headers': dict(response.headers),
            'body': response.body,
        }
-       self.db['%s_data' % key] = pickle.dumps(data, protocol=2)
+       self.db['%s_data' % key] = pickle.dumps(data, protocol=4)
        self.db['%s_time' % key] = str(time())

    def _read_data(self, spider, request):

@@ -317,7 +318,7 @@ class FilesystemCacheStorage:
        with self._open(os.path.join(rpath, 'meta'), 'wb') as f:
            f.write(to_bytes(repr(metadata)))
        with self._open(os.path.join(rpath, 'pickled_meta'), 'wb') as f:
-           pickle.dump(metadata, f, protocol=2)
+           pickle.dump(metadata, f, protocol=4)
        with self._open(os.path.join(rpath, 'response_headers'), 'wb') as f:
            f.write(headers_dict_to_raw(response.headers))
        with self._open(os.path.join(rpath, 'response_body'), 'wb') as f:
@@ -26,7 +26,7 @@ class SpiderState:
    def spider_closed(self, spider):
        if self.jobdir:
            with open(self.statefn, 'wb') as f:
-               pickle.dump(spider.state, f, protocol=2)
+               pickle.dump(spider.state, f, protocol=4)

    def spider_opened(self, spider):
        if self.jobdir and os.path.exists(self.statefn):
@@ -76,8 +76,10 @@ class TelnetConsole(protocol.ServerFactory):
            """An implementation of IPortal"""
            @defers
            def login(self_, credentials, mind, *interfaces):
-               if not (credentials.username == self.username.encode('utf8') and
-                       credentials.checkPassword(self.password.encode('utf8'))):
+               if not (
+                   credentials.username == self.username.encode('utf8')
+                   and credentials.checkPassword(self.password.encode('utf8'))
+               ):
                    raise ValueError("Invalid credentials")

                protocol = telnet.TelnetBootstrapProtocol(
@@ -24,7 +24,8 @@ class Request(object_ref):
        self.method = str(method).upper()
        self._set_url(url)
        self._set_body(body)
-       assert isinstance(priority, int), "Request priority not an integer: %r" % priority
+       if not isinstance(priority, int):
+           raise TypeError("Request priority not an integer: %r" % priority)
        self.priority = priority

        if callback is not None and not callable(callback):

@@ -129,6 +130,9 @@ class Request(object_ref):
        :class:`~scrapy.downloadermiddlewares.httpcompression.HttpCompressionMiddleware`,
        may modify the :class:`~scrapy.http.Request` object.

+       To translate a cURL command into a Scrapy request,
+       you may use `curl2scrapy <https://michael-shub.github.io/curl2scrapy/>`_.
+
        """
        request_kwargs = curl_to_request_kwargs(curl_command, ignore_unknown_options)
        request_kwargs.update(kwargs)
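Note (illustrative, not part of the diff): a minimal sketch of the Request.from_curl classmethod whose docstring is extended above; the URL and payload are made up:

    from scrapy import Request

    request = Request.from_curl(
        "curl 'https://httpbin.org/post' -X POST"
        " -H 'Content-Type: application/json'"
        " -d '{\"foo\": \"bar\"}'"
    )
    print(request.method, request.url, request.body)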
@@ -178,12 +178,11 @@ def _get_clickable(clickdata, form):
    if the latter is given. If not, it returns the first
    clickable element found
    """
-   clickables = [
-       el for el in form.xpath(
-           'descendant::input[re:test(@type, "^(submit|image)$", "i")]'
-           '|descendant::button[not(@type) or re:test(@type, "^submit$", "i")]',
-           namespaces={"re": "http://exslt.org/regular-expressions"})
-   ]
+   clickables = list(form.xpath(
+       'descendant::input[re:test(@type, "^(submit|image)$", "i")]'
+       '|descendant::button[not(@type) or re:test(@type, "^submit$", "i")]',
+       namespaces={"re": "http://exslt.org/regular-expressions"}
+   ))
    if not clickables:
        return
@@ -17,7 +17,8 @@ from scrapy.utils.trackref import object_ref

class Response(object_ref):

-   def __init__(self, url, status=200, headers=None, body=b'', flags=None, request=None, certificate=None):
+   def __init__(self, url, status=200, headers=None, body=b'', flags=None,
+                request=None, certificate=None, ip_address=None):
        self.headers = Headers(headers or {})
        self.status = int(status)
        self._set_body(body)

@@ -25,6 +26,7 @@ class Response(object_ref):
        self.request = request
        self.flags = [] if flags is None else list(flags)
        self.certificate = certificate
+       self.ip_address = ip_address

    @property
    def cb_kwargs(self):

@@ -87,7 +89,8 @@ class Response(object_ref):
        """Create a new Response with the same attributes except for those
        given new values.
        """
-       for x in ['url', 'status', 'headers', 'body', 'request', 'flags', 'certificate']:
+       for x in ['url', 'status', 'headers', 'body',
+                 'request', 'flags', 'certificate', 'ip_address']:
            kwargs.setdefault(x, getattr(self, x))
        cls = kwargs.pop('cls', self.__class__)
        return cls(*args, **kwargs)
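Note (illustrative, not part of the diff): a rough sketch of reading the new ip_address attribute from a spider callback; the attribute is assumed to be an ipaddress object, or None when the downloader did not report one:

    def parse(self, response):
        self.logger.info("%s was served from %s", response.url, response.ip_address)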
@@ -5,6 +5,8 @@ discovering (through HTTP headers) to base Response class.

See documentation in docs/topics/request-response.rst
"""
+import json
+import warnings
from contextlib import suppress
from typing import Generator
from urllib.parse import urljoin

@@ -14,15 +16,19 @@ from w3lib.encoding import (html_body_declared_encoding, html_to_unicode,
                            http_content_type_encoding, resolve_encoding)
from w3lib.html import strip_html5_whitespace

+from scrapy.exceptions import ScrapyDeprecationWarning
from scrapy.http import Request
from scrapy.http.response import Response
from scrapy.utils.python import memoizemethod_noargs, to_unicode
from scrapy.utils.response import get_base_url

+_NONE = object()
+

class TextResponse(Response):

    _DEFAULT_ENCODING = 'ascii'
+   _cached_decoded_json = _NONE

    def __init__(self, *args, **kwargs):
        self._encoding = kwargs.pop('encoding', None)

@@ -61,8 +67,21 @@ class TextResponse(Response):

    def body_as_unicode(self):
        """Return body as unicode"""
+       warnings.warn('Response.body_as_unicode() is deprecated, '
+                     'please use Response.text instead.',
+                     ScrapyDeprecationWarning, stacklevel=2)
        return self.text

+   def json(self):
+       """
+       .. versionadded:: 2.2
+
+       Deserialize a JSON document to a Python object.
+       """
+       if self._cached_decoded_json is _NONE:
+           self._cached_decoded_json = json.loads(self.text)
+       return self._cached_decoded_json
+
    @property
    def text(self):
        """ Body as unicode """
@@ -14,28 +14,39 @@ from scrapy.utils.deprecate import ScrapyDeprecationWarning
from scrapy.utils.trackref import object_ref


-class BaseItem(object_ref):
-   """Base class for all scraped items.
-
-   In Scrapy, an object is considered an *item* if it is an instance of either
-   :class:`BaseItem` or :class:`dict`. For example, when the output of a
-   spider callback is evaluated, only instances of :class:`BaseItem` or
-   :class:`dict` are passed to :ref:`item pipelines <topics-item-pipeline>`.
-
-   If you need instances of a custom class to be considered items by Scrapy,
-   you must inherit from either :class:`BaseItem` or :class:`dict`.
-
-   Unlike instances of :class:`dict`, instances of :class:`BaseItem` may be
-   :ref:`tracked <topics-leaks-trackrefs>` to debug memory leaks.
+class _BaseItem(object_ref):
+   """
+   Temporary class used internally to avoid the deprecation
+   warning raised by isinstance checks using BaseItem.
    """
    pass


+class _BaseItemMeta(ABCMeta):
+   def __instancecheck__(cls, instance):
+       if cls is BaseItem:
+           warn('scrapy.item.BaseItem is deprecated, please use scrapy.item.Item instead',
+                ScrapyDeprecationWarning, stacklevel=2)
+       return super().__instancecheck__(instance)
+
+
+class BaseItem(_BaseItem, metaclass=_BaseItemMeta):
+   """
+   Deprecated, please use :class:`scrapy.item.Item` instead
+   """
+
+   def __new__(cls, *args, **kwargs):
+       if issubclass(cls, BaseItem) and not issubclass(cls, (Item, DictItem)):
+           warn('scrapy.item.BaseItem is deprecated, please use scrapy.item.Item instead',
+                ScrapyDeprecationWarning, stacklevel=2)
+       return super(BaseItem, cls).__new__(cls, *args, **kwargs)
+
+
class Field(dict):
    """Container of field metadata"""


-class ItemMeta(ABCMeta):
+class ItemMeta(_BaseItemMeta):
    """Metaclass_ of :class:`Item` that handles field definitions.

    .. _metaclass: https://realpython.com/python-metaclasses

@@ -68,8 +79,7 @@ class DictItem(MutableMapping, BaseItem):

    def __new__(cls, *args, **kwargs):
        if issubclass(cls, DictItem) and not issubclass(cls, Item):
-           warn('scrapy.item.DictItem is deprecated, please use '
-                'scrapy.item.Item instead',
+           warn('scrapy.item.DictItem is deprecated, please use scrapy.item.Item instead',
                 ScrapyDeprecationWarning, stacklevel=2)
        return super(DictItem, cls).__new__(cls, *args, **kwargs)

@@ -86,8 +96,7 @@ class DictItem(MutableMapping, BaseItem):
        if key in self.fields:
            self._values[key] = value
        else:
-           raise KeyError("%s does not support field: %s" %
-                          (self.__class__.__name__, key))
+           raise KeyError("%s does not support field: %s" % (self.__class__.__name__, key))

    def __delitem__(self, key):
        del self._values[key]

@@ -99,8 +108,7 @@ class DictItem(MutableMapping, BaseItem):
    def __setattr__(self, name, value):
        if not name.startswith('_'):
-           raise AttributeError("Use item[%r] = %r to set field value" %
-                                (name, value))
+           raise AttributeError("Use item[%r] = %r to set field value" % (name, value))
        super(DictItem, self).__setattr__(name, value)

    def __len__(self):

@@ -121,12 +129,30 @@ class DictItem(MutableMapping, BaseItem):
        return self.__class__(self)

    def deepcopy(self):
-       """Return a `deep copy`_ of this item.
-
-       .. _deep copy: https://docs.python.org/library/copy.html#copy.deepcopy
+       """Return a :func:`~copy.deepcopy` of this item.
        """
        return deepcopy(self)


class Item(DictItem, metaclass=ItemMeta):
-   pass
+   """
+   Base class for scraped items.
+
+   In Scrapy, an object is considered an ``item`` if it is an instance of either
+   :class:`Item` or :class:`dict`, or any subclass. For example, when the output of a
+   spider callback is evaluated, only instances of :class:`Item` or
+   :class:`dict` are passed to :ref:`item pipelines <topics-item-pipeline>`.
+
+   If you need instances of a custom class to be considered items by Scrapy,
+   you must inherit from either :class:`Item` or :class:`dict`.
+
+   Items must declare :class:`Field` attributes, which are processed and stored
+   in the ``fields`` attribute. This restricts the set of allowed field names
+   and prevents typos, raising ``KeyError`` when referring to undefined fields.
+   Additionally, fields can be used to define metadata and control the way
+   data is processed internally. Please refer to the :ref:`documentation
+   about fields <topics-items-fields>` for additional information.
+
+   Unlike instances of :class:`dict`, instances of :class:`Item` may be
+   :ref:`tracked <topics-leaks-trackrefs>` to debug memory leaks.
+   """
@@ -45,8 +45,14 @@ IGNORED_EXTENSIONS = [

_re_type = type(re.compile("", 0))

-_matches = lambda url, regexs: any(r.search(url) for r in regexs)
-_is_valid_url = lambda url: url.split('://', 1)[0] in {'http', 'https', 'file', 'ftp'}
+
+def _matches(url, regexs):
+   return any(r.search(url) for r in regexs)
+
+
+def _is_valid_url(url):
+   return url.split('://', 1)[0] in {'http', 'https', 'file', 'ftp'}


class FilteringLinkExtractor:

@@ -55,8 +61,7 @@ class FilteringLinkExtractor:

    def __new__(cls, *args, **kwargs):
        from scrapy.linkextractors.lxmlhtml import LxmlLinkExtractor
-       if (issubclass(cls, FilteringLinkExtractor) and
-               not issubclass(cls, LxmlLinkExtractor)):
+       if issubclass(cls, FilteringLinkExtractor) and not issubclass(cls, LxmlLinkExtractor):
            warn('scrapy.linkextractors.FilteringLinkExtractor is deprecated, '
                 'please use scrapy.linkextractors.LinkExtractor instead',
                 ScrapyDeprecationWarning, stacklevel=2)

@@ -128,4 +133,4 @@ class FilteringLinkExtractor:

# Top-level imports
-from scrapy.linkextractors.lxmlhtml import LxmlLinkExtractor as LinkExtractor  # noqa: F401
+from scrapy.linkextractors.lxmlhtml import LxmlLinkExtractor as LinkExtractor
@@ -1,6 +1,8 @@
"""
Link extractor based on lxml.html
"""
+import operator
+from functools import partial
from urllib.parse import urljoin

import lxml.etree as etree

@@ -8,10 +10,10 @@ from w3lib.html import strip_html5_whitespace
from w3lib.url import canonicalize_url, safe_url_string

from scrapy.link import Link
+from scrapy.linkextractors import FilteringLinkExtractor
from scrapy.utils.misc import arg_to_iter, rel_has_nofollow
from scrapy.utils.python import unique as unique_list
from scrapy.utils.response import get_base_url
-from scrapy.linkextractors import FilteringLinkExtractor


# from lxml/src/lxml/html/__init__.py

@@ -27,19 +29,24 @@ def _nons(tag):
    return tag


+def _identity(x):
+   return x
+
+
+def _canonicalize_link_url(link):
+   return canonicalize_url(link.url, keep_fragments=True)
+
+
class LxmlParserLinkExtractor:
-   def __init__(self, tag="a", attr="href", process=None, unique=False,
-                strip=True, canonicalized=False):
-       self.scan_tag = tag if callable(tag) else lambda t: t == tag
-       self.scan_attr = attr if callable(attr) else lambda a: a == attr
-       self.process_attr = process if callable(process) else lambda v: v
+   def __init__(
+       self, tag="a", attr="href", process=None, unique=False, strip=True, canonicalized=False
+   ):
+       self.scan_tag = tag if callable(tag) else partial(operator.eq, tag)
+       self.scan_attr = attr if callable(attr) else partial(operator.eq, attr)
+       self.process_attr = process if callable(process) else _identity
        self.unique = unique
        self.strip = strip
-       if canonicalized:
-           self.link_key = lambda link: link.url
-       else:
-           self.link_key = lambda link: canonicalize_url(link.url,
-                                                         keep_fragments=True)
+       self.link_key = operator.attrgetter("url") if canonicalized else _canonicalize_link_url

    def _iter_links(self, document):
        for el in document.iter(etree.Element):

@@ -93,27 +100,44 @@ class LxmlParserLinkExtractor:

class LxmlLinkExtractor(FilteringLinkExtractor):

-   def __init__(self, allow=(), deny=(), allow_domains=(), deny_domains=(), restrict_xpaths=(),
-                tags=('a', 'area'), attrs=('href',), canonicalize=False,
-                unique=True, process_value=None, deny_extensions=None, restrict_css=(),
-                strip=True, restrict_text=None):
+   def __init__(
+       self,
+       allow=(),
+       deny=(),
+       allow_domains=(),
+       deny_domains=(),
+       restrict_xpaths=(),
+       tags=('a', 'area'),
+       attrs=('href',),
+       canonicalize=False,
+       unique=True,
+       process_value=None,
+       deny_extensions=None,
+       restrict_css=(),
+       strip=True,
+       restrict_text=None,
+   ):
        tags, attrs = set(arg_to_iter(tags)), set(arg_to_iter(attrs))
-       tag_func = lambda x: x in tags
-       attr_func = lambda x: x in attrs
        lx = LxmlParserLinkExtractor(
-           tag=tag_func,
-           attr=attr_func,
+           tag=partial(operator.contains, tags),
+           attr=partial(operator.contains, attrs),
            unique=unique,
            process=process_value,
            strip=strip,
            canonicalized=canonicalize
        )
-
-       super(LxmlLinkExtractor, self).__init__(lx, allow=allow, deny=deny,
-           allow_domains=allow_domains, deny_domains=deny_domains,
-           restrict_xpaths=restrict_xpaths, restrict_css=restrict_css,
-           canonicalize=canonicalize, deny_extensions=deny_extensions,
-           restrict_text=restrict_text)
+       super(LxmlLinkExtractor, self).__init__(
+           link_extractor=lx,
+           allow=allow,
+           deny=deny,
+           allow_domains=allow_domains,
+           deny_domains=deny_domains,
+           restrict_xpaths=restrict_xpaths,
+           restrict_css=restrict_css,
+           canonicalize=canonicalize,
+           deny_extensions=deny_extensions,
+           restrict_text=restrict_text,
+       )

    def extract_links(self, response):
        """Returns a list of :class:`~scrapy.link.Link` objects from the

@@ -126,9 +150,11 @@ class LxmlLinkExtractor(FilteringLinkExtractor):
        """
        base_url = get_base_url(response)
        if self.restrict_xpaths:
-           docs = [subdoc
-                   for x in self.restrict_xpaths
-                   for subdoc in response.xpath(x)]
+           docs = [
+               subdoc
+               for x in self.restrict_xpaths
+               for subdoc in response.xpath(x)
+           ]
        else:
            docs = [response.selector]
        all_links = []
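Note (illustrative, not part of the diff): a rough sketch of the public LinkExtractor interface whose keyword arguments are reformatted above; the patterns, CSS selector, and callback name are made up:

    from scrapy.linkextractors import LinkExtractor

    def parse(self, response):
        extractor = LinkExtractor(allow=r"/products/", restrict_css=".listing", deny_extensions=["pdf"])
        for link in extractor.extract_links(response):
            yield response.follow(link, callback=self.parse_product)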
@@ -6,6 +6,8 @@ See documentation in docs/topics/loaders.rst
from collections import defaultdict
from contextlib import suppress

+from itemadapter import ItemAdapter
+
from scrapy.item import Item
from scrapy.loader.common import wrap_loader_context
from scrapy.loader.processors import Identity

@@ -44,7 +46,7 @@ class ItemLoader:
        self._local_item = context['item'] = item
        self._local_values = defaultdict(list)
        # values from initial item
-       for field_name, value in item.items():
+       for field_name, value in ItemAdapter(item).items():
            self._values[field_name] += arg_to_iter(value)

    @property

@@ -127,13 +129,12 @@ class ItemLoader:
        return value

    def load_item(self):
-       item = self.item
+       adapter = ItemAdapter(self.item)
        for field_name in tuple(self._values):
            value = self.get_output_value(field_name)
            if value is not None:
-               item[field_name] = value
-
-       return item
+               adapter[field_name] = value
+       return adapter.item

    def get_output_value(self, field_name):
        proc = self.get_output_processor(field_name)

@@ -174,11 +175,8 @@ class ItemLoader:
                             value, type(e).__name__, str(e)))

    def _get_item_field_attr(self, field_name, key, default=None):
-       if isinstance(self.item, Item):
-           value = self.item.fields[field_name].get(key, default)
-       else:
-           value = default
-       return value
+       field_meta = ItemAdapter(self.item).get_field_meta(field_name)
+       return field_meta.get(key, default)

    def _check_selector_method(self):
        if self.selector is None:
@@ -28,8 +28,10 @@ def _to_bytes_or_none(text):

class MailSender:

-   def __init__(self, smtphost='localhost', mailfrom='scrapy@localhost',
-                smtpuser=None, smtppass=None, smtpport=25, smtptls=False, smtpssl=False, debug=False):
+   def __init__(
+       self, smtphost='localhost', mailfrom='scrapy@localhost', smtpuser=None,
+       smtppass=None, smtpport=25, smtptls=False, smtpssl=False, debug=False
+   ):
        self.smtphost = smtphost
        self.smtpport = smtpport
        self.smtpuser = _to_bytes_or_none(smtpuser)

@@ -41,9 +43,15 @@ class MailSender:

    @classmethod
    def from_settings(cls, settings):
-       return cls(settings['MAIL_HOST'], settings['MAIL_FROM'], settings['MAIL_USER'],
-                  settings['MAIL_PASS'], settings.getint('MAIL_PORT'),
-                  settings.getbool('MAIL_TLS'), settings.getbool('MAIL_SSL'))
+       return cls(
+           smtphost=settings['MAIL_HOST'],
+           mailfrom=settings['MAIL_FROM'],
+           smtpuser=settings['MAIL_USER'],
+           smtppass=settings['MAIL_PASS'],
+           smtpport=settings.getint('MAIL_PORT'),
+           smtptls=settings.getbool('MAIL_TLS'),
+           smtpssl=settings.getbool('MAIL_SSL'),
+       )

    def send(self, to, subject, body, cc=None, attachs=(), mimetype='text/plain', charset=None, _callback=None):
        from twisted.internet import reactor

@@ -89,9 +97,12 @@ class MailSender:
            return

        dfd = self._sendmail(rcpts, msg.as_string().encode(charset or 'utf-8'))
-       dfd.addCallbacks(self._sent_ok, self._sent_failed,
+       dfd.addCallbacks(
+           callback=self._sent_ok,
+           errback=self._sent_failed,
            callbackArgs=[to, cc, subject, len(attachs)],
-           errbackArgs=[to, cc, subject, len(attachs)])
+           errbackArgs=[to, cc, subject, len(attachs)],
+       )
        reactor.addSystemEventTrigger('before', 'shutdown', lambda: dfd)
        return dfd

@@ -115,9 +126,10 @@ class MailSender:
        from twisted.mail.smtp import ESMTPSenderFactory
        msg = BytesIO(msg)
        d = defer.Deferred()
-       factory = ESMTPSenderFactory(self.smtpuser, self.smtppass, self.mailfrom, \
-           to_addrs, msg, d, heloFallback=True, requireAuthentication=False, \
-           requireTransportSecurity=self.smtptls)
+       factory = ESMTPSenderFactory(
+           self.smtpuser, self.smtppass, self.mailfrom, to_addrs, msg, d,
+           heloFallback=True, requireAuthentication=False, requireTransportSecurity=self.smtptls,
+       )
        factory.noisy = False

        if self.smtpssl:
@@ -10,24 +10,26 @@ import mimetypes
import os
import time
from collections import defaultdict
-from email.utils import parsedate_tz, mktime_tz
+from contextlib import suppress
+from email.utils import mktime_tz, parsedate_tz
from ftplib import FTP
from io import BytesIO
from urllib.parse import urlparse

+from itemadapter import ItemAdapter
from twisted.internet import defer, threads

+from scrapy.exceptions import IgnoreRequest, NotConfigured
+from scrapy.http import Request
from scrapy.pipelines.media import MediaPipeline
from scrapy.settings import Settings
-from scrapy.exceptions import NotConfigured, IgnoreRequest
-from scrapy.http import Request
-from scrapy.utils.misc import md5sum
-from scrapy.utils.log import failure_to_exc_info
-from scrapy.utils.python import to_bytes
-from scrapy.utils.request import referer_str
from scrapy.utils.boto import is_botocore
from scrapy.utils.datatypes import CaselessDict
from scrapy.utils.ftp import ftp_store_file
+from scrapy.utils.log import failure_to_exc_info
+from scrapy.utils.misc import md5sum
+from scrapy.utils.python import to_bytes
+from scrapy.utils.request import referer_str

logger = logging.getLogger(__name__)

@@ -83,8 +85,7 @@ class S3FilesStore:
    AWS_USE_SSL = None
    AWS_VERIFY = None

-   POLICY = 'private'  # Overriden from settings.FILES_STORE_S3_ACL in
-                       # FilesPipeline.from_settings.
+   POLICY = 'private'  # Overriden from settings.FILES_STORE_S3_ACL in FilesPipeline.from_settings
    HEADERS = {
        'Cache-Control': 'max-age=172800',
    }

@@ -106,7 +107,8 @@ class S3FilesStore:
        else:
            from boto.s3.connection import S3Connection
            self.S3Connection = S3Connection
-       assert uri.startswith('s3://')
+       if not uri.startswith("s3://"):
+           raise ValueError("Incorrect URI scheme in %s, expected 's3'" % uri)
        self.bucket, self.prefix = uri[5:].split('/', 1)

    def stat_file(self, path, info):

@@ -229,6 +231,20 @@ class GCSFilesStore:
        bucket, prefix = uri[5:].split('/', 1)
        self.bucket = client.bucket(bucket)
        self.prefix = prefix
+       permissions = self.bucket.test_iam_permissions(
+           ['storage.objects.get', 'storage.objects.create']
+       )
+       if 'storage.objects.get' not in permissions:
+           logger.warning(
+               "No 'storage.objects.get' permission for GSC bucket %(bucket)s. "
+               "Checking if files are up to date will be impossible. Files will be downloaded every time.",
+               {'bucket': bucket}
+           )
+       if 'storage.objects.create' not in permissions:
+           logger.error(
+               "No 'storage.objects.create' permission for GSC bucket %(bucket)s. Saving files will be impossible!",
+               {'bucket': bucket}
+           )

    def stat_file(self, path, info):
        def _onsuccess(blob):

@@ -266,7 +282,8 @@ class FTPFilesStore:
    USE_ACTIVE_MODE = None

    def __init__(self, uri):
-       assert uri.startswith('ftp://')
+       if not uri.startswith("ftp://"):
+           raise ValueError("Incorrect URI scheme in %s, expected 'ftp'" % uri)
        u = urlparse(uri)
        self.port = u.port
        self.host = u.hostname

@@ -417,7 +434,7 @@ class FilesPipeline(MediaPipeline):
                self.inc_stats(info.spider, 'uptodate')

            checksum = result.get('checksum', None)
-           return {'url': request.url, 'path': path, 'checksum': checksum}
+           return {'url': request.url, 'path': path, 'checksum': checksum, 'status': 'uptodate'}

        path = self.file_path(request, info=info)
        dfd = defer.maybeDeferred(self.store.stat_file, path, info)

@@ -494,15 +511,16 @@ class FilesPipeline(MediaPipeline):
            )
            raise FileException(str(exc))

-       return {'url': request.url, 'path': path, 'checksum': checksum}
+       return {'url': request.url, 'path': path, 'checksum': checksum, 'status': status}

    def inc_stats(self, spider, status):
        spider.crawler.stats.inc_value('file_count', spider=spider)
        spider.crawler.stats.inc_value('file_status_count/%s' % status, spider=spider)

-   ### Overridable Interface
+   # Overridable Interface
    def get_media_requests(self, item, info):
-       return [Request(x) for x in item.get(self.files_urls_field, [])]
+       urls = ItemAdapter(item).get(self.files_urls_field, [])
+       return [Request(u) for u in urls]

    def file_downloaded(self, response, request, info):
        path = self.file_path(request, response=response, info=info)

@@ -513,8 +531,8 @@ class FilesPipeline(MediaPipeline):
        return checksum

    def item_completed(self, results, item, info):
-       if isinstance(item, dict) or self.files_result_field in item.fields:
-           item[self.files_result_field] = [x for ok, x in results if ok]
+       with suppress(KeyError):
+           ItemAdapter(item)[self.files_result_field] = [x for ok, x in results if ok]
        return item

    def file_path(self, request, response=None, info=None):
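Note (illustrative, not part of the diff): a rough sketch of enabling the pipeline and of the result entries it stores, which now also carry the 'status' key added above; paths and settings values are made up:

    # settings.py (hypothetical project)
    ITEM_PIPELINES = {"scrapy.pipelines.files.FilesPipeline": 1}
    FILES_STORE = "/tmp/scrapy-files"

    # Each entry stored under the item's files result field then looks roughly like:
    # {'url': ..., 'path': ..., 'checksum': ..., 'status': 'downloaded'}   # or 'uptodate'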
@@ -5,17 +5,19 @@ See documentation in topics/media-pipeline.rst
"""
import functools
import hashlib
+from contextlib import suppress
from io import BytesIO

+from itemadapter import ItemAdapter
from PIL import Image

+from scrapy.exceptions import DropItem
+from scrapy.http import Request
+from scrapy.pipelines.files import FileException, FilesPipeline
+# TODO: from scrapy.pipelines.media import MediaPipeline
+from scrapy.settings import Settings
from scrapy.utils.misc import md5sum
from scrapy.utils.python import to_bytes
-from scrapy.http import Request
-from scrapy.settings import Settings
-from scrapy.exceptions import DropItem
-#TODO: from scrapy.pipelines.media import MediaPipeline
-from scrapy.pipelines.files import FileException, FilesPipeline


class NoimagesDrop(DropItem):

@@ -157,11 +159,12 @@ class ImagesPipeline(FilesPipeline):
        return image, buf

    def get_media_requests(self, item, info):
-       return [Request(x) for x in item.get(self.images_urls_field, [])]
+       urls = ItemAdapter(item).get(self.images_urls_field, [])
+       return [Request(u) for u in urls]

    def item_completed(self, results, item, info):
-       if isinstance(item, dict) or self.images_result_field in item.fields:
-           item[self.images_result_field] = [x for ok, x in results if ok]
+       with suppress(KeyError):
+           ItemAdapter(item)[self.images_result_field] = [x for ok, x in results if ok]
        return item

    def file_path(self, request, response=None, info=None):
@@ -1,7 +1,7 @@
import functools
import logging
from collections import defaultdict

-from twisted.internet.defer import Deferred, DeferredList, _DefGen_Return
+from twisted.internet.defer import Deferred, DeferredList
from twisted.python.failure import Failure

from scrapy.settings import Settings

@@ -43,8 +43,7 @@ class MediaPipeline:
        if allow_redirects:
            self.handle_httpstatus_list = SequenceExclude(range(300, 400))

-   def _key_for_pipe(self, key, base_class_name=None,
-                     settings=None):
+   def _key_for_pipe(self, key, base_class_name=None, settings=None):
        """
        >>> MediaPipeline()._key_for_pipe("IMAGES")
        'IMAGES'

@@ -55,8 +54,11 @@ class MediaPipeline:
        """
        class_name = self.__class__.__name__
        formatted_key = "{}_{}".format(class_name.upper(), key)
-       if class_name == base_class_name or not base_class_name \
-               or (settings and not settings.get(formatted_key)):
+       if (
+           not base_class_name
+           or class_name == base_class_name
+           or settings and not settings.get(formatted_key)
+       ):
            return key
        return formatted_key

@@ -141,24 +143,26 @@ class MediaPipeline:
        # This code fixes a memory leak by avoiding to keep references to
        # the Request and Response objects on the Media Pipeline cache.
        #
-       # Twisted inline callbacks pass return values using the function
-       # twisted.internet.defer.returnValue, which encapsulates the return
-       # value inside a _DefGen_Return base exception.
-       #
-       # What happens when the media_downloaded callback raises another
+       # What happens when the media_downloaded callback raises an
        # exception, for example a FileException('download-error') when
-       # the Response status code is not 200 OK, is that it stores the
-       # _DefGen_Return exception on the FileException context.
+       # the Response status code is not 200 OK, is that the original
+       # StopIteration exception (which in turn contains the failed
+       # Response and by extension, the original Request) gets encapsulated
+       # within the FileException context.
+       #
+       # Originally, Scrapy was using twisted.internet.defer.returnValue
+       # inside functions decorated with twisted.internet.defer.inlineCallbacks,
+       # encapsulating the returned Response in a _DefGen_Return exception
+       # instead of a StopIteration.
        #
        # To avoid keeping references to the Response and therefore Request
        # objects on the Media Pipeline cache, we should wipe the context of
-       # the exception encapsulated by the Twisted Failure when its a
-       # _DefGen_Return instance.
+       # the encapsulated exception when it is a StopIteration instance
        #
        # This problem does not occur in Python 2.7 since we don't have
        # Exception Chaining (https://www.python.org/dev/peps/pep-3134/).
        context = getattr(result.value, '__context__', None)
-       if isinstance(context, _DefGen_Return):
+       if isinstance(context, StopIteration):
            setattr(result.value, '__context__', None)

        info.downloading.remove(fp)

@@ -166,7 +170,7 @@ class MediaPipeline:
        for wad in info.waiting.pop(fp):
            defer_result(result).chainDeferred(wad)

-   ### Overridable Interface
+   # Overridable Interface
    def media_to_download(self, request, info):
        """Check request before starting download"""
        pass
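Note (illustrative, not part of the diff): the implicit exception chaining that the comment above refers to can be seen in plain Python; the messages below are made up:

    try:
        try:
            raise StopIteration("holds a reference to the response")
        except StopIteration:
            raise ValueError("download-error")
    except ValueError as exc:
        # the first exception stays alive via __context__ until it is cleared
        print(type(exc.__context__).__name__)   # StopIteration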
@@ -58,9 +58,9 @@ class ResponseTypes:

    def from_content_disposition(self, content_disposition):
        try:
-           filename = to_unicode(content_disposition,
-                                 encoding='latin-1', errors='replace').split(';')[1].split('=')[1]
-           filename = filename.strip('"\'')
+           filename = to_unicode(
+               content_disposition, encoding='latin-1', errors='replace'
+           ).split(';')[1].split('=')[1].strip('"\'')
            return self.from_filename(filename)
        except IndexError:
            return Response

@@ -71,7 +71,7 @@ class ResponseTypes:
        cls = Response
        if b'Content-Type' in headers:
            cls = self.from_content_type(
-               content_type=headers[b'Content-type'],
+               content_type=headers[b'Content-Type'],
                content_encoding=headers.get(b'Content-Encoding')
            )
        if cls is Response and b'Content-Disposition' in headers:
@@ -17,10 +17,12 @@ def decode_robotstxt(robotstxt_body, spider, to_native_str_type=False):
    except UnicodeDecodeError:
        # If we found garbage or robots.txt in an encoding other than UTF-8, disregard it.
        # Switch to 'allow all' state.
-       logger.warning("Failure while parsing robots.txt. "
-                      "File either contains garbage or is in an encoding other than UTF-8, treating it as an empty file.",
-                      exc_info=sys.exc_info(),
-                      extra={'spider': spider})
+       logger.warning(
+           "Failure while parsing robots.txt. File either contains garbage or "
+           "is in an encoding other than UTF-8, treating it as an empty file.",
+           exc_info=sys.exc_info(),
+           extra={'spider': spider},
+       )
        robotstxt_body = ''
    return robotstxt_body
@@ -1,4 +1,6 @@
"""
Selectors
"""
-from scrapy.selector.unified import *  # noqa: F401
+
+# top-level imports
+from scrapy.selector.unified import Selector, SelectorList
@@ -65,9 +65,9 @@ class Selector(_ParselSelector, object_ref):
    selectorlist_cls = SelectorList

    def __init__(self, response=None, text=None, type=None, root=None, **kwargs):
-       if not(response is None or text is None):
+       if response is not None and text is not None:
            raise ValueError('%s.__init__() received both response and text'
                             % self.__class__.__name__)

        st = _st(response, type or self._default_type)

Some files were not shown because too many files have changed in this diff.