Mirror of https://github.com/scrapy/scrapy.git (synced 2025-03-14 16:58:20 +00:00)

Merge branch 'master' into azure-pipelines

This commit is contained in commit 6e58da1dcd.
@@ -1,5 +1,5 @@
[bumpversion]
current_version = 2.0.0
current_version = 2.2.0
commit = True
tag = True
tag_name = {new_version}
.gitignore (vendored, 1 change)

@@ -15,6 +15,7 @@ htmlcov/
.pytest_cache/
.coverage.*
.cache/
.mypy_cache/

# Windows
Thumbs.db
@@ -1,4 +1,5 @@
version: 2
formats: all
sphinx:
  configuration: docs/conf.py
  fail_on_warning: true
.travis.yml (44 changes)

@@ -11,25 +11,35 @@ matrix:
      python: 3.8
    - env: TOXENV=flake8
      python: 3.8
    - env: TOXENV=pypy3
    - env: TOXENV=py35
      python: 3.5
    - env: TOXENV=pinned
      python: 3.5
    - env: TOXENV=py35-asyncio
      python: 3.5.2
    - env: TOXENV=py36
      python: 3.6
    - env: TOXENV=py37
      python: 3.7
    - env: TOXENV=py38
      python: 3.8
    - env: TOXENV=extra-deps
      python: 3.8
    - env: TOXENV=py38-asyncio
    - env: TOXENV=pylint
      python: 3.8
    - env: TOXENV=docs
      python: 3.7 # Keep in sync with .readthedocs.yml
    - env: TOXENV=typing
      python: 3.8

    - env: TOXENV=pypy3
    - env: TOXENV=pinned
      python: 3.5.2
    - env: TOXENV=asyncio
      python: 3.5.2 # We use additional code to support 3.5.3 and earlier
    - env: TOXENV=py
      python: 3.5
    - env: TOXENV=asyncio
      python: 3.5 # We use specific code to support >= 3.5.4, < 3.6
    - env: TOXENV=py
      python: 3.6
    - env: TOXENV=py
      python: 3.7
    - env: TOXENV=py PYPI_RELEASE_JOB=true
      python: 3.8
      dist: bionic
    - env: TOXENV=extra-deps
      python: 3.8
      dist: bionic
    - env: TOXENV=asyncio
      python: 3.8
      dist: bionic
install:
- |
  if [ "$TOXENV" = "pypy3" ]; then

@@ -62,4 +72,4 @@ deploy:
  on:
    tags: true
    repo: scrapy/scrapy
    condition: "$TOXENV == py37 && $TRAVIS_TAG =~ ^[0-9]+[.][0-9]+[.][0-9]+(rc[0-9]+|[.]dev[0-9]+)?$"
    condition: "$PYPI_RELEASE_JOB == true && $TRAVIS_TAG =~ ^[0-9]+[.][0-9]+[.][0-9]+(rc[0-9]+|[.]dev[0-9]+)?$"
@@ -40,7 +40,7 @@ including a list of features.

Requirements
============

* Python 3.5+
* Python 3.5.2+
* Works on Linux, Windows, macOS, BSD

Install
@@ -12,6 +12,8 @@ collect_ignore = [
    "scrapy/utils/testsite.py",
    # contains scripts to be run by tests/test_crawler.py::CrawlerProcessSubprocess
    *_py_files("tests/CrawlerProcess"),
    # contains scripts to be run by tests/test_crawler.py::CrawlerRunnerSubprocess
    *_py_files("tests/CrawlerRunner"),
    # Py36-only parts of respective tests
    *_py_files("tests/py36"),
]
@@ -57,3 +57,12 @@ There is a way to recreate the doc automatically when you make changes, you
need to install watchdog (``pip install watchdog``) and then use::

    make watch

Alternative method using tox
----------------------------

To compile the documentation to HTML run the following command::

    tox -e docs

Documentation will be generated (in HTML format) inside the ``.tox/docs/tmp/html`` dir.
docs/conf.py (14 changes)

@@ -1,5 +1,3 @@
# -*- coding: utf-8 -*-
#
# Scrapy documentation build configuration file, created by
# sphinx-quickstart on Mon Nov 24 12:02:52 2008.
#

@@ -102,6 +100,9 @@ exclude_trees = ['.build']
# The name of the Pygments (syntax highlighting) style to use.
pygments_style = 'sphinx'

# List of Sphinx warnings that will not be raised
suppress_warnings = ['epub.unknown_project_files']


# Options for HTML output
# -----------------------

@@ -280,6 +281,7 @@ coverage_ignore_pyobjects = [
# -------------------------------------

intersphinx_mapping = {
    'attrs': ('https://www.attrs.org/en/stable/', None),
    'coverage': ('https://coverage.readthedocs.io/en/stable', None),
    'cssselect': ('https://cssselect.readthedocs.io/en/latest', None),
    'pytest': ('https://docs.pytest.org/en/latest', None),

@@ -295,3 +297,11 @@ intersphinx_mapping = {
# ------------------------------------

hoverxref_auto_ref = True
hoverxref_role_types = {
    "class": "tooltip",
    "confval": "tooltip",
    "hoverxref": "tooltip",
    "mod": "tooltip",
    "ref": "tooltip",
}
hoverxref_roles = ['command', 'reqmeta', 'setting', 'signal']
@@ -155,6 +155,9 @@ Finally, try to keep aesthetic changes (:pep:`8` compliance, unused imports
removal, etc) in separate commits from functional changes. This will make pull
requests easier to review and more likely to get merged.


.. _coding-style:

Coding style
============

@@ -163,7 +166,7 @@ Scrapy:

* Unless otherwise specified, follow :pep:`8`.

* It's OK to use lines longer than 80 chars if it improves the code
* It's OK to use lines longer than 79 chars if it improves the code
  readability.

* Don't put your name in the code you contribute; git provides enough
docs/faq.rst (23 changes)

@@ -69,7 +69,7 @@ Here's an example spider using BeautifulSoup API, with ``lxml`` as the HTML pars

What Python versions does Scrapy support?
-----------------------------------------

Scrapy is supported under Python 3.5+
Scrapy is supported under Python 3.5.2+
under CPython (default Python implementation) and PyPy (starting with PyPy 5.9).
Python 3 support was added in Scrapy 1.1.
PyPy support was added in Scrapy 1.4, PyPy3 support was added in Scrapy 1.5.

@@ -342,15 +342,15 @@ method for this purpose. For example::

    from copy import deepcopy

    from scrapy.item import BaseItem

    from itemadapter import is_item, ItemAdapter

    class MultiplyItemsMiddleware:

        def process_spider_output(self, response, result, spider):
            for item in result:
                if isinstance(item, (BaseItem, dict)):
                    for _ in range(item['multiply_by']):
                if is_item(item):
                    adapter = ItemAdapter(item)
                    for _ in range(adapter['multiply_by']):
                        yield deepcopy(item)

Does Scrapy support IPv6 addresses?

@@ -371,6 +371,19 @@ Twisted reactor is :class:`twisted.internet.selectreactor.SelectReactor`. Switch
different reactor is possible by using the :setting:`TWISTED_REACTOR` setting.
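
For illustration, a minimal ``settings.py`` sketch; the value below is the
asyncio-based reactor bundled with Twisted, adjust it to the reactor you
actually want to use::

    # settings.py
    TWISTED_REACTOR = "twisted.internet.asyncioreactor.AsyncioSelectorReactor"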


.. _faq-stop-response-download:

How can I cancel the download of a given response?
--------------------------------------------------

In some situations, it might be useful to stop the download of a certain response.
For instance, if you only need the first part of a large response and you would like
to save resources by avoiding the download of the whole body.
In that case, you could attach a handler to the :class:`~scrapy.signals.bytes_received`
signal and raise a :exc:`~scrapy.exceptions.StopDownload` exception. Please refer to
the :ref:`topics-stop-response-download` topic for additional information and examples.
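
As an illustrative sketch only (the spider name and URL are placeholders), a
handler connected to ``bytes_received`` could look like this::

    import scrapy
    from scrapy.exceptions import StopDownload


    class FirstChunkSpider(scrapy.Spider):
        name = "first_chunk"
        start_urls = ["https://example.com/very-large-page"]

        @classmethod
        def from_crawler(cls, crawler, *args, **kwargs):
            spider = super().from_crawler(crawler, *args, **kwargs)
            # Connect the handler that stops the download early.
            crawler.signals.connect(spider.on_bytes_received,
                                    signal=scrapy.signals.bytes_received)
            return spider

        def on_bytes_received(self, data, request, spider):
            # Keep only the bytes received so far; with fail=False the
            # truncated response is passed to the regular callback.
            raise StopDownload(fail=False)

        def parse(self, response):
            self.logger.info("Received %d bytes", len(response.body))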

.. _has been reported: https://github.com/scrapy/scrapy/issues/2905
.. _user agents: https://en.wikipedia.org/wiki/User_agent
.. _LIFO: https://en.wikipedia.org/wiki/Stack_(abstract_data_type)
@@ -7,7 +7,7 @@ Installation guide

Installing Scrapy
=================

Scrapy runs on Python 3.5 or above under CPython (default Python
Scrapy runs on Python 3.5.2 or above under CPython (default Python
implementation) and PyPy (starting with PyPy 5.9).

If you're using `Anaconda`_ or `Miniconda`_, you can install the package from
@@ -287,8 +287,8 @@ to be scraped, you can at least get **some** data.

Besides the :meth:`~scrapy.selector.SelectorList.getall` and
:meth:`~scrapy.selector.SelectorList.get` methods, you can also use
the :meth:`~scrapy.selector.SelectorList.re` method to extract using `regular
expressions`_:
the :meth:`~scrapy.selector.SelectorList.re` method to extract using
:doc:`regular expressions <library/re>`:

>>> response.css('title::text').re(r'Quotes.*')
['Quotes to Scrape']

@@ -305,7 +305,6 @@ with a selector (see :ref:`topics-developer-tools`).

`Selector Gadget`_ is also a nice tool to quickly find CSS selector for
visually selected elements, which works in many browsers.

.. _regular expressions: https://docs.python.org/3/library/re.html
.. _Selector Gadget: https://selectorgadget.com/
docs/news.rst (342 changes)

@@ -3,6 +3,348 @@

Release notes
=============

.. _release-2.2.0:

Scrapy 2.2.0 (2020-06-24)
-------------------------

Highlights:

* Python 3.5.2+ is required now
* :ref:`dataclass objects <dataclass-items>` and
  :ref:`attrs objects <attrs-items>` are now valid :ref:`item types
  <item-types>`
* New :meth:`TextResponse.json <scrapy.http.TextResponse.json>` method
* New :signal:`bytes_received` signal that allows canceling response download
* :class:`~scrapy.downloadermiddlewares.cookies.CookiesMiddleware` fixes

Backward-incompatible changes
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

* Support for Python 3.5.0 and 3.5.1 has been dropped; Scrapy now refuses to
  run with a Python version lower than 3.5.2, which introduced
  :class:`typing.Type` (:issue:`4615`)


Deprecations
~~~~~~~~~~~~

* :meth:`TextResponse.body_as_unicode
  <scrapy.http.TextResponse.body_as_unicode>` is now deprecated, use
  :attr:`TextResponse.text <scrapy.http.TextResponse.text>` instead
  (:issue:`4546`, :issue:`4555`, :issue:`4579`)

* :class:`scrapy.item.BaseItem` is now deprecated, use
  :class:`scrapy.item.Item` instead (:issue:`4534`)


New features
~~~~~~~~~~~~

* :ref:`dataclass objects <dataclass-items>` and
  :ref:`attrs objects <attrs-items>` are now valid :ref:`item types
  <item-types>`, and a new itemadapter_ library makes it easy to
  write code that :ref:`supports any item type <supporting-item-types>`
  (:issue:`2749`, :issue:`2807`, :issue:`3761`, :issue:`3881`, :issue:`4642`)

* A new :meth:`TextResponse.json <scrapy.http.TextResponse.json>` method
  allows to deserialize JSON responses (:issue:`2444`, :issue:`4460`,
  :issue:`4574`)

* A new :signal:`bytes_received` signal allows monitoring response download
  progress and :ref:`stopping downloads <topics-stop-response-download>`
  (:issue:`4205`, :issue:`4559`)

* The dictionaries in the result list of a :ref:`media pipeline
  <topics-media-pipeline>` now include a new key, ``status``, which indicates
  if the file was downloaded or, if the file was not downloaded, why it was
  not downloaded; see :meth:`FilesPipeline.get_media_requests
  <scrapy.pipelines.files.FilesPipeline.get_media_requests>` for more
  information (:issue:`2893`, :issue:`4486`)

* When using :ref:`Google Cloud Storage <media-pipeline-gcs>` for
  a :ref:`media pipeline <topics-media-pipeline>`, a warning is now logged if
  the configured credentials do not grant the required permissions
  (:issue:`4346`, :issue:`4508`)

* :ref:`Link extractors <topics-link-extractors>` are now serializable,
  as long as you do not use :ref:`lambdas <lambda>` for parameters; for
  example, you can now pass link extractors in :attr:`Request.cb_kwargs
  <scrapy.http.Request.cb_kwargs>` or
  :attr:`Request.meta <scrapy.http.Request.meta>` when :ref:`persisting
  scheduled requests <topics-jobs>` (:issue:`4554`)

* Upgraded the :ref:`pickle protocol <pickle-protocols>` that Scrapy uses
  from protocol 2 to protocol 4, improving serialization capabilities and
  performance (:issue:`4135`, :issue:`4541`)

* :func:`scrapy.utils.misc.create_instance` now raises a :exc:`TypeError`
  exception if the resulting instance is ``None`` (:issue:`4528`,
  :issue:`4532`)

.. _itemadapter: https://github.com/scrapy/itemadapter


Bug fixes
~~~~~~~~~

* :class:`~scrapy.downloadermiddlewares.cookies.CookiesMiddleware` no longer
  discards cookies defined in :attr:`Request.headers
  <scrapy.http.Request.headers>` (:issue:`1992`, :issue:`2400`)

* :class:`~scrapy.downloadermiddlewares.cookies.CookiesMiddleware` no longer
  re-encodes cookies defined as :class:`bytes` in the ``cookies`` parameter
  of the ``__init__`` method of :class:`~scrapy.http.Request`
  (:issue:`2400`, :issue:`3575`)

* When :setting:`FEEDS` defines multiple URIs, :setting:`FEED_STORE_EMPTY` is
  ``False`` and the crawl yields no items, Scrapy no longer stops feed
  exports after the first URI (:issue:`4621`, :issue:`4626`)

* :class:`~scrapy.spiders.Spider` callbacks defined using :doc:`coroutine
  syntax <topics/coroutines>` no longer need to return an iterable, and may
  instead return a :class:`~scrapy.http.Request` object, an
  :ref:`item <topics-items>`, or ``None`` (:issue:`4609`)

* The :command:`startproject` command now ensures that the generated project
  folders and files have the right permissions (:issue:`4604`)

* Fix a :exc:`KeyError` exception being sometimes raised from
  :class:`scrapy.utils.datatypes.LocalWeakReferencedCache` (:issue:`4597`,
  :issue:`4599`)

* When :setting:`FEEDS` defines multiple URIs, log messages about items being
  stored now contain information from the corresponding feed, instead of
  always containing information about only one of the feeds (:issue:`4619`,
  :issue:`4629`)


Documentation
~~~~~~~~~~~~~

* Added a new section about :ref:`accessing cb_kwargs from errbacks
  <errback-cb_kwargs>` (:issue:`4598`, :issue:`4634`)

* Covered chompjs_ in :ref:`topics-parsing-javascript` (:issue:`4556`,
  :issue:`4562`)

* Removed from :doc:`topics/coroutines` the warning about the API being
  experimental (:issue:`4511`, :issue:`4513`)

* Removed references to unsupported versions of :doc:`Twisted
  <twisted:index>` (:issue:`4533`)

* Updated the description of the :ref:`screenshot pipeline example
  <ScreenshotPipeline>`, which now uses :doc:`coroutine syntax
  <topics/coroutines>` instead of returning a
  :class:`~twisted.internet.defer.Deferred` (:issue:`4514`, :issue:`4593`)

* Removed a misleading import line from the
  :func:`scrapy.utils.log.configure_logging` code example (:issue:`4510`,
  :issue:`4587`)

* The display-on-hover behavior of internal documentation references now also
  covers links to :ref:`commands <topics-commands>`, :attr:`Request.meta
  <scrapy.http.Request.meta>` keys, :ref:`settings <topics-settings>` and
  :ref:`signals <topics-signals>` (:issue:`4495`, :issue:`4563`)

* It is again possible to download the documentation for offline reading
  (:issue:`4578`, :issue:`4585`)

* Removed backslashes preceding ``*args`` and ``**kwargs`` in some function
  and method signatures (:issue:`4592`, :issue:`4596`)

.. _chompjs: https://github.com/Nykakin/chompjs


Quality assurance
~~~~~~~~~~~~~~~~~

* Adjusted the code base further to our :ref:`style guidelines
  <coding-style>` (:issue:`4237`, :issue:`4525`, :issue:`4538`,
  :issue:`4539`, :issue:`4540`, :issue:`4542`, :issue:`4543`, :issue:`4544`,
  :issue:`4545`, :issue:`4557`, :issue:`4558`, :issue:`4566`, :issue:`4568`,
  :issue:`4572`)

* Removed remnants of Python 2 support (:issue:`4550`, :issue:`4553`,
  :issue:`4568`)

* Improved code sharing between the :command:`crawl` and :command:`runspider`
  commands (:issue:`4548`, :issue:`4552`)

* Replaced ``chain(*iterable)`` with ``chain.from_iterable(iterable)``
  (:issue:`4635`)

* You may now run the :mod:`asyncio` tests with Tox on any Python version
  (:issue:`4521`)

* Updated test requirements to reflect an incompatibility with pytest 5.4 and
  5.4.1 (:issue:`4588`)

* Improved :class:`~scrapy.spiderloader.SpiderLoader` test coverage for
  scenarios involving duplicate spider names (:issue:`4549`, :issue:`4560`)

* Configured Travis CI to also run the tests with Python 3.5.2
  (:issue:`4518`, :issue:`4615`)

* Added a `Pylint <https://www.pylint.org/>`_ job to Travis CI
  (:issue:`3727`)

* Added a `Mypy <http://mypy-lang.org/>`_ job to Travis CI (:issue:`4637`)

* Made use of set literals in tests (:issue:`4573`)

* Cleaned up the Travis CI configuration (:issue:`4517`, :issue:`4519`,
  :issue:`4522`, :issue:`4537`)


.. _release-2.1.0:

Scrapy 2.1.0 (2020-04-24)
-------------------------

Highlights:

* New :setting:`FEEDS` setting to export to multiple feeds
* New :attr:`Response.ip_address <scrapy.http.Response.ip_address>` attribute

Backward-incompatible changes
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

* :exc:`AssertionError` exceptions triggered by :ref:`assert <assert>`
  statements have been replaced by new exception types, to support running
  Python in optimized mode (see :option:`-O`) without changing Scrapy’s
  behavior in any unexpected ways.

  If you catch an :exc:`AssertionError` exception from Scrapy, update your
  code to catch the corresponding new exception.

  (:issue:`4440`)


Deprecation removals
~~~~~~~~~~~~~~~~~~~~

* The ``LOG_UNSERIALIZABLE_REQUESTS`` setting is no longer supported, use
  :setting:`SCHEDULER_DEBUG` instead (:issue:`4385`)

* The ``REDIRECT_MAX_METAREFRESH_DELAY`` setting is no longer supported, use
  :setting:`METAREFRESH_MAXDELAY` instead (:issue:`4385`)

* The :class:`~scrapy.downloadermiddlewares.chunked.ChunkedTransferMiddleware`
  middleware has been removed, including the entire
  :class:`scrapy.downloadermiddlewares.chunked` module; chunked transfers
  work out of the box (:issue:`4431`)

* The ``spiders`` property has been removed from
  :class:`~scrapy.crawler.Crawler`, use :class:`CrawlerRunner.spider_loader
  <scrapy.crawler.CrawlerRunner.spider_loader>` or instantiate
  :setting:`SPIDER_LOADER_CLASS` with your settings instead (:issue:`4398`)

* The ``MultiValueDict``, ``MultiValueDictKeyError``, and ``SiteNode``
  classes have been removed from :mod:`scrapy.utils.datatypes`
  (:issue:`4400`)


Deprecations
~~~~~~~~~~~~

* The ``FEED_FORMAT`` and ``FEED_URI`` settings have been deprecated in
  favor of the new :setting:`FEEDS` setting (:issue:`1336`, :issue:`3858`,
  :issue:`4507`)


New features
~~~~~~~~~~~~

* A new setting, :setting:`FEEDS`, allows configuring multiple output feeds
  with different settings each (:issue:`1336`, :issue:`3858`, :issue:`4507`)

* The :command:`crawl` and :command:`runspider` commands now support multiple
  ``-o`` parameters (:issue:`1336`, :issue:`3858`, :issue:`4507`)

* The :command:`crawl` and :command:`runspider` commands now support
  specifying an output format by appending ``:<format>`` to the output file
  (:issue:`1336`, :issue:`3858`, :issue:`4507`)

* The new :attr:`Response.ip_address <scrapy.http.Response.ip_address>`
  attribute gives access to the IP address that originated a response
  (:issue:`3903`, :issue:`3940`)

* A warning is now issued when a value in
  :attr:`~scrapy.spiders.Spider.allowed_domains` includes a port
  (:issue:`50`, :issue:`3198`, :issue:`4413`)

* Zsh completion now excludes used option aliases from the completion list
  (:issue:`4438`)


Bug fixes
~~~~~~~~~

* :ref:`Request serialization <request-serialization>` no longer breaks for
  callbacks that are spider attributes which are assigned a function with a
  different name (:issue:`4500`)

* ``None`` values in :attr:`~scrapy.spiders.Spider.allowed_domains` no longer
  cause a :exc:`TypeError` exception (:issue:`4410`)

* Zsh completion no longer allows options after arguments (:issue:`4438`)

* zope.interface 5.0.0 and later versions are now supported
  (:issue:`4447`, :issue:`4448`)

* :meth:`Spider.make_requests_from_url
  <scrapy.spiders.Spider.make_requests_from_url>`, deprecated in Scrapy
  1.4.0, now issues a warning when used (:issue:`4412`)


Documentation
~~~~~~~~~~~~~

* Improved the documentation about signals that allow their handlers to
  return a :class:`~twisted.internet.defer.Deferred` (:issue:`4295`,
  :issue:`4390`)

* Our PyPI entry now includes links for our documentation, our source code
  repository and our issue tracker (:issue:`4456`)

* Covered the `curl2scrapy <https://michael-shub.github.io/curl2scrapy/>`_
  service in the documentation (:issue:`4206`, :issue:`4455`)

* Removed references to the Guppy library, which only works in Python 2
  (:issue:`4285`, :issue:`4343`)

* Extended use of InterSphinx to link to Python 3 documentation
  (:issue:`4444`, :issue:`4445`)

* Added support for Sphinx 3.0 and later (:issue:`4475`, :issue:`4480`,
  :issue:`4496`, :issue:`4503`)


Quality assurance
~~~~~~~~~~~~~~~~~

* Removed warnings about using old, removed settings (:issue:`4404`)

* Removed a warning about importing
  :class:`~twisted.internet.testing.StringTransport` from
  ``twisted.test.proto_helpers`` in Twisted 19.7.0 or newer (:issue:`4409`)

* Removed outdated Debian package build files (:issue:`4384`)

* Removed :class:`object` usage as a base class (:issue:`4430`)

* Removed code that added support for old versions of Twisted that we no
  longer support (:issue:`4472`)

* Fixed code style issues (:issue:`4468`, :issue:`4469`, :issue:`4471`,
  :issue:`4481`)

* Removed :func:`twisted.internet.defer.returnValue` calls (:issue:`4443`,
  :issue:`4446`, :issue:`4489`)


.. _release-2.0.1:

Scrapy 2.0.1 (2020-03-18)
@@ -1,4 +1,4 @@
Sphinx>=2.1
sphinx-hoverxref
sphinx-notfound-page
sphinx_rtd_theme
Sphinx>=3.0
sphinx-hoverxref>=0.2b1
sphinx-notfound-page>=0.4
sphinx_rtd_theme>=0.4
@@ -91,7 +91,7 @@ how you :ref:`configure the downloader middlewares
provided while constructing the crawler, and it is created after the
arguments given in the :meth:`crawl` method.

.. method:: crawl(\*args, \**kwargs)
.. method:: crawl(*args, **kwargs)

    Starts the crawler by instantiating its spider class with the given
    ``args`` and ``kwargs`` arguments, while setting the execution engine in

@@ -104,7 +104,7 @@ Spiders
-------

Spiders are custom classes written by Scrapy users to parse responses and
extract items (aka scraped items) from them or additional requests to
extract :ref:`items <topics-items>` from them or additional requests to
follow. For more information see :ref:`topics-spiders`.

.. _component-pipelines:
@@ -78,7 +78,7 @@ override three methods:

.. module:: scrapy.contracts

.. class:: Contract(method, \*args)
.. class:: Contract(method, *args)

    :param method: callback function to which the contract is associated
    :type method: function

@@ -136,7 +136,7 @@ Detecting check runs
====================

When ``scrapy check`` is running, the ``SCRAPY_CHECK`` environment variable is
set to the ``true`` string. You can use `os.environ`_ to perform any change to
set to the ``true`` string. You can use :data:`os.environ` to perform any change to
your spiders or your settings when ``scrapy check`` is used::

    import os

@@ -148,5 +148,3 @@ your spiders or your settings when ``scrapy check`` is used::

        def __init__(self):
            if os.environ.get('SCRAPY_CHECK'):
                pass # Do some scraper adjustments when a check is running

.. _os.environ: https://docs.python.org/3/library/os.html#os.environ
@@ -7,10 +7,6 @@ Coroutines

Scrapy has :ref:`partial support <coroutine-support>` for the
:ref:`coroutine syntax <async>`.

.. warning:: :mod:`asyncio` support in Scrapy is experimental. Future Scrapy
    versions may introduce related API and behavior changes without a
    deprecation period or warning.

.. _coroutine-support:

Supported callables

@@ -57,27 +53,34 @@ There are several use cases for coroutines in Scrapy. Code that would
return Deferreds when written for previous Scrapy versions, such as downloader
middlewares and signal handlers, can be rewritten to be shorter and cleaner::

    from itemadapter import ItemAdapter

    class DbPipeline:
        def _update_item(self, data, item):
            item['field'] = data
            adapter = ItemAdapter(item)
            adapter['field'] = data
            return item

        def process_item(self, item, spider):
            dfd = db.get_some_data(item['id'])
            adapter = ItemAdapter(item)
            dfd = db.get_some_data(adapter['id'])
            dfd.addCallback(self._update_item, item)
            return dfd

becomes::

    from itemadapter import ItemAdapter

    class DbPipeline:
        async def process_item(self, item, spider):
            item['field'] = await db.get_some_data(item['id'])
            adapter = ItemAdapter(item)
            adapter['field'] = await db.get_some_data(adapter['id'])
            return item

Coroutines may be used to call asynchronous code. This includes other
coroutines, functions that return Deferreds and functions that return
`awaitable objects`_ such as :class:`~asyncio.Future`. This means you can use
many useful Python libraries providing such code::
:term:`awaitable objects <awaitable>` such as :class:`~asyncio.Future`.
This means you can use many useful Python libraries providing such code::

    class MySpider(Spider):
        # ...

@@ -107,4 +110,3 @@ Common use cases for asynchronous code include:
:ref:`the screenshot pipeline example<ScreenshotPipeline>`).

.. _aio-libs: https://github.com/aio-libs
.. _awaitable objects: https://docs.python.org/3/glossary.html#term-awaitable
@@ -292,6 +292,9 @@ Alternatively, if you want to know the arguments needed to recreate that
request you can use the :func:`scrapy.utils.curl.curl_to_request_kwargs`
function to get a dictionary with the equivalent arguments.

Note that to translate a cURL command into a Scrapy request,
you may use `curl2scrapy <https://michael-shub.github.io/curl2scrapy/>`_.
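
For instance, a rough sketch of turning a copied cURL command into a request
(the URL and header below are made-up placeholders)::

    from scrapy import Request
    from scrapy.utils.curl import curl_to_request_kwargs

    curl_command = "curl 'https://example.com/api/items?page=2' -H 'Accept: application/json'"
    request = Request(**curl_to_request_kwargs(curl_command))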

As you can see, with a few inspections in the `Network`-tool we
were able to easily replicate the dynamic requests of the scrolling
functionality of the page. Crawling dynamic pages can be quite
@@ -202,6 +202,11 @@ CookiesMiddleware
sends them back on subsequent requests (from that spider), just like web
browsers do.

.. caution:: When non-UTF8 encoded byte sequences are passed to a
    :class:`~scrapy.http.Request`, the ``CookiesMiddleware`` will log
    a warning. Refer to :ref:`topics-logging-advanced-customization`
    to customize the logging behaviour.
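
As a small usage sketch (the domain and cookie values are placeholders only),
cookies set on a request are tracked by this middleware and sent back on
later requests from the same spider::

    import scrapy


    class CurrencySpider(scrapy.Spider):
        name = "currency"

        def start_requests(self):
            # Initial cookies for the session; the middleware keeps them in
            # its cookie jar for subsequent requests.
            yield scrapy.Request(
                "https://www.example.com/",
                cookies={"currency": "USD"},
            )

        def parse(self, response):
            self.logger.info("Visited %s", response.url)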

The following settings can be used to configure the cookie middleware:

* :setting:`COOKIES_ENABLED`

@@ -739,7 +744,7 @@ HttpProxyMiddleware

This middleware sets the HTTP proxy to use for requests, by setting the
``proxy`` meta value for :class:`~scrapy.http.Request` objects.

Like the Python standard library modules `urllib`_ and `urllib2`_, it obeys
Like the Python standard library module :mod:`urllib.request`, it obeys
the following environment variables:

* ``http_proxy``

@@ -751,9 +756,6 @@ HttpProxyMiddleware

Keep in mind this value will take precedence over ``http_proxy``/``https_proxy``
environment variables, and it will also ignore ``no_proxy`` environment variable.
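
For illustration, a minimal sketch of setting the ``proxy`` meta key on a
single request (the proxy URL is a placeholder)::

    import scrapy


    class ProxySpider(scrapy.Spider):
        name = "proxy_example"

        def start_requests(self):
            # The proxy meta key overrides the http_proxy/https_proxy
            # environment variables for this request.
            yield scrapy.Request(
                "https://www.example.com/",
                meta={"proxy": "http://proxy.example.com:8080"},
            )

        def parse(self, response):
            pass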

.. _urllib: https://docs.python.org/2/library/urllib.html
.. _urllib2: https://docs.python.org/2/library/urllib2.html

RedirectMiddleware
------------------

@@ -829,6 +831,7 @@ REDIRECT_MAX_TIMES

Default: ``20``

The maximum number of redirections that will be followed for a single request.
After this maximum, the request's response is returned as is.

MetaRefreshMiddleware
---------------------

@@ -1036,8 +1039,7 @@ Scrapy uses this parser by default.

RobotFileParser
~~~~~~~~~~~~~~~

Based on `RobotFileParser
<https://docs.python.org/3.7/library/urllib.robotparser.html>`_:
Based on :class:`~urllib.robotparser.RobotFileParser`:

* is Python's built-in robots.txt_ parser
@@ -104,6 +104,9 @@ If you get the expected response `sometimes`, but not always, the issue is
probably not your request, but the target server. The target server might be
buggy, overloaded, or :ref:`banning <bans>` some of your requests.

Note that to translate a cURL command into a Scrapy request,
you may use `curl2scrapy <https://michael-shub.github.io/curl2scrapy/>`_.

.. _topics-handling-response-formats:

Handling different response formats

@@ -115,7 +118,7 @@ data from it depends on the type of response:

- If the response is HTML or XML, use :ref:`selectors
  <topics-selectors>` as usual.

- If the response is JSON, use `json.loads`_ to load the desired data from
- If the response is JSON, use :func:`json.loads` to load the desired data from
  :attr:`response.text <scrapy.http.TextResponse.text>`::

      data = json.loads(response.text)

@@ -130,8 +133,9 @@ data from it depends on the type of response:

- If the response is JavaScript, or HTML with a ``<script/>`` element
  containing the desired data, see :ref:`topics-parsing-javascript`.

- If the response is CSS, use a `regular expression`_ to extract the desired
  data from :attr:`response.text <scrapy.http.TextResponse.text>`.
- If the response is CSS, use a :doc:`regular expression <library/re>` to
  extract the desired data from
  :attr:`response.text <scrapy.http.TextResponse.text>`.

.. _topics-parsing-images:

@@ -168,8 +172,9 @@ JavaScript code:

Once you have a string with the JavaScript code, you can extract the desired
data from it:

- You might be able to use a `regular expression`_ to extract the desired
  data in JSON format, which you can then parse with `json.loads`_.
- You might be able to use a :doc:`regular expression <library/re>` to
  extract the desired data in JSON format, which you can then parse with
  :func:`json.loads`.

  For example, if the JavaScript code contains a separate line like
  ``var data = {"field": "value"};`` you can extract that data as follows:

@@ -179,6 +184,18 @@ data from it:

  >>> json.loads(json_data)
  {'field': 'value'}

- chompjs_ provides an API to parse JavaScript objects into a :class:`dict`.

  For example, if the JavaScript code contains
  ``var data = {field: "value", secondField: "second value"};``
  you can extract that data as follows:

  >>> import chompjs
  >>> javascript = response.css('script::text').get()
  >>> data = chompjs.parse_js_object(javascript)
  >>> data
  {'field': 'value', 'secondField': 'second value'}

- Otherwise, use js2xml_ to convert the JavaScript code into an XML document
  that you can parse using :ref:`selectors <topics-selectors>`.

@@ -236,14 +253,13 @@ along with `scrapy-selenium`_ for seamless integration.


.. _AJAX: https://en.wikipedia.org/wiki/Ajax_%28programming%29
.. _chompjs: https://github.com/Nykakin/chompjs
.. _CSS: https://en.wikipedia.org/wiki/Cascading_Style_Sheets
.. _curl: https://curl.haxx.se/
.. _headless browser: https://en.wikipedia.org/wiki/Headless_browser
.. _JavaScript: https://en.wikipedia.org/wiki/JavaScript
.. _js2xml: https://github.com/scrapinghub/js2xml
.. _json.loads: https://docs.python.org/3/library/json.html#json.loads
.. _pytesseract: https://github.com/madmaze/pytesseract
.. _regular expression: https://docs.python.org/3/library/re.html
.. _scrapy-selenium: https://github.com/clemfromspace/scrapy-selenium
.. _scrapy-splash: https://github.com/scrapy-plugins/scrapy-splash
.. _Selenium: https://www.selenium.dev/
@@ -7,7 +7,7 @@ Sending e-mail

.. module:: scrapy.mail
    :synopsis: Email sending facility

Although Python makes sending e-mails relatively easy via the `smtplib`_
Although Python makes sending e-mails relatively easy via the :mod:`smtplib`
library, Scrapy provides its own facility for sending e-mails which is very
easy to use and it's implemented using :doc:`Twisted non-blocking IO
<twisted:core/howto/defer-intro>`, to avoid interfering with the non-blocking

@@ -15,8 +15,6 @@ IO of the crawler. It also provides a simple API for sending attachments and
it's very easy to configure, with a few :ref:`settings
<topics-email-settings>`.

.. _smtplib: https://docs.python.org/2/library/smtplib.html

Quick example
=============
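
A minimal sketch of the ``MailSender`` API described above (the addresses are
placeholders; by default messages are handed to a local SMTP server)::

    from scrapy.mail import MailSender

    mailer = MailSender()
    mailer.send(
        to=["someone@example.com"],
        subject="Some subject",
        body="Some body",
        cc=["another@example.com"],
    )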
@@ -14,13 +14,6 @@ Built-in Exceptions reference

Here's a list of all exceptions included in Scrapy and their usage.

DropItem
--------

.. exception:: DropItem

The exception that must be raised by item pipeline stages to stop processing an
Item. For more information see :ref:`topics-item-pipeline`.

CloseSpider
-----------

@@ -47,6 +40,14 @@ DontCloseSpider

This exception can be raised in a :signal:`spider_idle` signal handler to
prevent the spider from being closed.

DropItem
--------

.. exception:: DropItem

The exception that must be raised by item pipeline stages to stop processing an
Item. For more information see :ref:`topics-item-pipeline`.

IgnoreRequest
-------------

@@ -77,3 +78,37 @@ NotSupported

This exception is raised to indicate an unsupported feature.

StopDownload
-------------

.. versionadded:: 2.2

.. exception:: StopDownload(fail=True)

Raised from a :class:`~scrapy.signals.bytes_received` signal handler to
indicate that no further bytes should be downloaded for a response.

The ``fail`` boolean parameter controls which method will handle the resulting
response:

* If ``fail=True`` (default), the request errback is called. The response object is
  available as the ``response`` attribute of the ``StopDownload`` exception,
  which is in turn stored as the ``value`` attribute of the received
  :class:`~twisted.python.failure.Failure` object. This means that in an errback
  defined as ``def errback(self, failure)``, the response can be accessed through
  ``failure.value.response``.

* If ``fail=False``, the request callback is called instead.

In both cases, the response could have its body truncated: the body contains
all bytes received up until the exception is raised, including the bytes
received in the signal handler that raises the exception. Also, the response
object is marked with ``"download_stopped"`` in its :attr:`Response.flags`
attribute.

.. note:: ``fail`` is a keyword-only parameter, i.e. raising
    ``StopDownload(False)`` or ``StopDownload(True)`` will raise
    a :class:`TypeError`.

See the documentation for the :class:`~scrapy.signals.bytes_received` signal
and the :ref:`topics-stop-response-download` topic for additional information and examples.
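
For illustration only, a hypothetical spider errback that recovers the
truncated response carried by the exception::

    def errback(self, failure):
        # With the default fail=True, the partially downloaded response is
        # available through the StopDownload exception held by the failure.
        response = failure.value.response
        self.logger.info("Stopped downloading %s after %d bytes",
                         response.url, len(response.body))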
@@ -40,6 +40,7 @@ Here you can see an :doc:`Item Pipeline <item-pipeline>` which uses multiple
Item Exporters to group scraped items to different files according to the
value of one of their fields::

    from itemadapter import ItemAdapter
    from scrapy.exporters import XmlItemExporter

    class PerYearXmlExportPipeline:

@@ -53,7 +54,8 @@ value of one of their fields::
            exporter.finish_exporting()

        def _exporter_for_item(self, item):
            year = item['year']
            adapter = ItemAdapter(item)
            year = adapter['year']
            if year not in self.year_to_exporter:
                f = open('{}.xml'.format(year), 'wb')
                exporter = XmlItemExporter(f)

@@ -167,9 +169,10 @@ BaseItemExporter
    value unchanged except for ``unicode`` values which are encoded to
    ``str`` using the encoding declared in the :attr:`encoding` attribute.

    :param field: the field being serialized. If a raw dict is being
        exported (not :class:`~.Item`) *field* value is an empty dict.
    :type field: :class:`~scrapy.item.Field` object or an empty dict
    :param field: the field being serialized. If the source :ref:`item object
        <item-types>` does not define field metadata, *field* is an empty
        :class:`dict`.
    :type field: :class:`~scrapy.item.Field` object or a :class:`dict` instance

    :param name: the name of the field being serialized
    :type name: str

@@ -192,14 +195,17 @@ BaseItemExporter

.. attribute:: fields_to_export

    A list with the name of the fields that will be exported, or None if you
    want to export all fields. Defaults to None.
    A list with the name of the fields that will be exported, or ``None`` if
    you want to export all fields. Defaults to ``None``.

    Some exporters (like :class:`CsvItemExporter`) respect the order of the
    fields defined in this attribute.

    Some exporters may require fields_to_export list in order to export the
    data properly when spiders return dicts (not :class:`~Item` instances).
    When using :ref:`item objects <item-types>` that do not expose all their
    possible fields, exporters that do not support exporting a different
    subset of fields per item will only export the fields found in the first
    item exported. Use ``fields_to_export`` to define all the fields to be
    exported.
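
For example, a short sketch that pins both the exported fields and their
column order (the field names and file name are placeholders)::

    from scrapy.exporters import CsvItemExporter

    with open("products.csv", "wb") as f:
        exporter = CsvItemExporter(f, fields_to_export=["name", "price", "stock"])
        exporter.start_exporting()
        for item in items:  # any iterable of item objects
            exporter.export_item(item)
        exporter.finish_exporting()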

.. attribute:: export_empty_fields

@@ -236,9 +242,9 @@ PythonItemExporter

XmlItemExporter
---------------

.. class:: XmlItemExporter(file, item_element='item', root_element='items', \**kwargs)
.. class:: XmlItemExporter(file, item_element='item', root_element='items', **kwargs)

    Exports Items in XML format to the specified file object.
    Exports items in XML format to the specified file object.

    :param file: the file-like object to use for exporting the data. Its ``write`` method should
        accept ``bytes`` (a disk file opened in binary mode, a ``io.BytesIO`` object, etc)

@@ -290,9 +296,9 @@ XmlItemExporter

CsvItemExporter
---------------

.. class:: CsvItemExporter(file, include_headers_line=True, join_multivalued=',', \**kwargs)
.. class:: CsvItemExporter(file, include_headers_line=True, join_multivalued=',', **kwargs)

    Exports Items in CSV format to the given file-like object. If the
    Exports items in CSV format to the given file-like object. If the
    :attr:`fields_to_export` attribute is set, it will be used to define the
    CSV columns and their order. The :attr:`export_empty_fields` attribute has
    no effect on this exporter.

@@ -311,7 +317,7 @@ CsvItemExporter

    The additional keyword arguments of this ``__init__`` method are passed to the
    :class:`BaseItemExporter` ``__init__`` method, and the leftover arguments to the
    `csv.writer`_ ``__init__`` method, so you can use any ``csv.writer`` ``__init__`` method
    :func:`csv.writer` function, so you can use any :func:`csv.writer` function
    argument to customize this exporter.

    A typical output of this exporter would be::

@@ -320,14 +326,12 @@ CsvItemExporter
        Color TV,1200
        DVD player,200

.. _csv.writer: https://docs.python.org/2/library/csv.html#csv.writer

PickleItemExporter
------------------

.. class:: PickleItemExporter(file, protocol=0, \**kwargs)
.. class:: PickleItemExporter(file, protocol=0, **kwargs)

    Exports Items in pickle format to the given file-like object.
    Exports items in pickle format to the given file-like object.

    :param file: the file-like object to use for exporting the data. Its ``write`` method should
        accept ``bytes`` (a disk file opened in binary mode, a ``io.BytesIO`` object, etc)

@@ -335,21 +339,19 @@ PickleItemExporter
    :param protocol: The pickle protocol to use.
    :type protocol: int

    For more information, refer to the `pickle module documentation`_.
    For more information, see :mod:`pickle`.

    The additional keyword arguments of this ``__init__`` method are passed to the
    :class:`BaseItemExporter` ``__init__`` method.

    Pickle isn't a human readable format, so no output examples are provided.

.. _pickle module documentation: https://docs.python.org/2/library/pickle.html

PprintItemExporter
------------------

.. class:: PprintItemExporter(file, \**kwargs)
.. class:: PprintItemExporter(file, **kwargs)

    Exports Items in pretty print format to the specified file object.
    Exports items in pretty print format to the specified file object.

    :param file: the file-like object to use for exporting the data. Its ``write`` method should
        accept ``bytes`` (a disk file opened in binary mode, a ``io.BytesIO`` object, etc)

@@ -367,13 +369,13 @@ PprintItemExporter

JsonItemExporter
----------------

.. class:: JsonItemExporter(file, \**kwargs)
.. class:: JsonItemExporter(file, **kwargs)

    Exports Items in JSON format to the specified file-like object, writing all
    Exports items in JSON format to the specified file-like object, writing all
    objects as a list of objects. The additional ``__init__`` method arguments are
    passed to the :class:`BaseItemExporter` ``__init__`` method, and the leftover
    arguments to the `JSONEncoder`_ ``__init__`` method, so you can use any
    `JSONEncoder`_ ``__init__`` method argument to customize this exporter.
    arguments to the :class:`~json.JSONEncoder` ``__init__`` method, so you can use any
    :class:`~json.JSONEncoder` ``__init__`` method argument to customize this exporter.

    :param file: the file-like object to use for exporting the data. Its ``write`` method should
        accept ``bytes`` (a disk file opened in binary mode, a ``io.BytesIO`` object, etc)

@@ -393,18 +395,16 @@ JsonItemExporter
    stream-friendly format, consider using :class:`JsonLinesItemExporter`
    instead, or splitting the output in multiple chunks.

.. _JSONEncoder: https://docs.python.org/2/library/json.html#json.JSONEncoder

JsonLinesItemExporter
---------------------

.. class:: JsonLinesItemExporter(file, \**kwargs)
.. class:: JsonLinesItemExporter(file, **kwargs)

    Exports Items in JSON format to the specified file-like object, writing one
    Exports items in JSON format to the specified file-like object, writing one
    JSON-encoded item per line. The additional ``__init__`` method arguments are passed
    to the :class:`BaseItemExporter` ``__init__`` method, and the leftover arguments to
    the `JSONEncoder`_ ``__init__`` method, so you can use any `JSONEncoder`_
    ``__init__`` method argument to customize this exporter.
    the :class:`~json.JSONEncoder` ``__init__`` method, so you can use any
    :class:`~json.JSONEncoder` ``__init__`` method argument to customize this exporter.

    :param file: the file-like object to use for exporting the data. Its ``write`` method should
        accept ``bytes`` (a disk file opened in binary mode, a ``io.BytesIO`` object, etc)

@@ -417,8 +417,6 @@ JsonLinesItemExporter
    Unlike the one produced by :class:`JsonItemExporter`, the format produced by
    this exporter is well suited for serializing large amounts of data.

.. _JSONEncoder: https://docs.python.org/2/library/json.html#json.JSONEncoder

MarshalItemExporter
-------------------
@@ -364,7 +364,7 @@ Debugger extension

.. class:: Debugger

    Invokes a `Python debugger`_ inside a running Scrapy process when a `SIGUSR2`_
    Invokes a :doc:`Python debugger <library/pdb>` inside a running Scrapy process when a `SIGUSR2`_
    signal is received. After the debugger is exited, the Scrapy process continues
    running normally.

@@ -372,5 +372,4 @@ For more info see `Debugging in Python`_.

This extension only works on POSIX-compliant platforms (i.e. not Windows).

.. _Python debugger: https://docs.python.org/2/library/pdb.html
.. _Debugging in Python: https://pythonconquerstheuniverse.wordpress.com/2009/09/10/debugging-in-python/
@@ -298,8 +298,8 @@ Example: ``FEED_EXPORT_FIELDS = ["foo", "bar", "baz"]``.

Use FEED_EXPORT_FIELDS option to define fields to export and their order.

When FEED_EXPORT_FIELDS is empty or None (default), Scrapy uses fields
defined in dicts or :class:`~.Item` subclasses a spider is yielding.
When FEED_EXPORT_FIELDS is empty or None (default), Scrapy uses the fields
defined in :ref:`item objects <topics-items>` yielded by your spider.

If an exporter requires a fixed set of fields (this is the case for
:ref:`CSV <topics-feed-format-csv>` export format) and FEED_EXPORT_FIELDS
@@ -27,15 +27,19 @@ Each item pipeline component is a Python class that must implement the following

.. method:: process_item(self, item, spider)

    This method is called for every item pipeline component. :meth:`process_item`
    must either: return a dict with data, return an :class:`~scrapy.item.Item`
    (or any descendant class) object, return a
    :class:`~twisted.internet.defer.Deferred` or raise
    :exc:`~scrapy.exceptions.DropItem` exception. Dropped items are no longer
    processed by further pipeline components.
    This method is called for every item pipeline component.

    :param item: the item scraped
    :type item: :class:`~scrapy.item.Item` object or a dict
    `item` is an :ref:`item object <item-types>`, see
    :ref:`supporting-item-types`.

    :meth:`process_item` must either: return an :ref:`item object <item-types>`,
    return a :class:`~twisted.internet.defer.Deferred` or raise a
    :exc:`~scrapy.exceptions.DropItem` exception.

    Dropped items are no longer processed by further pipeline components.

    :param item: the scraped item
    :type item: :ref:`item object <item-types>`

    :param spider: the spider which scraped the item
    :type spider: :class:`~scrapy.spiders.Spider` object

@@ -79,16 +83,17 @@ Let's take a look at the following hypothetical pipeline that adjusts the
(``price_excludes_vat`` attribute), and drops those items which don't
contain a price::

    from itemadapter import ItemAdapter
    from scrapy.exceptions import DropItem

    class PricePipeline:

        vat_factor = 1.15

        def process_item(self, item, spider):
            if item.get('price'):
                if item.get('price_excludes_vat'):
                    item['price'] = item['price'] * self.vat_factor
            adapter = ItemAdapter(item)
            if adapter.get('price'):
                if adapter.get('price_excludes_vat'):
                    adapter['price'] = adapter['price'] * self.vat_factor
                return item
            else:
                raise DropItem("Missing price in %s" % item)

@@ -103,6 +108,8 @@ format::

    import json

    from itemadapter import ItemAdapter

    class JsonWriterPipeline:

        def open_spider(self, spider):

@@ -112,7 +119,7 @@ format::
            self.file.close()

        def process_item(self, item, spider):
            line = json.dumps(dict(item)) + "\n"
            line = json.dumps(ItemAdapter(item).asdict()) + "\n"
            self.file.write(line)
            return item

@@ -131,6 +138,7 @@ The main point of this example is to show how to use :meth:`from_crawler`
method and how to clean up the resources properly.::

    import pymongo
    from itemadapter import ItemAdapter

    class MongoPipeline:

@@ -155,7 +163,7 @@ method and how to clean up the resources properly.::
            self.client.close()

        def process_item(self, item, spider):
            self.db[self.collection_name].insert_one(dict(item))
            self.db[self.collection_name].insert_one(ItemAdapter(item).asdict())
            return item

.. _MongoDB: https://www.mongodb.com/

@@ -167,18 +175,21 @@ method and how to clean up the resources properly.::

Take screenshot of item
-----------------------

This example demonstrates how to return a
:class:`~twisted.internet.defer.Deferred` from the :meth:`process_item` method.
It uses Splash_ to render screenshot of item url. Pipeline
makes request to locally running instance of Splash_. After request is downloaded,
it saves the screenshot to a file and adds filename to the item.
This example demonstrates how to use :doc:`coroutine syntax <coroutines>` in
the :meth:`process_item` method.

This item pipeline makes a request to a locally-running instance of Splash_ to
render a screenshot of the item URL. After the request response is downloaded,
the item pipeline saves the screenshot to a file and adds the filename to the
item.

::

    import scrapy
    import hashlib
    from urllib.parse import quote

    import scrapy
    from itemadapter import ItemAdapter

    class ScreenshotPipeline:
        """Pipeline that uses Splash to render screenshot of

@@ -187,7 +198,8 @@ it saves the screenshot to a file and adds filename to the item.

        SPLASH_URL = "http://localhost:8050/render.png?url={}"

        async def process_item(self, item, spider):
            encoded_item_url = quote(item["url"])
            adapter = ItemAdapter(item)
            encoded_item_url = quote(adapter["url"])
            screenshot_url = self.SPLASH_URL.format(encoded_item_url)
            request = scrapy.Request(screenshot_url)
            response = await spider.crawler.engine.download(request, spider)

@@ -197,14 +209,14 @@ it saves the screenshot to a file and adds filename to the item.
                return item

            # Save screenshot to file, filename will be hash of url.
            url = item["url"]
            url = adapter["url"]
            url_hash = hashlib.md5(url.encode("utf8")).hexdigest()
            filename = "{}.png".format(url_hash)
            with open(filename, "wb") as f:
                f.write(response.body)

            # Store filename in item.
            item["screenshot_filename"] = filename
            adapter["screenshot_filename"] = filename
            return item

.. _Splash: https://splash.readthedocs.io/en/stable/
@@ -217,6 +229,7 @@ already processed. Let's say that our items have a unique id, but our spider
returns multiple items with the same id::

    from itemadapter import ItemAdapter
    from scrapy.exceptions import DropItem

    class DuplicatesPipeline:

@@ -225,10 +238,11 @@ returns multiple items with the same id::
            self.ids_seen = set()

        def process_item(self, item, spider):
            if item['id'] in self.ids_seen:
                raise DropItem("Duplicate item found: %s" % item)
            adapter = ItemAdapter(item)
            if adapter['id'] in self.ids_seen:
                raise DropItem("Duplicate item found: %r" % item)
            else:
                self.ids_seen.add(item['id'])
                self.ids_seen.add(adapter['id'])
            return item
@@ -8,31 +8,155 @@ Items
    :synopsis: Item and Field classes

The main goal in scraping is to extract structured data from unstructured
sources, typically, web pages. Scrapy spiders can return the extracted data
as Python dicts. While convenient and familiar, Python dicts lack structure:
it is easy to make a typo in a field name or return inconsistent data,
especially in a larger project with many spiders.
sources, typically, web pages. :ref:`Spiders <topics-spiders>` may return the
extracted data as `items`, Python objects that define key-value pairs.

To define common output data format Scrapy provides the :class:`Item` class.
:class:`Item` objects are simple containers used to collect the scraped data.
They provide a `dictionary-like`_ API with a convenient syntax for declaring
their available fields.
Scrapy supports :ref:`multiple types of items <item-types>`. When you create an
item, you may use whichever type of item you want. When you write code that
receives an item, your code should :ref:`work for any item type
<supporting-item-types>`.
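
For illustration, a small sketch of item-type-agnostic code written with the
``itemadapter`` library (the ``price`` field is only an example and is assumed
to be declared on the item)::

    from itemadapter import ItemAdapter, is_item

    def normalize_price(item):
        # Works the same for dicts, Item subclasses, dataclass and attrs objects.
        if is_item(item):
            adapter = ItemAdapter(item)
            if adapter.get("price") is not None:
                adapter["price"] = float(adapter["price"])
        return item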
|
||||
|
||||
Various Scrapy components use extra information provided by Items:
|
||||
exporters look at declared fields to figure out columns to export,
|
||||
serialization can be customized using Item fields metadata, :mod:`trackref`
|
||||
tracks Item instances to help find memory leaks
|
||||
(see :ref:`topics-leaks-trackrefs`), etc.
|
||||
.. _item-types:
|
||||
|
||||
.. _dictionary-like: https://docs.python.org/2/library/stdtypes.html#dict
|
||||
Item Types
|
||||
==========
|
||||
|
||||
Scrapy supports the following types of items, via the `itemadapter`_ library:
|
||||
:ref:`dictionaries <dict-items>`, :ref:`Item objects <item-objects>`,
|
||||
:ref:`dataclass objects <dataclass-items>`, and :ref:`attrs objects <attrs-items>`.
|
||||
|
||||
.. _itemadapter: https://github.com/scrapy/itemadapter
|
||||
|
||||
.. _dict-items:
|
||||
|
||||
Dictionaries
|
||||
------------
|
||||
|
||||
As an item type, :class:`dict` is convenient and familiar.
|
||||
|
||||
.. _item-objects:
|
||||
|
||||
Item objects
|
||||
------------
|
||||
|
||||
:class:`Item` provides a :class:`dict`-like API plus additional features that
|
||||
make it the most feature-complete item type:
|
||||
|
||||
.. class:: Item([arg])
|
||||
|
||||
:class:`Item` objects replicate the standard :class:`dict` API, including
|
||||
its ``__init__`` method.
|
||||
|
||||
:class:`Item` allows defining field names, so that:
|
||||
|
||||
- :class:`KeyError` is raised when using undefined field names (i.e.
|
||||
prevents typos going unnoticed)
|
||||
|
||||
- :ref:`Item exporters <topics-exporters>` can export all fields by
|
||||
default even if the first scraped object does not have values for all
|
||||
of them
|
||||
|
||||
:class:`Item` also allows defining field metadata, which can be used to
|
||||
:ref:`customize serialization <topics-exporters-field-serialization>`.
|
||||
|
||||
:mod:`trackref` tracks :class:`Item` objects to help find memory leaks
|
||||
(see :ref:`topics-leaks-trackrefs`).
|
||||
|
||||
:class:`Item` objects also provide the following additional API members:
|
||||
|
||||
.. automethod:: copy
|
||||
|
||||
.. automethod:: deepcopy
|
||||
|
||||
.. attribute:: fields
|
||||
|
||||
A dictionary containing *all declared fields* for this Item, not only
|
||||
those populated. The keys are the field names and the values are the
|
||||
:class:`Field` objects used in the :ref:`Item declaration
|
||||
<topics-items-declaring>`.
|
||||
|
||||
Example::
|
||||
|
||||
from scrapy.item import Item, Field
|
||||
|
||||
class CustomItem(Item):
|
||||
one_field = Field()
|
||||
another_field = Field()
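
A quick, illustrative sketch of the ``KeyError`` behavior described above,
assuming the ``CustomItem`` class just declared::

    >>> item = CustomItem(one_field='value')
    >>> item['one_field']
    'value'
    >>> item['undefined_field'] = 'value'
    Traceback (most recent call last):
        ...
    KeyError: 'CustomItem does not support field: undefined_field'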
|
||||
|
||||
.. _dataclass-items:
|
||||
|
||||
Dataclass objects
|
||||
-----------------
|
||||
|
||||
.. versionadded:: 2.2
|
||||
|
||||
:func:`~dataclasses.dataclass` allows defining item classes with field names,
|
||||
so that :ref:`item exporters <topics-exporters>` can export all fields by
|
||||
default even if the first scraped object does not have values for all of them.
|
||||
|
||||
Additionally, ``dataclass`` items also allow you to:
|
||||
|
||||
* define the type and default value of each defined field.
|
||||
|
||||
* define custom field metadata through :func:`dataclasses.field`, which can be used to
|
||||
:ref:`customize serialization <topics-exporters-field-serialization>`.
|
||||
|
||||
They work natively in Python 3.7 or later, or using the `dataclasses
|
||||
backport`_ in Python 3.6.
|
||||
|
||||
.. _dataclasses backport: https://pypi.org/project/dataclasses/
|
||||
|
||||
Example::
|
||||
|
||||
from dataclasses import dataclass
|
||||
|
||||
@dataclass
|
||||
class CustomItem:
|
||||
one_field: str
|
||||
another_field: int
|
||||
|
||||
.. note:: Field types are not enforced at run time.
|
||||
|
||||
.. _attrs-items:
|
||||
|
||||
attr.s objects
|
||||
--------------
|
||||
|
||||
.. versionadded:: 2.2
|
||||
|
||||
:func:`attr.s` allows defining item classes with field names,
|
||||
so that :ref:`item exporters <topics-exporters>` can export all fields by
|
||||
default even if the first scraped object does not have values for all of them.
|
||||
|
||||
Additionally, ``attr.s`` items also allow you to:
|
||||
|
||||
* define the type and default value of each defined field.
|
||||
|
||||
* define custom field :ref:`metadata <attrs:metadata>`, which can be used to
|
||||
:ref:`customize serialization <topics-exporters-field-serialization>`.
|
||||
|
||||
In order to use this type, the :doc:`attrs package <attrs:index>` needs to be installed.
|
||||
|
||||
Example::
|
||||
|
||||
import attr
|
||||
|
||||
@attr.s
|
||||
class CustomItem:
|
||||
one_field = attr.ib()
|
||||
another_field = attr.ib()
|
||||
|
||||
|
||||
Working with Item objects
|
||||
=========================
|
||||
|
||||
.. _topics-items-declaring:
|
||||
|
||||
Declaring Items
|
||||
===============
|
||||
Declaring Item subclasses
|
||||
-------------------------
|
||||
|
||||
Items are declared using a simple class definition syntax and :class:`Field`
|
||||
objects. Here is an example::
|
||||
Item subclasses are declared using a simple class definition syntax and
|
||||
:class:`Field` objects. Here is an example::
|
||||
|
||||
import scrapy
|
||||
|
||||
@ -50,10 +174,11 @@ objects. Here is an example::
|
||||
.. _Django: https://www.djangoproject.com/
|
||||
.. _Django Models: https://docs.djangoproject.com/en/dev/topics/db/models/
|
||||
|
||||
|
||||
.. _topics-items-fields:
|
||||
|
||||
Item Fields
|
||||
===========
|
||||
Declaring fields
|
||||
----------------
|
||||
|
||||
:class:`Field` objects are used to specify metadata for each field. For
|
||||
example, the serializer function for the ``last_updated`` field illustrated in
|
||||
@ -74,15 +199,31 @@ It's important to note that the :class:`Field` objects used to declare the item
|
||||
do not stay assigned as class attributes. Instead, they can be accessed through
|
||||
the :attr:`Item.fields` attribute.
|
||||
|
||||
Working with Items
|
||||
==================
|
||||
.. class:: Field([arg])
|
||||
|
||||
The :class:`Field` class is just an alias to the built-in :class:`dict` class and
|
||||
doesn't provide any extra functionality or attributes. In other words,
|
||||
:class:`Field` objects are plain-old Python dicts. A separate class is used
|
||||
to support the :ref:`item declaration syntax <topics-items-declaring>`
|
||||
based on class attributes.
|
||||
|
||||
.. note:: Field metadata can also be declared for ``dataclass`` and ``attrs``
|
||||
items. Please refer to the documentation for `dataclasses.field`_ and
|
||||
`attr.ib`_ for additional information.
|
||||
|
||||
.. _dataclasses.field: https://docs.python.org/3/library/dataclasses.html#dataclasses.field
|
||||
.. _attr.ib: https://www.attrs.org/en/stable/api.html#attr.ib
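
For instance, a minimal sketch of attaching serializer metadata to a
``dataclass`` field; the ``serializer`` key is assumed here to be picked up
the same way as :class:`Field` metadata by :ref:`item exporters
<topics-exporters>`::

    from dataclasses import dataclass, field
    from typing import Optional

    @dataclass
    class Product:
        name: Optional[str] = field(default=None)
        # metadata plays the role of the Field() keyword arguments
        price: Optional[float] = field(
            default=None, metadata={'serializer': lambda value: f'$ {value}'}
        )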
|
||||
|
||||
|
||||
Working with Item objects
|
||||
-------------------------
|
||||
|
||||
Here are some examples of common tasks performed with items, using the
|
||||
``Product`` item :ref:`declared above <topics-items-declaring>`. You will
|
||||
notice the API is very similar to the `dict API`_.
|
||||
notice the API is very similar to the :class:`dict` API.
|
||||
|
||||
Creating items
|
||||
--------------
|
||||
''''''''''''''
|
||||
|
||||
>>> product = Product(name='Desktop PC', price=1000)
|
||||
>>> print(product)
|
||||
@ -90,7 +231,7 @@ Product(name='Desktop PC', price=1000)
|
||||
|
||||
|
||||
Getting field values
|
||||
--------------------
|
||||
''''''''''''''''''''
|
||||
|
||||
>>> product['name']
|
||||
Desktop PC
|
||||
@ -130,7 +271,7 @@ False
|
||||
|
||||
|
||||
Setting field values
|
||||
--------------------
|
||||
''''''''''''''''''''
|
||||
|
||||
>>> product['last_updated'] = 'today'
|
||||
>>> product['last_updated']
|
||||
@ -143,9 +284,9 @@ KeyError: 'Product does not support field: lala'
|
||||
|
||||
|
||||
Accessing all populated values
|
||||
------------------------------
|
||||
''''''''''''''''''''''''''''''
|
||||
|
||||
To access all populated values, just use the typical `dict API`_:
|
||||
To access all populated values, just use the typical :class:`dict` API:
|
||||
|
||||
>>> product.keys()
|
||||
['price', 'name']
|
||||
@ -157,16 +298,14 @@ To access all populated values, just use the typical `dict API`_:
|
||||
.. _copying-items:
|
||||
|
||||
Copying items
|
||||
-------------
|
||||
'''''''''''''
|
||||
|
||||
To copy an item, you must first decide whether you want a shallow copy or a
|
||||
deep copy.
|
||||
|
||||
If your item contains mutable_ values like lists or dictionaries, a shallow
|
||||
copy will keep references to the same mutable values across all different
|
||||
copies.
|
||||
|
||||
.. _mutable: https://docs.python.org/3/glossary.html#term-mutable
|
||||
If your item contains :term:`mutable` values like lists or dictionaries,
|
||||
a shallow copy will keep references to the same mutable values across all
|
||||
different copies.
|
||||
|
||||
For example, if you have an item with a list of tags, and you create a shallow
|
||||
copy of that item, both the original item and the copy have the same list of
|
||||
@ -175,9 +314,7 @@ other item as well.
|
||||
|
||||
If that is not the desired behavior, use a deep copy instead.
|
||||
|
||||
See the `documentation of the copy module`_ for more information.
|
||||
|
||||
.. _documentation of the copy module: https://docs.python.org/3/library/copy.html
|
||||
See :mod:`copy` for more information.
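
A quick interactive sketch of the difference, assuming a hypothetical
``TaggedItem`` class that declares a ``tags`` field::

    >>> item = TaggedItem(tags=['electronics'])
    >>> shallow_copy = item.copy()
    >>> shallow_copy['tags'].append('sale')
    >>> item['tags']  # the original shares the same list
    ['electronics', 'sale']
    >>> deep_copy = item.deepcopy()
    >>> deep_copy['tags'].append('refurbished')
    >>> item['tags']  # unaffected by changes to the deep copy
    ['electronics', 'sale']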
|
||||
|
||||
To create a shallow copy of an item, you can either call
|
||||
:meth:`~scrapy.item.Item.copy` on an existing item
|
||||
@ -189,7 +326,7 @@ To create a deep copy, call :meth:`~scrapy.item.Item.deepcopy` instead
|
||||
|
||||
|
||||
Other common tasks
|
||||
------------------
|
||||
''''''''''''''''''
|
||||
|
||||
Creating dicts from items:
|
||||
|
||||
@ -207,8 +344,8 @@ Traceback (most recent call last):
|
||||
KeyError: 'Product does not support field: lala'
|
||||
|
||||
|
||||
Extending Items
|
||||
===============
|
||||
Extending Item subclasses
|
||||
-------------------------
|
||||
|
||||
You can extend Items (to add more fields or to change some metadata for some
|
||||
fields) by declaring a subclass of your original Item.
|
||||
@ -228,46 +365,25 @@ appending more values, or changing existing values, like this::
|
||||
That adds (or replaces) the ``serializer`` metadata key for the ``name`` field,
|
||||
keeping all the previously existing metadata values.
|
||||
|
||||
Item objects
|
||||
============
|
||||
|
||||
.. class:: Item([arg])
|
||||
.. _supporting-item-types:
|
||||
|
||||
Return a new Item optionally initialized from the given argument.
|
||||
Supporting All Item Types
|
||||
=========================
|
||||
|
||||
Items replicate the standard `dict API`_, including its ``__init__`` method, and
|
||||
also provide the following additional API members:
|
||||
In code that receives an item, such as methods of :ref:`item pipelines
|
||||
<topics-item-pipeline>` or :ref:`spider middlewares
|
||||
<topics-spider-middleware>`, it is a good practice to use the
|
||||
:class:`~itemadapter.ItemAdapter` class and the
|
||||
:func:`~itemadapter.is_item` function to write code that works for
|
||||
any :ref:`supported item type <item-types>`:
|
||||
|
||||
.. automethod:: copy
|
||||
.. autoclass:: itemadapter.ItemAdapter
|
||||
|
||||
.. automethod:: deepcopy
|
||||
|
||||
.. attribute:: fields
|
||||
|
||||
A dictionary containing *all declared fields* for this Item, not only
|
||||
those populated. The keys are the field names and the values are the
|
||||
:class:`Field` objects used in the :ref:`Item declaration
|
||||
<topics-items-declaring>`.
|
||||
|
||||
.. _dict API: https://docs.python.org/2/library/stdtypes.html#dict
|
||||
|
||||
Field objects
|
||||
=============
|
||||
|
||||
.. class:: Field([arg])
|
||||
|
||||
The :class:`Field` class is just an alias to the built-in `dict`_ class and
|
||||
doesn't provide any extra functionality or attributes. In other words,
|
||||
:class:`Field` objects are plain-old Python dicts. A separate class is used
|
||||
to support the :ref:`item declaration syntax <topics-items-declaring>`
|
||||
based on class attributes.
|
||||
|
||||
.. _dict: https://docs.python.org/2/library/stdtypes.html#dict
|
||||
.. autofunction:: itemadapter.is_item
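
For example, a minimal, hypothetical pipeline sketch (the ``price`` field
name is an assumption) that works for any supported item type::

    from itemadapter import ItemAdapter, is_item

    class PriceNormalizationPipeline:

        def process_item(self, item, spider):
            # is_item() returns True for any supported item type
            if is_item(item):
                adapter = ItemAdapter(item)
                # ItemAdapter exposes a common dict-like interface
                if adapter.get('price') is not None:
                    adapter['price'] = float(adapter['price'])
            return item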
|
||||
|
||||
|
||||
Other classes related to Item
|
||||
=============================
|
||||
|
||||
.. autoclass:: BaseItem
|
||||
Other classes related to items
|
||||
==============================
|
||||
|
||||
.. autoclass:: ItemMeta
|
||||
|
@ -4,7 +4,7 @@
|
||||
Debugging memory leaks
|
||||
======================
|
||||
|
||||
In Scrapy, objects such as Requests, Responses and Items have a finite
|
||||
In Scrapy, objects such as requests, responses and items have a finite
|
||||
lifetime: they are created, used for a while, and finally destroyed.
|
||||
|
||||
From all those objects, the Request is probably the one with the longest
|
||||
@ -61,8 +61,8 @@ Debugging memory leaks with ``trackref``
|
||||
========================================
|
||||
|
||||
:mod:`trackref` is a module provided by Scrapy to debug the most common cases of
|
||||
memory leaks. It basically tracks the references to all live Requests,
|
||||
Responses, Item and Selector objects.
|
||||
memory leaks. It basically tracks the references to all live Request,
|
||||
Response, Item, Spider and Selector objects.
|
||||
|
||||
You can enter the telnet console and inspect how many objects (of the classes
|
||||
mentioned above) are currently alive using the ``prefs()`` function which is an
|
||||
@ -200,11 +200,10 @@ Debugging memory leaks with muppy
|
||||
|
||||
``trackref`` provides a very convenient mechanism for tracking down memory
|
||||
leaks, but it only keeps track of the objects that are more likely to cause
|
||||
memory leaks (Requests, Responses, Items, and Selectors). However, there are
|
||||
other cases where the memory leaks could come from other (more or less obscure)
|
||||
objects. If this is your case, and you can't find your leaks using ``trackref``,
|
||||
you still have another resource: the muppy library.
|
||||
|
||||
memory leaks. However, there are other cases where the memory leaks could come
|
||||
from other (more or less obscure) objects. If this is your case, and you can't
|
||||
find your leaks using ``trackref``, you still have another resource: the muppy
|
||||
library.
|
||||
|
||||
You can use muppy from `Pympler`_.
|
||||
|
||||
|
@ -7,13 +7,12 @@ Item Loaders
|
||||
.. module:: scrapy.loader
|
||||
:synopsis: Item Loader class
|
||||
|
||||
Item Loaders provide a convenient mechanism for populating scraped :ref:`Items
|
||||
<topics-items>`. Even though Items can be populated using their own
|
||||
dictionary-like API, Item Loaders provide a much more convenient API for
|
||||
populating them from a scraping process, by automating some common tasks like
|
||||
parsing the raw extracted data before assigning it.
|
||||
Item Loaders provide a convenient mechanism for populating scraped :ref:`items
|
||||
<topics-items>`. Even though items can be populated directly, Item Loaders provide a
|
||||
much more convenient API for populating them from a scraping process, by automating
|
||||
some common tasks like parsing the raw extracted data before assigning it.
|
||||
|
||||
In other words, :ref:`Items <topics-items>` provide the *container* of
|
||||
In other words, :ref:`items <topics-items>` provide the *container* of
|
||||
scraped data, while Item Loaders provide the mechanism for *populating* that
|
||||
container.
|
||||
|
||||
@ -25,10 +24,10 @@ Using Item Loaders to populate items
|
||||
====================================
|
||||
|
||||
To use an Item Loader, you must first instantiate it. You can either
|
||||
instantiate it with a dict-like object (e.g. Item or dict) or without one, in
|
||||
which case an Item is automatically instantiated in the Item Loader ``__init__`` method
|
||||
using the Item class specified in the :attr:`ItemLoader.default_item_class`
|
||||
attribute.
|
||||
instantiate it with an :ref:`item object <topics-items>` or without one, in which
|
||||
case an :ref:`item object <topics-items>` is automatically created in the
|
||||
Item Loader ``__init__`` method using the :ref:`item <topics-items>` class
|
||||
specified in the :attr:`ItemLoader.default_item_class` attribute.
|
||||
|
||||
Then, you start collecting values into the Item Loader, typically using
|
||||
:ref:`Selectors <topics-selectors>`. You can add more than one value to
|
||||
@ -77,6 +76,31 @@ called which actually returns the item populated with the data
|
||||
previously extracted and collected with the :meth:`~ItemLoader.add_xpath`,
|
||||
:meth:`~ItemLoader.add_css`, and :meth:`~ItemLoader.add_value` calls.
|
||||
|
||||
|
||||
.. _topics-loaders-dataclass:
|
||||
|
||||
Working with dataclass items
|
||||
============================
|
||||
|
||||
By default, :ref:`dataclass items <dataclass-items>` require all fields to be
|
||||
passed when created. This could be an issue when using dataclass items with
|
||||
item loaders: unless a pre-populated item is passed to the loader, fields
|
||||
will be populated incrementally using the loader's :meth:`~ItemLoader.add_xpath`,
|
||||
:meth:`~ItemLoader.add_css` and :meth:`~ItemLoader.add_value` methods.
|
||||
|
||||
One approach to overcome this is to define items using the
|
||||
:func:`~dataclasses.field` function, with a ``default`` argument::
|
||||
|
||||
from dataclasses import dataclass, field
|
||||
from typing import Optional
|
||||
|
||||
@dataclass
|
||||
class InventoryItem:
|
||||
name: Optional[str] = field(default=None)
|
||||
price: Optional[float] = field(default=None)
|
||||
stock: Optional[int] = field(default=None)
|
||||
|
||||
|
||||
.. _topics-loaders-processors:
|
||||
|
||||
Input and Output processors
|
||||
@ -88,7 +112,7 @@ received (through the :meth:`~ItemLoader.add_xpath`, :meth:`~ItemLoader.add_css`
|
||||
:meth:`~ItemLoader.add_value` methods) and the result of the input processor is
|
||||
collected and kept inside the ItemLoader. After collecting all data, the
|
||||
:meth:`ItemLoader.load_item` method is called to populate and get the populated
|
||||
:class:`~scrapy.item.Item` object. That's when the output processor is
|
||||
:ref:`item object <topics-items>`. That's when the output processor is
|
||||
called with the data previously collected (and processed using the input
|
||||
processor). The result of the output processor is the final value that gets
|
||||
assigned to the item.
|
||||
@ -153,12 +177,10 @@ Last, but not least, Scrapy comes with some :ref:`commonly used processors
|
||||
<topics-loaders-available-processors>` built-in for convenience.
|
||||
|
||||
|
||||
|
||||
Declaring Item Loaders
|
||||
======================
|
||||
|
||||
Item Loaders are declared like Items, by using a class definition syntax. Here
|
||||
is an example::
|
||||
Item Loaders are declared using a class definition syntax. Here is an example::
|
||||
|
||||
from scrapy.loader import ItemLoader
|
||||
from scrapy.loader.processors import TakeFirst, MapCompose, Join
|
||||
@ -273,11 +295,11 @@ There are several ways to modify Item Loader context values:
|
||||
ItemLoader objects
|
||||
==================
|
||||
|
||||
.. class:: ItemLoader([item, selector, response], \**kwargs)
|
||||
.. class:: ItemLoader([item, selector, response], **kwargs)
|
||||
|
||||
Return a new Item Loader for populating the given Item. If no item is
|
||||
given, one is instantiated automatically using the class in
|
||||
:attr:`default_item_class`.
|
||||
Return a new Item Loader for populating the given :ref:`item object
|
||||
<topics-items>`. If no item object is given, one is instantiated
|
||||
automatically using the class in :attr:`default_item_class`.
|
||||
|
||||
When instantiated with a ``selector`` or a ``response`` parameters
|
||||
the :class:`ItemLoader` class provides convenient mechanisms for extracting
|
||||
@ -286,7 +308,7 @@ ItemLoader objects
|
||||
:param item: The item instance to populate using subsequent calls to
|
||||
:meth:`~ItemLoader.add_xpath`, :meth:`~ItemLoader.add_css`,
|
||||
or :meth:`~ItemLoader.add_value`.
|
||||
:type item: :class:`~scrapy.item.Item` object
|
||||
:type item: :ref:`item object <topics-items>`
|
||||
|
||||
:param selector: The selector to extract data from, when using the
|
||||
:meth:`add_xpath` (resp. :meth:`add_css`) or :meth:`replace_xpath`
|
||||
@ -303,7 +325,7 @@ ItemLoader objects
|
||||
|
||||
:class:`ItemLoader` instances have the following methods:
|
||||
|
||||
.. method:: get_value(value, \*processors, \**kwargs)
|
||||
.. method:: get_value(value, *processors, **kwargs)
|
||||
|
||||
Process the given ``value`` by the given ``processors`` and keyword
|
||||
arguments.
|
||||
@ -321,7 +343,7 @@ ItemLoader objects
|
||||
>>> loader.get_value('name: foo', TakeFirst(), str.upper, re='name: (.+)')
|
||||
'FOO'
|
||||
|
||||
.. method:: add_value(field_name, value, \*processors, \**kwargs)
|
||||
.. method:: add_value(field_name, value, *processors, **kwargs)
|
||||
|
||||
Process and then add the given ``value`` for the given field.
|
||||
|
||||
@ -343,11 +365,11 @@ ItemLoader objects
|
||||
loader.add_value('name', u'name: foo', TakeFirst(), re='name: (.+)')
|
||||
loader.add_value(None, {'name': u'foo', 'sex': u'male'})
|
||||
|
||||
.. method:: replace_value(field_name, value, \*processors, \**kwargs)
|
||||
.. method:: replace_value(field_name, value, *processors, **kwargs)
|
||||
|
||||
Similar to :meth:`add_value` but replaces the collected data with the
|
||||
new value instead of adding it.
|
||||
.. method:: get_xpath(xpath, \*processors, \**kwargs)
|
||||
.. method:: get_xpath(xpath, *processors, **kwargs)
|
||||
|
||||
Similar to :meth:`ItemLoader.get_value` but receives an XPath instead of a
|
||||
value, which is used to extract a list of unicode strings from the
|
||||
@ -367,7 +389,7 @@ ItemLoader objects
|
||||
# HTML snippet: <p id="price">the price is $1200</p>
|
||||
loader.get_xpath('//p[@id="price"]', TakeFirst(), re='the price is (.*)')
|
||||
|
||||
.. method:: add_xpath(field_name, xpath, \*processors, \**kwargs)
|
||||
.. method:: add_xpath(field_name, xpath, *processors, **kwargs)
|
||||
|
||||
Similar to :meth:`ItemLoader.add_value` but receives an XPath instead of a
|
||||
value, which is used to extract a list of unicode strings from the
|
||||
@ -385,12 +407,12 @@ ItemLoader objects
|
||||
# HTML snippet: <p id="price">the price is $1200</p>
|
||||
loader.add_xpath('price', '//p[@id="price"]', re='the price is (.*)')
|
||||
|
||||
.. method:: replace_xpath(field_name, xpath, \*processors, \**kwargs)
|
||||
.. method:: replace_xpath(field_name, xpath, *processors, **kwargs)
|
||||
|
||||
Similar to :meth:`add_xpath` but replaces collected data instead of
|
||||
adding it.
|
||||
|
||||
.. method:: get_css(css, \*processors, \**kwargs)
|
||||
.. method:: get_css(css, *processors, **kwargs)
|
||||
|
||||
Similar to :meth:`ItemLoader.get_value` but receives a CSS selector
|
||||
instead of a value, which is used to extract a list of unicode strings
|
||||
@ -410,7 +432,7 @@ ItemLoader objects
|
||||
# HTML snippet: <p id="price">the price is $1200</p>
|
||||
loader.get_css('p#price', TakeFirst(), re='the price is (.*)')
|
||||
|
||||
.. method:: add_css(field_name, css, \*processors, \**kwargs)
|
||||
.. method:: add_css(field_name, css, *processors, **kwargs)
|
||||
|
||||
Similar to :meth:`ItemLoader.add_value` but receives a CSS selector
|
||||
instead of a value, which is used to extract a list of unicode strings
|
||||
@ -428,7 +450,7 @@ ItemLoader objects
|
||||
# HTML snippet: <p id="price">the price is $1200</p>
|
||||
loader.add_css('price', 'p#price', re='the price is (.*)')
|
||||
|
||||
.. method:: replace_css(field_name, css, \*processors, \**kwargs)
|
||||
.. method:: replace_css(field_name, css, *processors, **kwargs)
|
||||
|
||||
Similar to :meth:`add_css` but replaces collected data instead of
|
||||
adding it.
|
||||
@ -444,17 +466,19 @@ ItemLoader objects
|
||||
|
||||
Create a nested loader with an xpath selector.
|
||||
The supplied selector is applied relative to selector associated
|
||||
with this :class:`ItemLoader`. The nested loader shares the :class:`Item`
|
||||
with the parent :class:`ItemLoader` so calls to :meth:`add_xpath`,
|
||||
:meth:`add_value`, :meth:`replace_value`, etc. will behave as expected.
|
||||
with this :class:`ItemLoader`. The nested loader shares the :ref:`item
|
||||
object <topics-items>` with the parent :class:`ItemLoader` so calls to
|
||||
:meth:`add_xpath`, :meth:`add_value`, :meth:`replace_value`, etc. will
|
||||
behave as expected.
|
||||
|
||||
.. method:: nested_css(css)
|
||||
|
||||
Create a nested loader with a css selector.
|
||||
The supplied selector is applied relative to selector associated
|
||||
with this :class:`ItemLoader`. The nested loader shares the :class:`Item`
|
||||
with the parent :class:`ItemLoader` so calls to :meth:`add_xpath`,
|
||||
:meth:`add_value`, :meth:`replace_value`, etc. will behave as expected.
|
||||
with this :class:`ItemLoader`. The nested loader shares the :ref:`item
|
||||
object <topics-items>` with the parent :class:`ItemLoader` so calls to
|
||||
:meth:`add_xpath`, :meth:`add_value`, :meth:`replace_value`, etc. will
|
||||
behave as expected.
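
A minimal sketch of how a nested loader might be used, assuming a
hypothetical ``PageItem`` class that declares ``title``, ``social`` and
``email`` fields::

    loader = ItemLoader(item=PageItem(), response=response)
    loader.add_xpath('title', '//title/text()')

    footer_loader = loader.nested_xpath('//footer')
    # selectors below are relative to the selected footer element
    footer_loader.add_xpath('social', 'a[@class="social"]/@href')
    footer_loader.add_xpath('email', 'a[@class="email"]/@href')

    # both loaders write into the same item
    item = loader.load_item()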
|
||||
|
||||
.. method:: get_collected_values(field_name)
|
||||
|
||||
@ -477,7 +501,7 @@ ItemLoader objects
|
||||
|
||||
.. attribute:: item
|
||||
|
||||
The :class:`~scrapy.item.Item` object being parsed by this Item Loader.
|
||||
The :ref:`item object <topics-items>` being parsed by this Item Loader.
|
||||
This is mostly used as a property so when attempting to override this
|
||||
value, you may want to check out :attr:`default_item_class` first.
|
||||
|
||||
@ -488,8 +512,8 @@ ItemLoader objects
|
||||
|
||||
.. attribute:: default_item_class
|
||||
|
||||
An Item class (or factory), used to instantiate items when not given in
|
||||
the ``__init__`` method.
|
||||
An :ref:`item object <topics-items>` class or factory, used to
|
||||
instantiate items when not given in the ``__init__`` method.
|
||||
|
||||
.. attribute:: default_input_processor
|
||||
|
||||
@ -678,7 +702,7 @@ Here is a list of all built-in processors:
|
||||
>>> proc(['one', 'two', 'three'])
|
||||
'one<br>two<br>three'
|
||||
|
||||
.. class:: Compose(\*functions, \**default_loader_context)
|
||||
.. class:: Compose(*functions, **default_loader_context)
|
||||
|
||||
A processor which is constructed from the composition of the given
|
||||
functions. This means that each input value of this processor is passed to
|
||||
@ -706,7 +730,7 @@ Here is a list of all built-in processors:
|
||||
active Loader context accessible through the :meth:`ItemLoader.context`
|
||||
attribute.
|
||||
|
||||
.. class:: MapCompose(\*functions, \**default_loader_context)
|
||||
.. class:: MapCompose(*functions, **default_loader_context)
|
||||
|
||||
A processor which is constructed from the composition of the given
|
||||
functions, similar to the :class:`Compose` processor. The difference with
|
||||
|
@ -9,8 +9,7 @@ Logging
|
||||
explicit calls to the Python standard logging. Keep reading to learn more
|
||||
about the new logging system.
|
||||
|
||||
Scrapy uses `Python's builtin logging system
|
||||
<https://docs.python.org/3/library/logging.html>`_ for event logging. We'll
|
||||
Scrapy uses :mod:`logging` for event logging. We'll
|
||||
provide some simple examples to get you started, but for more advanced
|
||||
use-cases it's strongly suggested to read thoroughly its documentation.
|
||||
|
||||
@ -83,10 +82,10 @@ path::
|
||||
|
||||
.. seealso::
|
||||
|
||||
Module logging, `HowTo <https://docs.python.org/2/howto/logging.html>`_
|
||||
Module logging, :doc:`HowTo <howto/logging>`
|
||||
Basic Logging Tutorial
|
||||
|
||||
Module logging, `Loggers <https://docs.python.org/2/library/logging.html#logger-objects>`_
|
||||
Module logging, :ref:`Loggers <logger>`
|
||||
Further documentation on loggers
|
||||
|
||||
.. _topics-logging-from-spiders:
|
||||
@ -165,10 +164,8 @@ possible levels listed in :ref:`topics-logging-levels`.
|
||||
|
||||
:setting:`LOG_FORMAT` and :setting:`LOG_DATEFORMAT` specify formatting strings
|
||||
used as layouts for all messages. Those strings can contain any placeholders
|
||||
listed in `logging's logrecord attributes docs
|
||||
<https://docs.python.org/2/library/logging.html#logrecord-attributes>`_ and
|
||||
`datetime's strftime and strptime directives
|
||||
<https://docs.python.org/2/library/datetime.html#strftime-and-strptime-behavior>`_
|
||||
listed in :ref:`logging's logrecord attributes docs <logrecord-attributes>` and
|
||||
:ref:`datetime's strftime and strptime directives <strftime-strptime-behavior>`
|
||||
respectively.
|
||||
|
||||
If :setting:`LOG_SHORT_NAMES` is set, then the logs will not display the Scrapy
|
||||
@ -190,7 +187,7 @@ to override some of the Scrapy settings regarding logging.
|
||||
|
||||
.. seealso::
|
||||
|
||||
Module `logging.handlers <https://docs.python.org/2/library/logging.handlers.html>`_
|
||||
Module :mod:`logging.handlers`
|
||||
Further documentation on available handlers
|
||||
|
||||
.. _custom-log-formats:
|
||||
@ -205,6 +202,9 @@ A custom log format can be set for different actions by extending
|
||||
.. autoclass:: scrapy.logformatter.LogFormatter
|
||||
:members:
|
||||
|
||||
|
||||
.. _topics-logging-advanced-customization:
|
||||
|
||||
Advanced customization
|
||||
----------------------
|
||||
|
||||
@ -256,16 +256,15 @@ scrapy.utils.log module
|
||||
In that case, its usage is not required but it's recommended.
|
||||
|
||||
Another option when running custom scripts is to manually configure the logging.
|
||||
To do this you can use `logging.basicConfig()`_ to set a basic root handler.
|
||||
To do this you can use :func:`logging.basicConfig` to set a basic root handler.
|
||||
|
||||
Note that :class:`~scrapy.crawler.CrawlerProcess` automatically calls ``configure_logging``,
|
||||
so it is recommended to only use `logging.basicConfig()`_ together with
|
||||
so it is recommended to only use :func:`logging.basicConfig` together with
|
||||
:class:`~scrapy.crawler.CrawlerRunner`.
|
||||
|
||||
This is an example on how to redirect ``INFO`` or higher messages to a file::
|
||||
|
||||
import logging
|
||||
from scrapy.utils.log import configure_logging
|
||||
|
||||
logging.basicConfig(
|
||||
filename='log.txt',
|
||||
@ -275,7 +274,3 @@ scrapy.utils.log module
|
||||
|
||||
Refer to :ref:`run-from-script` for more details about using Scrapy this
|
||||
way.
|
||||
|
||||
.. _logging.basicConfig(): https://docs.python.org/2/library/logging.html#logging.basicConfig
|
||||
|
||||
|
||||
|
@ -50,7 +50,7 @@ this:
|
||||
4. When the files are downloaded, another field (``files``) will be populated
|
||||
with the results. This field will contain a list of dicts with information
|
||||
about the downloaded files, such as the downloaded path, the original
|
||||
scraped url (taken from the ``file_urls`` field) , and the file checksum.
|
||||
scraped url (taken from the ``file_urls`` field), the file checksum and the file status.
|
||||
The files in the list of the ``files`` field will retain the same order of
|
||||
the original ``file_urls`` field. If some file failed downloading, an
|
||||
error will be logged and the file won't be present in the ``files`` field.
|
||||
@ -201,6 +201,9 @@ For self-hosting you also might feel the need not to use SSL and not to verify S
|
||||
.. _s3.scality: https://s3.scality.com/
|
||||
.. _canned ACLs: https://docs.aws.amazon.com/AmazonS3/latest/dev/acl-overview.html#canned-acl
|
||||
|
||||
|
||||
.. _media-pipeline-gcs:
|
||||
|
||||
Google Cloud Storage
|
||||
---------------------
|
||||
|
||||
@ -243,20 +246,22 @@ Usage example
|
||||
.. setting:: IMAGES_URLS_FIELD
|
||||
.. setting:: IMAGES_RESULT_FIELD
|
||||
|
||||
In order to use a media pipeline first, :ref:`enable it
|
||||
In order to use a media pipeline, first :ref:`enable it
|
||||
<topics-media-pipeline-enabling>`.
|
||||
|
||||
Then, if a spider returns a dict with the URLs key (``file_urls`` or
|
||||
``image_urls``, for the Files or Images Pipeline respectively), the pipeline will
|
||||
put the results under respective key (``files`` or ``images``).
|
||||
Then, if a spider returns an :ref:`item object <topics-items>` with the URLs
|
||||
field (``file_urls`` or ``image_urls``, for the Files or Images Pipeline
|
||||
respectively), the pipeline will put the results under the respective field
|
||||
(``files`` or ``images``).
|
||||
|
||||
If you prefer to use :class:`~.Item`, then define a custom item with the
|
||||
necessary fields, like in this example for Images Pipeline::
|
||||
When using :ref:`item types <item-types>` for which fields are defined beforehand,
|
||||
you must define both the URLs field and the results field. For example, when
|
||||
using the images pipeline, items must define both the ``image_urls`` and the
|
||||
``images`` field. For instance, using the :class:`~scrapy.item.Item` class::
|
||||
|
||||
import scrapy
|
||||
|
||||
class MyItem(scrapy.Item):
|
||||
|
||||
# ... other item fields ...
|
||||
image_urls = scrapy.Field()
|
||||
images = scrapy.Field()
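
A comparable sketch using a :ref:`dataclass item <dataclass-items>`; the
field names are the pipeline defaults, the rest is illustrative::

    from dataclasses import dataclass, field
    from typing import List

    @dataclass
    class MyItem:
        # ... other item fields ...
        image_urls: List[str] = field(default_factory=list)
        images: List[dict] = field(default_factory=list)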
|
||||
@ -445,8 +450,11 @@ See here the methods that you can override in your custom Files Pipeline:
|
||||
:meth:`~get_media_requests` method and return a Request for each
|
||||
file URL::
|
||||
|
||||
from itemadapter import ItemAdapter
|
||||
|
||||
def get_media_requests(self, item, info):
|
||||
for file_url in item['file_urls']:
|
||||
adapter = ItemAdapter(item)
|
||||
for file_url in adapter['file_urls']:
|
||||
yield scrapy.Request(file_url)
|
||||
|
||||
Those requests will be processed by the pipeline and, when they have finished
|
||||
@ -470,6 +478,18 @@ See here the methods that you can override in your custom Files Pipeline:
|
||||
|
||||
* ``checksum`` - a `MD5 hash`_ of the image contents
|
||||
|
||||
* ``status`` - the file status indication.
|
||||
|
||||
.. versionadded:: 2.2
|
||||
|
||||
It can be one of the following:
|
||||
|
||||
* ``downloaded`` - file was downloaded.
|
||||
* ``uptodate`` - file was not downloaded, as it was downloaded recently,
|
||||
according to the file expiration policy.
|
||||
* ``cached`` - file was already scheduled for download, by another item
|
||||
sharing the same file.
|
||||
|
||||
The list of tuples received by :meth:`~item_completed` is
|
||||
guaranteed to retain the same order of the requests returned from the
|
||||
:meth:`~get_media_requests` method.
|
||||
@ -479,7 +499,8 @@ See here the methods that you can override in your custom Files Pipeline:
|
||||
[(True,
|
||||
{'checksum': '2b00042f7481c7b056c4b410d28f33cf',
|
||||
'path': 'full/0a79c461a4062ac383dc4fade7bc09f1384a3910.jpg',
|
||||
'url': 'http://www.example.com/files/product1.pdf'}),
|
||||
'url': 'http://www.example.com/files/product1.pdf',
|
||||
'status': 'downloaded'}),
|
||||
(False,
|
||||
Failure(...))]
|
||||
|
||||
@ -500,13 +521,15 @@ See here the methods that you can override in your custom Files Pipeline:
|
||||
store the downloaded file paths (passed in results) in the ``file_paths``
|
||||
item field, and we drop the item if it doesn't contain any files::
|
||||
|
||||
from itemadapter import ItemAdapter
|
||||
from scrapy.exceptions import DropItem
|
||||
|
||||
def item_completed(self, results, item, info):
|
||||
file_paths = [x['path'] for ok, x in results if ok]
|
||||
if not file_paths:
|
||||
raise DropItem("Item contains no files")
|
||||
item['file_paths'] = file_paths
|
||||
adapter = ItemAdapter(item)
|
||||
adapter['file_paths'] = file_paths
|
||||
return item
|
||||
|
||||
By default, the :meth:`item_completed` method returns the item.
|
||||
@ -580,8 +603,9 @@ Here is a full example of the Images Pipeline whose methods are exemplified
|
||||
above::
|
||||
|
||||
import scrapy
|
||||
from scrapy.pipelines.images import ImagesPipeline
|
||||
from itemadapter import ItemAdapter
|
||||
from scrapy.exceptions import DropItem
|
||||
from scrapy.pipelines.images import ImagesPipeline
|
||||
|
||||
class MyImagesPipeline(ImagesPipeline):
|
||||
|
||||
@ -593,7 +617,8 @@ above::
|
||||
image_paths = [x['path'] for ok, x in results if ok]
|
||||
if not image_paths:
|
||||
raise DropItem("Item contains no images")
|
||||
item['image_paths'] = image_paths
|
||||
adapter = ItemAdapter(item)
|
||||
adapter['image_paths'] = image_paths
|
||||
return item
|
||||
|
||||
|
||||
|
@ -35,8 +35,9 @@ Here's an example showing how to run a single spider with it.
|
||||
...
|
||||
|
||||
process = CrawlerProcess(settings={
|
||||
'FEED_FORMAT': 'json',
|
||||
'FEED_URI': 'items.json'
|
||||
"FEEDS": {
|
||||
"items.json": {"format": "json"},
|
||||
},
|
||||
})
|
||||
|
||||
process.crawl(MySpider)
|
||||
|
@ -36,7 +36,7 @@ Request objects
|
||||
:type url: string
|
||||
|
||||
:param callback: the function that will be called with the response of this
|
||||
request (once its downloaded) as its first parameter. For more information
|
||||
request (once it's downloaded) as its first parameter. For more information
|
||||
see :ref:`topics-request-response-ref-request-callback-arguments` below.
|
||||
If a Request doesn't specify a callback, the spider's
|
||||
:meth:`~scrapy.spiders.Spider.parse` method will be used.
|
||||
@ -174,9 +174,9 @@ Request objects
|
||||
See :ref:`topics-request-meta` for a list of special meta keys
|
||||
recognized by Scrapy.
|
||||
|
||||
This dict is `shallow copied`_ when the request is cloned using the
|
||||
``copy()`` or ``replace()`` methods, and can also be accessed, in your
|
||||
spider, from the ``response.meta`` attribute.
|
||||
This dict is :doc:`shallow copied <library/copy>` when the request is
|
||||
cloned using the ``copy()`` or ``replace()`` methods, and can also be
|
||||
accessed, in your spider, from the ``response.meta`` attribute.
|
||||
|
||||
.. attribute:: Request.cb_kwargs
|
||||
|
||||
@ -185,11 +185,13 @@ Request objects
|
||||
for new Requests, which means by default callbacks only get a :class:`Response`
|
||||
object as argument.
|
||||
|
||||
This dict is `shallow copied`_ when the request is cloned using the
|
||||
``copy()`` or ``replace()`` methods, and can also be accessed, in your
|
||||
spider, from the ``response.cb_kwargs`` attribute.
|
||||
This dict is :doc:`shallow copied <library/copy>` when the request is
|
||||
cloned using the ``copy()`` or ``replace()`` methods, and can also be
|
||||
accessed, in your spider, from the ``response.cb_kwargs`` attribute.
|
||||
|
||||
.. _shallow copied: https://docs.python.org/2/library/copy.html
|
||||
In case of a failure to process the request, this dict can be accessed as
|
||||
``failure.request.cb_kwargs`` in the request's errback. For more information,
|
||||
see :ref:`errback-cb_kwargs`.
|
||||
|
||||
.. method:: Request.copy()
|
||||
|
||||
@ -314,6 +316,31 @@ errors if needed::
|
||||
request = failure.request
|
||||
self.logger.error('TimeoutError on %s', request.url)
|
||||
|
||||
.. _errback-cb_kwargs:
|
||||
|
||||
Accessing additional data in errback functions
|
||||
----------------------------------------------
|
||||
|
||||
In case of a failure to process the request, you may be interested in
|
||||
accessing arguments to the callback functions so you can process further
|
||||
based on the arguments in the errback. The following example shows how to
|
||||
achieve this by using ``Failure.request.cb_kwargs``::
|
||||
|
||||
def parse(self, response):
|
||||
request = scrapy.Request('http://www.example.com/index.html',
|
||||
callback=self.parse_page2,
|
||||
errback=self.errback_page2,
|
||||
cb_kwargs=dict(main_url=response.url))
|
||||
yield request
|
||||
|
||||
def parse_page2(self, response, main_url):
|
||||
pass
|
||||
|
||||
def errback_page2(self, failure):
|
||||
yield dict(
|
||||
main_url=failure.request.cb_kwargs['main_url'],
|
||||
)
|
||||
|
||||
.. _topics-request-meta:
|
||||
|
||||
Request.meta special keys
|
||||
@ -387,6 +414,51 @@ The meta key is used set retry times per request. When initialized, the
|
||||
:reqmeta:`max_retry_times` meta key takes higher precedence over the
|
||||
:setting:`RETRY_TIMES` setting.
|
||||
|
||||
|
||||
.. _topics-stop-response-download:
|
||||
|
||||
Stopping the download of a Response
|
||||
===================================
|
||||
|
||||
Raising a :exc:`~scrapy.exceptions.StopDownload` exception from a
|
||||
:class:`~scrapy.signals.bytes_received` signal handler will stop the
|
||||
download of a given response. See the following example::
|
||||
|
||||
import scrapy
|
||||
|
||||
|
||||
class StopSpider(scrapy.Spider):
|
||||
name = "stop"
|
||||
start_urls = ["https://docs.scrapy.org/en/latest/"]
|
||||
|
||||
@classmethod
|
||||
def from_crawler(cls, crawler):
|
||||
spider = super().from_crawler(crawler)
|
||||
crawler.signals.connect(spider.on_bytes_received, signal=scrapy.signals.bytes_received)
|
||||
return spider
|
||||
|
||||
def parse(self, response):
|
||||
# 'last_chars' show that the full response was not downloaded
|
||||
yield {"len": len(response.text), "last_chars": response.text[-40:]}
|
||||
|
||||
def on_bytes_received(self, data, request, spider):
|
||||
raise scrapy.exceptions.StopDownload(fail=False)
|
||||
|
||||
which produces the following output::
|
||||
|
||||
2020-05-19 17:26:12 [scrapy.core.engine] INFO: Spider opened
|
||||
2020-05-19 17:26:12 [scrapy.extensions.logstats] INFO: Crawled 0 pages (at 0 pages/min), scraped 0 items (at 0 items/min)
|
||||
2020-05-19 17:26:13 [scrapy.core.downloader.handlers.http11] DEBUG: Download stopped for <GET https://docs.scrapy.org/en/latest/> from signal handler StopSpider.on_bytes_received
|
||||
2020-05-19 17:26:13 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://docs.scrapy.org/en/latest/> (referer: None) ['download_stopped']
|
||||
2020-05-19 17:26:13 [scrapy.core.scraper] DEBUG: Scraped from <200 https://docs.scrapy.org/en/latest/>
|
||||
{'len': 279, 'last_chars': 'dth, initial-scale=1.0">\n \n <title>Scr'}
|
||||
2020-05-19 17:26:13 [scrapy.core.engine] INFO: Closing spider (finished)
|
||||
|
||||
By default, resulting responses are handled by their corresponding errbacks. To
|
||||
call their callback instead, like in this example, pass ``fail=False`` to the
|
||||
:exc:`~scrapy.exceptions.StopDownload` exception.
|
||||
|
||||
|
||||
.. _topics-request-response-ref-request-subclasses:
|
||||
|
||||
Request subclasses
|
||||
@ -566,12 +638,10 @@ dealing with JSON requests.
|
||||
set to ``'POST'`` automatically.
|
||||
:type data: JSON serializable object
|
||||
|
||||
:param dumps_kwargs: Parameters that will be passed to underlying `json.dumps`_ method which is used to serialize
|
||||
:param dumps_kwargs: Parameters that will be passed to underlying :func:`json.dumps` method which is used to serialize
|
||||
data into JSON format.
|
||||
:type dumps_kwargs: dict
|
||||
|
||||
.. _json.dumps: https://docs.python.org/3/library/json.html#json.dumps
|
||||
|
||||
JsonRequest usage example
|
||||
-------------------------
|
||||
|
||||
@ -620,6 +690,12 @@ Response objects
|
||||
:param certificate: an object representing the server's SSL certificate.
|
||||
:type certificate: twisted.internet.ssl.Certificate
|
||||
|
||||
:param ip_address: The IP address of the server from which the Response originated.
|
||||
:type ip_address: :class:`ipaddress.IPv4Address` or :class:`ipaddress.IPv6Address`
|
||||
|
||||
.. versionadded:: 2.1.0
|
||||
The ``ip_address`` parameter.
|
||||
|
||||
.. attribute:: Response.url
|
||||
|
||||
A string containing the URL of the response.
|
||||
@ -709,6 +785,16 @@ Response objects
|
||||
|
||||
Only populated for ``https`` responses, ``None`` otherwise.
|
||||
|
||||
.. attribute:: Response.ip_address
|
||||
|
||||
.. versionadded:: 2.1.0
|
||||
|
||||
The IP address of the server from which the Response originated.
|
||||
|
||||
This attribute is currently only populated by the HTTP 1.1 download
|
||||
handler, i.e. for ``http(s)`` responses. For other handlers,
|
||||
:attr:`ip_address` is always ``None``.
|
||||
|
||||
.. method:: Response.copy()
|
||||
|
||||
Returns a new Response which is a copy of this Response.
|
||||
@ -724,18 +810,16 @@ Response objects
|
||||
Constructs an absolute url by combining the Response's :attr:`url` with
|
||||
a possible relative url.
|
||||
|
||||
This is a wrapper over `urlparse.urljoin`_, it's merely an alias for
|
||||
This is a wrapper over :func:`~urllib.parse.urljoin`, it's merely an alias for
|
||||
making this call::
|
||||
|
||||
urlparse.urljoin(response.url, url)
|
||||
urllib.parse.urljoin(response.url, url)
|
||||
|
||||
.. automethod:: Response.follow
|
||||
|
||||
.. automethod:: Response.follow_all
|
||||
|
||||
|
||||
.. _urlparse.urljoin: https://docs.python.org/2/library/urlparse.html#urlparse.urljoin
|
||||
|
||||
.. _topics-request-response-ref-response-subclasses:
|
||||
|
||||
Response subclasses
|
||||
@ -824,10 +908,10 @@ TextResponse objects
|
||||
|
||||
.. automethod:: TextResponse.follow_all
|
||||
|
||||
.. method:: TextResponse.body_as_unicode()
|
||||
.. automethod:: TextResponse.json()
|
||||
|
||||
The same as :attr:`text`, but available as a method. This method is
|
||||
kept for backward compatibility; please prefer ``response.text``.
|
||||
Returns a Python object from the deserialized JSON document.
|
||||
The result is cached after the first call.
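
For example, assuming a response whose body is the JSON document
``{"name": "Color TV", "price": 1200}``, something like this could be
expected::

    >>> response.json()
    {'name': 'Color TV', 'price': 1200}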
|
||||
|
||||
|
||||
HtmlResponse objects
|
||||
|
@ -14,7 +14,7 @@ achieve this, such as:
|
||||
drawback: it's slow.
|
||||
|
||||
* `lxml`_ is an XML parsing library (which also parses HTML) with a pythonic
|
||||
API based on `ElementTree`_. (lxml is not part of the Python standard
|
||||
API based on :mod:`~xml.etree.ElementTree`. (lxml is not part of the Python standard
|
||||
library.)
|
||||
|
||||
Scrapy comes with its own mechanism for extracting data. They're called
|
||||
@ -36,7 +36,6 @@ defines selectors to associate those styles with specific HTML elements.
|
||||
|
||||
.. _BeautifulSoup: https://www.crummy.com/software/BeautifulSoup/
|
||||
.. _lxml: https://lxml.de/
|
||||
.. _ElementTree: https://docs.python.org/2/library/xml.etree.elementtree.html
|
||||
.. _XPath: https://www.w3.org/TR/xpath/all/
|
||||
.. _CSS: https://www.w3.org/TR/selectors
|
||||
.. _parsel: https://parsel.readthedocs.io/en/latest/
|
||||
|
@ -26,9 +26,7 @@ do this by using an environment variable, ``SCRAPY_SETTINGS_MODULE``.
|
||||
|
||||
The value of ``SCRAPY_SETTINGS_MODULE`` should be in Python path syntax, e.g.
|
||||
``myproject.settings``. Note that the settings module should be on the
|
||||
Python `import search path`_.
|
||||
|
||||
.. _import search path: https://docs.python.org/2/tutorial/modules.html#the-module-search-path
|
||||
Python :ref:`import search path <tut-searchpath>`.
|
||||
|
||||
.. _populating-settings:
|
||||
|
||||
@ -238,8 +236,8 @@ CONCURRENT_ITEMS
|
||||
|
||||
Default: ``100``
|
||||
|
||||
Maximum number of concurrent items (per response) to process in parallel in the
|
||||
Item Processor (also known as the :ref:`Item Pipeline <topics-item-pipeline>`).
|
||||
Maximum number of concurrent items (per response) to process in parallel in
|
||||
:ref:`item pipelines <topics-item-pipeline>`.
|
||||
|
||||
.. setting:: CONCURRENT_REQUESTS
|
||||
|
||||
@ -422,10 +420,9 @@ connections (for ``HTTP10DownloadHandler``).
|
||||
.. note::
|
||||
|
||||
HTTP/1.0 is rarely used nowadays so you can safely ignore this setting,
|
||||
unless you use Twisted<11.1, or if you really want to use HTTP/1.0
|
||||
and override :setting:`DOWNLOAD_HANDLERS_BASE` for ``http(s)`` scheme
|
||||
accordingly, i.e. to
|
||||
``'scrapy.core.downloader.handlers.http.HTTP10DownloadHandler'``.
|
||||
unless you really want to use HTTP/1.0 and override
|
||||
:setting:`DOWNLOAD_HANDLERS` for ``http(s)`` scheme accordingly,
|
||||
i.e. to ``'scrapy.core.downloader.handlers.http.HTTP10DownloadHandler'``.
|
||||
|
||||
.. setting:: DOWNLOADER_CLIENTCONTEXTFACTORY
|
||||
|
||||
@ -449,7 +446,6 @@ or even enable client-side authentication (and various other things).
|
||||
Scrapy also has another context factory class that you can set,
|
||||
``'scrapy.core.downloader.contextfactory.BrowserLikeContextFactory'``,
|
||||
which uses the platform's certificates to validate remote endpoints.
|
||||
**This is only available if you use Twisted>=14.0.**
|
||||
|
||||
If you do use a custom ContextFactory, make sure its ``__init__`` method
|
||||
accepts a ``method`` parameter (this is the ``OpenSSL.SSL`` method mapping
|
||||
@ -473,7 +469,7 @@ necessary to access certain HTTPS websites: for example, you may need to use
|
||||
``'DEFAULT:!DH'`` for a website with weak DH parameters or enable a
|
||||
specific cipher that is not included in ``DEFAULT`` if a website requires it.
|
||||
|
||||
.. _OpenSSL cipher list format: https://www.openssl.org/docs/manmaster/man1/ciphers.html#CIPHER-LIST-FORMAT
|
||||
.. _OpenSSL cipher list format: https://www.openssl.org/docs/manmaster/man1/openssl-ciphers.html#CIPHER-LIST-FORMAT
|
||||
|
||||
.. setting:: DOWNLOADER_CLIENT_TLS_METHOD
|
||||
|
||||
@ -496,10 +492,6 @@ This setting must be one of these string values:
|
||||
- ``'TLSv1.2'``: forces TLS version 1.2
|
||||
- ``'SSLv3'``: forces SSL version 3 (**not recommended**)
|
||||
|
||||
.. note::
|
||||
|
||||
We recommend that you use PyOpenSSL>=0.13 and Twisted>=0.13
|
||||
or above (Twisted>=14.0 if you can).
|
||||
|
||||
.. setting:: DOWNLOADER_CLIENT_TLS_VERBOSE_LOGGING
|
||||
|
||||
@ -662,8 +654,6 @@ If you want to disable it set to 0.
|
||||
spider attribute and per-request using :reqmeta:`download_maxsize`
|
||||
Request.meta key.
|
||||
|
||||
This feature needs Twisted >= 11.1.
|
||||
|
||||
.. setting:: DOWNLOAD_WARNSIZE
|
||||
|
||||
DOWNLOAD_WARNSIZE
|
||||
@ -681,8 +671,6 @@ If you want to disable it set to 0.
|
||||
spider attribute and per-request using :reqmeta:`download_warnsize`
|
||||
Request.meta key.
|
||||
|
||||
This feature needs Twisted >= 11.1.
|
||||
|
||||
.. setting:: DOWNLOAD_FAIL_ON_DATALOSS
|
||||
|
||||
DOWNLOAD_FAIL_ON_DATALOSS
|
||||
@ -899,10 +887,9 @@ LOG_FORMAT
|
||||
|
||||
Default: ``'%(asctime)s [%(name)s] %(levelname)s: %(message)s'``
|
||||
|
||||
String for formatting log messages. Refer to the `Python logging documentation`_ for the whole list of available
|
||||
placeholders.
|
||||
|
||||
.. _Python logging documentation: https://docs.python.org/2/library/logging.html#logrecord-attributes
|
||||
String for formatting log messages. Refer to the
|
||||
:ref:`Python logging documentation <logrecord-attributes>` for the whole
|
||||
list of available placeholders.
|
||||
|
||||
.. setting:: LOG_DATEFORMAT
|
||||
|
||||
@ -912,10 +899,9 @@ LOG_DATEFORMAT
|
||||
Default: ``'%Y-%m-%d %H:%M:%S'``
|
||||
|
||||
String for formatting date/time, expansion of the ``%(asctime)s`` placeholder
|
||||
in :setting:`LOG_FORMAT`. Refer to the `Python datetime documentation`_ for the whole list of available
|
||||
directives.
|
||||
|
||||
.. _Python datetime documentation: https://docs.python.org/2/library/datetime.html#strftime-and-strptime-behavior
|
||||
in :setting:`LOG_FORMAT`. Refer to the
|
||||
:ref:`Python datetime documentation <strftime-strptime-behavior>` for the
|
||||
whole list of available directives.
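
For instance, a hypothetical ``settings.py`` tweak that shortens both
formats::

    LOG_FORMAT = '%(asctime)s %(levelname)s: %(message)s'
    LOG_DATEFORMAT = '%H:%M:%S'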
|
||||
|
||||
.. setting:: LOG_FORMATTER
|
||||
|
||||
@ -1116,17 +1102,6 @@ multi-purpose thread pool used by various Scrapy components. Threaded
|
||||
DNS Resolver, BlockingFeedStorage, S3FilesStore just to name a few. Increase
|
||||
this value if you're experiencing problems with insufficient blocking IO.
|
||||
|
||||
.. setting:: REDIRECT_MAX_TIMES
|
||||
|
||||
REDIRECT_MAX_TIMES
|
||||
------------------
|
||||
|
||||
Default: ``20``
|
||||
|
||||
Defines the maximum times a request can be redirected. After this maximum the
|
||||
request's response is returned as is. We used Firefox default value for the
|
||||
same task.
|
||||
|
||||
.. setting:: REDIRECT_PRIORITY_ADJUST
|
||||
|
||||
REDIRECT_PRIORITY_ADJUST
|
||||
@ -1422,17 +1397,6 @@ Default: ``True``
|
||||
A boolean which specifies if the :ref:`telnet console <topics-telnetconsole>`
|
||||
will be enabled (provided its extension is also enabled).
|
||||
|
||||
.. setting:: TELNETCONSOLE_PORT
|
||||
|
||||
TELNETCONSOLE_PORT
|
||||
------------------
|
||||
|
||||
Default: ``[6023, 6073]``
|
||||
|
||||
The port range to use for the telnet console. If set to ``None`` or ``0``, a
|
||||
dynamically assigned port is used. For more info see
|
||||
:ref:`topics-telnetconsole`.
|
||||
|
||||
.. setting:: TEMPLATES_DIR
|
||||
|
||||
TEMPLATES_DIR
|
||||
|
@ -156,6 +156,17 @@ First, we launch the shell::
|
||||
|
||||
scrapy shell 'https://scrapy.org' --nolog
|
||||
|
||||
.. note::
|
||||
|
||||
Remember to always enclose URLs in quotes when running the Scrapy shell from
|
||||
the command line, otherwise URLs containing arguments (i.e. the ``&`` character)
|
||||
will not work.
|
||||
|
||||
On Windows, use double quotes instead::
|
||||
|
||||
scrapy shell "https://scrapy.org" --nolog
|
||||
|
||||
|
||||
Then, the shell fetches the URL (using the Scrapy downloader) and prints the
|
||||
list of available objects and useful shortcuts (you'll notice that these lines
|
||||
all start with the ``[s]`` prefix)::
|
||||
|
@ -16,8 +16,7 @@ deliver the arguments that the handler receives.
|
||||
You can connect to signals (or send your own) through the
|
||||
:ref:`topics-api-signals`.
|
||||
|
||||
Here is a simple example showing how you can catch signals and perform some action:
|
||||
::
|
||||
Here is a simple example showing how you can catch signals and perform some action::
|
||||
|
||||
from scrapy import signals
|
||||
from scrapy import Spider
|
||||
@ -52,9 +51,45 @@ Deferred signal handlers
|
||||
========================
|
||||
|
||||
Some signals support returning :class:`~twisted.internet.defer.Deferred`
|
||||
objects from their handlers, see the :ref:`topics-signals-ref` below to know
|
||||
which ones.
|
||||
objects from their handlers, allowing you to run asynchronous code that
|
||||
does not block Scrapy. If a signal handler returns a
|
||||
:class:`~twisted.internet.defer.Deferred`, Scrapy waits for that
|
||||
:class:`~twisted.internet.defer.Deferred` to fire.
|
||||
|
||||
Let's take an example::
|
||||
|
||||
class SignalSpider(scrapy.Spider):
|
||||
name = 'signals'
|
||||
start_urls = ['http://quotes.toscrape.com/page/1/']
|
||||
|
||||
@classmethod
|
||||
def from_crawler(cls, crawler, *args, **kwargs):
|
||||
spider = super(SignalSpider, cls).from_crawler(crawler, *args, **kwargs)
|
||||
crawler.signals.connect(spider.item_scraped, signal=signals.item_scraped)
|
||||
return spider
|
||||
|
||||
def item_scraped(self, item):
|
||||
# Send the scraped item to the server
|
||||
d = treq.post(
|
||||
'http://example.com/post',
|
||||
json.dumps(item).encode('ascii'),
|
||||
headers={b'Content-Type': [b'application/json']}
|
||||
)
|
||||
|
||||
# The next item will be scraped only after
|
||||
# deferred (d) is fired
|
||||
return d
|
||||
|
||||
def parse(self, response):
|
||||
for quote in response.css('div.quote'):
|
||||
yield {
|
||||
'text': quote.css('span.text::text').get(),
|
||||
'author': quote.css('small.author::text').get(),
|
||||
'tags': quote.css('div.tags a.tag::text').getall(),
|
||||
}
|
||||
|
||||
See the :ref:`topics-signals-ref` below to know which signals support
|
||||
:class:`~twisted.internet.defer.Deferred`.
|
||||
|
||||
.. _topics-signals-ref:
|
||||
|
||||
@ -66,22 +101,25 @@ Built-in signals reference
|
||||
|
||||
Here's the list of Scrapy built-in signals and their meaning.
|
||||
|
||||
engine_started
|
||||
Engine signals
|
||||
--------------
|
||||
|
||||
engine_started
|
||||
~~~~~~~~~~~~~~
|
||||
|
||||
.. signal:: engine_started
|
||||
.. function:: engine_started()
|
||||
|
||||
Sent when the Scrapy engine has started crawling.
|
||||
|
||||
This signal supports returning deferreds from their handlers.
|
||||
This signal supports returning deferreds from its handlers.
|
||||
|
||||
.. note:: This signal may be fired *after* the :signal:`spider_opened` signal,
|
||||
depending on how the spider was started. So **don't** rely on this signal
|
||||
getting fired before :signal:`spider_opened`.
|
||||
|
||||
engine_stopped
|
||||
--------------
|
||||
~~~~~~~~~~~~~~
|
||||
|
||||
.. signal:: engine_stopped
|
||||
.. function:: engine_stopped()
|
||||
@ -89,10 +127,21 @@ engine_stopped
|
||||
Sent when the Scrapy engine is stopped (for example, when a crawling
|
||||
process has finished).
|
||||
|
||||
This signal supports returning deferreds from their handlers.
|
||||
This signal supports returning deferreds from its handlers.
|
||||
|
||||
Item signals
|
||||
------------
|
||||
|
||||
.. note::
|
||||
Since at most :setting:`CONCURRENT_ITEMS` items are processed in
|
||||
parallel, many deferreds are fired together using
|
||||
:class:`~twisted.internet.defer.DeferredList`. Hence the next
|
||||
batch waits for the :class:`~twisted.internet.defer.DeferredList`
|
||||
to fire and then runs the respective item signal handler for
|
||||
the next batch of scraped items.
|
||||
|
||||
item_scraped
|
||||
------------
|
||||
~~~~~~~~~~~~
|
||||
|
||||
.. signal:: item_scraped
|
||||
.. function:: item_scraped(item, response, spider)
|
||||
@ -100,10 +149,10 @@ item_scraped
|
||||
Sent when an item has been scraped, after it has passed all the
|
||||
:ref:`topics-item-pipeline` stages (without being dropped).
|
||||
|
||||
This signal supports returning deferreds from their handlers.
|
||||
This signal supports returning deferreds from its handlers.
|
||||
|
||||
:param item: the item scraped
|
||||
:type item: dict or :class:`~scrapy.item.Item` object
|
||||
:param item: the scraped item
|
||||
:type item: :ref:`item object <item-types>`
|
||||
|
||||
:param spider: the spider which scraped the item
|
||||
:type spider: :class:`~scrapy.spiders.Spider` object
|
||||
@ -112,7 +161,7 @@ item_scraped
|
||||
:type response: :class:`~scrapy.http.Response` object
|
||||
|
||||
item_dropped
|
||||
------------
|
||||
~~~~~~~~~~~~
|
||||
|
||||
.. signal:: item_dropped
|
||||
.. function:: item_dropped(item, response, exception, spider)
|
||||
@ -120,10 +169,10 @@ item_dropped
|
||||
Sent after an item has been dropped from the :ref:`topics-item-pipeline`
|
||||
when some stage raised a :exc:`~scrapy.exceptions.DropItem` exception.
|
||||
|
||||
This signal supports returning deferreds from their handlers.
|
||||
This signal supports returning deferreds from its handlers.
|
||||
|
||||
:param item: the item dropped from the :ref:`topics-item-pipeline`
|
||||
:type item: dict or :class:`~scrapy.item.Item` object
|
||||
:type item: :ref:`item object <item-types>`
|
||||
|
||||
:param spider: the spider which scraped the item
|
||||
:type spider: :class:`~scrapy.spiders.Spider` object
|
||||
@ -137,7 +186,7 @@ item_dropped
|
||||
:type exception: :exc:`~scrapy.exceptions.DropItem` exception
|
||||
|
||||
item_error
|
||||
------------
|
||||
~~~~~~~~~~
|
||||
|
||||
.. signal:: item_error
|
||||
.. function:: item_error(item, response, spider, failure)
|
||||
@ -145,10 +194,10 @@ item_error
|
||||
Sent when a :ref:`topics-item-pipeline` generates an error (i.e. raises
|
||||
an exception), except :exc:`~scrapy.exceptions.DropItem` exception.
|
||||
|
||||
This signal supports returning deferreds from their handlers.
|
||||
This signal supports returning deferreds from its handlers.
|
||||
|
||||
:param item: the item dropped from the :ref:`topics-item-pipeline`
|
||||
:type item: dict or :class:`~scrapy.item.Item` object
|
||||
:param item: the item that caused the error in the :ref:`topics-item-pipeline`
|
||||
:type item: :ref:`item object <item-types>`
|
||||
|
||||
:param response: the response being processed when the exception was raised
|
||||
:type response: :class:`~scrapy.http.Response` object
|
||||
@ -159,8 +208,11 @@ item_error
|
||||
:param failure: the exception raised
|
||||
:type failure: twisted.python.failure.Failure
|
||||
|
||||
Spider signals
|
||||
--------------
|
||||
|
||||
spider_closed
|
||||
-------------
|
||||
~~~~~~~~~~~~~
|
||||
|
||||
.. signal:: spider_closed
|
||||
.. function:: spider_closed(spider, reason)
|
||||
@ -168,7 +220,7 @@ spider_closed
|
||||
Sent after a spider has been closed. This can be used to release per-spider
|
||||
resources reserved on :signal:`spider_opened`.
|
||||
|
||||
This signal supports returning deferreds from their handlers.
|
||||
This signal supports returning deferreds from its handlers.
|
||||
|
||||
:param spider: the spider which has been closed
|
||||
:type spider: :class:`~scrapy.spiders.Spider` object
|
||||
@ -183,7 +235,7 @@ spider_closed
|
||||
:type reason: str
|
||||
|
||||
spider_opened
|
||||
-------------
|
||||
~~~~~~~~~~~~~
|
||||
|
||||
.. signal:: spider_opened
|
||||
.. function:: spider_opened(spider)
|
||||
@ -192,13 +244,13 @@ spider_opened
|
||||
reserve per-spider resources, but can be used for any task that needs to be
|
||||
performed when a spider is opened.
|
||||
|
||||
This signal supports returning deferreds from their handlers.
|
||||
This signal supports returning deferreds from its handlers.
|
||||
|
||||
:param spider: the spider which has been opened
|
||||
:type spider: :class:`~scrapy.spiders.Spider` object
|
||||
|
||||
spider_idle
|
||||
-----------
|
||||
~~~~~~~~~~~
|
||||
|
||||
.. signal:: spider_idle
|
||||
.. function:: spider_idle(spider)
|
||||
@ -216,7 +268,7 @@ spider_idle
|
||||
You may raise a :exc:`~scrapy.exceptions.DontCloseSpider` exception to
|
||||
prevent the spider from being closed.
|
||||
|
||||
This signal does not support returning deferreds from their handlers.
|
||||
This signal does not support returning deferreds from its handlers.
|
||||
|
||||
:param spider: the spider which has gone idle
|
||||
:type spider: :class:`~scrapy.spiders.Spider` object
|
||||
@ -228,14 +280,14 @@ spider_idle
|
||||
due to duplication).
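As an illustration only (the spider name and URL are made up), a spider can connect to this signal and raise :exc:`~scrapy.exceptions.DontCloseSpider` to stay open::

    import scrapy
    from scrapy import signals
    from scrapy.exceptions import DontCloseSpider


    class KeepAliveSpider(scrapy.Spider):
        name = 'keepalive'
        start_urls = ['http://quotes.toscrape.com/']

        @classmethod
        def from_crawler(cls, crawler, *args, **kwargs):
            spider = super().from_crawler(crawler, *args, **kwargs)
            crawler.signals.connect(spider.spider_idle, signal=signals.spider_idle)
            return spider

        def spider_idle(self, spider):
            # In practice you would usually schedule more requests here first;
            # raising DontCloseSpider keeps the spider open, and this handler
            # runs again each time the spider goes idle.
            raise DontCloseSpider

        def parse(self, response):
            pass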
|
||||
|
||||
spider_error
|
||||
------------
|
||||
~~~~~~~~~~~~
|
||||
|
||||
.. signal:: spider_error
|
||||
.. function:: spider_error(failure, response, spider)
|
||||
|
||||
Sent when a spider callback generates an error (i.e. raises an exception).
|
||||
|
||||
This signal does not support returning deferreds from their handlers.
|
||||
This signal does not support returning deferreds from its handlers.
|
||||
|
||||
:param failure: the exception raised
|
||||
:type failure: twisted.python.failure.Failure
|
||||
@ -246,8 +298,11 @@ spider_error
|
||||
:param spider: the spider which raised the exception
|
||||
:type spider: :class:`~scrapy.spiders.Spider` object
|
||||
|
||||
Request signals
|
||||
---------------
|
||||
|
||||
request_scheduled
|
||||
-----------------
|
||||
~~~~~~~~~~~~~~~~~
|
||||
|
||||
.. signal:: request_scheduled
|
||||
.. function:: request_scheduled(request, spider)
|
||||
@ -255,7 +310,7 @@ request_scheduled
|
||||
Sent when the engine schedules a :class:`~scrapy.http.Request`, to be
|
||||
downloaded later.
|
||||
|
||||
The signal does not support returning deferreds from their handlers.
|
||||
This signal does not support returning deferreds from its handlers.
|
||||
|
||||
:param request: the request that reached the scheduler
|
||||
:type request: :class:`~scrapy.http.Request` object
|
||||
@ -264,7 +319,7 @@ request_scheduled
|
||||
:type spider: :class:`~scrapy.spiders.Spider` object
|
||||
|
||||
request_dropped
|
||||
---------------
|
||||
~~~~~~~~~~~~~~~
|
||||
|
||||
.. signal:: request_dropped
|
||||
.. function:: request_dropped(request, spider)
|
||||
@ -272,7 +327,7 @@ request_dropped
|
||||
Sent when a :class:`~scrapy.http.Request`, scheduled by the engine to be
|
||||
downloaded later, is rejected by the scheduler.
|
||||
|
||||
The signal does not support returning deferreds from their handlers.
|
||||
This signal does not support returning deferreds from its handlers.
|
||||
|
||||
:param request: the request that reached the scheduler
|
||||
:type request: :class:`~scrapy.http.Request` object
|
||||
@ -281,14 +336,14 @@ request_dropped
|
||||
:type spider: :class:`~scrapy.spiders.Spider` object
|
||||
|
||||
request_reached_downloader
|
||||
---------------------------
|
||||
~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
.. signal:: request_reached_downloader
|
||||
.. function:: request_reached_downloader(request, spider)
|
||||
|
||||
Sent when a :class:`~scrapy.http.Request` reaches the downloader.
|
||||
|
||||
The signal does not support returning deferreds from their handlers.
|
||||
This signal does not support returning deferreds from its handlers.
|
||||
|
||||
:param request: the request that reached the downloader
|
||||
:type request: :class:`~scrapy.http.Request` object
|
||||
@ -297,7 +352,7 @@ request_reached_downloader
|
||||
:type spider: :class:`~scrapy.spiders.Spider` object
|
||||
|
||||
request_left_downloader
|
||||
-----------------------
|
||||
~~~~~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
.. signal:: request_left_downloader
|
||||
.. function:: request_left_downloader(request, spider)
|
||||
@ -315,8 +370,41 @@ request_left_downloader
|
||||
:param spider: the spider that yielded the request
|
||||
:type spider: :class:`~scrapy.spiders.Spider` object
|
||||
|
||||
bytes_received
|
||||
~~~~~~~~~~~~~~
|
||||
|
||||
.. versionadded:: 2.2
|
||||
|
||||
.. signal:: bytes_received
|
||||
.. function:: bytes_received(data, request, spider)
|
||||
|
||||
Sent by the HTTP 1.1 and S3 download handlers when a group of bytes is
|
||||
received for a specific request. This signal might be fired multiple
|
||||
times for the same request, with partial data each time. For instance,
|
||||
a possible scenario for a 25 kb response would be two signals fired
|
||||
with 10 kb of data, and a final one with 5 kb of data.
|
||||
|
||||
This signal does not support returning deferreds from its handlers.
|
||||
|
||||
:param data: the data received by the download handler
|
||||
:type data: :class:`bytes` object
|
||||
|
||||
:param request: the request that generated the download
|
||||
:type request: :class:`~scrapy.http.Request` object
|
||||
|
||||
:param spider: the spider associated with the response
|
||||
:type spider: :class:`~scrapy.spiders.Spider` object
|
||||
|
||||
.. note:: Handlers of this signal can stop the download of a response while it
|
||||
is in progress by raising the :exc:`~scrapy.exceptions.StopDownload`
|
||||
exception. Please refer to the :ref:`topics-stop-response-download` topic
|
||||
for additional information and examples.
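As a hedged sketch (the spider name and URL are placeholders), a spider could stop each download after the first chunk of bytes and, by raising ``StopDownload(fail=False)``, still have the partially downloaded response delivered to the request callback as described in :ref:`topics-stop-response-download`::

    import scrapy
    from scrapy import signals
    from scrapy.exceptions import StopDownload


    class FirstBytesSpider(scrapy.Spider):
        name = 'first-bytes'
        start_urls = ['http://quotes.toscrape.com/']

        @classmethod
        def from_crawler(cls, crawler, *args, **kwargs):
            spider = super().from_crawler(crawler, *args, **kwargs)
            crawler.signals.connect(spider.on_bytes_received, signal=signals.bytes_received)
            return spider

        def on_bytes_received(self, data, request, spider):
            # Stop after the first chunk of bytes; fail=False means the
            # partial response is still passed to the request callback.
            raise StopDownload(fail=False)

        def parse(self, response):
            self.logger.info('Got %d bytes for %s', len(response.body), response.url)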
|
||||
|
||||
Response signals
|
||||
----------------
|
||||
|
||||
response_received
|
||||
-----------------
|
||||
~~~~~~~~~~~~~~~~~
|
||||
|
||||
.. signal:: response_received
|
||||
.. function:: response_received(response, request, spider)
|
||||
@ -324,7 +412,7 @@ response_received
|
||||
Sent when the engine receives a new :class:`~scrapy.http.Response` from the
|
||||
downloader.
|
||||
|
||||
This signal does not support returning deferreds from their handlers.
|
||||
This signal does not support returning deferreds from its handlers.
|
||||
|
||||
:param response: the response received
|
||||
:type response: :class:`~scrapy.http.Response` object
|
||||
@ -336,14 +424,14 @@ response_received
|
||||
:type spider: :class:`~scrapy.spiders.Spider` object
|
||||
|
||||
response_downloaded
|
||||
-------------------
|
||||
~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
.. signal:: response_downloaded
|
||||
.. function:: response_downloaded(response, request, spider)
|
||||
|
||||
Sent by the downloader right after a ``HTTPResponse`` is downloaded.
|
||||
|
||||
This signal does not support returning deferreds from their handlers.
|
||||
This signal does not support returning deferreds from its handlers.
|
||||
|
||||
:param response: the response downloaded
|
||||
:type response: :class:`~scrapy.http.Response` object
|
||||
|
@ -102,29 +102,28 @@ object gives you access, for example, to the :ref:`settings <topics-settings>`.
|
||||
it has processed the response.
|
||||
|
||||
:meth:`process_spider_output` must return an iterable of
|
||||
:class:`~scrapy.http.Request`, dict or :class:`~scrapy.item.Item`
|
||||
objects.
|
||||
:class:`~scrapy.http.Request` objects and :ref:`item object
|
||||
<topics-items>`.
|
||||
|
||||
:param response: the response which generated this output from the
|
||||
spider
|
||||
:type response: :class:`~scrapy.http.Response` object
|
||||
|
||||
:param result: the result returned by the spider
|
||||
:type result: an iterable of :class:`~scrapy.http.Request`, dict
|
||||
or :class:`~scrapy.item.Item` objects
|
||||
:type result: an iterable of :class:`~scrapy.http.Request` objects and
|
||||
:ref:`item object <topics-items>`
|
||||
|
||||
:param spider: the spider whose result is being processed
|
||||
:type spider: :class:`~scrapy.spiders.Spider` object
|
||||
|
||||
|
||||
.. method:: process_spider_exception(response, exception, spider)
|
||||
|
||||
This method is called when a spider or :meth:`process_spider_output`
|
||||
method (from a previous spider middleware) raises an exception.
|
||||
|
||||
:meth:`process_spider_exception` should return either ``None`` or an
|
||||
iterable of :class:`~scrapy.http.Request`, dict or
|
||||
:class:`~scrapy.item.Item` objects.
|
||||
iterable of :class:`~scrapy.http.Request` objects and :ref:`item object
|
||||
<topics-items>`.
|
||||
|
||||
If it returns ``None``, Scrapy will continue processing this exception,
|
||||
executing any other :meth:`process_spider_exception` in the following
|
||||
@ -140,7 +139,7 @@ object gives you access, for example, to the :ref:`settings <topics-settings>`.
|
||||
:type response: :class:`~scrapy.http.Response` object
|
||||
|
||||
:param exception: the exception raised
|
||||
:type exception: `Exception`_ object
|
||||
:type exception: :exc:`Exception` object
|
||||
|
||||
:param spider: the spider which raised the exception
|
||||
:type spider: :class:`~scrapy.spiders.Spider` object
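To tie these two hooks together, here is a minimal, illustrative middleware sketch (the class name and the ``title`` check are invented for this example; it is not a built-in middleware)::

    import logging

    logger = logging.getLogger(__name__)


    class DropIncompleteItemsMiddleware:

        def process_spider_output(self, response, result, spider):
            for x in result:
                # Drop dict items missing a 'title' field; requests and
                # other item types pass through untouched.
                if isinstance(x, dict) and not x.get('title'):
                    continue
                yield x

        def process_spider_exception(self, response, exception, spider):
            # Log the error and return an empty iterable, so the exception
            # is considered handled and processing continues.
            logger.warning('Callback failed for %s: %s', response.url, exception)
            return []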
|
||||
@ -183,10 +182,6 @@ object gives you access, for example, to the :ref:`settings <topics-settings>`.
|
||||
:param crawler: crawler that uses this middleware
|
||||
:type crawler: :class:`~scrapy.crawler.Crawler` object
|
||||
|
||||
|
||||
.. _Exception: https://docs.python.org/2/library/exceptions.html#exceptions.Exception
|
||||
|
||||
|
||||
.. _topics-spider-middleware-ref:
|
||||
|
||||
Built-in spider middleware reference
|
||||
|
@ -23,8 +23,8 @@ For spiders, the scraping cycle goes through something like this:
|
||||
:attr:`~scrapy.spiders.Spider.parse` method as callback function for the
|
||||
Requests.
|
||||
|
||||
2. In the callback function, you parse the response (web page) and return either
|
||||
dicts with extracted data, :class:`~scrapy.item.Item` objects,
|
||||
2. In the callback function, you parse the response (web page) and return
|
||||
:ref:`item objects <topics-items>`,
|
||||
:class:`~scrapy.http.Request` objects, or an iterable of these objects.
|
||||
Those Requests will also contain a callback (maybe
|
||||
the same) and will then be downloaded by Scrapy and then their
|
||||
@ -121,7 +121,7 @@ scrapy.Spider
|
||||
send log messages through it as described on
|
||||
:ref:`topics-logging-from-spiders`.
|
||||
|
||||
.. method:: from_crawler(crawler, \*args, \**kwargs)
|
||||
.. method:: from_crawler(crawler, *args, **kwargs)
|
||||
|
||||
This is the class method used by Scrapy to create your spiders.
|
||||
|
||||
@ -179,8 +179,8 @@ scrapy.Spider
|
||||
the same requirements as the :class:`Spider` class.
|
||||
|
||||
This method, as well as any other Request callback, must return an
|
||||
iterable of :class:`~scrapy.http.Request` and/or
|
||||
dicts or :class:`~scrapy.item.Item` objects.
|
||||
iterable of :class:`~scrapy.http.Request` and/or :ref:`item objects
|
||||
<topics-items>`.
|
||||
|
||||
:param response: the response to parse
|
||||
:type response: :class:`~scrapy.http.Response`
|
||||
@ -234,7 +234,7 @@ Return multiple Requests and items from a single callback::
|
||||
yield scrapy.Request(response.urljoin(href), self.parse)
|
||||
|
||||
Instead of :attr:`~.start_urls` you can use :meth:`~.start_requests` directly;
|
||||
to give data more structure you can use :ref:`topics-items`::
|
||||
to give data more structure you can use :class:`~scrapy.item.Item` objects::
|
||||
|
||||
import scrapy
|
||||
from myproject.items import MyItem
|
||||
@ -298,9 +298,7 @@ Keep in mind that spider arguments are only strings.
|
||||
The spider will not do any parsing on its own.
|
||||
If you were to set the ``start_urls`` attribute from the command line,
|
||||
you would have to parse it on your own into a list
|
||||
using something like
|
||||
`ast.literal_eval <https://docs.python.org/3/library/ast.html#ast.literal_eval>`_
|
||||
or `json.loads <https://docs.python.org/3/library/json.html#json.loads>`_
|
||||
using something like :func:`ast.literal_eval` or :func:`json.loads`
|
||||
and then set it as an attribute.
|
||||
Otherwise, you would cause iteration over a ``start_urls`` string
|
||||
(a very common python pitfall)
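For example, such parsing could be done in the spider's ``__init__`` (a sketch; the spider name and argument format are illustrative)::

    import ast

    import scrapy


    class QuotesSpider(scrapy.Spider):
        name = 'quotes'

        def __init__(self, start_urls=None, *args, **kwargs):
            super().__init__(*args, **kwargs)
            if start_urls:
                # e.g. scrapy crawl quotes -a start_urls="['http://quotes.toscrape.com/page/1/']"
                self.start_urls = ast.literal_eval(start_urls)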
|
||||
@ -366,7 +364,7 @@ CrawlSpider
|
||||
|
||||
This method is called for the start_urls responses. It allows parsing
|
||||
the initial responses and must return either an
|
||||
:class:`~scrapy.item.Item` object, a :class:`~scrapy.http.Request`
|
||||
:ref:`item object <topics-items>`, a :class:`~scrapy.http.Request`
|
||||
object, or an iterable containing any of them.
|
||||
|
||||
Crawling rules
|
||||
@ -385,7 +383,7 @@ Crawling rules
|
||||
object with that name will be used) to be called for each link extracted with
|
||||
the specified link extractor. This callback receives a :class:`~scrapy.http.Response`
|
||||
as its first argument and must return either a single instance or an iterable of
|
||||
:class:`~scrapy.item.Item`, ``dict`` and/or :class:`~scrapy.http.Request` objects
|
||||
:ref:`item objects <topics-items>` and/or :class:`~scrapy.http.Request` objects
|
||||
(or any subclass of them). As mentioned above, the received :class:`~scrapy.http.Response`
|
||||
object will contain the text of the link that produced the :class:`~scrapy.http.Request`
|
||||
in its ``meta`` dictionary (under the ``link_text`` key)
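A brief, illustrative sketch (the spider name and selectors are made up) showing a rule whose callback reads ``link_text`` from ``response.meta``::

    from scrapy.linkextractors import LinkExtractor
    from scrapy.spiders import CrawlSpider, Rule


    class QuotesCrawlSpider(CrawlSpider):
        name = 'quotes-crawl'
        start_urls = ['http://quotes.toscrape.com/']

        rules = (
            # Follow pagination links and hand each downloaded page to parse_page.
            Rule(LinkExtractor(restrict_css='li.next'), callback='parse_page', follow=True),
        )

        def parse_page(self, response):
            yield {
                'url': response.url,
                # Text of the link that produced this request.
                'link_text': response.meta.get('link_text'),
            }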
|
||||
@ -533,7 +531,7 @@ XMLFeedSpider
|
||||
(``itertag``). Receives the response and an
|
||||
:class:`~scrapy.selector.Selector` for each node. Overriding this
|
||||
method is mandatory. Otherwise, your spider won't work. This method
|
||||
must return either a :class:`~scrapy.item.Item` object, a
|
||||
must return an :ref:`item object <topics-items>`, a
|
||||
:class:`~scrapy.http.Request` object, or an iterable containing any of
|
||||
them.
|
||||
|
||||
@ -543,7 +541,7 @@ XMLFeedSpider
|
||||
spider, and it's intended to perform any last-minute processing required
|
||||
before returning the results to the framework core, for example setting the
|
||||
item IDs. It receives a list of results and the response which originated
|
||||
those results. It must return a list of results (Items or Requests).
|
||||
those results. It must return a list of results (items or requests).
|
||||
|
||||
|
||||
XMLFeedSpider example
|
||||
|
@ -89,13 +89,11 @@ convenience:
|
||||
+----------------+-------------------------------------------------------------------+
|
||||
| ``prefs`` | for memory debugging (see :ref:`topics-leaks`) |
|
||||
+----------------+-------------------------------------------------------------------+
|
||||
| ``p`` | a shortcut to the `pprint.pprint`_ function |
|
||||
| ``p`` | a shortcut to the :func:`pprint.pprint` function |
|
||||
+----------------+-------------------------------------------------------------------+
|
||||
| ``hpy`` | for memory debugging (see :ref:`topics-leaks`) |
|
||||
+----------------+-------------------------------------------------------------------+
|
||||
|
||||
.. _pprint.pprint: https://docs.python.org/library/pprint.html#pprint.pprint
|
||||
|
||||
Telnet console usage examples
|
||||
=============================
|
||||
|
||||
@ -208,4 +206,3 @@ Default: ``None``
|
||||
|
||||
The password used for the telnet console. The default behaviour is to have it
autogenerated.
|
||||
|
||||
|
@ -14,24 +14,27 @@ Author: dufferzafar
|
||||
|
||||
import re
|
||||
|
||||
# Used for remembering the file (and its contents)
|
||||
# so we don't have to open the same file again.
|
||||
_filename = None
|
||||
_contents = None
|
||||
|
||||
# A regex that matches standard linkcheck output lines
|
||||
line_re = re.compile(u'(.*)\:\d+\:\s\[(.*)\]\s(?:(.*)\sto\s(.*)|(.*))')
|
||||
def main():
|
||||
|
||||
# Read lines from the linkcheck output file
|
||||
try:
|
||||
# Used for remembering the file (and its contents)
|
||||
# so we don't have to open the same file again.
|
||||
_filename = None
|
||||
_contents = None
|
||||
|
||||
# A regex that matches standard linkcheck output lines
|
||||
line_re = re.compile(u'(.*)\:\d+\:\s\[(.*)\]\s(?:(.*)\sto\s(.*)|(.*))')
|
||||
|
||||
# Read lines from the linkcheck output file
|
||||
try:
|
||||
with open("build/linkcheck/output.txt") as out:
|
||||
output_lines = out.readlines()
|
||||
except IOError:
|
||||
except IOError:
|
||||
print("linkcheck output not found; please run linkcheck first.")
|
||||
exit(1)
|
||||
|
||||
# For every line, fix the respective file
|
||||
for line in output_lines:
|
||||
# For every line, fix the respective file
|
||||
for line in output_lines:
|
||||
match = re.match(line_re, line)
|
||||
|
||||
if match:
|
||||
@ -61,3 +64,7 @@ for line in output_lines:
|
||||
else:
|
||||
# We don't understand what the current line means!
|
||||
print("Not Understood: " + line)
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
main()
|
||||
|
113
pylintrc
Normal file
113
pylintrc
Normal file
@ -0,0 +1,113 @@
|
||||
[MASTER]
|
||||
persistent=no
|
||||
jobs=1 # >1 hides results
|
||||
|
||||
[MESSAGES CONTROL]
|
||||
disable=abstract-method,
|
||||
anomalous-backslash-in-string,
|
||||
arguments-differ,
|
||||
attribute-defined-outside-init,
|
||||
bad-classmethod-argument,
|
||||
bad-continuation,
|
||||
bad-indentation,
|
||||
bad-mcs-classmethod-argument,
|
||||
bad-super-call,
|
||||
bad-whitespace,
|
||||
bare-except,
|
||||
blacklisted-name,
|
||||
broad-except,
|
||||
c-extension-no-member,
|
||||
catching-non-exception,
|
||||
cell-var-from-loop,
|
||||
comparison-with-callable,
|
||||
consider-iterating-dictionary,
|
||||
consider-using-in,
|
||||
consider-using-set-comprehension,
|
||||
consider-using-sys-exit,
|
||||
cyclic-import,
|
||||
dangerous-default-value,
|
||||
deprecated-method,
|
||||
deprecated-module,
|
||||
duplicate-code, # https://github.com/PyCQA/pylint/issues/214
|
||||
eval-used,
|
||||
expression-not-assigned,
|
||||
fixme,
|
||||
function-redefined,
|
||||
global-statement,
|
||||
import-error,
|
||||
import-outside-toplevel,
|
||||
import-self,
|
||||
inconsistent-return-statements,
|
||||
inherit-non-class,
|
||||
invalid-name,
|
||||
invalid-overridden-method,
|
||||
isinstance-second-argument-not-valid-type,
|
||||
keyword-arg-before-vararg,
|
||||
line-too-long,
|
||||
logging-format-interpolation,
|
||||
logging-not-lazy,
|
||||
lost-exception,
|
||||
method-hidden,
|
||||
misplaced-comparison-constant,
|
||||
missing-docstring,
|
||||
missing-final-newline,
|
||||
multiple-imports,
|
||||
multiple-statements,
|
||||
no-else-continue,
|
||||
no-else-raise,
|
||||
no-else-return,
|
||||
no-init,
|
||||
no-member,
|
||||
no-method-argument,
|
||||
no-name-in-module,
|
||||
no-self-argument,
|
||||
no-self-use,
|
||||
no-value-for-parameter,
|
||||
not-an-iterable,
|
||||
not-callable,
|
||||
pointless-statement,
|
||||
pointless-string-statement,
|
||||
protected-access,
|
||||
redefined-argument-from-local,
|
||||
redefined-builtin,
|
||||
redefined-outer-name,
|
||||
reimported,
|
||||
signature-differs,
|
||||
singleton-comparison,
|
||||
super-init-not-called,
|
||||
superfluous-parens,
|
||||
too-few-public-methods,
|
||||
too-many-ancestors,
|
||||
too-many-arguments,
|
||||
too-many-branches,
|
||||
too-many-format-args,
|
||||
too-many-function-args,
|
||||
too-many-instance-attributes,
|
||||
too-many-lines,
|
||||
too-many-locals,
|
||||
too-many-public-methods,
|
||||
too-many-return-statements,
|
||||
trailing-newlines,
|
||||
trailing-whitespace,
|
||||
unbalanced-tuple-unpacking,
|
||||
undefined-variable,
|
||||
undefined-loop-variable,
|
||||
unexpected-special-method-signature,
|
||||
ungrouped-imports,
|
||||
unidiomatic-typecheck,
|
||||
unnecessary-comprehension,
|
||||
unnecessary-lambda,
|
||||
unnecessary-pass,
|
||||
unreachable,
|
||||
unsubscriptable-object,
|
||||
unused-argument,
|
||||
unused-import,
|
||||
unused-variable,
|
||||
unused-wildcard-import,
|
||||
used-before-assignment,
|
||||
useless-object-inheritance, # Required for Python 2 support
|
||||
useless-return,
|
||||
useless-super-delegation,
|
||||
wildcard-import,
|
||||
wrong-import-order,
|
||||
wrong-import-position
|
233
pytest.ini
233
pytest.ini
@ -20,232 +20,23 @@ addopts =
|
||||
twisted = 1
|
||||
markers =
|
||||
only_asyncio: marks tests as only enabled when --reactor=asyncio is passed
|
||||
flake8-max-line-length = 119
|
||||
flake8-ignore =
|
||||
W503
|
||||
# Files that are only meant to provide top-level imports are expected not
|
||||
# to use any of their imports:
|
||||
|
||||
# Exclude files that are meant to provide top-level imports
|
||||
# E402: Module level import not at top of file
|
||||
# F401: Module imported but unused
|
||||
scrapy/__init__.py E402
|
||||
scrapy/core/downloader/handlers/http.py F401
|
||||
scrapy/http/__init__.py F401
|
||||
scrapy/linkextractors/__init__.py E402 F401
|
||||
scrapy/selector/__init__.py F401
|
||||
scrapy/spiders/__init__.py E402 F401
|
||||
|
||||
# Issues pending a review:
|
||||
# extras
|
||||
extras/qps-bench-server.py E501
|
||||
extras/qpsclient.py E501 E501
|
||||
# scrapy/commands
|
||||
scrapy/commands/__init__.py E128 E501
|
||||
scrapy/commands/check.py E501
|
||||
scrapy/commands/crawl.py E501
|
||||
scrapy/commands/edit.py E501
|
||||
scrapy/commands/fetch.py E401 E501 E128 E731
|
||||
scrapy/commands/genspider.py E128 E501 E502
|
||||
scrapy/commands/parse.py E128 E501 E731
|
||||
scrapy/commands/runspider.py E501
|
||||
scrapy/commands/settings.py E128
|
||||
scrapy/commands/shell.py E128 E501 E502
|
||||
scrapy/commands/startproject.py E127 E501 E128
|
||||
scrapy/commands/version.py E501 E128
|
||||
# scrapy/contracts
|
||||
scrapy/contracts/__init__.py E501 W504
|
||||
scrapy/contracts/default.py E128
|
||||
# scrapy/core
|
||||
scrapy/core/engine.py E501 E128 E127 E502
|
||||
scrapy/core/scheduler.py E501
|
||||
scrapy/core/scraper.py E501 E128 W504
|
||||
scrapy/core/spidermw.py E501 E731 E126
|
||||
scrapy/core/downloader/__init__.py E501
|
||||
scrapy/core/downloader/contextfactory.py E501 E128 E126
|
||||
scrapy/core/downloader/middleware.py E501 E502
|
||||
scrapy/core/downloader/tls.py E501 E241
|
||||
scrapy/core/downloader/webclient.py E731 E501 E128 E126
|
||||
scrapy/core/downloader/handlers/__init__.py E501
|
||||
scrapy/core/downloader/handlers/ftp.py E501 E128 E127
|
||||
scrapy/core/downloader/handlers/http10.py E501
|
||||
scrapy/core/downloader/handlers/http11.py E501
|
||||
scrapy/core/downloader/handlers/s3.py E501 E128 E126
|
||||
# scrapy/downloadermiddlewares
|
||||
scrapy/downloadermiddlewares/ajaxcrawl.py E501
|
||||
scrapy/downloadermiddlewares/decompression.py E501
|
||||
scrapy/downloadermiddlewares/defaultheaders.py E501
|
||||
scrapy/downloadermiddlewares/httpcache.py E501 E126
|
||||
scrapy/downloadermiddlewares/httpcompression.py E501 E128
|
||||
scrapy/downloadermiddlewares/httpproxy.py E501
|
||||
scrapy/downloadermiddlewares/redirect.py E501 W504
|
||||
scrapy/downloadermiddlewares/retry.py E501 E126
|
||||
scrapy/downloadermiddlewares/robotstxt.py E501
|
||||
scrapy/downloadermiddlewares/stats.py E501
|
||||
# scrapy/extensions
|
||||
scrapy/extensions/closespider.py E501 E128 E123
|
||||
scrapy/extensions/corestats.py E501
|
||||
scrapy/extensions/feedexport.py E128 E501
|
||||
scrapy/extensions/httpcache.py E128 E501
|
||||
scrapy/extensions/memdebug.py E501
|
||||
scrapy/extensions/spiderstate.py E501
|
||||
scrapy/extensions/telnet.py E501 W504
|
||||
scrapy/extensions/throttle.py E501
|
||||
# scrapy/http
|
||||
scrapy/http/common.py E501
|
||||
scrapy/http/cookies.py E501
|
||||
scrapy/http/request/__init__.py E501
|
||||
scrapy/http/request/form.py E501 E123
|
||||
scrapy/http/request/json_request.py E501
|
||||
scrapy/http/response/__init__.py E501 E128
|
||||
scrapy/http/response/text.py E501 E128 E124
|
||||
# scrapy/linkextractors
|
||||
scrapy/linkextractors/__init__.py E731 E501 E402 W504
|
||||
scrapy/linkextractors/lxmlhtml.py E501 E731
|
||||
# scrapy/loader
|
||||
scrapy/loader/__init__.py E501 E128
|
||||
scrapy/loader/processors.py E501
|
||||
# scrapy/pipelines
|
||||
scrapy/pipelines/__init__.py E501
|
||||
scrapy/pipelines/files.py E116 E501 E266
|
||||
scrapy/pipelines/images.py E265 E501
|
||||
scrapy/pipelines/media.py E125 E501 E266
|
||||
# scrapy/selector
|
||||
scrapy/selector/__init__.py F403
|
||||
scrapy/selector/unified.py E501 E111
|
||||
# scrapy/settings
|
||||
scrapy/settings/__init__.py E501
|
||||
scrapy/settings/default_settings.py E501 E114 E116
|
||||
scrapy/settings/deprecated.py E501
|
||||
# scrapy/spidermiddlewares
|
||||
scrapy/spidermiddlewares/httperror.py E501
|
||||
scrapy/spidermiddlewares/offsite.py E501
|
||||
scrapy/spidermiddlewares/referer.py E501 E129 W504
|
||||
scrapy/spidermiddlewares/urllength.py E501
|
||||
# scrapy/spiders
|
||||
scrapy/spiders/__init__.py E501 E402
|
||||
scrapy/spiders/crawl.py E501
|
||||
scrapy/spiders/feed.py E501
|
||||
scrapy/spiders/sitemap.py E501
|
||||
# scrapy/utils
|
||||
scrapy/utils/asyncio.py E501
|
||||
scrapy/utils/benchserver.py E501
|
||||
scrapy/utils/conf.py E402 E501
|
||||
scrapy/utils/datatypes.py E501
|
||||
scrapy/utils/decorators.py E501
|
||||
scrapy/utils/defer.py E501 E128
|
||||
scrapy/utils/deprecate.py E128 E501 E127 E502
|
||||
scrapy/utils/gz.py E501 W504
|
||||
scrapy/utils/http.py F403
|
||||
scrapy/utils/httpobj.py E501
|
||||
scrapy/utils/iterators.py E501
|
||||
scrapy/utils/log.py E128 E501
|
||||
scrapy/utils/markup.py F403
|
||||
scrapy/utils/misc.py E501
|
||||
scrapy/utils/multipart.py F403
|
||||
scrapy/utils/project.py E501
|
||||
scrapy/utils/python.py E501
|
||||
scrapy/utils/reactor.py E501
|
||||
scrapy/utils/reqser.py E501
|
||||
scrapy/utils/request.py E127 E501
|
||||
scrapy/utils/response.py E501 E128
|
||||
scrapy/utils/signal.py E501 E128
|
||||
scrapy/utils/sitemap.py E501
|
||||
scrapy/utils/spider.py E501
|
||||
scrapy/utils/ssl.py E501
|
||||
scrapy/utils/test.py E501
|
||||
scrapy/utils/url.py E501 F403 E128 F405
|
||||
# scrapy
|
||||
scrapy/__init__.py E402 E501
|
||||
scrapy/cmdline.py E501
|
||||
scrapy/crawler.py E501
|
||||
scrapy/dupefilters.py E501 E202
|
||||
scrapy/exceptions.py E501
|
||||
scrapy/exporters.py E501
|
||||
scrapy/interfaces.py E501
|
||||
scrapy/item.py E501 E128
|
||||
scrapy/link.py E501
|
||||
scrapy/logformatter.py E501
|
||||
scrapy/mail.py E402 E128 E501 E502
|
||||
scrapy/middleware.py E128 E501
|
||||
scrapy/pqueues.py E501
|
||||
scrapy/resolver.py E501
|
||||
scrapy/responsetypes.py E128 E501
|
||||
scrapy/robotstxt.py E501
|
||||
scrapy/shell.py E501
|
||||
scrapy/signalmanager.py E501
|
||||
scrapy/spiderloader.py F841 E501 E126
|
||||
scrapy/squeues.py E128
|
||||
scrapy/statscollectors.py E501
|
||||
# tests
|
||||
tests/__init__.py E402 E501
|
||||
tests/mockserver.py E401 E501 E126 E123
|
||||
tests/pipelines.py F841
|
||||
tests/spiders.py E501 E127
|
||||
tests/test_closespider.py E501 E127
|
||||
tests/test_command_fetch.py E501
|
||||
tests/test_command_parse.py E501 E128
|
||||
tests/test_command_shell.py E501 E128
|
||||
tests/test_commands.py E128 E501
|
||||
tests/test_contracts.py E501 E128
|
||||
tests/test_crawl.py E501 E741 E265
|
||||
tests/test_crawler.py F841 E501
|
||||
tests/test_dependencies.py F841 E501
|
||||
tests/test_downloader_handlers.py E124 E127 E128 E265 E501 E126 E123
|
||||
tests/test_downloadermiddleware.py E501
|
||||
tests/test_downloadermiddleware_ajaxcrawlable.py E501
|
||||
tests/test_downloadermiddleware_cookies.py E731 E741 E501 E128 E265 E126
|
||||
tests/test_downloadermiddleware_decompression.py E127
|
||||
tests/test_downloadermiddleware_defaultheaders.py E501
|
||||
tests/test_downloadermiddleware_downloadtimeout.py E501
|
||||
tests/test_downloadermiddleware_httpcache.py E501
|
||||
tests/test_downloadermiddleware_httpcompression.py E501 E126 E123
|
||||
tests/test_downloadermiddleware_httpproxy.py E501 E128
|
||||
tests/test_downloadermiddleware_redirect.py E501 E128 E127
|
||||
tests/test_downloadermiddleware_retry.py E501 E128 E126
|
||||
tests/test_downloadermiddleware_robotstxt.py E501
|
||||
tests/test_downloadermiddleware_stats.py E501
|
||||
tests/test_dupefilters.py E501 E741 E128 E124
|
||||
tests/test_engine.py E401 E501 E128
|
||||
tests/test_exporters.py E501 E731 E128 E124
|
||||
tests/test_extension_telnet.py F841
|
||||
tests/test_feedexport.py E501 F841 E241
|
||||
tests/test_http_cookies.py E501
|
||||
tests/test_http_headers.py E501
|
||||
tests/test_http_request.py E402 E501 E127 E128 E128 E126 E123
|
||||
tests/test_http_response.py E501 E128 E265
|
||||
tests/test_item.py E128 F841
|
||||
tests/test_link.py E501
|
||||
tests/test_linkextractors.py E501 E128 E124
|
||||
tests/test_loader.py E501 E731 E741 E128 E117 E241
|
||||
tests/test_logformatter.py E128 E501 E122
|
||||
tests/test_mail.py E128 E501
|
||||
tests/test_middleware.py E501 E128
|
||||
tests/test_pipeline_crawl.py E501 E128 E126
|
||||
tests/test_pipeline_files.py E501
|
||||
tests/test_pipeline_images.py F841 E501
|
||||
tests/test_pipeline_media.py E501 E741 E731 E128 E502
|
||||
tests/test_proxy_connect.py E501 E741
|
||||
tests/test_request_cb_kwargs.py E501
|
||||
tests/test_responsetypes.py E501
|
||||
tests/test_robotstxt_interface.py E501 E501
|
||||
tests/test_scheduler.py E501 E126 E123
|
||||
tests/test_selector.py E501 E127
|
||||
tests/test_spider.py E501
|
||||
tests/test_spidermiddleware.py E501
|
||||
tests/test_spidermiddleware_httperror.py E128 E501 E127 E121
|
||||
tests/test_spidermiddleware_offsite.py E501 E128 E111
|
||||
tests/test_spidermiddleware_output_chain.py E501
|
||||
tests/test_spidermiddleware_referer.py E501 F841 E125 E201 E124 E501 E241 E121
|
||||
tests/test_squeues.py E501 E741
|
||||
tests/test_utils_asyncio.py E501
|
||||
tests/test_utils_conf.py E501 E128
|
||||
tests/test_utils_curl.py E501
|
||||
tests/test_utils_datatypes.py E402 E501
|
||||
tests/test_utils_defer.py E501 F841
|
||||
tests/test_utils_deprecate.py F841 E501
|
||||
tests/test_utils_http.py E501 E128 W504
|
||||
tests/test_utils_iterators.py E501 E128 E129 E241
|
||||
tests/test_utils_log.py E741
|
||||
tests/test_utils_python.py E501 E731
|
||||
tests/test_utils_reqser.py E501 E128
|
||||
tests/test_utils_request.py E501 E128
|
||||
tests/test_utils_response.py E501
|
||||
tests/test_utils_signal.py E741 F841 E731
|
||||
tests/test_utils_sitemap.py E128 E501 E124
|
||||
tests/test_utils_url.py E501 E127 E125 E501 E241 E126 E123
|
||||
tests/test_webclient.py E501 E128 E122 E402 E241 E123 E126
|
||||
tests/test_cmdline/__init__.py E501
|
||||
tests/test_settings/__init__.py E501 E128
|
||||
tests/test_spiderloader/__init__.py E128 E501
|
||||
tests/test_utils_misc/__init__.py E501
|
||||
scrapy/utils/url.py F403 F405
|
||||
tests/test_loader.py E741
|
||||
|
@ -1 +1 @@
|
||||
2.0.0
|
||||
2.2.0
|
||||
|
@ -2,33 +2,11 @@
|
||||
Scrapy - a web crawling and web scraping framework written for Python
|
||||
"""
|
||||
|
||||
__all__ = ['__version__', 'version_info', 'twisted_version',
|
||||
'Spider', 'Request', 'FormRequest', 'Selector', 'Item', 'Field']
|
||||
|
||||
# Scrapy version
|
||||
import pkgutil
|
||||
__version__ = pkgutil.get_data(__package__, 'VERSION').decode('ascii').strip()
|
||||
version_info = tuple(int(v) if v.isdigit() else v
|
||||
for v in __version__.split('.'))
|
||||
del pkgutil
|
||||
|
||||
# Check minimum required Python version
|
||||
import sys
|
||||
if sys.version_info < (3, 5):
|
||||
print("Scrapy %s requires Python 3.5" % __version__)
|
||||
sys.exit(1)
|
||||
|
||||
# Ignore noisy twisted deprecation warnings
|
||||
import warnings
|
||||
warnings.filterwarnings('ignore', category=DeprecationWarning, module='twisted')
|
||||
del warnings
|
||||
|
||||
# Apply monkey patches to fix issues in external libraries
|
||||
from scrapy import _monkeypatches
|
||||
del _monkeypatches
|
||||
|
||||
from twisted import version as _txv
|
||||
twisted_version = (_txv.major, _txv.minor, _txv.micro)
|
||||
|
||||
# Declare top-level shortcuts
|
||||
from scrapy.spiders import Spider
|
||||
@ -36,4 +14,29 @@ from scrapy.http import Request, FormRequest
|
||||
from scrapy.selector import Selector
|
||||
from scrapy.item import Item, Field
|
||||
|
||||
|
||||
__all__ = [
|
||||
'__version__', 'version_info', 'twisted_version', 'Spider',
|
||||
'Request', 'FormRequest', 'Selector', 'Item', 'Field',
|
||||
]
|
||||
|
||||
|
||||
# Scrapy and Twisted versions
|
||||
__version__ = pkgutil.get_data(__package__, 'VERSION').decode('ascii').strip()
|
||||
version_info = tuple(int(v) if v.isdigit() else v for v in __version__.split('.'))
|
||||
twisted_version = (_txv.major, _txv.minor, _txv.micro)
|
||||
|
||||
|
||||
# Check minimum required Python version
|
||||
if sys.version_info < (3, 5, 2):
|
||||
print("Scrapy %s requires Python 3.5.2" % __version__)
|
||||
sys.exit(1)
|
||||
|
||||
|
||||
# Ignore noisy twisted deprecation warnings
|
||||
warnings.filterwarnings('ignore', category=DeprecationWarning, module='twisted')
|
||||
|
||||
|
||||
del pkgutil
|
||||
del sys
|
||||
del warnings
|
||||
|
@ -1,11 +0,0 @@
|
||||
import copyreg
|
||||
|
||||
|
||||
# Undo what Twisted's perspective broker adds to pickle register
|
||||
# to prevent bugs like Twisted#7989 while serializing requests
|
||||
import twisted.persisted.styles # NOQA
|
||||
# Remove only entries with twisted serializers for non-twisted types.
|
||||
for k, v in frozenset(copyreg.dispatch_table.items()):
|
||||
if not str(getattr(k, '__module__', '')).startswith('twisted') \
|
||||
and str(getattr(v, '__module__', '')).startswith('twisted'):
|
||||
copyreg.dispatch_table.pop(k)
|
@ -165,6 +165,7 @@ if __name__ == '__main__':
|
||||
try:
|
||||
execute()
|
||||
finally:
|
||||
# Twisted prints errors in DebugInfo.__del__, but PyPy does not run gc.collect()
|
||||
# on exit: http://doc.pypy.org/en/latest/cpython_differences.html?highlight=gc.collect#differences-related-to-garbage-collection-strategies
|
||||
# Twisted prints errors in DebugInfo.__del__, but PyPy does not run gc.collect() on exit:
|
||||
# http://doc.pypy.org/en/latest/cpython_differences.html
|
||||
# ?highlight=gc.collect#differences-related-to-garbage-collection-strategies
|
||||
garbage_collect()
|
||||
|
@ -5,7 +5,7 @@ import os
|
||||
from optparse import OptionGroup
|
||||
from twisted.python import failure
|
||||
|
||||
from scrapy.utils.conf import arglist_to_dict
|
||||
from scrapy.utils.conf import arglist_to_dict, feed_process_params_from_cli
|
||||
from scrapy.exceptions import UsageError
|
||||
|
||||
|
||||
@ -23,7 +23,8 @@ class ScrapyCommand:
|
||||
self.settings = None # set in scrapy.cmdline
|
||||
|
||||
def set_crawler(self, crawler):
|
||||
assert not hasattr(self, '_crawler'), "crawler already set"
|
||||
if hasattr(self, '_crawler'):
|
||||
raise RuntimeError("crawler already set")
|
||||
self._crawler = crawler
|
||||
|
||||
def syntax(self):
|
||||
@ -103,3 +104,27 @@ class ScrapyCommand:
|
||||
Entry point for running commands
|
||||
"""
|
||||
raise NotImplementedError
|
||||
|
||||
|
||||
class BaseRunSpiderCommand(ScrapyCommand):
|
||||
"""
|
||||
Common class used to share functionality between the crawl and runspider commands
|
||||
"""
|
||||
def add_options(self, parser):
|
||||
ScrapyCommand.add_options(self, parser)
|
||||
parser.add_option("-a", dest="spargs", action="append", default=[], metavar="NAME=VALUE",
|
||||
help="set spider argument (may be repeated)")
|
||||
parser.add_option("-o", "--output", metavar="FILE", action="append",
|
||||
help="dump scraped items into FILE (use - for stdout)")
|
||||
parser.add_option("-t", "--output-format", metavar="FORMAT",
|
||||
help="format to use for dumping items with -o")
|
||||
|
||||
def process_options(self, args, opts):
|
||||
ScrapyCommand.process_options(self, args, opts)
|
||||
try:
|
||||
opts.spargs = arglist_to_dict(opts.spargs)
|
||||
except ValueError:
|
||||
raise UsageError("Invalid -a value, use -a NAME=VALUE", print_help=False)
|
||||
if opts.output:
|
||||
feeds = feed_process_params_from_cli(self.settings, opts.output, opts.output_format)
|
||||
self.settings.set('FEEDS', feeds, priority='cmdline')
|
||||
|
@ -1,9 +1,8 @@
|
||||
from scrapy.commands import ScrapyCommand
|
||||
from scrapy.utils.conf import arglist_to_dict, feed_process_params_from_cli
|
||||
from scrapy.commands import BaseRunSpiderCommand
|
||||
from scrapy.exceptions import UsageError
|
||||
|
||||
|
||||
class Command(ScrapyCommand):
|
||||
class Command(BaseRunSpiderCommand):
|
||||
|
||||
requires_project = True
|
||||
|
||||
@ -13,25 +12,6 @@ class Command(ScrapyCommand):
|
||||
def short_desc(self):
|
||||
return "Run a spider"
|
||||
|
||||
def add_options(self, parser):
|
||||
ScrapyCommand.add_options(self, parser)
|
||||
parser.add_option("-a", dest="spargs", action="append", default=[], metavar="NAME=VALUE",
|
||||
help="set spider argument (may be repeated)")
|
||||
parser.add_option("-o", "--output", metavar="FILE", action="append",
|
||||
help="dump scraped items into FILE (use - for stdout)")
|
||||
parser.add_option("-t", "--output-format", metavar="FORMAT",
|
||||
help="format to use for dumping items with -o")
|
||||
|
||||
def process_options(self, args, opts):
|
||||
ScrapyCommand.process_options(self, args, opts)
|
||||
try:
|
||||
opts.spargs = arglist_to_dict(opts.spargs)
|
||||
except ValueError:
|
||||
raise UsageError("Invalid -a value, use -a NAME=VALUE", print_help=False)
|
||||
if opts.output:
|
||||
feeds = feed_process_params_from_cli(self.settings, opts.output, opts.output_format)
|
||||
self.settings.set('FEEDS', feeds, priority='cmdline')
|
||||
|
||||
def run(self, args, opts):
|
||||
if len(args) < 1:
|
||||
raise UsageError()
|
||||
|
@ -27,8 +27,8 @@ class Command(ScrapyCommand):
|
||||
parser.add_option("--spider", dest="spider", help="use this spider")
|
||||
parser.add_option("--headers", dest="headers", action="store_true",
|
||||
help="print response HTTP headers instead of body")
|
||||
parser.add_option("--no-redirect", dest="no_redirect", action="store_true",
|
||||
default=False, help="do not handle HTTP 3xx status codes and print response as-is")
|
||||
parser.add_option("--no-redirect", dest="no_redirect", action="store_true", default=False,
|
||||
help="do not handle HTTP 3xx status codes and print response as-is")
|
||||
|
||||
def _print_headers(self, headers, prefix):
|
||||
for key, values in headers.items():
|
||||
@ -49,8 +49,8 @@ class Command(ScrapyCommand):
|
||||
def run(self, args, opts):
|
||||
if len(args) != 1 or not is_url(args[0]):
|
||||
raise UsageError()
|
||||
cb = lambda x: self._print_response(x, opts)
|
||||
request = Request(args[0], callback=cb, dont_filter=True)
|
||||
request = Request(args[0], callback=self._print_response,
|
||||
cb_kwargs={"opts": opts}, dont_filter=True)
|
||||
# by default, let the framework handle redirects,
|
||||
# i.e. the command handles all codes except 3xx
|
||||
if not opts.no_redirect:
|
||||
|
@ -90,8 +90,7 @@ class Command(ScrapyCommand):
|
||||
'module': module,
|
||||
'name': name,
|
||||
'domain': domain,
|
||||
'classname': '%sSpider' % ''.join(s.capitalize() \
|
||||
for s in module.split('_'))
|
||||
'classname': '%sSpider' % ''.join(s.capitalize() for s in module.split('_'))
|
||||
}
|
||||
if self.settings.get('NEWSPIDER_MODULE'):
|
||||
spiders_module = import_module(self.settings['NEWSPIDER_MODULE'])
|
||||
@ -102,8 +101,8 @@ class Command(ScrapyCommand):
|
||||
spider_file = "%s.py" % join(spiders_dir, module)
|
||||
shutil.copyfile(template_file, spider_file)
|
||||
render_templatefile(spider_file, **tvars)
|
||||
print("Created spider %r using template %r " % (name, \
|
||||
template_name), end=('' if spiders_module else '\n'))
|
||||
print("Created spider %r using template %r "
|
||||
% (name, template_name), end=('' if spiders_module else '\n'))
|
||||
if spiders_module:
|
||||
print("in module:\n %s.%s" % (spiders_module.__name__, module))
|
||||
|
||||
|
@ -1,11 +1,11 @@
|
||||
import json
|
||||
import logging
|
||||
|
||||
from itemadapter import is_item, ItemAdapter
|
||||
from w3lib.url import is_url
|
||||
|
||||
from scrapy.commands import ScrapyCommand
|
||||
from scrapy.http import Request
|
||||
from scrapy.item import BaseItem
|
||||
from scrapy.utils import display
|
||||
from scrapy.utils.conf import arglist_to_dict
|
||||
from scrapy.utils.spider import iterate_spider_output, spidercls_for_request
|
||||
@ -81,7 +81,7 @@ class Command(ScrapyCommand):
|
||||
items = self.items.get(lvl, [])
|
||||
|
||||
print("# Scraped Items ", "-" * 60)
|
||||
display.pprint([dict(x) for x in items], colorize=colour)
|
||||
display.pprint([ItemAdapter(x).asdict() for x in items], colorize=colour)
|
||||
|
||||
def print_requests(self, lvl=None, colour=True):
|
||||
if lvl is None:
|
||||
@ -117,7 +117,7 @@ class Command(ScrapyCommand):
|
||||
items, requests = [], []
|
||||
|
||||
for x in iterate_spider_output(callback(response, **cb_kwargs)):
|
||||
if isinstance(x, (BaseItem, dict)):
|
||||
if is_item(x):
|
||||
items.append(x)
|
||||
elif isinstance(x, Request):
|
||||
requests.append(x)
|
||||
@ -146,9 +146,8 @@ class Command(ScrapyCommand):
|
||||
if not self.spidercls:
|
||||
logger.error('Unable to find spider for: %(url)s', {'url': url})
|
||||
|
||||
# Request requires callback argument as callable or None, not string
|
||||
request = Request(url, None)
|
||||
_start_requests = lambda s: [self.prepare_request(s, request, opts)]
|
||||
def _start_requests(spider):
|
||||
yield self.prepare_request(spider, Request(url), opts)
|
||||
self.spidercls.start_requests = _start_requests
|
||||
|
||||
def start_parsing(self, url, opts):
|
||||
|
@ -3,9 +3,8 @@ import os
|
||||
from importlib import import_module
|
||||
|
||||
from scrapy.utils.spider import iter_spider_classes
|
||||
from scrapy.commands import ScrapyCommand
|
||||
from scrapy.exceptions import UsageError
|
||||
from scrapy.utils.conf import arglist_to_dict, feed_process_params_from_cli
|
||||
from scrapy.commands import BaseRunSpiderCommand
|
||||
|
||||
|
||||
def _import_file(filepath):
|
||||
@ -24,7 +23,7 @@ def _import_file(filepath):
|
||||
return module
|
||||
|
||||
|
||||
class Command(ScrapyCommand):
|
||||
class Command(BaseRunSpiderCommand):
|
||||
|
||||
requires_project = False
|
||||
default_settings = {'SPIDER_LOADER_WARN_ONLY': True}
|
||||
@ -38,25 +37,6 @@ class Command(ScrapyCommand):
|
||||
def long_desc(self):
|
||||
return "Run the spider defined in the given file"
|
||||
|
||||
def add_options(self, parser):
|
||||
ScrapyCommand.add_options(self, parser)
|
||||
parser.add_option("-a", dest="spargs", action="append", default=[], metavar="NAME=VALUE",
|
||||
help="set spider argument (may be repeated)")
|
||||
parser.add_option("-o", "--output", metavar="FILE", action="append",
|
||||
help="dump scraped items into FILE (use - for stdout)")
|
||||
parser.add_option("-t", "--output-format", metavar="FORMAT",
|
||||
help="format to use for dumping items with -o")
|
||||
|
||||
def process_options(self, args, opts):
|
||||
ScrapyCommand.process_options(self, args, opts)
|
||||
try:
|
||||
opts.spargs = arglist_to_dict(opts.spargs)
|
||||
except ValueError:
|
||||
raise UsageError("Invalid -a value, use -a NAME=VALUE", print_help=False)
|
||||
if opts.output:
|
||||
feeds = feed_process_params_from_cli(self.settings, opts.output, opts.output_format)
|
||||
self.settings.set('FEEDS', feeds, priority='cmdline')
|
||||
|
||||
def run(self, args, opts):
|
||||
if len(args) != 1:
|
||||
raise UsageError()
|
||||
|
@ -37,8 +37,8 @@ class Command(ScrapyCommand):
|
||||
help="evaluate the code in the shell, print the result and exit")
|
||||
parser.add_option("--spider", dest="spider",
|
||||
help="use this spider")
|
||||
parser.add_option("--no-redirect", dest="no_redirect", action="store_true", \
|
||||
default=False, help="do not handle HTTP 3xx status codes and print response as-is")
|
||||
parser.add_option("--no-redirect", dest="no_redirect", action="store_true", default=False,
|
||||
help="do not handle HTTP 3xx status codes and print response as-is")
|
||||
|
||||
def update_vars(self, vars):
|
||||
"""You can use this function to update the Scrapy objects that will be
|
||||
|
@ -1,5 +1,6 @@
|
||||
import re
|
||||
import os
|
||||
import stat
|
||||
import string
|
||||
from importlib import import_module
|
||||
from os.path import join, exists, abspath
|
||||
@ -78,6 +79,29 @@ class Command(ScrapyCommand):
|
||||
else:
|
||||
copy2(srcname, dstname)
|
||||
copystat(src, dst)
|
||||
self._set_rw_permissions(dst)
|
||||
|
||||
def _set_rw_permissions(self, path):
|
||||
"""
|
||||
Sets permissions of files in a directory tree to +rw, and of folders to +rwx.
|
||||
This is necessary if the start template files come without write
|
||||
permissions.
|
||||
"""
|
||||
mode_rw = (stat.S_IRUSR
|
||||
| stat.S_IWUSR
|
||||
| stat.S_IRGRP
|
||||
| stat.S_IROTH)
|
||||
|
||||
mode_x = (stat.S_IXUSR
|
||||
| stat.S_IXGRP
|
||||
| stat.S_IXOTH)
|
||||
|
||||
os.chmod(path, mode_rw | mode_x)
|
||||
for root, dirs, files in os.walk(path):
|
||||
for dir in dirs:
|
||||
os.chmod(join(root, dir), mode_rw | mode_x)
|
||||
for file in files:
|
||||
os.chmod(join(root, file), mode_rw)
|
||||
|
||||
def run(self, args, opts):
|
||||
if len(args) not in (1, 2):
|
||||
@ -102,10 +126,8 @@ class Command(ScrapyCommand):
|
||||
move(join(project_dir, 'module'), join(project_dir, project_name))
|
||||
for paths in TEMPLATES_TO_RENDER:
|
||||
path = join(*paths)
|
||||
tplfile = join(project_dir,
|
||||
string.Template(path).substitute(project_name=project_name))
|
||||
render_templatefile(tplfile, project_name=project_name,
|
||||
ProjectName=string_camelcase(project_name))
|
||||
tplfile = join(project_dir, string.Template(path).substitute(project_name=project_name))
|
||||
render_templatefile(tplfile, project_name=project_name, ProjectName=string_camelcase(project_name))
|
||||
print("New Scrapy project '%s', using template directory '%s', "
|
||||
"created in:" % (project_name, self.templates_dir))
|
||||
print(" %s\n" % abspath(project_dir))
|
||||
|
@ -17,10 +17,10 @@ class ContractsManager:
|
||||
self.contracts[contract.name] = contract
|
||||
|
||||
def tested_methods_from_spidercls(self, spidercls):
|
||||
is_method = re.compile(r"^\s*@", re.MULTILINE).search
|
||||
methods = []
|
||||
for key, value in getmembers(spidercls):
|
||||
if (callable(value) and value.__doc__ and
|
||||
re.search(r'^\s*@', value.__doc__, re.MULTILINE)):
|
||||
if callable(value) and value.__doc__ and is_method(value.__doc__):
|
||||
methods.append(key)
|
||||
|
||||
return methods
|
||||
|
@ -1,10 +1,10 @@
|
||||
import json
|
||||
|
||||
from scrapy.item import BaseItem
|
||||
from scrapy.http import Request
|
||||
from scrapy.exceptions import ContractFail
|
||||
from itemadapter import is_item, ItemAdapter
|
||||
|
||||
from scrapy.contracts import Contract
|
||||
from scrapy.exceptions import ContractFail
|
||||
from scrapy.http import Request
|
||||
|
||||
|
||||
# contracts
|
||||
@ -48,19 +48,23 @@ class ReturnsContract(Contract):
|
||||
"""
|
||||
|
||||
name = 'returns'
|
||||
objects = {
|
||||
'request': Request,
|
||||
'requests': Request,
|
||||
'item': (BaseItem, dict),
|
||||
'items': (BaseItem, dict),
|
||||
object_type_verifiers = {
|
||||
'request': lambda x: isinstance(x, Request),
|
||||
'requests': lambda x: isinstance(x, Request),
|
||||
'item': is_item,
|
||||
'items': is_item,
|
||||
}
|
||||
|
||||
def __init__(self, *args, **kwargs):
|
||||
super(ReturnsContract, self).__init__(*args, **kwargs)
|
||||
|
||||
assert len(self.args) in [1, 2, 3]
|
||||
if len(self.args) not in [1, 2, 3]:
|
||||
raise ValueError(
|
||||
"Incorrect argument quantity: expected 1, 2 or 3, got %i"
|
||||
% len(self.args)
|
||||
)
|
||||
self.obj_name = self.args[0] or None
|
||||
self.obj_type = self.objects[self.obj_name]
|
||||
self.obj_type_verifier = self.object_type_verifiers[self.obj_name]
|
||||
|
||||
try:
|
||||
self.min_bound = int(self.args[1])
|
||||
@ -75,7 +79,7 @@ class ReturnsContract(Contract):
|
||||
def post_process(self, output):
|
||||
occurrences = 0
|
||||
for x in output:
|
||||
if isinstance(x, self.obj_type):
|
||||
if self.obj_type_verifier(x):
|
||||
occurrences += 1
|
||||
|
||||
assertion = (self.min_bound <= occurrences <= self.max_bound)
|
||||
@ -99,8 +103,8 @@ class ScrapesContract(Contract):
|
||||
|
||||
def post_process(self, output):
|
||||
for x in output:
|
||||
if isinstance(x, (BaseItem, dict)):
|
||||
missing = [arg for arg in self.args if arg not in x]
|
||||
if is_item(x):
|
||||
missing = [arg for arg in self.args if arg not in ItemAdapter(x)]
|
||||
if missing:
|
||||
raise ContractFail(
|
||||
"Missing fields: %s" % ", ".join(missing))
|
||||
missing_str = ", ".join(missing)
|
||||
raise ContractFail("Missing fields: %s" % missing_str)
|
||||
|
@ -46,11 +46,12 @@ class ScrapyClientContextFactory(BrowserLikePolicyForHTTPS):
|
||||
#
|
||||
# * getattr() for `_ssl_method` attribute for context factories
|
||||
# not calling super(..., self).__init__
|
||||
return CertificateOptions(verify=False,
|
||||
method=getattr(self, 'method',
|
||||
getattr(self, '_ssl_method', None)),
|
||||
return CertificateOptions(
|
||||
verify=False,
|
||||
method=getattr(self, 'method', getattr(self, '_ssl_method', None)),
|
||||
fixBrokenPeers=True,
|
||||
acceptableCiphers=self.tls_ciphers)
|
||||
acceptableCiphers=self.tls_ciphers,
|
||||
)
|
||||
|
||||
# kept for old-style HTTP/1.0 downloader context twisted calls,
|
||||
# e.g. connectSSL()
|
||||
@ -86,8 +87,8 @@ class BrowserLikeContextFactory(ScrapyClientContextFactory):
|
||||
#
|
||||
# This means that a website like https://www.cacert.org will be rejected
|
||||
# by default, since CAcert.org CA certificate is seldom shipped.
|
||||
return optionsForClientTLS(hostname.decode("ascii"),
|
||||
return optionsForClientTLS(
|
||||
hostname=hostname.decode("ascii"),
|
||||
trustRoot=platformTrust(),
|
||||
extraCertificateOptions={
|
||||
'method': self._ssl_method,
|
||||
})
|
||||
extraCertificateOptions={'method': self._ssl_method},
|
||||
)
|
||||
|
@ -86,19 +86,19 @@ class FTPDownloadHandler:
|
||||
password = request.meta.get("ftp_password", self.default_password)
|
||||
passive_mode = 1 if bool(request.meta.get("ftp_passive",
|
||||
self.passive_mode)) else 0
|
||||
creator = ClientCreator(reactor, FTPClient, user, password,
|
||||
passive=passive_mode)
|
||||
return creator.connectTCP(parsed_url.hostname, parsed_url.port or 21).addCallback(self.gotClient,
|
||||
request, unquote(parsed_url.path))
|
||||
creator = ClientCreator(reactor, FTPClient, user, password, passive=passive_mode)
|
||||
dfd = creator.connectTCP(parsed_url.hostname, parsed_url.port or 21)
|
||||
return dfd.addCallback(self.gotClient, request, unquote(parsed_url.path))
|
||||
|
||||
def gotClient(self, client, request, filepath):
|
||||
self.client = client
|
||||
protocol = ReceivedDataProtocol(request.meta.get("ftp_local_filename"))
|
||||
return client.retrieveFile(filepath, protocol)\
|
||||
.addCallbacks(callback=self._build_response,
|
||||
return client.retrieveFile(filepath, protocol).addCallbacks(
|
||||
callback=self._build_response,
|
||||
callbackArgs=(request, protocol),
|
||||
errback=self._failed,
|
||||
errbackArgs=(request,))
|
||||
errbackArgs=(request,),
|
||||
)
|
||||
|
||||
def _build_response(self, result, request, protocol):
|
||||
self.result = result
|
||||
|
@ -1,5 +1,6 @@
|
||||
"""Download handlers for http and https schemes"""
|
||||
|
||||
import ipaddress
|
||||
import logging
|
||||
import re
|
||||
import warnings
|
||||
@ -11,15 +12,17 @@ from urllib.parse import urldefrag
|
||||
from twisted.internet import defer, protocol, ssl
|
||||
from twisted.internet.endpoints import TCP4ClientEndpoint
|
||||
from twisted.internet.error import TimeoutError
|
||||
from twisted.python.failure import Failure
|
||||
from twisted.web.client import Agent, HTTPConnectionPool, ResponseDone, ResponseFailed, URI
|
||||
from twisted.web.http import _DataLoss, PotentialDataLoss
|
||||
from twisted.web.http_headers import Headers as TxHeaders
|
||||
from twisted.web.iweb import IBodyProducer, UNKNOWN_LENGTH
|
||||
from zope.interface import implementer
|
||||
|
||||
from scrapy import signals
|
||||
from scrapy.core.downloader.tls import openssl_methods
|
||||
from scrapy.core.downloader.webclient import _parse
|
||||
from scrapy.exceptions import ScrapyDeprecationWarning
|
||||
from scrapy.exceptions import ScrapyDeprecationWarning, StopDownload
|
||||
from scrapy.http import Headers
|
||||
from scrapy.responsetypes import responsetypes
|
||||
from scrapy.utils.misc import create_instance, load_object
|
||||
@ -33,6 +36,8 @@ class HTTP11DownloadHandler:
|
||||
lazy = False
|
||||
|
||||
def __init__(self, settings, crawler=None):
|
||||
self._crawler = crawler
|
||||
|
||||
from twisted.internet import reactor
|
||||
self._pool = HTTPConnectionPool(reactor, persistent=True)
|
||||
self._pool.maxPersistentPerHost = settings.getint('CONCURRENT_REQUESTS_PER_DOMAIN')
|
||||
@ -78,6 +83,7 @@ class HTTP11DownloadHandler:
|
||||
maxsize=getattr(spider, 'download_maxsize', self._default_maxsize),
|
||||
warnsize=getattr(spider, 'download_warnsize', self._default_warnsize),
|
||||
fail_on_dataloss=self._fail_on_dataloss,
|
||||
crawler=self._crawler,
|
||||
)
|
||||
return agent.download_request(request)
|
||||
|
||||
@ -275,7 +281,7 @@ class ScrapyAgent:
|
||||
_TunnelingAgent = TunnelingAgent
|
||||
|
||||
def __init__(self, contextFactory=None, connectTimeout=10, bindAddress=None, pool=None,
|
||||
maxsize=0, warnsize=0, fail_on_dataloss=True):
|
||||
maxsize=0, warnsize=0, fail_on_dataloss=True, crawler=None):
|
||||
self._contextFactory = contextFactory
|
||||
self._connectTimeout = connectTimeout
|
||||
self._bindAddress = bindAddress
|
||||
@ -284,6 +290,7 @@ class ScrapyAgent:
|
||||
self._warnsize = warnsize
|
||||
self._fail_on_dataloss = fail_on_dataloss
|
||||
self._txresponse = None
|
||||
self._crawler = crawler
|
||||
|
||||
def _get_agent(self, request, timeout):
|
||||
from twisted.internet import reactor
|
||||
@ -341,20 +348,6 @@ class ScrapyAgent:
|
||||
headers.removeHeader(b'Proxy-Authorization')
|
||||
if request.body:
|
||||
bodyproducer = _RequestBodyProducer(request.body)
|
||||
elif method == b'POST':
|
||||
# Setting Content-Length: 0 even for POST requests is not a
|
||||
# MUST per HTTP RFCs, but it's common behavior, and some
|
||||
# servers require this, otherwise returning HTTP 411 Length required
|
||||
#
|
||||
# RFC 7230#section-3.3.2:
|
||||
# "a Content-Length header field is normally sent in a POST
|
||||
# request even when the value is 0 (indicating an empty payload body)."
|
||||
#
|
||||
# Twisted < 17 will not add "Content-Length: 0" by itself;
|
||||
# Twisted >= 17 fixes this;
|
||||
# Using a producer with an empty-string sends `0` as Content-Length
|
||||
# for all versions of Twisted.
|
||||
bodyproducer = _RequestBodyProducer(b'')
|
||||
else:
|
||||
bodyproducer = None
|
||||
start_time = time()
|
||||
@ -387,7 +380,13 @@ class ScrapyAgent:
|
||||
def _cb_bodyready(self, txresponse, request):
|
||||
# deliverBody hangs for responses without body
|
||||
if txresponse.length == 0:
|
||||
return txresponse, b'', None, None
|
||||
return {
|
||||
"txresponse": txresponse,
|
||||
"body": b"",
|
||||
"flags": None,
|
||||
"certificate": None,
|
||||
"ip_address": None,
|
||||
}
|
||||
|
||||
maxsize = request.meta.get('download_maxsize', self._maxsize)
|
||||
warnsize = request.meta.get('download_warnsize', self._warnsize)
|
||||
@ -414,7 +413,15 @@ class ScrapyAgent:
|
||||
|
||||
d = defer.Deferred(_cancel)
|
||||
txresponse.deliverBody(
|
||||
_ResponseReader(d, txresponse, request, maxsize, warnsize, fail_on_dataloss)
|
||||
_ResponseReader(
|
||||
finished=d,
|
||||
txresponse=txresponse,
|
||||
request=request,
|
||||
maxsize=maxsize,
|
||||
warnsize=warnsize,
|
||||
fail_on_dataloss=fail_on_dataloss,
|
||||
crawler=self._crawler,
|
||||
)
|
||||
)
|
||||
|
||||
# save response for timeouts
|
||||
@ -423,12 +430,21 @@ class ScrapyAgent:
|
||||
return d
|
||||
|
||||
def _cb_bodydone(self, result, request, url):
|
||||
txresponse, body, flags, certificate = result
|
||||
status = int(txresponse.code)
|
||||
headers = Headers(txresponse.headers.getAllRawHeaders())
|
||||
respcls = responsetypes.from_args(headers=headers, url=url, body=body)
|
||||
return respcls(url=url, status=status, headers=headers, body=body,
|
||||
flags=flags, certificate=certificate)
|
||||
headers = Headers(result["txresponse"].headers.getAllRawHeaders())
|
||||
respcls = responsetypes.from_args(headers=headers, url=url, body=result["body"])
|
||||
response = respcls(
|
||||
url=url,
|
||||
status=int(result["txresponse"].code),
|
||||
headers=headers,
|
||||
body=result["body"],
|
||||
flags=result["flags"],
|
||||
certificate=result["certificate"],
|
||||
ip_address=result["ip_address"],
|
||||
)
|
||||
if result.get("failure"):
|
||||
result["failure"].value.response = response
|
||||
return result["failure"]
|
||||
return response
|
||||
|
||||
|
||||
@implementer(IBodyProducer)
|
||||
@ -451,7 +467,7 @@ class _RequestBodyProducer:
|
||||
|
||||
class _ResponseReader(protocol.Protocol):
|
||||
|
||||
def __init__(self, finished, txresponse, request, maxsize, warnsize, fail_on_dataloss):
|
||||
def __init__(self, finished, txresponse, request, maxsize, warnsize, fail_on_dataloss, crawler):
|
||||
self._finished = finished
|
||||
self._txresponse = txresponse
|
||||
self._request = request
|
||||
@ -463,12 +479,27 @@ class _ResponseReader(protocol.Protocol):
|
||||
self._reached_warnsize = False
|
||||
self._bytes_received = 0
|
||||
self._certificate = None
|
||||
self._ip_address = None
|
||||
self._crawler = crawler
|
||||
|
||||
def _finish_response(self, flags=None, failure=None):
|
||||
self._finished.callback({
|
||||
"txresponse": self._txresponse,
|
||||
"body": self._bodybuf.getvalue(),
|
||||
"flags": flags,
|
||||
"certificate": self._certificate,
|
||||
"ip_address": self._ip_address,
|
||||
"failure": failure,
|
||||
})
|
||||
|
||||
def connectionMade(self):
|
||||
if self._certificate is None:
|
||||
with suppress(AttributeError):
|
||||
self._certificate = ssl.Certificate(self.transport._producer.getPeerCertificate())
|
||||
|
||||
if self._ip_address is None:
|
||||
self._ip_address = ipaddress.ip_address(self.transport._producer.getPeer().host)
|
||||
|
||||
def dataReceived(self, bodyBytes):
|
||||
# This may be called several times after cancel was called with buffered data.
|
||||
if self._finished.called:
|
||||
@ -477,6 +508,20 @@ class _ResponseReader(protocol.Protocol):
|
||||
self._bodybuf.write(bodyBytes)
self._bytes_received += len(bodyBytes)

bytes_received_result = self._crawler.signals.send_catch_log(
    signal=signals.bytes_received,
    data=bodyBytes,
    request=self._request,
    spider=self._crawler.spider,
)
for handler, result in bytes_received_result:
    if isinstance(result, Failure) and isinstance(result.value, StopDownload):
        logger.debug("Download stopped for %(request)s from signal handler %(handler)s",
                     {"request": self._request, "handler": handler.__qualname__})
        self.transport._producer.loseConnection()
        failure = result if result.value.fail else None
        self._finish_response(flags=["download_stopped"], failure=failure)

if self._maxsize and self._bytes_received > self._maxsize:
|
||||
logger.error("Received (%(bytes)s) bytes larger than download "
|
||||
"max size (%(maxsize)s) in request %(request)s.",
|
||||
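The signal dispatch added above is what makes per-chunk handlers possible. A hedged end-to-end sketch, assuming the documented bytes_received signal and StopDownload exception from this release:

    from scrapy import Spider, signals
    from scrapy.exceptions import StopDownload

    class HeadOnlySpider(Spider):
        name = "head_only"
        start_urls = ["https://example.com"]

        @classmethod
        def from_crawler(cls, crawler, *args, **kwargs):
            spider = super().from_crawler(crawler, *args, **kwargs)
            crawler.signals.connect(spider.on_bytes_received, signal=signals.bytes_received)
            return spider

        def on_bytes_received(self, data, request, spider):
            # Stop after the first chunk; fail=False routes the partial response
            # to the regular callback with a "download_stopped" flag.
            raise StopDownload(fail=False)

        def parse(self, response):
            self.logger.info("Kept %d bytes of %s", len(response.body), response.url)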
@ -498,18 +543,17 @@ class _ResponseReader(protocol.Protocol):
|
||||
if self._finished.called:
|
||||
return
|
||||
|
||||
body = self._bodybuf.getvalue()
|
||||
if reason.check(ResponseDone):
|
||||
self._finished.callback((self._txresponse, body, None, self._certificate))
|
||||
self._finish_response()
|
||||
return
|
||||
|
||||
if reason.check(PotentialDataLoss):
|
||||
self._finished.callback((self._txresponse, body, ['partial'], self._certificate))
|
||||
self._finish_response(flags=["partial"])
|
||||
return
|
||||
|
||||
if reason.check(ResponseFailed) and any(r.check(_DataLoss) for r in reason.value.reasons):
|
||||
if not self._fail_on_dataloss:
|
||||
self._finished.callback((self._txresponse, body, ['dataloss'], self._certificate))
|
||||
self._finish_response(flags=["dataloss"])
|
||||
return
|
||||
|
||||
elif not self._fail_on_dataloss_warned:
|
||||
|
@ -105,6 +105,7 @@ class S3DownloadHandler:
|
||||
key=unquote(p.path),
|
||||
query_args=unquote(p.query),
|
||||
headers=request.headers,
|
||||
data=request.body)
|
||||
data=request.body,
|
||||
)
|
||||
request = request.replace(url=url, headers=signed_headers)
|
||||
return self._download_http(request, spider)
|
||||
|
@ -35,38 +35,45 @@ class DownloaderMiddlewareManager(MiddlewareManager):
|
||||
for method in self.methods['process_request']:
|
||||
response = yield deferred_from_coro(method(request=request, spider=spider))
|
||||
if response is not None and not isinstance(response, (Response, Request)):
|
||||
raise _InvalidOutput('Middleware %s.process_request must return None, Response or Request, got %s' % \
|
||||
(method.__self__.__class__.__name__, response.__class__.__name__))
|
||||
raise _InvalidOutput(
|
||||
"Middleware %s.process_request must return None, Response or Request, got %s"
|
||||
% (method.__self__.__class__.__name__, response.__class__.__name__)
|
||||
)
|
||||
if response:
|
||||
defer.returnValue(response)
|
||||
defer.returnValue((yield download_func(request=request, spider=spider)))
|
||||
return response
|
||||
return (yield download_func(request=request, spider=spider))
|
||||
|
||||
@defer.inlineCallbacks
|
||||
def process_response(response):
|
||||
assert response is not None, 'Received None in process_response'
|
||||
if isinstance(response, Request):
|
||||
defer.returnValue(response)
|
||||
if response is None:
|
||||
raise TypeError("Received None in process_response")
|
||||
elif isinstance(response, Request):
|
||||
return response
|
||||
|
||||
for method in self.methods['process_response']:
|
||||
response = yield deferred_from_coro(method(request=request, response=response, spider=spider))
|
||||
if not isinstance(response, (Response, Request)):
|
||||
raise _InvalidOutput('Middleware %s.process_response must return Response or Request, got %s' % \
|
||||
(method.__self__.__class__.__name__, type(response)))
|
||||
raise _InvalidOutput(
|
||||
"Middleware %s.process_response must return Response or Request, got %s"
|
||||
% (method.__self__.__class__.__name__, type(response))
|
||||
)
|
||||
if isinstance(response, Request):
|
||||
defer.returnValue(response)
|
||||
defer.returnValue(response)
|
||||
return response
|
||||
return response
|
||||
|
||||
@defer.inlineCallbacks
|
||||
def process_exception(_failure):
|
||||
exception = _failure.value
|
||||
def process_exception(failure):
|
||||
exception = failure.value
|
||||
for method in self.methods['process_exception']:
|
||||
response = yield deferred_from_coro(method(request=request, exception=exception, spider=spider))
|
||||
if response is not None and not isinstance(response, (Response, Request)):
|
||||
raise _InvalidOutput('Middleware %s.process_exception must return None, Response or Request, got %s' % \
|
||||
(method.__self__.__class__.__name__, type(response)))
|
||||
raise _InvalidOutput(
|
||||
"Middleware %s.process_exception must return None, Response or Request, got %s"
|
||||
% (method.__self__.__class__.__name__, type(response))
|
||||
)
|
||||
if response:
|
||||
defer.returnValue(response)
|
||||
defer.returnValue(_failure)
|
||||
return response
|
||||
return failure
|
||||
|
||||
deferred = mustbe_deferred(process_request, request)
|
||||
deferred.addErrback(process_exception)
|
||||
|
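A minimal downloader-middleware sketch illustrating the contract the checks above enforce: process_request may return None, a Response or a Request, and process_response must return a Response or a Request (the module path in the comment is hypothetical):

    from scrapy.http import Response

    class ShortCircuitMiddleware:
        # enable via DOWNLOADER_MIDDLEWARES = {"myproject.middlewares.ShortCircuitMiddleware": 100}

        def process_request(self, request, spider):
            if request.meta.get("skip_download"):
                # Returning a Response short-circuits the actual download.
                return Response(url=request.url, status=200, body=b"")
            return None  # continue with the remaining middlewares and the download

        def process_response(self, request, response, spider):
            return response  # never None, otherwise _InvalidOutput is raised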
@ -14,13 +14,12 @@ from scrapy.responsetypes import responsetypes
|
||||
def _parsed_url_args(parsed):
|
||||
# Assume parsed is urlparse-d from Request.url,
|
||||
# which was passed via safe_url_string and is ascii-only.
|
||||
b = lambda s: to_bytes(s, encoding='ascii')
|
||||
path = urlunparse(('', '', parsed.path or '/', parsed.params, parsed.query, ''))
|
||||
path = b(path)
|
||||
host = b(parsed.hostname)
|
||||
path = to_bytes(path, encoding="ascii")
|
||||
host = to_bytes(parsed.hostname, encoding="ascii")
|
||||
port = parsed.port
|
||||
scheme = b(parsed.scheme)
|
||||
netloc = b(parsed.netloc)
|
||||
scheme = to_bytes(parsed.scheme, encoding="ascii")
|
||||
netloc = to_bytes(parsed.netloc, encoding="ascii")
|
||||
if port is None:
|
||||
port = 443 if scheme == b'https' else 80
|
||||
return scheme, netloc, host, port, path
|
||||
@ -89,8 +88,8 @@ class ScrapyHTTPPageGetter(HTTPClient):
|
||||
self.transport.stopProducing()
|
||||
|
||||
self.factory.noPage(
|
||||
defer.TimeoutError("Getting %s took longer than %s seconds." %
|
||||
(self.factory.url, self.factory.timeout)))
|
||||
defer.TimeoutError("Getting %s took longer than %s seconds."
|
||||
% (self.factory.url, self.factory.timeout)))
|
||||
|
||||
|
||||
class ScrapyHTTPClientFactory(HTTPClientFactory):
|
||||
|
@ -73,7 +73,8 @@ class ExecutionEngine:
|
||||
@defer.inlineCallbacks
|
||||
def start(self):
|
||||
"""Start the execution engine"""
|
||||
assert not self.running, "Engine already running"
|
||||
if self.running:
|
||||
raise RuntimeError("Engine already running")
|
||||
self.start_time = time()
|
||||
yield self.signals.send_catch_log_deferred(signal=signals.engine_started)
|
||||
self.running = True
|
||||
@ -82,7 +83,8 @@ class ExecutionEngine:
|
||||
|
||||
def stop(self):
|
||||
"""Stop the execution engine gracefully"""
|
||||
assert self.running, "Engine not running"
|
||||
if not self.running:
|
||||
raise RuntimeError("Engine not running")
|
||||
self.running = False
|
||||
dfd = self._close_all_spiders()
|
||||
return dfd.addBoth(lambda _: self._finish_stopping_engine())
|
||||
@ -165,7 +167,11 @@ class ExecutionEngine:
|
||||
return d
|
||||
|
||||
def _handle_downloader_output(self, response, request, spider):
|
||||
assert isinstance(response, (Request, Response, Failure)), response
|
||||
if not isinstance(response, (Request, Response, Failure)):
|
||||
raise TypeError(
|
||||
"Incorrect type: expected Request, Response or Failure, got %s: %r"
|
||||
% (type(response), response)
|
||||
)
|
||||
# downloader middleware can return requests (for example, redirects)
|
||||
if isinstance(response, Request):
|
||||
self.crawl(response, spider)
|
||||
@ -205,17 +211,15 @@ class ExecutionEngine:
|
||||
return not bool(self.slot)
|
||||
|
||||
def crawl(self, request, spider):
|
||||
assert spider in self.open_spiders, \
|
||||
"Spider %r not opened when crawling: %s" % (spider.name, request)
|
||||
if spider not in self.open_spiders:
|
||||
raise RuntimeError("Spider %r not opened when crawling: %s" % (spider.name, request))
|
||||
self.schedule(request, spider)
|
||||
self.slot.nextcall.schedule()
|
||||
|
||||
def schedule(self, request, spider):
|
||||
self.signals.send_catch_log(signal=signals.request_scheduled,
|
||||
request=request, spider=spider)
|
||||
self.signals.send_catch_log(signals.request_scheduled, request=request, spider=spider)
|
||||
if not self.slot.scheduler.enqueue_request(request):
|
||||
self.signals.send_catch_log(signal=signals.request_dropped,
|
||||
request=request, spider=spider)
|
||||
self.signals.send_catch_log(signals.request_dropped, request=request, spider=spider)
|
||||
|
||||
def download(self, request, spider):
|
||||
d = self._download(request, spider)
|
||||
@ -224,21 +228,24 @@ class ExecutionEngine:
|
||||
|
||||
def _downloaded(self, response, slot, request, spider):
|
||||
slot.remove_request(request)
|
||||
return self.download(response, spider) \
|
||||
if isinstance(response, Request) else response
|
||||
return self.download(response, spider) if isinstance(response, Request) else response
|
||||
|
||||
def _download(self, request, spider):
|
||||
slot = self.slot
|
||||
slot.add_request(request)
|
||||
|
||||
def _on_success(response):
|
||||
assert isinstance(response, (Response, Request))
|
||||
if not isinstance(response, (Response, Request)):
|
||||
raise TypeError(
|
||||
"Incorrect type: expected Response or Request, got %s: %r"
|
||||
% (type(response), response)
|
||||
)
|
||||
if isinstance(response, Response):
|
||||
response.request = request # tie request to response received
|
||||
logkws = self.logformatter.crawled(request, response, spider)
|
||||
if logkws is not None:
|
||||
logger.log(*logformatter_adapter(logkws), extra={'spider': spider})
|
||||
self.signals.send_catch_log(signal=signals.response_received,
|
||||
self.signals.send_catch_log(signals.response_received,
|
||||
response=response, request=request, spider=spider)
|
||||
return response
|
||||
|
||||
@ -253,8 +260,8 @@ class ExecutionEngine:
|
||||
|
||||
@defer.inlineCallbacks
|
||||
def open_spider(self, spider, start_requests=(), close_if_idle=True):
|
||||
assert self.has_capacity(), "No free spider slot when opening %r" % \
|
||||
spider.name
|
||||
if not self.has_capacity():
|
||||
raise RuntimeError("No free spider slot when opening %r" % spider.name)
|
||||
logger.info("Spider opened", extra={'spider': spider})
|
||||
nextcall = CallLaterOnce(self._next_request, spider)
|
||||
scheduler = self.scheduler_cls.from_crawler(self.crawler)
|
||||
@ -277,10 +284,8 @@ class ExecutionEngine:
|
||||
next loop and this function is guaranteed to be called (at least) once
|
||||
again for this spider.
|
||||
"""
|
||||
res = self.signals.send_catch_log(signal=signals.spider_idle, \
|
||||
spider=spider, dont_log=DontCloseSpider)
|
||||
if any(isinstance(x, Failure) and isinstance(x.value, DontCloseSpider) \
|
||||
for _, x in res):
|
||||
res = self.signals.send_catch_log(signals.spider_idle, spider=spider, dont_log=DontCloseSpider)
|
||||
if any(isinstance(x, Failure) and isinstance(x.value, DontCloseSpider) for _, x in res):
|
||||
return
|
||||
|
||||
if self.spider_is_idle(spider):
|
||||
|
@ -4,18 +4,18 @@ extracts information from them"""
|
||||
import logging
|
||||
from collections import deque
|
||||
|
||||
from twisted.python.failure import Failure
|
||||
from itemadapter import is_item
|
||||
from twisted.internet import defer
|
||||
from twisted.python.failure import Failure
|
||||
|
||||
from scrapy.utils.defer import defer_result, defer_succeed, parallel, iter_errback
|
||||
from scrapy.utils.spider import iterate_spider_output
|
||||
from scrapy.utils.misc import load_object, warn_on_generator_with_return_value
|
||||
from scrapy.utils.log import logformatter_adapter, failure_to_exc_info
|
||||
from scrapy.exceptions import CloseSpider, DropItem, IgnoreRequest
|
||||
from scrapy import signals
|
||||
from scrapy.http import Request, Response
|
||||
from scrapy.item import BaseItem
|
||||
from scrapy.core.spidermw import SpiderMiddlewareManager
|
||||
from scrapy.exceptions import CloseSpider, DropItem, IgnoreRequest
|
||||
from scrapy.http import Request, Response
|
||||
from scrapy.utils.defer import defer_result, defer_succeed, iter_errback, parallel
|
||||
from scrapy.utils.log import failure_to_exc_info, logformatter_adapter
|
||||
from scrapy.utils.misc import load_object, warn_on_generator_with_return_value
|
||||
from scrapy.utils.spider import iterate_spider_output
|
||||
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
@ -123,7 +123,11 @@ class Scraper:
|
||||
def _scrape(self, response, request, spider):
|
||||
"""Handle the downloaded response or failure through the spider
|
||||
callback/errback"""
|
||||
assert isinstance(response, (Response, Failure))
|
||||
if not isinstance(response, (Response, Failure)):
|
||||
raise TypeError(
|
||||
"Incorrect type: expected Response or Failure, got %s: %r"
|
||||
% (type(response), response)
|
||||
)
|
||||
|
||||
dfd = self._scrape2(response, request, spider) # returns spider's processed output
|
||||
dfd.addErrback(self.handle_spider_error, request, response, spider)
|
||||
@ -187,7 +191,7 @@ class Scraper:
|
||||
"""
|
||||
if isinstance(output, Request):
|
||||
self.crawler.engine.crawl(request=output, spider=spider)
|
||||
elif isinstance(output, (BaseItem, dict)):
|
||||
elif is_item(output):
|
||||
self.slot.itemproc_size += 1
|
||||
dfd = self.itemproc.process_item(output, spider)
|
||||
dfd.addBoth(self._itemproc_finished, output, response, spider)
|
||||
@ -196,10 +200,11 @@ class Scraper:
|
||||
pass
|
||||
else:
|
||||
typename = type(output).__name__
|
||||
logger.error('Spider must return Request, BaseItem, dict or None, '
|
||||
'got %(typename)r in %(request)s',
|
||||
logger.error(
|
||||
'Spider must return request, item, or None, got %(typename)r in %(request)s',
|
||||
{'request': request, 'typename': typename},
|
||||
extra={'spider': spider})
|
||||
extra={'spider': spider},
|
||||
)
|
||||
|
||||
def _log_download_errors(self, spider_failure, download_failure, request, spider):
|
||||
"""Log and silence errors that come from the engine (typically download
|
||||
|
@ -19,7 +19,7 @@ def _isiterable(possible_iterator):
|
||||
|
||||
|
||||
def _fname(f):
|
||||
return "%s.%s".format(
|
||||
return "{}.{}".format(
|
||||
f.__self__.__class__.__name__,
|
||||
f.__func__.__name__
|
||||
)
|
||||
|
@ -78,7 +78,8 @@ class Crawler:
|
||||
|
||||
@defer.inlineCallbacks
|
||||
def crawl(self, *args, **kwargs):
|
||||
assert not self.crawling, "Crawling already taking place"
|
||||
if self.crawling:
|
||||
raise RuntimeError("Crawling already taking place")
|
||||
self.crawling = True
|
||||
|
||||
try:
|
||||
|
@ -1,4 +1,3 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
import re
|
||||
import logging
|
||||
|
||||
|
@ -29,8 +29,7 @@ class CookiesMiddleware:
|
||||
|
||||
cookiejarkey = request.meta.get("cookiejar")
|
||||
jar = self.jars[cookiejarkey]
|
||||
cookies = self._get_request_cookies(jar, request)
|
||||
for cookie in cookies:
|
||||
for cookie in self._get_request_cookies(jar, request):
|
||||
jar.set_cookie_if_ok(cookie, request)
|
||||
|
||||
# set Cookie header
|
||||
@ -68,28 +67,65 @@ class CookiesMiddleware:
|
||||
msg = "Received cookies from: {}\n{}".format(response, cookies)
|
||||
logger.debug(msg, extra={'spider': spider})
|
||||
|
||||
def _format_cookie(self, cookie):
|
||||
# build cookie string
|
||||
cookie_str = '%s=%s' % (cookie['name'], cookie['value'])
|
||||
|
||||
if cookie.get('path', None):
|
||||
cookie_str += '; Path=%s' % cookie['path']
|
||||
if cookie.get('domain', None):
|
||||
cookie_str += '; Domain=%s' % cookie['domain']
|
||||
def _format_cookie(self, cookie, request):
|
||||
"""
|
||||
Given a dict consisting of cookie components, return its string representation.
|
||||
Decode from bytes if necessary.
|
||||
"""
|
||||
decoded = {}
|
||||
for key in ("name", "value", "path", "domain"):
|
||||
if not cookie.get(key):
|
||||
if key in ("name", "value"):
|
||||
msg = "Invalid cookie found in request {}: {} ('{}' is missing)"
|
||||
logger.warning(msg.format(request, cookie, key))
|
||||
return
|
||||
continue
|
||||
if isinstance(cookie[key], str):
|
||||
decoded[key] = cookie[key]
|
||||
else:
|
||||
try:
|
||||
decoded[key] = cookie[key].decode("utf8")
|
||||
except UnicodeDecodeError:
|
||||
logger.warning("Non UTF-8 encoded cookie found in request %s: %s",
|
||||
request, cookie)
|
||||
decoded[key] = cookie[key].decode("latin1", errors="replace")
|
||||
|
||||
cookie_str = "{}={}".format(decoded.pop("name"), decoded.pop("value"))
|
||||
for key, value in decoded.items(): # path, domain
|
||||
cookie_str += "; {}={}".format(key.capitalize(), value)
|
||||
return cookie_str
|
||||
|
||||
def _get_request_cookies(self, jar, request):
|
||||
if isinstance(request.cookies, dict):
|
||||
cookie_list = [
|
||||
{'name': k, 'value': v}
|
||||
for k, v in request.cookies.items()
|
||||
]
|
||||
else:
|
||||
cookie_list = request.cookies
|
||||
|
||||
cookies = [self._format_cookie(x) for x in cookie_list]
|
||||
headers = {'Set-Cookie': cookies}
|
||||
response = Response(request.url, headers=headers)
|
||||
|
||||
"""
|
||||
Extract cookies from a Request. Values from the `Request.cookies` attribute
|
||||
take precedence over values from the `Cookie` request header.
|
||||
"""
|
||||
def get_cookies_from_header(jar, request):
|
||||
cookie_header = request.headers.get("Cookie")
|
||||
if not cookie_header:
|
||||
return []
|
||||
cookie_gen_bytes = (s.strip() for s in cookie_header.split(b";"))
|
||||
cookie_list_unicode = []
|
||||
for cookie_bytes in cookie_gen_bytes:
|
||||
try:
|
||||
cookie_unicode = cookie_bytes.decode("utf8")
|
||||
except UnicodeDecodeError:
|
||||
logger.warning("Non UTF-8 encoded cookie found in request %s: %s",
|
||||
request, cookie_bytes)
|
||||
cookie_unicode = cookie_bytes.decode("latin1", errors="replace")
|
||||
cookie_list_unicode.append(cookie_unicode)
|
||||
response = Response(request.url, headers={"Set-Cookie": cookie_list_unicode})
|
||||
return jar.make_cookies(response, request)
|
||||
|
||||
def get_cookies_from_attribute(jar, request):
|
||||
if not request.cookies:
|
||||
return []
|
||||
elif isinstance(request.cookies, dict):
|
||||
cookies = ({"name": k, "value": v} for k, v in request.cookies.items())
|
||||
else:
|
||||
cookies = request.cookies
|
||||
formatted = filter(None, (self._format_cookie(c, request) for c in cookies))
|
||||
response = Response(request.url, headers={"Set-Cookie": formatted})
|
||||
return jar.make_cookies(response, request)
|
||||
|
||||
return get_cookies_from_header(jar, request) + get_cookies_from_attribute(jar, request)
|
||||
|
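A hedged illustration of the merge implemented above: cookies may come from the Request.cookies attribute or from a raw Cookie header, and per the docstring the attribute wins on conflicts:

    from scrapy import Request

    request = Request(
        "https://example.com",
        cookies={"currency": "EUR"},                     # parsed by get_cookies_from_attribute()
        headers={"Cookie": "currency=USD; theme=dark"},  # parsed by get_cookies_from_header()
    )
    # After CookiesMiddleware.process_request, the jar holds currency=EUR and theme=dark.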
@ -60,11 +60,14 @@ class RedirectMiddleware(BaseRedirectMiddleware):
|
||||
Handle redirection of requests based on response status
|
||||
and meta-refresh html tag.
|
||||
"""
|
||||
|
||||
def process_response(self, request, response, spider):
|
||||
if (request.meta.get('dont_redirect', False) or
|
||||
response.status in getattr(spider, 'handle_httpstatus_list', []) or
|
||||
response.status in request.meta.get('handle_httpstatus_list', []) or
|
||||
request.meta.get('handle_httpstatus_all', False)):
|
||||
if (
|
||||
request.meta.get('dont_redirect', False)
|
||||
or response.status in getattr(spider, 'handle_httpstatus_list', [])
|
||||
or response.status in request.meta.get('handle_httpstatus_list', [])
|
||||
or request.meta.get('handle_httpstatus_all', False)
|
||||
):
|
||||
return response
|
||||
|
||||
allowed_status = (301, 302, 303, 307, 308)
|
||||
|
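The per-request opt-outs checked above are driven by request meta; a short sketch (URL hypothetical):

    import scrapy

    class NoRedirectSpider(scrapy.Spider):
        name = "no_redirect"

        def start_requests(self):
            yield scrapy.Request(
                "https://example.com/old-path",
                meta={
                    "dont_redirect": True,                 # keep the 3xx response itself
                    "handle_httpstatus_list": [301, 302],  # let the callback see these statuses
                },
                callback=self.parse_redirect,
            )

        def parse_redirect(self, response):
            self.logger.info("Got %s -> %s", response.status, response.headers.get(b"Location"))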
@ -12,9 +12,15 @@ once the spider has finished crawling all regular (non failed) pages.
|
||||
import logging
|
||||
|
||||
from twisted.internet import defer
|
||||
from twisted.internet.error import TimeoutError, DNSLookupError, \
|
||||
ConnectionRefusedError, ConnectionDone, ConnectError, \
|
||||
ConnectionLost, TCPTimedOutError
|
||||
from twisted.internet.error import (
|
||||
ConnectError,
|
||||
ConnectionDone,
|
||||
ConnectionLost,
|
||||
ConnectionRefusedError,
|
||||
DNSLookupError,
|
||||
TCPTimedOutError,
|
||||
TimeoutError,
|
||||
)
|
||||
from twisted.web.client import ResponseFailed
|
||||
|
||||
from scrapy.exceptions import NotConfigured
|
||||
|
@ -61,7 +61,7 @@ class RFPDupeFilter(BaseDupeFilter):
|
||||
def log(self, request, spider):
|
||||
if self.debug:
|
||||
msg = "Filtered duplicate request: %(request)s (referer: %(referer)s)"
|
||||
args = {'request': request, 'referer': referer_str(request) }
|
||||
args = {'request': request, 'referer': referer_str(request)}
|
||||
self.logger.debug(msg, args, extra={'spider': spider})
|
||||
elif self.logdupes:
|
||||
msg = ("Filtered duplicate request: %(request)s"
|
||||
|
@ -41,6 +41,18 @@ class CloseSpider(Exception):
|
||||
self.reason = reason


class StopDownload(Exception):
    """
    Stop the download of the body for a given response.
    The 'fail' boolean parameter indicates whether or not the resulting partial response
    should be handled by the request errback. Note that 'fail' is a keyword-only argument.
    """

    def __init__(self, *, fail=True):
        super().__init__()
        self.fail = fail
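With the default fail=True, the partial response travels to the request errback wrapped in a Failure; a hedged sketch of reading it back (the .response attribute is the one attached by the HTTP/1.1 download handler earlier in this diff):

    from scrapy.exceptions import StopDownload

    def errback(self, failure):
        # assumed to be registered via Request(..., errback=self.errback)
        if failure.check(StopDownload):
            partial = failure.value.response  # Response built from the bytes received so far
            self.logger.info("Stopped early with %d bytes", len(partial.body))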
# Items
|
||||
|
||||
|
||||
@ -59,6 +71,7 @@ class NotSupported(Exception):
|
||||
|
||||
class UsageError(Exception):
|
||||
"""To indicate a command-line usage error"""
|
||||
|
||||
def __init__(self, *a, **kw):
|
||||
self.print_help = kw.pop('print_help', True)
|
||||
super(UsageError, self).__init__(*a, **kw)
|
||||
|
@ -4,16 +4,18 @@ Item Exporters are used to export/serialize items into different formats.
|
||||
|
||||
import csv
|
||||
import io
|
||||
import pprint
|
||||
import marshal
|
||||
import warnings
|
||||
import pickle
|
||||
import pprint
|
||||
import warnings
|
||||
from xml.sax.saxutils import XMLGenerator
|
||||
|
||||
from scrapy.utils.serialize import ScrapyJSONEncoder
|
||||
from scrapy.utils.python import to_bytes, to_unicode, is_listlike
|
||||
from scrapy.item import BaseItem
|
||||
from itemadapter import is_item, ItemAdapter
|
||||
|
||||
from scrapy.exceptions import ScrapyDeprecationWarning
|
||||
from scrapy.item import _BaseItem
|
||||
from scrapy.utils.python import is_listlike, to_bytes, to_unicode
|
||||
from scrapy.utils.serialize import ScrapyJSONEncoder
|
||||
|
||||
|
||||
__all__ = ['BaseItemExporter', 'PprintItemExporter', 'PickleItemExporter',
|
||||
@ -56,11 +58,14 @@ class BaseItemExporter:
|
||||
"""Return the fields to export as an iterable of tuples
|
||||
(name, serialized_value)
|
||||
"""
|
||||
item = ItemAdapter(item)
|
||||
|
||||
if include_empty is None:
|
||||
include_empty = self.export_empty_fields
|
||||
|
||||
if self.fields_to_export is None:
|
||||
if include_empty and not isinstance(item, dict):
|
||||
field_iter = item.fields.keys()
|
||||
if include_empty:
|
||||
field_iter = item.field_names()
|
||||
else:
|
||||
field_iter = item.keys()
|
||||
else:
|
||||
@ -71,8 +76,8 @@ class BaseItemExporter:
|
||||
|
||||
for field_name in field_iter:
|
||||
if field_name in item:
|
||||
field = {} if isinstance(item, dict) else item.fields[field_name]
|
||||
value = self.serialize_field(field, field_name, item[field_name])
|
||||
field_meta = item.get_field_meta(field_name)
|
||||
value = self.serialize_field(field_meta, field_name, item[field_name])
|
||||
else:
|
||||
value = default_value
|
||||
|
||||
@ -250,7 +255,7 @@ class CsvItemExporter(BaseItemExporter):
|
||||
|
||||
class PickleItemExporter(BaseItemExporter):
|
||||
|
||||
def __init__(self, file, protocol=2, **kwargs):
|
||||
def __init__(self, file, protocol=4, **kwargs):
|
||||
super().__init__(**kwargs)
|
||||
self.file = file
|
||||
self.protocol = protocol
|
||||
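A short usage sketch for the exporter whose default pickle protocol changes above; consumers that must stay compatible with very old unpicklers can still pass protocol=2 explicitly:

    from scrapy.exporters import PickleItemExporter

    with open("items.pickle", "wb") as f:
        exporter = PickleItemExporter(f)      # pickle protocol 4 by default now
        exporter.start_exporting()
        exporter.export_item({"name": "example", "price": 42})
        exporter.finish_exporting()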
@ -297,6 +302,7 @@ class PythonItemExporter(BaseItemExporter):
|
||||
|
||||
.. _msgpack: https://pypi.org/project/msgpack/
|
||||
"""
|
||||
|
||||
def _configure(self, options, dont_fail=False):
|
||||
self.binary = options.pop('binary', True)
|
||||
super(PythonItemExporter, self)._configure(options, dont_fail)
|
||||
@ -312,24 +318,24 @@ class PythonItemExporter(BaseItemExporter):
|
||||
return serializer(value)
|
||||
|
||||
def _serialize_value(self, value):
|
||||
if isinstance(value, BaseItem):
|
||||
if isinstance(value, _BaseItem):
|
||||
return self.export_item(value)
|
||||
if isinstance(value, dict):
|
||||
return dict(self._serialize_dict(value))
|
||||
if is_listlike(value):
|
||||
elif is_item(value):
|
||||
return dict(self._serialize_item(value))
|
||||
elif is_listlike(value):
|
||||
return [self._serialize_value(v) for v in value]
|
||||
encode_func = to_bytes if self.binary else to_unicode
|
||||
if isinstance(value, (str, bytes)):
|
||||
return encode_func(value, encoding=self.encoding)
|
||||
return value
|
||||
|
||||
def _serialize_dict(self, value):
|
||||
for key, val in value.items():
|
||||
def _serialize_item(self, item):
|
||||
for key, value in ItemAdapter(item).items():
|
||||
key = to_bytes(key) if self.binary else key
|
||||
yield key, self._serialize_value(val)
|
||||
yield key, self._serialize_value(value)
|
||||
|
||||
def export_item(self, item):
|
||||
result = dict(self._get_serialized_fields(item))
|
||||
if self.binary:
|
||||
result = dict(self._serialize_dict(result))
|
||||
result = dict(self._serialize_item(result))
|
||||
return result
|
||||
|
@ -270,18 +270,29 @@ class FeedExporter:
|
||||
if not slot.itemcount and not slot.store_empty:
|
||||
# We need to call slot.storage.store nonetheless to get the file
|
||||
# properly closed.
|
||||
return defer.maybeDeferred(slot.storage.store, slot.file)
|
||||
d = defer.maybeDeferred(slot.storage.store, slot.file)
|
||||
deferred_list.append(d)
|
||||
continue
|
||||
slot.finish_exporting()
|
||||
logfmt = "%s %%(format)s feed (%%(itemcount)d items) in: %%(uri)s"
|
||||
log_args = {'format': slot.format,
|
||||
'itemcount': slot.itemcount,
|
||||
'uri': slot.uri}
|
||||
d = defer.maybeDeferred(slot.storage.store, slot.file)
|
||||
d.addCallback(lambda _: logger.info(logfmt % "Stored", log_args,
|
||||
extra={'spider': spider}))
|
||||
d.addErrback(lambda f: logger.error(logfmt % "Error storing", log_args,
|
||||
exc_info=failure_to_exc_info(f),
|
||||
extra={'spider': spider}))
|
||||
|
||||
# Use `largs=log_args` to copy log_args into function's scope
|
||||
# instead of using `log_args` from the outer scope
|
||||
d.addCallback(
|
||||
lambda _, largs=log_args: logger.info(
|
||||
logfmt % "Stored", largs, extra={'spider': spider}
|
||||
)
|
||||
)
|
||||
d.addErrback(
|
||||
lambda f, largs=log_args: logger.error(
|
||||
logfmt % "Error storing", largs,
|
||||
exc_info=failure_to_exc_info(f), extra={'spider': spider}
|
||||
)
|
||||
)
|
||||
deferred_list.append(d)
|
||||
return defer.DeferredList(deferred_list) if deferred_list else None
|
||||
|
||||
|
@ -46,9 +46,10 @@ class RFC2616Policy:
|
||||
def __init__(self, settings):
|
||||
self.always_store = settings.getbool('HTTPCACHE_ALWAYS_STORE')
|
||||
self.ignore_schemes = settings.getlist('HTTPCACHE_IGNORE_SCHEMES')
|
||||
self.ignore_response_cache_controls = [to_bytes(cc) for cc in
|
||||
settings.getlist('HTTPCACHE_IGNORE_RESPONSE_CACHE_CONTROLS')]
|
||||
self._cc_parsed = WeakKeyDictionary()
|
||||
self.ignore_response_cache_controls = [
|
||||
to_bytes(cc) for cc in settings.getlist('HTTPCACHE_IGNORE_RESPONSE_CACHE_CONTROLS')
|
||||
]
|
||||
|
||||
def _parse_cachecontrol(self, r):
|
||||
if r not in self._cc_parsed:
|
||||
@ -250,7 +251,7 @@ class DbmCacheStorage:
|
||||
'headers': dict(response.headers),
|
||||
'body': response.body,
|
||||
}
|
||||
self.db['%s_data' % key] = pickle.dumps(data, protocol=2)
|
||||
self.db['%s_data' % key] = pickle.dumps(data, protocol=4)
|
||||
self.db['%s_time' % key] = str(time())
|
||||
|
||||
def _read_data(self, spider, request):
|
||||
@ -317,7 +318,7 @@ class FilesystemCacheStorage:
|
||||
with self._open(os.path.join(rpath, 'meta'), 'wb') as f:
|
||||
f.write(to_bytes(repr(metadata)))
|
||||
with self._open(os.path.join(rpath, 'pickled_meta'), 'wb') as f:
|
||||
pickle.dump(metadata, f, protocol=2)
|
||||
pickle.dump(metadata, f, protocol=4)
|
||||
with self._open(os.path.join(rpath, 'response_headers'), 'wb') as f:
|
||||
f.write(headers_dict_to_raw(response.headers))
|
||||
with self._open(os.path.join(rpath, 'response_body'), 'wb') as f:
|
||||
|
@ -26,7 +26,7 @@ class SpiderState:
|
||||
def spider_closed(self, spider):
|
||||
if self.jobdir:
|
||||
with open(self.statefn, 'wb') as f:
|
||||
pickle.dump(spider.state, f, protocol=2)
|
||||
pickle.dump(spider.state, f, protocol=4)
|
||||
|
||||
def spider_opened(self, spider):
|
||||
if self.jobdir and os.path.exists(self.statefn):
|
||||
|
@ -76,8 +76,10 @@ class TelnetConsole(protocol.ServerFactory):
|
||||
"""An implementation of IPortal"""
|
||||
@defers
|
||||
def login(self_, credentials, mind, *interfaces):
|
||||
if not (credentials.username == self.username.encode('utf8') and
|
||||
credentials.checkPassword(self.password.encode('utf8'))):
|
||||
if not (
|
||||
credentials.username == self.username.encode('utf8')
|
||||
and credentials.checkPassword(self.password.encode('utf8'))
|
||||
):
|
||||
raise ValueError("Invalid credentials")
|
||||
|
||||
protocol = telnet.TelnetBootstrapProtocol(
|
||||
|
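The credentials checked above come from project settings; a hedged sketch using the documented telnet console setting names:

    # settings.py
    TELNETCONSOLE_USERNAME = "scrapy"
    TELNETCONSOLE_PASSWORD = "s3cret"  # if unset, a random password is printed in the log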
@ -24,7 +24,8 @@ class Request(object_ref):
|
||||
self.method = str(method).upper()
|
||||
self._set_url(url)
|
||||
self._set_body(body)
|
||||
assert isinstance(priority, int), "Request priority not an integer: %r" % priority
|
||||
if not isinstance(priority, int):
|
||||
raise TypeError("Request priority not an integer: %r" % priority)
|
||||
self.priority = priority
|
||||
|
||||
if callback is not None and not callable(callback):
|
||||
@ -129,6 +130,9 @@ class Request(object_ref):
|
||||
:class:`~scrapy.downloadermiddlewares.httpcompression.HttpCompressionMiddleware`,
|
||||
may modify the :class:`~scrapy.http.Request` object.
|
||||
|
||||
To translate a cURL command into a Scrapy request,
|
||||
you may use `curl2scrapy <https://michael-shub.github.io/curl2scrapy/>`_.
|
||||
|
||||
"""
|
||||
request_kwargs = curl_to_request_kwargs(curl_command, ignore_unknown_options)
|
||||
request_kwargs.update(kwargs)
|
||||
|
@ -178,12 +178,11 @@ def _get_clickable(clickdata, form):
|
||||
if the latter is given. If not, it returns the first
|
||||
clickable element found
|
||||
"""
|
||||
clickables = [
|
||||
el for el in form.xpath(
|
||||
clickables = list(form.xpath(
|
||||
'descendant::input[re:test(@type, "^(submit|image)$", "i")]'
|
||||
'|descendant::button[not(@type) or re:test(@type, "^submit$", "i")]',
|
||||
namespaces={"re": "http://exslt.org/regular-expressions"})
|
||||
]
|
||||
namespaces={"re": "http://exslt.org/regular-expressions"}
|
||||
))
|
||||
if not clickables:
|
||||
return
|
||||
|
||||
|
@ -17,7 +17,8 @@ from scrapy.utils.trackref import object_ref
|
||||
|
||||
class Response(object_ref):
|
||||
|
||||
def __init__(self, url, status=200, headers=None, body=b'', flags=None, request=None, certificate=None):
|
||||
def __init__(self, url, status=200, headers=None, body=b'', flags=None,
|
||||
request=None, certificate=None, ip_address=None):
|
||||
self.headers = Headers(headers or {})
|
||||
self.status = int(status)
|
||||
self._set_body(body)
|
||||
@ -25,6 +26,7 @@ class Response(object_ref):
|
||||
self.request = request
|
||||
self.flags = [] if flags is None else list(flags)
|
||||
self.certificate = certificate
|
||||
self.ip_address = ip_address
|
||||
|
||||
@property
|
||||
def cb_kwargs(self):
|
||||
@ -87,7 +89,8 @@ class Response(object_ref):
|
||||
"""Create a new Response with the same attributes except for those
|
||||
given new values.
|
||||
"""
|
||||
for x in ['url', 'status', 'headers', 'body', 'request', 'flags', 'certificate']:
|
||||
for x in ['url', 'status', 'headers', 'body',
|
||||
'request', 'flags', 'certificate', 'ip_address']:
|
||||
kwargs.setdefault(x, getattr(self, x))
|
||||
cls = kwargs.pop('cls', self.__class__)
|
||||
return cls(*args, **kwargs)
|
||||
|
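A hedged sketch of consuming the attribute threaded through here: ip_address is an ipaddress.IPv4Address/IPv6Address set by the download handler, and may be None for responses that never hit the network (e.g. cache hits):

    def parse(self, response):
        self.logger.info("%s was served from %s", response.url, response.ip_address)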
@ -5,6 +5,8 @@ discovering (through HTTP headers) to base Response class.
|
||||
See documentation in docs/topics/request-response.rst
|
||||
"""
|
||||
|
||||
import json
|
||||
import warnings
|
||||
from contextlib import suppress
|
||||
from typing import Generator
|
||||
from urllib.parse import urljoin
|
||||
@ -14,15 +16,19 @@ from w3lib.encoding import (html_body_declared_encoding, html_to_unicode,
|
||||
http_content_type_encoding, resolve_encoding)
|
||||
from w3lib.html import strip_html5_whitespace
|
||||
|
||||
from scrapy.exceptions import ScrapyDeprecationWarning
|
||||
from scrapy.http import Request
|
||||
from scrapy.http.response import Response
|
||||
from scrapy.utils.python import memoizemethod_noargs, to_unicode
|
||||
from scrapy.utils.response import get_base_url
|
||||
|
||||
_NONE = object()
|
||||
|
||||
|
||||
class TextResponse(Response):
|
||||
|
||||
_DEFAULT_ENCODING = 'ascii'
|
||||
_cached_decoded_json = _NONE
|
||||
|
||||
def __init__(self, *args, **kwargs):
|
||||
self._encoding = kwargs.pop('encoding', None)
|
||||
@ -61,8 +67,21 @@ class TextResponse(Response):
|
||||
|
||||
def body_as_unicode(self):
|
||||
"""Return body as unicode"""
|
||||
warnings.warn('Response.body_as_unicode() is deprecated, '
|
||||
'please use Response.text instead.',
|
||||
ScrapyDeprecationWarning, stacklevel=2)
|
||||
return self.text
|

def json(self):
    """
    .. versionadded:: 2.2

    Deserialize a JSON document to a Python object.
    """
    if self._cached_decoded_json is _NONE:
        self._cached_decoded_json = json.loads(self.text)
    return self._cached_decoded_json

@property
|
||||
def text(self):
|
||||
""" Body as unicode """
|
||||
|
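A short callback sketch using the new helper; it is only valid for JSON responses, and the parsed object is cached on the response as shown above (the payload shape is hypothetical):

    def parse(self, response):
        data = response.json()              # equivalent to json.loads(response.text), cached
        for user in data.get("users", []):
            yield {"name": user["name"]}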
@ -14,28 +14,39 @@ from scrapy.utils.deprecate import ScrapyDeprecationWarning
|
||||
from scrapy.utils.trackref import object_ref
|
||||
|
||||
|
||||
class BaseItem(object_ref):
|
||||
"""Base class for all scraped items.
|
||||
|
||||
In Scrapy, an object is considered an *item* if it is an instance of either
|
||||
:class:`BaseItem` or :class:`dict`. For example, when the output of a
|
||||
spider callback is evaluated, only instances of :class:`BaseItem` or
|
||||
:class:`dict` are passed to :ref:`item pipelines <topics-item-pipeline>`.
|
||||
|
||||
If you need instances of a custom class to be considered items by Scrapy,
|
||||
you must inherit from either :class:`BaseItem` or :class:`dict`.
|
||||
|
||||
Unlike instances of :class:`dict`, instances of :class:`BaseItem` may be
|
||||
:ref:`tracked <topics-leaks-trackrefs>` to debug memory leaks.
|
||||
class _BaseItem(object_ref):
|
||||
"""
|
||||
Temporary class used internally to avoid the deprecation
|
||||
warning raised by isinstance checks using BaseItem.
|
||||
"""
|
||||
pass
|
||||
|
||||
|
||||
class _BaseItemMeta(ABCMeta):
|
||||
def __instancecheck__(cls, instance):
|
||||
if cls is BaseItem:
|
||||
warn('scrapy.item.BaseItem is deprecated, please use scrapy.item.Item instead',
|
||||
ScrapyDeprecationWarning, stacklevel=2)
|
||||
return super().__instancecheck__(instance)
|
||||
|
||||
|
||||
class BaseItem(_BaseItem, metaclass=_BaseItemMeta):
|
||||
"""
|
||||
Deprecated, please use :class:`scrapy.item.Item` instead
|
||||
"""
|
||||
|
||||
def __new__(cls, *args, **kwargs):
|
||||
if issubclass(cls, BaseItem) and not issubclass(cls, (Item, DictItem)):
|
||||
warn('scrapy.item.BaseItem is deprecated, please use scrapy.item.Item instead',
|
||||
ScrapyDeprecationWarning, stacklevel=2)
|
||||
return super(BaseItem, cls).__new__(cls, *args, **kwargs)
|
||||
|
||||
|
||||
class Field(dict):
|
||||
"""Container of field metadata"""
|
||||
|
||||
|
||||
class ItemMeta(ABCMeta):
|
||||
class ItemMeta(_BaseItemMeta):
|
||||
"""Metaclass_ of :class:`Item` that handles field definitions.
|
||||
|
||||
.. _metaclass: https://realpython.com/python-metaclasses
|
||||
@ -68,8 +79,7 @@ class DictItem(MutableMapping, BaseItem):
|
||||
|
||||
def __new__(cls, *args, **kwargs):
|
||||
if issubclass(cls, DictItem) and not issubclass(cls, Item):
|
||||
warn('scrapy.item.DictItem is deprecated, please use '
|
||||
'scrapy.item.Item instead',
|
||||
warn('scrapy.item.DictItem is deprecated, please use scrapy.item.Item instead',
|
||||
ScrapyDeprecationWarning, stacklevel=2)
|
||||
return super(DictItem, cls).__new__(cls, *args, **kwargs)
|
||||
|
||||
@ -86,8 +96,7 @@ class DictItem(MutableMapping, BaseItem):
|
||||
if key in self.fields:
|
||||
self._values[key] = value
|
||||
else:
|
||||
raise KeyError("%s does not support field: %s" %
|
||||
(self.__class__.__name__, key))
|
||||
raise KeyError("%s does not support field: %s" % (self.__class__.__name__, key))
|
||||
|
||||
def __delitem__(self, key):
|
||||
del self._values[key]
|
||||
@ -99,8 +108,7 @@ class DictItem(MutableMapping, BaseItem):
|
||||
|
||||
def __setattr__(self, name, value):
|
||||
if not name.startswith('_'):
|
||||
raise AttributeError("Use item[%r] = %r to set field value" %
|
||||
(name, value))
|
||||
raise AttributeError("Use item[%r] = %r to set field value" % (name, value))
|
||||
super(DictItem, self).__setattr__(name, value)
|
||||
|
||||
def __len__(self):
|
||||
@ -121,12 +129,30 @@ class DictItem(MutableMapping, BaseItem):
|
||||
return self.__class__(self)
|
||||
|
||||
def deepcopy(self):
|
||||
"""Return a `deep copy`_ of this item.
|
||||
|
||||
.. _deep copy: https://docs.python.org/library/copy.html#copy.deepcopy
|
||||
"""Return a :func:`~copy.deepcopy` of this item.
|
||||
"""
|
||||
return deepcopy(self)
|
||||
|
||||
|
||||
class Item(DictItem, metaclass=ItemMeta):
|
||||
pass
|
||||
"""
|
||||
Base class for scraped items.
|
||||
|
||||
In Scrapy, an object is considered an ``item`` if it is an instance of either
|
||||
:class:`Item` or :class:`dict`, or any subclass. For example, when the output of a
|
||||
spider callback is evaluated, only instances of :class:`Item` or
|
||||
:class:`dict` are passed to :ref:`item pipelines <topics-item-pipeline>`.
|
||||
|
||||
If you need instances of a custom class to be considered items by Scrapy,
|
||||
you must inherit from either :class:`Item` or :class:`dict`.
|
||||
|
||||
Items must declare :class:`Field` attributes, which are processed and stored
|
||||
in the ``fields`` attribute. This restricts the set of allowed field names
|
||||
and prevents typos, raising ``KeyError`` when referring to undefined fields.
|
||||
Additionally, fields can be used to define metadata and control the way
|
||||
data is processed internally. Please refer to the :ref:`documentation
|
||||
about fields <topics-items-fields>` for additional information.
|
||||
|
||||
Unlike instances of :class:`dict`, instances of :class:`Item` may be
|
||||
:ref:`tracked <topics-leaks-trackrefs>` to debug memory leaks.
|
||||
"""
|
||||
|
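A minimal declaration matching the docstring above; field metadata such as a serializer is stored on the Field and picked up by exporters:

    import scrapy

    class Product(scrapy.Item):
        name = scrapy.Field()
        price = scrapy.Field(serializer=float)

    item = Product(name="widget", price="9.99")
    item["name"] = "gadget"   # assigning an undeclared field would raise KeyError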
@ -45,8 +45,14 @@ IGNORED_EXTENSIONS = [
|
||||
|
||||
|
||||
_re_type = type(re.compile("", 0))
|
||||
_matches = lambda url, regexs: any(r.search(url) for r in regexs)
|
||||
_is_valid_url = lambda url: url.split('://', 1)[0] in {'http', 'https', 'file', 'ftp'}
|
||||
|
||||
|
||||
def _matches(url, regexs):
|
||||
return any(r.search(url) for r in regexs)
|
||||
|
||||
|
||||
def _is_valid_url(url):
|
||||
return url.split('://', 1)[0] in {'http', 'https', 'file', 'ftp'}
|
||||
|
||||
|
||||
class FilteringLinkExtractor:
|
||||
@ -55,8 +61,7 @@ class FilteringLinkExtractor:
|
||||
|
||||
def __new__(cls, *args, **kwargs):
|
||||
from scrapy.linkextractors.lxmlhtml import LxmlLinkExtractor
|
||||
if (issubclass(cls, FilteringLinkExtractor) and
|
||||
not issubclass(cls, LxmlLinkExtractor)):
|
||||
if issubclass(cls, FilteringLinkExtractor) and not issubclass(cls, LxmlLinkExtractor):
|
||||
warn('scrapy.linkextractors.FilteringLinkExtractor is deprecated, '
|
||||
'please use scrapy.linkextractors.LinkExtractor instead',
|
||||
ScrapyDeprecationWarning, stacklevel=2)
|
||||
@ -128,4 +133,4 @@ class FilteringLinkExtractor:
|
||||
|
||||
|
||||
# Top-level imports
|
||||
from scrapy.linkextractors.lxmlhtml import LxmlLinkExtractor as LinkExtractor # noqa: F401
|
||||
from scrapy.linkextractors.lxmlhtml import LxmlLinkExtractor as LinkExtractor
|
||||
|
@ -1,6 +1,8 @@
|
||||
"""
|
||||
Link extractor based on lxml.html
|
||||
"""
|
||||
import operator
|
||||
from functools import partial
|
||||
from urllib.parse import urljoin
|
||||
|
||||
import lxml.etree as etree
|
||||
@ -8,10 +10,10 @@ from w3lib.html import strip_html5_whitespace
|
||||
from w3lib.url import canonicalize_url, safe_url_string
|
||||
|
||||
from scrapy.link import Link
|
||||
from scrapy.linkextractors import FilteringLinkExtractor
|
||||
from scrapy.utils.misc import arg_to_iter, rel_has_nofollow
|
||||
from scrapy.utils.python import unique as unique_list
|
||||
from scrapy.utils.response import get_base_url
|
||||
from scrapy.linkextractors import FilteringLinkExtractor
|
||||
|
||||
|
||||
# from lxml/src/lxml/html/__init__.py
|
||||
@ -27,19 +29,24 @@ def _nons(tag):
|
||||
return tag
|
||||
|
||||
|
||||
def _identity(x):
|
||||
return x
|
||||
|
||||
|
||||
def _canonicalize_link_url(link):
|
||||
return canonicalize_url(link.url, keep_fragments=True)
|
||||
|
||||
|
||||
class LxmlParserLinkExtractor:
|
||||
def __init__(self, tag="a", attr="href", process=None, unique=False,
|
||||
strip=True, canonicalized=False):
|
||||
self.scan_tag = tag if callable(tag) else lambda t: t == tag
|
||||
self.scan_attr = attr if callable(attr) else lambda a: a == attr
|
||||
self.process_attr = process if callable(process) else lambda v: v
|
||||
def __init__(
|
||||
self, tag="a", attr="href", process=None, unique=False, strip=True, canonicalized=False
|
||||
):
|
||||
self.scan_tag = tag if callable(tag) else partial(operator.eq, tag)
|
||||
self.scan_attr = attr if callable(attr) else partial(operator.eq, attr)
|
||||
self.process_attr = process if callable(process) else _identity
|
||||
self.unique = unique
|
||||
self.strip = strip
|
||||
if canonicalized:
|
||||
self.link_key = lambda link: link.url
|
||||
else:
|
||||
self.link_key = lambda link: canonicalize_url(link.url,
|
||||
keep_fragments=True)
|
||||
self.link_key = operator.attrgetter("url") if canonicalized else _canonicalize_link_url
|
||||
|
||||
def _iter_links(self, document):
|
||||
for el in document.iter(etree.Element):
|
||||
@ -93,27 +100,44 @@ class LxmlParserLinkExtractor:
|
||||
|
||||
class LxmlLinkExtractor(FilteringLinkExtractor):
|
||||
|
||||
def __init__(self, allow=(), deny=(), allow_domains=(), deny_domains=(), restrict_xpaths=(),
|
||||
tags=('a', 'area'), attrs=('href',), canonicalize=False,
|
||||
unique=True, process_value=None, deny_extensions=None, restrict_css=(),
|
||||
strip=True, restrict_text=None):
|
||||
def __init__(
|
||||
self,
|
||||
allow=(),
|
||||
deny=(),
|
||||
allow_domains=(),
|
||||
deny_domains=(),
|
||||
restrict_xpaths=(),
|
||||
tags=('a', 'area'),
|
||||
attrs=('href',),
|
||||
canonicalize=False,
|
||||
unique=True,
|
||||
process_value=None,
|
||||
deny_extensions=None,
|
||||
restrict_css=(),
|
||||
strip=True,
|
||||
restrict_text=None,
|
||||
):
|
||||
tags, attrs = set(arg_to_iter(tags)), set(arg_to_iter(attrs))
|
||||
tag_func = lambda x: x in tags
|
||||
attr_func = lambda x: x in attrs
|
||||
lx = LxmlParserLinkExtractor(
|
||||
tag=tag_func,
|
||||
attr=attr_func,
|
||||
tag=partial(operator.contains, tags),
|
||||
attr=partial(operator.contains, attrs),
|
||||
unique=unique,
|
||||
process=process_value,
|
||||
strip=strip,
|
||||
canonicalized=canonicalize
|
||||
)
|
||||
|
||||
super(LxmlLinkExtractor, self).__init__(lx, allow=allow, deny=deny,
|
||||
allow_domains=allow_domains, deny_domains=deny_domains,
|
||||
restrict_xpaths=restrict_xpaths, restrict_css=restrict_css,
|
||||
canonicalize=canonicalize, deny_extensions=deny_extensions,
|
||||
restrict_text=restrict_text)
|
||||
super(LxmlLinkExtractor, self).__init__(
|
||||
link_extractor=lx,
|
||||
allow=allow,
|
||||
deny=deny,
|
||||
allow_domains=allow_domains,
|
||||
deny_domains=deny_domains,
|
||||
restrict_xpaths=restrict_xpaths,
|
||||
restrict_css=restrict_css,
|
||||
canonicalize=canonicalize,
|
||||
deny_extensions=deny_extensions,
|
||||
restrict_text=restrict_text,
|
||||
)
|
||||
|
||||
def extract_links(self, response):
|
||||
"""Returns a list of :class:`~scrapy.link.Link` objects from the
|
||||
@ -126,9 +150,11 @@ class LxmlLinkExtractor(FilteringLinkExtractor):
|
||||
"""
|
||||
base_url = get_base_url(response)
|
||||
if self.restrict_xpaths:
|
||||
docs = [subdoc
|
||||
docs = [
|
||||
subdoc
|
||||
for x in self.restrict_xpaths
|
||||
for subdoc in response.xpath(x)]
|
||||
for subdoc in response.xpath(x)
|
||||
]
|
||||
else:
|
||||
docs = [response.selector]
|
||||
all_links = []
|
||||
|
@ -6,6 +6,8 @@ See documentation in docs/topics/loaders.rst
|
||||
from collections import defaultdict
|
||||
from contextlib import suppress
|
||||
|
||||
from itemadapter import ItemAdapter
|
||||
|
||||
from scrapy.item import Item
|
||||
from scrapy.loader.common import wrap_loader_context
|
||||
from scrapy.loader.processors import Identity
|
||||
@ -44,7 +46,7 @@ class ItemLoader:
|
||||
self._local_item = context['item'] = item
|
||||
self._local_values = defaultdict(list)
|
||||
# values from initial item
|
||||
for field_name, value in item.items():
|
||||
for field_name, value in ItemAdapter(item).items():
|
||||
self._values[field_name] += arg_to_iter(value)
|
||||
|
||||
@property
|
||||
@ -127,13 +129,12 @@ class ItemLoader:
|
||||
return value
|
||||
|
||||
def load_item(self):
|
||||
item = self.item
|
||||
adapter = ItemAdapter(self.item)
|
||||
for field_name in tuple(self._values):
|
||||
value = self.get_output_value(field_name)
|
||||
if value is not None:
|
||||
item[field_name] = value
|
||||
|
||||
return item
|
||||
adapter[field_name] = value
|
||||
return adapter.item
|
||||
|
||||
def get_output_value(self, field_name):
|
||||
proc = self.get_output_processor(field_name)
|
||||
@ -174,11 +175,8 @@ class ItemLoader:
|
||||
value, type(e).__name__, str(e)))
|
||||
|
||||
def _get_item_field_attr(self, field_name, key, default=None):
|
||||
if isinstance(self.item, Item):
|
||||
value = self.item.fields[field_name].get(key, default)
|
||||
else:
|
||||
value = default
|
||||
return value
|
||||
field_meta = ItemAdapter(self.item).get_field_meta(field_name)
|
||||
return field_meta.get(key, default)
|
||||
|
||||
def _check_selector_method(self):
|
||||
if self.selector is None:
|
||||
|
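Because load_item() now goes through ItemAdapter, loaders can populate item types other than scrapy.Item. A hedged sketch with a dataclass (requires the itemadapter dependency introduced here; the CSS selectors are hypothetical):

    from dataclasses import dataclass

    from scrapy.loader import ItemLoader
    from scrapy.loader.processors import TakeFirst

    @dataclass
    class Book:
        title: str = ""
        price: str = ""

    class BookLoader(ItemLoader):
        default_output_processor = TakeFirst()

    def parse(self, response):
        loader = BookLoader(item=Book(), response=response)
        loader.add_css("title", "h1::text")
        loader.add_css("price", ".price::text")
        yield loader.load_item()   # returns the populated Book instance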
@ -28,8 +28,10 @@ def _to_bytes_or_none(text):
|
||||
|
||||
|
||||
class MailSender:
|
||||
def __init__(self, smtphost='localhost', mailfrom='scrapy@localhost',
|
||||
smtpuser=None, smtppass=None, smtpport=25, smtptls=False, smtpssl=False, debug=False):
|
||||
def __init__(
|
||||
self, smtphost='localhost', mailfrom='scrapy@localhost', smtpuser=None,
|
||||
smtppass=None, smtpport=25, smtptls=False, smtpssl=False, debug=False
|
||||
):
|
||||
self.smtphost = smtphost
|
||||
self.smtpport = smtpport
|
||||
self.smtpuser = _to_bytes_or_none(smtpuser)
|
||||
@ -41,9 +43,15 @@ class MailSender:
|
||||
|
||||
@classmethod
|
||||
def from_settings(cls, settings):
|
||||
return cls(settings['MAIL_HOST'], settings['MAIL_FROM'], settings['MAIL_USER'],
|
||||
settings['MAIL_PASS'], settings.getint('MAIL_PORT'),
|
||||
settings.getbool('MAIL_TLS'), settings.getbool('MAIL_SSL'))
|
||||
return cls(
|
||||
smtphost=settings['MAIL_HOST'],
|
||||
mailfrom=settings['MAIL_FROM'],
|
||||
smtpuser=settings['MAIL_USER'],
|
||||
smtppass=settings['MAIL_PASS'],
|
||||
smtpport=settings.getint('MAIL_PORT'),
|
||||
smtptls=settings.getbool('MAIL_TLS'),
|
||||
smtpssl=settings.getbool('MAIL_SSL'),
|
||||
)
|
||||
|
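A hedged usage sketch for the factory made explicit above; the MAIL_* settings are the documented ones, and the crawler object is assumed to be in scope (e.g. inside an extension):

    from scrapy.mail import MailSender

    mailer = MailSender.from_settings(crawler.settings)
    mailer.send(
        to=["ops@example.com"],
        subject="Crawl finished",
        body="All done.",
    )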
||||
def send(self, to, subject, body, cc=None, attachs=(), mimetype='text/plain', charset=None, _callback=None):
|
||||
from twisted.internet import reactor
|
||||
@ -89,9 +97,12 @@ class MailSender:
|
||||
return
|
||||
|
||||
dfd = self._sendmail(rcpts, msg.as_string().encode(charset or 'utf-8'))
|
||||
dfd.addCallbacks(self._sent_ok, self._sent_failed,
|
||||
dfd.addCallbacks(
|
||||
callback=self._sent_ok,
|
||||
errback=self._sent_failed,
|
||||
callbackArgs=[to, cc, subject, len(attachs)],
|
||||
errbackArgs=[to, cc, subject, len(attachs)])
|
||||
errbackArgs=[to, cc, subject, len(attachs)],
|
||||
)
|
||||
reactor.addSystemEventTrigger('before', 'shutdown', lambda: dfd)
|
||||
return dfd
|
||||
|
||||
@ -115,9 +126,10 @@ class MailSender:
|
||||
from twisted.mail.smtp import ESMTPSenderFactory
|
||||
msg = BytesIO(msg)
|
||||
d = defer.Deferred()
|
||||
factory = ESMTPSenderFactory(self.smtpuser, self.smtppass, self.mailfrom, \
|
||||
to_addrs, msg, d, heloFallback=True, requireAuthentication=False, \
|
||||
requireTransportSecurity=self.smtptls)
|
||||
factory = ESMTPSenderFactory(
|
||||
self.smtpuser, self.smtppass, self.mailfrom, to_addrs, msg, d,
|
||||
heloFallback=True, requireAuthentication=False, requireTransportSecurity=self.smtptls,
|
||||
)
|
||||
factory.noisy = False
|
||||
|
||||
if self.smtpssl:
|
||||
|
scrapy/pipelines/files.py

@ -10,24 +10,26 @@ import mimetypes
import os
import time
from collections import defaultdict
from email.utils import parsedate_tz, mktime_tz
from contextlib import suppress
from email.utils import mktime_tz, parsedate_tz
from ftplib import FTP
from io import BytesIO
from urllib.parse import urlparse

from itemadapter import ItemAdapter
from twisted.internet import defer, threads

from scrapy.exceptions import IgnoreRequest, NotConfigured
from scrapy.http import Request
from scrapy.pipelines.media import MediaPipeline
from scrapy.settings import Settings
from scrapy.exceptions import NotConfigured, IgnoreRequest
from scrapy.http import Request
from scrapy.utils.misc import md5sum
from scrapy.utils.log import failure_to_exc_info
from scrapy.utils.python import to_bytes
from scrapy.utils.request import referer_str
from scrapy.utils.boto import is_botocore
from scrapy.utils.datatypes import CaselessDict
from scrapy.utils.ftp import ftp_store_file
from scrapy.utils.log import failure_to_exc_info
from scrapy.utils.misc import md5sum
from scrapy.utils.python import to_bytes
from scrapy.utils.request import referer_str


logger = logging.getLogger(__name__)
@ -83,8 +85,7 @@ class S3FilesStore:
AWS_USE_SSL = None
AWS_VERIFY = None

POLICY = 'private' # Overriden from settings.FILES_STORE_S3_ACL in
# FilesPipeline.from_settings.
POLICY = 'private' # Overriden from settings.FILES_STORE_S3_ACL in FilesPipeline.from_settings
HEADERS = {
'Cache-Control': 'max-age=172800',
}
@ -106,7 +107,8 @@ class S3FilesStore:
else:
from boto.s3.connection import S3Connection
self.S3Connection = S3Connection
assert uri.startswith('s3://')
if not uri.startswith("s3://"):
raise ValueError("Incorrect URI scheme in %s, expected 's3'" % uri)
self.bucket, self.prefix = uri[5:].split('/', 1)

def stat_file(self, path, info):
@ -229,6 +231,20 @@ class GCSFilesStore:
bucket, prefix = uri[5:].split('/', 1)
self.bucket = client.bucket(bucket)
self.prefix = prefix
permissions = self.bucket.test_iam_permissions(
['storage.objects.get', 'storage.objects.create']
)
if 'storage.objects.get' not in permissions:
logger.warning(
"No 'storage.objects.get' permission for GSC bucket %(bucket)s. "
"Checking if files are up to date will be impossible. Files will be downloaded every time.",
{'bucket': bucket}
)
if 'storage.objects.create' not in permissions:
logger.error(
"No 'storage.objects.create' permission for GSC bucket %(bucket)s. Saving files will be impossible!",
{'bucket': bucket}
)

def stat_file(self, path, info):
def _onsuccess(blob):
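The new permission probe above uses the google-cloud-storage IAM API. A hedged, standalone sketch of the same call (bucket and project names are placeholders, and application default credentials are assumed to be configured):

from google.cloud import storage

client = storage.Client(project='example-project')   # placeholder project
bucket = client.bucket('example-bucket')              # placeholder bucket
# test_iam_permissions() returns only the permissions the caller actually holds.
granted = bucket.test_iam_permissions(['storage.objects.get', 'storage.objects.create'])
can_stat = 'storage.objects.get' in granted       # needed to check whether files are up to date
can_store = 'storage.objects.create' in granted   # needed to save new files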
@ -266,7 +282,8 @@ class FTPFilesStore:
USE_ACTIVE_MODE = None

def __init__(self, uri):
assert uri.startswith('ftp://')
if not uri.startswith("ftp://"):
raise ValueError("Incorrect URI scheme in %s, expected 'ftp'" % uri)
u = urlparse(uri)
self.port = u.port
self.host = u.hostname
@ -417,7 +434,7 @@ class FilesPipeline(MediaPipeline):
self.inc_stats(info.spider, 'uptodate')

checksum = result.get('checksum', None)
return {'url': request.url, 'path': path, 'checksum': checksum}
return {'url': request.url, 'path': path, 'checksum': checksum, 'status': 'uptodate'}

path = self.file_path(request, info=info)
dfd = defer.maybeDeferred(self.store.stat_file, path, info)
@ -494,15 +511,16 @@ class FilesPipeline(MediaPipeline):
)
raise FileException(str(exc))

return {'url': request.url, 'path': path, 'checksum': checksum}
return {'url': request.url, 'path': path, 'checksum': checksum, 'status': status}

def inc_stats(self, spider, status):
spider.crawler.stats.inc_value('file_count', spider=spider)
spider.crawler.stats.inc_value('file_status_count/%s' % status, spider=spider)

### Overridable Interface
# Overridable Interface
def get_media_requests(self, item, info):
return [Request(x) for x in item.get(self.files_urls_field, [])]
urls = ItemAdapter(item).get(self.files_urls_field, [])
return [Request(u) for u in urls]

def file_downloaded(self, response, request, info):
path = self.file_path(request, response=response, info=info)
@ -513,8 +531,8 @@ class FilesPipeline(MediaPipeline):
return checksum

def item_completed(self, results, item, info):
if isinstance(item, dict) or self.files_result_field in item.fields:
item[self.files_result_field] = [x for ok, x in results if ok]
with suppress(KeyError):
ItemAdapter(item)[self.files_result_field] = [x for ok, x in results if ok]
return item

def file_path(self, request, response=None, info=None):
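Both get_media_requests and item_completed above move from direct dict/Item access to itemadapter. A brief hedged sketch of that access pattern (the item contents below are illustrative only):

from itemadapter import ItemAdapter, is_item

item = {'file_urls': ['http://example.com/report.pdf'], 'files': []}  # example item
adapter = ItemAdapter(item)              # wraps dicts, Item subclasses, dataclasses, attrs classes
urls = adapter.get('file_urls', [])      # mirrors the .get() calls in the hunks above
adapter['files'] = [{'url': urls[0], 'path': 'full/report.pdf'}]  # illustrative result entry
assert is_item(item)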
scrapy/pipelines/images.py

@ -5,17 +5,19 @@ See documentation in topics/media-pipeline.rst
"""
import functools
import hashlib
from contextlib import suppress
from io import BytesIO

from itemadapter import ItemAdapter
from PIL import Image

from scrapy.exceptions import DropItem
from scrapy.http import Request
from scrapy.pipelines.files import FileException, FilesPipeline
# TODO: from scrapy.pipelines.media import MediaPipeline
from scrapy.settings import Settings
from scrapy.utils.misc import md5sum
from scrapy.utils.python import to_bytes
from scrapy.http import Request
from scrapy.settings import Settings
from scrapy.exceptions import DropItem
#TODO: from scrapy.pipelines.media import MediaPipeline
from scrapy.pipelines.files import FileException, FilesPipeline


class NoimagesDrop(DropItem):
@ -157,11 +159,12 @@ class ImagesPipeline(FilesPipeline):
return image, buf

def get_media_requests(self, item, info):
return [Request(x) for x in item.get(self.images_urls_field, [])]
urls = ItemAdapter(item).get(self.images_urls_field, [])
return [Request(u) for u in urls]

def item_completed(self, results, item, info):
if isinstance(item, dict) or self.images_result_field in item.fields:
item[self.images_result_field] = [x for ok, x in results if ok]
with suppress(KeyError):
ItemAdapter(item)[self.images_result_field] = [x for ok, x in results if ok]
return item

def file_path(self, request, response=None, info=None):
scrapy/pipelines/media.py

@ -1,7 +1,7 @@
import functools
import logging
from collections import defaultdict
from twisted.internet.defer import Deferred, DeferredList, _DefGen_Return
from twisted.internet.defer import Deferred, DeferredList
from twisted.python.failure import Failure

from scrapy.settings import Settings
@ -43,8 +43,7 @@ class MediaPipeline:
if allow_redirects:
self.handle_httpstatus_list = SequenceExclude(range(300, 400))

def _key_for_pipe(self, key, base_class_name=None,
settings=None):
def _key_for_pipe(self, key, base_class_name=None, settings=None):
"""
>>> MediaPipeline()._key_for_pipe("IMAGES")
'IMAGES'
@ -55,8 +54,11 @@ class MediaPipeline:
"""
class_name = self.__class__.__name__
formatted_key = "{}_{}".format(class_name.upper(), key)
if class_name == base_class_name or not base_class_name \
or (settings and not settings.get(formatted_key)):
if (
not base_class_name
or class_name == base_class_name
or settings and not settings.get(formatted_key)
):
return key
return formatted_key

@ -141,24 +143,26 @@ class MediaPipeline:
# This code fixes a memory leak by avoiding to keep references to
# the Request and Response objects on the Media Pipeline cache.
#
# Twisted inline callbacks pass return values using the function
# twisted.internet.defer.returnValue, which encapsulates the return
# value inside a _DefGen_Return base exception.
#
# What happens when the media_downloaded callback raises another
# What happens when the media_downloaded callback raises an
# exception, for example a FileException('download-error') when
# the Response status code is not 200 OK, is that it stores the
# _DefGen_Return exception on the FileException context.
# the Response status code is not 200 OK, is that the original
# StopIteration exception (which in turn contains the failed
# Response and by extension, the original Request) gets encapsulated
# within the FileException context.
#
# Originally, Scrapy was using twisted.internet.defer.returnValue
# inside functions decorated with twisted.internet.defer.inlineCallbacks,
# encapsulating the returned Response in a _DefGen_Return exception
# instead of a StopIteration.
#
# To avoid keeping references to the Response and therefore Request
# objects on the Media Pipeline cache, we should wipe the context of
# the exception encapsulated by the Twisted Failure when its a
# _DefGen_Return instance.
# the encapsulated exception when it is a StopIteration instance
#
# This problem does not occur in Python 2.7 since we don't have
# Exception Chaining (https://www.python.org/dev/peps/pep-3134/).
context = getattr(result.value, '__context__', None)
if isinstance(context, _DefGen_Return):
if isinstance(context, StopIteration):
setattr(result.value, '__context__', None)

info.downloading.remove(fp)
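The comment block above describes the reference-wiping pattern in prose. A minimal, hypothetical sketch of the same idea outside the pipeline (the function name is illustrative, not Scrapy API):

from twisted.python.failure import Failure

def wipe_generator_context(failure: Failure) -> Failure:
    # If the wrapped exception was raised while a StopIteration (the generator's
    # return value, which carries the Response) was being handled, drop that
    # chained context so the Response/Request are not kept alive by the cache.
    context = getattr(failure.value, '__context__', None)
    if isinstance(context, StopIteration):
        failure.value.__context__ = None
    return failure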
@ -166,7 +170,7 @@ class MediaPipeline:
for wad in info.waiting.pop(fp):
defer_result(result).chainDeferred(wad)

### Overridable Interface
# Overridable Interface
def media_to_download(self, request, info):
"""Check request before starting download"""
pass
scrapy/responsetypes.py

@ -58,9 +58,9 @@ class ResponseTypes:

def from_content_disposition(self, content_disposition):
try:
filename = to_unicode(content_disposition,
encoding='latin-1', errors='replace').split(';')[1].split('=')[1]
filename = filename.strip('"\'')
filename = to_unicode(
content_disposition, encoding='latin-1', errors='replace'
).split(';')[1].split('=')[1].strip('"\'')
return self.from_filename(filename)
except IndexError:
return Response
@ -71,7 +71,7 @@ class ResponseTypes:
cls = Response
if b'Content-Type' in headers:
cls = self.from_content_type(
content_type=headers[b'Content-type'],
content_type=headers[b'Content-Type'],
content_encoding=headers.get(b'Content-Encoding')
)
if cls is Response and b'Content-Disposition' in headers:
scrapy/robotstxt.py

@ -17,10 +17,12 @@ def decode_robotstxt(robotstxt_body, spider, to_native_str_type=False):
except UnicodeDecodeError:
# If we found garbage or robots.txt in an encoding other than UTF-8, disregard it.
# Switch to 'allow all' state.
logger.warning("Failure while parsing robots.txt. "
"File either contains garbage or is in an encoding other than UTF-8, treating it as an empty file.",
logger.warning(
"Failure while parsing robots.txt. File either contains garbage or "
"is in an encoding other than UTF-8, treating it as an empty file.",
exc_info=sys.exc_info(),
extra={'spider': spider})
extra={'spider': spider},
)
robotstxt_body = ''
return robotstxt_body
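The hunk above only reflows the warning; the behaviour it describes is the UTF-8 fallback. A hedged sketch of that fallback in isolation (the helper name is illustrative, not part of Scrapy's API):

def decode_or_allow_all(robotstxt_body: bytes) -> str:
    # A robots.txt body that is not valid UTF-8 is treated as empty,
    # i.e. no rules at all, which amounts to an "allow all" state.
    try:
        return robotstxt_body.decode('utf-8')
    except UnicodeDecodeError:
        return ''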
scrapy/selector/__init__.py

@ -1,4 +1,6 @@
"""
Selectors
"""
from scrapy.selector.unified import * # noqa: F401

# top-level imports
from scrapy.selector.unified import Selector, SelectorList
scrapy/selector/unified.py

@ -65,7 +65,7 @@ class Selector(_ParselSelector, object_ref):
selectorlist_cls = SelectorList

def __init__(self, response=None, text=None, type=None, root=None, **kwargs):
if not(response is None or text is None):
if response is not None and text is not None:
raise ValueError('%s.__init__() received both response and text'
% self.__class__.__name__)
scrapy/shell.py

@ -6,6 +6,7 @@ See documentation in docs/topics/shell.rst
import os
import signal

from itemadapter import is_item
from twisted.internet import threads, defer
from twisted.python import threadable
from w3lib.url import any_to_uri
@ -13,21 +14,18 @@ from w3lib.url import any_to_uri
from scrapy.crawler import Crawler
from scrapy.exceptions import IgnoreRequest
from scrapy.http import Request, Response
from scrapy.item import BaseItem
from scrapy.settings import Settings
from scrapy.spiders import Spider
from scrapy.utils.console import start_python_console
from scrapy.utils.conf import get_config
from scrapy.utils.console import DEFAULT_PYTHON_SHELLS, start_python_console
from scrapy.utils.datatypes import SequenceExclude
from scrapy.utils.misc import load_object
from scrapy.utils.response import open_in_browser
from scrapy.utils.conf import get_config
from scrapy.utils.console import DEFAULT_PYTHON_SHELLS


class Shell:

relevant_classes = (Crawler, Spider, Request, Response, BaseItem,
Settings)
relevant_classes = (Crawler, Spider, Request, Response, Settings)

def __init__(self, crawler, update_vars=None, code=None):
self.crawler = crawler
@ -146,17 +144,16 @@ class Shell:
b.append("Useful shortcuts:")
if self.inthread:
b.append(" fetch(url[, redirect=True]) "
"Fetch URL and update local objects "
"(by default, redirects are followed)")
"Fetch URL and update local objects (by default, redirects are followed)")
b.append(" fetch(req) "
"Fetch a scrapy.Request and update local objects ")
b.append(" shelp() Shell help (print this help)")
b.append(" view(response) View response in a browser")

return "\n".join("[s] %s" % l for l in b)
return "\n".join("[s] %s" % line for line in b)

def _is_relevant(self, value):
return isinstance(value, self.relevant_classes)
return isinstance(value, self.relevant_classes) or is_item(value)

def inspect_response(response, spider):
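The shell now treats anything itemadapter recognises as relevant, via is_item() instead of a BaseItem isinstance check. A quick hedged illustration of that helper:

from itemadapter import is_item

print(is_item({'title': 'example'}))   # True: plain dicts count as items
print(is_item(object()))               # False: arbitrary objects do not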
scrapy/signals.py

@ -17,6 +17,7 @@ request_reached_downloader = object()
request_left_downloader = object()
response_received = object()
response_downloaded = object()
bytes_received = object()
item_scraped = object()
item_dropped = object()
item_error = object()
scrapy/spiderloader.py

@ -1,7 +1,6 @@
# -*- coding: utf-8 -*-
from collections import defaultdict
import traceback
import warnings
from collections import defaultdict

from zope.interface import implementer

@ -16,6 +15,7 @@ class SpiderLoader:
SpiderLoader is a class which locates and loads spiders
in a Scrapy project.
"""

def __init__(self, settings):
self.spider_modules = settings.getlist('SPIDER_MODULES')
self.warn_only = settings.getbool('SPIDER_LOADER_WARN_ONLY')
@ -24,16 +24,21 @@ class SpiderLoader:
self._load_all_spiders()

def _check_name_duplicates(self):
dupes = ["\n".join(" {cls} named {name!r} (in {module})".format(
module=mod, cls=cls, name=name)
for (mod, cls) in locations)
for name, locations in self._found.items()
if len(locations) > 1]
dupes = []
for name, locations in self._found.items():
dupes.extend([
" {cls} named {name!r} (in {module})".format(module=mod, cls=cls, name=name)
for mod, cls in locations
if len(locations) > 1
])

if dupes:
msg = ("There are several spiders with the same name:\n\n"
"{}\n\n This can cause unexpected behavior.".format(
"\n\n".join(dupes)))
warnings.warn(msg, UserWarning)
dupes_string = "\n\n".join(dupes)
warnings.warn(
"There are several spiders with the same name:\n\n"
"{}\n\n This can cause unexpected behavior.".format(dupes_string),
category=UserWarning,
)

def _load_spiders(self, module):
for spcls in iter_spider_classes(module):
@ -45,12 +50,15 @@ class SpiderLoader:
try:
for module in walk_modules(name):
self._load_spiders(module)
except ImportError as e:
except ImportError:
if self.warn_only:
msg = ("\n{tb}Could not load spiders from module '{modname}'. "
warnings.warn(
"\n{tb}Could not load spiders from module '{modname}'. "
"See above traceback for details.".format(
modname=name, tb=traceback.format_exc()))
warnings.warn(msg, RuntimeWarning)
modname=name, tb=traceback.format_exc()
),
category=RuntimeWarning,
)
else:
raise
self._check_name_duplicates()
@ -73,8 +81,10 @@ class SpiderLoader:
"""
Return the list of spider names that can handle the given request.
"""
return [name for name, cls in self._spiders.items()
if cls.handles_request(request)]
return [
name for name, cls in self._spiders.items()
if cls.handles_request(request)
]

def list(self):
"""
scrapy/spidermiddlewares/referer.py

@ -163,9 +163,10 @@ class StrictOriginPolicy(ReferrerPolicy):
name = POLICY_STRICT_ORIGIN

def referrer(self, response_url, request_url):
if ((self.tls_protected(response_url) and
self.potentially_trustworthy(request_url))
or not self.tls_protected(response_url)):
if (
self.tls_protected(response_url) and self.potentially_trustworthy(request_url)
or not self.tls_protected(response_url)
):
return self.origin_referrer(response_url)


@ -213,9 +214,10 @@ class StrictOriginWhenCrossOriginPolicy(ReferrerPolicy):
origin = self.origin(response_url)
if origin == self.origin(request_url):
return self.stripped_referrer(response_url)
elif ((self.tls_protected(response_url) and
self.potentially_trustworthy(request_url))
or not self.tls_protected(response_url)):
elif (
self.tls_protected(response_url) and self.potentially_trustworthy(request_url)
or not self.tls_protected(response_url)
):
return self.origin_referrer(response_url)
scrapy/spiders/__init__.py

@ -110,6 +110,6 @@ class Spider(object_ref):

# Top-level imports
from scrapy.spiders.crawl import CrawlSpider, Rule # noqa: F401
from scrapy.spiders.feed import XMLFeedSpider, CSVFeedSpider # noqa: F401
from scrapy.spiders.sitemap import SitemapSpider # noqa: F401
from scrapy.spiders.crawl import CrawlSpider, Rule
from scrapy.spiders.feed import XMLFeedSpider, CSVFeedSpider
from scrapy.spiders.sitemap import SitemapSpider
Some files were not shown because too many files have changed in this diff.