Mirror of https://github.com/scrapy/scrapy.git (synced 2025-03-14 16:28:31 +00:00)

Merge branch 'master' into azure-pipelines

Commit 6e58da1dcd
@@ -1,5 +1,5 @@
 [bumpversion]
-current_version = 2.0.0
+current_version = 2.2.0
 commit = True
 tag = True
 tag_name = {new_version}
.gitignore (vendored): 1 change
@@ -15,6 +15,7 @@ htmlcov/
 .pytest_cache/
 .coverage.*
 .cache/
+.mypy_cache/
 # Windows
 Thumbs.db
@@ -1,4 +1,5 @@
 version: 2
+formats: all
 sphinx:
     configuration: docs/conf.py
     fail_on_warning: true
.travis.yml: 44 changes
@@ -11,25 +11,35 @@ matrix:
           python: 3.8
         - env: TOXENV=flake8
           python: 3.8
-        - env: TOXENV=pypy3
-        - env: TOXENV=py35
-          python: 3.5
-        - env: TOXENV=pinned
-          python: 3.5
-        - env: TOXENV=py35-asyncio
-          python: 3.5.2
-        - env: TOXENV=py36
-          python: 3.6
-        - env: TOXENV=py37
-          python: 3.7
-        - env: TOXENV=py38
-          python: 3.8
-        - env: TOXENV=extra-deps
-          python: 3.8
-        - env: TOXENV=py38-asyncio
+        - env: TOXENV=pylint
           python: 3.8
         - env: TOXENV=docs
           python: 3.7 # Keep in sync with .readthedocs.yml
+        - env: TOXENV=typing
+          python: 3.8
+
+        - env: TOXENV=pypy3
+        - env: TOXENV=pinned
+          python: 3.5.2
+        - env: TOXENV=asyncio
+          python: 3.5.2 # We use additional code to support 3.5.3 and earlier
+        - env: TOXENV=py
+          python: 3.5
+        - env: TOXENV=asyncio
+          python: 3.5 # We use specific code to support >= 3.5.4, < 3.6
+        - env: TOXENV=py
+          python: 3.6
+        - env: TOXENV=py
+          python: 3.7
+        - env: TOXENV=py PYPI_RELEASE_JOB=true
+          python: 3.8
+          dist: bionic
+        - env: TOXENV=extra-deps
+          python: 3.8
+          dist: bionic
+        - env: TOXENV=asyncio
+          python: 3.8
+          dist: bionic
 install:
     - |
       if [ "$TOXENV" = "pypy3" ]; then
@@ -62,4 +72,4 @@ deploy:
     on:
         tags: true
         repo: scrapy/scrapy
-        condition: "$TOXENV == py37 && $TRAVIS_TAG =~ ^[0-9]+[.][0-9]+[.][0-9]+(rc[0-9]+|[.]dev[0-9]+)?$"
+        condition: "$PYPI_RELEASE_JOB == true && $TRAVIS_TAG =~ ^[0-9]+[.][0-9]+[.][0-9]+(rc[0-9]+|[.]dev[0-9]+)?$"
@@ -40,7 +40,7 @@ including a list of features.
 Requirements
 ============
 
-* Python 3.5+
+* Python 3.5.2+
 * Works on Linux, Windows, macOS, BSD
 
 Install
@@ -12,6 +12,8 @@ collect_ignore = [
     "scrapy/utils/testsite.py",
     # contains scripts to be run by tests/test_crawler.py::CrawlerProcessSubprocess
    *_py_files("tests/CrawlerProcess"),
+    # contains scripts to be run by tests/test_crawler.py::CrawlerRunnerSubprocess
+    *_py_files("tests/CrawlerRunner"),
     # Py36-only parts of respective tests
     *_py_files("tests/py36"),
 ]
@@ -57,3 +57,12 @@ There is a way to recreate the doc automatically when you make changes, you
 need to install watchdog (``pip install watchdog``) and then use::
 
     make watch
+
+Alternative method using tox
+----------------------------
+
+To compile the documentation to HTML run the following command::
+
+    tox -e docs
+
+Documentation will be generated (in HTML format) inside the ``.tox/docs/tmp/html`` dir.
docs/conf.py: 14 changes
@@ -1,5 +1,3 @@
-# -*- coding: utf-8 -*-
-#
 # Scrapy documentation build configuration file, created by
 # sphinx-quickstart on Mon Nov 24 12:02:52 2008.
 #
@@ -102,6 +100,9 @@ exclude_trees = ['.build']
 # The name of the Pygments (syntax highlighting) style to use.
 pygments_style = 'sphinx'
 
+# List of Sphinx warnings that will not be raised
+suppress_warnings = ['epub.unknown_project_files']
+
 
 # Options for HTML output
 # -----------------------
@@ -280,6 +281,7 @@ coverage_ignore_pyobjects = [
 # -------------------------------------
 
 intersphinx_mapping = {
+    'attrs': ('https://www.attrs.org/en/stable/', None),
     'coverage': ('https://coverage.readthedocs.io/en/stable', None),
     'cssselect': ('https://cssselect.readthedocs.io/en/latest', None),
     'pytest': ('https://docs.pytest.org/en/latest', None),
@@ -295,3 +297,11 @@ intersphinx_mapping = {
 # ------------------------------------
 
 hoverxref_auto_ref = True
+hoverxref_role_types = {
+    "class": "tooltip",
+    "confval": "tooltip",
+    "hoverxref": "tooltip",
+    "mod": "tooltip",
+    "ref": "tooltip",
+}
+hoverxref_roles = ['command', 'reqmeta', 'setting', 'signal']
@@ -155,6 +155,9 @@ Finally, try to keep aesthetic changes (:pep:`8` compliance, unused imports
 removal, etc) in separate commits from functional changes. This will make pull
 requests easier to review and more likely to get merged.
 
+
+.. _coding-style:
+
 Coding style
 ============
 
@@ -163,7 +166,7 @@ Scrapy:
 
 * Unless otherwise specified, follow :pep:`8`.
 
-* It's OK to use lines longer than 80 chars if it improves the code
+* It's OK to use lines longer than 79 chars if it improves the code
   readability.
 
 * Don't put your name in the code you contribute; git provides enough
docs/faq.rst: 23 changes
@@ -69,7 +69,7 @@ Here's an example spider using BeautifulSoup API, with ``lxml`` as the HTML parser
 What Python versions does Scrapy support?
 -----------------------------------------
 
-Scrapy is supported under Python 3.5+
+Scrapy is supported under Python 3.5.2+
 under CPython (default Python implementation) and PyPy (starting with PyPy 5.9).
 Python 3 support was added in Scrapy 1.1.
 PyPy support was added in Scrapy 1.4, PyPy3 support was added in Scrapy 1.5.
@@ -342,15 +342,15 @@ method for this purpose. For example::
 
     from copy import deepcopy
 
-    from scrapy.item import BaseItem
+    from itemadapter import is_item, ItemAdapter
 
 
     class MultiplyItemsMiddleware:
 
         def process_spider_output(self, response, result, spider):
             for item in result:
-                if isinstance(item, (BaseItem, dict)):
-                    for _ in range(item['multiply_by']):
+                if is_item(item):
+                    adapter = ItemAdapter(item)
+                    for _ in range(adapter['multiply_by']):
                         yield deepcopy(item)
 
 Does Scrapy support IPv6 addresses?
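The ``is_item``/``ItemAdapter`` switch shown in the hunk above works because itemadapter gives dict-like access to every supported item type. A small illustrative sketch (not part of the diff; the ``Product`` dataclass is made up)::

    from dataclasses import dataclass

    from itemadapter import ItemAdapter, is_item


    @dataclass
    class Product:
        multiply_by: int = 2


    assert is_item(Product())            # dataclass objects are items
    assert is_item({"multiply_by": 3})   # plain dicts still are too
    adapter = ItemAdapter(Product())
    print(adapter["multiply_by"])        # -> 2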
@@ -371,6 +371,19 @@ Twisted reactor is :class:`twisted.internet.selectreactor.SelectReactor`. Switching to a
 different reactor is possible by using the :setting:`TWISTED_REACTOR` setting.
 
+
+.. _faq-stop-response-download:
+
+How can I cancel the download of a given response?
+--------------------------------------------------
+
+In some situations, it might be useful to stop the download of a certain response.
+For instance, if you only need the first part of a large response and you would like
+to save resources by avoiding the download of the whole body.
+In that case, you could attach a handler to the :class:`~scrapy.signals.bytes_received`
+signal and raise a :exc:`~scrapy.exceptions.StopDownload` exception. Please refer to
+the :ref:`topics-stop-response-download` topic for additional information and examples.
+
 
 .. _has been reported: https://github.com/scrapy/scrapy/issues/2905
 .. _user agents: https://en.wikipedia.org/wiki/User_agent
 .. _LIFO: https://en.wikipedia.org/wiki/Stack_(abstract_data_type)
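The new FAQ entry above describes stopping a download from a ``bytes_received`` handler. A minimal sketch of that pattern (not part of the diff; the URL and spider name are placeholders, and ``fail=False`` routes the truncated response to the normal callback)::

    import scrapy
    from scrapy import signals
    from scrapy.exceptions import StopDownload


    class FirstChunkSpider(scrapy.Spider):
        name = "first_chunk"
        start_urls = ["https://example.com"]

        @classmethod
        def from_crawler(cls, crawler, *args, **kwargs):
            spider = super().from_crawler(crawler, *args, **kwargs)
            crawler.signals.connect(spider.on_bytes_received,
                                    signal=signals.bytes_received)
            return spider

        def on_bytes_received(self, data, request, spider):
            # Cancel the rest of the body after the first chunk arrives.
            raise StopDownload(fail=False)

        def parse(self, response):
            self.logger.info("Got %d bytes from %s", len(response.body), response.url)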
@@ -7,7 +7,7 @@ Installation guide
 Installing Scrapy
 =================
 
-Scrapy runs on Python 3.5 or above under CPython (default Python
+Scrapy runs on Python 3.5.2 or above under CPython (default Python
 implementation) and PyPy (starting with PyPy 5.9).
 
 If you're using `Anaconda`_ or `Miniconda`_, you can install the package from
@@ -25,16 +25,16 @@ Scrapy.
 If you're already familiar with other languages, and want to learn Python quickly, the `Python Tutorial`_ is a good resource.
 
 If you're new to programming and want to start with Python, the following books
 may be useful to you:
 
 * `Automate the Boring Stuff With Python`_
 
 * `How To Think Like a Computer Scientist`_
 
 * `Learn Python 3 The Hard Way`_
 
 You can also take a look at `this list of Python resources for non-programmers`_,
 as well as the `suggested resources in the learnpython-subreddit`_.
 
 .. _Python: https://www.python.org/
 .. _this list of Python resources for non-programmers: https://wiki.python.org/moin/BeginnersGuide/NonProgrammers
|
|||||||
__init__.py
|
__init__.py
|
||||||
|
|
||||||
items.py # project items definition file
|
items.py # project items definition file
|
||||||
|
|
||||||
middlewares.py # project middlewares file
|
middlewares.py # project middlewares file
|
||||||
|
|
||||||
pipelines.py # project pipelines file
|
pipelines.py # project pipelines file
|
||||||
@@ -287,8 +287,8 @@ to be scraped, you can at least get **some** data.
 
 Besides the :meth:`~scrapy.selector.SelectorList.getall` and
 :meth:`~scrapy.selector.SelectorList.get` methods, you can also use
-the :meth:`~scrapy.selector.SelectorList.re` method to extract using `regular
-expressions`_:
+the :meth:`~scrapy.selector.SelectorList.re` method to extract using
+:doc:`regular expressions <library/re>`:
 
 >>> response.css('title::text').re(r'Quotes.*')
 ['Quotes to Scrape']
@@ -305,7 +305,6 @@ with a selector (see :ref:`topics-developer-tools`).
 `Selector Gadget`_ is also a nice tool to quickly find CSS selector for
 visually selected elements, which works in many browsers.
 
-.. _regular expressions: https://docs.python.org/3/library/re.html
 .. _Selector Gadget: https://selectorgadget.com/
 
 
docs/news.rst: 342 changes
@@ -3,6 +3,348 @@
 Release notes
 =============
 
+.. _release-2.2.0:
+
+Scrapy 2.2.0 (2020-06-24)
+-------------------------
+
+Highlights:
+
+* Python 3.5.2+ is required now
+* :ref:`dataclass objects <dataclass-items>` and
+  :ref:`attrs objects <attrs-items>` are now valid :ref:`item types
+  <item-types>`
+* New :meth:`TextResponse.json <scrapy.http.TextResponse.json>` method
+* New :signal:`bytes_received` signal that allows canceling response download
+* :class:`~scrapy.downloadermiddlewares.cookies.CookiesMiddleware` fixes
+
+Backward-incompatible changes
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+* Support for Python 3.5.0 and 3.5.1 has been dropped; Scrapy now refuses to
+  run with a Python version lower than 3.5.2, which introduced
+  :class:`typing.Type` (:issue:`4615`)
+
+
+Deprecations
+~~~~~~~~~~~~
+
+* :meth:`TextResponse.body_as_unicode
+  <scrapy.http.TextResponse.body_as_unicode>` is now deprecated, use
+  :attr:`TextResponse.text <scrapy.http.TextResponse.text>` instead
+  (:issue:`4546`, :issue:`4555`, :issue:`4579`)
+
+* :class:`scrapy.item.BaseItem` is now deprecated, use
+  :class:`scrapy.item.Item` instead (:issue:`4534`)
+
+
+New features
+~~~~~~~~~~~~
+
+* :ref:`dataclass objects <dataclass-items>` and
+  :ref:`attrs objects <attrs-items>` are now valid :ref:`item types
+  <item-types>`, and a new itemadapter_ library makes it easy to
+  write code that :ref:`supports any item type <supporting-item-types>`
+  (:issue:`2749`, :issue:`2807`, :issue:`3761`, :issue:`3881`, :issue:`4642`)
+
+* A new :meth:`TextResponse.json <scrapy.http.TextResponse.json>` method
+  allows to deserialize JSON responses (:issue:`2444`, :issue:`4460`,
+  :issue:`4574`)
+
+* A new :signal:`bytes_received` signal allows monitoring response download
+  progress and :ref:`stopping downloads <topics-stop-response-download>`
+  (:issue:`4205`, :issue:`4559`)
+
+* The dictionaries in the result list of a :ref:`media pipeline
+  <topics-media-pipeline>` now include a new key, ``status``, which indicates
+  if the file was downloaded or, if the file was not downloaded, why it was
+  not downloaded; see :meth:`FilesPipeline.get_media_requests
+  <scrapy.pipelines.files.FilesPipeline.get_media_requests>` for more
+  information (:issue:`2893`, :issue:`4486`)
+
+* When using :ref:`Google Cloud Storage <media-pipeline-gcs>` for
+  a :ref:`media pipeline <topics-media-pipeline>`, a warning is now logged if
+  the configured credentials do not grant the required permissions
+  (:issue:`4346`, :issue:`4508`)
+
+* :ref:`Link extractors <topics-link-extractors>` are now serializable,
+  as long as you do not use :ref:`lambdas <lambda>` for parameters; for
+  example, you can now pass link extractors in :attr:`Request.cb_kwargs
+  <scrapy.http.Request.cb_kwargs>` or
+  :attr:`Request.meta <scrapy.http.Request.meta>` when :ref:`persisting
+  scheduled requests <topics-jobs>` (:issue:`4554`)
+
+* Upgraded the :ref:`pickle protocol <pickle-protocols>` that Scrapy uses
+  from protocol 2 to protocol 4, improving serialization capabilities and
+  performance (:issue:`4135`, :issue:`4541`)
+
+* :func:`scrapy.utils.misc.create_instance` now raises a :exc:`TypeError`
+  exception if the resulting instance is ``None`` (:issue:`4528`,
+  :issue:`4532`)
+
+.. _itemadapter: https://github.com/scrapy/itemadapter
+
+
+Bug fixes
+~~~~~~~~~
+
+* :class:`~scrapy.downloadermiddlewares.cookies.CookiesMiddleware` no longer
+  discards cookies defined in :attr:`Request.headers
+  <scrapy.http.Request.headers>` (:issue:`1992`, :issue:`2400`)
+
+* :class:`~scrapy.downloadermiddlewares.cookies.CookiesMiddleware` no longer
+  re-encodes cookies defined as :class:`bytes` in the ``cookies`` parameter
+  of the ``__init__`` method of :class:`~scrapy.http.Request`
+  (:issue:`2400`, :issue:`3575`)
+
+* When :setting:`FEEDS` defines multiple URIs, :setting:`FEED_STORE_EMPTY` is
+  ``False`` and the crawl yields no items, Scrapy no longer stops feed
+  exports after the first URI (:issue:`4621`, :issue:`4626`)
+
+* :class:`~scrapy.spiders.Spider` callbacks defined using :doc:`coroutine
+  syntax <topics/coroutines>` no longer need to return an iterable, and may
+  instead return a :class:`~scrapy.http.Request` object, an
+  :ref:`item <topics-items>`, or ``None`` (:issue:`4609`)
+
+* The :command:`startproject` command now ensures that the generated project
+  folders and files have the right permissions (:issue:`4604`)
+
+* Fix a :exc:`KeyError` exception being sometimes raised from
+  :class:`scrapy.utils.datatypes.LocalWeakReferencedCache` (:issue:`4597`,
+  :issue:`4599`)
+
+* When :setting:`FEEDS` defines multiple URIs, log messages about items being
+  stored now contain information from the corresponding feed, instead of
+  always containing information about only one of the feeds (:issue:`4619`,
+  :issue:`4629`)
+
+
+Documentation
+~~~~~~~~~~~~~
+
+* Added a new section about :ref:`accessing cb_kwargs from errbacks
+  <errback-cb_kwargs>` (:issue:`4598`, :issue:`4634`)
+
+* Covered chompjs_ in :ref:`topics-parsing-javascript` (:issue:`4556`,
+  :issue:`4562`)
+
+* Removed from :doc:`topics/coroutines` the warning about the API being
+  experimental (:issue:`4511`, :issue:`4513`)
+
+* Removed references to unsupported versions of :doc:`Twisted
+  <twisted:index>` (:issue:`4533`)
+
+* Updated the description of the :ref:`screenshot pipeline example
+  <ScreenshotPipeline>`, which now uses :doc:`coroutine syntax
+  <topics/coroutines>` instead of returning a
+  :class:`~twisted.internet.defer.Deferred` (:issue:`4514`, :issue:`4593`)
+
+* Removed a misleading import line from the
+  :func:`scrapy.utils.log.configure_logging` code example (:issue:`4510`,
+  :issue:`4587`)
+
+* The display-on-hover behavior of internal documentation references now also
+  covers links to :ref:`commands <topics-commands>`, :attr:`Request.meta
+  <scrapy.http.Request.meta>` keys, :ref:`settings <topics-settings>` and
+  :ref:`signals <topics-signals>` (:issue:`4495`, :issue:`4563`)
+
+* It is again possible to download the documentation for offline reading
+  (:issue:`4578`, :issue:`4585`)
+
+* Removed backslashes preceding ``*args`` and ``**kwargs`` in some function
+  and method signatures (:issue:`4592`, :issue:`4596`)
+
+.. _chompjs: https://github.com/Nykakin/chompjs
+
+
+Quality assurance
+~~~~~~~~~~~~~~~~~
+
+* Adjusted the code base further to our :ref:`style guidelines
+  <coding-style>` (:issue:`4237`, :issue:`4525`, :issue:`4538`,
+  :issue:`4539`, :issue:`4540`, :issue:`4542`, :issue:`4543`, :issue:`4544`,
+  :issue:`4545`, :issue:`4557`, :issue:`4558`, :issue:`4566`, :issue:`4568`,
+  :issue:`4572`)
+
+* Removed remnants of Python 2 support (:issue:`4550`, :issue:`4553`,
+  :issue:`4568`)
+
+* Improved code sharing between the :command:`crawl` and :command:`runspider`
+  commands (:issue:`4548`, :issue:`4552`)
+
+* Replaced ``chain(*iterable)`` with ``chain.from_iterable(iterable)``
+  (:issue:`4635`)
+
+* You may now run the :mod:`asyncio` tests with Tox on any Python version
+  (:issue:`4521`)
+
+* Updated test requirements to reflect an incompatibility with pytest 5.4 and
+  5.4.1 (:issue:`4588`)
+
+* Improved :class:`~scrapy.spiderloader.SpiderLoader` test coverage for
+  scenarios involving duplicate spider names (:issue:`4549`, :issue:`4560`)
+
+* Configured Travis CI to also run the tests with Python 3.5.2
+  (:issue:`4518`, :issue:`4615`)
+
+* Added a `Pylint <https://www.pylint.org/>`_ job to Travis CI
+  (:issue:`3727`)
+
+* Added a `Mypy <http://mypy-lang.org/>`_ job to Travis CI (:issue:`4637`)
+
+* Made use of set literals in tests (:issue:`4573`)
+
+* Cleaned up the Travis CI configuration (:issue:`4517`, :issue:`4519`,
+  :issue:`4522`, :issue:`4537`)
+
+
+.. _release-2.1.0:
+
+Scrapy 2.1.0 (2020-04-24)
+-------------------------
+
+Highlights:
+
+* New :setting:`FEEDS` setting to export to multiple feeds
+* New :attr:`Response.ip_address <scrapy.http.Response.ip_address>` attribute
+
+Backward-incompatible changes
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+* :exc:`AssertionError` exceptions triggered by :ref:`assert <assert>`
+  statements have been replaced by new exception types, to support running
+  Python in optimized mode (see :option:`-O`) without changing Scrapy’s
+  behavior in any unexpected ways.
+
+  If you catch an :exc:`AssertionError` exception from Scrapy, update your
+  code to catch the corresponding new exception.
+
+  (:issue:`4440`)
+
+
+Deprecation removals
+~~~~~~~~~~~~~~~~~~~~
+
+* The ``LOG_UNSERIALIZABLE_REQUESTS`` setting is no longer supported, use
+  :setting:`SCHEDULER_DEBUG` instead (:issue:`4385`)
+
+* The ``REDIRECT_MAX_METAREFRESH_DELAY`` setting is no longer supported, use
+  :setting:`METAREFRESH_MAXDELAY` instead (:issue:`4385`)
+
+* The :class:`~scrapy.downloadermiddlewares.chunked.ChunkedTransferMiddleware`
+  middleware has been removed, including the entire
+  :class:`scrapy.downloadermiddlewares.chunked` module; chunked transfers
+  work out of the box (:issue:`4431`)
+
+* The ``spiders`` property has been removed from
+  :class:`~scrapy.crawler.Crawler`, use :class:`CrawlerRunner.spider_loader
+  <scrapy.crawler.CrawlerRunner.spider_loader>` or instantiate
+  :setting:`SPIDER_LOADER_CLASS` with your settings instead (:issue:`4398`)
+
+* The ``MultiValueDict``, ``MultiValueDictKeyError``, and ``SiteNode``
+  classes have been removed from :mod:`scrapy.utils.datatypes`
+  (:issue:`4400`)
+
+
+Deprecations
+~~~~~~~~~~~~
+
+* The ``FEED_FORMAT`` and ``FEED_URI`` settings have been deprecated in
+  favor of the new :setting:`FEEDS` setting (:issue:`1336`, :issue:`3858`,
+  :issue:`4507`)
+
+
+New features
+~~~~~~~~~~~~
+
+* A new setting, :setting:`FEEDS`, allows configuring multiple output feeds
+  with different settings each (:issue:`1336`, :issue:`3858`, :issue:`4507`)
+
+* The :command:`crawl` and :command:`runspider` commands now support multiple
+  ``-o`` parameters (:issue:`1336`, :issue:`3858`, :issue:`4507`)
+
+* The :command:`crawl` and :command:`runspider` commands now support
+  specifying an output format by appending ``:<format>`` to the output file
+  (:issue:`1336`, :issue:`3858`, :issue:`4507`)
+
+* The new :attr:`Response.ip_address <scrapy.http.Response.ip_address>`
+  attribute gives access to the IP address that originated a response
+  (:issue:`3903`, :issue:`3940`)
+
+* A warning is now issued when a value in
+  :attr:`~scrapy.spiders.Spider.allowed_domains` includes a port
+  (:issue:`50`, :issue:`3198`, :issue:`4413`)
+
+* Zsh completion now excludes used option aliases from the completion list
+  (:issue:`4438`)
+
+
+Bug fixes
+~~~~~~~~~
+
+* :ref:`Request serialization <request-serialization>` no longer breaks for
+  callbacks that are spider attributes which are assigned a function with a
+  different name (:issue:`4500`)
+
+* ``None`` values in :attr:`~scrapy.spiders.Spider.allowed_domains` no longer
+  cause a :exc:`TypeError` exception (:issue:`4410`)
+
+* Zsh completion no longer allows options after arguments (:issue:`4438`)
+
+* zope.interface 5.0.0 and later versions are now supported
+  (:issue:`4447`, :issue:`4448`)
+
+* :meth:`Spider.make_requests_from_url
+  <scrapy.spiders.Spider.make_requests_from_url>`, deprecated in Scrapy
+  1.4.0, now issues a warning when used (:issue:`4412`)
+
+
+Documentation
+~~~~~~~~~~~~~
+
+* Improved the documentation about signals that allow their handlers to
+  return a :class:`~twisted.internet.defer.Deferred` (:issue:`4295`,
+  :issue:`4390`)
+
+* Our PyPI entry now includes links for our documentation, our source code
+  repository and our issue tracker (:issue:`4456`)
+
+* Covered the `curl2scrapy <https://michael-shub.github.io/curl2scrapy/>`_
+  service in the documentation (:issue:`4206`, :issue:`4455`)
+
+* Removed references to the Guppy library, which only works in Python 2
+  (:issue:`4285`, :issue:`4343`)
+
+* Extended use of InterSphinx to link to Python 3 documentation
+  (:issue:`4444`, :issue:`4445`)
+
+* Added support for Sphinx 3.0 and later (:issue:`4475`, :issue:`4480`,
+  :issue:`4496`, :issue:`4503`)
+
+
+Quality assurance
+~~~~~~~~~~~~~~~~~
+
+* Removed warnings about using old, removed settings (:issue:`4404`)
+
+* Removed a warning about importing
+  :class:`~twisted.internet.testing.StringTransport` from
+  ``twisted.test.proto_helpers`` in Twisted 19.7.0 or newer (:issue:`4409`)
+
+* Removed outdated Debian package build files (:issue:`4384`)
+
+* Removed :class:`object` usage as a base class (:issue:`4430`)
+
+* Removed code that added support for old versions of Twisted that we no
+  longer support (:issue:`4472`)
+
+* Fixed code style issues (:issue:`4468`, :issue:`4469`, :issue:`4471`,
+  :issue:`4481`)
+
+* Removed :func:`twisted.internet.defer.returnValue` calls (:issue:`4443`,
+  :issue:`4446`, :issue:`4489`)
+
+
 .. _release-2.0.1:
 
 Scrapy 2.0.1 (2020-03-18)
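The 2.2.0 notes above highlight the new ``TextResponse.json`` method. A short usage sketch (not from the diff; the API URL is a placeholder)::

    import scrapy


    class ApiSpider(scrapy.Spider):
        name = "api"
        start_urls = ["https://api.example.com/items.json"]

        def parse(self, response):
            # response.json() deserializes the JSON body, roughly equivalent
            # to json.loads(response.text).
            for entry in response.json():
                yield {"id": entry.get("id"), "name": entry.get("name")}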
@@ -1,4 +1,4 @@
-Sphinx>=2.1
-sphinx-hoverxref
-sphinx-notfound-page
-sphinx_rtd_theme
+Sphinx>=3.0
+sphinx-hoverxref>=0.2b1
+sphinx-notfound-page>=0.4
+sphinx_rtd_theme>=0.4
@@ -91,7 +91,7 @@ how you :ref:`configure the downloader middlewares
 provided while constructing the crawler, and it is created after the
 arguments given in the :meth:`crawl` method.
 
-.. method:: crawl(\*args, \**kwargs)
+.. method:: crawl(*args, **kwargs)
 
     Starts the crawler by instantiating its spider class with the given
     ``args`` and ``kwargs`` arguments, while setting the execution engine in
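As a hedged illustration of the ``crawl(*args, **kwargs)`` signature documented above (not part of the diff; the spider and its URL are placeholders), the extra arguments are forwarded to the spider's ``__init__``::

    import scrapy
    from scrapy.crawler import CrawlerProcess


    class MySpider(scrapy.Spider):
        name = "my_spider"
        start_urls = ["https://example.com"]

        def parse(self, response):
            yield {"title": response.css("title::text").get()}


    process = CrawlerProcess(settings={"LOG_LEVEL": "INFO"})
    process.crawl(MySpider, category="books")  # kwargs reach the spider's __init__
    process.start()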
@@ -104,7 +104,7 @@ Spiders
 -------
 
 Spiders are custom classes written by Scrapy users to parse responses and
-extract items (aka scraped items) from them or additional requests to
+extract :ref:`items <topics-items>` from them or additional requests to
 follow. For more information see :ref:`topics-spiders`.
 
 .. _component-pipelines:
@@ -78,7 +78,7 @@ override three methods:
 
 .. module:: scrapy.contracts
 
-.. class:: Contract(method, \*args)
+.. class:: Contract(method, *args)
 
     :param method: callback function to which the contract is associated
     :type method: function
@@ -136,7 +136,7 @@ Detecting check runs
 ====================
 
 When ``scrapy check`` is running, the ``SCRAPY_CHECK`` environment variable is
-set to the ``true`` string. You can use `os.environ`_ to perform any change to
+set to the ``true`` string. You can use :data:`os.environ` to perform any change to
 your spiders or your settings when ``scrapy check`` is used::
 
     import os
@@ -148,5 +148,3 @@ your spiders or your settings when ``scrapy check`` is used::
         def __init__(self):
             if os.environ.get('SCRAPY_CHECK'):
                 pass # Do some scraper adjustments when a check is running
-
-.. _os.environ: https://docs.python.org/3/library/os.html#os.environ
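A slightly fuller sketch of the ``SCRAPY_CHECK`` pattern shown in the hunk above (illustrative only; the spider name is made up)::

    import os

    import scrapy


    class ExampleSpider(scrapy.Spider):
        name = "example"

        def __init__(self, *args, **kwargs):
            super().__init__(*args, **kwargs)
            if os.environ.get("SCRAPY_CHECK"):
                # Do some scraper adjustments when a contract check is running
                self.logger.info("Running under scrapy check")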
@@ -7,10 +7,6 @@ Coroutines
 Scrapy has :ref:`partial support <coroutine-support>` for the
 :ref:`coroutine syntax <async>`.
 
-.. warning:: :mod:`asyncio` support in Scrapy is experimental. Future Scrapy
-    versions may introduce related API and behavior changes without a
-    deprecation period or warning.
-
 .. _coroutine-support:
 
 Supported callables
@@ -57,27 +53,34 @@ There are several use cases for coroutines in Scrapy. Code that would
 return Deferreds when written for previous Scrapy versions, such as downloader
 middlewares and signal handlers, can be rewritten to be shorter and cleaner::
 
+    from itemadapter import ItemAdapter
+
     class DbPipeline:
         def _update_item(self, data, item):
-            item['field'] = data
+            adapter = ItemAdapter(item)
+            adapter['field'] = data
             return item
 
         def process_item(self, item, spider):
-            dfd = db.get_some_data(item['id'])
+            adapter = ItemAdapter(item)
+            dfd = db.get_some_data(adapter['id'])
             dfd.addCallback(self._update_item, item)
             return dfd
 
 becomes::
 
+    from itemadapter import ItemAdapter
+
     class DbPipeline:
         async def process_item(self, item, spider):
-            item['field'] = await db.get_some_data(item['id'])
+            adapter = ItemAdapter(item)
+            adapter['field'] = await db.get_some_data(adapter['id'])
             return item
 
 Coroutines may be used to call asynchronous code. This includes other
 coroutines, functions that return Deferreds and functions that return
-`awaitable objects`_ such as :class:`~asyncio.Future`. This means you can use
-many useful Python libraries providing such code::
+:term:`awaitable objects <awaitable>` such as :class:`~asyncio.Future`.
+This means you can use many useful Python libraries providing such code::
 
     class MySpider(Spider):
         # ...
@@ -107,4 +110,3 @@ Common use cases for asynchronous code include:
   :ref:`the screenshot pipeline example<ScreenshotPipeline>`).
 
 .. _aio-libs: https://github.com/aio-libs
-.. _awaitable objects: https://docs.python.org/3/glossary.html#term-awaitable
@@ -292,6 +292,9 @@ Alternatively, if you want to know the arguments needed to recreate that
 request you can use the :func:`scrapy.utils.curl.curl_to_request_kwargs`
 function to get a dictionary with the equivalent arguments.
 
+Note that to translate a cURL command into a Scrapy request,
+you may use `curl2scrapy <https://michael-shub.github.io/curl2scrapy/>`_.
+
 As you can see, with a few inspections in the `Network`-tool we
 were able to easily replicate the dynamic requests of the scrolling
 functionality of the page. Crawling dynamic pages can be quite
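To illustrate the note above, a small sketch using ``scrapy.utils.curl.curl_to_request_kwargs`` (not part of the diff; the cURL command is a placeholder)::

    from scrapy import Request
    from scrapy.utils.curl import curl_to_request_kwargs

    curl_command = "curl 'https://example.com/api' -H 'Accept: application/json'"
    kwargs = curl_to_request_kwargs(curl_command)
    request = Request(**kwargs)  # same dictionary Request.from_curl() builds from
    print(request.url, request.headers.get("Accept"))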
@@ -202,6 +202,11 @@ CookiesMiddleware
 sends them back on subsequent requests (from that spider), just like web
 browsers do.
 
+.. caution:: When non-UTF8 encoded byte sequences are passed to a
+    :class:`~scrapy.http.Request`, the ``CookiesMiddleware`` will log
+    a warning. Refer to :ref:`topics-logging-advanced-customization`
+    to customize the logging behaviour.
+
 The following settings can be used to configure the cookie middleware:
 
 * :setting:`COOKIES_ENABLED`
|
|||||||
This middleware sets the HTTP proxy to use for requests, by setting the
|
This middleware sets the HTTP proxy to use for requests, by setting the
|
||||||
``proxy`` meta value for :class:`~scrapy.http.Request` objects.
|
``proxy`` meta value for :class:`~scrapy.http.Request` objects.
|
||||||
|
|
||||||
Like the Python standard library modules `urllib`_ and `urllib2`_, it obeys
|
Like the Python standard library module :mod:`urllib.request`, it obeys
|
||||||
the following environment variables:
|
the following environment variables:
|
||||||
|
|
||||||
* ``http_proxy``
|
* ``http_proxy``
|
||||||
@ -751,9 +756,6 @@ HttpProxyMiddleware
|
|||||||
Keep in mind this value will take precedence over ``http_proxy``/``https_proxy``
|
Keep in mind this value will take precedence over ``http_proxy``/``https_proxy``
|
||||||
environment variables, and it will also ignore ``no_proxy`` environment variable.
|
environment variables, and it will also ignore ``no_proxy`` environment variable.
|
||||||
|
|
||||||
.. _urllib: https://docs.python.org/2/library/urllib.html
|
|
||||||
.. _urllib2: https://docs.python.org/2/library/urllib2.html
|
|
||||||
|
|
||||||
RedirectMiddleware
|
RedirectMiddleware
|
||||||
------------------
|
------------------
|
||||||
|
|
||||||
@@ -829,6 +831,7 @@ REDIRECT_MAX_TIMES
 Default: ``20``
 
 The maximum number of redirections that will be followed for a single request.
+After this maximum, the request's response is returned as is.
 
 MetaRefreshMiddleware
 ---------------------
@@ -1036,8 +1039,7 @@ Scrapy uses this parser by default.
 RobotFileParser
 ~~~~~~~~~~~~~~~
 
-Based on `RobotFileParser
-<https://docs.python.org/3.7/library/urllib.robotparser.html>`_:
+Based on :class:`~urllib.robotparser.RobotFileParser`:
 
 * is Python's built-in robots.txt_ parser
 
@@ -104,6 +104,9 @@ If you get the expected response `sometimes`, but not always, the issue is
 probably not your request, but the target server. The target server might be
 buggy, overloaded, or :ref:`banning <bans>` some of your requests.
 
+Note that to translate a cURL command into a Scrapy request,
+you may use `curl2scrapy <https://michael-shub.github.io/curl2scrapy/>`_.
+
 .. _topics-handling-response-formats:
 
 Handling different response formats
@@ -115,7 +118,7 @@ data from it depends on the type of response:
 - If the response is HTML or XML, use :ref:`selectors
   <topics-selectors>` as usual.
 
-- If the response is JSON, use `json.loads`_ to load the desired data from
+- If the response is JSON, use :func:`json.loads` to load the desired data from
   :attr:`response.text <scrapy.http.TextResponse.text>`::
 
       data = json.loads(response.text)
@@ -130,8 +133,9 @@ data from it depends on the type of response:
 - If the response is JavaScript, or HTML with a ``<script/>`` element
   containing the desired data, see :ref:`topics-parsing-javascript`.
 
-- If the response is CSS, use a `regular expression`_ to extract the desired
-  data from :attr:`response.text <scrapy.http.TextResponse.text>`.
+- If the response is CSS, use a :doc:`regular expression <library/re>` to
+  extract the desired data from
+  :attr:`response.text <scrapy.http.TextResponse.text>`.
 
 .. _topics-parsing-images:
 
@@ -168,8 +172,9 @@ JavaScript code:
 Once you have a string with the JavaScript code, you can extract the desired
 data from it:
 
-- You might be able to use a `regular expression`_ to extract the desired
-  data in JSON format, which you can then parse with `json.loads`_.
+- You might be able to use a :doc:`regular expression <library/re>` to
+  extract the desired data in JSON format, which you can then parse with
+  :func:`json.loads`.
 
   For example, if the JavaScript code contains a separate line like
   ``var data = {"field": "value"};`` you can extract that data as follows:
@@ -179,6 +184,18 @@ data from it:
   >>> json.loads(json_data)
   {'field': 'value'}
 
+- chompjs_ provides an API to parse JavaScript objects into a :class:`dict`.
+
+  For example, if the JavaScript code contains
+  ``var data = {field: "value", secondField: "second value"};``
+  you can extract that data as follows:
+
+  >>> import chompjs
+  >>> javascript = response.css('script::text').get()
+  >>> data = chompjs.parse_js_object(javascript)
+  >>> data
+  {'field': 'value', 'secondField': 'second value'}
+
 - Otherwise, use js2xml_ to convert the JavaScript code into an XML document
   that you can parse using :ref:`selectors <topics-selectors>`.
 
@@ -236,14 +253,13 @@ along with `scrapy-selenium`_ for seamless integration.
 
 
 .. _AJAX: https://en.wikipedia.org/wiki/Ajax_%28programming%29
+.. _chompjs: https://github.com/Nykakin/chompjs
 .. _CSS: https://en.wikipedia.org/wiki/Cascading_Style_Sheets
 .. _curl: https://curl.haxx.se/
 .. _headless browser: https://en.wikipedia.org/wiki/Headless_browser
 .. _JavaScript: https://en.wikipedia.org/wiki/JavaScript
 .. _js2xml: https://github.com/scrapinghub/js2xml
-.. _json.loads: https://docs.python.org/3/library/json.html#json.loads
 .. _pytesseract: https://github.com/madmaze/pytesseract
-.. _regular expression: https://docs.python.org/3/library/re.html
 .. _scrapy-selenium: https://github.com/clemfromspace/scrapy-selenium
 .. _scrapy-splash: https://github.com/scrapy-plugins/scrapy-splash
 .. _Selenium: https://www.selenium.dev/
@@ -7,7 +7,7 @@ Sending e-mail
 .. module:: scrapy.mail
    :synopsis: Email sending facility
 
-Although Python makes sending e-mails relatively easy via the `smtplib`_
+Although Python makes sending e-mails relatively easy via the :mod:`smtplib`
 library, Scrapy provides its own facility for sending e-mails which is very
 easy to use and it's implemented using :doc:`Twisted non-blocking IO
 <twisted:core/howto/defer-intro>`, to avoid interfering with the non-blocking
@@ -15,8 +15,6 @@ IO of the crawler. It also provides a simple API for sending attachments and
 it's very easy to configure, with a few :ref:`settings
 <topics-email-settings>`.
 
-.. _smtplib: https://docs.python.org/2/library/smtplib.html
-
 Quick example
 =============
 
@@ -14,13 +14,6 @@ Built-in Exceptions reference
 
 Here's a list of all exceptions included in Scrapy and their usage.
 
-DropItem
---------
-
-.. exception:: DropItem
-
-    The exception that must be raised by item pipeline stages to stop processing an
-    Item. For more information see :ref:`topics-item-pipeline`.
-
 CloseSpider
 -----------
@@ -47,6 +40,14 @@ DontCloseSpider
 This exception can be raised in a :signal:`spider_idle` signal handler to
 prevent the spider from being closed.
 
+DropItem
+--------
+
+.. exception:: DropItem
+
+    The exception that must be raised by item pipeline stages to stop processing an
+    Item. For more information see :ref:`topics-item-pipeline`.
+
 IgnoreRequest
 -------------
 
@@ -77,3 +78,37 @@ NotSupported
 
 This exception is raised to indicate an unsupported feature.
 
+StopDownload
+-------------
+
+.. versionadded:: 2.2
+
+.. exception:: StopDownload(fail=True)
+
+    Raised from a :class:`~scrapy.signals.bytes_received` signal handler to
+    indicate that no further bytes should be downloaded for a response.
+
+    The ``fail`` boolean parameter controls which method will handle the resulting
+    response:
+
+    * If ``fail=True`` (default), the request errback is called. The response object is
+      available as the ``response`` attribute of the ``StopDownload`` exception,
+      which is in turn stored as the ``value`` attribute of the received
+      :class:`~twisted.python.failure.Failure` object. This means that in an errback
+      defined as ``def errback(self, failure)``, the response can be accessed through
+      ``failure.value.response``.
+
+    * If ``fail=False``, the request callback is called instead.
+
+    In both cases, the response could have its body truncated: the body contains
+    all bytes received up until the exception is raised, including the bytes
+    received in the signal handler that raises the exception. Also, the response
+    object is marked with ``"download_stopped"`` in its :attr:`Response.flags`
+    attribute.
+
+    .. note:: ``fail`` is a keyword-only parameter, i.e. raising
+        ``StopDownload(False)`` or ``StopDownload(True)`` will raise
+        a :class:`TypeError`.
+
+    See the documentation for the :class:`~scrapy.signals.bytes_received` signal
+    and the :ref:`topics-stop-response-download` topic for additional information and examples.
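To make the ``fail=True`` path above concrete, an illustrative sketch (not from the diff; URL and spider name are placeholders) where a ``bytes_received`` handler stops the download and the errback reads the partial response from ``failure.value.response``::

    import scrapy
    from scrapy import signals
    from scrapy.exceptions import StopDownload


    class StopEarlySpider(scrapy.Spider):
        name = "stop_early"

        @classmethod
        def from_crawler(cls, crawler, *args, **kwargs):
            spider = super().from_crawler(crawler, *args, **kwargs)
            crawler.signals.connect(spider.on_bytes_received,
                                    signal=signals.bytes_received)
            return spider

        def start_requests(self):
            yield scrapy.Request("https://example.com",
                                 callback=self.parse, errback=self.errback)

        def on_bytes_received(self, data, request, spider):
            raise StopDownload(fail=True)  # the errback will receive the response

        def parse(self, response):
            pass  # not reached when the download is stopped with fail=True

        def errback(self, failure):
            response = failure.value.response
            self.logger.info("Stopped %s after %d bytes",
                             response.url, len(response.body))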
@@ -40,6 +40,7 @@ Here you can see an :doc:`Item Pipeline <item-pipeline>` which uses multiple
 Item Exporters to group scraped items to different files according to the
 value of one of their fields::
 
+    from itemadapter import ItemAdapter
     from scrapy.exporters import XmlItemExporter
 
     class PerYearXmlExportPipeline:
@@ -53,7 +54,8 @@ value of one of their fields::
                 exporter.finish_exporting()
 
         def _exporter_for_item(self, item):
-            year = item['year']
+            adapter = ItemAdapter(item)
+            year = adapter['year']
             if year not in self.year_to_exporter:
                 f = open('{}.xml'.format(year), 'wb')
                 exporter = XmlItemExporter(f)
@@ -167,9 +169,10 @@ BaseItemExporter
 value unchanged except for ``unicode`` values which are encoded to
 ``str`` using the encoding declared in the :attr:`encoding` attribute.
 
-:param field: the field being serialized. If a raw dict is being
-    exported (not :class:`~.Item`) *field* value is an empty dict.
-:type field: :class:`~scrapy.item.Field` object or an empty dict
+:param field: the field being serialized. If the source :ref:`item object
+    <item-types>` does not define field metadata, *field* is an empty
+    :class:`dict`.
+:type field: :class:`~scrapy.item.Field` object or a :class:`dict` instance
 
 :param name: the name of the field being serialized
 :type name: str
@@ -192,14 +195,17 @@ BaseItemExporter
 
 .. attribute:: fields_to_export
 
-    A list with the name of the fields that will be exported, or None if you
-    want to export all fields. Defaults to None.
+    A list with the name of the fields that will be exported, or ``None`` if
+    you want to export all fields. Defaults to ``None``.
 
     Some exporters (like :class:`CsvItemExporter`) respect the order of the
     fields defined in this attribute.
 
-    Some exporters may require fields_to_export list in order to export the
-    data properly when spiders return dicts (not :class:`~Item` instances).
+    When using :ref:`item objects <item-types>` that do not expose all their
+    possible fields, exporters that do not support exporting a different
+    subset of fields per item will only export the fields found in the first
+    item exported. Use ``fields_to_export`` to define all the fields to be
+    exported.
 
 .. attribute:: export_empty_fields
 
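As a sketch of the ``fields_to_export`` guidance above (not part of the diff; the item values are made up), declaring the attribute keeps the CSV columns stable even when individual items omit some fields::

    import io

    from scrapy.exporters import CsvItemExporter

    buffer = io.BytesIO()
    exporter = CsvItemExporter(buffer, fields_to_export=["name", "price"])
    exporter.start_exporting()
    exporter.export_item({"name": "Color TV", "price": 1200})
    exporter.export_item({"name": "DVD player"})  # missing field exported as empty
    exporter.finish_exporting()
    print(buffer.getvalue().decode())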
@@ -236,9 +242,9 @@ PythonItemExporter
 XmlItemExporter
 ---------------
 
-.. class:: XmlItemExporter(file, item_element='item', root_element='items', \**kwargs)
+.. class:: XmlItemExporter(file, item_element='item', root_element='items', **kwargs)
 
-    Exports Items in XML format to the specified file object.
+    Exports items in XML format to the specified file object.
 
     :param file: the file-like object to use for exporting the data. Its ``write`` method should
        accept ``bytes`` (a disk file opened in binary mode, a ``io.BytesIO`` object, etc)
@@ -290,9 +296,9 @@ XmlItemExporter
 CsvItemExporter
 ---------------
 
-.. class:: CsvItemExporter(file, include_headers_line=True, join_multivalued=',', \**kwargs)
+.. class:: CsvItemExporter(file, include_headers_line=True, join_multivalued=',', **kwargs)
 
-    Exports Items in CSV format to the given file-like object. If the
+    Exports items in CSV format to the given file-like object. If the
     :attr:`fields_to_export` attribute is set, it will be used to define the
     CSV columns and their order. The :attr:`export_empty_fields` attribute has
     no effect on this exporter.
@@ -311,7 +317,7 @@ CsvItemExporter
 
     The additional keyword arguments of this ``__init__`` method are passed to the
     :class:`BaseItemExporter` ``__init__`` method, and the leftover arguments to the
-    `csv.writer`_ ``__init__`` method, so you can use any ``csv.writer`` ``__init__`` method
+    :func:`csv.writer` function, so you can use any :func:`csv.writer` function
     argument to customize this exporter.
 
     A typical output of this exporter would be::
@ -320,14 +326,12 @@ CsvItemExporter
|
|||||||
Color TV,1200
|
Color TV,1200
|
||||||
DVD player,200
|
DVD player,200
|
||||||
|
|
||||||
.. _csv.writer: https://docs.python.org/2/library/csv.html#csv.writer
|
|
||||||
|
|
||||||
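A hedged sketch of forwarding a ``csv.writer`` argument through the exporter (the tab ``delimiter`` is just an example)::

    from scrapy.exporters import CsvItemExporter

    with open('products.tsv', 'wb') as f:
        # keyword arguments not consumed by BaseItemExporter, such as
        # ``delimiter``, are forwarded to csv.writer
        exporter = CsvItemExporter(f, delimiter='\t')
        exporter.start_exporting()
        exporter.export_item({'product': 'Color TV', 'price': '1200'})
        exporter.finish_exporting()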
PickleItemExporter
|
PickleItemExporter
|
||||||
------------------
|
------------------
|
||||||
|
|
||||||
.. class:: PickleItemExporter(file, protocol=0, \**kwargs)
|
.. class:: PickleItemExporter(file, protocol=0, **kwargs)
|
||||||
|
|
||||||
Exports Items in pickle format to the given file-like object.
|
Exports items in pickle format to the given file-like object.
|
||||||
|
|
||||||
:param file: the file-like object to use for exporting the data. Its ``write`` method should
|
:param file: the file-like object to use for exporting the data. Its ``write`` method should
|
||||||
accept ``bytes`` (a disk file opened in binary mode, a ``io.BytesIO`` object, etc)
|
accept ``bytes`` (a disk file opened in binary mode, a ``io.BytesIO`` object, etc)
|
||||||
@ -335,21 +339,19 @@ PickleItemExporter
|
|||||||
:param protocol: The pickle protocol to use.
|
:param protocol: The pickle protocol to use.
|
||||||
:type protocol: int
|
:type protocol: int
|
||||||
|
|
||||||
For more information, refer to the `pickle module documentation`_.
|
For more information, see :mod:`pickle`.
|
||||||
|
|
||||||
The additional keyword arguments of this ``__init__`` method are passed to the
|
The additional keyword arguments of this ``__init__`` method are passed to the
|
||||||
:class:`BaseItemExporter` ``__init__`` method.
|
:class:`BaseItemExporter` ``__init__`` method.
|
||||||
|
|
||||||
Pickle isn't a human readable format, so no output examples are provided.
|
Pickle isn't a human readable format, so no output examples are provided.
|
||||||
|
|
||||||
.. _pickle module documentation: https://docs.python.org/2/library/pickle.html
|
|
||||||
|
|
||||||
PprintItemExporter
|
PprintItemExporter
|
||||||
------------------
|
------------------
|
||||||
|
|
||||||
.. class:: PprintItemExporter(file, \**kwargs)
|
.. class:: PprintItemExporter(file, **kwargs)
|
||||||
|
|
||||||
Exports Items in pretty print format to the specified file object.
|
Exports items in pretty print format to the specified file object.
|
||||||
|
|
||||||
:param file: the file-like object to use for exporting the data. Its ``write`` method should
|
:param file: the file-like object to use for exporting the data. Its ``write`` method should
|
||||||
accept ``bytes`` (a disk file opened in binary mode, a ``io.BytesIO`` object, etc)
|
accept ``bytes`` (a disk file opened in binary mode, a ``io.BytesIO`` object, etc)
|
||||||
@ -367,13 +369,13 @@ PprintItemExporter
|
|||||||
JsonItemExporter
|
JsonItemExporter
|
||||||
----------------
|
----------------
|
||||||
|
|
||||||
.. class:: JsonItemExporter(file, \**kwargs)
|
.. class:: JsonItemExporter(file, **kwargs)
|
||||||
|
|
||||||
Exports Items in JSON format to the specified file-like object, writing all
|
Exports items in JSON format to the specified file-like object, writing all
|
||||||
objects as a list of objects. The additional ``__init__`` method arguments are
|
objects as a list of objects. The additional ``__init__`` method arguments are
|
||||||
passed to the :class:`BaseItemExporter` ``__init__`` method, and the leftover
|
passed to the :class:`BaseItemExporter` ``__init__`` method, and the leftover
|
||||||
arguments to the `JSONEncoder`_ ``__init__`` method, so you can use any
|
arguments to the :class:`~json.JSONEncoder` ``__init__`` method, so you can use any
|
||||||
`JSONEncoder`_ ``__init__`` method argument to customize this exporter.
|
:class:`~json.JSONEncoder` ``__init__`` method argument to customize this exporter.
|
||||||
|
|
||||||
:param file: the file-like object to use for exporting the data. Its ``write`` method should
|
:param file: the file-like object to use for exporting the data. Its ``write`` method should
|
||||||
accept ``bytes`` (a disk file opened in binary mode, a ``io.BytesIO`` object, etc)
|
accept ``bytes`` (a disk file opened in binary mode, a ``io.BytesIO`` object, etc)
|
||||||
@ -393,18 +395,16 @@ JsonItemExporter
|
|||||||
stream-friendly format, consider using :class:`JsonLinesItemExporter`
|
stream-friendly format, consider using :class:`JsonLinesItemExporter`
|
||||||
instead, or splitting the output in multiple chunks.
|
instead, or splitting the output in multiple chunks.
|
||||||
|
|
||||||
.. _JSONEncoder: https://docs.python.org/2/library/json.html#json.JSONEncoder
|
|
||||||
|
|
||||||
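A minimal sketch, assuming you want indented output (``indent`` and ``ensure_ascii`` are standard :class:`~json.JSONEncoder` arguments)::

    from scrapy.exporters import JsonItemExporter

    with open('items.json', 'wb') as f:
        # leftover keyword arguments are forwarded to json.JSONEncoder
        exporter = JsonItemExporter(f, indent=4, ensure_ascii=False)
        exporter.start_exporting()
        exporter.export_item({'name': 'Color TV', 'price': 1200})
        exporter.finish_exporting()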
JsonLinesItemExporter
|
JsonLinesItemExporter
|
||||||
---------------------
|
---------------------
|
||||||
|
|
||||||
.. class:: JsonLinesItemExporter(file, \**kwargs)
|
.. class:: JsonLinesItemExporter(file, **kwargs)
|
||||||
|
|
||||||
Exports Items in JSON format to the specified file-like object, writing one
|
Exports items in JSON format to the specified file-like object, writing one
|
||||||
JSON-encoded item per line. The additional ``__init__`` method arguments are passed
|
JSON-encoded item per line. The additional ``__init__`` method arguments are passed
|
||||||
to the :class:`BaseItemExporter` ``__init__`` method, and the leftover arguments to
|
to the :class:`BaseItemExporter` ``__init__`` method, and the leftover arguments to
|
||||||
the `JSONEncoder`_ ``__init__`` method, so you can use any `JSONEncoder`_
|
the :class:`~json.JSONEncoder` ``__init__`` method, so you can use any
|
||||||
``__init__`` method argument to customize this exporter.
|
:class:`~json.JSONEncoder` ``__init__`` method argument to customize this exporter.
|
||||||
|
|
||||||
:param file: the file-like object to use for exporting the data. Its ``write`` method should
|
:param file: the file-like object to use for exporting the data. Its ``write`` method should
|
||||||
accept ``bytes`` (a disk file opened in binary mode, a ``io.BytesIO`` object, etc)
|
accept ``bytes`` (a disk file opened in binary mode, a ``io.BytesIO`` object, etc)
|
||||||
@ -417,8 +417,6 @@ JsonLinesItemExporter
|
|||||||
Unlike the one produced by :class:`JsonItemExporter`, the format produced by
|
Unlike the one produced by :class:`JsonItemExporter`, the format produced by
|
||||||
this exporter is well suited for serializing large amounts of data.
|
this exporter is well suited for serializing large amounts of data.
|
||||||
|
|
||||||
.. _JSONEncoder: https://docs.python.org/2/library/json.html#json.JSONEncoder
|
|
||||||
|
|
||||||
MarshalItemExporter
|
MarshalItemExporter
|
||||||
-------------------
|
-------------------
|
||||||
|
|
||||||
|
@ -364,7 +364,7 @@ Debugger extension
|
|||||||
|
|
||||||
.. class:: Debugger
|
.. class:: Debugger
|
||||||
|
|
||||||
Invokes a `Python debugger`_ inside a running Scrapy process when a `SIGUSR2`_
|
Invokes a :doc:`Python debugger <library/pdb>` inside a running Scrapy process when a `SIGUSR2`_
|
||||||
signal is received. After the debugger is exited, the Scrapy process continues
|
signal is received. After the debugger is exited, the Scrapy process continues
|
||||||
running normally.
|
running normally.
|
||||||
|
|
||||||
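A minimal sketch of triggering the debugger from another Python process (the PID below is a placeholder for the actual Scrapy process ID)::

    import os
    import signal

    scrapy_pid = 12345  # placeholder: PID of the running Scrapy process
    os.kill(scrapy_pid, signal.SIGUSR2)  # the Scrapy process drops into the debugger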
@ -372,5 +372,4 @@ For more info see `Debugging in Python`_.
|
|||||||
|
|
||||||
This extension only works on POSIX-compliant platforms (i.e. not Windows).
|
This extension only works on POSIX-compliant platforms (i.e. not Windows).
|
||||||
|
|
||||||
.. _Python debugger: https://docs.python.org/2/library/pdb.html
|
|
||||||
.. _Debugging in Python: https://pythonconquerstheuniverse.wordpress.com/2009/09/10/debugging-in-python/
|
.. _Debugging in Python: https://pythonconquerstheuniverse.wordpress.com/2009/09/10/debugging-in-python/
|
||||||
|
@ -298,8 +298,8 @@ Example: ``FEED_EXPORT_FIELDS = ["foo", "bar", "baz"]``.
|
|||||||
|
|
||||||
Use the FEED_EXPORT_FIELDS option to define fields to export and their order.
|
Use the FEED_EXPORT_FIELDS option to define fields to export and their order.
|
||||||
|
|
||||||
When FEED_EXPORT_FIELDS is empty or None (default), Scrapy uses fields
|
When FEED_EXPORT_FIELDS is empty or None (default), Scrapy uses the fields
|
||||||
defined in dicts or :class:`~.Item` subclasses a spider is yielding.
|
defined in :ref:`item objects <topics-items>` yielded by your spider.
|
||||||
|
|
||||||
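For example, a project could pin the exported fields in its settings (the field names are illustrative)::

    # settings.py
    FEED_EXPORT_FIELDS = ['name', 'price', 'stock']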
If an exporter requires a fixed set of fields (this is the case for
|
If an exporter requires a fixed set of fields (this is the case for
|
||||||
:ref:`CSV <topics-feed-format-csv>` export format) and FEED_EXPORT_FIELDS
|
:ref:`CSV <topics-feed-format-csv>` export format) and FEED_EXPORT_FIELDS
|
||||||
|
@ -27,15 +27,19 @@ Each item pipeline component is a Python class that must implement the following
|
|||||||
|
|
||||||
.. method:: process_item(self, item, spider)
|
.. method:: process_item(self, item, spider)
|
||||||
|
|
||||||
This method is called for every item pipeline component. :meth:`process_item`
|
This method is called for every item pipeline component.
|
||||||
must either: return a dict with data, return an :class:`~scrapy.item.Item`
|
|
||||||
(or any descendant class) object, return a
|
|
||||||
:class:`~twisted.internet.defer.Deferred` or raise
|
|
||||||
:exc:`~scrapy.exceptions.DropItem` exception. Dropped items are no longer
|
|
||||||
processed by further pipeline components.
|
|
||||||
|
|
||||||
:param item: the item scraped
|
`item` is an :ref:`item object <item-types>`, see
|
||||||
:type item: :class:`~scrapy.item.Item` object or a dict
|
:ref:`supporting-item-types`.
|
||||||
|
|
||||||
|
:meth:`process_item` must either: return an :ref:`item object <item-types>`,
|
||||||
|
return a :class:`~twisted.internet.defer.Deferred` or raise a
|
||||||
|
:exc:`~scrapy.exceptions.DropItem` exception.
|
||||||
|
|
||||||
|
Dropped items are no longer processed by further pipeline components.
|
||||||
|
|
||||||
|
:param item: the scraped item
|
||||||
|
:type item: :ref:`item object <item-types>`
|
||||||
|
|
||||||
:param spider: the spider which scraped the item
|
:param spider: the spider which scraped the item
|
||||||
:type spider: :class:`~scrapy.spiders.Spider` object
|
:type spider: :class:`~scrapy.spiders.Spider` object
|
||||||
@ -79,16 +83,17 @@ Let's take a look at the following hypothetical pipeline that adjusts the
|
|||||||
(``price_excludes_vat`` attribute), and drops those items which don't
|
(``price_excludes_vat`` attribute), and drops those items which don't
|
||||||
contain a price::
|
contain a price::
|
||||||
|
|
||||||
|
from itemadapter import ItemAdapter
|
||||||
from scrapy.exceptions import DropItem
|
from scrapy.exceptions import DropItem
|
||||||
|
|
||||||
class PricePipeline:
|
class PricePipeline:
|
||||||
|
|
||||||
vat_factor = 1.15
|
vat_factor = 1.15
|
||||||
|
|
||||||
def process_item(self, item, spider):
|
def process_item(self, item, spider):
|
||||||
if item.get('price'):
|
adapter = ItemAdapter(item)
|
||||||
if item.get('price_excludes_vat'):
|
if adapter.get('price'):
|
||||||
item['price'] = item['price'] * self.vat_factor
|
if adapter.get('price_excludes_vat'):
|
||||||
|
adapter['price'] = adapter['price'] * self.vat_factor
|
||||||
return item
|
return item
|
||||||
else:
|
else:
|
||||||
raise DropItem("Missing price in %s" % item)
|
raise DropItem("Missing price in %s" % item)
|
||||||
@ -103,6 +108,8 @@ format::
|
|||||||
|
|
||||||
import json
|
import json
|
||||||
|
|
||||||
|
from itemadapter import ItemAdapter
|
||||||
|
|
||||||
class JsonWriterPipeline:
|
class JsonWriterPipeline:
|
||||||
|
|
||||||
def open_spider(self, spider):
|
def open_spider(self, spider):
|
||||||
@ -112,7 +119,7 @@ format::
|
|||||||
self.file.close()
|
self.file.close()
|
||||||
|
|
||||||
def process_item(self, item, spider):
|
def process_item(self, item, spider):
|
||||||
line = json.dumps(dict(item)) + "\n"
|
line = json.dumps(ItemAdapter(item).asdict()) + "\n"
|
||||||
self.file.write(line)
|
self.file.write(line)
|
||||||
return item
|
return item
|
||||||
|
|
||||||
@ -131,6 +138,7 @@ The main point of this example is to show how to use :meth:`from_crawler`
|
|||||||
method and how to clean up the resources properly::
|
method and how to clean up the resources properly::
|
||||||
|
|
||||||
import pymongo
|
import pymongo
|
||||||
|
from itemadapter import ItemAdapter
|
||||||
|
|
||||||
class MongoPipeline:
|
class MongoPipeline:
|
||||||
|
|
||||||
@ -155,7 +163,7 @@ method and how to clean up the resources properly.::
|
|||||||
self.client.close()
|
self.client.close()
|
||||||
|
|
||||||
def process_item(self, item, spider):
|
def process_item(self, item, spider):
|
||||||
self.db[self.collection_name].insert_one(dict(item))
|
self.db[self.collection_name].insert_one(ItemAdapter(item).asdict())
|
||||||
return item
|
return item
|
||||||
|
|
||||||
.. _MongoDB: https://www.mongodb.com/
|
.. _MongoDB: https://www.mongodb.com/
|
||||||
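Pipelines such as the ones above only run once they are enabled through the ``ITEM_PIPELINES`` setting; a hedged sketch, with hypothetical module paths::

    # settings.py
    ITEM_PIPELINES = {
        'myproject.pipelines.PricePipeline': 300,  # lower numbers run first
        'myproject.pipelines.MongoPipeline': 800,
    }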
@ -167,18 +175,21 @@ method and how to clean up the resources properly.::
|
|||||||
Take screenshot of item
|
Take screenshot of item
|
||||||
-----------------------
|
-----------------------
|
||||||
|
|
||||||
This example demonstrates how to return a
|
This example demonstrates how to use :doc:`coroutine syntax <coroutines>` in
|
||||||
:class:`~twisted.internet.defer.Deferred` from the :meth:`process_item` method.
|
the :meth:`process_item` method.
|
||||||
It uses Splash_ to render screenshot of item url. Pipeline
|
|
||||||
makes request to locally running instance of Splash_. After request is downloaded,
|
This item pipeline makes a request to a locally-running instance of Splash_ to
|
||||||
it saves the screenshot to a file and adds filename to the item.
|
render a screenshot of the item URL. After the response is downloaded,
|
||||||
|
the item pipeline saves the screenshot to a file and adds the filename to the
|
||||||
|
item.
|
||||||
|
|
||||||
::
|
::
|
||||||
|
|
||||||
import scrapy
|
|
||||||
import hashlib
|
import hashlib
|
||||||
from urllib.parse import quote
|
from urllib.parse import quote
|
||||||
|
|
||||||
|
import scrapy
|
||||||
|
from itemadapter import ItemAdapter
|
||||||
|
|
||||||
class ScreenshotPipeline:
|
class ScreenshotPipeline:
|
||||||
"""Pipeline that uses Splash to render screenshot of
|
"""Pipeline that uses Splash to render screenshot of
|
||||||
@ -187,7 +198,8 @@ it saves the screenshot to a file and adds filename to the item.
|
|||||||
SPLASH_URL = "http://localhost:8050/render.png?url={}"
|
SPLASH_URL = "http://localhost:8050/render.png?url={}"
|
||||||
|
|
||||||
async def process_item(self, item, spider):
|
async def process_item(self, item, spider):
|
||||||
encoded_item_url = quote(item["url"])
|
adapter = ItemAdapter(item)
|
||||||
|
encoded_item_url = quote(adapter["url"])
|
||||||
screenshot_url = self.SPLASH_URL.format(encoded_item_url)
|
screenshot_url = self.SPLASH_URL.format(encoded_item_url)
|
||||||
request = scrapy.Request(screenshot_url)
|
request = scrapy.Request(screenshot_url)
|
||||||
response = await spider.crawler.engine.download(request, spider)
|
response = await spider.crawler.engine.download(request, spider)
|
||||||
@ -197,14 +209,14 @@ it saves the screenshot to a file and adds filename to the item.
|
|||||||
return item
|
return item
|
||||||
|
|
||||||
# Save screenshot to file, filename will be hash of url.
|
# Save screenshot to file, filename will be hash of url.
|
||||||
url = item["url"]
|
url = adapter["url"]
|
||||||
url_hash = hashlib.md5(url.encode("utf8")).hexdigest()
|
url_hash = hashlib.md5(url.encode("utf8")).hexdigest()
|
||||||
filename = "{}.png".format(url_hash)
|
filename = "{}.png".format(url_hash)
|
||||||
with open(filename, "wb") as f:
|
with open(filename, "wb") as f:
|
||||||
f.write(response.body)
|
f.write(response.body)
|
||||||
|
|
||||||
# Store filename in item.
|
# Store filename in item.
|
||||||
item["screenshot_filename"] = filename
|
adapter["screenshot_filename"] = filename
|
||||||
return item
|
return item
|
||||||
|
|
||||||
.. _Splash: https://splash.readthedocs.io/en/stable/
|
.. _Splash: https://splash.readthedocs.io/en/stable/
|
||||||
@ -217,6 +229,7 @@ already processed. Let's say that our items have a unique id, but our spider
|
|||||||
returns multiple items with the same id::
|
returns multiple items with the same id::
|
||||||
|
|
||||||
|
|
||||||
|
from itemadapter import ItemAdapter
|
||||||
from scrapy.exceptions import DropItem
|
from scrapy.exceptions import DropItem
|
||||||
|
|
||||||
class DuplicatesPipeline:
|
class DuplicatesPipeline:
|
||||||
@ -225,10 +238,11 @@ returns multiples items with the same id::
|
|||||||
self.ids_seen = set()
|
self.ids_seen = set()
|
||||||
|
|
||||||
def process_item(self, item, spider):
|
def process_item(self, item, spider):
|
||||||
if item['id'] in self.ids_seen:
|
adapter = ItemAdapter(item)
|
||||||
raise DropItem("Duplicate item found: %s" % item)
|
if adapter['id'] in self.ids_seen:
|
||||||
|
raise DropItem("Duplicate item found: %r" % item)
|
||||||
else:
|
else:
|
||||||
self.ids_seen.add(item['id'])
|
self.ids_seen.add(adapter['id'])
|
||||||
return item
|
return item
|
||||||
|
|
||||||
|
|
||||||
|
@ -8,31 +8,155 @@ Items
|
|||||||
:synopsis: Item and Field classes
|
:synopsis: Item and Field classes
|
||||||
|
|
||||||
The main goal in scraping is to extract structured data from unstructured
|
The main goal in scraping is to extract structured data from unstructured
|
||||||
sources, typically, web pages. Scrapy spiders can return the extracted data
|
sources, typically, web pages. :ref:`Spiders <topics-spiders>` may return the
|
||||||
as Python dicts. While convenient and familiar, Python dicts lack structure:
|
extracted data as `items`, Python objects that define key-value pairs.
|
||||||
it is easy to make a typo in a field name or return inconsistent data,
|
|
||||||
especially in a larger project with many spiders.
|
|
||||||
|
|
||||||
To define common output data format Scrapy provides the :class:`Item` class.
|
Scrapy supports :ref:`multiple types of items <item-types>`. When you create an
|
||||||
:class:`Item` objects are simple containers used to collect the scraped data.
|
item, you may use whichever type of item you want. When you write code that
|
||||||
They provide a `dictionary-like`_ API with a convenient syntax for declaring
|
receives an item, your code should :ref:`work for any item type
|
||||||
their available fields.
|
<supporting-item-types>`.
|
||||||
|
|
||||||
Various Scrapy components use extra information provided by Items:
|
.. _item-types:
|
||||||
exporters look at declared fields to figure out columns to export,
|
|
||||||
serialization can be customized using Item fields metadata, :mod:`trackref`
|
|
||||||
tracks Item instances to help find memory leaks
|
|
||||||
(see :ref:`topics-leaks-trackrefs`), etc.
|
|
||||||
|
|
||||||
.. _dictionary-like: https://docs.python.org/2/library/stdtypes.html#dict
|
Item Types
|
||||||
|
==========
|
||||||
|
|
||||||
|
Scrapy supports the following types of items, via the `itemadapter`_ library:
|
||||||
|
:ref:`dictionaries <dict-items>`, :ref:`Item objects <item-objects>`,
|
||||||
|
:ref:`dataclass objects <dataclass-items>`, and :ref:`attrs objects <attrs-items>`.
|
||||||
|
|
||||||
|
.. _itemadapter: https://github.com/scrapy/itemadapter
|
||||||
|
|
||||||
|
.. _dict-items:
|
||||||
|
|
||||||
|
Dictionaries
|
||||||
|
------------
|
||||||
|
|
||||||
|
As an item type, :class:`dict` is convenient and familiar.
|
||||||
|
|
||||||
|
.. _item-objects:
|
||||||
|
|
||||||
|
Item objects
|
||||||
|
------------
|
||||||
|
|
||||||
|
:class:`Item` provides a :class:`dict`-like API plus additional features that
|
||||||
|
make it the most feature-complete item type:
|
||||||
|
|
||||||
|
.. class:: Item([arg])
|
||||||
|
|
||||||
|
:class:`Item` objects replicate the standard :class:`dict` API, including
|
||||||
|
its ``__init__`` method.
|
||||||
|
|
||||||
|
:class:`Item` allows defining field names, so that:
|
||||||
|
|
||||||
|
- :class:`KeyError` is raised when using undefined field names (i.e.
|
||||||
|
prevents typos going unnoticed)
|
||||||
|
|
||||||
|
- :ref:`Item exporters <topics-exporters>` can export all fields by
|
||||||
|
default even if the first scraped object does not have values for all
|
||||||
|
of them
|
||||||
|
|
||||||
|
:class:`Item` also allows defining field metadata, which can be used to
|
||||||
|
:ref:`customize serialization <topics-exporters-field-serialization>`.
|
||||||
|
|
||||||
|
:mod:`trackref` tracks :class:`Item` objects to help find memory leaks
|
||||||
|
(see :ref:`topics-leaks-trackrefs`).
|
||||||
|
|
||||||
|
:class:`Item` objects also provide the following additional API members:
|
||||||
|
|
||||||
|
.. automethod:: copy
|
||||||
|
|
||||||
|
.. automethod:: deepcopy
|
||||||
|
|
||||||
|
.. attribute:: fields
|
||||||
|
|
||||||
|
A dictionary containing *all declared fields* for this Item, not only
|
||||||
|
those populated. The keys are the field names and the values are the
|
||||||
|
:class:`Field` objects used in the :ref:`Item declaration
|
||||||
|
<topics-items-declaring>`.
|
||||||
|
|
||||||
|
Example::
|
||||||
|
|
||||||
|
from scrapy.item import Item, Field
|
||||||
|
|
||||||
|
class CustomItem(Item):
|
||||||
|
one_field = Field()
|
||||||
|
another_field = Field()
|
||||||
|
|
||||||
|
.. _dataclass-items:
|
||||||
|
|
||||||
|
Dataclass objects
|
||||||
|
-----------------
|
||||||
|
|
||||||
|
.. versionadded:: 2.2
|
||||||
|
|
||||||
|
:func:`~dataclasses.dataclass` allows defining item classes with field names,
|
||||||
|
so that :ref:`item exporters <topics-exporters>` can export all fields by
|
||||||
|
default even if the first scraped object does not have values for all of them.
|
||||||
|
|
||||||
|
Additionally, ``dataclass`` items also allow you to:
|
||||||
|
|
||||||
|
* define the type and default value of each defined field.
|
||||||
|
|
||||||
|
* define custom field metadata through :func:`dataclasses.field`, which can be used to
|
||||||
|
:ref:`customize serialization <topics-exporters-field-serialization>`.
|
||||||
|
|
||||||
|
They work natively in Python 3.7 or later, or using the `dataclasses
|
||||||
|
backport`_ in Python 3.6.
|
||||||
|
|
||||||
|
.. _dataclasses backport: https://pypi.org/project/dataclasses/
|
||||||
|
|
||||||
|
Example::
|
||||||
|
|
||||||
|
from dataclasses import dataclass
|
||||||
|
|
||||||
|
@dataclass
|
||||||
|
class CustomItem:
|
||||||
|
one_field: str
|
||||||
|
another_field: int
|
||||||
|
|
||||||
|
.. note:: Field types are not enforced at run time.
|
||||||
|
|
||||||
|
.. _attrs-items:
|
||||||
|
|
||||||
|
attr.s objects
|
||||||
|
--------------
|
||||||
|
|
||||||
|
.. versionadded:: 2.2
|
||||||
|
|
||||||
|
:func:`attr.s` allows defining item classes with field names,
|
||||||
|
so that :ref:`item exporters <topics-exporters>` can export all fields by
|
||||||
|
default even if the first scraped object does not have values for all of them.
|
||||||
|
|
||||||
|
Additionally, ``attr.s`` items also allow you to:
|
||||||
|
|
||||||
|
* define the type and default value of each defined field.
|
||||||
|
|
||||||
|
* define custom field :ref:`metadata <attrs:metadata>`, which can be used to
|
||||||
|
:ref:`customize serialization <topics-exporters-field-serialization>`.
|
||||||
|
|
||||||
|
In order to use this type, the :doc:`attrs package <attrs:index>` needs to be installed.
|
||||||
|
|
||||||
|
Example::
|
||||||
|
|
||||||
|
import attr
|
||||||
|
|
||||||
|
@attr.s
|
||||||
|
class CustomItem:
|
||||||
|
one_field = attr.ib()
|
||||||
|
another_field = attr.ib()
|
||||||
|
|
||||||
|
|
||||||
|
Working with Item objects
|
||||||
|
=========================
|
||||||
|
|
||||||
.. _topics-items-declaring:
|
.. _topics-items-declaring:
|
||||||
|
|
||||||
Declaring Items
|
Declaring Item subclasses
|
||||||
===============
|
-------------------------
|
||||||
|
|
||||||
Items are declared using a simple class definition syntax and :class:`Field`
|
Item subclasses are declared using a simple class definition syntax and
|
||||||
objects. Here is an example::
|
:class:`Field` objects. Here is an example::
|
||||||
|
|
||||||
import scrapy
|
import scrapy
|
||||||
|
|
||||||
@ -50,10 +174,11 @@ objects. Here is an example::
|
|||||||
.. _Django: https://www.djangoproject.com/
|
.. _Django: https://www.djangoproject.com/
|
||||||
.. _Django Models: https://docs.djangoproject.com/en/dev/topics/db/models/
|
.. _Django Models: https://docs.djangoproject.com/en/dev/topics/db/models/
|
||||||
|
|
||||||
|
|
||||||
.. _topics-items-fields:
|
.. _topics-items-fields:
|
||||||
|
|
||||||
Item Fields
|
Declaring fields
|
||||||
===========
|
----------------
|
||||||
|
|
||||||
:class:`Field` objects are used to specify metadata for each field. For
|
:class:`Field` objects are used to specify metadata for each field. For
|
||||||
example, the serializer function for the ``last_updated`` field illustrated in
|
example, the serializer function for the ``last_updated`` field illustrated in
|
||||||
@ -74,15 +199,31 @@ It's important to note that the :class:`Field` objects used to declare the item
|
|||||||
do not stay assigned as class attributes. Instead, they can be accessed through
|
do not stay assigned as class attributes. Instead, they can be accessed through
|
||||||
the :attr:`Item.fields` attribute.
|
the :attr:`Item.fields` attribute.
|
||||||
|
|
||||||
Working with Items
|
.. class:: Field([arg])
|
||||||
==================
|
|
||||||
|
The :class:`Field` class is just an alias to the built-in :class:`dict` class and
|
||||||
|
doesn't provide any extra functionality or attributes. In other words,
|
||||||
|
:class:`Field` objects are plain-old Python dicts. A separate class is used
|
||||||
|
to support the :ref:`item declaration syntax <topics-items-declaring>`
|
||||||
|
based on class attributes.
|
||||||
|
|
||||||
|
.. note:: Field metadata can also be declared for ``dataclass`` and ``attrs``
|
||||||
|
items. Please refer to the documentation for `dataclasses.field`_ and
|
||||||
|
`attr.ib`_ for additional information.
|
||||||
|
|
||||||
|
.. _dataclasses.field: https://docs.python.org/3/library/dataclasses.html#dataclasses.field
|
||||||
|
.. _attr.ib: https://www.attrs.org/en/stable/api.html#attr.ib
|
||||||
|
|
||||||
|
|
||||||
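A hedged sketch of attaching a ``serializer`` metadata key to a ``dataclass`` field, assuming the serializer convention used by the item exporters::

    from dataclasses import dataclass, field
    from typing import Optional

    @dataclass
    class CustomItem:
        one_field: Optional[str] = field(default=None, metadata={'serializer': str.upper})
        another_field: Optional[int] = field(default=None)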
|
Working with Item objects
|
||||||
|
-------------------------
|
||||||
|
|
||||||
Here are some examples of common tasks performed with items, using the
|
Here are some examples of common tasks performed with items, using the
|
||||||
``Product`` item :ref:`declared above <topics-items-declaring>`. You will
|
``Product`` item :ref:`declared above <topics-items-declaring>`. You will
|
||||||
notice the API is very similar to the `dict API`_.
|
notice the API is very similar to the :class:`dict` API.
|
||||||
|
|
||||||
Creating items
|
Creating items
|
||||||
--------------
|
''''''''''''''
|
||||||
|
|
||||||
>>> product = Product(name='Desktop PC', price=1000)
|
>>> product = Product(name='Desktop PC', price=1000)
|
||||||
>>> print(product)
|
>>> print(product)
|
||||||
@ -90,7 +231,7 @@ Product(name='Desktop PC', price=1000)
|
|||||||
|
|
||||||
|
|
||||||
Getting field values
|
Getting field values
|
||||||
--------------------
|
''''''''''''''''''''
|
||||||
|
|
||||||
>>> product['name']
|
>>> product['name']
|
||||||
Desktop PC
|
Desktop PC
|
||||||
@ -130,7 +271,7 @@ False
|
|||||||
|
|
||||||
|
|
||||||
Setting field values
|
Setting field values
|
||||||
--------------------
|
''''''''''''''''''''
|
||||||
|
|
||||||
>>> product['last_updated'] = 'today'
|
>>> product['last_updated'] = 'today'
|
||||||
>>> product['last_updated']
|
>>> product['last_updated']
|
||||||
@ -143,9 +284,9 @@ KeyError: 'Product does not support field: lala'
|
|||||||
|
|
||||||
|
|
||||||
Accessing all populated values
|
Accessing all populated values
|
||||||
------------------------------
|
''''''''''''''''''''''''''''''
|
||||||
|
|
||||||
To access all populated values, just use the typical `dict API`_:
|
To access all populated values, just use the typical :class:`dict` API:
|
||||||
|
|
||||||
>>> product.keys()
|
>>> product.keys()
|
||||||
['price', 'name']
|
['price', 'name']
|
||||||
@ -157,16 +298,14 @@ To access all populated values, just use the typical `dict API`_:
|
|||||||
.. _copying-items:
|
.. _copying-items:
|
||||||
|
|
||||||
Copying items
|
Copying items
|
||||||
-------------
|
'''''''''''''
|
||||||
|
|
||||||
To copy an item, you must first decide whether you want a shallow copy or a
|
To copy an item, you must first decide whether you want a shallow copy or a
|
||||||
deep copy.
|
deep copy.
|
||||||
|
|
||||||
If your item contains mutable_ values like lists or dictionaries, a shallow
|
If your item contains :term:`mutable` values like lists or dictionaries,
|
||||||
copy will keep references to the same mutable values across all different
|
a shallow copy will keep references to the same mutable values across all
|
||||||
copies.
|
different copies.
|
||||||
|
|
||||||
.. _mutable: https://docs.python.org/3/glossary.html#term-mutable
|
|
||||||
|
|
||||||
For example, if you have an item with a list of tags, and you create a shallow
|
For example, if you have an item with a list of tags, and you create a shallow
|
||||||
copy of that item, both the original item and the copy have the same list of
|
copy of that item, both the original item and the copy have the same list of
|
||||||
@ -175,9 +314,7 @@ other item as well.
|
|||||||
|
|
||||||
If that is not the desired behavior, use a deep copy instead.
|
If that is not the desired behavior, use a deep copy instead.
|
||||||
|
|
||||||
See the `documentation of the copy module`_ for more information.
|
See :mod:`copy` for more information.
|
||||||
|
|
||||||
.. _documentation of the copy module: https://docs.python.org/3/library/copy.html
|
|
||||||
|
|
||||||
To create a shallow copy of an item, you can either call
|
To create a shallow copy of an item, you can either call
|
||||||
:meth:`~scrapy.item.Item.copy` on an existing item
|
:meth:`~scrapy.item.Item.copy` on an existing item
|
||||||
@ -189,7 +326,7 @@ To create a deep copy, call :meth:`~scrapy.item.Item.deepcopy` instead
|
|||||||
|
|
||||||
|
|
||||||
Other common tasks
|
Other common tasks
|
||||||
------------------
|
''''''''''''''''''
|
||||||
|
|
||||||
Creating dicts from items:
|
Creating dicts from items:
|
||||||
|
|
||||||
@ -207,8 +344,8 @@ Traceback (most recent call last):
|
|||||||
KeyError: 'Product does not support field: lala'
|
KeyError: 'Product does not support field: lala'
|
||||||
|
|
||||||
|
|
||||||
Extending Items
|
Extending Item subclasses
|
||||||
===============
|
-------------------------
|
||||||
|
|
||||||
You can extend Items (to add more fields or to change some metadata for some
|
You can extend Items (to add more fields or to change some metadata for some
|
||||||
fields) by declaring a subclass of your original Item.
|
fields) by declaring a subclass of your original Item.
|
||||||
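For example (a hedged sketch, assuming the ``Product`` item declared earlier)::

    class DiscountedProduct(Product):
        discount_percent = scrapy.Field(serializer=str)
        discount_expiration_date = scrapy.Field()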
@ -228,46 +365,25 @@ appending more values, or changing existing values, like this::
|
|||||||
That adds (or replaces) the ``serializer`` metadata key for the ``name`` field,
|
That adds (or replaces) the ``serializer`` metadata key for the ``name`` field,
|
||||||
keeping all the previously existing metadata values.
|
keeping all the previously existing metadata values.
|
||||||
|
|
||||||
Item objects
|
|
||||||
============
|
|
||||||
|
|
||||||
.. class:: Item([arg])
|
.. _supporting-item-types:
|
||||||
|
|
||||||
Return a new Item optionally initialized from the given argument.
|
Supporting All Item Types
|
||||||
|
=========================
|
||||||
|
|
||||||
Items replicate the standard `dict API`_, including its ``__init__`` method, and
|
In code that receives an item, such as methods of :ref:`item pipelines
|
||||||
also provide the following additional API members:
|
<topics-item-pipeline>` or :ref:`spider middlewares
|
||||||
|
<topics-spider-middleware>`, it is a good practice to use the
|
||||||
|
:class:`~itemadapter.ItemAdapter` class and the
|
||||||
|
:func:`~itemadapter.is_item` function to write code that works for
|
||||||
|
any :ref:`supported item type <item-types>`:
|
||||||
|
|
||||||
.. automethod:: copy
|
.. autoclass:: itemadapter.ItemAdapter
|
||||||
|
|
||||||
.. automethod:: deepcopy
|
.. autofunction:: itemadapter.is_item
|
||||||
|
|
||||||
.. attribute:: fields
|
|
||||||
|
|
||||||
A dictionary containing *all declared fields* for this Item, not only
|
|
||||||
those populated. The keys are the field names and the values are the
|
|
||||||
:class:`Field` objects used in the :ref:`Item declaration
|
|
||||||
<topics-items-declaring>`.
|
|
||||||
|
|
||||||
.. _dict API: https://docs.python.org/2/library/stdtypes.html#dict
|
|
||||||
|
|
||||||
Field objects
|
|
||||||
=============
|
|
||||||
|
|
||||||
.. class:: Field([arg])
|
|
||||||
|
|
||||||
The :class:`Field` class is just an alias to the built-in `dict`_ class and
|
|
||||||
doesn't provide any extra functionality or attributes. In other words,
|
|
||||||
:class:`Field` objects are plain-old Python dicts. A separate class is used
|
|
||||||
to support the :ref:`item declaration syntax <topics-items-declaring>`
|
|
||||||
based on class attributes.
|
|
||||||
|
|
||||||
.. _dict: https://docs.python.org/2/library/stdtypes.html#dict
|
|
||||||
|
|
||||||
|
|
||||||
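A minimal sketch of component code that works for any supported item type (the VAT adjustment is illustrative)::

    from itemadapter import ItemAdapter, is_item

    def add_vat(obj):
        if not is_item(obj):
            raise TypeError("expected an item, got %r" % type(obj))
        adapter = ItemAdapter(obj)
        if adapter.get('price') is not None:
            adapter['price'] = adapter['price'] * 1.15
        return obj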
Other classes related to Item
|
Other classes related to items
|
||||||
=============================
|
==============================
|
||||||
|
|
||||||
.. autoclass:: BaseItem
|
|
||||||
|
|
||||||
.. autoclass:: ItemMeta
|
.. autoclass:: ItemMeta
|
||||||
|
@ -4,7 +4,7 @@
|
|||||||
Debugging memory leaks
|
Debugging memory leaks
|
||||||
======================
|
======================
|
||||||
|
|
||||||
In Scrapy, objects such as Requests, Responses and Items have a finite
|
In Scrapy, objects such as requests, responses and items have a finite
|
||||||
lifetime: they are created, used for a while, and finally destroyed.
|
lifetime: they are created, used for a while, and finally destroyed.
|
||||||
|
|
||||||
From all those objects, the Request is probably the one with the longest
|
From all those objects, the Request is probably the one with the longest
|
||||||
@ -61,8 +61,8 @@ Debugging memory leaks with ``trackref``
|
|||||||
========================================
|
========================================
|
||||||
|
|
||||||
:mod:`trackref` is a module provided by Scrapy to debug the most common cases of
|
:mod:`trackref` is a module provided by Scrapy to debug the most common cases of
|
||||||
memory leaks. It basically tracks the references to all live Requests,
|
memory leaks. It basically tracks the references to all live Request,
|
||||||
Responses, Item and Selector objects.
|
Response, Item, Spider and Selector objects.
|
||||||
|
|
||||||
You can enter the telnet console and inspect how many objects (of the classes
|
You can enter the telnet console and inspect how many objects (of the classes
|
||||||
mentioned above) are currently alive using the ``prefs()`` function which is an
|
mentioned above) are currently alive using the ``prefs()`` function which is an
|
||||||
@ -200,11 +200,10 @@ Debugging memory leaks with muppy
|
|||||||
|
|
||||||
``trackref`` provides a very convenient mechanism for tracking down memory
|
``trackref`` provides a very convenient mechanism for tracking down memory
|
||||||
leaks, but it only keeps track of the objects that are more likely to cause
|
leaks, but it only keeps track of the objects that are more likely to cause
|
||||||
memory leaks (Requests, Responses, Items, and Selectors). However, there are
|
memory leaks. However, there are other cases where the memory leaks could come
|
||||||
other cases where the memory leaks could come from other (more or less obscure)
|
from other (more or less obscure) objects. If this is your case, and you can't
|
||||||
objects. If this is your case, and you can't find your leaks using ``trackref``,
|
find your leaks using ``trackref``, you still have another resource: the muppy
|
||||||
you still have another resource: the muppy library.
|
library.
|
||||||
|
|
||||||
|
|
||||||
You can use muppy from `Pympler`_.
|
You can use muppy from `Pympler`_.
|
||||||
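A minimal sketch of a muppy session, assuming Pympler is installed::

    >>> from pympler import muppy, summary
    >>> all_objects = muppy.get_objects()  # every Python object currently alive
    >>> summary.print_(summary.summarize(all_objects))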
|
|
||||||
|
@ -7,13 +7,12 @@ Item Loaders
|
|||||||
.. module:: scrapy.loader
|
.. module:: scrapy.loader
|
||||||
:synopsis: Item Loader class
|
:synopsis: Item Loader class
|
||||||
|
|
||||||
Item Loaders provide a convenient mechanism for populating scraped :ref:`Items
|
Item Loaders provide a convenient mechanism for populating scraped :ref:`items
|
||||||
<topics-items>`. Even though Items can be populated using their own
|
<topics-items>`. Even though items can be populated directly, Item Loaders provide a
|
||||||
dictionary-like API, Item Loaders provide a much more convenient API for
|
much more convenient API for populating them from a scraping process, by automating
|
||||||
populating them from a scraping process, by automating some common tasks like
|
some common tasks like parsing the raw extracted data before assigning it.
|
||||||
parsing the raw extracted data before assigning it.
|
|
||||||
|
|
||||||
In other words, :ref:`Items <topics-items>` provide the *container* of
|
In other words, :ref:`items <topics-items>` provide the *container* of
|
||||||
scraped data, while Item Loaders provide the mechanism for *populating* that
|
scraped data, while Item Loaders provide the mechanism for *populating* that
|
||||||
container.
|
container.
|
||||||
|
|
||||||
@ -25,10 +24,10 @@ Using Item Loaders to populate items
|
|||||||
====================================
|
====================================
|
||||||
|
|
||||||
To use an Item Loader, you must first instantiate it. You can either
|
To use an Item Loader, you must first instantiate it. You can either
|
||||||
instantiate it with a dict-like object (e.g. Item or dict) or without one, in
|
instantiate it with an :ref:`item object <topics-items>` or without one, in which
|
||||||
which case an Item is automatically instantiated in the Item Loader ``__init__`` method
|
case an :ref:`item object <topics-items>` is automatically created in the
|
||||||
using the Item class specified in the :attr:`ItemLoader.default_item_class`
|
Item Loader ``__init__`` method using the :ref:`item <topics-items>` class
|
||||||
attribute.
|
specified in the :attr:`ItemLoader.default_item_class` attribute.
|
||||||
|
|
||||||
Then, you start collecting values into the Item Loader, typically using
|
Then, you start collecting values into the Item Loader, typically using
|
||||||
:ref:`Selectors <topics-selectors>`. You can add more than one value to
|
:ref:`Selectors <topics-selectors>`. You can add more than one value to
|
||||||
@ -77,6 +76,31 @@ called which actually returns the item populated with the data
|
|||||||
previously extracted and collected with the :meth:`~ItemLoader.add_xpath`,
|
previously extracted and collected with the :meth:`~ItemLoader.add_xpath`,
|
||||||
:meth:`~ItemLoader.add_css`, and :meth:`~ItemLoader.add_value` calls.
|
:meth:`~ItemLoader.add_css`, and :meth:`~ItemLoader.add_value` calls.
|
||||||
|
|
||||||
|
|
||||||
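Putting it together, a hedged sketch of typical loader usage in a spider callback (the ``Product`` item and the selectors are illustrative)::

    from scrapy.loader import ItemLoader
    from myproject.items import Product  # hypothetical items module

    def parse(self, response):
        loader = ItemLoader(item=Product(), response=response)
        loader.add_xpath('name', '//div[@class="product_name"]/text()')
        loader.add_css('price', 'p#price::text')
        loader.add_value('last_updated', 'today')
        return loader.load_item()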
|
.. _topics-loaders-dataclass:
|
||||||
|
|
||||||
|
Working with dataclass items
|
||||||
|
============================
|
||||||
|
|
||||||
|
By default, :ref:`dataclass items <dataclass-items>` require all fields to be
|
||||||
|
passed when created. This could be an issue when using dataclass items with
|
||||||
|
item loaders: unless a pre-populated item is passed to the loader, fields
|
||||||
|
will be populated incrementally using the loader's :meth:`~ItemLoader.add_xpath`,
|
||||||
|
:meth:`~ItemLoader.add_css` and :meth:`~ItemLoader.add_value` methods.
|
||||||
|
|
||||||
|
One approach to overcome this is to define items using the
|
||||||
|
:func:`~dataclasses.field` function, with a ``default`` argument::
|
||||||
|
|
||||||
|
from dataclasses import dataclass, field
|
||||||
|
from typing import Optional
|
||||||
|
|
||||||
|
@dataclass
|
||||||
|
class InventoryItem:
|
||||||
|
name: Optional[str] = field(default=None)
|
||||||
|
price: Optional[float] = field(default=None)
|
||||||
|
stock: Optional[int] = field(default=None)
|
||||||
|
|
||||||
|
|
||||||
.. _topics-loaders-processors:
|
.. _topics-loaders-processors:
|
||||||
|
|
||||||
Input and Output processors
|
Input and Output processors
|
||||||
@ -88,7 +112,7 @@ received (through the :meth:`~ItemLoader.add_xpath`, :meth:`~ItemLoader.add_css`
|
|||||||
:meth:`~ItemLoader.add_value` methods) and the result of the input processor is
|
:meth:`~ItemLoader.add_value` methods) and the result of the input processor is
|
||||||
collected and kept inside the ItemLoader. After collecting all data, the
|
collected and kept inside the ItemLoader. After collecting all data, the
|
||||||
:meth:`ItemLoader.load_item` method is called to populate and get the populated
|
:meth:`ItemLoader.load_item` method is called to populate and get the populated
|
||||||
:class:`~scrapy.item.Item` object. That's when the output processor is
|
:ref:`item object <topics-items>`. That's when the output processor is
|
||||||
called with the data previously collected (and processed using the input
|
called with the data previously collected (and processed using the input
|
||||||
processor). The result of the output processor is the final value that gets
|
processor). The result of the output processor is the final value that gets
|
||||||
assigned to the item.
|
assigned to the item.
|
||||||
@ -153,12 +177,10 @@ Last, but not least, Scrapy comes with some :ref:`commonly used processors
|
|||||||
<topics-loaders-available-processors>` built-in for convenience.
|
<topics-loaders-available-processors>` built-in for convenience.
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
Declaring Item Loaders
|
Declaring Item Loaders
|
||||||
======================
|
======================
|
||||||
|
|
||||||
Item Loaders are declared like Items, by using a class definition syntax. Here
|
Item Loaders are declared using a class definition syntax. Here is an example::
|
||||||
is an example::
|
|
||||||
|
|
||||||
from scrapy.loader import ItemLoader
|
from scrapy.loader import ItemLoader
|
||||||
from scrapy.loader.processors import TakeFirst, MapCompose, Join
|
from scrapy.loader.processors import TakeFirst, MapCompose, Join
|
||||||
@ -273,11 +295,11 @@ There are several ways to modify Item Loader context values:
|
|||||||
ItemLoader objects
|
ItemLoader objects
|
||||||
==================
|
==================
|
||||||
|
|
||||||
.. class:: ItemLoader([item, selector, response], \**kwargs)
|
.. class:: ItemLoader([item, selector, response], **kwargs)
|
||||||
|
|
||||||
Return a new Item Loader for populating the given Item. If no item is
|
Return a new Item Loader for populating the given :ref:`item object
|
||||||
given, one is instantiated automatically using the class in
|
<topics-items>`. If no item object is given, one is instantiated
|
||||||
:attr:`default_item_class`.
|
automatically using the class in :attr:`default_item_class`.
|
||||||
|
|
||||||
When instantiated with a ``selector`` or a ``response`` parameter
|
When instantiated with a ``selector`` or a ``response`` parameter
|
||||||
the :class:`ItemLoader` class provides convenient mechanisms for extracting
|
the :class:`ItemLoader` class provides convenient mechanisms for extracting
|
||||||
@ -286,7 +308,7 @@ ItemLoader objects
|
|||||||
:param item: The item instance to populate using subsequent calls to
|
:param item: The item instance to populate using subsequent calls to
|
||||||
:meth:`~ItemLoader.add_xpath`, :meth:`~ItemLoader.add_css`,
|
:meth:`~ItemLoader.add_xpath`, :meth:`~ItemLoader.add_css`,
|
||||||
or :meth:`~ItemLoader.add_value`.
|
or :meth:`~ItemLoader.add_value`.
|
||||||
:type item: :class:`~scrapy.item.Item` object
|
:type item: :ref:`item object <topics-items>`
|
||||||
|
|
||||||
:param selector: The selector to extract data from, when using the
|
:param selector: The selector to extract data from, when using the
|
||||||
:meth:`add_xpath` (resp. :meth:`add_css`) or :meth:`replace_xpath`
|
:meth:`add_xpath` (resp. :meth:`add_css`) or :meth:`replace_xpath`
|
||||||
@ -303,7 +325,7 @@ ItemLoader objects
|
|||||||
|
|
||||||
:class:`ItemLoader` instances have the following methods:
|
:class:`ItemLoader` instances have the following methods:
|
||||||
|
|
||||||
.. method:: get_value(value, \*processors, \**kwargs)
|
.. method:: get_value(value, *processors, **kwargs)
|
||||||
|
|
||||||
Process the given ``value`` by the given ``processors`` and keyword
|
Process the given ``value`` by the given ``processors`` and keyword
|
||||||
arguments.
|
arguments.
|
||||||
@ -321,7 +343,7 @@ ItemLoader objects
|
|||||||
>>> loader.get_value(u'name: foo', TakeFirst(), str.upper, re='name: (.+)')
|
>>> loader.get_value(u'name: foo', TakeFirst(), str.upper, re='name: (.+)')
|
||||||
'FOO'
|
'FOO'
|
||||||
|
|
||||||
.. method:: add_value(field_name, value, \*processors, \**kwargs)
|
.. method:: add_value(field_name, value, *processors, **kwargs)
|
||||||
|
|
||||||
Process and then add the given ``value`` for the given field.
|
Process and then add the given ``value`` for the given field.
|
||||||
|
|
||||||
@ -343,11 +365,11 @@ ItemLoader objects
|
|||||||
loader.add_value('name', u'name: foo', TakeFirst(), re='name: (.+)')
|
loader.add_value('name', u'name: foo', TakeFirst(), re='name: (.+)')
|
||||||
loader.add_value(None, {'name': u'foo', 'sex': u'male'})
|
loader.add_value(None, {'name': u'foo', 'sex': u'male'})
|
||||||
|
|
||||||
.. method:: replace_value(field_name, value, \*processors, \**kwargs)
|
.. method:: replace_value(field_name, value, *processors, **kwargs)
|
||||||
|
|
||||||
Similar to :meth:`add_value` but replaces the collected data with the
|
Similar to :meth:`add_value` but replaces the collected data with the
|
||||||
new value instead of adding it.
|
new value instead of adding it.
|
||||||
.. method:: get_xpath(xpath, \*processors, \**kwargs)
|
.. method:: get_xpath(xpath, *processors, **kwargs)
|
||||||
|
|
||||||
Similar to :meth:`ItemLoader.get_value` but receives an XPath instead of a
|
Similar to :meth:`ItemLoader.get_value` but receives an XPath instead of a
|
||||||
value, which is used to extract a list of unicode strings from the
|
value, which is used to extract a list of unicode strings from the
|
||||||
@ -367,7 +389,7 @@ ItemLoader objects
|
|||||||
# HTML snippet: <p id="price">the price is $1200</p>
|
# HTML snippet: <p id="price">the price is $1200</p>
|
||||||
loader.get_xpath('//p[@id="price"]', TakeFirst(), re='the price is (.*)')
|
loader.get_xpath('//p[@id="price"]', TakeFirst(), re='the price is (.*)')
|
||||||
|
|
||||||
.. method:: add_xpath(field_name, xpath, \*processors, \**kwargs)
|
.. method:: add_xpath(field_name, xpath, *processors, **kwargs)
|
||||||
|
|
||||||
Similar to :meth:`ItemLoader.add_value` but receives an XPath instead of a
|
Similar to :meth:`ItemLoader.add_value` but receives an XPath instead of a
|
||||||
value, which is used to extract a list of unicode strings from the
|
value, which is used to extract a list of unicode strings from the
|
||||||
@ -385,12 +407,12 @@ ItemLoader objects
|
|||||||
# HTML snippet: <p id="price">the price is $1200</p>
|
# HTML snippet: <p id="price">the price is $1200</p>
|
||||||
loader.add_xpath('price', '//p[@id="price"]', re='the price is (.*)')
|
loader.add_xpath('price', '//p[@id="price"]', re='the price is (.*)')
|
||||||
|
|
||||||
.. method:: replace_xpath(field_name, xpath, \*processors, \**kwargs)
|
.. method:: replace_xpath(field_name, xpath, *processors, **kwargs)
|
||||||
|
|
||||||
Similar to :meth:`add_xpath` but replaces collected data instead of
|
Similar to :meth:`add_xpath` but replaces collected data instead of
|
||||||
adding it.
|
adding it.
|
||||||
|
|
||||||
.. method:: get_css(css, \*processors, \**kwargs)
|
.. method:: get_css(css, *processors, **kwargs)
|
||||||
|
|
||||||
Similar to :meth:`ItemLoader.get_value` but receives a CSS selector
|
Similar to :meth:`ItemLoader.get_value` but receives a CSS selector
|
||||||
instead of a value, which is used to extract a list of unicode strings
|
instead of a value, which is used to extract a list of unicode strings
|
||||||
@ -410,7 +432,7 @@ ItemLoader objects
|
|||||||
# HTML snippet: <p id="price">the price is $1200</p>
|
# HTML snippet: <p id="price">the price is $1200</p>
|
||||||
loader.get_css('p#price', TakeFirst(), re='the price is (.*)')
|
loader.get_css('p#price', TakeFirst(), re='the price is (.*)')
|
||||||
|
|
||||||
.. method:: add_css(field_name, css, \*processors, \**kwargs)
|
.. method:: add_css(field_name, css, *processors, **kwargs)
|
||||||
|
|
||||||
Similar to :meth:`ItemLoader.add_value` but receives a CSS selector
|
Similar to :meth:`ItemLoader.add_value` but receives a CSS selector
|
||||||
instead of a value, which is used to extract a list of unicode strings
|
instead of a value, which is used to extract a list of unicode strings
|
||||||
@ -428,7 +450,7 @@ ItemLoader objects
|
|||||||
# HTML snippet: <p id="price">the price is $1200</p>
|
# HTML snippet: <p id="price">the price is $1200</p>
|
||||||
loader.add_css('price', 'p#price', re='the price is (.*)')
|
loader.add_css('price', 'p#price', re='the price is (.*)')
|
||||||
|
|
||||||
.. method:: replace_css(field_name, css, \*processors, \**kwargs)
|
.. method:: replace_css(field_name, css, *processors, **kwargs)
|
||||||
|
|
||||||
Similar to :meth:`add_css` but replaces collected data instead of
|
Similar to :meth:`add_css` but replaces collected data instead of
|
||||||
adding it.
|
adding it.
|
||||||
@ -444,17 +466,19 @@ ItemLoader objects
|
|||||||
|
|
||||||
Create a nested loader with an xpath selector.
|
Create a nested loader with an xpath selector.
|
||||||
The supplied selector is applied relative to the selector associated
|
The supplied selector is applied relative to the selector associated
|
||||||
with this :class:`ItemLoader`. The nested loader shares the :class:`Item`
|
with this :class:`ItemLoader`. The nested loader shares the :ref:`item
|
||||||
with the parent :class:`ItemLoader` so calls to :meth:`add_xpath`,
|
object <topics-items>` with the parent :class:`ItemLoader` so calls to
|
||||||
:meth:`add_value`, :meth:`replace_value`, etc. will behave as expected.
|
:meth:`add_xpath`, :meth:`add_value`, :meth:`replace_value`, etc. will
|
||||||
|
behave as expected.
|
||||||
|
|
||||||
.. method:: nested_css(css)
|
.. method:: nested_css(css)
|
||||||
|
|
||||||
Create a nested loader with a css selector.
|
Create a nested loader with a css selector.
|
||||||
The supplied selector is applied relative to the selector associated
|
The supplied selector is applied relative to the selector associated
|
||||||
with this :class:`ItemLoader`. The nested loader shares the :class:`Item`
|
with this :class:`ItemLoader`. The nested loader shares the :ref:`item
|
||||||
with the parent :class:`ItemLoader` so calls to :meth:`add_xpath`,
|
object <topics-items>` with the parent :class:`ItemLoader` so calls to
|
||||||
:meth:`add_value`, :meth:`replace_value`, etc. will behave as expected.
|
:meth:`add_xpath`, :meth:`add_value`, :meth:`replace_value`, etc. will
|
||||||
|
behave as expected.
|
||||||
|
|
||||||
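A hedged sketch of nested loaders, assuming a page whose ``<footer>`` contains the relevant links::

    loader = ItemLoader(item=Product(), response=response)
    footer_loader = loader.nested_xpath('//footer')
    # the selectors below are applied relative to //footer
    footer_loader.add_xpath('social', 'a[@class="social"]/@href')
    footer_loader.add_css('email', 'a.email::attr(href)')
    # the nested loader shares the item with its parent
    loader.load_item()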
.. method:: get_collected_values(field_name)
|
.. method:: get_collected_values(field_name)
|
||||||
|
|
||||||
@ -477,7 +501,7 @@ ItemLoader objects
|
|||||||
|
|
||||||
.. attribute:: item
|
.. attribute:: item
|
||||||
|
|
||||||
The :class:`~scrapy.item.Item` object being parsed by this Item Loader.
|
The :ref:`item object <topics-items>` being parsed by this Item Loader.
|
||||||
This is mostly used as a property so when attempting to override this
|
This is mostly used as a property so when attempting to override this
|
||||||
value, you may want to check out :attr:`default_item_class` first.
|
value, you may want to check out :attr:`default_item_class` first.
|
||||||
|
|
||||||
@ -488,8 +512,8 @@ ItemLoader objects
|
|||||||
|
|
||||||
.. attribute:: default_item_class
|
.. attribute:: default_item_class
|
||||||
|
|
||||||
An Item class (or factory), used to instantiate items when not given in
|
An :ref:`item object <topics-items>` class or factory, used to
|
||||||
the ``__init__`` method.
|
instantiate items when not given in the ``__init__`` method.
|
||||||
|
|
||||||
.. attribute:: default_input_processor
|
.. attribute:: default_input_processor
|
||||||
|
|
||||||
@ -678,7 +702,7 @@ Here is a list of all built-in processors:
|
|||||||
>>> proc(['one', 'two', 'three'])
|
>>> proc(['one', 'two', 'three'])
|
||||||
'one<br>two<br>three'
|
'one<br>two<br>three'
|
||||||
|
|
||||||
.. class:: Compose(\*functions, \**default_loader_context)
|
.. class:: Compose(*functions, **default_loader_context)
|
||||||
|
|
||||||
A processor which is constructed from the composition of the given
|
A processor which is constructed from the composition of the given
|
||||||
functions. This means that each input value of this processor is passed to
|
functions. This means that each input value of this processor is passed to
|
||||||
@ -706,7 +730,7 @@ Here is a list of all built-in processors:
|
|||||||
active Loader context accessible through the :meth:`ItemLoader.context`
|
active Loader context accessible through the :meth:`ItemLoader.context`
|
||||||
attribute.
|
attribute.
|
||||||
|
|
||||||
.. class:: MapCompose(\*functions, \**default_loader_context)
|
.. class:: MapCompose(*functions, **default_loader_context)
|
||||||
|
|
||||||
A processor which is constructed from the composition of the given
|
A processor which is constructed from the composition of the given
|
||||||
functions, similar to the :class:`Compose` processor. The difference with
|
functions, similar to the :class:`Compose` processor. The difference with
|
||||||
|
@ -9,8 +9,7 @@ Logging
|
|||||||
explicit calls to the Python standard logging. Keep reading to learn more
|
explicit calls to the Python standard logging. Keep reading to learn more
|
||||||
about the new logging system.
|
about the new logging system.
|
||||||
|
|
||||||
Scrapy uses `Python's builtin logging system
|
Scrapy uses :mod:`logging` for event logging. We'll
|
||||||
<https://docs.python.org/3/library/logging.html>`_ for event logging. We'll
|
|
||||||
provide some simple examples to get you started, but for more advanced
|
provide some simple examples to get you started, but for more advanced
|
||||||
use-cases it's strongly suggested to read its documentation thoroughly.
|
use-cases it's strongly suggested to read its documentation thoroughly.
|
||||||
|
|
||||||
@ -83,10 +82,10 @@ path::
|
|||||||
|
|
||||||
.. seealso::
|
.. seealso::
|
||||||
|
|
||||||
Module logging, `HowTo <https://docs.python.org/2/howto/logging.html>`_
|
Module logging, :doc:`HowTo <howto/logging>`
|
||||||
Basic Logging Tutorial
|
Basic Logging Tutorial
|
||||||
|
|
||||||
Module logging, `Loggers <https://docs.python.org/2/library/logging.html#logger-objects>`_
|
Module logging, :ref:`Loggers <logger>`
|
||||||
Further documentation on loggers
|
Further documentation on loggers
|
||||||
|
|
||||||
.. _topics-logging-from-spiders:
|
.. _topics-logging-from-spiders:
|
||||||
@ -165,14 +164,12 @@ possible levels listed in :ref:`topics-logging-levels`.
|
|||||||
|
|
||||||
:setting:`LOG_FORMAT` and :setting:`LOG_DATEFORMAT` specify formatting strings
|
:setting:`LOG_FORMAT` and :setting:`LOG_DATEFORMAT` specify formatting strings
|
||||||
used as layouts for all messages. Those strings can contain any placeholders
|
used as layouts for all messages. Those strings can contain any placeholders
|
||||||
listed in `logging's logrecord attributes docs
|
listed in :ref:`logging's logrecord attributes docs <logrecord-attributes>` and
|
||||||
<https://docs.python.org/2/library/logging.html#logrecord-attributes>`_ and
|
:ref:`datetime's strftime and strptime directives <strftime-strptime-behavior>`
|
||||||
`datetime's strftime and strptime directives
|
|
||||||
<https://docs.python.org/2/library/datetime.html#strftime-and-strptime-behavior>`_
|
|
||||||
respectively.
|
respectively.
|
||||||
|
|
||||||
If :setting:`LOG_SHORT_NAMES` is set, then the logs will not display the Scrapy
|
If :setting:`LOG_SHORT_NAMES` is set, then the logs will not display the Scrapy
|
||||||
component that prints the log. It is unset by default, hence logs contain the
|
component that prints the log. It is unset by default, hence logs contain the
|
||||||
Scrapy component responsible for that log output.
|
Scrapy component responsible for that log output.
|
||||||
|
|
||||||
Command-line options
|
Command-line options
|
||||||
@ -190,7 +187,7 @@ to override some of the Scrapy settings regarding logging.
|
|||||||
|
|
||||||
.. seealso::
|
.. seealso::
|
||||||
|
|
||||||
Module `logging.handlers <https://docs.python.org/2/library/logging.handlers.html>`_
|
Module :mod:`logging.handlers`
|
||||||
Further documentation on available handlers
|
Further documentation on available handlers
|
||||||
|
|
||||||
.. _custom-log-formats:
|
.. _custom-log-formats:
|
||||||
@ -201,10 +198,13 @@ Custom Log Formats
|
|||||||
A custom log format can be set for different actions by extending
|
A custom log format can be set for different actions by extending
|
||||||
:class:`~scrapy.logformatter.LogFormatter` class and making
|
:class:`~scrapy.logformatter.LogFormatter` class and making
|
||||||
:setting:`LOG_FORMATTER` point to your new class.
|
:setting:`LOG_FORMATTER` point to your new class.
|
||||||
|
|
||||||
.. autoclass:: scrapy.logformatter.LogFormatter
|
.. autoclass:: scrapy.logformatter.LogFormatter
|
||||||
:members:
|
:members:
|
||||||
|
|
||||||
|
|
||||||
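
As a hedged sketch of such a subclass (the method signature follows the :class:`~scrapy.logformatter.LogFormatter` API above; the message layout and class name are illustrative), a formatter that demotes dropped-item messages to ``INFO`` could look like this::

    import logging
    import os

    from scrapy import logformatter


    class PoliteLogFormatter(logformatter.LogFormatter):
        def dropped(self, item, exception, response, spider):
            # Log dropped items quietly instead of at WARNING level.
            return {
                'level': logging.INFO,
                'msg': "Dropped: %(exception)s" + os.linesep + "%(item)s",
                'args': {
                    'exception': exception,
                    'item': item,
                },
            }

It would then be enabled by pointing :setting:`LOG_FORMATTER` at its import path (for example ``'myproject.logformatter.PoliteLogFormatter'``, a hypothetical module path).
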
|
.. _topics-logging-advanced-customization:
|
||||||
|
|
||||||
Advanced customization
|
Advanced customization
|
||||||
----------------------
|
----------------------
|
||||||
|
|
||||||
@ -256,16 +256,15 @@ scrapy.utils.log module
|
|||||||
In that case, its usage is not required but it's recommended.
|
In that case, its usage is not required but it's recommended.
|
||||||
|
|
||||||
Another option when running custom scripts is to manually configure the logging.
|
Another option when running custom scripts is to manually configure the logging.
|
||||||
To do this you can use `logging.basicConfig()`_ to set a basic root handler.
|
To do this you can use :func:`logging.basicConfig` to set a basic root handler.
|
||||||
|
|
||||||
Note that :class:`~scrapy.crawler.CrawlerProcess` automatically calls ``configure_logging``,
|
Note that :class:`~scrapy.crawler.CrawlerProcess` automatically calls ``configure_logging``,
|
||||||
so it is recommended to only use `logging.basicConfig()`_ together with
|
so it is recommended to only use :func:`logging.basicConfig` together with
|
||||||
:class:`~scrapy.crawler.CrawlerRunner`.
|
:class:`~scrapy.crawler.CrawlerRunner`.
|
||||||
|
|
||||||
This is an example on how to redirect ``INFO`` or higher messages to a file::
|
This is an example on how to redirect ``INFO`` or higher messages to a file::
|
||||||
|
|
||||||
import logging
|
import logging
|
||||||
from scrapy.utils.log import configure_logging
|
|
||||||
|
|
||||||
logging.basicConfig(
|
logging.basicConfig(
|
||||||
filename='log.txt',
|
filename='log.txt',
|
||||||
@ -275,7 +274,3 @@ scrapy.utils.log module
|
|||||||
|
|
||||||
Refer to :ref:`run-from-script` for more details about using Scrapy this
|
Refer to :ref:`run-from-script` for more details about using Scrapy this
|
||||||
way.
|
way.
|
||||||
|
|
||||||
.. _logging.basicConfig(): https://docs.python.org/2/library/logging.html#logging.basicConfig
|
|
||||||
|
|
||||||
|
|
||||||
|
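
A fuller, hedged sketch of the same idea together with :class:`~scrapy.crawler.CrawlerRunner` (``MySpider`` stands in for your own spider class)::

    import logging

    from twisted.internet import reactor
    from scrapy.crawler import CrawlerRunner

    logging.basicConfig(
        filename='log.txt',
        format='%(levelname)s: %(message)s',
        level=logging.INFO,
    )

    runner = CrawlerRunner()
    d = runner.crawl(MySpider)              # MySpider: your spider class
    d.addBoth(lambda _: reactor.stop())     # stop the reactor once the crawl ends
    reactor.run()
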
@ -50,7 +50,7 @@ this:
|
|||||||
4. When the files are downloaded, another field (``files``) will be populated
|
4. When the files are downloaded, another field (``files``) will be populated
|
||||||
with the results. This field will contain a list of dicts with information
|
with the results. This field will contain a list of dicts with information
|
||||||
about the downloaded files, such as the downloaded path, the original
|
about the downloaded files, such as the downloaded path, the original
|
||||||
scraped url (taken from the ``file_urls`` field) , and the file checksum.
|
scraped url (taken from the ``file_urls`` field), the file checksum and the file status.
|
||||||
The files in the list of the ``files`` field will retain the same order of
|
The files in the list of the ``files`` field will retain the same order of
|
||||||
the original ``file_urls`` field. If some file failed downloading, an
|
the original ``file_urls`` field. If some file failed downloading, an
|
||||||
error will be logged and the file won't be present in the ``files`` field.
|
error will be logged and the file won't be present in the ``files`` field.
|
||||||
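
For illustration only (the path and checksum values are made up, not real output), an item that went through the Files Pipeline might end up looking like this::

    {
        'file_urls': ['http://www.example.com/files/product1.pdf'],
        'files': [
            {
                'url': 'http://www.example.com/files/product1.pdf',
                'path': 'full/0a79c461a4062ac383dc4fade7bc09f1384a3910.pdf',
                'checksum': '2b00042f7481c7b056c4b410d28f33cf',
                'status': 'downloaded',
            },
        ],
    }
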
@ -156,7 +156,7 @@ following forms::
|
|||||||
|
|
||||||
ftp://username:password@address:port/path
|
ftp://username:password@address:port/path
|
||||||
ftp://address:port/path
|
ftp://address:port/path
|
||||||
|
|
||||||
If ``username`` and ``password`` are not provided, they are taken from the :setting:`FTP_USER` and
|
If ``username`` and ``password`` are not provided, they are taken from the :setting:`FTP_USER` and
|
||||||
:setting:`FTP_PASSWORD` settings respectively.
|
:setting:`FTP_PASSWORD` settings respectively.
|
||||||
|
|
||||||
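
A hedged sketch of the corresponding settings (host, credentials and path are placeholders)::

    # settings.py
    FILES_STORE = 'ftp://ftp.example.com:21/path/to/files'
    FTP_USER = 'scrapyuser'          # used when the store URI carries no credentials
    FTP_PASSWORD = 'scrapypassword'  # idem
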
@ -201,6 +201,9 @@ For self-hosting you also might feel the need not to use SSL and not to verify S
|
|||||||
.. _s3.scality: https://s3.scality.com/
|
.. _s3.scality: https://s3.scality.com/
|
||||||
.. _canned ACLs: https://docs.aws.amazon.com/AmazonS3/latest/dev/acl-overview.html#canned-acl
|
.. _canned ACLs: https://docs.aws.amazon.com/AmazonS3/latest/dev/acl-overview.html#canned-acl
|
||||||
|
|
||||||
|
|
||||||
|
.. _media-pipeline-gcs:
|
||||||
|
|
||||||
Google Cloud Storage
|
Google Cloud Storage
|
||||||
---------------------
|
---------------------
|
||||||
|
|
||||||
@ -243,20 +246,22 @@ Usage example
|
|||||||
.. setting:: IMAGES_URLS_FIELD
|
.. setting:: IMAGES_URLS_FIELD
|
||||||
.. setting:: IMAGES_RESULT_FIELD
|
.. setting:: IMAGES_RESULT_FIELD
|
||||||
|
|
||||||
In order to use a media pipeline first, :ref:`enable it
|
In order to use a media pipeline, first :ref:`enable it
|
||||||
<topics-media-pipeline-enabling>`.
|
<topics-media-pipeline-enabling>`.
|
||||||
|
|
||||||
Then, if a spider returns a dict with the URLs key (``file_urls`` or
|
Then, if a spider returns an :ref:`item object <topics-items>` with the URLs
|
||||||
``image_urls``, for the Files or Images Pipeline respectively), the pipeline will
|
field (``file_urls`` or ``image_urls``, for the Files or Images Pipeline
|
||||||
put the results under respective key (``files`` or ``images``).
|
respectively), the pipeline will put the results under the respective field
|
||||||
|
(``files`` or ``images``).
|
||||||
|
|
||||||
If you prefer to use :class:`~.Item`, then define a custom item with the
|
When using :ref:`item types <item-types>` for which fields are defined beforehand,
|
||||||
necessary fields, like in this example for Images Pipeline::
|
you must define both the URLs field and the results field. For example, when
|
||||||
|
using the images pipeline, items must define both the ``image_urls`` and the
|
||||||
|
``images`` field. For instance, using the :class:`~scrapy.item.Item` class::
|
||||||
|
|
||||||
import scrapy
|
import scrapy
|
||||||
|
|
||||||
class MyItem(scrapy.Item):
|
class MyItem(scrapy.Item):
|
||||||
|
|
||||||
# ... other item fields ...
|
# ... other item fields ...
|
||||||
image_urls = scrapy.Field()
|
image_urls = scrapy.Field()
|
||||||
images = scrapy.Field()
|
images = scrapy.Field()
|
||||||
@ -445,8 +450,11 @@ See here the methods that you can override in your custom Files Pipeline:
|
|||||||
:meth:`~get_media_requests` method and return a Request for each
|
:meth:`~get_media_requests` method and return a Request for each
|
||||||
file URL::
|
file URL::
|
||||||
|
|
||||||
|
from itemadapter import ItemAdapter
|
||||||
|
|
||||||
def get_media_requests(self, item, info):
|
def get_media_requests(self, item, info):
|
||||||
for file_url in item['file_urls']:
|
adapter = ItemAdapter(item)
|
||||||
|
for file_url in adapter['file_urls']:
|
||||||
yield scrapy.Request(file_url)
|
yield scrapy.Request(file_url)
|
||||||
|
|
||||||
Those requests will be processed by the pipeline and, when they have finished
|
Those requests will be processed by the pipeline and, when they have finished
|
||||||
@ -470,6 +478,18 @@ See here the methods that you can override in your custom Files Pipeline:
|
|||||||
|
|
||||||
* ``checksum`` - a `MD5 hash`_ of the image contents
|
* ``checksum`` - a `MD5 hash`_ of the image contents
|
||||||
|
|
||||||
|
* ``status`` - the file status indication.
|
||||||
|
|
||||||
|
.. versionadded:: 2.2
|
||||||
|
|
||||||
|
It can be one of the following:
|
||||||
|
|
||||||
|
* ``downloaded`` - file was downloaded.
|
||||||
|
* ``uptodate`` - file was not downloaded, as it was downloaded recently,
|
||||||
|
according to the file expiration policy.
|
||||||
|
* ``cached`` - file was already scheduled for download, by another item
|
||||||
|
sharing the same file.
|
||||||
|
|
||||||
The list of tuples received by :meth:`~item_completed` is
|
The list of tuples received by :meth:`~item_completed` is
|
||||||
guaranteed to retain the same order of the requests returned from the
|
guaranteed to retain the same order of the requests returned from the
|
||||||
:meth:`~get_media_requests` method.
|
:meth:`~get_media_requests` method.
|
||||||
@ -479,7 +499,8 @@ See here the methods that you can override in your custom Files Pipeline:
|
|||||||
[(True,
|
[(True,
|
||||||
{'checksum': '2b00042f7481c7b056c4b410d28f33cf',
|
{'checksum': '2b00042f7481c7b056c4b410d28f33cf',
|
||||||
'path': 'full/0a79c461a4062ac383dc4fade7bc09f1384a3910.jpg',
|
'path': 'full/0a79c461a4062ac383dc4fade7bc09f1384a3910.jpg',
|
||||||
'url': 'http://www.example.com/files/product1.pdf'}),
|
'url': 'http://www.example.com/files/product1.pdf',
|
||||||
|
'status': 'downloaded'}),
|
||||||
(False,
|
(False,
|
||||||
Failure(...))]
|
Failure(...))]
|
||||||
|
|
||||||
@ -500,13 +521,15 @@ See here the methods that you can override in your custom Files Pipeline:
|
|||||||
store the downloaded file paths (passed in results) in the ``file_paths``
|
store the downloaded file paths (passed in results) in the ``file_paths``
|
||||||
item field, and we drop the item if it doesn't contain any files::
|
item field, and we drop the item if it doesn't contain any files::
|
||||||
|
|
||||||
|
from itemadapter import ItemAdapter
|
||||||
from scrapy.exceptions import DropItem
|
from scrapy.exceptions import DropItem
|
||||||
|
|
||||||
def item_completed(self, results, item, info):
|
def item_completed(self, results, item, info):
|
||||||
file_paths = [x['path'] for ok, x in results if ok]
|
file_paths = [x['path'] for ok, x in results if ok]
|
||||||
if not file_paths:
|
if not file_paths:
|
||||||
raise DropItem("Item contains no files")
|
raise DropItem("Item contains no files")
|
||||||
item['file_paths'] = file_paths
|
adapter = ItemAdapter(item)
|
||||||
|
adapter['file_paths'] = file_paths
|
||||||
return item
|
return item
|
||||||
|
|
||||||
By default, the :meth:`item_completed` method returns the item.
|
By default, the :meth:`item_completed` method returns the item.
|
||||||
@ -580,8 +603,9 @@ Here is a full example of the Images Pipeline whose methods are exemplified
|
|||||||
above::
|
above::
|
||||||
|
|
||||||
import scrapy
|
import scrapy
|
||||||
from scrapy.pipelines.images import ImagesPipeline
|
from itemadapter import ItemAdapter
|
||||||
from scrapy.exceptions import DropItem
|
from scrapy.exceptions import DropItem
|
||||||
|
from scrapy.pipelines.images import ImagesPipeline
|
||||||
|
|
||||||
class MyImagesPipeline(ImagesPipeline):
|
class MyImagesPipeline(ImagesPipeline):
|
||||||
|
|
||||||
@ -593,7 +617,8 @@ above::
|
|||||||
image_paths = [x['path'] for ok, x in results if ok]
|
image_paths = [x['path'] for ok, x in results if ok]
|
||||||
if not image_paths:
|
if not image_paths:
|
||||||
raise DropItem("Item contains no images")
|
raise DropItem("Item contains no images")
|
||||||
item['image_paths'] = image_paths
|
adapter = ItemAdapter(item)
|
||||||
|
adapter['image_paths'] = image_paths
|
||||||
return item
|
return item
|
||||||
|
|
||||||
|
|
||||||
|
@ -35,8 +35,9 @@ Here's an example showing how to run a single spider with it.
|
|||||||
...
|
...
|
||||||
|
|
||||||
process = CrawlerProcess(settings={
|
process = CrawlerProcess(settings={
|
||||||
'FEED_FORMAT': 'json',
|
"FEEDS": {
|
||||||
'FEED_URI': 'items.json'
|
"items.json": {"format": "json"},
|
||||||
|
},
|
||||||
})
|
})
|
||||||
|
|
||||||
process.crawl(MySpider)
|
process.crawl(MySpider)
|
||||||
|
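
A hedged variation of the snippet above that exports the same items to two feeds at once (file names are placeholders; see the feed exports documentation for the full set of per-feed options)::

    process = CrawlerProcess(settings={
        "FEEDS": {
            "items.json": {"format": "json"},
            "items.csv": {"format": "csv"},
        },
    })
    process.crawl(MySpider)
    process.start()
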
@ -36,7 +36,7 @@ Request objects
|
|||||||
:type url: string
|
:type url: string
|
||||||
|
|
||||||
:param callback: the function that will be called with the response of this
|
:param callback: the function that will be called with the response of this
|
||||||
request (once its downloaded) as its first parameter. For more information
|
request (once it's downloaded) as its first parameter. For more information
|
||||||
see :ref:`topics-request-response-ref-request-callback-arguments` below.
|
see :ref:`topics-request-response-ref-request-callback-arguments` below.
|
||||||
If a Request doesn't specify a callback, the spider's
|
If a Request doesn't specify a callback, the spider's
|
||||||
:meth:`~scrapy.spiders.Spider.parse` method will be used.
|
:meth:`~scrapy.spiders.Spider.parse` method will be used.
|
||||||
@ -174,9 +174,9 @@ Request objects
|
|||||||
See :ref:`topics-request-meta` for a list of special meta keys
|
See :ref:`topics-request-meta` for a list of special meta keys
|
||||||
recognized by Scrapy.
|
recognized by Scrapy.
|
||||||
|
|
||||||
This dict is `shallow copied`_ when the request is cloned using the
|
This dict is :doc:`shallow copied <library/copy>` when the request is
|
||||||
``copy()`` or ``replace()`` methods, and can also be accessed, in your
|
cloned using the ``copy()`` or ``replace()`` methods, and can also be
|
||||||
spider, from the ``response.meta`` attribute.
|
accessed, in your spider, from the ``response.meta`` attribute.
|
||||||
|
|
||||||
.. attribute:: Request.cb_kwargs
|
.. attribute:: Request.cb_kwargs
|
||||||
|
|
||||||
@ -185,11 +185,13 @@ Request objects
|
|||||||
for new Requests, which means by default callbacks only get a :class:`Response`
|
for new Requests, which means by default callbacks only get a :class:`Response`
|
||||||
object as argument.
|
object as argument.
|
||||||
|
|
||||||
This dict is `shallow copied`_ when the request is cloned using the
|
This dict is :doc:`shallow copied <library/copy>` when the request is
|
||||||
``copy()`` or ``replace()`` methods, and can also be accessed, in your
|
cloned using the ``copy()`` or ``replace()`` methods, and can also be
|
||||||
spider, from the ``response.cb_kwargs`` attribute.
|
accessed, in your spider, from the ``response.cb_kwargs`` attribute.
|
||||||
|
|
||||||
.. _shallow copied: https://docs.python.org/2/library/copy.html
|
In case of a failure to process the request, this dict can be accessed as
|
||||||
|
``failure.request.cb_kwargs`` in the request's errback. For more information,
|
||||||
|
see :ref:`errback-cb_kwargs`.
|
||||||
|
|
||||||
.. method:: Request.copy()
|
.. method:: Request.copy()
|
||||||
|
|
||||||
@ -314,6 +316,31 @@ errors if needed::
|
|||||||
request = failure.request
|
request = failure.request
|
||||||
self.logger.error('TimeoutError on %s', request.url)
|
self.logger.error('TimeoutError on %s', request.url)
|
||||||
|
|
||||||
|
.. _errback-cb_kwargs:
|
||||||
|
|
||||||
|
Accessing additional data in errback functions
|
||||||
|
----------------------------------------------
|
||||||
|
|
||||||
|
In case of a failure to process the request, you may be interested in
|
||||||
|
accessing arguments to the callback functions so you can process further
|
||||||
|
based on the arguments in the errback. The following example shows how to
|
||||||
|
achieve this by using ``Failure.request.cb_kwargs``::
|
||||||
|
|
||||||
|
def parse(self, response):
|
||||||
|
request = scrapy.Request('http://www.example.com/index.html',
|
||||||
|
callback=self.parse_page2,
|
||||||
|
errback=self.errback_page2,
|
||||||
|
cb_kwargs=dict(main_url=response.url))
|
||||||
|
yield request
|
||||||
|
|
||||||
|
def parse_page2(self, response, main_url):
|
||||||
|
pass
|
||||||
|
|
||||||
|
def errback_page2(self, failure):
|
||||||
|
yield dict(
|
||||||
|
main_url=failure.request.cb_kwargs['main_url'],
|
||||||
|
)
|
||||||
|
|
||||||
.. _topics-request-meta:
|
.. _topics-request-meta:
|
||||||
|
|
||||||
Request.meta special keys
|
Request.meta special keys
|
||||||
@ -387,6 +414,51 @@ The meta key is used set retry times per request. When initialized, the
|
|||||||
:reqmeta:`max_retry_times` meta key takes higher precedence over the
|
:reqmeta:`max_retry_times` meta key takes higher precedence over the
|
||||||
:setting:`RETRY_TIMES` setting.
|
:setting:`RETRY_TIMES` setting.
|
||||||
|
|
||||||
|
|
||||||
|
.. _topics-stop-response-download:
|
||||||
|
|
||||||
|
Stopping the download of a Response
|
||||||
|
===================================
|
||||||
|
|
||||||
|
Raising a :exc:`~scrapy.exceptions.StopDownload` exception from a
|
||||||
|
:class:`~scrapy.signals.bytes_received` signal handler will stop the
|
||||||
|
download of a given response. See the following example::
|
||||||
|
|
||||||
|
import scrapy
|
||||||
|
|
||||||
|
|
||||||
|
class StopSpider(scrapy.Spider):
|
||||||
|
name = "stop"
|
||||||
|
start_urls = ["https://docs.scrapy.org/en/latest/"]
|
||||||
|
|
||||||
|
@classmethod
|
||||||
|
def from_crawler(cls, crawler):
|
||||||
|
spider = super().from_crawler(crawler)
|
||||||
|
crawler.signals.connect(spider.on_bytes_received, signal=scrapy.signals.bytes_received)
|
||||||
|
return spider
|
||||||
|
|
||||||
|
def parse(self, response):
|
||||||
|
# 'last_chars' show that the full response was not downloaded
|
||||||
|
yield {"len": len(response.text), "last_chars": response.text[-40:]}
|
||||||
|
|
||||||
|
def on_bytes_received(self, data, request, spider):
|
||||||
|
raise scrapy.exceptions.StopDownload(fail=False)
|
||||||
|
|
||||||
|
which produces the following output::
|
||||||
|
|
||||||
|
2020-05-19 17:26:12 [scrapy.core.engine] INFO: Spider opened
|
||||||
|
2020-05-19 17:26:12 [scrapy.extensions.logstats] INFO: Crawled 0 pages (at 0 pages/min), scraped 0 items (at 0 items/min)
|
||||||
|
2020-05-19 17:26:13 [scrapy.core.downloader.handlers.http11] DEBUG: Download stopped for <GET https://docs.scrapy.org/en/latest/> from signal handler StopSpider.on_bytes_received
|
||||||
|
2020-05-19 17:26:13 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://docs.scrapy.org/en/latest/> (referer: None) ['download_stopped']
|
||||||
|
2020-05-19 17:26:13 [scrapy.core.scraper] DEBUG: Scraped from <200 https://docs.scrapy.org/en/latest/>
|
||||||
|
{'len': 279, 'last_chars': 'dth, initial-scale=1.0">\n \n <title>Scr'}
|
||||||
|
2020-05-19 17:26:13 [scrapy.core.engine] INFO: Closing spider (finished)
|
||||||
|
|
||||||
|
By default, resulting responses are handled by their corresponding errbacks. To
|
||||||
|
call their callback instead, like in this example, pass ``fail=False`` to the
|
||||||
|
:exc:`~scrapy.exceptions.StopDownload` exception.
|
||||||
|
|
||||||
|
|
||||||
.. _topics-request-response-ref-request-subclasses:
|
.. _topics-request-response-ref-request-subclasses:
|
||||||
|
|
||||||
Request subclasses
|
Request subclasses
|
||||||
@ -566,12 +638,10 @@ dealing with JSON requests.
|
|||||||
set to ``'POST'`` automatically.
|
set to ``'POST'`` automatically.
|
||||||
:type data: JSON serializable object
|
:type data: JSON serializable object
|
||||||
|
|
||||||
:param dumps_kwargs: Parameters that will be passed to underlying `json.dumps`_ method which is used to serialize
|
:param dumps_kwargs: Parameters that will be passed to underlying :func:`json.dumps` method which is used to serialize
|
||||||
data into JSON format.
|
data into JSON format.
|
||||||
:type dumps_kwargs: dict
|
:type dumps_kwargs: dict
|
||||||
|
|
||||||
.. _json.dumps: https://docs.python.org/3/library/json.html#json.dumps
|
|
||||||
|
|
||||||
JsonRequest usage example
|
JsonRequest usage example
|
||||||
-------------------------
|
-------------------------
|
||||||
|
|
||||||
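
As a hedged sketch (the URL is a placeholder), ``dumps_kwargs`` can be used, for instance, to pretty-print the serialized body::

    from scrapy.http import JsonRequest

    data = {'name1': 'value1', 'name2': 'value2'}
    request = JsonRequest(
        url='http://www.example.com/post/action',
        data=data,
        dumps_kwargs={'indent': 4, 'sort_keys': True},  # passed straight to json.dumps
    )
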
@ -620,6 +690,12 @@ Response objects
|
|||||||
:param certificate: an object representing the server's SSL certificate.
|
:param certificate: an object representing the server's SSL certificate.
|
||||||
:type certificate: twisted.internet.ssl.Certificate
|
:type certificate: twisted.internet.ssl.Certificate
|
||||||
|
|
||||||
|
:param ip_address: The IP address of the server from which the Response originated.
|
||||||
|
:type ip_address: :class:`ipaddress.IPv4Address` or :class:`ipaddress.IPv6Address`
|
||||||
|
|
||||||
|
.. versionadded:: 2.1.0
|
||||||
|
The ``ip_address`` parameter.
|
||||||
|
|
||||||
.. attribute:: Response.url
|
.. attribute:: Response.url
|
||||||
|
|
||||||
A string containing the URL of the response.
|
A string containing the URL of the response.
|
||||||
@ -706,9 +782,19 @@ Response objects
|
|||||||
|
|
||||||
A :class:`twisted.internet.ssl.Certificate` object representing
|
A :class:`twisted.internet.ssl.Certificate` object representing
|
||||||
the server's SSL certificate.
|
the server's SSL certificate.
|
||||||
|
|
||||||
Only populated for ``https`` responses, ``None`` otherwise.
|
Only populated for ``https`` responses, ``None`` otherwise.
|
||||||
|
|
||||||
|
.. attribute:: Response.ip_address
|
||||||
|
|
||||||
|
.. versionadded:: 2.1.0
|
||||||
|
|
||||||
|
The IP address of the server from which the Response originated.
|
||||||
|
|
||||||
|
This attribute is currently only populated by the HTTP 1.1 download
|
||||||
|
handler, i.e. for ``http(s)`` responses. For other handlers,
|
||||||
|
:attr:`ip_address` is always ``None``.
|
||||||
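
A minimal, hedged usage sketch inside a spider callback::

    def parse(self, response):
        if response.ip_address is not None:
            self.logger.info('%s was served from %s', response.url, response.ip_address)
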
|
|
||||||
.. method:: Response.copy()
|
.. method:: Response.copy()
|
||||||
|
|
||||||
Returns a new Response which is a copy of this Response.
|
Returns a new Response which is a copy of this Response.
|
||||||
@ -724,18 +810,16 @@ Response objects
|
|||||||
Constructs an absolute url by combining the Response's :attr:`url` with
|
Constructs an absolute url by combining the Response's :attr:`url` with
|
||||||
a possible relative url.
|
a possible relative url.
|
||||||
|
|
||||||
This is a wrapper over `urlparse.urljoin`_, it's merely an alias for
|
This is a wrapper over :func:`~urllib.parse.urljoin`, it's merely an alias for
|
||||||
making this call::
|
making this call::
|
||||||
|
|
||||||
urlparse.urljoin(response.url, url)
|
urllib.parse.urljoin(response.url, url)
|
||||||
|
|
||||||
.. automethod:: Response.follow
|
.. automethod:: Response.follow
|
||||||
|
|
||||||
.. automethod:: Response.follow_all
|
.. automethod:: Response.follow_all
|
||||||
|
|
||||||
|
|
||||||
.. _urlparse.urljoin: https://docs.python.org/2/library/urlparse.html#urlparse.urljoin
|
|
||||||
|
|
||||||
.. _topics-request-response-ref-response-subclasses:
|
.. _topics-request-response-ref-response-subclasses:
|
||||||
|
|
||||||
Response subclasses
|
Response subclasses
|
||||||
@ -824,10 +908,10 @@ TextResponse objects
|
|||||||
|
|
||||||
.. automethod:: TextResponse.follow_all
|
.. automethod:: TextResponse.follow_all
|
||||||
|
|
||||||
.. method:: TextResponse.body_as_unicode()
|
.. automethod:: TextResponse.json()
|
||||||
|
|
||||||
The same as :attr:`text`, but available as a method. This method is
|
Returns a Python object from deserialized JSON document.
|
||||||
kept for backward compatibility; please prefer ``response.text``.
|
The result is cached after the first call.
|
||||||
|
|
||||||
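
For example (a hedged sketch that assumes the response body is valid JSON)::

    def parse(self, response):
        data = response.json()           # e.g. a dict parsed from the body
        yield {'item_count': len(data)}
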
|
|
||||||
HtmlResponse objects
|
HtmlResponse objects
|
||||||
|
@ -14,7 +14,7 @@ achieve this, such as:
|
|||||||
drawback: it's slow.
|
drawback: it's slow.
|
||||||
|
|
||||||
* `lxml`_ is an XML parsing library (which also parses HTML) with a pythonic
|
* `lxml`_ is an XML parsing library (which also parses HTML) with a pythonic
|
||||||
API based on `ElementTree`_. (lxml is not part of the Python standard
|
API based on :mod:`~xml.etree.ElementTree`. (lxml is not part of the Python standard
|
||||||
library.)
|
library.)
|
||||||
|
|
||||||
Scrapy comes with its own mechanism for extracting data. They're called
|
Scrapy comes with its own mechanism for extracting data. They're called
|
||||||
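
As a quick, hedged illustration of what selectors look like in practice::

    >>> from scrapy.selector import Selector
    >>> sel = Selector(text='<html><body><span>good</span></body></html>')
    >>> sel.css('span::text').get()
    'good'
    >>> sel.xpath('//span/text()').get()
    'good'
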
@ -36,7 +36,6 @@ defines selectors to associate those styles with specific HTML elements.
|
|||||||
|
|
||||||
.. _BeautifulSoup: https://www.crummy.com/software/BeautifulSoup/
|
.. _BeautifulSoup: https://www.crummy.com/software/BeautifulSoup/
|
||||||
.. _lxml: https://lxml.de/
|
.. _lxml: https://lxml.de/
|
||||||
.. _ElementTree: https://docs.python.org/2/library/xml.etree.elementtree.html
|
|
||||||
.. _XPath: https://www.w3.org/TR/xpath/all/
|
.. _XPath: https://www.w3.org/TR/xpath/all/
|
||||||
.. _CSS: https://www.w3.org/TR/selectors
|
.. _CSS: https://www.w3.org/TR/selectors
|
||||||
.. _parsel: https://parsel.readthedocs.io/en/latest/
|
.. _parsel: https://parsel.readthedocs.io/en/latest/
|
||||||
|
@ -26,9 +26,7 @@ do this by using an environment variable, ``SCRAPY_SETTINGS_MODULE``.
|
|||||||
|
|
||||||
The value of ``SCRAPY_SETTINGS_MODULE`` should be in Python path syntax, e.g.
|
The value of ``SCRAPY_SETTINGS_MODULE`` should be in Python path syntax, e.g.
|
||||||
``myproject.settings``. Note that the settings module should be on the
|
``myproject.settings``. Note that the settings module should be on the
|
||||||
Python `import search path`_.
|
Python :ref:`import search path <tut-searchpath>`.
|
||||||
|
|
||||||
.. _import search path: https://docs.python.org/2/tutorial/modules.html#the-module-search-path
|
|
||||||
|
|
||||||
.. _populating-settings:
|
.. _populating-settings:
|
||||||
|
|
||||||
@ -238,8 +236,8 @@ CONCURRENT_ITEMS
|
|||||||
|
|
||||||
Default: ``100``
|
Default: ``100``
|
||||||
|
|
||||||
Maximum number of concurrent items (per response) to process in parallel in the
|
Maximum number of concurrent items (per response) to process in parallel in
|
||||||
Item Processor (also known as the :ref:`Item Pipeline <topics-item-pipeline>`).
|
:ref:`item pipelines <topics-item-pipeline>`.
|
||||||
|
|
||||||
.. setting:: CONCURRENT_REQUESTS
|
.. setting:: CONCURRENT_REQUESTS
|
||||||
|
|
||||||
@ -422,10 +420,9 @@ connections (for ``HTTP10DownloadHandler``).
|
|||||||
.. note::
|
.. note::
|
||||||
|
|
||||||
HTTP/1.0 is rarely used nowadays so you can safely ignore this setting,
|
HTTP/1.0 is rarely used nowadays so you can safely ignore this setting,
|
||||||
unless you use Twisted<11.1, or if you really want to use HTTP/1.0
|
unless you really want to use HTTP/1.0 and override
|
||||||
and override :setting:`DOWNLOAD_HANDLERS_BASE` for ``http(s)`` scheme
|
:setting:`DOWNLOAD_HANDLERS` for ``http(s)`` scheme accordingly,
|
||||||
accordingly, i.e. to
|
i.e. to ``'scrapy.core.downloader.handlers.http.HTTP10DownloadHandler'``.
|
||||||
``'scrapy.core.downloader.handlers.http.HTTP10DownloadHandler'``.
|
|
||||||
|
|
||||||
.. setting:: DOWNLOADER_CLIENTCONTEXTFACTORY
|
.. setting:: DOWNLOADER_CLIENTCONTEXTFACTORY
|
||||||
|
|
||||||
@ -449,7 +446,6 @@ or even enable client-side authentication (and various other things).
|
|||||||
Scrapy also has another context factory class that you can set,
|
Scrapy also has another context factory class that you can set,
|
||||||
``'scrapy.core.downloader.contextfactory.BrowserLikeContextFactory'``,
|
``'scrapy.core.downloader.contextfactory.BrowserLikeContextFactory'``,
|
||||||
which uses the platform's certificates to validate remote endpoints.
|
which uses the platform's certificates to validate remote endpoints.
|
||||||
**This is only available if you use Twisted>=14.0.**
|
|
||||||
|
|
||||||
If you do use a custom ContextFactory, make sure its ``__init__`` method
|
If you do use a custom ContextFactory, make sure its ``__init__`` method
|
||||||
accepts a ``method`` parameter (this is the ``OpenSSL.SSL`` method mapping
|
accepts a ``method`` parameter (this is the ``OpenSSL.SSL`` method mapping
|
||||||
@ -473,7 +469,7 @@ necessary to access certain HTTPS websites: for example, you may need to use
|
|||||||
``'DEFAULT:!DH'`` for a website with weak DH parameters or enable a
|
``'DEFAULT:!DH'`` for a website with weak DH parameters or enable a
|
||||||
specific cipher that is not included in ``DEFAULT`` if a website requires it.
|
specific cipher that is not included in ``DEFAULT`` if a website requires it.
|
||||||
|
|
||||||
.. _OpenSSL cipher list format: https://www.openssl.org/docs/manmaster/man1/ciphers.html#CIPHER-LIST-FORMAT
|
.. _OpenSSL cipher list format: https://www.openssl.org/docs/manmaster/man1/openssl-ciphers.html#CIPHER-LIST-FORMAT
|
||||||
|
|
||||||
.. setting:: DOWNLOADER_CLIENT_TLS_METHOD
|
.. setting:: DOWNLOADER_CLIENT_TLS_METHOD
|
||||||
|
|
||||||
@ -496,10 +492,6 @@ This setting must be one of these string values:
|
|||||||
- ``'TLSv1.2'``: forces TLS version 1.2
|
- ``'TLSv1.2'``: forces TLS version 1.2
|
||||||
- ``'SSLv3'``: forces SSL version 3 (**not recommended**)
|
- ``'SSLv3'``: forces SSL version 3 (**not recommended**)
|
||||||
|
|
||||||
.. note::
|
|
||||||
|
|
||||||
We recommend that you use PyOpenSSL>=0.13 and Twisted>=0.13
|
|
||||||
or above (Twisted>=14.0 if you can).
|
|
||||||
|
|
||||||
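
A hedged settings sketch combining the two ideas above (the cipher string and TLS version come from the text above; only use such values when a target site actually requires them)::

    # settings.py
    DOWNLOADER_CLIENT_TLS_CIPHERS = 'DEFAULT:!DH'   # site with weak DH parameters
    DOWNLOADER_CLIENT_TLS_METHOD = 'TLSv1.2'        # force TLS version 1.2
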
.. setting:: DOWNLOADER_CLIENT_TLS_VERBOSE_LOGGING
|
.. setting:: DOWNLOADER_CLIENT_TLS_VERBOSE_LOGGING
|
||||||
|
|
||||||
@ -662,8 +654,6 @@ If you want to disable it set to 0.
|
|||||||
spider attribute and per-request using :reqmeta:`download_maxsize`
|
spider attribute and per-request using :reqmeta:`download_maxsize`
|
||||||
Request.meta key.
|
Request.meta key.
|
||||||
|
|
||||||
This feature needs Twisted >= 11.1.
|
|
||||||
|
|
||||||
.. setting:: DOWNLOAD_WARNSIZE
|
.. setting:: DOWNLOAD_WARNSIZE
|
||||||
|
|
||||||
DOWNLOAD_WARNSIZE
|
DOWNLOAD_WARNSIZE
|
||||||
@ -681,8 +671,6 @@ If you want to disable it set to 0.
|
|||||||
spider attribute and per-request using :reqmeta:`download_warnsize`
|
spider attribute and per-request using :reqmeta:`download_warnsize`
|
||||||
Request.meta key.
|
Request.meta key.
|
||||||
|
|
||||||
This feature needs Twisted >= 11.1.
|
|
||||||
|
|
||||||
.. setting:: DOWNLOAD_FAIL_ON_DATALOSS
|
.. setting:: DOWNLOAD_FAIL_ON_DATALOSS
|
||||||
|
|
||||||
DOWNLOAD_FAIL_ON_DATALOSS
|
DOWNLOAD_FAIL_ON_DATALOSS
|
||||||
@ -899,10 +887,9 @@ LOG_FORMAT
|
|||||||
|
|
||||||
Default: ``'%(asctime)s [%(name)s] %(levelname)s: %(message)s'``
|
Default: ``'%(asctime)s [%(name)s] %(levelname)s: %(message)s'``
|
||||||
|
|
||||||
String for formatting log messages. Refer to the `Python logging documentation`_ for the whole list of available
|
String for formatting log messages. Refer to the
|
||||||
placeholders.
|
:ref:`Python logging documentation <logrecord-attributes>` for the whole
|
||||||
|
list of available placeholders.
|
||||||
.. _Python logging documentation: https://docs.python.org/2/library/logging.html#logrecord-attributes
|
|
||||||
|
|
||||||
.. setting:: LOG_DATEFORMAT
|
.. setting:: LOG_DATEFORMAT
|
||||||
|
|
||||||
@ -912,10 +899,9 @@ LOG_DATEFORMAT
|
|||||||
Default: ``'%Y-%m-%d %H:%M:%S'``
|
Default: ``'%Y-%m-%d %H:%M:%S'``
|
||||||
|
|
||||||
String for formatting date/time, expansion of the ``%(asctime)s`` placeholder
|
String for formatting date/time, expansion of the ``%(asctime)s`` placeholder
|
||||||
in :setting:`LOG_FORMAT`. Refer to the `Python datetime documentation`_ for the whole list of available
|
in :setting:`LOG_FORMAT`. Refer to the
|
||||||
directives.
|
:ref:`Python datetime documentation <strftime-strptime-behavior>` for the
|
||||||
|
whole list of available directives.
|
||||||
.. _Python datetime documentation: https://docs.python.org/2/library/datetime.html#strftime-and-strptime-behavior
|
|
||||||
|
|
||||||
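
For instance, a hedged settings sketch that adds the module name to each record and shortens the timestamp (both placeholders come from the standard :mod:`logging` and :mod:`datetime` documentation)::

    # settings.py
    LOG_FORMAT = '%(asctime)s [%(name)s/%(module)s] %(levelname)s: %(message)s'
    LOG_DATEFORMAT = '%H:%M:%S'
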
.. setting:: LOG_FORMATTER
|
.. setting:: LOG_FORMATTER
|
||||||
|
|
||||||
@ -1116,17 +1102,6 @@ multi-purpose thread pool used by various Scrapy components. Threaded
|
|||||||
DNS Resolver, BlockingFeedStorage, S3FilesStore just to name a few. Increase
|
DNS Resolver, BlockingFeedStorage, S3FilesStore just to name a few. Increase
|
||||||
this value if you're experiencing problems with insufficient blocking IO.
|
this value if you're experiencing problems with insufficient blocking IO.
|
||||||
|
|
||||||
.. setting:: REDIRECT_MAX_TIMES
|
|
||||||
|
|
||||||
REDIRECT_MAX_TIMES
|
|
||||||
------------------
|
|
||||||
|
|
||||||
Default: ``20``
|
|
||||||
|
|
||||||
Defines the maximum times a request can be redirected. After this maximum the
|
|
||||||
request's response is returned as is. We used Firefox default value for the
|
|
||||||
same task.
|
|
||||||
|
|
||||||
.. setting:: REDIRECT_PRIORITY_ADJUST
|
.. setting:: REDIRECT_PRIORITY_ADJUST
|
||||||
|
|
||||||
REDIRECT_PRIORITY_ADJUST
|
REDIRECT_PRIORITY_ADJUST
|
||||||
@ -1422,17 +1397,6 @@ Default: ``True``
|
|||||||
A boolean which specifies if the :ref:`telnet console <topics-telnetconsole>`
|
A boolean which specifies if the :ref:`telnet console <topics-telnetconsole>`
|
||||||
will be enabled (provided its extension is also enabled).
|
will be enabled (provided its extension is also enabled).
|
||||||
|
|
||||||
.. setting:: TELNETCONSOLE_PORT
|
|
||||||
|
|
||||||
TELNETCONSOLE_PORT
|
|
||||||
------------------
|
|
||||||
|
|
||||||
Default: ``[6023, 6073]``
|
|
||||||
|
|
||||||
The port range to use for the telnet console. If set to ``None`` or ``0``, a
|
|
||||||
dynamically assigned port is used. For more info see
|
|
||||||
:ref:`topics-telnetconsole`.
|
|
||||||
|
|
||||||
.. setting:: TEMPLATES_DIR
|
.. setting:: TEMPLATES_DIR
|
||||||
|
|
||||||
TEMPLATES_DIR
|
TEMPLATES_DIR
|
||||||
|
@ -156,6 +156,17 @@ First, we launch the shell::
|
|||||||
|
|
||||||
scrapy shell 'https://scrapy.org' --nolog
|
scrapy shell 'https://scrapy.org' --nolog
|
||||||
|
|
||||||
|
.. note::
|
||||||
|
|
||||||
|
Remember to always enclose URLs in quotes when running the Scrapy shell from
|
||||||
|
the command line, otherwise URLs containing arguments (i.e. the ``&`` character)
|
||||||
|
will not work.
|
||||||
|
|
||||||
|
On Windows, use double quotes instead::
|
||||||
|
|
||||||
|
scrapy shell "https://scrapy.org" --nolog
|
||||||
|
|
||||||
|
|
||||||
Then, the shell fetches the URL (using the Scrapy downloader) and prints the
|
Then, the shell fetches the URL (using the Scrapy downloader) and prints the
|
||||||
list of available objects and useful shortcuts (you'll notice that these lines
|
list of available objects and useful shortcuts (you'll notice that these lines
|
||||||
all start with the ``[s]`` prefix)::
|
all start with the ``[s]`` prefix)::
|
||||||
|
@ -16,8 +16,7 @@ deliver the arguments that the handler receives.
|
|||||||
You can connect to signals (or send your own) through the
|
You can connect to signals (or send your own) through the
|
||||||
:ref:`topics-api-signals`.
|
:ref:`topics-api-signals`.
|
||||||
|
|
||||||
Here is a simple example showing how you can catch signals and perform some action:
|
Here is a simple example showing how you can catch signals and perform some action::
|
||||||
::
|
|
||||||
|
|
||||||
from scrapy import signals
|
from scrapy import signals
|
||||||
from scrapy import Spider
|
from scrapy import Spider
|
||||||
@ -52,9 +51,45 @@ Deferred signal handlers
|
|||||||
========================
|
========================
|
||||||
|
|
||||||
Some signals support returning :class:`~twisted.internet.defer.Deferred`
|
Some signals support returning :class:`~twisted.internet.defer.Deferred`
|
||||||
objects from their handlers, see the :ref:`topics-signals-ref` below to know
|
objects from their handlers, allowing you to run asynchronous code that
|
||||||
which ones.
|
does not block Scrapy. If a signal handler returns a
|
||||||
|
:class:`~twisted.internet.defer.Deferred`, Scrapy waits for that
|
||||||
|
:class:`~twisted.internet.defer.Deferred` to fire.
|
||||||
|
|
||||||
|
Let's take an example::
|
||||||
|
|
||||||
|
class SignalSpider(scrapy.Spider):
|
||||||
|
name = 'signals'
|
||||||
|
start_urls = ['http://quotes.toscrape.com/page/1/']
|
||||||
|
|
||||||
|
@classmethod
|
||||||
|
def from_crawler(cls, crawler, *args, **kwargs):
|
||||||
|
spider = super(SignalSpider, cls).from_crawler(crawler, *args, **kwargs)
|
||||||
|
crawler.signals.connect(spider.item_scraped, signal=signals.item_scraped)
|
||||||
|
return spider
|
||||||
|
|
||||||
|
def item_scraped(self, item):
|
||||||
|
# Send the scraped item to the server
|
||||||
|
d = treq.post(
|
||||||
|
'http://example.com/post',
|
||||||
|
json.dumps(item).encode('ascii'),
|
||||||
|
headers={b'Content-Type': [b'application/json']}
|
||||||
|
)
|
||||||
|
|
||||||
|
# The next item will be scraped only after
|
||||||
|
# deferred (d) is fired
|
||||||
|
return d
|
||||||
|
|
||||||
|
def parse(self, response):
|
||||||
|
for quote in response.css('div.quote'):
|
||||||
|
yield {
|
||||||
|
'text': quote.css('span.text::text').get(),
|
||||||
|
'author': quote.css('small.author::text').get(),
|
||||||
|
'tags': quote.css('div.tags a.tag::text').getall(),
|
||||||
|
}
|
||||||
|
|
||||||
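
The example above assumes the following module-level imports (``treq`` is a third-party HTTP client, not bundled with Scrapy)::

    import json

    import scrapy
    import treq
    from scrapy import signals
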
|
See the :ref:`topics-signals-ref` below to know which signals support
|
||||||
|
:class:`~twisted.internet.defer.Deferred`.
|
||||||
|
|
||||||
.. _topics-signals-ref:
|
.. _topics-signals-ref:
|
||||||
|
|
||||||
@ -66,22 +101,25 @@ Built-in signals reference
|
|||||||
|
|
||||||
Here's the list of Scrapy built-in signals and their meaning.
|
Here's the list of Scrapy built-in signals and their meaning.
|
||||||
|
|
||||||
engine_started
|
Engine signals
|
||||||
--------------
|
--------------
|
||||||
|
|
||||||
|
engine_started
|
||||||
|
~~~~~~~~~~~~~~
|
||||||
|
|
||||||
.. signal:: engine_started
|
.. signal:: engine_started
|
||||||
.. function:: engine_started()
|
.. function:: engine_started()
|
||||||
|
|
||||||
Sent when the Scrapy engine has started crawling.
|
Sent when the Scrapy engine has started crawling.
|
||||||
|
|
||||||
This signal supports returning deferreds from their handlers.
|
This signal supports returning deferreds from its handlers.
|
||||||
|
|
||||||
.. note:: This signal may be fired *after* the :signal:`spider_opened` signal,
|
.. note:: This signal may be fired *after* the :signal:`spider_opened` signal,
|
||||||
depending on how the spider was started. So **don't** rely on this signal
|
depending on how the spider was started. So **don't** rely on this signal
|
||||||
getting fired before :signal:`spider_opened`.
|
getting fired before :signal:`spider_opened`.
|
||||||
|
|
||||||
engine_stopped
|
engine_stopped
|
||||||
--------------
|
~~~~~~~~~~~~~~
|
||||||
|
|
||||||
.. signal:: engine_stopped
|
.. signal:: engine_stopped
|
||||||
.. function:: engine_stopped()
|
.. function:: engine_stopped()
|
||||||
@ -89,10 +127,21 @@ engine_stopped
|
|||||||
Sent when the Scrapy engine is stopped (for example, when a crawling
|
Sent when the Scrapy engine is stopped (for example, when a crawling
|
||||||
process has finished).
|
process has finished).
|
||||||
|
|
||||||
This signal supports returning deferreds from their handlers.
|
This signal supports returning deferreds from its handlers.
|
||||||
|
|
||||||
|
Item signals
|
||||||
|
------------
|
||||||
|
|
||||||
|
.. note::
|
||||||
|
As at max :setting:`CONCURRENT_ITEMS` items are processed in
|
||||||
|
parallel, many deferreds are fired together using
|
||||||
|
:class:`~twisted.internet.defer.DeferredList`. Hence the next
|
||||||
|
batch waits for the :class:`~twisted.internet.defer.DeferredList`
|
||||||
|
to fire and then runs the respective item signal handler for
|
||||||
|
the next batch of scraped items.
|
||||||
|
|
||||||
item_scraped
|
item_scraped
|
||||||
------------
|
~~~~~~~~~~~~
|
||||||
|
|
||||||
.. signal:: item_scraped
|
.. signal:: item_scraped
|
||||||
.. function:: item_scraped(item, response, spider)
|
.. function:: item_scraped(item, response, spider)
|
||||||
@ -100,10 +149,10 @@ item_scraped
|
|||||||
Sent when an item has been scraped, after it has passed all the
|
Sent when an item has been scraped, after it has passed all the
|
||||||
:ref:`topics-item-pipeline` stages (without being dropped).
|
:ref:`topics-item-pipeline` stages (without being dropped).
|
||||||
|
|
||||||
This signal supports returning deferreds from their handlers.
|
This signal supports returning deferreds from its handlers.
|
||||||
|
|
||||||
:param item: the item scraped
|
:param item: the scraped item
|
||||||
:type item: dict or :class:`~scrapy.item.Item` object
|
:type item: :ref:`item object <item-types>`
|
||||||
|
|
||||||
:param spider: the spider which scraped the item
|
:param spider: the spider which scraped the item
|
||||||
:type spider: :class:`~scrapy.spiders.Spider` object
|
:type spider: :class:`~scrapy.spiders.Spider` object
|
||||||
@ -112,7 +161,7 @@ item_scraped
|
|||||||
:type response: :class:`~scrapy.http.Response` object
|
:type response: :class:`~scrapy.http.Response` object
|
||||||
|
|
||||||
item_dropped
|
item_dropped
|
||||||
------------
|
~~~~~~~~~~~~
|
||||||
|
|
||||||
.. signal:: item_dropped
|
.. signal:: item_dropped
|
||||||
.. function:: item_dropped(item, response, exception, spider)
|
.. function:: item_dropped(item, response, exception, spider)
|
||||||
@ -120,10 +169,10 @@ item_dropped
|
|||||||
Sent after an item has been dropped from the :ref:`topics-item-pipeline`
|
Sent after an item has been dropped from the :ref:`topics-item-pipeline`
|
||||||
when some stage raised a :exc:`~scrapy.exceptions.DropItem` exception.
|
when some stage raised a :exc:`~scrapy.exceptions.DropItem` exception.
|
||||||
|
|
||||||
This signal supports returning deferreds from their handlers.
|
This signal supports returning deferreds from its handlers.
|
||||||
|
|
||||||
:param item: the item dropped from the :ref:`topics-item-pipeline`
|
:param item: the item dropped from the :ref:`topics-item-pipeline`
|
||||||
:type item: dict or :class:`~scrapy.item.Item` object
|
:type item: :ref:`item object <item-types>`
|
||||||
|
|
||||||
:param spider: the spider which scraped the item
|
:param spider: the spider which scraped the item
|
||||||
:type spider: :class:`~scrapy.spiders.Spider` object
|
:type spider: :class:`~scrapy.spiders.Spider` object
|
||||||
@ -137,7 +186,7 @@ item_dropped
|
|||||||
:type exception: :exc:`~scrapy.exceptions.DropItem` exception
|
:type exception: :exc:`~scrapy.exceptions.DropItem` exception
|
||||||
|
|
||||||
item_error
|
item_error
|
||||||
------------
|
~~~~~~~~~~
|
||||||
|
|
||||||
.. signal:: item_error
|
.. signal:: item_error
|
||||||
.. function:: item_error(item, response, spider, failure)
|
.. function:: item_error(item, response, spider, failure)
|
||||||
@ -145,10 +194,10 @@ item_error
|
|||||||
Sent when a :ref:`topics-item-pipeline` generates an error (i.e. raises
|
Sent when a :ref:`topics-item-pipeline` generates an error (i.e. raises
|
||||||
an exception), except :exc:`~scrapy.exceptions.DropItem` exception.
|
an exception), except :exc:`~scrapy.exceptions.DropItem` exception.
|
||||||
|
|
||||||
This signal supports returning deferreds from their handlers.
|
This signal supports returning deferreds from its handlers.
|
||||||
|
|
||||||
:param item: the item dropped from the :ref:`topics-item-pipeline`
|
:param item: the item that caused the error in the :ref:`topics-item-pipeline`
|
||||||
:type item: dict or :class:`~scrapy.item.Item` object
|
:type item: :ref:`item object <item-types>`
|
||||||
|
|
||||||
:param response: the response being processed when the exception was raised
|
:param response: the response being processed when the exception was raised
|
||||||
:type response: :class:`~scrapy.http.Response` object
|
:type response: :class:`~scrapy.http.Response` object
|
||||||
@ -159,8 +208,11 @@ item_error
|
|||||||
:param failure: the exception raised
|
:param failure: the exception raised
|
||||||
:type failure: twisted.python.failure.Failure
|
:type failure: twisted.python.failure.Failure
|
||||||
|
|
||||||
|
Spider signals
|
||||||
|
--------------
|
||||||
|
|
||||||
spider_closed
|
spider_closed
|
||||||
-------------
|
~~~~~~~~~~~~~
|
||||||
|
|
||||||
.. signal:: spider_closed
|
.. signal:: spider_closed
|
||||||
.. function:: spider_closed(spider, reason)
|
.. function:: spider_closed(spider, reason)
|
||||||
@ -168,7 +220,7 @@ spider_closed
|
|||||||
Sent after a spider has been closed. This can be used to release per-spider
|
Sent after a spider has been closed. This can be used to release per-spider
|
||||||
resources reserved on :signal:`spider_opened`.
|
resources reserved on :signal:`spider_opened`.
|
||||||
|
|
||||||
This signal supports returning deferreds from their handlers.
|
This signal supports returning deferreds from its handlers.
|
||||||
|
|
||||||
:param spider: the spider which has been closed
|
:param spider: the spider which has been closed
|
||||||
:type spider: :class:`~scrapy.spiders.Spider` object
|
:type spider: :class:`~scrapy.spiders.Spider` object
|
||||||
@ -183,7 +235,7 @@ spider_closed
|
|||||||
:type reason: str
|
:type reason: str
|
||||||
|
|
||||||
spider_opened
|
spider_opened
|
||||||
-------------
|
~~~~~~~~~~~~~
|
||||||
|
|
||||||
.. signal:: spider_opened
|
.. signal:: spider_opened
|
||||||
.. function:: spider_opened(spider)
|
.. function:: spider_opened(spider)
|
||||||
@ -192,13 +244,13 @@ spider_opened
|
|||||||
reserve per-spider resources, but can be used for any task that needs to be
|
reserve per-spider resources, but can be used for any task that needs to be
|
||||||
performed when a spider is opened.
|
performed when a spider is opened.
|
||||||
|
|
||||||
This signal supports returning deferreds from their handlers.
|
This signal supports returning deferreds from its handlers.
|
||||||
|
|
||||||
:param spider: the spider which has been opened
|
:param spider: the spider which has been opened
|
||||||
:type spider: :class:`~scrapy.spiders.Spider` object
|
:type spider: :class:`~scrapy.spiders.Spider` object
|
||||||
|
|
||||||
spider_idle
|
spider_idle
|
||||||
-----------
|
~~~~~~~~~~~
|
||||||
|
|
||||||
.. signal:: spider_idle
|
.. signal:: spider_idle
|
||||||
.. function:: spider_idle(spider)
|
.. function:: spider_idle(spider)
|
||||||
@ -216,7 +268,7 @@ spider_idle
|
|||||||
You may raise a :exc:`~scrapy.exceptions.DontCloseSpider` exception to
|
You may raise a :exc:`~scrapy.exceptions.DontCloseSpider` exception to
|
||||||
prevent the spider from being closed.
|
prevent the spider from being closed.
|
||||||
|
|
||||||
This signal does not support returning deferreds from their handlers.
|
This signal does not support returning deferreds from its handlers.
|
||||||
|
|
||||||
:param spider: the spider which has gone idle
|
:param spider: the spider which has gone idle
|
||||||
:type spider: :class:`~scrapy.spiders.Spider` object
|
:type spider: :class:`~scrapy.spiders.Spider` object
|
||||||
@ -228,14 +280,14 @@ spider_idle
|
|||||||
due to duplication).
|
due to duplication).
|
||||||
|
|
||||||
spider_error
|
spider_error
|
||||||
------------
|
~~~~~~~~~~~~
|
||||||
|
|
||||||
.. signal:: spider_error
|
.. signal:: spider_error
|
||||||
.. function:: spider_error(failure, response, spider)
|
.. function:: spider_error(failure, response, spider)
|
||||||
|
|
||||||
Sent when a spider callback generates an error (i.e. raises an exception).
|
Sent when a spider callback generates an error (i.e. raises an exception).
|
||||||
|
|
||||||
This signal does not support returning deferreds from their handlers.
|
This signal does not support returning deferreds from its handlers.
|
||||||
|
|
||||||
:param failure: the exception raised
|
:param failure: the exception raised
|
||||||
:type failure: twisted.python.failure.Failure
|
:type failure: twisted.python.failure.Failure
|
||||||
@ -246,8 +298,11 @@ spider_error
|
|||||||
:param spider: the spider which raised the exception
|
:param spider: the spider which raised the exception
|
||||||
:type spider: :class:`~scrapy.spiders.Spider` object
|
:type spider: :class:`~scrapy.spiders.Spider` object
|
||||||
|
|
||||||
|
Request signals
|
||||||
|
---------------
|
||||||
|
|
||||||
request_scheduled
|
request_scheduled
|
||||||
-----------------
|
~~~~~~~~~~~~~~~~~
|
||||||
|
|
||||||
.. signal:: request_scheduled
|
.. signal:: request_scheduled
|
||||||
.. function:: request_scheduled(request, spider)
|
.. function:: request_scheduled(request, spider)
|
||||||
@ -255,7 +310,7 @@ request_scheduled
|
|||||||
Sent when the engine schedules a :class:`~scrapy.http.Request`, to be
|
Sent when the engine schedules a :class:`~scrapy.http.Request`, to be
|
||||||
downloaded later.
|
downloaded later.
|
||||||
|
|
||||||
The signal does not support returning deferreds from their handlers.
|
This signal does not support returning deferreds from its handlers.
|
||||||
|
|
||||||
:param request: the request that reached the scheduler
|
:param request: the request that reached the scheduler
|
||||||
:type request: :class:`~scrapy.http.Request` object
|
:type request: :class:`~scrapy.http.Request` object
|
||||||
@ -264,7 +319,7 @@ request_scheduled
|
|||||||
:type spider: :class:`~scrapy.spiders.Spider` object
|
:type spider: :class:`~scrapy.spiders.Spider` object
|
||||||
|
|
||||||
request_dropped
|
request_dropped
|
||||||
---------------
|
~~~~~~~~~~~~~~~
|
||||||
|
|
||||||
.. signal:: request_dropped
|
.. signal:: request_dropped
|
||||||
.. function:: request_dropped(request, spider)
|
.. function:: request_dropped(request, spider)
|
||||||
@ -272,7 +327,7 @@ request_dropped
|
|||||||
Sent when a :class:`~scrapy.http.Request`, scheduled by the engine to be
|
Sent when a :class:`~scrapy.http.Request`, scheduled by the engine to be
|
||||||
downloaded later, is rejected by the scheduler.
|
downloaded later, is rejected by the scheduler.
|
||||||
|
|
||||||
The signal does not support returning deferreds from their handlers.
|
This signal does not support returning deferreds from its handlers.
|
||||||
|
|
||||||
:param request: the request that reached the scheduler
|
:param request: the request that reached the scheduler
|
||||||
:type request: :class:`~scrapy.http.Request` object
|
:type request: :class:`~scrapy.http.Request` object
|
||||||
@ -281,14 +336,14 @@ request_dropped
|
|||||||
:type spider: :class:`~scrapy.spiders.Spider` object
|
:type spider: :class:`~scrapy.spiders.Spider` object
|
||||||
|
|
||||||
request_reached_downloader
|
request_reached_downloader
|
||||||
---------------------------
|
~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||||
|
|
||||||
.. signal:: request_reached_downloader
|
.. signal:: request_reached_downloader
|
||||||
.. function:: request_reached_downloader(request, spider)
|
.. function:: request_reached_downloader(request, spider)
|
||||||
|
|
||||||
Sent when a :class:`~scrapy.http.Request` reached downloader.
|
Sent when a :class:`~scrapy.http.Request` reached downloader.
|
||||||
|
|
||||||
The signal does not support returning deferreds from their handlers.
|
This signal does not support returning deferreds from its handlers.
|
||||||
|
|
||||||
:param request: the request that reached downloader
|
:param request: the request that reached downloader
|
||||||
:type request: :class:`~scrapy.http.Request` object
|
:type request: :class:`~scrapy.http.Request` object
|
||||||
@ -297,7 +352,7 @@ request_reached_downloader
|
|||||||
:type spider: :class:`~scrapy.spiders.Spider` object
|
:type spider: :class:`~scrapy.spiders.Spider` object
|
||||||
|
|
||||||
request_left_downloader
|
request_left_downloader
|
||||||
-----------------------
|
~~~~~~~~~~~~~~~~~~~~~~~
|
||||||
|
|
||||||
.. signal:: request_left_downloader
|
.. signal:: request_left_downloader
|
||||||
.. function:: request_left_downloader(request, spider)
|
.. function:: request_left_downloader(request, spider)
|
||||||
@ -315,8 +370,41 @@ request_left_downloader
|
|||||||
:param spider: the spider that yielded the request
|
:param spider: the spider that yielded the request
|
||||||
:type spider: :class:`~scrapy.spiders.Spider` object
|
:type spider: :class:`~scrapy.spiders.Spider` object
|
||||||
|
|
||||||
|
bytes_received
|
||||||
|
~~~~~~~~~~~~~~
|
||||||
|
|
||||||
|
.. versionadded:: 2.2
|
||||||
|
|
||||||
|
.. signal:: bytes_received
|
||||||
|
.. function:: bytes_received(data, request, spider)
|
||||||
|
|
||||||
|
Sent by the HTTP 1.1 and S3 download handlers when a group of bytes is
|
||||||
|
received for a specific request. This signal might be fired multiple
|
||||||
|
times for the same request, with partial data each time. For instance,
|
||||||
|
a possible scenario for a 25 kb response would be two signals fired
|
||||||
|
with 10 kb of data, and a final one with 5 kb of data.
|
||||||
|
|
||||||
|
This signal does not support returning deferreds from its handlers.
|
||||||
|
|
||||||
|
:param data: the data received by the download handler
|
||||||
|
:type data: :class:`bytes` object
|
||||||
|
|
||||||
|
:param request: the request that generated the download
|
||||||
|
:type request: :class:`~scrapy.http.Request` object
|
||||||
|
|
||||||
|
:param spider: the spider associated with the response
|
||||||
|
:type spider: :class:`~scrapy.spiders.Spider` object
|
||||||
|
|
||||||
|
.. note:: Handlers of this signal can stop the download of a response while it
|
||||||
|
is in progress by raising the :exc:`~scrapy.exceptions.StopDownload`
|
||||||
|
exception. Please refer to the :ref:`topics-stop-response-download` topic
|
||||||
|
for additional information and examples.
|
||||||
|
|
||||||
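
As a hedged sketch (class and stats key are illustrative), an extension could use this signal to keep a running byte count in the stats collector::

    from scrapy import signals


    class BytesReceivedCounter:

        @classmethod
        def from_crawler(cls, crawler):
            ext = cls(crawler.stats)
            crawler.signals.connect(ext.bytes_received, signal=signals.bytes_received)
            return ext

        def __init__(self, stats):
            self.stats = stats

        def bytes_received(self, data, request, spider):
            # Count every chunk delivered by the download handler.
            self.stats.inc_value('custom/bytes_received_count', len(data))

It would be enabled through the :setting:`EXTENSIONS` setting like any other extension.
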
|
Response signals
|
||||||
|
----------------
|
||||||
|
|
||||||
response_received
|
response_received
|
||||||
-----------------
|
~~~~~~~~~~~~~~~~~
|
||||||
|
|
||||||
.. signal:: response_received
|
.. signal:: response_received
|
||||||
.. function:: response_received(response, request, spider)
|
.. function:: response_received(response, request, spider)
|
||||||
@ -324,7 +412,7 @@ response_received
|
|||||||
Sent when the engine receives a new :class:`~scrapy.http.Response` from the
|
Sent when the engine receives a new :class:`~scrapy.http.Response` from the
|
||||||
downloader.
|
downloader.
|
||||||
|
|
||||||
This signal does not support returning deferreds from their handlers.
|
This signal does not support returning deferreds from its handlers.
|
||||||
|
|
||||||
:param response: the response received
|
:param response: the response received
|
||||||
:type response: :class:`~scrapy.http.Response` object
|
:type response: :class:`~scrapy.http.Response` object
|
||||||
@ -336,14 +424,14 @@ response_received
|
|||||||
:type spider: :class:`~scrapy.spiders.Spider` object
|
:type spider: :class:`~scrapy.spiders.Spider` object
|
||||||
|
|
||||||
response_downloaded
|
response_downloaded
|
||||||
-------------------
|
~~~~~~~~~~~~~~~~~~~
|
||||||
|
|
||||||
.. signal:: response_downloaded
|
.. signal:: response_downloaded
|
||||||
.. function:: response_downloaded(response, request, spider)
|
.. function:: response_downloaded(response, request, spider)
|
||||||
|
|
||||||
Sent by the downloader right after a ``HTTPResponse`` is downloaded.
|
Sent by the downloader right after a ``HTTPResponse`` is downloaded.
|
||||||
|
|
||||||
This signal does not support returning deferreds from their handlers.
|
This signal does not support returning deferreds from its handlers.
|
||||||
|
|
||||||
:param response: the response downloaded
|
:param response: the response downloaded
|
||||||
:type response: :class:`~scrapy.http.Response` object
|
:type response: :class:`~scrapy.http.Response` object
|
||||||
|
@ -102,29 +102,28 @@ object gives you access, for example, to the :ref:`settings <topics-settings>`.
|
|||||||
it has processed the response.
|
it has processed the response.
|
||||||
|
|
||||||
:meth:`process_spider_output` must return an iterable of
|
:meth:`process_spider_output` must return an iterable of
|
||||||
:class:`~scrapy.http.Request`, dict or :class:`~scrapy.item.Item`
|
:class:`~scrapy.http.Request` objects and :ref:`item objects
|
||||||
objects.
|
<topics-items>`.
|
||||||
|
|
||||||
:param response: the response which generated this output from the
|
:param response: the response which generated this output from the
|
||||||
spider
|
spider
|
||||||
:type response: :class:`~scrapy.http.Response` object
|
:type response: :class:`~scrapy.http.Response` object
|
||||||
|
|
||||||
:param result: the result returned by the spider
|
:param result: the result returned by the spider
|
||||||
:type result: an iterable of :class:`~scrapy.http.Request`, dict
|
:type result: an iterable of :class:`~scrapy.http.Request` objects and
|
||||||
or :class:`~scrapy.item.Item` objects
|
:ref:`item object <topics-items>`
|
||||||
|
|
||||||
:param spider: the spider whose result is being processed
|
:param spider: the spider whose result is being processed
|
||||||
:type spider: :class:`~scrapy.spiders.Spider` object
|
:type spider: :class:`~scrapy.spiders.Spider` object
|
||||||
|
|
||||||
|
|
||||||
.. method:: process_spider_exception(response, exception, spider)
|
.. method:: process_spider_exception(response, exception, spider)
|
||||||
|
|
||||||
This method is called when a spider or :meth:`process_spider_output`
|
This method is called when a spider or :meth:`process_spider_output`
|
||||||
method (from a previous spider middleware) raises an exception.
|
method (from a previous spider middleware) raises an exception.
|
||||||
|
|
||||||
:meth:`process_spider_exception` should return either ``None`` or an
|
:meth:`process_spider_exception` should return either ``None`` or an
|
||||||
iterable of :class:`~scrapy.http.Request`, dict or
|
iterable of :class:`~scrapy.http.Request` objects and :ref:`item object
|
||||||
:class:`~scrapy.item.Item` objects.
|
<topics-items>`.
|
||||||
|
|
||||||
If it returns ``None``, Scrapy will continue processing this exception,
|
If it returns ``None``, Scrapy will continue processing this exception,
|
||||||
executing any other :meth:`process_spider_exception` in the following
|
executing any other :meth:`process_spider_exception` in the following
|
||||||
@ -140,7 +139,7 @@ object gives you access, for example, to the :ref:`settings <topics-settings>`.
|
|||||||
:type response: :class:`~scrapy.http.Response` object
|
:type response: :class:`~scrapy.http.Response` object
|
||||||
|
|
||||||
:param exception: the exception raised
|
:param exception: the exception raised
|
||||||
:type exception: `Exception`_ object
|
:type exception: :exc:`Exception` object
|
||||||
|
|
||||||
:param spider: the spider which raised the exception
|
:param spider: the spider which raised the exception
|
||||||
:type spider: :class:`~scrapy.spiders.Spider` object
|
:type spider: :class:`~scrapy.spiders.Spider` object
|
||||||
@ -173,20 +172,16 @@ object gives you access, for example, to the :ref:`settings <topics-settings>`.
|
|||||||
:type spider: :class:`~scrapy.spiders.Spider` object
|
:type spider: :class:`~scrapy.spiders.Spider` object
|
||||||
|
|
||||||
.. method:: from_crawler(cls, crawler)
|
.. method:: from_crawler(cls, crawler)
|
||||||
|
|
||||||
If present, this classmethod is called to create a middleware instance
|
If present, this classmethod is called to create a middleware instance
|
||||||
from a :class:`~scrapy.crawler.Crawler`. It must return a new instance
|
from a :class:`~scrapy.crawler.Crawler`. It must return a new instance
|
||||||
of the middleware. Crawler object provides access to all Scrapy core
|
of the middleware. Crawler object provides access to all Scrapy core
|
||||||
components like settings and signals; it is a way for middleware to
|
components like settings and signals; it is a way for middleware to
|
||||||
access them and hook its functionality into Scrapy.
|
access them and hook its functionality into Scrapy.
|
||||||
|
|
||||||
:param crawler: crawler that uses this middleware
|
:param crawler: crawler that uses this middleware
|
||||||
:type crawler: :class:`~scrapy.crawler.Crawler` object
|
:type crawler: :class:`~scrapy.crawler.Crawler` object
|
||||||
|
|
||||||
|
|
||||||
.. _Exception: https://docs.python.org/2/library/exceptions.html#exceptions.Exception
|
|
||||||
|
|
||||||
|
|
||||||
.. _topics-spider-middleware-ref:
|
.. _topics-spider-middleware-ref:
|
||||||
|
|
||||||
Built-in spider middleware reference
|
Built-in spider middleware reference
|
||||||
|
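To make those hooks concrete, here is a minimal, hypothetical spider middleware that
drops items missing a required field and logs callback exceptions. The middleware name,
the ``REQUIRED_ITEM_FIELD`` setting and the filtering rule are illustrative assumptions,
and the class would be enabled through ``SPIDER_MIDDLEWARES``::

    import logging

    from scrapy import Request
    from scrapy.exceptions import NotConfigured

    logger = logging.getLogger(__name__)


    class DropIncompleteItemsMiddleware:

        def __init__(self, required_field):
            self.required_field = required_field

        @classmethod
        def from_crawler(cls, crawler):
            field = crawler.settings.get("REQUIRED_ITEM_FIELD")  # made-up setting
            if not field:
                raise NotConfigured
            return cls(field)

        def process_spider_output(self, response, result, spider):
            for x in result:
                # Pass requests and complete items through, drop the rest.
                if isinstance(x, Request) or x.get(self.required_field) is not None:
                    yield x

        def process_spider_exception(self, response, exception, spider):
            logger.warning("Callback failed for %s: %s", response.url, exception)
            return []  # swallow the exception, produce no output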
@@ -23,8 +23,8 @@ For spiders, the scraping cycle goes through something like this:
    :attr:`~scrapy.spiders.Spider.parse` method as callback function for the
    Requests.

-2. In the callback function, you parse the response (web page) and return either
-   dicts with extracted data, :class:`~scrapy.item.Item` objects,
+2. In the callback function, you parse the response (web page) and return
+   :ref:`item objects <topics-items>`,
    :class:`~scrapy.http.Request` objects, or an iterable of these objects.
    Those Requests will also contain a callback (maybe
    the same) and will then be downloaded by Scrapy and then their
@@ -121,7 +121,7 @@ scrapy.Spider
 send log messages through it as described on
 :ref:`topics-logging-from-spiders`.

-.. method:: from_crawler(crawler, \*args, \**kwargs)
+.. method:: from_crawler(crawler, *args, **kwargs)

 This is the class method used by Scrapy to create your spiders.

@@ -179,8 +179,8 @@ scrapy.Spider
 the same requirements as the :class:`Spider` class.

 This method, as well as any other Request callback, must return an
-iterable of :class:`~scrapy.http.Request` and/or
-dicts or :class:`~scrapy.item.Item` objects.
+iterable of :class:`~scrapy.http.Request` and/or :ref:`item objects
+<topics-items>`.

 :param response: the response to parse
 :type response: :class:`~scrapy.http.Response`
@@ -234,7 +234,7 @@ Return multiple Requests and items from a single callback::
     yield scrapy.Request(response.urljoin(href), self.parse)

 Instead of :attr:`~.start_urls` you can use :meth:`~.start_requests` directly;
-to give data more structure you can use :ref:`topics-items`::
+to give data more structure you can use :class:`~scrapy.item.Item` objects::

     import scrapy
     from myproject.items import MyItem
@@ -298,9 +298,7 @@ Keep in mind that spider arguments are only strings.
 The spider will not do any parsing on its own.
 If you were to set the ``start_urls`` attribute from the command line,
 you would have to parse it on your own into a list
-using something like
-`ast.literal_eval <https://docs.python.org/3/library/ast.html#ast.literal_eval>`_
-or `json.loads <https://docs.python.org/3/library/json.html#json.loads>`_
+using something like :func:`ast.literal_eval` or :func:`json.loads`
 and then set it as an attribute.
 Otherwise, you would cause iteration over a ``start_urls`` string
 (a very common python pitfall)
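A hedged sketch of that approach, passing a JSON list through ``-a`` and decoding it in
``__init__``; the spider and argument names are placeholders for illustration::

    import json

    import scrapy


    class ArgSpider(scrapy.Spider):  # illustrative spider
        name = "argspider"

        def __init__(self, start_urls="[]", *args, **kwargs):
            super().__init__(*args, **kwargs)
            # e.g. scrapy crawl argspider -a 'start_urls=["http://example.com"]'
            self.start_urls = json.loads(start_urls)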
@@ -366,7 +364,7 @@ CrawlSpider

 This method is called for the start_urls responses. It allows to parse
 the initial responses and must return either an
-:class:`~scrapy.item.Item` object, a :class:`~scrapy.http.Request`
+:ref:`item object <topics-items>`, a :class:`~scrapy.http.Request`
 object, or an iterable containing any of them.

 Crawling rules
@@ -385,7 +383,7 @@ Crawling rules
 object with that name will be used) to be called for each link extracted with
 the specified link extractor. This callback receives a :class:`~scrapy.http.Response`
 as its first argument and must return either a single instance or an iterable of
-:class:`~scrapy.item.Item`, ``dict`` and/or :class:`~scrapy.http.Request` objects
+:ref:`item objects <topics-items>` and/or :class:`~scrapy.http.Request` objects
 (or any subclass of them). As mentioned above, the received :class:`~scrapy.http.Response`
 object will contain the text of the link that produced the :class:`~scrapy.http.Request`
 in its ``meta`` dictionary (under the ``link_text`` key)
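For instance, a minimal sketch of a rule whose callback reads that ``link_text`` entry;
the domain, URL pattern and field names are placeholders::

    from scrapy.linkextractors import LinkExtractor
    from scrapy.spiders import CrawlSpider, Rule


    class BookSpider(CrawlSpider):  # illustrative spider
        name = "books"
        start_urls = ["http://example.com/"]

        rules = (
            Rule(LinkExtractor(allow=r"/item/"), callback="parse_item"),
        )

        def parse_item(self, response):
            # The text of the followed link is exposed under the link_text meta key.
            yield {
                "url": response.url,
                "link_text": response.meta.get("link_text"),
            }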
@@ -533,7 +531,7 @@ XMLFeedSpider
 (``itertag``). Receives the response and an
 :class:`~scrapy.selector.Selector` for each node. Overriding this
 method is mandatory. Otherwise, you spider won't work. This method
-must return either a :class:`~scrapy.item.Item` object, a
+must return an :ref:`item object <topics-items>`, a
 :class:`~scrapy.http.Request` object, or an iterable containing any of
 them.

@@ -543,7 +541,7 @@ XMLFeedSpider
 spider, and it's intended to perform any last time processing required
 before returning the results to the framework core, for example setting the
 item IDs. It receives a list of results and the response which originated
-those results. It must return a list of results (Items or Requests).
+those results. It must return a list of results (items or requests).


 XMLFeedSpider example
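A bare-bones sketch of a ``parse_node`` implementation along those lines; the feed URL,
``itertag`` value and field names are placeholders::

    from scrapy.spiders import XMLFeedSpider


    class NewsFeedSpider(XMLFeedSpider):  # illustrative spider
        name = "newsfeed"
        start_urls = ["http://example.com/feed.xml"]
        itertag = "item"

        def parse_node(self, response, node):
            # `node` is a Selector scoped to one <item> element.
            yield {
                "title": node.xpath("title/text()").get(),
                "link": node.xpath("link/text()").get(),
            }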
@@ -40,10 +40,10 @@ the console you need to type::
     Connected to localhost.
     Escape character is '^]'.
     Username:
     Password:
     >>>

 By default Username is ``scrapy`` and Password is autogenerated. The
 autogenerated Password can be seen on Scrapy logs like the example below::

     2018-10-16 14:35:21 [scrapy.extensions.telnet] INFO: Telnet Password: 16f92501e8a59326
@@ -63,7 +63,7 @@ Available variables in the telnet console
 =========================================

 The telnet console is like a regular Python shell running inside the Scrapy
 process, so you can do anything from it including importing new modules, etc.

 However, the telnet console comes with some default variables defined for
 convenience:
@@ -89,13 +89,11 @@ convenience:
 +----------------+-------------------------------------------------------------------+
 | ``prefs``      | for memory debugging (see :ref:`topics-leaks`)                    |
 +----------------+-------------------------------------------------------------------+
-| ``p``          | a shortcut to the `pprint.pprint`_ function                       |
+| ``p``          | a shortcut to the :func:`pprint.pprint` function                  |
 +----------------+-------------------------------------------------------------------+
 | ``hpy``        | for memory debugging (see :ref:`topics-leaks`)                    |
 +----------------+-------------------------------------------------------------------+

-.. _pprint.pprint: https://docs.python.org/library/pprint.html#pprint.pprint

 Telnet console usage examples
 =============================

@@ -208,4 +206,3 @@ Default: ``None``

 The password used for the telnet console, default behaviour is to have it
 autogenerated
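As a quick illustration of those variables, once connected you can poke at the running
crawl from the ``>>>`` prompt; the exact output depends on the project, so treat this
as a sketch rather than a transcript::

    >>> est()                      # print a report of the engine status
    >>> p(stats.get_stats())       # pretty-print collected stats via pprint.pprint
    >>> prefs()                    # live object references, for memory debugging
    >>> engine.pause()             # pause the crawl; engine.unpause() resumes it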
@@ -14,50 +14,57 @@ Author: dufferzafar

 import re

-# Used for remembering the file (and its contents)
-# so we don't have to open the same file again.
-_filename = None
-_contents = None
-
-# A regex that matches standard linkcheck output lines
-line_re = re.compile(u'(.*)\:\d+\:\s\[(.*)\]\s(?:(.*)\sto\s(.*)|(.*))')
-
-# Read lines from the linkcheck output file
-try:
-    with open("build/linkcheck/output.txt") as out:
-        output_lines = out.readlines()
-except IOError:
-    print("linkcheck output not found; please run linkcheck first.")
-    exit(1)
-
-# For every line, fix the respective file
-for line in output_lines:
-    match = re.match(line_re, line)
-
-    if match:
-        newfilename = match.group(1)
-        errortype = match.group(2)
-
-        # Broken links can't be fixed and
-        # I am not sure what do with the local ones.
-        if errortype.lower() in ["broken", "local"]:
-            print("Not Fixed: " + line)
-        else:
-            # If this is a new file
-            if newfilename != _filename:
-
-                # Update the previous file
-                if _filename:
-                    with open(_filename, "w") as _file:
-                        _file.write(_contents)
-
-                _filename = newfilename
-
-                # Read the new file to memory
-                with open(_filename) as _file:
-                    _contents = _file.read()
-
-            _contents = _contents.replace(match.group(3), match.group(4))
-    else:
-        # We don't understand what the current line means!
-        print("Not Understood: " + line)
+def main():
+
+    # Used for remembering the file (and its contents)
+    # so we don't have to open the same file again.
+    _filename = None
+    _contents = None
+
+    # A regex that matches standard linkcheck output lines
+    line_re = re.compile(u'(.*)\:\d+\:\s\[(.*)\]\s(?:(.*)\sto\s(.*)|(.*))')
+
+    # Read lines from the linkcheck output file
+    try:
+        with open("build/linkcheck/output.txt") as out:
+            output_lines = out.readlines()
+    except IOError:
+        print("linkcheck output not found; please run linkcheck first.")
+        exit(1)
+
+    # For every line, fix the respective file
+    for line in output_lines:
+        match = re.match(line_re, line)
+
+        if match:
+            newfilename = match.group(1)
+            errortype = match.group(2)
+
+            # Broken links can't be fixed and
+            # I am not sure what do with the local ones.
+            if errortype.lower() in ["broken", "local"]:
+                print("Not Fixed: " + line)
+            else:
+                # If this is a new file
+                if newfilename != _filename:
+
+                    # Update the previous file
+                    if _filename:
+                        with open(_filename, "w") as _file:
+                            _file.write(_contents)
+
+                    _filename = newfilename
+
+                    # Read the new file to memory
+                    with open(_filename) as _file:
+                        _contents = _file.read()
+
+                _contents = _contents.replace(match.group(3), match.group(4))
+        else:
+            # We don't understand what the current line means!
+            print("Not Understood: " + line)
+
+
+if __name__ == '__main__':
+    main()
113 pylintrc Normal file
@@ -0,0 +1,113 @@
+[MASTER]
+persistent=no
+jobs=1  # >1 hides results
+
+[MESSAGES CONTROL]
+disable=abstract-method,
+        anomalous-backslash-in-string,
+        arguments-differ,
+        attribute-defined-outside-init,
+        bad-classmethod-argument,
+        bad-continuation,
+        bad-indentation,
+        bad-mcs-classmethod-argument,
+        bad-super-call,
+        bad-whitespace,
+        bare-except,
+        blacklisted-name,
+        broad-except,
+        c-extension-no-member,
+        catching-non-exception,
+        cell-var-from-loop,
+        comparison-with-callable,
+        consider-iterating-dictionary,
+        consider-using-in,
+        consider-using-set-comprehension,
+        consider-using-sys-exit,
+        cyclic-import,
+        dangerous-default-value,
+        deprecated-method,
+        deprecated-module,
+        duplicate-code,  # https://github.com/PyCQA/pylint/issues/214
+        eval-used,
+        expression-not-assigned,
+        fixme,
+        function-redefined,
+        global-statement,
+        import-error,
+        import-outside-toplevel,
+        import-self,
+        inconsistent-return-statements,
+        inherit-non-class,
+        invalid-name,
+        invalid-overridden-method,
+        isinstance-second-argument-not-valid-type,
+        keyword-arg-before-vararg,
+        line-too-long,
+        logging-format-interpolation,
+        logging-not-lazy,
+        lost-exception,
+        method-hidden,
+        misplaced-comparison-constant,
+        missing-docstring,
+        missing-final-newline,
+        multiple-imports,
+        multiple-statements,
+        no-else-continue,
+        no-else-raise,
+        no-else-return,
+        no-init,
+        no-member,
+        no-method-argument,
+        no-name-in-module,
+        no-self-argument,
+        no-self-use,
+        no-value-for-parameter,
+        not-an-iterable,
+        not-callable,
+        pointless-statement,
+        pointless-string-statement,
+        protected-access,
+        redefined-argument-from-local,
+        redefined-builtin,
+        redefined-outer-name,
+        reimported,
+        signature-differs,
+        singleton-comparison,
+        super-init-not-called,
+        superfluous-parens,
+        too-few-public-methods,
+        too-many-ancestors,
+        too-many-arguments,
+        too-many-branches,
+        too-many-format-args,
+        too-many-function-args,
+        too-many-instance-attributes,
+        too-many-lines,
+        too-many-locals,
+        too-many-public-methods,
+        too-many-return-statements,
+        trailing-newlines,
+        trailing-whitespace,
+        unbalanced-tuple-unpacking,
+        undefined-variable,
+        undefined-loop-variable,
+        unexpected-special-method-signature,
+        ungrouped-imports,
+        unidiomatic-typecheck,
+        unnecessary-comprehension,
+        unnecessary-lambda,
+        unnecessary-pass,
+        unreachable,
+        unsubscriptable-object,
+        unused-argument,
+        unused-import,
+        unused-variable,
+        unused-wildcard-import,
+        used-before-assignment,
+        useless-object-inheritance,  # Required for Python 2 support
+        useless-return,
+        useless-super-delegation,
+        wildcard-import,
+        wrong-import-order,
+        wrong-import-position
233 pytest.ini
@@ -20,232 +20,23 @@ addopts =
 twisted = 1
 markers =
     only_asyncio: marks tests as only enabled when --reactor=asyncio is passed
+flake8-max-line-length = 119
 flake8-ignore =
     W503
-    # Files that are only meant to provide top-level imports are expected not
-    # to use any of their imports:
+    # Exclude files that are meant to provide top-level imports
+    # E402: Module level import not at top of file
+    # F401: Module imported but unused
+    scrapy/__init__.py E402
     scrapy/core/downloader/handlers/http.py F401
     scrapy/http/__init__.py F401
+    scrapy/linkextractors/__init__.py E402 F401
+    scrapy/selector/__init__.py F401
+    scrapy/spiders/__init__.py E402 F401
+
     # Issues pending a review:
-    # extras
-    extras/qps-bench-server.py E501
-    extras/qpsclient.py E501 E501
-    # scrapy/commands
-    scrapy/commands/__init__.py E128 E501
-    scrapy/commands/check.py E501
-    scrapy/commands/crawl.py E501
-    scrapy/commands/edit.py E501
-    scrapy/commands/fetch.py E401 E501 E128 E731
-    scrapy/commands/genspider.py E128 E501 E502
-    scrapy/commands/parse.py E128 E501 E731
-    scrapy/commands/runspider.py E501
-    scrapy/commands/settings.py E128
-    scrapy/commands/shell.py E128 E501 E502
-    scrapy/commands/startproject.py E127 E501 E128
-    scrapy/commands/version.py E501 E128
-    # scrapy/contracts
-    scrapy/contracts/__init__.py E501 W504
-    scrapy/contracts/default.py E128
-    # scrapy/core
-    scrapy/core/engine.py E501 E128 E127 E502
-    scrapy/core/scheduler.py E501
-    scrapy/core/scraper.py E501 E128 W504
-    scrapy/core/spidermw.py E501 E731 E126
-    scrapy/core/downloader/__init__.py E501
-    scrapy/core/downloader/contextfactory.py E501 E128 E126
-    scrapy/core/downloader/middleware.py E501 E502
-    scrapy/core/downloader/tls.py E501 E241
-    scrapy/core/downloader/webclient.py E731 E501 E128 E126
-    scrapy/core/downloader/handlers/__init__.py E501
-    scrapy/core/downloader/handlers/ftp.py E501 E128 E127
-    scrapy/core/downloader/handlers/http10.py E501
-    scrapy/core/downloader/handlers/http11.py E501
-    scrapy/core/downloader/handlers/s3.py E501 E128 E126
-    # scrapy/downloadermiddlewares
-    scrapy/downloadermiddlewares/ajaxcrawl.py E501
-    scrapy/downloadermiddlewares/decompression.py E501
-    scrapy/downloadermiddlewares/defaultheaders.py E501
-    scrapy/downloadermiddlewares/httpcache.py E501 E126
-    scrapy/downloadermiddlewares/httpcompression.py E501 E128
-    scrapy/downloadermiddlewares/httpproxy.py E501
-    scrapy/downloadermiddlewares/redirect.py E501 W504
-    scrapy/downloadermiddlewares/retry.py E501 E126
-    scrapy/downloadermiddlewares/robotstxt.py E501
-    scrapy/downloadermiddlewares/stats.py E501
-    # scrapy/extensions
-    scrapy/extensions/closespider.py E501 E128 E123
-    scrapy/extensions/corestats.py E501
-    scrapy/extensions/feedexport.py E128 E501
-    scrapy/extensions/httpcache.py E128 E501
-    scrapy/extensions/memdebug.py E501
-    scrapy/extensions/spiderstate.py E501
-    scrapy/extensions/telnet.py E501 W504
-    scrapy/extensions/throttle.py E501
-    # scrapy/http
-    scrapy/http/common.py E501
-    scrapy/http/cookies.py E501
-    scrapy/http/request/__init__.py E501
-    scrapy/http/request/form.py E501 E123
-    scrapy/http/request/json_request.py E501
-    scrapy/http/response/__init__.py E501 E128
-    scrapy/http/response/text.py E501 E128 E124
-    # scrapy/linkextractors
-    scrapy/linkextractors/__init__.py E731 E501 E402 W504
-    scrapy/linkextractors/lxmlhtml.py E501 E731
-    # scrapy/loader
-    scrapy/loader/__init__.py E501 E128
-    scrapy/loader/processors.py E501
-    # scrapy/pipelines
-    scrapy/pipelines/__init__.py E501
-    scrapy/pipelines/files.py E116 E501 E266
-    scrapy/pipelines/images.py E265 E501
-    scrapy/pipelines/media.py E125 E501 E266
-    # scrapy/selector
-    scrapy/selector/__init__.py F403
-    scrapy/selector/unified.py E501 E111
-    # scrapy/settings
-    scrapy/settings/__init__.py E501
-    scrapy/settings/default_settings.py E501 E114 E116
-    scrapy/settings/deprecated.py E501
-    # scrapy/spidermiddlewares
-    scrapy/spidermiddlewares/httperror.py E501
-    scrapy/spidermiddlewares/offsite.py E501
-    scrapy/spidermiddlewares/referer.py E501 E129 W504
-    scrapy/spidermiddlewares/urllength.py E501
-    # scrapy/spiders
-    scrapy/spiders/__init__.py E501 E402
-    scrapy/spiders/crawl.py E501
-    scrapy/spiders/feed.py E501
-    scrapy/spiders/sitemap.py E501
-    # scrapy/utils
-    scrapy/utils/asyncio.py E501
-    scrapy/utils/benchserver.py E501
-    scrapy/utils/conf.py E402 E501
-    scrapy/utils/datatypes.py E501
-    scrapy/utils/decorators.py E501
-    scrapy/utils/defer.py E501 E128
-    scrapy/utils/deprecate.py E128 E501 E127 E502
-    scrapy/utils/gz.py E501 W504
     scrapy/utils/http.py F403
-    scrapy/utils/httpobj.py E501
-    scrapy/utils/iterators.py E501
-    scrapy/utils/log.py E128 E501
     scrapy/utils/markup.py F403
-    scrapy/utils/misc.py E501
     scrapy/utils/multipart.py F403
-    scrapy/utils/project.py E501
+    scrapy/utils/url.py F403 F405
-    scrapy/utils/python.py E501
+    tests/test_loader.py E741
-    scrapy/utils/reactor.py E501
-    scrapy/utils/reqser.py E501
-    scrapy/utils/request.py E127 E501
-    scrapy/utils/response.py E501 E128
-    scrapy/utils/signal.py E501 E128
-    scrapy/utils/sitemap.py E501
-    scrapy/utils/spider.py E501
-    scrapy/utils/ssl.py E501
-    scrapy/utils/test.py E501
-    scrapy/utils/url.py E501 F403 E128 F405
-    # scrapy
-    scrapy/__init__.py E402 E501
-    scrapy/cmdline.py E501
-    scrapy/crawler.py E501
-    scrapy/dupefilters.py E501 E202
-    scrapy/exceptions.py E501
-    scrapy/exporters.py E501
-    scrapy/interfaces.py E501
-    scrapy/item.py E501 E128
-    scrapy/link.py E501
-    scrapy/logformatter.py E501
-    scrapy/mail.py E402 E128 E501 E502
-    scrapy/middleware.py E128 E501
-    scrapy/pqueues.py E501
-    scrapy/resolver.py E501
-    scrapy/responsetypes.py E128 E501
-    scrapy/robotstxt.py E501
-    scrapy/shell.py E501
-    scrapy/signalmanager.py E501
-    scrapy/spiderloader.py F841 E501 E126
-    scrapy/squeues.py E128
-    scrapy/statscollectors.py E501
-    # tests
-    tests/__init__.py E402 E501
-    tests/mockserver.py E401 E501 E126 E123
-    tests/pipelines.py F841
-    tests/spiders.py E501 E127
-    tests/test_closespider.py E501 E127
-    tests/test_command_fetch.py E501
-    tests/test_command_parse.py E501 E128
-    tests/test_command_shell.py E501 E128
-    tests/test_commands.py E128 E501
-    tests/test_contracts.py E501 E128
-    tests/test_crawl.py E501 E741 E265
-    tests/test_crawler.py F841 E501
-    tests/test_dependencies.py F841 E501
-    tests/test_downloader_handlers.py E124 E127 E128 E265 E501 E126 E123
-    tests/test_downloadermiddleware.py E501
-    tests/test_downloadermiddleware_ajaxcrawlable.py E501
-    tests/test_downloadermiddleware_cookies.py E731 E741 E501 E128 E265 E126
-    tests/test_downloadermiddleware_decompression.py E127
-    tests/test_downloadermiddleware_defaultheaders.py E501
-    tests/test_downloadermiddleware_downloadtimeout.py E501
-    tests/test_downloadermiddleware_httpcache.py E501
-    tests/test_downloadermiddleware_httpcompression.py E501 E126 E123
-    tests/test_downloadermiddleware_httpproxy.py E501 E128
-    tests/test_downloadermiddleware_redirect.py E501 E128 E127
-    tests/test_downloadermiddleware_retry.py E501 E128 E126
-    tests/test_downloadermiddleware_robotstxt.py E501
-    tests/test_downloadermiddleware_stats.py E501
-    tests/test_dupefilters.py E501 E741 E128 E124
-    tests/test_engine.py E401 E501 E128
-    tests/test_exporters.py E501 E731 E128 E124
-    tests/test_extension_telnet.py F841
-    tests/test_feedexport.py E501 F841 E241
-    tests/test_http_cookies.py E501
-    tests/test_http_headers.py E501
-    tests/test_http_request.py E402 E501 E127 E128 E128 E126 E123
-    tests/test_http_response.py E501 E128 E265
-    tests/test_item.py E128 F841
-    tests/test_link.py E501
-    tests/test_linkextractors.py E501 E128 E124
-    tests/test_loader.py E501 E731 E741 E128 E117 E241
-    tests/test_logformatter.py E128 E501 E122
-    tests/test_mail.py E128 E501
-    tests/test_middleware.py E501 E128
-    tests/test_pipeline_crawl.py E501 E128 E126
-    tests/test_pipeline_files.py E501
-    tests/test_pipeline_images.py F841 E501
-    tests/test_pipeline_media.py E501 E741 E731 E128 E502
-    tests/test_proxy_connect.py E501 E741
-    tests/test_request_cb_kwargs.py E501
-    tests/test_responsetypes.py E501
-    tests/test_robotstxt_interface.py E501 E501
-    tests/test_scheduler.py E501 E126 E123
-    tests/test_selector.py E501 E127
-    tests/test_spider.py E501
-    tests/test_spidermiddleware.py E501
-    tests/test_spidermiddleware_httperror.py E128 E501 E127 E121
-    tests/test_spidermiddleware_offsite.py E501 E128 E111
-    tests/test_spidermiddleware_output_chain.py E501
-    tests/test_spidermiddleware_referer.py E501 F841 E125 E201 E124 E501 E241 E121
-    tests/test_squeues.py E501 E741
-    tests/test_utils_asyncio.py E501
-    tests/test_utils_conf.py E501 E128
-    tests/test_utils_curl.py E501
-    tests/test_utils_datatypes.py E402 E501
-    tests/test_utils_defer.py E501 F841
-    tests/test_utils_deprecate.py F841 E501
-    tests/test_utils_http.py E501 E128 W504
-    tests/test_utils_iterators.py E501 E128 E129 E241
-    tests/test_utils_log.py E741
-    tests/test_utils_python.py E501 E731
-    tests/test_utils_reqser.py E501 E128
-    tests/test_utils_request.py E501 E128
-    tests/test_utils_response.py E501
-    tests/test_utils_signal.py E741 F841 E731
-    tests/test_utils_sitemap.py E128 E501 E124
-    tests/test_utils_url.py E501 E127 E125 E501 E241 E126 E123
-    tests/test_webclient.py E501 E128 E122 E402 E241 E123 E126
-    tests/test_cmdline/__init__.py E501
-    tests/test_settings/__init__.py E501 E128
-    tests/test_spiderloader/__init__.py E128 E501
-    tests/test_utils_misc/__init__.py E501
@@ -1 +1 @@
-2.0.0
+2.2.0
@@ -2,33 +2,11 @@
 Scrapy - a web crawling and web scraping framework written for Python
 """

-__all__ = ['__version__', 'version_info', 'twisted_version',
-           'Spider', 'Request', 'FormRequest', 'Selector', 'Item', 'Field']
-
-# Scrapy version
 import pkgutil
-__version__ = pkgutil.get_data(__package__, 'VERSION').decode('ascii').strip()
-version_info = tuple(int(v) if v.isdigit() else v
-                     for v in __version__.split('.'))
-del pkgutil
-
-# Check minimum required Python version
 import sys
-if sys.version_info < (3, 5):
-    print("Scrapy %s requires Python 3.5" % __version__)
-    sys.exit(1)
-
-# Ignore noisy twisted deprecation warnings
 import warnings
-warnings.filterwarnings('ignore', category=DeprecationWarning, module='twisted')
-del warnings
-
-# Apply monkey patches to fix issues in external libraries
-from scrapy import _monkeypatches
-del _monkeypatches
-
 from twisted import version as _txv
-twisted_version = (_txv.major, _txv.minor, _txv.micro)
-
 # Declare top-level shortcuts
 from scrapy.spiders import Spider
@@ -36,4 +14,29 @@ from scrapy.http import Request, FormRequest
 from scrapy.selector import Selector
 from scrapy.item import Item, Field

+
+__all__ = [
+    '__version__', 'version_info', 'twisted_version', 'Spider',
+    'Request', 'FormRequest', 'Selector', 'Item', 'Field',
+]
+
+
+# Scrapy and Twisted versions
+__version__ = pkgutil.get_data(__package__, 'VERSION').decode('ascii').strip()
+version_info = tuple(int(v) if v.isdigit() else v for v in __version__.split('.'))
+twisted_version = (_txv.major, _txv.minor, _txv.micro)
+
+
+# Check minimum required Python version
+if sys.version_info < (3, 5, 2):
+    print("Scrapy %s requires Python 3.5.2" % __version__)
+    sys.exit(1)
+
+
+# Ignore noisy twisted deprecation warnings
+warnings.filterwarnings('ignore', category=DeprecationWarning, module='twisted')
+
+
+del pkgutil
 del sys
+del warnings
@@ -1,11 +0,0 @@
-import copyreg
-
-
-# Undo what Twisted's perspective broker adds to pickle register
-# to prevent bugs like Twisted#7989 while serializing requests
-import twisted.persisted.styles  # NOQA
-# Remove only entries with twisted serializers for non-twisted types.
-for k, v in frozenset(copyreg.dispatch_table.items()):
-    if not str(getattr(k, '__module__', '')).startswith('twisted') \
-            and str(getattr(v, '__module__', '')).startswith('twisted'):
-        copyreg.dispatch_table.pop(k)
@@ -165,6 +165,7 @@ if __name__ == '__main__':
     try:
         execute()
     finally:
-        # Twisted prints errors in DebugInfo.__del__, but PyPy does not run gc.collect()
-        # on exit: http://doc.pypy.org/en/latest/cpython_differences.html?highlight=gc.collect#differences-related-to-garbage-collection-strategies
+        # Twisted prints errors in DebugInfo.__del__, but PyPy does not run gc.collect() on exit:
+        # http://doc.pypy.org/en/latest/cpython_differences.html
+        # ?highlight=gc.collect#differences-related-to-garbage-collection-strategies
         garbage_collect()
@@ -5,7 +5,7 @@ import os
 from optparse import OptionGroup
 from twisted.python import failure

-from scrapy.utils.conf import arglist_to_dict
+from scrapy.utils.conf import arglist_to_dict, feed_process_params_from_cli
 from scrapy.exceptions import UsageError


@@ -23,7 +23,8 @@ class ScrapyCommand:
         self.settings = None  # set in scrapy.cmdline

     def set_crawler(self, crawler):
-        assert not hasattr(self, '_crawler'), "crawler already set"
+        if hasattr(self, '_crawler'):
+            raise RuntimeError("crawler already set")
         self._crawler = crawler

     def syntax(self):
@@ -58,17 +59,17 @@ class ScrapyCommand:
         """
         group = OptionGroup(parser, "Global Options")
         group.add_option("--logfile", metavar="FILE",
                          help="log file. if omitted stderr will be used")
         group.add_option("-L", "--loglevel", metavar="LEVEL", default=None,
                          help="log level (default: %s)" % self.settings['LOG_LEVEL'])
         group.add_option("--nolog", action="store_true",
                          help="disable logging completely")
         group.add_option("--profile", metavar="FILE", default=None,
                          help="write python cProfile stats to FILE")
         group.add_option("--pidfile", metavar="FILE",
                          help="write process ID to FILE")
         group.add_option("-s", "--set", action="append", default=[], metavar="NAME=VALUE",
                          help="set/override setting (may be repeated)")
         group.add_option("--pdb", action="store_true", help="enable pdb on failure")

         parser.add_option_group(group)
@@ -103,3 +104,27 @@ class ScrapyCommand:
         Entry point for running commands
         """
         raise NotImplementedError
+
+
+class BaseRunSpiderCommand(ScrapyCommand):
+    """
+    Common class used to share functionality between the crawl and runspider commands
+    """
+    def add_options(self, parser):
+        ScrapyCommand.add_options(self, parser)
+        parser.add_option("-a", dest="spargs", action="append", default=[], metavar="NAME=VALUE",
+                          help="set spider argument (may be repeated)")
+        parser.add_option("-o", "--output", metavar="FILE", action="append",
+                          help="dump scraped items into FILE (use - for stdout)")
+        parser.add_option("-t", "--output-format", metavar="FORMAT",
+                          help="format to use for dumping items with -o")
+
+    def process_options(self, args, opts):
+        ScrapyCommand.process_options(self, args, opts)
+        try:
+            opts.spargs = arglist_to_dict(opts.spargs)
+        except ValueError:
+            raise UsageError("Invalid -a value, use -a NAME=VALUE", print_help=False)
+        if opts.output:
+            feeds = feed_process_params_from_cli(self.settings, opts.output, opts.output_format)
+            self.settings.set('FEEDS', feeds, priority='cmdline')
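The ``FEEDS`` value built here from ``-o``/``-t`` is the same setting you can configure
yourself. A rough, illustrative equivalent of ``scrapy crawl myspider -o items.json``
when driving a crawl from a script; the spider name and output path are placeholders::

    from scrapy.crawler import CrawlerProcess
    from scrapy.utils.project import get_project_settings

    settings = get_project_settings()
    # Roughly what the -o option arranges via the FEEDS setting.
    settings.set("FEEDS", {"items.json": {"format": "json"}}, priority="cmdline")

    process = CrawlerProcess(settings)
    process.crawl("myspider")  # placeholder spider name
    process.start()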
@@ -1,9 +1,8 @@
-from scrapy.commands import ScrapyCommand
-from scrapy.utils.conf import arglist_to_dict, feed_process_params_from_cli
+from scrapy.commands import BaseRunSpiderCommand
 from scrapy.exceptions import UsageError


-class Command(ScrapyCommand):
+class Command(BaseRunSpiderCommand):

     requires_project = True

@@ -13,25 +12,6 @@ class Command(ScrapyCommand):
     def short_desc(self):
         return "Run a spider"

-    def add_options(self, parser):
-        ScrapyCommand.add_options(self, parser)
-        parser.add_option("-a", dest="spargs", action="append", default=[], metavar="NAME=VALUE",
-                          help="set spider argument (may be repeated)")
-        parser.add_option("-o", "--output", metavar="FILE", action="append",
-                          help="dump scraped items into FILE (use - for stdout)")
-        parser.add_option("-t", "--output-format", metavar="FORMAT",
-                          help="format to use for dumping items with -o")
-
-    def process_options(self, args, opts):
-        ScrapyCommand.process_options(self, args, opts)
-        try:
-            opts.spargs = arglist_to_dict(opts.spargs)
-        except ValueError:
-            raise UsageError("Invalid -a value, use -a NAME=VALUE", print_help=False)
-        if opts.output:
-            feeds = feed_process_params_from_cli(self.settings, opts.output, opts.output_format)
-            self.settings.set('FEEDS', feeds, priority='cmdline')
-
     def run(self, args, opts):
         if len(args) < 1:
             raise UsageError()
@@ -27,8 +27,8 @@ class Command(ScrapyCommand):
         parser.add_option("--spider", dest="spider", help="use this spider")
         parser.add_option("--headers", dest="headers", action="store_true",
                           help="print response HTTP headers instead of body")
-        parser.add_option("--no-redirect", dest="no_redirect", action="store_true",
-            default=False, help="do not handle HTTP 3xx status codes and print response as-is")
+        parser.add_option("--no-redirect", dest="no_redirect", action="store_true", default=False,
+                          help="do not handle HTTP 3xx status codes and print response as-is")

     def _print_headers(self, headers, prefix):
         for key, values in headers.items():
@@ -49,8 +49,8 @@ class Command(ScrapyCommand):
     def run(self, args, opts):
         if len(args) != 1 or not is_url(args[0]):
             raise UsageError()
-        cb = lambda x: self._print_response(x, opts)
-        request = Request(args[0], callback=cb, dont_filter=True)
+        request = Request(args[0], callback=self._print_response,
+                          cb_kwargs={"opts": opts}, dont_filter=True)
        # by default, let the framework handle redirects,
        # i.e. command handles all codes expect 3xx
        if not opts.no_redirect:
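The same ``cb_kwargs`` mechanism used here is available to any spider; a small hedged
sketch, with the URL and argument names as placeholders::

    import scrapy


    class CbKwargsSpider(scrapy.Spider):  # illustrative spider
        name = "cbkwargs"

        def start_requests(self):
            yield scrapy.Request(
                "http://example.com/",
                callback=self.parse_page,
                cb_kwargs={"label": "homepage"},  # passed straight to the callback
            )

        def parse_page(self, response, label):
            self.logger.info("Parsed %s page: %s", label, response.url)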
@@ -36,15 +36,15 @@ class Command(ScrapyCommand):
     def add_options(self, parser):
         ScrapyCommand.add_options(self, parser)
         parser.add_option("-l", "--list", dest="list", action="store_true",
                           help="List available templates")
         parser.add_option("-e", "--edit", dest="edit", action="store_true",
                           help="Edit spider after creating it")
         parser.add_option("-d", "--dump", dest="dump", metavar="TEMPLATE",
                           help="Dump template to standard output")
         parser.add_option("-t", "--template", dest="template", default="basic",
                           help="Uses a custom template.")
         parser.add_option("--force", dest="force", action="store_true",
                           help="If the spider already exists, overwrite it with the template")

     def run(self, args, opts):
         if opts.list:
@@ -90,8 +90,7 @@ class Command(ScrapyCommand):
             'module': module,
             'name': name,
             'domain': domain,
-            'classname': '%sSpider' % ''.join(s.capitalize() \
-                for s in module.split('_'))
+            'classname': '%sSpider' % ''.join(s.capitalize() for s in module.split('_'))
         }
         if self.settings.get('NEWSPIDER_MODULE'):
             spiders_module = import_module(self.settings['NEWSPIDER_MODULE'])
@@ -102,8 +101,8 @@ class Command(ScrapyCommand):
         spider_file = "%s.py" % join(spiders_dir, module)
         shutil.copyfile(template_file, spider_file)
         render_templatefile(spider_file, **tvars)
-        print("Created spider %r using template %r " % (name, \
-            template_name), end=('' if spiders_module else '\n'))
+        print("Created spider %r using template %r "
+              % (name, template_name), end=('' if spiders_module else '\n'))
         if spiders_module:
             print("in module:\n  %s.%s" % (spiders_module.__name__, module))
@@ -1,11 +1,11 @@
 import json
 import logging

+from itemadapter import is_item, ItemAdapter
 from w3lib.url import is_url

 from scrapy.commands import ScrapyCommand
 from scrapy.http import Request
-from scrapy.item import BaseItem
 from scrapy.utils import display
 from scrapy.utils.conf import arglist_to_dict
 from scrapy.utils.spider import iterate_spider_output, spidercls_for_request
@@ -33,29 +33,29 @@ class Command(ScrapyCommand):
     def add_options(self, parser):
         ScrapyCommand.add_options(self, parser)
         parser.add_option("--spider", dest="spider", default=None,
                           help="use this spider without looking for one")
         parser.add_option("-a", dest="spargs", action="append", default=[], metavar="NAME=VALUE",
                           help="set spider argument (may be repeated)")
         parser.add_option("--pipelines", action="store_true",
                           help="process items through pipelines")
         parser.add_option("--nolinks", dest="nolinks", action="store_true",
                           help="don't show links to follow (extracted requests)")
         parser.add_option("--noitems", dest="noitems", action="store_true",
                           help="don't show scraped items")
         parser.add_option("--nocolour", dest="nocolour", action="store_true",
                           help="avoid using pygments to colorize the output")
         parser.add_option("-r", "--rules", dest="rules", action="store_true",
                           help="use CrawlSpider rules to discover the callback")
         parser.add_option("-c", "--callback", dest="callback",
                           help="use this callback for parsing, instead looking for a callback")
         parser.add_option("-m", "--meta", dest="meta",
                           help="inject extra meta into the Request, it must be a valid raw json string")
         parser.add_option("--cbkwargs", dest="cbkwargs",
                           help="inject extra callback kwargs into the Request, it must be a valid raw json string")
         parser.add_option("-d", "--depth", dest="depth", type="int", default=1,
                           help="maximum depth for parsing requests [default: %default]")
         parser.add_option("-v", "--verbose", dest="verbose", action="store_true",
                           help="print each depth level one by one")

     @property
     def max_level(self):
@@ -81,7 +81,7 @@ class Command(ScrapyCommand):
         items = self.items.get(lvl, [])

         print("# Scraped Items  ", "-" * 60)
-        display.pprint([dict(x) for x in items], colorize=colour)
+        display.pprint([ItemAdapter(x).asdict() for x in items], colorize=colour)

     def print_requests(self, lvl=None, colour=True):
         if lvl is None:
@@ -117,7 +117,7 @@ class Command(ScrapyCommand):
         items, requests = [], []

         for x in iterate_spider_output(callback(response, **cb_kwargs)):
-            if isinstance(x, (BaseItem, dict)):
+            if is_item(x):
                 items.append(x)
             elif isinstance(x, Request):
                 requests.append(x)
@@ -146,9 +146,8 @@ class Command(ScrapyCommand):
         if not self.spidercls:
             logger.error('Unable to find spider for: %(url)s', {'url': url})

-        # Request requires callback argument as callable or None, not string
-        request = Request(url, None)
-        _start_requests = lambda s: [self.prepare_request(s, request, opts)]
+        def _start_requests(spider):
+            yield self.prepare_request(spider, Request(url), opts)
         self.spidercls.start_requests = _start_requests

     def start_parsing(self, url, opts):
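``is_item`` and ``ItemAdapter`` come from the external ``itemadapter`` package and accept
dicts, ``Item`` subclasses, dataclasses and ``attrs`` objects alike. A small illustrative
check, using a made-up dataclass item::

    from dataclasses import dataclass

    from itemadapter import ItemAdapter, is_item


    @dataclass
    class Book:  # illustrative item class
        title: str
        price: float


    item = Book(title="1984", price=9.99)
    assert is_item(item)
    print(ItemAdapter(item).asdict())   # {'title': '1984', 'price': 9.99}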
@ -3,9 +3,8 @@ import os
|
|||||||
from importlib import import_module
|
from importlib import import_module
|
||||||
|
|
||||||
from scrapy.utils.spider import iter_spider_classes
|
from scrapy.utils.spider import iter_spider_classes
|
||||||
from scrapy.commands import ScrapyCommand
|
|
||||||
from scrapy.exceptions import UsageError
|
from scrapy.exceptions import UsageError
|
||||||
from scrapy.utils.conf import arglist_to_dict, feed_process_params_from_cli
|
from scrapy.commands import BaseRunSpiderCommand
|
||||||
|
|
||||||
|
|
||||||
def _import_file(filepath):
|
def _import_file(filepath):
|
||||||
@ -24,7 +23,7 @@ def _import_file(filepath):
|
|||||||
return module
|
return module
|
||||||
|
|
||||||
|
|
||||||
class Command(ScrapyCommand):
|
class Command(BaseRunSpiderCommand):
|
||||||
|
|
||||||
requires_project = False
|
requires_project = False
|
||||||
default_settings = {'SPIDER_LOADER_WARN_ONLY': True}
|
default_settings = {'SPIDER_LOADER_WARN_ONLY': True}
|
||||||
@ -38,25 +37,6 @@ class Command(ScrapyCommand):
|
|||||||
def long_desc(self):
|
def long_desc(self):
|
||||||
return "Run the spider defined in the given file"
|
return "Run the spider defined in the given file"
|
||||||
|
|
||||||
def add_options(self, parser):
|
|
||||||
ScrapyCommand.add_options(self, parser)
|
|
||||||
parser.add_option("-a", dest="spargs", action="append", default=[], metavar="NAME=VALUE",
|
|
||||||
help="set spider argument (may be repeated)")
|
|
||||||
parser.add_option("-o", "--output", metavar="FILE", action="append",
|
|
||||||
help="dump scraped items into FILE (use - for stdout)")
|
|
||||||
parser.add_option("-t", "--output-format", metavar="FORMAT",
|
|
||||||
help="format to use for dumping items with -o")
|
|
||||||
|
|
||||||
def process_options(self, args, opts):
|
|
||||||
ScrapyCommand.process_options(self, args, opts)
|
|
||||||
try:
|
|
||||||
opts.spargs = arglist_to_dict(opts.spargs)
|
|
||||||
except ValueError:
|
|
||||||
raise UsageError("Invalid -a value, use -a NAME=VALUE", print_help=False)
|
|
||||||
if opts.output:
|
|
||||||
feeds = feed_process_params_from_cli(self.settings, opts.output, opts.output_format)
|
|
||||||
self.settings.set('FEEDS', feeds, priority='cmdline')
|
|
||||||
|
|
||||||
def run(self, args, opts):
|
def run(self, args, opts):
|
||||||
if len(args) != 1:
|
if len(args) != 1:
|
||||||
raise UsageError()
|
raise UsageError()
|
||||||
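For context on the options dropped here: they now live in BaseRunSpiderCommand, where repeated -o values are converted into the FEEDS setting through feed_process_params_from_cli. A rough sketch of the FEEDS dict that an invocation like `scrapy runspider spider.py -o items.json -o items.csv` is expected to produce (file names are illustrative, not taken from this patch):

# Equivalent project setting (sketch):
FEEDS = {
    "items.json": {"format": "json"},
    "items.csv": {"format": "csv"},
}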
|
@ -19,15 +19,15 @@ class Command(ScrapyCommand):
|
|||||||
def add_options(self, parser):
|
def add_options(self, parser):
|
||||||
ScrapyCommand.add_options(self, parser)
|
ScrapyCommand.add_options(self, parser)
|
||||||
parser.add_option("--get", dest="get", metavar="SETTING",
|
parser.add_option("--get", dest="get", metavar="SETTING",
|
||||||
help="print raw setting value")
|
help="print raw setting value")
|
||||||
parser.add_option("--getbool", dest="getbool", metavar="SETTING",
|
parser.add_option("--getbool", dest="getbool", metavar="SETTING",
|
||||||
help="print setting value, interpreted as a boolean")
|
help="print setting value, interpreted as a boolean")
|
||||||
parser.add_option("--getint", dest="getint", metavar="SETTING",
|
parser.add_option("--getint", dest="getint", metavar="SETTING",
|
||||||
help="print setting value, interpreted as an integer")
|
help="print setting value, interpreted as an integer")
|
||||||
parser.add_option("--getfloat", dest="getfloat", metavar="SETTING",
|
parser.add_option("--getfloat", dest="getfloat", metavar="SETTING",
|
||||||
help="print setting value, interpreted as a float")
|
help="print setting value, interpreted as a float")
|
||||||
parser.add_option("--getlist", dest="getlist", metavar="SETTING",
|
parser.add_option("--getlist", dest="getlist", metavar="SETTING",
|
||||||
help="print setting value, interpreted as a list")
|
help="print setting value, interpreted as a list")
|
||||||
|
|
||||||
def run(self, args, opts):
|
def run(self, args, opts):
|
||||||
settings = self.crawler_process.settings
|
settings = self.crawler_process.settings
|
||||||
|
@ -34,11 +34,11 @@ class Command(ScrapyCommand):
|
|||||||
def add_options(self, parser):
|
def add_options(self, parser):
|
||||||
ScrapyCommand.add_options(self, parser)
|
ScrapyCommand.add_options(self, parser)
|
||||||
parser.add_option("-c", dest="code",
|
parser.add_option("-c", dest="code",
|
||||||
help="evaluate the code in the shell, print the result and exit")
|
help="evaluate the code in the shell, print the result and exit")
|
||||||
parser.add_option("--spider", dest="spider",
|
parser.add_option("--spider", dest="spider",
|
||||||
help="use this spider")
|
help="use this spider")
|
||||||
parser.add_option("--no-redirect", dest="no_redirect", action="store_true", \
|
parser.add_option("--no-redirect", dest="no_redirect", action="store_true", default=False,
|
||||||
default=False, help="do not handle HTTP 3xx status codes and print response as-is")
|
help="do not handle HTTP 3xx status codes and print response as-is")
|
||||||
|
|
||||||
def update_vars(self, vars):
|
def update_vars(self, vars):
|
||||||
"""You can use this function to update the Scrapy objects that will be
|
"""You can use this function to update the Scrapy objects that will be
|
||||||
|
@ -1,5 +1,6 @@
|
|||||||
import re
|
import re
|
||||||
import os
|
import os
|
||||||
|
import stat
|
||||||
import string
|
import string
|
||||||
from importlib import import_module
|
from importlib import import_module
|
||||||
from os.path import join, exists, abspath
|
from os.path import join, exists, abspath
|
||||||
@ -78,6 +79,29 @@ class Command(ScrapyCommand):
|
|||||||
else:
|
else:
|
||||||
copy2(srcname, dstname)
|
copy2(srcname, dstname)
|
||||||
copystat(src, dst)
|
copystat(src, dst)
|
||||||
|
self._set_rw_permissions(dst)
|
||||||
|
|
||||||
|
def _set_rw_permissions(self, path):
|
||||||
|
"""
|
||||||
|
Sets permissions of a directory tree to +rw and +rwx for folders.
|
||||||
|
This is necessary if the start template files come without write
|
||||||
|
permissions.
|
||||||
|
"""
|
||||||
|
mode_rw = (stat.S_IRUSR
|
||||||
|
| stat.S_IWUSR
|
||||||
|
| stat.S_IRGRP
|
||||||
|
| stat.S_IROTH)
|
||||||
|
|
||||||
|
mode_x = (stat.S_IXUSR
|
||||||
|
| stat.S_IXGRP
|
||||||
|
| stat.S_IXOTH)
|
||||||
|
|
||||||
|
os.chmod(path, mode_rw | mode_x)
|
||||||
|
for root, dirs, files in os.walk(path):
|
||||||
|
for dir in dirs:
|
||||||
|
os.chmod(join(root, dir), mode_rw | mode_x)
|
||||||
|
for file in files:
|
||||||
|
os.chmod(join(root, file), mode_rw)
|
||||||
|
|
||||||
def run(self, args, opts):
|
def run(self, args, opts):
|
||||||
if len(args) not in (1, 2):
|
if len(args) not in (1, 2):
|
||||||
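As a quick sanity check of the mode constants used by the new _set_rw_permissions helper above, the two masks combine into the familiar 644/755 modes (plain Python, independent of Scrapy):

import stat

mode_rw = stat.S_IRUSR | stat.S_IWUSR | stat.S_IRGRP | stat.S_IROTH
mode_x = stat.S_IXUSR | stat.S_IXGRP | stat.S_IXOTH

print(oct(mode_rw))           # 0o644 -> rw-r--r-- for files
print(oct(mode_rw | mode_x))  # 0o755 -> rwxr-xr-x for directories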
@ -102,10 +126,8 @@ class Command(ScrapyCommand):
|
|||||||
move(join(project_dir, 'module'), join(project_dir, project_name))
|
move(join(project_dir, 'module'), join(project_dir, project_name))
|
||||||
for paths in TEMPLATES_TO_RENDER:
|
for paths in TEMPLATES_TO_RENDER:
|
||||||
path = join(*paths)
|
path = join(*paths)
|
||||||
tplfile = join(project_dir,
|
tplfile = join(project_dir, string.Template(path).substitute(project_name=project_name))
|
||||||
string.Template(path).substitute(project_name=project_name))
|
render_templatefile(tplfile, project_name=project_name, ProjectName=string_camelcase(project_name))
|
||||||
render_templatefile(tplfile, project_name=project_name,
|
|
||||||
ProjectName=string_camelcase(project_name))
|
|
||||||
print("New Scrapy project '%s', using template directory '%s', "
|
print("New Scrapy project '%s', using template directory '%s', "
|
||||||
"created in:" % (project_name, self.templates_dir))
|
"created in:" % (project_name, self.templates_dir))
|
||||||
print(" %s\n" % abspath(project_dir))
|
print(" %s\n" % abspath(project_dir))
|
||||||
|
@ -17,7 +17,7 @@ class Command(ScrapyCommand):
|
|||||||
def add_options(self, parser):
|
def add_options(self, parser):
|
||||||
ScrapyCommand.add_options(self, parser)
|
ScrapyCommand.add_options(self, parser)
|
||||||
parser.add_option("--verbose", "-v", dest="verbose", action="store_true",
|
parser.add_option("--verbose", "-v", dest="verbose", action="store_true",
|
||||||
help="also display twisted/python/platform info (useful for bug reports)")
|
help="also display twisted/python/platform info (useful for bug reports)")
|
||||||
|
|
||||||
def run(self, args, opts):
|
def run(self, args, opts):
|
||||||
if opts.verbose:
|
if opts.verbose:
|
||||||
|
@ -17,10 +17,10 @@ class ContractsManager:
|
|||||||
self.contracts[contract.name] = contract
|
self.contracts[contract.name] = contract
|
||||||
|
|
||||||
def tested_methods_from_spidercls(self, spidercls):
|
def tested_methods_from_spidercls(self, spidercls):
|
||||||
|
is_method = re.compile(r"^\s*@", re.MULTILINE).search
|
||||||
methods = []
|
methods = []
|
||||||
for key, value in getmembers(spidercls):
|
for key, value in getmembers(spidercls):
|
||||||
if (callable(value) and value.__doc__ and
|
if callable(value) and value.__doc__ and is_method(value.__doc__):
|
||||||
re.search(r'^\s*@', value.__doc__, re.MULTILINE)):
|
|
||||||
methods.append(key)
|
methods.append(key)
|
||||||
|
|
||||||
return methods
|
return methods
|
||||||
|
@ -1,10 +1,10 @@
|
|||||||
import json
|
import json
|
||||||
|
|
||||||
from scrapy.item import BaseItem
|
from itemadapter import is_item, ItemAdapter
|
||||||
from scrapy.http import Request
|
|
||||||
from scrapy.exceptions import ContractFail
|
|
||||||
|
|
||||||
from scrapy.contracts import Contract
|
from scrapy.contracts import Contract
|
||||||
|
from scrapy.exceptions import ContractFail
|
||||||
|
from scrapy.http import Request
|
||||||
|
|
||||||
|
|
||||||
# contracts
|
# contracts
|
||||||
@ -48,19 +48,23 @@ class ReturnsContract(Contract):
|
|||||||
"""
|
"""
|
||||||
|
|
||||||
name = 'returns'
|
name = 'returns'
|
||||||
objects = {
|
object_type_verifiers = {
|
||||||
'request': Request,
|
'request': lambda x: isinstance(x, Request),
|
||||||
'requests': Request,
|
'requests': lambda x: isinstance(x, Request),
|
||||||
'item': (BaseItem, dict),
|
'item': is_item,
|
||||||
'items': (BaseItem, dict),
|
'items': is_item,
|
||||||
}
|
}
|
||||||
|
|
||||||
def __init__(self, *args, **kwargs):
|
def __init__(self, *args, **kwargs):
|
||||||
super(ReturnsContract, self).__init__(*args, **kwargs)
|
super(ReturnsContract, self).__init__(*args, **kwargs)
|
||||||
|
|
||||||
assert len(self.args) in [1, 2, 3]
|
if len(self.args) not in [1, 2, 3]:
|
||||||
|
raise ValueError(
|
||||||
|
"Incorrect argument quantity: expected 1, 2 or 3, got %i"
|
||||||
|
% len(self.args)
|
||||||
|
)
|
||||||
self.obj_name = self.args[0] or None
|
self.obj_name = self.args[0] or None
|
||||||
self.obj_type = self.objects[self.obj_name]
|
self.obj_type_verifier = self.object_type_verifiers[self.obj_name]
|
||||||
|
|
||||||
try:
|
try:
|
||||||
self.min_bound = int(self.args[1])
|
self.min_bound = int(self.args[1])
|
||||||
@ -75,7 +79,7 @@ class ReturnsContract(Contract):
|
|||||||
def post_process(self, output):
|
def post_process(self, output):
|
||||||
occurrences = 0
|
occurrences = 0
|
||||||
for x in output:
|
for x in output:
|
||||||
if isinstance(x, self.obj_type):
|
if self.obj_type_verifier(x):
|
||||||
occurrences += 1
|
occurrences += 1
|
||||||
|
|
||||||
assertion = (self.min_bound <= occurrences <= self.max_bound)
|
assertion = (self.min_bound <= occurrences <= self.max_bound)
|
||||||
@ -99,8 +103,8 @@ class ScrapesContract(Contract):
|
|||||||
|
|
||||||
def post_process(self, output):
|
def post_process(self, output):
|
||||||
for x in output:
|
for x in output:
|
||||||
if isinstance(x, (BaseItem, dict)):
|
if is_item(x):
|
||||||
missing = [arg for arg in self.args if arg not in x]
|
missing = [arg for arg in self.args if arg not in ItemAdapter(x)]
|
||||||
if missing:
|
if missing:
|
||||||
raise ContractFail(
|
missing_str = ", ".join(missing)
|
||||||
"Missing fields: %s" % ", ".join(missing))
|
raise ContractFail("Missing fields: %s" % missing_str)
|
||||||
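The contract changes above replace isinstance checks against BaseItem/dict with the itemadapter helpers. A minimal sketch of what is_item and ItemAdapter provide (assuming the itemadapter package these imports come from is installed; the Product class is illustrative):

import scrapy
from itemadapter import ItemAdapter, is_item

class Product(scrapy.Item):
    name = scrapy.Field()
    price = scrapy.Field()

# is_item() accepts dicts and scrapy.Item objects (also attrs/dataclass items)
assert is_item({"name": "Widget"})
assert is_item(Product(name="Widget", price=10))
assert not is_item("not an item")

# ItemAdapter is the dict-like view used by ScrapesContract's
# "arg not in ItemAdapter(x)" check above
adapter = ItemAdapter(Product(name="Widget", price=10))
assert "name" in adapter and adapter["price"] == 10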
|
@ -173,7 +173,7 @@ class Downloader:
|
|||||||
return response
|
return response
|
||||||
dfd.addCallback(_downloaded)
|
dfd.addCallback(_downloaded)
|
||||||
|
|
||||||
# 3. After response arrives, remove the request from transferring
|
# 3. After response arrives, remove the request from transferring
|
||||||
# state to free up the transferring slot so it can be used by the
|
# state to free up the transferring slot so it can be used by the
|
||||||
# following requests (perhaps those which came from the downloader
|
# following requests (perhaps those which came from the downloader
|
||||||
# middleware itself)
|
# middleware itself)
|
||||||
|
@ -46,11 +46,12 @@ class ScrapyClientContextFactory(BrowserLikePolicyForHTTPS):
|
|||||||
#
|
#
|
||||||
# * getattr() for `_ssl_method` attribute for context factories
|
# * getattr() for `_ssl_method` attribute for context factories
|
||||||
# not calling super(..., self).__init__
|
# not calling super(..., self).__init__
|
||||||
return CertificateOptions(verify=False,
|
return CertificateOptions(
|
||||||
method=getattr(self, 'method',
|
verify=False,
|
||||||
getattr(self, '_ssl_method', None)),
|
method=getattr(self, 'method', getattr(self, '_ssl_method', None)),
|
||||||
fixBrokenPeers=True,
|
fixBrokenPeers=True,
|
||||||
acceptableCiphers=self.tls_ciphers)
|
acceptableCiphers=self.tls_ciphers,
|
||||||
|
)
|
||||||
|
|
||||||
# kept for old-style HTTP/1.0 downloader context twisted calls,
|
# kept for old-style HTTP/1.0 downloader context twisted calls,
|
||||||
# e.g. connectSSL()
|
# e.g. connectSSL()
|
||||||
@ -86,8 +87,8 @@ class BrowserLikeContextFactory(ScrapyClientContextFactory):
|
|||||||
#
|
#
|
||||||
# This means that a website like https://www.cacert.org will be rejected
|
# This means that a website like https://www.cacert.org will be rejected
|
||||||
# by default, since CAcert.org CA certificate is seldom shipped.
|
# by default, since CAcert.org CA certificate is seldom shipped.
|
||||||
return optionsForClientTLS(hostname.decode("ascii"),
|
return optionsForClientTLS(
|
||||||
trustRoot=platformTrust(),
|
hostname=hostname.decode("ascii"),
|
||||||
extraCertificateOptions={
|
trustRoot=platformTrust(),
|
||||||
'method': self._ssl_method,
|
extraCertificateOptions={'method': self._ssl_method},
|
||||||
})
|
)
|
||||||
|
@ -86,19 +86,19 @@ class FTPDownloadHandler:
|
|||||||
password = request.meta.get("ftp_password", self.default_password)
|
password = request.meta.get("ftp_password", self.default_password)
|
||||||
passive_mode = 1 if bool(request.meta.get("ftp_passive",
|
passive_mode = 1 if bool(request.meta.get("ftp_passive",
|
||||||
self.passive_mode)) else 0
|
self.passive_mode)) else 0
|
||||||
creator = ClientCreator(reactor, FTPClient, user, password,
|
creator = ClientCreator(reactor, FTPClient, user, password, passive=passive_mode)
|
||||||
passive=passive_mode)
|
dfd = creator.connectTCP(parsed_url.hostname, parsed_url.port or 21)
|
||||||
return creator.connectTCP(parsed_url.hostname, parsed_url.port or 21).addCallback(self.gotClient,
|
return dfd.addCallback(self.gotClient, request, unquote(parsed_url.path))
|
||||||
request, unquote(parsed_url.path))
|
|
||||||
|
|
||||||
def gotClient(self, client, request, filepath):
|
def gotClient(self, client, request, filepath):
|
||||||
self.client = client
|
self.client = client
|
||||||
protocol = ReceivedDataProtocol(request.meta.get("ftp_local_filename"))
|
protocol = ReceivedDataProtocol(request.meta.get("ftp_local_filename"))
|
||||||
return client.retrieveFile(filepath, protocol)\
|
return client.retrieveFile(filepath, protocol).addCallbacks(
|
||||||
.addCallbacks(callback=self._build_response,
|
callback=self._build_response,
|
||||||
callbackArgs=(request, protocol),
|
callbackArgs=(request, protocol),
|
||||||
errback=self._failed,
|
errback=self._failed,
|
||||||
errbackArgs=(request,))
|
errbackArgs=(request,),
|
||||||
|
)
|
||||||
|
|
||||||
def _build_response(self, result, request, protocol):
|
def _build_response(self, result, request, protocol):
|
||||||
self.result = result
|
self.result = result
|
||||||
|
@ -1,5 +1,6 @@
|
|||||||
"""Download handlers for http and https schemes"""
|
"""Download handlers for http and https schemes"""
|
||||||
|
|
||||||
|
import ipaddress
|
||||||
import logging
|
import logging
|
||||||
import re
|
import re
|
||||||
import warnings
|
import warnings
|
||||||
@ -11,15 +12,17 @@ from urllib.parse import urldefrag
|
|||||||
from twisted.internet import defer, protocol, ssl
|
from twisted.internet import defer, protocol, ssl
|
||||||
from twisted.internet.endpoints import TCP4ClientEndpoint
|
from twisted.internet.endpoints import TCP4ClientEndpoint
|
||||||
from twisted.internet.error import TimeoutError
|
from twisted.internet.error import TimeoutError
|
||||||
|
from twisted.python.failure import Failure
|
||||||
from twisted.web.client import Agent, HTTPConnectionPool, ResponseDone, ResponseFailed, URI
|
from twisted.web.client import Agent, HTTPConnectionPool, ResponseDone, ResponseFailed, URI
|
||||||
from twisted.web.http import _DataLoss, PotentialDataLoss
|
from twisted.web.http import _DataLoss, PotentialDataLoss
|
||||||
from twisted.web.http_headers import Headers as TxHeaders
|
from twisted.web.http_headers import Headers as TxHeaders
|
||||||
from twisted.web.iweb import IBodyProducer, UNKNOWN_LENGTH
|
from twisted.web.iweb import IBodyProducer, UNKNOWN_LENGTH
|
||||||
from zope.interface import implementer
|
from zope.interface import implementer
|
||||||
|
|
||||||
|
from scrapy import signals
|
||||||
from scrapy.core.downloader.tls import openssl_methods
|
from scrapy.core.downloader.tls import openssl_methods
|
||||||
from scrapy.core.downloader.webclient import _parse
|
from scrapy.core.downloader.webclient import _parse
|
||||||
from scrapy.exceptions import ScrapyDeprecationWarning
|
from scrapy.exceptions import ScrapyDeprecationWarning, StopDownload
|
||||||
from scrapy.http import Headers
|
from scrapy.http import Headers
|
||||||
from scrapy.responsetypes import responsetypes
|
from scrapy.responsetypes import responsetypes
|
||||||
from scrapy.utils.misc import create_instance, load_object
|
from scrapy.utils.misc import create_instance, load_object
|
||||||
@ -33,6 +36,8 @@ class HTTP11DownloadHandler:
|
|||||||
lazy = False
|
lazy = False
|
||||||
|
|
||||||
def __init__(self, settings, crawler=None):
|
def __init__(self, settings, crawler=None):
|
||||||
|
self._crawler = crawler
|
||||||
|
|
||||||
from twisted.internet import reactor
|
from twisted.internet import reactor
|
||||||
self._pool = HTTPConnectionPool(reactor, persistent=True)
|
self._pool = HTTPConnectionPool(reactor, persistent=True)
|
||||||
self._pool.maxPersistentPerHost = settings.getint('CONCURRENT_REQUESTS_PER_DOMAIN')
|
self._pool.maxPersistentPerHost = settings.getint('CONCURRENT_REQUESTS_PER_DOMAIN')
|
||||||
@ -78,6 +83,7 @@ class HTTP11DownloadHandler:
|
|||||||
maxsize=getattr(spider, 'download_maxsize', self._default_maxsize),
|
maxsize=getattr(spider, 'download_maxsize', self._default_maxsize),
|
||||||
warnsize=getattr(spider, 'download_warnsize', self._default_warnsize),
|
warnsize=getattr(spider, 'download_warnsize', self._default_warnsize),
|
||||||
fail_on_dataloss=self._fail_on_dataloss,
|
fail_on_dataloss=self._fail_on_dataloss,
|
||||||
|
crawler=self._crawler,
|
||||||
)
|
)
|
||||||
return agent.download_request(request)
|
return agent.download_request(request)
|
||||||
|
|
||||||
@ -275,7 +281,7 @@ class ScrapyAgent:
|
|||||||
_TunnelingAgent = TunnelingAgent
|
_TunnelingAgent = TunnelingAgent
|
||||||
|
|
||||||
def __init__(self, contextFactory=None, connectTimeout=10, bindAddress=None, pool=None,
|
def __init__(self, contextFactory=None, connectTimeout=10, bindAddress=None, pool=None,
|
||||||
maxsize=0, warnsize=0, fail_on_dataloss=True):
|
maxsize=0, warnsize=0, fail_on_dataloss=True, crawler=None):
|
||||||
self._contextFactory = contextFactory
|
self._contextFactory = contextFactory
|
||||||
self._connectTimeout = connectTimeout
|
self._connectTimeout = connectTimeout
|
||||||
self._bindAddress = bindAddress
|
self._bindAddress = bindAddress
|
||||||
@ -284,6 +290,7 @@ class ScrapyAgent:
|
|||||||
self._warnsize = warnsize
|
self._warnsize = warnsize
|
||||||
self._fail_on_dataloss = fail_on_dataloss
|
self._fail_on_dataloss = fail_on_dataloss
|
||||||
self._txresponse = None
|
self._txresponse = None
|
||||||
|
self._crawler = crawler
|
||||||
|
|
||||||
def _get_agent(self, request, timeout):
|
def _get_agent(self, request, timeout):
|
||||||
from twisted.internet import reactor
|
from twisted.internet import reactor
|
||||||
@ -341,20 +348,6 @@ class ScrapyAgent:
|
|||||||
headers.removeHeader(b'Proxy-Authorization')
|
headers.removeHeader(b'Proxy-Authorization')
|
||||||
if request.body:
|
if request.body:
|
||||||
bodyproducer = _RequestBodyProducer(request.body)
|
bodyproducer = _RequestBodyProducer(request.body)
|
||||||
elif method == b'POST':
|
|
||||||
# Setting Content-Length: 0 even for POST requests is not a
|
|
||||||
# MUST per HTTP RFCs, but it's common behavior, and some
|
|
||||||
# servers require this, otherwise returning HTTP 411 Length required
|
|
||||||
#
|
|
||||||
# RFC 7230#section-3.3.2:
|
|
||||||
# "a Content-Length header field is normally sent in a POST
|
|
||||||
# request even when the value is 0 (indicating an empty payload body)."
|
|
||||||
#
|
|
||||||
# Twisted < 17 will not add "Content-Length: 0" by itself;
|
|
||||||
# Twisted >= 17 fixes this;
|
|
||||||
# Using a producer with an empty-string sends `0` as Content-Length
|
|
||||||
# for all versions of Twisted.
|
|
||||||
bodyproducer = _RequestBodyProducer(b'')
|
|
||||||
else:
|
else:
|
||||||
bodyproducer = None
|
bodyproducer = None
|
||||||
start_time = time()
|
start_time = time()
|
||||||
@ -387,7 +380,13 @@ class ScrapyAgent:
|
|||||||
def _cb_bodyready(self, txresponse, request):
|
def _cb_bodyready(self, txresponse, request):
|
||||||
# deliverBody hangs for responses without body
|
# deliverBody hangs for responses without body
|
||||||
if txresponse.length == 0:
|
if txresponse.length == 0:
|
||||||
return txresponse, b'', None, None
|
return {
|
||||||
|
"txresponse": txresponse,
|
||||||
|
"body": b"",
|
||||||
|
"flags": None,
|
||||||
|
"certificate": None,
|
||||||
|
"ip_address": None,
|
||||||
|
}
|
||||||
|
|
||||||
maxsize = request.meta.get('download_maxsize', self._maxsize)
|
maxsize = request.meta.get('download_maxsize', self._maxsize)
|
||||||
warnsize = request.meta.get('download_warnsize', self._warnsize)
|
warnsize = request.meta.get('download_warnsize', self._warnsize)
|
||||||
@ -414,7 +413,15 @@ class ScrapyAgent:
|
|||||||
|
|
||||||
d = defer.Deferred(_cancel)
|
d = defer.Deferred(_cancel)
|
||||||
txresponse.deliverBody(
|
txresponse.deliverBody(
|
||||||
_ResponseReader(d, txresponse, request, maxsize, warnsize, fail_on_dataloss)
|
_ResponseReader(
|
||||||
|
finished=d,
|
||||||
|
txresponse=txresponse,
|
||||||
|
request=request,
|
||||||
|
maxsize=maxsize,
|
||||||
|
warnsize=warnsize,
|
||||||
|
fail_on_dataloss=fail_on_dataloss,
|
||||||
|
crawler=self._crawler,
|
||||||
|
)
|
||||||
)
|
)
|
||||||
|
|
||||||
# save response for timeouts
|
# save response for timeouts
|
||||||
@ -423,12 +430,21 @@ class ScrapyAgent:
|
|||||||
return d
|
return d
|
||||||
|
|
||||||
def _cb_bodydone(self, result, request, url):
|
def _cb_bodydone(self, result, request, url):
|
||||||
txresponse, body, flags, certificate = result
|
headers = Headers(result["txresponse"].headers.getAllRawHeaders())
|
||||||
status = int(txresponse.code)
|
respcls = responsetypes.from_args(headers=headers, url=url, body=result["body"])
|
||||||
headers = Headers(txresponse.headers.getAllRawHeaders())
|
response = respcls(
|
||||||
respcls = responsetypes.from_args(headers=headers, url=url, body=body)
|
url=url,
|
||||||
return respcls(url=url, status=status, headers=headers, body=body,
|
status=int(result["txresponse"].code),
|
||||||
flags=flags, certificate=certificate)
|
headers=headers,
|
||||||
|
body=result["body"],
|
||||||
|
flags=result["flags"],
|
||||||
|
certificate=result["certificate"],
|
||||||
|
ip_address=result["ip_address"],
|
||||||
|
)
|
||||||
|
if result.get("failure"):
|
||||||
|
result["failure"].value.response = response
|
||||||
|
return result["failure"]
|
||||||
|
return response
|
||||||
|
|
||||||
|
|
||||||
@implementer(IBodyProducer)
|
@implementer(IBodyProducer)
|
||||||
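With the dict-based result above, the handler now passes the peer certificate and IP address through to the Response object. A small sketch of reading them from a callback (spider and URL are illustrative; both attributes may be None, e.g. for non-TLS or locally generated responses):

import scrapy

class PeerInfoSpider(scrapy.Spider):
    name = "peer_info"
    start_urls = ["https://example.com"]

    def parse(self, response):
        # response.ip_address is an ipaddress.IPv4Address/IPv6Address,
        # response.certificate a twisted ssl.Certificate (or None)
        yield {
            "url": response.url,
            "ip": str(response.ip_address) if response.ip_address else None,
            "has_tls_certificate": response.certificate is not None,
        }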
@ -451,7 +467,7 @@ class _RequestBodyProducer:
|
|||||||
|
|
||||||
class _ResponseReader(protocol.Protocol):
|
class _ResponseReader(protocol.Protocol):
|
||||||
|
|
||||||
def __init__(self, finished, txresponse, request, maxsize, warnsize, fail_on_dataloss):
|
def __init__(self, finished, txresponse, request, maxsize, warnsize, fail_on_dataloss, crawler):
|
||||||
self._finished = finished
|
self._finished = finished
|
||||||
self._txresponse = txresponse
|
self._txresponse = txresponse
|
||||||
self._request = request
|
self._request = request
|
||||||
@ -463,12 +479,27 @@ class _ResponseReader(protocol.Protocol):
|
|||||||
self._reached_warnsize = False
|
self._reached_warnsize = False
|
||||||
self._bytes_received = 0
|
self._bytes_received = 0
|
||||||
self._certificate = None
|
self._certificate = None
|
||||||
|
self._ip_address = None
|
||||||
|
self._crawler = crawler
|
||||||
|
|
||||||
|
def _finish_response(self, flags=None, failure=None):
|
||||||
|
self._finished.callback({
|
||||||
|
"txresponse": self._txresponse,
|
||||||
|
"body": self._bodybuf.getvalue(),
|
||||||
|
"flags": flags,
|
||||||
|
"certificate": self._certificate,
|
||||||
|
"ip_address": self._ip_address,
|
||||||
|
"failure": failure,
|
||||||
|
})
|
||||||
|
|
||||||
def connectionMade(self):
|
def connectionMade(self):
|
||||||
if self._certificate is None:
|
if self._certificate is None:
|
||||||
with suppress(AttributeError):
|
with suppress(AttributeError):
|
||||||
self._certificate = ssl.Certificate(self.transport._producer.getPeerCertificate())
|
self._certificate = ssl.Certificate(self.transport._producer.getPeerCertificate())
|
||||||
|
|
||||||
|
if self._ip_address is None:
|
||||||
|
self._ip_address = ipaddress.ip_address(self.transport._producer.getPeer().host)
|
||||||
|
|
||||||
def dataReceived(self, bodyBytes):
|
def dataReceived(self, bodyBytes):
|
||||||
# This may be called several times after cancel was called with buffered data.
|
# This may be called several times after cancel was called with buffered data.

|
||||||
if self._finished.called:
|
if self._finished.called:
|
||||||
@ -477,6 +508,20 @@ class _ResponseReader(protocol.Protocol):
|
|||||||
self._bodybuf.write(bodyBytes)
|
self._bodybuf.write(bodyBytes)
|
||||||
self._bytes_received += len(bodyBytes)
|
self._bytes_received += len(bodyBytes)
|
||||||
|
|
||||||
|
bytes_received_result = self._crawler.signals.send_catch_log(
|
||||||
|
signal=signals.bytes_received,
|
||||||
|
data=bodyBytes,
|
||||||
|
request=self._request,
|
||||||
|
spider=self._crawler.spider,
|
||||||
|
)
|
||||||
|
for handler, result in bytes_received_result:
|
||||||
|
if isinstance(result, Failure) and isinstance(result.value, StopDownload):
|
||||||
|
logger.debug("Download stopped for %(request)s from signal handler %(handler)s",
|
||||||
|
{"request": self._request, "handler": handler.__qualname__})
|
||||||
|
self.transport._producer.loseConnection()
|
||||||
|
failure = result if result.value.fail else None
|
||||||
|
self._finish_response(flags=["download_stopped"], failure=failure)
|
||||||
|
|
||||||
if self._maxsize and self._bytes_received > self._maxsize:
|
if self._maxsize and self._bytes_received > self._maxsize:
|
||||||
logger.error("Received (%(bytes)s) bytes larger than download "
|
logger.error("Received (%(bytes)s) bytes larger than download "
|
||||||
"max size (%(maxsize)s) in request %(request)s.",
|
"max size (%(maxsize)s) in request %(request)s.",
|
||||||
@ -498,18 +543,17 @@ class _ResponseReader(protocol.Protocol):
|
|||||||
if self._finished.called:
|
if self._finished.called:
|
||||||
return
|
return
|
||||||
|
|
||||||
body = self._bodybuf.getvalue()
|
|
||||||
if reason.check(ResponseDone):
|
if reason.check(ResponseDone):
|
||||||
self._finished.callback((self._txresponse, body, None, self._certificate))
|
self._finish_response()
|
||||||
return
|
return
|
||||||
|
|
||||||
if reason.check(PotentialDataLoss):
|
if reason.check(PotentialDataLoss):
|
||||||
self._finished.callback((self._txresponse, body, ['partial'], self._certificate))
|
self._finish_response(flags=["partial"])
|
||||||
return
|
return
|
||||||
|
|
||||||
if reason.check(ResponseFailed) and any(r.check(_DataLoss) for r in reason.value.reasons):
|
if reason.check(ResponseFailed) and any(r.check(_DataLoss) for r in reason.value.reasons):
|
||||||
if not self._fail_on_dataloss:
|
if not self._fail_on_dataloss:
|
||||||
self._finished.callback((self._txresponse, body, ['dataloss'], self._certificate))
|
self._finish_response(flags=["dataloss"])
|
||||||
return
|
return
|
||||||
|
|
||||||
elif not self._fail_on_dataloss_warned:
|
elif not self._fail_on_dataloss_warned:
|
||||||
|
@ -100,11 +100,12 @@ class S3DownloadHandler:
|
|||||||
url=url, headers=awsrequest.headers.items())
|
url=url, headers=awsrequest.headers.items())
|
||||||
else:
|
else:
|
||||||
signed_headers = self.conn.make_request(
|
signed_headers = self.conn.make_request(
|
||||||
method=request.method,
|
method=request.method,
|
||||||
bucket=bucket,
|
bucket=bucket,
|
||||||
key=unquote(p.path),
|
key=unquote(p.path),
|
||||||
query_args=unquote(p.query),
|
query_args=unquote(p.query),
|
||||||
headers=request.headers,
|
headers=request.headers,
|
||||||
data=request.body)
|
data=request.body,
|
||||||
|
)
|
||||||
request = request.replace(url=url, headers=signed_headers)
|
request = request.replace(url=url, headers=signed_headers)
|
||||||
return self._download_http(request, spider)
|
return self._download_http(request, spider)
|
||||||
|
@ -35,38 +35,45 @@ class DownloaderMiddlewareManager(MiddlewareManager):
|
|||||||
for method in self.methods['process_request']:
|
for method in self.methods['process_request']:
|
||||||
response = yield deferred_from_coro(method(request=request, spider=spider))
|
response = yield deferred_from_coro(method(request=request, spider=spider))
|
||||||
if response is not None and not isinstance(response, (Response, Request)):
|
if response is not None and not isinstance(response, (Response, Request)):
|
||||||
raise _InvalidOutput('Middleware %s.process_request must return None, Response or Request, got %s' % \
|
raise _InvalidOutput(
|
||||||
(method.__self__.__class__.__name__, response.__class__.__name__))
|
"Middleware %s.process_request must return None, Response or Request, got %s"
|
||||||
|
% (method.__self__.__class__.__name__, response.__class__.__name__)
|
||||||
|
)
|
||||||
if response:
|
if response:
|
||||||
defer.returnValue(response)
|
return response
|
||||||
defer.returnValue((yield download_func(request=request, spider=spider)))
|
return (yield download_func(request=request, spider=spider))
|
||||||
|
|
||||||
@defer.inlineCallbacks
|
@defer.inlineCallbacks
|
||||||
def process_response(response):
|
def process_response(response):
|
||||||
assert response is not None, 'Received None in process_response'
|
if response is None:
|
||||||
if isinstance(response, Request):
|
raise TypeError("Received None in process_response")
|
||||||
defer.returnValue(response)
|
elif isinstance(response, Request):
|
||||||
|
return response
|
||||||
|
|
||||||
for method in self.methods['process_response']:
|
for method in self.methods['process_response']:
|
||||||
response = yield deferred_from_coro(method(request=request, response=response, spider=spider))
|
response = yield deferred_from_coro(method(request=request, response=response, spider=spider))
|
||||||
if not isinstance(response, (Response, Request)):
|
if not isinstance(response, (Response, Request)):
|
||||||
raise _InvalidOutput('Middleware %s.process_response must return Response or Request, got %s' % \
|
raise _InvalidOutput(
|
||||||
(method.__self__.__class__.__name__, type(response)))
|
"Middleware %s.process_response must return Response or Request, got %s"
|
||||||
|
% (method.__self__.__class__.__name__, type(response))
|
||||||
|
)
|
||||||
if isinstance(response, Request):
|
if isinstance(response, Request):
|
||||||
defer.returnValue(response)
|
return response
|
||||||
defer.returnValue(response)
|
return response
|
||||||
|
|
||||||
@defer.inlineCallbacks
|
@defer.inlineCallbacks
|
||||||
def process_exception(_failure):
|
def process_exception(failure):
|
||||||
exception = _failure.value
|
exception = failure.value
|
||||||
for method in self.methods['process_exception']:
|
for method in self.methods['process_exception']:
|
||||||
response = yield deferred_from_coro(method(request=request, exception=exception, spider=spider))
|
response = yield deferred_from_coro(method(request=request, exception=exception, spider=spider))
|
||||||
if response is not None and not isinstance(response, (Response, Request)):
|
if response is not None and not isinstance(response, (Response, Request)):
|
||||||
raise _InvalidOutput('Middleware %s.process_exception must return None, Response or Request, got %s' % \
|
raise _InvalidOutput(
|
||||||
(method.__self__.__class__.__name__, type(response)))
|
"Middleware %s.process_exception must return None, Response or Request, got %s"
|
||||||
|
% (method.__self__.__class__.__name__, type(response))
|
||||||
|
)
|
||||||
if response:
|
if response:
|
||||||
defer.returnValue(response)
|
return response
|
||||||
defer.returnValue(_failure)
|
return failure
|
||||||
|
|
||||||
deferred = mustbe_deferred(process_request, request)
|
deferred = mustbe_deferred(process_request, request)
|
||||||
deferred.addErrback(process_exception)
|
deferred.addErrback(process_exception)
|
||||||
|
@ -20,8 +20,8 @@ METHOD_TLSv12 = 'TLSv1.2'
|
|||||||
|
|
||||||
|
|
||||||
openssl_methods = {
|
openssl_methods = {
|
||||||
METHOD_TLS: SSL.SSLv23_METHOD, # protocol negotiation (recommended)
|
METHOD_TLS: SSL.SSLv23_METHOD, # protocol negotiation (recommended)
|
||||||
METHOD_SSLv3: SSL.SSLv3_METHOD, # SSL 3 (NOT recommended)
|
METHOD_SSLv3: SSL.SSLv3_METHOD, # SSL 3 (NOT recommended)
|
||||||
METHOD_TLSv10: SSL.TLSv1_METHOD, # TLS 1.0 only
|
METHOD_TLSv10: SSL.TLSv1_METHOD, # TLS 1.0 only
|
||||||
METHOD_TLSv11: getattr(SSL, 'TLSv1_1_METHOD', 5), # TLS 1.1 only
|
METHOD_TLSv11: getattr(SSL, 'TLSv1_1_METHOD', 5), # TLS 1.1 only
|
||||||
METHOD_TLSv12: getattr(SSL, 'TLSv1_2_METHOD', 6), # TLS 1.2 only
|
METHOD_TLSv12: getattr(SSL, 'TLSv1_2_METHOD', 6), # TLS 1.2 only
|
||||||
|
@ -14,13 +14,12 @@ from scrapy.responsetypes import responsetypes
|
|||||||
def _parsed_url_args(parsed):
|
def _parsed_url_args(parsed):
|
||||||
# Assume parsed is urlparse-d from Request.url,
|
# Assume parsed is urlparse-d from Request.url,
|
||||||
# which was passed via safe_url_string and is ascii-only.
|
# which was passed via safe_url_string and is ascii-only.
|
||||||
b = lambda s: to_bytes(s, encoding='ascii')
|
|
||||||
path = urlunparse(('', '', parsed.path or '/', parsed.params, parsed.query, ''))
|
path = urlunparse(('', '', parsed.path or '/', parsed.params, parsed.query, ''))
|
||||||
path = b(path)
|
path = to_bytes(path, encoding="ascii")
|
||||||
host = b(parsed.hostname)
|
host = to_bytes(parsed.hostname, encoding="ascii")
|
||||||
port = parsed.port
|
port = parsed.port
|
||||||
scheme = b(parsed.scheme)
|
scheme = to_bytes(parsed.scheme, encoding="ascii")
|
||||||
netloc = b(parsed.netloc)
|
netloc = to_bytes(parsed.netloc, encoding="ascii")
|
||||||
if port is None:
|
if port is None:
|
||||||
port = 443 if scheme == b'https' else 80
|
port = 443 if scheme == b'https' else 80
|
||||||
return scheme, netloc, host, port, path
|
return scheme, netloc, host, port, path
|
||||||
@ -89,8 +88,8 @@ class ScrapyHTTPPageGetter(HTTPClient):
|
|||||||
self.transport.stopProducing()
|
self.transport.stopProducing()
|
||||||
|
|
||||||
self.factory.noPage(
|
self.factory.noPage(
|
||||||
defer.TimeoutError("Getting %s took longer than %s seconds." %
|
defer.TimeoutError("Getting %s took longer than %s seconds."
|
||||||
(self.factory.url, self.factory.timeout)))
|
% (self.factory.url, self.factory.timeout)))
|
||||||
|
|
||||||
|
|
||||||
class ScrapyHTTPClientFactory(HTTPClientFactory):
|
class ScrapyHTTPClientFactory(HTTPClientFactory):
|
||||||
|
@ -73,7 +73,8 @@ class ExecutionEngine:
|
|||||||
@defer.inlineCallbacks
|
@defer.inlineCallbacks
|
||||||
def start(self):
|
def start(self):
|
||||||
"""Start the execution engine"""
|
"""Start the execution engine"""
|
||||||
assert not self.running, "Engine already running"
|
if self.running:
|
||||||
|
raise RuntimeError("Engine already running")
|
||||||
self.start_time = time()
|
self.start_time = time()
|
||||||
yield self.signals.send_catch_log_deferred(signal=signals.engine_started)
|
yield self.signals.send_catch_log_deferred(signal=signals.engine_started)
|
||||||
self.running = True
|
self.running = True
|
||||||
@ -82,7 +83,8 @@ class ExecutionEngine:
|
|||||||
|
|
||||||
def stop(self):
|
def stop(self):
|
||||||
"""Stop the execution engine gracefully"""
|
"""Stop the execution engine gracefully"""
|
||||||
assert self.running, "Engine not running"
|
if not self.running:
|
||||||
|
raise RuntimeError("Engine not running")
|
||||||
self.running = False
|
self.running = False
|
||||||
dfd = self._close_all_spiders()
|
dfd = self._close_all_spiders()
|
||||||
return dfd.addBoth(lambda _: self._finish_stopping_engine())
|
return dfd.addBoth(lambda _: self._finish_stopping_engine())
|
||||||
@ -165,7 +167,11 @@ class ExecutionEngine:
|
|||||||
return d
|
return d
|
||||||
|
|
||||||
def _handle_downloader_output(self, response, request, spider):
|
def _handle_downloader_output(self, response, request, spider):
|
||||||
assert isinstance(response, (Request, Response, Failure)), response
|
if not isinstance(response, (Request, Response, Failure)):
|
||||||
|
raise TypeError(
|
||||||
|
"Incorrect type: expected Request, Response or Failure, got %s: %r"
|
||||||
|
% (type(response), response)
|
||||||
|
)
|
||||||
# downloader middleware can return requests (for example, redirects)
|
# downloader middleware can return requests (for example, redirects)
|
||||||
if isinstance(response, Request):
|
if isinstance(response, Request):
|
||||||
self.crawl(response, spider)
|
self.crawl(response, spider)
|
||||||
@ -205,17 +211,15 @@ class ExecutionEngine:
|
|||||||
return not bool(self.slot)
|
return not bool(self.slot)
|
||||||
|
|
||||||
def crawl(self, request, spider):
|
def crawl(self, request, spider):
|
||||||
assert spider in self.open_spiders, \
|
if spider not in self.open_spiders:
|
||||||
"Spider %r not opened when crawling: %s" % (spider.name, request)
|
raise RuntimeError("Spider %r not opened when crawling: %s" % (spider.name, request))
|
||||||
self.schedule(request, spider)
|
self.schedule(request, spider)
|
||||||
self.slot.nextcall.schedule()
|
self.slot.nextcall.schedule()
|
||||||
|
|
||||||
def schedule(self, request, spider):
|
def schedule(self, request, spider):
|
||||||
self.signals.send_catch_log(signal=signals.request_scheduled,
|
self.signals.send_catch_log(signals.request_scheduled, request=request, spider=spider)
|
||||||
request=request, spider=spider)
|
|
||||||
if not self.slot.scheduler.enqueue_request(request):
|
if not self.slot.scheduler.enqueue_request(request):
|
||||||
self.signals.send_catch_log(signal=signals.request_dropped,
|
self.signals.send_catch_log(signals.request_dropped, request=request, spider=spider)
|
||||||
request=request, spider=spider)
|
|
||||||
|
|
||||||
def download(self, request, spider):
|
def download(self, request, spider):
|
||||||
d = self._download(request, spider)
|
d = self._download(request, spider)
|
||||||
@ -224,22 +228,25 @@ class ExecutionEngine:
|
|||||||
|
|
||||||
def _downloaded(self, response, slot, request, spider):
|
def _downloaded(self, response, slot, request, spider):
|
||||||
slot.remove_request(request)
|
slot.remove_request(request)
|
||||||
return self.download(response, spider) \
|
return self.download(response, spider) if isinstance(response, Request) else response
|
||||||
if isinstance(response, Request) else response
|
|
||||||
|
|
||||||
def _download(self, request, spider):
|
def _download(self, request, spider):
|
||||||
slot = self.slot
|
slot = self.slot
|
||||||
slot.add_request(request)
|
slot.add_request(request)
|
||||||
|
|
||||||
def _on_success(response):
|
def _on_success(response):
|
||||||
assert isinstance(response, (Response, Request))
|
if not isinstance(response, (Response, Request)):
|
||||||
|
raise TypeError(
|
||||||
|
"Incorrect type: expected Response or Request, got %s: %r"
|
||||||
|
% (type(response), response)
|
||||||
|
)
|
||||||
if isinstance(response, Response):
|
if isinstance(response, Response):
|
||||||
response.request = request # tie request to response received
|
response.request = request # tie request to response received
|
||||||
logkws = self.logformatter.crawled(request, response, spider)
|
logkws = self.logformatter.crawled(request, response, spider)
|
||||||
if logkws is not None:
|
if logkws is not None:
|
||||||
logger.log(*logformatter_adapter(logkws), extra={'spider': spider})
|
logger.log(*logformatter_adapter(logkws), extra={'spider': spider})
|
||||||
self.signals.send_catch_log(signal=signals.response_received,
|
self.signals.send_catch_log(signals.response_received,
|
||||||
response=response, request=request, spider=spider)
|
response=response, request=request, spider=spider)
|
||||||
return response
|
return response
|
||||||
|
|
||||||
def _on_complete(_):
|
def _on_complete(_):
|
||||||
@ -253,8 +260,8 @@ class ExecutionEngine:
|
|||||||
|
|
||||||
@defer.inlineCallbacks
|
@defer.inlineCallbacks
|
||||||
def open_spider(self, spider, start_requests=(), close_if_idle=True):
|
def open_spider(self, spider, start_requests=(), close_if_idle=True):
|
||||||
assert self.has_capacity(), "No free spider slot when opening %r" % \
|
if not self.has_capacity():
|
||||||
spider.name
|
raise RuntimeError("No free spider slot when opening %r" % spider.name)
|
||||||
logger.info("Spider opened", extra={'spider': spider})
|
logger.info("Spider opened", extra={'spider': spider})
|
||||||
nextcall = CallLaterOnce(self._next_request, spider)
|
nextcall = CallLaterOnce(self._next_request, spider)
|
||||||
scheduler = self.scheduler_cls.from_crawler(self.crawler)
|
scheduler = self.scheduler_cls.from_crawler(self.crawler)
|
||||||
@ -277,10 +284,8 @@ class ExecutionEngine:
|
|||||||
next loop and this function is guaranteed to be called (at least) once
|
next loop and this function is guaranteed to be called (at least) once
|
||||||
again for this spider.
|
again for this spider.
|
||||||
"""
|
"""
|
||||||
res = self.signals.send_catch_log(signal=signals.spider_idle, \
|
res = self.signals.send_catch_log(signals.spider_idle, spider=spider, dont_log=DontCloseSpider)
|
||||||
spider=spider, dont_log=DontCloseSpider)
|
if any(isinstance(x, Failure) and isinstance(x.value, DontCloseSpider) for _, x in res):
|
||||||
if any(isinstance(x, Failure) and isinstance(x.value, DontCloseSpider) \
|
|
||||||
for _, x in res):
|
|
||||||
return
|
return
|
||||||
|
|
||||||
if self.spider_is_idle(spider):
|
if self.spider_is_idle(spider):
|
||||||
|
@ -4,18 +4,18 @@ extracts information from them"""
|
|||||||
import logging
|
import logging
|
||||||
from collections import deque
|
from collections import deque
|
||||||
|
|
||||||
from twisted.python.failure import Failure
|
from itemadapter import is_item
|
||||||
from twisted.internet import defer
|
from twisted.internet import defer
|
||||||
|
from twisted.python.failure import Failure
|
||||||
|
|
||||||
from scrapy.utils.defer import defer_result, defer_succeed, parallel, iter_errback
|
|
||||||
from scrapy.utils.spider import iterate_spider_output
|
|
||||||
from scrapy.utils.misc import load_object, warn_on_generator_with_return_value
|
|
||||||
from scrapy.utils.log import logformatter_adapter, failure_to_exc_info
|
|
||||||
from scrapy.exceptions import CloseSpider, DropItem, IgnoreRequest
|
|
||||||
from scrapy import signals
|
from scrapy import signals
|
||||||
from scrapy.http import Request, Response
|
|
||||||
from scrapy.item import BaseItem
|
|
||||||
from scrapy.core.spidermw import SpiderMiddlewareManager
|
from scrapy.core.spidermw import SpiderMiddlewareManager
|
||||||
|
from scrapy.exceptions import CloseSpider, DropItem, IgnoreRequest
|
||||||
|
from scrapy.http import Request, Response
|
||||||
|
from scrapy.utils.defer import defer_result, defer_succeed, iter_errback, parallel
|
||||||
|
from scrapy.utils.log import failure_to_exc_info, logformatter_adapter
|
||||||
|
from scrapy.utils.misc import load_object, warn_on_generator_with_return_value
|
||||||
|
from scrapy.utils.spider import iterate_spider_output
|
||||||
|
|
||||||
|
|
||||||
logger = logging.getLogger(__name__)
|
logger = logging.getLogger(__name__)
|
||||||
@ -123,7 +123,11 @@ class Scraper:
|
|||||||
def _scrape(self, response, request, spider):
|
def _scrape(self, response, request, spider):
|
||||||
"""Handle the downloaded response or failure through the spider
|
"""Handle the downloaded response or failure through the spider
|
||||||
callback/errback"""
|
callback/errback"""
|
||||||
assert isinstance(response, (Response, Failure))
|
if not isinstance(response, (Response, Failure)):
|
||||||
|
raise TypeError(
|
||||||
|
"Incorrect type: expected Response or Failure, got %s: %r"
|
||||||
|
% (type(response), response)
|
||||||
|
)
|
||||||
|
|
||||||
dfd = self._scrape2(response, request, spider) # returns spider's processed output
|
dfd = self._scrape2(response, request, spider) # returns spider's processed output
|
||||||
dfd.addErrback(self.handle_spider_error, request, response, spider)
|
dfd.addErrback(self.handle_spider_error, request, response, spider)
|
||||||
@ -187,7 +191,7 @@ class Scraper:
|
|||||||
"""
|
"""
|
||||||
if isinstance(output, Request):
|
if isinstance(output, Request):
|
||||||
self.crawler.engine.crawl(request=output, spider=spider)
|
self.crawler.engine.crawl(request=output, spider=spider)
|
||||||
elif isinstance(output, (BaseItem, dict)):
|
elif is_item(output):
|
||||||
self.slot.itemproc_size += 1
|
self.slot.itemproc_size += 1
|
||||||
dfd = self.itemproc.process_item(output, spider)
|
dfd = self.itemproc.process_item(output, spider)
|
||||||
dfd.addBoth(self._itemproc_finished, output, response, spider)
|
dfd.addBoth(self._itemproc_finished, output, response, spider)
|
||||||
@ -196,10 +200,11 @@ class Scraper:
|
|||||||
pass
|
pass
|
||||||
else:
|
else:
|
||||||
typename = type(output).__name__
|
typename = type(output).__name__
|
||||||
logger.error('Spider must return Request, BaseItem, dict or None, '
|
logger.error(
|
||||||
'got %(typename)r in %(request)s',
|
'Spider must return request, item, or None, got %(typename)r in %(request)s',
|
||||||
{'request': request, 'typename': typename},
|
{'request': request, 'typename': typename},
|
||||||
extra={'spider': spider})
|
extra={'spider': spider},
|
||||||
|
)
|
||||||
|
|
||||||
def _log_download_errors(self, spider_failure, download_failure, request, spider):
|
def _log_download_errors(self, spider_failure, download_failure, request, spider):
|
||||||
"""Log and silence errors that come from the engine (typically download
|
"""Log and silence errors that come from the engine (typically download
|
||||||
|
@ -19,7 +19,7 @@ def _isiterable(possible_iterator):
|
|||||||
|
|
||||||
|
|
||||||
def _fname(f):
|
def _fname(f):
|
||||||
return "%s.%s".format(
|
return "{}.{}".format(
|
||||||
f.__self__.__class__.__name__,
|
f.__self__.__class__.__name__,
|
||||||
f.__func__.__name__
|
f.__func__.__name__
|
||||||
)
|
)
|
||||||
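The _fname fix above is worth spelling out: str.format() only substitutes {} fields, so the old "%s.%s" template was returned verbatim. A quick illustration (Mw is a throwaway class for the example):

class Mw:
    def process_request(self):
        pass

f = Mw().process_request
print("%s.%s".format(f.__self__.__class__.__name__, f.__func__.__name__))  # '%s.%s' (old, buggy output)
print("{}.{}".format(f.__self__.__class__.__name__, f.__func__.__name__))  # 'Mw.process_request'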
|
@ -78,7 +78,8 @@ class Crawler:
|
|||||||
|
|
||||||
@defer.inlineCallbacks
|
@defer.inlineCallbacks
|
||||||
def crawl(self, *args, **kwargs):
|
def crawl(self, *args, **kwargs):
|
||||||
assert not self.crawling, "Crawling already taking place"
|
if self.crawling:
|
||||||
|
raise RuntimeError("Crawling already taking place")
|
||||||
self.crawling = True
|
self.crawling = True
|
||||||
|
|
||||||
try:
|
try:
|
||||||
|
@ -1,4 +1,3 @@
|
|||||||
# -*- coding: utf-8 -*-
|
|
||||||
import re
|
import re
|
||||||
import logging
|
import logging
|
||||||
|
|
||||||
|
@ -29,8 +29,7 @@ class CookiesMiddleware:
|
|||||||
|
|
||||||
cookiejarkey = request.meta.get("cookiejar")
|
cookiejarkey = request.meta.get("cookiejar")
|
||||||
jar = self.jars[cookiejarkey]
|
jar = self.jars[cookiejarkey]
|
||||||
cookies = self._get_request_cookies(jar, request)
|
for cookie in self._get_request_cookies(jar, request):
|
||||||
for cookie in cookies:
|
|
||||||
jar.set_cookie_if_ok(cookie, request)
|
jar.set_cookie_if_ok(cookie, request)
|
||||||
|
|
||||||
# set Cookie header
|
# set Cookie header
|
||||||
@ -68,28 +67,65 @@ class CookiesMiddleware:
|
|||||||
msg = "Received cookies from: {}\n{}".format(response, cookies)
|
msg = "Received cookies from: {}\n{}".format(response, cookies)
|
||||||
logger.debug(msg, extra={'spider': spider})
|
logger.debug(msg, extra={'spider': spider})
|
||||||
|
|
||||||
def _format_cookie(self, cookie):
|
def _format_cookie(self, cookie, request):
|
||||||
# build cookie string
|
"""
|
||||||
cookie_str = '%s=%s' % (cookie['name'], cookie['value'])
|
Given a dict consisting of cookie components, return its string representation.
|
||||||
|
Decode from bytes if necessary.
|
||||||
if cookie.get('path', None):
|
"""
|
||||||
cookie_str += '; Path=%s' % cookie['path']
|
decoded = {}
|
||||||
if cookie.get('domain', None):
|
for key in ("name", "value", "path", "domain"):
|
||||||
cookie_str += '; Domain=%s' % cookie['domain']
|
if not cookie.get(key):
|
||||||
|
if key in ("name", "value"):
|
||||||
|
msg = "Invalid cookie found in request {}: {} ('{}' is missing)"
|
||||||
|
logger.warning(msg.format(request, cookie, key))
|
||||||
|
return
|
||||||
|
continue
|
||||||
|
if isinstance(cookie[key], str):
|
||||||
|
decoded[key] = cookie[key]
|
||||||
|
else:
|
||||||
|
try:
|
||||||
|
decoded[key] = cookie[key].decode("utf8")
|
||||||
|
except UnicodeDecodeError:
|
||||||
|
logger.warning("Non UTF-8 encoded cookie found in request %s: %s",
|
||||||
|
request, cookie)
|
||||||
|
decoded[key] = cookie[key].decode("latin1", errors="replace")
|
||||||
|
|
||||||
|
cookie_str = "{}={}".format(decoded.pop("name"), decoded.pop("value"))
|
||||||
|
for key, value in decoded.items(): # path, domain
|
||||||
|
cookie_str += "; {}={}".format(key.capitalize(), value)
|
||||||
return cookie_str
|
return cookie_str
|
||||||
|
|
||||||
def _get_request_cookies(self, jar, request):
|
def _get_request_cookies(self, jar, request):
|
||||||
if isinstance(request.cookies, dict):
|
"""
|
||||||
cookie_list = [
|
Extract cookies from a Request. Values from the `Request.cookies` attribute
|
||||||
{'name': k, 'value': v}
|
take precedence over values from the `Cookie` request header.
|
||||||
for k, v in request.cookies.items()
|
"""
|
||||||
]
|
def get_cookies_from_header(jar, request):
|
||||||
else:
|
cookie_header = request.headers.get("Cookie")
|
||||||
cookie_list = request.cookies
|
if not cookie_header:
|
||||||
|
return []
|
||||||
|
cookie_gen_bytes = (s.strip() for s in cookie_header.split(b";"))
|
||||||
|
cookie_list_unicode = []
|
||||||
|
for cookie_bytes in cookie_gen_bytes:
|
||||||
|
try:
|
||||||
|
cookie_unicode = cookie_bytes.decode("utf8")
|
||||||
|
except UnicodeDecodeError:
|
||||||
|
logger.warning("Non UTF-8 encoded cookie found in request %s: %s",
|
||||||
|
request, cookie_bytes)
|
||||||
|
cookie_unicode = cookie_bytes.decode("latin1", errors="replace")
|
||||||
|
cookie_list_unicode.append(cookie_unicode)
|
||||||
|
response = Response(request.url, headers={"Set-Cookie": cookie_list_unicode})
|
||||||
|
return jar.make_cookies(response, request)
|
||||||
|
|
||||||
cookies = [self._format_cookie(x) for x in cookie_list]
|
def get_cookies_from_attribute(jar, request):
|
||||||
headers = {'Set-Cookie': cookies}
|
if not request.cookies:
|
||||||
response = Response(request.url, headers=headers)
|
return []
|
||||||
|
elif isinstance(request.cookies, dict):
|
||||||
|
cookies = ({"name": k, "value": v} for k, v in request.cookies.items())
|
||||||
|
else:
|
||||||
|
cookies = request.cookies
|
||||||
|
formatted = filter(None, (self._format_cookie(c, request) for c in cookies))
|
||||||
|
response = Response(request.url, headers={"Set-Cookie": formatted})
|
||||||
|
return jar.make_cookies(response, request)
|
||||||
|
|
||||||
return jar.make_cookies(response, request)
|
return get_cookies_from_header(jar, request) + get_cookies_from_attribute(jar, request)
|
||||||
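With get_cookies_from_header added above, cookies sent as a raw Cookie header are now loaded into the cookiejar alongside Request.cookies (values from the attribute take precedence). A sketch of the two ways to send the same cookie (URL and values are illustrative):

from scrapy import Request

# via the cookies argument (already supported before this patch)
r1 = Request("https://example.com", cookies={"currency": "USD"})

# via a raw Cookie header (handled by the new header-parsing path)
r2 = Request("https://example.com", headers={"Cookie": "currency=USD; country=UY"})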
|
@ -60,11 +60,14 @@ class RedirectMiddleware(BaseRedirectMiddleware):
|
|||||||
Handle redirection of requests based on response status
|
Handle redirection of requests based on response status
|
||||||
and meta-refresh html tag.
|
and meta-refresh html tag.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
def process_response(self, request, response, spider):
|
def process_response(self, request, response, spider):
|
||||||
if (request.meta.get('dont_redirect', False) or
|
if (
|
||||||
response.status in getattr(spider, 'handle_httpstatus_list', []) or
|
request.meta.get('dont_redirect', False)
|
||||||
response.status in request.meta.get('handle_httpstatus_list', []) or
|
or response.status in getattr(spider, 'handle_httpstatus_list', [])
|
||||||
request.meta.get('handle_httpstatus_all', False)):
|
or response.status in request.meta.get('handle_httpstatus_list', [])
|
||||||
|
or request.meta.get('handle_httpstatus_all', False)
|
||||||
|
):
|
||||||
return response
|
return response
|
||||||
|
|
||||||
allowed_status = (301, 302, 303, 307, 308)
|
allowed_status = (301, 302, 303, 307, 308)
|
||||||
|
@ -12,9 +12,15 @@ once the spider has finished crawling all regular (non failed) pages.
|
|||||||
import logging
|
import logging
|
||||||
|
|
||||||
from twisted.internet import defer
|
from twisted.internet import defer
|
||||||
from twisted.internet.error import TimeoutError, DNSLookupError, \
|
from twisted.internet.error import (
|
||||||
ConnectionRefusedError, ConnectionDone, ConnectError, \
|
ConnectError,
|
||||||
ConnectionLost, TCPTimedOutError
|
ConnectionDone,
|
||||||
|
ConnectionLost,
|
||||||
|
ConnectionRefusedError,
|
||||||
|
DNSLookupError,
|
||||||
|
TCPTimedOutError,
|
||||||
|
TimeoutError,
|
||||||
|
)
|
||||||
from twisted.web.client import ResponseFailed
|
from twisted.web.client import ResponseFailed
|
||||||
|
|
||||||
from scrapy.exceptions import NotConfigured
|
from scrapy.exceptions import NotConfigured
|
||||||
|
@ -61,7 +61,7 @@ class RFPDupeFilter(BaseDupeFilter):
|
|||||||
def log(self, request, spider):
|
def log(self, request, spider):
|
||||||
if self.debug:
|
if self.debug:
|
||||||
msg = "Filtered duplicate request: %(request)s (referer: %(referer)s)"
|
msg = "Filtered duplicate request: %(request)s (referer: %(referer)s)"
|
||||||
args = {'request': request, 'referer': referer_str(request) }
|
args = {'request': request, 'referer': referer_str(request)}
|
||||||
self.logger.debug(msg, args, extra={'spider': spider})
|
self.logger.debug(msg, args, extra={'spider': spider})
|
||||||
elif self.logdupes:
|
elif self.logdupes:
|
||||||
msg = ("Filtered duplicate request: %(request)s"
|
msg = ("Filtered duplicate request: %(request)s"
|
||||||
|
@ -41,6 +41,18 @@ class CloseSpider(Exception):
|
|||||||
self.reason = reason
|
self.reason = reason
|
||||||
|
|
||||||
|
|
||||||
|
class StopDownload(Exception):
|
||||||
|
"""
|
||||||
|
Stop the download of the body for a given response.
|
||||||
|
The 'fail' boolean parameter indicates whether or not the resulting partial response
|
||||||
|
should be handled by the request errback. Note that 'fail' is a keyword-only argument.
|
||||||
|
"""
|
||||||
|
|
||||||
|
def __init__(self, *, fail=True):
|
||||||
|
super().__init__()
|
||||||
|
self.fail = fail
|
||||||
|
|
||||||
|
|
||||||
# Items
|
# Items
|
||||||
|
|
||||||
|
|
||||||
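StopDownload is meant to be raised from a bytes_received signal handler (see the send_catch_log call added to _ResponseReader.dataReceived earlier in this diff); fail=False routes the truncated response to the callback instead of the errback. A minimal sketch (spider name and URL are illustrative):

import scrapy
from scrapy import signals
from scrapy.exceptions import StopDownload

class StopEarlySpider(scrapy.Spider):
    name = "stop_early"
    start_urls = ["https://example.com"]

    @classmethod
    def from_crawler(cls, crawler, *args, **kwargs):
        spider = super().from_crawler(crawler, *args, **kwargs)
        crawler.signals.connect(spider.on_bytes_received, signal=signals.bytes_received)
        return spider

    def on_bytes_received(self, data, request, spider):
        # stop after the first received chunk; the partial response still reaches parse()
        raise StopDownload(fail=False)

    def parse(self, response):
        self.logger.info("received %d body bytes", len(response.body))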
@ -59,6 +71,7 @@ class NotSupported(Exception):
|
|||||||
|
|
||||||
class UsageError(Exception):
|
class UsageError(Exception):
|
||||||
"""To indicate a command-line usage error"""
|
"""To indicate a command-line usage error"""
|
||||||
|
|
||||||
def __init__(self, *a, **kw):
|
def __init__(self, *a, **kw):
|
||||||
self.print_help = kw.pop('print_help', True)
|
self.print_help = kw.pop('print_help', True)
|
||||||
super(UsageError, self).__init__(*a, **kw)
|
super(UsageError, self).__init__(*a, **kw)
|
||||||
|
@ -4,16 +4,18 @@ Item Exporters are used to export/serialize items into different formats.
|
|||||||
|
|
||||||
import csv
|
import csv
|
||||||
import io
|
import io
|
||||||
import pprint
|
|
||||||
import marshal
|
import marshal
|
||||||
import warnings
|
|
||||||
import pickle
|
import pickle
|
||||||
|
import pprint
|
||||||
|
import warnings
|
||||||
from xml.sax.saxutils import XMLGenerator
|
from xml.sax.saxutils import XMLGenerator
|
||||||
|
|
||||||
from scrapy.utils.serialize import ScrapyJSONEncoder
|
from itemadapter import is_item, ItemAdapter
|
||||||
from scrapy.utils.python import to_bytes, to_unicode, is_listlike
|
|
||||||
from scrapy.item import BaseItem
|
|
||||||
from scrapy.exceptions import ScrapyDeprecationWarning
|
from scrapy.exceptions import ScrapyDeprecationWarning
|
||||||
|
from scrapy.item import _BaseItem
|
||||||
|
from scrapy.utils.python import is_listlike, to_bytes, to_unicode
|
||||||
|
from scrapy.utils.serialize import ScrapyJSONEncoder
|
||||||
|
|
||||||
|
|
||||||
__all__ = ['BaseItemExporter', 'PprintItemExporter', 'PickleItemExporter',
|
__all__ = ['BaseItemExporter', 'PprintItemExporter', 'PickleItemExporter',
|
||||||
@ -56,11 +58,14 @@ class BaseItemExporter:
|
|||||||
"""Return the fields to export as an iterable of tuples
|
"""Return the fields to export as an iterable of tuples
|
||||||
(name, serialized_value)
|
(name, serialized_value)
|
||||||
"""
|
"""
|
||||||
|
item = ItemAdapter(item)
|
||||||
|
|
||||||
if include_empty is None:
|
if include_empty is None:
|
||||||
include_empty = self.export_empty_fields
|
include_empty = self.export_empty_fields
|
||||||
|
|
||||||
if self.fields_to_export is None:
|
if self.fields_to_export is None:
|
||||||
if include_empty and not isinstance(item, dict):
|
if include_empty:
|
||||||
field_iter = item.fields.keys()
|
field_iter = item.field_names()
|
||||||
else:
|
else:
|
||||||
field_iter = item.keys()
|
field_iter = item.keys()
|
||||||
else:
|
else:
|
||||||
@@ -71,8 +76,8 @@ class BaseItemExporter:

         for field_name in field_iter:
             if field_name in item:
-                field = {} if isinstance(item, dict) else item.fields[field_name]
-                value = self.serialize_field(field, field_name, item[field_name])
+                field_meta = item.get_field_meta(field_name)
+                value = self.serialize_field(field_meta, field_name, item[field_name])
             else:
                 value = default_value

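The two exporter hunks above replace direct Item/dict access with the itemadapter library; a hedged sketch of the ItemAdapter calls they rely on (is_item, field_names, get_field_meta, items), using an illustrative item class:

    import scrapy
    from itemadapter import ItemAdapter, is_item


    class Product(scrapy.Item):
        name = scrapy.Field(serializer=str)


    item = Product(name="shoes")
    adapter = ItemAdapter(item)            # uniform wrapper over Item, dict, dataclass, attrs objects
    print(is_item(item))                   # True
    print(list(adapter.field_names()))     # ['name']
    print(adapter.get_field_meta("name"))  # {'serializer': <class 'str'>}
    print(dict(adapter.items()))           # {'name': 'shoes'}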
@@ -250,7 +255,7 @@ class CsvItemExporter(BaseItemExporter):

 class PickleItemExporter(BaseItemExporter):

-    def __init__(self, file, protocol=2, **kwargs):
+    def __init__(self, file, protocol=4, **kwargs):
         super().__init__(**kwargs)
         self.file = file
         self.protocol = protocol
@@ -297,6 +302,7 @@ class PythonItemExporter(BaseItemExporter):

     .. _msgpack: https://pypi.org/project/msgpack/
     """
+
     def _configure(self, options, dont_fail=False):
         self.binary = options.pop('binary', True)
         super(PythonItemExporter, self)._configure(options, dont_fail)
@@ -312,24 +318,24 @@ class PythonItemExporter(BaseItemExporter):
         return serializer(value)

     def _serialize_value(self, value):
-        if isinstance(value, BaseItem):
+        if isinstance(value, _BaseItem):
             return self.export_item(value)
-        if isinstance(value, dict):
-            return dict(self._serialize_dict(value))
-        if is_listlike(value):
+        elif is_item(value):
+            return dict(self._serialize_item(value))
+        elif is_listlike(value):
             return [self._serialize_value(v) for v in value]
         encode_func = to_bytes if self.binary else to_unicode
         if isinstance(value, (str, bytes)):
             return encode_func(value, encoding=self.encoding)
         return value

-    def _serialize_dict(self, value):
-        for key, val in value.items():
+    def _serialize_item(self, item):
+        for key, value in ItemAdapter(item).items():
             key = to_bytes(key) if self.binary else key
-            yield key, self._serialize_value(val)
+            yield key, self._serialize_value(value)

     def export_item(self, item):
         result = dict(self._get_serialized_fields(item))
         if self.binary:
-            result = dict(self._serialize_dict(result))
+            result = dict(self._serialize_item(result))
         return result
@@ -20,7 +20,7 @@ class CloseSpider:
             'itemcount': crawler.settings.getint('CLOSESPIDER_ITEMCOUNT'),
             'pagecount': crawler.settings.getint('CLOSESPIDER_PAGECOUNT'),
             'errorcount': crawler.settings.getint('CLOSESPIDER_ERRORCOUNT'),
-            }
+        }

         if not any(self.close_on.values()):
             raise NotConfigured
@@ -270,18 +270,29 @@ class FeedExporter:
             if not slot.itemcount and not slot.store_empty:
                 # We need to call slot.storage.store nonetheless to get the file
                 # properly closed.
-                return defer.maybeDeferred(slot.storage.store, slot.file)
+                d = defer.maybeDeferred(slot.storage.store, slot.file)
+                deferred_list.append(d)
+                continue
             slot.finish_exporting()
             logfmt = "%s %%(format)s feed (%%(itemcount)d items) in: %%(uri)s"
             log_args = {'format': slot.format,
                         'itemcount': slot.itemcount,
                         'uri': slot.uri}
             d = defer.maybeDeferred(slot.storage.store, slot.file)
-            d.addCallback(lambda _: logger.info(logfmt % "Stored", log_args,
-                                                extra={'spider': spider}))
-            d.addErrback(lambda f: logger.error(logfmt % "Error storing", log_args,
-                                                exc_info=failure_to_exc_info(f),
-                                                extra={'spider': spider}))
+            # Use `largs=log_args` to copy log_args into function's scope
+            # instead of using `log_args` from the outer scope
+            d.addCallback(
+                lambda _, largs=log_args: logger.info(
+                    logfmt % "Stored", largs, extra={'spider': spider}
+                )
+            )
+            d.addErrback(
+                lambda f, largs=log_args: logger.error(
+                    logfmt % "Error storing", largs,
+                    exc_info=failure_to_exc_info(f), extra={'spider': spider}
+                )
+            )
             deferred_list.append(d)
         return defer.DeferredList(deferred_list) if deferred_list else None

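The `largs=log_args` default argument above is the usual fix for Python's late-binding closures: `log_args` is rebound on every loop iteration, so a plain lambda would log the values of the last slot once the deferreds fire. A standalone illustration of the difference:

    # Late binding: every callback sees the final value of i.
    late = [lambda: i for i in range(3)]
    print([f() for f in late])     # [2, 2, 2]

    # Default argument: i is copied into each function's own scope at definition time.
    early = [lambda i=i: i for i in range(3)]
    print([f() for f in early])    # [0, 1, 2]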
@@ -46,9 +46,10 @@ class RFC2616Policy:
     def __init__(self, settings):
         self.always_store = settings.getbool('HTTPCACHE_ALWAYS_STORE')
         self.ignore_schemes = settings.getlist('HTTPCACHE_IGNORE_SCHEMES')
-        self.ignore_response_cache_controls = [to_bytes(cc) for cc in
-                                               settings.getlist('HTTPCACHE_IGNORE_RESPONSE_CACHE_CONTROLS')]
         self._cc_parsed = WeakKeyDictionary()
+        self.ignore_response_cache_controls = [
+            to_bytes(cc) for cc in settings.getlist('HTTPCACHE_IGNORE_RESPONSE_CACHE_CONTROLS')
+        ]

     def _parse_cachecontrol(self, r):
         if r not in self._cc_parsed:
@@ -250,7 +251,7 @@ class DbmCacheStorage:
             'headers': dict(response.headers),
             'body': response.body,
         }
-        self.db['%s_data' % key] = pickle.dumps(data, protocol=2)
+        self.db['%s_data' % key] = pickle.dumps(data, protocol=4)
         self.db['%s_time' % key] = str(time())

     def _read_data(self, spider, request):
@ -317,7 +318,7 @@ class FilesystemCacheStorage:
|
|||||||
with self._open(os.path.join(rpath, 'meta'), 'wb') as f:
|
with self._open(os.path.join(rpath, 'meta'), 'wb') as f:
|
||||||
f.write(to_bytes(repr(metadata)))
|
f.write(to_bytes(repr(metadata)))
|
||||||
with self._open(os.path.join(rpath, 'pickled_meta'), 'wb') as f:
|
with self._open(os.path.join(rpath, 'pickled_meta'), 'wb') as f:
|
||||||
pickle.dump(metadata, f, protocol=2)
|
pickle.dump(metadata, f, protocol=4)
|
||||||
with self._open(os.path.join(rpath, 'response_headers'), 'wb') as f:
|
with self._open(os.path.join(rpath, 'response_headers'), 'wb') as f:
|
||||||
f.write(headers_dict_to_raw(response.headers))
|
f.write(headers_dict_to_raw(response.headers))
|
||||||
with self._open(os.path.join(rpath, 'response_body'), 'wb') as f:
|
with self._open(os.path.join(rpath, 'response_body'), 'wb') as f:
|
||||||
|
@@ -26,7 +26,7 @@ class SpiderState:
     def spider_closed(self, spider):
         if self.jobdir:
             with open(self.statefn, 'wb') as f:
-                pickle.dump(spider.state, f, protocol=2)
+                pickle.dump(spider.state, f, protocol=4)

     def spider_opened(self, spider):
         if self.jobdir and os.path.exists(self.statefn):
@@ -76,8 +76,10 @@ class TelnetConsole(protocol.ServerFactory):
             """An implementation of IPortal"""
             @defers
             def login(self_, credentials, mind, *interfaces):
-                if not (credentials.username == self.username.encode('utf8') and
-                        credentials.checkPassword(self.password.encode('utf8'))):
+                if not (
+                    credentials.username == self.username.encode('utf8')
+                    and credentials.checkPassword(self.password.encode('utf8'))
+                ):
                     raise ValueError("Invalid credentials")

                 protocol = telnet.TelnetBootstrapProtocol(
@@ -24,7 +24,8 @@ class Request(object_ref):
         self.method = str(method).upper()
         self._set_url(url)
         self._set_body(body)
-        assert isinstance(priority, int), "Request priority not an integer: %r" % priority
+        if not isinstance(priority, int):
+            raise TypeError("Request priority not an integer: %r" % priority)
         self.priority = priority

         if callback is not None and not callable(callback):
@@ -129,6 +130,9 @@ class Request(object_ref):
         :class:`~scrapy.downloadermiddlewares.httpcompression.HttpCompressionMiddleware`,
         may modify the :class:`~scrapy.http.Request` object.

+        To translate a cURL command into a Scrapy request,
+        you may use `curl2scrapy <https://michael-shub.github.io/curl2scrapy/>`_.
+
         """
         request_kwargs = curl_to_request_kwargs(curl_command, ignore_unknown_options)
         request_kwargs.update(kwargs)
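For context, a hedged example of the `Request.from_curl()` classmethod this docstring belongs to (the cURL command is illustrative):

    from scrapy import Request

    request = Request.from_curl(
        "curl 'https://example.org/api' -H 'Accept: application/json' --data 'q=scrapy'"
    )
    print(request.method)    # POST, because the command carries a request body
    print(request.headers)   # includes the Accept header parsed from -H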
@@ -178,12 +178,11 @@ def _get_clickable(clickdata, form):
     if the latter is given. If not, it returns the first
     clickable element found
     """
-    clickables = [
-        el for el in form.xpath(
-            'descendant::input[re:test(@type, "^(submit|image)$", "i")]'
-            '|descendant::button[not(@type) or re:test(@type, "^submit$", "i")]',
-            namespaces={"re": "http://exslt.org/regular-expressions"})
-    ]
+    clickables = list(form.xpath(
+        'descendant::input[re:test(@type, "^(submit|image)$", "i")]'
+        '|descendant::button[not(@type) or re:test(@type, "^submit$", "i")]',
+        namespaces={"re": "http://exslt.org/regular-expressions"}
+    ))
     if not clickables:
         return

@@ -17,7 +17,8 @@ from scrapy.utils.trackref import object_ref

 class Response(object_ref):

-    def __init__(self, url, status=200, headers=None, body=b'', flags=None, request=None, certificate=None):
+    def __init__(self, url, status=200, headers=None, body=b'', flags=None,
+                 request=None, certificate=None, ip_address=None):
         self.headers = Headers(headers or {})
         self.status = int(status)
         self._set_body(body)
@@ -25,6 +26,7 @@ class Response(object_ref):
         self.request = request
         self.flags = [] if flags is None else list(flags)
         self.certificate = certificate
+        self.ip_address = ip_address

     @property
     def cb_kwargs(self):
@@ -87,7 +89,8 @@ class Response(object_ref):
         """Create a new Response with the same attributes except for those
         given new values.
         """
-        for x in ['url', 'status', 'headers', 'body', 'request', 'flags', 'certificate']:
+        for x in ['url', 'status', 'headers', 'body',
+                  'request', 'flags', 'certificate', 'ip_address']:
             kwargs.setdefault(x, getattr(self, x))
         cls = kwargs.pop('cls', self.__class__)
         return cls(*args, **kwargs)
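A hedged sketch of the new `ip_address` attribute threaded through the three hunks above; it is expected to hold an `ipaddress.IPv4Address`/`IPv6Address` when the download handler provides one, and `None` otherwise:

    import scrapy


    class IPLoggingSpider(scrapy.Spider):
        name = "ip_logging"
        start_urls = ["https://example.com"]

        def parse(self, response):
            # ip_address also survives response.replace(), per the updated attribute list
            self.logger.info("%s was served from %s", response.url, response.ip_address)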
@@ -5,6 +5,8 @@ discovering (through HTTP headers) to base Response class.
 See documentation in docs/topics/request-response.rst
 """

+import json
+import warnings
 from contextlib import suppress
 from typing import Generator
 from urllib.parse import urljoin
@@ -14,15 +16,19 @@ from w3lib.encoding import (html_body_declared_encoding, html_to_unicode,
                             http_content_type_encoding, resolve_encoding)
 from w3lib.html import strip_html5_whitespace

+from scrapy.exceptions import ScrapyDeprecationWarning
 from scrapy.http import Request
 from scrapy.http.response import Response
 from scrapy.utils.python import memoizemethod_noargs, to_unicode
 from scrapy.utils.response import get_base_url

+_NONE = object()
+

 class TextResponse(Response):

     _DEFAULT_ENCODING = 'ascii'
+    _cached_decoded_json = _NONE

     def __init__(self, *args, **kwargs):
         self._encoding = kwargs.pop('encoding', None)
@@ -61,8 +67,21 @@ class TextResponse(Response):

     def body_as_unicode(self):
         """Return body as unicode"""
+        warnings.warn('Response.body_as_unicode() is deprecated, '
+                      'please use Response.text instead.',
+                      ScrapyDeprecationWarning, stacklevel=2)
         return self.text

+    def json(self):
+        """
+        .. versionadded:: 2.2
+
+        Deserialize a JSON document to a Python object.
+        """
+        if self._cached_decoded_json is _NONE:
+            self._cached_decoded_json = json.loads(self.text)
+        return self._cached_decoded_json
+
     @property
     def text(self):
         """ Body as unicode """
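A short sketch of the new `TextResponse.json()` helper added above; the decoded object is cached in `_cached_decoded_json`, so repeated calls do not re-parse the body (the endpoint is illustrative):

    import scrapy


    class ApiSpider(scrapy.Spider):
        name = "api"
        start_urls = ["https://httpbin.org/json"]

        def parse(self, response):
            data = response.json()    # same as json.loads(response.text), but cached
            yield {"keys": list(data)}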
@@ -14,28 +14,39 @@ from scrapy.utils.deprecate import ScrapyDeprecationWarning
 from scrapy.utils.trackref import object_ref


-class BaseItem(object_ref):
-    """Base class for all scraped items.
-
-    In Scrapy, an object is considered an *item* if it is an instance of either
-    :class:`BaseItem` or :class:`dict`. For example, when the output of a
-    spider callback is evaluated, only instances of :class:`BaseItem` or
-    :class:`dict` are passed to :ref:`item pipelines <topics-item-pipeline>`.
-
-    If you need instances of a custom class to be considered items by Scrapy,
-    you must inherit from either :class:`BaseItem` or :class:`dict`.
-
-    Unlike instances of :class:`dict`, instances of :class:`BaseItem` may be
-    :ref:`tracked <topics-leaks-trackrefs>` to debug memory leaks.
+class _BaseItem(object_ref):
+    """
+    Temporary class used internally to avoid the deprecation
+    warning raised by isinstance checks using BaseItem.
     """
     pass


+class _BaseItemMeta(ABCMeta):
+    def __instancecheck__(cls, instance):
+        if cls is BaseItem:
+            warn('scrapy.item.BaseItem is deprecated, please use scrapy.item.Item instead',
+                 ScrapyDeprecationWarning, stacklevel=2)
+        return super().__instancecheck__(instance)
+
+
+class BaseItem(_BaseItem, metaclass=_BaseItemMeta):
+    """
+    Deprecated, please use :class:`scrapy.item.Item` instead
+    """
+
+    def __new__(cls, *args, **kwargs):
+        if issubclass(cls, BaseItem) and not issubclass(cls, (Item, DictItem)):
+            warn('scrapy.item.BaseItem is deprecated, please use scrapy.item.Item instead',
+                 ScrapyDeprecationWarning, stacklevel=2)
+        return super(BaseItem, cls).__new__(cls, *args, **kwargs)
+
+
 class Field(dict):
     """Container of field metadata"""


-class ItemMeta(ABCMeta):
+class ItemMeta(_BaseItemMeta):
     """Metaclass_ of :class:`Item` that handles field definitions.

     .. _metaclass: https://realpython.com/python-metaclasses
@@ -68,8 +79,7 @@ class DictItem(MutableMapping, BaseItem):

     def __new__(cls, *args, **kwargs):
         if issubclass(cls, DictItem) and not issubclass(cls, Item):
-            warn('scrapy.item.DictItem is deprecated, please use '
-                 'scrapy.item.Item instead',
+            warn('scrapy.item.DictItem is deprecated, please use scrapy.item.Item instead',
                  ScrapyDeprecationWarning, stacklevel=2)
         return super(DictItem, cls).__new__(cls, *args, **kwargs)

@@ -86,8 +96,7 @@ class DictItem(MutableMapping, BaseItem):
         if key in self.fields:
             self._values[key] = value
         else:
-            raise KeyError("%s does not support field: %s" %
-                           (self.__class__.__name__, key))
+            raise KeyError("%s does not support field: %s" % (self.__class__.__name__, key))

     def __delitem__(self, key):
         del self._values[key]
@@ -99,8 +108,7 @@ class DictItem(MutableMapping, BaseItem):

     def __setattr__(self, name, value):
         if not name.startswith('_'):
-            raise AttributeError("Use item[%r] = %r to set field value" %
-                                 (name, value))
+            raise AttributeError("Use item[%r] = %r to set field value" % (name, value))
         super(DictItem, self).__setattr__(name, value)

     def __len__(self):
@@ -121,12 +129,30 @@ class DictItem(MutableMapping, BaseItem):
         return self.__class__(self)

     def deepcopy(self):
-        """Return a `deep copy`_ of this item.
-
-        .. _deep copy: https://docs.python.org/library/copy.html#copy.deepcopy
+        """Return a :func:`~copy.deepcopy` of this item.
         """
         return deepcopy(self)


 class Item(DictItem, metaclass=ItemMeta):
-    pass
+    """
+    Base class for scraped items.
+
+    In Scrapy, an object is considered an ``item`` if it is an instance of either
+    :class:`Item` or :class:`dict`, or any subclass. For example, when the output of a
+    spider callback is evaluated, only instances of :class:`Item` or
+    :class:`dict` are passed to :ref:`item pipelines <topics-item-pipeline>`.
+
+    If you need instances of a custom class to be considered items by Scrapy,
+    you must inherit from either :class:`Item` or :class:`dict`.
+
+    Items must declare :class:`Field` attributes, which are processed and stored
+    in the ``fields`` attribute. This restricts the set of allowed field names
+    and prevents typos, raising ``KeyError`` when referring to undefined fields.
+    Additionally, fields can be used to define metadata and control the way
+    data is processed internally. Please refer to the :ref:`documentation
+    about fields <topics-items-fields>` for additional information.
+
+    Unlike instances of :class:`dict`, instances of :class:`Item` may be
+    :ref:`tracked <topics-leaks-trackrefs>` to debug memory leaks.
+    """
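The new Item docstring above describes the declared-fields contract; a minimal sketch of what it promises:

    import scrapy


    class Product(scrapy.Item):
        name = scrapy.Field()
        price = scrapy.Field(serializer=float)   # free-form metadata, stored in Product.fields


    product = Product(name="shoes", price="9.99")
    product["name"] = "boots"      # declared field: allowed
    try:
        product["colour"] = "red"  # undeclared field
    except KeyError as exc:
        print(exc)                 # 'Product does not support field: colour'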
@@ -45,8 +45,14 @@ IGNORED_EXTENSIONS = [


 _re_type = type(re.compile("", 0))
-_matches = lambda url, regexs: any(r.search(url) for r in regexs)
-_is_valid_url = lambda url: url.split('://', 1)[0] in {'http', 'https', 'file', 'ftp'}
+
+
+def _matches(url, regexs):
+    return any(r.search(url) for r in regexs)
+
+
+def _is_valid_url(url):
+    return url.split('://', 1)[0] in {'http', 'https', 'file', 'ftp'}


 class FilteringLinkExtractor:
@@ -55,8 +61,7 @@ class FilteringLinkExtractor:

     def __new__(cls, *args, **kwargs):
         from scrapy.linkextractors.lxmlhtml import LxmlLinkExtractor
-        if (issubclass(cls, FilteringLinkExtractor) and
-                not issubclass(cls, LxmlLinkExtractor)):
+        if issubclass(cls, FilteringLinkExtractor) and not issubclass(cls, LxmlLinkExtractor):
             warn('scrapy.linkextractors.FilteringLinkExtractor is deprecated, '
                  'please use scrapy.linkextractors.LinkExtractor instead',
                  ScrapyDeprecationWarning, stacklevel=2)
@@ -128,4 +133,4 @@ class FilteringLinkExtractor:


 # Top-level imports
-from scrapy.linkextractors.lxmlhtml import LxmlLinkExtractor as LinkExtractor  # noqa: F401
+from scrapy.linkextractors.lxmlhtml import LxmlLinkExtractor as LinkExtractor
@@ -1,6 +1,8 @@
 """
 Link extractor based on lxml.html
 """
+import operator
+from functools import partial
 from urllib.parse import urljoin

 import lxml.etree as etree
@@ -8,10 +10,10 @@ from w3lib.html import strip_html5_whitespace
 from w3lib.url import canonicalize_url, safe_url_string

 from scrapy.link import Link
+from scrapy.linkextractors import FilteringLinkExtractor
 from scrapy.utils.misc import arg_to_iter, rel_has_nofollow
 from scrapy.utils.python import unique as unique_list
 from scrapy.utils.response import get_base_url
-from scrapy.linkextractors import FilteringLinkExtractor


 # from lxml/src/lxml/html/__init__.py
@@ -27,19 +29,24 @@ def _nons(tag):
    return tag


+def _identity(x):
+    return x
+
+
+def _canonicalize_link_url(link):
+    return canonicalize_url(link.url, keep_fragments=True)
+
+
 class LxmlParserLinkExtractor:
-    def __init__(self, tag="a", attr="href", process=None, unique=False,
-                 strip=True, canonicalized=False):
-        self.scan_tag = tag if callable(tag) else lambda t: t == tag
-        self.scan_attr = attr if callable(attr) else lambda a: a == attr
-        self.process_attr = process if callable(process) else lambda v: v
+    def __init__(
+        self, tag="a", attr="href", process=None, unique=False, strip=True, canonicalized=False
+    ):
+        self.scan_tag = tag if callable(tag) else partial(operator.eq, tag)
+        self.scan_attr = attr if callable(attr) else partial(operator.eq, attr)
+        self.process_attr = process if callable(process) else _identity
         self.unique = unique
         self.strip = strip
-        if canonicalized:
-            self.link_key = lambda link: link.url
-        else:
-            self.link_key = lambda link: canonicalize_url(link.url,
-                                                          keep_fragments=True)
+        self.link_key = operator.attrgetter("url") if canonicalized else _canonicalize_link_url

     def _iter_links(self, document):
         for el in document.iter(etree.Element):
@@ -93,27 +100,44 @@ class LxmlParserLinkExtractor:

 class LxmlLinkExtractor(FilteringLinkExtractor):

-    def __init__(self, allow=(), deny=(), allow_domains=(), deny_domains=(), restrict_xpaths=(),
-                 tags=('a', 'area'), attrs=('href',), canonicalize=False,
-                 unique=True, process_value=None, deny_extensions=None, restrict_css=(),
-                 strip=True, restrict_text=None):
+    def __init__(
+        self,
+        allow=(),
+        deny=(),
+        allow_domains=(),
+        deny_domains=(),
+        restrict_xpaths=(),
+        tags=('a', 'area'),
+        attrs=('href',),
+        canonicalize=False,
+        unique=True,
+        process_value=None,
+        deny_extensions=None,
+        restrict_css=(),
+        strip=True,
+        restrict_text=None,
+    ):
         tags, attrs = set(arg_to_iter(tags)), set(arg_to_iter(attrs))
-        tag_func = lambda x: x in tags
-        attr_func = lambda x: x in attrs
         lx = LxmlParserLinkExtractor(
-            tag=tag_func,
-            attr=attr_func,
+            tag=partial(operator.contains, tags),
+            attr=partial(operator.contains, attrs),
             unique=unique,
             process=process_value,
             strip=strip,
             canonicalized=canonicalize
         )
-        super(LxmlLinkExtractor, self).__init__(lx, allow=allow, deny=deny,
-                                                allow_domains=allow_domains, deny_domains=deny_domains,
-                                                restrict_xpaths=restrict_xpaths, restrict_css=restrict_css,
-                                                canonicalize=canonicalize, deny_extensions=deny_extensions,
-                                                restrict_text=restrict_text)
+        super(LxmlLinkExtractor, self).__init__(
+            link_extractor=lx,
+            allow=allow,
+            deny=deny,
+            allow_domains=allow_domains,
+            deny_domains=deny_domains,
+            restrict_xpaths=restrict_xpaths,
+            restrict_css=restrict_css,
+            canonicalize=canonicalize,
+            deny_extensions=deny_extensions,
+            restrict_text=restrict_text,
+        )

     def extract_links(self, response):
         """Returns a list of :class:`~scrapy.link.Link` objects from the
@@ -126,9 +150,11 @@ class LxmlLinkExtractor(FilteringLinkExtractor):
         """
         base_url = get_base_url(response)
         if self.restrict_xpaths:
-            docs = [subdoc
-                    for x in self.restrict_xpaths
-                    for subdoc in response.xpath(x)]
+            docs = [
+                subdoc
+                for x in self.restrict_xpaths
+                for subdoc in response.xpath(x)
+            ]
         else:
             docs = [response.selector]
         all_links = []
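The reworked LxmlLinkExtractor signature above maps one-to-one onto the usual keyword-style usage; a hedged sketch with placeholder patterns (LinkExtractor is the public alias for LxmlLinkExtractor):

    from scrapy.http import HtmlResponse
    from scrapy.linkextractors import LinkExtractor

    response = HtmlResponse(
        url="https://example.com/",
        body=b'<a href="/docs/intro">Intro</a> <a href="/logout">Logout</a>',
        encoding="utf-8",
    )
    extractor = LinkExtractor(allow=(r"/docs/",), deny=(r"/logout",), unique=True)
    for link in extractor.extract_links(response):
        print(link.url, link.text)   # https://example.com/docs/intro Intro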
@@ -6,6 +6,8 @@ See documentation in docs/topics/loaders.rst
 from collections import defaultdict
 from contextlib import suppress

+from itemadapter import ItemAdapter
+
 from scrapy.item import Item
 from scrapy.loader.common import wrap_loader_context
 from scrapy.loader.processors import Identity
@@ -44,7 +46,7 @@ class ItemLoader:
         self._local_item = context['item'] = item
         self._local_values = defaultdict(list)
         # values from initial item
-        for field_name, value in item.items():
+        for field_name, value in ItemAdapter(item).items():
             self._values[field_name] += arg_to_iter(value)

     @property
@@ -127,13 +129,12 @@ class ItemLoader:
         return value

     def load_item(self):
-        item = self.item
+        adapter = ItemAdapter(self.item)
         for field_name in tuple(self._values):
             value = self.get_output_value(field_name)
             if value is not None:
-                item[field_name] = value
-        return item
+                adapter[field_name] = value
+        return adapter.item

     def get_output_value(self, field_name):
         proc = self.get_output_processor(field_name)
@@ -174,11 +175,8 @@ class ItemLoader:
                                                 value, type(e).__name__, str(e)))

     def _get_item_field_attr(self, field_name, key, default=None):
-        if isinstance(self.item, Item):
-            value = self.item.fields[field_name].get(key, default)
-        else:
-            value = default
-        return value
+        field_meta = ItemAdapter(self.item).get_field_meta(field_name)
+        return field_meta.get(key, default)

     def _check_selector_method(self):
         if self.selector is None:
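ItemLoader keeps its public API while delegating storage and field metadata to ItemAdapter in the hunks above; a hedged refresher of that API, using processors that are read through `_get_item_field_attr()`:

    from scrapy.item import Field, Item
    from scrapy.loader import ItemLoader
    from scrapy.loader.processors import MapCompose, TakeFirst


    class Product(Item):
        name = Field(input_processor=MapCompose(str.strip), output_processor=TakeFirst())


    loader = ItemLoader(item=Product())
    loader.add_value("name", "  shoes  ")
    product = loader.load_item()
    print(product["name"])   # 'shoes'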
@@ -28,8 +28,10 @@ def _to_bytes_or_none(text):


 class MailSender:
-    def __init__(self, smtphost='localhost', mailfrom='scrapy@localhost',
-                 smtpuser=None, smtppass=None, smtpport=25, smtptls=False, smtpssl=False, debug=False):
+    def __init__(
+        self, smtphost='localhost', mailfrom='scrapy@localhost', smtpuser=None,
+        smtppass=None, smtpport=25, smtptls=False, smtpssl=False, debug=False
+    ):
         self.smtphost = smtphost
         self.smtpport = smtpport
         self.smtpuser = _to_bytes_or_none(smtpuser)
@@ -41,9 +43,15 @@ class MailSender:

     @classmethod
     def from_settings(cls, settings):
-        return cls(settings['MAIL_HOST'], settings['MAIL_FROM'], settings['MAIL_USER'],
-                   settings['MAIL_PASS'], settings.getint('MAIL_PORT'),
-                   settings.getbool('MAIL_TLS'), settings.getbool('MAIL_SSL'))
+        return cls(
+            smtphost=settings['MAIL_HOST'],
+            mailfrom=settings['MAIL_FROM'],
+            smtpuser=settings['MAIL_USER'],
+            smtppass=settings['MAIL_PASS'],
+            smtpport=settings.getint('MAIL_PORT'),
+            smtptls=settings.getbool('MAIL_TLS'),
+            smtpssl=settings.getbool('MAIL_SSL'),
+        )

     def send(self, to, subject, body, cc=None, attachs=(), mimetype='text/plain', charset=None, _callback=None):
         from twisted.internet import reactor
@@ -89,9 +97,12 @@ class MailSender:
             return

         dfd = self._sendmail(rcpts, msg.as_string().encode(charset or 'utf-8'))
-        dfd.addCallbacks(self._sent_ok, self._sent_failed,
+        dfd.addCallbacks(
+            callback=self._sent_ok,
+            errback=self._sent_failed,
             callbackArgs=[to, cc, subject, len(attachs)],
-            errbackArgs=[to, cc, subject, len(attachs)])
+            errbackArgs=[to, cc, subject, len(attachs)],
+        )
         reactor.addSystemEventTrigger('before', 'shutdown', lambda: dfd)
         return dfd

@@ -115,9 +126,10 @@ class MailSender:
         from twisted.mail.smtp import ESMTPSenderFactory
         msg = BytesIO(msg)
         d = defer.Deferred()
-        factory = ESMTPSenderFactory(self.smtpuser, self.smtppass, self.mailfrom, \
-            to_addrs, msg, d, heloFallback=True, requireAuthentication=False, \
-            requireTransportSecurity=self.smtptls)
+        factory = ESMTPSenderFactory(
+            self.smtpuser, self.smtppass, self.mailfrom, to_addrs, msg, d,
+            heloFallback=True, requireAuthentication=False, requireTransportSecurity=self.smtptls,
+        )
         factory.noisy = False

         if self.smtpssl:
@@ -10,24 +10,26 @@ import mimetypes
 import os
 import time
 from collections import defaultdict
-from email.utils import parsedate_tz, mktime_tz
+from contextlib import suppress
+from email.utils import mktime_tz, parsedate_tz
 from ftplib import FTP
 from io import BytesIO
 from urllib.parse import urlparse

+from itemadapter import ItemAdapter
 from twisted.internet import defer, threads

+from scrapy.exceptions import IgnoreRequest, NotConfigured
+from scrapy.http import Request
 from scrapy.pipelines.media import MediaPipeline
 from scrapy.settings import Settings
-from scrapy.exceptions import NotConfigured, IgnoreRequest
-from scrapy.http import Request
-from scrapy.utils.misc import md5sum
-from scrapy.utils.log import failure_to_exc_info
-from scrapy.utils.python import to_bytes
-from scrapy.utils.request import referer_str
 from scrapy.utils.boto import is_botocore
 from scrapy.utils.datatypes import CaselessDict
 from scrapy.utils.ftp import ftp_store_file
+from scrapy.utils.log import failure_to_exc_info
+from scrapy.utils.misc import md5sum
+from scrapy.utils.python import to_bytes
+from scrapy.utils.request import referer_str


 logger = logging.getLogger(__name__)
@@ -83,8 +85,7 @@ class S3FilesStore:
     AWS_USE_SSL = None
     AWS_VERIFY = None

-    POLICY = 'private'  # Overriden from settings.FILES_STORE_S3_ACL in
-                        # FilesPipeline.from_settings.
+    POLICY = 'private'  # Overriden from settings.FILES_STORE_S3_ACL in FilesPipeline.from_settings
     HEADERS = {
         'Cache-Control': 'max-age=172800',
     }
@@ -106,7 +107,8 @@ class S3FilesStore:
         else:
             from boto.s3.connection import S3Connection
             self.S3Connection = S3Connection
-        assert uri.startswith('s3://')
+        if not uri.startswith("s3://"):
+            raise ValueError("Incorrect URI scheme in %s, expected 's3'" % uri)
         self.bucket, self.prefix = uri[5:].split('/', 1)

     def stat_file(self, path, info):
@@ -229,6 +231,20 @@ class GCSFilesStore:
         bucket, prefix = uri[5:].split('/', 1)
         self.bucket = client.bucket(bucket)
         self.prefix = prefix
+        permissions = self.bucket.test_iam_permissions(
+            ['storage.objects.get', 'storage.objects.create']
+        )
+        if 'storage.objects.get' not in permissions:
+            logger.warning(
+                "No 'storage.objects.get' permission for GSC bucket %(bucket)s. "
+                "Checking if files are up to date will be impossible. Files will be downloaded every time.",
+                {'bucket': bucket}
+            )
+        if 'storage.objects.create' not in permissions:
+            logger.error(
+                "No 'storage.objects.create' permission for GSC bucket %(bucket)s. Saving files will be impossible!",
+                {'bucket': bucket}
+            )

     def stat_file(self, path, info):
         def _onsuccess(blob):
@@ -266,7 +282,8 @@ class FTPFilesStore:
     USE_ACTIVE_MODE = None

     def __init__(self, uri):
-        assert uri.startswith('ftp://')
+        if not uri.startswith("ftp://"):
+            raise ValueError("Incorrect URI scheme in %s, expected 'ftp'" % uri)
         u = urlparse(uri)
         self.port = u.port
         self.host = u.hostname
@@ -417,7 +434,7 @@ class FilesPipeline(MediaPipeline):
             self.inc_stats(info.spider, 'uptodate')

             checksum = result.get('checksum', None)
-            return {'url': request.url, 'path': path, 'checksum': checksum}
+            return {'url': request.url, 'path': path, 'checksum': checksum, 'status': 'uptodate'}

         path = self.file_path(request, info=info)
         dfd = defer.maybeDeferred(self.store.stat_file, path, info)
@@ -494,15 +511,16 @@ class FilesPipeline(MediaPipeline):
             )
             raise FileException(str(exc))

-        return {'url': request.url, 'path': path, 'checksum': checksum}
+        return {'url': request.url, 'path': path, 'checksum': checksum, 'status': status}

     def inc_stats(self, spider, status):
         spider.crawler.stats.inc_value('file_count', spider=spider)
         spider.crawler.stats.inc_value('file_status_count/%s' % status, spider=spider)

-    ### Overridable Interface
+    # Overridable Interface
     def get_media_requests(self, item, info):
-        return [Request(x) for x in item.get(self.files_urls_field, [])]
+        urls = ItemAdapter(item).get(self.files_urls_field, [])
+        return [Request(u) for u in urls]

     def file_downloaded(self, response, request, info):
         path = self.file_path(request, response=response, info=info)
|
|||||||
return checksum
|
return checksum
|
||||||
|
|
||||||
def item_completed(self, results, item, info):
|
def item_completed(self, results, item, info):
|
||||||
if isinstance(item, dict) or self.files_result_field in item.fields:
|
with suppress(KeyError):
|
||||||
item[self.files_result_field] = [x for ok, x in results if ok]
|
ItemAdapter(item)[self.files_result_field] = [x for ok, x in results if ok]
|
||||||
return item
|
return item
|
||||||
|
|
||||||
def file_path(self, request, response=None, info=None):
|
def file_path(self, request, response=None, info=None):
|
||||||
|
@@ -5,17 +5,19 @@ See documentation in topics/media-pipeline.rst
 """
 import functools
 import hashlib
+from contextlib import suppress
 from io import BytesIO

+from itemadapter import ItemAdapter
 from PIL import Image

+from scrapy.exceptions import DropItem
+from scrapy.http import Request
+from scrapy.pipelines.files import FileException, FilesPipeline
+# TODO: from scrapy.pipelines.media import MediaPipeline
+from scrapy.settings import Settings
 from scrapy.utils.misc import md5sum
 from scrapy.utils.python import to_bytes
-from scrapy.http import Request
-from scrapy.settings import Settings
-from scrapy.exceptions import DropItem
-#TODO: from scrapy.pipelines.media import MediaPipeline
-from scrapy.pipelines.files import FileException, FilesPipeline


 class NoimagesDrop(DropItem):
@@ -157,11 +159,12 @@ class ImagesPipeline(FilesPipeline):
         return image, buf

     def get_media_requests(self, item, info):
-        return [Request(x) for x in item.get(self.images_urls_field, [])]
+        urls = ItemAdapter(item).get(self.images_urls_field, [])
+        return [Request(u) for u in urls]

     def item_completed(self, results, item, info):
-        if isinstance(item, dict) or self.images_result_field in item.fields:
-            item[self.images_result_field] = [x for ok, x in results if ok]
+        with suppress(KeyError):
+            ItemAdapter(item)[self.images_result_field] = [x for ok, x in results if ok]
         return item

     def file_path(self, request, response=None, info=None):
@@ -1,7 +1,7 @@
 import functools
 import logging
 from collections import defaultdict
-from twisted.internet.defer import Deferred, DeferredList, _DefGen_Return
+from twisted.internet.defer import Deferred, DeferredList
 from twisted.python.failure import Failure

 from scrapy.settings import Settings
@@ -43,8 +43,7 @@ class MediaPipeline:
         if allow_redirects:
             self.handle_httpstatus_list = SequenceExclude(range(300, 400))

-    def _key_for_pipe(self, key, base_class_name=None,
-                      settings=None):
+    def _key_for_pipe(self, key, base_class_name=None, settings=None):
         """
         >>> MediaPipeline()._key_for_pipe("IMAGES")
         'IMAGES'
@@ -55,8 +54,11 @@ class MediaPipeline:
         """
         class_name = self.__class__.__name__
         formatted_key = "{}_{}".format(class_name.upper(), key)
-        if class_name == base_class_name or not base_class_name \
-                or (settings and not settings.get(formatted_key)):
+        if (
+            not base_class_name
+            or class_name == base_class_name
+            or settings and not settings.get(formatted_key)
+        ):
             return key
         return formatted_key

@@ -141,24 +143,26 @@ class MediaPipeline:
             # This code fixes a memory leak by avoiding to keep references to
             # the Request and Response objects on the Media Pipeline cache.
             #
-            # Twisted inline callbacks pass return values using the function
-            # twisted.internet.defer.returnValue, which encapsulates the return
-            # value inside a _DefGen_Return base exception.
-            #
-            # What happens when the media_downloaded callback raises another
+            # What happens when the media_downloaded callback raises an
             # exception, for example a FileException('download-error') when
-            # the Response status code is not 200 OK, is that it stores the
-            # _DefGen_Return exception on the FileException context.
+            # the Response status code is not 200 OK, is that the original
+            # StopIteration exception (which in turn contains the failed
+            # Response and by extension, the original Request) gets encapsulated
+            # within the FileException context.
+            #
+            # Originally, Scrapy was using twisted.internet.defer.returnValue
+            # inside functions decorated with twisted.internet.defer.inlineCallbacks,
+            # encapsulating the returned Response in a _DefGen_Return exception
+            # instead of a StopIteration.
             #
             # To avoid keeping references to the Response and therefore Request
             # objects on the Media Pipeline cache, we should wipe the context of
-            # the exception encapsulated by the Twisted Failure when its a
-            # _DefGen_Return instance.
+            # the encapsulated exception when it is a StopIteration instance
             #
             # This problem does not occur in Python 2.7 since we don't have
             # Exception Chaining (https://www.python.org/dev/peps/pep-3134/).
             context = getattr(result.value, '__context__', None)
-            if isinstance(context, _DefGen_Return):
+            if isinstance(context, StopIteration):
                 setattr(result.value, '__context__', None)

             info.downloading.remove(fp)
|
|||||||
for wad in info.waiting.pop(fp):
|
for wad in info.waiting.pop(fp):
|
||||||
defer_result(result).chainDeferred(wad)
|
defer_result(result).chainDeferred(wad)
|
||||||
|
|
||||||
### Overridable Interface
|
# Overridable Interface
|
||||||
def media_to_download(self, request, info):
|
def media_to_download(self, request, info):
|
||||||
"""Check request before starting download"""
|
"""Check request before starting download"""
|
||||||
pass
|
pass
|
||||||
|
@@ -58,9 +58,9 @@ class ResponseTypes:

     def from_content_disposition(self, content_disposition):
         try:
-            filename = to_unicode(content_disposition,
-                                  encoding='latin-1', errors='replace').split(';')[1].split('=')[1]
-            filename = filename.strip('"\'')
+            filename = to_unicode(
+                content_disposition, encoding='latin-1', errors='replace'
+            ).split(';')[1].split('=')[1].strip('"\'')
             return self.from_filename(filename)
         except IndexError:
             return Response
@@ -71,7 +71,7 @@ class ResponseTypes:
         cls = Response
         if b'Content-Type' in headers:
             cls = self.from_content_type(
-                content_type=headers[b'Content-type'],
+                content_type=headers[b'Content-Type'],
                 content_encoding=headers.get(b'Content-Encoding')
             )
         if cls is Response and b'Content-Disposition' in headers:
@@ -17,10 +17,12 @@ def decode_robotstxt(robotstxt_body, spider, to_native_str_type=False):
     except UnicodeDecodeError:
         # If we found garbage or robots.txt in an encoding other than UTF-8, disregard it.
         # Switch to 'allow all' state.
-        logger.warning("Failure while parsing robots.txt. "
-                       "File either contains garbage or is in an encoding other than UTF-8, treating it as an empty file.",
-                       exc_info=sys.exc_info(),
-                       extra={'spider': spider})
+        logger.warning(
+            "Failure while parsing robots.txt. File either contains garbage or "
+            "is in an encoding other than UTF-8, treating it as an empty file.",
+            exc_info=sys.exc_info(),
+            extra={'spider': spider},
+        )
         robotstxt_body = ''
     return robotstxt_body

@@ -1,4 +1,6 @@
 """
 Selectors
 """
-from scrapy.selector.unified import *  # noqa: F401
+
+# top-level imports
+from scrapy.selector.unified import Selector, SelectorList
@@ -65,9 +65,9 @@ class Selector(_ParselSelector, object_ref):
     selectorlist_cls = SelectorList

     def __init__(self, response=None, text=None, type=None, root=None, **kwargs):
-        if not(response is None or text is None):
+        if response is not None and text is not None:
             raise ValueError('%s.__init__() received both response and text'
                              % self.__class__.__name__)

         st = _st(response, type or self._default_type)
|
Some files were not shown because too many files have changed in this diff Show More
Loading…
x
Reference in New Issue
Block a user