
Merge branch 'master' into response_ip_address

Eugenio Lacuesta 2020-03-03 13:53:43 -03:00
commit 3aa5eab993
50 changed files with 934 additions and 133 deletions

View File

@ -1,8 +1,7 @@
[bumpversion]
current_version = 1.8.0
current_version = 2.0.0
commit = True
tag = True
tag_name = {new_version}
[bumpversion:file:scrapy/VERSION]

View File

@ -1,9 +1,11 @@
version: 2
sphinx:
configuration: docs/conf.py
fail_on_warning: true
python:
# For available versions, see:
# https://docs.readthedocs.io/en/stable/config-file/v2.html#build-image
version: 3.7 # Keep in sync with .travis.yml
install:
- requirements: docs/requirements.txt
- path: .

View File

@ -41,7 +41,7 @@ Requirements
============
* Python 3.5+
* Works on Linux, Windows, Mac OSX, BSD
* Works on Linux, Windows, macOS, BSD
Install
=======

View File

@ -281,6 +281,7 @@ coverage_ignore_pyobjects = [
intersphinx_mapping = {
'coverage': ('https://coverage.readthedocs.io/en/stable', None),
'cssselect': ('https://cssselect.readthedocs.io/en/latest', None),
'pytest': ('https://docs.pytest.org/en/latest', None),
'python': ('https://docs.python.org/3', None),
'sphinx': ('https://www.sphinx-doc.org/en/master', None),

View File

@ -143,7 +143,7 @@ by running ``git fetch upstream pull/$PR_NUMBER/head:$BRANCH_NAME_TO_CREATE``
(replace 'upstream' with a remote name for scrapy repository,
``$PR_NUMBER`` with an ID of the pull request, and ``$BRANCH_NAME_TO_CREATE``
with a name of the branch you want to create locally).
See also: https://help.github.com/articles/checking-out-pull-requests-locally/#modifying-an-inactive-pull-request-locally.
See also: https://help.github.com/en/github/collaborating-with-issues-and-pull-requests/checking-out-pull-requests-locally#modifying-an-inactive-pull-request-locally.
When writing GitHub pull requests, try to keep titles short but descriptive.
E.g. For bug #411: "Scrapy hangs if an exception raises in start_requests"
@ -168,7 +168,7 @@ Scrapy:
* Don't put your name in the code you contribute; git provides enough
metadata to identify author of the code.
See https://help.github.com/articles/setting-your-username-in-git/ for
See https://help.github.com/en/github/using-git/setting-your-username-in-git for
setup instructions.
.. _documentation-policies:
@ -266,5 +266,5 @@ And their unit-tests are in::
.. _tests/: https://github.com/scrapy/scrapy/tree/master/tests
.. _open issues: https://github.com/scrapy/scrapy/issues
.. _PEP 257: https://www.python.org/dev/peps/pep-0257/
.. _pull request: https://help.github.com/en/articles/creating-a-pull-request
.. _pull request: https://help.github.com/en/github/collaborating-with-issues-and-pull-requests/creating-a-pull-request
.. _pytest-xdist: https://github.com/pytest-dev/pytest-xdist

View File

@ -22,8 +22,8 @@ In other words, comparing `BeautifulSoup`_ (or `lxml`_) to Scrapy is like
comparing `jinja2`_ to `Django`_.
.. _BeautifulSoup: https://www.crummy.com/software/BeautifulSoup/
.. _lxml: http://lxml.de/
.. _jinja2: http://jinja.pocoo.org/
.. _lxml: https://lxml.de/
.. _jinja2: https://palletsprojects.com/p/jinja/
.. _Django: https://www.djangoproject.com/
Can I use Scrapy with BeautifulSoup?
@ -269,7 +269,7 @@ The ``__VIEWSTATE`` parameter is used in sites built with ASP.NET/VB.NET. For
more info on how it works see `this page`_. Also, here's an `example spider`_
which scrapes one of these sites.
.. _this page: http://search.cpan.org/~ecarroll/HTML-TreeBuilderX-ASP_NET-0.09/lib/HTML/TreeBuilderX/ASP_NET.pm
.. _this page: https://metacpan.org/pod/release/ECARROLL/HTML-TreeBuilderX-ASP_NET-0.09/lib/HTML/TreeBuilderX/ASP_NET.pm
.. _example spider: https://github.com/AmbientLighter/rpn-fas/blob/master/fas/spiders/rnp.py
What's the best way to parse big XML/CSV data feeds?

View File

@ -165,6 +165,8 @@ Solving specific problems
topics/autothrottle
topics/benchmarking
topics/jobs
topics/coroutines
topics/asyncio
:doc:`faq`
Get answers to most frequently asked questions.
@ -205,6 +207,12 @@ Solving specific problems
:doc:`topics/jobs`
Learn how to pause and resume crawls for large spiders.
:doc:`topics/coroutines`
Use the :ref:`coroutine syntax <async>`.
:doc:`topics/asyncio`
Use :mod:`asyncio` and :mod:`asyncio`-powered libraries.
.. _extending-scrapy:
Extending Scrapy

View File

@ -7,12 +7,12 @@ Installation guide
Installing Scrapy
=================
Scrapy runs on Python 3.5 or above
under CPython (default Python implementation) and PyPy (starting with PyPy 5.9).
Scrapy runs on Python 3.5 or above under CPython (default Python
implementation) and PyPy (starting with PyPy 5.9).
If you're using `Anaconda`_ or `Miniconda`_, you can install the package from
the `conda-forge`_ channel, which has up-to-date packages for Linux, Windows
and OS X.
and macOS.
To install Scrapy using ``conda``, run::
@ -65,7 +65,7 @@ please refer to their respective installation instructions:
* `lxml installation`_
* `cryptography installation`_
.. _lxml installation: http://lxml.de/installation.html
.. _lxml installation: https://lxml.de/installation.html
.. _cryptography installation: https://cryptography.io/en/latest/installation/
@ -148,11 +148,11 @@ you can install Scrapy with ``pip`` after that::
.. _intro-install-macos:
Mac OS X
--------
macOS
-----
Building Scrapy's dependencies requires the presence of a C compiler and
development headers. On OS X this is typically provided by Apples Xcode
development headers. On macOS this is typically provided by Apple's Xcode
development tools. To install the Xcode command line tools open a terminal
window and run::
@ -191,7 +191,7 @@ solutions:
* *(Optional)* :ref:`Install Scrapy inside a Python virtual environment
<intro-using-virtualenv>`.
This method is a workaround for the above OS X issue, but it's an overall
This method is a workaround for the above macOS issue, but it's an overall
good practice for managing dependencies and can complement the first method.
After any of these workarounds you should be able to install Scrapy::
@ -207,7 +207,7 @@ For PyPy3, only Linux installation was tested.
Most Scrapy dependencies now have binary wheels for CPython, but not for PyPy.
This means that these dependencies will be built during installation.
On OS X, you are likely to face an issue with building Cryptography dependency,
On macOS, you are likely to face an issue with building the Cryptography dependency;
the solution to this problem is described
`here <https://github.com/pyca/cryptography/issues/2692#issuecomment-272773481>`_,
that is to ``brew install openssl`` and then export the flags that this command
@ -253,11 +253,11 @@ For details, see `Issue #2473 <https://github.com/scrapy/scrapy/issues/2473>`_.
.. _Python: https://www.python.org/
.. _pip: https://pip.pypa.io/en/latest/installing/
.. _lxml: https://lxml.de/index.html
.. _parsel: https://pypi.python.org/pypi/parsel
.. _w3lib: https://pypi.python.org/pypi/w3lib
.. _twisted: https://twistedmatrix.com/
.. _cryptography: https://cryptography.io/
.. _pyOpenSSL: https://pypi.python.org/pypi/pyOpenSSL
.. _parsel: https://pypi.org/project/parsel/
.. _w3lib: https://pypi.org/project/w3lib/
.. _twisted: https://twistedmatrix.com/trac/
.. _cryptography: https://cryptography.io/en/latest/
.. _pyOpenSSL: https://pypi.org/project/pyOpenSSL/
.. _setuptools: https://pypi.python.org/pypi/setuptools
.. _AUR Scrapy package: https://aur.archlinux.org/packages/scrapy/
.. _homebrew: https://brew.sh/

View File

@ -306,7 +306,7 @@ with a selector (see :ref:`topics-developer-tools`).
visually selected elements, which works in many browsers.
.. _regular expressions: https://docs.python.org/3/library/re.html
.. _Selector Gadget: http://selectorgadget.com/
.. _Selector Gadget: https://selectorgadget.com/
XPath: a brief intro
@ -337,7 +337,7 @@ recommend `this tutorial to learn XPath through examples
<http://zvon.org/comp/r/tut-XPath_1.html>`_, and `this tutorial to learn "how
to think in XPath" <http://plasmasturm.org/log/xpath101/>`_.
.. _XPath: https://www.w3.org/TR/xpath
.. _XPath: https://www.w3.org/TR/xpath/all/
.. _CSS: https://www.w3.org/TR/selectors
Extracting quotes and authors

View File

@ -3,8 +3,452 @@
Release notes
=============
.. note:: Scrapy 1.x will be the last series supporting Python 2. Scrapy 2.0,
planned for Q4 2019 or Q1 2020, will support **Python 3 only**.
.. _release-2.0.0:
Scrapy 2.0.0 (2020-03-03)
-------------------------
Highlights:
* Python 2 support has been removed
* :doc:`Partial <topics/coroutines>` :ref:`coroutine syntax <async>` support
and :doc:`experimental <topics/asyncio>` :mod:`asyncio` support
* New :meth:`Response.follow_all <scrapy.http.Response.follow_all>` method
* :ref:`FTP support <media-pipeline-ftp>` for media pipelines
* New :attr:`Response.certificate <scrapy.http.Response.certificate>`
attribute
* IPv6 support through :setting:`DNS_RESOLVER`
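A minimal sketch combining two of these highlights, the coroutine syntax and
:meth:`Response.follow_all <scrapy.http.Response.follow_all>`; the spider,
selectors and URLs below are hypothetical::

    import scrapy

    class BooksSpider(scrapy.Spider):
        # Hypothetical spider, used only to illustrate the highlights above
        name = 'books'
        start_urls = ['http://books.toscrape.com/']

        async def parse(self, response):
            # Partial coroutine support: callbacks may be ``async def``
            # (using ``yield`` inside them requires Python 3.6+)
            for book in response.css('article.product_pod'):
                yield {'title': book.css('h3 a::attr(title)').get()}
            # follow_all() accepts an iterable of URLs (or, on text responses,
            # a css/xpath expression) and returns an iterable of requests
            for request in response.follow_all(css='li.next a', callback=self.parse):
                yield request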
Backward-incompatible changes
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
* Python 2 support has been removed, following `Python 2 end-of-life on
January 1, 2020`_ (:issue:`4091`, :issue:`4114`, :issue:`4115`,
:issue:`4121`, :issue:`4138`, :issue:`4231`, :issue:`4242`, :issue:`4304`,
:issue:`4309`, :issue:`4373`)
* Retry gaveups (see :setting:`RETRY_TIMES`) are now logged as errors instead
of as debug information (:issue:`3171`, :issue:`3566`)
* File extensions that
:class:`LinkExtractor <scrapy.linkextractors.lxmlhtml.LxmlLinkExtractor>`
ignores by default now also include ``7z``, ``7zip``, ``apk``, ``bz2``,
``cdr``, ``dmg``, ``ico``, ``iso``, ``tar``, ``tar.gz``, ``webm``, and
``xz`` (:issue:`1837`, :issue:`2067`, :issue:`4066`)
* The :setting:`METAREFRESH_IGNORE_TAGS` setting is now an empty list by
default, following web browser behavior (:issue:`3844`, :issue:`4311`)
* The
:class:`~scrapy.downloadermiddlewares.httpcompression.HttpCompressionMiddleware`
now includes spaces after commas in the value of the ``Accept-Encoding``
header that it sets, following web browser behavior (:issue:`4293`)
* The ``__init__`` method of custom download handlers (see
:setting:`DOWNLOAD_HANDLERS`) or subclasses of the following downloader
handlers no longer receives a ``settings`` parameter:
* :class:`scrapy.core.downloader.handlers.datauri.DataURIDownloadHandler`
* :class:`scrapy.core.downloader.handlers.file.FileDownloadHandler`
Use the ``from_settings`` or ``from_crawler`` class methods to expose such
a parameter to your custom download handlers.
(:issue:`4126`)
* We have refactored the :class:`scrapy.core.scheduler.Scheduler` class and
related queue classes (see :setting:`SCHEDULER_PRIORITY_QUEUE`,
:setting:`SCHEDULER_DISK_QUEUE` and :setting:`SCHEDULER_MEMORY_QUEUE`) to
make it easier to implement custom scheduler queue classes. See
:ref:`2-0-0-scheduler-queue-changes` below for details.
* Overridden settings are now logged in a different format. This is more in
line with similar information logged at startup (:issue:`4199`)
.. _Python 2 end-of-life on January 1, 2020: https://www.python.org/doc/sunset-python-2/
Deprecation removals
~~~~~~~~~~~~~~~~~~~~
* The :ref:`Scrapy shell <topics-shell>` no longer provides a ``sel`` proxy
object; use :meth:`response.selector <scrapy.http.Response.selector>`
instead (:issue:`4347`)
* LevelDB support has been removed (:issue:`4112`)
* The following functions have been removed from :mod:`scrapy.utils.python`:
``isbinarytext``, ``is_writable``, ``setattr_default``, ``stringify_dict``
(:issue:`4362`)
Deprecations
~~~~~~~~~~~~
* Using environment variables prefixed with ``SCRAPY_`` to override settings
is deprecated (:issue:`4300`, :issue:`4374`, :issue:`4375`)
* :class:`scrapy.linkextractors.FilteringLinkExtractor` is deprecated, use
:class:`scrapy.linkextractors.LinkExtractor
<scrapy.linkextractors.lxmlhtml.LxmlLinkExtractor>` instead (:issue:`4045`)
* The ``noconnect`` query string argument of proxy URLs is deprecated and
should be removed from proxy URLs (:issue:`4198`)
* The :meth:`next <scrapy.utils.python.MutableChain.next>` method of
:class:`scrapy.utils.python.MutableChain` is deprecated, use the global
:func:`next` function or :meth:`MutableChain.__next__
<scrapy.utils.python.MutableChain.__next__>` instead (:issue:`4153`)
New features
~~~~~~~~~~~~
* Added :doc:`partial support <topics/coroutines>` for Python's
:ref:`coroutine syntax <async>` and :doc:`experimental support
<topics/asyncio>` for :mod:`asyncio` and :mod:`asyncio`-powered libraries
(:issue:`4010`, :issue:`4259`, :issue:`4269`, :issue:`4270`, :issue:`4271`,
:issue:`4316`, :issue:`4318`)
* The new :meth:`Response.follow_all <scrapy.http.Response.follow_all>`
method offers the same functionality as
:meth:`Response.follow <scrapy.http.Response.follow>` but supports an
iterable of URLs as input and returns an iterable of requests
(:issue:`2582`, :issue:`4057`, :issue:`4286`)
* :ref:`Media pipelines <topics-media-pipeline>` now support :ref:`FTP
storage <media-pipeline-ftp>` (:issue:`3928`, :issue:`3961`)
* The new :attr:`Response.certificate <scrapy.http.Response.certificate>`
attribute exposes the SSL certificate of the server as a
:class:`twisted.internet.ssl.Certificate` object for HTTPS responses
(:issue:`2726`, :issue:`4054`)
* A new :setting:`DNS_RESOLVER` setting allows enabling IPv6 support
(:issue:`1031`, :issue:`4227`)
* A new :setting:`SCRAPER_SLOT_MAX_ACTIVE_SIZE` setting allows configuring
the existing soft limit that pauses request downloads when the total
response data being processed is too high (:issue:`1410`, :issue:`3551`)
* A new :setting:`TWISTED_REACTOR` setting allows customizing the
:mod:`~twisted.internet.reactor` that Scrapy uses, making it possible to
:doc:`enable asyncio support <topics/asyncio>` or deal with a
:ref:`common macOS issue <faq-specific-reactor>` (:issue:`2905`,
:issue:`4294`)
* Scheduler disk and memory queues may now use the class methods
``from_crawler`` or ``from_settings`` (:issue:`3884`)
* The new :attr:`Response.cb_kwargs <scrapy.http.Response.cb_kwargs>`
attribute serves as a shortcut for :attr:`Response.request.cb_kwargs
<scrapy.http.Request.cb_kwargs>` (:issue:`4331`)
* :meth:`Response.follow <scrapy.http.Response.follow>` now supports a
``flags`` parameter, for consistency with :class:`~scrapy.http.Request`
(:issue:`4277`, :issue:`4279`)
* :ref:`Item loader processors <topics-loaders-processors>` can now be
regular functions; they no longer need to be methods (:issue:`3899`)
* :class:`~scrapy.spiders.Rule` now accepts an ``errback`` parameter
(:issue:`4000`)
* :class:`~scrapy.http.Request` no longer requires a ``callback`` parameter
when an ``errback`` parameter is specified (:issue:`3586`, :issue:`4008`)
* :class:`~scrapy.logformatter.LogFormatter` now supports some additional
methods:
* :class:`~scrapy.logformatter.LogFormatter.download_error` for
download errors
* :class:`~scrapy.logformatter.LogFormatter.item_error` for exceptions
raised during item processing by :ref:`item pipelines
<topics-item-pipeline>`
* :class:`~scrapy.logformatter.LogFormatter.spider_error` for exceptions
raised from :ref:`spider callbacks <topics-spiders>`
(:issue:`374`, :issue:`3986`, :issue:`3989`, :issue:`4176`, :issue:`4188`)
* The :setting:`FEED_URI` setting now supports :class:`pathlib.Path` values
(:issue:`3731`, :issue:`4074`)
* A new :signal:`request_left_downloader` signal is sent when a request
leaves the downloader (:issue:`4303`)
* Scrapy logs a warning when it detects a request callback or errback that
uses ``yield`` but also returns a value, since the returned value would be
lost (:issue:`3484`, :issue:`3869`)
* :class:`~scrapy.spiders.Spider` objects now raise an :exc:`AttributeError`
exception if they do not have a :class:`~scrapy.spiders.Spider.start_urls`
attribute nor reimplement :class:`~scrapy.spiders.Spider.start_requests`,
but have a ``start_url`` attribute (:issue:`4133`, :issue:`4170`)
* :class:`~scrapy.exporters.BaseItemExporter` subclasses may now use
``super().__init__(**kwargs)`` instead of ``self._configure(kwargs)`` in
their ``__init__`` method, passing ``dont_fail=True`` to the parent
``__init__`` method if needed, and accessing ``kwargs`` at ``self._kwargs``
after calling their parent ``__init__`` method (:issue:`4193`,
:issue:`4370`)
* A new ``keep_fragments`` parameter of
:func:`scrapy.utils.request.request_fingerprint` allows generating
different fingerprints for requests with different fragments in their URL
(:issue:`4104`); see the sketch after this list
* Download handlers (see :setting:`DOWNLOAD_HANDLERS`) may now use the
``from_settings`` and ``from_crawler`` class methods that other Scrapy
components already supported (:issue:`4126`)
* :class:`scrapy.utils.python.MutableChain.__iter__` now returns ``self``,
`allowing it to be used as a sequence <https://lgtm.com/rules/4850080/>`_
(:issue:`4153`)
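As a small, hedged illustration of the new ``keep_fragments`` parameter
mentioned above (the URLs are made up)::

    from scrapy import Request
    from scrapy.utils.request import request_fingerprint

    a = Request('https://example.com/page#section-1')
    b = Request('https://example.com/page#section-2')

    # By default fragments are stripped, so both requests share a fingerprint
    assert request_fingerprint(a) == request_fingerprint(b)

    # With keep_fragments=True (new in 2.0) the fragments are taken into account
    assert request_fingerprint(a, keep_fragments=True) != request_fingerprint(b, keep_fragments=True)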
Bug fixes
~~~~~~~~~
* The :command:`crawl` command now also exits with exit code 1 when an
exception happens before the crawling starts (:issue:`4175`, :issue:`4207`)
* :class:`LinkExtractor.extract_links
<scrapy.linkextractors.lxmlhtml.LxmlLinkExtractor.extract_links>` no longer
re-encodes the query string or URLs from non-UTF-8 responses in UTF-8
(:issue:`998`, :issue:`1403`, :issue:`1949`, :issue:`4321`)
* The first spider middleware (see :setting:`SPIDER_MIDDLEWARES`) now also
processes exceptions raised from callbacks that are generators
(:issue:`4260`, :issue:`4272`)
* Redirects to URLs starting with 3 slashes (``///``) are now supported
(:issue:`4032`, :issue:`4042`)
* :class:`~scrapy.http.Request` no longer accepts strings as ``url`` simply
because they have a colon (:issue:`2552`, :issue:`4094`)
* The correct encoding is now used for attachment names in
:class:`~scrapy.mail.MailSender` (:issue:`4229`, :issue:`4239`)
* :class:`~scrapy.dupefilters.RFPDupeFilter`, the default
:setting:`DUPEFILTER_CLASS`, no longer writes an extra ``\r`` character on
each line in Windows, which made the size of the ``requests.seen`` file
unnecessarily large on that platform (:issue:`4283`)
* Z shell auto-completion now looks for ``.html`` files, not ``.http`` files,
and covers the ``-h`` command-line switch (:issue:`4122`, :issue:`4291`)
* Adding items to a :class:`scrapy.utils.datatypes.LocalCache` object
without a ``limit`` defined no longer raises a :exc:`TypeError` exception
(:issue:`4123`)
* Fixed a typo in the message of the :exc:`ValueError` exception raised when
:func:`scrapy.utils.misc.create_instance` gets both ``settings`` and
``crawler`` set to ``None`` (:issue:`4128`)
Documentation
~~~~~~~~~~~~~
* API documentation now links to an online, syntax-highlighted view of the
corresponding source code (:issue:`4148`)
* Links to non-existent documentation pages now allow access to the sidebar
(:issue:`4152`, :issue:`4169`)
* Cross-references within our documentation now display a tooltip when
hovered (:issue:`4173`, :issue:`4183`)
* Improved the documentation about :meth:`LinkExtractor.extract_links
<scrapy.linkextractors.lxmlhtml.LxmlLinkExtractor.extract_links>` and
simplified :ref:`topics-link-extractors` (:issue:`4045`)
* Clarified how :class:`ItemLoader.item <scrapy.loader.ItemLoader.item>`
works (:issue:`3574`, :issue:`4099`)
* Clarified that :func:`logging.basicConfig` should not be used when also
using :class:`~scrapy.crawler.CrawlerProcess` (:issue:`2149`,
:issue:`2352`, :issue:`3146`, :issue:`3960`)
* Clarified the requirements for :class:`~scrapy.http.Request` objects
:ref:`when using persistence <request-serialization>` (:issue:`4124`,
:issue:`4139`)
* Clarified how to install a :ref:`custom image pipeline
<media-pipeline-example>` (:issue:`4034`, :issue:`4252`)
* Fixed the signatures of the ``file_path`` method in :ref:`media pipeline
<topics-media-pipeline>` examples (:issue:`4290`)
* Covered a backward-incompatible change in Scrapy 1.7.0 affecting custom
:class:`scrapy.core.scheduler.Scheduler` subclasses (:issue:`4274`)
* Improved the ``README.rst`` and ``CODE_OF_CONDUCT.md`` files
(:issue:`4059`)
* Documentation examples are now checked as part of our test suite and we
have fixed some of the issues detected (:issue:`4142`, :issue:`4146`,
:issue:`4171`, :issue:`4184`, :issue:`4190`)
* Fixed logic issues, broken links and typos (:issue:`4247`, :issue:`4258`,
:issue:`4282`, :issue:`4288`, :issue:`4305`, :issue:`4308`, :issue:`4323`,
:issue:`4338`, :issue:`4359`, :issue:`4361`)
* Improved consistency when referring to the ``__init__`` method of an object
(:issue:`4086`, :issue:`4088`)
* Fixed an inconsistency between code and output in :ref:`intro-overview`
(:issue:`4213`)
* Extended :mod:`~sphinx.ext.intersphinx` usage (:issue:`4147`,
:issue:`4172`, :issue:`4185`, :issue:`4194`, :issue:`4197`)
* We now use a recent version of Python to build the documentation
(:issue:`4140`, :issue:`4249`)
* Cleaned up documentation (:issue:`4143`, :issue:`4275`)
Quality assurance
~~~~~~~~~~~~~~~~~
* Re-enabled proxy ``CONNECT`` tests (:issue:`2545`, :issue:`4114`)
* Added Bandit_ security checks to our test suite (:issue:`4162`,
:issue:`4181`)
* Added Flake8_ style checks to our test suite and applied many of the
corresponding changes (:issue:`3944`, :issue:`3945`, :issue:`4137`,
:issue:`4157`, :issue:`4167`, :issue:`4174`, :issue:`4186`, :issue:`4195`,
:issue:`4238`, :issue:`4246`, :issue:`4355`, :issue:`4360`, :issue:`4365`)
* Improved test coverage (:issue:`4097`, :issue:`4218`, :issue:`4236`)
* Started reporting slowest tests, and improved the performance of some of
them (:issue:`4163`, :issue:`4164`)
* Fixed broken tests and refactored some tests (:issue:`4014`, :issue:`4095`,
:issue:`4244`, :issue:`4268`, :issue:`4372`)
* Modified the :doc:`tox <tox:index>` configuration to allow running tests
with any Python version, run Bandit_ and Flake8_ tests by default, and
enforce a minimum tox version programmatically (:issue:`4179`)
* Cleaned up code (:issue:`3937`, :issue:`4208`, :issue:`4209`,
:issue:`4210`, :issue:`4212`, :issue:`4369`, :issue:`4376`, :issue:`4378`)
.. _Bandit: https://bandit.readthedocs.io/
.. _Flake8: https://flake8.pycqa.org/en/latest/
.. _2-0-0-scheduler-queue-changes:
Changes to scheduler queue classes
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
The following changes may impact any custom queue classes of all types:
* The ``push`` method no longer receives a second positional parameter
containing ``request.priority * -1``. If you need that value, get it
from the first positional parameter, ``request``, instead, or use
the new :meth:`~scrapy.core.scheduler.ScrapyPriorityQueue.priority`
method in :class:`scrapy.core.scheduler.ScrapyPriorityQueue`
subclasses.
The following changes may impact custom priority queue classes:
* In the ``__init__`` method or the ``from_crawler`` or ``from_settings``
class methods:
* The parameter that used to contain a factory function,
``qfactory``, is now passed as a keyword parameter named
``downstream_queue_cls``.
* A new keyword parameter has been added: ``key``. It is a string
that is always an empty string for memory queues and indicates the
:setting:`JOB_DIR` value for disk queues.
* The parameter for disk queues that contains data from the previous
crawl, ``startprios`` or ``slot_startprios``, is now passed as a
keyword parameter named ``startprios``.
* The ``serialize`` parameter is no longer passed. The disk queue
class must take care of request serialization on its own before
writing to disk, using the
:func:`~scrapy.utils.reqser.request_to_dict` and
:func:`~scrapy.utils.reqser.request_from_dict` functions from the
:mod:`scrapy.utils.reqser` module.
The following changes may impact custom disk and memory queue classes:
* The signature of the ``__init__`` method is now
``__init__(self, crawler, key)``.
The following changes affect specifically the
:class:`~scrapy.core.scheduler.ScrapyPriorityQueue` and
:class:`~scrapy.core.scheduler.DownloaderAwarePriorityQueue` classes from
:mod:`scrapy.core.scheduler` and may affect subclasses:
* In the ``__init__`` method, most of the changes described above apply.
``__init__`` may still receive all parameters as positional parameters,
however:
* ``downstream_queue_cls``, which replaced ``qfactory``, must be
instantiated differently.
``qfactory`` was instantiated with a priority value (integer).
Instances of ``downstream_queue_cls`` should be created using
the new
:meth:`ScrapyPriorityQueue.qfactory <scrapy.core.scheduler.ScrapyPriorityQueue.qfactory>`
or
:meth:`DownloaderAwarePriorityQueue.pqfactory <scrapy.core.scheduler.DownloaderAwarePriorityQueue.pqfactory>`
methods.
* The new ``key`` parameter displaced the ``startprios``
parameter 1 position to the right.
* The following class attributes have been added:
* :attr:`~scrapy.core.scheduler.ScrapyPriorityQueue.crawler`
* :attr:`~scrapy.core.scheduler.ScrapyPriorityQueue.downstream_queue_cls`
(details above)
* :attr:`~scrapy.core.scheduler.ScrapyPriorityQueue.key` (details above)
* The ``serialize`` attribute has been removed (details above)
The following changes affect specifically the
:class:`~scrapy.core.scheduler.ScrapyPriorityQueue` class and may affect
subclasses:
* A new :meth:`~scrapy.core.scheduler.ScrapyPriorityQueue.priority`
method has been added which, given a request, returns
``request.priority * -1``.
It is used in :meth:`~scrapy.core.scheduler.ScrapyPriorityQueue.push`
to make up for the removal of its ``priority`` parameter.
* The ``spider`` attribute has been removed. Use
:attr:`crawler.spider <scrapy.core.scheduler.ScrapyPriorityQueue.crawler>`
instead.
The following changes affect specifically the
:class:`~scrapy.core.scheduler.DownloaderAwarePriorityQueue` class and may
affect subclasses:
* A new :attr:`~scrapy.core.scheduler.DownloaderAwarePriorityQueue.pqueues`
attribute offers a mapping of downloader slot names to the
corresponding instances of
:attr:`~scrapy.core.scheduler.DownloaderAwarePriorityQueue.downstream_queue_cls`.
(:issue:`3884`)
.. _release-1.8.0:
@ -26,7 +470,7 @@ Backward-incompatible changes
* Python 3.4 is no longer supported, and some of the minimum requirements of
Scrapy have also changed:
* cssselect_ 0.9.1
* :doc:`cssselect <cssselect:index>` 0.9.1
* cryptography_ 2.0
* lxml_ 3.5.0
* pyOpenSSL_ 16.2.0
@ -288,12 +732,12 @@ Backward-incompatible changes
:class:`~scrapy.http.Request` objects instead of arbitrary Python data
structures.
* An additional ``crawler`` parameter has been added to the ``__init__`` method
of the :class:`scrapy.core.scheduler.Scheduler` class.
Custom scheduler subclasses which don't accept arbitrary parameters in
their ``__init__`` method might break because of this change.
* An additional ``crawler`` parameter has been added to the ``__init__``
method of the :class:`~scrapy.core.scheduler.Scheduler` class. Custom
scheduler subclasses which don't accept arbitrary parameters in their
``__init__`` method might break because of this change.
For more information, refer to the documentation for the :setting:`SCHEDULER` setting.
For more information, see :setting:`SCHEDULER`.
See also :ref:`1.7-deprecation-removals` below.
@ -1076,7 +1520,7 @@ Cleanups & Refactoring
~~~~~~~~~~~~~~~~~~~~~~
- Tests: remove temp files and folders (:issue:`2570`),
fixed ProjectUtilsTest on OS X (:issue:`2569`),
fixed ProjectUtilsTest on macOS (:issue:`2569`),
use portable pypy for Linux on Travis CI (:issue:`2710`)
- Separate building request from ``_requests_to_follow`` in CrawlSpider (:issue:`2562`)
- Remove “Python 3 progress” badge (:issue:`2567`)
@ -1616,7 +2060,7 @@ Deprecations and Removals
+ ``scrapy.utils.datatypes.SiteNode``
- The previously bundled ``scrapy.xlib.pydispatch`` library was deprecated and
replaced by `pydispatcher <https://pypi.python.org/pypi/PyDispatcher>`_.
replaced by `pydispatcher <https://pypi.org/project/PyDispatcher/>`_.
Relocations
@ -1645,7 +2089,7 @@ Bugfixes
- Makes ``_monkeypatches`` more robust (:issue:`1634`).
- Fixed bug on ``XMLItemExporter`` with non-string fields in
items (:issue:`1738`).
- Fixed startproject command in OS X (:issue:`1635`).
- Fixed startproject command in macOS (:issue:`1635`).
- Fixed :class:`~scrapy.exporters.PythonItemExporter` and CSVExporter for
non-string item types (:issue:`1737`).
- Various logging related fixes (:issue:`1294`, :issue:`1419`, :issue:`1263`,
@ -1713,12 +2157,12 @@ Scrapy 1.0.4 (2015-12-30)
- Typos corrections (:commit:`7067117`)
- fix typos in downloader-middleware.rst and exceptions.rst, middlware -> middleware (:commit:`32f115c`)
- Add note to Ubuntu install section about Debian compatibility (:commit:`23fda69`)
- Replace alternative OSX install workaround with virtualenv (:commit:`98b63ee`)
- Replace alternative macOS install workaround with virtualenv (:commit:`98b63ee`)
- Reference Homebrew's homepage for installation instructions (:commit:`1925db1`)
- Add oldest supported tox version to contributing docs (:commit:`5d10d6d`)
- Note in install docs about pip being already included in python>=2.7.9 (:commit:`85c980e`)
- Add non-python dependencies to Ubuntu install section in the docs (:commit:`fbd010d`)
- Add OS X installation section to docs (:commit:`d8f4cba`)
- Add macOS installation section to docs (:commit:`d8f4cba`)
- DOC(ENH): specify path to rtd theme explicitly (:commit:`de73b1a`)
- minor: scrapy.Spider docs grammar (:commit:`1ddcc7b`)
- Make common practices sample code match the comments (:commit:`1b85bcf`)
@ -2450,7 +2894,7 @@ Other
~~~~~
- Dropped Python 2.6 support (:issue:`448`)
- Add `cssselect`_ python package as install dependency
- Add :doc:`cssselect <cssselect:index>` python package as install dependency
- Drop libxml2 and multi selector's backend support, `lxml`_ is required from now on.
- Minimum Twisted version increased to 10.0.0, dropped Twisted 8.0 support.
- Running test suite now requires ``mock`` python library (:issue:`390`)
@ -2571,7 +3015,7 @@ Scrapy 0.18.0 (released 2013-08-09)
- MetaRefreshMiddleware and RedirectMiddleware have different priorities to address #62
- added from_crawler method to spiders
- added system tests with mock server
- more improvements to Mac OS compatibility (thanks Alex Cepoi)
- more improvements to macOS compatibility (thanks Alex Cepoi)
- several more cleanups to singletons and multi-spider support (thanks Nicolas Ramirez)
- support custom download slots
- added --spider option to "shell" command.
@ -2647,7 +3091,7 @@ Scrapy 0.16.3 (released 2012-12-07)
- Remove concurrency limitation when using download delays and still ensure inter-request delays are enforced (:commit:`487b9b5`)
- add error details when image pipeline fails (:commit:`8232569`)
- improve mac os compatibility (:commit:`8dcf8aa`)
- improve macOS compatibility (:commit:`8dcf8aa`)
- setup.py: use README.rst to populate long_description (:commit:`7b5310d`)
- doc: removed obsolete references to ClientForm (:commit:`80f9bb6`)
- correct docs for default storage backend (:commit:`2aa491b`)
@ -3047,17 +3491,16 @@ Scrapy 0.7
First release of Scrapy.
.. _AJAX crawleable urls: https://developers.google.com/webmasters/ajax-crawling/docs/getting-started?csw=1
.. _AJAX crawleable urls: https://developers.google.com/search/docs/ajax-crawling/docs/getting-started?csw=1
.. _botocore: https://github.com/boto/botocore
.. _chunked transfer encoding: https://en.wikipedia.org/wiki/Chunked_transfer_encoding
.. _ClientForm: http://wwwsearch.sourceforge.net/old/ClientForm/
.. _Creating a pull request: https://help.github.com/en/articles/creating-a-pull-request
.. _cryptography: https://cryptography.io/en/latest/
.. _cssselect: https://github.com/scrapy/cssselect/
.. _docstrings: https://docs.python.org/glossary.html#term-docstring
.. _KeyboardInterrupt: https://docs.python.org/library/exceptions.html#KeyboardInterrupt
.. _docstrings: https://docs.python.org/3/glossary.html#term-docstring
.. _KeyboardInterrupt: https://docs.python.org/3/library/exceptions.html#KeyboardInterrupt
.. _LevelDB: https://github.com/google/leveldb
.. _lxml: http://lxml.de/
.. _lxml: https://lxml.de/
.. _marshal: https://docs.python.org/2/library/marshal.html
.. _parsel.csstranslator.GenericTranslator: https://parsel.readthedocs.io/en/latest/parsel.html#parsel.csstranslator.GenericTranslator
.. _parsel.csstranslator.HTMLTranslator: https://parsel.readthedocs.io/en/latest/parsel.html#parsel.csstranslator.HTMLTranslator
@ -3068,11 +3511,11 @@ First release of Scrapy.
.. _queuelib: https://github.com/scrapy/queuelib
.. _registered with IANA: https://www.iana.org/assignments/media-types/media-types.xhtml
.. _resource: https://docs.python.org/2/library/resource.html
.. _robots.txt: http://www.robotstxt.org/
.. _robots.txt: https://www.robotstxt.org/
.. _scrapely: https://github.com/scrapy/scrapely
.. _service_identity: https://service-identity.readthedocs.io/en/stable/
.. _six: https://six.readthedocs.io/
.. _tox: https://pypi.python.org/pypi/tox
.. _tox: https://pypi.org/project/tox/
.. _Twisted: https://twistedmatrix.com/trac/
.. _Twisted - hello, asynchronous programming: http://jessenoller.com/blog/2009/02/11/twisted-hello-asynchronous-programming/
.. _w3lib: https://github.com/scrapy/w3lib

docs/topics/asyncio.rst (new file, 28 lines added)
View File

@ -0,0 +1,28 @@
=======
asyncio
=======
.. versionadded:: 2.0
Scrapy has partial support for :mod:`asyncio`. After you :ref:`install the asyncio
reactor <install-asyncio>`, you may use :mod:`asyncio` and
:mod:`asyncio`-powered libraries in any :doc:`coroutine <coroutines>`.
.. warning:: :mod:`asyncio` support in Scrapy is experimental. Future Scrapy
versions may introduce related changes without a deprecation
period or warning.
.. _install-asyncio:
Installing the asyncio reactor
==============================
To enable :mod:`asyncio` support, set the :setting:`TWISTED_REACTOR` setting to
``'twisted.internet.asyncioreactor.AsyncioSelectorReactor'``.
If you are using :class:`~scrapy.crawler.CrawlerRunner`, you also need to
install the :class:`~twisted.internet.asyncioreactor.AsyncioSelectorReactor`
reactor manually. You can do that using
:func:`~scrapy.utils.reactor.install_reactor`::
install_reactor('twisted.internet.asyncioreactor.AsyncioSelectorReactor')
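For a project run through the ``scrapy`` CLI, setting the option in
``settings.py`` is enough; a minimal sketch::

    # settings.py -- enables the asyncio reactor for crawls started via the CLI
    TWISTED_REACTOR = 'twisted.internet.asyncioreactor.AsyncioSelectorReactor'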

View File

@ -188,7 +188,7 @@ AjaxCrawlMiddleware helps to crawl them correctly.
It is turned OFF by default because it has some performance overhead,
and enabling it for focused crawls doesn't make much sense.
.. _ajax crawlable: https://developers.google.com/webmasters/ajax-crawling/docs/getting-started
.. _ajax crawlable: https://developers.google.com/search/docs/ajax-crawling/docs/getting-started
.. _broad-crawls-bfo:

docs/topics/coroutines.rst (new file, 110 lines added)
View File

@ -0,0 +1,110 @@
==========
Coroutines
==========
.. versionadded:: 2.0
Scrapy has :ref:`partial support <coroutine-support>` for the
:ref:`coroutine syntax <async>`.
.. warning:: :mod:`asyncio` support in Scrapy is experimental. Future Scrapy
versions may introduce related API and behavior changes without a
deprecation period or warning.
.. _coroutine-support:
Supported callables
===================
The following callables may be defined as coroutines using ``async def``, and
hence use coroutine syntax (e.g. ``await``, ``async for``, ``async with``):
- :class:`~scrapy.http.Request` callbacks.
The following are known caveats of the current implementation that we aim
to address in future versions of Scrapy:
- The callback output is not processed until the whole callback finishes.
As a side effect, if the callback raises an exception, none of its
output is processed.
- Because `asynchronous generators were introduced in Python 3.6`_, you
can only use ``yield`` if you are using Python 3.6 or later.
If you need to output multiple items or requests and you are using
Python 3.5, return an iterable (e.g. a list) instead.
- The :meth:`process_item` method of
:ref:`item pipelines <topics-item-pipeline>`.
- The
:meth:`~scrapy.downloadermiddlewares.DownloaderMiddleware.process_request`,
:meth:`~scrapy.downloadermiddlewares.DownloaderMiddleware.process_response`,
and
:meth:`~scrapy.downloadermiddlewares.DownloaderMiddleware.process_exception`
methods of
:ref:`downloader middlewares <topics-downloader-middleware-custom>`.
- :ref:`Signal handlers that support deferreds <signal-deferred>`.
.. _asynchronous generators were introduced in Python 3.6: https://www.python.org/dev/peps/pep-0525/
Usage
=====
There are several use cases for coroutines in Scrapy. Code that would
return Deferreds when written for previous Scrapy versions, such as downloader
middlewares and signal handlers, can be rewritten to be shorter and cleaner::
class DbPipeline:
def _update_item(self, data, item):
item['field'] = data
return item
def process_item(self, item, spider):
dfd = db.get_some_data(item['id'])
dfd.addCallback(self._update_item, item)
return dfd
becomes::
class DbPipeline:
async def process_item(self, item, spider):
item['field'] = await db.get_some_data(item['id'])
return item
Coroutines may be used to call asynchronous code. This includes other
coroutines, functions that return Deferreds and functions that return
`awaitable objects`_ such as :class:`~asyncio.Future`. This means you can use
many useful Python libraries providing such code::
class MySpider(Spider):
# ...
async def parse_with_deferred(self, response):
additional_response = await treq.get('https://additional.url')
additional_data = await treq.content(additional_response)
# ... use response and additional_data to yield items and requests
async def parse_with_asyncio(self, response):
async with aiohttp.ClientSession() as session:
async with session.get('https://additional.url') as additional_response:
additional_data = await additional_response.text()
# ... use response and additional_data to yield items and requests
.. note:: Many libraries that use coroutines, such as `aio-libs`_, require the
:mod:`asyncio` loop, and to use them you need to
:doc:`enable asyncio support in Scrapy <asyncio>`.
Common use cases for asynchronous code include:
* requesting data from websites, databases and other services (in callbacks,
pipelines and middlewares);
* storing data in databases (in pipelines and middlewares);
* delaying the spider initialization until some external event (in the
:signal:`spider_opened` handler);
* calling asynchronous Scrapy methods like ``ExecutionEngine.download`` (see
:ref:`the screenshot pipeline example <ScreenshotPipeline>`).
.. _aio-libs: https://github.com/aio-libs
.. _awaitable objects: https://docs.python.org/3/glossary.html#term-awaitable

View File

@ -709,7 +709,7 @@ HttpCompressionMiddleware
provided `brotlipy`_ is installed.
.. _brotli-compressed: https://www.ietf.org/rfc/rfc7932.txt
.. _brotlipy: https://pypi.python.org/pypi/brotlipy
.. _brotlipy: https://pypi.org/project/brotlipy/
HttpCompressionMiddleware Settings
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
@ -872,6 +872,10 @@ Default: ``[]``
Meta tags within these tags are ignored.
.. versionchanged:: 2.0
The default value of :setting:`METAREFRESH_IGNORE_TAGS` changed from
``['script', 'noscript']`` to ``[]``.
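Projects that relied on the previous behavior can restore it explicitly; a
hedged ``settings.py`` sketch::

    # Restore the pre-2.0 default, which ignored <meta refresh> tags found
    # inside <script> and <noscript> elements
    METAREFRESH_IGNORE_TAGS = ['script', 'noscript']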
.. setting:: METAREFRESH_MAXDELAY
METAREFRESH_MAXDELAY
@ -1038,7 +1042,7 @@ Based on `RobotFileParser
* is Python's built-in robots.txt_ parser
* is compliant with `Martijn Koster's 1996 draft specification
<http://www.robotstxt.org/norobots-rfc.txt>`_
<https://www.robotstxt.org/norobots-rfc.txt>`_
* lacks support for wildcard matching
@ -1061,7 +1065,7 @@ Based on `Reppy <https://github.com/seomoz/reppy/>`_:
<https://github.com/seomoz/rep-cpp>`_
* is compliant with `Martijn Koster's 1996 draft specification
<http://www.robotstxt.org/norobots-rfc.txt>`_
<https://www.robotstxt.org/norobots-rfc.txt>`_
* supports wildcard matching
@ -1086,7 +1090,7 @@ Based on `Robotexclusionrulesparser <http://nikitathespider.com/python/rerp/>`_:
* implemented in Python
* is compliant with `Martijn Koster's 1996 draft specification
<http://www.robotstxt.org/norobots-rfc.txt>`_
<https://www.robotstxt.org/norobots-rfc.txt>`_
* supports wildcard matching
@ -1115,7 +1119,7 @@ implementing the methods described below.
.. autoclass:: RobotParser
:members:
.. _robots.txt: http://www.robotstxt.org/
.. _robots.txt: https://www.robotstxt.org/
DownloaderStats
---------------
@ -1155,7 +1159,7 @@ AjaxCrawlMiddleware
Middleware that finds 'AJAX crawlable' page variants based
on meta-fragment html tag. See
https://developers.google.com/webmasters/ajax-crawling/docs/getting-started
https://developers.google.com/search/docs/ajax-crawling/docs/getting-started
for more info.
.. note::

View File

@ -241,12 +241,12 @@ along with `scrapy-selenium`_ for seamless integration.
.. _headless browser: https://en.wikipedia.org/wiki/Headless_browser
.. _JavaScript: https://en.wikipedia.org/wiki/JavaScript
.. _js2xml: https://github.com/scrapinghub/js2xml
.. _json.loads: https://docs.python.org/library/json.html#json.loads
.. _json.loads: https://docs.python.org/3/library/json.html#json.loads
.. _pytesseract: https://github.com/madmaze/pytesseract
.. _regular expression: https://docs.python.org/library/re.html
.. _regular expression: https://docs.python.org/3/library/re.html
.. _scrapy-selenium: https://github.com/clemfromspace/scrapy-selenium
.. _scrapy-splash: https://github.com/scrapy-plugins/scrapy-splash
.. _Selenium: https://www.seleniumhq.org/
.. _Selenium: https://www.selenium.dev/
.. _Splash: https://github.com/scrapinghub/splash
.. _tabula-py: https://github.com/chezou/tabula-py
.. _wget: https://www.gnu.org/software/wget/

View File

@ -137,7 +137,7 @@ output examples, which assume you're exporting these two items::
BaseItemExporter
----------------
.. class:: BaseItemExporter(fields_to_export=None, export_empty_fields=False, encoding='utf-8', indent=0)
.. class:: BaseItemExporter(fields_to_export=None, export_empty_fields=False, encoding='utf-8', indent=0, dont_fail=False)
This is the (abstract) base class for all Item Exporters. It provides
support for common features used by all (concrete) Item Exporters, such as
@ -148,6 +148,9 @@ BaseItemExporter
populate their respective instance attributes: :attr:`fields_to_export`,
:attr:`export_empty_fields`, :attr:`encoding`, :attr:`indent`.
.. versionadded:: 2.0
The *dont_fail* parameter.
.. method:: export_item(item)
Exports the given item. This method must be implemented in subclasses.
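A hedged sketch of a subclass following the new ``__init__`` conventions (the
exporter itself is made up)::

    from scrapy.exporters import BaseItemExporter

    class StdoutItemExporter(BaseItemExporter):
        # Hypothetical exporter, shown only to illustrate the new-style __init__
        def __init__(self, **kwargs):
            # dont_fail=True keeps unrecognized keyword arguments in
            # self._kwargs instead of raising, as described in the 2.0 notes
            super().__init__(dont_fail=True, **kwargs)

        def export_item(self, item):
            print(dict(item))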

View File

@ -236,6 +236,9 @@ supported URI schemes.
This setting is required for enabling the feed exports.
.. versionchanged:: 2.0
Added :class:`pathlib.Path` support.
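For example, in ``settings.py`` (the output path is arbitrary)::

    from pathlib import Path

    # Since 2.0, FEED_URI also accepts pathlib.Path objects
    FEED_URI = Path('output/items.jl')
    FEED_FORMAT = 'jsonlines'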
.. setting:: FEED_FORMAT
FEED_FORMAT

View File

@ -158,18 +158,20 @@ method and how to clean up the resources properly.::
self.db[self.collection_name].insert_one(dict(item))
return item
.. _MongoDB: https://www.mongodb.org/
.. _pymongo: https://api.mongodb.org/python/current/
.. _MongoDB: https://www.mongodb.com/
.. _pymongo: https://api.mongodb.com/python/current/
.. _ScreenshotPipeline:
Take screenshot of item
-----------------------
This example demonstrates how to return a
:class:`~twisted.internet.defer.Deferred` from the :meth:`process_item` method.
It uses Splash_ to render a screenshot of the item URL. The pipeline
makes request to locally running instance of Splash_. After request is downloaded
and Deferred callback fires, it saves item to a file and adds filename to an item.
makes a request to a locally running instance of Splash_. After the request is downloaded,
it saves the screenshot to a file and adds the filename to the item.
::
@ -184,15 +186,12 @@ and Deferred callback fires, it saves item to a file and adds filename to an ite
SPLASH_URL = "http://localhost:8050/render.png?url={}"
def process_item(self, item, spider):
async def process_item(self, item, spider):
encoded_item_url = quote(item["url"])
screenshot_url = self.SPLASH_URL.format(encoded_item_url)
request = scrapy.Request(screenshot_url)
dfd = spider.crawler.engine.download(request, spider)
dfd.addBoth(self.return_item, item)
return dfd
response = await spider.crawler.engine.download(request, spider)
def return_item(self, response, item):
if response.status != 200:
# Error happened, return item.
return item
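Reassembled from the fragments above and the pre-existing example, the updated
pipeline looks roughly like this (a sketch; the hashing and field names follow
the surrounding example)::

    import hashlib
    from urllib.parse import quote

    import scrapy
    from scrapy.utils.python import to_bytes

    class ScreenshotPipeline(object):
        """Pipeline that uses Splash to render a screenshot of every item URL."""

        SPLASH_URL = "http://localhost:8050/render.png?url={}"

        async def process_item(self, item, spider):
            encoded_item_url = quote(item["url"])
            screenshot_url = self.SPLASH_URL.format(encoded_item_url)
            request = scrapy.Request(screenshot_url)
            response = await spider.crawler.engine.download(request, spider)

            if response.status != 200:
                # Error happened, return item.
                return item

            # Save screenshot to file, filename will be hash of url.
            url_hash = hashlib.md5(to_bytes(item["url"])).hexdigest()
            filename = "{}.png".format(url_hash)
            with open(filename, "wb") as f:
                f.write(response.body)

            # Store filename in item.
            item["screenshot_filename"] = filename
            return item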

View File

@ -166,7 +166,7 @@ If your item contains mutable_ values like lists or dictionaries, a shallow
copy will keep references to the same mutable values across all different
copies.
.. _mutable: https://docs.python.org/glossary.html#term-mutable
.. _mutable: https://docs.python.org/3/glossary.html#term-mutable
For example, if you have an item with a list of tags, and you create a shallow
copy of that item, both the original item and the copy have the same list of
@ -177,7 +177,7 @@ If that is not the desired behavior, use a deep copy instead.
See the `documentation of the copy module`_ for more information.
.. _documentation of the copy module: https://docs.python.org/library/copy.html
.. _documentation of the copy module: https://docs.python.org/3/library/copy.html
To create a shallow copy of an item, you can either call
:meth:`~scrapy.item.Item.copy` on an existing item

View File

@ -68,6 +68,9 @@ Cookies may expire. So, if you don't resume your spider quickly the requests
scheduled may no longer work. This won't be an issue if your spider doesn't rely
on cookies.
.. _request-serialization:
Request serialization
---------------------

View File

@ -206,7 +206,7 @@ objects. If this is your case, and you can't find your leaks using ``trackref``,
you still have another resource: the `Guppy library`_.
If you're using Python 3, see :ref:`topics-leaks-muppy`.
.. _Guppy library: https://pypi.python.org/pypi/guppy
.. _Guppy library: https://pypi.org/project/guppy/
If you use ``pip``, you can install Guppy with the following command::
@ -311,9 +311,9 @@ though neither Scrapy nor your project are leaking memory. This is due to a
(not so well) known problem of Python, which may not return released memory to
the operating system in some cases. For more information on this issue see:
* `Python Memory Management <http://www.evanjones.ca/python-memory.html>`_
* `Python Memory Management Part 2 <http://www.evanjones.ca/python-memory-part2.html>`_
* `Python Memory Management Part 3 <http://www.evanjones.ca/python-memory-part3.html>`_
* `Python Memory Management <https://www.evanjones.ca/python-memory.html>`_
* `Python Memory Management Part 2 <https://www.evanjones.ca/python-memory-part2.html>`_
* `Python Memory Management Part 3 <https://www.evanjones.ca/python-memory-part3.html>`_
The improvements proposed by Evan Jones, which are detailed in `this paper`_,
got merged in Python 2.5, but this only reduces the problem, it doesn't fix it
@ -327,7 +327,7 @@ completely. To quote the paper:
to move to a compacting garbage collector, which is able to move objects in
memory. This would require significant changes to the Python interpreter.*
.. _this paper: http://www.evanjones.ca/memoryallocator/
.. _this paper: https://www.evanjones.ca/memoryallocator/
To keep memory consumption reasonable you can split the job into several
smaller jobs or enable :ref:`persistent job queue <topics-jobs>`

View File

@ -64,9 +64,13 @@ LxmlLinkExtractor
:param deny_extensions: a single value or list of strings containing
extensions that should be ignored when extracting links.
If not given, it will default to the
``IGNORED_EXTENSIONS`` list defined in the
`scrapy.linkextractors`_ package.
If not given, it will default to
:data:`scrapy.linkextractors.IGNORED_EXTENSIONS`.
.. versionchanged:: 2.0
:data:`~scrapy.linkextractors.IGNORED_EXTENSIONS` now includes
``7z``, ``7zip``, ``apk``, ``bz2``, ``cdr``, ``dmg``, ``ico``,
``iso``, ``tar``, ``tar.gz``, ``webm``, and ``xz``.
:type deny_extensions: list
:param restrict_xpaths: is an XPath (or list of XPath's) which defines

View File

@ -136,6 +136,9 @@ with the data to be parsed, and return a parsed value. So you can use any
function as input or output processor. The only requirement is that they must
accept one (and only one) positional argument, which will be an iterable.
.. versionchanged:: 2.0
Processors no longer need to be methods.
.. note:: Both input and output processors must receive an iterable as their
first argument. The output of those functions can be anything. The result of
input processors will be appended to an internal list (in the Loader)

View File

@ -116,12 +116,6 @@ For the Images Pipeline, set the :setting:`IMAGES_STORE` setting::
Supported Storage
=================
File system is currently the only officially supported storage, but there are
also support for storing files in `Amazon S3`_ and `Google Cloud Storage`_.
.. _Amazon S3: https://aws.amazon.com/s3/
.. _Google Cloud Storage: https://cloud.google.com/storage/
File system storage
-------------------
@ -147,9 +141,13 @@ Where:
* ``full`` is a sub-directory to separate full images from thumbnails (if
used). For more info see :ref:`topics-images-thumbnails`.
.. _media-pipeline-ftp:
FTP server storage
------------------
.. versionadded:: 2.0
:setting:`FILES_STORE` and :setting:`IMAGES_STORE` can point to an FTP server.
Scrapy will automatically upload the files to the server.
@ -573,6 +571,8 @@ See here the methods that you can override in your custom Images Pipeline:
By default, the :meth:`item_completed` method returns the item.
.. _media-pipeline-example:
Custom Images pipeline example
==============================

View File

@ -31,6 +31,8 @@ Request objects
a :class:`Response`.
:param url: the URL of this request
If the URL is invalid, a :exc:`ValueError` exception is raised.
:type url: string
:param callback: the function that will be called with the response of this
@ -125,6 +127,10 @@ Request objects
:exc:`~twisted.python.failure.Failure` as first parameter.
For more information,
see :ref:`topics-request-response-ref-errbacks` below.
.. versionchanged:: 2.0
The *callback* parameter is no longer required when the *errback*
parameter is specified.
:type errback: callable
:param flags: Flags sent to the request, can be used for logging or similar purposes.
@ -396,7 +402,7 @@ The FormRequest class extends the base :class:`Request` with functionality for
dealing with HTML forms. It uses `lxml.html forms`_ to pre-populate form
fields with form data from :class:`Response` objects.
.. _lxml.html forms: http://lxml.de/lxmlhtml.html#forms
.. _lxml.html forms: https://lxml.de/lxmlhtml.html#forms
.. class:: FormRequest(url, [formdata, ...])
@ -680,6 +686,8 @@ Response objects
.. attribute:: Response.cb_kwargs
.. versionadded:: 2.0
A shortcut to the :attr:`Request.cb_kwargs` attribute of the
:attr:`Response.request` object (i.e. ``self.request.cb_kwargs``).
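A short sketch tying the 2.0 additions above together (an errback given
without a callback, and ``Response.cb_kwargs``); the spider and URL are
hypothetical::

    import scrapy

    class ExampleSpider(scrapy.Spider):
        name = 'example'

        def start_requests(self):
            # Since 2.0 an errback may be given without a callback; successful
            # responses then go to the default parse() callback
            yield scrapy.Request(
                'https://example.com',
                errback=self.on_error,
                cb_kwargs={'source': 'start_requests'},
            )

        def parse(self, response, source):
            # Response.cb_kwargs (new in 2.0) is a shortcut for
            # response.request.cb_kwargs
            assert response.cb_kwargs == {'source': source}
            yield {'url': response.url, 'source': source}

        def on_error(self, failure):
            self.logger.error(repr(failure))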

View File

@ -35,12 +35,11 @@ defines selectors to associate those styles with specific HTML elements.
in speed and parsing accuracy to lxml.
.. _BeautifulSoup: https://www.crummy.com/software/BeautifulSoup/
.. _lxml: http://lxml.de/
.. _lxml: https://lxml.de/
.. _ElementTree: https://docs.python.org/2/library/xml.etree.elementtree.html
.. _cssselect: https://pypi.python.org/pypi/cssselect/
.. _XPath: https://www.w3.org/TR/xpath
.. _XPath: https://www.w3.org/TR/xpath/all/
.. _CSS: https://www.w3.org/TR/selectors
.. _parsel: https://parsel.readthedocs.io/
.. _parsel: https://parsel.readthedocs.io/en/latest/
Using selectors
===============
@ -255,7 +254,7 @@ that Scrapy (parsel) implements a couple of **non-standard pseudo-elements**:
They will most probably not work with other libraries like
`lxml`_ or `PyQuery`_.
.. _PyQuery: https://pypi.python.org/pypi/pyquery
.. _PyQuery: https://pypi.org/project/pyquery/
Examples:
@ -309,7 +308,7 @@ Examples:
make much sense: text nodes do not have attributes, and attribute values
are string values already and do not have children nodes.
.. _CSS Selectors: https://www.w3.org/TR/css3-selectors/#selectors
.. _CSS Selectors: https://www.w3.org/TR/selectors-3/#selectors
.. _topics-selectors-nesting-selectors:
@ -504,7 +503,7 @@ Another common case would be to extract all direct ``<p>`` children:
For more details about relative XPaths see the `Location Paths`_ section in the
XPath specification.
.. _Location Paths: https://www.w3.org/TR/xpath#location-paths
.. _Location Paths: https://www.w3.org/TR/xpath/all/#location-paths
When querying by class, consider using CSS
------------------------------------------
@ -612,7 +611,7 @@ But using the ``.`` to mean the node, works:
>>> sel.xpath("//a[contains(., 'Next Page')]").getall()
['<a href="#">Click here to go to the <strong>Next Page</strong></a>']
.. _`XPath string function`: https://www.w3.org/TR/xpath/#section-String-Functions
.. _`XPath string function`: https://www.w3.org/TR/xpath/all/#section-String-Functions
.. _topics-selectors-xpath-variables:
@ -764,7 +763,7 @@ Set operations
These can be handy for excluding parts of a document tree before
extracting text elements for example.
Example extracting microdata (sample content taken from http://schema.org/Product)
Example extracting microdata (sample content taken from https://schema.org/Product)
with groups of itemscopes and corresponding itemprops::
>>> doc = u"""

View File

@ -381,6 +381,8 @@ DNS in-memory cache size.
DNS_RESOLVER
------------
.. versionadded:: 2.0
Default: ``'scrapy.resolver.CachingThreadedResolver'``
The class to be used to resolve DNS names. The default ``scrapy.resolver.CachingThreadedResolver``
@ -1258,6 +1260,9 @@ does not work together with :setting:`CONCURRENT_REQUESTS_PER_IP`.
SCRAPER_SLOT_MAX_ACTIVE_SIZE
----------------------------
.. versionadded:: 2.0
Default: ``5_000_000``
Soft limit (in bytes) for response data being processed.
@ -1447,24 +1452,36 @@ in the ``project`` subdirectory.
TWISTED_REACTOR
---------------
.. versionadded:: 2.0
Default: ``None``
Import path of a given Twisted reactor, for instance:
:class:`twisted.internet.asyncioreactor.AsyncioSelectorReactor`.
Import path of a given :mod:`~twisted.internet.reactor`.
Scrapy will install this reactor if no other is installed yet, such as when
the ``scrapy`` CLI program is invoked or when using the
:class:`~scrapy.crawler.CrawlerProcess` class. If you are using the
:class:`~scrapy.crawler.CrawlerRunner` class, you need to install the correct
reactor manually. An exception will be raised if the installation fails.
Scrapy will install this reactor if no other reactor is installed yet, such as
when the ``scrapy`` CLI program is invoked or when using the
:class:`~scrapy.crawler.CrawlerProcess` class.
The default value for this option is currently ``None``, which means that Scrapy
will not attempt to install any specific reactor, and the default one defined by
Twisted for the current platform will be used. This is to maintain backward
compatibility and avoid possible problems caused by using a non-default reactor.
If you are using the :class:`~scrapy.crawler.CrawlerRunner` class, you also
need to install the correct reactor manually. You can do that using
:func:`~scrapy.utils.reactor.install_reactor`:
For additional information, please see
:doc:`core/howto/choosing-reactor`.
.. autofunction:: scrapy.utils.reactor.install_reactor
If a reactor is already installed,
:func:`~scrapy.utils.reactor.install_reactor` has no effect.
:meth:`CrawlerRunner.__init__ <scrapy.crawler.CrawlerRunner.__init__>` raises
:exc:`Exception` if the installed reactor does not match the
:setting:`TWISTED_REACTOR` setting.
The default value of the :setting:`TWISTED_REACTOR` setting is ``None``, which
means that Scrapy will not attempt to install any specific reactor, and the
default reactor defined by Twisted for the current platform will be used. This
is to maintain backward compatibility and avoid possible problems caused by
using a non-default reactor.
For additional information, see :doc:`core/howto/choosing-reactor`.
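A minimal sketch of a script that uses :class:`~scrapy.crawler.CrawlerRunner`
together with the asyncio reactor (``MySpider`` is a placeholder for your own
spider class)::

    from scrapy.crawler import CrawlerRunner
    from scrapy.utils.log import configure_logging
    from scrapy.utils.reactor import install_reactor

    # CrawlerRunner does not install a reactor, so install the one matching
    # the TWISTED_REACTOR setting before importing twisted.internet.reactor
    install_reactor('twisted.internet.asyncioreactor.AsyncioSelectorReactor')
    from twisted.internet import reactor

    configure_logging()
    runner = CrawlerRunner(settings={
        'TWISTED_REACTOR': 'twisted.internet.asyncioreactor.AsyncioSelectorReactor',
    })
    d = runner.crawl(MySpider)  # MySpider: your spider class
    d.addBoth(lambda _: reactor.stop())
    reactor.run()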
.. setting:: URLLENGTH_LIMIT

View File

@ -41,7 +41,7 @@ variable; or by defining it in your :ref:`scrapy.cfg <topics-config-settings>`::
.. _IPython: https://ipython.org/
.. _IPython installation guide: https://ipython.org/install.html
.. _bpython: https://www.bpython-interpreter.org/
.. _bpython: https://bpython-interpreter.org/
Launch the shell
================
@ -142,7 +142,7 @@ Example of shell session
========================
Here's an example of a typical shell session where we start by scraping the
https://scrapy.org page, and then proceed to scrape the https://reddit.com
https://scrapy.org page, and then proceed to scrape the https://old.reddit.com/
page. Finally, we modify the (Reddit) request method to POST and re-fetch it
getting an error. We end the session by typing Ctrl-D (in Unix systems) or
Ctrl-Z in Windows.
@ -182,7 +182,7 @@ After that, we can start playing with the objects:
>>> response.xpath('//title/text()').get()
'Scrapy | A Fast and Powerful Scraping and Web Crawling Framework'
>>> fetch("https://reddit.com")
>>> fetch("https://old.reddit.com/")
>>> response.xpath('//title/text()').get()
'reddit: the front page of the internet'

View File

@ -46,6 +46,7 @@ Here is a simple example showing how you can catch signals and perform some acti
def parse(self, response):
pass
.. _signal-deferred:
Deferred signal handlers
========================
@ -301,6 +302,8 @@ request_left_downloader
.. signal:: request_left_downloader
.. function:: request_left_downloader(request, spider)
.. versionadded:: 2.0
Sent when a :class:`~scrapy.http.Request` leaves the downloader, even in case of
failure.
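A minimal sketch of catching this signal from a spider (the spider and handler
names are hypothetical)::

    import scrapy
    from scrapy import signals

    class MonitoredSpider(scrapy.Spider):
        name = 'monitored'

        @classmethod
        def from_crawler(cls, crawler, *args, **kwargs):
            spider = super().from_crawler(crawler, *args, **kwargs)
            crawler.signals.connect(spider.request_left,
                                    signal=signals.request_left_downloader)
            return spider

        def request_left(self, request, spider):
            # Called for every request that leaves the downloader,
            # whether it succeeded or failed.
            spider.logger.debug('Left downloader: %s', request)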

View File

@ -299,8 +299,8 @@ The spider will not do any parsing on its own.
If you were to set the ``start_urls`` attribute from the command line,
you would have to parse it on your own into a list
using something like
`ast.literal_eval <https://docs.python.org/library/ast.html#ast.literal_eval>`_
or `json.loads <https://docs.python.org/library/json.html#json.loads>`_
`ast.literal_eval <https://docs.python.org/3/library/ast.html#ast.literal_eval>`_
or `json.loads <https://docs.python.org/3/library/json.html#json.loads>`_
and then set it as an attribute.
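For example, a minimal sketch (the spider name and the exact argument format are
hypothetical)::

    import json

    import scrapy

    class CommandLineSpider(scrapy.Spider):
        # Run as: scrapy crawl cli_urls -a start_urls='["https://example.com"]'
        name = 'cli_urls'

        def __init__(self, start_urls=None, *args, **kwargs):
            super().__init__(*args, **kwargs)
            if isinstance(start_urls, str):
                # Arguments passed with -a are always strings.
                self.start_urls = json.loads(start_urls)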
Otherwise, you would cause iteration over a ``start_urls`` string
(a very common Python pitfall)
@ -420,6 +420,9 @@ Crawling rules
It receives a :class:`Twisted Failure <twisted.python.failure.Failure>`
instance as first parameter.
.. versionadded:: 2.0
The *errback* parameter.
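For example, a minimal sketch of a rule with an errback (the spider name, URL
pattern and method names are hypothetical)::

    from scrapy.linkextractors import LinkExtractor
    from scrapy.spiders import CrawlSpider, Rule

    class ItemCrawlSpider(CrawlSpider):
        name = 'items'
        start_urls = ['https://example.com']

        rules = (
            Rule(LinkExtractor(allow=r'/item/'),
                 callback='parse_item', errback='handle_error'),
        )

        def parse_item(self, response):
            yield {'url': response.url}

        def handle_error(self, failure):
            # failure is a twisted.python.failure.Failure
            self.logger.error(repr(failure))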
CrawlSpider example
~~~~~~~~~~~~~~~~~~~
@ -811,6 +814,6 @@ Combine SitemapSpider with other sources of urls::
.. _Sitemaps: https://www.sitemaps.org/index.html
.. _Sitemap index files: https://www.sitemaps.org/protocol.html#index
.. _robots.txt: http://www.robotstxt.org/
.. _robots.txt: https://www.robotstxt.org/
.. _TLD: https://en.wikipedia.org/wiki/Top-level_domain
.. _Scrapyd documentation: https://scrapyd.readthedocs.io/en/latest/

View File

@ -1 +1 @@
1.8.0
2.0.0

View File

@ -54,8 +54,13 @@ class Command(ScrapyCommand):
raise UsageError("running 'scrapy crawl' with more than one spider is no longer supported")
spname = args[0]
self.crawler_process.crawl(spname, **opts.spargs)
self.crawler_process.start()
crawl_defer = self.crawler_process.crawl(spname, **opts.spargs)
if self.crawler_process.bootstrap_failed:
if getattr(crawl_defer, 'result', None) is not None and issubclass(crawl_defer.result.type, Exception):
self.exitcode = 1
else:
self.crawler_process.start()
if self.crawler_process.bootstrap_failed or \
(hasattr(self.crawler_process, 'has_exception') and self.crawler_process.has_exception):
self.exitcode = 1

View File

@ -23,7 +23,7 @@ __all__ = ['BaseItemExporter', 'PprintItemExporter', 'PickleItemExporter',
class BaseItemExporter(object):
def __init__(self, dont_fail=False, **kwargs):
def __init__(self, *, dont_fail=False, **kwargs):
self._kwargs = kwargs
self._configure(kwargs, dont_fail=dont_fail)

View File

@ -47,7 +47,7 @@ class MemoryUsage(object):
def get_virtual_size(self):
size = self.resource.getrusage(self.resource.RUSAGE_SELF).ru_maxrss
if sys.platform != 'darwin':
# on Mac OS X ru_maxrss is in bytes, on Linux it is in KB
# on macOS ru_maxrss is in bytes, on Linux it is in KB
size *= 1024
return size
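A standalone sketch of the same normalization, for reference (it assumes the unit
behaviour described in the comment above)::

    import resource
    import sys

    size = resource.getrusage(resource.RUSAGE_SELF).ru_maxrss
    if sys.platform != 'darwin':
        size *= 1024  # Linux reports KB, macOS reports bytes
    print('peak RSS: %d bytes' % size)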

View File

@ -132,6 +132,9 @@ class Response(object_ref):
:class:`~.TextResponse` provides a :meth:`~.TextResponse.follow`
method which supports selectors in addition to absolute/relative URLs
and Link objects.
.. versionadded:: 2.0
The *flags* parameter.
"""
if isinstance(url, Link):
url = url.url
@ -160,6 +163,8 @@ class Response(object_ref):
dont_filter=False, errback=None, cb_kwargs=None, flags=None):
# type: (...) -> Generator[Request, None, None]
"""
.. versionadded:: 2.0
Return an iterable of :class:`~.Request` instances to follow all links
in ``urls``. It accepts the same arguments as the ``Request.__init__`` method,
but elements of ``urls`` can be relative URLs or :class:`~scrapy.link.Link` objects,

View File

@ -97,7 +97,11 @@ class LogFormatter(object):
}
def item_error(self, item, exception, response, spider):
"""Logs a message when an item causes an error while it is passing through the item pipeline."""
"""Logs a message when an item causes an error while it is passing
through the item pipeline.
.. versionadded:: 2.0
"""
return {
'level': logging.ERROR,
'msg': ITEMERRORMSG,
@ -107,7 +111,10 @@ class LogFormatter(object):
}
def spider_error(self, failure, request, response, spider):
"""Logs an error message from a spider."""
"""Logs an error message from a spider.
.. versionadded:: 2.0
"""
return {
'level': logging.ERROR,
'msg': SPIDERERRORMSG,
@ -118,7 +125,11 @@ class LogFormatter(object):
}
def download_error(self, failure, request, spider, errmsg=None):
"""Logs a download error message from a spider (typically coming from the engine)."""
"""Logs a download error message from a spider (typically coming from
the engine).
.. versionadded:: 2.0
"""
args = {'request': request}
if errmsg:
msg = DOWNLOADERRORMSG_LONG
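These methods can be overridden from a custom ``LOG_FORMATTER`` class; a minimal
sketch (the class name and module path are hypothetical)::

    import logging

    from scrapy.logformatter import LogFormatter

    class QuietLogFormatter(LogFormatter):
        def item_error(self, item, exception, response, spider):
            # Reuse the default message but demote it to WARNING.
            entry = super().item_error(item, exception, response, spider)
            entry['level'] = logging.WARNING
            return entry

    # settings.py (hypothetical project):
    # LOG_FORMATTER = 'myproject.logformatters.QuietLogFormatter'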

View File

@ -29,7 +29,7 @@ class CachingThreadedResolver(ThreadedResolver):
cache_size = 0
return cls(reactor, cache_size, crawler.settings.getfloat('DNS_TIMEOUT'))
def install_on_reactor(self,):
def install_on_reactor(self):
self.reactor.installResolver(self)
def getHostByName(self, name, timeout=None):

View File

@ -9,10 +9,8 @@ DEPRECATED_SETTINGS = [
('ENCODING_ALIASES', 'no longer needed (encoding discovery uses w3lib now)'),
('STATS_ENABLED', 'no longer supported (change STATS_CLASS instead)'),
('SQLITE_DB', 'no longer supported'),
('SELECTORS_BACKEND', 'use SCRAPY_SELECTORS_BACKEND environment variable instead'),
('AUTOTHROTTLE_MIN_DOWNLOAD_DELAY', 'use DOWNLOAD_DELAY instead'),
('AUTOTHROTTLE_MAX_CONCURRENCY', 'use CONCURRENT_REQUESTS_PER_DOMAIN instead'),
('AUTOTHROTTLE_MAX_CONCURRENCY', 'use CONCURRENT_REQUESTS_PER_DOMAIN instead'),
('REDIRECT_MAX_METAREFRESH_DELAY', 'use METAREFRESH_MAXDELAY instead'),
('LOG_UNSERIALIZABLE_REQUESTS', 'use SCHEDULER_DEBUG instead'),
]

View File

@ -75,9 +75,24 @@ def get_project_settings():
"is deprecated.", ScrapyDeprecationWarning)
settings.setdict(pickle.loads(pickled_settings), priority='project')
env_overrides = {k[7:]: v for k, v in os.environ.items() if
k.startswith('SCRAPY_')}
if env_overrides:
warnings.warn("Use of 'SCRAPY_'-prefixed environment variables to override settings is deprecated.", ScrapyDeprecationWarning)
settings.setdict(env_overrides, priority='project')
scrapy_envvars = {k[7:]: v for k, v in os.environ.items() if
k.startswith('SCRAPY_')}
valid_envvars = {
'CHECK',
'PICKLED_SETTINGS_TO_OVERRIDE',
'PROJECT',
'PYTHON_SHELL',
'SETTINGS_MODULE',
}
setting_envvars = {k for k in scrapy_envvars if k not in valid_envvars}
if setting_envvars:
setting_envvar_list = ', '.join(sorted(setting_envvars))
warnings.warn(
'Use of environment variables prefixed with SCRAPY_ to override '
'settings is deprecated. The following environment variables are '
'currently defined: {}'.format(setting_envvar_list),
ScrapyDeprecationWarning
)
settings.setdict(scrapy_envvars, priority='project')
return settings
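The effect of this change, as a rough sketch (``SCRAPY_FOO`` is a made-up variable
name)::

    import os
    import warnings

    from scrapy.utils.project import get_project_settings

    os.environ['SCRAPY_FOO'] = 'bar'  # not in the whitelist above
    with warnings.catch_warnings(record=True) as caught:
        warnings.simplefilter('always')
        settings = get_project_settings()

    # A ScrapyDeprecationWarning naming FOO is emitted, but the override
    # is still applied for backward compatibility.
    assert settings.get('FOO') == 'bar'
    assert any('FOO' in str(w.message) for w in caught)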

View File

@ -50,6 +50,8 @@ class CallLaterOnce(object):
def install_reactor(reactor_path):
"""Installs the :mod:`~twisted.internet.reactor` with the specified
import path."""
reactor_class = load_object(reactor_path)
if reactor_class is asyncioreactor.AsyncioSelectorReactor:
with suppress(error.ReactorAlreadyInstalledError):
@ -63,6 +65,9 @@ def install_reactor(reactor_path):
def verify_installed_reactor(reactor_path):
"""Raises :exc:`Exception` if the installed
:mod:`~twisted.internet.reactor` does not match the specified import
path."""
from twisted.internet import reactor
reactor_class = load_object(reactor_path)
if not isinstance(reactor, reactor_class):

View File

@ -0,0 +1,20 @@
import os
import sys
import unittest
from subprocess import Popen, PIPE
class CmdlineCrawlPipelineTest(unittest.TestCase):
def _execute(self, spname):
args = (sys.executable, '-m', 'scrapy.cmdline', 'crawl', spname)
cwd = os.path.dirname(os.path.abspath(__file__))
proc = Popen(args, stdout=PIPE, stderr=PIPE, cwd=cwd)
proc.communicate()
return proc.returncode
def test_open_spider_normally_in_pipeline(self):
self.assertEqual(self._execute('normal'), 0)
def test_exception_at_open_spider_in_pipeline(self):
self.assertEqual(self._execute('exception'), 1)

View File

@ -0,0 +1,2 @@
[settings]
default = test_spider.settings

View File

@ -0,0 +1,16 @@
class TestSpiderPipeline(object):
def open_spider(self, spider):
pass
def process_item(self, item, spider):
return item
class TestSpiderExceptionPipeline(object):
def open_spider(self, spider):
raise Exception('exception')
def process_item(self, item, spider):
return item

View File

@ -0,0 +1,2 @@
BOT_NAME = 'test_spider'
SPIDER_MODULES = ['test_spider.spiders']

View File

@ -0,0 +1,14 @@
import scrapy
class ExceptionSpider(scrapy.Spider):
name = 'exception'
custom_settings = {
'ITEM_PIPELINES': {
'test_spider.pipelines.TestSpiderExceptionPipeline': 300
}
}
def parse(self, response):
pass

View File

@ -0,0 +1,14 @@
import scrapy
class NormalSpider(scrapy.Spider):
name = 'normal'
custom_settings = {
'ITEM_PIPELINES': {
'test_spider.pipelines.TestSpiderPipeline': 300
}
}
def parse(self, response):
pass

View File

@ -40,7 +40,7 @@ class CrawlTestCase(TestCase):
@defer.inlineCallbacks
def test_fixed_delay(self):
yield self._test_delay(total=3, delay=0.1)
yield self._test_delay(total=3, delay=0.2)
@defer.inlineCallbacks
def test_randomized_delay(self):
@ -328,7 +328,7 @@ with multiples lines
@mark.only_asyncio()
@defer.inlineCallbacks
def test_async_def_asyncio_parse(self):
runner = CrawlerRunner({"ASYNCIO_REACTOR": True})
runner = CrawlerRunner({"TWISTED_REACTOR": "twisted.internet.asyncioreactor.AsyncioSelectorReactor"})
runner.crawl(AsyncDefAsyncioSpider, self.mockserver.url("/status?n=200"), mockserver=self.mockserver)
with LogCapture() as log:
yield runner.join()

View File

@ -3,7 +3,11 @@ import os
import tempfile
import shutil
import contextlib
from scrapy.utils.project import data_path
from pytest import warns
from scrapy.exceptions import ScrapyDeprecationWarning
from scrapy.utils.project import data_path, get_project_settings
@contextlib.contextmanager
@ -41,3 +45,53 @@ class ProjectUtilsTest(unittest.TestCase):
)
abspath = os.path.join(os.path.sep, 'absolute', 'path')
self.assertEqual(abspath, data_path(abspath))
@contextlib.contextmanager
def set_env(**update):
modified = set(update.keys()) & set(os.environ.keys())
update_after = {k: os.environ[k] for k in modified}
remove_after = frozenset(k for k in update if k not in os.environ)
try:
os.environ.update(update)
yield
finally:
os.environ.update(update_after)
for k in remove_after:
os.environ.pop(k)
class GetProjectSettingsTestCase(unittest.TestCase):
def test_valid_envvar(self):
value = 'tests.test_cmdline.settings'
envvars = {
'SCRAPY_SETTINGS_MODULE': value,
}
with set_env(**envvars), warns(None) as warnings:
settings = get_project_settings()
assert not warnings
assert settings.get('SETTINGS_MODULE') == value
def test_invalid_envvar(self):
envvars = {
'SCRAPY_FOO': 'bar',
}
with set_env(**envvars), warns(None) as warnings:
get_project_settings()
assert len(warnings) == 1
assert warnings[0].category == ScrapyDeprecationWarning
assert str(warnings[0].message).endswith(': FOO')
def test_valid_and_invalid_envvars(self):
value = 'tests.test_cmdline.settings'
envvars = {
'SCRAPY_FOO': 'bar',
'SCRAPY_SETTINGS_MODULE': value,
}
with set_env(**envvars), warns(None) as warnings:
settings = get_project_settings()
assert len(warnings) == 1
assert warnings[0].category == ScrapyDeprecationWarning
assert str(warnings[0].message).endswith(': FOO')
assert settings.get('SETTINGS_MODULE') == value