Merge branch 'master' into response_ip_address

commit 3aa5eab993
@@ -1,8 +1,7 @@
[bumpversion]
current_version = 1.8.0
current_version = 2.0.0
commit = True
tag = True
tag_name = {new_version}

[bumpversion:file:scrapy/VERSION]
@@ -1,9 +1,11 @@
version: 2
sphinx:
  configuration: docs/conf.py
  fail_on_warning: true
python:
  # For available versions, see:
  # https://docs.readthedocs.io/en/stable/config-file/v2.html#build-image
  version: 3.7  # Keep in sync with .travis.yml
  install:
    - requirements: docs/requirements.txt
    - path: .
@@ -41,7 +41,7 @@ Requirements
============

* Python 3.5+
* Works on Linux, Windows, Mac OSX, BSD
* Works on Linux, Windows, macOS, BSD

Install
=======
@ -281,6 +281,7 @@ coverage_ignore_pyobjects = [
|
||||
|
||||
intersphinx_mapping = {
|
||||
'coverage': ('https://coverage.readthedocs.io/en/stable', None),
|
||||
'cssselect': ('https://cssselect.readthedocs.io/en/latest', None),
|
||||
'pytest': ('https://docs.pytest.org/en/latest', None),
|
||||
'python': ('https://docs.python.org/3', None),
|
||||
'sphinx': ('https://www.sphinx-doc.org/en/master', None),
|
||||
|
@ -143,7 +143,7 @@ by running ``git fetch upstream pull/$PR_NUMBER/head:$BRANCH_NAME_TO_CREATE``
|
||||
(replace 'upstream' with a remote name for scrapy repository,
|
||||
``$PR_NUMBER`` with an ID of the pull request, and ``$BRANCH_NAME_TO_CREATE``
|
||||
with a name of the branch you want to create locally).
|
||||
See also: https://help.github.com/articles/checking-out-pull-requests-locally/#modifying-an-inactive-pull-request-locally.
|
||||
See also: https://help.github.com/en/github/collaborating-with-issues-and-pull-requests/checking-out-pull-requests-locally#modifying-an-inactive-pull-request-locally.
|
||||
|
||||
When writing GitHub pull requests, try to keep titles short but descriptive.
|
||||
E.g. For bug #411: "Scrapy hangs if an exception raises in start_requests"
|
||||
@ -168,7 +168,7 @@ Scrapy:
|
||||
|
||||
* Don't put your name in the code you contribute; git provides enough
|
||||
metadata to identify author of the code.
|
||||
See https://help.github.com/articles/setting-your-username-in-git/ for
|
||||
See https://help.github.com/en/github/using-git/setting-your-username-in-git for
|
||||
setup instructions.
|
||||
|
||||
.. _documentation-policies:
|
||||
@ -266,5 +266,5 @@ And their unit-tests are in::
|
||||
.. _tests/: https://github.com/scrapy/scrapy/tree/master/tests
|
||||
.. _open issues: https://github.com/scrapy/scrapy/issues
|
||||
.. _PEP 257: https://www.python.org/dev/peps/pep-0257/
|
||||
.. _pull request: https://help.github.com/en/articles/creating-a-pull-request
|
||||
.. _pull request: https://help.github.com/en/github/collaborating-with-issues-and-pull-requests/creating-a-pull-request
|
||||
.. _pytest-xdist: https://github.com/pytest-dev/pytest-xdist
|
||||
|
@ -22,8 +22,8 @@ In other words, comparing `BeautifulSoup`_ (or `lxml`_) to Scrapy is like
|
||||
comparing `jinja2`_ to `Django`_.
|
||||
|
||||
.. _BeautifulSoup: https://www.crummy.com/software/BeautifulSoup/
|
||||
.. _lxml: http://lxml.de/
|
||||
.. _jinja2: http://jinja.pocoo.org/
|
||||
.. _lxml: https://lxml.de/
|
||||
.. _jinja2: https://palletsprojects.com/p/jinja/
|
||||
.. _Django: https://www.djangoproject.com/
|
||||
|
||||
Can I use Scrapy with BeautifulSoup?
|
||||
@ -269,7 +269,7 @@ The ``__VIEWSTATE`` parameter is used in sites built with ASP.NET/VB.NET. For
|
||||
more info on how it works see `this page`_. Also, here's an `example spider`_
|
||||
which scrapes one of these sites.
|
||||
|
||||
.. _this page: http://search.cpan.org/~ecarroll/HTML-TreeBuilderX-ASP_NET-0.09/lib/HTML/TreeBuilderX/ASP_NET.pm
|
||||
.. _this page: https://metacpan.org/pod/release/ECARROLL/HTML-TreeBuilderX-ASP_NET-0.09/lib/HTML/TreeBuilderX/ASP_NET.pm
|
||||
.. _example spider: https://github.com/AmbientLighter/rpn-fas/blob/master/fas/spiders/rnp.py
|
||||
|
||||
What's the best way to parse big XML/CSV data feeds?
|
||||
|
@ -165,6 +165,8 @@ Solving specific problems
|
||||
topics/autothrottle
|
||||
topics/benchmarking
|
||||
topics/jobs
|
||||
topics/coroutines
|
||||
topics/asyncio
|
||||
|
||||
:doc:`faq`
|
||||
Get answers to most frequently asked questions.
|
||||
@ -205,6 +207,12 @@ Solving specific problems
|
||||
:doc:`topics/jobs`
|
||||
Learn how to pause and resume crawls for large spiders.
|
||||
|
||||
:doc:`topics/coroutines`
|
||||
Use the :ref:`coroutine syntax <async>`.
|
||||
|
||||
:doc:`topics/asyncio`
|
||||
Use :mod:`asyncio` and :mod:`asyncio`-powered libraries.
|
||||
|
||||
.. _extending-scrapy:
|
||||
|
||||
Extending Scrapy
|
||||
|
@ -7,12 +7,12 @@ Installation guide
|
||||
Installing Scrapy
|
||||
=================
|
||||
|
||||
Scrapy runs on Python 3.5 or above
|
||||
under CPython (default Python implementation) and PyPy (starting with PyPy 5.9).
|
||||
Scrapy runs on Python 3.5 or above under CPython (default Python
|
||||
implementation) and PyPy (starting with PyPy 5.9).
|
||||
|
||||
If you're using `Anaconda`_ or `Miniconda`_, you can install the package from
|
||||
the `conda-forge`_ channel, which has up-to-date packages for Linux, Windows
|
||||
and OS X.
|
||||
and macOS.
|
||||
|
||||
To install Scrapy using ``conda``, run::
|
||||
|
||||
@ -65,7 +65,7 @@ please refer to their respective installation instructions:
|
||||
* `lxml installation`_
|
||||
* `cryptography installation`_
|
||||
|
||||
.. _lxml installation: http://lxml.de/installation.html
|
||||
.. _lxml installation: https://lxml.de/installation.html
|
||||
.. _cryptography installation: https://cryptography.io/en/latest/installation/
|
||||
|
||||
|
||||
@ -148,11 +148,11 @@ you can install Scrapy with ``pip`` after that::
|
||||
|
||||
.. _intro-install-macos:
|
||||
|
||||
Mac OS X
|
||||
--------
|
||||
macOS
|
||||
-----
|
||||
|
||||
Building Scrapy's dependencies requires the presence of a C compiler and
|
||||
development headers. On OS X this is typically provided by Apple’s Xcode
|
||||
development headers. On macOS this is typically provided by Apple’s Xcode
|
||||
development tools. To install the Xcode command line tools open a terminal
|
||||
window and run::
|
||||
|
||||
@ -191,7 +191,7 @@ solutions:
|
||||
* *(Optional)* :ref:`Install Scrapy inside a Python virtual environment
|
||||
<intro-using-virtualenv>`.
|
||||
|
||||
This method is a workaround for the above OS X issue, but it's an overall
|
||||
This method is a workaround for the above macOS issue, but it's an overall
|
||||
good practice for managing dependencies and can complement the first method.
|
||||
|
||||
After any of these workarounds you should be able to install Scrapy::
|
||||
@ -207,7 +207,7 @@ For PyPy3, only Linux installation was tested.
|
||||
|
||||
Most Scrapy dependencies now have binary wheels for CPython, but not for PyPy.
This means that these dependencies will be built during installation.
On OS X, you are likely to face an issue with building the cryptography dependency;
On macOS, you are likely to face an issue with building the cryptography dependency;
the solution to this problem is described
`here <https://github.com/pyca/cryptography/issues/2692#issuecomment-272773481>`_,
that is, to ``brew install openssl`` and then export the flags that this command
|
||||
@ -253,11 +253,11 @@ For details, see `Issue #2473 <https://github.com/scrapy/scrapy/issues/2473>`_.
|
||||
.. _Python: https://www.python.org/
|
||||
.. _pip: https://pip.pypa.io/en/latest/installing/
|
||||
.. _lxml: https://lxml.de/index.html
|
||||
.. _parsel: https://pypi.python.org/pypi/parsel
|
||||
.. _w3lib: https://pypi.python.org/pypi/w3lib
|
||||
.. _twisted: https://twistedmatrix.com/
|
||||
.. _cryptography: https://cryptography.io/
|
||||
.. _pyOpenSSL: https://pypi.python.org/pypi/pyOpenSSL
|
||||
.. _parsel: https://pypi.org/project/parsel/
|
||||
.. _w3lib: https://pypi.org/project/w3lib/
|
||||
.. _twisted: https://twistedmatrix.com/trac/
|
||||
.. _cryptography: https://cryptography.io/en/latest/
|
||||
.. _pyOpenSSL: https://pypi.org/project/pyOpenSSL/
|
||||
.. _setuptools: https://pypi.python.org/pypi/setuptools
|
||||
.. _AUR Scrapy package: https://aur.archlinux.org/packages/scrapy/
|
||||
.. _homebrew: https://brew.sh/
|
||||
|
@ -306,7 +306,7 @@ with a selector (see :ref:`topics-developer-tools`).
|
||||
visually selected elements, which works in many browsers.
|
||||
|
||||
.. _regular expressions: https://docs.python.org/3/library/re.html
|
||||
.. _Selector Gadget: http://selectorgadget.com/
|
||||
.. _Selector Gadget: https://selectorgadget.com/
|
||||
|
||||
|
||||
XPath: a brief intro
|
||||
@ -337,7 +337,7 @@ recommend `this tutorial to learn XPath through examples
|
||||
<http://zvon.org/comp/r/tut-XPath_1.html>`_, and `this tutorial to learn "how
|
||||
to think in XPath" <http://plasmasturm.org/log/xpath101/>`_.
|
||||
|
||||
.. _XPath: https://www.w3.org/TR/xpath
|
||||
.. _XPath: https://www.w3.org/TR/xpath/all/
|
||||
.. _CSS: https://www.w3.org/TR/selectors
|
||||
|
||||
Extracting quotes and authors
|
||||
|
489  docs/news.rst
@ -3,8 +3,452 @@
|
||||
Release notes
|
||||
=============
|
||||
|
||||
.. note:: Scrapy 1.x will be the last series supporting Python 2. Scrapy 2.0,
|
||||
planned for Q4 2019 or Q1 2020, will support **Python 3 only**.
|
||||
.. _release-2.0.0:
|
||||
|
||||
Scrapy 2.0.0 (2020-03-03)
|
||||
-------------------------
|
||||
|
||||
Highlights:
|
||||
|
||||
* Python 2 support has been removed
|
||||
* :doc:`Partial <topics/coroutines>` :ref:`coroutine syntax <async>` support
|
||||
and :doc:`experimental <topics/asyncio>` :mod:`asyncio` support
|
||||
* New :meth:`Response.follow_all <scrapy.http.Response.follow_all>` method
|
||||
* :ref:`FTP support <media-pipeline-ftp>` for media pipelines
|
||||
* New :attr:`Response.certificate <scrapy.http.Response.certificate>`
|
||||
attribute
|
||||
* IPv6 support through :setting:`DNS_RESOLVER`
|
||||
|
||||
Backward-incompatible changes
|
||||
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
* Python 2 support has been removed, following `Python 2 end-of-life on
|
||||
January 1, 2020`_ (:issue:`4091`, :issue:`4114`, :issue:`4115`,
|
||||
:issue:`4121`, :issue:`4138`, :issue:`4231`, :issue:`4242`, :issue:`4304`,
|
||||
:issue:`4309`, :issue:`4373`)
|
||||
|
||||
* Retry gave-ups (see :setting:`RETRY_TIMES`) are now logged as errors instead
|
||||
of as debug information (:issue:`3171`, :issue:`3566`)
|
||||
|
||||
* File extensions that
|
||||
:class:`LinkExtractor <scrapy.linkextractors.lxmlhtml.LxmlLinkExtractor>`
|
||||
ignores by default now also include ``7z``, ``7zip``, ``apk``, ``bz2``,
|
||||
``cdr``, ``dmg``, ``ico``, ``iso``, ``tar``, ``tar.gz``, ``webm``, and
|
||||
``xz`` (:issue:`1837`, :issue:`2067`, :issue:`4066`)
|
||||
|
||||
* The :setting:`METAREFRESH_IGNORE_TAGS` setting is now an empty list by
|
||||
default, following web browser behavior (:issue:`3844`, :issue:`4311`)
|
||||
|
||||
* The
|
||||
:class:`~scrapy.downloadermiddlewares.httpcompression.HttpCompressionMiddleware`
|
||||
now includes spaces after commas in the value of the ``Accept-Encoding``
|
||||
header that it sets, following web browser behavior (:issue:`4293`)
|
||||
|
||||
* The ``__init__`` method of custom download handlers (see
|
||||
:setting:`DOWNLOAD_HANDLERS`) or subclasses of the following downloader
|
||||
handlers no longer receives a ``settings`` parameter:
|
||||
|
||||
* :class:`scrapy.core.downloader.handlers.datauri.DataURIDownloadHandler`
|
||||
|
||||
* :class:`scrapy.core.downloader.handlers.file.FileDownloadHandler`
|
||||
|
||||
Use the ``from_settings`` or ``from_crawler`` class methods to expose such
|
||||
a parameter to your custom download handlers.
|
||||
|
||||
(:issue:`4126`)
|
||||
|
||||
* We have refactored the :class:`scrapy.core.scheduler.Scheduler` class and
|
||||
related queue classes (see :setting:`SCHEDULER_PRIORITY_QUEUE`,
|
||||
:setting:`SCHEDULER_DISK_QUEUE` and :setting:`SCHEDULER_MEMORY_QUEUE`) to
|
||||
make it easier to implement custom scheduler queue classes. See
|
||||
:ref:`2-0-0-scheduler-queue-changes` below for details.
|
||||
|
||||
* Overridden settings are now logged in a different format. This is more in
|
||||
line with similar information logged at startup (:issue:`4199`)
|
||||
|
||||
.. _Python 2 end-of-life on January 1, 2020: https://www.python.org/doc/sunset-python-2/
|
||||
|
||||
|
||||
Deprecation removals
|
||||
~~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
* The :ref:`Scrapy shell <topics-shell>` no longer provides a `sel` proxy
|
||||
object, use :meth:`response.selector <scrapy.http.Response.selector>`
|
||||
instead (:issue:`4347`)
|
||||
|
||||
* LevelDB support has been removed (:issue:`4112`)
|
||||
|
||||
* The following functions have been removed from :mod:`scrapy.utils.python`:
|
||||
``isbinarytext``, ``is_writable``, ``setattr_default``, ``stringify_dict``
|
||||
(:issue:`4362`)
|
||||
|
||||
|
||||
Deprecations
|
||||
~~~~~~~~~~~~
|
||||
|
||||
* Using environment variables prefixed with ``SCRAPY_`` to override settings
|
||||
is deprecated (:issue:`4300`, :issue:`4374`, :issue:`4375`)
|
||||
|
||||
* :class:`scrapy.linkextractors.FilteringLinkExtractor` is deprecated, use
|
||||
:class:`scrapy.linkextractors.LinkExtractor
|
||||
<scrapy.linkextractors.lxmlhtml.LxmlLinkExtractor>` instead (:issue:`4045`)
|
||||
|
||||
* The ``noconnect`` query string argument of proxy URLs is deprecated and
|
||||
should be removed from proxy URLs (:issue:`4198`)
|
||||
|
||||
* The :meth:`next <scrapy.utils.python.MutableChain.next>` method of
|
||||
:class:`scrapy.utils.python.MutableChain` is deprecated, use the global
|
||||
:func:`next` function or :meth:`MutableChain.__next__
|
||||
<scrapy.utils.python.MutableChain.__next__>` instead (:issue:`4153`)
|
||||
|
||||
|
||||
New features
|
||||
~~~~~~~~~~~~
|
||||
|
||||
* Added :doc:`partial support <topics/coroutines>` for Python’s
|
||||
:ref:`coroutine syntax <async>` and :doc:`experimental support
|
||||
<topics/asyncio>` for :mod:`asyncio` and :mod:`asyncio`-powered libraries
|
||||
(:issue:`4010`, :issue:`4259`, :issue:`4269`, :issue:`4270`, :issue:`4271`,
|
||||
:issue:`4316`, :issue:`4318`)
|
||||
|
||||
* The new :meth:`Response.follow_all <scrapy.http.Response.follow_all>`
|
||||
method offers the same functionality as
|
||||
:meth:`Response.follow <scrapy.http.Response.follow>` but supports an
|
||||
iterable of URLs as input and returns an iterable of requests
|
||||
(:issue:`2582`, :issue:`4057`, :issue:`4286`)
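  A minimal usage sketch (assuming a :class:`~scrapy.http.TextResponse`; the
  CSS expression is illustrative)::

      def parse(self, response):
          yield from response.follow_all(css='a.next-page', callback=self.parse)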
|
||||
|
||||
* :ref:`Media pipelines <topics-media-pipeline>` now support :ref:`FTP
|
||||
storage <media-pipeline-ftp>` (:issue:`3928`, :issue:`3961`)
|
||||
|
||||
* The new :attr:`Response.certificate <scrapy.http.Response.certificate>`
|
||||
attribute exposes the SSL certificate of the server as a
|
||||
:class:`twisted.internet.ssl.Certificate` object for HTTPS responses
|
||||
(:issue:`2726`, :issue:`4054`)
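  For example, a callback could log the certificate issuer (a sketch; the
  attribute is only set for HTTPS responses)::

      def parse(self, response):
          if response.certificate is not None:
              self.logger.info('Issuer: %s', response.certificate.getIssuer())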
|
||||
|
||||
* A new :setting:`DNS_RESOLVER` setting allows enabling IPv6 support
|
||||
(:issue:`1031`, :issue:`4227`)
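  For example, in ``settings.py`` (a sketch; assuming the IPv6-capable resolver
  added by this change is ``scrapy.resolver.CachingHostnameResolver``)::

      DNS_RESOLVER = 'scrapy.resolver.CachingHostnameResolver'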
|
||||
|
||||
* A new :setting:`SCRAPER_SLOT_MAX_ACTIVE_SIZE` setting allows configuring
|
||||
the existing soft limit that pauses request downloads when the total
|
||||
response data being processed is too high (:issue:`1410`, :issue:`3551`)
|
||||
|
||||
* A new :setting:`TWISTED_REACTOR` setting allows customizing the
|
||||
:mod:`~twisted.internet.reactor` that Scrapy uses, allowing you to
|
||||
:doc:`enable asyncio support <topics/asyncio>` or deal with a
|
||||
:ref:`common macOS issue <faq-specific-reactor>` (:issue:`2905`,
|
||||
:issue:`4294`)
|
||||
|
||||
* Scheduler disk and memory queues may now use the class methods
|
||||
``from_crawler`` or ``from_settings`` (:issue:`3884`)
|
||||
|
||||
* The new :attr:`Response.cb_kwargs <scrapy.http.Response.cb_kwargs>`
|
||||
attribute serves as a shortcut for :attr:`Response.request.cb_kwargs
|
||||
<scrapy.http.Request.cb_kwargs>` (:issue:`4331`)
|
||||
|
||||
* :meth:`Response.follow <scrapy.http.Response.follow>` now supports a
|
||||
``flags`` parameter, for consistency with :class:`~scrapy.http.Request`
|
||||
(:issue:`4277`, :issue:`4279`)
|
||||
|
||||
* :ref:`Item loader processors <topics-loaders-processors>` can now be
|
||||
regular functions, they no longer need to be methods (:issue:`3899`)
|
||||
|
||||
* :class:`~scrapy.spiders.Rule` now accepts an ``errback`` parameter
|
||||
(:issue:`4000`)
|
||||
|
||||
* :class:`~scrapy.http.Request` no longer requires a ``callback`` parameter
|
||||
when an ``errback`` parameter is specified (:issue:`3586`, :issue:`4008`)
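  For example (a sketch; the URL and errback name are illustrative)::

      yield scrapy.Request(
          'https://example.com/health',
          errback=self.log_failure,
      )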
|
||||
|
||||
* :class:`~scrapy.logformatter.LogFormatter` now supports some additional
|
||||
methods:
|
||||
|
||||
* :class:`~scrapy.logformatter.LogFormatter.download_error` for
|
||||
download errors
|
||||
|
||||
* :class:`~scrapy.logformatter.LogFormatter.item_error` for exceptions
|
||||
raised during item processing by :ref:`item pipelines
|
||||
<topics-item-pipeline>`
|
||||
|
||||
* :class:`~scrapy.logformatter.LogFormatter.spider_error` for exceptions
|
||||
raised from :ref:`spider callbacks <topics-spiders>`
|
||||
|
||||
(:issue:`374`, :issue:`3986`, :issue:`3989`, :issue:`4176`, :issue:`4188`)
|
||||
|
||||
* The :setting:`FEED_URI` setting now supports :class:`pathlib.Path` values
|
||||
(:issue:`3731`, :issue:`4074`)
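  For example, in ``settings.py`` (a sketch; the path is illustrative)::

      from pathlib import Path

      FEED_URI = Path('output') / 'items.jl'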
|
||||
|
||||
* A new :signal:`request_left_downloader` signal is sent when a request
|
||||
leaves the downloader (:issue:`4303`)
|
||||
|
||||
* Scrapy logs a warning when it detects a request callback or errback that
|
||||
uses ``yield`` but also returns a value, since the returned value would be
|
||||
lost (:issue:`3484`, :issue:`3869`)
|
||||
|
||||
* :class:`~scrapy.spiders.Spider` objects now raise an :exc:`AttributeError`
|
||||
exception if they do not have a :class:`~scrapy.spiders.Spider.start_urls`
|
||||
attribute nor reimplement :class:`~scrapy.spiders.Spider.start_requests`,
|
||||
but have a ``start_url`` attribute (:issue:`4133`, :issue:`4170`)
|
||||
|
||||
* :class:`~scrapy.exporters.BaseItemExporter` subclasses may now use
|
||||
``super().__init__(**kwargs)`` instead of ``self._configure(kwargs)`` in
|
||||
their ``__init__`` method, passing ``dont_fail=True`` to the parent
|
||||
``__init__`` method if needed, and accessing ``kwargs`` at ``self._kwargs``
|
||||
after calling their parent ``__init__`` method (:issue:`4193`,
|
||||
:issue:`4370`)
|
||||
|
||||
* A new ``keep_fragments`` parameter of
|
||||
:func:`scrapy.utils.request.request_fingerprint` allows generating
|
||||
different fingerprints for requests with different fragments in their URL
|
||||
(:issue:`4104`)
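  A brief sketch of the difference (the URLs are illustrative)::

      from scrapy import Request
      from scrapy.utils.request import request_fingerprint

      r1 = Request('https://example.com/page#section-1')
      r2 = Request('https://example.com/page#section-2')
      assert request_fingerprint(r1) == request_fingerprint(r2)
      assert (request_fingerprint(r1, keep_fragments=True)
              != request_fingerprint(r2, keep_fragments=True))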
|
||||
|
||||
* Download handlers (see :setting:`DOWNLOAD_HANDLERS`) may now use the
|
||||
``from_settings`` and ``from_crawler`` class methods that other Scrapy
|
||||
components already supported (:issue:`4126`)
|
||||
|
||||
* :class:`scrapy.utils.python.MutableChain.__iter__` now returns ``self``,
|
||||
`allowing it to be used as a sequence <https://lgtm.com/rules/4850080/>`_
|
||||
(:issue:`4153`)
|
||||
|
||||
|
||||
Bug fixes
|
||||
~~~~~~~~~
|
||||
|
||||
* The :command:`crawl` command now also exits with exit code 1 when an
|
||||
exception happens before the crawling starts (:issue:`4175`, :issue:`4207`)
|
||||
|
||||
* :class:`LinkExtractor.extract_links
|
||||
<scrapy.linkextractors.lxmlhtml.LxmlLinkExtractor.extract_links>` no longer
|
||||
re-encodes the query string or URLs from non-UTF-8 responses in UTF-8
|
||||
(:issue:`998`, :issue:`1403`, :issue:`1949`, :issue:`4321`)
|
||||
|
||||
* The first spider middleware (see :setting:`SPIDER_MIDDLEWARES`) now also
|
||||
processes exceptions raised from callbacks that are generators
|
||||
(:issue:`4260`, :issue:`4272`)
|
||||
|
||||
* Redirects to URLs starting with 3 slashes (``///``) are now supported
|
||||
(:issue:`4032`, :issue:`4042`)
|
||||
|
||||
* :class:`~scrapy.http.Request` no longer accepts strings as ``url`` simply
|
||||
because they have a colon (:issue:`2552`, :issue:`4094`)
|
||||
|
||||
* The correct encoding is now used for attachment names in
  :class:`~scrapy.mail.MailSender` (:issue:`4229`, :issue:`4239`)
|
||||
|
||||
* :class:`~scrapy.dupefilters.RFPDupeFilter`, the default
|
||||
:setting:`DUPEFILTER_CLASS`, no longer writes an extra ``\r`` character on
|
||||
each line in Windows, which made the size of the ``requests.seen`` file
|
||||
unnecessarily large on that platform (:issue:`4283`)
|
||||
|
||||
* Z shell auto-completion now looks for ``.html`` files, not ``.http`` files,
|
||||
and covers the ``-h`` command-line switch (:issue:`4122`, :issue:`4291`)
|
||||
|
||||
* Adding items to a :class:`scrapy.utils.datatypes.LocalCache` object
|
||||
without a ``limit`` defined no longer raises a :exc:`TypeError` exception
|
||||
(:issue:`4123`)
|
||||
|
||||
* Fixed a typo in the message of the :exc:`ValueError` exception raised when
|
||||
:func:`scrapy.utils.misc.create_instance` gets both ``settings`` and
|
||||
``crawler`` set to ``None`` (:issue:`4128`)
|
||||
|
||||
|
||||
Documentation
|
||||
~~~~~~~~~~~~~
|
||||
|
||||
* API documentation now links to an online, syntax-highlighted view of the
|
||||
corresponding source code (:issue:`4148`)
|
||||
|
||||
* Links to nonexistent documentation pages now allow access to the sidebar
  (:issue:`4152`, :issue:`4169`)
|
||||
|
||||
* Cross-references within our documentation now display a tooltip when
|
||||
hovered (:issue:`4173`, :issue:`4183`)
|
||||
|
||||
* Improved the documentation about :meth:`LinkExtractor.extract_links
|
||||
<scrapy.linkextractors.lxmlhtml.LxmlLinkExtractor.extract_links>` and
|
||||
simplified :ref:`topics-link-extractors` (:issue:`4045`)
|
||||
|
||||
* Clarified how :class:`ItemLoader.item <scrapy.loader.ItemLoader.item>`
|
||||
works (:issue:`3574`, :issue:`4099`)
|
||||
|
||||
* Clarified that :func:`logging.basicConfig` should not be used when also
|
||||
using :class:`~scrapy.crawler.CrawlerProcess` (:issue:`2149`,
|
||||
:issue:`2352`, :issue:`3146`, :issue:`3960`)
|
||||
|
||||
* Clarified the requirements for :class:`~scrapy.http.Request` objects
|
||||
:ref:`when using persistence <request-serialization>` (:issue:`4124`,
|
||||
:issue:`4139`)
|
||||
|
||||
* Clarified how to install a :ref:`custom image pipeline
|
||||
<media-pipeline-example>` (:issue:`4034`, :issue:`4252`)
|
||||
|
||||
* Fixed the signatures of the ``file_path`` method in :ref:`media pipeline
|
||||
<topics-media-pipeline>` examples (:issue:`4290`)
|
||||
|
||||
* Covered a backward-incompatible change in Scrapy 1.7.0 affecting custom
|
||||
:class:`scrapy.core.scheduler.Scheduler` subclasses (:issue:`4274`)
|
||||
|
||||
* Improved the ``README.rst`` and ``CODE_OF_CONDUCT.md`` files
|
||||
(:issue:`4059`)
|
||||
|
||||
* Documentation examples are now checked as part of our test suite and we
|
||||
have fixed some of the issues detected (:issue:`4142`, :issue:`4146`,
|
||||
:issue:`4171`, :issue:`4184`, :issue:`4190`)
|
||||
|
||||
* Fixed logic issues, broken links and typos (:issue:`4247`, :issue:`4258`,
|
||||
:issue:`4282`, :issue:`4288`, :issue:`4305`, :issue:`4308`, :issue:`4323`,
|
||||
:issue:`4338`, :issue:`4359`, :issue:`4361`)
|
||||
|
||||
* Improved consistency when referring to the ``__init__`` method of an object
|
||||
(:issue:`4086`, :issue:`4088`)
|
||||
|
||||
* Fixed an inconsistency between code and output in :ref:`intro-overview`
|
||||
(:issue:`4213`)
|
||||
|
||||
* Extended :mod:`~sphinx.ext.intersphinx` usage (:issue:`4147`,
|
||||
:issue:`4172`, :issue:`4185`, :issue:`4194`, :issue:`4197`)
|
||||
|
||||
* We now use a recent version of Python to build the documentation
|
||||
(:issue:`4140`, :issue:`4249`)
|
||||
|
||||
* Cleaned up documentation (:issue:`4143`, :issue:`4275`)
|
||||
|
||||
|
||||
Quality assurance
|
||||
~~~~~~~~~~~~~~~~~
|
||||
|
||||
* Re-enabled proxy ``CONNECT`` tests (:issue:`2545`, :issue:`4114`)
|
||||
|
||||
* Added Bandit_ security checks to our test suite (:issue:`4162`,
|
||||
:issue:`4181`)
|
||||
|
||||
* Added Flake8_ style checks to our test suite and applied many of the
|
||||
corresponding changes (:issue:`3944`, :issue:`3945`, :issue:`4137`,
|
||||
:issue:`4157`, :issue:`4167`, :issue:`4174`, :issue:`4186`, :issue:`4195`,
|
||||
:issue:`4238`, :issue:`4246`, :issue:`4355`, :issue:`4360`, :issue:`4365`)
|
||||
|
||||
* Improved test coverage (:issue:`4097`, :issue:`4218`, :issue:`4236`)
|
||||
|
||||
* Started reporting slowest tests, and improved the performance of some of
|
||||
them (:issue:`4163`, :issue:`4164`)
|
||||
|
||||
* Fixed broken tests and refactored some tests (:issue:`4014`, :issue:`4095`,
|
||||
:issue:`4244`, :issue:`4268`, :issue:`4372`)
|
||||
|
||||
* Modified the :doc:`tox <tox:index>` configuration to allow running tests
|
||||
with any Python version, run Bandit_ and Flake8_ tests by default, and
|
||||
enforce a minimum tox version programmatically (:issue:`4179`)
|
||||
|
||||
* Cleaned up code (:issue:`3937`, :issue:`4208`, :issue:`4209`,
|
||||
:issue:`4210`, :issue:`4212`, :issue:`4369`, :issue:`4376`, :issue:`4378`)
|
||||
|
||||
.. _Bandit: https://bandit.readthedocs.io/
|
||||
.. _Flake8: https://flake8.pycqa.org/en/latest/
|
||||
|
||||
|
||||
.. _2-0-0-scheduler-queue-changes:
|
||||
|
||||
Changes to scheduler queue classes
|
||||
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
The following changes may impact any custom queue classes of all types:
|
||||
|
||||
* The ``push`` method no longer receives a second positional parameter
|
||||
containing ``request.priority * -1``. If you need that value, get it
|
||||
from the first positional parameter, ``request``, instead, or use
|
||||
the new :meth:`~scrapy.core.scheduler.ScrapyPriorityQueue.priority`
|
||||
method in :class:`scrapy.core.scheduler.ScrapyPriorityQueue`
|
||||
subclasses.
|
||||
|
||||
The following changes may impact custom priority queue classes:
|
||||
|
||||
* In the ``__init__`` method or the ``from_crawler`` or ``from_settings``
|
||||
class methods:
|
||||
|
||||
* The parameter that used to contain a factory function,
|
||||
``qfactory``, is now passed as a keyword parameter named
|
||||
``downstream_queue_cls``.
|
||||
|
||||
* A new keyword parameter has been added: ``key``. It is a string
|
||||
that is always an empty string for memory queues and indicates the
|
||||
:setting:`JOB_DIR` value for disk queues.
|
||||
|
||||
* The parameter for disk queues that contains data from the previous
|
||||
crawl, ``startprios`` or ``slot_startprios``, is now passed as a
|
||||
keyword parameter named ``startprios``.
|
||||
|
||||
* The ``serialize`` parameter is no longer passed. The disk queue
|
||||
class must take care of request serialization on its own before
|
||||
writing to disk, using the
|
||||
:func:`~scrapy.utils.reqser.request_to_dict` and
|
||||
:func:`~scrapy.utils.reqser.request_from_dict` functions from the
|
||||
:mod:`scrapy.utils.reqser` module.
|
||||
|
||||
The following changes may impact custom disk and memory queue classes:
|
||||
|
||||
* The signature of the ``__init__`` method is now
|
||||
``__init__(self, crawler, key)``.
|
||||
|
||||
The following changes affect specifically the
|
||||
:class:`~scrapy.core.scheduler.ScrapyPriorityQueue` and
|
||||
:class:`~scrapy.core.scheduler.DownloaderAwarePriorityQueue` classes from
|
||||
:mod:`scrapy.core.scheduler` and may affect subclasses:
|
||||
|
||||
* In the ``__init__`` method, most of the changes described above apply.
|
||||
|
||||
``__init__`` may still receive all parameters as positional parameters,
|
||||
however:
|
||||
|
||||
* ``downstream_queue_cls``, which replaced ``qfactory``, must be
|
||||
instantiated differently.
|
||||
|
||||
``qfactory`` was instantiated with a priority value (integer).
|
||||
|
||||
Instances of ``downstream_queue_cls`` should be created using
|
||||
the new
|
||||
:meth:`ScrapyPriorityQueue.qfactory <scrapy.core.scheduler.ScrapyPriorityQueue.qfactory>`
|
||||
or
|
||||
:meth:`DownloaderAwarePriorityQueue.pqfactory <scrapy.core.scheduler.DownloaderAwarePriorityQueue.pqfactory>`
|
||||
methods.
|
||||
|
||||
* The new ``key`` parameter displaced the ``startprios``
|
||||
parameter 1 position to the right.
|
||||
|
||||
* The following class attributes have been added:
|
||||
|
||||
* :attr:`~scrapy.core.scheduler.ScrapyPriorityQueue.crawler`
|
||||
|
||||
* :attr:`~scrapy.core.scheduler.ScrapyPriorityQueue.downstream_queue_cls`
|
||||
(details above)
|
||||
|
||||
* :attr:`~scrapy.core.scheduler.ScrapyPriorityQueue.key` (details above)
|
||||
|
||||
* The ``serialize`` attribute has been removed (details above)
|
||||
|
||||
The following changes affect specifically the
|
||||
:class:`~scrapy.core.scheduler.ScrapyPriorityQueue` class and may affect
|
||||
subclasses:
|
||||
|
||||
* A new :meth:`~scrapy.core.scheduler.ScrapyPriorityQueue.priority`
|
||||
method has been added which, given a request, returns
|
||||
``request.priority * -1``.
|
||||
|
||||
It is used in :meth:`~scrapy.core.scheduler.ScrapyPriorityQueue.push`
|
||||
to make up for the removal of its ``priority`` parameter.
|
||||
|
||||
* The ``spider`` attribute has been removed. Use
|
||||
:attr:`crawler.spider <scrapy.core.scheduler.ScrapyPriorityQueue.crawler>`
|
||||
instead.
|
||||
|
||||
The following changes affect specifically the
|
||||
:class:`~scrapy.core.scheduler.DownloaderAwarePriorityQueue` class and may
|
||||
affect subclasses:
|
||||
|
||||
* A new :attr:`~scrapy.core.scheduler.DownloaderAwarePriorityQueue.pqueues`
|
||||
attribute offers a mapping of downloader slot names to the
|
||||
corresponding instances of
|
||||
:attr:`~scrapy.core.scheduler.DownloaderAwarePriorityQueue.downstream_queue_cls`.
|
||||
|
||||
(:issue:`3884`)
|
||||
|
||||
|
||||
.. _release-1.8.0:
|
||||
|
||||
@ -26,7 +470,7 @@ Backward-incompatible changes
|
||||
* Python 3.4 is no longer supported, and some of the minimum requirements of
|
||||
Scrapy have also changed:
|
||||
|
||||
* cssselect_ 0.9.1
|
||||
* :doc:`cssselect <cssselect:index>` 0.9.1
|
||||
* cryptography_ 2.0
|
||||
* lxml_ 3.5.0
|
||||
* pyOpenSSL_ 16.2.0
|
||||
@ -288,12 +732,12 @@ Backward-incompatible changes
|
||||
:class:`~scrapy.http.Request` objects instead of arbitrary Python data
|
||||
structures.
|
||||
|
||||
* An additional ``crawler`` parameter has been added to the ``__init__`` method
|
||||
of the :class:`scrapy.core.scheduler.Scheduler` class.
|
||||
Custom scheduler subclasses which don't accept arbitrary parameters in
|
||||
their ``__init__`` method might break because of this change.
|
||||
* An additional ``crawler`` parameter has been added to the ``__init__``
|
||||
method of the :class:`~scrapy.core.scheduler.Scheduler` class. Custom
|
||||
scheduler subclasses which don't accept arbitrary parameters in their
|
||||
``__init__`` method might break because of this change.
|
||||
|
||||
For more information, refer to the documentation for the :setting:`SCHEDULER` setting.
|
||||
For more information, see :setting:`SCHEDULER`.
|
||||
|
||||
See also :ref:`1.7-deprecation-removals` below.
|
||||
|
||||
@ -1076,7 +1520,7 @@ Cleanups & Refactoring
|
||||
~~~~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
- Tests: remove temp files and folders (:issue:`2570`),
|
||||
fixed ProjectUtilsTest on OS X (:issue:`2569`),
|
||||
fixed ProjectUtilsTest on macOS (:issue:`2569`),
|
||||
use portable pypy for Linux on Travis CI (:issue:`2710`)
|
||||
- Separate building request from ``_requests_to_follow`` in CrawlSpider (:issue:`2562`)
|
||||
- Remove “Python 3 progress” badge (:issue:`2567`)
|
||||
@ -1616,7 +2060,7 @@ Deprecations and Removals
|
||||
+ ``scrapy.utils.datatypes.SiteNode``
|
||||
|
||||
- The previously bundled ``scrapy.xlib.pydispatch`` library was deprecated and
|
||||
replaced by `pydispatcher <https://pypi.python.org/pypi/PyDispatcher>`_.
|
||||
replaced by `pydispatcher <https://pypi.org/project/PyDispatcher/>`_.
|
||||
|
||||
|
||||
Relocations
|
||||
@ -1645,7 +2089,7 @@ Bugfixes
|
||||
- Makes ``_monkeypatches`` more robust (:issue:`1634`).
|
||||
- Fixed bug on ``XMLItemExporter`` with non-string fields in
|
||||
items (:issue:`1738`).
|
||||
- Fixed startproject command in OS X (:issue:`1635`).
|
||||
- Fixed startproject command in macOS (:issue:`1635`).
|
||||
- Fixed :class:`~scrapy.exporters.PythonItemExporter` and CSVExporter for
|
||||
non-string item types (:issue:`1737`).
|
||||
- Various logging related fixes (:issue:`1294`, :issue:`1419`, :issue:`1263`,
|
||||
@ -1713,12 +2157,12 @@ Scrapy 1.0.4 (2015-12-30)
|
||||
- Typos corrections (:commit:`7067117`)
|
||||
- fix typos in downloader-middleware.rst and exceptions.rst, middlware -> middleware (:commit:`32f115c`)
|
||||
- Add note to Ubuntu install section about Debian compatibility (:commit:`23fda69`)
|
||||
- Replace alternative OSX install workaround with virtualenv (:commit:`98b63ee`)
|
||||
- Replace alternative macOS install workaround with virtualenv (:commit:`98b63ee`)
|
||||
- Reference Homebrew's homepage for installation instructions (:commit:`1925db1`)
|
||||
- Add oldest supported tox version to contributing docs (:commit:`5d10d6d`)
|
||||
- Note in install docs about pip being already included in python>=2.7.9 (:commit:`85c980e`)
|
||||
- Add non-python dependencies to Ubuntu install section in the docs (:commit:`fbd010d`)
|
||||
- Add OS X installation section to docs (:commit:`d8f4cba`)
|
||||
- Add macOS installation section to docs (:commit:`d8f4cba`)
|
||||
- DOC(ENH): specify path to rtd theme explicitly (:commit:`de73b1a`)
|
||||
- minor: scrapy.Spider docs grammar (:commit:`1ddcc7b`)
|
||||
- Make common practices sample code match the comments (:commit:`1b85bcf`)
|
||||
@ -2450,7 +2894,7 @@ Other
|
||||
~~~~~
|
||||
|
||||
- Dropped Python 2.6 support (:issue:`448`)
|
||||
- Add `cssselect`_ python package as install dependency
|
||||
- Add :doc:`cssselect <cssselect:index>` python package as install dependency
|
||||
- Drop libxml2 and multi selector's backend support, `lxml`_ is required from now on.
|
||||
- Minimum Twisted version increased to 10.0.0, dropped Twisted 8.0 support.
|
||||
- Running test suite now requires ``mock`` python library (:issue:`390`)
|
||||
@ -2571,7 +3015,7 @@ Scrapy 0.18.0 (released 2013-08-09)
|
||||
- MetaRefreshMiddleware and RedirectMiddleware have different priorities to address #62
|
||||
- added from_crawler method to spiders
|
||||
- added system tests with mock server
|
||||
- more improvements to Mac OS compatibility (thanks Alex Cepoi)
|
||||
- more improvements to macOS compatibility (thanks Alex Cepoi)
|
||||
- several more cleanups to singletons and multi-spider support (thanks Nicolas Ramirez)
|
||||
- support custom download slots
|
||||
- added --spider option to "shell" command.
|
||||
@ -2647,7 +3091,7 @@ Scrapy 0.16.3 (released 2012-12-07)
|
||||
|
||||
- Remove concurrency limitation when using download delays and still ensure inter-request delays are enforced (:commit:`487b9b5`)
|
||||
- add error details when image pipeline fails (:commit:`8232569`)
|
||||
- improve mac os compatibility (:commit:`8dcf8aa`)
|
||||
- improve macOS compatibility (:commit:`8dcf8aa`)
|
||||
- setup.py: use README.rst to populate long_description (:commit:`7b5310d`)
|
||||
- doc: removed obsolete references to ClientForm (:commit:`80f9bb6`)
|
||||
- correct docs for default storage backend (:commit:`2aa491b`)
|
||||
@ -3047,17 +3491,16 @@ Scrapy 0.7
|
||||
First release of Scrapy.
|
||||
|
||||
|
||||
.. _AJAX crawleable urls: https://developers.google.com/webmasters/ajax-crawling/docs/getting-started?csw=1
|
||||
.. _AJAX crawleable urls: https://developers.google.com/search/docs/ajax-crawling/docs/getting-started?csw=1
|
||||
.. _botocore: https://github.com/boto/botocore
|
||||
.. _chunked transfer encoding: https://en.wikipedia.org/wiki/Chunked_transfer_encoding
|
||||
.. _ClientForm: http://wwwsearch.sourceforge.net/old/ClientForm/
|
||||
.. _Creating a pull request: https://help.github.com/en/articles/creating-a-pull-request
|
||||
.. _cryptography: https://cryptography.io/en/latest/
|
||||
.. _cssselect: https://github.com/scrapy/cssselect/
|
||||
.. _docstrings: https://docs.python.org/glossary.html#term-docstring
|
||||
.. _KeyboardInterrupt: https://docs.python.org/library/exceptions.html#KeyboardInterrupt
|
||||
.. _docstrings: https://docs.python.org/3/glossary.html#term-docstring
|
||||
.. _KeyboardInterrupt: https://docs.python.org/3/library/exceptions.html#KeyboardInterrupt
|
||||
.. _LevelDB: https://github.com/google/leveldb
|
||||
.. _lxml: http://lxml.de/
|
||||
.. _lxml: https://lxml.de/
|
||||
.. _marshal: https://docs.python.org/2/library/marshal.html
|
||||
.. _parsel.csstranslator.GenericTranslator: https://parsel.readthedocs.io/en/latest/parsel.html#parsel.csstranslator.GenericTranslator
|
||||
.. _parsel.csstranslator.HTMLTranslator: https://parsel.readthedocs.io/en/latest/parsel.html#parsel.csstranslator.HTMLTranslator
|
||||
@ -3068,11 +3511,11 @@ First release of Scrapy.
|
||||
.. _queuelib: https://github.com/scrapy/queuelib
|
||||
.. _registered with IANA: https://www.iana.org/assignments/media-types/media-types.xhtml
|
||||
.. _resource: https://docs.python.org/2/library/resource.html
|
||||
.. _robots.txt: http://www.robotstxt.org/
|
||||
.. _robots.txt: https://www.robotstxt.org/
|
||||
.. _scrapely: https://github.com/scrapy/scrapely
|
||||
.. _service_identity: https://service-identity.readthedocs.io/en/stable/
|
||||
.. _six: https://six.readthedocs.io/
|
||||
.. _tox: https://pypi.python.org/pypi/tox
|
||||
.. _tox: https://pypi.org/project/tox/
|
||||
.. _Twisted: https://twistedmatrix.com/trac/
|
||||
.. _Twisted - hello, asynchronous programming: http://jessenoller.com/blog/2009/02/11/twisted-hello-asynchronous-programming/
|
||||
.. _w3lib: https://github.com/scrapy/w3lib
|
||||
|
28  docs/topics/asyncio.rst  (new file)
@@ -0,0 +1,28 @@
=======
asyncio
=======

.. versionadded:: 2.0

Scrapy has partial support for :mod:`asyncio`. After you :ref:`install the
asyncio reactor <install-asyncio>`, you may use :mod:`asyncio` and
:mod:`asyncio`-powered libraries in any :doc:`coroutine <coroutines>`.

.. warning:: :mod:`asyncio` support in Scrapy is experimental. Future Scrapy
             versions may introduce related changes without a deprecation
             period or warning.

.. _install-asyncio:

Installing the asyncio reactor
==============================

To enable :mod:`asyncio` support, set the :setting:`TWISTED_REACTOR` setting to
``'twisted.internet.asyncioreactor.AsyncioSelectorReactor'``.

If you are using :class:`~scrapy.crawler.CrawlerRunner`, you also need to
install the :class:`~twisted.internet.asyncioreactor.AsyncioSelectorReactor`
reactor manually. You can do that using
:func:`~scrapy.utils.reactor.install_reactor`::

    install_reactor('twisted.internet.asyncioreactor.AsyncioSelectorReactor')
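For example, a project could enable the reactor from ``settings.py`` (a minimal
sketch; the reactor path is the one documented above, the rest is illustrative)::

    # settings.py
    TWISTED_REACTOR = 'twisted.internet.asyncioreactor.AsyncioSelectorReactor'

When driving the crawl with :class:`~scrapy.crawler.CrawlerRunner`, the reactor
is installed before creating the runner::

    from scrapy.crawler import CrawlerRunner
    from scrapy.utils.reactor import install_reactor

    install_reactor('twisted.internet.asyncioreactor.AsyncioSelectorReactor')
    runner = CrawlerRunner()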
@ -188,7 +188,7 @@ AjaxCrawlMiddleware helps to crawl them correctly.
|
||||
It is turned OFF by default because it has some performance overhead,
|
||||
and enabling it for focused crawls doesn't make much sense.
|
||||
|
||||
.. _ajax crawlable: https://developers.google.com/webmasters/ajax-crawling/docs/getting-started
|
||||
.. _ajax crawlable: https://developers.google.com/search/docs/ajax-crawling/docs/getting-started
|
||||
|
||||
.. _broad-crawls-bfo:
|
||||
|
||||
|
110  docs/topics/coroutines.rst  (new file)
@ -0,0 +1,110 @@
|
||||
==========
|
||||
Coroutines
|
||||
==========
|
||||
|
||||
.. versionadded:: 2.0
|
||||
|
||||
Scrapy has :ref:`partial support <coroutine-support>` for the
|
||||
:ref:`coroutine syntax <async>`.
|
||||
|
||||
.. warning:: :mod:`asyncio` support in Scrapy is experimental. Future Scrapy
|
||||
versions may introduce related API and behavior changes without a
|
||||
deprecation period or warning.
|
||||
|
||||
.. _coroutine-support:
|
||||
|
||||
Supported callables
|
||||
===================
|
||||
|
||||
The following callables may be defined as coroutines using ``async def``, and
|
||||
hence use coroutine syntax (e.g. ``await``, ``async for``, ``async with``):
|
||||
|
||||
- :class:`~scrapy.http.Request` callbacks.
|
||||
|
||||
The following are known caveats of the current implementation that we aim
|
||||
to address in future versions of Scrapy:
|
||||
|
||||
- The callback output is not processed until the whole callback finishes.
|
||||
|
||||
As a side effect, if the callback raises an exception, none of its
|
||||
output is processed.
|
||||
|
||||
- Because `asynchronous generators were introduced in Python 3.6`_, you
|
||||
can only use ``yield`` if you are using Python 3.6 or later.
|
||||
|
||||
If you need to output multiple items or requests and you are using
|
||||
Python 3.5, return an iterable (e.g. a list) instead.
|
||||
|
||||
- The :meth:`process_item` method of
|
||||
:ref:`item pipelines <topics-item-pipeline>`.
|
||||
|
||||
- The
|
||||
:meth:`~scrapy.downloadermiddlewares.DownloaderMiddleware.process_request`,
|
||||
:meth:`~scrapy.downloadermiddlewares.DownloaderMiddleware.process_response`,
|
||||
and
|
||||
:meth:`~scrapy.downloadermiddlewares.DownloaderMiddleware.process_exception`
|
||||
methods of
|
||||
:ref:`downloader middlewares <topics-downloader-middleware-custom>`.
|
||||
|
||||
- :ref:`Signal handlers that support deferreds <signal-deferred>`.
|
||||
|
||||
.. _asynchronous generators were introduced in Python 3.6: https://www.python.org/dev/peps/pep-0525/
|
||||
|
||||
Usage
|
||||
=====
|
||||
|
||||
There are several use cases for coroutines in Scrapy. Code that would
|
||||
return Deferreds when written for previous Scrapy versions, such as downloader
|
||||
middlewares and signal handlers, can be rewritten to be shorter and cleaner::
|
||||
|
||||
class DbPipeline:
|
||||
def _update_item(self, data, item):
|
||||
item['field'] = data
|
||||
return item
|
||||
|
||||
def process_item(self, item, spider):
|
||||
dfd = db.get_some_data(item['id'])
|
||||
dfd.addCallback(self._update_item, item)
|
||||
return dfd
|
||||
|
||||
becomes::
|
||||
|
||||
class DbPipeline:
|
||||
async def process_item(self, item, spider):
|
||||
item['field'] = await db.get_some_data(item['id'])
|
||||
return item
|
||||
|
||||
Coroutines may be used to call asynchronous code. This includes other
|
||||
coroutines, functions that return Deferreds and functions that return
|
||||
`awaitable objects`_ such as :class:`~asyncio.Future`. This means you can use
|
||||
many useful Python libraries providing such code::
|
||||
|
||||
class MySpider(Spider):
|
||||
# ...
|
||||
async def parse_with_deferred(self, response):
|
||||
additional_response = await treq.get('https://additional.url')
|
||||
additional_data = await treq.content(additional_response)
|
||||
# ... use response and additional_data to yield items and requests
|
||||
|
||||
async def parse_with_asyncio(self, response):
|
||||
async with aiohttp.ClientSession() as session:
|
||||
async with session.get('https://additional.url') as additional_response:
|
||||
additional_data = await additional_response.text()
|
||||
# ... use response and additional_data to yield items and requests
|
||||
|
||||
.. note:: Many libraries that use coroutines, such as `aio-libs`_, require the
|
||||
:mod:`asyncio` loop and to use them you need to
|
||||
:doc:`enable asyncio support in Scrapy<asyncio>`.
|
||||
|
||||
Common use cases for asynchronous code include:
|
||||
|
||||
* requesting data from websites, databases and other services (in callbacks,
|
||||
pipelines and middlewares);
|
||||
* storing data in databases (in pipelines and middlewares);
|
||||
* delaying the spider initialization until some external event (in the
|
||||
:signal:`spider_opened` handler);
|
||||
* calling asynchronous Scrapy methods like ``ExecutionEngine.download`` (see
|
||||
:ref:`the screenshot pipeline example<ScreenshotPipeline>`).
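For instance, a :ref:`deferred-aware signal handler <signal-deferred>` can
itself be written as a coroutine (a hypothetical sketch: ``DbExtension`` and
``connect_to_database`` are illustrative names, only the signal API is Scrapy's)::

    from scrapy import signals

    class DbExtension:
        @classmethod
        def from_crawler(cls, crawler):
            ext = cls()
            crawler.signals.connect(ext.spider_opened, signal=signals.spider_opened)
            return ext

        async def spider_opened(self, spider):
            # The crawl waits for this coroutine before sending requests.
            spider.db = await connect_to_database(spider.settings.get('DB_URI'))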
|
||||
|
||||
.. _aio-libs: https://github.com/aio-libs
|
||||
.. _awaitable objects: https://docs.python.org/3/glossary.html#term-awaitable
|
@ -709,7 +709,7 @@ HttpCompressionMiddleware
|
||||
provided `brotlipy`_ is installed.
|
||||
|
||||
.. _brotli-compressed: https://www.ietf.org/rfc/rfc7932.txt
|
||||
.. _brotlipy: https://pypi.python.org/pypi/brotlipy
|
||||
.. _brotlipy: https://pypi.org/project/brotlipy/
|
||||
|
||||
HttpCompressionMiddleware Settings
|
||||
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||
@ -872,6 +872,10 @@ Default: ``[]``
|
||||
|
||||
Meta tags within these tags are ignored.
|
||||
|
||||
.. versionchanged:: 2.0
|
||||
The default value of :setting:`METAREFRESH_IGNORE_TAGS` changed from
|
||||
``['script', 'noscript']`` to ``[]``.
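Projects that relied on the previous behavior can restore it explicitly in
``settings.py`` (a minimal sketch; only the setting name is Scrapy's)::

    METAREFRESH_IGNORE_TAGS = ['script', 'noscript']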
|
||||
|
||||
.. setting:: METAREFRESH_MAXDELAY
|
||||
|
||||
METAREFRESH_MAXDELAY
|
||||
@ -1038,7 +1042,7 @@ Based on `RobotFileParser
|
||||
* is Python's built-in robots.txt_ parser
|
||||
|
||||
* is compliant with `Martijn Koster's 1996 draft specification
|
||||
<http://www.robotstxt.org/norobots-rfc.txt>`_
|
||||
<https://www.robotstxt.org/norobots-rfc.txt>`_
|
||||
|
||||
* lacks support for wildcard matching
|
||||
|
||||
@ -1061,7 +1065,7 @@ Based on `Reppy <https://github.com/seomoz/reppy/>`_:
|
||||
<https://github.com/seomoz/rep-cpp>`_
|
||||
|
||||
* is compliant with `Martijn Koster's 1996 draft specification
|
||||
<http://www.robotstxt.org/norobots-rfc.txt>`_
|
||||
<https://www.robotstxt.org/norobots-rfc.txt>`_
|
||||
|
||||
* supports wildcard matching
|
||||
|
||||
@ -1086,7 +1090,7 @@ Based on `Robotexclusionrulesparser <http://nikitathespider.com/python/rerp/>`_:
|
||||
* implemented in Python
|
||||
|
||||
* is compliant with `Martijn Koster's 1996 draft specification
|
||||
<http://www.robotstxt.org/norobots-rfc.txt>`_
|
||||
<https://www.robotstxt.org/norobots-rfc.txt>`_
|
||||
|
||||
* supports wildcard matching
|
||||
|
||||
@ -1115,7 +1119,7 @@ implementing the methods described below.
|
||||
.. autoclass:: RobotParser
|
||||
:members:
|
||||
|
||||
.. _robots.txt: http://www.robotstxt.org/
|
||||
.. _robots.txt: https://www.robotstxt.org/
|
||||
|
||||
DownloaderStats
|
||||
---------------
|
||||
@ -1155,7 +1159,7 @@ AjaxCrawlMiddleware
|
||||
|
||||
Middleware that finds 'AJAX crawlable' page variants based
|
||||
on meta-fragment html tag. See
|
||||
https://developers.google.com/webmasters/ajax-crawling/docs/getting-started
|
||||
https://developers.google.com/search/docs/ajax-crawling/docs/getting-started
|
||||
for more info.
|
||||
|
||||
.. note::
|
||||
|
@ -241,12 +241,12 @@ along with `scrapy-selenium`_ for seamless integration.
|
||||
.. _headless browser: https://en.wikipedia.org/wiki/Headless_browser
|
||||
.. _JavaScript: https://en.wikipedia.org/wiki/JavaScript
|
||||
.. _js2xml: https://github.com/scrapinghub/js2xml
|
||||
.. _json.loads: https://docs.python.org/library/json.html#json.loads
|
||||
.. _json.loads: https://docs.python.org/3/library/json.html#json.loads
|
||||
.. _pytesseract: https://github.com/madmaze/pytesseract
|
||||
.. _regular expression: https://docs.python.org/library/re.html
|
||||
.. _regular expression: https://docs.python.org/3/library/re.html
|
||||
.. _scrapy-selenium: https://github.com/clemfromspace/scrapy-selenium
|
||||
.. _scrapy-splash: https://github.com/scrapy-plugins/scrapy-splash
|
||||
.. _Selenium: https://www.seleniumhq.org/
|
||||
.. _Selenium: https://www.selenium.dev/
|
||||
.. _Splash: https://github.com/scrapinghub/splash
|
||||
.. _tabula-py: https://github.com/chezou/tabula-py
|
||||
.. _wget: https://www.gnu.org/software/wget/
|
||||
|
@ -137,7 +137,7 @@ output examples, which assume you're exporting these two items::
|
||||
BaseItemExporter
|
||||
----------------
|
||||
|
||||
.. class:: BaseItemExporter(fields_to_export=None, export_empty_fields=False, encoding='utf-8', indent=0)
|
||||
.. class:: BaseItemExporter(fields_to_export=None, export_empty_fields=False, encoding='utf-8', indent=0, dont_fail=False)
|
||||
|
||||
This is the (abstract) base class for all Item Exporters. It provides
|
||||
support for common features used by all (concrete) Item Exporters, such as
|
||||
@ -148,6 +148,9 @@ BaseItemExporter
|
||||
populate their respective instance attributes: :attr:`fields_to_export`,
|
||||
:attr:`export_empty_fields`, :attr:`encoding`, :attr:`indent`.
|
||||
|
||||
.. versionadded:: 2.0
|
||||
The *dont_fail* parameter.
|
||||
|
||||
.. method:: export_item(item)
|
||||
|
||||
Exports the given item. This method must be implemented in subclasses.
|
||||
|
@ -236,6 +236,9 @@ supported URI schemes.
|
||||
|
||||
This setting is required for enabling the feed exports.
|
||||
|
||||
.. versionchanged:: 2.0
|
||||
Added :class:`pathlib.Path` support.
|
||||
|
||||
.. setting:: FEED_FORMAT
|
||||
|
||||
FEED_FORMAT
|
||||
|
@ -158,18 +158,20 @@ method and how to clean up the resources properly.::
|
||||
self.db[self.collection_name].insert_one(dict(item))
|
||||
return item
|
||||
|
||||
.. _MongoDB: https://www.mongodb.org/
|
||||
.. _pymongo: https://api.mongodb.org/python/current/
|
||||
.. _MongoDB: https://www.mongodb.com/
|
||||
.. _pymongo: https://api.mongodb.com/python/current/
|
||||
|
||||
|
||||
.. _ScreenshotPipeline:
|
||||
|
||||
Take screenshot of item
|
||||
-----------------------
|
||||
|
||||
This example demonstrates how to return a
|
||||
:class:`~twisted.internet.defer.Deferred` from the :meth:`process_item` method.
|
||||
It uses Splash_ to render a screenshot of the item URL. The pipeline
makes a request to a locally running instance of Splash_. After the request is downloaded
and the Deferred callback fires, it saves the item to a file and adds the filename to the item.
makes a request to a locally running instance of Splash_. After the request is downloaded,
it saves the screenshot to a file and adds the filename to the item.
|
||||
|
||||
::
|
||||
|
||||
@ -184,15 +186,12 @@ and Deferred callback fires, it saves item to a file and adds filename to an ite
|
||||
|
||||
SPLASH_URL = "http://localhost:8050/render.png?url={}"
|
||||
|
||||
def process_item(self, item, spider):
|
||||
async def process_item(self, item, spider):
|
||||
encoded_item_url = quote(item["url"])
|
||||
screenshot_url = self.SPLASH_URL.format(encoded_item_url)
|
||||
request = scrapy.Request(screenshot_url)
|
||||
dfd = spider.crawler.engine.download(request, spider)
|
||||
dfd.addBoth(self.return_item, item)
|
||||
return dfd
|
||||
response = await spider.crawler.engine.download(request, spider)
|
||||
|
||||
def return_item(self, response, item):
|
||||
if response.status != 200:
|
||||
# Error happened, return item.
|
||||
return item
|
||||
|
@ -166,7 +166,7 @@ If your item contains mutable_ values like lists or dictionaries, a shallow
|
||||
copy will keep references to the same mutable values across all different
|
||||
copies.
|
||||
|
||||
.. _mutable: https://docs.python.org/glossary.html#term-mutable
|
||||
.. _mutable: https://docs.python.org/3/glossary.html#term-mutable
|
||||
|
||||
For example, if you have an item with a list of tags, and you create a shallow
|
||||
copy of that item, both the original item and the copy have the same list of
|
||||
@ -177,7 +177,7 @@ If that is not the desired behavior, use a deep copy instead.
|
||||
|
||||
See the `documentation of the copy module`_ for more information.
|
||||
|
||||
.. _documentation of the copy module: https://docs.python.org/library/copy.html
|
||||
.. _documentation of the copy module: https://docs.python.org/3/library/copy.html
|
||||
|
||||
To create a shallow copy of an item, you can either call
|
||||
:meth:`~scrapy.item.Item.copy` on an existing item
|
||||
|
@ -68,6 +68,9 @@ Cookies may expire. So, if you don't resume your spider quickly the requests
|
||||
scheduled may no longer work. This won't be an issue if your spider doesn't rely
|
||||
on cookies.
|
||||
|
||||
|
||||
.. _request-serialization:
|
||||
|
||||
Request serialization
|
||||
---------------------
|
||||
|
||||
|
@ -206,7 +206,7 @@ objects. If this is your case, and you can't find your leaks using ``trackref``,
|
||||
you still have another resource: the `Guppy library`_.
|
||||
If you're using Python 3, see :ref:`topics-leaks-muppy`.
|
||||
|
||||
.. _Guppy library: https://pypi.python.org/pypi/guppy
|
||||
.. _Guppy library: https://pypi.org/project/guppy/
|
||||
|
||||
If you use ``pip``, you can install Guppy with the following command::
|
||||
|
||||
@ -311,9 +311,9 @@ though neither Scrapy nor your project are leaking memory. This is due to a
|
||||
(not so well) known problem of Python, which may not return released memory to
|
||||
the operating system in some cases. For more information on this issue see:
|
||||
|
||||
* `Python Memory Management <http://www.evanjones.ca/python-memory.html>`_
|
||||
* `Python Memory Management Part 2 <http://www.evanjones.ca/python-memory-part2.html>`_
|
||||
* `Python Memory Management Part 3 <http://www.evanjones.ca/python-memory-part3.html>`_
|
||||
* `Python Memory Management <https://www.evanjones.ca/python-memory.html>`_
|
||||
* `Python Memory Management Part 2 <https://www.evanjones.ca/python-memory-part2.html>`_
|
||||
* `Python Memory Management Part 3 <https://www.evanjones.ca/python-memory-part3.html>`_
|
||||
|
||||
The improvements proposed by Evan Jones, which are detailed in `this paper`_,
|
||||
got merged in Python 2.5, but this only reduces the problem, it doesn't fix it
|
||||
@ -327,7 +327,7 @@ completely. To quote the paper:
|
||||
to move to a compacting garbage collector, which is able to move objects in
|
||||
memory. This would require significant changes to the Python interpreter.*
|
||||
|
||||
.. _this paper: http://www.evanjones.ca/memoryallocator/
|
||||
.. _this paper: https://www.evanjones.ca/memoryallocator/
|
||||
|
||||
To keep memory consumption reasonable you can split the job into several
|
||||
smaller jobs or enable :ref:`persistent job queue <topics-jobs>`
|
||||
|
@ -64,9 +64,13 @@ LxmlLinkExtractor
|
||||
|
||||
:param deny_extensions: a single value or list of strings containing
|
||||
extensions that should be ignored when extracting links.
|
||||
If not given, it will default to the
|
||||
``IGNORED_EXTENSIONS`` list defined in the
|
||||
`scrapy.linkextractors`_ package.
|
||||
If not given, it will default to
|
||||
:data:`scrapy.linkextractors.IGNORED_EXTENSIONS`.
|
||||
|
||||
.. versionchanged:: 2.0
|
||||
:data:`~scrapy.linkextractors.IGNORED_EXTENSIONS` now includes
|
||||
``7z``, ``7zip``, ``apk``, ``bz2``, ``cdr``, ``dmg``, ``ico``,
|
||||
``iso``, ``tar``, ``tar.gz``, ``webm``, and ``xz``.
|
||||
:type deny_extensions: list
|
||||
|
||||
:param restrict_xpaths: is an XPath (or list of XPath's) which defines
|
||||
|
@ -136,6 +136,9 @@ with the data to be parsed, and return a parsed value. So you can use any
|
||||
function as input or output processor. The only requirement is that they must
|
||||
accept one (and only one) positional argument, which will be an iterable.
|
||||
|
||||
.. versionchanged:: 2.0
|
||||
Processors no longer need to be methods.
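For instance, a plain function with that signature can now be used directly
(an illustrative sketch; only the loader API is Scrapy's)::

    from scrapy.loader import ItemLoader
    from scrapy.loader.processors import MapCompose

    def to_float(values):
        # Output processors receive the collected iterable of values.
        return float(values[0])

    class ProductLoader(ItemLoader):
        price_in = MapCompose(str.strip)
        price_out = to_float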
|
||||
|
||||
.. note:: Both input and output processors must receive an iterable as their
|
||||
first argument. The output of those functions can be anything. The result of
|
||||
input processors will be appended to an internal list (in the Loader)
|
||||
|
@ -116,12 +116,6 @@ For the Images Pipeline, set the :setting:`IMAGES_STORE` setting::
|
||||
Supported Storage
|
||||
=================
|
||||
|
||||
File system is currently the only officially supported storage, but there is
also support for storing files in `Amazon S3`_ and `Google Cloud Storage`_.
|
||||
|
||||
.. _Amazon S3: https://aws.amazon.com/s3/
|
||||
.. _Google Cloud Storage: https://cloud.google.com/storage/
|
||||
|
||||
File system storage
|
||||
-------------------
|
||||
|
||||
@ -147,9 +141,13 @@ Where:
|
||||
* ``full`` is a sub-directory to separate full images from thumbnails (if
|
||||
used). For more info see :ref:`topics-images-thumbnails`.
|
||||
|
||||
.. _media-pipeline-ftp:
|
||||
|
||||
FTP server storage
|
||||
------------------
|
||||
|
||||
.. versionadded:: 2.0
|
||||
|
||||
:setting:`FILES_STORE` and :setting:`IMAGES_STORE` can point to an FTP server.
|
||||
Scrapy will automatically upload the files to the server.
|
||||
|
||||
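As a sketch of the FTP storage described above (not part of the diff), pointing :setting:`FILES_STORE` at an FTP server is assumed here to use a standard ``ftp://`` URL; the credentials, host and path are placeholders::

    # settings.py -- hypothetical values, adjust to your server
    ITEM_PIPELINES = {'scrapy.pipelines.files.FilesPipeline': 1}
    FILES_STORE = 'ftp://user:password@ftp.example.com/path/to/store'
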

@ -573,6 +571,8 @@ See here the methods that you can override in your custom Images Pipeline:
By default, the :meth:`item_completed` method returns the item.

.. _media-pipeline-example:

Custom Images pipeline example
==============================

@ -31,6 +31,8 @@ Request objects
a :class:`Response`.

:param url: the URL of this request

If the URL is invalid, a :exc:`ValueError` exception is raised.
:type url: string

:param callback: the function that will be called with the response of this

@ -125,6 +127,10 @@ Request objects
:exc:`~twisted.python.failure.Failure` as first parameter.
For more information,
see :ref:`topics-request-response-ref-errbacks` below.

.. versionchanged:: 2.0
   The *callback* parameter is no longer required when the *errback*
   parameter is specified.

:type errback: callable

:param flags: Flags sent to the request, can be used for logging or similar purposes.
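A hedged sketch of the 2.0 behaviour noted above (``callback`` becomes optional once ``errback`` is given); the spider name and URL are illustrative only::

    import scrapy

    class HealthCheckSpider(scrapy.Spider):
        name = 'healthcheck'

        def start_requests(self):
            # Since Scrapy 2.0 an errback-only Request is accepted;
            # successful responses simply fall through to self.parse.
            yield scrapy.Request('https://example.com/ping', errback=self.on_error)

        def parse(self, response):
            self.logger.info('OK: %s', response.url)

        def on_error(self, failure):
            self.logger.error(repr(failure))
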

@ -396,7 +402,7 @@ The FormRequest class extends the base :class:`Request` with functionality for
dealing with HTML forms. It uses `lxml.html forms`_ to pre-populate form
fields with form data from :class:`Response` objects.
.. _lxml.html forms: http://lxml.de/lxmlhtml.html#forms
.. _lxml.html forms: https://lxml.de/lxmlhtml.html#forms
.. class:: FormRequest(url, [formdata, ...])

@ -680,6 +686,8 @@ Response objects
.. attribute:: Response.cb_kwargs

.. versionadded:: 2.0

A shortcut to the :attr:`Request.cb_kwargs` attribute of the
:attr:`Response.request` object (i.e. ``self.request.cb_kwargs``).
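To make the shortcut concrete, an illustrative sketch (not taken from the diff), assuming the two functions below are methods of a :class:`~scrapy.Spider` subclass with ``import scrapy`` at module level::

    def parse(self, response):
        yield scrapy.Request(
            'https://example.com/page/2',
            callback=self.parse_page,
            cb_kwargs={'page': 2},
        )

    def parse_page(self, response, page):
        # Since 2.0, response.cb_kwargs is the same dict as
        # response.request.cb_kwargs, i.e. {'page': 2} here.
        self.logger.info('Parsed page %d', page)
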

@ -35,12 +35,11 @@ defines selectors to associate those styles with specific HTML elements.
in speed and parsing accuracy to lxml.
.. _BeautifulSoup: https://www.crummy.com/software/BeautifulSoup/
.. _lxml: http://lxml.de/
.. _lxml: https://lxml.de/
.. _ElementTree: https://docs.python.org/2/library/xml.etree.elementtree.html
.. _cssselect: https://pypi.python.org/pypi/cssselect/
.. _XPath: https://www.w3.org/TR/xpath
.. _XPath: https://www.w3.org/TR/xpath/all/
.. _CSS: https://www.w3.org/TR/selectors
.. _parsel: https://parsel.readthedocs.io/
.. _parsel: https://parsel.readthedocs.io/en/latest/

Using selectors
===============

@ -255,7 +254,7 @@ that Scrapy (parsel) implements a couple of **non-standard pseudo-elements**:
They will most probably not work with other libraries like
`lxml`_ or `PyQuery`_.
.. _PyQuery: https://pypi.python.org/pypi/pyquery
.. _PyQuery: https://pypi.org/project/pyquery/

Examples:

@ -309,7 +308,7 @@ Examples:
make much sense: text nodes do not have attributes, and attribute values
are string values already and do not have children nodes.
.. _CSS Selectors: https://www.w3.org/TR/css3-selectors/#selectors
.. _CSS Selectors: https://www.w3.org/TR/selectors-3/#selectors

.. _topics-selectors-nesting-selectors:

@ -504,7 +503,7 @@ Another common case would be to extract all direct ``<p>`` children:
For more details about relative XPaths see the `Location Paths`_ section in the
XPath specification.
.. _Location Paths: https://www.w3.org/TR/xpath#location-paths
.. _Location Paths: https://www.w3.org/TR/xpath/all/#location-paths

When querying by class, consider using CSS
------------------------------------------

@ -612,7 +611,7 @@ But using the ``.`` to mean the node, works:
>>> sel.xpath("//a[contains(., 'Next Page')]").getall()
['<a href="#">Click here to go to the <strong>Next Page</strong></a>']
.. _`XPath string function`: https://www.w3.org/TR/xpath/#section-String-Functions
.. _`XPath string function`: https://www.w3.org/TR/xpath/all/#section-String-Functions

.. _topics-selectors-xpath-variables:

@ -764,7 +763,7 @@ Set operations
These can be handy for excluding parts of a document tree before
extracting text elements for example.
Example extracting microdata (sample content taken from http://schema.org/Product)
Example extracting microdata (sample content taken from https://schema.org/Product)
with groups of itemscopes and corresponding itemprops::
>>> doc = u"""

@ -381,6 +381,8 @@ DNS in-memory cache size.
DNS_RESOLVER
------------

.. versionadded:: 2.0

Default: ``'scrapy.resolver.CachingThreadedResolver'``
The class to be used to resolve DNS names. The default ``scrapy.resolver.CachingThreadedResolver``

@ -1258,6 +1260,9 @@ does not work together with :setting:`CONCURRENT_REQUESTS_PER_IP`.
SCRAPER_SLOT_MAX_ACTIVE_SIZE
----------------------------

.. versionadded:: 2.0

Default: ``5_000_000``
Soft limit (in bytes) for response data being processed.

@ -1447,24 +1452,36 @@ in the ``project`` subdirectory.
TWISTED_REACTOR
---------------

.. versionadded:: 2.0

Default: ``None``

Import path of a given Twisted reactor, for instance:
:class:`twisted.internet.asyncioreactor.AsyncioSelectorReactor`.
Import path of a given :mod:`~twisted.internet.reactor`.

Scrapy will install this reactor if no other is installed yet, such as when
the ``scrapy`` CLI program is invoked or when using the
:class:`~scrapy.crawler.CrawlerProcess` class. If you are using the
:class:`~scrapy.crawler.CrawlerRunner` class, you need to install the correct
reactor manually. An exception will be raised if the installation fails.
Scrapy will install this reactor if no other reactor is installed yet, such as
when the ``scrapy`` CLI program is invoked or when using the
:class:`~scrapy.crawler.CrawlerProcess` class.

The default value for this option is currently ``None``, which means that Scrapy
will not attempt to install any specific reactor, and the default one defined by
Twisted for the current platform will be used. This is to maintain backward
compatibility and avoid possible problems caused by using a non-default reactor.
If you are using the :class:`~scrapy.crawler.CrawlerRunner` class, you also
need to install the correct reactor manually. You can do that using
:func:`~scrapy.utils.reactor.install_reactor`:

For additional information, please see
:doc:`core/howto/choosing-reactor`.
.. autofunction:: scrapy.utils.reactor.install_reactor

If a reactor is already installed,
:func:`~scrapy.utils.reactor.install_reactor` has no effect.

:meth:`CrawlerRunner.__init__ <scrapy.crawler.CrawlerRunner.__init__>` raises
:exc:`Exception` if the installed reactor does not match the
:setting:`TWISTED_REACTOR` setting.

The default value of the :setting:`TWISTED_REACTOR` setting is ``None``, which
means that Scrapy will not attempt to install any specific reactor, and the
default reactor defined by Twisted for the current platform will be used. This
is to maintain backward compatibility and avoid possible problems caused by
using a non-default reactor.

For additional information, see :doc:`core/howto/choosing-reactor`.
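A hedged sketch of the manual installation described above for :class:`~scrapy.crawler.CrawlerRunner` users (``MySpider`` is a placeholder class, not part of the diff)::

    from scrapy.crawler import CrawlerRunner
    from scrapy.utils.reactor import install_reactor

    # Install the reactor before anything else imports twisted.internet.reactor.
    install_reactor('twisted.internet.asyncioreactor.AsyncioSelectorReactor')

    runner = CrawlerRunner({
        'TWISTED_REACTOR': 'twisted.internet.asyncioreactor.AsyncioSelectorReactor',
    })
    d = runner.crawl(MySpider)  # MySpider is a placeholder spider class

    from twisted.internet import reactor
    d.addBoth(lambda _: reactor.stop())
    reactor.run()
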
.. setting:: URLLENGTH_LIMIT

@ -41,7 +41,7 @@ variable; or by defining it in your :ref:`scrapy.cfg <topics-config-settings>`::
.. _IPython: https://ipython.org/
.. _IPython installation guide: https://ipython.org/install.html
.. _bpython: https://www.bpython-interpreter.org/
.. _bpython: https://bpython-interpreter.org/

Launch the shell
================

@ -142,7 +142,7 @@ Example of shell session
========================
Here's an example of a typical shell session where we start by scraping the
https://scrapy.org page, and then proceed to scrape the https://reddit.com
https://scrapy.org page, and then proceed to scrape the https://old.reddit.com/
page. Finally, we modify the (Reddit) request method to POST and re-fetch it
getting an error. We end the session by typing Ctrl-D (in Unix systems) or
Ctrl-Z in Windows.

@ -182,7 +182,7 @@ After that, we can start playing with the objects:
>>> response.xpath('//title/text()').get()
'Scrapy | A Fast and Powerful Scraping and Web Crawling Framework'
>>> fetch("https://reddit.com")
>>> fetch("https://old.reddit.com/")
>>> response.xpath('//title/text()').get()
'reddit: the front page of the internet'

@ -46,6 +46,7 @@ Here is a simple example showing how you can catch signals and perform some acti
def parse(self, response):
pass

.. _signal-deferred:

Deferred signal handlers
========================

@ -301,6 +302,8 @@ request_left_downloader
.. signal:: request_left_downloader
.. function:: request_left_downloader(request, spider)

.. versionadded:: 2.0

Sent when a :class:`~scrapy.http.Request` leaves the downloader, even in case of
failure.
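An illustrative way (not from the diff) to listen for this new signal from an extension; the extension class itself is hypothetical, only the signal name and handler arguments come from the documentation above::

    from scrapy import signals

    class DownloaderWatchdog:
        @classmethod
        def from_crawler(cls, crawler):
            ext = cls()
            crawler.signals.connect(ext.request_left_downloader,
                                    signal=signals.request_left_downloader)
            return ext

        def request_left_downloader(self, request, spider):
            # called for every request leaving the downloader, even on failure
            spider.logger.debug('Left downloader: %s', request.url)
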

@ -299,8 +299,8 @@ The spider will not do any parsing on its own.
If you were to set the ``start_urls`` attribute from the command line,
you would have to parse it on your own into a list
using something like
`ast.literal_eval <https://docs.python.org/library/ast.html#ast.literal_eval>`_
or `json.loads <https://docs.python.org/library/json.html#json.loads>`_
`ast.literal_eval <https://docs.python.org/3/library/ast.html#ast.literal_eval>`_
or `json.loads <https://docs.python.org/3/library/json.html#json.loads>`_
and then set it as an attribute.
Otherwise, you would cause iteration over a ``start_urls`` string
(a very common python pitfall)
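A hedged sketch of the parsing step suggested above, assuming the value is passed as JSON through a spider argument (the spider name is a placeholder)::

    import json

    import scrapy

    class MySpider(scrapy.Spider):
        name = 'myspider'

        def __init__(self, start_urls=None, *args, **kwargs):
            super().__init__(*args, **kwargs)
            if start_urls:
                # e.g. scrapy crawl myspider -a start_urls='["https://example.com"]'
                self.start_urls = json.loads(start_urls)
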

@ -420,6 +420,9 @@ Crawling rules
It receives a :class:`Twisted Failure <twisted.python.failure.Failure>`
instance as first parameter.

.. versionadded:: 2.0
   The *errback* parameter.
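A hedged sketch of a rule using the new *errback* parameter; the spider, URL pattern and handler names are illustrative, and the string-based handler reference is assumed to work the same way as ``callback`` does::

    from scrapy.spiders import CrawlSpider, Rule
    from scrapy.linkextractors import LinkExtractor

    class ItemsSpider(CrawlSpider):
        name = 'crawl_with_errback'
        start_urls = ['https://example.com/']

        rules = (
            Rule(LinkExtractor(allow=r'/items/'),
                 callback='parse_item', errback='handle_error'),  # errback: new in 2.0
        )

        def parse_item(self, response):
            yield {'url': response.url}

        def handle_error(self, failure):
            # receives a twisted.python.failure.Failure
            self.logger.error(repr(failure))
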
CrawlSpider example
~~~~~~~~~~~~~~~~~~~

@ -811,6 +814,6 @@ Combine SitemapSpider with other sources of urls::
.. _Sitemaps: https://www.sitemaps.org/index.html
.. _Sitemap index files: https://www.sitemaps.org/protocol.html#index
.. _robots.txt: http://www.robotstxt.org/
.. _robots.txt: https://www.robotstxt.org/
.. _TLD: https://en.wikipedia.org/wiki/Top-level_domain
.. _Scrapyd documentation: https://scrapyd.readthedocs.io/en/latest/

@ -1 +1 @@
1.8.0
2.0.0

@ -54,8 +54,13 @@ class Command(ScrapyCommand):
raise UsageError("running 'scrapy crawl' with more than one spider is no longer supported")
spname = args[0]

self.crawler_process.crawl(spname, **opts.spargs)
self.crawler_process.start()
crawl_defer = self.crawler_process.crawl(spname, **opts.spargs)

if self.crawler_process.bootstrap_failed:
if getattr(crawl_defer, 'result', None) is not None and issubclass(crawl_defer.result.type, Exception):
self.exitcode = 1
else:
self.crawler_process.start()

if self.crawler_process.bootstrap_failed or \
(hasattr(self.crawler_process, 'has_exception') and self.crawler_process.has_exception):
self.exitcode = 1

@ -23,7 +23,7 @@ __all__ = ['BaseItemExporter', 'PprintItemExporter', 'PickleItemExporter',
class BaseItemExporter(object):

def __init__(self, dont_fail=False, **kwargs):
def __init__(self, *, dont_fail=False, **kwargs):
self._kwargs = kwargs
self._configure(kwargs, dont_fail=dont_fail)

@ -47,7 +47,7 @@ class MemoryUsage(object):
def get_virtual_size(self):
size = self.resource.getrusage(self.resource.RUSAGE_SELF).ru_maxrss
if sys.platform != 'darwin':
# on Mac OS X ru_maxrss is in bytes, on Linux it is in KB
# on macOS ru_maxrss is in bytes, on Linux it is in KB
size *= 1024
return size

@ -132,6 +132,9 @@ class Response(object_ref):
:class:`~.TextResponse` provides a :meth:`~.TextResponse.follow`
method which supports selectors in addition to absolute/relative URLs
and Link objects.

.. versionadded:: 2.0
   The *flags* parameter.
"""
if isinstance(url, Link):
url = url.url

@ -160,6 +163,8 @@ class Response(object_ref):
dont_filter=False, errback=None, cb_kwargs=None, flags=None):
# type: (...) -> Generator[Request, None, None]
"""
.. versionadded:: 2.0

Return an iterable of :class:`~.Request` instances to follow all links
in ``urls``. It accepts the same arguments as ``Request.__init__`` method,
but elements of ``urls`` can be relative URLs or :class:`~scrapy.link.Link` objects,
|
||||
}
|
||||
|
||||
def item_error(self, item, exception, response, spider):
|
||||
"""Logs a message when an item causes an error while it is passing through the item pipeline."""
|
||||
"""Logs a message when an item causes an error while it is passing
|
||||
through the item pipeline.
|
||||
|
||||
.. versionadded:: 2.0
|
||||
"""
|
||||
return {
|
||||
'level': logging.ERROR,
|
||||
'msg': ITEMERRORMSG,
|
||||
@ -107,7 +111,10 @@ class LogFormatter(object):
|
||||
}
|
||||
|
||||
def spider_error(self, failure, request, response, spider):
|
||||
"""Logs an error message from a spider."""
|
||||
"""Logs an error message from a spider.
|
||||
|
||||
.. versionadded:: 2.0
|
||||
"""
|
||||
return {
|
||||
'level': logging.ERROR,
|
||||
'msg': SPIDERERRORMSG,
|
||||
@ -118,7 +125,11 @@ class LogFormatter(object):
|
||||
}
|
||||
|
||||
def download_error(self, failure, request, spider, errmsg=None):
|
||||
"""Logs a download error message from a spider (typically coming from the engine)."""
|
||||
"""Logs a download error message from a spider (typically coming from
|
||||
the engine).
|
||||
|
||||
.. versionadded:: 2.0
|
||||
"""
|
||||
args = {'request': request}
|
||||
if errmsg:
|
||||
msg = DOWNLOADERRORMSG_LONG
|
||||
|
@ -29,7 +29,7 @@ class CachingThreadedResolver(ThreadedResolver):
|
||||
cache_size = 0
|
||||
return cls(reactor, cache_size, crawler.settings.getfloat('DNS_TIMEOUT'))
|
||||
|
||||
def install_on_reactor(self,):
|
||||
def install_on_reactor(self):
|
||||
self.reactor.installResolver(self)
|
||||
|
||||
def getHostByName(self, name, timeout=None):
|
||||
|
@ -9,10 +9,8 @@ DEPRECATED_SETTINGS = [
|
||||
('ENCODING_ALIASES', 'no longer needed (encoding discovery uses w3lib now)'),
|
||||
('STATS_ENABLED', 'no longer supported (change STATS_CLASS instead)'),
|
||||
('SQLITE_DB', 'no longer supported'),
|
||||
('SELECTORS_BACKEND', 'use SCRAPY_SELECTORS_BACKEND environment variable instead'),
|
||||
('AUTOTHROTTLE_MIN_DOWNLOAD_DELAY', 'use DOWNLOAD_DELAY instead'),
|
||||
('AUTOTHROTTLE_MAX_CONCURRENCY', 'use CONCURRENT_REQUESTS_PER_DOMAIN instead'),
|
||||
('AUTOTHROTTLE_MAX_CONCURRENCY', 'use CONCURRENT_REQUESTS_PER_DOMAIN instead'),
|
||||
('REDIRECT_MAX_METAREFRESH_DELAY', 'use METAREFRESH_MAXDELAY instead'),
|
||||
('LOG_UNSERIALIZABLE_REQUESTS', 'use SCHEDULER_DEBUG instead'),
|
||||
]
|
||||
|

@ -75,9 +75,24 @@ def get_project_settings():
"is deprecated.", ScrapyDeprecationWarning)
settings.setdict(pickle.loads(pickled_settings), priority='project')

env_overrides = {k[7:]: v for k, v in os.environ.items() if
k.startswith('SCRAPY_')}
if env_overrides:
warnings.warn("Use of 'SCRAPY_'-prefixed environment variables to override settings is deprecated.", ScrapyDeprecationWarning)
settings.setdict(env_overrides, priority='project')
scrapy_envvars = {k[7:]: v for k, v in os.environ.items() if
k.startswith('SCRAPY_')}
valid_envvars = {
'CHECK',
'PICKLED_SETTINGS_TO_OVERRIDE',
'PROJECT',
'PYTHON_SHELL',
'SETTINGS_MODULE',
}
setting_envvars = {k for k in scrapy_envvars if k not in valid_envvars}
if setting_envvars:
setting_envvar_list = ', '.join(sorted(setting_envvars))
warnings.warn(
'Use of environment variables prefixed with SCRAPY_ to override '
'settings is deprecated. The following environment variables are '
'currently defined: {}'.format(setting_envvar_list),
ScrapyDeprecationWarning
)
settings.setdict(scrapy_envvars, priority='project')

return settings

@ -50,6 +50,8 @@ class CallLaterOnce(object):

def install_reactor(reactor_path):
"""Installs the :mod:`~twisted.internet.reactor` with the specified
import path."""
reactor_class = load_object(reactor_path)
if reactor_class is asyncioreactor.AsyncioSelectorReactor:
with suppress(error.ReactorAlreadyInstalledError):

@ -63,6 +65,9 @@ def install_reactor(reactor_path):

def verify_installed_reactor(reactor_path):
"""Raises :exc:`Exception` if the installed
:mod:`~twisted.internet.reactor` does not match the specified import
path."""
from twisted.internet import reactor
reactor_class = load_object(reactor_path)
if not isinstance(reactor, reactor_class):
tests/test_cmdline_crawl_with_pipeline/__init__.py
@ -0,0 +1,20 @@
import os
import sys
import unittest
from subprocess import Popen, PIPE


class CmdlineCrawlPipelineTest(unittest.TestCase):

    def _execute(self, spname):
        args = (sys.executable, '-m', 'scrapy.cmdline', 'crawl', spname)
        cwd = os.path.dirname(os.path.abspath(__file__))
        proc = Popen(args, stdout=PIPE, stderr=PIPE, cwd=cwd)
        proc.communicate()
        return proc.returncode

    def test_open_spider_normally_in_pipeline(self):
        self.assertEqual(self._execute('normal'), 0)

    def test_exception_at_open_spider_in_pipeline(self):
        self.assertEqual(self._execute('exception'), 1)
tests/test_cmdline_crawl_with_pipeline/scrapy.cfg
@ -0,0 +1,2 @@
[settings]
default = test_spider.settings

@ -0,0 +1,16 @@
class TestSpiderPipeline(object):

    def open_spider(self, spider):
        pass

    def process_item(self, item, spider):
        return item


class TestSpiderExceptionPipeline(object):

    def open_spider(self, spider):
        raise Exception('exception')

    def process_item(self, item, spider):
        return item

@ -0,0 +1,2 @@
BOT_NAME = 'test_spider'
SPIDER_MODULES = ['test_spider.spiders']

@ -0,0 +1,14 @@
import scrapy


class ExceptionSpider(scrapy.Spider):
    name = 'exception'

    custom_settings = {
        'ITEM_PIPELINES': {
            'test_spider.pipelines.TestSpiderExceptionPipeline': 300
        }
    }

    def parse(self, response):
        pass

@ -0,0 +1,14 @@
import scrapy


class NormalSpider(scrapy.Spider):
    name = 'normal'

    custom_settings = {
        'ITEM_PIPELINES': {
            'test_spider.pipelines.TestSpiderPipeline': 300
        }
    }

    def parse(self, response):
        pass

@ -40,7 +40,7 @@ class CrawlTestCase(TestCase):

@defer.inlineCallbacks
def test_fixed_delay(self):
yield self._test_delay(total=3, delay=0.1)
yield self._test_delay(total=3, delay=0.2)

@defer.inlineCallbacks
def test_randomized_delay(self):

@ -328,7 +328,7 @@ with multiples lines
@mark.only_asyncio()
@defer.inlineCallbacks
def test_async_def_asyncio_parse(self):
runner = CrawlerRunner({"ASYNCIO_REACTOR": True})
runner = CrawlerRunner({"TWISTED_REACTOR": "twisted.internet.asyncioreactor.AsyncioSelectorReactor"})
runner.crawl(AsyncDefAsyncioSpider, self.mockserver.url("/status?n=200"), mockserver=self.mockserver)
with LogCapture() as log:
yield runner.join()

@ -3,7 +3,11 @@ import os
import tempfile
import shutil
import contextlib
from scrapy.utils.project import data_path

from pytest import warns

from scrapy.exceptions import ScrapyDeprecationWarning
from scrapy.utils.project import data_path, get_project_settings


@contextlib.contextmanager

@ -41,3 +45,53 @@ class ProjectUtilsTest(unittest.TestCase):
        )
        abspath = os.path.join(os.path.sep, 'absolute', 'path')
        self.assertEqual(abspath, data_path(abspath))


@contextlib.contextmanager
def set_env(**update):
    modified = set(update.keys()) & set(os.environ.keys())
    update_after = {k: os.environ[k] for k in modified}
    remove_after = frozenset(k for k in update if k not in os.environ)
    try:
        os.environ.update(update)
        yield
    finally:
        os.environ.update(update_after)
        for k in remove_after:
            os.environ.pop(k)


class GetProjectSettingsTestCase(unittest.TestCase):

    def test_valid_envvar(self):
        value = 'tests.test_cmdline.settings'
        envvars = {
            'SCRAPY_SETTINGS_MODULE': value,
        }
        with set_env(**envvars), warns(None) as warnings:
            settings = get_project_settings()
        assert not warnings
        assert settings.get('SETTINGS_MODULE') == value

    def test_invalid_envvar(self):
        envvars = {
            'SCRAPY_FOO': 'bar',
        }
        with set_env(**envvars), warns(None) as warnings:
            get_project_settings()
        assert len(warnings) == 1
        assert warnings[0].category == ScrapyDeprecationWarning
        assert str(warnings[0].message).endswith(': FOO')

    def test_valid_and_invalid_envvars(self):
        value = 'tests.test_cmdline.settings'
        envvars = {
            'SCRAPY_FOO': 'bar',
            'SCRAPY_SETTINGS_MODULE': value,
        }
        with set_env(**envvars), warns(None) as warnings:
            settings = get_project_settings()
        assert len(warnings) == 1
        assert warnings[0].category == ScrapyDeprecationWarning
        assert str(warnings[0].message).endswith(': FOO')
        assert settings.get('SETTINGS_MODULE') == value