diff --git a/.bumpversion.cfg b/.bumpversion.cfg
index f347a0cd0..8d4d74bc5 100644
--- a/.bumpversion.cfg
+++ b/.bumpversion.cfg
@@ -1,5 +1,5 @@
 [bumpversion]
-current_version = 2.0.0
+current_version = 2.2.0
 commit = True
 tag = True
 tag_name = {new_version}
diff --git a/.gitignore b/.gitignore
index ff6e2ea65..83a2569dd 100644
--- a/.gitignore
+++ b/.gitignore
@@ -15,6 +15,7 @@ htmlcov/
 .pytest_cache/
 .coverage.*
 .cache/
+.mypy_cache/
 
 # Windows
 Thumbs.db
diff --git a/.readthedocs.yml b/.readthedocs.yml
index 17eba34f3..e4d3f02cc 100644
--- a/.readthedocs.yml
+++ b/.readthedocs.yml
@@ -1,4 +1,5 @@
 version: 2
+formats: all
 sphinx:
   configuration: docs/conf.py
   fail_on_warning: true
diff --git a/.travis.yml b/.travis.yml
index 66e1a9617..b403ac54c 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -11,25 +11,35 @@ matrix:
     python: 3.8
   - env: TOXENV=flake8
     python: 3.8
-  - env: TOXENV=pypy3
-  - env: TOXENV=py35
-    python: 3.5
-  - env: TOXENV=pinned
-    python: 3.5
-  - env: TOXENV=py35-asyncio
-    python: 3.5.2
-  - env: TOXENV=py36
-    python: 3.6
-  - env: TOXENV=py37
-    python: 3.7
-  - env: TOXENV=py38
-    python: 3.8
-  - env: TOXENV=extra-deps
-    python: 3.8
-  - env: TOXENV=py38-asyncio
+  - env: TOXENV=pylint
     python: 3.8
   - env: TOXENV=docs
     python: 3.7  # Keep in sync with .readthedocs.yml
+  - env: TOXENV=typing
+    python: 3.8
+
+  - env: TOXENV=pypy3
+  - env: TOXENV=pinned
+    python: 3.5.2
+  - env: TOXENV=asyncio
+    python: 3.5.2  # We use additional code to support 3.5.3 and earlier
+  - env: TOXENV=py
+    python: 3.5
+  - env: TOXENV=asyncio
+    python: 3.5  # We use specific code to support >= 3.5.4, < 3.6
+  - env: TOXENV=py
+    python: 3.6
+  - env: TOXENV=py
+    python: 3.7
+  - env: TOXENV=py PYPI_RELEASE_JOB=true
+    python: 3.8
+    dist: bionic
+  - env: TOXENV=extra-deps
+    python: 3.8
+    dist: bionic
+  - env: TOXENV=asyncio
+    python: 3.8
+    dist: bionic
 install:
 - |
   if [ "$TOXENV" = "pypy3" ]; then
@@ -62,4 +72,4 @@ deploy:
   on:
     tags: true
     repo: scrapy/scrapy
-    condition: "$TOXENV == py37 && $TRAVIS_TAG =~ ^[0-9]+[.][0-9]+[.][0-9]+(rc[0-9]+|[.]dev[0-9]+)?$"
+    condition: "$PYPI_RELEASE_JOB == true && $TRAVIS_TAG =~ ^[0-9]+[.][0-9]+[.][0-9]+(rc[0-9]+|[.]dev[0-9]+)?$"
diff --git a/README.rst b/README.rst
index ce5973bcd..0e3939e9b 100644
--- a/README.rst
+++ b/README.rst
@@ -40,7 +40,7 @@ including a list of features.
 Requirements
 ============
 
-* Python 3.5+
+* Python 3.5.2+
 * Works on Linux, Windows, macOS, BSD
 
 Install
diff --git a/conftest.py b/conftest.py
index be5fbabf4..b39d644a5 100644
--- a/conftest.py
+++ b/conftest.py
@@ -12,6 +12,8 @@ collect_ignore = [
     "scrapy/utils/testsite.py",
     # contains scripts to be run by tests/test_crawler.py::CrawlerProcessSubprocess
     *_py_files("tests/CrawlerProcess"),
+    # contains scripts to be run by tests/test_crawler.py::CrawlerRunnerSubprocess
+    *_py_files("tests/CrawlerRunner"),
     # Py36-only parts of respective tests
     *_py_files("tests/py36"),
 ]
diff --git a/docs/README.rst b/docs/README.rst
index 0a343cd19..0b7afa548 100644
--- a/docs/README.rst
+++ b/docs/README.rst
@@ -57,3 +57,12 @@ There is a way to recreate the doc automatically when you make changes, you
 need to install watchdog (``pip install watchdog``) and then use::
 
     make watch
+
+Alternative method using tox
+----------------------------
+
+To compile the documentation to HTML run the following command::
+
+    tox -e docs
+
+Documentation will be generated (in HTML format) inside the ``.tox/docs/tmp/html`` dir.
diff --git a/docs/conf.py b/docs/conf.py
index 6e2399f66..86734fae7 100644
--- a/docs/conf.py
+++ b/docs/conf.py
@@ -1,5 +1,3 @@
-# -*- coding: utf-8 -*-
-#
 # Scrapy documentation build configuration file, created by
 # sphinx-quickstart on Mon Nov 24 12:02:52 2008.
 #
@@ -102,6 +100,9 @@ exclude_trees = ['.build']
 # The name of the Pygments (syntax highlighting) style to use.
 pygments_style = 'sphinx'
 
+# List of Sphinx warnings that will not be raised
+suppress_warnings = ['epub.unknown_project_files']
+
 
 # Options for HTML output
 # -----------------------
@@ -280,6 +281,7 @@ coverage_ignore_pyobjects = [
 # -------------------------------------
 
 intersphinx_mapping = {
+    'attrs': ('https://www.attrs.org/en/stable/', None),
     'coverage': ('https://coverage.readthedocs.io/en/stable', None),
     'cssselect': ('https://cssselect.readthedocs.io/en/latest', None),
     'pytest': ('https://docs.pytest.org/en/latest', None),
@@ -295,3 +297,11 @@ intersphinx_mapping = {
 # ------------------------------------
 
 hoverxref_auto_ref = True
+hoverxref_role_types = {
+    "class": "tooltip",
+    "confval": "tooltip",
+    "hoverxref": "tooltip",
+    "mod": "tooltip",
+    "ref": "tooltip",
+}
+hoverxref_roles = ['command', 'reqmeta', 'setting', 'signal']
diff --git a/docs/contributing.rst b/docs/contributing.rst
index aed5ab92e..7b901dd00 100644
--- a/docs/contributing.rst
+++ b/docs/contributing.rst
@@ -155,6 +155,9 @@ Finally, try to keep aesthetic changes (:pep:`8` compliance, unused imports
 removal, etc) in separate commits from functional changes. This will make pull
 requests easier to review and more likely to get merged.
 
+
+.. _coding-style:
+
 Coding style
 ============
 
@@ -163,7 +166,7 @@ Scrapy:
 
 * Unless otherwise specified, follow :pep:`8`.
 
-* It's OK to use lines longer than 80 chars if it improves the code
+* It's OK to use lines longer than 79 chars if it improves the code
   readability.
 
 * Don't put your name in the code you contribute; git provides enough
diff --git a/docs/faq.rst b/docs/faq.rst
index 75a0f4864..d5ea3cb87 100644
--- a/docs/faq.rst
+++ b/docs/faq.rst
@@ -69,7 +69,7 @@ Here's an example spider using BeautifulSoup API, with ``lxml`` as the HTML pars
 What Python versions does Scrapy support?
 -----------------------------------------
 
-Scrapy is supported under Python 3.5+
+Scrapy is supported under Python 3.5.2+
 under CPython (default Python implementation) and PyPy (starting with PyPy
 5.9). Python 3 support was added in Scrapy 1.1. PyPy support was added in
 Scrapy 1.4, PyPy3 support was added in Scrapy 1.5.
@@ -342,15 +342,15 @@ method for this purpose. For example::
 
     from copy import deepcopy
 
-    from scrapy.item import BaseItem
-
+    from itemadapter import is_item, ItemAdapter
 
     class MultiplyItemsMiddleware:
 
         def process_spider_output(self, response, result, spider):
             for item in result:
-                if isinstance(item, (BaseItem, dict)):
-                    for _ in range(item['multiply_by']):
+                if is_item(item):
+                    adapter = ItemAdapter(item)
+                    for _ in range(adapter['multiply_by']):
                         yield deepcopy(item)
 
 Does Scrapy support IPv6 addresses?
@@ -371,6 +371,19 @@ Twisted reactor is :class:`twisted.internet.selectreactor.SelectReactor`. Switch
 different reactor is possible by using the :setting:`TWISTED_REACTOR` setting.
 
+.. _faq-stop-response-download:
+
+How can I cancel the download of a given response?
+--------------------------------------------------
+
+In some situations, it might be useful to stop the download of a certain response.
+For instance, if you only need the first part of a large response and you would like
+to save resources by avoiding the download of the whole body.
+In that case, you could attach a handler to the :class:`~scrapy.signals.bytes_received`
+signal and raise a :exc:`~scrapy.exceptions.StopDownload` exception. Please refer to
+the :ref:`topics-stop-response-download` topic for additional information and examples.
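To make the pattern concrete, here is a minimal sketch of such a handler. The spider name
and URL are placeholders, and the sketch assumes the Scrapy 2.2 ``bytes_received`` and
``StopDownload`` API described above::

    import scrapy
    from scrapy.exceptions import StopDownload


    class StopEarlySpider(scrapy.Spider):
        name = "stop_early"  # placeholder name
        start_urls = ["https://example.com/very-large-page"]  # placeholder URL

        @classmethod
        def from_crawler(cls, crawler, *args, **kwargs):
            spider = super().from_crawler(crawler, *args, **kwargs)
            # Connect the handler to the bytes_received signal (new in Scrapy 2.2)
            crawler.signals.connect(spider.on_bytes_received,
                                    signal=scrapy.signals.bytes_received)
            return spider

        def on_bytes_received(self, data, request, spider):
            # Stop after the first received chunk; fail=False hands the partial
            # response to the callback instead of the errback.
            raise StopDownload(fail=False)

        def parse(self, response):
            # Only the bytes received before StopDownload was raised are available here.
            self.logger.info("Downloaded %d bytes", len(response.body))

Raising ``StopDownload(fail=True)`` instead would route the request to its errback rather
than the callback.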
+
+
 .. _has been reported: https://github.com/scrapy/scrapy/issues/2905
 .. _user agents: https://en.wikipedia.org/wiki/User_agent
 .. _LIFO: https://en.wikipedia.org/wiki/Stack_(abstract_data_type)
diff --git a/docs/intro/install.rst b/docs/intro/install.rst
index 6356e0eea..fb64d443c 100644
--- a/docs/intro/install.rst
+++ b/docs/intro/install.rst
@@ -7,7 +7,7 @@ Installation guide
 Installing Scrapy
 =================
 
-Scrapy runs on Python 3.5 or above under CPython (default Python
+Scrapy runs on Python 3.5.2 or above under CPython (default Python
 implementation) and PyPy (starting with PyPy 5.9).
 
 If you're using `Anaconda`_ or `Miniconda`_, you can install the package from
diff --git a/docs/intro/tutorial.rst b/docs/intro/tutorial.rst
index 1768badbb..5f35dc936 100644
--- a/docs/intro/tutorial.rst
+++ b/docs/intro/tutorial.rst
@@ -25,16 +25,16 @@ Scrapy. If you're already familiar with other languages, and want to learn
 Python quickly, the `Python Tutorial`_ is a good resource.
 
 If you're new to programming and want to start with Python, the following books
-may be useful to you: 
+may be useful to you:
 
 * `Automate the Boring Stuff With Python`_
 
-* `How To Think Like a Computer Scientist`_ 
+* `How To Think Like a Computer Scientist`_
 
-* `Learn Python 3 The Hard Way`_ 
+* `Learn Python 3 The Hard Way`_
 
 You can also take a look at `this list of Python resources for non-programmers`_,
-as well as the `suggested resources in the learnpython-subreddit`_. 
+as well as the `suggested resources in the learnpython-subreddit`_.
 
 .. _Python: https://www.python.org/
 .. _this list of Python resources for non-programmers: https://wiki.python.org/moin/BeginnersGuide/NonProgrammers
@@ -62,7 +62,7 @@ This will create a ``tutorial`` directory with the following contents::
         __init__.py
 
         items.py          # project items definition file
-
+
         middlewares.py    # project middlewares file
 
         pipelines.py      # project pipelines file
@@ -287,8 +287,8 @@ to be scraped, you can at least get **some** data.
 
 Besides the :meth:`~scrapy.selector.SelectorList.getall` and
 :meth:`~scrapy.selector.SelectorList.get` methods, you can also use
-the :meth:`~scrapy.selector.SelectorList.re` method to extract using `regular
-expressions`_:
+the :meth:`~scrapy.selector.SelectorList.re` method to extract using
+:doc:`regular expressions `:
 
 >>> response.css('title::text').re(r'Quotes.*')
 ['Quotes to Scrape']
@@ -305,7 +305,6 @@ with a selector (see :ref:`topics-developer-tools`).
 `Selector Gadget`_ is also a nice tool to quickly find CSS selector for
 visually selected elements, which works in many browsers.
 
-.. _regular expressions: https://docs.python.org/3/library/re.html
 .. _Selector Gadget: https://selectorgadget.com/
diff --git a/docs/news.rst b/docs/news.rst
index e9b7140cd..80d130e4a 100644
--- a/docs/news.rst
+++ b/docs/news.rst
@@ -3,6 +3,348 @@
 Release notes
 =============
 
+.. _release-2.2.0:
+
+Scrapy 2.2.0 (2020-06-24)
+-------------------------
+
+Highlights:
+
+* Python 3.5.2+ is required now
+* :ref:`dataclass objects ` and
+  :ref:`attrs objects ` are now valid :ref:`item types
+  `
+* New :meth:`TextResponse.json ` method
+* New :signal:`bytes_received` signal that allows canceling response download
+* :class:`~scrapy.downloadermiddlewares.cookies.CookiesMiddleware` fixes
+
+Backward-incompatible changes
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+* Support for Python 3.5.0 and 3.5.1 has been dropped; Scrapy now refuses to
+  run with a Python version lower than 3.5.2, which introduced
+  :class:`typing.Type` (:issue:`4615`)
+
+
+Deprecations
+~~~~~~~~~~~~
+
+* :meth:`TextResponse.body_as_unicode
+  ` is now deprecated, use
+  :attr:`TextResponse.text ` instead
+  (:issue:`4546`, :issue:`4555`, :issue:`4579`)
+
+* :class:`scrapy.item.BaseItem` is now deprecated, use
+  :class:`scrapy.item.Item` instead (:issue:`4534`)
+
+
+New features
+~~~~~~~~~~~~
+
+* :ref:`dataclass objects ` and
+  :ref:`attrs objects ` are now valid :ref:`item types
+  `, and a new itemadapter_ library makes it easy to
+  write code that :ref:`supports any item type `
+  (:issue:`2749`, :issue:`2807`, :issue:`3761`, :issue:`3881`, :issue:`4642`)
+
+* A new :meth:`TextResponse.json ` method
+  allows to deserialize JSON responses (:issue:`2444`, :issue:`4460`,
+  :issue:`4574`)
+
+* A new :signal:`bytes_received` signal allows monitoring response download
+  progress and :ref:`stopping downloads `
+  (:issue:`4205`, :issue:`4559`)
+
+* The dictionaries in the result list of a :ref:`media pipeline
+  ` now include a new key, ``status``, which indicates
+  if the file was downloaded or, if the file was not downloaded, why it was
+  not downloaded; see :meth:`FilesPipeline.get_media_requests
+  ` for more
+  information (:issue:`2893`, :issue:`4486`)
+
+* When using :ref:`Google Cloud Storage ` for
+  a :ref:`media pipeline `, a warning is now logged if
+  the configured credentials do not grant the required permissions
+  (:issue:`4346`, :issue:`4508`)
+
+* :ref:`Link extractors ` are now serializable,
+  as long as you do not use :ref:`lambdas ` for parameters; for
+  example, you can now pass link extractors in :attr:`Request.cb_kwargs
+  ` or
+  :attr:`Request.meta ` when :ref:`persisting
+  scheduled requests ` (:issue:`4554`)
+
+* Upgraded the :ref:`pickle protocol ` that Scrapy uses
+  from protocol 2 to protocol 4, improving serialization capabilities and
+  performance (:issue:`4135`, :issue:`4541`)
+
+* :func:`scrapy.utils.misc.create_instance` now raises a :exc:`TypeError`
+  exception if the resulting instance is ``None`` (:issue:`4528`,
+  :issue:`4532`)
+
+.. _itemadapter: https://github.com/scrapy/itemadapter
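A brief sketch that combines two of the features listed above (dataclass items and
``TextResponse.json()``). The endpoint and field names are assumptions used only for
illustration::

    from dataclasses import dataclass

    import scrapy


    @dataclass
    class QuoteItem:
        text: str
        author: str


    class QuotesSpider(scrapy.Spider):
        name = "quotes_json"  # placeholder name
        # Assumed JSON endpoint, used here only for illustration
        start_urls = ["http://quotes.toscrape.com/api/quotes?page=1"]

        def parse(self, response):
            data = response.json()  # new TextResponse.json() in Scrapy 2.2
            for quote in data["quotes"]:
                # dataclass instances are now valid item types
                yield QuoteItem(text=quote["text"], author=quote["author"]["name"])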
+
+
+Bug fixes
+~~~~~~~~~
+
+* :class:`~scrapy.downloadermiddlewares.cookies.CookiesMiddleware` no longer
+  discards cookies defined in :attr:`Request.headers
+  ` (:issue:`1992`, :issue:`2400`)
+
+* :class:`~scrapy.downloadermiddlewares.cookies.CookiesMiddleware` no longer
+  re-encodes cookies defined as :class:`bytes` in the ``cookies`` parameter
+  of the ``__init__`` method of :class:`~scrapy.http.Request`
+  (:issue:`2400`, :issue:`3575`)
+
+* When :setting:`FEEDS` defines multiple URIs, :setting:`FEED_STORE_EMPTY` is
+  ``False`` and the crawl yields no items, Scrapy no longer stops feed
+  exports after the first URI (:issue:`4621`, :issue:`4626`)
+
+* :class:`~scrapy.spiders.Spider` callbacks defined using :doc:`coroutine
+  syntax ` no longer need to return an iterable, and may
+  instead return a :class:`~scrapy.http.Request` object, an
+  :ref:`item `, or ``None`` (:issue:`4609`)
+
+* The :command:`startproject` command now ensures that the generated project
+  folders and files have the right permissions (:issue:`4604`)
+
+* Fix a :exc:`KeyError` exception being sometimes raised from
+  :class:`scrapy.utils.datatypes.LocalWeakReferencedCache` (:issue:`4597`,
+  :issue:`4599`)
+
+* When :setting:`FEEDS` defines multiple URIs, log messages about items being
+  stored now contain information from the corresponding feed, instead of
+  always containing information about only one of the feeds (:issue:`4619`,
+  :issue:`4629`)
+
+
+Documentation
+~~~~~~~~~~~~~
+
+* Added a new section about :ref:`accessing cb_kwargs from errbacks
+  ` (:issue:`4598`, :issue:`4634`)
+
+* Covered chompjs_ in :ref:`topics-parsing-javascript` (:issue:`4556`,
+  :issue:`4562`)
+
+* Removed from :doc:`topics/coroutines` the warning about the API being
+  experimental (:issue:`4511`, :issue:`4513`)
+
+* Removed references to unsupported versions of :doc:`Twisted
+  ` (:issue:`4533`)
+
+* Updated the description of the :ref:`screenshot pipeline example
+  `, which now uses :doc:`coroutine syntax
+  ` instead of returning a
+  :class:`~twisted.internet.defer.Deferred` (:issue:`4514`, :issue:`4593`)
+
+* Removed a misleading import line from the
+  :func:`scrapy.utils.log.configure_logging` code example (:issue:`4510`,
+  :issue:`4587`)
+
+* The display-on-hover behavior of internal documentation references now also
+  covers links to :ref:`commands `, :attr:`Request.meta
+  ` keys, :ref:`settings ` and
+  :ref:`signals ` (:issue:`4495`, :issue:`4563`)
+
+* It is again possible to download the documentation for offline reading
+  (:issue:`4578`, :issue:`4585`)
+
+* Removed backslashes preceding ``*args`` and ``**kwargs`` in some function
+  and method signatures (:issue:`4592`, :issue:`4596`)
+
+.. _chompjs: https://github.com/Nykakin/chompjs
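As an illustration of the "accessing ``cb_kwargs`` from errbacks" item above, a minimal
sketch follows. The spider name and URL are placeholders; the errback reads the keyword
arguments from ``failure.request.cb_kwargs``::

    import scrapy


    class CbKwargsSpider(scrapy.Spider):
        name = "cb_kwargs_demo"  # placeholder name

        def start_requests(self):
            yield scrapy.Request(
                "https://example.com",  # placeholder URL
                callback=self.parse_page,
                errback=self.on_error,
                cb_kwargs={"page_name": "home"},
            )

        def parse_page(self, response, page_name):
            self.logger.info("Parsed %s", page_name)

        def on_error(self, failure):
            # The failed request, and therefore its cb_kwargs, is available
            # on the failure object.
            page_name = failure.request.cb_kwargs["page_name"]
            self.logger.warning("Could not fetch %s", page_name)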
+
+
+Quality assurance
+~~~~~~~~~~~~~~~~~
+
+* Adjusted the code base further to our :ref:`style guidelines
+  ` (:issue:`4237`, :issue:`4525`, :issue:`4538`,
+  :issue:`4539`, :issue:`4540`, :issue:`4542`, :issue:`4543`, :issue:`4544`,
+  :issue:`4545`, :issue:`4557`, :issue:`4558`, :issue:`4566`, :issue:`4568`,
+  :issue:`4572`)
+
+* Removed remnants of Python 2 support (:issue:`4550`, :issue:`4553`,
+  :issue:`4568`)
+
+* Improved code sharing between the :command:`crawl` and :command:`runspider`
+  commands (:issue:`4548`, :issue:`4552`)
+
+* Replaced ``chain(*iterable)`` with ``chain.from_iterable(iterable)``
+  (:issue:`4635`)
+
+* You may now run the :mod:`asyncio` tests with Tox on any Python version
+  (:issue:`4521`)
+
+* Updated test requirements to reflect an incompatibility with pytest 5.4 and
+  5.4.1 (:issue:`4588`)
+
+* Improved :class:`~scrapy.spiderloader.SpiderLoader` test coverage for
+  scenarios involving duplicate spider names (:issue:`4549`, :issue:`4560`)
+
+* Configured Travis CI to also run the tests with Python 3.5.2
+  (:issue:`4518`, :issue:`4615`)
+
+* Added a `Pylint `_ job to Travis CI
+  (:issue:`3727`)
+
+* Added a `Mypy `_ job to Travis CI (:issue:`4637`)
+
+* Made use of set literals in tests (:issue:`4573`)
+
+* Cleaned up the Travis CI configuration (:issue:`4517`, :issue:`4519`,
+  :issue:`4522`, :issue:`4537`)
+
+
+.. _release-2.1.0:
+
+Scrapy 2.1.0 (2020-04-24)
+-------------------------
+
+Highlights:
+
+* New :setting:`FEEDS` setting to export to multiple feeds
+* New :attr:`Response.ip_address ` attribute
+
+Backward-incompatible changes
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+* :exc:`AssertionError` exceptions triggered by :ref:`assert `
+  statements have been replaced by new exception types, to support running
+  Python in optimized mode (see :option:`-O`) without changing Scrapy’s
+  behavior in any unexpected ways.
+
+  If you catch an :exc:`AssertionError` exception from Scrapy, update your
+  code to catch the corresponding new exception.
+
+  (:issue:`4440`)
+
+
+Deprecation removals
+~~~~~~~~~~~~~~~~~~~~
+
+* The ``LOG_UNSERIALIZABLE_REQUESTS`` setting is no longer supported, use
+  :setting:`SCHEDULER_DEBUG` instead (:issue:`4385`)
+
+* The ``REDIRECT_MAX_METAREFRESH_DELAY`` setting is no longer supported, use
+  :setting:`METAREFRESH_MAXDELAY` instead (:issue:`4385`)
+
+* The :class:`~scrapy.downloadermiddlewares.chunked.ChunkedTransferMiddleware`
+  middleware has been removed, including the entire
+  :class:`scrapy.downloadermiddlewares.chunked` module; chunked transfers
+  work out of the box (:issue:`4431`)
+
+* The ``spiders`` property has been removed from
+  :class:`~scrapy.crawler.Crawler`, use :class:`CrawlerRunner.spider_loader
+  ` or instantiate
+  :setting:`SPIDER_LOADER_CLASS` with your settings instead (:issue:`4398`)
+
+* The ``MultiValueDict``, ``MultiValueDictKeyError``, and ``SiteNode``
+  classes have been removed from :mod:`scrapy.utils.datatypes`
+  (:issue:`4400`)
+
+
+Deprecations
+~~~~~~~~~~~~
+
+* The ``FEED_FORMAT`` and ``FEED_URI`` settings have been deprecated in
+  favor of the new :setting:`FEEDS` setting (:issue:`1336`, :issue:`3858`,
+  :issue:`4507`)
+
+
+New features
+~~~~~~~~~~~~
+
+* A new setting, :setting:`FEEDS`, allows configuring multiple output feeds
+  with different settings each (:issue:`1336`, :issue:`3858`, :issue:`4507`)
+
+* The :command:`crawl` and :command:`runspider` commands now support multiple
+  ``-o`` parameters (:issue:`1336`, :issue:`3858`, :issue:`4507`)
+
+* The :command:`crawl` and :command:`runspider` commands now support
+  specifying an output format by appending ``:`` to the output file
+  (:issue:`1336`, :issue:`3858`, :issue:`4507`)
+
+* The new :attr:`Response.ip_address `
+  attribute gives access to the IP address that originated a response
+  (:issue:`3903`, :issue:`3940`)
+
+* A warning is now issued when a value in
+  :attr:`~scrapy.spiders.Spider.allowed_domains` includes a port
+  (:issue:`50`, :issue:`3198`, :issue:`4413`)
+
+* Zsh completion now excludes used option aliases from the completion list
+  (:issue:`4438`)
+
+
+Bug fixes
+~~~~~~~~~
+
+* :ref:`Request serialization ` no longer breaks for
+  callbacks that are spider attributes which are assigned a function with a
+  different name (:issue:`4500`)
+
+* ``None`` values in :attr:`~scrapy.spiders.Spider.allowed_domains` no longer
+  cause a :exc:`TypeError` exception (:issue:`4410`)
+
+* Zsh completion no longer allows options after arguments (:issue:`4438`)
+
+* zope.interface 5.0.0 and later versions are now supported
+  (:issue:`4447`, :issue:`4448`)
+
+* :meth:`Spider.make_requests_from_url
+  `, deprecated in Scrapy
+  1.4.0, now issues a warning when used (:issue:`4412`)
+
+
+Documentation
+~~~~~~~~~~~~~
+
+* Improved the documentation about signals that allow their handlers to
+  return a :class:`~twisted.internet.defer.Deferred` (:issue:`4295`,
+  :issue:`4390`)
+
+* Our PyPI entry now includes links for our documentation, our source code
+  repository and our issue tracker (:issue:`4456`)
+
+* Covered the `curl2scrapy `_
+  service in the documentation (:issue:`4206`, :issue:`4455`)
+
+* Removed references to the Guppy library, which only works in Python 2
+  (:issue:`4285`, :issue:`4343`)
+
+* Extended use of InterSphinx to link to Python 3 documentation
+  (:issue:`4444`, :issue:`4445`)
+
+* Added support for Sphinx 3.0 and later (:issue:`4475`, :issue:`4480`,
+  :issue:`4496`, :issue:`4503`)
+
+
+Quality assurance
+~~~~~~~~~~~~~~~~~
+
+* Removed warnings about using old, removed settings (:issue:`4404`)
+
+* Removed a warning about importing
+  :class:`~twisted.internet.testing.StringTransport` from
+  ``twisted.test.proto_helpers`` in Twisted 19.7.0 or newer (:issue:`4409`)
+
+* Removed outdated Debian package build files (:issue:`4384`)
+
+* Removed :class:`object` usage as a base class (:issue:`4430`)
+
+* Removed code that added support for old versions of Twisted that we no
+  longer support (:issue:`4472`)
+
+* Fixed code style issues (:issue:`4468`, :issue:`4469`, :issue:`4471`,
+  :issue:`4481`)
+
+* Removed :func:`twisted.internet.defer.returnValue` calls (:issue:`4443`,
+  :issue:`4446`, :issue:`4489`)
+
+
 .. _release-2.0.1:
 
 Scrapy 2.0.1 (2020-03-18)
diff --git a/docs/requirements.txt b/docs/requirements.txt
index 773b92cea..3d34b47da 100644
--- a/docs/requirements.txt
+++ b/docs/requirements.txt
@@ -1,4 +1,4 @@
-Sphinx>=2.1
-sphinx-hoverxref
-sphinx-notfound-page
-sphinx_rtd_theme
+Sphinx>=3.0
+sphinx-hoverxref>=0.2b1
+sphinx-notfound-page>=0.4
+sphinx_rtd_theme>=0.4
diff --git a/docs/topics/api.rst b/docs/topics/api.rst
index 1c461a511..52509ffdf 100644
--- a/docs/topics/api.rst
+++ b/docs/topics/api.rst
@@ -91,7 +91,7 @@ how you :ref:`configure the downloader middlewares
     provided while constructing the crawler, and it is created after the
     arguments given in the :meth:`crawl` method.
 
-    .. method:: crawl(\*args, \**kwargs)
+    .. method:: crawl(*args, **kwargs)
 
         Starts the crawler by instantiating its spider class with the given
         ``args`` and ``kwargs`` arguments, while setting the execution engine in
diff --git a/docs/topics/architecture.rst b/docs/topics/architecture.rst
index ae25dfa2f..074c59241 100644
--- a/docs/topics/architecture.rst
+++ b/docs/topics/architecture.rst
@@ -104,7 +104,7 @@ Spiders
 -------
 
 Spiders are custom classes written by Scrapy users to parse responses and
-extract items (aka scraped items) from them or additional requests to
+extract :ref:`items ` from them or additional requests to
 follow. For more information see :ref:`topics-spiders`.
 
 .. _component-pipelines:
diff --git a/docs/topics/contracts.rst b/docs/topics/contracts.rst
index 43db8f101..b8b3078c4 100644
--- a/docs/topics/contracts.rst
+++ b/docs/topics/contracts.rst
@@ -78,7 +78,7 @@ override three methods:
 
 .. module:: scrapy.contracts
 
-.. class:: Contract(method, \*args)
+.. class:: Contract(method, *args)
 
     :param method: callback function to which the contract is associated
    :type method: function
@@ -136,7 +136,7 @@ Detecting check runs
 ====================
 
 When ``scrapy check`` is running, the ``SCRAPY_CHECK`` environment variable is
-set to the ``true`` string. You can use `os.environ`_ to perform any change to
+set to the ``true`` string. You can use :data:`os.environ` to perform any change to
 your spiders or your settings when ``scrapy check`` is used::
 
     import os
@@ -148,5 +148,3 @@ your spiders or your settings when ``scrapy check`` is used::
         def __init__(self):
             if os.environ.get('SCRAPY_CHECK'):
                 pass  # Do some scraper adjustments when a check is running
-
-.. _os.environ: https://docs.python.org/3/library/os.html#os.environ
diff --git a/docs/topics/coroutines.rst b/docs/topics/coroutines.rst
index 487cf4c6c..a0952d323 100644
--- a/docs/topics/coroutines.rst
+++ b/docs/topics/coroutines.rst
@@ -7,10 +7,6 @@ Coroutines
 Scrapy has :ref:`partial support ` for the :ref:`coroutine syntax `.
 
-.. warning:: :mod:`asyncio` support in Scrapy is experimental. Future Scrapy
-   versions may introduce related API and behavior changes without a
-   deprecation period or warning.
-
 .. _coroutine-support:
 
 Supported callables
 ===================
@@ -57,27 +53,34 @@ There are several use cases for coroutines in Scrapy. Code that would
 return Deferreds when written for previous Scrapy versions, such as downloader
 middlewares and signal handlers, can be rewritten to be shorter and cleaner::
 
+    from itemadapter import ItemAdapter
+
     class DbPipeline:
         def _update_item(self, data, item):
-            item['field'] = data
+            adapter = ItemAdapter(item)
+            adapter['field'] = data
             return item
 
         def process_item(self, item, spider):
-            dfd = db.get_some_data(item['id'])
+            adapter = ItemAdapter(item)
+            dfd = db.get_some_data(adapter['id'])
             dfd.addCallback(self._update_item, item)
             return dfd
 
 becomes::
 
+    from itemadapter import ItemAdapter
+
     class DbPipeline:
         async def process_item(self, item, spider):
-            item['field'] = await db.get_some_data(item['id'])
+            adapter = ItemAdapter(item)
+            adapter['field'] = await db.get_some_data(adapter['id'])
             return item
 
 Coroutines may be used to call asynchronous code. This includes other
 coroutines, functions that return Deferreds and functions that return
-`awaitable objects`_ such as :class:`~asyncio.Future`. This means you can use
-many useful Python libraries providing such code::
+:term:`awaitable objects ` such as :class:`~asyncio.Future`.
+This means you can use many useful Python libraries providing such code::
 
     class MySpider(Spider):
         # ...
@@ -107,4 +110,3 @@ Common use cases for asynchronous code include:
   :ref:`the screenshot pipeline example`).
 
 .. _aio-libs: https://github.com/aio-libs
-.. _awaitable objects: https://docs.python.org/3/glossary.html#term-awaitable
diff --git a/docs/topics/developer-tools.rst b/docs/topics/developer-tools.rst
index f1b0964c6..4e87a00f2 100644
--- a/docs/topics/developer-tools.rst
+++ b/docs/topics/developer-tools.rst
@@ -292,6 +292,9 @@ Alternatively, if you want to know the arguments needed to recreate that
 request you can use the :func:`scrapy.utils.curl.curl_to_request_kwargs`
 function to get a dictionary with the equivalent arguments.
 
+Note that to translate a cURL command into a Scrapy request,
+you may use `curl2scrapy `_.
+
 As you can see, with a few inspections in the `Network`-tool we were able
 to easily replicate the dynamic requests of the scrolling functionality
 of the page. Crawling dynamic pages can be quite
diff --git a/docs/topics/downloader-middleware.rst b/docs/topics/downloader-middleware.rst
index 73648994d..323e553e5 100644
--- a/docs/topics/downloader-middleware.rst
+++ b/docs/topics/downloader-middleware.rst
@@ -202,6 +202,11 @@ CookiesMiddleware
     sends them back on subsequent requests (from that spider), just like web
     browsers do.
 
+    .. caution:: When non-UTF8 encoded byte sequences are passed to a
+        :class:`~scrapy.http.Request`, the ``CookiesMiddleware`` will log
+        a warning. Refer to :ref:`topics-logging-advanced-customization`
+        to customize the logging behaviour.
+
     The following settings can be used to configure the cookie middleware:
 
     * :setting:`COOKIES_ENABLED`
@@ -739,7 +744,7 @@ HttpProxyMiddleware
 
     This middleware sets the HTTP proxy to use for requests, by setting the
     ``proxy`` meta value for :class:`~scrapy.http.Request` objects.
-    Like the Python standard library modules `urllib`_ and `urllib2`_, it obeys
+    Like the Python standard library module :mod:`urllib.request`, it obeys
    the following environment variables:
 
    * ``http_proxy``
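For illustration, a request that sets the ``proxy`` meta key handled by this middleware
(the proxy address and spider name are placeholders)::

    import scrapy


    class ProxiedSpider(scrapy.Spider):
        name = "proxied"  # placeholder name

        def start_requests(self):
            yield scrapy.Request(
                "https://example.com",  # placeholder URL
                callback=self.parse,
                # HttpProxyMiddleware picks this up; the address is a placeholder
                meta={"proxy": "http://some_proxy_server:8080"},
            )

        def parse(self, response):
            self.logger.info("Fetched %s via proxy", response.url)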
@@ -751,9 +756,6 @@ HttpProxyMiddleware
     Keep in mind this value will take precedence over ``http_proxy``/``https_proxy``
     environment variables, and it will also ignore ``no_proxy`` environment variable.
 
-.. _urllib: https://docs.python.org/2/library/urllib.html
-.. _urllib2: https://docs.python.org/2/library/urllib2.html
-
 RedirectMiddleware
 ------------------
 
@@ -829,6 +831,7 @@ REDIRECT_MAX_TIMES
 Default: ``20``
 
 The maximum number of redirections that will be followed for a single request.
+After this maximum, the request's response is returned as is.
 
 MetaRefreshMiddleware
 ---------------------
@@ -1036,8 +1039,7 @@ Scrapy uses this parser by default.
 
 RobotFileParser
 ~~~~~~~~~~~~~~~
 
-Based on `RobotFileParser
-`_:
+Based on :class:`~urllib.robotparser.RobotFileParser`:
 
 * is Python's built-in robots.txt_ parser
diff --git a/docs/topics/dynamic-content.rst b/docs/topics/dynamic-content.rst
index b98133676..495111b56 100644
--- a/docs/topics/dynamic-content.rst
+++ b/docs/topics/dynamic-content.rst
@@ -104,6 +104,9 @@ If you get the expected response `sometimes`, but not always, the issue is
 probably not your request, but the target server. The target server might be
 buggy, overloaded, or :ref:`banning ` some of your requests.
 
+Note that to translate a cURL command into a Scrapy request,
+you may use `curl2scrapy `_.
+
 .. _topics-handling-response-formats:
 
 Handling different response formats
@@ -115,7 +118,7 @@ data from it depends on the type of response:
 
 - If the response is HTML or XML, use :ref:`selectors ` as usual.
 
-- If the response is JSON, use `json.loads`_ to load the desired data from
+- If the response is JSON, use :func:`json.loads` to load the desired data from
   :attr:`response.text `::
 
       data = json.loads(response.text)
 
 - If the response is JavaScript, or HTML with a ``