Run and fix linkcheck. (#6524)

Repository: https://github.com/scrapy/scrapy.git
parent 7701e590fb
commit ce5a132f12
@@ -6,11 +6,11 @@ Scrapy
 ======

 .. image:: https://img.shields.io/pypi/v/Scrapy.svg
-   :target: https://pypi.python.org/pypi/Scrapy
+   :target: https://pypi.org/pypi/Scrapy
    :alt: PyPI Version

 .. image:: https://img.shields.io/pypi/pyversions/Scrapy.svg
-   :target: https://pypi.python.org/pypi/Scrapy
+   :target: https://pypi.org/pypi/Scrapy
    :alt: Supported Python Versions

 .. image:: https://github.com/scrapy/scrapy/workflows/Ubuntu/badge.svg

@@ -27,7 +27,7 @@ Scrapy
    :alt: Windows

 .. image:: https://img.shields.io/badge/wheel-yes-brightgreen.svg
-   :target: https://pypi.python.org/pypi/Scrapy
+   :target: https://pypi.org/pypi/Scrapy
    :alt: Wheel Status

 .. image:: https://img.shields.io/codecov/c/github/scrapy/scrapy/master.svg

@@ -111,4 +111,4 @@ See https://scrapy.org/companies/ for a list.
 Commercial Support
 ==================

-See https://scrapy.org/support/ for details.
+See https://scrapy.org/support/ for details.

@@ -231,6 +231,7 @@ linkcheck_ignore = [
     r"http://localhost:\d+",
     "http://hg.scrapy.org",
     "http://directory.google.com/",
+    r"https://github.com/scrapy/scrapy/issues/\d+",
 ]
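
The new ``linkcheck_ignore`` entry above is a regular expression that the Sphinx
linkcheck builder uses to skip matching URIs instead of requesting them. A quick,
standalone sanity check of the pattern (the sample issue URL is arbitrary):

    import re

    # Pattern added to linkcheck_ignore in docs/conf.py; linkcheck skips URIs it matches.
    pattern = r"https://github.com/scrapy/scrapy/issues/\d+"
    print(bool(re.match(pattern, "https://github.com/scrapy/scrapy/issues/2473")))  # True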

@@ -154,7 +154,7 @@ by running ``git fetch upstream pull/$PR_NUMBER/head:$BRANCH_NAME_TO_CREATE``
 (replace 'upstream' with a remote name for scrapy repository,
 ``$PR_NUMBER`` with an ID of the pull request, and ``$BRANCH_NAME_TO_CREATE``
 with a name of the branch you want to create locally).
-See also: https://help.github.com/en/github/collaborating-with-issues-and-pull-requests/checking-out-pull-requests-locally#modifying-an-inactive-pull-request-locally.
+See also: https://docs.github.com/en/pull-requests/collaborating-with-pull-requests/reviewing-changes-in-pull-requests/checking-out-pull-requests-locally#modifying-an-inactive-pull-request-locally.

 When writing GitHub pull requests, try to keep titles short but descriptive.
 E.g. For bug #411: "Scrapy hangs if an exception raises in start_requests"

@@ -182,8 +182,8 @@ Scrapy:

 * Don't put your name in the code you contribute; git provides enough
   metadata to identify author of the code.
-  See https://help.github.com/en/github/using-git/setting-your-username-in-git for
-  setup instructions.
+  See https://docs.github.com/en/get-started/getting-started-with-git/setting-your-username-in-git
+  for setup instructions.

 .. _scrapy-pre-commit:

@@ -317,8 +317,8 @@ And their unit-tests are in::
 .. _AUTHORS: https://github.com/scrapy/scrapy/blob/master/AUTHORS
 .. _tests/: https://github.com/scrapy/scrapy/tree/master/tests
 .. _open issues: https://github.com/scrapy/scrapy/issues
-.. _PEP 257: https://www.python.org/dev/peps/pep-0257/
-.. _pull request: https://help.github.com/en/github/collaborating-with-issues-and-pull-requests/creating-a-pull-request
+.. _PEP 257: https://peps.python.org/pep-0257/
+.. _pull request: https://docs.github.com/en/pull-requests/collaborating-with-pull-requests/proposing-changes-to-your-work-with-pull-requests/creating-a-pull-request
 .. _pytest-xdist: https://github.com/pytest-dev/pytest-xdist
 .. _good first issues: https://github.com/scrapy/scrapy/issues?q=is%3Aissue+is%3Aopen+label%3A%22good+first+issue%22
 .. _help wanted issues: https://github.com/scrapy/scrapy/issues?q=is%3Aissue+is%3Aopen+label%3A%22help+wanted%22

docs/faq.rst

@@ -23,7 +23,7 @@ comparing `jinja2`_ to `Django`_.

 .. _BeautifulSoup: https://www.crummy.com/software/BeautifulSoup/
 .. _lxml: https://lxml.de/
-.. _jinja2: https://palletsprojects.com/p/jinja/
+.. _jinja2: https://palletsprojects.com/projects/jinja/
 .. _Django: https://www.djangoproject.com/

 Can I use Scrapy with BeautifulSoup?

@@ -148,7 +148,7 @@ middleware with a :ref:`custom downloader middleware
   instead joining the strings in :attr:`~scrapy.Spider.allowed_domains` into
   a complex regular expression.

-- If you can `meet the installation requirements`_, use pyre2_ instead of
+- If you can meet the installation requirements, use pyre2_ instead of
   Python’s re_ to compile your URL-filtering regular expression. See
   :issue:`1908`.
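
A minimal sketch of what this FAQ entry suggests: build the kind of domain-filtering
pattern described above and compile it with ``re2`` when the pyre2 package is
installed, falling back to ``re`` otherwise (the domain list is made up):

    import re

    try:
        import re2  # provided by the pyre2 package, if installed
    except ImportError:
        re2 = None

    allowed_domains = ["example.com", "example.org"]  # hypothetical spider attribute
    pattern = r"^(.*\.)?(%s)$" % "|".join(re.escape(d) for d in allowed_domains)
    url_filter = (re2 or re).compile(pattern)
    print(bool(url_filter.match("www.example.com")))  # True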

@@ -166,9 +166,8 @@ See also `other suggestions at StackOverflow
     "myproject.middlewares.CustomOffsiteMiddleware": 50,
 }

-.. _meet the installation requirements: https://github.com/andreasvc/pyre2#installation
 .. _pyre2: https://github.com/andreasvc/pyre2
-.. _re: https://docs.python.org/library/re.html
+.. _re: https://docs.python.org/3/library/re.html

 Can I use Basic HTTP Authentication in my spiders?
 --------------------------------------------------

@@ -282,7 +281,7 @@ The ``__VIEWSTATE`` parameter is used in sites built with ASP.NET/VB.NET. For
 more info on how it works see `this page`_. Also, here's an `example spider`_
 which scrapes one of these sites.

-.. _this page: https://metacpan.org/pod/release/ECARROLL/HTML-TreeBuilderX-ASP_NET-0.09/lib/HTML/TreeBuilderX/ASP_NET.pm
+.. _this page: https://metacpan.org/release/ECARROLL/HTML-TreeBuilderX-ASP_NET-0.09/view/lib/HTML/TreeBuilderX/ASP_NET.pm
 .. _example spider: https://github.com/AmbientLighter/rpn-fas/blob/master/fas/spiders/rnp.py

 What's the best way to parse big XML/CSV data feeds?

@@ -432,7 +431,7 @@ See :issue:`2680`.


 .. _has been reported: https://github.com/scrapy/scrapy/issues/2905
-.. _Python standard library modules: https://docs.python.org/py-modindex.html
+.. _Python standard library modules: https://docs.python.org/3/py-modindex.html
 .. _Python package: https://pypi.org/
 .. _user agents: https://en.wikipedia.org/wiki/User_agent
 .. _LIFO: https://en.wikipedia.org/wiki/Stack_(abstract_data_type)
@@ -33,7 +33,7 @@ Having trouble? We'd like to help!
 .. _StackOverflow using the scrapy tag: https://stackoverflow.com/tags/scrapy
 .. _#scrapy IRC channel: irc://irc.freenode.net/scrapy
 .. _issue tracker: https://github.com/scrapy/scrapy/issues
-.. _Scrapy Discord: https://discord.gg/mv3yErfpvq
+.. _Scrapy Discord: https://discord.com/invite/mv3yErfpvq


 First steps

@@ -267,10 +267,10 @@ For details, see `Issue #2473 <https://github.com/scrapy/scrapy/issues/2473>`_.
 .. _lxml: https://lxml.de/index.html
 .. _parsel: https://pypi.org/project/parsel/
 .. _w3lib: https://pypi.org/project/w3lib/
-.. _twisted: https://twistedmatrix.com/trac/
+.. _twisted: https://twisted.org/
 .. _cryptography: https://cryptography.io/en/latest/
 .. _pyOpenSSL: https://pypi.org/project/pyOpenSSL/
-.. _setuptools: https://pypi.python.org/pypi/setuptools
+.. _setuptools: https://pypi.org/pypi/setuptools
 .. _homebrew: https://brew.sh/
 .. _zsh: https://www.zsh.org/
 .. _Anaconda: https://docs.anaconda.com/anaconda/

@@ -152,6 +152,6 @@ interest!

 .. _join the community: https://scrapy.org/community/
 .. _web scraping: https://en.wikipedia.org/wiki/Web_scraping
-.. _Amazon Associates Web Services: https://affiliate-program.amazon.com/gp/advertising/api/detail/main.html
+.. _Amazon Associates Web Services: https://affiliate-program.amazon.com/welcome/ecs
 .. _Amazon S3: https://aws.amazon.com/s3/
 .. _Sitemaps: https://www.sitemaps.org/index.html

@@ -369,7 +369,7 @@ recommend `this tutorial to learn XPath through examples
 <http://zvon.org/comp/r/tut-XPath_1.html>`_, and `this tutorial to learn "how
 to think in XPath" <http://plasmasturm.org/log/xpath101/>`_.

-.. _XPath: https://www.w3.org/TR/xpath/all/
+.. _XPath: https://www.w3.org/TR/xpath-10/
 .. _CSS: https://www.w3.org/TR/selectors

 Extracting quotes and authors

@@ -541,7 +541,7 @@ for Item Pipelines has been set up for you when the project is created, in
 ``tutorial/pipelines.py``. Though you don't need to implement any item
 pipelines if you just want to store the scraped items.

-.. _JSON Lines: http://jsonlines.org
+.. _JSON Lines: https://jsonlines.org
 .. _JQ: https://stedolan.github.io/jq
@@ -1069,7 +1069,7 @@ Documentation
   (:issue:`3582`, :issue:`5432`).

 .. _Common Crawl: https://commoncrawl.org/
-.. _Google cache: http://www.googleguide.com/cached_pages.html
+.. _Google cache: https://www.googleguide.com/cached_pages.html

 - The new :ref:`topics-components` topic covers enforcing requirements on
   Scrapy components, like :ref:`downloader middlewares

@@ -1426,7 +1426,7 @@ New features
   (:setting:`AWS_SESSION_TOKEN`) and endpoint customization
   (:setting:`AWS_ENDPOINT_URL`). (:issue:`4998`, :issue:`5210`)

-.. _temporary security credentials: https://docs.aws.amazon.com/general/latest/gr/aws-sec-cred-types.html#temporary-access-keys
+.. _temporary security credentials: https://docs.aws.amazon.com/IAM/latest/UserGuide/security-creds.html

 - New :setting:`LOG_FILE_APPEND` setting to allow truncating the log file.
   (:issue:`5279`)

@@ -1572,7 +1572,7 @@ Documentation
 - ``quotes.toscrape.com`` references now use HTTPS instead of HTTP.
   (:issue:`5395`, :issue:`5396`)

-- Added a link to `our Discord server <https://discord.gg/mv3yErfpvq>`_
+- Added a link to `our Discord server <https://discord.com/invite/mv3yErfpvq>`_
   to :ref:`getting-help`. (:issue:`5421`, :issue:`5422`)

 - The pronunciation of the project name is now :ref:`officially

@@ -1763,7 +1763,7 @@ Bug fixes
   with lower indentation than the following code.
   (:issue:`4477`, :issue:`4935`)

-- The `Content-Length <https://tools.ietf.org/html/rfc2616#section-14.13>`_
+- The `Content-Length <https://datatracker.ietf.org/doc/html/rfc2616#section-14.13>`_
   header is no longer omitted from responses when using the default, HTTP/1.1
   download handler (see :setting:`DOWNLOAD_HANDLERS`).
   (:issue:`5009`, :issue:`5034`, :issue:`5045`, :issue:`5057`, :issue:`5062`)

@@ -2263,7 +2263,7 @@ Documentation
 * Simplified the code example in :ref:`topics-loaders-dataclass`
   (:issue:`4652`)

-.. _OpenSSL cipher list format: https://www.openssl.org/docs/manmaster/man1/openssl-ciphers.html#CIPHER-LIST-FORMAT
+.. _OpenSSL cipher list format: https://docs.openssl.org/master/man1/openssl-ciphers/#cipher-list-format


 Quality assurance

@@ -2490,7 +2490,7 @@ Quality assurance
 * Added a `Pylint <https://www.pylint.org/>`_ job to Travis CI
   (:issue:`3727`)

-* Added a `Mypy <http://mypy-lang.org/>`_ job to Travis CI (:issue:`4637`)
+* Added a `Mypy <https://mypy-lang.org/>`_ job to Travis CI (:issue:`4637`)

 * Made use of set literals in tests (:issue:`4573`)

@@ -2997,7 +2997,7 @@ Quality assurance
 * Cleaned up code (:issue:`3937`, :issue:`4208`, :issue:`4209`,
   :issue:`4210`, :issue:`4212`, :issue:`4369`, :issue:`4376`, :issue:`4378`)

-.. _Bandit: https://bandit.readthedocs.io/
+.. _Bandit: https://bandit.readthedocs.io/en/latest/
 .. _Flake8: https://flake8.pycqa.org/en/latest/


@@ -4172,7 +4172,7 @@ Docs
 - Update Contributing docs, document new support channels
   (:issue:`2762`, issue:`3038`)
 - Include references to Scrapy subreddit in the docs
-- Fix broken links; use https:// for external links
+- Fix broken links; use ``https://`` for external links
   (:issue:`2978`, :issue:`2982`, :issue:`2958`)
 - Document CloseSpider extension better (:issue:`2759`)
 - Use ``pymongo.collection.Collection.insert_one()`` in MongoDB example

@@ -4773,7 +4773,7 @@ This 1.1 release brings a lot of interesting features and bug fixes:
 - Don't retry bad requests (HTTP 400) by default (:issue:`1289`).
   If you need the old behavior, add ``400`` to :setting:`RETRY_HTTP_CODES`.
 - Fix shell files argument handling (:issue:`1710`, :issue:`1550`).
-  If you try ``scrapy shell index.html`` it will try to load the URL http://index.html,
+  If you try ``scrapy shell index.html`` it will try to load the URL ``http://index.html``,
   use ``scrapy shell ./index.html`` to load a local file.
 - Robots.txt compliance is now enabled by default for newly-created projects
   (:issue:`1724`). Scrapy will also wait for robots.txt to be downloaded

@@ -5449,7 +5449,7 @@ Scrapy 0.24.5 (2015-02-25)
 Scrapy 0.24.4 (2014-08-09)
 --------------------------

-- pem file is used by mockserver and required by scrapy bench (:commit:`5eddc68`)
+- pem file is used by mockserver and required by scrapy bench (:commit:`5eddc68b63`)
 - scrapy bench needs scrapy.tests* (:commit:`d6cb999`)

 Scrapy 0.24.3 (2014-08-09)

@@ -5970,7 +5970,7 @@ Scrapy changes:
 - nested items now fully supported in JSON and JSONLines exporters
 - added :reqmeta:`cookiejar` Request meta key to support multiple cookie sessions per spider
 - decoupled encoding detection code to `w3lib.encoding`_, and ported Scrapy code to use that module
-- dropped support for Python 2.5. See https://blog.scrapinghub.com/2012/02/27/scrapy-0-15-dropping-support-for-python-2-5/
+- dropped support for Python 2.5. See https://www.zyte.com/blog/scrapy-0-15-dropping-support-for-python-2-5/
 - dropped support for Twisted 2.5
 - added :setting:`REFERER_ENABLED` setting, to control referer middleware
 - changed default user agent to: ``Scrapy/VERSION (+http://scrapy.org)``

@@ -6048,7 +6048,7 @@ Scrapy 0.14
 New features and settings
 ~~~~~~~~~~~~~~~~~~~~~~~~~

-- Support for `AJAX crawlable urls`_
+- Support for AJAX crawlable urls
 - New persistent scheduler that stores requests on disk, allowing to suspend and resume crawls (:rev:`2737`)
 - added ``-o`` option to ``scrapy crawl``, a shortcut for dumping scraped items into a file (or standard output using ``-``)
 - Added support for passing custom settings to Scrapyd ``schedule.json`` api (:rev:`2779`, :rev:`2783`)

@@ -6319,11 +6319,10 @@ Scrapy 0.7
 First release of Scrapy.


-.. _AJAX crawlable urls: https://developers.google.com/search/docs/ajax-crawling/docs/getting-started?csw=1
 .. _boto3: https://github.com/boto/boto3
 .. _botocore: https://github.com/boto/botocore
 .. _chunked transfer encoding: https://en.wikipedia.org/wiki/Chunked_transfer_encoding
-.. _ClientForm: http://wwwsearch.sourceforge.net/old/ClientForm/
+.. _ClientForm: https://pypi.org/project/ClientForm/
 .. _Creating a pull request: https://help.github.com/en/articles/creating-a-pull-request
 .. _cryptography: https://cryptography.io/en/latest/
 .. _docstrings: https://docs.python.org/3/glossary.html#term-docstring

@@ -6335,7 +6334,7 @@ First release of Scrapy.
 .. _parsel.csstranslator.GenericTranslator: https://parsel.readthedocs.io/en/latest/parsel.html#parsel.csstranslator.GenericTranslator
 .. _parsel.csstranslator.HTMLTranslator: https://parsel.readthedocs.io/en/latest/parsel.html#parsel.csstranslator.HTMLTranslator
 .. _parsel.csstranslator.XPathExpr: https://parsel.readthedocs.io/en/latest/parsel.html#parsel.csstranslator.XPathExpr
-.. _PEP 257: https://www.python.org/dev/peps/pep-0257/
+.. _PEP 257: https://peps.python.org/pep-0257/
 .. _Pillow: https://python-pillow.org/
 .. _pyOpenSSL: https://www.pyopenssl.org/en/stable/
 .. _queuelib: https://github.com/scrapy/queuelib

@@ -6347,7 +6346,7 @@ First release of Scrapy.
 .. _service_identity: https://service-identity.readthedocs.io/en/stable/
 .. _six: https://six.readthedocs.io/
 .. _tox: https://pypi.org/project/tox/
-.. _Twisted: https://twistedmatrix.com/trac/
+.. _Twisted: https://twisted.org/
 .. _w3lib: https://github.com/scrapy/w3lib
 .. _w3lib.encoding: https://github.com/scrapy/w3lib/blob/master/w3lib/encoding.py
 .. _What is cacheable: https://www.w3.org/Protocols/rfc2616/rfc2616-sec14.html#sec14.9.1
@@ -168,9 +168,7 @@ For more information about asynchronous programming and Twisted see these
 links:

 * :doc:`twisted:core/howto/defer-intro`
-* `Twisted - hello, asynchronous programming`_
 * `Twisted Introduction - Krondo`_

-.. _Twisted: https://twistedmatrix.com/trac/
-.. _Twisted - hello, asynchronous programming: http://jessenoller.com/blog/2009/02/11/twisted-hello-asynchronous-programming/
-.. _Twisted Introduction - Krondo: http://krondo.com/an-introduction-to-asynchronous-programming-and-twisted/
+.. _Twisted: https://twisted.org/
+.. _Twisted Introduction - Krondo: https://krondo.com/an-introduction-to-asynchronous-programming-and-twisted/

@@ -186,7 +186,7 @@ Enable crawling of "Ajax Crawlable Pages"
 =========================================

 Some pages (up to 1%, based on empirical data from year 2013) declare
-themselves as `ajax crawlable`_. This means they provide plain HTML
+themselves as ajax crawlable. This means they provide plain HTML
 version of content that is usually available only via AJAX.
 Pages can indicate it in two ways:

@@ -206,8 +206,6 @@ AjaxCrawlMiddleware helps to crawl them correctly.
 It is turned OFF by default because it has some performance overhead,
 and enabling it for focused crawls doesn't make much sense.
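
For a broad crawl where that overhead is acceptable, the middleware is switched on
from the project settings; a one-line sketch (``AJAXCRAWL_ENABLED`` is the setting
this middleware reads, project context assumed):

    # settings.py of a hypothetical broad-crawl project
    AJAXCRAWL_ENABLED = True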

-.. _ajax crawlable: https://developers.google.com/search/docs/ajax-crawling/docs/getting-started
-
 .. _broad-crawls-bfo:

 Crawl in BFO order

@@ -54,6 +54,6 @@ just like ``scrapyd-deploy``.
 .. _scrapyd-client: https://github.com/scrapy/scrapyd-client
 .. _scrapyd-deploy documentation: https://scrapyd.readthedocs.io/en/latest/deploy.html
 .. _shub: https://shub.readthedocs.io/en/latest/
-.. _Zyte: https://zyte.com/
+.. _Zyte: https://www.zyte.com/
 .. _Zyte Scrapy Cloud: https://www.zyte.com/scrapy-cloud/
 .. _Zyte Scrapy Cloud documentation: https://docs.zyte.com/scrapy-cloud.html

@@ -278,7 +278,7 @@ into our ``url``.

 In more complex websites, it could be difficult to easily reproduce the
 requests, as we could need to add ``headers`` or ``cookies`` to make it work.
-In those cases you can export the requests in `cURL <https://curl.haxx.se/>`_
+In those cases you can export the requests in `cURL <https://curl.se/>`_
 format, by right-clicking on each of them in the network tool and using the
 :meth:`~scrapy.Request.from_curl()` method to generate an equivalent
 request:

@@ -1105,7 +1105,7 @@ Parsers vary in several aspects:

 * Support for wildcard matching

-* Usage of `length based rule <https://developers.google.com/search/reference/robots_txt#order-of-precedence-for-group-member-lines>`_:
+* Usage of `length based rule <https://developers.google.com/search/docs/crawling-indexing/robots/robots_txt#order-of-precedence-for-rules>`_:
   in particular for ``Allow`` and ``Disallow`` directives, where the most
   specific rule based on the length of the path trumps the less specific
   (shorter) rule

@@ -1123,7 +1123,7 @@ Based on `Protego <https://github.com/scrapy/protego>`_:
 * implemented in Python

 * is compliant with `Google's Robots.txt Specification
-  <https://developers.google.com/search/reference/robots_txt>`_
+  <https://developers.google.com/search/docs/crawling-indexing/robots/robots_txt>`_

 * supports wildcard matching

@@ -1158,7 +1158,7 @@ In order to use this parser, set:
 Robotexclusionrulesparser
 ~~~~~~~~~~~~~~~~~~~~~~~~~

-Based on `Robotexclusionrulesparser <http://nikitathespider.com/python/rerp/>`_:
+Based on `Robotexclusionrulesparser <https://pypi.org/project/robotexclusionrulesparser/>`_:

 * implemented in Python

@@ -1171,7 +1171,7 @@ Based on `Robotexclusionrulesparser <http://nikitathespider.com/python/rerp/>`_:

 In order to use this parser:

-* Install `Robotexclusionrulesparser <http://nikitathespider.com/python/rerp/>`_ by running
+* Install ``Robotexclusionrulesparser`` by running
   ``pip install robotexclusionrulesparser``

 * Set :setting:`ROBOTSTXT_PARSER` setting to

@@ -1231,9 +1231,7 @@ AjaxCrawlMiddleware
 .. class:: AjaxCrawlMiddleware

     Middleware that finds 'AJAX crawlable' page variants based
-    on meta-fragment html tag. See
-    https://developers.google.com/search/docs/ajax-crawling/docs/getting-started
-    for more info.
+    on meta-fragment html tag.

     .. note::
@@ -85,9 +85,8 @@ It might be enough to yield a :class:`~scrapy.Request` with the same HTTP
 method and URL. However, you may also need to reproduce the body, headers and
 form parameters (see :class:`~scrapy.FormRequest`) of that request.

-As all major browsers allow to export the requests in `cURL
-<https://curl.haxx.se/>`_ format, Scrapy incorporates the method
-:meth:`~scrapy.Request.from_curl()` to generate an equivalent
+As all major browsers allow to export the requests in curl_ format, Scrapy
+incorporates the method :meth:`~scrapy.Request.from_curl()` to generate an equivalent
 :class:`~scrapy.Request` from a cURL command. To get more information
 visit :ref:`request from curl <requests-from-curl>` inside the network
 tool section.
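
A minimal sketch of that workflow, pasting a copied command into
``Request.from_curl()`` (the cURL command below is invented for illustration):

    from scrapy import Request

    # A command copied from the browser's network tool ("Copy as cURL"); made up here.
    curl_command = (
        "curl 'https://quotes.toscrape.com/api/quotes?page=1' "
        "-H 'Accept: application/json'"
    )
    request = Request.from_curl(curl_command)
    print(request.method, request.url, request.headers.get("Accept"))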

@@ -289,7 +288,7 @@ We recommend using `scrapy-playwright`_ for a better integration.
 .. _JavaScript: https://en.wikipedia.org/wiki/JavaScript
 .. _Splash: https://github.com/scrapinghub/splash
 .. _chompjs: https://github.com/Nykakin/chompjs
-.. _curl: https://curl.haxx.se/
+.. _curl: https://curl.se/
 .. _headless browser: https://en.wikipedia.org/wiki/Headless_browser
 .. _js2xml: https://github.com/scrapinghub/js2xml
 .. _playwright-python: https://github.com/microsoft/playwright-python

@@ -546,8 +546,4 @@ Invokes a :doc:`Python debugger <library/pdb>` inside a running Scrapy process w
 signal is received. After the debugger is exited, the Scrapy process continues
 running normally.

-For more info see `Debugging in Python`_.
-
 This extension only works on POSIX-compliant platforms (i.e. not Windows).
-
-.. _Debugging in Python: https://pythonconquerstheuniverse.wordpress.com/2009/09/10/debugging-in-python/

@@ -213,7 +213,7 @@ passed through the following settings:
 - :setting:`AWS_SECRET_ACCESS_KEY`
 - :setting:`AWS_SESSION_TOKEN` (only needed for `temporary security credentials`_)

-.. _temporary security credentials: https://docs.aws.amazon.com/general/latest/gr/aws-sec-cred-types.html#temporary-access-keys
+.. _temporary security credentials: https://docs.aws.amazon.com/IAM/latest/UserGuide/security-creds.html
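
Put together, the settings this section lists usually end up in ``settings.py``
roughly as in the following sketch (bucket name and credential values are
placeholders, and the ``FEEDS`` entry is just one possible feed definition):

    # Hypothetical project settings for an S3 feed export with temporary credentials.
    AWS_ACCESS_KEY_ID = "AKIA..."  # placeholder
    AWS_SECRET_ACCESS_KEY = "..."  # placeholder
    AWS_SESSION_TOKEN = "..."  # only needed for temporary security credentials
    FEEDS = {
        "s3://example-bucket/%(name)s/%(time)s.jsonl": {"format": "jsonlines"},
    }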

 You can also define a custom ACL, custom endpoint, and region name for exported
 feeds using these settings:

@@ -248,7 +248,7 @@ The feeds are stored on `Google Cloud Storage`_.

 - Required external libraries: `google-cloud-storage`_.

-For more information about authentication, please refer to `Google Cloud documentation <https://cloud.google.com/docs/authentication/production>`_.
+For more information about authentication, please refer to `Google Cloud documentation <https://cloud.google.com/docs/authentication>`_.

 You can set a *Project ID* and *Access Control List (ACL)* through the following settings:

@@ -516,8 +516,7 @@ as a fallback value if that key is not provided for a specific feed definition:
 .. note:: Some FTP servers may not support appending to files (the
    ``APPE`` FTP command).

-- :ref:`topics-feed-storage-s3`: ``True`` (appending `is not supported
-  <https://forums.aws.amazon.com/message.jspa?messageID=540395>`_)
+- :ref:`topics-feed-storage-s3`: ``True`` (appending is not supported)

 - :ref:`topics-feed-storage-gcs`: ``True`` (appending is not supported)

@@ -816,5 +815,5 @@ source spider in the feed URI:
 .. _URIs: https://en.wikipedia.org/wiki/Uniform_Resource_Identifier
 .. _Amazon S3: https://aws.amazon.com/s3/
 .. _boto3: https://github.com/boto/boto3
-.. _Canned ACL: https://docs.aws.amazon.com/AmazonS3/latest/dev/acl-overview.html#canned-acl
+.. _Canned ACL: https://docs.aws.amazon.com/AmazonS3/latest/userguide/acl-overview.html#canned-acl
 .. _Google Cloud Storage: https://cloud.google.com/storage/

@@ -175,7 +175,7 @@ method and how to clean up the resources properly.
             return item

 .. _MongoDB: https://www.mongodb.com/
-.. _pymongo: https://api.mongodb.com/python/current/
+.. _pymongo: https://pymongo.readthedocs.io/en/stable/


 .. _ScreenshotPipeline:

@@ -221,7 +221,7 @@ the :attr:`Item.fields` attribute.
    `attr.ib`_ for additional information.

 .. _dataclasses.field: https://docs.python.org/3/library/dataclasses.html#dataclasses.field
-.. _attr.ib: https://www.attrs.org/en/stable/api.html#attr.ib
+.. _attr.ib: https://www.attrs.org/en/stable/api-attr.html#attr.ib


 Working with Item objects

@@ -261,7 +261,7 @@ policy:
 For more information, see `canned ACLs`_ in the Amazon S3 Developer Guide.

 You can also use other S3-like storages. Storages like self-hosted `Minio`_ or
-`s3.scality`_. All you need to do is set endpoint option in you Scrapy
+`Zenko CloudServer`_. All you need to do is set endpoint option in you Scrapy
 settings:

 .. code-block:: python

@@ -276,9 +276,9 @@ For self-hosting you also might feel the need not to use SSL and not to verify S
     AWS_VERIFY = False # or True (None by default)

 .. _botocore: https://github.com/boto/botocore
-.. _canned ACLs: https://docs.aws.amazon.com/AmazonS3/latest/dev/acl-overview.html#canned-acl
+.. _canned ACLs: https://docs.aws.amazon.com/AmazonS3/latest/userguide/acl-overview.html#canned-acl
 .. _Minio: https://github.com/minio/minio
-.. _s3.scality: https://s3.scality.com/
+.. _Zenko CloudServer: https://www.zenko.io/cloudserver/


 .. _media-pipeline-gcs:

@@ -303,7 +303,7 @@ For example, these are valid :setting:`IMAGES_STORE` and :setting:`GCS_PROJECT_I

 For information about authentication, see this `documentation`_.

-.. _documentation: https://cloud.google.com/docs/authentication/production
+.. _documentation: https://cloud.google.com/docs/authentication

 You can modify the Access Control List (ACL) policy used for the stored files,
 which is defined by the :setting:`FILES_STORE_GCS_ACL` and

@@ -591,7 +591,7 @@ Another common case would be to extract all direct ``<p>`` children:
 For more details about relative XPaths see the `Location Paths`_ section in the
 XPath specification.
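
A small runnable sketch of the difference discussed above, using a made-up HTML
snippet: ``.//p`` stays inside the selected ``<div>``, while ``//p`` searches the
whole document again:

    from scrapy import Selector

    sel = Selector(text="<div><p>inside</p></div><p>outside</p>")
    div = sel.xpath("//div")[0]
    print(div.xpath(".//p/text()").getall())  # ['inside']
    print(div.xpath("//p/text()").getall())   # ['inside', 'outside']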

-.. _Location Paths: https://www.w3.org/TR/xpath/all/#location-paths
+.. _Location Paths: https://www.w3.org/TR/xpath-10/#location-paths

 When querying by class, consider using CSS
 ------------------------------------------

@@ -727,7 +727,7 @@ But using the ``.`` to mean the node, works:
     >>> sel.xpath("//a[contains(., 'Next Page')]").getall()
     ['<a href="#">Click here to go to the <strong>Next Page</strong></a>']

-.. _`XPath string function`: https://www.w3.org/TR/xpath/all/#section-String-Functions
+.. _`XPath string function`: https://www.w3.org/TR/xpath-10/#section-String-Functions

 .. _topics-selectors-xpath-variables:

@@ -801,8 +801,8 @@ This is how the file starts::
     ...

 You can see several namespace declarations including a default
-"http://www.w3.org/2005/Atom" and another one using the "gd:" prefix for
-"http://schemas.google.com/g/2005".
+``"http://www.w3.org/2005/Atom"`` and another one using the ``gd:`` prefix for
+``"http://schemas.google.com/g/2005"``.
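
For a quick feel of why that default namespace matters, a tiny sketch with a
made-up Atom snippet; plain XPath finds nothing until the namespaces are removed:

    from scrapy import Selector

    xml = '<feed xmlns="http://www.w3.org/2005/Atom"><title>example</title></feed>'
    sel = Selector(text=xml, type="xml")
    print(sel.xpath("//title/text()").get())  # None: the default namespace is in the way
    sel.remove_namespaces()
    print(sel.xpath("//title/text()").get())  # 'example'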

 .. highlight:: python


@@ -288,7 +288,7 @@ The AWS security token used by code that requires access to `Amazon Web services
 such as the :ref:`S3 feed storage backend <topics-feed-storage-s3>`, when using
 `temporary security credentials`_.

-.. _temporary security credentials: https://docs.aws.amazon.com/general/latest/gr/aws-sec-cred-types.html#temporary-access-keys
+.. _temporary security credentials: https://docs.aws.amazon.com/IAM/latest/UserGuide/security-creds.html

 .. setting:: AWS_ENDPOINT_URL


@@ -617,7 +617,7 @@ necessary to access certain HTTPS websites: for example, you may need to use
 ``'DEFAULT:!DH'`` for a website with weak DH parameters or enable a
 specific cipher that is not included in ``DEFAULT`` if a website requires it.
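
In practice that means overriding the cipher-list setting documented in this
section, e.g. (value taken from the sentence above; the setting is assumed to be
``DOWNLOADER_CLIENT_TLS_CIPHERS``):

    # settings.py: allow a site with weak DH parameters, in OpenSSL cipher list format
    DOWNLOADER_CLIENT_TLS_CIPHERS = "DEFAULT:!DH"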

-.. _OpenSSL cipher list format: https://www.openssl.org/docs/manmaster/man1/openssl-ciphers.html#CIPHER-LIST-FORMAT
+.. _OpenSSL cipher list format: https://docs.openssl.org/master/man1/openssl-ciphers/#cipher-list-format

 .. setting:: DOWNLOADER_CLIENT_TLS_METHOD


@@ -829,9 +829,9 @@ The default HTTPS handler uses HTTP/1.1. To use HTTP/2:
 - No support for the :signal:`bytes_received` and
   :signal:`headers_received` signals.

-.. _frame size: https://tools.ietf.org/html/rfc7540#section-4.2
+.. _frame size: https://datatracker.ietf.org/doc/html/rfc7540#section-4.2
 .. _http2 faq: https://http2.github.io/faq/#does-http2-require-encryption
-.. _server pushes: https://tools.ietf.org/html/rfc7540#section-8.2
+.. _server pushes: https://datatracker.ietf.org/doc/html/rfc7540#section-8.2

 .. setting:: DOWNLOAD_SLOTS


@@ -1074,7 +1074,7 @@ in ``Request`` meta.
     some FTP servers explicitly ask for the user's e-mail address
     and will not allow login with the "guest" password.

-.. _RFC 1635: https://tools.ietf.org/html/rfc1635
+.. _RFC 1635: https://datatracker.ietf.org/doc/html/rfc1635

 .. reqmeta:: ftp_user
 .. setting:: FTP_USER


@@ -1,6 +1,6 @@
 # Run tests, generate coverage report and open it on a browser
 #
-# Requires: coverage 3.3 or above from https://pypi.python.org/pypi/coverage
+# Requires: coverage 3.3 or above from https://pypi.org/pypi/coverage

 coverage run --branch $(which trial) --reporter=text tests
 coverage html -i

@@ -24,7 +24,6 @@ logger = logging.getLogger(__name__)
 class AjaxCrawlMiddleware:
     """
     Handle 'AJAX crawlable' pages marked as crawlable via meta tag.
-    For more info see https://developers.google.com/webmasters/ajax-crawling/docs/getting-started.
     """

     def __init__(self, settings: BaseSettings):

@@ -70,8 +69,7 @@ class AjaxCrawlMiddleware:

     def _has_ajax_crawlable_variant(self, response: Response) -> bool:
         """
-        Return True if a page without hash fragment could be "AJAX crawlable"
-        according to https://developers.google.com/webmasters/ajax-crawling/docs/getting-started.
+        Return True if a page without hash fragment could be "AJAX crawlable".
         """
         body = response.text[: self.lookup_bytes]
         return _has_ajaxcrawlable_meta(body)

@@ -222,7 +222,7 @@ class Request(object_ref):
         **kwargs: Any,
     ) -> Self:
         """Create a Request object from a string containing a `cURL
-        <https://curl.haxx.se/>`_ command. It populates the HTTP method, the
+        <https://curl.se/>`_ command. It populates the HTTP method, the
         URL, the headers, the cookies and the body. It accepts the same
         arguments as the :class:`Request` class, taking preference and
         overriding the values of the same arguments contained in the cURL

@@ -46,17 +46,15 @@ def fingerprint(

     The request fingerprint is a hash that uniquely identifies the resource the
     request points to. For example, take the following two urls:

-    http://www.example.com/query?id=111&cat=222
-    http://www.example.com/query?cat=222&id=111
+    ``http://www.example.com/query?id=111&cat=222``,
+    ``http://www.example.com/query?cat=222&id=111``.

     Even though those are two different URLs both point to the same resource
     and are equivalent (i.e. they should return the same response).

     Another example are cookies used to store session ids. Suppose the
     following page is only accessible to authenticated users:

-    http://www.example.com/members/offers.html
+    ``http://www.example.com/members/offers.html``.

     Lots of sites use a cookie to store the session id, which adds a random
     component to the HTTP Request and thus should be ignored when calculating
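
A minimal sketch of the behaviour the docstring describes: the two query-string
orderings above should map to the same fingerprint under the default,
URL-canonicalizing fingerprinting:

    from scrapy import Request
    from scrapy.utils.request import fingerprint

    fp1 = fingerprint(Request("http://www.example.com/query?id=111&cat=222"))
    fp2 = fingerprint(Request("http://www.example.com/query?cat=222&id=111"))
    print(fp1 == fp2)  # expected: True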

@@ -61,8 +61,7 @@ def parse_url(url: UrlT, encoding: str | None = None) -> ParseResult:

 def escape_ajax(url: str) -> str:
     """
-    Return the crawlable url according to:
-    https://developers.google.com/webmasters/ajax-crawling/docs/getting-started
+    Return the crawlable url

     >>> escape_ajax("www.example.com/ajax.html#!key=value")
     'www.example.com/ajax.html?_escaped_fragment_=key%3Dvalue'

@@ -143,7 +143,7 @@ class RequestTest(unittest.TestCase):
         # percent-escaping sequences that do not match valid UTF-8 sequences
         # should be kept untouched (just upper-cased perhaps)
         #
-        # See https://tools.ietf.org/html/rfc3987#section-3.2
+        # See https://datatracker.ietf.org/doc/html/rfc3987#section-3.2
         #
         # "Conversions from URIs to IRIs MUST NOT use any character encoding
         # other than UTF-8 in steps 3 and 4, even if it might be possible to

@@ -220,9 +220,7 @@ skip_pillow: str | None
 try:
     from PIL import Image  # noqa: imported just to check for the import error
 except ImportError:
-    skip_pillow = (
-        "Missing Python Imaging Library, install https://pypi.python.org/pypi/Pillow"
-    )
+    skip_pillow = "Missing Python Imaging Library, install https://pypi.org/pypi/Pillow"
 else:
     skip_pillow = None

@@ -19,9 +19,7 @@ skip_pillow: str | None
 try:
     from PIL import Image
 except ImportError:
-    skip_pillow = (
-        "Missing Python Imaging Library, install https://pypi.python.org/pypi/Pillow"
-    )
+    skip_pillow = "Missing Python Imaging Library, install https://pypi.org/pypi/Pillow"
 else:
     encoders = {"jpeg_encoder", "jpeg_decoder"}
     if not encoders.issubset(set(Image.core.__dict__)):  # type: ignore[attr-defined]

@@ -21,7 +21,7 @@ try:
     from PIL import Image  # noqa: imported just to check for the import error
 except ImportError:
     skip_pillow: str | None = (
-        "Missing Python Imaging Library, install https://pypi.python.org/pypi/Pillow"
+        "Missing Python Imaging Library, install https://pypi.org/pypi/Pillow"
     )
 else:
     skip_pillow = None