
Merge remote-tracking branch 'upstream/master' into remove-six-code

Eugenio Lacuesta 2019-11-19 11:01:34 -03:00
commit 05785c1c17
51 changed files with 634 additions and 202 deletions

.bandit.yml Normal file

@ -0,0 +1,16 @@
skips:
- B101
- B105
- B303
- B306
- B307
- B311
- B320
- B321
- B402
- B404
- B406
- B410
- B503
- B603
- B605


@ -7,6 +7,8 @@ branches:
- /^\d\.\d+\.\d+(rc\d+|\.dev\d+)?$/
matrix:
include:
- env: TOXENV=security
python: 3.8
- env: TOXENV=flake8
python: 3.8
- env: TOXENV=pypy3


@ -68,7 +68,7 @@ members of the project's leadership.
## Attribution
This Code of Conduct is adapted from the [Contributor Covenant][homepage], version 1.4,
available at [http://contributor-covenant.org/version/1/4][version]
available at [http://contributor-covenant.org/version/1/4][version].
[homepage]: http://contributor-covenant.org
[version]: http://contributor-covenant.org/version/1/4/


@ -34,8 +34,8 @@ Scrapy is a fast high-level web crawling and web scraping framework, used to
crawl websites and extract structured data from their pages. It can be used for
a wide range of purposes, from data mining to monitoring and automated testing.
For more information including a list of features check the Scrapy homepage at:
https://scrapy.org
Check the Scrapy homepage at https://scrapy.org for more information,
including a list of features.
Requirements
============
@ -50,8 +50,8 @@ The quick way::
pip install scrapy
For more details see the install section in the documentation:
https://docs.scrapy.org/en/latest/intro/install.html
See the install section in the documentation at
https://docs.scrapy.org/en/latest/intro/install.html for more details.
Documentation
=============
@ -62,17 +62,17 @@ directory.
Releases
========
You can find release notes at https://docs.scrapy.org/en/latest/news.html
You can check https://docs.scrapy.org/en/latest/news.html for the release notes.
Community (blog, twitter, mail list, IRC)
=========================================
See https://scrapy.org/community/
See https://scrapy.org/community/ for details.
Contributing
============
See https://docs.scrapy.org/en/master/contributing.html
See https://docs.scrapy.org/en/master/contributing.html for details.
Code of Conduct
---------------
@ -86,9 +86,9 @@ Please report unacceptable behavior to opensource@scrapinghub.com.
Companies using Scrapy
======================
See https://scrapy.org/companies/
See https://scrapy.org/companies/ for a list.
Commercial Support
==================
See https://scrapy.org/support/
See https://scrapy.org/support/ for details.

docs/_tests/quotes1.html Normal file

@ -0,0 +1,281 @@
<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="UTF-8">
<title>Quotes to Scrape</title>
<link rel="stylesheet" href="/static/bootstrap.min.css">
<link rel="stylesheet" href="/static/main.css">
</head>
<body>
<div class="container">
<div class="row header-box">
<div class="col-md-8">
<h1>
<a href="/" style="text-decoration: none">Quotes to Scrape</a>
</h1>
</div>
<div class="col-md-4">
<p>
<a href="/login">Login</a>
</p>
</div>
</div>
<div class="row">
<div class="col-md-8">
<div class="quote" itemscope itemtype="http://schema.org/CreativeWork">
<span class="text" itemprop="text">“The world as we have created it is a process of our thinking. It cannot be changed without changing our thinking.”</span>
<span>by <small class="author" itemprop="author">Albert Einstein</small>
<a href="/author/Albert-Einstein">(about)</a>
</span>
<div class="tags">
Tags:
<meta class="keywords" itemprop="keywords" content="change,deep-thoughts,thinking,world" / >
<a class="tag" href="/tag/change/page/1/">change</a>
<a class="tag" href="/tag/deep-thoughts/page/1/">deep-thoughts</a>
<a class="tag" href="/tag/thinking/page/1/">thinking</a>
<a class="tag" href="/tag/world/page/1/">world</a>
</div>
</div>
<div class="quote" itemscope itemtype="http://schema.org/CreativeWork">
<span class="text" itemprop="text">“It is our choices, Harry, that show what we truly are, far more than our abilities.”</span>
<span>by <small class="author" itemprop="author">J.K. Rowling</small>
<a href="/author/J-K-Rowling">(about)</a>
</span>
<div class="tags">
Tags:
<meta class="keywords" itemprop="keywords" content="abilities,choices" / >
<a class="tag" href="/tag/abilities/page/1/">abilities</a>
<a class="tag" href="/tag/choices/page/1/">choices</a>
</div>
</div>
<div class="quote" itemscope itemtype="http://schema.org/CreativeWork">
<span class="text" itemprop="text">“There are only two ways to live your life. One is as though nothing is a miracle. The other is as though everything is a miracle.”</span>
<span>by <small class="author" itemprop="author">Albert Einstein</small>
<a href="/author/Albert-Einstein">(about)</a>
</span>
<div class="tags">
Tags:
<meta class="keywords" itemprop="keywords" content="inspirational,life,live,miracle,miracles" / >
<a class="tag" href="/tag/inspirational/page/1/">inspirational</a>
<a class="tag" href="/tag/life/page/1/">life</a>
<a class="tag" href="/tag/live/page/1/">live</a>
<a class="tag" href="/tag/miracle/page/1/">miracle</a>
<a class="tag" href="/tag/miracles/page/1/">miracles</a>
</div>
</div>
<div class="quote" itemscope itemtype="http://schema.org/CreativeWork">
<span class="text" itemprop="text">“The person, be it gentleman or lady, who has not pleasure in a good novel, must be intolerably stupid.”</span>
<span>by <small class="author" itemprop="author">Jane Austen</small>
<a href="/author/Jane-Austen">(about)</a>
</span>
<div class="tags">
Tags:
<meta class="keywords" itemprop="keywords" content="aliteracy,books,classic,humor" / >
<a class="tag" href="/tag/aliteracy/page/1/">aliteracy</a>
<a class="tag" href="/tag/books/page/1/">books</a>
<a class="tag" href="/tag/classic/page/1/">classic</a>
<a class="tag" href="/tag/humor/page/1/">humor</a>
</div>
</div>
<div class="quote" itemscope itemtype="http://schema.org/CreativeWork">
<span class="text" itemprop="text">“Imperfection is beauty, madness is genius and it&#39;s better to be absolutely ridiculous than absolutely boring.”</span>
<span>by <small class="author" itemprop="author">Marilyn Monroe</small>
<a href="/author/Marilyn-Monroe">(about)</a>
</span>
<div class="tags">
Tags:
<meta class="keywords" itemprop="keywords" content="be-yourself,inspirational" / >
<a class="tag" href="/tag/be-yourself/page/1/">be-yourself</a>
<a class="tag" href="/tag/inspirational/page/1/">inspirational</a>
</div>
</div>
<div class="quote" itemscope itemtype="http://schema.org/CreativeWork">
<span class="text" itemprop="text">“Try not to become a man of success. Rather become a man of value.”</span>
<span>by <small class="author" itemprop="author">Albert Einstein</small>
<a href="/author/Albert-Einstein">(about)</a>
</span>
<div class="tags">
Tags:
<meta class="keywords" itemprop="keywords" content="adulthood,success,value" / >
<a class="tag" href="/tag/adulthood/page/1/">adulthood</a>
<a class="tag" href="/tag/success/page/1/">success</a>
<a class="tag" href="/tag/value/page/1/">value</a>
</div>
</div>
<div class="quote" itemscope itemtype="http://schema.org/CreativeWork">
<span class="text" itemprop="text">“It is better to be hated for what you are than to be loved for what you are not.”</span>
<span>by <small class="author" itemprop="author">André Gide</small>
<a href="/author/Andre-Gide">(about)</a>
</span>
<div class="tags">
Tags:
<meta class="keywords" itemprop="keywords" content="life,love" / >
<a class="tag" href="/tag/life/page/1/">life</a>
<a class="tag" href="/tag/love/page/1/">love</a>
</div>
</div>
<div class="quote" itemscope itemtype="http://schema.org/CreativeWork">
<span class="text" itemprop="text">“I have not failed. I&#39;ve just found 10,000 ways that won&#39;t work.”</span>
<span>by <small class="author" itemprop="author">Thomas A. Edison</small>
<a href="/author/Thomas-A-Edison">(about)</a>
</span>
<div class="tags">
Tags:
<meta class="keywords" itemprop="keywords" content="edison,failure,inspirational,paraphrased" / >
<a class="tag" href="/tag/edison/page/1/">edison</a>
<a class="tag" href="/tag/failure/page/1/">failure</a>
<a class="tag" href="/tag/inspirational/page/1/">inspirational</a>
<a class="tag" href="/tag/paraphrased/page/1/">paraphrased</a>
</div>
</div>
<div class="quote" itemscope itemtype="http://schema.org/CreativeWork">
<span class="text" itemprop="text">“A woman is like a tea bag; you never know how strong it is until it&#39;s in hot water.”</span>
<span>by <small class="author" itemprop="author">Eleanor Roosevelt</small>
<a href="/author/Eleanor-Roosevelt">(about)</a>
</span>
<div class="tags">
Tags:
<meta class="keywords" itemprop="keywords" content="misattributed-eleanor-roosevelt" / >
<a class="tag" href="/tag/misattributed-eleanor-roosevelt/page/1/">misattributed-eleanor-roosevelt</a>
</div>
</div>
<div class="quote" itemscope itemtype="http://schema.org/CreativeWork">
<span class="text" itemprop="text">“A day without sunshine is like, you know, night.”</span>
<span>by <small class="author" itemprop="author">Steve Martin</small>
<a href="/author/Steve-Martin">(about)</a>
</span>
<div class="tags">
Tags:
<meta class="keywords" itemprop="keywords" content="humor,obvious,simile" / >
<a class="tag" href="/tag/humor/page/1/">humor</a>
<a class="tag" href="/tag/obvious/page/1/">obvious</a>
<a class="tag" href="/tag/simile/page/1/">simile</a>
</div>
</div>
<nav>
<ul class="pager">
<li class="next">
<a href="/page/2/">Next <span aria-hidden="true">&rarr;</span></a>
</li>
</ul>
</nav>
</div>
<div class="col-md-4 tags-box">
<h2>Top Ten tags</h2>
<span class="tag-item">
<a class="tag" style="font-size: 28px" href="/tag/love/">love</a>
</span>
<span class="tag-item">
<a class="tag" style="font-size: 26px" href="/tag/inspirational/">inspirational</a>
</span>
<span class="tag-item">
<a class="tag" style="font-size: 26px" href="/tag/life/">life</a>
</span>
<span class="tag-item">
<a class="tag" style="font-size: 24px" href="/tag/humor/">humor</a>
</span>
<span class="tag-item">
<a class="tag" style="font-size: 22px" href="/tag/books/">books</a>
</span>
<span class="tag-item">
<a class="tag" style="font-size: 14px" href="/tag/reading/">reading</a>
</span>
<span class="tag-item">
<a class="tag" style="font-size: 10px" href="/tag/friendship/">friendship</a>
</span>
<span class="tag-item">
<a class="tag" style="font-size: 8px" href="/tag/friends/">friends</a>
</span>
<span class="tag-item">
<a class="tag" style="font-size: 8px" href="/tag/truth/">truth</a>
</span>
<span class="tag-item">
<a class="tag" style="font-size: 6px" href="/tag/simile/">simile</a>
</span>
</div>
</div>
</div>
<footer class="footer">
<div class="container">
<p class="text-muted">
Quotes by: <a href="https://www.goodreads.com/quotes">GoodReads.com</a>
</p>
<p class="copyright">
Made with <span class='sh-red'></span> by <a href="https://scrapinghub.com">Scrapinghub</a>
</p>
</div>
</footer>
</body>
</html>


@ -27,10 +27,12 @@ sys.path.insert(0, path.dirname(path.dirname(__file__)))
# Add any Sphinx extension module names here, as strings. They can be extensions
# coming with Sphinx (named 'sphinx.ext.*') or your custom ones.
extensions = [
'notfound.extension',
'scrapydocs',
'sphinx.ext.autodoc',
'sphinx.ext.coverage',
'sphinx.ext.intersphinx',
'sphinx.ext.viewcode',
]
# Add any paths that contain templates here, relative to this directory.
@ -237,7 +239,7 @@ coverage_ignore_pyobjects = [
r'\bContractsManager\b$',
# For default contracts we only want to document their general purpose in
# their constructor, the methods they reimplement to achieve that purpose
# their __init__ method, the methods they reimplement to achieve that purpose
# should be irrelevant to developers using those contracts.
r'\w+Contract\.(adjust_request_args|(pre|post)_process)$',
@ -273,4 +275,5 @@ coverage_ignore_pyobjects = [
intersphinx_mapping = {
'python': ('https://docs.python.org/3', None),
'sphinx': ('https://www.sphinx-doc.org/en/stable', None),
}

docs/conftest.py Normal file

@ -0,0 +1,29 @@
import os
from doctest import ELLIPSIS, NORMALIZE_WHITESPACE
from scrapy.http.response.html import HtmlResponse
from sybil import Sybil
from sybil.parsers.codeblock import CodeBlockParser
from sybil.parsers.doctest import DocTestParser
from sybil.parsers.skip import skip
def load_response(url, filename):
input_path = os.path.join(os.path.dirname(__file__), '_tests', filename)
with open(input_path, 'rb') as input_file:
return HtmlResponse(url, body=input_file.read())
def setup(namespace):
namespace['load_response'] = load_response
pytest_collect_file = Sybil(
parsers=[
DocTestParser(optionflags=ELLIPSIS | NORMALIZE_WHITESPACE),
CodeBlockParser(future_imports=['print_function']),
skip,
],
pattern='*.rst',
setup=setup,
).pytest()


@ -177,20 +177,19 @@ Documentation policies
======================
For reference documentation of API members (classes, methods, etc.) use
docstrings and make sure that the Sphinx documentation uses the autodoc_
extension to pull the docstrings. API reference documentation should follow
docstring conventions (`PEP 257`_) and be IDE-friendly: short, to the point,
and it may provide short examples.
docstrings and make sure that the Sphinx documentation uses the
:mod:`~sphinx.ext.autodoc` extension to pull the docstrings. API reference
documentation should follow docstring conventions (`PEP 257`_) and be
IDE-friendly: short, to the point, and it may provide short examples.
Other types of documentation, such as tutorials or topics, should be covered in
files within the ``docs/`` directory. This includes documentation that is
specific to an API member, but goes beyond API reference documentation.
In any case, if something is covered in a docstring, use the autodoc_
extension to pull the docstring into the documentation instead of duplicating
the docstring in files within the ``docs/`` directory.
.. _autodoc: http://www.sphinx-doc.org/en/stable/ext/autodoc.html
In any case, if something is covered in a docstring, use the
:mod:`~sphinx.ext.autodoc` extension to pull the docstring into the
documentation instead of duplicating the docstring in files within the
``docs/`` directory.
Tests
=====


@ -235,13 +235,16 @@ You will see something like::
[s] shelp() Shell help (print this help)
[s] fetch(req_or_url) Fetch request (or URL) and update local objects
[s] view(response) View response in a browser
>>>
Using the shell, you can try selecting elements using `CSS`_ with the response
object::
object:
>>> response.css('title')
[<Selector xpath='descendant-or-self::title' data='<title>Quotes to Scrape</title>'>]
.. invisible-code-block: python
response = load_response('http://quotes.toscrape.com/page/1/', 'quotes1.html')
>>> response.css('title')
[<Selector xpath='descendant-or-self::title' data='<title>Quotes to Scrape</title>'>]
The result of running ``response.css('title')`` is a list-like object called
:class:`~scrapy.selector.SelectorList`, which represents a list of
@ -372,6 +375,9 @@ we want::
We get a list of selectors for the quote HTML elements with::
>>> response.css("div.quote")
[<Selector xpath="descendant-or-self::div[@class and contains(concat(' ', normalize-space(@class), ' '), ' quote ')]" data='<div class="quote" itemscope itemtype...'>,
<Selector xpath="descendant-or-self::div[@class and contains(concat(' ', normalize-space(@class), ' '), ' quote ')]" data='<div class="quote" itemscope itemtype...'>,
...]
Each of the selectors returned by the query above allows us to run further
queries over their sub-elements. Let's assign the first selector to a
@ -396,6 +402,12 @@ to get all of them::
>>> tags
['change', 'deep-thoughts', 'thinking', 'world']
.. invisible-code-block: python
from sys import version_info
.. skip: next if(version_info < (3, 6), reason="Only Python 3.6+ dictionaries match the output")
Having figured out how to extract each bit, we can now iterate over all the
quotes elements and put them together into a Python dictionary::
@ -404,10 +416,9 @@ quotes elements and put them together into a Python dictionary::
... author = quote.css("small.author::text").get()
... tags = quote.css("div.tags a.tag::text").getall()
... print(dict(text=text, author=author, tags=tags))
{'tags': ['change', 'deep-thoughts', 'thinking', 'world'], 'author': 'Albert Einstein', 'text': '“The world as we have created it is a process of our thinking. It cannot be changed without changing our thinking.”'}
{'tags': ['abilities', 'choices'], 'author': 'J.K. Rowling', 'text': '“It is our choices, Harry, that show what we truly are, far more than our abilities.”'}
... a few more of these, omitted for brevity
>>>
{'text': '“The world as we have created it is a process of our thinking. It cannot be changed without changing our thinking.”', 'author': 'Albert Einstein', 'tags': ['change', 'deep-thoughts', 'thinking', 'world']}
{'text': '“It is our choices, Harry, that show what we truly are, far more than our abilities.”', 'author': 'J.K. Rowling', 'tags': ['abilities', 'choices']}
...
Extracting data in our spider
-----------------------------
@ -521,7 +532,7 @@ There is also an ``attrib`` property available
(see :ref:`selecting-attributes` for more)::
>>> response.css('li.next a').attrib['href']
'/page/2'
'/page/2/'
Let's see now our spider modified to recursively follow the link to the next
page, extracting data from it::


@ -308,12 +308,12 @@ New features
convenient way to build JSON requests (:issue:`3504`, :issue:`3505`)
* A ``process_request`` callback passed to the :class:`~scrapy.spiders.Rule`
constructor now receives the :class:`~scrapy.http.Response` object that
``__init__`` method now receives the :class:`~scrapy.http.Response` object that
originated the request as its second argument (:issue:`3682`)
* A new ``restrict_text`` parameter for the
:attr:`LinkExtractor <scrapy.linkextractors.lxmlhtml.LxmlLinkExtractor>`
constructor allows filtering links by linking text (:issue:`3622`,
``__init__`` method allows filtering links by linking text (:issue:`3622`,
:issue:`3635`)
* A new :setting:`FEED_STORAGE_S3_ACL` setting allows defining a custom ACL
@ -479,7 +479,7 @@ The following deprecated APIs have been removed (:issue:`3578`):
* From :class:`~scrapy.selector.Selector`:
* ``_root`` (both the constructor argument and the object property, use
* ``_root`` (both the ``__init__`` method argument and the object property, use
``root``)
* ``extract_unquoted`` (use ``getall``)
@ -2703,7 +2703,7 @@ Scrapy changes:
- removed ``ENCODING_ALIASES`` setting, as encoding auto-detection has been moved to the `w3lib`_ library
- promoted :ref:`topics-djangoitem` to main contrib
- LogFormatter methods now return dicts (instead of strings) to support lazy formatting (:issue:`164`, :commit:`dcef7b0`)
- downloader handlers (:setting:`DOWNLOAD_HANDLERS` setting) now receive settings as the first argument of the constructor
- downloader handlers (:setting:`DOWNLOAD_HANDLERS` setting) now receive settings as the first argument of the ``__init__`` method
- replaced memory usage accounting with (more portable) `resource`_ module, removed ``scrapy.utils.memory`` module
- removed signal: ``scrapy.mail.mail_sent``
- removed ``TRACK_REFS`` setting, now :ref:`trackrefs <topics-leaks-trackrefs>` is always enabled
@ -2917,7 +2917,7 @@ API changes
- ``Request.copy()`` and ``Request.replace()`` now also copies their ``callback`` and ``errback`` attributes (#231)
- Removed ``UrlFilterMiddleware`` from ``scrapy.contrib`` (already disabled by default)
- Offsite middleware doesn't filter out any request coming from a spider that doesn't have an allowed_domains attribute (#225)
- Removed Spider Manager ``load()`` method. Now spiders are loaded in the constructor itself.
- Removed Spider Manager ``load()`` method. Now spiders are loaded in the ``__init__`` method itself.
- Changes to Scrapy Manager (now called "Crawler"):
- ``scrapy.core.manager.ScrapyManager`` class renamed to ``scrapy.crawler.Crawler``
- ``scrapy.core.manager.scrapymanager`` singleton moved to ``scrapy.project.crawler``


@ -1,2 +1,3 @@
Sphinx>=2.1
sphinx_rtd_theme
sphinx-notfound-page
sphinx_rtd_theme


@ -21,7 +21,7 @@ Quick example
=============
There are two ways to instantiate the mail sender. You can instantiate it using
the standard constructor::
the standard ``__init__`` method::
from scrapy.mail import MailSender
mailer = MailSender()
@ -111,7 +111,7 @@ uses `Twisted non-blocking IO`_, like the rest of the framework.
Mail settings
=============
These settings define the default constructor values of the :class:`MailSender`
These settings define the default ``__init__`` method values of the :class:`MailSender`
class, and can be used to configure e-mail notifications in your project without
writing any code (for those extensions and code that uses :class:`MailSender`).
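For illustration, assuming the standard mail settings names, a project might set
something like this in its ``settings.py`` (the values are placeholders)::

    MAIL_FROM = 'scrapy@localhost'   # sender address
    MAIL_HOST = 'localhost'          # SMTP host
    MAIL_PORT = 25                   # SMTP port
    MAIL_USER = None                 # set MAIL_USER/MAIL_PASS to enable SMTP auth
    MAIL_PASS = None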


@ -87,8 +87,8 @@ described next.
1. Declaring a serializer in the field
--------------------------------------
If you use :class:`~.Item` you can declare a serializer in the
:ref:`field metadata <topics-items-fields>`. The serializer must be
If you use :class:`~.Item` you can declare a serializer in the
:ref:`field metadata <topics-items-fields>`. The serializer must be
a callable which receives a value and returns its serialized form.
Example::
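    # Illustrative sketch only: ``Product`` and ``serialize_price`` are
    # placeholder names, not taken from this changeset.
    from scrapy.item import Item, Field

    def serialize_price(value):
        return '$ %s' % str(value)

    class Product(Item):
        name = Field()
        price = Field(serializer=serialize_price)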
@ -144,7 +144,7 @@ BaseItemExporter
defining what fields to export, whether to export empty fields, or which
encoding to use.
These features can be configured through the constructor arguments which
These features can be configured through the ``__init__`` method arguments which
populate their respective instance attributes: :attr:`fields_to_export`,
:attr:`export_empty_fields`, :attr:`encoding`, :attr:`indent`.
@ -246,8 +246,8 @@ XmlItemExporter
:param item_element: The name of each item element in the exported XML.
:type item_element: str
The additional keyword arguments of this constructor are passed to the
:class:`BaseItemExporter` constructor.
The additional keyword arguments of this ``__init__`` method are passed to the
:class:`BaseItemExporter` ``__init__`` method.
A typical output of this exporter would be::
@ -306,9 +306,9 @@ CsvItemExporter
multi-valued fields, if found.
:type include_headers_line: str
The additional keyword arguments of this constructor are passed to the
:class:`BaseItemExporter` constructor, and the leftover arguments to the
`csv.writer`_ constructor, so you can use any ``csv.writer`` constructor
The additional keyword arguments of this ``__init__`` method are passed to the
:class:`BaseItemExporter` ``__init__`` method, and the leftover arguments to the
`csv.writer`_ ``__init__`` method, so you can use any ``csv.writer`` ``__init__`` method
argument to customize this exporter.
A typical output of this exporter would be::
@ -334,8 +334,8 @@ PickleItemExporter
For more information, refer to the `pickle module documentation`_.
The additional keyword arguments of this constructor are passed to the
:class:`BaseItemExporter` constructor.
The additional keyword arguments of this ``__init__`` method are passed to the
:class:`BaseItemExporter` ``__init__`` method.
Pickle isn't a human readable format, so no output examples are provided.
@ -351,8 +351,8 @@ PprintItemExporter
:param file: the file-like object to use for exporting the data. Its ``write`` method should
accept ``bytes`` (a disk file opened in binary mode, a ``io.BytesIO`` object, etc)
The additional keyword arguments of this constructor are passed to the
:class:`BaseItemExporter` constructor.
The additional keyword arguments of this ``__init__`` method are passed to the
:class:`BaseItemExporter` ``__init__`` method.
A typical output of this exporter would be::
@ -367,10 +367,10 @@ JsonItemExporter
.. class:: JsonItemExporter(file, \**kwargs)
Exports Items in JSON format to the specified file-like object, writing all
objects as a list of objects. The additional constructor arguments are
passed to the :class:`BaseItemExporter` constructor, and the leftover
arguments to the `JSONEncoder`_ constructor, so you can use any
`JSONEncoder`_ constructor argument to customize this exporter.
objects as a list of objects. The additional ``__init__`` method arguments are
passed to the :class:`BaseItemExporter` ``__init__`` method, and the leftover
arguments to the `JSONEncoder`_ ``__init__`` method, so you can use any
`JSONEncoder`_ ``__init__`` method argument to customize this exporter.
:param file: the file-like object to use for exporting the data. Its ``write`` method should
accept ``bytes`` (a disk file opened in binary mode, a ``io.BytesIO`` object, etc)
@ -398,10 +398,10 @@ JsonLinesItemExporter
.. class:: JsonLinesItemExporter(file, \**kwargs)
Exports Items in JSON format to the specified file-like object, writing one
JSON-encoded item per line. The additional constructor arguments are passed
to the :class:`BaseItemExporter` constructor, and the leftover arguments to
the `JSONEncoder`_ constructor, so you can use any `JSONEncoder`_
constructor argument to customize this exporter.
JSON-encoded item per line. The additional ``__init__`` method arguments are passed
to the :class:`BaseItemExporter` ``__init__`` method, and the leftover arguments to
the `JSONEncoder`_ ``__init__`` method, so you can use any `JSONEncoder`_
``__init__`` method argument to customize this exporter.
:param file: the file-like object to use for exporting the data. Its ``write`` method should
accept ``bytes`` (a disk file opened in binary mode, a ``io.BytesIO`` object, etc)


@ -28,7 +28,7 @@ Loading & activating extensions
Extensions are loaded and activated at startup by instantiating a single
instance of the extension class. Therefore, all the extension initialization
code must be performed in the class constructor (``__init__`` method).
code must be performed in the class ``__init__`` method.
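As a rough sketch, an extension that does all of its setup at instantiation time
might look like this (the class name and the ``MYEXT_ENABLED`` setting are
placeholders)::

    from scrapy import signals
    from scrapy.exceptions import NotConfigured

    class SpiderOpenedLogger:

        def __init__(self, stats):
            # everything the extension needs is wired up here
            self.stats = stats

        @classmethod
        def from_crawler(cls, crawler):
            if not crawler.settings.getbool('MYEXT_ENABLED'):
                raise NotConfigured
            ext = cls(crawler.stats)
            crawler.signals.connect(ext.spider_opened, signal=signals.spider_opened)
            return ext

        def spider_opened(self, spider):
            spider.logger.info('opened spider %s', spider.name)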
To make an extension available, add it to the :setting:`EXTENSIONS` setting in
your Scrapy settings. In :setting:`EXTENSIONS`, each extension is represented


@ -16,12 +16,12 @@ especially in a larger project with many spiders.
To define common output data format Scrapy provides the :class:`Item` class.
:class:`Item` objects are simple containers used to collect the scraped data.
They provide a `dictionary-like`_ API with a convenient syntax for declaring
their available fields.
their available fields.
Various Scrapy components use extra information provided by Items:
Various Scrapy components use extra information provided by Items:
exporters look at declared fields to figure out columns to export,
serialization can be customized using Item fields metadata, :mod:`trackref`
tracks Item instances to help find memory leaks
tracks Item instances to help find memory leaks
(see :ref:`topics-leaks-trackrefs`), etc.
.. _dictionary-like: https://docs.python.org/2/library/stdtypes.html#dict
@ -237,7 +237,7 @@ Item objects
Return a new Item optionally initialized from the given argument.
Items replicate the standard `dict API`_, including its constructor, and
Items replicate the standard `dict API`_, including its ``__init__`` method, and
also provide the following additional API members:
.. automethod:: copy


@ -71,34 +71,11 @@ on cookies.
Request serialization
---------------------
Requests must be serializable by the ``pickle`` module, in order for persistence
to work, so you should make sure that your requests are serializable.
The most common issue here is to use ``lambda`` functions on request callbacks that
can't be persisted.
So, for example, this won't work::
def some_callback(self, response):
somearg = 'test'
return scrapy.Request('http://www.example.com',
callback=lambda r: self.other_callback(r, somearg))
def other_callback(self, response, somearg):
print("the argument passed is: %s" % somearg)
But this will::
def some_callback(self, response):
somearg = 'test'
return scrapy.Request('http://www.example.com',
callback=self.other_callback, cb_kwargs={'somearg': somearg})
def other_callback(self, response, somearg):
print("the argument passed is: %s" % somearg)
For persistence to work, :class:`~scrapy.http.Request` objects must be
serializable with :mod:`pickle`, except for the ``callback`` and ``errback``
values passed to their ``__init__`` method, which must be methods of the
running :class:`~scrapy.spiders.Spider` class.
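For example, a request whose callback is a bound spider method serializes fine,
with any extra data passed through ``cb_kwargs`` instead of being captured in a
lambda (which cannot be pickled)::

    # inside a Spider subclass; assumes ``import scrapy``
    def some_callback(self, response):
        somearg = 'test'
        return scrapy.Request('http://www.example.com',
                              callback=self.other_callback,
                              cb_kwargs={'somearg': somearg})

    def other_callback(self, response, somearg):
        self.logger.info('the argument passed is: %s', somearg)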
If you wish to log the requests that couldn't be serialized, you can set the
:setting:`SCHEDULER_DEBUG` setting to ``True`` in the project's settings page.
It is ``False`` by default.
.. _pickle: https://docs.python.org/library/pickle.html


@ -26,7 +26,7 @@ Using Item Loaders to populate items
To use an Item Loader, you must first instantiate it. You can either
instantiate it with a dict-like object (e.g. Item or dict) or without one, in
which case an Item is automatically instantiated in the Item Loader constructor
which case an Item is automatically instantiated in the Item Loader ``__init__`` method
using the Item class specified in the :attr:`ItemLoader.default_item_class`
attribute.
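For instance (``Product`` standing in for any Item class, ``response`` for a
downloaded response)::

    from scrapy.loader import ItemLoader

    loader = ItemLoader(item=Product(), response=response)  # explicit item
    loader = ItemLoader(response=response)  # built from default_item_class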
@ -271,7 +271,7 @@ There are several ways to modify Item Loader context values:
loader.context['unit'] = 'cm'
2. On Item Loader instantiation (the keyword arguments of Item Loader
constructor are stored in the Item Loader context)::
``__init__`` method are stored in the Item Loader context)::
loader = ItemLoader(product, unit='cm')
@ -500,7 +500,7 @@ ItemLoader objects
.. attribute:: default_item_class
An Item class (or factory), used to instantiate items when not given in
the constructor.
the ``__init__`` method.
.. attribute:: default_input_processor
@ -515,15 +515,15 @@ ItemLoader objects
.. attribute:: default_selector_class
The class used to construct the :attr:`selector` of this
:class:`ItemLoader`, if only a response is given in the constructor.
If a selector is given in the constructor this attribute is ignored.
:class:`ItemLoader`, if only a response is given in the ``__init__`` method.
If a selector is given in the ``__init__`` method this attribute is ignored.
This attribute is sometimes overridden in subclasses.
.. attribute:: selector
The :class:`~scrapy.selector.Selector` object to extract data from.
It's either the selector given in the constructor or one created from
the response given in the constructor using the
It's either the selector given in the ``__init__`` method or one created from
the response given in the ``__init__`` method using the
:attr:`default_selector_class`. This attribute is meant to be
read-only.
@ -648,7 +648,7 @@ Here is a list of all built-in processors:
.. class:: Identity
The simplest processor, which doesn't do anything. It returns the original
values unchanged. It doesn't receive any constructor arguments, nor does it
values unchanged. It doesn't receive any ``__init__`` method arguments, nor does it
accept Loader contexts.
Example::
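    >>> from scrapy.loader.processors import Identity
    >>> proc = Identity()   # no arguments, no Loader context
    >>> proc(['one', 'two', 'three'])
    ['one', 'two', 'three']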
@ -662,7 +662,7 @@ Here is a list of all built-in processors:
Returns the first non-null/non-empty value from the values received,
so it's typically used as an output processor to single-valued fields.
It doesn't receive any constructor arguments, nor does it accept Loader contexts.
It doesn't receive any ``__init__`` method arguments, nor does it accept Loader contexts.
Example::
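    >>> from scrapy.loader.processors import TakeFirst
    >>> proc = TakeFirst()
    >>> proc(['', 'one', 'two', 'three'])   # empty values are skipped
    'one'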
@ -673,7 +673,7 @@ Here is a list of all built-in processors:
.. class:: Join(separator=u' ')
Returns the values joined with the separator given in the constructor, which
Returns the values joined with the separator given in the ``__init__`` method, which
defaults to ``u' '``. It doesn't accept Loader contexts.
When using the default separator, this processor is equivalent to the
@ -711,7 +711,7 @@ Here is a list of all built-in processors:
those which do, this processor will pass the currently active :ref:`Loader
context <topics-loaders-context>` through that parameter.
The keyword arguments passed in the constructor are used as the default
The keyword arguments passed in the ``__init__`` method are used as the default
Loader context values passed to each function call. However, the final
Loader context values passed to functions are overridden with the currently
active Loader context accessible through the :meth:`ItemLoader.context`
@ -755,12 +755,12 @@ Here is a list of all built-in processors:
['HELLO', 'THIS', 'IS', 'SCRAPY']
As with the Compose processor, functions can receive Loader contexts, and
constructor keyword arguments are used as default context values. See
``__init__`` method keyword arguments are used as default context values. See
:class:`Compose` processor for more info.
.. class:: SelectJmes(json_path)
Queries the value using the json path provided to the constructor and returns the output.
Queries the value using the json path provided to the ``__init__`` method and returns the output.
Requires jmespath (https://github.com/jmespath/jmespath.py) to run.
This processor takes only one input at a time.
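For instance, querying a single key (``foo`` is just a placeholder here)::

    >>> from scrapy.loader.processors import SelectJmes
    >>> proc = SelectJmes('foo')
    >>> proc({'foo': 'bar'})
    'bar'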


@ -255,18 +255,18 @@ scrapy.utils.log module
when running custom scripts using :class:`~scrapy.crawler.CrawlerRunner`.
In that case, its usage is not required but it's recommended.
If you plan on configuring the handlers yourself, it is still recommended that you
call this function, passing ``install_root_handler=False``. Bear in mind
there won't be any log output set by default in that case.
Another option when running custom scripts is to manually configure the logging.
To do this you can use `logging.basicConfig()`_ to set a basic root handler.
To get you started on manually configuring logging's output, you can use
`logging.basicConfig()`_ to set a basic root handler. This is an example
on how to redirect ``INFO`` or higher messages to a file::
Note that :class:`~scrapy.crawler.CrawlerProcess` automatically calls ``configure_logging``,
so it is recommended to only use `logging.basicConfig()`_ together with
:class:`~scrapy.crawler.CrawlerRunner`.
This is an example on how to redirect ``INFO`` or higher messages to a file::
import logging
from scrapy.utils.log import configure_logging
configure_logging(install_root_handler=False)
logging.basicConfig(
filename='log.txt',
format='%(levelname)s: %(message)s',


@ -137,7 +137,7 @@ Request objects
A string containing the URL of this request. Keep in mind that this
attribute contains the escaped URL, so it can differ from the URL passed in
the constructor.
the ``__init__`` method.
This attribute is read-only. To change the URL of a Request use
:meth:`replace`.
@ -400,7 +400,7 @@ fields with form data from :class:`Response` objects.
.. class:: FormRequest(url, [formdata, ...])
The :class:`FormRequest` class adds a new keyword parameter to the constructor. The
The :class:`FormRequest` class adds a new keyword parameter to the ``__init__`` method. The
remaining arguments are the same as for the :class:`Request` class and are
not documented here.
@ -473,7 +473,7 @@ fields with form data from :class:`Response` objects.
:type dont_click: boolean
The other parameters of this class method are passed directly to the
:class:`FormRequest` constructor.
:class:`FormRequest` ``__init__`` method.
.. versionadded:: 0.10.3
The ``formname`` parameter.
@ -547,7 +547,7 @@ dealing with JSON requests.
.. class:: JsonRequest(url, [... data, dumps_kwargs])
The :class:`JsonRequest` class adds two new keyword parameters to the constructor. The
The :class:`JsonRequest` class adds two new keyword parameters to the ``__init__`` method. The
remaining arguments are the same as for the :class:`Request` class and are
not documented here.
@ -556,7 +556,7 @@ dealing with JSON requests.
:param data: is any JSON serializable object that needs to be JSON encoded and assigned to body.
if :attr:`Request.body` argument is provided this parameter will be ignored.
if :attr:`Request.body` argument is not provided and data argument is provided :attr:`Request.method` will be
if :attr:`Request.body` argument is not provided and data argument is provided :attr:`Request.method` will be
set to ``'POST'`` automatically.
:type data: JSON serializable object
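A quick usage sketch (URL and payload are placeholders)::

    from scrapy.http import JsonRequest

    # ``data`` is JSON-encoded into the body; the method defaults to POST.
    req = JsonRequest(url='http://www.example.com/post/action',
                      data={'name1': 'value1', 'name2': 'value2'})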
@ -721,7 +721,7 @@ TextResponse objects
:class:`Response` class, which is meant to be used only for binary data,
such as images, sounds or any media file.
:class:`TextResponse` objects support a new constructor argument, in
:class:`TextResponse` objects support a new ``__init__`` method argument, in
addition to the base :class:`Response` objects. The remaining functionality
is the same as for the :class:`Response` class and is not documented here.
@ -755,7 +755,7 @@ TextResponse objects
A string with the encoding of this response. The encoding is resolved by
trying the following mechanisms, in order:
1. the encoding passed in the constructor ``encoding`` argument
1. the encoding passed in the ``__init__`` method ``encoding`` argument
2. the encoding declared in the Content-Type HTTP header. If this
encoding is not valid (ie. unknown), it is ignored and the next


@ -2,7 +2,24 @@
usefixtures = chdir
python_files=test_*.py __init__.py
python_classes=
addopts = --doctest-modules --assert=plain
addopts =
--assert=plain
--doctest-modules
--ignore=docs/_ext
--ignore=docs/conf.py
--ignore=docs/news.rst
--ignore=docs/topics/commands.rst
--ignore=docs/topics/debug.rst
--ignore=docs/topics/developer-tools.rst
--ignore=docs/topics/dynamic-content.rst
--ignore=docs/topics/items.rst
--ignore=docs/topics/leaks.rst
--ignore=docs/topics/loaders.rst
--ignore=docs/topics/selectors.rst
--ignore=docs/topics/shell.rst
--ignore=docs/topics/stats.rst
--ignore=docs/topics/telnetconsole.rst
--ignore=docs/utils
twisted = 1
flake8-ignore =
# extras
@ -30,7 +47,7 @@ flake8-ignore =
scrapy/core/engine.py E261 E501 E128 E127 E306 E502
scrapy/core/scheduler.py E501
scrapy/core/scraper.py E501 E306 E261 E128 W504
scrapy/core/spidermw.py E501 E731 E502 E231 E126 E226
scrapy/core/spidermw.py E501 E731 E502 E126 E226
scrapy/core/downloader/__init__.py F401 E501
scrapy/core/downloader/contextfactory.py E501 E128 E126
scrapy/core/downloader/middleware.py E501 E502
@ -175,14 +192,14 @@ flake8-ignore =
tests/test_crawl.py E501 E741 E265
tests/test_crawler.py F841 E306 E501
tests/test_dependencies.py E302 F841 E501 E305
tests/test_downloader_handlers.py E124 E127 E128 E225 E261 E265 F401 E501 E502 E701 E711 E126 E226 E123
tests/test_downloader_handlers.py E124 E127 E128 E225 E261 E265 F401 E501 E502 E701 E126 E226 E123
tests/test_downloadermiddleware.py E501
tests/test_downloadermiddleware_ajaxcrawlable.py E302 E501
tests/test_downloadermiddleware_cookies.py E731 E741 E501 E128 E303 E265 E126
tests/test_downloadermiddleware_decompression.py E127
tests/test_downloadermiddleware_defaultheaders.py E501
tests/test_downloadermiddleware_downloadtimeout.py E501
tests/test_downloadermiddleware_httpcache.py E713 E501 E302 E305 F401
tests/test_downloadermiddleware_httpcache.py E501 E302 E305 F401
tests/test_downloadermiddleware_httpcompression.py E501 F401 E251 E126 E123
tests/test_downloadermiddleware_httpproxy.py F401 E501 E128
tests/test_downloadermiddleware_redirect.py E501 E303 E128 E306 E127 E305
@ -196,13 +213,13 @@ flake8-ignore =
tests/test_feedexport.py E501 F401 F841 E241
tests/test_http_cookies.py E501
tests/test_http_headers.py E302 E501
tests/test_http_request.py F401 E402 E501 E231 E261 E127 E128 W293 E502 E128 E502 E126 E123
tests/test_http_request.py F401 E402 E501 E261 E127 E128 W293 E502 E128 E502 E126 E123
tests/test_http_response.py E501 E301 E502 E128 E265
tests/test_item.py E701 E128 E231 F841 E306
tests/test_item.py E701 E128 F841 E306
tests/test_link.py E501
tests/test_linkextractors.py E501 E128 E231 E124
tests/test_linkextractors.py E501 E128 E124
tests/test_loader.py E302 E501 E731 E303 E741 E128 E117 E241
tests/test_logformatter.py E128 E501 E231 E122 E302
tests/test_logformatter.py E128 E501 E122 E302
tests/test_mail.py E302 E128 E501 E305
tests/test_middleware.py E302 E501 E128
tests/test_pipeline_crawl.py E131 E501 E128 E126
@ -221,8 +238,8 @@ flake8-ignore =
tests/test_spidermiddleware_output_chain.py F401 E501 E302 W293 E226
tests/test_spidermiddleware_referer.py F401 E501 E302 F841 E125 E201 E261 E124 E501 E241 E121
tests/test_squeues.py E501 E302 E701 E741
tests/test_utils_conf.py E501 E231 E303 E128
tests/test_utils_console.py E302 E231
tests/test_utils_conf.py E501 E303 E128
tests/test_utils_console.py E302
tests/test_utils_curl.py E501
tests/test_utils_datatypes.py E402 E501 E305
tests/test_utils_defer.py E306 E261 E501 E302 F841 E226
@ -251,4 +268,4 @@ flake8-ignore =
tests/test_spiderloader/test_spiders/spider2.py E302
tests/test_spiderloader/test_spiders/spider3.py E302
tests/test_spiderloader/test_spiders/nested/spider4.py E302
tests/test_utils_misc/__init__.py E501 E231
tests/test_utils_misc/__init__.py E501


@ -231,9 +231,9 @@ class Scraper(object):
signal=signals.item_dropped, item=item, response=response,
spider=spider, exception=output.value)
else:
logger.error('Error processing %(item)s', {'item': item},
exc_info=failure_to_exc_info(output),
extra={'spider': spider})
logkws = self.logformatter.error(item, ex, response, spider)
logger.log(*logformatter_adapter(logkws), extra={'spider': spider},
exc_info=failure_to_exc_info(output))
return self.signals.send_catch_log_deferred(
signal=signals.item_error, item=item, response=response,
spider=spider, failure=output)


@ -35,7 +35,7 @@ class SpiderMiddlewareManager(MiddlewareManager):
self.methods['process_spider_exception'].appendleft(getattr(mw, 'process_spider_exception', None))
def scrape_response(self, scrape_func, response, request, spider):
fname = lambda f:'%s.%s' % (
fname = lambda f: '%s.%s' % (
f.__self__.__class__.__name__,
f.__func__.__name__)


@ -4,9 +4,9 @@ and extract the potentially compressed responses that may arrive.
import bz2
import gzip
import zipfile
import tarfile
import logging
import tarfile
import zipfile
from io import BytesIO
from tempfile import mktemp


@ -29,7 +29,7 @@ class BaseItemExporter(object):
def _configure(self, options, dont_fail=False):
"""Configure the exporter by poping options from the ``options`` dict.
If dont_fail is set, it won't raise an exception on unexpected options
(useful for using with keyword arguments in subclasses constructors)
(useful for using with keyword arguments in subclasses ``__init__`` methods)
"""
self.encoding = options.pop('encoding', None)
self.fields_to_export = options.pop('fields_to_export', None)


@ -198,9 +198,9 @@ class FeedExporter(object):
def __init__(self, settings):
self.settings = settings
self.urifmt = settings['FEED_URI']
if not self.urifmt:
if not settings['FEED_URI']:
raise NotConfigured
self.urifmt = str(settings['FEED_URI'])
self.format = settings['FEED_FORMAT'].lower()
self.export_encoding = settings['FEED_EXPORT_ENCODING']
self.storages = self._load_components('FEED_STORAGES')


@ -65,7 +65,7 @@ class Request(object_ref):
s = safe_url_string(url, self.encoding)
self._url = escape_ajax(s)
if ':' not in self._url:
if ('://' not in self._url) and (not self._url.startswith('data:')):
raise ValueError('Missing scheme in request url: %s' % self._url)
url = property(_get_url, obsolete_setter(_set_url, 'url'))


@ -19,23 +19,26 @@ from scrapy.utils.url import (
# common file extensions that are not followed if they occur in links
IGNORED_EXTENSIONS = [
# archives
'7z', '7zip', 'bz2', 'rar', 'tar', 'tar.gz', 'xz', 'zip',
# images
'mng', 'pct', 'bmp', 'gif', 'jpg', 'jpeg', 'png', 'pst', 'psp', 'tif',
'tiff', 'ai', 'drw', 'dxf', 'eps', 'ps', 'svg',
'tiff', 'ai', 'drw', 'dxf', 'eps', 'ps', 'svg', 'cdr', 'ico',
# audio
'mp3', 'wma', 'ogg', 'wav', 'ra', 'aac', 'mid', 'au', 'aiff',
# video
'3gp', 'asf', 'asx', 'avi', 'mov', 'mp4', 'mpg', 'qt', 'rm', 'swf', 'wmv',
'm4a', 'm4v', 'flv',
'm4a', 'm4v', 'flv', 'webm',
# office suites
'xls', 'xlsx', 'ppt', 'pptx', 'pps', 'doc', 'docx', 'odt', 'ods', 'odg',
'odp',
# other
'css', 'pdf', 'exe', 'bin', 'rss', 'zip', 'rar',
'css', 'pdf', 'exe', 'bin', 'rss', 'dmg', 'iso', 'apk'
]


@ -8,6 +8,7 @@ from scrapy.utils.request import referer_str
SCRAPEDMSG = u"Scraped from %(src)s" + os.linesep + "%(item)s"
DROPPEDMSG = u"Dropped: %(exception)s" + os.linesep + "%(item)s"
CRAWLEDMSG = u"Crawled (%(status)s) %(request)s%(request_flags)s (referer: %(referer)s)%(response_flags)s"
ERRORMSG = u"'Error processing %(item)s'"
class LogFormatter(object):
@ -92,6 +93,16 @@ class LogFormatter(object):
}
}
def error(self, item, exception, response, spider):
"""Logs a message when an item causes an error while it is passing through the item pipeline."""
return {
'level': logging.ERROR,
'msg': ERRORMSG,
'args': {
'item': item,
}
}
@classmethod
def from_crawler(cls, crawler):
return cls()


@ -86,9 +86,6 @@ class _SlotPriorityQueues(object):
def __len__(self):
return sum(len(x) for x in self.pqueues.values()) if self.pqueues else 0
def __contains__(self, slot):
return slot in self.pqueues
class ScrapyPriorityQueue(PriorityQueue):
"""


@ -5,9 +5,10 @@ Python Standard Library.
This module must not depend on any module outside the Standard Library.
"""
import copy
import collections
import copy
import warnings
from collections.abc import Mapping
from scrapy.exceptions import ScrapyDeprecationWarning
@ -223,7 +224,7 @@ class CaselessDict(dict):
return dict.setdefault(self, self.normkey(key), self.normvalue(def_val))
def update(self, seq):
seq = seq.items() if isinstance(seq, collections.abc.Mapping) else seq
seq = seq.items() if isinstance(seq, Mapping) else seq
iseq = ((self.normkey(k), self.normvalue(v)) for k, v in seq)
super(CaselessDict, self).update(iseq)
@ -247,8 +248,9 @@ class LocalCache(collections.OrderedDict):
self.limit = limit
def __setitem__(self, key, value):
while len(self) >= self.limit:
self.popitem(last=False)
if self.limit:
while len(self) >= self.limit:
self.popitem(last=False)
super(LocalCache, self).__setitem__(key, value)


@ -296,7 +296,7 @@ class WeakKeyCache(object):
def stringify_dict(dct_or_tuples, encoding='utf-8', keys_only=True):
"""Return a (new) dict with unicode keys (and values when "keys_only" is
False) of the given dict converted to strings. ``dct_or_tuples`` can be a
dict or a list of tuples, like any dict constructor supports.
dict or a list of tuples, like any dict ``__init__`` method supports.
"""
d = {}
for k, v in dict(dct_or_tuples).items():


@ -3,10 +3,10 @@ from twisted.internet import reactor, error
def listen_tcp(portrange, host, factory):
"""Like reactor.listenTCP but tries different ports in a range."""
assert len(portrange) <= 2, "invalid portrange: %s" % portrange
if not hasattr(portrange, '__iter__'):
return reactor.listenTCP(portrange, factory, interface=host)
if not portrange:
return reactor.listenTCP(0, factory, interface=host)
if not hasattr(portrange, '__iter__'):
return reactor.listenTCP(portrange, factory, interface=host)
if len(portrange) == 1:
return reactor.listenTCP(portrange[0], factory, interface=host)
for x in range(portrange[0], portrange[1]+1):


@ -38,7 +38,7 @@ singletons members of that object, as explained below:
``scrapy.core.manager.ExecutionManager``) - instantiated with a ``Settings``
object
- **crawler.settings**: ``scrapy.conf.Settings`` instance (passed in the constructor)
- **crawler.settings**: ``scrapy.conf.Settings`` instance (passed in the ``__init__`` method)
- **crawler.extensions**: ``scrapy.extension.ExtensionManager`` instance
- **crawler.engine**: ``scrapy.core.engine.ExecutionEngine`` instance
- ``crawler.engine.scheduler``
@ -55,7 +55,7 @@ singletons members of that object, as explained below:
``STATS_CLASS`` setting)
- **crawler.log**: Logger class with methods replacing the current
``scrapy.log`` functions. Logging would be started (if enabled) on
``Crawler`` constructor, so no log starting functions are required.
``Crawler`` instantiation, so no log starting functions are required.
- ``crawler.log.msg``
- **crawler.signals**: signal handling
@ -69,12 +69,12 @@ Required code changes after singletons removal
==============================================
All components (extensions, middlewares, etc) will receive this ``Crawler``
object in their constructors, and this will be the only mechanism for accessing
object in their ``__init__`` methods, and this will be the only mechanism for accessing
any other components (as opposed to importing each singleton from their
respective module). This will also serve to stabilize the core API, something
which we haven't documented so far (partly because of this).
So, for a typical middleware constructor code, instead of this:
So, for a typical middleware ``__init__`` method code, instead of this:
::
@ -125,13 +125,13 @@ Open issues to resolve
- Should we pass ``Settings`` object to ``ScrapyCommand.add_options()``?
- How should spiders access settings?
- Option 1. Pass ``Crawler`` object to spider constructors too
- Option 1. Pass ``Crawler`` object to spider ``__init__`` methods too
- pro: one way to access all components (settings and signals being the
most relevant to spiders)
- con?: spider code can access (and control) any crawler component -
since we don't want to support spiders messing with the crawler (write
an extension or spider middleware if you need that)
- Option 2. Pass ``Settings`` object to spider constructors, which would
- Option 2. Pass ``Settings`` object to spider ``__init__`` methods, which would
then be accessed through ``self.settings``, like logging which is accessed
through ``self.log``


@ -6,6 +6,7 @@ pytest
pytest-cov
pytest-twisted
pytest-xdist
sybil
testfixtures
# optional for shell wrapper tests


@ -614,7 +614,7 @@ class Http11MockServerTestCase(unittest.TestCase):
crawler = get_crawler(SingleRequestSpider)
yield crawler.crawl(seed=Request(url=self.mockserver.url('')))
failure = crawler.spider.meta.get('failure')
self.assertTrue(failure == None)
self.assertTrue(failure is None)
reason = crawler.spider.meta['close_reason']
self.assertTrue(reason, 'finished')
@ -636,7 +636,7 @@ class Http11MockServerTestCase(unittest.TestCase):
yield crawler.crawl(seed=request)
# download_maxsize = 50 is enough for the gzipped response
failure = crawler.spider.meta.get('failure')
self.assertTrue(failure == None)
self.assertTrue(failure is None)
reason = crawler.spider.meta['close_reason']
self.assertTrue(reason, 'finished')


@ -84,8 +84,8 @@ class _BaseTest(unittest.TestCase):
def assertEqualRequestButWithCacheValidators(self, request1, request2):
self.assertEqual(request1.url, request2.url)
assert not b'If-None-Match' in request1.headers
assert not b'If-Modified-Since' in request1.headers
assert b'If-None-Match' not in request1.headers
assert b'If-Modified-Since' not in request1.headers
assert any(h in request2.headers for h in (b'If-None-Match', b'If-Modified-Since'))
self.assertEqual(request1.body, request2.body)


@ -6,6 +6,7 @@ import tempfile
import shutil
import string
from io import BytesIO
from pathlib import Path
from unittest import mock
from urllib.parse import urljoin, urlparse, quote
from urllib.request import pathname2url
@ -403,6 +404,7 @@ class FeedExportTest(unittest.TestCase):
defaults = {
'FEED_URI': res_uri,
'FEED_FORMAT': 'csv',
'FEED_PATH': res_path
}
defaults.update(settings or {})
try:
@ -411,7 +413,7 @@ class FeedExportTest(unittest.TestCase):
spider_cls.start_urls = [s.url('/')]
yield runner.crawl(spider_cls)
with open(res_path, 'rb') as f:
with open(str(defaults['FEED_PATH']), 'rb') as f:
content = f.read()
finally:
@ -841,3 +843,17 @@ class FeedExportTest(unittest.TestCase):
yield self.exported_data({}, settings)
self.assertTrue(FromCrawlerCsvItemExporter.init_with_crawler)
self.assertTrue(FromCrawlerFileFeedStorage.init_with_crawler)
@defer.inlineCallbacks
def test_pathlib_uri(self):
tmpdir = tempfile.mkdtemp()
feed_uri = Path(tmpdir) / 'res'
settings = {
'FEED_FORMAT': 'csv',
'FEED_STORE_EMPTY': True,
'FEED_URI': feed_uri,
'FEED_PATH': feed_uri
}
data = yield self.exported_no_data(settings)
self.assertEqual(data, b'')
shutil.rmtree(tmpdir, ignore_errors=True)


@ -3,7 +3,7 @@ import cgi
import unittest
import re
import json
import xmlrpc.client as xmlrpclib
import xmlrpc.client
import warnings
from unittest import mock
from urllib.parse import parse_qs, unquote_to_bytes, urlparse
@ -20,7 +20,7 @@ class RequestTest(unittest.TestCase):
default_meta = {}
def test_init(self):
# Request requires url in the constructor
# Request requires url in the __init__ method
self.assertRaises(Exception, self.request_class)
# url argument must be basestring
@ -47,11 +47,13 @@ class RequestTest(unittest.TestCase):
def test_url_no_scheme(self):
self.assertRaises(ValueError, self.request_class, 'foo')
self.assertRaises(ValueError, self.request_class, '/foo/')
self.assertRaises(ValueError, self.request_class, '/foo:bar')
def test_headers(self):
# Different ways of setting headers attribute
url = 'http://www.scrapy.org'
headers = {b'Accept':'gzip', b'Custom-Header':'nothing to tell you'}
headers = {b'Accept': 'gzip', b'Custom-Header': 'nothing to tell you'}
r = self.request_class(url=url, headers=headers)
p = self.request_class(url=url, headers=r.headers)
@ -495,7 +497,7 @@ class FormRequestTest(RequestTest):
formdata=(('foo', 'bar'), ('foo', 'baz')))
self.assertEqual(urlparse(req.url).hostname, 'www.example.com')
self.assertEqual(urlparse(req.url).query, 'foo=bar&foo=baz')
def test_from_response_override_duplicate_form_key(self):
response = _buildresponse(
"""<form action="get.php" method="POST">
@ -652,7 +654,7 @@ class FormRequestTest(RequestTest):
req = self.request_class.from_response(response, dont_click=True)
fs = _qs(req)
self.assertEqual(fs, {b'i1': [b'i1v'], b'i2': [b'i2v']})
def test_from_response_clickdata_does_not_ignore_image(self):
response = _buildresponse(
"""<form>
@ -811,7 +813,7 @@ class FormRequestTest(RequestTest):
<input type="hidden" name="one" value="1">
<input type="hidden" name="two" value="2">
</form>""")
r1 = self.request_class.from_response(response, formdata={'two':'3'})
r1 = self.request_class.from_response(response, formdata={'two': '3'})
self.assertEqual(r1.method, 'POST')
self.assertEqual(r1.headers['Content-type'], b'application/x-www-form-urlencoded')
fs = _qs(r1)
@ -1218,7 +1220,7 @@ class XmlRpcRequestTest(RequestTest):
r = self.request_class('http://scrapytest.org/rpc2', **kwargs)
self.assertEqual(r.headers[b'Content-Type'], b'text/xml')
self.assertEqual(r.body,
to_bytes(xmlrpclib.dumps(**kwargs),
to_bytes(xmlrpc.client.dumps(**kwargs),
encoding=kwargs.get('encoding', 'utf-8')))
self.assertEqual(r.method, 'POST')
self.assertEqual(r.encoding, kwargs.get('encoding', 'utf-8'))


@ -532,7 +532,7 @@ class XmlResponseTest(TextResponseTest):
r2 = self.response_class("http://www.example.com", body=body)
self._assert_response_values(r2, 'iso-8859-1', body)
# make sure replace() preserves the explicit encoding passed in the constructor
# make sure replace() preserves the explicit encoding passed in the __init__ method
body = b"""<?xml version="1.0" encoding="iso-8859-1"?><xml></xml>"""
r3 = self.response_class("http://www.example.com", body=body, encoding='utf-8')
body2 = b"New body"


@ -239,7 +239,7 @@ class ItemTest(unittest.TestCase):
def test_copy(self):
class TestItem(Item):
name = Field()
item = TestItem({'name':'lower'})
item = TestItem({'name': 'lower'})
copied_item = item.copy()
self.assertNotEqual(id(item), id(copied_item))
copied_item['name'] = copied_item['name'].upper()


@ -43,6 +43,6 @@ class LinkTest(unittest.TestCase):
l2 = eval(repr(l1))
self._assert_same_links(l1, l2)
def test_non_str_url_py2(self):
def test_bytes_url(self):
with self.assertRaises(TypeError):
Link(b"http://www.example.com/\xc2\xa3")


@ -322,7 +322,7 @@ class Base:
Link(url=page4_url, text=u'href with whitespaces'),
])
lx = self.extractor_cls(attrs=("href","src"), tags=("a","area","img"), deny_extensions=())
lx = self.extractor_cls(attrs=("href", "src"), tags=("a", "area", "img"), deny_extensions=())
self.assertEqual(lx.extract_links(self.response), [
Link(url='http://example.com/sample1.html', text=u''),
Link(url='http://example.com/sample2.html', text=u'sample 2'),
@ -360,7 +360,7 @@ class Base:
Link(url='http://example.com/sample2.html', text=u'sample 2'),
])
lx = self.extractor_cls(tags=("a","img"), attrs=("href", "src"), deny_extensions=())
lx = self.extractor_cls(tags=("a", "img"), attrs=("href", "src"), deny_extensions=())
self.assertEqual(lx.extract_links(response), [
Link(url='http://example.com/sample2.html', text=u'sample 2'),
Link(url='http://example.com/sample2.jpg', text=u''),

View File

@ -725,11 +725,11 @@ class SelectortemLoaderTest(unittest.TestCase):
</html>
""")
def test_constructor(self):
def test_init_method(self):
l = TestItemLoader()
self.assertEqual(l.selector, None)
def test_constructor_errors(self):
def test_init_method_errors(self):
l = TestItemLoader()
self.assertRaises(RuntimeError, l.add_xpath, 'url', '//a/@href')
self.assertRaises(RuntimeError, l.replace_xpath, 'url', '//a/@href')
@ -738,7 +738,7 @@ class SelectortemLoaderTest(unittest.TestCase):
self.assertRaises(RuntimeError, l.replace_css, 'name', '#name::text')
self.assertRaises(RuntimeError, l.get_css, '#name::text')
def test_constructor_with_selector(self):
def test_init_method_with_selector(self):
sel = Selector(text=u"<html><body><div>marta</div></body></html>")
l = TestItemLoader(selector=sel)
self.assertIs(l.selector, sel)
@ -746,7 +746,7 @@ class SelectortemLoaderTest(unittest.TestCase):
l.add_xpath('name', '//div/text()')
self.assertEqual(l.get_output_value('name'), [u'Marta'])
def test_constructor_with_selector_css(self):
def test_init_method_with_selector_css(self):
sel = Selector(text=u"<html><body><div>marta</div></body></html>")
l = TestItemLoader(selector=sel)
self.assertIs(l.selector, sel)
@ -754,14 +754,14 @@ class SelectortemLoaderTest(unittest.TestCase):
l.add_css('name', 'div::text')
self.assertEqual(l.get_output_value('name'), [u'Marta'])
def test_constructor_with_response(self):
def test_init_method_with_response(self):
l = TestItemLoader(response=self.response)
self.assertTrue(l.selector)
l.add_xpath('name', '//div/text()')
self.assertEqual(l.get_output_value('name'), [u'Marta'])
def test_constructor_with_response_css(self):
def test_init_method_with_response_css(self):
l = TestItemLoader(response=self.response)
self.assertTrue(l.selector)
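As a self-contained sketch of the same loader API (item class and markup invented for illustration):

from scrapy.http import HtmlResponse
from scrapy.item import Field, Item
from scrapy.loader import ItemLoader

class Person(Item):
    name = Field()

response = HtmlResponse(url='http://example.com',
                        body=b'<html><body><div>marta</div></body></html>')
loader = ItemLoader(item=Person(), response=response)  # or selector=Selector(...)
loader.add_css('name', 'div::text')
loader.add_xpath('name', '//div/text()')
person = loader.load_item()
# person['name'] == ['marta', 'marta'] with the default Identity processors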

View File

@ -22,13 +22,13 @@ class CustomItem(Item):
return "name: %s" % self['name']
class LoggingContribTest(unittest.TestCase):
class LogFormatterTestCase(unittest.TestCase):
def setUp(self):
self.formatter = LogFormatter()
self.spider = Spider('default')
def test_crawled(self):
def test_crawled_with_referer(self):
req = Request("http://www.example.com")
res = Response("http://www.example.com")
logkws = self.formatter.crawled(req, res, self.spider)
@ -36,6 +36,7 @@ class LoggingContribTest(unittest.TestCase):
self.assertEqual(logline,
"Crawled (200) <GET http://www.example.com> (referer: None)")
def test_crawled_without_referer(self):
req = Request("http://www.example.com", headers={'referer': 'http://example.com'})
res = Response("http://www.example.com", flags=['cached'])
logkws = self.formatter.crawled(req, res, self.spider)
@ -44,7 +45,7 @@ class LoggingContribTest(unittest.TestCase):
"Crawled (200) <GET http://www.example.com> (referer: http://example.com) ['cached']")
def test_flags_in_request(self):
req = Request("http://www.example.com", flags=['test','flag'])
req = Request("http://www.example.com", flags=['test', 'flag'])
res = Response("http://www.example.com")
logkws = self.formatter.crawled(req, res, self.spider)
logline = logkws['msg'] % logkws['args']
@ -61,6 +62,16 @@ class LoggingContribTest(unittest.TestCase):
assert all(isinstance(x, str) for x in lines)
self.assertEqual(lines, [u"Dropped: \u2018", '{}'])
def test_error(self):
# In practice, the complete traceback is shown by passing the
# 'exc_info' argument to the logging function
item = {'key': 'value'}
exception = Exception()
response = Response("http://www.example.com")
logkws = self.formatter.error(item, exception, response, self.spider)
logline = logkws['msg'] % logkws['args']
self.assertEqual(logline, u"'Error processing {'key': 'value'}'")
def test_scraped(self):
item = CustomItem()
item['name'] = u'\xa3'
@ -74,26 +85,46 @@ class LoggingContribTest(unittest.TestCase):
class LogFormatterSubclass(LogFormatter):
def crawled(self, request, response, spider):
kwargs = super(LogFormatterSubclass, self).crawled(
request, response, spider)
kwargs = super(LogFormatterSubclass, self).crawled(request, response, spider)
CRAWLEDMSG = (
u"Crawled (%(status)s) %(request)s (referer: "
u"%(referer)s)%(flags)s"
u"Crawled (%(status)s) %(request)s (referer: %(referer)s) %(flags)s"
)
log_args = kwargs['args']
log_args['flags'] = str(request.flags)
return {
'level': kwargs['level'],
'msg': CRAWLEDMSG,
'args': kwargs['args']
'args': log_args,
}
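A formatter subclass such as this one would typically be enabled through the LOG_FORMATTER setting; the module path below is made up:

# settings.py (hypothetical project layout)
LOG_FORMATTER = 'myproject.logformatters.LogFormatterSubclass'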
class LogformatterSubclassTest(LoggingContribTest):
class LogformatterSubclassTest(LogFormatterTestCase):
def setUp(self):
self.formatter = LogFormatterSubclass()
self.spider = Spider('default')
def test_crawled_with_referer(self):
req = Request("http://www.example.com")
res = Response("http://www.example.com")
logkws = self.formatter.crawled(req, res, self.spider)
logline = logkws['msg'] % logkws['args']
self.assertEqual(logline,
"Crawled (200) <GET http://www.example.com> (referer: None) []")
def test_crawled_without_referer(self):
req = Request("http://www.example.com", headers={'referer': 'http://example.com'}, flags=['cached'])
res = Response("http://www.example.com")
logkws = self.formatter.crawled(req, res, self.spider)
logline = logkws['msg'] % logkws['args']
self.assertEqual(logline,
"Crawled (200) <GET http://www.example.com> (referer: http://example.com) ['cached']")
def test_flags_in_request(self):
pass
req = Request("http://www.example.com", flags=['test', 'flag'])
res = Response("http://www.example.com")
logkws = self.formatter.crawled(req, res, self.spider)
logline = logkws['msg'] % logkws['args']
self.assertEqual(logline, "Crawled (200) <GET http://www.example.com> (referer: None) ['test', 'flag']")
class SkipMessagesLogFormatter(LogFormatter):

View File

@ -41,12 +41,12 @@ class SpiderTest(unittest.TestCase):
self.assertEqual(list(start_requests), [])
def test_spider_args(self):
"""Constructor arguments are assigned to spider attributes"""
"""``__init__`` method arguments are assigned to spider attributes"""
spider = self.spider_class('example.com', foo='bar')
self.assertEqual(spider.foo, 'bar')
def test_spider_without_name(self):
"""Constructor arguments are assigned to spider attributes"""
"""``__init__`` method arguments are assigned to spider attributes"""
self.assertRaises(ValueError, self.spider_class)
self.assertRaises(ValueError, self.spider_class, somearg='foo')

View File

@ -79,7 +79,7 @@ class BuildComponentListTest(unittest.TestCase):
self.assertRaises(ValueError, build_component_list, {}, d, convert=lambda x: x)
d = {'one': {'a': 'a', 'b': 2}}
self.assertRaises(ValueError, build_component_list, {}, d, convert=lambda x: x)
d = {'one': 'lorem ipsum',}
d = {'one': 'lorem ipsum'}
self.assertRaises(ValueError, build_component_list, {}, d, convert=lambda x: x)
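For reference, a sketch of the call these assertions exercise — values act as numeric priorities (or None), and non-numeric values are rejected:

from scrapy.utils.conf import build_component_list

build_component_list({}, {'one': 100, 'two': 50}, convert=lambda x: x)
# -> ['two', 'one'], ordered by priority
build_component_list({}, {'one': 'lorem ipsum'}, convert=lambda x: x)
# -> raises ValueError, as asserted above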

View File

@ -21,7 +21,7 @@ class UtilsConsoleTestCase(unittest.TestCase):
shell = get_shell_embed_func(['invalid'])
self.assertEqual(shell, None)
shell = get_shell_embed_func(['invalid','python'])
shell = get_shell_embed_func(['invalid', 'python'])
self.assertTrue(callable(shell))
self.assertEqual(shell.__name__, '_embed_standard_shell')

View File

@ -1,8 +1,8 @@
from collections.abc import Mapping, MutableMapping
import copy
import unittest
from collections.abc import Mapping, MutableMapping
from scrapy.utils.datatypes import CaselessDict, SequenceExclude
from scrapy.utils.datatypes import CaselessDict, LocalCache, SequenceExclude
__doctests__ = ['scrapy.utils.datatypes']
@ -229,5 +229,31 @@ class SequenceExcludeTest(unittest.TestCase):
for v in [-3, "test", 1.1]:
self.assertNotIn(v, d)
class LocalCacheTest(unittest.TestCase):
def test_cache_with_limit(self):
cache = LocalCache(limit=2)
cache['a'] = 1
cache['b'] = 2
cache['c'] = 3
self.assertEqual(len(cache), 2)
self.assertNotIn('a', cache)
self.assertIn('b', cache)
self.assertIn('c', cache)
self.assertEqual(cache['b'], 2)
self.assertEqual(cache['c'], 3)
def test_cache_without_limit(self):
maximum = 10**4
cache = LocalCache()
for x in range(maximum):
cache[str(x)] = x
self.assertEqual(len(cache), maximum)
for x in range(maximum):
self.assertIn(str(x), cache)
self.assertEqual(cache[str(x)], x)
if __name__ == "__main__":
unittest.main()
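The behaviour pinned down by these tests amounts to an insertion-ordered mapping that evicts its oldest entries once a limit is set. A minimal sketch of that idea (not the actual scrapy.utils.datatypes implementation) could look like:

from collections import OrderedDict

class BoundedCache(OrderedDict):
    """Illustrative only: drop the oldest entries once `limit` is reached."""

    def __init__(self, limit=None):
        super().__init__()
        self.limit = limit

    def __setitem__(self, key, value):
        if self.limit:
            while len(self) >= self.limit:
                self.popitem(last=False)  # evict the oldest insertion
        super().__setitem__(key, value)

cache = BoundedCache(limit=2)
cache['a'], cache['b'], cache['c'] = 1, 2, 3
assert list(cache) == ['b', 'c']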

View File

@ -74,7 +74,7 @@ class UtilsMiscTestCase(unittest.TestCase):
self.assertEqual(list(arg_to_iter(100)), [100])
self.assertEqual(list(arg_to_iter(l for l in 'abc')), ['a', 'b', 'c'])
self.assertEqual(list(arg_to_iter([1, 2, 3])), [1, 2, 3])
self.assertEqual(list(arg_to_iter({'a':1})), [{'a': 1}])
self.assertEqual(list(arg_to_iter({'a': 1})), [{'a': 1}])
self.assertEqual(list(arg_to_iter(TestItem(name="john"))), [TestItem(name="john")])
def test_create_instance(self):

View File

@ -205,10 +205,10 @@ class UtilsPythonTestCase(unittest.TestCase):
self.assertEqual(get_func_args(operator.itemgetter(2)), [])
else:
self.assertEqual(
get_func_args(str.split, True), ['sep', 'maxsplit'])
self.assertEqual(get_func_args(" ".join, True), ['list'])
get_func_args(str.split, stripself=True), ['sep', 'maxsplit'])
self.assertEqual(get_func_args(" ".join, stripself=True), ['list'])
self.assertEqual(
get_func_args(operator.itemgetter(2), True), ['obj'])
get_func_args(operator.itemgetter(2), stripself=True), ['obj'])
def test_without_none_values(self):
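The stripself keyword spells out what the bare positional True used to mean; on a plain method object (hypothetical class below) the effect would be:

from scrapy.utils.python import get_func_args

class Greeter:
    def greet(self, name, punctuation='!'):
        return 'hello ' + name + punctuation

get_func_args(Greeter.greet)                  # ['self', 'name', 'punctuation']
get_func_args(Greeter.greet, stripself=True)  # ['name', 'punctuation']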

13
tox.ini
View File

@ -21,7 +21,7 @@ passenv =
GCS_TEST_FILE_URI
GCS_PROJECT_ID
commands =
py.test --cov=scrapy --cov-report= {posargs:scrapy tests}
py.test --cov=scrapy --cov-report= {posargs:--durations=10 docs scrapy tests}
[testenv:py35]
basepython = python3.5
@ -60,7 +60,14 @@ basepython = python3.8
[testenv:pypy3]
basepython = pypy3
commands =
py.test {posargs:scrapy tests}
py.test {posargs:--durations=10 docs scrapy tests}
[testenv:security]
basepython = python3.8
deps =
bandit
commands =
bandit -r -c .bandit.yml {posargs:scrapy}
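With this environment defined, the same Bandit scan can presumably be reproduced locally by running tox -e security from the repository root.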
[testenv:flake8]
basepython = python3.8
@ -68,7 +75,7 @@ deps =
{[testenv]deps}
pytest-flake8
commands =
py.test --flake8 {posargs:scrapy tests}
py.test --flake8 {posargs:docs scrapy tests}
[docs]
changedir = docs