mirror of
https://github.com/scrapy/scrapy.git
synced 2025-02-22 06:52:53 +00:00
Merge remote-tracking branch 'upstream/master' into remove-six-code
This commit is contained in:
commit
05785c1c17
16
.bandit.yml
Normal file
16
.bandit.yml
Normal file
@ -0,0 +1,16 @@
|
||||
skips:
|
||||
- B101
|
||||
- B105
|
||||
- B303
|
||||
- B306
|
||||
- B307
|
||||
- B311
|
||||
- B320
|
||||
- B321
|
||||
- B402
|
||||
- B404
|
||||
- B406
|
||||
- B410
|
||||
- B503
|
||||
- B603
|
||||
- B605
|
@ -7,6 +7,8 @@ branches:
|
||||
- /^\d\.\d+\.\d+(rc\d+|\.dev\d+)?$/
|
||||
matrix:
|
||||
include:
|
||||
- env: TOXENV=security
|
||||
python: 3.8
|
||||
- env: TOXENV=flake8
|
||||
python: 3.8
|
||||
- env: TOXENV=pypy3
|
||||
|
@ -68,7 +68,7 @@ members of the project's leadership.
|
||||
## Attribution
|
||||
|
||||
This Code of Conduct is adapted from the [Contributor Covenant][homepage], version 1.4,
|
||||
available at [http://contributor-covenant.org/version/1/4][version]
|
||||
available at [http://contributor-covenant.org/version/1/4][version].
|
||||
|
||||
[homepage]: http://contributor-covenant.org
|
||||
[version]: http://contributor-covenant.org/version/1/4/
|
||||
|
18
README.rst
18
README.rst
@ -34,8 +34,8 @@ Scrapy is a fast high-level web crawling and web scraping framework, used to
|
||||
crawl websites and extract structured data from their pages. It can be used for
|
||||
a wide range of purposes, from data mining to monitoring and automated testing.
|
||||
|
||||
For more information including a list of features check the Scrapy homepage at:
|
||||
https://scrapy.org
|
||||
Check the Scrapy homepage at https://scrapy.org for more information,
|
||||
including a list of features.
|
||||
|
||||
Requirements
|
||||
============
|
||||
@ -50,8 +50,8 @@ The quick way::
|
||||
|
||||
pip install scrapy
|
||||
|
||||
For more details see the install section in the documentation:
|
||||
https://docs.scrapy.org/en/latest/intro/install.html
|
||||
See the install section in the documentation at
|
||||
https://docs.scrapy.org/en/latest/intro/install.html for more details.
|
||||
|
||||
Documentation
|
||||
=============
|
||||
@ -62,17 +62,17 @@ directory.
|
||||
Releases
|
||||
========
|
||||
|
||||
You can find release notes at https://docs.scrapy.org/en/latest/news.html
|
||||
You can check https://docs.scrapy.org/en/latest/news.html for the release notes.
|
||||
|
||||
Community (blog, twitter, mail list, IRC)
|
||||
=========================================
|
||||
|
||||
See https://scrapy.org/community/
|
||||
See https://scrapy.org/community/ for details.
|
||||
|
||||
Contributing
|
||||
============
|
||||
|
||||
See https://docs.scrapy.org/en/master/contributing.html
|
||||
See https://docs.scrapy.org/en/master/contributing.html for details.
|
||||
|
||||
Code of Conduct
|
||||
---------------
|
||||
@ -86,9 +86,9 @@ Please report unacceptable behavior to opensource@scrapinghub.com.
|
||||
Companies using Scrapy
|
||||
======================
|
||||
|
||||
See https://scrapy.org/companies/
|
||||
See https://scrapy.org/companies/ for a list.
|
||||
|
||||
Commercial Support
|
||||
==================
|
||||
|
||||
See https://scrapy.org/support/
|
||||
See https://scrapy.org/support/ for details.
|
||||
|
281
docs/_tests/quotes1.html
Normal file
281
docs/_tests/quotes1.html
Normal file
@ -0,0 +1,281 @@
|
||||
<!DOCTYPE html>
|
||||
<html lang="en">
|
||||
<head>
|
||||
<meta charset="UTF-8">
|
||||
<title>Quotes to Scrape</title>
|
||||
<link rel="stylesheet" href="/static/bootstrap.min.css">
|
||||
<link rel="stylesheet" href="/static/main.css">
|
||||
</head>
|
||||
<body>
|
||||
<div class="container">
|
||||
<div class="row header-box">
|
||||
<div class="col-md-8">
|
||||
<h1>
|
||||
<a href="/" style="text-decoration: none">Quotes to Scrape</a>
|
||||
</h1>
|
||||
</div>
|
||||
<div class="col-md-4">
|
||||
<p>
|
||||
|
||||
<a href="/login">Login</a>
|
||||
|
||||
</p>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
|
||||
<div class="row">
|
||||
<div class="col-md-8">
|
||||
|
||||
<div class="quote" itemscope itemtype="http://schema.org/CreativeWork">
|
||||
<span class="text" itemprop="text">“The world as we have created it is a process of our thinking. It cannot be changed without changing our thinking.”</span>
|
||||
<span>by <small class="author" itemprop="author">Albert Einstein</small>
|
||||
<a href="/author/Albert-Einstein">(about)</a>
|
||||
</span>
|
||||
<div class="tags">
|
||||
Tags:
|
||||
<meta class="keywords" itemprop="keywords" content="change,deep-thoughts,thinking,world" / >
|
||||
|
||||
<a class="tag" href="/tag/change/page/1/">change</a>
|
||||
|
||||
<a class="tag" href="/tag/deep-thoughts/page/1/">deep-thoughts</a>
|
||||
|
||||
<a class="tag" href="/tag/thinking/page/1/">thinking</a>
|
||||
|
||||
<a class="tag" href="/tag/world/page/1/">world</a>
|
||||
|
||||
</div>
|
||||
</div>
|
||||
|
||||
<div class="quote" itemscope itemtype="http://schema.org/CreativeWork">
|
||||
<span class="text" itemprop="text">“It is our choices, Harry, that show what we truly are, far more than our abilities.”</span>
|
||||
<span>by <small class="author" itemprop="author">J.K. Rowling</small>
|
||||
<a href="/author/J-K-Rowling">(about)</a>
|
||||
</span>
|
||||
<div class="tags">
|
||||
Tags:
|
||||
<meta class="keywords" itemprop="keywords" content="abilities,choices" / >
|
||||
|
||||
<a class="tag" href="/tag/abilities/page/1/">abilities</a>
|
||||
|
||||
<a class="tag" href="/tag/choices/page/1/">choices</a>
|
||||
|
||||
</div>
|
||||
</div>
|
||||
|
||||
<div class="quote" itemscope itemtype="http://schema.org/CreativeWork">
|
||||
<span class="text" itemprop="text">“There are only two ways to live your life. One is as though nothing is a miracle. The other is as though everything is a miracle.”</span>
|
||||
<span>by <small class="author" itemprop="author">Albert Einstein</small>
|
||||
<a href="/author/Albert-Einstein">(about)</a>
|
||||
</span>
|
||||
<div class="tags">
|
||||
Tags:
|
||||
<meta class="keywords" itemprop="keywords" content="inspirational,life,live,miracle,miracles" / >
|
||||
|
||||
<a class="tag" href="/tag/inspirational/page/1/">inspirational</a>
|
||||
|
||||
<a class="tag" href="/tag/life/page/1/">life</a>
|
||||
|
||||
<a class="tag" href="/tag/live/page/1/">live</a>
|
||||
|
||||
<a class="tag" href="/tag/miracle/page/1/">miracle</a>
|
||||
|
||||
<a class="tag" href="/tag/miracles/page/1/">miracles</a>
|
||||
|
||||
</div>
|
||||
</div>
|
||||
|
||||
<div class="quote" itemscope itemtype="http://schema.org/CreativeWork">
|
||||
<span class="text" itemprop="text">“The person, be it gentleman or lady, who has not pleasure in a good novel, must be intolerably stupid.”</span>
|
||||
<span>by <small class="author" itemprop="author">Jane Austen</small>
|
||||
<a href="/author/Jane-Austen">(about)</a>
|
||||
</span>
|
||||
<div class="tags">
|
||||
Tags:
|
||||
<meta class="keywords" itemprop="keywords" content="aliteracy,books,classic,humor" / >
|
||||
|
||||
<a class="tag" href="/tag/aliteracy/page/1/">aliteracy</a>
|
||||
|
||||
<a class="tag" href="/tag/books/page/1/">books</a>
|
||||
|
||||
<a class="tag" href="/tag/classic/page/1/">classic</a>
|
||||
|
||||
<a class="tag" href="/tag/humor/page/1/">humor</a>
|
||||
|
||||
</div>
|
||||
</div>
|
||||
|
||||
<div class="quote" itemscope itemtype="http://schema.org/CreativeWork">
|
||||
<span class="text" itemprop="text">“Imperfection is beauty, madness is genius and it's better to be absolutely ridiculous than absolutely boring.”</span>
|
||||
<span>by <small class="author" itemprop="author">Marilyn Monroe</small>
|
||||
<a href="/author/Marilyn-Monroe">(about)</a>
|
||||
</span>
|
||||
<div class="tags">
|
||||
Tags:
|
||||
<meta class="keywords" itemprop="keywords" content="be-yourself,inspirational" / >
|
||||
|
||||
<a class="tag" href="/tag/be-yourself/page/1/">be-yourself</a>
|
||||
|
||||
<a class="tag" href="/tag/inspirational/page/1/">inspirational</a>
|
||||
|
||||
</div>
|
||||
</div>
|
||||
|
||||
<div class="quote" itemscope itemtype="http://schema.org/CreativeWork">
|
||||
<span class="text" itemprop="text">“Try not to become a man of success. Rather become a man of value.”</span>
|
||||
<span>by <small class="author" itemprop="author">Albert Einstein</small>
|
||||
<a href="/author/Albert-Einstein">(about)</a>
|
||||
</span>
|
||||
<div class="tags">
|
||||
Tags:
|
||||
<meta class="keywords" itemprop="keywords" content="adulthood,success,value" / >
|
||||
|
||||
<a class="tag" href="/tag/adulthood/page/1/">adulthood</a>
|
||||
|
||||
<a class="tag" href="/tag/success/page/1/">success</a>
|
||||
|
||||
<a class="tag" href="/tag/value/page/1/">value</a>
|
||||
|
||||
</div>
|
||||
</div>
|
||||
|
||||
<div class="quote" itemscope itemtype="http://schema.org/CreativeWork">
|
||||
<span class="text" itemprop="text">“It is better to be hated for what you are than to be loved for what you are not.”</span>
|
||||
<span>by <small class="author" itemprop="author">André Gide</small>
|
||||
<a href="/author/Andre-Gide">(about)</a>
|
||||
</span>
|
||||
<div class="tags">
|
||||
Tags:
|
||||
<meta class="keywords" itemprop="keywords" content="life,love" / >
|
||||
|
||||
<a class="tag" href="/tag/life/page/1/">life</a>
|
||||
|
||||
<a class="tag" href="/tag/love/page/1/">love</a>
|
||||
|
||||
</div>
|
||||
</div>
|
||||
|
||||
<div class="quote" itemscope itemtype="http://schema.org/CreativeWork">
|
||||
<span class="text" itemprop="text">“I have not failed. I've just found 10,000 ways that won't work.”</span>
|
||||
<span>by <small class="author" itemprop="author">Thomas A. Edison</small>
|
||||
<a href="/author/Thomas-A-Edison">(about)</a>
|
||||
</span>
|
||||
<div class="tags">
|
||||
Tags:
|
||||
<meta class="keywords" itemprop="keywords" content="edison,failure,inspirational,paraphrased" / >
|
||||
|
||||
<a class="tag" href="/tag/edison/page/1/">edison</a>
|
||||
|
||||
<a class="tag" href="/tag/failure/page/1/">failure</a>
|
||||
|
||||
<a class="tag" href="/tag/inspirational/page/1/">inspirational</a>
|
||||
|
||||
<a class="tag" href="/tag/paraphrased/page/1/">paraphrased</a>
|
||||
|
||||
</div>
|
||||
</div>
|
||||
|
||||
<div class="quote" itemscope itemtype="http://schema.org/CreativeWork">
|
||||
<span class="text" itemprop="text">“A woman is like a tea bag; you never know how strong it is until it's in hot water.”</span>
|
||||
<span>by <small class="author" itemprop="author">Eleanor Roosevelt</small>
|
||||
<a href="/author/Eleanor-Roosevelt">(about)</a>
|
||||
</span>
|
||||
<div class="tags">
|
||||
Tags:
|
||||
<meta class="keywords" itemprop="keywords" content="misattributed-eleanor-roosevelt" / >
|
||||
|
||||
<a class="tag" href="/tag/misattributed-eleanor-roosevelt/page/1/">misattributed-eleanor-roosevelt</a>
|
||||
|
||||
</div>
|
||||
</div>
|
||||
|
||||
<div class="quote" itemscope itemtype="http://schema.org/CreativeWork">
|
||||
<span class="text" itemprop="text">“A day without sunshine is like, you know, night.”</span>
|
||||
<span>by <small class="author" itemprop="author">Steve Martin</small>
|
||||
<a href="/author/Steve-Martin">(about)</a>
|
||||
</span>
|
||||
<div class="tags">
|
||||
Tags:
|
||||
<meta class="keywords" itemprop="keywords" content="humor,obvious,simile" / >
|
||||
|
||||
<a class="tag" href="/tag/humor/page/1/">humor</a>
|
||||
|
||||
<a class="tag" href="/tag/obvious/page/1/">obvious</a>
|
||||
|
||||
<a class="tag" href="/tag/simile/page/1/">simile</a>
|
||||
|
||||
</div>
|
||||
</div>
|
||||
|
||||
<nav>
|
||||
<ul class="pager">
|
||||
|
||||
|
||||
<li class="next">
|
||||
<a href="/page/2/">Next <span aria-hidden="true">→</span></a>
|
||||
</li>
|
||||
|
||||
</ul>
|
||||
</nav>
|
||||
</div>
|
||||
<div class="col-md-4 tags-box">
|
||||
|
||||
<h2>Top Ten tags</h2>
|
||||
|
||||
<span class="tag-item">
|
||||
<a class="tag" style="font-size: 28px" href="/tag/love/">love</a>
|
||||
</span>
|
||||
|
||||
<span class="tag-item">
|
||||
<a class="tag" style="font-size: 26px" href="/tag/inspirational/">inspirational</a>
|
||||
</span>
|
||||
|
||||
<span class="tag-item">
|
||||
<a class="tag" style="font-size: 26px" href="/tag/life/">life</a>
|
||||
</span>
|
||||
|
||||
<span class="tag-item">
|
||||
<a class="tag" style="font-size: 24px" href="/tag/humor/">humor</a>
|
||||
</span>
|
||||
|
||||
<span class="tag-item">
|
||||
<a class="tag" style="font-size: 22px" href="/tag/books/">books</a>
|
||||
</span>
|
||||
|
||||
<span class="tag-item">
|
||||
<a class="tag" style="font-size: 14px" href="/tag/reading/">reading</a>
|
||||
</span>
|
||||
|
||||
<span class="tag-item">
|
||||
<a class="tag" style="font-size: 10px" href="/tag/friendship/">friendship</a>
|
||||
</span>
|
||||
|
||||
<span class="tag-item">
|
||||
<a class="tag" style="font-size: 8px" href="/tag/friends/">friends</a>
|
||||
</span>
|
||||
|
||||
<span class="tag-item">
|
||||
<a class="tag" style="font-size: 8px" href="/tag/truth/">truth</a>
|
||||
</span>
|
||||
|
||||
<span class="tag-item">
|
||||
<a class="tag" style="font-size: 6px" href="/tag/simile/">simile</a>
|
||||
</span>
|
||||
|
||||
|
||||
</div>
|
||||
</div>
|
||||
|
||||
</div>
|
||||
<footer class="footer">
|
||||
<div class="container">
|
||||
<p class="text-muted">
|
||||
Quotes by: <a href="https://www.goodreads.com/quotes">GoodReads.com</a>
|
||||
</p>
|
||||
<p class="copyright">
|
||||
Made with <span class='sh-red'>❤</span> by <a href="https://scrapinghub.com">Scrapinghub</a>
|
||||
</p>
|
||||
</div>
|
||||
</footer>
|
||||
</body>
|
||||
</html>
|
@ -27,10 +27,12 @@ sys.path.insert(0, path.dirname(path.dirname(__file__)))
|
||||
# Add any Sphinx extension module names here, as strings. They can be extensions
|
||||
# coming with Sphinx (named 'sphinx.ext.*') or your custom ones.
|
||||
extensions = [
|
||||
'notfound.extension',
|
||||
'scrapydocs',
|
||||
'sphinx.ext.autodoc',
|
||||
'sphinx.ext.coverage',
|
||||
'sphinx.ext.intersphinx',
|
||||
'sphinx.ext.viewcode',
|
||||
]
|
||||
|
||||
# Add any paths that contain templates here, relative to this directory.
|
||||
@ -237,7 +239,7 @@ coverage_ignore_pyobjects = [
|
||||
r'\bContractsManager\b$',
|
||||
|
||||
# For default contracts we only want to document their general purpose in
|
||||
# their constructor, the methods they reimplement to achieve that purpose
|
||||
# their __init__ method, the methods they reimplement to achieve that purpose
|
||||
# should be irrelevant to developers using those contracts.
|
||||
r'\w+Contract\.(adjust_request_args|(pre|post)_process)$',
|
||||
|
||||
@ -273,4 +275,5 @@ coverage_ignore_pyobjects = [
|
||||
|
||||
intersphinx_mapping = {
|
||||
'python': ('https://docs.python.org/3', None),
|
||||
'sphinx': ('https://www.sphinx-doc.org/en/stable', None),
|
||||
}
|
||||
|
29
docs/conftest.py
Normal file
29
docs/conftest.py
Normal file
@ -0,0 +1,29 @@
|
||||
import os
|
||||
from doctest import ELLIPSIS, NORMALIZE_WHITESPACE
|
||||
|
||||
from scrapy.http.response.html import HtmlResponse
|
||||
from sybil import Sybil
|
||||
from sybil.parsers.codeblock import CodeBlockParser
|
||||
from sybil.parsers.doctest import DocTestParser
|
||||
from sybil.parsers.skip import skip
|
||||
|
||||
|
||||
def load_response(url, filename):
|
||||
input_path = os.path.join(os.path.dirname(__file__), '_tests', filename)
|
||||
with open(input_path, 'rb') as input_file:
|
||||
return HtmlResponse(url, body=input_file.read())
|
||||
|
||||
|
||||
def setup(namespace):
|
||||
namespace['load_response'] = load_response
|
||||
|
||||
|
||||
pytest_collect_file = Sybil(
|
||||
parsers=[
|
||||
DocTestParser(optionflags=ELLIPSIS | NORMALIZE_WHITESPACE),
|
||||
CodeBlockParser(future_imports=['print_function']),
|
||||
skip,
|
||||
],
|
||||
pattern='*.rst',
|
||||
setup=setup,
|
||||
).pytest()
|
@ -177,20 +177,19 @@ Documentation policies
|
||||
======================
|
||||
|
||||
For reference documentation of API members (classes, methods, etc.) use
|
||||
docstrings and make sure that the Sphinx documentation uses the autodoc_
|
||||
extension to pull the docstrings. API reference documentation should follow
|
||||
docstring conventions (`PEP 257`_) and be IDE-friendly: short, to the point,
|
||||
and it may provide short examples.
|
||||
docstrings and make sure that the Sphinx documentation uses the
|
||||
:mod:`~sphinx.ext.autodoc` extension to pull the docstrings. API reference
|
||||
documentation should follow docstring conventions (`PEP 257`_) and be
|
||||
IDE-friendly: short, to the point, and it may provide short examples.
|
||||
|
||||
Other types of documentation, such as tutorials or topics, should be covered in
|
||||
files within the ``docs/`` directory. This includes documentation that is
|
||||
specific to an API member, but goes beyond API reference documentation.
|
||||
|
||||
In any case, if something is covered in a docstring, use the autodoc_
|
||||
extension to pull the docstring into the documentation instead of duplicating
|
||||
the docstring in files within the ``docs/`` directory.
|
||||
|
||||
.. _autodoc: http://www.sphinx-doc.org/en/stable/ext/autodoc.html
|
||||
In any case, if something is covered in a docstring, use the
|
||||
:mod:`~sphinx.ext.autodoc` extension to pull the docstring into the
|
||||
documentation instead of duplicating the docstring in files within the
|
||||
``docs/`` directory.
|
||||
|
||||
Tests
|
||||
=====
|
||||
|
@ -235,13 +235,16 @@ You will see something like::
|
||||
[s] shelp() Shell help (print this help)
|
||||
[s] fetch(req_or_url) Fetch request (or URL) and update local objects
|
||||
[s] view(response) View response in a browser
|
||||
>>>
|
||||
|
||||
Using the shell, you can try selecting elements using `CSS`_ with the response
|
||||
object::
|
||||
object:
|
||||
|
||||
>>> response.css('title')
|
||||
[<Selector xpath='descendant-or-self::title' data='<title>Quotes to Scrape</title>'>]
|
||||
.. invisible-code-block: python
|
||||
|
||||
response = load_response('http://quotes.toscrape.com/page/1/', 'quotes1.html')
|
||||
|
||||
>>> response.css('title')
|
||||
[<Selector xpath='descendant-or-self::title' data='<title>Quotes to Scrape</title>'>]
|
||||
|
||||
The result of running ``response.css('title')`` is a list-like object called
|
||||
:class:`~scrapy.selector.SelectorList`, which represents a list of
|
||||
@ -372,6 +375,9 @@ we want::
|
||||
We get a list of selectors for the quote HTML elements with::
|
||||
|
||||
>>> response.css("div.quote")
|
||||
[<Selector xpath="descendant-or-self::div[@class and contains(concat(' ', normalize-space(@class), ' '), ' quote ')]" data='<div class="quote" itemscope itemtype...'>,
|
||||
<Selector xpath="descendant-or-self::div[@class and contains(concat(' ', normalize-space(@class), ' '), ' quote ')]" data='<div class="quote" itemscope itemtype...'>,
|
||||
...]
|
||||
|
||||
Each of the selectors returned by the query above allows us to run further
|
||||
queries over their sub-elements. Let's assign the first selector to a
|
||||
@ -396,6 +402,12 @@ to get all of them::
|
||||
>>> tags
|
||||
['change', 'deep-thoughts', 'thinking', 'world']
|
||||
|
||||
.. invisible-code-block: python
|
||||
|
||||
from sys import version_info
|
||||
|
||||
.. skip: next if(version_info < (3, 6), reason="Only Python 3.6+ dictionaries match the output")
|
||||
|
||||
Having figured out how to extract each bit, we can now iterate over all the
|
||||
quotes elements and put them together into a Python dictionary::
|
||||
|
||||
@ -404,10 +416,9 @@ quotes elements and put them together into a Python dictionary::
|
||||
... author = quote.css("small.author::text").get()
|
||||
... tags = quote.css("div.tags a.tag::text").getall()
|
||||
... print(dict(text=text, author=author, tags=tags))
|
||||
{'tags': ['change', 'deep-thoughts', 'thinking', 'world'], 'author': 'Albert Einstein', 'text': '“The world as we have created it is a process of our thinking. It cannot be changed without changing our thinking.”'}
|
||||
{'tags': ['abilities', 'choices'], 'author': 'J.K. Rowling', 'text': '“It is our choices, Harry, that show what we truly are, far more than our abilities.”'}
|
||||
... a few more of these, omitted for brevity
|
||||
>>>
|
||||
{'text': '“The world as we have created it is a process of our thinking. It cannot be changed without changing our thinking.”', 'author': 'Albert Einstein', 'tags': ['change', 'deep-thoughts', 'thinking', 'world']}
|
||||
{'text': '“It is our choices, Harry, that show what we truly are, far more than our abilities.”', 'author': 'J.K. Rowling', 'tags': ['abilities', 'choices']}
|
||||
...
|
||||
|
||||
Extracting data in our spider
|
||||
-----------------------------
|
||||
@ -521,7 +532,7 @@ There is also an ``attrib`` property available
|
||||
(see :ref:`selecting-attributes` for more)::
|
||||
|
||||
>>> response.css('li.next a').attrib['href']
|
||||
'/page/2'
|
||||
'/page/2/'
|
||||
|
||||
Let's see now our spider modified to recursively follow the link to the next
|
||||
page, extracting data from it::
|
||||
|
@ -308,12 +308,12 @@ New features
|
||||
convenient way to build JSON requests (:issue:`3504`, :issue:`3505`)
|
||||
|
||||
* A ``process_request`` callback passed to the :class:`~scrapy.spiders.Rule`
|
||||
constructor now receives the :class:`~scrapy.http.Response` object that
|
||||
``__init__`` method now receives the :class:`~scrapy.http.Response` object that
|
||||
originated the request as its second argument (:issue:`3682`)
|
||||
|
||||
* A new ``restrict_text`` parameter for the
|
||||
:attr:`LinkExtractor <scrapy.linkextractors.lxmlhtml.LxmlLinkExtractor>`
|
||||
constructor allows filtering links by linking text (:issue:`3622`,
|
||||
``__init__`` method allows filtering links by linking text (:issue:`3622`,
|
||||
:issue:`3635`)
|
||||
|
||||
* A new :setting:`FEED_STORAGE_S3_ACL` setting allows defining a custom ACL
|
||||
@ -479,7 +479,7 @@ The following deprecated APIs have been removed (:issue:`3578`):
|
||||
|
||||
* From :class:`~scrapy.selector.Selector`:
|
||||
|
||||
* ``_root`` (both the constructor argument and the object property, use
|
||||
* ``_root`` (both the ``__init__`` method argument and the object property, use
|
||||
``root``)
|
||||
|
||||
* ``extract_unquoted`` (use ``getall``)
|
||||
@ -2703,7 +2703,7 @@ Scrapy changes:
|
||||
- removed ``ENCODING_ALIASES`` setting, as encoding auto-detection has been moved to the `w3lib`_ library
|
||||
- promoted :ref:`topics-djangoitem` to main contrib
|
||||
- LogFormatter method now return dicts(instead of strings) to support lazy formatting (:issue:`164`, :commit:`dcef7b0`)
|
||||
- downloader handlers (:setting:`DOWNLOAD_HANDLERS` setting) now receive settings as the first argument of the constructor
|
||||
- downloader handlers (:setting:`DOWNLOAD_HANDLERS` setting) now receive settings as the first argument of the ``__init__`` method
|
||||
- replaced memory usage acounting with (more portable) `resource`_ module, removed ``scrapy.utils.memory`` module
|
||||
- removed signal: ``scrapy.mail.mail_sent``
|
||||
- removed ``TRACK_REFS`` setting, now :ref:`trackrefs <topics-leaks-trackrefs>` is always enabled
|
||||
@ -2917,7 +2917,7 @@ API changes
|
||||
- ``Request.copy()`` and ``Request.replace()`` now also copies their ``callback`` and ``errback`` attributes (#231)
|
||||
- Removed ``UrlFilterMiddleware`` from ``scrapy.contrib`` (already disabled by default)
|
||||
- Offsite middelware doesn't filter out any request coming from a spider that doesn't have a allowed_domains attribute (#225)
|
||||
- Removed Spider Manager ``load()`` method. Now spiders are loaded in the constructor itself.
|
||||
- Removed Spider Manager ``load()`` method. Now spiders are loaded in the ``__init__`` method itself.
|
||||
- Changes to Scrapy Manager (now called "Crawler"):
|
||||
- ``scrapy.core.manager.ScrapyManager`` class renamed to ``scrapy.crawler.Crawler``
|
||||
- ``scrapy.core.manager.scrapymanager`` singleton moved to ``scrapy.project.crawler``
|
||||
|
@ -1,2 +1,3 @@
|
||||
Sphinx>=2.1
|
||||
sphinx_rtd_theme
|
||||
sphinx-notfound-page
|
||||
sphinx_rtd_theme
|
||||
|
@ -21,7 +21,7 @@ Quick example
|
||||
=============
|
||||
|
||||
There are two ways to instantiate the mail sender. You can instantiate it using
|
||||
the standard constructor::
|
||||
the standard ``__init__`` method::
|
||||
|
||||
from scrapy.mail import MailSender
|
||||
mailer = MailSender()
|
||||
@ -111,7 +111,7 @@ uses `Twisted non-blocking IO`_, like the rest of the framework.
|
||||
Mail settings
|
||||
=============
|
||||
|
||||
These settings define the default constructor values of the :class:`MailSender`
|
||||
These settings define the default ``__init__`` method values of the :class:`MailSender`
|
||||
class, and can be used to configure e-mail notifications in your project without
|
||||
writing any code (for those extensions and code that uses :class:`MailSender`).
|
||||
|
||||
|
@ -87,8 +87,8 @@ described next.
|
||||
1. Declaring a serializer in the field
|
||||
--------------------------------------
|
||||
|
||||
If you use :class:`~.Item` you can declare a serializer in the
|
||||
:ref:`field metadata <topics-items-fields>`. The serializer must be
|
||||
If you use :class:`~.Item` you can declare a serializer in the
|
||||
:ref:`field metadata <topics-items-fields>`. The serializer must be
|
||||
a callable which receives a value and returns its serialized form.
|
||||
|
||||
Example::
|
||||
@ -144,7 +144,7 @@ BaseItemExporter
|
||||
defining what fields to export, whether to export empty fields, or which
|
||||
encoding to use.
|
||||
|
||||
These features can be configured through the constructor arguments which
|
||||
These features can be configured through the ``__init__`` method arguments which
|
||||
populate their respective instance attributes: :attr:`fields_to_export`,
|
||||
:attr:`export_empty_fields`, :attr:`encoding`, :attr:`indent`.
|
||||
|
||||
@ -246,8 +246,8 @@ XmlItemExporter
|
||||
:param item_element: The name of each item element in the exported XML.
|
||||
:type item_element: str
|
||||
|
||||
The additional keyword arguments of this constructor are passed to the
|
||||
:class:`BaseItemExporter` constructor.
|
||||
The additional keyword arguments of this ``__init__`` method are passed to the
|
||||
:class:`BaseItemExporter` ``__init__`` method.
|
||||
|
||||
A typical output of this exporter would be::
|
||||
|
||||
@ -306,9 +306,9 @@ CsvItemExporter
|
||||
multi-valued fields, if found.
|
||||
:type include_headers_line: str
|
||||
|
||||
The additional keyword arguments of this constructor are passed to the
|
||||
:class:`BaseItemExporter` constructor, and the leftover arguments to the
|
||||
`csv.writer`_ constructor, so you can use any ``csv.writer`` constructor
|
||||
The additional keyword arguments of this ``__init__`` method are passed to the
|
||||
:class:`BaseItemExporter` ``__init__`` method, and the leftover arguments to the
|
||||
`csv.writer`_ ``__init__`` method, so you can use any ``csv.writer`` ``__init__`` method
|
||||
argument to customize this exporter.
|
||||
|
||||
A typical output of this exporter would be::
|
||||
@ -334,8 +334,8 @@ PickleItemExporter
|
||||
|
||||
For more information, refer to the `pickle module documentation`_.
|
||||
|
||||
The additional keyword arguments of this constructor are passed to the
|
||||
:class:`BaseItemExporter` constructor.
|
||||
The additional keyword arguments of this ``__init__`` method are passed to the
|
||||
:class:`BaseItemExporter` ``__init__`` method.
|
||||
|
||||
Pickle isn't a human readable format, so no output examples are provided.
|
||||
|
||||
@ -351,8 +351,8 @@ PprintItemExporter
|
||||
:param file: the file-like object to use for exporting the data. Its ``write`` method should
|
||||
accept ``bytes`` (a disk file opened in binary mode, a ``io.BytesIO`` object, etc)
|
||||
|
||||
The additional keyword arguments of this constructor are passed to the
|
||||
:class:`BaseItemExporter` constructor.
|
||||
The additional keyword arguments of this ``__init__`` method are passed to the
|
||||
:class:`BaseItemExporter` ``__init__`` method.
|
||||
|
||||
A typical output of this exporter would be::
|
||||
|
||||
@ -367,10 +367,10 @@ JsonItemExporter
|
||||
.. class:: JsonItemExporter(file, \**kwargs)
|
||||
|
||||
Exports Items in JSON format to the specified file-like object, writing all
|
||||
objects as a list of objects. The additional constructor arguments are
|
||||
passed to the :class:`BaseItemExporter` constructor, and the leftover
|
||||
arguments to the `JSONEncoder`_ constructor, so you can use any
|
||||
`JSONEncoder`_ constructor argument to customize this exporter.
|
||||
objects as a list of objects. The additional ``__init__`` method arguments are
|
||||
passed to the :class:`BaseItemExporter` ``__init__`` method, and the leftover
|
||||
arguments to the `JSONEncoder`_ ``__init__`` method, so you can use any
|
||||
`JSONEncoder`_ ``__init__`` method argument to customize this exporter.
|
||||
|
||||
:param file: the file-like object to use for exporting the data. Its ``write`` method should
|
||||
accept ``bytes`` (a disk file opened in binary mode, a ``io.BytesIO`` object, etc)
|
||||
@ -398,10 +398,10 @@ JsonLinesItemExporter
|
||||
.. class:: JsonLinesItemExporter(file, \**kwargs)
|
||||
|
||||
Exports Items in JSON format to the specified file-like object, writing one
|
||||
JSON-encoded item per line. The additional constructor arguments are passed
|
||||
to the :class:`BaseItemExporter` constructor, and the leftover arguments to
|
||||
the `JSONEncoder`_ constructor, so you can use any `JSONEncoder`_
|
||||
constructor argument to customize this exporter.
|
||||
JSON-encoded item per line. The additional ``__init__`` method arguments are passed
|
||||
to the :class:`BaseItemExporter` ``__init__`` method, and the leftover arguments to
|
||||
the `JSONEncoder`_ ``__init__`` method, so you can use any `JSONEncoder`_
|
||||
``__init__`` method argument to customize this exporter.
|
||||
|
||||
:param file: the file-like object to use for exporting the data. Its ``write`` method should
|
||||
accept ``bytes`` (a disk file opened in binary mode, a ``io.BytesIO`` object, etc)
|
||||
|
@ -28,7 +28,7 @@ Loading & activating extensions
|
||||
|
||||
Extensions are loaded and activated at startup by instantiating a single
|
||||
instance of the extension class. Therefore, all the extension initialization
|
||||
code must be performed in the class constructor (``__init__`` method).
|
||||
code must be performed in the class ``__init__`` method.
|
||||
|
||||
To make an extension available, add it to the :setting:`EXTENSIONS` setting in
|
||||
your Scrapy settings. In :setting:`EXTENSIONS`, each extension is represented
|
||||
|
@ -16,12 +16,12 @@ especially in a larger project with many spiders.
|
||||
To define common output data format Scrapy provides the :class:`Item` class.
|
||||
:class:`Item` objects are simple containers used to collect the scraped data.
|
||||
They provide a `dictionary-like`_ API with a convenient syntax for declaring
|
||||
their available fields.
|
||||
their available fields.
|
||||
|
||||
Various Scrapy components use extra information provided by Items:
|
||||
Various Scrapy components use extra information provided by Items:
|
||||
exporters look at declared fields to figure out columns to export,
|
||||
serialization can be customized using Item fields metadata, :mod:`trackref`
|
||||
tracks Item instances to help find memory leaks
|
||||
tracks Item instances to help find memory leaks
|
||||
(see :ref:`topics-leaks-trackrefs`), etc.
|
||||
|
||||
.. _dictionary-like: https://docs.python.org/2/library/stdtypes.html#dict
|
||||
@ -237,7 +237,7 @@ Item objects
|
||||
|
||||
Return a new Item optionally initialized from the given argument.
|
||||
|
||||
Items replicate the standard `dict API`_, including its constructor, and
|
||||
Items replicate the standard `dict API`_, including its ``__init__`` method, and
|
||||
also provide the following additional API members:
|
||||
|
||||
.. automethod:: copy
|
||||
|
@ -71,34 +71,11 @@ on cookies.
|
||||
Request serialization
|
||||
---------------------
|
||||
|
||||
Requests must be serializable by the ``pickle`` module, in order for persistence
|
||||
to work, so you should make sure that your requests are serializable.
|
||||
|
||||
The most common issue here is to use ``lambda`` functions on request callbacks that
|
||||
can't be persisted.
|
||||
|
||||
So, for example, this won't work::
|
||||
|
||||
def some_callback(self, response):
|
||||
somearg = 'test'
|
||||
return scrapy.Request('http://www.example.com',
|
||||
callback=lambda r: self.other_callback(r, somearg))
|
||||
|
||||
def other_callback(self, response, somearg):
|
||||
print("the argument passed is: %s" % somearg)
|
||||
|
||||
But this will::
|
||||
|
||||
def some_callback(self, response):
|
||||
somearg = 'test'
|
||||
return scrapy.Request('http://www.example.com',
|
||||
callback=self.other_callback, cb_kwargs={'somearg': somearg})
|
||||
|
||||
def other_callback(self, response, somearg):
|
||||
print("the argument passed is: %s" % somearg)
|
||||
For persistence to work, :class:`~scrapy.http.Request` objects must be
|
||||
serializable with :mod:`pickle`, except for the ``callback`` and ``errback``
|
||||
values passed to their ``__init__`` method, which must be methods of the
|
||||
runnning :class:`~scrapy.spiders.Spider` class.
|
||||
|
||||
If you wish to log the requests that couldn't be serialized, you can set the
|
||||
:setting:`SCHEDULER_DEBUG` setting to ``True`` in the project's settings page.
|
||||
It is ``False`` by default.
|
||||
|
||||
.. _pickle: https://docs.python.org/library/pickle.html
|
||||
|
@ -26,7 +26,7 @@ Using Item Loaders to populate items
|
||||
|
||||
To use an Item Loader, you must first instantiate it. You can either
|
||||
instantiate it with a dict-like object (e.g. Item or dict) or without one, in
|
||||
which case an Item is automatically instantiated in the Item Loader constructor
|
||||
which case an Item is automatically instantiated in the Item Loader ``__init__`` method
|
||||
using the Item class specified in the :attr:`ItemLoader.default_item_class`
|
||||
attribute.
|
||||
|
||||
@ -271,7 +271,7 @@ There are several ways to modify Item Loader context values:
|
||||
loader.context['unit'] = 'cm'
|
||||
|
||||
2. On Item Loader instantiation (the keyword arguments of Item Loader
|
||||
constructor are stored in the Item Loader context)::
|
||||
``__init__`` method are stored in the Item Loader context)::
|
||||
|
||||
loader = ItemLoader(product, unit='cm')
|
||||
|
||||
@ -500,7 +500,7 @@ ItemLoader objects
|
||||
.. attribute:: default_item_class
|
||||
|
||||
An Item class (or factory), used to instantiate items when not given in
|
||||
the constructor.
|
||||
the ``__init__`` method.
|
||||
|
||||
.. attribute:: default_input_processor
|
||||
|
||||
@ -515,15 +515,15 @@ ItemLoader objects
|
||||
.. attribute:: default_selector_class
|
||||
|
||||
The class used to construct the :attr:`selector` of this
|
||||
:class:`ItemLoader`, if only a response is given in the constructor.
|
||||
If a selector is given in the constructor this attribute is ignored.
|
||||
:class:`ItemLoader`, if only a response is given in the ``__init__`` method.
|
||||
If a selector is given in the ``__init__`` method this attribute is ignored.
|
||||
This attribute is sometimes overridden in subclasses.
|
||||
|
||||
.. attribute:: selector
|
||||
|
||||
The :class:`~scrapy.selector.Selector` object to extract data from.
|
||||
It's either the selector given in the constructor or one created from
|
||||
the response given in the constructor using the
|
||||
It's either the selector given in the ``__init__`` method or one created from
|
||||
the response given in the ``__init__`` method using the
|
||||
:attr:`default_selector_class`. This attribute is meant to be
|
||||
read-only.
|
||||
|
||||
@ -648,7 +648,7 @@ Here is a list of all built-in processors:
|
||||
.. class:: Identity
|
||||
|
||||
The simplest processor, which doesn't do anything. It returns the original
|
||||
values unchanged. It doesn't receive any constructor arguments, nor does it
|
||||
values unchanged. It doesn't receive any ``__init__`` method arguments, nor does it
|
||||
accept Loader contexts.
|
||||
|
||||
Example::
|
||||
@ -662,7 +662,7 @@ Here is a list of all built-in processors:
|
||||
|
||||
Returns the first non-null/non-empty value from the values received,
|
||||
so it's typically used as an output processor to single-valued fields.
|
||||
It doesn't receive any constructor arguments, nor does it accept Loader contexts.
|
||||
It doesn't receive any ``__init__`` method arguments, nor does it accept Loader contexts.
|
||||
|
||||
Example::
|
||||
|
||||
@ -673,7 +673,7 @@ Here is a list of all built-in processors:
|
||||
|
||||
.. class:: Join(separator=u' ')
|
||||
|
||||
Returns the values joined with the separator given in the constructor, which
|
||||
Returns the values joined with the separator given in the ``__init__`` method, which
|
||||
defaults to ``u' '``. It doesn't accept Loader contexts.
|
||||
|
||||
When using the default separator, this processor is equivalent to the
|
||||
@ -711,7 +711,7 @@ Here is a list of all built-in processors:
|
||||
those which do, this processor will pass the currently active :ref:`Loader
|
||||
context <topics-loaders-context>` through that parameter.
|
||||
|
||||
The keyword arguments passed in the constructor are used as the default
|
||||
The keyword arguments passed in the ``__init__`` method are used as the default
|
||||
Loader context values passed to each function call. However, the final
|
||||
Loader context values passed to functions are overridden with the currently
|
||||
active Loader context accessible through the :meth:`ItemLoader.context`
|
||||
@ -755,12 +755,12 @@ Here is a list of all built-in processors:
|
||||
['HELLO, 'THIS', 'IS', 'SCRAPY']
|
||||
|
||||
As with the Compose processor, functions can receive Loader contexts, and
|
||||
constructor keyword arguments are used as default context values. See
|
||||
``__init__`` method keyword arguments are used as default context values. See
|
||||
:class:`Compose` processor for more info.
|
||||
|
||||
.. class:: SelectJmes(json_path)
|
||||
|
||||
Queries the value using the json path provided to the constructor and returns the output.
|
||||
Queries the value using the json path provided to the ``__init__`` method and returns the output.
|
||||
Requires jmespath (https://github.com/jmespath/jmespath.py) to run.
|
||||
This processor takes only one input at a time.
|
||||
|
||||
|
@ -255,18 +255,18 @@ scrapy.utils.log module
|
||||
when running custom scripts using :class:`~scrapy.crawler.CrawlerRunner`.
|
||||
In that case, its usage is not required but it's recommended.
|
||||
|
||||
If you plan on configuring the handlers yourself is still recommended you
|
||||
call this function, passing ``install_root_handler=False``. Bear in mind
|
||||
there won't be any log output set by default in that case.
|
||||
Another option when running custom scripts is to manually configure the logging.
|
||||
To do this you can use `logging.basicConfig()`_ to set a basic root handler.
|
||||
|
||||
To get you started on manually configuring logging's output, you can use
|
||||
`logging.basicConfig()`_ to set a basic root handler. This is an example
|
||||
on how to redirect ``INFO`` or higher messages to a file::
|
||||
Note that :class:`~scrapy.crawler.CrawlerProcess` automatically calls ``configure_logging``,
|
||||
so it is recommended to only use `logging.basicConfig()`_ together with
|
||||
:class:`~scrapy.crawler.CrawlerRunner`.
|
||||
|
||||
This is an example on how to redirect ``INFO`` or higher messages to a file::
|
||||
|
||||
import logging
|
||||
from scrapy.utils.log import configure_logging
|
||||
|
||||
configure_logging(install_root_handler=False)
|
||||
logging.basicConfig(
|
||||
filename='log.txt',
|
||||
format='%(levelname)s: %(message)s',
|
||||
|
@ -137,7 +137,7 @@ Request objects
|
||||
|
||||
A string containing the URL of this request. Keep in mind that this
|
||||
attribute contains the escaped URL, so it can differ from the URL passed in
|
||||
the constructor.
|
||||
the ``__init__`` method.
|
||||
|
||||
This attribute is read-only. To change the URL of a Request use
|
||||
:meth:`replace`.
|
||||
@ -400,7 +400,7 @@ fields with form data from :class:`Response` objects.
|
||||
|
||||
.. class:: FormRequest(url, [formdata, ...])
|
||||
|
||||
The :class:`FormRequest` class adds a new keyword parameter to the constructor. The
|
||||
The :class:`FormRequest` class adds a new keyword parameter to the ``__init__`` method. The
|
||||
remaining arguments are the same as for the :class:`Request` class and are
|
||||
not documented here.
|
||||
|
||||
@ -473,7 +473,7 @@ fields with form data from :class:`Response` objects.
|
||||
:type dont_click: boolean
|
||||
|
||||
The other parameters of this class method are passed directly to the
|
||||
:class:`FormRequest` constructor.
|
||||
:class:`FormRequest` ``__init__`` method.
|
||||
|
||||
.. versionadded:: 0.10.3
|
||||
The ``formname`` parameter.
|
||||
@ -547,7 +547,7 @@ dealing with JSON requests.
|
||||
|
||||
.. class:: JsonRequest(url, [... data, dumps_kwargs])
|
||||
|
||||
The :class:`JsonRequest` class adds two new keyword parameters to the constructor. The
|
||||
The :class:`JsonRequest` class adds two new keyword parameters to the ``__init__`` method. The
|
||||
remaining arguments are the same as for the :class:`Request` class and are
|
||||
not documented here.
|
||||
|
||||
@ -556,7 +556,7 @@ dealing with JSON requests.
|
||||
|
||||
:param data: is any JSON serializable object that needs to be JSON encoded and assigned to body.
|
||||
if :attr:`Request.body` argument is provided this parameter will be ignored.
|
||||
if :attr:`Request.body` argument is not provided and data argument is provided :attr:`Request.method` will be
|
||||
if :attr:`Request.body` argument is not provided and data argument is provided :attr:`Request.method` will be
|
||||
set to ``'POST'`` automatically.
|
||||
:type data: JSON serializable object
|
||||
|
||||
@ -721,7 +721,7 @@ TextResponse objects
|
||||
:class:`Response` class, which is meant to be used only for binary data,
|
||||
such as images, sounds or any media file.
|
||||
|
||||
:class:`TextResponse` objects support a new constructor argument, in
|
||||
:class:`TextResponse` objects support a new ``__init__`` method argument, in
|
||||
addition to the base :class:`Response` objects. The remaining functionality
|
||||
is the same as for the :class:`Response` class and is not documented here.
|
||||
|
||||
@ -755,7 +755,7 @@ TextResponse objects
|
||||
A string with the encoding of this response. The encoding is resolved by
|
||||
trying the following mechanisms, in order:
|
||||
|
||||
1. the encoding passed in the constructor ``encoding`` argument
|
||||
1. the encoding passed in the ``__init__`` method ``encoding`` argument
|
||||
|
||||
2. the encoding declared in the Content-Type HTTP header. If this
|
||||
encoding is not valid (ie. unknown), it is ignored and the next
|
||||
|
39
pytest.ini
39
pytest.ini
@ -2,7 +2,24 @@
|
||||
usefixtures = chdir
|
||||
python_files=test_*.py __init__.py
|
||||
python_classes=
|
||||
addopts = --doctest-modules --assert=plain
|
||||
addopts =
|
||||
--assert=plain
|
||||
--doctest-modules
|
||||
--ignore=docs/_ext
|
||||
--ignore=docs/conf.py
|
||||
--ignore=docs/news.rst
|
||||
--ignore=docs/topics/commands.rst
|
||||
--ignore=docs/topics/debug.rst
|
||||
--ignore=docs/topics/developer-tools.rst
|
||||
--ignore=docs/topics/dynamic-content.rst
|
||||
--ignore=docs/topics/items.rst
|
||||
--ignore=docs/topics/leaks.rst
|
||||
--ignore=docs/topics/loaders.rst
|
||||
--ignore=docs/topics/selectors.rst
|
||||
--ignore=docs/topics/shell.rst
|
||||
--ignore=docs/topics/stats.rst
|
||||
--ignore=docs/topics/telnetconsole.rst
|
||||
--ignore=docs/utils
|
||||
twisted = 1
|
||||
flake8-ignore =
|
||||
# extras
|
||||
@ -30,7 +47,7 @@ flake8-ignore =
|
||||
scrapy/core/engine.py E261 E501 E128 E127 E306 E502
|
||||
scrapy/core/scheduler.py E501
|
||||
scrapy/core/scraper.py E501 E306 E261 E128 W504
|
||||
scrapy/core/spidermw.py E501 E731 E502 E231 E126 E226
|
||||
scrapy/core/spidermw.py E501 E731 E502 E126 E226
|
||||
scrapy/core/downloader/__init__.py F401 E501
|
||||
scrapy/core/downloader/contextfactory.py E501 E128 E126
|
||||
scrapy/core/downloader/middleware.py E501 E502
|
||||
@ -175,14 +192,14 @@ flake8-ignore =
|
||||
tests/test_crawl.py E501 E741 E265
|
||||
tests/test_crawler.py F841 E306 E501
|
||||
tests/test_dependencies.py E302 F841 E501 E305
|
||||
tests/test_downloader_handlers.py E124 E127 E128 E225 E261 E265 F401 E501 E502 E701 E711 E126 E226 E123
|
||||
tests/test_downloader_handlers.py E124 E127 E128 E225 E261 E265 F401 E501 E502 E701 E126 E226 E123
|
||||
tests/test_downloadermiddleware.py E501
|
||||
tests/test_downloadermiddleware_ajaxcrawlable.py E302 E501
|
||||
tests/test_downloadermiddleware_cookies.py E731 E741 E501 E128 E303 E265 E126
|
||||
tests/test_downloadermiddleware_decompression.py E127
|
||||
tests/test_downloadermiddleware_defaultheaders.py E501
|
||||
tests/test_downloadermiddleware_downloadtimeout.py E501
|
||||
tests/test_downloadermiddleware_httpcache.py E713 E501 E302 E305 F401
|
||||
tests/test_downloadermiddleware_httpcache.py E501 E302 E305 F401
|
||||
tests/test_downloadermiddleware_httpcompression.py E501 F401 E251 E126 E123
|
||||
tests/test_downloadermiddleware_httpproxy.py F401 E501 E128
|
||||
tests/test_downloadermiddleware_redirect.py E501 E303 E128 E306 E127 E305
|
||||
@ -196,13 +213,13 @@ flake8-ignore =
|
||||
tests/test_feedexport.py E501 F401 F841 E241
|
||||
tests/test_http_cookies.py E501
|
||||
tests/test_http_headers.py E302 E501
|
||||
tests/test_http_request.py F401 E402 E501 E231 E261 E127 E128 W293 E502 E128 E502 E126 E123
|
||||
tests/test_http_request.py F401 E402 E501 E261 E127 E128 W293 E502 E128 E502 E126 E123
|
||||
tests/test_http_response.py E501 E301 E502 E128 E265
|
||||
tests/test_item.py E701 E128 E231 F841 E306
|
||||
tests/test_item.py E701 E128 F841 E306
|
||||
tests/test_link.py E501
|
||||
tests/test_linkextractors.py E501 E128 E231 E124
|
||||
tests/test_linkextractors.py E501 E128 E124
|
||||
tests/test_loader.py E302 E501 E731 E303 E741 E128 E117 E241
|
||||
tests/test_logformatter.py E128 E501 E231 E122 E302
|
||||
tests/test_logformatter.py E128 E501 E122 E302
|
||||
tests/test_mail.py E302 E128 E501 E305
|
||||
tests/test_middleware.py E302 E501 E128
|
||||
tests/test_pipeline_crawl.py E131 E501 E128 E126
|
||||
@ -221,8 +238,8 @@ flake8-ignore =
|
||||
tests/test_spidermiddleware_output_chain.py F401 E501 E302 W293 E226
|
||||
tests/test_spidermiddleware_referer.py F401 E501 E302 F841 E125 E201 E261 E124 E501 E241 E121
|
||||
tests/test_squeues.py E501 E302 E701 E741
|
||||
tests/test_utils_conf.py E501 E231 E303 E128
|
||||
tests/test_utils_console.py E302 E231
|
||||
tests/test_utils_conf.py E501 E303 E128
|
||||
tests/test_utils_console.py E302
|
||||
tests/test_utils_curl.py E501
|
||||
tests/test_utils_datatypes.py E402 E501 E305
|
||||
tests/test_utils_defer.py E306 E261 E501 E302 F841 E226
|
||||
@ -251,4 +268,4 @@ flake8-ignore =
|
||||
tests/test_spiderloader/test_spiders/spider2.py E302
|
||||
tests/test_spiderloader/test_spiders/spider3.py E302
|
||||
tests/test_spiderloader/test_spiders/nested/spider4.py E302
|
||||
tests/test_utils_misc/__init__.py E501 E231
|
||||
tests/test_utils_misc/__init__.py E501
|
||||
|
@ -231,9 +231,9 @@ class Scraper(object):
|
||||
signal=signals.item_dropped, item=item, response=response,
|
||||
spider=spider, exception=output.value)
|
||||
else:
|
||||
logger.error('Error processing %(item)s', {'item': item},
|
||||
exc_info=failure_to_exc_info(output),
|
||||
extra={'spider': spider})
|
||||
logkws = self.logformatter.error(item, ex, response, spider)
|
||||
logger.log(*logformatter_adapter(logkws), extra={'spider': spider},
|
||||
exc_info=failure_to_exc_info(output))
|
||||
return self.signals.send_catch_log_deferred(
|
||||
signal=signals.item_error, item=item, response=response,
|
||||
spider=spider, failure=output)
|
||||
|
@ -35,7 +35,7 @@ class SpiderMiddlewareManager(MiddlewareManager):
|
||||
self.methods['process_spider_exception'].appendleft(getattr(mw, 'process_spider_exception', None))
|
||||
|
||||
def scrape_response(self, scrape_func, response, request, spider):
|
||||
fname = lambda f:'%s.%s' % (
|
||||
fname = lambda f: '%s.%s' % (
|
||||
f.__self__.__class__.__name__,
|
||||
f.__func__.__name__)
|
||||
|
||||
|
@ -4,9 +4,9 @@ and extract the potentially compressed responses that may arrive.
|
||||
|
||||
import bz2
|
||||
import gzip
|
||||
import zipfile
|
||||
import tarfile
|
||||
import logging
|
||||
import tarfile
|
||||
import zipfile
|
||||
from io import BytesIO
|
||||
from tempfile import mktemp
|
||||
|
||||
|
@ -29,7 +29,7 @@ class BaseItemExporter(object):
|
||||
def _configure(self, options, dont_fail=False):
|
||||
"""Configure the exporter by poping options from the ``options`` dict.
|
||||
If dont_fail is set, it won't raise an exception on unexpected options
|
||||
(useful for using with keyword arguments in subclasses constructors)
|
||||
(useful for using with keyword arguments in subclasses ``__init__`` methods)
|
||||
"""
|
||||
self.encoding = options.pop('encoding', None)
|
||||
self.fields_to_export = options.pop('fields_to_export', None)
|
||||
|
@ -198,9 +198,9 @@ class FeedExporter(object):
|
||||
|
||||
def __init__(self, settings):
|
||||
self.settings = settings
|
||||
self.urifmt = settings['FEED_URI']
|
||||
if not self.urifmt:
|
||||
if not settings['FEED_URI']:
|
||||
raise NotConfigured
|
||||
self.urifmt = str(settings['FEED_URI'])
|
||||
self.format = settings['FEED_FORMAT'].lower()
|
||||
self.export_encoding = settings['FEED_EXPORT_ENCODING']
|
||||
self.storages = self._load_components('FEED_STORAGES')
|
||||
|
@ -65,7 +65,7 @@ class Request(object_ref):
|
||||
s = safe_url_string(url, self.encoding)
|
||||
self._url = escape_ajax(s)
|
||||
|
||||
if ':' not in self._url:
|
||||
if ('://' not in self._url) and (not self._url.startswith('data:')):
|
||||
raise ValueError('Missing scheme in request url: %s' % self._url)
|
||||
|
||||
url = property(_get_url, obsolete_setter(_set_url, 'url'))
|
||||
|
@ -19,23 +19,26 @@ from scrapy.utils.url import (
|
||||
|
||||
# common file extensions that are not followed if they occur in links
|
||||
IGNORED_EXTENSIONS = [
|
||||
# archives
|
||||
'7z', '7zip', 'bz2', 'rar', 'tar', 'tar.gz', 'xz', 'zip',
|
||||
|
||||
# images
|
||||
'mng', 'pct', 'bmp', 'gif', 'jpg', 'jpeg', 'png', 'pst', 'psp', 'tif',
|
||||
'tiff', 'ai', 'drw', 'dxf', 'eps', 'ps', 'svg',
|
||||
'tiff', 'ai', 'drw', 'dxf', 'eps', 'ps', 'svg', 'cdr', 'ico',
|
||||
|
||||
# audio
|
||||
'mp3', 'wma', 'ogg', 'wav', 'ra', 'aac', 'mid', 'au', 'aiff',
|
||||
|
||||
# video
|
||||
'3gp', 'asf', 'asx', 'avi', 'mov', 'mp4', 'mpg', 'qt', 'rm', 'swf', 'wmv',
|
||||
'm4a', 'm4v', 'flv',
|
||||
'm4a', 'm4v', 'flv', 'webm',
|
||||
|
||||
# office suites
|
||||
'xls', 'xlsx', 'ppt', 'pptx', 'pps', 'doc', 'docx', 'odt', 'ods', 'odg',
|
||||
'odp',
|
||||
|
||||
# other
|
||||
'css', 'pdf', 'exe', 'bin', 'rss', 'zip', 'rar',
|
||||
'css', 'pdf', 'exe', 'bin', 'rss', 'dmg', 'iso', 'apk'
|
||||
]
|
||||
|
||||
|
||||
|
@ -8,6 +8,7 @@ from scrapy.utils.request import referer_str
|
||||
SCRAPEDMSG = u"Scraped from %(src)s" + os.linesep + "%(item)s"
|
||||
DROPPEDMSG = u"Dropped: %(exception)s" + os.linesep + "%(item)s"
|
||||
CRAWLEDMSG = u"Crawled (%(status)s) %(request)s%(request_flags)s (referer: %(referer)s)%(response_flags)s"
|
||||
ERRORMSG = u"'Error processing %(item)s'"
|
||||
|
||||
|
||||
class LogFormatter(object):
|
||||
@ -92,6 +93,16 @@ class LogFormatter(object):
|
||||
}
|
||||
}
|
||||
|
||||
def error(self, item, exception, response, spider):
|
||||
"""Logs a message when an item causes an error while it is passing through the item pipeline."""
|
||||
return {
|
||||
'level': logging.ERROR,
|
||||
'msg': ERRORMSG,
|
||||
'args': {
|
||||
'item': item,
|
||||
}
|
||||
}
|
||||
|
||||
@classmethod
|
||||
def from_crawler(cls, crawler):
|
||||
return cls()
|
||||
|
@ -86,9 +86,6 @@ class _SlotPriorityQueues(object):
|
||||
def __len__(self):
|
||||
return sum(len(x) for x in self.pqueues.values()) if self.pqueues else 0
|
||||
|
||||
def __contains__(self, slot):
|
||||
return slot in self.pqueues
|
||||
|
||||
|
||||
class ScrapyPriorityQueue(PriorityQueue):
|
||||
"""
|
||||
|
@ -5,9 +5,10 @@ Python Standard Library.
|
||||
This module must not depend on any module outside the Standard Library.
|
||||
"""
|
||||
|
||||
import copy
|
||||
import collections
|
||||
import copy
|
||||
import warnings
|
||||
from collections.abc import Mapping
|
||||
|
||||
from scrapy.exceptions import ScrapyDeprecationWarning
|
||||
|
||||
@ -223,7 +224,7 @@ class CaselessDict(dict):
|
||||
return dict.setdefault(self, self.normkey(key), self.normvalue(def_val))
|
||||
|
||||
def update(self, seq):
|
||||
seq = seq.items() if isinstance(seq, collections.abc.Mapping) else seq
|
||||
seq = seq.items() if isinstance(seq, Mapping) else seq
|
||||
iseq = ((self.normkey(k), self.normvalue(v)) for k, v in seq)
|
||||
super(CaselessDict, self).update(iseq)
|
||||
|
||||
@ -247,8 +248,9 @@ class LocalCache(collections.OrderedDict):
|
||||
self.limit = limit
|
||||
|
||||
def __setitem__(self, key, value):
|
||||
while len(self) >= self.limit:
|
||||
self.popitem(last=False)
|
||||
if self.limit:
|
||||
while len(self) >= self.limit:
|
||||
self.popitem(last=False)
|
||||
super(LocalCache, self).__setitem__(key, value)
|
||||
|
||||
|
||||
|
@ -296,7 +296,7 @@ class WeakKeyCache(object):
|
||||
def stringify_dict(dct_or_tuples, encoding='utf-8', keys_only=True):
|
||||
"""Return a (new) dict with unicode keys (and values when "keys_only" is
|
||||
False) of the given dict converted to strings. ``dct_or_tuples`` can be a
|
||||
dict or a list of tuples, like any dict constructor supports.
|
||||
dict or a list of tuples, like any dict ``__init__`` method supports.
|
||||
"""
|
||||
d = {}
|
||||
for k, v in dict(dct_or_tuples).items():
|
||||
|
@ -3,10 +3,10 @@ from twisted.internet import reactor, error
|
||||
def listen_tcp(portrange, host, factory):
|
||||
"""Like reactor.listenTCP but tries different ports in a range."""
|
||||
assert len(portrange) <= 2, "invalid portrange: %s" % portrange
|
||||
if not hasattr(portrange, '__iter__'):
|
||||
return reactor.listenTCP(portrange, factory, interface=host)
|
||||
if not portrange:
|
||||
return reactor.listenTCP(0, factory, interface=host)
|
||||
if not hasattr(portrange, '__iter__'):
|
||||
return reactor.listenTCP(portrange, factory, interface=host)
|
||||
if len(portrange) == 1:
|
||||
return reactor.listenTCP(portrange[0], factory, interface=host)
|
||||
for x in range(portrange[0], portrange[1]+1):
|
||||
|
@ -38,7 +38,7 @@ singletons members of that object, as explained below:
  ``scrapy.core.manager.ExecutionManager``) - instantiated with a ``Settings``
  object

- **crawler.settings**: ``scrapy.conf.Settings`` instance (passed in the constructor)
- **crawler.settings**: ``scrapy.conf.Settings`` instance (passed in the ``__init__`` method)
- **crawler.extensions**: ``scrapy.extension.ExtensionManager`` instance
- **crawler.engine**: ``scrapy.core.engine.ExecutionEngine`` instance
  - ``crawler.engine.scheduler``

@ -55,7 +55,7 @@ singletons members of that object, as explained below:
  ``STATS_CLASS`` setting)
- **crawler.log**: Logger class with methods replacing the current
  ``scrapy.log`` functions. Logging would be started (if enabled) on
  ``Crawler`` constructor, so no log starting functions are required.
  ``Crawler`` instantiation, so no log starting functions are required.

  - ``crawler.log.msg``
- **crawler.signals**: signal handling

@ -69,12 +69,12 @@ Required code changes after singletons removal
==============================================

All components (extensions, middlewares, etc) will receive this ``Crawler``
object in their constructors, and this will be the only mechanism for accessing
object in their ``__init__`` methods, and this will be the only mechanism for accessing
any other components (as opposed to importing each singleton from their
respective module). This will also serve to stabilize the core API, something
which we haven't documented so far (partly because of this).

So, for a typical middleware constructor code, instead of this:
So, for a typical middleware ``__init__`` method code, instead of this:

::

@ -125,13 +125,13 @@ Open issues to resolve
- Should we pass ``Settings`` object to ``ScrapyCommand.add_options()``?
- How should spiders access settings?
  - Option 1. Pass ``Crawler`` object to spider constructors too
  - Option 1. Pass ``Crawler`` object to spider ``__init__`` methods too
    - pro: one way to access all components (settings and signals being the
      most relevant to spiders)
    - con?: spider code can access (and control) any crawler component -
      since we don't want to support spiders messing with the crawler (write
      an extension or spider middleware if you need that)
  - Option 2. Pass ``Settings`` object to spider constructors, which would
  - Option 2. Pass ``Settings`` object to spider ``__init__`` methods, which would
    then be accessed through ``self.settings``, like logging which is accessed
    through ``self.log``
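The ``Crawler``-passing pattern this document describes is what later became the ``from_crawler`` hook. A minimal, illustrative sketch (the middleware name and log message are made up) of a component that receives the crawler in its ``__init__`` method instead of importing singletons::

    from scrapy import signals

    class MyMiddleware:

        def __init__(self, crawler):
            # settings and signals come from the crawler, not from global singletons
            self.settings = crawler.settings
            crawler.signals.connect(self.spider_opened, signal=signals.spider_opened)

        @classmethod
        def from_crawler(cls, crawler):
            return cls(crawler)

        def spider_opened(self, spider):
            spider.logger.info("spider %s opened; middleware settings ready", spider.name)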
@ -6,6 +6,7 @@ pytest
pytest-cov
pytest-twisted
pytest-xdist
sybil
testfixtures

# optional for shell wrapper tests
@ -614,7 +614,7 @@ class Http11MockServerTestCase(unittest.TestCase):
        crawler = get_crawler(SingleRequestSpider)
        yield crawler.crawl(seed=Request(url=self.mockserver.url('')))
        failure = crawler.spider.meta.get('failure')
        self.assertTrue(failure == None)
        self.assertTrue(failure is None)
        reason = crawler.spider.meta['close_reason']
        self.assertTrue(reason, 'finished')

@ -636,7 +636,7 @@ class Http11MockServerTestCase(unittest.TestCase):
        yield crawler.crawl(seed=request)
        # download_maxsize = 50 is enough for the gzipped response
        failure = crawler.spider.meta.get('failure')
        self.assertTrue(failure == None)
        self.assertTrue(failure is None)
        reason = crawler.spider.meta['close_reason']
        self.assertTrue(reason, 'finished')
@ -84,8 +84,8 @@ class _BaseTest(unittest.TestCase):

    def assertEqualRequestButWithCacheValidators(self, request1, request2):
        self.assertEqual(request1.url, request2.url)
        assert not b'If-None-Match' in request1.headers
        assert not b'If-Modified-Since' in request1.headers
        assert b'If-None-Match' not in request1.headers
        assert b'If-Modified-Since' not in request1.headers
        assert any(h in request2.headers for h in (b'If-None-Match', b'If-Modified-Since'))
        self.assertEqual(request1.body, request2.body)
@ -6,6 +6,7 @@ import tempfile
import shutil
import string
from io import BytesIO
from pathlib import Path
from unittest import mock
from urllib.parse import urljoin, urlparse, quote
from urllib.request import pathname2url

@ -403,6 +404,7 @@ class FeedExportTest(unittest.TestCase):
        defaults = {
            'FEED_URI': res_uri,
            'FEED_FORMAT': 'csv',
            'FEED_PATH': res_path
        }
        defaults.update(settings or {})
        try:

@ -411,7 +413,7 @@ class FeedExportTest(unittest.TestCase):
            spider_cls.start_urls = [s.url('/')]
            yield runner.crawl(spider_cls)

            with open(res_path, 'rb') as f:
            with open(str(defaults['FEED_PATH']), 'rb') as f:
                content = f.read()

        finally:

@ -841,3 +843,17 @@ class FeedExportTest(unittest.TestCase):
        yield self.exported_data({}, settings)
        self.assertTrue(FromCrawlerCsvItemExporter.init_with_crawler)
        self.assertTrue(FromCrawlerFileFeedStorage.init_with_crawler)

    @defer.inlineCallbacks
    def test_pathlib_uri(self):
        tmpdir = tempfile.mkdtemp()
        feed_uri = Path(tmpdir) / 'res'
        settings = {
            'FEED_FORMAT': 'csv',
            'FEED_STORE_EMPTY': True,
            'FEED_URI': feed_uri,
            'FEED_PATH': feed_uri
        }
        data = yield self.exported_no_data(settings)
        self.assertEqual(data, b'')
        shutil.rmtree(tmpdir, ignore_errors=True)
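The new ``test_pathlib_uri`` exercises passing a ``pathlib.Path`` as the feed URI. A sketch of the same idea in project settings (the output path is illustrative, and ``FEED_FORMAT``/``FEED_URI`` are the feed settings used at this point in time)::

    from pathlib import Path

    custom_settings = {
        'FEED_FORMAT': 'csv',
        'FEED_URI': Path('exports') / 'items.csv',  # Path objects are accepted
    }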
@ -3,7 +3,7 @@ import cgi
import unittest
import re
import json
import xmlrpc.client as xmlrpclib
import xmlrpc.client
import warnings
from unittest import mock
from urllib.parse import parse_qs, unquote_to_bytes, urlparse

@ -20,7 +20,7 @@ class RequestTest(unittest.TestCase):
    default_meta = {}

    def test_init(self):
        # Request requires url in the constructor
        # Request requires url in the __init__ method
        self.assertRaises(Exception, self.request_class)

        # url argument must be basestring

@ -47,11 +47,13 @@ class RequestTest(unittest.TestCase):

    def test_url_no_scheme(self):
        self.assertRaises(ValueError, self.request_class, 'foo')
        self.assertRaises(ValueError, self.request_class, '/foo/')
        self.assertRaises(ValueError, self.request_class, '/foo:bar')

    def test_headers(self):
        # Different ways of setting headers attribute
        url = 'http://www.scrapy.org'
        headers = {b'Accept':'gzip', b'Custom-Header':'nothing to tell you'}
        headers = {b'Accept': 'gzip', b'Custom-Header': 'nothing to tell you'}
        r = self.request_class(url=url, headers=headers)
        p = self.request_class(url=url, headers=r.headers)

@ -495,7 +497,7 @@ class FormRequestTest(RequestTest):
                formdata=(('foo', 'bar'), ('foo', 'baz')))
        self.assertEqual(urlparse(req.url).hostname, 'www.example.com')
        self.assertEqual(urlparse(req.url).query, 'foo=bar&foo=baz')

    def test_from_response_override_duplicate_form_key(self):
        response = _buildresponse(
            """<form action="get.php" method="POST">

@ -652,7 +654,7 @@ class FormRequestTest(RequestTest):
        req = self.request_class.from_response(response, dont_click=True)
        fs = _qs(req)
        self.assertEqual(fs, {b'i1': [b'i1v'], b'i2': [b'i2v']})

    def test_from_response_clickdata_does_not_ignore_image(self):
        response = _buildresponse(
            """<form>

@ -811,7 +813,7 @@ class FormRequestTest(RequestTest):
            <input type="hidden" name="one" value="1">
            <input type="hidden" name="two" value="2">
            </form>""")
        r1 = self.request_class.from_response(response, formdata={'two':'3'})
        r1 = self.request_class.from_response(response, formdata={'two': '3'})
        self.assertEqual(r1.method, 'POST')
        self.assertEqual(r1.headers['Content-type'], b'application/x-www-form-urlencoded')
        fs = _qs(r1)

@ -1218,7 +1220,7 @@ class XmlRpcRequestTest(RequestTest):
        r = self.request_class('http://scrapytest.org/rpc2', **kwargs)
        self.assertEqual(r.headers[b'Content-Type'], b'text/xml')
        self.assertEqual(r.body,
                         to_bytes(xmlrpclib.dumps(**kwargs),
                         to_bytes(xmlrpc.client.dumps(**kwargs),
                                  encoding=kwargs.get('encoding', 'utf-8')))
        self.assertEqual(r.method, 'POST')
        self.assertEqual(r.encoding, kwargs.get('encoding', 'utf-8'))
@ -532,7 +532,7 @@ class XmlResponseTest(TextResponseTest):
        r2 = self.response_class("http://www.example.com", body=body)
        self._assert_response_values(r2, 'iso-8859-1', body)

        # make sure replace() preserves the explicit encoding passed in the constructor
        # make sure replace() preserves the explicit encoding passed in the __init__ method
        body = b"""<?xml version="1.0" encoding="iso-8859-1"?><xml></xml>"""
        r3 = self.response_class("http://www.example.com", body=body, encoding='utf-8')
        body2 = b"New body"
@ -239,7 +239,7 @@ class ItemTest(unittest.TestCase):
    def test_copy(self):
        class TestItem(Item):
            name = Field()
        item = TestItem({'name':'lower'})
        item = TestItem({'name': 'lower'})
        copied_item = item.copy()
        self.assertNotEqual(id(item), id(copied_item))
        copied_item['name'] = copied_item['name'].upper()
@ -43,6 +43,6 @@ class LinkTest(unittest.TestCase):
        l2 = eval(repr(l1))
        self._assert_same_links(l1, l2)

    def test_non_str_url_py2(self):
    def test_bytes_url(self):
        with self.assertRaises(TypeError):
            Link(b"http://www.example.com/\xc2\xa3")
@ -322,7 +322,7 @@ class Base:
            Link(url=page4_url, text=u'href with whitespaces'),
        ])

        lx = self.extractor_cls(attrs=("href","src"), tags=("a","area","img"), deny_extensions=())
        lx = self.extractor_cls(attrs=("href", "src"), tags=("a", "area", "img"), deny_extensions=())
        self.assertEqual(lx.extract_links(self.response), [
            Link(url='http://example.com/sample1.html', text=u''),
            Link(url='http://example.com/sample2.html', text=u'sample 2'),

@ -360,7 +360,7 @@ class Base:
            Link(url='http://example.com/sample2.html', text=u'sample 2'),
        ])

        lx = self.extractor_cls(tags=("a","img"), attrs=("href", "src"), deny_extensions=())
        lx = self.extractor_cls(tags=("a", "img"), attrs=("href", "src"), deny_extensions=())
        self.assertEqual(lx.extract_links(response), [
            Link(url='http://example.com/sample2.html', text=u'sample 2'),
            Link(url='http://example.com/sample2.jpg', text=u''),
@ -725,11 +725,11 @@ class SelectortemLoaderTest(unittest.TestCase):
    </html>
    """)

    def test_constructor(self):
    def test_init_method(self):
        l = TestItemLoader()
        self.assertEqual(l.selector, None)

    def test_constructor_errors(self):
    def test_init_method_errors(self):
        l = TestItemLoader()
        self.assertRaises(RuntimeError, l.add_xpath, 'url', '//a/@href')
        self.assertRaises(RuntimeError, l.replace_xpath, 'url', '//a/@href')

@ -738,7 +738,7 @@ class SelectortemLoaderTest(unittest.TestCase):
        self.assertRaises(RuntimeError, l.replace_css, 'name', '#name::text')
        self.assertRaises(RuntimeError, l.get_css, '#name::text')

    def test_constructor_with_selector(self):
    def test_init_method_with_selector(self):
        sel = Selector(text=u"<html><body><div>marta</div></body></html>")
        l = TestItemLoader(selector=sel)
        self.assertIs(l.selector, sel)

@ -746,7 +746,7 @@ class SelectortemLoaderTest(unittest.TestCase):
        l.add_xpath('name', '//div/text()')
        self.assertEqual(l.get_output_value('name'), [u'Marta'])

    def test_constructor_with_selector_css(self):
    def test_init_method_with_selector_css(self):
        sel = Selector(text=u"<html><body><div>marta</div></body></html>")
        l = TestItemLoader(selector=sel)
        self.assertIs(l.selector, sel)

@ -754,14 +754,14 @@ class SelectortemLoaderTest(unittest.TestCase):
        l.add_css('name', 'div::text')
        self.assertEqual(l.get_output_value('name'), [u'Marta'])

    def test_constructor_with_response(self):
    def test_init_method_with_response(self):
        l = TestItemLoader(response=self.response)
        self.assertTrue(l.selector)

        l.add_xpath('name', '//div/text()')
        self.assertEqual(l.get_output_value('name'), [u'Marta'])

    def test_constructor_with_response_css(self):
    def test_init_method_with_response_css(self):
        l = TestItemLoader(response=self.response)
        self.assertTrue(l.selector)
@ -22,13 +22,13 @@ class CustomItem(Item):
        return "name: %s" % self['name']


class LoggingContribTest(unittest.TestCase):
class LogFormatterTestCase(unittest.TestCase):

    def setUp(self):
        self.formatter = LogFormatter()
        self.spider = Spider('default')

    def test_crawled(self):
    def test_crawled_with_referer(self):
        req = Request("http://www.example.com")
        res = Response("http://www.example.com")
        logkws = self.formatter.crawled(req, res, self.spider)

@ -36,6 +36,7 @@ class LoggingContribTest(unittest.TestCase):
        self.assertEqual(logline,
                         "Crawled (200) <GET http://www.example.com> (referer: None)")

    def test_crawled_without_referer(self):
        req = Request("http://www.example.com", headers={'referer': 'http://example.com'})
        res = Response("http://www.example.com", flags=['cached'])
        logkws = self.formatter.crawled(req, res, self.spider)

@ -44,7 +45,7 @@ class LoggingContribTest(unittest.TestCase):
                         "Crawled (200) <GET http://www.example.com> (referer: http://example.com) ['cached']")

    def test_flags_in_request(self):
        req = Request("http://www.example.com", flags=['test','flag'])
        req = Request("http://www.example.com", flags=['test', 'flag'])
        res = Response("http://www.example.com")
        logkws = self.formatter.crawled(req, res, self.spider)
        logline = logkws['msg'] % logkws['args']

@ -61,6 +62,16 @@ class LoggingContribTest(unittest.TestCase):
        assert all(isinstance(x, str) for x in lines)
        self.assertEqual(lines, [u"Dropped: \u2018", '{}'])

    def test_error(self):
        # In practice, the complete traceback is shown by passing the
        # 'exc_info' argument to the logging function
        item = {'key': 'value'}
        exception = Exception()
        response = Response("http://www.example.com")
        logkws = self.formatter.error(item, exception, response, self.spider)
        logline = logkws['msg'] % logkws['args']
        self.assertEqual(logline, u"'Error processing {'key': 'value'}'")

    def test_scraped(self):
        item = CustomItem()
        item['name'] = u'\xa3'

@ -74,26 +85,46 @@ class LoggingContribTest(unittest.TestCase):

class LogFormatterSubclass(LogFormatter):
    def crawled(self, request, response, spider):
        kwargs = super(LogFormatterSubclass, self).crawled(
            request, response, spider)
        kwargs = super(LogFormatterSubclass, self).crawled(request, response, spider)
        CRAWLEDMSG = (
            u"Crawled (%(status)s) %(request)s (referer: "
            u"%(referer)s)%(flags)s"
            u"Crawled (%(status)s) %(request)s (referer: %(referer)s) %(flags)s"
        )
        log_args = kwargs['args']
        log_args['flags'] = str(request.flags)
        return {
            'level': kwargs['level'],
            'msg': CRAWLEDMSG,
            'args': kwargs['args']
            'args': log_args,
        }


class LogformatterSubclassTest(LoggingContribTest):
class LogformatterSubclassTest(LogFormatterTestCase):
    def setUp(self):
        self.formatter = LogFormatterSubclass()
        self.spider = Spider('default')

    def test_crawled_with_referer(self):
        req = Request("http://www.example.com")
        res = Response("http://www.example.com")
        logkws = self.formatter.crawled(req, res, self.spider)
        logline = logkws['msg'] % logkws['args']
        self.assertEqual(logline,
                         "Crawled (200) <GET http://www.example.com> (referer: None) []")

    def test_crawled_without_referer(self):
        req = Request("http://www.example.com", headers={'referer': 'http://example.com'}, flags=['cached'])
        res = Response("http://www.example.com")
        logkws = self.formatter.crawled(req, res, self.spider)
        logline = logkws['msg'] % logkws['args']
        self.assertEqual(logline,
                         "Crawled (200) <GET http://www.example.com> (referer: http://example.com) ['cached']")

    def test_flags_in_request(self):
        pass
        req = Request("http://www.example.com", flags=['test', 'flag'])
        res = Response("http://www.example.com")
        logkws = self.formatter.crawled(req, res, self.spider)
        logline = logkws['msg'] % logkws['args']
        self.assertEqual(logline, "Crawled (200) <GET http://www.example.com> (referer: None) ['test', 'flag']")


class SkipMessagesLogFormatter(LogFormatter):
@ -41,12 +41,12 @@ class SpiderTest(unittest.TestCase):
        self.assertEqual(list(start_requests), [])

    def test_spider_args(self):
        """Constructor arguments are assigned to spider attributes"""
        """``__init__`` method arguments are assigned to spider attributes"""
        spider = self.spider_class('example.com', foo='bar')
        self.assertEqual(spider.foo, 'bar')

    def test_spider_without_name(self):
        """Constructor arguments are assigned to spider attributes"""
        """``__init__`` method arguments are assigned to spider attributes"""
        self.assertRaises(ValueError, self.spider_class)
        self.assertRaises(ValueError, self.spider_class, somearg='foo')
@ -79,7 +79,7 @@ class BuildComponentListTest(unittest.TestCase):
        self.assertRaises(ValueError, build_component_list, {}, d, convert=lambda x: x)
        d = {'one': {'a': 'a', 'b': 2}}
        self.assertRaises(ValueError, build_component_list, {}, d, convert=lambda x: x)
        d = {'one': 'lorem ipsum',}
        d = {'one': 'lorem ipsum'}
        self.assertRaises(ValueError, build_component_list, {}, d, convert=lambda x: x)
@ -21,7 +21,7 @@ class UtilsConsoleTestCase(unittest.TestCase):
        shell = get_shell_embed_func(['invalid'])
        self.assertEqual(shell, None)

        shell = get_shell_embed_func(['invalid','python'])
        shell = get_shell_embed_func(['invalid', 'python'])
        self.assertTrue(callable(shell))
        self.assertEqual(shell.__name__, '_embed_standard_shell')
@ -1,8 +1,8 @@
from collections.abc import Mapping, MutableMapping
import copy
import unittest
from collections.abc import Mapping, MutableMapping

from scrapy.utils.datatypes import CaselessDict, SequenceExclude
from scrapy.utils.datatypes import CaselessDict, LocalCache, SequenceExclude


__doctests__ = ['scrapy.utils.datatypes']

@ -229,5 +229,31 @@ class SequenceExcludeTest(unittest.TestCase):
        for v in [-3, "test", 1.1]:
            self.assertNotIn(v, d)


class LocalCacheTest(unittest.TestCase):

    def test_cache_with_limit(self):
        cache = LocalCache(limit=2)
        cache['a'] = 1
        cache['b'] = 2
        cache['c'] = 3
        self.assertEqual(len(cache), 2)
        self.assertNotIn('a', cache)
        self.assertIn('b', cache)
        self.assertIn('c', cache)
        self.assertEqual(cache['b'], 2)
        self.assertEqual(cache['c'], 3)

    def test_cache_without_limit(self):
        maximum = 10**4
        cache = LocalCache()
        for x in range(maximum):
            cache[str(x)] = x
        self.assertEqual(len(cache), maximum)
        for x in range(maximum):
            self.assertIn(str(x), cache)
            self.assertEqual(cache[str(x)], x)


if __name__ == "__main__":
    unittest.main()
@ -74,7 +74,7 @@ class UtilsMiscTestCase(unittest.TestCase):
        self.assertEqual(list(arg_to_iter(100)), [100])
        self.assertEqual(list(arg_to_iter(l for l in 'abc')), ['a', 'b', 'c'])
        self.assertEqual(list(arg_to_iter([1, 2, 3])), [1, 2, 3])
        self.assertEqual(list(arg_to_iter({'a':1})), [{'a': 1}])
        self.assertEqual(list(arg_to_iter({'a': 1})), [{'a': 1}])
        self.assertEqual(list(arg_to_iter(TestItem(name="john"))), [TestItem(name="john")])

    def test_create_instance(self):
@ -205,10 +205,10 @@ class UtilsPythonTestCase(unittest.TestCase):
            self.assertEqual(get_func_args(operator.itemgetter(2)), [])
        else:
            self.assertEqual(
                get_func_args(str.split, True), ['sep', 'maxsplit'])
            self.assertEqual(get_func_args(" ".join, True), ['list'])
                get_func_args(str.split, stripself=True), ['sep', 'maxsplit'])
            self.assertEqual(get_func_args(" ".join, stripself=True), ['list'])
            self.assertEqual(
                get_func_args(operator.itemgetter(2), True), ['obj'])
                get_func_args(operator.itemgetter(2), stripself=True), ['obj'])

    def test_without_none_values(self):
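The positional ``True`` flag becomes the ``stripself`` keyword, which drops the implicit ``self``/instance argument from the reported signature. A small sketch, assuming ``get_func_args`` lives in ``scrapy.utils.python``::

    from scrapy.utils.python import get_func_args

    get_func_args(str.split)                  # may include the bound instance
    get_func_args(str.split, stripself=True)  # ['sep', 'maxsplit']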
13
tox.ini

@ -21,7 +21,7 @@ passenv =
    GCS_TEST_FILE_URI
    GCS_PROJECT_ID
commands =
    py.test --cov=scrapy --cov-report= {posargs:scrapy tests}
    py.test --cov=scrapy --cov-report= {posargs:--durations=10 docs scrapy tests}

[testenv:py35]
basepython = python3.5

@ -60,7 +60,14 @@ basepython = python3.8
[testenv:pypy3]
basepython = pypy3
commands =
    py.test {posargs:scrapy tests}
    py.test {posargs:--durations=10 docs scrapy tests}

[testenv:security]
basepython = python3.8
deps =
    bandit
commands =
    bandit -r -c .bandit.yml {posargs:scrapy}

[testenv:flake8]
basepython = python3.8

@ -68,7 +75,7 @@ deps =
    {[testenv]deps}
    pytest-flake8
commands =
    py.test --flake8 {posargs:scrapy tests}
    py.test --flake8 {posargs:docs scrapy tests}

[docs]
changedir = docs
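With this environment defined, the Bandit checks driven by ``.bandit.yml`` can be run locally with ``tox -e security``; the default, pypy3 and flake8 environments now also collect the ``docs`` directory and report the ten slowest tests via ``--durations=10``.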