mirror of
https://github.com/scrapy/scrapy.git
synced 2025-02-21 06:33:07 +00:00
Merge branch 'master' into fix_typos
This commit is contained in:
commit
16b363de31
281
docs/_tests/quotes.html
Normal file
281
docs/_tests/quotes.html
Normal file
@ -0,0 +1,281 @@
|
||||
<!DOCTYPE html>
|
||||
<html lang="en">
|
||||
<head>
|
||||
<meta charset="UTF-8">
|
||||
<title>Quotes to Scrape</title>
|
||||
<link rel="stylesheet" href="/static/bootstrap.min.css">
|
||||
<link rel="stylesheet" href="/static/main.css">
|
||||
</head>
|
||||
<body>
|
||||
<div class="container">
|
||||
<div class="row header-box">
|
||||
<div class="col-md-8">
|
||||
<h1>
|
||||
<a href="/" style="text-decoration: none">Quotes to Scrape</a>
|
||||
</h1>
|
||||
</div>
|
||||
<div class="col-md-4">
|
||||
<p>
|
||||
|
||||
<a href="/login">Login</a>
|
||||
|
||||
</p>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
|
||||
<div class="row">
|
||||
<div class="col-md-8">
|
||||
|
||||
<div class="quote" itemscope itemtype="http://schema.org/CreativeWork">
|
||||
<span class="text" itemprop="text">“The world as we have created it is a process of our thinking. It cannot be changed without changing our thinking.”</span>
|
||||
<span>by <small class="author" itemprop="author">Albert Einstein</small>
|
||||
<a href="/author/Albert-Einstein">(about)</a>
|
||||
</span>
|
||||
<div class="tags">
|
||||
Tags:
|
||||
<meta class="keywords" itemprop="keywords" content="change,deep-thoughts,thinking,world" / >
|
||||
|
||||
<a class="tag" href="/tag/change/page/1/">change</a>
|
||||
|
||||
<a class="tag" href="/tag/deep-thoughts/page/1/">deep-thoughts</a>
|
||||
|
||||
<a class="tag" href="/tag/thinking/page/1/">thinking</a>
|
||||
|
||||
<a class="tag" href="/tag/world/page/1/">world</a>
|
||||
|
||||
</div>
|
||||
</div>
|
||||
|
||||
<div class="quote" itemscope itemtype="http://schema.org/CreativeWork">
|
||||
<span class="text" itemprop="text">“It is our choices, Harry, that show what we truly are, far more than our abilities.”</span>
|
||||
<span>by <small class="author" itemprop="author">J.K. Rowling</small>
|
||||
<a href="/author/J-K-Rowling">(about)</a>
|
||||
</span>
|
||||
<div class="tags">
|
||||
Tags:
|
||||
<meta class="keywords" itemprop="keywords" content="abilities,choices" / >
|
||||
|
||||
<a class="tag" href="/tag/abilities/page/1/">abilities</a>
|
||||
|
||||
<a class="tag" href="/tag/choices/page/1/">choices</a>
|
||||
|
||||
</div>
|
||||
</div>
|
||||
|
||||
<div class="quote" itemscope itemtype="http://schema.org/CreativeWork">
|
||||
<span class="text" itemprop="text">“There are only two ways to live your life. One is as though nothing is a miracle. The other is as though everything is a miracle.”</span>
|
||||
<span>by <small class="author" itemprop="author">Albert Einstein</small>
|
||||
<a href="/author/Albert-Einstein">(about)</a>
|
||||
</span>
|
||||
<div class="tags">
|
||||
Tags:
|
||||
<meta class="keywords" itemprop="keywords" content="inspirational,life,live,miracle,miracles" / >
|
||||
|
||||
<a class="tag" href="/tag/inspirational/page/1/">inspirational</a>
|
||||
|
||||
<a class="tag" href="/tag/life/page/1/">life</a>
|
||||
|
||||
<a class="tag" href="/tag/live/page/1/">live</a>
|
||||
|
||||
<a class="tag" href="/tag/miracle/page/1/">miracle</a>
|
||||
|
||||
<a class="tag" href="/tag/miracles/page/1/">miracles</a>
|
||||
|
||||
</div>
|
||||
</div>
|
||||
|
||||
<div class="quote" itemscope itemtype="http://schema.org/CreativeWork">
|
||||
<span class="text" itemprop="text">“The person, be it gentleman or lady, who has not pleasure in a good novel, must be intolerably stupid.”</span>
|
||||
<span>by <small class="author" itemprop="author">Jane Austen</small>
|
||||
<a href="/author/Jane-Austen">(about)</a>
|
||||
</span>
|
||||
<div class="tags">
|
||||
Tags:
|
||||
<meta class="keywords" itemprop="keywords" content="aliteracy,books,classic,humor" / >
|
||||
|
||||
<a class="tag" href="/tag/aliteracy/page/1/">aliteracy</a>
|
||||
|
||||
<a class="tag" href="/tag/books/page/1/">books</a>
|
||||
|
||||
<a class="tag" href="/tag/classic/page/1/">classic</a>
|
||||
|
||||
<a class="tag" href="/tag/humor/page/1/">humor</a>
|
||||
|
||||
</div>
|
||||
</div>
|
||||
|
||||
<div class="quote" itemscope itemtype="http://schema.org/CreativeWork">
|
||||
<span class="text" itemprop="text">“Imperfection is beauty, madness is genius and it's better to be absolutely ridiculous than absolutely boring.”</span>
|
||||
<span>by <small class="author" itemprop="author">Marilyn Monroe</small>
|
||||
<a href="/author/Marilyn-Monroe">(about)</a>
|
||||
</span>
|
||||
<div class="tags">
|
||||
Tags:
|
||||
<meta class="keywords" itemprop="keywords" content="be-yourself,inspirational" / >
|
||||
|
||||
<a class="tag" href="/tag/be-yourself/page/1/">be-yourself</a>
|
||||
|
||||
<a class="tag" href="/tag/inspirational/page/1/">inspirational</a>
|
||||
|
||||
</div>
|
||||
</div>
|
||||
|
||||
<div class="quote" itemscope itemtype="http://schema.org/CreativeWork">
|
||||
<span class="text" itemprop="text">“Try not to become a man of success. Rather become a man of value.”</span>
|
||||
<span>by <small class="author" itemprop="author">Albert Einstein</small>
|
||||
<a href="/author/Albert-Einstein">(about)</a>
|
||||
</span>
|
||||
<div class="tags">
|
||||
Tags:
|
||||
<meta class="keywords" itemprop="keywords" content="adulthood,success,value" / >
|
||||
|
||||
<a class="tag" href="/tag/adulthood/page/1/">adulthood</a>
|
||||
|
||||
<a class="tag" href="/tag/success/page/1/">success</a>
|
||||
|
||||
<a class="tag" href="/tag/value/page/1/">value</a>
|
||||
|
||||
</div>
|
||||
</div>
|
||||
|
||||
<div class="quote" itemscope itemtype="http://schema.org/CreativeWork">
|
||||
<span class="text" itemprop="text">“It is better to be hated for what you are than to be loved for what you are not.”</span>
|
||||
<span>by <small class="author" itemprop="author">André Gide</small>
|
||||
<a href="/author/Andre-Gide">(about)</a>
|
||||
</span>
|
||||
<div class="tags">
|
||||
Tags:
|
||||
<meta class="keywords" itemprop="keywords" content="life,love" / >
|
||||
|
||||
<a class="tag" href="/tag/life/page/1/">life</a>
|
||||
|
||||
<a class="tag" href="/tag/love/page/1/">love</a>
|
||||
|
||||
</div>
|
||||
</div>
|
||||
|
||||
<div class="quote" itemscope itemtype="http://schema.org/CreativeWork">
|
||||
<span class="text" itemprop="text">“I have not failed. I've just found 10,000 ways that won't work.”</span>
|
||||
<span>by <small class="author" itemprop="author">Thomas A. Edison</small>
|
||||
<a href="/author/Thomas-A-Edison">(about)</a>
|
||||
</span>
|
||||
<div class="tags">
|
||||
Tags:
|
||||
<meta class="keywords" itemprop="keywords" content="edison,failure,inspirational,paraphrased" / >
|
||||
|
||||
<a class="tag" href="/tag/edison/page/1/">edison</a>
|
||||
|
||||
<a class="tag" href="/tag/failure/page/1/">failure</a>
|
||||
|
||||
<a class="tag" href="/tag/inspirational/page/1/">inspirational</a>
|
||||
|
||||
<a class="tag" href="/tag/paraphrased/page/1/">paraphrased</a>
|
||||
|
||||
</div>
|
||||
</div>
|
||||
|
||||
<div class="quote" itemscope itemtype="http://schema.org/CreativeWork">
|
||||
<span class="text" itemprop="text">“A woman is like a tea bag; you never know how strong it is until it's in hot water.”</span>
|
||||
<span>by <small class="author" itemprop="author">Eleanor Roosevelt</small>
|
||||
<a href="/author/Eleanor-Roosevelt">(about)</a>
|
||||
</span>
|
||||
<div class="tags">
|
||||
Tags:
|
||||
<meta class="keywords" itemprop="keywords" content="misattributed-eleanor-roosevelt" / >
|
||||
|
||||
<a class="tag" href="/tag/misattributed-eleanor-roosevelt/page/1/">misattributed-eleanor-roosevelt</a>
|
||||
|
||||
</div>
|
||||
</div>
|
||||
|
||||
<div class="quote" itemscope itemtype="http://schema.org/CreativeWork">
|
||||
<span class="text" itemprop="text">“A day without sunshine is like, you know, night.”</span>
|
||||
<span>by <small class="author" itemprop="author">Steve Martin</small>
|
||||
<a href="/author/Steve-Martin">(about)</a>
|
||||
</span>
|
||||
<div class="tags">
|
||||
Tags:
|
||||
<meta class="keywords" itemprop="keywords" content="humor,obvious,simile" / >
|
||||
|
||||
<a class="tag" href="/tag/humor/page/1/">humor</a>
|
||||
|
||||
<a class="tag" href="/tag/obvious/page/1/">obvious</a>
|
||||
|
||||
<a class="tag" href="/tag/simile/page/1/">simile</a>
|
||||
|
||||
</div>
|
||||
</div>
|
||||
|
||||
<nav>
|
||||
<ul class="pager">
|
||||
|
||||
|
||||
<li class="next">
|
||||
<a href="/page/2/">Next <span aria-hidden="true">→</span></a>
|
||||
</li>
|
||||
|
||||
</ul>
|
||||
</nav>
|
||||
</div>
|
||||
<div class="col-md-4 tags-box">
|
||||
|
||||
<h2>Top Ten tags</h2>
|
||||
|
||||
<span class="tag-item">
|
||||
<a class="tag" style="font-size: 28px" href="/tag/love/">love</a>
|
||||
</span>
|
||||
|
||||
<span class="tag-item">
|
||||
<a class="tag" style="font-size: 26px" href="/tag/inspirational/">inspirational</a>
|
||||
</span>
|
||||
|
||||
<span class="tag-item">
|
||||
<a class="tag" style="font-size: 26px" href="/tag/life/">life</a>
|
||||
</span>
|
||||
|
||||
<span class="tag-item">
|
||||
<a class="tag" style="font-size: 24px" href="/tag/humor/">humor</a>
|
||||
</span>
|
||||
|
||||
<span class="tag-item">
|
||||
<a class="tag" style="font-size: 22px" href="/tag/books/">books</a>
|
||||
</span>
|
||||
|
||||
<span class="tag-item">
|
||||
<a class="tag" style="font-size: 14px" href="/tag/reading/">reading</a>
|
||||
</span>
|
||||
|
||||
<span class="tag-item">
|
||||
<a class="tag" style="font-size: 10px" href="/tag/friendship/">friendship</a>
|
||||
</span>
|
||||
|
||||
<span class="tag-item">
|
||||
<a class="tag" style="font-size: 8px" href="/tag/friends/">friends</a>
|
||||
</span>
|
||||
|
||||
<span class="tag-item">
|
||||
<a class="tag" style="font-size: 8px" href="/tag/truth/">truth</a>
|
||||
</span>
|
||||
|
||||
<span class="tag-item">
|
||||
<a class="tag" style="font-size: 6px" href="/tag/simile/">simile</a>
|
||||
</span>
|
||||
|
||||
|
||||
</div>
|
||||
</div>
|
||||
|
||||
</div>
|
||||
<footer class="footer">
|
||||
<div class="container">
|
||||
<p class="text-muted">
|
||||
Quotes by: <a href="https://www.goodreads.com/quotes">GoodReads.com</a>
|
||||
</p>
|
||||
<p class="copyright">
|
||||
Made with <span class='sh-red'>❤</span> by <a href="https://scrapinghub.com">Scrapinghub</a>
|
||||
</p>
|
||||
</div>
|
||||
</footer>
|
||||
</body>
|
||||
</html>
|
@ -39,7 +39,7 @@ Therefore, you should keep in mind the following things:
|
||||
.. _topics-inspector:
|
||||
|
||||
Inspecting a website
|
||||
===================================
|
||||
====================
|
||||
|
||||
By far the most handy feature of the Developer Tools is the `Inspector`
|
||||
feature, which allows you to inspect the underlying HTML code of
|
||||
@ -79,13 +79,23 @@ sections and tags of a webpage, which greatly improves readability. You can
|
||||
expand and collapse a tag by clicking on the arrow in front of it or by double
|
||||
clicking directly on the tag. If we expand the ``span`` tag with the ``class=
|
||||
"text"`` we will see the quote-text we clicked on. The `Inspector` lets you
|
||||
copy XPaths to selected elements. Let's try it out: Right-click on the ``span``
|
||||
tag, select ``Copy > XPath`` and paste it in the Scrapy shell like so::
|
||||
copy XPaths to selected elements. Let's try it out.
|
||||
|
||||
First open the Scrapy shell at http://quotes.toscrape.com/ in a terminal:
|
||||
|
||||
.. code-block:: none
|
||||
|
||||
$ scrapy shell "http://quotes.toscrape.com/"
|
||||
(...)
|
||||
>>> response.xpath('/html/body/div/div[2]/div[1]/div[1]/span[1]/text()').getall()
|
||||
['"The world as we have created it is a process of our thinking. It cannot be changed without changing our thinking.”]
|
||||
|
||||
Then, back to your web browser, right-click on the ``span`` tag, select
|
||||
``Copy > XPath`` and paste it in the Scrapy shell like so:
|
||||
|
||||
.. invisible-code-block: python
|
||||
|
||||
response = load_response('http://quotes.toscrape.com/', 'quotes.html')
|
||||
|
||||
>>> response.xpath('/html/body/div/div[2]/div[1]/div[1]/span[1]/text()').getall()
|
||||
['“The world as we have created it is a process of our thinking. It cannot be changed without changing our thinking.”']
|
||||
|
||||
Adding ``text()`` at the end we are able to extract the first quote with this
|
||||
basic selector. But this XPath is not really that clever. All it does is
|
||||
@ -112,13 +122,13 @@ see each quote:
|
||||
|
||||
With this knowledge we can refine our XPath: Instead of a path to follow,
|
||||
we'll simply select all ``span`` tags with the ``class="text"`` by using
|
||||
the `has-class-extension`_::
|
||||
the `has-class-extension`_:
|
||||
|
||||
>>> response.xpath('//span[has-class("text")]/text()').getall()
|
||||
['"The world as we have created it is a process of our thinking. It cannot be changed without changing our thinking.”,
|
||||
'“It is our choices, Harry, that show what we truly are, far more than our abilities.”',
|
||||
'“There are only two ways to live your life. One is as though nothing is a miracle. The other is as though everything is a miracle.”',
|
||||
(...)]
|
||||
>>> response.xpath('//span[has-class("text")]/text()').getall()
|
||||
['“The world as we have created it is a process of our thinking. It cannot be changed without changing our thinking.”',
|
||||
'“It is our choices, Harry, that show what we truly are, far more than our abilities.”',
|
||||
'“There are only two ways to live your life. One is as though nothing is a miracle. The other is as though everything is a miracle.”',
|
||||
...]
|
||||
|
||||
And with one simple, cleverer XPath we are able to extract all quotes from
|
||||
the page. We could have constructed a loop over our first XPath to increase
|
||||
@ -159,7 +169,11 @@ The page is quite similar to the basic `quotes.toscrape.com`_-page,
|
||||
but instead of the above-mentioned ``Next`` button, the page
|
||||
automatically loads new quotes when you scroll to the bottom. We
|
||||
could go ahead and try out different XPaths directly, but instead
|
||||
we'll check another quite useful command from the Scrapy shell::
|
||||
we'll check another quite useful command from the Scrapy shell:
|
||||
|
||||
.. skip: next
|
||||
|
||||
.. code-block:: none
|
||||
|
||||
$ scrapy shell "quotes.toscrape.com/scroll"
|
||||
(...)
|
||||
|
28
pytest.ini
28
pytest.ini
@ -8,7 +8,6 @@ addopts =
|
||||
--ignore=docs/_ext
|
||||
--ignore=docs/conf.py
|
||||
--ignore=docs/news.rst
|
||||
--ignore=docs/topics/developer-tools.rst
|
||||
--ignore=docs/topics/dynamic-content.rst
|
||||
--ignore=docs/topics/items.rst
|
||||
--ignore=docs/topics/leaks.rst
|
||||
@ -85,8 +84,8 @@ flake8-ignore =
|
||||
scrapy/http/request/__init__.py E501
|
||||
scrapy/http/request/form.py E501 E123
|
||||
scrapy/http/request/json_request.py E501
|
||||
scrapy/http/response/__init__.py E501 E128 W293 W291
|
||||
scrapy/http/response/text.py E501 W293 E128 E124
|
||||
scrapy/http/response/__init__.py E501 E128
|
||||
scrapy/http/response/text.py E501 E128 E124
|
||||
# scrapy/linkextractors
|
||||
scrapy/linkextractors/__init__.py E731 E501 E402
|
||||
scrapy/linkextractors/lxmlhtml.py E501 E731 E226
|
||||
@ -127,9 +126,9 @@ flake8-ignore =
|
||||
scrapy/utils/httpobj.py E501
|
||||
scrapy/utils/iterators.py E501 E701
|
||||
scrapy/utils/log.py E128 W503
|
||||
scrapy/utils/markup.py F403 W292
|
||||
scrapy/utils/markup.py F403
|
||||
scrapy/utils/misc.py E501 E226
|
||||
scrapy/utils/multipart.py F403 W292
|
||||
scrapy/utils/multipart.py F403
|
||||
scrapy/utils/project.py E501
|
||||
scrapy/utils/python.py E501
|
||||
scrapy/utils/reactor.py E226
|
||||
@ -144,7 +143,6 @@ flake8-ignore =
|
||||
scrapy/utils/url.py E501 F403 E128 F405
|
||||
# scrapy
|
||||
scrapy/__init__.py E402 E501
|
||||
scrapy/_monkeypatches.py W293
|
||||
scrapy/cmdline.py E501
|
||||
scrapy/crawler.py E501
|
||||
scrapy/dupefilters.py E501 E202
|
||||
@ -153,7 +151,7 @@ flake8-ignore =
|
||||
scrapy/interfaces.py E501
|
||||
scrapy/item.py E501 E128
|
||||
scrapy/link.py E501
|
||||
scrapy/logformatter.py E501 W293
|
||||
scrapy/logformatter.py E501
|
||||
scrapy/mail.py E402 E128 E501 E502
|
||||
scrapy/middleware.py E128 E501
|
||||
scrapy/pqueues.py E501
|
||||
@ -174,7 +172,7 @@ flake8-ignore =
|
||||
tests/test_command_parse.py E501 E128 E303 E226
|
||||
tests/test_command_shell.py E501 E128
|
||||
tests/test_commands.py E128 E501
|
||||
tests/test_contracts.py E501 E128 W293
|
||||
tests/test_contracts.py E501 E128
|
||||
tests/test_crawl.py E501 E741 E265
|
||||
tests/test_crawler.py F841 E306 E501
|
||||
tests/test_dependencies.py F841 E501 E305
|
||||
@ -189,17 +187,17 @@ flake8-ignore =
|
||||
tests/test_downloadermiddleware_httpcompression.py E501 E251 E126 E123
|
||||
tests/test_downloadermiddleware_httpproxy.py E501 E128
|
||||
tests/test_downloadermiddleware_redirect.py E501 E303 E128 E306 E127 E305
|
||||
tests/test_downloadermiddleware_retry.py E501 E128 W293 E251 E303 E126
|
||||
tests/test_downloadermiddleware_retry.py E501 E128 E251 E303 E126
|
||||
tests/test_downloadermiddleware_robotstxt.py E501
|
||||
tests/test_downloadermiddleware_stats.py E501
|
||||
tests/test_dupefilters.py E221 E501 E741 W293 W291 E128 E124
|
||||
tests/test_dupefilters.py E221 E501 E741 E128 E124
|
||||
tests/test_engine.py E401 E501 E128
|
||||
tests/test_exporters.py E501 E731 E306 E128 E124
|
||||
tests/test_extension_telnet.py F841
|
||||
tests/test_feedexport.py E501 F841 E241
|
||||
tests/test_http_cookies.py E501
|
||||
tests/test_http_headers.py E501
|
||||
tests/test_http_request.py E402 E501 E127 E128 W293 E128 E126 E123
|
||||
tests/test_http_request.py E402 E501 E127 E128 E128 E126 E123
|
||||
tests/test_http_response.py E501 E301 E128 E265
|
||||
tests/test_item.py E701 E128 F841 E306
|
||||
tests/test_link.py E501
|
||||
@ -209,20 +207,20 @@ flake8-ignore =
|
||||
tests/test_mail.py E128 E501 E305
|
||||
tests/test_middleware.py E501 E128
|
||||
tests/test_pipeline_crawl.py E131 E501 E128 E126
|
||||
tests/test_pipeline_files.py E501 W293 E303 E272 E226
|
||||
tests/test_pipeline_files.py E501 E303 E272 E226
|
||||
tests/test_pipeline_images.py F841 E501 E303
|
||||
tests/test_pipeline_media.py E501 E741 E731 E128 E306 E502
|
||||
tests/test_proxy_connect.py E501 E741
|
||||
tests/test_request_cb_kwargs.py E501
|
||||
tests/test_responsetypes.py E501 E305
|
||||
tests/test_robotstxt_interface.py E501 W291 E501
|
||||
tests/test_robotstxt_interface.py E501 E501
|
||||
tests/test_scheduler.py E501 E126 E123
|
||||
tests/test_selector.py E501 E127
|
||||
tests/test_spider.py E501
|
||||
tests/test_spidermiddleware.py E501 E226
|
||||
tests/test_spidermiddleware_httperror.py E128 E501 E127 E121
|
||||
tests/test_spidermiddleware_offsite.py E501 E128 E111 W293
|
||||
tests/test_spidermiddleware_output_chain.py E501 W293 E226
|
||||
tests/test_spidermiddleware_offsite.py E501 E128 E111
|
||||
tests/test_spidermiddleware_output_chain.py E501 E226
|
||||
tests/test_spidermiddleware_referer.py E501 F841 E125 E201 E124 E501 E241 E121
|
||||
tests/test_squeues.py E501 E701 E741
|
||||
tests/test_utils_conf.py E501 E303 E128
|
||||
|
@ -113,8 +113,8 @@ class Response(object_ref):
|
||||
It accepts the same arguments as ``Request.__init__`` method,
|
||||
but ``url`` can be a relative URL or a ``scrapy.link.Link`` object,
|
||||
not only an absolute URL.
|
||||
|
||||
:class:`~.TextResponse` provides a :meth:`~.TextResponse.follow`
|
||||
|
||||
:class:`~.TextResponse` provides a :meth:`~.TextResponse.follow`
|
||||
method which supports selectors in addition to absolute/relative URLs
|
||||
and Link objects.
|
||||
"""
|
||||
|
@ -125,7 +125,7 @@ class TextResponse(Response):
|
||||
Return a :class:`~.Request` instance to follow a link ``url``.
|
||||
It accepts the same arguments as ``Request.__init__`` method,
|
||||
but ``url`` can be not only an absolute URL, but also
|
||||
|
||||
|
||||
* a relative URL;
|
||||
* a scrapy.link.Link object (e.g. a link extractor result);
|
||||
* an attribute Selector (not SelectorList) - e.g.
|
||||
@ -133,7 +133,7 @@ class TextResponse(Response):
|
||||
``response.xpath('//img/@src')[0]``.
|
||||
* a Selector for ``<a>`` or ``<link>`` element, e.g.
|
||||
``response.css('a.my_link')[0]``.
|
||||
|
||||
|
||||
See :ref:`response-follow-example` for usage examples.
|
||||
"""
|
||||
if isinstance(url, parsel.Selector):
|
||||
|
@ -13,7 +13,7 @@ ERRORMSG = u"'Error processing %(item)s'"
|
||||
|
||||
class LogFormatter(object):
|
||||
"""Class for generating log messages for different actions.
|
||||
|
||||
|
||||
All methods must return a dictionary listing the parameters ``level``, ``msg``
|
||||
and ``args`` which are going to be used for constructing the log message when
|
||||
calling ``logging.log``.
|
||||
@ -48,7 +48,7 @@ class LogFormatter(object):
|
||||
}
|
||||
}
|
||||
"""
|
||||
|
||||
|
||||
def crawled(self, request, response, spider):
|
||||
"""Logs a message when the crawler finds a webpage."""
|
||||
request_flags = ' %s' % str(request.flags) if request.flags else ''
|
||||
|
@ -11,4 +11,4 @@ from w3lib.html import * # noqa: F401
|
||||
|
||||
warnings.warn("Module `scrapy.utils.markup` is deprecated. "
|
||||
"Please import from `w3lib.html` instead.",
|
||||
ScrapyDeprecationWarning, stacklevel=2)
|
||||
ScrapyDeprecationWarning, stacklevel=2)
|
||||
|
@ -12,4 +12,4 @@ from w3lib.form import * # noqa: F401
|
||||
warnings.warn("Module `scrapy.utils.multipart` is deprecated. "
|
||||
"If you're using `encode_multipart` function, please use "
|
||||
"`urllib3.filepost.encode_multipart_formdata` instead",
|
||||
ScrapyDeprecationWarning, stacklevel=2)
|
||||
ScrapyDeprecationWarning, stacklevel=2)
|
||||
|
@ -252,7 +252,7 @@ class ContractsManagerTest(unittest.TestCase):
|
||||
self.assertEqual(len(contracts), 3)
|
||||
self.assertEqual(frozenset(type(x) for x in contracts),
|
||||
frozenset([UrlContract, CallbackKeywordArgumentsContract, ReturnsContract]))
|
||||
|
||||
|
||||
contracts = self.conman.extract_contracts(spider.returns_item_cb_kwargs)
|
||||
self.assertEqual(len(contracts), 3)
|
||||
self.assertEqual(frozenset(type(x) for x in contracts),
|
||||
|
@ -30,25 +30,44 @@ class CrawlTestCase(TestCase):
|
||||
self.assertEqual(len(crawler.spider.urls_visited), 11) # 10 + start_url
|
||||
|
||||
@defer.inlineCallbacks
|
||||
def test_delay(self):
|
||||
# short to long delays
|
||||
yield self._test_delay(0.2, False)
|
||||
yield self._test_delay(1, False)
|
||||
# randoms
|
||||
yield self._test_delay(0.2, True)
|
||||
yield self._test_delay(1, True)
|
||||
def test_fixed_delay(self):
|
||||
yield self._test_delay(total=3, delay=0.1)
|
||||
|
||||
@defer.inlineCallbacks
|
||||
def _test_delay(self, delay, randomize):
|
||||
settings = {"DOWNLOAD_DELAY": delay, 'RANDOMIZE_DOWNLOAD_DELAY': randomize}
|
||||
def test_randomized_delay(self):
|
||||
yield self._test_delay(total=3, delay=0.1, randomize=True)
|
||||
|
||||
@defer.inlineCallbacks
|
||||
def _test_delay(self, total, delay, randomize=False):
|
||||
crawl_kwargs = dict(
|
||||
maxlatency=delay * 2,
|
||||
mockserver=self.mockserver,
|
||||
total=total,
|
||||
)
|
||||
tolerance = (1 - (0.6 if randomize else 0.2))
|
||||
|
||||
settings = {"DOWNLOAD_DELAY": delay,
|
||||
'RANDOMIZE_DOWNLOAD_DELAY': randomize}
|
||||
crawler = CrawlerRunner(settings).create_crawler(FollowAllSpider)
|
||||
yield crawler.crawl(maxlatency=delay * 2, mockserver=self.mockserver)
|
||||
t = crawler.spider.times
|
||||
totaltime = t[-1] - t[0]
|
||||
avgd = totaltime / (len(t) - 1)
|
||||
tolerance = 0.6 if randomize else 0.2
|
||||
self.assertTrue(avgd > delay * (1 - tolerance),
|
||||
"download delay too small: %s" % avgd)
|
||||
yield crawler.crawl(**crawl_kwargs)
|
||||
times = crawler.spider.times
|
||||
total_time = times[-1] - times[0]
|
||||
average = total_time / (len(times) - 1)
|
||||
self.assertTrue(average > delay * tolerance,
|
||||
"download delay too small: %s" % average)
|
||||
|
||||
# Ensure that the same test parameters would cause a failure if no
|
||||
# download delay is set. Otherwise, it means we are using a combination
|
||||
# of ``total`` and ``delay`` values that are too small for the test
|
||||
# code above to have any meaning.
|
||||
settings["DOWNLOAD_DELAY"] = 0
|
||||
crawler = CrawlerRunner(settings).create_crawler(FollowAllSpider)
|
||||
yield crawler.crawl(**crawl_kwargs)
|
||||
times = crawler.spider.times
|
||||
total_time = times[-1] - times[0]
|
||||
average = total_time / (len(times) - 1)
|
||||
self.assertFalse(average > delay / tolerance,
|
||||
"test total or delay values are too small")
|
||||
|
||||
@defer.inlineCallbacks
|
||||
def test_timeout_success(self):
|
||||
|
@ -124,7 +124,7 @@ class MaxRetryTimesTest(unittest.TestCase):
|
||||
|
||||
# SETTINGS: meta(max_retry_times) = 0
|
||||
meta_max_retry_times = 0
|
||||
|
||||
|
||||
req = Request(self.invalid_url, meta={'max_retry_times': meta_max_retry_times})
|
||||
self._test_retry(req, DNSLookupError('foo'), meta_max_retry_times)
|
||||
|
||||
@ -137,7 +137,7 @@ class MaxRetryTimesTest(unittest.TestCase):
|
||||
self._test_retry(req, DNSLookupError('foo'), self.mw.max_retry_times)
|
||||
|
||||
def test_with_metakey_greater(self):
|
||||
|
||||
|
||||
# SETTINGS: RETRY_TIMES < meta(max_retry_times)
|
||||
self.mw.max_retry_times = 2
|
||||
meta_max_retry_times = 3
|
||||
@ -149,7 +149,7 @@ class MaxRetryTimesTest(unittest.TestCase):
|
||||
self._test_retry(req2, DNSLookupError('foo'), self.mw.max_retry_times)
|
||||
|
||||
def test_with_metakey_lesser(self):
|
||||
|
||||
|
||||
# SETTINGS: RETRY_TIMES > meta(max_retry_times)
|
||||
self.mw.max_retry_times = 5
|
||||
meta_max_retry_times = 4
|
||||
@ -172,7 +172,7 @@ class MaxRetryTimesTest(unittest.TestCase):
|
||||
self._test_retry(req, DNSLookupError('foo'), 0)
|
||||
|
||||
def _test_retry(self, req, exception, max_retry_times):
|
||||
|
||||
|
||||
for i in range(0, max_retry_times):
|
||||
req = self.mw.process_exception(req, exception, self.spider)
|
||||
assert isinstance(req, Request)
|
||||
|
@ -142,12 +142,12 @@ class RFPDupeFilterTest(unittest.TestCase):
|
||||
|
||||
r1 = Request('http://scrapytest.org/index.html')
|
||||
r2 = Request('http://scrapytest.org/index.html')
|
||||
|
||||
|
||||
dupefilter.log(r1, spider)
|
||||
dupefilter.log(r2, spider)
|
||||
|
||||
assert crawler.stats.get_value('dupefilter/filtered') == 2
|
||||
l.check_present(('scrapy.dupefilters', 'DEBUG',
|
||||
l.check_present(('scrapy.dupefilters', 'DEBUG',
|
||||
('Filtered duplicate request: <GET http://scrapytest.org/index.html>'
|
||||
' - no more duplicates will be shown'
|
||||
' (see DUPEFILTER_DEBUG to show all duplicates)')))
|
||||
@ -169,7 +169,7 @@ class RFPDupeFilterTest(unittest.TestCase):
|
||||
r2 = Request('http://scrapytest.org/index.html',
|
||||
headers={'Referer': 'http://scrapytest.org/INDEX.html'}
|
||||
)
|
||||
|
||||
|
||||
dupefilter.log(r1, spider)
|
||||
dupefilter.log(r2, spider)
|
||||
|
||||
|
@ -58,7 +58,6 @@ class FilesPipelineTestCase(unittest.TestCase):
|
||||
self.assertEqual(file_path(Request("data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAAR0AAACxCAMAAADOHZloAAACClBMVEX/\
|
||||
//+F0tzCwMK76ZKQ21AMqr7oAAC96JvD5aWM2kvZ78J0N7fmAAC46Y4Ap7y")),
|
||||
'full/178059cbeba2e34120a67f2dc1afc3ecc09b61cb.png')
|
||||
|
||||
|
||||
def test_fs_store(self):
|
||||
assert isinstance(self.pipeline.store, FSFilesStore)
|
||||
|
@ -44,7 +44,7 @@ class BaseRobotParserTest:
|
||||
|
||||
def test_allowed_wildcards(self):
|
||||
robotstxt_robotstxt_body = """User-agent: first
|
||||
Disallow: /disallowed/*/end$
|
||||
Disallow: /disallowed/*/end$
|
||||
|
||||
User-agent: second
|
||||
Allow: /*allowed
|
||||
|
@ -73,7 +73,7 @@ class TestOffsiteMiddleware4(TestOffsiteMiddleware3):
|
||||
|
||||
|
||||
class TestOffsiteMiddleware5(TestOffsiteMiddleware4):
|
||||
|
||||
|
||||
def test_get_host_regex(self):
|
||||
self.spider.allowed_domains = ['http://scrapytest.org', 'scrapy.org', 'scrapy.test.org']
|
||||
with warnings.catch_warnings(record=True) as w:
|
||||
|
@ -156,7 +156,7 @@ class GeneratorFailMiddleware:
|
||||
r['processed'].append('{}.process_spider_output'.format(self.__class__.__name__))
|
||||
yield r
|
||||
raise LookupError()
|
||||
|
||||
|
||||
def process_spider_exception(self, response, exception, spider):
|
||||
method = '{}.process_spider_exception'.format(self.__class__.__name__)
|
||||
spider.logger.info('%s: %s caught', method, exception.__class__.__name__)
|
||||
@ -264,7 +264,7 @@ class TestSpiderMiddleware(TestCase):
|
||||
@classmethod
|
||||
def tearDownClass(cls):
|
||||
cls.mockserver.__exit__(None, None, None)
|
||||
|
||||
|
||||
@defer.inlineCallbacks
|
||||
def crawl_log(self, spider):
|
||||
crawler = get_crawler(spider)
|
||||
@ -308,7 +308,7 @@ class TestSpiderMiddleware(TestCase):
|
||||
self.assertIn("{'from': 'errback'}", str(log1))
|
||||
self.assertNotIn("{'from': 'callback'}", str(log1))
|
||||
self.assertIn("'item_scraped_count': 1", str(log1))
|
||||
|
||||
|
||||
@defer.inlineCallbacks
|
||||
def test_generator_callback(self):
|
||||
"""
|
||||
@ -319,7 +319,7 @@ class TestSpiderMiddleware(TestCase):
|
||||
log2 = yield self.crawl_log(GeneratorCallbackSpider)
|
||||
self.assertIn("Middleware: ImportError exception caught", str(log2))
|
||||
self.assertIn("'item_scraped_count': 2", str(log2))
|
||||
|
||||
|
||||
@defer.inlineCallbacks
|
||||
def test_not_a_generator_callback(self):
|
||||
"""
|
||||
|
Loading…
x
Reference in New Issue
Block a user