1
0
mirror of https://github.com/scrapy/scrapy.git synced 2025-02-21 06:33:07 +00:00

Merge branch 'master' into fix_typos

This commit is contained in:
Marc Hernandez Cabot 2019-12-19 10:02:21 +01:00
commit 16b363de31
16 changed files with 378 additions and 67 deletions

281
docs/_tests/quotes.html Normal file
View File

@ -0,0 +1,281 @@
<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="UTF-8">
<title>Quotes to Scrape</title>
<link rel="stylesheet" href="/static/bootstrap.min.css">
<link rel="stylesheet" href="/static/main.css">
</head>
<body>
<div class="container">
<div class="row header-box">
<div class="col-md-8">
<h1>
<a href="/" style="text-decoration: none">Quotes to Scrape</a>
</h1>
</div>
<div class="col-md-4">
<p>
<a href="/login">Login</a>
</p>
</div>
</div>
<div class="row">
<div class="col-md-8">
<div class="quote" itemscope itemtype="http://schema.org/CreativeWork">
<span class="text" itemprop="text">“The world as we have created it is a process of our thinking. It cannot be changed without changing our thinking.”</span>
<span>by <small class="author" itemprop="author">Albert Einstein</small>
<a href="/author/Albert-Einstein">(about)</a>
</span>
<div class="tags">
Tags:
<meta class="keywords" itemprop="keywords" content="change,deep-thoughts,thinking,world" / >
<a class="tag" href="/tag/change/page/1/">change</a>
<a class="tag" href="/tag/deep-thoughts/page/1/">deep-thoughts</a>
<a class="tag" href="/tag/thinking/page/1/">thinking</a>
<a class="tag" href="/tag/world/page/1/">world</a>
</div>
</div>
<div class="quote" itemscope itemtype="http://schema.org/CreativeWork">
<span class="text" itemprop="text">“It is our choices, Harry, that show what we truly are, far more than our abilities.”</span>
<span>by <small class="author" itemprop="author">J.K. Rowling</small>
<a href="/author/J-K-Rowling">(about)</a>
</span>
<div class="tags">
Tags:
<meta class="keywords" itemprop="keywords" content="abilities,choices" / >
<a class="tag" href="/tag/abilities/page/1/">abilities</a>
<a class="tag" href="/tag/choices/page/1/">choices</a>
</div>
</div>
<div class="quote" itemscope itemtype="http://schema.org/CreativeWork">
<span class="text" itemprop="text">“There are only two ways to live your life. One is as though nothing is a miracle. The other is as though everything is a miracle.”</span>
<span>by <small class="author" itemprop="author">Albert Einstein</small>
<a href="/author/Albert-Einstein">(about)</a>
</span>
<div class="tags">
Tags:
<meta class="keywords" itemprop="keywords" content="inspirational,life,live,miracle,miracles" / >
<a class="tag" href="/tag/inspirational/page/1/">inspirational</a>
<a class="tag" href="/tag/life/page/1/">life</a>
<a class="tag" href="/tag/live/page/1/">live</a>
<a class="tag" href="/tag/miracle/page/1/">miracle</a>
<a class="tag" href="/tag/miracles/page/1/">miracles</a>
</div>
</div>
<div class="quote" itemscope itemtype="http://schema.org/CreativeWork">
<span class="text" itemprop="text">“The person, be it gentleman or lady, who has not pleasure in a good novel, must be intolerably stupid.”</span>
<span>by <small class="author" itemprop="author">Jane Austen</small>
<a href="/author/Jane-Austen">(about)</a>
</span>
<div class="tags">
Tags:
<meta class="keywords" itemprop="keywords" content="aliteracy,books,classic,humor" / >
<a class="tag" href="/tag/aliteracy/page/1/">aliteracy</a>
<a class="tag" href="/tag/books/page/1/">books</a>
<a class="tag" href="/tag/classic/page/1/">classic</a>
<a class="tag" href="/tag/humor/page/1/">humor</a>
</div>
</div>
<div class="quote" itemscope itemtype="http://schema.org/CreativeWork">
<span class="text" itemprop="text">“Imperfection is beauty, madness is genius and it&#39;s better to be absolutely ridiculous than absolutely boring.”</span>
<span>by <small class="author" itemprop="author">Marilyn Monroe</small>
<a href="/author/Marilyn-Monroe">(about)</a>
</span>
<div class="tags">
Tags:
<meta class="keywords" itemprop="keywords" content="be-yourself,inspirational" / >
<a class="tag" href="/tag/be-yourself/page/1/">be-yourself</a>
<a class="tag" href="/tag/inspirational/page/1/">inspirational</a>
</div>
</div>
<div class="quote" itemscope itemtype="http://schema.org/CreativeWork">
<span class="text" itemprop="text">“Try not to become a man of success. Rather become a man of value.”</span>
<span>by <small class="author" itemprop="author">Albert Einstein</small>
<a href="/author/Albert-Einstein">(about)</a>
</span>
<div class="tags">
Tags:
<meta class="keywords" itemprop="keywords" content="adulthood,success,value" / >
<a class="tag" href="/tag/adulthood/page/1/">adulthood</a>
<a class="tag" href="/tag/success/page/1/">success</a>
<a class="tag" href="/tag/value/page/1/">value</a>
</div>
</div>
<div class="quote" itemscope itemtype="http://schema.org/CreativeWork">
<span class="text" itemprop="text">“It is better to be hated for what you are than to be loved for what you are not.”</span>
<span>by <small class="author" itemprop="author">André Gide</small>
<a href="/author/Andre-Gide">(about)</a>
</span>
<div class="tags">
Tags:
<meta class="keywords" itemprop="keywords" content="life,love" / >
<a class="tag" href="/tag/life/page/1/">life</a>
<a class="tag" href="/tag/love/page/1/">love</a>
</div>
</div>
<div class="quote" itemscope itemtype="http://schema.org/CreativeWork">
<span class="text" itemprop="text">“I have not failed. I&#39;ve just found 10,000 ways that won&#39;t work.”</span>
<span>by <small class="author" itemprop="author">Thomas A. Edison</small>
<a href="/author/Thomas-A-Edison">(about)</a>
</span>
<div class="tags">
Tags:
<meta class="keywords" itemprop="keywords" content="edison,failure,inspirational,paraphrased" / >
<a class="tag" href="/tag/edison/page/1/">edison</a>
<a class="tag" href="/tag/failure/page/1/">failure</a>
<a class="tag" href="/tag/inspirational/page/1/">inspirational</a>
<a class="tag" href="/tag/paraphrased/page/1/">paraphrased</a>
</div>
</div>
<div class="quote" itemscope itemtype="http://schema.org/CreativeWork">
<span class="text" itemprop="text">“A woman is like a tea bag; you never know how strong it is until it&#39;s in hot water.”</span>
<span>by <small class="author" itemprop="author">Eleanor Roosevelt</small>
<a href="/author/Eleanor-Roosevelt">(about)</a>
</span>
<div class="tags">
Tags:
<meta class="keywords" itemprop="keywords" content="misattributed-eleanor-roosevelt" / >
<a class="tag" href="/tag/misattributed-eleanor-roosevelt/page/1/">misattributed-eleanor-roosevelt</a>
</div>
</div>
<div class="quote" itemscope itemtype="http://schema.org/CreativeWork">
<span class="text" itemprop="text">“A day without sunshine is like, you know, night.”</span>
<span>by <small class="author" itemprop="author">Steve Martin</small>
<a href="/author/Steve-Martin">(about)</a>
</span>
<div class="tags">
Tags:
<meta class="keywords" itemprop="keywords" content="humor,obvious,simile" / >
<a class="tag" href="/tag/humor/page/1/">humor</a>
<a class="tag" href="/tag/obvious/page/1/">obvious</a>
<a class="tag" href="/tag/simile/page/1/">simile</a>
</div>
</div>
<nav>
<ul class="pager">
<li class="next">
<a href="/page/2/">Next <span aria-hidden="true">&rarr;</span></a>
</li>
</ul>
</nav>
</div>
<div class="col-md-4 tags-box">
<h2>Top Ten tags</h2>
<span class="tag-item">
<a class="tag" style="font-size: 28px" href="/tag/love/">love</a>
</span>
<span class="tag-item">
<a class="tag" style="font-size: 26px" href="/tag/inspirational/">inspirational</a>
</span>
<span class="tag-item">
<a class="tag" style="font-size: 26px" href="/tag/life/">life</a>
</span>
<span class="tag-item">
<a class="tag" style="font-size: 24px" href="/tag/humor/">humor</a>
</span>
<span class="tag-item">
<a class="tag" style="font-size: 22px" href="/tag/books/">books</a>
</span>
<span class="tag-item">
<a class="tag" style="font-size: 14px" href="/tag/reading/">reading</a>
</span>
<span class="tag-item">
<a class="tag" style="font-size: 10px" href="/tag/friendship/">friendship</a>
</span>
<span class="tag-item">
<a class="tag" style="font-size: 8px" href="/tag/friends/">friends</a>
</span>
<span class="tag-item">
<a class="tag" style="font-size: 8px" href="/tag/truth/">truth</a>
</span>
<span class="tag-item">
<a class="tag" style="font-size: 6px" href="/tag/simile/">simile</a>
</span>
</div>
</div>
</div>
<footer class="footer">
<div class="container">
<p class="text-muted">
Quotes by: <a href="https://www.goodreads.com/quotes">GoodReads.com</a>
</p>
<p class="copyright">
Made with <span class='sh-red'></span> by <a href="https://scrapinghub.com">Scrapinghub</a>
</p>
</div>
</footer>
</body>
</html>

View File

@ -39,7 +39,7 @@ Therefore, you should keep in mind the following things:
.. _topics-inspector:
Inspecting a website
===================================
====================
By far the most handy feature of the Developer Tools is the `Inspector`
feature, which allows you to inspect the underlying HTML code of
@ -79,13 +79,23 @@ sections and tags of a webpage, which greatly improves readability. You can
expand and collapse a tag by clicking on the arrow in front of it or by double
clicking directly on the tag. If we expand the ``span`` tag with the ``class=
"text"`` we will see the quote-text we clicked on. The `Inspector` lets you
copy XPaths to selected elements. Let's try it out: Right-click on the ``span``
tag, select ``Copy > XPath`` and paste it in the Scrapy shell like so::
copy XPaths to selected elements. Let's try it out.
First open the Scrapy shell at http://quotes.toscrape.com/ in a terminal:
.. code-block:: none
$ scrapy shell "http://quotes.toscrape.com/"
(...)
>>> response.xpath('/html/body/div/div[2]/div[1]/div[1]/span[1]/text()').getall()
['"The world as we have created it is a process of our thinking. It cannot be changed without changing our thinking.”]
Then, back to your web browser, right-click on the ``span`` tag, select
``Copy > XPath`` and paste it in the Scrapy shell like so:
.. invisible-code-block: python
response = load_response('http://quotes.toscrape.com/', 'quotes.html')
>>> response.xpath('/html/body/div/div[2]/div[1]/div[1]/span[1]/text()').getall()
['“The world as we have created it is a process of our thinking. It cannot be changed without changing our thinking.”']
Adding ``text()`` at the end we are able to extract the first quote with this
basic selector. But this XPath is not really that clever. All it does is
@ -112,13 +122,13 @@ see each quote:
With this knowledge we can refine our XPath: Instead of a path to follow,
we'll simply select all ``span`` tags with the ``class="text"`` by using
the `has-class-extension`_::
the `has-class-extension`_:
>>> response.xpath('//span[has-class("text")]/text()').getall()
['"The world as we have created it is a process of our thinking. It cannot be changed without changing our thinking.”,
'“It is our choices, Harry, that show what we truly are, far more than our abilities.”',
'“There are only two ways to live your life. One is as though nothing is a miracle. The other is as though everything is a miracle.”',
(...)]
>>> response.xpath('//span[has-class("text")]/text()').getall()
['“The world as we have created it is a process of our thinking. It cannot be changed without changing our thinking.”',
'“It is our choices, Harry, that show what we truly are, far more than our abilities.”',
'“There are only two ways to live your life. One is as though nothing is a miracle. The other is as though everything is a miracle.”',
...]
And with one simple, cleverer XPath we are able to extract all quotes from
the page. We could have constructed a loop over our first XPath to increase
@ -159,7 +169,11 @@ The page is quite similar to the basic `quotes.toscrape.com`_-page,
but instead of the above-mentioned ``Next`` button, the page
automatically loads new quotes when you scroll to the bottom. We
could go ahead and try out different XPaths directly, but instead
we'll check another quite useful command from the Scrapy shell::
we'll check another quite useful command from the Scrapy shell:
.. skip: next
.. code-block:: none
$ scrapy shell "quotes.toscrape.com/scroll"
(...)

View File

@ -8,7 +8,6 @@ addopts =
--ignore=docs/_ext
--ignore=docs/conf.py
--ignore=docs/news.rst
--ignore=docs/topics/developer-tools.rst
--ignore=docs/topics/dynamic-content.rst
--ignore=docs/topics/items.rst
--ignore=docs/topics/leaks.rst
@ -85,8 +84,8 @@ flake8-ignore =
scrapy/http/request/__init__.py E501
scrapy/http/request/form.py E501 E123
scrapy/http/request/json_request.py E501
scrapy/http/response/__init__.py E501 E128 W293 W291
scrapy/http/response/text.py E501 W293 E128 E124
scrapy/http/response/__init__.py E501 E128
scrapy/http/response/text.py E501 E128 E124
# scrapy/linkextractors
scrapy/linkextractors/__init__.py E731 E501 E402
scrapy/linkextractors/lxmlhtml.py E501 E731 E226
@ -127,9 +126,9 @@ flake8-ignore =
scrapy/utils/httpobj.py E501
scrapy/utils/iterators.py E501 E701
scrapy/utils/log.py E128 W503
scrapy/utils/markup.py F403 W292
scrapy/utils/markup.py F403
scrapy/utils/misc.py E501 E226
scrapy/utils/multipart.py F403 W292
scrapy/utils/multipart.py F403
scrapy/utils/project.py E501
scrapy/utils/python.py E501
scrapy/utils/reactor.py E226
@ -144,7 +143,6 @@ flake8-ignore =
scrapy/utils/url.py E501 F403 E128 F405
# scrapy
scrapy/__init__.py E402 E501
scrapy/_monkeypatches.py W293
scrapy/cmdline.py E501
scrapy/crawler.py E501
scrapy/dupefilters.py E501 E202
@ -153,7 +151,7 @@ flake8-ignore =
scrapy/interfaces.py E501
scrapy/item.py E501 E128
scrapy/link.py E501
scrapy/logformatter.py E501 W293
scrapy/logformatter.py E501
scrapy/mail.py E402 E128 E501 E502
scrapy/middleware.py E128 E501
scrapy/pqueues.py E501
@ -174,7 +172,7 @@ flake8-ignore =
tests/test_command_parse.py E501 E128 E303 E226
tests/test_command_shell.py E501 E128
tests/test_commands.py E128 E501
tests/test_contracts.py E501 E128 W293
tests/test_contracts.py E501 E128
tests/test_crawl.py E501 E741 E265
tests/test_crawler.py F841 E306 E501
tests/test_dependencies.py F841 E501 E305
@ -189,17 +187,17 @@ flake8-ignore =
tests/test_downloadermiddleware_httpcompression.py E501 E251 E126 E123
tests/test_downloadermiddleware_httpproxy.py E501 E128
tests/test_downloadermiddleware_redirect.py E501 E303 E128 E306 E127 E305
tests/test_downloadermiddleware_retry.py E501 E128 W293 E251 E303 E126
tests/test_downloadermiddleware_retry.py E501 E128 E251 E303 E126
tests/test_downloadermiddleware_robotstxt.py E501
tests/test_downloadermiddleware_stats.py E501
tests/test_dupefilters.py E221 E501 E741 W293 W291 E128 E124
tests/test_dupefilters.py E221 E501 E741 E128 E124
tests/test_engine.py E401 E501 E128
tests/test_exporters.py E501 E731 E306 E128 E124
tests/test_extension_telnet.py F841
tests/test_feedexport.py E501 F841 E241
tests/test_http_cookies.py E501
tests/test_http_headers.py E501
tests/test_http_request.py E402 E501 E127 E128 W293 E128 E126 E123
tests/test_http_request.py E402 E501 E127 E128 E128 E126 E123
tests/test_http_response.py E501 E301 E128 E265
tests/test_item.py E701 E128 F841 E306
tests/test_link.py E501
@ -209,20 +207,20 @@ flake8-ignore =
tests/test_mail.py E128 E501 E305
tests/test_middleware.py E501 E128
tests/test_pipeline_crawl.py E131 E501 E128 E126
tests/test_pipeline_files.py E501 W293 E303 E272 E226
tests/test_pipeline_files.py E501 E303 E272 E226
tests/test_pipeline_images.py F841 E501 E303
tests/test_pipeline_media.py E501 E741 E731 E128 E306 E502
tests/test_proxy_connect.py E501 E741
tests/test_request_cb_kwargs.py E501
tests/test_responsetypes.py E501 E305
tests/test_robotstxt_interface.py E501 W291 E501
tests/test_robotstxt_interface.py E501 E501
tests/test_scheduler.py E501 E126 E123
tests/test_selector.py E501 E127
tests/test_spider.py E501
tests/test_spidermiddleware.py E501 E226
tests/test_spidermiddleware_httperror.py E128 E501 E127 E121
tests/test_spidermiddleware_offsite.py E501 E128 E111 W293
tests/test_spidermiddleware_output_chain.py E501 W293 E226
tests/test_spidermiddleware_offsite.py E501 E128 E111
tests/test_spidermiddleware_output_chain.py E501 E226
tests/test_spidermiddleware_referer.py E501 F841 E125 E201 E124 E501 E241 E121
tests/test_squeues.py E501 E701 E741
tests/test_utils_conf.py E501 E303 E128

View File

@ -113,8 +113,8 @@ class Response(object_ref):
It accepts the same arguments as ``Request.__init__`` method,
but ``url`` can be a relative URL or a ``scrapy.link.Link`` object,
not only an absolute URL.
:class:`~.TextResponse` provides a :meth:`~.TextResponse.follow`
:class:`~.TextResponse` provides a :meth:`~.TextResponse.follow`
method which supports selectors in addition to absolute/relative URLs
and Link objects.
"""

View File

@ -125,7 +125,7 @@ class TextResponse(Response):
Return a :class:`~.Request` instance to follow a link ``url``.
It accepts the same arguments as ``Request.__init__`` method,
but ``url`` can be not only an absolute URL, but also
* a relative URL;
* a scrapy.link.Link object (e.g. a link extractor result);
* an attribute Selector (not SelectorList) - e.g.
@ -133,7 +133,7 @@ class TextResponse(Response):
``response.xpath('//img/@src')[0]``.
* a Selector for ``<a>`` or ``<link>`` element, e.g.
``response.css('a.my_link')[0]``.
See :ref:`response-follow-example` for usage examples.
"""
if isinstance(url, parsel.Selector):

View File

@ -13,7 +13,7 @@ ERRORMSG = u"'Error processing %(item)s'"
class LogFormatter(object):
"""Class for generating log messages for different actions.
All methods must return a dictionary listing the parameters ``level``, ``msg``
and ``args`` which are going to be used for constructing the log message when
calling ``logging.log``.
@ -48,7 +48,7 @@ class LogFormatter(object):
}
}
"""
def crawled(self, request, response, spider):
"""Logs a message when the crawler finds a webpage."""
request_flags = ' %s' % str(request.flags) if request.flags else ''

View File

@ -11,4 +11,4 @@ from w3lib.html import * # noqa: F401
warnings.warn("Module `scrapy.utils.markup` is deprecated. "
"Please import from `w3lib.html` instead.",
ScrapyDeprecationWarning, stacklevel=2)
ScrapyDeprecationWarning, stacklevel=2)

View File

@ -12,4 +12,4 @@ from w3lib.form import * # noqa: F401
warnings.warn("Module `scrapy.utils.multipart` is deprecated. "
"If you're using `encode_multipart` function, please use "
"`urllib3.filepost.encode_multipart_formdata` instead",
ScrapyDeprecationWarning, stacklevel=2)
ScrapyDeprecationWarning, stacklevel=2)

View File

@ -252,7 +252,7 @@ class ContractsManagerTest(unittest.TestCase):
self.assertEqual(len(contracts), 3)
self.assertEqual(frozenset(type(x) for x in contracts),
frozenset([UrlContract, CallbackKeywordArgumentsContract, ReturnsContract]))
contracts = self.conman.extract_contracts(spider.returns_item_cb_kwargs)
self.assertEqual(len(contracts), 3)
self.assertEqual(frozenset(type(x) for x in contracts),

View File

@ -30,25 +30,44 @@ class CrawlTestCase(TestCase):
self.assertEqual(len(crawler.spider.urls_visited), 11) # 10 + start_url
@defer.inlineCallbacks
def test_delay(self):
# short to long delays
yield self._test_delay(0.2, False)
yield self._test_delay(1, False)
# randoms
yield self._test_delay(0.2, True)
yield self._test_delay(1, True)
def test_fixed_delay(self):
yield self._test_delay(total=3, delay=0.1)
@defer.inlineCallbacks
def _test_delay(self, delay, randomize):
settings = {"DOWNLOAD_DELAY": delay, 'RANDOMIZE_DOWNLOAD_DELAY': randomize}
def test_randomized_delay(self):
yield self._test_delay(total=3, delay=0.1, randomize=True)
@defer.inlineCallbacks
def _test_delay(self, total, delay, randomize=False):
crawl_kwargs = dict(
maxlatency=delay * 2,
mockserver=self.mockserver,
total=total,
)
tolerance = (1 - (0.6 if randomize else 0.2))
settings = {"DOWNLOAD_DELAY": delay,
'RANDOMIZE_DOWNLOAD_DELAY': randomize}
crawler = CrawlerRunner(settings).create_crawler(FollowAllSpider)
yield crawler.crawl(maxlatency=delay * 2, mockserver=self.mockserver)
t = crawler.spider.times
totaltime = t[-1] - t[0]
avgd = totaltime / (len(t) - 1)
tolerance = 0.6 if randomize else 0.2
self.assertTrue(avgd > delay * (1 - tolerance),
"download delay too small: %s" % avgd)
yield crawler.crawl(**crawl_kwargs)
times = crawler.spider.times
total_time = times[-1] - times[0]
average = total_time / (len(times) - 1)
self.assertTrue(average > delay * tolerance,
"download delay too small: %s" % average)
# Ensure that the same test parameters would cause a failure if no
# download delay is set. Otherwise, it means we are using a combination
# of ``total`` and ``delay`` values that are too small for the test
# code above to have any meaning.
settings["DOWNLOAD_DELAY"] = 0
crawler = CrawlerRunner(settings).create_crawler(FollowAllSpider)
yield crawler.crawl(**crawl_kwargs)
times = crawler.spider.times
total_time = times[-1] - times[0]
average = total_time / (len(times) - 1)
self.assertFalse(average > delay / tolerance,
"test total or delay values are too small")
@defer.inlineCallbacks
def test_timeout_success(self):

View File

@ -124,7 +124,7 @@ class MaxRetryTimesTest(unittest.TestCase):
# SETTINGS: meta(max_retry_times) = 0
meta_max_retry_times = 0
req = Request(self.invalid_url, meta={'max_retry_times': meta_max_retry_times})
self._test_retry(req, DNSLookupError('foo'), meta_max_retry_times)
@ -137,7 +137,7 @@ class MaxRetryTimesTest(unittest.TestCase):
self._test_retry(req, DNSLookupError('foo'), self.mw.max_retry_times)
def test_with_metakey_greater(self):
# SETTINGS: RETRY_TIMES < meta(max_retry_times)
self.mw.max_retry_times = 2
meta_max_retry_times = 3
@ -149,7 +149,7 @@ class MaxRetryTimesTest(unittest.TestCase):
self._test_retry(req2, DNSLookupError('foo'), self.mw.max_retry_times)
def test_with_metakey_lesser(self):
# SETTINGS: RETRY_TIMES > meta(max_retry_times)
self.mw.max_retry_times = 5
meta_max_retry_times = 4
@ -172,7 +172,7 @@ class MaxRetryTimesTest(unittest.TestCase):
self._test_retry(req, DNSLookupError('foo'), 0)
def _test_retry(self, req, exception, max_retry_times):
for i in range(0, max_retry_times):
req = self.mw.process_exception(req, exception, self.spider)
assert isinstance(req, Request)

View File

@ -142,12 +142,12 @@ class RFPDupeFilterTest(unittest.TestCase):
r1 = Request('http://scrapytest.org/index.html')
r2 = Request('http://scrapytest.org/index.html')
dupefilter.log(r1, spider)
dupefilter.log(r2, spider)
assert crawler.stats.get_value('dupefilter/filtered') == 2
l.check_present(('scrapy.dupefilters', 'DEBUG',
l.check_present(('scrapy.dupefilters', 'DEBUG',
('Filtered duplicate request: <GET http://scrapytest.org/index.html>'
' - no more duplicates will be shown'
' (see DUPEFILTER_DEBUG to show all duplicates)')))
@ -169,7 +169,7 @@ class RFPDupeFilterTest(unittest.TestCase):
r2 = Request('http://scrapytest.org/index.html',
headers={'Referer': 'http://scrapytest.org/INDEX.html'}
)
dupefilter.log(r1, spider)
dupefilter.log(r2, spider)

View File

@ -58,7 +58,6 @@ class FilesPipelineTestCase(unittest.TestCase):
self.assertEqual(file_path(Request("data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAAR0AAACxCAMAAADOHZloAAACClBMVEX/\
//+F0tzCwMK76ZKQ21AMqr7oAAC96JvD5aWM2kvZ78J0N7fmAAC46Y4Ap7y")),
'full/178059cbeba2e34120a67f2dc1afc3ecc09b61cb.png')
def test_fs_store(self):
assert isinstance(self.pipeline.store, FSFilesStore)

View File

@ -44,7 +44,7 @@ class BaseRobotParserTest:
def test_allowed_wildcards(self):
robotstxt_robotstxt_body = """User-agent: first
Disallow: /disallowed/*/end$
Disallow: /disallowed/*/end$
User-agent: second
Allow: /*allowed

View File

@ -73,7 +73,7 @@ class TestOffsiteMiddleware4(TestOffsiteMiddleware3):
class TestOffsiteMiddleware5(TestOffsiteMiddleware4):
def test_get_host_regex(self):
self.spider.allowed_domains = ['http://scrapytest.org', 'scrapy.org', 'scrapy.test.org']
with warnings.catch_warnings(record=True) as w:

View File

@ -156,7 +156,7 @@ class GeneratorFailMiddleware:
r['processed'].append('{}.process_spider_output'.format(self.__class__.__name__))
yield r
raise LookupError()
def process_spider_exception(self, response, exception, spider):
method = '{}.process_spider_exception'.format(self.__class__.__name__)
spider.logger.info('%s: %s caught', method, exception.__class__.__name__)
@ -264,7 +264,7 @@ class TestSpiderMiddleware(TestCase):
@classmethod
def tearDownClass(cls):
cls.mockserver.__exit__(None, None, None)
@defer.inlineCallbacks
def crawl_log(self, spider):
crawler = get_crawler(spider)
@ -308,7 +308,7 @@ class TestSpiderMiddleware(TestCase):
self.assertIn("{'from': 'errback'}", str(log1))
self.assertNotIn("{'from': 'callback'}", str(log1))
self.assertIn("'item_scraped_count': 1", str(log1))
@defer.inlineCallbacks
def test_generator_callback(self):
"""
@ -319,7 +319,7 @@ class TestSpiderMiddleware(TestCase):
log2 = yield self.crawl_log(GeneratorCallbackSpider)
self.assertIn("Middleware: ImportError exception caught", str(log2))
self.assertIn("'item_scraped_count': 2", str(log2))
@defer.inlineCallbacks
def test_not_a_generator_callback(self):
"""