From 3b6f7ac9f2f5b48b9f2f3ce106d1205599d2164f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adri=C3=A1n=20Chaves?= Date: Tue, 22 Oct 2019 19:43:02 +0200 Subject: [PATCH 001/181] Use pylint --- .travis.yml | 2 + docs/utils/linkfix.py | 85 ++++++++++++++++++++++------------------- pylintrc | 88 +++++++++++++++++++++++++++++++++++++++++++ tox.ini | 14 +++++++ 4 files changed, 150 insertions(+), 39 deletions(-) create mode 100644 pylintrc diff --git a/.travis.yml b/.travis.yml index 0190a7f4d..28a19f4f0 100644 --- a/.travis.yml +++ b/.travis.yml @@ -7,6 +7,8 @@ branches: - /^\d\.\d+\.\d+(rc\d+|\.dev\d+)?$/ matrix: include: + - env: TOXENV=pylint + python: 3.7 - env: TOXENV=py27 python: 2.7 - env: TOXENV=py27-pinned diff --git a/docs/utils/linkfix.py b/docs/utils/linkfix.py index 6290adbe2..9acfc3b23 100755 --- a/docs/utils/linkfix.py +++ b/docs/utils/linkfix.py @@ -14,50 +14,57 @@ Author: dufferzafar import re -# Used for remembering the file (and its contents) -# so we don't have to open the same file again. -_filename = None -_contents = None -# A regex that matches standard linkcheck output lines -line_re = re.compile(u'(.*)\:\d+\:\s\[(.*)\]\s(?:(.*)\sto\s(.*)|(.*))') +def main(): -# Read lines from the linkcheck output file -try: - with open("build/linkcheck/output.txt") as out: - output_lines = out.readlines() -except IOError: - print("linkcheck output not found; please run linkcheck first.") - exit(1) + # Used for remembering the file (and its contents) + # so we don't have to open the same file again. + _filename = None + _contents = None -# For every line, fix the respective file -for line in output_lines: - match = re.match(line_re, line) + # A regex that matches standard linkcheck output lines + line_re = re.compile(u'(.*)\:\d+\:\s\[(.*)\]\s(?:(.*)\sto\s(.*)|(.*))') - if match: - newfilename = match.group(1) - errortype = match.group(2) + # Read lines from the linkcheck output file + try: + with open("build/linkcheck/output.txt") as out: + output_lines = out.readlines() + except IOError: + print("linkcheck output not found; please run linkcheck first.") + exit(1) - # Broken links can't be fixed and - # I am not sure what do with the local ones. - if errortype.lower() in ["broken", "local"]: - print("Not Fixed: " + line) + # For every line, fix the respective file + for line in output_lines: + match = re.match(line_re, line) + + if match: + newfilename = match.group(1) + errortype = match.group(2) + + # Broken links can't be fixed and + # I am not sure what do with the local ones. + if errortype.lower() in ["broken", "local"]: + print("Not Fixed: " + line) + else: + # If this is a new file + if newfilename != _filename: + + # Update the previous file + if _filename: + with open(_filename, "w") as _file: + _file.write(_contents) + + _filename = newfilename + + # Read the new file to memory + with open(_filename) as _file: + _contents = _file.read() + + _contents = _contents.replace(match.group(3), match.group(4)) else: - # If this is a new file - if newfilename != _filename: + # We don't understand what the current line means! + print("Not Understood: " + line) - # Update the previous file - if _filename: - with open(_filename, "w") as _file: - _file.write(_contents) - _filename = newfilename - - # Read the new file to memory - with open(_filename) as _file: - _contents = _file.read() - - _contents = _contents.replace(match.group(3), match.group(4)) - else: - # We don't understand what the current line means! 
- print("Not Understood: " + line) +if __name__ == '__main__': + main() diff --git a/pylintrc b/pylintrc new file mode 100644 index 000000000..b83bc9f82 --- /dev/null +++ b/pylintrc @@ -0,0 +1,88 @@ +[MASTER] +persistent=no +jobs=1 # >1 hides results + +[MESSAGES CONTROL] +disable=abstract-method, + anomalous-backslash-in-string, + arguments-differ, + attribute-defined-outside-init, + bad-classmethod-argument, + bad-continuation, + bad-indentation, + bad-mcs-classmethod-argument, + bad-whitespace, + broad-except, + c-extension-no-member, + catching-non-exception, + cell-var-from-loop, + comparison-with-callable, + consider-using-in, + cyclic-import, + dangerous-default-value, + deprecated-method, + deprecated-module, + duplicate-code, # https://github.com/PyCQA/pylint/issues/214 + eval-used, + expression-not-assigned, + fixme, + function-redefined, + global-statement, + import-error, + import-outside-toplevel, + inconsistent-return-statements, + inherit-non-class, + invalid-name, + keyword-arg-before-vararg, + line-too-long, + logging-format-interpolation, + logging-not-lazy, + lost-exception, + method-hidden, + missing-docstring, + missing-final-newline, + multiple-imports, + multiple-statements, + no-else-continue, + no-else-raise, + no-else-return, + no-init, + no-member, + no-method-argument, + no-name-in-module, + no-self-argument, + no-self-use, + pointless-string-statement, + protected-access, + redefined-argument-from-local, + redefined-builtin, + redefined-outer-name, + reimported, + signature-differs, + super-init-not-called, + superfluous-parens, + too-few-public-methods, + too-many-ancestors, + too-many-arguments, + too-many-branches, + too-many-function-args, + too-many-instance-attributes, + too-many-locals, + too-many-return-statements, + trailing-newlines, + trailing-whitespace, + unexpected-special-method-signature, + ungrouped-imports, + unidiomatic-typecheck, + unnecessary-comprehension, + unnecessary-pass, + unsubscriptable-object, + unused-argument, + unused-import, + unused-variable, + unused-wildcard-import, + used-before-assignment, + useless-object-inheritance, # Required for Python 2 support + wildcard-import, + wrong-import-order, + wrong-import-position diff --git a/tox.ini b/tox.ini index ffe7360d3..e7d366fe9 100644 --- a/tox.ini +++ b/tox.ini @@ -98,6 +98,20 @@ deps = {[testenv:py35]deps} commands = py.test {posargs:scrapy tests} +[testenv:pylint] +basepython = python3.7 +deps = + {[testenv:py35]deps} + # Optional dependencies + boto + reppy + robotexclusionrulesparser + # Test dependencies + pylint + +commands = + pylint scrapy + [docs] changedir = docs deps = From 02577f55a0586bc3e6c13a4a3ea572c7eefc82b0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adri=C3=A1n=20Chaves?= Date: Thu, 24 Oct 2019 13:25:11 +0200 Subject: [PATCH 002/181] Have PyLint cover all Python files in the repository --- pylintrc | 19 +++++++++++++++++++ tox.ini | 2 +- 2 files changed, 20 insertions(+), 1 deletion(-) diff --git a/pylintrc b/pylintrc index b83bc9f82..ca3ea1c57 100644 --- a/pylintrc +++ b/pylintrc @@ -11,13 +11,18 @@ disable=abstract-method, bad-continuation, bad-indentation, bad-mcs-classmethod-argument, + bad-super-call, bad-whitespace, + blacklisted-name, broad-except, c-extension-no-member, catching-non-exception, cell-var-from-loop, comparison-with-callable, + consider-iterating-dictionary, consider-using-in, + consider-using-set-comprehension, + consider-using-sys-exit, cyclic-import, dangerous-default-value, deprecated-method, @@ -30,6 +35,7 @@ disable=abstract-method, 
global-statement, import-error, import-outside-toplevel, + import-self, inconsistent-return-statements, inherit-non-class, invalid-name, @@ -39,6 +45,7 @@ disable=abstract-method, logging-not-lazy, lost-exception, method-hidden, + misplaced-comparison-constant, missing-docstring, missing-final-newline, multiple-imports, @@ -52,6 +59,9 @@ disable=abstract-method, no-name-in-module, no-self-argument, no-self-use, + no-value-for-parameter, + not-callable, + pointless-statement, pointless-string-statement, protected-access, redefined-argument-from-local, @@ -59,6 +69,7 @@ disable=abstract-method, redefined-outer-name, reimported, signature-differs, + singleton-comparison, super-init-not-called, superfluous-parens, too-few-public-methods, @@ -67,15 +78,21 @@ disable=abstract-method, too-many-branches, too-many-function-args, too-many-instance-attributes, + too-many-lines, too-many-locals, + too-many-public-methods, too-many-return-statements, trailing-newlines, trailing-whitespace, + unbalanced-tuple-unpacking, + undefined-variable, unexpected-special-method-signature, ungrouped-imports, unidiomatic-typecheck, unnecessary-comprehension, + unnecessary-lambda, unnecessary-pass, + unreachable, unsubscriptable-object, unused-argument, unused-import, @@ -83,6 +100,8 @@ disable=abstract-method, unused-wildcard-import, used-before-assignment, useless-object-inheritance, # Required for Python 2 support + useless-return, + useless-super-delegation, wildcard-import, wrong-import-order, wrong-import-position diff --git a/tox.ini b/tox.ini index e7d366fe9..428571ef2 100644 --- a/tox.ini +++ b/tox.ini @@ -110,7 +110,7 @@ deps = pylint commands = - pylint scrapy + pylint conftest.py docs extras scrapy setup.py tests [docs] changedir = docs From c7f9b955bdf2405fce58907b0395abce2400a66d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adri=C3=A1n=20Chaves?= Date: Thu, 19 Dec 2019 12:44:52 +0100 Subject: [PATCH 003/181] Pylint: ignore not-an-iterable --- pylintrc | 1 + 1 file changed, 1 insertion(+) diff --git a/pylintrc b/pylintrc index ca3ea1c57..c52a4c2d0 100644 --- a/pylintrc +++ b/pylintrc @@ -60,6 +60,7 @@ disable=abstract-method, no-self-argument, no-self-use, no-value-for-parameter, + not-an-iterable, not-callable, pointless-statement, pointless-string-statement, From 80925ab845b7f55be97d9bb91015ceee90efc333 Mon Sep 17 00:00:00 2001 From: Eugenio Lacuesta Date: Mon, 5 Aug 2019 11:39:07 -0300 Subject: [PATCH 004/181] Get server IP address for HTTP/1.1 responses --- docs/topics/request-response.rst | 12 +++++++++- scrapy/core/downloader/__init__.py | 2 +- scrapy/core/downloader/handlers/http11.py | 18 ++++++++++----- scrapy/http/response/__init__.py | 5 +++-- tests/test_crawl.py | 27 +++++++++++++++++++++++ 5 files changed, 54 insertions(+), 10 deletions(-) diff --git a/docs/topics/request-response.rst b/docs/topics/request-response.rst index 8997a7f19..a4cc1a7d7 100644 --- a/docs/topics/request-response.rst +++ b/docs/topics/request-response.rst @@ -34,7 +34,7 @@ Request objects :type url: string :param callback: the function that will be called with the response of this - request (once its downloaded) as its first parameter. For more information + request (once it's downloaded) as its first parameter. For more information see :ref:`topics-request-response-ref-request-callback-arguments` below. If a Request doesn't specify a callback, the spider's :meth:`~scrapy.spiders.Spider.parse` method will be used. @@ -611,6 +611,12 @@ Response objects This represents the :class:`Request` that generated this response. 
:type request: :class:`Request` object + :param ip_address: The IP address of the server from which the Response originated. + :type ip_address: :class:`ipaddress.IPv4Address` object + + .. FIXME: Add ipaddress.IPv6Address once it's supported + + .. attribute:: Response.url A string containing the URL of the response. @@ -679,6 +685,10 @@ Response objects they're shown on the string representation of the Response (`__str__` method) which is used by the engine for logging. + .. attribute:: Response.ip_address + + The IP address of the server from which the Response originated. + .. method:: Response.copy() Returns a new Response which is a copy of this Response. diff --git a/scrapy/core/downloader/__init__.py b/scrapy/core/downloader/__init__.py index 157dc3418..11c9dd908 100644 --- a/scrapy/core/downloader/__init__.py +++ b/scrapy/core/downloader/__init__.py @@ -172,7 +172,7 @@ class Downloader(object): return response dfd.addCallback(_downloaded) - # 3. After response arrives, remove the request from transferring + # 3. After response arrives, remove the request from transferring # state to free up the transferring slot so it can be used by the # following requests (perhaps those which came from the downloader # middleware itself) diff --git a/scrapy/core/downloader/handlers/http11.py b/scrapy/core/downloader/handlers/http11.py index 5a5f6cf0a..b690f439f 100644 --- a/scrapy/core/downloader/handlers/http11.py +++ b/scrapy/core/downloader/handlers/http11.py @@ -4,6 +4,7 @@ import logging import re import warnings from io import BytesIO +from ipaddress import ip_address from time import time from urllib.parse import urldefrag @@ -382,7 +383,7 @@ class ScrapyAgent(object): def _cb_bodyready(self, txresponse, request): # deliverBody hangs for responses without body if txresponse.length == 0: - return txresponse, b'', None + return txresponse, b'', None, None maxsize = request.meta.get('download_maxsize', self._maxsize) warnsize = request.meta.get('download_warnsize', self._warnsize) @@ -418,11 +419,11 @@ class ScrapyAgent(object): return d def _cb_bodydone(self, result, request, url): - txresponse, body, flags = result + txresponse, body, flags, ip_address = result status = int(txresponse.code) headers = Headers(txresponse.headers.getAllRawHeaders()) respcls = responsetypes.from_args(headers=headers, url=url, body=body) - return respcls(url=url, status=status, headers=headers, body=body, flags=flags) + return respcls(url=url, status=status, headers=headers, body=body, flags=flags, ip_address=ip_address) @implementer(IBodyProducer) @@ -456,6 +457,11 @@ class _ResponseReader(protocol.Protocol): self._fail_on_dataloss_warned = False self._reached_warnsize = False self._bytes_received = 0 + self._ip_address = None + + def connectionMade(self): + if self._ip_address is None: + self._ip_address = ip_address(self.transport._producer.getPeer().host) def dataReceived(self, bodyBytes): # This maybe called several times after cancel was called with buffered data. 
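Editor's sketch (not part of the patch series): the ipaddress.ip_address() helper used in connectionMade() above picks the address class from the string it receives, which is why Response.ip_address can end up holding either an IPv4 or an IPv6 address.

    from ipaddress import ip_address

    # ip_address() returns an IPv4Address or an IPv6Address depending on the input.
    assert type(ip_address("127.0.0.1")).__name__ == "IPv4Address"
    assert type(ip_address("::1")).__name__ == "IPv6Address"
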
@@ -488,16 +494,16 @@ class _ResponseReader(protocol.Protocol): body = self._bodybuf.getvalue() if reason.check(ResponseDone): - self._finished.callback((self._txresponse, body, None)) + self._finished.callback((self._txresponse, body, None, self._ip_address)) return if reason.check(PotentialDataLoss): - self._finished.callback((self._txresponse, body, ['partial'])) + self._finished.callback((self._txresponse, body, ['partial'], self._ip_address)) return if reason.check(ResponseFailed) and any(r.check(_DataLoss) for r in reason.value.reasons): if not self._fail_on_dataloss: - self._finished.callback((self._txresponse, body, ['dataloss'])) + self._finished.callback((self._txresponse, body, ['dataloss'], self._ip_address)) return elif not self._fail_on_dataloss_warned: diff --git a/scrapy/http/response/__init__.py b/scrapy/http/response/__init__.py index f92d0901c..ca5ecc02c 100644 --- a/scrapy/http/response/__init__.py +++ b/scrapy/http/response/__init__.py @@ -17,13 +17,14 @@ from scrapy.utils.trackref import object_ref class Response(object_ref): - def __init__(self, url, status=200, headers=None, body=b'', flags=None, request=None): + def __init__(self, url, status=200, headers=None, body=b'', flags=None, request=None, ip_address=None): self.headers = Headers(headers or {}) self.status = int(status) self._set_body(body) self._set_url(url) self.request = request self.flags = [] if flags is None else list(flags) + self.ip_address = ip_address @property def meta(self): @@ -76,7 +77,7 @@ class Response(object_ref): """Create a new Response with the same attributes except for those given new values. """ - for x in ['url', 'status', 'headers', 'body', 'request', 'flags']: + for x in ['url', 'status', 'headers', 'body', 'request', 'flags', 'ip_address']: kwargs.setdefault(x, getattr(self, x)) cls = kwargs.pop('cls', self.__class__) return cls(*args, **kwargs) diff --git a/tests/test_crawl.py b/tests/test_crawl.py index f433fcea6..6281160ae 100644 --- a/tests/test_crawl.py +++ b/tests/test_crawl.py @@ -1,5 +1,7 @@ import json import logging +from ipaddress import IPv4Address +from urllib.parse import urlparse from testfixtures import LogCapture from twisted.internet import defer @@ -308,3 +310,28 @@ with multiples lines self.assertIn("[callback] status 201", str(log)) self.assertIn("[errback] status 404", str(log)) self.assertIn("[errback] status 500", str(log)) + + @defer.inlineCallbacks + def test_dns_server_ip_address(self): + from socket import gethostbyname + + crawler = self.runner.create_crawler(SingleRequestSpider) + url = 'https://example.org' + yield crawler.crawl(seed=url) + ip_address = crawler.spider.meta['responses'][0].ip_address + self.assertIsInstance(ip_address, IPv4Address) + self.assertEqual(str(ip_address), gethostbyname(urlparse(url).netloc)) + + crawler = self.runner.create_crawler(SingleRequestSpider) + url = self.mockserver.url('/status?n=200') + yield crawler.crawl(seed=url, mockserver=self.mockserver) + ip_address = crawler.spider.meta['responses'][0].ip_address + self.assertIsNone(ip_address) + + crawler = self.runner.create_crawler(SingleRequestSpider) + url = self.mockserver.url('/echo?body=test') + expected_netloc, _ = urlparse(url).netloc.split(':') + yield crawler.crawl(seed=url, mockserver=self.mockserver) + ip_address = crawler.spider.meta['responses'][0].ip_address + self.assertIsInstance(ip_address, IPv4Address) + self.assertEqual(str(ip_address), gethostbyname(expected_netloc)) From e8da7e296691d2b4eb63e2a442bb600e03e5766f Mon Sep 17 00:00:00 2001 From: 
Eugenio Lacuesta Date: Sun, 26 Jan 2020 17:53:39 -0300 Subject: [PATCH 005/181] Test DNS resolution using CrawlerProcess --- tests/CrawlerProcess/ip_address.py | 51 ++++++++++++++++++++++++++++++ tests/test_crawl.py | 10 +----- tests/test_crawler.py | 8 +++++ 3 files changed, 60 insertions(+), 9 deletions(-) create mode 100644 tests/CrawlerProcess/ip_address.py diff --git a/tests/CrawlerProcess/ip_address.py b/tests/CrawlerProcess/ip_address.py new file mode 100644 index 000000000..6b069cc90 --- /dev/null +++ b/tests/CrawlerProcess/ip_address.py @@ -0,0 +1,51 @@ +from urllib.parse import urlparse + +from twisted.internet import defer +from twisted.internet.base import ThreadedResolver +from twisted.internet.interfaces import IResolverSimple +from zope.interface.declarations import implementer + +from scrapy import Spider, Request +from scrapy.crawler import CrawlerProcess + +from tests.mockserver import MockServer + + +@implementer(IResolverSimple) +class MockThreadedResolver(ThreadedResolver): + """ + Resolves all names to localhost + """ + + @classmethod + def from_crawler(cls, crawler, reactor): + return cls(reactor) + + def install_on_reactor(self,): + self.reactor.installResolver(self) + + def getHostByName(self, name, timeout=None): + return defer.succeed("127.0.0.1") + + +class LocalhostSpider(Spider): + name = "localhost_spider" + + def start_requests(self): + yield Request(self.url) + + def parse(self, response): + netloc = urlparse(response.url).netloc + self.logger.info("Host: %s" % netloc.split(":")[0]) + self.logger.info("Type: %s" % type(response.ip_address)) + self.logger.info("IP address: %s" % response.ip_address) + + +with MockServer() as mockserver: + settings = {"DNS_RESOLVER": __name__ + ".MockThreadedResolver"} + process = CrawlerProcess(settings) + + port = urlparse(mockserver.http_address).port + url = "http://not.a.real.domain:{port}/echo?body=test".format(port=port) + process.crawl(LocalhostSpider, url=url) + process.start() diff --git a/tests/test_crawl.py b/tests/test_crawl.py index 6281160ae..9896058dc 100644 --- a/tests/test_crawl.py +++ b/tests/test_crawl.py @@ -1,6 +1,7 @@ import json import logging from ipaddress import IPv4Address +from socket import gethostbyname from urllib.parse import urlparse from testfixtures import LogCapture @@ -313,15 +314,6 @@ with multiples lines @defer.inlineCallbacks def test_dns_server_ip_address(self): - from socket import gethostbyname - - crawler = self.runner.create_crawler(SingleRequestSpider) - url = 'https://example.org' - yield crawler.crawl(seed=url) - ip_address = crawler.spider.meta['responses'][0].ip_address - self.assertIsInstance(ip_address, IPv4Address) - self.assertEqual(str(ip_address), gethostbyname(urlparse(url).netloc)) - crawler = self.runner.create_crawler(SingleRequestSpider) url = self.mockserver.url('/status?n=200') yield crawler.crawl(seed=url, mockserver=self.mockserver) diff --git a/tests/test_crawler.py b/tests/test_crawler.py index 0ce0674de..dfc1cf448 100644 --- a/tests/test_crawler.py +++ b/tests/test_crawler.py @@ -107,6 +107,7 @@ class CrawlerLoggingTestCase(unittest.TestCase): def test_spider_custom_settings_log_level(self): log_file = self.mktemp() + class MySpider(scrapy.Spider): name = 'spider' custom_settings = { @@ -323,3 +324,10 @@ class CrawlerProcessSubprocess(unittest.TestCase): "'downloader/exception_type_count/twisted.internet.error.ConnectionRefusedError': 1," in log, "'downloader/exception_type_count/twisted.internet.error.ConnectError': 1," in log, ])) + + def 
test_response_ip_address(self): + log = self.run_script("ip_address.py") + self.assertIn("Spider closed (finished)", log) + self.assertIn("Host: not.a.real.domain", log) + self.assertIn("Type: ", log) + self.assertIn("IP address: 127.0.0.1", log) From 8529dff41d3d2f6c81ee58c60b16dd9f2b8f72b4 Mon Sep 17 00:00:00 2001 From: Eugenio Lacuesta Date: Sun, 26 Jan 2020 18:00:56 -0300 Subject: [PATCH 006/181] Update docs regarding Response.ip_address and IPv6 --- docs/topics/request-response.rst | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/docs/topics/request-response.rst b/docs/topics/request-response.rst index a4cc1a7d7..17eb63064 100644 --- a/docs/topics/request-response.rst +++ b/docs/topics/request-response.rst @@ -612,10 +612,7 @@ Response objects :type request: :class:`Request` object :param ip_address: The IP address of the server from which the Response originated. - :type ip_address: :class:`ipaddress.IPv4Address` object - - .. FIXME: Add ipaddress.IPv6Address once it's supported - + :type ip_address: :class:`ipaddress.IPv4Address` or :class:`ipaddress.IPv6Address` .. attribute:: Response.url From 72b8613ee9827af031862bd84f1bea9acefcbebe Mon Sep 17 00:00:00 2001 From: Eugenio Lacuesta Date: Wed, 27 Nov 2019 14:46:20 -0300 Subject: [PATCH 007/181] bytes_received signal (no tests) --- docs/topics/signals.rst | 45 ++++++++++++++++------- scrapy/core/downloader/handlers/http11.py | 25 +++++++++++-- scrapy/signals.py | 1 + 3 files changed, 54 insertions(+), 17 deletions(-) diff --git a/docs/topics/signals.rst b/docs/topics/signals.rst index 3f29aa323..6efb73abb 100644 --- a/docs/topics/signals.rst +++ b/docs/topics/signals.rst @@ -73,7 +73,7 @@ engine_started Sent when the Scrapy engine has started crawling. - This signal supports returning deferreds from their handlers. + This signal supports returning deferreds from its handlers. .. note:: This signal may be fired *after* the :signal:`spider_opened` signal, depending on how the spider was started. So **don't** rely on this signal @@ -88,7 +88,7 @@ engine_stopped Sent when the Scrapy engine is stopped (for example, when a crawling process has finished). - This signal supports returning deferreds from their handlers. + This signal supports returning deferreds from its handlers. item_scraped ------------ @@ -99,7 +99,7 @@ item_scraped Sent when an item has been scraped, after it has passed all the :ref:`topics-item-pipeline` stages (without being dropped). - This signal supports returning deferreds from their handlers. + This signal supports returning deferreds from its handlers. :param item: the item scraped :type item: dict or :class:`~scrapy.item.Item` object @@ -119,7 +119,7 @@ item_dropped Sent after an item has been dropped from the :ref:`topics-item-pipeline` when some stage raised a :exc:`~scrapy.exceptions.DropItem` exception. - This signal supports returning deferreds from their handlers. + This signal supports returning deferreds from its handlers. :param item: the item dropped from the :ref:`topics-item-pipeline` :type item: dict or :class:`~scrapy.item.Item` object @@ -144,7 +144,7 @@ item_error Sent when a :ref:`topics-item-pipeline` generates an error (ie. raises an exception), except :exc:`~scrapy.exceptions.DropItem` exception. - This signal supports returning deferreds from their handlers. + This signal supports returning deferreds from its handlers. 
:param item: the item dropped from the :ref:`topics-item-pipeline` :type item: dict or :class:`~scrapy.item.Item` object @@ -158,6 +158,23 @@ item_error :param failure: the exception raised :type failure: twisted.python.failure.Failure +bytes_received +-------------- + +.. signal:: bytes_received +.. function:: bytes_received(data, request) + + Sent by the HTTP 1.1 download handler when a group of bytes is + received for a specific request. + + This signal does not support returning deferreds from its handlers. + + :param data: the data received by the download handler + :type spider: :class:`bytes` object + + :param request: the request that generated the response + :type request: :class:`~scrapy.http.Request` object + spider_closed ------------- @@ -167,7 +184,7 @@ spider_closed Sent after a spider has been closed. This can be used to release per-spider resources reserved on :signal:`spider_opened`. - This signal supports returning deferreds from their handlers. + This signal supports returning deferreds from its handlers. :param spider: the spider which has been closed :type spider: :class:`~scrapy.spiders.Spider` object @@ -191,7 +208,7 @@ spider_opened reserve per-spider resources, but can be used for any task that needs to be performed when a spider is opened. - This signal supports returning deferreds from their handlers. + This signal supports returning deferreds from its handlers. :param spider: the spider which has been opened :type spider: :class:`~scrapy.spiders.Spider` object @@ -215,7 +232,7 @@ spider_idle You may raise a :exc:`~scrapy.exceptions.DontCloseSpider` exception to prevent the spider from being closed. - This signal does not support returning deferreds from their handlers. + This signal does not support returning deferreds from its handlers. :param spider: the spider which has gone idle :type spider: :class:`~scrapy.spiders.Spider` object @@ -234,7 +251,7 @@ spider_error Sent when a spider callback generates an error (ie. raises an exception). - This signal does not support returning deferreds from their handlers. + This signal does not support returning deferreds from its handlers. :param failure: the exception raised :type failure: twisted.python.failure.Failure @@ -254,7 +271,7 @@ request_scheduled Sent when the engine schedules a :class:`~scrapy.http.Request`, to be downloaded later. - The signal does not support returning deferreds from their handlers. + The signal does not support returning deferreds from its handlers. :param request: the request that reached the scheduler :type request: :class:`~scrapy.http.Request` object @@ -271,7 +288,7 @@ request_dropped Sent when a :class:`~scrapy.http.Request`, scheduled by the engine to be downloaded later, is rejected by the scheduler. - The signal does not support returning deferreds from their handlers. + The signal does not support returning deferreds from its handlers. :param request: the request that reached the scheduler :type request: :class:`~scrapy.http.Request` object @@ -287,7 +304,7 @@ request_reached_downloader Sent when a :class:`~scrapy.http.Request` reached downloader. - The signal does not support returning deferreds from their handlers. + The signal does not support returning deferreds from its handlers. :param request: the request that reached downloader :type request: :class:`~scrapy.http.Request` object @@ -304,7 +321,7 @@ response_received Sent when the engine receives a new :class:`~scrapy.http.Response` from the downloader. 
- This signal does not support returning deferreds from their handlers. + This signal does not support returning deferreds from its handlers. :param response: the response received :type response: :class:`~scrapy.http.Response` object @@ -323,7 +340,7 @@ response_downloaded Sent by the downloader right after a ``HTTPResponse`` is downloaded. - This signal does not support returning deferreds from their handlers. + This signal does not support returning deferreds from its handlers. :param response: the response downloaded :type response: :class:`~scrapy.http.Response` object diff --git a/scrapy/core/downloader/handlers/http11.py b/scrapy/core/downloader/handlers/http11.py index 5a5f6cf0a..92c3d5f5c 100644 --- a/scrapy/core/downloader/handlers/http11.py +++ b/scrapy/core/downloader/handlers/http11.py @@ -16,6 +16,7 @@ from twisted.web.http_headers import Headers as TxHeaders from twisted.web.iweb import IBodyProducer, UNKNOWN_LENGTH from zope.interface import implementer +from scrapy import signals from scrapy.core.downloader.tls import openssl_methods from scrapy.core.downloader.webclient import _parse from scrapy.exceptions import ScrapyDeprecationWarning @@ -32,6 +33,7 @@ class HTTP11DownloadHandler: lazy = False def __init__(self, settings, crawler=None): + self.crawler = crawler self._pool = HTTPConnectionPool(reactor, persistent=True) self._pool.maxPersistentPerHost = settings.getint('CONCURRENT_REQUESTS_PER_DOMAIN') self._pool._factory.noisy = False @@ -76,6 +78,7 @@ class HTTP11DownloadHandler: maxsize=getattr(spider, 'download_maxsize', self._default_maxsize), warnsize=getattr(spider, 'download_warnsize', self._default_warnsize), fail_on_dataloss=self._fail_on_dataloss, + crawler=self.crawler, ) return agent.download_request(request) @@ -272,7 +275,7 @@ class ScrapyAgent(object): _TunnelingAgent = TunnelingAgent def __init__(self, contextFactory=None, connectTimeout=10, bindAddress=None, pool=None, - maxsize=0, warnsize=0, fail_on_dataloss=True): + maxsize=0, warnsize=0, fail_on_dataloss=True, crawler=None): self._contextFactory = contextFactory self._connectTimeout = connectTimeout self._bindAddress = bindAddress @@ -281,6 +284,7 @@ class ScrapyAgent(object): self._warnsize = warnsize self._fail_on_dataloss = fail_on_dataloss self._txresponse = None + self._crawler = crawler def _get_agent(self, request, timeout): bindaddress = request.meta.get('bindaddress') or self._bindAddress @@ -409,7 +413,15 @@ class ScrapyAgent(object): d = defer.Deferred(_cancel) txresponse.deliverBody( - _ResponseReader(d, txresponse, request, maxsize, warnsize, fail_on_dataloss) + _ResponseReader( + d, + txresponse, + request, + maxsize, + warnsize, + fail_on_dataloss, + self._crawler, + ) ) # save response for timeouts @@ -445,7 +457,7 @@ class _RequestBodyProducer(object): class _ResponseReader(protocol.Protocol): - def __init__(self, finished, txresponse, request, maxsize, warnsize, fail_on_dataloss): + def __init__(self, finished, txresponse, request, maxsize, warnsize, fail_on_dataloss, crawler): self._finished = finished self._txresponse = txresponse self._request = request @@ -456,6 +468,7 @@ class _ResponseReader(protocol.Protocol): self._fail_on_dataloss_warned = False self._reached_warnsize = False self._bytes_received = 0 + self._crawler = crawler def dataReceived(self, bodyBytes): # This maybe called several times after cancel was called with buffered data. 
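Editor's note (not part of the patch series): send_catch_log() is what makes it safe to fire a signal from inside dataReceived() in the next hunk — receiver exceptions are caught and logged instead of bubbling up into the download. A minimal, self-contained sketch; the signal object and handler below are made up for illustration:

    from scrapy.signalmanager import SignalManager

    bytes_received = object()          # stand-in for scrapy.signals.bytes_received
    manager = SignalManager()          # a Crawler exposes one of these as crawler.signals

    def on_bytes_received(data):
        print("handler saw %d bytes" % len(data))

    manager.connect(on_bytes_received, signal=bytes_received)
    # Any exception raised by a handler is logged, not propagated to the caller.
    manager.send_catch_log(signal=bytes_received, data=b"partial chunk")
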
@@ -465,6 +478,12 @@ class _ResponseReader(protocol.Protocol): self._bodybuf.write(bodyBytes) self._bytes_received += len(bodyBytes) + self._crawler.signals.send_catch_log( + signal=signals.bytes_received, + data=bodyBytes, + request=self._request, + ) + if self._maxsize and self._bytes_received > self._maxsize: logger.error("Received (%(bytes)s) bytes larger than download " "max size (%(maxsize)s) in request %(request)s.", diff --git a/scrapy/signals.py b/scrapy/signals.py index 6b9125302..590421893 100644 --- a/scrapy/signals.py +++ b/scrapy/signals.py @@ -16,6 +16,7 @@ request_dropped = object() request_reached_downloader = object() response_received = object() response_downloaded = object() +bytes_received = object() item_scraped = object() item_dropped = object() item_error = object() From cab449b1952020b86fbe2915a537150fc885c567 Mon Sep 17 00:00:00 2001 From: Eugenio Lacuesta Date: Fri, 29 Nov 2019 11:37:40 -0300 Subject: [PATCH 008/181] Typo fix --- tests/test_engine.py | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/tests/test_engine.py b/tests/test_engine.py index 25dee7c1f..9d68836cc 100644 --- a/tests/test_engine.py +++ b/tests/test_engine.py @@ -107,7 +107,7 @@ class CrawlerRun(object): self.reqreached = [] self.itemerror = [] self.itemresp = [] - self.signals_catched = {} + self.signals_caught = {} self.spider_class = spider_class def run(self): @@ -172,7 +172,7 @@ class CrawlerRun(object): signalargs = kwargs.copy() sig = signalargs.pop('signal') signalargs.pop('sender', None) - self.signals_catched[sig] = signalargs + self.signals_caught[sig] = signalargs class EngineTest(unittest.TestCase): @@ -186,7 +186,7 @@ class EngineTest(unittest.TestCase): self._assert_scheduled_requests(urls_to_visit=8) self._assert_downloaded_responses() self._assert_scraped_items() - self._assert_signals_catched() + self._assert_signals_caught() @defer.inlineCallbacks def test_crawler_dupefilter(self): @@ -263,19 +263,19 @@ class EngineTest(unittest.TestCase): self.assertEqual('Item 2 name', item['name']) self.assertEqual('200', item['price']) - def _assert_signals_catched(self): - assert signals.engine_started in self.run.signals_catched - assert signals.engine_stopped in self.run.signals_catched - assert signals.spider_opened in self.run.signals_catched - assert signals.spider_idle in self.run.signals_catched - assert signals.spider_closed in self.run.signals_catched + def _assert_signals_caught(self): + assert signals.engine_started in self.run.signals_caught + assert signals.engine_stopped in self.run.signals_caught + assert signals.spider_opened in self.run.signals_caught + assert signals.spider_idle in self.run.signals_caught + assert signals.spider_closed in self.run.signals_caught self.assertEqual({'spider': self.run.spider}, - self.run.signals_catched[signals.spider_opened]) + self.run.signals_caught[signals.spider_opened]) self.assertEqual({'spider': self.run.spider}, - self.run.signals_catched[signals.spider_idle]) + self.run.signals_caught[signals.spider_idle]) self.assertEqual({'spider': self.run.spider, 'reason': 'finished'}, - self.run.signals_catched[signals.spider_closed]) + self.run.signals_caught[signals.spider_closed]) @defer.inlineCallbacks def test_close_downloader(self): From bda37e38bd53d5aae691b56d4136fbff99f78158 Mon Sep 17 00:00:00 2001 From: Eugenio Lacuesta Date: Fri, 29 Nov 2019 12:02:27 -0300 Subject: [PATCH 009/181] [Tests] bytes_received signal --- tests/test_engine.py | 53 +++++++++++++++++++++++++++++++++++++++----- 1 
file changed, 47 insertions(+), 6 deletions(-) diff --git a/tests/test_engine.py b/tests/test_engine.py index 9d68836cc..b63c7e232 100644 --- a/tests/test_engine.py +++ b/tests/test_engine.py @@ -13,22 +13,24 @@ module with the ``runserver`` argument:: import os import re import sys +from collections import defaultdict from urllib.parse import urlparse from twisted.internet import reactor, defer -from twisted.web import server, static, util from twisted.trial import unittest +from twisted.web import server, static, util +from pydispatch import dispatcher from scrapy import signals from scrapy.core.engine import ExecutionEngine -from scrapy.utils.test import get_crawler -from pydispatch import dispatcher -from tests import tests_datadir -from scrapy.spiders import Spider +from scrapy.http import Request from scrapy.item import Item, Field from scrapy.linkextractors import LinkExtractor -from scrapy.http import Request +from scrapy.spiders import Spider from scrapy.utils.signal import disconnect_all +from scrapy.utils.test import get_crawler + +from tests import tests_datadir, get_testdata class TestItem(Item): @@ -107,6 +109,7 @@ class CrawlerRun(object): self.reqreached = [] self.itemerror = [] self.itemresp = [] + self.bytes = defaultdict(lambda: b"") self.signals_caught = {} self.spider_class = spider_class @@ -124,6 +127,7 @@ class CrawlerRun(object): self.crawler = get_crawler(self.spider_class) self.crawler.signals.connect(self.item_scraped, signals.item_scraped) self.crawler.signals.connect(self.item_error, signals.item_error) + self.crawler.signals.connect(self.bytes_received, signals.bytes_received) self.crawler.signals.connect(self.request_scheduled, signals.request_scheduled) self.crawler.signals.connect(self.request_dropped, signals.request_dropped) self.crawler.signals.connect(self.request_reached, signals.request_reached_downloader) @@ -155,6 +159,9 @@ class CrawlerRun(object): def item_scraped(self, item, spider, response): self.itemresp.append((item, response)) + def bytes_received(self, data, request): + self.bytes[request] += data + def request_scheduled(self, request, spider): self.reqplug.append((request, spider)) @@ -187,6 +194,7 @@ class EngineTest(unittest.TestCase): self._assert_downloaded_responses() self._assert_scraped_items() self._assert_signals_caught() + self._assert_bytes_received() @defer.inlineCallbacks def test_crawler_dupefilter(self): @@ -263,6 +271,39 @@ class EngineTest(unittest.TestCase): self.assertEqual('Item 2 name', item['name']) self.assertEqual('200', item['price']) + def _assert_bytes_received(self): + self.assertEqual(8, len(self.run.bytes)) + for request, data in self.run.bytes.items(): + if self.run.getpath(request.url) == "/": + self.assertEqual(data, get_testdata("test_site", "index.html")) + elif self.run.getpath(request.url) == "/item1.html": + self.assertEqual(data, get_testdata("test_site", "item1.html")) + elif self.run.getpath(request.url) == "/item2.html": + self.assertEqual(data, get_testdata("test_site", "item2.html")) + elif self.run.getpath(request.url) == "/redirected": + self.assertEqual(data, b"Redirected here") + elif self.run.getpath(request.url) == '/redirect': + self.assertEqual(data, + b"\n\n" + b" \n" + b" \n" + b" \n" + b" \n" + b" click here\n" + b" \n" + b"\n" + ) + elif self.run.getpath(request.url) == "/tem999.html": + self.assertEqual(data, + b"\n\n" + b" 404 - No Such Resource\n" + b" \n" + b"

<h1>No Such Resource</h1>\n" +                             b"      <p>File not found.</p>
\n" + b" \n" + b"\n" + ) + def _assert_signals_caught(self): assert signals.engine_started in self.run.signals_caught assert signals.engine_stopped in self.run.signals_caught From 89483ce9f709e230ee5ff9050d206430d2d17c9b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adri=C3=A1n=20Chaves?= Date: Tue, 3 Dec 2019 12:06:08 +0100 Subject: [PATCH 010/181] Fix Flake8 issues --- tests/test_engine.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/tests/test_engine.py b/tests/test_engine.py index b63c7e232..c0769c992 100644 --- a/tests/test_engine.py +++ b/tests/test_engine.py @@ -283,7 +283,8 @@ class EngineTest(unittest.TestCase): elif self.run.getpath(request.url) == "/redirected": self.assertEqual(data, b"Redirected here") elif self.run.getpath(request.url) == '/redirect': - self.assertEqual(data, + self.assertEqual( + data, b"\n\n" b" \n" b" \n" @@ -294,7 +295,8 @@ class EngineTest(unittest.TestCase): b"\n" ) elif self.run.getpath(request.url) == "/tem999.html": - self.assertEqual(data, + self.assertEqual( + data, b"\n\n" b" 404 - No Such Resource\n" b" \n" From dbe20a863ff63dce937b2d3b159782d8268e6838 Mon Sep 17 00:00:00 2001 From: Eugenio Lacuesta Date: Mon, 27 Jan 2020 12:21:18 -0300 Subject: [PATCH 011/181] bytes_received signal: send spider argument --- docs/topics/signals.rst | 5 ++++- scrapy/core/downloader/handlers/http11.py | 1 + tests/test_engine.py | 2 +- 3 files changed, 6 insertions(+), 2 deletions(-) diff --git a/docs/topics/signals.rst b/docs/topics/signals.rst index 6efb73abb..3e70ca067 100644 --- a/docs/topics/signals.rst +++ b/docs/topics/signals.rst @@ -162,7 +162,7 @@ bytes_received -------------- .. signal:: bytes_received -.. function:: bytes_received(data, request) +.. function:: bytes_received(data, request, spider) Sent by the HTTP 1.1 download handler when a group of bytes is received for a specific request. 
@@ -175,6 +175,9 @@ bytes_received :param request: the request that generated the response :type request: :class:`~scrapy.http.Request` object + :param spider: the spider associated with the response + :type spider: :class:`~scrapy.spiders.Spider` object + spider_closed ------------- diff --git a/scrapy/core/downloader/handlers/http11.py b/scrapy/core/downloader/handlers/http11.py index 92c3d5f5c..c53c9bb2d 100644 --- a/scrapy/core/downloader/handlers/http11.py +++ b/scrapy/core/downloader/handlers/http11.py @@ -482,6 +482,7 @@ class _ResponseReader(protocol.Protocol): signal=signals.bytes_received, data=bodyBytes, request=self._request, + spider=self._crawler.spider, ) if self._maxsize and self._bytes_received > self._maxsize: diff --git a/tests/test_engine.py b/tests/test_engine.py index c0769c992..57cc89ba3 100644 --- a/tests/test_engine.py +++ b/tests/test_engine.py @@ -159,7 +159,7 @@ class CrawlerRun(object): def item_scraped(self, item, spider, response): self.itemresp.append((item, response)) - def bytes_received(self, data, request): + def bytes_received(self, data, request, spider): self.bytes[request] += data def request_scheduled(self, request, spider): From 613fd41f44d1455f9c9369087958674f3fdfcc8d Mon Sep 17 00:00:00 2001 From: Eugenio Lacuesta Date: Mon, 27 Jan 2020 12:30:26 -0300 Subject: [PATCH 012/181] bytes_received signal: improve test performance --- tests/test_engine.py | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/tests/test_engine.py b/tests/test_engine.py index 57cc89ba3..bb475958e 100644 --- a/tests/test_engine.py +++ b/tests/test_engine.py @@ -109,7 +109,7 @@ class CrawlerRun(object): self.reqreached = [] self.itemerror = [] self.itemresp = [] - self.bytes = defaultdict(lambda: b"") + self.bytes = defaultdict(lambda: list()) self.signals_caught = {} self.spider_class = spider_class @@ -160,7 +160,7 @@ class CrawlerRun(object): self.itemresp.append((item, response)) def bytes_received(self, data, request, spider): - self.bytes[request] += data + self.bytes[request].append(data) def request_scheduled(self, request, spider): self.reqplug.append((request, spider)) @@ -274,17 +274,18 @@ class EngineTest(unittest.TestCase): def _assert_bytes_received(self): self.assertEqual(8, len(self.run.bytes)) for request, data in self.run.bytes.items(): + joined_data = b"".join(data) if self.run.getpath(request.url) == "/": - self.assertEqual(data, get_testdata("test_site", "index.html")) + self.assertEqual(joined_data, get_testdata("test_site", "index.html")) elif self.run.getpath(request.url) == "/item1.html": - self.assertEqual(data, get_testdata("test_site", "item1.html")) + self.assertEqual(joined_data, get_testdata("test_site", "item1.html")) elif self.run.getpath(request.url) == "/item2.html": - self.assertEqual(data, get_testdata("test_site", "item2.html")) + self.assertEqual(joined_data, get_testdata("test_site", "item2.html")) elif self.run.getpath(request.url) == "/redirected": - self.assertEqual(data, b"Redirected here") + self.assertEqual(joined_data, b"Redirected here") elif self.run.getpath(request.url) == '/redirect': self.assertEqual( - data, + joined_data, b"\n\n" b" \n" b" \n" @@ -296,7 +297,7 @@ class EngineTest(unittest.TestCase): ) elif self.run.getpath(request.url) == "/tem999.html": self.assertEqual( - data, + joined_data, b"\n\n" b" 404 - No Such Resource\n" b" \n" From 4ffd18fb11ff89863569b8b4de44241e3ca2f86e Mon Sep 17 00:00:00 2001 From: Eugenio Lacuesta Date: Mon, 27 Jan 2020 13:29:33 -0300 Subject: [PATCH 013/181] 
[docs] Mention that signals.bytes_received could be fired multiple times --- docs/topics/signals.rst | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/docs/topics/signals.rst b/docs/topics/signals.rst index 3e70ca067..f490911f3 100644 --- a/docs/topics/signals.rst +++ b/docs/topics/signals.rst @@ -165,7 +165,8 @@ bytes_received .. function:: bytes_received(data, request, spider) Sent by the HTTP 1.1 download handler when a group of bytes is - received for a specific request. + received for a specific request. This signal might be fired + multiple times for the same request. This signal does not support returning deferreds from its handlers. From 2c9643d38cc076c4d2032efd994fda4cfcc9f88a Mon Sep 17 00:00:00 2001 From: Eugenio Lacuesta Date: Wed, 29 Jan 2020 14:11:56 -0300 Subject: [PATCH 014/181] Test: bytes_received signal fired multiple times --- tests/test_engine.py | 22 +++++++++++++++------- 1 file changed, 15 insertions(+), 7 deletions(-) diff --git a/tests/test_engine.py b/tests/test_engine.py index bb475958e..3c5cc403b 100644 --- a/tests/test_engine.py +++ b/tests/test_engine.py @@ -12,6 +12,7 @@ module with the ``runserver`` argument:: import os import re +import string import sys from collections import defaultdict from urllib.parse import urlparse @@ -90,6 +91,7 @@ def start_test_site(debug=False): r = static.File(root_dir) r.putChild(b"redirect", util.Redirect(b"/redirected")) r.putChild(b"redirected", static.Data(b"Redirected here", "text/plain")) + r.putChild(b"random", static.Data(string.ascii_letters.encode("utf8") * 2**14, "text/plain")) port = reactor.listenTCP(0, server.Site(r), interface="127.0.0.1") if debug: @@ -117,8 +119,12 @@ class CrawlerRun(object): self.port = start_test_site() self.portno = self.port.getHost().port - start_urls = [self.geturl("/"), self.geturl("/redirect"), - self.geturl("/redirect")] # a duplicate + start_urls = [ + self.geturl("/"), + self.geturl("/redirect"), + self.geturl("/redirect"), # duplicate + self.geturl("/random"), + ] for name, signal in vars(signals).items(): if not name.startswith('_'): @@ -190,7 +196,7 @@ class EngineTest(unittest.TestCase): self.run = CrawlerRun(spider) yield self.run.run() self._assert_visited_urls() - self._assert_scheduled_requests(urls_to_visit=8) + self._assert_scheduled_requests(urls_to_visit=9) self._assert_downloaded_responses() self._assert_scraped_items() self._assert_signals_caught() @@ -200,7 +206,7 @@ class EngineTest(unittest.TestCase): def test_crawler_dupefilter(self): self.run = CrawlerRun(TestDupeFilterSpider) yield self.run.run() - self._assert_scheduled_requests(urls_to_visit=7) + self._assert_scheduled_requests(urls_to_visit=8) self._assert_dropped_requests() @defer.inlineCallbacks @@ -237,8 +243,8 @@ class EngineTest(unittest.TestCase): def _assert_downloaded_responses(self): # response tests - self.assertEqual(8, len(self.run.respplug)) - self.assertEqual(8, len(self.run.reqreached)) + self.assertEqual(9, len(self.run.respplug)) + self.assertEqual(9, len(self.run.reqreached)) for response, _ in self.run.respplug: if self.run.getpath(response.url) == '/item999.html': @@ -272,7 +278,7 @@ class EngineTest(unittest.TestCase): self.assertEqual('200', item['price']) def _assert_bytes_received(self): - self.assertEqual(8, len(self.run.bytes)) + self.assertEqual(9, len(self.run.bytes)) for request, data in self.run.bytes.items(): joined_data = b"".join(data) if self.run.getpath(request.url) == "/": @@ -306,6 +312,8 @@ class EngineTest(unittest.TestCase): b" \n" b"\n" ) + elif 
self.run.getpath(request.url) == "/random": + self.assertTrue(len(data) > 1) # signal was fired multiple times def _assert_signals_caught(self): assert signals.engine_started in self.run.signals_caught From a499f38b14d16338d20084c0dcb24528a1f1f22f Mon Sep 17 00:00:00 2001 From: Eugenio Lacuesta Date: Wed, 29 Jan 2020 14:35:17 -0300 Subject: [PATCH 015/181] Remove object parent class --- scrapy/core/downloader/handlers/http11.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/scrapy/core/downloader/handlers/http11.py b/scrapy/core/downloader/handlers/http11.py index c53c9bb2d..6f1bd9ad6 100644 --- a/scrapy/core/downloader/handlers/http11.py +++ b/scrapy/core/downloader/handlers/http11.py @@ -268,7 +268,7 @@ class ScrapyProxyAgent(Agent): ) -class ScrapyAgent(object): +class ScrapyAgent: _Agent = Agent _ProxyAgent = ScrapyProxyAgent @@ -438,7 +438,7 @@ class ScrapyAgent(object): @implementer(IBodyProducer) -class _RequestBodyProducer(object): +class _RequestBodyProducer: def __init__(self, body): self.body = body From 6f02a8dccb95373f22bac18c08d9fda8169dcb02 Mon Sep 17 00:00:00 2001 From: Eugenio Lacuesta Date: Wed, 29 Jan 2020 14:53:23 -0300 Subject: [PATCH 016/181] Add source parameter to bytes_received signal --- docs/topics/signals.rst | 12 ++++++++---- scrapy/core/downloader/handlers/http11.py | 18 +++++++++++++----- scrapy/core/downloader/handlers/s3.py | 1 + tests/test_downloader_handlers.py | 3 +++ tests/test_engine.py | 5 ++++- 5 files changed, 29 insertions(+), 10 deletions(-) diff --git a/docs/topics/signals.rst b/docs/topics/signals.rst index f490911f3..3a15bf95c 100644 --- a/docs/topics/signals.rst +++ b/docs/topics/signals.rst @@ -162,11 +162,11 @@ bytes_received -------------- .. signal:: bytes_received -.. function:: bytes_received(data, request, spider) +.. function:: bytes_received(data, request, spider, source) - Sent by the HTTP 1.1 download handler when a group of bytes is - received for a specific request. This signal might be fired - multiple times for the same request. + Sent by the HTTP 1.1 and S3 download handlers when a group of bytes is + received for a specific request. This signal might be fired multiple + times for the same request, with partial data each time. This signal does not support returning deferreds from its handlers. 
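Editor's sketch (not part of the patch series): a handler that uses the new source argument to react only to chunks coming through one download handler; the function name is invented:

    def log_s3_chunks(data, request, spider, source):
        # "source" names the download handler that produced the chunk
        # ("http11" or "s3" at this point in the series).
        if source == "s3":
            spider.logger.debug("S3 chunk: %d bytes for %s", len(data), request.url)
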
@@ -179,6 +179,10 @@ bytes_received :param spider: the spider associated with the response :type spider: :class:`~scrapy.spiders.Spider` object + :param source: a string to identify which handler sent the signal + (current values could be "http11" or "s3") + :type source: :class:`str` object + spider_closed ------------- diff --git a/scrapy/core/downloader/handlers/http11.py b/scrapy/core/downloader/handlers/http11.py index 6f1bd9ad6..49c9eacac 100644 --- a/scrapy/core/downloader/handlers/http11.py +++ b/scrapy/core/downloader/handlers/http11.py @@ -32,8 +32,9 @@ logger = logging.getLogger(__name__) class HTTP11DownloadHandler: lazy = False - def __init__(self, settings, crawler=None): + def __init__(self, settings, crawler=None, source="http11"): self.crawler = crawler + self.source = source self._pool = HTTPConnectionPool(reactor, persistent=True) self._pool.maxPersistentPerHost = settings.getint('CONCURRENT_REQUESTS_PER_DOMAIN') self._pool._factory.noisy = False @@ -67,8 +68,8 @@ class HTTP11DownloadHandler: self._disconnect_timeout = 1 @classmethod - def from_crawler(cls, crawler): - return cls(crawler.settings, crawler) + def from_crawler(cls, crawler, **kwargs): + return cls(crawler.settings, crawler, **kwargs) def download_request(self, request, spider): """Return a deferred for the HTTP download""" @@ -79,6 +80,7 @@ class HTTP11DownloadHandler: warnsize=getattr(spider, 'download_warnsize', self._default_warnsize), fail_on_dataloss=self._fail_on_dataloss, crawler=self.crawler, + source=self.source, ) return agent.download_request(request) @@ -275,7 +277,7 @@ class ScrapyAgent: _TunnelingAgent = TunnelingAgent def __init__(self, contextFactory=None, connectTimeout=10, bindAddress=None, pool=None, - maxsize=0, warnsize=0, fail_on_dataloss=True, crawler=None): + maxsize=0, warnsize=0, fail_on_dataloss=True, crawler=None, source=None): self._contextFactory = contextFactory self._connectTimeout = connectTimeout self._bindAddress = bindAddress @@ -285,6 +287,7 @@ class ScrapyAgent: self._fail_on_dataloss = fail_on_dataloss self._txresponse = None self._crawler = crawler + self._source = source def _get_agent(self, request, timeout): bindaddress = request.meta.get('bindaddress') or self._bindAddress @@ -421,6 +424,7 @@ class ScrapyAgent: warnsize, fail_on_dataloss, self._crawler, + self._source, ) ) @@ -457,7 +461,9 @@ class _RequestBodyProducer: class _ResponseReader(protocol.Protocol): - def __init__(self, finished, txresponse, request, maxsize, warnsize, fail_on_dataloss, crawler): + def __init__( + self, finished, txresponse, request, maxsize, warnsize, fail_on_dataloss, crawler, source + ): self._finished = finished self._txresponse = txresponse self._request = request @@ -469,6 +475,7 @@ class _ResponseReader(protocol.Protocol): self._reached_warnsize = False self._bytes_received = 0 self._crawler = crawler + self._source = source def dataReceived(self, bodyBytes): # This maybe called several times after cancel was called with buffered data. 
@@ -483,6 +490,7 @@ class _ResponseReader(protocol.Protocol): data=bodyBytes, request=self._request, spider=self._crawler.spider, + source=self._source, ) if self._maxsize and self._bytes_received > self._maxsize: diff --git a/scrapy/core/downloader/handlers/s3.py b/scrapy/core/downloader/handlers/s3.py index 40a1fa48e..2366b6394 100644 --- a/scrapy/core/downloader/handlers/s3.py +++ b/scrapy/core/downloader/handlers/s3.py @@ -73,6 +73,7 @@ class S3DownloadHandler: objcls=httpdownloadhandler, settings=settings, crawler=crawler, + source="s3", ) self._download_http = _http_handler.download_request diff --git a/tests/test_downloader_handlers.py b/tests/test_downloader_handlers.py index 8d95d7cac..22a813647 100644 --- a/tests/test_downloader_handlers.py +++ b/tests/test_downloader_handlers.py @@ -730,6 +730,9 @@ class Http11ProxyTestCase(HttpProxyTestCase): class HttpDownloadHandlerMock: + def __init__(self, *args, **kwargs): + pass + def download_request(self, request, spider): return request diff --git a/tests/test_engine.py b/tests/test_engine.py index 3c5cc403b..c83a23b55 100644 --- a/tests/test_engine.py +++ b/tests/test_engine.py @@ -112,6 +112,7 @@ class CrawlerRun(object): self.itemerror = [] self.itemresp = [] self.bytes = defaultdict(lambda: list()) + self.bytes_source = set() self.signals_caught = {} self.spider_class = spider_class @@ -165,8 +166,9 @@ class CrawlerRun(object): def item_scraped(self, item, spider, response): self.itemresp.append((item, response)) - def bytes_received(self, data, request, spider): + def bytes_received(self, data, request, spider, source): self.bytes[request].append(data) + self.bytes_source.add(source) def request_scheduled(self, request, spider): self.reqplug.append((request, spider)) @@ -279,6 +281,7 @@ class EngineTest(unittest.TestCase): def _assert_bytes_received(self): self.assertEqual(9, len(self.run.bytes)) + self.assertEqual(self.run.bytes_source, set(["http11"])) for request, data in self.run.bytes.items(): joined_data = b"".join(data) if self.run.getpath(request.url) == "/": From a2ae380efcaa5a3419a4f6a35541ae0fb71a2e7f Mon Sep 17 00:00:00 2001 From: Eugenio Lacuesta Date: Mon, 3 Feb 2020 13:23:52 -0300 Subject: [PATCH 017/181] Remove unnecessary commas --- scrapy/resolver.py | 2 +- tests/CrawlerProcess/ip_address.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/scrapy/resolver.py b/scrapy/resolver.py index 554a3a14d..f69894b1e 100644 --- a/scrapy/resolver.py +++ b/scrapy/resolver.py @@ -29,7 +29,7 @@ class CachingThreadedResolver(ThreadedResolver): cache_size = 0 return cls(reactor, cache_size, crawler.settings.getfloat('DNS_TIMEOUT')) - def install_on_reactor(self,): + def install_on_reactor(self): self.reactor.installResolver(self) def getHostByName(self, name, timeout=None): diff --git a/tests/CrawlerProcess/ip_address.py b/tests/CrawlerProcess/ip_address.py index 6b069cc90..949e97172 100644 --- a/tests/CrawlerProcess/ip_address.py +++ b/tests/CrawlerProcess/ip_address.py @@ -21,7 +21,7 @@ class MockThreadedResolver(ThreadedResolver): def from_crawler(cls, crawler, reactor): return cls(reactor) - def install_on_reactor(self,): + def install_on_reactor(self): self.reactor.installResolver(self) def getHostByName(self, name, timeout=None): From bb8f7dc609382153df79774ad9d8f6d33d064279 Mon Sep 17 00:00:00 2001 From: Eugenio Lacuesta Date: Mon, 3 Feb 2020 14:50:14 -0300 Subject: [PATCH 018/181] Mock DNS server --- tests/mockserver.py | 90 ++++++++++++++++++++++++++++++++++----------- 1 file changed, 68 insertions(+), 
22 deletions(-) diff --git a/tests/mockserver.py b/tests/mockserver.py index a45277db9..585741f1b 100644 --- a/tests/mockserver.py +++ b/tests/mockserver.py @@ -1,3 +1,4 @@ +import argparse import json import os import random @@ -6,18 +7,19 @@ from subprocess import Popen, PIPE from urllib.parse import urlencode from OpenSSL import SSL -from twisted.web.server import Site, NOT_DONE_YET -from twisted.web.resource import Resource +from twisted.internet import defer, reactor, ssl +from twisted.internet.task import deferLater +from twisted.names import dns, error +from twisted.names.server import DNSServerFactory +from twisted.web.resource import EncodingResourceWrapper, Resource +from twisted.web.server import GzipEncoderFactory, NOT_DONE_YET, Site from twisted.web.static import File from twisted.web.test.test_webclient import PayloadResource -from twisted.web.server import GzipEncoderFactory -from twisted.web.resource import EncodingResourceWrapper from twisted.web.util import redirectTo -from twisted.internet import reactor, ssl -from twisted.internet.task import deferLater from scrapy.utils.python import to_bytes, to_unicode from scrapy.utils.ssl import SSL_OP_NO_TLSv1_3 +from scrapy.utils.test import get_testenv def getarg(request, name, default=None, type=None): @@ -198,12 +200,10 @@ class Root(Resource): return b'Scrapy mock HTTP server\n' -class MockServer(): +class MockServer: def __enter__(self): - from scrapy.utils.test import get_testenv - - self.proc = Popen([sys.executable, '-u', '-m', 'tests.mockserver'], + self.proc = Popen([sys.executable, '-u', '-m', 'tests.mockserver', '-t', 'http'], stdout=PIPE, env=get_testenv()) http_address = self.proc.stdout.readline().strip().decode('ascii') https_address = self.proc.stdout.readline().strip().decode('ascii') @@ -224,6 +224,37 @@ class MockServer(): return host + path +class MockDNSResolver: + """ + Implements twisted.internet.interfaces.IResolver partially + """ + + def _resolve(self, name): + record = dns.Record_A(address=b"127.0.0.1") + answer = dns.RRHeader(name=name, payload=record) + return [answer], [], [] + + def query(self, query, timeout=None): + if query.type == dns.A: + return defer.succeed(self._resolve(query.name.name)) + return defer.fail(error.DomainError()) + + def lookupAllRecords(self, name, timeout=None): + return defer.succeed(self._resolve(name)) + + +class MockDNSServer(): + + def __enter__(self): + self.proc = Popen([sys.executable, '-u', '-m', 'tests.mockserver', 'dns'], + stdout=PIPE, env=get_testenv()) + return self + + def __exit__(self, exc_type, exc_value, traceback): + self.proc.kill() + self.proc.communicate() + + def ssl_context_factory(keyfile='keys/localhost.key', certfile='keys/localhost.crt', cipher_string=None): factory = ssl.DefaultOpenSSLContextFactory( os.path.join(os.path.dirname(__file__), keyfile), @@ -238,19 +269,34 @@ def ssl_context_factory(keyfile='keys/localhost.key', certfile='keys/localhost.c if __name__ == "__main__": - root = Root() - factory = Site(root) - httpPort = reactor.listenTCP(0, factory) - contextFactory = ssl_context_factory() - httpsPort = reactor.listenSSL(0, factory, contextFactory) + parser = argparse.ArgumentParser() + parser.add_argument("-t", "--type", type=str, choices=("http", "dns"), default="http") + args = parser.parse_args() - def print_listening(): - httpHost = httpPort.getHost() - httpsHost = httpsPort.getHost() - httpAddress = 'http://%s:%d' % (httpHost.host, httpHost.port) - httpsAddress = 'https://%s:%d' % (httpsHost.host, httpsHost.port) - 
print(httpAddress) - print(httpsAddress) + if args.type == "http": + root = Root() + factory = Site(root) + httpPort = reactor.listenTCP(0, factory) + contextFactory = ssl_context_factory() + httpsPort = reactor.listenSSL(0, factory, contextFactory) + + def print_listening(): + httpHost = httpPort.getHost() + httpsHost = httpsPort.getHost() + httpAddress = "http://%s:%d" % (httpHost.host, httpHost.port) + httpsAddress = "https://%s:%d" % (httpsHost.host, httpsHost.port) + print(httpAddress) + print(httpsAddress) + + elif args.type == "dns": + clients = [MockDNSResolver()] + factory = DNSServerFactory(clients=clients) + protocol = dns.DNSDatagramProtocol(controller=factory) + reactor.listenUDP(10053, protocol) + reactor.listenTCP(10053, factory) + + def print_listening(): + print("DNS server running on port 10053") reactor.callWhenRunning(print_listening) reactor.run() From 4851efdfb0885a40a44a2834c6c69d0104326801 Mon Sep 17 00:00:00 2001 From: Eugenio Lacuesta Date: Mon, 3 Feb 2020 14:50:54 -0300 Subject: [PATCH 019/181] Flake8 adjustments --- tests/mockserver.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tests/mockserver.py b/tests/mockserver.py index 585741f1b..67139534e 100644 --- a/tests/mockserver.py +++ b/tests/mockserver.py @@ -257,9 +257,9 @@ class MockDNSServer(): def ssl_context_factory(keyfile='keys/localhost.key', certfile='keys/localhost.crt', cipher_string=None): factory = ssl.DefaultOpenSSLContextFactory( - os.path.join(os.path.dirname(__file__), keyfile), - os.path.join(os.path.dirname(__file__), certfile), - ) + os.path.join(os.path.dirname(__file__), keyfile), + os.path.join(os.path.dirname(__file__), certfile), + ) if cipher_string: ctx = factory.getContext() # disabling TLS1.2+ because it unconditionally enables some strong ciphers From e0ef8ad2d6f958de6ce04cd7756e142efeb1a6a2 Mon Sep 17 00:00:00 2001 From: Eugenio Lacuesta Date: Mon, 3 Feb 2020 15:52:15 -0300 Subject: [PATCH 020/181] CrawlerRunner test for Response.ip_address --- tests/CrawlerProcess/ip_address.py | 51 ------------------------------ tests/CrawlerRunner/ip_address.py | 37 ++++++++++++++++++++++ tests/mockserver.py | 11 ++++--- tests/test_crawler.py | 20 ++++++++---- 4 files changed, 57 insertions(+), 62 deletions(-) delete mode 100644 tests/CrawlerProcess/ip_address.py create mode 100644 tests/CrawlerRunner/ip_address.py diff --git a/tests/CrawlerProcess/ip_address.py b/tests/CrawlerProcess/ip_address.py deleted file mode 100644 index 949e97172..000000000 --- a/tests/CrawlerProcess/ip_address.py +++ /dev/null @@ -1,51 +0,0 @@ -from urllib.parse import urlparse - -from twisted.internet import defer -from twisted.internet.base import ThreadedResolver -from twisted.internet.interfaces import IResolverSimple -from zope.interface.declarations import implementer - -from scrapy import Spider, Request -from scrapy.crawler import CrawlerProcess - -from tests.mockserver import MockServer - - -@implementer(IResolverSimple) -class MockThreadedResolver(ThreadedResolver): - """ - Resolves all names to localhost - """ - - @classmethod - def from_crawler(cls, crawler, reactor): - return cls(reactor) - - def install_on_reactor(self): - self.reactor.installResolver(self) - - def getHostByName(self, name, timeout=None): - return defer.succeed("127.0.0.1") - - -class LocalhostSpider(Spider): - name = "localhost_spider" - - def start_requests(self): - yield Request(self.url) - - def parse(self, response): - netloc = urlparse(response.url).netloc - self.logger.info("Host: %s" % 
netloc.split(":")[0]) - self.logger.info("Type: %s" % type(response.ip_address)) - self.logger.info("IP address: %s" % response.ip_address) - - -with MockServer() as mockserver: - settings = {"DNS_RESOLVER": __name__ + ".MockThreadedResolver"} - process = CrawlerProcess(settings) - - port = urlparse(mockserver.http_address).port - url = "http://not.a.real.domain:{port}/echo?body=test".format(port=port) - process.crawl(LocalhostSpider, url=url) - process.start() diff --git a/tests/CrawlerRunner/ip_address.py b/tests/CrawlerRunner/ip_address.py new file mode 100644 index 000000000..5a71536d8 --- /dev/null +++ b/tests/CrawlerRunner/ip_address.py @@ -0,0 +1,37 @@ +from urllib.parse import urlparse + +from twisted.internet import reactor +from twisted.names.client import createResolver + +from scrapy import Spider, Request +from scrapy.crawler import CrawlerRunner +from scrapy.utils.log import configure_logging + +from tests.mockserver import MockServer, MockDNSServer + + +class LocalhostSpider(Spider): + name = "localhost_spider" + + def start_requests(self): + yield Request(self.url) + + def parse(self, response): + netloc = urlparse(response.url).netloc + self.logger.info("Host: %s" % netloc.split(":")[0]) + self.logger.info("Type: %s" % type(response.ip_address)) + self.logger.info("IP address: %s" % response.ip_address) + + +with MockServer() as mock_http_server, MockDNSServer() as mock_dns_server: + port = urlparse(mock_http_server.http_address).port + url = "http://not.a.real.domain:{port}/echo".format(port=port) + + servers = [(mock_dns_server.host, mock_dns_server.port)] + reactor.installResolver(createResolver(servers=servers)) + + configure_logging() + runner = CrawlerRunner() + d = runner.crawl(LocalhostSpider, url=url) + d.addBoth(lambda _: reactor.stop()) + reactor.run() diff --git a/tests/mockserver.py b/tests/mockserver.py index 67139534e..08a81418c 100644 --- a/tests/mockserver.py +++ b/tests/mockserver.py @@ -246,8 +246,11 @@ class MockDNSResolver: class MockDNSServer(): def __enter__(self): - self.proc = Popen([sys.executable, '-u', '-m', 'tests.mockserver', 'dns'], + self.proc = Popen([sys.executable, '-u', '-m', 'tests.mockserver', '-t', 'dns'], stdout=PIPE, env=get_testenv()) + host, port = self.proc.stdout.readline().strip().decode('ascii').split(":") + self.host = host + self.port = int(port) return self def __exit__(self, exc_type, exc_value, traceback): @@ -292,11 +295,11 @@ if __name__ == "__main__": clients = [MockDNSResolver()] factory = DNSServerFactory(clients=clients) protocol = dns.DNSDatagramProtocol(controller=factory) - reactor.listenUDP(10053, protocol) - reactor.listenTCP(10053, factory) + listener = reactor.listenUDP(0, protocol) def print_listening(): - print("DNS server running on port 10053") + host = listener.getHost() + print("%s:%s" % (host.host, host.port)) reactor.callWhenRunning(print_listening) reactor.run() diff --git a/tests/test_crawler.py b/tests/test_crawler.py index dfc1cf448..5d381c368 100644 --- a/tests/test_crawler.py +++ b/tests/test_crawler.py @@ -281,9 +281,7 @@ class CrawlerRunnerHasSpider(unittest.TestCase): self.assertNotIn("Asyncio reactor is installed", str(log)) -class CrawlerProcessSubprocess(unittest.TestCase): - script_dir = os.path.join(os.path.abspath(os.path.dirname(__file__)), 'CrawlerProcess') - +class ScriptRunnerMixin: def run_script(self, script_name): script_path = os.path.join(self.script_dir, script_name) args = (sys.executable, script_path) @@ -292,6 +290,10 @@ class CrawlerProcessSubprocess(unittest.TestCase): 
stdout, stderr = p.communicate() return stderr.decode('utf-8') + +class CrawlerProcessSubprocess(ScriptRunnerMixin, unittest.TestCase): + script_dir = os.path.join(os.path.abspath(os.path.dirname(__file__)), 'CrawlerProcess') + def test_simple(self): log = self.run_script('simple.py') self.assertIn('Spider closed (finished)', log) @@ -325,9 +327,13 @@ class CrawlerProcessSubprocess(unittest.TestCase): "'downloader/exception_type_count/twisted.internet.error.ConnectError': 1," in log, ])) + +class CrawlerRunnerSubprocess(ScriptRunnerMixin, unittest.TestCase): + script_dir = os.path.join(os.path.abspath(os.path.dirname(__file__)), 'CrawlerRunner') + def test_response_ip_address(self): log = self.run_script("ip_address.py") - self.assertIn("Spider closed (finished)", log) - self.assertIn("Host: not.a.real.domain", log) - self.assertIn("Type: ", log) - self.assertIn("IP address: 127.0.0.1", log) + self.assertIn("INFO: Spider closed (finished)", log) + self.assertIn("INFO: Host: not.a.real.domain", log) + self.assertIn("INFO: Type: ", log) + self.assertIn("INFO: IP address: 127.0.0.1", log) From 13670f0397ba8dcec3dceb1852bad5751406d19d Mon Sep 17 00:00:00 2001 From: Eugenio Lacuesta Date: Mon, 3 Feb 2020 16:16:43 -0300 Subject: [PATCH 021/181] Ignore tests/CrawlerRunner directory --- conftest.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/conftest.py b/conftest.py index c0de09909..55294feca 100644 --- a/conftest.py +++ b/conftest.py @@ -11,7 +11,8 @@ collect_ignore = [ # not a test, but looks like a test "scrapy/utils/testsite.py", # contains scripts to be run by tests/test_crawler.py::CrawlerProcessSubprocess - *_py_files("tests/CrawlerProcess") + *_py_files("tests/CrawlerProcess"), + *_py_files("tests/CrawlerRunner"), ] for line in open('tests/ignores.txt'): From ad70497416527c3d882a64f7803e73155f3fa1da Mon Sep 17 00:00:00 2001 From: Eugenio Lacuesta Date: Tue, 4 Feb 2020 13:30:13 -0300 Subject: [PATCH 022/181] Remove unnecessary parentheses in class definition --- tests/mockserver.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/mockserver.py b/tests/mockserver.py index 08a81418c..30d9bc0e8 100644 --- a/tests/mockserver.py +++ b/tests/mockserver.py @@ -243,7 +243,7 @@ class MockDNSResolver: return defer.succeed(self._resolve(name)) -class MockDNSServer(): +class MockDNSServer: def __enter__(self): self.proc = Popen([sys.executable, '-u', '-m', 'tests.mockserver', '-t', 'dns'], From a64fa2f0866c10594f1e5cf00a0161f9fea1eb62 Mon Sep 17 00:00:00 2001 From: Eugenio Lacuesta Date: Mon, 10 Feb 2020 10:16:05 -0300 Subject: [PATCH 023/181] Keyword arguments when creating a _ResponseReader --- scrapy/core/downloader/handlers/http11.py | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/scrapy/core/downloader/handlers/http11.py b/scrapy/core/downloader/handlers/http11.py index 49c9eacac..7a1a77b23 100644 --- a/scrapy/core/downloader/handlers/http11.py +++ b/scrapy/core/downloader/handlers/http11.py @@ -417,14 +417,14 @@ class ScrapyAgent: d = defer.Deferred(_cancel) txresponse.deliverBody( _ResponseReader( - d, - txresponse, - request, - maxsize, - warnsize, - fail_on_dataloss, - self._crawler, - self._source, + finished=d, + txresponse=txresponse, + request=request, + maxsize=maxsize, + warnsize=warnsize, + fail_on_dataloss=fail_on_dataloss, + crawler=self._crawler, + source=self._source, ) ) From 122ce6d6fb3861d99ba2f2810b2370056bae1190 Mon Sep 17 00:00:00 2001 From: Eugenio Lacuesta Date: Mon, 10 Feb 2020 10:20:26 -0300 
Subject: [PATCH 024/181] Check bytes are received in order (bytes_received signal) --- tests/test_engine.py | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/tests/test_engine.py b/tests/test_engine.py index c83a23b55..0d970928b 100644 --- a/tests/test_engine.py +++ b/tests/test_engine.py @@ -12,7 +12,6 @@ module with the ``runserver`` argument:: import os import re -import string import sys from collections import defaultdict from urllib.parse import urlparse @@ -91,7 +90,8 @@ def start_test_site(debug=False): r = static.File(root_dir) r.putChild(b"redirect", util.Redirect(b"/redirected")) r.putChild(b"redirected", static.Data(b"Redirected here", "text/plain")) - r.putChild(b"random", static.Data(string.ascii_letters.encode("utf8") * 2**14, "text/plain")) + numbers = [str(x).encode("utf8") for x in range(2**14)] + r.putChild(b"numbers", static.Data(b"".join(numbers), "text/plain")) port = reactor.listenTCP(0, server.Site(r), interface="127.0.0.1") if debug: @@ -124,7 +124,7 @@ class CrawlerRun(object): self.geturl("/"), self.geturl("/redirect"), self.geturl("/redirect"), # duplicate - self.geturl("/random"), + self.geturl("/numbers"), ] for name, signal in vars(signals).items(): @@ -315,8 +315,12 @@ class EngineTest(unittest.TestCase): b" \n" b"\n" ) - elif self.run.getpath(request.url) == "/random": - self.assertTrue(len(data) > 1) # signal was fired multiple times + elif self.run.getpath(request.url) == "/numbers": + # signal was fired multiple times + self.assertTrue(len(data) > 1) + # bytes were received in order + numbers = [str(x).encode("utf8") for x in range(2**14)] + self.assertEqual(joined_data, b"".join(numbers)) def _assert_signals_caught(self): assert signals.engine_started in self.run.signals_caught From 42b4e9b3372ce3f9da57c7512b31a3c455b8a161 Mon Sep 17 00:00:00 2001 From: Eugenio Lacuesta Date: Mon, 10 Feb 2020 11:23:38 -0300 Subject: [PATCH 025/181] Reword signal docs --- docs/topics/signals.rst | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/docs/topics/signals.rst b/docs/topics/signals.rst index 3a15bf95c..dfb87cef3 100644 --- a/docs/topics/signals.rst +++ b/docs/topics/signals.rst @@ -279,7 +279,7 @@ request_scheduled Sent when the engine schedules a :class:`~scrapy.http.Request`, to be downloaded later. - The signal does not support returning deferreds from its handlers. + This signal does not support returning deferreds from its handlers. :param request: the request that reached the scheduler :type request: :class:`~scrapy.http.Request` object @@ -296,7 +296,7 @@ request_dropped Sent when a :class:`~scrapy.http.Request`, scheduled by the engine to be downloaded later, is rejected by the scheduler. - The signal does not support returning deferreds from its handlers. + This signal does not support returning deferreds from its handlers. :param request: the request that reached the scheduler :type request: :class:`~scrapy.http.Request` object @@ -312,7 +312,7 @@ request_reached_downloader Sent when a :class:`~scrapy.http.Request` reached downloader. - The signal does not support returning deferreds from its handlers. + This signal does not support returning deferreds from its handlers. 
:param request: the request that reached downloader :type request: :class:`~scrapy.http.Request` object From 13ba9bc629cb0a77ebaca36a10a0a4984d7cce68 Mon Sep 17 00:00:00 2001 From: Eugenio Lacuesta Date: Mon, 10 Feb 2020 12:29:39 -0300 Subject: [PATCH 026/181] Note about Response.ip_address --- docs/topics/request-response.rst | 2 ++ 1 file changed, 2 insertions(+) diff --git a/docs/topics/request-response.rst b/docs/topics/request-response.rst index 17eb63064..89e570028 100644 --- a/docs/topics/request-response.rst +++ b/docs/topics/request-response.rst @@ -685,6 +685,8 @@ Response objects .. attribute:: Response.ip_address The IP address of the server from which the Response originated. + This attribute is currently only populated by the HTTP 1.1 download + handler, i.e. for ``http(s)`` responses. .. method:: Response.copy() From 037ae5b22e6d6600dc537ee5073652ce74e5f47b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adri=C3=A1n=20Chaves?= Date: Mon, 10 Feb 2020 19:54:47 +0100 Subject: [PATCH 027/181] =?UTF-8?q?Explicitly=20indicate=20None=20as=20ip?= =?UTF-8?q?=5Faddress=E2=80=99s=20default=20value?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- docs/topics/request-response.rst | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/docs/topics/request-response.rst b/docs/topics/request-response.rst index 89e570028..8f2504a33 100644 --- a/docs/topics/request-response.rst +++ b/docs/topics/request-response.rst @@ -685,8 +685,10 @@ Response objects .. attribute:: Response.ip_address The IP address of the server from which the Response originated. + This attribute is currently only populated by the HTTP 1.1 download - handler, i.e. for ``http(s)`` responses. + handler, i.e. for ``http(s)`` responses. For other handlers, + :attr:`ip_address` is always ``None``. .. 
method:: Response.copy() From f85bf77da3c8943f0791dcae893e8294c4d118d7 Mon Sep 17 00:00:00 2001 From: Eugenio Lacuesta Date: Sun, 23 Feb 2020 18:31:13 -0300 Subject: [PATCH 028/181] Restore unrelated change --- scrapy/resolver.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scrapy/resolver.py b/scrapy/resolver.py index f69894b1e..554a3a14d 100644 --- a/scrapy/resolver.py +++ b/scrapy/resolver.py @@ -29,7 +29,7 @@ class CachingThreadedResolver(ThreadedResolver): cache_size = 0 return cls(reactor, cache_size, crawler.settings.getfloat('DNS_TIMEOUT')) - def install_on_reactor(self): + def install_on_reactor(self,): self.reactor.installResolver(self) def getHostByName(self, name, timeout=None): From 889b4718520220d1a81e702ff754ec210a7d3c79 Mon Sep 17 00:00:00 2001 From: Eugenio Lacuesta Date: Sun, 23 Feb 2020 18:40:43 -0300 Subject: [PATCH 029/181] Import changes --- scrapy/core/downloader/handlers/http11.py | 4 ++-- tests/test_crawl.py | 4 +++- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/scrapy/core/downloader/handlers/http11.py b/scrapy/core/downloader/handlers/http11.py index e72275021..190ae1d3b 100644 --- a/scrapy/core/downloader/handlers/http11.py +++ b/scrapy/core/downloader/handlers/http11.py @@ -1,11 +1,11 @@ """Download handlers for http and https schemes""" +import ipaddress import logging import re import warnings from contextlib import suppress from io import BytesIO -from ipaddress import ip_address from time import time from urllib.parse import urldefrag @@ -468,7 +468,7 @@ class _ResponseReader(protocol.Protocol): self._certificate = ssl.Certificate(self.transport._producer.getPeerCertificate()) if self._ip_address is None: - self._ip_address = ip_address(self.transport._producer.getPeer().host) + self._ip_address = ipaddress.ip_address(self.transport._producer.getPeer().host) def dataReceived(self, bodyBytes): # This maybe called several times after cancel was called with buffered data. 
diff --git a/tests/test_crawl.py b/tests/test_crawl.py index 3a9b00ab3..3c110e7a6 100644 --- a/tests/test_crawl.py +++ b/tests/test_crawl.py @@ -441,13 +441,15 @@ with multiples lines self.assertEqual(cert.getIssuer().commonName, b"localhost") @defer.inlineCallbacks - def test_dns_server_ip_address(self): + def test_dns_server_ip_address_none(self): crawler = self.runner.create_crawler(SingleRequestSpider) url = self.mockserver.url('/status?n=200') yield crawler.crawl(seed=url, mockserver=self.mockserver) ip_address = crawler.spider.meta['responses'][0].ip_address self.assertIsNone(ip_address) + @defer.inlineCallbacks + def test_dns_server_ip_address(self): crawler = self.runner.create_crawler(SingleRequestSpider) url = self.mockserver.url('/echo?body=test') expected_netloc, _ = urlparse(url).netloc.split(':') From 91a78eef3ee9de033e66db55c49321b2cc43740e Mon Sep 17 00:00:00 2001 From: Eugenio Lacuesta Date: Sun, 8 Mar 2020 22:32:17 -0300 Subject: [PATCH 030/181] Pass callback results as dicts instead of tuples --- scrapy/core/downloader/handlers/http11.py | 56 ++++++++++++++++------- 1 file changed, 40 insertions(+), 16 deletions(-) diff --git a/scrapy/core/downloader/handlers/http11.py b/scrapy/core/downloader/handlers/http11.py index 190ae1d3b..e904cbc05 100644 --- a/scrapy/core/downloader/handlers/http11.py +++ b/scrapy/core/downloader/handlers/http11.py @@ -384,7 +384,13 @@ class ScrapyAgent(object): def _cb_bodyready(self, txresponse, request): # deliverBody hangs for responses without body if txresponse.length == 0: - return txresponse, b'', None, None + return { + "txresponse": txresponse, + "body": b"", + "flags": None, + "certificate": None, + "ip_address": None, + } maxsize = request.meta.get('download_maxsize', self._maxsize) warnsize = request.meta.get('download_warnsize', self._warnsize) @@ -420,12 +426,18 @@ class ScrapyAgent(object): return d def _cb_bodydone(self, result, request, url): - txresponse, body, flags, certificate, ip_address = result - status = int(txresponse.code) - headers = Headers(txresponse.headers.getAllRawHeaders()) - respcls = responsetypes.from_args(headers=headers, url=url, body=body) - return respcls(url=url, status=status, headers=headers, body=body, - flags=flags, certificate=certificate, ip_address=ip_address) + status = int(result["txresponse"].code) + headers = Headers(result["txresponse"].headers.getAllRawHeaders()) + respcls = responsetypes.from_args(headers=headers, url=url, body=result["body"]) + return respcls( + url=url, + status=status, + headers=headers, + body=result["body"], + flags=result["flags"], + certificate=result["certificate"], + ip_address=result["ip_address"], + ) @implementer(IBodyProducer) @@ -501,22 +513,34 @@ class _ResponseReader(protocol.Protocol): body = self._bodybuf.getvalue() if reason.check(ResponseDone): - self._finished.callback( - (self._txresponse, body, None, self._certificate, self._ip_address) - ) + self._finished.callback({ + "txresponse": self._txresponse, + "body": body, + "flags": None, + "certificate": self._certificate, + "ip_address": self._ip_address, + }) return if reason.check(PotentialDataLoss): - self._finished.callback( - (self._txresponse, body, ['partial'], self._certificate, self._ip_address) - ) + self._finished.callback({ + "txresponse": self._txresponse, + "body": body, + "flags": ["partial"], + "certificate": self._certificate, + "ip_address": self._ip_address, + }) return if reason.check(ResponseFailed) and any(r.check(_DataLoss) for r in reason.value.reasons): if not 
self._fail_on_dataloss: - self._finished.callback( - (self._txresponse, body, ['dataloss'], self._certificate, self._ip_address) - ) + self._finished.callback({ + "txresponse": self._txresponse, + "body": body, + "flags": ["dataloss"], + "certificate": self._certificate, + "ip_address": self._ip_address, + }) return elif not self._fail_on_dataloss_warned: From 1785095707dec53647c835c0b0861b220e8495af Mon Sep 17 00:00:00 2001 From: Eugenio Lacuesta Date: Wed, 11 Mar 2020 20:41:59 -0300 Subject: [PATCH 031/181] Remove single-use variable --- scrapy/core/downloader/handlers/http11.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/scrapy/core/downloader/handlers/http11.py b/scrapy/core/downloader/handlers/http11.py index e904cbc05..a5b03a62b 100644 --- a/scrapy/core/downloader/handlers/http11.py +++ b/scrapy/core/downloader/handlers/http11.py @@ -426,12 +426,11 @@ class ScrapyAgent(object): return d def _cb_bodydone(self, result, request, url): - status = int(result["txresponse"].code) headers = Headers(result["txresponse"].headers.getAllRawHeaders()) respcls = responsetypes.from_args(headers=headers, url=url, body=result["body"]) return respcls( url=url, - status=status, + status=int(result["txresponse"].code), headers=headers, body=result["body"], flags=result["flags"], From ca08e04198b94bd9583704f86316b57af3408adc Mon Sep 17 00:00:00 2001 From: Aditya Date: Fri, 20 Mar 2020 02:31:35 +0530 Subject: [PATCH 032/181] [docs] update redirect links python2 -> python3 --- docs/topics/downloader-middleware.rst | 5 ++--- docs/topics/email.rst | 2 +- docs/topics/exporters.rst | 8 ++++---- docs/topics/extensions.rst | 2 +- docs/topics/items.rst | 6 +++--- docs/topics/logging.rst | 16 ++++++++-------- docs/topics/request-response.rst | 10 +++++----- docs/topics/selectors.rst | 2 +- docs/topics/settings.rst | 6 +++--- docs/topics/spider-middleware.rst | 6 +++--- 10 files changed, 31 insertions(+), 32 deletions(-) diff --git a/docs/topics/downloader-middleware.rst b/docs/topics/downloader-middleware.rst index 73648994d..61a3806fb 100644 --- a/docs/topics/downloader-middleware.rst +++ b/docs/topics/downloader-middleware.rst @@ -739,7 +739,7 @@ HttpProxyMiddleware This middleware sets the HTTP proxy to use for requests, by setting the ``proxy`` meta value for :class:`~scrapy.http.Request` objects. - Like the Python standard library modules `urllib`_ and `urllib2`_, it obeys + Like the Python standard library module `urllib.request`_, it obeys the following environment variables: * ``http_proxy`` @@ -751,8 +751,7 @@ HttpProxyMiddleware Keep in mind this value will take precedence over ``http_proxy``/``https_proxy`` environment variables, and it will also ignore ``no_proxy`` environment variable. -.. _urllib: https://docs.python.org/2/library/urllib.html -.. _urllib2: https://docs.python.org/2/library/urllib2.html +.. _urllib.request: https://docs.python.org/3/library/urllib.request.html RedirectMiddleware ------------------ diff --git a/docs/topics/email.rst b/docs/topics/email.rst index 72bf52227..aed3deb2e 100644 --- a/docs/topics/email.rst +++ b/docs/topics/email.rst @@ -15,7 +15,7 @@ IO of the crawler. It also provides a simple API for sending attachments and it's very easy to configure, with a few :ref:`settings `. -.. _smtplib: https://docs.python.org/2/library/smtplib.html +.. 
_smtplib: https://docs.python.org/3/library/smtplib.html Quick example ============= diff --git a/docs/topics/exporters.rst b/docs/topics/exporters.rst index e52682690..4ba8714bd 100644 --- a/docs/topics/exporters.rst +++ b/docs/topics/exporters.rst @@ -320,7 +320,7 @@ CsvItemExporter Color TV,1200 DVD player,200 -.. _csv.writer: https://docs.python.org/2/library/csv.html#csv.writer +.. _csv.writer: https://docs.python.org/3/library/csv.html#csv.writer PickleItemExporter ------------------ @@ -342,7 +342,7 @@ PickleItemExporter Pickle isn't a human readable format, so no output examples are provided. -.. _pickle module documentation: https://docs.python.org/2/library/pickle.html +.. _pickle module documentation: https://docs.python.org/3/library/pickle.html PprintItemExporter ------------------ @@ -393,7 +393,7 @@ JsonItemExporter stream-friendly format, consider using :class:`JsonLinesItemExporter` instead, or splitting the output in multiple chunks. -.. _JSONEncoder: https://docs.python.org/2/library/json.html#json.JSONEncoder +.. _JSONEncoder: https://docs.python.org/3/library/json.html#json.JSONEncoder JsonLinesItemExporter --------------------- @@ -417,7 +417,7 @@ JsonLinesItemExporter Unlike the one produced by :class:`JsonItemExporter`, the format produced by this exporter is well suited for serializing large amounts of data. -.. _JSONEncoder: https://docs.python.org/2/library/json.html#json.JSONEncoder +.. _JSONEncoder: https://docs.python.org/3/library/json.html#json.JSONEncoder MarshalItemExporter ------------------- diff --git a/docs/topics/extensions.rst b/docs/topics/extensions.rst index 94fd2e36e..f57e37e6f 100644 --- a/docs/topics/extensions.rst +++ b/docs/topics/extensions.rst @@ -372,5 +372,5 @@ For more info see `Debugging in Python`_. This extension only works on POSIX-compliant platforms (i.e. not Windows). -.. _Python debugger: https://docs.python.org/2/library/pdb.html +.. _Python debugger: https://docs.python.org/3/library/pdb.html .. _Debugging in Python: https://pythonconquerstheuniverse.wordpress.com/2009/09/10/debugging-in-python/ diff --git a/docs/topics/items.rst b/docs/topics/items.rst index 44643cb67..36731571e 100644 --- a/docs/topics/items.rst +++ b/docs/topics/items.rst @@ -24,7 +24,7 @@ serialization can be customized using Item fields metadata, :mod:`trackref` tracks Item instances to help find memory leaks (see :ref:`topics-leaks-trackrefs`), etc. -.. _dictionary-like: https://docs.python.org/2/library/stdtypes.html#dict +.. _dictionary-like: https://docs.python.org/3/library/stdtypes.html#dict .. _topics-items-declaring: @@ -249,7 +249,7 @@ Item objects :class:`Field` objects used in the :ref:`Item declaration `. -.. _dict API: https://docs.python.org/2/library/stdtypes.html#dict +.. _dict API: https://docs.python.org/3/library/stdtypes.html#dict Field objects ============= @@ -262,7 +262,7 @@ Field objects to support the :ref:`item declaration syntax ` based on class attributes. -.. _dict: https://docs.python.org/2/library/stdtypes.html#dict +.. _dict: https://docs.python.org/3/library/stdtypes.html#dict Other classes related to Item diff --git a/docs/topics/logging.rst b/docs/topics/logging.rst index d4d22d889..a85e1a769 100644 --- a/docs/topics/logging.rst +++ b/docs/topics/logging.rst @@ -83,10 +83,10 @@ path:: .. seealso:: - Module logging, `HowTo `_ + Module logging, `HowTo `_ Basic Logging Tutorial - Module logging, `Loggers `_ + Module logging, `Loggers `_ Further documentation on loggers .. 
_topics-logging-from-spiders: @@ -166,13 +166,13 @@ possible levels listed in :ref:`topics-logging-levels`. :setting:`LOG_FORMAT` and :setting:`LOG_DATEFORMAT` specify formatting strings used as layouts for all messages. Those strings can contain any placeholders listed in `logging's logrecord attributes docs -`_ and +`_ and `datetime's strftime and strptime directives -`_ +`_ respectively. If :setting:`LOG_SHORT_NAMES` is set, then the logs will not display the Scrapy -component that prints the log. It is unset by default, hence logs contain the +component that prints the log. It is unset by default, hence logs contain the Scrapy component responsible for that log output. Command-line options @@ -190,7 +190,7 @@ to override some of the Scrapy settings regarding logging. .. seealso:: - Module `logging.handlers `_ + Module `logging.handlers `_ Further documentation on available handlers .. _custom-log-formats: @@ -201,7 +201,7 @@ Custom Log Formats A custom log format can be set for different actions by extending :class:`~scrapy.logformatter.LogFormatter` class and making :setting:`LOG_FORMATTER` point to your new class. - + .. autoclass:: scrapy.logformatter.LogFormatter :members: @@ -276,6 +276,6 @@ scrapy.utils.log module Refer to :ref:`run-from-script` for more details about using Scrapy this way. -.. _logging.basicConfig(): https://docs.python.org/2/library/logging.html#logging.basicConfig +.. _logging.basicConfig(): https://docs.python.org/3/library/logging.html#logging.basicConfig diff --git a/docs/topics/request-response.rst b/docs/topics/request-response.rst index b2a60ff39..6c5a08409 100644 --- a/docs/topics/request-response.rst +++ b/docs/topics/request-response.rst @@ -189,7 +189,7 @@ Request objects ``copy()`` or ``replace()`` methods, and can also be accessed, in your spider, from the ``response.cb_kwargs`` attribute. - .. _shallow copied: https://docs.python.org/2/library/copy.html + .. _shallow copied: https://docs.python.org/3/library/copy.html .. method:: Request.copy() @@ -706,7 +706,7 @@ Response objects A :class:`twisted.internet.ssl.Certificate` object representing the server's SSL certificate. - + Only populated for ``https`` responses, ``None`` otherwise. .. method:: Response.copy() @@ -724,17 +724,17 @@ Response objects Constructs an absolute url by combining the Response's :attr:`url` with a possible relative url. - This is a wrapper over `urlparse.urljoin`_, it's merely an alias for + This is a wrapper over `urllib.parse.urljoin`_, it's merely an alias for making this call:: - urlparse.urljoin(response.url, url) + urllib.parse.urljoin(response.url, url) .. automethod:: Response.follow .. automethod:: Response.follow_all -.. _urlparse.urljoin: https://docs.python.org/2/library/urlparse.html#urlparse.urljoin +.. _urllib.parse.urljoin: https://docs.python.org/3/library/urllib.parse.html#urllib.parse.urljoin .. _topics-request-response-ref-response-subclasses: diff --git a/docs/topics/selectors.rst b/docs/topics/selectors.rst index 1f7802c98..0f90b28c0 100644 --- a/docs/topics/selectors.rst +++ b/docs/topics/selectors.rst @@ -36,7 +36,7 @@ defines selectors to associate those styles with specific HTML elements. .. _BeautifulSoup: https://www.crummy.com/software/BeautifulSoup/ .. _lxml: https://lxml.de/ -.. _ElementTree: https://docs.python.org/2/library/xml.etree.elementtree.html +.. _ElementTree: https://docs.python.org/3/library/xml.etree.elementtree.html .. _XPath: https://www.w3.org/TR/xpath/all/ .. _CSS: https://www.w3.org/TR/selectors .. 
_parsel: https://parsel.readthedocs.io/en/latest/ diff --git a/docs/topics/settings.rst b/docs/topics/settings.rst index dc6843d75..d78a6253e 100644 --- a/docs/topics/settings.rst +++ b/docs/topics/settings.rst @@ -28,7 +28,7 @@ The value of ``SCRAPY_SETTINGS_MODULE`` should be in Python path syntax, e.g. ``myproject.settings``. Note that the settings module should be on the Python `import search path`_. -.. _import search path: https://docs.python.org/2/tutorial/modules.html#the-module-search-path +.. _import search path: https://docs.python.org/3/tutorial/modules.html#the-module-search-path .. _populating-settings: @@ -902,7 +902,7 @@ Default: ``'%(asctime)s [%(name)s] %(levelname)s: %(message)s'`` String for formatting log messages. Refer to the `Python logging documentation`_ for the whole list of available placeholders. -.. _Python logging documentation: https://docs.python.org/2/library/logging.html#logrecord-attributes +.. _Python logging documentation: https://docs.python.org/3/library/logging.html#logrecord-attributes .. setting:: LOG_DATEFORMAT @@ -915,7 +915,7 @@ String for formatting date/time, expansion of the ``%(asctime)s`` placeholder in :setting:`LOG_FORMAT`. Refer to the `Python datetime documentation`_ for the whole list of available directives. -.. _Python datetime documentation: https://docs.python.org/2/library/datetime.html#strftime-and-strptime-behavior +.. _Python datetime documentation: https://docs.python.org/3/library/datetime.html#strftime-and-strptime-behavior .. setting:: LOG_FORMATTER diff --git a/docs/topics/spider-middleware.rst b/docs/topics/spider-middleware.rst index 0e8210130..3d7450c86 100644 --- a/docs/topics/spider-middleware.rst +++ b/docs/topics/spider-middleware.rst @@ -173,18 +173,18 @@ object gives you access, for example, to the :ref:`settings `. :type spider: :class:`~scrapy.spiders.Spider` object .. method:: from_crawler(cls, crawler) - + If present, this classmethod is called to create a middleware instance from a :class:`~scrapy.crawler.Crawler`. It must return a new instance of the middleware. Crawler object provides access to all Scrapy core components like settings and signals; it is a way for middleware to access them and hook its functionality into Scrapy. - + :param crawler: crawler that uses this middleware :type crawler: :class:`~scrapy.crawler.Crawler` object -.. _Exception: https://docs.python.org/2/library/exceptions.html#exceptions.Exception +.. _Exception: https://docs.python.org/3/library/exceptions.html#Exception .. 
_topics-spider-middleware-ref: From f37b1bdc5616f67460c645e26c49f9d5b34e3631 Mon Sep 17 00:00:00 2001 From: Aditya Date: Fri, 20 Mar 2020 05:22:51 +0530 Subject: [PATCH 033/181] [docs] update redirect links to python3 --- docs/intro/tutorial.rst | 10 +++++----- docs/topics/contracts.rst | 4 +--- docs/topics/downloader-middleware.rst | 11 +++-------- docs/topics/dynamic-content.rst | 10 ++++------ docs/topics/email.rst | 4 +--- docs/topics/exporters.rst | 20 ++++++-------------- docs/topics/extensions.rst | 3 +-- docs/topics/items.rst | 21 ++++++--------------- docs/topics/logging.rst | 15 +++++---------- docs/topics/request-response.rst | 8 ++------ docs/topics/selectors.rst | 3 +-- docs/topics/spider-middleware.rst | 6 +----- docs/topics/spiders.rst | 4 +--- docs/topics/telnetconsole.rst | 11 ++++------- scrapy/item.py | 4 +--- 15 files changed, 42 insertions(+), 92 deletions(-) diff --git a/docs/intro/tutorial.rst b/docs/intro/tutorial.rst index 1768badbb..ab6fd4829 100644 --- a/docs/intro/tutorial.rst +++ b/docs/intro/tutorial.rst @@ -25,16 +25,16 @@ Scrapy. If you're already familiar with other languages, and want to learn Python quickly, the `Python Tutorial`_ is a good resource. If you're new to programming and want to start with Python, the following books -may be useful to you: +may be useful to you: * `Automate the Boring Stuff With Python`_ -* `How To Think Like a Computer Scientist`_ +* `How To Think Like a Computer Scientist`_ -* `Learn Python 3 The Hard Way`_ +* `Learn Python 3 The Hard Way`_ You can also take a look at `this list of Python resources for non-programmers`_, -as well as the `suggested resources in the learnpython-subreddit`_. +as well as the `suggested resources in the learnpython-subreddit`_. .. _Python: https://www.python.org/ .. _this list of Python resources for non-programmers: https://wiki.python.org/moin/BeginnersGuide/NonProgrammers @@ -62,7 +62,7 @@ This will create a ``tutorial`` directory with the following contents:: __init__.py items.py # project items definition file - + middlewares.py # project middlewares file pipelines.py # project pipelines file diff --git a/docs/topics/contracts.rst b/docs/topics/contracts.rst index 43db8f101..319f577bc 100644 --- a/docs/topics/contracts.rst +++ b/docs/topics/contracts.rst @@ -136,7 +136,7 @@ Detecting check runs ==================== When ``scrapy check`` is running, the ``SCRAPY_CHECK`` environment variable is -set to the ``true`` string. You can use `os.environ`_ to perform any change to +set to the ``true`` string. You can use :data:`os.environ` to perform any change to your spiders or your settings when ``scrapy check`` is used:: import os @@ -148,5 +148,3 @@ your spiders or your settings when ``scrapy check`` is used:: def __init__(self): if os.environ.get('SCRAPY_CHECK'): pass # Do some scraper adjustments when a check is running - -.. _os.environ: https://docs.python.org/3/library/os.html#os.environ diff --git a/docs/topics/downloader-middleware.rst b/docs/topics/downloader-middleware.rst index 61a3806fb..d7ec53bfa 100644 --- a/docs/topics/downloader-middleware.rst +++ b/docs/topics/downloader-middleware.rst @@ -739,7 +739,7 @@ HttpProxyMiddleware This middleware sets the HTTP proxy to use for requests, by setting the ``proxy`` meta value for :class:`~scrapy.http.Request` objects. 
- Like the Python standard library module `urllib.request`_, it obeys + Like the Python standard library module :mod:`urllib.request`, it obeys the following environment variables: * ``http_proxy`` @@ -751,8 +751,6 @@ HttpProxyMiddleware Keep in mind this value will take precedence over ``http_proxy``/``https_proxy`` environment variables, and it will also ignore ``no_proxy`` environment variable. -.. _urllib.request: https://docs.python.org/3/library/urllib.request.html - RedirectMiddleware ------------------ @@ -982,7 +980,7 @@ RobotsTxtMiddleware Scrapy ships with support for the following robots.txt_ parsers: * :ref:`Protego ` (default) - * :ref:`RobotFileParser ` + * :class:`~urllib.robotparser.RobotFileParser` * :ref:`Reppy ` * :ref:`Robotexclusionrulesparser ` @@ -1030,13 +1028,10 @@ Based on `Protego `_: Scrapy uses this parser by default. -.. _python-robotfileparser: - RobotFileParser ~~~~~~~~~~~~~~~ -Based on `RobotFileParser -`_: +Based on :class:`~urllib.robotparser.RobotFileParser`: * is Python's built-in robots.txt_ parser diff --git a/docs/topics/dynamic-content.rst b/docs/topics/dynamic-content.rst index b98133676..22bcac268 100644 --- a/docs/topics/dynamic-content.rst +++ b/docs/topics/dynamic-content.rst @@ -115,7 +115,7 @@ data from it depends on the type of response: - If the response is HTML or XML, use :ref:`selectors ` as usual. -- If the response is JSON, use `json.loads`_ to load the desired data from +- If the response is JSON, use :func:`json.loads` to load the desired data from :attr:`response.text `:: data = json.loads(response.text) @@ -130,7 +130,7 @@ data from it depends on the type of response: - If the response is JavaScript, or HTML with a ``