diff --git a/.travis.yml b/.travis.yml
index f9d0dc8be..82167e10a 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -16,6 +16,8 @@ matrix:
           python: 3.5
         - env: TOXENV=pinned
           python: 3.5
+        - env: TOXENV=py35-asyncio
+          python: 3.5
         - env: TOXENV=py36
           python: 3.6
         - env: TOXENV=py37
@@ -24,6 +26,8 @@ matrix:
           python: 3.8
         - env: TOXENV=extra-deps
           python: 3.8
+        - env: TOXENV=py38-asyncio
+          python: 3.8
         - env: TOXENV=docs
           python: 3.7  # Keep in sync with .readthedocs.yml
 install:
diff --git a/conftest.py b/conftest.py
index d37c22436..c0de09909 100644
--- a/conftest.py
+++ b/conftest.py
@@ -34,3 +34,18 @@ def pytest_collection_modifyitems(session, config, items):
             items[:] = [item for item in items if isinstance(item, Flake8Item)]
     except ImportError:
         pass
+
+
+@pytest.fixture(scope='class')
+def reactor_pytest(request):
+    if not request.cls:
+        # doctests
+        return
+    request.cls.reactor_pytest = request.config.getoption("--reactor")
+    return request.cls.reactor_pytest
+
+
+@pytest.fixture(autouse=True)
+def only_asyncio(request, reactor_pytest):
+    if request.node.get_closest_marker('only_asyncio') and reactor_pytest != 'asyncio':
+        pytest.skip('This test is only run with --reactor=asyncio')
diff --git a/docs/topics/settings.rst b/docs/topics/settings.rst
index afe4fade1..c02f877fc 100644
--- a/docs/topics/settings.rst
+++ b/docs/topics/settings.rst
@@ -160,6 +160,40 @@ to any particular component. In that case the module of that component will be
 shown, typically an extension, middleware or pipeline. It also means that the
 component must be enabled in order for the setting to have any effect.
 
+.. setting:: ASYNCIO_REACTOR
+
+ASYNCIO_REACTOR
+---------------
+
+Default: ``False``
+
+Whether to install and require the Twisted reactor that uses the asyncio loop.
+
+When this option is set to ``True``, Scrapy will require
+:class:`~twisted.internet.asyncioreactor.AsyncioSelectorReactor`. It will
+install this reactor if no reactor is installed yet, such as when using the
+``scrapy`` script or :class:`~scrapy.crawler.CrawlerProcess`. If you are using
+:class:`~scrapy.crawler.CrawlerRunner`, you need to install the correct reactor
+manually. If a different reactor has already been installed outside Scrapy,
+Scrapy will raise an exception.
+
+The default value for this option is currently ``False`` to maintain backward
+compatibility and avoid possible problems caused by using a different Twisted
+reactor.
+
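+For example, a minimal sketch of using :class:`~scrapy.crawler.CrawlerRunner`
+with this setting, where the reactor is installed manually before the runner
+is created::
+
+    import asyncio
+
+    from twisted.internet import asyncioreactor
+    asyncioreactor.install(asyncio.get_event_loop())
+
+    from scrapy.crawler import CrawlerRunner
+
+    runner = CrawlerRunner(settings={'ASYNCIO_REACTOR': True})
+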
 .. setting:: AWS_ACCESS_KEY_ID
 
 AWS_ACCESS_KEY_ID
diff --git a/pytest.ini b/pytest.ini
index 50fc4ac00..c3f3292bb 100644
--- a/pytest.ini
+++ b/pytest.ini
@@ -18,6 +18,8 @@ addopts =
     --ignore=docs/topics/telnetconsole.rst
     --ignore=docs/utils
 twisted = 1
+markers =
+    only_asyncio: marks tests as only enabled when --reactor=asyncio is passed
 flake8-ignore =
     # Files that are only meant to provide top-level imports are expected not
     # to use any of their imports:
@@ -114,6 +116,7 @@ flake8-ignore =
     scrapy/spiders/feed.py E501
     scrapy/spiders/sitemap.py E501
     # scrapy/utils
+    scrapy/utils/asyncio.py E501
     scrapy/utils/benchserver.py E501
     scrapy/utils/conf.py E402 E501
     scrapy/utils/console.py E306 E305
@@ -223,6 +226,7 @@ flake8-ignore =
     tests/test_spidermiddleware_output_chain.py E501 E226
     tests/test_spidermiddleware_referer.py E501 F841 E125 E201 E124 E501 E241 E121
     tests/test_squeues.py E501 E701 E741
+    tests/test_utils_asyncio.py E501
     tests/test_utils_conf.py E501 E303 E128
     tests/test_utils_curl.py E501
     tests/test_utils_datatypes.py E402 E501 E305
diff --git a/scrapy/commands/shell.py b/scrapy/commands/shell.py
index e05084272..d44a32d5f 100644
--- a/scrapy/commands/shell.py
+++ b/scrapy/commands/shell.py
@@ -6,8 +6,8 @@ See documentation in docs/topics/shell.rst
 from threading import Thread
 
 from scrapy.commands import ScrapyCommand
-from scrapy.shell import Shell
 from scrapy.http import Request
+from scrapy.shell import Shell
 from scrapy.utils.spider import spidercls_for_request, DefaultSpider
 from scrapy.utils.url import guess_scheme
 
diff --git a/scrapy/crawler.py b/scrapy/crawler.py
index 6c7eb737b..f87e67d93 100644
--- a/scrapy/crawler.py
+++ b/scrapy/crawler.py
@@ -3,7 +3,7 @@ import pprint
 import signal
 import warnings
 
-from twisted.internet import reactor, defer
+from twisted.internet import defer
 from zope.interface.verify import verifyClass, DoesNotImplement
 
 from scrapy import Spider
@@ -14,6 +14,7 @@ from scrapy.extension import ExtensionManager
 from scrapy.settings import overridden_settings, Settings
 from scrapy.signalmanager import SignalManager
 from scrapy.exceptions import ScrapyDeprecationWarning
+from scrapy.utils.asyncio import install_asyncio_reactor, is_asyncio_reactor_installed
 from scrapy.utils.ossignal import install_shutdown_handlers, signal_names
 from scrapy.utils.misc import load_object
 from scrapy.utils.log import (
@@ -136,6 +137,7 @@ class CrawlerRunner(object):
         self._crawlers = set()
         self._active = set()
         self.bootstrap_failed = False
+        self._handle_asyncio_reactor()
 
     @property
     def spiders(self):
@@ -229,6 +231,11 @@ class CrawlerRunner(object):
         while self._active:
             yield defer.DeferredList(self._active)
 
+    def _handle_asyncio_reactor(self):
+        if self.settings.getbool('ASYNCIO_REACTOR') and not is_asyncio_reactor_installed():
+            raise Exception("ASYNCIO_REACTOR is on but the Twisted asyncio "
+                            "reactor is not installed.")
+
 
 class CrawlerProcess(CrawlerRunner):
     """
@@ -261,6 +268,7 @@ class CrawlerProcess(CrawlerRunner):
         log_scrapy_info(self.settings)
 
     def _signal_shutdown(self, signum, _):
+        from twisted.internet import reactor
         install_shutdown_handlers(self._signal_kill)
         signame = signal_names[signum]
         logger.info("Received %(signame)s, shutting down gracefully. Send again to force ",
@@ -268,6 +276,7 @@ class CrawlerProcess(CrawlerRunner):
         reactor.callFromThread(self._graceful_stop_reactor)
 
     def _signal_kill(self, signum, _):
+        from twisted.internet import reactor
         install_shutdown_handlers(signal.SIG_IGN)
         signame = signal_names[signum]
         logger.info('Received %(signame)s twice, forcing unclean shutdown',
@@ -286,6 +295,7 @@ class CrawlerProcess(CrawlerRunner):
         :param boolean stop_after_crawl: stop or not the reactor when all
             crawlers have finished
         """
+        from twisted.internet import reactor
         if stop_after_crawl:
             d = self.join()
             # Don't start the reactor if the deferreds are already fired
@@ -300,6 +310,7 @@ class CrawlerProcess(CrawlerRunner):
         reactor.run(installSignalHandlers=False)  # blocking call
 
     def _get_dns_resolver(self):
+        from twisted.internet import reactor
         if self.settings.getbool('DNSCACHE_ENABLED'):
             cache_size = self.settings.getint('DNSCACHE_SIZE')
         else:
@@ -316,11 +327,17 @@
         return d
 
     def _stop_reactor(self, _=None):
+        from twisted.internet import reactor
         try:
             reactor.stop()
         except RuntimeError:  # raised if already stopped or in shutdown stage
             pass
 
+    def _handle_asyncio_reactor(self):
+        if self.settings.getbool('ASYNCIO_REACTOR'):
+            install_asyncio_reactor()
+        super()._handle_asyncio_reactor()
+
 
 def _get_spider_loader(settings):
     """ Get SpiderLoader instance from settings """
diff --git a/scrapy/settings/default_settings.py b/scrapy/settings/default_settings.py
index 1e163e1fb..d03fd37b0 100644
--- a/scrapy/settings/default_settings.py
+++ b/scrapy/settings/default_settings.py
@@ -19,6 +19,8 @@ from os.path import join, abspath, dirname
 
 AJAXCRAWL_ENABLED = False
 
+ASYNCIO_REACTOR = False
+
 AUTOTHROTTLE_ENABLED = False
 AUTOTHROTTLE_DEBUG = False
 AUTOTHROTTLE_MAX_DELAY = 60.0
diff --git a/scrapy/shell.py b/scrapy/shell.py
index a649d555f..a23b04df9 100644
--- a/scrapy/shell.py
+++ b/scrapy/shell.py
@@ -7,7 +7,7 @@ import os
 import signal
 import warnings
 
-from twisted.internet import reactor, threads, defer
+from twisted.internet import threads, defer
 from twisted.python import threadable
 
 from w3lib.url import any_to_uri
@@ -98,6 +98,7 @@ class Shell(object):
         return spider
 
     def fetch(self, request_or_url, spider=None, redirect=True, **kwargs):
+        from twisted.internet import reactor
         if isinstance(request_or_url, Request):
             request = request_or_url
         else:
diff --git a/scrapy/utils/asyncio.py b/scrapy/utils/asyncio.py
new file mode 100644
index 000000000..917973de2
--- /dev/null
+++ b/scrapy/utils/asyncio.py
@@ -0,0 +1,17 @@
+import asyncio
+from contextlib import suppress
+
+from twisted.internet import asyncioreactor
+from twisted.internet.error import ReactorAlreadyInstalledError
+
+
+def install_asyncio_reactor():
+    """ Tries to install AsyncioSelectorReactor
+    """
+    with suppress(ReactorAlreadyInstalledError):
+        asyncioreactor.install(asyncio.get_event_loop())
+
+
+def is_asyncio_reactor_installed():
+    from twisted.internet import reactor
+    return isinstance(reactor, asyncioreactor.AsyncioSelectorReactor)
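
A minimal usage sketch for the new helpers (illustrative, not part of the
patch). Merely importing ``twisted.internet.reactor`` installs the default
reactor as a side effect, which is why ``install_asyncio_reactor()`` must run
before any such import, and why this patch moves ``from twisted.internet
import reactor`` into function bodies throughout the code base:

    from scrapy.utils.asyncio import (
        install_asyncio_reactor,
        is_asyncio_reactor_installed,
    )

    # No-op if any reactor is already installed
    # (ReactorAlreadyInstalledError is suppressed).
    install_asyncio_reactor()

    # True only if the installed reactor is AsyncioSelectorReactor.
    print(is_asyncio_reactor_installed())
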
diff --git a/scrapy/utils/defer.py b/scrapy/utils/defer.py
index c5916c21c..20ce59297 100644
--- a/scrapy/utils/defer.py
+++ b/scrapy/utils/defer.py
@@ -1,8 +1,7 @@
 """
 Helper functions for dealing with Twisted deferreds
 """
-
-from twisted.internet import defer, reactor, task
+from twisted.internet import defer, task
 from twisted.python import failure
 
 from scrapy.exceptions import IgnoreRequest
@@ -15,6 +14,7 @@ def defer_fail(_failure):
     It delays by 100ms so reactor has a chance to go through readers and
     writers before attending pending delayed calls, so do not set delay to zero.
     """
+    from twisted.internet import reactor
     d = defer.Deferred()
     reactor.callLater(0.1, d.errback, _failure)
     return d
@@ -27,6 +27,7 @@ def defer_succeed(result):
     It delays by 100ms so reactor has a chance to go through readers and
     writers before attending pending delayed calls, so do not set delay to zero.
     """
+    from twisted.internet import reactor
     d = defer.Deferred()
     reactor.callLater(0.1, d.callback, result)
     return d
diff --git a/scrapy/utils/log.py b/scrapy/utils/log.py
index e07fb8698..e4cf0196b 100644
--- a/scrapy/utils/log.py
+++ b/scrapy/utils/log.py
@@ -11,6 +11,7 @@ from twisted.python import log as twisted_log
 
 import scrapy
 from scrapy.settings import Settings
 from scrapy.exceptions import ScrapyDeprecationWarning
+from scrapy.utils.asyncio import is_asyncio_reactor_installed
 from scrapy.utils.versions import scrapy_components_versions
 
@@ -148,6 +149,8 @@
                 {'versions': ", ".join("%s %s" % (name, version)
                                        for name, version in scrapy_components_versions()
                                        if name != "Scrapy")})
+    if is_asyncio_reactor_installed():
+        logger.debug("Asyncio reactor is installed")
 
 
 class StreamLogger(object):
diff --git a/scrapy/utils/ossignal.py b/scrapy/utils/ossignal.py
index 7a7aec9be..45c9cef0c 100644
--- a/scrapy/utils/ossignal.py
+++ b/scrapy/utils/ossignal.py
@@ -1,7 +1,5 @@
 import signal
 
-from twisted.internet import reactor
-
 
 signal_names = {}
 for signame in dir(signal):
@@ -17,6 +15,7 @@
     SIGINT handler won't be installed if there is already a handler in place
     (e.g. Pdb)
     """
+    from twisted.internet import reactor
     reactor._handleSignals()
     signal.signal(signal.SIGTERM, function)
     if signal.getsignal(signal.SIGINT) == signal.default_int_handler or \
diff --git a/scrapy/utils/reactor.py b/scrapy/utils/reactor.py
index 493d26d4c..b98fff6ec 100644
--- a/scrapy/utils/reactor.py
+++ b/scrapy/utils/reactor.py
@@ -1,8 +1,9 @@
-from twisted.internet import reactor, error
+from twisted.internet import error
 
 
 def listen_tcp(portrange, host, factory):
     """Like reactor.listenTCP but tries different ports in a range."""
+    from twisted.internet import reactor
     assert len(portrange) <= 2, "invalid portrange: %s" % portrange
     if not portrange:
         return reactor.listenTCP(0, factory, interface=host)
@@ -30,6 +31,7 @@ class CallLaterOnce(object):
         self._call = None
 
     def schedule(self, delay=0):
+        from twisted.internet import reactor
         if self._call is None:
             self._call = reactor.callLater(delay, self)
 
diff --git a/tests/CrawlerProcess/asyncio_enabled_no_reactor.py b/tests/CrawlerProcess/asyncio_enabled_no_reactor.py
new file mode 100644
index 000000000..db1b75931
--- /dev/null
+++ b/tests/CrawlerProcess/asyncio_enabled_no_reactor.py
@@ -0,0 +1,17 @@
+import scrapy
+from scrapy.crawler import CrawlerProcess
+
+
+class NoRequestsSpider(scrapy.Spider):
+    name = 'no_request'
+
+    def start_requests(self):
+        return []
+
+
+process = CrawlerProcess(settings={
+    'ASYNCIO_REACTOR': True,
+})
+
+process.crawl(NoRequestsSpider)
+process.start()
diff --git a/tests/CrawlerProcess/asyncio_enabled_reactor.py b/tests/CrawlerProcess/asyncio_enabled_reactor.py
new file mode 100644
index 000000000..cec3c9c25
--- /dev/null
+++ b/tests/CrawlerProcess/asyncio_enabled_reactor.py
@@ -0,0 +1,22 @@
+import asyncio
+
+from twisted.internet import asyncioreactor
+asyncioreactor.install(asyncio.get_event_loop())
+
+import scrapy
+from scrapy.crawler import CrawlerProcess
+
+
+class NoRequestsSpider(scrapy.Spider):
+    name = 'no_request'
+
+    def start_requests(self):
+        return []
+
+
+process = CrawlerProcess(settings={
+    'ASYNCIO_REACTOR': True,
+})
+
+process.crawl(NoRequestsSpider)
+process.start()
diff --git a/tests/requirements-py3.txt b/tests/requirements-py3.txt
index e9bf310b9..d97c4b8ee 100644
--- a/tests/requirements-py3.txt
+++ b/tests/requirements-py3.txt
@@ -4,7 +4,7 @@ mitmproxy; python_version >= '3.6'
 mitmproxy<4.0.0; python_version < '3.6'
 pytest
 pytest-cov
-pytest-twisted
+pytest-twisted >= 1.11
 pytest-xdist
 sybil
 testfixtures
diff --git a/tests/test_commands.py b/tests/test_commands.py
index 536379170..6024af71c 100644
--- a/tests/test_commands.py
+++ b/tests/test_commands.py
@@ -295,6 +295,14 @@ class BadSpider(scrapy.Spider):
         self.assertIn("start_requests", log)
         self.assertIn("badspider.py", log)
 
+    def test_asyncio_enabled_true(self):
+        log = self.get_log(self.debug_log_spider, args=['-s', 'ASYNCIO_REACTOR=True'])
+        self.assertIn("DEBUG: Asyncio reactor is installed", log)
+
+    def test_asyncio_enabled_false(self):
+        log = self.get_log(self.debug_log_spider, args=['-s', 'ASYNCIO_REACTOR=False'])
+        self.assertNotIn("DEBUG: Asyncio reactor is installed", log)
+
 
 class BenchCommandTest(CommandTest):
diff --git a/tests/test_crawler.py b/tests/test_crawler.py
index e37a2ff0e..fce60ca37 100644
--- a/tests/test_crawler.py
+++ b/tests/test_crawler.py
@@ -4,9 +4,10 @@ import subprocess
 import sys
 import warnings
 
+from pytest import raises, mark
+from testfixtures import LogCapture
 from twisted.internet import defer
 from twisted.trial import unittest
-from pytest import raises
 
 import scrapy
 from scrapy.crawler import Crawler, CrawlerRunner, CrawlerProcess
@@ -207,6 +208,7 @@ class NoRequestsSpider(scrapy.Spider):
         return []
 
 
+@mark.usefixtures('reactor_pytest')
 class CrawlerRunnerHasSpider(unittest.TestCase):
 
     @defer.inlineCallbacks
@@ -250,6 +252,33 @@ class CrawlerRunnerHasSpider(unittest.TestCase):
         self.assertEqual(runner.bootstrap_failed, True)
 
+    def test_crawler_runner_asyncio_enabled_true(self):
+        if self.reactor_pytest == 'asyncio':
+            runner = CrawlerRunner(settings={'ASYNCIO_REACTOR': True})
+        else:
+            msg = "ASYNCIO_REACTOR is on but the Twisted asyncio reactor is not installed"
+            with self.assertRaisesRegex(Exception, msg):
+                runner = CrawlerRunner(settings={'ASYNCIO_REACTOR': True})
+
+    @defer.inlineCallbacks
+    def test_crawler_process_asyncio_enabled_true(self):
+        with LogCapture(level=logging.DEBUG) as log:
+            if self.reactor_pytest == 'asyncio':
+                runner = CrawlerProcess(settings={'ASYNCIO_REACTOR': True})
+                yield runner.crawl(NoRequestsSpider)
+                self.assertIn("Asyncio reactor is installed", str(log))
+            else:
+                msg = "ASYNCIO_REACTOR is on but the Twisted asyncio reactor is not installed"
+                with self.assertRaisesRegex(Exception, msg):
+                    runner = CrawlerProcess(settings={'ASYNCIO_REACTOR': True})
+
+    @defer.inlineCallbacks
+    def test_crawler_process_asyncio_enabled_false(self):
+        runner = CrawlerProcess(settings={'ASYNCIO_REACTOR': False})
+        with LogCapture(level=logging.DEBUG) as log:
+            yield runner.crawl(NoRequestsSpider)
+        self.assertNotIn("Asyncio reactor is installed", str(log))
+
 
 class CrawlerProcessSubprocess(unittest.TestCase):
     script_dir = os.path.join(os.path.abspath(os.path.dirname(__file__)), 'CrawlerProcess')
 
@@ -265,3 +294,14 @@ class CrawlerProcessSubprocess(unittest.TestCase):
     def test_simple(self):
         log = self.run_script('simple.py')
         self.assertIn('Spider closed (finished)', log)
+        self.assertNotIn("DEBUG: Asyncio reactor is installed", log)
+
+    def test_asyncio_enabled_no_reactor(self):
+        log = self.run_script('asyncio_enabled_no_reactor.py')
+        self.assertIn('Spider closed (finished)', log)
+        self.assertIn("DEBUG: Asyncio reactor is installed", log)
+
+    def test_asyncio_enabled_reactor(self):
+        log = self.run_script('asyncio_enabled_reactor.py')
+        self.assertIn('Spider closed (finished)', log)
+        self.assertIn("DEBUG: Asyncio reactor is installed", log)
diff --git a/tests/test_downloadermiddleware.py b/tests/test_downloadermiddleware.py
index 6b9a5bee8..1b81ea949 100644
--- a/tests/test_downloadermiddleware.py
+++ b/tests/test_downloadermiddleware.py
@@ -1,5 +1,6 @@
 from unittest import mock
 
+from twisted.internet.defer import Deferred
 from twisted.trial.unittest import TestCase
 from twisted.python.failure import Failure
 
@@ -177,3 +178,31 @@
         dfd.addBoth(results.append)
         self.assertIsInstance(results[0], Failure)
         self.assertIsInstance(results[0].value, _InvalidOutput)
+
+
+class MiddlewareUsingDeferreds(ManagerTestCase):
+    """Middlewares using Deferreds should work"""
+
+    def test_deferred(self):
+        resp = Response('http://example.com/index.html')
+
+        class DeferredMiddleware:
+            def cb(self, result):
+                return result
+
+            def process_request(self, request, spider):
+                d = Deferred()
+                d.addCallback(self.cb)
+                d.callback(resp)
+                return d
+
+        self.mwman._add_middleware(DeferredMiddleware())
+        req = Request('http://example.com/index.html')
+        download_func = mock.MagicMock()
+        dfd = self.mwman.download(download_func, req, self.spider)
+        results = []
+        dfd.addBoth(results.append)
+        self._wait(dfd)
+
+        self.assertIs(results[0], resp)
+        self.assertFalse(download_func.called)
diff --git a/tests/test_utils_asyncio.py b/tests/test_utils_asyncio.py
new file mode 100644
index 000000000..44acc24af
--- /dev/null
+++ b/tests/test_utils_asyncio.py
@@ -0,0 +1,17 @@
+from unittest import TestCase
+
+from pytest import mark
+
+from scrapy.utils.asyncio import is_asyncio_reactor_installed, install_asyncio_reactor
+
+
+@mark.usefixtures('reactor_pytest')
+class AsyncioTest(TestCase):
+
+    def test_is_asyncio_reactor_installed(self):
+        # the result should depend only on the pytest --reactor argument
+        self.assertEqual(is_asyncio_reactor_installed(), self.reactor_pytest == 'asyncio')
+
+    def test_install_asyncio_reactor(self):
+        # this should do nothing
+        install_asyncio_reactor()
diff --git a/tox.ini b/tox.ini
index 1ec8f52e4..b62100026 100644
--- a/tox.ini
+++ b/tox.ini
@@ -96,3 +96,17 @@ changedir = {[docs]changedir}
 deps = {[docs]deps}
 commands =
     sphinx-build -W -b linkcheck . {envtmpdir}/linkcheck
+
+[asyncio]
+commands =
+    {[testenv]commands} --reactor=asyncio
+
+[testenv:py35-asyncio]
+basepython = python3.5
+deps = {[testenv]deps}
+commands = {[asyncio]commands}
+
+[testenv:py38-asyncio]
+basepython = python3.8
+deps = {[testenv]deps}
+commands = {[asyncio]commands}
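
A sketch of how the new pieces fit together (the test below is illustrative
and not part of the patch): the tox environments append ``--reactor=asyncio``,
an option provided by pytest-twisted (presumably the reason for the
``pytest-twisted >= 1.11`` pin), to the default test command; the
``reactor_pytest`` fixture exposes that option to test classes; and the
autouse ``only_asyncio`` fixture skips marked tests under any other reactor:

    import pytest

    from scrapy.utils.asyncio import is_asyncio_reactor_installed


    @pytest.mark.only_asyncio
    def test_requires_asyncio_reactor():
        # Runs under `tox -e py38-asyncio` or `pytest --reactor=asyncio`;
        # otherwise the autouse only_asyncio fixture in conftest.py skips it.
        assert is_asyncio_reactor_installed()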