Merge pull request #4010 from scrapy/asyncio-base

Base support for asyncio

Commit: bb991cd303
@@ -16,6 +16,8 @@ matrix:
       python: 3.5
     - env: TOXENV=pinned
       python: 3.5
+    - env: TOXENV=py35-asyncio
+      python: 3.5
     - env: TOXENV=py36
       python: 3.6
     - env: TOXENV=py37
@@ -24,6 +26,8 @@ matrix:
       python: 3.8
     - env: TOXENV=extra-deps
       python: 3.8
+    - env: TOXENV=py38-asyncio
+      python: 3.8
     - env: TOXENV=docs
       python: 3.7  # Keep in sync with .readthedocs.yml
 install:
conftest.py
@@ -34,3 +34,18 @@ def pytest_collection_modifyitems(session, config, items):
         items[:] = [item for item in items if isinstance(item, Flake8Item)]
     except ImportError:
         pass
+
+
+@pytest.fixture(scope='class')
+def reactor_pytest(request):
+    if not request.cls:
+        # doctests
+        return
+    request.cls.reactor_pytest = request.config.getoption("--reactor")
+    return request.cls.reactor_pytest
+
+
+@pytest.fixture(autouse=True)
+def only_asyncio(request, reactor_pytest):
+    if request.node.get_closest_marker('only_asyncio') and reactor_pytest != 'asyncio':
+        pytest.skip('This test is only run with --reactor=asyncio')
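The autouse only_asyncio fixture above skips marked tests whenever the session was not started with --reactor=asyncio. As an illustrative sketch (not part of the diff), a test opts in with the marker that pytest.ini registers below:

import pytest

from scrapy.utils.asyncio import is_asyncio_reactor_installed


@pytest.mark.only_asyncio
def test_runs_only_under_asyncio_reactor():
    # Skipped automatically by the autouse only_asyncio fixture unless
    # pytest was invoked with --reactor=asyncio.
    assert is_asyncio_reactor_installed()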
@@ -160,6 +160,27 @@ to any particular component. In that case the module of that component will be
 shown, typically an extension, middleware or pipeline. It also means that the
 component must be enabled in order for the setting to have any effect.

+.. setting:: ASYNCIO_REACTOR
+
+ASYNCIO_REACTOR
+---------------
+
+Default: ``False``
+
+Whether to install and require the Twisted reactor that uses the asyncio loop.
+
+When this option is set to ``True``, Scrapy will require
+:class:`~twisted.internet.asyncioreactor.AsyncioSelectorReactor`. It will
+install this reactor if no reactor is installed yet, such as when using the
+``scrapy`` script or :class:`~scrapy.crawler.CrawlerProcess`. If you are using
+:class:`~scrapy.crawler.CrawlerRunner`, you need to install the correct reactor
+manually. If a different reactor is installed outside Scrapy, Scrapy will raise
+an exception.
+
+The default value for this option is currently ``False`` to maintain backward
+compatibility and avoid possible problems caused by using a different Twisted
+reactor.
+
 .. setting:: AWS_ACCESS_KEY_ID

 AWS_ACCESS_KEY_ID
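The CrawlerRunner caveat in the new setting docs is worth spelling out. A minimal sketch, assuming the same pattern as the tests/CrawlerProcess/asyncio_enabled_reactor.py script added later in this diff — the asyncio reactor must be installed before anything imports twisted.internet.reactor:

import asyncio

from twisted.internet import asyncioreactor
asyncioreactor.install(asyncio.get_event_loop())  # before the default reactor gets installed

from scrapy.crawler import CrawlerRunner

# With AsyncioSelectorReactor in place, the ASYNCIO_REACTOR check passes.
runner = CrawlerRunner(settings={'ASYNCIO_REACTOR': True})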
@@ -18,6 +18,8 @@ addopts =
     --ignore=docs/topics/telnetconsole.rst
     --ignore=docs/utils
 twisted = 1
+markers =
+    only_asyncio: marks tests as only enabled when --reactor=asyncio is passed
 flake8-ignore =
     # Files that are only meant to provide top-level imports are expected not
     # to use any of their imports:
@@ -114,6 +116,7 @@ flake8-ignore =
     scrapy/spiders/feed.py E501
     scrapy/spiders/sitemap.py E501
     # scrapy/utils
+    scrapy/utils/asyncio.py E501
     scrapy/utils/benchserver.py E501
     scrapy/utils/conf.py E402 E501
     scrapy/utils/console.py E306 E305
@@ -223,6 +226,7 @@ flake8-ignore =
     tests/test_spidermiddleware_output_chain.py E501 E226
     tests/test_spidermiddleware_referer.py E501 F841 E125 E201 E124 E501 E241 E121
     tests/test_squeues.py E501 E701 E741
+    tests/test_utils_asyncio.py E501
     tests/test_utils_conf.py E501 E303 E128
     tests/test_utils_curl.py E501
     tests/test_utils_datatypes.py E402 E501 E305
@@ -6,8 +6,8 @@ See documentation in docs/topics/shell.rst
 from threading import Thread

 from scrapy.commands import ScrapyCommand
-from scrapy.shell import Shell
 from scrapy.http import Request
+from scrapy.shell import Shell
 from scrapy.utils.spider import spidercls_for_request, DefaultSpider
 from scrapy.utils.url import guess_scheme

@@ -3,7 +3,7 @@ import pprint
 import signal
 import warnings

-from twisted.internet import reactor, defer
+from twisted.internet import defer
 from zope.interface.verify import verifyClass, DoesNotImplement

 from scrapy import Spider
@@ -14,6 +14,7 @@ from scrapy.extension import ExtensionManager
 from scrapy.settings import overridden_settings, Settings
 from scrapy.signalmanager import SignalManager
 from scrapy.exceptions import ScrapyDeprecationWarning
+from scrapy.utils.asyncio import install_asyncio_reactor, is_asyncio_reactor_installed
 from scrapy.utils.ossignal import install_shutdown_handlers, signal_names
 from scrapy.utils.misc import load_object
 from scrapy.utils.log import (
@@ -136,6 +137,7 @@ class CrawlerRunner(object):
         self._crawlers = set()
         self._active = set()
         self.bootstrap_failed = False
+        self._handle_asyncio_reactor()

     @property
     def spiders(self):
@@ -229,6 +231,11 @@ class CrawlerRunner(object):
         while self._active:
             yield defer.DeferredList(self._active)

+    def _handle_asyncio_reactor(self):
+        if self.settings.getbool('ASYNCIO_REACTOR') and not is_asyncio_reactor_installed():
+            raise Exception("ASYNCIO_REACTOR is on but the Twisted asyncio "
+                            "reactor is not installed.")
+

 class CrawlerProcess(CrawlerRunner):
     """
@@ -261,6 +268,7 @@ class CrawlerProcess(CrawlerRunner):
         log_scrapy_info(self.settings)

     def _signal_shutdown(self, signum, _):
+        from twisted.internet import reactor
         install_shutdown_handlers(self._signal_kill)
         signame = signal_names[signum]
         logger.info("Received %(signame)s, shutting down gracefully. Send again to force ",
@@ -268,6 +276,7 @@ class CrawlerProcess(CrawlerRunner):
         reactor.callFromThread(self._graceful_stop_reactor)

     def _signal_kill(self, signum, _):
+        from twisted.internet import reactor
         install_shutdown_handlers(signal.SIG_IGN)
         signame = signal_names[signum]
         logger.info('Received %(signame)s twice, forcing unclean shutdown',
@@ -286,6 +295,7 @@ class CrawlerProcess(CrawlerRunner):
         :param boolean stop_after_crawl: stop or not the reactor when all
             crawlers have finished
         """
+        from twisted.internet import reactor
         if stop_after_crawl:
             d = self.join()
             # Don't start the reactor if the deferreds are already fired
@@ -300,6 +310,7 @@ class CrawlerProcess(CrawlerRunner):
         reactor.run(installSignalHandlers=False)  # blocking call

     def _get_dns_resolver(self):
+        from twisted.internet import reactor
         if self.settings.getbool('DNSCACHE_ENABLED'):
             cache_size = self.settings.getint('DNSCACHE_SIZE')
         else:
@@ -316,11 +327,17 @@ class CrawlerProcess(CrawlerRunner):
         return d

     def _stop_reactor(self, _=None):
+        from twisted.internet import reactor
         try:
             reactor.stop()
         except RuntimeError:  # raised if already stopped or in shutdown stage
             pass

+    def _handle_asyncio_reactor(self):
+        if self.settings.getbool('ASYNCIO_REACTOR'):
+            install_asyncio_reactor()
+        super()._handle_asyncio_reactor()
+

 def _get_spider_loader(settings):
     """ Get SpiderLoader instance from settings """
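Note the pattern running through these hunks: every module-level "from twisted.internet import reactor" moves into the function body that uses it, because that import installs the default reactor as a side effect and would make a later asyncioreactor.install() fail. A minimal sketch of the constraint (illustrative, not part of the diff):

import asyncio

from twisted.internet import asyncioreactor
from twisted.internet.error import ReactorAlreadyInstalledError

try:
    # Works only while no reactor has been installed yet.
    asyncioreactor.install(asyncio.get_event_loop())
except ReactorAlreadyInstalledError:
    # A premature "from twisted.internet import reactor" anywhere in the
    # process already installed the default reactor; too late to switch.
    pass

from twisted.internet import reactor  # resolves to whichever reactor won
print(type(reactor))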
@@ -19,6 +19,8 @@ from os.path import join, abspath, dirname

 AJAXCRAWL_ENABLED = False

+ASYNCIO_REACTOR = False
+
 AUTOTHROTTLE_ENABLED = False
 AUTOTHROTTLE_DEBUG = False
 AUTOTHROTTLE_MAX_DELAY = 60.0
@@ -7,7 +7,7 @@ import os
 import signal
 import warnings

-from twisted.internet import reactor, threads, defer
+from twisted.internet import threads, defer
 from twisted.python import threadable
 from w3lib.url import any_to_uri

@@ -98,6 +98,7 @@ class Shell(object):
         return spider

     def fetch(self, request_or_url, spider=None, redirect=True, **kwargs):
+        from twisted.internet import reactor
         if isinstance(request_or_url, Request):
             request = request_or_url
         else:
scrapy/utils/asyncio.py (new file)
@@ -0,0 +1,17 @@
+import asyncio
+from contextlib import suppress
+
+from twisted.internet import asyncioreactor
+from twisted.internet.error import ReactorAlreadyInstalledError
+
+
+def install_asyncio_reactor():
+    """ Tries to install AsyncioSelectorReactor
+    """
+    with suppress(ReactorAlreadyInstalledError):
+        asyncioreactor.install(asyncio.get_event_loop())
+
+
+def is_asyncio_reactor_installed():
+    from twisted.internet import reactor
+    return isinstance(reactor, asyncioreactor.AsyncioSelectorReactor)
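How the two helpers are meant to be called, as exercised by tests/test_utils_asyncio.py further down (an illustrative sketch, not part of the diff):

from scrapy.utils.asyncio import install_asyncio_reactor, is_asyncio_reactor_installed

install_asyncio_reactor()  # silently a no-op if any reactor is already installed
if is_asyncio_reactor_installed():
    print("AsyncioSelectorReactor is active")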
@@ -1,8 +1,7 @@
 """
 Helper functions for dealing with Twisted deferreds
 """
-
-from twisted.internet import defer, reactor, task
+from twisted.internet import defer, task
 from twisted.python import failure

 from scrapy.exceptions import IgnoreRequest
@@ -15,6 +14,7 @@ def defer_fail(_failure):
     It delays by 100ms so reactor has a chance to go through readers and writers
     before attending pending delayed calls, so do not set delay to zero.
     """
+    from twisted.internet import reactor
     d = defer.Deferred()
     reactor.callLater(0.1, d.errback, _failure)
     return d
@@ -27,6 +27,7 @@ def defer_succeed(result):
     It delays by 100ms so reactor has a chance to go trough readers and writers
     before attending pending delayed calls, so do not set delay to zero.
     """
+    from twisted.internet import reactor
     d = defer.Deferred()
     reactor.callLater(0.1, d.callback, result)
     return d
@@ -11,6 +11,7 @@ from twisted.python import log as twisted_log
 import scrapy
 from scrapy.settings import Settings
 from scrapy.exceptions import ScrapyDeprecationWarning
+from scrapy.utils.asyncio import is_asyncio_reactor_installed
 from scrapy.utils.versions import scrapy_components_versions


@@ -148,6 +149,8 @@ def log_scrapy_info(settings):
                 {'versions': ", ".join("%s %s" % (name, version)
                     for name, version in scrapy_components_versions()
                     if name != "Scrapy")})
+    if is_asyncio_reactor_installed():
+        logger.debug("Asyncio reactor is installed")


 class StreamLogger(object):
@@ -1,7 +1,5 @@
 import signal

-from twisted.internet import reactor
-

 signal_names = {}
 for signame in dir(signal):
@@ -17,6 +15,7 @@ def install_shutdown_handlers(function, override_sigint=True):
     SIGINT handler won't be install if there is already a handler in place
     (e.g. Pdb)
     """
+    from twisted.internet import reactor
     reactor._handleSignals()
     signal.signal(signal.SIGTERM, function)
     if signal.getsignal(signal.SIGINT) == signal.default_int_handler or \
@@ -1,8 +1,9 @@
-from twisted.internet import reactor, error
+from twisted.internet import error


 def listen_tcp(portrange, host, factory):
     """Like reactor.listenTCP but tries different ports in a range."""
+    from twisted.internet import reactor
     assert len(portrange) <= 2, "invalid portrange: %s" % portrange
     if not portrange:
         return reactor.listenTCP(0, factory, interface=host)
@@ -30,6 +31,7 @@ class CallLaterOnce(object):
         self._call = None

     def schedule(self, delay=0):
+        from twisted.internet import reactor
         if self._call is None:
             self._call = reactor.callLater(delay, self)

tests/CrawlerProcess/asyncio_enabled_no_reactor.py (new file)
@@ -0,0 +1,17 @@
+import scrapy
+from scrapy.crawler import CrawlerProcess
+
+
+class NoRequestsSpider(scrapy.Spider):
+    name = 'no_request'
+
+    def start_requests(self):
+        return []
+
+
+process = CrawlerProcess(settings={
+    'ASYNCIO_REACTOR': True,
+})
+
+process.crawl(NoRequestsSpider)
+process.start()
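This script deliberately installs no reactor itself: it exercises the path where CrawlerProcess sees ASYNCIO_REACTOR enabled and installs AsyncioSelectorReactor on its own. The next script covers the complementary path, installing the reactor by hand before creating the process.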
tests/CrawlerProcess/asyncio_enabled_reactor.py (new file)
@@ -0,0 +1,22 @@
+import asyncio
+
+from twisted.internet import asyncioreactor
+asyncioreactor.install(asyncio.get_event_loop())
+
+import scrapy
+from scrapy.crawler import CrawlerProcess
+
+
+class NoRequestsSpider(scrapy.Spider):
+    name = 'no_request'
+
+    def start_requests(self):
+        return []
+
+
+process = CrawlerProcess(settings={
+    'ASYNCIO_REACTOR': True,
+})
+
+process.crawl(NoRequestsSpider)
+process.start()
@@ -4,7 +4,7 @@ mitmproxy; python_version >= '3.6'
 mitmproxy<4.0.0; python_version < '3.6'
 pytest
 pytest-cov
-pytest-twisted
+pytest-twisted >= 1.11
 pytest-xdist
 sybil
 testfixtures
@@ -295,6 +295,14 @@ class BadSpider(scrapy.Spider):
         self.assertIn("start_requests", log)
         self.assertIn("badspider.py", log)

+    def test_asyncio_enabled_true(self):
+        log = self.get_log(self.debug_log_spider, args=['-s', 'ASYNCIO_REACTOR=True'])
+        self.assertIn("DEBUG: Asyncio reactor is installed", log)
+
+    def test_asyncio_enabled_false(self):
+        log = self.get_log(self.debug_log_spider, args=['-s', 'ASYNCIO_REACTOR=False'])
+        self.assertNotIn("DEBUG: Asyncio reactor is installed", log)
+

 class BenchCommandTest(CommandTest):

@@ -4,9 +4,10 @@ import subprocess
 import sys
 import warnings

+from pytest import raises, mark
+from testfixtures import LogCapture
 from twisted.internet import defer
 from twisted.trial import unittest
-from pytest import raises

 import scrapy
 from scrapy.crawler import Crawler, CrawlerRunner, CrawlerProcess
@@ -207,6 +208,7 @@ class NoRequestsSpider(scrapy.Spider):
         return []


+@mark.usefixtures('reactor_pytest')
 class CrawlerRunnerHasSpider(unittest.TestCase):

     @defer.inlineCallbacks
@@ -250,6 +252,33 @@ class CrawlerRunnerHasSpider(unittest.TestCase):

         self.assertEqual(runner.bootstrap_failed, True)

+    def test_crawler_runner_asyncio_enabled_true(self):
+        if self.reactor_pytest == 'asyncio':
+            runner = CrawlerRunner(settings={'ASYNCIO_REACTOR': True})
+        else:
+            msg = "ASYNCIO_REACTOR is on but the Twisted asyncio reactor is not installed"
+            with self.assertRaisesRegex(Exception, msg):
+                runner = CrawlerRunner(settings={'ASYNCIO_REACTOR': True})
+
+    @defer.inlineCallbacks
+    def test_crawler_process_asyncio_enabled_true(self):
+        with LogCapture(level=logging.DEBUG) as log:
+            if self.reactor_pytest == 'asyncio':
+                runner = CrawlerProcess(settings={'ASYNCIO_REACTOR': True})
+                yield runner.crawl(NoRequestsSpider)
+                self.assertIn("Asyncio reactor is installed", str(log))
+            else:
+                msg = "ASYNCIO_REACTOR is on but the Twisted asyncio reactor is not installed"
+                with self.assertRaisesRegex(Exception, msg):
+                    runner = CrawlerProcess(settings={'ASYNCIO_REACTOR': True})
+
+    @defer.inlineCallbacks
+    def test_crawler_process_asyncio_enabled_false(self):
+        runner = CrawlerProcess(settings={'ASYNCIO_REACTOR': False})
+        with LogCapture(level=logging.DEBUG) as log:
+            yield runner.crawl(NoRequestsSpider)
+        self.assertNotIn("Asyncio reactor is installed", str(log))
+

 class CrawlerProcessSubprocess(unittest.TestCase):
     script_dir = os.path.join(os.path.abspath(os.path.dirname(__file__)), 'CrawlerProcess')
@@ -265,3 +294,14 @@ class CrawlerProcessSubprocess(unittest.TestCase):
     def test_simple(self):
         log = self.run_script('simple.py')
         self.assertIn('Spider closed (finished)', log)
+        self.assertNotIn("DEBUG: Asyncio reactor is installed", log)
+
+    def test_asyncio_enabled_no_reactor(self):
+        log = self.run_script('asyncio_enabled_no_reactor.py')
+        self.assertIn('Spider closed (finished)', log)
+        self.assertIn("DEBUG: Asyncio reactor is installed", log)
+
+    def test_asyncio_enabled_reactor(self):
+        log = self.run_script('asyncio_enabled_reactor.py')
+        self.assertIn('Spider closed (finished)', log)
+        self.assertIn("DEBUG: Asyncio reactor is installed", log)
@@ -1,5 +1,6 @@
 from unittest import mock

+from twisted.internet.defer import Deferred
 from twisted.trial.unittest import TestCase
 from twisted.python.failure import Failure

@@ -177,3 +178,31 @@ class ProcessExceptionInvalidOutput(ManagerTestCase):
         dfd.addBoth(results.append)
         self.assertIsInstance(results[0], Failure)
         self.assertIsInstance(results[0].value, _InvalidOutput)
+
+
+class MiddlewareUsingDeferreds(ManagerTestCase):
+    """Middlewares using Deferreds should work"""
+
+    def test_deferred(self):
+        resp = Response('http://example.com/index.html')
+
+        class DeferredMiddleware:
+            def cb(self, result):
+                return result
+
+            def process_request(self, request, spider):
+                d = Deferred()
+                d.addCallback(self.cb)
+                d.callback(resp)
+                return d
+
+        self.mwman._add_middleware(DeferredMiddleware())
+        req = Request('http://example.com/index.html')
+        download_func = mock.MagicMock()
+        dfd = self.mwman.download(download_func, req, self.spider)
+        results = []
+        dfd.addBoth(results.append)
+        self._wait(dfd)
+
+        self.assertIs(results[0], resp)
+        self.assertFalse(download_func.called)
tests/test_utils_asyncio.py (new file)
@@ -0,0 +1,17 @@
+from unittest import TestCase
+
+from pytest import mark
+
+from scrapy.utils.asyncio import is_asyncio_reactor_installed, install_asyncio_reactor
+
+
+@mark.usefixtures('reactor_pytest')
+class AsyncioTest(TestCase):
+
+    def test_is_asyncio_reactor_installed(self):
+        # the result should depend only on the pytest --reactor argument
+        self.assertEqual(is_asyncio_reactor_installed(), self.reactor_pytest == 'asyncio')
+
+    def test_install_asyncio_reactor(self):
+        # this should do nothing
+        install_asyncio_reactor()
tox.ini
@@ -96,3 +96,17 @@ changedir = {[docs]changedir}
 deps = {[docs]deps}
 commands =
     sphinx-build -W -b linkcheck . {envtmpdir}/linkcheck
+
+[asyncio]
+commands =
+    {[testenv]commands} --reactor=asyncio
+
+[testenv:py35-asyncio]
+basepython = python3.5
+deps = {[testenv]deps}
+commands = {[asyncio]commands}
+
+[testenv:py38-asyncio]
+basepython = python3.8
+deps = {[testenv]deps}
+commands = {[asyncio]commands}
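These environments append --reactor=asyncio to the default test commands, so the whole suite can be exercised under the asyncio reactor with, for example, tox -e py38-asyncio.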