
Merge pull request #4010 from scrapy/asyncio-base

Base support for asyncio
Mikhail Korobov, 2019-12-30 00:51:28 +05:00, committed by GitHub
commit bb991cd303
21 changed files with 243 additions and 10 deletions

@@ -16,6 +16,8 @@ matrix:
python: 3.5
- env: TOXENV=pinned
python: 3.5
- env: TOXENV=py35-asyncio
python: 3.5
- env: TOXENV=py36
python: 3.6
- env: TOXENV=py37
@@ -24,6 +26,8 @@ matrix:
python: 3.8
- env: TOXENV=extra-deps
python: 3.8
- env: TOXENV=py38-asyncio
python: 3.8
- env: TOXENV=docs
python: 3.7 # Keep in sync with .readthedocs.yml
install:

@@ -34,3 +34,18 @@ def pytest_collection_modifyitems(session, config, items):
items[:] = [item for item in items if isinstance(item, Flake8Item)]
except ImportError:
pass
@pytest.fixture(scope='class')
def reactor_pytest(request):
if not request.cls:
# doctests
return
request.cls.reactor_pytest = request.config.getoption("--reactor")
return request.cls.reactor_pytest
@pytest.fixture(autouse=True)
def only_asyncio(request, reactor_pytest):
if request.node.get_closest_marker('only_asyncio') and reactor_pytest != 'asyncio':
pytest.skip('This test is only run with --reactor=asyncio')
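For reference, a test opts into these asyncio-only runs through the
only_asyncio marker that the fixture above checks; a minimal sketch (the class
and test names are illustrative, and it assumes the --reactor command-line
option is registered elsewhere in conftest.py):

    import pytest

    from scrapy.utils.asyncio import is_asyncio_reactor_installed

    class TestNeedsAsyncio:
        @pytest.mark.only_asyncio()
        def test_reactor(self):
            # The autouse only_asyncio fixture skips this test unless the
            # suite was started with --reactor=asyncio.
            assert is_asyncio_reactor_installed()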

@@ -160,6 +160,27 @@ to any particular component. In that case the module of that component will be
shown, typically an extension, middleware or pipeline. It also means that the
component must be enabled in order for the setting to have any effect.
.. setting:: ASYNCIO_REACTOR
ASYNCIO_REACTOR
---------------
Default: ``False``
Whether to install and require the Twisted reactor that uses the asyncio loop.
When this option is set to ``True``, Scrapy will require
:class:`~twisted.internet.asyncioreactor.AsyncioSelectorReactor`. It will
install this reactor if no reactor is installed yet, such as when using the
``scrapy`` script or :class:`~scrapy.crawler.CrawlerProcess`. If you are using
:class:`~scrapy.crawler.CrawlerRunner`, you need to install the correct reactor
manually. If a different reactor has already been installed outside Scrapy,
Scrapy will raise an exception.
The default value for this option is currently ``False`` to maintain backward
compatibility and avoid possible problems caused by using a different Twisted
reactor.
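For example, a script that uses :class:`~scrapy.crawler.CrawlerRunner` must
install the asyncio reactor before anything imports
``twisted.internet.reactor``; a minimal sketch, following the same steps as
the asyncio_enabled_reactor.py test script added further down in this diff::

    import asyncio

    from twisted.internet import asyncioreactor
    asyncioreactor.install(asyncio.get_event_loop())

    from scrapy.crawler import CrawlerRunner

    # Raises if the installed reactor is not the asyncio one.
    runner = CrawlerRunner(settings={'ASYNCIO_REACTOR': True})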
.. setting:: AWS_ACCESS_KEY_ID
AWS_ACCESS_KEY_ID

@@ -18,6 +18,8 @@ addopts =
--ignore=docs/topics/telnetconsole.rst
--ignore=docs/utils
twisted = 1
markers =
only_asyncio: marks tests as only enabled when --reactor=asyncio is passed
flake8-ignore =
# Files that are only meant to provide top-level imports are expected not
# to use any of their imports:
@@ -114,6 +116,7 @@ flake8-ignore =
scrapy/spiders/feed.py E501
scrapy/spiders/sitemap.py E501
# scrapy/utils
scrapy/utils/asyncio.py E501
scrapy/utils/benchserver.py E501
scrapy/utils/conf.py E402 E501
scrapy/utils/console.py E306 E305
@@ -223,6 +226,7 @@ flake8-ignore =
tests/test_spidermiddleware_output_chain.py E501 E226
tests/test_spidermiddleware_referer.py E501 F841 E125 E201 E124 E501 E241 E121
tests/test_squeues.py E501 E701 E741
tests/test_utils_asyncio.py E501
tests/test_utils_conf.py E501 E303 E128
tests/test_utils_curl.py E501
tests/test_utils_datatypes.py E402 E501 E305

@@ -6,8 +6,8 @@ See documentation in docs/topics/shell.rst
from threading import Thread
from scrapy.commands import ScrapyCommand
from scrapy.shell import Shell
from scrapy.http import Request
from scrapy.shell import Shell
from scrapy.utils.spider import spidercls_for_request, DefaultSpider
from scrapy.utils.url import guess_scheme

@@ -3,7 +3,7 @@ import pprint
import signal
import warnings
from twisted.internet import reactor, defer
from twisted.internet import defer
from zope.interface.verify import verifyClass, DoesNotImplement
from scrapy import Spider
@@ -14,6 +14,7 @@ from scrapy.extension import ExtensionManager
from scrapy.settings import overridden_settings, Settings
from scrapy.signalmanager import SignalManager
from scrapy.exceptions import ScrapyDeprecationWarning
from scrapy.utils.asyncio import install_asyncio_reactor, is_asyncio_reactor_installed
from scrapy.utils.ossignal import install_shutdown_handlers, signal_names
from scrapy.utils.misc import load_object
from scrapy.utils.log import (
@@ -136,6 +137,7 @@ class CrawlerRunner(object):
self._crawlers = set()
self._active = set()
self.bootstrap_failed = False
self._handle_asyncio_reactor()
@property
def spiders(self):
@@ -229,6 +231,11 @@ class CrawlerRunner(object):
while self._active:
yield defer.DeferredList(self._active)
def _handle_asyncio_reactor(self):
if self.settings.getbool('ASYNCIO_REACTOR') and not is_asyncio_reactor_installed():
raise Exception("ASYNCIO_REACTOR is on but the Twisted asyncio "
"reactor is not installed.")
class CrawlerProcess(CrawlerRunner):
"""
@@ -261,6 +268,7 @@ class CrawlerProcess(CrawlerRunner):
log_scrapy_info(self.settings)
def _signal_shutdown(self, signum, _):
from twisted.internet import reactor
install_shutdown_handlers(self._signal_kill)
signame = signal_names[signum]
logger.info("Received %(signame)s, shutting down gracefully. Send again to force ",
@@ -268,6 +276,7 @@ class CrawlerProcess(CrawlerRunner):
reactor.callFromThread(self._graceful_stop_reactor)
def _signal_kill(self, signum, _):
from twisted.internet import reactor
install_shutdown_handlers(signal.SIG_IGN)
signame = signal_names[signum]
logger.info('Received %(signame)s twice, forcing unclean shutdown',
@@ -286,6 +295,7 @@ class CrawlerProcess(CrawlerRunner):
:param boolean stop_after_crawl: whether to stop the reactor once all
crawlers have finished
"""
from twisted.internet import reactor
if stop_after_crawl:
d = self.join()
# Don't start the reactor if the deferreds are already fired
@@ -300,6 +310,7 @@ class CrawlerProcess(CrawlerRunner):
reactor.run(installSignalHandlers=False) # blocking call
def _get_dns_resolver(self):
from twisted.internet import reactor
if self.settings.getbool('DNSCACHE_ENABLED'):
cache_size = self.settings.getint('DNSCACHE_SIZE')
else:
@@ -316,11 +327,17 @@ class CrawlerProcess(CrawlerRunner):
return d
def _stop_reactor(self, _=None):
from twisted.internet import reactor
try:
reactor.stop()
except RuntimeError: # raised if already stopped or in shutdown stage
pass
def _handle_asyncio_reactor(self):
if self.settings.getbool('ASYNCIO_REACTOR'):
install_asyncio_reactor()
super()._handle_asyncio_reactor()
def _get_spider_loader(settings):
""" Get SpiderLoader instance from settings """

@@ -19,6 +19,8 @@ from os.path import join, abspath, dirname
AJAXCRAWL_ENABLED = False
ASYNCIO_REACTOR = False
AUTOTHROTTLE_ENABLED = False
AUTOTHROTTLE_DEBUG = False
AUTOTHROTTLE_MAX_DELAY = 60.0

@@ -7,7 +7,7 @@ import os
import signal
import warnings
from twisted.internet import reactor, threads, defer
from twisted.internet import threads, defer
from twisted.python import threadable
from w3lib.url import any_to_uri
@@ -98,6 +98,7 @@ class Shell(object):
return spider
def fetch(self, request_or_url, spider=None, redirect=True, **kwargs):
from twisted.internet import reactor
if isinstance(request_or_url, Request):
request = request_or_url
else:

scrapy/utils/asyncio.py (new file)

@@ -0,0 +1,17 @@
import asyncio
from contextlib import suppress
from twisted.internet import asyncioreactor
from twisted.internet.error import ReactorAlreadyInstalledError
def install_asyncio_reactor():
""" Tries to install AsyncioSelectorReactor
"""
with suppress(ReactorAlreadyInstalledError):
asyncioreactor.install(asyncio.get_event_loop())
def is_asyncio_reactor_installed():
from twisted.internet import reactor
return isinstance(reactor, asyncioreactor.AsyncioSelectorReactor)
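Taken together, the two helpers support the startup flow that CrawlerProcess
wires up in scrapy/crawler.py above; a minimal sketch of calling them
directly:

    from scrapy.utils.asyncio import (
        install_asyncio_reactor,
        is_asyncio_reactor_installed,
    )

    install_asyncio_reactor()  # no-op if any reactor is already installed
    if not is_asyncio_reactor_installed():
        # A non-asyncio reactor was installed first, e.g. by an earlier
        # `from twisted.internet import reactor` import.
        raise RuntimeError("Twisted asyncio reactor is not installed")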

@@ -1,8 +1,7 @@
"""
Helper functions for dealing with Twisted deferreds
"""
from twisted.internet import defer, reactor, task
from twisted.internet import defer, task
from twisted.python import failure
from scrapy.exceptions import IgnoreRequest
@@ -15,6 +14,7 @@ def defer_fail(_failure):
It delays by 100ms so the reactor has a chance to go through readers and writers
before attending pending delayed calls, so do not set delay to zero.
"""
from twisted.internet import reactor
d = defer.Deferred()
reactor.callLater(0.1, d.errback, _failure)
return d
@@ -27,6 +27,7 @@ def defer_succeed(result):
It delays by 100ms so the reactor has a chance to go through readers and writers
before attending pending delayed calls, so do not set delay to zero.
"""
from twisted.internet import reactor
d = defer.Deferred()
reactor.callLater(0.1, d.callback, result)
return d
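To see the 100ms detour through the reactor in action, a minimal standalone
sketch (the 0.5s shutdown timer is only there to end the example):

    from twisted.internet import reactor

    from scrapy.utils.defer import defer_succeed

    d = defer_succeed('result')  # fires via reactor.callLater(0.1, ...)
    d.addCallback(print)         # runs only once the reactor is spinning
    reactor.callLater(0.5, reactor.stop)
    reactor.run()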

@@ -11,6 +11,7 @@ from twisted.python import log as twisted_log
import scrapy
from scrapy.settings import Settings
from scrapy.exceptions import ScrapyDeprecationWarning
from scrapy.utils.asyncio import is_asyncio_reactor_installed
from scrapy.utils.versions import scrapy_components_versions
@@ -148,6 +149,8 @@ def log_scrapy_info(settings):
{'versions': ", ".join("%s %s" % (name, version)
for name, version in scrapy_components_versions()
if name != "Scrapy")})
if is_asyncio_reactor_installed():
logger.debug("Asyncio reactor is installed")
class StreamLogger(object):

@@ -1,7 +1,5 @@
import signal
from twisted.internet import reactor
signal_names = {}
for signame in dir(signal):
@@ -17,6 +15,7 @@ def install_shutdown_handlers(function, override_sigint=True):
SIGINT handler won't be installed if there is already a handler in place
(e.g. Pdb)
"""
from twisted.internet import reactor
reactor._handleSignals()
signal.signal(signal.SIGTERM, function)
if signal.getsignal(signal.SIGINT) == signal.default_int_handler or \

@@ -1,8 +1,9 @@
from twisted.internet import reactor, error
from twisted.internet import error
def listen_tcp(portrange, host, factory):
"""Like reactor.listenTCP but tries different ports in a range."""
from twisted.internet import reactor
assert len(portrange) <= 2, "invalid portrange: %s" % portrange
if not portrange:
return reactor.listenTCP(0, factory, interface=host)
@@ -30,6 +31,7 @@ class CallLaterOnce(object):
self._call = None
def schedule(self, delay=0):
from twisted.internet import reactor
if self._call is None:
self._call = reactor.callLater(delay, self)

@@ -0,0 +1,17 @@
import scrapy
from scrapy.crawler import CrawlerProcess
class NoRequestsSpider(scrapy.Spider):
name = 'no_request'
def start_requests(self):
return []
process = CrawlerProcess(settings={
'ASYNCIO_REACTOR': True,
})
process.crawl(NoRequestsSpider)
process.start()

@@ -0,0 +1,22 @@
import asyncio
from twisted.internet import asyncioreactor
asyncioreactor.install(asyncio.get_event_loop())
import scrapy
from scrapy.crawler import CrawlerProcess
class NoRequestsSpider(scrapy.Spider):
name = 'no_request'
def start_requests(self):
return []
process = CrawlerProcess(settings={
'ASYNCIO_REACTOR': True,
})
process.crawl(NoRequestsSpider)
process.start()
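The only difference from the previous script is that this one installs the
asyncio reactor explicitly before scrapy.crawler is imported; both scripts end
up on the same reactor, which is why the subprocess tests below expect the
same debug line from each.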

@@ -4,7 +4,7 @@ mitmproxy; python_version >= '3.6'
mitmproxy<4.0.0; python_version < '3.6'
pytest
pytest-cov
pytest-twisted
pytest-twisted >= 1.11
pytest-xdist
sybil
testfixtures

@@ -295,6 +295,14 @@ class BadSpider(scrapy.Spider):
self.assertIn("start_requests", log)
self.assertIn("badspider.py", log)
def test_asyncio_enabled_true(self):
log = self.get_log(self.debug_log_spider, args=['-s', 'ASYNCIO_REACTOR=True'])
self.assertIn("DEBUG: Asyncio reactor is installed", log)
def test_asyncio_enabled_false(self):
log = self.get_log(self.debug_log_spider, args=['-s', 'ASYNCIO_REACTOR=False'])
self.assertNotIn("DEBUG: Asyncio reactor is installed", log)
class BenchCommandTest(CommandTest):

@@ -4,9 +4,10 @@ import subprocess
import sys
import warnings
from pytest import raises, mark
from testfixtures import LogCapture
from twisted.internet import defer
from twisted.trial import unittest
from pytest import raises
import scrapy
from scrapy.crawler import Crawler, CrawlerRunner, CrawlerProcess
@@ -207,6 +208,7 @@ class NoRequestsSpider(scrapy.Spider):
return []
@mark.usefixtures('reactor_pytest')
class CrawlerRunnerHasSpider(unittest.TestCase):
@defer.inlineCallbacks
@@ -250,6 +252,33 @@ class CrawlerRunnerHasSpider(unittest.TestCase):
self.assertEqual(runner.bootstrap_failed, True)
def test_crawler_runner_asyncio_enabled_true(self):
if self.reactor_pytest == 'asyncio':
runner = CrawlerRunner(settings={'ASYNCIO_REACTOR': True})
else:
msg = "ASYNCIO_REACTOR is on but the Twisted asyncio reactor is not installed"
with self.assertRaisesRegex(Exception, msg):
runner = CrawlerRunner(settings={'ASYNCIO_REACTOR': True})
@defer.inlineCallbacks
def test_crawler_process_asyncio_enabled_true(self):
with LogCapture(level=logging.DEBUG) as log:
if self.reactor_pytest == 'asyncio':
runner = CrawlerProcess(settings={'ASYNCIO_REACTOR': True})
yield runner.crawl(NoRequestsSpider)
self.assertIn("Asyncio reactor is installed", str(log))
else:
msg = "ASYNCIO_REACTOR is on but the Twisted asyncio reactor is not installed"
with self.assertRaisesRegex(Exception, msg):
runner = CrawlerProcess(settings={'ASYNCIO_REACTOR': True})
@defer.inlineCallbacks
def test_crawler_process_asyncio_enabled_false(self):
runner = CrawlerProcess(settings={'ASYNCIO_REACTOR': False})
with LogCapture(level=logging.DEBUG) as log:
yield runner.crawl(NoRequestsSpider)
self.assertNotIn("Asyncio reactor is installed", str(log))
class CrawlerProcessSubprocess(unittest.TestCase):
script_dir = os.path.join(os.path.abspath(os.path.dirname(__file__)), 'CrawlerProcess')
@@ -265,3 +294,14 @@ class CrawlerProcessSubprocess(unittest.TestCase):
def test_simple(self):
log = self.run_script('simple.py')
self.assertIn('Spider closed (finished)', log)
self.assertNotIn("DEBUG: Asyncio reactor is installed", log)
def test_asyncio_enabled_no_reactor(self):
log = self.run_script('asyncio_enabled_no_reactor.py')
self.assertIn('Spider closed (finished)', log)
self.assertIn("DEBUG: Asyncio reactor is installed", log)
def test_asyncio_enabled_reactor(self):
log = self.run_script('asyncio_enabled_reactor.py')
self.assertIn('Spider closed (finished)', log)
self.assertIn("DEBUG: Asyncio reactor is installed", log)

@@ -1,5 +1,6 @@
from unittest import mock
from twisted.internet.defer import Deferred
from twisted.trial.unittest import TestCase
from twisted.python.failure import Failure
@@ -177,3 +178,31 @@ class ProcessExceptionInvalidOutput(ManagerTestCase):
dfd.addBoth(results.append)
self.assertIsInstance(results[0], Failure)
self.assertIsInstance(results[0].value, _InvalidOutput)
class MiddlewareUsingDeferreds(ManagerTestCase):
"""Middlewares using Deferreds should work"""
def test_deferred(self):
resp = Response('http://example.com/index.html')
class DeferredMiddleware:
def cb(self, result):
return result
def process_request(self, request, spider):
d = Deferred()
d.addCallback(self.cb)
d.callback(resp)
return d
self.mwman._add_middleware(DeferredMiddleware())
req = Request('http://example.com/index.html')
download_func = mock.MagicMock()
dfd = self.mwman.download(download_func, req, self.spider)
results = []
dfd.addBoth(results.append)
self._wait(dfd)
self.assertIs(results[0], resp)
self.assertFalse(download_func.called)

@@ -0,0 +1,17 @@
from unittest import TestCase
from pytest import mark
from scrapy.utils.asyncio import is_asyncio_reactor_installed, install_asyncio_reactor
@mark.usefixtures('reactor_pytest')
class AsyncioTest(TestCase):
def test_is_asyncio_reactor_installed(self):
# the result should depend only on the pytest --reactor argument
self.assertEqual(is_asyncio_reactor_installed(), self.reactor_pytest == 'asyncio')
def test_install_asyncio_reactor(self):
# this should do nothing
install_asyncio_reactor()

tox.ini

@@ -96,3 +96,17 @@ changedir = {[docs]changedir}
deps = {[docs]deps}
commands =
sphinx-build -W -b linkcheck . {envtmpdir}/linkcheck
[asyncio]
commands =
{[testenv]commands} --reactor=asyncio
[testenv:py35-asyncio]
basepython = python3.5
deps = {[testenv]deps}
commands = {[asyncio]commands}
[testenv:py38-asyncio]
basepython = python3.8
deps = {[testenv]deps}
commands = {[asyncio]commands}
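With these environments defined, the asyncio runs are started like any other
tox environment, e.g. (assuming tox and a matching interpreter are available):

    tox -e py38-asyncio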