moved caching resolver to an extension in contrib.resolver
commit 297a8ac76b
parent a767b7044e
scrapy/contrib/resolver.py (new file, 45 lines)
@@ -0,0 +1,45 @@
+from collections import defaultdict
+
+from twisted.internet import reactor, defer
+from twisted.internet.base import ThreadedResolver
+
+from scrapy.xlib.pydispatch import dispatcher
+from scrapy.utils.httpobj import urlparse_cached
+from scrapy.core import signals
+
+
+class CachingResolver(object):
+    """Scrapy extension to use a caching resolver, instead of the default one"""
+
+    def __init__(self):
+        self.spider_hostnames = defaultdict(set)
+        self.resolver = _CachingThreadedResolver(reactor)
+        reactor.installResolver(self.resolver)
+        dispatcher.connect(self.request_received, signal=signals.request_received)
+        dispatcher.connect(self.domain_closed, signal=signals.domain_closed)
+
+    def request_received(self, request, spider):
+        url_hostname = urlparse_cached(request).hostname
+        self.spider_hostnames[spider.domain_name].add(url_hostname)
+
+    def domain_closed(self, spider):
+        # evict only the hostnames this spider actually requested
+        for hostname in self.spider_hostnames.pop(spider.domain_name, set()):
+            self.resolver._cache.pop(hostname, None)
+
+
+class _CachingThreadedResolver(ThreadedResolver):
+
+    def __init__(self, *args, **kwargs):
+        ThreadedResolver.__init__(self, *args, **kwargs)
+        self._cache = {}
+
+    def getHostByName(self, name, timeout=(1, 3, 11, 45)):
+        if name in self._cache:
+            return defer.succeed(self._cache[name])
+        dfd = ThreadedResolver.getHostByName(self, name, timeout)
+        dfd.addCallback(self._cache_result, name)
+        return dfd
+
+    def _cache_result(self, result, name):
+        self._cache[name] = result
+        return result
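
Not part of the commit, but for context: a minimal sketch of the caching behaviour in isolation, assuming the new scrapy/contrib/resolver module above is importable and using example.org as a placeholder hostname. The first lookup goes through Twisted's thread-pool resolver; the second is answered synchronously from the cache via defer.succeed().

    from twisted.internet import reactor
    from scrapy.contrib.resolver import _CachingThreadedResolver

    resolver = _CachingThreadedResolver(reactor)

    def second_lookup(ip):
        print 'resolved from cache:', ip
        reactor.stop()

    def first_lookup(ip):
        print 'resolved via thread pool:', ip
        # the address is cached now, so this deferred fires immediately
        resolver.getHostByName('example.org').addCallback(second_lookup)

    resolver.getHostByName('example.org').addCallback(first_lookup)
    reactor.run()
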
scrapy/core/downloader/__init__.py
@@ -13,7 +13,6 @@ from scrapy.utils.defer import mustbe_deferred
 from scrapy import log
 from .middleware import DownloaderMiddlewareManager
 from .handlers import download_any
-from .resolver import CachingThreadedResolver


 class SiteInfo(object):
@@ -55,8 +54,6 @@ class Downloader(object):
         self.sites = {}
         self.middleware = DownloaderMiddlewareManager()
         self.concurrent_domains = settings.getint('CONCURRENT_DOMAINS')
-        cached_resolver = CachingThreadedResolver(reactor)
-        reactor.installResolver(cached_resolver)

     def fetch(self, request, spider):
         """ Main method to use to request a download
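
With the install moved out of the Downloader, DNS caching becomes opt-in. Presumably it is switched on like any other extension of this era, e.g. by listing it in the EXTENSIONS setting; the setting name and list form are assumed here, since the diff itself does not show how the extension is registered:

    # in a project's settings module (illustrative, not shown in this diff)
    EXTENSIONS = [
        'scrapy.contrib.resolver.CachingResolver',
    ]
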
scrapy/core/downloader/resolver.py (deleted, 29 lines)
@@ -1,29 +0,0 @@
-"""DNS resolver with cache to use with Twisted reactors"""
-
-from twisted.internet import defer
-from twisted.internet.base import ThreadedResolver
-
-from scrapy.core import signals
-from scrapy.xlib.pydispatch import dispatcher
-
-class CachingThreadedResolver(ThreadedResolver):
-
-    def __init__(self, reactor):
-        ThreadedResolver.__init__(self, reactor)
-        self._cache = {}
-        dispatcher.connect(self.domain_closed, signal=signals.domain_closed)
-
-    def getHostByName(self, name, timeout=(1, 3, 11, 45)):
-        if name in self._cache:
-            return defer.succeed(self._cache[name])
-        dfd = ThreadedResolver.getHostByName(self, name, timeout)
-        dfd.addCallback(self._cache_result, name)
-        return dfd
-
-    def _cache_result(self, result, name):
-        self._cache[name] = result
-        return result
-
-    def domain_closed(self, spider):
-        for domain in [spider.domain_name] + spider.extra_domain_names:
-            self._cache.pop(domain, None)
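
An illustrative sketch, not in the commit, of the eviction path that replaces the deleted domain_closed above: the new extension evicts the hostnames a spider actually requested, rather than its declared domain names. The _FakeSpider class and the addresses are invented for this example.

    from scrapy.xlib.pydispatch import dispatcher
    from scrapy.core import signals
    from scrapy.contrib.resolver import CachingResolver

    class _FakeSpider(object):
        domain_name = 'example.org'

    ext = CachingResolver()
    ext.spider_hostnames['example.org'].add('www.example.org')
    ext.resolver._cache['www.example.org'] = '192.0.2.1'

    # domain_closed pops every hostname this spider touched from the DNS cache
    dispatcher.send(signal=signals.domain_closed, spider=_FakeSpider())
    assert 'www.example.org' not in ext.resolver._cache
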