1
0
mirror of https://github.com/scrapy/scrapy.git synced 2025-02-24 10:24:13 +00:00

moved caching resolver to an extension in contrib.resolver

This commit is contained in:
Pablo Hoffman 2009-08-20 17:37:46 -03:00
parent a767b7044e
commit 297a8ac76b
3 changed files with 45 additions and 32 deletions

View File

@@ -0,0 +1,45 @@
from collections import defaultdict
from twisted.internet import reactor, defer
from twisted.internet.base import ThreadedResolver
from scrapy.xlib.pydispatch import dispatcher
from scrapy.utils.httpobj import urlparse_cached
from scrapy.core import signals
class CachingResolver(object):
    """Scrapy extension to use a caching resolver, instead of default one.

    Installs a :class:`_CachingThreadedResolver` into the reactor and tracks,
    per spider, the hostnames of every requested URL so their cached DNS
    entries can be evicted when the spider's domain closes.
    """

    def __init__(self):
        # hostnames requested by each spider, keyed by spider.domain_name
        self.spider_hostnames = defaultdict(set)
        self.resolver = _CachingThreadedResolver(reactor)
        reactor.installResolver(self.resolver)
        dispatcher.connect(self.request_received, signal=signals.request_received)
        dispatcher.connect(self.domain_closed, signal=signals.domain_closed)

    def request_received(self, request, spider):
        # Record the hostname of every URL the spider requests, so we know
        # which DNS cache entries belong to it.
        url_hostname = urlparse_cached(request).hostname
        self.spider_hostnames[spider.domain_name].add(url_hostname)

    def domain_closed(self, spider):
        # Evict the DNS cache entries for the hostnames this spider used.
        # Bug fix: the previous code iterated the dict itself, which yields
        # the spider *domain names* (keys) rather than the hostnames actually
        # stored in the resolver cache. Popping the spider's entry also
        # prevents spider_hostnames from growing without bound.
        for hostname in self.spider_hostnames.pop(spider.domain_name, ()):
            self.resolver._cache.pop(hostname, None)
class _CachingThreadedResolver(ThreadedResolver):
    """ThreadedResolver variant that memoizes resolved names in ``_cache``.

    Repeated lookups of the same name are answered immediately with an
    already-fired deferred instead of hitting the thread pool again.
    """

    def __init__(self, *args, **kwargs):
        ThreadedResolver.__init__(self, *args, **kwargs)
        # name -> resolved address, filled in as lookups succeed
        self._cache = {}

    def getHostByName(self, name, timeout=(1, 3, 11, 45)):
        # Serve from the cache when possible (EAFP: a miss raises KeyError).
        try:
            return defer.succeed(self._cache[name])
        except KeyError:
            pass
        lookup = ThreadedResolver.getHostByName(self, name, timeout)
        lookup.addCallback(self._cache_result, name)
        return lookup

    def _cache_result(self, result, name):
        # Store the successful result and pass it through unchanged.
        self._cache[name] = result
        return result

View File

@@ -13,7 +13,6 @@ from scrapy.utils.defer import mustbe_deferred
from scrapy import log
from .middleware import DownloaderMiddlewareManager
from .handlers import download_any
from .resolver import CachingThreadedResolver
class SiteInfo(object):
@@ -55,8 +54,6 @@ class Downloader(object):
self.sites = {}
self.middleware = DownloaderMiddlewareManager()
self.concurrent_domains = settings.getint('CONCURRENT_DOMAINS')
cached_resolver = CachingThreadedResolver(reactor)
reactor.installResolver(cached_resolver)
def fetch(self, request, spider):
""" Main method to use to request a download

View File

@ -1,29 +0,0 @@
"""DNS resolver with cache to use with Twisted reactors"""
from twisted.internet import defer
from twisted.internet.base import ThreadedResolver
from scrapy.core import signals
from scrapy.xlib.pydispatch import dispatcher
class CachingThreadedResolver(ThreadedResolver):
    """DNS resolver with cache to use with Twisted reactors.

    Caches successful lookups in ``_cache`` and discards a spider's domain
    entries when its ``domain_closed`` signal fires.
    """

    def __init__(self, reactor):
        ThreadedResolver.__init__(self, reactor)
        # name -> resolved address, populated as lookups complete
        self._cache = {}
        dispatcher.connect(self.domain_closed, signal=signals.domain_closed)

    def getHostByName(self, name, timeout=(1, 3, 11, 45)):
        # Answer from the cache with an already-fired deferred when possible.
        if name in self._cache:
            return defer.succeed(self._cache[name])
        lookup = ThreadedResolver.getHostByName(self, name, timeout)
        lookup.addCallback(self._cache_result, name)
        return lookup

    def _cache_result(self, result, name):
        # Remember the result, then hand it down the callback chain.
        self._cache[name] = result
        return result

    def domain_closed(self, spider):
        # Drop cache entries for the spider's main and extra domain names.
        stale = [spider.domain_name]
        stale.extend(spider.extra_domain_names)
        for domain in stale:
            self._cache.pop(domain, None)