
core: add scheduler middleware and move duplicate filter there

--HG--
extra : convert_revision : svn%3Ab85faa78-f9eb-468e-a121-7cced6da292c%40892
Daniel Grana 2009-02-20 11:28:37 +00:00
parent 549c30d8d5
commit 4bf196fc99
6 changed files with 21 additions and 13 deletions
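
The SchedulerMiddlewareManager that the engine now wraps around its scheduler (see the engine and scheduler hunks below) lives in scrapy.core.scheduler.middleware, a file whose contents are not among the hunks shown here. The following is only a rough sketch of the shape this diff implies, assuming middlewares are instantiated from the dotted paths in SCHEDULER_MIDDLEWARES and may expose an enqueue_request(domain, request) hook that returns False to drop a request:

# Sketch only -- not the code added by this commit. Assumptions: middlewares come
# from the SCHEDULER_MIDDLEWARES setting, and an enqueue_request(domain, request)
# hook returning False vetoes scheduling of that request.
class SchedulerMiddlewareManager(object):

    def __init__(self, scheduler, middlewares=()):
        # The real manager presumably loads its middlewares from settings;
        # they are passed in here to keep the sketch self-contained.
        self.scheduler = scheduler
        self.middlewares = list(middlewares)

    def enqueue_request(self, domain, request, priority=1):
        for mw in self.middlewares:
            hook = getattr(mw, 'enqueue_request', None)
            if hook is not None and hook(domain, request) is False:
                return False  # vetoed (e.g. duplicate request), nothing scheduled
        return self.scheduler.enqueue_request(domain, request, priority)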

View File

@@ -148,6 +148,11 @@ RETRY_HTTP_CODES = ['500', '503', '504', '400', '408', '200']
ROBOTSTXT_OBEY = False
SCHEDULER = 'scrapy.core.scheduler.Scheduler'
SCHEDULER_MIDDLEWARES = [
    'scrapy.contrib.schedulermiddleware.duplicatesfilter.DuplicatesFilterMiddleware',
]
SCHEDULER_ORDER = 'BFO' # available orders: BFO (default), DFO
SPIDER_MODULES = []
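
The duplicates filter registered above now runs as a scheduler middleware. Its new module, scrapy/contrib/schedulermiddleware/duplicatesfilter.py, is not among the visible hunks; below is a minimal sketch of a plausible implementation, reusing the duplicatesfilter singleton that the spider middleware and redirect middleware further down already use (its import path is not shown in this diff, so the one here is a placeholder):

# Plausible sketch only -- the module added by this commit is not shown in the diff.
from scrapy.core.filters import duplicatesfilter  # placeholder import path
from scrapy import log


class DuplicatesFilterMiddleware(object):
    """Drop already seen requests for a domain before they reach the scheduler."""

    def enqueue_request(self, domain, request):
        added = duplicatesfilter.add(domain, request)
        if not (added or request.dont_filter):
            log.msg('Skipped (already seen): %s' % request, log.TRACE, domain=domain)
            return False  # signal the manager not to enqueue this request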

View File

@@ -42,13 +42,12 @@ class RedirectMiddleware(object):
        interval, url = get_meta_refresh(response)
        if url and int(interval) < META_REFRESH_MAXSEC:
            redirected = request.replace(url=urljoin(request.url, url))
            return self._redirect(redirected, request, spider, 'meta refresh') or response
            return self._redirect(redirected, request, spider, 'meta refresh')
        return response

    def _redirect(self, redirected, request, spider, reason):
        domain = spider.domain_name
        if duplicatesfilter.add(domain, redirected) or redirected.dont_filter:
            log.msg("Redirecting (%s) to %s from %s" % (reason, redirected, request), level=log.DEBUG, domain=domain)
            return redirected
        log.msg("Redirecting (%s) to %s from %s" % (reason, redirected, request), level=log.DEBUG, domain=domain)
        return redirected

View File

@@ -11,15 +11,12 @@ from scrapy import log
class DuplicatesFilterMiddleware(object):
    """Filter out already seen requests to avoid visiting pages more than once."""

    def process_spider_input(self, response, spider):
        duplicatesfilter.add(spider.domain_name, response.request)

    def process_spider_output(self, response, result, spider):
        domain = spider.domain_name
        for req in result:
            if isinstance(req, Request):
                added = duplicatesfilter.add(domain, req)
                if not (added or req.dont_filter):
                has = duplicatesfilter.has(domain, req)
                if has and not req.dont_filter:
                    log.msg('Skipped (already processed): %s' % req, log.TRACE, domain=domain)
                    continue
            yield req
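
In process_spider_output the middleware now asks the filter whether a request has already been seen (has) instead of adding it (add); recording requests is the job of the new scheduler middleware at enqueue time. A small illustration of that division of labour, using the duplicatesfilter open/add/has/close calls visible in this commit (the singleton's import path is not shown here, so the one below is a placeholder):

# Illustration only -- not part of this commit. Assumes add() returns a true value
# the first time a request is recorded for a domain and a false value afterwards,
# as the old spider middleware code above relied on.
from scrapy.core.filters import duplicatesfilter  # placeholder import path
from scrapy.http import Request

domain = 'scrapytest.org'
duplicatesfilter.open(domain)
req = Request('http://scrapytest.org/1')

assert duplicatesfilter.add(domain, req)      # recorded at scheduling time
assert not duplicatesfilter.add(domain, req)  # a second enqueue is a duplicate
assert duplicatesfilter.has(domain, req)      # what the spider middleware now checks

duplicatesfilter.close(domain)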

View File

@@ -14,7 +14,7 @@ from scrapy import log
from scrapy.stats import stats
from scrapy.conf import settings
from scrapy.core import signals
from scrapy.core.scheduler import Scheduler
from scrapy.core.scheduler import Scheduler, SchedulerMiddlewareManager
from scrapy.core.downloader import Downloader
from scrapy.core.exceptions import IgnoreRequest, HttpException, DontCloseDomain
from scrapy.http import Response, Request
@@ -59,6 +59,7 @@ class ExecutionEngine(object):
        Configure execution engine with the given scheduling policy and downloader.
        """
        self.scheduler = scheduler or Scheduler()
        self.schedulermiddleware = SchedulerMiddlewareManager(self.scheduler)
        self.downloader = downloader or Downloader(self)
        self.spidermiddleware = SpiderMiddlewareManager()
        self._scraping = {}
@@ -299,7 +300,7 @@ class ExecutionEngine(object):
            return self._add_starter(request, spider, domain_priority)
        if self.debug_mode:
            log.msg('Scheduling %s (now)' % request_info(request), log.DEBUG)
        schd = self.scheduler.enqueue_request(domain, request, priority)
        schd = self.schedulermiddleware.enqueue_request(domain, request, priority)
        self.next_request(spider)
        return schd

View File

@@ -1,2 +1,3 @@
from scrapy.core.scheduler.schedulers import Scheduler
from scrapy.core.scheduler.store import MemoryStore
from scrapy.core.scheduler.middleware import SchedulerMiddlewareManager

View File

@@ -14,6 +14,9 @@ class DuplicatesFilterMiddlewareTest(unittest.TestCase):
        self.spider = spiders.fromdomain('scrapytest.org')
        duplicatesfilter.open('scrapytest.org')

    def tearDown(self):
        duplicatesfilter.close('scrapytest.org')

    def test_process_spider_output(self):
        mw = DuplicatesFilterMiddleware()
@@ -25,10 +28,12 @@ class DuplicatesFilterMiddlewareTest(unittest.TestCase):
        r2 = Request('http://scrapytest.org/2')
        r3 = Request('http://scrapytest.org/2')
        mw.process_spider_input(response, self.spider)
        duplicatesfilter.add('scrapytest.org', r0)
        duplicatesfilter.add('scrapytest.org', r2)
        filtered = list(mw.process_spider_output(response, [r0, r1, r2, r3], self.spider))
        assert r0 not in filtered
        assert r1 in filtered
        assert r2 in filtered
        assert r2 not in filtered
        assert r3 not in filtered