mirror of https://github.com/scrapy/scrapy.git synced 2025-02-26 15:24:12 +00:00

core: add scheduler middleware and move duplicate filter there

--HG--
extra : convert_revision : svn:b85faa78-f9eb-468e-a121-7cced6da292c@892
Daniel Grana 2009-02-20 11:28:37 +00:00
parent 549c30d8d5
commit 4bf196fc99
6 changed files with 21 additions and 13 deletions

View File

@@ -148,6 +148,11 @@ RETRY_HTTP_CODES = ['500', '503', '504', '400', '408', '200']
 ROBOTSTXT_OBEY = False
 SCHEDULER = 'scrapy.core.scheduler.Scheduler'
+SCHEDULER_MIDDLEWARES = [
+    'scrapy.contrib.schedulermiddleware.duplicatesfilter.DuplicatesFilterMiddleware',
+]
 SCHEDULER_ORDER = 'BFO'   # available orders: BFO (default), DFO
 SPIDER_MODULES = []
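
The middleware registered by SCHEDULER_MIDDLEWARES above lives in a new module whose contents are not among the hunks shown here. Below is only a minimal sketch of what a scheduler-level duplicates filter might look like, assuming the middleware exposes an enqueue_request hook and that duplicatesfilter.add() returns a falsy value for already-seen requests; the import paths and the veto mechanism are assumptions, not taken from this commit.

# Hypothetical sketch only -- the real scrapy.contrib.schedulermiddleware.duplicatesfilter
# module is not shown in this commit's hunks.
from scrapy.core.exceptions import IgnoreRequest   # assumed veto mechanism
from scrapy.dupefilter import duplicatesfilter     # assumed import path for the shared filter

class DuplicatesFilterMiddleware(object):
    """Drop already-seen requests before they reach the scheduler queue."""

    def enqueue_request(self, domain, request, priority):
        # add() is assumed to return False when the request's fingerprint was already recorded
        if not duplicatesfilter.add(domain, request) and not request.dont_filter:
            raise IgnoreRequest('Skipped (already seen): %s' % request)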

View File

@@ -42,13 +42,12 @@ class RedirectMiddleware(object):
         interval, url = get_meta_refresh(response)
         if url and int(interval) < META_REFRESH_MAXSEC:
             redirected = request.replace(url=urljoin(request.url, url))
-            return self._redirect(redirected, request, spider, 'meta refresh') or response
+            return self._redirect(redirected, request, spider, 'meta refresh')
         return response

     def _redirect(self, redirected, request, spider, reason):
         domain = spider.domain_name
-        if duplicatesfilter.add(domain, redirected) or redirected.dont_filter:
-            log.msg("Redirecting (%s) to %s from %s" % (reason, redirected, request), level=log.DEBUG, domain=domain)
-            return redirected
+        log.msg("Redirecting (%s) to %s from %s" % (reason, redirected, request), level=log.DEBUG, domain=domain)
+        return redirected

View File

@@ -11,15 +11,12 @@ from scrapy import log
 class DuplicatesFilterMiddleware(object):
     """Filter out already seen requests to avoid visiting pages more than once."""

-    def process_spider_input(self, response, spider):
-        duplicatesfilter.add(spider.domain_name, response.request)
-
     def process_spider_output(self, response, result, spider):
         domain = spider.domain_name
         for req in result:
             if isinstance(req, Request):
-                added = duplicatesfilter.add(domain, req)
-                if not (added or req.dont_filter):
+                has = duplicatesfilter.has(domain, req)
+                if has and not req.dont_filter:
                     log.msg('Skipped (already processed): %s' % req, log.TRACE, domain=domain)
                     continue
             yield req
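
The has() call on the right-hand side of this hunk is new; together with the open(), close() and add() calls used elsewhere in the commit, it implies a per-domain store of request fingerprints that can be queried without being mutated. A rough in-memory sketch consistent with those calls follows; the real object and the fingerprint helper are not shown in this diff.

# Rough sketch of the filter interface implied by the calls in this diff; not the actual implementation.
from scrapy.utils.request import request_fingerprint   # assumed helper; any stable request hash would do

class SimpleDuplicatesFilter(object):
    """Per-domain set of request fingerprints, matching the open/close/add/has usage above."""

    def __init__(self):
        self.seen = {}                                  # domain -> set of fingerprints

    def open(self, domain):
        self.seen.setdefault(domain, set())

    def close(self, domain):
        self.seen.pop(domain, None)

    def add(self, domain, request):
        """Record the request; return True only if it was not seen before."""
        fp = request_fingerprint(request)
        if fp in self.seen[domain]:
            return False
        self.seen[domain].add(fp)
        return True

    def has(self, domain, request):
        """Report whether the request was seen, without recording it."""
        return request_fingerprint(request) in self.seen[domain]

Under this reading, the spider middleware now only skips requests already recorded elsewhere, while recording itself moves to the scheduler middleware named in the commit message.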

View File

@@ -14,7 +14,7 @@ from scrapy import log
 from scrapy.stats import stats
 from scrapy.conf import settings
 from scrapy.core import signals
-from scrapy.core.scheduler import Scheduler
+from scrapy.core.scheduler import Scheduler, SchedulerMiddlewareManager
 from scrapy.core.downloader import Downloader
 from scrapy.core.exceptions import IgnoreRequest, HttpException, DontCloseDomain
 from scrapy.http import Response, Request
@@ -59,6 +59,7 @@ class ExecutionEngine(object):
         Configure execution engine with the given scheduling policy and downloader.
         """
         self.scheduler = scheduler or Scheduler()
+        self.schedulermiddleware = SchedulerMiddlewareManager(self.scheduler)
         self.downloader = downloader or Downloader(self)
         self.spidermiddleware = SpiderMiddlewareManager()
         self._scraping = {}
@@ -299,7 +300,7 @@ class ExecutionEngine(object):
             return self._add_starter(request, spider, domain_priority)
         if self.debug_mode:
             log.msg('Scheduling %s (now)' % request_info(request), log.DEBUG)
-        schd = self.scheduler.enqueue_request(domain, request, priority)
+        schd = self.schedulermiddleware.enqueue_request(domain, request, priority)
         self.next_request(spider)
         return schd
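
SchedulerMiddlewareManager is imported from scrapy.core.scheduler, but its module body is not part of the hunks shown. The sketch below is one plausible shape, assuming it instantiates the classes listed in SCHEDULER_MIDDLEWARES, lets each one inspect (or veto) the request, and then delegates to the wrapped scheduler; the settings accessor and the dotted-path loader named here are assumptions.

# Hypothetical sketch; only the constructor call and the enqueue_request() call are visible in this diff.
from scrapy.conf import settings
from scrapy.utils.misc import load_object              # assumed dotted-path loader

class SchedulerMiddlewareManager(object):
    """Wrap a Scheduler and pass every request through the configured scheduler middlewares."""

    def __init__(self, scheduler):
        self.scheduler = scheduler
        paths = settings.get('SCHEDULER_MIDDLEWARES', [])        # assumed settings accessor
        self.middlewares = [load_object(path)() for path in paths]

    def enqueue_request(self, domain, request, priority):
        # A middleware may raise to keep the request out of the queue; the exact contract
        # is not shown in this commit, only the engine-side call above.
        for mw in self.middlewares:
            if hasattr(mw, 'enqueue_request'):
                mw.enqueue_request(domain, request, priority)
        return self.scheduler.enqueue_request(domain, request, priority)

Either way, the engine-side change above only swaps self.scheduler.enqueue_request for self.schedulermiddleware.enqueue_request, leaving the returned value (schd) and the rest of the scheduling path unchanged.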

View File

@@ -1,2 +1,3 @@
 from scrapy.core.scheduler.schedulers import Scheduler
 from scrapy.core.scheduler.store import MemoryStore
+from scrapy.core.scheduler.middleware import SchedulerMiddlewareManager

View File

@@ -14,6 +14,9 @@ class DuplicatesFilterMiddlewareTest(unittest.TestCase):
         self.spider = spiders.fromdomain('scrapytest.org')
         duplicatesfilter.open('scrapytest.org')

+    def tearDown(self):
+        duplicatesfilter.close('scrapytest.org')
+
     def test_process_spider_output(self):
         mw = DuplicatesFilterMiddleware()
@@ -25,10 +28,12 @@ class DuplicatesFilterMiddlewareTest(unittest.TestCase):
         r2 = Request('http://scrapytest.org/2')
         r3 = Request('http://scrapytest.org/2')

-        mw.process_spider_input(response, self.spider)
+        duplicatesfilter.add('scrapytest.org', r0)
+        duplicatesfilter.add('scrapytest.org', r2)
+
         filtered = list(mw.process_spider_output(response, [r0, r1, r2, r3], self.spider))

         assert r0 not in filtered
         assert r1 in filtered
-        assert r2 in filtered
+        assert r2 not in filtered
         assert r3 not in filtered