mirror of https://github.com/scrapy/scrapy.git
synced 2025-02-26 15:04:37 +00:00
core: add scheduler middleware and move duplicates filter there
--HG-- extra : convert_revision : svn%3Ab85faa78-f9eb-468e-a121-7cced6da292c%40892
This commit is contained in:
parent 549c30d8d5
commit 4bf196fc99
@@ -148,6 +148,11 @@ RETRY_HTTP_CODES = ['500', '503', '504', '400', '408', '200']

ROBOTSTXT_OBEY = False

SCHEDULER = 'scrapy.core.scheduler.Scheduler'

SCHEDULER_MIDDLEWARES = [
    'scrapy.contrib.schedulermiddleware.duplicatesfilter.DuplicatesFilterMiddleware',
]

SCHEDULER_ORDER = 'BFO'  # available orders: BFO (default), DFO

SPIDER_MODULES = []

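Note on the new setting: SCHEDULER_MIDDLEWARES is a list of dotted class paths, in the same spirit as the existing middleware settings. A minimal sketch of how such a list could be resolved into instances is below; the helper name and the plain no-argument construction are illustrative, not the loader the engine actually uses.

# Illustrative only: turn the dotted paths from SCHEDULER_MIDDLEWARES into
# middleware instances. The real manager lives in scrapy.core.scheduler.middleware.
from importlib import import_module

def load_scheduler_middlewares(paths):
    """Instantiate each 'package.module.ClassName' entry, preserving order."""
    middlewares = []
    for path in paths:
        module_path, _, class_name = path.rpartition('.')
        cls = getattr(import_module(module_path), class_name)
        middlewares.append(cls())
    return middlewares
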
@@ -42,13 +42,12 @@ class RedirectMiddleware(object):

        interval, url = get_meta_refresh(response)
        if url and int(interval) < META_REFRESH_MAXSEC:
            redirected = request.replace(url=urljoin(request.url, url))
            return self._redirect(redirected, request, spider, 'meta refresh') or response
            return self._redirect(redirected, request, spider, 'meta refresh')
        return response

    def _redirect(self, redirected, request, spider, reason):
        domain = spider.domain_name
        if duplicatesfilter.add(domain, redirected) or redirected.dont_filter:
            log.msg("Redirecting (%s) to %s from %s" % (reason, redirected, request), level=log.DEBUG, domain=domain)
            return redirected

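For reference, the meta-refresh branch above resolves a possibly relative refresh URL against the current request URL before the redirect request is built. A small standalone illustration with made-up values:

# Hypothetical values: a page at http://example.com/a/ declaring
#   <meta http-equiv="refresh" content="5; url=next.html">
# would yield interval=5 and url='next.html' from get_meta_refresh(response).
from urllib.parse import urljoin

interval, url = 5, 'next.html'
request_url = 'http://example.com/a/'
redirected_url = urljoin(request_url, url)
assert redirected_url == 'http://example.com/a/next.html'
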
@@ -11,15 +11,12 @@ from scrapy import log

class DuplicatesFilterMiddleware(object):
    """Filter out already seen requests to avoid visiting pages more than once."""

    def process_spider_input(self, response, spider):
        duplicatesfilter.add(spider.domain_name, response.request)

    def process_spider_output(self, response, result, spider):
        domain = spider.domain_name
        for req in result:
            if isinstance(req, Request):
                added = duplicatesfilter.add(domain, req)
                if not (added or req.dont_filter):
                has = duplicatesfilter.has(domain, req)
                if has and not req.dont_filter:
                    log.msg('Skipped (already processed): %s' % req, log.TRACE, domain=domain)
                    continue
            yield req

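The change above switches the spider middleware from recording requests (add) to merely checking them (has), since recording now happens when requests are enqueued through the scheduler middleware. A minimal sketch of the filter semantics implied by the calls in this diff follows; keying on the request URL is an assumption, the real filter may use a richer request fingerprint.

# Sketch of the per-domain duplicates-filter API used above: open/close per
# domain, add() records a request and reports whether it was new, has() only
# checks without recording.
class SimpleDuplicatesFilter(object):
    def __init__(self):
        self.seen = {}  # domain -> set of seen keys

    def open(self, domain):
        self.seen.setdefault(domain, set())

    def close(self, domain):
        self.seen.pop(domain, None)

    def add(self, domain, request):
        """Record the request; return True if it had not been seen before."""
        key = request.url  # assumption: URL as the dedup key
        if key in self.seen[domain]:
            return False
        self.seen[domain].add(key)
        return True

    def has(self, domain, request):
        """Check whether the request was already recorded, without recording it."""
        return request.url in self.seen[domain]
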
@@ -14,7 +14,7 @@ from scrapy import log

from scrapy.stats import stats
from scrapy.conf import settings
from scrapy.core import signals
from scrapy.core.scheduler import Scheduler
from scrapy.core.scheduler import Scheduler, SchedulerMiddlewareManager
from scrapy.core.downloader import Downloader
from scrapy.core.exceptions import IgnoreRequest, HttpException, DontCloseDomain
from scrapy.http import Response, Request

@@ -59,6 +59,7 @@ class ExecutionEngine(object):

        Configure execution engine with the given scheduling policy and downloader.
        """
        self.scheduler = scheduler or Scheduler()
        self.schedulermiddleware = SchedulerMiddlewareManager(self.scheduler)
        self.downloader = downloader or Downloader(self)
        self.spidermiddleware = SpiderMiddlewareManager()
        self._scraping = {}

@@ -299,7 +300,7 @@ class ExecutionEngine(object):

            return self._add_starter(request, spider, domain_priority)
        if self.debug_mode:
            log.msg('Scheduling %s (now)' % request_info(request), log.DEBUG)
        schd = self.scheduler.enqueue_request(domain, request, priority)
        schd = self.schedulermiddleware.enqueue_request(domain, request, priority)
        self.next_request(spider)
        return schd

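The engine now funnels enqueue_request() through the scheduler middleware manager instead of calling the scheduler directly, which is what lets the duplicates filter drop a request before it is queued, while the call site keeps the same signature. A rough sketch of the manager's shape; only the enqueue_request() entry point comes from the diff, the hook name and the drop-on-False behaviour are assumptions.

# Rough sketch, not the actual scrapy.core.scheduler.middleware implementation.
class SchedulerMiddlewareManagerSketch(object):
    def __init__(self, scheduler, middlewares=()):
        self.scheduler = scheduler
        self.middlewares = list(middlewares)  # e.g. a DuplicatesFilterMiddleware

    def enqueue_request(self, domain, request, priority):
        # Give each scheduler middleware a chance to veto the request.
        for mw in self.middlewares:
            if mw.process_enqueue_request(domain, request) is False:
                return None  # a middleware filtered the request out
        return self.scheduler.enqueue_request(domain, request, priority)
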
@@ -1,2 +1,3 @@

from scrapy.core.scheduler.schedulers import Scheduler
from scrapy.core.scheduler.store import MemoryStore
from scrapy.core.scheduler.middleware import SchedulerMiddlewareManager

@@ -14,6 +14,9 @@ class DuplicatesFilterMiddlewareTest(unittest.TestCase):

        self.spider = spiders.fromdomain('scrapytest.org')
        duplicatesfilter.open('scrapytest.org')

    def tearDown(self):
        duplicatesfilter.close('scrapytest.org')

    def test_process_spider_output(self):
        mw = DuplicatesFilterMiddleware()

@@ -25,10 +28,12 @@ class DuplicatesFilterMiddlewareTest(unittest.TestCase):

        r2 = Request('http://scrapytest.org/2')
        r3 = Request('http://scrapytest.org/2')

        mw.process_spider_input(response, self.spider)
        duplicatesfilter.add('scrapytest.org', r0)
        duplicatesfilter.add('scrapytest.org', r2)

        filtered = list(mw.process_spider_output(response, [r0, r1, r2, r3], self.spider))

        assert r0 not in filtered
        assert r1 in filtered
        assert r2 in filtered
        assert r2 not in filtered
        assert r3 not in filtered

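The updated test seeds the filter with r0 and r2 directly (instead of going through process_spider_input), so r0, r2 and the identically-addressed r3 end up filtered while r1 passes through. Requests that must never be deduplicated opt out via dont_filter, which the middleware checks before skipping; assuming the Request constructor accepts it as a keyword, as the attribute checks in this diff suggest:

from scrapy.http import Request

# Hypothetical example: a page that should be re-fetched even if already seen.
login = Request('http://scrapytest.org/login', dont_filter=True)
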