core: add scheduler middleware and move duplicate filter there
--HG-- extra : convert_revision : svn%3Ab85faa78-f9eb-468e-a121-7cced6da292c%40892
This commit is contained in:
parent
549c30d8d5
commit
4bf196fc99
@@ -148,6 +148,11 @@ RETRY_HTTP_CODES = ['500', '503', '504', '400', '408', '200']
 ROBOTSTXT_OBEY = False
 
 SCHEDULER = 'scrapy.core.scheduler.Scheduler'
 
+SCHEDULER_MIDDLEWARES = [
+    'scrapy.contrib.schedulermiddleware.duplicatesfilter.DuplicatesFilterMiddleware',
+]
+
 SCHEDULER_ORDER = 'BFO' # available orders: BFO (default), DFO
 
 SPIDER_MODULES = []
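The new SCHEDULER_MIDDLEWARES setting is an ordinary list setting, so a project should be able to override it like the other middleware lists. A hypothetical project-level override (the extra middleware path below is made up purely for illustration):

# settings module of a hypothetical project: keep the default duplicates
# filter and append an extra, made-up scheduler middleware
SCHEDULER_MIDDLEWARES = [
    'scrapy.contrib.schedulermiddleware.duplicatesfilter.DuplicatesFilterMiddleware',
    'myproject.schedulermiddleware.ThrottleMiddleware',  # hypothetical, for illustration only
]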
@@ -42,13 +42,12 @@ class RedirectMiddleware(object):
         interval, url = get_meta_refresh(response)
         if url and int(interval) < META_REFRESH_MAXSEC:
             redirected = request.replace(url=urljoin(request.url, url))
-            return self._redirect(redirected, request, spider, 'meta refresh') or response
+            return self._redirect(redirected, request, spider, 'meta refresh')
 
         return response
 
     def _redirect(self, redirected, request, spider, reason):
         domain = spider.domain_name
-        if duplicatesfilter.add(domain, redirected) or redirected.dont_filter:
-            log.msg("Redirecting (%s) to %s from %s" % (reason, redirected, request), level=log.DEBUG, domain=domain)
-            return redirected
+        log.msg("Redirecting (%s) to %s from %s" % (reason, redirected, request), level=log.DEBUG, domain=domain)
+        return redirected
@@ -11,15 +11,12 @@ from scrapy import log
 class DuplicatesFilterMiddleware(object):
     """Filter out already seen requests to avoid visiting pages more than once."""
 
-    def process_spider_input(self, response, spider):
-        duplicatesfilter.add(spider.domain_name, response.request)
-
     def process_spider_output(self, response, result, spider):
         domain = spider.domain_name
         for req in result:
             if isinstance(req, Request):
-                added = duplicatesfilter.add(domain, req)
-                if not (added or req.dont_filter):
+                has = duplicatesfilter.has(domain, req)
+                if has and not req.dont_filter:
                     log.msg('Skipped (already processed): %s' % req, log.TRACE, domain=domain)
                     continue
             yield req
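The duplicatesfilter object used above is a module-level singleton whose implementation is not part of this excerpt; only its open/close/add/has calls are visible in the hunks and tests. A minimal per-domain sketch consistent with that usage, assuming requests are keyed by URL (the real fingerprinting may well differ):

class SimpleDuplicatesFilter(object):
    """Illustrative sketch of a per-domain record of requests seen so far."""

    def __init__(self):
        self.seen = {}  # domain -> set of request keys

    def open(self, domain):
        # called when a domain starts crawling
        self.seen.setdefault(domain, set())

    def close(self, domain):
        # called when a domain finishes crawling
        self.seen.pop(domain, None)

    def add(self, domain, request):
        """Record the request; return True only if it was not seen before."""
        key = request.url  # assumption: real code likely fingerprints more than the URL
        if key in self.seen[domain]:
            return False
        self.seen[domain].add(key)
        return True

    def has(self, domain, request):
        """Return True if an equivalent request was already recorded."""
        return request.url in self.seen[domain]

duplicatesfilter = SimpleDuplicatesFilter()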
@@ -14,7 +14,7 @@ from scrapy import log
 from scrapy.stats import stats
 from scrapy.conf import settings
 from scrapy.core import signals
-from scrapy.core.scheduler import Scheduler
+from scrapy.core.scheduler import Scheduler, SchedulerMiddlewareManager
 from scrapy.core.downloader import Downloader
 from scrapy.core.exceptions import IgnoreRequest, HttpException, DontCloseDomain
 from scrapy.http import Response, Request
@@ -59,6 +59,7 @@ class ExecutionEngine(object):
         Configure execution engine with the given scheduling policy and downloader.
         """
         self.scheduler = scheduler or Scheduler()
+        self.schedulermiddleware = SchedulerMiddlewareManager(self.scheduler)
         self.downloader = downloader or Downloader(self)
         self.spidermiddleware = SpiderMiddlewareManager()
         self._scraping = {}
@@ -299,7 +300,7 @@ class ExecutionEngine(object):
             return self._add_starter(request, spider, domain_priority)
         if self.debug_mode:
             log.msg('Scheduling %s (now)' % request_info(request), log.DEBUG)
-        schd = self.scheduler.enqueue_request(domain, request, priority)
+        schd = self.schedulermiddleware.enqueue_request(domain, request, priority)
         self.next_request(spider)
         return schd
@@ -1,2 +1,3 @@
 from scrapy.core.scheduler.schedulers import Scheduler
 from scrapy.core.scheduler.store import MemoryStore
+from scrapy.core.scheduler.middleware import SchedulerMiddlewareManager
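The new scrapy.core.scheduler.middleware module itself is not shown in this commit view; only the import above and the engine's call to schedulermiddleware.enqueue_request(domain, request, priority) are visible. A rough sketch of what such a manager could look like, assuming it wraps the scheduler and gives each configured middleware a chance to drop a request before it is enqueued (the per-middleware hook name and the constructor's middleware list are assumptions, not the actual Scrapy code):

class SchedulerMiddlewareManager(object):
    """Illustrative sketch: runs scheduler middleware hooks before enqueueing."""

    def __init__(self, scheduler, middlewares=None):
        # in the real code the middlewares would presumably be instantiated
        # from the SCHEDULER_MIDDLEWARES setting; here they are passed in
        self.scheduler = scheduler
        self.middlewares = middlewares or []

    def enqueue_request(self, domain, request, priority):
        for mw in self.middlewares:
            hook = getattr(mw, 'enqueue_request', None)  # hook name is an assumption
            if hook is not None and hook(domain, request, priority) is False:
                return None  # a middleware vetoed the request (e.g. a duplicate)
        return self.scheduler.enqueue_request(domain, request, priority)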
@@ -14,6 +14,9 @@ class DuplicatesFilterMiddlewareTest(unittest.TestCase):
         self.spider = spiders.fromdomain('scrapytest.org')
         duplicatesfilter.open('scrapytest.org')
 
+    def tearDown(self):
+        duplicatesfilter.close('scrapytest.org')
+
     def test_process_spider_output(self):
         mw = DuplicatesFilterMiddleware()
@@ -25,10 +28,12 @@ class DuplicatesFilterMiddlewareTest(unittest.TestCase):
         r2 = Request('http://scrapytest.org/2')
         r3 = Request('http://scrapytest.org/2')
 
-        mw.process_spider_input(response, self.spider)
+        duplicatesfilter.add('scrapytest.org', r0)
+        duplicatesfilter.add('scrapytest.org', r2)
 
         filtered = list(mw.process_spider_output(response, [r0, r1, r2, r3], self.spider))
 
         assert r0 not in filtered
         assert r1 in filtered
-        assert r2 in filtered
+        assert r2 not in filtered
         assert r3 not in filtered