
Allow core Scheduler priority queue customization

Author: nyov, 2016-02-27 15:18:31 +00:00
Parent: 74158611c8
Commit: 2a6524ee3a
2 changed files with 9 additions and 5 deletions

scrapy/core/scheduler.py

@@ -3,7 +3,6 @@ import json
 import logging
 from os.path import join, exists

-from queuelib import PriorityQueue
 from scrapy.utils.reqser import request_to_dict, request_from_dict
 from scrapy.utils.misc import load_object
 from scrapy.utils.job import job_dir
@@ -13,9 +12,11 @@ logger = logging.getLogger(__name__)
 class Scheduler(object):

-    def __init__(self, dupefilter, jobdir=None, dqclass=None, mqclass=None, logunser=False, stats=None):
+    def __init__(self, dupefilter, jobdir=None, dqclass=None, mqclass=None,
+                 logunser=False, stats=None, pqclass=None):
         self.df = dupefilter
         self.dqdir = self._dqdir(jobdir)
+        self.pqclass = pqclass
         self.dqclass = dqclass
         self.mqclass = mqclass
         self.logunser = logunser
@@ -26,17 +27,19 @@ class Scheduler(object):
         settings = crawler.settings
         dupefilter_cls = load_object(settings['DUPEFILTER_CLASS'])
         dupefilter = dupefilter_cls.from_settings(settings)
+        pqclass = load_object(settings['SCHEDULER_PRIORITY_QUEUE'])
         dqclass = load_object(settings['SCHEDULER_DISK_QUEUE'])
         mqclass = load_object(settings['SCHEDULER_MEMORY_QUEUE'])
         logunser = settings.getbool('LOG_UNSERIALIZABLE_REQUESTS')
-        return cls(dupefilter, job_dir(settings), dqclass, mqclass, logunser, crawler.stats)
+        return cls(dupefilter, jobdir=job_dir(settings), logunser=logunser,
+                   stats=crawler.stats, pqclass=pqclass, dqclass=dqclass, mqclass=mqclass)

     def has_pending_requests(self):
         return len(self) > 0

     def open(self, spider):
         self.spider = spider
-        self.mqs = PriorityQueue(self._newmq)
+        self.mqs = self.pqclass(self._newmq)
         self.dqs = self._dq() if self.dqdir else None
         return self.df.open()
@@ -112,7 +115,7 @@ class Scheduler(object):
             prios = json.load(f)
         else:
             prios = ()
-        q = PriorityQueue(self._newdq, startprios=prios)
+        q = self.pqclass(self._newdq, startprios=prios)
         if q:
             logger.info("Resuming crawl (%(queuesize)d requests scheduled)",
                         {'queuesize': len(q)}, extra={'spider': self.spider})
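
With this change the scheduler no longer hard-codes queuelib.PriorityQueue: whatever class SCHEDULER_PRIORITY_QUEUE names is instantiated for both the memory and disk queues. A replacement only has to honour the interface exercised above: a constructor taking a queue factory plus optional start priorities, and push(), pop(), close() and __len__(). A minimal sketch of a compatible queue, assuming only the public queuelib API; the class name and counter attribute are hypothetical:

from queuelib import PriorityQueue


class CountingPriorityQueue(PriorityQueue):
    """Drop-in pqclass: same behaviour as queuelib.PriorityQueue,
    but keeps a per-priority count of pushed requests."""

    def __init__(self, qfactory, startprios=()):
        super(CountingPriorityQueue, self).__init__(qfactory, startprios)
        self.push_counts = {}  # priority -> number of objects pushed

    def push(self, obj, priority=0):
        self.push_counts[priority] = self.push_counts.get(priority, 0) + 1
        super(CountingPriorityQueue, self).push(obj, priority)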

scrapy/settings/default_settings.py

@@ -234,6 +234,7 @@ ROBOTSTXT_OBEY = False
 SCHEDULER = 'scrapy.core.scheduler.Scheduler'
 SCHEDULER_DISK_QUEUE = 'scrapy.squeues.PickleLifoDiskQueue'
 SCHEDULER_MEMORY_QUEUE = 'scrapy.squeues.LifoMemoryQueue'
+SCHEDULER_PRIORITY_QUEUE = 'queuelib.PriorityQueue'
 SPIDER_LOADER_CLASS = 'scrapy.spiderloader.SpiderLoader'
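
The default value keeps the previous behaviour, so existing projects are unaffected; pointing the setting at a custom class in a project's settings.py is enough to swap the queue in. For example, with the sketch above living in a hypothetical myproject.squeues module:

# settings.py -- module path is hypothetical
SCHEDULER_PRIORITY_QUEUE = 'myproject.squeues.CountingPriorityQueue'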