Mirror of https://github.com/scrapy/scrapy.git
Allow core Scheduler priority queue customization

commit 2a6524ee3a (parent 74158611c8)
diff --git a/scrapy/core/scheduler.py b/scrapy/core/scheduler.py
@@ -3,7 +3,6 @@ import json
 import logging
 from os.path import join, exists
 
-from queuelib import PriorityQueue
 from scrapy.utils.reqser import request_to_dict, request_from_dict
 from scrapy.utils.misc import load_object
 from scrapy.utils.job import job_dir
@@ -13,9 +12,11 @@ logger = logging.getLogger(__name__)
 
 class Scheduler(object):
 
-    def __init__(self, dupefilter, jobdir=None, dqclass=None, mqclass=None, logunser=False, stats=None):
+    def __init__(self, dupefilter, jobdir=None, dqclass=None, mqclass=None,
+                 logunser=False, stats=None, pqclass=None):
         self.df = dupefilter
         self.dqdir = self._dqdir(jobdir)
+        self.pqclass = pqclass
         self.dqclass = dqclass
         self.mqclass = mqclass
         self.logunser = logunser
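The new pqclass hook means anything honoring the queuelib.PriorityQueue contract can be dropped in: it is constructed with a queue factory (plus an optional startprios) and is expected to expose push, pop, close and __len__. A minimal sketch of a conforming subclass (LoggingPriorityQueue is a hypothetical name, not part of this change):

from queuelib import PriorityQueue

class LoggingPriorityQueue(PriorityQueue):
    """Same contract as queuelib.PriorityQueue, but counts pushes."""

    def __init__(self, qfactory, startprios=()):
        super(LoggingPriorityQueue, self).__init__(qfactory, startprios)
        self.pushed = 0

    def push(self, obj, priority=0):
        self.pushed += 1
        super(LoggingPriorityQueue, self).push(obj, priority)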
@@ -26,17 +27,19 @@ class Scheduler(object):
         settings = crawler.settings
         dupefilter_cls = load_object(settings['DUPEFILTER_CLASS'])
         dupefilter = dupefilter_cls.from_settings(settings)
+        pqclass = load_object(settings['SCHEDULER_PRIORITY_QUEUE'])
         dqclass = load_object(settings['SCHEDULER_DISK_QUEUE'])
         mqclass = load_object(settings['SCHEDULER_MEMORY_QUEUE'])
         logunser = settings.getbool('LOG_UNSERIALIZABLE_REQUESTS')
-        return cls(dupefilter, job_dir(settings), dqclass, mqclass, logunser, crawler.stats)
+        return cls(dupefilter, jobdir=job_dir(settings), logunser=logunser,
+                   stats=crawler.stats, pqclass=pqclass, dqclass=dqclass, mqclass=mqclass)
 
     def has_pending_requests(self):
         return len(self) > 0
 
     def open(self, spider):
         self.spider = spider
-        self.mqs = PriorityQueue(self._newmq)
+        self.mqs = self.pqclass(self._newmq)
         self.dqs = self._dq() if self.dqdir else None
         return self.df.open()
 
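In from_crawler above, the SCHEDULER_PRIORITY_QUEUE dotted path is resolved with scrapy.utils.misc.load_object, the same mechanism already used for the disk and memory queue classes. For illustration, the default setting value resolves back to queuelib's class:

from queuelib import PriorityQueue
from scrapy.utils.misc import load_object

# Resolving the default setting value yields queuelib.PriorityQueue itself.
assert load_object('queuelib.PriorityQueue') is PriorityQueue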
@@ -112,7 +115,7 @@ class Scheduler(object):
                 prios = json.load(f)
         else:
             prios = ()
-        q = PriorityQueue(self._newdq, startprios=prios)
+        q = self.pqclass(self._newdq, startprios=prios)
         if q:
             logger.info("Resuming crawl (%(queuesize)d requests scheduled)",
                         {'queuesize': len(q)}, extra={'spider': self.spider})
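The _dq change above implies one extra requirement for custom priority queues used with a job directory: close() must return the still-active priorities in a JSON-serializable form, since they are persisted to disk and handed back via startprios on resume. A sketch of that round trip with the default implementation (memory-backed queues are used only to keep the example self-contained; the scheduler does this with disk queues, whose contents also survive restarts):

import json
from queuelib import PriorityQueue
from queuelib.queue import FifoMemoryQueue

q = PriorityQueue(lambda priority: FifoMemoryQueue())
q.push(b'high', -1)
q.push(b'low', 0)
prios = q.close()  # active priorities, e.g. [-1, 0]

# What the scheduler persists as JSON and reads back on the next run:
restored = PriorityQueue(lambda priority: FifoMemoryQueue(),
                         startprios=json.loads(json.dumps(prios)))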
diff --git a/scrapy/settings/default_settings.py b/scrapy/settings/default_settings.py
@@ -234,6 +234,7 @@ ROBOTSTXT_OBEY = False
 SCHEDULER = 'scrapy.core.scheduler.Scheduler'
 SCHEDULER_DISK_QUEUE = 'scrapy.squeues.PickleLifoDiskQueue'
 SCHEDULER_MEMORY_QUEUE = 'scrapy.squeues.LifoMemoryQueue'
+SCHEDULER_PRIORITY_QUEUE = 'queuelib.PriorityQueue'
 
 SPIDER_LOADER_CLASS = 'scrapy.spiderloader.SpiderLoader'
 
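With the new default in place, a project can swap the priority queue from its own settings.py; the dotted path below is hypothetical and would point at something like the LoggingPriorityQueue sketch above:

# myproject/settings.py (illustrative)
SCHEDULER_PRIORITY_QUEUE = 'myproject.pqueues.LoggingPriorityQueue'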