2010-09-03 15:54:42 -03:00
|
|
|
import sys, os
|
2010-11-30 02:26:31 -02:00
|
|
|
from datetime import datetime
|
2010-09-03 15:54:42 -03:00
|
|
|
|
|
|
|
from twisted.internet import reactor, defer, protocol, error
|
|
|
|
from twisted.application.service import Service
|
|
|
|
from twisted.python import log
|
|
|
|
|
|
|
|
from scrapy.utils.py26 import cpu_count
|
2010-11-30 02:26:31 -02:00
|
|
|
from scrapy.utils.python import stringify_dict
|
2010-11-29 17:19:05 -02:00
|
|
|
from scrapyd.utils import get_crawl_args
|
2010-12-27 16:22:32 -02:00
|
|
|
from .interfaces import IPoller, IEnvironment
|
2010-09-03 15:54:42 -03:00
|
|
|
|
|
|
|
class Launcher(Service):
    """Service that runs crawl jobs in child processes, one per slot.

    On start, every slot in ``range(max_proc)`` asks the ``IPoller``
    component for its next message.  When a message arrives a runner process
    is spawned for it, and once that process ends the slot goes back to
    waiting on the poller.
    """

    name = 'launcher'

    def __init__(self, config, app):
        """Read the launcher settings from *config* and keep a handle on *app*.

        ``max_proc`` is taken from the config; when it is missing or zero it
        falls back to ``cpu_count() * max_proc_per_cpu`` (default 4 per CPU).
        """
        # slot -> ScrapyProcessProtocol of the process currently in that slot
        self.processes = {}
        limit = config.getint('max_proc', 0)
        if not limit:
            per_cpu_base = cpu_count()
            limit = per_cpu_base * config.getint('max_proc_per_cpu', 4)
        self.max_proc = limit
        self.runner = config.get('runner', 'scrapyd.runner')
        self.app = app

    def startService(self):
        """Put every slot into the waiting state and log the configuration."""
        for slot_no in range(self.max_proc):
            self._wait_for_project(slot_no)
        banner = "%s started: max_proc=%r, runner=%r" % (
            self.parent.name, self.max_proc, self.runner)
        log.msg(banner, system="Launcher")

    def _wait_for_project(self, slot):
        """Ask the poller for the next message and spawn it in *slot*."""
        pending = self.app.getComponent(IPoller).next()
        # The extra positional argument is forwarded to _spawn_process.
        pending.addCallback(self._spawn_process, slot)

    def _spawn_process(self, message, slot):
        """Start a runner process for *message* and register it under *slot*.

        *message* must provide the ``_project``, ``_spider`` and ``_job``
        keys; the process environment comes from the ``IEnvironment``
        component.
        """
        msg = stringify_dict(message, keys_only=False)
        project = msg['_project']

        # Command line: <python> -m <runner module> crawl <crawl args...>
        cmdline = [sys.executable, '-m', self.runner, 'crawl']
        cmdline.extend(get_crawl_args(msg))

        environment = self.app.getComponent(IEnvironment)
        env = stringify_dict(environment.get_environment(msg, slot),
                             keys_only=False)

        proto = ScrapyProcessProtocol(slot, project, msg['_spider'],
                                      msg['_job'], env)
        # addBoth: the slot is recycled whether the deferred fires with a
        # result or a failure.
        proto.deferred.addBoth(self._process_finished, slot)
        reactor.spawnProcess(proto, sys.executable, args=cmdline, env=env)
        self.processes[slot] = proto

    def _process_finished(self, _, slot):
        """Forget the process that occupied *slot* and wait for new work."""
        del self.processes[slot]
        self._wait_for_project(slot)
|
|
|
|
|
|
|
|
|
|
|
|
class ScrapyProcessProtocol(protocol.ProcessProtocol):
    """Process protocol for a single spawned crawl process.

    Relays the child's stdout/stderr to the Twisted log, records when the
    process starts and ends, and fires ``self.deferred`` with this protocol
    instance once the process has ended.
    """

    def __init__(self, slot, project, spider, job, env):
        """Store the job description; *env* must contain ``SCRAPY_LOG_FILE``."""
        self.slot = slot
        # Filled in by connectionMade() once the transport is available.
        self.pid = None
        self.project = project
        self.spider = spider
        self.job = job
        self.start_time = datetime.now()
        self.env = env
        self.logfile = env['SCRAPY_LOG_FILE']
        self.deferred = defer.Deferred()

    def outReceived(self, data):
        """Forward a chunk of the child's stdout to the log."""
        text = data.rstrip()
        tag = "Launcher,%d/stdout" % self.pid
        log.msg(text, system=tag)

    def errReceived(self, data):
        """Forward a chunk of the child's stderr to the log."""
        text = data.rstrip()
        tag = "Launcher,%d/stderr" % self.pid
        log.msg(text, system=tag)

    def connectionMade(self):
        """Record the child's pid and log that the process has started."""
        self.pid = self.transport.pid
        self.log("Process started: ")

    def processEnded(self, status):
        """Log how the process ended, then fire ``self.deferred``."""
        reason = status.value
        if isinstance(reason, error.ProcessDone):
            prefix = "Process finished: "
        else:
            prefix = "Process died: exitstatus=%r " % reason.exitCode
        self.log(prefix)
        self.deferred.callback(self)

    def log(self, msg):
        """Log *msg* followed by the project/spider/job/pid/log details."""
        details = "project=%r spider=%r job=%r pid=%r log=%r" % (
            self.project, self.spider, self.job, self.pid, self.logfile)
        log.msg(msg + details, system="Launcher")
|