From 833baa60419b708ee0af18fed5c8e2ad31daf929 Mon Sep 17 00:00:00 2001 From: Pablo Hoffman Date: Tue, 14 Sep 2010 01:44:25 -0300 Subject: [PATCH] Support running projects without eggs in Scrapyd. Closes #238 --- scrapyd/eggrunner.py | 28 +++++----------------------- scrapyd/eggstorage.py | 5 ++++- scrapyd/eggutils.py | 3 ++- scrapyd/environ.py | 10 +++++++++- scrapyd/interfaces.py | 9 +++++++-- scrapyd/launcher.py | 7 ++++--- scrapyd/tests/test_envion.py | 18 ++++++++++++++++-- 7 files changed, 47 insertions(+), 33 deletions(-) diff --git a/scrapyd/eggrunner.py b/scrapyd/eggrunner.py index 846ee36cd..830768526 100644 --- a/scrapyd/eggrunner.py +++ b/scrapyd/eggrunner.py @@ -1,27 +1,9 @@ -""" -This module can be used to run a Scrapy project contained in an egg file - -To see all spiders in a project: - - python -m scrapyd.eggrunner myproject.egg list - -To crawl a spider: - - python -m scrapyd.eggrunner myproject.egg crawl somespider -""" - -import sys +import sys, os from scrapyd.eggutils import activate_egg -def main(eggpath, args): - """Run scrapy for the settings module name passed""" +eggpath = os.environ.get('SCRAPY_EGGFILE') +if eggpath: activate_egg(eggpath) - from scrapy.cmdline import execute - execute(['scrapy'] + list(args)) - -if __name__ == '__main__': - if len(sys.argv) < 2: - print "usage: %s [scrapy_command args ...]" % sys.argv[0] - sys.exit(1) - main(sys.argv[1], sys.argv[2:]) +from scrapy.cmdline import execute +execute() diff --git a/scrapyd/eggstorage.py b/scrapyd/eggstorage.py index f46da4750..fbeb9e9c4 100644 --- a/scrapyd/eggstorage.py +++ b/scrapyd/eggstorage.py @@ -26,7 +26,10 @@ class FilesystemEggStorage(object): def get(self, project, version=None): if version is None: - version = self.list(project)[-1] + try: + version = self.list(project)[-1] + except IndexError: + return None, None return version, open(self._eggpath(project, version), 'rb') def list(self, project): diff --git a/scrapyd/eggutils.py b/scrapyd/eggutils.py index 
95ab2aca5..1e1975580 100644 --- a/scrapyd/eggutils.py +++ b/scrapyd/eggutils.py @@ -12,9 +12,10 @@ def get_spider_list_from_eggfile(eggfile, project): shutil.copyfileobj(eggfile, f) f.flush() eggfile.seek(0) - pargs = [sys.executable, '-m', 'scrapyd.eggrunner', f.name, 'list'] + pargs = [sys.executable, '-m', 'scrapyd.eggrunner', 'list'] env = os.environ.copy() env['SCRAPY_PROJECT'] = project + env['SCRAPY_EGGFILE'] = f.name proc = Popen(pargs, stdout=PIPE, cwd=tmpdir, env=env) out = proc.communicate()[0] return out.splitlines() diff --git a/scrapyd/environ.py b/scrapyd/environ.py index 0cd184687..0cbf7c30b 100644 --- a/scrapyd/environ.py +++ b/scrapyd/environ.py @@ -11,11 +11,19 @@ class Environment(object): def __init__(self, config): self.dbs_dir = config.get('dbs_dir', 'dbs') self.logs_dir = config.get('logs_dir', 'logs') + if config.cp.has_section('settings'): + self.settings = dict(config.cp.items('settings')) + else: + self.settings = {} - def get_environment(self, message, slot): + def get_environment(self, message, slot, eggpath): project = message['project'] env = os.environ.copy() env['SCRAPY_PROJECT'] = project + if eggpath: + env['SCRAPY_EGGFILE'] = eggpath + elif project in self.settings: + env['SCRAPY_SETTINGS_MODULE'] = self.settings[project] dbpath = os.path.join(self.dbs_dir, '%s.db' % project) env['SCRAPY_SQLITE_DB'] = dbpath logpath = os.path.join(self.logs_dir, 'slot%s.log' % slot) diff --git a/scrapyd/interfaces.py b/scrapyd/interfaces.py index e8a1f2cf2..30a27e637 100644 --- a/scrapyd/interfaces.py +++ b/scrapyd/interfaces.py @@ -10,7 +10,8 @@ class IEggStorage(Interface): def get(project, version=None): """Return a tuple (version, file) with the the egg for the specified project and version. If version is None, the latest version is - returned.""" + returned. 
If no egg is found for the given project/version, (None, None) + should be returned.""" def list(project): """Return the list of versions which have eggs stored (for the given @@ -61,9 +62,13 @@ class ISpiderScheduler(Interface): class IEnvironment(Interface): """A component to generate the environment of crawler processes""" - def get_environment(message, slot): + def get_environment(message, slot, eggpath): """Return the environment variables to use for running the process. `message` is the message received from the IPoller.next() `slot` is the Launcher slot where the process will be running. + `eggpath` is the path to an eggfile that contains the project code. The + `eggpath` may be `None` if no egg was found for the project, in + which case the project must be on the Python path and its settings + defined in the [settings] section of scrapyd.conf """ diff --git a/scrapyd/launcher.py b/scrapyd/launcher.py index 9afcad16f..6742af78b 100644 --- a/scrapyd/launcher.py +++ b/scrapyd/launcher.py @@ -39,15 +39,16 @@ class Launcher(Service): def _spawn_process(self, message, slot): project = message['project'] eggpath = self._get_eggpath(project) - args = [sys.executable, '-m', self.egg_runner, eggpath, 'crawl'] + args = [sys.executable, '-m', self.egg_runner, 'crawl'] e = self.app.getComponent(IEnvironment) - env = e.get_environment(message, slot) + env = e.get_environment(message, slot, eggpath) pp = ScrapyProcessProtocol(eggpath, slot) pp.deferred.addBoth(self._process_finished, eggpath, slot) reactor.spawnProcess(pp, sys.executable, args=args, env=env) def _process_finished(self, _, eggpath, slot): - os.remove(eggpath) + if eggpath: + os.remove(eggpath) self._wait_for_project(slot) diff --git a/scrapyd/tests/test_envion.py b/scrapyd/tests/test_envion.py index 0d1c657ee..9f99da3a6 100644 --- a/scrapyd/tests/test_envion.py +++ b/scrapyd/tests/test_envion.py @@ -14,15 +14,29 @@ class EggStorageTest(unittest.TestCase): d = self.mktemp() os.mkdir(d) config = 
Config(values={'eggs_dir': d, 'logs_dir': d}) + config.cp.add_section('settings') + config.cp.set('settings', 'newbot', 'newbot.settings') self.environ = Environment(config) def test_interface(self): verifyObject(IEnvironment, self.environ) - def test_get_environment(self): + def test_get_environment_with_eggfile(self): msg = {'project': 'mybot'} slot = 3 - env = self.environ.get_environment(msg, slot) + env = self.environ.get_environment(msg, slot, '/path/to/file.egg') self.assertEqual(env['SCRAPY_PROJECT'], 'mybot') self.assert_(env['SCRAPY_SQLITE_DB'].endswith('mybot.db')) self.assert_(env['SCRAPY_LOG_FILE'].endswith('slot3.log')) + self.assert_(env['SCRAPY_EGGFILE'].endswith('/path/to/file.egg')) + self.failIf('SCRAPY_SETTINGS_MODULE' in env) + + def test_get_environment_without_eggfile(self): + msg = {'project': 'newbot'} + slot = 3 + env = self.environ.get_environment(msg, slot, None) + self.assertEqual(env['SCRAPY_PROJECT'], 'newbot') + self.assert_(env['SCRAPY_SQLITE_DB'].endswith('newbot.db')) + self.assert_(env['SCRAPY_LOG_FILE'].endswith('slot3.log')) + self.assertEqual(env['SCRAPY_SETTINGS_MODULE'], 'newbot.settings') + self.failIf('SCRAPY_EGGFILE' in env)