1
0
mirror of https://github.com/scrapy/scrapy.git synced 2025-02-24 14:44:07 +00:00

Support running projects without eggs in Scrapyd. Closes #238

This commit is contained in:
Pablo Hoffman 2010-09-14 01:44:25 -03:00
parent b76cd42690
commit 833baa6041
7 changed files with 47 additions and 33 deletions

View File

@ -1,27 +1,9 @@
""" import sys, os
This module can be used to run a Scrapy project contained in an egg file
To see all spiders in a project:
python -m scrapyd.eggrunner myproject.egg list
To crawl a spider:
python -m scrapyd.eggrunner myproject.egg crawl somespider
"""
import sys
from scrapyd.eggutils import activate_egg from scrapyd.eggutils import activate_egg
def main(eggpath, args): eggpath = os.environ.get('SCRAPY_EGGFILE')
"""Run scrapy for the settings module name passed""" if eggpath:
activate_egg(eggpath) activate_egg(eggpath)
from scrapy.cmdline import execute from scrapy.cmdline import execute
execute(['scrapy'] + list(args)) execute()
if __name__ == '__main__':
if len(sys.argv) < 2:
print "usage: %s <eggfile> [scrapy_command args ...]" % sys.argv[0]
sys.exit(1)
main(sys.argv[1], sys.argv[2:])

View File

@ -26,7 +26,10 @@ class FilesystemEggStorage(object):
def get(self, project, version=None): def get(self, project, version=None):
if version is None: if version is None:
version = self.list(project)[-1] try:
version = self.list(project)[-1]
except IndexError:
return None, None
return version, open(self._eggpath(project, version), 'rb') return version, open(self._eggpath(project, version), 'rb')
def list(self, project): def list(self, project):

View File

@ -12,9 +12,10 @@ def get_spider_list_from_eggfile(eggfile, project):
shutil.copyfileobj(eggfile, f) shutil.copyfileobj(eggfile, f)
f.flush() f.flush()
eggfile.seek(0) eggfile.seek(0)
pargs = [sys.executable, '-m', 'scrapyd.eggrunner', f.name, 'list'] pargs = [sys.executable, '-m', 'scrapyd.eggrunner', 'list']
env = os.environ.copy() env = os.environ.copy()
env['SCRAPY_PROJECT'] = project env['SCRAPY_PROJECT'] = project
env['SCRAPY_EGGFILE'] = f.name
proc = Popen(pargs, stdout=PIPE, cwd=tmpdir, env=env) proc = Popen(pargs, stdout=PIPE, cwd=tmpdir, env=env)
out = proc.communicate()[0] out = proc.communicate()[0]
return out.splitlines() return out.splitlines()

View File

@ -11,11 +11,19 @@ class Environment(object):
def __init__(self, config): def __init__(self, config):
self.dbs_dir = config.get('dbs_dir', 'dbs') self.dbs_dir = config.get('dbs_dir', 'dbs')
self.logs_dir = config.get('logs_dir', 'logs') self.logs_dir = config.get('logs_dir', 'logs')
if config.cp.has_section('settings'):
self.settings = dict(config.cp.items('settings'))
else:
self.settings = {}
def get_environment(self, message, slot): def get_environment(self, message, slot, eggpath):
project = message['project'] project = message['project']
env = os.environ.copy() env = os.environ.copy()
env['SCRAPY_PROJECT'] = project env['SCRAPY_PROJECT'] = project
if eggpath:
env['SCRAPY_EGGFILE'] = eggpath
elif project in self.settings:
env['SCRAPY_SETTINGS_MODULE'] = self.settings[project]
dbpath = os.path.join(self.dbs_dir, '%s.db' % project) dbpath = os.path.join(self.dbs_dir, '%s.db' % project)
env['SCRAPY_SQLITE_DB'] = dbpath env['SCRAPY_SQLITE_DB'] = dbpath
logpath = os.path.join(self.logs_dir, 'slot%s.log' % slot) logpath = os.path.join(self.logs_dir, 'slot%s.log' % slot)

View File

@ -10,7 +10,8 @@ class IEggStorage(Interface):
def get(project, version=None): def get(project, version=None):
"""Return a tuple (version, file) with the egg for the specified """Return a tuple (version, file) with the egg for the specified
project and version. If version is None, the latest version is project and version. If version is None, the latest version is
returned.""" returned. If no egg is found for the given project/version, (None, None)
should be returned."""
def list(project): def list(project):
"""Return the list of versions which have eggs stored (for the given """Return the list of versions which have eggs stored (for the given
@ -61,9 +62,13 @@ class ISpiderScheduler(Interface):
class IEnvironment(Interface): class IEnvironment(Interface):
"""A component to generate the environment of crawler processes""" """A component to generate the environment of crawler processes"""
def get_environment(message, slot): def get_environment(message, slot, eggpath):
"""Return the environment variables to use for running the process. """Return the environment variables to use for running the process.
`message` is the message received from the IPoller.next() `message` is the message received from the IPoller.next()
`slot` is the Launcher slot where the process will be running. `slot` is the Launcher slot where the process will be running.
`eggpath` is the path to an eggfile that contains the project code. The
`eggpath` may be `None` if no egg was found for the project, in
which case the project must be on the Python path and its settings
module must be defined in the [settings] section of scrapyd.conf.
""" """

View File

@ -39,15 +39,16 @@ class Launcher(Service):
def _spawn_process(self, message, slot): def _spawn_process(self, message, slot):
project = message['project'] project = message['project']
eggpath = self._get_eggpath(project) eggpath = self._get_eggpath(project)
args = [sys.executable, '-m', self.egg_runner, eggpath, 'crawl'] args = [sys.executable, '-m', self.egg_runner, 'crawl']
e = self.app.getComponent(IEnvironment) e = self.app.getComponent(IEnvironment)
env = e.get_environment(message, slot) env = e.get_environment(message, slot, eggpath)
pp = ScrapyProcessProtocol(eggpath, slot) pp = ScrapyProcessProtocol(eggpath, slot)
pp.deferred.addBoth(self._process_finished, eggpath, slot) pp.deferred.addBoth(self._process_finished, eggpath, slot)
reactor.spawnProcess(pp, sys.executable, args=args, env=env) reactor.spawnProcess(pp, sys.executable, args=args, env=env)
def _process_finished(self, _, eggpath, slot): def _process_finished(self, _, eggpath, slot):
os.remove(eggpath) if eggpath:
os.remove(eggpath)
self._wait_for_project(slot) self._wait_for_project(slot)

View File

@ -14,15 +14,29 @@ class EggStorageTest(unittest.TestCase):
d = self.mktemp() d = self.mktemp()
os.mkdir(d) os.mkdir(d)
config = Config(values={'eggs_dir': d, 'logs_dir': d}) config = Config(values={'eggs_dir': d, 'logs_dir': d})
config.cp.add_section('settings')
config.cp.set('settings', 'newbot', 'newbot.settings')
self.environ = Environment(config) self.environ = Environment(config)
def test_interface(self): def test_interface(self):
verifyObject(IEnvironment, self.environ) verifyObject(IEnvironment, self.environ)
def test_get_environment(self): def test_get_environment_with_eggfile(self):
msg = {'project': 'mybot'} msg = {'project': 'mybot'}
slot = 3 slot = 3
env = self.environ.get_environment(msg, slot) env = self.environ.get_environment(msg, slot, '/path/to/file.egg')
self.assertEqual(env['SCRAPY_PROJECT'], 'mybot') self.assertEqual(env['SCRAPY_PROJECT'], 'mybot')
self.assert_(env['SCRAPY_SQLITE_DB'].endswith('mybot.db')) self.assert_(env['SCRAPY_SQLITE_DB'].endswith('mybot.db'))
self.assert_(env['SCRAPY_LOG_FILE'].endswith('slot3.log')) self.assert_(env['SCRAPY_LOG_FILE'].endswith('slot3.log'))
self.assert_(env['SCRAPY_EGGFILE'].endswith('/path/to/file.egg'))
self.failIf('SCRAPY_SETTINGS_MODULE' in env)
def test_get_environment_without_eggfile(self):
msg = {'project': 'newbot'}
slot = 3
env = self.environ.get_environment(msg, slot, None)
self.assertEqual(env['SCRAPY_PROJECT'], 'newbot')
self.assert_(env['SCRAPY_SQLITE_DB'].endswith('newbot.db'))
self.assert_(env['SCRAPY_LOG_FILE'].endswith('slot3.log'))
self.assertEqual(env['SCRAPY_SETTINGS_MODULE'], 'newbot.settings')
self.failIf('SCRAPY_EGGFILE' in env)