mirror of https://github.com/scrapy/scrapy.git synced 2025-02-24 09:04:16 +00:00

Support running projects without eggs in Scrapyd. Closes #238

Pablo Hoffman 2010-09-14 01:44:25 -03:00
parent b76cd42690
commit 833baa6041
7 changed files with 47 additions and 33 deletions
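
For context, "without eggs" here means a project that is already importable on the python path, with its settings module declared in the new scrapyd.conf [settings] section instead of being packaged as an egg. A minimal sketch of such a section, reusing the newbot / newbot.settings pair from the test at the bottom of this diff (not a shipped default):

[settings]
newbot = newbot.settings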

View File

@@ -1,27 +1,9 @@
-"""
-This module can be used to run a Scrapy project contained in an egg file
-
-To see all spiders in a project:
-
-    python -m scrapyd.eggrunner myproject.egg list
-
-To crawl a spider:
-
-    python -m scrapyd.eggrunner myproject.egg crawl somespider
-"""
-
-import sys
+import sys, os
 
 from scrapyd.eggutils import activate_egg
 
-def main(eggpath, args):
-    """Run scrapy for the settings module name passed"""
+eggpath = os.environ.get('SCRAPY_EGGFILE')
+if eggpath:
     activate_egg(eggpath)
-    from scrapy.cmdline import execute
-    execute(['scrapy'] + list(args))
-
-if __name__ == '__main__':
-    if len(sys.argv) < 2:
-        print "usage: %s <eggfile> [scrapy_command args ...]" % sys.argv[0]
-        sys.exit(1)
-    main(sys.argv[1], sys.argv[2:])
+from scrapy.cmdline import execute
+execute()
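
The runner no longer takes the egg path as a command-line argument; it is driven entirely by the SCRAPY_PROJECT and SCRAPY_EGGFILE environment variables. A rough sketch of invoking it by hand, mirroring what the eggutils and launcher changes below do (mybot and the egg path are placeholders):

import os, subprocess, sys

env = os.environ.copy()
env['SCRAPY_PROJECT'] = 'mybot'
env['SCRAPY_EGGFILE'] = '/tmp/mybot.egg'  # leave unset to run a project straight from the python path
subprocess.call([sys.executable, '-m', 'scrapyd.eggrunner', 'list'], env=env)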

View File

@@ -26,7 +26,10 @@ class FilesystemEggStorage(object):
 
     def get(self, project, version=None):
        if version is None:
-            version = self.list(project)[-1]
+            try:
+                version = self.list(project)[-1]
+            except IndexError:
+                return None, None
        return version, open(self._eggpath(project, version), 'rb')
 
    def list(self, project):
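
With this change, callers of get() no longer have to guard against IndexError for projects that have no egg stored; they receive (None, None) and can fall back to the settings-based setup. A quick sketch of the new contract (the import paths and the mybot project name are assumptions, not part of this diff):

from scrapyd.config import Config
from scrapyd.eggstorage import FilesystemEggStorage

storage = FilesystemEggStorage(Config())
version, eggfile = storage.get('mybot')
if eggfile is None:
    # nothing uploaded for this project: the launcher passes eggpath=None and
    # the environment falls back to the scrapyd.conf [settings] entry
    pass
else:
    eggfile.close()  # the caller owns the returned open file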

View File

@ -12,9 +12,10 @@ def get_spider_list_from_eggfile(eggfile, project):
shutil.copyfileobj(eggfile, f)
f.flush()
eggfile.seek(0)
pargs = [sys.executable, '-m', 'scrapyd.eggrunner', f.name, 'list']
pargs = [sys.executable, '-m', 'scrapyd.eggrunner', 'list']
env = os.environ.copy()
env['SCRAPY_PROJECT'] = project
env['SCRAPY_EGGFILE'] = f.name
proc = Popen(pargs, stdout=PIPE, cwd=tmpdir, env=env)
out = proc.communicate()[0]
return out.splitlines()

View File

@@ -11,11 +11,19 @@ class Environment(object):
     def __init__(self, config):
         self.dbs_dir = config.get('dbs_dir', 'dbs')
         self.logs_dir = config.get('logs_dir', 'logs')
+        if config.cp.has_section('settings'):
+            self.settings = dict(config.cp.items('settings'))
+        else:
+            self.settings = {}
 
-    def get_environment(self, message, slot):
+    def get_environment(self, message, slot, eggpath):
         project = message['project']
         env = os.environ.copy()
         env['SCRAPY_PROJECT'] = project
+        if eggpath:
+            env['SCRAPY_EGGFILE'] = eggpath
+        elif project in self.settings:
+            env['SCRAPY_SETTINGS_MODULE'] = self.settings[project]
         dbpath = os.path.join(self.dbs_dir, '%s.db' % project)
         env['SCRAPY_SQLITE_DB'] = dbpath
         logpath = os.path.join(self.logs_dir, 'slot%s.log' % slot)

View File

@@ -10,7 +10,8 @@ class IEggStorage(Interface):
     def get(project, version=None):
         """Return a tuple (version, file) with the the egg for the specified
         project and version. If version is None, the latest version is
-        returned."""
+        returned. If no egg is found for the given project/version (None, None)
+        should be returned."""
 
     def list(project):
         """Return the list of versions which have eggs stored (for the given
@@ -61,9 +62,13 @@ class ISpiderScheduler(Interface):
 class IEnvironment(Interface):
     """A component to generate the environment of crawler processes"""
 
-    def get_environment(message, slot):
+    def get_environment(message, slot, eggpath):
         """Return the environment variables to use for running the process.
 
         `message` is the message received from the IPoller.next()
         `slot` is the Launcher slot where the process will be running.
+        `eggpath` is the path to an eggfile that contains the project code. The
+        `eggpath` may be `None` if no egg was found for the project, in
+        which case the project must be on the python path and its settings
+        defined in scrapyd.conf [settings] section
         """

View File

@ -39,14 +39,15 @@ class Launcher(Service):
def _spawn_process(self, message, slot):
project = message['project']
eggpath = self._get_eggpath(project)
args = [sys.executable, '-m', self.egg_runner, eggpath, 'crawl']
args = [sys.executable, '-m', self.egg_runner, 'crawl']
e = self.app.getComponent(IEnvironment)
env = e.get_environment(message, slot)
env = e.get_environment(message, slot, eggpath)
pp = ScrapyProcessProtocol(eggpath, slot)
pp.deferred.addBoth(self._process_finished, eggpath, slot)
reactor.spawnProcess(pp, sys.executable, args=args, env=env)
def _process_finished(self, _, eggpath, slot):
if eggpath:
os.remove(eggpath)
self._wait_for_project(slot)

View File

@ -14,15 +14,29 @@ class EggStorageTest(unittest.TestCase):
d = self.mktemp()
os.mkdir(d)
config = Config(values={'eggs_dir': d, 'logs_dir': d})
config.cp.add_section('settings')
config.cp.set('settings', 'newbot', 'newbot.settings')
self.environ = Environment(config)
def test_interface(self):
verifyObject(IEnvironment, self.environ)
def test_get_environment(self):
def test_get_environment_with_eggfile(self):
msg = {'project': 'mybot'}
slot = 3
env = self.environ.get_environment(msg, slot)
env = self.environ.get_environment(msg, slot, '/path/to/file.egg')
self.assertEqual(env['SCRAPY_PROJECT'], 'mybot')
self.assert_(env['SCRAPY_SQLITE_DB'].endswith('mybot.db'))
self.assert_(env['SCRAPY_LOG_FILE'].endswith('slot3.log'))
self.assert_(env['SCRAPY_EGGFILE'].endswith('/path/to/file.egg'))
self.failIf('SCRAPY_SETTINGS_MODULE' in env)
def test_get_environment_without_eggfile(self):
msg = {'project': 'newbot'}
slot = 3
env = self.environ.get_environment(msg, slot, None)
self.assertEqual(env['SCRAPY_PROJECT'], 'newbot')
self.assert_(env['SCRAPY_SQLITE_DB'].endswith('newbot.db'))
self.assert_(env['SCRAPY_LOG_FILE'].endswith('slot3.log'))
self.assertEqual(env['SCRAPY_SETTINGS_MODULE'], 'newbot.settings')
self.failIf('SCRAPY_EGGFILE' in env)