mirror of
https://github.com/scrapy/scrapy.git
synced 2025-02-24 09:04:16 +00:00
Support running projects without eggs in Scrapyd. Closes #238
This commit is contained in:
parent
b76cd42690
commit
833baa6041
@ -1,27 +1,9 @@
|
|||||||
"""
|
import sys, os
|
||||||
This module can be used to run a Scrapy project contained in an egg file
|
|
||||||
|
|
||||||
To see all spiders in a project:
|
|
||||||
|
|
||||||
python -m scrapyd.eggrunner myproject.egg list
|
|
||||||
|
|
||||||
To crawl a spider:
|
|
||||||
|
|
||||||
python -m scrapyd.eggrunner myproject.egg crawl somespider
|
|
||||||
"""
|
|
||||||
|
|
||||||
import sys
|
|
||||||
|
|
||||||
from scrapyd.eggutils import activate_egg
|
from scrapyd.eggutils import activate_egg
|
||||||
|
|
||||||
def main(eggpath, args):
|
eggpath = os.environ.get('SCRAPY_EGGFILE')
|
||||||
"""Run scrapy for the settings module name passed"""
|
if eggpath:
|
||||||
activate_egg(eggpath)
|
activate_egg(eggpath)
|
||||||
from scrapy.cmdline import execute
|
from scrapy.cmdline import execute
|
||||||
execute(['scrapy'] + list(args))
|
execute()
|
||||||
|
|
||||||
if __name__ == '__main__':
|
|
||||||
if len(sys.argv) < 2:
|
|
||||||
print "usage: %s <eggfile> [scrapy_command args ...]" % sys.argv[0]
|
|
||||||
sys.exit(1)
|
|
||||||
main(sys.argv[1], sys.argv[2:])
|
|
||||||
|
@ -26,7 +26,10 @@ class FilesystemEggStorage(object):
|
|||||||
|
|
||||||
def get(self, project, version=None):
|
def get(self, project, version=None):
|
||||||
if version is None:
|
if version is None:
|
||||||
version = self.list(project)[-1]
|
try:
|
||||||
|
version = self.list(project)[-1]
|
||||||
|
except IndexError:
|
||||||
|
return None, None
|
||||||
return version, open(self._eggpath(project, version), 'rb')
|
return version, open(self._eggpath(project, version), 'rb')
|
||||||
|
|
||||||
def list(self, project):
|
def list(self, project):
|
||||||
|
@ -12,9 +12,10 @@ def get_spider_list_from_eggfile(eggfile, project):
|
|||||||
shutil.copyfileobj(eggfile, f)
|
shutil.copyfileobj(eggfile, f)
|
||||||
f.flush()
|
f.flush()
|
||||||
eggfile.seek(0)
|
eggfile.seek(0)
|
||||||
pargs = [sys.executable, '-m', 'scrapyd.eggrunner', f.name, 'list']
|
pargs = [sys.executable, '-m', 'scrapyd.eggrunner', 'list']
|
||||||
env = os.environ.copy()
|
env = os.environ.copy()
|
||||||
env['SCRAPY_PROJECT'] = project
|
env['SCRAPY_PROJECT'] = project
|
||||||
|
env['SCRAPY_EGGFILE'] = f.name
|
||||||
proc = Popen(pargs, stdout=PIPE, cwd=tmpdir, env=env)
|
proc = Popen(pargs, stdout=PIPE, cwd=tmpdir, env=env)
|
||||||
out = proc.communicate()[0]
|
out = proc.communicate()[0]
|
||||||
return out.splitlines()
|
return out.splitlines()
|
||||||
|
@ -11,11 +11,19 @@ class Environment(object):
|
|||||||
def __init__(self, config):
|
def __init__(self, config):
|
||||||
self.dbs_dir = config.get('dbs_dir', 'dbs')
|
self.dbs_dir = config.get('dbs_dir', 'dbs')
|
||||||
self.logs_dir = config.get('logs_dir', 'logs')
|
self.logs_dir = config.get('logs_dir', 'logs')
|
||||||
|
if config.cp.has_section('settings'):
|
||||||
|
self.settings = dict(config.cp.items('settings'))
|
||||||
|
else:
|
||||||
|
self.settings = {}
|
||||||
|
|
||||||
def get_environment(self, message, slot):
|
def get_environment(self, message, slot, eggpath):
|
||||||
project = message['project']
|
project = message['project']
|
||||||
env = os.environ.copy()
|
env = os.environ.copy()
|
||||||
env['SCRAPY_PROJECT'] = project
|
env['SCRAPY_PROJECT'] = project
|
||||||
|
if eggpath:
|
||||||
|
env['SCRAPY_EGGFILE'] = eggpath
|
||||||
|
elif project in self.settings:
|
||||||
|
env['SCRAPY_SETTINGS_MODULE'] = self.settings[project]
|
||||||
dbpath = os.path.join(self.dbs_dir, '%s.db' % project)
|
dbpath = os.path.join(self.dbs_dir, '%s.db' % project)
|
||||||
env['SCRAPY_SQLITE_DB'] = dbpath
|
env['SCRAPY_SQLITE_DB'] = dbpath
|
||||||
logpath = os.path.join(self.logs_dir, 'slot%s.log' % slot)
|
logpath = os.path.join(self.logs_dir, 'slot%s.log' % slot)
|
||||||
|
@ -10,7 +10,8 @@ class IEggStorage(Interface):
|
|||||||
def get(project, version=None):
|
def get(project, version=None):
|
||||||
"""Return a tuple (version, file) with the egg for the specified
|
"""Return a tuple (version, file) with the egg for the specified
|
||||||
project and version. If version is None, the latest version is
|
project and version. If version is None, the latest version is
|
||||||
returned."""
|
returned. If no egg is found for the given project/version (None, None)
|
||||||
|
should be returned."""
|
||||||
|
|
||||||
def list(project):
|
def list(project):
|
||||||
"""Return the list of versions which have eggs stored (for the given
|
"""Return the list of versions which have eggs stored (for the given
|
||||||
@ -61,9 +62,13 @@ class ISpiderScheduler(Interface):
|
|||||||
class IEnvironment(Interface):
|
class IEnvironment(Interface):
|
||||||
"""A component to generate the environment of crawler processes"""
|
"""A component to generate the environment of crawler processes"""
|
||||||
|
|
||||||
def get_environment(message, slot):
|
def get_environment(message, slot, eggpath):
|
||||||
"""Return the environment variables to use for running the process.
|
"""Return the environment variables to use for running the process.
|
||||||
|
|
||||||
`message` is the message received from the IPoller.next()
|
`message` is the message received from the IPoller.next()
|
||||||
`slot` is the Launcher slot where the process will be running.
|
`slot` is the Launcher slot where the process will be running.
|
||||||
|
`eggpath` is the path to an eggfile that contains the project code. The
|
||||||
|
`eggpath` may be `None` if no egg was found for the project, in
|
||||||
|
which case the project must be on the python path and its settings
|
||||||
|
defined in the [settings] section of scrapyd.conf.
|
||||||
"""
|
"""
|
||||||
|
@ -39,15 +39,16 @@ class Launcher(Service):
|
|||||||
def _spawn_process(self, message, slot):
|
def _spawn_process(self, message, slot):
|
||||||
project = message['project']
|
project = message['project']
|
||||||
eggpath = self._get_eggpath(project)
|
eggpath = self._get_eggpath(project)
|
||||||
args = [sys.executable, '-m', self.egg_runner, eggpath, 'crawl']
|
args = [sys.executable, '-m', self.egg_runner, 'crawl']
|
||||||
e = self.app.getComponent(IEnvironment)
|
e = self.app.getComponent(IEnvironment)
|
||||||
env = e.get_environment(message, slot)
|
env = e.get_environment(message, slot, eggpath)
|
||||||
pp = ScrapyProcessProtocol(eggpath, slot)
|
pp = ScrapyProcessProtocol(eggpath, slot)
|
||||||
pp.deferred.addBoth(self._process_finished, eggpath, slot)
|
pp.deferred.addBoth(self._process_finished, eggpath, slot)
|
||||||
reactor.spawnProcess(pp, sys.executable, args=args, env=env)
|
reactor.spawnProcess(pp, sys.executable, args=args, env=env)
|
||||||
|
|
||||||
def _process_finished(self, _, eggpath, slot):
|
def _process_finished(self, _, eggpath, slot):
|
||||||
os.remove(eggpath)
|
if eggpath:
|
||||||
|
os.remove(eggpath)
|
||||||
self._wait_for_project(slot)
|
self._wait_for_project(slot)
|
||||||
|
|
||||||
|
|
||||||
|
@ -14,15 +14,29 @@ class EggStorageTest(unittest.TestCase):
|
|||||||
d = self.mktemp()
|
d = self.mktemp()
|
||||||
os.mkdir(d)
|
os.mkdir(d)
|
||||||
config = Config(values={'eggs_dir': d, 'logs_dir': d})
|
config = Config(values={'eggs_dir': d, 'logs_dir': d})
|
||||||
|
config.cp.add_section('settings')
|
||||||
|
config.cp.set('settings', 'newbot', 'newbot.settings')
|
||||||
self.environ = Environment(config)
|
self.environ = Environment(config)
|
||||||
|
|
||||||
def test_interface(self):
|
def test_interface(self):
|
||||||
verifyObject(IEnvironment, self.environ)
|
verifyObject(IEnvironment, self.environ)
|
||||||
|
|
||||||
def test_get_environment(self):
|
def test_get_environment_with_eggfile(self):
|
||||||
msg = {'project': 'mybot'}
|
msg = {'project': 'mybot'}
|
||||||
slot = 3
|
slot = 3
|
||||||
env = self.environ.get_environment(msg, slot)
|
env = self.environ.get_environment(msg, slot, '/path/to/file.egg')
|
||||||
self.assertEqual(env['SCRAPY_PROJECT'], 'mybot')
|
self.assertEqual(env['SCRAPY_PROJECT'], 'mybot')
|
||||||
self.assert_(env['SCRAPY_SQLITE_DB'].endswith('mybot.db'))
|
self.assert_(env['SCRAPY_SQLITE_DB'].endswith('mybot.db'))
|
||||||
self.assert_(env['SCRAPY_LOG_FILE'].endswith('slot3.log'))
|
self.assert_(env['SCRAPY_LOG_FILE'].endswith('slot3.log'))
|
||||||
|
self.assert_(env['SCRAPY_EGGFILE'].endswith('/path/to/file.egg'))
|
||||||
|
self.failIf('SCRAPY_SETTINGS_MODULE' in env)
|
||||||
|
|
||||||
|
def test_get_environment_without_eggfile(self):
|
||||||
|
msg = {'project': 'newbot'}
|
||||||
|
slot = 3
|
||||||
|
env = self.environ.get_environment(msg, slot, None)
|
||||||
|
self.assertEqual(env['SCRAPY_PROJECT'], 'newbot')
|
||||||
|
self.assert_(env['SCRAPY_SQLITE_DB'].endswith('newbot.db'))
|
||||||
|
self.assert_(env['SCRAPY_LOG_FILE'].endswith('slot3.log'))
|
||||||
|
self.assertEqual(env['SCRAPY_SETTINGS_MODULE'], 'newbot.settings')
|
||||||
|
self.failIf('SCRAPY_EGGFILE' in env)
|
||||||
|
Loading…
x
Reference in New Issue
Block a user