From 048044c1f8536b1e4d04712bba179053855294a8 Mon Sep 17 00:00:00 2001
From: Pablo Hoffman
Date: Wed, 5 Jan 2011 15:59:43 -0200
Subject: [PATCH] A couple of changes to fix #303:

* improved detection of inside-project environments
* make the list command faster (by only instantiating the spider manager)
* print a warning when extensions (middlewares, etc.) are disabled, with the
  message from the NotConfigured exception
* assert that scrapy configuration hasn't been loaded in scrapyd.runner
* simplified IgnoreRequest exception, to avoid loading settings when
  importing scrapy.exceptions
* added test to make sure certain modules don't cause the scrapy.conf module
  to be loaded, to ensure the scrapyd runner bootstrapping performs properly
---
 scrapy/cmdline.py                        |    3 ++-
 scrapy/commands/list.py                  |    8 +++++++-
 scrapy/core/engine.py                    |    4 +---
 scrapy/exceptions.py                     |    9 ---------
 scrapy/middleware.py                     |    3 ++-
 scrapy/utils/project.py                  |   21 +++++++++++++++------
 scrapyd/runner.py                        |    4 ++++
 scrapyd/tests/test_dont_load_settings.py |   22 ++++++++++++++++++++++
 scrapyd/tests/test_utils.py              |    2 +-
 9 files changed, 54 insertions(+), 22 deletions(-)
 create mode 100644 scrapyd/tests/test_dont_load_settings.py

diff --git a/scrapy/cmdline.py b/scrapy/cmdline.py
index 6884aad88..e381bf537 100644
--- a/scrapy/cmdline.py
+++ b/scrapy/cmdline.py
@@ -13,6 +13,7 @@ from scrapy.conf import settings
 from scrapy.command import ScrapyCommand
 from scrapy.exceptions import UsageError
 from scrapy.utils.misc import walk_modules
+from scrapy.utils.project import inside_project
 
 def _iter_command_classes(module_name):
     # TODO: add `name` attribute to commands and and merge this function with
@@ -106,7 +107,7 @@ def execute(argv=None):
         argv = sys.argv
     crawler = CrawlerProcess(settings)
     crawler.install()
-    inproject = bool(settings.settings_module)
+    inproject = inside_project()
     _check_deprecated_scrapy_ctl(argv, inproject) # TODO: remove for Scrapy 0.11
     cmds = _get_commands_dict(inproject)
     cmdname = _pop_command_name(argv)
diff --git a/scrapy/commands/list.py b/scrapy/commands/list.py
index 29b57edfd..319265658 100644
--- a/scrapy/commands/list.py
+++ b/scrapy/commands/list.py
@@ -1,4 +1,8 @@
+import os
+
 from scrapy.command import ScrapyCommand
+from scrapy.utils.misc import load_object
+from scrapy.conf import settings
 
 class Command(ScrapyCommand):
 
@@ -9,4 +13,6 @@ class Command(ScrapyCommand):
         return "List available spiders"
 
     def run(self, args, opts):
-        print "\n".join(self.crawler.spiders.list())
+        spman_cls = load_object(settings['SPIDER_MANAGER_CLASS'])
+        spiders = spman_cls.from_settings(settings)
+        print os.linesep.join(spiders.list())
diff --git a/scrapy/core/engine.py b/scrapy/core/engine.py
index 25d5dcc36..abd98f2d5 100644
--- a/scrapy/core/engine.py
+++ b/scrapy/core/engine.py
@@ -166,13 +166,11 @@ class ExecutionEngine(object):
             exc = _failure.value
             if isinstance(exc, IgnoreRequest):
                 errmsg = _failure.getErrorMessage()
-                level = exc.level
             else:
                 errmsg = str(_failure)
-                level = log.ERROR
             if errmsg:
                 log.msg("Error downloading <%s>: %s" % (request.url, errmsg), \
-                    level=level, spider=spider)
+                    level=log.ERROR, spider=spider)
             return Failure(IgnoreRequest(str(exc)))
 
         def _on_complete(_):
diff --git a/scrapy/exceptions.py b/scrapy/exceptions.py
index 3294dc4da..e52dafc6d 100644
--- a/scrapy/exceptions.py
+++ b/scrapy/exceptions.py
@@ -5,8 +5,6 @@ These exceptions are documented in docs/topics/exceptions.rst. Please don't add
 new exceptions here without documenting them there.
""" -from scrapy import log - # Internal class NotConfigured(Exception): @@ -18,13 +16,6 @@ class NotConfigured(Exception): class IgnoreRequest(Exception): """Indicates a decision was made not to process a request""" - def __init__(self, msg='', level=log.ERROR): - self.msg = msg - self.level = level - - def __str__(self): - return self.msg - class DontCloseSpider(Exception): """Request the spider not to be closed yet""" pass diff --git a/scrapy/middleware.py b/scrapy/middleware.py index 818dcdb87..c83b0d44f 100644 --- a/scrapy/middleware.py +++ b/scrapy/middleware.py @@ -34,7 +34,8 @@ class MiddlewareManager(object): middlewares.append(mw) except NotConfigured, e: if e.args: - log.msg(e) + clsname = clspath.split('.')[-1] + log.msg("Disabled %s: %s" % (clsname, e.args[0]), log.WARNING) enabled = [x.__class__.__name__ for x in middlewares] log.msg("Enabled %ss: %s" % (cls.component_name, ", ".join(enabled)), \ level=log.DEBUG) diff --git a/scrapy/utils/project.py b/scrapy/utils/project.py index 15af61a97..7fe778868 100644 --- a/scrapy/utils/project.py +++ b/scrapy/utils/project.py @@ -4,6 +4,7 @@ import warnings from scrapy.utils.conf import closest_scrapy_cfg, get_config from scrapy.utils.python import is_writable +from scrapy.exceptions import NotConfigured DATADIR_CFG_SECTION = 'datadir' @@ -20,12 +21,16 @@ def inside_project(): def project_data_dir(project='default'): """Return the current project data dir, creating it if it doesn't exist""" - assert inside_project(), "Not inside project" - scrapy_cfg = closest_scrapy_cfg() - d = abspath(join(dirname(scrapy_cfg), '.scrapy')) + if not inside_project(): + raise NotConfigured("Not inside a project") cfg = get_config() if cfg.has_option(DATADIR_CFG_SECTION, project): d = cfg.get(DATADIR_CFG_SECTION, project) + else: + scrapy_cfg = closest_scrapy_cfg() + if not scrapy_cfg: + raise NotConfigured("Unable to find scrapy.cfg file to infer project data dir") + d = abspath(join(dirname(scrapy_cfg), '.scrapy')) if not exists(d): makedirs(d) return d @@ -47,8 +52,12 @@ def sqlite_db(path, nonwritable_fallback=True): if not inside_project() or path == ':memory:': db = ':memory:' else: - db = data_path(path) - if not is_writable(db) and nonwritable_fallback: - warnings.warn("%r is not writable - using in-memory SQLite instead" % db) + try: + db = data_path(path) + except NotConfigured: db = ':memory:' + else: + if not is_writable(db) and nonwritable_fallback: + warnings.warn("%r is not writable - using in-memory SQLite instead" % db) + db = ':memory:' return db diff --git a/scrapyd/runner.py b/scrapyd/runner.py index 8f47a339b..c0acd0fea 100644 --- a/scrapyd/runner.py +++ b/scrapyd/runner.py @@ -1,3 +1,6 @@ +from __future__ import with_statement + +import sys import os import shutil import tempfile @@ -22,6 +25,7 @@ def project_environment(project): else: eggpath = None try: + assert 'scrapy.conf' not in sys.modules, "Scrapy settings already loaded" yield finally: if eggpath: diff --git a/scrapyd/tests/test_dont_load_settings.py b/scrapyd/tests/test_dont_load_settings.py new file mode 100644 index 000000000..cefa987db --- /dev/null +++ b/scrapyd/tests/test_dont_load_settings.py @@ -0,0 +1,22 @@ +import sys +import unittest + +class SettingsSafeModulesTest(unittest.TestCase): + + # these modules must not load scrapy.conf + SETTINGS_SAFE_MODULES = [ + 'scrapy.utils.project', + 'scrapy.utils.conf', + 'scrapyd.interfaces', + 'scrapyd.eggutils', + ] + + def test_modules_that_shouldnt_load_settings(self): + sys.modules.pop('scrapy.conf', None) + for m in 
+            __import__(m)
+            assert 'scrapy.conf' not in sys.modules, \
+                "Module %r must not cause the scrapy.conf module to be loaded" % m
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/scrapyd/tests/test_utils.py b/scrapyd/tests/test_utils.py
index e5b53beb0..624b82473 100644
--- a/scrapyd/tests/test_utils.py
+++ b/scrapyd/tests/test_utils.py
@@ -25,7 +25,7 @@ class UtilsTest(unittest.TestCase):
 class GetSpiderListTest(unittest.TestCase):
 
     def test_get_spider_list(self):
-        path = self.mktemp()
+        path = os.path.abspath(self.mktemp())
         j = os.path.join
         eggs_dir = j(path, 'eggs')
         os.makedirs(eggs_dir)
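
Note (not part of the patch): the scrapy/middleware.py hunk above builds on the NotConfigured convention, where a component raises NotConfigured from its constructor or from_settings() and the manager logs it as disabled instead of aborting. The sketch below illustrates that convention end to end. The MyStatsExtension class, the MYSTATS_API_KEY setting and the build_components() helper are hypothetical names used only for illustration; only the NotConfigured exception itself comes from scrapy.exceptions.

# Illustrative sketch only -- the component, setting name and helper below are
# hypothetical; NotConfigured is the real exception from scrapy.exceptions.
from scrapy.exceptions import NotConfigured

class MyStatsExtension(object):
    """Hypothetical extension that requires a setting to be enabled."""

    def __init__(self, api_key):
        self.api_key = api_key

    @classmethod
    def from_settings(cls, settings):
        api_key = settings.get('MYSTATS_API_KEY')
        if not api_key:
            # The message ends up in e.args[0], which the patched
            # MiddlewareManager prints as "Disabled <name>: <message>".
            raise NotConfigured("MYSTATS_API_KEY setting is missing")
        return cls(api_key)

def build_components(components, settings):
    """Manager-side loop: keep what instantiates, warn about the rest."""
    enabled = []
    for name, cls in components:
        try:
            enabled.append(cls.from_settings(settings))
        except NotConfigured as e:
            reason = e.args[0] if e.args else ''
            print("Disabled %s: %s" % (name, reason))
    return enabled

if __name__ == '__main__':
    # With empty settings the extension reports itself as disabled.
    active = build_components([('MyStatsExtension', MyStatsExtension)], {})
    print("Enabled extensions: %s" % [c.__class__.__name__ for c in active])

Raising NotConfigured with a non-empty message is what makes the new warning useful: the patched manager only logs when e.args is non-empty, so raising it with no arguments keeps the old silent behaviour.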