mirror of https://github.com/scrapy/scrapy.git
A couple of changes to fix #303:

* improved detection of inside-project environments
* made the list command faster (by instantiating only the spider manager)
* print a warning, with the message carried by the NotConfigured exception, when extensions (middlewares, etc.) are disabled
* assert that the Scrapy configuration hasn't been loaded yet in scrapyd.runner
* simplified the IgnoreRequest exception, to avoid loading settings when importing scrapy.exceptions
* added a test to make sure certain modules don't cause the scrapy.conf module to be loaded, so that the scrapyd runner bootstrapping performs properly
parent 48b30ba939
commit 048044c1f8
@@ -13,6 +13,7 @@ from scrapy.conf import settings
 from scrapy.command import ScrapyCommand
 from scrapy.exceptions import UsageError
 from scrapy.utils.misc import walk_modules
+from scrapy.utils.project import inside_project
 
 def _iter_command_classes(module_name):
     # TODO: add `name` attribute to commands and and merge this function with
@@ -106,7 +107,7 @@ def execute(argv=None):
         argv = sys.argv
     crawler = CrawlerProcess(settings)
     crawler.install()
-    inproject = bool(settings.settings_module)
+    inproject = inside_project()
     _check_deprecated_scrapy_ctl(argv, inproject) # TODO: remove for Scrapy 0.11
     cmds = _get_commands_dict(inproject)
     cmdname = _pop_command_name(argv)
@@ -1,4 +1,8 @@
+import os
+
 from scrapy.command import ScrapyCommand
+from scrapy.utils.misc import load_object
+from scrapy.conf import settings
 
 class Command(ScrapyCommand):
 
@@ -9,4 +13,6 @@ class Command(ScrapyCommand):
         return "List available spiders"
 
     def run(self, args, opts):
-        print "\n".join(self.crawler.spiders.list())
+        spman_cls = load_object(settings['SPIDER_MANAGER_CLASS'])
+        spiders = spman_cls.from_settings(settings)
+        print os.linesep.join(spiders.list())
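
For context on the list command hunks above: the command now builds only the spider manager instead of a full crawler. A rough standalone sketch of the same pattern, reusing the names from the diff and assuming it is run from inside a Scrapy project so the settings resolve:

    from scrapy.utils.misc import load_object
    from scrapy.conf import settings  # loads the project settings

    # build just the spider manager, skipping the full CrawlerProcess setup
    spman_cls = load_object(settings['SPIDER_MANAGER_CLASS'])
    spiders = spman_cls.from_settings(settings)
    for name in spiders.list():
        print name
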
@@ -166,13 +166,11 @@ class ExecutionEngine(object):
             exc = _failure.value
             if isinstance(exc, IgnoreRequest):
                 errmsg = _failure.getErrorMessage()
-                level = exc.level
             else:
                 errmsg = str(_failure)
-                level = log.ERROR
             if errmsg:
                 log.msg("Error downloading <%s>: %s" % (request.url, errmsg), \
-                    level=level, spider=spider)
+                    level=log.ERROR, spider=spider)
             return Failure(IgnoreRequest(str(exc)))
 
         def _on_complete(_):
@@ -5,8 +5,6 @@ These exceptions are documented in docs/topics/exceptions.rst. Please don't add
 new exceptions here without documenting them there.
 """
 
-from scrapy import log
-
 # Internal
 
 class NotConfigured(Exception):
@@ -18,13 +16,6 @@ class NotConfigured(Exception):
 class IgnoreRequest(Exception):
     """Indicates a decision was made not to process a request"""
 
-    def __init__(self, msg='', level=log.ERROR):
-        self.msg = msg
-        self.level = level
-
-    def __str__(self):
-        return self.msg
-
 class DontCloseSpider(Exception):
     """Request the spider not to be closed yet"""
     pass
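
With the level attribute gone, IgnoreRequest behaves like any other exception and simply carries a message. A hypothetical downloader middleware (not part of this commit; the class and the rule are made up for illustration) could raise it like this:

    from scrapy.exceptions import IgnoreRequest

    class SkipPdfMiddleware(object):
        """Hypothetical downloader middleware: drop requests for PDF files."""
        def process_request(self, request, spider):
            if request.url.lower().endswith('.pdf'):
                raise IgnoreRequest("not downloading PDF: %s" % request.url)
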
@@ -34,7 +34,8 @@ class MiddlewareManager(object):
                 middlewares.append(mw)
             except NotConfigured, e:
                 if e.args:
-                    log.msg(e)
+                    clsname = clspath.split('.')[-1]
+                    log.msg("Disabled %s: %s" % (clsname, e.args[0]), log.WARNING)
         enabled = [x.__class__.__name__ for x in middlewares]
         log.msg("Enabled %ss: %s" % (cls.component_name, ", ".join(enabled)), \
             level=log.DEBUG)
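
The practical effect of this hunk: a component that raises NotConfigured with a message is now reported as "Disabled <ClassName>: <message>" at WARNING level instead of being skipped silently. A minimal sketch of an extension taking advantage of this (the class and the SPEEDLIMIT_ENABLED setting are hypothetical):

    from scrapy.conf import settings
    from scrapy.exceptions import NotConfigured

    class SpeedLimit(object):
        """Hypothetical extension, enabled only when SPEEDLIMIT_ENABLED is set."""
        def __init__(self):
            if not settings.getbool('SPEEDLIMIT_ENABLED'):
                # would now be logged as: Disabled SpeedLimit: SPEEDLIMIT_ENABLED is off
                raise NotConfigured("SPEEDLIMIT_ENABLED is off")
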
@@ -4,6 +4,7 @@ import warnings
 
 from scrapy.utils.conf import closest_scrapy_cfg, get_config
 from scrapy.utils.python import is_writable
+from scrapy.exceptions import NotConfigured
 
 DATADIR_CFG_SECTION = 'datadir'
 
@@ -20,12 +21,16 @@ def inside_project():
 
 def project_data_dir(project='default'):
     """Return the current project data dir, creating it if it doesn't exist"""
-    assert inside_project(), "Not inside project"
-    scrapy_cfg = closest_scrapy_cfg()
-    d = abspath(join(dirname(scrapy_cfg), '.scrapy'))
+    if not inside_project():
+        raise NotConfigured("Not inside a project")
     cfg = get_config()
     if cfg.has_option(DATADIR_CFG_SECTION, project):
         d = cfg.get(DATADIR_CFG_SECTION, project)
+    else:
+        scrapy_cfg = closest_scrapy_cfg()
+        if not scrapy_cfg:
+            raise NotConfigured("Unable to find scrapy.cfg file to infer project data dir")
+        d = abspath(join(dirname(scrapy_cfg), '.scrapy'))
     if not exists(d):
         makedirs(d)
     return d
@@ -47,8 +52,12 @@ def sqlite_db(path, nonwritable_fallback=True):
     if not inside_project() or path == ':memory:':
         db = ':memory:'
     else:
-        db = data_path(path)
-        if not is_writable(db) and nonwritable_fallback:
-            warnings.warn("%r is not writable - using in-memory SQLite instead" % db)
-            db = ':memory:'
+        try:
+            db = data_path(path)
+        except NotConfigured:
+            db = ':memory:'
+        else:
+            if not is_writable(db) and nonwritable_fallback:
+                warnings.warn("%r is not writable - using in-memory SQLite instead" % db)
+                db = ':memory:'
     return db
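
With these hunks, being outside a project (or lacking a scrapy.cfg) is reported through NotConfigured instead of an assertion failure, so callers such as sqlite_db() can degrade gracefully. A small usage sketch of the new behaviour:

    from scrapy.exceptions import NotConfigured
    from scrapy.utils.project import project_data_dir

    try:
        datadir = project_data_dir()
    except NotConfigured:
        # not inside a project, or no scrapy.cfg to infer the data dir from
        datadir = None
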
@@ -1,3 +1,6 @@
+from __future__ import with_statement
+
+import sys
 import os
 import shutil
 import tempfile
@@ -22,6 +25,7 @@ def project_environment(project):
     else:
         eggpath = None
     try:
+        assert 'scrapy.conf' not in sys.modules, "Scrapy settings already loaded"
         yield
     finally:
         if eggpath:
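
The new assert documents the bootstrapping contract scrapyd's runner relies on: the project environment must be set up before anything imports scrapy.conf, because the scrapy.conf.settings singleton is built the first time that module is imported. A simplified sketch of that ordering (not the actual scrapyd.runner code; 'myproject' is a placeholder):

    import os
    import sys

    os.environ['SCRAPY_SETTINGS_MODULE'] = 'myproject.settings'  # pick the project first
    assert 'scrapy.conf' not in sys.modules, "too late - settings already loaded"
    from scrapy.conf import settings  # settings are read only at this point
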
scrapyd/tests/test_dont_load_settings.py (new file, 22 lines)
@@ -0,0 +1,22 @@
+import sys
+import unittest
+
+class SettingsSafeModulesTest(unittest.TestCase):
+
+    # these modules must not load scrapy.conf
+    SETTINGS_SAFE_MODULES = [
+        'scrapy.utils.project',
+        'scrapy.utils.conf',
+        'scrapyd.interfaces',
+        'scrapyd.eggutils',
+    ]
+
+    def test_modules_that_shouldnt_load_settings(self):
+        sys.modules.pop('scrapy.conf', None)
+        for m in self.SETTINGS_SAFE_MODULES:
+            __import__(m)
+            assert 'scrapy.conf' not in sys.modules, \
+                "Module %r must not cause the scrapy.conf module to be loaded" % m
+
+if __name__ == "__main__":
+    unittest.main()
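
Since the new module ends with a __main__ hook, the check can also be run directly with "python scrapyd/tests/test_dont_load_settings.py" from a checkout where both scrapy and scrapyd are importable.
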
@@ -25,7 +25,7 @@ class UtilsTest(unittest.TestCase):
 class GetSpiderListTest(unittest.TestCase):
 
     def test_get_spider_list(self):
-        path = self.mktemp()
+        path = os.path.abspath(self.mktemp())
         j = os.path.join
         eggs_dir = j(path, 'eggs')
         os.makedirs(eggs_dir)