From edfba29dfd063b0b0450f252261fd644dad98842 Mon Sep 17 00:00:00 2001 From: Pablo Hoffman Date: Wed, 15 Sep 2010 14:27:27 -0300 Subject: [PATCH 01/19] Re-bumped version to 0.11 --- scrapy/__init__.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/scrapy/__init__.py b/scrapy/__init__.py index de7d7c611..37cb3a0a0 100644 --- a/scrapy/__init__.py +++ b/scrapy/__init__.py @@ -2,8 +2,8 @@ Scrapy - a screen scraping framework written in Python """ -version_info = (0, 10, 2, '') -__version__ = "0.10.2" +version_info = (0, 11, 0, 'dev') +__version__ = "0.11" import sys, os, warnings From b6c2b55e5b2d5ff2bb6d13d46d27cd5f273f6cdc Mon Sep 17 00:00:00 2001 From: Pablo Hoffman Date: Wed, 22 Sep 2010 15:47:33 -0300 Subject: [PATCH 02/19] Splitted settings classes from settings singleton. Closes #244 --HG-- rename : scrapy/conf/__init__.py => scrapy/conf.py rename : scrapy/conf/default_settings.py => scrapy/settings/default_settings.py rename : scrapy/tests/test_conf.py => scrapy/tests/test_settings.py --- docs/topics/settings.rst | 38 +++----- .../googledir/googledir/settings.py | 5 - examples/experimental/imdb/imdb/settings.py | 5 - scrapy/conf.py | 40 ++++++++ scrapy/conf/__init__.py | 94 ------------------- scrapy/contrib/downloadermiddleware/retry.py | 3 - scrapy/settings/__init__.py | 59 ++++++++++++ scrapy/{conf => settings}/default_settings.py | 0 scrapy/shell.py | 3 +- .../templates/project/module/settings.py.tmpl | 4 - .../test_downloadermiddleware_httpcache.py | 2 +- scrapy/tests/test_engine.py | 2 +- scrapy/tests/test_middleware.py | 2 +- scrapy/tests/test_pipeline_media.py | 2 +- .../tests/{test_conf.py => test_settings.py} | 2 +- 15 files changed, 118 insertions(+), 143 deletions(-) create mode 100644 scrapy/conf.py delete mode 100644 scrapy/conf/__init__.py create mode 100644 scrapy/settings/__init__.py rename scrapy/{conf => settings}/default_settings.py (100%) rename scrapy/tests/{test_conf.py => test_settings.py} (98%) diff --git a/docs/topics/settings.rst b/docs/topics/settings.rst index 2f32be53e..a63eb1f62 100644 --- a/docs/topics/settings.rst +++ b/docs/topics/settings.rst @@ -39,10 +39,9 @@ different precedence. Here is the list of them in decreasing order of precedence: 1. Global overrides (most precedence) - 2. Environment variables - 3. scrapy_settings - 4. Default settings per-command - 5. Default global settings (less precedence) + 2. Project settings module + 3. Default settings per-command + 4. Default global settings (less precedence) These mechanisms are described in more detail below. @@ -65,27 +64,14 @@ Example:: scrapy crawl domain.com --set LOG_FILE=scrapy.log -2. Environment variables ------------------------- +2. Project settings module +-------------------------- -You can populate settings using environment variables prefixed with -``SCRAPY_``. For example, to change the log file location un Unix systems:: +The project settings module is the standard configuration file for your Scrapy +project. It's where most of your custom settings will be populated. For +example:: ``myproject.settings``. - $ export SCRAPY_LOG_FILE=scrapy.log - $ scrapy crawl example.com - -In Windows systems, you can change the environment variables from the Control -Panel following `these guidelines`_. - -.. _these guidelines: http://www.microsoft.com/resources/documentation/windows/xp/all/proddocs/en-us/sysdm_advancd_environmnt_addchange_variable.mspx - -3. scrapy_settings ------------------- - -scrapy_settings is the standard configuration file for your Scrapy project. 
-It's where most of your custom settings will be populated. - -4. Default settings per-command +3. Default settings per-command ------------------------------- Each :doc:`Scrapy tool ` command can have its own default @@ -93,11 +79,11 @@ settings, which override the global default settings. Those custom command settings are specified in the ``default_settings`` attribute of the command class. -5. Default global settings +4. Default global settings -------------------------- -The global defaults are located in scrapy.conf.default_settings and documented -in the :ref:`topics-settings-ref` section. +The global defaults are located in the ``scrapy.settings.default_settings`` +module and documented in the :ref:`topics-settings-ref` section. How to access settings ====================== diff --git a/examples/experimental/googledir/googledir/settings.py b/examples/experimental/googledir/googledir/settings.py index 4e3c11163..38f1df2a5 100644 --- a/examples/experimental/googledir/googledir/settings.py +++ b/examples/experimental/googledir/googledir/settings.py @@ -4,11 +4,6 @@ # default. All the other settings are documented here: # # http://doc.scrapy.org/topics/settings.html -# -# Or you can copy and paste them from where they're defined in Scrapy: -# -# scrapy/conf/default_settings.py -# BOT_NAME = 'googledir' BOT_VERSION = '1.0' diff --git a/examples/experimental/imdb/imdb/settings.py b/examples/experimental/imdb/imdb/settings.py index de026dc14..e0a8db52a 100644 --- a/examples/experimental/imdb/imdb/settings.py +++ b/examples/experimental/imdb/imdb/settings.py @@ -4,11 +4,6 @@ # default. All the other settings are documented here: # # http://doc.scrapy.org/topics/settings.html -# -# Or you can copy and paste them from where they're defined in Scrapy: -# -# scrapy/conf/default_settings.py -# BOT_NAME = 'imdb' BOT_VERSION = '1.0' diff --git a/scrapy/conf.py b/scrapy/conf.py new file mode 100644 index 000000000..c47c38626 --- /dev/null +++ b/scrapy/conf.py @@ -0,0 +1,40 @@ +""" +Scrapy settings manager + +See documentation in docs/topics/settings.rst +""" + +import os +import cPickle as pickle + +from scrapy.settings import CrawlerSettings +from scrapy.utils.conf import init_env + +ENVVAR = 'SCRAPY_SETTINGS_MODULE' + +def get_project_settings(): + if ENVVAR not in os.environ: + project = os.environ.get('SCRAPY_PROJECT', 'default') + init_env(project) + settings_module_path = os.environ.get(ENVVAR, 'scrapy_settings') + try: + settings_module = __import__(settings_module_path, {}, {}, ['']) + except ImportError: + settings_module = None + settings = CrawlerSettings(settings_module) + + # XXX: remove this hack + pickled_settings = os.environ.get("SCRAPY_PICKLED_SETTINGS_TO_OVERRIDE") + settings.overrides = pickle.loads(pickled_settings) if pickled_settings else {} + + # XXX: deprecate and remove this functionality + for k, v in os.environ.items(): + if k.startswith('SCRAPY_'): + settings.overrides[k[7:]] = v + + return settings + +if os.environ.get('SCRAPY_SETTINGS_DISABLED'): + settings = CrawlerSettings() +else: + settings = get_project_settings() diff --git a/scrapy/conf/__init__.py b/scrapy/conf/__init__.py deleted file mode 100644 index e9f3fb136..000000000 --- a/scrapy/conf/__init__.py +++ /dev/null @@ -1,94 +0,0 @@ -""" -Scrapy settings manager - -See documentation in docs/topics/settings.rst -""" - -import os -import cPickle as pickle - -from scrapy.conf import default_settings -from scrapy.utils.conf import init_env - -import_ = lambda x: __import__(x, {}, {}, ['']) - - -class 
Settings(object): - - def __init__(self, values=None): - self.values = values.copy() if values else {} - self.global_defaults = default_settings - - def __getitem__(self, opt_name): - if opt_name in self.values: - return self.values[opt_name] - return getattr(self.global_defaults, opt_name, None) - - def get(self, name, default=None): - return self[name] if self[name] is not None else default - - def getbool(self, name, default=False): - """ - True is: 1, '1', True - False is: 0, '0', False, None - """ - return bool(int(self.get(name, default))) - - def getint(self, name, default=0): - return int(self.get(name, default)) - - def getfloat(self, name, default=0.0): - return float(self.get(name, default)) - - def getlist(self, name, default=None): - value = self.get(name) - if value is None: - return default or [] - elif hasattr(value, '__iter__'): - return value - else: - return str(value).split(',') - - -class EnvironmentSettings(Settings): - - ENVVAR = 'SCRAPY_SETTINGS_MODULE' - - def __init__(self): - super(EnvironmentSettings, self).__init__() - self.defaults = {} - self.disabled = os.environ.get('SCRAPY_SETTINGS_DISABLED', False) - if self.ENVVAR not in os.environ: - project = os.environ.get('SCRAPY_PROJECT', 'default') - init_env(project) - settings_module_path = os.environ.get(self.ENVVAR, 'scrapy_settings') - self.set_settings_module(settings_module_path) - - # XXX: find a better solution for this hack - pickled_settings = os.environ.get("SCRAPY_PICKLED_SETTINGS_TO_OVERRIDE") - self.overrides = pickle.loads(pickled_settings) if pickled_settings else {} - - def set_settings_module(self, settings_module_path): - self.settings_module_path = settings_module_path - try: - self.settings_module = import_(settings_module_path) - except ImportError: - self.settings_module = None - - def __getitem__(self, opt_name): - if not self.disabled: - if opt_name in self.overrides: - return self.overrides[opt_name] - if 'SCRAPY_' + opt_name in os.environ: - return os.environ['SCRAPY_' + opt_name] - if hasattr(self.settings_module, opt_name): - return getattr(self.settings_module, opt_name) - if opt_name in self.defaults: - return self.defaults[opt_name] - return super(EnvironmentSettings, self).__getitem__(opt_name) - - def __str__(self): - return "" % self.settings_module_path - - -settings = EnvironmentSettings() diff --git a/scrapy/contrib/downloadermiddleware/retry.py b/scrapy/contrib/downloadermiddleware/retry.py index 854dbb56a..9c0a2c788 100644 --- a/scrapy/contrib/downloadermiddleware/retry.py +++ b/scrapy/contrib/downloadermiddleware/retry.py @@ -11,9 +11,6 @@ once the spider has finished crawling all regular (non failed) pages. Once there is no more failed pages to retry this middleware sends a signal (retry_complete), so other extensions could connect to that signal. -Default values are located in scrapy.conf.default_settings, like any other -setting - About HTTP errors to consider: - You may want to remove 400 from RETRY_HTTP_CODES, if you stick to the HTTP diff --git a/scrapy/settings/__init__.py b/scrapy/settings/__init__.py new file mode 100644 index 000000000..30b80a1cf --- /dev/null +++ b/scrapy/settings/__init__.py @@ -0,0 +1,59 @@ +from . 
import default_settings + + +class Settings(object): + + def __init__(self, values=None): + self.values = values.copy() if values else {} + self.global_defaults = default_settings + + def __getitem__(self, opt_name): + if opt_name in self.values: + return self.values[opt_name] + return getattr(self.global_defaults, opt_name, None) + + def get(self, name, default=None): + return self[name] if self[name] is not None else default + + def getbool(self, name, default=False): + """ + True is: 1, '1', True + False is: 0, '0', False, None + """ + return bool(int(self.get(name, default))) + + def getint(self, name, default=0): + return int(self.get(name, default)) + + def getfloat(self, name, default=0.0): + return float(self.get(name, default)) + + def getlist(self, name, default=None): + value = self.get(name) + if value is None: + return default or [] + elif hasattr(value, '__iter__'): + return value + else: + return str(value).split(',') + + +class CrawlerSettings(Settings): + + def __init__(self, settings_module=None, **kw): + super(CrawlerSettings, self).__init__(**kw) + self.settings_module = settings_module + self.overrides = {} + self.defaults = {} + + def __getitem__(self, opt_name): + if opt_name in self.overrides: + return self.overrides[opt_name] + if self.settings_module and hasattr(self.settings_module, opt_name): + return getattr(self.settings_module, opt_name) + if opt_name in self.defaults: + return self.defaults[opt_name] + return super(CrawlerSettings, self).__getitem__(opt_name) + + def __str__(self): + return "" % self.settings_module.__name__ diff --git a/scrapy/conf/default_settings.py b/scrapy/settings/default_settings.py similarity index 100% rename from scrapy/conf/default_settings.py rename to scrapy/settings/default_settings.py diff --git a/scrapy/shell.py b/scrapy/shell.py index 12947f480..a4f4594b0 100644 --- a/scrapy/shell.py +++ b/scrapy/shell.py @@ -18,7 +18,8 @@ from scrapy.utils.misc import load_object from scrapy.utils.response import open_in_browser from scrapy.utils.url import any_to_uri from scrapy.utils.console import start_python_console -from scrapy.conf import settings, Settings +from scrapy.conf import settings +from scrapy.settings import Settings from scrapy.http import Request, Response, TextResponse class Shell(object): diff --git a/scrapy/templates/project/module/settings.py.tmpl b/scrapy/templates/project/module/settings.py.tmpl index 07f576f91..ce2751f0f 100644 --- a/scrapy/templates/project/module/settings.py.tmpl +++ b/scrapy/templates/project/module/settings.py.tmpl @@ -5,10 +5,6 @@ # # http://doc.scrapy.org/topics/settings.html # -# Or you can copy and paste them from where they're defined in Scrapy: -# -# scrapy/conf/default_settings.py -# BOT_NAME = '$project_name' BOT_VERSION = '1.0' diff --git a/scrapy/tests/test_downloadermiddleware_httpcache.py b/scrapy/tests/test_downloadermiddleware_httpcache.py index 4e569ea7a..ae6339208 100644 --- a/scrapy/tests/test_downloadermiddleware_httpcache.py +++ b/scrapy/tests/test_downloadermiddleware_httpcache.py @@ -3,7 +3,7 @@ import unittest, tempfile, shutil, time from scrapy.http import Response, HtmlResponse, Request from scrapy.spider import BaseSpider from scrapy.contrib.downloadermiddleware.httpcache import FilesystemCacheStorage, HttpCacheMiddleware -from scrapy.conf import Settings +from scrapy.settings import Settings from scrapy.exceptions import IgnoreRequest diff --git a/scrapy/tests/test_engine.py b/scrapy/tests/test_engine.py index 6549fefa6..8f4ab8fb7 100644 --- 
a/scrapy/tests/test_engine.py +++ b/scrapy/tests/test_engine.py @@ -17,7 +17,7 @@ from twisted.web import server, static, util from twisted.trial import unittest from scrapy import signals -from scrapy.conf import Settings +from scrapy.settings import Settings from scrapy.crawler import Crawler from scrapy.xlib.pydispatch import dispatcher from scrapy.tests import tests_datadir diff --git a/scrapy/tests/test_middleware.py b/scrapy/tests/test_middleware.py index 1eca89b27..145b86b02 100644 --- a/scrapy/tests/test_middleware.py +++ b/scrapy/tests/test_middleware.py @@ -1,6 +1,6 @@ from twisted.trial import unittest -from scrapy.conf import Settings +from scrapy.settings import Settings from scrapy.exceptions import NotConfigured from scrapy.middleware import MiddlewareManager diff --git a/scrapy/tests/test_pipeline_media.py b/scrapy/tests/test_pipeline_media.py index 6877fcfad..a9731e6bd 100644 --- a/scrapy/tests/test_pipeline_media.py +++ b/scrapy/tests/test_pipeline_media.py @@ -2,7 +2,7 @@ from twisted.trial import unittest from twisted.python import failure from twisted.internet import defer, reactor -from scrapy.conf import Settings +from scrapy.settings import Settings from scrapy.crawler import Crawler from scrapy.http import Request, Response from scrapy.spider import BaseSpider diff --git a/scrapy/tests/test_conf.py b/scrapy/tests/test_settings.py similarity index 98% rename from scrapy/tests/test_conf.py rename to scrapy/tests/test_settings.py index 9eddb0206..dfccf5dff 100644 --- a/scrapy/tests/test_conf.py +++ b/scrapy/tests/test_settings.py @@ -1,6 +1,6 @@ import unittest -from scrapy.conf import Settings +from scrapy.settings import Settings class SettingsTest(unittest.TestCase): From a4639ffb06ff261e9587588ae58794a39b3b0f31 Mon Sep 17 00:00:00 2001 From: Pablo Hoffman Date: Wed, 22 Sep 2010 16:08:18 -0300 Subject: [PATCH 03/19] Removed hacky SCRAPY_SETTINGS_DISABLED environment variable --- bin/runtests.sh | 3 --- scrapy/conf.py | 5 +---- scrapy/tests/test_cmdline/__init__.py | 1 - scrapy/tests/test_commands.py | 1 - scrapyd/eggutils.py | 1 - 5 files changed, 1 insertion(+), 10 deletions(-) diff --git a/bin/runtests.sh b/bin/runtests.sh index a714a208d..3c8079210 100755 --- a/bin/runtests.sh +++ b/bin/runtests.sh @@ -12,9 +12,6 @@ else exit 1 fi -# disable custom settings for running tests in a neutral environment -export SCRAPY_SETTINGS_DISABLED=1 - # use vsftpd (if available) for testing ftp feed storage if type vsftpd >/dev/null 2>&1; then vsftpd_conf=$(mktemp /tmp/vsftpd-XXXX) diff --git a/scrapy/conf.py b/scrapy/conf.py index c47c38626..795c8d7c0 100644 --- a/scrapy/conf.py +++ b/scrapy/conf.py @@ -34,7 +34,4 @@ def get_project_settings(): return settings -if os.environ.get('SCRAPY_SETTINGS_DISABLED'): - settings = CrawlerSettings() -else: - settings = get_project_settings() +settings = get_project_settings() diff --git a/scrapy/tests/test_cmdline/__init__.py b/scrapy/tests/test_cmdline/__init__.py index 3588436b9..d08f266f1 100644 --- a/scrapy/tests/test_cmdline/__init__.py +++ b/scrapy/tests/test_cmdline/__init__.py @@ -10,7 +10,6 @@ class CmdlineTest(unittest.TestCase): def setUp(self): self.env = os.environ.copy() self.env['PYTHONPATH'] = os.path.dirname(scrapy.__path__[0]) - self.env.pop('SCRAPY_SETTINGS_DISABLED', None) self.env['SCRAPY_SETTINGS_MODULE'] = 'scrapy.tests.test_cmdline.settings' def _execute(self, *new_args, **kwargs): diff --git a/scrapy/tests/test_commands.py b/scrapy/tests/test_commands.py index 47f67338a..e2b671d19 100644 --- 
a/scrapy/tests/test_commands.py +++ b/scrapy/tests/test_commands.py @@ -61,7 +61,6 @@ class CommandTest(ProjectTest): super(CommandTest, self).setUp() self.call('startproject', self.project_name) self.cwd = join(self.temp_path, self.project_name) - self.env.pop('SCRAPY_SETTINGS_DISABLED', None) self.env['SCRAPY_SETTINGS_MODULE'] = '%s.settings' % self.project_name diff --git a/scrapyd/eggutils.py b/scrapyd/eggutils.py index 27e661cd0..c507f0be5 100644 --- a/scrapyd/eggutils.py +++ b/scrapyd/eggutils.py @@ -18,7 +18,6 @@ def get_spider_list_from_eggfile(eggfile, project): env = os.environ.copy() env['SCRAPY_PROJECT'] = project env['SCRAPY_EGGFILE'] = f.name - env.pop('SCRAPY_SETTINGS_DISABLED', None) proc = Popen(pargs, stdout=PIPE, cwd=tmpdir, env=env) out = proc.communicate()[0] return out.splitlines() From 2459d20cc0d3fd45acd1eeb2a98a6f12a2d57d2e Mon Sep 17 00:00:00 2001 From: Pablo Hoffman Date: Wed, 22 Sep 2010 16:09:13 -0300 Subject: [PATCH 04/19] Added support for unifying access to per-spider settings. Refs #245 --- scrapy/crawler.py | 1 + scrapy/settings/__init__.py | 22 ++++++++++++++++++++++ scrapy/spider.py | 16 ++++++++++++++++ 3 files changed, 39 insertions(+) diff --git a/scrapy/crawler.py b/scrapy/crawler.py index 7b153979e..22d0f2f91 100644 --- a/scrapy/crawler.py +++ b/scrapy/crawler.py @@ -54,6 +54,7 @@ class Crawler(object): @defer.inlineCallbacks def _start_spider(self, spider, requests): """Don't call this method. Use self.queue to start new spiders""" + spider.set_crawler(self) yield defer.maybeDeferred(self.engine.open_spider, spider) for request in requests: self.engine.crawl(request, spider) diff --git a/scrapy/settings/__init__.py b/scrapy/settings/__init__.py index 30b80a1cf..7ca18b3ca 100644 --- a/scrapy/settings/__init__.py +++ b/scrapy/settings/__init__.py @@ -57,3 +57,25 @@ class CrawlerSettings(Settings): def __str__(self): return "" % self.settings_module.__name__ + + +class SpiderSettings(Settings): + + def __init__(self, spider, crawler_settings, **kw): + super(SpiderSettings, self).__init__(**kw) + self.spider = spider + self.cset = crawler_settings + + def __getitem__(self, opt_name): + if opt_name in self.cset.overrides: + return self.cset.overrides[opt_name] + if hasattr(self.spider, opt_name): + return getattr(self.spider, opt_name) + if self.cset.settings_module and hasattr(self.cset.settings_module, opt_name): + return getattr(self.cset.settings_module, opt_name) + if opt_name in self.cset.defaults: + return self.cset.defaults[opt_name] + return super(SpiderSettings, self).__getitem__(opt_name) + + def __str__(self): + return "" % self.spider.name diff --git a/scrapy/spider.py b/scrapy/spider.py index b2d2f0e46..cb090e3db 100644 --- a/scrapy/spider.py +++ b/scrapy/spider.py @@ -5,6 +5,7 @@ See documentation in docs/topics/spiders.rst """ from scrapy import log +from scrapy.settings import SpiderSettings from scrapy.http import Request from scrapy.utils.misc import arg_to_iter from scrapy.utils.trackref import object_ref @@ -33,6 +34,21 @@ class BaseSpider(object_ref): """ log.msg(message, spider=self, level=level) + def set_crawler(self, crawler): + assert not hasattr(self, '_crawler'), "Spider already bounded to %s" % crawler + self._crawler = crawler + + @property + def crawler(self): + assert hasattr(self, '_crawler'), "Spider not bounded to any crawler" + return self._crawler + + @property + def settings(self): + if not hasattr(self, '_settings'): + self._settings = SpiderSettings(self, self.crawler.settings) + return self._settings + def 
start_requests(self): reqs = [] for url in self.start_urls: From ed4aec187f7868a7fcc7c7279489f02be79a8cf9 Mon Sep 17 00:00:00 2001 From: Pablo Hoffman Date: Wed, 22 Sep 2010 16:09:13 -0300 Subject: [PATCH 05/19] Ported code to use new unified access to spider settings, keeping backwards compatibility for old spider attributes. Refs #245 --- docs/faq.rst | 2 +- docs/topics/commands.rst | 10 ++--- docs/topics/downloader-middleware.rst | 10 ++--- docs/topics/settings.rst | 4 +- .../downloadermiddleware/defaultheaders.py | 10 +---- .../downloadermiddleware/downloadtimeout.py | 6 ++- .../contrib/downloadermiddleware/robotstxt.py | 3 +- .../contrib/downloadermiddleware/useragent.py | 10 +++-- scrapy/core/downloader/__init__.py | 25 ++++++----- scrapy/core/downloader/webclient.py | 6 +-- ...est_downloadermiddleware_defaultheaders.py | 37 +++++++++------- ...st_downloadermiddleware_downloadtimeout.py | 39 ++++++++-------- .../test_downloadermiddleware_useragent.py | 44 ++++++++++--------- scrapy/tests/test_engine.py | 6 +-- scrapy/utils/deprecate.py | 9 ++++ scrapy/utils/test.py | 17 +++++++ 16 files changed, 129 insertions(+), 109 deletions(-) create mode 100644 scrapy/utils/deprecate.py diff --git a/docs/faq.rst b/docs/faq.rst index c0b76b21f..966de7456 100644 --- a/docs/faq.rst +++ b/docs/faq.rst @@ -171,7 +171,7 @@ higher) in your spider:: name = 'myspider' - download_delay = 2 + DOWNLOAD_DELAY = 2 # [ ... rest of the spider code ... ] diff --git a/docs/topics/commands.rst b/docs/topics/commands.rst index d58973eb6..6339a4607 100644 --- a/docs/topics/commands.rst +++ b/docs/topics/commands.rst @@ -73,10 +73,10 @@ information on which commands must be run from inside projects, and which not. Also keep in mind that some commands may have slightly different behaviours when running them from inside projects. For example, the fetch command will use -spider-overridden behaviours (such as custom ``user_agent`` attribute) if the -url being fetched is associated with some specific spider. This is intentional, -as the ``fetch`` command is meant to be used to check how spiders are -downloading pages. +spider-overridden behaviours (such as custom :settings:`USER_AGENT` per-spider +setting) if the url being fetched is associated with some specific spider. This +is intentional, as the ``fetch`` command is meant to be used to check how +spiders are downloading pages. .. _topics-commands-ref: @@ -243,7 +243,7 @@ Downloads the given URL using the Scrapy downloader and writes the contents to standard output. The interesting thing about this command is that it fetches the page how the -the spider would download it. For example, if the spider has an ``user_agent`` +the spider would download it. For example, if the spider has an ``USER_AGENT`` attribute which overrides the User Agent, it will use that one. So this command can be used to "see" how your spider would fetch certain page. diff --git a/docs/topics/downloader-middleware.rst b/docs/topics/downloader-middleware.rst index d4139ff54..eb7a33009 100644 --- a/docs/topics/downloader-middleware.rst +++ b/docs/topics/downloader-middleware.rst @@ -177,9 +177,7 @@ DefaultHeadersMiddleware .. class:: DefaultHeadersMiddleware This middleware sets all default requests headers specified in the - :setting:`DEFAULT_REQUEST_HEADERS` setting plus those found in spider - ``default_request_headers`` attribute. Spider headers has precedence over - global headers. + :setting:`DEFAULT_REQUEST_HEADERS` setting. 
DownloadTimeoutMiddleware ------------------------- @@ -189,10 +187,8 @@ DownloadTimeoutMiddleware .. class:: DownloadTimeoutMiddleware - This middleware sets download timeout for requests based on - `download_timeout` spider attribute. It doesn't override timeout if - `download_timeout` is already set in request meta. Otherwise, - :setting:`DOWNLOAD_TIMEOUT` setting is used as default download timeout. + This middleware sets the download timeout for requests specified in the + :setting:`DOWNLOAD_TIMEOUT` setting. HttpAuthMiddleware ------------------ diff --git a/docs/topics/settings.rst b/docs/topics/settings.rst index a63eb1f62..e3b452419 100644 --- a/docs/topics/settings.rst +++ b/docs/topics/settings.rst @@ -398,9 +398,7 @@ setting (which is enabled by default). By default, Scrapy doesn't wait a fixed amount of time between requests, but uses a random interval between 0.5 and 1.5 * :setting:`DOWNLOAD_DELAY`. -Another way to change the download delay (per spider, instead of globally) is -by using the ``download_delay`` spider attribute, which takes more precedence -than this setting. +You can also change this setting per spider. .. setting:: DOWNLOAD_HANDLERS diff --git a/scrapy/contrib/downloadermiddleware/defaultheaders.py b/scrapy/contrib/downloadermiddleware/defaultheaders.py index 1bef04cdd..61a05fc50 100644 --- a/scrapy/contrib/downloadermiddleware/defaultheaders.py +++ b/scrapy/contrib/downloadermiddleware/defaultheaders.py @@ -10,18 +10,10 @@ from scrapy.utils.python import WeakKeyCache class DefaultHeadersMiddleware(object): def __init__(self, settings=conf.settings): - self.global_default_headers = settings.get('DEFAULT_REQUEST_HEADERS') self._headers = WeakKeyCache(self._default_headers) def _default_headers(self, spider): - headers = dict(self.global_default_headers) - spider_headers = getattr(spider, 'default_request_headers', None) or {} - for k, v in spider_headers.iteritems(): - if v: - headers[k] = v - else: - headers.pop(k, None) - return headers.items() + return spider.settings.get('DEFAULT_REQUEST_HEADERS').items() def process_request(self, request, spider): for k, v in self._headers[spider]: diff --git a/scrapy/contrib/downloadermiddleware/downloadtimeout.py b/scrapy/contrib/downloadermiddleware/downloadtimeout.py index 0c250d4c4..01ccf7bfb 100644 --- a/scrapy/contrib/downloadermiddleware/downloadtimeout.py +++ b/scrapy/contrib/downloadermiddleware/downloadtimeout.py @@ -4,6 +4,7 @@ Download timeout middleware See documentation in docs/topics/downloader-middleware.rst """ from scrapy.utils.python import WeakKeyCache +from scrapy.utils import deprecate class DownloadTimeoutMiddleware(object): @@ -12,7 +13,10 @@ class DownloadTimeoutMiddleware(object): self._cache = WeakKeyCache(self._download_timeout) def _download_timeout(self, spider): - return getattr(spider, "download_timeout", None) + if hasattr(spider, 'download_timeout'): + deprecate.attribute(spider, 'download_timeout', 'DOWNLOAD_TIMEOUT') + return spider.download_timeout + return spider.settings.getint('DOWNLOAD_TIMEOUT') def process_request(self, request, spider): timeout = self._cache[spider] diff --git a/scrapy/contrib/downloadermiddleware/robotstxt.py b/scrapy/contrib/downloadermiddleware/robotstxt.py index fe47317f7..314a9586d 100644 --- a/scrapy/contrib/downloadermiddleware/robotstxt.py +++ b/scrapy/contrib/downloadermiddleware/robotstxt.py @@ -54,8 +54,7 @@ class RobotsTxtMiddleware(object): def spider_opened(self, spider): self._spider_netlocs[spider] = set() - self._useragents[spider] = 
getattr(spider, 'user_agent', None) \ - or settings['USER_AGENT'] + self._useragents[spider] = spider.settings['USER_AGENT'] def spider_closed(self, spider): for netloc in self._spider_netlocs[spider]: diff --git a/scrapy/contrib/downloadermiddleware/useragent.py b/scrapy/contrib/downloadermiddleware/useragent.py index 56df2b999..752a8577b 100644 --- a/scrapy/contrib/downloadermiddleware/useragent.py +++ b/scrapy/contrib/downloadermiddleware/useragent.py @@ -1,18 +1,20 @@ """Set User-Agent header per spider or use a default value from settings""" -from scrapy.conf import settings from scrapy.utils.python import WeakKeyCache +from scrapy.utils import deprecate class UserAgentMiddleware(object): """This middleware allows spiders to override the user_agent""" - def __init__(self, settings=settings): + def __init__(self): self.cache = WeakKeyCache(self._user_agent) - self.default_useragent = settings.get('USER_AGENT') def _user_agent(self, spider): - return getattr(spider, 'user_agent', None) or self.default_useragent + if hasattr(spider, 'user_agent'): + deprecate.attribute(spider, 'user_agent', 'USER_AGENT') + return spider.user_agent + return spider.settings['USER_AGENT'] def process_request(self, request, spider): ua = self.cache[spider] diff --git a/scrapy/core/downloader/__init__.py b/scrapy/core/downloader/__init__.py index 38dcabf55..8eb99d9d0 100644 --- a/scrapy/core/downloader/__init__.py +++ b/scrapy/core/downloader/__init__.py @@ -12,6 +12,7 @@ from scrapy.exceptions import IgnoreRequest from scrapy.conf import settings from scrapy.utils.defer import mustbe_deferred from scrapy.utils.signal import send_catch_log +from scrapy.utils import deprecate from scrapy import signals from scrapy import log from .middleware import DownloaderMiddlewareManager @@ -21,18 +22,21 @@ from .handlers import DownloadHandlers class SpiderInfo(object): """Simple class to keep information and state for each open spider""" - def __init__(self, download_delay=None, max_concurrent_requests=None): - if download_delay is None: - self._download_delay = settings.getfloat('DOWNLOAD_DELAY') + def __init__(self, spider): + if hasattr(spider, 'download_delay'): + deprecate.attribute(spider, 'download_delay', 'DOWNLOAD_DELAY') + self._download_delay = spider.download_delay else: - self._download_delay = float(download_delay) + self._download_delay = spider.settings.getfloat('DOWNLOAD_DELAY') if self._download_delay: self.max_concurrent_requests = 1 - elif max_concurrent_requests is None: - self.max_concurrent_requests = settings.getint('CONCURRENT_REQUESTS_PER_SPIDER') else: - self.max_concurrent_requests = max_concurrent_requests - if self._download_delay and settings.getbool('RANDOMIZE_DOWNLOAD_DELAY'): + if hasattr(spider, 'max_concurrent_requests'): + deprecate.attribute(spider, 'max_concurrent_requests', 'CONCURRENT_REQUESTS_PER_SPIDER') + self.max_concurrent_requests = spider.max_concurrent_requests + else: + self.max_concurrent_requests = spider.settings.getint('CONCURRENT_REQUESTS_PER_SPIDER') + if self._download_delay and spider.settings.getbool('RANDOMIZE_DOWNLOAD_DELAY'): # same policy as wget --random-wait self.random_delay_interval = (0.5*self._download_delay, \ 1.5*self._download_delay) @@ -178,10 +182,7 @@ class Downloader(object): def open_spider(self, spider): """Allocate resources to begin processing a spider""" assert spider not in self.sites, "Spider already opened: %s" % spider - self.sites[spider] = SpiderInfo( - download_delay=getattr(spider, 'download_delay', None), - 
max_concurrent_requests=getattr(spider, 'max_concurrent_requests', None) - ) + self.sites[spider] = SpiderInfo(spider) def close_spider(self, spider): """Free any resources associated with the given spider""" diff --git a/scrapy/core/downloader/webclient.py b/scrapy/core/downloader/webclient.py index 0aeb861e0..b08e8ad81 100644 --- a/scrapy/core/downloader/webclient.py +++ b/scrapy/core/downloader/webclient.py @@ -8,10 +8,6 @@ from twisted.internet import defer from scrapy.http import Headers from scrapy.utils.httpobj import urlparse_cached from scrapy.core.downloader.responsetypes import responsetypes -from scrapy.conf import settings - - -DOWNLOAD_TIMEOUT = settings.getint('DOWNLOAD_TIMEOUT') def _parsed_url_args(parsed): @@ -89,7 +85,7 @@ class ScrapyHTTPClientFactory(HTTPClientFactory): followRedirect = False afterFoundGet = False - def __init__(self, request, timeout=DOWNLOAD_TIMEOUT): + def __init__(self, request, timeout=180): self.url = urldefrag(request.url)[0] self.method = request.method self.body = request.body or None diff --git a/scrapy/tests/test_downloadermiddleware_defaultheaders.py b/scrapy/tests/test_downloadermiddleware_defaultheaders.py index 5dfe5546c..cc227e20c 100644 --- a/scrapy/tests/test_downloadermiddleware_defaultheaders.py +++ b/scrapy/tests/test_downloadermiddleware_defaultheaders.py @@ -4,39 +4,44 @@ from scrapy.conf import settings from scrapy.contrib.downloadermiddleware.defaultheaders import DefaultHeadersMiddleware from scrapy.http import Request from scrapy.spider import BaseSpider +from scrapy.utils.test import get_crawler class TestDefaultHeadersMiddleware(TestCase): - def setUp(self): - self.spider = BaseSpider('foo') - self.mw = DefaultHeadersMiddleware() - self.default_request_headers = dict([(k, [v]) for k, v in \ - settings.get('DEFAULT_REQUEST_HEADERS').iteritems()]) + def get_defaults_spider_mw(self): + crawler = get_crawler() + spider = BaseSpider('foo') + spider.set_crawler(crawler) + defaults = dict([(k, [v]) for k, v in \ + crawler.settings.get('DEFAULT_REQUEST_HEADERS').iteritems()]) + return defaults, spider, DefaultHeadersMiddleware() def test_process_request(self): + defaults, spider, mw = self.get_defaults_spider_mw() req = Request('http://www.scrapytest.org') - self.mw.process_request(req, self.spider) - self.assertEquals(req.headers, self.default_request_headers) + mw.process_request(req, spider) + self.assertEquals(req.headers, defaults) def test_spider_default_request_headers(self): + defaults, spider, mw = self.get_defaults_spider_mw() spider_headers = {'Unexistant-Header': ['value']} # override one of the global default headers by spider - if self.default_request_headers: - k = set(self.default_request_headers).pop() + if defaults: + k = set(defaults).pop() spider_headers[k] = ['__newvalue__'] - self.spider.default_request_headers = spider_headers + spider.DEFAULT_REQUEST_HEADERS = spider_headers req = Request('http://www.scrapytest.org') - self.mw.process_request(req, self.spider) - self.assertEquals(req.headers, dict(self.default_request_headers, **spider_headers)) + mw.process_request(req, spider) + self.assertEquals(req.headers, dict(spider_headers)) def test_update_headers(self): + defaults, spider, mw = self.get_defaults_spider_mw() headers = {'Accept-Language': ['es'], 'Test-Header': ['test']} req = Request('http://www.scrapytest.org', headers=headers) self.assertEquals(req.headers, headers) - self.mw.process_request(req, self.spider) - self.default_request_headers.update(headers) - self.assertEquals(req.headers, 
self.default_request_headers) - + mw.process_request(req, spider) + defaults.update(headers) + self.assertEquals(req.headers, defaults) diff --git a/scrapy/tests/test_downloadermiddleware_downloadtimeout.py b/scrapy/tests/test_downloadermiddleware_downloadtimeout.py index fd60bee9e..fbe371996 100644 --- a/scrapy/tests/test_downloadermiddleware_downloadtimeout.py +++ b/scrapy/tests/test_downloadermiddleware_downloadtimeout.py @@ -3,31 +3,32 @@ import unittest from scrapy.contrib.downloadermiddleware.downloadtimeout import DownloadTimeoutMiddleware from scrapy.spider import BaseSpider from scrapy.http import Request +from scrapy.utils.test import get_crawler class DownloadTimeoutMiddlewareTest(unittest.TestCase): - def setUp(self): - self.mw = DownloadTimeoutMiddleware() - self.spider = BaseSpider('foo') - self.req = Request('http://scrapytest.org/') + def get_request_spider_mw(self): + crawler = get_crawler() + spider = BaseSpider('foo') + spider.set_crawler(crawler) + request = Request('http://scrapytest.org/') + return request, spider, DownloadTimeoutMiddleware() - def tearDown(self): - del self.mw - del self.spider - del self.req - - def test_spider_has_no_download_timeout(self): - assert self.mw.process_request(self.req, self.spider) is None - assert 'download_timeout' not in self.req.meta + def test_default_download_timeout(self): + req, spider, mw = self.get_request_spider_mw() + assert mw.process_request(req, spider) is None + self.assertEquals(req.meta.get('download_timeout'), 180) def test_spider_has_download_timeout(self): - self.spider.download_timeout = 2 - assert self.mw.process_request(self.req, self.spider) is None - self.assertEquals(self.req.meta.get('download_timeout'), 2) + req, spider, mw = self.get_request_spider_mw() + spider.DOWNLOAD_TIMEOUT = 2 + assert mw.process_request(req, spider) is None + self.assertEquals(req.meta.get('download_timeout'), 2) def test_request_has_download_timeout(self): - self.spider.download_timeout = 2 - self.req.meta['download_timeout'] = 1 - assert self.mw.process_request(self.req, self.spider) is None - self.assertEquals(self.req.meta.get('download_timeout'), 1) + req, spider, mw = self.get_request_spider_mw() + spider.DOWNLOAD_TIMEOUT = 2 + req.meta['download_timeout'] = 1 + assert mw.process_request(req, spider) is None + self.assertEquals(req.meta.get('download_timeout'), 1) diff --git a/scrapy/tests/test_downloadermiddleware_useragent.py b/scrapy/tests/test_downloadermiddleware_useragent.py index 866777341..44ac74c8b 100644 --- a/scrapy/tests/test_downloadermiddleware_useragent.py +++ b/scrapy/tests/test_downloadermiddleware_useragent.py @@ -3,47 +3,49 @@ from unittest import TestCase from scrapy.spider import BaseSpider from scrapy.http import Request from scrapy.contrib.downloadermiddleware.useragent import UserAgentMiddleware +from scrapy.utils.test import get_crawler class UserAgentMiddlewareTest(TestCase): - def setUp(self): - self.spider = BaseSpider('foo') - self.mw = UserAgentMiddleware() - - def tearDown(self): - del self.mw + def get_spider_and_mw(self, default_useragent): + crawler = get_crawler({'USER_AGENT': default_useragent}) + spider = BaseSpider('foo') + spider.set_crawler(crawler) + return spider, UserAgentMiddleware() def test_default_agent(self): - self.mw.default_useragent = 'default_useragent' + spider, mw = self.get_spider_and_mw('default_useragent') req = Request('http://scrapytest.org/') - assert self.mw.process_request(req, self.spider) is None + assert mw.process_request(req, spider) is None 
self.assertEquals(req.headers['User-Agent'], 'default_useragent') - # None or not present user_agent attribute is the same - self.spider.user_agent = None + def test_remove_agent(self): + # settings UESR_AGENT to None should remove the user agent + spider, mw = self.get_spider_and_mw('default_useragent') + spider.USER_AGENT = None req = Request('http://scrapytest.org/') - assert self.mw.process_request(req, self.spider) is None - self.assertEquals(req.headers['User-Agent'], 'default_useragent') + assert mw.process_request(req, spider) is None + assert req.headers.get('User-Agent') is None def test_spider_agent(self): - self.mw.default_useragent = 'default_useragent' - self.spider.user_agent = 'spider_useragent' + spider, mw = self.get_spider_and_mw('default_useragent') + spider.USER_AGENT = 'spider_useragent' req = Request('http://scrapytest.org/') - assert self.mw.process_request(req, self.spider) is None + assert mw.process_request(req, spider) is None self.assertEquals(req.headers['User-Agent'], 'spider_useragent') def test_header_agent(self): - self.mw.default_useragent = 'default_useragent' - self.spider.user_agent = 'spider_useragent' + spider, mw = self.get_spider_and_mw('default_useragent') + spider.USER_AGENT = 'spider_useragent' req = Request('http://scrapytest.org/', headers={'User-Agent': 'header_useragent'}) - assert self.mw.process_request(req, self.spider) is None + assert mw.process_request(req, spider) is None self.assertEquals(req.headers['User-Agent'], 'header_useragent') def test_no_agent(self): - self.mw.default_useragent = None - self.spider.user_agent = None + spider, mw = self.get_spider_and_mw(None) + spider.USER_AGENT = None req = Request('http://scrapytest.org/') - assert self.mw.process_request(req, self.spider) is None + assert mw.process_request(req, spider) is None assert 'User-Agent' not in req.headers diff --git a/scrapy/tests/test_engine.py b/scrapy/tests/test_engine.py index 8f4ab8fb7..2664daf69 100644 --- a/scrapy/tests/test_engine.py +++ b/scrapy/tests/test_engine.py @@ -17,8 +17,7 @@ from twisted.web import server, static, util from twisted.trial import unittest from scrapy import signals -from scrapy.settings import Settings -from scrapy.crawler import Crawler +from scrapy.utils.test import get_crawler from scrapy.xlib.pydispatch import dispatcher from scrapy.tests import tests_datadir from scrapy.spider import BaseSpider @@ -95,8 +94,7 @@ class CrawlerRun(object): dispatcher.connect(self.request_received, signals.request_received) dispatcher.connect(self.response_downloaded, signals.response_downloaded) - settings = Settings() - self.crawler = Crawler(settings) + self.crawler = get_crawler() self.crawler.install() self.crawler.configure() self.crawler.queue.append_spider(self.spider) diff --git a/scrapy/utils/deprecate.py b/scrapy/utils/deprecate.py new file mode 100644 index 000000000..58f580259 --- /dev/null +++ b/scrapy/utils/deprecate.py @@ -0,0 +1,9 @@ +"""Some helpers for deprecation messages""" + +import warnings + +def attribute(obj, oldattr, newattr, version='0.12'): + cname = obj.__class__.__name__ + warnings.warn("%s.%s attribute is deprecated and will be no longer supported " + "in Scrapy %s, use %s.%s attribute instead" % \ + (cname, oldattr, version, cname, newattr), DeprecationWarning, stacklevel=3) diff --git a/scrapy/utils/test.py b/scrapy/utils/test.py index 428b43f43..597ee909c 100644 --- a/scrapy/utils/test.py +++ b/scrapy/utils/test.py @@ -7,6 +7,9 @@ import os import libxml2 from twisted.trial.unittest import SkipTest +from 
scrapy.crawler import Crawler +from scrapy.settings import CrawlerSettings + def libxml2debug(testfunction): """Decorator for debugging libxml2 memory leaks inside a function. @@ -39,3 +42,17 @@ def assert_aws_environ(): if 'AWS_ACCESS_KEY_ID' not in os.environ: raise SkipTest("AWS keys not found") + +def get_crawler(settings_dict=None): + """Return an unconfigured Crawler object. If settings_dict is given, it + will be used as the settings present in the settings module of the + CrawlerSettings. + """ + class SettingsModuleMock(object): + pass + settings_module = SettingsModuleMock() + if settings_dict: + for k, v in settings_dict.items(): + setattr(settings_module, k, v) + settings = CrawlerSettings(settings_module) + return Crawler(settings) From 97d77c79c28bfbde9781ae88e45d0a236cc10a6a Mon Sep 17 00:00:00 2001 From: Pablo Hoffman Date: Wed, 22 Sep 2010 16:09:13 -0300 Subject: [PATCH 06/19] Added tests for CrawlerSettings and SpiderSettings classes --- scrapy/tests/test_settings.py | 54 ++++++++++++++++++++++++++++++++++- 1 file changed, 53 insertions(+), 1 deletion(-) diff --git a/scrapy/tests/test_settings.py b/scrapy/tests/test_settings.py index dfccf5dff..9da67652c 100644 --- a/scrapy/tests/test_settings.py +++ b/scrapy/tests/test_settings.py @@ -1,6 +1,8 @@ import unittest -from scrapy.settings import Settings +from scrapy.settings import Settings, SpiderSettings +from scrapy.utils.test import get_crawler +from scrapy.spider import BaseSpider class SettingsTest(unittest.TestCase): @@ -46,6 +48,56 @@ class SettingsTest(unittest.TestCase): self.assertEqual(settings.get('TEST_STRx'), None) self.assertEqual(settings.get('TEST_STRx', 'default'), 'default') +class CrawlerSettingsTest(unittest.TestCase): + + def test_global_defaults(self): + crawler = get_crawler() + self.assertEqual(crawler.settings.getint('DOWNLOAD_TIMEOUT'), 180) + + def test_defaults(self): + crawler = get_crawler() + crawler.settings.defaults['DOWNLOAD_TIMEOUT'] = '99' + self.assertEqual(crawler.settings.getint('DOWNLOAD_TIMEOUT'), 99) + + def test_settings_module(self): + crawler = get_crawler({'DOWNLOAD_TIMEOUT': '3'}) + self.assertEqual(crawler.settings.getint('DOWNLOAD_TIMEOUT'), 3) + + def test_overrides(self): + crawler = get_crawler({'DOWNLOAD_TIMEOUT': '3'}) + crawler.settings.overrides['DOWNLOAD_TIMEOUT'] = '15' + self.assertEqual(crawler.settings.getint('DOWNLOAD_TIMEOUT'), 15) + +class SpiderSettingsTest(unittest.TestCase): + + def test_global_defaults(self): + crawler = get_crawler() + settings = SpiderSettings(BaseSpider('name'), crawler.settings) + self.assertEqual(settings.getint('DOWNLOAD_TIMEOUT'), 180) + + def test_defaults(self): + crawler = get_crawler() + crawler.settings.defaults['DOWNLOAD_TIMEOUT'] = '99' + settings = SpiderSettings(BaseSpider('name'), crawler.settings) + self.assertEqual(settings.getint('DOWNLOAD_TIMEOUT'), 99) + + def test_crawler_defaults(self): + crawler = get_crawler({'DOWNLOAD_TIMEOUT': '3'}) + settings = SpiderSettings(BaseSpider('name'), crawler.settings) + self.assertEqual(settings.getint('DOWNLOAD_TIMEOUT'), 3) + + def test_spider_overrides_crawler(self): + crawler = get_crawler({'DOWNLOAD_TIMEOUT': '3'}) + crawler.settings.defaults['DOWNLOAD_TIMEOUT'] = '99' + settings = SpiderSettings(BaseSpider('name', DOWNLOAD_TIMEOUT='12'), crawler.settings) + self.assertEqual(settings.getint('DOWNLOAD_TIMEOUT'), 12) + + def test_overrides_most_precedence(self): + crawler = get_crawler({'DOWNLOAD_TIMEOUT': '3'}) + crawler.settings.overrides['DOWNLOAD_TIMEOUT'] = '15' + settings = 
SpiderSettings(BaseSpider('name', DOWNLOAD_TIMEOUT='12'), crawler.settings) + self.assertEqual(settings.getint('DOWNLOAD_TIMEOUT'), 15) + if __name__ == "__main__": unittest.main() From 9599bde3e90ea9758e3f74e39aa87d70779b9758 Mon Sep 17 00:00:00 2001 From: Pablo Hoffman Date: Wed, 22 Sep 2010 16:09:13 -0300 Subject: [PATCH 07/19] Removed RequestLimitMiddleware --- docs/topics/settings.rst | 13 ---- docs/topics/spider-middleware.rst | 19 ------ .../contrib/spidermiddleware/requestlimit.py | 65 ------------------- scrapy/settings/default_settings.py | 3 - 4 files changed, 100 deletions(-) delete mode 100644 scrapy/contrib/spidermiddleware/requestlimit.py diff --git a/docs/topics/settings.rst b/docs/topics/settings.rst index e3b452419..0f2f99811 100644 --- a/docs/topics/settings.rst +++ b/docs/topics/settings.rst @@ -768,18 +768,6 @@ Default: ``+2`` Adjust redirect request priority relative to original request. A negative priority adjust means more priority. -.. setting:: REQUESTS_QUEUE_SIZE - -REQUESTS_QUEUE_SIZE -------------------- - -Default: ``0`` - -Scope: ``scrapy.contrib.spidermiddleware.limit`` - -If non zero, it will be used as an upper limit for the amount of requests that -can be scheduled per domain. - .. setting:: ROBOTSTXT_OBEY ROBOTSTXT_OBEY @@ -866,7 +854,6 @@ Default:: { 'scrapy.contrib.spidermiddleware.httperror.HttpErrorMiddleware': 50, 'scrapy.contrib.itemsampler.ItemSamplerMiddleware': 100, - 'scrapy.contrib.spidermiddleware.requestlimit.RequestLimitMiddleware': 200, 'scrapy.contrib.spidermiddleware.offsite.OffsiteMiddleware': 500, 'scrapy.contrib.spidermiddleware.referer.RefererMiddleware': 700, 'scrapy.contrib.spidermiddleware.urllength.UrlLengthMiddleware': 800, diff --git a/docs/topics/spider-middleware.rst b/docs/topics/spider-middleware.rst index fbbdccfe8..1db93259a 100644 --- a/docs/topics/spider-middleware.rst +++ b/docs/topics/spider-middleware.rst @@ -245,25 +245,6 @@ RefererMiddleware Populates Request referer field, based on the Response which originated it. -RequestLimitMiddleware ----------------------- - -.. module:: scrapy.contrib.spidermiddleware.requestlimit - :synopsis: Request limit Spider Middleware - -.. class:: RequestLimitMiddleware - - Limits the maximum number of requests in the scheduler for each spider. When - a spider tries to schedule more than the allowed amount of requests, the new - requests (returned by the spider) will be dropped. - - The :class:`RequestLimitMiddleware` can be configured through the following - settings (see the settings documentation for more info): - - * :setting:`REQUESTS_QUEUE_SIZE` - If non zero, it will be used as an - upper limit for the amount of requests that can be scheduled per - domain. Can be set per spider using ``requests_queue_size`` attribute. 
- UrlLengthMiddleware ------------------- diff --git a/scrapy/contrib/spidermiddleware/requestlimit.py b/scrapy/contrib/spidermiddleware/requestlimit.py deleted file mode 100644 index bd52576f7..000000000 --- a/scrapy/contrib/spidermiddleware/requestlimit.py +++ /dev/null @@ -1,65 +0,0 @@ -""" -Request Limit Spider middleware - -See documentation in docs/topics/spider-middleware.rst -""" -from itertools import imap -from scrapy.xlib.pydispatch import dispatcher - -from scrapy import signals -from scrapy.project import crawler -from scrapy.exceptions import NotConfigured -from scrapy.conf import settings -from scrapy.http import Request -from scrapy import log - -class RequestLimitMiddleware(object): - - def __init__(self): - self.max_queue_size = settings.getint("REQUESTS_QUEUE_SIZE") - if not self.max_queue_size: - raise NotConfigured - - self.max_pending = {} - self.dropped_count = {} - - dispatcher.connect(self.spider_opened, signal=signals.spider_opened) - dispatcher.connect(self.spider_closed, signal=signals.spider_closed) - - def spider_opened(self, spider): - self.max_pending[spider] = getattr(spider, 'requests_queue_size', self.max_queue_size) - self.dropped_count[spider] = 0 - - def spider_closed(self, spider): - dropped_count = self.dropped_count[spider] - if dropped_count: - max_pending = self.max_pending[spider] - log.msg('Dropped %d request(s) because the scheduler queue size limit (%d requests) was exceeded' % \ - (dropped_count, max_pending), level=log.DEBUG, spider=spider) - del self.dropped_count[spider] - del self.max_pending[spider] - - def process_spider_output(self, response, result, spider): - max_pending = self.max_pending.get(spider, 0) - if max_pending: - return imap(lambda v: self._limit_requests(v, spider, max_pending), result) - else: - return result - - def _limit_requests(self, request_or_other, spider, max_pending): - if isinstance(request_or_other, Request): - free_slots = max_pending - self._pending_count(spider) - if free_slots > 0: - # Scheduler isn't saturated and it is fine to schedule more requests. - return request_or_other - else: - # Skip the request and give engine time to handle other tasks. - self.dropped_count[spider] += 1 - return None - else: - # Return others (non-requests) as is. 
- return request_or_other - - def _pending_count(self, spider): - pending = crawler.engine.scheduler.pending_requests.get(spider, []) - return len(pending) diff --git a/scrapy/settings/default_settings.py b/scrapy/settings/default_settings.py index 724f24237..4f0071a96 100644 --- a/scrapy/settings/default_settings.py +++ b/scrapy/settings/default_settings.py @@ -194,8 +194,6 @@ REDIRECT_MAX_METAREFRESH_DELAY = 100 REDIRECT_MAX_TIMES = 20 # uses Firefox default setting REDIRECT_PRIORITY_ADJUST = +2 -REQUESTS_QUEUE_SIZE = 0 - # contrib.middleware.retry.RetryMiddleware default settings RETRY_TIMES = 2 # initial response + 2 retries = 3 requests RETRY_HTTP_CODES = ['500', '503', '504', '400', '408'] @@ -220,7 +218,6 @@ SPIDER_MIDDLEWARES = {} SPIDER_MIDDLEWARES_BASE = { # Engine side 'scrapy.contrib.spidermiddleware.httperror.HttpErrorMiddleware': 50, - 'scrapy.contrib.spidermiddleware.requestlimit.RequestLimitMiddleware': 200, 'scrapy.contrib.spidermiddleware.offsite.OffsiteMiddleware': 500, 'scrapy.contrib.spidermiddleware.referer.RefererMiddleware': 700, 'scrapy.contrib.spidermiddleware.urllength.UrlLengthMiddleware': 800, From f29b346f793e58743eb62e6224f780566d7933da Mon Sep 17 00:00:00 2001 From: Pablo Hoffman Date: Wed, 22 Sep 2010 22:21:29 -0300 Subject: [PATCH 08/19] Fixed access to settings module name, broken after recent changes to Settings classes --- scrapy/cmdline.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scrapy/cmdline.py b/scrapy/cmdline.py index 9706828ed..134d88bb0 100644 --- a/scrapy/cmdline.py +++ b/scrapy/cmdline.py @@ -89,7 +89,7 @@ def _check_deprecated_scrapy_ctl(argv, inproject): with open(cfg_path, 'w') as f: f.write("# generated automatically - feel free to edit" + os.linesep) f.write("[settings]" + os.linesep) - f.write("default = %s" % settings.settings_module_path + os.linesep) + f.write("default = %s" % settings.settings_module.__name__ + os.linesep) def _run_print_help(parser, func, *a, **kw): try: From 37c25fe9a89f5f319d781738d3e8f7631ac6668a Mon Sep 17 00:00:00 2001 From: Pablo Hoffman Date: Thu, 23 Sep 2010 12:32:49 -0300 Subject: [PATCH 09/19] Fixed CrawlerSettings.__str__() method when settings_module is None --- scrapy/settings/__init__.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/scrapy/settings/__init__.py b/scrapy/settings/__init__.py index 7ca18b3ca..55374de76 100644 --- a/scrapy/settings/__init__.py +++ b/scrapy/settings/__init__.py @@ -56,7 +56,11 @@ class CrawlerSettings(Settings): return super(CrawlerSettings, self).__getitem__(opt_name) def __str__(self): - return "" % self.settings_module.__name__ + if self.settings_module: + return "" % \ + (self.settings_module.__name__, self.settings_module.__file__) + else: + return "" class SpiderSettings(Settings): From 754d0f53f9d6381e9eeb2b87fe23a0c0ef01e6f1 Mon Sep 17 00:00:00 2001 From: Pablo Hoffman Date: Thu, 23 Sep 2010 12:33:24 -0300 Subject: [PATCH 10/19] Fixed unbounded spider error in shell, and enclosed fetch() method in a try/except block for logging errors more reliably --- scrapy/shell.py | 21 ++++++++++++--------- 1 file changed, 12 insertions(+), 9 deletions(-) diff --git a/scrapy/shell.py b/scrapy/shell.py index a4f4594b0..573a42147 100644 --- a/scrapy/shell.py +++ b/scrapy/shell.py @@ -56,23 +56,26 @@ class Shell(object): if spider is None: spider = create_spider_for_request(self.crawler.spiders, request, \ BaseSpider('default'), log_multiple=True) + spider.set_crawler(self.crawler) self.crawler.engine.open_spider(spider) return 
self.crawler.engine.schedule(request, spider) def fetch(self, request_or_url, spider=None): - if isinstance(request_or_url, Request): - request = request_or_url - url = request.url - else: - url = any_to_uri(request_or_url) - request = Request(url, dont_filter=True) - response = None + # we enclose all this code in a try/except block to see errors when + # they happen in a thread try: + if isinstance(request_or_url, Request): + request = request_or_url + url = request.url + else: + url = any_to_uri(request_or_url) + request = Request(url, dont_filter=True) + response = None response = threads.blockingCallFromThread(reactor, \ self._schedule, request, spider) + self.populate_vars(url, response, request, spider) except: - log.err(Failure(), "Error fetching response", spider=spider) - self.populate_vars(url, response, request, spider) + log.err(Failure(), "Error fetching: %s" % request_or_url, spider=spider) def populate_vars(self, url=None, response=None, request=None, spider=None): item = self.item_class() From 79c0e34968a740f67c3ee3df4aff5a4c93adadbd Mon Sep 17 00:00:00 2001 From: Pablo Hoffman Date: Thu, 23 Sep 2010 12:50:46 -0300 Subject: [PATCH 11/19] Simplified CrawlerSettings.__str__() --- scrapy/settings/__init__.py | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/scrapy/settings/__init__.py b/scrapy/settings/__init__.py index 55374de76..42cc35121 100644 --- a/scrapy/settings/__init__.py +++ b/scrapy/settings/__init__.py @@ -56,11 +56,7 @@ class CrawlerSettings(Settings): return super(CrawlerSettings, self).__getitem__(opt_name) def __str__(self): - if self.settings_module: - return "" % \ - (self.settings_module.__name__, self.settings_module.__file__) - else: - return "" + return "" % self.settings_module class SpiderSettings(Settings): From a5ee05e8140e0a055bfecb6283bfbd432a66418f Mon Sep 17 00:00:00 2001 From: Pablo Hoffman Date: Thu, 23 Sep 2010 13:43:21 -0300 Subject: [PATCH 12/19] Added support for setting exit code in Scrapy commands. 
Closes #248 --- scrapy/cmdline.py | 1 + scrapy/command.py | 2 ++ 2 files changed, 3 insertions(+) diff --git a/scrapy/cmdline.py b/scrapy/cmdline.py index 134d88bb0..6884aad88 100644 --- a/scrapy/cmdline.py +++ b/scrapy/cmdline.py @@ -128,6 +128,7 @@ def execute(argv=None): opts, args = parser.parse_args(args=argv[1:]) _run_print_help(parser, cmd.process_options, args, opts) _run_print_help(parser, _run_command, cmd, args, opts) + sys.exit(cmd.exitcode) def _run_command(cmd, args, opts): if opts.profile or opts.lsprof: diff --git a/scrapy/command.py b/scrapy/command.py index a9c447e38..dc2de9ac8 100644 --- a/scrapy/command.py +++ b/scrapy/command.py @@ -21,6 +21,8 @@ class ScrapyCommand(object): # default settings to be used for this command instead of global defaults default_settings = {} + exitcode = 0 + def set_crawler(self, crawler): self._crawler = crawler From 318f7f4c58f6031485ba8966aff55676b77b7e62 Mon Sep 17 00:00:00 2001 From: Pablo Hoffman Date: Thu, 23 Sep 2010 13:49:29 -0300 Subject: [PATCH 13/19] Added support for passing code to evaluate in Scrapy shell command (closes #249) and simplified handling of shell errors --- scrapy/commands/shell.py | 16 ++++++++++++++-- scrapy/shell.py | 36 ++++++++++++++++++------------------ 2 files changed, 32 insertions(+), 20 deletions(-) diff --git a/scrapy/commands/shell.py b/scrapy/commands/shell.py index 20a82c6d9..ae4426d2f 100644 --- a/scrapy/commands/shell.py +++ b/scrapy/commands/shell.py @@ -6,6 +6,7 @@ See documentation in docs/topics/shell.rst from scrapy.command import ScrapyCommand from scrapy.shell import Shell +from scrapy import log class Command(ScrapyCommand): @@ -21,6 +22,11 @@ class Command(ScrapyCommand): def long_desc(self): return "Interactive console for scraping the given url" + def add_options(self, parser): + ScrapyCommand.add_options(self, parser) + parser.add_option("-c", dest="code", + help="evaluate the code in the shell, print the result and exit") + def update_vars(self, vars): """You can use this function to update the Scrapy objects that will be available in the shell @@ -29,6 +35,12 @@ class Command(ScrapyCommand): def run(self, args, opts): url = args[0] if args else None - shell = Shell(self.crawler, update_vars=self.update_vars, inthread=True) - shell.start(url=url).addBoth(lambda _: self.crawler.stop()) + shell = Shell(self.crawler, update_vars=self.update_vars, inthread=True, \ + code=opts.code) + def err(f): + log.err(f, "Shell error") + self.exitcode = 1 + d = shell.start(url=url) + d.addErrback(err) + d.addBoth(lambda _: self.crawler.stop()) self.crawler.start() diff --git a/scrapy/shell.py b/scrapy/shell.py index 573a42147..cb6706f86 100644 --- a/scrapy/shell.py +++ b/scrapy/shell.py @@ -27,12 +27,13 @@ class Shell(object): relevant_classes = (BaseSpider, Request, Response, BaseItem, \ XPathSelector, Settings) - def __init__(self, crawler, update_vars=None, inthread=False): + def __init__(self, crawler, update_vars=None, inthread=False, code=None): self.crawler = crawler self.vars = {} self.update_vars = update_vars or (lambda x: None) self.item_class = load_object(settings['DEFAULT_ITEM_CLASS']) self.inthread = inthread + self.code = code def start(self, *a, **kw): # disable accidental Ctrl-C key press from shutting down the engine @@ -50,7 +51,10 @@ class Shell(object): elif response: request = response.request self.populate_vars(request.url, response, request, spider) - start_python_console(self.vars) + if self.code: + print eval(self.code, globals(), self.vars) + else: + 
start_python_console(self.vars) def _schedule(self, request, spider): if spider is None: @@ -61,21 +65,16 @@ class Shell(object): return self.crawler.engine.schedule(request, spider) def fetch(self, request_or_url, spider=None): - # we enclose all this code in a try/except block to see errors when - # they happen in a thread - try: - if isinstance(request_or_url, Request): - request = request_or_url - url = request.url - else: - url = any_to_uri(request_or_url) - request = Request(url, dont_filter=True) - response = None - response = threads.blockingCallFromThread(reactor, \ - self._schedule, request, spider) - self.populate_vars(url, response, request, spider) - except: - log.err(Failure(), "Error fetching: %s" % request_or_url, spider=spider) + if isinstance(request_or_url, Request): + request = request_or_url + url = request.url + else: + url = any_to_uri(request_or_url) + request = Request(url, dont_filter=True) + response = None + response = threads.blockingCallFromThread(reactor, \ + self._schedule, request, spider) + self.populate_vars(url, response, request, spider) def populate_vars(self, url=None, response=None, request=None, spider=None): item = self.item_class() @@ -93,7 +92,8 @@ class Shell(object): self.vars['view'] = open_in_browser self.vars['shelp'] = self.print_help self.update_vars(self.vars) - self.print_help() + if not self.code: + self.print_help() def print_help(self): self.p("Available Scrapy objects:") From b78284b680c36884e9520d7d42c3bc256c97b4bc Mon Sep 17 00:00:00 2001 From: Pablo Hoffman Date: Thu, 23 Sep 2010 13:59:41 -0300 Subject: [PATCH 14/19] Fixed spider variable not properly populated in the Scrapy shell --- scrapy/shell.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/scrapy/shell.py b/scrapy/shell.py index cb6706f86..a2bd5d7a3 100644 --- a/scrapy/shell.py +++ b/scrapy/shell.py @@ -62,7 +62,9 @@ class Shell(object): BaseSpider('default'), log_multiple=True) spider.set_crawler(self.crawler) self.crawler.engine.open_spider(spider) - return self.crawler.engine.schedule(request, spider) + d = self.crawler.engine.schedule(request, spider) + d.addCallback(lambda x: (x, spider)) + return d def fetch(self, request_or_url, spider=None): if isinstance(request_or_url, Request): @@ -72,7 +74,7 @@ class Shell(object): url = any_to_uri(request_or_url) request = Request(url, dont_filter=True) response = None - response = threads.blockingCallFromThread(reactor, \ + response, spider = threads.blockingCallFromThread(reactor, \ self._schedule, request, spider) self.populate_vars(url, response, request, spider) From 622834bc089d3462ede466e4d46975e878e1da47 Mon Sep 17 00:00:00 2001 From: Pablo Hoffman Date: Thu, 23 Sep 2010 14:01:22 -0300 Subject: [PATCH 15/19] Removed unused imports, and use crawler.settings instead of scrapy.conf.settings in Scrapy Shell --- scrapy/shell.py | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/scrapy/shell.py b/scrapy/shell.py index a2bd5d7a3..5c245bbe3 100644 --- a/scrapy/shell.py +++ b/scrapy/shell.py @@ -7,9 +7,7 @@ See documentation in docs/topics/shell.rst import signal from twisted.internet import reactor, threads -from twisted.python.failure import Failure -from scrapy import log from scrapy.item import BaseItem from scrapy.spider import BaseSpider from scrapy.selector import XPathSelector, XmlXPathSelector, HtmlXPathSelector @@ -18,7 +16,6 @@ from scrapy.utils.misc import load_object from scrapy.utils.response import open_in_browser from scrapy.utils.url import any_to_uri from 
scrapy.utils.console import start_python_console -from scrapy.conf import settings from scrapy.settings import Settings from scrapy.http import Request, Response, TextResponse @@ -31,7 +28,7 @@ class Shell(object): self.crawler = crawler self.vars = {} self.update_vars = update_vars or (lambda x: None) - self.item_class = load_object(settings['DEFAULT_ITEM_CLASS']) + self.item_class = load_object(crawler.settings['DEFAULT_ITEM_CLASS']) self.inthread = inthread self.code = code @@ -81,7 +78,7 @@ class Shell(object): def populate_vars(self, url=None, response=None, request=None, spider=None): item = self.item_class() self.vars['item'] = item - self.vars['settings'] = settings + self.vars['settings'] = self.crawler.settings if url: if isinstance(response, TextResponse): self.vars['xxs'] = XmlXPathSelector(response) From 279dcc245f73db08f28c4f6fea053e65f3c7064a Mon Sep 17 00:00:00 2001 From: Pablo Hoffman Date: Sun, 26 Sep 2010 01:01:06 -0300 Subject: [PATCH 16/19] Fixed role name in Sphinx doc --- docs/topics/commands.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/topics/commands.rst b/docs/topics/commands.rst index 6339a4607..0ea9e13db 100644 --- a/docs/topics/commands.rst +++ b/docs/topics/commands.rst @@ -73,7 +73,7 @@ information on which commands must be run from inside projects, and which not. Also keep in mind that some commands may have slightly different behaviours when running them from inside projects. For example, the fetch command will use -spider-overridden behaviours (such as custom :settings:`USER_AGENT` per-spider +spider-overridden behaviours (such as custom :setting:`USER_AGENT` per-spider setting) if the url being fetched is associated with some specific spider. This is intentional, as the ``fetch`` command is meant to be used to check how spiders are downloading pages. From 0bf9e4627cfcc477f23fb81edabbad85c24911bd Mon Sep 17 00:00:00 2001 From: Martin Santos Date: Tue, 28 Sep 2010 16:29:37 -0300 Subject: [PATCH 17/19] added support to CloseSpider extension, for close the spider after N pages have been crawled. Using the CLOSESPIDER_PAGECOUNT setting. closes #253 --- docs/topics/extensions.rst | 14 ++++++++++++++ scrapy/contrib/closespider.py | 11 ++++++++++- scrapy/settings/default_settings.py | 1 + 3 files changed, 25 insertions(+), 1 deletion(-) diff --git a/docs/topics/extensions.rst b/docs/topics/extensions.rst index 539d4ef1f..e2f12fbce 100644 --- a/docs/topics/extensions.rst +++ b/docs/topics/extensions.rst @@ -302,6 +302,20 @@ that amount if items and those items are passed by the item pipeline, the spider will be closed with the reason ``closespider_itempassed``. If zero (or non set), spiders won't be closed by number of passed items. +.. setting:: CLOSESPIDER_PAGECOUNT + +CLOSESPIDER_PAGECOUNT +"""""""""""""""""""""" + +Default: ``0`` + +.. versionadded: 0.11 + +An integer which specifies the maximum number of responses to crawl. If the spider +crawls more than that, the spider will be closed with the reason +``closespider_pagecount``. If zero (or non set), spiders won't be closed by +number of crawled responses. 
+ StatsMailer extension ~~~~~~~~~~~~~~~~~~~~~ diff --git a/scrapy/contrib/closespider.py b/scrapy/contrib/closespider.py index a334c155a..0a04309cf 100644 --- a/scrapy/contrib/closespider.py +++ b/scrapy/contrib/closespider.py @@ -18,21 +18,30 @@ class CloseSpider(object): def __init__(self): self.timeout = settings.getint('CLOSESPIDER_TIMEOUT') self.itempassed = settings.getint('CLOSESPIDER_ITEMPASSED') + self.pagecount = settings.getint('CLOSESPIDER_PAGECOUNT') + self.pagecounts = defaultdict(int) self.counts = defaultdict(int) self.tasks = {} + if self.pagecount: + dispatcher.connect(self.page_count, signal=signals.response_received) if self.timeout: dispatcher.connect(self.spider_opened, signal=signals.spider_opened) if self.itempassed: dispatcher.connect(self.item_passed, signal=signals.item_passed) dispatcher.connect(self.spider_closed, signal=signals.spider_closed) + def page_count(self, response, request, spider): + self.pagecounts[spider] += 1 + if self.pagecounts[spider] == self.pagecount: + crawler.engine.close_spider(spider, 'closespider_pagecount') + def spider_opened(self, spider): self.tasks[spider] = reactor.callLater(self.timeout, \ crawler.engine.close_spider, spider=spider, \ reason='closespider_timeout') - + def item_passed(self, item, spider): self.counts[spider] += 1 if self.counts[spider] == self.itempassed: diff --git a/scrapy/settings/default_settings.py b/scrapy/settings/default_settings.py index 4f0071a96..0b6703189 100644 --- a/scrapy/settings/default_settings.py +++ b/scrapy/settings/default_settings.py @@ -19,6 +19,7 @@ BOT_NAME = 'scrapybot' BOT_VERSION = '1.0' CLOSESPIDER_TIMEOUT = 0 +CLOSESPIDER_PAGECOUNT = 0 CLOSESPIDER_ITEMPASSED = 0 COMMANDS_MODULE = '' From 7826869cb25412a743414822b8b10e2e817de460 Mon Sep 17 00:00:00 2001 From: Pablo Hoffman Date: Tue, 28 Sep 2010 16:44:53 -0300 Subject: [PATCH 18/19] Added missing colon --- docs/topics/extensions.rst | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/topics/extensions.rst b/docs/topics/extensions.rst index e2f12fbce..e30c389e4 100644 --- a/docs/topics/extensions.rst +++ b/docs/topics/extensions.rst @@ -309,13 +309,13 @@ CLOSESPIDER_PAGECOUNT Default: ``0`` -.. versionadded: 0.11 - An integer which specifies the maximum number of responses to crawl. If the spider crawls more than that, the spider will be closed with the reason ``closespider_pagecount``. If zero (or non set), spiders won't be closed by number of crawled responses. +.. versionadded:: 0.11 + StatsMailer extension ~~~~~~~~~~~~~~~~~~~~~ From d15a97ff61295a2c869bf4b034756b6480a68860 Mon Sep 17 00:00:00 2001 From: Pablo Hoffman Date: Tue, 28 Sep 2010 16:45:05 -0300 Subject: [PATCH 19/19] Updated Scrapy version in debian/changelog --- debian/changelog | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/debian/changelog b/debian/changelog index a01ddd7bb..46d5d3be1 100644 --- a/debian/changelog +++ b/debian/changelog @@ -1,4 +1,4 @@ -scrapy (0.10) unstable; urgency=low +scrapy (0.11) unstable; urgency=low * Initial release.
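A minimal sketch of how the command exit-code support added in patch 12/19 above could be used from a custom command. Only the ``ScrapyCommand`` base class, the ``run(args, opts)`` hook and the new ``exitcode`` attribute are taken from this series; the command's purpose and its failure condition are illustrative assumptions::

    from scrapy.command import ScrapyCommand

    class Command(ScrapyCommand):
        """Hypothetical command that reports failure through its exit code."""

        def run(self, args, opts):
            if not args:
                # scrapy.cmdline.execute() now finishes with sys.exit(cmd.exitcode),
                # so a non-zero value set here reaches the calling shell
                self.exitcode = 1
                return
            # ... the command's real work would go here ...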
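The ``-c`` option added to the shell command in patch 13/19 evaluates an expression against the shell variables, prints the result and exits, and a fetch error now makes the command finish with exit code 1. A plausible invocation, assuming ``response`` is among the variables populated by ``populate_vars()`` and that the URL is given as the positional argument, would be::

    $ scrapy shell http://www.example.com -c "response.status"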
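A short sketch of how the ``CLOSESPIDER_PAGECOUNT`` setting introduced in patch 17/19 would typically be enabled, next to the pre-existing CloseSpider settings, in a project settings module; the concrete values are arbitrary examples rather than recommended defaults::

    CLOSESPIDER_PAGECOUNT = 100    # close the spider after 100 crawled responses
    CLOSESPIDER_TIMEOUT = 3600     # existing setting: close the spider after one hour
    CLOSESPIDER_ITEMPASSED = 0     # existing setting: 0 (the default) disables this limit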