mirror of https://github.com/scrapy/scrapy.git synced 2025-02-25 20:44:18 +00:00
Commit 51325fc93e by Pablo Hoffman, 2010-09-27 07:57:23 -03:00
42 changed files with 419 additions and 425 deletions

View File

@@ -12,9 +12,6 @@ else
     exit 1
 fi
 
-# disable custom settings for running tests in a neutral environment
-export SCRAPY_SETTINGS_DISABLED=1
-
 # use vsftpd (if available) for testing ftp feed storage
 if type vsftpd >/dev/null 2>&1; then
     vsftpd_conf=$(mktemp /tmp/vsftpd-XXXX)

View File

@@ -171,7 +171,7 @@ higher) in your spider::
 
        name = 'myspider'
-       download_delay = 2
+       DOWNLOAD_DELAY = 2
 
        # [ ... rest of the spider code ... ]

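A minimal sketch of the per-spider form shown in the hunk above; the spider name, URL and callback are placeholders, not part of this commit::

    from scrapy.spider import BaseSpider

    class MySpider(BaseSpider):
        name = 'myspider'
        start_urls = ['http://www.example.com']

        # per-spider setting, resolved through spider.settings
        DOWNLOAD_DELAY = 2

        def parse(self, response):
            self.log("Crawled %s" % response.url)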
View File

@ -73,10 +73,10 @@ information on which commands must be run from inside projects, and which not.
Also keep in mind that some commands may have slightly different behaviours Also keep in mind that some commands may have slightly different behaviours
when running them from inside projects. For example, the fetch command will use when running them from inside projects. For example, the fetch command will use
spider-overridden behaviours (such as custom ``user_agent`` attribute) if the spider-overridden behaviours (such as custom :setting:`USER_AGENT` per-spider
url being fetched is associated with some specific spider. This is intentional, setting) if the url being fetched is associated with some specific spider. This
as the ``fetch`` command is meant to be used to check how spiders are is intentional, as the ``fetch`` command is meant to be used to check how
downloading pages. spiders are downloading pages.
.. _topics-commands-ref: .. _topics-commands-ref:
@ -243,7 +243,7 @@ Downloads the given URL using the Scrapy downloader and writes the contents to
standard output. standard output.
The interesting thing about this command is that it fetches the page how the The interesting thing about this command is that it fetches the page how the
the spider would download it. For example, if the spider has an ``user_agent`` the spider would download it. For example, if the spider has an ``USER_AGENT``
attribute which overrides the User Agent, it will use that one. attribute which overrides the User Agent, it will use that one.
So this command can be used to "see" how your spider would fetch certain page. So this command can be used to "see" how your spider would fetch certain page.

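A hedged illustration of the behaviour described above; the spider, domain and URL are made up::

    from scrapy.spider import BaseSpider

    class ExampleSpider(BaseSpider):
        name = 'example.com'
        allowed_domains = ['example.com']

        # per-spider setting picked up by UserAgentMiddleware via spider.settings
        USER_AGENT = 'examplebot/1.0 (+http://www.example.com/bot)'

Running ``scrapy fetch http://www.example.com/some/page.html`` from inside the project would then send that User-Agent, because the URL is associated with this spider.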
View File

@@ -177,9 +177,7 @@ DefaultHeadersMiddleware
 .. class:: DefaultHeadersMiddleware
 
     This middleware sets all default requests headers specified in the
-    :setting:`DEFAULT_REQUEST_HEADERS` setting plus those found in spider
-    ``default_request_headers`` attribute. Spider headers has precedence over
-    global headers.
+    :setting:`DEFAULT_REQUEST_HEADERS` setting.
 
 DownloadTimeoutMiddleware
 -------------------------
@@ -189,10 +187,8 @@ DownloadTimeoutMiddleware
 .. class:: DownloadTimeoutMiddleware
 
-    This middleware sets download timeout for requests based on
-    `download_timeout` spider attribute. It doesn't override timeout if
-    `download_timeout` is already set in request meta. Otherwise,
-    :setting:`DOWNLOAD_TIMEOUT` setting is used as default download timeout.
+    This middleware sets the download timeout for requests specified in the
+    :setting:`DOWNLOAD_TIMEOUT` setting.
 
 HttpAuthMiddleware
 ------------------

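A short sketch of how the two settings referenced above would typically be declared in a project settings module; the module and values are illustrative only::

    # myproject/settings.py (hypothetical project)
    DEFAULT_REQUEST_HEADERS = {
        'Accept': 'text/html,application/xhtml+xml,application/xml',
        'Accept-Language': 'en',
    }
    DOWNLOAD_TIMEOUT = 60   # seconds, applied by DownloadTimeoutMiddleware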
View File

@@ -39,10 +39,9 @@ different precedence. Here is the list of them in decreasing order of
 precedence:
 
  1. Global overrides (most precedence)
- 2. Environment variables
- 3. scrapy_settings
- 4. Default settings per-command
- 5. Default global settings (less precedence)
+ 2. Project settings module
+ 3. Default settings per-command
+ 4. Default global settings (less precedence)
 
 These mechanisms are described in more detail below.
 
@@ -65,27 +64,14 @@ Example::
 
     scrapy crawl domain.com --set LOG_FILE=scrapy.log
 
-2. Environment variables
-------------------------
+2. Project settings module
+--------------------------
 
-You can populate settings using environment variables prefixed with
-``SCRAPY_``. For example, to change the log file location on Unix systems::
-
-    $ export SCRAPY_LOG_FILE=scrapy.log
-    $ scrapy crawl example.com
-
-In Windows systems, you can change the environment variables from the Control
-Panel following `these guidelines`_.
-
-.. _these guidelines: http://www.microsoft.com/resources/documentation/windows/xp/all/proddocs/en-us/sysdm_advancd_environmnt_addchange_variable.mspx
-
-3. scrapy_settings
-------------------
-
-scrapy_settings is the standard configuration file for your Scrapy project.
-It's where most of your custom settings will be populated.
-
-4. Default settings per-command
+The project settings module is the standard configuration file for your Scrapy
+project. It's where most of your custom settings will be populated. For
+example: ``myproject.settings``.
+
+3. Default settings per-command
 -------------------------------
 
 Each :doc:`Scrapy tool </topics/commands>` command can have its own default
@@ -93,11 +79,11 @@ settings, which override the global default settings. Those custom command
 settings are specified in the ``default_settings`` attribute of the command
 class.
 
-5. Default global settings
+4. Default global settings
 --------------------------
 
-The global defaults are located in scrapy.conf.default_settings and documented
-in the :ref:`topics-settings-ref` section.
+The global defaults are located in the ``scrapy.settings.default_settings``
+module and documented in the :ref:`topics-settings-ref` section.
 
 How to access settings
 ======================
 
@@ -412,9 +398,7 @@ setting (which is enabled by default). By default, Scrapy doesn't wait a fixed
 amount of time between requests, but uses a random interval between 0.5 and 1.5
 * :setting:`DOWNLOAD_DELAY`.
 
-Another way to change the download delay (per spider, instead of globally) is
-by using the ``download_delay`` spider attribute, which takes more precedence
-than this setting.
+You can also change this setting per spider.
 
 .. setting:: DOWNLOAD_HANDLERS
 
@@ -784,18 +768,6 @@ Default: ``+2``
 Adjust redirect request priority relative to original request.
 A negative priority adjust means more priority.
 
-.. setting:: REQUESTS_QUEUE_SIZE
-
-REQUESTS_QUEUE_SIZE
--------------------
-
-Default: ``0``
-
-Scope: ``scrapy.contrib.spidermiddleware.limit``
-
-If non zero, it will be used as an upper limit for the amount of requests that
-can be scheduled per domain.
-
 .. setting:: ROBOTSTXT_OBEY
 
 ROBOTSTXT_OBEY
@@ -882,7 +854,6 @@ Default::
     {
        'scrapy.contrib.spidermiddleware.httperror.HttpErrorMiddleware': 50,
        'scrapy.contrib.itemsampler.ItemSamplerMiddleware': 100,
-       'scrapy.contrib.spidermiddleware.requestlimit.RequestLimitMiddleware': 200,
        'scrapy.contrib.spidermiddleware.offsite.OffsiteMiddleware': 500,
        'scrapy.contrib.spidermiddleware.referer.RefererMiddleware': 700,
        'scrapy.contrib.spidermiddleware.urllength.UrlLengthMiddleware': 800,

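The new precedence order can be exercised directly against the ``CrawlerSettings`` class added by this commit. A minimal sketch with a stand-in settings module, assuming (as the new tests do) that command-level defaults land in ``settings.defaults``::

    from scrapy.settings import CrawlerSettings

    class module_mock(object):       # stands in for myproject.settings
        DOWNLOAD_TIMEOUT = '3'

    settings = CrawlerSettings(module_mock)
    settings.defaults['DOWNLOAD_TIMEOUT'] = '99'   # e.g. per-command defaults
    print settings.getint('DOWNLOAD_TIMEOUT')      # 3  - project module beats defaults
    settings.overrides['DOWNLOAD_TIMEOUT'] = '15'  # global overrides (e.g. --set)
    print settings.getint('DOWNLOAD_TIMEOUT')      # 15 - overrides beat everything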
View File

@@ -245,25 +245,6 @@ RefererMiddleware
 
     Populates Request referer field, based on the Response which originated it.
 
-RequestLimitMiddleware
-----------------------
-
-.. module:: scrapy.contrib.spidermiddleware.requestlimit
-   :synopsis: Request limit Spider Middleware
-
-.. class:: RequestLimitMiddleware
-
-    Limits the maximum number of requests in the scheduler for each spider. When
-    a spider tries to schedule more than the allowed amount of requests, the new
-    requests (returned by the spider) will be dropped.
-
-    The :class:`RequestLimitMiddleware` can be configured through the following
-    settings (see the settings documentation for more info):
-
-       * :setting:`REQUESTS_QUEUE_SIZE` - If non zero, it will be used as an
-         upper limit for the amount of requests that can be scheduled per
-         domain. Can be set per spider using ``requests_queue_size`` attribute.
-
 UrlLengthMiddleware
 -------------------

View File

@@ -4,11 +4,6 @@
 # default. All the other settings are documented here:
 #
 #     http://doc.scrapy.org/topics/settings.html
-#
-# Or you can copy and paste them from where they're defined in Scrapy:
-#
-#     scrapy/conf/default_settings.py
-#
 
 BOT_NAME = 'googledir'
 BOT_VERSION = '1.0'

View File

@@ -4,11 +4,6 @@
 # default. All the other settings are documented here:
 #
 #     http://doc.scrapy.org/topics/settings.html
-#
-# Or you can copy and paste them from where they're defined in Scrapy:
-#
-#     scrapy/conf/default_settings.py
-#
 
 BOT_NAME = 'imdb'
 BOT_VERSION = '1.0'

View File

@@ -2,8 +2,8 @@
 Scrapy - a screen scraping framework written in Python
 """
 
-version_info = (0, 10, 3, 'dev')
-__version__ = "0.10.3"
+version_info = (0, 11, 0, 'dev')
+__version__ = "0.11"
 
 import sys, os, warnings

View File

@@ -89,7 +89,7 @@ def _check_deprecated_scrapy_ctl(argv, inproject):
         with open(cfg_path, 'w') as f:
             f.write("# generated automatically - feel free to edit" + os.linesep)
             f.write("[settings]" + os.linesep)
-            f.write("default = %s" % settings.settings_module_path + os.linesep)
+            f.write("default = %s" % settings.settings_module.__name__ + os.linesep)
 
 def _run_print_help(parser, func, *a, **kw):
     try:
@@ -128,6 +128,7 @@ def execute(argv=None):
     opts, args = parser.parse_args(args=argv[1:])
     _run_print_help(parser, cmd.process_options, args, opts)
     _run_print_help(parser, _run_command, cmd, args, opts)
+    sys.exit(cmd.exitcode)
 
 def _run_command(cmd, args, opts):
     if opts.profile or opts.lsprof:

View File

@@ -21,6 +21,8 @@ class ScrapyCommand(object):
     # default settings to be used for this command instead of global defaults
     default_settings = {}
 
+    exitcode = 0
+
     def set_crawler(self, crawler):
         self._crawler = crawler

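A sketch of how a command might use the new attribute; the command itself is hypothetical, only ``exitcode`` and ``run(args, opts)`` come from this commit::

    from scrapy.command import ScrapyCommand

    class Command(ScrapyCommand):

        def run(self, args, opts):
            if not args:
                # reported to the shell via sys.exit(cmd.exitcode) in scrapy.cmdline
                self.exitcode = 1
                return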
View File

@@ -6,6 +6,7 @@ See documentation in docs/topics/shell.rst
 
 from scrapy.command import ScrapyCommand
 from scrapy.shell import Shell
+from scrapy import log
 
 class Command(ScrapyCommand):
 
@@ -21,6 +22,11 @@ class Command(ScrapyCommand):
     def long_desc(self):
         return "Interactive console for scraping the given url"
 
+    def add_options(self, parser):
+        ScrapyCommand.add_options(self, parser)
+        parser.add_option("-c", dest="code",
+            help="evaluate the code in the shell, print the result and exit")
+
     def update_vars(self, vars):
         """You can use this function to update the Scrapy objects that will be
         available in the shell
@@ -29,6 +35,12 @@ class Command(ScrapyCommand):
 
     def run(self, args, opts):
         url = args[0] if args else None
-        shell = Shell(self.crawler, update_vars=self.update_vars, inthread=True)
-        shell.start(url=url).addBoth(lambda _: self.crawler.stop())
+        shell = Shell(self.crawler, update_vars=self.update_vars, inthread=True, \
+            code=opts.code)
+        def err(f):
+            log.err(f, "Shell error")
+            self.exitcode = 1
+        d = shell.start(url=url)
+        d.addErrback(err)
+        d.addBoth(lambda _: self.crawler.stop())
         self.crawler.start()

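Assuming the usual shell variables (``response`` in particular) are populated, the new option would be used along these lines; the URL and expression are placeholders::

    $ scrapy shell http://www.example.com/ -c 'response.status'

The evaluated result is printed and the process exits instead of dropping into the interactive console.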
scrapy/conf.py (new file, 37 lines)
View File

@@ -0,0 +1,37 @@
"""
Scrapy settings manager
See documentation in docs/topics/settings.rst
"""
import os
import cPickle as pickle
from scrapy.settings import CrawlerSettings
from scrapy.utils.conf import init_env
ENVVAR = 'SCRAPY_SETTINGS_MODULE'
def get_project_settings():
if ENVVAR not in os.environ:
project = os.environ.get('SCRAPY_PROJECT', 'default')
init_env(project)
settings_module_path = os.environ.get(ENVVAR, 'scrapy_settings')
try:
settings_module = __import__(settings_module_path, {}, {}, [''])
except ImportError:
settings_module = None
settings = CrawlerSettings(settings_module)
# XXX: remove this hack
pickled_settings = os.environ.get("SCRAPY_PICKLED_SETTINGS_TO_OVERRIDE")
settings.overrides = pickle.loads(pickled_settings) if pickled_settings else {}
# XXX: deprecate and remove this functionality
for k, v in os.environ.items():
if k.startswith('SCRAPY_'):
settings.overrides[k[7:]] = v
return settings
settings = get_project_settings()

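A minimal usage sketch for the new module, assuming it runs where a project (or ``SCRAPY_SETTINGS_MODULE``) is available; the key names are only examples::

    from scrapy.conf import get_project_settings

    settings = get_project_settings()
    print settings.get('BOT_NAME')              # from the project settings module, if defined
    print settings.getint('DOWNLOAD_TIMEOUT')   # falls back to the global default (180)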
View File

@@ -1,94 +0,0 @@
"""
Scrapy settings manager
See documentation in docs/topics/settings.rst
"""
import os
import cPickle as pickle
from scrapy.conf import default_settings
from scrapy.utils.conf import init_env
import_ = lambda x: __import__(x, {}, {}, [''])
class Settings(object):
def __init__(self, values=None):
self.values = values.copy() if values else {}
self.global_defaults = default_settings
def __getitem__(self, opt_name):
if opt_name in self.values:
return self.values[opt_name]
return getattr(self.global_defaults, opt_name, None)
def get(self, name, default=None):
return self[name] if self[name] is not None else default
def getbool(self, name, default=False):
"""
True is: 1, '1', True
False is: 0, '0', False, None
"""
return bool(int(self.get(name, default)))
def getint(self, name, default=0):
return int(self.get(name, default))
def getfloat(self, name, default=0.0):
return float(self.get(name, default))
def getlist(self, name, default=None):
value = self.get(name)
if value is None:
return default or []
elif hasattr(value, '__iter__'):
return value
else:
return str(value).split(',')
class EnvironmentSettings(Settings):
ENVVAR = 'SCRAPY_SETTINGS_MODULE'
def __init__(self):
super(EnvironmentSettings, self).__init__()
self.defaults = {}
self.disabled = os.environ.get('SCRAPY_SETTINGS_DISABLED', False)
if self.ENVVAR not in os.environ:
project = os.environ.get('SCRAPY_PROJECT', 'default')
init_env(project)
settings_module_path = os.environ.get(self.ENVVAR, 'scrapy_settings')
self.set_settings_module(settings_module_path)
# XXX: find a better solution for this hack
pickled_settings = os.environ.get("SCRAPY_PICKLED_SETTINGS_TO_OVERRIDE")
self.overrides = pickle.loads(pickled_settings) if pickled_settings else {}
def set_settings_module(self, settings_module_path):
self.settings_module_path = settings_module_path
try:
self.settings_module = import_(settings_module_path)
except ImportError:
self.settings_module = None
def __getitem__(self, opt_name):
if not self.disabled:
if opt_name in self.overrides:
return self.overrides[opt_name]
if 'SCRAPY_' + opt_name in os.environ:
return os.environ['SCRAPY_' + opt_name]
if hasattr(self.settings_module, opt_name):
return getattr(self.settings_module, opt_name)
if opt_name in self.defaults:
return self.defaults[opt_name]
return super(EnvironmentSettings, self).__getitem__(opt_name)
def __str__(self):
return "<Settings %r>" % self.settings_module_path
settings = EnvironmentSettings()

View File

@@ -10,18 +10,10 @@ from scrapy.utils.python import WeakKeyCache
 
 class DefaultHeadersMiddleware(object):
 
     def __init__(self, settings=conf.settings):
-        self.global_default_headers = settings.get('DEFAULT_REQUEST_HEADERS')
         self._headers = WeakKeyCache(self._default_headers)
 
     def _default_headers(self, spider):
-        headers = dict(self.global_default_headers)
-        spider_headers = getattr(spider, 'default_request_headers', None) or {}
-        for k, v in spider_headers.iteritems():
-            if v:
-                headers[k] = v
-            else:
-                headers.pop(k, None)
-        return headers.items()
+        return spider.settings.get('DEFAULT_REQUEST_HEADERS').items()
 
     def process_request(self, request, spider):
         for k, v in self._headers[spider]:

View File

@@ -4,6 +4,7 @@ Download timeout middleware
 See documentation in docs/topics/downloader-middleware.rst
 """
 from scrapy.utils.python import WeakKeyCache
+from scrapy.utils import deprecate
 
 
 class DownloadTimeoutMiddleware(object):
@@ -12,7 +13,10 @@ class DownloadTimeoutMiddleware(object):
         self._cache = WeakKeyCache(self._download_timeout)
 
     def _download_timeout(self, spider):
-        return getattr(spider, "download_timeout", None)
+        if hasattr(spider, 'download_timeout'):
+            deprecate.attribute(spider, 'download_timeout', 'DOWNLOAD_TIMEOUT')
+            return spider.download_timeout
+        return spider.settings.getint('DOWNLOAD_TIMEOUT')
 
     def process_request(self, request, spider):
         timeout = self._cache[spider]

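A sketch of the two ways the timeout can now be influenced, per the middleware above; the spider and URLs are invented::

    from scrapy.spider import BaseSpider
    from scrapy.http import Request

    class SlowSiteSpider(BaseSpider):
        name = 'slowsite'
        DOWNLOAD_TIMEOUT = 600      # per-spider setting, read through spider.settings

        def parse(self, response):
            # a value already present in request meta is not overridden
            return [Request('http://www.example.com/big-file',
                            meta={'download_timeout': 30},
                            callback=self.parse_file)]

        def parse_file(self, response):
            pass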
View File

@@ -11,9 +11,6 @@ once the spider has finished crawling all regular (non failed) pages. Once
 there is no more failed pages to retry this middleware sends a signal
 (retry_complete), so other extensions could connect to that signal.
 
-Default values are located in scrapy.conf.default_settings, like any other
-setting
-
 About HTTP errors to consider:
 
 - You may want to remove 400 from RETRY_HTTP_CODES, if you stick to the HTTP

View File

@@ -54,8 +54,7 @@ class RobotsTxtMiddleware(object):
 
     def spider_opened(self, spider):
         self._spider_netlocs[spider] = set()
-        self._useragents[spider] = getattr(spider, 'user_agent', None) \
-            or settings['USER_AGENT']
+        self._useragents[spider] = spider.settings['USER_AGENT']
 
     def spider_closed(self, spider):
         for netloc in self._spider_netlocs[spider]:

View File

@@ -1,18 +1,20 @@
 """Set User-Agent header per spider or use a default value from settings"""
 
-from scrapy.conf import settings
 from scrapy.utils.python import WeakKeyCache
+from scrapy.utils import deprecate
 
 
 class UserAgentMiddleware(object):
     """This middleware allows spiders to override the user_agent"""
 
-    def __init__(self, settings=settings):
+    def __init__(self):
         self.cache = WeakKeyCache(self._user_agent)
-        self.default_useragent = settings.get('USER_AGENT')
 
     def _user_agent(self, spider):
-        return getattr(spider, 'user_agent', None) or self.default_useragent
+        if hasattr(spider, 'user_agent'):
+            deprecate.attribute(spider, 'user_agent', 'USER_AGENT')
+            return spider.user_agent
+        return spider.settings['USER_AGENT']
 
     def process_request(self, request, spider):
         ua = self.cache[spider]

View File

@@ -1,65 +0,0 @@
"""
Request Limit Spider middleware
See documentation in docs/topics/spider-middleware.rst
"""
from itertools import imap
from scrapy.xlib.pydispatch import dispatcher
from scrapy import signals
from scrapy.project import crawler
from scrapy.exceptions import NotConfigured
from scrapy.conf import settings
from scrapy.http import Request
from scrapy import log
class RequestLimitMiddleware(object):
def __init__(self):
self.max_queue_size = settings.getint("REQUESTS_QUEUE_SIZE")
if not self.max_queue_size:
raise NotConfigured
self.max_pending = {}
self.dropped_count = {}
dispatcher.connect(self.spider_opened, signal=signals.spider_opened)
dispatcher.connect(self.spider_closed, signal=signals.spider_closed)
def spider_opened(self, spider):
self.max_pending[spider] = getattr(spider, 'requests_queue_size', self.max_queue_size)
self.dropped_count[spider] = 0
def spider_closed(self, spider):
dropped_count = self.dropped_count[spider]
if dropped_count:
max_pending = self.max_pending[spider]
log.msg('Dropped %d request(s) because the scheduler queue size limit (%d requests) was exceeded' % \
(dropped_count, max_pending), level=log.DEBUG, spider=spider)
del self.dropped_count[spider]
del self.max_pending[spider]
def process_spider_output(self, response, result, spider):
max_pending = self.max_pending.get(spider, 0)
if max_pending:
return imap(lambda v: self._limit_requests(v, spider, max_pending), result)
else:
return result
def _limit_requests(self, request_or_other, spider, max_pending):
if isinstance(request_or_other, Request):
free_slots = max_pending - self._pending_count(spider)
if free_slots > 0:
# Scheduler isn't saturated and it is fine to schedule more requests.
return request_or_other
else:
# Skip the request and give engine time to handle other tasks.
self.dropped_count[spider] += 1
return None
else:
# Return others (non-requests) as is.
return request_or_other
def _pending_count(self, spider):
pending = crawler.engine.scheduler.pending_requests.get(spider, [])
return len(pending)

View File

@@ -12,6 +12,7 @@ from scrapy.exceptions import IgnoreRequest
 from scrapy.conf import settings
 from scrapy.utils.defer import mustbe_deferred
 from scrapy.utils.signal import send_catch_log
+from scrapy.utils import deprecate
 from scrapy import signals
 from scrapy import log
 from .middleware import DownloaderMiddlewareManager
@@ -21,18 +22,21 @@ from .handlers import DownloadHandlers
 class SpiderInfo(object):
     """Simple class to keep information and state for each open spider"""
 
-    def __init__(self, download_delay=None, max_concurrent_requests=None):
-        if download_delay is None:
-            self._download_delay = settings.getfloat('DOWNLOAD_DELAY')
+    def __init__(self, spider):
+        if hasattr(spider, 'download_delay'):
+            deprecate.attribute(spider, 'download_delay', 'DOWNLOAD_DELAY')
+            self._download_delay = spider.download_delay
         else:
-            self._download_delay = float(download_delay)
+            self._download_delay = spider.settings.getfloat('DOWNLOAD_DELAY')
         if self._download_delay:
             self.max_concurrent_requests = 1
-        elif max_concurrent_requests is None:
-            self.max_concurrent_requests = settings.getint('CONCURRENT_REQUESTS_PER_SPIDER')
         else:
-            self.max_concurrent_requests = max_concurrent_requests
-        if self._download_delay and settings.getbool('RANDOMIZE_DOWNLOAD_DELAY'):
+            if hasattr(spider, 'max_concurrent_requests'):
+                deprecate.attribute(spider, 'max_concurrent_requests', 'CONCURRENT_REQUESTS_PER_SPIDER')
+                self.max_concurrent_requests = spider.max_concurrent_requests
+            else:
+                self.max_concurrent_requests = spider.settings.getint('CONCURRENT_REQUESTS_PER_SPIDER')
+        if self._download_delay and spider.settings.getbool('RANDOMIZE_DOWNLOAD_DELAY'):
             # same policy as wget --random-wait
             self.random_delay_interval = (0.5*self._download_delay, \
                 1.5*self._download_delay)
@@ -178,10 +182,7 @@ class Downloader(object):
 
     def open_spider(self, spider):
         """Allocate resources to begin processing a spider"""
         assert spider not in self.sites, "Spider already opened: %s" % spider
-        self.sites[spider] = SpiderInfo(
-            download_delay=getattr(spider, 'download_delay', None),
-            max_concurrent_requests=getattr(spider, 'max_concurrent_requests', None)
-        )
+        self.sites[spider] = SpiderInfo(spider)
 
     def close_spider(self, spider):
         """Free any resources associated with the given spider"""

View File

@@ -8,10 +8,6 @@ from twisted.internet import defer
 from scrapy.http import Headers
 from scrapy.utils.httpobj import urlparse_cached
 from scrapy.core.downloader.responsetypes import responsetypes
-from scrapy.conf import settings
-
-
-DOWNLOAD_TIMEOUT = settings.getint('DOWNLOAD_TIMEOUT')
 
 
 def _parsed_url_args(parsed):
@@ -89,7 +85,7 @@ class ScrapyHTTPClientFactory(HTTPClientFactory):
     followRedirect = False
     afterFoundGet = False
 
-    def __init__(self, request, timeout=DOWNLOAD_TIMEOUT):
+    def __init__(self, request, timeout=180):
         self.url = urldefrag(request.url)[0]
         self.method = request.method
         self.body = request.body or None

View File

@@ -54,6 +54,7 @@ class Crawler(object):
     @defer.inlineCallbacks
     def _start_spider(self, spider, requests):
         """Don't call this method. Use self.queue to start new spiders"""
+        spider.set_crawler(self)
         yield defer.maybeDeferred(self.engine.open_spider, spider)
         for request in requests:
             self.engine.crawl(request, spider)

View File

@@ -0,0 +1,81 @@
from . import default_settings
class Settings(object):
def __init__(self, values=None):
self.values = values.copy() if values else {}
self.global_defaults = default_settings
def __getitem__(self, opt_name):
if opt_name in self.values:
return self.values[opt_name]
return getattr(self.global_defaults, opt_name, None)
def get(self, name, default=None):
return self[name] if self[name] is not None else default
def getbool(self, name, default=False):
"""
True is: 1, '1', True
False is: 0, '0', False, None
"""
return bool(int(self.get(name, default)))
def getint(self, name, default=0):
return int(self.get(name, default))
def getfloat(self, name, default=0.0):
return float(self.get(name, default))
def getlist(self, name, default=None):
value = self.get(name)
if value is None:
return default or []
elif hasattr(value, '__iter__'):
return value
else:
return str(value).split(',')
class CrawlerSettings(Settings):
def __init__(self, settings_module=None, **kw):
super(CrawlerSettings, self).__init__(**kw)
self.settings_module = settings_module
self.overrides = {}
self.defaults = {}
def __getitem__(self, opt_name):
if opt_name in self.overrides:
return self.overrides[opt_name]
if self.settings_module and hasattr(self.settings_module, opt_name):
return getattr(self.settings_module, opt_name)
if opt_name in self.defaults:
return self.defaults[opt_name]
return super(CrawlerSettings, self).__getitem__(opt_name)
def __str__(self):
return "<CrawlerSettings module=%r>" % self.settings_module
class SpiderSettings(Settings):
def __init__(self, spider, crawler_settings, **kw):
super(SpiderSettings, self).__init__(**kw)
self.spider = spider
self.cset = crawler_settings
def __getitem__(self, opt_name):
if opt_name in self.cset.overrides:
return self.cset.overrides[opt_name]
if hasattr(self.spider, opt_name):
return getattr(self.spider, opt_name)
if self.cset.settings_module and hasattr(self.cset.settings_module, opt_name):
return getattr(self.cset.settings_module, opt_name)
if opt_name in self.cset.defaults:
return self.cset.defaults[opt_name]
return super(SpiderSettings, self).__getitem__(opt_name)
def __str__(self):
return "<SpiderSettings spider=%r>" % self.spider.name

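A small sketch of the resolution order implemented by ``SpiderSettings`` above, mirroring the tests added later in this commit; the module mock is a stand-in for a project settings module::

    from scrapy.settings import CrawlerSettings, SpiderSettings
    from scrapy.spider import BaseSpider

    class module_mock(object):
        DOWNLOAD_TIMEOUT = '3'

    crawler_settings = CrawlerSettings(module_mock)
    spider = BaseSpider('example', DOWNLOAD_TIMEOUT='12')   # per-spider attribute
    settings = SpiderSettings(spider, crawler_settings)

    print settings.getint('DOWNLOAD_TIMEOUT')   # 12 - spider attribute beats the module
    crawler_settings.overrides['DOWNLOAD_TIMEOUT'] = '15'
    print settings.getint('DOWNLOAD_TIMEOUT')   # 15 - overrides beat everything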
View File

@@ -194,8 +194,6 @@ REDIRECT_MAX_METAREFRESH_DELAY = 100
 REDIRECT_MAX_TIMES = 20 # uses Firefox default setting
 REDIRECT_PRIORITY_ADJUST = +2
 
-REQUESTS_QUEUE_SIZE = 0
-
 # contrib.middleware.retry.RetryMiddleware default settings
 RETRY_TIMES = 2 # initial response + 2 retries = 3 requests
 RETRY_HTTP_CODES = ['500', '503', '504', '400', '408']
@@ -220,7 +218,6 @@ SPIDER_MIDDLEWARES = {}
 SPIDER_MIDDLEWARES_BASE = {
     # Engine side
     'scrapy.contrib.spidermiddleware.httperror.HttpErrorMiddleware': 50,
-    'scrapy.contrib.spidermiddleware.requestlimit.RequestLimitMiddleware': 200,
     'scrapy.contrib.spidermiddleware.offsite.OffsiteMiddleware': 500,
     'scrapy.contrib.spidermiddleware.referer.RefererMiddleware': 700,
     'scrapy.contrib.spidermiddleware.urllength.UrlLengthMiddleware': 800,

View File

@@ -7,9 +7,7 @@ See documentation in docs/topics/shell.rst
 import signal
 
 from twisted.internet import reactor, threads
-from twisted.python.failure import Failure
 
-from scrapy import log
 from scrapy.item import BaseItem
 from scrapy.spider import BaseSpider
 from scrapy.selector import XPathSelector, XmlXPathSelector, HtmlXPathSelector
@@ -18,7 +16,7 @@ from scrapy.utils.misc import load_object
 from scrapy.utils.response import open_in_browser
 from scrapy.utils.url import any_to_uri
 from scrapy.utils.console import start_python_console
-from scrapy.conf import settings, Settings
+from scrapy.settings import Settings
 from scrapy.http import Request, Response, TextResponse
 
 class Shell(object):
@@ -26,12 +24,13 @@ class Shell(object):
     relevant_classes = (BaseSpider, Request, Response, BaseItem, \
         XPathSelector, Settings)
 
-    def __init__(self, crawler, update_vars=None, inthread=False):
+    def __init__(self, crawler, update_vars=None, inthread=False, code=None):
         self.crawler = crawler
         self.vars = {}
         self.update_vars = update_vars or (lambda x: None)
-        self.item_class = load_object(settings['DEFAULT_ITEM_CLASS'])
+        self.item_class = load_object(crawler.settings['DEFAULT_ITEM_CLASS'])
         self.inthread = inthread
+        self.code = code
 
     def start(self, *a, **kw):
         # disable accidental Ctrl-C key press from shutting down the engine
@@ -49,14 +48,20 @@ class Shell(object):
         elif response:
             request = response.request
             self.populate_vars(request.url, response, request, spider)
-        start_python_console(self.vars)
+        if self.code:
+            print eval(self.code, globals(), self.vars)
+        else:
+            start_python_console(self.vars)
 
     def _schedule(self, request, spider):
         if spider is None:
             spider = create_spider_for_request(self.crawler.spiders, request, \
                 BaseSpider('default'), log_multiple=True)
+        spider.set_crawler(self.crawler)
         self.crawler.engine.open_spider(spider)
-        return self.crawler.engine.schedule(request, spider)
+        d = self.crawler.engine.schedule(request, spider)
+        d.addCallback(lambda x: (x, spider))
+        return d
 
     def fetch(self, request_or_url, spider=None):
         if isinstance(request_or_url, Request):
@@ -66,17 +71,14 @@ class Shell(object):
             url = any_to_uri(request_or_url)
             request = Request(url, dont_filter=True)
         response = None
-        try:
-            response = threads.blockingCallFromThread(reactor, \
-                self._schedule, request, spider)
-        except:
-            log.err(Failure(), "Error fetching response", spider=spider)
+        response, spider = threads.blockingCallFromThread(reactor, \
+            self._schedule, request, spider)
         self.populate_vars(url, response, request, spider)
 
     def populate_vars(self, url=None, response=None, request=None, spider=None):
         item = self.item_class()
         self.vars['item'] = item
-        self.vars['settings'] = settings
+        self.vars['settings'] = self.crawler.settings
         if url:
             if isinstance(response, TextResponse):
                 self.vars['xxs'] = XmlXPathSelector(response)
@@ -89,7 +91,8 @@ class Shell(object):
         self.vars['view'] = open_in_browser
         self.vars['shelp'] = self.print_help
         self.update_vars(self.vars)
-        self.print_help()
+        if not self.code:
+            self.print_help()
 
     def print_help(self):
         self.p("Available Scrapy objects:")

View File

@@ -5,6 +5,7 @@ See documentation in docs/topics/spiders.rst
 """
 from scrapy import log
+from scrapy.settings import SpiderSettings
 from scrapy.http import Request
 from scrapy.utils.misc import arg_to_iter
 from scrapy.utils.trackref import object_ref
@@ -33,6 +34,21 @@ class BaseSpider(object_ref):
         """
         log.msg(message, spider=self, level=level)
 
+    def set_crawler(self, crawler):
+        assert not hasattr(self, '_crawler'), "Spider already bounded to %s" % crawler
+        self._crawler = crawler
+
+    @property
+    def crawler(self):
+        assert hasattr(self, '_crawler'), "Spider not bounded to any crawler"
+        return self._crawler
+
+    @property
+    def settings(self):
+        if not hasattr(self, '_settings'):
+            self._settings = SpiderSettings(self, self.crawler.settings)
+        return self._settings
+
     def start_requests(self):
         reqs = []
         for url in self.start_urls:

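With the crawler now binding itself to each spider via ``set_crawler()``, spider code can read its effective settings directly; a sketch with made-up names::

    from scrapy.spider import BaseSpider

    class SettingsAwareSpider(BaseSpider):
        name = 'settings_aware'
        USER_AGENT = 'mybot/0.1'    # per-spider setting, visible through self.settings

        def parse(self, response):
            # resolution: crawler overrides > spider attributes > project module > defaults
            self.log("running with UA %s" % self.settings['USER_AGENT'])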
View File

@@ -5,10 +5,6 @@
 #
 #     http://doc.scrapy.org/topics/settings.html
 #
-# Or you can copy and paste them from where they're defined in Scrapy:
-#
-#     scrapy/conf/default_settings.py
-#
 
 BOT_NAME = '$project_name'
 BOT_VERSION = '1.0'

View File

@@ -10,7 +10,6 @@ class CmdlineTest(unittest.TestCase):
     def setUp(self):
         self.env = os.environ.copy()
         self.env['PYTHONPATH'] = os.path.dirname(scrapy.__path__[0])
-        self.env.pop('SCRAPY_SETTINGS_DISABLED', None)
         self.env['SCRAPY_SETTINGS_MODULE'] = 'scrapy.tests.test_cmdline.settings'
 
     def _execute(self, *new_args, **kwargs):

View File

@@ -61,7 +61,6 @@ class CommandTest(ProjectTest):
         super(CommandTest, self).setUp()
         self.call('startproject', self.project_name)
         self.cwd = join(self.temp_path, self.project_name)
-        self.env.pop('SCRAPY_SETTINGS_DISABLED', None)
         self.env['SCRAPY_SETTINGS_MODULE'] = '%s.settings' % self.project_name

View File

@@ -1,51 +0,0 @@
import unittest
from scrapy.conf import Settings
class SettingsTest(unittest.TestCase):
def test_get(self):
settings = Settings({
'TEST_ENABLED1': '1',
'TEST_ENABLED2': True,
'TEST_ENABLED3': 1,
'TEST_DISABLED1': '0',
'TEST_DISABLED2': False,
'TEST_DISABLED3': 0,
'TEST_INT1': 123,
'TEST_INT2': '123',
'TEST_FLOAT1': 123.45,
'TEST_FLOAT2': '123.45',
'TEST_LIST1': ['one', 'two'],
'TEST_LIST2': 'one,two',
'TEST_STR': 'value',
})
assert settings.getbool('TEST_ENABLED1') is True
assert settings.getbool('TEST_ENABLED2') is True
assert settings.getbool('TEST_ENABLED3') is True
assert settings.getbool('TEST_ENABLEDx') is False
assert settings.getbool('TEST_ENABLEDx', True) is True
assert settings.getbool('TEST_DISABLED1') is False
assert settings.getbool('TEST_DISABLED2') is False
assert settings.getbool('TEST_DISABLED3') is False
self.assertEqual(settings.getint('TEST_INT1'), 123)
self.assertEqual(settings.getint('TEST_INT2'), 123)
self.assertEqual(settings.getint('TEST_INTx'), 0)
self.assertEqual(settings.getint('TEST_INTx', 45), 45)
self.assertEqual(settings.getfloat('TEST_FLOAT1'), 123.45)
self.assertEqual(settings.getfloat('TEST_FLOAT2'), 123.45)
self.assertEqual(settings.getfloat('TEST_FLOATx'), 0.0)
self.assertEqual(settings.getfloat('TEST_FLOATx', 55.0), 55.0)
self.assertEqual(settings.getlist('TEST_LIST1'), ['one', 'two'])
self.assertEqual(settings.getlist('TEST_LIST2'), ['one', 'two'])
self.assertEqual(settings.getlist('TEST_LISTx'), [])
self.assertEqual(settings.getlist('TEST_LISTx', ['default']), ['default'])
self.assertEqual(settings['TEST_STR'], 'value')
self.assertEqual(settings.get('TEST_STR'), 'value')
self.assertEqual(settings['TEST_STRx'], None)
self.assertEqual(settings.get('TEST_STRx'), None)
self.assertEqual(settings.get('TEST_STRx', 'default'), 'default')
if __name__ == "__main__":
unittest.main()

View File

@@ -4,39 +4,44 @@ from scrapy.conf import settings
 from scrapy.contrib.downloadermiddleware.defaultheaders import DefaultHeadersMiddleware
 from scrapy.http import Request
 from scrapy.spider import BaseSpider
+from scrapy.utils.test import get_crawler
 
 
 class TestDefaultHeadersMiddleware(TestCase):
 
-    def setUp(self):
-        self.spider = BaseSpider('foo')
-        self.mw = DefaultHeadersMiddleware()
-        self.default_request_headers = dict([(k, [v]) for k, v in \
-            settings.get('DEFAULT_REQUEST_HEADERS').iteritems()])
+    def get_defaults_spider_mw(self):
+        crawler = get_crawler()
+        spider = BaseSpider('foo')
+        spider.set_crawler(crawler)
+        defaults = dict([(k, [v]) for k, v in \
+            crawler.settings.get('DEFAULT_REQUEST_HEADERS').iteritems()])
+        return defaults, spider, DefaultHeadersMiddleware()
 
     def test_process_request(self):
+        defaults, spider, mw = self.get_defaults_spider_mw()
         req = Request('http://www.scrapytest.org')
-        self.mw.process_request(req, self.spider)
-        self.assertEquals(req.headers, self.default_request_headers)
+        mw.process_request(req, spider)
+        self.assertEquals(req.headers, defaults)
 
     def test_spider_default_request_headers(self):
+        defaults, spider, mw = self.get_defaults_spider_mw()
         spider_headers = {'Unexistant-Header': ['value']}
         # override one of the global default headers by spider
-        if self.default_request_headers:
-            k = set(self.default_request_headers).pop()
+        if defaults:
+            k = set(defaults).pop()
             spider_headers[k] = ['__newvalue__']
-        self.spider.default_request_headers = spider_headers
+        spider.DEFAULT_REQUEST_HEADERS = spider_headers
 
         req = Request('http://www.scrapytest.org')
-        self.mw.process_request(req, self.spider)
-        self.assertEquals(req.headers, dict(self.default_request_headers, **spider_headers))
+        mw.process_request(req, spider)
+        self.assertEquals(req.headers, dict(spider_headers))
 
     def test_update_headers(self):
+        defaults, spider, mw = self.get_defaults_spider_mw()
         headers = {'Accept-Language': ['es'], 'Test-Header': ['test']}
         req = Request('http://www.scrapytest.org', headers=headers)
         self.assertEquals(req.headers, headers)
 
-        self.mw.process_request(req, self.spider)
-        self.default_request_headers.update(headers)
-        self.assertEquals(req.headers, self.default_request_headers)
+        mw.process_request(req, spider)
+        defaults.update(headers)
+        self.assertEquals(req.headers, defaults)

View File

@@ -3,31 +3,32 @@ import unittest
 from scrapy.contrib.downloadermiddleware.downloadtimeout import DownloadTimeoutMiddleware
 from scrapy.spider import BaseSpider
 from scrapy.http import Request
+from scrapy.utils.test import get_crawler
 
 
 class DownloadTimeoutMiddlewareTest(unittest.TestCase):
 
-    def setUp(self):
-        self.mw = DownloadTimeoutMiddleware()
-        self.spider = BaseSpider('foo')
-        self.req = Request('http://scrapytest.org/')
-
-    def tearDown(self):
-        del self.mw
-        del self.spider
-        del self.req
-
-    def test_spider_has_no_download_timeout(self):
-        assert self.mw.process_request(self.req, self.spider) is None
-        assert 'download_timeout' not in self.req.meta
+    def get_request_spider_mw(self):
+        crawler = get_crawler()
+        spider = BaseSpider('foo')
+        spider.set_crawler(crawler)
+        request = Request('http://scrapytest.org/')
+        return request, spider, DownloadTimeoutMiddleware()
+
+    def test_default_download_timeout(self):
+        req, spider, mw = self.get_request_spider_mw()
+        assert mw.process_request(req, spider) is None
+        self.assertEquals(req.meta.get('download_timeout'), 180)
 
     def test_spider_has_download_timeout(self):
-        self.spider.download_timeout = 2
-        assert self.mw.process_request(self.req, self.spider) is None
-        self.assertEquals(self.req.meta.get('download_timeout'), 2)
+        req, spider, mw = self.get_request_spider_mw()
+        spider.DOWNLOAD_TIMEOUT = 2
+        assert mw.process_request(req, spider) is None
+        self.assertEquals(req.meta.get('download_timeout'), 2)
 
     def test_request_has_download_timeout(self):
-        self.spider.download_timeout = 2
-        self.req.meta['download_timeout'] = 1
-        assert self.mw.process_request(self.req, self.spider) is None
-        self.assertEquals(self.req.meta.get('download_timeout'), 1)
+        req, spider, mw = self.get_request_spider_mw()
+        spider.DOWNLOAD_TIMEOUT = 2
+        req.meta['download_timeout'] = 1
+        assert mw.process_request(req, spider) is None
+        self.assertEquals(req.meta.get('download_timeout'), 1)

View File

@@ -3,7 +3,7 @@ import unittest, tempfile, shutil, time
 from scrapy.http import Response, HtmlResponse, Request
 from scrapy.spider import BaseSpider
 from scrapy.contrib.downloadermiddleware.httpcache import FilesystemCacheStorage, HttpCacheMiddleware
-from scrapy.conf import Settings
+from scrapy.settings import Settings
 from scrapy.exceptions import IgnoreRequest

View File

@@ -3,47 +3,49 @@ from unittest import TestCase
 from scrapy.spider import BaseSpider
 from scrapy.http import Request
 from scrapy.contrib.downloadermiddleware.useragent import UserAgentMiddleware
+from scrapy.utils.test import get_crawler
 
 
 class UserAgentMiddlewareTest(TestCase):
 
-    def setUp(self):
-        self.spider = BaseSpider('foo')
-        self.mw = UserAgentMiddleware()
-
-    def tearDown(self):
-        del self.mw
+    def get_spider_and_mw(self, default_useragent):
+        crawler = get_crawler({'USER_AGENT': default_useragent})
+        spider = BaseSpider('foo')
+        spider.set_crawler(crawler)
+        return spider, UserAgentMiddleware()
 
     def test_default_agent(self):
-        self.mw.default_useragent = 'default_useragent'
+        spider, mw = self.get_spider_and_mw('default_useragent')
        req = Request('http://scrapytest.org/')
-        assert self.mw.process_request(req, self.spider) is None
+        assert mw.process_request(req, spider) is None
         self.assertEquals(req.headers['User-Agent'], 'default_useragent')
 
-        # None or not present user_agent attribute is the same
-        self.spider.user_agent = None
+    def test_remove_agent(self):
+        # setting USER_AGENT to None should remove the user agent
+        spider, mw = self.get_spider_and_mw('default_useragent')
+        spider.USER_AGENT = None
         req = Request('http://scrapytest.org/')
-        assert self.mw.process_request(req, self.spider) is None
-        self.assertEquals(req.headers['User-Agent'], 'default_useragent')
+        assert mw.process_request(req, spider) is None
+        assert req.headers.get('User-Agent') is None
 
     def test_spider_agent(self):
-        self.mw.default_useragent = 'default_useragent'
-        self.spider.user_agent = 'spider_useragent'
+        spider, mw = self.get_spider_and_mw('default_useragent')
+        spider.USER_AGENT = 'spider_useragent'
         req = Request('http://scrapytest.org/')
-        assert self.mw.process_request(req, self.spider) is None
+        assert mw.process_request(req, spider) is None
         self.assertEquals(req.headers['User-Agent'], 'spider_useragent')
 
     def test_header_agent(self):
-        self.mw.default_useragent = 'default_useragent'
-        self.spider.user_agent = 'spider_useragent'
+        spider, mw = self.get_spider_and_mw('default_useragent')
+        spider.USER_AGENT = 'spider_useragent'
         req = Request('http://scrapytest.org/', headers={'User-Agent': 'header_useragent'})
-        assert self.mw.process_request(req, self.spider) is None
+        assert mw.process_request(req, spider) is None
         self.assertEquals(req.headers['User-Agent'], 'header_useragent')
 
     def test_no_agent(self):
-        self.mw.default_useragent = None
-        self.spider.user_agent = None
+        spider, mw = self.get_spider_and_mw(None)
+        spider.USER_AGENT = None
         req = Request('http://scrapytest.org/')
-        assert self.mw.process_request(req, self.spider) is None
+        assert mw.process_request(req, spider) is None
         assert 'User-Agent' not in req.headers

View File

@@ -17,8 +17,7 @@ from twisted.web import server, static, util
 from twisted.trial import unittest
 
 from scrapy import signals
-from scrapy.conf import Settings
-from scrapy.crawler import Crawler
+from scrapy.utils.test import get_crawler
 from scrapy.xlib.pydispatch import dispatcher
 from scrapy.tests import tests_datadir
 from scrapy.spider import BaseSpider
@@ -95,8 +94,7 @@ class CrawlerRun(object):
         dispatcher.connect(self.request_received, signals.request_received)
         dispatcher.connect(self.response_downloaded, signals.response_downloaded)
 
-        settings = Settings()
-        self.crawler = Crawler(settings)
+        self.crawler = get_crawler()
         self.crawler.install()
         self.crawler.configure()
         self.crawler.queue.append_spider(self.spider)

View File

@@ -1,6 +1,6 @@
 from twisted.trial import unittest
 
-from scrapy.conf import Settings
+from scrapy.settings import Settings
 from scrapy.exceptions import NotConfigured
 from scrapy.middleware import MiddlewareManager

View File

@@ -2,7 +2,7 @@ from twisted.trial import unittest
 from twisted.python import failure
 from twisted.internet import defer, reactor
 
-from scrapy.conf import Settings
+from scrapy.settings import Settings
 from scrapy.crawler import Crawler
 from scrapy.http import Request, Response
 from scrapy.spider import BaseSpider

View File

@@ -0,0 +1,103 @@
import unittest
from scrapy.settings import Settings, SpiderSettings
from scrapy.utils.test import get_crawler
from scrapy.spider import BaseSpider
class SettingsTest(unittest.TestCase):
def test_get(self):
settings = Settings({
'TEST_ENABLED1': '1',
'TEST_ENABLED2': True,
'TEST_ENABLED3': 1,
'TEST_DISABLED1': '0',
'TEST_DISABLED2': False,
'TEST_DISABLED3': 0,
'TEST_INT1': 123,
'TEST_INT2': '123',
'TEST_FLOAT1': 123.45,
'TEST_FLOAT2': '123.45',
'TEST_LIST1': ['one', 'two'],
'TEST_LIST2': 'one,two',
'TEST_STR': 'value',
})
assert settings.getbool('TEST_ENABLED1') is True
assert settings.getbool('TEST_ENABLED2') is True
assert settings.getbool('TEST_ENABLED3') is True
assert settings.getbool('TEST_ENABLEDx') is False
assert settings.getbool('TEST_ENABLEDx', True) is True
assert settings.getbool('TEST_DISABLED1') is False
assert settings.getbool('TEST_DISABLED2') is False
assert settings.getbool('TEST_DISABLED3') is False
self.assertEqual(settings.getint('TEST_INT1'), 123)
self.assertEqual(settings.getint('TEST_INT2'), 123)
self.assertEqual(settings.getint('TEST_INTx'), 0)
self.assertEqual(settings.getint('TEST_INTx', 45), 45)
self.assertEqual(settings.getfloat('TEST_FLOAT1'), 123.45)
self.assertEqual(settings.getfloat('TEST_FLOAT2'), 123.45)
self.assertEqual(settings.getfloat('TEST_FLOATx'), 0.0)
self.assertEqual(settings.getfloat('TEST_FLOATx', 55.0), 55.0)
self.assertEqual(settings.getlist('TEST_LIST1'), ['one', 'two'])
self.assertEqual(settings.getlist('TEST_LIST2'), ['one', 'two'])
self.assertEqual(settings.getlist('TEST_LISTx'), [])
self.assertEqual(settings.getlist('TEST_LISTx', ['default']), ['default'])
self.assertEqual(settings['TEST_STR'], 'value')
self.assertEqual(settings.get('TEST_STR'), 'value')
self.assertEqual(settings['TEST_STRx'], None)
self.assertEqual(settings.get('TEST_STRx'), None)
self.assertEqual(settings.get('TEST_STRx', 'default'), 'default')
class CrawlerSettingsTest(unittest.TestCase):
def test_global_defaults(self):
crawler = get_crawler()
self.assertEqual(crawler.settings.getint('DOWNLOAD_TIMEOUT'), 180)
def test_defaults(self):
crawler = get_crawler()
crawler.settings.defaults['DOWNLOAD_TIMEOUT'] = '99'
self.assertEqual(crawler.settings.getint('DOWNLOAD_TIMEOUT'), 99)
def test_settings_module(self):
crawler = get_crawler({'DOWNLOAD_TIMEOUT': '3'})
self.assertEqual(crawler.settings.getint('DOWNLOAD_TIMEOUT'), 3)
def test_overrides(self):
crawler = get_crawler({'DOWNLOAD_TIMEOUT': '3'})
crawler.settings.overrides['DOWNLOAD_TIMEOUT'] = '15'
self.assertEqual(crawler.settings.getint('DOWNLOAD_TIMEOUT'), 15)
class SpiderSettingsTest(unittest.TestCase):
def test_global_defaults(self):
crawler = get_crawler()
settings = SpiderSettings(BaseSpider('name'), crawler.settings)
self.assertEqual(settings.getint('DOWNLOAD_TIMEOUT'), 180)
def test_defaults(self):
crawler = get_crawler()
crawler.settings.defaults['DOWNLOAD_TIMEOUT'] = '99'
settings = SpiderSettings(BaseSpider('name'), crawler.settings)
self.assertEqual(settings.getint('DOWNLOAD_TIMEOUT'), 99)
def test_crawler_defaults(self):
crawler = get_crawler({'DOWNLOAD_TIMEOUT': '3'})
settings = SpiderSettings(BaseSpider('name'), crawler.settings)
self.assertEqual(settings.getint('DOWNLOAD_TIMEOUT'), 3)
def test_spider_overrides_crawler(self):
crawler = get_crawler({'DOWNLOAD_TIMEOUT': '3'})
crawler.settings.defaults['DOWNLOAD_TIMEOUT'] = '99'
settings = SpiderSettings(BaseSpider('name', DOWNLOAD_TIMEOUT='12'), crawler.settings)
self.assertEqual(settings.getint('DOWNLOAD_TIMEOUT'), 12)
def test_overrides_most_precedence(self):
crawler = get_crawler({'DOWNLOAD_TIMEOUT': '3'})
crawler.settings.overrides['DOWNLOAD_TIMEOUT'] = '15'
settings = SpiderSettings(BaseSpider('name', DOWNLOAD_TIMEOUT='12'), crawler.settings)
self.assertEqual(settings.getint('DOWNLOAD_TIMEOUT'), 15)
if __name__ == "__main__":
unittest.main()

View File

@@ -0,0 +1,9 @@
"""Some helpers for deprecation messages"""
import warnings
def attribute(obj, oldattr, newattr, version='0.12'):
cname = obj.__class__.__name__
warnings.warn("%s.%s attribute is deprecated and will be no longer supported "
"in Scrapy %s, use %s.%s attribute instead" % \
(cname, oldattr, version, cname, newattr), DeprecationWarning, stacklevel=3)

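A usage sketch for the helper above; the class below is invented for illustration::

    from scrapy.utils import deprecate

    class LegacySpider(object):
        download_timeout = 30       # old-style attribute

    spider = LegacySpider()
    if hasattr(spider, 'download_timeout'):
        # emits a DeprecationWarning pointing users at the DOWNLOAD_TIMEOUT setting
        deprecate.attribute(spider, 'download_timeout', 'DOWNLOAD_TIMEOUT')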
View File

@@ -7,6 +7,9 @@ import os
 import libxml2
 
 from twisted.trial.unittest import SkipTest
+
+from scrapy.crawler import Crawler
+from scrapy.settings import CrawlerSettings
 
 def libxml2debug(testfunction):
     """Decorator for debugging libxml2 memory leaks inside a function.
@@ -39,3 +42,17 @@ def assert_aws_environ():
     if 'AWS_ACCESS_KEY_ID' not in os.environ:
         raise SkipTest("AWS keys not found")
+
+def get_crawler(settings_dict=None):
+    """Return an unconfigured Crawler object. If settings_dict is given, it
+    will be used as the settings present in the settings module of the
+    CrawlerSettings.
+    """
+    class SettingsModuleMock(object):
+        pass
+    settings_module = SettingsModuleMock()
+    if settings_dict:
+        for k, v in settings_dict.items():
+            setattr(settings_module, k, v)
+    settings = CrawlerSettings(settings_module)
+    return Crawler(settings)

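A brief usage sketch of the helper, matching how the new settings tests use it::

    from scrapy.utils.test import get_crawler

    crawler = get_crawler({'DOWNLOAD_TIMEOUT': '60'})   # dict becomes the mocked settings module
    assert crawler.settings.getint('DOWNLOAD_TIMEOUT') == 60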
View File

@@ -18,7 +18,6 @@ def get_spider_list_from_eggfile(eggfile, project):
     env = os.environ.copy()
     env['SCRAPY_PROJECT'] = project
     env['SCRAPY_EGGFILE'] = f.name
-    env.pop('SCRAPY_SETTINGS_DISABLED', None)
     proc = Popen(pargs, stdout=PIPE, cwd=tmpdir, env=env)
     out = proc.communicate()[0]
     return out.splitlines()