Mirror of https://github.com/scrapy/scrapy.git (synced 2025-02-28 13:24:31 +00:00)

Commit 5267772d36: Automated merge with http://hg.scrapy.org/scrapy-0.10
@@ -12,9 +12,6 @@ else
     exit 1
 fi
 
-# disable custom settings for running tests in a neutral environment
-export SCRAPY_SETTINGS_DISABLED=1
-
 # use vsftpd (if available) for testing ftp feed storage
 if type vsftpd >/dev/null 2>&1; then
     vsftpd_conf=$(mktemp /tmp/vsftpd-XXXX)
debian/changelog (2 changes)
@@ -1,4 +1,4 @@
-scrapy (0.10) unstable; urgency=low
+scrapy (0.11) unstable; urgency=low
 
   * Initial release.
@@ -171,7 +171,7 @@ higher) in your spider::
 
     name = 'myspider'
 
-    download_delay = 2
+    DOWNLOAD_DELAY = 2
 
     # [ ... rest of the spider code ... ]
@@ -73,10 +73,10 @@ information on which commands must be run from inside projects, and which not.
 
 Also keep in mind that some commands may have slightly different behaviours
 when running them from inside projects. For example, the fetch command will use
-spider-overridden behaviours (such as custom ``user_agent`` attribute) if the
-url being fetched is associated with some specific spider. This is intentional,
-as the ``fetch`` command is meant to be used to check how spiders are
-downloading pages.
+spider-overridden behaviours (such as the custom :setting:`USER_AGENT` per-spider
+setting) if the url being fetched is associated with some specific spider. This
+is intentional, as the ``fetch`` command is meant to be used to check how
+spiders are downloading pages.
 
 .. _topics-commands-ref:
@@ -243,7 +243,7 @@ Downloads the given URL using the Scrapy downloader and writes the contents to
 standard output.
 
 The interesting thing about this command is that it fetches the page how
-the spider would download it. For example, if the spider has a ``user_agent``
+the spider would download it. For example, if the spider has a ``USER_AGENT``
 attribute which overrides the User Agent, it will use that one.
 
 So this command can be used to "see" how your spider would fetch a certain page.
@@ -177,9 +177,7 @@ DefaultHeadersMiddleware
 
 .. class:: DefaultHeadersMiddleware
 
     This middleware sets all default requests headers specified in the
-    :setting:`DEFAULT_REQUEST_HEADERS` setting plus those found in spider
-    ``default_request_headers`` attribute. Spider headers has precedence over
-    global headers.
+    :setting:`DEFAULT_REQUEST_HEADERS` setting.
 
 DownloadTimeoutMiddleware
 -------------------------
@@ -189,10 +187,8 @@ DownloadTimeoutMiddleware
 
 .. class:: DownloadTimeoutMiddleware
 
-    This middleware sets download timeout for requests based on
-    `download_timeout` spider attribute. It doesn't override timeout if
-    `download_timeout` is already set in request meta. Otherwise,
-    :setting:`DOWNLOAD_TIMEOUT` setting is used as default download timeout.
+    This middleware sets the download timeout for requests specified in the
+    :setting:`DOWNLOAD_TIMEOUT` setting.
 
 HttpAuthMiddleware
 ------------------
|
@ -302,6 +302,20 @@ that amount if items and those items are passed by the item pipeline, the
|
||||
spider will be closed with the reason ``closespider_itempassed``. If zero (or
|
||||
non set), spiders won't be closed by number of passed items.
|
||||
|
||||
.. setting:: CLOSESPIDER_PAGECOUNT
|
||||
|
||||
CLOSESPIDER_PAGECOUNT
|
||||
""""""""""""""""""""""
|
||||
|
||||
Default: ``0``
|
||||
|
||||
An integer which specifies the maximum number of responses to crawl. If the spider
|
||||
crawls more than that, the spider will be closed with the reason
|
||||
``closespider_pagecount``. If zero (or non set), spiders won't be closed by
|
||||
number of crawled responses.
|
||||
|
||||
.. versionadded:: 0.11
|
||||
|
||||
StatsMailer extension
|
||||
~~~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
|
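
As an illustration of the new setting documented above, a minimal sketch (the value is hypothetical) of enabling the page-count limit from a project settings module:

    # settings.py (hypothetical project settings module)
    # Close each spider once it has received 100 responses; the close
    # reason reported will be 'closespider_pagecount'.
    CLOSESPIDER_PAGECOUNT = 100
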
@@ -39,10 +39,9 @@ different precedence. Here is the list of them in decreasing order of
 precedence:
 
 1. Global overrides (most precedence)
-2. Environment variables
-3. scrapy_settings
-4. Default settings per-command
-5. Default global settings (less precedence)
+2. Project settings module
+3. Default settings per-command
+4. Default global settings (less precedence)
 
 These mechanisms are described in more detail below.
@@ -65,27 +64,14 @@ Example::
 
     scrapy crawl domain.com --set LOG_FILE=scrapy.log
 
-2. Environment variables
-------------------------
-
-You can populate settings using environment variables prefixed with
-``SCRAPY_``. For example, to change the log file location on Unix systems::
-
-    $ export SCRAPY_LOG_FILE=scrapy.log
-    $ scrapy crawl example.com
-
-In Windows systems, you can change the environment variables from the Control
-Panel following `these guidelines`_.
-
-.. _these guidelines: http://www.microsoft.com/resources/documentation/windows/xp/all/proddocs/en-us/sysdm_advancd_environmnt_addchange_variable.mspx
-
-3. scrapy_settings
-------------------
-
-scrapy_settings is the standard configuration file for your Scrapy project.
-It's where most of your custom settings will be populated.
-
-4. Default settings per-command
+2. Project settings module
+--------------------------
+
+The project settings module is the standard configuration file for your Scrapy
+project. It's where most of your custom settings will be populated. For
+example: ``myproject.settings``.
+
+3. Default settings per-command
 -------------------------------
 
 Each :doc:`Scrapy tool </topics/commands>` command can have its own default
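
A minimal sketch of such a project settings module (all names and values here are illustrative; any documented setting may appear in it):

    # myproject/settings.py -- loaded via SCRAPY_SETTINGS_MODULE=myproject.settings
    BOT_NAME = 'myproject'
    BOT_VERSION = '1.0'
    LOG_FILE = 'scrapy.log'
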
@@ -93,11 +79,11 @@ settings, which override the global default settings. Those custom command
 settings are specified in the ``default_settings`` attribute of the command
 class.
 
-5. Default global settings
+4. Default global settings
 --------------------------
 
-The global defaults are located in scrapy.conf.default_settings and documented
-in the :ref:`topics-settings-ref` section.
+The global defaults are located in the ``scrapy.settings.default_settings``
+module and documented in the :ref:`topics-settings-ref` section.
 
 How to access settings
 ======================
@@ -412,9 +398,7 @@ setting (which is enabled by default). By default, Scrapy doesn't wait a fixed
 amount of time between requests, but uses a random interval between 0.5 and 1.5
 * :setting:`DOWNLOAD_DELAY`.
 
-Another way to change the download delay (per spider, instead of globally) is
-by using the ``download_delay`` spider attribute, which takes more precedence
-than this setting.
+You can also change this setting per spider.
 
 .. setting:: DOWNLOAD_HANDLERS
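
Per the change above, the per-spider override now uses the setting name itself, as in the spider snippet earlier in this diff; a minimal sketch:

    from scrapy.spider import BaseSpider

    class MySpider(BaseSpider):
        name = 'myspider'
        DOWNLOAD_DELAY = 2   # per-spider override of the global setting
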
@@ -784,18 +768,6 @@ Default: ``+2``
 
 Adjust redirect request priority relative to original request.
 A negative priority adjust means more priority.
 
-.. setting:: REQUESTS_QUEUE_SIZE
-
-REQUESTS_QUEUE_SIZE
--------------------
-
-Default: ``0``
-
-Scope: ``scrapy.contrib.spidermiddleware.limit``
-
-If non zero, it will be used as an upper limit for the amount of requests that
-can be scheduled per domain.
-
 .. setting:: ROBOTSTXT_OBEY
 
 ROBOTSTXT_OBEY
@@ -882,7 +854,6 @@ Default::
 
     {
         'scrapy.contrib.spidermiddleware.httperror.HttpErrorMiddleware': 50,
         'scrapy.contrib.itemsampler.ItemSamplerMiddleware': 100,
-        'scrapy.contrib.spidermiddleware.requestlimit.RequestLimitMiddleware': 200,
         'scrapy.contrib.spidermiddleware.offsite.OffsiteMiddleware': 500,
         'scrapy.contrib.spidermiddleware.referer.RefererMiddleware': 700,
         'scrapy.contrib.spidermiddleware.urllength.UrlLengthMiddleware': 800,
@@ -245,25 +245,6 @@ RefererMiddleware
 
 Populates Request referer field, based on the Response which originated it.
 
-RequestLimitMiddleware
-----------------------
-
-.. module:: scrapy.contrib.spidermiddleware.requestlimit
-   :synopsis: Request limit Spider Middleware
-
-.. class:: RequestLimitMiddleware
-
-   Limits the maximum number of requests in the scheduler for each spider. When
-   a spider tries to schedule more than the allowed amount of requests, the new
-   requests (returned by the spider) will be dropped.
-
-   The :class:`RequestLimitMiddleware` can be configured through the following
-   settings (see the settings documentation for more info):
-
-       * :setting:`REQUESTS_QUEUE_SIZE` - If non zero, it will be used as an
-         upper limit for the amount of requests that can be scheduled per
-         domain. Can be set per spider using ``requests_queue_size`` attribute.
-
 UrlLengthMiddleware
 -------------------
@@ -4,11 +4,6 @@
 # default. All the other settings are documented here:
 #
 #     http://doc.scrapy.org/topics/settings.html
-#
-# Or you can copy and paste them from where they're defined in Scrapy:
-#
-#     scrapy/conf/default_settings.py
-#
 
 BOT_NAME = 'googledir'
 BOT_VERSION = '1.0'
@@ -4,11 +4,6 @@
 # default. All the other settings are documented here:
 #
 #     http://doc.scrapy.org/topics/settings.html
-#
-# Or you can copy and paste them from where they're defined in Scrapy:
-#
-#     scrapy/conf/default_settings.py
-#
 
 BOT_NAME = 'imdb'
 BOT_VERSION = '1.0'
@@ -2,8 +2,8 @@
 Scrapy - a screen scraping framework written in Python
 """
 
-version_info = (0, 10, 4, 'dev')
-__version__ = "0.10.4"
+version_info = (0, 11, 0, 'dev')
+__version__ = "0.11"
 
 import sys, os, warnings
@@ -89,7 +89,7 @@ def _check_deprecated_scrapy_ctl(argv, inproject):
     with open(cfg_path, 'w') as f:
         f.write("# generated automatically - feel free to edit" + os.linesep)
         f.write("[settings]" + os.linesep)
-        f.write("default = %s" % settings.settings_module_path + os.linesep)
+        f.write("default = %s" % settings.settings_module.__name__ + os.linesep)
 
 def _run_print_help(parser, func, *a, **kw):
     try:
@@ -128,6 +128,7 @@ def execute(argv=None):
     opts, args = parser.parse_args(args=argv[1:])
     _run_print_help(parser, cmd.process_options, args, opts)
     _run_print_help(parser, _run_command, cmd, args, opts)
+    sys.exit(cmd.exitcode)
 
 def _run_command(cmd, args, opts):
     if opts.profile or opts.lsprof:
@@ -21,6 +21,8 @@ class ScrapyCommand(object):
     # default settings to be used for this command instead of global defaults
     default_settings = {}
 
+    exitcode = 0
+
     def set_crawler(self, crawler):
         self._crawler = crawler
@@ -6,6 +6,7 @@ See documentation in docs/topics/shell.rst
 
 from scrapy.command import ScrapyCommand
 from scrapy.shell import Shell
+from scrapy import log
 
 class Command(ScrapyCommand):
@@ -21,6 +22,11 @@ class Command(ScrapyCommand):
     def long_desc(self):
         return "Interactive console for scraping the given url"
 
+    def add_options(self, parser):
+        ScrapyCommand.add_options(self, parser)
+        parser.add_option("-c", dest="code",
+            help="evaluate the code in the shell, print the result and exit")
+
     def update_vars(self, vars):
         """You can use this function to update the Scrapy objects that will be
         available in the shell
@@ -29,6 +35,12 @@ class Command(ScrapyCommand):
 
     def run(self, args, opts):
         url = args[0] if args else None
-        shell = Shell(self.crawler, update_vars=self.update_vars, inthread=True)
-        shell.start(url=url).addBoth(lambda _: self.crawler.stop())
+        shell = Shell(self.crawler, update_vars=self.update_vars, inthread=True, \
+            code=opts.code)
+        def err(f):
+            log.err(f, "Shell error")
+            self.exitcode = 1
+        d = shell.start(url=url)
+        d.addErrback(err)
+        d.addBoth(lambda _: self.crawler.stop())
         self.crawler.start()
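
A usage sketch of the new ``-c`` option added above (URL illustrative): the expression is evaluated against the shell variables, its result is printed, and the process exits, with a nonzero exit code if the shell errors out:

    $ scrapy shell http://example.com -c "response.status"
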
scrapy/conf.py (new file, 37 lines)
@@ -0,0 +1,37 @@
+"""
+Scrapy settings manager
+
+See documentation in docs/topics/settings.rst
+"""
+
+import os
+import cPickle as pickle
+
+from scrapy.settings import CrawlerSettings
+from scrapy.utils.conf import init_env
+
+ENVVAR = 'SCRAPY_SETTINGS_MODULE'
+
+def get_project_settings():
+    if ENVVAR not in os.environ:
+        project = os.environ.get('SCRAPY_PROJECT', 'default')
+        init_env(project)
+    settings_module_path = os.environ.get(ENVVAR, 'scrapy_settings')
+    try:
+        settings_module = __import__(settings_module_path, {}, {}, [''])
+    except ImportError:
+        settings_module = None
+    settings = CrawlerSettings(settings_module)
+
+    # XXX: remove this hack
+    pickled_settings = os.environ.get("SCRAPY_PICKLED_SETTINGS_TO_OVERRIDE")
+    settings.overrides = pickle.loads(pickled_settings) if pickled_settings else {}
+
+    # XXX: deprecate and remove this functionality
+    for k, v in os.environ.items():
+        if k.startswith('SCRAPY_'):
+            settings.overrides[k[7:]] = v
+
+    return settings
+
+settings = get_project_settings()
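
Given the ``SCRAPY_``-prefix loop above (which strips the prefix into ``settings.overrides``), environment variables still override settings in this version; for example (illustrative values, mirroring the example removed from the docs):

    $ export SCRAPY_LOG_FILE=scrapy.log
    $ scrapy crawl example.com     # settings['LOG_FILE'] == 'scrapy.log'
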
@@ -1,94 +0,0 @@
-"""
-Scrapy settings manager
-
-See documentation in docs/topics/settings.rst
-"""
-
-import os
-import cPickle as pickle
-
-from scrapy.conf import default_settings
-from scrapy.utils.conf import init_env
-
-import_ = lambda x: __import__(x, {}, {}, [''])
-
-
-class Settings(object):
-
-    def __init__(self, values=None):
-        self.values = values.copy() if values else {}
-        self.global_defaults = default_settings
-
-    def __getitem__(self, opt_name):
-        if opt_name in self.values:
-            return self.values[opt_name]
-        return getattr(self.global_defaults, opt_name, None)
-
-    def get(self, name, default=None):
-        return self[name] if self[name] is not None else default
-
-    def getbool(self, name, default=False):
-        """
-        True is: 1, '1', True
-        False is: 0, '0', False, None
-        """
-        return bool(int(self.get(name, default)))
-
-    def getint(self, name, default=0):
-        return int(self.get(name, default))
-
-    def getfloat(self, name, default=0.0):
-        return float(self.get(name, default))
-
-    def getlist(self, name, default=None):
-        value = self.get(name)
-        if value is None:
-            return default or []
-        elif hasattr(value, '__iter__'):
-            return value
-        else:
-            return str(value).split(',')
-
-
-class EnvironmentSettings(Settings):
-
-    ENVVAR = 'SCRAPY_SETTINGS_MODULE'
-
-    def __init__(self):
-        super(EnvironmentSettings, self).__init__()
-        self.defaults = {}
-        self.disabled = os.environ.get('SCRAPY_SETTINGS_DISABLED', False)
-        if self.ENVVAR not in os.environ:
-            project = os.environ.get('SCRAPY_PROJECT', 'default')
-            init_env(project)
-        settings_module_path = os.environ.get(self.ENVVAR, 'scrapy_settings')
-        self.set_settings_module(settings_module_path)
-
-        # XXX: find a better solution for this hack
-        pickled_settings = os.environ.get("SCRAPY_PICKLED_SETTINGS_TO_OVERRIDE")
-        self.overrides = pickle.loads(pickled_settings) if pickled_settings else {}
-
-    def set_settings_module(self, settings_module_path):
-        self.settings_module_path = settings_module_path
-        try:
-            self.settings_module = import_(settings_module_path)
-        except ImportError:
-            self.settings_module = None
-
-    def __getitem__(self, opt_name):
-        if not self.disabled:
-            if opt_name in self.overrides:
-                return self.overrides[opt_name]
-            if 'SCRAPY_' + opt_name in os.environ:
-                return os.environ['SCRAPY_' + opt_name]
-            if hasattr(self.settings_module, opt_name):
-                return getattr(self.settings_module, opt_name)
-            if opt_name in self.defaults:
-                return self.defaults[opt_name]
-        return super(EnvironmentSettings, self).__getitem__(opt_name)
-
-    def __str__(self):
-        return "<Settings %r>" % self.settings_module_path
-
-
-settings = EnvironmentSettings()
@@ -7,9 +7,10 @@ See documentation in docs/topics/extensions.rst
 from collections import defaultdict
 
 from twisted.internet import reactor
+from twisted.python import log as txlog
 from scrapy.xlib.pydispatch import dispatcher
 
-from scrapy import signals
+from scrapy import signals, log
 from scrapy.project import crawler
 from scrapy.conf import settings
@@ -18,21 +19,42 @@ class CloseSpider(object):
     def __init__(self):
         self.timeout = settings.getint('CLOSESPIDER_TIMEOUT')
         self.itempassed = settings.getint('CLOSESPIDER_ITEMPASSED')
+        self.pagecount = settings.getint('CLOSESPIDER_PAGECOUNT')
+        self.errorcount = settings.getint('CLOSESPIDER_ERRORCOUNT')
 
+        self.errorcounts = defaultdict(int)
+        self.pagecounts = defaultdict(int)
         self.counts = defaultdict(int)
         self.tasks = {}
 
+        if self.errorcount:
+            txlog.addObserver(self.catch_log)
+        if self.pagecount:
+            dispatcher.connect(self.page_count, signal=signals.response_received)
         if self.timeout:
             dispatcher.connect(self.spider_opened, signal=signals.spider_opened)
         if self.itempassed:
             dispatcher.connect(self.item_passed, signal=signals.item_passed)
         dispatcher.connect(self.spider_closed, signal=signals.spider_closed)
 
+    def catch_log(self, event):
+        if event.get('logLevel') == log.ERROR:
+            spider = event.get('spider')
+            if spider:
+                self.errorcounts[spider] += 1
+                if self.errorcounts[spider] == self.errorcount:
+                    crawler.engine.close_spider(spider, 'closespider_errorcount')
+
+    def page_count(self, response, request, spider):
+        self.pagecounts[spider] += 1
+        if self.pagecounts[spider] == self.pagecount:
+            crawler.engine.close_spider(spider, 'closespider_pagecount')
+
     def spider_opened(self, spider):
         self.tasks[spider] = reactor.callLater(self.timeout, \
             crawler.engine.close_spider, spider=spider, \
             reason='closespider_timeout')
 
     def item_passed(self, item, spider):
         self.counts[spider] += 1
         if self.counts[spider] == self.itempassed:
@@ -40,6 +62,8 @@ class CloseSpider(object):
 
     def spider_closed(self, spider):
         self.counts.pop(spider, None)
+        self.pagecounts.pop(spider, None)
+        self.errorcounts.pop(spider, None)
         tsk = self.tasks.pop(spider, None)
         if tsk and tsk.active():
             tsk.cancel()
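
The error counter above keys off Twisted log events; a sketch (assuming the scrapy.log.msg signature of this era) of the kind of call that feeds catch_log():

    from scrapy import log

    # log.msg with level=log.ERROR and a spider argument produces a log event
    # carrying 'logLevel' and 'spider' keys, which catch_log() above matches
    # to increment the per-spider error counter.
    log.msg("something failed", level=log.ERROR, spider=spider)
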
@@ -10,18 +10,10 @@ from scrapy.utils.python import WeakKeyCache
 class DefaultHeadersMiddleware(object):
 
     def __init__(self, settings=conf.settings):
-        self.global_default_headers = settings.get('DEFAULT_REQUEST_HEADERS')
         self._headers = WeakKeyCache(self._default_headers)
 
     def _default_headers(self, spider):
-        headers = dict(self.global_default_headers)
-        spider_headers = getattr(spider, 'default_request_headers', None) or {}
-        for k, v in spider_headers.iteritems():
-            if v:
-                headers[k] = v
-            else:
-                headers.pop(k, None)
-        return headers.items()
+        return spider.settings.get('DEFAULT_REQUEST_HEADERS').items()
 
     def process_request(self, request, spider):
         for k, v in self._headers[spider]:
@@ -4,6 +4,7 @@ Download timeout middleware
 See documentation in docs/topics/downloader-middleware.rst
 """
 from scrapy.utils.python import WeakKeyCache
+from scrapy.utils import deprecate
 
 
 class DownloadTimeoutMiddleware(object):
@@ -12,7 +13,10 @@ class DownloadTimeoutMiddleware(object):
         self._cache = WeakKeyCache(self._download_timeout)
 
     def _download_timeout(self, spider):
-        return getattr(spider, "download_timeout", None)
+        if hasattr(spider, 'download_timeout'):
+            deprecate.attribute(spider, 'download_timeout', 'DOWNLOAD_TIMEOUT')
+            return spider.download_timeout
+        return spider.settings.getint('DOWNLOAD_TIMEOUT')
 
     def process_request(self, request, spider):
         timeout = self._cache[spider]
@@ -11,9 +11,6 @@ once the spider has finished crawling all regular (non failed) pages. Once
 there is no more failed pages to retry this middleware sends a signal
 (retry_complete), so other extensions could connect to that signal.
 
-Default values are located in scrapy.conf.default_settings, like any other
-setting
-
 About HTTP errors to consider:
 
 - You may want to remove 400 from RETRY_HTTP_CODES, if you stick to the HTTP
@@ -54,8 +54,7 @@ class RobotsTxtMiddleware(object):
 
     def spider_opened(self, spider):
         self._spider_netlocs[spider] = set()
-        self._useragents[spider] = getattr(spider, 'user_agent', None) \
-            or settings['USER_AGENT']
+        self._useragents[spider] = spider.settings['USER_AGENT']
 
     def spider_closed(self, spider):
         for netloc in self._spider_netlocs[spider]:
@@ -1,18 +1,20 @@
 """Set User-Agent header per spider or use a default value from settings"""
 
-from scrapy.conf import settings
 from scrapy.utils.python import WeakKeyCache
+from scrapy.utils import deprecate
 
 
 class UserAgentMiddleware(object):
     """This middleware allows spiders to override the user_agent"""
 
-    def __init__(self, settings=settings):
+    def __init__(self):
         self.cache = WeakKeyCache(self._user_agent)
-        self.default_useragent = settings.get('USER_AGENT')
 
     def _user_agent(self, spider):
-        return getattr(spider, 'user_agent', None) or self.default_useragent
+        if hasattr(spider, 'user_agent'):
+            deprecate.attribute(spider, 'user_agent', 'USER_AGENT')
+            return spider.user_agent
+        return spider.settings['USER_AGENT']
 
     def process_request(self, request, spider):
         ua = self.cache[spider]
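
The deprecation path above keeps old spiders working while pointing at the new per-spider setting; a migration sketch (class names illustrative):

    from scrapy.spider import BaseSpider

    # Deprecated: still honored, but deprecate.attribute() warns about it
    class OldSpider(BaseSpider):
        name = 'old'
        user_agent = 'MyBot/1.0'

    # Preferred in 0.11: the per-spider *setting* name, which is picked up
    # via spider.settings['USER_AGENT']
    class NewSpider(BaseSpider):
        name = 'new'
        USER_AGENT = 'MyBot/1.0'
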
@@ -83,93 +83,18 @@ class HtmlTag(HtmlDataFragment):
 _ATTR = "((?:[^=/>\s]|/(?!>))+)(?:\s*=(?:\s*\"(.*?)\"|\s*'(.*?)'|([^>\s]+))?)?"
 _TAG = "<(\/?)(\w+(?::\w+)?)((?:\s+" + _ATTR + ")+\s*|\s*)(\/?)>"
 _DOCTYPE = r"<!DOCTYPE.*?>"
+_SCRIPT = "(<script.*?>)(.*?)(</script.*?>)"
+_COMMENT = "(<!--.*?-->)"
 
 _ATTR_REGEXP = re.compile(_ATTR, re.I | re.DOTALL)
-_HTML_REGEXP = re.compile(_TAG, re.I | re.DOTALL)
+_HTML_REGEXP = re.compile("%s|%s|%s" % (_COMMENT, _SCRIPT, _TAG), re.I | re.DOTALL)
 _DOCTYPE_REGEXP = re.compile("(?:%s)" % _DOCTYPE)
-_COMMENT_RE = re.compile("(<!--.*?-->)", re.DOTALL)
-_SCRIPT_RE = re.compile("(<script.*?>).*?(</script.*?>)", re.DOTALL | re.I)
+_COMMENT_REGEXP = re.compile(_COMMENT, re.DOTALL)
 
-def parse_html(text):
-    """Higher level html parser. Calls lower level parsers and joins sucesive
-    HtmlDataFragment elements in a single one.
-    """
-    script_layer = lambda x: _parse_clean_html(x, _SCRIPT_RE, HtmlTag, _simple_parse_html)
-    comment_layer = lambda x: _parse_clean_html(x, _COMMENT_RE, HtmlDataFragment, script_layer)
-    delayed_element = None
-    for element in comment_layer(text):
-        if isinstance(element, HtmlTag):
-            if delayed_element is not None:
-                yield delayed_element
-                delayed_element = None
-            yield element
-        else:# element is HtmlDataFragment
-            if delayed_element is not None:
-                delayed_element.start = min(element.start, delayed_element.start)
-                delayed_element.end = max(element.end, delayed_element.end)
-            else:
-                delayed_element = element
-    if delayed_element is not None:
-        yield delayed_element
-
-def _parse_clean_html(text, regex, htype, func):
-    """
-    Removes regions from text, passes the cleaned text to the lower parse layer,
-    and reinserts removed regions.
-    regex - regular expression that defines regions to be removed/re inserted
-    htype - the html parser type of the removed elements
-    func - function that performs the lower parse layer
-    """
-    removed = [[m.start(), m.end(), m.groups()] for m in regex.finditer(text)]
-
-    cleaned = regex.sub("", text)
-    shift = 0
-    for element in func(cleaned):
-        element.start += shift
-        element.end += shift
-        while removed:
-            if element.end <= removed[0][0]:
-                yield element
-                break
-            else:
-                start, end, groups = removed.pop(0)
-                add = end - start
-                element.end += add
-                shift += add
-                if element.start >= start:
-                    element.start += add
-                elif isinstance(element, HtmlTag):
-                    yield element
-                    break
-
-                if element.start < start:
-                    yield HtmlDataFragment(element.start, start)
-                    element.start = end
-
-                if htype == HtmlTag:
-                    begintag = _parse_tag(_HTML_REGEXP.match(groups[0]))
-                    endtag = _parse_tag(_HTML_REGEXP.match(groups[1]))
-                    begintag.start = start
-                    begintag.end += start
-
-                    endtag.start = end - endtag.end
-                    endtag.end = end
-                    content = None
-                    if begintag.end < endtag.start:
-                        content = HtmlDataFragment(begintag.end, endtag.start)
-                    yield begintag
-                    if content is not None:
-                        yield content
-                    yield endtag
-                else:
-                    yield htype(start, end)
-        else:
-            yield element
-
 def _simple_parse_html(text):
     """Simple html parse. It returns a sequence of HtmlTag and HtmlDataFragment
     objects. Does not ignore any region.
     """
     # If have doctype remove it.
     start_pos = 0
     match = _DOCTYPE_REGEXP.match(text)
@@ -182,19 +107,49 @@ def _simple_parse_html(text):
 
         if start > prev_end:
             yield HtmlDataFragment(prev_end, start)
 
-        yield _parse_tag(match)
+        if match.groups()[0] is not None: # comment
+            yield HtmlDataFragment(start, end)
+        elif match.groups()[1] is not None: # <script>...</script>
+            for e in _parse_script(match):
+                yield e
+        else: # tag
+            yield _parse_tag(match)
         prev_end = end
     textlen = len(text)
     if prev_end < textlen:
         yield HtmlDataFragment(prev_end, textlen)
 
+def _parse_script(match):
+    """parse a <script>...</script> region matched by _HTML_REGEXP"""
+    open_text, content, close_text = match.groups()[1:4]
+
+    open_tag = _parse_tag(_HTML_REGEXP.match(open_text))
+    open_tag.start = match.start()
+    open_tag.end = match.start() + len(open_text)
+
+    close_tag = _parse_tag(_HTML_REGEXP.match(close_text))
+    close_tag.start = match.end() - len(close_text)
+    close_tag.end = match.end()
+
+    yield open_tag
+    if open_tag.end < close_tag.start:
+        start_pos = 0
+        for m in _COMMENT_REGEXP.finditer(content):
+            if m.start() > start_pos:
+                yield HtmlDataFragment(open_tag.end + start_pos, open_tag.end + m.start())
+            yield HtmlDataFragment(open_tag.end + m.start(), open_tag.end + m.end())
+            start_pos = m.end()
+        if open_tag.end + start_pos < close_tag.start:
+            yield HtmlDataFragment(open_tag.end + start_pos, close_tag.start)
+    yield close_tag
+
 def _parse_tag(match):
     """
     parse a tag matched by _HTML_REGEXP
     """
     data = match.groups()
-    closing, tag, attr_text = data[:3]
+    closing, tag, attr_text = data[4:7]
+    # if tag is None then the match is a comment
     if tag is not None:
         unpaired = data[-1]
@@ -1,65 +0,0 @@
-"""
-Request Limit Spider middleware
-
-See documentation in docs/topics/spider-middleware.rst
-"""
-from itertools import imap
-from scrapy.xlib.pydispatch import dispatcher
-
-from scrapy import signals
-from scrapy.project import crawler
-from scrapy.exceptions import NotConfigured
-from scrapy.conf import settings
-from scrapy.http import Request
-from scrapy import log
-
-class RequestLimitMiddleware(object):
-
-    def __init__(self):
-        self.max_queue_size = settings.getint("REQUESTS_QUEUE_SIZE")
-        if not self.max_queue_size:
-            raise NotConfigured
-
-        self.max_pending = {}
-        self.dropped_count = {}
-
-        dispatcher.connect(self.spider_opened, signal=signals.spider_opened)
-        dispatcher.connect(self.spider_closed, signal=signals.spider_closed)
-
-    def spider_opened(self, spider):
-        self.max_pending[spider] = getattr(spider, 'requests_queue_size', self.max_queue_size)
-        self.dropped_count[spider] = 0
-
-    def spider_closed(self, spider):
-        dropped_count = self.dropped_count[spider]
-        if dropped_count:
-            max_pending = self.max_pending[spider]
-            log.msg('Dropped %d request(s) because the scheduler queue size limit (%d requests) was exceeded' % \
-                (dropped_count, max_pending), level=log.DEBUG, spider=spider)
-        del self.dropped_count[spider]
-        del self.max_pending[spider]
-
-    def process_spider_output(self, response, result, spider):
-        max_pending = self.max_pending.get(spider, 0)
-        if max_pending:
-            return imap(lambda v: self._limit_requests(v, spider, max_pending), result)
-        else:
-            return result
-
-    def _limit_requests(self, request_or_other, spider, max_pending):
-        if isinstance(request_or_other, Request):
-            free_slots = max_pending - self._pending_count(spider)
-            if free_slots > 0:
-                # Scheduler isn't saturated and it is fine to schedule more requests.
-                return request_or_other
-            else:
-                # Skip the request and give engine time to handle other tasks.
-                self.dropped_count[spider] += 1
-                return None
-        else:
-            # Return others (non-requests) as is.
-            return request_or_other
-
-    def _pending_count(self, spider):
-        pending = crawler.engine.scheduler.pending_requests.get(spider, [])
-        return len(pending)
@@ -12,6 +12,7 @@ from scrapy.exceptions import IgnoreRequest
 from scrapy.conf import settings
 from scrapy.utils.defer import mustbe_deferred
 from scrapy.utils.signal import send_catch_log
+from scrapy.utils import deprecate
 from scrapy import signals
 from scrapy import log
 from .middleware import DownloaderMiddlewareManager
@@ -21,18 +22,21 @@ from .handlers import DownloadHandlers
 class SpiderInfo(object):
     """Simple class to keep information and state for each open spider"""
 
-    def __init__(self, download_delay=None, max_concurrent_requests=None):
-        if download_delay is None:
-            self._download_delay = settings.getfloat('DOWNLOAD_DELAY')
-        else:
-            self._download_delay = float(download_delay)
+    def __init__(self, spider):
+        if hasattr(spider, 'download_delay'):
+            deprecate.attribute(spider, 'download_delay', 'DOWNLOAD_DELAY')
+            self._download_delay = spider.download_delay
+        else:
+            self._download_delay = spider.settings.getfloat('DOWNLOAD_DELAY')
         if self._download_delay:
             self.max_concurrent_requests = 1
-        elif max_concurrent_requests is None:
-            self.max_concurrent_requests = settings.getint('CONCURRENT_REQUESTS_PER_SPIDER')
-        else:
-            self.max_concurrent_requests = max_concurrent_requests
-        if self._download_delay and settings.getbool('RANDOMIZE_DOWNLOAD_DELAY'):
+        elif hasattr(spider, 'max_concurrent_requests'):
+            deprecate.attribute(spider, 'max_concurrent_requests', 'CONCURRENT_REQUESTS_PER_SPIDER')
+            self.max_concurrent_requests = spider.max_concurrent_requests
+        else:
+            self.max_concurrent_requests = spider.settings.getint('CONCURRENT_REQUESTS_PER_SPIDER')
+        if self._download_delay and spider.settings.getbool('RANDOMIZE_DOWNLOAD_DELAY'):
             # same policy as wget --random-wait
             self.random_delay_interval = (0.5*self._download_delay, \
                 1.5*self._download_delay)
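
The randomization follows the ``wget --random-wait`` policy noted above: with a delay of 2 seconds the interval computed there is ``(1.0, 3.0)``, and each actual wait is drawn uniformly from it; a minimal sketch of the arithmetic:

    import random

    download_delay = 2.0
    lo, hi = 0.5 * download_delay, 1.5 * download_delay   # (1.0, 3.0)
    wait = random.uniform(lo, hi)   # one possible inter-request delay
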
@@ -178,10 +182,7 @@ class Downloader(object):
     def open_spider(self, spider):
         """Allocate resources to begin processing a spider"""
         assert spider not in self.sites, "Spider already opened: %s" % spider
-        self.sites[spider] = SpiderInfo(
-            download_delay=getattr(spider, 'download_delay', None),
-            max_concurrent_requests=getattr(spider, 'max_concurrent_requests', None)
-        )
+        self.sites[spider] = SpiderInfo(spider)
 
     def close_spider(self, spider):
         """Free any resources associated with the given spider"""
@@ -8,10 +8,6 @@ from twisted.internet import defer
 from scrapy.http import Headers
 from scrapy.utils.httpobj import urlparse_cached
 from scrapy.core.downloader.responsetypes import responsetypes
-from scrapy.conf import settings
-
-
-DOWNLOAD_TIMEOUT = settings.getint('DOWNLOAD_TIMEOUT')
 
 
 def _parsed_url_args(parsed):
@@ -94,7 +90,7 @@ class ScrapyHTTPClientFactory(HTTPClientFactory):
     followRedirect = False
     afterFoundGet = False
 
-    def __init__(self, request, timeout=DOWNLOAD_TIMEOUT):
+    def __init__(self, request, timeout=180):
         self.url = urldefrag(request.url)[0]
         self.method = request.method
         self.body = request.body or None
@@ -54,6 +54,7 @@ class Crawler(object):
     @defer.inlineCallbacks
     def _start_spider(self, spider, requests):
         """Don't call this method. Use self.queue to start new spiders"""
+        spider.set_crawler(self)
         yield defer.maybeDeferred(self.engine.open_spider, spider)
         for request in requests:
             self.engine.crawl(request, spider)
scrapy/settings/__init__.py (new file, 81 lines)
@@ -0,0 +1,81 @@
+from . import default_settings
+
+
+class Settings(object):
+
+    def __init__(self, values=None):
+        self.values = values.copy() if values else {}
+        self.global_defaults = default_settings
+
+    def __getitem__(self, opt_name):
+        if opt_name in self.values:
+            return self.values[opt_name]
+        return getattr(self.global_defaults, opt_name, None)
+
+    def get(self, name, default=None):
+        return self[name] if self[name] is not None else default
+
+    def getbool(self, name, default=False):
+        """
+        True is: 1, '1', True
+        False is: 0, '0', False, None
+        """
+        return bool(int(self.get(name, default)))
+
+    def getint(self, name, default=0):
+        return int(self.get(name, default))
+
+    def getfloat(self, name, default=0.0):
+        return float(self.get(name, default))
+
+    def getlist(self, name, default=None):
+        value = self.get(name)
+        if value is None:
+            return default or []
+        elif hasattr(value, '__iter__'):
+            return value
+        else:
+            return str(value).split(',')
+
+
+class CrawlerSettings(Settings):
+
+    def __init__(self, settings_module=None, **kw):
+        super(CrawlerSettings, self).__init__(**kw)
+        self.settings_module = settings_module
+        self.overrides = {}
+        self.defaults = {}
+
+    def __getitem__(self, opt_name):
+        if opt_name in self.overrides:
+            return self.overrides[opt_name]
+        if self.settings_module and hasattr(self.settings_module, opt_name):
+            return getattr(self.settings_module, opt_name)
+        if opt_name in self.defaults:
+            return self.defaults[opt_name]
+        return super(CrawlerSettings, self).__getitem__(opt_name)
+
+    def __str__(self):
+        return "<CrawlerSettings module=%r>" % self.settings_module
+
+
+class SpiderSettings(Settings):
+
+    def __init__(self, spider, crawler_settings, **kw):
+        super(SpiderSettings, self).__init__(**kw)
+        self.spider = spider
+        self.cset = crawler_settings
+
+    def __getitem__(self, opt_name):
+        if opt_name in self.cset.overrides:
+            return self.cset.overrides[opt_name]
+        if hasattr(self.spider, opt_name):
+            return getattr(self.spider, opt_name)
+        if self.cset.settings_module and hasattr(self.cset.settings_module, opt_name):
+            return getattr(self.cset.settings_module, opt_name)
+        if opt_name in self.cset.defaults:
+            return self.cset.defaults[opt_name]
+        return super(SpiderSettings, self).__getitem__(opt_name)
+
+    def __str__(self):
+        return "<SpiderSettings spider=%r>" % self.spider.name
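
A sketch of the resulting lookup chain (stand-in classes and values are illustrative): overrides beat the spider attribute, which beats the project module, which beats the global defaults:

    from scrapy.settings import CrawlerSettings, SpiderSettings

    class FakeModule(object):          # stands in for a project settings module
        DOWNLOAD_TIMEOUT = 60

    class FakeSpider(object):          # stands in for a BaseSpider instance
        name = 'fake'
        DOWNLOAD_TIMEOUT = 30

    cset = CrawlerSettings(FakeModule())
    sset = SpiderSettings(FakeSpider(), cset)
    assert sset.getint('DOWNLOAD_TIMEOUT') == 30   # spider attribute wins
    cset.overrides['DOWNLOAD_TIMEOUT'] = 10
    assert sset.getint('DOWNLOAD_TIMEOUT') == 10   # overrides win over everything
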
@@ -19,6 +19,7 @@ BOT_NAME = 'scrapybot'
 BOT_VERSION = '1.0'
 
 CLOSESPIDER_TIMEOUT = 0
+CLOSESPIDER_PAGECOUNT = 0
 CLOSESPIDER_ITEMPASSED = 0
 
 COMMANDS_MODULE = ''
@@ -194,8 +195,6 @@ REDIRECT_MAX_METAREFRESH_DELAY = 100
 REDIRECT_MAX_TIMES = 20 # uses Firefox default setting
 REDIRECT_PRIORITY_ADJUST = +2
 
-REQUESTS_QUEUE_SIZE = 0
-
 # contrib.middleware.retry.RetryMiddleware default settings
 RETRY_TIMES = 2 # initial response + 2 retries = 3 requests
 RETRY_HTTP_CODES = ['500', '503', '504', '400', '408']
@@ -220,7 +219,6 @@ SPIDER_MIDDLEWARES = {}
 SPIDER_MIDDLEWARES_BASE = {
     # Engine side
     'scrapy.contrib.spidermiddleware.httperror.HttpErrorMiddleware': 50,
-    'scrapy.contrib.spidermiddleware.requestlimit.RequestLimitMiddleware': 200,
     'scrapy.contrib.spidermiddleware.offsite.OffsiteMiddleware': 500,
     'scrapy.contrib.spidermiddleware.referer.RefererMiddleware': 700,
     'scrapy.contrib.spidermiddleware.urllength.UrlLengthMiddleware': 800,
@@ -7,9 +7,7 @@ See documentation in docs/topics/shell.rst
 import signal
 
 from twisted.internet import reactor, threads
-from twisted.python.failure import Failure
 
-from scrapy import log
 from scrapy.item import BaseItem
 from scrapy.spider import BaseSpider
 from scrapy.selector import XPathSelector, XmlXPathSelector, HtmlXPathSelector
@@ -18,7 +16,7 @@ from scrapy.utils.misc import load_object
 from scrapy.utils.response import open_in_browser
 from scrapy.utils.url import any_to_uri
 from scrapy.utils.console import start_python_console
-from scrapy.conf import settings, Settings
+from scrapy.settings import Settings
 from scrapy.http import Request, Response, TextResponse
 
 class Shell(object):
@@ -26,12 +24,13 @@ class Shell(object):
     relevant_classes = (BaseSpider, Request, Response, BaseItem, \
         XPathSelector, Settings)
 
-    def __init__(self, crawler, update_vars=None, inthread=False):
+    def __init__(self, crawler, update_vars=None, inthread=False, code=None):
         self.crawler = crawler
         self.vars = {}
         self.update_vars = update_vars or (lambda x: None)
-        self.item_class = load_object(settings['DEFAULT_ITEM_CLASS'])
+        self.item_class = load_object(crawler.settings['DEFAULT_ITEM_CLASS'])
         self.inthread = inthread
+        self.code = code
 
     def start(self, *a, **kw):
         # disable accidental Ctrl-C key press from shutting down the engine
@@ -49,14 +48,20 @@ class Shell(object):
         elif response:
             request = response.request
         self.populate_vars(request.url, response, request, spider)
-        start_python_console(self.vars)
+        if self.code:
+            print eval(self.code, globals(), self.vars)
+        else:
+            start_python_console(self.vars)
 
     def _schedule(self, request, spider):
         if spider is None:
             spider = create_spider_for_request(self.crawler.spiders, request, \
                 BaseSpider('default'), log_multiple=True)
+        spider.set_crawler(self.crawler)
         self.crawler.engine.open_spider(spider)
-        return self.crawler.engine.schedule(request, spider)
+        d = self.crawler.engine.schedule(request, spider)
+        d.addCallback(lambda x: (x, spider))
+        return d
 
     def fetch(self, request_or_url, spider=None):
         if isinstance(request_or_url, Request):
@@ -66,17 +71,14 @@ class Shell(object):
             url = any_to_uri(request_or_url)
             request = Request(url, dont_filter=True)
         response = None
-        try:
-            response = threads.blockingCallFromThread(reactor, \
-                self._schedule, request, spider)
-        except:
-            log.err(Failure(), "Error fetching response", spider=spider)
+        response, spider = threads.blockingCallFromThread(reactor, \
+            self._schedule, request, spider)
         self.populate_vars(url, response, request, spider)
 
     def populate_vars(self, url=None, response=None, request=None, spider=None):
         item = self.item_class()
         self.vars['item'] = item
-        self.vars['settings'] = settings
+        self.vars['settings'] = self.crawler.settings
         if url:
             if isinstance(response, TextResponse):
                 self.vars['xxs'] = XmlXPathSelector(response)
@@ -89,7 +91,8 @@ class Shell(object):
         self.vars['view'] = open_in_browser
         self.vars['shelp'] = self.print_help
         self.update_vars(self.vars)
-        self.print_help()
+        if not self.code:
+            self.print_help()
 
     def print_help(self):
         self.p("Available Scrapy objects:")
@@ -5,6 +5,7 @@ See documentation in docs/topics/spiders.rst
 """
 
 from scrapy import log
+from scrapy.settings import SpiderSettings
 from scrapy.http import Request
 from scrapy.utils.misc import arg_to_iter
 from scrapy.utils.trackref import object_ref
@@ -33,6 +34,21 @@ class BaseSpider(object_ref):
         """
         log.msg(message, spider=self, level=level)
 
+    def set_crawler(self, crawler):
+        assert not hasattr(self, '_crawler'), "Spider already bounded to %s" % crawler
+        self._crawler = crawler
+
+    @property
+    def crawler(self):
+        assert hasattr(self, '_crawler'), "Spider not bounded to any crawler"
+        return self._crawler
+
+    @property
+    def settings(self):
+        if not hasattr(self, '_settings'):
+            self._settings = SpiderSettings(self, self.crawler.settings)
+        return self._settings
+
     def start_requests(self):
         reqs = []
         for url in self.start_urls:
@@ -5,10 +5,6 @@
 #
 #     http://doc.scrapy.org/topics/settings.html
 #
-# Or you can copy and paste them from where they're defined in Scrapy:
-#
-#     scrapy/conf/default_settings.py
-#
 
 BOT_NAME = '$project_name'
 BOT_VERSION = '1.0'
@@ -10,7 +10,6 @@ class CmdlineTest(unittest.TestCase):
     def setUp(self):
         self.env = os.environ.copy()
         self.env['PYTHONPATH'] = os.path.dirname(scrapy.__path__[0])
-        self.env.pop('SCRAPY_SETTINGS_DISABLED', None)
         self.env['SCRAPY_SETTINGS_MODULE'] = 'scrapy.tests.test_cmdline.settings'
 
     def _execute(self, *new_args, **kwargs):
@@ -61,7 +61,6 @@ class CommandTest(ProjectTest):
         super(CommandTest, self).setUp()
         self.call('startproject', self.project_name)
         self.cwd = join(self.temp_path, self.project_name)
-        self.env.pop('SCRAPY_SETTINGS_DISABLED', None)
         self.env['SCRAPY_SETTINGS_MODULE'] = '%s.settings' % self.project_name
@@ -1,51 +0,0 @@
-import unittest
-
-from scrapy.conf import Settings
-
-class SettingsTest(unittest.TestCase):
-
-    def test_get(self):
-        settings = Settings({
-            'TEST_ENABLED1': '1',
-            'TEST_ENABLED2': True,
-            'TEST_ENABLED3': 1,
-            'TEST_DISABLED1': '0',
-            'TEST_DISABLED2': False,
-            'TEST_DISABLED3': 0,
-            'TEST_INT1': 123,
-            'TEST_INT2': '123',
-            'TEST_FLOAT1': 123.45,
-            'TEST_FLOAT2': '123.45',
-            'TEST_LIST1': ['one', 'two'],
-            'TEST_LIST2': 'one,two',
-            'TEST_STR': 'value',
-        })
-        assert settings.getbool('TEST_ENABLED1') is True
-        assert settings.getbool('TEST_ENABLED2') is True
-        assert settings.getbool('TEST_ENABLED3') is True
-        assert settings.getbool('TEST_ENABLEDx') is False
-        assert settings.getbool('TEST_ENABLEDx', True) is True
-        assert settings.getbool('TEST_DISABLED1') is False
-        assert settings.getbool('TEST_DISABLED2') is False
-        assert settings.getbool('TEST_DISABLED3') is False
-        self.assertEqual(settings.getint('TEST_INT1'), 123)
-        self.assertEqual(settings.getint('TEST_INT2'), 123)
-        self.assertEqual(settings.getint('TEST_INTx'), 0)
-        self.assertEqual(settings.getint('TEST_INTx', 45), 45)
-        self.assertEqual(settings.getfloat('TEST_FLOAT1'), 123.45)
-        self.assertEqual(settings.getfloat('TEST_FLOAT2'), 123.45)
-        self.assertEqual(settings.getfloat('TEST_FLOATx'), 0.0)
-        self.assertEqual(settings.getfloat('TEST_FLOATx', 55.0), 55.0)
-        self.assertEqual(settings.getlist('TEST_LIST1'), ['one', 'two'])
-        self.assertEqual(settings.getlist('TEST_LIST2'), ['one', 'two'])
-        self.assertEqual(settings.getlist('TEST_LISTx'), [])
-        self.assertEqual(settings.getlist('TEST_LISTx', ['default']), ['default'])
-        self.assertEqual(settings['TEST_STR'], 'value')
-        self.assertEqual(settings.get('TEST_STR'), 'value')
-        self.assertEqual(settings['TEST_STRx'], None)
-        self.assertEqual(settings.get('TEST_STRx'), None)
-        self.assertEqual(settings.get('TEST_STRx', 'default'), 'default')
-
-if __name__ == "__main__":
-    unittest.main()
scrapy/tests/test_contrib_ibl/samples/samples_htmlpage_0.html (new file, 190 lines; listing truncated below)
@@ -0,0 +1,190 @@
+<head>
+
+<meta http-equiv="Content-Type" content="text/html; charset=utf-8">
+<meta name="Copyright" content="Site Layout, Design & Content Copyright 2005 - retrosixty.co.uk">
+<meta http-equiv="content-language" content="EN">
+<meta name="Designer" content="Max Williams">
+<meta name="Keywords" content="retrosixty, retro sixty, retro, furniture, retro furniture, lighting ,retro lighting, art, retro art, ceramics, retro ceramics, technology, retro technology, fifties, sixties, seventies, 20th century design, post-war, post-war decorative, retro accessories">
+<meta name="Title" content="retrosixty - retrosixty.co.uk">
+<meta name="revisit-after" content="7">
+<meta name="Robots" content="index,follow">
+<meta name="Description" content="Dealers of retro furniture, post-war decorative and fine arts.">
+<meta http-equiv="Cache-Control" content="no-cache">
+<meta http-equiv="Expires" content="0">
+<meta name="Author" content="Nick Waters">
+
+<title>retrosixty - Charlotte Perriand Infraphil lamp, c1960s for Philips, Netherlands</title>
+
+<script language="JavaScript">
+<!--
+function FP_swapImg() {//v1.0
+ var doc=document,args=arguments,elm,n; doc.$imgSwaps=new Array(); for(n=2; n<args.length;
+ n+=2) { c=o.layers; if(elm) { doc.$imgSwaps[doc.$imgSwaps.length]=elm;
+ elm.$src=elm.src; elm.src=args[n+1]; } }
+}
+
+function FP_preloadImgs() {//v1.0
+ var c=o.childNodes; if(!d.FP_imgs) d.FP_imgs=new Array();
+ for(var d=document,a=arguments; i<a.length; i++) { d.FP_imgs[i]=new Image; d.FP_imgs[i].src=a[i]; }
+}
+
+function FP_getObjectByID(id,o) {//v1.0
+ var c,el,els,f,m,n; if(!o)o=document; if(o.getElementById) el=o.getElementById(id);
+ else if(o.layers) el=o.all[id]; else if(o.all) el=FP_getObjectByID(id,c[n]); if(el) return el;
+ if(o.id==id || o.name==id) return o; if(o.childNodes) el=FP_getObjectByID(id,els[n]); if(c)
+ for(n=0; n<c.length; n++) { elm=FP_getObjectByID(args[n]); if(el) return el; }
+ els=f[n].elements; if(f) for(n=0; n<f.length; n++) { f=o.forms;
+ for(m=0; m<els.length; m++){ i=0; if(el) return el; } }
+ return null;
+}
+// -->
+</script>
+
+<style fprolloverstyle="">A:hover {color: #999999}
+span.patitre
+{}
+span.auctionblock
+{}
+</style>
+
+<style id="mydeco-style" type="text/css">@import url(http://localhost:8000/as/site_media/clean.css);
+</style></head><body bottommargin="0" leftmargin="0" onload="" rightmargin="0" topmargin="0" alink="#000000" bgcolor="#c0c0c0" vlink="#000000" link="#000000">
+
+<div class="mydeco-selected" align="center">
+<table id="table1" bgcolor="#ffffff" border="0" cellpadding="0" cellspacing="0" width="765" height="100%">
+<tbody><tr>
+<td colspan="3" style="border-left: 1px solid rgb(0, 0, 0); border-right: 1px solid rgb(0, 0, 0);" align="center" height="120">
+<p align="center">
+<img alt="retrosixty" src="../images/logo.jpg" border="0" width="745" height="102"></p></td>
+</tr>
+<tr>
+<td style="border-left: 1px solid rgb(0, 0, 0);" width="177" height="20">
+<p style="margin-left: 10px;">
+<img alt="retrosixty" src="../images/top.gif" border="0" width="160" height="20"></p></td>
+<td colspan="2" style="border-right: 1px solid rgb(0, 0, 0);" width="586" height="20">
+</td>
+</tr>
+<tr>
+<td style="border-left: 1px solid rgb(0, 0, 0);" background="../images/bg.gif" valign="top" width="180">
+<p style="margin-top: 0pt; margin-bottom: 0pt;" align="center">
+<a href="../index.html">
+<img alt="Home" fp-style="fp-btn: Linked Column 9; fp-font-style: Bold; fp-img-press: 0; fp-bgcolor: #7B7B7B; fp-proportional: 0" fp-title="Home" id="img31" onmouseout="" onmouseover="" src="../buttons/button3.jpg" border="0" width="125" height="31"></a></p><p style="margin-top: 0pt; margin-bottom: 0pt;" align="center">
+<a href="../about.html">
+<img alt="About Us" fp-style="fp-btn: Linked Column 9; fp-font-style: Bold; fp-img-press: 0; fp-bgcolor: #7B7B7B; fp-proportional: 0" fp-title="About Us" id="img42" onmouseout="" onmouseover="" src="../buttons/button32.jpg" border="0" width="125" height="31"></a></p><p style="margin-top: 0pt; margin-bottom: 0pt;" align="center">
+<a href="../shipping.html">
+<img alt="Shipping" fp-style="fp-btn: Linked Column 9; fp-font-style: Bold; fp-img-press: 0; fp-bgcolor: #7B7B7B; fp-proportional: 0" fp-title="Shipping" id="img43" onmouseout="" onmouseover="" src="../buttons/button34.jpg" border="0" width="125" height="31"></a></p><p style="margin-top: 0pt; margin-bottom: 0pt;" align="center">
+<a href="../links.html">
+<img alt="Links" fp-style="fp-btn: Linked Column 9; fp-font-style: Bold; fp-img-press: 0; fp-bgcolor: #7B7B7B; fp-proportional: 0; fp-orig: 0" fp-title="Links" id="img45" onmouseout="" onmouseover="" src="../buttons/button1.jpg" border="0" width="125" height="31"></a></p><p style="margin-top: 0pt; margin-bottom: 0pt;" align="center">
+<a href="../contact.php">
+<img alt="Contact" fp-style="fp-btn: Linked Column 9; fp-font-style: Bold; fp-img-press: 0; fp-bgcolor: #7B7B7B; fp-proportional: 0" fp-title="Contact" id="img44" onmouseout="" onmouseover="" src="../buttons/button36.jpg" border="0" width="125" height="31"></a></p><p style="margin-top: 0pt; margin-bottom: 0pt;" align="center">
+</p><p style="margin-top: 0pt; margin-bottom: 0pt;" align="center">
+<a href="../furniture.html">
+<img alt="Furniture" fp-style="fp-btn: Linked Column 9; fp-img-press: 0; fp-bgcolor: #7B7B7B; fp-proportional: 0" fp-title="Furniture" id="img33" onmouseout="" onmouseover="" src="../buttons/buttonB.jpg" border="0" width="125" height="31"></a></p><p style="margin-top: 0pt; margin-bottom: 0pt;" align="center">
+<a href="../lighting.html">
+<img alt="Lighting" fp-style="fp-btn: Linked Column 9; fp-img-press: 0; fp-bgcolor: #7B7B7B; fp-proportional: 0" fp-title="Lighting" id="img34" onmouseout="" onmouseover="" src="../buttons/buttonD.jpg" border="0" width="125" height="31"></a></p><p style="margin-top: 0pt; margin-bottom: 0pt;" align="center">
+<a href="../tech.html">
+<img alt="Technology" fp-style="fp-btn: Linked Column 9; fp-img-press: 0; fp-bgcolor: #7B7B7B; fp-proportional: 0" fp-title="Technology" id="img35" onmouseout="" onmouseover="" src="../buttons/buttonF.jpg" border="0" width="125" height="31"></a></p><p style="margin-top: 0pt; margin-bottom: 0pt;" align="center">
+<a href="../ceramics.html">
+<img alt="Ceramics" fp-style="fp-btn: Linked Column 9; fp-img-press: 0; fp-bgcolor: #7B7B7B; fp-proportional: 0" fp-title="Ceramics" id="img36" onmouseout="" onmouseover="" src="../buttons/button11.jpg" border="0" width="125" height="31"></a></p><p style="margin-top: 0pt; margin-bottom: 0pt;" align="center">
+<a href="../art.html">
+<img alt="Art" fp-style="fp-btn: Linked Column 9; fp-img-press: 0; fp-bgcolor: #7B7B7B; fp-proportional: 0" fp-title="Art" id="img37" onmouseout="" onmouseover="" src="../buttons/button13.jpg" border="0" width="125" height="31"></a></p><p style="margin-top: 0pt; margin-bottom: 0pt;" align="center">
+<a href="../misc.html">
+<img alt="Misc. Items" fp-style="fp-btn: Linked Column 9; fp-img-press: 0; fp-bgcolor: #7B7B7B; fp-proportional: 0" fp-title="Misc. Items" id="img38" onmouseout="" onmouseover="" src="../buttons/button15.jpg" border="0" width="125" height="31"></a></p><p style="margin-top: 0pt; margin-bottom: 0pt;" align="center">
+<a href="../contemp.html">
+<img alt="Contemporary" fp-style="fp-btn: Linked Column 9; fp-img-press: 0; fp-bgcolor: #7B7B7B; fp-proportional: 0; fp-orig: 0" fp-title="Contemporary" id="img46" onmouseout="" onmouseover="" src="../buttons/button17.jpg" border="0" width="125" height="31"></a></p></td>
+<td class="" valign="top" width="433">
+<p style="margin-left: 10px; margin-right: 20px;">
+<span style="font-weight: 700;"><font id="anonymous_element_1" data-scrapy-annotate="{&quot;variant&quot;: 0, &quot;annotations&quot;: {&quot;content&quot;: &quot;name&quot;}}" size="5" face="Tahoma">
+Lighting..</font></span></p>
+<p style="margin-left: 10px; margin-right: 20px; margin-bottom: 15px;" align="justify">
+<font class="" size="2" face="Tahoma">Please click the thumbnails for larger
+images and the back button to return to the Lighting index.</font></p><div class="" align="center">
+<table id="table2" border="0" cellpadding="0" cellspacing="0" width="400" height="309">
+<tbody><tr>
+<td style="border-top: 1px solid rgb(123, 123, 123); border-bottom: 1px solid rgb(123, 123, 123);" width="130" height="309">
+<p align="center">
+</p><p class="" align="center">
+<a href="../photos/0642-01.JPG" target="_blank">
+<img id="anonymous_element_2" data-scrapy-annotate="{&quot;variant&quot;: 0, &quot;annotations&quot;: {&quot;src&quot;: &quot;image_urls&quot;}}" src="../photos/0642-01_small.jpg" border="1"></a>
+</p><p align="center">
+<a href="../photos/0642-02.JPG" target="_blank">
+<img src="../photos/0642-02_small.jpg" border="1"></a></p><p align="center">
+<a href="../photos/0642-03.JPG" target="_blank">
+<img src="../photos/0642-03_small.jpg" border="1"></a></p><p align="center">
+<a href="../photos/0642-04.JPG" target="_blank">
+<img src="../photos/0642-04_small.jpg" border="1"></a></p><p align="center">
+</p><p align="center">
+</p><p align="center">
+</p></td>
+<td class="" style="border-top: 1px solid rgb(123, 123, 123); border-bottom: 1px solid rgb(123, 123, 123);" align="left" valign="top" height="309">
+<p style="margin-left: 15px; margin-top: 25px; margin-bottom: -10px;">
+<b><font size="2" face="Tahoma">Designer</font></b><b><font size="2" face="Tahoma">:
+</font>
+</b>
+<font size="2" face="Tahoma,sans-serif">Charlotte Perriand</font><span style="font-size: 10pt; font-family: Tahoma,sans-serif;"> </span></p><p style="margin-left: 15px; margin-top: 25px; margin-bottom: -10px;">
+<font size="2" face="Tahoma"><b>Manufacturer: </b></font>
+<font size="2"><span style="font-family: Tahoma,sans-serif;">
+Philips, Netherlands</span></font><span style="font-size: 10pt; font-family: Tahoma,sans-serif;">
+</span></p><p style="margin-left: 15px; margin-top: 25px; margin-bottom: -10px;"><font size="2" face="Tahoma"><b>
+Description:
+
+</b></font>
+<span style="font-size: 10pt; font-family: Tahoma,sans-serif;">
+A Perriand designed 'infraphil' infrared heat lamp
+designed in c1960s. This example is in good vintage
+condition with some minor wear as one would expect.
+Original Philips sticker intact, although it has some
+wear as pictured. </span>
+</p><p class="" style="margin-left: 15px; margin-top: 25px; margin-bottom: -10px;">
+<span class="" style="font-size: 10pt; font-family: Tahoma,sans-serif;">
+As with all electrical items we always
+recommend having them tested by a professional prior to
+use although it is in full working order. The lamp can
+be used as a table lamp, or mounted on the wall - full
+adjustable</span><font size="2" face="Tahoma">...</font></p><p class="" style="margin-left: 15px; margin-top: 25px; margin-bottom: -10px;">
+<font id="anonymous_element_3" data-scrapy-annotate="{&quot;variant&quot;: 0, &quot;annotations&quot;: {&quot;content&quot;: &quot;price&quot;}}" size="2" face="Tahoma"><b>Price:</b> £60</font></p><p style="margin-left: 15px; margin-top: 25px; margin-bottom: -10px;">
+<font size="2" face="Tahoma"><b>Size:</b> </font>
+<font size="2"><span style="font-family: Tahoma,sans-serif;">
+N/A</span></font><span style="font-size: 10pt; font-family: Tahoma,sans-serif;">
+<span class="auctionblock"> </span>
+<span class="auctionblock"> </span> </span></p><p style="margin-left: 15px; margin-top: 25px; margin-bottom: -10px;">
+<font size="2" face="Tahoma"><b>Shipping:</b> </font>
+<span style="font-size: 10pt; font-family: Tahoma,sans-serif;">
+£7 to mainland UK</span><font size="2" face="Tahoma">.
+Please enquire for other locations.</font></p><p style="margin-left: 15px; margin-top: 25px; margin-bottom: -10px;">
+<font size="2" face="Tahoma"><b>Ref #:</b> 0642</font></p><p style="margin-left: 15px; margin-top: 25px; margin-bottom: -10px;">
+</p></td>
+</tr>
+</tbody></table>
+
+<p style="margin-left: 25px; margin-top: 25px;">
+<font size="2" face="Tahoma">
+<a href="about:blank" style="text-decoration: none;"><b>
+<< </b>BACK</a></font></p></div>
+</td>
+<td class="" style="border-right: 1px solid rgb(0, 0, 0);" valign="top" width="153">
+<p style="margin-right: 20px;" align="left">
|
||||
<img class="" alt="retrosixty" src="../images/icon1.jpg" border="0" width="133" height="133"></p>
|
||||
<p style="margin-right: 20px;">
|
||||
<img class="" alt="retrosixty" src="../images/icon2.jpg" border="0" width="133" height="133"></p>
|
||||
<p style="margin-right: 20px;">
|
||||
<img class="" alt="retrosixty" src="../images/icon3.jpg" border="0" width="133" height="133"></p>
|
||||
</td>
|
||||
</tr>
|
||||
<tr>
|
||||
<td style="border-left: 1px solid rgb(0, 0, 0);" width="177" height="25">
|
||||
<p style="margin-left: 10px; margin-bottom: 10px;">
|
||||
<img alt="retrosixty" src="../images/bottom.gif" border="0" width="160" height="25"></p></td>
|
||||
<td colspan="2" style="border-right: 1px solid rgb(0, 0, 0);" width="586" height="25">
|
||||
<p style="margin-right: 15px;" align="right">
|
||||
<font style="font-size: 8pt;" face="Tahoma">Site Layout, Design &
|
||||
Content Copyright 2006-09 - retrosixty.co.uk</font></p></td>
|
||||
</tr>
|
||||
</tbody></table>
|
||||
</div>
|
||||
|
||||
</body>
3099
scrapy/tests/test_contrib_ibl/samples/samples_htmlpage_0.json
Normal file
File diff suppressed because it is too large
21968
scrapy/tests/test_contrib_ibl/samples/samples_htmlpage_1.json
Normal file
File diff suppressed because it is too large
21792
scrapy/tests/test_contrib_ibl/samples/samples_htmlpage_2.json
Normal file
File diff suppressed because it is too large
632
scrapy/tests/test_contrib_ibl/samples/samples_pageparsing_0.html
Normal file
@ -0,0 +1,632 @@
|
||||
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
|
||||
<html dir="ltr" lang="en" xmlns="http://www.w3.org/1999/xhtml">
|
||||
<head>
|
||||
<title>TEMPUR Deluxe-HD™ Mattress | Tempur</title>
|
||||
<meta content="text/html; charset=iso-8859-1" http-equiv="Content-Type" />
|
||||
<meta content="TEMPUR Deluxe-HD™ Mattress Mattresses Pillows Small Products " name="keywords" />
|
||||
<meta content="Tempur TEMPUR Deluxe-HD™ Mattress - TEMPUR Deluxe-HD™ Mattress Product Overview The TEMPUR Deluxe-HD™ Mattress combines the unique pressure relieving qualities of TEMPUR, with extra TEMPUR-HD™ soft-touch quilted into the cover, for a luxurious feel that is unparalleled in the bedroom. It not only looks luxurious, but also offers " name="description" />
|
||||
<meta content="no" http-equiv="imagetoolbar" />
|
||||
<meta content="-1" http-equiv="Expires" />
|
||||
<meta content="webmaster@tempur.co.uk" http-equiv="reply-to" />
|
||||
<meta content="document" name="resource-type" />
|
||||
<meta content="30" name="revisit-after" />
|
||||
<meta content="TRUE" name="MSSmartTagsPreventParsing" />
|
||||
<meta content="Consumer Products/Furnishings;Consumer Products/Health" name="classification" />
|
||||
<meta content="INDEX,FOLLOW" name="ROBOTS" />
|
||||
<meta content="Global" name="distribution" />
|
||||
<meta content="Safe For Kids" name="rating" />
|
||||
<meta content="2008 Tempur-Pedic, Inc." name="copyright" />
|
||||
<meta content="Tempur UK" name="author" />
|
||||
<meta content="English" name="language" />
|
||||
<meta content="Web Page" name="doc-type" />
|
||||
<meta content="Completed" name="doc-class" />
|
||||
<meta content="Copywritten Work" name="doc-rights" />
|
||||
<meta content="6QdsKgcJMwvOOi7d0aPp99A9efMsYWnWtiD9+wwrrW4=" name="verify-v1" />
|
||||
<link href="/tempurUK/includes/css/inc.css.site_styles.css" rel="stylesheet" type="text/css" />
|
||||
|
||||
<script src="/tempurUK/includes/js/milonic/milonic_src.js" type="text/javascript"></script>
|
||||
<style>.milonic{width:1px;visibility:hidden;position:absolute}</style>
|
||||
<script type="text/javascript">
|
||||
<!--
|
||||
if (ns4) {
|
||||
_d.write("<script language=JavaScript src='/tempurUK/includes/js/milonic/mmenuns4.js'>\/script>");
|
||||
} else {
|
||||
_d.write("<script language=JavaScript src='/tempurUK/includes/js/milonic/mmenudom.js'>\/script>");
|
||||
}
|
||||
-->
|
||||
</script>
|
||||
<script language="JavaScript" src="/tempurUK/includes/js/milonic/mmenudom.js"></script>
|
||||
<script>
|
||||
<!--
|
||||
function $9(ap) {return _f}
|
||||
// --></script>
|
||||
|
||||
<base href="http://www.tempur.co.uk/" />
|
||||
|
||||
<link href="includes/templates/tempur/css/stylesheet.css" rel="stylesheet" type="text/css" />
|
||||
<script src="includes/templates/template_default/jscript/jscript_popup_ezpage.js" type="text/javascript"></script>
|
||||
<script src="includes/modules/pages/product_info/jscript_textarea_counter.js" type="text/javascript"></script>
|
||||
<script language="JavaScript" type="text/javascript">
|
||||
/*
|
||||
Milonic DHTML Menu - JavaScript Website Navigation System.
|
||||
Copyright 2004 (c) Milonic Solutions Limited. All Rights Reserved.
|
||||
Version 5+ Data File structure is the property of Milonic Solutions Ltd and must only be used in Milonic DHTML Products
|
||||
This is a commercial software product, please visit http://www.milonic.com/ for more information.
|
||||
See http://www.milonic.com/license.php for Commercial License Agreement
|
||||
All Copyright statements must always remain in place in all files at all times
|
||||
******* PLEASE NOTE: THIS IS NOT FREE SOFTWARE, IT MUST BE LICENSED FOR ALL USE *******
|
||||
|
||||
Configured by GDL & Associates on 20041212
|
||||
|
||||
*/
|
||||
|
||||
_menuCloseDelay=500 // The time delay for menus to remain visible on mouse out
|
||||
_menuOpenDelay=150 // The time delay before menus open on mouse over
|
||||
_subOffsetTop=10 // Sub menu top offset
|
||||
_subOffsetLeft=-10 // Sub menu left offset
|
||||
|
||||
var linkfront= "/tempuruk"
|
||||
var slinkfront="/tempuruk"
|
||||
var secureLink = "https://secure.tempurpedic.com/tempuruk"
|
||||
|
||||
// main-menu styles
|
||||
with(mainStyle=new mm_style()){
|
||||
borderwidth=0;
|
||||
}
|
||||
// sub-menu styles
|
||||
with(subStyle=new mm_style()){
|
||||
onbgcolor="#7e97a3";
|
||||
oncolor="#f2edd1";
|
||||
offbgcolor="#F2EDD1";
|
||||
offcolor="#7e97a3";
|
||||
bordercolor="#E2DFB7";
|
||||
borderstyle="solid";
|
||||
borderwidth=1;
|
||||
separatorcolor="#FFFFFF";
|
||||
separatorsize="1";
|
||||
padding=4;
|
||||
fontsize="90%";
|
||||
fontstyle="normal";
|
||||
fontfamily=" Arial, Helvetica, sans-serif";
|
||||
pagecolor="B85212";
|
||||
pagebgcolor="#EEEFE7";
|
||||
headercolor="#756C5A";
|
||||
headerbgcolor="#ffffff";
|
||||
subimage="/tempurUK/images/milonic_arrow.gif";
|
||||
subimagepadding="3";
|
||||
//overfilter="Fade(duration=0.2);Alpha(opacity=90);Shadow(color='#777777', Direction=135, Strength=5)";
|
||||
//outfilter="randomdissolve(duration=0.2)";
|
||||
itemheight=15;
|
||||
}
|
||||
|
||||
|
||||
// Note: Main menu is defined in each document's <body>, rather than here.
|
||||
// This is done in order to use relative positioning, as the home page
|
||||
// in particular, will need to place the menu in a different position than
|
||||
// the rest of the pages on the site. So, instead, the menu is positioned
|
||||
// inside of an HTML table. -rory
|
||||
|
||||
// Company Menu
|
||||
with(milonic=new menuname("Company")){
|
||||
style=subStyle;
|
||||
overflow="scroll";
|
||||
aI("text=NASA Space Technology;showmenu=NASA;url=" + linkfront + "/company/nasa/;");
|
||||
aI("text=The TEMPUR History;url=" + linkfront + "/company/history/;");
|
||||
aI("text=Endorsements;url=" + linkfront + "/company/endorsements/;");
|
||||
aI("text=Guarantee;url=" + linkfront + "/warranty/guarantee/;");
|
||||
aI("text=TEMPUR Med;url=http://www.tempurmed.co.uk/page3736.asp;target=windowname;targetfeatures=width=900,height=500");
|
||||
aI("text=Hotels;url=" + linkfront + "/hotels/;");
|
||||
aI("text=FAQs;url=" + linkfront + "/faq/;");
|
||||
aI("text=Contact Us;url=" + linkfront + "/company/contactus/;");
|
||||
aI("text=Press Room;url=" + linkfront + "/company/pressroom/;");
|
||||
}
|
||||
|
||||
// NASA Menu
|
||||
with(milonic=new menuname("NASA")){
|
||||
style=subStyle;
|
||||
overflow="scroll";
|
||||
aI("text=Recognised By NASA;url=" + linkfront + "/company/nasa/recognition/;");
|
||||
aI("text=Certificate of Achievement;url=" + linkfront + "/company/nasa/certificate/;");
|
||||
}
|
||||
|
||||
// Endorsements Menu
|
||||
with(milonic=new menuname("Endorsements")){
|
||||
style=subStyle;
|
||||
overflow="scroll";
|
||||
aI("text=Consumer Endorsement;url=" + linkfront + "/company/endorsements/ConsumerEndorsement/;");
|
||||
aI("text=Consumer Surveys;url=" + linkfront + "/company/endorsements/ConsumerSurveys/;");
|
||||
aI("text=Medical Endorsements;url=" + linkfront + "/company/endorsements/MedicalEndorsements/;");
|
||||
}
|
||||
|
||||
// Contact Us Menu
|
||||
with(milonic=new menuname("Contact_us")){
|
||||
style=subStyle;
|
||||
overflow="scroll";
|
||||
aI("text=Terms & Conditions Online Sale;url=" + linkfront + "/material/TermsConditions/;");
|
||||
aI("text=Terms & Conditions Web Site;url=" + linkfront + "/material/TermsConditions/websitetermsconditions/;");
|
||||
aI("text=Terms & Conditions 60-Night Trial;url=" + linkfront + "/company/contactus/60NightTrial/;");
|
||||
}
|
||||
|
||||
// Material Menu
|
||||
with(milonic=new menuname("Material")){
|
||||
style=subStyle;
|
||||
overflow="scroll";
|
||||
aI("text=60 Night Trial;url=" + linkfront + "/60night/;");
|
||||
aI("text=Terms & Conditions;url=" + linkfront + "/material/TermsConditions/;");
|
||||
aI("text=Free Information Pack;url=" + linkfront + "/freeinfo/;");
|
||||
aI("text=Developed for Space;url=" + linkfront + "/material/nasa/;");
|
||||
aI("text=A Comfort Revolution;url=" + linkfront + "/material/comfortrevolution/;");
|
||||
aI("text=TEMPUR Improves Sleep Quality;url=" + linkfront + "/material/sleepquality/;");
|
||||
aI("text=Relieves & Improves Back Pain;url=" + linkfront + "/material/backpain/;");
|
||||
aI("text=Used In Healthcare;url=" + linkfront + "/material/healthcare/;");
|
||||
}
|
||||
|
||||
// Mattresses Menu
|
||||
with(milonic=new menuname("Mattresses")){
|
||||
style=subStyle;
|
||||
overflow="scroll";
|
||||
aI("text=TEMPUR Combi Mattress;url=http://www.tempur.co.uk/tempuruk/mattresses/combi/?zenid=ac101e1c434adca39237334777e19b88");
|
||||
aI("text=TEMPUR Combi-HD™ Mattress - 20cm Depth (8 Inch);url=http://www.tempur.co.uk/tempuruk/mattresses/combihd/?zenid=ac101e1c434adca39237334777e19b88");
|
||||
aI("text=TEMPUR Combi-HD™ Mattress - 25cm Depth (10 Inch);url=http://www.tempur.co.uk/tempuruk/mattresses/combihd/25cm/?zenid=ac101e1c434adca39237334777e19b88");
|
||||
aI("text=TEMPUR Deluxe-HD™ Mattress;url=http://www.tempur.co.uk/tempuruk/mattresses/deluxe/?zenid=ac101e1c434adca39237334777e19b88");
|
||||
aI("text=TEMPUR Overlay Mattress;url=http://www.tempur.co.uk/tempuruk/mattresses/overlay/?zenid=ac101e1c434adca39237334777e19b88");
|
||||
}
|
||||
|
||||
// Pillows Menu
|
||||
with(milonic=new menuname("Pillows")){
|
||||
style=subStyle;
|
||||
overflow="scroll";
|
||||
aI("text=TEMPUR Original Pillow;url=http://www.tempur.co.uk/tempuruk/pillows/original/?zenid=ac101e1c434adca39237334777e19b88");
|
||||
aI("text=TEMPUR Classic Pillow;url=http://www.tempur.co.uk/tempuruk/pillows/classicpillow/?zenid=ac101e1c434adca39237334777e19b88");
|
||||
aI("text=TEMPUR Traditional Pillow;url=http://www.tempur.co.uk/tempuruk/pillows/traditional/?zenid=ac101e1c434adca39237334777e19b88");
|
||||
aI("text=TEMPUR Millennium Pillow;url=http://www.tempur.co.uk/tempuruk/pillows/millenniumpillow/?zenid=ac101e1c434adca39237334777e19b88");
|
||||
aI("text=TEMPUR DeLuxe Pillow;url=http://www.tempur.co.uk/tempuruk/pillows/deluxepillow/?zenid=ac101e1c434adca39237334777e19b88");
|
||||
}
|
||||
|
||||
// Small Products Menu
|
||||
with(milonic=new menuname("SmallProducts")){
|
||||
style=subStyle;
|
||||
overflow="scroll";
|
||||
aI("text=Small Products;showmenu=SmallProductsSmallProducts;url=http://www.tempur.co.uk/tempuruk/smallproducts/smallproducts/?cPath=4_5&zenid=ac101e1c434adca39237334777e19b88");
|
||||
aI("text=Travel Products;showmenu=SmallProductsTravelProducts;url=http://www.tempur.co.uk/tempuruk/comfort/travel/?cPath=4_6&zenid=ac101e1c434adca39237334777e19b88");
|
||||
}
|
||||
|
||||
// Small Products Menu
|
||||
with(milonic=new menuname("SmallProductsSmallProducts")){
|
||||
style=subStyle;
|
||||
overflow="scroll";
|
||||
aI("text=TEMPUR Seat Cushion;url=http://www.tempur.co.uk/tempuruk/comfort/comfortcushion/?zenid=ac101e1c434adca39237334777e19b88");
|
||||
aI("text=TEMPUR Lumbar Support;url=http://www.tempur.co.uk/tempuruk/comfort/lumbarsupport/?zenid=ac101e1c434adca39237334777e19b88");
|
||||
aI("text=TEMPUR PC Seat Wedge;url=http://www.tempur.co.uk/tempuruk/comfort/seatwedge/?zenid=ac101e1c434adca39237334777e19b88");
|
||||
}
|
||||
|
||||
// Travel Products Menu
|
||||
with(milonic=new menuname("SmallProductsTravelProducts")){
|
||||
style=subStyle;
|
||||
overflow="scroll";
|
||||
aI("text=TEMPUR Travel Set;url=http://www.tempur.co.uk/tempuruk/comfort/travelset/?zenid=ac101e1c434adca39237334777e19b88");
|
||||
aI("text=TEMPUR Travel Pillow;url=http://www.tempur.co.uk/tempuruk/comfort/travelneckpillow/?zenid=ac101e1c434adca39237334777e19b88");
|
||||
aI("text=TEMPUR Transit Lumbar Support;url=http://www.tempur.co.uk/tempuruk/comfort/transitlumbar/?zenid=ac101e1c434adca39237334777e19b88");
|
||||
aI("text=TEMPUR Transit Pillow;url=http://www.tempur.co.uk/tempuruk/comfort/transitpillow/?zenid=ac101e1c434adca39237334777e19b88");
|
||||
aI("text=TEMPUR Traditional Travel Pillow;url=http://www.tempur.co.uk/tempuruk/comfort/traditionaltravelpillow/?zenid=ac101e1c434adca39237334777e19b88");
|
||||
}
|
||||
|
||||
|
||||
// Beds Menu
|
||||
with(milonic=new menuname("Beds")){
|
||||
style=subStyle;
|
||||
overflow="scroll";
|
||||
aI("text=Milano;url=" + linkfront + "/beds/milano/;");
|
||||
aI("text=Toscana;url=" + linkfront + "/beds/toscana/;");
|
||||
aI("text=Verona;url=" + linkfront + "/beds/verona/;");
|
||||
aI("text=Accessories;showmenu=BedAccessories;");
|
||||
}
|
||||
|
||||
// Bed Accessories Menu
|
||||
with(milonic=new menuname("BedAccessories")){
|
||||
style=subStyle;
|
||||
overflow="scroll";
|
||||
aI("text=TEMPUR Headboard Collection;url=" + linkfront + "/beds/accessories/headboards/;");
|
||||
aI("text=Remote Controls;url=" + linkfront + "/beds/accessories/remotecontrol/;");
|
||||
}
|
||||
|
||||
</script>
|
||||
<script language="javascript" type="text/javascript"><!--
|
||||
function popupWindow(url) {
|
||||
window.open(url,'popupWindow','toolbar=no,location=no,directories=no,status=no,menubar=no,scrollbars=no,resizable=yes,copyhistory=no,width=100,height=100,screenX=150,screenY=150,top=150,left=150')
|
||||
}
|
||||
function popupWindowPrice(url) {
|
||||
window.open(url,'popupWindow','toolbar=no,location=no,directories=no,status=no,menubar=no,scrollbars=yes,resizable=yes,copyhistory=no,width=600,height=400,screenX=150,screenY=150,top=150,left=150')
|
||||
}
|
||||
//--></script>
|
||||
</head>
|
||||
<body id="productinfoBody">
|
||||
<!-- ClickTale Top part -->
|
||||
<script type="text/javascript">
|
||||
var WRInitTime=(new Date()).getTime();
|
||||
</script>
|
||||
<!-- ClickTale end of Top part -->
|
||||
|
||||
|
||||
|
||||
<!--<div id="mainWrapper">-->
|
||||
|
||||
|
||||
<!--bof-header logo and navigation display-->
|
||||
|
||||
<table align="center" bgcolor="#f2edd1" border="1" bordercolor="#ac9d6a" cellpadding="0" cellspacing="0" width="840">
|
||||
<tbody>
|
||||
<tr>
|
||||
<td valign="top">
|
||||
<!-- Begin Top Bar -->
|
||||
<script>
|
||||
drawMenus();
|
||||
</script>
|
||||
|
||||
<table align="center" bgcolor="#f2edd1" border="1" bordercolor="#ac9d6a" cellpadding="0" cellspacing="0" width="840">
|
||||
<tbody>
|
||||
<tr>
|
||||
<td align="center" valign="top">
|
||||
<table align="center" bgcolor="#ffffff" border="0" cellpadding="0" cellspacing="0" height="170" width="840">
|
||||
<tbody>
|
||||
<tr>
|
||||
<td>
|
||||
<a href="/tempuruk/mattresses/"><img alt="Order online now or free phone 08000 111 083" border="0" height="111" src="/tempurUK/images/frontpage/top_header.gif" width="560"></a></td>
|
||||
<td align="right" valign="center"><img src="/tempurUK/images/top/top.gif"></td>
|
||||
</tr>
|
||||
<tr>
|
||||
<td colspan="2">
|
||||
<script language="javascript">
|
||||
<!-- // Main Menu
|
||||
with (milonic=new menuname("Main Menu")) {
|
||||
style=mainStyle;
|
||||
top="offset=50";
|
||||
alwaysvisible=1;
|
||||
orientation="horizontal";
|
||||
position="relative";
|
||||
|
||||
aI("image=/tempurUK/images/top/home.gif;overimage=/tempurUK/images/top/home.gif;url=/");
|
||||
aI("image=/tempurUK/images/top/ourcompany.gif;overimage=/tempurUK/images/top/ourcompany.gif;showmenu=Company;url=/tempuruk/company/");
|
||||
aI("image=/tempurUK/images/top/material.gif;overimage=/tempurUK/images/top/material.gif;showmenu=Material;url=/tempuruk/material/");
|
||||
aI("image=/tempurUK/images/top/mattresses.gif;overimage=/tempurUK/images/top/mattresses.gif;showmenu=Mattresses;url=/tempuruk/mattresses/");
|
||||
aI("image=/tempurUK/images/top/pillows.gif;overimage=/tempurUK/images/top/pillows.gif;showmenu=Pillows;url=/tempuruk/pillows/");
|
||||
aI("image=/tempurUK/images/top/beds.gif;overimage=/tempurUK/images/top/beds.gif;showmenu=Beds;url=/tempuruk/beds/");
|
||||
aI("image=/tempurUK/images/top/small.gif;overimage=/tempurUK/images/top/small.gif;showmenu=SmallProducts;url=/tempuruk/smallproducts/");
|
||||
aI("image=/tempurUK/images/top/clearance.gif;overimage=/tempurUK/images/top/clearance.gif;url=/tempuruk/clearance/");
|
||||
aI("image=/tempurUK/images/top/myaccount.gif;overimage=/tempurUK/images/top/myaccount.gif;url=https://www.tempur.co.uk/index.php?main_page=login&zenid=ac101e1c434adca39237334777e19b88");
|
||||
aI("image=/tempurUK/images/top/end.gif;overimage=/tempurUK/images/top/end.gif");
|
||||
}
|
||||
|
||||
drawMenus();
|
||||
// -->
|
||||
</script>
|
||||
</td>
|
||||
</tr>
|
||||
</tbody>
|
||||
</table>
|
||||
</td>
|
||||
</tr>
|
||||
</tbody>
|
||||
</table>
|
||||
<table align="center" border="0" cellpadding="0" cellspacing="0" width="840">
|
||||
<tbody>
|
||||
<tr>
|
||||
<td>
|
||||
|
||||
<table border="0" cellpadding="0" cellspacing="0" id="contentMainWrapper" width="100%">
|
||||
<tr>
|
||||
|
||||
<td class="columnLeft" id="navColumnOne" style="width: 131px">
|
||||
<div id="navColumnOneWrapper" style="width: 131px">
|
||||
<div style="margin-bottom: 1em"><a href="/tempuruk/mattresses/"><img border="0" src="/tempurUK/images/frontpage/content_offer_gif.gif"></a></div>
|
||||
|
||||
<h3 class="leftBoxHeading" id="informationHeading">Free Information</h3>
|
||||
<div class="sideBoxContent">To get your Free Information Pack <a href="/tempuruk/freeinfo/">click here</a></div>
|
||||
|
||||
|
||||
<h3 class="leftBoxHeading TestimonialBoxHeading" id="testimonials_heading">Testimonials</h3>
|
||||
<div class="sideBoxContent TestimonialBoxContent">My husband and I just spent our first night on our new TEMPUR Mattress, and we are thrilled at the terrific night's rest we both had.</div>
|
||||
|
||||
<div style="margin-top: 0.8em; margin-bottom: 2em;"><a href="/tempuruk/material/"><img border="0" height="117" src="/tempurUK/images/AuthenticTempurMaterial.gif" width="117"></a></div>
|
||||
</div></td>
|
||||
<td valign="top">
|
||||
<!-- bof breadcrumb -->
|
||||
<div id="navBreadCrumb"> <a href="http://www.tempur.co.uk/">Home</a> <span>></span>
|
||||
<a href="http://www.tempur.co.uk/tempuruk/mattresses/?zenid=ac101e1c434adca39237334777e19b88">Mattresses</a> <span>></span>
|
||||
TEMPUR Deluxe-HD™ Mattress
|
||||
</div>
|
||||
<!-- eof breadcrumb -->
|
||||
|
||||
|
||||
<!-- bof upload alerts -->
|
||||
<!-- eof upload alerts -->
|
||||
|
||||
<div class="centerColumn" id="productGeneral">
|
||||
|
||||
<!--bof Form start-->
|
||||
<form action="http://www.tempur.co.uk/tempuruk/mattresses/deluxe/?&number_of_uploads=0&action=add_product&zenid=ac101e1c434adca39237334777e19b88" enctype="multipart/form-data" method="post" name="cart_quantity">
|
||||
<!--eof Form start-->
|
||||
|
||||
|
||||
<div class="productGeneral biggerText" id="productDescription">
|
||||
|
||||
<table align="center" border="0" width="100%">
|
||||
<tbody>
|
||||
|
||||
<tr>
|
||||
<td>
|
||||
<p align="left" class="bodyText"><span class="titleText" data-scrapy-annotate="{"variant": 0, "annotations": {"content": "name"}}" >TEMPUR Deluxe-HD™ Mattress </span><br></p>
|
||||
<table border="0" width="100%">
|
||||
<tbody>
|
||||
<tr>
|
||||
<td width="221">
|
||||
<div align="left"><img border="1" data-scrapy-annotate="{"variant": 0, "annotations": {"src": "image_urls"}}" src="/tempurUK/images/NR/rdonlyres/1CB9C3E7-3FE9-4158-B66C-A6494B845213/391/YR2Y4795_181W.jpg"></div></td>
|
||||
<td width="222">
|
||||
<div align="left">
|
||||
<div align="left"><a href="javascript:popupWindow('index.php?main_page=popup_image_path&path=tempurUK/images/popupwindow/mattresses/DeLuxeMattress_464W.jpg&text=DeLuxe');"><img border="0" data-scrapy-annotate="{"variant": 0, "annotations": {"src": "image_urls"}}" src="/tempurUK/images/NR/rdonlyres/1CB9C3E7-3FE9-4158-B66C-A6494B845213/392/deluxe_clickto_enlarge_181W.jpg">
|
||||
<div></div></a></div></div></td></tr></tbody></table>
|
||||
<table border="0" width="100%">
|
||||
<tbody>
|
||||
<tr>
|
||||
<td data-scrapy-annotate="{"variant": 0, "annotations": {"content": "description"}}" valign="top" width="100%">
|
||||
|
||||
<p align="left" class="bodyText"> </p>
|
||||
<p align="left" class="subTitleText">Product Overview </p>
|
||||
<p align="left" class="bodyText">The TEMPUR Deluxe-HD™ Mattress combines the unique pressure relieving qualities of TEMPUR, with extra TEMPUR-HD™ soft-touch quilted into the cover, for a luxurious feel that is unparalleled in the bedroom. It not only looks luxurious, but also offers enhanced comfort. </p>
|
||||
<p align="left" class="bodyText">This 22cm mattress is constructed differently to the TEMPUR Combi-HD™ Mattress. The TEMPUR Deluxe-HD™ Mattress has a quilted velour cover with a 2cm high density "soft-touch" TEMPUR Material embedded within it. Underneath the quilted cover lies a 9cm layer of TEMPUR Material, on top of 11cm of conventional polyurethane foam.</p>
|
||||
<p align="left" class="bodyText">The Deluxe-HD™ Mattress features the new TEMPUR-Tex™ Cover with in-built humidity control. The TEMPUR-Tex™ <span class="bodyText">material allows any moisture to evaporate faster from the surface of the mattress, thus providing the consumer with a drier sleeping experience. </span></p>
|
||||
<p align="left" class="subTitleText">Product Specification</p>
|
||||
<table border="0" width="100%">
|
||||
<tbody>
|
||||
<tr>
|
||||
<td height="92" valign="top" width="39"> <img border="0" src="/tempurUK/images/NR/rdonlyres/1CB9C3E7-3FE9-4158-B66C-A6494B845213/390/deluxe_breakdown_160W2.jpg"></td>
|
||||
|
||||
<td class="bodyText" width="100%">
|
||||
<ul>
|
||||
<li>A. Quilted Cover with 2cm of HD "soft-<div align="left"> touch" TEMPUR embedded within it. </div>
|
||||
</li><li>B. 9cm TEMPUR visco-elastic temperature
|
||||
<div align="left"> sensitive material</div>
|
||||
</li><li>C. 11cm high resilient polyurethane foam
|
||||
</li><li>15 year limited guarantee
|
||||
</li><li class="bodyText">Works in perfect partnership with the TEMPUR bed range</li></ul></td></tr></tbody></table>
|
||||
<p align="left" class="bodyText" style="margin: 0cm 0cm 0pt;" style1="">
|
||||
</p><p class="subTitleText">When you purchase a TEMPUR Mattress online you will automatically receive the 60-night trial. Please note only one mattress can be trialled per household.</p></td></tr>
|
||||
|
||||
<tr>
|
||||
<td> </td></tr>
|
||||
<tr>
|
||||
<td class="bodyText">
|
||||
<p>Please refer to our most <a class="boldBodyText" href="javascript:PopUpEZPageWindow('/tempuruk/warranty/60NightFAQ?ezpopup=1', 600, 500);"><u>Frequently Asked Questions</u></a> to ensure that you know all the facts about our 60-night trial offer.</p>
|
||||
<p class="boldBodyText"><a href="javascript:PopUpEZPageWindow('/tempuruk/warranty/genuinetempur?ezpopup=1', 400, 450);"><u>Looking to purchase TEMPUR elsewhere?</u></a></p></td></tr></tbody></table></td></tr></tbody></table>
|
||||
|
||||
<table border="0" cellpadding="0" cellspacing="0">
|
||||
<tbody onload="MM_preloadImages('/tempurUK/images/addtobag_on.gif')">
|
||||
<tr>
|
||||
<td background="/tempurUK/images/RTB/readytobuy_left.gif" height="25" valign="top"><img src="/tempurUK/images/RTB/readytobuy_top_left.gif"></td>
|
||||
<td align="right" height="25" style="background: #F9F6EF url(/tempurUK/images/RTB/readytobuy_top_1.gif) top left repeat-x;" valign="top" width="100%">
|
||||
<img src="/tempurUK/images/RTB/readytobuy_top_middle.gif">
|
||||
</td>
|
||||
<td background="/tempurUK/images/RTB/readytobuy_right.gif" height="25" valign="top" width="28"><img src="/tempurUK/images/RTB/readytobuy_top_right.gif"></td>
|
||||
</tr>
|
||||
<tr>
|
||||
<td background="/tempurUK/images/RTB/readytobuy_left.gif"><img src="/tempurUK/images/RTB/readytobuy_left.gif"></td>
|
||||
<td align="center" style="background: #F9F6EF">
|
||||
<table border="0" cellpadding="0" cellspacing="0" width="100%">
|
||||
<tbody><tr>
|
||||
<td align="center" class="StartingAtOnly" height="25" valign="middle"><span data-scrapy-annotate="{"variant": 0, "annotations": {"content": "price"}}" id="Addtocart1_lpriceLBL">From £1,049.00</span></td>
|
||||
</tr>
|
||||
<tr>
|
||||
<td align="center" class="DropDownText" height="25" id="ready_to_buy_options" valign="middle"><select id="attrib-4" name="id[4]">
|
||||
<option selected="selected" value="22">Tempur Deluxe-HD Mattress 2'6" x 6'6" (75 x 200 x 22 cm) ( £1,049.00 )</option>
|
||||
<option value="23">Tempur Deluxe-HD Mattress 3' x 6'3" (90 x 190 x 22 cm) ( £1,149.00 )</option>
|
||||
<option value="24">Tempur Deluxe-HD Mattress 3' x 6'6" (90 x 200 x 22 cm) ( £1,249.00 )</option>
|
||||
<option value="25">Tempur Deluxe-HD Mattress 4'6" x 6'3" (135 x 190 x 22 cm) ( £1,898.99 )</option>
|
||||
<option value="26">Tempur Deluxe-HD Mattress 5' x 6'6" (150 x 200 x 22 cm) ( £2,099.00 )</option>
|
||||
<option value="27">Tempur Deluxe-HD Mattress 5'3 x 6'6" (160 x 200 x 22 cm) ( £2,149.00 )</option>
|
||||
<option value="28">Tempur Deluxe-HD Mattress 6' x 6'6" (180 x 200 x 22 cm) ( £2,199.00 )</option>
|
||||
</select>
|
||||
|
||||
</td>
|
||||
</tr>
|
||||
</tbody></table>
|
||||
</td>
|
||||
<td background="/tempurUK/images/RTB/readytobuy_right.gif"><img src="/tempurUK/images/RTB/readytobuy_right.gif"></td>
|
||||
</tr>
|
||||
<tr>
|
||||
<td background="/tempurUK/images/RTB/readytobuy_left.gif" height="45" valign="bottom"><img src="/tempurUK/images/RTB/readytobuy_btm_left.gif"></td>
|
||||
<td align="right" style="background: #F9F6EF url(/tempurUK/images/RTB/readytobuy_btm_1.gif) bottom left repeat-x; padding-bottom: 19px" valign="bottom" width="100%"><input name="cart_quantity" type="hidden" value="1" /><input name="products_id" type="hidden" value="4" /><input alt="Add to Cart" src="includes/templates/tempur/buttons/english/button_in_cart.gif" title=" Add to Cart " type="image" /></td>
|
||||
<td background="/tempurUK/images/RTB/readytobuy_right.gif" valign="bottom" width="28"><img src="/tempurUK/images/RTB/readytobuy_btm_right.gif"></td>
|
||||
</tr>
|
||||
</tbody>
|
||||
</table>
|
||||
|
||||
<table cellspacing="0" width="100%">
|
||||
<tbody>
|
||||
<tr>
|
||||
<td><span class="boldBodyText"><a href="/tempuruk/checkout/vatexemption/" style="text-decoration:underline">You may be eligible for VAT relief</a></span></td>
|
||||
</tr>
|
||||
<tr>
|
||||
<td align="center" width="100%">
|
||||
<table border="0" id="Dimensions1_tblDim">
|
||||
<tbody>
|
||||
<tr>
|
||||
<td align="left"><p class="subTitleText" id="Dimensions1_Label2">Dimensions</p></td>
|
||||
</tr>
|
||||
<tr>
|
||||
<td align="center" class="bodytext" data-scrapy-annotate="{"variant": 0, "annotations": {"content": "features"}}" >
|
||||
<table border="1" bordercolor="#7e97a3" cellspacing="0" class="DimensionsTable" id="Dimensions1_dgDimensions" rules="all" width="100%">
|
||||
<tbody>
|
||||
<tr>
|
||||
<td class="subTitleText" width="30%">Size (Inches)</td><td class="subTitleText" width="40%">Size (Centimetres*)</td><td class="subTitleText" width="30%">Size</td>
|
||||
</tr><tr>
|
||||
<td class="bodytext"><font face="Verdana,Arial,Helvetica,sans-serif">3' x 6'3"</font></td><td class="bodytext"><font face="Verdana,Arial,Helvetica,sans-serif">90 x 190 x 22 cm</font></td><td class="bodytext"><font face="Verdana,Arial,Helvetica,sans-serif">Single (Standard)</font></td>
|
||||
</tr><tr>
|
||||
|
||||
<td class="bodytext"><font face="Verdana,Arial,Helvetica,sans-serif">3' x 6'6"</font></td><td class="bodytext"><font face="Verdana,Arial,Helvetica,sans-serif">90 x 200 x 22 cm</font></td><td class="bodytext"><font face="Verdana,Arial,Helvetica,sans-serif">Single (Long)</font></td>
|
||||
</tr><tr>
|
||||
<td class="bodytext"><font face="Verdana,Arial,Helvetica,sans-serif">4'6" x 6'3"</font></td><td class="bodytext"><font face="Verdana,Arial,Helvetica,sans-serif">135 x 190 x 22 cm</font></td><td class="bodytext"><font face="Verdana,Arial,Helvetica,sans-serif">Double</font></td>
|
||||
</tr><tr>
|
||||
<td class="bodytext"><font face="Verdana,Arial,Helvetica,sans-serif">5' x 6'6"</font></td><td class="bodytext"><font face="Verdana,Arial,Helvetica,sans-serif">150 x 200 x 22 cm</font></td><td class="bodytext"><font face="Verdana,Arial,Helvetica,sans-serif">King</font></td>
|
||||
|
||||
</tr><tr>
|
||||
<td class="bodytext"><font face="Verdana,Arial,Helvetica,sans-serif">5'3 x 6'6"</font></td><td class="bodytext"><font face="Verdana,Arial,Helvetica,sans-serif">160 x 200 x 22 cm</font></td><td class="bodytext"><font face="Verdana,Arial,Helvetica,sans-serif">Euro King</font></td>
|
||||
</tr><tr>
|
||||
<td class="bodytext"><font face="Verdana,Arial,Helvetica,sans-serif">6' x 6'6"</font></td><td class="bodytext"><font face="Verdana,Arial,Helvetica,sans-serif">180 x 200 x 22 cm</font></td><td class="bodytext"><font face="Verdana,Arial,Helvetica,sans-serif">Super King</font></td>
|
||||
</tr>
|
||||
</tbody>
|
||||
</table>
|
||||
</td>
|
||||
</tr>
|
||||
<tr>
|
||||
<td align="center"><span id="Dimensions1_lblText"><p class="bodyText">*Please Note: Mattress sizes are approximate. Please allow for a 2cm tolerance.</p><p class="subTitleText">Can't find the size you are looking for?</p><p class="bodyText">Special Size Mattresses are available on request, Please contact our Direct Sales Team on <span class="titleText">08000 111 083</span> for further details.</p></span></td>
|
||||
</tr>
|
||||
</tbody>
|
||||
</table>
|
||||
</td>
|
||||
</tr>
|
||||
</tbody>
|
||||
</table></div>
|
||||
<!--bof Form close-->
|
||||
</form>
|
||||
<!--bof Form close-->
|
||||
</div>
|
||||
</td>
|
||||
|
||||
<td class="columnRight" id="navColumnTwo" style="width: 131px">
|
||||
<div id="navColumnTwoWrapper" style="width: 131px">
|
||||
<div class="rightBoxContainer" style="width: 131px">
|
||||
<h3 class="rightBoxHeading SectionMenuBoxHeading">Mattresses</h3>
|
||||
<div class="sideBoxContent SectionMenuBoxContent">
|
||||
<p class="menuTextOver">
|
||||
<a class="menuTextOver" href="http://www.tempur.co.uk/tempuruk/mattresses/combi/?zenid=ac101e1c434adca39237334777e19b88">
|
||||
TEMPUR Combi Mattress
|
||||
</a>
|
||||
</p>
|
||||
<p class="menuTextOver">
|
||||
<a class="menuTextOver" href="http://www.tempur.co.uk/tempuruk/mattresses/combihd/?zenid=ac101e1c434adca39237334777e19b88">
|
||||
TEMPUR Combi-HD™ Mattress - 20cm Depth (8 Inch)
|
||||
</a>
|
||||
</p>
|
||||
<p class="menuTextOver">
|
||||
<a class="menuTextOver" href="http://www.tempur.co.uk/tempuruk/mattresses/combihd/25cm/?zenid=ac101e1c434adca39237334777e19b88">
|
||||
TEMPUR Combi-HD™ Mattress - 25cm Depth (10 Inch)
|
||||
</a>
|
||||
</p>
|
||||
<p class="menuTextOver">
|
||||
<a class="menuTextOver" href="http://www.tempur.co.uk/tempuruk/mattresses/deluxe/?zenid=ac101e1c434adca39237334777e19b88">
|
||||
TEMPUR Deluxe-HD™ Mattress
|
||||
</a>
|
||||
</p>
|
||||
<p class="menuTextOver">
|
||||
<a class="menuTextOver" href="http://www.tempur.co.uk/tempuruk/mattresses/overlay/?zenid=ac101e1c434adca39237334777e19b88">
|
||||
TEMPUR Overlay Mattress
|
||||
</a>
|
||||
</p>
|
||||
</div>
|
||||
</div><!--// bof: shoppingcart //-->
|
||||
<div class="rightBoxContainer" id="shoppingcart" style="width: 131px">
|
||||
<h3 class="rightBoxHeading" id="shoppingcartHeading"><a href="http://www.tempur.co.uk/index.php?main_page=shopping_cart&zenid=ac101e1c434adca39237334777e19b88">Shopping Cart [more]</a></h3>
|
||||
<div class="sideBoxContent" id="shoppingcartContent"><div id="cartBoxEmpty">Your cart is empty.</div></div></div>
|
||||
<!--// eof: shoppingcart //-->
|
||||
|
||||
<!--// bof: protxdirectcardsaccepted //-->
|
||||
<div class="rightBoxContainer" id="protxdirectcardsaccepted" style="width: 131px">
|
||||
<h3 class="rightBoxHeading" id="protxdirectcardsacceptedHeading">Cards Accepted</h3>
|
||||
<div class="sideBoxContent centeredContent" id="protxdirectcardsacceptedContent">
|
||||
<img alt="Visa" class="ProtxDirectCardsAcceptedSideboxCardIcon" height="25" src="includes/templates/template_default/images/card_icons/visa.png" title=" Visa " width="65" /><img alt="MasterCard" class="ProtxDirectCardsAcceptedSideboxCardIcon" height="25" src="includes/templates/template_default/images/card_icons/mc.png" title=" MasterCard " width="40" /><img alt="Visa Debit" class="ProtxDirectCardsAcceptedSideboxCardIcon" height="25" src="includes/templates/template_default/images/card_icons/visa_debit.png" title=" Visa Debit " width="40" /><img alt="Solo" class="ProtxDirectCardsAcceptedSideboxCardIcon" height="25" src="includes/templates/template_default/images/card_icons/solo.png" title=" Solo " width="20" /><img alt="Maestro" class="ProtxDirectCardsAcceptedSideboxCardIcon" height="25" src="includes/templates/template_default/images/card_icons/maestro.png" title=" Maestro " width="40" /><img alt="Visa Electron (UKE)" class="ProtxDirectCardsAcceptedSideboxCardIcon" height="25" src="includes/templates/template_default/images/card_icons/visa_electron.png" title=" Visa Electron (UKE) " width="40" /><div style="clear: left;"> </div>
|
||||
<img alt="Verified By Visa" class="ProtxDirectCardsAcceptedSidebox3DSecureIcon" height="34" src="includes/templates/template_default/images/card_icons/verified_by_visa_small.png" title=" Verified By Visa " width="60" />
|
||||
<img alt="MasterCard SecureCode" class="ProtxDirectCardsAcceptedSidebox3DSecureIcon" height="34" src="includes/templates/template_default/images/card_icons/mastercard_securecode_small.png" title=" MasterCard SecureCode " width="57" />
|
||||
<div style="clear: left;"> </div>
|
||||
<img alt="Secured by Protx" class="ProtxDirectCardsAcceptedSideboxProtxIcon" height="43" src="includes/templates/template_default/images/card_icons/protx_secured.png" title=" Secured by Protx " width="118" />
|
||||
|
||||
</div></div>
|
||||
<!--// eof: protxdirectcardsaccepted //-->
|
||||
|
||||
|
||||
<div style="margin-top: 1em;"><a href="/tempuruk/company/nasa"><img border="0" height="138" src="/tempurUK/images/nasa.gif" width="133"></a></div>
|
||||
|
||||
</div></td>
|
||||
</tr>
|
||||
</table>
|
||||
|
||||
</td>
|
||||
</tr>
|
||||
|
||||
<tr>
|
||||
<td style="background: url(/tempurUK/images/bottom.gif) bottom left repeat-x; padding-top: 4em;" />
|
||||
<table align="center" border="0" cellpadding="5" height="76" width="90%">
|
||||
<tbody>
|
||||
<tr class="navi">
|
||||
<td>
|
||||
<div align="center"><a href="/tempuruk/freeinfo/">FREE INFO PACK</a></div>
|
||||
</td>
|
||||
<td>
|
||||
<div align="center"> </div>
|
||||
</td>
|
||||
<td>
|
||||
<div align="center"><a href="/tempuruk/company/contactus/">CONTACT US</a></div>
|
||||
</td>
|
||||
<td>
|
||||
<div align="center"><a href="/tempuruk/material/">NIGHT NIGHT BACK PAIN</a></div>
|
||||
</td>
|
||||
<td>
|
||||
<div align="center"><a href="/tempuruk/material/sleepquality/">THE BEST NIGHT'S SLEEP</a></div>
|
||||
</td>
|
||||
<td>
|
||||
<div align="center"><a href="/tempuruk/mattresses/">MATTRESSES</a></div>
|
||||
</td>
|
||||
<td>
|
||||
<div align="center"><a href="/tempuruk/pillows/">PILLOWS</a></div>
|
||||
</td>
|
||||
<td>
|
||||
<img alt="" height="76" src="/tempurUK/images/spacer.gif" width="1" />
|
||||
</td>
|
||||
</tr>
|
||||
</tbody>
|
||||
</table>
|
||||
</td>
|
||||
</tr>
|
||||
<tr>
|
||||
<td align="center" style="padding-bottom: 1em;">
|
||||
<span class="BinNavigation"><a href="/tempuruk/material/TermsConditions/">*Term & Conditions</a></span>
|
||||
<span class="BinNavigation"><a href="/tempuruk/faq/faq60night/">*FAQ 60Night</a></span>
|
||||
<span class="bodyTextSmall">© 2008 TEMPUR UK Ltd. All Rights Reserved</span>
|
||||
|
||||
<span class="bodyTitle">. </span><span class="BinNavigation"><a href="/tempuruk/privacy/">PRIVACY POLICY</a></span>
|
||||
</td>
|
||||
</tr>
|
||||
</tbody>
|
||||
</table>
|
||||
</td>
|
||||
</tr>
|
||||
</tbody>
|
||||
</table>
|
||||
|
||||
|
||||
<!--</div>-->
|
||||
<!--bof- parse time display -->
|
||||
<!--eof- parse time display -->
|
||||
<!--bof- banner #6 display -->
|
||||
<!--eof- banner #6 display -->
|
||||
<!-- Siteimprove: Start //-->
|
||||
<script language="JavaScript" src="//ssl.siteimprove.com/js/siteanalyze.js" type="text/javascript"></script>
|
||||
<!-- Siteimprove: End //-->
|
||||
<script type="text/javascript">
|
||||
var gaJsHost = (("https:" == document.location.protocol) ? "https://ssl." : "http://www.");
|
||||
document.write(unescape("%3Cscript src='" + gaJsHost + "google-analytics.com/ga.js' type='text/javascript'%3E%3C/script%3E"));
|
||||
</script>
|
||||
<script type="text/javascript">
|
||||
var pageTracker = _gat._getTracker("UA-5947656-1");
|
||||
pageTracker._trackPageview();
|
||||
</script>
|
||||
<!-- ClickTale Bottom part -->
|
||||
<div id="ClickTaleDiv" style="display: none;"></div>
|
||||
<script src="/WRb.js" type="text/javascript"></script>
|
||||
<script type="text/javascript">
|
||||
var ClickTaleSSL=1;
|
||||
if(typeof ClickTale=='function') ClickTale(28035,1);
|
||||
</script>
|
||||
<!-- ClickTale end of Bottom part -->
|
||||
|
||||
|
||||
</body></html>
@ -0,0 +1,78 @@
[
        {
                "surrounds_attribute": "name",
                "annotation_text": null,
                "match_common_prefix": false,
                "surrounds_variant": null,
                "variant_id": null,
                "tag_attributes": [],
                "end_index": 133,
                "start_index": 132,
                "metadata": {}
        },
        {
                "surrounds_attribute": null,
                "annotation_text": null,
                "match_common_prefix": false,
                "surrounds_variant": null,
                "variant_id": null,
                "tag_attributes": [
                        [
                                "src",
                                "image_urls"
                        ]
                ],
                "end_index": 142,
                "start_index": 141,
                "metadata": {}
        },
        {
                "surrounds_attribute": null,
                "annotation_text": null,
                "match_common_prefix": false,
                "surrounds_variant": null,
                "variant_id": null,
                "tag_attributes": [
                        [
                                "src",
                                "image_urls"
                        ]
                ],
                "end_index": 149,
                "start_index": 148,
                "metadata": {}
        },
        {
                "surrounds_attribute": "description",
                "annotation_text": null,
                "match_common_prefix": false,
                "surrounds_variant": null,
                "variant_id": null,
                "tag_attributes": [],
                "end_index": 207,
                "start_index": 161,
                "metadata": {}
        },
        {
                "surrounds_attribute": "price",
                "annotation_text": null,
                "match_common_prefix": false,
                "surrounds_variant": null,
                "variant_id": null,
                "tag_attributes": [],
                "end_index": 258,
                "start_index": 257,
                "metadata": {}
        },
        {
                "surrounds_attribute": "features",
                "annotation_text": null,
                "match_common_prefix": false,
                "surrounds_variant": null,
                "variant_id": null,
                "tag_attributes": [],
                "end_index": 421,
                "start_index": 324,
                "metadata": {}
        }
]
Binary file not shown.
Binary file not shown.
@ -2,15 +2,15 @@
htmlpage.py tests
"""
import os
from gzip import GzipFile
from unittest import TestCase

from scrapy.utils.py26 import json
from scrapy.tests.test_contrib_ibl import path
from scrapy.contrib.ibl.htmlpage import parse_html, HtmlTag, HtmlDataFragment
from scrapy.tests.test_contrib_ibl.test_htmlpage_data import *
from scrapy.utils.python import unicode_to_str, str_to_unicode

SAMPLES_FILE = "samples_htmlpage.json.gz"
SAMPLES_FILE_PREFIX = os.path.join(path, "samples/samples_htmlpage")

def _encode_element(el):
    """
@ -39,24 +39,18 @@ def add_sample(source):
    Method for adding samples to test samples file
    (use from console)
    """
    samples = []
    if os.path.exists(SAMPLES_FILE):
        for line in GzipFile(os.path.join(path, SAMPLES_FILE), "r").readlines():
            samples.append(json.loads(line))
    count = 0
    while os.path.exists("%s_%d.json" % (SAMPLES_FILE_PREFIX, count)):
        count += 1

    new_sample = {"source": source}
    new_sample["parsed"] = list(parse_html(source))
    samples.append(new_sample)
    samples_file = GzipFile(os.path.join(path, SAMPLES_FILE), "wb")
    for sample in samples:
        samples_file.write(json.dumps(sample, default=_encode_element) + "\n")
    samples_file.close()
    open("%s_%d.html" % (SAMPLES_FILE_PREFIX, count), "wb").write(unicode_to_str(source))
    parsed = list(parse_html(source))
    open("%s_%d.json" % (SAMPLES_FILE_PREFIX, count), "wb")\
        .write(json.dumps(parsed, default=_encode_element, indent=8))

class TestParseHtml(TestCase):
    """Test for parse_html"""
    def _test_sample(self, sample):
        source = sample["source"]
        expected_parsed = sample["parsed"]
    def _test_sample(self, source, expected_parsed, samplecount=None):
        parsed = parse_html(source)
        count_element = 0
        count_expected = 0
@ -69,59 +63,70 @@ class TestParseHtml(TestCase):
            element_text = source[element.start:element.end]
            expected_text = source[expected.start:expected.end]
            if element.start != expected.start or element.end != expected.end:
                assert False, "[%s,%s] %s != [%s,%s] %s" % (element.start, \
                errstring = "[%s,%s] %s != [%s,%s] %s" % (element.start, \
                    element.end, element_text, expected.start, \
                    expected.end, expected_text)
                if samplecount is not None:
                    errstring += " (sample %d)" % samplecount
                assert False, errstring
            if type(element) != type(expected):
                assert False, "(%s) %s != (%s) %s for text\n%s" % (count_element, \
                errstring = "(%s) %s != (%s) %s for text\n%s" % (count_element, \
                    repr(type(element)), count_expected, repr(type(expected)), element_text)
                if samplecount is not None:
                    errstring += " (sample %d)" % samplecount
                assert False, errstring
            if type(element) == HtmlTag:
                self.assertEqual(element.tag, expected.tag)
                self.assertEqual(element.attributes, expected.attributes)
                self.assertEqual(element.tag_type, expected.tag_type)
        if expected_parsed:
            errstring = "Expected %s" % repr(expected_parsed)
            if samplecount is not None:
                errstring += " (sample %d)" % samplecount
            assert False, errstring

    def test_parse(self):
        """simple parse_html test"""
        parsed = [_decode_element(d) for d in PARSED]
        sample = {"source": PAGE, "parsed": parsed}
        self._test_sample(sample)
        self._test_sample(PAGE, parsed)

    def test_site_samples(self):
        """test parse_html from real cases"""
        samples = []
        for line in GzipFile(os.path.join(path, SAMPLES_FILE), "r").readlines():
            samples.append(json.loads(line, object_hook=_decode_element))
        for sample in samples:
            self._test_sample(sample)

        count = 0
        fname = "%s_%d.json" % (SAMPLES_FILE_PREFIX, count)
        while os.path.exists(fname):
            source = str_to_unicode(open("%s_%d.html" % (SAMPLES_FILE_PREFIX, count), "rb").read())
            parsed = json.loads(str_to_unicode(open(fname, "rb").read()),\
                object_hook=_decode_element)
            self._test_sample(source, parsed, count)
            count += 1
            fname = "%s_%d.json" % (SAMPLES_FILE_PREFIX, count)

    def test_bad(self):
        """test parsing of bad html layout"""
        parsed = [_decode_element(d) for d in PARSED2]
        sample = {"source": PAGE2, "parsed": parsed}
        self._test_sample(sample)
        self._test_sample(PAGE2, parsed)

    def test_comments(self):
        """test parsing of tags inside comments"""
        parsed = [_decode_element(d) for d in PARSED3]
        sample = {"source": PAGE3, "parsed": parsed}
        self._test_sample(sample)
        self._test_sample(PAGE3, parsed)

    def test_script_text(self):
        """test parsing of tags inside scripts"""
        parsed = [_decode_element(d) for d in PARSED4]
        sample = {"source": PAGE4, "parsed": parsed}
        self._test_sample(sample)
        self._test_sample(PAGE4, parsed)

    def test_sucessive(self):
        """test parsing of sucesive cleaned elements"""
        parsed = [_decode_element(d) for d in PARSED5]
        sample = {"source": PAGE5, "parsed": parsed}
        self._test_sample(sample)
        self._test_sample(PAGE5, parsed)

    def test_sucessive2(self):
        """test parsing of sucesive cleaned elements (variant 2)"""
        parsed = [_decode_element(d) for d in PARSED6]
        sample = {"source": PAGE6, "parsed": parsed}
        self._test_sample(sample)
        self._test_sample(PAGE6, parsed)

    def test_special_cases(self):
        """some special cases tests"""

@ -137,7 +137,7 @@ PARSED2 = [
{'end': 8, 'start': 7},
{'attributes': {}, 'end': 14, 'start': 8, 'tag': u'body', 'tag_type': 1},
{'end': 15, 'start': 14},
{'attributes': {u'style': u'"margin:', u'0pt"': None, u'class': u'"MsoNormal"', u'0cm': None}, 'end': 80, 'start': 15, 'tag': u'p', 'tag_type': 2},
{'attributes': {u'style': u'"margin:', u'0pt"': None, u'class': u'"MsoNormal"', u'0cm': None}, 'end': 80, 'start': 15, 'tag': u'p', 'tag_type': 1},
{'attributes': {u'lang': u'"EN-GB"'}, 'end': 107, 'start': 80, 'tag': u'span', 'tag_type': 1},
{'end': 121, 'start': 107},
{'attributes': {}, 'end': 128, 'start': 121, 'tag': u'span', 'tag_type': 2},
@ -165,7 +165,9 @@ PARSED3 = [
{'attributes': {}, 'end': 55, 'start': 51, 'tag': u'p', 'tag_type': 2},
{'end': 70, 'start': 55},
{'attributes': {u'type': u'text/javascript'}, 'end': 101, 'start': 70, 'tag': u'script', 'tag_type': 1},
{'end': 124, 'start': 101},
{'end': 104, 'start': 101},
{'end': 118, 'start': 104},
{'end': 124, 'start': 118},
{'attributes': {}, 'end': 133, 'start': 124, 'tag': u'script', 'tag_type': 2},
{'attributes': {}, 'end': 140, 'start': 133, 'tag': u'body', 'tag_type': 2},
{'attributes': {}, 'end': 147, 'start': 140, 'tag': u'html', 'tag_type': 2}
@ -204,7 +206,8 @@ PARSED5 = [
{'end': 45, 'start': 42},
{'attributes': {}, 'end': 54, 'start': 45, 'tag': u'script', 'tag_type': 2},
{'attributes': {}, 'end': 61, 'start': 54, 'tag': u'body', 'tag_type': 2},
{'end': 91, 'start': 61},
{'end': 76, 'start': 61},
{'end': 91, 'start': 76},
{'attributes': {}, 'end': 98, 'start': 91, 'tag': u'html', 'tag_type': 2},
]

@ -215,7 +218,9 @@ PARSED6 = [
{'attributes': {}, 'end': 6, 'start': 0, 'tag': u'html', 'tag_type': 1},
{'attributes': {}, 'end': 12, 'start': 6, 'tag': u'body', 'tag_type': 1},
{'attributes': {}, 'end': 20, 'start': 12, 'tag': u'script', 'tag_type': 1},
{'end': 40, 'start': 20},
{'end': 23, 'start': 20},
{'end': 37, 'start': 23},
{'end': 40, 'start': 37},
{'attributes': {}, 'end': 49, 'start': 40, 'tag': u'script', 'tag_type': 2},
{'end': 52, 'start': 49},
{'attributes': {}, 'end': 60, 'start': 52, 'tag': u'script', 'tag_type': 1},
@ -225,3 +230,19 @@ PARSED6 = [
{'attributes': {}, 'end': 81, 'start': 74, 'tag': u'body', 'tag_type': 2},
{'attributes': {}, 'end': 88, 'start': 81, 'tag': u'html', 'tag_type': 2},
]

# Test source without ending body nor html
PAGE7 = u"""<html><body><p>veris in temporibus sub aprilis idibus</p><script>script code</script><!--comment-->"""

PARSED7 = [
{'attributes' : {}, 'end': 6, 'start': 0, 'tag': u'html', 'tag_type': 1},
{'attributes': {}, 'end': 12, 'start': 6, 'tag': u'body', 'tag_type': 1},
{'attributes': {}, 'end': 15, 'start': 12, 'tag': u'p', 'tag_type': 1},
{'end': 53, 'start': 15},
{'attributes': {}, 'end': 57, 'start': 53, 'tag': u'p', 'tag_type': 2},
{'attributes' : {}, 'end': 65, 'start': 57, 'tag': u'script', 'tag_type': 1},
{'end': 76, 'start': 65},
{'attributes' : {}, 'end': 85, 'start': 76, 'tag': u'script', 'tag_type': 2},
{'end': 99, 'start': 85},
]

@ -3,7 +3,6 @@ Unit tests for pageparsing
"""
import os
from cStringIO import StringIO
from gzip import GzipFile

from twisted.trial.unittest import TestCase, SkipTest
from scrapy.utils.python import str_to_unicode
@ -274,14 +273,13 @@ class TestPageParsing(TestCase):
        """
        Tests from real pages. More reliable and easy to build for more complicated structures
        """
        samples_file = open(os.path.join(path, "samples_pageparsing.json.gz"), "rb")
        samples = []
        for line in GzipFile(fileobj=StringIO(samples_file.read())).readlines():
            samples.append(json.loads(line))
        for sample in samples:
            source = sample["annotated"]
            annotations = sample["annotations"]
            template = HtmlPage(body=str_to_unicode(source))
        SAMPLES_FILE_PREFIX = os.path.join(path, "samples/samples_pageparsing")
        count = 0
        fname = "%s_%d.json" % (SAMPLES_FILE_PREFIX, count)
        while os.path.exists(fname):
            source = str_to_unicode(open("%s_%d.html" % (SAMPLES_FILE_PREFIX, count), "rb").read())
            annotations = json.loads(str_to_unicode(open(fname, "rb").read()))
            template = HtmlPage(body=source)
            parser = TemplatePageParser(TokenDict())
            parser.feed(template)
            for annotation in parser.annotations:
@ -293,3 +291,5 @@ class TestPageParsing(TestCase):
                else:
                    self.assertEqual(getattr(annotation, s), test_annotation[s])
            self.assertEqual(annotations, [])
            count += 1
            fname = "%s_%d.json" % (SAMPLES_FILE_PREFIX, count)
@ -4,39 +4,44 @@ from scrapy.conf import settings
from scrapy.contrib.downloadermiddleware.defaultheaders import DefaultHeadersMiddleware
from scrapy.http import Request
from scrapy.spider import BaseSpider
from scrapy.utils.test import get_crawler


class TestDefaultHeadersMiddleware(TestCase):

    def setUp(self):
        self.spider = BaseSpider('foo')
        self.mw = DefaultHeadersMiddleware()
        self.default_request_headers = dict([(k, [v]) for k, v in \
            settings.get('DEFAULT_REQUEST_HEADERS').iteritems()])
    def get_defaults_spider_mw(self):
        crawler = get_crawler()
        spider = BaseSpider('foo')
        spider.set_crawler(crawler)
        defaults = dict([(k, [v]) for k, v in \
            crawler.settings.get('DEFAULT_REQUEST_HEADERS').iteritems()])
        return defaults, spider, DefaultHeadersMiddleware()

    def test_process_request(self):
        defaults, spider, mw = self.get_defaults_spider_mw()
        req = Request('http://www.scrapytest.org')
        self.mw.process_request(req, self.spider)
        self.assertEquals(req.headers, self.default_request_headers)
        mw.process_request(req, spider)
        self.assertEquals(req.headers, defaults)

    def test_spider_default_request_headers(self):
        defaults, spider, mw = self.get_defaults_spider_mw()
        spider_headers = {'Unexistant-Header': ['value']}
        # override one of the global default headers by spider
        if self.default_request_headers:
            k = set(self.default_request_headers).pop()
        if defaults:
            k = set(defaults).pop()
            spider_headers[k] = ['__newvalue__']
        self.spider.default_request_headers = spider_headers
        spider.DEFAULT_REQUEST_HEADERS = spider_headers

        req = Request('http://www.scrapytest.org')
        self.mw.process_request(req, self.spider)
        self.assertEquals(req.headers, dict(self.default_request_headers, **spider_headers))
        mw.process_request(req, spider)
        self.assertEquals(req.headers, dict(spider_headers))

    def test_update_headers(self):
        defaults, spider, mw = self.get_defaults_spider_mw()
        headers = {'Accept-Language': ['es'], 'Test-Header': ['test']}
        req = Request('http://www.scrapytest.org', headers=headers)
        self.assertEquals(req.headers, headers)

        self.mw.process_request(req, self.spider)
        self.default_request_headers.update(headers)
        self.assertEquals(req.headers, self.default_request_headers)

        mw.process_request(req, spider)
        defaults.update(headers)
        self.assertEquals(req.headers, defaults)
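The rewritten tests above capture the central change in this branch: instead of reading the global ``scrapy.conf.settings`` singleton, each test builds a throwaway crawler and binds the spider to it, so settings are resolved through ``crawler.settings``. A minimal sketch of that pattern, using only helpers that appear in this changeset (``get_crawler`` is added to ``scrapy/utils/test.py`` further down); the ``X-Test`` header name is purely illustrative::

    from scrapy.spider import BaseSpider
    from scrapy.http import Request
    from scrapy.contrib.downloadermiddleware.defaultheaders import DefaultHeadersMiddleware
    from scrapy.utils.test import get_crawler

    # a crawler whose settings module carries one custom default header
    crawler = get_crawler({'DEFAULT_REQUEST_HEADERS': {'X-Test': 'yes'}})
    spider = BaseSpider('example')
    spider.set_crawler(crawler)  # spider now resolves settings via the crawler

    mw = DefaultHeadersMiddleware()
    req = Request('http://www.scrapytest.org')
    mw.process_request(req, spider)  # should fill in the default headers
    assert req.headers.get('X-Test') == 'yes'
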
@ -3,31 +3,32 @@ import unittest
from scrapy.contrib.downloadermiddleware.downloadtimeout import DownloadTimeoutMiddleware
from scrapy.spider import BaseSpider
from scrapy.http import Request
from scrapy.utils.test import get_crawler


class DownloadTimeoutMiddlewareTest(unittest.TestCase):

    def setUp(self):
        self.mw = DownloadTimeoutMiddleware()
        self.spider = BaseSpider('foo')
        self.req = Request('http://scrapytest.org/')
    def get_request_spider_mw(self):
        crawler = get_crawler()
        spider = BaseSpider('foo')
        spider.set_crawler(crawler)
        request = Request('http://scrapytest.org/')
        return request, spider, DownloadTimeoutMiddleware()

    def tearDown(self):
        del self.mw
        del self.spider
        del self.req

    def test_spider_has_no_download_timeout(self):
        assert self.mw.process_request(self.req, self.spider) is None
        assert 'download_timeout' not in self.req.meta
    def test_default_download_timeout(self):
        req, spider, mw = self.get_request_spider_mw()
        assert mw.process_request(req, spider) is None
        self.assertEquals(req.meta.get('download_timeout'), 180)

    def test_spider_has_download_timeout(self):
        self.spider.download_timeout = 2
        assert self.mw.process_request(self.req, self.spider) is None
        self.assertEquals(self.req.meta.get('download_timeout'), 2)
        req, spider, mw = self.get_request_spider_mw()
        spider.DOWNLOAD_TIMEOUT = 2
        assert mw.process_request(req, spider) is None
        self.assertEquals(req.meta.get('download_timeout'), 2)

    def test_request_has_download_timeout(self):
        self.spider.download_timeout = 2
        self.req.meta['download_timeout'] = 1
        assert self.mw.process_request(self.req, self.spider) is None
        self.assertEquals(self.req.meta.get('download_timeout'), 1)
        req, spider, mw = self.get_request_spider_mw()
        spider.DOWNLOAD_TIMEOUT = 2
        req.meta['download_timeout'] = 1
        assert mw.process_request(req, spider) is None
        self.assertEquals(req.meta.get('download_timeout'), 1)
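Taken together, these tests pin down the timeout resolution order: an explicit ``request.meta['download_timeout']`` beats a per-spider ``DOWNLOAD_TIMEOUT`` attribute, which in turn beats the project-wide default of 180 seconds. A condensed sketch of that chain, under the same assumptions as the tests above::

    from scrapy.spider import BaseSpider
    from scrapy.http import Request
    from scrapy.contrib.downloadermiddleware.downloadtimeout import DownloadTimeoutMiddleware
    from scrapy.utils.test import get_crawler

    crawler = get_crawler()
    spider = BaseSpider('example')
    spider.set_crawler(crawler)
    spider.DOWNLOAD_TIMEOUT = 2  # per-spider override of the 180s default

    mw = DownloadTimeoutMiddleware()
    req = Request('http://scrapytest.org/')
    mw.process_request(req, spider)
    assert req.meta['download_timeout'] == 2  # spider attribute wins over the default

    req = Request('http://scrapytest.org/', meta={'download_timeout': 1})
    mw.process_request(req, spider)
    assert req.meta['download_timeout'] == 1  # explicit meta wins over everything
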
@ -3,7 +3,7 @@ import unittest, tempfile, shutil, time
from scrapy.http import Response, HtmlResponse, Request
from scrapy.spider import BaseSpider
from scrapy.contrib.downloadermiddleware.httpcache import FilesystemCacheStorage, HttpCacheMiddleware
from scrapy.conf import Settings
from scrapy.settings import Settings
from scrapy.exceptions import IgnoreRequest

@ -3,47 +3,49 @@ from unittest import TestCase
from scrapy.spider import BaseSpider
from scrapy.http import Request
from scrapy.contrib.downloadermiddleware.useragent import UserAgentMiddleware
from scrapy.utils.test import get_crawler


class UserAgentMiddlewareTest(TestCase):

    def setUp(self):
        self.spider = BaseSpider('foo')
        self.mw = UserAgentMiddleware()

    def tearDown(self):
        del self.mw
    def get_spider_and_mw(self, default_useragent):
        crawler = get_crawler({'USER_AGENT': default_useragent})
        spider = BaseSpider('foo')
        spider.set_crawler(crawler)
        return spider, UserAgentMiddleware()

    def test_default_agent(self):
        self.mw.default_useragent = 'default_useragent'
        spider, mw = self.get_spider_and_mw('default_useragent')
        req = Request('http://scrapytest.org/')
        assert self.mw.process_request(req, self.spider) is None
        assert mw.process_request(req, spider) is None
        self.assertEquals(req.headers['User-Agent'], 'default_useragent')

        # None or not present user_agent attribute is the same
        self.spider.user_agent = None
    def test_remove_agent(self):
        # setting USER_AGENT to None should remove the user agent
        spider, mw = self.get_spider_and_mw('default_useragent')
        spider.USER_AGENT = None
        req = Request('http://scrapytest.org/')
        assert self.mw.process_request(req, self.spider) is None
        self.assertEquals(req.headers['User-Agent'], 'default_useragent')
        assert mw.process_request(req, spider) is None
        assert req.headers.get('User-Agent') is None

    def test_spider_agent(self):
        self.mw.default_useragent = 'default_useragent'
        self.spider.user_agent = 'spider_useragent'
        spider, mw = self.get_spider_and_mw('default_useragent')
        spider.USER_AGENT = 'spider_useragent'
        req = Request('http://scrapytest.org/')
        assert self.mw.process_request(req, self.spider) is None
        assert mw.process_request(req, spider) is None
        self.assertEquals(req.headers['User-Agent'], 'spider_useragent')

    def test_header_agent(self):
        self.mw.default_useragent = 'default_useragent'
        self.spider.user_agent = 'spider_useragent'
        spider, mw = self.get_spider_and_mw('default_useragent')
        spider.USER_AGENT = 'spider_useragent'
        req = Request('http://scrapytest.org/', headers={'User-Agent': 'header_useragent'})
        assert self.mw.process_request(req, self.spider) is None
        assert mw.process_request(req, spider) is None
        self.assertEquals(req.headers['User-Agent'], 'header_useragent')

    def test_no_agent(self):
        self.mw.default_useragent = None
        self.spider.user_agent = None
        spider, mw = self.get_spider_and_mw(None)
        spider.USER_AGENT = None
        req = Request('http://scrapytest.org/')
        assert self.mw.process_request(req, self.spider) is None
        assert mw.process_request(req, spider) is None
        assert 'User-Agent' not in req.headers

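The cases above fix the User-Agent precedence: a header set on the request itself beats the spider's ``USER_AGENT`` attribute, which beats the crawler-wide default, and a ``None`` attribute removes the header altogether. A short sketch of the middle case, mirroring the tests::

    from scrapy.spider import BaseSpider
    from scrapy.http import Request
    from scrapy.contrib.downloadermiddleware.useragent import UserAgentMiddleware
    from scrapy.utils.test import get_crawler

    crawler = get_crawler({'USER_AGENT': 'project_bot'})
    spider = BaseSpider('example')
    spider.set_crawler(crawler)
    spider.USER_AGENT = 'spider_bot'  # per-spider override

    mw = UserAgentMiddleware()
    req = Request('http://scrapytest.org/')
    mw.process_request(req, spider)
    assert req.headers['User-Agent'] == 'spider_bot'
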
@ -17,8 +17,7 @@ from twisted.web import server, static, util
from twisted.trial import unittest

from scrapy import signals
from scrapy.conf import Settings
from scrapy.crawler import Crawler
from scrapy.utils.test import get_crawler
from scrapy.xlib.pydispatch import dispatcher
from scrapy.tests import tests_datadir
from scrapy.spider import BaseSpider
@ -95,8 +94,7 @@ class CrawlerRun(object):
        dispatcher.connect(self.request_received, signals.request_received)
        dispatcher.connect(self.response_downloaded, signals.response_downloaded)

        settings = Settings()
        self.crawler = Crawler(settings)
        self.crawler = get_crawler()
        self.crawler.install()
        self.crawler.configure()
        self.crawler.queue.append_spider(self.spider)

@ -1,6 +1,6 @@
from twisted.trial import unittest

from scrapy.conf import Settings
from scrapy.settings import Settings
from scrapy.exceptions import NotConfigured
from scrapy.middleware import MiddlewareManager

@ -2,7 +2,7 @@ from twisted.trial import unittest
from twisted.python import failure
from twisted.internet import defer, reactor

from scrapy.conf import Settings
from scrapy.settings import Settings
from scrapy.crawler import Crawler
from scrapy.http import Request, Response
from scrapy.spider import BaseSpider

103
scrapy/tests/test_settings.py
Normal file
@ -0,0 +1,103 @@
import unittest

from scrapy.settings import Settings, SpiderSettings
from scrapy.utils.test import get_crawler
from scrapy.spider import BaseSpider

class SettingsTest(unittest.TestCase):

    def test_get(self):
        settings = Settings({
            'TEST_ENABLED1': '1',
            'TEST_ENABLED2': True,
            'TEST_ENABLED3': 1,
            'TEST_DISABLED1': '0',
            'TEST_DISABLED2': False,
            'TEST_DISABLED3': 0,
            'TEST_INT1': 123,
            'TEST_INT2': '123',
            'TEST_FLOAT1': 123.45,
            'TEST_FLOAT2': '123.45',
            'TEST_LIST1': ['one', 'two'],
            'TEST_LIST2': 'one,two',
            'TEST_STR': 'value',
        })
        assert settings.getbool('TEST_ENABLED1') is True
        assert settings.getbool('TEST_ENABLED2') is True
        assert settings.getbool('TEST_ENABLED3') is True
        assert settings.getbool('TEST_ENABLEDx') is False
        assert settings.getbool('TEST_ENABLEDx', True) is True
        assert settings.getbool('TEST_DISABLED1') is False
        assert settings.getbool('TEST_DISABLED2') is False
        assert settings.getbool('TEST_DISABLED3') is False
        self.assertEqual(settings.getint('TEST_INT1'), 123)
        self.assertEqual(settings.getint('TEST_INT2'), 123)
        self.assertEqual(settings.getint('TEST_INTx'), 0)
        self.assertEqual(settings.getint('TEST_INTx', 45), 45)
        self.assertEqual(settings.getfloat('TEST_FLOAT1'), 123.45)
        self.assertEqual(settings.getfloat('TEST_FLOAT2'), 123.45)
        self.assertEqual(settings.getfloat('TEST_FLOATx'), 0.0)
        self.assertEqual(settings.getfloat('TEST_FLOATx', 55.0), 55.0)
        self.assertEqual(settings.getlist('TEST_LIST1'), ['one', 'two'])
        self.assertEqual(settings.getlist('TEST_LIST2'), ['one', 'two'])
        self.assertEqual(settings.getlist('TEST_LISTx'), [])
        self.assertEqual(settings.getlist('TEST_LISTx', ['default']), ['default'])
        self.assertEqual(settings['TEST_STR'], 'value')
        self.assertEqual(settings.get('TEST_STR'), 'value')
        self.assertEqual(settings['TEST_STRx'], None)
        self.assertEqual(settings.get('TEST_STRx'), None)
        self.assertEqual(settings.get('TEST_STRx', 'default'), 'default')

class CrawlerSettingsTest(unittest.TestCase):

    def test_global_defaults(self):
        crawler = get_crawler()
        self.assertEqual(crawler.settings.getint('DOWNLOAD_TIMEOUT'), 180)

    def test_defaults(self):
        crawler = get_crawler()
        crawler.settings.defaults['DOWNLOAD_TIMEOUT'] = '99'
        self.assertEqual(crawler.settings.getint('DOWNLOAD_TIMEOUT'), 99)

    def test_settings_module(self):
        crawler = get_crawler({'DOWNLOAD_TIMEOUT': '3'})
        self.assertEqual(crawler.settings.getint('DOWNLOAD_TIMEOUT'), 3)

    def test_overrides(self):
        crawler = get_crawler({'DOWNLOAD_TIMEOUT': '3'})
        crawler.settings.overrides['DOWNLOAD_TIMEOUT'] = '15'
        self.assertEqual(crawler.settings.getint('DOWNLOAD_TIMEOUT'), 15)

class SpiderSettingsTest(unittest.TestCase):

    def test_global_defaults(self):
        crawler = get_crawler()
        settings = SpiderSettings(BaseSpider('name'), crawler.settings)
        self.assertEqual(settings.getint('DOWNLOAD_TIMEOUT'), 180)

    def test_defaults(self):
        crawler = get_crawler()
        crawler.settings.defaults['DOWNLOAD_TIMEOUT'] = '99'
        settings = SpiderSettings(BaseSpider('name'), crawler.settings)
        self.assertEqual(settings.getint('DOWNLOAD_TIMEOUT'), 99)

    def test_crawler_defaults(self):
        crawler = get_crawler({'DOWNLOAD_TIMEOUT': '3'})
        settings = SpiderSettings(BaseSpider('name'), crawler.settings)
        self.assertEqual(settings.getint('DOWNLOAD_TIMEOUT'), 3)

    def test_spider_overrides_crawler(self):
        crawler = get_crawler({'DOWNLOAD_TIMEOUT': '3'})
        crawler.settings.defaults['DOWNLOAD_TIMEOUT'] = '99'
        settings = SpiderSettings(BaseSpider('name', DOWNLOAD_TIMEOUT='12'), crawler.settings)
        self.assertEqual(settings.getint('DOWNLOAD_TIMEOUT'), 12)

    def test_overrides_most_precedence(self):
        crawler = get_crawler({'DOWNLOAD_TIMEOUT': '3'})
        crawler.settings.overrides['DOWNLOAD_TIMEOUT'] = '15'
        settings = SpiderSettings(BaseSpider('name', DOWNLOAD_TIMEOUT='12'), crawler.settings)
        self.assertEqual(settings.getint('DOWNLOAD_TIMEOUT'), 15)

if __name__ == "__main__":
    unittest.main()

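Read as a whole, the new test module spells out the settings lookup chain this branch introduces, from lowest to highest precedence: built-in defaults, the crawler's settings module, per-spider attributes, and explicit overrides. A condensed sketch, reusing only classes exercised above::

    from scrapy.settings import SpiderSettings
    from scrapy.spider import BaseSpider
    from scrapy.utils.test import get_crawler

    crawler = get_crawler({'DOWNLOAD_TIMEOUT': '3'})    # settings module says 3
    spider = BaseSpider('name', DOWNLOAD_TIMEOUT='12')  # spider attribute says 12
    settings = SpiderSettings(spider, crawler.settings)
    assert settings.getint('DOWNLOAD_TIMEOUT') == 12    # spider beats the module

    crawler.settings.overrides['DOWNLOAD_TIMEOUT'] = '15'
    assert settings.getint('DOWNLOAD_TIMEOUT') == 15    # overrides beat everything
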
9
scrapy/utils/deprecate.py
Normal file
@ -0,0 +1,9 @@
"""Some helpers for deprecation messages"""

import warnings

def attribute(obj, oldattr, newattr, version='0.12'):
    cname = obj.__class__.__name__
    warnings.warn("%s.%s attribute is deprecated and will be no longer supported "
        "in Scrapy %s, use %s.%s attribute instead" % \
        (cname, oldattr, version, cname, newattr), DeprecationWarning, stacklevel=3)
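The ``stacklevel=3`` suggests the helper is meant to be called from a small wrapper such as a property, so the warning points at the user's code rather than at the wrapper. A hypothetical usage sketch (the ``download_delay``/``DOWNLOAD_DELAY`` pairing here is illustrative, not part of this file)::

    from scrapy.utils import deprecate

    class ExampleSpider(object):
        DOWNLOAD_DELAY = 2

        @property
        def download_delay(self):
            # warn about the old lowercase name, then forward to the new one
            deprecate.attribute(self, 'download_delay', 'DOWNLOAD_DELAY')
            return self.DOWNLOAD_DELAY
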
@ -7,6 +7,9 @@ import os
import libxml2
from twisted.trial.unittest import SkipTest

from scrapy.crawler import Crawler
from scrapy.settings import CrawlerSettings

def libxml2debug(testfunction):
    """Decorator for debugging libxml2 memory leaks inside a function.

@ -39,3 +42,17 @@ def assert_aws_environ():

    if 'AWS_ACCESS_KEY_ID' not in os.environ:
        raise SkipTest("AWS keys not found")

def get_crawler(settings_dict=None):
    """Return an unconfigured Crawler object. If settings_dict is given, it
    will be used as the settings present in the settings module of the
    CrawlerSettings.
    """
    class SettingsModuleMock(object):
        pass
    settings_module = SettingsModuleMock()
    if settings_dict:
        for k, v in settings_dict.items():
            setattr(settings_module, k, v)
    settings = CrawlerSettings(settings_module)
    return Crawler(settings)
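Because ``get_crawler`` fabricates an in-memory settings module, a test can inject arbitrary settings without touching environment variables or any project settings file, for example::

    from scrapy.utils.test import get_crawler

    crawler = get_crawler({'DOWNLOAD_TIMEOUT': '3'})
    # the values land at the settings-module level of CrawlerSettings,
    # so the usual typed accessors apply
    assert crawler.settings.getint('DOWNLOAD_TIMEOUT') == 3
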
@ -18,7 +18,6 @@ def get_spider_list_from_eggfile(eggfile, project):
    env = os.environ.copy()
    env['SCRAPY_PROJECT'] = project
    env['SCRAPY_EGGFILE'] = f.name
    env.pop('SCRAPY_SETTINGS_DISABLED', None)
    proc = Popen(pargs, stdout=PIPE, cwd=tmpdir, env=env)
    out = proc.communicate()[0]
    return out.splitlines()