mirror of https://github.com/scrapy/scrapy.git, synced 2025-02-23 19:43:39 +00:00
Merge pull request #510 from dangra/kmike-reanme-base-spider
Rename BaseSpider to Spider. Fixes #495, fixes #501.
commit d7fbccdff7
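The rename is mechanical: every ``from scrapy.spider import BaseSpider``
becomes ``from scrapy.spider import Spider``, and a deprecated ``BaseSpider``
alias is kept so existing spiders keep working. A minimal before/after sketch
(the spider names and URL here are illustrative, not taken from the diff)::

    # Old style: still works through the deprecated alias, but subclassing
    # BaseSpider now emits a ScrapyDeprecationWarning.
    from scrapy.spider import BaseSpider

    class OldStyleSpider(BaseSpider):
        name = 'oldstyle'

    # New style: subclass Spider instead.
    from scrapy.spider import Spider

    class NewStyleSpider(Spider):
        name = 'newstyle'
        start_urls = ['http://example.com']

        def parse(self, response):
            self.log('visited %s' % response.url)

    # The shim keeps issubclass()/isinstance() checks working, as the
    # tests added in this commit assert:
    assert issubclass(NewStyleSpider, BaseSpider)
    assert isinstance(NewStyleSpider(), BaseSpider)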
@@ -97,18 +97,18 @@ of domains).
 They define an initial list of URLs to download, how to follow links, and how
 to parse the contents of those pages to extract :ref:`items <topics-items>`.

-To create a Spider, you must subclass :class:`scrapy.spider.BaseSpider`, and
+To create a Spider, you must subclass :class:`scrapy.spider.Spider`, and
 define the three main, mandatory, attributes:

-* :attr:`~scrapy.spider.BaseSpider.name`: identifies the Spider. It must be
+* :attr:`~scrapy.spider.Spider.name`: identifies the Spider. It must be
   unique, that is, you can't set the same name for different Spiders.

-* :attr:`~scrapy.spider.BaseSpider.start_urls`: is a list of URLs where the
+* :attr:`~scrapy.spider.Spider.start_urls`: is a list of URLs where the
   Spider will begin to crawl from. So, the first pages downloaded will be those
   listed here. The subsequent URLs will be generated successively from data
   contained in the start URLs.

-* :meth:`~scrapy.spider.BaseSpider.parse` is a method of the spider, which will
+* :meth:`~scrapy.spider.Spider.parse` is a method of the spider, which will
   be called with the downloaded :class:`~scrapy.http.Response` object of each
   start URL. The response is passed to the method as the first and only
   argument.
@@ -116,16 +116,16 @@ define the three main, mandatory, attributes:
 This method is responsible for parsing the response data and extracting
 scraped data (as scraped items) and more URLs to follow.

-The :meth:`~scrapy.spider.BaseSpider.parse` method is in charge of processing
+The :meth:`~scrapy.spider.Spider.parse` method is in charge of processing
 the response and returning scraped data (as :class:`~scrapy.item.Item`
 objects) and more URLs to follow (as :class:`~scrapy.http.Request` objects).

 This is the code for our first Spider; save it in a file named
 ``dmoz_spider.py`` under the ``tutorial/spiders`` directory::

-    from scrapy.spider import BaseSpider
+    from scrapy.spider import Spider

-    class DmozSpider(BaseSpider):
+    class DmozSpider(Spider):
         name = "dmoz"
         allowed_domains = ["dmoz.org"]
         start_urls = [
@@ -174,7 +174,7 @@ the spider as their callback function.

 These Requests are scheduled, then executed, and
 :class:`scrapy.http.Response` objects are returned and then fed back to the
-spider, through the :meth:`~scrapy.spider.BaseSpider.parse` method.
+spider, through the :meth:`~scrapy.spider.Spider.parse` method.

 Extracting Items
 ----------------
@@ -259,7 +259,7 @@ This is what the shell looks like::
     [s] item       Item()
     [s] request    <GET http://www.dmoz.org/Computers/Programming/Languages/Python/Books/>
     [s] response   <200 http://www.dmoz.org/Computers/Programming/Languages/Python/Books/>
-    [s] spider     <BaseSpider 'default' at 0x1b6c2d0>
+    [s] spider     <Spider 'default' at 0x1b6c2d0>
     [s] Useful shortcuts:
     [s] shelp()           Print this help
     [s] fetch(req_or_url) Fetch a new request or URL and update shell objects
@@ -343,10 +343,10 @@ that property here, so::

 Let's add this code to our spider::

-    from scrapy.spider import BaseSpider
+    from scrapy.spider import Spider
     from scrapy.selector import Selector

-    class DmozSpider(BaseSpider):
+    class DmozSpider(Spider):
         name = "dmoz"
         allowed_domains = ["dmoz.org"]
         start_urls = [
@@ -386,12 +386,12 @@ Spiders are expected to return their scraped data inside
 :class:`~scrapy.item.Item` objects. So, in order to return the data we've
 scraped so far, the final code for our Spider would be like this::

-    from scrapy.spider import BaseSpider
+    from scrapy.spider import Spider
     from scrapy.selector import Selector

     from tutorial.items import DmozItem

-    class DmozSpider(BaseSpider):
+    class DmozSpider(Spider):
         name = "dmoz"
         allowed_domains = ["dmoz.org"]
         start_urls = [
@@ -190,9 +190,9 @@ Usage example::
         xmlfeed

     $ scrapy genspider -d basic
-    from scrapy.spider import BaseSpider
+    from scrapy.spider import Spider

-    class $classname(BaseSpider):
+    class $classname(Spider):
         name = "$name"
         allowed_domains = ["$domain"]
         start_urls = (
@@ -7,7 +7,7 @@ Debugging Spiders
 This document explains the most common techniques for debugging spiders.
 Consider the following scrapy spider below::

-    class MySpider(BaseSpider):
+    class MySpider(Spider):
         name = 'myspider'
         start_urls = (
             'http://example.com/page1',
@@ -91,7 +91,7 @@ single Python class that defines one or more of the following methods:
 :type request: :class:`~scrapy.http.Request` object

 :param spider: the spider for which this request is intended
-:type spider: :class:`~scrapy.spider.BaseSpider` object
+:type spider: :class:`~scrapy.spider.Spider` object

 .. method:: process_response(request, response, spider)

@@ -118,7 +118,7 @@ single Python class that defines one or more of the following methods:
 :type response: :class:`~scrapy.http.Response` object

 :param spider: the spider for which this response is intended
-:type spider: :class:`~scrapy.spider.BaseSpider` object
+:type spider: :class:`~scrapy.spider.Spider` object

 .. method:: process_exception(request, exception, spider)

@@ -149,7 +149,7 @@ single Python class that defines one or more of the following methods:
 :type exception: an ``Exception`` object

 :param spider: the spider for which this request is intended
-:type spider: :class:`~scrapy.spider.BaseSpider` object
+:type spider: :class:`~scrapy.spider.Spider` object

 .. _topics-downloader-middleware-ref:

@@ -37,7 +37,7 @@ single Python class that must implement the following method:
 :type item: :class:`~scrapy.item.Item` object

 :param spider: the spider which scraped the item
-:type spider: :class:`~scrapy.spider.BaseSpider` object
+:type spider: :class:`~scrapy.spider.Spider` object

 Additionally, they may also implement the following methods:

@@ -46,14 +46,14 @@ Additionally, they may also implement the following methods:
 This method is called when the spider is opened.

 :param spider: the spider which was opened
-:type spider: :class:`~scrapy.spider.BaseSpider` object
+:type spider: :class:`~scrapy.spider.Spider` object

 .. method:: close_spider(spider)

 This method is called when the spider is closed.

 :param spider: the spider which was closed
-:type spider: :class:`~scrapy.spider.BaseSpider` object
+:type spider: :class:`~scrapy.spider.Spider` object


 Item pipeline example
@@ -88,7 +88,7 @@ subclasses):
 * ``scrapy.http.Response``
 * ``scrapy.item.Item``
 * ``scrapy.selector.Selector``
-* ``scrapy.spider.BaseSpider``
+* ``scrapy.spider.Spider``

 A real example
 --------------
@@ -150,8 +150,8 @@ difficult to read. For this reason, that function has a ``ignore`` argument
 which can be used to ignore a particular class (and all its subclases). For
 example, using::

-    >>> from scrapy.spider import BaseSpider
-    >>> prefs(ignore=BaseSpider)
+    >>> from scrapy.spider import Spider
+    >>> prefs(ignore=Spider)

 Won't show any live references to spiders.

@@ -43,7 +43,7 @@ Logging from Spiders
 ====================

 The recommended way to log from spiders is by using the Spider
-:meth:`~scrapy.spider.BaseSpider.log` method, which already populates the
+:meth:`~scrapy.spider.Spider.log` method, which already populates the
 ``spider`` argument of the :func:`scrapy.log.msg` function. The other arguments
 are passed directly to the :func:`~scrapy.log.msg` function.

@@ -86,7 +86,7 @@ scrapy.log module
 :param spider: the spider to use for logging this message. This parameter
     should always be used when logging things related to a particular
     spider.
-:type spider: :class:`~scrapy.spider.BaseSpider` object
+:type spider: :class:`~scrapy.spider.Spider` object

 .. data:: CRITICAL

@@ -37,7 +37,7 @@ Request objects
 request (once its downloaded) as its first parameter. For more information
 see :ref:`topics-request-response-ref-request-callback-arguments` below.
 If a Request doesn't specify a callback, the spider's
-:meth:`~scrapy.spider.BaseSpider.parse` method will be used.
+:meth:`~scrapy.spider.Spider.parse` method will be used.
 Note that if exceptions are raised during processing, errback is called instead.

 :type callback: callable
@@ -342,7 +342,7 @@ automatically pre-populated and only override a couple of them, such as the
 user name and password. You can use the :meth:`FormRequest.from_response`
 method for this job. Here's an example spider which uses it::

-    class LoginSpider(BaseSpider):
+    class LoginSpider(Spider):
         name = 'example.com'
         start_urls = ['http://www.example.com/users/login.php']

@@ -56,10 +56,10 @@ Scrapy selectors are instances of :class:`~scrapy.selector.Selector` class
 constructed by passing a `Response` object as first argument, the response's
 body is what they're going to be "selecting"::

-    from scrapy.spider import BaseSpider
+    from scrapy.spider import Spider
     from scrapy.selector import Selector

-    class MySpider(BaseSpider):
+    class MySpider(Spider):
         # ...
         def parse(self, response):
             sel = Selector(response)
@@ -72,7 +72,7 @@ content).
 Those objects are:

 * ``spider`` - the Spider which is known to handle the URL, or a
-  :class:`~scrapy.spider.BaseSpider` object if there is no spider found for
+  :class:`~scrapy.spider.Spider` object if there is no spider found for
   the current URL

 * ``request`` - a :class:`~scrapy.http.Request` object of the last fetched
@@ -116,7 +116,7 @@ all start with the ``[s]`` prefix)::
     [s] request    <http://scrapy.org>
     [s] response   <http://scrapy.org>
     [s] settings   <Settings 'mybot.settings'>
-    [s] spider     <scrapy.spider.models.BaseSpider object at 0x2bed9d0>
+    [s] spider     <Spider 'default' at 0x2bed9d0>
     [s] Useful shortcuts:
     [s] shelp()                Prints this help.
     [s] fetch(req_or_url)      Fetch a new request or URL and update objects
@@ -136,7 +136,7 @@ After that, we can star playing with the objects::
     [s] request    <GET http://slashdot.org>
     [s] response   <200 http://slashdot.org>
     [s] settings   <Settings 'jobsbot.settings'>
-    [s] spider     <BaseSpider 'default' at 0x3c44a10>
+    [s] spider     <Spider 'default' at 0x3c44a10>
     [s] Useful shortcuts:
     [s] shelp()                Shell help (print this help)
     [s] fetch(req_or_url)      Fetch request (or URL) and update local objects
@@ -165,7 +165,7 @@ This can be achieved by using the ``scrapy.shell.inspect_response`` function.

 Here's an example of how you would call it from your spider::

-    class MySpider(BaseSpider):
+    class MySpider(Spider):
         ...

         def parse(self, response):
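For context, the ``inspect_response`` example above is truncated in this
diff; a typical complete version looks roughly like the following sketch.
The URL check and spider name are illustrative, not part of the diff, and
the single-argument call form follows the docs of this Scrapy era::

    from scrapy.shell import inspect_response
    from scrapy.spider import Spider

    class MySpider(Spider):
        name = 'myspider'

        def parse(self, response):
            if response.url == 'http://example.com/products.php':
                # Drop into an interactive shell with this response loaded
                inspect_response(response)
            # ... rest of the parsing code ...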
@@ -77,7 +77,7 @@ item_scraped
 :type response: :class:`~scrapy.http.Response` object

 :param spider: the spider which scraped the item
-:type spider: :class:`~scrapy.spider.BaseSpider` object
+:type spider: :class:`~scrapy.spider.Spider` object

 item_dropped
 ------------
@@ -94,7 +94,7 @@ item_dropped
 :type item: :class:`~scrapy.item.Item` object

 :param spider: the spider which scraped the item
-:type spider: :class:`~scrapy.spider.BaseSpider` object
+:type spider: :class:`~scrapy.spider.Spider` object

 :param exception: the exception (which must be a
     :exc:`~scrapy.exceptions.DropItem` subclass) which caused the item
@@ -113,7 +113,7 @@ spider_closed
 This signal supports returning deferreds from their handlers.

 :param spider: the spider which has been closed
-:type spider: :class:`~scrapy.spider.BaseSpider` object
+:type spider: :class:`~scrapy.spider.Spider` object

 :param reason: a string which describes the reason why the spider was closed. If
     it was closed because the spider has completed scraping, the reason
@@ -137,7 +137,7 @@ spider_opened
 This signal supports returning deferreds from their handlers.

 :param spider: the spider which has been opened
-:type spider: :class:`~scrapy.spider.BaseSpider` object
+:type spider: :class:`~scrapy.spider.Spider` object

 spider_idle
 -----------
@@ -161,7 +161,7 @@ spider_idle
 This signal does not support returning deferreds from their handlers.

 :param spider: the spider which has gone idle
-:type spider: :class:`~scrapy.spider.BaseSpider` object
+:type spider: :class:`~scrapy.spider.Spider` object

 spider_error
 ------------
@@ -178,7 +178,7 @@ spider_error
 :type response: :class:`~scrapy.http.Response` object

 :param spider: the spider which raised the exception
-:type spider: :class:`~scrapy.spider.BaseSpider` object
+:type spider: :class:`~scrapy.spider.Spider` object


 response_received
@@ -199,7 +199,7 @@ response_received
 :type request: :class:`~scrapy.http.Request` object

 :param spider: the spider for which the response is intended
-:type spider: :class:`~scrapy.spider.BaseSpider` object
+:type spider: :class:`~scrapy.spider.Spider` object

 response_downloaded
 -------------------
@@ -218,6 +218,6 @@ response_downloaded
 :type request: :class:`~scrapy.http.Request` object

 :param spider: the spider for which the response is intended
-:type spider: :class:`~scrapy.spider.BaseSpider` object
+:type spider: :class:`~scrapy.spider.Spider` object

 .. _Failure: http://twistedmatrix.com/documents/current/api/twisted.python.failure.Failure.html
@@ -81,7 +81,7 @@ single Python class that defines one or more of the following methods:
 :type response: :class:`~scrapy.http.Response` object

 :param spider: the spider for which this response is intended
-:type spider: :class:`~scrapy.spider.BaseSpider` object
+:type spider: :class:`~scrapy.spider.Spider` object


 .. method:: process_spider_output(response, result, spider)
@@ -101,7 +101,7 @@ single Python class that defines one or more of the following methods:
 :class:`~scrapy.item.Item` objects

 :param spider: the spider whose result is being processed
-:type spider: :class:`~scrapy.item.BaseSpider` object
+:type spider: :class:`~scrapy.item.Spider` object


 .. method:: process_spider_exception(response, exception, spider)
@@ -129,7 +129,7 @@ single Python class that defines one or more of the following methods:
 :type exception: `Exception`_ object

 :param spider: the spider which raised the exception
-:type spider: :class:`scrapy.spider.BaseSpider` object
+:type spider: :class:`scrapy.spider.Spider` object

 .. method:: process_start_requests(start_requests, spider)

@@ -156,7 +156,7 @@ single Python class that defines one or more of the following methods:
 :type start_requests: an iterable of :class:`~scrapy.http.Request`

 :param spider: the spider to whom the start requests belong
-:type spider: :class:`~scrapy.item.BaseSpider` object
+:type spider: :class:`~scrapy.item.Spider` object


 .. _Exception: http://docs.python.org/library/exceptions.html#exceptions.Exception
@@ -268,7 +268,7 @@ OffsiteMiddleware
 Filters out Requests for URLs outside the domains covered by the spider.

 This middleware filters out every request whose host names aren't in the
-spider's :attr:`~scrapy.spider.BaseSpider.allowed_domains` attribute.
+spider's :attr:`~scrapy.spider.Spider.allowed_domains` attribute.

 When your spider returns a request for a domain not belonging to those
 covered by the spider, this middleware will log a debug message similar to
@@ -283,7 +283,7 @@ OffsiteMiddleware
 will be printed (but only for the first request filtered).

 If the spider doesn't define an
-:attr:`~scrapy.spider.BaseSpider.allowed_domains` attribute, or the
+:attr:`~scrapy.spider.Spider.allowed_domains` attribute, or the
 attribute is empty, the offsite middleware will allow all requests.

 If the request has the :attr:`~scrapy.http.Request.dont_filter` attribute
@@ -17,10 +17,10 @@ For spiders, the scraping cycle goes through something like this:
    those requests.

    The first requests to perform are obtained by calling the
-   :meth:`~scrapy.spider.BaseSpider.start_requests` method which (by default)
+   :meth:`~scrapy.spider.Spider.start_requests` method which (by default)
    generates :class:`~scrapy.http.Request` for the URLs specified in the
-   :attr:`~scrapy.spider.BaseSpider.start_urls` and the
-   :attr:`~scrapy.spider.BaseSpider.parse` method as callback function for the
+   :attr:`~scrapy.spider.Spider.start_urls` and the
+   :attr:`~scrapy.spider.Spider.parse` method as callback function for the
    Requests.

 2. In the callback function, you parse the response (web page) and return either
@@ -58,7 +58,7 @@ Spider arguments are passed through the :command:`crawl` command using the

 Spiders receive arguments in their constructors::

-    class MySpider(BaseSpider):
+    class MySpider(Spider):
         name = 'myspider'

         def __init__(self, category=None, *args, **kwargs):
@@ -93,10 +93,10 @@ with a ``TestItem`` declared in a ``myproject.items`` module::
 .. module:: scrapy.spider
    :synopsis: Spiders base class, spider manager and spider middleware

-BaseSpider
-----------
+Spider
+------

-.. class:: BaseSpider()
+.. class:: Spider()

 This is the simplest spider, and the one from which every other spider
 must inherit from (either the ones that come bundled with Scrapy, or the ones
@@ -178,7 +178,7 @@ BaseSpider

 The ``parse`` method is in charge of processing the response and returning
 scraped data and/or more URLs to follow. Other Requests callbacks have
-the same requirements as the :class:`BaseSpider` class.
+the same requirements as the :class:`Spider` class.

 This method, as well as any other Request callback, must return an
 iterable of :class:`~scrapy.http.Request` and/or
@@ -194,15 +194,15 @@ BaseSpider
 spider. For more information see :ref:`topics-logging`.


-BaseSpider example
-~~~~~~~~~~~~~~~~~~
+Spider example
+~~~~~~~~~~~~~~

 Let's see an example::

     from scrapy import log # This module is useful for printing out debug information
-    from scrapy.spider import BaseSpider
+    from scrapy.spider import Spider

-    class MySpider(BaseSpider):
+    class MySpider(Spider):
         name = 'example.com'
         allowed_domains = ['example.com']
         start_urls = [
@@ -217,11 +217,11 @@ Let's see an example::
 Another example returning multiples Requests and Items from a single callback::

     from scrapy.selector import Selector
-    from scrapy.spider import BaseSpider
+    from scrapy.spider import Spider
     from scrapy.http import Request
     from myproject.items import MyItem

-    class MySpider(BaseSpider):
+    class MySpider(Spider):
         name = 'example.com'
         allowed_domains = ['example.com']
         start_urls = [
@@ -252,7 +252,7 @@ CrawlSpider
 it's generic enough for several cases, so you can start from it and override it
 as needed for more custom functionality, or just implement your own spider.

-Apart from the attributes inherited from BaseSpider (that you must
+Apart from the attributes inherited from Spider (that you must
 specify), this class supports a new attribute:

 .. attribute:: rules
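The constructor in the spider-arguments hunk above is fed from the command
line; a short sketch of the usual pattern (the ``category`` argument follows
the surrounding docs, the URL scheme is illustrative)::

    from scrapy.spider import Spider

    class MySpider(Spider):
        name = 'myspider'

        def __init__(self, category=None, *args, **kwargs):
            super(MySpider, self).__init__(*args, **kwargs)
            # Arguments given as: scrapy crawl myspider -a category=electronics
            # arrive here as keyword arguments.
            self.start_urls = ['http://www.example.com/categories/%s' % category]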
@@ -7,11 +7,11 @@ usage:

 """

-from scrapy.spider import BaseSpider
+from scrapy.spider import Spider
 from scrapy.http import Request


-class QPSSpider(BaseSpider):
+class QPSSpider(Spider):

     name = 'qps'
     benchurl = 'http://localhost:8880/'
@@ -3,7 +3,7 @@ from w3lib.url import is_url

 from scrapy.command import ScrapyCommand
 from scrapy.http import Request
-from scrapy.spider import BaseSpider
+from scrapy.spider import Spider
 from scrapy.exceptions import UsageError
 from scrapy.utils.spider import create_spider_for_request

@@ -54,6 +54,6 @@ class Command(ScrapyCommand):
             spider = crawler.spiders.create(opts.spider)
         else:
             spider = create_spider_for_request(crawler.spiders, request, \
-                default_spider=BaseSpider('default'))
+                default_spider=Spider('default'))
         crawler.crawl(spider, [request])
         self.crawler_process.start()
@@ -9,7 +9,7 @@ import copy

 from scrapy.http import Request, HtmlResponse
 from scrapy.utils.spider import iterate_spider_output
-from scrapy.spider import BaseSpider
+from scrapy.spider import Spider

 def identity(x):
     return x
@@ -27,7 +27,7 @@ class Rule(object):
         else:
             self.follow = follow

-class CrawlSpider(BaseSpider):
+class CrawlSpider(Spider):

     rules = ()

@@ -4,7 +4,7 @@ for scraping from an XML feed.

 See documentation in docs/topics/spiders.rst
 """
-from scrapy.spider import BaseSpider
+from scrapy.spider import Spider
 from scrapy.item import BaseItem
 from scrapy.http import Request
 from scrapy.utils.iterators import xmliter, csviter
@@ -13,7 +13,7 @@ from scrapy.selector import Selector
 from scrapy.exceptions import NotConfigured, NotSupported


-class XMLFeedSpider(BaseSpider):
+class XMLFeedSpider(Spider):
     """
     This class intends to be the base class for spiders that scrape
     from XML feeds.
@@ -92,7 +92,7 @@ class XMLFeedSpider(BaseSpider):
         for (prefix, uri) in self.namespaces:
             selector.register_namespace(prefix, uri)

-class CSVFeedSpider(BaseSpider):
+class CSVFeedSpider(Spider):
     """Spider for parsing CSV feeds.
     It receives a CSV file in a response; iterates through each of its rows,
     and calls parse_row with a dict containing each field's data.
@@ -1,7 +1,7 @@
-from scrapy.spider import BaseSpider
+from scrapy.spider import Spider
 from scrapy.utils.spider import iterate_spider_output

-class InitSpider(BaseSpider):
+class InitSpider(Spider):
     """Base Spider with initialization facilities"""

     def start_requests(self):
@@ -1,12 +1,12 @@
 import re

-from scrapy.spider import BaseSpider
+from scrapy.spider import Spider
 from scrapy.http import Request, XmlResponse
 from scrapy.utils.sitemap import Sitemap, sitemap_urls_from_robots
 from scrapy.utils.gz import gunzip, is_gzipped
 from scrapy import log

-class SitemapSpider(BaseSpider):
+class SitemapSpider(Spider):

     sitemap_urls = ()
     sitemap_rules = [('', 'parse')]
@@ -11,7 +11,7 @@ from twisted.python import threadable
 from w3lib.url import any_to_uri

 from scrapy.item import BaseItem
-from scrapy.spider import BaseSpider
+from scrapy.spider import Spider
 from scrapy.selector import Selector
 from scrapy.utils.spider import create_spider_for_request
 from scrapy.utils.misc import load_object
@@ -24,7 +24,7 @@ from scrapy.exceptions import IgnoreRequest

 class Shell(object):

-    relevant_classes = (BaseSpider, Request, Response, BaseItem,
+    relevant_classes = (Spider, Request, Response, BaseItem,
                         Selector, Settings)

    def __init__(self, crawler, update_vars=None, code=None):
@@ -67,7 +67,7 @@ class Shell(object):
         if spider is None:
             spider = create_spider_for_request(self.crawler.spiders,
                                                request,
-                                               BaseSpider('default'),
+                                               Spider('default'),
                                                log_multiple=True)
         spider.set_crawler(self.crawler)
         self.crawler.engine.open_spider(spider, close_if_idle=False)
@@ -3,14 +3,14 @@ Base class for Scrapy spiders

 See documentation in docs/topics/spiders.rst
 """

 from scrapy import log
 from scrapy.http import Request
 from scrapy.utils.trackref import object_ref
 from scrapy.utils.url import url_is_from_spider
+from scrapy.utils.deprecate import create_deprecated_class


-class BaseSpider(object_ref):
+class Spider(object_ref):
     """Base class for scrapy spiders. All spiders must inherit from this
     class.
     """
@@ -65,6 +65,9 @@ class BaseSpider(object_ref):
     __repr__ = __str__


+BaseSpider = create_deprecated_class('BaseSpider', Spider)
+
+
 class ObsoleteClass(object):
     def __init__(self, message):
         self.message = message
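The body of ``scrapy.utils.deprecate.create_deprecated_class`` is not part of
this diff. The following is a simplified sketch of how such a shim can be
built with a metaclass, so that directly subclassing the alias warns while
``issubclass``/``isinstance`` checks against it keep matching ``Spider``
subclasses, as the ``test_spider.py`` hunks below require. This is
illustrative, not Scrapy's actual implementation::

    import warnings

    from scrapy.exceptions import ScrapyDeprecationWarning

    def create_deprecated_class(name, new_class):
        """Return a deprecated alias class for new_class (simplified sketch)."""

        class DeprecatedMeta(type(new_class)):
            def __init__(cls, cls_name, bases, namespace):
                try:
                    if deprecated_class in bases:
                        # Warn only when the alias itself is subclassed
                        warnings.warn(
                            "%s is deprecated, inherit from %s instead" % (
                                name, new_class.__name__),
                            ScrapyDeprecationWarning, stacklevel=2)
                except NameError:
                    pass  # deprecated_class is being created right now
                super(DeprecatedMeta, cls).__init__(cls_name, bases, namespace)

            def __subclasscheck__(cls, sub):
                # issubclass(X, alias) is true for any new_class subclass
                return issubclass(sub, new_class)

            def __instancecheck__(cls, obj):
                # isinstance(x, alias) is true for any new_class instance
                return isinstance(obj, new_class)

        deprecated_class = DeprecatedMeta(name, (new_class,), {})
        return deprecated_class

    # Usage, mirroring the hunk above:
    # BaseSpider = create_deprecated_class('BaseSpider', Spider)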
@@ -1,6 +1,6 @@
-from scrapy.spider import BaseSpider
+from scrapy.spider import Spider

-class $classname(BaseSpider):
+class $classname(Spider):
     name = "$name"
     allowed_domains = ["$domain"]
     start_urls = (
@@ -5,13 +5,13 @@ Some spiders used for testing and benchmarking
 import time
 from urllib import urlencode

-from scrapy.spider import BaseSpider
+from scrapy.spider import Spider
 from scrapy.http import Request
 from scrapy.item import Item
 from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor


-class MetaSpider(BaseSpider):
+class MetaSpider(Spider):

     name = 'meta'

@@ -130,9 +130,9 @@ class RunSpiderCommandTest(CommandTest):
         with open(fname, 'w') as f:
             f.write("""
 from scrapy import log
-from scrapy.spider import BaseSpider
+from scrapy.spider import Spider

-class MySpider(BaseSpider):
+class MySpider(Spider):
     name = 'myspider'

     def start_requests(self):
@@ -153,7 +153,7 @@ class MySpider(BaseSpider):
         with open(fname, 'w') as f:
             f.write("""
 from scrapy import log
-from scrapy.spider import BaseSpider
+from scrapy.spider import Spider
 """)
         p = self.proc('runspider', fname)
         log = p.stderr.read()
@@ -184,10 +184,10 @@ class ParseCommandTest(CommandTest):
         with open(fname, 'w') as f:
             f.write("""
 from scrapy import log
-from scrapy.spider import BaseSpider
+from scrapy.spider import Spider
 from scrapy.item import Item

-class MySpider(BaseSpider):
+class MySpider(Spider):
     name = '{0}'

     def parse(self, response):
@@ -2,7 +2,7 @@ from unittest import TextTestRunner

 from twisted.trial import unittest

-from scrapy.spider import BaseSpider
+from scrapy.spider import Spider
 from scrapy.http import Request
 from scrapy.item import Item, Field
 from scrapy.contracts import ContractsManager
@@ -22,7 +22,7 @@ class ResponseMock(object):
     url = 'http://scrapy.org'


-class TestSpider(BaseSpider):
+class TestSpider(Spider):
     name = 'demo_spider'

     def returns_request(self, response):
@@ -6,7 +6,7 @@ from twisted.trial import unittest
 from twisted.internet import defer
 from w3lib.url import path_to_file_uri

-from scrapy.spider import BaseSpider
+from scrapy.spider import Spider
 from scrapy.contrib.feedexport import IFeedStorage, FileFeedStorage, FTPFeedStorage, S3FeedStorage, StdoutFeedStorage
 from scrapy.utils.test import assert_aws_environ

@@ -38,7 +38,7 @@ class FileFeedStorageTest(unittest.TestCase):

     @defer.inlineCallbacks
     def _assert_stores(self, storage, path):
-        spider = BaseSpider("default")
+        spider = Spider("default")
         file = storage.open(spider)
         file.write("content")
         yield storage.store(file)
@@ -59,7 +59,7 @@ class FTPFeedStorageTest(unittest.TestCase):

     @defer.inlineCallbacks
     def _assert_stores(self, storage, path):
-        spider = BaseSpider("default")
+        spider = Spider("default")
         file = storage.open(spider)
         file.write("content")
         yield storage.store(file)
@@ -81,7 +81,7 @@ class S3FeedStorageTest(unittest.TestCase):
         from boto import connect_s3
         storage = S3FeedStorage(uri)
         verifyObject(IFeedStorage, storage)
-        file = storage.open(BaseSpider("default"))
+        file = storage.open(Spider("default"))
         file.write("content")
         yield storage.store(file)
         u = urlparse.urlparse(uri)
@@ -94,7 +94,7 @@ class StdoutFeedStorageTest(unittest.TestCase):
     def test_store(self):
         out = StringIO()
         storage = StdoutFeedStorage('stdout:', _stdout=out)
-        file = storage.open(BaseSpider("default"))
+        file = storage.open(Spider("default"))
         file.write("content")
         yield storage.store(file)
         self.assertEqual(out.getvalue(), "content")
@@ -3,7 +3,7 @@ from datetime import datetime
 from twisted.trial import unittest

 from scrapy.contrib.spiderstate import SpiderState
-from scrapy.spider import BaseSpider
+from scrapy.spider import Spider


 class SpiderStateTest(unittest.TestCase):
@@ -11,7 +11,7 @@ class SpiderStateTest(unittest.TestCase):
     def test_store_load(self):
         jobdir = self.mktemp()
         os.mkdir(jobdir)
-        spider = BaseSpider(name='default')
+        spider = Spider(name='default')
         dt = datetime.now()

         ss = SpiderState(jobdir)
@@ -20,7 +20,7 @@ class SpiderStateTest(unittest.TestCase):
         spider.state['dt'] = dt
         ss.spider_closed(spider)

-        spider2 = BaseSpider(name='default')
+        spider2 = Spider(name='default')
         ss2 = SpiderState(jobdir)
         ss2.spider_opened(spider2)
         self.assertEqual(spider.state, {'one': 1, 'dt': dt})
@@ -29,7 +29,7 @@ class SpiderStateTest(unittest.TestCase):
     def test_state_attribute(self):
         # state attribute must be present if jobdir is not set, to provide a
         # consistent interface
-        spider = BaseSpider(name='default')
+        spider = Spider(name='default')
         ss = SpiderState()
         ss.spider_opened(spider)
         self.assertEqual(spider.state, {})
@@ -22,7 +22,7 @@ from scrapy.core.downloader.handlers.http11 import HTTP11DownloadHandler
 from scrapy.core.downloader.handlers.s3 import S3DownloadHandler
 from scrapy.core.downloader.handlers.ftp import FTPDownloadHandler

-from scrapy.spider import BaseSpider
+from scrapy.spider import Spider
 from scrapy.http import Request
 from scrapy.settings import Settings
 from scrapy import optional_features
@@ -45,11 +45,11 @@ class FileTestCase(unittest.TestCase):

         request = Request(path_to_file_uri(self.tmpname + '^'))
         assert request.url.upper().endswith('%5E')
-        return self.download_request(request, BaseSpider('foo')).addCallback(_test)
+        return self.download_request(request, Spider('foo')).addCallback(_test)

     def test_non_existent(self):
         request = Request('file://%s' % self.mktemp())
-        d = self.download_request(request, BaseSpider('foo'))
+        d = self.download_request(request, Spider('foo'))
         return self.assertFailure(d, IOError)


@@ -87,35 +87,35 @@ class HttpTestCase(unittest.TestCase):

     def test_download(self):
         request = Request(self.getURL('file'))
-        d = self.download_request(request, BaseSpider('foo'))
+        d = self.download_request(request, Spider('foo'))
         d.addCallback(lambda r: r.body)
         d.addCallback(self.assertEquals, "0123456789")
         return d

     def test_download_head(self):
         request = Request(self.getURL('file'), method='HEAD')
-        d = self.download_request(request, BaseSpider('foo'))
+        d = self.download_request(request, Spider('foo'))
         d.addCallback(lambda r: r.body)
         d.addCallback(self.assertEquals, '')
         return d

     def test_redirect_status(self):
         request = Request(self.getURL('redirect'))
-        d = self.download_request(request, BaseSpider('foo'))
+        d = self.download_request(request, Spider('foo'))
         d.addCallback(lambda r: r.status)
         d.addCallback(self.assertEquals, 302)
         return d

     def test_redirect_status_head(self):
         request = Request(self.getURL('redirect'), method='HEAD')
-        d = self.download_request(request, BaseSpider('foo'))
+        d = self.download_request(request, Spider('foo'))
         d.addCallback(lambda r: r.status)
         d.addCallback(self.assertEquals, 302)
         return d

     @defer.inlineCallbacks
     def test_timeout_download_from_spider(self):
-        spider = BaseSpider('foo')
+        spider = Spider('foo')
         meta = {'download_timeout': 0.2}
         # client connects but no data is received
         request = Request(self.getURL('wait'), meta=meta)
@@ -132,7 +132,7 @@ class HttpTestCase(unittest.TestCase):
             self.assertEquals(request.headers, {})

         request = Request(self.getURL('host'))
-        return self.download_request(request, BaseSpider('foo')).addCallback(_test)
+        return self.download_request(request, Spider('foo')).addCallback(_test)

     def test_host_header_seted_in_request_headers(self):
         def _test(response):
@@ -140,9 +140,9 @@ class HttpTestCase(unittest.TestCase):
             self.assertEquals(request.headers.get('Host'), 'example.com')

         request = Request(self.getURL('host'), headers={'Host': 'example.com'})
-        return self.download_request(request, BaseSpider('foo')).addCallback(_test)
+        return self.download_request(request, Spider('foo')).addCallback(_test)

-        d = self.download_request(request, BaseSpider('foo'))
+        d = self.download_request(request, Spider('foo'))
         d.addCallback(lambda r: r.body)
         d.addCallback(self.assertEquals, 'example.com')
         return d
@@ -150,7 +150,7 @@ class HttpTestCase(unittest.TestCase):
     def test_payload(self):
         body = '1'*100 # PayloadResource requires body length to be 100
         request = Request(self.getURL('payload'), method='POST', body=body)
-        d = self.download_request(request, BaseSpider('foo'))
+        d = self.download_request(request, Spider('foo'))
         d.addCallback(lambda r: r.body)
         d.addCallback(self.assertEquals, body)
         return d
@@ -211,7 +211,7 @@ class HttpProxyTestCase(unittest.TestCase):

         http_proxy = self.getURL('')
         request = Request('http://example.com', meta={'proxy': http_proxy})
-        return self.download_request(request, BaseSpider('foo')).addCallback(_test)
+        return self.download_request(request, Spider('foo')).addCallback(_test)

     def test_download_with_proxy_https_noconnect(self):
         def _test(response):
@@ -221,7 +221,7 @@ class HttpProxyTestCase(unittest.TestCase):

         http_proxy = '%s?noconnect' % self.getURL('')
         request = Request('https://example.com', meta={'proxy': http_proxy})
-        return self.download_request(request, BaseSpider('foo')).addCallback(_test)
+        return self.download_request(request, Spider('foo')).addCallback(_test)

     def test_download_without_proxy(self):
         def _test(response):
@@ -230,7 +230,7 @@ class HttpProxyTestCase(unittest.TestCase):
             self.assertEquals(response.body, '/path/to/resource')

         request = Request(self.getURL('path/to/resource'))
-        return self.download_request(request, BaseSpider('foo')).addCallback(_test)
+        return self.download_request(request, Spider('foo')).addCallback(_test)


 class DeprecatedHttpProxyTestCase(unittest.TestCase):
@@ -270,7 +270,7 @@ class S3TestCase(unittest.TestCase):
                                  self.AWS_SECRET_ACCESS_KEY, \
                                  httpdownloadhandler=HttpDownloadHandlerMock)
         self.download_request = s3reqh.download_request
-        self.spider = BaseSpider('foo')
+        self.spider = Spider('foo')

     def test_request_signing1(self):
         # gets an object from the johnsmith bucket.
@@ -2,7 +2,7 @@ from twisted.trial.unittest import TestCase
 from twisted.python.failure import Failure

 from scrapy.http import Request, Response
-from scrapy.spider import BaseSpider
+from scrapy.spider import Spider
 from scrapy.core.downloader.middleware import DownloaderMiddlewareManager
 from scrapy.utils.test import get_crawler

@@ -13,7 +13,7 @@ class ManagerTestCase(TestCase):

     def setUp(self):
         self.crawler = get_crawler(self.settings_dict)
-        self.spider = BaseSpider('foo')
+        self.spider = Spider('foo')
         self.spider.set_crawler(self.crawler)
         self.mwman = DownloaderMiddlewareManager.from_crawler(self.crawler)
         # some mw depends on stats collector
@@ -1,14 +1,14 @@
 from unittest import TestCase

 from scrapy.http import Response, Request
-from scrapy.spider import BaseSpider
+from scrapy.spider import Spider
 from scrapy.contrib.downloadermiddleware.cookies import CookiesMiddleware


 class CookiesMiddlewareTest(TestCase):

     def setUp(self):
-        self.spider = BaseSpider('foo')
+        self.spider = Spider('foo')
         self.mw = CookiesMiddleware()

     def tearDown(self):
@@ -1,7 +1,7 @@
 from unittest import TestCase, main
 from scrapy.http import Response, XmlResponse
 from scrapy.contrib_exp.downloadermiddleware.decompression import DecompressionMiddleware
-from scrapy.spider import BaseSpider
+from scrapy.spider import Spider
 from scrapy.tests import get_testdata
 from scrapy.utils.test import assert_samelines

@@ -22,7 +22,7 @@ class DecompressionMiddlewareTest(TestCase):

     def setUp(self):
         self.mw = DecompressionMiddleware()
-        self.spider = BaseSpider('foo')
+        self.spider = Spider('foo')

     def test_known_compression_formats(self):
         for fmt in self.test_formats:
@@ -2,7 +2,7 @@ from unittest import TestCase

 from scrapy.contrib.downloadermiddleware.defaultheaders import DefaultHeadersMiddleware
 from scrapy.http import Request
-from scrapy.spider import BaseSpider
+from scrapy.spider import Spider
 from scrapy.utils.test import get_crawler


@@ -10,7 +10,7 @@ class TestDefaultHeadersMiddleware(TestCase):

     def get_defaults_spider_mw(self):
         crawler = get_crawler()
-        spider = BaseSpider('foo')
+        spider = Spider('foo')
         spider.set_crawler(crawler)
         defaults = dict([(k, [v]) for k, v in \
             crawler.settings.get('DEFAULT_REQUEST_HEADERS').iteritems()])
@@ -1,7 +1,7 @@
 import unittest

 from scrapy.contrib.downloadermiddleware.downloadtimeout import DownloadTimeoutMiddleware
-from scrapy.spider import BaseSpider
+from scrapy.spider import Spider
 from scrapy.http import Request
 from scrapy.utils.test import get_crawler

@@ -10,7 +10,7 @@ class DownloadTimeoutMiddlewareTest(unittest.TestCase):

     def get_request_spider_mw(self):
         crawler = get_crawler()
-        spider = BaseSpider('foo')
+        spider = Spider('foo')
         spider.set_crawler(crawler)
         request = Request('http://scrapytest.org/')
         return request, spider, DownloadTimeoutMiddleware.from_crawler(crawler)
@@ -2,9 +2,9 @@ import unittest

 from scrapy.http import Request
 from scrapy.contrib.downloadermiddleware.httpauth import HttpAuthMiddleware
-from scrapy.spider import BaseSpider
+from scrapy.spider import Spider

-class TestSpider(BaseSpider):
+class TestSpider(Spider):
     http_user = 'foo'
     http_pass = 'bar'

@@ -7,7 +7,7 @@ import email.utils
 from contextlib import contextmanager

 from scrapy.http import Response, HtmlResponse, Request
-from scrapy.spider import BaseSpider
+from scrapy.spider import Spider
 from scrapy.settings import Settings
 from scrapy.exceptions import IgnoreRequest
 from scrapy.utils.test import get_crawler
@@ -24,7 +24,7 @@ class _BaseTest(unittest.TestCase):
         self.today = email.utils.formatdate()
         self.tomorrow = email.utils.formatdate(time.time() + 86400)
         self.crawler = get_crawler()
-        self.spider = BaseSpider('example.com')
+        self.spider = Spider('example.com')
         self.tmpdir = tempfile.mkdtemp()
         self.request = Request('http://www.example.com',
                                headers={'User-Agent': 'test'})
@@ -3,7 +3,7 @@ from os.path import join, abspath, dirname
 from cStringIO import StringIO
 from gzip import GzipFile

-from scrapy.spider import BaseSpider
+from scrapy.spider import Spider
 from scrapy.http import Response, Request, HtmlResponse
 from scrapy.contrib.downloadermiddleware.httpcompression import HttpCompressionMiddleware
 from scrapy.tests import tests_datadir
@@ -22,7 +22,7 @@ FORMAT = {
 class HttpCompressionTest(TestCase):

     def setUp(self):
-        self.spider = BaseSpider('foo')
+        self.spider = Spider('foo')
         self.mw = HttpCompressionMiddleware()

     def _getresponse(self, coding):
@@ -5,9 +5,9 @@ from twisted.trial.unittest import TestCase, SkipTest
 from scrapy.contrib.downloadermiddleware.httpproxy import HttpProxyMiddleware
 from scrapy.exceptions import NotConfigured
 from scrapy.http import Response, Request
-from scrapy.spider import BaseSpider
+from scrapy.spider import Spider

-spider = BaseSpider('foo')
+spider = Spider('foo')

 class TestDefaultHeadersMiddleware(TestCase):

@@ -1,7 +1,7 @@
 import unittest

 from scrapy.contrib.downloadermiddleware.redirect import RedirectMiddleware, MetaRefreshMiddleware
-from scrapy.spider import BaseSpider
+from scrapy.spider import Spider
 from scrapy.exceptions import IgnoreRequest
 from scrapy.http import Request, Response, HtmlResponse
 from scrapy.utils.test import get_crawler
@@ -11,7 +11,7 @@ class RedirectMiddlewareTest(unittest.TestCase):

     def setUp(self):
         crawler = get_crawler()
-        self.spider = BaseSpider('foo')
+        self.spider = Spider('foo')
         self.mw = RedirectMiddleware.from_crawler(crawler)

     def test_priority_adjust(self):
@@ -124,7 +124,7 @@ class MetaRefreshMiddlewareTest(unittest.TestCase):

     def setUp(self):
         crawler = get_crawler()
-        self.spider = BaseSpider('foo')
+        self.spider = Spider('foo')
         self.mw = MetaRefreshMiddleware.from_crawler(crawler)

     def _body(self, interval=5, url='http://example.org/newpage'):
@@ -6,7 +6,7 @@ from twisted.internet.error import TimeoutError as ServerTimeoutError, \
 from scrapy import optional_features
 from scrapy.contrib.downloadermiddleware.retry import RetryMiddleware
 from scrapy.xlib.tx import ResponseFailed
-from scrapy.spider import BaseSpider
+from scrapy.spider import Spider
 from scrapy.http import Request, Response
 from scrapy.utils.test import get_crawler

@@ -14,7 +14,7 @@ from scrapy.utils.test import get_crawler
 class RetryTest(unittest.TestCase):
     def setUp(self):
         crawler = get_crawler()
-        self.spider = BaseSpider('foo')
+        self.spider = Spider('foo')
         self.mw = RetryMiddleware.from_crawler(crawler)
         self.mw.max_retry_times = 2

@@ -2,7 +2,7 @@ from unittest import TestCase

 from scrapy.contrib.downloadermiddleware.stats import DownloaderStats
 from scrapy.http import Request, Response
-from scrapy.spider import BaseSpider
+from scrapy.spider import Spider
 from scrapy.utils.test import get_crawler


@@ -10,7 +10,7 @@ class TestDownloaderStats(TestCase):

     def setUp(self):
         self.crawler = get_crawler()
-        self.spider = BaseSpider('scrapytest.org')
+        self.spider = Spider('scrapytest.org')
         self.mw = DownloaderStats(self.crawler.stats)

         self.crawler.stats.open_spider(self.spider)
@@ -1,6 +1,6 @@
 from unittest import TestCase

-from scrapy.spider import BaseSpider
+from scrapy.spider import Spider
 from scrapy.http import Request
 from scrapy.contrib.downloadermiddleware.useragent import UserAgentMiddleware
 from scrapy.utils.test import get_crawler
@@ -10,7 +10,7 @@ class UserAgentMiddlewareTest(TestCase):

     def get_spider_and_mw(self, default_useragent):
         crawler = get_crawler({'USER_AGENT': default_useragent})
-        spider = BaseSpider('foo')
+        spider = Spider('foo')
         spider.set_crawler(crawler)
         return spider, UserAgentMiddleware.from_crawler(crawler)

@@ -21,7 +21,7 @@ from scrapy import signals
 from scrapy.utils.test import get_crawler
 from scrapy.xlib.pydispatch import dispatcher
 from scrapy.tests import tests_datadir
-from scrapy.spider import BaseSpider
+from scrapy.spider import Spider
 from scrapy.item import Item, Field
 from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
 from scrapy.http import Request
@@ -32,7 +32,7 @@ class TestItem(Item):
     url = Field()
     price = Field()

-class TestSpider(BaseSpider):
+class TestSpider(Spider):
     name = "scrapytest.org"
     allowed_domains = ["scrapytest.org", "localhost"]

@@ -4,7 +4,7 @@ from twisted.python import log as txlog, failure
 from twisted.trial import unittest

 from scrapy import log
-from scrapy.spider import BaseSpider
+from scrapy.spider import Spider
 from scrapy.settings import default_settings

 class LogTest(unittest.TestCase):
@@ -41,7 +41,7 @@ class ScrapyFileLogObserverTest(unittest.TestCase):
         self.assertEqual(self.logged(), "[scrapy] INFO: Hello")

     def test_msg_spider(self):
-        spider = BaseSpider("myspider")
+        spider = Spider("myspider")
         log.msg("Hello", spider=spider)
         self.assertEqual(self.logged(), "[myspider] INFO: Hello")

@@ -58,7 +58,7 @@ class ScrapyFileLogObserverTest(unittest.TestCase):
         self.assertEqual(self.logged(), "[scrapy] NOLEVEL: Hello")

     def test_msg_level_spider(self):
-        spider = BaseSpider("myspider")
+        spider = Spider("myspider")
         log.msg("Hello", spider=spider, level=log.WARNING)
         self.assertEqual(self.logged(), "[myspider] WARNING: Hello")

@@ -1,6 +1,6 @@
 import unittest

-from scrapy.spider import BaseSpider
+from scrapy.spider import Spider
 from scrapy.http import Request, Response
 from scrapy.item import Item, Field
 from scrapy.logformatter import LogFormatter
@@ -18,7 +18,7 @@ class LoggingContribTest(unittest.TestCase):

     def setUp(self):
         self.formatter = LogFormatter()
-        self.spider = BaseSpider('default')
+        self.spider = Spider('default')

     def test_crawled(self):
         req = Request("http://www.example.com")
@@ -6,7 +6,7 @@ from twisted.internet.defer import Deferred, inlineCallbacks
 from twisted.python import log as txlog

 from scrapy.http import Request, Response
-from scrapy.spider import BaseSpider
+from scrapy.spider import Spider
 from scrapy.utils.request import request_fingerprint
 from scrapy.contrib.pipeline.media import MediaPipeline
 from scrapy.utils.signal import disconnect_all
@@ -24,7 +24,7 @@ class BaseMediaPipelineTestCase(unittest.TestCase):
     pipeline_class = MediaPipeline

     def setUp(self):
-        self.spider = BaseSpider('media.com')
+        self.spider = Spider('media.com')
         self.pipe = self.pipeline_class(download_func=_mocked_download_func)
         self.pipe.open_spider(self.spider)
         self.info = self.pipe.spiderinfo
@@ -2,7 +2,7 @@ import unittest

 from scrapy.settings import Settings
 from scrapy.utils.test import get_crawler
-from scrapy.spider import BaseSpider
+from scrapy.spider import Spider

 class SettingsTest(unittest.TestCase):

@@ -2,18 +2,20 @@ import gzip
 import inspect
 import warnings
 from cStringIO import StringIO
+from scrapy.utils.trackref import object_ref

 from twisted.trial import unittest

-from scrapy.spider import BaseSpider
+from scrapy.spider import Spider, BaseSpider
 from scrapy.http import Response, TextResponse, XmlResponse, HtmlResponse
 from scrapy.contrib.spiders.init import InitSpider
 from scrapy.contrib.spiders import CrawlSpider, XMLFeedSpider, CSVFeedSpider, SitemapSpider
+from scrapy.exceptions import ScrapyDeprecationWarning


-class BaseSpiderTest(unittest.TestCase):
+class SpiderTest(unittest.TestCase):

-    spider_class = BaseSpider
+    spider_class = Spider

     def setUp(self):
         warnings.simplefilter("always")
@@ -43,12 +45,12 @@ class BaseSpiderTest(unittest.TestCase):
         self.assertRaises(ValueError, self.spider_class, somearg='foo')


-class InitSpiderTest(BaseSpiderTest):
+class InitSpiderTest(SpiderTest):

     spider_class = InitSpider


-class XMLFeedSpiderTest(BaseSpiderTest):
+class XMLFeedSpiderTest(SpiderTest):

     spider_class = XMLFeedSpider

@@ -92,17 +94,17 @@ class XMLFeedSpiderTest(BaseSpiderTest):
             ], iterator)


-class CSVFeedSpiderTest(BaseSpiderTest):
+class CSVFeedSpiderTest(SpiderTest):

     spider_class = CSVFeedSpider


-class CrawlSpiderTest(BaseSpiderTest):
+class CrawlSpiderTest(SpiderTest):

     spider_class = CrawlSpider


-class SitemapSpiderTest(BaseSpiderTest):
+class SitemapSpiderTest(SpiderTest):

     spider_class = SitemapSpider

@@ -134,5 +136,61 @@ class SitemapSpiderTest(BaseSpiderTest):
         r = Response(url="http://www.example.com/sitemap.xml.gz", body=self.GZBODY)
         self.assertEqual(spider._get_sitemap_body(r), self.BODY)


+class BaseSpiderDeprecationTest(unittest.TestCase):
+
+    def test_basespider_is_deprecated(self):
+        with warnings.catch_warnings(record=True) as w:
+
+            class MySpider1(BaseSpider):
+                pass
+
+            self.assertEqual(len(w), 1)
+            self.assertEqual(w[0].category, ScrapyDeprecationWarning)
+            self.assertEqual(w[0].lineno, inspect.getsourcelines(MySpider1)[1])
+
+    def test_basespider_issubclass(self):
+        class MySpider2(Spider):
+            pass
+
+        class MySpider2a(MySpider2):
+            pass
+
+        class Foo(object):
+            pass
+
+        class Foo2(object_ref):
+            pass
+
+        assert issubclass(MySpider2, BaseSpider)
+        assert issubclass(MySpider2a, BaseSpider)
+        assert not issubclass(Foo, BaseSpider)
+        assert not issubclass(Foo2, BaseSpider)
+
+    def test_basespider_isinstance(self):
+        class MySpider3(Spider):
+            name = 'myspider3'
+
+        class MySpider3a(MySpider3):
+            pass
+
+        class Foo(object):
+            pass
+
+        class Foo2(object_ref):
+            pass
+
+        assert isinstance(MySpider3(), BaseSpider)
+        assert isinstance(MySpider3a(), BaseSpider)
+        assert not isinstance(Foo(), BaseSpider)
+        assert not isinstance(Foo2(), BaseSpider)
+
+    def test_crawl_spider(self):
+        assert issubclass(CrawlSpider, Spider)
+        assert issubclass(CrawlSpider, BaseSpider)
+        assert isinstance(CrawlSpider(name='foo'), Spider)
+        assert isinstance(CrawlSpider(name='foo'), BaseSpider)
+

 if __name__ == '__main__':
     unittest.main()
@@ -1,4 +1,4 @@
-from scrapy.spider import BaseSpider
+from scrapy.spider import Spider

-class Spider0(BaseSpider):
+class Spider0(Spider):
     allowed_domains = ["scrapy1.org", "scrapy3.org"]
@@ -1,5 +1,5 @@
-from scrapy.spider import BaseSpider
+from scrapy.spider import Spider

-class Spider1(BaseSpider):
+class Spider1(Spider):
     name = "spider1"
     allowed_domains = ["scrapy1.org", "scrapy3.org"]
@@ -1,5 +1,5 @@
-from scrapy.spider import BaseSpider
+from scrapy.spider import Spider

-class Spider2(BaseSpider):
+class Spider2(Spider):
     name = "spider2"
     allowed_domains = ["scrapy2.org", "scrapy3.org"]
@@ -1,6 +1,6 @@
-from scrapy.spider import BaseSpider
+from scrapy.spider import Spider

-class Spider3(BaseSpider):
+class Spider3(Spider):
     name = "spider3"
     allowed_domains = ['spider3.com']

@@ -1,6 +1,6 @@
-from scrapy.spider import BaseSpider
+from scrapy.spider import Spider

-class Spider4(BaseSpider):
+class Spider4(Spider):
     name = "spider4"

     @classmethod
@@ -2,7 +2,7 @@ from unittest import TestCase

 from scrapy.contrib.spidermiddleware.depth import DepthMiddleware
 from scrapy.http import Response, Request
-from scrapy.spider import BaseSpider
+from scrapy.spider import Spider
 from scrapy.statscol import StatsCollector
 from scrapy.utils.test import get_crawler

@@ -10,7 +10,7 @@ from scrapy.utils.test import get_crawler
 class TestDepthMiddleware(TestCase):

     def setUp(self):
-        self.spider = BaseSpider('scrapytest.org')
+        self.spider = Spider('scrapytest.org')

         self.stats = StatsCollector(get_crawler())
         self.stats.open_spider(self.spider)
@@ -1,7 +1,7 @@
 from unittest import TestCase

 from scrapy.http import Response, Request
-from scrapy.spider import BaseSpider
+from scrapy.spider import Spider
 from scrapy.contrib.spidermiddleware.httperror import HttpErrorMiddleware, HttpError
 from scrapy.settings import Settings

@@ -9,7 +9,7 @@ from scrapy.settings import Settings
 class TestHttpErrorMiddleware(TestCase):

     def setUp(self):
-        self.spider = BaseSpider('foo')
+        self.spider = Spider('foo')
         self.mw = HttpErrorMiddleware(Settings({}))
         self.req = Request('http://scrapytest.org')

@@ -47,7 +47,7 @@ class TestHttpErrorMiddlewareSettings(TestCase):
     """Similar test, but with settings"""

     def setUp(self):
-        self.spider = BaseSpider('foo')
+        self.spider = Spider('foo')
         self.mw = HttpErrorMiddleware(Settings({'HTTPERROR_ALLOWED_CODES': (402,)}))
         self.req = Request('http://scrapytest.org')

@@ -89,7 +89,7 @@ class TestHttpErrorMiddlewareSettings(TestCase):
 class TestHttpErrorMiddlewareHandleAll(TestCase):

     def setUp(self):
-        self.spider = BaseSpider('foo')
+        self.spider = Spider('foo')
         self.mw = HttpErrorMiddleware(Settings({'HTTPERROR_ALLOW_ALL': True}))
         self.req = Request('http://scrapytest.org')

@ -1,7 +1,7 @@
from unittest import TestCase

from scrapy.http import Response, Request
from scrapy.spider import BaseSpider
from scrapy.spider import Spider
from scrapy.contrib.spidermiddleware.offsite import OffsiteMiddleware


@ -13,7 +13,7 @@ class TestOffsiteMiddleware(TestCase):
        self.mw.spider_opened(self.spider)

    def _get_spider(self):
        return BaseSpider('foo', allowed_domains=['scrapytest.org', 'scrapy.org'])
        return Spider('foo', allowed_domains=['scrapytest.org', 'scrapy.org'])

    def test_process_spider_output(self):
        res = Response('http://scrapytest.org')
@ -33,7 +33,7 @@ class TestOffsiteMiddleware(TestCase):
class TestOffsiteMiddleware2(TestOffsiteMiddleware):

    def _get_spider(self):
        return BaseSpider('foo', allowed_domains=None)
        return Spider('foo', allowed_domains=None)

    def test_process_spider_output(self):
        res = Response('http://scrapytest.org')
@ -44,5 +44,5 @@ class TestOffsiteMiddleware2(TestOffsiteMiddleware):
class TestOffsiteMiddleware3(TestOffsiteMiddleware2):

    def _get_spider(self):
        return BaseSpider('foo')
        return Spider('foo')
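These offsite tests construct spiders as ``Spider('foo', allowed_domains=[...])``, i.e. they rely on the base spider's constructor turning extra keyword arguments into instance attributes. A standalone sketch of that pattern (a toy stand-in, not Scrapy's actual implementation)::

    class SketchSpider(object):
        """Toy spider showing the kwargs-to-attributes convention."""
        name = None

        def __init__(self, name=None, **kwargs):
            if name is not None:
                self.name = name
            # Copy remaining keyword arguments onto the instance, so
            # allowed_domains=... becomes self.allowed_domains.
            self.__dict__.update(kwargs)

    s = SketchSpider('foo', allowed_domains=['scrapytest.org', 'scrapy.org'])
    assert s.allowed_domains == ['scrapytest.org', 'scrapy.org']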
@ -1,14 +1,14 @@
from unittest import TestCase

from scrapy.http import Response, Request
from scrapy.spider import BaseSpider
from scrapy.spider import Spider
from scrapy.contrib.spidermiddleware.referer import RefererMiddleware


class TestRefererMiddleware(TestCase):

    def setUp(self):
        self.spider = BaseSpider('foo')
        self.spider = Spider('foo')
        self.mw = RefererMiddleware()

    def test_process_spider_output(self):
@ -2,7 +2,7 @@ from unittest import TestCase

from scrapy.contrib.spidermiddleware.urllength import UrlLengthMiddleware
from scrapy.http import Response, Request
from scrapy.spider import BaseSpider
from scrapy.spider import Spider


class TestUrlLengthMiddleware(TestCase):
@ -15,7 +15,7 @@ class TestUrlLengthMiddleware(TestCase):
        reqs = [short_url_req, long_url_req]

        mw = UrlLengthMiddleware(maxlength=25)
        spider = BaseSpider('foo')
        spider = Spider('foo')
        out = list(mw.process_spider_output(res, reqs, spider))
        self.assertEquals(out, [short_url_req])
@ -1,6 +1,6 @@
import unittest

from scrapy.spider import BaseSpider
from scrapy.spider import Spider
from scrapy.statscol import StatsCollector, DummyStatsCollector
from scrapy.utils.test import get_crawler

@ -8,7 +8,7 @@ class StatsCollectorTest(unittest.TestCase):

    def setUp(self):
        self.crawler = get_crawler()
        self.spider = BaseSpider('foo')
        self.spider = Spider('foo')

    def test_collector(self):
        stats = StatsCollector(self.crawler)
scrapy/tests/test_utils_deprecate.py (new file, 138 lines)
@ -0,0 +1,138 @@
# -*- coding: utf-8 -*-
from __future__ import absolute_import
import inspect
import unittest
import warnings
from scrapy.utils.deprecate import create_deprecated_class


class MyWarning(UserWarning):
    pass

class SomeBaseClass(object):
    pass

class NewName(SomeBaseClass):
    pass


class WarnWhenSubclassedTest(unittest.TestCase):

    def test_no_warning_on_definition(self):
        with warnings.catch_warnings(record=True) as w:
            Deprecated = create_deprecated_class('Deprecated', NewName)

        self.assertEqual(w, [])

    def test_warning_on_subclassing(self):
        with warnings.catch_warnings(record=True) as w:
            Deprecated = create_deprecated_class('Deprecated', NewName,
                                                 warn_category=MyWarning)

            class UserClass(Deprecated):
                pass

        self.assertEqual(len(w), 1)
        msg = w[0]
        assert issubclass(msg.category, MyWarning)
        self.assertEqual(
            str(msg.message),
            "scrapy.tests.test_utils_deprecate.UserClass inherits from "
            "deprecated class scrapy.tests.test_utils_deprecate.Deprecated, "
            "please inherit from scrapy.tests.test_utils_deprecate.NewName."
        )
        self.assertEqual(msg.lineno, inspect.getsourcelines(UserClass)[1])

    def test_warning_on_instance(self):
        with warnings.catch_warnings(record=True) as w:
            Deprecated = create_deprecated_class('Deprecated', NewName,
                                                 warn_category=MyWarning)

            class UserClass(Deprecated):
                pass

            _, lineno = Deprecated(), inspect.getlineno(inspect.currentframe())
            _ = UserClass()

        self.assertEqual(len(w), 2)
        msg = w[1]
        assert issubclass(msg.category, MyWarning)
        self.assertEqual(
            str(msg.message),
            "scrapy.tests.test_utils_deprecate.Deprecated is deprecated, "
            "instantiate scrapy.tests.test_utils_deprecate.NewName instead."
        )
        self.assertEqual(msg.lineno, lineno)

    def test_warning_auto_message(self):
        with warnings.catch_warnings(record=True) as w:
            Deprecated = create_deprecated_class('Deprecated', NewName)

            class UserClass2(Deprecated):
                pass

        msg = str(w[0].message)
        self.assertIn("scrapy.tests.test_utils_deprecate.NewName", msg)
        self.assertIn("scrapy.tests.test_utils_deprecate.Deprecated", msg)

    def test_issubclass(self):
        with warnings.catch_warnings(record=True):
            DeprecatedName = create_deprecated_class('DeprecatedName', NewName)

            class UpdatedUserClass1(NewName):
                pass

            class UpdatedUserClass1a(NewName):
                pass

            class OutdatedUserClass1(DeprecatedName):
                pass

            class UnrelatedClass(object):
                pass

            class OldStyleClass:
                pass

        assert issubclass(UpdatedUserClass1, NewName)
        assert issubclass(UpdatedUserClass1a, NewName)
        assert issubclass(UpdatedUserClass1, DeprecatedName)
        assert issubclass(UpdatedUserClass1a, DeprecatedName)
        assert issubclass(OutdatedUserClass1, DeprecatedName)
        assert not issubclass(UnrelatedClass, DeprecatedName)
        assert not issubclass(OldStyleClass, DeprecatedName)

        self.assertRaises(TypeError, issubclass, object(), DeprecatedName)

    def test_isinstance(self):
        with warnings.catch_warnings(record=True):
            DeprecatedName = create_deprecated_class('DeprecatedName', NewName)

            class UpdatedUserClass2(NewName):
                pass

            class UpdatedUserClass2a(NewName):
                pass

            class OutdatedUserClass2(DeprecatedName):
                pass

            class UnrelatedClass(object):
                pass

            class OldStyleClass:
                pass

        assert isinstance(UpdatedUserClass2(), NewName)
        assert isinstance(UpdatedUserClass2a(), NewName)
        assert isinstance(UpdatedUserClass2(), DeprecatedName)
        assert isinstance(UpdatedUserClass2a(), DeprecatedName)
        assert isinstance(OutdatedUserClass2(), DeprecatedName)
        assert not isinstance(UnrelatedClass(), DeprecatedName)
        assert not isinstance(OldStyleClass(), DeprecatedName)

    def test_clsdict(self):
        with warnings.catch_warnings(record=True):
            Deprecated = create_deprecated_class('Deprecated', NewName, {'foo': 'bar'})

        self.assertEqual(Deprecated.foo, 'bar')
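The new test module leans heavily on ``warnings.catch_warnings(record=True)``. For reference, a self-contained sketch of that standard-library pattern, independent of Scrapy::

    import warnings

    def legacy_api():
        warnings.warn("legacy_api is deprecated", DeprecationWarning,
                      stacklevel=2)

    with warnings.catch_warnings(record=True) as caught:
        warnings.simplefilter("always")  # record every warning, no dedup
        legacy_api()

    assert len(caught) == 1
    assert issubclass(caught[0].category, DeprecationWarning)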
@ -1,7 +1,7 @@
import unittest

from scrapy.http import Request
from scrapy.spider import BaseSpider
from scrapy.spider import Spider
from scrapy.utils.reqser import request_to_dict, request_from_dict

class RequestSerializationTest(unittest.TestCase):
@ -67,7 +67,7 @@ class RequestSerializationTest(unittest.TestCase):
        self.assertRaises(ValueError, request_to_dict, r)


class TestSpider(BaseSpider):
class TestSpider(Spider):
    name = 'test'
    def parse_item(self, response):
        pass
@ -6,7 +6,7 @@ from decimal import Decimal

from twisted.internet import defer

from scrapy.utils.serialize import SpiderReferencer, ScrapyJSONEncoder, ScrapyJSONDecoder
from scrapy.spider import BaseSpider
from scrapy.spider import Spider
from scrapy.http import Request, Response


@ -21,8 +21,8 @@ class CrawlerMock(object):
class BaseTestCase(unittest.TestCase):

    def setUp(self):
        self.spider1 = BaseSpider('name1')
        self.spider2 = BaseSpider('name2')
        self.spider1 = Spider('name1')
        self.spider2 = Spider('name2')
        open_spiders = set([self.spider1, self.spider2])
        crawler = CrawlerMock(open_spiders)
        self.spref = SpiderReferencer(crawler)
@ -43,7 +43,7 @@ class SpiderReferencerTestCase(BaseTestCase):
        sp1 = self.spref.get_spider_from_reference(ref1)
        sp2 = self.spref.get_spider_from_reference(ref2)
        sp1_ = self.spref.get_spider_from_reference(ref1)
        assert isinstance(sp1, BaseSpider)
        assert isinstance(sp1, Spider)
        assert sp1 is not sp2
        assert sp1 is sp1_
@ -1,6 +1,6 @@
import unittest

from scrapy.spider import BaseSpider
from scrapy.spider import Spider
from scrapy.utils.url import url_is_from_any_domain, url_is_from_spider, canonicalize_url

__doctests__ = ['scrapy.utils.url']
@ -26,14 +26,14 @@ class UrlUtilsTest(unittest.TestCase):
        self.assertFalse(url_is_from_any_domain(url+'.testdomain.com', ['testdomain.com']))

    def test_url_is_from_spider(self):
        spider = BaseSpider(name='example.com')
        spider = Spider(name='example.com')
        self.assertTrue(url_is_from_spider('http://www.example.com/some/page.html', spider))
        self.assertTrue(url_is_from_spider('http://sub.example.com/some/page.html', spider))
        self.assertFalse(url_is_from_spider('http://www.example.org/some/page.html', spider))
        self.assertFalse(url_is_from_spider('http://www.example.net/some/page.html', spider))

    def test_url_is_from_spider_class_attributes(self):
        class MySpider(BaseSpider):
        class MySpider(Spider):
            name = 'example.com'
        self.assertTrue(url_is_from_spider('http://www.example.com/some/page.html', MySpider))
        self.assertTrue(url_is_from_spider('http://sub.example.com/some/page.html', MySpider))
@ -41,7 +41,7 @@ class UrlUtilsTest(unittest.TestCase):
        self.assertFalse(url_is_from_spider('http://www.example.net/some/page.html', MySpider))

    def test_url_is_from_spider_with_allowed_domains(self):
        spider = BaseSpider(name='example.com', allowed_domains=['example.org', 'example.net'])
        spider = Spider(name='example.com', allowed_domains=['example.org', 'example.net'])
        self.assertTrue(url_is_from_spider('http://www.example.com/some/page.html', spider))
        self.assertTrue(url_is_from_spider('http://sub.example.com/some/page.html', spider))
        self.assertTrue(url_is_from_spider('http://example.com/some/page.html', spider))
@ -49,14 +49,14 @@ class UrlUtilsTest(unittest.TestCase):
        self.assertTrue(url_is_from_spider('http://www.example.net/some/page.html', spider))
        self.assertFalse(url_is_from_spider('http://www.example.us/some/page.html', spider))

        spider = BaseSpider(name='example.com', allowed_domains=set(('example.com', 'example.net')))
        spider = Spider(name='example.com', allowed_domains=set(('example.com', 'example.net')))
        self.assertTrue(url_is_from_spider('http://www.example.com/some/page.html', spider))

        spider = BaseSpider(name='example.com', allowed_domains=('example.com', 'example.net'))
        spider = Spider(name='example.com', allowed_domains=('example.com', 'example.net'))
        self.assertTrue(url_is_from_spider('http://www.example.com/some/page.html', spider))

    def test_url_is_from_spider_with_allowed_domains_class_attributes(self):
        class MySpider(BaseSpider):
        class MySpider(Spider):
            name = 'example.com'
            allowed_domains = ('example.org', 'example.net')
        self.assertTrue(url_is_from_spider('http://www.example.com/some/page.html', MySpider))
@ -1,11 +1,98 @@
"""Some helpers for deprecation messages"""

import warnings

import inspect
from scrapy.exceptions import ScrapyDeprecationWarning


def attribute(obj, oldattr, newattr, version='0.12'):
    cname = obj.__class__.__name__
    warnings.warn("%s.%s attribute is deprecated and will no longer be supported "
        "in Scrapy %s, use %s.%s attribute instead" % \
        (cname, oldattr, version, cname, newattr), ScrapyDeprecationWarning, stacklevel=3)


def create_deprecated_class(name, new_class, clsdict=None,
                            warn_category=ScrapyDeprecationWarning,
                            subclass_warn_message="{cls} inherits from "\
                                "deprecated class {old}, please inherit "\
                                "from {new}.",
                            instance_warn_message="{cls} is deprecated, "\
                                "instantiate {new} instead."):
    """
    Return a "deprecated" class that causes its subclasses to issue a warning.
    Subclasses of ``new_class`` are considered subclasses of this class.
    It also warns when the deprecated class is instantiated, but not when its
    subclasses are instantiated.

    It can be used to rename a base class in a library. For example, if we
    have::

        class OldName(SomeClass):
            # ...

    and we want to rename it to NewName, we can do the following::

        class NewName(SomeClass):
            # ...

        OldName = create_deprecated_class('OldName', NewName)

    Then, if a user class inherits from OldName, a warning is issued. Also,
    ``issubclass(sub, OldName)`` and ``isinstance(sub(), OldName)`` checks
    still return True if sub is a subclass of NewName instead of OldName.
    """

    class DeprecatedClass(type):

        deprecated_class = None

        def __new__(metacls, name, bases, clsdict_):
            cls = super(DeprecatedClass, metacls).__new__(metacls, name, bases, clsdict_)
            if metacls.deprecated_class is None:
                metacls.deprecated_class = cls
            return cls

        def __init__(cls, name, bases, clsdict_):
            old = cls.__class__.deprecated_class
            if cls is not old:
                msg = subclass_warn_message.format(cls=_clspath(cls),
                                                   old=_clspath(old),
                                                   new=_clspath(new_class))
                warnings.warn(msg, warn_category, stacklevel=2)
            super(DeprecatedClass, cls).__init__(name, bases, clsdict_)

        # see http://www.python.org/dev/peps/pep-3119/#overloading-isinstance-and-issubclass
        # and http://docs.python.org/2/reference/datamodel.html#customizing-instance-and-subclass-checks
        # for implementation details
        def __instancecheck__(cls, inst):
            return any(cls.__subclasscheck__(c)
                       for c in {type(inst), inst.__class__})

        def __subclasscheck__(cls, sub):
            if not inspect.isclass(sub):
                raise TypeError("issubclass() arg 1 must be a class")

            mro = getattr(sub, '__mro__', ())
            candidates = {cls, new_class}
            return any(c in candidates for c in mro)

        def __call__(cls, *args, **kwargs):
            if cls is cls.__class__.deprecated_class:
                msg = instance_warn_message.format(cls=_clspath(cls),
                                                   new=_clspath(new_class))
                warnings.warn(msg, warn_category, stacklevel=2)
            return super(DeprecatedClass, cls).__call__(*args, **kwargs)

    deprecated_cls = DeprecatedClass(name, (new_class,), clsdict or {})
    frm = inspect.stack()[1]
    parent_module = inspect.getmodule(frm[0])
    if parent_module is not None:
        deprecated_cls.__module__ = parent_module.__name__

    return deprecated_cls


def _clspath(cls):
    return '{}.{}'.format(cls.__module__, cls.__name__)
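Putting the helper together, a minimal usage sketch; the class names are illustrative, only ``create_deprecated_class`` comes from this commit::

    import warnings
    from scrapy.utils.deprecate import create_deprecated_class

    class NewAPI(object):
        pass

    OldAPI = create_deprecated_class('OldAPI', NewAPI)

    with warnings.catch_warnings(record=True) as caught:
        warnings.simplefilter("always")

        class LegacyUser(OldAPI):   # subclassing the alias warns...
            pass

        OldAPI()                    # ...and so does direct instantiation

    assert len(caught) == 2
    assert issubclass(LegacyUser, NewAPI)   # real subclass relation
    assert issubclass(LegacyUser, OldAPI)   # via __subclasscheck__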
@ -5,7 +5,7 @@ import json

from twisted.internet import defer

from scrapy.spider import BaseSpider
from scrapy.spider import Spider
from scrapy.http import Request, Response
from scrapy.item import BaseItem

@ -42,7 +42,7 @@ class SpiderReferencer(object):

    def encode_references(self, obj):
        """Look for Spider objects and replace them with spider references"""
        if isinstance(obj, BaseSpider):
        if isinstance(obj, Spider):
            return self.get_reference_from_spider(obj)
        elif isinstance(obj, dict):
            d = {}
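``encode_references`` walks arbitrarily nested JSON-ish structures, swapping spiders for opaque references. A hedged, Scrapy-free sketch of that recursive shape (all names here are invented for illustration)::

    def encode_refs(obj, is_spider, make_ref):
        """Recursively copy obj, replacing spider objects with references."""
        if is_spider(obj):
            return make_ref(obj)
        if isinstance(obj, dict):
            return dict((k, encode_refs(v, is_spider, make_ref))
                        for k, v in obj.items())
        if isinstance(obj, (list, tuple)):
            return [encode_refs(v, is_spider, make_ref) for v in obj]
        return obj

    data = {'spider': 'SPIDER', 'depth': 2, 'queue': ['SPIDER', 'url']}
    print(encode_refs(data, lambda o: o == 'SPIDER', lambda o: 'spider:0'))
    # {'spider': 'spider:0', 'depth': 2, 'queue': ['spider:0', 'url']}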
@ -14,11 +14,11 @@ def iter_spider_classes(module):
    """
    # this needs to be imported here until we get rid of the spider manager
    # singleton in scrapy.spider.spiders
    from scrapy.spider import BaseSpider
    from scrapy.spider import Spider

    for obj in vars(module).itervalues():
        if inspect.isclass(obj) and \
           issubclass(obj, BaseSpider) and \
           issubclass(obj, Spider) and \
           obj.__module__ == module.__name__ and \
           getattr(obj, 'name', None):
            yield obj
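After the rename, ``iter_spider_classes`` yields every named ``Spider`` subclass defined in a module. A small usage sketch that builds a throwaway module purely for illustration::

    import types
    from scrapy.spider import Spider
    from scrapy.utils.spider import iter_spider_classes

    # Fake module holding one named spider.
    mod = types.ModuleType('fake_spiders')

    class BooksSpider(Spider):
        name = 'books'
    BooksSpider.__module__ = mod.__name__  # pretend it was defined there
    mod.BooksSpider = BooksSpider

    print([cls.name for cls in iter_spider_classes(mod)])  # ['books']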