
Moved init_domain functionality out of the engine (refs #88) and into the
spider level. A new spider (InitSpider, not yet documented) was added to
provide initialization facilities.

Also renamed make_request_from_url to make_requests_from_url and allowed it to
return iterables.
Pablo Hoffman 2009-06-18 14:43:56 -03:00
parent b040e6a3a7
commit 3cb12e5dd3
7 changed files with 71 additions and 88 deletions
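
A rough sketch of how a spider might use the new InitSpider (which is not yet documented): the spider name, domain and URLs below are invented for illustration; only InitSpider, Request and the initialized callback come from this changeset.

from scrapy.http import Request
from scrapy.contrib.spiders.init import InitSpider

class SessionSpider(InitSpider):
    """Illustrative only: fetch a session-setting page before any start_urls."""
    domain_name = 'example.com'
    start_urls = ['http://example.com/private/page1.html']

    def init_request(self):
        # The last (here, the only) initialization request must use
        # self.initialized as its callback; requests for start_urls queued in
        # the meantime are released once it fires.
        return Request('http://example.com/start_session', callback=self.initialized)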

View File

@ -46,12 +46,12 @@ method ``parse`` for each of the resulting responses.
This is the method called by Scrapy when the spider is opened for scraping
when no particular URLs are specified. If particular URLs are specified,
the :meth:`BaseSpider.make_request_from_url` is used instead to create the
the :meth:`BaseSpider.make_requests_from_url` is used instead to create the
Requests. This method is also called only once from Scrapy, so it's safe to
implement it as a generator.
The default implementation uses :meth:`BaseSpider.make_request_from_url` to
generate Requests for each url in :attr:`start_urls`.
The default implementation uses :meth:`BaseSpider.make_requests_from_url`
to generate Requests for each url in :attr:`start_urls`.
If you want to change the Requests used to start scraping a domain, this is
the method to override. For example, if you need to start by logging in using
@ -67,11 +67,13 @@ method ``parse`` for each of the resulting responses.
# each of them, with another callback
pass
.. method:: BaseSpider.make_request_from_url(url)
.. method:: BaseSpider.make_requests_from_url(url)
A method that receives a URL and returns an :class:`~scrapy.http.Request`
object to scrape that URL with this spider. This method is used to
construct the initial requests in the :meth:`start_requests` method.
A method that receives a URL and returns a :class:`~scrapy.http.Request`
object (or a list of :class:`~scrapy.http.Request` objects) to scrape. This
method is used to construct the initial requests in the
:meth:`start_requests` method, and is typically used to convert urls to
requests.
Unless overridden, this method returns Requests with the :meth:`parse`
method as their callback function, and with the dont_filter parameter enabled
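
Under the revised contract, an overriding spider may fan one start URL out into several Requests. A minimal sketch, assuming an invented spider and URL scheme (Request and dont_filter are taken from the documentation above):

from scrapy.http import Request
from scrapy.spider import BaseSpider

class MultiRequestSpider(BaseSpider):
    """Illustrative only: turn each start URL into two Requests."""
    domain_name = 'example.com'
    start_urls = ['http://example.com/listing']

    def make_requests_from_url(self, url):
        # A single Request is still accepted; a list (or any iterable) now
        # works too, and start_requests() flattens whatever comes back.
        return [Request(url, callback=self.parse, dont_filter=True),
                Request(url + '?page=2', callback=self.parse, dont_filter=True)]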

View File

@ -8,7 +8,7 @@ See documentation in docs/ref/spiders.rst
import copy
from scrapy.http import Request
from scrapy.spider import BaseSpider
from scrapy.contrib.spiders.init import InitSpider
from scrapy.conf import settings
class Rule(object):
@ -45,7 +45,7 @@ class Rule(object):
else:
self.follow = follow
class CrawlSpider(BaseSpider):
class CrawlSpider(InitSpider):
"""
Class for spiders that crawl over web pages and extract/parse their links
given some crawling rules.

View File

@ -5,14 +5,14 @@ for scraping from an XML feed.
See documentation in docs/ref/spiders.rst
"""
from scrapy.spider import BaseSpider
from scrapy.contrib.spiders.init import InitSpider
from scrapy.item import ScrapedItem
from scrapy.http import Request
from scrapy.utils.iterators import xmliter, csviter
from scrapy.xpath.selector import XmlXPathSelector, HtmlXPathSelector
from scrapy.core.exceptions import NotConfigured, NotSupported
class XMLFeedSpider(BaseSpider):
class XMLFeedSpider(InitSpider):
"""
This class intends to be the base class for spiders that scrape
from XML feeds.
@ -88,7 +88,7 @@ class XMLFeedSpider(BaseSpider):
for (prefix, uri) in self.namespaces:
selector.register_namespace(prefix, uri)
class CSVFeedSpider(BaseSpider):
class CSVFeedSpider(InitSpider):
"""Spider for parsing CSV feeds.
It receives a CSV file in a response; iterates through each of its rows,
and calls parse_row with a dict containing each field's data.
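
A short hedged sketch of a CSVFeedSpider subclass; the spider, URL and import path are assumptions, the parse_row signature (response plus row dict) is inferred from the docstring above, and self.log is assumed to be the BaseSpider logging helper that appears in the last file of this diff:

from scrapy.contrib.spiders.feed import CSVFeedSpider

class ProductCSVSpider(CSVFeedSpider):
    """Illustrative only: log each row of a CSV export."""
    domain_name = 'example.com'
    start_urls = ['http://example.com/products.csv']

    def parse_row(self, response, row):
        # row is assumed to be a dict keyed by the CSV header fields.
        self.log('Got product row: %r' % (row,))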

View File

@ -0,0 +1,44 @@
from scrapy.spider import BaseSpider

class InitSpider(BaseSpider):
    """Base Spider with initialization facilities"""

    def __init__(self):
        super(InitSpider, self).__init__()
        self._postinit_reqs = []  # requests queued until initialization finishes
        self._init_complete = False
        self._init_started = False

    def make_requests_from_url(self, url):
        req = super(InitSpider, self).make_requests_from_url(url)
        if self._init_complete:
            return req
        # Initialization still pending: queue the request and, the first time
        # through, return the initialization request instead.
        self._postinit_reqs.append(req)
        if not self._init_started:
            self._init_started = True
            return self.init_request()

    def initialized(self, response=None):
        """This method must be set as the callback of your last initialization
        request. See self.init_request() docstring for more info.
        """
        self._init_complete = True
        reqs = self._postinit_reqs[:]
        del self._postinit_reqs
        return reqs

    def init_request(self):
        """This function should return one initialization request, with the
        self.initialized method as callback. When the self.initialized method
        is called this spider is considered initialized. If you need to perform
        several requests for initializing your spider, you can do so by using
        different callbacks. The only requirement is that the final callback
        (of the last initialization request) must be self.initialized.

        The default implementation calls self.initialized immediately, which
        means that no initialization is needed. This method should be
        overridden only when you need to perform requests to initialize your
        spider.
        """
        return self.initialized()
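
To make the init_request protocol concrete, here is a hedged sketch of a spider that chains two initialization requests before crawling; every name except InitSpider, Request, init_request and initialized is invented for illustration:

from scrapy.http import Request
from scrapy.contrib.spiders.init import InitSpider

class TwoStepInitSpider(InitSpider):
    """Illustrative only: run two requests before normal crawling starts."""
    domain_name = 'example.com'
    start_urls = ['http://example.com/data.html']

    def init_request(self):
        # First initialization request; its callback issues the second one.
        return Request('http://example.com/step1', callback=self.after_step1)

    def after_step1(self, response):
        # The final initialization request ends in self.initialized, which
        # marks the spider initialized and releases the queued start requests.
        return Request('http://example.com/step2', callback=self.initialized)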

View File

@ -23,7 +23,7 @@ from scrapy.item import ScrapedItem
from scrapy.item.pipeline import ItemPipelineManager
from scrapy.spider import spiders
from scrapy.spider.middleware import SpiderMiddlewareManager
from scrapy.utils.defer import chain_deferred, defer_succeed, mustbe_deferred, deferred_imap
from scrapy.utils.defer import chain_deferred, deferred_imap
from scrapy.utils.request import request_info
class ExecutionEngine(object):
@ -50,7 +50,6 @@ class ExecutionEngine(object):
def __init__(self):
self.configured = False
self.keep_alive = False
self.initializing = set() # domains in initialization state
self.cancelled = set() # domains in cancelation state
self.debug_mode = settings.getbool('ENGINE_DEBUG')
self.tasks = []
@ -216,8 +215,7 @@ class ExecutionEngine(object):
pending = self.scheduler.domain_has_pending_requests(domain)
downloading = not self.downloader.domain_is_idle(domain)
haspipe = not self.pipeline.domain_is_idle(domain)
oninit = domain in self.initializing
return not (pending or downloading or haspipe or oninit or scraping)
return not (pending or downloading or haspipe or scraping)
def domain_is_open(self, domain):
return domain in self.downloader.sites
@ -369,57 +367,6 @@ class ExecutionEngine(object):
dwld.addBoth(_on_complete)
return deferred
def initialize(self, spider):
    domain = spider.domain_name
    if not hasattr(spider, 'init_domain'):
        return defer_succeed(True)

    def _initialize(req):
        if isinstance(req, Request):
            _response = None

            def _referer(response):
                req.deferred.addCallback(_setreferer, response)
                return response

            def _setreferer(result, response):
                if isinstance(result, Request):
                    result.headers.setdefault('Referer', response.url)
                return result

            def _onerror(_failure):
                ex = _failure.value
                if isinstance(ex, IgnoreRequest):
                    log.msg(ex.message, log.DEBUG, domain=domain)
                else:
                    return _failure

            schd = self.schedule(req, spider)
            schd.addCallback(_referer)
            chain_deferred(schd, req.deferred)
            schd.addErrback(_onerror)
            schd.addBoth(_initialize)
            return schd
        return req

    def _bugtrap(_failure):
        log.msg("Bug in %s init_domain code: %s" % (domain, _failure), log.ERROR, domain=domain)

    def _state(state):
        self.initializing.remove(domain)
        if state is True:
            log.msg('Succeeded initialization for %s' % domain, log.INFO, domain=domain)
        else:
            log.msg('Failed initialization for %s' % domain, log.INFO, domain=domain)
        return state

    log.msg('Started initialization for %s' % domain, log.INFO, domain=domain)
    self.initializing.add(domain)
    req = spider.init_domain()
    deferred = mustbe_deferred(_initialize, req)
    deferred.addErrback(_bugtrap)
    deferred.addCallback(_state)
    return deferred
def open_domain(self, domain, spider=None):
log.msg("Domain opened", domain=domain)
spider = spider or spiders.fromdomain(domain)
@ -431,15 +378,8 @@ class ExecutionEngine(object):
self._scraping[domain] = set()
signals.send_catch_log(signals.domain_open, sender=self.__class__, domain=domain, spider=spider)
# init_domain
dfd = self.initialize(spider)
def _state(state):
    if state is True:
        signals.send_catch_log(signals.domain_opened, sender=self.__class__, domain=domain, spider=spider)
        self._run_starters(spider)
    else:
        self._domain_idle(domain)
dfd.addCallback(_state)
signals.send_catch_log(signals.domain_opened, sender=self.__class__, domain=domain, spider=spider)
self._run_starters(spider)
def _domain_idle(self, domain):
"""Called when a domain gets idle. This function is called when there are no

View File

@ -7,7 +7,7 @@ from scrapy import log
from scrapy.http import Request
from scrapy.core.engine import scrapyengine
from scrapy.spider import spiders
from scrapy.utils.misc import load_object
from scrapy.utils.misc import load_object, arg_to_iter
from scrapy.utils.url import is_url
from scrapy.conf import settings
@ -126,8 +126,8 @@ class ExecutionManager(object):
for url in urls:
spider = spiders.fromurl(url)
if spider:
req = spider.make_request_from_url(url)
perdomain.setdefault(spider.domain_name, []).append(req)
for req in arg_to_iter(spider.make_requests_from_url(url)):
perdomain.setdefault(spider.domain_name, []).append(req)
else:
log.msg('Could not find spider for <%s>' % url, log.ERROR)
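
The arg_to_iter helper imported above is what lets make_requests_from_url return either a single Request or a list of them. A hedged sketch of the behaviour relied on here, inferred from this call site rather than from the helper's own code:

from scrapy.http import Request
from scrapy.utils.misc import arg_to_iter

single = Request('http://example.com/a')
several = [Request('http://example.com/b'), Request('http://example.com/c')]

# Both shapes can be looped over uniformly.
assert len(list(arg_to_iter(single))) == 1
assert len(list(arg_to_iter(several))) == 2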

View File

@ -8,6 +8,7 @@ from twisted.plugin import IPlugin
from scrapy import log
from scrapy.http import Request
from scrapy.utils.misc import arg_to_iter
def _valid_domain_name(obj):
"""Check the domain name specified is valid"""
@ -39,13 +40,6 @@ class ISpider(Interface, IPlugin) :
invariant(_valid_domain_name)
invariant(_valid_download_delay)
def init_domain(self):
    """This is first called to initialize domain specific quirks, like
    session cookies or login stuff
    """
    pass
class BaseSpider(object):
"""Base class for scrapy spiders. All spiders must inherit from this
class.
@ -64,9 +58,12 @@ class BaseSpider(object):
log.msg(message, domain=self.domain_name, level=level)
def start_requests(self):
return [self.make_request_from_url(url) for url in self.start_urls]
reqs = []
for url in self.start_urls:
    reqs.extend(arg_to_iter(self.make_requests_from_url(url)))
return reqs
def make_request_from_url(self, url):
def make_requests_from_url(self, url):
return Request(url, callback=self.parse, dont_filter=True)
def parse(self, response):