Moved init_domain functionality out of the engine (refs #88) and into the
spider level. A new spider (InitSpider, not yet documented) was added to provide initialization facilities. Also renamed make_request_from_url to make_requests_from_url and allowed it to return iterables.
commit 3cb12e5dd3
parent b040e6a3a7
@@ -46,12 +46,12 @@ method ``parse`` for each of the resulting responses.
     This is the method called by Scrapy when the spider is opened for scraping
     when no particular URLs are specified. If particular URLs are specified,
-    the :meth:`BaseSpider.make_request_from_url` is used instead to create the
+    the :meth:`BaseSpider.make_requests_from_url` is used instead to create the
     Requests. This method is also called only once from Scrapy, so it's safe to
     implement it as a generator.
 
-    The default implementation uses :meth:`BaseSpider.make_request_from_url` to
-    generate Requests for each url in :attr:`start_urls`.
+    The default implementation uses :meth:`BaseSpider.make_requests_from_url`
+    to generate Requests for each url in :attr:`start_urls`.
 
     If you want to change the Requests used to start scraping a domain, this is
     the method to override. For example, if you need to start by login in using
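As a quick illustration of what the documentation above allows (not part of this commit), a spider can implement start_requests as a generator that begins with a login-style request. ExampleSpider, the URLs and the parse_login callback are hypothetical names; only Request and BaseSpider come from the code shown in this diff.

    from scrapy.http import Request
    from scrapy.spider import BaseSpider

    class ExampleSpider(BaseSpider):
        domain_name = 'example.com'

        def start_requests(self):
            # Generator form, as the docs above permit: start with a login-style
            # request and only continue once its callback has run.
            yield Request('http://www.example.com/login', callback=self.parse_login)

        def parse_login(self, response):
            # Once "logged in", return the requests that start the actual crawl.
            return [Request('http://www.example.com/items', callback=self.parse)]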
@@ -67,11 +67,13 @@ method ``parse`` for each of the resulting responses.
            # each of them, with another callback
            pass
 
-.. method:: BaseSpider.make_request_from_url(url)
+.. method:: BaseSpider.make_requests_from_url(url)
 
-    A method that receives a URL and returns an :class:`~scrapy.http.Request`
-    object to scrape that URL with this spider. This method is used to
-    construct the initial requests in the :meth:`start_requests` method.
+    A method that receives a URL and returns a :class:`~scrapy.http.Request`
+    object (or a list of :class:`~scrapy.http.Request` objects) to scrape. This
+    method is used to construct the initial requests in the
+    :meth:`start_requests` method, and is typically used to convert urls to
+    requests.
 
     Unless overridden, this method returns Requests with the :meth:`parse`
     method as their callback function, and with dont_filter parameter enabled
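A minimal sketch of the new contract documented above, assuming a made-up MirrorSpider and URLs (not part of this commit): make_requests_from_url may now return either a single Request or a list of Requests.

    from scrapy.http import Request
    from scrapy.spider import BaseSpider

    class MirrorSpider(BaseSpider):
        domain_name = 'example.org'
        start_urls = ['http://www.example.org/index.html']

        def make_requests_from_url(self, url):
            # Return a list instead of a single Request: fetch the page itself
            # and a (hypothetical) printable variant of it.
            return [
                Request(url, callback=self.parse, dont_filter=True),
                Request(url + '?printable=1', callback=self.parse, dont_filter=True),
            ]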
@@ -8,7 +8,7 @@ See documentation in docs/ref/spiders.rst
 import copy
 
 from scrapy.http import Request
-from scrapy.spider import BaseSpider
+from scrapy.contrib.spiders.init import InitSpider
 from scrapy.conf import settings
 
 class Rule(object):
@@ -45,7 +45,7 @@ class Rule(object):
         else:
             self.follow = follow
 
-class CrawlSpider(BaseSpider):
+class CrawlSpider(InitSpider):
     """
     Class for spiders that crawl over web pages and extract/parse their links
     given some crawling rules.
@@ -5,14 +5,14 @@ for scraping from an XML feed.
 See documentation in docs/ref/spiders.rst
 """
 
-from scrapy.spider import BaseSpider
+from scrapy.contrib.spiders.init import InitSpider
 from scrapy.item import ScrapedItem
 from scrapy.http import Request
 from scrapy.utils.iterators import xmliter, csviter
 from scrapy.xpath.selector import XmlXPathSelector, HtmlXPathSelector
 from scrapy.core.exceptions import NotConfigured, NotSupported
 
-class XMLFeedSpider(BaseSpider):
+class XMLFeedSpider(InitSpider):
     """
     This class intends to be the base class for spiders that scrape
     from XML feeds.
@@ -88,7 +88,7 @@ class XMLFeedSpider(BaseSpider):
         for (prefix, uri) in self.namespaces:
             selector.register_namespace(prefix, uri)
 
-class CSVFeedSpider(BaseSpider):
+class CSVFeedSpider(InitSpider):
     """Spider for parsing CSV feeds.
     It receives a CSV file in a response; iterates through each of its rows,
     and calls parse_row with a dict containing each field's data.
scrapy/contrib/spiders/init.py (new file, 44 lines)
@@ -0,0 +1,44 @@
+from scrapy.spider import BaseSpider
+
+class InitSpider(BaseSpider):
+    """Base Spider with initialization facilities"""
+
+    def __init__(self):
+        super(InitSpider, self).__init__()
+        self._postinit_reqs = []
+        self._init_complete = False
+        self._init_started = False
+
+    def make_requests_from_url(self, url):
+        req = super(InitSpider, self).make_requests_from_url(url)
+        if self._init_complete:
+            return req
+        self._postinit_reqs.append(req)
+        if not self._init_started:
+            self._init_started = True
+            return self.init_request()
+
+    def initialized(self, response=None):
+        """This method must be set as the callback of your last initialization
+        request. See self.init_request() docstring for more info.
+        """
+        self._init_complete = True
+        reqs = self._postinit_reqs[:]
+        del self._postinit_reqs
+        return reqs
+
+    def init_request(self):
+        """This function should return one initialization request, with the
+        self.initialized method as callback. When the self.initialized method
+        is called this spider is considered initialized. If you need to perform
+        several requests for initializing your spider, you can do so by using
+        different callbacks. The only requirement is that the final callback
+        (of the last initialization request) must be self.initialized.
+
+        The default implementation calls self.initialized immediately, and
+        means that no initialization is needed. This method should be
+        overridden only when you need to perform requests to initialize your
+        spider
+        """
+        return self.initialized()
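For context, a hypothetical subclass of the new InitSpider could look like the following sketch. The domain, URLs and the check_login callback are invented for illustration; the only hard requirement, per the docstrings above, is that the last initialization callback ends by calling self.initialized(), which hands the queued start requests back to the engine.

    from scrapy.http import Request
    from scrapy.contrib.spiders.init import InitSpider

    class LoginFirstSpider(InitSpider):
        domain_name = 'example.com'
        start_urls = ['http://www.example.com/private/index.html']

        def init_request(self):
            # One initialization request; its callback chain must end in
            # self.initialized, which releases the requests queued while
            # initialization was pending.
            return Request('http://www.example.com/login', callback=self.check_login)

        def check_login(self, response):
            # Inspect the login response here; once satisfied, report the
            # spider as initialized so the pending requests get scheduled.
            return self.initialized(response)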
@@ -23,7 +23,7 @@ from scrapy.item import ScrapedItem
 from scrapy.item.pipeline import ItemPipelineManager
 from scrapy.spider import spiders
 from scrapy.spider.middleware import SpiderMiddlewareManager
-from scrapy.utils.defer import chain_deferred, defer_succeed, mustbe_deferred, deferred_imap
+from scrapy.utils.defer import chain_deferred, deferred_imap
 from scrapy.utils.request import request_info
 
 class ExecutionEngine(object):
@@ -50,7 +50,6 @@ class ExecutionEngine(object):
     def __init__(self):
         self.configured = False
         self.keep_alive = False
-        self.initializing = set() # domais in intialization state
         self.cancelled = set() # domains in cancelation state
         self.debug_mode = settings.getbool('ENGINE_DEBUG')
         self.tasks = []
@@ -216,8 +215,7 @@ class ExecutionEngine(object):
         pending = self.scheduler.domain_has_pending_requests(domain)
         downloading = not self.downloader.domain_is_idle(domain)
         haspipe = not self.pipeline.domain_is_idle(domain)
-        oninit = domain in self.initializing
-        return not (pending or downloading or haspipe or oninit or scraping)
+        return not (pending or downloading or haspipe or scraping)
 
     def domain_is_open(self, domain):
         return domain in self.downloader.sites
@@ -369,57 +367,6 @@ class ExecutionEngine(object):
         dwld.addBoth(_on_complete)
         return deferred
 
-    def initialize(self, spider):
-        domain = spider.domain_name
-        if not hasattr(spider, 'init_domain'):
-            return defer_succeed(True)
-
-        def _initialize(req):
-            if isinstance(req, Request):
-                _response = None
-                def _referer(response):
-                    req.deferred.addCallback(_setreferer, response)
-                    return response
-
-                def _setreferer(result, response):
-                    if isinstance(result, Request):
-                        result.headers.setdefault('Referer', response.url)
-                    return result
-
-                def _onerror(_failure):
-                    ex = _failure.value
-                    if isinstance(ex, IgnoreRequest):
-                        log.msg(ex.message, log.DEBUG, domain=domain)
-                    else:
-                        return _failure
-
-                schd = self.schedule(req, spider)
-                schd.addCallback(_referer)
-                chain_deferred(schd, req.deferred)
-                schd.addErrback(_onerror)
-                schd.addBoth(_initialize)
-                return schd
-            return req
-
-        def _bugtrap(_failure):
-            log.msg("Bug in %s init_domain code: %s" % (domain, _failure), log.ERROR, domain=domain)
-
-        def _state(state):
-            self.initializing.remove(domain)
-            if state is True:
-                log.msg('Succeded initialization for %s' % domain, log.INFO, domain=domain)
-            else:
-                log.msg('Failed initialization for %s' % domain, log.INFO, domain=domain)
-            return state
-
-        log.msg('Started initialization for %s' % domain, log.INFO, domain=domain)
-        self.initializing.add(domain)
-        req = spider.init_domain()
-        deferred = mustbe_deferred(_initialize, req)
-        deferred.addErrback(_bugtrap)
-        deferred.addCallback(_state)
-        return deferred
-
     def open_domain(self, domain, spider=None):
         log.msg("Domain opened", domain=domain)
         spider = spider or spiders.fromdomain(domain)
@@ -431,15 +378,8 @@ class ExecutionEngine(object):
         self._scraping[domain] = set()
         signals.send_catch_log(signals.domain_open, sender=self.__class__, domain=domain, spider=spider)
 
-        # init_domain
-        dfd = self.initialize(spider)
-        def _state(state):
-            if state is True:
-                signals.send_catch_log(signals.domain_opened, sender=self.__class__, domain=domain, spider=spider)
-                self._run_starters(spider)
-            else:
-                self._domain_idle(domain)
-        dfd.addCallback(_state)
+        signals.send_catch_log(signals.domain_opened, sender=self.__class__, domain=domain, spider=spider)
+        self._run_starters(spider)
 
     def _domain_idle(self, domain):
         """Called when a domain gets idle. This function is called when there are no
@@ -7,7 +7,7 @@ from scrapy import log
 from scrapy.http import Request
 from scrapy.core.engine import scrapyengine
 from scrapy.spider import spiders
-from scrapy.utils.misc import load_object
+from scrapy.utils.misc import load_object, arg_to_iter
 from scrapy.utils.url import is_url
 from scrapy.conf import settings
 
@@ -126,8 +126,8 @@ class ExecutionManager(object):
         for url in urls:
             spider = spiders.fromurl(url)
             if spider:
-                req = spider.make_request_from_url(url)
-                perdomain.setdefault(spider.domain_name, []).append(req)
+                for req in arg_to_iter(spider.make_requests_from_url(url)):
+                    perdomain.setdefault(spider.domain_name, []).append(req)
             else:
                 log.msg('Could not find spider for <%s>' % url, log.ERROR)
 
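The arg_to_iter helper itself is not shown in this diff; the behaviour the manager (and BaseSpider.start_requests below) relies on is roughly the following sketch, an assumption based on how it is called here rather than a copy of the actual scrapy.utils.misc code.

    def arg_to_iter(arg):
        # Sketch only: normalize None, a single object, or an iterable into
        # something iterable, which is what the call sites in this commit need.
        if arg is None:
            return []
        if hasattr(arg, '__iter__'):
            return arg
        return [arg]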
@@ -8,6 +8,7 @@ from twisted.plugin import IPlugin
 
 from scrapy import log
 from scrapy.http import Request
+from scrapy.utils.misc import arg_to_iter
 
 def _valid_domain_name(obj):
     """Check the domain name specified is valid"""
@@ -39,13 +40,6 @@ class ISpider(Interface, IPlugin) :
     invariant(_valid_domain_name)
     invariant(_valid_download_delay)
 
-    def init_domain(self):
-        """This is first called to initialize domain specific quirks, like
-        session cookies or login stuff
-        """
-        pass
-
 
 class BaseSpider(object):
     """Base class for scrapy spiders. All spiders must inherit from this
     class.
@@ -64,9 +58,12 @@ class BaseSpider(object):
         log.msg(message, domain=self.domain_name, level=level)
 
     def start_requests(self):
-        return [self.make_request_from_url(url) for url in self.start_urls]
+        reqs = []
+        for url in self.start_urls:
+            reqs.extend(arg_to_iter(self.make_requests_from_url(url)))
+        return reqs
 
-    def make_request_from_url(self, url):
+    def make_requests_from_url(self, url):
         return Request(url, callback=self.parse, dont_filter=True)
 
     def parse(self, response):