
added new CrawlSpider that's gonna replace the old CrawlSpider soon

--HG--
extra : convert_revision : svn%3Ab85faa78-f9eb-468e-a121-7cced6da292c%40415
Pablo Hoffman 2008-11-25 02:38:26 +00:00
parent f558a49d2c
commit 0fbb7579f6


@@ -0,0 +1,83 @@
"""
This module contains the basic crawling spider from which you can inherit your
own spiders.
"""
from scrapy.http import Request
from scrapy.spider import BaseSpider
from scrapy.item import ScrapedItem

class CrawlSpider(BaseSpider):
    """
    This is the base class for crawling spiders. It is based on a list of
    crawling rules (stored in the "rules" attribute) which specify how links
    are extracted and followed, and how pages are processed (by using a
    callback function).

    For more info about rules see the Rule class.
    """

    def parse(self, response):
        """Method called by the framework core for all the start_urls. Do not
        override this function, override parse_start_url instead.
        """
        if response.url in self.start_urls:
            return self._parse_wrapper(response, self.parse_start_url, cb_kwargs={}, follow=True)
        else:
            return self.parse_url(response)

    def parse_start_url(self, response):
        """Callback function for processing start_urls. It must return a list
        of ScrapedItems and/or Requests.
        """
        return []

    def _requests_to_follow(self, response):
        requests = []
        seen = set()
        for rule in self.rules:
            # String callbacks are resolved to spider methods; rules without a callback keep None
            callback = rule.callback if callable(rule.callback) else getattr(self, rule.callback or '', None)
            links = [l for l in rule.link_extractor.extract_urls(response) if l not in seen]
            seen.update(links)  # update() adds in place; union() only returns a new set
            for link in links:
                r = Request(url=link.url, link_text=link.text)
                r.append_callback(self._parse_wrapper, callback, cb_kwargs=rule.cb_kwargs, follow=rule.follow)
                requests.append(r)
        return requests

    def _parse_wrapper(self, response, callback, cb_kwargs, follow):
        res = []
        if follow:
            res.extend(self._requests_to_follow(response))
        if callback:
            res.extend(callback(response, **cb_kwargs) or ())
        return res

class Rule(object):
    """
    A rule for crawling, which receives the following constructor arguments:

    link_extractor (required)
        A LinkExtractor which defines the policy for extracting links

    callback (optional)
        A function to use to process the page once it has been downloaded. If
        callback is omitted the page is not processed, just crawled. If callback
        is a string (instead of a callable) a method of the spider class with
        that name is used as the callback function

    cb_kwargs (optional)
        A dict specifying keyword arguments to pass to the callback function

    follow (optional)
        If True, links will be followed from the pages crawled by this rule.
        It defaults to True when no callback is specified, or to False when a
        callback is specified
    """

    def __init__(self, link_extractor, callback=None, cb_kwargs=None, follow=None):
        self.link_extractor = link_extractor
        self.callback = callback
        self.cb_kwargs = cb_kwargs or {}
        if follow is None:
            # follow defaults to True only for rules that have no callback
            self.follow = False if callback else True
        else:
            self.follow = follow
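
For context, the sketch below shows how a spider might be written against the new class. It is not part of this commit: the spider name, URLs, and the stub link extractor are placeholders, standing in for any object that provides the extract_urls(response) method which _requests_to_follow relies on, and the example assumes it lives alongside the module above so CrawlSpider and Rule are already in scope.

# Hypothetical usage sketch (not part of this commit).

class _StubLinkExtractor(object):
    """Placeholder for a real LinkExtractor: CrawlSpider only needs an
    extract_urls(response) method returning objects with .url and .text."""
    def extract_urls(self, response):
        return []

class ExampleSpider(CrawlSpider):
    start_urls = ['http://www.example.com/']

    rules = (
        # No callback: matched pages are only crawled, and follow defaults to True
        Rule(_StubLinkExtractor()),
        # String callback: resolved to the spider method of the same name;
        # follow defaults to False because a callback is given
        Rule(_StubLinkExtractor(), callback='parse_item', cb_kwargs={'source': 'crawl'}),
    )

    def parse_item(self, response, source):
        # Like parse_start_url, a callback returns a list of ScrapedItems and/or Requests
        return []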