
more refactoring to CrawlSpider

--HG--
extra : convert_revision : svn%3Ab85faa78-f9eb-468e-a121-7cced6da292c%40447
Pablo Hoffman 2008-11-28 01:57:07 +00:00
parent 1b92fef9a7
commit 6d5a130b96


@@ -1,3 +1,5 @@
+import copy
+
 from scrapy.http import Request
 from scrapy.spider import BaseSpider
 from scrapy.item import ScrapedItem
@@ -19,18 +21,18 @@ class Rule(object):
         If True, links will be followed from the pages crawled by this rule.
         It defaults to True when no callback is specified or False when a
         callback is specified
 
-    link_filter (optional)
+    process_links (optional)
         Can be either a callable, or a string with the name of a method defined
         in the spider's class.
         This method will be called with the list of extracted links matching
         this rule (if any) and must return another list of links.
     """
-    def __init__(self, link_extractor, callback=None, cb_kwargs=None, follow=None, link_filter=None):
+    def __init__(self, link_extractor, callback=None, cb_kwargs=None, follow=None, process_links=None):
         self.link_extractor = link_extractor
         self.callback = callback
         self.cb_kwargs = cb_kwargs or {}
-        self.link_filter = link_filter
+        self.process_links = process_links
         if follow is None:
             self.follow = False if callback else True
         else:
@@ -50,38 +52,29 @@ class CrawlSpider(BaseSpider):
     rules = ()
 
     def __init__(self):
-        def _get_method(method):
-            if isinstance(method, basestring):
-                return getattr(self, method, None)
-            elif method and callable(method):
-                return method
-
+        """Constructor takes care of compiling rules"""
         super(CrawlSpider, self).__init__()
-        for rule in self.rules:
-            rule.callback = _get_method(rule.callback)
-            rule.link_filter = _get_method(rule.link_filter)
+        self._compile_rules()
 
     def parse(self, response):
-        """This function is called by the core for all the start_urls. Do not
-        override this function, override parse_start_url instead."""
-        if response.url in self.start_urls:
-            return self._parse_wrapper(response, self.parse_start_url, cb_kwargs={}, follow=True)
-        else:
-            return self.parse_url(response)
+        """This function is called by the framework core for all the
+        start_urls. Do not override this function, override parse_start_url
+        instead."""
+        return self._response_downloaded(response, self.parse_start_url, cb_kwargs={}, follow=True)
 
     def parse_start_url(self, response):
-        """Callback function for processing start_urls. It must return a list
-        of ScrapedItems and/or Requests."""
+        """Overrideable callback function for processing start_urls. It must
+        return a list of ScrapedItems and/or Requests"""
         return []
 
-    def scraped_item(self, response, item):
-        """
-        This method is called for each item returned by the spider, and it's intended
-        to do anything that it's needed before returning the item to the core, specially
-        setting its GUID.
-        It receives and returns an item
-        """
-        return item
+    def process_results(self, results, response):
+        """This overridable method is called for each result (item or request)
+        returned by the spider, and it's intended to perform any last time
+        processing required before returning the results to the framework core,
+        for example setting the item GUIDs. It receives a list of results and
+        the response which originated that results. It must return a list
+        of results (Items or Requests)."""
+        return results
 
     def _requests_to_follow(self, response):
         """
@@ -91,18 +84,18 @@ class CrawlSpider(BaseSpider):
         """
         requests = []
         seen = set()
-        for rule in self.rules:
-            links = [link for link in rule.link_extractor.extract_urls(response) if link not in seen]
-            if rule.link_filter:
-                links = rule.link_filter(links)
+        for rule in self._rules:
+            links = [l for l in rule.link_extractor.extract_urls(response) if l not in seen]
+            if rule.process_links:
+                links = rule.process_links(links)
             seen = seen.union(links)
             for link in links:
                 r = Request(url=link.url, link_text=link.text)
-                r.append_callback(self._parse_wrapper, rule.callback, cb_kwargs=rule.cb_kwargs, follow=rule.follow)
+                r.append_callback(self._response_downloaded, rule.callback, cb_kwargs=rule.cb_kwargs, follow=rule.follow)
                 requests.append(r)
         return requests
 
-    def _parse_wrapper(self, response, callback, cb_kwargs, follow):
+    def _response_downloaded(self, response, callback, cb_kwargs, follow):
         """
         This is were any response arrives, and were it's decided whether
         to extract links or not from it, and if it will be parsed or not.
@@ -113,25 +106,20 @@ class CrawlSpider(BaseSpider):
         res.extend(self._requests_to_follow(response))
 
         if callback:
             cb_res = callback(response, **cb_kwargs) or ()
-            for entry in cb_res:
-                if isinstance(entry, ScrapedItem):
-                    entry = self.scraped_item(response, entry)
+            cb_res = self.process_results(cb_res, response)
             res.extend(cb_res)
 
         return res
 
-    def parse_url(self, response):
-        """
-        This method is called whenever you run scrapy with the 'parse' command
-        over an URL.
-        """
-        ret = set()
-        for rule in self.rules:
-            links = [link for link in rule.link_extractor.extract_urls(response) if link not in ret]
-            if rule.link_filter:
-                links = rule.link_filter(links)
-            ret = ret.union(links)
-        if rule.callback and rule.link_extractor.match(response.url):
-            ret = ret.union(rule.callback(response))
-        return list(ret)
+    def _compile_rules(self):
+        """Compile the crawling rules"""
+        def get_method(method):
+            if callable(method):
+                return method
+            elif isinstance(method, basestring):
+                return getattr(self, method, None)
+        self._rules = [copy.copy(r) for r in self.rules]
+        for rule in self._rules:
+            rule.callback = get_method(rule.callback)
+            rule.process_links = get_method(rule.process_links)
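
For reference, below is a minimal sketch of how a spider might use the hooks this refactoring introduces: process_links on Rule, parse_start_url, process_results, and string callbacks resolved by _compile_rules. The spider name, domain, start URLs, import paths and the RegexLinkExtractor class are illustrative assumptions and are not part of this commit; only the Rule/CrawlSpider hooks come from the diff above.

    # Illustrative sketch only: import locations, the link extractor class,
    # spider name, domain and URLs are assumptions.
    from scrapy.contrib.spiders import CrawlSpider, Rule   # assumed location
    from scrapy.link.extractors import RegexLinkExtractor  # assumed location

    class ExampleSpider(CrawlSpider):
        domain_name = 'example.com'                # hypothetical
        start_urls = ['http://www.example.com/']   # hypothetical

        rules = (
            # No callback, so follow defaults to True; 'filter_links' is a
            # string that _compile_rules() resolves to the bound method.
            Rule(RegexLinkExtractor(r'/category/'), process_links='filter_links'),
            # A callback is given, so follow defaults to False.
            Rule(RegexLinkExtractor(r'/item/'), callback='parse_item'),
        )

        def filter_links(self, links):
            # Receives the links extracted for a rule; must return a list of links.
            return [link for link in links if 'print' not in link.url]

        def parse_start_url(self, response):
            # Called for the start_urls; must return ScrapedItems and/or Requests.
            return []

        def parse_item(self, response):
            # Per-rule callback; its results are passed through process_results().
            return []

        def process_results(self, results, response):
            # Last-chance hook before results reach the framework core,
            # e.g. for setting item GUIDs.
            return results

Note that _compile_rules() works on copies of the class-level rules tuple (self._rules = [copy.copy(r) for r in self.rules]), so resolving string callbacks to bound methods happens per spider instance instead of mutating the shared Rule objects.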