Mirror of https://github.com/scrapy/scrapy.git (synced 2025-02-27 13:05:38 +00:00)
more refactoring to CrawlSpider
--HG-- extra : convert_revision : svn%3Ab85faa78-f9eb-468e-a121-7cced6da292c%40447
commit 6d5a130b96
parent 1b92fef9a7
@@ -1,3 +1,5 @@
+import copy
+
 from scrapy.http import Request
 from scrapy.spider import BaseSpider
 from scrapy.item import ScrapedItem
@@ -19,18 +21,18 @@ class Rule(object):
     If True, links will be followed from the pages crawled by this rule.
     It defaults to True when no callback is specified or False when a
     callback is specified

-    link_filter (optional)
+    process_links (optional)
     Can be either a callable, or a string with the name of a method defined
     in the spider's class.
     This method will be called with the list of extracted links matching
     this rule (if any) and must return another list of links.
     """

-    def __init__(self, link_extractor, callback=None, cb_kwargs=None, follow=None, link_filter=None):
+    def __init__(self, link_extractor, callback=None, cb_kwargs=None, follow=None, process_links=None):
         self.link_extractor = link_extractor
         self.callback = callback
         self.cb_kwargs = cb_kwargs or {}
-        self.link_filter = link_filter
+        self.process_links = process_links
         if follow is None:
             self.follow = False if callback else True
         else:
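The hunk above renames Rule's link_filter argument to process_links; like callback, it may be a callable or the name of a spider method. A minimal sketch of a spider written against the renamed hook follows; the import paths, the RegexLinkExtractor class, and the domain_name attribute are assumptions, since this diff does not show them:

    # Sketch only: module paths and the extractor class are assumed, not shown
    # in this diff.
    from scrapy.contrib.spiders import CrawlSpider, Rule          # assumed path
    from scrapy.contrib.linkextractors import RegexLinkExtractor  # assumed name

    class ExampleSpider(CrawlSpider):
        domain_name = 'example.com'           # assumed BaseSpider attribute
        start_urls = ['http://example.com/']

        rules = (
            # before this commit: Rule(..., link_filter='drop_printable')
            # after this commit:  Rule(..., process_links='drop_printable')
            Rule(RegexLinkExtractor(r'/section/'), callback='parse_section',
                 process_links='drop_printable'),
        )

        def drop_printable(self, links):
            # process_links receives the links extracted for this rule and
            # must return another list of links.
            return [link for link in links if 'printable' not in link.url]

        def parse_section(self, response):
            return []

Because process_links (like callback) may be given as a string, the method is looked up on the spider instance when the rules are compiled (see _compile_rules in the last hunk below).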
@@ -50,38 +52,29 @@ class CrawlSpider(BaseSpider):
     rules = ()

     def __init__(self):
-        def _get_method(method):
-            if isinstance(method, basestring):
-                return getattr(self, method, None)
-            elif method and callable(method):
-                return method
-
+        """Constructor takes care of compiling rules"""
         super(CrawlSpider, self).__init__()
-        for rule in self.rules:
-            rule.callback = _get_method(rule.callback)
-            rule.link_filter = _get_method(rule.link_filter)
+        self._compile_rules()

     def parse(self, response):
-        """This function is called by the core for all the start_urls. Do not
-        override this function, override parse_start_url instead."""
-        if response.url in self.start_urls:
-            return self._parse_wrapper(response, self.parse_start_url, cb_kwargs={}, follow=True)
-        else:
-            return self.parse_url(response)
+        """This function is called by the framework core for all the
+        start_urls. Do not override this function, override parse_start_url
+        instead."""
+        return self._response_downloaded(response, self.parse_start_url, cb_kwargs={}, follow=True)

     def parse_start_url(self, response):
-        """Callback function for processing start_urls. It must return a list
-        of ScrapedItems and/or Requests."""
+        """Overrideable callback function for processing start_urls. It must
+        return a list of ScrapedItems and/or Requests"""
         return []

-    def scraped_item(self, response, item):
-        """
-        This method is called for each item returned by the spider, and it's intended
-        to do anything that it's needed before returning the item to the core, specially
-        setting its GUID.
-        It receives and returns an item
-        """
-        return item
+    def process_results(self, results, response):
+        """This overridable method is called for each result (item or request)
+        returned by the spider, and it's intended to perform any last time
+        processing required before returning the results to the framework core,
+        for example setting the item GUIDs. It receives a list of results and
+        the response which originated that results. It must return a list
+        of results (Items or Requests)."""
+        return results

     def _requests_to_follow(self, response):
         """
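Per the new docstring above, process_results replaces the old per-item scraped_item hook: it receives each callback's whole result list plus the response that produced it, and is the place for last-minute processing such as stamping item GUIDs. A hedged sketch of an override, reusing the hypothetical ExampleSpider from above; ScrapedItem's guid attribute is assumed for illustration:

    from scrapy.item import ScrapedItem

    class GuidSpider(ExampleSpider):

        def process_results(self, results, response):
            # Called once per response with the full result list; must
            # return a list of items and/or requests.
            for result in results:
                if isinstance(result, ScrapedItem):
                    result.guid = response.url   # assumed attribute
            return results

Unlike the old scraped_item(response, item), which was invoked only for ScrapedItem instances (see the last hunk below), this hook also sees any Requests a callback returns.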
@@ -91,18 +84,18 @@ class CrawlSpider(BaseSpider):
         """
         requests = []
         seen = set()
-        for rule in self.rules:
-            links = [link for link in rule.link_extractor.extract_urls(response) if link not in seen]
-            if rule.link_filter:
-                links = rule.link_filter(links)
+        for rule in self._rules:
+            links = [l for l in rule.link_extractor.extract_urls(response) if l not in seen]
+            if rule.process_links:
+                links = rule.process_links(links)
             seen = seen.union(links)
             for link in links:
                 r = Request(url=link.url, link_text=link.text)
-                r.append_callback(self._parse_wrapper, rule.callback, cb_kwargs=rule.cb_kwargs, follow=rule.follow)
+                r.append_callback(self._response_downloaded, rule.callback, cb_kwargs=rule.cb_kwargs, follow=rule.follow)
                 requests.append(r)
         return requests

-    def _parse_wrapper(self, response, callback, cb_kwargs, follow):
+    def _response_downloaded(self, response, callback, cb_kwargs, follow):
         """
         This is were any response arrives, and were it's decided whether
         to extract links or not from it, and if it will be parsed or not.
@@ -113,25 +106,20 @@ class CrawlSpider(BaseSpider):
             res.extend(self._requests_to_follow(response))
         if callback:
             cb_res = callback(response, **cb_kwargs) or ()
-            for entry in cb_res:
-                if isinstance(entry, ScrapedItem):
-                    entry = self.scraped_item(response, entry)
+            cb_res = self.process_results(cb_res, response)
             res.extend(cb_res)
         return res

-    def parse_url(self, response):
-        """
-        This method is called whenever you run scrapy with the 'parse' command
-        over an URL.
-        """
-        ret = set()
-        for rule in self.rules:
-            links = [link for link in rule.link_extractor.extract_urls(response) if link not in ret]
-            if rule.link_filter:
-                links = rule.link_filter(links)
-            ret = ret.union(links)
+    def _compile_rules(self):
+        """Compile the crawling rules"""

-            if rule.callback and rule.link_extractor.match(response.url):
-                ret = ret.union(rule.callback(response))
-        return list(ret)
+        def get_method(method):
+            if callable(method):
+                return method
+            elif isinstance(method, basestring):
+                return getattr(self, method, None)

+        self._rules = [copy.copy(r) for r in self.rules]
+        for rule in self._rules:
+            rule.callback = get_method(rule.callback)
+            rule.process_links = get_method(rule.process_links)
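The new _compile_rules copies each Rule with copy.copy before resolving string names into bound methods, so the class-level rules tuple is never mutated; the resolved copies live in self._rules, which is why _requests_to_follow now iterates self._rules. The old __init__ resolved methods in place on the shared Rule objects, so the first instance constructed leaked its bound methods into every later one. A small sketch of the new behavior, using the hypothetical ExampleSpider above and assuming BaseSpider needs no constructor arguments at this revision:

    spider = ExampleSpider()   # __init__ calls self._compile_rules()

    # The class attribute keeps the declarative string names...
    assert ExampleSpider.rules[0].callback == 'parse_section'
    # ...while the per-instance copies hold resolved bound methods.
    assert spider._rules[0].callback == spider.parse_section
    assert spider._rules[0].process_links == spider.drop_printable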