Mirror of https://github.com/scrapy/scrapy.git (synced 2025-02-23 13:04:20 +00:00)
Added the possibility of checking whether a response should or shouldn't be parsed by the corresponding parse_<suffix> method (by defining a check_<suffix> method) in CrawlSpider
--HG-- extra : convert_revision : svn%3Ab85faa78-f9eb-468e-a121-7cced6da292c%40369
parent 6db6b8c52e
commit c5d134053b
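In spider terms, the change lets a CrawlSpider define an optional check_<suffix> method next to each links_<suffix> extractor and parse_<suffix> callback: the response is handed to parse_<suffix> only when check_<suffix> is absent or returns a true value. A minimal sketch of the convention, assuming CrawlSpider is importable from this module and using a plain regex as a stand-in for a link-extractor object (it satisfies the match() path used below; a real extractor would also provide extract_urls()):

import re

class ProductSpider(CrawlSpider):
    # Found by CrawlSpider.__init__ because the name starts with 'links_';
    # registered as the pair ('product', <extractor>).
    links_product = re.compile(r'/product/')  # stand-in for a real link extractor

    def check_product(self, response):
        # The new hook: consulted before parse_product; a falsy return skips it.
        return not response.url.endswith('/draft')

    def parse_product(self, response):
        # Reached only when check_product is absent or returned a true value.
        return []  # ScrapedItem objects would be built and returned here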
@@ -30,15 +30,14 @@ class CrawlSpider(BasicSpider):
         self._links_callback = []
         for attr in dir(self):
             if attr.startswith('links_'):
-                suffix = attr.split('_', 1)[1]
-                value = getattr(self, attr)
-                callback = getattr(self, 'parse_%s' % suffix, None)
-                self._links_callback.append((value, callback))
+                suffix = attr[6:]
+                extractor = getattr(self, attr)
+                self._links_callback.append((suffix, extractor))
 
     def parse(self, response):
         """This function is called by the core for all the start_urls. Do not
         override this function, override parse_start_url instead."""
-        return self._parse_wrapper(response, self.parse_start_url)
+        return self._parse_wrapper(response, callback=self.parse_start_url)
 
     def parse_start_url(self, response):
         """Callback function for processing start_urls. It must return a list
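The rewritten registration pass above stores (suffix, extractor) pairs instead of (extractor, callback) pairs, deferring the parse_<suffix> and check_<suffix> lookups until a response is actually handled; attr[6:] simply strips the six-character 'links_' prefix. A standalone sketch of what the loop computes, with strings standing in for extractor objects:

class _Demo(object):
    links_category = 'category-extractor'  # stand-in for a link extractor
    links_product = 'product-extractor'

demo = _Demo()
pairs = [(attr[6:], getattr(demo, attr))
         for attr in dir(demo) if attr.startswith('links_')]
# pairs == [('category', 'category-extractor'), ('product', 'product-extractor')]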
@@ -48,21 +47,28 @@ class CrawlSpider(BasicSpider):
     def _links_to_follow(self, response):
         res = []
         links_to_follow = {}
-        for lx, callback in self._links_callback:
+        for suffix, lx in self._links_callback:
             links = lx.extract_urls(response)
             links = self.post_extract_links(links) if hasattr(self, 'post_extract_links') else links
             for link in links:
-                links_to_follow[link.url] = (callback, link.text)
-
-        for url, (callback, link_text) in links_to_follow.iteritems():
-            request = Request(url=url, link_text=link_text)
-            request.append_callback(self._parse_wrapper, callback)
-            res.append(request)
+                request = Request(url=link.url, link_text=link.text)
+                request.append_callback(self._parse_wrapper, suffix=suffix)
+                res.append(request)
         return res
 
-    def _parse_wrapper(self, response, callback):
+    def _parse_wrapper(self, response, **kwargs):
+        suffix = kwargs.get('suffix')
+        callback = kwargs.get('callback')
         res = self._links_to_follow(response)
-        res += callback(response) if callback else ()
+        if suffix:
+            check_fcn = getattr(self, 'check_%s' % suffix, None)
+            if not check_fcn or check_fcn(response):
+                parse_fcn = getattr(self, 'parse_%s' % suffix, None)
+                res += parse_fcn(response) if parse_fcn else ()
+        elif callback:
+            res += callback(response)
 
         for entry in res:
             if isinstance(entry, ScrapedItem):
                 self.set_guid(entry)
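With the hunk above, requests queued by _links_to_follow carry the suffix of the extractor that found them, while parse() still passes an explicit callback for start_urls; _parse_wrapper then prefers the check_<suffix>/parse_<suffix> pair over the plain callback. A condensed, self-contained sketch of that dispatch rule (dispatch, spider and response are generic stand-ins; the getattr names mirror the diff):

def dispatch(spider, response, suffix=None, callback=None):
    results = []
    if suffix:
        # A defined check_<suffix> can veto parsing for this response.
        check_fcn = getattr(spider, 'check_%s' % suffix, None)
        if not check_fcn or check_fcn(response):
            parse_fcn = getattr(spider, 'parse_%s' % suffix, None)
            if parse_fcn:
                results += parse_fcn(response)
    elif callback:
        results += callback(response)
    return results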
@@ -77,10 +83,12 @@ class CrawlSpider(BasicSpider):
         ret = []
         for name in extractor_names:
             extractor = getattr(self, name)
-            callback_name = 'parse_%s' % name[6:]
-            if hasattr(self, callback_name):
-                if extractor.match(response.url):
-                    ret.extend(getattr(self, callback_name)(response))
+            check_name = 'check_%s' % name[6:]
+            parse_name = 'parse_%s' % name[6:]
+            if extractor.match(response.url) and hasattr(self, parse_name):
+                if hasattr(self, check_name) and not getattr(self, check_name)(response):
+                    continue
+                ret.extend(getattr(self, parse_name)(response))
         for entry in ret:
             if isinstance(entry, ScrapedItem):
                 self.set_guid(entry)
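The same check_<suffix> gate also guards the path above, where registered extractors are matched against response.url. A runnable toy reproduction of that loop, with a regex playing the extractor role (none of the Fake* classes are Scrapy code):

import re

class FakeResponse(object):
    def __init__(self, url):
        self.url = url

class FakeSpider(object):
    links_product = re.compile(r'/product/')  # plays the extractor role

    def check_product(self, response):
        return not response.url.endswith('/draft')

    def parse_product(self, response):
        return ['item from %s' % response.url]

spider, ret = FakeSpider(), []
for name in (n for n in dir(spider) if n.startswith('links_')):
    extractor = getattr(spider, name)
    check_name = 'check_%s' % name[6:]
    parse_name = 'parse_%s' % name[6:]
    for response in (FakeResponse('/product/1'), FakeResponse('/product/2/draft')):
        if extractor.match(response.url) and hasattr(spider, parse_name):
            if hasattr(spider, check_name) and not getattr(spider, check_name)(response):
                continue  # check_product vetoed this response
            ret.extend(getattr(spider, parse_name)(response))

assert ret == ['item from /product/1']  # the draft URL was skipped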