
Added the possibility of checking whether a response should or shouldn't be parsed with the corresponding parse_<suffix> method (by defining check_<suffix>) in CrawlSpider

--HG--
extra : convert_revision : svn%3Ab85faa78-f9eb-468e-a121-7cced6da292c%40369
elpolilla 2008-11-10 15:30:45 +00:00
parent 6db6b8c52e
commit c5d134053b


@@ -30,15 +30,14 @@ class CrawlSpider(BasicSpider):
         self._links_callback = []
         for attr in dir(self):
             if attr.startswith('links_'):
-                suffix = attr.split('_', 1)[1]
-                value = getattr(self, attr)
-                callback = getattr(self, 'parse_%s' % suffix, None)
-                self._links_callback.append((value, callback))
+                suffix = attr[6:]
+                extractor = getattr(self, attr)
+                self._links_callback.append((suffix, extractor))
 
     def parse(self, response):
         """This function is called by the core for all the start_urls. Do not
         override this function, override parse_start_url instead."""
-        return self._parse_wrapper(response, self.parse_start_url)
+        return self._parse_wrapper(response, callback=self.parse_start_url)
 
     def parse_start_url(self, response):
         """Callback function for processing start_urls. It must return a list
@@ -48,21 +47,28 @@ class CrawlSpider(BasicSpider):
     def _links_to_follow(self, response):
         res = []
         links_to_follow = {}
-        for lx, callback in self._links_callback:
+        for suffix, lx in self._links_callback:
             links = lx.extract_urls(response)
             links = self.post_extract_links(links) if hasattr(self, 'post_extract_links') else links
             for link in links:
-                links_to_follow[link.url] = (callback, link.text)
-
-        for url, (callback, link_text) in links_to_follow.iteritems():
-            request = Request(url=url, link_text=link_text)
-            request.append_callback(self._parse_wrapper, callback)
-            res.append(request)
+                request = Request(url=link.url, link_text=link.text)
+                request.append_callback(self._parse_wrapper, suffix=suffix)
+                res.append(request)
         return res
 
-    def _parse_wrapper(self, response, callback):
+    def _parse_wrapper(self, response, **kwargs):
+        suffix = kwargs.get('suffix')
+        callback = kwargs.get('callback')
         res = self._links_to_follow(response)
-        res += callback(response) if callback else ()
+        if suffix:
+            check_fcn = getattr(self, 'check_%s' % suffix, None)
+            if not check_fcn or check_fcn(response):
+                parse_fcn = getattr(self, 'parse_%s' % suffix, None)
+                res += parse_fcn(response) if parse_fcn else ()
+        elif callback:
+            res += callback(response)
         for entry in res:
             if isinstance(entry, ScrapedItem):
                 self.set_guid(entry)
@@ -77,10 +83,12 @@ class CrawlSpider(BasicSpider):
         ret = []
         for name in extractor_names:
             extractor = getattr(self, name)
-            callback_name = 'parse_%s' % name[6:]
-            if hasattr(self, callback_name):
-                if extractor.match(response.url):
-                    ret.extend(getattr(self, callback_name)(response))
+            check_name = 'check_%s' % name[6:]
+            parse_name = 'parse_%s' % name[6:]
+            if extractor.match(response.url) and hasattr(self, parse_name):
+                if hasattr(self, check_name) and not getattr(self, check_name)(response):
+                    continue
+                ret.extend(getattr(self, parse_name)(response))
         for entry in ret:
             if isinstance(entry, ScrapedItem):
                 self.set_guid(entry)
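
For context, a minimal usage sketch of the hook this commit adds: a spider declares a links_<suffix> extractor attribute and, optionally, a check_<suffix> predicate next to its parse_<suffix> callback; _parse_wrapper then skips parse_<suffix> whenever the check returns a falsy value, while still following the links extracted from that response. Everything below (class names, attributes, and the placeholder check) is an illustrative assumption, not code from this commit.

# Illustrative sketch only (not part of this commit). Assumes CrawlSpider and
# ScrapedItem are importable as in the module above.
class ProductLinkExtractor(object):
    """Hypothetical stand-in: _links_to_follow() only requires an
    extract_urls(response) method returning links with .url and .text."""
    def extract_urls(self, response):
        return []  # a real extractor would return the product links found in the response

class ExampleSpider(CrawlSpider):
    start_urls = ['http://example.com/catalogue']

    # Picked up by CrawlSpider.__init__ because the name starts with 'links_';
    # the suffix 'product' selects check_product/parse_product below.
    links_product = ProductLinkExtractor()

    def check_product(self, response):
        # Optional gate added by this commit: returning a falsy value skips
        # parse_product for this response, but its links are still followed.
        return 'price' in str(response.body)  # placeholder, site-specific test

    def parse_product(self, response):
        item = ScrapedItem()
        # ... populate item from the response (site-specific) ...
        return [item]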