mirror of https://github.com/scrapy/scrapy.git synced 2025-02-26 19:43:40 +00:00

added rule shorthand, for creating CrawlSpider rules

--HG--
extra : convert_revision : svn%3Ab85faa78-f9eb-468e-a121-7cced6da292c%40864
Ismael Carnales 2009-02-18 11:18:11 +00:00
parent 84f63b146b
commit b20ac057b3
2 changed files with 10 additions and 1 deletion

scrapy/contrib/spiders/__init__.py

@@ -1,2 +1,2 @@
-from scrapy.contrib.spiders.crawl import CrawlSpider, Rule
+from scrapy.contrib.spiders.crawl import CrawlSpider, Rule, rule
 from scrapy.contrib.spiders.feed import XMLFeedSpider, CSVFeedSpider

scrapy/contrib/spiders/crawl.py

@@ -10,6 +10,7 @@ import copy
 from scrapy.http import Request
 from scrapy.spider import BaseSpider
 from scrapy.conf import settings
+from scrapy.link.extractors import RegexLinkExtractor
 
 class Rule(object):
     """
@@ -45,6 +46,14 @@ class Rule(object):
         else:
             self.follow = follow
 
+
+def rule(regex, callback=None, exclude=None, follow=None):
+    """Shorthand for creating CrawlSpider rules"""
+    deny = [exclude] if exclude else []
+    link_extractor = RegexLinkExtractor(allow=[regex], deny=deny)
+    return Rule(link_extractor, callback=callback, follow=follow)
+
+
 class CrawlSpider(BaseSpider):
     """
     Class for spiders that crawl over web pages and extract/parse their links
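
For context, here is a minimal sketch of how the new shorthand might be used in a spider. The spider name, domain, URL patterns, and parse_item callback are hypothetical, and passing the callback by name is assumed to be supported at this revision:

from scrapy.contrib.spiders import CrawlSpider, rule

class ExampleSpider(CrawlSpider):
    # hypothetical spider, shown only to illustrate the shorthand
    domain_name = 'example.com'
    start_urls = ['http://www.example.com/']

    rules = (
        # follow category pages without parsing them
        rule('category\.php', follow=True),
        # parse item pages, skipping printer-friendly copies; expands to
        # Rule(RegexLinkExtractor(allow=['item\.php'], deny=['print\.php']),
        #      callback='parse_item', follow=None)
        rule('item\.php', callback='parse_item', exclude='print\.php'),
    )

    def parse_item(self, response):
        pass  # extraction logic would go here

Compared with building a RegexLinkExtractor by hand for every rule, the shorthand keeps simple allow/deny patterns on one line while still returning an ordinary Rule object.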