From b20ac057b33ea18d192e7911ea323e48884ef34f Mon Sep 17 00:00:00 2001
From: Ismael Carnales
Date: Wed, 18 Feb 2009 11:18:11 +0000
Subject: [PATCH] added rule shorthand, for creating CrawlSpider rules

--HG--
extra : convert_revision : svn%3Ab85faa78-f9eb-468e-a121-7cced6da292c%40864
---
 scrapy/trunk/scrapy/contrib/spiders/__init__.py | 2 +-
 scrapy/trunk/scrapy/contrib/spiders/crawl.py    | 9 +++++++++
 2 files changed, 10 insertions(+), 1 deletion(-)

diff --git a/scrapy/trunk/scrapy/contrib/spiders/__init__.py b/scrapy/trunk/scrapy/contrib/spiders/__init__.py
index 6a310dd44..fdd68948e 100644
--- a/scrapy/trunk/scrapy/contrib/spiders/__init__.py
+++ b/scrapy/trunk/scrapy/contrib/spiders/__init__.py
@@ -1,2 +1,2 @@
-from scrapy.contrib.spiders.crawl import CrawlSpider, Rule
+from scrapy.contrib.spiders.crawl import CrawlSpider, Rule, rule
 from scrapy.contrib.spiders.feed import XMLFeedSpider, CSVFeedSpider
diff --git a/scrapy/trunk/scrapy/contrib/spiders/crawl.py b/scrapy/trunk/scrapy/contrib/spiders/crawl.py
index 8d4a97b08..f62cef294 100644
--- a/scrapy/trunk/scrapy/contrib/spiders/crawl.py
+++ b/scrapy/trunk/scrapy/contrib/spiders/crawl.py
@@ -10,6 +10,7 @@ import copy
 from scrapy.http import Request
 from scrapy.spider import BaseSpider
 from scrapy.conf import settings
+from scrapy.link.extractors import RegexLinkExtractor
 
 class Rule(object):
     """
@@ -45,6 +46,14 @@ class Rule(object):
         else:
             self.follow = follow
 
+
+def rule(regex, callback=None, exclude=None, follow=None):
+    """Shorthand for creating CrawlSpider rules"""
+    deny = [exclude] if exclude else []
+    link_extractor = RegexLinkExtractor(allow=[regex], deny=deny)
+    return Rule(link_extractor, callback=callback, follow=follow)
+
+
 class CrawlSpider(BaseSpider):
     """
     Class for spiders that crawl over web pages and extract/parse their links