diff --git a/scrapy/trunk/scrapy/contrib/spiders/crawl.py b/scrapy/trunk/scrapy/contrib/spiders/crawl.py
index 89414ee38..7a2626be6 100644
--- a/scrapy/trunk/scrapy/contrib/spiders/crawl.py
+++ b/scrapy/trunk/scrapy/contrib/spiders/crawl.py
@@ -85,7 +85,7 @@ class CrawlSpider(BaseSpider):
         requests = []
         seen = set()
         for rule in self._rules:
-            links = [l for l in rule.link_extractor.extract_urls(response) if l not in seen]
+            links = [l for l in rule.link_extractor.extract_links(response) if l not in seen]
             if links and rule.process_links:
                 links = rule.process_links(links)
             seen = seen.union(links)
diff --git a/scrapy/trunk/scrapy/link/__init__.py b/scrapy/trunk/scrapy/link/__init__.py
index 58ca1f641..38c0715ff 100644
--- a/scrapy/trunk/scrapy/link/__init__.py
+++ b/scrapy/trunk/scrapy/link/__init__.py
@@ -7,7 +7,7 @@ from scrapy.utils.url import urljoin_rfc as urljoin
 
 class LinkExtractor(FixedSGMLParser):
     """LinkExtractor are used to extract links from web pages. They are
-    instantiated and later "applied" to a Response using the extract_urls
+    instantiated and later "applied" to a Response using the extract_links
     method which must receive a Response object and return a dict whoose
     keys are the (absolute) urls to follow, and its values any arbitrary
     data. In this case the values are the text of the hyperlink.
@@ -16,7 +16,7 @@ class LinkExtractor(FixedSGMLParser):
     functionality for extracting links to follow, but you could override this
     class or create a new one if you need some additional functionality. The
     only requisite is that the new (or overrided) class must provide a
-    extract_urls method that receives a Response and returns a dict with the
+    extract_links method that receives a Response and returns a dict with the
     links to follow as its keys.
 
     The constructor arguments are:
@@ -35,7 +35,7 @@ class LinkExtractor(FixedSGMLParser):
         self.scan_attr = attr if callable(attr) else lambda a: a == attr
         self.current_link = None
 
-    def extract_urls(self, response, unique=False):
+    def extract_links(self, response, unique=False):
         self.reset()
         self.unique = unique
         self.feed(response.body.to_string())
diff --git a/scrapy/trunk/scrapy/link/extractors.py b/scrapy/trunk/scrapy/link/extractors.py
index 8b830c743..224e4ef1e 100644
--- a/scrapy/trunk/scrapy/link/extractors.py
+++ b/scrapy/trunk/scrapy/link/extractors.py
@@ -54,11 +54,11 @@ class RegexLinkExtractor(LinkExtractor):
             attr_func = lambda x: x in attrs
         LinkExtractor.__init__(self, tag=tag_func, attr=attr_func)
 
-    def extract_urls(self, response, unique=True):
+    def extract_links(self, response, unique=True):
         if self.restrict_xpaths:
             response = new_response_from_xpaths(response, self.restrict_xpaths)
 
-        links = LinkExtractor.extract_urls(self, response, unique)
+        links = LinkExtractor.extract_links(self, response, unique)
         links = [link for link in links if _is_valid_url(link.url)]
 
         if self.allow_res:
diff --git a/scrapy/trunk/scrapy/tests/test_link.py b/scrapy/trunk/scrapy/tests/test_link.py
index 43f95d3d0..446991713 100644
--- a/scrapy/trunk/scrapy/tests/test_link.py
+++ b/scrapy/trunk/scrapy/tests/test_link.py
@@ -17,7 +17,7 @@ class LinkExtractorTestCase(unittest.TestCase):
 
         response = Response("example.org", "http://example.org/somepage/index.html", body=html)
         lx = LinkExtractor()  # default: tag=a, attr=href
-        self.assertEqual(lx.extract_urls(response),
+        self.assertEqual(lx.extract_links(response),
                          [Link(url='http://example.org/somepage/item/12.html', text='Item 12'),
                           Link(url='http://example.org/about.html', text='About us'),
                           Link(url='http://example.org/othercat.html', text='Other category'),
@@ -30,7 +30,7 @@ class LinkExtractorTestCase(unittest.TestCase):
 
         response = Response("example.org", "http://example.org/somepage/index.html", body=html)
         lx = LinkExtractor()  # default: tag=a, attr=href
-        self.assertEqual(lx.extract_urls(response),
+        self.assertEqual(lx.extract_links(response),
                          [Link(url='http://otherdomain.com/base/item/12.html', text='Item 12')])
 
     def test_matches(self):
diff --git a/scrapy/trunk/scrapy/tests/test_spiders/testplugin.py b/scrapy/trunk/scrapy/tests/test_spiders/testplugin.py
index 818f6c8ea..d47264db2 100644
--- a/scrapy/trunk/scrapy/tests/test_spiders/testplugin.py
+++ b/scrapy/trunk/scrapy/tests/test_spiders/testplugin.py
@@ -22,7 +22,7 @@ class TestSpider(BaseSpider):
     def parse(self, response):
         xlink = LinkExtractor()
         itemre = re.compile(self.itemurl_re)
-        for link in xlink.extract_urls(response):
+        for link in xlink.extract_links(response):
             if itemre.search(link.url):
                 yield Request(url=link.url, callback=self.parse_item)
 
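The change is a pure rename of `extract_urls` to `extract_links`; call signatures and return values are untouched. Below is a minimal sketch of the renamed API, mirroring how test_link.py and testplugin.py drive it. The `scrapy.http` import path and the item-URL regex are assumptions for illustration, not part of this diff; only `scrapy.link.LinkExtractor`, the `Response(domain, url, body=...)` construction, and the `Link` attributes `.url`/`.text` are taken from the files above.

```python
import re

from scrapy.http import Response       # assumed module path for this era of the trunk
from scrapy.link import LinkExtractor  # matches scrapy/trunk/scrapy/link/__init__.py

# Build a Response the same way test_link.py does: (domain, url, body).
html = """<a href='item/12.html'>Item 12</a> <a href='/about.html'>About us</a>"""
response = Response("example.org", "http://example.org/somepage/index.html", body=html)

# extract_links() (formerly extract_urls) returns Link objects whose .url is
# already absolutized against the response URL and whose .text is the anchor text.
lx = LinkExtractor()  # default: tag=a, attr=href
itemre = re.compile(r'item/\d+\.html')  # hypothetical item-page pattern
for link in lx.extract_links(response):
    if itemre.search(link.url):
        print link.url, link.text  # Python 2 print statement, matching the trunk's vintage
```

Note that the default for the `unique` argument differs between the two extractors in this diff: `LinkExtractor.extract_links` defaults to `unique=False`, while `RegexLinkExtractor.extract_links` defaults to `unique=True`, so callers of the base class that need de-duplicated links should pass `unique=True` explicitly.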