
Renamed LinkExtractor's extract_urls method to extract_links

--HG--
extra : convert_revision : svn%3Ab85faa78-f9eb-468e-a121-7cced6da292c%40594

Author: elpolilla
Date:   2009-01-02 02:34:44 +00:00
Parent: c82c799d07
Commit: 91a23e61bf

5 changed files with 9 additions and 9 deletions
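
The rename is mechanical: every call site moves from extract_urls to
extract_links, with signatures unchanged. A minimal before/after sketch of a
caller, assuming a 2009-era import path that is not shown in this commit:

    from scrapy.link import LinkExtractor  # assumed import path

    lx = LinkExtractor()  # default: tag=a, attr=href
    # before this commit: links = lx.extract_urls(response)
    links = lx.extract_links(response)  # list of Link objects
    urls = [link.url for link in links]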


@@ -85,7 +85,7 @@ class CrawlSpider(BaseSpider):
         requests = []
         seen = set()
         for rule in self._rules:
-            links = [l for l in rule.link_extractor.extract_urls(response) if l not in seen]
+            links = [l for l in rule.link_extractor.extract_links(response) if l not in seen]
             if links and rule.process_links:
                 links = rule.process_links(links)
             seen = seen.union(links)
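
For context, a sketch of the spider side that feeds this loop: each rule
pairs a link extractor with an optional process_links filter, matching the
attributes read above. The Rule name, import paths, and domain_name attribute
are assumptions about the 2009-era layout, not part of this diff:

    from scrapy.contrib.spiders import CrawlSpider, Rule   # assumed paths
    from scrapy.link.extractors import RegexLinkExtractor  # assumed path

    def drop_logout_links(links):
        # matches the rule.process_links hook used in the loop above
        return [l for l in links if 'logout' not in l.url]

    class MySpider(CrawlSpider):
        domain_name = 'example.org'  # assumed 2009-era spider attribute
        rules = (
            Rule(RegexLinkExtractor(), callback='parse_item',
                 process_links=drop_logout_links),
        )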


@@ -7,7 +7,7 @@ from scrapy.utils.url import urljoin_rfc as urljoin
 class LinkExtractor(FixedSGMLParser):
     """LinkExtractor are used to extract links from web pages. They are
-    instantiated and later "applied" to a Response using the extract_urls
+    instantiated and later "applied" to a Response using the extract_links
     method which must receive a Response object and return a dict whoose keys
     are the (absolute) urls to follow, and its values any arbitrary data. In
     this case the values are the text of the hyperlink.
@@ -16,7 +16,7 @@ class LinkExtractor(FixedSGMLParser):
     functionality for extracting links to follow, but you could override this
     class or create a new one if you need some additional functionality. The
     only requisite is that the new (or overrided) class must provide a
-    extract_urls method that receives a Response and returns a dict with the
+    extract_links method that receives a Response and returns a dict with the
     links to follow as its keys.

     The constructor arguments are:
@@ -35,7 +35,7 @@ class LinkExtractor(FixedSGMLParser):
         self.scan_attr = attr if callable(attr) else lambda a: a == attr
         self.current_link = None

-    def extract_urls(self, response, unique=False):
+    def extract_links(self, response, unique=False):
         self.reset()
         self.unique = unique
         self.feed(response.body.to_string())
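
The docstring's extension point is only the method name: any class exposing
extract_links can stand in. A sketch of a custom extractor honoring that
contract, using the tag/attr constructor arguments shown above (the class
name and import path are illustrative, not from this commit):

    from scrapy.link import LinkExtractor  # assumed import path

    class ImageLinkExtractor(LinkExtractor):
        # hypothetical subclass: scan <img src=...> instead of <a href=...>
        def __init__(self):
            LinkExtractor.__init__(self, tag='img', attr='src')

        def extract_links(self, response, unique=True):
            links = LinkExtractor.extract_links(self, response, unique)
            return [l for l in links if l.url.endswith('.jpg')]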


@@ -54,11 +54,11 @@ class RegexLinkExtractor(LinkExtractor):
             attr_func = lambda x: x in attrs
         LinkExtractor.__init__(self, tag=tag_func, attr=attr_func)

-    def extract_urls(self, response, unique=True):
+    def extract_links(self, response, unique=True):
         if self.restrict_xpaths:
             response = new_response_from_xpaths(response, self.restrict_xpaths)
-        links = LinkExtractor.extract_urls(self, response, unique)
+        links = LinkExtractor.extract_links(self, response, unique)
         links = [link for link in links if _is_valid_url(link.url)]

         if self.allow_res:
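
A usage sketch for the subclass: the allow_res and restrict_xpaths attributes
read above suggest allow and restrict_xpaths constructor keywords, but the
exact signature sits outside this hunk, so treat both as assumptions:

    from scrapy.link.extractors import RegexLinkExtractor  # assumed path

    lx = RegexLinkExtractor(allow=(r'item/\d+\.html',),  # assumed -> allow_res
                            restrict_xpaths=('//div[@id="nav"]',))
    links = lx.extract_links(response)  # note: unique defaults to True here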


@@ -17,7 +17,7 @@ class LinkExtractorTestCase(unittest.TestCase):
         response = Response("example.org", "http://example.org/somepage/index.html", body=html)
         lx = LinkExtractor()  # default: tag=a, attr=href
-        self.assertEqual(lx.extract_urls(response),
+        self.assertEqual(lx.extract_links(response),
             [Link(url='http://example.org/somepage/item/12.html', text='Item 12'),
              Link(url='http://example.org/about.html', text='About us'),
              Link(url='http://example.org/othercat.html', text='Other category'),
@@ -30,7 +30,7 @@ class LinkExtractorTestCase(unittest.TestCase):
         response = Response("example.org", "http://example.org/somepage/index.html", body=html)
         lx = LinkExtractor()  # default: tag=a, attr=href
-        self.assertEqual(lx.extract_urls(response),
+        self.assertEqual(lx.extract_links(response),
             [Link(url='http://otherdomain.com/base/item/12.html', text='Item 12')])

     def test_matches(self):
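
The unique flag (default False on LinkExtractor, True on RegexLinkExtractor)
is untouched by the rename. A sketch of a further test along these lines,
with an invented HTML fixture and assuming uniqueness is keyed on URL:

    def test_extract_links_unique(self):
        html = '<a href="a.html">One</a><a href="a.html">Two</a>'
        response = Response("example.org", "http://example.org/index.html", body=html)
        lx = LinkExtractor()
        self.assertEqual(len(lx.extract_links(response, unique=False)), 2)
        self.assertEqual(len(lx.extract_links(response, unique=True)), 1)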


@@ -22,7 +22,7 @@ class TestSpider(BaseSpider):

     def parse(self, response):
         xlink = LinkExtractor()
         itemre = re.compile(self.itemurl_re)
-        for link in xlink.extract_urls(response):
+        for link in xlink.extract_links(response):
             if itemre.search(link.url):
                 yield Request(url=link.url, callback=self.parse_item)
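
Any downstream code still calling extract_urls will now fail with an
AttributeError. If a transition period were needed, a throwaway shim (purely
illustrative, not part of this commit) could alias the old name:

    from scrapy.link import LinkExtractor  # assumed import path

    class CompatLinkExtractor(LinkExtractor):
        # hypothetical alias so pre-rename callers keep working temporarily
        def extract_urls(self, response, unique=False):
            return self.extract_links(response, unique)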