Renamed LinkExtractors extract_urls method to extract_links

--HG--
extra : convert_revision : svn%3Ab85faa78-f9eb-468e-a121-7cced6da292c%40594
commit 91a23e61bf
parent c82c799d07
Author: elpolilla
Date:   2009-01-02 02:34:44 +00:00

5 changed files with 9 additions and 9 deletions
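
Every call site renames one method; a minimal before/after sketch (the surrounding code is hypothetical, only the method name comes from this commit):

    lx = LinkExtractor()
    links = lx.extract_urls(response)   # before this commit
    links = lx.extract_links(response)  # after this commit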

@@ -85,7 +85,7 @@ class CrawlSpider(BaseSpider):
         requests = []
         seen = set()
         for rule in self._rules:
-            links = [l for l in rule.link_extractor.extract_urls(response) if l not in seen]
+            links = [l for l in rule.link_extractor.extract_links(response) if l not in seen]
             if links and rule.process_links:
                 links = rule.process_links(links)
             seen = seen.union(links)
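
This is the CrawlSpider loop that consumes each rule's link extractor. A sketch of a spider that would exercise it after the rename; the import paths and Rule signature are assumptions, since only link_extractor, process_links and extract_links() appear in the hunk above:

    # Hypothetical sketch; module paths and Rule's keyword arguments are assumed.
    from scrapy.contrib.spiders import CrawlSpider, Rule
    from scrapy.link.extractors import RegexLinkExtractor

    def drop_session_links(links):
        # process_links hook: receives the extracted Link objects, returns those to keep
        return [l for l in links if 'jsessionid' not in l.url]

    class MySpider(CrawlSpider):
        rules = (
            Rule(RegexLinkExtractor(allow=r'/item/'),
                 callback='parse_item',            # callback form is an assumption
                 process_links=drop_session_links),
        )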

@@ -7,7 +7,7 @@ from scrapy.utils.url import urljoin_rfc as urljoin
 
 class LinkExtractor(FixedSGMLParser):
     """LinkExtractor are used to extract links from web pages. They are
-    instantiated and later "applied" to a Response using the extract_urls
+    instantiated and later "applied" to a Response using the extract_links
     method which must receive a Response object and return a dict whoose keys
     are the (absolute) urls to follow, and its values any arbitrary data. In
     this case the values are the text of the hyperlink.
@@ -16,7 +16,7 @@ class LinkExtractor(FixedSGMLParser):
     functionality for extracting links to follow, but you could override this
     class or create a new one if you need some additional functionality. The
     only requisite is that the new (or overrided) class must provide a
-    extract_urls method that receives a Response and returns a dict with the
+    extract_links method that receives a Response and returns a dict with the
     links to follow as its keys.
 
     The constructor arguments are:
@@ -35,7 +35,7 @@ class LinkExtractor(FixedSGMLParser):
         self.scan_attr = attr if callable(attr) else lambda a: a == attr
         self.current_link = None
 
-    def extract_urls(self, response, unique=False):
+    def extract_links(self, response, unique=False):
         self.reset()
         self.unique = unique
         self.feed(response.body.to_string())
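
Per the docstring above, the renamed method is applied directly to a Response. A minimal sketch of a direct call, reusing the Response construction from the test file further down; the import paths are assumptions:

    # Sketch; import paths are assumed, Response(domain, url, body=...) mirrors the tests below.
    from scrapy.http import Response
    from scrapy.link.extractors import LinkExtractor

    html = '<a href="item/12.html">Item 12</a>'
    response = Response("example.org", "http://example.org/somepage/index.html", body=html)

    lx = LinkExtractor()                 # default: tag=a, attr=href
    links = lx.extract_links(response, unique=True)
    urls = [link.url for link in links]  # Link objects expose .url and .text (per the tests)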

@@ -54,11 +54,11 @@ class RegexLinkExtractor(LinkExtractor):
         attr_func = lambda x: x in attrs
         LinkExtractor.__init__(self, tag=tag_func, attr=attr_func)
 
-    def extract_urls(self, response, unique=True):
+    def extract_links(self, response, unique=True):
         if self.restrict_xpaths:
             response = new_response_from_xpaths(response, self.restrict_xpaths)
-        links = LinkExtractor.extract_urls(self, response, unique)
+        links = LinkExtractor.extract_links(self, response, unique)
         links = [link for link in links if _is_valid_url(link.url)]
         if self.allow_res:
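
This override narrows the response to the configured XPaths before delegating to the base extract_links, then filters by URL validity and the allow patterns. A sketch of constructing one; keyword names beyond restrict_xpaths are assumptions:

    # Hypothetical construction; the hunk only shows restrict_xpaths and allow_res being consulted.
    lx = RegexLinkExtractor(
        allow=(r'/item/\d+\.html',),            # presumably compiled into self.allow_res
        restrict_xpaths=('//div[@id="cat"]',),  # routes through new_response_from_xpaths() first
    )
    links = lx.extract_links(response)          # note: unique defaults to True here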

@@ -17,7 +17,7 @@ class LinkExtractorTestCase(unittest.TestCase):
         response = Response("example.org", "http://example.org/somepage/index.html", body=html)
         lx = LinkExtractor()  # default: tag=a, attr=href
-        self.assertEqual(lx.extract_urls(response),
+        self.assertEqual(lx.extract_links(response),
                          [Link(url='http://example.org/somepage/item/12.html', text='Item 12'),
                           Link(url='http://example.org/about.html', text='About us'),
                           Link(url='http://example.org/othercat.html', text='Other category'),
@@ -30,7 +30,7 @@ class LinkExtractorTestCase(unittest.TestCase):
         response = Response("example.org", "http://example.org/somepage/index.html", body=html)
         lx = LinkExtractor()  # default: tag=a, attr=href
-        self.assertEqual(lx.extract_urls(response),
+        self.assertEqual(lx.extract_links(response),
                          [Link(url='http://otherdomain.com/base/item/12.html', text='Item 12')])
 
     def test_matches(self):

@@ -22,7 +22,7 @@ class TestSpider(BaseSpider):
     def parse(self, response):
         xlink = LinkExtractor()
         itemre = re.compile(self.itemurl_re)
-        for link in xlink.extract_urls(response):
+        for link in xlink.extract_links(response):
             if itemre.search(link.url):
                 yield Request(url=link.url, callback=self.parse_item)