Renamed LinkExtractors extract_urls method to extract_links
--HG-- extra : convert_revision : svn%3Ab85faa78-f9eb-468e-a121-7cced6da292c%40594
This commit is contained in:
parent c82c799d07
commit 91a23e61bf
@@ -85,7 +85,7 @@ class CrawlSpider(BaseSpider):
         requests = []
         seen = set()
         for rule in self._rules:
-            links = [l for l in rule.link_extractor.extract_urls(response) if l not in seen]
+            links = [l for l in rule.link_extractor.extract_links(response) if l not in seen]
             if links and rule.process_links:
                 links = rule.process_links(links)
             seen = seen.union(links)
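This is the only call-site change in CrawlSpider: each rule's link_extractor is now asked for extract_links instead of extract_urls. Any object exposing that one method satisfies the loop above; a minimal duck-typed sketch (DummyLinkExtractor is a made-up name, and Link objects are assumed to be constructible as in the test hunks below):

# Hypothetical stand-in: the CrawlSpider loop above only requires an
# extract_links(response) method returning Link objects.
class DummyLinkExtractor(object):
    def __init__(self, links):
        self._links = links              # pre-baked Link objects for the sketch

    def extract_links(self, response):
        return list(self._links)         # real extractors parse the response body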
@@ -7,7 +7,7 @@ from scrapy.utils.url import urljoin_rfc as urljoin
 
 class LinkExtractor(FixedSGMLParser):
     """LinkExtractor are used to extract links from web pages. They are
-    instantiated and later "applied" to a Response using the extract_urls
+    instantiated and later "applied" to a Response using the extract_links
     method which must receive a Response object and return a dict whoose keys
     are the (absolute) urls to follow, and its values any arbitrary data. In
     this case the values are the text of the hyperlink.
@@ -16,7 +16,7 @@ class LinkExtractor(FixedSGMLParser):
     functionality for extracting links to follow, but you could override this
     class or create a new one if you need some additional functionality. The
     only requisite is that the new (or overrided) class must provide a
-    extract_urls method that receives a Response and returns a dict with the
+    extract_links method that receives a Response and returns a dict with the
     links to follow as its keys.
 
     The constructor arguments are:
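Note that the docstring still describes a dict-of-urls return value while the test hunks below exercise a list of Link objects; only the method name changes in this commit. A hedged usage sketch against the interface the tests show (the HTML body here is invented; the Response and LinkExtractor signatures are copied from the test hunks):

# Sketch only: Response(domain, url, body=...) mirrors the test fixtures;
# the anchor tag and URLs are illustrative, not taken from the diff.
html = '<a href="item/12.html">Item 12</a>'
response = Response("example.org", "http://example.org/somepage/index.html", body=html)

lx = LinkExtractor()                     # default: tag=a, attr=href
for link in lx.extract_links(response):
    print link.url, link.text            # Python 2, matching the codebase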
@@ -35,7 +35,7 @@ class LinkExtractor(FixedSGMLParser):
         self.scan_attr = attr if callable(attr) else lambda a: a == attr
         self.current_link = None
 
-    def extract_urls(self, response, unique=False):
+    def extract_links(self, response, unique=False):
         self.reset()
         self.unique = unique
         self.feed(response.body.to_string())
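The base method keeps unique=False as its default, while RegexLinkExtractor in the next hunk flips that default to unique=True; callers can always pass the flag explicitly. A small sketch, with lx and response set up as in the example above:

# Passing unique explicitly overrides either class's default.
deduped = lx.extract_links(response, unique=True)
everything = lx.extract_links(response, unique=False)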
@@ -54,11 +54,11 @@ class RegexLinkExtractor(LinkExtractor):
         attr_func = lambda x: x in attrs
         LinkExtractor.__init__(self, tag=tag_func, attr=attr_func)
 
-    def extract_urls(self, response, unique=True):
+    def extract_links(self, response, unique=True):
         if self.restrict_xpaths:
             response = new_response_from_xpaths(response, self.restrict_xpaths)
 
-        links = LinkExtractor.extract_urls(self, response, unique)
+        links = LinkExtractor.extract_links(self, response, unique)
         links = [link for link in links if _is_valid_url(link.url)]
 
         if self.allow_res:
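RegexLinkExtractor demonstrates the override pattern the docstring promises: delegate to the parent, then post-filter the resulting links. A hypothetical subclass following the same shape (PdfOnlyLinkExtractor is invented for illustration; only LinkExtractor.extract_links and link.url come from the diff):

# Made-up subclass mirroring the delegate-then-filter pattern above.
class PdfOnlyLinkExtractor(LinkExtractor):
    def extract_links(self, response, unique=True):
        links = LinkExtractor.extract_links(self, response, unique)
        return [link for link in links if link.url.endswith('.pdf')]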
@@ -17,7 +17,7 @@ class LinkExtractorTestCase(unittest.TestCase):
         response = Response("example.org", "http://example.org/somepage/index.html", body=html)
 
         lx = LinkExtractor()  # default: tag=a, attr=href
-        self.assertEqual(lx.extract_urls(response),
+        self.assertEqual(lx.extract_links(response),
                          [Link(url='http://example.org/somepage/item/12.html', text='Item 12'),
                           Link(url='http://example.org/about.html', text='About us'),
                           Link(url='http://example.org/othercat.html', text='Other category'),
@@ -30,7 +30,7 @@ class LinkExtractorTestCase(unittest.TestCase):
         response = Response("example.org", "http://example.org/somepage/index.html", body=html)
 
         lx = LinkExtractor()  # default: tag=a, attr=href
-        self.assertEqual(lx.extract_urls(response),
+        self.assertEqual(lx.extract_links(response),
                          [Link(url='http://otherdomain.com/base/item/12.html', text='Item 12')])
 
     def test_matches(self):
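This second test expects absolute URLs on otherdomain.com even though the response was fetched from example.org, which is consistent with the fixture HTML declaring a base href that the extractor resolves relative URLs against (the base tag itself is not visible in this diff). The resolution step, approximated with the stdlib:

# Approximation using Python 2's stdlib; the codebase actually uses
# scrapy.utils.url.urljoin_rfc (see the import in the second hunk).
from urlparse import urljoin
print urljoin('http://otherdomain.com/base/', 'item/12.html')
# -> http://otherdomain.com/base/item/12.html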
@@ -22,7 +22,7 @@ class TestSpider(BaseSpider):
     def parse(self, response):
         xlink = LinkExtractor()
         itemre = re.compile(self.itemurl_re)
-        for link in xlink.extract_urls(response):
+        for link in xlink.extract_links(response):
             if itemre.search(link.url):
                 yield Request(url=link.url, callback=self.parse_item)
 
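The parse method above spells out by hand the extract-filter-request pattern that the CrawlSpider rules in the first hunk automate. The same shape as a hypothetical reusable helper (follow_matching is invented; the Request, LinkExtractor, and re usage are taken from the hunk):

# Invented helper generalizing the loop above: extract, regex-filter, request.
def follow_matching(response, pattern, callback):
    lx = LinkExtractor()
    regex = re.compile(pattern)
    for link in lx.extract_links(response):
        if regex.search(link.url):
            yield Request(url=link.url, callback=callback)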