Mirror of https://github.com/scrapy/scrapy.git, synced 2025-02-27 06:23:41 +00:00
Renamed LinkExtractor's extract_urls method to extract_links
--HG-- extra : convert_revision : svn%3Ab85faa78-f9eb-468e-a121-7cced6da292c%40594
parent: c82c799d07
commit: 91a23e61bf
@@ -85,7 +85,7 @@ class CrawlSpider(BaseSpider):
         requests = []
         seen = set()
         for rule in self._rules:
-            links = [l for l in rule.link_extractor.extract_urls(response) if l not in seen]
+            links = [l for l in rule.link_extractor.extract_links(response) if l not in seen]
             if links and rule.process_links:
                 links = rule.process_links(links)
             seen = seen.union(links)
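The CrawlSpider hunk is the call-site half of the rename: each rule's extractor is now asked for extract_links, and the shared seen set keeps a link extracted by one rule from being processed again by a later one. A self-contained sketch of that dedup pattern, with stub classes standing in for Scrapy's Rule and LinkExtractor (illustrative only, not Scrapy code):

class StubExtractor:
    def __init__(self, links):
        self._links = links
    def extract_links(self, response):
        return self._links

class StubRule:
    def __init__(self, link_extractor, process_links=None):
        self.link_extractor = link_extractor
        self.process_links = process_links

rules = [StubRule(StubExtractor(["/a", "/b"])),
         StubRule(StubExtractor(["/b", "/c"]), process_links=sorted)]
seen = set()
for rule in rules:
    # same filter-then-process shape as the diff above
    links = [l for l in rule.link_extractor.extract_links(None) if l not in seen]
    if links and rule.process_links:
        links = rule.process_links(links)
    seen = seen.union(links)
print(sorted(seen))  # ['/a', '/b', '/c'] -- '/b' is handled only once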
@@ -7,7 +7,7 @@ from scrapy.utils.url import urljoin_rfc as urljoin
 
 class LinkExtractor(FixedSGMLParser):
     """LinkExtractor are used to extract links from web pages. They are
-    instantiated and later "applied" to a Response using the extract_urls
+    instantiated and later "applied" to a Response using the extract_links
     method which must receive a Response object and return a dict whoose keys
     are the (absolute) urls to follow, and its values any arbitrary data. In
     this case the values are the text of the hyperlink.
@@ -16,7 +16,7 @@ class LinkExtractor(FixedSGMLParser):
     functionality for extracting links to follow, but you could override this
     class or create a new one if you need some additional functionality. The
     only requisite is that the new (or overrided) class must provide a
-    extract_urls method that receives a Response and returns a dict with the
+    extract_links method that receives a Response and returns a dict with the
     links to follow as its keys.
 
     The constructor arguments are:
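The docstring spanning these two hunks pins down the contract: extract_links receives a Response and returns a dict whose keys are absolute URLs and whose values are arbitrary data, here the anchor text. (The tests later in this diff compare against a list of Link objects, so the dict wording may be stale.) A minimal custom extractor honoring the documented dict contract; the regex approach and the FakeResponse type are illustrative, not Scrapy's implementation:

import re
from urllib.parse import urljoin

class RegexHrefExtractor:
    HREF_RE = re.compile(r'<a\s[^>]*href="([^"]+)"[^>]*>(.*?)</a>', re.I | re.S)

    def extract_links(self, response):
        links = {}
        for href, text in self.HREF_RE.findall(response.body):
            # keys: absolute URLs to follow; values: the hyperlink text
            links[urljoin(response.url, href)] = text
        return links

class FakeResponse:
    def __init__(self, url, body):
        self.url, self.body = url, body

r = FakeResponse("http://example.org/somepage/index.html",
                 '<a href="item/12.html">Item 12</a>')
print(RegexHrefExtractor().extract_links(r))
# {'http://example.org/somepage/item/12.html': 'Item 12'}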
@@ -35,7 +35,7 @@ class LinkExtractor(FixedSGMLParser):
         self.scan_attr = attr if callable(attr) else lambda a: a == attr
         self.current_link = None
 
-    def extract_urls(self, response, unique=False):
+    def extract_links(self, response, unique=False):
         self.reset()
         self.unique = unique
         self.feed(response.body.to_string())
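Since this hunk renames the method itself with no alias left behind, any third-party caller of extract_urls breaks immediately. A hypothetical deprecation shim that would have kept old callers working (not part of this commit):

import warnings

class LinkExtractorWithShim:
    def extract_links(self, response, unique=False):
        return []  # stub body, for illustration only

    def extract_urls(self, response, unique=False):
        # old name forwards to the new one, with a warning
        warnings.warn("extract_urls() was renamed to extract_links()",
                      DeprecationWarning, stacklevel=2)
        return self.extract_links(response, unique)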
@@ -54,11 +54,11 @@ class RegexLinkExtractor(LinkExtractor):
         attr_func = lambda x: x in attrs
         LinkExtractor.__init__(self, tag=tag_func, attr=attr_func)
 
-    def extract_urls(self, response, unique=True):
+    def extract_links(self, response, unique=True):
         if self.restrict_xpaths:
             response = new_response_from_xpaths(response, self.restrict_xpaths)
 
-        links = LinkExtractor.extract_urls(self, response, unique)
+        links = LinkExtractor.extract_links(self, response, unique)
         links = [link for link in links if _is_valid_url(link.url)]
 
         if self.allow_res:
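RegexLinkExtractor's override delegates to the base class with an explicit unbound call, LinkExtractor.extract_links(self, response, unique), so the rename has to touch both the def line and the delegation. A toy equivalent of that override-and-delegate shape, written with super() as it would look in modern Python (illustrative, not the Scrapy code):

class Base:
    def extract_links(self, response, unique=False):
        return ["http://example.org/a", "not-a-url"]

class Filtering(Base):
    def extract_links(self, response, unique=True):
        links = super().extract_links(response, unique)
        # post-filter the base class's results, as RegexLinkExtractor does
        return [l for l in links if l.startswith("http")]

print(Filtering().extract_links(None))  # ['http://example.org/a']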
@@ -17,7 +17,7 @@ class LinkExtractorTestCase(unittest.TestCase):
         response = Response("example.org", "http://example.org/somepage/index.html", body=html)
 
         lx = LinkExtractor()  # default: tag=a, attr=href
-        self.assertEqual(lx.extract_urls(response),
+        self.assertEqual(lx.extract_links(response),
                          [Link(url='http://example.org/somepage/item/12.html', text='Item 12'),
                           Link(url='http://example.org/about.html', text='About us'),
                           Link(url='http://example.org/othercat.html', text='Other category'),
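For these assertEqual checks on lists of Link objects to pass, Link must compare by value. A minimal value-comparable stand-in (Scrapy's real Link class may be implemented differently):

class Link:
    def __init__(self, url, text=""):
        self.url, self.text = url, text
    def __eq__(self, other):
        # value equality, so list comparisons in tests work
        return (self.url, self.text) == (other.url, other.text)
    def __repr__(self):
        return "Link(url=%r, text=%r)" % (self.url, self.text)

assert Link("http://example.org/about.html", "About us") == \
       Link("http://example.org/about.html", "About us")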
@@ -30,7 +30,7 @@ class LinkExtractorTestCase(unittest.TestCase):
         response = Response("example.org", "http://example.org/somepage/index.html", body=html)
 
         lx = LinkExtractor()  # default: tag=a, attr=href
-        self.assertEqual(lx.extract_urls(response),
+        self.assertEqual(lx.extract_links(response),
                          [Link(url='http://otherdomain.com/base/item/12.html', text='Item 12')])
 
     def test_matches(self):
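The expected URL pointing at otherdomain.com while the response lives on example.org suggests this fixture exercises <base href> resolution: relative links resolve against the document's base URL, not the page URL. Assuming that reading of the (unshown) HTML, the resolution itself is plain urljoin behavior:

from urllib.parse import urljoin

# with <base href="http://otherdomain.com/base/"> in the page,
# a relative link resolves against the base, not the response URL
print(urljoin("http://otherdomain.com/base/", "item/12.html"))
# http://otherdomain.com/base/item/12.html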
@@ -22,7 +22,7 @@ class TestSpider(BaseSpider):
     def parse(self, response):
         xlink = LinkExtractor()
         itemre = re.compile(self.itemurl_re)
-        for link in xlink.extract_urls(response):
+        for link in xlink.extract_links(response):
             if itemre.search(link.url):
                 yield Request(url=link.url, callback=self.parse_item)
 
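The spider's parse() shows the typical downstream pattern: extract every link, keep those matching an item-URL regex, and yield follow-up work for each. A stub version of that pipeline without Scrapy's Request/Response types (illustrative only):

import re

def parse(urls, itemurl_re=r"item/\d+\.html$"):
    itemre = re.compile(itemurl_re)
    for url in urls:
        if itemre.search(url):
            # stands in for: yield Request(url=url, callback=parse_item)
            yield ("request", url)

print(list(parse(["http://example.org/item/12.html",
                  "http://example.org/about.html"])))
# [('request', 'http://example.org/item/12.html')]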