mirror of
https://github.com/scrapy/scrapy.git
synced 2025-02-23 21:44:19 +00:00
Merge pull request #3152 from nctl144/ftp_linkextractors
[MRG+1] add ftp to the scheme list
This commit is contained in:
commit
9c6d265280
@ -41,7 +41,8 @@ IGNORED_EXTENSIONS = [
|
||||
|
||||
_re_type = type(re.compile("", 0))
|
||||
_matches = lambda url, regexs: any(r.search(url) for r in regexs)
|
||||
_is_valid_url = lambda url: url.split('://', 1)[0] in {'http', 'https', 'file'}
|
||||
_is_valid_url = lambda url: url.split('://', 1)[0] in {'http', 'https', \
|
||||
'file', 'ftp'}
|
||||
|
||||
|
||||
class FilteringLinkExtractor(object):
|
||||
|
@ -451,6 +451,17 @@ class Base:
|
||||
Link(url='http://example.org/item3.html', text=u'Item 3', nofollow=False),
|
||||
])
|
||||
|
||||
def test_ftp_links(self):
|
||||
body = b"""
|
||||
<html><body>
|
||||
<div><a href="ftp://www.external.com/">An Item</a></div>
|
||||
</body></html>"""
|
||||
response = HtmlResponse("http://www.example.com/index.html", body=body, encoding='utf8')
|
||||
lx = self.extractor_cls()
|
||||
self.assertEqual(lx.extract_links(response), [
|
||||
Link(url='ftp://www.external.com/', text=u'An Item', fragment='', nofollow=False),
|
||||
])
|
||||
|
||||
|
||||
class LxmlLinkExtractorTestCase(Base.LinkExtractorTestCase):
|
||||
extractor_cls = LxmlLinkExtractor
|
||||
@ -471,4 +482,3 @@ class LxmlLinkExtractorTestCase(Base.LinkExtractorTestCase):
|
||||
@pytest.mark.xfail
|
||||
def test_restrict_xpaths_with_html_entities(self):
|
||||
super(LxmlLinkExtractorTestCase, self).test_restrict_xpaths_with_html_entities()
|
||||
|
||||
|
Loading…
x
Reference in New Issue
Block a user