1
0
mirror of https://github.com/scrapy/scrapy.git synced 2025-02-23 21:44:19 +00:00

Merge pull request #3152 from nctl144/ftp_linkextractors

[MRG+1] add ftp to the scheme list
This commit is contained in:
Daniel Graña 2018-07-11 11:50:02 -03:00 committed by GitHub
commit 9c6d265280
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
2 changed files with 13 additions and 2 deletions

View File

@ -41,7 +41,8 @@ IGNORED_EXTENSIONS = [
# Concrete class of compiled regular-expression objects.
_re_type = type(re.compile(""))


def _matches(url, regexs):
    """Return True if at least one pattern in *regexs* is found in *url*."""
    for pattern in regexs:
        if pattern.search(url):
            return True
    return False
_is_valid_url = lambda url: url.split('://', 1)[0] in {'http', 'https', 'file'}
def _is_valid_url(url):
    """Return True when the scheme of *url* is one of the accepted ones.

    The scheme is whatever precedes the first ``://``; a string containing no
    ``://`` is compared as a whole. The comparison is case-sensitive.

    :param url: URL string to check.
    :returns: True if the scheme is ``http``, ``https``, ``file`` or ``ftp``.
    """
    scheme = url.split('://', 1)[0]
    return scheme in {'http', 'https', 'file', 'ftp'}
class FilteringLinkExtractor(object):

View File

@ -451,6 +451,17 @@ class Base:
Link(url='http://example.org/item3.html', text=u'Item 3', nofollow=False),
])
def test_ftp_links(self):
    """Links using the ftp:// scheme must be returned by the extractor."""
    markup = b"""
<html><body>
<div><a href="ftp://www.external.com/">An Item</a></div>
</body></html>"""
    page = HtmlResponse("http://www.example.com/index.html", body=markup, encoding='utf8')
    extracted = self.extractor_cls().extract_links(page)
    expected = [
        Link(url='ftp://www.external.com/', text=u'An Item', fragment='', nofollow=False),
    ]
    self.assertEqual(extracted, expected)
class LxmlLinkExtractorTestCase(Base.LinkExtractorTestCase):
extractor_cls = LxmlLinkExtractor
@ -471,4 +482,3 @@ class LxmlLinkExtractorTestCase(Base.LinkExtractorTestCase):
# Re-runs the inherited test of the same name for this extractor class; the
# xfail marker tells pytest that a failure here is expected.
# NOTE(review): the reason for the expected failure is not visible in this diff — confirm.
@pytest.mark.xfail
def test_restrict_xpaths_with_html_entities(self):
# Delegate unchanged to the parent-class implementation.
super(LxmlLinkExtractorTestCase, self).test_restrict_xpaths_with_html_entities()