Merge pull request #3152 from nctl144/ftp_linkextractors

[MRG+1] add ftp to the scheme list
2025-02-23 21:44:19 +00:00 · 2018-07-11 11:50:02 -03:00 · 2018-07-11 11:50:02 -03:00 · 9c6d265280
commit 9c6d265280
parent 8d5320dcd2 4c05441450
2 changed files with 13 additions and 2 deletions
--- a/scrapy/linkextractors/__init__.py
+++ b/scrapy/linkextractors/__init__.py
@@ -41,7 +41,8 @@ IGNORED_EXTENSIONS = [

 _re_type = type(re.compile("", 0))
 _matches = lambda url, regexs: any(r.search(url) for r in regexs)
-_is_valid_url = lambda url: url.split('://', 1)[0] in {'http', 'https', 'file'}
+_is_valid_url = lambda url: url.split('://', 1)[0] in {'http', 'https', \
+                                                       'file', 'ftp'}


 class FilteringLinkExtractor(object):
--- a/tests/test_linkextractors.py
+++ b/tests/test_linkextractors.py
@@ -451,6 +451,17 @@ class Base:
                Link(url='http://example.org/item3.html', text=u'Item 3', nofollow=False),
            ])

+        def test_ftp_links(self):
+            body = b"""
+            <html><body>
+            <div><a href="ftp://www.external.com/">An Item</a></div>
+            </body></html>"""
+            response = HtmlResponse("http://www.example.com/index.html", body=body, encoding='utf8')
+            lx = self.extractor_cls()
+            self.assertEqual(lx.extract_links(response), [
+                Link(url='ftp://www.external.com/', text=u'An Item', fragment='', nofollow=False),
+            ])
+

 class LxmlLinkExtractorTestCase(Base.LinkExtractorTestCase):
    extractor_cls = LxmlLinkExtractor
@@ -471,4 +482,3 @@ class LxmlLinkExtractorTestCase(Base.LinkExtractorTestCase):
    @pytest.mark.xfail
    def test_restrict_xpaths_with_html_entities(self):
        super(LxmlLinkExtractorTestCase, self).test_restrict_xpaths_with_html_entities()
-