
Merge pull request #983 from ArturGaspar/linkextractor_css

[MRG+1] CSS support in link extractors
Pablo Hoffman 2015-03-17 01:07:47 -03:00
commit f924567591
5 changed files with 40 additions and 12 deletions

View File

@@ -51,7 +51,7 @@ LxmlLinkExtractor
    :synopsis: lxml's HTMLParser-based link extractors

-.. class:: LxmlLinkExtractor(allow=(), deny=(), allow_domains=(), deny_domains=(), deny_extensions=None, restrict_xpaths=(), tags=('a', 'area'), attrs=('href',), canonicalize=True, unique=True, process_value=None)
+.. class:: LxmlLinkExtractor(allow=(), deny=(), allow_domains=(), deny_domains=(), deny_extensions=None, restrict_xpaths=(), restrict_css=(), tags=('a', 'area'), attrs=('href',), canonicalize=True, unique=True, process_value=None)

     LxmlLinkExtractor is the recommended link extractor with handy filtering
     options. It is implemented using lxml's robust HTMLParser.

@@ -88,6 +88,11 @@ LxmlLinkExtractor
         links. See examples below.
     :type restrict_xpaths: str or list

+    :param restrict_css: a CSS selector (or list of selectors) which defines
+        regions inside the response where links should be extracted from.
+        Has the same behaviour as ``restrict_xpaths``.
+    :type restrict_css: str or list
+
     :param tags: a tag or a list of tags to consider when extracting links.
         Defaults to ``('a', 'area')``.
     :type tags: str or list
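
A minimal usage sketch of the new parameter (the HTML snippet and URLs below
are invented for illustration; ``extract_links`` is the extractor's existing
API and is unchanged by this PR):

    from scrapy.http import HtmlResponse
    from scrapy.contrib.linkextractors import LxmlLinkExtractor

    body = """<html><body>
    <div id="nav"><a href="/about.html">About</a></div>
    <div id="content"><a href="/item1.html">Item 1</a></div>
    </body></html>"""
    response = HtmlResponse('http://example.com/', body=body, encoding='utf-8')

    # Only links found inside the #content region are returned;
    # the #nav link is filtered out.
    lx = LxmlLinkExtractor(restrict_css=('#content',))
    links = lx.extract_links(response)
    # e.g. [Link(url='http://example.com/item1.html', text=u'Item 1', ...)]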

View File

@@ -81,17 +81,18 @@ class LxmlParserLinkExtractor(object):

 class LxmlLinkExtractor(FilteringLinkExtractor):

     def __init__(self, allow=(), deny=(), allow_domains=(), deny_domains=(), restrict_xpaths=(),
-                 tags=('a', 'area'), attrs=('href',), canonicalize=True, unique=True, process_value=None,
-                 deny_extensions=None):
+                 tags=('a', 'area'), attrs=('href',), canonicalize=True,
+                 unique=True, process_value=None, deny_extensions=None, restrict_css=()):
         tags, attrs = set(arg_to_iter(tags)), set(arg_to_iter(attrs))
         tag_func = lambda x: x in tags
         attr_func = lambda x: x in attrs
         lx = LxmlParserLinkExtractor(tag=tag_func, attr=attr_func,
             unique=unique, process=process_value)

-        super(LxmlLinkExtractor, self).__init__(lx, allow, deny,
-            allow_domains, deny_domains, restrict_xpaths, canonicalize,
-            deny_extensions)
+        super(LxmlLinkExtractor, self).__init__(lx, allow=allow, deny=deny,
+            allow_domains=allow_domains, deny_domains=deny_domains,
+            restrict_xpaths=restrict_xpaths, restrict_css=restrict_css,
+            canonicalize=canonicalize, deny_extensions=deny_extensions)

     def extract_links(self, response):
         html = Selector(response)
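
End to end, the new keyword plugs straight into a crawl rule. A sketch under
assumptions (the spider name, domain and selector below are made up;
CrawlSpider, Rule and LxmlLinkExtractor are the real classes at their
scrapy.contrib paths of this era):

    from scrapy.contrib.spiders import CrawlSpider, Rule
    from scrapy.contrib.linkextractors import LxmlLinkExtractor

    class ExampleSpider(CrawlSpider):
        name = 'example'
        start_urls = ['http://example.com/']

        rules = (
            # Follow only links found inside the (assumed) #content region.
            Rule(LxmlLinkExtractor(restrict_css=('#content a',)),
                 callback='parse_item', follow=True),
        )

        def parse_item(self, response):
            self.log('visited %s' % response.url)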

View File

@@ -98,8 +98,8 @@ class BaseSgmlLinkExtractor(SGMLParser):

 class SgmlLinkExtractor(FilteringLinkExtractor):

     def __init__(self, allow=(), deny=(), allow_domains=(), deny_domains=(), restrict_xpaths=(),
-                 tags=('a', 'area'), attrs=('href',), canonicalize=True, unique=True, process_value=None,
-                 deny_extensions=None):
+                 tags=('a', 'area'), attrs=('href',), canonicalize=True, unique=True,
+                 process_value=None, deny_extensions=None, restrict_css=()):

         warnings.warn(
             "SgmlLinkExtractor is deprecated and will be removed in future releases. "

@@ -115,9 +115,10 @@ class SgmlLinkExtractor(FilteringLinkExtractor):
         lx = BaseSgmlLinkExtractor(tag=tag_func, attr=attr_func,
             unique=unique, process_value=process_value)

-        super(SgmlLinkExtractor, self).__init__(lx, allow, deny,
-            allow_domains, deny_domains, restrict_xpaths, canonicalize,
-            deny_extensions)
+        super(SgmlLinkExtractor, self).__init__(lx, allow=allow, deny=deny,
+            allow_domains=allow_domains, deny_domains=deny_domains,
+            restrict_xpaths=restrict_xpaths, restrict_css=restrict_css,
+            canonicalize=canonicalize, deny_extensions=deny_extensions)

         # FIXME: was added to fix a RegexLinkExtractor testcase
         self.base_url = None

View File

@@ -5,6 +5,7 @@ scrapy.contrib.linkextractor).
 import re

 from six.moves.urllib.parse import urlparse

+from scrapy.selector.csstranslator import ScrapyHTMLTranslator
 from scrapy.utils.url import url_is_from_any_domain
 from scrapy.utils.url import canonicalize_url, url_is_from_any_domain, url_has_any_extension
 from scrapy.utils.misc import arg_to_iter

@@ -38,8 +39,10 @@ _is_valid_url = lambda url: url.split('://', 1)[0] in set(['http', 'https', 'file'])

 class FilteringLinkExtractor(object):

+    _csstranslator = ScrapyHTMLTranslator()
+
     def __init__(self, link_extractor, allow, deny, allow_domains, deny_domains,
-                 restrict_xpaths, canonicalize, deny_extensions):
+                 restrict_xpaths, canonicalize, deny_extensions, restrict_css):

         self.link_extractor = link_extractor

@@ -50,6 +53,9 @@ class FilteringLinkExtractor(object):
         self.deny_domains = set(arg_to_iter(deny_domains))

         self.restrict_xpaths = tuple(arg_to_iter(restrict_xpaths))
+        self.restrict_xpaths += tuple(map(self._csstranslator.css_to_xpath,
+                                          arg_to_iter(restrict_css)))
+
         self.canonicalize = canonicalize

         if deny_extensions is None:
             deny_extensions = IGNORED_EXTENSIONS
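
The mechanism behind the new option: restrict_css entries are translated to
XPath once, at construction time, via cssselect's CSS-to-XPath translator
(exposed as ScrapyHTMLTranslator) and appended to restrict_xpaths, so the
downstream filtering code keeps dealing with XPath only. A rough sketch of
the translation (the exact output may vary with the installed cssselect
version):

    from scrapy.selector.csstranslator import ScrapyHTMLTranslator

    translator = ScrapyHTMLTranslator()
    print(translator.css_to_xpath('#subwrapper a'))
    # descendant-or-self::*[@id = 'subwrapper']/descendant-or-self::*/a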

View File

@@ -284,6 +284,21 @@ class SgmlLinkExtractorTestCase(unittest.TestCase):
             [Link(url='http://example.org/foo', text=u'>\u4eac<\u4e1c',
                   fragment='', nofollow=False)])

+    def test_restrict_css(self):
+        lx = self.extractor_cls(restrict_css=('#subwrapper a',))
+        self.assertEqual(lx.extract_links(self.response), [
+            Link(url='http://example.com/sample2.html', text=u'sample 2')
+        ])
+
+    def test_restrict_css_and_restrict_xpaths_together(self):
+        lx = self.extractor_cls(restrict_xpaths=('//div[@id="subwrapper"]', ),
+                                restrict_css=('#subwrapper + a', ))
+        self.assertEqual([link for link in lx.extract_links(self.response)], [
+            Link(url='http://example.com/sample1.html', text=u''),
+            Link(url='http://example.com/sample2.html', text=u'sample 2'),
+            Link(url='http://example.com/sample3.html', text=u'sample 3 text'),
+        ])
+
     def test_area_tag_with_unicode_present(self):
         body = """<html><body>\xbe\xa9<map><area href="http://example.org/foo" /></map></body></html>"""
         response = HtmlResponse("http://example.org", body=body, encoding='utf-8')