
Merge pull request #983 from ArturGaspar/linkextractor_css

[MRG+1] CSS support in link extractors
Pablo Hoffman 2015-03-17 01:07:47 -03:00
commit f924567591
5 changed files with 40 additions and 12 deletions

View File

@@ -51,7 +51,7 @@ LxmlLinkExtractor
    :synopsis: lxml's HTMLParser-based link extractors

-.. class:: LxmlLinkExtractor(allow=(), deny=(), allow_domains=(), deny_domains=(), deny_extensions=None, restrict_xpaths=(), tags=('a', 'area'), attrs=('href',), canonicalize=True, unique=True, process_value=None)
+.. class:: LxmlLinkExtractor(allow=(), deny=(), allow_domains=(), deny_domains=(), deny_extensions=None, restrict_xpaths=(), restrict_css=(), tags=('a', 'area'), attrs=('href',), canonicalize=True, unique=True, process_value=None)

     LxmlLinkExtractor is the recommended link extractor with handy filtering
     options. It is implemented using lxml's robust HTMLParser.

@@ -88,6 +88,11 @@ LxmlLinkExtractor
         links. See examples below.
     :type restrict_xpaths: str or list

+    :param restrict_css: a CSS selector (or list of selectors) which defines
+        regions inside the response where links should be extracted from.
+        Has the same behaviour as ``restrict_xpaths``.
+    :type restrict_css: str or list
+
     :param tags: a tag or a list of tags to consider when extracting links.
         Defaults to ``('a', 'area')``.
     :type tags: str or list
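
A minimal usage sketch of the new parameter (the HTML snippet and URLs below
are invented for illustration; ``extract_links`` is the extractor's existing
API and is unchanged by this PR):

    from scrapy.http import HtmlResponse
    from scrapy.contrib.linkextractors import LxmlLinkExtractor

    body = """<html><body>
    <div id="nav"><a href="/about.html">About</a></div>
    <div id="content"><a href="/item1.html">Item 1</a></div>
    </body></html>"""
    response = HtmlResponse('http://example.com/', body=body, encoding='utf-8')

    # Only links found inside the #content region are returned;
    # the #nav link is filtered out.
    lx = LxmlLinkExtractor(restrict_css=('#content',))
    links = lx.extract_links(response)
    # e.g. [Link(url='http://example.com/item1.html', text=u'Item 1', ...)]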

View File

@@ -81,17 +81,18 @@ class LxmlParserLinkExtractor(object):

 class LxmlLinkExtractor(FilteringLinkExtractor):

     def __init__(self, allow=(), deny=(), allow_domains=(), deny_domains=(), restrict_xpaths=(),
-                 tags=('a', 'area'), attrs=('href',), canonicalize=True, unique=True, process_value=None,
-                 deny_extensions=None):
+                 tags=('a', 'area'), attrs=('href',), canonicalize=True,
+                 unique=True, process_value=None, deny_extensions=None, restrict_css=()):
         tags, attrs = set(arg_to_iter(tags)), set(arg_to_iter(attrs))
         tag_func = lambda x: x in tags
         attr_func = lambda x: x in attrs
         lx = LxmlParserLinkExtractor(tag=tag_func, attr=attr_func,
             unique=unique, process=process_value)

-        super(LxmlLinkExtractor, self).__init__(lx, allow, deny,
-            allow_domains, deny_domains, restrict_xpaths, canonicalize,
-            deny_extensions)
+        super(LxmlLinkExtractor, self).__init__(lx, allow=allow, deny=deny,
+            allow_domains=allow_domains, deny_domains=deny_domains,
+            restrict_xpaths=restrict_xpaths, restrict_css=restrict_css,
+            canonicalize=canonicalize, deny_extensions=deny_extensions)

     def extract_links(self, response):
         html = Selector(response)
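
End to end, the new keyword plugs straight into a crawl rule. A sketch under
assumptions (the spider name, domain and selector below are made up;
CrawlSpider, Rule and LxmlLinkExtractor are the real classes at their
scrapy.contrib paths of this era):

    from scrapy.contrib.spiders import CrawlSpider, Rule
    from scrapy.contrib.linkextractors import LxmlLinkExtractor

    class ExampleSpider(CrawlSpider):
        name = 'example'
        start_urls = ['http://example.com/']

        rules = (
            # Follow only links found inside the (assumed) #content region.
            Rule(LxmlLinkExtractor(restrict_css=('#content a',)),
                 callback='parse_item', follow=True),
        )

        def parse_item(self, response):
            self.log('visited %s' % response.url)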

View File

@@ -98,8 +98,8 @@ class BaseSgmlLinkExtractor(SGMLParser):

 class SgmlLinkExtractor(FilteringLinkExtractor):

     def __init__(self, allow=(), deny=(), allow_domains=(), deny_domains=(), restrict_xpaths=(),
-                 tags=('a', 'area'), attrs=('href',), canonicalize=True, unique=True, process_value=None,
-                 deny_extensions=None):
+                 tags=('a', 'area'), attrs=('href',), canonicalize=True, unique=True,
+                 process_value=None, deny_extensions=None, restrict_css=()):

         warnings.warn(
             "SgmlLinkExtractor is deprecated and will be removed in future releases. "

@@ -115,9 +115,10 @@ class SgmlLinkExtractor(FilteringLinkExtractor):
         lx = BaseSgmlLinkExtractor(tag=tag_func, attr=attr_func,
             unique=unique, process_value=process_value)

-        super(SgmlLinkExtractor, self).__init__(lx, allow, deny,
-            allow_domains, deny_domains, restrict_xpaths, canonicalize,
-            deny_extensions)
+        super(SgmlLinkExtractor, self).__init__(lx, allow=allow, deny=deny,
+            allow_domains=allow_domains, deny_domains=deny_domains,
+            restrict_xpaths=restrict_xpaths, restrict_css=restrict_css,
+            canonicalize=canonicalize, deny_extensions=deny_extensions)

         # FIXME: was added to fix a RegexLinkExtractor testcase
         self.base_url = None

View File

@@ -5,6 +5,7 @@ scrapy.contrib.linkextractor).
 import re

 from six.moves.urllib.parse import urlparse

+from scrapy.selector.csstranslator import ScrapyHTMLTranslator
 from scrapy.utils.url import url_is_from_any_domain
 from scrapy.utils.url import canonicalize_url, url_is_from_any_domain, url_has_any_extension
 from scrapy.utils.misc import arg_to_iter

@@ -38,8 +39,10 @@ _is_valid_url = lambda url: url.split('://', 1)[0] in set(['http', 'https', 'file'])

 class FilteringLinkExtractor(object):

+    _csstranslator = ScrapyHTMLTranslator()
+
     def __init__(self, link_extractor, allow, deny, allow_domains, deny_domains,
-                 restrict_xpaths, canonicalize, deny_extensions):
+                 restrict_xpaths, canonicalize, deny_extensions, restrict_css):

         self.link_extractor = link_extractor

@@ -50,6 +53,9 @@ class FilteringLinkExtractor(object):
         self.deny_domains = set(arg_to_iter(deny_domains))

         self.restrict_xpaths = tuple(arg_to_iter(restrict_xpaths))
+        self.restrict_xpaths += tuple(map(self._csstranslator.css_to_xpath,
+                                          arg_to_iter(restrict_css)))
+
         self.canonicalize = canonicalize

         if deny_extensions is None:
             deny_extensions = IGNORED_EXTENSIONS
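
The mechanism behind the new option: restrict_css entries are translated to
XPath once, at construction time, via cssselect's CSS-to-XPath translator
(exposed as ScrapyHTMLTranslator) and appended to restrict_xpaths, so the
downstream filtering code keeps dealing with XPath only. A rough sketch of
the translation (the exact output may vary with the installed cssselect
version):

    from scrapy.selector.csstranslator import ScrapyHTMLTranslator

    translator = ScrapyHTMLTranslator()
    print(translator.css_to_xpath('#subwrapper a'))
    # descendant-or-self::*[@id = 'subwrapper']/descendant-or-self::*/a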

View File

@@ -284,6 +284,21 @@ class SgmlLinkExtractorTestCase(unittest.TestCase):
             [Link(url='http://example.org/foo', text=u'>\u4eac<\u4e1c',
                   fragment='', nofollow=False)])

+    def test_restrict_css(self):
+        lx = self.extractor_cls(restrict_css=('#subwrapper a',))
+        self.assertEqual(lx.extract_links(self.response), [
+            Link(url='http://example.com/sample2.html', text=u'sample 2')
+        ])
+
+    def test_restrict_css_and_restrict_xpaths_together(self):
+        lx = self.extractor_cls(restrict_xpaths=('//div[@id="subwrapper"]', ),
+                                restrict_css=('#subwrapper + a', ))
+        self.assertEqual([link for link in lx.extract_links(self.response)], [
+            Link(url='http://example.com/sample1.html', text=u''),
+            Link(url='http://example.com/sample2.html', text=u'sample 2'),
+            Link(url='http://example.com/sample3.html', text=u'sample 3 text'),
+        ])
+
     def test_area_tag_with_unicode_present(self):
         body = """<html><body>\xbe\xa9<map><area href="http://example.org/foo" /></map></body></html>"""
         response = HtmlResponse("http://example.org", body=body, encoding='utf-8')