Merge pull request #983 from ArturGaspar/linkextractor_css
[MRG+1] CSS support in link extractors
Commit f924567591
@@ -51,7 +51,7 @@ LxmlLinkExtractor
    :synopsis: lxml's HTMLParser-based link extractors
 
-.. class:: LxmlLinkExtractor(allow=(), deny=(), allow_domains=(), deny_domains=(), deny_extensions=None, restrict_xpaths=(), tags=('a', 'area'), attrs=('href',), canonicalize=True, unique=True, process_value=None)
+.. class:: LxmlLinkExtractor(allow=(), deny=(), allow_domains=(), deny_domains=(), deny_extensions=None, restrict_xpaths=(), restrict_css=(), tags=('a', 'area'), attrs=('href',), canonicalize=True, unique=True, process_value=None)
 
 LxmlLinkExtractor is the recommended link extractor with handy filtering
 options. It is implemented using lxml's robust HTMLParser.
@@ -88,6 +88,11 @@ LxmlLinkExtractor
       links. See examples below.
    :type restrict_xpaths: str or list
 
+   :param restrict_css: a CSS selector (or list of selectors) which defines
+      regions inside the response where links should be extracted from.
+      Has the same behaviour as ``restrict_xpaths``.
+   :type restrict_css: str or list
+
    :param tags: a tag or a list of tags to consider when extracting links.
       Defaults to ``('a', 'area')``.
    :type tags: str or list
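For context, a minimal sketch of what the documented argument does from user code. The import path below follows the scrapy.contrib layout used on this branch, and the page body is invented for illustration:

    from scrapy.contrib.linkextractors.lxmlhtml import LxmlLinkExtractor
    from scrapy.http import HtmlResponse

    body = """<html><body>
    <div id="nav"><a href="/about.html">about</a></div>
    <div id="content"><a href="/post-1.html">post 1</a></div>
    </body></html>"""
    response = HtmlResponse("http://example.com/", body=body, encoding='utf-8')

    # Only links inside the #content region are considered; #nav is skipped.
    lx = LxmlLinkExtractor(restrict_css=('#content',))
    links = lx.extract_links(response)
    # -> [Link(url='http://example.com/post-1.html', text=u'post 1', ...)]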
@@ -81,17 +81,18 @@ class LxmlParserLinkExtractor(object):
 class LxmlLinkExtractor(FilteringLinkExtractor):
 
     def __init__(self, allow=(), deny=(), allow_domains=(), deny_domains=(), restrict_xpaths=(),
-                 tags=('a', 'area'), attrs=('href',), canonicalize=True, unique=True, process_value=None,
-                 deny_extensions=None):
+                 tags=('a', 'area'), attrs=('href',), canonicalize=True,
+                 unique=True, process_value=None, deny_extensions=None, restrict_css=()):
         tags, attrs = set(arg_to_iter(tags)), set(arg_to_iter(attrs))
         tag_func = lambda x: x in tags
         attr_func = lambda x: x in attrs
         lx = LxmlParserLinkExtractor(tag=tag_func, attr=attr_func,
             unique=unique, process=process_value)
 
-        super(LxmlLinkExtractor, self).__init__(lx, allow, deny,
-            allow_domains, deny_domains, restrict_xpaths, canonicalize,
-            deny_extensions)
+        super(LxmlLinkExtractor, self).__init__(lx, allow=allow, deny=deny,
+            allow_domains=allow_domains, deny_domains=deny_domains,
+            restrict_xpaths=restrict_xpaths, restrict_css=restrict_css,
+            canonicalize=canonicalize, deny_extensions=deny_extensions)
 
     def extract_links(self, response):
         html = Selector(response)
@@ -98,8 +98,8 @@ class BaseSgmlLinkExtractor(SGMLParser):
 class SgmlLinkExtractor(FilteringLinkExtractor):
 
     def __init__(self, allow=(), deny=(), allow_domains=(), deny_domains=(), restrict_xpaths=(),
-                 tags=('a', 'area'), attrs=('href',), canonicalize=True, unique=True, process_value=None,
-                 deny_extensions=None):
+                 tags=('a', 'area'), attrs=('href',), canonicalize=True, unique=True,
+                 process_value=None, deny_extensions=None, restrict_css=()):
 
         warnings.warn(
             "SgmlLinkExtractor is deprecated and will be removed in future releases. "
@@ -115,9 +115,10 @@ class SgmlLinkExtractor(FilteringLinkExtractor):
         lx = BaseSgmlLinkExtractor(tag=tag_func, attr=attr_func,
             unique=unique, process_value=process_value)
 
-        super(SgmlLinkExtractor, self).__init__(lx, allow, deny,
-            allow_domains, deny_domains, restrict_xpaths, canonicalize,
-            deny_extensions)
+        super(SgmlLinkExtractor, self).__init__(lx, allow=allow, deny=deny,
+            allow_domains=allow_domains, deny_domains=deny_domains,
+            restrict_xpaths=restrict_xpaths, restrict_css=restrict_css,
+            canonicalize=canonicalize, deny_extensions=deny_extensions)
 
         # FIXME: was added to fix a RegexLinkExtractor testcase
         self.base_url = None
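Both extractors now forward the keyword to FilteringLinkExtractor, so restrict_css can be used anywhere a link extractor is accepted, e.g. in a CrawlSpider rule. A sketch under the same assumptions (spider name, domain, and selector are made up for illustration):

    from scrapy.contrib.spiders import CrawlSpider, Rule
    from scrapy.contrib.linkextractors.lxmlhtml import LxmlLinkExtractor

    class ExampleSpider(CrawlSpider):
        name = 'example'                      # hypothetical spider
        allowed_domains = ['example.com']
        start_urls = ['http://example.com/']

        rules = (
            # Follow only pagination links; same effect as an equivalent
            # restrict_xpaths expression, written as a CSS selector instead.
            Rule(LxmlLinkExtractor(restrict_css=('ul.pagination',)), follow=True),
        )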
@@ -5,6 +5,7 @@ scrapy.contrib.linkextractor).
 """
 import re
 from six.moves.urllib.parse import urlparse
 
-from scrapy.utils.url import url_is_from_any_domain
+from scrapy.selector.csstranslator import ScrapyHTMLTranslator
+from scrapy.utils.url import canonicalize_url, url_is_from_any_domain, url_has_any_extension
 from scrapy.utils.misc import arg_to_iter
@@ -38,8 +39,10 @@ _is_valid_url = lambda url: url.split('://', 1)[0] in set(['http', 'https', 'file'])
 
 class FilteringLinkExtractor(object):
 
+    _csstranslator = ScrapyHTMLTranslator()
+
     def __init__(self, link_extractor, allow, deny, allow_domains, deny_domains,
-                 restrict_xpaths, canonicalize, deny_extensions):
+                 restrict_xpaths, canonicalize, deny_extensions, restrict_css):
 
         self.link_extractor = link_extractor
 
@@ -50,6 +53,9 @@ class FilteringLinkExtractor(object):
         self.deny_domains = set(arg_to_iter(deny_domains))
 
         self.restrict_xpaths = tuple(arg_to_iter(restrict_xpaths))
+        self.restrict_xpaths += tuple(map(self._csstranslator.css_to_xpath,
+                                          arg_to_iter(restrict_css)))
+
         self.canonicalize = canonicalize
         if deny_extensions is None:
             deny_extensions = IGNORED_EXTENSIONS
@@ -284,6 +284,21 @@ class SgmlLinkExtractorTestCase(unittest.TestCase):
             [Link(url='http://example.org/foo', text=u'>\u4eac<\u4e1c',
                   fragment='', nofollow=False)])
 
+    def test_restrict_css(self):
+        lx = self.extractor_cls(restrict_css=('#subwrapper a',))
+        self.assertEqual(lx.extract_links(self.response), [
+            Link(url='http://example.com/sample2.html', text=u'sample 2')
+        ])
+
+    def test_restrict_css_and_restrict_xpaths_together(self):
+        lx = self.extractor_cls(restrict_xpaths=('//div[@id="subwrapper"]', ),
+                                restrict_css=('#subwrapper + a', ))
+        self.assertEqual([link for link in lx.extract_links(self.response)], [
+            Link(url='http://example.com/sample1.html', text=u''),
+            Link(url='http://example.com/sample2.html', text=u'sample 2'),
+            Link(url='http://example.com/sample3.html', text=u'sample 3 text'),
+        ])
+
     def test_area_tag_with_unicode_present(self):
         body = """<html><body>\xbe\xa9<map><area href="http://example.org/foo" /></map></body></html>"""
         response = HtmlResponse("http://example.org", body=body, encoding='utf-8')