Sorted out link extractor organization by moving all link extractors to
scrapy.contrib.linkextractors.

The most relevant change: scrapy.link.extractors.RegexLinkExtractor was moved
to scrapy.contrib.linkextractors.sgml.SgmlLinkExtractor. The old location
still works but throws a deprecation warning, and will be removed before the
0.7 release. Documentation and tests were also updated.

This changeset also adds a new regex-based link extractor in
scrapy.contrib.linkextractors.regex.

--HG--
rename : scrapy/tests/sample_data/link_extractor/regex_linkextractor.html => scrapy/tests/sample_data/link_extractor/sgml_linkextractor.html
rename : scrapy/tests/test_link.py => scrapy/tests/test_contrib_linkextractors.py
commit 86498abdf1
parent 7b34e08392
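For illustration, a minimal sketch of what the backwards-compatibility shim in
this changeset means for user code (the allow pattern is made up; per the diff
below, the warning is raised from the shim's __init__, not at import time)::

    import warnings

    from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor  # new location

    with warnings.catch_warnings(record=True) as caught:
        warnings.simplefilter("always")
        # Old location keeps working until the 0.7 release...
        from scrapy.link.extractors import RegexLinkExtractor
        lx = RegexLinkExtractor(allow=(r'item\.php',))  # ...but warns on instantiation

    assert isinstance(lx, SgmlLinkExtractor)  # the shim subclasses the new extractor
    assert any(issubclass(w.category, DeprecationWarning) for w in caught)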
@ -125,7 +125,7 @@ Finally, here's the spider code::

     domain_name = 'mininova.org'
     start_urls = ['http://www.mininova.org/today']
-    rules = [Rule(RegexLinkExtractor(allow=['/tor/\d+']), 'parse_torrent')]
+    rules = [Rule(SgmlLinkExtractor(allow=['/tor/\d+']), 'parse_torrent')]

     def parse_torrent(self, response):
         x = HtmlXPathSelector(response)
@ -93,7 +93,7 @@ Let's now take a look at an example CrawlSpider with Rules::

     from scrapy import log
     from scrapy.contrib.spiders import CrawlSpider, Rule
-    from scrapy.link.extractors import RegexLinkExtractor
+    from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
     from scrapy.xpath.selector import HtmlXPathSelector
     from scrapy.item import ScrapedItem

@ -104,10 +104,10 @@ Let's now take a look at an example CrawlSpider with Rules::

     rules = (
         # Extract links matching 'category.php' (but not matching 'subsection.php')
         # and follow links from them (since no callback means follow=True by default).
-        Rule(RegexLinkExtractor(allow=('category\.php', ), deny=('subsection\.php', ))),
+        Rule(SgmlLinkExtractor(allow=('category\.php', ), deny=('subsection\.php', ))),

         # Extract links matching 'item.php' and parse them with the spider's method parse_item
-        Rule(RegexLinkExtractor(allow=('item\.php', )), callback='parse_item'),
+        Rule(SgmlLinkExtractor(allow=('item\.php', )), callback='parse_item'),
     )

     def parse_item(self, response):
@ -4,67 +4,25 @@

 Available Link Extractors
 =========================

-.. module:: scrapy.link
+.. module:: scrapy.contrib.linkextractors
    :synopsis: Link extractors classes

-LinkExtractor
-=============
+All available link extractor classes bundled with Scrapy are provided in the
+:mod:`scrapy.contrib.linkextractors` module.

-.. class:: LinkExtractor(tag="a", href="href", unique=False, process_value=None)
+.. module:: scrapy.contrib.linkextractors.sgml
+   :synopsis: SGMLParser-based link extractors

-   This is the most basic Link Extractor, which extracts links from a response
-   by looking at the given attributes inside the given tags.
+SgmlLinkExtractor
+=================

-   The constructor arguments are:
+.. class:: SgmlLinkExtractor(allow=(), deny=(), allow_domains=(), deny_domains=(), restrict_xpaths=(), tags=('a', 'area'), attrs=('href'), canonicalize=True, unique=True, process_value=None)

-   :param tag: either a string (with the name of a tag) or a function that
-      receives a tag name and returns ``True`` if links should be extracted
-      from that tag, or ``False`` if they shouldn't. Defaults to ``'a'``.
-   :type tag: str or callable
-
-   :param attr: either a string (with the name of a tag attribute), or a
-      function that receives an attribute name and returns ``True`` if links
-      should be extracted from it, or ``False`` if they shouldn't. Defaults
-      to ``href``.
-   :type attr: str or callable
-
-   :param unique: a boolean that specifies whether duplicate filtering should
-      be applied to the extracted links.
-   :type unique: boolean
-
-   :param process_value: a function which receives each value extracted from
-      the tag and attributes scanned, and can modify the value and return a
-      new one, or return ``None`` to ignore the link altogether. If not
-      given, ``process_value`` defaults to ``lambda x: x``.
-
-   .. highlight:: html
-
-   For example, to extract links from this code::
-
-      <a href="javascript:goToPage('../other/page.html'); return false">Link text</a>
-
-   .. highlight:: python
-
-   You can use the following function in ``process_value``::
-
-      def process_value(value):
-          m = re.search("javascript:goToPage\('(.*?)'", value)
-          if m:
-              return m.group(1)
-
-   :type process_value: callable
-
-RegexLinkExtractor
-==================
-
-.. class:: RegexLinkExtractor(allow=(), deny=(), allow_domains=(), deny_domains=(), restrict_xpaths=(), tags=('a', 'area'), attrs=('href'), canonicalize=True, unique=True, process_value=None)
-
-   The RegexLinkExtractor extends the base :class:`LinkExtractor` by providing
-   additional filters that you can specify to extract links, including regular
-   expression patterns that the links must match to be extracted. All those
-   filters are configured through these constructor parameters:
+   The SgmlLinkExtractor extends the base :class:`BaseSgmlLinkExtractor` by
+   providing additional filters that you can specify to extract links,
+   including regular expression patterns that the links must match to be
+   extracted. All those filters are configured through these constructor
+   parameters:

    :param allow: a single regular expression (or list of regular expressions)
       that the (absolute) urls must match in order to be extracted. If not

@ -112,3 +70,52 @@ RegexLinkExtractor
       :class:`LinkExtractor` class constructor
    :type process_value: boolean
+
+BaseSgmlLinkExtractor
+=====================
+
+.. class:: BaseSgmlLinkExtractor(tag="a", href="href", unique=False, process_value=None)
+
+   The purpose of this Link Extractor is only to serve as a base class for the
+   :class:`SgmlLinkExtractor`. You should use that one instead.
+
+   The constructor arguments are:
+
+   :param tag: either a string (with the name of a tag) or a function that
+      receives a tag name and returns ``True`` if links should be extracted
+      from that tag, or ``False`` if they shouldn't. Defaults to ``'a'``.
+   :type tag: str or callable
+
+   :param attr: either a string (with the name of a tag attribute), or a
+      function that receives an attribute name and returns ``True`` if links
+      should be extracted from it, or ``False`` if they shouldn't. Defaults
+      to ``href``.
+   :type attr: str or callable
+
+   :param unique: a boolean that specifies whether duplicate filtering should
+      be applied to the extracted links.
+   :type unique: boolean
+
+   :param process_value: a function which receives each value extracted from
+      the tag and attributes scanned, and can modify the value and return a
+      new one, or return ``None`` to ignore the link altogether. If not
+      given, ``process_value`` defaults to ``lambda x: x``.
+
+   .. highlight:: html
+
+   For example, to extract links from this code::
+
+      <a href="javascript:goToPage('../other/page.html'); return false">Link text</a>
+
+   .. highlight:: python
+
+   You can use the following function in ``process_value``::
+
+      def process_value(value):
+          m = re.search("javascript:goToPage\('(.*?)'", value)
+          if m:
+              return m.group(1)
+
+   :type process_value: callable
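Putting the renamed extractor's documented parameters together, a small usage
sketch (the page, URLs and patterns are made up for illustration)::

    from scrapy.http import HtmlResponse
    from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor

    html = """<html><body>
    <div id="content"><a href="item.php?id=1">Item 1</a></div>
    <a href="logout.php">Logout</a>
    </body></html>"""
    response = HtmlResponse(url='http://example.com/index', body=html)

    lx = SgmlLinkExtractor(allow=(r'item\.php',),   # absolute url must match one of these
                           deny=(r'logout',),       # ...and match none of these
                           restrict_xpaths=('//div[@id="content"]',))  # only scan this region
    for link in lx.extract_links(response):
        print link.url, link.text   # -> http://example.com/item.php?id=1 Item 1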
@ -152,7 +152,7 @@ CrawlSpider example

 Let's now take a look at an example CrawlSpider with rules::

     from scrapy.contrib.spiders import CrawlSpider, Rule
-    from scrapy.link.extractors import RegexLinkExtractor
+    from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
     from scrapy.xpath.selector import HtmlXPathSelector
     from scrapy.item import ScrapedItem

@ -163,10 +163,10 @@ Let's now take a look at an example CrawlSpider with rules::

     rules = (
         # Extract links matching 'category.php' (but not matching 'subsection.php')
         # and follow links from them (since no callback means follow=True by default).
-        Rule(RegexLinkExtractor(allow=('category\.php', ), deny=('subsection\.php', ))),
+        Rule(SgmlLinkExtractor(allow=('category\.php', ), deny=('subsection\.php', ))),

         # Extract links matching 'item.php' and parse them with the spider's method parse_item
-        Rule(RegexLinkExtractor(allow=('item\.php', )), callback='parse_item'),
+        Rule(SgmlLinkExtractor(allow=('item\.php', )), callback='parse_item'),
     )

     def parse_item(self, response):
@ -63,7 +63,7 @@ those links. For example, the following one::

 So, based on that regular expression we can create the first crawling rule::

-    Rule(RegexLinkExtractor(allow='google.com/[A-Z][a-zA-Z_/]+$', ),
+    Rule(SgmlLinkExtractor(allow='google.com/[A-Z][a-zA-Z_/]+$', ),
         'parse_category',
         follow=True,
     ),

@ -75,7 +75,7 @@ process and extract data from those pages.

 This is how the spider would look so far::

-    from scrapy.link.extractors import RegexLinkExtractor
+    from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
     from scrapy.contrib.spiders import CrawlSpider, Rule

     class GoogleDirectorySpider(CrawlSpider):

@ -83,7 +83,7 @@ This is how the spider would look so far::

     start_urls = ['http://www.google.com/dirhp']

     rules = (
-        Rule(RegexLinkExtractor(allow='google.com/[A-Z][a-zA-Z_/]+$'),
+        Rule(SgmlLinkExtractor(allow='google.com/[A-Z][a-zA-Z_/]+$'),
             'parse_category', follow=True,
         ),
     )
scrapy/contrib/linkextractors/__init__.py (new file, 7 lines)
@ -0,0 +1,7 @@
+"""
+scrapy.contrib.linkextractors
+
+This package contains a collection of Link Extractors.
+
+For more info see docs/ref/link-extractors.rst
+"""
@ -1,7 +1,8 @@
 """
-This module provides additional LinkExtractors, apart from the ones in scrapy.link
-and scrapy.link.extractors.
+This module implements the HTMLImageLinkExtractor for extracting
+image links only.
 """

 import urlparse

 from scrapy.link import Link
scrapy/contrib/linkextractors/regex.py (new file, 28 lines)
@ -0,0 +1,28 @@
+import re
+
+from scrapy.utils.url import urljoin_rfc as urljoin
+from scrapy.utils.markup import remove_tags, remove_entities, replace_escape_chars
+
+from scrapy.link import Link
+from .sgml import SgmlLinkExtractor
+
+linkre = re.compile(
+    "<a\s.*?href=(\"[.#]+?\"|\'[.#]+?\'|[^\s]+?)(>|\s.*?>)(.*?)<[/ ]?a>",
+    re.DOTALL | re.IGNORECASE)
+
+def clean_link(link_text):
+    """Remove leading and trailing whitespace and punctuation"""
+    return link_text.strip("\t\r\n '\"")
+
+class RegexLinkExtractor(SgmlLinkExtractor):
+    """High-performance link extractor"""
+
+    def _extract_links(self, response_text, response_url, response_encoding):
+        base_url = self.base_url if self.base_url else response_url
+
+        clean_url = lambda u: urljoin(base_url, remove_entities(clean_link(u.decode(response_encoding))))
+        clean_text = lambda t: replace_escape_chars(remove_tags(t.decode(response_encoding))).strip()
+
+        links_text = linkre.findall(response_text)
+        urlstext = set([(clean_url(url), clean_text(text)) for url, _, text in links_text])
+
+        return [Link(url, text) for url, text in urlstext]
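Since the new RegexLinkExtractor only overrides _extract_links, it is a
drop-in replacement for SgmlLinkExtractor that scans with a regex instead of
SGMLParser; a quick sketch (HTML and output are indicative only)::

    from scrapy.http import HtmlResponse
    from scrapy.contrib.linkextractors.regex import RegexLinkExtractor

    body = '<html><body><a href="page.html">  Some page </a></body></html>'
    response = HtmlResponse(url='http://example.com/', body=body)

    lx = RegexLinkExtractor()            # same constructor and filters as SgmlLinkExtractor
    links = lx.extract_links(response)   # links found by regex scan, then filtered as usual
    print [(l.url, l.text) for l in links]
    # e.g. [('http://example.com/page.html', u'Some page')]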
scrapy/contrib/linkextractors/sgml.py (new file, 126 lines)
@ -0,0 +1,126 @@
+"""
+SGMLParser-based Link extractors
+"""
+
+import re
+
+from scrapy.xpath import HtmlXPathSelector
+from scrapy.link import Link
+from scrapy.utils.misc import arg_to_iter
+from scrapy.utils.python import FixedSGMLParser, unique as unique_list, str_to_unicode
+from scrapy.utils.url import safe_url_string, urljoin_rfc as urljoin, canonicalize_url, url_is_from_any_domain
+
+class BaseSgmlLinkExtractor(FixedSGMLParser):
+
+    def __init__(self, tag="a", attr="href", unique=False, process_value=None):
+        FixedSGMLParser.__init__(self)
+        self.scan_tag = tag if callable(tag) else lambda t: t == tag
+        self.scan_attr = attr if callable(attr) else lambda a: a == attr
+        self.process_value = (lambda v: v) if process_value is None else process_value
+        self.current_link = None
+        self.unique = unique
+
+    def _extract_links(self, response_text, response_url, response_encoding):
+        self.reset()
+        self.feed(response_text)
+        self.close()
+
+        links = unique_list(self.links, key=lambda link: link.url) if self.unique else self.links
+
+        ret = []
+        base_url = self.base_url if self.base_url else response_url
+        for link in links:
+            link.url = urljoin(base_url, link.url)
+            link.url = safe_url_string(link.url, response_encoding)
+            link.text = str_to_unicode(link.text, response_encoding)
+            ret.append(link)
+
+        return ret
+
+    def extract_links(self, response):
+        # wrapper needed to allow to work directly with text
+        return self._extract_links(response.body, response.url, response.encoding)
+
+    def reset(self):
+        FixedSGMLParser.reset(self)
+        self.links = []
+        self.base_url = None
+
+    def unknown_starttag(self, tag, attrs):
+        if tag == 'base':
+            self.base_url = dict(attrs).get('href')
+        if self.scan_tag(tag):
+            for attr, value in attrs:
+                if self.scan_attr(attr):
+                    url = self.process_value(value)
+                    if url is not None:
+                        link = Link(url=url)
+                        self.links.append(link)
+                        self.current_link = link
+
+    def unknown_endtag(self, tag):
+        self.current_link = None
+
+    def handle_data(self, data):
+        if self.current_link and not self.current_link.text:
+            self.current_link.text = data.strip()
+
+    def matches(self, url):
+        """This extractor matches with any url, since
+        it doesn't contain any patterns"""
+        return True
+
+_re_type = type(re.compile("", 0))
+
+_matches = lambda url, regexs: any((r.search(url) for r in regexs))
+_is_valid_url = lambda url: url.split('://', 1)[0] in set(['http', 'https', 'file'])
+
+class SgmlLinkExtractor(BaseSgmlLinkExtractor):
+
+    def __init__(self, allow=(), deny=(), allow_domains=(), deny_domains=(), restrict_xpaths=(),
+                 tags=('a', 'area'), attrs=('href'), canonicalize=True, unique=True, process_value=None):
+        self.allow_res = [x if isinstance(x, _re_type) else re.compile(x) for x in arg_to_iter(allow)]
+        self.deny_res = [x if isinstance(x, _re_type) else re.compile(x) for x in arg_to_iter(deny)]
+        self.allow_domains = set(arg_to_iter(allow_domains))
+        self.deny_domains = set(arg_to_iter(deny_domains))
+        self.restrict_xpaths = tuple(arg_to_iter(restrict_xpaths))
+        self.canonicalize = canonicalize
+        tag_func = lambda x: x in tags
+        attr_func = lambda x: x in attrs
+        BaseSgmlLinkExtractor.__init__(self, tag=tag_func, attr=attr_func,
+            unique=unique, process_value=process_value)
+
+    def extract_links(self, response):
+        if self.restrict_xpaths:
+            hxs = HtmlXPathSelector(response)
+            html_slice = ''.join(''.join(html_fragm for html_fragm in hxs.x(xpath_expr).extract()) for xpath_expr in self.restrict_xpaths)
+            links = self._extract_links(html_slice, response.url, response.encoding)
+        else:
+            links = BaseSgmlLinkExtractor.extract_links(self, response)
+
+        links = [link for link in links if _is_valid_url(link.url)]
+
+        if self.allow_res:
+            links = [link for link in links if _matches(link.url, self.allow_res)]
+        if self.deny_res:
+            links = [link for link in links if not _matches(link.url, self.deny_res)]
+        if self.allow_domains:
+            links = [link for link in links if url_is_from_any_domain(link.url, self.allow_domains)]
+        if self.deny_domains:
+            links = [link for link in links if not url_is_from_any_domain(link.url, self.deny_domains)]
+
+        if self.canonicalize:
+            for link in links:
+                link.url = canonicalize_url(link.url)
+
+        return links
+
+    def matches(self, url):
+        if self.allow_domains and not url_is_from_any_domain(url, self.allow_domains):
+            return False
+        if self.deny_domains and url_is_from_any_domain(url, self.deny_domains):
+            return False
+
+        allowed = [regex.search(url) for regex in self.allow_res] if self.allow_res else [True]
+        denied = [regex.search(url) for regex in self.deny_res] if self.deny_res else []
+        return any(allowed) and not any(denied)
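A brief sketch of the tag/attr hooks that BaseSgmlLinkExtractor accepts:
strings are wrapped into equality checks in __init__, while passing callables
lets one extractor scan several tags at once (the HTML below is illustrative)::

    from scrapy.http import HtmlResponse
    from scrapy.contrib.linkextractors.sgml import BaseSgmlLinkExtractor

    body = '<html><body><area href="map.html"><a href="page.html">text</a></body></html>'
    response = HtmlResponse(url='http://example.com/', body=body)

    lx = BaseSgmlLinkExtractor(tag=lambda t: t in ('a', 'area'),
                               attr=lambda a: a == 'href')
    print [l.url for l in lx.extract_links(response)]
    # e.g. ['http://example.com/map.html', 'http://example.com/page.html']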
@ -1,5 +1,5 @@
 from scrapy.contrib.spiders import CrawlSpider, Rule
-from scrapy.link.extractors import RegexLinkExtractor
+from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor

 class GenericSpider(CrawlSpider):
     """

@ -10,11 +10,11 @@ class GenericSpider(CrawlSpider):
     def __init__(self, domain_name):
         self.domain_name = domain_name
         self.rules = (
-            Rule(RegexLinkExtractor(allow_domains=(domain_name,)), self.parse_note, follow=True),
+            Rule(SgmlLinkExtractor(allow_domains=(domain_name,)), self.parse_note, follow=True),
         )
         super(GenericSpider, self).__init__()

     def parse_note(self, response):
         pass

 # not a singleton
@ -10,7 +10,7 @@ from scrapy.utils.url import is_url
 from scrapy.utils.response import get_base_url
 from scrapy.utils.python import flatten, unicode_to_str
 from scrapy.xpath.selector import XPathSelector, XPathSelectorList
-from scrapy.contrib.link_extractors import HTMLImageLinkExtractor
+from scrapy.contrib.linkextractors.image import HTMLImageLinkExtractor

 def extract(location, adaptor_args=None):
     """
@ -1,76 +1,12 @@
 """
-LinkExtractor provides an efficient way to extract links from pages
+This module defines the Link object used in Link extractors.

-See documentation in docs/ref/link-extractors.rst
+For the actual link extractor implementations see scrapy.contrib.linkextractors,
+or its documentation in: docs/ref/link-extractors.rst
 """
-
-from scrapy.utils.python import FixedSGMLParser, unique as unique_list, str_to_unicode
-from scrapy.utils.url import safe_url_string, urljoin_rfc as urljoin
-
-class LinkExtractor(FixedSGMLParser):
-
-    def __init__(self, tag="a", attr="href", unique=False, process_value=None):
-        FixedSGMLParser.__init__(self)
-        self.scan_tag = tag if callable(tag) else lambda t: t == tag
-        self.scan_attr = attr if callable(attr) else lambda a: a == attr
-        self.process_value = (lambda v: v) if process_value is None else process_value
-        self.current_link = None
-        self.unique = unique
-
-    def _extract_links(self, response_text, response_url, response_encoding):
-        self.reset()
-        self.feed(response_text)
-        self.close()
-
-        links = unique_list(self.links, key=lambda link: link.url) if self.unique else self.links
-
-        ret = []
-        base_url = self.base_url if self.base_url else response_url
-        for link in links:
-            link.url = urljoin(base_url, link.url)
-            link.url = safe_url_string(link.url, response_encoding)
-            link.text = str_to_unicode(link.text, response_encoding)
-            ret.append(link)
-
-        return ret
-
-    def extract_links(self, response):
-        # wrapper needed to allow to work directly with text
-        return self._extract_links(response.body, response.url, response.encoding)
-
-    def reset(self):
-        FixedSGMLParser.reset(self)
-        self.links = []
-        self.base_url = None
-
-    def unknown_starttag(self, tag, attrs):
-        if tag == 'base':
-            self.base_url = dict(attrs).get('href')
-        if self.scan_tag(tag):
-            for attr, value in attrs:
-                if self.scan_attr(attr):
-                    url = self.process_value(value)
-                    if url is not None:
-                        link = Link(url=url)
-                        self.links.append(link)
-                        self.current_link = link
-
-    def unknown_endtag(self, tag):
-        self.current_link = None
-
-    def handle_data(self, data):
-        if self.current_link and not self.current_link.text:
-            self.current_link.text = data.strip()
-
-    def matches(self, url):
-        """This extractor matches with any url, since
-        it doesn't contain any patterns"""
-        return True
-

 class Link(object):
-    """
-    Link objects represent an extracted link by the LinkExtractor.
+    """Link objects represent an extracted link by the LinkExtractor.
     At the moment, it contains just the url and link text.
     """

@ -85,3 +21,26 @@ class Link(object):

     def __repr__(self):
         return '<Link url=%r text=%r >' % (self.url, self.text)
+
+
+# FIXME: code below is for backwards compatibility and should be removed before
+# the 0.7 release
+
+import warnings
+
+from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor, BaseSgmlLinkExtractor
+
+class LinkExtractor(BaseSgmlLinkExtractor):
+
+    def __init__(self, *args, **kwargs):
+        warnings.warn("scrapy.link.LinkExtractor is deprecated, use scrapy.contrib.linkextractors.sgml.BaseSgmlLinkExtractor instead",
+            DeprecationWarning, stacklevel=2)
+        BaseSgmlLinkExtractor.__init__(self, *args, **kwargs)
+
+class RegexLinkExtractor(SgmlLinkExtractor):
+
+    def __init__(self, *args, **kwargs):
+        warnings.warn("scrapy.link.RegexLinkExtractor is deprecated, use scrapy.contrib.linkextractors.sgml.SgmlLinkExtractor instead",
+            DeprecationWarning, stacklevel=2)
+        SgmlLinkExtractor.__init__(self, *args, **kwargs)
@ -1,68 +1,14 @@
-"""
-This module provides some LinkExtractors, which extend the base LinkExtractor
-(scrapy.link.LinkExtractor) with some additional useful features.
-
-See documentation in docs/ref/link-extractors.rst
-"""
-
-import re
-
-from scrapy.link import LinkExtractor
-from scrapy.utils.url import canonicalize_url, url_is_from_any_domain
-from scrapy.xpath import HtmlXPathSelector
-from scrapy.utils.misc import arg_to_iter
-
-_re_type = type(re.compile("", 0))
-
-_matches = lambda url, regexs: any((r.search(url) for r in regexs))
-_is_valid_url = lambda url: url.split('://', 1)[0] in set(['http', 'https', 'file'])
-
-class RegexLinkExtractor(LinkExtractor):
-
-    def __init__(self, allow=(), deny=(), allow_domains=(), deny_domains=(), restrict_xpaths=(),
-                 tags=('a', 'area'), attrs=('href'), canonicalize=True, unique=True, process_value=None):
-        self.allow_res = [x if isinstance(x, _re_type) else re.compile(x) for x in arg_to_iter(allow)]
-        self.deny_res = [x if isinstance(x, _re_type) else re.compile(x) for x in arg_to_iter(deny)]
-        self.allow_domains = set(arg_to_iter(allow_domains))
-        self.deny_domains = set(arg_to_iter(deny_domains))
-        self.restrict_xpaths = tuple(arg_to_iter(restrict_xpaths))
-        self.canonicalize = canonicalize
-        tag_func = lambda x: x in tags
-        attr_func = lambda x: x in attrs
-        LinkExtractor.__init__(self, tag=tag_func, attr=attr_func,
-            unique=unique, process_value=process_value)
-
-    def extract_links(self, response):
-        if self.restrict_xpaths:
-            hxs = HtmlXPathSelector(response)
-            html_slice = ''.join(''.join(html_fragm for html_fragm in hxs.x(xpath_expr).extract()) for xpath_expr in self.restrict_xpaths)
-            links = self._extract_links(html_slice, response.url, response.encoding)
-        else:
-            links = LinkExtractor.extract_links(self, response)
-
-        links = [link for link in links if _is_valid_url(link.url)]
-
-        if self.allow_res:
-            links = [link for link in links if _matches(link.url, self.allow_res)]
-        if self.deny_res:
-            links = [link for link in links if not _matches(link.url, self.deny_res)]
-        if self.allow_domains:
-            links = [link for link in links if url_is_from_any_domain(link.url, self.allow_domains)]
-        if self.deny_domains:
-            links = [link for link in links if not url_is_from_any_domain(link.url, self.deny_domains)]
-
-        if self.canonicalize:
-            for link in links:
-                link.url = canonicalize_url(link.url)
-
-        return links
-
-    def matches(self, url):
-        if self.allow_domains and not url_is_from_any_domain(url, self.allow_domains):
-            return False
-        if self.deny_domains and url_is_from_any_domain(url, self.deny_domains):
-            return False
-
-        allowed = [regex.search(url) for regex in self.allow_res] if self.allow_res else [True]
-        denied = [regex.search(url) for regex in self.deny_res] if self.deny_res else []
-        return any(allowed) and not any(denied)
+# FIXME: code below is for backwards compatibility and should be removed before
+# the 0.7 release
+
+import warnings
+
+from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
+
+class RegexLinkExtractor(SgmlLinkExtractor):
+
+    def __init__(self, *args, **kwargs):
+        warnings.warn("scrapy.link.extractors.RegexLinkExtractor is deprecated, use scrapy.contrib.linkextractors.sgml.SgmlLinkExtractor instead",
+            DeprecationWarning, stacklevel=2)
+        SgmlLinkExtractor.__init__(self, *args, **kwargs)
@ -2,9 +2,9 @@ import re
 import unittest

 from scrapy.http import HtmlResponse
-from scrapy.link import LinkExtractor, Link
-from scrapy.link.extractors import RegexLinkExtractor
-from scrapy.contrib.link_extractors import HTMLImageLinkExtractor
+from scrapy.link import Link
+from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor, BaseSgmlLinkExtractor
+from scrapy.contrib.linkextractors.image import HTMLImageLinkExtractor
 from scrapy.tests import get_testdata

 class LinkExtractorTestCase(unittest.TestCase):

@ -18,7 +18,7 @@ class LinkExtractorTestCase(unittest.TestCase):
         </body></html>"""
         response = HtmlResponse("http://example.org/somepage/index.html", body=html)

-        lx = LinkExtractor()  # default: tag=a, attr=href
+        lx = BaseSgmlLinkExtractor()  # default: tag=a, attr=href
         self.assertEqual(lx.extract_links(response),
                          [Link(url='http://example.org/somepage/item/12.html', text='Item 12'),
                           Link(url='http://example.org/about.html', text='About us'),

@ -31,7 +31,7 @@ class LinkExtractorTestCase(unittest.TestCase):
         </body></html>"""
         response = HtmlResponse("http://example.org/somepage/index.html", body=html)

-        lx = LinkExtractor()  # default: tag=a, attr=href
+        lx = BaseSgmlLinkExtractor()  # default: tag=a, attr=href
         self.assertEqual(lx.extract_links(response),
                          [Link(url='http://otherdomain.com/base/item/12.html', text='Item 12')])

@ -42,7 +42,7 @@ class LinkExtractorTestCase(unittest.TestCase):
         body = get_testdata('link_extractor', 'linkextractor_latin1.html')
         response_latin1 = HtmlResponse(url='http://example.com/latin1', body=body)

-        lx = LinkExtractor()
+        lx = BaseSgmlLinkExtractor()
         self.assertEqual(lx.extract_links(response_utf8),
                          [ Link(url='http://example.com/sample_%C3%B1.html', text=''),
                            Link(url='http://example.com/sample_%E2%82%AC.html', text='sample \xe2\x82\xac text'.decode('utf-8')) ])

@ -59,75 +59,75 @@ class LinkExtractorTestCase(unittest.TestCase):
         url1 = 'http://lotsofstuff.com/stuff1/index'
         url2 = 'http://evenmorestuff.com/uglystuff/index'

-        lx = LinkExtractor()
+        lx = BaseSgmlLinkExtractor()
         self.assertEqual(lx.matches(url1), True)
         self.assertEqual(lx.matches(url2), True)

-class RegexLinkExtractorTestCase(unittest.TestCase):
+class SgmlLinkExtractorTestCase(unittest.TestCase):
     def setUp(self):
-        body = get_testdata('link_extractor', 'regex_linkextractor.html')
+        body = get_testdata('link_extractor', 'sgml_linkextractor.html')
         self.response = HtmlResponse(url='http://example.com/index', body=body)

     def test_urls_type(self):
         '''Test that the resulting urls are regular strings and not unicode objects'''
-        lx = RegexLinkExtractor()
+        lx = SgmlLinkExtractor()
         self.assertTrue(all(isinstance(link.url, str) for link in lx.extract_links(self.response)))

     def test_extraction(self):
         '''Test the extractor's behaviour among different situations'''

-        lx = RegexLinkExtractor()
+        lx = SgmlLinkExtractor()
         self.assertEqual([link for link in lx.extract_links(self.response)],
                          [ Link(url='http://example.com/sample1.html', text=u''),
                            Link(url='http://example.com/sample2.html', text=u'sample 2'),
                            Link(url='http://example.com/sample3.html', text=u'sample 3 text'),
                            Link(url='http://www.google.com/something', text=u'') ])

-        lx = RegexLinkExtractor(allow=('sample', ))
+        lx = SgmlLinkExtractor(allow=('sample', ))
         self.assertEqual([link for link in lx.extract_links(self.response)],
                          [ Link(url='http://example.com/sample1.html', text=u''),
                            Link(url='http://example.com/sample2.html', text=u'sample 2'),
                            Link(url='http://example.com/sample3.html', text=u'sample 3 text') ])

-        lx = RegexLinkExtractor(allow=('sample', ), unique=False)
+        lx = SgmlLinkExtractor(allow=('sample', ), unique=False)
         self.assertEqual([link for link in lx.extract_links(self.response)],
                          [ Link(url='http://example.com/sample1.html', text=u''),
                            Link(url='http://example.com/sample2.html', text=u'sample 2'),
                            Link(url='http://example.com/sample3.html', text=u'sample 3 text'),
                            Link(url='http://example.com/sample3.html', text=u'sample 3 repetition') ])

-        lx = RegexLinkExtractor(allow=('sample', ), deny=('3', ))
+        lx = SgmlLinkExtractor(allow=('sample', ), deny=('3', ))
         self.assertEqual([link for link in lx.extract_links(self.response)],
                          [ Link(url='http://example.com/sample1.html', text=u''),
                            Link(url='http://example.com/sample2.html', text=u'sample 2') ])

-        lx = RegexLinkExtractor(allow_domains=('google.com', ))
+        lx = SgmlLinkExtractor(allow_domains=('google.com', ))
         self.assertEqual([link for link in lx.extract_links(self.response)],
                          [ Link(url='http://www.google.com/something', text=u'') ])

-        lx = RegexLinkExtractor(tags=('img', ), attrs=('src', ))
+        lx = SgmlLinkExtractor(tags=('img', ), attrs=('src', ))
         self.assertEqual([link for link in lx.extract_links(self.response)],
                          [ Link(url='http://example.com/sample2.jpg', text=u'') ])

     def test_extraction_using_single_values(self):
         '''Test the extractor's behaviour among different situations'''

-        lx = RegexLinkExtractor(allow='sample')
+        lx = SgmlLinkExtractor(allow='sample')
         self.assertEqual([link for link in lx.extract_links(self.response)],
                          [ Link(url='http://example.com/sample1.html', text=u''),
                            Link(url='http://example.com/sample2.html', text=u'sample 2'),
                            Link(url='http://example.com/sample3.html', text=u'sample 3 text') ])

-        lx = RegexLinkExtractor(allow='sample', deny='3')
+        lx = SgmlLinkExtractor(allow='sample', deny='3')
         self.assertEqual([link for link in lx.extract_links(self.response)],
                          [ Link(url='http://example.com/sample1.html', text=u''),
                            Link(url='http://example.com/sample2.html', text=u'sample 2') ])

-        lx = RegexLinkExtractor(allow_domains='google.com')
+        lx = SgmlLinkExtractor(allow_domains='google.com')
         self.assertEqual([link for link in lx.extract_links(self.response)],
                          [ Link(url='http://www.google.com/something', text=u'') ])

-        lx = RegexLinkExtractor(deny_domains='example.com')
+        lx = SgmlLinkExtractor(deny_domains='example.com')
         self.assertEqual([link for link in lx.extract_links(self.response)],
                          [ Link(url='http://www.google.com/something', text=u'') ])

@ -135,23 +135,23 @@ class RegexLinkExtractorTestCase(unittest.TestCase):
         url1 = 'http://lotsofstuff.com/stuff1/index'
         url2 = 'http://evenmorestuff.com/uglystuff/index'

-        lx = RegexLinkExtractor(allow=(r'stuff1', ))
+        lx = SgmlLinkExtractor(allow=(r'stuff1', ))
         self.assertEqual(lx.matches(url1), True)
         self.assertEqual(lx.matches(url2), False)

-        lx = RegexLinkExtractor(deny=(r'uglystuff', ))
+        lx = SgmlLinkExtractor(deny=(r'uglystuff', ))
         self.assertEqual(lx.matches(url1), True)
         self.assertEqual(lx.matches(url2), False)

-        lx = RegexLinkExtractor(allow_domains=('evenmorestuff.com', ))
+        lx = SgmlLinkExtractor(allow_domains=('evenmorestuff.com', ))
         self.assertEqual(lx.matches(url1), False)
         self.assertEqual(lx.matches(url2), True)

-        lx = RegexLinkExtractor(deny_domains=('lotsofstuff.com', ))
+        lx = SgmlLinkExtractor(deny_domains=('lotsofstuff.com', ))
         self.assertEqual(lx.matches(url1), False)
         self.assertEqual(lx.matches(url2), True)

-        lx = RegexLinkExtractor(allow=('blah1', ), deny=('blah2', ),
+        lx = SgmlLinkExtractor(allow=('blah1', ), deny=('blah2', ),
                                allow_domains=('blah1.com', ), deny_domains=('blah2.com', ))
         self.assertEqual(lx.matches('http://blah1.com/blah1'), True)
         self.assertEqual(lx.matches('http://blah1.com/blah2'), False)

@ -159,7 +159,7 @@ class RegexLinkExtractorTestCase(unittest.TestCase):
         self.assertEqual(lx.matches('http://blah2.com/blah2'), False)

     def test_restrict_xpaths(self):
-        lx = RegexLinkExtractor(restrict_xpaths=('//div[@id="subwrapper"]', ))
+        lx = SgmlLinkExtractor(restrict_xpaths=('//div[@id="subwrapper"]', ))
         self.assertEqual([link for link in lx.extract_links(self.response)],
                          [ Link(url='http://example.com/sample1.html', text=u''),
                            Link(url='http://example.com/sample2.html', text=u'sample 2') ])

@ -177,7 +177,7 @@ class RegexLinkExtractorTestCase(unittest.TestCase):
         </body></html>"""
         response = HtmlResponse("http://example.org/somepage/index.html", body=html, encoding='windows-1252')

-        lx = RegexLinkExtractor(restrict_xpaths="//div[@class='links']")
+        lx = SgmlLinkExtractor(restrict_xpaths="//div[@class='links']")
         self.assertEqual(lx.extract_links(response),
                          [Link(url='http://example.org/about.html', text=u'About us\xa3')])

@ -194,7 +194,7 @@ class RegexLinkExtractorTestCase(unittest.TestCase):
         if m:
             return m.group(1)

-        lx = RegexLinkExtractor(process_value=process_value)
+        lx = SgmlLinkExtractor(process_value=process_value)
         self.assertEqual(lx.extract_links(response),
                          [Link(url='http://example.org/other/page.html', text='Link text')])
@ -7,7 +7,7 @@ import re

 from scrapy.spider import BaseSpider
 from scrapy.item import ScrapedItem
-from scrapy.link import LinkExtractor
+from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
 from scrapy.http import Request

 class TestSpider(BaseSpider):

@ -20,7 +20,7 @@ class TestSpider(BaseSpider):
     price_re = re.compile(">Price: \$(.*?)<", re.M)

     def parse(self, response):
-        xlink = LinkExtractor()
+        xlink = SgmlLinkExtractor()
         itemre = re.compile(self.itemurl_re)
         for link in xlink.extract_links(response):
             if itemre.search(link.url):