
Sorted out Link Extractors organization by moving them all to
scrapy.contrib.linkextractors.

The most relevant being:
    scrapy.link.extractors.RegexLinkExtractor

which was moved to:
    scrapy.contrib.linkextractors.sgml.SgmlLinkExtractor

The old location still works but throws a deprecation warning. It will be
removed before the 0.7 release.

Documentation and tests were also updated.

Also, in this changeset, a new regex-based link extractor was added to
scrapy.contrib.linkextractors.regex.
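
For example, instantiating an extractor through its old location keeps
working but emits the warning (a minimal sketch of the deprecation shims
added in this changeset; see the scrapy/link diffs below)::

    import warnings  # requires Python 2.6+ for catch_warnings(record=True)
    from scrapy.link.extractors import RegexLinkExtractor  # old, deprecated path

    with warnings.catch_warnings(record=True) as caught:
        warnings.simplefilter("always")
        lx = RegexLinkExtractor()  # the shim warns on instantiation
    assert any(issubclass(w.category, DeprecationWarning) for w in caught)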

--HG--
rename : scrapy/tests/sample_data/link_extractor/regex_linkextractor.html => scrapy/tests/sample_data/link_extractor/sgml_linkextractor.html
rename : scrapy/tests/test_link.py => scrapy/tests/test_contrib_linkextractors.py
Author: Pablo Hoffman
Date: 2009-05-18 19:19:37 -03:00
Parent: 7b34e08392
Commit: 86498abdf1
16 changed files with 306 additions and 232 deletions

View File

@ -125,7 +125,7 @@ Finally, here's the spider code::
domain_name = 'mininova.org'
start_urls = ['http://www.mininova.org/today']
rules = [Rule(RegexLinkExtractor(allow=['/tor/\d+']), 'parse_torrent')]
rules = [Rule(SgmlLinkExtractor(allow=['/tor/\d+']), 'parse_torrent')]
def parse_torrent(self, response):
x = HtmlXPathSelector(response)

View File

@ -93,7 +93,7 @@ Let's now take a look at an example CrawlSpider with Rules::
from scrapy import log
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.link.extractors import RegexLinkExtractor
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from scrapy.xpath.selector import HtmlXPathSelector
from scrapy.item import ScrapedItem
@ -104,10 +104,10 @@ Let's now take a look at an example CrawlSpider with Rules::
rules = (
# Extract links matching 'category.php' (but not matching 'subsection.php')
# and follow links from them (since no callback means follow=True by default).
Rule(RegexLinkExtractor(allow=('category\.php', ), deny=('subsection\.php', ))),
Rule(SgmlLinkExtractor(allow=('category\.php', ), deny=('subsection\.php', ))),
# Extract links matching 'item.php' and parse them with the spider's method parse_item
Rule(RegexLinkExtractor(allow=('item\.php', )), callback='parse_item'),
Rule(SgmlLinkExtractor(allow=('item\.php', )), callback='parse_item'),
)
def parse_item(self, response):

View File

@ -4,67 +4,25 @@
Available Link Extractors
=========================
.. module:: scrapy.link
.. module:: scrapy.contrib.linkextractors
:synopsis: Link extractor classes
LinkExtractor
=============
All link extractor classes bundled with Scrapy are provided in the
:mod:`scrapy.contrib.linkextractors` module.
.. class:: LinkExtractor(tag="a", attr="href", unique=False, process_value=None)
.. module:: scrapy.contrib.linkextractors.sgml
:synopsis: SGMLParser-based link extractors
This is the most basic Link Extractor, which extracts links from a response
by looking at the given attributes inside the given tags.
SgmlLinkExtractor
=================
The constructor arguments are:
.. class:: SgmlLinkExtractor(allow=(), deny=(), allow_domains=(), deny_domains=(), restrict_xpaths=(), tags=('a', 'area'), attrs=('href'), canonicalize=True, unique=True, process_value=None)
:param tag: either a string (with the name of a tag) or a function that
receives a tag name and returns ``True`` if links should be extracted
from that tag, or ``False`` if they shouldn't. Defaults to ``'a'``.
:type tag: str or callable
:param attr: either a string (with the name of a tag attribute), or a
function that receives an attribute name and returns ``True`` if
links should be extracted from it, or ``False`` if they shouldn't.
Defaults to ``href``.
:type attr: str or callable
:param unique: a boolean which specifies whether duplicate filtering should
be applied to the extracted links.
:type unique: boolean
:param process_value: a function which receives each value extracted from
the scanned tags and attributes, and can modify the value and return a
new one, or return ``None`` to ignore the link altogether. If not
given, ``process_value`` defaults to ``lambda x: x``.
.. highlight:: html
For example, to extract links from this code::
<a href="javascript:goToPage('../other/page.html'); return false">Link text</a>
.. highlight:: python
You can use the following function in ``process_value``::
def process_value(value):
m = re.search("javascript:goToPage\('(.*?)'", value)
if m:
return m.group(1)
:type process_value: callable
RegexLinkExtractor
==================
.. class:: RegexLinkExtractor(allow=(), deny=(), allow_domains=(), deny_domains=(), restrict_xpaths=(), tags=('a', 'area'), attrs=('href'), canonicalize=True, unique=True, process_value=None)
The RegexLinkExtractor extends the base :class:`LinkExtractor` by providing
additional filters that you can specify to extract links, including regular
expression patterns that the links must match to be extracted. All those
filters are configured through these constructor parameters:
The SgmlLinkExtractor extends the base :class:`BaseSgmlLinkExtractor` by
providing additional filters that you can specify to extract links,
including regular expression patterns that the links must match to be
extracted. All those filters are configured through these constructor
parameters:
:param allow: a single regular expression (or list of regular expressions)
that the (absolute) urls must match in order to be extracted. If not
@ -112,3 +70,52 @@ RegexLinkExtractor
:class:`LinkExtractor` class constructor
:type process_value: callable
BaseSgmlLinkExtractor
=====================
.. class:: BaseSgmlLinkExtractor(tag="a", attr="href", unique=False, process_value=None)
The purpose of this Link Extractor is only to serve as a base class for the
:class:`SgmlLinkExtractor`. You should use that one instead.
The constructor arguments are:
:param tag: either a string (with the name of a tag) or a function that
receives a tag name and returns ``True`` if links should be extracted
from that tag, or ``False`` if they shouldn't. Defaults to ``'a'``.
:type tag: str or callable
:param attr: either a string (with the name of a tag attribute), or a
function that receives an attribute name and returns ``True`` if
links should be extracted from it, or ``False`` if they shouldn't.
Defaults to ``href``.
:type attr: str or callable
:param unique: a boolean which specifies whether duplicate filtering should
be applied to the extracted links.
:type unique: boolean
:param process_value: a function which receives each value extracted from
the scanned tags and attributes, and can modify the value and return a
new one, or return ``None`` to ignore the link altogether. If not
given, ``process_value`` defaults to ``lambda x: x``.
.. highlight:: html
For example, to extract links from this code::
<a href="javascript:goToPage('../other/page.html'); return false">Link text</a>
.. highlight:: python
You can use the following function in ``process_value``::
def process_value(value):
m = re.search("javascript:goToPage\('(.*?)'", value)
if m:
return m.group(1)
:type process_value: callable
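
For instance, the ``tag`` and ``attr`` arguments accept predicate functions
as well as strings, per the constructor documented above (a hypothetical
sketch; ``response`` is assumed to be an ``HtmlResponse``)::

    from scrapy.contrib.linkextractors.sgml import BaseSgmlLinkExtractor

    # extract href values from both <a> and <area> tags
    lx = BaseSgmlLinkExtractor(tag=lambda t: t in ('a', 'area'),
                               attr=lambda a: a == 'href',
                               unique=True)
    links = lx.extract_links(response)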

View File

@ -152,7 +152,7 @@ CrawlSpider example
Let's now take a look at an example CrawlSpider with rules::
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.link.extractors import RegexLinkExtractor
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from scrapy.xpath.selector import HtmlXPathSelector
from scrapy.item import ScrapedItem
@ -163,10 +163,10 @@ Let's now take a look at an example CrawlSpider with rules::
rules = (
# Extract links matching 'category.php' (but not matching 'subsection.php')
# and follow links from them (since no callback means follow=True by default).
Rule(RegexLinkExtractor(allow=('category\.php', ), deny=('subsection\.php', ))),
Rule(SgmlLinkExtractor(allow=('category\.php', ), deny=('subsection\.php', ))),
# Extract links matching 'item.php' and parse them with the spider's method parse_item
Rule(RegexLinkExtractor(allow=('item\.php', )), callback='parse_item'),
Rule(SgmlLinkExtractor(allow=('item\.php', )), callback='parse_item'),
)
def parse_item(self, response):

View File

@ -63,7 +63,7 @@ those links. For example, the following one::
So, based on that regular expression we can create the first crawling rule::
Rule(RegexLinkExtractor(allow='google.com/[A-Z][a-zA-Z_/]+$', ),
Rule(SgmlLinkExtractor(allow='google.com/[A-Z][a-zA-Z_/]+$', ),
'parse_category',
follow=True,
),
@ -75,7 +75,7 @@ process and extract data from those pages.
This is how the spider would look so far::
from scrapy.link.extractors import RegexLinkExtractor
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from scrapy.contrib.spiders import CrawlSpider, Rule
class GoogleDirectorySpider(CrawlSpider):
@ -83,7 +83,7 @@ This is how the spider would look so far::
start_urls = ['http://www.google.com/dirhp']
rules = (
Rule(RegexLinkExtractor(allow='google.com/[A-Z][a-zA-Z_/]+$'),
Rule(SgmlLinkExtractor(allow='google.com/[A-Z][a-zA-Z_/]+$'),
'parse_category', follow=True,
),
)

View File

@ -0,0 +1,7 @@
"""
scrapy.contrib.linkextractors
This package contains a collection of Link Extractors.
For more info see docs/ref/link-extractors.rst
"""

View File

@ -1,7 +1,8 @@
"""
This module provides additional LinkExtractors, apart from the ones in scrapy.link
and scrapy.link.extractors.
This module implements the HtmlImageLinkExtractor for extracting
image links only.
"""
import urlparse
from scrapy.link import Link

View File

@ -0,0 +1,28 @@
import re
from scrapy.utils.url import urljoin_rfc as urljoin
from scrapy.utils.markup import remove_tags, remove_entities, replace_escape_chars
from scrapy.link import Link
from .sgml import SgmlLinkExtractor
linkre = re.compile(
"<a\s.*?href=(\"[.#]+?\"|\'[.#]+?\'|[^\s]+?)(>|\s.*?>)(.*?)<[/ ]?a>",
re.DOTALL | re.IGNORECASE)
def clean_link(link_text):
"""Remove leading and trailing whitespace and punctuation"""
return link_text.strip("\t\r\n '\"")
class RegexLinkExtractor(SgmlLinkExtractor):
"""High performant link extractor"""
def _extract_links(self, response_text, response_url, response_encoding):
base_url = self.base_url if self.base_url else response_url
clean_url = lambda u: urljoin(base_url, remove_entities(clean_link(u.decode(response_encoding))))
clean_text = lambda t: replace_escape_chars(remove_tags(t.decode(response_encoding))).strip()
links_text = linkre.findall(response_text)
urlstext = set([(clean_url(url), clean_text(text)) for url, _, text in links_text])
return [Link(url, text) for url, text in urlstext]
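
A hypothetical usage sketch of the new extractor (markup and URLs invented
for illustration)::

    from scrapy.http import HtmlResponse
    from scrapy.contrib.linkextractors.regex import RegexLinkExtractor

    html = '<a href="sample.html">Sample text</a>'
    response = HtmlResponse(url='http://example.com/index', body=html)

    lx = RegexLinkExtractor()
    for link in lx.extract_links(response):
        print link.url, link.text  # http://example.com/sample.html Sample text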

View File

@ -0,0 +1,126 @@
"""
SGMLParser-based Link extractors
"""
import re
from scrapy.xpath import HtmlXPathSelector
from scrapy.link import Link
from scrapy.utils.misc import arg_to_iter
from scrapy.utils.python import FixedSGMLParser, unique as unique_list, str_to_unicode
from scrapy.utils.url import safe_url_string, urljoin_rfc as urljoin, canonicalize_url, url_is_from_any_domain
class BaseSgmlLinkExtractor(FixedSGMLParser):
def __init__(self, tag="a", attr="href", unique=False, process_value=None):
FixedSGMLParser.__init__(self)
self.scan_tag = tag if callable(tag) else lambda t: t == tag
self.scan_attr = attr if callable(attr) else lambda a: a == attr
self.process_value = (lambda v: v) if process_value is None else process_value
self.current_link = None
self.unique = unique
def _extract_links(self, response_text, response_url, response_encoding):
self.reset()
self.feed(response_text)
self.close()
links = unique_list(self.links, key=lambda link: link.url) if self.unique else self.links
ret = []
base_url = self.base_url if self.base_url else response_url
for link in links:
link.url = urljoin(base_url, link.url)
link.url = safe_url_string(link.url, response_encoding)
link.text = str_to_unicode(link.text, response_encoding)
ret.append(link)
return ret
def extract_links(self, response):
# wrapper needed to allow working directly with text
return self._extract_links(response.body, response.url, response.encoding)
def reset(self):
FixedSGMLParser.reset(self)
self.links = []
self.base_url = None
def unknown_starttag(self, tag, attrs):
if tag == 'base':
self.base_url = dict(attrs).get('href')
if self.scan_tag(tag):
for attr, value in attrs:
if self.scan_attr(attr):
url = self.process_value(value)
if url is not None:
link = Link(url=url)
self.links.append(link)
self.current_link = link
def unknown_endtag(self, tag):
self.current_link = None
def handle_data(self, data):
if self.current_link and not self.current_link.text:
self.current_link.text = data.strip()
def matches(self, url):
"""This extractor matches with any url, since
it doesn't contain any patterns"""
return True
_re_type = type(re.compile("", 0))
_matches = lambda url, regexs: any((r.search(url) for r in regexs))
_is_valid_url = lambda url: url.split('://', 1)[0] in set(['http', 'https', 'file'])
class SgmlLinkExtractor(BaseSgmlLinkExtractor):
def __init__(self, allow=(), deny=(), allow_domains=(), deny_domains=(), restrict_xpaths=(),
tags=('a', 'area'), attrs=('href'), canonicalize=True, unique=True, process_value=None):
self.allow_res = [x if isinstance(x, _re_type) else re.compile(x) for x in arg_to_iter(allow)]
self.deny_res = [x if isinstance(x, _re_type) else re.compile(x) for x in arg_to_iter(deny)]
self.allow_domains = set(arg_to_iter(allow_domains))
self.deny_domains = set(arg_to_iter(deny_domains))
self.restrict_xpaths = tuple(arg_to_iter(restrict_xpaths))
self.canonicalize = canonicalize
tag_func = lambda x: x in tags
attr_func = lambda x: x in attrs
BaseSgmlLinkExtractor.__init__(self, tag=tag_func, attr=attr_func,
unique=unique, process_value=process_value)
def extract_links(self, response):
if self.restrict_xpaths:
hxs = HtmlXPathSelector(response)
html_slice = ''.join(''.join(html_fragm for html_fragm in hxs.x(xpath_expr).extract()) for xpath_expr in self.restrict_xpaths)
links = self._extract_links(html_slice, response.url, response.encoding)
else:
links = BaseSgmlLinkExtractor.extract_links(self, response)
links = [link for link in links if _is_valid_url(link.url)]
if self.allow_res:
links = [link for link in links if _matches(link.url, self.allow_res)]
if self.deny_res:
links = [link for link in links if not _matches(link.url, self.deny_res)]
if self.allow_domains:
links = [link for link in links if url_is_from_any_domain(link.url, self.allow_domains)]
if self.deny_domains:
links = [link for link in links if not url_is_from_any_domain(link.url, self.deny_domains)]
if self.canonicalize:
for link in links:
link.url = canonicalize_url(link.url)
return links
def matches(self, url):
if self.allow_domains and not url_is_from_any_domain(url, self.allow_domains):
return False
if self.deny_domains and url_is_from_any_domain(url, self.deny_domains):
return False
allowed = [regex.search(url) for regex in self.allow_res] if self.allow_res else [True]
denied = [regex.search(url) for regex in self.deny_res] if self.deny_res else []
return any(allowed) and not any(denied)
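
To illustrate how ``matches`` combines these filters (hypothetical URLs)::

    lx = SgmlLinkExtractor(allow=(r'/tor/\d+', ), deny_domains=('ads.example.com', ))
    lx.matches('http://www.mininova.org/tor/2657665')  # True: allow pattern matches
    lx.matches('http://ads.example.com/tor/123')       # False: domain is denied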

View File

@ -1,5 +1,5 @@
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.link.extractors import RegexLinkExtractor
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
class GenericSpider(CrawlSpider):
"""
@ -10,11 +10,11 @@ class GenericSpider(CrawlSpider):
def __init__(self, domain_name):
self.domain_name = domain_name
self.rules = (
Rule(RegexLinkExtractor(allow_domains=(domain_name,)), self.parse_note, follow=True),
Rule(SgmlLinkExtractor(allow_domains=(domain_name,)), self.parse_note, follow=True),
)
super(GenericSpider, self).__init__()
def parse_note(self, response):
pass
# not a singleton

View File

@ -10,7 +10,7 @@ from scrapy.utils.url import is_url
from scrapy.utils.response import get_base_url
from scrapy.utils.python import flatten, unicode_to_str
from scrapy.xpath.selector import XPathSelector, XPathSelectorList
from scrapy.contrib.link_extractors import HTMLImageLinkExtractor
from scrapy.contrib.linkextractors.image import HTMLImageLinkExtractor
def extract(location, adaptor_args=None):
"""

View File

@ -1,76 +1,12 @@
"""
LinkExtractor provides an efficient way to extract links from pages
This module defines the Link object used in Link extractors.
See documentation in docs/ref/link-extractors.rst
For the actual link extractors implementation see scrapy.contrib.linkextractors, or
its documentation in: docs/ref/link-extractors.rst
"""
from scrapy.utils.python import FixedSGMLParser, unique as unique_list, str_to_unicode
from scrapy.utils.url import safe_url_string, urljoin_rfc as urljoin
class LinkExtractor(FixedSGMLParser):
def __init__(self, tag="a", attr="href", unique=False, process_value=None):
FixedSGMLParser.__init__(self)
self.scan_tag = tag if callable(tag) else lambda t: t == tag
self.scan_attr = attr if callable(attr) else lambda a: a == attr
self.process_value = (lambda v: v) if process_value is None else process_value
self.current_link = None
self.unique = unique
def _extract_links(self, response_text, response_url, response_encoding):
self.reset()
self.feed(response_text)
self.close()
links = unique_list(self.links, key=lambda link: link.url) if self.unique else self.links
ret = []
base_url = self.base_url if self.base_url else response_url
for link in links:
link.url = urljoin(base_url, link.url)
link.url = safe_url_string(link.url, response_encoding)
link.text = str_to_unicode(link.text, response_encoding)
ret.append(link)
return ret
def extract_links(self, response):
# wrapper needed to allow working directly with text
return self._extract_links(response.body, response.url, response.encoding)
def reset(self):
FixedSGMLParser.reset(self)
self.links = []
self.base_url = None
def unknown_starttag(self, tag, attrs):
if tag == 'base':
self.base_url = dict(attrs).get('href')
if self.scan_tag(tag):
for attr, value in attrs:
if self.scan_attr(attr):
url = self.process_value(value)
if url is not None:
link = Link(url=url)
self.links.append(link)
self.current_link = link
def unknown_endtag(self, tag):
self.current_link = None
def handle_data(self, data):
if self.current_link and not self.current_link.text:
self.current_link.text = data.strip()
def matches(self, url):
"""This extractor matches with any url, since
it doesn't contain any patterns"""
return True
class Link(object):
"""
Link objects represent an extracted link by the LinkExtractor.
"""Link objects represent an extracted link by the LinkExtractor.
At the moment, it contains just the url and link text.
"""
@ -85,3 +21,26 @@ class Link(object):
def __repr__(self):
return '<Link url=%r text=%r >' % (self.url, self.text)
# FIXME: code below is for backwards compatibility and should be removed before
# the 0.7 release
import warnings
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor, BaseSgmlLinkExtractor
class LinkExtractor(BaseSgmlLinkExtractor):
def __init__(self, *args, **kwargs):
warnings.warn("scrapy.link.LinkExtractor is deprecated, use scrapy.contrib.linkextractors.sgml.BaseSgmlLinkExtractor instead",
DeprecationWarning, stacklevel=2)
BaseSgmlLinkExtractor.__init__(self, *args, **kwargs)
class RegexLinkExtractor(SgmlLinkExtractor):
def __init__(self, *args, **kwargs):
warnings.warn("scrapy.link.RegexLinkExtractor is deprecated, use scrapy.contrib.linkextractors.sgml.SgmlLinkExtractor instead",
DeprecationWarning, stacklevel=2)
SgmlLinkExtractor.__init__(self, *args, **kwargs)

View File

@ -1,68 +1,14 @@
"""
This module provides some LinkExtractors, which extend the base LinkExtractor
(scrapy.link.LinkExtractor) with some additional useful features.
# FIXME: code below is for backwards compatibility and should be removed before
# the 0.7 release
See documentation in docs/ref/link-extractors.rst
"""
import warnings
import re
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from scrapy.link import LinkExtractor
from scrapy.utils.url import canonicalize_url, url_is_from_any_domain
from scrapy.xpath import HtmlXPathSelector
from scrapy.utils.misc import arg_to_iter
class RegexLinkExtractor(SgmlLinkExtractor):
_re_type = type(re.compile("", 0))
def __init__(self, *args, **kwargs):
warnings.warn("scrapy.link.extractors.RegexLinkExtractor is deprecated, use scrapy.contrib.linkextractors.sgml.SgmlLinkExtractor instead",
DeprecationWarning, stacklevel=2)
SgmlLinkExtractor.__init__(self, *args, **kwargs)
_matches = lambda url, regexs: any((r.search(url) for r in regexs))
_is_valid_url = lambda url: url.split('://', 1)[0] in set(['http', 'https', 'file'])
class RegexLinkExtractor(LinkExtractor):
def __init__(self, allow=(), deny=(), allow_domains=(), deny_domains=(), restrict_xpaths=(),
tags=('a', 'area'), attrs=('href'), canonicalize=True, unique=True, process_value=None):
self.allow_res = [x if isinstance(x, _re_type) else re.compile(x) for x in arg_to_iter(allow)]
self.deny_res = [x if isinstance(x, _re_type) else re.compile(x) for x in arg_to_iter(deny)]
self.allow_domains = set(arg_to_iter(allow_domains))
self.deny_domains = set(arg_to_iter(deny_domains))
self.restrict_xpaths = tuple(arg_to_iter(restrict_xpaths))
self.canonicalize = canonicalize
tag_func = lambda x: x in tags
attr_func = lambda x: x in attrs
LinkExtractor.__init__(self, tag=tag_func, attr=attr_func,
unique=unique, process_value=process_value)
def extract_links(self, response):
if self.restrict_xpaths:
hxs = HtmlXPathSelector(response)
html_slice = ''.join(''.join(html_fragm for html_fragm in hxs.x(xpath_expr).extract()) for xpath_expr in self.restrict_xpaths)
links = self._extract_links(html_slice, response.url, response.encoding)
else:
links = LinkExtractor.extract_links(self, response)
links = [link for link in links if _is_valid_url(link.url)]
if self.allow_res:
links = [link for link in links if _matches(link.url, self.allow_res)]
if self.deny_res:
links = [link for link in links if not _matches(link.url, self.deny_res)]
if self.allow_domains:
links = [link for link in links if url_is_from_any_domain(link.url, self.allow_domains)]
if self.deny_domains:
links = [link for link in links if not url_is_from_any_domain(link.url, self.deny_domains)]
if self.canonicalize:
for link in links:
link.url = canonicalize_url(link.url)
return links
def matches(self, url):
if self.allow_domains and not url_is_from_any_domain(url, self.allow_domains):
return False
if self.deny_domains and url_is_from_any_domain(url, self.deny_domains):
return False
allowed = [regex.search(url) for regex in self.allow_res] if self.allow_res else [True]
denied = [regex.search(url) for regex in self.deny_res] if self.deny_res else []
return any(allowed) and not any(denied)

View File

@ -2,9 +2,9 @@ import re
import unittest
from scrapy.http import HtmlResponse
from scrapy.link import LinkExtractor, Link
from scrapy.link.extractors import RegexLinkExtractor
from scrapy.contrib.link_extractors import HTMLImageLinkExtractor
from scrapy.link import Link
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor, BaseSgmlLinkExtractor
from scrapy.contrib.linkextractors.image import HTMLImageLinkExtractor
from scrapy.tests import get_testdata
class LinkExtractorTestCase(unittest.TestCase):
@ -18,7 +18,7 @@ class LinkExtractorTestCase(unittest.TestCase):
</body></html>"""
response = HtmlResponse("http://example.org/somepage/index.html", body=html)
lx = LinkExtractor() # default: tag=a, attr=href
lx = BaseSgmlLinkExtractor() # default: tag=a, attr=href
self.assertEqual(lx.extract_links(response),
[Link(url='http://example.org/somepage/item/12.html', text='Item 12'),
Link(url='http://example.org/about.html', text='About us'),
@ -31,7 +31,7 @@ class LinkExtractorTestCase(unittest.TestCase):
</body></html>"""
response = HtmlResponse("http://example.org/somepage/index.html", body=html)
lx = LinkExtractor() # default: tag=a, attr=href
lx = BaseSgmlLinkExtractor() # default: tag=a, attr=href
self.assertEqual(lx.extract_links(response),
[Link(url='http://otherdomain.com/base/item/12.html', text='Item 12')])
@ -42,7 +42,7 @@ class LinkExtractorTestCase(unittest.TestCase):
body = get_testdata('link_extractor', 'linkextractor_latin1.html')
response_latin1 = HtmlResponse(url='http://example.com/latin1', body=body)
lx = LinkExtractor()
lx = BaseSgmlLinkExtractor()
self.assertEqual(lx.extract_links(response_utf8),
[ Link(url='http://example.com/sample_%C3%B1.html', text=''),
Link(url='http://example.com/sample_%E2%82%AC.html', text='sample \xe2\x82\xac text'.decode('utf-8')) ])
@ -59,75 +59,75 @@ class LinkExtractorTestCase(unittest.TestCase):
url1 = 'http://lotsofstuff.com/stuff1/index'
url2 = 'http://evenmorestuff.com/uglystuff/index'
lx = LinkExtractor()
lx = BaseSgmlLinkExtractor()
self.assertEqual(lx.matches(url1), True)
self.assertEqual(lx.matches(url2), True)
class RegexLinkExtractorTestCase(unittest.TestCase):
class SgmlLinkExtractorTestCase(unittest.TestCase):
def setUp(self):
body = get_testdata('link_extractor', 'regex_linkextractor.html')
body = get_testdata('link_extractor', 'sgml_linkextractor.html')
self.response = HtmlResponse(url='http://example.com/index', body=body)
def test_urls_type(self):
'''Test that the resulting urls are regular strings and not unicode objects'''
lx = RegexLinkExtractor()
lx = SgmlLinkExtractor()
self.assertTrue(all(isinstance(link.url, str) for link in lx.extract_links(self.response)))
def test_extraction(self):
'''Test the extractor's behaviour in different situations'''
lx = RegexLinkExtractor()
lx = SgmlLinkExtractor()
self.assertEqual([link for link in lx.extract_links(self.response)],
[ Link(url='http://example.com/sample1.html', text=u''),
Link(url='http://example.com/sample2.html', text=u'sample 2'),
Link(url='http://example.com/sample3.html', text=u'sample 3 text'),
Link(url='http://www.google.com/something', text=u'') ])
lx = RegexLinkExtractor(allow=('sample', ))
lx = SgmlLinkExtractor(allow=('sample', ))
self.assertEqual([link for link in lx.extract_links(self.response)],
[ Link(url='http://example.com/sample1.html', text=u''),
Link(url='http://example.com/sample2.html', text=u'sample 2'),
Link(url='http://example.com/sample3.html', text=u'sample 3 text') ])
lx = RegexLinkExtractor(allow=('sample', ), unique=False)
lx = SgmlLinkExtractor(allow=('sample', ), unique=False)
self.assertEqual([link for link in lx.extract_links(self.response)],
[ Link(url='http://example.com/sample1.html', text=u''),
Link(url='http://example.com/sample2.html', text=u'sample 2'),
Link(url='http://example.com/sample3.html', text=u'sample 3 text'),
Link(url='http://example.com/sample3.html', text=u'sample 3 repetition') ])
lx = RegexLinkExtractor(allow=('sample', ), deny=('3', ))
lx = SgmlLinkExtractor(allow=('sample', ), deny=('3', ))
self.assertEqual([link for link in lx.extract_links(self.response)],
[ Link(url='http://example.com/sample1.html', text=u''),
Link(url='http://example.com/sample2.html', text=u'sample 2') ])
lx = RegexLinkExtractor(allow_domains=('google.com', ))
lx = SgmlLinkExtractor(allow_domains=('google.com', ))
self.assertEqual([link for link in lx.extract_links(self.response)],
[ Link(url='http://www.google.com/something', text=u'') ])
lx = RegexLinkExtractor(tags=('img', ), attrs=('src', ))
lx = SgmlLinkExtractor(tags=('img', ), attrs=('src', ))
self.assertEqual([link for link in lx.extract_links(self.response)],
[ Link(url='http://example.com/sample2.jpg', text=u'') ])
def test_extraction_using_single_values(self):
'''Test the extractor's behaviour when using single values instead of sequences'''
lx = RegexLinkExtractor(allow='sample')
lx = SgmlLinkExtractor(allow='sample')
self.assertEqual([link for link in lx.extract_links(self.response)],
[ Link(url='http://example.com/sample1.html', text=u''),
Link(url='http://example.com/sample2.html', text=u'sample 2'),
Link(url='http://example.com/sample3.html', text=u'sample 3 text') ])
lx = RegexLinkExtractor(allow='sample', deny='3')
lx = SgmlLinkExtractor(allow='sample', deny='3')
self.assertEqual([link for link in lx.extract_links(self.response)],
[ Link(url='http://example.com/sample1.html', text=u''),
Link(url='http://example.com/sample2.html', text=u'sample 2') ])
lx = RegexLinkExtractor(allow_domains='google.com')
lx = SgmlLinkExtractor(allow_domains='google.com')
self.assertEqual([link for link in lx.extract_links(self.response)],
[ Link(url='http://www.google.com/something', text=u'') ])
lx = RegexLinkExtractor(deny_domains='example.com')
lx = SgmlLinkExtractor(deny_domains='example.com')
self.assertEqual([link for link in lx.extract_links(self.response)],
[ Link(url='http://www.google.com/something', text=u'') ])
@ -135,23 +135,23 @@ class RegexLinkExtractorTestCase(unittest.TestCase):
url1 = 'http://lotsofstuff.com/stuff1/index'
url2 = 'http://evenmorestuff.com/uglystuff/index'
lx = RegexLinkExtractor(allow=(r'stuff1', ))
lx = SgmlLinkExtractor(allow=(r'stuff1', ))
self.assertEqual(lx.matches(url1), True)
self.assertEqual(lx.matches(url2), False)
lx = RegexLinkExtractor(deny=(r'uglystuff', ))
lx = SgmlLinkExtractor(deny=(r'uglystuff', ))
self.assertEqual(lx.matches(url1), True)
self.assertEqual(lx.matches(url2), False)
lx = RegexLinkExtractor(allow_domains=('evenmorestuff.com', ))
lx = SgmlLinkExtractor(allow_domains=('evenmorestuff.com', ))
self.assertEqual(lx.matches(url1), False)
self.assertEqual(lx.matches(url2), True)
lx = RegexLinkExtractor(deny_domains=('lotsofstuff.com', ))
lx = SgmlLinkExtractor(deny_domains=('lotsofstuff.com', ))
self.assertEqual(lx.matches(url1), False)
self.assertEqual(lx.matches(url2), True)
lx = RegexLinkExtractor(allow=('blah1', ), deny=('blah2', ),
lx = SgmlLinkExtractor(allow=('blah1', ), deny=('blah2', ),
allow_domains=('blah1.com', ), deny_domains=('blah2.com', ))
self.assertEqual(lx.matches('http://blah1.com/blah1'), True)
self.assertEqual(lx.matches('http://blah1.com/blah2'), False)
@ -159,7 +159,7 @@ class RegexLinkExtractorTestCase(unittest.TestCase):
self.assertEqual(lx.matches('http://blah2.com/blah2'), False)
def test_restrict_xpaths(self):
lx = RegexLinkExtractor(restrict_xpaths=('//div[@id="subwrapper"]', ))
lx = SgmlLinkExtractor(restrict_xpaths=('//div[@id="subwrapper"]', ))
self.assertEqual([link for link in lx.extract_links(self.response)],
[ Link(url='http://example.com/sample1.html', text=u''),
Link(url='http://example.com/sample2.html', text=u'sample 2') ])
@ -177,7 +177,7 @@ class RegexLinkExtractorTestCase(unittest.TestCase):
</body></html>"""
response = HtmlResponse("http://example.org/somepage/index.html", body=html, encoding='windows-1252')
lx = RegexLinkExtractor(restrict_xpaths="//div[@class='links']")
lx = SgmlLinkExtractor(restrict_xpaths="//div[@class='links']")
self.assertEqual(lx.extract_links(response),
[Link(url='http://example.org/about.html', text=u'About us\xa3')])
@ -194,7 +194,7 @@ class RegexLinkExtractorTestCase(unittest.TestCase):
if m:
return m.group(1)
lx = RegexLinkExtractor(process_value=process_value)
lx = SgmlLinkExtractor(process_value=process_value)
self.assertEqual(lx.extract_links(response),
[Link(url='http://example.org/other/page.html', text='Link text')])

View File

@ -7,7 +7,7 @@ import re
from scrapy.spider import BaseSpider
from scrapy.item import ScrapedItem
from scrapy.link import LinkExtractor
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from scrapy.http import Request
class TestSpider(BaseSpider):
@ -20,7 +20,7 @@ class TestSpider(BaseSpider):
price_re = re.compile(">Price: \$(.*?)<", re.M)
def parse(self, response):
xlink = LinkExtractor()
xlink = SgmlLinkExtractor()
itemre = re.compile(self.itemurl_re)
for link in xlink.extract_links(response):
if itemre.search(link.url):