Sorted out Link Extractors organization by moving them all to scrapy.contrib.linkextractors. The most relevant change: scrapy.link.extractors.RegexLinkExtractor was moved to scrapy.contrib.linkextractors.sgml.SgmlLinkExtractor. The old location still works but throws a deprecation warning; it will be removed before the 0.7 release. Documentation and tests were also updated. Also, in this changeset, a new regex-based link extractor was added to scrapy.contrib.linkextractors.regex.

--HG--
rename : scrapy/tests/sample_data/link_extractor/regex_linkextractor.html => scrapy/tests/sample_data/link_extractor/sgml_linkextractor.html
rename : scrapy/tests/test_link.py => scrapy/tests/test_contrib_linkextractors.py
parent 7b34e08392
commit 86498abdf1
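Migration note: as the compatibility shims in the scrapy/link hunks below show, the old names survive as thin subclasses whose constructors emit a DeprecationWarning. A minimal sketch of what callers see at this revision (hypothetical usage, assuming Python 2.6+ for warnings.catch_warnings)::

    import warnings

    # deprecated location: still importable, warns on instantiation
    from scrapy.link.extractors import RegexLinkExtractor
    with warnings.catch_warnings(record=True) as caught:
        warnings.simplefilter("always")
        lx = RegexLinkExtractor(allow=(r'/tor/\d+', ))
    assert any(issubclass(w.category, DeprecationWarning) for w in caught)

    # new canonical location: same behaviour, no warning
    from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
    lx = SgmlLinkExtractor(allow=(r'/tor/\d+', ))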
@@ -125,7 +125,7 @@ Finally, here's the spider code::

     domain_name = 'mininova.org'
     start_urls = ['http://www.mininova.org/today']
-    rules = [Rule(RegexLinkExtractor(allow=['/tor/\d+']), 'parse_torrent')]
+    rules = [Rule(SgmlLinkExtractor(allow=['/tor/\d+']), 'parse_torrent')]

     def parse_torrent(self, response):
         x = HtmlXPathSelector(response)
@@ -93,7 +93,7 @@ Let's now take a look at an example CrawlSpider with Rules::

     from scrapy import log
     from scrapy.contrib.spiders import CrawlSpider, Rule
-    from scrapy.link.extractors import RegexLinkExtractor
+    from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
     from scrapy.xpath.selector import HtmlXPathSelector
     from scrapy.item import ScrapedItem

@@ -104,10 +104,10 @@ Let's now take a look at an example CrawlSpider with Rules::
         rules = (
             # Extract links matching 'category.php' (but not matching 'subsection.php')
             # and follow links from them (since no callback means follow=True by default).
-            Rule(RegexLinkExtractor(allow=('category\.php', ), deny=('subsection\,php', ))),
+            Rule(SgmlLinkExtractor(allow=('category\.php', ), deny=('subsection\,php', ))),

             # Extract links matching 'item.php' and parse them with the spider's method parse_item
-            Rule(RegexLinkExtractor(allow=('item\.php', )), callback='parse_item'),
+            Rule(SgmlLinkExtractor(allow=('item\.php', )), callback='parse_item'),
         )

         def parse_item(self, response):
@@ -4,67 +4,25 @@
 Available Link Extractors
 =========================

-.. module:: scrapy.link
+.. module:: scrapy.contrib.linkextractors
    :synopsis: Link extractors classes

-LinkExtractor
-=============
+All available link extractors classes bundled with Scrapy are provided in the
+:mod:`scrapy.contrib.linkextractors` module.

-.. class:: LinkExtractor(tag="a", href="href", unique=False, process_value=None)
+.. module:: scrapy.contrib.linkextractors.sgml
+   :synopsis: SGMLParser-based link extractors

-This is the most basic Link Extractor which extracts links from a response with
-by looking at the given attributes inside the given tags.
+SgmlLinkExtractor
+=================

-The constructor arguments are:
+.. class:: SgmlLinkExtractor(allow=(), deny=(), allow_domains=(), deny_domains=(), restrict_xpaths(), tags=('a', 'area'), attrs=('href'), canonicalize=True, unique=True, process_value=None)

-    :param tag: either a string (with the name of a tag) or a function that
-       receives a tag name and returns ``True`` if links should be extracted
-       from those tag, or ``False`` if they shouldn't. Defaults to ``'a'``.
-       request (once its downloaded) as its first parameter. For more
-       information see :ref:`ref-request-callback-arguments` below.
-    :type tag: str or callable
+The SgmlLinkExtractor extends the base :class:`BaseSgmlLinkExtractor` by
+providing additional filters that you can specify to extract links,
+including regular expressions patterns that the links must match to be
+extracted. All those filters are configured through these constructor
+parameters:

-    :param attr: either string (with the name of a tag attribute), or a
-       function that receives a an attribute name and returns ``True`` if
-       links should be extracted from it, or ``False`` if the shouldn't.
-       Defaults to ``href``.
-    :type attr: str or callable
-
-    :param unique: is a boolean that specifies if a duplicate filtering should
-       be applied to links extracted.
-    :type unique: boolean
-
-    :param process_value: a function which receives each value extracted from
-       the tag and attributes scanned and can modify the value and return a
-       new one, or return ``None`` to ignore the link altogether. If not
-       given, ``process_value`` defaults to ``lambda x: x``.
-
-       .. highlight:: html
-
-       For example, to extract links from this code::
-
-           <a href="javascript:goToPage('../other/page.html'); return false">Link text</a>
-
-       .. highlight:: python
-
-       You can use the following function in ``process_value``::
-
-           def process_value(value):
-               m = re.search("javascript:goToPage\('(.*?)'", value)
-               if m:
-                   return m.group(1)
-
-    :type process_value: callable
-
-RegexLinkExtractor
-==================
-
-.. class:: RegexLinkExtractor(allow=(), deny=(), allow_domains=(), deny_domains=(), restrict_xpaths(), tags=('a', 'area'), attrs=('href'), canonicalize=True, unique=True, process_value=None)
-
-The RegexLinkExtractor extends the base :class:`LinkExtractor` by providing
-additional filters that you can specify to extract links, including regular
-expressions patterns that the links must match to be extracted. All those
-filters are configured through these constructor paramters:

     :param allow: a single regular expression (or list of regular expressions)
        that the (absolute) urls must match in order to be extracted. If not
@@ -112,3 +70,52 @@ RegexLinkExtractor
        :class:`LinkExtractor` class constructor
     :type process_value: boolean

+BaseSgmlLinkExtractor
+=====================
+
+.. class:: BaseSgmlLinkExtractor(tag="a", href="href", unique=False, process_value=None)
+
+The purpose of this Link Extractor is only to serve as a base class for the
+:class:`SgmlLinkExtractor`. You should use that one instead.
+
+The constructor arguments are:
+
+    :param tag: either a string (with the name of a tag) or a function that
+       receives a tag name and returns ``True`` if links should be extracted
+       from those tag, or ``False`` if they shouldn't. Defaults to ``'a'``.
+       request (once its downloaded) as its first parameter. For more
+       information see :ref:`ref-request-callback-arguments` below.
+    :type tag: str or callable
+
+    :param attr: either string (with the name of a tag attribute), or a
+       function that receives a an attribute name and returns ``True`` if
+       links should be extracted from it, or ``False`` if the shouldn't.
+       Defaults to ``href``.
+    :type attr: str or callable
+
+    :param unique: is a boolean that specifies if a duplicate filtering should
+       be applied to links extracted.
+    :type unique: boolean
+
+    :param process_value: a function which receives each value extracted from
+       the tag and attributes scanned and can modify the value and return a
+       new one, or return ``None`` to ignore the link altogether. If not
+       given, ``process_value`` defaults to ``lambda x: x``.
+
+       .. highlight:: html
+
+       For example, to extract links from this code::
+
+           <a href="javascript:goToPage('../other/page.html'); return false">Link text</a>
+
+       .. highlight:: python
+
+       You can use the following function in ``process_value``::
+
+           def process_value(value):
+               m = re.search("javascript:goToPage\('(.*?)'", value)
+               if m:
+                   return m.group(1)
+
+    :type process_value: callable
@@ -152,7 +152,7 @@ CrawlSpider example
 Let's now take a look at an example CrawlSpider with rules::

     from scrapy.contrib.spiders import CrawlSpider, Rule
-    from scrapy.link.extractors import RegexLinkExtractor
+    from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
     from scrapy.xpath.selector import HtmlXPathSelector
     from scrapy.item import ScrapedItem

@@ -163,10 +163,10 @@ Let's now take a look at an example CrawlSpider with rules::
         rules = (
             # Extract links matching 'category.php' (but not matching 'subsection.php')
             # and follow links from them (since no callback means follow=True by default).
-            Rule(RegexLinkExtractor(allow=('category\.php', ), deny=('subsection\.php', ))),
+            Rule(SgmlLinkExtractor(allow=('category\.php', ), deny=('subsection\.php', ))),

             # Extract links matching 'item.php' and parse them with the spider's method parse_item
-            Rule(RegexLinkExtractor(allow=('item\.php', )), callback='parse_item'),
+            Rule(SgmlLinkExtractor(allow=('item\.php', )), callback='parse_item'),
         )

         def parse_item(self, response):
@@ -63,7 +63,7 @@ those links. For example, the following one::

 So, based on that regular expression we can create the first crawling rule::

-    Rule(RegexLinkExtractor(allow='google.com/[A-Z][a-zA-Z_/]+$', ),
+    Rule(SgmlLinkExtractor(allow='google.com/[A-Z][a-zA-Z_/]+$', ),
         'parse_category',
         follow=True,
         ),
@@ -75,7 +75,7 @@ process and extract data from those pages.

 This is how the spider would look so far::

-    from scrapy.link.extractors import RegexLinkExtractor
+    from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
     from scrapy.contrib.spiders import CrawlSpider, Rule

     class GoogleDirectorySpider(CrawlSpider):
@@ -83,7 +83,7 @@ This is how the spider would look so far::
         start_urls = ['http://www.google.com/dirhp']

         rules = (
-            Rule(RegexLinkExtractor(allow='google.com/[A-Z][a-zA-Z_/]+$'),
+            Rule(SgmlLinkExtractor(allow='google.com/[A-Z][a-zA-Z_/]+$'),
                 'parse_category', follow=True,
             ),
         )
scrapy/contrib/linkextractors/__init__.py (new file, 7 lines)
@@ -0,0 +1,7 @@
+"""
+scrapy.contrib.linkextractors
+
+This package contains a collection of Link Extractors.
+
+For more info see docs/ref/link-extractors.rst
+"""
@@ -1,7 +1,8 @@
 """
-This module provides additional LinkExtractors, apart from the ones in scrapy.link
-and scrapy.link.extractors.
+This module implements the HtmlImageLinkExtractor for extracting
+image links only.
 """
+
 import urlparse

 from scrapy.link import Link
scrapy/contrib/linkextractors/regex.py (new file, 28 lines)
@@ -0,0 +1,28 @@
+import re
+
+from scrapy.utils.url import urljoin_rfc as urljoin
+from scrapy.utils.markup import remove_tags, remove_entities, replace_escape_chars
+
+from scrapy.link import Link
+from .sgml import SgmlLinkExtractor
+
+linkre = re.compile(
+    "<a\s.*?href=(\"[.#]+?\"|\'[.#]+?\'|[^\s]+?)(>|\s.*?>)(.*?)<[/ ]?a>",
+    re.DOTALL | re.IGNORECASE)
+
+def clean_link(link_text):
+    """Remove leading and trailing whitespace and punctuation"""
+    return link_text.strip("\t\r\n '\"")
+
+class RegexLinkExtractor(SgmlLinkExtractor):
+    """High performant link extractor"""
+    def _extract_links(self, response_text, response_url, response_encoding):
+        base_url = self.base_url if self.base_url else response_url
+
+        clean_url = lambda u: urljoin(base_url, remove_entities(clean_link(u.decode(response_encoding))))
+        clean_text = lambda t: replace_escape_chars(remove_tags(t.decode(response_encoding))).strip()
+
+        links_text = linkre.findall(response_text)
+        urlstext = set([(clean_url(url), clean_text(text)) for url, _, text in links_text])
+
+        return [Link(url, text) for url, text in urlstext]
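The extractor above skips SGML parsing entirely and scans the raw response body with one compiled regex, which is what its "High performant" docstring refers to; only _extract_links is overridden, so the constructor filters and extract_links come from SgmlLinkExtractor. A hedged usage sketch (the URL and body are made up for illustration)::

    from scrapy.http import HtmlResponse
    from scrapy.contrib.linkextractors.regex import RegexLinkExtractor

    response = HtmlResponse(url='http://example.com/index',
                            body='<a href="sample1.html">sample 1</a>')
    lx = RegexLinkExtractor()           # accepts the same filters as SgmlLinkExtractor
    links = lx.extract_links(response)  # -> [Link(url='http://example.com/sample1.html', ...)]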
scrapy/contrib/linkextractors/sgml.py (new file, 126 lines)
@@ -0,0 +1,126 @@
+"""
+SGMLParser-based Link extractors
+"""
+
+import re
+
+from scrapy.xpath import HtmlXPathSelector
+from scrapy.link import Link
+from scrapy.utils.misc import arg_to_iter
+from scrapy.utils.python import FixedSGMLParser, unique as unique_list, str_to_unicode
+from scrapy.utils.url import safe_url_string, urljoin_rfc as urljoin, canonicalize_url, url_is_from_any_domain
+
+class BaseSgmlLinkExtractor(FixedSGMLParser):
+
+    def __init__(self, tag="a", attr="href", unique=False, process_value=None):
+        FixedSGMLParser.__init__(self)
+        self.scan_tag = tag if callable(tag) else lambda t: t == tag
+        self.scan_attr = attr if callable(attr) else lambda a: a == attr
+        self.process_value = (lambda v: v) if process_value is None else process_value
+        self.current_link = None
+        self.unique = unique
+
+    def _extract_links(self, response_text, response_url, response_encoding):
+        self.reset()
+        self.feed(response_text)
+        self.close()
+
+        links = unique_list(self.links, key=lambda link: link.url) if self.unique else self.links
+
+        ret = []
+        base_url = self.base_url if self.base_url else response_url
+        for link in links:
+            link.url = urljoin(base_url, link.url)
+            link.url = safe_url_string(link.url, response_encoding)
+            link.text = str_to_unicode(link.text, response_encoding)
+            ret.append(link)
+
+        return ret
+
+    def extract_links(self, response):
+        # wrapper needed to allow to work directly with text
+        return self._extract_links(response.body, response.url, response.encoding)
+
+    def reset(self):
+        FixedSGMLParser.reset(self)
+        self.links = []
+        self.base_url = None
+
+    def unknown_starttag(self, tag, attrs):
+        if tag == 'base':
+            self.base_url = dict(attrs).get('href')
+        if self.scan_tag(tag):
+            for attr, value in attrs:
+                if self.scan_attr(attr):
+                    url = self.process_value(value)
+                    if url is not None:
+                        link = Link(url=url)
+                        self.links.append(link)
+                        self.current_link = link
+
+    def unknown_endtag(self, tag):
+        self.current_link = None
+
+    def handle_data(self, data):
+        if self.current_link and not self.current_link.text:
+            self.current_link.text = data.strip()
+
+    def matches(self, url):
+        """This extractor matches with any url, since
+        it doesn't contain any patterns"""
+        return True
+
+_re_type = type(re.compile("", 0))
+
+_matches = lambda url, regexs: any((r.search(url) for r in regexs))
+_is_valid_url = lambda url: url.split('://', 1)[0] in set(['http', 'https', 'file'])
+
+class SgmlLinkExtractor(BaseSgmlLinkExtractor):
+
+    def __init__(self, allow=(), deny=(), allow_domains=(), deny_domains=(), restrict_xpaths=(),
+                 tags=('a', 'area'), attrs=('href'), canonicalize=True, unique=True, process_value=None):
+        self.allow_res = [x if isinstance(x, _re_type) else re.compile(x) for x in arg_to_iter(allow)]
+        self.deny_res = [x if isinstance(x, _re_type) else re.compile(x) for x in arg_to_iter(deny)]
+        self.allow_domains = set(arg_to_iter(allow_domains))
+        self.deny_domains = set(arg_to_iter(deny_domains))
+        self.restrict_xpaths = tuple(arg_to_iter(restrict_xpaths))
+        self.canonicalize = canonicalize
+        tag_func = lambda x: x in tags
+        attr_func = lambda x: x in attrs
+        BaseSgmlLinkExtractor.__init__(self, tag=tag_func, attr=attr_func,
+            unique=unique, process_value=process_value)
+
+    def extract_links(self, response):
+        if self.restrict_xpaths:
+            hxs = HtmlXPathSelector(response)
+            html_slice = ''.join(''.join(html_fragm for html_fragm in hxs.x(xpath_expr).extract()) for xpath_expr in self.restrict_xpaths)
+            links = self._extract_links(html_slice, response.url, response.encoding)
+        else:
+            links = BaseSgmlLinkExtractor.extract_links(self, response)
+
+        links = [link for link in links if _is_valid_url(link.url)]
+
+        if self.allow_res:
+            links = [link for link in links if _matches(link.url, self.allow_res)]
+        if self.deny_res:
+            links = [link for link in links if not _matches(link.url, self.deny_res)]
+        if self.allow_domains:
+            links = [link for link in links if url_is_from_any_domain(link.url, self.allow_domains)]
+        if self.deny_domains:
+            links = [link for link in links if not url_is_from_any_domain(link.url, self.deny_domains)]
+
+        if self.canonicalize:
+            for link in links:
+                link.url = canonicalize_url(link.url)
+
+        return links
+
+    def matches(self, url):
+        if self.allow_domains and not url_is_from_any_domain(url, self.allow_domains):
+            return False
+        if self.deny_domains and url_is_from_any_domain(url, self.deny_domains):
+            return False
+
+        allowed = [regex.search(url) for regex in self.allow_res] if self.allow_res else [True]
+        denied = [regex.search(url) for regex in self.deny_res] if self.deny_res else []
+        return any(allowed) and not any(denied)
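In extract_links above, the filters apply in a fixed order: only http/https/file URLs survive, then allow/deny regexes, then allow/deny domain sets, and finally optional canonicalization of each URL; matches() is a cheaper check that answers whether a given URL could pass the regex and domain filters at all. A short sketch of how the filters combine (URLs are illustrative)::

    from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor

    lx = SgmlLinkExtractor(
        allow=(r'item\.php', ),            # URL must match some allow pattern...
        deny=(r'\?print=1', ),             # ...and no deny pattern
        allow_domains=('example.com', ),   # and belong to an allowed domain
        restrict_xpaths=('//div[@id="content"]', ),  # only parse this page region
    )
    assert lx.matches('http://example.com/item.php?id=1')
    assert not lx.matches('http://example.com/item.php?print=1')
    assert not lx.matches('http://other.org/item.php')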
@@ -1,5 +1,5 @@
 from scrapy.contrib.spiders import CrawlSpider, Rule
-from scrapy.link.extractors import RegexLinkExtractor
+from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor

 class GenericSpider(CrawlSpider):
     """
@@ -10,7 +10,7 @@ class GenericSpider(CrawlSpider):
     def __init__(self, domain_name):
         self.domain_name = domain_name
         self.rules = (
-            Rule(RegexLinkExtractor(allow_domains=(domain_name,)), self.parse_note, follow=True),
+            Rule(SgmlLinkExtractor(allow_domains=(domain_name,)), self.parse_note, follow=True),
         )
         super(GenericSpider, self).__init__()

@@ -10,7 +10,7 @@ from scrapy.utils.url import is_url
 from scrapy.utils.response import get_base_url
 from scrapy.utils.python import flatten, unicode_to_str
 from scrapy.xpath.selector import XPathSelector, XPathSelectorList
-from scrapy.contrib.link_extractors import HTMLImageLinkExtractor
+from scrapy.contrib.linkextractors.image import HTMLImageLinkExtractor

 def extract(location, adaptor_args=None):
     """
@@ -1,76 +1,12 @@
 """
-LinkExtractor provides en efficient way to extract links from pages
+This module defines the Link object used in Link extractors.

-See documentation in docs/ref/link-extractors.rst
+For actual link extractors implementation see scrapy.contrib.linkextractor, or
+its documentation in: docs/ref/link-extractors.rst
 """

-from scrapy.utils.python import FixedSGMLParser, unique as unique_list, str_to_unicode
-from scrapy.utils.url import safe_url_string, urljoin_rfc as urljoin
-
-class LinkExtractor(FixedSGMLParser):
-
-    def __init__(self, tag="a", attr="href", unique=False, process_value=None):
-        FixedSGMLParser.__init__(self)
-        self.scan_tag = tag if callable(tag) else lambda t: t == tag
-        self.scan_attr = attr if callable(attr) else lambda a: a == attr
-        self.process_value = (lambda v: v) if process_value is None else process_value
-        self.current_link = None
-        self.unique = unique
-
-    def _extract_links(self, response_text, response_url, response_encoding):
-        self.reset()
-        self.feed(response_text)
-        self.close()
-
-        links = unique_list(self.links, key=lambda link: link.url) if self.unique else self.links
-
-        ret = []
-        base_url = self.base_url if self.base_url else response_url
-        for link in links:
-            link.url = urljoin(base_url, link.url)
-            link.url = safe_url_string(link.url, response_encoding)
-            link.text = str_to_unicode(link.text, response_encoding)
-            ret.append(link)
-
-        return ret
-
-    def extract_links(self, response):
-        # wrapper needed to allow to work directly with text
-        return self._extract_links(response.body, response.url, response.encoding)
-
-    def reset(self):
-        FixedSGMLParser.reset(self)
-        self.links = []
-        self.base_url = None
-
-    def unknown_starttag(self, tag, attrs):
-        if tag == 'base':
-            self.base_url = dict(attrs).get('href')
-        if self.scan_tag(tag):
-            for attr, value in attrs:
-                if self.scan_attr(attr):
-                    url = self.process_value(value)
-                    if url is not None:
-                        link = Link(url=url)
-                        self.links.append(link)
-                        self.current_link = link
-
-    def unknown_endtag(self, tag):
-        self.current_link = None
-
-    def handle_data(self, data):
-        if self.current_link and not self.current_link.text:
-            self.current_link.text = data.strip()
-
-    def matches(self, url):
-        """This extractor matches with any url, since
-        it doesn't contain any patterns"""
-        return True
-

 class Link(object):
-    """
-    Link objects represent an extracted link by the LinkExtractor.
+    """Link objects represent an extracted link by the LinkExtractor.
     At the moment, it contains just the url and link text.
     """

@@ -85,3 +21,26 @@ class Link(object):

     def __repr__(self):
         return '<Link url=%r text=%r >' % (self.url, self.text)
+
+
+# FIXME: code below is for backwards compatibility and should be removed before
+# the 0.7 release
+
+import warnings
+
+from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor, BaseSgmlLinkExtractor
+
+class LinkExtractor(BaseSgmlLinkExtractor):
+
+    def __init__(self, *args, **kwargs):
+        warnings.warn("scrapy.link.LinkExtractor is deprecated, use scrapy.contrib.linkextractors.sgml.BaseSgmlLinkExtractor instead",
+            DeprecationWarning, stacklevel=2)
+        BaseSgmlLinkExtractor.__init__(self, *args, **kwargs)
+
+class RegexLinkExtractor(SgmlLinkExtractor):
+
+    def __init__(self, *args, **kwargs):
+        warnings.warn("scrapy.link.RegexLinkExtractor is deprecated, use scrapy.contrib.linkextractors.sgml.SgmlLinkExtractor instead",
+            DeprecationWarning, stacklevel=2)
+        SgmlLinkExtractor.__init__(self, *args, **kwargs)
@@ -1,68 +1,14 @@
-"""
-This module provides some LinkExtractors, which extend the base LinkExtractor
-(scrapy.link.LinkExtractor) with some additional useful features.
-
-See documentation in docs/ref/link-extractors.rst
-"""
-
-import re
-
-from scrapy.link import LinkExtractor
-from scrapy.utils.url import canonicalize_url, url_is_from_any_domain
-from scrapy.xpath import HtmlXPathSelector
-from scrapy.utils.misc import arg_to_iter
-
-_re_type = type(re.compile("", 0))
-
-_matches = lambda url, regexs: any((r.search(url) for r in regexs))
-_is_valid_url = lambda url: url.split('://', 1)[0] in set(['http', 'https', 'file'])
-
-class RegexLinkExtractor(LinkExtractor):
-
-    def __init__(self, allow=(), deny=(), allow_domains=(), deny_domains=(), restrict_xpaths=(),
-                 tags=('a', 'area'), attrs=('href'), canonicalize=True, unique=True, process_value=None):
-        self.allow_res = [x if isinstance(x, _re_type) else re.compile(x) for x in arg_to_iter(allow)]
-        self.deny_res = [x if isinstance(x, _re_type) else re.compile(x) for x in arg_to_iter(deny)]
-        self.allow_domains = set(arg_to_iter(allow_domains))
-        self.deny_domains = set(arg_to_iter(deny_domains))
-        self.restrict_xpaths = tuple(arg_to_iter(restrict_xpaths))
-        self.canonicalize = canonicalize
-        tag_func = lambda x: x in tags
-        attr_func = lambda x: x in attrs
-        LinkExtractor.__init__(self, tag=tag_func, attr=attr_func,
-            unique=unique, process_value=process_value)
-
-    def extract_links(self, response):
-        if self.restrict_xpaths:
-            hxs = HtmlXPathSelector(response)
-            html_slice = ''.join(''.join(html_fragm for html_fragm in hxs.x(xpath_expr).extract()) for xpath_expr in self.restrict_xpaths)
-            links = self._extract_links(html_slice, response.url, response.encoding)
-        else:
-            links = LinkExtractor.extract_links(self, response)
-
-        links = [link for link in links if _is_valid_url(link.url)]
-
-        if self.allow_res:
-            links = [link for link in links if _matches(link.url, self.allow_res)]
-        if self.deny_res:
-            links = [link for link in links if not _matches(link.url, self.deny_res)]
-        if self.allow_domains:
-            links = [link for link in links if url_is_from_any_domain(link.url, self.allow_domains)]
-        if self.deny_domains:
-            links = [link for link in links if not url_is_from_any_domain(link.url, self.deny_domains)]
-
-        if self.canonicalize:
-            for link in links:
-                link.url = canonicalize_url(link.url)
-
-        return links
-
-    def matches(self, url):
-        if self.allow_domains and not url_is_from_any_domain(url, self.allow_domains):
-            return False
-        if self.deny_domains and url_is_from_any_domain(url, self.deny_domains):
-            return False
-
-        allowed = [regex.search(url) for regex in self.allow_res] if self.allow_res else [True]
-        denied = [regex.search(url) for regex in self.deny_res] if self.deny_res else []
-        return any(allowed) and not any(denied)
+# FIXME: code below is for backwards compatibility and should be removed before
+# the 0.7 release
+
+import warnings
+
+from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
+
+class RegexLinkExtractor(SgmlLinkExtractor):
+
+    def __init__(self, *args, **kwargs):
+        warnings.warn("scrapy.link.extractors.RegexLinkExtractor is deprecated, use scrapy.contrib.linkextractors.sgml.SgmlLinkExtractor instead",
+            DeprecationWarning, stacklevel=2)
+        SgmlLinkExtractor.__init__(self, *args, **kwargs)
@@ -2,9 +2,9 @@ import re
 import unittest

 from scrapy.http import HtmlResponse
-from scrapy.link import LinkExtractor, Link
-from scrapy.link.extractors import RegexLinkExtractor
-from scrapy.contrib.link_extractors import HTMLImageLinkExtractor
+from scrapy.link import Link
+from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor, BaseSgmlLinkExtractor
+from scrapy.contrib.linkextractors.image import HTMLImageLinkExtractor
 from scrapy.tests import get_testdata

 class LinkExtractorTestCase(unittest.TestCase):
@@ -18,7 +18,7 @@ class LinkExtractorTestCase(unittest.TestCase):
         </body></html>"""
         response = HtmlResponse("http://example.org/somepage/index.html", body=html)

-        lx = LinkExtractor()  # default: tag=a, attr=href
+        lx = BaseSgmlLinkExtractor()  # default: tag=a, attr=href
         self.assertEqual(lx.extract_links(response),
                          [Link(url='http://example.org/somepage/item/12.html', text='Item 12'),
                           Link(url='http://example.org/about.html', text='About us'),
@@ -31,7 +31,7 @@ class LinkExtractorTestCase(unittest.TestCase):
         </body></html>"""
         response = HtmlResponse("http://example.org/somepage/index.html", body=html)

-        lx = LinkExtractor()  # default: tag=a, attr=href
+        lx = BaseSgmlLinkExtractor()  # default: tag=a, attr=href
         self.assertEqual(lx.extract_links(response),
                          [Link(url='http://otherdomain.com/base/item/12.html', text='Item 12')])

@@ -42,7 +42,7 @@ class LinkExtractorTestCase(unittest.TestCase):
         body = get_testdata('link_extractor', 'linkextractor_latin1.html')
         response_latin1 = HtmlResponse(url='http://example.com/latin1', body=body)

-        lx = LinkExtractor()
+        lx = BaseSgmlLinkExtractor()
         self.assertEqual(lx.extract_links(response_utf8),
                          [ Link(url='http://example.com/sample_%C3%B1.html', text=''),
                            Link(url='http://example.com/sample_%E2%82%AC.html', text='sample \xe2\x82\xac text'.decode('utf-8')) ])
@@ -59,75 +59,75 @@ class LinkExtractorTestCase(unittest.TestCase):
         url1 = 'http://lotsofstuff.com/stuff1/index'
         url2 = 'http://evenmorestuff.com/uglystuff/index'

-        lx = LinkExtractor()
+        lx = BaseSgmlLinkExtractor()
         self.assertEqual(lx.matches(url1), True)
         self.assertEqual(lx.matches(url2), True)

-class RegexLinkExtractorTestCase(unittest.TestCase):
+class SgmlLinkExtractorTestCase(unittest.TestCase):
     def setUp(self):
-        body = get_testdata('link_extractor', 'regex_linkextractor.html')
+        body = get_testdata('link_extractor', 'sgml_linkextractor.html')
         self.response = HtmlResponse(url='http://example.com/index', body=body)

     def test_urls_type(self):
         '''Test that the resulting urls are regular strings and not a unicode objects'''
-        lx = RegexLinkExtractor()
+        lx = SgmlLinkExtractor()
         self.assertTrue(all(isinstance(link.url, str) for link in lx.extract_links(self.response)))

     def test_extraction(self):
         '''Test the extractor's behaviour among different situations'''

-        lx = RegexLinkExtractor()
+        lx = SgmlLinkExtractor()
         self.assertEqual([link for link in lx.extract_links(self.response)],
                          [ Link(url='http://example.com/sample1.html', text=u''),
                            Link(url='http://example.com/sample2.html', text=u'sample 2'),
                            Link(url='http://example.com/sample3.html', text=u'sample 3 text'),
                            Link(url='http://www.google.com/something', text=u'') ])

-        lx = RegexLinkExtractor(allow=('sample', ))
+        lx = SgmlLinkExtractor(allow=('sample', ))
         self.assertEqual([link for link in lx.extract_links(self.response)],
                          [ Link(url='http://example.com/sample1.html', text=u''),
                            Link(url='http://example.com/sample2.html', text=u'sample 2'),
                            Link(url='http://example.com/sample3.html', text=u'sample 3 text') ])

-        lx = RegexLinkExtractor(allow=('sample', ), unique=False)
+        lx = SgmlLinkExtractor(allow=('sample', ), unique=False)
         self.assertEqual([link for link in lx.extract_links(self.response)],
                          [ Link(url='http://example.com/sample1.html', text=u''),
                            Link(url='http://example.com/sample2.html', text=u'sample 2'),
                            Link(url='http://example.com/sample3.html', text=u'sample 3 text'),
                            Link(url='http://example.com/sample3.html', text=u'sample 3 repetition') ])

-        lx = RegexLinkExtractor(allow=('sample', ), deny=('3', ))
+        lx = SgmlLinkExtractor(allow=('sample', ), deny=('3', ))
         self.assertEqual([link for link in lx.extract_links(self.response)],
                          [ Link(url='http://example.com/sample1.html', text=u''),
                            Link(url='http://example.com/sample2.html', text=u'sample 2') ])

-        lx = RegexLinkExtractor(allow_domains=('google.com', ))
+        lx = SgmlLinkExtractor(allow_domains=('google.com', ))
         self.assertEqual([link for link in lx.extract_links(self.response)],
                          [ Link(url='http://www.google.com/something', text=u'') ])

-        lx = RegexLinkExtractor(tags=('img', ), attrs=('src', ))
+        lx = SgmlLinkExtractor(tags=('img', ), attrs=('src', ))
         self.assertEqual([link for link in lx.extract_links(self.response)],
                          [ Link(url='http://example.com/sample2.jpg', text=u'') ])

     def test_extraction_using_single_values(self):
         '''Test the extractor's behaviour among different situations'''

-        lx = RegexLinkExtractor(allow='sample')
+        lx = SgmlLinkExtractor(allow='sample')
         self.assertEqual([link for link in lx.extract_links(self.response)],
                          [ Link(url='http://example.com/sample1.html', text=u''),
                            Link(url='http://example.com/sample2.html', text=u'sample 2'),
                            Link(url='http://example.com/sample3.html', text=u'sample 3 text') ])

-        lx = RegexLinkExtractor(allow='sample', deny='3')
+        lx = SgmlLinkExtractor(allow='sample', deny='3')
         self.assertEqual([link for link in lx.extract_links(self.response)],
                          [ Link(url='http://example.com/sample1.html', text=u''),
                            Link(url='http://example.com/sample2.html', text=u'sample 2') ])

-        lx = RegexLinkExtractor(allow_domains='google.com')
+        lx = SgmlLinkExtractor(allow_domains='google.com')
         self.assertEqual([link for link in lx.extract_links(self.response)],
                          [ Link(url='http://www.google.com/something', text=u'') ])

-        lx = RegexLinkExtractor(deny_domains='example.com')
+        lx = SgmlLinkExtractor(deny_domains='example.com')
         self.assertEqual([link for link in lx.extract_links(self.response)],
                          [ Link(url='http://www.google.com/something', text=u'') ])

@@ -135,23 +135,23 @@ class RegexLinkExtractorTestCase(unittest.TestCase):
         url1 = 'http://lotsofstuff.com/stuff1/index'
         url2 = 'http://evenmorestuff.com/uglystuff/index'

-        lx = RegexLinkExtractor(allow=(r'stuff1', ))
+        lx = SgmlLinkExtractor(allow=(r'stuff1', ))
         self.assertEqual(lx.matches(url1), True)
         self.assertEqual(lx.matches(url2), False)

-        lx = RegexLinkExtractor(deny=(r'uglystuff', ))
+        lx = SgmlLinkExtractor(deny=(r'uglystuff', ))
         self.assertEqual(lx.matches(url1), True)
         self.assertEqual(lx.matches(url2), False)

-        lx = RegexLinkExtractor(allow_domains=('evenmorestuff.com', ))
+        lx = SgmlLinkExtractor(allow_domains=('evenmorestuff.com', ))
         self.assertEqual(lx.matches(url1), False)
         self.assertEqual(lx.matches(url2), True)

-        lx = RegexLinkExtractor(deny_domains=('lotsofstuff.com', ))
+        lx = SgmlLinkExtractor(deny_domains=('lotsofstuff.com', ))
         self.assertEqual(lx.matches(url1), False)
         self.assertEqual(lx.matches(url2), True)

-        lx = RegexLinkExtractor(allow=('blah1', ), deny=('blah2', ),
+        lx = SgmlLinkExtractor(allow=('blah1', ), deny=('blah2', ),
                                 allow_domains=('blah1.com', ), deny_domains=('blah2.com', ))
         self.assertEqual(lx.matches('http://blah1.com/blah1'), True)
         self.assertEqual(lx.matches('http://blah1.com/blah2'), False)
@@ -159,7 +159,7 @@ class RegexLinkExtractorTestCase(unittest.TestCase):
         self.assertEqual(lx.matches('http://blah2.com/blah2'), False)

     def test_restrict_xpaths(self):
-        lx = RegexLinkExtractor(restrict_xpaths=('//div[@id="subwrapper"]', ))
+        lx = SgmlLinkExtractor(restrict_xpaths=('//div[@id="subwrapper"]', ))
         self.assertEqual([link for link in lx.extract_links(self.response)],
                          [ Link(url='http://example.com/sample1.html', text=u''),
                            Link(url='http://example.com/sample2.html', text=u'sample 2') ])
@@ -177,7 +177,7 @@ class RegexLinkExtractorTestCase(unittest.TestCase):
         </body></html>"""
         response = HtmlResponse("http://example.org/somepage/index.html", body=html, encoding='windows-1252')

-        lx = RegexLinkExtractor(restrict_xpaths="//div[@class='links']")
+        lx = SgmlLinkExtractor(restrict_xpaths="//div[@class='links']")
         self.assertEqual(lx.extract_links(response),
                          [Link(url='http://example.org/about.html', text=u'About us\xa3')])

@@ -194,7 +194,7 @@ class RegexLinkExtractorTestCase(unittest.TestCase):
             if m:
                 return m.group(1)

-        lx = RegexLinkExtractor(process_value=process_value)
+        lx = SgmlLinkExtractor(process_value=process_value)
         self.assertEqual(lx.extract_links(response),
                          [Link(url='http://example.org/other/page.html', text='Link text')])
@@ -7,7 +7,7 @@ import re

 from scrapy.spider import BaseSpider
 from scrapy.item import ScrapedItem
-from scrapy.link import LinkExtractor
+from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
 from scrapy.http import Request

 class TestSpider(BaseSpider):
@@ -20,7 +20,7 @@ class TestSpider(BaseSpider):
     price_re = re.compile(">Price: \$(.*?)<", re.M)

     def parse(self, response):
-        xlink = LinkExtractor()
+        xlink = SgmlLinkExtractor()
         itemre = re.compile(self.itemurl_re)
         for link in xlink.extract_links(response):
             if itemre.search(link.url):