deprecate SgmlLinkExtractor

parent 2b749668ba
commit 00cd4f0fa5
@@ -4,19 +4,19 @@
 Link Extractors
 ===============
 
-LinkExtractors are objects whose only purpose is to extract links from web
+Link extractors are objects whose only purpose is to extract links from web
 pages (:class:`scrapy.http.Response` objects) which will be eventually
 followed.
 
-There are two Link Extractors available in Scrapy by default, but you create
-your own custom Link Extractors to suit your needs by implementing a simple
-interface.
+There is ``scrapy.contrib.linkextractors import LinkExtractor`` available
+in Scrapy, but you can create your own custom Link Extractors to suit your
+needs by implementing a simple interface.
 
-The only public method that every LinkExtractor has is ``extract_links``,
+The only public method that every link extractor has is ``extract_links``,
 which receives a :class:`~scrapy.http.Response` object and returns a list
-of :class:`scrapy.link.Link` objects. Link Extractors are meant to be instantiated once and their
-``extract_links`` method called several times with different responses, to
-extract links to follow.
+of :class:`scrapy.link.Link` objects. Link extractors are meant to be
+instantiated once and their ``extract_links`` method called several times
+with different responses to extract links to follow.
 
 Link extractors are used in the :class:`~scrapy.contrib.spiders.CrawlSpider`
 class (available in Scrapy), through a set of rules, but you can also use it in
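A minimal sketch of the ``extract_links`` contract the reworded paragraphs above describe (one extractor instance, reused across responses), using the ``LinkExtractor`` entry point this commit promotes; the URL and HTML body are invented for illustration::

    from scrapy.http import HtmlResponse
    from scrapy.contrib.linkextractors import LinkExtractor

    # Stand-in for a response the spider actually downloaded.
    response = HtmlResponse(
        url='http://www.example.com/index.html',
        body='<html><body><a href="/page1.html">Page 1</a></body></html>',
        encoding='utf-8',
    )

    lx = LinkExtractor()                     # instantiated once...
    for link in lx.extract_links(response):  # ...called for each response to follow
        print(link.url)                      # scrapy.link.Link objects expose .url and .text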
@@ -33,14 +33,16 @@ Built-in link extractors reference
 .. module:: scrapy.contrib.linkextractors
    :synopsis: Link extractors classes
 
-All available link extractors classes bundled with Scrapy are provided in the
+Link extractors classes bundled with Scrapy are provided in the
 :mod:`scrapy.contrib.linkextractors` module.
 
-If you don't know what link extractor to choose, just use the default which is
-the same as LxmlLinkExtractor (see below)::
+The default link extractor is ``LinkExtractor``, which is the same as
+:class:`~.LxmlLinkExtractor`::
 
     from scrapy.contrib.linkextractors import LinkExtractor
 
+There used to be other link extractor classes in previous Scrapy versions,
+but they are deprecated now.
+
 LxmlLinkExtractor
 -----------------
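Because the paragraph above marks the older extractor classes as deprecated, a hedged migration sketch may help; it assumes, as the surrounding documentation implies, that ``LinkExtractor`` accepts the same filtering arguments, and the regular expressions are invented::

    # Deprecated by this commit; instantiating it now emits a ScrapyDeprecationWarning.
    from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
    old_lx = SgmlLinkExtractor(allow=(r'/category/\d+', ), deny=(r'/login', ))

    # Replacement: the default LinkExtractor (the same as LxmlLinkExtractor).
    from scrapy.contrib.linkextractors import LinkExtractor
    new_lx = LinkExtractor(allow=(r'/category/\d+', ), deny=(r'/login', ))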
@@ -103,109 +105,6 @@ LxmlLinkExtractor
         links.
     :type unique: boolean
 
     :param process_value: see ``process_value`` argument of
         :class:`BaseSgmlLinkExtractor` class constructor
     :type process_value: callable
-
-
-SgmlLinkExtractor
------------------
-
-.. module:: scrapy.contrib.linkextractors.sgml
-   :synopsis: SGMLParser-based link extractors
-
-.. warning:: SGMLParser based link extractors are unmantained and its usage is discouraged.
-   It is recommended to migrate to :class:`LxmlLinkExtractor` if you are still
-   using :class:`SgmlLinkExtractor`.
-
-.. class:: SgmlLinkExtractor(allow=(), deny=(), allow_domains=(), deny_domains=(), deny_extensions=None, restrict_xpaths=(), tags=('a', 'area'), attrs=('href'), canonicalize=True, unique=True, process_value=None)
-
-    The SgmlLinkExtractor is built upon the base :class:`BaseSgmlLinkExtractor`
-    and provides additional filters that you can specify to extract links,
-    including regular expressions patterns that the links must match to be
-    extracted. All those filters are configured through these constructor
-    parameters:
-
-    :param allow: a single regular expression (or list of regular expressions)
-        that the (absolute) urls must match in order to be extracted. If not
-        given (or empty), it will match all links.
-    :type allow: a regular expression (or list of)
-
-    :param deny: a single regular expression (or list of regular expressions)
-        that the (absolute) urls must match in order to be excluded (ie. not
-        extracted). It has precedence over the ``allow`` parameter. If not
-        given (or empty) it won't exclude any links.
-    :type deny: a regular expression (or list of)
-
-    :param allow_domains: a single value or a list of string containing
-        domains which will be considered for extracting the links
-    :type allow_domains: str or list
-
-    :param deny_domains: a single value or a list of strings containing
-        domains which won't be considered for extracting the links
-    :type deny_domains: str or list
-
-    :param deny_extensions: a single value or list of strings containing
-        extensions that should be ignored when extracting links.
-        If not given, it will default to the
-        ``IGNORED_EXTENSIONS`` list defined in the `scrapy.linkextractor`_
-        module.
-    :type deny_extensions: list
-
-    :param restrict_xpaths: is a XPath (or list of XPath's) which defines
-        regions inside the response where links should be extracted from.
-        If given, only the text selected by those XPath will be scanned for
-        links. See examples below.
-    :type restrict_xpaths: str or list
-
-    :param tags: a tag or a list of tags to consider when extracting links.
-        Defaults to ``('a', 'area')``.
-    :type tags: str or list
-
-    :param attrs: an attribute or list of attributes which should be considered when looking
-        for links to extract (only for those tags specified in the ``tags``
-        parameter). Defaults to ``('href',)``
-    :type attrs: list
-
-    :param canonicalize: canonicalize each extracted url (using
-        scrapy.utils.url.canonicalize_url). Defaults to ``True``.
-    :type canonicalize: boolean
-
-    :param unique: whether duplicate filtering should be applied to extracted
-        links.
-    :type unique: boolean
-
-    :param process_value: see ``process_value`` argument of
-        :class:`BaseSgmlLinkExtractor` class constructor
-    :type process_value: callable
-
-BaseSgmlLinkExtractor
----------------------
-
-.. class:: BaseSgmlLinkExtractor(tag="a", attr="href", unique=False, process_value=None)
-
-    The purpose of this Link Extractor is only to serve as a base class for the
-    :class:`SgmlLinkExtractor`. You should use that one instead.
-
-    The constructor arguments are:
-
-    :param tag: either a string (with the name of a tag) or a function that
-        receives a tag name and returns ``True`` if links should be extracted from
-        that tag, or ``False`` if they shouldn't. Defaults to ``'a'``. request
-        (once it's downloaded) as its first parameter. For more information, see
-        :ref:`topics-request-response-ref-request-callback-arguments`.
-    :type tag: str or callable
-
-    :param attr: either string (with the name of a tag attribute), or a
-        function that receives an attribute name and returns ``True`` if
-        links should be extracted from it, or ``False`` if they shouldn't.
-        Defaults to ``href``.
-    :type attr: str or callable
-
-    :param unique: is a boolean that specifies if a duplicate filtering should
-        be applied to links extracted.
-    :type unique: boolean
-
-    :param process_value: a function which receives each value extracted from
-        the tag and attributes scanned and can modify the value and return a
-        new one, or return ``None`` to ignore the link altogether. If not
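The parameters documented in the removed block above (``allow``, ``deny``, ``deny_domains``, ``restrict_xpaths``, ``tags``, ``attrs``, ``unique``, ``process_value``) keep the same meaning on the replacement extractor; a sketch assuming ``LinkExtractor`` takes the same constructor arguments, with invented patterns, domains and XPaths, and a hypothetical ``javascript:`` link quirk for ``process_value``::

    import re

    from scrapy.contrib.linkextractors import LinkExtractor

    def fix_links(value):
        # process_value contract: return a rewritten value, or None to drop the link.
        m = re.search(r"javascript:goToPage\('(.*?)'", value)
        if m:
            return m.group(1)
        return value

    lx = LinkExtractor(
        allow=(r'item\.php\?id=\d+', ),              # absolute URL must match to be kept
        deny=(r'\?sort=', ),                         # deny takes precedence over allow
        deny_domains=('ads.example.com', ),          # never follow links into these domains
        restrict_xpaths=('//div[@id="content"]', ),  # only scan links inside this region
        unique=True,                                 # drop duplicate links
        process_value=fix_links,
    )
    # lx.extract_links(response) then returns only the Link objects that pass every filter.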
@@ -1,8 +1,9 @@
 """
 SGMLParser-based Link extractors
 """
 import re
-from urlparse import urlparse, urljoin
+from urlparse import urljoin
+import warnings
 
 from w3lib.url import safe_url_string
 from scrapy.selector import Selector
 from scrapy.link import Link
@@ -10,12 +11,19 @@ from scrapy.linkextractor import FilteringLinkExtractor
 from scrapy.utils.misc import arg_to_iter
 from scrapy.utils.python import FixedSGMLParser, unique as unique_list, str_to_unicode
 from scrapy.utils.response import get_base_url
+from scrapy.exceptions import ScrapyDeprecationWarning
 
 
 class BaseSgmlLinkExtractor(FixedSGMLParser):
 
     def __init__(self, tag="a", attr="href", unique=False, process_value=None):
-        FixedSGMLParser.__init__(self)
+        warnings.warn(
+            "BaseSgmlLinkExtractor is deprecated and will be removed in future releases!!! "
+            "Please use scrapy.contrib.linkextractors.LinkExtractor",
+            ScrapyDeprecationWarning
+        )
+        with warnings.catch_warnings(record=True):
+            FixedSGMLParser.__init__(self)
         self.scan_tag = tag if callable(tag) else lambda t: t == tag
         self.scan_attr = attr if callable(attr) else lambda a: a == attr
         self.process_value = (lambda v: v) if process_value is None else process_value
@@ -91,11 +99,21 @@ class SgmlLinkExtractor(FilteringLinkExtractor):
     def __init__(self, allow=(), deny=(), allow_domains=(), deny_domains=(), restrict_xpaths=(),
                  tags=('a', 'area'), attrs=('href',), canonicalize=True, unique=True, process_value=None,
                  deny_extensions=None):
+
+        warnings.warn(
+            "SgmlLinkExtractor is deprecated and will be removed in future releases. "
+            "Please use scrapy.contrib.linkextractors.LinkExtractor",
+            ScrapyDeprecationWarning
+        )
+
         tags, attrs = set(arg_to_iter(tags)), set(arg_to_iter(attrs))
         tag_func = lambda x: x in tags
         attr_func = lambda x: x in attrs
-        lx = BaseSgmlLinkExtractor(tag=tag_func, attr=attr_func,
-            unique=unique, process_value=process_value)
+
+        with warnings.catch_warnings(record=True):
+            lx = BaseSgmlLinkExtractor(tag=tag_func, attr=attr_func,
+                unique=unique, process_value=process_value)
+
         super(SgmlLinkExtractor, self).__init__(lx, allow, deny,
             allow_domains, deny_domains, restrict_xpaths, canonicalize,
             deny_extensions)
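A sketch of how the warning added in this hunk surfaces to users, and how it can be silenced while migrating; the ``catch_warnings(record=True)`` block in the patched constructor appears to be there so that building the internal ``BaseSgmlLinkExtractor`` does not emit a second, redundant warning::

    import warnings

    from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
    from scrapy.exceptions import ScrapyDeprecationWarning

    # The deprecation warning introduced by this commit fires on instantiation:
    with warnings.catch_warnings(record=True) as caught:
        warnings.simplefilter('always')
        SgmlLinkExtractor()
        assert any(issubclass(w.category, ScrapyDeprecationWarning) for w in caught)

    # To keep an old spider quiet while it is being migrated to LinkExtractor:
    warnings.filterwarnings('ignore', category=ScrapyDeprecationWarning)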