Sorted out Link Extractors organization by moving them all to scrapy.contrib.linkextractors. The most relevant change: scrapy.link.extractors.RegexLinkExtractor was moved to scrapy.contrib.linkextractors.sgml.SgmlLinkExtractor. The old location still works but throws a deprecation warning; it will be removed before the 0.7 release. Documentation and tests were also updated. Also, in this changeset, a new regex-based link extractor was added to scrapy.contrib.linkextractors.regex.

--HG--
rename : scrapy/tests/sample_data/link_extractor/regex_linkextractor.html => scrapy/tests/sample_data/link_extractor/sgml_linkextractor.html
rename : scrapy/tests/test_link.py => scrapy/tests/test_contrib_linkextractors.py
parent 7b34e08392
commit 86498abdf1
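Migration note: as the compatibility shims in the scrapy/link hunks below show, the old names survive as thin subclasses whose constructors emit a DeprecationWarning. A minimal sketch of what callers see at this revision (hypothetical usage, assuming Python 2.6+ for warnings.catch_warnings)::

    import warnings

    # deprecated location: still importable, warns on instantiation
    from scrapy.link.extractors import RegexLinkExtractor
    with warnings.catch_warnings(record=True) as caught:
        warnings.simplefilter("always")
        lx = RegexLinkExtractor(allow=(r'/tor/\d+', ))
    assert any(issubclass(w.category, DeprecationWarning) for w in caught)

    # new canonical location: same behaviour, no warning
    from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
    lx = SgmlLinkExtractor(allow=(r'/tor/\d+', ))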
@@ -125,7 +125,7 @@ Finally, here's the spider code::

     domain_name = 'mininova.org'
     start_urls = ['http://www.mininova.org/today']
-    rules = [Rule(RegexLinkExtractor(allow=['/tor/\d+']), 'parse_torrent')]
+    rules = [Rule(SgmlLinkExtractor(allow=['/tor/\d+']), 'parse_torrent')]

     def parse_torrent(self, response):
         x = HtmlXPathSelector(response)
@@ -93,7 +93,7 @@ Let's now take a look at an example CrawlSpider with Rules::

     from scrapy import log
     from scrapy.contrib.spiders import CrawlSpider, Rule
-    from scrapy.link.extractors import RegexLinkExtractor
+    from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
     from scrapy.xpath.selector import HtmlXPathSelector
     from scrapy.item import ScrapedItem

@@ -104,10 +104,10 @@ Let's now take a look at an example CrawlSpider with Rules::
         rules = (
             # Extract links matching 'category.php' (but not matching 'subsection.php')
             # and follow links from them (since no callback means follow=True by default).
-            Rule(RegexLinkExtractor(allow=('category\.php', ), deny=('subsection\,php', ))),
+            Rule(SgmlLinkExtractor(allow=('category\.php', ), deny=('subsection\,php', ))),

             # Extract links matching 'item.php' and parse them with the spider's method parse_item
-            Rule(RegexLinkExtractor(allow=('item\.php', )), callback='parse_item'),
+            Rule(SgmlLinkExtractor(allow=('item\.php', )), callback='parse_item'),
         )

         def parse_item(self, response):
@@ -4,67 +4,25 @@
 Available Link Extractors
 =========================

-.. module:: scrapy.link
+.. module:: scrapy.contrib.linkextractors
    :synopsis: Link extractors classes

-LinkExtractor
-=============
+All available link extractors classes bundled with Scrapy are provided in the
+:mod:`scrapy.contrib.linkextractors` module.

-.. class:: LinkExtractor(tag="a", href="href", unique=False, process_value=None)
+.. module:: scrapy.contrib.linkextractors.sgml
+   :synopsis: SGMLParser-based link extractors

-This is the most basic Link Extractor which extracts links from a response with
-by looking at the given attributes inside the given tags.
+SgmlLinkExtractor
+=================

-The constructor arguments are:
+.. class:: SgmlLinkExtractor(allow=(), deny=(), allow_domains=(), deny_domains=(), restrict_xpaths(), tags=('a', 'area'), attrs=('href'), canonicalize=True, unique=True, process_value=None)

-    :param tag: either a string (with the name of a tag) or a function that
-       receives a tag name and returns ``True`` if links should be extracted
-       from those tag, or ``False`` if they shouldn't. Defaults to ``'a'``.
-       request (once its downloaded) as its first parameter. For more
-       information see :ref:`ref-request-callback-arguments` below.
-    :type tag: str or callable
+The SgmlLinkExtractor extends the base :class:`BaseSgmlLinkExtractor` by
+providing additional filters that you can specify to extract links,
+including regular expressions patterns that the links must match to be
+extracted. All those filters are configured through these constructor
+parameters:

-    :param attr: either string (with the name of a tag attribute), or a
-       function that receives a an attribute name and returns ``True`` if
-       links should be extracted from it, or ``False`` if the shouldn't.
-       Defaults to ``href``.
-    :type attr: str or callable
-
-    :param unique: is a boolean that specifies if a duplicate filtering should
-       be applied to links extracted.
-    :type unique: boolean
-
-    :param process_value: a function which receives each value extracted from
-       the tag and attributes scanned and can modify the value and return a
-       new one, or return ``None`` to ignore the link altogether. If not
-       given, ``process_value`` defaults to ``lambda x: x``.
-
-       .. highlight:: html
-
-       For example, to extract links from this code::
-
-           <a href="javascript:goToPage('../other/page.html'); return false">Link text</a>
-
-       .. highlight:: python
-
-       You can use the following function in ``process_value``::
-
-           def process_value(value):
-               m = re.search("javascript:goToPage\('(.*?)'", value)
-               if m:
-                   return m.group(1)
-
-    :type process_value: callable
-
-RegexLinkExtractor
-==================
-
-.. class:: RegexLinkExtractor(allow=(), deny=(), allow_domains=(), deny_domains=(), restrict_xpaths(), tags=('a', 'area'), attrs=('href'), canonicalize=True, unique=True, process_value=None)
-
-The RegexLinkExtractor extends the base :class:`LinkExtractor` by providing
-additional filters that you can specify to extract links, including regular
-expressions patterns that the links must match to be extracted. All those
-filters are configured through these constructor paramters:

     :param allow: a single regular expression (or list of regular expressions)
        that the (absolute) urls must match in order to be extracted. If not
@@ -112,3 +70,52 @@ RegexLinkExtractor
        :class:`LinkExtractor` class constructor
     :type process_value: boolean

+BaseSgmlLinkExtractor
+=====================
+
+.. class:: BaseSgmlLinkExtractor(tag="a", href="href", unique=False, process_value=None)
+
+The purpose of this Link Extractor is only to serve as a base class for the
+:class:`SgmlLinkExtractor`. You should use that one instead.
+
+The constructor arguments are:
+
+    :param tag: either a string (with the name of a tag) or a function that
+       receives a tag name and returns ``True`` if links should be extracted
+       from those tag, or ``False`` if they shouldn't. Defaults to ``'a'``.
+       request (once its downloaded) as its first parameter. For more
+       information see :ref:`ref-request-callback-arguments` below.
+    :type tag: str or callable
+
+    :param attr: either string (with the name of a tag attribute), or a
+       function that receives a an attribute name and returns ``True`` if
+       links should be extracted from it, or ``False`` if the shouldn't.
+       Defaults to ``href``.
+    :type attr: str or callable
+
+    :param unique: is a boolean that specifies if a duplicate filtering should
+       be applied to links extracted.
+    :type unique: boolean
+
+    :param process_value: a function which receives each value extracted from
+       the tag and attributes scanned and can modify the value and return a
+       new one, or return ``None`` to ignore the link altogether. If not
+       given, ``process_value`` defaults to ``lambda x: x``.
+
+       .. highlight:: html
+
+       For example, to extract links from this code::
+
+           <a href="javascript:goToPage('../other/page.html'); return false">Link text</a>
+
+       .. highlight:: python
+
+       You can use the following function in ``process_value``::
+
+           def process_value(value):
+               m = re.search("javascript:goToPage\('(.*?)'", value)
+               if m:
+                   return m.group(1)
+
+    :type process_value: callable
@@ -152,7 +152,7 @@ CrawlSpider example
 Let's now take a look at an example CrawlSpider with rules::

     from scrapy.contrib.spiders import CrawlSpider, Rule
-    from scrapy.link.extractors import RegexLinkExtractor
+    from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
     from scrapy.xpath.selector import HtmlXPathSelector
     from scrapy.item import ScrapedItem

@@ -163,10 +163,10 @@ Let's now take a look at an example CrawlSpider with rules::
         rules = (
             # Extract links matching 'category.php' (but not matching 'subsection.php')
             # and follow links from them (since no callback means follow=True by default).
-            Rule(RegexLinkExtractor(allow=('category\.php', ), deny=('subsection\.php', ))),
+            Rule(SgmlLinkExtractor(allow=('category\.php', ), deny=('subsection\.php', ))),

             # Extract links matching 'item.php' and parse them with the spider's method parse_item
-            Rule(RegexLinkExtractor(allow=('item\.php', )), callback='parse_item'),
+            Rule(SgmlLinkExtractor(allow=('item\.php', )), callback='parse_item'),
         )

         def parse_item(self, response):
@@ -63,7 +63,7 @@ those links. For example, the following one::

 So, based on that regular expression we can create the first crawling rule::

-    Rule(RegexLinkExtractor(allow='google.com/[A-Z][a-zA-Z_/]+$', ),
+    Rule(SgmlLinkExtractor(allow='google.com/[A-Z][a-zA-Z_/]+$', ),
         'parse_category',
         follow=True,
         ),
@@ -75,7 +75,7 @@ process and extract data from those pages.

 This is how the spider would look so far::

-    from scrapy.link.extractors import RegexLinkExtractor
+    from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
     from scrapy.contrib.spiders import CrawlSpider, Rule

     class GoogleDirectorySpider(CrawlSpider):
@@ -83,7 +83,7 @@ This is how the spider would look so far::
         start_urls = ['http://www.google.com/dirhp']

         rules = (
-            Rule(RegexLinkExtractor(allow='google.com/[A-Z][a-zA-Z_/]+$'),
+            Rule(SgmlLinkExtractor(allow='google.com/[A-Z][a-zA-Z_/]+$'),
                 'parse_category', follow=True,
             ),
         )
scrapy/contrib/linkextractors/__init__.py (new file, 7 lines)
@@ -0,0 +1,7 @@
+"""
+scrapy.contrib.linkextractors
+
+This package contains a collection of Link Extractors.
+
+For more info see docs/ref/link-extractors.rst
+"""
@@ -1,7 +1,8 @@
 """
-This module provides additional LinkExtractors, apart from the ones in scrapy.link
-and scrapy.link.extractors.
+This module implements the HtmlImageLinkExtractor for extracting
+image links only.
 """
+
 import urlparse

 from scrapy.link import Link
scrapy/contrib/linkextractors/regex.py (new file, 28 lines)
@@ -0,0 +1,28 @@
+import re
+
+from scrapy.utils.url import urljoin_rfc as urljoin
+from scrapy.utils.markup import remove_tags, remove_entities, replace_escape_chars
+
+from scrapy.link import Link
+from .sgml import SgmlLinkExtractor
+
+linkre = re.compile(
+    "<a\s.*?href=(\"[.#]+?\"|\'[.#]+?\'|[^\s]+?)(>|\s.*?>)(.*?)<[/ ]?a>",
+    re.DOTALL | re.IGNORECASE)
+
+def clean_link(link_text):
+    """Remove leading and trailing whitespace and punctuation"""
+    return link_text.strip("\t\r\n '\"")
+
+class RegexLinkExtractor(SgmlLinkExtractor):
+    """High performant link extractor"""
+    def _extract_links(self, response_text, response_url, response_encoding):
+        base_url = self.base_url if self.base_url else response_url
+
+        clean_url = lambda u: urljoin(base_url, remove_entities(clean_link(u.decode(response_encoding))))
+        clean_text = lambda t: replace_escape_chars(remove_tags(t.decode(response_encoding))).strip()
+
+        links_text = linkre.findall(response_text)
+        urlstext = set([(clean_url(url), clean_text(text)) for url, _, text in links_text])
+
+        return [Link(url, text) for url, text in urlstext]
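The extractor above skips SGML parsing entirely and scans the raw response body with one compiled regex, which is what its "High performant" docstring refers to; only _extract_links is overridden, so the constructor filters and extract_links come from SgmlLinkExtractor. A hedged usage sketch (the URL and body are made up for illustration)::

    from scrapy.http import HtmlResponse
    from scrapy.contrib.linkextractors.regex import RegexLinkExtractor

    response = HtmlResponse(url='http://example.com/index',
                            body='<a href="sample1.html">sample 1</a>')
    lx = RegexLinkExtractor()           # accepts the same filters as SgmlLinkExtractor
    links = lx.extract_links(response)  # -> [Link(url='http://example.com/sample1.html', ...)]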
scrapy/contrib/linkextractors/sgml.py (new file, 126 lines)
@@ -0,0 +1,126 @@
+"""
+SGMLParser-based Link extractors
+"""
+
+import re
+
+from scrapy.xpath import HtmlXPathSelector
+from scrapy.link import Link
+from scrapy.utils.misc import arg_to_iter
+from scrapy.utils.python import FixedSGMLParser, unique as unique_list, str_to_unicode
+from scrapy.utils.url import safe_url_string, urljoin_rfc as urljoin, canonicalize_url, url_is_from_any_domain
+
+class BaseSgmlLinkExtractor(FixedSGMLParser):
+
+    def __init__(self, tag="a", attr="href", unique=False, process_value=None):
+        FixedSGMLParser.__init__(self)
+        self.scan_tag = tag if callable(tag) else lambda t: t == tag
+        self.scan_attr = attr if callable(attr) else lambda a: a == attr
+        self.process_value = (lambda v: v) if process_value is None else process_value
+        self.current_link = None
+        self.unique = unique
+
+    def _extract_links(self, response_text, response_url, response_encoding):
+        self.reset()
+        self.feed(response_text)
+        self.close()
+
+        links = unique_list(self.links, key=lambda link: link.url) if self.unique else self.links
+
+        ret = []
+        base_url = self.base_url if self.base_url else response_url
+        for link in links:
+            link.url = urljoin(base_url, link.url)
+            link.url = safe_url_string(link.url, response_encoding)
+            link.text = str_to_unicode(link.text, response_encoding)
+            ret.append(link)
+
+        return ret
+
+    def extract_links(self, response):
+        # wrapper needed to allow to work directly with text
+        return self._extract_links(response.body, response.url, response.encoding)
+
+    def reset(self):
+        FixedSGMLParser.reset(self)
+        self.links = []
+        self.base_url = None
+
+    def unknown_starttag(self, tag, attrs):
+        if tag == 'base':
+            self.base_url = dict(attrs).get('href')
+        if self.scan_tag(tag):
+            for attr, value in attrs:
+                if self.scan_attr(attr):
+                    url = self.process_value(value)
+                    if url is not None:
+                        link = Link(url=url)
+                        self.links.append(link)
+                        self.current_link = link
+
+    def unknown_endtag(self, tag):
+        self.current_link = None
+
+    def handle_data(self, data):
+        if self.current_link and not self.current_link.text:
+            self.current_link.text = data.strip()
+
+    def matches(self, url):
+        """This extractor matches with any url, since
+        it doesn't contain any patterns"""
+        return True
+
+_re_type = type(re.compile("", 0))
+
+_matches = lambda url, regexs: any((r.search(url) for r in regexs))
+_is_valid_url = lambda url: url.split('://', 1)[0] in set(['http', 'https', 'file'])
+
+class SgmlLinkExtractor(BaseSgmlLinkExtractor):
+
+    def __init__(self, allow=(), deny=(), allow_domains=(), deny_domains=(), restrict_xpaths=(),
+                 tags=('a', 'area'), attrs=('href'), canonicalize=True, unique=True, process_value=None):
+        self.allow_res = [x if isinstance(x, _re_type) else re.compile(x) for x in arg_to_iter(allow)]
+        self.deny_res = [x if isinstance(x, _re_type) else re.compile(x) for x in arg_to_iter(deny)]
+        self.allow_domains = set(arg_to_iter(allow_domains))
+        self.deny_domains = set(arg_to_iter(deny_domains))
+        self.restrict_xpaths = tuple(arg_to_iter(restrict_xpaths))
+        self.canonicalize = canonicalize
+        tag_func = lambda x: x in tags
+        attr_func = lambda x: x in attrs
+        BaseSgmlLinkExtractor.__init__(self, tag=tag_func, attr=attr_func,
+            unique=unique, process_value=process_value)
+
+    def extract_links(self, response):
+        if self.restrict_xpaths:
+            hxs = HtmlXPathSelector(response)
+            html_slice = ''.join(''.join(html_fragm for html_fragm in hxs.x(xpath_expr).extract()) for xpath_expr in self.restrict_xpaths)
+            links = self._extract_links(html_slice, response.url, response.encoding)
+        else:
+            links = BaseSgmlLinkExtractor.extract_links(self, response)
+
+        links = [link for link in links if _is_valid_url(link.url)]
+
+        if self.allow_res:
+            links = [link for link in links if _matches(link.url, self.allow_res)]
+        if self.deny_res:
+            links = [link for link in links if not _matches(link.url, self.deny_res)]
+        if self.allow_domains:
+            links = [link for link in links if url_is_from_any_domain(link.url, self.allow_domains)]
+        if self.deny_domains:
+            links = [link for link in links if not url_is_from_any_domain(link.url, self.deny_domains)]
+
+        if self.canonicalize:
+            for link in links:
+                link.url = canonicalize_url(link.url)
+
+        return links
+
+    def matches(self, url):
+        if self.allow_domains and not url_is_from_any_domain(url, self.allow_domains):
+            return False
+        if self.deny_domains and url_is_from_any_domain(url, self.deny_domains):
+            return False
+
+        allowed = [regex.search(url) for regex in self.allow_res] if self.allow_res else [True]
+        denied = [regex.search(url) for regex in self.deny_res] if self.deny_res else []
+        return any(allowed) and not any(denied)
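In extract_links above, the filters apply in a fixed order: only http/https/file URLs survive, then allow/deny regexes, then allow/deny domain sets, and finally optional canonicalization of each URL; matches() is a cheaper check that answers whether a given URL could pass the regex and domain filters at all. A short sketch of how the filters combine (URLs are illustrative)::

    from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor

    lx = SgmlLinkExtractor(
        allow=(r'item\.php', ),            # URL must match some allow pattern...
        deny=(r'\?print=1', ),             # ...and no deny pattern
        allow_domains=('example.com', ),   # and belong to an allowed domain
        restrict_xpaths=('//div[@id="content"]', ),  # only parse this page region
    )
    assert lx.matches('http://example.com/item.php?id=1')
    assert not lx.matches('http://example.com/item.php?print=1')
    assert not lx.matches('http://other.org/item.php')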
@@ -1,5 +1,5 @@
 from scrapy.contrib.spiders import CrawlSpider, Rule
-from scrapy.link.extractors import RegexLinkExtractor
+from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor

 class GenericSpider(CrawlSpider):
     """
@@ -10,7 +10,7 @@ class GenericSpider(CrawlSpider):
     def __init__(self, domain_name):
         self.domain_name = domain_name
         self.rules = (
-            Rule(RegexLinkExtractor(allow_domains=(domain_name,)), self.parse_note, follow=True),
+            Rule(SgmlLinkExtractor(allow_domains=(domain_name,)), self.parse_note, follow=True),
         )
         super(GenericSpider, self).__init__()

@@ -10,7 +10,7 @@ from scrapy.utils.url import is_url
 from scrapy.utils.response import get_base_url
 from scrapy.utils.python import flatten, unicode_to_str
 from scrapy.xpath.selector import XPathSelector, XPathSelectorList
-from scrapy.contrib.link_extractors import HTMLImageLinkExtractor
+from scrapy.contrib.linkextractors.image import HTMLImageLinkExtractor

 def extract(location, adaptor_args=None):
     """
@@ -1,76 +1,12 @@
 """
-LinkExtractor provides en efficient way to extract links from pages
+This module defines the Link object used in Link extractors.

-See documentation in docs/ref/link-extractors.rst
+For actual link extractors implementation see scrapy.contrib.linkextractor, or
+its documentation in: docs/ref/link-extractors.rst
 """

-from scrapy.utils.python import FixedSGMLParser, unique as unique_list, str_to_unicode
-from scrapy.utils.url import safe_url_string, urljoin_rfc as urljoin
-
-class LinkExtractor(FixedSGMLParser):
-
-    def __init__(self, tag="a", attr="href", unique=False, process_value=None):
-        FixedSGMLParser.__init__(self)
-        self.scan_tag = tag if callable(tag) else lambda t: t == tag
-        self.scan_attr = attr if callable(attr) else lambda a: a == attr
-        self.process_value = (lambda v: v) if process_value is None else process_value
-        self.current_link = None
-        self.unique = unique
-
-    def _extract_links(self, response_text, response_url, response_encoding):
-        self.reset()
-        self.feed(response_text)
-        self.close()
-
-        links = unique_list(self.links, key=lambda link: link.url) if self.unique else self.links
-
-        ret = []
-        base_url = self.base_url if self.base_url else response_url
-        for link in links:
-            link.url = urljoin(base_url, link.url)
-            link.url = safe_url_string(link.url, response_encoding)
-            link.text = str_to_unicode(link.text, response_encoding)
-            ret.append(link)
-
-        return ret
-
-    def extract_links(self, response):
-        # wrapper needed to allow to work directly with text
-        return self._extract_links(response.body, response.url, response.encoding)
-
-    def reset(self):
-        FixedSGMLParser.reset(self)
-        self.links = []
-        self.base_url = None
-
-    def unknown_starttag(self, tag, attrs):
-        if tag == 'base':
-            self.base_url = dict(attrs).get('href')
-        if self.scan_tag(tag):
-            for attr, value in attrs:
-                if self.scan_attr(attr):
-                    url = self.process_value(value)
-                    if url is not None:
-                        link = Link(url=url)
-                        self.links.append(link)
-                        self.current_link = link
-
-    def unknown_endtag(self, tag):
-        self.current_link = None
-
-    def handle_data(self, data):
-        if self.current_link and not self.current_link.text:
-            self.current_link.text = data.strip()
-
-    def matches(self, url):
-        """This extractor matches with any url, since
-        it doesn't contain any patterns"""
-        return True
-

 class Link(object):
-    """
-    Link objects represent an extracted link by the LinkExtractor.
+    """Link objects represent an extracted link by the LinkExtractor.
     At the moment, it contains just the url and link text.
     """

@@ -85,3 +21,26 @@ class Link(object):

     def __repr__(self):
         return '<Link url=%r text=%r >' % (self.url, self.text)
+
+
+# FIXME: code below is for backwards compatibility and should be removed before
+# the 0.7 release
+
+import warnings
+
+from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor, BaseSgmlLinkExtractor
+
+class LinkExtractor(BaseSgmlLinkExtractor):
+
+    def __init__(self, *args, **kwargs):
+        warnings.warn("scrapy.link.LinkExtractor is deprecated, use scrapy.contrib.linkextractors.sgml.BaseSgmlLinkExtractor instead",
+            DeprecationWarning, stacklevel=2)
+        BaseSgmlLinkExtractor.__init__(self, *args, **kwargs)
+
+class RegexLinkExtractor(SgmlLinkExtractor):
+
+    def __init__(self, *args, **kwargs):
+        warnings.warn("scrapy.link.RegexLinkExtractor is deprecated, use scrapy.contrib.linkextractors.sgml.SgmlLinkExtractor instead",
+            DeprecationWarning, stacklevel=2)
+        SgmlLinkExtractor.__init__(self, *args, **kwargs)
@@ -1,68 +1,14 @@
-"""
-This module provides some LinkExtractors, which extend the base LinkExtractor
-(scrapy.link.LinkExtractor) with some additional useful features.
-
-See documentation in docs/ref/link-extractors.rst
-"""
-
-import re
-
-from scrapy.link import LinkExtractor
-from scrapy.utils.url import canonicalize_url, url_is_from_any_domain
-from scrapy.xpath import HtmlXPathSelector
-from scrapy.utils.misc import arg_to_iter
-
-_re_type = type(re.compile("", 0))
-
-_matches = lambda url, regexs: any((r.search(url) for r in regexs))
-_is_valid_url = lambda url: url.split('://', 1)[0] in set(['http', 'https', 'file'])
-
-class RegexLinkExtractor(LinkExtractor):
-
-    def __init__(self, allow=(), deny=(), allow_domains=(), deny_domains=(), restrict_xpaths=(),
-                 tags=('a', 'area'), attrs=('href'), canonicalize=True, unique=True, process_value=None):
-        self.allow_res = [x if isinstance(x, _re_type) else re.compile(x) for x in arg_to_iter(allow)]
-        self.deny_res = [x if isinstance(x, _re_type) else re.compile(x) for x in arg_to_iter(deny)]
-        self.allow_domains = set(arg_to_iter(allow_domains))
-        self.deny_domains = set(arg_to_iter(deny_domains))
-        self.restrict_xpaths = tuple(arg_to_iter(restrict_xpaths))
-        self.canonicalize = canonicalize
-        tag_func = lambda x: x in tags
-        attr_func = lambda x: x in attrs
-        LinkExtractor.__init__(self, tag=tag_func, attr=attr_func,
-            unique=unique, process_value=process_value)
-
-    def extract_links(self, response):
-        if self.restrict_xpaths:
-            hxs = HtmlXPathSelector(response)
-            html_slice = ''.join(''.join(html_fragm for html_fragm in hxs.x(xpath_expr).extract()) for xpath_expr in self.restrict_xpaths)
-            links = self._extract_links(html_slice, response.url, response.encoding)
-        else:
-            links = LinkExtractor.extract_links(self, response)
-
-        links = [link for link in links if _is_valid_url(link.url)]
-
-        if self.allow_res:
-            links = [link for link in links if _matches(link.url, self.allow_res)]
-        if self.deny_res:
-            links = [link for link in links if not _matches(link.url, self.deny_res)]
-        if self.allow_domains:
-            links = [link for link in links if url_is_from_any_domain(link.url, self.allow_domains)]
-        if self.deny_domains:
-            links = [link for link in links if not url_is_from_any_domain(link.url, self.deny_domains)]
-
-        if self.canonicalize:
-            for link in links:
-                link.url = canonicalize_url(link.url)
-
-        return links
-
-    def matches(self, url):
-        if self.allow_domains and not url_is_from_any_domain(url, self.allow_domains):
-            return False
-        if self.deny_domains and url_is_from_any_domain(url, self.deny_domains):
-            return False
-
-        allowed = [regex.search(url) for regex in self.allow_res] if self.allow_res else [True]
-        denied = [regex.search(url) for regex in self.deny_res] if self.deny_res else []
-        return any(allowed) and not any(denied)
+# FIXME: code below is for backwards compatibility and should be removed before
+# the 0.7 release
+
+import warnings
+
+from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
+
+class RegexLinkExtractor(SgmlLinkExtractor):
+
+    def __init__(self, *args, **kwargs):
+        warnings.warn("scrapy.link.extractors.RegexLinkExtractor is deprecated, use scrapy.contrib.linkextractors.sgml.SgmlLinkExtractor instead",
+            DeprecationWarning, stacklevel=2)
+        SgmlLinkExtractor.__init__(self, *args, **kwargs)
@@ -2,9 +2,9 @@ import re
 import unittest

 from scrapy.http import HtmlResponse
-from scrapy.link import LinkExtractor, Link
-from scrapy.link.extractors import RegexLinkExtractor
-from scrapy.contrib.link_extractors import HTMLImageLinkExtractor
+from scrapy.link import Link
+from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor, BaseSgmlLinkExtractor
+from scrapy.contrib.linkextractors.image import HTMLImageLinkExtractor
 from scrapy.tests import get_testdata

 class LinkExtractorTestCase(unittest.TestCase):
@@ -18,7 +18,7 @@ class LinkExtractorTestCase(unittest.TestCase):
         </body></html>"""
         response = HtmlResponse("http://example.org/somepage/index.html", body=html)

-        lx = LinkExtractor()  # default: tag=a, attr=href
+        lx = BaseSgmlLinkExtractor()  # default: tag=a, attr=href
         self.assertEqual(lx.extract_links(response),
                          [Link(url='http://example.org/somepage/item/12.html', text='Item 12'),
                           Link(url='http://example.org/about.html', text='About us'),
@@ -31,7 +31,7 @@ class LinkExtractorTestCase(unittest.TestCase):
         </body></html>"""
         response = HtmlResponse("http://example.org/somepage/index.html", body=html)

-        lx = LinkExtractor()  # default: tag=a, attr=href
+        lx = BaseSgmlLinkExtractor()  # default: tag=a, attr=href
         self.assertEqual(lx.extract_links(response),
                          [Link(url='http://otherdomain.com/base/item/12.html', text='Item 12')])

@@ -42,7 +42,7 @@ class LinkExtractorTestCase(unittest.TestCase):
         body = get_testdata('link_extractor', 'linkextractor_latin1.html')
         response_latin1 = HtmlResponse(url='http://example.com/latin1', body=body)

-        lx = LinkExtractor()
+        lx = BaseSgmlLinkExtractor()
         self.assertEqual(lx.extract_links(response_utf8),
                          [ Link(url='http://example.com/sample_%C3%B1.html', text=''),
                            Link(url='http://example.com/sample_%E2%82%AC.html', text='sample \xe2\x82\xac text'.decode('utf-8')) ])
@@ -59,75 +59,75 @@ class LinkExtractorTestCase(unittest.TestCase):
         url1 = 'http://lotsofstuff.com/stuff1/index'
         url2 = 'http://evenmorestuff.com/uglystuff/index'

-        lx = LinkExtractor()
+        lx = BaseSgmlLinkExtractor()
         self.assertEqual(lx.matches(url1), True)
         self.assertEqual(lx.matches(url2), True)

-class RegexLinkExtractorTestCase(unittest.TestCase):
+class SgmlLinkExtractorTestCase(unittest.TestCase):
     def setUp(self):
-        body = get_testdata('link_extractor', 'regex_linkextractor.html')
+        body = get_testdata('link_extractor', 'sgml_linkextractor.html')
         self.response = HtmlResponse(url='http://example.com/index', body=body)

     def test_urls_type(self):
         '''Test that the resulting urls are regular strings and not a unicode objects'''
-        lx = RegexLinkExtractor()
+        lx = SgmlLinkExtractor()
         self.assertTrue(all(isinstance(link.url, str) for link in lx.extract_links(self.response)))

     def test_extraction(self):
         '''Test the extractor's behaviour among different situations'''

-        lx = RegexLinkExtractor()
+        lx = SgmlLinkExtractor()
         self.assertEqual([link for link in lx.extract_links(self.response)],
                          [ Link(url='http://example.com/sample1.html', text=u''),
                            Link(url='http://example.com/sample2.html', text=u'sample 2'),
                            Link(url='http://example.com/sample3.html', text=u'sample 3 text'),
                            Link(url='http://www.google.com/something', text=u'') ])

-        lx = RegexLinkExtractor(allow=('sample', ))
+        lx = SgmlLinkExtractor(allow=('sample', ))
         self.assertEqual([link for link in lx.extract_links(self.response)],
                          [ Link(url='http://example.com/sample1.html', text=u''),
                            Link(url='http://example.com/sample2.html', text=u'sample 2'),
                            Link(url='http://example.com/sample3.html', text=u'sample 3 text') ])

-        lx = RegexLinkExtractor(allow=('sample', ), unique=False)
+        lx = SgmlLinkExtractor(allow=('sample', ), unique=False)
         self.assertEqual([link for link in lx.extract_links(self.response)],
                          [ Link(url='http://example.com/sample1.html', text=u''),
                            Link(url='http://example.com/sample2.html', text=u'sample 2'),
                            Link(url='http://example.com/sample3.html', text=u'sample 3 text'),
                            Link(url='http://example.com/sample3.html', text=u'sample 3 repetition') ])

-        lx = RegexLinkExtractor(allow=('sample', ), deny=('3', ))
+        lx = SgmlLinkExtractor(allow=('sample', ), deny=('3', ))
         self.assertEqual([link for link in lx.extract_links(self.response)],
                          [ Link(url='http://example.com/sample1.html', text=u''),
                            Link(url='http://example.com/sample2.html', text=u'sample 2') ])

-        lx = RegexLinkExtractor(allow_domains=('google.com', ))
+        lx = SgmlLinkExtractor(allow_domains=('google.com', ))
         self.assertEqual([link for link in lx.extract_links(self.response)],
                          [ Link(url='http://www.google.com/something', text=u'') ])

-        lx = RegexLinkExtractor(tags=('img', ), attrs=('src', ))
+        lx = SgmlLinkExtractor(tags=('img', ), attrs=('src', ))
         self.assertEqual([link for link in lx.extract_links(self.response)],
                          [ Link(url='http://example.com/sample2.jpg', text=u'') ])

     def test_extraction_using_single_values(self):
         '''Test the extractor's behaviour among different situations'''

-        lx = RegexLinkExtractor(allow='sample')
+        lx = SgmlLinkExtractor(allow='sample')
         self.assertEqual([link for link in lx.extract_links(self.response)],
                          [ Link(url='http://example.com/sample1.html', text=u''),
                            Link(url='http://example.com/sample2.html', text=u'sample 2'),
                            Link(url='http://example.com/sample3.html', text=u'sample 3 text') ])

-        lx = RegexLinkExtractor(allow='sample', deny='3')
+        lx = SgmlLinkExtractor(allow='sample', deny='3')
         self.assertEqual([link for link in lx.extract_links(self.response)],
                          [ Link(url='http://example.com/sample1.html', text=u''),
                            Link(url='http://example.com/sample2.html', text=u'sample 2') ])

-        lx = RegexLinkExtractor(allow_domains='google.com')
+        lx = SgmlLinkExtractor(allow_domains='google.com')
         self.assertEqual([link for link in lx.extract_links(self.response)],
                          [ Link(url='http://www.google.com/something', text=u'') ])

-        lx = RegexLinkExtractor(deny_domains='example.com')
+        lx = SgmlLinkExtractor(deny_domains='example.com')
         self.assertEqual([link for link in lx.extract_links(self.response)],
                          [ Link(url='http://www.google.com/something', text=u'') ])

@@ -135,23 +135,23 @@ class RegexLinkExtractorTestCase(unittest.TestCase):
         url1 = 'http://lotsofstuff.com/stuff1/index'
         url2 = 'http://evenmorestuff.com/uglystuff/index'

-        lx = RegexLinkExtractor(allow=(r'stuff1', ))
+        lx = SgmlLinkExtractor(allow=(r'stuff1', ))
         self.assertEqual(lx.matches(url1), True)
         self.assertEqual(lx.matches(url2), False)

-        lx = RegexLinkExtractor(deny=(r'uglystuff', ))
+        lx = SgmlLinkExtractor(deny=(r'uglystuff', ))
         self.assertEqual(lx.matches(url1), True)
         self.assertEqual(lx.matches(url2), False)

-        lx = RegexLinkExtractor(allow_domains=('evenmorestuff.com', ))
+        lx = SgmlLinkExtractor(allow_domains=('evenmorestuff.com', ))
         self.assertEqual(lx.matches(url1), False)
         self.assertEqual(lx.matches(url2), True)

-        lx = RegexLinkExtractor(deny_domains=('lotsofstuff.com', ))
+        lx = SgmlLinkExtractor(deny_domains=('lotsofstuff.com', ))
         self.assertEqual(lx.matches(url1), False)
         self.assertEqual(lx.matches(url2), True)

-        lx = RegexLinkExtractor(allow=('blah1', ), deny=('blah2', ),
+        lx = SgmlLinkExtractor(allow=('blah1', ), deny=('blah2', ),
                                 allow_domains=('blah1.com', ), deny_domains=('blah2.com', ))
         self.assertEqual(lx.matches('http://blah1.com/blah1'), True)
         self.assertEqual(lx.matches('http://blah1.com/blah2'), False)
@@ -159,7 +159,7 @@ class RegexLinkExtractorTestCase(unittest.TestCase):
         self.assertEqual(lx.matches('http://blah2.com/blah2'), False)

     def test_restrict_xpaths(self):
-        lx = RegexLinkExtractor(restrict_xpaths=('//div[@id="subwrapper"]', ))
+        lx = SgmlLinkExtractor(restrict_xpaths=('//div[@id="subwrapper"]', ))
         self.assertEqual([link for link in lx.extract_links(self.response)],
                          [ Link(url='http://example.com/sample1.html', text=u''),
                            Link(url='http://example.com/sample2.html', text=u'sample 2') ])
@@ -177,7 +177,7 @@ class RegexLinkExtractorTestCase(unittest.TestCase):
         </body></html>"""
         response = HtmlResponse("http://example.org/somepage/index.html", body=html, encoding='windows-1252')

-        lx = RegexLinkExtractor(restrict_xpaths="//div[@class='links']")
+        lx = SgmlLinkExtractor(restrict_xpaths="//div[@class='links']")
         self.assertEqual(lx.extract_links(response),
                          [Link(url='http://example.org/about.html', text=u'About us\xa3')])

@@ -194,7 +194,7 @@ class RegexLinkExtractorTestCase(unittest.TestCase):
             if m:
                 return m.group(1)

-        lx = RegexLinkExtractor(process_value=process_value)
+        lx = SgmlLinkExtractor(process_value=process_value)
         self.assertEqual(lx.extract_links(response),
                          [Link(url='http://example.org/other/page.html', text='Link text')])
@@ -7,7 +7,7 @@ import re

 from scrapy.spider import BaseSpider
 from scrapy.item import ScrapedItem
-from scrapy.link import LinkExtractor
+from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
 from scrapy.http import Request

 class TestSpider(BaseSpider):
@@ -20,7 +20,7 @@ class TestSpider(BaseSpider):
     price_re = re.compile(">Price: \$(.*?)<", re.M)

     def parse(self, response):
-        xlink = LinkExtractor()
+        xlink = SgmlLinkExtractor()
         itemre = re.compile(self.itemurl_re)
         for link in xlink.extract_links(response):
             if itemre.search(link.url):