Sorted out link extractor organization by moving all link extractors to
scrapy.contrib.linkextractors.

The most relevant change: scrapy.link.extractors.RegexLinkExtractor was moved
to scrapy.contrib.linkextractors.sgml.SgmlLinkExtractor. The old location
still works but throws a deprecation warning, and will be removed before the
0.7 release. Documentation and tests were also updated.

This changeset also adds a new regex-based link extractor in
scrapy.contrib.linkextractors.regex.

--HG--
rename : scrapy/tests/sample_data/link_extractor/regex_linkextractor.html => scrapy/tests/sample_data/link_extractor/sgml_linkextractor.html
rename : scrapy/tests/test_link.py => scrapy/tests/test_contrib_linkextractors.py
commit 86498abdf1
parent 7b34e08392
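For illustration, a minimal sketch of what the backwards-compatibility shim in
this changeset means for user code (the allow pattern is made up; per the diff
below, the warning is raised from the shim's __init__, not at import time)::

    import warnings

    from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor  # new location

    with warnings.catch_warnings(record=True) as caught:
        warnings.simplefilter("always")
        # Old location keeps working until the 0.7 release...
        from scrapy.link.extractors import RegexLinkExtractor
        lx = RegexLinkExtractor(allow=(r'item\.php',))  # ...but warns on instantiation

    assert isinstance(lx, SgmlLinkExtractor)  # the shim subclasses the new extractor
    assert any(issubclass(w.category, DeprecationWarning) for w in caught)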
@ -125,7 +125,7 @@ Finally, here's the spider code::

     domain_name = 'mininova.org'
     start_urls = ['http://www.mininova.org/today']
-    rules = [Rule(RegexLinkExtractor(allow=['/tor/\d+']), 'parse_torrent')]
+    rules = [Rule(SgmlLinkExtractor(allow=['/tor/\d+']), 'parse_torrent')]

     def parse_torrent(self, response):
         x = HtmlXPathSelector(response)
@ -93,7 +93,7 @@ Let's now take a look at an example CrawlSpider with Rules::

     from scrapy import log
     from scrapy.contrib.spiders import CrawlSpider, Rule
-    from scrapy.link.extractors import RegexLinkExtractor
+    from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
     from scrapy.xpath.selector import HtmlXPathSelector
     from scrapy.item import ScrapedItem

@ -104,10 +104,10 @@ Let's now take a look at an example CrawlSpider with Rules::

     rules = (
         # Extract links matching 'category.php' (but not matching 'subsection.php')
         # and follow links from them (since no callback means follow=True by default).
-        Rule(RegexLinkExtractor(allow=('category\.php', ), deny=('subsection\.php', ))),
+        Rule(SgmlLinkExtractor(allow=('category\.php', ), deny=('subsection\.php', ))),

         # Extract links matching 'item.php' and parse them with the spider's method parse_item
-        Rule(RegexLinkExtractor(allow=('item\.php', )), callback='parse_item'),
+        Rule(SgmlLinkExtractor(allow=('item\.php', )), callback='parse_item'),
     )

     def parse_item(self, response):
@ -4,67 +4,25 @@

 Available Link Extractors
 =========================

-.. module:: scrapy.link
+.. module:: scrapy.contrib.linkextractors
    :synopsis: Link extractors classes

-LinkExtractor
-=============
+All available link extractor classes bundled with Scrapy are provided in the
+:mod:`scrapy.contrib.linkextractors` module.

-.. class:: LinkExtractor(tag="a", href="href", unique=False, process_value=None)
+.. module:: scrapy.contrib.linkextractors.sgml
+   :synopsis: SGMLParser-based link extractors

-   This is the most basic Link Extractor, which extracts links from a response
-   by looking at the given attributes inside the given tags.
+SgmlLinkExtractor
+=================

-   The constructor arguments are:
+.. class:: SgmlLinkExtractor(allow=(), deny=(), allow_domains=(), deny_domains=(), restrict_xpaths=(), tags=('a', 'area'), attrs=('href'), canonicalize=True, unique=True, process_value=None)

-   :param tag: either a string (with the name of a tag) or a function that
-      receives a tag name and returns ``True`` if links should be extracted
-      from that tag, or ``False`` if they shouldn't. Defaults to ``'a'``.
-   :type tag: str or callable
-
-   :param attr: either a string (with the name of a tag attribute), or a
-      function that receives an attribute name and returns ``True`` if links
-      should be extracted from it, or ``False`` if they shouldn't. Defaults
-      to ``href``.
-   :type attr: str or callable
-
-   :param unique: a boolean that specifies whether duplicate filtering should
-      be applied to the extracted links.
-   :type unique: boolean
-
-   :param process_value: a function which receives each value extracted from
-      the tag and attributes scanned, and can modify the value and return a
-      new one, or return ``None`` to ignore the link altogether. If not
-      given, ``process_value`` defaults to ``lambda x: x``.
-
-   .. highlight:: html
-
-   For example, to extract links from this code::
-
-      <a href="javascript:goToPage('../other/page.html'); return false">Link text</a>
-
-   .. highlight:: python
-
-   You can use the following function in ``process_value``::
-
-      def process_value(value):
-          m = re.search("javascript:goToPage\('(.*?)'", value)
-          if m:
-              return m.group(1)
-
-   :type process_value: callable
-
-RegexLinkExtractor
-==================
-
-.. class:: RegexLinkExtractor(allow=(), deny=(), allow_domains=(), deny_domains=(), restrict_xpaths=(), tags=('a', 'area'), attrs=('href'), canonicalize=True, unique=True, process_value=None)
-
-   The RegexLinkExtractor extends the base :class:`LinkExtractor` by providing
-   additional filters that you can specify to extract links, including regular
-   expression patterns that the links must match to be extracted. All those
-   filters are configured through these constructor parameters:
+   The SgmlLinkExtractor extends the base :class:`BaseSgmlLinkExtractor` by
+   providing additional filters that you can specify to extract links,
+   including regular expression patterns that the links must match to be
+   extracted. All those filters are configured through these constructor
+   parameters:

    :param allow: a single regular expression (or list of regular expressions)
       that the (absolute) urls must match in order to be extracted. If not

@ -112,3 +70,52 @@ RegexLinkExtractor
       :class:`LinkExtractor` class constructor
    :type process_value: boolean
+
+BaseSgmlLinkExtractor
+=====================
+
+.. class:: BaseSgmlLinkExtractor(tag="a", href="href", unique=False, process_value=None)
+
+   The purpose of this Link Extractor is only to serve as a base class for the
+   :class:`SgmlLinkExtractor`. You should use that one instead.
+
+   The constructor arguments are:
+
+   :param tag: either a string (with the name of a tag) or a function that
+      receives a tag name and returns ``True`` if links should be extracted
+      from that tag, or ``False`` if they shouldn't. Defaults to ``'a'``.
+   :type tag: str or callable
+
+   :param attr: either a string (with the name of a tag attribute), or a
+      function that receives an attribute name and returns ``True`` if links
+      should be extracted from it, or ``False`` if they shouldn't. Defaults
+      to ``href``.
+   :type attr: str or callable
+
+   :param unique: a boolean that specifies whether duplicate filtering should
+      be applied to the extracted links.
+   :type unique: boolean
+
+   :param process_value: a function which receives each value extracted from
+      the tag and attributes scanned, and can modify the value and return a
+      new one, or return ``None`` to ignore the link altogether. If not
+      given, ``process_value`` defaults to ``lambda x: x``.
+
+   .. highlight:: html
+
+   For example, to extract links from this code::
+
+      <a href="javascript:goToPage('../other/page.html'); return false">Link text</a>
+
+   .. highlight:: python
+
+   You can use the following function in ``process_value``::
+
+      def process_value(value):
+          m = re.search("javascript:goToPage\('(.*?)'", value)
+          if m:
+              return m.group(1)
+
+   :type process_value: callable
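Putting the renamed extractor's documented parameters together, a small usage
sketch (the page, URLs and patterns are made up for illustration)::

    from scrapy.http import HtmlResponse
    from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor

    html = """<html><body>
    <div id="content"><a href="item.php?id=1">Item 1</a></div>
    <a href="logout.php">Logout</a>
    </body></html>"""
    response = HtmlResponse(url='http://example.com/index', body=html)

    lx = SgmlLinkExtractor(allow=(r'item\.php',),   # absolute url must match one of these
                           deny=(r'logout',),       # ...and match none of these
                           restrict_xpaths=('//div[@id="content"]',))  # only scan this region
    for link in lx.extract_links(response):
        print link.url, link.text   # -> http://example.com/item.php?id=1 Item 1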
@ -152,7 +152,7 @@ CrawlSpider example

 Let's now take a look at an example CrawlSpider with rules::

     from scrapy.contrib.spiders import CrawlSpider, Rule
-    from scrapy.link.extractors import RegexLinkExtractor
+    from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
     from scrapy.xpath.selector import HtmlXPathSelector
     from scrapy.item import ScrapedItem

@ -163,10 +163,10 @@ Let's now take a look at an example CrawlSpider with rules::

     rules = (
         # Extract links matching 'category.php' (but not matching 'subsection.php')
         # and follow links from them (since no callback means follow=True by default).
-        Rule(RegexLinkExtractor(allow=('category\.php', ), deny=('subsection\.php', ))),
+        Rule(SgmlLinkExtractor(allow=('category\.php', ), deny=('subsection\.php', ))),

         # Extract links matching 'item.php' and parse them with the spider's method parse_item
-        Rule(RegexLinkExtractor(allow=('item\.php', )), callback='parse_item'),
+        Rule(SgmlLinkExtractor(allow=('item\.php', )), callback='parse_item'),
     )

     def parse_item(self, response):
@ -63,7 +63,7 @@ those links. For example, the following one::

 So, based on that regular expression we can create the first crawling rule::

-    Rule(RegexLinkExtractor(allow='google.com/[A-Z][a-zA-Z_/]+$', ),
+    Rule(SgmlLinkExtractor(allow='google.com/[A-Z][a-zA-Z_/]+$', ),
         'parse_category',
         follow=True,
     ),

@ -75,7 +75,7 @@ process and extract data from those pages.

 This is how the spider would look so far::

-    from scrapy.link.extractors import RegexLinkExtractor
+    from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
     from scrapy.contrib.spiders import CrawlSpider, Rule

     class GoogleDirectorySpider(CrawlSpider):

@ -83,7 +83,7 @@ This is how the spider would look so far::

     start_urls = ['http://www.google.com/dirhp']

     rules = (
-        Rule(RegexLinkExtractor(allow='google.com/[A-Z][a-zA-Z_/]+$'),
+        Rule(SgmlLinkExtractor(allow='google.com/[A-Z][a-zA-Z_/]+$'),
             'parse_category', follow=True,
         ),
     )
scrapy/contrib/linkextractors/__init__.py (new file, 7 lines)
@ -0,0 +1,7 @@
+"""
+scrapy.contrib.linkextractors
+
+This package contains a collection of Link Extractors.
+
+For more info see docs/ref/link-extractors.rst
+"""
@ -1,7 +1,8 @@
 """
-This module provides additional LinkExtractors, apart from the ones in scrapy.link
-and scrapy.link.extractors.
+This module implements the HTMLImageLinkExtractor for extracting
+image links only.
 """

 import urlparse

 from scrapy.link import Link
scrapy/contrib/linkextractors/regex.py (new file, 28 lines)
@ -0,0 +1,28 @@
+import re
+
+from scrapy.utils.url import urljoin_rfc as urljoin
+from scrapy.utils.markup import remove_tags, remove_entities, replace_escape_chars
+
+from scrapy.link import Link
+from .sgml import SgmlLinkExtractor
+
+linkre = re.compile(
+    "<a\s.*?href=(\"[.#]+?\"|\'[.#]+?\'|[^\s]+?)(>|\s.*?>)(.*?)<[/ ]?a>",
+    re.DOTALL | re.IGNORECASE)
+
+def clean_link(link_text):
+    """Remove leading and trailing whitespace and punctuation"""
+    return link_text.strip("\t\r\n '\"")
+
+class RegexLinkExtractor(SgmlLinkExtractor):
+    """High-performance link extractor"""
+
+    def _extract_links(self, response_text, response_url, response_encoding):
+        base_url = self.base_url if self.base_url else response_url
+
+        clean_url = lambda u: urljoin(base_url, remove_entities(clean_link(u.decode(response_encoding))))
+        clean_text = lambda t: replace_escape_chars(remove_tags(t.decode(response_encoding))).strip()
+
+        links_text = linkre.findall(response_text)
+        urlstext = set([(clean_url(url), clean_text(text)) for url, _, text in links_text])
+
+        return [Link(url, text) for url, text in urlstext]
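Since the new RegexLinkExtractor only overrides _extract_links, it is a
drop-in replacement for SgmlLinkExtractor that scans with a regex instead of
SGMLParser; a quick sketch (HTML and output are indicative only)::

    from scrapy.http import HtmlResponse
    from scrapy.contrib.linkextractors.regex import RegexLinkExtractor

    body = '<html><body><a href="page.html">  Some page </a></body></html>'
    response = HtmlResponse(url='http://example.com/', body=body)

    lx = RegexLinkExtractor()            # same constructor and filters as SgmlLinkExtractor
    links = lx.extract_links(response)   # links found by regex scan, then filtered as usual
    print [(l.url, l.text) for l in links]
    # e.g. [('http://example.com/page.html', u'Some page')]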
scrapy/contrib/linkextractors/sgml.py (new file, 126 lines)
@ -0,0 +1,126 @@
+"""
+SGMLParser-based Link extractors
+"""
+
+import re
+
+from scrapy.xpath import HtmlXPathSelector
+from scrapy.link import Link
+from scrapy.utils.misc import arg_to_iter
+from scrapy.utils.python import FixedSGMLParser, unique as unique_list, str_to_unicode
+from scrapy.utils.url import safe_url_string, urljoin_rfc as urljoin, canonicalize_url, url_is_from_any_domain
+
+class BaseSgmlLinkExtractor(FixedSGMLParser):
+
+    def __init__(self, tag="a", attr="href", unique=False, process_value=None):
+        FixedSGMLParser.__init__(self)
+        self.scan_tag = tag if callable(tag) else lambda t: t == tag
+        self.scan_attr = attr if callable(attr) else lambda a: a == attr
+        self.process_value = (lambda v: v) if process_value is None else process_value
+        self.current_link = None
+        self.unique = unique
+
+    def _extract_links(self, response_text, response_url, response_encoding):
+        self.reset()
+        self.feed(response_text)
+        self.close()
+
+        links = unique_list(self.links, key=lambda link: link.url) if self.unique else self.links
+
+        ret = []
+        base_url = self.base_url if self.base_url else response_url
+        for link in links:
+            link.url = urljoin(base_url, link.url)
+            link.url = safe_url_string(link.url, response_encoding)
+            link.text = str_to_unicode(link.text, response_encoding)
+            ret.append(link)
+
+        return ret
+
+    def extract_links(self, response):
+        # wrapper needed to allow to work directly with text
+        return self._extract_links(response.body, response.url, response.encoding)
+
+    def reset(self):
+        FixedSGMLParser.reset(self)
+        self.links = []
+        self.base_url = None
+
+    def unknown_starttag(self, tag, attrs):
+        if tag == 'base':
+            self.base_url = dict(attrs).get('href')
+        if self.scan_tag(tag):
+            for attr, value in attrs:
+                if self.scan_attr(attr):
+                    url = self.process_value(value)
+                    if url is not None:
+                        link = Link(url=url)
+                        self.links.append(link)
+                        self.current_link = link
+
+    def unknown_endtag(self, tag):
+        self.current_link = None
+
+    def handle_data(self, data):
+        if self.current_link and not self.current_link.text:
+            self.current_link.text = data.strip()
+
+    def matches(self, url):
+        """This extractor matches with any url, since
+        it doesn't contain any patterns"""
+        return True
+
+_re_type = type(re.compile("", 0))
+
+_matches = lambda url, regexs: any((r.search(url) for r in regexs))
+_is_valid_url = lambda url: url.split('://', 1)[0] in set(['http', 'https', 'file'])
+
+class SgmlLinkExtractor(BaseSgmlLinkExtractor):
+
+    def __init__(self, allow=(), deny=(), allow_domains=(), deny_domains=(), restrict_xpaths=(),
+                 tags=('a', 'area'), attrs=('href'), canonicalize=True, unique=True, process_value=None):
+        self.allow_res = [x if isinstance(x, _re_type) else re.compile(x) for x in arg_to_iter(allow)]
+        self.deny_res = [x if isinstance(x, _re_type) else re.compile(x) for x in arg_to_iter(deny)]
+        self.allow_domains = set(arg_to_iter(allow_domains))
+        self.deny_domains = set(arg_to_iter(deny_domains))
+        self.restrict_xpaths = tuple(arg_to_iter(restrict_xpaths))
+        self.canonicalize = canonicalize
+        tag_func = lambda x: x in tags
+        attr_func = lambda x: x in attrs
+        BaseSgmlLinkExtractor.__init__(self, tag=tag_func, attr=attr_func,
+            unique=unique, process_value=process_value)
+
+    def extract_links(self, response):
+        if self.restrict_xpaths:
+            hxs = HtmlXPathSelector(response)
+            html_slice = ''.join(''.join(html_fragm for html_fragm in hxs.x(xpath_expr).extract()) for xpath_expr in self.restrict_xpaths)
+            links = self._extract_links(html_slice, response.url, response.encoding)
+        else:
+            links = BaseSgmlLinkExtractor.extract_links(self, response)
+
+        links = [link for link in links if _is_valid_url(link.url)]
+
+        if self.allow_res:
+            links = [link for link in links if _matches(link.url, self.allow_res)]
+        if self.deny_res:
+            links = [link for link in links if not _matches(link.url, self.deny_res)]
+        if self.allow_domains:
+            links = [link for link in links if url_is_from_any_domain(link.url, self.allow_domains)]
+        if self.deny_domains:
+            links = [link for link in links if not url_is_from_any_domain(link.url, self.deny_domains)]
+
+        if self.canonicalize:
+            for link in links:
+                link.url = canonicalize_url(link.url)
+
+        return links
+
+    def matches(self, url):
+        if self.allow_domains and not url_is_from_any_domain(url, self.allow_domains):
+            return False
+        if self.deny_domains and url_is_from_any_domain(url, self.deny_domains):
+            return False
+
+        allowed = [regex.search(url) for regex in self.allow_res] if self.allow_res else [True]
+        denied = [regex.search(url) for regex in self.deny_res] if self.deny_res else []
+        return any(allowed) and not any(denied)
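A brief sketch of the tag/attr hooks that BaseSgmlLinkExtractor accepts:
strings are wrapped into equality checks in __init__, while passing callables
lets one extractor scan several tags at once (the HTML below is illustrative)::

    from scrapy.http import HtmlResponse
    from scrapy.contrib.linkextractors.sgml import BaseSgmlLinkExtractor

    body = '<html><body><area href="map.html"><a href="page.html">text</a></body></html>'
    response = HtmlResponse(url='http://example.com/', body=body)

    lx = BaseSgmlLinkExtractor(tag=lambda t: t in ('a', 'area'),
                               attr=lambda a: a == 'href')
    print [l.url for l in lx.extract_links(response)]
    # e.g. ['http://example.com/map.html', 'http://example.com/page.html']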
@ -1,5 +1,5 @@
 from scrapy.contrib.spiders import CrawlSpider, Rule
-from scrapy.link.extractors import RegexLinkExtractor
+from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor

 class GenericSpider(CrawlSpider):
     """

@ -10,11 +10,11 @@ class GenericSpider(CrawlSpider):
     def __init__(self, domain_name):
         self.domain_name = domain_name
         self.rules = (
-            Rule(RegexLinkExtractor(allow_domains=(domain_name,)), self.parse_note, follow=True),
+            Rule(SgmlLinkExtractor(allow_domains=(domain_name,)), self.parse_note, follow=True),
         )
         super(GenericSpider, self).__init__()

     def parse_note(self, response):
         pass

 # not a singleton
@ -10,7 +10,7 @@ from scrapy.utils.url import is_url
 from scrapy.utils.response import get_base_url
 from scrapy.utils.python import flatten, unicode_to_str
 from scrapy.xpath.selector import XPathSelector, XPathSelectorList
-from scrapy.contrib.link_extractors import HTMLImageLinkExtractor
+from scrapy.contrib.linkextractors.image import HTMLImageLinkExtractor

 def extract(location, adaptor_args=None):
     """
@ -1,76 +1,12 @@
 """
-LinkExtractor provides an efficient way to extract links from pages
+This module defines the Link object used in Link extractors.

-See documentation in docs/ref/link-extractors.rst
+For the actual link extractor implementations see scrapy.contrib.linkextractors,
+or its documentation in: docs/ref/link-extractors.rst
 """
-
-from scrapy.utils.python import FixedSGMLParser, unique as unique_list, str_to_unicode
-from scrapy.utils.url import safe_url_string, urljoin_rfc as urljoin
-
-class LinkExtractor(FixedSGMLParser):
-
-    def __init__(self, tag="a", attr="href", unique=False, process_value=None):
-        FixedSGMLParser.__init__(self)
-        self.scan_tag = tag if callable(tag) else lambda t: t == tag
-        self.scan_attr = attr if callable(attr) else lambda a: a == attr
-        self.process_value = (lambda v: v) if process_value is None else process_value
-        self.current_link = None
-        self.unique = unique
-
-    def _extract_links(self, response_text, response_url, response_encoding):
-        self.reset()
-        self.feed(response_text)
-        self.close()
-
-        links = unique_list(self.links, key=lambda link: link.url) if self.unique else self.links
-
-        ret = []
-        base_url = self.base_url if self.base_url else response_url
-        for link in links:
-            link.url = urljoin(base_url, link.url)
-            link.url = safe_url_string(link.url, response_encoding)
-            link.text = str_to_unicode(link.text, response_encoding)
-            ret.append(link)
-
-        return ret
-
-    def extract_links(self, response):
-        # wrapper needed to allow to work directly with text
-        return self._extract_links(response.body, response.url, response.encoding)
-
-    def reset(self):
-        FixedSGMLParser.reset(self)
-        self.links = []
-        self.base_url = None
-
-    def unknown_starttag(self, tag, attrs):
-        if tag == 'base':
-            self.base_url = dict(attrs).get('href')
-        if self.scan_tag(tag):
-            for attr, value in attrs:
-                if self.scan_attr(attr):
-                    url = self.process_value(value)
-                    if url is not None:
-                        link = Link(url=url)
-                        self.links.append(link)
-                        self.current_link = link
-
-    def unknown_endtag(self, tag):
-        self.current_link = None
-
-    def handle_data(self, data):
-        if self.current_link and not self.current_link.text:
-            self.current_link.text = data.strip()
-
-    def matches(self, url):
-        """This extractor matches with any url, since
-        it doesn't contain any patterns"""
-        return True
-

 class Link(object):
-    """
-    Link objects represent an extracted link by the LinkExtractor.
+    """Link objects represent an extracted link by the LinkExtractor.
     At the moment, it contains just the url and link text.
     """

@ -85,3 +21,26 @@ class Link(object):

     def __repr__(self):
         return '<Link url=%r text=%r >' % (self.url, self.text)
+
+
+# FIXME: code below is for backwards compatibility and should be removed before
+# the 0.7 release
+
+import warnings
+
+from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor, BaseSgmlLinkExtractor
+
+class LinkExtractor(BaseSgmlLinkExtractor):
+
+    def __init__(self, *args, **kwargs):
+        warnings.warn("scrapy.link.LinkExtractor is deprecated, use scrapy.contrib.linkextractors.sgml.BaseSgmlLinkExtractor instead",
+            DeprecationWarning, stacklevel=2)
+        BaseSgmlLinkExtractor.__init__(self, *args, **kwargs)
+
+class RegexLinkExtractor(SgmlLinkExtractor):
+
+    def __init__(self, *args, **kwargs):
+        warnings.warn("scrapy.link.RegexLinkExtractor is deprecated, use scrapy.contrib.linkextractors.sgml.SgmlLinkExtractor instead",
+            DeprecationWarning, stacklevel=2)
+        SgmlLinkExtractor.__init__(self, *args, **kwargs)
@ -1,68 +1,14 @@
-"""
-This module provides some LinkExtractors, which extend the base LinkExtractor
-(scrapy.link.LinkExtractor) with some additional useful features.
-
-See documentation in docs/ref/link-extractors.rst
-"""
-
-import re
-
-from scrapy.link import LinkExtractor
-from scrapy.utils.url import canonicalize_url, url_is_from_any_domain
-from scrapy.xpath import HtmlXPathSelector
-from scrapy.utils.misc import arg_to_iter
-
-_re_type = type(re.compile("", 0))
-
-_matches = lambda url, regexs: any((r.search(url) for r in regexs))
-_is_valid_url = lambda url: url.split('://', 1)[0] in set(['http', 'https', 'file'])
-
-class RegexLinkExtractor(LinkExtractor):
-
-    def __init__(self, allow=(), deny=(), allow_domains=(), deny_domains=(), restrict_xpaths=(),
-                 tags=('a', 'area'), attrs=('href'), canonicalize=True, unique=True, process_value=None):
-        self.allow_res = [x if isinstance(x, _re_type) else re.compile(x) for x in arg_to_iter(allow)]
-        self.deny_res = [x if isinstance(x, _re_type) else re.compile(x) for x in arg_to_iter(deny)]
-        self.allow_domains = set(arg_to_iter(allow_domains))
-        self.deny_domains = set(arg_to_iter(deny_domains))
-        self.restrict_xpaths = tuple(arg_to_iter(restrict_xpaths))
-        self.canonicalize = canonicalize
-        tag_func = lambda x: x in tags
-        attr_func = lambda x: x in attrs
-        LinkExtractor.__init__(self, tag=tag_func, attr=attr_func,
-            unique=unique, process_value=process_value)
-
-    def extract_links(self, response):
-        if self.restrict_xpaths:
-            hxs = HtmlXPathSelector(response)
-            html_slice = ''.join(''.join(html_fragm for html_fragm in hxs.x(xpath_expr).extract()) for xpath_expr in self.restrict_xpaths)
-            links = self._extract_links(html_slice, response.url, response.encoding)
-        else:
-            links = LinkExtractor.extract_links(self, response)
-
-        links = [link for link in links if _is_valid_url(link.url)]
-
-        if self.allow_res:
-            links = [link for link in links if _matches(link.url, self.allow_res)]
-        if self.deny_res:
-            links = [link for link in links if not _matches(link.url, self.deny_res)]
-        if self.allow_domains:
-            links = [link for link in links if url_is_from_any_domain(link.url, self.allow_domains)]
-        if self.deny_domains:
-            links = [link for link in links if not url_is_from_any_domain(link.url, self.deny_domains)]
-
-        if self.canonicalize:
-            for link in links:
-                link.url = canonicalize_url(link.url)
-
-        return links
-
-    def matches(self, url):
-        if self.allow_domains and not url_is_from_any_domain(url, self.allow_domains):
-            return False
-        if self.deny_domains and url_is_from_any_domain(url, self.deny_domains):
-            return False
-
-        allowed = [regex.search(url) for regex in self.allow_res] if self.allow_res else [True]
-        denied = [regex.search(url) for regex in self.deny_res] if self.deny_res else []
-        return any(allowed) and not any(denied)
+# FIXME: code below is for backwards compatibility and should be removed before
+# the 0.7 release
+
+import warnings
+
+from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
+
+class RegexLinkExtractor(SgmlLinkExtractor):
+
+    def __init__(self, *args, **kwargs):
+        warnings.warn("scrapy.link.extractors.RegexLinkExtractor is deprecated, use scrapy.contrib.linkextractors.sgml.SgmlLinkExtractor instead",
+            DeprecationWarning, stacklevel=2)
+        SgmlLinkExtractor.__init__(self, *args, **kwargs)
@ -2,9 +2,9 @@ import re
 import unittest

 from scrapy.http import HtmlResponse
-from scrapy.link import LinkExtractor, Link
-from scrapy.link.extractors import RegexLinkExtractor
-from scrapy.contrib.link_extractors import HTMLImageLinkExtractor
+from scrapy.link import Link
+from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor, BaseSgmlLinkExtractor
+from scrapy.contrib.linkextractors.image import HTMLImageLinkExtractor
 from scrapy.tests import get_testdata

 class LinkExtractorTestCase(unittest.TestCase):

@ -18,7 +18,7 @@ class LinkExtractorTestCase(unittest.TestCase):
         </body></html>"""
         response = HtmlResponse("http://example.org/somepage/index.html", body=html)

-        lx = LinkExtractor()  # default: tag=a, attr=href
+        lx = BaseSgmlLinkExtractor()  # default: tag=a, attr=href
         self.assertEqual(lx.extract_links(response),
                          [Link(url='http://example.org/somepage/item/12.html', text='Item 12'),
                           Link(url='http://example.org/about.html', text='About us'),

@ -31,7 +31,7 @@ class LinkExtractorTestCase(unittest.TestCase):
         </body></html>"""
         response = HtmlResponse("http://example.org/somepage/index.html", body=html)

-        lx = LinkExtractor()  # default: tag=a, attr=href
+        lx = BaseSgmlLinkExtractor()  # default: tag=a, attr=href
         self.assertEqual(lx.extract_links(response),
                          [Link(url='http://otherdomain.com/base/item/12.html', text='Item 12')])

@ -42,7 +42,7 @@ class LinkExtractorTestCase(unittest.TestCase):
         body = get_testdata('link_extractor', 'linkextractor_latin1.html')
         response_latin1 = HtmlResponse(url='http://example.com/latin1', body=body)

-        lx = LinkExtractor()
+        lx = BaseSgmlLinkExtractor()
         self.assertEqual(lx.extract_links(response_utf8),
                          [ Link(url='http://example.com/sample_%C3%B1.html', text=''),
                            Link(url='http://example.com/sample_%E2%82%AC.html', text='sample \xe2\x82\xac text'.decode('utf-8')) ])

@ -59,75 +59,75 @@ class LinkExtractorTestCase(unittest.TestCase):
         url1 = 'http://lotsofstuff.com/stuff1/index'
         url2 = 'http://evenmorestuff.com/uglystuff/index'

-        lx = LinkExtractor()
+        lx = BaseSgmlLinkExtractor()
         self.assertEqual(lx.matches(url1), True)
         self.assertEqual(lx.matches(url2), True)

-class RegexLinkExtractorTestCase(unittest.TestCase):
+class SgmlLinkExtractorTestCase(unittest.TestCase):
     def setUp(self):
-        body = get_testdata('link_extractor', 'regex_linkextractor.html')
+        body = get_testdata('link_extractor', 'sgml_linkextractor.html')
         self.response = HtmlResponse(url='http://example.com/index', body=body)

     def test_urls_type(self):
         '''Test that the resulting urls are regular strings and not unicode objects'''
-        lx = RegexLinkExtractor()
+        lx = SgmlLinkExtractor()
         self.assertTrue(all(isinstance(link.url, str) for link in lx.extract_links(self.response)))

     def test_extraction(self):
         '''Test the extractor's behaviour among different situations'''

-        lx = RegexLinkExtractor()
+        lx = SgmlLinkExtractor()
         self.assertEqual([link for link in lx.extract_links(self.response)],
                          [ Link(url='http://example.com/sample1.html', text=u''),
                            Link(url='http://example.com/sample2.html', text=u'sample 2'),
                            Link(url='http://example.com/sample3.html', text=u'sample 3 text'),
                            Link(url='http://www.google.com/something', text=u'') ])

-        lx = RegexLinkExtractor(allow=('sample', ))
+        lx = SgmlLinkExtractor(allow=('sample', ))
         self.assertEqual([link for link in lx.extract_links(self.response)],
                          [ Link(url='http://example.com/sample1.html', text=u''),
                            Link(url='http://example.com/sample2.html', text=u'sample 2'),
                            Link(url='http://example.com/sample3.html', text=u'sample 3 text') ])

-        lx = RegexLinkExtractor(allow=('sample', ), unique=False)
+        lx = SgmlLinkExtractor(allow=('sample', ), unique=False)
         self.assertEqual([link for link in lx.extract_links(self.response)],
                          [ Link(url='http://example.com/sample1.html', text=u''),
                            Link(url='http://example.com/sample2.html', text=u'sample 2'),
                            Link(url='http://example.com/sample3.html', text=u'sample 3 text'),
                            Link(url='http://example.com/sample3.html', text=u'sample 3 repetition') ])

-        lx = RegexLinkExtractor(allow=('sample', ), deny=('3', ))
+        lx = SgmlLinkExtractor(allow=('sample', ), deny=('3', ))
         self.assertEqual([link for link in lx.extract_links(self.response)],
                          [ Link(url='http://example.com/sample1.html', text=u''),
                            Link(url='http://example.com/sample2.html', text=u'sample 2') ])

-        lx = RegexLinkExtractor(allow_domains=('google.com', ))
+        lx = SgmlLinkExtractor(allow_domains=('google.com', ))
         self.assertEqual([link for link in lx.extract_links(self.response)],
                          [ Link(url='http://www.google.com/something', text=u'') ])

-        lx = RegexLinkExtractor(tags=('img', ), attrs=('src', ))
+        lx = SgmlLinkExtractor(tags=('img', ), attrs=('src', ))
         self.assertEqual([link for link in lx.extract_links(self.response)],
                          [ Link(url='http://example.com/sample2.jpg', text=u'') ])

     def test_extraction_using_single_values(self):
         '''Test the extractor's behaviour among different situations'''

-        lx = RegexLinkExtractor(allow='sample')
+        lx = SgmlLinkExtractor(allow='sample')
         self.assertEqual([link for link in lx.extract_links(self.response)],
                          [ Link(url='http://example.com/sample1.html', text=u''),
                            Link(url='http://example.com/sample2.html', text=u'sample 2'),
                            Link(url='http://example.com/sample3.html', text=u'sample 3 text') ])

-        lx = RegexLinkExtractor(allow='sample', deny='3')
+        lx = SgmlLinkExtractor(allow='sample', deny='3')
         self.assertEqual([link for link in lx.extract_links(self.response)],
                          [ Link(url='http://example.com/sample1.html', text=u''),
                            Link(url='http://example.com/sample2.html', text=u'sample 2') ])

-        lx = RegexLinkExtractor(allow_domains='google.com')
+        lx = SgmlLinkExtractor(allow_domains='google.com')
         self.assertEqual([link for link in lx.extract_links(self.response)],
                          [ Link(url='http://www.google.com/something', text=u'') ])

-        lx = RegexLinkExtractor(deny_domains='example.com')
+        lx = SgmlLinkExtractor(deny_domains='example.com')
         self.assertEqual([link for link in lx.extract_links(self.response)],
                          [ Link(url='http://www.google.com/something', text=u'') ])

@ -135,23 +135,23 @@ class RegexLinkExtractorTestCase(unittest.TestCase):
         url1 = 'http://lotsofstuff.com/stuff1/index'
         url2 = 'http://evenmorestuff.com/uglystuff/index'

-        lx = RegexLinkExtractor(allow=(r'stuff1', ))
+        lx = SgmlLinkExtractor(allow=(r'stuff1', ))
         self.assertEqual(lx.matches(url1), True)
         self.assertEqual(lx.matches(url2), False)

-        lx = RegexLinkExtractor(deny=(r'uglystuff', ))
+        lx = SgmlLinkExtractor(deny=(r'uglystuff', ))
         self.assertEqual(lx.matches(url1), True)
         self.assertEqual(lx.matches(url2), False)

-        lx = RegexLinkExtractor(allow_domains=('evenmorestuff.com', ))
+        lx = SgmlLinkExtractor(allow_domains=('evenmorestuff.com', ))
         self.assertEqual(lx.matches(url1), False)
         self.assertEqual(lx.matches(url2), True)

-        lx = RegexLinkExtractor(deny_domains=('lotsofstuff.com', ))
+        lx = SgmlLinkExtractor(deny_domains=('lotsofstuff.com', ))
         self.assertEqual(lx.matches(url1), False)
         self.assertEqual(lx.matches(url2), True)

-        lx = RegexLinkExtractor(allow=('blah1', ), deny=('blah2', ),
+        lx = SgmlLinkExtractor(allow=('blah1', ), deny=('blah2', ),
                                allow_domains=('blah1.com', ), deny_domains=('blah2.com', ))
         self.assertEqual(lx.matches('http://blah1.com/blah1'), True)
         self.assertEqual(lx.matches('http://blah1.com/blah2'), False)

@ -159,7 +159,7 @@ class RegexLinkExtractorTestCase(unittest.TestCase):
         self.assertEqual(lx.matches('http://blah2.com/blah2'), False)

     def test_restrict_xpaths(self):
-        lx = RegexLinkExtractor(restrict_xpaths=('//div[@id="subwrapper"]', ))
+        lx = SgmlLinkExtractor(restrict_xpaths=('//div[@id="subwrapper"]', ))
         self.assertEqual([link for link in lx.extract_links(self.response)],
                          [ Link(url='http://example.com/sample1.html', text=u''),
                            Link(url='http://example.com/sample2.html', text=u'sample 2') ])

@ -177,7 +177,7 @@ class RegexLinkExtractorTestCase(unittest.TestCase):
         </body></html>"""
         response = HtmlResponse("http://example.org/somepage/index.html", body=html, encoding='windows-1252')

-        lx = RegexLinkExtractor(restrict_xpaths="//div[@class='links']")
+        lx = SgmlLinkExtractor(restrict_xpaths="//div[@class='links']")
         self.assertEqual(lx.extract_links(response),
                          [Link(url='http://example.org/about.html', text=u'About us\xa3')])

@ -194,7 +194,7 @@ class RegexLinkExtractorTestCase(unittest.TestCase):
         if m:
             return m.group(1)

-        lx = RegexLinkExtractor(process_value=process_value)
+        lx = SgmlLinkExtractor(process_value=process_value)
         self.assertEqual(lx.extract_links(response),
                          [Link(url='http://example.org/other/page.html', text='Link text')])
@ -7,7 +7,7 @@ import re

 from scrapy.spider import BaseSpider
 from scrapy.item import ScrapedItem
-from scrapy.link import LinkExtractor
+from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
 from scrapy.http import Request

 class TestSpider(BaseSpider):

@ -20,7 +20,7 @@ class TestSpider(BaseSpider):
     price_re = re.compile(">Price: \$(.*?)<", re.M)

     def parse(self, response):
-        xlink = LinkExtractor()
+        xlink = SgmlLinkExtractor()
         itemre = re.compile(self.itemurl_re)
         for link in xlink.extract_links(response):
             if itemre.search(link.url):