mirror of https://github.com/scrapy/scrapy.git synced 2025-02-23 11:24:08 +00:00

promote LxmlLinkExtractor as default in docs

Daniel Graña 2014-06-25 14:34:30 -03:00
parent 90e69141d3
commit a9ecef5662
9 changed files with 26 additions and 19 deletions
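
The net effect: everywhere the docs, spider templates and tests previously imported the SGML-based extractor directly, they now go through the package-level default, which is the lxml-based extractor. A minimal before/after sketch (the allow pattern and callback name are taken from the overview example below; the Scrapy 0.24-era contrib paths are assumed):

    from scrapy.contrib.spiders import Rule

    # old style, removed throughout this commit
    from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
    old_rule = Rule(SgmlLinkExtractor(allow=r'/tor/\d+'), 'parse_torrent')

    # new style, promoted by this commit
    from scrapy.contrib.linkextractors import LinkExtractor
    new_rule = Rule(LinkExtractor(allow=r'/tor/\d+'), 'parse_torrent')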

View File

@@ -130,14 +130,14 @@ For more information about XPath see the `XPath reference`_.
 Finally, here's the spider code::
     from scrapy.contrib.spiders import CrawlSpider, Rule
-    from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
+    from scrapy.contrib.linkextractors import LinkExtractor
     class MininovaSpider(CrawlSpider):
         name = 'mininova'
         allowed_domains = ['mininova.org']
         start_urls = ['http://www.mininova.org/today']
-        rules = [Rule(SgmlLinkExtractor(allow=['/tor/\d+']), 'parse_torrent')]
+        rules = [Rule(LinkExtractor(allow=['/tor/\d+']), 'parse_torrent')]
         def parse_torrent(self, response):
             torrent = TorrentItem()

View File

@@ -69,7 +69,7 @@ those links. For example, the following one::
 So, based on that regular expression we can create the first crawling rule::
-    Rule(SgmlLinkExtractor(allow='directory.google.com/[A-Z][a-zA-Z_/]+$', ),
+    Rule(LinkExtractor(allow='directory.google.com/[A-Z][a-zA-Z_/]+$', ),
         'parse_category',
         follow=True,
         ),
@@ -81,7 +81,7 @@ process and extract data from those pages.
 This is how the spider would look so far::
-    from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
+    from scrapy.contrib.linkextractors import LinkExtractor
     from scrapy.contrib.spiders import CrawlSpider, Rule
     class GoogleDirectorySpider(CrawlSpider):
@@ -90,7 +90,7 @@ This is how the spider would look so far::
         start_urls = ['http://directory.google.com/']
         rules = (
-            Rule(SgmlLinkExtractor(allow='directory\.google\.com/[A-Z][a-zA-Z_/]+$'),
+            Rule(LinkExtractor(allow='directory\.google\.com/[A-Z][a-zA-Z_/]+$'),
                 'parse_category', follow=True,
             ),
         )

View File

@@ -36,6 +36,12 @@ Built-in link extractors reference
 All available link extractors classes bundled with Scrapy are provided in the
 :mod:`scrapy.contrib.linkextractors` module.
+If you don't know what link extractor to choose, just use the default which is
+the same than LxmlLinkExtractor (see below)::
+    from scrapy.contrib.linkextractors import LinkExtractor
 .. module:: scrapy.contrib.linkextractors.lxmlhtml
    :synopsis: lxml's HTMLParser-based link extractors
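
For context, the default extractor recommended in the added docs text above is used like any other link extractor; a rough sketch of a spider callback, assuming the 0.24-era contrib paths (the spider name, allow pattern and parse_category callback are illustrative, not part of this commit):

    from scrapy.http import Request
    from scrapy.spider import Spider
    from scrapy.contrib.linkextractors import LinkExtractor

    class CategorySpider(Spider):
        name = 'category_example'
        start_urls = ['http://example.org/']

        def parse(self, response):
            # extract_links() returns scrapy.link.Link objects with .url and .text
            for link in LinkExtractor(allow=r'/category/').extract_links(response):
                yield Request(link.url, callback=self.parse_category)

        def parse_category(self, response):
            pass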

View File

@@ -318,7 +318,7 @@ Let's now take a look at an example CrawlSpider with rules::
     import scrapy
     from scrapy.contrib.spiders import CrawlSpider, Rule
-    from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
+    from scrapy.contrib.linkextractors import LinkExtractor
     class MySpider(CrawlSpider):
         name = 'example.com'
@@ -328,10 +328,10 @@ Let's now take a look at an example CrawlSpider with rules::
         rules = (
             # Extract links matching 'category.php' (but not matching 'subsection.php')
             # and follow links from them (since no callback means follow=True by default).
-            Rule(SgmlLinkExtractor(allow=('category\.php', ), deny=('subsection\.php', ))),
+            Rule(LinkExtractor(allow=('category\.php', ), deny=('subsection\.php', ))),
             # Extract links matching 'item.php' and parse them with the spider's method parse_item
-            Rule(SgmlLinkExtractor(allow=('item\.php', )), callback='parse_item'),
+            Rule(LinkExtractor(allow=('item\.php', )), callback='parse_item'),
         )
         def parse_item(self, response):

View File

@@ -1,7 +1,8 @@
 """
 scrapy.contrib.linkextractors
-This package contains a collection of Link Extractors.
+This package contains a collection of Link Extractors.
+For more info see docs/topics/link-extractors.rst
 """
 from .lxmlhtml import LxmlLinkExtractor as LinkExtractor
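
Since the package-level LinkExtractor shown above is just an alias, both names resolve to the same class; a quick illustrative check:

    from scrapy.contrib.linkextractors import LinkExtractor
    from scrapy.contrib.linkextractors.lxmlhtml import LxmlLinkExtractor

    # the documented "default" is literally the lxml-based extractor
    assert LinkExtractor is LxmlLinkExtractor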

View File

@@ -1,5 +1,5 @@
 import scrapy
-from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
+from scrapy.contrib.linkextractors import LinkExtractor
 from scrapy.contrib.spiders import CrawlSpider, Rule
 from $project_name.items import ${ProjectName}Item
@@ -11,7 +11,7 @@ class $classname(CrawlSpider):
     start_urls = ['http://www.$domain/']
     rules = (
-        Rule(SgmlLinkExtractor(allow=r'Items/'), callback='parse_item', follow=True),
+        Rule(LinkExtractor(allow=r'Items/'), callback='parse_item', follow=True),
     )
     def parse_item(self, response):

View File

@@ -8,7 +8,7 @@ from urllib import urlencode
 from scrapy.spider import Spider
 from scrapy.http import Request
 from scrapy.item import Item
-from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
+from scrapy.contrib.linkextractors import LinkExtractor
 class MetaSpider(Spider):
@@ -26,7 +26,7 @@ class MetaSpider(Spider):
 class FollowAllSpider(MetaSpider):
     name = 'follow'
-    link_extractor = SgmlLinkExtractor()
+    link_extractor = LinkExtractor()
     def __init__(self, total=10, show=20, order="rand", maxlatency=0.0, *args, **kwargs):
         super(FollowAllSpider, self).__init__(*args, **kwargs)

View File

@@ -23,7 +23,7 @@ from scrapy.xlib.pydispatch import dispatcher
 from scrapy.tests import tests_datadir
 from scrapy.spider import Spider
 from scrapy.item import Item, Field
-from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
+from scrapy.contrib.linkextractors import LinkExtractor
 from scrapy.http import Request
 from scrapy.utils.signal import disconnect_all
@@ -41,7 +41,7 @@ class TestSpider(Spider):
     price_re = re.compile(">Price: \$(.*?)<", re.M)
     def parse(self, response):
-        xlink = SgmlLinkExtractor()
+        xlink = LinkExtractor()
         itemre = re.compile(self.itemurl_re)
         for link in xlink.extract_links(response):
             if itemre.search(link.url):
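
The parse() callback above filters extracted links with a regex before acting on them; roughly the same flow can be exercised standalone against a hand-built response (URL, HTML body and pattern here are made up for illustration):

    import re
    from scrapy.http import HtmlResponse
    from scrapy.contrib.linkextractors import LinkExtractor

    body = ('<a href="http://example.org/item1.html">one</a>'
            '<a href="http://example.org/about.html">about</a>')
    response = HtmlResponse('http://example.org/', body=body, encoding='utf-8')

    itemre = re.compile(r'item\d+\.html')
    for link in LinkExtractor().extract_links(response):
        if itemre.search(link.url):
            print(link.url)  # only the item1.html link matches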

View File

@@ -11,7 +11,7 @@ from scrapy.http import Request, Response, TextResponse, XmlResponse, HtmlResponse
 from scrapy.contrib.spiders.init import InitSpider
 from scrapy.contrib.spiders import CrawlSpider, Rule, XMLFeedSpider, \
     CSVFeedSpider, SitemapSpider
-from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
+from scrapy.contrib.linkextractors import LinkExtractor
 from scrapy.exceptions import ScrapyDeprecationWarning
@@ -124,7 +124,7 @@ class CrawlSpiderTest(SpiderTest):
             name="test"
             allowed_domains=['example.org']
             rules = (
-                Rule(SgmlLinkExtractor(), process_links="dummy_process_links"),
+                Rule(LinkExtractor(), process_links="dummy_process_links"),
             )
             def dummy_process_links(self, links):
@@ -150,7 +150,7 @@ class CrawlSpiderTest(SpiderTest):
             name="test"
             allowed_domains=['example.org']
             rules = (
-                Rule(SgmlLinkExtractor(), process_links="filter_process_links"),
+                Rule(LinkExtractor(), process_links="filter_process_links"),
             )
             _test_regex = re.compile('nofollow')
             def filter_process_links(self, links):
@@ -174,7 +174,7 @@ class CrawlSpiderTest(SpiderTest):
             name="test"
             allowed_domains=['example.org']
             rules = (
-                Rule(SgmlLinkExtractor(), process_links="dummy_process_links"),
+                Rule(LinkExtractor(), process_links="dummy_process_links"),
             )
             def dummy_process_links(self, links):