mirror of https://github.com/scrapy/scrapy.git synced 2025-02-23 11:24:08 +00:00

promote LxmlLinkExtractor as default in docs

Daniel Graña 2014-06-25 14:34:30 -03:00
parent 90e69141d3
commit a9ecef5662
9 changed files with 26 additions and 19 deletions
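
The net effect: everywhere the docs, spider templates and tests previously imported the SGML-based extractor directly, they now go through the package-level default, which is the lxml-based extractor. A minimal before/after sketch (the allow pattern and callback name are taken from the overview example below; the Scrapy 0.24-era contrib paths are assumed):

    from scrapy.contrib.spiders import Rule

    # old style, removed throughout this commit
    from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
    old_rule = Rule(SgmlLinkExtractor(allow=r'/tor/\d+'), 'parse_torrent')

    # new style, promoted by this commit
    from scrapy.contrib.linkextractors import LinkExtractor
    new_rule = Rule(LinkExtractor(allow=r'/tor/\d+'), 'parse_torrent')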

View File

@@ -130,14 +130,14 @@ For more information about XPath see the `XPath reference`_.
 Finally, here's the spider code::
     from scrapy.contrib.spiders import CrawlSpider, Rule
-    from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
+    from scrapy.contrib.linkextractors import LinkExtractor
     class MininovaSpider(CrawlSpider):
         name = 'mininova'
         allowed_domains = ['mininova.org']
         start_urls = ['http://www.mininova.org/today']
-        rules = [Rule(SgmlLinkExtractor(allow=['/tor/\d+']), 'parse_torrent')]
+        rules = [Rule(LinkExtractor(allow=['/tor/\d+']), 'parse_torrent')]
         def parse_torrent(self, response):
             torrent = TorrentItem()

View File

@@ -69,7 +69,7 @@ those links. For example, the following one::
 So, based on that regular expression we can create the first crawling rule::
-    Rule(SgmlLinkExtractor(allow='directory.google.com/[A-Z][a-zA-Z_/]+$', ),
+    Rule(LinkExtractor(allow='directory.google.com/[A-Z][a-zA-Z_/]+$', ),
         'parse_category',
         follow=True,
         ),
@@ -81,7 +81,7 @@ process and extract data from those pages.
 This is how the spider would look so far::
-    from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
+    from scrapy.contrib.linkextractors import LinkExtractor
     from scrapy.contrib.spiders import CrawlSpider, Rule
     class GoogleDirectorySpider(CrawlSpider):
@@ -90,7 +90,7 @@ This is how the spider would look so far::
         start_urls = ['http://directory.google.com/']
         rules = (
-            Rule(SgmlLinkExtractor(allow='directory\.google\.com/[A-Z][a-zA-Z_/]+$'),
+            Rule(LinkExtractor(allow='directory\.google\.com/[A-Z][a-zA-Z_/]+$'),
                 'parse_category', follow=True,
             ),
         )

View File

@@ -36,6 +36,12 @@ Built-in link extractors reference
 All available link extractors classes bundled with Scrapy are provided in the
 :mod:`scrapy.contrib.linkextractors` module.
+If you don't know what link extractor to choose, just use the default which is
+the same than LxmlLinkExtractor (see below)::
+    from scrapy.contrib.linkextractors import LinkExtractor
 .. module:: scrapy.contrib.linkextractors.lxmlhtml
    :synopsis: lxml's HTMLParser-based link extractors
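
For context, the default extractor recommended in the added docs text above is used like any other link extractor; a rough sketch of a spider callback, assuming the 0.24-era contrib paths (the spider name, allow pattern and parse_category callback are illustrative, not part of this commit):

    from scrapy.http import Request
    from scrapy.spider import Spider
    from scrapy.contrib.linkextractors import LinkExtractor

    class CategorySpider(Spider):
        name = 'category_example'
        start_urls = ['http://example.org/']

        def parse(self, response):
            # extract_links() returns scrapy.link.Link objects with .url and .text
            for link in LinkExtractor(allow=r'/category/').extract_links(response):
                yield Request(link.url, callback=self.parse_category)

        def parse_category(self, response):
            pass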

View File

@@ -318,7 +318,7 @@ Let's now take a look at an example CrawlSpider with rules::
     import scrapy
     from scrapy.contrib.spiders import CrawlSpider, Rule
-    from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
+    from scrapy.contrib.linkextractors import LinkExtractor
     class MySpider(CrawlSpider):
         name = 'example.com'
@@ -328,10 +328,10 @@ Let's now take a look at an example CrawlSpider with rules::
         rules = (
             # Extract links matching 'category.php' (but not matching 'subsection.php')
             # and follow links from them (since no callback means follow=True by default).
-            Rule(SgmlLinkExtractor(allow=('category\.php', ), deny=('subsection\.php', ))),
+            Rule(LinkExtractor(allow=('category\.php', ), deny=('subsection\.php', ))),
             # Extract links matching 'item.php' and parse them with the spider's method parse_item
-            Rule(SgmlLinkExtractor(allow=('item\.php', )), callback='parse_item'),
+            Rule(LinkExtractor(allow=('item\.php', )), callback='parse_item'),
         )
         def parse_item(self, response):

View File

@@ -1,7 +1,8 @@
 """
 scrapy.contrib.linkextractors
-This package contains a collection of Link Extractors.
+This package contains a collection of Link Extractors.
+For more info see docs/topics/link-extractors.rst
 """
 from .lxmlhtml import LxmlLinkExtractor as LinkExtractor
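
Since the package-level LinkExtractor shown above is just an alias, both names resolve to the same class; a quick illustrative check:

    from scrapy.contrib.linkextractors import LinkExtractor
    from scrapy.contrib.linkextractors.lxmlhtml import LxmlLinkExtractor

    # the documented "default" is literally the lxml-based extractor
    assert LinkExtractor is LxmlLinkExtractor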

View File

@@ -1,5 +1,5 @@
 import scrapy
-from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
+from scrapy.contrib.linkextractors import LinkExtractor
 from scrapy.contrib.spiders import CrawlSpider, Rule
 from $project_name.items import ${ProjectName}Item
@@ -11,7 +11,7 @@ class $classname(CrawlSpider):
     start_urls = ['http://www.$domain/']
     rules = (
-        Rule(SgmlLinkExtractor(allow=r'Items/'), callback='parse_item', follow=True),
+        Rule(LinkExtractor(allow=r'Items/'), callback='parse_item', follow=True),
     )
     def parse_item(self, response):

View File

@@ -8,7 +8,7 @@ from urllib import urlencode
 from scrapy.spider import Spider
 from scrapy.http import Request
 from scrapy.item import Item
-from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
+from scrapy.contrib.linkextractors import LinkExtractor
 class MetaSpider(Spider):
@@ -26,7 +26,7 @@ class MetaSpider(Spider):
 class FollowAllSpider(MetaSpider):
     name = 'follow'
-    link_extractor = SgmlLinkExtractor()
+    link_extractor = LinkExtractor()
     def __init__(self, total=10, show=20, order="rand", maxlatency=0.0, *args, **kwargs):
         super(FollowAllSpider, self).__init__(*args, **kwargs)

View File

@@ -23,7 +23,7 @@ from scrapy.xlib.pydispatch import dispatcher
 from scrapy.tests import tests_datadir
 from scrapy.spider import Spider
 from scrapy.item import Item, Field
-from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
+from scrapy.contrib.linkextractors import LinkExtractor
 from scrapy.http import Request
 from scrapy.utils.signal import disconnect_all
@@ -41,7 +41,7 @@ class TestSpider(Spider):
     price_re = re.compile(">Price: \$(.*?)<", re.M)
     def parse(self, response):
-        xlink = SgmlLinkExtractor()
+        xlink = LinkExtractor()
         itemre = re.compile(self.itemurl_re)
         for link in xlink.extract_links(response):
             if itemre.search(link.url):
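
The parse() callback above filters extracted links with a regex before acting on them; roughly the same flow can be exercised standalone against a hand-built response (URL, HTML body and pattern here are made up for illustration):

    import re
    from scrapy.http import HtmlResponse
    from scrapy.contrib.linkextractors import LinkExtractor

    body = ('<a href="http://example.org/item1.html">one</a>'
            '<a href="http://example.org/about.html">about</a>')
    response = HtmlResponse('http://example.org/', body=body, encoding='utf-8')

    itemre = re.compile(r'item\d+\.html')
    for link in LinkExtractor().extract_links(response):
        if itemre.search(link.url):
            print(link.url)  # only the item1.html link matches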

View File

@@ -11,7 +11,7 @@ from scrapy.http import Request, Response, TextResponse, XmlResponse, HtmlResponse
 from scrapy.contrib.spiders.init import InitSpider
 from scrapy.contrib.spiders import CrawlSpider, Rule, XMLFeedSpider, \
     CSVFeedSpider, SitemapSpider
-from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
+from scrapy.contrib.linkextractors import LinkExtractor
 from scrapy.exceptions import ScrapyDeprecationWarning
@@ -124,7 +124,7 @@ class CrawlSpiderTest(SpiderTest):
             name="test"
             allowed_domains=['example.org']
             rules = (
-                Rule(SgmlLinkExtractor(), process_links="dummy_process_links"),
+                Rule(LinkExtractor(), process_links="dummy_process_links"),
             )
             def dummy_process_links(self, links):
@@ -150,7 +150,7 @@ class CrawlSpiderTest(SpiderTest):
             name="test"
             allowed_domains=['example.org']
             rules = (
-                Rule(SgmlLinkExtractor(), process_links="filter_process_links"),
+                Rule(LinkExtractor(), process_links="filter_process_links"),
             )
             _test_regex = re.compile('nofollow')
             def filter_process_links(self, links):
@@ -174,7 +174,7 @@ class CrawlSpiderTest(SpiderTest):
             name="test"
             allowed_domains=['example.org']
             rules = (
-                Rule(SgmlLinkExtractor(), process_links="dummy_process_links"),
+                Rule(LinkExtractor(), process_links="dummy_process_links"),
             )
             def dummy_process_links(self, links):