mirror of https://github.com/scrapy/scrapy.git (synced 2025-02-23 11:24:08 +00:00)

commit a9ecef5662
parent 90e69141d3

    promote LxmlLinkExtractor as default in docs
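Every hunk below makes the same substitution: the SGML-based extractor, imported from its submodule, is replaced by the package-level default, which is an alias for LxmlLinkExtractor. A minimal before/after sketch of the swap (the Rule line is illustrative, lifted from the crawl template hunk further down):

    # before: the SGML-based extractor, imported from its submodule
    from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor

    # after: the package-level default, an alias for LxmlLinkExtractor
    from scrapy.contrib.linkextractors import LinkExtractor

    # existing rules only need the class name swapped, e.g.
    # Rule(SgmlLinkExtractor(allow=r'Items/'), callback='parse_item', follow=True)
    # becomes
    # Rule(LinkExtractor(allow=r'Items/'), callback='parse_item', follow=True)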
@@ -130,14 +130,14 @@ For more information about XPath see the `XPath reference`_.
 Finally, here's the spider code::
 
     from scrapy.contrib.spiders import CrawlSpider, Rule
-    from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
+    from scrapy.contrib.linkextractors import LinkExtractor
 
     class MininovaSpider(CrawlSpider):
 
         name = 'mininova'
         allowed_domains = ['mininova.org']
         start_urls = ['http://www.mininova.org/today']
-        rules = [Rule(SgmlLinkExtractor(allow=['/tor/\d+']), 'parse_torrent')]
+        rules = [Rule(LinkExtractor(allow=['/tor/\d+']), 'parse_torrent')]
 
         def parse_torrent(self, response):
             torrent = TorrentItem()
@@ -69,7 +69,7 @@ those links. For example, the following one::
 
 So, based on that regular expression we can create the first crawling rule::
 
-   Rule(SgmlLinkExtractor(allow='directory.google.com/[A-Z][a-zA-Z_/]+$', ),
+   Rule(LinkExtractor(allow='directory.google.com/[A-Z][a-zA-Z_/]+$', ),
        'parse_category',
        follow=True,
    ),
@@ -81,7 +81,7 @@ process and extract data from those pages.
 
 This is how the spider would look so far::
 
-    from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
+    from scrapy.contrib.linkextractors import LinkExtractor
     from scrapy.contrib.spiders import CrawlSpider, Rule
 
     class GoogleDirectorySpider(CrawlSpider):
@@ -90,7 +90,7 @@ This is how the spider would look so far::
         start_urls = ['http://directory.google.com/']
 
         rules = (
-            Rule(SgmlLinkExtractor(allow='directory\.google\.com/[A-Z][a-zA-Z_/]+$'),
+            Rule(LinkExtractor(allow='directory\.google\.com/[A-Z][a-zA-Z_/]+$'),
                 'parse_category', follow=True,
             ),
         )
@@ -36,6 +36,12 @@ Built-in link extractors reference
 All available link extractor classes bundled with Scrapy are provided in the
 :mod:`scrapy.contrib.linkextractors` module.
 
+If you don't know which link extractor to choose, just use the default, which is
+the same as LxmlLinkExtractor (see below)::
+
+    from scrapy.contrib.linkextractors import LinkExtractor
+
+
 .. module:: scrapy.contrib.linkextractors.lxmlhtml
     :synopsis: lxml's HTMLParser-based link extractors
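To illustrate the default recommended in the paragraph added above, here is a minimal usage sketch (the allow pattern is borrowed from the mininova example earlier in this commit, and the response is assumed to be an already-fetched HtmlResponse):

    from scrapy.contrib.linkextractors import LinkExtractor

    def extract_torrent_urls(response):
        # the package-level LinkExtractor is LxmlLinkExtractor under the hood
        extractor = LinkExtractor(allow=r'/tor/\d+')
        # extract_links() returns Link objects; each one carries an absolute URL
        return [link.url for link in extractor.extract_links(response)]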
@@ -318,7 +318,7 @@ Let's now take a look at an example CrawlSpider with rules::
 
     import scrapy
     from scrapy.contrib.spiders import CrawlSpider, Rule
-    from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
+    from scrapy.contrib.linkextractors import LinkExtractor
 
     class MySpider(CrawlSpider):
         name = 'example.com'
@@ -328,10 +328,10 @@ Let's now take a look at an example CrawlSpider with rules::
         rules = (
             # Extract links matching 'category.php' (but not matching 'subsection.php')
             # and follow links from them (since no callback means follow=True by default).
-            Rule(SgmlLinkExtractor(allow=('category\.php', ), deny=('subsection\.php', ))),
+            Rule(LinkExtractor(allow=('category\.php', ), deny=('subsection\.php', ))),
 
             # Extract links matching 'item.php' and parse them with the spider's method parse_item
-            Rule(SgmlLinkExtractor(allow=('item\.php', )), callback='parse_item'),
+            Rule(LinkExtractor(allow=('item\.php', )), callback='parse_item'),
         )
 
         def parse_item(self, response):
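For completeness, a hedged sketch of what the parse_item callback named by the second rule might do; the item class and XPath expressions are illustrative assumptions, not part of this diff:

    from scrapy.item import Item, Field
    from scrapy.selector import Selector

    class ExampleItem(Item):
        # illustrative fields only
        id = Field()
        name = Field()

    # a method of the MySpider class shown in the hunk above
    def parse_item(self, response):
        self.log('Hi, this is an item page! %s' % response.url)
        sel = Selector(response)
        item = ExampleItem()
        item['id'] = sel.xpath('//td[@id="item_id"]/text()').re(r'ID: (\d+)')
        item['name'] = sel.xpath('//td[@id="item_name"]/text()').extract()
        return item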
@@ -1,7 +1,8 @@
 """
 scrapy.contrib.linkextractors
 
 This package contains a collection of Link Extractors.
 
 For more info see docs/topics/link-extractors.rst
 """
+from .lxmlhtml import LxmlLinkExtractor as LinkExtractor
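The added import is what makes the package-level name work: LinkExtractor is simply LxmlLinkExtractor under another name. A quick sanity check, assuming both modules are importable:

    from scrapy.contrib.linkextractors import LinkExtractor
    from scrapy.contrib.linkextractors.lxmlhtml import LxmlLinkExtractor

    # the package-level default is only an alias, so both names refer to the same class
    assert LinkExtractor is LxmlLinkExtractor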
@@ -1,5 +1,5 @@
 import scrapy
-from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
+from scrapy.contrib.linkextractors import LinkExtractor
 from scrapy.contrib.spiders import CrawlSpider, Rule
 
 from $project_name.items import ${ProjectName}Item
@@ -11,7 +11,7 @@ class $classname(CrawlSpider):
     start_urls = ['http://www.$domain/']
 
     rules = (
-        Rule(SgmlLinkExtractor(allow=r'Items/'), callback='parse_item', follow=True),
+        Rule(LinkExtractor(allow=r'Items/'), callback='parse_item', follow=True),
     )
 
     def parse_item(self, response):
@@ -8,7 +8,7 @@ from urllib import urlencode
 from scrapy.spider import Spider
 from scrapy.http import Request
 from scrapy.item import Item
-from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
+from scrapy.contrib.linkextractors import LinkExtractor
 
 
 class MetaSpider(Spider):
@@ -26,7 +26,7 @@ class MetaSpider(Spider):
 class FollowAllSpider(MetaSpider):
 
     name = 'follow'
-    link_extractor = SgmlLinkExtractor()
+    link_extractor = LinkExtractor()
 
     def __init__(self, total=10, show=20, order="rand", maxlatency=0.0, *args, **kwargs):
         super(FollowAllSpider, self).__init__(*args, **kwargs)
@@ -23,7 +23,7 @@ from scrapy.xlib.pydispatch import dispatcher
 from scrapy.tests import tests_datadir
 from scrapy.spider import Spider
 from scrapy.item import Item, Field
-from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
+from scrapy.contrib.linkextractors import LinkExtractor
 from scrapy.http import Request
 from scrapy.utils.signal import disconnect_all
 
@@ -41,7 +41,7 @@ class TestSpider(Spider):
     price_re = re.compile(">Price: \$(.*?)<", re.M)
 
     def parse(self, response):
-        xlink = SgmlLinkExtractor()
+        xlink = LinkExtractor()
         itemre = re.compile(self.itemurl_re)
         for link in xlink.extract_links(response):
             if itemre.search(link.url):
@@ -11,7 +11,7 @@ from scrapy.http import Request, Response, TextResponse, XmlResponse, HtmlRespon
 from scrapy.contrib.spiders.init import InitSpider
 from scrapy.contrib.spiders import CrawlSpider, Rule, XMLFeedSpider, \
     CSVFeedSpider, SitemapSpider
-from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
+from scrapy.contrib.linkextractors import LinkExtractor
 from scrapy.exceptions import ScrapyDeprecationWarning
 
 
@@ -124,7 +124,7 @@ class CrawlSpiderTest(SpiderTest):
             name="test"
             allowed_domains=['example.org']
             rules = (
-                Rule(SgmlLinkExtractor(), process_links="dummy_process_links"),
+                Rule(LinkExtractor(), process_links="dummy_process_links"),
             )
 
             def dummy_process_links(self, links):
@@ -150,7 +150,7 @@ class CrawlSpiderTest(SpiderTest):
             name="test"
             allowed_domains=['example.org']
             rules = (
-                Rule(SgmlLinkExtractor(), process_links="filter_process_links"),
+                Rule(LinkExtractor(), process_links="filter_process_links"),
             )
             _test_regex = re.compile('nofollow')
             def filter_process_links(self, links):
@@ -174,7 +174,7 @@ class CrawlSpiderTest(SpiderTest):
             name="test"
             allowed_domains=['example.org']
             rules = (
-                Rule(SgmlLinkExtractor(), process_links="dummy_process_links"),
+                Rule(LinkExtractor(), process_links="dummy_process_links"),
             )
 
             def dummy_process_links(self, links):
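The three test cases above exercise Rule's process_links hook, which receives the list of extracted links before the spider follows them. A hedged sketch of a filtering implementation along the lines of filter_process_links, reusing the nofollow pattern from _test_regex in the test:

    import re

    _test_regex = re.compile('nofollow')

    # a method of the spider class defined in the test above
    def filter_process_links(self, links):
        # whatever this hook returns is what the CrawlSpider goes on to follow;
        # here, any link whose URL matches the pattern is dropped
        return [link for link in links if not self._test_regex.search(link.url)]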