modified doc to reflect the new spider callback return policy (lists not needed)
parent 802f918b69
commit 5862ba7db7
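
In practical terms, the policy change documented here means a spider callback may return a single item (or nothing) directly, where the docs previously required a list. Below is a minimal, hedged sketch of the before/after in the 0.x-era style used throughout these hunks; the item class, field names, import paths and URLs are illustrative assumptions, not taken from the patch::

    from scrapy.spider import BaseSpider            # import paths assumed for this era
    from scrapy.selector import HtmlXPathSelector
    from scrapy.item import Item, Field

    class TorrentItem(Item):                        # hypothetical item, not from the patch
        name = Field()

    class TorrentSpider(BaseSpider):
        domain_name = 'example.org'                 # illustrative only
        start_urls = ['http://www.example.org/tor/1']

        def parse(self, response):
            x = HtmlXPathSelector(response)
            torrent = TorrentItem()
            torrent['name'] = x.select("//h1/text()").extract()
            # Old policy: callbacks had to return a list, e.g.  return [torrent]
            # New policy: the bare item (or an empty/None result) is acceptable.
            return torrent

    SPIDER = TorrentSpider()                        # module-level instance, as in these docs
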
@@ -138,7 +138,7 @@ Finally, here's the spider code::
         torrent['name'] = x.select("//h1/text()").extract()
         torrent['description'] = x.select("//div[@id='description']").extract()
         torrent['size'] = x.select("//div[@id='info-left']/p[2]/text()[2]").extract()
-        return [torrent]
+        return torrent


 For brevity sake, we intentionally left out the import statements and the
@@ -137,7 +137,6 @@ This is the code for our first Spider, save it in a file named
         def parse(self, response):
             filename = response.url.split("/")[-2]
             open(filename, 'wb').write(response.body)
-            return []

     SPIDER = DmozSpider()

@@ -369,7 +368,6 @@ Let's add this code to our spider::
                 link = site.select('a/@href').extract()
                 desc = site.select('text()').extract()
                 print title, link, desc
-            return []

     SPIDER = DmozSpider()

@@ -22,11 +22,11 @@ For spiders, the scraping cycle goes through something like this:
    :attr:`~scrapy.spider.BaseSpider.parse` method as callback function for the
    Requests.

-2. In the callback function you parse the response (web page) and return an
-   iterable containing either :class:`~scrapy.item.Item` objects,
-   :class:`~scrapy.http.Request` objects, or both. Those Requests will also
-   contain a callback (maybe the same) and will then be followed by downloaded
-   by Scrapy and then their response handled to the specified callback.
+2. In the callback function you parse the response (web page) and return either
+   :class:`~scrapy.item.Item` objects, :class:`~scrapy.http.Request` objects,
+   or an iterable of both. Those Requests will also contain a callback (maybe
+   the same) and will then be followed by downloaded by Scrapy and then their
+   response handled to the specified callback.

 3. In callback functions you parse the page contants, typically using
    :ref:`topics-selectors` (but you can also use BeautifuSoup, lxml or whatever
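
To make step 2 of that cycle concrete, here is a hedged sketch of a callback that returns an iterable mixing follow-up Request objects (each carrying its own callback) with item objects; all names, URLs and import paths below are illustrative assumptions rather than part of this patch::

    import urlparse                                 # Python 2, matching the era of these docs

    from scrapy.spider import BaseSpider           # import paths assumed for this era
    from scrapy.selector import HtmlXPathSelector
    from scrapy.http import Request
    from scrapy.item import Item, Field

    class PageItem(Item):                           # hypothetical item, not from the patch
        url = Field()

    class MySpider(BaseSpider):
        domain_name = 'example.com'                 # illustrative only
        start_urls = ['http://www.example.com/']

        def parse(self, response):
            hxs = HtmlXPathSelector(response)
            results = []
            for href in hxs.select('//a/@href').extract():
                url = urlparse.urljoin(response.url, href)
                # Each Request carries a callback; Scrapy downloads it and hands the
                # response to that callback, as described in step 2 above.
                results.append(Request(url, callback=self.parse_page))
            return results                          # an iterable of Requests (and/or items)

        def parse_page(self, response):
            item = PageItem()
            item['url'] = response.url
            return item                             # under the new policy, a bare item is fine

    SPIDER = MySpider()
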
@@ -138,9 +138,8 @@ BaseSpider
       will be used to parse the first pages crawled by the spider.

       The ``parse`` method is in charge of processing the response and returning
-      scraped data and/or more URLs to follow, because of this, the method must
-      always return a list or at least an empty one. Other Requests callbacks
-      have the same requirements as the BaseSpider class.
+      scraped data and/or more URLs to follow. Other Requests callbacks have
+      the same requirements as the BaseSpider class.

    .. method:: log(message, [level, component])

@@ -167,7 +166,6 @@ Let's see an example::

         def parse(self, response):
             self.log('A response from %s just arrived!' % response.url)
-            return []

     SPIDER = MySpider()

@@ -251,7 +249,7 @@ Let's now take a look at an example CrawlSpider with rules::
             item['id'] = hxs.select('//td[@id="item_id"]/text()').re(r'ID: (\d+)')
             item['name'] = hxs.select('//td[@id="item_name"]/text()').extract()
             item['description'] = hxs.select('//td[@id="item_description"]/text()').extract()
-            return [item]
+            return item

     SPIDER = MySpider()
