mirror of https://github.com/scrapy/scrapy.git
synced 2025-02-26 20:44:04 +00:00
more patches sent by Patrick
--HG-- extra : convert_revision : svn%3Ab85faa78-f9eb-468e-a121-7cced6da292c%40779
This commit is contained in:
parent c1a1b8945a
commit 4013497edb
@@ -204,6 +204,7 @@ Ok, done! Let's now sum this up into a spider::
             }
 
     def parse_page(self, response):
+        items = []
         rows = hxs.x('//tr[child::td[@class="prod_attrib"]]')
         for product in rows:
             item = ScrapedItem()
@@ -214,8 +215,9 @@ Ok, done! Let's now sum this up into a spider::
             item.attribute('description', product.x('td[@class="prod_attrib"][3]/text()'))
             item.attribute('weight', product.x('td[@class="prod_attrib"][4]/text()'))
             item.attribute('price', product.x('td[@class="prod_attrib"][5]/text()').re('(\d+)'))
+            items.append(item)
 
-        return [item]
+        return items
 
 SPIDER = MySpider()
 
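The spider hunks above fix a real bug in the tutorial example: `return [item]` returned a one-element list holding only the last scraped row, while the patched version accumulates every row in `items` and returns them all. For context, here is a minimal sketch of how the patched method reads. The class scaffolding, the `hxs` selector setup, and the import paths are reconstructions assumed from the era's Scrapy API, not part of this commit:

# Sketch: the tutorial's parse_page after this patch (reconstruction).
# The BaseSpider/HtmlXPathSelector import paths and the class scaffolding
# are assumptions; only the method body mirrors the diff above.
from scrapy.spider import BaseSpider            # assumed import path
from scrapy.item import ScrapedItem             # per DEFAULT_ITEM_CLASS below
from scrapy.xpath import HtmlXPathSelector      # assumed import path

class MySpider(BaseSpider):
    domain_name = 'example.com'                 # hypothetical

    def parse_page(self, response):
        hxs = HtmlXPathSelector(response)       # assumed; the hunk uses hxs directly
        items = []                              # added by this patch
        rows = hxs.x('//tr[child::td[@class="prod_attrib"]]')
        for product in rows:
            item = ScrapedItem()
            item.attribute('description', product.x('td[@class="prod_attrib"][3]/text()'))
            item.attribute('weight', product.x('td[@class="prod_attrib"][4]/text()'))
            item.attribute('price', product.x('td[@class="prod_attrib"][5]/text()').re('(\d+)'))
            items.append(item)                  # added: keep every row's item
        return items                            # was: return [item] (only the last row)

SPIDER = MySpider()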
@@ -13,6 +13,12 @@ TEMPLATES_DIR = '%s/templates' % $project_name.__path__[0]
 ENABLED_SPIDERS_FILE = '%s/conf/enabled_spiders.list' % $project_name.__path__[0]
 DEFAULT_ITEM_CLASS = 'scrapy.item.ScrapedItem'
 USER_AGENT = '%s/%s' % (BOT_NAME, BOT_VERSION)
+
+# The amount of time (in secs) that the downloader should wait before
+# downloading consecutive pages from the same spider. This can be used
+# to throttle the crawling speed to avoid hitting servers too
+# hard. Decimal numbers are supported. Example:
+# DOWNLOAD_DELAY = 2.5
 DOWNLOAD_TIMEOUT = 600
 
 # use this spider class as default when no spider was found for a given url
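This settings-template hunk only documents DOWNLOAD_DELAY; the setting itself stays commented out. As a usage sketch (the bot name and values other than those in the template are hypothetical), a generated project that wants to throttle its crawl would enable it like this:

# settings.py of a hypothetical generated project, with the new setting enabled.
BOT_NAME = 'mybot'         # hypothetical
BOT_VERSION = '1.0'        # hypothetical

USER_AGENT = '%s/%s' % (BOT_NAME, BOT_VERSION)

# Wait 2.5 seconds between consecutive pages from the same spider,
# to avoid hitting the target server too hard (decimals are supported).
DOWNLOAD_DELAY = 2.5
DOWNLOAD_TIMEOUT = 600     # template default: give up on a download after 600 secs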