Added the example project from the tutorial
--HG-- extra : convert_revision : svn%3Ab85faa78-f9eb-468e-a121-7cced6da292c%40657
This commit is contained in:
parent 9d6defa643
commit c80fef7eb0
0 scrapy/trunk/examples/google_bot/__init__.py Normal file
6 scrapy/trunk/examples/google_bot/items.py Normal file
@@ -0,0 +1,6 @@
# Define here the models for your scraped items

from scrapy.item import ScrapedItem

class MyItem(ScrapedItem):
    pass
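(For orientation: items in this Scrapy revision are populated imperatively. The sketch below only mirrors the set_adaptors()/attribute() calls that appear verbatim in google_directory.py later in this commit; the callback, field name and XPath are illustrative, not part of the committed code.)

from scrapy.xpath import HtmlXPathSelector
from scrapy.contrib import adaptors
from google_bot.items import MyItem

def parse(response):  # hypothetical callback receiving a downloaded response
    hxs = HtmlXPathSelector(response)
    item = MyItem()
    # Register an adaptor pipeline for the field, then attach the raw
    # selector result; the adaptors extract, de-list and strip the value.
    item.set_adaptors({'name': [adaptors.extract, adaptors.Delist(), adaptors.strip]})
    item.attribute('name', hxs.x('//h1/text()'))
    return [item]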
4 scrapy/trunk/examples/google_bot/scrapy-ctl.py Executable file
@@ -0,0 +1,4 @@
#!/usr/bin/env python

from scrapy.command.cmdline import execute
execute()
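scrapy-ctl.py is the project's manage.py-style entry point: it simply hands control to scrapy.command.cmdline.execute(), which dispatches to whatever commands this Scrapy revision ships (the command implementations are not part of this commit). Presumably the spider below would be started from the project directory with something like ./scrapy-ctl.py crawl google.com.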
96 scrapy/trunk/examples/google_bot/scrapy_settings.py Normal file
@@ -0,0 +1,96 @@
import google_bot

# ---------------------------------------------------------------------------
# - Scrapy settings for google_bot -
# ---------------------------------------------------------------------------

BOT_NAME = 'scrapybot'
BOT_VERSION = '1.0'

SPIDER_MODULES = ['google_bot.spiders']
NEWSPIDER_MODULE = 'google_bot.spiders'
TEMPLATES_DIR = '%s/templates' % google_bot.__path__[0]
ENABLED_SPIDERS_FILE = '%s/conf/enabled_spiders.list' % google_bot.__path__[0]
DEFAULT_ITEM_CLASS = 'scrapy.item.ScrapedItem'
USER_AGENT = '%s/%s' % (BOT_NAME, BOT_VERSION)
DOWNLOAD_TIMEOUT = 600

# uncomment if you want to add your own custom scrapy commands
#COMMANDS_MODULE = 'google_bot.commands'
#COMMANDS_SETTINGS_MODULE = 'google_bot.conf.commands'

# Global timeout between successive downloads (can be overridden by the
# spider attribute download_timeout)
#DOWNLOAD_TIMEOUT = 0

MYSQL_CONNECTION_SETTINGS = {"charset": "utf8"}
MYSQL_CONNECTION_PING_PERIOD = 600

SCHEDULER = 'scrapy.core.scheduler.Scheduler'
SCHEDULER_ORDER = 'BFO' # available orders: BFO (default), DFO

#CACHE2_DIR = '/tmp/cache2' # if set, enables HTTP cache
#CACHE2_IGNORE_MISSING = 0 # ignore requests not in cache
#CACHE2_SECTORIZE = 1 # sectorize domains to distribute storage among servers

#STATS_ENABLED = 1 # enable stats
#STATS_CLEANUP = 0 # cleanup domain stats when a domain is closed (saves memory)
#STATS_DEBUG = 0 # log stats on domain closed

EXTENSIONS = (
    'scrapy.management.web.WebConsole',
    'scrapy.management.telnet.TelnetConsole',
)

DOWNLOADER_MIDDLEWARES = (
    # Engine side
    'scrapy.contrib.downloadermiddleware.errorpages.ErrorPagesMiddleware',
    'scrapy.contrib.downloadermiddleware.cookies.CookiesMiddleware',
    'scrapy.contrib.downloadermiddleware.httpauth.HttpAuthMiddleware',
    'scrapy.contrib.downloadermiddleware.useragent.UserAgentMiddleware',
    'scrapy.contrib.downloadermiddleware.retry.RetryMiddleware',
    'scrapy.contrib.downloadermiddleware.common.CommonMiddleware',
    'scrapy.contrib.downloadermiddleware.redirect.RedirectMiddleware',
    'scrapy.contrib.downloadermiddleware.compression.CompressionMiddleware',
    'scrapy.contrib.downloadermiddleware.debug.CrawlDebug',
    'scrapy.contrib.downloadermiddleware.cache.CacheMiddleware',
    # Downloader side
)

SPIDER_MIDDLEWARES = (
    # Engine side
    'scrapy.contrib.spidermiddleware.limit.RequestLimitMiddleware',
    'scrapy.contrib.spidermiddleware.restrict.RestrictMiddleware',
    'scrapy.contrib.spidermiddleware.offsite.OffsiteMiddleware',
    'scrapy.contrib.spidermiddleware.referer.RefererMiddleware',
    'scrapy.contrib.spidermiddleware.urllength.UrlLengthMiddleware',
    'scrapy.contrib.spidermiddleware.depth.DepthMiddleware',
    'scrapy.contrib.spidermiddleware.urlfilter.UrlFilterMiddleware',
    # Spider side
)

# Item pipelines are usually configured by commands (see conf/commands)
ITEM_PIPELINES = (
    'scrapy.contrib.pipeline.show.ShowItemPipeline',
)

#DEPTH_LIMIT = 10 # limit the maximum link depth to follow
#DEPTH_STATS = 1 # enable depth stats

# Limit URL length. See: http://www.boutell.com/newfaq/misc/urllength.html
URLLENGTH_LIMIT = 2083

#WEBCONSOLE_ENABLED = 1
#WEBCONSOLE_PORT = 8060 # if not set uses a dynamic port

#TELNETCONSOLE_ENABLED = 1
#TELNETCONSOLE_PORT = 2020 # if not set uses a dynamic port

# global mail sending settings
#MAIL_HOST = 'localhost'
#MAIL_FROM = 'scrapybot@localhost'

# scrapy webservice
WS_ENABLED = 0

SPIDERPROFILER_ENABLED = 0
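(The commented DOWNLOAD_TIMEOUT block above mentions a per-spider download_timeout override. A minimal sketch of such an override, assuming only the BaseSpider shape shown in spider_basic.tmpl further down; the domain and URL are placeholders.)

from scrapy.spider import BaseSpider

class SlowSiteSpider(BaseSpider):
    domain_name = 'slow.example.com'               # placeholder
    start_urls = ['http://www.slow.example.com/']  # placeholder
    download_timeout = 1200  # overrides the global DOWNLOAD_TIMEOUT for this spider

    def parse(self, response):
        return ()

SPIDER = SlowSiteSpider()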
1 scrapy/trunk/examples/google_bot/spiders/__init__.py Normal file
@@ -0,0 +1 @@
# Place here all your scrapy spiders
44 scrapy/trunk/examples/google_bot/spiders/google_directory.py Normal file
@@ -0,0 +1,44 @@
# -*- coding: utf8 -*-
from scrapy.xpath import HtmlXPathSelector
from scrapy.item import ScrapedItem
from scrapy.contrib import adaptors
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.link.extractors import RegexLinkExtractor
from scrapy.utils.misc import items_to_csv


class GoogleDirectorySpider(CrawlSpider):
    domain_name = 'google.com'
    start_urls = ['http://www.google.com/dirhp']

    rules = (
        Rule(RegexLinkExtractor(allow=('google.com/[A-Z][a-zA-Z_/]+$', ), ),
             'parse_category',
             follow=True,
        ),
    )

    adaptor_pipe = [adaptors.extract, adaptors.Delist(), adaptors.strip]
    csv_file = open('scraped_items.csv', 'w')

    def parse_category(self, response):
        items = []  # The item (links to websites) list we're going to return
        hxs = HtmlXPathSelector(response)  # The selector we're going to use in order to extract data from the page
        links = hxs.x('//td[descendant::a[contains(@href, "#pagerank")]]/following-sibling::td/font')

        for link in links:
            item = ScrapedItem()
            item.set_adaptors({
                'name': self.adaptor_pipe,
                'url': self.adaptor_pipe,
                'description': self.adaptor_pipe,
            })

            item.attribute('name', link.x('a/text()'))
            item.attribute('url', link.x('a/@href'))
            item.attribute('description', link.x('font[2]/text()'))
            items.append(item)

        items_to_csv(self.csv_file, items)
        return items

SPIDER = GoogleDirectorySpider()
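Two things worth noting about the spider above: scraped_items.csv is opened as a class attribute, so the file is created (and truncated) as soon as the module is imported; and the single rule both follows matching category links and routes their responses to parse_category, where items_to_csv presumably serializes each batch of items to the open file. Assuming the crawl command addresses spiders by domain_name, the spider would be run with something like ./scrapy-ctl.py crawl google.com.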
13 scrapy/trunk/examples/google_bot/templates/spider_basic.tmpl Normal file
@@ -0,0 +1,13 @@
from scrapy.spider import BaseSpider


class $classname(BaseSpider):
    domain_name = "$site"
    start_urls = (
        'http://www.$site/',
    )

    def parse(self, response):
        return ()

SPIDER = $classname()
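(The $classname/$site placeholders suggest string.Template-style substitution when a new spider is generated from TEMPLATES_DIR. The generating command is not part of this commit, so the following is only an illustrative sketch.)

from string import Template

# Hypothetical expansion of the template above; the real command's
# behaviour and option names may differ.
source = Template(open('spider_basic.tmpl').read()).substitute(
    classname='ExampleSpider', site='example.com')
print(source)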
24 scrapy/trunk/examples/google_bot/templates/spider_crawl.tmpl Normal file
@@ -0,0 +1,24 @@
# -*- coding: utf8 -*-
import re

from scrapy.xpath import HtmlXPathSelector
from scrapy.item import ScrapedItem
from scrapy.link.extractors import RegexLinkExtractor
from scrapy.contrib.spiders import CrawlSpider, Rule


class $classname(CrawlSpider):
    domain_name = '$site'
    start_urls = ['http://www.$site/']

    rules = (
        Rule(RegexLinkExtractor(allow=(r'Items/', )), 'parse_item', follow=True),
    )

    def parse_item(self, response):
        #xs = HtmlXPathSelector(response)
        #i = ScrapedItem()
        #i.attribute('site_id', xs.x('//input[@id="sid"]/@value'))
        #i.attribute('name', xs.x('//div[@id="name"]'))
        #i.attribute('description', xs.x('//div[@id="description"]'))

SPIDER = $classname()
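Note that parse_item's body in this template is entirely commented out, so a spider generated from it will not compile until at least one statement is uncommented or added; the basic template above and the feed templates below avoid this by ending their callbacks with a return statement.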
@@ -0,0 +1,23 @@
# -*- coding: utf8 -*-
from scrapy.contrib.spiders import CSVFeedSpider


class $classname(CSVFeedSpider):
    domain_name = '$site'
    start_urls = ['http://www.$site/feed.csv']
    # headers = ['id', 'name', 'description', 'image_link']
    # delimiter = '\t'

    # Do any adaptations you need here
    #def adapt_response(self, response):
    #    return response

    def parse_row(self, response, row):
        p = self.create_product(response)
        #p.attribute('site_id', row['id'])
        #p.attribute('supplier', self.domain_name)
        #p.attribute('name', row['name'])
        #p.attribute('description', row['description'])
        #p.attribute('image_urls', row['image_link'])
        return p

SPIDER = $classname()
@@ -0,0 +1,20 @@
# -*- coding: utf8 -*-
from scrapy.contrib.spiders import XMLFeedSpider


class $classname(XMLFeedSpider):
    domain_name = '$site'
    start_urls = ['http://www.$site/feed.xml']

    def parse_item(self, response, xSel):
        p = self.create_product(response)
        #p.attribute('url', xSel(''))
        #p.attribute('supplier', self.domain_name)
        #p.attribute('site_id', xSel(''))
        #p.attribute('name', xSel(''))
        #p.attribute('description', xSel(''))
        #p.attribute('image_urls', xSel(''))
        #p.attribute('price', xSel(''))
        #p.attribute('dimensions', xSel(''))
        return p

SPIDER = $classname()