mirror of https://github.com/scrapy/scrapy.git synced 2025-02-26 22:04:01 +00:00

Added the example project from the tutorial

--HG--
extra : convert_revision : svn%3Ab85faa78-f9eb-468e-a121-7cced6da292c%40657
This commit is contained in:
elpolilla 2009-01-06 16:08:03 +00:00
parent 9d6defa643
commit c80fef7eb0
10 changed files with 231 additions and 0 deletions

View File

@@ -0,0 +1,6 @@
# Define here the models for your scraped items
from scrapy.item import ScrapedItem
class MyItem(ScrapedItem):
    pass
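The Google directory spider later in this commit shows how such an item is used: adaptor pipelines are attached with set_adaptors() and values are assigned with attribute(). A condensed sketch of that pattern applied to MyItem, assuming the same pre-1.0 ScrapedItem API the spider relies on (the google_bot.items module path, the field names and the literal values are illustrative only):

from scrapy.contrib import adaptors
from google_bot.items import MyItem

item = MyItem()
# one adaptor pipeline per field, exactly as parse_category() does below
pipe = [adaptors.extract, adaptors.Delist(), adaptors.strip]
item.set_adaptors({'name': pipe, 'url': pipe})
# in the spider these values come from XPath selectors such as link.x('a/text()')
item.attribute('name', ['  Example site  '])        # illustrative raw value
item.attribute('url', ['http://www.example.com/'])  # illustrative raw value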

View File

@@ -0,0 +1,4 @@
#!/usr/bin/env python
from scrapy.command.cmdline import execute
execute()

View File

@@ -0,0 +1,96 @@
import google_bot
# ---------------------------------------------------------------------------
# - Scrapy settings for google_bot -
# ---------------------------------------------------------------------------
BOT_NAME = 'scrapybot'
BOT_VERSION = '1.0'
SPIDER_MODULES = ['google_bot.spiders']
NEWSPIDER_MODULE = 'google_bot.spiders'
TEMPLATES_DIR = '%s/templates' % google_bot.__path__[0]
ENABLED_SPIDERS_FILE = '%s/conf/enabled_spiders.list' % google_bot.__path__[0]
DEFAULT_ITEM_CLASS = 'scrapy.item.ScrapedItem'
USER_AGENT = '%s/%s' % (BOT_NAME, BOT_VERSION)
DOWNLOAD_TIMEOUT = 600
# uncomment if you want to add your own custom scrapy commands
#COMMANDS_MODULE = 'google_bot.commands'
#COMMANDS_SETTINGS_MODULE = 'google_bot.conf.commands'
#Global timeout between successive downloads (can be overridden by the spider
#attribute download_timeout)
#DOWNLOAD_TIMEOUT = 0
MYSQL_CONNECTION_SETTINGS = {"charset": "utf8" }
MYSQL_CONNECTION_PING_PERIOD = 600
SCHEDULER = 'scrapy.core.scheduler.Scheduler'
SCHEDULER_ORDER = 'BFO' # available orders: BFO (default), DFO
#CACHE2_DIR = '/tmp/cache2' # if set, enables HTTP cache
#CACHE2_IGNORE_MISSING = 0 # ignore requests not in cache
#CACHE2_SECTORIZE = 1 # sectorize domains to distribute storage among servers
#STATS_ENABLED = 1 # enable stats
#STATS_CLEANUP = 0 # cleanup domain stats when a domain is closed (saves memory)
#STATS_DEBUG = 0 # log stats on domain closed
EXTENSIONS = (
    'scrapy.management.web.WebConsole',
    'scrapy.management.telnet.TelnetConsole',
)
DOWNLOADER_MIDDLEWARES = (
    # Engine side
    'scrapy.contrib.downloadermiddleware.errorpages.ErrorPagesMiddleware',
    'scrapy.contrib.downloadermiddleware.cookies.CookiesMiddleware',
    'scrapy.contrib.downloadermiddleware.httpauth.HttpAuthMiddleware',
    'scrapy.contrib.downloadermiddleware.useragent.UserAgentMiddleware',
    'scrapy.contrib.downloadermiddleware.retry.RetryMiddleware',
    'scrapy.contrib.downloadermiddleware.common.CommonMiddleware',
    'scrapy.contrib.downloadermiddleware.redirect.RedirectMiddleware',
    'scrapy.contrib.downloadermiddleware.compression.CompressionMiddleware',
    'scrapy.contrib.downloadermiddleware.debug.CrawlDebug',
    'scrapy.contrib.downloadermiddleware.cache.CacheMiddleware',
    # Downloader side
)
SPIDER_MIDDLEWARES = (
    # Engine side
    'scrapy.contrib.spidermiddleware.limit.RequestLimitMiddleware',
    'scrapy.contrib.spidermiddleware.restrict.RestrictMiddleware',
    'scrapy.contrib.spidermiddleware.offsite.OffsiteMiddleware',
    'scrapy.contrib.spidermiddleware.referer.RefererMiddleware',
    'scrapy.contrib.spidermiddleware.urllength.UrlLengthMiddleware',
    'scrapy.contrib.spidermiddleware.depth.DepthMiddleware',
    'scrapy.contrib.spidermiddleware.urlfilter.UrlFilterMiddleware',
    # Spider side
)
# Item pipelines are usually configured by commands (see conf/commands)
ITEM_PIPELINES = (
    'scrapy.contrib.pipeline.show.ShowItemPipeline',
)
#DEPTH_LIMIT = 10 # limit the maximum link depth to follow
#DEPTH_STATS = 1 # enable depth stats
# Limit URL length. See: http://www.boutell.com/newfaq/misc/urllength.html
URLLENGTH_LIMIT = 2083
#WEBCONSOLE_ENABLED = 1
#WEBCONSOLE_PORT = 8060 # if not set uses a dynamic port
#TELNETCONSOLE_ENABLED = 1
#TELNETCONSOLE_PORT = 2020 # if not set uses a dynamic port
# global mail sending settings
#MAIL_HOST = 'localhost'
#MAIL_FROM = 'scrapybot@localhost'
# scrapy webservice
WS_ENABLED = 0
SPIDERPROFILER_ENABLED = 0
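The SCHEDULER_ORDER setting above selects between breadth-first (BFO) and depth-first (DFO) crawl order. A rough, Scrapy-independent sketch of the difference, using a toy link map purely for illustration (this is not the actual scheduler code):

from collections import deque

def crawl_order(start, children, order='BFO'):
    """Return the order in which links would be visited under BFO or DFO."""
    pending, visited = deque([start]), []
    while pending:
        # BFO pops the oldest pending link (a queue), DFO the newest (a stack)
        url = pending.popleft() if order == 'BFO' else pending.pop()
        visited.append(url)
        pending.extend(children.get(url, []))
    return visited

links = {'/': ['/a', '/b'], '/a': ['/a/1'], '/b': ['/b/1']}
print(crawl_order('/', links, 'BFO'))  # ['/', '/a', '/b', '/a/1', '/b/1']
print(crawl_order('/', links, 'DFO'))  # ['/', '/b', '/b/1', '/a', '/a/1']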

View File

@@ -0,0 +1 @@
# Place here all your scrapy spiders

View File

@@ -0,0 +1,44 @@
# -*- coding: utf8 -*-
from scrapy.xpath import HtmlXPathSelector
from scrapy.item import ScrapedItem
from scrapy.contrib import adaptors
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.link.extractors import RegexLinkExtractor
from scrapy.utils.misc import items_to_csv
class GoogleDirectorySpider(CrawlSpider):
    domain_name = 'google.com'
    start_urls = ['http://www.google.com/dirhp']

    rules = (
        Rule(RegexLinkExtractor(allow=('google.com/[A-Z][a-zA-Z_/]+$', ), ),
            'parse_category',
            follow=True,
        ),
    )

    adaptor_pipe = [adaptors.extract, adaptors.Delist(), adaptors.strip]
    csv_file = open('scraped_items.csv', 'w')

    def parse_category(self, response):
        items = []  # The item (links to websites) list we're going to return
        hxs = HtmlXPathSelector(response)  # The selector we're going to use in order to extract data from the page
        links = hxs.x('//td[descendant::a[contains(@href, "#pagerank")]]/following-sibling::td/font')

        for link in links:
            item = ScrapedItem()
            item.set_adaptors({
                'name': self.adaptor_pipe,
                'url': self.adaptor_pipe,
                'description': self.adaptor_pipe,
            })

            item.attribute('name', link.x('a/text()'))
            item.attribute('url', link.x('a/@href'))
            item.attribute('description', link.x('font[2]/text()'))
            items.append(item)

        items_to_csv(self.csv_file, items)
        return items

SPIDER = GoogleDirectorySpider()
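The adaptor_pipe above chains adaptors.extract, adaptors.Delist() and adaptors.strip so that each attribute() call turns an XPath selector result into a single clean string. A plain-Python sketch of that chaining idea, only for illustration (these stand-in functions are not the actual scrapy.contrib.adaptors code):

def extract(selected):
    # stand-in for adaptors.extract: pull raw strings out of selector results
    return [s.extract() if hasattr(s, 'extract') else s for s in selected]

def delist(values, joiner=' '):
    # stand-in for adaptors.Delist(): collapse a list into a single string
    return joiner.join(values)

def strip(value):
    # stand-in for adaptors.strip: trim surrounding whitespace
    return value.strip()

def run_pipe(value, pipe):
    # apply each adaptor in order, feeding its output to the next one
    for adaptor in pipe:
        value = adaptor(value)
    return value

print(run_pipe(['  Google Directory  '], [extract, delist, strip]))  # 'Google Directory'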

View File

@@ -0,0 +1,13 @@
from scrapy.spider import BaseSpider
class $classname(BaseSpider):
    domain_name = "$site"
    start_urls = (
        'http://www.$site/',
    )

    def parse(self, response):
        return ()

SPIDER = $classname()
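Filling in the template's $classname and $site placeholders yields a minimal spider; the class name and domain below are hypothetical examples used only to show the substitution:

from scrapy.spider import BaseSpider

class ExampleSpider(BaseSpider):       # hypothetical value for $classname
    domain_name = "example.com"        # hypothetical value for $site
    start_urls = (
        'http://www.example.com/',
    )

    def parse(self, response):
        return ()

SPIDER = ExampleSpider()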

View File

@@ -0,0 +1,24 @@
# -*- coding: utf8 -*-
import re
from scrapy.xpath import HtmlXPathSelector
from scrapy.item import ScrapedItem
from scrapy.link.extractors import RegexLinkExtractor
from scrapy.contrib.spiders import CrawlSpider, Rule
class $classname(CrawlSpider):
    domain_name = '$site'
    start_urls = ['http://www.$site/']
    rules = (
        Rule(RegexLinkExtractor(allow=(r'Items/', )), 'parse_item', follow=True),
    )

    def parse_item(self, response):
        #xs = HtmlXPathSelector(response)
        #i = ScrapedItem()
        #i.attribute('site_id', xs.x('//input[@id="sid"]/@value'))
        #i.attribute('name', xs.x('//div[@id="name"]'))
        #i.attribute('description', xs.x('//div[@id="description"]'))
        pass  # placeholder so the commented-out template body stays syntactically valid

SPIDER = $classname()

View File

@@ -0,0 +1,23 @@
# -*- coding: utf8 -*-
from scrapy.contrib.spiders import CSVFeedSpider
class $classname(CSVFeedSpider):
    domain_name = '$site'
    start_urls = ['http://www.$site/feed.csv']
    # headers = ['id', 'name', 'description', 'image_link']
    # delimiter = '\t'

    # Do any adaptations you need here
    #def adapt_response(self, response):
    #    return response

    def parse_row(self, response, row):
        p = self.create_product(response)
        #p.attribute('site_id', row['id'])
        #p.attribute('supplier', self.domain_name)
        #p.attribute('name', row['name'])
        #p.attribute('description', row['description'])
        #p.attribute('image_urls', row['image_link'])
        return p

SPIDER = $classname()

View File

@@ -0,0 +1,20 @@
# -*- coding: utf8 -*-
from scrapy.contrib.spiders import XMLFeedSpider
class $classname(XMLFeedSpider):
    domain_name = '$site'
    start_urls = ['http://www.$site/feed.xml']

    def parse_item(self, response, xSel):
        p = self.create_product(response)
        #p.attribute('url', xSel(''))
        #p.attribute('supplier', self.domain_name)
        #p.attribute('site_id', xSel(''))
        #p.attribute('name', xSel(''))
        #p.attribute('description', xSel(''))
        #p.attribute('image_urls', xSel(''))
        #p.attribute('price', xSel(''))
        #p.attribute('dimensions', xSel(''))
        return p

SPIDER = $classname()