Added the example project from the tutorial
--HG-- extra : convert_revision : svn%3Ab85faa78-f9eb-468e-a121-7cced6da292c%40657
parent 9d6defa643
commit c80fef7eb0
scrapy/trunk/examples/google_bot/__init__.py (new normal file, 0 lines)
scrapy/trunk/examples/google_bot/items.py (new normal file, 6 lines)
@@ -0,0 +1,6 @@
# Define here the models for your scraped items

from scrapy.item import ScrapedItem

class MyItem(ScrapedItem):
    pass
scrapy/trunk/examples/google_bot/scrapy-ctl.py (new executable file, 4 lines)
@@ -0,0 +1,4 @@
#!/usr/bin/env python

from scrapy.command.cmdline import execute
execute()
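This control script is the entry point for the example project; presumably it was run from the project directory with something like "python scrapy-ctl.py crawl google.com", though that invocation is an assumption about the command-line interface of this Scrapy revision, whose command implementations are not part of this commit.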
scrapy/trunk/examples/google_bot/scrapy_settings.py (new normal file, 96 lines)
@@ -0,0 +1,96 @@
import google_bot

# ---------------------------------------------------------------------------
# - Scrapy settings for google_bot -
# ---------------------------------------------------------------------------

BOT_NAME = 'scrapybot'
BOT_VERSION = '1.0'

SPIDER_MODULES = ['google_bot.spiders']
NEWSPIDER_MODULE = 'google_bot.spiders'
TEMPLATES_DIR = '%s/templates' % google_bot.__path__[0]
ENABLED_SPIDERS_FILE = '%s/conf/enabled_spiders.list' % google_bot.__path__[0]
DEFAULT_ITEM_CLASS = 'scrapy.item.ScrapedItem'
USER_AGENT = '%s/%s' % (BOT_NAME, BOT_VERSION)
DOWNLOAD_TIMEOUT = 600

# uncomment if you want to add your own custom scrapy commands
#COMMANDS_MODULE = 'google_bot.commands'
#COMMANDS_SETTINGS_MODULE = 'google_bot.conf.commands'

# Global timeout between successive downloads (can be overridden by the spider
# attribute download_timeout)
#DOWNLOAD_TIMEOUT = 0

MYSQL_CONNECTION_SETTINGS = {"charset": "utf8"}
MYSQL_CONNECTION_PING_PERIOD = 600

SCHEDULER = 'scrapy.core.scheduler.Scheduler'
SCHEDULER_ORDER = 'BFO'  # available orders: BFO (default), DFO

#CACHE2_DIR = '/tmp/cache2'  # if set, enables HTTP cache
#CACHE2_IGNORE_MISSING = 0   # ignore requests not in cache
#CACHE2_SECTORIZE = 1        # sectorize domains to distribute storage among servers

#STATS_ENABLED = 1   # enable stats
#STATS_CLEANUP = 0   # cleanup domain stats when a domain is closed (saves memory)
#STATS_DEBUG = 0     # log stats on domain closed

EXTENSIONS = (
    'scrapy.management.web.WebConsole',
    'scrapy.management.telnet.TelnetConsole',
)

DOWNLOADER_MIDDLEWARES = (
    # Engine side
    'scrapy.contrib.downloadermiddleware.errorpages.ErrorPagesMiddleware',
    'scrapy.contrib.downloadermiddleware.cookies.CookiesMiddleware',
    'scrapy.contrib.downloadermiddleware.httpauth.HttpAuthMiddleware',
    'scrapy.contrib.downloadermiddleware.useragent.UserAgentMiddleware',
    'scrapy.contrib.downloadermiddleware.retry.RetryMiddleware',
    'scrapy.contrib.downloadermiddleware.common.CommonMiddleware',
    'scrapy.contrib.downloadermiddleware.redirect.RedirectMiddleware',
    'scrapy.contrib.downloadermiddleware.compression.CompressionMiddleware',
    'scrapy.contrib.downloadermiddleware.debug.CrawlDebug',
    'scrapy.contrib.downloadermiddleware.cache.CacheMiddleware',
    # Downloader side
)

SPIDER_MIDDLEWARES = (
    # Engine side
    'scrapy.contrib.spidermiddleware.limit.RequestLimitMiddleware',
    'scrapy.contrib.spidermiddleware.restrict.RestrictMiddleware',
    'scrapy.contrib.spidermiddleware.offsite.OffsiteMiddleware',
    'scrapy.contrib.spidermiddleware.referer.RefererMiddleware',
    'scrapy.contrib.spidermiddleware.urllength.UrlLengthMiddleware',
    'scrapy.contrib.spidermiddleware.depth.DepthMiddleware',
    'scrapy.contrib.spidermiddleware.urlfilter.UrlFilterMiddleware',
    # Spider side
)

# Item pipelines are usually configured by commands (see conf/commands)
ITEM_PIPELINES = (
    'scrapy.contrib.pipeline.show.ShowItemPipeline',
)

#DEPTH_LIMIT = 10  # limit the maximum link depth to follow
#DEPTH_STATS = 1   # enable depth stats

# Limit URL length. See: http://www.boutell.com/newfaq/misc/urllength.html
URLLENGTH_LIMIT = 2083

#WEBCONSOLE_ENABLED = 1
#WEBCONSOLE_PORT = 8060  # if not set uses a dynamic port

#TELNETCONSOLE_ENABLED = 1
#TELNETCONSOLE_PORT = 2020  # if not set uses a dynamic port

# global mail sending settings
#MAIL_HOST = 'localhost'
#MAIL_FROM = 'scrapybot@localhost'

# scrapy webservice
WS_ENABLED = 0

SPIDERPROFILER_ENABLED = 0
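For orientation only (not part of the commit): a minimal sketch of how these values would be read back at runtime, assuming the legacy scrapy.conf settings singleton of this era; how scrapy_settings.py itself gets located is not shown in this commit.

    from scrapy.conf import settings  # legacy global settings object (assumption)

    bot_name = settings['BOT_NAME']      # 'scrapybot'
    user_agent = settings['USER_AGENT']  # 'scrapybot/1.0', built from BOT_NAME/BOT_VERSION above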
scrapy/trunk/examples/google_bot/spiders/__init__.py (new normal file, 1 line)
@@ -0,0 +1 @@
# Place here all your scrapy spiders
scrapy/trunk/examples/google_bot/spiders/google_directory.py (new normal file, 44 lines)
@@ -0,0 +1,44 @@
# -*- coding: utf8 -*-
from scrapy.xpath import HtmlXPathSelector
from scrapy.item import ScrapedItem
from scrapy.contrib import adaptors
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.link.extractors import RegexLinkExtractor
from scrapy.utils.misc import items_to_csv


class GoogleDirectorySpider(CrawlSpider):
    domain_name = 'google.com'
    start_urls = ['http://www.google.com/dirhp']

    rules = (
        Rule(RegexLinkExtractor(allow=('google.com/[A-Z][a-zA-Z_/]+$', ), ),
             'parse_category',
             follow=True,
        ),
    )

    adaptor_pipe = [adaptors.extract, adaptors.Delist(), adaptors.strip]
    csv_file = open('scraped_items.csv', 'w')

    def parse_category(self, response):
        items = []  # The item (links to websites) list we're going to return
        hxs = HtmlXPathSelector(response)  # The selector we're going to use in order to extract data from the page
        links = hxs.x('//td[descendant::a[contains(@href, "#pagerank")]]/following-sibling::td/font')

        for link in links:
            item = ScrapedItem()
            item.set_adaptors({
                'name': self.adaptor_pipe,
                'url': self.adaptor_pipe,
                'description': self.adaptor_pipe,
            })

            item.attribute('name', link.x('a/text()'))
            item.attribute('url', link.x('a/@href'))
            item.attribute('description', link.x('font[2]/text()'))
            items.append(item)

        items_to_csv(self.csv_file, items)
        return items

SPIDER = GoogleDirectorySpider()
scrapy/trunk/examples/google_bot/templates/spider_basic.tmpl (new normal file, 13 lines)
@@ -0,0 +1,13 @@
from scrapy.spider import BaseSpider

class $classname(BaseSpider):
    domain_name = "$site"
    start_urls = (
        'http://www.$site/',
    )

    def parse(self, response):
        return ()

SPIDER = $classname()
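The $classname and $site placeholders follow the syntax of Python's stdlib string.Template, so a generator could render the template above roughly as sketched below. This is illustrative only: the actual spider-generation command of this Scrapy revision is not part of this commit, and ExampleSpider/example.com are made-up values.

    from string import Template

    tmpl = Template(open('spider_basic.tmpl').read())
    source = tmpl.substitute(classname='ExampleSpider', site='example.com')
    # 'source' now holds the module text for a basic example.com spider,
    # ready to be saved into google_bot/spiders/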
scrapy/trunk/examples/google_bot/templates/spider_crawl.tmpl (new normal file, 24 lines)
@@ -0,0 +1,24 @@
# -*- coding: utf8 -*-
import re

from scrapy.xpath import HtmlXPathSelector
from scrapy.item import ScrapedItem
from scrapy.link.extractors import RegexLinkExtractor
from scrapy.contrib.spiders import CrawlSpider, Rule

class $classname(CrawlSpider):
    domain_name = '$site'
    start_urls = ['http://www.$site/']

    rules = (
        Rule(RegexLinkExtractor(allow=(r'Items/', )), 'parse_item', follow=True),
    )

    def parse_item(self, response):
        #xs = HtmlXPathSelector(response)
        #i = ScrapedItem()
        #i.attribute('site_id', xs.x('//input[@id="sid"]/@value'))
        #i.attribute('name', xs.x('//div[@id="name"]'))
        #i.attribute('description', xs.x('//div[@id="description"]'))

SPIDER = $classname()
@@ -0,0 +1,23 @@
# -*- coding: utf8 -*-
from scrapy.contrib.spiders import CSVFeedSpider

class $classname(CSVFeedSpider):
    domain_name = '$site'
    start_urls = ['http://www.$site/feed.csv']
    # headers = ['id', 'name', 'description', 'image_link']
    # delimiter = '\t'

    # Do any adaptations you need here
    #def adapt_response(self, response):
    #    return response

    def parse_row(self, response, row):
        p = self.create_product(response)
        #p.attribute('site_id', row['id'])
        #p.attribute('supplier', self.domain_name)
        #p.attribute('name', row['name'])
        #p.attribute('description', row['description'])
        #p.attribute('image_urls', row['image_link'])
        return p

SPIDER = $classname()
@@ -0,0 +1,20 @@
# -*- coding: utf8 -*-
from scrapy.contrib.spiders import XMLFeedSpider

class $classname(XMLFeedSpider):
    domain_name = '$site'
    start_urls = ['http://www.$site/feed.xml']

    def parse_item(self, response, xSel):
        p = self.create_product(response)
        #p.attribute('url', xSel(''))
        #p.attribute('supplier', self.domain_name)
        #p.attribute('site_id', xSel(''))
        #p.attribute('name', xSel(''))
        #p.attribute('description', xSel(''))
        #p.attribute('image_urls', xSel(''))
        #p.attribute('price', xSel(''))
        #p.attribute('dimensions', xSel(''))
        return p

SPIDER = $classname()