Added the example project from the tutorial
--HG-- extra : convert_revision : svn%3Ab85faa78-f9eb-468e-a121-7cced6da292c%40657
This commit is contained in:
parent 9d6defa643
commit c80fef7eb0
0 scrapy/trunk/examples/google_bot/__init__.py Normal file
6 scrapy/trunk/examples/google_bot/items.py Normal file
@@ -0,0 +1,6 @@
# Define here the models for your scraped items

from scrapy.item import ScrapedItem

class MyItem(ScrapedItem):
    pass
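(For orientation: items in this Scrapy revision are populated imperatively. The sketch below only mirrors the set_adaptors()/attribute() calls that appear verbatim in google_directory.py later in this commit; the callback, field name and XPath are illustrative, not part of the committed code.)

from scrapy.xpath import HtmlXPathSelector
from scrapy.contrib import adaptors
from google_bot.items import MyItem

def parse(response):  # hypothetical callback receiving a downloaded response
    hxs = HtmlXPathSelector(response)
    item = MyItem()
    # Register an adaptor pipeline for the field, then attach the raw
    # selector result; the adaptors extract, de-list and strip the value.
    item.set_adaptors({'name': [adaptors.extract, adaptors.Delist(), adaptors.strip]})
    item.attribute('name', hxs.x('//h1/text()'))
    return [item]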
4 scrapy/trunk/examples/google_bot/scrapy-ctl.py Executable file
@@ -0,0 +1,4 @@
#!/usr/bin/env python

from scrapy.command.cmdline import execute
execute()
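scrapy-ctl.py is the project's manage.py-style entry point: it simply hands control to scrapy.command.cmdline.execute(), which dispatches to whatever commands this Scrapy revision ships (the command implementations are not part of this commit). Presumably the spider below would be started from the project directory with something like ./scrapy-ctl.py crawl google.com.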
96 scrapy/trunk/examples/google_bot/scrapy_settings.py Normal file
@@ -0,0 +1,96 @@
import google_bot

# ---------------------------------------------------------------------------
# - Scrapy settings for google_bot -
# ---------------------------------------------------------------------------

BOT_NAME = 'scrapybot'
BOT_VERSION = '1.0'

SPIDER_MODULES = ['google_bot.spiders']
NEWSPIDER_MODULE = 'google_bot.spiders'
TEMPLATES_DIR = '%s/templates' % google_bot.__path__[0]
ENABLED_SPIDERS_FILE = '%s/conf/enabled_spiders.list' % google_bot.__path__[0]
DEFAULT_ITEM_CLASS = 'scrapy.item.ScrapedItem'
USER_AGENT = '%s/%s' % (BOT_NAME, BOT_VERSION)
DOWNLOAD_TIMEOUT = 600

# uncomment if you want to add your own custom scrapy commands
#COMMANDS_MODULE = 'google_bot.commands'
#COMMANDS_SETTINGS_MODULE = 'google_bot.conf.commands'

# Global timeout between successive downloads (can be overridden by the
# spider attribute download_timeout)
#DOWNLOAD_TIMEOUT = 0

MYSQL_CONNECTION_SETTINGS = {"charset": "utf8"}
MYSQL_CONNECTION_PING_PERIOD = 600

SCHEDULER = 'scrapy.core.scheduler.Scheduler'
SCHEDULER_ORDER = 'BFO' # available orders: BFO (default), DFO

#CACHE2_DIR = '/tmp/cache2' # if set, enables HTTP cache
#CACHE2_IGNORE_MISSING = 0 # ignore requests not in cache
#CACHE2_SECTORIZE = 1 # sectorize domains to distribute storage among servers

#STATS_ENABLED = 1 # enable stats
#STATS_CLEANUP = 0 # cleanup domain stats when a domain is closed (saves memory)
#STATS_DEBUG = 0 # log stats on domain closed

EXTENSIONS = (
    'scrapy.management.web.WebConsole',
    'scrapy.management.telnet.TelnetConsole',
)

DOWNLOADER_MIDDLEWARES = (
    # Engine side
    'scrapy.contrib.downloadermiddleware.errorpages.ErrorPagesMiddleware',
    'scrapy.contrib.downloadermiddleware.cookies.CookiesMiddleware',
    'scrapy.contrib.downloadermiddleware.httpauth.HttpAuthMiddleware',
    'scrapy.contrib.downloadermiddleware.useragent.UserAgentMiddleware',
    'scrapy.contrib.downloadermiddleware.retry.RetryMiddleware',
    'scrapy.contrib.downloadermiddleware.common.CommonMiddleware',
    'scrapy.contrib.downloadermiddleware.redirect.RedirectMiddleware',
    'scrapy.contrib.downloadermiddleware.compression.CompressionMiddleware',
    'scrapy.contrib.downloadermiddleware.debug.CrawlDebug',
    'scrapy.contrib.downloadermiddleware.cache.CacheMiddleware',
    # Downloader side
)

SPIDER_MIDDLEWARES = (
    # Engine side
    'scrapy.contrib.spidermiddleware.limit.RequestLimitMiddleware',
    'scrapy.contrib.spidermiddleware.restrict.RestrictMiddleware',
    'scrapy.contrib.spidermiddleware.offsite.OffsiteMiddleware',
    'scrapy.contrib.spidermiddleware.referer.RefererMiddleware',
    'scrapy.contrib.spidermiddleware.urllength.UrlLengthMiddleware',
    'scrapy.contrib.spidermiddleware.depth.DepthMiddleware',
    'scrapy.contrib.spidermiddleware.urlfilter.UrlFilterMiddleware',
    # Spider side
)

# Item pipelines are usually configured by commands (see conf/commands)
ITEM_PIPELINES = (
    'scrapy.contrib.pipeline.show.ShowItemPipeline',
)

#DEPTH_LIMIT = 10 # limit the maximum link depth to follow
#DEPTH_STATS = 1 # enable depth stats

# Limit URL length. See: http://www.boutell.com/newfaq/misc/urllength.html
URLLENGTH_LIMIT = 2083

#WEBCONSOLE_ENABLED = 1
#WEBCONSOLE_PORT = 8060 # if not set uses a dynamic port

#TELNETCONSOLE_ENABLED = 1
#TELNETCONSOLE_PORT = 2020 # if not set uses a dynamic port

# global mail sending settings
#MAIL_HOST = 'localhost'
#MAIL_FROM = 'scrapybot@localhost'

# scrapy webservice
WS_ENABLED = 0

SPIDERPROFILER_ENABLED = 0
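(The commented DOWNLOAD_TIMEOUT block above mentions a per-spider download_timeout override. A minimal sketch of such an override, assuming only the BaseSpider shape shown in spider_basic.tmpl further down; the domain and URL are placeholders.)

from scrapy.spider import BaseSpider

class SlowSiteSpider(BaseSpider):
    domain_name = 'slow.example.com'               # placeholder
    start_urls = ['http://www.slow.example.com/']  # placeholder
    download_timeout = 1200  # overrides the global DOWNLOAD_TIMEOUT for this spider

    def parse(self, response):
        return ()

SPIDER = SlowSiteSpider()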
1 scrapy/trunk/examples/google_bot/spiders/__init__.py Normal file
@@ -0,0 +1 @@
# Place here all your scrapy spiders
44 scrapy/trunk/examples/google_bot/spiders/google_directory.py Normal file
@@ -0,0 +1,44 @@
# -*- coding: utf8 -*-
from scrapy.xpath import HtmlXPathSelector
from scrapy.item import ScrapedItem
from scrapy.contrib import adaptors
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.link.extractors import RegexLinkExtractor
from scrapy.utils.misc import items_to_csv


class GoogleDirectorySpider(CrawlSpider):
    domain_name = 'google.com'
    start_urls = ['http://www.google.com/dirhp']

    rules = (
        Rule(RegexLinkExtractor(allow=('google.com/[A-Z][a-zA-Z_/]+$', ), ),
             'parse_category',
             follow=True,
        ),
    )

    adaptor_pipe = [adaptors.extract, adaptors.Delist(), adaptors.strip]
    csv_file = open('scraped_items.csv', 'w')

    def parse_category(self, response):
        items = []  # The item (links to websites) list we're going to return
        hxs = HtmlXPathSelector(response)  # The selector we're going to use in order to extract data from the page
        links = hxs.x('//td[descendant::a[contains(@href, "#pagerank")]]/following-sibling::td/font')

        for link in links:
            item = ScrapedItem()
            item.set_adaptors({
                'name': self.adaptor_pipe,
                'url': self.adaptor_pipe,
                'description': self.adaptor_pipe,
            })

            item.attribute('name', link.x('a/text()'))
            item.attribute('url', link.x('a/@href'))
            item.attribute('description', link.x('font[2]/text()'))
            items.append(item)

        items_to_csv(self.csv_file, items)
        return items

SPIDER = GoogleDirectorySpider()
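Two things worth noting about the spider above: scraped_items.csv is opened as a class attribute, so the file is created (and truncated) as soon as the module is imported; and the single rule both follows matching category links and routes their responses to parse_category, where items_to_csv presumably serializes each batch of items to the open file. Assuming the crawl command addresses spiders by domain_name, the spider would be run with something like ./scrapy-ctl.py crawl google.com.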
13 scrapy/trunk/examples/google_bot/templates/spider_basic.tmpl Normal file
@@ -0,0 +1,13 @@
from scrapy.spider import BaseSpider


class $classname(BaseSpider):
    domain_name = "$site"
    start_urls = (
        'http://www.$site/',
    )

    def parse(self, response):
        return ()

SPIDER = $classname()
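(The $classname/$site placeholders suggest string.Template-style substitution when a new spider is generated from TEMPLATES_DIR. The generating command is not part of this commit, so the following is only an illustrative sketch.)

from string import Template

# Hypothetical expansion of the template above; the real command's
# behaviour and option names may differ.
source = Template(open('spider_basic.tmpl').read()).substitute(
    classname='ExampleSpider', site='example.com')
print(source)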
24 scrapy/trunk/examples/google_bot/templates/spider_crawl.tmpl Normal file
@@ -0,0 +1,24 @@
# -*- coding: utf8 -*-
import re

from scrapy.xpath import HtmlXPathSelector
from scrapy.item import ScrapedItem
from scrapy.link.extractors import RegexLinkExtractor
from scrapy.contrib.spiders import CrawlSpider, Rule


class $classname(CrawlSpider):
    domain_name = '$site'
    start_urls = ['http://www.$site/']

    rules = (
        Rule(RegexLinkExtractor(allow=(r'Items/', )), 'parse_item', follow=True),
    )

    def parse_item(self, response):
        #xs = HtmlXPathSelector(response)
        #i = ScrapedItem()
        #i.attribute('site_id', xs.x('//input[@id="sid"]/@value'))
        #i.attribute('name', xs.x('//div[@id="name"]'))
        #i.attribute('description', xs.x('//div[@id="description"]'))

SPIDER = $classname()
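Note that parse_item's body in this template is entirely commented out, so a spider generated from it will not compile until at least one statement is uncommented or added; the basic template above and the feed templates below avoid this by ending their callbacks with a return statement.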
@@ -0,0 +1,23 @@
# -*- coding: utf8 -*-
from scrapy.contrib.spiders import CSVFeedSpider


class $classname(CSVFeedSpider):
    domain_name = '$site'
    start_urls = ['http://www.$site/feed.csv']
    # headers = ['id', 'name', 'description', 'image_link']
    # delimiter = '\t'

    # Do any adaptations you need here
    #def adapt_response(self, response):
    #    return response

    def parse_row(self, response, row):
        p = self.create_product(response)
        #p.attribute('site_id', row['id'])
        #p.attribute('supplier', self.domain_name)
        #p.attribute('name', row['name'])
        #p.attribute('description', row['description'])
        #p.attribute('image_urls', row['image_link'])
        return p

SPIDER = $classname()
@@ -0,0 +1,20 @@
# -*- coding: utf8 -*-
from scrapy.contrib.spiders import XMLFeedSpider


class $classname(XMLFeedSpider):
    domain_name = '$site'
    start_urls = ['http://www.$site/feed.xml']

    def parse_item(self, response, xSel):
        p = self.create_product(response)
        #p.attribute('url', xSel(''))
        #p.attribute('supplier', self.domain_name)
        #p.attribute('site_id', xSel(''))
        #p.attribute('name', xSel(''))
        #p.attribute('description', xSel(''))
        #p.attribute('image_urls', xSel(''))
        #p.attribute('price', xSel(''))
        #p.attribute('dimensions', xSel(''))
        return p

SPIDER = $classname()