
added first (not yet stable) revision of the Crawler class, which allows using the Scrapy crawler from stand-alone scripts

Pablo Hoffman 2009-09-12 19:32:23 -03:00
parent 1381c1e50a
commit 3f30fee6ea
2 changed files with 117 additions and 0 deletions

examples/scripts/count_and_follow_links.py Normal file

@@ -0,0 +1,51 @@
"""
Simple script to follow links from a start url. The links are followed in no
particular order.
Usage:
count_and_follow_links.py <start_url> <links_to_follow>
Example:
count_and_follow_links.py http://scrapy.org/ 20
For each page visisted, this script will print the page body size and the
number of links found.
"""
import sys
from urlparse import urljoin
from scrapy.crawler import Crawler
from scrapy.selector import HtmlXPathSelector
from scrapy.http import Request, HtmlResponse

links_followed = 0

def parse(response):
    global links_followed
    links_followed += 1
    if links_followed >= links_to_follow:
        crawler.stop()

    # ignore non-HTML responses
    if not isinstance(response, HtmlResponse):
        return

    links = HtmlXPathSelector(response).select('//a/@href').extract()
    abslinks = [urljoin(response.url, l) for l in links]

    print "page %2d/%d: %s" % (links_followed, links_to_follow, response.url)
    print " size : %d bytes" % len(response.body)
    print " links: %d" % len(links)
    print

    return [Request(l, callback=parse) for l in abslinks]

if len(sys.argv) != 3:
    print __doc__
    sys.exit(2)

start_url, links_to_follow = sys.argv[1], int(sys.argv[2])

request = Request(start_url, callback=parse)
crawler = Crawler()
crawler.crawl(request)

scrapy/crawler.py Normal file

@@ -0,0 +1,66 @@
"""
Crawler class
The Crawler class can be used to crawl pages using the Scrapy crawler from
outside a Scrapy project, for example, from a standalone script.
To use it, instantiate it and call the "crawl" method with one (or more)
requests. For example:
>>> from scrapy.crawler import Crawler
>>> from scrapy.http import Request
>>> def parse_response(response):
... print "Visited: %s" % response.url
...
>>> request = Request('http://scrapy.org', callback=parse_response)
>>> crawler = Crawler()
>>> crawler.crawl(request)
Visited http://scrapy.org
>>>
Request callbacks follow the same API of spiders callback, which means that all
requests returned from the callbacks will be followed.
See examples/scripts/count_and_follow_links.py for a more detailed example.
WARNING: The Crawler class currently has a big limitation - it cannot be used
more than once in the same Python process. This is due to the fact that Twisted
reactors cannot be restarted. Hopefully, this limitation will be removed in the
future.
"""

from scrapy.xlib.pydispatch import dispatcher

from scrapy.core.manager import scrapymanager
from scrapy.core.engine import scrapyengine
from scrapy.conf import settings as scrapy_settings
from scrapy import log

class Crawler(object):

    def __init__(self, enable_log=False, stop_on_error=False, silence_errors=False,
            settings=None):
        self.stop_on_error = stop_on_error
        self.silence_errors = silence_errors
        # apply any settings overrides passed by the caller
        if settings is not None:
            scrapy_settings.overrides.update(settings)
        # disable the offsite middleware (by default) because it prevents free crawling
        scrapy_settings.overrides['SPIDER_MIDDLEWARES'] = {
            'scrapy.contrib.spidermiddleware.offsite.OffsiteMiddleware': None}
        scrapy_settings.overrides['LOG_ENABLED'] = enable_log
        scrapymanager.configure()
        dispatcher.connect(self._logmessage_received, signal=log.logmessage_received)

    def crawl(self, *args):
        scrapymanager.runonce(*args)

    def stop(self):
        scrapyengine.stop()
        log.log_level = log.SILENT
        scrapyengine.kill()

    def _logmessage_received(self, message, level):
        if level <= log.ERROR:
            if not self.silence_errors:
                print "Crawler error: %s" % message
            if self.stop_on_error:
                self.stop()
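
For reference, a minimal sketch of how the constructor options above might be combined from a stand-alone script. The callback is a plain function, as in the docstring example; the DOWNLOAD_DELAY entry is only an illustrative override, since any key in the settings dict is simply copied into scrapy_settings.overrides by __init__:

from scrapy.crawler import Crawler
from scrapy.http import Request

def parse_response(response):
    # print how much data each visited page returned
    print "Fetched %d bytes from %s" % (len(response.body), response.url)

# enable logging, stop on the first error, and apply an (illustrative) settings override
crawler = Crawler(enable_log=True, stop_on_error=True,
                  settings={'DOWNLOAD_DELAY': 2})
crawler.crawl(Request('http://scrapy.org', callback=parse_response))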