Mirror of https://github.com/scrapy/scrapy.git, synced 2025-02-24 19:24:12 +00:00
added first (not yet stable) revision of Crawler class, which allows the Scrapy crawler to be used from stand-alone scripts
This commit is contained in:
parent 1381c1e50a
commit 3f30fee6ea
examples/scripts/count_and_follow_links.py (new file, 51 lines)
@@ -0,0 +1,51 @@
"""
Simple script to follow links from a start url. The links are followed in no
particular order.

Usage:
    count_and_follow_links.py <start_url> <links_to_follow>

Example:
    count_and_follow_links.py http://scrapy.org/ 20

For each page visited, this script will print the page body size and the
number of links found.
"""

import sys
from urlparse import urljoin

from scrapy.crawler import Crawler
from scrapy.selector import HtmlXPathSelector
from scrapy.http import Request, HtmlResponse

links_followed = 0

def parse(response):
    global links_followed
    links_followed += 1
    if links_followed >= links_to_follow:
        crawler.stop()

    # ignore non-HTML responses
    if not isinstance(response, HtmlResponse):
        return

    links = HtmlXPathSelector(response).select('//a/@href').extract()
    abslinks = [urljoin(response.url, l) for l in links]

    print "page %2d/%d: %s" % (links_followed, links_to_follow, response.url)
    print "    size : %d bytes" % len(response.body)
    print "    links: %d" % len(links)
    print

    return [Request(l, callback=parse) for l in abslinks]

if len(sys.argv) != 3:
    print __doc__
    sys.exit(2)

start_url, links_to_follow = sys.argv[1], int(sys.argv[2])
request = Request(start_url, callback=parse)
crawler = Crawler()
crawler.crawl(request)
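The Crawler docstring in scrapy/crawler.py (below) notes that crawl() accepts one or more requests, while the script above passes a single one. A minimal sketch, not part of this commit, of seeding the crawl with several start URLs instead; it reuses the imports and the parse() callback from the script above, and the URL list is an illustrative assumption:

# assumes links_to_follow is still taken from the command line as in the script above
start_urls = ['http://scrapy.org/', 'http://example.com/']
requests = [Request(url, callback=parse) for url in start_urls]

crawler = Crawler()
crawler.crawl(*requests)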
scrapy/crawler.py (new file, 66 lines)
@@ -0,0 +1,66 @@
"""
Crawler class

The Crawler class can be used to crawl pages using the Scrapy crawler from
outside a Scrapy project, for example, from a standalone script.

To use it, instantiate it and call the "crawl" method with one (or more)
requests. For example:

>>> from scrapy.crawler import Crawler
>>> from scrapy.http import Request
>>> def parse_response(response):
...     print "Visited: %s" % response.url
...
>>> request = Request('http://scrapy.org', callback=parse_response)
>>> crawler = Crawler()
>>> crawler.crawl(request)
Visited: http://scrapy.org
>>>

Request callbacks follow the same API as spider callbacks, which means that all
requests returned from the callbacks will be followed.

See examples/scripts/count_and_follow_links.py for a more detailed example.

WARNING: The Crawler class currently has a big limitation: it cannot be used
more than once in the same Python process. This is due to the fact that Twisted
reactors cannot be restarted. Hopefully, this limitation will be removed in the
future.
"""

from scrapy.xlib.pydispatch import dispatcher
from scrapy.core.manager import scrapymanager
from scrapy.core.engine import scrapyengine
from scrapy.conf import settings as scrapy_settings
from scrapy import log


class Crawler(object):

    def __init__(self, enable_log=False, stop_on_error=False, silence_errors=False,
                 settings=None):
        self.stop_on_error = stop_on_error
        self.silence_errors = silence_errors
        if settings is not None:
            scrapy_settings.overrides.update(settings)
        # disable the offsite middleware (by default) because it prevents free crawling
        scrapy_settings.overrides['SPIDER_MIDDLEWARES'] = {
            'scrapy.contrib.spidermiddleware.offsite.OffsiteMiddleware': None}
        scrapy_settings.overrides['LOG_ENABLED'] = enable_log
        scrapymanager.configure()
        dispatcher.connect(self._logmessage_received, signal=log.logmessage_received)

    def crawl(self, *args):
        scrapymanager.runonce(*args)

    def stop(self):
        scrapyengine.stop()
        log.log_level = log.SILENT
        scrapyengine.kill()

    def _logmessage_received(self, message, level):
        if level <= log.ERROR:
            if not self.silence_errors:
                print "Crawler error: %s" % message
            if self.stop_on_error:
                self.stop()
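The WARNING in the crawler.py docstring says a Crawler cannot be used more than once in the same Python process because Twisted reactors cannot be restarted. One common way around this, not part of the commit and sketched here only under the assumption that the Crawler/Request API above is available, is to run each crawl in its own child process so the reactor starts fresh every time:

from multiprocessing import Process

from scrapy.crawler import Crawler
from scrapy.http import Request

def run_crawl(url):
    # executed in a child process, so each crawl gets a fresh Twisted reactor
    def parse_response(response):
        print "Visited: %s" % response.url
    Crawler().crawl(Request(url, callback=parse_response))

if __name__ == '__main__':
    for url in ['http://scrapy.org/', 'http://example.com/']:
        p = Process(target=run_crawl, args=(url,))
        p.start()
        p.join()

Each crawl blocks until it finishes because the processes are joined one at a time; the URL list is illustrative.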