
added first (not yet stable) revision of the Crawler class, which allows using the Scrapy crawler from stand-alone scripts

Pablo Hoffman 2009-09-12 19:32:23 -03:00
parent 1381c1e50a
commit 3f30fee6ea
2 changed files with 117 additions and 0 deletions

examples/scripts/count_and_follow_links.py Normal file

@@ -0,0 +1,51 @@
"""
Simple script to follow links from a start url. The links are followed in no
particular order.
Usage:
count_and_follow_links.py <start_url> <links_to_follow>
Example:
count_and_follow_links.py http://scrapy.org/ 20
For each page visisted, this script will print the page body size and the
number of links found.
"""
import sys
from urlparse import urljoin
from scrapy.crawler import Crawler
from scrapy.selector import HtmlXPathSelector
from scrapy.http import Request, HtmlResponse

links_followed = 0

def parse(response):
    global links_followed
    links_followed += 1
    if links_followed >= links_to_follow:
        crawler.stop()

    # ignore non-HTML responses
    if not isinstance(response, HtmlResponse):
        return

    links = HtmlXPathSelector(response).select('//a/@href').extract()
    abslinks = [urljoin(response.url, l) for l in links]

    print "page %2d/%d: %s" % (links_followed, links_to_follow, response.url)
    print " size : %d bytes" % len(response.body)
    print " links: %d" % len(links)
    print

    return [Request(l, callback=parse) for l in abslinks]

if len(sys.argv) != 3:
    print __doc__
    sys.exit(2)

start_url, links_to_follow = sys.argv[1], int(sys.argv[2])

request = Request(start_url, callback=parse)
crawler = Crawler()
crawler.crawl(request)

scrapy/crawler.py Normal file

@@ -0,0 +1,66 @@
"""
Crawler class
The Crawler class can be used to crawl pages using the Scrapy crawler from
outside a Scrapy project, for example, from a standalone script.
To use it, instantiate it and call the "crawl" method with one (or more)
requests. For example:
>>> from scrapy.crawler import Crawler
>>> from scrapy.http import Request
>>> def parse_response(response):
... print "Visited: %s" % response.url
...
>>> request = Request('http://scrapy.org', callback=parse_response)
>>> crawler = Crawler()
>>> crawler.crawl(request)
Visited http://scrapy.org
>>>
Request callbacks follow the same API of spiders callback, which means that all
requests returned from the callbacks will be followed.
See examples/scripts/count_and_follow_links.py for a more detailed example.
WARNING: The Crawler class currently has a big limitation - it cannot be used
more than once in the same Python process. This is due to the fact that Twisted
reactors cannot be restarted. Hopefully, this limitation will be removed in the
future.
"""

from scrapy.xlib.pydispatch import dispatcher

from scrapy.core.manager import scrapymanager
from scrapy.core.engine import scrapyengine
from scrapy.conf import settings as scrapy_settings
from scrapy import log

class Crawler(object):

    def __init__(self, enable_log=False, stop_on_error=False, silence_errors=False,
            settings=None):
        self.stop_on_error = stop_on_error
        self.silence_errors = silence_errors
        # apply any settings overrides passed by the caller
        if settings is not None:
            scrapy_settings.overrides.update(settings)
        # disable the offsite middleware (by default) because it prevents free crawling
        scrapy_settings.overrides['SPIDER_MIDDLEWARES'] = {
            'scrapy.contrib.spidermiddleware.offsite.OffsiteMiddleware': None}
        scrapy_settings.overrides['LOG_ENABLED'] = enable_log
        scrapymanager.configure()
        dispatcher.connect(self._logmessage_received, signal=log.logmessage_received)

    def crawl(self, *args):
        scrapymanager.runonce(*args)

    def stop(self):
        scrapyengine.stop()
        log.log_level = log.SILENT
        scrapyengine.kill()

    def _logmessage_received(self, message, level):
        if level <= log.ERROR:
            if not self.silence_errors:
                print "Crawler error: %s" % message
            if self.stop_on_error:
                self.stop()
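
For reference, a minimal sketch of how the constructor options above might be combined from a stand-alone script. The callback is a plain function, as in the docstring example; the DOWNLOAD_DELAY entry is only an illustrative override, since any key in the settings dict is simply copied into scrapy_settings.overrides by __init__:

from scrapy.crawler import Crawler
from scrapy.http import Request

def parse_response(response):
    # print how much data each visited page returned
    print "Fetched %d bytes from %s" % (len(response.body), response.url)

# enable logging, stop on the first error, and apply an (illustrative) settings override
crawler = Crawler(enable_log=True, stop_on_error=True,
                  settings={'DOWNLOAD_DELAY': 2})
crawler.crawl(Request('http://scrapy.org', callback=parse_response))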