diff --git a/bin/scrapy-ws.py b/bin/scrapy-ws.py new file mode 100755 index 000000000..f6357e726 --- /dev/null +++ b/bin/scrapy-ws.py @@ -0,0 +1,114 @@ +#!/usr/bin/env python +""" +Example script to control and monitor Scrapy using its web service. It only +provides a reduced functionality as its main purpose is to illustrate how to +write a web service client. Feel free to improve or write you own. +""" + +import sys, optparse, urllib +from urlparse import urljoin + +from scrapy.utils.jsonrpc import jsonrpc_client_call, JsonRpcError +from scrapy.utils.py26 import json + +def get_commands(): + return { + 'help': cmd_help, + 'run': cmd_run, + 'list-available': cmd_list_available, + 'list-running': cmd_list_running, + 'list-resources': cmd_list_resources, + 'list-extensions': cmd_list_extensions, + 'get-global-stats': cmd_get_global_stats, + 'get-spider-stats': cmd_get_spider_stats, + } + +def cmd_help(args, opts): + """help - list available commands""" + print "Available commands:" + for _, func in sorted(get_commands().items()): + print " ", func.__doc__ + +def cmd_run(args, opts): + """run - schedule spider for running""" + jsonrpc_call(opts, 'manager/queue', 'append_spider_name', args[0]) + +def cmd_list_running(args, opts): + """list-running - list running spiders""" + for x in json_get(opts, 'manager/engine/open_spiders'): + print x + +def cmd_list_available(args, opts): + """list-available - list name of available spiders""" + for x in jsonrpc_call(opts, 'spiders', 'list'): + print x + +def cmd_list_resources(args, opts): + """list-resources - list available web service resources""" + for x in json_get(opts, '')['resources']: + print x + +def cmd_list_extensions(args, opts): + """list-extensions - list enabled extensions""" + for x in jsonrpc_call(opts, 'extensions/enabled', 'keys'): + print x + +def cmd_get_spider_stats(args, opts): + """get-spider-stats - get stats of a running spider""" + stats = jsonrpc_call(opts, 'stats', 'get_stats', args[0]) + for name, value in stats.items(): + print "%-40s %s" % (name, value) + +def cmd_get_global_stats(args, opts): + """get-global-stats - get global stats""" + stats = jsonrpc_call(opts, 'stats', 'get_stats') + for name, value in stats.items(): + print "%-40s %s" % (name, value) + +def get_wsurl(opts, path): + return urljoin("http://%s:%s/"% (opts.host, opts.port), path) + +def jsonrpc_call(opts, path, method, *args, **kwargs): + url = get_wsurl(opts, path) + return jsonrpc_client_call(url, method, *args, **kwargs) + +def json_get(opts, path): + url = get_wsurl(opts, path) + return json.loads(urllib.urlopen(url).read()) + +def parse_opts(): + usage = "%prog [options] [arg] ..." + description = "Scrapy web service control script. Use '%prog help' " \ + "to see the list of available commands." 
+ op = optparse.OptionParser(usage=usage, description=description) + op.add_option("-H", dest="host", default="localhost", \ + help="Scrapy host to connect to") + op.add_option("-P", dest="port", type="int", default=6080, \ + help="Scrapy port to connect to") + opts, args = op.parse_args() + if not args: + op.print_help() + sys.exit(2) + cmdname, cmdargs, opts = args[0], args[1:], opts + commands = get_commands() + if cmdname not in commands: + sys.stderr.write("Unknown command: %s\n\n" % cmdname) + cmd_help(None, None) + sys.exit(1) + return commands[cmdname], cmdargs, opts + +def main(): + cmd, args, opts = parse_opts() + try: + cmd(args, opts) + except IndexError: + print cmd.__doc__ + except JsonRpcError, e: + print str(e) + if e.data: + print "Server Traceback below:" + print e.data + + +if __name__ == '__main__': + main() diff --git a/docs/index.rst b/docs/index.rst index 0258271a1..46a1187da 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -87,7 +87,7 @@ Built-in services topics/stats topics/email topics/telnetconsole - topics/webconsole + topics/webservice :doc:`topics/logging` Understand the simple logging facility provided by Scrapy. @@ -101,8 +101,8 @@ Built-in services :doc:`topics/telnetconsole` Inspect a running crawler using a built-in Python console. -:doc:`topics/webconsole` - Monitor and control a crawler using a web interface. +:doc:`topics/webservice` + Monitor and control a crawler using a web service. Solving specific problems diff --git a/docs/intro/overview.rst b/docs/intro/overview.rst index 5dd61100e..29300fbfe 100644 --- a/docs/intro/overview.rst +++ b/docs/intro/overview.rst @@ -190,7 +190,7 @@ scraping easy and efficient, such as: * An :ref:`Interactive scraping shell console `, very useful for writing and debugging your spiders -* A :ref:`Web management console ` for monitoring and +* A builtin :ref:`Web service ` for monitoring and controlling your bot * A :ref:`Telnet console ` for full unrestricted access diff --git a/docs/topics/extensions.rst b/docs/topics/extensions.rst index 972d6ea54..361f74a10 100644 --- a/docs/topics/extensions.rst +++ b/docs/topics/extensions.rst @@ -36,10 +36,8 @@ by a string: the full Python path to the extension's class name. For example:: EXTENSIONS = { 'scrapy.contrib.corestats.CoreStats': 500, - 'scrapy.management.web.WebConsole': 500, - 'scrapy.management.telnet.TelnetConsole': 500, - 'scrapy.contrib.webconsole.enginestatus.EngineStatus': 500, - 'scrapy.contrib.webconsole.stats.StatsDump': 500, + 'scrapy.webservice.WebService': 500, + 'scrapy.telnet.TelnetConsole': 500, } @@ -71,10 +69,10 @@ Accessing enabled extensions Even though it's not usually needed, you can access extension objects through the :ref:`topics-extensions-ref-manager` which is populated when extensions are -loaded. For example, to access the ``WebConsole`` extension:: +loaded. For example, to access the ``WebService`` extension:: from scrapy.extension import extensions - webconsole_extension = extensions.enabled['WebConsole'] + webservice_extension = extensions.enabled['WebService'] .. see also:: @@ -146,7 +144,7 @@ how you :ref:`configure the downloader middlewares >>> extensions.load() >>> print extensions.enabled {'CoreStats': , - 'WebConsole': , + 'WebService': , ... .. 
attribute:: disabled @@ -158,7 +156,7 @@ how you :ref:`configure the downloader middlewares >>> from scrapy.extension import extensions >>> extensions.load() >>> print extensions.disabled - {'MemoryDebugger': 'scrapy.contrib.webconsole.stats.MemoryDebugger', + {'MemoryDebugger': 'scrapy.contrib.memdebug.MemoryDebugger', 'MyExtension': 'myproject.extensions.MyExtension', ... @@ -193,44 +191,34 @@ Core Stats extension Enable the collection of core statistics, provided the stats collection is enabled (see :ref:`topics-stats`). -.. _topics-extensions-ref-webconsole: +.. _topics-extensions-ref-webservice: -Web console extension +Web service extension ~~~~~~~~~~~~~~~~~~~~~ -.. module:: scrapy.management.web - :synopsis: Web management console +.. module:: scrapy.webservice + :synopsis: Web service -.. class:: scrapy.management.web.WebConsole +.. class:: scrapy.webservice.WebService -Provides an extensible web server for managing a Scrapy process. It's enabled -by the :setting:`WEBCONSOLE_ENABLED` setting. The server will listen in the -port specified in :setting:`WEBCONSOLE_PORT`, and will log to the file -specified in :setting:`WEBCONSOLE_LOGFILE`. - -The web server is designed to be extended by other extensions which can add -their own management web interfaces. - -See also :ref:`topics-webconsole` for information on how to write your own web -console extension, and :ref:`topics-webconsole-extensions-ref` for a list of -available built-in (web console) extensions. +See `topics-webservice`. .. _topics-extensions-ref-telnetconsole: Telnet console extension ~~~~~~~~~~~~~~~~~~~~~~~~ -.. module:: scrapy.management.telnet - :synopsis: Telnet management console +.. module:: scrapy.telnet + :synopsis: Telnet console -.. class:: scrapy.management.telnet.TelnetConsole +.. class:: scrapy.telnet.TelnetConsole Provides a telnet console for getting into a Python interpreter inside the currently running Scrapy process, which can be very useful for debugging. The telnet console must be enabled by the :setting:`TELNETCONSOLE_ENABLED` setting, and the server will listen in the port specified in -:setting:`WEBCONSOLE_PORT`. +:setting:`TELNETCONSOLE_PORT`. .. _topics-extensions-ref-memusage: diff --git a/docs/topics/settings.rst b/docs/topics/settings.rst index e40e0ac13..0a00b8d26 100644 --- a/docs/topics/settings.rst +++ b/docs/topics/settings.rst @@ -519,13 +519,8 @@ Default:: { 'scrapy.contrib.corestats.CoreStats': 0, - 'scrapy.management.web.WebConsole': 0, - 'scrapy.management.telnet.TelnetConsole': 0, - 'scrapy.contrib.webconsole.scheduler.SchedulerQueue': 0, - 'scrapy.contrib.webconsole.livestats.LiveStats': 0, - 'scrapy.contrib.webconsole.spiderctl.Spiderctl': 0, - 'scrapy.contrib.webconsole.enginestatus.EngineStatus': 0, - 'scrapy.contrib.webconsole.stats.StatsDump': 0, + 'scrapy.webservice.WebService': 0, + 'scrapy.telnet.TelnetConsole': 0, 'scrapy.contrib.memusage.MemoryUsage': 0, 'scrapy.contrib.memdebug.MemoryDebugger': 0, 'scrapy.contrib.closedomain.CloseDomain': 0, @@ -1054,33 +1049,3 @@ Default: ``"%s/%s" % (BOT_NAME, BOT_VERSION)`` The default User-Agent to use when crawling, unless overrided. -.. setting:: WEBCONSOLE_ENABLED - -WEBCONSOLE_ENABLED ------------------- - -Default: True - -A boolean which specifies if the web management console will be enabled -(provided its extension is also enabled). - -.. setting:: WEBCONSOLE_LOGFILE - -WEBCONSOLE_LOGFILE ------------------- - -Default: ``None`` - -A file to use for logging HTTP requests made to the web console. 
If unset web -the log is sent to standard scrapy log. - -.. setting:: WEBCONSOLE_PORT - -WEBCONSOLE_PORT ---------------- - -Default: ``6080`` - -The port to use for the web console. If set to ``None`` or ``0``, a dynamically -assigned port is used. For more info see :ref:`topics-webconsole`. - diff --git a/docs/topics/webconsole.rst b/docs/topics/webconsole.rst deleted file mode 100644 index fd23e4dd7..000000000 --- a/docs/topics/webconsole.rst +++ /dev/null @@ -1,142 +0,0 @@ -.. _topics-webconsole: - -=========== -Web Console -=========== - -Scrapy comes with a built-in web server for monitoring and controlling a Scrapy -running process. - -The web console is :ref:`built-in Scrapy extension -` which comes enabled by default, but you can also -disable it if you're running tight on memory. - -For more information about this extension see -:ref:`topics-extensions-ref-webconsole`. - -Writing a web console extension -=============================== - -Writing a web console extension is similar to writing any other :ref:`Scrapy -extensions ` except that the extension class must: - -1. catch the ``scrapy.management.web.webconsole_discover_module`` signal, and - return itself in the handler. - -2. have the following two attributes: - -.. attribute:: webconsole_id - - The id by which the Scrapy web interface will known this extension, and - also the main dir under which this extension interface will work. For - example, assuming Scrapy web server is listening on - http://localhost:8000/ and the ``webconsole_id='extension1'`` the web - main page for the interface of that extension will be: - - http://localhost:8000/extension1/ - -.. attribute:: webconsole_name - - The name by which the Scrapy web server will know that extension. That name - will be displayed in the main web console index, as the text that links to - the extension main page. - -3. implement the following method: - -.. method:: webconsole_render(wc_request) - -``wc_request`` is a `twisted.web.http.Request`_ object with the HTTP request -sent to the web console. - -.. _twisted.web.http.Request: http://python.net/crew/mwh/apidocs/twisted.web.http.Request.html - -It must return a str with the web page to render, typically containing HTML -code. - -Example web console extension -============================= - -Here's an example of a simple web console extension that just displays a "Hello -world!" text:: - - from scrapy.xlib.pydispatch import dispatcher - from scrapy.management.web import webconsole_discover_module - - class HelloWorldConsole(object): - webconsole_id = 'helloworld' - webconsole_name = 'Hello world' - - def __init__(self): - dispatcher.connect(self.webconsole_discover_module, signal=webconsole_discover_module) - - def webconsole_discover_module(self): - return self - - def webconsole_render(self, wc_request): - return "
<html><head></head><body><h1>Hello world!</h1></body></html>
" - -If you start Scrapy with the web console enabled on http://localhost:8000/ and -you access the URL: - - http://localhost:8000/helloworld/ - -You will see a page containing a big "Hello World!" text. - -.. _topics-webconsole-extensions-ref: - -Available Web console extensions --------------------------------- - -.. module:: scrapy.contrib.webconsole - :synopsis: Contains most built-in web console extensions - -Here is a list of built-in web console extensions. - -Scheduler queue extension -~~~~~~~~~~~~~~~~~~~~~~~~~ - -.. module:: scrapy.contrib.webconsole.scheduler - :synopsis: Scheduler queue web console extension - -.. class:: scrapy.contrib.webconsole.scheduler.SchedulerQueue - -Display a list of all pending Requests in the Scheduler queue, grouped by -domain/spider. - -Spider live stats extension -~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -.. module:: scrapy.contrib.webconsole.livestats - :synopsis: Spider live stats web console extension - -.. class:: scrapy.contrib.webconsole.livestats.LiveStats - -Display a table with stats of all spider crawled by the current Scrapy run, -including: - -* Number of items scraped -* Number of pages crawled -* Number of pending requests in the scheduler -* Number of pending requests in the downloader queue -* Number of requests currently being downloaded - -Engine status extension -~~~~~~~~~~~~~~~~~~~~~~~ - -.. module:: scrapy.contrib.webconsole.enginestatus - :synopsis: Engine status web console extension - -.. class:: scrapy.contrib.webconsole.enginestatus.EngineStatus - -Display the current status of the Scrapy Engine, which is just the output of -the Scrapy engine ``getstatus()`` method. - -Stats collector dump extension -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -.. module:: scrapy.contrib.webconsole.stats - :synopsis: Stats dump web console extension - -.. class:: scrapy.contrib.webconsole.stats.StatsDump - -Display the stats collected so far by the stats collector. diff --git a/docs/topics/webservice.rst b/docs/topics/webservice.rst new file mode 100644 index 000000000..5f4222e7c --- /dev/null +++ b/docs/topics/webservice.rst @@ -0,0 +1,236 @@ +.. _topics-webservice: + +=========== +Web Service +=========== + +Scrapy comes with a built-in web service for monitoring and controlling a +running crawler. The service exposes most resources using the `JSON-RPC 2.0`_ +protocol, but there are also other (read-only) resources which just output JSON +data. + +Provides an extensible web service for managing a Scrapy process. It's enabled +by the :setting:`WEBSERVICE_ENABLED` setting. The web server will listen in the +port specified in :setting:`WEBSERVICE_PORT`, and will log to the file +specified in :setting:`WEBSERVICE_LOGFILE`. + +The web service is a :ref:`built-in Scrapy extension ` +which comes enabled by default, but you can also disable it if you're running +tight on memory. + +.. _topics-webservice-resources: + +Web service resources +===================== + +The web service contains several resources, defined in the +:setting:`WEBSERVICE_RESOURCES` setting. Each resource provides a different +functionality. See :ref:`topics-webservice-resources-ref` for a list of +resources available by default. + +Althought you can implement your own resources using any protocol, there are +two kinds of resources bundled with Scrapy: + +* Simple JSON resources - which are read-only and just output JSON data +* JSON-RPC resources - which provide direct access to certain Scrapy objects + using the `JSON-RPC 2.0`_ protocol + +.. 
+.. module:: scrapy.contrib.webservice
+   :synopsis: Built-in web service resources
+
+.. _topics-webservice-resources-ref:
+
+Available JSON-RPC resources
+----------------------------
+
+These are the JSON-RPC resources available by default in Scrapy:
+
+Execution Manager JSON-RPC resource
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. module:: scrapy.contrib.webservice.manager
+   :synopsis: Execution Manager JSON-RPC resource
+
+.. class:: ManagerResource
+
+   Provides access to the Execution Manager that controls the crawler.
+
+   Available by default at: http://localhost:6080/manager
+
+Stats Collector JSON-RPC resource
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. module:: scrapy.contrib.webservice.stats
+   :synopsis: Stats JSON-RPC resource
+
+.. class:: StatsResource
+
+   Provides access to the Stats Collector used by the crawler.
+
+   Available by default at: http://localhost:6080/stats
+
+Spider Manager JSON-RPC resource
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. module:: scrapy.contrib.webservice.spiders
+   :synopsis: Spider Manager JSON-RPC resource
+
+.. class:: SpidersResource
+
+   Provides access to the Spider Manager used by the crawler.
+
+   Available by default at: http://localhost:6080/spiders
+
+Extension Manager JSON-RPC resource
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. module:: scrapy.contrib.webservice.extensions
+   :synopsis: Extension Manager JSON-RPC resource
+
+.. class:: ExtensionsResource
+
+   Provides access to the Extension Manager used by the crawler.
+
+   Available by default at: http://localhost:6080/extensions
+
+Available JSON resources
+------------------------
+
+These are the JSON resources available by default:
+
+Engine status JSON resource
+~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. module:: scrapy.contrib.webservice.enginestatus
+   :synopsis: Engine Status JSON resource
+
+.. class:: EngineStatusResource
+
+   Provides access to the current status of the Scrapy engine.
+
+   Available by default at: http://localhost:6080/enginestatus
+
+Web service settings
+====================
+
+These are the settings that control the web service behaviour:
+
+.. setting:: WEBSERVICE_ENABLED
+
+WEBSERVICE_ENABLED
+------------------
+
+Default: ``True``
+
+A boolean which specifies if the web service will be enabled (provided its
+extension is also enabled).
+
+.. setting:: WEBSERVICE_LOGFILE
+
+WEBSERVICE_LOGFILE
+------------------
+
+Default: ``None``
+
+A file to use for logging HTTP requests made to the web service. If unset, the
+log is sent to the standard Scrapy log.
+
+.. setting:: WEBSERVICE_PORT
+
+WEBSERVICE_PORT
+---------------
+
+Default: ``6080``
+
+The port to use for the web service. If set to ``None`` or ``0``, a dynamically
+assigned port is used.
+
+.. setting:: WEBSERVICE_RESOURCES
+
+WEBSERVICE_RESOURCES
+--------------------
+
+Default: ``{}``
+
+The list of web service resources enabled for your project. See
+:ref:`topics-webservice-resources`. These are added to the ones available by
+default in Scrapy, defined in the :setting:`WEBSERVICE_RESOURCES_BASE` setting.
+
+.. setting:: WEBSERVICE_RESOURCES_BASE
+
+WEBSERVICE_RESOURCES_BASE
+-------------------------
+
+Default::
+
+    {
+        'scrapy.contrib.webservice.manager.ManagerResource': 1,
+        'scrapy.contrib.webservice.enginestatus.EngineStatusResource': 1,
+        'scrapy.contrib.webservice.extensions.ExtensionsResource': 1,
+        'scrapy.contrib.webservice.spiders.SpidersResource': 1,
+        'scrapy.contrib.webservice.stats.StatsResource': 1,
+    }
+
+The list of web service resources available by default in Scrapy.
+You shouldn't change this setting in your project; change
+:setting:`WEBSERVICE_RESOURCES` instead. If you want to disable a resource,
+set its value to ``None`` in :setting:`WEBSERVICE_RESOURCES`.
+
+Writing a web service resource
+==============================
+
+Web service resources are implemented using the Twisted Web API. See this
+`Twisted Web guide`_ for more information on Twisted Web and Twisted Web
+resources.
+
+To write a web service resource you should subclass the :class:`JsonResource`
+or :class:`JsonRpcResource` class and implement the ``render_GET`` method.
+
+.. class:: scrapy.webservice.JsonResource
+
+   A subclass of `twisted.web.resource.Resource`_ that implements a JSON web
+   service resource.
+
+   .. attribute:: ws_name
+
+      The name by which the Scrapy web service will know this resource, and
+      also the path where this resource will listen. For example, assuming
+      the Scrapy web service is listening on http://localhost:6080/ and the
+      ``ws_name`` is ``'resource1'``, the URL for that resource will be:
+
+          http://localhost:6080/resource1/
+
+.. class:: scrapy.webservice.JsonRpcResource(target=None)
+
+   This is a subclass of :class:`JsonResource` for implementing JSON-RPC
+   resources. JSON-RPC resources wrap Python (Scrapy) objects around a
+   JSON-RPC API. The resource wrapped must be returned by the
+   :meth:`get_target` method, which returns the target passed in the
+   constructor by default.
+
+   .. method:: get_target()
+
+      Return the object wrapped by this JSON-RPC resource. By default, it
+      returns the object passed on the constructor.
+
+Examples of web service resources
+=================================
+
+StatsResource (JSON-RPC resource)
+---------------------------------
+
+.. literalinclude:: ../../scrapy/contrib/webservice/stats.py
+
+EngineStatusResource (JSON resource)
+------------------------------------
+
+.. literalinclude:: ../../scrapy/contrib/webservice/enginestatus.py
+
+Example of web service client
+=============================
+
+scrapy-ws.py script
+-------------------
+
+.. literalinclude:: ../../bin/scrapy-ws.py
+
+.. _Twisted Web guide: http://jcalderone.livejournal.com/50562.html
+.. _JSON-RPC 2.0: http://www.jsonrpc.org/
+..
_twisted.web.resource.Resource: http://twistedmatrix.com/documents/10.0.0/api/twisted.web.resource.Resource.html + diff --git a/scrapy/conf/default_settings.py b/scrapy/conf/default_settings.py index f86615b32..d2983f898 100644 --- a/scrapy/conf/default_settings.py +++ b/scrapy/conf/default_settings.py @@ -109,13 +109,8 @@ EXTENSIONS = {} EXTENSIONS_BASE = { 'scrapy.contrib.corestats.CoreStats': 0, - 'scrapy.management.web.WebConsole': 0, + 'scrapy.webservice.WebService': 0, 'scrapy.telnet.TelnetConsole': 0, - 'scrapy.contrib.webconsole.scheduler.SchedulerQueue': 0, - 'scrapy.contrib.webconsole.livestats.LiveStats': 0, - 'scrapy.contrib.webconsole.spiderctl.Spiderctl': 0, - 'scrapy.contrib.webconsole.enginestatus.EngineStatus': 0, - 'scrapy.contrib.webconsole.stats.StatsDump': 0, 'scrapy.contrib.memusage.MemoryUsage': 0, 'scrapy.contrib.memdebug.MemoryDebugger': 0, 'scrapy.contrib.closespider.CloseSpider': 0, @@ -232,3 +227,14 @@ WEBCONSOLE_ENABLED = True WEBCONSOLE_PORT = 6080 WEBCONSOLE_LOGFILE = None +WEBSERVICE_ENABLED = True +WEBSERVICE_LOGFILE = None +WEBSERVICE_PORT = 6080 +WEBSERVICE_RESOURCES = {} +WEBSERVICE_RESOURCES_BASE = { + 'scrapy.contrib.webservice.manager.ManagerResource': 1, + 'scrapy.contrib.webservice.enginestatus.EngineStatusResource': 1, + 'scrapy.contrib.webservice.extensions.ExtensionsResource': 1, + 'scrapy.contrib.webservice.spiders.SpidersResource': 1, + 'scrapy.contrib.webservice.stats.StatsResource': 1, +} diff --git a/scrapy/contrib/webconsole/__init__.py b/scrapy/contrib/webconsole/__init__.py index e69de29bb..72830df17 100644 --- a/scrapy/contrib/webconsole/__init__.py +++ b/scrapy/contrib/webconsole/__init__.py @@ -0,0 +1,3 @@ +import warnings +warnings.warn("Web console is deprecated. Consider using web service instead.", \ + DeprecationWarning, stacklevel=2) diff --git a/scrapy/contrib/webservice/__init__.py b/scrapy/contrib/webservice/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/scrapy/contrib/webservice/enginestatus.py b/scrapy/contrib/webservice/enginestatus.py new file mode 100644 index 000000000..66230fab0 --- /dev/null +++ b/scrapy/contrib/webservice/enginestatus.py @@ -0,0 +1,24 @@ +from scrapy.webservice import JsonResource +from scrapy.core.manager import scrapymanager +from scrapy.utils.engine import get_engine_status + +class EngineStatusResource(JsonResource): + + ws_name = 'enginestatus' + + def __init__(self, spider_name=None, _manager=scrapymanager): + JsonResource.__init__(self) + self._spider_name = spider_name + self.isLeaf = spider_name is not None + self._manager = _manager + + def render_GET(self, txrequest): + status = get_engine_status(self._manager.engine) + if self._spider_name is None: + return status + for sp, st in status['spiders'].items(): + if sp.name == self._spider_name: + return st + + def getChild(self, name, txrequest): + return EngineStatusResource(name, self._manager) diff --git a/scrapy/contrib/webservice/extensions.py b/scrapy/contrib/webservice/extensions.py new file mode 100644 index 000000000..e18186cd1 --- /dev/null +++ b/scrapy/contrib/webservice/extensions.py @@ -0,0 +1,10 @@ +from scrapy.webservice import JsonRpcResource +from scrapy.extension import extensions + +class ExtensionsResource(JsonRpcResource): + + ws_name = 'extensions' + + def __init__(self, _extensions=extensions): + JsonRpcResource.__init__(self) + self._target = _extensions diff --git a/scrapy/contrib/webservice/manager.py b/scrapy/contrib/webservice/manager.py new file mode 100644 index 000000000..87bc4e270 --- 
/dev/null +++ b/scrapy/contrib/webservice/manager.py @@ -0,0 +1,10 @@ +from scrapy.webservice import JsonRpcResource +from scrapy.core.manager import scrapymanager + +class ManagerResource(JsonRpcResource): + + ws_name = 'manager' + + def __init__(self, _manager=scrapymanager): + JsonRpcResource.__init__(self) + self._target = _manager diff --git a/scrapy/contrib/webservice/spiders.py b/scrapy/contrib/webservice/spiders.py new file mode 100644 index 000000000..dd64645a1 --- /dev/null +++ b/scrapy/contrib/webservice/spiders.py @@ -0,0 +1,10 @@ +from scrapy.webservice import JsonRpcResource +from scrapy.spider import spiders + +class SpidersResource(JsonRpcResource): + + ws_name = 'spiders' + + def __init__(self, _spiders=spiders): + JsonRpcResource.__init__(self) + self._target = _spiders diff --git a/scrapy/contrib/webservice/stats.py b/scrapy/contrib/webservice/stats.py new file mode 100644 index 000000000..3ab2efce3 --- /dev/null +++ b/scrapy/contrib/webservice/stats.py @@ -0,0 +1,10 @@ +from scrapy.webservice import JsonRpcResource +from scrapy.stats import stats + +class StatsResource(JsonRpcResource): + + ws_name = 'stats' + + def __init__(self, _stats=stats): + JsonRpcResource.__init__(self) + self._target = _stats diff --git a/scrapy/management/web.py b/scrapy/management/web.py index 498613a5c..d54cf4953 100644 --- a/scrapy/management/web.py +++ b/scrapy/management/web.py @@ -1,8 +1,6 @@ -""" -Scrapy Web Console extension - -See docs/topics/webconsole.rst -""" +import warnings +warnings.warn("Scrapy web console is deprecated. Consider using web service instead.", \ + DeprecationWarning, stacklevel=2) import re import socket diff --git a/scrapy/spider/models.py b/scrapy/spider/models.py index 09e9f1771..d0be6f901 100644 --- a/scrapy/spider/models.py +++ b/scrapy/spider/models.py @@ -77,6 +77,6 @@ class BaseSpider(object_ref): raise NotImplementedError def __str__(self): - return "<%s %r>" % (type(self).__name__, self.name) + return "<%s %r at 0x%0x>" % (type(self).__name__, self.name, id(self)) __repr__ = __str__ diff --git a/scrapy/tests/test_utils_jsonrpc.py b/scrapy/tests/test_utils_jsonrpc.py new file mode 100644 index 000000000..5e2e01344 --- /dev/null +++ b/scrapy/tests/test_utils_jsonrpc.py @@ -0,0 +1,112 @@ +import unittest +from cStringIO import StringIO + +from scrapy.utils.jsonrpc import jsonrpc_client_call, jsonrpc_server_call, \ + JsonRpcError, jsonrpc_errors +from scrapy.utils.py26 import json + +class urllib_stub(object): + def __init__(self, result=None, error=None): + response = {} + if result: + response.update(result=result) + if error: + response.update(error=error) + self.response = json.dumps(response) + self.request = None + + def urlopen(self, url, request): + self.url = url + self.request = request + return StringIO(self.response) + +class TestTarget(object): + + def call(self, *args, **kwargs): + return list(args), kwargs + + def exception(self): + raise Exception("testing-errors") + +class JsonRpcUtilsTestCase(unittest.TestCase): + + def test_jsonrpc_client_call_request(self): + ul = urllib_stub(1) + jsonrpc_client_call('url', 'test', 'one', 2, _urllib=ul) + req = json.loads(ul.request) + assert 'id' in req + self.assertEqual(ul.url, 'url') + self.assertEqual(req['jsonrpc'], '2.0') + self.assertEqual(req['method'], 'test') + self.assertEqual(req['params'], ['one', 2]) + + def test_jsonrpc_client_call_response(self): + ul = urllib_stub() + # must return result or error + self.assertRaises(ValueError, jsonrpc_client_call, 'url', 'test', _urllib=ul) + ul = 
urllib_stub(result={'one': 1}) + self.assertEquals(jsonrpc_client_call('url', 'test', _urllib=ul), {'one': 1}) + ul = urllib_stub(error={'code': 123, 'message': 'hello', 'data': 'some data'}) + + raised = False + try: + jsonrpc_client_call('url', 'test', _urllib=ul) + except JsonRpcError, e: + raised = True + self.assertEqual(e.code, 123) + self.assertEqual(e.message, 'hello') + self.assertEqual(e.data, 'some data') + assert '123' in str(e) + assert 'hello' in str(e) + assert raised, "JsonRpcError not raised" + + def test_jsonrpc_server_call(self): + t = TestTarget() + r = jsonrpc_server_call(t, 'invalid json data') + assert 'error' in r + assert r['jsonrpc'] == '2.0' + assert r['id'] is None + self.assertEqual(r['error']['code'], jsonrpc_errors.PARSE_ERROR) + assert 'Traceback' in r['error']['data'] + + r = jsonrpc_server_call(t, '{"test": "test"}') + assert 'error' in r + assert r['jsonrpc'] == '2.0' + assert r['id'] is None + self.assertEqual(r['error']['code'], jsonrpc_errors.INVALID_REQUEST) + + r = jsonrpc_server_call(t, '{"method": "notfound", "id": 1}') + assert 'error' in r + assert r['jsonrpc'] == '2.0' + assert r['id'] == 1 + self.assertEqual(r['error']['code'], jsonrpc_errors.METHOD_NOT_FOUND) + + r = jsonrpc_server_call(t, '{"method": "exception", "id": 1}') + assert 'error' in r + assert r['jsonrpc'] == '2.0' + assert r['id'] == 1 + self.assertEqual(r['error']['code'], jsonrpc_errors.INTERNAL_ERROR) + assert 'testing-errors' in r['error']['message'] + assert 'Traceback' in r['error']['data'] + + r = jsonrpc_server_call(t, '{"method": "call", "id": 2}') + assert 'result' in r + assert r['jsonrpc'] == '2.0' + assert r['id'] == 2 + self.assertEqual(r['result'], ([], {})) + + r = jsonrpc_server_call(t, '{"method": "call", "params": [456, 123], "id": 3}') + assert 'result' in r + assert r['jsonrpc'] == '2.0' + assert r['id'] == 3 + self.assertEqual(r['result'], ([456, 123], {})) + + r = jsonrpc_server_call(t, '{"method": "call", "params": {"data": 789}, "id": 3}') + assert 'result' in r + assert r['jsonrpc'] == '2.0' + assert r['id'] == 3 + self.assertEqual(r['result'], ([], {'data': 789})) + +if __name__ == "__main__": + unittest.main() + diff --git a/scrapy/tests/test_utils_serialize.py b/scrapy/tests/test_utils_serialize.py new file mode 100644 index 000000000..01b5fab2a --- /dev/null +++ b/scrapy/tests/test_utils_serialize.py @@ -0,0 +1,125 @@ +import unittest +import datetime +from decimal import Decimal + +from scrapy.utils.serialize import SpiderReferencer, ScrapyJSONEncoder, ScrapyJSONDecoder +from scrapy.utils.py26 import json +from scrapy.spider import BaseSpider +from scrapy.http import Request, Response + + +class ExecutionEngineStub(object): + def __init__(self, open_spiders): + self.open_spiders = open_spiders + +class ExecutionMangerStub(object): + def __init__(self, open_spiders): + self.engine = ExecutionEngineStub(open_spiders) + +class BaseTestCase(unittest.TestCase): + + def setUp(self): + self.spider1 = BaseSpider('name1') + self.spider2 = BaseSpider('name2') + open_spiders = set([self.spider1, self.spider2]) + manager = ExecutionMangerStub(open_spiders) + self.spref = SpiderReferencer(manager) + self.encoder = ScrapyJSONEncoder(spref=self.spref) + self.decoder = ScrapyJSONDecoder(spref=self.spref) + +class SpiderReferencerTestCase(BaseTestCase): + + def test_spiders_and_references(self): + ref1 = self.spref.get_reference_from_spider(self.spider1) + assert isinstance(ref1, str) + assert self.spider1.name in ref1 + ref2 = 
self.spref.get_reference_from_spider(self.spider2) + ref1_ = self.spref.get_reference_from_spider(self.spider1) + assert ref1 == ref1_ + assert ref1 != ref2 + + sp1 = self.spref.get_spider_from_reference(ref1) + sp2 = self.spref.get_spider_from_reference(ref2) + sp1_ = self.spref.get_spider_from_reference(ref1) + assert isinstance(sp1, BaseSpider) + assert sp1 is not sp2 + assert sp1 is sp1_ + + # must return string as-is if spider id not found + assert 'lala' == self.spref.get_spider_from_reference('lala') + # must raise RuntimeError if spider id is not found and spider is not running + self.assertRaises(RuntimeError, self.spref.get_spider_from_reference, 'spider:fffffff') + + def test_encode_decode(self): + sr = self.spref + sp1 = self.spider1 + sp2 = self.spider2 + ref1 = sr.get_reference_from_spider(sp1) + ref2 = sr.get_reference_from_spider(sp2) + + examples = [ + ('lala', 'lala'), + (sp1, ref1), + (['lala', sp1], ['lala', ref1]), + ({'lala': sp1}, {'lala': ref1}), + ({sp1: sp2}, {ref1: ref2}), + ({sp1: {sp2: ['lala', sp1]}}, {ref1: {ref2: ['lala', ref1]}}) + ] + for spiders, refs in examples: + self.assertEqual(sr.encode_references(spiders), refs) + self.assertEqual(sr.decode_references(refs), spiders) + +class JsonEncoderTestCase(BaseTestCase): + + def test_encode_decode(self): + sr = self.spref + sp1 = self.spider1 + sp2 = self.spider2 + ref1 = sr.get_reference_from_spider(sp1) + ref2 = sr.get_reference_from_spider(sp2) + dt = datetime.datetime(2010, 1, 2, 10, 11, 12) + dts = "2010-01-02 10:11:12" + d = datetime.date(2010, 1, 2) + ds = "2010-01-02" + t = datetime.time(10, 11, 12) + ts = "10:11:12" + dec = Decimal("1000.12") + decs = "1000.12" + + examples_encode_decode = [ + ('lala', 'lala'), + (sp1, ref1), + (['lala', sp1], ['lala', ref1]), + ({'lala': sp1}, {'lala': ref1}), + ({sp1: sp2}, {ref1: ref2}), + ({sp1: {sp2: ['lala', sp1]}}, {ref1: {ref2: ['lala', ref1]}}) + ] + for spiders, refs in examples_encode_decode: + self.assertEqual(self.encoder.encode(spiders), json.dumps(refs)) + self.assertEqual(self.decoder.decode(json.dumps(refs)), spiders) + + examples_encode_only = [ + ({sp1: dt}, {ref1: dts}), + ({sp1: d}, {ref1: ds}), + ({sp1: t}, {ref1: ts}), + ({sp1: dec}, {ref1: decs}), + ] + for spiders, refs in examples_encode_only: + self.assertEqual(self.encoder.encode(spiders), json.dumps(refs)) + + def test_encode_request(self): + r = Request("http://www.example.com/lala") + rs = self.encoder.encode(r) + assert r.method in rs + assert r.url in rs + + def test_encode_response(self): + r = Response("http://www.example.com/lala") + rs = self.encoder.encode(r) + assert r.url in rs + assert str(r.status) in rs + + +if __name__ == "__main__": + unittest.main() + diff --git a/scrapy/utils/engine.py b/scrapy/utils/engine.py index 6f37055a5..a87741631 100644 --- a/scrapy/utils/engine.py +++ b/scrapy/utils/engine.py @@ -37,23 +37,34 @@ def get_engine_status(engine=None): "engine.scraper.sites[spider].needs_backout()", ] - s = "Execution engine status\n\n" - + status = {'global': {}, 'spiders': {}} for test in global_tests: try: - s += "%-47s : %s\n" % (test, eval(test)) + status['global'][test] = eval(test) except Exception, e: - s += "%-47s : %s (exception)\n" % (test, type(e).__name__) - s += "\n" + status['global'][test] = "%s (exception)" % type(e).__name__ for spider in engine.downloader.sites: - s += "Spider: %s\n" % spider + x = {} for test in spider_tests: try: - s += " %-50s : %s\n" % (test, eval(test)) + x[test] = eval(test) except Exception, e: - s += " %-50s : %s 
(exception)\n" % (test, type(e).__name__) + x[test] = "%s (exception)" % type(e).__name__ + status['spiders'][spider] = x + return status + +def format_engine_status(engine=None): + status = get_engine_status(engine) + s = "Execution engine status\n\n" + for test, result in status['global'].items(): + s += "%-47s : %s\n" % (test, result) + s += "\n" + for spider, tests in status['spiders'].items(): + s += "Spider: %s\n" % spider + for test, result in tests.items(): + s += " %-50s : %s\n" % (test, result) return s def print_engine_status(engine=None): - print get_engine_status(engine) + print format_engine_status(engine) diff --git a/scrapy/utils/jsonrpc.py b/scrapy/utils/jsonrpc.py new file mode 100644 index 000000000..6735e7a03 --- /dev/null +++ b/scrapy/utils/jsonrpc.py @@ -0,0 +1,94 @@ +""" +This module implements the JSON-RPC 2.0 protocol, as defined in: +http://groups.google.com/group/json-rpc/web/json-rpc-2-0 +""" + +import urllib +import traceback + +from scrapy.utils.py26 import json +from scrapy.utils.serialize import ScrapyJSONDecoder + +# JSON-RPC 2.0 errors, as defined in: +class jsonrpc_errors: + PARSE_ERROR = -32700 + INVALID_REQUEST = -32600 + METHOD_NOT_FOUND = -32601 + INVALID_PARAMS = -32602 + INTERNAL_ERROR = -32603 + +class JsonRpcError(Exception): + + def __init__(self, code, message, data=None): + super(JsonRpcError, self).__init__() + self.code = code + self.message = message + self.data = data + + def __str__(self): + return "JSON-RPC error (code %d): %s" % (self.code, self.message) + +def jsonrpc_client_call(url, method, *args, **kwargs): + """Execute a JSON-RPC call on the given url""" + _urllib = kwargs.pop('_urllib', urllib) + req = {'jsonrpc': '2.0', 'method': method, 'params': args or kwargs, 'id': 1} + res = json.loads(_urllib.urlopen(url, json.dumps(req)).read()) + if 'result' in res: + return res['result'] + elif 'error' in res: + er = res['error'] + raise JsonRpcError(er['code'], er['message'], er['data']) + else: + msg = "JSON-RPC response must contain 'result' or 'error': %s" % res + raise ValueError(msg) + +def jsonrpc_server_call(target, jsonrpc_request, json_decoder=None): + """Execute the given JSON-RPC request (as JSON-encoded string) on the given + target object and return the JSON-RPC response, as a dict + """ + if json_decoder is None: + json_decoder = ScrapyJSONDecoder() + + try: + req = json_decoder.decode(jsonrpc_request) + except Exception, e: + return jsonrpc_error(None, jsonrpc_errors.PARSE_ERROR, 'Parse error', \ + traceback.format_exc()) + + try: + id, methname = req['id'], req['method'] + except KeyError: + return jsonrpc_error(None, jsonrpc_errors.INVALID_REQUEST, 'Invalid Request') + + try: + method = getattr(target, methname) + except AttributeError: + return jsonrpc_error(id, jsonrpc_errors.METHOD_NOT_FOUND, 'Method not found') + + params = req.get('params', []) + a, kw = ([], params) if isinstance(params, dict) else (params, {}) + try: + return jsonrpc_result(id, method(*a, **kw)) + except Exception, e: + return jsonrpc_error(id, jsonrpc_errors.INTERNAL_ERROR, str(e), \ + traceback.format_exc()) + +def jsonrpc_error(id, code, message, data=None): + """Create JSON-RPC error response""" + return { + 'jsonrpc': '2.0', + 'error': { + 'code': code, + 'message': message, + 'data': data, + }, + 'id': id, + } + +def jsonrpc_result(id, result): + """Create JSON-RPC result response""" + return { + 'jsonrpc': '2.0', + 'result': result, + 'id': id, + } diff --git a/scrapy/utils/serialize.py b/scrapy/utils/serialize.py new file mode 100644 index 
000000000..aa07c7865 --- /dev/null +++ b/scrapy/utils/serialize.py @@ -0,0 +1,115 @@ +import re +import datetime +import decimal + +from scrapy.core.manager import scrapymanager +from scrapy.spider import BaseSpider +from scrapy.http import Request, Response +from scrapy.utils.py26 import json + + +class SpiderReferencer(object): + """Class to serialize (and deserialize) objects (typically dicts) + containing references to running spiders (ie. Spider objects). This is + required because simplejson fails to serialize dicts containing + non-primitive types as keys, even when you override + ScrapyJSONEncoder.default() with a custom encoding mechanism. + """ + + spider_ref_re = re.compile('^spider:([0-9a-f]+)(:.*)?$') + + def __init__(self, manager=None): + self.manager = manager or scrapymanager + + def get_reference_from_spider(self, spider): + return 'spider:%x:%s' % (id(spider), spider.name) + + def get_spider_from_reference(self, ref): + """Returns the Spider referenced by text, if text is a spider + reference. Otherwise it returns the text itself. If the text references + a non-running spider it raises a RuntimeError. + """ + m = self.spider_ref_re.search(ref) + if m: + spid = int(m.group(1), 16) + for spider in self.manager.engine.open_spiders: + if id(spider) == spid: + return spider + raise RuntimeError("Spider not running: %s" % ref) + return ref + + def encode_references(self, obj): + """Look for Spider objects and replace them with spider references""" + if isinstance(obj, BaseSpider): + return self.get_reference_from_spider(obj) + elif isinstance(obj, dict): + d = {} + for k, v in obj.items(): + k = self.encode_references(k) + v = self.encode_references(v) + d[k] = v + return d + elif isinstance(obj, (list, tuple)): + return [self.encode_references(x) for x in obj] + else: + return obj + + def decode_references(self, obj): + """Look for spider references and replace them with Spider objects""" + if isinstance(obj, basestring): + return self.get_spider_from_reference(obj) + elif isinstance(obj, dict): + d = {} + for k, v in obj.items(): + k = self.decode_references(k) + v = self.decode_references(v) + d[k] = v + return d + elif isinstance(obj, (list, tuple)): + return [self.decode_references(x) for x in obj] + else: + return obj + + +class ScrapyJSONEncoder(json.JSONEncoder): + + DATE_FORMAT = "%Y-%m-%d" + TIME_FORMAT = "%H:%M:%S" + + def __init__(self, *a, **kw): + self.spref = kw.pop('spref', None) or SpiderReferencer() + super(ScrapyJSONEncoder, self).__init__(*a, **kw) + + def encode(self, o): + if self.spref: + o = self.spref.encode_references(o) + return super(ScrapyJSONEncoder, self).encode(o) + + def default(self, o): + if isinstance(o, datetime.datetime): + return o.strftime("%s %s" % (self.DATE_FORMAT, self.TIME_FORMAT)) + elif isinstance(o, datetime.date): + return o.strftime(self.DATE_FORMAT) + elif isinstance(o, datetime.time): + return o.strftime(self.TIME_FORMAT) + elif isinstance(o, decimal.Decimal): + return str(o) + elif isinstance(o, Request): + return "<%s %s %s>" % (type(o).__name__, o.method, o.url) + elif isinstance(o, Response): + return "<%s %s %s>" % (type(o).__name__, o.status, o.url) + else: + return super(ScrapyJSONEncoder, self).default(o) + + +class ScrapyJSONDecoder(json.JSONDecoder): + + def __init__(self, *a, **kw): + self.spref = kw.pop('spref', None) or SpiderReferencer() + super(ScrapyJSONDecoder, self).__init__(*a, **kw) + + def decode(self, s): + o = super(ScrapyJSONDecoder, self).decode(s) + if self.spref: + o = 
self.spref.decode_references(o) + return o diff --git a/scrapy/webservice.py b/scrapy/webservice.py new file mode 100644 index 000000000..ddfe97369 --- /dev/null +++ b/scrapy/webservice.py @@ -0,0 +1,86 @@ +""" +Scrapy web services extension + +See docs/topics/ws.rst +""" + +from twisted.internet import reactor +from twisted.web import server, resource, error + +from scrapy.core.exceptions import NotConfigured +from scrapy.utils.jsonrpc import jsonrpc_server_call +from scrapy.utils.serialize import ScrapyJSONEncoder, ScrapyJSONDecoder +from scrapy.utils.misc import load_object +from scrapy.utils.conf import build_component_list +from scrapy.conf import settings + + +class JsonResource(resource.Resource): + + ws_name = None + json_encoder = ScrapyJSONEncoder() + + def render(self, txrequest): + r = resource.Resource.render(self, txrequest) + r = self.json_encoder.encode(r) + txrequest.setHeader('Content-Type', 'application/json') + txrequest.setHeader('Content-Length', len(r)) + return r + + +class JsonRpcResource(JsonResource): + + json_decoder = ScrapyJSONDecoder() + + def __init__(self, target=None): + JsonResource.__init__(self) + self._target = target + + def render_GET(self, txrequest): + return self.get_target() + + def render_POST(self, txrequest): + reqstr = txrequest.content.read() + target = self.get_target() + return jsonrpc_server_call(target, reqstr, self.json_decoder) + + def getChild(self, name, txrequest): + target = self.get_target() + try: + newtarget = getattr(target, name) + return JsonRpcResource(newtarget) + except AttributeError: + return error.NoResource("No such child resource.") + + def get_target(self): + return self._target + + +class RootResource(JsonResource): + + def render_GET(self, txrequest): + return {'resources': self.children.keys()} + + def getChild(self, name, txrequest): + if name == '': + return self + return JsonResource.getChild(self, name, txrequest) + + +class WebService(server.Site): + + def __init__(self): + if not settings.getbool('WEBSERVICE_ENABLED'): + raise NotConfigured + logfile = settings['WEBSERVICE_LOGFILE'] + port = settings.getint('WEBSERVICE_PORT') + root = RootResource() + reslist = build_component_list(settings['WEBSERVICE_RESOURCES_BASE'], \ + settings['WEBSERVICE_RESOURCES']) + for res_cls in map(load_object, reslist): + res = res_cls() + root.putChild(res.ws_name, res) + server.Site.__init__(self, root, logPath=logfile) + self.noisy = False + reactor.callWhenRunning(reactor.listenTCP, port, self) +
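
For reference, a minimal sketch of driving the new web service from Python, assuming a
crawler is running locally with the default ``WEBSERVICE_PORT`` (6080). It uses only the
``jsonrpc_client_call`` helper added in ``scrapy/utils/jsonrpc.py`` and the ``get_stats``
and ``list`` methods already exercised by ``bin/scrapy-ws.py``::

    from scrapy.utils.jsonrpc import jsonrpc_client_call, JsonRpcError

    try:
        # Global stats, via the Stats Collector resource mounted at /stats
        stats = jsonrpc_client_call('http://localhost:6080/stats', 'get_stats')
        for name, value in stats.items():
            print "%-40s %s" % (name, value)
        # Names of available spiders, via the Spider Manager resource at /spiders
        for name in jsonrpc_client_call('http://localhost:6080/spiders', 'list'):
            print name
    except JsonRpcError, e:
        # Server-side exceptions come back as JSON-RPC error responses
        print str(e)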
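
And a sketch of a custom read-only JSON resource enabled through ``WEBSERVICE_RESOURCES``,
modelled on ``EngineStatusResource``. ``UptimeResource`` and the ``myproject.webservice``
module path are hypothetical names used only for illustration; ``JsonResource``, ``ws_name``
and the JSON encoding performed by ``JsonResource.render()`` are the ones introduced by
this patch::

    import time

    from scrapy.webservice import JsonResource

    class UptimeResource(JsonResource):
        # Hypothetical resource reporting how long the process has been up

        ws_name = 'uptime'  # served at http://localhost:6080/uptime

        def __init__(self):
            JsonResource.__init__(self)
            self._start_time = time.time()

        def render_GET(self, txrequest):
            # JsonResource.render() JSON-encodes whatever render_GET() returns
            return {'uptime_seconds': time.time() - self._start_time}

Enabling it would be a matter of adding ``'myproject.webservice.UptimeResource': 1`` to the
project's ``WEBSERVICE_RESOURCES`` setting.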