mirror of https://github.com/scrapy/scrapy.git (synced 2025-02-23 15:04:27 +00:00)

* Added Scrapy Web Service with documentation and tests.
* Marked Web Console as deprecated.
* Removed Web Console documentation to discourage its use.

This commit is contained in:
parent 2499dfee5e
commit 6a33d6c4d0
bin/scrapy-ws.py (new executable file, 114 lines)
@@ -0,0 +1,114 @@
#!/usr/bin/env python
"""
Example script to control and monitor Scrapy using its web service. It only
provides reduced functionality, as its main purpose is to illustrate how to
write a web service client. Feel free to improve it or write your own.
"""

import sys, optparse, urllib
from urlparse import urljoin

from scrapy.utils.jsonrpc import jsonrpc_client_call, JsonRpcError
from scrapy.utils.py26 import json

def get_commands():
    return {
        'help': cmd_help,
        'run': cmd_run,
        'list-available': cmd_list_available,
        'list-running': cmd_list_running,
        'list-resources': cmd_list_resources,
        'list-extensions': cmd_list_extensions,
        'get-global-stats': cmd_get_global_stats,
        'get-spider-stats': cmd_get_spider_stats,
    }

def cmd_help(args, opts):
    """help - list available commands"""
    print "Available commands:"
    for _, func in sorted(get_commands().items()):
        print "  ", func.__doc__

def cmd_run(args, opts):
    """run <spider_name> - schedule spider for running"""
    jsonrpc_call(opts, 'manager/queue', 'append_spider_name', args[0])

def cmd_list_running(args, opts):
    """list-running - list running spiders"""
    for x in json_get(opts, 'manager/engine/open_spiders'):
        print x

def cmd_list_available(args, opts):
    """list-available - list names of available spiders"""
    for x in jsonrpc_call(opts, 'spiders', 'list'):
        print x

def cmd_list_resources(args, opts):
    """list-resources - list available web service resources"""
    for x in json_get(opts, '')['resources']:
        print x

def cmd_list_extensions(args, opts):
    """list-extensions - list enabled extensions"""
    for x in jsonrpc_call(opts, 'extensions/enabled', 'keys'):
        print x

def cmd_get_spider_stats(args, opts):
    """get-spider-stats <spider> - get stats of a running spider"""
    stats = jsonrpc_call(opts, 'stats', 'get_stats', args[0])
    for name, value in stats.items():
        print "%-40s %s" % (name, value)

def cmd_get_global_stats(args, opts):
    """get-global-stats - get global stats"""
    stats = jsonrpc_call(opts, 'stats', 'get_stats')
    for name, value in stats.items():
        print "%-40s %s" % (name, value)

def get_wsurl(opts, path):
    return urljoin("http://%s:%s/" % (opts.host, opts.port), path)

def jsonrpc_call(opts, path, method, *args, **kwargs):
    url = get_wsurl(opts, path)
    return jsonrpc_client_call(url, method, *args, **kwargs)

def json_get(opts, path):
    url = get_wsurl(opts, path)
    return json.loads(urllib.urlopen(url).read())

def parse_opts():
    usage = "%prog [options] <command> [arg] ..."
    description = "Scrapy web service control script. Use '%prog help' " \
        "to see the list of available commands."
    op = optparse.OptionParser(usage=usage, description=description)
    op.add_option("-H", dest="host", default="localhost", \
        help="Scrapy host to connect to")
    op.add_option("-P", dest="port", type="int", default=6080, \
        help="Scrapy port to connect to")
    opts, args = op.parse_args()
    if not args:
        op.print_help()
        sys.exit(2)
    cmdname, cmdargs, opts = args[0], args[1:], opts
    commands = get_commands()
    if cmdname not in commands:
        sys.stderr.write("Unknown command: %s\n\n" % cmdname)
        cmd_help(None, None)
        sys.exit(1)
    return commands[cmdname], cmdargs, opts

def main():
    cmd, args, opts = parse_opts()
    try:
        cmd(args, opts)
    except IndexError:
        print cmd.__doc__
    except JsonRpcError, e:
        print str(e)
        if e.data:
            print "Server Traceback below:"
            print e.data


if __name__ == '__main__':
    main()
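For illustration only (not part of this commit), the JSON-RPC helper used by the script above can also be called directly from Python. The sketch below assumes a crawler is running locally with the web service enabled on the default port 6080:

    from scrapy.utils.jsonrpc import jsonrpc_client_call

    # Equivalent of "scrapy-ws.py list-available": ask the 'spiders' resource,
    # which wraps the Spider Manager, for the list of known spider names.
    for name in jsonrpc_client_call('http://localhost:6080/spiders', 'list'):
        print name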
@@ -87,7 +87,7 @@ Built-in services
    topics/stats
    topics/email
    topics/telnetconsole
-   topics/webconsole
+   topics/webservice

 :doc:`topics/logging`
     Understand the simple logging facility provided by Scrapy.
@@ -101,8 +101,8 @@ Built-in services
 :doc:`topics/telnetconsole`
     Inspect a running crawler using a built-in Python console.

-:doc:`topics/webconsole`
-    Monitor and control a crawler using a web interface.
+:doc:`topics/webservice`
+    Monitor and control a crawler using a web service.


 Solving specific problems
@@ -190,7 +190,7 @@ scraping easy and efficient, such as:
 * An :ref:`Interactive scraping shell console <topics-shell>`, very useful for
   writing and debugging your spiders

-* A :ref:`Web management console <topics-webconsole>` for monitoring and
+* A builtin :ref:`Web service <topics-webservice>` for monitoring and
   controlling your bot

 * A :ref:`Telnet console <topics-telnetconsole>` for full unrestricted access
@@ -36,10 +36,8 @@ by a string: the full Python path to the extension's class name. For example::

     EXTENSIONS = {
         'scrapy.contrib.corestats.CoreStats': 500,
-        'scrapy.management.web.WebConsole': 500,
-        'scrapy.management.telnet.TelnetConsole': 500,
-        'scrapy.contrib.webconsole.enginestatus.EngineStatus': 500,
-        'scrapy.contrib.webconsole.stats.StatsDump': 500,
+        'scrapy.webservice.WebService': 500,
+        'scrapy.telnet.TelnetConsole': 500,
     }


@@ -71,10 +69,10 @@ Accessing enabled extensions

 Even though it's not usually needed, you can access extension objects through
 the :ref:`topics-extensions-ref-manager` which is populated when extensions are
-loaded. For example, to access the ``WebConsole`` extension::
+loaded. For example, to access the ``WebService`` extension::

     from scrapy.extension import extensions
-    webconsole_extension = extensions.enabled['WebConsole']
+    webservice_extension = extensions.enabled['WebService']

 .. see also::

@@ -146,7 +144,7 @@ how you :ref:`configure the downloader middlewares
 >>> extensions.load()
 >>> print extensions.enabled
 {'CoreStats': <scrapy.contrib.corestats.CoreStats object at 0x9e272ac>,
- 'WebConsole': <scrapy.management.telnet.TelnetConsole instance at 0xa05670c>,
+ 'WebService': <scrapy.management.telnet.TelnetConsole instance at 0xa05670c>,
 ...

 .. attribute:: disabled
@@ -158,7 +156,7 @@ how you :ref:`configure the downloader middlewares
 >>> from scrapy.extension import extensions
 >>> extensions.load()
 >>> print extensions.disabled
-{'MemoryDebugger': 'scrapy.contrib.webconsole.stats.MemoryDebugger',
+{'MemoryDebugger': 'scrapy.contrib.memdebug.MemoryDebugger',
  'MyExtension': 'myproject.extensions.MyExtension',
 ...

@@ -193,44 +191,34 @@ Core Stats extension
 Enable the collection of core statistics, provided the stats collection is
 enabled (see :ref:`topics-stats`).

-.. _topics-extensions-ref-webconsole:
+.. _topics-extensions-ref-webservice:

-Web console extension
+Web service extension
 ~~~~~~~~~~~~~~~~~~~~~

-.. module:: scrapy.management.web
-   :synopsis: Web management console
+.. module:: scrapy.webservice
+   :synopsis: Web service

-.. class:: scrapy.management.web.WebConsole
+.. class:: scrapy.webservice.WebService

-Provides an extensible web server for managing a Scrapy process. It's enabled
-by the :setting:`WEBCONSOLE_ENABLED` setting. The server will listen in the
-port specified in :setting:`WEBCONSOLE_PORT`, and will log to the file
-specified in :setting:`WEBCONSOLE_LOGFILE`.
-
-The web server is designed to be extended by other extensions which can add
-their own management web interfaces.
-
-See also :ref:`topics-webconsole` for information on how to write your own web
-console extension, and :ref:`topics-webconsole-extensions-ref` for a list of
-available built-in (web console) extensions.
+See `topics-webservice`.

 .. _topics-extensions-ref-telnetconsole:

 Telnet console extension
 ~~~~~~~~~~~~~~~~~~~~~~~~

-.. module:: scrapy.management.telnet
-   :synopsis: Telnet management console
+.. module:: scrapy.telnet
+   :synopsis: Telnet console

-.. class:: scrapy.management.telnet.TelnetConsole
+.. class:: scrapy.telnet.TelnetConsole

 Provides a telnet console for getting into a Python interpreter inside the
 currently running Scrapy process, which can be very useful for debugging.

 The telnet console must be enabled by the :setting:`TELNETCONSOLE_ENABLED`
 setting, and the server will listen in the port specified in
-:setting:`WEBCONSOLE_PORT`.
+:setting:`TELNETCONSOLE_PORT`.

 .. _topics-extensions-ref-memusage:

@@ -519,13 +519,8 @@ Default::

     {
         'scrapy.contrib.corestats.CoreStats': 0,
-        'scrapy.management.web.WebConsole': 0,
-        'scrapy.management.telnet.TelnetConsole': 0,
-        'scrapy.contrib.webconsole.scheduler.SchedulerQueue': 0,
-        'scrapy.contrib.webconsole.livestats.LiveStats': 0,
-        'scrapy.contrib.webconsole.spiderctl.Spiderctl': 0,
-        'scrapy.contrib.webconsole.enginestatus.EngineStatus': 0,
-        'scrapy.contrib.webconsole.stats.StatsDump': 0,
+        'scrapy.webservice.WebService': 0,
+        'scrapy.telnet.TelnetConsole': 0,
         'scrapy.contrib.memusage.MemoryUsage': 0,
         'scrapy.contrib.memdebug.MemoryDebugger': 0,
         'scrapy.contrib.closedomain.CloseDomain': 0,
@@ -1054,33 +1049,3 @@ Default: ``"%s/%s" % (BOT_NAME, BOT_VERSION)``

 The default User-Agent to use when crawling, unless overrided.

-.. setting:: WEBCONSOLE_ENABLED
-
-WEBCONSOLE_ENABLED
-------------------
-
-Default: True
-
-A boolean which specifies if the web management console will be enabled
-(provided its extension is also enabled).
-
-.. setting:: WEBCONSOLE_LOGFILE
-
-WEBCONSOLE_LOGFILE
-------------------
-
-Default: ``None``
-
-A file to use for logging HTTP requests made to the web console. If unset web
-the log is sent to standard scrapy log.
-
-.. setting:: WEBCONSOLE_PORT
-
-WEBCONSOLE_PORT
----------------
-
-Default: ``6080``
-
-The port to use for the web console. If set to ``None`` or ``0``, a dynamically
-assigned port is used. For more info see :ref:`topics-webconsole`.
-
@@ -1,142 +0,0 @@ (entire file removed; former contents follow)
.. _topics-webconsole:

===========
Web Console
===========

Scrapy comes with a built-in web server for monitoring and controlling a Scrapy
running process.

The web console is :ref:`built-in Scrapy extension
<topics-extensions-ref>` which comes enabled by default, but you can also
disable it if you're running tight on memory.

For more information about this extension see
:ref:`topics-extensions-ref-webconsole`.

Writing a web console extension
===============================

Writing a web console extension is similar to writing any other :ref:`Scrapy
extensions <topics-extensions>` except that the extension class must:

1. catch the ``scrapy.management.web.webconsole_discover_module`` signal, and
   return itself in the handler.

2. have the following two attributes:

.. attribute:: webconsole_id

   The id by which the Scrapy web interface will known this extension, and
   also the main dir under which this extension interface will work. For
   example, assuming Scrapy web server is listening on
   http://localhost:8000/ and the ``webconsole_id='extension1'`` the web
   main page for the interface of that extension will be:

       http://localhost:8000/extension1/

.. attribute:: webconsole_name

   The name by which the Scrapy web server will know that extension. That name
   will be displayed in the main web console index, as the text that links to
   the extension main page.

3. implement the following method:

.. method:: webconsole_render(wc_request)

   ``wc_request`` is a `twisted.web.http.Request`_ object with the HTTP request
   sent to the web console.

.. _twisted.web.http.Request: http://python.net/crew/mwh/apidocs/twisted.web.http.Request.html

   It must return a str with the web page to render, typically containing HTML
   code.

Example web console extension
=============================

Here's an example of a simple web console extension that just displays a "Hello
world!" text::

    from scrapy.xlib.pydispatch import dispatcher
    from scrapy.management.web import webconsole_discover_module

    class HelloWorldConsole(object):
        webconsole_id = 'helloworld'
        webconsole_name = 'Hello world'

        def __init__(self):
            dispatcher.connect(self.webconsole_discover_module, signal=webconsole_discover_module)

        def webconsole_discover_module(self):
            return self

        def webconsole_render(self, wc_request):
            return "<html><head></head><body><h1>Hello world!</h1></body>"

If you start Scrapy with the web console enabled on http://localhost:8000/ and
you access the URL:

    http://localhost:8000/helloworld/

You will see a page containing a big "Hello World!" text.

.. _topics-webconsole-extensions-ref:

Available Web console extensions
--------------------------------

.. module:: scrapy.contrib.webconsole
   :synopsis: Contains most built-in web console extensions

Here is a list of built-in web console extensions.

Scheduler queue extension
~~~~~~~~~~~~~~~~~~~~~~~~~

.. module:: scrapy.contrib.webconsole.scheduler
   :synopsis: Scheduler queue web console extension

.. class:: scrapy.contrib.webconsole.scheduler.SchedulerQueue

Display a list of all pending Requests in the Scheduler queue, grouped by
domain/spider.

Spider live stats extension
~~~~~~~~~~~~~~~~~~~~~~~~~~~

.. module:: scrapy.contrib.webconsole.livestats
   :synopsis: Spider live stats web console extension

.. class:: scrapy.contrib.webconsole.livestats.LiveStats

Display a table with stats of all spider crawled by the current Scrapy run,
including:

* Number of items scraped
* Number of pages crawled
* Number of pending requests in the scheduler
* Number of pending requests in the downloader queue
* Number of requests currently being downloaded

Engine status extension
~~~~~~~~~~~~~~~~~~~~~~~

.. module:: scrapy.contrib.webconsole.enginestatus
   :synopsis: Engine status web console extension

.. class:: scrapy.contrib.webconsole.enginestatus.EngineStatus

Display the current status of the Scrapy Engine, which is just the output of
the Scrapy engine ``getstatus()`` method.

Stats collector dump extension
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

.. module:: scrapy.contrib.webconsole.stats
   :synopsis: Stats dump web console extension

.. class:: scrapy.contrib.webconsole.stats.StatsDump

Display the stats collected so far by the stats collector.
docs/topics/webservice.rst (new file, 236 lines)
@@ -0,0 +1,236 @@
.. _topics-webservice:

===========
Web Service
===========

Scrapy comes with a built-in web service for monitoring and controlling a
running crawler. The service exposes most resources using the `JSON-RPC 2.0`_
protocol, but there are also other (read-only) resources which just output JSON
data.

It provides an extensible web service for managing a Scrapy process. It's
enabled by the :setting:`WEBSERVICE_ENABLED` setting. The web server will
listen on the port specified in :setting:`WEBSERVICE_PORT`, and will log to the
file specified in :setting:`WEBSERVICE_LOGFILE`.

The web service is a :ref:`built-in Scrapy extension <topics-extensions-ref>`
which comes enabled by default, but you can also disable it if you're running
tight on memory.

.. _topics-webservice-resources:

Web service resources
=====================

The web service contains several resources, defined in the
:setting:`WEBSERVICE_RESOURCES` setting. Each resource provides a different
functionality. See :ref:`topics-webservice-resources-ref` for a list of
resources available by default.

Although you can implement your own resources using any protocol, there are
two kinds of resources bundled with Scrapy:

* Simple JSON resources - which are read-only and just output JSON data
* JSON-RPC resources - which provide direct access to certain Scrapy objects
  using the `JSON-RPC 2.0`_ protocol

.. module:: scrapy.contrib.webservice
   :synopsis: Built-in web service resources

.. _topics-webservice-resources-ref:

Available JSON-RPC resources
----------------------------

These are the JSON-RPC resources available by default in Scrapy:

Execution Manager JSON-RPC resource
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

.. module:: scrapy.contrib.webservice.manager
   :synopsis: Execution Manager JSON-RPC resource

.. class:: ManagerResource

Provides access to the Execution Manager that controls the crawler.

Available by default at: http://localhost:6080/manager

Stats Collector JSON-RPC resource
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

.. module:: scrapy.contrib.webservice.stats
   :synopsis: Stats JSON-RPC resource

.. class:: StatsResource

Provides access to the Stats Collector used by the crawler.

Available by default at: http://localhost:6080/stats

Spider Manager JSON-RPC resource
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

.. module:: scrapy.contrib.webservice.spiders
   :synopsis: Spider Manager JSON-RPC resource

.. class:: SpidersResource

Provides access to the Spider Manager used by the crawler.

Available by default at: http://localhost:6080/spiders

Extension Manager JSON-RPC resource
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

.. module:: scrapy.contrib.webservice.extensions
   :synopsis: Extension Manager JSON-RPC resource

.. class:: ExtensionsResource

Provides access to the Extension Manager used by the crawler.

Available by default at: http://localhost:6080/extensions

Available JSON resources
------------------------

These are the JSON resources available by default:

Engine Status JSON resource
~~~~~~~~~~~~~~~~~~~~~~~~~~~

.. module:: scrapy.contrib.webservice.enginestatus
   :synopsis: Engine Status JSON resource

.. class:: EngineStatusResource

Provides access to the status of the crawling engine.

Available by default at: http://localhost:6080/enginestatus

Web service settings
====================

These are the settings that control the web service behaviour:

.. setting:: WEBSERVICE_ENABLED

WEBSERVICE_ENABLED
------------------

Default: ``True``

A boolean which specifies if the web service will be enabled (provided its
extension is also enabled).

.. setting:: WEBSERVICE_LOGFILE

WEBSERVICE_LOGFILE
------------------

Default: ``None``

A file to use for logging HTTP requests made to the web service. If unset, the
log is sent to the standard Scrapy log.

.. setting:: WEBSERVICE_PORT

WEBSERVICE_PORT
---------------

Default: ``6080``

The port to use for the web service. If set to ``None`` or ``0``, a dynamically
assigned port is used.

WEBSERVICE_RESOURCES
--------------------

Default: ``{}``

The list of web service resources enabled for your project. See
:ref:`topics-webservice-resources`. These are added to the ones available by
default in Scrapy, defined in the :setting:`WEBSERVICE_RESOURCES_BASE` setting.

WEBSERVICE_RESOURCES_BASE
-------------------------

Default::

    {
        'scrapy.contrib.webservice.manager.ManagerResource': 1,
        'scrapy.contrib.webservice.enginestatus.EngineStatusResource': 1,
        'scrapy.contrib.webservice.extensions.ExtensionsResource': 1,
        'scrapy.contrib.webservice.spiders.SpidersResource': 1,
        'scrapy.contrib.webservice.stats.StatsResource': 1,
    }

The list of web service resources available by default in Scrapy. You shouldn't
change this setting in your project; change :setting:`WEBSERVICE_RESOURCES`
instead. If you want to disable some resource, set its value to ``None`` in
:setting:`WEBSERVICE_RESOURCES`.

Writing a web service resource
==============================

Web service resources are implemented using the Twisted Web API. See this
`Twisted Web guide`_ for more information on Twisted web and Twisted web
resources.

To write a web service resource you should subclass the :class:`JsonResource` or
:class:`JsonRpcResource` classes and implement the :meth:`render_GET` method.

.. class:: scrapy.webservice.JsonResource

   A subclass of `twisted.web.resource.Resource`_ that implements a JSON web
   service resource.

   .. attribute:: ws_name

      The name by which the Scrapy web service will know this resource, and
      also the path where this resource will listen. For example, assuming the
      Scrapy web service is listening on http://localhost:6080/ and the
      ``ws_name`` is ``'resource1'``, the URL for that resource will be:

          http://localhost:6080/resource1/

.. class:: scrapy.webservice.JsonRpcResource(target=None)

   This is a subclass of :class:`JsonResource` for implementing JSON-RPC
   resources. JSON-RPC resources wrap Python (Scrapy) objects around a
   JSON-RPC API. The resource wrapped must be returned by the
   :meth:`get_target` method, which returns the target passed in the
   constructor by default.

   .. method:: get_target()

      Return the object wrapped by this JSON-RPC resource. By default, it
      returns the object passed on the constructor.

Examples of web service resources
=================================

StatsResource (JSON-RPC resource)
---------------------------------

.. literalinclude:: ../../scrapy/contrib/webservice/stats.py

EngineStatusResource (JSON resource)
------------------------------------

.. literalinclude:: ../../scrapy/contrib/webservice/enginestatus.py

Example of web service client
=============================

scrapy-ws.py script
-------------------

.. literalinclude:: ../../bin/scrapy-ws.py

.. _Twisted Web guide: http://jcalderone.livejournal.com/50562.html
.. _JSON-RPC 2.0: http://www.jsonrpc.org/
.. _twisted.web.resource.Resource: http://twistedmatrix.com/documents/10.0.0/api/twisted.web.resource.Resource.html
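As a complement to the reference above, here is a minimal sketch of a custom read-only JSON resource (illustrative only, not part of this commit; the resource name and the value it exposes are assumptions):

    from scrapy.webservice import JsonResource
    from scrapy.conf import settings

    class BotNameResource(JsonResource):
        """Hypothetical resource that exposes the bot name as JSON."""

        ws_name = 'botname'   # would be served at http://localhost:6080/botname

        def render_GET(self, txrequest):
            # JsonResource.render() JSON-encodes whatever this returns
            return {'bot_name': settings['BOT_NAME']}

To enable such a resource, its dotted path would be added to :setting:`WEBSERVICE_RESOURCES` in the project settings.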
@@ -109,13 +109,8 @@ EXTENSIONS = {}

 EXTENSIONS_BASE = {
     'scrapy.contrib.corestats.CoreStats': 0,
-    'scrapy.management.web.WebConsole': 0,
+    'scrapy.webservice.WebService': 0,
     'scrapy.telnet.TelnetConsole': 0,
-    'scrapy.contrib.webconsole.scheduler.SchedulerQueue': 0,
-    'scrapy.contrib.webconsole.livestats.LiveStats': 0,
-    'scrapy.contrib.webconsole.spiderctl.Spiderctl': 0,
-    'scrapy.contrib.webconsole.enginestatus.EngineStatus': 0,
-    'scrapy.contrib.webconsole.stats.StatsDump': 0,
     'scrapy.contrib.memusage.MemoryUsage': 0,
     'scrapy.contrib.memdebug.MemoryDebugger': 0,
     'scrapy.contrib.closespider.CloseSpider': 0,
@@ -232,3 +227,14 @@ WEBCONSOLE_ENABLED = True
 WEBCONSOLE_PORT = 6080
 WEBCONSOLE_LOGFILE = None

+WEBSERVICE_ENABLED = True
+WEBSERVICE_LOGFILE = None
+WEBSERVICE_PORT = 6080
+WEBSERVICE_RESOURCES = {}
+WEBSERVICE_RESOURCES_BASE = {
+    'scrapy.contrib.webservice.manager.ManagerResource': 1,
+    'scrapy.contrib.webservice.enginestatus.EngineStatusResource': 1,
+    'scrapy.contrib.webservice.extensions.ExtensionsResource': 1,
+    'scrapy.contrib.webservice.spiders.SpidersResource': 1,
+    'scrapy.contrib.webservice.stats.StatsResource': 1,
+}
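For illustration (not part of this commit), a project could enable an extra resource on top of these defaults from its own settings module; the dotted path below is hypothetical:

    # settings.py of a hypothetical project: add a custom resource on top of
    # the defaults defined in WEBSERVICE_RESOURCES_BASE.
    WEBSERVICE_RESOURCES = {
        'myproject.webservice.BotNameResource': 1,
    }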
@@ -0,0 +1,3 @@
import warnings
warnings.warn("Web console is deprecated. Consider using web service instead.", \
    DeprecationWarning, stacklevel=2)

scrapy/contrib/webservice/__init__.py (new empty file, 0 lines)

scrapy/contrib/webservice/enginestatus.py (new file, 24 lines)
@@ -0,0 +1,24 @@
from scrapy.webservice import JsonResource
from scrapy.core.manager import scrapymanager
from scrapy.utils.engine import get_engine_status

class EngineStatusResource(JsonResource):

    ws_name = 'enginestatus'

    def __init__(self, spider_name=None, _manager=scrapymanager):
        JsonResource.__init__(self)
        self._spider_name = spider_name
        self.isLeaf = spider_name is not None
        self._manager = _manager

    def render_GET(self, txrequest):
        status = get_engine_status(self._manager.engine)
        if self._spider_name is None:
            return status
        for sp, st in status['spiders'].items():
            if sp.name == self._spider_name:
                return st

    def getChild(self, name, txrequest):
        return EngineStatusResource(name, self._manager)

scrapy/contrib/webservice/extensions.py (new file, 10 lines)
@@ -0,0 +1,10 @@
from scrapy.webservice import JsonRpcResource
from scrapy.extension import extensions

class ExtensionsResource(JsonRpcResource):

    ws_name = 'extensions'

    def __init__(self, _extensions=extensions):
        JsonRpcResource.__init__(self)
        self._target = _extensions

scrapy/contrib/webservice/manager.py (new file, 10 lines)
@@ -0,0 +1,10 @@
from scrapy.webservice import JsonRpcResource
from scrapy.core.manager import scrapymanager

class ManagerResource(JsonRpcResource):

    ws_name = 'manager'

    def __init__(self, _manager=scrapymanager):
        JsonRpcResource.__init__(self)
        self._target = _manager

scrapy/contrib/webservice/spiders.py (new file, 10 lines)
@@ -0,0 +1,10 @@
from scrapy.webservice import JsonRpcResource
from scrapy.spider import spiders

class SpidersResource(JsonRpcResource):

    ws_name = 'spiders'

    def __init__(self, _spiders=spiders):
        JsonRpcResource.__init__(self)
        self._target = _spiders

scrapy/contrib/webservice/stats.py (new file, 10 lines)
@@ -0,0 +1,10 @@
from scrapy.webservice import JsonRpcResource
from scrapy.stats import stats

class StatsResource(JsonRpcResource):

    ws_name = 'stats'

    def __init__(self, _stats=stats):
        JsonRpcResource.__init__(self)
        self._target = _stats
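These resources are thin wrappers: the JSON-RPC URL path maps onto attributes of the wrapped object through ``JsonRpcResource.getChild``. A small sketch of the resulting client-side usage (assuming a locally running crawler with the web service on the default port):

    from scrapy.utils.jsonrpc import jsonrpc_client_call

    # 'extensions/enabled' resolves to the 'enabled' dict of the Extension
    # Manager, so calling its 'keys' method lists the enabled extensions
    # (this mirrors the 'list-extensions' command of bin/scrapy-ws.py).
    for name in jsonrpc_client_call('http://localhost:6080/extensions/enabled', 'keys'):
        print name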
@@ -1,8 +1,6 @@
-"""
-Scrapy Web Console extension
-
-See docs/topics/webconsole.rst
-"""
+import warnings
+warnings.warn("Scrapy web console is deprecated. Consider using web service instead.", \
+    DeprecationWarning, stacklevel=2)

 import re
 import socket
@@ -77,6 +77,6 @@ class BaseSpider(object_ref):
         raise NotImplementedError

     def __str__(self):
-        return "<%s %r>" % (type(self).__name__, self.name)
+        return "<%s %r at 0x%0x>" % (type(self).__name__, self.name, id(self))

     __repr__ = __str__
scrapy/tests/test_utils_jsonrpc.py (new file, 112 lines)
@@ -0,0 +1,112 @@
import unittest
from cStringIO import StringIO

from scrapy.utils.jsonrpc import jsonrpc_client_call, jsonrpc_server_call, \
    JsonRpcError, jsonrpc_errors
from scrapy.utils.py26 import json

class urllib_stub(object):
    def __init__(self, result=None, error=None):
        response = {}
        if result:
            response.update(result=result)
        if error:
            response.update(error=error)
        self.response = json.dumps(response)
        self.request = None

    def urlopen(self, url, request):
        self.url = url
        self.request = request
        return StringIO(self.response)

class TestTarget(object):

    def call(self, *args, **kwargs):
        return list(args), kwargs

    def exception(self):
        raise Exception("testing-errors")

class JsonRpcUtilsTestCase(unittest.TestCase):

    def test_jsonrpc_client_call_request(self):
        ul = urllib_stub(1)
        jsonrpc_client_call('url', 'test', 'one', 2, _urllib=ul)
        req = json.loads(ul.request)
        assert 'id' in req
        self.assertEqual(ul.url, 'url')
        self.assertEqual(req['jsonrpc'], '2.0')
        self.assertEqual(req['method'], 'test')
        self.assertEqual(req['params'], ['one', 2])

    def test_jsonrpc_client_call_response(self):
        ul = urllib_stub()
        # must return result or error
        self.assertRaises(ValueError, jsonrpc_client_call, 'url', 'test', _urllib=ul)
        ul = urllib_stub(result={'one': 1})
        self.assertEquals(jsonrpc_client_call('url', 'test', _urllib=ul), {'one': 1})
        ul = urllib_stub(error={'code': 123, 'message': 'hello', 'data': 'some data'})

        raised = False
        try:
            jsonrpc_client_call('url', 'test', _urllib=ul)
        except JsonRpcError, e:
            raised = True
            self.assertEqual(e.code, 123)
            self.assertEqual(e.message, 'hello')
            self.assertEqual(e.data, 'some data')
            assert '123' in str(e)
            assert 'hello' in str(e)
        assert raised, "JsonRpcError not raised"

    def test_jsonrpc_server_call(self):
        t = TestTarget()
        r = jsonrpc_server_call(t, 'invalid json data')
        assert 'error' in r
        assert r['jsonrpc'] == '2.0'
        assert r['id'] is None
        self.assertEqual(r['error']['code'], jsonrpc_errors.PARSE_ERROR)
        assert 'Traceback' in r['error']['data']

        r = jsonrpc_server_call(t, '{"test": "test"}')
        assert 'error' in r
        assert r['jsonrpc'] == '2.0'
        assert r['id'] is None
        self.assertEqual(r['error']['code'], jsonrpc_errors.INVALID_REQUEST)

        r = jsonrpc_server_call(t, '{"method": "notfound", "id": 1}')
        assert 'error' in r
        assert r['jsonrpc'] == '2.0'
        assert r['id'] == 1
        self.assertEqual(r['error']['code'], jsonrpc_errors.METHOD_NOT_FOUND)

        r = jsonrpc_server_call(t, '{"method": "exception", "id": 1}')
        assert 'error' in r
        assert r['jsonrpc'] == '2.0'
        assert r['id'] == 1
        self.assertEqual(r['error']['code'], jsonrpc_errors.INTERNAL_ERROR)
        assert 'testing-errors' in r['error']['message']
        assert 'Traceback' in r['error']['data']

        r = jsonrpc_server_call(t, '{"method": "call", "id": 2}')
        assert 'result' in r
        assert r['jsonrpc'] == '2.0'
        assert r['id'] == 2
        self.assertEqual(r['result'], ([], {}))

        r = jsonrpc_server_call(t, '{"method": "call", "params": [456, 123], "id": 3}')
        assert 'result' in r
        assert r['jsonrpc'] == '2.0'
        assert r['id'] == 3
        self.assertEqual(r['result'], ([456, 123], {}))

        r = jsonrpc_server_call(t, '{"method": "call", "params": {"data": 789}, "id": 3}')
        assert 'result' in r
        assert r['jsonrpc'] == '2.0'
        assert r['id'] == 3
        self.assertEqual(r['result'], ([], {'data': 789}))

if __name__ == "__main__":
    unittest.main()
scrapy/tests/test_utils_serialize.py (new file, 125 lines)
@@ -0,0 +1,125 @@
import unittest
import datetime
from decimal import Decimal

from scrapy.utils.serialize import SpiderReferencer, ScrapyJSONEncoder, ScrapyJSONDecoder
from scrapy.utils.py26 import json
from scrapy.spider import BaseSpider
from scrapy.http import Request, Response


class ExecutionEngineStub(object):
    def __init__(self, open_spiders):
        self.open_spiders = open_spiders

class ExecutionMangerStub(object):
    def __init__(self, open_spiders):
        self.engine = ExecutionEngineStub(open_spiders)

class BaseTestCase(unittest.TestCase):

    def setUp(self):
        self.spider1 = BaseSpider('name1')
        self.spider2 = BaseSpider('name2')
        open_spiders = set([self.spider1, self.spider2])
        manager = ExecutionMangerStub(open_spiders)
        self.spref = SpiderReferencer(manager)
        self.encoder = ScrapyJSONEncoder(spref=self.spref)
        self.decoder = ScrapyJSONDecoder(spref=self.spref)

class SpiderReferencerTestCase(BaseTestCase):

    def test_spiders_and_references(self):
        ref1 = self.spref.get_reference_from_spider(self.spider1)
        assert isinstance(ref1, str)
        assert self.spider1.name in ref1
        ref2 = self.spref.get_reference_from_spider(self.spider2)
        ref1_ = self.spref.get_reference_from_spider(self.spider1)
        assert ref1 == ref1_
        assert ref1 != ref2

        sp1 = self.spref.get_spider_from_reference(ref1)
        sp2 = self.spref.get_spider_from_reference(ref2)
        sp1_ = self.spref.get_spider_from_reference(ref1)
        assert isinstance(sp1, BaseSpider)
        assert sp1 is not sp2
        assert sp1 is sp1_

        # must return string as-is if spider id not found
        assert 'lala' == self.spref.get_spider_from_reference('lala')
        # must raise RuntimeError if spider id is not found and spider is not running
        self.assertRaises(RuntimeError, self.spref.get_spider_from_reference, 'spider:fffffff')

    def test_encode_decode(self):
        sr = self.spref
        sp1 = self.spider1
        sp2 = self.spider2
        ref1 = sr.get_reference_from_spider(sp1)
        ref2 = sr.get_reference_from_spider(sp2)

        examples = [
            ('lala', 'lala'),
            (sp1, ref1),
            (['lala', sp1], ['lala', ref1]),
            ({'lala': sp1}, {'lala': ref1}),
            ({sp1: sp2}, {ref1: ref2}),
            ({sp1: {sp2: ['lala', sp1]}}, {ref1: {ref2: ['lala', ref1]}})
        ]
        for spiders, refs in examples:
            self.assertEqual(sr.encode_references(spiders), refs)
            self.assertEqual(sr.decode_references(refs), spiders)

class JsonEncoderTestCase(BaseTestCase):

    def test_encode_decode(self):
        sr = self.spref
        sp1 = self.spider1
        sp2 = self.spider2
        ref1 = sr.get_reference_from_spider(sp1)
        ref2 = sr.get_reference_from_spider(sp2)
        dt = datetime.datetime(2010, 1, 2, 10, 11, 12)
        dts = "2010-01-02 10:11:12"
        d = datetime.date(2010, 1, 2)
        ds = "2010-01-02"
        t = datetime.time(10, 11, 12)
        ts = "10:11:12"
        dec = Decimal("1000.12")
        decs = "1000.12"

        examples_encode_decode = [
            ('lala', 'lala'),
            (sp1, ref1),
            (['lala', sp1], ['lala', ref1]),
            ({'lala': sp1}, {'lala': ref1}),
            ({sp1: sp2}, {ref1: ref2}),
            ({sp1: {sp2: ['lala', sp1]}}, {ref1: {ref2: ['lala', ref1]}})
        ]
        for spiders, refs in examples_encode_decode:
            self.assertEqual(self.encoder.encode(spiders), json.dumps(refs))
            self.assertEqual(self.decoder.decode(json.dumps(refs)), spiders)

        examples_encode_only = [
            ({sp1: dt}, {ref1: dts}),
            ({sp1: d}, {ref1: ds}),
            ({sp1: t}, {ref1: ts}),
            ({sp1: dec}, {ref1: decs}),
        ]
        for spiders, refs in examples_encode_only:
            self.assertEqual(self.encoder.encode(spiders), json.dumps(refs))

    def test_encode_request(self):
        r = Request("http://www.example.com/lala")
        rs = self.encoder.encode(r)
        assert r.method in rs
        assert r.url in rs

    def test_encode_response(self):
        r = Response("http://www.example.com/lala")
        rs = self.encoder.encode(r)
        assert r.url in rs
        assert str(r.status) in rs


if __name__ == "__main__":
    unittest.main()
@@ -37,23 +37,34 @@ def get_engine_status(engine=None):
         "engine.scraper.sites[spider].needs_backout()",
     ]

-    s = "Execution engine status\n\n"
+    status = {'global': {}, 'spiders': {}}
     for test in global_tests:
         try:
-            s += "%-47s : %s\n" % (test, eval(test))
+            status['global'][test] = eval(test)
         except Exception, e:
-            s += "%-47s : %s (exception)\n" % (test, type(e).__name__)
-    s += "\n"
+            status['global'][test] = "%s (exception)" % type(e).__name__
     for spider in engine.downloader.sites:
-        s += "Spider: %s\n" % spider
+        x = {}
         for test in spider_tests:
             try:
-                s += "  %-50s : %s\n" % (test, eval(test))
+                x[test] = eval(test)
             except Exception, e:
-                s += "  %-50s : %s (exception)\n" % (test, type(e).__name__)
+                x[test] = "%s (exception)" % type(e).__name__
+        status['spiders'][spider] = x
+    return status
+
+def format_engine_status(engine=None):
+    status = get_engine_status(engine)
+    s = "Execution engine status\n\n"
+    for test, result in status['global'].items():
+        s += "%-47s : %s\n" % (test, result)
+    s += "\n"
+    for spider, tests in status['spiders'].items():
+        s += "Spider: %s\n" % spider
+        for test, result in tests.items():
+            s += "  %-50s : %s\n" % (test, result)
     return s

 def print_engine_status(engine=None):
-    print get_engine_status(engine)
+    print format_engine_status(engine)

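In short, status collection is now separated from text formatting. A small sketch of the resulting usage (not part of this commit; it assumes a crawl is in progress so the global manager's engine is populated):

    from scrapy.core.manager import scrapymanager
    from scrapy.utils.engine import get_engine_status, format_engine_status

    status = get_engine_status(scrapymanager.engine)    # {'global': {...}, 'spiders': {...}}
    print format_engine_status(scrapymanager.engine)    # the same data rendered as text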
scrapy/utils/jsonrpc.py (new file, 94 lines)
@@ -0,0 +1,94 @@
"""
This module implements the JSON-RPC 2.0 protocol, as defined in:
http://groups.google.com/group/json-rpc/web/json-rpc-2-0
"""

import urllib
import traceback

from scrapy.utils.py26 import json
from scrapy.utils.serialize import ScrapyJSONDecoder

# JSON-RPC 2.0 errors, as defined in the spec referenced above
class jsonrpc_errors:
    PARSE_ERROR = -32700
    INVALID_REQUEST = -32600
    METHOD_NOT_FOUND = -32601
    INVALID_PARAMS = -32602
    INTERNAL_ERROR = -32603

class JsonRpcError(Exception):

    def __init__(self, code, message, data=None):
        super(JsonRpcError, self).__init__()
        self.code = code
        self.message = message
        self.data = data

    def __str__(self):
        return "JSON-RPC error (code %d): %s" % (self.code, self.message)

def jsonrpc_client_call(url, method, *args, **kwargs):
    """Execute a JSON-RPC call on the given url"""
    _urllib = kwargs.pop('_urllib', urllib)
    req = {'jsonrpc': '2.0', 'method': method, 'params': args or kwargs, 'id': 1}
    res = json.loads(_urllib.urlopen(url, json.dumps(req)).read())
    if 'result' in res:
        return res['result']
    elif 'error' in res:
        er = res['error']
        raise JsonRpcError(er['code'], er['message'], er['data'])
    else:
        msg = "JSON-RPC response must contain 'result' or 'error': %s" % res
        raise ValueError(msg)

def jsonrpc_server_call(target, jsonrpc_request, json_decoder=None):
    """Execute the given JSON-RPC request (as JSON-encoded string) on the given
    target object and return the JSON-RPC response, as a dict
    """
    if json_decoder is None:
        json_decoder = ScrapyJSONDecoder()

    try:
        req = json_decoder.decode(jsonrpc_request)
    except Exception, e:
        return jsonrpc_error(None, jsonrpc_errors.PARSE_ERROR, 'Parse error', \
            traceback.format_exc())

    try:
        id, methname = req['id'], req['method']
    except KeyError:
        return jsonrpc_error(None, jsonrpc_errors.INVALID_REQUEST, 'Invalid Request')

    try:
        method = getattr(target, methname)
    except AttributeError:
        return jsonrpc_error(id, jsonrpc_errors.METHOD_NOT_FOUND, 'Method not found')

    params = req.get('params', [])
    a, kw = ([], params) if isinstance(params, dict) else (params, {})
    try:
        return jsonrpc_result(id, method(*a, **kw))
    except Exception, e:
        return jsonrpc_error(id, jsonrpc_errors.INTERNAL_ERROR, str(e), \
            traceback.format_exc())

def jsonrpc_error(id, code, message, data=None):
    """Create JSON-RPC error response"""
    return {
        'jsonrpc': '2.0',
        'error': {
            'code': code,
            'message': message,
            'data': data,
        },
        'id': id,
    }

def jsonrpc_result(id, result):
    """Create JSON-RPC result response"""
    return {
        'jsonrpc': '2.0',
        'result': result,
        'id': id,
    }
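A minimal sketch of the server-side helper in isolation (illustrative, not part of this commit; the target class below is an arbitrary Python object, not a Scrapy one):

    from scrapy.utils.jsonrpc import jsonrpc_server_call

    class Adder(object):        # hypothetical target object
        def add(self, a, b):
            return a + b

    # dispatch the JSON-RPC request string against the target object
    response = jsonrpc_server_call(Adder(), '{"method": "add", "params": [2, 3], "id": 1}')
    # response == {'jsonrpc': '2.0', 'result': 5, 'id': 1}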
scrapy/utils/serialize.py (new file, 115 lines)
@@ -0,0 +1,115 @@
import re
import datetime
import decimal

from scrapy.core.manager import scrapymanager
from scrapy.spider import BaseSpider
from scrapy.http import Request, Response
from scrapy.utils.py26 import json


class SpiderReferencer(object):
    """Class to serialize (and deserialize) objects (typically dicts)
    containing references to running spiders (ie. Spider objects). This is
    required because simplejson fails to serialize dicts containing
    non-primitive types as keys, even when you override
    ScrapyJSONEncoder.default() with a custom encoding mechanism.
    """

    spider_ref_re = re.compile('^spider:([0-9a-f]+)(:.*)?$')

    def __init__(self, manager=None):
        self.manager = manager or scrapymanager

    def get_reference_from_spider(self, spider):
        return 'spider:%x:%s' % (id(spider), spider.name)

    def get_spider_from_reference(self, ref):
        """Returns the Spider referenced by text, if text is a spider
        reference. Otherwise it returns the text itself. If the text references
        a non-running spider it raises a RuntimeError.
        """
        m = self.spider_ref_re.search(ref)
        if m:
            spid = int(m.group(1), 16)
            for spider in self.manager.engine.open_spiders:
                if id(spider) == spid:
                    return spider
            raise RuntimeError("Spider not running: %s" % ref)
        return ref

    def encode_references(self, obj):
        """Look for Spider objects and replace them with spider references"""
        if isinstance(obj, BaseSpider):
            return self.get_reference_from_spider(obj)
        elif isinstance(obj, dict):
            d = {}
            for k, v in obj.items():
                k = self.encode_references(k)
                v = self.encode_references(v)
                d[k] = v
            return d
        elif isinstance(obj, (list, tuple)):
            return [self.encode_references(x) for x in obj]
        else:
            return obj

    def decode_references(self, obj):
        """Look for spider references and replace them with Spider objects"""
        if isinstance(obj, basestring):
            return self.get_spider_from_reference(obj)
        elif isinstance(obj, dict):
            d = {}
            for k, v in obj.items():
                k = self.decode_references(k)
                v = self.decode_references(v)
                d[k] = v
            return d
        elif isinstance(obj, (list, tuple)):
            return [self.decode_references(x) for x in obj]
        else:
            return obj


class ScrapyJSONEncoder(json.JSONEncoder):

    DATE_FORMAT = "%Y-%m-%d"
    TIME_FORMAT = "%H:%M:%S"

    def __init__(self, *a, **kw):
        self.spref = kw.pop('spref', None) or SpiderReferencer()
        super(ScrapyJSONEncoder, self).__init__(*a, **kw)

    def encode(self, o):
        if self.spref:
            o = self.spref.encode_references(o)
        return super(ScrapyJSONEncoder, self).encode(o)

    def default(self, o):
        if isinstance(o, datetime.datetime):
            return o.strftime("%s %s" % (self.DATE_FORMAT, self.TIME_FORMAT))
        elif isinstance(o, datetime.date):
            return o.strftime(self.DATE_FORMAT)
        elif isinstance(o, datetime.time):
            return o.strftime(self.TIME_FORMAT)
        elif isinstance(o, decimal.Decimal):
            return str(o)
        elif isinstance(o, Request):
            return "<%s %s %s>" % (type(o).__name__, o.method, o.url)
        elif isinstance(o, Response):
            return "<%s %s %s>" % (type(o).__name__, o.status, o.url)
        else:
            return super(ScrapyJSONEncoder, self).default(o)


class ScrapyJSONDecoder(json.JSONDecoder):

    def __init__(self, *a, **kw):
        self.spref = kw.pop('spref', None) or SpiderReferencer()
        super(ScrapyJSONDecoder, self).__init__(*a, **kw)

    def decode(self, s):
        o = super(ScrapyJSONDecoder, self).decode(s)
        if self.spref:
            o = self.spref.decode_references(o)
        return o
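A short sketch of what the encoder adds on top of the stock JSON encoder (illustrative, not part of this commit): dates, times, decimals, requests and responses become strings.

    import datetime
    from scrapy.http import Request
    from scrapy.utils.serialize import ScrapyJSONEncoder

    encoder = ScrapyJSONEncoder()
    print encoder.encode({'when': datetime.date(2010, 1, 2)})
    # {"when": "2010-01-02"}
    print encoder.encode(Request('http://www.example.com/'))
    # "<Request GET http://www.example.com/>"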
scrapy/webservice.py (new file, 86 lines)
@@ -0,0 +1,86 @@
"""
Scrapy web services extension

See docs/topics/webservice.rst
"""

from twisted.internet import reactor
from twisted.web import server, resource, error

from scrapy.core.exceptions import NotConfigured
from scrapy.utils.jsonrpc import jsonrpc_server_call
from scrapy.utils.serialize import ScrapyJSONEncoder, ScrapyJSONDecoder
from scrapy.utils.misc import load_object
from scrapy.utils.conf import build_component_list
from scrapy.conf import settings


class JsonResource(resource.Resource):

    ws_name = None
    json_encoder = ScrapyJSONEncoder()

    def render(self, txrequest):
        r = resource.Resource.render(self, txrequest)
        r = self.json_encoder.encode(r)
        txrequest.setHeader('Content-Type', 'application/json')
        txrequest.setHeader('Content-Length', len(r))
        return r


class JsonRpcResource(JsonResource):

    json_decoder = ScrapyJSONDecoder()

    def __init__(self, target=None):
        JsonResource.__init__(self)
        self._target = target

    def render_GET(self, txrequest):
        return self.get_target()

    def render_POST(self, txrequest):
        reqstr = txrequest.content.read()
        target = self.get_target()
        return jsonrpc_server_call(target, reqstr, self.json_decoder)

    def getChild(self, name, txrequest):
        target = self.get_target()
        try:
            newtarget = getattr(target, name)
            return JsonRpcResource(newtarget)
        except AttributeError:
            return error.NoResource("No such child resource.")

    def get_target(self):
        return self._target


class RootResource(JsonResource):

    def render_GET(self, txrequest):
        return {'resources': self.children.keys()}

    def getChild(self, name, txrequest):
        if name == '':
            return self
        return JsonResource.getChild(self, name, txrequest)


class WebService(server.Site):

    def __init__(self):
        if not settings.getbool('WEBSERVICE_ENABLED'):
            raise NotConfigured
        logfile = settings['WEBSERVICE_LOGFILE']
        port = settings.getint('WEBSERVICE_PORT')
        root = RootResource()
        reslist = build_component_list(settings['WEBSERVICE_RESOURCES_BASE'], \
            settings['WEBSERVICE_RESOURCES'])
        for res_cls in map(load_object, reslist):
            res = res_cls()
            root.putChild(res.ws_name, res)
        server.Site.__init__(self, root, logPath=logfile)
        self.noisy = False
        reactor.callWhenRunning(reactor.listenTCP, port, self)