
* Added Scrapy Web Service with documentation and tests.

* Marked Web Console as deprecated.
* Removed Web Console documentation to discourage its use.
Pablo Hoffman 2010-06-09 13:46:22 -03:00
parent 2499dfee5e
commit 6a33d6c4d0
23 changed files with 1007 additions and 232 deletions

bin/scrapy-ws.py Executable file

@ -0,0 +1,114 @@
#!/usr/bin/env python
"""
Example script to control and monitor Scrapy using its web service. It only
provides reduced functionality, as its main purpose is to illustrate how to
write a web service client. Feel free to improve it or write your own.
"""
import sys, optparse, urllib
from urlparse import urljoin
from scrapy.utils.jsonrpc import jsonrpc_client_call, JsonRpcError
from scrapy.utils.py26 import json
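# A few illustrative invocations (assuming the web service is running on the
# default port, 6080, and "somespider" stands for one of your spider names):
#
#   scrapy-ws.py list-available
#   scrapy-ws.py run somespider
#   scrapy-ws.py get-spider-stats somespider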
def get_commands():
return {
'help': cmd_help,
'run': cmd_run,
'list-available': cmd_list_available,
'list-running': cmd_list_running,
'list-resources': cmd_list_resources,
'list-extensions': cmd_list_extensions,
'get-global-stats': cmd_get_global_stats,
'get-spider-stats': cmd_get_spider_stats,
}
def cmd_help(args, opts):
"""help - list available commands"""
print "Available commands:"
for _, func in sorted(get_commands().items()):
print " ", func.__doc__
def cmd_run(args, opts):
"""run <spider_name> - schedule spider for running"""
jsonrpc_call(opts, 'manager/queue', 'append_spider_name', args[0])
def cmd_list_running(args, opts):
"""list-running - list running spiders"""
for x in json_get(opts, 'manager/engine/open_spiders'):
print x
def cmd_list_available(args, opts):
"""list-available - list name of available spiders"""
for x in jsonrpc_call(opts, 'spiders', 'list'):
print x
def cmd_list_resources(args, opts):
"""list-resources - list available web service resources"""
for x in json_get(opts, '')['resources']:
print x
def cmd_list_extensions(args, opts):
"""list-extensions - list enabled extensions"""
for x in jsonrpc_call(opts, 'extensions/enabled', 'keys'):
print x
def cmd_get_spider_stats(args, opts):
"""get-spider-stats <spider> - get stats of a running spider"""
stats = jsonrpc_call(opts, 'stats', 'get_stats', args[0])
for name, value in stats.items():
print "%-40s %s" % (name, value)
def cmd_get_global_stats(args, opts):
"""get-global-stats - get global stats"""
stats = jsonrpc_call(opts, 'stats', 'get_stats')
for name, value in stats.items():
print "%-40s %s" % (name, value)
def get_wsurl(opts, path):
return urljoin("http://%s:%s/"% (opts.host, opts.port), path)
def jsonrpc_call(opts, path, method, *args, **kwargs):
url = get_wsurl(opts, path)
return jsonrpc_client_call(url, method, *args, **kwargs)
def json_get(opts, path):
url = get_wsurl(opts, path)
return json.loads(urllib.urlopen(url).read())
def parse_opts():
usage = "%prog [options] <command> [arg] ..."
description = "Scrapy web service control script. Use '%prog help' " \
"to see the list of available commands."
op = optparse.OptionParser(usage=usage, description=description)
op.add_option("-H", dest="host", default="localhost", \
help="Scrapy host to connect to")
op.add_option("-P", dest="port", type="int", default=6080, \
help="Scrapy port to connect to")
opts, args = op.parse_args()
if not args:
op.print_help()
sys.exit(2)
cmdname, cmdargs, opts = args[0], args[1:], opts
commands = get_commands()
if cmdname not in commands:
sys.stderr.write("Unknown command: %s\n\n" % cmdname)
cmd_help(None, None)
sys.exit(1)
return commands[cmdname], cmdargs, opts
def main():
cmd, args, opts = parse_opts()
try:
cmd(args, opts)
except IndexError:
print cmd.__doc__
except JsonRpcError, e:
print str(e)
if e.data:
print "Server Traceback below:"
print e.data
if __name__ == '__main__':
main()


@ -87,7 +87,7 @@ Built-in services
   topics/stats
   topics/email
   topics/telnetconsole
-  topics/webconsole
+  topics/webservice

:doc:`topics/logging`
    Understand the simple logging facility provided by Scrapy.

@ -101,8 +101,8 @@ Built-in services
:doc:`topics/telnetconsole`
    Inspect a running crawler using a built-in Python console.

-:doc:`topics/webconsole`
-    Monitor and control a crawler using a web interface.
+:doc:`topics/webservice`
+    Monitor and control a crawler using a web service.

Solving specific problems


@ -190,7 +190,7 @@ scraping easy and efficient, such as:
* An :ref:`Interactive scraping shell console <topics-shell>`, very useful for
  writing and debugging your spiders
-* A :ref:`Web management console <topics-webconsole>` for monitoring and
+* A builtin :ref:`Web service <topics-webservice>` for monitoring and
  controlling your bot
* A :ref:`Telnet console <topics-telnetconsole>` for full unrestricted access


@ -36,10 +36,8 @@ by a string: the full Python path to the extension's class name. For example::
EXTENSIONS = {
    'scrapy.contrib.corestats.CoreStats': 500,
-   'scrapy.management.web.WebConsole': 500,
-   'scrapy.management.telnet.TelnetConsole': 500,
-   'scrapy.contrib.webconsole.enginestatus.EngineStatus': 500,
-   'scrapy.contrib.webconsole.stats.StatsDump': 500,
+   'scrapy.webservice.WebService': 500,
+   'scrapy.telnet.TelnetConsole': 500,
}
@ -71,10 +69,10 @@ Accessing enabled extensions
Even though it's not usually needed, you can access extension objects through
the :ref:`topics-extensions-ref-manager` which is populated when extensions are
-loaded. For example, to access the ``WebConsole`` extension::
+loaded. For example, to access the ``WebService`` extension::

    from scrapy.extension import extensions
-   webconsole_extension = extensions.enabled['WebConsole']
+   webservice_extension = extensions.enabled['WebService']

.. see also::
@ -146,7 +144,7 @@ how you :ref:`configure the downloader middlewares
    >>> extensions.load()
    >>> print extensions.enabled
    {'CoreStats': <scrapy.contrib.corestats.CoreStats object at 0x9e272ac>,
-    'WebConsole': <scrapy.management.telnet.TelnetConsole instance at 0xa05670c>,
+    'WebService': <scrapy.management.telnet.TelnetConsole instance at 0xa05670c>,
     ...

.. attribute:: disabled
@ -158,7 +156,7 @@ how you :ref:`configure the downloader middlewares
    >>> from scrapy.extension import extensions
    >>> extensions.load()
    >>> print extensions.disabled
-    {'MemoryDebugger': 'scrapy.contrib.webconsole.stats.MemoryDebugger',
+    {'MemoryDebugger': 'scrapy.contrib.memdebug.MemoryDebugger',
     'MyExtension': 'myproject.extensions.MyExtension',
     ...
@ -193,44 +191,34 @@ Core Stats extension
Enable the collection of core statistics, provided the stats collection is
enabled (see :ref:`topics-stats`).

-.. _topics-extensions-ref-webconsole:
+.. _topics-extensions-ref-webservice:

-Web console extension
+Web service extension
~~~~~~~~~~~~~~~~~~~~~

-.. module:: scrapy.management.web
-   :synopsis: Web management console
+.. module:: scrapy.webservice
+   :synopsis: Web service

-.. class:: scrapy.management.web.WebConsole
+.. class:: scrapy.webservice.WebService

-Provides an extensible web server for managing a Scrapy process. It's enabled
-by the :setting:`WEBCONSOLE_ENABLED` setting. The server will listen in the
-port specified in :setting:`WEBCONSOLE_PORT`, and will log to the file
-specified in :setting:`WEBCONSOLE_LOGFILE`.
-The web server is designed to be extended by other extensions which can add
-their own management web interfaces.
-See also :ref:`topics-webconsole` for information on how to write your own web
-console extension, and :ref:`topics-webconsole-extensions-ref` for a list of
-available built-in (web console) extensions.
+See `topics-webservice`.

.. _topics-extensions-ref-telnetconsole:

Telnet console extension
~~~~~~~~~~~~~~~~~~~~~~~~

-.. module:: scrapy.management.telnet
-   :synopsis: Telnet management console
+.. module:: scrapy.telnet
+   :synopsis: Telnet console

-.. class:: scrapy.management.telnet.TelnetConsole
+.. class:: scrapy.telnet.TelnetConsole

Provides a telnet console for getting into a Python interpreter inside the
currently running Scrapy process, which can be very useful for debugging.

The telnet console must be enabled by the :setting:`TELNETCONSOLE_ENABLED`
setting, and the server will listen in the port specified in
-:setting:`WEBCONSOLE_PORT`.
+:setting:`TELNETCONSOLE_PORT`.

.. _topics-extensions-ref-memusage:


@ -519,13 +519,8 @@ Default::
{
    'scrapy.contrib.corestats.CoreStats': 0,
-   'scrapy.management.web.WebConsole': 0,
-   'scrapy.management.telnet.TelnetConsole': 0,
-   'scrapy.contrib.webconsole.scheduler.SchedulerQueue': 0,
-   'scrapy.contrib.webconsole.livestats.LiveStats': 0,
-   'scrapy.contrib.webconsole.spiderctl.Spiderctl': 0,
-   'scrapy.contrib.webconsole.enginestatus.EngineStatus': 0,
-   'scrapy.contrib.webconsole.stats.StatsDump': 0,
+   'scrapy.webservice.WebService': 0,
+   'scrapy.telnet.TelnetConsole': 0,
    'scrapy.contrib.memusage.MemoryUsage': 0,
    'scrapy.contrib.memdebug.MemoryDebugger': 0,
    'scrapy.contrib.closedomain.CloseDomain': 0,
@ -1054,33 +1049,3 @@ Default: ``"%s/%s" % (BOT_NAME, BOT_VERSION)``
The default User-Agent to use when crawling, unless overrided.

-.. setting:: WEBCONSOLE_ENABLED
-
-WEBCONSOLE_ENABLED
-------------------
-
-Default: True
-
-A boolean which specifies if the web management console will be enabled
-(provided its extension is also enabled).
-
-.. setting:: WEBCONSOLE_LOGFILE
-
-WEBCONSOLE_LOGFILE
-------------------
-
-Default: ``None``
-
-A file to use for logging HTTP requests made to the web console. If unset web
-the log is sent to standard scrapy log.
-
-.. setting:: WEBCONSOLE_PORT
-
-WEBCONSOLE_PORT
----------------
-
-Default: ``6080``
-
-The port to use for the web console. If set to ``None`` or ``0``, a dynamically
-assigned port is used. For more info see :ref:`topics-webconsole`.


@ -1,142 +0,0 @@
.. _topics-webconsole:
===========
Web Console
===========
Scrapy comes with a built-in web server for monitoring and controlling a Scrapy
running process.
The web console is :ref:`built-in Scrapy extension
<topics-extensions-ref>` which comes enabled by default, but you can also
disable it if you're running tight on memory.
For more information about this extension see
:ref:`topics-extensions-ref-webconsole`.
Writing a web console extension
===============================
Writing a web console extension is similar to writing any other :ref:`Scrapy
extensions <topics-extensions>` except that the extension class must:
1. catch the ``scrapy.management.web.webconsole_discover_module`` signal, and
return itself in the handler.
2. have the following two attributes:
.. attribute:: webconsole_id
The id by which the Scrapy web interface will known this extension, and
also the main dir under which this extension interface will work. For
example, assuming Scrapy web server is listening on
http://localhost:8000/ and the ``webconsole_id='extension1'`` the web
main page for the interface of that extension will be:
http://localhost:8000/extension1/
.. attribute:: webconsole_name
The name by which the Scrapy web server will know that extension. That name
will be displayed in the main web console index, as the text that links to
the extension main page.
3. implement the following method:
.. method:: webconsole_render(wc_request)
``wc_request`` is a `twisted.web.http.Request`_ object with the HTTP request
sent to the web console.
.. _twisted.web.http.Request: http://python.net/crew/mwh/apidocs/twisted.web.http.Request.html
It must return a str with the web page to render, typically containing HTML
code.
Example web console extension
=============================
Here's an example of a simple web console extension that just displays a "Hello
world!" text::
from scrapy.xlib.pydispatch import dispatcher
from scrapy.management.web import webconsole_discover_module
class HelloWorldConsole(object):
webconsole_id = 'helloworld'
webconsole_name = 'Hello world'
def __init__(self):
dispatcher.connect(self.webconsole_discover_module, signal=webconsole_discover_module)
def webconsole_discover_module(self):
return self
def webconsole_render(self, wc_request):
return "<html><head></head><body><h1>Hello world!</h1></body>"
If you start Scrapy with the web console enabled on http://localhost:8000/ and
you access the URL:
http://localhost:8000/helloworld/
You will see a page containing a big "Hello World!" text.
.. _topics-webconsole-extensions-ref:
Available Web console extensions
--------------------------------
.. module:: scrapy.contrib.webconsole
:synopsis: Contains most built-in web console extensions
Here is a list of built-in web console extensions.
Scheduler queue extension
~~~~~~~~~~~~~~~~~~~~~~~~~
.. module:: scrapy.contrib.webconsole.scheduler
:synopsis: Scheduler queue web console extension
.. class:: scrapy.contrib.webconsole.scheduler.SchedulerQueue
Display a list of all pending Requests in the Scheduler queue, grouped by
domain/spider.
Spider live stats extension
~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. module:: scrapy.contrib.webconsole.livestats
:synopsis: Spider live stats web console extension
.. class:: scrapy.contrib.webconsole.livestats.LiveStats
Display a table with stats of all spider crawled by the current Scrapy run,
including:
* Number of items scraped
* Number of pages crawled
* Number of pending requests in the scheduler
* Number of pending requests in the downloader queue
* Number of requests currently being downloaded
Engine status extension
~~~~~~~~~~~~~~~~~~~~~~~
.. module:: scrapy.contrib.webconsole.enginestatus
:synopsis: Engine status web console extension
.. class:: scrapy.contrib.webconsole.enginestatus.EngineStatus
Display the current status of the Scrapy Engine, which is just the output of
the Scrapy engine ``getstatus()`` method.
Stats collector dump extension
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. module:: scrapy.contrib.webconsole.stats
:synopsis: Stats dump web console extension
.. class:: scrapy.contrib.webconsole.stats.StatsDump
Display the stats collected so far by the stats collector.

docs/topics/webservice.rst Normal file

@ -0,0 +1,236 @@
.. _topics-webservice:
===========
Web Service
===========
Scrapy comes with a built-in web service for monitoring and controlling a
running crawler. The service exposes most resources using the `JSON-RPC 2.0`_
protocol, but there are also other (read-only) resources which just output JSON
data.
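As a quick illustration, a JSON-RPC resource can be queried from Python with the
``jsonrpc_client_call`` helper added in this commit. This is only a sketch, and it
assumes the service is running on the default port with the ``stats`` resource
enabled::

    from scrapy.utils.jsonrpc import jsonrpc_client_call

    # calls get_stats() on the Stats Collector through the 'stats' resource
    stats = jsonrpc_client_call('http://localhost:6080/stats', 'get_stats')
    print stats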
The service is extensible and is enabled by the :setting:`WEBSERVICE_ENABLED`
setting. The web server will listen on the port specified in
:setting:`WEBSERVICE_PORT`, and will log to the file specified in
:setting:`WEBSERVICE_LOGFILE`.
The web service is a :ref:`built-in Scrapy extension <topics-extensions-ref>`
which comes enabled by default, but you can also disable it if you're running
tight on memory.
.. _topics-webservice-resources:
Web service resources
=====================
The web service contains several resources, defined in the
:setting:`WEBSERVICE_RESOURCES` setting. Each resource provides a different
functionality. See :ref:`topics-webservice-resources-ref` for a list of
resources available by default.
Although you can implement your own resources using any protocol, there are
two kinds of resources bundled with Scrapy:
* Simple JSON resources - which are read-only and just output JSON data
* JSON-RPC resources - which provide direct access to certain Scrapy objects
using the `JSON-RPC 2.0`_ protocol
.. module:: scrapy.contrib.webservice
:synopsis: Built-in web service resources
.. _topics-webservice-resources-ref:
Available JSON-RPC resources
----------------------------
These are the JSON-RPC resources available by default in Scrapy:
Execution Manager JSON-RPC resource
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. module:: scrapy.contrib.webservice.manager
:synopsis: Execution Manager JSON-RPC resource
.. class:: ManagerResource
Provides access to the Execution Manager that controls the crawler.
Available by default at: http://localhost:6080/manager
Stats Collector JSON-RPC resource
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. module:: scrapy.contrib.webservice.stats
:synopsis: Stats JSON-RPC resource
.. class:: StatsResource
Provides access to the Stats Collector used by the crawler.
Available by default at: http://localhost:6080/stats
Spider Manager JSON-RPC resource
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. module:: scrapy.contrib.webservice.spiders
:synopsis: Spider Manager JSON-RPC resource
.. class:: SpidersResource
Provides access to the Spider Manager used by the crawler.
Available by default at: http://localhost:6080/spiders
Extension Manager JSON-RPC resource
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. module:: scrapy.contrib.webservice.extensions
:synopsis: Extension Manager JSON-RPC resource
.. class:: ExtensionsResource
Provides access to the Extension Manager used by the crawler.
Available by default at: http://localhost:6080/extensions
Available JSON resources
------------------------
These are the JSON resources available by default:
Engine status JSON resource
~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. module:: scrapy.contrib.webservice.enginestatus
:synopsis: Engine Status JSON resource
.. class:: EngineStatusResource
Provides access to the current status of the Scrapy engine.
Available by default at: http://localhost:6080/enginestatus
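Since plain JSON resources are read-only, they can be fetched with any HTTP
client. A minimal sketch using the standard library (assuming the default
port)::

    import urllib
    from scrapy.utils.py26 import json

    # the engine status resource just outputs JSON data
    status = json.loads(urllib.urlopen('http://localhost:6080/enginestatus').read())
    print status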
Web service settings
====================
These are the settings that control the web service behaviour:
.. setting:: WEBSERVICE_ENABLED
WEBSERVICE_ENABLED
------------------
Default: ``True``
A boolean which specifies if the web service will be enabled (provided its
extension is also enabled).
.. setting:: WEBSERVICE_LOGFILE
WEBSERVICE_LOGFILE
------------------
Default: ``None``
A file to use for logging HTTP requests made to the web service. If unset, the
log is sent to the standard Scrapy log.
.. setting:: WEBSERVICE_PORT
WEBSERVICE_PORT
---------------
Default: ``6080``
The port to use for the web service. If set to ``None`` or ``0``, a dynamically
assigned port is used.
WEBSERVICE_RESOURCES
--------------------
Default: ``{}``
The list of web service resources enabled for your project. See
:ref:`topics-webservice-resources`. These are added to the ones available by
default in Scrapy, defined in the :setting:`WEBSERVICE_RESOURCES_BASE` setting.
WEBSERVICE_RESOURCES_BASE
-------------------------
Default::
{
'scrapy.contrib.webservice.manager.ManagerResource': 1,
'scrapy.contrib.webservice.enginestatus.EngineStatusResource': 1,
'scrapy.contrib.webservice.extensions.ExtensionsResource': 1,
'scrapy.contrib.webservice.spiders.SpidersResource': 1,
'scrapy.contrib.webservice.stats.StatsResource': 1,
}
The list of web service resources available by default in Scrapy. You shouldn't
change this setting in your project; change :setting:`WEBSERVICE_RESOURCES`
instead. If you want to disable a resource, set its value to ``None`` in
:setting:`WEBSERVICE_RESOURCES`.
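For example, a project that wants to add a custom resource and disable the
stats one could use something like the following
(``myproject.webservice.MyResource`` is just a hypothetical path)::

    WEBSERVICE_RESOURCES = {
        'myproject.webservice.MyResource': 1,
        'scrapy.contrib.webservice.stats.StatsResource': None,
    }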
Writing a web service resource
==============================
Web service resources are implemented using the Twisted Web API. See this
`Twisted Web guide`_ for more information on Twisted web and Twisted web
resources.
To write a web service resource you should subclass the :class:`JsonResource` or
:class:`JsonRpcResource` classes and implement the ``render_GET`` method.
.. class:: scrapy.webservice.JsonResource
A subclass of `twisted.web.resource.Resource`_ that implements a JSON web
service resource.
.. attribute:: ws_name
The name by which the Scrapy web service will know this resource, and
also the path where this resource will listen. For example, assuming the
Scrapy web service is listening on http://localhost:6080/ and the
``ws_name`` is ``'resource1'``, the URL for that resource will be:
http://localhost:6080/resource1/
.. class:: scrapy.webservice.JsonRpcResource(target=None)
This is a subclass of :class:`JsonResource` for implementing JSON-RPC
resources. JSON-RPC resources expose Python (Scrapy) objects through a
JSON-RPC API. The wrapped object must be returned by the
:meth:`get_target` method, which returns the target passed in the
constructor by default.
.. method:: get_target()
Return the object wrapped by this JSON-RPC resource. By default, it
returns the object passed on the constructor.
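To make the API above concrete, here is a minimal sketch of a custom read-only
JSON resource, modelled on the bundled ``EngineStatusResource`` (the resource
name and module location are illustrative, not part of Scrapy)::

    from scrapy.webservice import JsonResource
    from scrapy.stats import stats

    class StatsDumpResource(JsonResource):
        """Read-only resource that outputs the stats collected so far."""

        ws_name = 'statsdump'

        def render_GET(self, txrequest):
            # whatever is returned here is serialized to JSON by JsonResource.render()
            return stats.get_stats()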
Examples of web service resources
=================================
StatsResource (JSON-RPC resource)
---------------------------------
.. literalinclude:: ../../scrapy/contrib/webservice/stats.py
EngineStatusResource (JSON resource)
-------------------------------------
.. literalinclude:: ../../scrapy/contrib/webservice/enginestatus.py
Example of web service client
=============================
scrapy-ws.py script
-------------------
.. literalinclude:: ../../bin/scrapy-ws.py
.. _Twisted Web guide: http://jcalderone.livejournal.com/50562.html
.. _JSON-RPC 2.0: http://www.jsonrpc.org/
.. _twisted.web.resource.Resource: http://twistedmatrix.com/documents/10.0.0/api/twisted.web.resource.Resource.html


@ -109,13 +109,8 @@ EXTENSIONS = {}
EXTENSIONS_BASE = {
    'scrapy.contrib.corestats.CoreStats': 0,
-   'scrapy.management.web.WebConsole': 0,
+   'scrapy.webservice.WebService': 0,
    'scrapy.telnet.TelnetConsole': 0,
-   'scrapy.contrib.webconsole.scheduler.SchedulerQueue': 0,
-   'scrapy.contrib.webconsole.livestats.LiveStats': 0,
-   'scrapy.contrib.webconsole.spiderctl.Spiderctl': 0,
-   'scrapy.contrib.webconsole.enginestatus.EngineStatus': 0,
-   'scrapy.contrib.webconsole.stats.StatsDump': 0,
    'scrapy.contrib.memusage.MemoryUsage': 0,
    'scrapy.contrib.memdebug.MemoryDebugger': 0,
    'scrapy.contrib.closespider.CloseSpider': 0,
@ -232,3 +227,14 @@ WEBCONSOLE_ENABLED = True
WEBCONSOLE_PORT = 6080
WEBCONSOLE_LOGFILE = None

+WEBSERVICE_ENABLED = True
+WEBSERVICE_LOGFILE = None
+WEBSERVICE_PORT = 6080
+WEBSERVICE_RESOURCES = {}
+WEBSERVICE_RESOURCES_BASE = {
+    'scrapy.contrib.webservice.manager.ManagerResource': 1,
+    'scrapy.contrib.webservice.enginestatus.EngineStatusResource': 1,
+    'scrapy.contrib.webservice.extensions.ExtensionsResource': 1,
+    'scrapy.contrib.webservice.spiders.SpidersResource': 1,
+    'scrapy.contrib.webservice.stats.StatsResource': 1,
+}


@ -0,0 +1,3 @@
import warnings
warnings.warn("Web console is deprecated. Consider using web service instead.", \
DeprecationWarning, stacklevel=2)


@ -0,0 +1,24 @@
from scrapy.webservice import JsonResource
from scrapy.core.manager import scrapymanager
from scrapy.utils.engine import get_engine_status
class EngineStatusResource(JsonResource):
ws_name = 'enginestatus'
def __init__(self, spider_name=None, _manager=scrapymanager):
JsonResource.__init__(self)
self._spider_name = spider_name
self.isLeaf = spider_name is not None
self._manager = _manager
def render_GET(self, txrequest):
status = get_engine_status(self._manager.engine)
if self._spider_name is None:
return status
for sp, st in status['spiders'].items():
if sp.name == self._spider_name:
return st
def getChild(self, name, txrequest):
return EngineStatusResource(name, self._manager)


@ -0,0 +1,10 @@
from scrapy.webservice import JsonRpcResource
from scrapy.extension import extensions
class ExtensionsResource(JsonRpcResource):
ws_name = 'extensions'
def __init__(self, _extensions=extensions):
JsonRpcResource.__init__(self)
self._target = _extensions


@ -0,0 +1,10 @@
from scrapy.webservice import JsonRpcResource
from scrapy.core.manager import scrapymanager
class ManagerResource(JsonRpcResource):
ws_name = 'manager'
def __init__(self, _manager=scrapymanager):
JsonRpcResource.__init__(self)
self._target = _manager


@ -0,0 +1,10 @@
from scrapy.webservice import JsonRpcResource
from scrapy.spider import spiders
class SpidersResource(JsonRpcResource):
ws_name = 'spiders'
def __init__(self, _spiders=spiders):
JsonRpcResource.__init__(self)
self._target = _spiders


@ -0,0 +1,10 @@
from scrapy.webservice import JsonRpcResource
from scrapy.stats import stats
class StatsResource(JsonRpcResource):
ws_name = 'stats'
def __init__(self, _stats=stats):
JsonRpcResource.__init__(self)
self._target = _stats


@ -1,8 +1,6 @@
-"""
-Scrapy Web Console extension
-See docs/topics/webconsole.rst
-"""
+import warnings
+warnings.warn("Scrapy web console is deprecated. Consider using web service instead.", \
+    DeprecationWarning, stacklevel=2)

import re
import socket


@ -77,6 +77,6 @@ class BaseSpider(object_ref):
        raise NotImplementedError

    def __str__(self):
-        return "<%s %r>" % (type(self).__name__, self.name)
+        return "<%s %r at 0x%0x>" % (type(self).__name__, self.name, id(self))

    __repr__ = __str__


@ -0,0 +1,112 @@
import unittest
from cStringIO import StringIO
from scrapy.utils.jsonrpc import jsonrpc_client_call, jsonrpc_server_call, \
JsonRpcError, jsonrpc_errors
from scrapy.utils.py26 import json
class urllib_stub(object):
def __init__(self, result=None, error=None):
response = {}
if result:
response.update(result=result)
if error:
response.update(error=error)
self.response = json.dumps(response)
self.request = None
def urlopen(self, url, request):
self.url = url
self.request = request
return StringIO(self.response)
class TestTarget(object):
def call(self, *args, **kwargs):
return list(args), kwargs
def exception(self):
raise Exception("testing-errors")
class JsonRpcUtilsTestCase(unittest.TestCase):
def test_jsonrpc_client_call_request(self):
ul = urllib_stub(1)
jsonrpc_client_call('url', 'test', 'one', 2, _urllib=ul)
req = json.loads(ul.request)
assert 'id' in req
self.assertEqual(ul.url, 'url')
self.assertEqual(req['jsonrpc'], '2.0')
self.assertEqual(req['method'], 'test')
self.assertEqual(req['params'], ['one', 2])
def test_jsonrpc_client_call_response(self):
ul = urllib_stub()
# must return result or error
self.assertRaises(ValueError, jsonrpc_client_call, 'url', 'test', _urllib=ul)
ul = urllib_stub(result={'one': 1})
self.assertEquals(jsonrpc_client_call('url', 'test', _urllib=ul), {'one': 1})
ul = urllib_stub(error={'code': 123, 'message': 'hello', 'data': 'some data'})
raised = False
try:
jsonrpc_client_call('url', 'test', _urllib=ul)
except JsonRpcError, e:
raised = True
self.assertEqual(e.code, 123)
self.assertEqual(e.message, 'hello')
self.assertEqual(e.data, 'some data')
assert '123' in str(e)
assert 'hello' in str(e)
assert raised, "JsonRpcError not raised"
def test_jsonrpc_server_call(self):
t = TestTarget()
r = jsonrpc_server_call(t, 'invalid json data')
assert 'error' in r
assert r['jsonrpc'] == '2.0'
assert r['id'] is None
self.assertEqual(r['error']['code'], jsonrpc_errors.PARSE_ERROR)
assert 'Traceback' in r['error']['data']
r = jsonrpc_server_call(t, '{"test": "test"}')
assert 'error' in r
assert r['jsonrpc'] == '2.0'
assert r['id'] is None
self.assertEqual(r['error']['code'], jsonrpc_errors.INVALID_REQUEST)
r = jsonrpc_server_call(t, '{"method": "notfound", "id": 1}')
assert 'error' in r
assert r['jsonrpc'] == '2.0'
assert r['id'] == 1
self.assertEqual(r['error']['code'], jsonrpc_errors.METHOD_NOT_FOUND)
r = jsonrpc_server_call(t, '{"method": "exception", "id": 1}')
assert 'error' in r
assert r['jsonrpc'] == '2.0'
assert r['id'] == 1
self.assertEqual(r['error']['code'], jsonrpc_errors.INTERNAL_ERROR)
assert 'testing-errors' in r['error']['message']
assert 'Traceback' in r['error']['data']
r = jsonrpc_server_call(t, '{"method": "call", "id": 2}')
assert 'result' in r
assert r['jsonrpc'] == '2.0'
assert r['id'] == 2
self.assertEqual(r['result'], ([], {}))
r = jsonrpc_server_call(t, '{"method": "call", "params": [456, 123], "id": 3}')
assert 'result' in r
assert r['jsonrpc'] == '2.0'
assert r['id'] == 3
self.assertEqual(r['result'], ([456, 123], {}))
r = jsonrpc_server_call(t, '{"method": "call", "params": {"data": 789}, "id": 3}')
assert 'result' in r
assert r['jsonrpc'] == '2.0'
assert r['id'] == 3
self.assertEqual(r['result'], ([], {'data': 789}))
if __name__ == "__main__":
unittest.main()


@ -0,0 +1,125 @@
import unittest
import datetime
from decimal import Decimal
from scrapy.utils.serialize import SpiderReferencer, ScrapyJSONEncoder, ScrapyJSONDecoder
from scrapy.utils.py26 import json
from scrapy.spider import BaseSpider
from scrapy.http import Request, Response
class ExecutionEngineStub(object):
def __init__(self, open_spiders):
self.open_spiders = open_spiders
class ExecutionMangerStub(object):
def __init__(self, open_spiders):
self.engine = ExecutionEngineStub(open_spiders)
class BaseTestCase(unittest.TestCase):
def setUp(self):
self.spider1 = BaseSpider('name1')
self.spider2 = BaseSpider('name2')
open_spiders = set([self.spider1, self.spider2])
manager = ExecutionMangerStub(open_spiders)
self.spref = SpiderReferencer(manager)
self.encoder = ScrapyJSONEncoder(spref=self.spref)
self.decoder = ScrapyJSONDecoder(spref=self.spref)
class SpiderReferencerTestCase(BaseTestCase):
def test_spiders_and_references(self):
ref1 = self.spref.get_reference_from_spider(self.spider1)
assert isinstance(ref1, str)
assert self.spider1.name in ref1
ref2 = self.spref.get_reference_from_spider(self.spider2)
ref1_ = self.spref.get_reference_from_spider(self.spider1)
assert ref1 == ref1_
assert ref1 != ref2
sp1 = self.spref.get_spider_from_reference(ref1)
sp2 = self.spref.get_spider_from_reference(ref2)
sp1_ = self.spref.get_spider_from_reference(ref1)
assert isinstance(sp1, BaseSpider)
assert sp1 is not sp2
assert sp1 is sp1_
# must return string as-is if spider id not found
assert 'lala' == self.spref.get_spider_from_reference('lala')
# must raise RuntimeError if spider id is not found and spider is not running
self.assertRaises(RuntimeError, self.spref.get_spider_from_reference, 'spider:fffffff')
def test_encode_decode(self):
sr = self.spref
sp1 = self.spider1
sp2 = self.spider2
ref1 = sr.get_reference_from_spider(sp1)
ref2 = sr.get_reference_from_spider(sp2)
examples = [
('lala', 'lala'),
(sp1, ref1),
(['lala', sp1], ['lala', ref1]),
({'lala': sp1}, {'lala': ref1}),
({sp1: sp2}, {ref1: ref2}),
({sp1: {sp2: ['lala', sp1]}}, {ref1: {ref2: ['lala', ref1]}})
]
for spiders, refs in examples:
self.assertEqual(sr.encode_references(spiders), refs)
self.assertEqual(sr.decode_references(refs), spiders)
class JsonEncoderTestCase(BaseTestCase):
def test_encode_decode(self):
sr = self.spref
sp1 = self.spider1
sp2 = self.spider2
ref1 = sr.get_reference_from_spider(sp1)
ref2 = sr.get_reference_from_spider(sp2)
dt = datetime.datetime(2010, 1, 2, 10, 11, 12)
dts = "2010-01-02 10:11:12"
d = datetime.date(2010, 1, 2)
ds = "2010-01-02"
t = datetime.time(10, 11, 12)
ts = "10:11:12"
dec = Decimal("1000.12")
decs = "1000.12"
examples_encode_decode = [
('lala', 'lala'),
(sp1, ref1),
(['lala', sp1], ['lala', ref1]),
({'lala': sp1}, {'lala': ref1}),
({sp1: sp2}, {ref1: ref2}),
({sp1: {sp2: ['lala', sp1]}}, {ref1: {ref2: ['lala', ref1]}})
]
for spiders, refs in examples_encode_decode:
self.assertEqual(self.encoder.encode(spiders), json.dumps(refs))
self.assertEqual(self.decoder.decode(json.dumps(refs)), spiders)
examples_encode_only = [
({sp1: dt}, {ref1: dts}),
({sp1: d}, {ref1: ds}),
({sp1: t}, {ref1: ts}),
({sp1: dec}, {ref1: decs}),
]
for spiders, refs in examples_encode_only:
self.assertEqual(self.encoder.encode(spiders), json.dumps(refs))
def test_encode_request(self):
r = Request("http://www.example.com/lala")
rs = self.encoder.encode(r)
assert r.method in rs
assert r.url in rs
def test_encode_response(self):
r = Response("http://www.example.com/lala")
rs = self.encoder.encode(r)
assert r.url in rs
assert str(r.status) in rs
if __name__ == "__main__":
unittest.main()


@ -37,23 +37,34 @@ def get_engine_status(engine=None):
        "engine.scraper.sites[spider].needs_backout()",
    ]

-   s = "Execution engine status\n\n"
+   status = {'global': {}, 'spiders': {}}
    for test in global_tests:
        try:
-           s += "%-47s : %s\n" % (test, eval(test))
+           status['global'][test] = eval(test)
        except Exception, e:
-           s += "%-47s : %s (exception)\n" % (test, type(e).__name__)
+           status['global'][test] = "%s (exception)" % type(e).__name__
-   s += "\n"
    for spider in engine.downloader.sites:
-       s += "Spider: %s\n" % spider
+       x = {}
        for test in spider_tests:
            try:
-               s += " %-50s : %s\n" % (test, eval(test))
+               x[test] = eval(test)
            except Exception, e:
-               s += " %-50s : %s (exception)\n" % (test, type(e).__name__)
+               x[test] = "%s (exception)" % type(e).__name__
+       status['spiders'][spider] = x
+   return status
+
+def format_engine_status(engine=None):
+   status = get_engine_status(engine)
+   s = "Execution engine status\n\n"
+   for test, result in status['global'].items():
+       s += "%-47s : %s\n" % (test, result)
+   s += "\n"
+   for spider, tests in status['spiders'].items():
+       s += "Spider: %s\n" % spider
+       for test, result in tests.items():
+           s += " %-50s : %s\n" % (test, result)
    return s

def print_engine_status(engine=None):
-   print get_engine_status(engine)
+   print format_engine_status(engine)

scrapy/utils/jsonrpc.py Normal file

@ -0,0 +1,94 @@
"""
This module implements the JSON-RPC 2.0 protocol, as defined in:
http://groups.google.com/group/json-rpc/web/json-rpc-2-0
"""
import urllib
import traceback
from scrapy.utils.py26 import json
from scrapy.utils.serialize import ScrapyJSONDecoder
# JSON-RPC 2.0 errors, as defined in:
class jsonrpc_errors:
PARSE_ERROR = -32700
INVALID_REQUEST = -32600
METHOD_NOT_FOUND = -32601
INVALID_PARAMS = -32602
INTERNAL_ERROR = -32603
class JsonRpcError(Exception):
def __init__(self, code, message, data=None):
super(JsonRpcError, self).__init__()
self.code = code
self.message = message
self.data = data
def __str__(self):
return "JSON-RPC error (code %d): %s" % (self.code, self.message)
def jsonrpc_client_call(url, method, *args, **kwargs):
"""Execute a JSON-RPC call on the given url"""
_urllib = kwargs.pop('_urllib', urllib)
req = {'jsonrpc': '2.0', 'method': method, 'params': args or kwargs, 'id': 1}
res = json.loads(_urllib.urlopen(url, json.dumps(req)).read())
if 'result' in res:
return res['result']
elif 'error' in res:
er = res['error']
raise JsonRpcError(er['code'], er['message'], er['data'])
else:
msg = "JSON-RPC response must contain 'result' or 'error': %s" % res
raise ValueError(msg)
def jsonrpc_server_call(target, jsonrpc_request, json_decoder=None):
"""Execute the given JSON-RPC request (as JSON-encoded string) on the given
target object and return the JSON-RPC response, as a dict
"""
if json_decoder is None:
json_decoder = ScrapyJSONDecoder()
try:
req = json_decoder.decode(jsonrpc_request)
except Exception, e:
return jsonrpc_error(None, jsonrpc_errors.PARSE_ERROR, 'Parse error', \
traceback.format_exc())
try:
id, methname = req['id'], req['method']
except KeyError:
return jsonrpc_error(None, jsonrpc_errors.INVALID_REQUEST, 'Invalid Request')
try:
method = getattr(target, methname)
except AttributeError:
return jsonrpc_error(id, jsonrpc_errors.METHOD_NOT_FOUND, 'Method not found')
params = req.get('params', [])
a, kw = ([], params) if isinstance(params, dict) else (params, {})
try:
return jsonrpc_result(id, method(*a, **kw))
except Exception, e:
return jsonrpc_error(id, jsonrpc_errors.INTERNAL_ERROR, str(e), \
traceback.format_exc())
def jsonrpc_error(id, code, message, data=None):
"""Create JSON-RPC error response"""
return {
'jsonrpc': '2.0',
'error': {
'code': code,
'message': message,
'data': data,
},
'id': id,
}
def jsonrpc_result(id, result):
"""Create JSON-RPC result response"""
return {
'jsonrpc': '2.0',
'result': result,
'id': id,
}

scrapy/utils/serialize.py Normal file

@ -0,0 +1,115 @@
import re
import datetime
import decimal
from scrapy.core.manager import scrapymanager
from scrapy.spider import BaseSpider
from scrapy.http import Request, Response
from scrapy.utils.py26 import json
class SpiderReferencer(object):
"""Class to serialize (and deserialize) objects (typically dicts)
containing references to running spiders (ie. Spider objects). This is
required because simplejson fails to serialize dicts containing
non-primitive types as keys, even when you override
ScrapyJSONEncoder.default() with a custom encoding mechanism.
"""
spider_ref_re = re.compile('^spider:([0-9a-f]+)(:.*)?$')
def __init__(self, manager=None):
self.manager = manager or scrapymanager
def get_reference_from_spider(self, spider):
return 'spider:%x:%s' % (id(spider), spider.name)
def get_spider_from_reference(self, ref):
"""Returns the Spider referenced by text, if text is a spider
reference. Otherwise it returns the text itself. If the text references
a non-running spider it raises a RuntimeError.
"""
m = self.spider_ref_re.search(ref)
if m:
spid = int(m.group(1), 16)
for spider in self.manager.engine.open_spiders:
if id(spider) == spid:
return spider
raise RuntimeError("Spider not running: %s" % ref)
return ref
def encode_references(self, obj):
"""Look for Spider objects and replace them with spider references"""
if isinstance(obj, BaseSpider):
return self.get_reference_from_spider(obj)
elif isinstance(obj, dict):
d = {}
for k, v in obj.items():
k = self.encode_references(k)
v = self.encode_references(v)
d[k] = v
return d
elif isinstance(obj, (list, tuple)):
return [self.encode_references(x) for x in obj]
else:
return obj
def decode_references(self, obj):
"""Look for spider references and replace them with Spider objects"""
if isinstance(obj, basestring):
return self.get_spider_from_reference(obj)
elif isinstance(obj, dict):
d = {}
for k, v in obj.items():
k = self.decode_references(k)
v = self.decode_references(v)
d[k] = v
return d
elif isinstance(obj, (list, tuple)):
return [self.decode_references(x) for x in obj]
else:
return obj
class ScrapyJSONEncoder(json.JSONEncoder):
DATE_FORMAT = "%Y-%m-%d"
TIME_FORMAT = "%H:%M:%S"
def __init__(self, *a, **kw):
self.spref = kw.pop('spref', None) or SpiderReferencer()
super(ScrapyJSONEncoder, self).__init__(*a, **kw)
def encode(self, o):
if self.spref:
o = self.spref.encode_references(o)
return super(ScrapyJSONEncoder, self).encode(o)
def default(self, o):
if isinstance(o, datetime.datetime):
return o.strftime("%s %s" % (self.DATE_FORMAT, self.TIME_FORMAT))
elif isinstance(o, datetime.date):
return o.strftime(self.DATE_FORMAT)
elif isinstance(o, datetime.time):
return o.strftime(self.TIME_FORMAT)
elif isinstance(o, decimal.Decimal):
return str(o)
elif isinstance(o, Request):
return "<%s %s %s>" % (type(o).__name__, o.method, o.url)
elif isinstance(o, Response):
return "<%s %s %s>" % (type(o).__name__, o.status, o.url)
else:
return super(ScrapyJSONEncoder, self).default(o)
class ScrapyJSONDecoder(json.JSONDecoder):
def __init__(self, *a, **kw):
self.spref = kw.pop('spref', None) or SpiderReferencer()
super(ScrapyJSONDecoder, self).__init__(*a, **kw)
def decode(self, s):
o = super(ScrapyJSONDecoder, self).decode(s)
if self.spref:
o = self.spref.decode_references(o)
return o

scrapy/webservice.py Normal file

@ -0,0 +1,86 @@
"""
Scrapy web services extension
See docs/topics/ws.rst
"""
from twisted.internet import reactor
from twisted.web import server, resource, error
from scrapy.core.exceptions import NotConfigured
from scrapy.utils.jsonrpc import jsonrpc_server_call
from scrapy.utils.serialize import ScrapyJSONEncoder, ScrapyJSONDecoder
from scrapy.utils.misc import load_object
from scrapy.utils.conf import build_component_list
from scrapy.conf import settings
class JsonResource(resource.Resource):
ws_name = None
json_encoder = ScrapyJSONEncoder()
def render(self, txrequest):
r = resource.Resource.render(self, txrequest)
r = self.json_encoder.encode(r)
txrequest.setHeader('Content-Type', 'application/json')
txrequest.setHeader('Content-Length', len(r))
return r
class JsonRpcResource(JsonResource):
json_decoder = ScrapyJSONDecoder()
def __init__(self, target=None):
JsonResource.__init__(self)
self._target = target
def render_GET(self, txrequest):
return self.get_target()
def render_POST(self, txrequest):
reqstr = txrequest.content.read()
target = self.get_target()
return jsonrpc_server_call(target, reqstr, self.json_decoder)
def getChild(self, name, txrequest):
target = self.get_target()
try:
newtarget = getattr(target, name)
return JsonRpcResource(newtarget)
except AttributeError:
return error.NoResource("No such child resource.")
def get_target(self):
return self._target
class RootResource(JsonResource):
def render_GET(self, txrequest):
return {'resources': self.children.keys()}
def getChild(self, name, txrequest):
if name == '':
return self
return JsonResource.getChild(self, name, txrequest)
class WebService(server.Site):
def __init__(self):
if not settings.getbool('WEBSERVICE_ENABLED'):
raise NotConfigured
logfile = settings['WEBSERVICE_LOGFILE']
port = settings.getint('WEBSERVICE_PORT')
root = RootResource()
reslist = build_component_list(settings['WEBSERVICE_RESOURCES_BASE'], \
settings['WEBSERVICE_RESOURCES'])
for res_cls in map(load_object, reslist):
res = res_cls()
root.putChild(res.ws_name, res)
server.Site.__init__(self, root, logPath=logfile)
self.noisy = False
reactor.callWhenRunning(reactor.listenTCP, port, self)