Mirror of https://github.com/scrapy/scrapy.git
* Added Scrapy Web Service with documentation and tests.
* Marked Web Console as deprecated.
* Removed Web Console documentation to discourage its use.
This commit is contained in: parent 2499dfee5e, commit 6a33d6c4d0
bin/scrapy-ws.py (new executable file, 114 lines)

#!/usr/bin/env python
"""
Example script to control and monitor Scrapy using its web service. It only
provides reduced functionality, as its main purpose is to illustrate how to
write a web service client. Feel free to improve it or write your own.
"""

import sys, optparse, urllib
from urlparse import urljoin

from scrapy.utils.jsonrpc import jsonrpc_client_call, JsonRpcError
from scrapy.utils.py26 import json

def get_commands():
    return {
        'help': cmd_help,
        'run': cmd_run,
        'list-available': cmd_list_available,
        'list-running': cmd_list_running,
        'list-resources': cmd_list_resources,
        'list-extensions': cmd_list_extensions,
        'get-global-stats': cmd_get_global_stats,
        'get-spider-stats': cmd_get_spider_stats,
    }

def cmd_help(args, opts):
    """help - list available commands"""
    print "Available commands:"
    for _, func in sorted(get_commands().items()):
        print " ", func.__doc__

def cmd_run(args, opts):
    """run <spider_name> - schedule spider for running"""
    jsonrpc_call(opts, 'manager/queue', 'append_spider_name', args[0])

def cmd_list_running(args, opts):
    """list-running - list running spiders"""
    for x in json_get(opts, 'manager/engine/open_spiders'):
        print x

def cmd_list_available(args, opts):
    """list-available - list names of available spiders"""
    for x in jsonrpc_call(opts, 'spiders', 'list'):
        print x

def cmd_list_resources(args, opts):
    """list-resources - list available web service resources"""
    for x in json_get(opts, '')['resources']:
        print x

def cmd_list_extensions(args, opts):
    """list-extensions - list enabled extensions"""
    for x in jsonrpc_call(opts, 'extensions/enabled', 'keys'):
        print x

def cmd_get_spider_stats(args, opts):
    """get-spider-stats <spider> - get stats of a running spider"""
    stats = jsonrpc_call(opts, 'stats', 'get_stats', args[0])
    for name, value in stats.items():
        print "%-40s %s" % (name, value)

def cmd_get_global_stats(args, opts):
    """get-global-stats - get global stats"""
    stats = jsonrpc_call(opts, 'stats', 'get_stats')
    for name, value in stats.items():
        print "%-40s %s" % (name, value)

def get_wsurl(opts, path):
    return urljoin("http://%s:%s/" % (opts.host, opts.port), path)

def jsonrpc_call(opts, path, method, *args, **kwargs):
    url = get_wsurl(opts, path)
    return jsonrpc_client_call(url, method, *args, **kwargs)

def json_get(opts, path):
    url = get_wsurl(opts, path)
    return json.loads(urllib.urlopen(url).read())

def parse_opts():
    usage = "%prog [options] <command> [arg] ..."
    description = "Scrapy web service control script. Use '%prog help' " \
        "to see the list of available commands."
    op = optparse.OptionParser(usage=usage, description=description)
    op.add_option("-H", dest="host", default="localhost", \
        help="Scrapy host to connect to")
    op.add_option("-P", dest="port", type="int", default=6080, \
        help="Scrapy port to connect to")
    opts, args = op.parse_args()
    if not args:
        op.print_help()
        sys.exit(2)
    cmdname, cmdargs, opts = args[0], args[1:], opts
    commands = get_commands()
    if cmdname not in commands:
        sys.stderr.write("Unknown command: %s\n\n" % cmdname)
        cmd_help(None, None)
        sys.exit(1)
    return commands[cmdname], cmdargs, opts

def main():
    cmd, args, opts = parse_opts()
    try:
        cmd(args, opts)
    except IndexError:
        print cmd.__doc__
    except JsonRpcError, e:
        print str(e)
        if e.data:
            print "Server Traceback below:"
            print e.data


if __name__ == '__main__':
    main()
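As a usage illustration (editorial addition, not part of the commit), the
script would be driven like this against a crawler whose web service listens
on the default port; "somespider" stands for any name printed by
list-available:

    $ python bin/scrapy-ws.py list-available
    $ python bin/scrapy-ws.py run somespider
    $ python bin/scrapy-ws.py -P 6080 get-global-stats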
@@ -87,7 +87,7 @@ Built-in services

    topics/stats
    topics/email
    topics/telnetconsole
-   topics/webconsole
+   topics/webservice

 :doc:`topics/logging`
     Understand the simple logging facility provided by Scrapy.

@@ -101,8 +101,8 @@ Built-in services

 :doc:`topics/telnetconsole`
     Inspect a running crawler using a built-in Python console.

-:doc:`topics/webconsole`
-    Monitor and control a crawler using a web interface.
+:doc:`topics/webservice`
+    Monitor and control a crawler using a web service.


 Solving specific problems

@@ -190,7 +190,7 @@ scraping easy and efficient, such as:

 * An :ref:`Interactive scraping shell console <topics-shell>`, very useful for
   writing and debugging your spiders

-* A :ref:`Web management console <topics-webconsole>` for monitoring and
+* A builtin :ref:`Web service <topics-webservice>` for monitoring and
   controlling your bot

 * A :ref:`Telnet console <topics-telnetconsole>` for full unrestricted access
@@ -36,10 +36,8 @@ by a string: the full Python path to the extension's class name. For example::

     EXTENSIONS = {
         'scrapy.contrib.corestats.CoreStats': 500,
-        'scrapy.management.web.WebConsole': 500,
-        'scrapy.management.telnet.TelnetConsole': 500,
-        'scrapy.contrib.webconsole.enginestatus.EngineStatus': 500,
-        'scrapy.contrib.webconsole.stats.StatsDump': 500,
+        'scrapy.webservice.WebService': 500,
+        'scrapy.telnet.TelnetConsole': 500,
     }

@@ -71,10 +69,10 @@ Accessing enabled extensions

 Even though it's not usually needed, you can access extension objects through
 the :ref:`topics-extensions-ref-manager` which is populated when extensions are
-loaded. For example, to access the ``WebConsole`` extension::
+loaded. For example, to access the ``WebService`` extension::

     from scrapy.extension import extensions
-    webconsole_extension = extensions.enabled['WebConsole']
+    webservice_extension = extensions.enabled['WebService']

 .. see also::

@@ -146,7 +144,7 @@ how you :ref:`configure the downloader middlewares

     >>> extensions.load()
     >>> print extensions.enabled
     {'CoreStats': <scrapy.contrib.corestats.CoreStats object at 0x9e272ac>,
-     'WebConsole': <scrapy.management.telnet.TelnetConsole instance at 0xa05670c>,
+     'WebService': <scrapy.management.telnet.TelnetConsole instance at 0xa05670c>,
     ...

 .. attribute:: disabled

@@ -158,7 +156,7 @@ how you :ref:`configure the downloader middlewares

     >>> from scrapy.extension import extensions
     >>> extensions.load()
     >>> print extensions.disabled
-    {'MemoryDebugger': 'scrapy.contrib.webconsole.stats.MemoryDebugger',
+    {'MemoryDebugger': 'scrapy.contrib.memdebug.MemoryDebugger',
     'MyExtension': 'myproject.extensions.MyExtension',
     ...

@@ -193,44 +191,34 @@ Core Stats extension

 Enable the collection of core statistics, provided the stats collection is
 enabled (see :ref:`topics-stats`).

-.. _topics-extensions-ref-webconsole:
+.. _topics-extensions-ref-webservice:

-Web console extension
+Web service extension
 ~~~~~~~~~~~~~~~~~~~~~

-.. module:: scrapy.management.web
-   :synopsis: Web management console
+.. module:: scrapy.webservice
+   :synopsis: Web service

-.. class:: scrapy.management.web.WebConsole
+.. class:: scrapy.webservice.WebService

 Provides an extensible web server for managing a Scrapy process. It's enabled
-by the :setting:`WEBCONSOLE_ENABLED` setting. The server will listen in the
-port specified in :setting:`WEBCONSOLE_PORT`, and will log to the file
-specified in :setting:`WEBCONSOLE_LOGFILE`.
-
-The web server is designed to be extended by other extensions which can add
-their own management web interfaces.
-
-See also :ref:`topics-webconsole` for information on how to write your own web
-console extension, and :ref:`topics-webconsole-extensions-ref` for a list of
-available built-in (web console) extensions.
+See `topics-webservice`.

 .. _topics-extensions-ref-telnetconsole:

 Telnet console extension
 ~~~~~~~~~~~~~~~~~~~~~~~~

-.. module:: scrapy.management.telnet
-   :synopsis: Telnet management console
+.. module:: scrapy.telnet
+   :synopsis: Telnet console

-.. class:: scrapy.management.telnet.TelnetConsole
+.. class:: scrapy.telnet.TelnetConsole

 Provides a telnet console for getting into a Python interpreter inside the
 currently running Scrapy process, which can be very useful for debugging.

 The telnet console must be enabled by the :setting:`TELNETCONSOLE_ENABLED`
 setting, and the server will listen in the port specified in
-:setting:`WEBCONSOLE_PORT`.
+:setting:`TELNETCONSOLE_PORT`.

 .. _topics-extensions-ref-memusage:
@@ -519,13 +519,8 @@ Default::

     {
         'scrapy.contrib.corestats.CoreStats': 0,
-        'scrapy.management.web.WebConsole': 0,
-        'scrapy.management.telnet.TelnetConsole': 0,
-        'scrapy.contrib.webconsole.scheduler.SchedulerQueue': 0,
-        'scrapy.contrib.webconsole.livestats.LiveStats': 0,
-        'scrapy.contrib.webconsole.spiderctl.Spiderctl': 0,
-        'scrapy.contrib.webconsole.enginestatus.EngineStatus': 0,
-        'scrapy.contrib.webconsole.stats.StatsDump': 0,
+        'scrapy.webservice.WebService': 0,
+        'scrapy.telnet.TelnetConsole': 0,
         'scrapy.contrib.memusage.MemoryUsage': 0,
         'scrapy.contrib.memdebug.MemoryDebugger': 0,
         'scrapy.contrib.closedomain.CloseDomain': 0,

@@ -1054,33 +1049,3 @@ Default: ``"%s/%s" % (BOT_NAME, BOT_VERSION)``

 The default User-Agent to use when crawling, unless overrided.

-.. setting:: WEBCONSOLE_ENABLED
-
-WEBCONSOLE_ENABLED
-------------------
-
-Default: True
-
-A boolean which specifies if the web management console will be enabled
-(provided its extension is also enabled).
-
-.. setting:: WEBCONSOLE_LOGFILE
-
-WEBCONSOLE_LOGFILE
-------------------
-
-Default: ``None``
-
-A file to use for logging HTTP requests made to the web console. If unset web
-the log is sent to standard scrapy log.
-
-.. setting:: WEBCONSOLE_PORT
-
-WEBCONSOLE_PORT
----------------
-
-Default: ``6080``
-
-The port to use for the web console. If set to ``None`` or ``0``, a dynamically
-assigned port is used. For more info see :ref:`topics-webconsole`.
docs/topics/webconsole.rst (entire file removed, 142 lines)

@@ -1,142 +0,0 @@
.. _topics-webconsole:

===========
Web Console
===========

Scrapy comes with a built-in web server for monitoring and controlling a Scrapy
running process.

The web console is :ref:`built-in Scrapy extension
<topics-extensions-ref>` which comes enabled by default, but you can also
disable it if you're running tight on memory.

For more information about this extension see
:ref:`topics-extensions-ref-webconsole`.

Writing a web console extension
===============================

Writing a web console extension is similar to writing any other :ref:`Scrapy
extensions <topics-extensions>` except that the extension class must:

1. catch the ``scrapy.management.web.webconsole_discover_module`` signal, and
   return itself in the handler.

2. have the following two attributes:

   .. attribute:: webconsole_id

      The id by which the Scrapy web interface will known this extension, and
      also the main dir under which this extension interface will work. For
      example, assuming Scrapy web server is listening on
      http://localhost:8000/ and the ``webconsole_id='extension1'`` the web
      main page for the interface of that extension will be:

          http://localhost:8000/extension1/

   .. attribute:: webconsole_name

      The name by which the Scrapy web server will know that extension. That name
      will be displayed in the main web console index, as the text that links to
      the extension main page.

3. implement the following method:

   .. method:: webconsole_render(wc_request)

      ``wc_request`` is a `twisted.web.http.Request`_ object with the HTTP request
      sent to the web console.

      .. _twisted.web.http.Request: http://python.net/crew/mwh/apidocs/twisted.web.http.Request.html

      It must return a str with the web page to render, typically containing HTML
      code.

Example web console extension
=============================

Here's an example of a simple web console extension that just displays a "Hello
world!" text::

    from scrapy.xlib.pydispatch import dispatcher
    from scrapy.management.web import webconsole_discover_module

    class HelloWorldConsole(object):
        webconsole_id = 'helloworld'
        webconsole_name = 'Hello world'

        def __init__(self):
            dispatcher.connect(self.webconsole_discover_module, signal=webconsole_discover_module)

        def webconsole_discover_module(self):
            return self

        def webconsole_render(self, wc_request):
            return "<html><head></head><body><h1>Hello world!</h1></body>"

If you start Scrapy with the web console enabled on http://localhost:8000/ and
you access the URL:

    http://localhost:8000/helloworld/

You will see a page containing a big "Hello World!" text.

.. _topics-webconsole-extensions-ref:

Available Web console extensions
--------------------------------

.. module:: scrapy.contrib.webconsole
   :synopsis: Contains most built-in web console extensions

Here is a list of built-in web console extensions.

Scheduler queue extension
~~~~~~~~~~~~~~~~~~~~~~~~~

.. module:: scrapy.contrib.webconsole.scheduler
   :synopsis: Scheduler queue web console extension

.. class:: scrapy.contrib.webconsole.scheduler.SchedulerQueue

   Display a list of all pending Requests in the Scheduler queue, grouped by
   domain/spider.

Spider live stats extension
~~~~~~~~~~~~~~~~~~~~~~~~~~~

.. module:: scrapy.contrib.webconsole.livestats
   :synopsis: Spider live stats web console extension

.. class:: scrapy.contrib.webconsole.livestats.LiveStats

   Display a table with stats of all spider crawled by the current Scrapy run,
   including:

   * Number of items scraped
   * Number of pages crawled
   * Number of pending requests in the scheduler
   * Number of pending requests in the downloader queue
   * Number of requests currently being downloaded

Engine status extension
~~~~~~~~~~~~~~~~~~~~~~~

.. module:: scrapy.contrib.webconsole.enginestatus
   :synopsis: Engine status web console extension

.. class:: scrapy.contrib.webconsole.enginestatus.EngineStatus

   Display the current status of the Scrapy Engine, which is just the output of
   the Scrapy engine ``getstatus()`` method.

Stats collector dump extension
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

.. module:: scrapy.contrib.webconsole.stats
   :synopsis: Stats dump web console extension

.. class:: scrapy.contrib.webconsole.stats.StatsDump

   Display the stats collected so far by the stats collector.
docs/topics/webservice.rst (new file, 236 lines)

.. _topics-webservice:

===========
Web Service
===========

Scrapy comes with a built-in web service for monitoring and controlling a
running crawler. The service exposes most resources using the `JSON-RPC 2.0`_
protocol, but there are also other (read-only) resources which just output
JSON data.

It provides an extensible web service for managing a Scrapy process, and is
enabled by the :setting:`WEBSERVICE_ENABLED` setting. The web server will
listen on the port specified in :setting:`WEBSERVICE_PORT`, and will log to
the file specified in :setting:`WEBSERVICE_LOGFILE`.

The web service is a :ref:`built-in Scrapy extension <topics-extensions-ref>`
which comes enabled by default, but you can also disable it if you're running
tight on memory.

.. _topics-webservice-resources:

Web service resources
=====================

The web service contains several resources, defined in the
:setting:`WEBSERVICE_RESOURCES` setting. Each resource provides a different
functionality. See :ref:`topics-webservice-resources-ref` for a list of
resources available by default.

Although you can implement your own resources using any protocol, there are
two kinds of resources bundled with Scrapy:

* Simple JSON resources - which are read-only and just output JSON data
* JSON-RPC resources - which provide direct access to certain Scrapy objects
  using the `JSON-RPC 2.0`_ protocol (a client call example follows this list)
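As an editorial illustration (not part of the commit), a JSON-RPC resource can
be called from Python with the jsonrpc_client_call helper added by this commit;
the 'stats' path and the get_stats method below are the ones used by
bin/scrapy-ws.py:

    from scrapy.utils.jsonrpc import jsonrpc_client_call

    # Fetch all stats from a crawler whose web service listens on the default
    # port (6080). Returns the decoded 'result' member of the response, or
    # raises JsonRpcError if the server answered with an 'error' member.
    stats = jsonrpc_client_call('http://localhost:6080/stats', 'get_stats')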
.. module:: scrapy.contrib.webservice
   :synopsis: Built-in web service resources

.. _topics-webservice-resources-ref:

Available JSON-RPC resources
----------------------------

These are the JSON-RPC resources available by default in Scrapy:

Execution Manager JSON-RPC resource
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

.. module:: scrapy.contrib.webservice.manager
   :synopsis: Execution Manager JSON-RPC resource

.. class:: ManagerResource

   Provides access to the Execution Manager that controls the crawler.

   Available by default at: http://localhost:6080/manager

Stats Collector JSON-RPC resource
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

.. module:: scrapy.contrib.webservice.stats
   :synopsis: Stats JSON-RPC resource

.. class:: StatsResource

   Provides access to the Stats Collector used by the crawler.

   Available by default at: http://localhost:6080/stats

Spider Manager JSON-RPC resource
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

.. module:: scrapy.contrib.webservice.spiders
   :synopsis: Spider Manager JSON-RPC resource

.. class:: SpidersResource

   Provides access to the Spider Manager used by the crawler.

   Available by default at: http://localhost:6080/spiders

Extension Manager JSON-RPC resource
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

.. module:: scrapy.contrib.webservice.extensions
   :synopsis: Extension Manager JSON-RPC resource

.. class:: ExtensionsResource

   Provides access to the Extension Manager used by the crawler.

   Available by default at: http://localhost:6080/extensions

Available JSON resources
------------------------

These are the JSON resources available by default:

Engine Status JSON resource
~~~~~~~~~~~~~~~~~~~~~~~~~~~

.. module:: scrapy.contrib.webservice.enginestatus
   :synopsis: Engine Status JSON resource

.. class:: EngineStatusResource

   Provides access to the status of the crawler's engine.

   Available by default at: http://localhost:6080/enginestatus

Web service settings
====================

These are the settings that control the web service behaviour:

.. setting:: WEBSERVICE_ENABLED

WEBSERVICE_ENABLED
------------------

Default: ``True``

A boolean which specifies if the web service will be enabled (provided its
extension is also enabled).

.. setting:: WEBSERVICE_LOGFILE

WEBSERVICE_LOGFILE
------------------

Default: ``None``

A file to use for logging HTTP requests made to the web service. If unset, the
log is sent to the standard scrapy log.

.. setting:: WEBSERVICE_PORT

WEBSERVICE_PORT
---------------

Default: ``6080``

The port to use for the web service. If set to ``None`` or ``0``, a
dynamically assigned port is used.

.. setting:: WEBSERVICE_RESOURCES

WEBSERVICE_RESOURCES
--------------------

Default: ``{}``

The list of web service resources enabled for your project. See
:ref:`topics-webservice-resources`. These are added to the ones available by
default in Scrapy, defined in the :setting:`WEBSERVICE_RESOURCES_BASE` setting.

.. setting:: WEBSERVICE_RESOURCES_BASE

WEBSERVICE_RESOURCES_BASE
-------------------------

Default::

    {
        'scrapy.contrib.webservice.manager.ManagerResource': 1,
        'scrapy.contrib.webservice.enginestatus.EngineStatusResource': 1,
        'scrapy.contrib.webservice.extensions.ExtensionsResource': 1,
        'scrapy.contrib.webservice.spiders.SpidersResource': 1,
        'scrapy.contrib.webservice.stats.StatsResource': 1,
    }

The list of web service resources available by default in Scrapy. You
shouldn't change this setting in your project; change
:setting:`WEBSERVICE_RESOURCES` instead. If you want to disable some resource,
set its value to ``None`` in :setting:`WEBSERVICE_RESOURCES`, as in the
example below.
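A minimal sketch of using these two settings together (editorial addition;
myproject.webservice.MyResource is a hypothetical class name):

    # settings.py of a Scrapy project
    WEBSERVICE_RESOURCES = {
        # enable a custom resource on top of the defaults
        'myproject.webservice.MyResource': 1,
        # disable one of the resources enabled by WEBSERVICE_RESOURCES_BASE
        'scrapy.contrib.webservice.manager.ManagerResource': None,
    }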
Writing a web service resource
==============================

Web service resources are implemented using the Twisted Web API. See this
`Twisted Web guide`_ for more information on Twisted web and Twisted web
resources.

To write a web service resource you should subclass the :class:`JsonResource`
or :class:`JsonRpcResource` classes and implement the ``render_GET`` method
(a skeletal example follows at the end of this section).

.. class:: scrapy.webservice.JsonResource

   A subclass of `twisted.web.resource.Resource`_ that implements a JSON web
   service resource.

   .. attribute:: ws_name

      The name by which the Scrapy web service will know this resource, and
      also the path where this resource will listen. For example, assuming the
      Scrapy web service is listening on http://localhost:6080/ and the
      ``ws_name`` is ``'resource1'``, the URL for that resource will be:

          http://localhost:6080/resource1/

.. class:: scrapy.webservice.JsonRpcResource(target=None)

   This is a subclass of :class:`JsonResource` for implementing JSON-RPC
   resources. JSON-RPC resources expose Python (Scrapy) objects through a
   JSON-RPC API. The object wrapped must be returned by the
   :meth:`get_target` method, which returns the target passed in the
   constructor by default.

   .. method:: get_target()

      Return the object wrapped by this JSON-RPC resource. By default, it
      returns the object passed on the constructor.
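A skeletal example (editorial addition, following the pattern of the bundled
resources under scrapy/contrib/webservice/ added by this commit; the names
MyObjectResource and 'myobject' are hypothetical):

    from scrapy.webservice import JsonRpcResource

    class MyObjectResource(JsonRpcResource):

        # served at http://localhost:6080/myobject
        ws_name = 'myobject'

        def __init__(self, _target=None):
            JsonRpcResource.__init__(self)
            # the object whose methods become callable over JSON-RPC;
            # returned by the default get_target() implementation
            self._target = _target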
Examples of web service resources
=================================

StatsResource (JSON-RPC resource)
---------------------------------

.. literalinclude:: ../../scrapy/contrib/webservice/stats.py

EngineStatusResource (JSON resource)
------------------------------------

.. literalinclude:: ../../scrapy/contrib/webservice/enginestatus.py

Example of a web service client
===============================

scrapy-ws.py script
-------------------

.. literalinclude:: ../../bin/scrapy-ws.py

.. _Twisted Web guide: http://jcalderone.livejournal.com/50562.html
.. _JSON-RPC 2.0: http://www.jsonrpc.org/
.. _twisted.web.resource.Resource: http://twistedmatrix.com/documents/10.0.0/api/twisted.web.resource.Resource.html
@@ -109,13 +109,8 @@ EXTENSIONS = {}

 EXTENSIONS_BASE = {
     'scrapy.contrib.corestats.CoreStats': 0,
-    'scrapy.management.web.WebConsole': 0,
+    'scrapy.webservice.WebService': 0,
+    'scrapy.telnet.TelnetConsole': 0,
-    'scrapy.contrib.webconsole.scheduler.SchedulerQueue': 0,
-    'scrapy.contrib.webconsole.livestats.LiveStats': 0,
-    'scrapy.contrib.webconsole.spiderctl.Spiderctl': 0,
-    'scrapy.contrib.webconsole.enginestatus.EngineStatus': 0,
-    'scrapy.contrib.webconsole.stats.StatsDump': 0,
     'scrapy.contrib.memusage.MemoryUsage': 0,
     'scrapy.contrib.memdebug.MemoryDebugger': 0,
     'scrapy.contrib.closespider.CloseSpider': 0,

@@ -232,3 +227,14 @@ WEBCONSOLE_ENABLED = True

 WEBCONSOLE_PORT = 6080
 WEBCONSOLE_LOGFILE = None

+WEBSERVICE_ENABLED = True
+WEBSERVICE_LOGFILE = None
+WEBSERVICE_PORT = 6080
+WEBSERVICE_RESOURCES = {}
+WEBSERVICE_RESOURCES_BASE = {
+    'scrapy.contrib.webservice.manager.ManagerResource': 1,
+    'scrapy.contrib.webservice.enginestatus.EngineStatusResource': 1,
+    'scrapy.contrib.webservice.extensions.ExtensionsResource': 1,
+    'scrapy.contrib.webservice.spiders.SpidersResource': 1,
+    'scrapy.contrib.webservice.stats.StatsResource': 1,
+}
@@ -0,0 +1,3 @@
+import warnings
+warnings.warn("Web console is deprecated. Consider using web service instead.", \
+    DeprecationWarning, stacklevel=2)
scrapy/contrib/webservice/__init__.py (new empty file)

scrapy/contrib/webservice/enginestatus.py (new file, 24 lines)

from scrapy.webservice import JsonResource
from scrapy.core.manager import scrapymanager
from scrapy.utils.engine import get_engine_status

class EngineStatusResource(JsonResource):

    ws_name = 'enginestatus'

    def __init__(self, spider_name=None, _manager=scrapymanager):
        JsonResource.__init__(self)
        self._spider_name = spider_name
        self.isLeaf = spider_name is not None
        self._manager = _manager

    def render_GET(self, txrequest):
        status = get_engine_status(self._manager.engine)
        if self._spider_name is None:
            return status
        for sp, st in status['spiders'].items():
            if sp.name == self._spider_name:
                return st

    def getChild(self, name, txrequest):
        return EngineStatusResource(name, self._manager)

scrapy/contrib/webservice/extensions.py (new file, 10 lines)

from scrapy.webservice import JsonRpcResource
from scrapy.extension import extensions

class ExtensionsResource(JsonRpcResource):

    ws_name = 'extensions'

    def __init__(self, _extensions=extensions):
        JsonRpcResource.__init__(self)
        self._target = _extensions

scrapy/contrib/webservice/manager.py (new file, 10 lines)

from scrapy.webservice import JsonRpcResource
from scrapy.core.manager import scrapymanager

class ManagerResource(JsonRpcResource):

    ws_name = 'manager'

    def __init__(self, _manager=scrapymanager):
        JsonRpcResource.__init__(self)
        self._target = _manager

scrapy/contrib/webservice/spiders.py (new file, 10 lines)

from scrapy.webservice import JsonRpcResource
from scrapy.spider import spiders

class SpidersResource(JsonRpcResource):

    ws_name = 'spiders'

    def __init__(self, _spiders=spiders):
        JsonRpcResource.__init__(self)
        self._target = _spiders

scrapy/contrib/webservice/stats.py (new file, 10 lines)

from scrapy.webservice import JsonRpcResource
from scrapy.stats import stats

class StatsResource(JsonRpcResource):

    ws_name = 'stats'

    def __init__(self, _stats=stats):
        JsonRpcResource.__init__(self)
        self._target = _stats
@@ -1,8 +1,6 @@
-"""
-Scrapy Web Console extension
-
-See docs/topics/webconsole.rst
-"""
+import warnings
+warnings.warn("Scrapy web console is deprecated. Consider using web service instead.", \
+    DeprecationWarning, stacklevel=2)

 import re
 import socket
@@ -77,6 +77,6 @@ class BaseSpider(object_ref):
         raise NotImplementedError

     def __str__(self):
-        return "<%s %r>" % (type(self).__name__, self.name)
+        return "<%s %r at 0x%0x>" % (type(self).__name__, self.name, id(self))

     __repr__ = __str__
scrapy/tests/test_utils_jsonrpc.py (new file, 112 lines)

import unittest
from cStringIO import StringIO

from scrapy.utils.jsonrpc import jsonrpc_client_call, jsonrpc_server_call, \
    JsonRpcError, jsonrpc_errors
from scrapy.utils.py26 import json

class urllib_stub(object):
    def __init__(self, result=None, error=None):
        response = {}
        if result:
            response.update(result=result)
        if error:
            response.update(error=error)
        self.response = json.dumps(response)
        self.request = None

    def urlopen(self, url, request):
        self.url = url
        self.request = request
        return StringIO(self.response)

class TestTarget(object):

    def call(self, *args, **kwargs):
        return list(args), kwargs

    def exception(self):
        raise Exception("testing-errors")

class JsonRpcUtilsTestCase(unittest.TestCase):

    def test_jsonrpc_client_call_request(self):
        ul = urllib_stub(1)
        jsonrpc_client_call('url', 'test', 'one', 2, _urllib=ul)
        req = json.loads(ul.request)
        assert 'id' in req
        self.assertEqual(ul.url, 'url')
        self.assertEqual(req['jsonrpc'], '2.0')
        self.assertEqual(req['method'], 'test')
        self.assertEqual(req['params'], ['one', 2])

    def test_jsonrpc_client_call_response(self):
        ul = urllib_stub()
        # must return result or error
        self.assertRaises(ValueError, jsonrpc_client_call, 'url', 'test', _urllib=ul)
        ul = urllib_stub(result={'one': 1})
        self.assertEquals(jsonrpc_client_call('url', 'test', _urllib=ul), {'one': 1})
        ul = urllib_stub(error={'code': 123, 'message': 'hello', 'data': 'some data'})

        raised = False
        try:
            jsonrpc_client_call('url', 'test', _urllib=ul)
        except JsonRpcError, e:
            raised = True
            self.assertEqual(e.code, 123)
            self.assertEqual(e.message, 'hello')
            self.assertEqual(e.data, 'some data')
            assert '123' in str(e)
            assert 'hello' in str(e)
        assert raised, "JsonRpcError not raised"

    def test_jsonrpc_server_call(self):
        t = TestTarget()
        r = jsonrpc_server_call(t, 'invalid json data')
        assert 'error' in r
        assert r['jsonrpc'] == '2.0'
        assert r['id'] is None
        self.assertEqual(r['error']['code'], jsonrpc_errors.PARSE_ERROR)
        assert 'Traceback' in r['error']['data']

        r = jsonrpc_server_call(t, '{"test": "test"}')
        assert 'error' in r
        assert r['jsonrpc'] == '2.0'
        assert r['id'] is None
        self.assertEqual(r['error']['code'], jsonrpc_errors.INVALID_REQUEST)

        r = jsonrpc_server_call(t, '{"method": "notfound", "id": 1}')
        assert 'error' in r
        assert r['jsonrpc'] == '2.0'
        assert r['id'] == 1
        self.assertEqual(r['error']['code'], jsonrpc_errors.METHOD_NOT_FOUND)

        r = jsonrpc_server_call(t, '{"method": "exception", "id": 1}')
        assert 'error' in r
        assert r['jsonrpc'] == '2.0'
        assert r['id'] == 1
        self.assertEqual(r['error']['code'], jsonrpc_errors.INTERNAL_ERROR)
        assert 'testing-errors' in r['error']['message']
        assert 'Traceback' in r['error']['data']

        r = jsonrpc_server_call(t, '{"method": "call", "id": 2}')
        assert 'result' in r
        assert r['jsonrpc'] == '2.0'
        assert r['id'] == 2
        self.assertEqual(r['result'], ([], {}))

        r = jsonrpc_server_call(t, '{"method": "call", "params": [456, 123], "id": 3}')
        assert 'result' in r
        assert r['jsonrpc'] == '2.0'
        assert r['id'] == 3
        self.assertEqual(r['result'], ([456, 123], {}))

        r = jsonrpc_server_call(t, '{"method": "call", "params": {"data": 789}, "id": 3}')
        assert 'result' in r
        assert r['jsonrpc'] == '2.0'
        assert r['id'] == 3
        self.assertEqual(r['result'], ([], {'data': 789}))

if __name__ == "__main__":
    unittest.main()
scrapy/tests/test_utils_serialize.py (new file, 125 lines)

import unittest
import datetime
from decimal import Decimal

from scrapy.utils.serialize import SpiderReferencer, ScrapyJSONEncoder, ScrapyJSONDecoder
from scrapy.utils.py26 import json
from scrapy.spider import BaseSpider
from scrapy.http import Request, Response


class ExecutionEngineStub(object):
    def __init__(self, open_spiders):
        self.open_spiders = open_spiders

class ExecutionManagerStub(object):
    def __init__(self, open_spiders):
        self.engine = ExecutionEngineStub(open_spiders)

class BaseTestCase(unittest.TestCase):

    def setUp(self):
        self.spider1 = BaseSpider('name1')
        self.spider2 = BaseSpider('name2')
        open_spiders = set([self.spider1, self.spider2])
        manager = ExecutionManagerStub(open_spiders)
        self.spref = SpiderReferencer(manager)
        self.encoder = ScrapyJSONEncoder(spref=self.spref)
        self.decoder = ScrapyJSONDecoder(spref=self.spref)

class SpiderReferencerTestCase(BaseTestCase):

    def test_spiders_and_references(self):
        ref1 = self.spref.get_reference_from_spider(self.spider1)
        assert isinstance(ref1, str)
        assert self.spider1.name in ref1
        ref2 = self.spref.get_reference_from_spider(self.spider2)
        ref1_ = self.spref.get_reference_from_spider(self.spider1)
        assert ref1 == ref1_
        assert ref1 != ref2

        sp1 = self.spref.get_spider_from_reference(ref1)
        sp2 = self.spref.get_spider_from_reference(ref2)
        sp1_ = self.spref.get_spider_from_reference(ref1)
        assert isinstance(sp1, BaseSpider)
        assert sp1 is not sp2
        assert sp1 is sp1_

        # must return string as-is if spider id not found
        assert 'lala' == self.spref.get_spider_from_reference('lala')
        # must raise RuntimeError if spider id is not found and spider is not running
        self.assertRaises(RuntimeError, self.spref.get_spider_from_reference, 'spider:fffffff')

    def test_encode_decode(self):
        sr = self.spref
        sp1 = self.spider1
        sp2 = self.spider2
        ref1 = sr.get_reference_from_spider(sp1)
        ref2 = sr.get_reference_from_spider(sp2)

        examples = [
            ('lala', 'lala'),
            (sp1, ref1),
            (['lala', sp1], ['lala', ref1]),
            ({'lala': sp1}, {'lala': ref1}),
            ({sp1: sp2}, {ref1: ref2}),
            ({sp1: {sp2: ['lala', sp1]}}, {ref1: {ref2: ['lala', ref1]}})
        ]
        for spiders, refs in examples:
            self.assertEqual(sr.encode_references(spiders), refs)
            self.assertEqual(sr.decode_references(refs), spiders)

class JsonEncoderTestCase(BaseTestCase):

    def test_encode_decode(self):
        sr = self.spref
        sp1 = self.spider1
        sp2 = self.spider2
        ref1 = sr.get_reference_from_spider(sp1)
        ref2 = sr.get_reference_from_spider(sp2)
        dt = datetime.datetime(2010, 1, 2, 10, 11, 12)
        dts = "2010-01-02 10:11:12"
        d = datetime.date(2010, 1, 2)
        ds = "2010-01-02"
        t = datetime.time(10, 11, 12)
        ts = "10:11:12"
        dec = Decimal("1000.12")
        decs = "1000.12"

        examples_encode_decode = [
            ('lala', 'lala'),
            (sp1, ref1),
            (['lala', sp1], ['lala', ref1]),
            ({'lala': sp1}, {'lala': ref1}),
            ({sp1: sp2}, {ref1: ref2}),
            ({sp1: {sp2: ['lala', sp1]}}, {ref1: {ref2: ['lala', ref1]}})
        ]
        for spiders, refs in examples_encode_decode:
            self.assertEqual(self.encoder.encode(spiders), json.dumps(refs))
            self.assertEqual(self.decoder.decode(json.dumps(refs)), spiders)

        examples_encode_only = [
            ({sp1: dt}, {ref1: dts}),
            ({sp1: d}, {ref1: ds}),
            ({sp1: t}, {ref1: ts}),
            ({sp1: dec}, {ref1: decs}),
        ]
        for spiders, refs in examples_encode_only:
            self.assertEqual(self.encoder.encode(spiders), json.dumps(refs))

    def test_encode_request(self):
        r = Request("http://www.example.com/lala")
        rs = self.encoder.encode(r)
        assert r.method in rs
        assert r.url in rs

    def test_encode_response(self):
        r = Response("http://www.example.com/lala")
        rs = self.encoder.encode(r)
        assert r.url in rs
        assert str(r.status) in rs


if __name__ == "__main__":
    unittest.main()
@@ -37,23 +37,34 @@ def get_engine_status(engine=None):

         "engine.scraper.sites[spider].needs_backout()",
     ]

-    s = "Execution engine status\n\n"
-
+    status = {'global': {}, 'spiders': {}}
     for test in global_tests:
         try:
-            s += "%-47s : %s\n" % (test, eval(test))
+            status['global'][test] = eval(test)
         except Exception, e:
-            s += "%-47s : %s (exception)\n" % (test, type(e).__name__)
-    s += "\n"
+            status['global'][test] = "%s (exception)" % type(e).__name__
     for spider in engine.downloader.sites:
-        s += "Spider: %s\n" % spider
+        x = {}
         for test in spider_tests:
             try:
-                s += "    %-50s : %s\n" % (test, eval(test))
+                x[test] = eval(test)
             except Exception, e:
-                s += "    %-50s : %s (exception)\n" % (test, type(e).__name__)
+                x[test] = "%s (exception)" % type(e).__name__
+        status['spiders'][spider] = x
+    return status
+
+def format_engine_status(engine=None):
+    status = get_engine_status(engine)
+    s = "Execution engine status\n\n"
+    for test, result in status['global'].items():
+        s += "%-47s : %s\n" % (test, result)
+    s += "\n"
+    for spider, tests in status['spiders'].items():
+        s += "Spider: %s\n" % spider
+        for test, result in tests.items():
+            s += "    %-50s : %s\n" % (test, result)
+    return s

 def print_engine_status(engine=None):
-    print get_engine_status(engine)
+    print format_engine_status(engine)
scrapy/utils/jsonrpc.py (new file, 94 lines)

"""
This module implements the JSON-RPC 2.0 protocol, as defined in:
http://groups.google.com/group/json-rpc/web/json-rpc-2-0
"""

import urllib
import traceback

from scrapy.utils.py26 import json
from scrapy.utils.serialize import ScrapyJSONDecoder

# JSON-RPC 2.0 errors, as defined in the spec referenced above
class jsonrpc_errors:
    PARSE_ERROR = -32700
    INVALID_REQUEST = -32600
    METHOD_NOT_FOUND = -32601
    INVALID_PARAMS = -32602
    INTERNAL_ERROR = -32603

class JsonRpcError(Exception):

    def __init__(self, code, message, data=None):
        super(JsonRpcError, self).__init__()
        self.code = code
        self.message = message
        self.data = data

    def __str__(self):
        return "JSON-RPC error (code %d): %s" % (self.code, self.message)

def jsonrpc_client_call(url, method, *args, **kwargs):
    """Execute a JSON-RPC call on the given url"""
    _urllib = kwargs.pop('_urllib', urllib)
    req = {'jsonrpc': '2.0', 'method': method, 'params': args or kwargs, 'id': 1}
    res = json.loads(_urllib.urlopen(url, json.dumps(req)).read())
    if 'result' in res:
        return res['result']
    elif 'error' in res:
        er = res['error']
        raise JsonRpcError(er['code'], er['message'], er['data'])
    else:
        msg = "JSON-RPC response must contain 'result' or 'error': %s" % res
        raise ValueError(msg)

def jsonrpc_server_call(target, jsonrpc_request, json_decoder=None):
    """Execute the given JSON-RPC request (as JSON-encoded string) on the given
    target object and return the JSON-RPC response, as a dict
    """
    if json_decoder is None:
        json_decoder = ScrapyJSONDecoder()

    try:
        req = json_decoder.decode(jsonrpc_request)
    except Exception, e:
        return jsonrpc_error(None, jsonrpc_errors.PARSE_ERROR, 'Parse error', \
            traceback.format_exc())

    try:
        id, methname = req['id'], req['method']
    except KeyError:
        return jsonrpc_error(None, jsonrpc_errors.INVALID_REQUEST, 'Invalid Request')

    try:
        method = getattr(target, methname)
    except AttributeError:
        return jsonrpc_error(id, jsonrpc_errors.METHOD_NOT_FOUND, 'Method not found')

    params = req.get('params', [])
    a, kw = ([], params) if isinstance(params, dict) else (params, {})
    try:
        return jsonrpc_result(id, method(*a, **kw))
    except Exception, e:
        return jsonrpc_error(id, jsonrpc_errors.INTERNAL_ERROR, str(e), \
            traceback.format_exc())

def jsonrpc_error(id, code, message, data=None):
    """Create JSON-RPC error response"""
    return {
        'jsonrpc': '2.0',
        'error': {
            'code': code,
            'message': message,
            'data': data,
        },
        'id': id,
    }

def jsonrpc_result(id, result):
    """Create JSON-RPC result response"""
    return {
        'jsonrpc': '2.0',
        'result': result,
        'id': id,
    }
scrapy/utils/serialize.py (new file, 115 lines)

import re
import datetime
import decimal

from scrapy.core.manager import scrapymanager
from scrapy.spider import BaseSpider
from scrapy.http import Request, Response
from scrapy.utils.py26 import json


class SpiderReferencer(object):
    """Class to serialize (and deserialize) objects (typically dicts)
    containing references to running spiders (ie. Spider objects). This is
    required because simplejson fails to serialize dicts containing
    non-primitive types as keys, even when you override
    ScrapyJSONEncoder.default() with a custom encoding mechanism.
    """

    spider_ref_re = re.compile('^spider:([0-9a-f]+)(:.*)?$')

    def __init__(self, manager=None):
        self.manager = manager or scrapymanager

    def get_reference_from_spider(self, spider):
        return 'spider:%x:%s' % (id(spider), spider.name)

    def get_spider_from_reference(self, ref):
        """Returns the Spider referenced by text, if text is a spider
        reference. Otherwise it returns the text itself. If the text references
        a non-running spider it raises a RuntimeError.
        """
        m = self.spider_ref_re.search(ref)
        if m:
            spid = int(m.group(1), 16)
            for spider in self.manager.engine.open_spiders:
                if id(spider) == spid:
                    return spider
            raise RuntimeError("Spider not running: %s" % ref)
        return ref

    def encode_references(self, obj):
        """Look for Spider objects and replace them with spider references"""
        if isinstance(obj, BaseSpider):
            return self.get_reference_from_spider(obj)
        elif isinstance(obj, dict):
            d = {}
            for k, v in obj.items():
                k = self.encode_references(k)
                v = self.encode_references(v)
                d[k] = v
            return d
        elif isinstance(obj, (list, tuple)):
            return [self.encode_references(x) for x in obj]
        else:
            return obj

    def decode_references(self, obj):
        """Look for spider references and replace them with Spider objects"""
        if isinstance(obj, basestring):
            return self.get_spider_from_reference(obj)
        elif isinstance(obj, dict):
            d = {}
            for k, v in obj.items():
                k = self.decode_references(k)
                v = self.decode_references(v)
                d[k] = v
            return d
        elif isinstance(obj, (list, tuple)):
            return [self.decode_references(x) for x in obj]
        else:
            return obj


class ScrapyJSONEncoder(json.JSONEncoder):

    DATE_FORMAT = "%Y-%m-%d"
    TIME_FORMAT = "%H:%M:%S"

    def __init__(self, *a, **kw):
        self.spref = kw.pop('spref', None) or SpiderReferencer()
        super(ScrapyJSONEncoder, self).__init__(*a, **kw)

    def encode(self, o):
        if self.spref:
            o = self.spref.encode_references(o)
        return super(ScrapyJSONEncoder, self).encode(o)

    def default(self, o):
        if isinstance(o, datetime.datetime):
            return o.strftime("%s %s" % (self.DATE_FORMAT, self.TIME_FORMAT))
        elif isinstance(o, datetime.date):
            return o.strftime(self.DATE_FORMAT)
        elif isinstance(o, datetime.time):
            return o.strftime(self.TIME_FORMAT)
        elif isinstance(o, decimal.Decimal):
            return str(o)
        elif isinstance(o, Request):
            return "<%s %s %s>" % (type(o).__name__, o.method, o.url)
        elif isinstance(o, Response):
            return "<%s %s %s>" % (type(o).__name__, o.status, o.url)
        else:
            return super(ScrapyJSONEncoder, self).default(o)


class ScrapyJSONDecoder(json.JSONDecoder):

    def __init__(self, *a, **kw):
        self.spref = kw.pop('spref', None) or SpiderReferencer()
        super(ScrapyJSONDecoder, self).__init__(*a, **kw)

    def decode(self, s):
        o = super(ScrapyJSONDecoder, self).decode(s)
        if self.spref:
            o = self.spref.decode_references(o)
        return o
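A short usage sketch (editorial addition): encoding a structure that contains
a running spider replaces the spider with its reference string, and decoding
restores the live object while that spider is still running:

    from scrapy.utils.serialize import ScrapyJSONEncoder, ScrapyJSONDecoder

    encoder = ScrapyJSONEncoder()
    decoder = ScrapyJSONDecoder()
    # 'spider' is assumed to be a running BaseSpider instance; it is encoded
    # as a "spider:<id>:<name>" reference string inside plain JSON.
    data = encoder.encode({'spider': spider})
    assert decoder.decode(data) == {'spider': spider}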
scrapy/webservice.py (new file, 86 lines)

"""
Scrapy web services extension

See docs/topics/webservice.rst
"""

from twisted.internet import reactor
from twisted.web import server, resource, error

from scrapy.core.exceptions import NotConfigured
from scrapy.utils.jsonrpc import jsonrpc_server_call
from scrapy.utils.serialize import ScrapyJSONEncoder, ScrapyJSONDecoder
from scrapy.utils.misc import load_object
from scrapy.utils.conf import build_component_list
from scrapy.conf import settings


class JsonResource(resource.Resource):

    ws_name = None
    json_encoder = ScrapyJSONEncoder()

    def render(self, txrequest):
        r = resource.Resource.render(self, txrequest)
        r = self.json_encoder.encode(r)
        txrequest.setHeader('Content-Type', 'application/json')
        txrequest.setHeader('Content-Length', len(r))
        return r


class JsonRpcResource(JsonResource):

    json_decoder = ScrapyJSONDecoder()

    def __init__(self, target=None):
        JsonResource.__init__(self)
        self._target = target

    def render_GET(self, txrequest):
        return self.get_target()

    def render_POST(self, txrequest):
        reqstr = txrequest.content.read()
        target = self.get_target()
        return jsonrpc_server_call(target, reqstr, self.json_decoder)

    def getChild(self, name, txrequest):
        target = self.get_target()
        try:
            newtarget = getattr(target, name)
            return JsonRpcResource(newtarget)
        except AttributeError:
            return error.NoResource("No such child resource.")

    def get_target(self):
        return self._target


class RootResource(JsonResource):

    def render_GET(self, txrequest):
        return {'resources': self.children.keys()}

    def getChild(self, name, txrequest):
        if name == '':
            return self
        return JsonResource.getChild(self, name, txrequest)


class WebService(server.Site):

    def __init__(self):
        if not settings.getbool('WEBSERVICE_ENABLED'):
            raise NotConfigured
        logfile = settings['WEBSERVICE_LOGFILE']
        port = settings.getint('WEBSERVICE_PORT')
        root = RootResource()
        reslist = build_component_list(settings['WEBSERVICE_RESOURCES_BASE'], \
            settings['WEBSERVICE_RESOURCES'])
        for res_cls in map(load_object, reslist):
            res = res_cls()
            root.putChild(res.ws_name, res)
        server.Site.__init__(self, root, logPath=logfile)
        self.noisy = False
        reactor.callWhenRunning(reactor.listenTCP, port, self)
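Closing with a usage sketch (editorial addition): the RootResource above
advertises its children, which is what the list-resources command of
bin/scrapy-ws.py relies on:

    import urllib
    from scrapy.utils.py26 import json

    # Against a running crawler with the web service on the default port,
    # print the names of all registered web service resources.
    body = urllib.urlopen('http://localhost:6080/').read()
    print json.loads(body)['resources']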