mirror of https://github.com/scrapy/scrapy.git (synced 2025-02-23 15:04:27 +00:00)

* Added Scrapy Web Service with documentation and tests.
* Marked Web Console as deprecated.
* Removed Web Console documentation to discourage its use.

This commit is contained in:
parent 2499dfee5e
commit 6a33d6c4d0
bin/scrapy-ws.py (new executable file, 114 lines)
@@ -0,0 +1,114 @@
#!/usr/bin/env python
"""
Example script to control and monitor Scrapy using its web service. It only
provides reduced functionality, as its main purpose is to illustrate how to
write a web service client. Feel free to improve it or write your own.
"""

import sys, optparse, urllib
from urlparse import urljoin

from scrapy.utils.jsonrpc import jsonrpc_client_call, JsonRpcError
from scrapy.utils.py26 import json

def get_commands():
    return {
        'help': cmd_help,
        'run': cmd_run,
        'list-available': cmd_list_available,
        'list-running': cmd_list_running,
        'list-resources': cmd_list_resources,
        'list-extensions': cmd_list_extensions,
        'get-global-stats': cmd_get_global_stats,
        'get-spider-stats': cmd_get_spider_stats,
    }

def cmd_help(args, opts):
    """help - list available commands"""
    print "Available commands:"
    for _, func in sorted(get_commands().items()):
        print "  ", func.__doc__

def cmd_run(args, opts):
    """run <spider_name> - schedule spider for running"""
    jsonrpc_call(opts, 'manager/queue', 'append_spider_name', args[0])

def cmd_list_running(args, opts):
    """list-running - list running spiders"""
    for x in json_get(opts, 'manager/engine/open_spiders'):
        print x

def cmd_list_available(args, opts):
    """list-available - list names of available spiders"""
    for x in jsonrpc_call(opts, 'spiders', 'list'):
        print x

def cmd_list_resources(args, opts):
    """list-resources - list available web service resources"""
    for x in json_get(opts, '')['resources']:
        print x

def cmd_list_extensions(args, opts):
    """list-extensions - list enabled extensions"""
    for x in jsonrpc_call(opts, 'extensions/enabled', 'keys'):
        print x

def cmd_get_spider_stats(args, opts):
    """get-spider-stats <spider> - get stats of a running spider"""
    stats = jsonrpc_call(opts, 'stats', 'get_stats', args[0])
    for name, value in stats.items():
        print "%-40s %s" % (name, value)

def cmd_get_global_stats(args, opts):
    """get-global-stats - get global stats"""
    stats = jsonrpc_call(opts, 'stats', 'get_stats')
    for name, value in stats.items():
        print "%-40s %s" % (name, value)

def get_wsurl(opts, path):
    return urljoin("http://%s:%s/" % (opts.host, opts.port), path)

def jsonrpc_call(opts, path, method, *args, **kwargs):
    url = get_wsurl(opts, path)
    return jsonrpc_client_call(url, method, *args, **kwargs)

def json_get(opts, path):
    url = get_wsurl(opts, path)
    return json.loads(urllib.urlopen(url).read())

def parse_opts():
    usage = "%prog [options] <command> [arg] ..."
    description = "Scrapy web service control script. Use '%prog help' " \
        "to see the list of available commands."
    op = optparse.OptionParser(usage=usage, description=description)
    op.add_option("-H", dest="host", default="localhost", \
        help="Scrapy host to connect to")
    op.add_option("-P", dest="port", type="int", default=6080, \
        help="Scrapy port to connect to")
    opts, args = op.parse_args()
    if not args:
        op.print_help()
        sys.exit(2)
    cmdname, cmdargs, opts = args[0], args[1:], opts
    commands = get_commands()
    if cmdname not in commands:
        sys.stderr.write("Unknown command: %s\n\n" % cmdname)
        cmd_help(None, None)
        sys.exit(1)
    return commands[cmdname], cmdargs, opts

def main():
    cmd, args, opts = parse_opts()
    try:
        cmd(args, opts)
    except IndexError:
        print cmd.__doc__
    except JsonRpcError, e:
        print str(e)
        if e.data:
            print "Server Traceback below:"
            print e.data


if __name__ == '__main__':
    main()
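For illustration only (not part of this commit), the JSON-RPC helper used by the script above can also be called directly from Python. The sketch below assumes a crawler is running locally with the web service enabled on the default port 6080:

    from scrapy.utils.jsonrpc import jsonrpc_client_call

    # Equivalent of "scrapy-ws.py list-available": ask the 'spiders' resource,
    # which wraps the Spider Manager, for the list of known spider names.
    for name in jsonrpc_client_call('http://localhost:6080/spiders', 'list'):
        print name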
@@ -87,7 +87,7 @@ Built-in services
    topics/stats
    topics/email
    topics/telnetconsole
-   topics/webconsole
+   topics/webservice

 :doc:`topics/logging`
     Understand the simple logging facility provided by Scrapy.
@@ -101,8 +101,8 @@ Built-in services
 :doc:`topics/telnetconsole`
     Inspect a running crawler using a built-in Python console.

-:doc:`topics/webconsole`
-    Monitor and control a crawler using a web interface.
+:doc:`topics/webservice`
+    Monitor and control a crawler using a web service.


 Solving specific problems
@@ -190,7 +190,7 @@ scraping easy and efficient, such as:
 * An :ref:`Interactive scraping shell console <topics-shell>`, very useful for
   writing and debugging your spiders

-* A :ref:`Web management console <topics-webconsole>` for monitoring and
+* A builtin :ref:`Web service <topics-webservice>` for monitoring and
   controlling your bot

 * A :ref:`Telnet console <topics-telnetconsole>` for full unrestricted access
@@ -36,10 +36,8 @@ by a string: the full Python path to the extension's class name. For example::

     EXTENSIONS = {
         'scrapy.contrib.corestats.CoreStats': 500,
-        'scrapy.management.web.WebConsole': 500,
-        'scrapy.management.telnet.TelnetConsole': 500,
-        'scrapy.contrib.webconsole.enginestatus.EngineStatus': 500,
-        'scrapy.contrib.webconsole.stats.StatsDump': 500,
+        'scrapy.webservice.WebService': 500,
+        'scrapy.telnet.TelnetConsole': 500,
     }


@@ -71,10 +69,10 @@ Accessing enabled extensions

 Even though it's not usually needed, you can access extension objects through
 the :ref:`topics-extensions-ref-manager` which is populated when extensions are
-loaded. For example, to access the ``WebConsole`` extension::
+loaded. For example, to access the ``WebService`` extension::

     from scrapy.extension import extensions
-    webconsole_extension = extensions.enabled['WebConsole']
+    webservice_extension = extensions.enabled['WebService']

 .. see also::

@@ -146,7 +144,7 @@ how you :ref:`configure the downloader middlewares
 >>> extensions.load()
 >>> print extensions.enabled
 {'CoreStats': <scrapy.contrib.corestats.CoreStats object at 0x9e272ac>,
- 'WebConsole': <scrapy.management.telnet.TelnetConsole instance at 0xa05670c>,
+ 'WebService': <scrapy.management.telnet.TelnetConsole instance at 0xa05670c>,
 ...

 .. attribute:: disabled
@@ -158,7 +156,7 @@ how you :ref:`configure the downloader middlewares
 >>> from scrapy.extension import extensions
 >>> extensions.load()
 >>> print extensions.disabled
-{'MemoryDebugger': 'scrapy.contrib.webconsole.stats.MemoryDebugger',
+{'MemoryDebugger': 'scrapy.contrib.memdebug.MemoryDebugger',
  'MyExtension': 'myproject.extensions.MyExtension',
 ...

@@ -193,44 +191,34 @@ Core Stats extension
 Enable the collection of core statistics, provided the stats collection is
 enabled (see :ref:`topics-stats`).

-.. _topics-extensions-ref-webconsole:
+.. _topics-extensions-ref-webservice:

-Web console extension
+Web service extension
 ~~~~~~~~~~~~~~~~~~~~~

-.. module:: scrapy.management.web
-   :synopsis: Web management console
+.. module:: scrapy.webservice
+   :synopsis: Web service

-.. class:: scrapy.management.web.WebConsole
+.. class:: scrapy.webservice.WebService

-Provides an extensible web server for managing a Scrapy process. It's enabled
-by the :setting:`WEBCONSOLE_ENABLED` setting. The server will listen in the
-port specified in :setting:`WEBCONSOLE_PORT`, and will log to the file
-specified in :setting:`WEBCONSOLE_LOGFILE`.
-
-The web server is designed to be extended by other extensions which can add
-their own management web interfaces.
-
-See also :ref:`topics-webconsole` for information on how to write your own web
-console extension, and :ref:`topics-webconsole-extensions-ref` for a list of
-available built-in (web console) extensions.
+See `topics-webservice`.

 .. _topics-extensions-ref-telnetconsole:

 Telnet console extension
 ~~~~~~~~~~~~~~~~~~~~~~~~

-.. module:: scrapy.management.telnet
-   :synopsis: Telnet management console
+.. module:: scrapy.telnet
+   :synopsis: Telnet console

-.. class:: scrapy.management.telnet.TelnetConsole
+.. class:: scrapy.telnet.TelnetConsole

 Provides a telnet console for getting into a Python interpreter inside the
 currently running Scrapy process, which can be very useful for debugging.

 The telnet console must be enabled by the :setting:`TELNETCONSOLE_ENABLED`
 setting, and the server will listen in the port specified in
-:setting:`WEBCONSOLE_PORT`.
+:setting:`TELNETCONSOLE_PORT`.

 .. _topics-extensions-ref-memusage:

@@ -519,13 +519,8 @@ Default::

     {
         'scrapy.contrib.corestats.CoreStats': 0,
-        'scrapy.management.web.WebConsole': 0,
-        'scrapy.management.telnet.TelnetConsole': 0,
-        'scrapy.contrib.webconsole.scheduler.SchedulerQueue': 0,
-        'scrapy.contrib.webconsole.livestats.LiveStats': 0,
-        'scrapy.contrib.webconsole.spiderctl.Spiderctl': 0,
-        'scrapy.contrib.webconsole.enginestatus.EngineStatus': 0,
-        'scrapy.contrib.webconsole.stats.StatsDump': 0,
+        'scrapy.webservice.WebService': 0,
+        'scrapy.telnet.TelnetConsole': 0,
         'scrapy.contrib.memusage.MemoryUsage': 0,
         'scrapy.contrib.memdebug.MemoryDebugger': 0,
         'scrapy.contrib.closedomain.CloseDomain': 0,
@@ -1054,33 +1049,3 @@ Default: ``"%s/%s" % (BOT_NAME, BOT_VERSION)``

 The default User-Agent to use when crawling, unless overrided.

-.. setting:: WEBCONSOLE_ENABLED
-
-WEBCONSOLE_ENABLED
-------------------
-
-Default: True
-
-A boolean which specifies if the web management console will be enabled
-(provided its extension is also enabled).
-
-.. setting:: WEBCONSOLE_LOGFILE
-
-WEBCONSOLE_LOGFILE
-------------------
-
-Default: ``None``
-
-A file to use for logging HTTP requests made to the web console. If unset web
-the log is sent to standard scrapy log.
-
-.. setting:: WEBCONSOLE_PORT
-
-WEBCONSOLE_PORT
----------------
-
-Default: ``6080``
-
-The port to use for the web console. If set to ``None`` or ``0``, a dynamically
-assigned port is used. For more info see :ref:`topics-webconsole`.
-
@@ -1,142 +0,0 @@ (entire file removed; former contents follow)
.. _topics-webconsole:

===========
Web Console
===========

Scrapy comes with a built-in web server for monitoring and controlling a Scrapy
running process.

The web console is :ref:`built-in Scrapy extension
<topics-extensions-ref>` which comes enabled by default, but you can also
disable it if you're running tight on memory.

For more information about this extension see
:ref:`topics-extensions-ref-webconsole`.

Writing a web console extension
===============================

Writing a web console extension is similar to writing any other :ref:`Scrapy
extensions <topics-extensions>` except that the extension class must:

1. catch the ``scrapy.management.web.webconsole_discover_module`` signal, and
   return itself in the handler.

2. have the following two attributes:

.. attribute:: webconsole_id

   The id by which the Scrapy web interface will known this extension, and
   also the main dir under which this extension interface will work. For
   example, assuming Scrapy web server is listening on
   http://localhost:8000/ and the ``webconsole_id='extension1'`` the web
   main page for the interface of that extension will be:

       http://localhost:8000/extension1/

.. attribute:: webconsole_name

   The name by which the Scrapy web server will know that extension. That name
   will be displayed in the main web console index, as the text that links to
   the extension main page.

3. implement the following method:

.. method:: webconsole_render(wc_request)

   ``wc_request`` is a `twisted.web.http.Request`_ object with the HTTP request
   sent to the web console.

.. _twisted.web.http.Request: http://python.net/crew/mwh/apidocs/twisted.web.http.Request.html

   It must return a str with the web page to render, typically containing HTML
   code.

Example web console extension
=============================

Here's an example of a simple web console extension that just displays a "Hello
world!" text::

    from scrapy.xlib.pydispatch import dispatcher
    from scrapy.management.web import webconsole_discover_module

    class HelloWorldConsole(object):
        webconsole_id = 'helloworld'
        webconsole_name = 'Hello world'

        def __init__(self):
            dispatcher.connect(self.webconsole_discover_module, signal=webconsole_discover_module)

        def webconsole_discover_module(self):
            return self

        def webconsole_render(self, wc_request):
            return "<html><head></head><body><h1>Hello world!</h1></body>"

If you start Scrapy with the web console enabled on http://localhost:8000/ and
you access the URL:

    http://localhost:8000/helloworld/

You will see a page containing a big "Hello World!" text.

.. _topics-webconsole-extensions-ref:

Available Web console extensions
--------------------------------

.. module:: scrapy.contrib.webconsole
   :synopsis: Contains most built-in web console extensions

Here is a list of built-in web console extensions.

Scheduler queue extension
~~~~~~~~~~~~~~~~~~~~~~~~~

.. module:: scrapy.contrib.webconsole.scheduler
   :synopsis: Scheduler queue web console extension

.. class:: scrapy.contrib.webconsole.scheduler.SchedulerQueue

Display a list of all pending Requests in the Scheduler queue, grouped by
domain/spider.

Spider live stats extension
~~~~~~~~~~~~~~~~~~~~~~~~~~~

.. module:: scrapy.contrib.webconsole.livestats
   :synopsis: Spider live stats web console extension

.. class:: scrapy.contrib.webconsole.livestats.LiveStats

Display a table with stats of all spider crawled by the current Scrapy run,
including:

* Number of items scraped
* Number of pages crawled
* Number of pending requests in the scheduler
* Number of pending requests in the downloader queue
* Number of requests currently being downloaded

Engine status extension
~~~~~~~~~~~~~~~~~~~~~~~

.. module:: scrapy.contrib.webconsole.enginestatus
   :synopsis: Engine status web console extension

.. class:: scrapy.contrib.webconsole.enginestatus.EngineStatus

Display the current status of the Scrapy Engine, which is just the output of
the Scrapy engine ``getstatus()`` method.

Stats collector dump extension
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

.. module:: scrapy.contrib.webconsole.stats
   :synopsis: Stats dump web console extension

.. class:: scrapy.contrib.webconsole.stats.StatsDump

Display the stats collected so far by the stats collector.
docs/topics/webservice.rst (new file, 236 lines)
@@ -0,0 +1,236 @@
.. _topics-webservice:

===========
Web Service
===========

Scrapy comes with a built-in web service for monitoring and controlling a
running crawler. The service exposes most resources using the `JSON-RPC 2.0`_
protocol, but there are also other (read-only) resources which just output JSON
data.

It provides an extensible web service for managing a Scrapy process. It's
enabled by the :setting:`WEBSERVICE_ENABLED` setting. The web server will
listen on the port specified in :setting:`WEBSERVICE_PORT`, and will log to the
file specified in :setting:`WEBSERVICE_LOGFILE`.

The web service is a :ref:`built-in Scrapy extension <topics-extensions-ref>`
which comes enabled by default, but you can also disable it if you're running
tight on memory.

.. _topics-webservice-resources:

Web service resources
=====================

The web service contains several resources, defined in the
:setting:`WEBSERVICE_RESOURCES` setting. Each resource provides a different
functionality. See :ref:`topics-webservice-resources-ref` for a list of
resources available by default.

Although you can implement your own resources using any protocol, there are
two kinds of resources bundled with Scrapy:

* Simple JSON resources - which are read-only and just output JSON data
* JSON-RPC resources - which provide direct access to certain Scrapy objects
  using the `JSON-RPC 2.0`_ protocol

.. module:: scrapy.contrib.webservice
   :synopsis: Built-in web service resources

.. _topics-webservice-resources-ref:

Available JSON-RPC resources
----------------------------

These are the JSON-RPC resources available by default in Scrapy:

Execution Manager JSON-RPC resource
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

.. module:: scrapy.contrib.webservice.manager
   :synopsis: Execution Manager JSON-RPC resource

.. class:: ManagerResource

Provides access to the Execution Manager that controls the crawler.

Available by default at: http://localhost:6080/manager

Stats Collector JSON-RPC resource
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

.. module:: scrapy.contrib.webservice.stats
   :synopsis: Stats JSON-RPC resource

.. class:: StatsResource

Provides access to the Stats Collector used by the crawler.

Available by default at: http://localhost:6080/stats

Spider Manager JSON-RPC resource
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

.. module:: scrapy.contrib.webservice.spiders
   :synopsis: Spider Manager JSON-RPC resource

.. class:: SpidersResource

Provides access to the Spider Manager used by the crawler.

Available by default at: http://localhost:6080/spiders

Extension Manager JSON-RPC resource
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

.. module:: scrapy.contrib.webservice.extensions
   :synopsis: Extension Manager JSON-RPC resource

.. class:: ExtensionsResource

Provides access to the Extension Manager used by the crawler.

Available by default at: http://localhost:6080/extensions

Available JSON resources
------------------------

These are the JSON resources available by default:

Engine Status JSON resource
~~~~~~~~~~~~~~~~~~~~~~~~~~~

.. module:: scrapy.contrib.webservice.enginestatus
   :synopsis: Engine Status JSON resource

.. class:: EngineStatusResource

Provides access to the status of the crawling engine.

Available by default at: http://localhost:6080/enginestatus

Web service settings
====================

These are the settings that control the web service behaviour:

.. setting:: WEBSERVICE_ENABLED

WEBSERVICE_ENABLED
------------------

Default: ``True``

A boolean which specifies if the web service will be enabled (provided its
extension is also enabled).

.. setting:: WEBSERVICE_LOGFILE

WEBSERVICE_LOGFILE
------------------

Default: ``None``

A file to use for logging HTTP requests made to the web service. If unset, the
log is sent to the standard Scrapy log.

.. setting:: WEBSERVICE_PORT

WEBSERVICE_PORT
---------------

Default: ``6080``

The port to use for the web service. If set to ``None`` or ``0``, a dynamically
assigned port is used.

WEBSERVICE_RESOURCES
--------------------

Default: ``{}``

The list of web service resources enabled for your project. See
:ref:`topics-webservice-resources`. These are added to the ones available by
default in Scrapy, defined in the :setting:`WEBSERVICE_RESOURCES_BASE` setting.

WEBSERVICE_RESOURCES_BASE
-------------------------

Default::

    {
        'scrapy.contrib.webservice.manager.ManagerResource': 1,
        'scrapy.contrib.webservice.enginestatus.EngineStatusResource': 1,
        'scrapy.contrib.webservice.extensions.ExtensionsResource': 1,
        'scrapy.contrib.webservice.spiders.SpidersResource': 1,
        'scrapy.contrib.webservice.stats.StatsResource': 1,
    }

The list of web service resources available by default in Scrapy. You shouldn't
change this setting in your project; change :setting:`WEBSERVICE_RESOURCES`
instead. If you want to disable some resource, set its value to ``None`` in
:setting:`WEBSERVICE_RESOURCES`.

Writing a web service resource
==============================

Web service resources are implemented using the Twisted Web API. See this
`Twisted Web guide`_ for more information on Twisted web and Twisted web
resources.

To write a web service resource you should subclass the :class:`JsonResource` or
:class:`JsonRpcResource` classes and implement the :meth:`render_GET` method.

.. class:: scrapy.webservice.JsonResource

   A subclass of `twisted.web.resource.Resource`_ that implements a JSON web
   service resource.

   .. attribute:: ws_name

      The name by which the Scrapy web service will know this resource, and
      also the path where this resource will listen. For example, assuming the
      Scrapy web service is listening on http://localhost:6080/ and the
      ``ws_name`` is ``'resource1'``, the URL for that resource will be:

          http://localhost:6080/resource1/

.. class:: scrapy.webservice.JsonRpcResource(target=None)

   This is a subclass of :class:`JsonResource` for implementing JSON-RPC
   resources. JSON-RPC resources wrap Python (Scrapy) objects around a
   JSON-RPC API. The resource wrapped must be returned by the
   :meth:`get_target` method, which returns the target passed in the
   constructor by default.

   .. method:: get_target()

      Return the object wrapped by this JSON-RPC resource. By default, it
      returns the object passed on the constructor.

Examples of web service resources
=================================

StatsResource (JSON-RPC resource)
---------------------------------

.. literalinclude:: ../../scrapy/contrib/webservice/stats.py

EngineStatusResource (JSON resource)
------------------------------------

.. literalinclude:: ../../scrapy/contrib/webservice/enginestatus.py

Example of web service client
=============================

scrapy-ws.py script
-------------------

.. literalinclude:: ../../bin/scrapy-ws.py

.. _Twisted Web guide: http://jcalderone.livejournal.com/50562.html
.. _JSON-RPC 2.0: http://www.jsonrpc.org/
.. _twisted.web.resource.Resource: http://twistedmatrix.com/documents/10.0.0/api/twisted.web.resource.Resource.html
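As a complement to the reference above, here is a minimal sketch of a custom read-only JSON resource (illustrative only, not part of this commit; the resource name and the value it exposes are assumptions):

    from scrapy.webservice import JsonResource
    from scrapy.conf import settings

    class BotNameResource(JsonResource):
        """Hypothetical resource that exposes the bot name as JSON."""

        ws_name = 'botname'   # would be served at http://localhost:6080/botname

        def render_GET(self, txrequest):
            # JsonResource.render() JSON-encodes whatever this returns
            return {'bot_name': settings['BOT_NAME']}

To enable such a resource, its dotted path would be added to :setting:`WEBSERVICE_RESOURCES` in the project settings.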
@@ -109,13 +109,8 @@ EXTENSIONS = {}

 EXTENSIONS_BASE = {
     'scrapy.contrib.corestats.CoreStats': 0,
-    'scrapy.management.web.WebConsole': 0,
+    'scrapy.webservice.WebService': 0,
     'scrapy.telnet.TelnetConsole': 0,
-    'scrapy.contrib.webconsole.scheduler.SchedulerQueue': 0,
-    'scrapy.contrib.webconsole.livestats.LiveStats': 0,
-    'scrapy.contrib.webconsole.spiderctl.Spiderctl': 0,
-    'scrapy.contrib.webconsole.enginestatus.EngineStatus': 0,
-    'scrapy.contrib.webconsole.stats.StatsDump': 0,
     'scrapy.contrib.memusage.MemoryUsage': 0,
     'scrapy.contrib.memdebug.MemoryDebugger': 0,
     'scrapy.contrib.closespider.CloseSpider': 0,
@@ -232,3 +227,14 @@ WEBCONSOLE_ENABLED = True
 WEBCONSOLE_PORT = 6080
 WEBCONSOLE_LOGFILE = None

+WEBSERVICE_ENABLED = True
+WEBSERVICE_LOGFILE = None
+WEBSERVICE_PORT = 6080
+WEBSERVICE_RESOURCES = {}
+WEBSERVICE_RESOURCES_BASE = {
+    'scrapy.contrib.webservice.manager.ManagerResource': 1,
+    'scrapy.contrib.webservice.enginestatus.EngineStatusResource': 1,
+    'scrapy.contrib.webservice.extensions.ExtensionsResource': 1,
+    'scrapy.contrib.webservice.spiders.SpidersResource': 1,
+    'scrapy.contrib.webservice.stats.StatsResource': 1,
+}
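For illustration (not part of this commit), a project could enable an extra resource on top of these defaults from its own settings module; the dotted path below is hypothetical:

    # settings.py of a hypothetical project: add a custom resource on top of
    # the defaults defined in WEBSERVICE_RESOURCES_BASE.
    WEBSERVICE_RESOURCES = {
        'myproject.webservice.BotNameResource': 1,
    }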
@@ -0,0 +1,3 @@
import warnings
warnings.warn("Web console is deprecated. Consider using web service instead.", \
    DeprecationWarning, stacklevel=2)

scrapy/contrib/webservice/__init__.py (new empty file, 0 lines)

scrapy/contrib/webservice/enginestatus.py (new file, 24 lines)
@@ -0,0 +1,24 @@
from scrapy.webservice import JsonResource
from scrapy.core.manager import scrapymanager
from scrapy.utils.engine import get_engine_status

class EngineStatusResource(JsonResource):

    ws_name = 'enginestatus'

    def __init__(self, spider_name=None, _manager=scrapymanager):
        JsonResource.__init__(self)
        self._spider_name = spider_name
        self.isLeaf = spider_name is not None
        self._manager = _manager

    def render_GET(self, txrequest):
        status = get_engine_status(self._manager.engine)
        if self._spider_name is None:
            return status
        for sp, st in status['spiders'].items():
            if sp.name == self._spider_name:
                return st

    def getChild(self, name, txrequest):
        return EngineStatusResource(name, self._manager)

scrapy/contrib/webservice/extensions.py (new file, 10 lines)
@@ -0,0 +1,10 @@
from scrapy.webservice import JsonRpcResource
from scrapy.extension import extensions

class ExtensionsResource(JsonRpcResource):

    ws_name = 'extensions'

    def __init__(self, _extensions=extensions):
        JsonRpcResource.__init__(self)
        self._target = _extensions

scrapy/contrib/webservice/manager.py (new file, 10 lines)
@@ -0,0 +1,10 @@
from scrapy.webservice import JsonRpcResource
from scrapy.core.manager import scrapymanager

class ManagerResource(JsonRpcResource):

    ws_name = 'manager'

    def __init__(self, _manager=scrapymanager):
        JsonRpcResource.__init__(self)
        self._target = _manager

scrapy/contrib/webservice/spiders.py (new file, 10 lines)
@@ -0,0 +1,10 @@
from scrapy.webservice import JsonRpcResource
from scrapy.spider import spiders

class SpidersResource(JsonRpcResource):

    ws_name = 'spiders'

    def __init__(self, _spiders=spiders):
        JsonRpcResource.__init__(self)
        self._target = _spiders

scrapy/contrib/webservice/stats.py (new file, 10 lines)
@@ -0,0 +1,10 @@
from scrapy.webservice import JsonRpcResource
from scrapy.stats import stats

class StatsResource(JsonRpcResource):

    ws_name = 'stats'

    def __init__(self, _stats=stats):
        JsonRpcResource.__init__(self)
        self._target = _stats
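These resources are thin wrappers: the JSON-RPC URL path maps onto attributes of the wrapped object through ``JsonRpcResource.getChild``. A small sketch of the resulting client-side usage (assuming a locally running crawler with the web service on the default port):

    from scrapy.utils.jsonrpc import jsonrpc_client_call

    # 'extensions/enabled' resolves to the 'enabled' dict of the Extension
    # Manager, so calling its 'keys' method lists the enabled extensions
    # (this mirrors the 'list-extensions' command of bin/scrapy-ws.py).
    for name in jsonrpc_client_call('http://localhost:6080/extensions/enabled', 'keys'):
        print name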
@@ -1,8 +1,6 @@
-"""
-Scrapy Web Console extension
-
-See docs/topics/webconsole.rst
-"""
+import warnings
+warnings.warn("Scrapy web console is deprecated. Consider using web service instead.", \
+    DeprecationWarning, stacklevel=2)

 import re
 import socket
@@ -77,6 +77,6 @@ class BaseSpider(object_ref):
         raise NotImplementedError

     def __str__(self):
-        return "<%s %r>" % (type(self).__name__, self.name)
+        return "<%s %r at 0x%0x>" % (type(self).__name__, self.name, id(self))

     __repr__ = __str__
scrapy/tests/test_utils_jsonrpc.py (new file, 112 lines)
@@ -0,0 +1,112 @@
import unittest
from cStringIO import StringIO

from scrapy.utils.jsonrpc import jsonrpc_client_call, jsonrpc_server_call, \
    JsonRpcError, jsonrpc_errors
from scrapy.utils.py26 import json

class urllib_stub(object):
    def __init__(self, result=None, error=None):
        response = {}
        if result:
            response.update(result=result)
        if error:
            response.update(error=error)
        self.response = json.dumps(response)
        self.request = None

    def urlopen(self, url, request):
        self.url = url
        self.request = request
        return StringIO(self.response)

class TestTarget(object):

    def call(self, *args, **kwargs):
        return list(args), kwargs

    def exception(self):
        raise Exception("testing-errors")

class JsonRpcUtilsTestCase(unittest.TestCase):

    def test_jsonrpc_client_call_request(self):
        ul = urllib_stub(1)
        jsonrpc_client_call('url', 'test', 'one', 2, _urllib=ul)
        req = json.loads(ul.request)
        assert 'id' in req
        self.assertEqual(ul.url, 'url')
        self.assertEqual(req['jsonrpc'], '2.0')
        self.assertEqual(req['method'], 'test')
        self.assertEqual(req['params'], ['one', 2])

    def test_jsonrpc_client_call_response(self):
        ul = urllib_stub()
        # must return result or error
        self.assertRaises(ValueError, jsonrpc_client_call, 'url', 'test', _urllib=ul)
        ul = urllib_stub(result={'one': 1})
        self.assertEquals(jsonrpc_client_call('url', 'test', _urllib=ul), {'one': 1})
        ul = urllib_stub(error={'code': 123, 'message': 'hello', 'data': 'some data'})

        raised = False
        try:
            jsonrpc_client_call('url', 'test', _urllib=ul)
        except JsonRpcError, e:
            raised = True
            self.assertEqual(e.code, 123)
            self.assertEqual(e.message, 'hello')
            self.assertEqual(e.data, 'some data')
            assert '123' in str(e)
            assert 'hello' in str(e)
        assert raised, "JsonRpcError not raised"

    def test_jsonrpc_server_call(self):
        t = TestTarget()
        r = jsonrpc_server_call(t, 'invalid json data')
        assert 'error' in r
        assert r['jsonrpc'] == '2.0'
        assert r['id'] is None
        self.assertEqual(r['error']['code'], jsonrpc_errors.PARSE_ERROR)
        assert 'Traceback' in r['error']['data']

        r = jsonrpc_server_call(t, '{"test": "test"}')
        assert 'error' in r
        assert r['jsonrpc'] == '2.0'
        assert r['id'] is None
        self.assertEqual(r['error']['code'], jsonrpc_errors.INVALID_REQUEST)

        r = jsonrpc_server_call(t, '{"method": "notfound", "id": 1}')
        assert 'error' in r
        assert r['jsonrpc'] == '2.0'
        assert r['id'] == 1
        self.assertEqual(r['error']['code'], jsonrpc_errors.METHOD_NOT_FOUND)

        r = jsonrpc_server_call(t, '{"method": "exception", "id": 1}')
        assert 'error' in r
        assert r['jsonrpc'] == '2.0'
        assert r['id'] == 1
        self.assertEqual(r['error']['code'], jsonrpc_errors.INTERNAL_ERROR)
        assert 'testing-errors' in r['error']['message']
        assert 'Traceback' in r['error']['data']

        r = jsonrpc_server_call(t, '{"method": "call", "id": 2}')
        assert 'result' in r
        assert r['jsonrpc'] == '2.0'
        assert r['id'] == 2
        self.assertEqual(r['result'], ([], {}))

        r = jsonrpc_server_call(t, '{"method": "call", "params": [456, 123], "id": 3}')
        assert 'result' in r
        assert r['jsonrpc'] == '2.0'
        assert r['id'] == 3
        self.assertEqual(r['result'], ([456, 123], {}))

        r = jsonrpc_server_call(t, '{"method": "call", "params": {"data": 789}, "id": 3}')
        assert 'result' in r
        assert r['jsonrpc'] == '2.0'
        assert r['id'] == 3
        self.assertEqual(r['result'], ([], {'data': 789}))

if __name__ == "__main__":
    unittest.main()
scrapy/tests/test_utils_serialize.py (new file, 125 lines)
@@ -0,0 +1,125 @@
import unittest
import datetime
from decimal import Decimal

from scrapy.utils.serialize import SpiderReferencer, ScrapyJSONEncoder, ScrapyJSONDecoder
from scrapy.utils.py26 import json
from scrapy.spider import BaseSpider
from scrapy.http import Request, Response


class ExecutionEngineStub(object):
    def __init__(self, open_spiders):
        self.open_spiders = open_spiders

class ExecutionMangerStub(object):
    def __init__(self, open_spiders):
        self.engine = ExecutionEngineStub(open_spiders)

class BaseTestCase(unittest.TestCase):

    def setUp(self):
        self.spider1 = BaseSpider('name1')
        self.spider2 = BaseSpider('name2')
        open_spiders = set([self.spider1, self.spider2])
        manager = ExecutionMangerStub(open_spiders)
        self.spref = SpiderReferencer(manager)
        self.encoder = ScrapyJSONEncoder(spref=self.spref)
        self.decoder = ScrapyJSONDecoder(spref=self.spref)

class SpiderReferencerTestCase(BaseTestCase):

    def test_spiders_and_references(self):
        ref1 = self.spref.get_reference_from_spider(self.spider1)
        assert isinstance(ref1, str)
        assert self.spider1.name in ref1
        ref2 = self.spref.get_reference_from_spider(self.spider2)
        ref1_ = self.spref.get_reference_from_spider(self.spider1)
        assert ref1 == ref1_
        assert ref1 != ref2

        sp1 = self.spref.get_spider_from_reference(ref1)
        sp2 = self.spref.get_spider_from_reference(ref2)
        sp1_ = self.spref.get_spider_from_reference(ref1)
        assert isinstance(sp1, BaseSpider)
        assert sp1 is not sp2
        assert sp1 is sp1_

        # must return string as-is if spider id not found
        assert 'lala' == self.spref.get_spider_from_reference('lala')
        # must raise RuntimeError if spider id is not found and spider is not running
        self.assertRaises(RuntimeError, self.spref.get_spider_from_reference, 'spider:fffffff')

    def test_encode_decode(self):
        sr = self.spref
        sp1 = self.spider1
        sp2 = self.spider2
        ref1 = sr.get_reference_from_spider(sp1)
        ref2 = sr.get_reference_from_spider(sp2)

        examples = [
            ('lala', 'lala'),
            (sp1, ref1),
            (['lala', sp1], ['lala', ref1]),
            ({'lala': sp1}, {'lala': ref1}),
            ({sp1: sp2}, {ref1: ref2}),
            ({sp1: {sp2: ['lala', sp1]}}, {ref1: {ref2: ['lala', ref1]}})
        ]
        for spiders, refs in examples:
            self.assertEqual(sr.encode_references(spiders), refs)
            self.assertEqual(sr.decode_references(refs), spiders)

class JsonEncoderTestCase(BaseTestCase):

    def test_encode_decode(self):
        sr = self.spref
        sp1 = self.spider1
        sp2 = self.spider2
        ref1 = sr.get_reference_from_spider(sp1)
        ref2 = sr.get_reference_from_spider(sp2)
        dt = datetime.datetime(2010, 1, 2, 10, 11, 12)
        dts = "2010-01-02 10:11:12"
        d = datetime.date(2010, 1, 2)
        ds = "2010-01-02"
        t = datetime.time(10, 11, 12)
        ts = "10:11:12"
        dec = Decimal("1000.12")
        decs = "1000.12"

        examples_encode_decode = [
            ('lala', 'lala'),
            (sp1, ref1),
            (['lala', sp1], ['lala', ref1]),
            ({'lala': sp1}, {'lala': ref1}),
            ({sp1: sp2}, {ref1: ref2}),
            ({sp1: {sp2: ['lala', sp1]}}, {ref1: {ref2: ['lala', ref1]}})
        ]
        for spiders, refs in examples_encode_decode:
            self.assertEqual(self.encoder.encode(spiders), json.dumps(refs))
            self.assertEqual(self.decoder.decode(json.dumps(refs)), spiders)

        examples_encode_only = [
            ({sp1: dt}, {ref1: dts}),
            ({sp1: d}, {ref1: ds}),
            ({sp1: t}, {ref1: ts}),
            ({sp1: dec}, {ref1: decs}),
        ]
        for spiders, refs in examples_encode_only:
            self.assertEqual(self.encoder.encode(spiders), json.dumps(refs))

    def test_encode_request(self):
        r = Request("http://www.example.com/lala")
        rs = self.encoder.encode(r)
        assert r.method in rs
        assert r.url in rs

    def test_encode_response(self):
        r = Response("http://www.example.com/lala")
        rs = self.encoder.encode(r)
        assert r.url in rs
        assert str(r.status) in rs


if __name__ == "__main__":
    unittest.main()
@@ -37,23 +37,34 @@ def get_engine_status(engine=None):
         "engine.scraper.sites[spider].needs_backout()",
     ]

-    s = "Execution engine status\n\n"
+    status = {'global': {}, 'spiders': {}}
     for test in global_tests:
         try:
-            s += "%-47s : %s\n" % (test, eval(test))
+            status['global'][test] = eval(test)
         except Exception, e:
-            s += "%-47s : %s (exception)\n" % (test, type(e).__name__)
-    s += "\n"
+            status['global'][test] = "%s (exception)" % type(e).__name__
     for spider in engine.downloader.sites:
-        s += "Spider: %s\n" % spider
+        x = {}
         for test in spider_tests:
             try:
-                s += "  %-50s : %s\n" % (test, eval(test))
+                x[test] = eval(test)
             except Exception, e:
-                s += "  %-50s : %s (exception)\n" % (test, type(e).__name__)
+                x[test] = "%s (exception)" % type(e).__name__
+        status['spiders'][spider] = x
+    return status
+
+def format_engine_status(engine=None):
+    status = get_engine_status(engine)
+    s = "Execution engine status\n\n"
+    for test, result in status['global'].items():
+        s += "%-47s : %s\n" % (test, result)
+    s += "\n"
+    for spider, tests in status['spiders'].items():
+        s += "Spider: %s\n" % spider
+        for test, result in tests.items():
+            s += "  %-50s : %s\n" % (test, result)
     return s

 def print_engine_status(engine=None):
-    print get_engine_status(engine)
+    print format_engine_status(engine)

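In short, status collection is now separated from text formatting. A small sketch of the resulting usage (not part of this commit; it assumes a crawl is in progress so the global manager's engine is populated):

    from scrapy.core.manager import scrapymanager
    from scrapy.utils.engine import get_engine_status, format_engine_status

    status = get_engine_status(scrapymanager.engine)    # {'global': {...}, 'spiders': {...}}
    print format_engine_status(scrapymanager.engine)    # the same data rendered as text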
scrapy/utils/jsonrpc.py (new file, 94 lines)
@@ -0,0 +1,94 @@
"""
This module implements the JSON-RPC 2.0 protocol, as defined in:
http://groups.google.com/group/json-rpc/web/json-rpc-2-0
"""

import urllib
import traceback

from scrapy.utils.py26 import json
from scrapy.utils.serialize import ScrapyJSONDecoder

# JSON-RPC 2.0 errors, as defined in the spec referenced above
class jsonrpc_errors:
    PARSE_ERROR = -32700
    INVALID_REQUEST = -32600
    METHOD_NOT_FOUND = -32601
    INVALID_PARAMS = -32602
    INTERNAL_ERROR = -32603

class JsonRpcError(Exception):

    def __init__(self, code, message, data=None):
        super(JsonRpcError, self).__init__()
        self.code = code
        self.message = message
        self.data = data

    def __str__(self):
        return "JSON-RPC error (code %d): %s" % (self.code, self.message)

def jsonrpc_client_call(url, method, *args, **kwargs):
    """Execute a JSON-RPC call on the given url"""
    _urllib = kwargs.pop('_urllib', urllib)
    req = {'jsonrpc': '2.0', 'method': method, 'params': args or kwargs, 'id': 1}
    res = json.loads(_urllib.urlopen(url, json.dumps(req)).read())
    if 'result' in res:
        return res['result']
    elif 'error' in res:
        er = res['error']
        raise JsonRpcError(er['code'], er['message'], er['data'])
    else:
        msg = "JSON-RPC response must contain 'result' or 'error': %s" % res
        raise ValueError(msg)

def jsonrpc_server_call(target, jsonrpc_request, json_decoder=None):
    """Execute the given JSON-RPC request (as JSON-encoded string) on the given
    target object and return the JSON-RPC response, as a dict
    """
    if json_decoder is None:
        json_decoder = ScrapyJSONDecoder()

    try:
        req = json_decoder.decode(jsonrpc_request)
    except Exception, e:
        return jsonrpc_error(None, jsonrpc_errors.PARSE_ERROR, 'Parse error', \
            traceback.format_exc())

    try:
        id, methname = req['id'], req['method']
    except KeyError:
        return jsonrpc_error(None, jsonrpc_errors.INVALID_REQUEST, 'Invalid Request')

    try:
        method = getattr(target, methname)
    except AttributeError:
        return jsonrpc_error(id, jsonrpc_errors.METHOD_NOT_FOUND, 'Method not found')

    params = req.get('params', [])
    a, kw = ([], params) if isinstance(params, dict) else (params, {})
    try:
        return jsonrpc_result(id, method(*a, **kw))
    except Exception, e:
        return jsonrpc_error(id, jsonrpc_errors.INTERNAL_ERROR, str(e), \
            traceback.format_exc())

def jsonrpc_error(id, code, message, data=None):
    """Create JSON-RPC error response"""
    return {
        'jsonrpc': '2.0',
        'error': {
            'code': code,
            'message': message,
            'data': data,
        },
        'id': id,
    }

def jsonrpc_result(id, result):
    """Create JSON-RPC result response"""
    return {
        'jsonrpc': '2.0',
        'result': result,
        'id': id,
    }
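A minimal sketch of the server-side helper in isolation (illustrative, not part of this commit; the target class below is an arbitrary Python object, not a Scrapy one):

    from scrapy.utils.jsonrpc import jsonrpc_server_call

    class Adder(object):        # hypothetical target object
        def add(self, a, b):
            return a + b

    # dispatch the JSON-RPC request string against the target object
    response = jsonrpc_server_call(Adder(), '{"method": "add", "params": [2, 3], "id": 1}')
    # response == {'jsonrpc': '2.0', 'result': 5, 'id': 1}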
scrapy/utils/serialize.py (new file, 115 lines)
@@ -0,0 +1,115 @@
import re
import datetime
import decimal

from scrapy.core.manager import scrapymanager
from scrapy.spider import BaseSpider
from scrapy.http import Request, Response
from scrapy.utils.py26 import json


class SpiderReferencer(object):
    """Class to serialize (and deserialize) objects (typically dicts)
    containing references to running spiders (ie. Spider objects). This is
    required because simplejson fails to serialize dicts containing
    non-primitive types as keys, even when you override
    ScrapyJSONEncoder.default() with a custom encoding mechanism.
    """

    spider_ref_re = re.compile('^spider:([0-9a-f]+)(:.*)?$')

    def __init__(self, manager=None):
        self.manager = manager or scrapymanager

    def get_reference_from_spider(self, spider):
        return 'spider:%x:%s' % (id(spider), spider.name)

    def get_spider_from_reference(self, ref):
        """Returns the Spider referenced by text, if text is a spider
        reference. Otherwise it returns the text itself. If the text references
        a non-running spider it raises a RuntimeError.
        """
        m = self.spider_ref_re.search(ref)
        if m:
            spid = int(m.group(1), 16)
            for spider in self.manager.engine.open_spiders:
                if id(spider) == spid:
                    return spider
            raise RuntimeError("Spider not running: %s" % ref)
        return ref

    def encode_references(self, obj):
        """Look for Spider objects and replace them with spider references"""
        if isinstance(obj, BaseSpider):
            return self.get_reference_from_spider(obj)
        elif isinstance(obj, dict):
            d = {}
            for k, v in obj.items():
                k = self.encode_references(k)
                v = self.encode_references(v)
                d[k] = v
            return d
        elif isinstance(obj, (list, tuple)):
            return [self.encode_references(x) for x in obj]
        else:
            return obj

    def decode_references(self, obj):
        """Look for spider references and replace them with Spider objects"""
        if isinstance(obj, basestring):
            return self.get_spider_from_reference(obj)
        elif isinstance(obj, dict):
            d = {}
            for k, v in obj.items():
                k = self.decode_references(k)
                v = self.decode_references(v)
                d[k] = v
            return d
        elif isinstance(obj, (list, tuple)):
            return [self.decode_references(x) for x in obj]
        else:
            return obj


class ScrapyJSONEncoder(json.JSONEncoder):

    DATE_FORMAT = "%Y-%m-%d"
    TIME_FORMAT = "%H:%M:%S"

    def __init__(self, *a, **kw):
        self.spref = kw.pop('spref', None) or SpiderReferencer()
        super(ScrapyJSONEncoder, self).__init__(*a, **kw)

    def encode(self, o):
        if self.spref:
            o = self.spref.encode_references(o)
        return super(ScrapyJSONEncoder, self).encode(o)

    def default(self, o):
        if isinstance(o, datetime.datetime):
            return o.strftime("%s %s" % (self.DATE_FORMAT, self.TIME_FORMAT))
        elif isinstance(o, datetime.date):
            return o.strftime(self.DATE_FORMAT)
        elif isinstance(o, datetime.time):
            return o.strftime(self.TIME_FORMAT)
        elif isinstance(o, decimal.Decimal):
            return str(o)
        elif isinstance(o, Request):
            return "<%s %s %s>" % (type(o).__name__, o.method, o.url)
        elif isinstance(o, Response):
            return "<%s %s %s>" % (type(o).__name__, o.status, o.url)
        else:
            return super(ScrapyJSONEncoder, self).default(o)


class ScrapyJSONDecoder(json.JSONDecoder):

    def __init__(self, *a, **kw):
        self.spref = kw.pop('spref', None) or SpiderReferencer()
        super(ScrapyJSONDecoder, self).__init__(*a, **kw)

    def decode(self, s):
        o = super(ScrapyJSONDecoder, self).decode(s)
        if self.spref:
            o = self.spref.decode_references(o)
        return o
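A short sketch of what the encoder adds on top of the stock JSON encoder (illustrative, not part of this commit): dates, times, decimals, requests and responses become strings.

    import datetime
    from scrapy.http import Request
    from scrapy.utils.serialize import ScrapyJSONEncoder

    encoder = ScrapyJSONEncoder()
    print encoder.encode({'when': datetime.date(2010, 1, 2)})
    # {"when": "2010-01-02"}
    print encoder.encode(Request('http://www.example.com/'))
    # "<Request GET http://www.example.com/>"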
scrapy/webservice.py (new file, 86 lines)
@@ -0,0 +1,86 @@
"""
Scrapy web services extension

See docs/topics/webservice.rst
"""

from twisted.internet import reactor
from twisted.web import server, resource, error

from scrapy.core.exceptions import NotConfigured
from scrapy.utils.jsonrpc import jsonrpc_server_call
from scrapy.utils.serialize import ScrapyJSONEncoder, ScrapyJSONDecoder
from scrapy.utils.misc import load_object
from scrapy.utils.conf import build_component_list
from scrapy.conf import settings


class JsonResource(resource.Resource):

    ws_name = None
    json_encoder = ScrapyJSONEncoder()

    def render(self, txrequest):
        r = resource.Resource.render(self, txrequest)
        r = self.json_encoder.encode(r)
        txrequest.setHeader('Content-Type', 'application/json')
        txrequest.setHeader('Content-Length', len(r))
        return r


class JsonRpcResource(JsonResource):

    json_decoder = ScrapyJSONDecoder()

    def __init__(self, target=None):
        JsonResource.__init__(self)
        self._target = target

    def render_GET(self, txrequest):
        return self.get_target()

    def render_POST(self, txrequest):
        reqstr = txrequest.content.read()
        target = self.get_target()
        return jsonrpc_server_call(target, reqstr, self.json_decoder)

    def getChild(self, name, txrequest):
        target = self.get_target()
        try:
            newtarget = getattr(target, name)
            return JsonRpcResource(newtarget)
        except AttributeError:
            return error.NoResource("No such child resource.")

    def get_target(self):
        return self._target


class RootResource(JsonResource):

    def render_GET(self, txrequest):
        return {'resources': self.children.keys()}

    def getChild(self, name, txrequest):
        if name == '':
            return self
        return JsonResource.getChild(self, name, txrequest)


class WebService(server.Site):

    def __init__(self):
        if not settings.getbool('WEBSERVICE_ENABLED'):
            raise NotConfigured
        logfile = settings['WEBSERVICE_LOGFILE']
        port = settings.getint('WEBSERVICE_PORT')
        root = RootResource()
        reslist = build_component_list(settings['WEBSERVICE_RESOURCES_BASE'], \
            settings['WEBSERVICE_RESOURCES'])
        for res_cls in map(load_object, reslist):
            res = res_cls()
            root.putChild(res.ws_name, res)
        server.Site.__init__(self, root, logPath=logfile)
        self.noisy = False
        reactor.callWhenRunning(reactor.listenTCP, port, self)