Merge branch 'jsonrpc-split'
commit 94d00b2a26
@@ -236,9 +236,6 @@ scraping easy and efficient, such as:
* A :ref:`System service <topics-scrapyd>` designed to ease the deployment and
  run of your spiders in production.

* A built-in :ref:`Web service <topics-webservice>` for monitoring and
  controlling your bot.

* A :ref:`Telnet console <topics-telnetconsole>` for hooking into a Python
  console running inside your Scrapy process, to introspect and debug your
  crawler.
@@ -36,7 +36,6 @@ by a string: the full Python path to the extension's class name. For example::

    EXTENSIONS = {
        'scrapy.contrib.corestats.CoreStats': 500,
        'scrapy.webservice.WebService': 500,
        'scrapy.telnet.TelnetConsole': 500,
    }
@@ -178,18 +177,6 @@ Core Stats extension
Enable the collection of core statistics, provided the stats collection is
enabled (see :ref:`topics-stats`).

.. _topics-extensions-ref-webservice:

Web service extension
~~~~~~~~~~~~~~~~~~~~~

.. module:: scrapy.webservice
   :synopsis: Web service

.. class:: scrapy.webservice.WebService

See :ref:`topics-webservice`.

.. _topics-extensions-ref-telnetconsole:

Telnet console extension
@@ -463,7 +463,6 @@ Default::

    {
        'scrapy.contrib.corestats.CoreStats': 0,
        'scrapy.webservice.WebService': 0,
        'scrapy.telnet.TelnetConsole': 0,
        'scrapy.contrib.memusage.MemoryUsage': 0,
        'scrapy.contrib.memdebug.MemoryDebugger': 0,
@@ -4,231 +4,8 @@
Web Service
===========

Scrapy comes with a built-in web service for monitoring and controlling a
running crawler. The service exposes most resources using the `JSON-RPC 2.0`_
protocol, but there are also other (read-only) resources which just output JSON
data.

The webservice has been moved into a separate project.

Provides an extensible web service for managing a Scrapy process. It's enabled
by the :setting:`WEBSERVICE_ENABLED` setting. The web server will listen on the
port specified in :setting:`WEBSERVICE_PORT`, and will log to the file
specified in :setting:`WEBSERVICE_LOGFILE`.

The web service is a :ref:`built-in Scrapy extension <topics-extensions-ref>`
which comes enabled by default, but you can also disable it if you're running
tight on memory.
.. _topics-webservice-resources:

Web service resources
=====================

The web service contains several resources, defined in the
:setting:`WEBSERVICE_RESOURCES` setting. Each resource provides a different
piece of functionality. See :ref:`topics-webservice-resources-ref` for a list
of resources available by default.
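For example, a project can register additional resources through its settings
module; a minimal sketch, where the ``myproject.webservice.MyResource`` path is
hypothetical::

    WEBSERVICE_RESOURCES = {
        'myproject.webservice.MyResource': 100,
    }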
Although you can implement your own resources using any protocol, there are
two kinds of resources bundled with Scrapy:

* Simple JSON resources - which are read-only and just output JSON data
* JSON-RPC resources - which provide direct access to certain Scrapy objects
  using the `JSON-RPC 2.0`_ protocol
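To make the difference concrete, here is a client-side sketch, assuming a
crawler is running with the web service on its default port, and relying on the
wrapped stats collector exposing a ``get_stats`` method::

    import json
    import urllib2  # Python 2, matching the code this release shipped with

    # Simple JSON resource: a plain GET returns JSON data
    status = json.loads(urllib2.urlopen('http://localhost:6080/enginestatus').read())

    # JSON-RPC resource: POST a JSON-RPC 2.0 request object
    req = {'jsonrpc': '2.0', 'method': 'get_stats', 'params': [], 'id': 1}
    res = json.loads(urllib2.urlopen('http://localhost:6080/stats',
                                     json.dumps(req)).read())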
.. module:: scrapy.contrib.webservice
   :synopsis: Built-in web service resources

.. _topics-webservice-resources-ref:

Available JSON-RPC resources
----------------------------

These are the JSON-RPC resources available by default in Scrapy:

.. _topics-webservice-crawler:

Crawler JSON-RPC resource
~~~~~~~~~~~~~~~~~~~~~~~~~

.. module:: scrapy.contrib.webservice.crawler
   :synopsis: Crawler JSON-RPC resource

.. class:: CrawlerResource

   Provides access to the main Crawler object that controls the Scrapy
   process.

   Available by default at: http://localhost:6080/crawler
Stats Collector JSON-RPC resource
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

.. module:: scrapy.contrib.webservice.stats
   :synopsis: Stats JSON-RPC resource

.. class:: StatsResource

   Provides access to the Stats Collector used by the crawler.

   Available by default at: http://localhost:6080/stats

Spider Manager JSON-RPC resource
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

You can access the spider manager JSON-RPC resource through the
:ref:`topics-webservice-crawler` at: http://localhost:6080/crawler/spiders

Extension Manager JSON-RPC resource
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

You can access the extension manager JSON-RPC resource through the
:ref:`topics-webservice-crawler` at: http://localhost:6080/crawler/extensions
Available JSON resources
------------------------

These are the JSON resources available by default:

Engine status JSON resource
~~~~~~~~~~~~~~~~~~~~~~~~~~~

.. module:: scrapy.contrib.webservice.enginestatus
   :synopsis: Engine Status JSON resource

.. class:: EngineStatusResource

   Provides access to engine status metrics.

   Available by default at: http://localhost:6080/enginestatus
Web service settings
====================

These are the settings that control the web service behaviour:

.. setting:: WEBSERVICE_ENABLED

WEBSERVICE_ENABLED
------------------

Default: ``True``

A boolean which specifies if the web service will be enabled (provided its
extension is also enabled).

.. setting:: WEBSERVICE_LOGFILE

WEBSERVICE_LOGFILE
------------------

Default: ``None``

A file to use for logging HTTP requests made to the web service. If unset, the
web service log is sent to the standard Scrapy log.

.. setting:: WEBSERVICE_PORT

WEBSERVICE_PORT
---------------

Default: ``[6080, 7030]``

The port range to use for the web service. If set to ``None`` or ``0``, a
dynamically assigned port is used.

.. setting:: WEBSERVICE_HOST

WEBSERVICE_HOST
---------------

Default: ``'127.0.0.1'``

The interface the web service should listen on.

WEBSERVICE_RESOURCES
--------------------

Default: ``{}``

The list of web service resources enabled for your project. See
:ref:`topics-webservice-resources`. These are added to the ones available by
default in Scrapy, defined in the :setting:`WEBSERVICE_RESOURCES_BASE` setting.

WEBSERVICE_RESOURCES_BASE
-------------------------

Default::

    {
        'scrapy.contrib.webservice.crawler.CrawlerResource': 1,
        'scrapy.contrib.webservice.enginestatus.EngineStatusResource': 1,
        'scrapy.contrib.webservice.stats.StatsResource': 1,
    }

The list of web service resources available by default in Scrapy. You shouldn't
change this setting in your project; change :setting:`WEBSERVICE_RESOURCES`
instead. If you want to disable some resource, set its value to ``None`` in
:setting:`WEBSERVICE_RESOURCES`.
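For instance, a sketch of disabling the bundled stats resource from a project's
settings::

    WEBSERVICE_RESOURCES = {
        'scrapy.contrib.webservice.stats.StatsResource': None,
    }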
Writing a web service resource
==============================

Web service resources are implemented using the Twisted Web API. See this
`Twisted Web guide`_ for more information on Twisted web and Twisted web
resources.

To write a web service resource you should subclass the :class:`JsonResource` or
:class:`JsonRpcResource` classes and implement the ``render_GET`` method.

.. class:: scrapy.webservice.JsonResource

   A subclass of `twisted.web.resource.Resource`_ that implements a JSON web
   service resource.

   .. attribute:: ws_name

      The name by which the Scrapy web service will know this resource, and
      also the path where this resource will listen. For example, assuming the
      Scrapy web service is listening on http://localhost:6080/ and the
      ``ws_name`` is ``'resource1'``, the URL for that resource will be:

          http://localhost:6080/resource1/
.. class:: scrapy.webservice.JsonRpcResource(crawler, target=None)

   This is a subclass of :class:`JsonResource` for implementing JSON-RPC
   resources. JSON-RPC resources wrap Python (Scrapy) objects around a
   JSON-RPC API. The resource wrapped must be returned by the
   :meth:`get_target` method, which returns the target passed in the
   constructor by default.

   .. method:: get_target()

      Return the object wrapped by this JSON-RPC resource. By default, it
      returns the object passed on the constructor.
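Putting those pieces together, here is a minimal sketch of a custom JSON-RPC
resource modeled on the bundled ones; the resource name and the wrapped
``crawler.spiders`` attribute are illustrative::

    from scrapy.webservice import JsonRpcResource

    class SpiderManagerResource(JsonRpcResource):
        """Expose the crawler's spider manager over JSON-RPC."""

        ws_name = 'spidermanager'  # served at http://localhost:6080/spidermanager

        def __init__(self, crawler):
            JsonRpcResource.__init__(self, crawler, crawler.spiders)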
Examples of web service resources
=================================

StatsResource (JSON-RPC resource)
---------------------------------

.. literalinclude:: ../../scrapy/contrib/webservice/stats.py

EngineStatusResource (JSON resource)
------------------------------------

.. literalinclude:: ../../scrapy/contrib/webservice/enginestatus.py

Example of web service client
=============================

scrapy-ws.py script
-------------------

.. literalinclude:: ../../extras/scrapy-ws.py

.. _Twisted Web guide: http://jcalderone.livejournal.com/50562.html
.. _JSON-RPC 2.0: http://www.jsonrpc.org/
.. _twisted.web.resource.Resource: http://twistedmatrix.com/documents/10.0.0/api/twisted.web.resource.Resource.html

It is now hosted at:

https://github.com/scrapy/scrapy-jsonrpc
@@ -1,8 +0,0 @@
from scrapy.webservice import JsonRpcResource


class CrawlerResource(JsonRpcResource):

    ws_name = 'crawler'

    def __init__(self, crawler):
        JsonRpcResource.__init__(self, crawler, crawler)
@@ -1,22 +0,0 @@
from scrapy.webservice import JsonResource
from scrapy.utils.engine import get_engine_status


class EngineStatusResource(JsonResource):

    ws_name = 'enginestatus'

    def __init__(self, crawler, spider_name=None):
        JsonResource.__init__(self, crawler)
        self._spider_name = spider_name
        self.isLeaf = spider_name is not None

    def render_GET(self, txrequest):
        status = get_engine_status(self.crawler.engine)
        if self._spider_name is None:
            return status
        for sp, st in status['spiders'].items():
            if sp.name == self._spider_name:
                return st

    def getChild(self, name, txrequest):
        # the constructor signature is (crawler, spider_name)
        return EngineStatusResource(self.crawler, name)
@@ -1,8 +0,0 @@
from scrapy.webservice import JsonRpcResource


class StatsResource(JsonRpcResource):

    ws_name = 'stats'

    def __init__(self, crawler):
        JsonRpcResource.__init__(self, crawler, crawler.stats)
@@ -109,7 +109,6 @@ EXTENSIONS = {}

EXTENSIONS_BASE = {
    'scrapy.contrib.corestats.CoreStats': 0,
    'scrapy.webservice.WebService': 0,
    'scrapy.telnet.TelnetConsole': 0,
    'scrapy.contrib.memusage.MemoryUsage': 0,
    'scrapy.contrib.memdebug.MemoryDebugger': 0,
@@ -239,17 +238,6 @@ TELNETCONSOLE_ENABLED = 1
TELNETCONSOLE_PORT = [6023, 6073]
TELNETCONSOLE_HOST = '127.0.0.1'

WEBSERVICE_ENABLED = True
WEBSERVICE_LOGFILE = None
WEBSERVICE_PORT = [6080, 7030]
WEBSERVICE_HOST = '127.0.0.1'
WEBSERVICE_RESOURCES = {}
WEBSERVICE_RESOURCES_BASE = {
    'scrapy.contrib.webservice.crawler.CrawlerResource': 1,
    'scrapy.contrib.webservice.enginestatus.EngineStatusResource': 1,
    'scrapy.contrib.webservice.stats.StatsResource': 1,
}

SPIDER_CONTRACTS = {}
SPIDER_CONTRACTS_BASE = {
    'scrapy.contracts.default.UrlContract': 1,
@@ -1,97 +0,0 @@
"""
This module implements the JSON-RPC 2.0 protocol, as defined in:
http://groups.google.com/group/json-rpc/web/json-rpc-2-0
"""

import urllib
import json
import traceback

from scrapy.utils.serialize import ScrapyJSONDecoder


# JSON-RPC 2.0 errors, as defined in:
# http://groups.google.com/group/json-rpc/web/json-rpc-2-0
class jsonrpc_errors:
    PARSE_ERROR = -32700
    INVALID_REQUEST = -32600
    METHOD_NOT_FOUND = -32601
    INVALID_PARAMS = -32602
    INTERNAL_ERROR = -32603


class JsonRpcError(Exception):

    def __init__(self, code, message, data=None):
        super(JsonRpcError, self).__init__()
        self.code = code
        self.message = message
        self.data = data

    def __str__(self):
        return "JSON-RPC error (code %d): %s" % (self.code, self.message)


def jsonrpc_client_call(url, method, *args, **kwargs):
    """Execute a JSON-RPC call on the given url"""
    _urllib = kwargs.pop('_urllib', urllib)
    if args and kwargs:
        raise ValueError("Pass *args or **kwargs but not both to jsonrpc_client_call")
    req = {'jsonrpc': '2.0', 'method': method, 'params': args or kwargs, 'id': 1}
    res = json.loads(_urllib.urlopen(url, json.dumps(req)).read())
    if 'result' in res:
        return res['result']
    elif 'error' in res:
        er = res['error']
        raise JsonRpcError(er['code'], er['message'], er['data'])
    else:
        msg = "JSON-RPC response must contain 'result' or 'error': %s" % res
        raise ValueError(msg)


def jsonrpc_server_call(target, jsonrpc_request, json_decoder=None):
    """Execute the given JSON-RPC request (as JSON-encoded string) on the given
    target object and return the JSON-RPC response, as a dict
    """
    if json_decoder is None:
        json_decoder = ScrapyJSONDecoder()

    try:
        req = json_decoder.decode(jsonrpc_request)
    except Exception:
        return jsonrpc_error(None, jsonrpc_errors.PARSE_ERROR, 'Parse error',
                             traceback.format_exc())

    try:
        id, methname = req['id'], req['method']
    except KeyError:
        return jsonrpc_error(None, jsonrpc_errors.INVALID_REQUEST, 'Invalid Request')

    try:
        method = getattr(target, methname)
    except AttributeError:
        return jsonrpc_error(id, jsonrpc_errors.METHOD_NOT_FOUND, 'Method not found')

    params = req.get('params', [])
    a, kw = ([], params) if isinstance(params, dict) else (params, {})
    kw = dict([(str(k), v) for k, v in kw.items()])  # convert kw keys to str
    try:
        return jsonrpc_result(id, method(*a, **kw))
    except Exception as e:
        return jsonrpc_error(id, jsonrpc_errors.INTERNAL_ERROR, str(e),
                             traceback.format_exc())


def jsonrpc_error(id, code, message, data=None):
    """Create JSON-RPC error response"""
    return {
        'jsonrpc': '2.0',
        'error': {
            'code': code,
            'message': message,
            'data': data,
        },
        'id': id,
    }


def jsonrpc_result(id, result):
    """Create JSON-RPC result response"""
    return {
        'jsonrpc': '2.0',
        'result': result,
        'id': id,
    }
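As a usage sketch, ``jsonrpc_server_call`` dispatches a JSON-encoded request
against any plain Python object; the ``Echo`` target here is made up for
illustration::

    from scrapy.utils.jsonrpc import jsonrpc_server_call

    class Echo(object):
        def echo(self, x):
            return x

    request = '{"jsonrpc": "2.0", "method": "echo", "params": ["hi"], "id": 1}'
    response = jsonrpc_server_call(Echo(), request)
    # response == {'jsonrpc': '2.0', 'result': 'hi', 'id': 1}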
@@ -1,93 +1,18 @@
import re
import json
import datetime
import decimal

from twisted.internet import defer

from scrapy.spider import Spider
from scrapy.http import Request, Response
from scrapy.item import BaseItem


class SpiderReferencer(object):
    """Class to serialize (and deserialize) objects (typically dicts)
    containing references to running spiders (ie. Spider objects). This is
    required because json library fails to serialize dicts containing
    non-primitive types as keys, even when you override
    ScrapyJSONEncoder.default() with a custom encoding mechanism.
    """

    spider_ref_re = re.compile('^spider:([0-9a-f]+)?:?(.+)?$')

    def __init__(self, crawler):
        self.crawler = crawler

    def get_reference_from_spider(self, spider):
        return 'spider:%x:%s' % (id(spider), spider.name)

    def get_spider_from_reference(self, ref):
        """Returns the Spider referenced by text, if text is a spider
        reference. Otherwise it returns the text itself. If the text references
        a non-running spider it raises a RuntimeError.
        """
        m = self.spider_ref_re.search(ref)
        if m:
            spid, spname = m.groups()
            for spider in self.crawler.engine.open_spiders:
                if "%x" % id(spider) == spid or spider.name == spname:
                    return spider
            raise RuntimeError("Spider not running: %s" % ref)
        return ref

    def encode_references(self, obj):
        """Look for Spider objects and replace them with spider references"""
        if isinstance(obj, Spider):
            return self.get_reference_from_spider(obj)
        elif isinstance(obj, dict):
            d = {}
            for k, v in obj.items():
                k = self.encode_references(k)
                v = self.encode_references(v)
                d[k] = v
            return d
        elif isinstance(obj, (list, tuple)):
            return [self.encode_references(x) for x in obj]
        else:
            return obj

    def decode_references(self, obj):
        """Look for spider references and replace them with Spider objects"""
        if isinstance(obj, basestring):
            return self.get_spider_from_reference(obj)
        elif isinstance(obj, dict):
            d = {}
            for k, v in obj.items():
                k = self.decode_references(k)
                v = self.decode_references(v)
                d[k] = v
            return d
        elif isinstance(obj, (list, tuple)):
            return [self.decode_references(x) for x in obj]
        else:
            return obj


class ScrapyJSONEncoder(json.JSONEncoder):

    DATE_FORMAT = "%Y-%m-%d"
    TIME_FORMAT = "%H:%M:%S"

    def __init__(self, *a, **kw):
        crawler = kw.pop('crawler', None)
        self.spref = kw.pop('spref', None) or SpiderReferencer(crawler)
        super(ScrapyJSONEncoder, self).__init__(*a, **kw)

    def encode(self, o):
        if self.spref:
            o = self.spref.encode_references(o)
        return super(ScrapyJSONEncoder, self).encode(o)

    def default(self, o):
        if isinstance(o, datetime.datetime):
            return o.strftime("%s %s" % (self.DATE_FORMAT, self.TIME_FORMAT))
@@ -110,14 +35,4 @@ class ScrapyJSONEncoder(json.JSONEncoder):

class ScrapyJSONDecoder(json.JSONDecoder):

    def __init__(self, *a, **kw):
        crawler = kw.pop('crawler', None)
        self.spref = kw.pop('spref', None) or SpiderReferencer(crawler)
        super(ScrapyJSONDecoder, self).__init__(*a, **kw)

    def decode(self, s):
        o = super(ScrapyJSONDecoder, self).decode(s)
        if self.spref:
            o = self.spref.decode_references(o)
        return o
    pass
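To make the spider-reference format concrete, here is a short sketch reusing
the mocks from ``tests/test_utils_serialize.py``; the hexadecimal id comes from
``id(spider)`` and will vary::

    from scrapy.spider import Spider
    from scrapy.utils.serialize import SpiderReferencer
    from tests.test_utils_serialize import CrawlerMock

    spider = Spider('example')
    spref = SpiderReferencer(CrawlerMock([spider]))
    ref = spref.get_reference_from_spider(spider)   # e.g. 'spider:7f3a2b0c:example'
    assert spref.get_spider_from_reference(ref) is spider
    # a name-only reference also resolves
    assert spref.get_spider_from_reference('spider::example') is spider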
@@ -1,20 +0,0 @@
import json

from twisted.web import resource


class JsonResource(resource.Resource):

    json_encoder = json.JSONEncoder()

    def render(self, txrequest):
        r = resource.Resource.render(self, txrequest)
        return self.render_object(r, txrequest)

    def render_object(self, obj, txrequest):
        r = self.json_encoder.encode(obj) + "\n"
        txrequest.setHeader('Content-Type', 'application/json')
        txrequest.setHeader('Access-Control-Allow-Origin', '*')
        txrequest.setHeader('Access-Control-Allow-Methods', 'GET, POST, PATCH, PUT, DELETE')
        txrequest.setHeader('Access-Control-Allow-Headers', 'X-Requested-With')
        txrequest.setHeader('Content-Length', len(r))
        return r
@@ -1,97 +0,0 @@
"""
Scrapy web services extension

See docs/topics/webservice.rst
"""

from twisted.web import server, resource

from scrapy.exceptions import NotConfigured
from scrapy import log, signals
from scrapy.utils.jsonrpc import jsonrpc_server_call
from scrapy.utils.serialize import ScrapyJSONEncoder, ScrapyJSONDecoder
from scrapy.utils.misc import load_object
from scrapy.utils.txweb import JsonResource as JsonResource_
from scrapy.utils.reactor import listen_tcp
from scrapy.utils.conf import build_component_list


class JsonResource(JsonResource_):

    def __init__(self, crawler, target=None):
        JsonResource_.__init__(self)
        self.crawler = crawler
        self.json_encoder = ScrapyJSONEncoder(crawler=crawler)


class JsonRpcResource(JsonResource):

    def __init__(self, crawler, target=None):
        JsonResource.__init__(self, crawler, target)
        self.json_decoder = ScrapyJSONDecoder(crawler=crawler)
        self.crawler = crawler
        self._target = target

    def render_GET(self, txrequest):
        return self.get_target()

    def render_POST(self, txrequest):
        reqstr = txrequest.content.getvalue()
        target = self.get_target()
        return jsonrpc_server_call(target, reqstr, self.json_decoder)

    def getChild(self, name, txrequest):
        target = self.get_target()
        try:
            newtarget = getattr(target, name)
            return JsonRpcResource(self.crawler, newtarget)
        except AttributeError:
            return resource.ErrorPage(404, "No Such Resource", "No such child resource.")

    def get_target(self):
        return self._target


class RootResource(JsonResource):

    def render_GET(self, txrequest):
        return {'resources': self.children.keys()}

    def getChild(self, name, txrequest):
        if name == '':
            return self
        return JsonResource.getChild(self, name, txrequest)


class WebService(server.Site):

    def __init__(self, crawler):
        if not crawler.settings.getbool('WEBSERVICE_ENABLED'):
            raise NotConfigured
        self.crawler = crawler
        logfile = crawler.settings['WEBSERVICE_LOGFILE']
        self.portrange = [int(x) for x in crawler.settings.getlist('WEBSERVICE_PORT')]
        self.host = crawler.settings['WEBSERVICE_HOST']
        root = RootResource(crawler)
        reslist = build_component_list(crawler.settings['WEBSERVICE_RESOURCES_BASE'],
                                       crawler.settings['WEBSERVICE_RESOURCES'])
        for res_cls in map(load_object, reslist):
            res = res_cls(crawler)
            root.putChild(res.ws_name, res)
        server.Site.__init__(self, root, logPath=logfile)
        self.noisy = False
        crawler.signals.connect(self.start_listening, signals.engine_started)
        crawler.signals.connect(self.stop_listening, signals.engine_stopped)

    @classmethod
    def from_crawler(cls, crawler):
        return cls(crawler)

    def start_listening(self):
        self.port = listen_tcp(self.portrange, self.host, self)
        h = self.port.getHost()
        log.msg(format='Web service listening on %(host)s:%(port)d',
                level=log.DEBUG, host=h.host, port=h.port)

    def stop_listening(self):
        self.port.stopListening()
@@ -1,122 +0,0 @@
import unittest, json
from io import BytesIO

from scrapy.utils.jsonrpc import jsonrpc_client_call, jsonrpc_server_call, \
    JsonRpcError, jsonrpc_errors
from scrapy.utils.serialize import ScrapyJSONDecoder
from tests.test_utils_serialize import CrawlerMock


class urllib_mock(object):
    def __init__(self, result=None, error=None):
        response = {}
        if result:
            response.update(result=result)
        if error:
            response.update(error=error)
        self.response = json.dumps(response)
        self.request = None

    def urlopen(self, url, request):
        self.url = url
        self.request = request
        return BytesIO(self.response)


class TestTarget(object):

    def call(self, *args, **kwargs):
        return list(args), kwargs

    def exception(self):
        raise Exception("testing-errors")


class JsonRpcUtilsTestCase(unittest.TestCase):

    def setUp(self):
        crawler = CrawlerMock([])
        self.json_decoder = ScrapyJSONDecoder(crawler=crawler)

    def test_jsonrpc_client_call_args_kwargs_raises(self):
        self.assertRaises(ValueError, jsonrpc_client_call, 'url', 'test', 'one', kw=123)

    def test_jsonrpc_client_call_request(self):
        ul = urllib_mock(1)
        jsonrpc_client_call('url', 'test', 'one', 2, _urllib=ul)
        req = json.loads(ul.request)
        assert 'id' in req
        self.assertEqual(ul.url, 'url')
        self.assertEqual(req['jsonrpc'], '2.0')
        self.assertEqual(req['method'], 'test')
        self.assertEqual(req['params'], ['one', 2])

    def test_jsonrpc_client_call_response(self):
        ul = urllib_mock()
        # must return result or error
        self.assertRaises(ValueError, jsonrpc_client_call, 'url', 'test', _urllib=ul)
        ul = urllib_mock(result={'one': 1})
        self.assertEqual(jsonrpc_client_call('url', 'test', _urllib=ul), {'one': 1})
        ul = urllib_mock(error={'code': 123, 'message': 'hello', 'data': 'some data'})

        raised = False
        try:
            jsonrpc_client_call('url', 'test', _urllib=ul)
        except JsonRpcError as e:
            raised = True
            self.assertEqual(e.code, 123)
            self.assertEqual(e.message, 'hello')
            self.assertEqual(e.data, 'some data')
            assert '123' in str(e)
            assert 'hello' in str(e)
        assert raised, "JsonRpcError not raised"

    def test_jsonrpc_server_call(self):
        t = TestTarget()
        r = jsonrpc_server_call(t, 'invalid json data', self.json_decoder)
        assert 'error' in r
        assert r['jsonrpc'] == '2.0'
        assert r['id'] is None
        self.assertEqual(r['error']['code'], jsonrpc_errors.PARSE_ERROR)
        assert 'Traceback' in r['error']['data']

        r = jsonrpc_server_call(t, '{"test": "test"}', self.json_decoder)
        assert 'error' in r
        assert r['jsonrpc'] == '2.0'
        assert r['id'] is None
        self.assertEqual(r['error']['code'], jsonrpc_errors.INVALID_REQUEST)

        r = jsonrpc_server_call(t, '{"method": "notfound", "id": 1}', self.json_decoder)
        assert 'error' in r
        assert r['jsonrpc'] == '2.0'
        assert r['id'] == 1
        self.assertEqual(r['error']['code'], jsonrpc_errors.METHOD_NOT_FOUND)

        r = jsonrpc_server_call(t, '{"method": "exception", "id": 1}', self.json_decoder)
        assert 'error' in r
        assert r['jsonrpc'] == '2.0'
        assert r['id'] == 1
        self.assertEqual(r['error']['code'], jsonrpc_errors.INTERNAL_ERROR)
        assert 'testing-errors' in r['error']['message']
        assert 'Traceback' in r['error']['data']

        r = jsonrpc_server_call(t, '{"method": "call", "id": 2}', self.json_decoder)
        assert 'result' in r
        assert r['jsonrpc'] == '2.0'
        assert r['id'] == 2
        self.assertEqual(r['result'], ([], {}))

        r = jsonrpc_server_call(t, '{"method": "call", "params": [456, 123], "id": 3}',
                                self.json_decoder)
        assert 'result' in r
        assert r['jsonrpc'] == '2.0'
        assert r['id'] == 3
        self.assertEqual(r['result'], ([456, 123], {}))

        r = jsonrpc_server_call(t, '{"method": "call", "params": {"data": 789}, "id": 3}',
                                self.json_decoder)
        assert 'result' in r
        assert r['jsonrpc'] == '2.0'
        assert r['id'] == 3
        self.assertEqual(r['result'], ([], {'data': 789}))


if __name__ == "__main__":
    unittest.main()
@@ -1,88 +1,20 @@
import json
import unittest
import datetime
from decimal import Decimal

from twisted.internet import defer

from scrapy.utils.serialize import SpiderReferencer, ScrapyJSONEncoder, ScrapyJSONDecoder
from scrapy.spider import Spider
from scrapy.utils.serialize import ScrapyJSONEncoder
from scrapy.http import Request, Response


class _EngineMock(object):
    def __init__(self, open_spiders):
        self.open_spiders = open_spiders


class CrawlerMock(object):
    def __init__(self, open_spiders):
        self.engine = _EngineMock(open_spiders)


class BaseTestCase(unittest.TestCase):
class JsonEncoderTestCase(unittest.TestCase):

    def setUp(self):
        self.spider1 = Spider('name1')
        self.spider2 = Spider('name2')
        open_spiders = set([self.spider1, self.spider2])
        crawler = CrawlerMock(open_spiders)
        self.spref = SpiderReferencer(crawler)
        self.encoder = ScrapyJSONEncoder(spref=self.spref)
        self.decoder = ScrapyJSONDecoder(spref=self.spref)


class SpiderReferencerTestCase(BaseTestCase):

    def test_spiders_and_references(self):
        ref1 = self.spref.get_reference_from_spider(self.spider1)
        assert isinstance(ref1, str)
        assert self.spider1.name in ref1
        ref2 = self.spref.get_reference_from_spider(self.spider2)
        ref1_ = self.spref.get_reference_from_spider(self.spider1)
        assert ref1 == ref1_
        assert ref1 != ref2

        sp1 = self.spref.get_spider_from_reference(ref1)
        sp2 = self.spref.get_spider_from_reference(ref2)
        sp1_ = self.spref.get_spider_from_reference(ref1)
        assert isinstance(sp1, Spider)
        assert sp1 is not sp2
        assert sp1 is sp1_

        # referring to spiders by name
        assert sp1 is self.spref.get_spider_from_reference('spider::name1')
        assert sp2 is self.spref.get_spider_from_reference('spider::name2')

        # must return string as-is if spider id not found
        assert 'lala' == self.spref.get_spider_from_reference('lala')
        # must raise RuntimeError if spider id is not found and spider is not running
        self.assertRaises(RuntimeError, self.spref.get_spider_from_reference, 'spider:fffffff')
        self.encoder = ScrapyJSONEncoder()

    def test_encode_decode(self):
        sr = self.spref
        sp1 = self.spider1
        sp2 = self.spider2
        ref1 = sr.get_reference_from_spider(sp1)
        ref2 = sr.get_reference_from_spider(sp2)

        examples = [
            ('lala', 'lala'),
            (sp1, ref1),
            (['lala', sp1], ['lala', ref1]),
            ({'lala': sp1}, {'lala': ref1}),
            ({sp1: sp2}, {ref1: ref2}),
            ({sp1: {sp2: ['lala', sp1]}}, {ref1: {ref2: ['lala', ref1]}})
        ]
        for spiders, refs in examples:
            self.assertEqual(sr.encode_references(spiders), refs)
            self.assertEqual(sr.decode_references(refs), spiders)


class JsonEncoderTestCase(BaseTestCase):

    def test_encode_decode(self):
        sr = self.spref
        sp1 = self.spider1
        sp2 = self.spider2
        ref1 = sr.get_reference_from_spider(sp1)
        ref2 = sr.get_reference_from_spider(sp2)
        dt = datetime.datetime(2010, 1, 2, 10, 11, 12)
        dts = "2010-01-02 10:11:12"
        d = datetime.date(2010, 1, 2)
@@ -92,42 +24,21 @@ class JsonEncoderTestCase(BaseTestCase):
        dec = Decimal("1000.12")
        decs = "1000.12"

        examples_encode_decode = [
            ('lala', 'lala'),
            (sp1, ref1),
            (['lala', sp1], ['lala', ref1]),
            ({'lala': sp1}, {'lala': ref1}),
            ({sp1: sp2}, {ref1: ref2}),
            ({sp1: {sp2: ['lala', sp1]}}, {ref1: {ref2: ['lala', ref1]}})
        ]
        for spiders, refs in examples_encode_decode:
            self.assertEqual(self.encoder.encode(spiders), json.dumps(refs))
            self.assertEqual(self.decoder.decode(json.dumps(refs)), spiders)
        for input, output in [('foo', 'foo'), (d, ds), (t, ts), (dt, dts),
                              (dec, decs), (['foo', d], ['foo', ds])]:
            self.assertEqual(self.encoder.encode(input), json.dumps(output))

        examples_encode_only = [
            ({sp1: dt}, {ref1: dts}),
            ({sp1: d}, {ref1: ds}),
            ({sp1: t}, {ref1: ts}),
            ({sp1: dec}, {ref1: decs}),
        ]
        for spiders, refs in examples_encode_only:
            self.assertEqual(self.encoder.encode(spiders), json.dumps(refs))

        assert 'Deferred' in self.encoder.encode(defer.Deferred())

    def test_encode_deferred(self):
        self.assertIn('Deferred', self.encoder.encode(defer.Deferred()))

    def test_encode_request(self):
        r = Request("http://www.example.com/lala")
        rs = self.encoder.encode(r)
        assert r.method in rs
        assert r.url in rs
        self.assertIn(r.method, rs)
        self.assertIn(r.url, rs)

    def test_encode_response(self):
        r = Response("http://www.example.com/lala")
        rs = self.encoder.encode(r)
        assert r.url in rs
        assert str(r.status) in rs
        self.assertIn(r.url, rs)
        self.assertIn(str(r.status), rs)


if __name__ == "__main__":
    unittest.main()