remove unmaintained web server code
This commit is contained in:
parent b296d4169e
commit 7078cdc336
http.py
@@ -1,115 +0,0 @@
"""A django alike request-response model

most of this code is borrowed from django
"""

from Cookie import SimpleCookie

from scrapy.utils.datatypes import MultiValueDict, CaselessDict


def build_httprequest(twistedrequest):
    """Translate twisted request object to a django request approach"""
    request = HttpRequest()
    request.path = twistedrequest.path
    request.method = twistedrequest.method.upper()
    request.COOKIES = SimpleCookie(twistedrequest.received_cookies)
    request.HEADERS = Headers(twistedrequest.received_headers)
    request.ARGS = MultiValueDict(twistedrequest.args)
    request.FILES = {}  # not yet supported
    request.content = twistedrequest.content
    request.twistedrequest = twistedrequest
    return request


class HttpRequest(object):

    def __init__(self):
        self.path = ''
        self.method = None
        self.COOKIES = {}
        self.HEADERS = {}
        self.ARGS = {}
        self.FILES = {}


class HttpResponse(object):

    status_code = 200

    def __init__(self, content='', status=None, content_type=None):
        content_type = content_type or "text/html; charset=utf-8"
        self._headers = {'content-type': content_type}
        self.content = content
        self.cookies = SimpleCookie()
        self.status_code = status

    def __str__(self):
        "Full HTTP message, including headers"
        return '\n'.join(['%s: %s' % (key, value)
                          for key, value in self._headers.items()]) \
            + '\n\n' + self.content

    def __setitem__(self, header, value):
        self._headers[header.lower()] = value

    def __delitem__(self, header):
        try:
            del self._headers[header.lower()]
        except KeyError:
            pass

    def __getitem__(self, header):
        return self._headers[header.lower()]

    def has_header(self, header):
        "Case-insensitive check for a header"
        return self._headers.has_key(header.lower())

    __contains__ = has_header

    def items(self):
        return self._headers.items()

    def get(self, header, alternate):
        return self._headers.get(header, alternate)

    def set_cookie(self, key, value='', max_age=None, expires=None, path='/',
                   domain=None, secure=None):
        self.cookies[key] = value
        for var in ('max_age', 'path', 'domain', 'secure', 'expires'):
            val = locals()[var]
            if val is not None:
                self.cookies[key][var.replace('_', '-')] = val

    def delete_cookie(self, key, path='/', domain=None):
        self.cookies[key] = ''
        if path is not None:
            self.cookies[key]['path'] = path
        if domain is not None:
            self.cookies[key]['domain'] = domain
        self.cookies[key]['expires'] = 0
        self.cookies[key]['max-age'] = 0


class Headers(CaselessDict):

    def __init__(self, source=None, encoding='utf-8'):
        self.encoding = encoding

        if getattr(source, 'iteritems', None):
            d = source.iteritems()
        else:
            d = source  # best effort

        # can't use CaselessDict.__init__(self, d) because it doesn't call __setitem__
        for k, v in d:
            self.__setitem__(k.lower(), v)

    def normkey(self, key):
        return key.title()  # 'Content-Type' style headers

    def __setitem__(self, key, value):
        """Headers must not be unicode"""
        if isinstance(key, unicode):
            key = key.encode(self.encoding)
        if isinstance(value, unicode):
            value = value.encode(self.encoding)
        super(Headers, self).__setitem__(key, value)
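For context, a minimal sketch of how this removed response model was used. The import path is an assumption taken from the usage docstring in site.py below (the scrapy.contrib.web package); everything else follows from the class definitions above:

    # Sketch only; assumes the module above was importable as scrapy.contrib.web.http
    from scrapy.contrib.web.http import HttpResponse

    response = HttpResponse('<h1>hello</h1>')  # defaults to text/html; charset=utf-8
    response['X-Powered-By'] = 'scrapy'        # header names are stored lowercased
    response.set_cookie('session', 'abc123', max_age=3600, path='/')
    assert 'x-powered-by' in response          # __contains__ is case-insensitive
    print response                             # headers, a blank line, then the body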
json.py
@@ -1,99 +0,0 @@
from functools import wraps

from scrapy.utils.defer import mustbe_deferred
from scrapy.utils.serialization import serialize as _serialize, unserialize as _unserialize

from .http import HttpResponse

JSON_CONTENT_TYPES = ('application/json',)

serialize = lambda x: _serialize(x, 'json')
unserialize = lambda x: _unserialize(x, 'json')


class JsonException(Exception):
    pass


class JsonResponse(HttpResponse):

    def __init__(self, content=None, callback=None, serialize=serialize, *args, **kwargs):
        content = serialize(content)
        if callback:  # JSONP support
            status, content = 200, '%s(%s)' % (callback, content)
            kwargs.setdefault('content_type', 'application/x-javascript')
        HttpResponse.__init__(self, content=content, *args, **kwargs)


class JsonResponseAccepted(JsonResponse):
    status_code = 202

class JsonResponseNoContent(JsonResponse):
    status_code = 204

class JsonResponseNotModified(JsonResponse):
    status_code = 304

class JsonResponseBadRequest(JsonResponse):
    status_code = 400

class JsonResponseUnauthorized(JsonResponse):
    status_code = 401

class JsonResponseForbidden(JsonResponse):
    status_code = 403

class JsonResponseNotFound(JsonResponse):
    status_code = 404

class JsonResponseInternalServerError(JsonResponse):
    status_code = 500

class JsonResponseNotImplemented(JsonResponse):
    status_code = 501


def json(func):
    """Decorator to wrap a json-prepared view and return a JsonResponse

    if content-type is application/json, sets request.JSON to the unserialized
    request body; in case of unserialization failure, returns JsonResponseBadRequest()

    if the data returned from func is a dictionary, it is serialized into a JsonResponse()
    """
    if not hasattr(func, '__call__'):
        raise TypeError('The argument should be a callable')

    @wraps(func)
    def wrapper(request, *args, **kwargs):
        json_callback = request.ARGS.get('callback')  # JSONP support
        request.method = method = _x_http_method_override(request)
        request.content_type = ct = content_type(request)
        request.JSON = None

        if method in ('POST', 'PUT'):
            if ct in JSON_CONTENT_TYPES:
                body = request.content.read()
                try:
                    request.JSON = unserialize(body)
                except Exception, e:
                    return JsonResponseBadRequest('Invalid json: %s' % e)

        def _onsuccess(response):
            if not isinstance(response, HttpResponse):
                return JsonResponse(response, json_callback)  # best effort
            return response

        ret = mustbe_deferred(func, request, *args, **kwargs)
        ret.addCallback(_onsuccess)
        return ret
    return wrapper


def content_type(request):
    ct = request.HEADERS.get('content-type', '')
    return ct.split(';')[0].strip()


def _x_http_method_override(request):
    """Support for the X-Http-Method-Override hack

    some clients do not support methods other than GET and POST; those clients
    can set an extra header to indicate the intended method.
    """
    return request.HEADERS.get('x-http-method-override', request.method).upper()
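A hypothetical view built on the removed json decorator. The view name and payload are illustrative and the import path is assumed as above, but the behavior shown (request.JSON filling, non-HttpResponse results wrapped by _onsuccess) is the one defined in this file:

    # Sketch only; echo is a made-up view name
    from scrapy.contrib.web.json import json, JsonResponseBadRequest

    @json
    def echo(request):
        # the decorator sets request.JSON for POST/PUT bodies with a json content-type
        if request.method == 'POST' and request.JSON is None:
            return JsonResponseBadRequest('expected a json body')
        # anything that is not an HttpResponse gets wrapped in a JsonResponse
        return {'method': request.method, 'echo': request.JSON}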
@@ -1,89 +0,0 @@
import re
import urllib
import hashlib

from twisted.internet import defer

from scrapy.core.engine import scrapyengine
from scrapy.spider import spiders
from scrapy.http import Request
from scrapy.item.models import BaseItem
from scrapy.core.exceptions import NotConfigured
from scrapy.conf import settings
from scrapy.utils.misc import memoize, arg_to_iter
from scrapy.xlib.lrucache import LRUCache

from .site import WebSite, WebResource
from .http import HttpResponse
from .json import JsonResponse

JSONCALLBACK_RE = '^[a-zA-Z][a-zA-Z_.-]*$'
CACHESIZE = settings.get('WS_CACHESIZE', 20)


def _urlhash(request):
    h = hashlib.sha1()
    for a in sorted(request.ARGS):
        h.update(request.ARGS[a])
    return h.hexdigest()


@memoize(cache=LRUCache(CACHESIZE), hash=_urlhash)
def url_to_guid(httprequest):
    url = httprequest.ARGS.get('url')
    if not url:
        return HttpResponse('Bad Request', 400)
    url = urllib.unquote(url)

    jsoncb = httprequest.ARGS.get('callback')
    if jsoncb and not re.match(JSONCALLBACK_RE, jsoncb):
        return HttpResponse('Bad callback argument', 400)

    def _response(guids=(), message=None):
        content = {
            'guids': list(guids),
            'domain': getattr(spider, 'domain_name', None),
            'message': message,
        }
        return JsonResponse(content=content, callback=jsoncb)

    spider = spiders.fromurl(url)
    if not spider:
        return _response(message='No crawler found for site')

    if httprequest.ARGS.get('dontcrawl'):
        return _response()

    def _on_error(_failure):
        return _response(message='Error downloading url from site')

    def _on_success(pagedata):
        try:
            items = spider.identify(pagedata)
        except Exception, ex:
            return _response(message='Error processing url')

        guids = [i.guid for i in arg_to_iter(items) if isinstance(i, BaseItem)]
        return _response(guids=guids)

    deferred = defer.Deferred().addCallbacks(_on_success, _on_error)
    request = Request(url=url, callback=deferred, dont_filter=True)
    schd = scrapyengine.download(request, spider)
    schd.chainDeferred(deferred)
    return deferred


urlmapping = (
    ('^ws/tools/url_to_guid/$', url_to_guid),
)


class UrlToGuidService(WebSite):

    def __init__(self):
        if not settings.getbool('WS_ENABLED'):
            raise NotConfigured

        port = settings.getint('WS_PORT') or 8088
        timeout = settings.getint('WS_TIMEOUT') or 15  # seconds
        resource = WebResource(urlmapping, timeout=timeout)
        WebSite.__init__(self, port=port, resource=resource)
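The service above answered at ws/tools/url_to_guid/ on WS_PORT (8088 unless configured). A hedged client-side sketch, with a made-up target URL and assuming the serialized response body is plain json:

    # Python 2 client sketch; host, port and target url are illustrative
    import json
    import urllib
    import urllib2

    qs = urllib.urlencode({'url': 'http://example.com/some/product/page'})
    raw = urllib2.urlopen('http://localhost:8088/ws/tools/url_to_guid/?' + qs).read()
    data = json.loads(raw)
    print data['guids'], data['domain'], data['message']  # keys set by _response() above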
site.py
@@ -1,133 +0,0 @@
"""Twisted website objects with a django-alike interface

################################################################################
## Simple usage example:

from twisted.internet import reactor
from scrapy.contrib.web.site import WebSite, WebResource
from scrapy.contrib.web.http import HttpResponse

def helloworld(request):
    return HttpResponse('Hello World!')

def hello(request, name):
    return HttpResponse('Hello %s' % name)

urls = (
    ('^hello/(?P<name>\w+)/$', hello),
    ('^$', helloworld),
)

resource = WebResource(urls)
site = WebSite(port=8081, resource=resource)
reactor.run()

# now go to http://localhost:8081/


################################################################################
## Complex usage example:

from twisted.internet import reactor, defer
from scrapy.contrib.web.site import WebSite, WebResource
from scrapy.contrib.web.http import HttpResponse

def delayed(request):
    def _callback(result):
        return HttpResponse('Heavy task completed: %s' % result)

    def _errback(_failure):
        return HttpResponse('Internal Server Error: %s' % _failure, status=500)

    def heavytask(_):
        import random
        assert random.randint(0, 1), "Exception found processing request"
        return _

    d = defer.Deferred().addCallback(heavytask)
    d.addCallbacks(_callback, _errback)
    reactor.callLater(1, d.callback, "Well done")
    return d

urls = (('^delayed/$', delayed),)

resource = WebResource(urls)
site = WebSite(port=8081, resource=resource)
reactor.run()

"""

import re

from twisted.web import server, resource
from twisted.internet import reactor
from scrapy.utils.defer import mustbe_deferred

from .http import HttpResponse, build_httprequest


def urlresolver(urls, path):
    """Simple path to view mapper"""
    path = path.lstrip('/')
    for pathre, view in urls:
        m = re.search(pathre, path)
        if m:
            kwargs = m.groupdict()
            args = () if kwargs else m.groups()
            return view, args, kwargs
    return None, (), {}


class WebSite(server.Site):

    def __init__(self, port=None, *args, **kwargs):
        server.Site.__init__(self, *args, **kwargs)
        if port:
            self.bind(port)

    def bind(self, port):
        from scrapy.core.engine import scrapyengine
        scrapyengine.listenTCP(port, self)


class WebResource(resource.Resource):
    """Translate the twisted web approach to a django-alike way"""

    isLeaf = True
    debug = True

    def __init__(self, urls, timeout=3, urlresolver=urlresolver):
        resource.Resource.__init__(self)
        self.urlresolver = urlresolver
        self.timeout = timeout
        self.urls = urls

    def render(self, twistedrequest):
        httprequest = build_httprequest(twistedrequest)

        def _send_response(response):
            assert isinstance(response, HttpResponse), 'view should return a HttpResponse object'
            twistedrequest.setResponseCode(response.status_code or 200)
            for key, val in response.items():
                twistedrequest.setHeader(key, val)
            twistedrequest.write(response.content)
            twistedrequest.finish()

        def _on_error(_failure):
            content = _failure.getTraceback() if self.debug else 'Internal Error'
            response = HttpResponse(content=content, status=500)
            return _send_response(response)

        view, args, kwargs = self.urlresolver(self.urls, httprequest.path)
        if not view:
            response = HttpResponse(content='Not Found', status=404)
            _send_response(response)
            return server.NOT_DONE_YET

        deferred = mustbe_deferred(view, httprequest, *args, **kwargs)
        deferred.addCallback(_send_response)
        deferred.addErrback(_on_error)
        if not deferred.timeoutCall:
            deferred.setTimeout(self.timeout)

        return server.NOT_DONE_YET
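To make the dispatch rule concrete, a short sketch of what urlresolver returns for a named-group pattern (the view is a stand-in; urlresolver is the function defined in this file):

    # Stand-in view for illustration
    def hello(request, name):
        return HttpResponse('Hello %s' % name)

    urls = (('^hello/(?P<name>\w+)/$', hello),)

    view, args, kwargs = urlresolver(urls, '/hello/world/')
    assert view is hello
    assert args == () and kwargs == {'name': 'world'}  # named groups go to kwargs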