
remove unmaintained web server code

Daniel Grana 2009-08-10 20:28:32 -03:00
parent b296d4169e
commit 7078cdc336
5 changed files with 0 additions and 436 deletions

@@ -1,115 +0,0 @@
"""A django alike request-response model
most of this code is borrowed from django
"""
from Cookie import SimpleCookie
from scrapy.utils.datatypes import MultiValueDict, CaselessDict
def build_httprequest(twistedrequest):
"""Translate twisted request object to a django request approach"""
request = HttpRequest()
request.path = twistedrequest.path
request.method = twistedrequest.method.upper()
request.COOKIES = SimpleCookie(twistedrequest.received_cookies)
request.HEADERS = Headers(twistedrequest.received_headers)
request.ARGS = MultiValueDict(twistedrequest.args)
request.FILES = {} # not yet supported
request.content = twistedrequest.content
request.twistedrequest = twistedrequest
return request
class HttpRequest(object):
def __init__(self):
self.path = ''
self.method = None
self.COOKIES = {}
self.HEADERS = {}
self.ARGS = {}
self.FILES = {}
class HttpResponse(object):
status_code = 200
    def __init__(self, content='', status=None, content_type=None):
        content_type = content_type or "text/html; charset=utf-8"
        self._headers = {'content-type': content_type}
        self.content = content
        self.cookies = SimpleCookie()
        if status:
            # only override the class-level default (e.g. a subclass's
            # status_code) when an explicit status is given
            self.status_code = status
def __str__(self):
"Full HTTP message, including headers"
return '\n'.join(['%s: %s' % (key, value)
for key, value in self._headers.items()]) \
+ '\n\n' + self.content
def __setitem__(self, header, value):
self._headers[header.lower()] = value
def __delitem__(self, header):
try:
del self._headers[header.lower()]
except KeyError:
pass
def __getitem__(self, header):
return self._headers[header.lower()]
def has_header(self, header):
"Case-insensitive check for a header"
return self._headers.has_key(header.lower())
__contains__ = has_header
def items(self):
return self._headers.items()
    def get(self, header, alternate):
        # lookups are case-insensitive, like the other accessors
        return self._headers.get(header.lower(), alternate)
def set_cookie(self, key, value='', max_age=None, expires=None, path='/', domain=None, secure=None):
self.cookies[key] = value
for var in ('max_age', 'path', 'domain', 'secure', 'expires'):
val = locals()[var]
if val is not None:
self.cookies[key][var.replace('_', '-')] = val
def delete_cookie(self, key, path='/', domain=None):
self.cookies[key] = ''
if path is not None:
self.cookies[key]['path'] = path
if domain is not None:
self.cookies[key]['domain'] = domain
self.cookies[key]['expires'] = 0
self.cookies[key]['max-age'] = 0
class Headers(CaselessDict):
    def __init__(self, source=None, encoding='utf-8'):
        self.encoding = encoding
        if getattr(source, 'iteritems', None):
            d = source.iteritems()
        else:
            d = source or ()  # tolerate None; best effort for other iterables
        # can't use CaselessDict.__init__(self, d) because it doesn't call __setitem__
        for k, v in d:
            self[k] = v

    def normkey(self, key):
        return key.title()  # 'Content-Type'-style header names
def __setitem__(self, key, value):
"""Headers must not be unicode"""
if isinstance(key, unicode):
key = key.encode(self.encoding)
if isinstance(value, unicode):
value = value.encode(self.encoding)
super(Headers, self).__setitem__(key, value)
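
For reference, a minimal sketch of how this model is used (a hypothetical snippet, not part of the module; names and values are illustrative):

response = HttpResponse('<h1>ok</h1>')             # status defaults to 200, text/html
response['X-Custom'] = 'value'                     # header keys are case-insensitive
response.set_cookie('session', 'abc123', max_age=3600)
assert 'x-custom' in response                      # __contains__ is has_header
headers = Headers({'content-type': 'text/plain'})  # keys normalized to 'Content-Type'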

@@ -1,99 +0,0 @@
from functools import wraps
from scrapy.utils.defer import mustbe_deferred
from scrapy.utils.serialization import serialize as _serialize, unserialize as _unserialize
from .http import HttpResponse
JSON_CONTENT_TYPES = ('application/json',)
serialize = lambda x: _serialize(x, 'json')
unserialize = lambda x: _unserialize(x, 'json')
class JsonException(Exception):
pass
class JsonResponse(HttpResponse):
    def __init__(self, content=None, callback=None, serialize=serialize, *args, **kwargs):
        content = serialize(content)
        if callback:  # JSONP support; the response keeps the default 200 status
            content = '%s(%s)' % (callback, content)
            kwargs.setdefault('content_type', 'application/x-javascript')
        HttpResponse.__init__(self, content=content, *args, **kwargs)
class JsonResponseAccepted(JsonResponse):
status_code = 202
class JsonResponseNoContent(JsonResponse):
status_code = 204
class JsonResponseNotModified(JsonResponse):
status_code = 304
class JsonResponseBadRequest(JsonResponse):
status_code = 400
class JsonResponseUnauthorized(JsonResponse):
status_code = 401
class JsonResponseForbidden(JsonResponse):
status_code = 403
class JsonResponseNotFound(JsonResponse):
status_code = 404
class JsonResponseInternalServerError(JsonResponse):
status_code = 500
class JsonResponseNotImplemented(JsonResponse):
status_code = 501
def json(func):
""" Decorator to wrap a json prepared view and return a JsonResponse
if content-type is application/json, sets request.JSON to unserialized request body.
in case of unserialization failure, returns JsonResponseBadRequest()
if returned data from func is a dictionary, serialize it and returns JsonResponse()
"""
if not hasattr(func, '__call__'):
raise TypeError('The argument should be a callable')
@wraps(func)
def wrapper(request, *args, **kwargs):
json_callback = request.ARGS.get('callback') # JSONP support
request.method = method = _x_http_method_override(request)
request.content_type = ct = content_type(request)
request.JSON = None
if method in ('POST', 'PUT'):
if ct in JSON_CONTENT_TYPES:
body = request.content.read()
try:
request.JSON = unserialize(body)
except Exception, e:
                    return JsonResponseBadRequest('Invalid json: %s' % e)
def _onsuccess(response):
if not isinstance(response, HttpResponse):
return JsonResponse(response, json_callback) # best effort
return response
ret = mustbe_deferred(func, request, *args, **kwargs)
ret.addCallback(_onsuccess)
return ret
return wrapper
def content_type(request):
    ct = request.HEADERS.get('content-type', '')
    return ct.split(';')[0].strip()
def _x_http_method_override(request):
    """Support for the X-Http-Method-Override hack.

    Some clients do not support methods other than GET and POST; such clients
    can set an extra header to indicate the intended method.
    """
    return request.HEADERS.get('x-http-method-override', request.method).upper()
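
A sketch of how the decorator is used (a hypothetical view; the name and payload are illustrative, and a view returning a deferred works equally well since it is run through mustbe_deferred):

@json
def echo(request):
    # anything other than an HttpResponse is wrapped in a JsonResponse,
    # honoring the ?callback= argument for JSONP
    return {'method': request.method, 'echo': request.JSON}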

@@ -1,89 +0,0 @@
import re
import urllib
import hashlib
from twisted.internet import defer
from scrapy.core.engine import scrapyengine
from scrapy.spider import spiders
from scrapy.http import Request
from scrapy.item.models import BaseItem
from scrapy.core.exceptions import NotConfigured
from scrapy.conf import settings
from scrapy.utils.misc import memoize, arg_to_iter
from scrapy.xlib.lrucache import LRUCache
from .site import WebSite, WebResource
from .http import HttpResponse
from .json import JsonResponse
JSONCALLBACK_RE = '^[a-zA-Z][a-zA-Z_.-]*$'
CACHESIZE = settings.get('WS_CACHESIZE', 20)
def _urlhash(request):
    # cache key for memoize: hash of the request argument values,
    # in sorted-key order
    h = hashlib.sha1()
    for a in sorted(request.ARGS):
        h.update(request.ARGS[a])
    return h.hexdigest()
@memoize(cache=LRUCache(CACHESIZE), hash=_urlhash)
def url_to_guid(httprequest):
url = httprequest.ARGS.get('url')
if not url:
return HttpResponse('Bad Request', 400)
url = urllib.unquote(url)
jsoncb = httprequest.ARGS.get('callback')
if jsoncb and not re.match(JSONCALLBACK_RE, jsoncb):
return HttpResponse('Bad callback argument', 400)
def _response(guids=(), message=None):
content = {
'guids': list(guids),
'domain': getattr(spider, 'domain_name', None),
'message': message,
}
return JsonResponse(content=content, callback=jsoncb)
spider = spiders.fromurl(url)
if not spider:
return _response(message='No crawler found for site')
if httprequest.ARGS.get('dontcrawl'):
return _response()
def _on_error(_failure):
return _response(message='Error downloading url from site')
def _on_success(pagedata):
try:
items = spider.identify(pagedata)
except Exception, ex:
            return _response(message='Error processing url: %s' % ex)
guids = [i.guid for i in arg_to_iter(items) if isinstance(i, BaseItem)]
return _response(guids=guids)
deferred = defer.Deferred().addCallbacks(_on_success, _on_error)
request = Request(url=url, callback=deferred, dont_filter=True)
schd = scrapyengine.download(request, spider)
schd.chainDeferred(deferred)
return deferred
urlmapping = (
('^ws/tools/url_to_guid/$', url_to_guid),
)
class UrlToGuidService(WebSite):
def __init__(self):
if not settings.getbool('WS_ENABLED'):
raise NotConfigured
port = settings.getint('WS_PORT') or 8088
timeout = settings.getint('WS_TIMEOUT') or 15 # seconds
resource = WebResource(urlmapping, timeout=timeout)
WebSite.__init__(self, port=port, resource=resource)
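
With WS_ENABLED set, the service answers plain HTTP requests. A minimal client sketch (assuming the default port 8088; the target URL is illustrative):

import urllib
query = urllib.urlencode({'url': 'http://example.com/some/page', 'callback': 'cb'})
body = urllib.urlopen('http://localhost:8088/ws/tools/url_to_guid/?' + query).read()
# body is a JSONP payload such as: cb({"guids": [...], "domain": "...", "message": null})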

@@ -1,133 +0,0 @@
"""Twisted website object as django
################################################################################
## Simple Usage example:
from twisted.internet import reactor
from scrapy.contrib.web.http import WebSite, HttpResponse
def helloword(request):
return HttpResponse('Hello World!')
def hello(request, name):
return HttpResponse('Hello %s' % name)
urls = (
('^hello/(?P<name>\w+)/$', hello),
('^$', helloword),
)
resource = WebResource(urls)
site = WebSite(port=8081, resource=resource)
reactor.run()
# now go to http://localhost:8081/
################################################################################
## Complex usage example:
from twisted.internet import reactor, defer
from scrapy.contrib.web.site import WebSite, WebResource
from scrapy.contrib.web.http import HttpResponse
def delayed(request):
def _callback(result):
return HttpResponse('Heavy task completed: %s' % result)
def _errback(_failure):
return HttpResponse('Internal Server Error: %s' % _failure, status=500)
def heavytask(_):
import random
assert random.randint(0,1), "Exception found processing request"
return _
d = defer.Deferred().addCallback(heavytask)
d.addCallbacks(_callback, _errback)
reactor.callLater(1, d.callback, "Well done")
return d
urls = (('^delayed/$', delayed),)
resource = WebResource(urls)
site = WebSite(port=8081, resource=resource)
reactor.run()
"""
import re
from twisted.web import server, resource
from twisted.internet import reactor
from scrapy.utils.defer import mustbe_deferred
from .http import HttpResponse, build_httprequest
def urlresolver(urls, path):
"""Simple path to view mapper"""
path = path.lstrip('/')
for pathre, view in urls:
m = re.search(pathre, path)
if m:
kwargs = m.groupdict()
args = () if kwargs else m.groups()
return view, args, kwargs
return None, (), {}
class WebSite(server.Site):
def __init__(self, port=None, *args, **kwargs):
server.Site.__init__(self, *args, **kwargs)
if port:
self.bind(port)
def bind(self, port):
from scrapy.core.engine import scrapyengine
scrapyengine.listenTCP(port, self)
class WebResource(resource.Resource):
"""Translate twisted web approach to django alike way"""
isLeaf = True
debug = True
def __init__(self, urls, timeout=3, urlresolver=urlresolver):
resource.Resource.__init__(self)
self.urlresolver = urlresolver
self.timeout = timeout
self.urls = urls
def render(self, twistedrequest):
httprequest = build_httprequest(twistedrequest)
def _send_response(response):
            assert isinstance(response, HttpResponse), 'views must return an HttpResponse object'
twistedrequest.setResponseCode(response.status_code or 200)
for key, val in response.items():
twistedrequest.setHeader(key, response[key])
twistedrequest.write(response.content)
twistedrequest.finish()
        def _on_error(_failure):
            content = _failure.getTraceback() if self.debug else 'Internal Error'
            response = HttpResponse(content=content, status=500)
            return _send_response(response)
view, args, kwargs = self.urlresolver(self.urls, httprequest.path)
if not view:
response = HttpResponse(content='Not Found', status=404)
_send_response(response)
return server.NOT_DONE_YET
deferred = mustbe_deferred(view, httprequest, *args, **kwargs)
deferred.addCallback(_send_response)
deferred.addErrback(_on_error)
if not deferred.timeoutCall:
deferred.setTimeout(self.timeout)
return server.NOT_DONE_YET
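
Putting the pieces together, a hedged sketch of a standalone JSON service built from these modules (assuming the scrapy.contrib.web package path used in the docstrings; the view, path and port are illustrative):

from twisted.internet import reactor
from scrapy.contrib.web.site import WebSite, WebResource
from scrapy.contrib.web.json import json

@json
def ping(request):
    return {'pong': True}

site = WebSite(port=8081, resource=WebResource((('^ping/$', ping),)))
reactor.run()
# GET http://localhost:8081/ping/ -> {"pong": true}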