1
0
mirror of https://github.com/scrapy/scrapy.git synced 2025-02-23 13:04:20 +00:00

* Made Response.meta attribute map to Request.meta attribute. Closes #290

* Record redirected URLs in redirect middleware. Closes #291
This commit is contained in:
Pablo Hoffman 2010-11-18 12:51:54 -02:00
parent ac007802d6
commit 91a7c25797
7 changed files with 56 additions and 34 deletions

View File

@ -382,16 +382,21 @@ HttpProxyMiddleware
.. _urllib2: http://docs.python.org/library/urllib2.html
RedirectMiddleware
-------------------
------------------
.. module:: scrapy.contrib.downloadermiddleware.redirect
:synopsis: Redirection Middleware
.. class:: RedirectMiddleware
This middlware handles redirection of requests based on response status and
This middleware handles redirection of requests based on response status and
meta-refresh html tag.
.. reqmeta:: redirect_urls
The URLs which the request goes through (while being redirected) can be found
in the ``redirect_urls`` :attr:`Request.meta <scrapy.http.Request.meta>` key.
The :class:`RedirectMiddleware` can be configured through the following
settings (see the settings documentation for more info):

View File

@ -138,7 +138,8 @@ Request objects
recognized by Scrapy.
This dict is `shallow copied`_ when the request is cloned using the
``copy()`` or ``replace()`` methods.
``copy()`` or ``replace()`` methods, and can also be accessed, in your
spider, from the ``response.meta`` attribute.
.. _shallow copied: http://docs.python.org/library/copy.html
@ -221,6 +222,7 @@ Those are:
* :reqmeta:`dont_retry`
* :reqmeta:`handle_httpstatus_list`
* ``dont_merge_cookies`` (see ``cookies`` parameter of :class:`Request` constructor)
* :reqmeta:`redirect_urls`
.. _topics-request-response-ref-request-subclasses:
@ -353,7 +355,7 @@ method for this job. Here's an example spider which uses it::
Response objects
================
.. class:: Response(url, [status=200, headers, body, meta, flags])
.. class:: Response(url, [status=200, headers, body, flags])
A :class:`Response` object represents an HTTP response, which is usually
downloaded (by the Downloader) and fed to the Spiders for processing.
@ -429,9 +431,14 @@ Response objects
.. attribute:: Response.meta
A dict that contains arbitrary metadata for this response, similar to the
:attr:`Request.meta` attribute. See the :attr:`Request.meta` attribute for
more info.
A shortcut to the :attr:`Request.meta` attribute of the
:attr:`Response.request` object (i.e. ``self.request.meta``).
Unlike the :attr:`Response.request` attribute, the :attr:`Response.meta`
attribute is propagated along redirects and retries, so you will get
the original :attr:`Request.meta` sent from your spider.
.. seealso:: :attr:`Request.meta` attribute
.. attribute:: Response.flags

View File

@ -50,6 +50,8 @@ class RedirectMiddleware(object):
if ttl and redirects <= self.max_redirect_times:
redirected.meta['redirect_times'] = redirects
redirected.meta['redirect_ttl'] = ttl - 1
redirected.meta['redirect_urls'] = request.meta.get('redirect_urls', []) + \
[request.url]
redirected.dont_filter = request.dont_filter
redirected.priority = request.priority + self.priority_adjust
log.msg("Redirecting (%s) to %s from %s" % (reason, redirected, request),

View File

@ -13,23 +13,24 @@ from scrapy.http.common import deprecated_setter
class Response(object_ref):
__slots__ = ['_url', 'headers', 'status', '_body', 'request', '_meta', \
__slots__ = ['_url', 'headers', 'status', '_body', 'request', \
'flags', '__weakref__']
def __init__(self, url, status=200, headers=None, body='', meta=None, flags=None):
def __init__(self, url, status=200, headers=None, body='', flags=None, request=None):
self.headers = Headers(headers or {})
self.status = int(status)
self._set_body(body)
self._set_url(url)
self.request = None
self.request = request
self.flags = [] if flags is None else list(flags)
self._meta = dict(meta) if meta else None
@property
def meta(self):
if self._meta is None:
self._meta = {}
return self._meta
try:
return self.request.meta
except AttributeError:
raise AttributeError("Response.meta not available, this response " \
"is not tied to any request")
def _get_url(self):
return self._url
@ -61,7 +62,7 @@ class Response(object_ref):
body = property(_get_body, deprecated_setter(_set_body, 'body'))
def __repr__(self):
attrs = ['url', 'status', 'body', 'headers', 'meta', 'flags']
attrs = ['url', 'status', 'body', 'headers', 'request', 'flags']
args = ", ".join(["%s=%r" % (a, getattr(self, a)) for a in attrs])
return "%s(%s)" % (self.__class__.__name__, args)
@ -76,7 +77,7 @@ class Response(object_ref):
"""Create a new Response with the same attributes except for those
given new values.
"""
for x in ['url', 'status', 'headers', 'body', 'meta', 'flags']:
for x in ['url', 'status', 'headers', 'body', 'request', 'flags']:
kwargs.setdefault(x, getattr(self, x))
cls = kwargs.pop('cls', self.__class__)
return cls(*args, **kwargs)

View File

@ -26,12 +26,11 @@ class TextResponse(Response):
__slots__ = ['_encoding', '_cached_benc', '_cached_ubody']
def __init__(self, url, status=200, headers=None, body=None, meta=None, \
flags=None, encoding=None):
self._encoding = encoding
def __init__(self, *args, **kwargs):
self._encoding = kwargs.pop('encoding', None)
self._cached_benc = None
self._cached_ubody = None
super(TextResponse, self).__init__(url, status, headers, body, meta, flags)
super(TextResponse, self).__init__(*args, **kwargs)
def _set_url(self, url):
if isinstance(url, unicode):

View File

@ -145,5 +145,17 @@ class RedirectMiddlewareTest(unittest.TestCase):
assert isinstance(req, Request)
self.assertRaises(IgnoreRequest, self.mw.process_response, req, rsp, self.spider)
def test_redirect_urls(self):
req1 = Request('http://scrapytest.org/first')
rsp1 = Response('http://scrapytest.org/first', headers={'Location': '/redirected'}, status=302)
req2 = self.mw.process_response(req1, rsp1, self.spider)
rsp2 = Response('http://scrapytest.org/redirected', headers={'Location': '/redirected2'}, status=302)
req3 = self.mw.process_response(req2, rsp2, self.spider)
self.assertEqual(req2.url, 'http://scrapytest.org/redirected')
self.assertEqual(req2.meta['redirect_urls'], ['http://scrapytest.org/first'])
self.assertEqual(req3.url, 'http://scrapytest.org/redirected2')
self.assertEqual(req3.meta['redirect_urls'], ['http://scrapytest.org/first', 'http://scrapytest.org/redirected'])
if __name__ == "__main__":
unittest.main()

View File

@ -1,7 +1,7 @@
import unittest
import weakref
from scrapy.http import Response, TextResponse, HtmlResponse, XmlResponse, Headers
from scrapy.http import Request, Response, TextResponse, HtmlResponse, XmlResponse, Headers
from scrapy.utils.encoding import resolve_encoding
@ -26,15 +26,11 @@ class BaseResponseTest(unittest.TestCase):
assert isinstance(r.headers, Headers)
self.assertEqual(r.headers, {})
self.assertEqual(r.meta, {})
meta = {"lala": "lolo"}
headers = {"caca": "coco"}
body = "a body"
r = self.response_class("http://www.example.com", meta=meta, headers=headers, body=body)
r = self.response_class("http://www.example.com", headers=headers, body=body)
assert r.meta is not meta
self.assertEqual(r.meta, meta)
assert r.headers is not headers
self.assertEqual(r.headers["caca"], "coco")
@ -48,17 +44,12 @@ class BaseResponseTest(unittest.TestCase):
"""Test Response copy"""
r1 = self.response_class("http://www.example.com", body="Some body")
r1.meta['foo'] = 'bar'
r1.flags.append('cached')
r2 = r1.copy()
self.assertEqual(r1.status, r2.status)
self.assertEqual(r1.body, r2.body)
# make sure meta dict is shallow copied
assert r1.meta is not r2.meta, "meta must be a shallow copy, not identical"
self.assertEqual(r1.meta, r2.meta)
# make sure flags list is shallow copied
assert r1.flags is not r2.flags, "flags must be a shallow copy, not identical"
self.assertEqual(r1.flags, r2.flags)
@ -67,6 +58,12 @@ class BaseResponseTest(unittest.TestCase):
assert r1.headers is not r2.headers, "headers must be a shallow copy, not identical"
self.assertEqual(r1.headers, r2.headers)
def test_copy_meta(self):
req = Request("http://www.example.com")
req.meta['foo'] = 'bar'
r1 = self.response_class("http://www.example.com", body="Some body", request=req)
assert r1.meta is req.meta
def test_copy_inherited_classes(self):
"""Test Response children copies preserve their class"""
@ -90,10 +87,9 @@ class BaseResponseTest(unittest.TestCase):
self.assertEqual((r1.headers, r2.headers), ({}, hdrs))
# Empty attributes (which may fail if not compared properly)
r3 = self.response_class("http://www.example.com", meta={'a': 1}, flags=['cached'])
r4 = r3.replace(body='', meta={}, flags=[])
r3 = self.response_class("http://www.example.com", flags=['cached'])
r4 = r3.replace(body='', flags=[])
self.assertEqual(r4.body, '')
self.assertEqual(r4.meta, {})
self.assertEqual(r4.flags, [])
def test_weakref_slots(self):