* Made Response.meta attribute map to Request.meta attribute. Closes #290

* Record redirected URLs in redirect middleware. Closes #291
2025-02-23 13:04:20 +00:00 · 2010-11-18 12:51:54 -02:00 · 2010-11-18 12:51:54 -02:00 · 91a7c25797
commit 91a7c25797
parent ac007802d6
7 changed files with 56 additions and 34 deletions
--- a/docs/topics/downloader-middleware.rst
+++ b/docs/topics/downloader-middleware.rst
@ -382,16 +382,21 @@ HttpProxyMiddleware
 .. _urllib2: http://docs.python.org/library/urllib2.html

 RedirectMiddleware
-------------------
+------------------

 .. module:: scrapy.contrib.downloadermiddleware.redirect
   :synopsis: Redirection Middleware

 .. class:: RedirectMiddleware

-   This middlware handles redirection of requests based on response status and
+   This middleware handles redirection of requests based on response status and
   meta-refresh html tag.

+.. reqmeta:: redirect_urls
+
+The URLs which the request goes through (while being redirected) can be found
+in the ``redirect_urls`` :attr:`Request.meta <scrapy.http.Request.meta>` key.
+
 The :class:`RedirectMiddleware` can be configured through the following
 settings (see the settings documentation for more info):

--- a/docs/topics/request-response.rst
+++ b/docs/topics/request-response.rst
@ -138,7 +138,8 @@ Request objects
        recognized by Scrapy.

        This dict is `shallow copied`_ when the request is cloned using the
-        ``copy()`` or ``replace()`` methods.
+        ``copy()`` or ``replace()`` methods, and can also be accessed, in your
+        spider, from the ``response.meta`` attribute.

    .. _shallow copied: http://docs.python.org/library/copy.html

@ -221,6 +222,7 @@ Those are:
 * :reqmeta:`dont_retry`
 * :reqmeta:`handle_httpstatus_list`
 * ``dont_merge_cookies`` (see ``cookies`` parameter of :class:`Request` constructor)
+* :reqmeta:`redirect_urls`

 .. _topics-request-response-ref-request-subclasses:

@ -353,7 +355,7 @@ method for this job. Here's an example spider which uses it::
 Response objects
 ================

-.. class:: Response(url, [status=200, headers, body, meta, flags])
+.. class:: Response(url, [status=200, headers, body, flags])

    A :class:`Response` object represents an HTTP response, which is usually
    downloaded (by the Downloader) and fed to the Spiders for processing.
@ -429,9 +431,14 @@ Response objects

    .. attribute:: Response.meta

-        A dict that contains arbitrary metadata for this response, similar to the
-        :attr:`Request.meta` attribute. See the :attr:`Request.meta` attribute for
-        more info.
+        A shortcut to the :attr:`Request.meta` attribute of the
+        :attr:`Response.request` object (i.e. ``self.request.meta``).
+
+        Unlike the :attr:`Response.request` attribute, the :attr:`Response.meta`
+        attribute is propagated along redirects and retries, so you will get
+        the original :attr:`Request.meta` sent from your spider.
+
+        .. seealso:: :attr:`Request.meta` attribute

    .. attribute:: Response.flags

--- a/scrapy/contrib/downloadermiddleware/redirect.py
+++ b/scrapy/contrib/downloadermiddleware/redirect.py
@ -50,6 +50,8 @@ class RedirectMiddleware(object):
        if ttl and redirects <= self.max_redirect_times:
            redirected.meta['redirect_times'] = redirects
            redirected.meta['redirect_ttl'] = ttl - 1
+            redirected.meta['redirect_urls'] = request.meta.get('redirect_urls', []) + \
+                [request.url]
            redirected.dont_filter = request.dont_filter
            redirected.priority = request.priority + self.priority_adjust
            log.msg("Redirecting (%s) to %s from %s" % (reason, redirected, request),
--- a/scrapy/http/response/__init__.py
+++ b/scrapy/http/response/__init__.py
@ -13,23 +13,24 @@ from scrapy.http.common import deprecated_setter

 class Response(object_ref):

-    __slots__ = ['_url', 'headers', 'status', '_body', 'request', '_meta', \
+    __slots__ = ['_url', 'headers', 'status', '_body', 'request', \
        'flags', '__weakref__']

-    def __init__(self, url, status=200, headers=None, body='', meta=None, flags=None):
+    def __init__(self, url, status=200, headers=None, body='', flags=None, request=None):
        self.headers = Headers(headers or {})
        self.status = int(status)
        self._set_body(body)
        self._set_url(url)
-        self.request = None
+        self.request = request
        self.flags = [] if flags is None else list(flags)
-        self._meta = dict(meta) if meta else None

    @property
    def meta(self):
-        if self._meta is None:
-            self._meta = {}
-        return self._meta
+        try:
+            return self.request.meta
+        except AttributeError:
+            raise AttributeError("Response.meta not available, this response " \
+                "is not tied to any request")

    def _get_url(self):
        return self._url
@ -61,7 +62,7 @@ class Response(object_ref):
    body = property(_get_body, deprecated_setter(_set_body, 'body'))

    def __repr__(self):
-        attrs = ['url', 'status', 'body', 'headers', 'meta', 'flags']
+        attrs = ['url', 'status', 'body', 'headers', 'request', 'flags']
        args = ", ".join(["%s=%r" % (a, getattr(self, a)) for a in attrs])
        return "%s(%s)" % (self.__class__.__name__, args)

@ -76,7 +77,7 @@ class Response(object_ref):
        """Create a new Response with the same attributes except for those
        given new values.
        """
-        for x in ['url', 'status', 'headers', 'body', 'meta', 'flags']:
+        for x in ['url', 'status', 'headers', 'body', 'request', 'flags']:
            kwargs.setdefault(x, getattr(self, x))
        cls = kwargs.pop('cls', self.__class__)
        return cls(*args, **kwargs)
--- a/scrapy/http/response/text.py
+++ b/scrapy/http/response/text.py
@ -26,12 +26,11 @@ class TextResponse(Response):

    __slots__ = ['_encoding', '_cached_benc', '_cached_ubody']

-    def __init__(self, url, status=200, headers=None, body=None, meta=None, \
-            flags=None, encoding=None):
-        self._encoding = encoding
+    def __init__(self, *args, **kwargs):
+        self._encoding = kwargs.pop('encoding', None)
        self._cached_benc = None
        self._cached_ubody = None
-        super(TextResponse, self).__init__(url, status, headers, body, meta, flags)
+        super(TextResponse, self).__init__(*args, **kwargs)

    def _set_url(self, url):
        if isinstance(url, unicode):
--- a/scrapy/tests/test_downloadermiddleware_redirect.py
+++ b/scrapy/tests/test_downloadermiddleware_redirect.py
@ -145,5 +145,17 @@ class RedirectMiddlewareTest(unittest.TestCase):
        assert isinstance(req, Request)
        self.assertRaises(IgnoreRequest, self.mw.process_response, req, rsp, self.spider)

+    def test_redirect_urls(self):
+        req1 = Request('http://scrapytest.org/first')
+        rsp1 = Response('http://scrapytest.org/first', headers={'Location': '/redirected'}, status=302)
+        req2 = self.mw.process_response(req1, rsp1, self.spider)
+        rsp2 = Response('http://scrapytest.org/redirected', headers={'Location': '/redirected2'}, status=302)
+        req3 = self.mw.process_response(req2, rsp2, self.spider)
+
+        self.assertEqual(req2.url, 'http://scrapytest.org/redirected')
+        self.assertEqual(req2.meta['redirect_urls'], ['http://scrapytest.org/first'])
+        self.assertEqual(req3.url, 'http://scrapytest.org/redirected2')
+        self.assertEqual(req3.meta['redirect_urls'], ['http://scrapytest.org/first', 'http://scrapytest.org/redirected'])
+
 if __name__ == "__main__":
    unittest.main()
--- a/scrapy/tests/test_http_response.py
+++ b/scrapy/tests/test_http_response.py
@ -1,7 +1,7 @@
 import unittest
 import weakref

-from scrapy.http import Response, TextResponse, HtmlResponse, XmlResponse, Headers
+from scrapy.http import Request, Response, TextResponse, HtmlResponse, XmlResponse, Headers
 from scrapy.utils.encoding import resolve_encoding


@ -26,15 +26,11 @@ class BaseResponseTest(unittest.TestCase):

        assert isinstance(r.headers, Headers)
        self.assertEqual(r.headers, {})
-        self.assertEqual(r.meta, {})

-        meta = {"lala": "lolo"}
        headers = {"caca": "coco"}
        body = "a body"
-        r = self.response_class("http://www.example.com", meta=meta, headers=headers, body=body)
+        r = self.response_class("http://www.example.com", headers=headers, body=body)

-        assert r.meta is not meta
-        self.assertEqual(r.meta, meta)
        assert r.headers is not headers
        self.assertEqual(r.headers["caca"], "coco")

@ -48,17 +44,12 @@ class BaseResponseTest(unittest.TestCase):
        """Test Response copy"""

        r1 = self.response_class("http://www.example.com", body="Some body")
-        r1.meta['foo'] = 'bar'
        r1.flags.append('cached')
        r2 = r1.copy()

        self.assertEqual(r1.status, r2.status)
        self.assertEqual(r1.body, r2.body)

-        # make sure meta dict is shallow copied
-        assert r1.meta is not r2.meta, "meta must be a shallow copy, not identical"
-        self.assertEqual(r1.meta, r2.meta)
-
        # make sure flags list is shallow copied
        assert r1.flags is not r2.flags, "flags must be a shallow copy, not identical"
        self.assertEqual(r1.flags, r2.flags)
@ -67,6 +58,12 @@ class BaseResponseTest(unittest.TestCase):
        assert r1.headers is not r2.headers, "headers must be a shallow copy, not identical"
        self.assertEqual(r1.headers, r2.headers)

+    def test_copy_meta(self):
+        req = Request("http://www.example.com")
+        req.meta['foo'] = 'bar'
+        r1 = self.response_class("http://www.example.com", body="Some body", request=req)
+        assert r1.meta is req.meta
+
    def test_copy_inherited_classes(self):
        """Test Response children copies preserve their class"""

@ -90,10 +87,9 @@ class BaseResponseTest(unittest.TestCase):
        self.assertEqual((r1.headers, r2.headers), ({}, hdrs))

        # Empty attributes (which may fail if not compared properly)
-        r3 = self.response_class("http://www.example.com", meta={'a': 1}, flags=['cached'])
-        r4 = r3.replace(body='', meta={}, flags=[])
+        r3 = self.response_class("http://www.example.com", flags=['cached'])
+        r4 = r3.replace(body='', flags=[])
        self.assertEqual(r4.body, '')
-        self.assertEqual(r4.meta, {})
        self.assertEqual(r4.flags, [])

    def test_weakref_slots(self):