
* Made Response.meta attribute map to Request.meta attribute. Closes #290

* Record redirected URLs in redirect middleware. Closes #291
Pablo Hoffman 2010-11-18 12:51:54 -02:00
parent ac007802d6
commit 91a7c25797
7 changed files with 56 additions and 34 deletions

View File

@@ -382,16 +382,21 @@ HttpProxyMiddleware
 .. _urllib2: http://docs.python.org/library/urllib2.html

 RedirectMiddleware
--------------------
+------------------

 .. module:: scrapy.contrib.downloadermiddleware.redirect
    :synopsis: Redirection Middleware

 .. class:: RedirectMiddleware

-    This middlware handles redirection of requests based on response status and
+    This middleware handles redirection of requests based on response status and
     meta-refresh html tag.

+.. reqmeta:: redirect_urls
+
+The urls which the request goes through (while being redirected) can be found
+in the ``redirect_urls`` :attr:`Request.meta <scrapy.http.Request.meta>` key.
+
 The :class:`RedirectMiddleware` can be configured through the following
 settings (see the settings documentation for more info):
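
For illustration only (not part of this commit): a minimal sketch of a spider callback reading the recorded chain. The spider name, domain and URL are hypothetical; it only assumes response.meta and the redirect_urls key introduced here.

    from scrapy.spider import BaseSpider

    class RedirectChainSpider(BaseSpider):
        name = 'redirect_chain'
        start_urls = ['http://example.com/maybe-redirects']

        def parse(self, response):
            # Intermediate URLs the request went through, oldest first;
            # empty list if the request was never redirected.
            chain = response.meta.get('redirect_urls', [])
            self.log("redirected via %r, landed on %s" % (chain, response.url))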

View File

@@ -138,7 +138,8 @@ Request objects
    recognized by Scrapy.

    This dict is `shallow copied`_ when the request is cloned using the
-   ``copy()`` or ``replace()`` methods.
+   ``copy()`` or ``replace()`` methods, and can also be accessed, in your
+   spider, from the ``response.meta`` attribute.

 .. _shallow copied: http://docs.python.org/library/copy.html

@@ -221,6 +222,7 @@ Those are:
 * :reqmeta:`dont_retry`
 * :reqmeta:`handle_httpstatus_list`
 * ``dont_merge_cookies`` (see ``cookies`` parameter of :class:`Request` constructor)
+* :reqmeta:`redirect_urls`

 .. _topics-request-response-ref-request-subclasses:

@@ -353,7 +355,7 @@ method for this job. Here's an example spider which uses it::
 Response objects
 ================

-.. class:: Response(url, [status=200, headers, body, meta, flags])
+.. class:: Response(url, [status=200, headers, body, flags])

    A :class:`Response` object represents an HTTP response, which is usually
    downloaded (by the Downloader) and fed to the Spiders for processing.

@@ -429,9 +431,14 @@ Response objects
 .. attribute:: Response.meta

-   A dict that contains arbitrary metadata for this response, similar to the
-   :attr:`Request.meta` attribute. See the :attr:`Request.meta` attribute for
-   more info.
+   A shortcut to the :attr:`Request.meta` attribute of the
+   :attr:`Response.request` object (ie. ``self.request.meta``).
+
+   Unlike the :attr:`Response.request` attribute, the :attr:`Response.meta`
+   attribute is propagated along redirects and retries, so you will get
+   the original :attr:`Request.meta` sent from your spider.
+
+   .. seealso:: :attr:`Request.meta` attribute

 .. attribute:: Response.flags
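
To make the documented behaviour concrete, a small hypothetical snippet (spider name, URL and the item_id key are invented): meta attached to the Request is available again on the Response via response.meta, even if redirects or retries happened in between.

    from scrapy.spider import BaseSpider
    from scrapy.http import Request

    class MetaPassingSpider(BaseSpider):
        name = 'meta_passing'

        def start_requests(self):
            # Attach arbitrary data to the request; it travels with the
            # request through the downloader middlewares.
            yield Request('http://example.com/page', callback=self.parse_page,
                          meta={'item_id': 42})

        def parse_page(self, response):
            # response.meta is a shortcut to response.request.meta.
            self.log("item_id is still %r" % response.meta['item_id'])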

View File

@@ -50,6 +50,8 @@ class RedirectMiddleware(object):
         if ttl and redirects <= self.max_redirect_times:
             redirected.meta['redirect_times'] = redirects
             redirected.meta['redirect_ttl'] = ttl - 1
+            redirected.meta['redirect_urls'] = request.meta.get('redirect_urls', []) + \
+                [request.url]
             redirected.dont_filter = request.dont_filter
             redirected.priority = request.priority + self.priority_adjust
             log.msg("Redirecting (%s) to %s from %s" % (reason, redirected, request),

View File

@@ -13,23 +13,24 @@ from scrapy.http.common import deprecated_setter
 class Response(object_ref):

-    __slots__ = ['_url', 'headers', 'status', '_body', 'request', '_meta', \
+    __slots__ = ['_url', 'headers', 'status', '_body', 'request', \
         'flags', '__weakref__']

-    def __init__(self, url, status=200, headers=None, body='', meta=None, flags=None):
+    def __init__(self, url, status=200, headers=None, body='', flags=None, request=None):
         self.headers = Headers(headers or {})
         self.status = int(status)
         self._set_body(body)
         self._set_url(url)
-        self.request = None
+        self.request = request
         self.flags = [] if flags is None else list(flags)
-        self._meta = dict(meta) if meta else None

     @property
     def meta(self):
-        if self._meta is None:
-            self._meta = {}
-        return self._meta
+        try:
+            return self.request.meta
+        except AttributeError:
+            raise AttributeError("Response.meta not available, this response " \
+                "is not tied to any request")

     def _get_url(self):
         return self._url

@@ -61,7 +62,7 @@ class Response(object_ref):
     body = property(_get_body, deprecated_setter(_set_body, 'body'))

     def __repr__(self):
-        attrs = ['url', 'status', 'body', 'headers', 'meta', 'flags']
+        attrs = ['url', 'status', 'body', 'headers', 'request', 'flags']
         args = ", ".join(["%s=%r" % (a, getattr(self, a)) for a in attrs])
         return "%s(%s)" % (self.__class__.__name__, args)

@@ -76,7 +77,7 @@ class Response(object_ref):
         """Create a new Response with the same attributes except for those
         given new values.
         """
-        for x in ['url', 'status', 'headers', 'body', 'meta', 'flags']:
+        for x in ['url', 'status', 'headers', 'body', 'request', 'flags']:
             kwargs.setdefault(x, getattr(self, x))
         cls = kwargs.pop('cls', self.__class__)
         return cls(*args, **kwargs)
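
With this change a response no longer owns a meta dict of its own; its meta is whatever dict the tied request carries, and an unattached response refuses to answer. A minimal sketch, using only the constructors shown in this commit (URL invented):

    from scrapy.http import Request, Response

    req = Request('http://example.com', meta={'foo': 'bar'})
    rsp = Response('http://example.com', request=req)
    assert rsp.meta is req.meta      # the very same dict, no copy involved

    orphan = Response('http://example.com')
    try:
        orphan.meta                  # no request attached
    except AttributeError:
        pass                         # "Response.meta not available, ..."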

View File

@@ -26,12 +26,11 @@ class TextResponse(Response):

     __slots__ = ['_encoding', '_cached_benc', '_cached_ubody']

-    def __init__(self, url, status=200, headers=None, body=None, meta=None, \
-            flags=None, encoding=None):
-        self._encoding = encoding
+    def __init__(self, *args, **kwargs):
+        self._encoding = kwargs.pop('encoding', None)
         self._cached_benc = None
         self._cached_ubody = None
-        super(TextResponse, self).__init__(url, status, headers, body, meta, flags)
+        super(TextResponse, self).__init__(*args, **kwargs)

     def _set_url(self, url):
         if isinstance(url, unicode):
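
Switching TextResponse.__init__ to *args/**kwargs means it no longer has to mirror the parent signature: encoding is popped off, and everything else, including the new request keyword, is passed straight through to Response. A short hypothetical usage sketch (URL, body and meta key invented):

    from scrapy.http import Request, TextResponse

    req = Request('http://example.com', meta={'page': 1})
    # 'encoding' is consumed here; 'request' is forwarded to Response.__init__
    rsp = TextResponse('http://example.com', body='hola mundo',
                       encoding='utf-8', request=req)
    assert rsp.meta['page'] == 1
    assert rsp.body_as_unicode() == u'hola mundo'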

View File

@@ -145,5 +145,17 @@ class RedirectMiddlewareTest(unittest.TestCase):
         assert isinstance(req, Request)
         self.assertRaises(IgnoreRequest, self.mw.process_response, req, rsp, self.spider)

+    def test_redirect_urls(self):
+        req1 = Request('http://scrapytest.org/first')
+        rsp1 = Response('http://scrapytest.org/first', headers={'Location': '/redirected'}, status=302)
+        req2 = self.mw.process_response(req1, rsp1, self.spider)
+        rsp2 = Response('http://scrapytest.org/redirected', headers={'Location': '/redirected2'}, status=302)
+        req3 = self.mw.process_response(req2, rsp2, self.spider)
+        self.assertEqual(req2.url, 'http://scrapytest.org/redirected')
+        self.assertEqual(req2.meta['redirect_urls'], ['http://scrapytest.org/first'])
+        self.assertEqual(req3.url, 'http://scrapytest.org/redirected2')
+        self.assertEqual(req3.meta['redirect_urls'], ['http://scrapytest.org/first', 'http://scrapytest.org/redirected'])
+
 if __name__ == "__main__":
     unittest.main()

View File

@@ -1,7 +1,7 @@
 import unittest
 import weakref

-from scrapy.http import Response, TextResponse, HtmlResponse, XmlResponse, Headers
+from scrapy.http import Request, Response, TextResponse, HtmlResponse, XmlResponse, Headers
 from scrapy.utils.encoding import resolve_encoding

@@ -26,15 +26,11 @@ class BaseResponseTest(unittest.TestCase):
         assert isinstance(r.headers, Headers)
         self.assertEqual(r.headers, {})
-        self.assertEqual(r.meta, {})

-        meta = {"lala": "lolo"}
         headers = {"caca": "coco"}
         body = "a body"
-        r = self.response_class("http://www.example.com", meta=meta, headers=headers, body=body)
+        r = self.response_class("http://www.example.com", headers=headers, body=body)

-        assert r.meta is not meta
-        self.assertEqual(r.meta, meta)
         assert r.headers is not headers
         self.assertEqual(r.headers["caca"], "coco")

@@ -48,17 +44,12 @@ class BaseResponseTest(unittest.TestCase):
         """Test Response copy"""

         r1 = self.response_class("http://www.example.com", body="Some body")
-        r1.meta['foo'] = 'bar'
         r1.flags.append('cached')
         r2 = r1.copy()

         self.assertEqual(r1.status, r2.status)
         self.assertEqual(r1.body, r2.body)

-        # make sure meta dict is shallow copied
-        assert r1.meta is not r2.meta, "meta must be a shallow copy, not identical"
-        self.assertEqual(r1.meta, r2.meta)
-
         # make sure flags list is shallow copied
         assert r1.flags is not r2.flags, "flags must be a shallow copy, not identical"
         self.assertEqual(r1.flags, r2.flags)

@@ -67,6 +58,12 @@ class BaseResponseTest(unittest.TestCase):
         assert r1.headers is not r2.headers, "headers must be a shallow copy, not identical"
         self.assertEqual(r1.headers, r2.headers)

+    def test_copy_meta(self):
+        req = Request("http://www.example.com")
+        req.meta['foo'] = 'bar'
+        r1 = self.response_class("http://www.example.com", body="Some body", request=req)
+        assert r1.meta is req.meta
+
     def test_copy_inherited_classes(self):
         """Test Response children copies preserve their class"""

@@ -90,10 +87,9 @@ class BaseResponseTest(unittest.TestCase):
         self.assertEqual((r1.headers, r2.headers), ({}, hdrs))

         # Empty attributes (which may fail if not compared properly)
-        r3 = self.response_class("http://www.example.com", meta={'a': 1}, flags=['cached'])
-        r4 = r3.replace(body='', meta={}, flags=[])
+        r3 = self.response_class("http://www.example.com", flags=['cached'])
+        r4 = r3.replace(body='', flags=[])
         self.assertEqual(r4.body, '')
-        self.assertEqual(r4.meta, {})
         self.assertEqual(r4.flags, [])

     def test_weakref_slots(self):