mirror of
https://github.com/scrapy/scrapy.git
synced 2025-02-23 13:04:20 +00:00
* Made Response.meta attribute map to Request.meta attribute. Closes #290
* Record redirected URLs in redirect middleware. Closes #291
This commit is contained in:
parent
ac007802d6
commit
91a7c25797
@ -382,16 +382,21 @@ HttpProxyMiddleware
|
||||
.. _urllib2: http://docs.python.org/library/urllib2.html
|
||||
|
||||
RedirectMiddleware
|
||||
-------------------
|
||||
------------------
|
||||
|
||||
.. module:: scrapy.contrib.downloadermiddleware.redirect
|
||||
:synopsis: Redirection Middleware
|
||||
|
||||
.. class:: RedirectMiddleware
|
||||
|
||||
This middlware handles redirection of requests based on response status and
|
||||
This middleware handles redirection of requests based on response status and
|
||||
meta-refresh html tag.
|
||||
|
||||
.. reqmeta:: redirect_urls
|
||||
|
||||
The URLs which the request goes through (while being redirected) can be found
|
||||
in the ``redirect_urls`` :attr:`Request.meta <scrapy.http.Request.meta>` key.
|
||||
|
||||
The :class:`RedirectMiddleware` can be configured through the following
|
||||
settings (see the settings documentation for more info):
|
||||
|
||||
|
@ -138,7 +138,8 @@ Request objects
|
||||
recognized by Scrapy.
|
||||
|
||||
This dict is `shallow copied`_ when the request is cloned using the
|
||||
``copy()`` or ``replace()`` methods.
|
||||
``copy()`` or ``replace()`` methods, and can also be accessed, in your
|
||||
spider, from the ``response.meta`` attribute.
|
||||
|
||||
.. _shallow copied: http://docs.python.org/library/copy.html
|
||||
|
||||
@ -221,6 +222,7 @@ Those are:
|
||||
* :reqmeta:`dont_retry`
|
||||
* :reqmeta:`handle_httpstatus_list`
|
||||
* ``dont_merge_cookies`` (see ``cookies`` parameter of :class:`Request` constructor)
|
||||
* :reqmeta:`redirect_urls`
|
||||
|
||||
.. _topics-request-response-ref-request-subclasses:
|
||||
|
||||
@ -353,7 +355,7 @@ method for this job. Here's an example spider which uses it::
|
||||
Response objects
|
||||
================
|
||||
|
||||
.. class:: Response(url, [status=200, headers, body, meta, flags])
|
||||
.. class:: Response(url, [status=200, headers, body, flags])
|
||||
|
||||
A :class:`Response` object represents an HTTP response, which is usually
|
||||
downloaded (by the Downloader) and fed to the Spiders for processing.
|
||||
@ -429,9 +431,14 @@ Response objects
|
||||
|
||||
.. attribute:: Response.meta
|
||||
|
||||
A dict that contains arbitrary metadata for this response, similar to the
|
||||
:attr:`Request.meta` attribute. See the :attr:`Request.meta` attribute for
|
||||
more info.
|
||||
A shortcut to the :attr:`Request.meta` attribute of the
|
||||
:attr:`Response.request` object (i.e. ``self.request.meta``).
|
||||
|
||||
Unlike the :attr:`Response.request` attribute, the :attr:`Response.meta`
|
||||
attribute is propagated along redirects and retries, so you will get
|
||||
the original :attr:`Request.meta` sent from your spider.
|
||||
|
||||
.. seealso:: :attr:`Request.meta` attribute
|
||||
|
||||
.. attribute:: Response.flags
|
||||
|
||||
|
@ -50,6 +50,8 @@ class RedirectMiddleware(object):
|
||||
if ttl and redirects <= self.max_redirect_times:
|
||||
redirected.meta['redirect_times'] = redirects
|
||||
redirected.meta['redirect_ttl'] = ttl - 1
|
||||
redirected.meta['redirect_urls'] = request.meta.get('redirect_urls', []) + \
|
||||
[request.url]
|
||||
redirected.dont_filter = request.dont_filter
|
||||
redirected.priority = request.priority + self.priority_adjust
|
||||
log.msg("Redirecting (%s) to %s from %s" % (reason, redirected, request),
|
||||
|
@ -13,23 +13,24 @@ from scrapy.http.common import deprecated_setter
|
||||
|
||||
class Response(object_ref):
|
||||
|
||||
__slots__ = ['_url', 'headers', 'status', '_body', 'request', '_meta', \
|
||||
__slots__ = ['_url', 'headers', 'status', '_body', 'request', \
|
||||
'flags', '__weakref__']
|
||||
|
||||
def __init__(self, url, status=200, headers=None, body='', meta=None, flags=None):
|
||||
def __init__(self, url, status=200, headers=None, body='', flags=None, request=None):
|
||||
self.headers = Headers(headers or {})
|
||||
self.status = int(status)
|
||||
self._set_body(body)
|
||||
self._set_url(url)
|
||||
self.request = None
|
||||
self.request = request
|
||||
self.flags = [] if flags is None else list(flags)
|
||||
self._meta = dict(meta) if meta else None
|
||||
|
||||
@property
|
||||
def meta(self):
|
||||
if self._meta is None:
|
||||
self._meta = {}
|
||||
return self._meta
|
||||
try:
|
||||
return self.request.meta
|
||||
except AttributeError:
|
||||
raise AttributeError("Response.meta not available, this response " \
|
||||
"is not tied to any request")
|
||||
|
||||
def _get_url(self):
|
||||
return self._url
|
||||
@ -61,7 +62,7 @@ class Response(object_ref):
|
||||
body = property(_get_body, deprecated_setter(_set_body, 'body'))
|
||||
|
||||
def __repr__(self):
|
||||
attrs = ['url', 'status', 'body', 'headers', 'meta', 'flags']
|
||||
attrs = ['url', 'status', 'body', 'headers', 'request', 'flags']
|
||||
args = ", ".join(["%s=%r" % (a, getattr(self, a)) for a in attrs])
|
||||
return "%s(%s)" % (self.__class__.__name__, args)
|
||||
|
||||
@ -76,7 +77,7 @@ class Response(object_ref):
|
||||
"""Create a new Response with the same attributes except for those
|
||||
given new values.
|
||||
"""
|
||||
for x in ['url', 'status', 'headers', 'body', 'meta', 'flags']:
|
||||
for x in ['url', 'status', 'headers', 'body', 'request', 'flags']:
|
||||
kwargs.setdefault(x, getattr(self, x))
|
||||
cls = kwargs.pop('cls', self.__class__)
|
||||
return cls(*args, **kwargs)
|
||||
|
@ -26,12 +26,11 @@ class TextResponse(Response):
|
||||
|
||||
__slots__ = ['_encoding', '_cached_benc', '_cached_ubody']
|
||||
|
||||
def __init__(self, url, status=200, headers=None, body=None, meta=None, \
|
||||
flags=None, encoding=None):
|
||||
self._encoding = encoding
|
||||
def __init__(self, *args, **kwargs):
|
||||
self._encoding = kwargs.pop('encoding', None)
|
||||
self._cached_benc = None
|
||||
self._cached_ubody = None
|
||||
super(TextResponse, self).__init__(url, status, headers, body, meta, flags)
|
||||
super(TextResponse, self).__init__(*args, **kwargs)
|
||||
|
||||
def _set_url(self, url):
|
||||
if isinstance(url, unicode):
|
||||
|
@ -145,5 +145,17 @@ class RedirectMiddlewareTest(unittest.TestCase):
|
||||
assert isinstance(req, Request)
|
||||
self.assertRaises(IgnoreRequest, self.mw.process_response, req, rsp, self.spider)
|
||||
|
||||
def test_redirect_urls(self):
|
||||
req1 = Request('http://scrapytest.org/first')
|
||||
rsp1 = Response('http://scrapytest.org/first', headers={'Location': '/redirected'}, status=302)
|
||||
req2 = self.mw.process_response(req1, rsp1, self.spider)
|
||||
rsp2 = Response('http://scrapytest.org/redirected', headers={'Location': '/redirected2'}, status=302)
|
||||
req3 = self.mw.process_response(req2, rsp2, self.spider)
|
||||
|
||||
self.assertEqual(req2.url, 'http://scrapytest.org/redirected')
|
||||
self.assertEqual(req2.meta['redirect_urls'], ['http://scrapytest.org/first'])
|
||||
self.assertEqual(req3.url, 'http://scrapytest.org/redirected2')
|
||||
self.assertEqual(req3.meta['redirect_urls'], ['http://scrapytest.org/first', 'http://scrapytest.org/redirected'])
|
||||
|
||||
if __name__ == "__main__":
|
||||
unittest.main()
|
||||
|
@ -1,7 +1,7 @@
|
||||
import unittest
|
||||
import weakref
|
||||
|
||||
from scrapy.http import Response, TextResponse, HtmlResponse, XmlResponse, Headers
|
||||
from scrapy.http import Request, Response, TextResponse, HtmlResponse, XmlResponse, Headers
|
||||
from scrapy.utils.encoding import resolve_encoding
|
||||
|
||||
|
||||
@ -26,15 +26,11 @@ class BaseResponseTest(unittest.TestCase):
|
||||
|
||||
assert isinstance(r.headers, Headers)
|
||||
self.assertEqual(r.headers, {})
|
||||
self.assertEqual(r.meta, {})
|
||||
|
||||
meta = {"lala": "lolo"}
|
||||
headers = {"caca": "coco"}
|
||||
body = "a body"
|
||||
r = self.response_class("http://www.example.com", meta=meta, headers=headers, body=body)
|
||||
r = self.response_class("http://www.example.com", headers=headers, body=body)
|
||||
|
||||
assert r.meta is not meta
|
||||
self.assertEqual(r.meta, meta)
|
||||
assert r.headers is not headers
|
||||
self.assertEqual(r.headers["caca"], "coco")
|
||||
|
||||
@ -48,17 +44,12 @@ class BaseResponseTest(unittest.TestCase):
|
||||
"""Test Response copy"""
|
||||
|
||||
r1 = self.response_class("http://www.example.com", body="Some body")
|
||||
r1.meta['foo'] = 'bar'
|
||||
r1.flags.append('cached')
|
||||
r2 = r1.copy()
|
||||
|
||||
self.assertEqual(r1.status, r2.status)
|
||||
self.assertEqual(r1.body, r2.body)
|
||||
|
||||
# make sure meta dict is shallow copied
|
||||
assert r1.meta is not r2.meta, "meta must be a shallow copy, not identical"
|
||||
self.assertEqual(r1.meta, r2.meta)
|
||||
|
||||
# make sure flags list is shallow copied
|
||||
assert r1.flags is not r2.flags, "flags must be a shallow copy, not identical"
|
||||
self.assertEqual(r1.flags, r2.flags)
|
||||
@ -67,6 +58,12 @@ class BaseResponseTest(unittest.TestCase):
|
||||
assert r1.headers is not r2.headers, "headers must be a shallow copy, not identical"
|
||||
self.assertEqual(r1.headers, r2.headers)
|
||||
|
||||
def test_copy_meta(self):
|
||||
req = Request("http://www.example.com")
|
||||
req.meta['foo'] = 'bar'
|
||||
r1 = self.response_class("http://www.example.com", body="Some body", request=req)
|
||||
assert r1.meta is req.meta
|
||||
|
||||
def test_copy_inherited_classes(self):
|
||||
"""Test Response children copies preserve their class"""
|
||||
|
||||
@ -90,10 +87,9 @@ class BaseResponseTest(unittest.TestCase):
|
||||
self.assertEqual((r1.headers, r2.headers), ({}, hdrs))
|
||||
|
||||
# Empty attributes (which may fail if not compared properly)
|
||||
r3 = self.response_class("http://www.example.com", meta={'a': 1}, flags=['cached'])
|
||||
r4 = r3.replace(body='', meta={}, flags=[])
|
||||
r3 = self.response_class("http://www.example.com", flags=['cached'])
|
||||
r4 = r3.replace(body='', flags=[])
|
||||
self.assertEqual(r4.body, '')
|
||||
self.assertEqual(r4.meta, {})
|
||||
self.assertEqual(r4.flags, [])
|
||||
|
||||
def test_weakref_slots(self):
|
||||
|
Loading…
x
Reference in New Issue
Block a user