mirror of
https://github.com/scrapy/scrapy.git
synced 2025-02-23 22:44:03 +00:00
* Made Response.meta attribute map to Request.meta attribute. Closes #290
* Record redirected URLs in redirect middleware. Closes #291
This commit is contained in:
parent
ac007802d6
commit
91a7c25797
@ -382,16 +382,21 @@ HttpProxyMiddleware
|
|||||||
.. _urllib2: http://docs.python.org/library/urllib2.html
|
.. _urllib2: http://docs.python.org/library/urllib2.html
|
||||||
|
|
||||||
RedirectMiddleware
|
RedirectMiddleware
|
||||||
-------------------
|
------------------
|
||||||
|
|
||||||
.. module:: scrapy.contrib.downloadermiddleware.redirect
|
.. module:: scrapy.contrib.downloadermiddleware.redirect
|
||||||
:synopsis: Redirection Middleware
|
:synopsis: Redirection Middleware
|
||||||
|
|
||||||
.. class:: RedirectMiddleware
|
.. class:: RedirectMiddleware
|
||||||
|
|
||||||
This middlware handles redirection of requests based on response status and
|
This middleware handles redirection of requests based on response status and
|
||||||
meta-refresh html tag.
|
meta-refresh html tag.
|
||||||
|
|
||||||
|
.. reqmeta:: redirect_urls
|
||||||
|
|
||||||
|
The URLs which the request goes through (while being redirected) can be found
|
||||||
|
in the ``redirect_urls`` :attr:`Request.meta <scrapy.http.Request.meta>` key.
|
||||||
|
|
||||||
The :class:`RedirectMiddleware` can be configured through the following
|
The :class:`RedirectMiddleware` can be configured through the following
|
||||||
settings (see the settings documentation for more info):
|
settings (see the settings documentation for more info):
|
||||||
|
|
||||||
|
@ -138,7 +138,8 @@ Request objects
|
|||||||
recognized by Scrapy.
|
recognized by Scrapy.
|
||||||
|
|
||||||
This dict is `shallow copied`_ when the request is cloned using the
|
This dict is `shallow copied`_ when the request is cloned using the
|
||||||
``copy()`` or ``replace()`` methods.
|
``copy()`` or ``replace()`` methods, and can also be accessed, in your
|
||||||
|
spider, from the ``response.meta`` attribute.
|
||||||
|
|
||||||
.. _shallow copied: http://docs.python.org/library/copy.html
|
.. _shallow copied: http://docs.python.org/library/copy.html
|
||||||
|
|
||||||
@ -221,6 +222,7 @@ Those are:
|
|||||||
* :reqmeta:`dont_retry`
|
* :reqmeta:`dont_retry`
|
||||||
* :reqmeta:`handle_httpstatus_list`
|
* :reqmeta:`handle_httpstatus_list`
|
||||||
* ``dont_merge_cookies`` (see ``cookies`` parameter of :class:`Request` constructor)
|
* ``dont_merge_cookies`` (see ``cookies`` parameter of :class:`Request` constructor)
|
||||||
|
* :reqmeta:`redirect_urls`
|
||||||
|
|
||||||
.. _topics-request-response-ref-request-subclasses:
|
.. _topics-request-response-ref-request-subclasses:
|
||||||
|
|
||||||
@ -353,7 +355,7 @@ method for this job. Here's an example spider which uses it::
|
|||||||
Response objects
|
Response objects
|
||||||
================
|
================
|
||||||
|
|
||||||
.. class:: Response(url, [status=200, headers, body, meta, flags])
|
.. class:: Response(url, [status=200, headers, body, flags])
|
||||||
|
|
||||||
A :class:`Response` object represents an HTTP response, which is usually
|
A :class:`Response` object represents an HTTP response, which is usually
|
||||||
downloaded (by the Downloader) and fed to the Spiders for processing.
|
downloaded (by the Downloader) and fed to the Spiders for processing.
|
||||||
@ -429,9 +431,14 @@ Response objects
|
|||||||
|
|
||||||
.. attribute:: Response.meta
|
.. attribute:: Response.meta
|
||||||
|
|
||||||
A dict that contains arbitrary metadata for this response, similar to the
|
A shortcut to the :attr:`Request.meta` attribute of the
|
||||||
:attr:`Request.meta` attribute. See the :attr:`Request.meta` attribute for
|
:attr:`Response.request` object (i.e. ``self.request.meta``).
|
||||||
more info.
|
|
||||||
|
Unlike the :attr:`Response.request` attribute, the :attr:`Response.meta`
|
||||||
|
attribute is propagated along redirects and retries, so you will get
|
||||||
|
the original :attr:`Request.meta` sent from your spider.
|
||||||
|
|
||||||
|
.. seealso:: :attr:`Request.meta` attribute
|
||||||
|
|
||||||
.. attribute:: Response.flags
|
.. attribute:: Response.flags
|
||||||
|
|
||||||
|
@ -50,6 +50,8 @@ class RedirectMiddleware(object):
|
|||||||
if ttl and redirects <= self.max_redirect_times:
|
if ttl and redirects <= self.max_redirect_times:
|
||||||
redirected.meta['redirect_times'] = redirects
|
redirected.meta['redirect_times'] = redirects
|
||||||
redirected.meta['redirect_ttl'] = ttl - 1
|
redirected.meta['redirect_ttl'] = ttl - 1
|
||||||
|
redirected.meta['redirect_urls'] = request.meta.get('redirect_urls', []) + \
|
||||||
|
[request.url]
|
||||||
redirected.dont_filter = request.dont_filter
|
redirected.dont_filter = request.dont_filter
|
||||||
redirected.priority = request.priority + self.priority_adjust
|
redirected.priority = request.priority + self.priority_adjust
|
||||||
log.msg("Redirecting (%s) to %s from %s" % (reason, redirected, request),
|
log.msg("Redirecting (%s) to %s from %s" % (reason, redirected, request),
|
||||||
|
@ -13,23 +13,24 @@ from scrapy.http.common import deprecated_setter
|
|||||||
|
|
||||||
class Response(object_ref):
|
class Response(object_ref):
|
||||||
|
|
||||||
__slots__ = ['_url', 'headers', 'status', '_body', 'request', '_meta', \
|
__slots__ = ['_url', 'headers', 'status', '_body', 'request', \
|
||||||
'flags', '__weakref__']
|
'flags', '__weakref__']
|
||||||
|
|
||||||
def __init__(self, url, status=200, headers=None, body='', meta=None, flags=None):
|
def __init__(self, url, status=200, headers=None, body='', flags=None, request=None):
|
||||||
self.headers = Headers(headers or {})
|
self.headers = Headers(headers or {})
|
||||||
self.status = int(status)
|
self.status = int(status)
|
||||||
self._set_body(body)
|
self._set_body(body)
|
||||||
self._set_url(url)
|
self._set_url(url)
|
||||||
self.request = None
|
self.request = request
|
||||||
self.flags = [] if flags is None else list(flags)
|
self.flags = [] if flags is None else list(flags)
|
||||||
self._meta = dict(meta) if meta else None
|
|
||||||
|
|
||||||
@property
|
@property
|
||||||
def meta(self):
|
def meta(self):
|
||||||
if self._meta is None:
|
try:
|
||||||
self._meta = {}
|
return self.request.meta
|
||||||
return self._meta
|
except AttributeError:
|
||||||
|
raise AttributeError("Response.meta not available, this response " \
|
||||||
|
"is not tied to any request")
|
||||||
|
|
||||||
def _get_url(self):
|
def _get_url(self):
|
||||||
return self._url
|
return self._url
|
||||||
@ -61,7 +62,7 @@ class Response(object_ref):
|
|||||||
body = property(_get_body, deprecated_setter(_set_body, 'body'))
|
body = property(_get_body, deprecated_setter(_set_body, 'body'))
|
||||||
|
|
||||||
def __repr__(self):
|
def __repr__(self):
|
||||||
attrs = ['url', 'status', 'body', 'headers', 'meta', 'flags']
|
attrs = ['url', 'status', 'body', 'headers', 'request', 'flags']
|
||||||
args = ", ".join(["%s=%r" % (a, getattr(self, a)) for a in attrs])
|
args = ", ".join(["%s=%r" % (a, getattr(self, a)) for a in attrs])
|
||||||
return "%s(%s)" % (self.__class__.__name__, args)
|
return "%s(%s)" % (self.__class__.__name__, args)
|
||||||
|
|
||||||
@ -76,7 +77,7 @@ class Response(object_ref):
|
|||||||
"""Create a new Response with the same attributes except for those
|
"""Create a new Response with the same attributes except for those
|
||||||
given new values.
|
given new values.
|
||||||
"""
|
"""
|
||||||
for x in ['url', 'status', 'headers', 'body', 'meta', 'flags']:
|
for x in ['url', 'status', 'headers', 'body', 'request', 'flags']:
|
||||||
kwargs.setdefault(x, getattr(self, x))
|
kwargs.setdefault(x, getattr(self, x))
|
||||||
cls = kwargs.pop('cls', self.__class__)
|
cls = kwargs.pop('cls', self.__class__)
|
||||||
return cls(*args, **kwargs)
|
return cls(*args, **kwargs)
|
||||||
|
@ -26,12 +26,11 @@ class TextResponse(Response):
|
|||||||
|
|
||||||
__slots__ = ['_encoding', '_cached_benc', '_cached_ubody']
|
__slots__ = ['_encoding', '_cached_benc', '_cached_ubody']
|
||||||
|
|
||||||
def __init__(self, url, status=200, headers=None, body=None, meta=None, \
|
def __init__(self, *args, **kwargs):
|
||||||
flags=None, encoding=None):
|
self._encoding = kwargs.pop('encoding', None)
|
||||||
self._encoding = encoding
|
|
||||||
self._cached_benc = None
|
self._cached_benc = None
|
||||||
self._cached_ubody = None
|
self._cached_ubody = None
|
||||||
super(TextResponse, self).__init__(url, status, headers, body, meta, flags)
|
super(TextResponse, self).__init__(*args, **kwargs)
|
||||||
|
|
||||||
def _set_url(self, url):
|
def _set_url(self, url):
|
||||||
if isinstance(url, unicode):
|
if isinstance(url, unicode):
|
||||||
|
@ -145,5 +145,17 @@ class RedirectMiddlewareTest(unittest.TestCase):
|
|||||||
assert isinstance(req, Request)
|
assert isinstance(req, Request)
|
||||||
self.assertRaises(IgnoreRequest, self.mw.process_response, req, rsp, self.spider)
|
self.assertRaises(IgnoreRequest, self.mw.process_response, req, rsp, self.spider)
|
||||||
|
|
||||||
|
def test_redirect_urls(self):
|
||||||
|
req1 = Request('http://scrapytest.org/first')
|
||||||
|
rsp1 = Response('http://scrapytest.org/first', headers={'Location': '/redirected'}, status=302)
|
||||||
|
req2 = self.mw.process_response(req1, rsp1, self.spider)
|
||||||
|
rsp2 = Response('http://scrapytest.org/redirected', headers={'Location': '/redirected2'}, status=302)
|
||||||
|
req3 = self.mw.process_response(req2, rsp2, self.spider)
|
||||||
|
|
||||||
|
self.assertEqual(req2.url, 'http://scrapytest.org/redirected')
|
||||||
|
self.assertEqual(req2.meta['redirect_urls'], ['http://scrapytest.org/first'])
|
||||||
|
self.assertEqual(req3.url, 'http://scrapytest.org/redirected2')
|
||||||
|
self.assertEqual(req3.meta['redirect_urls'], ['http://scrapytest.org/first', 'http://scrapytest.org/redirected'])
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
unittest.main()
|
unittest.main()
|
||||||
|
@ -1,7 +1,7 @@
|
|||||||
import unittest
|
import unittest
|
||||||
import weakref
|
import weakref
|
||||||
|
|
||||||
from scrapy.http import Response, TextResponse, HtmlResponse, XmlResponse, Headers
|
from scrapy.http import Request, Response, TextResponse, HtmlResponse, XmlResponse, Headers
|
||||||
from scrapy.utils.encoding import resolve_encoding
|
from scrapy.utils.encoding import resolve_encoding
|
||||||
|
|
||||||
|
|
||||||
@ -26,15 +26,11 @@ class BaseResponseTest(unittest.TestCase):
|
|||||||
|
|
||||||
assert isinstance(r.headers, Headers)
|
assert isinstance(r.headers, Headers)
|
||||||
self.assertEqual(r.headers, {})
|
self.assertEqual(r.headers, {})
|
||||||
self.assertEqual(r.meta, {})
|
|
||||||
|
|
||||||
meta = {"lala": "lolo"}
|
|
||||||
headers = {"caca": "coco"}
|
headers = {"caca": "coco"}
|
||||||
body = "a body"
|
body = "a body"
|
||||||
r = self.response_class("http://www.example.com", meta=meta, headers=headers, body=body)
|
r = self.response_class("http://www.example.com", headers=headers, body=body)
|
||||||
|
|
||||||
assert r.meta is not meta
|
|
||||||
self.assertEqual(r.meta, meta)
|
|
||||||
assert r.headers is not headers
|
assert r.headers is not headers
|
||||||
self.assertEqual(r.headers["caca"], "coco")
|
self.assertEqual(r.headers["caca"], "coco")
|
||||||
|
|
||||||
@ -48,17 +44,12 @@ class BaseResponseTest(unittest.TestCase):
|
|||||||
"""Test Response copy"""
|
"""Test Response copy"""
|
||||||
|
|
||||||
r1 = self.response_class("http://www.example.com", body="Some body")
|
r1 = self.response_class("http://www.example.com", body="Some body")
|
||||||
r1.meta['foo'] = 'bar'
|
|
||||||
r1.flags.append('cached')
|
r1.flags.append('cached')
|
||||||
r2 = r1.copy()
|
r2 = r1.copy()
|
||||||
|
|
||||||
self.assertEqual(r1.status, r2.status)
|
self.assertEqual(r1.status, r2.status)
|
||||||
self.assertEqual(r1.body, r2.body)
|
self.assertEqual(r1.body, r2.body)
|
||||||
|
|
||||||
# make sure meta dict is shallow copied
|
|
||||||
assert r1.meta is not r2.meta, "meta must be a shallow copy, not identical"
|
|
||||||
self.assertEqual(r1.meta, r2.meta)
|
|
||||||
|
|
||||||
# make sure flags list is shallow copied
|
# make sure flags list is shallow copied
|
||||||
assert r1.flags is not r2.flags, "flags must be a shallow copy, not identical"
|
assert r1.flags is not r2.flags, "flags must be a shallow copy, not identical"
|
||||||
self.assertEqual(r1.flags, r2.flags)
|
self.assertEqual(r1.flags, r2.flags)
|
||||||
@ -67,6 +58,12 @@ class BaseResponseTest(unittest.TestCase):
|
|||||||
assert r1.headers is not r2.headers, "headers must be a shallow copy, not identical"
|
assert r1.headers is not r2.headers, "headers must be a shallow copy, not identical"
|
||||||
self.assertEqual(r1.headers, r2.headers)
|
self.assertEqual(r1.headers, r2.headers)
|
||||||
|
|
||||||
|
def test_copy_meta(self):
|
||||||
|
req = Request("http://www.example.com")
|
||||||
|
req.meta['foo'] = 'bar'
|
||||||
|
r1 = self.response_class("http://www.example.com", body="Some body", request=req)
|
||||||
|
assert r1.meta is req.meta
|
||||||
|
|
||||||
def test_copy_inherited_classes(self):
|
def test_copy_inherited_classes(self):
|
||||||
"""Test Response children copies preserve their class"""
|
"""Test Response children copies preserve their class"""
|
||||||
|
|
||||||
@ -90,10 +87,9 @@ class BaseResponseTest(unittest.TestCase):
|
|||||||
self.assertEqual((r1.headers, r2.headers), ({}, hdrs))
|
self.assertEqual((r1.headers, r2.headers), ({}, hdrs))
|
||||||
|
|
||||||
# Empty attributes (which may fail if not compared properly)
|
# Empty attributes (which may fail if not compared properly)
|
||||||
r3 = self.response_class("http://www.example.com", meta={'a': 1}, flags=['cached'])
|
r3 = self.response_class("http://www.example.com", flags=['cached'])
|
||||||
r4 = r3.replace(body='', meta={}, flags=[])
|
r4 = r3.replace(body='', flags=[])
|
||||||
self.assertEqual(r4.body, '')
|
self.assertEqual(r4.body, '')
|
||||||
self.assertEqual(r4.meta, {})
|
|
||||||
self.assertEqual(r4.flags, [])
|
self.assertEqual(r4.flags, [])
|
||||||
|
|
||||||
def test_weakref_slots(self):
|
def test_weakref_slots(self):
|
||||||
|
Loading…
x
Reference in New Issue
Block a user