Get server IP address for HTTP/1.1 responses
commit 80925ab845
parent 8b8df31961
@@ -34,7 +34,7 @@ Request objects
     :type url: string
 
     :param callback: the function that will be called with the response of this
-        request (once its downloaded) as its first parameter. For more information
+        request (once it's downloaded) as its first parameter. For more information
         see :ref:`topics-request-response-ref-request-callback-arguments` below.
         If a Request doesn't specify a callback, the spider's
         :meth:`~scrapy.spiders.Spider.parse` method will be used.
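
A minimal sketch of the callback mechanism described above (not part of the diff; the spider class, method name and URL are illustrative only):

    import scrapy

    class ExampleSpider(scrapy.Spider):
        name = 'example'

        def start_requests(self):
            # The callback receives the downloaded Response as its first argument.
            yield scrapy.Request('https://example.org', callback=self.parse_item)

        def parse_item(self, response):
            self.logger.info('Got %s (%d bytes)', response.url, len(response.body))
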
@@ -611,6 +611,12 @@ Response objects
         This represents the :class:`Request` that generated this response.
     :type request: :class:`Request` object
 
+    :param ip_address: The IP address of the server from which the Response originated.
+    :type ip_address: :class:`ipaddress.IPv4Address` object
+
+    .. FIXME: Add ipaddress.IPv6Address once it's supported
+
+
 .. attribute:: Response.url
 
     A string containing the URL of the response.
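
A minimal sketch of the new constructor parameter (not part of the diff): in normal crawls the HTTP/1.1 download handler fills in ip_address, but it can also be passed explicitly; the address below is hypothetical.

    from ipaddress import ip_address
    from scrapy.http import Response

    response = Response(
        url='https://example.org',
        status=200,
        body=b'',
        ip_address=ip_address('93.184.216.34'),  # hypothetical server address
    )
    print(response.ip_address)  # 93.184.216.34
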
@@ -679,6 +685,10 @@ Response objects
     they're shown on the string representation of the Response (`__str__`
     method) which is used by the engine for logging.
 
+.. attribute:: Response.ip_address
+
+    The IP address of the server from which the Response originated.
+
 .. method:: Response.copy()
 
     Returns a new Response which is a copy of this Response.
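
A short usage sketch of the new attribute from a spider callback (not part of the diff; the spider method is illustrative). Note that ip_address can be None when the download handler did not record an address:

    def parse(self, response):
        if response.ip_address is not None:
            self.logger.info('%s was served from %s', response.url, response.ip_address)
        else:
            self.logger.info('%s has no recorded server IP', response.url)
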
@@ -172,7 +172,7 @@ class Downloader(object):
             return response
         dfd.addCallback(_downloaded)
 
-        # 3. After response arrives,  remove the request from transferring
+        # 3. After response arrives, remove the request from transferring
         # state to free up the transferring slot so it can be used by the
         # following requests (perhaps those which came from the downloader
         # middleware itself)
@@ -4,6 +4,7 @@ import logging
 import re
 import warnings
 from io import BytesIO
+from ipaddress import ip_address
 from time import time
 from urllib.parse import urldefrag
 
@@ -382,7 +383,7 @@ class ScrapyAgent(object):
     def _cb_bodyready(self, txresponse, request):
         # deliverBody hangs for responses without body
         if txresponse.length == 0:
-            return txresponse, b'', None
+            return txresponse, b'', None, None
 
         maxsize = request.meta.get('download_maxsize', self._maxsize)
         warnsize = request.meta.get('download_warnsize', self._warnsize)
@@ -418,11 +419,11 @@ class ScrapyAgent(object):
         return d
 
     def _cb_bodydone(self, result, request, url):
-        txresponse, body, flags = result
+        txresponse, body, flags, ip_address = result
         status = int(txresponse.code)
         headers = Headers(txresponse.headers.getAllRawHeaders())
         respcls = responsetypes.from_args(headers=headers, url=url, body=body)
-        return respcls(url=url, status=status, headers=headers, body=body, flags=flags)
+        return respcls(url=url, status=status, headers=headers, body=body, flags=flags, ip_address=ip_address)
 
 
 @implementer(IBodyProducer)
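
To make the data flow in this hunk explicit: the deferred result grows from a 3-tuple to a 4-tuple, and the extra element is forwarded to the Response subclass as the new ip_address keyword. A standalone sketch with hypothetical values (not part of the diff):

    from ipaddress import ip_address

    txresponse = object()  # stand-in for the twisted.web response object normally held here

    # Shape of the result delivered to _cb_bodydone after this change:
    result = (txresponse, b'<html></html>', ['partial'], ip_address('93.184.216.34'))
    txresponse, body, flags, peer_ip = result

    # _cb_bodydone then passes peer_ip through as the ip_address keyword of the
    # Response subclass chosen by responsetypes.from_args().
    print(flags, peer_ip)  # ['partial'] 93.184.216.34
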
@@ -456,6 +457,11 @@ class _ResponseReader(protocol.Protocol):
         self._fail_on_dataloss_warned = False
         self._reached_warnsize = False
         self._bytes_received = 0
+        self._ip_address = None
 
+    def connectionMade(self):
+        if self._ip_address is None:
+            self._ip_address = ip_address(self.transport._producer.getPeer().host)
+
     def dataReceived(self, bodyBytes):
         # This maybe called several times after cancel was called with buffered data.
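
A standalone sketch of the conversion used in connectionMade() above (not part of the diff): Twisted reports the peer as an address object whose host attribute is a string, and the stdlib ipaddress.ip_address() helper turns that string into an IPv4Address or IPv6Address; the peer values here are hypothetical:

    from ipaddress import ip_address
    from twisted.internet.address import IPv4Address as TxIPv4Address

    # Roughly what transport.getPeer() returns for a TCP connection.
    peer = TxIPv4Address('TCP', '93.184.216.34', 443)

    # The reader stores the stdlib object, not Twisted's address wrapper.
    stored = ip_address(peer.host)
    print(type(stored).__name__, stored)  # IPv4Address 93.184.216.34
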
@@ -488,16 +494,16 @@ class _ResponseReader(protocol.Protocol):
 
         body = self._bodybuf.getvalue()
         if reason.check(ResponseDone):
-            self._finished.callback((self._txresponse, body, None))
+            self._finished.callback((self._txresponse, body, None, self._ip_address))
             return
 
         if reason.check(PotentialDataLoss):
-            self._finished.callback((self._txresponse, body, ['partial']))
+            self._finished.callback((self._txresponse, body, ['partial'], self._ip_address))
             return
 
         if reason.check(ResponseFailed) and any(r.check(_DataLoss) for r in reason.value.reasons):
             if not self._fail_on_dataloss:
-                self._finished.callback((self._txresponse, body, ['dataloss']))
+                self._finished.callback((self._txresponse, body, ['dataloss'], self._ip_address))
                 return
 
             elif not self._fail_on_dataloss_warned:
@@ -17,13 +17,14 @@ from scrapy.utils.trackref import object_ref
 
 class Response(object_ref):
 
-    def __init__(self, url, status=200, headers=None, body=b'', flags=None, request=None):
+    def __init__(self, url, status=200, headers=None, body=b'', flags=None, request=None, ip_address=None):
         self.headers = Headers(headers or {})
         self.status = int(status)
         self._set_body(body)
         self._set_url(url)
         self.request = request
         self.flags = [] if flags is None else list(flags)
+        self.ip_address = ip_address
 
     @property
     def meta(self):
@@ -76,7 +77,7 @@ class Response(object_ref):
         """Create a new Response with the same attributes except for those
         given new values.
         """
-        for x in ['url', 'status', 'headers', 'body', 'request', 'flags']:
+        for x in ['url', 'status', 'headers', 'body', 'request', 'flags', 'ip_address']:
             kwargs.setdefault(x, getattr(self, x))
         cls = kwargs.pop('cls', self.__class__)
         return cls(*args, **kwargs)
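
A quick sketch of what this change buys (not part of the diff): Response.replace() now carries ip_address over to the new object unless it is explicitly overridden; the values are hypothetical:

    from ipaddress import ip_address
    from scrapy.http import Response

    original = Response('https://example.org', ip_address=ip_address('93.184.216.34'))
    copy = original.replace(status=304)
    print(copy.status, copy.ip_address)  # 304 93.184.216.34
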
@@ -1,5 +1,7 @@
 import json
 import logging
+from ipaddress import IPv4Address
+from urllib.parse import urlparse
 
 from testfixtures import LogCapture
 from twisted.internet import defer
@@ -308,3 +310,28 @@ with multiples lines
         self.assertIn("[callback] status 201", str(log))
         self.assertIn("[errback] status 404", str(log))
         self.assertIn("[errback] status 500", str(log))
+
+    @defer.inlineCallbacks
+    def test_dns_server_ip_address(self):
+        from socket import gethostbyname
+
+        crawler = self.runner.create_crawler(SingleRequestSpider)
+        url = 'https://example.org'
+        yield crawler.crawl(seed=url)
+        ip_address = crawler.spider.meta['responses'][0].ip_address
+        self.assertIsInstance(ip_address, IPv4Address)
+        self.assertEqual(str(ip_address), gethostbyname(urlparse(url).netloc))
+
+        crawler = self.runner.create_crawler(SingleRequestSpider)
+        url = self.mockserver.url('/status?n=200')
+        yield crawler.crawl(seed=url, mockserver=self.mockserver)
+        ip_address = crawler.spider.meta['responses'][0].ip_address
+        self.assertIsNone(ip_address)
+
+        crawler = self.runner.create_crawler(SingleRequestSpider)
+        url = self.mockserver.url('/echo?body=test')
+        expected_netloc, _ = urlparse(url).netloc.split(':')
+        yield crawler.crawl(seed=url, mockserver=self.mockserver)
+        ip_address = crawler.spider.meta['responses'][0].ip_address
+        self.assertIsInstance(ip_address, IPv4Address)
+        self.assertEqual(str(ip_address), gethostbyname(expected_netloc))