mirror of https://github.com/scrapy/scrapy.git synced 2025-03-15 10:51:48 +00:00

Get server IP address for HTTP/1.1 responses

Eugenio Lacuesta 2019-08-05 11:39:07 -03:00
parent 8b8df31961
commit 80925ab845
No known key found for this signature in database
GPG Key ID: DA3EF2D0913E9810
5 changed files with 54 additions and 10 deletions

View File

@@ -34,7 +34,7 @@ Request objects
:type url: string
:param callback: the function that will be called with the response of this
-request (once its downloaded) as its first parameter. For more information
+request (once it's downloaded) as its first parameter. For more information
see :ref:`topics-request-response-ref-request-callback-arguments` below.
If a Request doesn't specify a callback, the spider's
:meth:`~scrapy.spiders.Spider.parse` method will be used.
@@ -611,6 +611,12 @@ Response objects
This represents the :class:`Request` that generated this response.
:type request: :class:`Request` object
+:param ip_address: The IP address of the server from which the Response originated.
+:type ip_address: :class:`ipaddress.IPv4Address` object
+.. FIXME: Add ipaddress.IPv6Address once it's supported
.. attribute:: Response.url
A string containing the URL of the response.
@@ -679,6 +685,10 @@ Response objects
they're shown on the string representation of the Response (`__str__`
method) which is used by the engine for logging.
+.. attribute:: Response.ip_address
+The IP address of the server from which the Response originated.
.. method:: Response.copy()
Returns a new Response which is a copy of this Response.
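
The attribute documented above can be read directly in a spider callback. A minimal sketch (spider name, URL, and item fields are placeholders, not part of this commit):

    import scrapy

    class IPLoggingSpider(scrapy.Spider):
        name = "ip_logging"
        start_urls = ["https://example.org"]

        def parse(self, response):
            # ip_address is an ipaddress.IPv4Address, or None when the download
            # handler did not record it (e.g. the bodiless-response shortcut
            # further down in this commit).
            ip = response.ip_address
            self.logger.info("%s served from %s", response.url, ip)
            yield {"url": response.url, "ip": str(ip) if ip is not None else None}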

View File

@@ -172,7 +172,7 @@ class Downloader(object):
return response
dfd.addCallback(_downloaded)
-# 3. After response arrives, remove the request from transferring
+# 3. After response arrives, remove the request from transferring
# state to free up the transferring slot so it can be used by the
# following requests (perhaps those which came from the downloader
# middleware itself)
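
As a rough illustration of the bookkeeping that the comment above describes (not Scrapy's actual Downloader code), a request could be tracked and its slot freed in a deferred callback like this:

    from twisted.internet.defer import Deferred

    class Slot:
        """Tracks requests that are currently being transferred."""
        def __init__(self):
            self.transferring = set()

    def enqueue(slot, request, dfd):
        slot.transferring.add(request)

        def _downloaded(response):
            # Free the transferring slot once the response arrives, so the next
            # queued request (possibly one injected by a downloader middleware)
            # can use it.
            slot.transferring.discard(request)
            return response

        return dfd.addCallback(_downloaded)

    slot, dfd = Slot(), Deferred()
    enqueue(slot, "request-1", dfd)
    dfd.callback("response")   # fires the callback chain; the slot is freed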

View File

@@ -4,6 +4,7 @@ import logging
import re
import warnings
from io import BytesIO
+from ipaddress import ip_address
from time import time
from urllib.parse import urldefrag
@@ -382,7 +383,7 @@ class ScrapyAgent(object):
def _cb_bodyready(self, txresponse, request):
# deliverBody hangs for responses without body
if txresponse.length == 0:
-return txresponse, b'', None
+return txresponse, b'', None, None
maxsize = request.meta.get('download_maxsize', self._maxsize)
warnsize = request.meta.get('download_warnsize', self._warnsize)
@@ -418,11 +419,11 @@ class ScrapyAgent(object):
return d
def _cb_bodydone(self, result, request, url):
-txresponse, body, flags = result
+txresponse, body, flags, ip_address = result
status = int(txresponse.code)
headers = Headers(txresponse.headers.getAllRawHeaders())
respcls = responsetypes.from_args(headers=headers, url=url, body=body)
-return respcls(url=url, status=status, headers=headers, body=body, flags=flags)
+return respcls(url=url, status=status, headers=headers, body=body, flags=flags, ip_address=ip_address)
@implementer(IBodyProducer)
@@ -456,6 +457,11 @@ class _ResponseReader(protocol.Protocol):
self._fail_on_dataloss_warned = False
self._reached_warnsize = False
self._bytes_received = 0
+self._ip_address = None
+def connectionMade(self):
+if self._ip_address is None:
+self._ip_address = ip_address(self.transport._producer.getPeer().host)
def dataReceived(self, bodyBytes):
# This maybe called several times after cancel was called with buffered data.
@@ -488,16 +494,16 @@ class _ResponseReader(protocol.Protocol):
body = self._bodybuf.getvalue()
if reason.check(ResponseDone):
-self._finished.callback((self._txresponse, body, None))
+self._finished.callback((self._txresponse, body, None, self._ip_address))
return
if reason.check(PotentialDataLoss):
-self._finished.callback((self._txresponse, body, ['partial']))
+self._finished.callback((self._txresponse, body, ['partial'], self._ip_address))
return
if reason.check(ResponseFailed) and any(r.check(_DataLoss) for r in reason.value.reasons):
if not self._fail_on_dataloss:
-self._finished.callback((self._txresponse, body, ['dataloss']))
+self._finished.callback((self._txresponse, body, ['dataloss'], self._ip_address))
return
elif not self._fail_on_dataloss_warned:
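
For context on the ip_address() call added in connectionMade above: it is the standard-library factory that turns the numeric peer host string reported by the transport into an address object, returning either IPv4Address or IPv6Address (hence the IPv6 FIXME in the docs hunk). A small stand-alone sketch:

    from ipaddress import ip_address

    v4 = ip_address("127.0.0.1")
    v6 = ip_address("::1")
    print(type(v4).__name__)   # IPv4Address
    print(type(v6).__name__)   # IPv6Address

    # A hostname (rather than a numeric address) raises ValueError, so the
    # handler passes the peer address from getPeer() instead of the request's
    # hostname.
    try:
        ip_address("example.org")
    except ValueError:
        pass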

View File

@@ -17,13 +17,14 @@ from scrapy.utils.trackref import object_ref
class Response(object_ref):
-def __init__(self, url, status=200, headers=None, body=b'', flags=None, request=None):
+def __init__(self, url, status=200, headers=None, body=b'', flags=None, request=None, ip_address=None):
self.headers = Headers(headers or {})
self.status = int(status)
self._set_body(body)
self._set_url(url)
self.request = request
self.flags = [] if flags is None else list(flags)
+self.ip_address = ip_address
@property
def meta(self):
@@ -76,7 +77,7 @@ class Response(object_ref):
"""Create a new Response with the same attributes except for those
given new values.
"""
-for x in ['url', 'status', 'headers', 'body', 'request', 'flags']:
+for x in ['url', 'status', 'headers', 'body', 'request', 'flags', 'ip_address']:
kwargs.setdefault(x, getattr(self, x))
cls = kwargs.pop('cls', self.__class__)
return cls(*args, **kwargs)
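
Because 'ip_address' is now included in the attribute list copied by replace(), derived responses keep the original address unless it is overridden. A minimal sketch (the address is an illustrative placeholder):

    from ipaddress import ip_address
    from scrapy.http import Response

    original = Response(
        "https://example.org",
        status=200,
        ip_address=ip_address("192.0.2.1"),   # placeholder address
    )
    copy = original.replace(status=404)
    assert copy.ip_address == original.ip_address   # carried over by replace()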

View File

@@ -1,5 +1,7 @@
import json
import logging
+from ipaddress import IPv4Address
+from urllib.parse import urlparse
from testfixtures import LogCapture
from twisted.internet import defer
@@ -308,3 +310,28 @@ with multiples lines
self.assertIn("[callback] status 201", str(log))
self.assertIn("[errback] status 404", str(log))
self.assertIn("[errback] status 500", str(log))
+@defer.inlineCallbacks
+def test_dns_server_ip_address(self):
+from socket import gethostbyname
+crawler = self.runner.create_crawler(SingleRequestSpider)
+url = 'https://example.org'
+yield crawler.crawl(seed=url)
+ip_address = crawler.spider.meta['responses'][0].ip_address
+self.assertIsInstance(ip_address, IPv4Address)
+self.assertEqual(str(ip_address), gethostbyname(urlparse(url).netloc))
+crawler = self.runner.create_crawler(SingleRequestSpider)
+url = self.mockserver.url('/status?n=200')
+yield crawler.crawl(seed=url, mockserver=self.mockserver)
+ip_address = crawler.spider.meta['responses'][0].ip_address
+self.assertIsNone(ip_address)
+crawler = self.runner.create_crawler(SingleRequestSpider)
+url = self.mockserver.url('/echo?body=test')
+expected_netloc, _ = urlparse(url).netloc.split(':')
+yield crawler.crawl(seed=url, mockserver=self.mockserver)
+ip_address = crawler.spider.meta['responses'][0].ip_address
+self.assertIsInstance(ip_address, IPv4Address)
+self.assertEqual(str(ip_address), gethostbyname(expected_netloc))
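
The expected value in the assertions above comes from resolving the request's host with the standard library; roughly (the URL below is a placeholder in the mockserver's host:port style):

    from socket import gethostbyname
    from urllib.parse import urlparse

    url = "http://127.0.0.1:8998/echo?body=test"
    netloc = urlparse(url).netloc          # '127.0.0.1:8998'
    host = netloc.split(":")[0]            # drop the port, as the test does
    print(gethostbyname(host))             # resolved IPv4 address as a string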