mirror of https://github.com/scrapy/scrapy.git synced 2025-03-15 10:51:48 +00:00

Get server IP address for HTTP/1.1 responses

Eugenio Lacuesta 2019-08-05 11:39:07 -03:00
parent 8b8df31961
commit 80925ab845
No known key found for this signature in database
GPG Key ID: DA3EF2D0913E9810
5 changed files with 54 additions and 10 deletions

View File

@@ -34,7 +34,7 @@ Request objects
:type url: string
:param callback: the function that will be called with the response of this
-request (once its downloaded) as its first parameter. For more information
+request (once it's downloaded) as its first parameter. For more information
see :ref:`topics-request-response-ref-request-callback-arguments` below.
If a Request doesn't specify a callback, the spider's
:meth:`~scrapy.spiders.Spider.parse` method will be used.
@@ -611,6 +611,12 @@ Response objects
This represents the :class:`Request` that generated this response.
:type request: :class:`Request` object
+:param ip_address: The IP address of the server from which the Response originated.
+:type ip_address: :class:`ipaddress.IPv4Address` object
+.. FIXME: Add ipaddress.IPv6Address once it's supported
.. attribute:: Response.url
A string containing the URL of the response.
@@ -679,6 +685,10 @@ Response objects
they're shown on the string representation of the Response (`__str__`
method) which is used by the engine for logging.
+.. attribute:: Response.ip_address
+The IP address of the server from which the Response originated.
.. method:: Response.copy()
Returns a new Response which is a copy of this Response.
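
The attribute documented above can be read directly in a spider callback. A minimal sketch (spider name, URL, and item fields are placeholders, not part of this commit):

    import scrapy

    class IPLoggingSpider(scrapy.Spider):
        name = "ip_logging"
        start_urls = ["https://example.org"]

        def parse(self, response):
            # ip_address is an ipaddress.IPv4Address, or None when the download
            # handler did not record it (e.g. the bodiless-response shortcut
            # further down in this commit).
            ip = response.ip_address
            self.logger.info("%s served from %s", response.url, ip)
            yield {"url": response.url, "ip": str(ip) if ip is not None else None}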

View File

@@ -172,7 +172,7 @@ class Downloader(object):
return response
dfd.addCallback(_downloaded)
-# 3. After response arrives, remove the request from transferring
+# 3. After response arrives, remove the request from transferring
# state to free up the transferring slot so it can be used by the
# following requests (perhaps those which came from the downloader
# middleware itself)
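
As a rough illustration of the bookkeeping that the comment above describes (not Scrapy's actual Downloader code), a request could be tracked and its slot freed in a deferred callback like this:

    from twisted.internet.defer import Deferred

    class Slot:
        """Tracks requests that are currently being transferred."""
        def __init__(self):
            self.transferring = set()

    def enqueue(slot, request, dfd):
        slot.transferring.add(request)

        def _downloaded(response):
            # Free the transferring slot once the response arrives, so the next
            # queued request (possibly one injected by a downloader middleware)
            # can use it.
            slot.transferring.discard(request)
            return response

        return dfd.addCallback(_downloaded)

    slot, dfd = Slot(), Deferred()
    enqueue(slot, "request-1", dfd)
    dfd.callback("response")   # fires the callback chain; the slot is freed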

View File

@@ -4,6 +4,7 @@ import logging
import re
import warnings
from io import BytesIO
+from ipaddress import ip_address
from time import time
from urllib.parse import urldefrag
@@ -382,7 +383,7 @@ class ScrapyAgent(object):
def _cb_bodyready(self, txresponse, request):
# deliverBody hangs for responses without body
if txresponse.length == 0:
-return txresponse, b'', None
+return txresponse, b'', None, None
maxsize = request.meta.get('download_maxsize', self._maxsize)
warnsize = request.meta.get('download_warnsize', self._warnsize)
@@ -418,11 +419,11 @@ class ScrapyAgent(object):
return d
def _cb_bodydone(self, result, request, url):
-txresponse, body, flags = result
+txresponse, body, flags, ip_address = result
status = int(txresponse.code)
headers = Headers(txresponse.headers.getAllRawHeaders())
respcls = responsetypes.from_args(headers=headers, url=url, body=body)
-return respcls(url=url, status=status, headers=headers, body=body, flags=flags)
+return respcls(url=url, status=status, headers=headers, body=body, flags=flags, ip_address=ip_address)
@implementer(IBodyProducer)
@@ -456,6 +457,11 @@ class _ResponseReader(protocol.Protocol):
self._fail_on_dataloss_warned = False
self._reached_warnsize = False
self._bytes_received = 0
+self._ip_address = None
+def connectionMade(self):
+if self._ip_address is None:
+self._ip_address = ip_address(self.transport._producer.getPeer().host)
def dataReceived(self, bodyBytes):
# This maybe called several times after cancel was called with buffered data.
@@ -488,16 +494,16 @@ class _ResponseReader(protocol.Protocol):
body = self._bodybuf.getvalue()
if reason.check(ResponseDone):
-self._finished.callback((self._txresponse, body, None))
+self._finished.callback((self._txresponse, body, None, self._ip_address))
return
if reason.check(PotentialDataLoss):
-self._finished.callback((self._txresponse, body, ['partial']))
+self._finished.callback((self._txresponse, body, ['partial'], self._ip_address))
return
if reason.check(ResponseFailed) and any(r.check(_DataLoss) for r in reason.value.reasons):
if not self._fail_on_dataloss:
-self._finished.callback((self._txresponse, body, ['dataloss']))
+self._finished.callback((self._txresponse, body, ['dataloss'], self._ip_address))
return
elif not self._fail_on_dataloss_warned:
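
For context on the ip_address() call added in connectionMade above: it is the standard-library factory that turns the numeric peer host string reported by the transport into an address object, returning either IPv4Address or IPv6Address (hence the IPv6 FIXME in the docs hunk). A small stand-alone sketch:

    from ipaddress import ip_address

    v4 = ip_address("127.0.0.1")
    v6 = ip_address("::1")
    print(type(v4).__name__)   # IPv4Address
    print(type(v6).__name__)   # IPv6Address

    # A hostname (rather than a numeric address) raises ValueError, so the
    # handler passes the peer address from getPeer() instead of the request's
    # hostname.
    try:
        ip_address("example.org")
    except ValueError:
        pass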

View File

@@ -17,13 +17,14 @@ from scrapy.utils.trackref import object_ref
class Response(object_ref):
-def __init__(self, url, status=200, headers=None, body=b'', flags=None, request=None):
+def __init__(self, url, status=200, headers=None, body=b'', flags=None, request=None, ip_address=None):
self.headers = Headers(headers or {})
self.status = int(status)
self._set_body(body)
self._set_url(url)
self.request = request
self.flags = [] if flags is None else list(flags)
+self.ip_address = ip_address
@property
def meta(self):
@@ -76,7 +77,7 @@ class Response(object_ref):
"""Create a new Response with the same attributes except for those
given new values.
"""
-for x in ['url', 'status', 'headers', 'body', 'request', 'flags']:
+for x in ['url', 'status', 'headers', 'body', 'request', 'flags', 'ip_address']:
kwargs.setdefault(x, getattr(self, x))
cls = kwargs.pop('cls', self.__class__)
return cls(*args, **kwargs)
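
Because 'ip_address' is now included in the attribute list copied by replace(), derived responses keep the original address unless it is overridden. A minimal sketch (the address is an illustrative placeholder):

    from ipaddress import ip_address
    from scrapy.http import Response

    original = Response(
        "https://example.org",
        status=200,
        ip_address=ip_address("192.0.2.1"),   # placeholder address
    )
    copy = original.replace(status=404)
    assert copy.ip_address == original.ip_address   # carried over by replace()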

View File

@@ -1,5 +1,7 @@
import json
import logging
+from ipaddress import IPv4Address
+from urllib.parse import urlparse
from testfixtures import LogCapture
from twisted.internet import defer
@@ -308,3 +310,28 @@ with multiples lines
self.assertIn("[callback] status 201", str(log))
self.assertIn("[errback] status 404", str(log))
self.assertIn("[errback] status 500", str(log))
+@defer.inlineCallbacks
+def test_dns_server_ip_address(self):
+from socket import gethostbyname
+crawler = self.runner.create_crawler(SingleRequestSpider)
+url = 'https://example.org'
+yield crawler.crawl(seed=url)
+ip_address = crawler.spider.meta['responses'][0].ip_address
+self.assertIsInstance(ip_address, IPv4Address)
+self.assertEqual(str(ip_address), gethostbyname(urlparse(url).netloc))
+crawler = self.runner.create_crawler(SingleRequestSpider)
+url = self.mockserver.url('/status?n=200')
+yield crawler.crawl(seed=url, mockserver=self.mockserver)
+ip_address = crawler.spider.meta['responses'][0].ip_address
+self.assertIsNone(ip_address)
+crawler = self.runner.create_crawler(SingleRequestSpider)
+url = self.mockserver.url('/echo?body=test')
+expected_netloc, _ = urlparse(url).netloc.split(':')
+yield crawler.crawl(seed=url, mockserver=self.mockserver)
+ip_address = crawler.spider.meta['responses'][0].ip_address
+self.assertIsInstance(ip_address, IPv4Address)
+self.assertEqual(str(ip_address), gethostbyname(expected_netloc))
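
The expected value in the assertions above comes from resolving the request's host with the standard library; roughly (the URL below is a placeholder in the mockserver's host:port style):

    from socket import gethostbyname
    from urllib.parse import urlparse

    url = "http://127.0.0.1:8998/echo?body=test"
    netloc = urlparse(url).netloc          # '127.0.0.1:8998'
    host = netloc.split(":")[0]            # drop the port, as the test does
    print(gethostbyname(host))             # resolved IPv4 address as a string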