1
0
mirror of https://github.com/scrapy/scrapy.git synced 2025-02-06 10:24:24 +00:00
scrapy/tests/test_downloadermiddleware_robotstxt.py
2024-12-12 20:32:25 +05:00

256 lines
9.6 KiB
Python

from unittest import mock
from twisted.internet import error, reactor
from twisted.internet.defer import Deferred, DeferredList, maybeDeferred
from twisted.python import failure
from twisted.trial import unittest
from scrapy.downloadermiddlewares.robotstxt import RobotsTxtMiddleware
from scrapy.downloadermiddlewares.robotstxt import logger as mw_module_logger
from scrapy.exceptions import IgnoreRequest, NotConfigured
from scrapy.http import Request, Response, TextResponse
from scrapy.http.request import NO_CALLBACK
from scrapy.settings import Settings
from tests.test_robotstxt_interface import rerp_available
class RobotsTxtMiddlewareTest(unittest.TestCase):
    """Tests for ``RobotsTxtMiddleware`` driven through a mocked crawler.

    ``crawler.engine.download`` is replaced by a mock whose side effect
    returns a ``Deferred`` carrying the canned robots.txt response (or
    failure) that each test needs.
    """

    def setUp(self):
        self.crawler = mock.MagicMock()
        self.crawler.settings = Settings()
        self.crawler.engine.download = mock.MagicMock()

    def tearDown(self):
        del self.crawler

    def _crawler_answering(self, response):
        """Return the shared crawler with ROBOTSTXT_OBEY enabled and every
        download answered with *response*.

        The response is delivered through ``reactor.callFromThread`` instead
        of firing the deferred synchronously inside ``download``.
        """
        crawler = self.crawler
        crawler.settings.set("ROBOTSTXT_OBEY", True)

        def return_response(request):
            deferred = Deferred()
            reactor.callFromThread(deferred.callback, response)
            return deferred

        crawler.engine.download.side_effect = return_response
        return crawler

    def test_robotstxt_settings(self):
        """Building the middleware with default settings plus a custom
        USER_AGENT raises ``NotConfigured``."""
        self.crawler.settings = Settings()
        self.crawler.settings.set("USER_AGENT", "CustomAgent")
        self.assertRaises(NotConfigured, RobotsTxtMiddleware, self.crawler)

    def _get_successful_crawler(self):
        """Crawler whose robots.txt download yields a well-formed rule set."""
        ROBOTS = """
User-Agent: *
Disallow: /admin/
Disallow: /static/
# taken from https://en.wikipedia.org/robots.txt
Disallow: /wiki/K%C3%A4ytt%C3%A4j%C3%A4:
Disallow: /wiki/Käyttäjä:
User-Agent: UnicödeBöt
Disallow: /some/randome/page.html
""".encode()
        response = TextResponse("http://site.local/robots.txt", body=ROBOTS)
        return self._crawler_answering(response)

    def test_robotstxt(self):
        """Allowed URLs pass, disallowed ones (including percent-encoded and
        raw non-ASCII paths) are ignored, and robots.txt itself is fetched."""
        middleware = RobotsTxtMiddleware(self._get_successful_crawler())
        return DeferredList(
            [
                self.assertNotIgnored(Request("http://site.local/allowed"), middleware),
                # Runs right after the first process_request call above, so
                # the robots.txt download has already been recorded.
                maybeDeferred(self.assertRobotsTxtRequested, "http://site.local"),
                self.assertIgnored(Request("http://site.local/admin/main"), middleware),
                self.assertIgnored(Request("http://site.local/static/"), middleware),
                self.assertIgnored(
                    Request("http://site.local/wiki/K%C3%A4ytt%C3%A4j%C3%A4:"),
                    middleware,
                ),
                self.assertIgnored(
                    Request("http://site.local/wiki/Käyttäjä:"), middleware
                ),
            ],
            fireOnOneErrback=True,
        )

    def test_robotstxt_ready_parser(self):
        """A second request to the same site, issued after the first one has
        completed, is still allowed."""
        middleware = RobotsTxtMiddleware(self._get_successful_crawler())
        d = self.assertNotIgnored(Request("http://site.local/allowed"), middleware)
        d.addCallback(
            lambda _: self.assertNotIgnored(
                Request("http://site.local/allowed"), middleware
            )
        )
        return d

    def test_robotstxt_meta(self):
        """Requests carrying ``dont_obey_robotstxt`` in meta are never
        ignored, even for disallowed paths."""
        middleware = RobotsTxtMiddleware(self._get_successful_crawler())
        meta = {"dont_obey_robotstxt": True}
        return DeferredList(
            [
                self.assertNotIgnored(
                    Request("http://site.local/allowed", meta=meta), middleware
                ),
                self.assertNotIgnored(
                    Request("http://site.local/admin/main", meta=meta), middleware
                ),
                self.assertNotIgnored(
                    Request("http://site.local/static/", meta=meta), middleware
                ),
            ],
            fireOnOneErrback=True,
        )

    def _get_garbage_crawler(self):
        """Crawler whose robots.txt download yields non-text binary bytes."""
        response = Response(
            "http://site.local/robots.txt", body=b"GIF89a\xd3\x00\xfe\x00\xa2"
        )
        return self._crawler_answering(response)

    def test_robotstxt_garbage(self):
        # garbage response should be discarded, equal 'allow all'
        middleware = RobotsTxtMiddleware(self._get_garbage_crawler())
        return DeferredList(
            [
                self.assertNotIgnored(Request("http://site.local"), middleware),
                self.assertNotIgnored(Request("http://site.local/allowed"), middleware),
                self.assertNotIgnored(
                    Request("http://site.local/admin/main"), middleware
                ),
                self.assertNotIgnored(Request("http://site.local/static/"), middleware),
            ],
            fireOnOneErrback=True,
        )

    def _get_emptybody_crawler(self):
        """Crawler whose robots.txt download yields a response with no body."""
        response = Response("http://site.local/robots.txt")
        return self._crawler_answering(response)

    def test_robotstxt_empty_response(self):
        # empty response should equal 'allow all'
        middleware = RobotsTxtMiddleware(self._get_emptybody_crawler())
        return DeferredList(
            [
                self.assertNotIgnored(Request("http://site.local/allowed"), middleware),
                self.assertNotIgnored(
                    Request("http://site.local/admin/main"), middleware
                ),
                self.assertNotIgnored(Request("http://site.local/static/"), middleware),
            ],
            fireOnOneErrback=True,
        )

    def test_robotstxt_error(self):
        """A failed robots.txt download is routed through ``_logerror``."""
        self.crawler.settings.set("ROBOTSTXT_OBEY", True)
        err = error.DNSLookupError("Robotstxt address not found")

        def return_failure(request):
            deferred = Deferred()
            reactor.callFromThread(deferred.errback, failure.Failure(err))
            return deferred

        self.crawler.engine.download.side_effect = return_failure

        middleware = RobotsTxtMiddleware(self.crawler)
        # Wrap rather than replace, so the real handler still runs while the
        # call is recorded.
        middleware._logerror = mock.MagicMock(side_effect=middleware._logerror)
        deferred = middleware.process_request(Request("http://site.local"), None)
        deferred.addCallback(lambda _: self.assertTrue(middleware._logerror.called))
        return deferred

    def test_robotstxt_immediate_error(self):
        """A download deferred that has already failed when it is returned
        does not cause the request to be ignored."""
        self.crawler.settings.set("ROBOTSTXT_OBEY", True)
        err = error.DNSLookupError("Robotstxt address not found")

        def immediate_failure(request):
            deferred = Deferred()
            deferred.errback(failure.Failure(err))
            return deferred

        self.crawler.engine.download.side_effect = immediate_failure

        middleware = RobotsTxtMiddleware(self.crawler)
        return self.assertNotIgnored(Request("http://site.local"), middleware)

    def test_ignore_robotstxt_request(self):
        """An ``IgnoreRequest`` failure on the robots.txt download neither
        blocks the request nor produces an error log entry."""
        self.crawler.settings.set("ROBOTSTXT_OBEY", True)

        def ignore_request(request):
            deferred = Deferred()
            reactor.callFromThread(deferred.errback, failure.Failure(IgnoreRequest()))
            return deferred

        self.crawler.engine.download.side_effect = ignore_request

        middleware = RobotsTxtMiddleware(self.crawler)
        # Patch the module logger for this test only. Assigning a mock
        # directly would leave ``logger.error`` replaced for every test that
        # runs afterwards. The patch must outlive this method because the
        # assertion runs from a deferred callback, hence start()/addCleanup
        # instead of a ``with`` block.
        patcher = mock.patch.object(mw_module_logger, "error")
        mocked_error = patcher.start()
        self.addCleanup(patcher.stop)

        d = self.assertNotIgnored(Request("http://site.local/allowed"), middleware)
        d.addCallback(lambda _: self.assertFalse(mocked_error.called))
        return d

    def test_robotstxt_user_agent_setting(self):
        """ROBOTSTXT_USER_AGENT, not USER_AGENT, is the agent string passed
        to the parser's ``allowed`` check."""
        crawler = self._get_successful_crawler()
        crawler.settings.set("ROBOTSTXT_USER_AGENT", "Examplebot")
        crawler.settings.set("USER_AGENT", "Mozilla/5.0 (X11; Linux x86_64)")
        middleware = RobotsTxtMiddleware(crawler)
        rp = mock.MagicMock(return_value=True)
        middleware.process_request_2(rp, Request("http://site.local/allowed"), None)
        rp.allowed.assert_called_once_with("http://site.local/allowed", "Examplebot")

    def test_robotstxt_local_file(self):
        """``data:`` and ``file:`` requests get a falsy result straight away,
        while an ``http:`` request yields a ``Deferred``."""
        middleware = RobotsTxtMiddleware(self._get_emptybody_crawler())
        assert not middleware.process_request(
            Request("data:text/plain,Hello World data"), None
        )
        assert not middleware.process_request(
            Request("file:///tests/sample_data/test_site/nothinghere.html"), None
        )
        assert isinstance(
            middleware.process_request(Request("http://site.local/allowed"), None),
            Deferred,
        )

    def assertNotIgnored(self, request, middleware):
        """Return a deferred that passes when ``process_request`` results in
        ``None`` for *request*."""
        spider = None  # not actually used
        dfd = maybeDeferred(middleware.process_request, request, spider)
        dfd.addCallback(self.assertIsNone)
        return dfd

    def assertIgnored(self, request, middleware):
        """Return a deferred that passes when ``process_request`` fails with
        ``IgnoreRequest`` for *request*."""
        spider = None  # not actually used
        return self.assertFailure(
            maybeDeferred(middleware.process_request, request, spider), IgnoreRequest
        )

    def assertRobotsTxtRequested(self, base_url):
        """Check that the first recorded download was for
        ``<base_url>/robots.txt`` and used ``NO_CALLBACK`` as its callback."""
        calls = self.crawler.engine.download.call_args_list
        # First call -> positional args -> first positional arg (the Request).
        request = calls[0][0][0]
        self.assertEqual(request.url, f"{base_url}/robots.txt")
        self.assertEqual(request.callback, NO_CALLBACK)
class RobotsTxtMiddlewareWithRerpTest(RobotsTxtMiddlewareTest):
    """Repeat the inherited middleware tests with ROBOTSTXT_PARSER pointed
    at ``scrapy.robotstxt.RerpRobotParser``."""

    # Evaluated once at class-definition time: the ``skip`` attribute is only
    # defined when ``rerp_available()`` reports the parser as missing.
    if not rerp_available():
        skip = "Rerp parser is not installed"

    def setUp(self):
        super().setUp()
        settings = self.crawler.settings
        settings.set("ROBOTSTXT_PARSER", "scrapy.robotstxt.RerpRobotParser")