# scrapy/tests/test_downloadermiddleware_robotstxt.py

from __future__ import absolute_import
import re
from twisted.internet import reactor, error
from twisted.internet.defer import Deferred
from twisted.python import failure
from twisted.trial import unittest
from scrapy.downloadermiddlewares.robotstxt import RobotsTxtMiddleware
from scrapy.exceptions import IgnoreRequest, NotConfigured
from scrapy.http import Request, Response
from scrapy.settings import Settings
from tests import mock
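# NOTE: `tests.mock` is assumed here to be scrapy's small compatibility shim
# that resolves to the stdlib `unittest.mock` where available and to the
# `mock` backport otherwise.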


class RobotsTxtMiddlewareTest(unittest.TestCase):

    def test_robotstxt(self):
        middleware = self._get_middleware()
        # robotstxt.py is a bit lax here: robots.txt is fetched
        # asynchronously, and only becomes available *after* the first
        # process_request completes, so the first process_request always
        # succeeds. The real assertions run from a deferred callback;
        # otherwise the mocked robots.txt download would fire only after
        # the assertRaises failure.
        self.assertNotIgnored(Request('http://site.local'), middleware)

        def test(r):
            self.assertNotIgnored(Request('http://site.local/allowed'), middleware)
            self.assertIgnored(Request('http://site.local/admin/main'), middleware)
            self.assertIgnored(Request('http://site.local/static/'), middleware)

        deferred = Deferred()
        deferred.addCallback(test)
        reactor.callFromThread(deferred.callback, None)
        return deferred
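
    # The dont_obey_robotstxt meta key lets a request opt out of robots.txt
    # enforcement, so even disallowed paths must pass through unfiltered.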
    def test_robotstxt_meta(self):
        meta = {'dont_obey_robotstxt': True}
        middleware = self._get_middleware()
        self.assertNotIgnored(Request('http://site.local', meta=meta), middleware)

        def test(r):
            self.assertNotIgnored(Request('http://site.local/allowed', meta=meta), middleware)
            self.assertNotIgnored(Request('http://site.local/admin/main', meta=meta), middleware)
            self.assertNotIgnored(Request('http://site.local/static/', meta=meta), middleware)

        deferred = Deferred()
        deferred.addCallback(test)
        reactor.callFromThread(deferred.callback, None)
        return deferred
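
    # Simulate the robots.txt download failing with a DNS error: the
    # middleware should report it through _logerror instead of crashing
    # or blocking subsequent requests.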
    def test_robotstxt_error(self):
        crawler = mock.MagicMock()
        crawler.settings = Settings()
        crawler.settings.set('ROBOTSTXT_OBEY', True)
        crawler.engine.download = mock.MagicMock()
        err = error.DNSLookupError('Robotstxt address not found')

        def return_failure(request, spider):
            deferred = Deferred()
            reactor.callFromThread(deferred.errback, failure.Failure(err))
            return deferred

        crawler.engine.download.side_effect = return_failure

        middleware = RobotsTxtMiddleware(crawler)
        middleware._logerror = mock.MagicMock()
        middleware.process_request(Request('http://site.local'), None)
        deferred = Deferred()
        # The deferred is fired with callback(), so the check must be a
        # callback (an errback would never run) and it asserts that the
        # mocked _logerror was actually invoked.
        deferred.addCallback(lambda _: self.assertTrue(middleware._logerror.called))
        reactor.callFromThread(deferred.callback, None)
        return deferred
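
    # Helpers: process_request returns None when a request is allowed and
    # raises IgnoreRequest when robots.txt disallows it.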
    def assertNotIgnored(self, request, middleware):
        spider = None  # not actually used
        self.assertIsNone(middleware.process_request(request, spider))

    def assertIgnored(self, request, middleware):
        spider = None  # not actually used
        self.assertRaises(IgnoreRequest, middleware.process_request, request, spider)
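
    # Build a mocked crawler whose engine "downloads" a canned robots.txt
    # disallowing /admin/ and /static/ for every user agent. Also checks
    # that the middleware refuses to configure itself while ROBOTSTXT_OBEY
    # is unset.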
    def _get_crawler(self):
        crawler = mock.MagicMock()
        crawler.settings = Settings()
        crawler.settings.set('USER_AGENT', 'CustomAgent')
        self.assertRaises(NotConfigured, RobotsTxtMiddleware, crawler)
        crawler.settings.set('ROBOTSTXT_OBEY', True)
        crawler.engine.download = mock.MagicMock()
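        # Strip the leading indentation from each line so the robots.txt
        # body is parseable; (?m) makes ^ match at every line start.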
        ROBOTS = re.sub(r'(?m)^\s+', '', '''
        User-Agent: *
        Disallow: /admin/
        Disallow: /static/
        ''')
        response = Response('http://site.local/robots.txt', body=ROBOTS)

        def return_response(request, spider):
            deferred = Deferred()
            reactor.callFromThread(deferred.callback, response)
            return deferred

        crawler.engine.download.side_effect = return_response
        return crawler

    def _get_middleware(self):
        crawler = self._get_crawler()
        return RobotsTxtMiddleware(crawler)
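
# A minimal usage sketch (not part of the test suite): in a real project the
# middleware is enabled through the standard setting, e.g. in settings.py:
#
#     ROBOTSTXT_OBEY = True
#
# RobotsTxtMiddleware is part of Scrapy's default downloader middlewares and,
# with that setting on, filters requests against each site's robots.txt.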