from __future__ import absolute_import
import re

from twisted.internet import reactor, error
from twisted.internet.defer import Deferred
from twisted.python import failure
from twisted.trial import unittest

from scrapy.downloadermiddlewares.robotstxt import RobotsTxtMiddleware
from scrapy.exceptions import IgnoreRequest, NotConfigured
from scrapy.http import Request, Response
from scrapy.settings import Settings
from tests import mock


class RobotsTxtMiddlewareTest(unittest.TestCase):
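    """Tests for RobotsTxtMiddleware: requests for paths disallowed by a
    site's robots.txt must raise IgnoreRequest; all other requests must
    pass through unchanged."""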

    def test_robotstxt(self):
        middleware = self._get_middleware()
        # There is a bit of neglect in robotstxt.py: robots.txt is fetched
        # asynchronously, and it is actually fetched only *after* the first
        # process_request completes, so the first process_request will
        # always succeed.
        # We defer test() because otherwise the robots.txt download mock
        # would be called only after the assertRaises failure.
        self.assertNotIgnored(Request('http://site.local'), middleware)

        def test(r):
            self.assertNotIgnored(Request('http://site.local/allowed'), middleware)
            self.assertIgnored(Request('http://site.local/admin/main'), middleware)
            self.assertIgnored(Request('http://site.local/static/'), middleware)

        deferred = Deferred()
        deferred.addCallback(test)
        # callFromThread queues calls in order, so the mocked robots.txt
        # response (queued earlier) is delivered before test() runs.
        reactor.callFromThread(deferred.callback, None)
        return deferred
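
    # Requests carrying the dont_obey_robotstxt flag in meta must never be
    # filtered, even for paths robots.txt disallows.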
    def test_robotstxt_meta(self):
        meta = {'dont_obey_robotstxt': True}
        middleware = self._get_middleware()
        self.assertNotIgnored(Request('http://site.local', meta=meta), middleware)

        def test(r):
            self.assertNotIgnored(Request('http://site.local/allowed', meta=meta), middleware)
            self.assertNotIgnored(Request('http://site.local/admin/main', meta=meta), middleware)
            self.assertNotIgnored(Request('http://site.local/static/', meta=meta), middleware)

        deferred = Deferred()
        deferred.addCallback(test)
        reactor.callFromThread(deferred.callback, None)
        return deferred
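
    # If fetching robots.txt itself fails (here: a DNS lookup error), the
    # middleware is expected to log the failure via its _logerror errback.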
    def test_robotstxt_error(self):
        crawler = mock.MagicMock()
        crawler.settings = Settings()
        crawler.settings.set('ROBOTSTXT_OBEY', True)
        crawler.engine.download = mock.MagicMock()
        err = error.DNSLookupError('Robotstxt address not found')

        def return_failure(request, spider):
            deferred = Deferred()
            reactor.callFromThread(deferred.errback, failure.Failure(err))
            return deferred
        crawler.engine.download.side_effect = return_failure

        middleware = RobotsTxtMiddleware(crawler)
        middleware._logerror = mock.MagicMock()
        middleware.process_request(Request('http://site.local'), None)
        deferred = Deferred()
        # The deferred is fired through its callback chain, so the assertion
        # is attached as a callback; by the time it runs, the DNS failure
        # queued above has already been delivered to the middleware.
        deferred.addCallback(lambda _: self.assertTrue(middleware._logerror.called))
        reactor.callFromThread(deferred.callback, None)
        return deferred
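
    # Assertion helpers: process_request returns None when a request is
    # allowed and raises IgnoreRequest when robots.txt disallows it.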
    def assertNotIgnored(self, request, middleware):
        spider = None  # not actually used
        self.assertIsNone(middleware.process_request(request, spider))

    def assertIgnored(self, request, middleware):
        spider = None  # not actually used
        self.assertRaises(IgnoreRequest, middleware.process_request, request, spider)

    def _get_crawler(self):
        crawler = mock.MagicMock()
        crawler.settings = Settings()
        crawler.settings.set('USER_AGENT', 'CustomAgent')
        # Without ROBOTSTXT_OBEY the middleware must refuse to initialize.
        self.assertRaises(NotConfigured, RobotsTxtMiddleware, crawler)
        crawler.settings.set('ROBOTSTXT_OBEY', True)
        crawler.engine.download = mock.MagicMock()
        # Strip the leading indentation that the triple-quoted literal adds
        # to every line of the robots.txt body.
        ROBOTS = re.sub(r'(?m)^\s+', '', '''
        User-Agent: *
        Disallow: /admin/
        Disallow: /static/
        ''')
        response = Response('http://site.local/robots.txt', body=ROBOTS)

        def return_response(request, spider):
            deferred = Deferred()
            reactor.callFromThread(deferred.callback, response)
            return deferred
        crawler.engine.download.side_effect = return_response
        return crawler

    def _get_middleware(self):
        crawler = self._get_crawler()
        return RobotsTxtMiddleware(crawler)
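
# For reference, a sketch (not part of the test suite) of how the rules
# served by _get_crawler() behave under the stdlib robots.txt parser that
# RobotsTxtMiddleware builds on; the module is robotparser on Python 2 and
# urllib.robotparser on Python 3:
#
#     from six.moves.urllib import robotparser
#     rp = robotparser.RobotFileParser()
#     rp.parse(['User-Agent: *', 'Disallow: /admin/', 'Disallow: /static/'])
#     rp.can_fetch('CustomAgent', 'http://site.local/allowed')     # True
#     rp.can_fetch('CustomAgent', 'http://site.local/admin/main')  # False
#     rp.can_fetch('CustomAgent', 'http://site.local/static/')     # False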