from twisted.trial import unittest

from scrapy.robotstxt import decode_robotstxt


def rerp_available():
    # check if robotexclusionrulesparser is installed
    try:
        from robotexclusionrulesparser import RobotExclusionRulesParser  # noqa: F401
    except ImportError:
        return False
    return True


def protego_available():
    # check if protego parser is installed
    try:
        from protego import Protego  # noqa: F401
    except ImportError:
        return False
    return True


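# Note: the two availability helpers above only probe for optional third-party
# parsers. A minimal alternative sketch (not what this module uses) would rely
# on importlib instead of a guarded import:
#
#     from importlib.util import find_spec
#
#     def protego_available():
#         return find_spec("protego") is not None

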
class BaseRobotParserTest:
    def _setUp(self, parser_cls):
        self.parser_cls = parser_cls

    def test_allowed(self):
        robotstxt_body = (
            b"User-agent: * \nDisallow: /disallowed \nAllow: /allowed \nCrawl-delay: 10"
        )
        rp = self.parser_cls.from_crawler(
            crawler=None, robotstxt_body=robotstxt_body
        )
        self.assertTrue(rp.allowed("https://www.site.local/allowed", "*"))
        self.assertFalse(rp.allowed("https://www.site.local/disallowed", "*"))

    def test_allowed_wildcards(self):
        robotstxt_body = b"""User-agent: first
Disallow: /disallowed/*/end$

User-agent: second
Allow: /*allowed
Disallow: /
"""
        rp = self.parser_cls.from_crawler(
            crawler=None, robotstxt_body=robotstxt_body
        )

        self.assertTrue(rp.allowed("https://www.site.local/disallowed", "first"))
        self.assertFalse(
            rp.allowed("https://www.site.local/disallowed/xyz/end", "first")
        )
        self.assertFalse(
            rp.allowed("https://www.site.local/disallowed/abc/end", "first")
        )
        self.assertTrue(
            rp.allowed("https://www.site.local/disallowed/xyz/endinglater", "first")
        )

        self.assertTrue(rp.allowed("https://www.site.local/allowed", "second"))
        self.assertTrue(rp.allowed("https://www.site.local/is_still_allowed", "second"))
        self.assertTrue(rp.allowed("https://www.site.local/is_allowed_too", "second"))

    def test_length_based_precedence(self):
        robotstxt_body = b"User-agent: * \nDisallow: / \nAllow: /page"
        rp = self.parser_cls.from_crawler(
            crawler=None, robotstxt_body=robotstxt_body
        )
        self.assertTrue(rp.allowed("https://www.site.local/page", "*"))

    def test_order_based_precedence(self):
        robotstxt_body = b"User-agent: * \nDisallow: / \nAllow: /page"
        rp = self.parser_cls.from_crawler(
            crawler=None, robotstxt_body=robotstxt_body
        )
        self.assertFalse(rp.allowed("https://www.site.local/page", "*"))

    def test_empty_response(self):
        """empty response should equal 'allow all'"""
        rp = self.parser_cls.from_crawler(crawler=None, robotstxt_body=b"")
        self.assertTrue(rp.allowed("https://site.local/", "*"))
        self.assertTrue(rp.allowed("https://site.local/", "chrome"))
        self.assertTrue(rp.allowed("https://site.local/index.html", "*"))
        self.assertTrue(rp.allowed("https://site.local/disallowed", "*"))

    def test_garbage_response(self):
        """garbage response should be discarded, equal 'allow all'"""
        robotstxt_body = b"GIF89a\xd3\x00\xfe\x00\xa2"
        rp = self.parser_cls.from_crawler(
            crawler=None, robotstxt_body=robotstxt_body
        )
        self.assertTrue(rp.allowed("https://site.local/", "*"))
        self.assertTrue(rp.allowed("https://site.local/", "chrome"))
        self.assertTrue(rp.allowed("https://site.local/index.html", "*"))
        self.assertTrue(rp.allowed("https://site.local/disallowed", "*"))

    def test_unicode_url_and_useragent(self):
        robotstxt_body = """
User-Agent: *
Disallow: /admin/
Disallow: /static/
# taken from https://en.wikipedia.org/robots.txt
Disallow: /wiki/K%C3%A4ytt%C3%A4j%C3%A4:
Disallow: /wiki/Käyttäjä:

User-Agent: UnicödeBöt
Disallow: /some/randome/page.html""".encode()
        rp = self.parser_cls.from_crawler(
            crawler=None, robotstxt_body=robotstxt_body
        )
        self.assertTrue(rp.allowed("https://site.local/", "*"))
        self.assertFalse(rp.allowed("https://site.local/admin/", "*"))
        self.assertFalse(rp.allowed("https://site.local/static/", "*"))
        self.assertTrue(rp.allowed("https://site.local/admin/", "UnicödeBöt"))
        self.assertFalse(
            rp.allowed("https://site.local/wiki/K%C3%A4ytt%C3%A4j%C3%A4:", "*")
        )
        self.assertFalse(rp.allowed("https://site.local/wiki/Käyttäjä:", "*"))
        self.assertTrue(rp.allowed("https://site.local/some/randome/page.html", "*"))
        self.assertFalse(
            rp.allowed("https://site.local/some/randome/page.html", "UnicödeBöt")
        )


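# For reference, the parser interface exercised by BaseRobotParserTest can be
# used directly as well; a minimal sketch, assuming the Protego-backed parser
# is installed (the URLs and robots.txt body below are made up for illustration):
#
#     from scrapy.robotstxt import ProtegoRobotParser
#
#     rp = ProtegoRobotParser.from_crawler(
#         crawler=None, robotstxt_body=b"User-agent: *\nDisallow: /private"
#     )
#     rp.allowed("https://example.com/private", "*")  # -> False
#     rp.allowed("https://example.com/page", "*")  # -> True

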
class DecodeRobotsTxtTest(unittest.TestCase):
    def test_native_string_conversion(self):
        robotstxt_body = b"User-agent: *\nDisallow: /\n"
        decoded_content = decode_robotstxt(
            robotstxt_body, spider=None, to_native_str_type=True
        )
        self.assertEqual(decoded_content, "User-agent: *\nDisallow: /\n")

    def test_decode_utf8(self):
        robotstxt_body = b"User-agent: *\nDisallow: /\n"
        decoded_content = decode_robotstxt(robotstxt_body, spider=None)
        self.assertEqual(decoded_content, "User-agent: *\nDisallow: /\n")

    def test_decode_non_utf8(self):
        robotstxt_body = b"User-agent: *\n\xffDisallow: /\n"
        decoded_content = decode_robotstxt(robotstxt_body, spider=None)
        self.assertEqual(decoded_content, "User-agent: *\nDisallow: /\n")


class PythonRobotParserTest(BaseRobotParserTest, unittest.TestCase):
    def setUp(self):
        from scrapy.robotstxt import PythonRobotParser

        super()._setUp(PythonRobotParser)

    def test_length_based_precedence(self):
        raise unittest.SkipTest(
            "RobotFileParser does not support length based directives precedence."
        )

    def test_allowed_wildcards(self):
        raise unittest.SkipTest("RobotFileParser does not support wildcards.")


class RerpRobotParserTest(BaseRobotParserTest, unittest.TestCase):
    if not rerp_available():
        skip = "Rerp parser is not installed"

    def setUp(self):
        from scrapy.robotstxt import RerpRobotParser

        super()._setUp(RerpRobotParser)

    def test_length_based_precedence(self):
        raise unittest.SkipTest(
            "Rerp does not support length based directives precedence."
        )


class ProtegoRobotParserTest(BaseRobotParserTest, unittest.TestCase):
    if not protego_available():
        skip = "Protego parser is not installed"

    def setUp(self):
        from scrapy.robotstxt import ProtegoRobotParser

        super()._setUp(ProtegoRobotParser)

    def test_order_based_precedence(self):
        raise unittest.SkipTest(
            "Protego does not support order based directives precedence."
        )