import unittest
from scrapy.utils.sitemap import Sitemap, sitemap_urls_from_robots
class SitemapTest(unittest.TestCase):
    """Tests for ``scrapy.utils.sitemap``: ``Sitemap`` XML parsing and
    ``sitemap_urls_from_robots`` robots.txt extraction.

    NOTE(review): the XML fixtures in this file had lost all their markup
    (only the element text content remained), so every ``Sitemap(...)`` call
    failed to parse.  The fixtures below restore well-formed sitemap XML
    whose text content matches the expected values already asserted by each
    test — confirm against upstream history.
    """

    def test_sitemap(self):
        """A <urlset> document yields one dict per <url>, keyed by tag name."""
        s = Sitemap(b"""<?xml version="1.0" encoding="UTF-8"?>
<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
  <url>
    <loc>http://www.example.com/</loc>
    <lastmod>2009-08-16</lastmod>
    <changefreq>daily</changefreq>
    <priority>1</priority>
  </url>
  <url>
    <loc>http://www.example.com/Special-Offers.html</loc>
    <lastmod>2009-08-16</lastmod>
    <changefreq>weekly</changefreq>
    <priority>0.8</priority>
  </url>
</urlset>""")
        assert s.type == 'urlset'
        self.assertEqual(
            list(s),
            [{'priority': '1', 'loc': 'http://www.example.com/',
              'lastmod': '2009-08-16', 'changefreq': 'daily'},
             {'priority': '0.8', 'loc': 'http://www.example.com/Special-Offers.html',
              'lastmod': '2009-08-16', 'changefreq': 'weekly'}])

    def test_sitemap_index(self):
        """A <sitemapindex> document yields one dict per <sitemap> entry."""
        s = Sitemap(b"""<?xml version="1.0" encoding="UTF-8"?>
<sitemapindex xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
  <sitemap>
    <loc>http://www.example.com/sitemap1.xml.gz</loc>
    <lastmod>2004-10-01T18:23:17+00:00</lastmod>
  </sitemap>
  <sitemap>
    <loc>http://www.example.com/sitemap2.xml.gz</loc>
    <lastmod>2005-01-01</lastmod>
  </sitemap>
</sitemapindex>""")
        assert s.type == 'sitemapindex'
        self.assertEqual(
            list(s),
            [{'loc': 'http://www.example.com/sitemap1.xml.gz',
              'lastmod': '2004-10-01T18:23:17+00:00'},
             {'loc': 'http://www.example.com/sitemap2.xml.gz',
              'lastmod': '2005-01-01'}])

    def test_sitemap_strip(self):
        """Assert we can deal with trailing spaces inside <loc> tags - we've
        seen those
        """
        # The empty <lastmod /> must come through as an empty string,
        # and leading/trailing whitespace in <loc> must be stripped.
        s = Sitemap(b"""<?xml version="1.0" encoding="UTF-8"?>
<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
  <url>
    <loc> http://www.example.com/</loc>
    <lastmod>2009-08-16</lastmod>
    <changefreq>daily</changefreq>
    <priority>1</priority>
  </url>
  <url>
    <loc> http://www.example.com/2</loc>
    <lastmod />
  </url>
</urlset>""")
        self.assertEqual(
            list(s),
            [{'priority': '1', 'loc': 'http://www.example.com/',
              'lastmod': '2009-08-16', 'changefreq': 'daily'},
             {'loc': 'http://www.example.com/2', 'lastmod': ''},
             ])

    def test_sitemap_wrong_ns(self):
        """We have seen sitemaps with wrongs ns. Presumably, Google still works
        with these, though is not 100% confirmed"""
        # <url> children declare an empty namespace, unlike their parent.
        s = Sitemap(b"""<?xml version="1.0" encoding="UTF-8"?>
<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
  <url xmlns="">
    <loc>http://www.example.com/</loc>
    <lastmod>2009-08-16</lastmod>
    <changefreq>daily</changefreq>
    <priority>1</priority>
  </url>
  <url xmlns="">
    <loc>http://www.example.com/2</loc>
    <lastmod />
  </url>
</urlset>""")
        self.assertEqual(
            list(s),
            [{'priority': '1', 'loc': 'http://www.example.com/',
              'lastmod': '2009-08-16', 'changefreq': 'daily'},
             {'loc': 'http://www.example.com/2', 'lastmod': ''},
             ])

    def test_sitemap_wrong_ns2(self):
        """We have seen sitemaps with wrongs ns. Presumably, Google still works
        with these, though is not 100% confirmed"""
        # Here even the root <urlset> carries no namespace at all; the
        # type detection must still report 'urlset'.
        s = Sitemap(b"""<?xml version="1.0" encoding="UTF-8"?>
<urlset>
  <url xmlns="">
    <loc>http://www.example.com/</loc>
    <lastmod>2009-08-16</lastmod>
    <changefreq>daily</changefreq>
    <priority>1</priority>
  </url>
  <url xmlns="">
    <loc>http://www.example.com/2</loc>
    <lastmod />
  </url>
</urlset>""")
        assert s.type == 'urlset'
        self.assertEqual(
            list(s),
            [{'priority': '1', 'loc': 'http://www.example.com/',
              'lastmod': '2009-08-16', 'changefreq': 'daily'},
             {'loc': 'http://www.example.com/2', 'lastmod': ''},
             ])

    def test_sitemap_urls_from_robots(self):
        """``Sitemap:`` lines are extracted (scheme case normalized) and
        relative URLs are resolved against ``base_url``."""
        robots = """User-agent: *
Disallow: /aff/
Disallow: /wl/
# Search and shopping refining
Disallow: /s*/*facet
Disallow: /s*/*tags
# Sitemap files
Sitemap: http://example.com/sitemap.xml
Sitemap: http://example.com/sitemap-product-index.xml
Sitemap: HTTP://example.com/sitemap-uppercase.xml
Sitemap: /sitemap-relative-url.xml
# Forums
Disallow: /forum/search/
Disallow: /forum/active/
"""
        self.assertEqual(
            list(sitemap_urls_from_robots(robots, base_url='http://example.com')),
            ['http://example.com/sitemap.xml',
             'http://example.com/sitemap-product-index.xml',
             'http://example.com/sitemap-uppercase.xml',
             'http://example.com/sitemap-relative-url.xml'])

    def test_sitemap_blanklines(self):
        """Assert we can deal with starting blank lines before <xml> tag"""
        s = Sitemap(b"""\

<?xml version="1.0" encoding="UTF-8"?>
<sitemapindex xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">

<sitemap>
<loc>http://www.example.com/sitemap1.xml</loc>
<lastmod>2013-07-15</lastmod>
</sitemap>

<sitemap>
<loc>http://www.example.com/sitemap2.xml</loc>
<lastmod>2013-07-15</lastmod>
</sitemap>

<sitemap>
<loc>http://www.example.com/sitemap3.xml</loc>
<lastmod>2013-07-15</lastmod>
</sitemap>
</sitemapindex>
""")
        self.assertEqual(list(s), [
            {'lastmod': '2013-07-15', 'loc': 'http://www.example.com/sitemap1.xml'},
            {'lastmod': '2013-07-15', 'loc': 'http://www.example.com/sitemap2.xml'},
            {'lastmod': '2013-07-15', 'loc': 'http://www.example.com/sitemap3.xml'},
        ])

    def test_comment(self):
        """XML comments inside a <url> element must be ignored, not crash."""
        s = Sitemap(b"""<?xml version="1.0" encoding="UTF-8"?>
<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9"
        xmlns:xhtml="http://www.w3.org/1999/xhtml">
  <url>
    <loc>http://www.example.com/</loc>
    <!-- this is a comment on which the parser might raise an exception if implemented incorrectly -->
  </url>
</urlset>""")
        self.assertEqual(list(s), [
            {'loc': 'http://www.example.com/'}
        ])

    def test_alternate(self):
        """xhtml:link alternate-language links are collected under
        the 'alternate' key."""
        s = Sitemap(b"""<?xml version="1.0" encoding="UTF-8"?>
<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9"
        xmlns:xhtml="http://www.w3.org/1999/xhtml">
  <url>
    <loc>http://www.example.com/english/</loc>
    <xhtml:link rel="alternate" hreflang="de"
        href="http://www.example.com/deutsch/"/>
    <xhtml:link rel="alternate" hreflang="de-ch"
        href="http://www.example.com/schweiz-deutsch/"/>
    <xhtml:link rel="alternate" hreflang="en"
        href="http://www.example.com/english/"/>
  </url>
</urlset>""")
        self.assertEqual(list(s), [
            {'loc': 'http://www.example.com/english/',
             'alternate': ['http://www.example.com/deutsch/', 'http://www.example.com/schweiz-deutsch/', 'http://www.example.com/english/']
             }
        ])

    def test_xml_entity_expansion(self):
        """External entities must NOT be expanded (XXE protection): the
        &xxe; reference is dropped rather than resolved, so only the
        literal text before it survives in 'loc'."""
        s = Sitemap(b"""<?xml version="1.0" encoding="utf-8"?>
<!DOCTYPE foo [
<!ELEMENT foo ANY >
<!ENTITY xxe SYSTEM "file:///etc/passwd" >
]>
<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
  <url>
    <loc>http://127.0.0.1:8000/&xxe;</loc>
  </url>
</urlset>""")
        self.assertEqual(list(s), [{'loc': 'http://127.0.0.1:8000/'}])
# Allow running this test module directly (python <file>.py) in addition
# to test-runner discovery.
if __name__ == '__main__':
    unittest.main()