import unittest

from scrapy.utils.sitemap import Sitemap, sitemap_urls_from_robots


class SitemapTest(unittest.TestCase):
    """Tests for ``scrapy.utils.sitemap.Sitemap`` parsing and for
    ``sitemap_urls_from_robots``.

    NOTE(review): the XML markup inside the byte-string fixtures had been
    stripped (only text content survived a previous edit). The markup below
    is reconstructed from each test's assertions plus standard sitemaps.org
    protocol elements — confirm against upstream history.
    """

    def test_sitemap(self):
        # Plain <urlset> sitemap: loc/lastmod/changefreq/priority are all
        # extracted as plain strings.
        s = Sitemap(b"""<?xml version="1.0" encoding="UTF-8"?>
<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
<url>
<loc>http://www.example.com/</loc>
<lastmod>2009-08-16</lastmod>
<changefreq>daily</changefreq>
<priority>1</priority>
</url>
<url>
<loc>http://www.example.com/Special-Offers.html</loc>
<lastmod>2009-08-16</lastmod>
<changefreq>weekly</changefreq>
<priority>0.8</priority>
</url>
</urlset>""")
        assert s.type == 'urlset'
        self.assertEqual(list(s), [
            {'priority': '1', 'loc': 'http://www.example.com/',
             'lastmod': '2009-08-16', 'changefreq': 'daily'},
            {'priority': '0.8',
             'loc': 'http://www.example.com/Special-Offers.html',
             'lastmod': '2009-08-16', 'changefreq': 'weekly'},
        ])

    def test_sitemap_index(self):
        # A <sitemapindex> document yields the nested sitemap locations.
        s = Sitemap(b"""<?xml version="1.0" encoding="UTF-8"?>
<sitemapindex xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
<sitemap>
<loc>http://www.example.com/sitemap1.xml.gz</loc>
<lastmod>2004-10-01T18:23:17+00:00</lastmod>
</sitemap>
<sitemap>
<loc>http://www.example.com/sitemap2.xml.gz</loc>
<lastmod>2005-01-01</lastmod>
</sitemap>
</sitemapindex>""")
        assert s.type == 'sitemapindex'
        self.assertEqual(list(s), [
            {'loc': 'http://www.example.com/sitemap1.xml.gz',
             'lastmod': '2004-10-01T18:23:17+00:00'},
            {'loc': 'http://www.example.com/sitemap2.xml.gz',
             'lastmod': '2005-01-01'},
        ])

    def test_sitemap_strip(self):
        """Assert we can deal with trailing spaces inside <loc> tags - we've
        seen those
        """
        s = Sitemap(b"""<?xml version="1.0" encoding="UTF-8"?>
<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
<url><loc> http://www.example.com/</loc>
<lastmod>2009-08-16</lastmod>
<changefreq>daily</changefreq>
<priority>1</priority>
</url>
<url><loc> http://www.example.com/2</loc>
<lastmod />
</url>
</urlset>
""")
        self.assertEqual(list(s), [
            {'priority': '1', 'loc': 'http://www.example.com/',
             'lastmod': '2009-08-16', 'changefreq': 'daily'},
            {'loc': 'http://www.example.com/2', 'lastmod': ''},
        ])

    def test_sitemap_wrong_ns(self):
        """We have seen sitemaps with wrongs ns. Presumably, Google still works
        with these, though is not 100% confirmed"""
        # The <url> elements declare an empty namespace, unlike the parent
        # <urlset>; the parser must still match them.
        s = Sitemap(b"""<?xml version="1.0" encoding="UTF-8"?>
<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
<url xmlns=""><loc> http://www.example.com/</loc>
<lastmod>2009-08-16</lastmod>
<changefreq>daily</changefreq>
<priority>1</priority>
</url>
<url xmlns=""><loc> http://www.example.com/2</loc>
<lastmod />
</url>
</urlset>
""")
        self.assertEqual(list(s), [
            {'priority': '1', 'loc': 'http://www.example.com/',
             'lastmod': '2009-08-16', 'changefreq': 'daily'},
            {'loc': 'http://www.example.com/2', 'lastmod': ''},
        ])

    def test_sitemap_wrong_ns2(self):
        """We have seen sitemaps with wrongs ns. Presumably, Google still works
        with these, though is not 100% confirmed"""
        # Here the <urlset> itself carries no namespace at all.
        s = Sitemap(b"""<?xml version="1.0" encoding="UTF-8"?>
<urlset>
<url xmlns=""><loc> http://www.example.com/</loc>
<lastmod>2009-08-16</lastmod>
<changefreq>daily</changefreq>
<priority>1</priority>
</url>
<url xmlns=""><loc> http://www.example.com/2</loc>
<lastmod />
</url>
</urlset>
""")
        assert s.type == 'urlset'
        self.assertEqual(list(s), [
            {'priority': '1', 'loc': 'http://www.example.com/',
             'lastmod': '2009-08-16', 'changefreq': 'daily'},
            {'loc': 'http://www.example.com/2', 'lastmod': ''},
        ])

    def test_sitemap_urls_from_robots(self):
        # Sitemap: lines are extracted case-insensitively and relative URLs
        # are resolved against base_url; Disallow/comments are ignored.
        robots = """User-agent: *
Disallow: /aff/
Disallow: /wl/

# Search and shopping refining
Disallow: /s*/*facet
Disallow: /s*/*tags

# Sitemap files
Sitemap: http://example.com/sitemap.xml
Sitemap: http://example.com/sitemap-product-index.xml
Sitemap: HTTP://example.com/sitemap-uppercase.xml
Sitemap: /sitemap-relative-url.xml

# Forums
Disallow: /forum/search/
Disallow: /forum/active/
"""
        self.assertEqual(
            list(sitemap_urls_from_robots(robots, base_url='http://example.com')),
            ['http://example.com/sitemap.xml',
             'http://example.com/sitemap-product-index.xml',
             'http://example.com/sitemap-uppercase.xml',
             'http://example.com/sitemap-relative-url.xml'])

    def test_sitemap_blanklines(self):
        """Assert we can deal with starting blank lines before <xml> tag"""
        s = Sitemap(b"""\

<?xml version="1.0" encoding="UTF-8"?>
<sitemapindex xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">

<sitemap>
<loc>http://www.example.com/sitemap1.xml</loc>
<lastmod>2013-07-15</lastmod>
</sitemap>

<sitemap>
<loc>http://www.example.com/sitemap2.xml</loc>
<lastmod>2013-07-15</lastmod>
</sitemap>

<sitemap>
<loc>http://www.example.com/sitemap3.xml</loc>
<lastmod>2013-07-15</lastmod>
</sitemap>

</sitemapindex>
""")
        self.assertEqual(list(s), [
            {'lastmod': '2013-07-15',
             'loc': 'http://www.example.com/sitemap1.xml'},
            {'lastmod': '2013-07-15',
             'loc': 'http://www.example.com/sitemap2.xml'},
            {'lastmod': '2013-07-15',
             'loc': 'http://www.example.com/sitemap3.xml'},
        ])

    def test_comment(self):
        # XML comments inside <url> must not break parsing or leak into
        # the extracted fields.
        s = Sitemap(b"""<?xml version="1.0" encoding="UTF-8"?>
<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
    <url>
        <loc>http://www.example.com/</loc>
        <!-- this is a comment on which the parser might raise an exception if implemented incorrectly -->
    </url>
</urlset>""")
        self.assertEqual(list(s), [
            {'loc': 'http://www.example.com/'},
        ])

    def test_alternate(self):
        # xhtml:link rel="alternate" hreflang entries are collected into an
        # 'alternate' list alongside the main loc.
        s = Sitemap(b"""<?xml version="1.0" encoding="UTF-8"?>
<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9"
        xmlns:xhtml="http://www.w3.org/1999/xhtml">
    <url>
        <loc>http://www.example.com/english/</loc>
        <xhtml:link rel="alternate" hreflang="de"
                    href="http://www.example.com/deutsch/"/>
        <xhtml:link rel="alternate" hreflang="de-ch"
                    href="http://www.example.com/schweiz-deutsch/"/>
        <xhtml:link rel="alternate" hreflang="en"
                    href="http://www.example.com/english/"/>
    </url>
</urlset>""")
        self.assertEqual(list(s), [
            {'loc': 'http://www.example.com/english/',
             'alternate': ['http://www.example.com/deutsch/',
                           'http://www.example.com/schweiz-deutsch/',
                           'http://www.example.com/english/']},
        ])

    def test_xml_entity_expansion(self):
        # Security: external entities (&xxe;) must NOT be expanded — the
        # entity text is dropped and only the literal loc remains.
        s = Sitemap(b"""<?xml version="1.0" encoding="utf-8"?>
<!DOCTYPE foo [
<!ELEMENT foo ANY >
<!ENTITY xxe SYSTEM "file:///etc/passwd" >
]>
<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
    <url>
        <loc>http://127.0.0.1:8000/&xxe;</loc>
    </url>
</urlset>
""")
        self.assertEqual(list(s), [{'loc': 'http://127.0.0.1:8000/'}])


if __name__ == '__main__':
    unittest.main()