import unittest

from scrapy.utils.sitemap import Sitemap, sitemap_urls_from_robots


class SitemapTest(unittest.TestCase):
    """Tests for ``Sitemap`` parsing and ``sitemap_urls_from_robots``.

    NOTE(review): the XML markup and the line breaks inside the fixtures below
    were reconstructed from the values each test asserts; confirm them against
    the upstream fixtures before relying on byte-exact content.
    """

    def test_sitemap(self):
        """A plain ``urlset`` yields one dict per ``<url>`` entry."""
        s = Sitemap(
            b"""<?xml version="1.0" encoding="UTF-8"?>
<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
  <url>
    <loc>http://www.example.com/</loc>
    <lastmod>2009-08-16</lastmod>
    <changefreq>daily</changefreq>
    <priority>1</priority>
  </url>
  <url>
    <loc>http://www.example.com/Special-Offers.html</loc>
    <lastmod>2009-08-16</lastmod>
    <changefreq>weekly</changefreq>
    <priority>0.8</priority>
  </url>
</urlset>"""
        )
        self.assertEqual(s.type, "urlset")
        self.assertEqual(
            list(s),
            [
                {
                    "priority": "1",
                    "loc": "http://www.example.com/",
                    "lastmod": "2009-08-16",
                    "changefreq": "daily",
                },
                {
                    "priority": "0.8",
                    "loc": "http://www.example.com/Special-Offers.html",
                    "lastmod": "2009-08-16",
                    "changefreq": "weekly",
                },
            ],
        )

    def test_sitemap_index(self):
        """A ``sitemapindex`` yields one dict per ``<sitemap>`` entry."""
        s = Sitemap(
            b"""<?xml version="1.0" encoding="UTF-8"?>
<sitemapindex xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
  <sitemap>
    <loc>http://www.example.com/sitemap1.xml.gz</loc>
    <lastmod>2004-10-01T18:23:17+00:00</lastmod>
  </sitemap>
  <sitemap>
    <loc>http://www.example.com/sitemap2.xml.gz</loc>
    <lastmod>2005-01-01</lastmod>
  </sitemap>
</sitemapindex>"""
        )
        self.assertEqual(s.type, "sitemapindex")
        self.assertEqual(
            list(s),
            [
                {
                    "loc": "http://www.example.com/sitemap1.xml.gz",
                    "lastmod": "2004-10-01T18:23:17+00:00",
                },
                {
                    "loc": "http://www.example.com/sitemap2.xml.gz",
                    "lastmod": "2005-01-01",
                },
            ],
        )

    def test_sitemap_strip(self):
        """Assert we can deal with trailing spaces inside <loc> tags - we've
        seen those
        """
        # The <loc> values carry stray whitespace and the second <lastmod> is
        # empty; the expected entries hold the bare URL and an empty string.
        s = Sitemap(
            b"""<?xml version="1.0" encoding="UTF-8"?>
<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
  <url>
    <loc> http://www.example.com/ </loc>
    <lastmod>2009-08-16</lastmod>
    <changefreq>daily</changefreq>
    <priority>1</priority>
  </url>
  <url>
    <loc> http://www.example.com/2 </loc>
    <lastmod />
  </url>
</urlset>
"""
        )
        self.assertEqual(
            list(s),
            [
                {
                    "priority": "1",
                    "loc": "http://www.example.com/",
                    "lastmod": "2009-08-16",
                    "changefreq": "daily",
                },
                {"loc": "http://www.example.com/2", "lastmod": ""},
            ],
        )

    def test_sitemap_wrong_ns(self):
        """We have seen sitemaps with wrong namespaces. Presumably, Google
        still works with these, though this is not 100% confirmed.
        """
        # The root declares a sitemap namespace but every <url> resets it.
        s = Sitemap(
            b"""<?xml version="1.0" encoding="UTF-8"?>
<urlset xmlns="http://www.google.com/schemas/sitemap/0.84">
  <url xmlns="">
    <loc> http://www.example.com/</loc>
    <lastmod>2009-08-16</lastmod>
    <changefreq>daily</changefreq>
    <priority>1</priority>
  </url>
  <url xmlns="">
    <loc> http://www.example.com/2</loc>
    <lastmod />
  </url>
</urlset>
"""
        )
        self.assertEqual(
            list(s),
            [
                {
                    "priority": "1",
                    "loc": "http://www.example.com/",
                    "lastmod": "2009-08-16",
                    "changefreq": "daily",
                },
                {"loc": "http://www.example.com/2", "lastmod": ""},
            ],
        )

    def test_sitemap_wrong_ns2(self):
        """We have seen sitemaps with wrong namespaces. Presumably, Google
        still works with these, though this is not 100% confirmed.
        """
        # No namespace on the root at all; the type must still be "urlset".
        s = Sitemap(
            b"""<?xml version="1.0" encoding="UTF-8"?>
<urlset>
  <url xmlns="">
    <loc> http://www.example.com/</loc>
    <lastmod>2009-08-16</lastmod>
    <changefreq>daily</changefreq>
    <priority>1</priority>
  </url>
  <url xmlns="">
    <loc> http://www.example.com/2</loc>
    <lastmod />
  </url>
</urlset>
"""
        )
        self.assertEqual(s.type, "urlset")
        self.assertEqual(
            list(s),
            [
                {
                    "priority": "1",
                    "loc": "http://www.example.com/",
                    "lastmod": "2009-08-16",
                    "changefreq": "daily",
                },
                {"loc": "http://www.example.com/2", "lastmod": ""},
            ],
        )

    def test_sitemap_urls_from_robots(self):
        """``Sitemap:`` lines of a robots.txt body are returned as absolute URLs.

        Covers an upper-case scheme and a path-only entry resolved against
        ``base_url``.
        """
        robots = """User-agent: *
Disallow: /aff/
Disallow: /wl/

# Search and shopping refining
Disallow: /s*/*facet
Disallow: /s*/*tags

# Sitemap files
Sitemap: http://example.com/sitemap.xml
Sitemap: http://example.com/sitemap-product-index.xml
Sitemap: HTTP://example.com/sitemap-uppercase.xml
Sitemap: /sitemap-relative-url.xml

# Forums
Disallow: /forum/search/
Disallow: /forum/active/
"""
        self.assertEqual(
            list(sitemap_urls_from_robots(robots, base_url="http://example.com")),
            [
                "http://example.com/sitemap.xml",
                "http://example.com/sitemap-product-index.xml",
                "http://example.com/sitemap-uppercase.xml",
                "http://example.com/sitemap-relative-url.xml",
            ],
        )

    def test_sitemap_blanklines(self):
        """Assert we can deal with starting blank lines before <xml> tag"""
        # The payload deliberately opens with empty lines ahead of the XML
        # declaration.
        s = Sitemap(
            b"""

<?xml version="1.0" encoding="UTF-8"?>
<sitemapindex xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">

<sitemap>
<loc>http://www.example.com/sitemap1.xml</loc>
<lastmod>2013-07-15</lastmod>
</sitemap>

<sitemap>
<loc>http://www.example.com/sitemap2.xml</loc>
<lastmod>2013-07-15</lastmod>
</sitemap>

<sitemap>
<loc>http://www.example.com/sitemap3.xml</loc>
<lastmod>2013-07-15</lastmod>
</sitemap>

</sitemapindex>
"""
        )
        self.assertEqual(
            list(s),
            [
                {"lastmod": "2013-07-15", "loc": "http://www.example.com/sitemap1.xml"},
                {"lastmod": "2013-07-15", "loc": "http://www.example.com/sitemap2.xml"},
                {"lastmod": "2013-07-15", "loc": "http://www.example.com/sitemap3.xml"},
            ],
        )

    def test_comment(self):
        """An XML comment inside a ``<url>`` entry does not affect the result."""
        s = Sitemap(
            b"""<?xml version="1.0" encoding="UTF-8"?>
<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9"
        xmlns:xhtml="http://www.w3.org/1999/xhtml">
  <url>
    <loc>http://www.example.com/</loc>
    <!-- a comment the parser must skip rather than choke on -->
  </url>
</urlset>"""
        )
        self.assertEqual(list(s), [{"loc": "http://www.example.com/"}])

    def test_alternate(self):
        """Alternate-language links are collected under the ``alternate`` key."""
        s = Sitemap(
            b"""<?xml version="1.0" encoding="UTF-8"?>
<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9"
        xmlns:xhtml="http://www.w3.org/1999/xhtml">
  <url>
    <loc>http://www.example.com/english/</loc>
    <xhtml:link rel="alternate" hreflang="de"
                href="http://www.example.com/deutsch/"/>
    <xhtml:link rel="alternate" hreflang="de-ch"
                href="http://www.example.com/schweiz-deutsch/"/>
    <xhtml:link rel="alternate" hreflang="en"
                href="http://www.example.com/english/"/>
  </url>
</urlset>"""
        )
        self.assertEqual(
            list(s),
            [
                {
                    "loc": "http://www.example.com/english/",
                    "alternate": [
                        "http://www.example.com/deutsch/",
                        "http://www.example.com/schweiz-deutsch/",
                        "http://www.example.com/english/",
                    ],
                }
            ],
        )

    def test_xml_entity_expansion(self):
        """An external entity reference must not be expanded into ``loc``."""
        # If &xxe; were resolved, the referenced content would be appended to
        # the URL; the expected value keeps only the literal prefix.
        s = Sitemap(
            b"""<?xml version="1.0" encoding="utf-8"?>
<!DOCTYPE foo [
  <!ELEMENT foo ANY >
  <!ENTITY xxe SYSTEM "file:///etc/passwd" >
]>
<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
  <url>
    <loc>http://127.0.0.1:8000/&xxe;</loc>
  </url>
</urlset>
"""
        )
        self.assertEqual(list(s), [{"loc": "http://127.0.0.1:8000/"}])


if __name__ == "__main__":
    unittest.main()