import unittest
from scrapy.utils.sitemap import Sitemap, sitemap_urls_from_robots
class SitemapTest(unittest.TestCase):
    """Tests for ``Sitemap`` and ``sitemap_urls_from_robots``.

    Each ``Sitemap`` test feeds an inline XML document (as bytes) to
    ``Sitemap`` and checks the entries produced by iterating over it and,
    where relevant, the detected document ``type``.

    NOTE(review): the XML markup of the fixtures was rebuilt from the
    expected values asserted below - confirm against the upstream fixtures.
    """

    def test_sitemap(self):
        """A plain ``<urlset>`` yields one dict per ``<url>`` entry."""
        s = Sitemap(
            b"""<?xml version="1.0" encoding="UTF-8"?>
<urlset xmlns="http://www.google.com/schemas/sitemap/0.84">
  <url>
    <loc>http://www.example.com/</loc>
    <lastmod>2009-08-16</lastmod>
    <changefreq>daily</changefreq>
    <priority>1</priority>
  </url>
  <url>
    <loc>http://www.example.com/Special-Offers.html</loc>
    <lastmod>2009-08-16</lastmod>
    <changefreq>weekly</changefreq>
    <priority>0.8</priority>
  </url>
</urlset>"""
        )
        self.assertEqual(s.type, "urlset")
        self.assertEqual(
            list(s),
            [
                {
                    "priority": "1",
                    "loc": "http://www.example.com/",
                    "lastmod": "2009-08-16",
                    "changefreq": "daily",
                },
                {
                    "priority": "0.8",
                    "loc": "http://www.example.com/Special-Offers.html",
                    "lastmod": "2009-08-16",
                    "changefreq": "weekly",
                },
            ],
        )

    def test_sitemap_index(self):
        """A ``<sitemapindex>`` yields one dict per ``<sitemap>`` entry."""
        s = Sitemap(
            b"""<?xml version="1.0" encoding="UTF-8"?>
<sitemapindex xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
  <sitemap>
    <loc>http://www.example.com/sitemap1.xml.gz</loc>
    <lastmod>2004-10-01T18:23:17+00:00</lastmod>
  </sitemap>
  <sitemap>
    <loc>http://www.example.com/sitemap2.xml.gz</loc>
    <lastmod>2005-01-01</lastmod>
  </sitemap>
</sitemapindex>"""
        )
        self.assertEqual(s.type, "sitemapindex")
        self.assertEqual(
            list(s),
            [
                {
                    "loc": "http://www.example.com/sitemap1.xml.gz",
                    "lastmod": "2004-10-01T18:23:17+00:00",
                },
                {
                    "loc": "http://www.example.com/sitemap2.xml.gz",
                    "lastmod": "2005-01-01",
                },
            ],
        )

    def test_sitemap_strip(self):
        """Assert we can deal with trailing spaces inside <loc> tags - we've
        seen those
        """
        # The <loc> values carry surrounding whitespace on purpose; the
        # expected entries below contain the bare URLs. The empty <lastmod />
        # is expected to come back as an empty string.
        s = Sitemap(
            b"""<?xml version="1.0" encoding="UTF-8"?>
<urlset xmlns="http://www.google.com/schemas/sitemap/0.84">
  <url>
    <loc> http://www.example.com/ </loc>
    <lastmod>2009-08-16</lastmod>
    <changefreq>daily</changefreq>
    <priority>1</priority>
  </url>
  <url>
    <loc> http://www.example.com/2 </loc>
    <lastmod />
  </url>
</urlset>
"""
        )
        self.assertEqual(
            list(s),
            [
                {
                    "priority": "1",
                    "loc": "http://www.example.com/",
                    "lastmod": "2009-08-16",
                    "changefreq": "daily",
                },
                {"loc": "http://www.example.com/2", "lastmod": ""},
            ],
        )

    def test_sitemap_wrong_ns(self):
        """We have seen sitemaps with wrong ns. Presumably, Google still works
        with these, though this is not 100% confirmed"""
        # The <url> elements reset the default namespace, so they do not
        # share the namespace declared on <urlset>.
        s = Sitemap(
            b"""<?xml version="1.0" encoding="UTF-8"?>
<urlset xmlns="http://www.google.com/schemas/sitemap/0.84">
  <url xmlns="">
    <loc> http://www.example.com/</loc>
    <lastmod>2009-08-16</lastmod>
    <changefreq>daily</changefreq>
    <priority>1</priority>
  </url>
  <url xmlns="">
    <loc> http://www.example.com/2</loc>
    <lastmod />
  </url>
</urlset>
"""
        )
        self.assertEqual(
            list(s),
            [
                {
                    "priority": "1",
                    "loc": "http://www.example.com/",
                    "lastmod": "2009-08-16",
                    "changefreq": "daily",
                },
                {"loc": "http://www.example.com/2", "lastmod": ""},
            ],
        )

    def test_sitemap_wrong_ns2(self):
        """We have seen sitemaps with wrong ns. Presumably, Google still works
        with these, though this is not 100% confirmed"""
        # Same as test_sitemap_wrong_ns, but the root element declares no
        # namespace at all; the type must still be reported as "urlset".
        s = Sitemap(
            b"""<?xml version="1.0" encoding="UTF-8"?>
<urlset>
  <url xmlns="">
    <loc> http://www.example.com/</loc>
    <lastmod>2009-08-16</lastmod>
    <changefreq>daily</changefreq>
    <priority>1</priority>
  </url>
  <url xmlns="">
    <loc> http://www.example.com/2</loc>
    <lastmod />
  </url>
</urlset>
"""
        )
        self.assertEqual(s.type, "urlset")
        self.assertEqual(
            list(s),
            [
                {
                    "priority": "1",
                    "loc": "http://www.example.com/",
                    "lastmod": "2009-08-16",
                    "changefreq": "daily",
                },
                {"loc": "http://www.example.com/2", "lastmod": ""},
            ],
        )

    def test_sitemap_urls_from_robots(self):
        """``Sitemap:`` lines of a robots.txt are returned as absolute URLs.

        The expected list shows the relative entry joined with ``base_url``
        and the upper-case ``HTTP://`` scheme returned in lower case.
        """
        robots = """User-agent: *
Disallow: /aff/
Disallow: /wl/
# Search and shopping refining
Disallow: /s*/*facet
Disallow: /s*/*tags
# Sitemap files
Sitemap: http://example.com/sitemap.xml
Sitemap: http://example.com/sitemap-product-index.xml
Sitemap: HTTP://example.com/sitemap-uppercase.xml
Sitemap: /sitemap-relative-url.xml
# Forums
Disallow: /forum/search/
Disallow: /forum/active/
"""
        self.assertEqual(
            list(sitemap_urls_from_robots(robots, base_url="http://example.com")),
            [
                "http://example.com/sitemap.xml",
                "http://example.com/sitemap-product-index.xml",
                "http://example.com/sitemap-uppercase.xml",
                "http://example.com/sitemap-relative-url.xml",
            ],
        )

    def test_sitemap_blanklines(self):
        """Assert we can deal with starting blank lines before <?xml ?> tag"""
        # The literal deliberately starts with blank lines ahead of the XML
        # declaration.
        s = Sitemap(
            b"""

<?xml version="1.0" encoding="UTF-8"?>
<sitemapindex xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">

<sitemap>
<loc>http://www.example.com/sitemap1.xml</loc>
<lastmod>2013-07-15</lastmod>
</sitemap>

<sitemap>
<loc>http://www.example.com/sitemap2.xml</loc>
<lastmod>2013-07-15</lastmod>
</sitemap>

<sitemap>
<loc>http://www.example.com/sitemap3.xml</loc>
<lastmod>2013-07-15</lastmod>
</sitemap>

</sitemapindex>
"""
        )
        self.assertEqual(
            list(s),
            [
                {"lastmod": "2013-07-15", "loc": "http://www.example.com/sitemap1.xml"},
                {"lastmod": "2013-07-15", "loc": "http://www.example.com/sitemap2.xml"},
                {"lastmod": "2013-07-15", "loc": "http://www.example.com/sitemap3.xml"},
            ],
        )

    def test_comment(self):
        """An XML comment inside a ``<url>`` entry is not reported as a field."""
        s = Sitemap(
            b"""<?xml version="1.0" encoding="UTF-8"?>
<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9"
    xmlns:xhtml="http://www.w3.org/1999/xhtml">
  <url>
    <loc>http://www.example.com/</loc>
    <!-- this is a comment on which the parser might raise an exception if implemented incorrectly -->
  </url>
</urlset>"""
        )
        self.assertEqual(list(s), [{"loc": "http://www.example.com/"}])

    def test_alternate(self):
        """``xhtml:link`` alternates are collected under the ``alternate`` key."""
        s = Sitemap(
            b"""<?xml version="1.0" encoding="UTF-8"?>
<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9"
    xmlns:xhtml="http://www.w3.org/1999/xhtml">
  <url>
    <loc>http://www.example.com/english/</loc>
    <xhtml:link rel="alternate" hreflang="de"
        href="http://www.example.com/deutsch/"/>
    <xhtml:link rel="alternate" hreflang="de-ch"
        href="http://www.example.com/schweiz-deutsch/"/>
    <xhtml:link rel="alternate" hreflang="en"
        href="http://www.example.com/english/"/>
  </url>
</urlset>"""
        )
        self.assertEqual(
            list(s),
            [
                {
                    "loc": "http://www.example.com/english/",
                    "alternate": [
                        "http://www.example.com/deutsch/",
                        "http://www.example.com/schweiz-deutsch/",
                        "http://www.example.com/english/",
                    ],
                }
            ],
        )

    def test_xml_entity_expansion(self):
        """An external entity reference must not be expanded into ``loc``.

        The document declares ``xxe`` as a SYSTEM entity pointing at a local
        file; the expected ``loc`` contains only the literal URL prefix.
        """
        s = Sitemap(
            b"""<?xml version="1.0" encoding="utf-8"?>
<!DOCTYPE foo [
<!ELEMENT foo ANY >
<!ENTITY xxe SYSTEM "file:///etc/passwd" >
]>
<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
  <url>
    <loc>http://127.0.0.1:8000/&xxe;</loc>
  </url>
</urlset>
"""
        )
        self.assertEqual(list(s), [{"loc": "http://127.0.0.1:8000/"}])
if __name__ == "__main__":
    # Run the test cases above when this module is executed as a script.
    unittest.main()