diff --git a/scrapy/tests/test_utils_sitemap.py b/scrapy/tests/test_utils_sitemap.py
index d78447ef0..fc1f3b15f 100644
--- a/scrapy/tests/test_utils_sitemap.py
+++ b/scrapy/tests/test_utils_sitemap.py
@@ -39,7 +39,27 @@ class SitemapTest(unittest.TestCase):
         assert s.type == 'sitemapindex'
         self.assertEqual(list(s), [{'loc': 'http://www.example.com/sitemap1.xml.gz', 'lastmod': '2004-10-01T18:23:17+00:00'}, {'loc': 'http://www.example.com/sitemap2.xml.gz', 'lastmod': '2005-01-01'}])
 
-class RobotsTest(unittest.TestCase):
+    def test_sitemap_strip(self):
+        """Assert we can deal with trailing spaces inside <loc> tags - we've
+        seen those
+        """
+        s = Sitemap("""<?xml version="1.0" encoding="UTF-8"?>
+<urlset xmlns="http://www.google.com/schemas/sitemap/0.84">
+  <url>
+    <loc> http://www.example.com/</loc>
+    <lastmod>2009-08-16 </lastmod>
+    <changefreq>daily</changefreq>
+    <priority>1</priority>
+  </url>
+  <url>
+    <loc> http://www.example.com/2</loc>
+    <lastmod />
+  </url>
+</urlset>
+""")
+        self.assertEqual(list(s),
+            [{'priority': '1', 'loc': 'http://www.example.com/', 'lastmod': '2009-08-16', 'changefreq': 'daily'},
+             {'loc': 'http://www.example.com/2', 'lastmod': ''}])
 
     def test_sitemap_urls_from_robots(self):
         robots = """User-agent: *
diff --git a/scrapy/utils/sitemap.py b/scrapy/utils/sitemap.py
index aad39c512..e8d3b367e 100644
--- a/scrapy/utils/sitemap.py
+++ b/scrapy/utils/sitemap.py
@@ -23,7 +23,8 @@ class Sitemap(object):
             d = {}
             for el in elem.getchildren():
                 _, name = el.tag.split('}', 1)
-                d[name] = el.text
+                # el.text is None for empty elements (e.g. <lastmod/>), so guard before stripping
+                d[name] = el.text.strip() if el.text else ''
             yield d
 
 def sitemap_urls_from_robots(robots_text):