mirror of https://github.com/scrapy/scrapy.git

sitemaps: support trailing spaces in <loc> elements

commit 0305ffdd6c (parent 2e74ccaa7e)
Author: Pablo Hoffman
Date:   2011-06-20 21:22:16 -03:00

2 changed files with 17 additions and 2 deletions

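In practice the fix means that whitespace around a URL inside <loc> no longer leaks into the 'loc' value yielded by Sitemap. A minimal usage sketch, not part of this commit, assuming the class is importable from scrapy.utils.sitemap (its location in later Scrapy releases) and that bytes are passed so lxml accepts the XML encoding declaration:

    from scrapy.utils.sitemap import Sitemap

    s = Sitemap(b"""<?xml version="1.0" encoding="UTF-8"?>
    <urlset xmlns="http://www.google.com/schemas/sitemap/0.84">
      <url>
        <loc> http://www.example.com/ </loc>
      </url>
    </urlset>
    """)
    # Each <url> element becomes a dict of its children; 'loc' comes out stripped.
    for entry in s:
        print(entry['loc'])  # http://www.example.com/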

@@ -39,7 +39,22 @@ class SitemapTest(unittest.TestCase):
         assert s.type == 'sitemapindex'
         self.assertEqual(list(s), [{'loc': 'http://www.example.com/sitemap1.xml.gz', 'lastmod': '2004-10-01T18:23:17+00:00'}, {'loc': 'http://www.example.com/sitemap2.xml.gz', 'lastmod': '2005-01-01'}])
 
+    def test_sitemap_strip(self):
+        """Assert we can deal with trailing spaces inside <loc> tags - we've
+        seen those
+        """
+        s = Sitemap("""<?xml version="1.0" encoding="UTF-8"?>
+<urlset xmlns="http://www.google.com/schemas/sitemap/0.84">
+<url>
+<loc> http://www.example.com/</loc>
+<lastmod>2009-08-16</lastmod>
+<changefreq>daily</changefreq>
+<priority>1</priority>
+</url>
+</urlset>
+""")
+        self.assertEqual(list(s),
+            [{'priority': '1', 'loc': 'http://www.example.com/', 'lastmod': '2009-08-16', 'changefreq': 'daily'}])
 
 class RobotsTest(unittest.TestCase):
     def test_sitemap_urls_from_robots(self):
         robots = """User-agent: *


@@ -23,7 +23,7 @@ class Sitemap(object):
             d = {}
             for el in elem.getchildren():
                 _, name = el.tag.split('}', 1)
-                d[name] = el.text
+                d[name] = el.text.strip()
             yield d
 
 def sitemap_urls_from_robots(robots_text):
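The loop above is small enough to reproduce outside Scrapy. A standalone sketch of the same element-to-dict pass using lxml (my own illustration, not code from this commit) which additionally guards against empty elements, where el.text is None and an unconditional .strip() would raise AttributeError:

    import lxml.etree

    def sitemap_entries(xmltext):
        # xmltext: bytes (or a str without an XML encoding declaration)
        root = lxml.etree.fromstring(xmltext)
        for elem in root:
            d = {}
            for el in elem:
                # Tags carry the sitemap namespace, e.g. '{...}loc';
                # keep only the local name (assumes namespaced tags).
                _, name = el.tag.split('}', 1)
                # Strip surrounding whitespace, since some sitemaps wrap the URL
                # in spaces or newlines inside <loc>; empty elements become ''.
                d[name] = el.text.strip() if el.text else ''
            yield d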