mirror of
https://github.com/scrapy/scrapy.git
synced 2025-02-26 20:23:53 +00:00
SitemapSpider: added support for parsing gzipped sitemaps (patch contributed by Rolando Espinoza)
This commit is contained in:
parent
5707051352
commit
949e11ee31
@ -3,6 +3,7 @@ import re
|
|||||||
from scrapy.spider import BaseSpider
|
from scrapy.spider import BaseSpider
|
||||||
from scrapy.http import Request, XmlResponse
|
from scrapy.http import Request, XmlResponse
|
||||||
from scrapy.utils.sitemap import Sitemap, sitemap_urls_from_robots
|
from scrapy.utils.sitemap import Sitemap, sitemap_urls_from_robots
|
||||||
|
from scrapy.utils.gz import gunzip
|
||||||
from scrapy import log
|
from scrapy import log
|
||||||
|
|
||||||
class SitemapSpider(BaseSpider):
|
class SitemapSpider(BaseSpider):
|
||||||
@ -28,10 +29,15 @@ class SitemapSpider(BaseSpider):
|
|||||||
for url in sitemap_urls_from_robots(response.body):
|
for url in sitemap_urls_from_robots(response.body):
|
||||||
yield Request(url, callback=self._parse_sitemap)
|
yield Request(url, callback=self._parse_sitemap)
|
||||||
else:
|
else:
|
||||||
if not isinstance(response, XmlResponse):
|
if isinstance(response, XmlResponse):
|
||||||
|
body = response.body
|
||||||
|
elif is_gzipped(response):
|
||||||
|
body = gunzip(response.body)
|
||||||
|
else:
|
||||||
log.msg("Ignoring non-XML sitemap: %s" % response, log.WARNING)
|
log.msg("Ignoring non-XML sitemap: %s" % response, log.WARNING)
|
||||||
return
|
return
|
||||||
s = Sitemap(response.body)
|
|
||||||
|
s = Sitemap(body)
|
||||||
if s.type == 'sitemapindex':
|
if s.type == 'sitemapindex':
|
||||||
for loc in iterloc(s):
|
for loc in iterloc(s):
|
||||||
if any(x.search(loc) for x in self._follow):
|
if any(x.search(loc) for x in self._follow):
|
||||||
@ -43,6 +49,9 @@ class SitemapSpider(BaseSpider):
|
|||||||
yield Request(loc, callback=c)
|
yield Request(loc, callback=c)
|
||||||
break
|
break
|
||||||
|
|
||||||
|
def is_gzipped(response):
    """Return True if the response declares a gzip Content-Type.

    The ``Content-Type`` header is read from ``response.headers`` (a missing
    header is treated as an empty string) and compared, as an exact match,
    against the gzip media types ``application/x-gzip`` and
    ``application/gzip``.  A header value carrying extra parameters or
    different letter case does not match.
    """
    ctype = response.headers.get('Content-Type', '')
    # 'application/gzip' was previously misspelled as 'applicatoin/gzip',
    # so responses using the registered media type were never recognised.
    return ctype in ('application/x-gzip', 'application/gzip')
|
||||||
|
|
||||||
def regex(x):
|
def regex(x):
|
||||||
if isinstance(x, basestring):
|
if isinstance(x, basestring):
|
||||||
|
Loading…
x
Reference in New Issue
Block a user