mirror of
https://github.com/scrapy/scrapy.git
synced 2025-02-23 12:43:57 +00:00
- Fixed bug in attribute assignment (empty attributes were being set)
- Added GUID setting to FeedSpider
--HG--
extra : convert_revision : svn%3Ab85faa78-f9eb-468e-a121-7cced6da292c%40361
This commit is contained in:
parent
9b46c20da2
commit
bd38a312d4
@ -8,6 +8,7 @@ from scrapy.spider import BaseSpider
|
|||||||
from scrapy.item import ScrapedItem
|
from scrapy.item import ScrapedItem
|
||||||
from scrapy.xpath.selector import XmlXPathSelector
|
from scrapy.xpath.selector import XmlXPathSelector
|
||||||
from scrapy.core.exceptions import UsageError
|
from scrapy.core.exceptions import UsageError
|
||||||
|
from scrapy.utils.iterators import xmliter, csviter
|
||||||
from scrapy.utils.misc import hash_values
|
from scrapy.utils.misc import hash_values
|
||||||
|
|
||||||
class BasicSpider(BaseSpider):
|
class BasicSpider(BaseSpider):
|
||||||
@ -17,7 +18,7 @@ class BasicSpider(BaseSpider):
|
|||||||
gen_guid_attribs = []
|
gen_guid_attribs = []
|
||||||
|
|
||||||
def set_guid(self, item):
|
def set_guid(self, item):
|
||||||
item.guid = hash_values(*[str(getattr(item, aname) or '') for aname in self.gen_guid_attribs])
|
item.guid = hash_values(self.domain_name, *[str(getattr(item, aname) or '') for aname in self.gen_guid_attribs])
|
||||||
|
|
||||||
class CrawlSpider(BasicSpider):
|
class CrawlSpider(BasicSpider):
|
||||||
"""
|
"""
|
||||||
@ -93,6 +94,12 @@ class XMLFeedSpider(BasicSpider):
|
|||||||
iternodes = True
|
iternodes = True
|
||||||
itertag = 'product'
|
itertag = 'product'
|
||||||
|
|
||||||
|
def parse_item_wrapper(self, response, xSel):
|
||||||
|
ret = self.parse_item(response, xSel)
|
||||||
|
if isinstance(ret, ScrapedItem):
|
||||||
|
self.set_guid(ret)
|
||||||
|
return ret
|
||||||
|
|
||||||
def parse(self, response):
|
def parse(self, response):
|
||||||
if not hasattr(self, 'parse_item'):
|
if not hasattr(self, 'parse_item'):
|
||||||
raise NotConfigured('You must define parse_item method in order to scrape this feed')
|
raise NotConfigured('You must define parse_item method in order to scrape this feed')
|
||||||
@ -102,5 +109,5 @@ class XMLFeedSpider(BasicSpider):
|
|||||||
else:
|
else:
|
||||||
nodes = XmlXPathSelector(response).x('//%s' % self.itertag)
|
nodes = XmlXPathSelector(response).x('//%s' % self.itertag)
|
||||||
|
|
||||||
return (self.parse_item(response, xSel) for xSel in nodes)
|
return (self.parse_item_wrapper(response, xSel) for xSel in nodes)
|
||||||
|
|
||||||
|
@ -24,17 +24,18 @@ class ScrapedItem(object):
|
|||||||
|
|
||||||
def attribute(self, attrname, value, override=False, add=False, debug=False):
|
def attribute(self, attrname, value, override=False, add=False, debug=False):
|
||||||
val = self._adaptors_dict.execute(attrname, value, debug)
|
val = self._adaptors_dict.execute(attrname, value, debug)
|
||||||
curr_val = getattr(self, attrname, None)
|
if val:
|
||||||
if not curr_val:
|
curr_val = getattr(self, attrname, None)
|
||||||
setattr(self, attrname, val)
|
if not curr_val:
|
||||||
else:
|
|
||||||
if override:
|
|
||||||
setattr(self, attrname, val)
|
setattr(self, attrname, val)
|
||||||
elif add and all(hasattr(var, '__iter__') for var in (curr_val, val)):
|
else:
|
||||||
newval = []
|
if override:
|
||||||
newval.extend(curr_val)
|
setattr(self, attrname, val)
|
||||||
newval.extend(val)
|
elif add and all(hasattr(var, '__iter__') for var in (curr_val, val)):
|
||||||
setattr(self, attrname, newval)
|
newval = []
|
||||||
|
newval.extend(curr_val)
|
||||||
|
newval.extend(val)
|
||||||
|
setattr(self, attrname, newval)
|
||||||
|
|
||||||
def __sub__(self, other):
|
def __sub__(self, other):
|
||||||
raise NotImplementedError
|
raise NotImplementedError
|
||||||
|
Loading…
x
Reference in New Issue
Block a user