1
0
mirror of https://github.com/scrapy/scrapy.git synced 2025-02-22 06:33:12 +00:00

Improved Adaptors code

--HG--
extra : convert_revision : svn%3Ab85faa78-f9eb-468e-a121-7cced6da292c%40185
This commit is contained in:
olveyra 2008-08-31 00:25:13 +00:00
parent 0e6562cb47
commit f3013bb9ad
2 changed files with 86 additions and 16 deletions

View File

@ -0,0 +1,73 @@
import re
class DuplicatedAdaptorName(Exception):
    """Raised when inserting an adaptor whose name is already in the pipe."""
    def __init__(self, name):
        self.__name = name
    def __str__(self):
        # BUG FIX: previously returned the bare global `name`, which raised
        # NameError whenever the exception was printed; return the stored
        # adaptor name instead.
        return self.__name
class Adaptor(object):
    """One step of an AdaptorPipe.

    Bundles a callable with a name and a compiled regex that decides which
    attribute names the step applies to. Instances are meant to be created
    and used only from inside an AdaptorPipe.
    """
    def __init__(self, function, name, attribute_re=None):
        self.name = name
        self.basefunction = function
        # A falsy pattern (None or "") means "match every attribute name".
        self.attribute_re = re.compile(attribute_re if attribute_re else ".*")

    def function(self, value, **pipeargs):
        """Apply the wrapped callable to `value`, forwarding pipeline kwargs."""
        return self.basefunction(value, **pipeargs)
class AdaptorPipe:
    """An ordered pipeline of adaptors applied to scraped attribute values."""

    def __init__(self, define_from=None, adaptorclass=None):
        """
        If "define_from" is given, constructs pipeline from this.
        "define_from" is an ordered tuple of triplets, each of which
        has the attribute name regex, the adaptor name, and the module
        path to the adaptor function. Example:
        (
          (".*", "remove_entities", "scrapy.utils.markup.remove_entities"),
          (".*", "replace_tags", "scrapy.utils.markup.replace_tags"),
          ...
        )
        "adaptorclass" optionally overrides the class used to wrap each
        adaptor function (defaults to Adaptor).
        """
        self.__adaptorspipe = []
        self.__adaptorclass = adaptorclass or Adaptor
        # BUG FIX: "define_from" was documented but silently ignored; build
        # the pipe from it as advertised, resolving each dotted path to the
        # function it names.
        if define_from:
            for attrs_re, name, path in define_from:
                modulename, funcname = path.rsplit(".", 1)
                module = __import__(modulename, {}, {}, [funcname])
                self.insertadaptor(getattr(module, funcname), name, attrs_re)

    @property
    def adaptors_names(self):
        """Names of the adaptors, in pipeline order."""
        return [adaptor.name for adaptor in self.__adaptorspipe]

    def insertadaptor(self, function, name, attrs_re=None, after=None, before=None):
        """
        Inserts a "function" as an adaptor that will apply for attribute names
        which matches "attrs_re" regex, after adaptor of name "after", or before
        adaptor of name "before". The "function" must always have a **keyword
        argument to ignore unused keywords. "name" is the name of the adaptor.

        Returns the position the adaptor was inserted at; raises
        DuplicatedAdaptorName if "name" is already used in this pipe.
        """
        if name in self.adaptors_names:
            raise DuplicatedAdaptorName(name)
        adaptor = self.__adaptorclass(function, name, attrs_re)
        # By default append the adaptor at the end of the pipe.
        pos = len(self.adaptors_names)
        if after:
            pos = self.adaptors_names.index(after) + 1
        elif before:
            pos = self.adaptors_names.index(before)
        self.__adaptorspipe.insert(pos, adaptor)
        return pos

    def execute(self, attrname, value, **pipeargs):
        """
        Execute pipeline for attribute name "attrname" and value "value",
        threading the result of each matching adaptor into the next.
        Pass the given pipeargs to each adaptor function in the pipe.
        """
        for adaptor in self.__adaptorspipe:
            if adaptor.attribute_re.search(attrname):
                value = adaptor.function(value, **pipeargs)
        return value

View File

@ -1,13 +1,12 @@
class BaseAdaptor(object):
    """Abstract base for adaptors; subclasses must override function()."""
    def function(self, item, value, **pipeargs):
        """Transform "value" for "item"; must be implemented by subclasses."""
        # BUG FIX: `raise NotImplemented` raises a TypeError because
        # NotImplemented is not an exception; raise NotImplementedError.
        raise NotImplementedError
from scrapy.item.adaptors import AdaptorPipe
#default adaptors
class ExtractAdaptor(BaseAdaptor):
    """Adaptor that unwraps selector-like objects via their extract() method."""
    def function(self, item, value, **pipeargs):
        """Return value.extract() when available, otherwise value unchanged."""
        if not hasattr(value, 'extract'):
            return value
        return value.extract()
def extract(value):
    """Unwrap selector-like objects: return value.extract() when available,
    otherwise return value unchanged."""
    if not hasattr(value, 'extract'):
        return value
    return value.extract()
# Default module-level pipeline: a single "extract" step that unwraps
# selector-like values before they are stored on an item.
standardpipe = AdaptorPipe()
standardpipe.insertadaptor(extract, "extract")
class ScrapedItem(object):
"""
@ -17,14 +16,12 @@ class ScrapedItem(object):
* guid (unique global identifier)
* url (URL where that item was scraped from)
"""
adaptors_pipe = [ExtractAdaptor()]
adaptors_pipe = standardpipe
def set_adaptors_pipe(adaptors_pipes):
ScrapedItem.adaptors_pipes = adaptors_pipes
def attribute(self, name, value, **pipeargs):
for adaptor in ScrapedItem.adaptors_pipe:
value = adaptor.function(self, value, **pipeargs)
if not hasattr(item, name):
setattr(item, name, value)
def attribute(self, attrname, value, **pipeargs):
value =ScrapedItem.adaptors_pipe.execute(attrname, value, **pipeargs)
if not hasattr(self, attrname):
setattr(self, attrname, value)