1
0
mirror of https://github.com/scrapy/scrapy.git synced 2025-02-26 22:04:01 +00:00

Modified extract adaptor to make use of "adaptor_args" (as it should), and added test for AdaptorPipes

--HG--
extra : convert_revision : svn%3Ab85faa78-f9eb-468e-a121-7cced6da292c%40673
This commit is contained in:
elpolilla 2009-01-07 14:00:54 +00:00
parent 41fbcbde48
commit e4852bca78
3 changed files with 42 additions and 6 deletions

View File

@ -10,7 +10,7 @@ from scrapy.utils.url import is_url
from scrapy.utils.python import flatten
from scrapy.xpath.selector import XPathSelector, XPathSelectorList
def extract(locations, use_unquote=True):
def extract(locations, adaptor_args=None):
"""
This adaptor tries to extract data from the given locations.
Any XPathSelector in it will be extracted, and any other data
@ -23,8 +23,8 @@ def extract(locations, use_unquote=True):
Input: anything
Output: list of extracted selectors plus anything else in the input
"""
locations = flatten([locations])
use_unquote = adaptor_args.get('use_unquote', True) if adaptor_args else True
result = []
for location in locations:

View File

@ -51,10 +51,11 @@ class AdaptorPipe(list):
return value
def __add__(self, other):
    """
    Return a new AdaptorPipe made of this pipe's adaptors followed by `other`.

    `other` may be:
      * a list (this includes another AdaptorPipe): its items are appended;
      * a single callable: it is appended as one adaptor;
      * any other object exposing `__iter__` (e.g. a tuple): it is turned
        into a list and its items are appended.

    Any other operand is passed to list.__add__ untouched, which raises
    TypeError for operands that are not lists.
    """
    # Lists are tested before callables on purpose: an AdaptorPipe is a list
    # that can also be called, so testing callable() first would wrap a whole
    # pipe as one nested adaptor instead of concatenating its items.
    if isinstance(other, list):
        items = other
    elif callable(other):
        items = [other]
    elif hasattr(other, '__iter__'):
        items = list(other)
    else:
        items = other
    return AdaptorPipe(super(AdaptorPipe, self).__add__(items))
def __repr__(self):
    """Return the parent class repr wrapped as '<AdaptorPipe ... >'."""
    inner_repr = super(AdaptorPipe, self).__repr__()
    return '<AdaptorPipe %s >' % (inner_repr,)

View File

@ -3,10 +3,38 @@ import os
import unittest
import re
from scrapy.item.adaptors import AdaptorPipe
from scrapy.contrib import adaptors
from scrapy.http import Response, Headers
from scrapy.xpath.selector import HtmlXPathSelector, XmlXPathSelector
class AdaptorPipeTestCase(unittest.TestCase):
    """Tests for AdaptorPipe construction, argument passing and addition."""

    def test_pipe_init(self):
        """Building a pipe from a list that contains a plain string raises TypeError."""
        invalid_adaptors = [adaptors.extract, 'a string']
        self.assertRaises(TypeError, AdaptorPipe, invalid_adaptors)

    def test_adaptor_args(self):
        """Keyword arguments given when calling the pipe reach the adaptor's adaptor_args."""
        def sample_adaptor(value, adaptor_args):
            '''Dummy adaptor that joins the received value with the given string'''
            suffix = adaptor_args.get('sample_arg', 'sample text 1')
            return '%s "%s"' % (value, suffix)

        text = 'hi, this is my text:'
        pipe = AdaptorPipe([sample_adaptor])

        # No keyword arguments: the adaptor falls back to its default text.
        default_result = pipe(text)
        self.assertEqual(default_result, 'hi, this is my text: "sample text 1"')

        # An explicit sample_arg replaces the default text.
        custom_result = pipe(text, sample_arg='foobarfoobar')
        self.assertEqual(custom_result, 'hi, this is my text: "foobarfoobar"')

    def test_add(self):
        """Adding a pipe, a list, a tuple or a callable to a pipe yields an AdaptorPipe."""
        base_pipe = AdaptorPipe([adaptors.extract])
        operands = (
            base_pipe,                   # another AdaptorPipe
            [adaptors.remove_tags],      # plain list
            (adaptors.remove_root, ),    # tuple
            dir,                         # bare callable
        )
        for operand in operands:
            self.assertTrue(isinstance(base_pipe + operand, AdaptorPipe))
class AdaptorsTestCase(unittest.TestCase):
def setUp(self):
    """Store the absolute path of the 'sample_data/adaptors' directory beside this file."""
    module_dir = os.path.dirname(__file__)
    samples_path = os.path.join(module_dir, 'sample_data', 'adaptors')
    self.samplesdir = os.path.abspath(samples_path)
@ -75,9 +103,16 @@ class AdaptorsTestCase(unittest.TestCase):
def test_extract_unquoted(self):
    """Check adaptors.extract on the unquoted sample, with and without 'use_unquote'."""
    selector = self.get_selector(
        'example.com',
        'http://www.example.com/test_unquoted',
        'extr_unquoted.xml',
        selector=XmlXPathSelector,
    )
    tag1_xpath = '//tag1/text()'
    tag2_xpath = '//tag2/text()'

    # No adaptor_args given, so unquoting is applied.
    unquoted_tag1 = adaptors.extract(selector.x(tag1_xpath))
    self.assertEqual(unquoted_tag1,
                     [u'test text & &', u'more test text &amp; &gt;', u'blah&blah'])
    unquoted_tag2 = adaptors.extract(selector.x(tag2_xpath))
    self.assertEqual(unquoted_tag2, [u'blaheawfds<'])

    # Unquoting switched off through adaptor_args.
    raw_tag1 = adaptors.extract(selector.x(tag1_xpath), dict(use_unquote=False))
    self.assertEqual(raw_tag1,
                     [u'test text &amp; &amp;', u'<![CDATA[more test text &amp; &gt;]]>', u'blah&amp;blah'])
    raw_tag2 = adaptors.extract(selector.x(tag2_xpath), dict(use_unquote=False))
    self.assertEqual(raw_tag2, [u'blaheawfds&lt;'])
def test_extract_links(self):
test_data = """<html><body>
<div>