mirror of
https://github.com/scrapy/scrapy.git
synced 2025-02-26 22:04:01 +00:00
Modified extract adaptor to make use of "adaptor_args" (as it should), and added test for AdaptorPipes
--HG-- extra : convert_revision : svn%3Ab85faa78-f9eb-468e-a121-7cced6da292c%40673
This commit is contained in:
parent
41fbcbde48
commit
e4852bca78
@ -10,7 +10,7 @@ from scrapy.utils.url import is_url
|
||||
from scrapy.utils.python import flatten
|
||||
from scrapy.xpath.selector import XPathSelector, XPathSelectorList
|
||||
|
||||
def extract(locations, use_unquote=True):
|
||||
def extract(locations, adaptor_args=None):
|
||||
"""
|
||||
This adaptor tries to extract data from the given locations.
|
||||
Any XPathSelector in it will be extracted, and any other data
|
||||
@ -23,8 +23,8 @@ def extract(locations, use_unquote=True):
|
||||
Input: anything
|
||||
Output: list of extracted selectors plus anything else in the input
|
||||
"""
|
||||
|
||||
locations = flatten([locations])
|
||||
use_unquote = adaptor_args.get('use_unquote', True) if adaptor_args else True
|
||||
|
||||
result = []
|
||||
for location in locations:
|
||||
|
@ -51,10 +51,11 @@ class AdaptorPipe(list):
|
||||
return value
|
||||
|
||||
def __add__(self, other):
|
||||
if isinstance(other, list):
|
||||
return AdaptorPipe(super(AdaptorPipe, self).__add__(other))
|
||||
elif callable(other):
|
||||
return AdaptorPipe(self + [other])
|
||||
if callable(other):
|
||||
other = [other]
|
||||
elif hasattr(other, '__iter__'):
|
||||
other = list(other)
|
||||
return AdaptorPipe(super(AdaptorPipe, self).__add__(other))
|
||||
|
||||
def __repr__(self):
|
||||
return '<AdaptorPipe %s >' % super(AdaptorPipe, self).__repr__()
|
||||
|
@ -3,10 +3,38 @@ import os
|
||||
import unittest
|
||||
import re
|
||||
|
||||
from scrapy.item.adaptors import AdaptorPipe
|
||||
from scrapy.contrib import adaptors
|
||||
from scrapy.http import Response, Headers
|
||||
from scrapy.xpath.selector import HtmlXPathSelector, XmlXPathSelector
|
||||
|
||||
class AdaptorPipeTestCase(unittest.TestCase):
|
||||
def test_pipe_init(self):
|
||||
self.assertRaises(TypeError, AdaptorPipe, [adaptors.extract, 'a string'])
|
||||
|
||||
def test_adaptor_args(self):
|
||||
def sample_adaptor(value, adaptor_args):
|
||||
'''Dummy adaptor that joins the received value with the given string'''
|
||||
sample_text = adaptor_args.get('sample_arg', 'sample text 1')
|
||||
return '%s "%s"' % (value, sample_text)
|
||||
|
||||
sample_value = 'hi, this is my text:'
|
||||
sample_pipe = AdaptorPipe([sample_adaptor])
|
||||
self.assertEqual(sample_pipe(sample_value), 'hi, this is my text: "sample text 1"')
|
||||
self.assertEqual(sample_pipe(sample_value, sample_arg='foobarfoobar'),
|
||||
'hi, this is my text: "foobarfoobar"')
|
||||
|
||||
def test_add(self):
|
||||
pipe1 = AdaptorPipe([adaptors.extract])
|
||||
pipe2 = [adaptors.remove_tags]
|
||||
pipe3 = (adaptors.remove_root, )
|
||||
sample_callable = dir
|
||||
|
||||
self.assertTrue(isinstance(pipe1 + pipe1, AdaptorPipe))
|
||||
self.assertTrue(isinstance(pipe1 + pipe2, AdaptorPipe))
|
||||
self.assertTrue(isinstance(pipe1 + pipe3, AdaptorPipe))
|
||||
self.assertTrue(isinstance(pipe1 + sample_callable, AdaptorPipe))
|
||||
|
||||
class AdaptorsTestCase(unittest.TestCase):
|
||||
def setUp(self):
|
||||
self.samplesdir = os.path.abspath(os.path.join(os.path.dirname(__file__), 'sample_data', 'adaptors'))
|
||||
@ -75,9 +103,16 @@ class AdaptorsTestCase(unittest.TestCase):
|
||||
|
||||
def test_extract_unquoted(self):
|
||||
x = self.get_selector('example.com', 'http://www.example.com/test_unquoted', 'extr_unquoted.xml', selector=XmlXPathSelector)
|
||||
|
||||
# test unquoting
|
||||
self.assertEqual(adaptors.extract(x.x('//tag1/text()')), [u'test text & &', u'more test text & >', u'blah&blah'])
|
||||
self.assertEqual(adaptors.extract(x.x('//tag2/text()')), [u'blaheawfds<'])
|
||||
|
||||
# test without unquoting
|
||||
self.assertEqual(adaptors.extract(x.x('//tag1/text()'), {'use_unquote': False}),
|
||||
[u'test text & &', u'<![CDATA[more test text & >]]>', u'blah&blah'])
|
||||
self.assertEqual(adaptors.extract(x.x('//tag2/text()'), {'use_unquote': False}), [u'blaheawfds<'])
|
||||
|
||||
def test_extract_links(self):
|
||||
test_data = """<html><body>
|
||||
<div>
|
||||
|
Loading…
x
Reference in New Issue
Block a user