mirror of
https://github.com/scrapy/scrapy.git
synced 2025-02-28 09:23:51 +00:00
Fix wrong slice of tokens in recursive extraction of following regions
This commit is contained in:
parent
72d08383bb
commit
d17edd4a59
@ -409,16 +409,16 @@ class RecordExtractor(object):
|
||||
_, _, nested_data = self._doextract(page, nested_regions, pindex, sindex)
|
||||
extracted_data += nested_data
|
||||
if following_regions:
|
||||
_, _, following_data = self._doextract(page, following_regions, sindex or start_index, end_region)
|
||||
_, _, following_data = self._doextract(page, following_regions, sindex or start_index, end_index)
|
||||
extracted_data += following_data
|
||||
|
||||
elif following_regions:
|
||||
end_index, _, following_data = self._doextract(page, following_regions, start_index, end_region)
|
||||
end_index, _, following_data = self._doextract(page, following_regions, start_index, end_index)
|
||||
if end_index is not None:
|
||||
pindex, sindex, extracted_data = self._doextract(page, [first_region], start_index, end_index - 1, nested_regions, ignored_regions)
|
||||
extracted_data += following_data
|
||||
elif nested_regions:
|
||||
_, _, nested_data = self._doextract(page, nested_regions, start_index, end_region)
|
||||
_, _, nested_data = self._doextract(page, nested_regions, start_index, end_index)
|
||||
extracted_data += nested_data
|
||||
return pindex, sindex, extracted_data
|
||||
|
||||
|
@ -607,6 +607,32 @@ EXTRACT_PAGE19b = u"""
|
||||
</body></html>
|
||||
"""
|
||||
|
||||
ANNOTATED_PAGE20 = u"""
|
||||
<html><body>
|
||||
<h1>Product Name</h1>
|
||||
<img src="product.jpg">
|
||||
<br/>
|
||||
<span><ins data-scrapy-annotate="{&quot;variant&quot;: 1, &quot;generated&quot;: true,
|
||||
&quot;annotations&quot;: {&quot;content&quot;: &quot;name&quot;}}">Twin</ins>:</span> $<ins data-scrapy-annotate="{&quot;variant&quot;: 1, &quot;generated&quot;: true,
|
||||
&quot;annotations&quot;: {&quot;content&quot;: &quot;price&quot;}}">270</ins> - November 2010<br/>
|
||||
<span><ins data-scrapy-annotate="{&quot;variant&quot;: 2, &quot;generated&quot;: true,
|
||||
&quot;annotations&quot;: {&quot;content&quot;: &quot;name&quot;}}">Queen</ins>:</span> $<ins data-scrapy-annotate="{&quot;variant&quot;: 2, &quot;generated&quot;: true,
|
||||
&quot;annotations&quot;: {&quot;content&quot;: &quot;price&quot;}}">330</ins> - In stock<br/>
|
||||
<br/>
|
||||
</body></html>
|
||||
"""
|
||||
|
||||
EXTRACT_PAGE20 = u"""
|
||||
<html><body>
|
||||
<h1>Product Name</h1>
|
||||
<img src="product.jpg">
|
||||
<br/>
|
||||
<span>Twin:</span> $270 - November 2010<br/>
|
||||
<span>Queen:</span> $330 - Movember 2010<br/>
|
||||
<br/>
|
||||
</body></html>
|
||||
"""
|
||||
|
||||
SAMPLE_DESCRIPTOR1 = ItemDescriptor('test', 'product test', [
|
||||
A('name', "Product name", required=True),
|
||||
A('price', "Product price, including any discounts and tax or vat",
|
||||
@ -781,6 +807,12 @@ TEST_DATA = [
|
||||
SAMPLE_DESCRIPTOR1,
|
||||
None,
|
||||
),
|
||||
('repeated partial annotations with variants', [ANNOTATED_PAGE20], EXTRACT_PAGE20, None,
|
||||
{u'variants': [
|
||||
{'price': ['270'], 'name': ['Twin']},
|
||||
{'price': ['330'], 'name': ['Queen']},
|
||||
]},
|
||||
),
|
||||
]
|
||||
|
||||
class TestExtraction(TestCase):
|
||||
|
Loading…
x
Reference in New Issue
Block a user