1
0
mirror of https://github.com/scrapy/scrapy.git synced 2025-02-28 09:23:51 +00:00

Fix wrong slice of tokens in recursive extraction of follow region

This commit is contained in:
Martin Olveyra 2010-10-22 14:21:52 -02:00
parent 72d08383bb
commit d17edd4a59
2 changed files with 35 additions and 3 deletions

View File

@@ -409,16 +409,16 @@ class RecordExtractor(object):
_, _, nested_data = self._doextract(page, nested_regions, pindex, sindex)
extracted_data += nested_data
if following_regions:
-_, _, following_data = self._doextract(page, following_regions, sindex or start_index, end_region)
+_, _, following_data = self._doextract(page, following_regions, sindex or start_index, end_index)
extracted_data += following_data
elif following_regions:
-end_index, _, following_data = self._doextract(page, following_regions, start_index, end_region)
+end_index, _, following_data = self._doextract(page, following_regions, start_index, end_index)
if end_index is not None:
pindex, sindex, extracted_data = self._doextract(page, [first_region], start_index, end_index - 1, nested_regions, ignored_regions)
extracted_data += following_data
elif nested_regions:
-_, _, nested_data = self._doextract(page, nested_regions, start_index, end_region)
+_, _, nested_data = self._doextract(page, nested_regions, start_index, end_index)
extracted_data += nested_data
return pindex, sindex, extracted_data

View File

@@ -607,6 +607,32 @@ EXTRACT_PAGE19b = u"""
</body></html>
"""
ANNOTATED_PAGE20 = u"""
<html><body>
<h1>Product Name</h1>
<img src="product.jpg">
<br/>
<span><ins data-scrapy-annotate="{&quot;variant&quot;: 1, &quot;generated&quot;: true,
&quot;annotations&quot;: {&quot;content&quot;: &quot;name&quot;}}">Twin</ins>:</span> $<ins data-scrapy-annotate="{&quot;variant&quot;: 1, &quot;generated&quot;: true,
&quot;annotations&quot;: {&quot;content&quot;: &quot;price&quot;}}">270</ins> - November 2010<br/>
<span><ins data-scrapy-annotate="{&quot;variant&quot;: 2, &quot;generated&quot;: true,
&quot;annotations&quot;: {&quot;content&quot;: &quot;name&quot;}}">Queen</ins>:</span> $<ins data-scrapy-annotate="{&quot;variant&quot;: 2, &quot;generated&quot;: true,
&quot;annotations&quot;: {&quot;content&quot;: &quot;price&quot;}}">330</ins> - In stock<br/>
<br/>
</body></html>
"""
EXTRACT_PAGE20 = u"""
<html><body>
<h1>Product Name</h1>
<img src="product.jpg">
<br/>
<span>Twin:</span> $270 - November 2010<br/>
<span>Queen:</span> $330 - Movember 2010<br/>
<br/>
</body></html>
"""
SAMPLE_DESCRIPTOR1 = ItemDescriptor('test', 'product test', [
A('name', "Product name", required=True),
A('price', "Product price, including any discounts and tax or vat",
@@ -781,6 +807,12 @@ TEST_DATA = [
SAMPLE_DESCRIPTOR1,
None,
),
('repeated partial annotations with variants', [ANNOTATED_PAGE20], EXTRACT_PAGE20, None,
{u'variants': [
{'price': ['270'], 'name': ['Twin']},
{'price': ['330'], 'name': ['Queen']},
]},
),
]
class TestExtraction(TestCase):