Fix wrong slice of tokens in recursive extraction of follow region

2025-02-28 09:23:51 +00:00 · 2010-10-22 14:21:52 -02:00 · 2010-10-22 14:21:52 -02:00 · d17edd4a59
commit d17edd4a59
parent 72d08383bb
2 changed files with 35 additions and 3 deletions
--- a/scrapy/contrib/ibl/extraction/regionextract.py
+++ b/scrapy/contrib/ibl/extraction/regionextract.py
@@ -409,16 +409,16 @@ class RecordExtractor(object):
                _, _, nested_data = self._doextract(page, nested_regions, pindex, sindex)
                extracted_data += nested_data
            if following_regions:
-                _, _, following_data = self._doextract(page, following_regions, sindex or start_index, end_region)
+                _, _, following_data = self._doextract(page, following_regions, sindex or start_index, end_index)
                extracted_data += following_data
        
        elif following_regions:
-            end_index, _, following_data = self._doextract(page, following_regions, start_index, end_region)
+            end_index, _, following_data = self._doextract(page, following_regions, start_index, end_index)
            if end_index is not None:
                pindex, sindex, extracted_data = self._doextract(page, [first_region], start_index, end_index - 1, nested_regions, ignored_regions)
                extracted_data += following_data
        elif nested_regions:
-            _, _, nested_data = self._doextract(page, nested_regions, start_index, end_region)
+            _, _, nested_data = self._doextract(page, nested_regions, start_index, end_index)
            extracted_data += nested_data
        return pindex, sindex, extracted_data
                
--- a/scrapy/tests/test_contrib_ibl/test_extraction.py
+++ b/scrapy/tests/test_contrib_ibl/test_extraction.py
@@ -607,6 +607,32 @@ EXTRACT_PAGE19b = u"""
 </body></html>
 """

+ANNOTATED_PAGE20 = u"""
+<html><body>
+<h1>Product Name</h1>
+<img src="product.jpg">
+<br/>
+<span><ins data-scrapy-annotate="{&quot;variant&quot;: 1, &quot;generated&quot;: true,                                              
+&quot;annotations&quot;: {&quot;content&quot;: &quot;name&quot;}}">Twin</ins>:</span> $<ins data-scrapy-annotate="{&quot;variant&quot;: 1, &quot;generated&quot;: true,
+&quot;annotations&quot;: {&quot;content&quot;: &quot;price&quot;}}">270</ins> - November 2010<br/>
+<span><ins data-scrapy-annotate="{&quot;variant&quot;: 2, &quot;generated&quot;: true,
+&quot;annotations&quot;: {&quot;content&quot;: &quot;name&quot;}}">Queen</ins>:</span> $<ins data-scrapy-annotate="{&quot;variant&quot;: 2, &quot;generated&quot;: true,
+&quot;annotations&quot;: {&quot;content&quot;: &quot;price&quot;}}">330</ins> - In stock<br/>
+<br/>
+</body></html>
+"""
+
+EXTRACT_PAGE20 = u"""
+<html><body>
+<h1>Product Name</h1>
+<img src="product.jpg">
+<br/>
+<span>Twin:</span> $270 - November 2010<br/>
+<span>Queen:</span> $330 - Movember 2010<br/>
+<br/>
+</body></html>
+"""
+
 SAMPLE_DESCRIPTOR1 = ItemDescriptor('test', 'product test', [
    A('name', "Product name", required=True),
    A('price', "Product price, including any discounts and tax or vat", 
@@ -781,6 +807,12 @@ TEST_DATA = [
         SAMPLE_DESCRIPTOR1,
         None,
    ),
+    ('repeated partial annotations with variants', [ANNOTATED_PAGE20], EXTRACT_PAGE20, None,
+            {u'variants': [
+                {'price': ['270'], 'name': ['Twin']},
+                {'price': ['330'], 'name': ['Queen']},
+            ]},
+    ),
 ]

 class TestExtraction(TestCase):