mirror of https://github.com/scrapy/scrapy.git synced 2025-02-24 00:04:09 +00:00

Merge pull request #707 from redapple/gh-issue-706-start-requests

[MRG] Fix engine to support filtered start_requests
Daniel Graña committed 2014-04-28 12:13:40 -04:00
commit 197e360730
3 changed files with 42 additions and 2 deletions

scrapy/core/engine.py

@@ -112,6 +112,7 @@ class ExecutionEngine(object):
             except StopIteration:
                 slot.start_requests = None
             except Exception as exc:
+                slot.start_requests = None
                 log.err(None, 'Obtaining request from start requests', \
                         spider=spider)
             else:
@@ -156,7 +157,8 @@ class ExecutionEngine(object):
         scraper_idle = self.scraper.slot.is_idle()
         pending = self.slot.scheduler.has_pending_requests()
         downloading = bool(self.downloader.active)
-        idle = scraper_idle and not (pending or downloading)
+        pending_start_requests = self.slot.start_requests is not None
+        idle = scraper_idle and not (pending or downloading or pending_start_requests)
         return idle

     @property
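The first hunk discards a start_requests generator once it has raised, so the engine does not keep pulling from a broken iterator. The second hunk is the substance of the fix: as long as the start_requests iterator has not been exhausted, the spider is not considered idle, so the engine keeps scheduling further passes instead of closing the spider, even when every request pulled so far was dropped (for example by the scheduler's duplicate filter). Below is a minimal standalone sketch of that idea, not Scrapy's engine code; toy_crawl, seen and scheduled are made-up stand-ins for the engine slot, the dupefilter and the scheduler queue:

    def toy_crawl(start_urls):
        """Minimal model of the engine loop: pull start requests lazily,
        drop duplicates, and stop only when the 'spider' is truly idle."""
        start_requests = iter(start_urls)
        seen = set()        # stands in for the scheduler's dupefilter
        scheduled = []      # stands in for the scheduler queue
        visited = []

        def is_idle():
            pending = bool(scheduled)
            # Without this term, a pass in which the pulled start request was
            # filtered as a duplicate would look idle and end the crawl early.
            pending_start_requests = start_requests is not None
            return not (pending or pending_start_requests)

        while True:
            if start_requests is not None:
                try:
                    url = next(start_requests)
                except StopIteration:
                    start_requests = None
                else:
                    if url not in seen:      # dupefilter: first occurrence only
                        seen.add(url)
                        scheduled.append(url)
            while scheduled:                 # "download" whatever got scheduled
                visited.append(scheduled.pop())
            if is_idle():
                return visited

    urls = ["http://example.com/a", "http://example.com/a", "http://example.com/b"]
    assert toy_crawl(urls) == ["http://example.com/a", "http://example.com/b"]

If the pending_start_requests term is removed from is_idle(), the sketch returns as soon as the queue drains, leaving later start requests unconsumed — the premature-idle behaviour this change guards against.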

scrapy/tests/spiders.py

@@ -157,3 +157,26 @@ class SingleRequestSpider(MetaSpider):
         self.meta['failure'] = failure
         if callable(self.errback_func):
             return self.errback_func(failure)
+
+
+class DuplicateStartRequestsSpider(Spider):
+    dont_filter = True
+    name = 'duplicatestartrequests'
+    distinct_urls = 2
+    dupe_factor = 3
+
+    def start_requests(self):
+        for i in range(0, self.distinct_urls):
+            for j in range(0, self.dupe_factor):
+                url = "http://localhost:8998/echo?headers=1&body=test%d" % i
+                yield self.make_requests_from_url(url)
+
+    def make_requests_from_url(self, url):
+        return Request(url, dont_filter=self.dont_filter)
+
+    def __init__(self, url="http://localhost:8998", *args, **kwargs):
+        super(DuplicateStartRequestsSpider, self).__init__(*args, **kwargs)
+        self.visited = 0
+
+    def parse(self, response):
+        self.visited += 1
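DuplicateStartRequestsSpider yields each of its distinct_urls URLs dupe_factor times. With dont_filter=True every yielded request is downloaded, so visited ends up at distinct_urls * dupe_factor; with dont_filter=False the scheduler's duplicate filter lets only the first request per URL through, so visited equals distinct_urls. The filtering is based on request fingerprints; a small illustration below, assuming Scrapy's request_fingerprint helper (the function the default RFPDupeFilter relies on):

    from scrapy.http import Request
    from scrapy.utils.request import request_fingerprint

    # Two requests for the same URL share a fingerprint, so the default
    # dupefilter schedules only the first one...
    r1 = Request("http://localhost:8998/echo?headers=1&body=test0")
    r2 = Request("http://localhost:8998/echo?headers=1&body=test0")
    assert request_fingerprint(r1) == request_fingerprint(r2)

    # ...unless the request opts out of filtering, as the spider does
    # when dont_filter=True.
    r3 = Request("http://localhost:8998/echo?headers=1&body=test0", dont_filter=True)
    assert r3.dont_filter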

scrapy/tests/test_crawl.py

@@ -5,7 +5,7 @@ from twisted.internet import defer
 from twisted.trial.unittest import TestCase
 from scrapy.utils.test import docrawl, get_testlog
 from scrapy.tests.spiders import FollowAllSpider, DelaySpider, SimpleSpider, \
-    BrokenStartRequestsSpider, SingleRequestSpider
+    BrokenStartRequestsSpider, SingleRequestSpider, DuplicateStartRequestsSpider
 from scrapy.tests.mockserver import MockServer
 from scrapy.http import Request
@@ -113,6 +113,21 @@ class CrawlTestCase(TestCase):
         #self.assertTrue(spider.seedsseen.index(None) < spider.seedsseen.index(99),
         #                spider.seedsseen)

+    @defer.inlineCallbacks
+    def test_start_requests_dupes(self):
+        settings = {"CONCURRENT_REQUESTS": 1}
+        spider = DuplicateStartRequestsSpider(dont_filter=True,
+                                              distinct_urls=2,
+                                              dupe_factor=3)
+        yield docrawl(spider, settings)
+        self.assertEqual(spider.visited, 6)
+
+        spider = DuplicateStartRequestsSpider(dont_filter=False,
+                                              distinct_urls=3,
+                                              dupe_factor=4)
+        yield docrawl(spider, settings)
+        self.assertEqual(spider.visited, 3)
+
     @defer.inlineCallbacks
     def test_unbounded_response(self):
         # Completeness of responses without Content-Length or Transfer-Encoding
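The new test exercises both cases through a real crawl against the mock server, with CONCURRENT_REQUESTS capped at 1, presumably so the engine interleaves pulling start requests with downloading — the situation in which filtered start requests previously made the spider look idle. The expected visit counts are just the spider's parameters multiplied out; a quick check of the arithmetic the assertions rely on:

    # dont_filter=True: every duplicated start request is downloaded.
    distinct_urls, dupe_factor = 2, 3
    assert distinct_urls * dupe_factor == 6   # spider.visited == 6

    # dont_filter=False: only the first request per unique URL survives the dupefilter.
    distinct_urls, dupe_factor = 3, 4
    assert distinct_urls == 3                 # spider.visited == 3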