Merge pull request #707 from redapple/gh-issue-706-start-requests
[MRG] Fix engine to support filtered start_requests
Commit 197e360730
@@ -112,6 +112,7 @@ class ExecutionEngine(object):
             except StopIteration:
                 slot.start_requests = None
             except Exception as exc:
+                slot.start_requests = None
                 log.err(None, 'Obtaining request from start requests', \
                         spider=spider)
             else:
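The added line makes the engine drop the start_requests iterator as soon as pulling from it raises, not only when it is exhausted, so the rest of the engine can treat "slot.start_requests is None" as "no start requests left". A minimal, self-contained sketch of that pattern (not the real engine; slot, schedule and log_error are hypothetical stand-ins):

    def pull_one_start_request(slot, schedule, log_error):
        # Pull at most one request per engine tick from the start_requests
        # iterator kept on the slot.
        if slot.start_requests is None:
            return
        try:
            request = next(slot.start_requests)
        except StopIteration:
            # Normal exhaustion: nothing left to pull.
            slot.start_requests = None
        except Exception:
            # Failure inside the spider's start_requests generator: also mark
            # it as finished (the line this commit adds) and log the error.
            slot.start_requests = None
            log_error('Obtaining request from start requests')
        else:
            schedule(request)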
@@ -156,7 +157,8 @@ class ExecutionEngine(object):
         scraper_idle = self.scraper.slot.is_idle()
         pending = self.slot.scheduler.has_pending_requests()
         downloading = bool(self.downloader.active)
-        idle = scraper_idle and not (pending or downloading)
+        pending_start_requests = self.slot.start_requests is not None
+        idle = scraper_idle and not (pending or downloading or pending_start_requests)
         return idle
 
     @property
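This is the core of the fix: the spider is no longer considered idle while its start_requests iterator is still alive. Presumably, when the requests pulled so far had all been filtered out as duplicates by the scheduler, nothing was pending or downloading, so the old check reported the spider as idle and it could be closed before the remaining distinct start requests were pulled; that is the scenario the new test below exercises. A small sketch of the two predicates side by side (not the engine's real method signature, just the boolean logic, assuming slot.start_requests is set to None once exhausted):

    def spider_is_idle_old(scraper_idle, pending, downloading, slot):
        # Old check: unconsumed start requests are invisible to it.
        return scraper_idle and not (pending or downloading)

    def spider_is_idle_new(scraper_idle, pending, downloading, slot):
        # New check: the spider stays busy while start_requests remain.
        pending_start_requests = slot.start_requests is not None
        return scraper_idle and not (pending or downloading or pending_start_requests)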
@@ -157,3 +157,26 @@ class SingleRequestSpider(MetaSpider):
         self.meta['failure'] = failure
         if callable(self.errback_func):
             return self.errback_func(failure)
+
+
+class DuplicateStartRequestsSpider(Spider):
+    dont_filter = True
+    name = 'duplicatestartrequests'
+    distinct_urls = 2
+    dupe_factor = 3
+
+    def start_requests(self):
+        for i in range(0, self.distinct_urls):
+            for j in range(0, self.dupe_factor):
+                url = "http://localhost:8998/echo?headers=1&body=test%d" % i
+                yield self.make_requests_from_url(url)
+
+    def make_requests_from_url(self, url):
+        return Request(url, dont_filter=self.dont_filter)
+
+    def __init__(self, url="http://localhost:8998", *args, **kwargs):
+        super(DuplicateStartRequestsSpider, self).__init__(*args, **kwargs)
+        self.visited = 0
+
+    def parse(self, response):
+        self.visited += 1
@@ -5,7 +5,7 @@ from twisted.internet import defer
 from twisted.trial.unittest import TestCase
 
 from scrapy.utils.test import docrawl, get_testlog
 from scrapy.tests.spiders import FollowAllSpider, DelaySpider, SimpleSpider, \
-    BrokenStartRequestsSpider, SingleRequestSpider
+    BrokenStartRequestsSpider, SingleRequestSpider, DuplicateStartRequestsSpider
 from scrapy.tests.mockserver import MockServer
 from scrapy.http import Request
@@ -113,6 +113,21 @@ class CrawlTestCase(TestCase):
         #self.assertTrue(spider.seedsseen.index(None) < spider.seedsseen.index(99),
         #                spider.seedsseen)
 
+    @defer.inlineCallbacks
+    def test_start_requests_dupes(self):
+        settings = {"CONCURRENT_REQUESTS": 1}
+        spider = DuplicateStartRequestsSpider(dont_filter=True,
+                                              distinct_urls=2,
+                                              dupe_factor=3)
+        yield docrawl(spider, settings)
+        self.assertEqual(spider.visited, 6)
+
+        spider = DuplicateStartRequestsSpider(dont_filter=False,
+                                              distinct_urls=3,
+                                              dupe_factor=4)
+        yield docrawl(spider, settings)
+        self.assertEqual(spider.visited, 3)
+
     @defer.inlineCallbacks
     def test_unbounded_response(self):
         # Completeness of responses without Content-Length or Transfer-Encoding
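The two crawls pin down the expected counts: with dont_filter=True every one of the distinct_urls * dupe_factor generated requests is downloaded, while with dont_filter=False duplicates are filtered and only the distinct URLs are visited. CONCURRENT_REQUESTS=1 presumably keeps the engine draining start requests one at a time, the situation where the old idle check could close the spider early. A tiny sanity check of that arithmetic, just restating the assertions above:

    def expected_visits(distinct_urls, dupe_factor, dont_filter):
        # All generated requests when duplicates are allowed, otherwise one
        # visit per distinct URL.
        return distinct_urls * dupe_factor if dont_filter else distinct_urls

    assert expected_visits(2, 3, dont_filter=True) == 6
    assert expected_visits(3, 4, dont_filter=False) == 3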