mirror of https://github.com/scrapy/scrapy.git

Merge pull request #98 from kalessin/start_requests

This will break any spider that overrides `start_requests` and expects a `list` as the return value.
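
For illustration, a minimal sketch of a spider that this change breaks, and its fix; the spider name, URLs, and the extra request are hypothetical:

```python
from itertools import chain

from scrapy.http import Request
from scrapy.spider import BaseSpider


class ExtraRequestsSpider(BaseSpider):
    """Hypothetical spider that adds one request to the parent's."""
    name = 'extra_requests'
    start_urls = ['http://example.com/']

    # Broken after this change: BaseSpider.start_requests() now returns a
    # generator, and a generator cannot be concatenated to a list with `+`.
    #def start_requests(self):
    #    return BaseSpider.start_requests(self) + [Request('http://example.com/extra')]

    # Working replacement: treat the parent's return value as an iterable.
    def start_requests(self):
        return chain(BaseSpider.start_requests(self),
                     [Request('http://example.com/extra')])
```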

On the other hand:

* The [docs](http://doc.scrapy.org/en/latest/topics/spiders.html#scrapy.spider.BaseSpider.start_requests) say that the return value is an **iterable**, not a list
* Scrapy core already supports consuming the `start_requests` generator on demand, so we can avoid problems like #47
* It allows extensions to change the starting requests from the `spider_opened` signal (see the sketch after this list)
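
A sketch of that last point, assuming the dispatcher-based signal API of this Scrapy version and relying on the engine pulling `start_requests` only after `spider_opened` has fired; the extension name and warm-up URL are hypothetical:

```python
from scrapy.xlib.pydispatch import dispatcher

from scrapy import signals
from scrapy.http import Request


class PrependStartRequests(object):
    """Hypothetical extension that wraps the spider's start requests."""

    def __init__(self):
        dispatcher.connect(self.spider_opened, signal=signals.spider_opened)

    def spider_opened(self, spider):
        original = spider.start_requests

        def start_requests():
            # Issue a warm-up request before the spider's own requests.
            yield Request('http://example.com/warmup')
            for request in original():
                yield request

        spider.start_requests = start_requests
```

Because the requests are now consumed lazily, the replacement made at `spider_opened` time takes effect.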
Daniel Graña 2012-03-05 08:51:22 -08:00
commit cc6e297062
2 changed files with 2 additions and 4 deletions

@@ -22,7 +22,7 @@ class SitemapSpider(BaseSpider):
         self._follow = [regex(x) for x in self.sitemap_follow]
 
     def start_requests(self):
-        return [Request(x, callback=self._parse_sitemap) for x in self.sitemap_urls]
+        return (Request(x, callback=self._parse_sitemap) for x in self.sitemap_urls)
 
     def _parse_sitemap(self, response):
         if response.url.endswith('/robots.txt'):

@@ -50,10 +50,8 @@ class BaseSpider(object_ref):
         return self._settings
 
     def start_requests(self):
-        reqs = []
         for url in self.start_urls:
-            reqs.extend(arg_to_iter(self.make_requests_from_url(url)))
-        return reqs
+            yield self.make_requests_from_url(url)
 
     def make_requests_from_url(self, url):
         return Request(url, dont_filter=True)