diff --git a/scrapy/commands/fetch.py b/scrapy/commands/fetch.py
index a157b19f8..6fe6d73b9 100644
--- a/scrapy/commands/fetch.py
+++ b/scrapy/commands/fetch.py
@@ -5,6 +5,8 @@ from w3lib.url import is_url
+from six.moves import range
 from scrapy.commands import ScrapyCommand
 from scrapy.http import Request
 from scrapy.exceptions import UsageError
+from scrapy.utils.datatypes import SequenceExclude
 from scrapy.utils.spider import spidercls_for_request, DefaultSpider
 
 class Command(ScrapyCommand):
@@ -27,8 +29,8 @@ class Command(ScrapyCommand):
             help="use this spider")
         parser.add_option("--headers", dest="headers", action="store_true", \
             help="print response HTTP headers instead of body")
-        parser.add_option("--no-status-aware", dest="no_status_aware", action="store_true", \
-            default=False, help="do not handle status codes like redirects and print response as-is")
+        parser.add_option("--no-redirect", dest="no_redirect", action="store_true", \
+            default=False, help="do not handle HTTP 3xx status codes and print response as-is")
 
     def _print_headers(self, headers, prefix):
         for key, values in headers.items():
@@ -52,7 +54,11 @@ class Command(ScrapyCommand):
             raise UsageError()
         cb = lambda x: self._print_response(x, opts)
         request = Request(args[0], callback=cb, dont_filter=True)
-        if opts.no_status_aware:
+        # by default, let the framework handle redirects,
+        # i.e. the command handles all status codes except 3xx
+        if not opts.no_redirect:
+            request.meta['handle_httpstatus_list'] = SequenceExclude(range(300, 400))
+        else:
             request.meta['handle_httpstatus_all'] = True
 
         spidercls = DefaultSpider
diff --git a/scrapy/commands/shell.py b/scrapy/commands/shell.py
index bc0203d89..40a58d94a 100644
--- a/scrapy/commands/shell.py
+++ b/scrapy/commands/shell.py
@@ -36,8 +36,8 @@ class Command(ScrapyCommand):
             help="evaluate the code in the shell, print the result and exit")
         parser.add_option("--spider", dest="spider",
             help="use this spider")
-        parser.add_option("--no-status-aware", dest="no_status_aware", action="store_true", \
-            default=False, help="do not transparently handle status codes like redirects")
+        parser.add_option("--no-redirect", dest="no_redirect", action="store_true", \
+            default=False, help="do not handle HTTP 3xx status codes and print response as-is")
 
     def update_vars(self, vars):
         """You can use this function to update the Scrapy objects that will be
@@ -70,7 +70,7 @@ class Command(ScrapyCommand):
         self._start_crawler_thread()
 
         shell = Shell(crawler, update_vars=self.update_vars, code=opts.code)
-        shell.start(url=url, handle_statuses=opts.no_status_aware)
+        shell.start(url=url, redirect=not opts.no_redirect)
 
     def _start_crawler_thread(self):
         t = Thread(target=self.crawler_process.start,
diff --git a/scrapy/shell.py b/scrapy/shell.py
index 966003f17..6c78722be 100644
--- a/scrapy/shell.py
+++ b/scrapy/shell.py
@@ -7,6 +7,7 @@ from __future__ import print_function
 
 import os
 import signal
+from six.moves import range
 import warnings
 
 from twisted.internet import reactor, threads, defer
@@ -20,6 +21,7 @@ from scrapy.item import BaseItem
 from scrapy.settings import Settings
 from scrapy.spiders import Spider
 from scrapy.utils.console import start_python_console
+from scrapy.utils.datatypes import SequenceExclude
 from scrapy.utils.misc import load_object
 from scrapy.utils.response import open_in_browser
 from scrapy.utils.conf import get_config
@@ -40,11 +42,11 @@ class Shell(object):
         self.code = code
         self.vars = {}
 
-    def start(self, url=None, request=None, response=None, spider=None, handle_statuses=True):
+    def start(self, url=None, request=None, response=None, spider=None, redirect=True):
         # disable accidental Ctrl-C key press from shutting down the engine
         signal.signal(signal.SIGINT, signal.SIG_IGN)
         if url:
-            self.fetch(url, spider, handle_statuses=handle_statuses)
+            self.fetch(url, spider, redirect=redirect)
         elif request:
             self.fetch(request, spider)
         elif response:
@@ -98,13 +100,15 @@ class Shell(object):
         self.spider = spider
         return spider
 
-    def fetch(self, request_or_url, spider=None, handle_statuses=False, **kwargs):
+    def fetch(self, request_or_url, spider=None, redirect=True, **kwargs):
         if isinstance(request_or_url, Request):
             request = request_or_url
         else:
             url = any_to_uri(request_or_url)
             request = Request(url, dont_filter=True, **kwargs)
-        if handle_statuses:
+        if redirect:
+            request.meta['handle_httpstatus_list'] = SequenceExclude(range(300, 400))
+        else:
             request.meta['handle_httpstatus_all'] = True
         response = None
         try:
diff --git a/scrapy/utils/datatypes.py b/scrapy/utils/datatypes.py
index d04b43176..e516185bd 100644
--- a/scrapy/utils/datatypes.py
+++ b/scrapy/utils/datatypes.py
@@ -304,3 +304,13 @@ class LocalCache(OrderedDict):
         while len(self) >= self.limit:
             self.popitem(last=False)
         super(LocalCache, self).__setitem__(key, value)
+
+
+class SequenceExclude(object):
+    """Object to test if an item is NOT within some sequence."""
+
+    def __init__(self, seq):
+        self.seq = seq
+
+    def __contains__(self, item):
+        return item not in self.seq
diff --git a/tests/test_command_fetch.py b/tests/test_command_fetch.py
index 45d03a129..3fa3ed930 100644
--- a/tests/test_command_fetch.py
+++ b/tests/test_command_fetch.py
@@ -21,7 +21,7 @@ class FetchTest(ProcessTest, SiteTest, unittest.TestCase):
 
     @defer.inlineCallbacks
     def test_redirect_disabled(self):
-        _, out, err = yield self.execute(['--no-status-aware', self.url('/redirect-no-meta-refresh')])
+        _, out, err = yield self.execute(['--no-redirect', self.url('/redirect-no-meta-refresh')])
         err = err.strip()
         self.assertIn(b'downloader/response_status_count/302', err, err)
         self.assertNotIn(b'downloader/response_status_count/200', err, err)
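
Note for reviewers (not part of the patch): a minimal sketch of why SequenceExclude
is enough here. Scrapy's redirect middleware decides whether to step aside by testing
`response.status in request.meta['handle_httpstatus_list']`, so any object that
implements __contains__ works as that list. Wrapping range(300, 400) in
SequenceExclude hands every status *except* 3xx straight to the command's callback,
while redirects stay with the middleware; with --no-redirect, handle_httpstatus_all
is set instead and even 3xx responses come back unprocessed.

    from six.moves import range
    from scrapy.utils.datatypes import SequenceExclude

    # SequenceExclude inverts membership: `in` is True only for items
    # NOT contained in the wrapped sequence.
    statuses = SequenceExclude(range(300, 400))
    assert 200 in statuses      # delivered to the callback and printed
    assert 404 in statuses      # non-3xx error responses too
    assert 302 not in statuses  # 3xx is left to the redirect middleware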