
Let framework handle only HTTP redirects by default for fetch and shell commands

Author: Paul Tremberth, 2016-12-07 17:56:13 +01:00
parent 9aefc0a886
commit 778bed07bf
5 changed files with 30 additions and 11 deletions

scrapy/commands/fetch.py

@@ -5,6 +5,7 @@ from w3lib.url import is_url
 from scrapy.commands import ScrapyCommand
 from scrapy.http import Request
 from scrapy.exceptions import UsageError
+from scrapy.utils.datatypes import SequenceExclude
 from scrapy.utils.spider import spidercls_for_request, DefaultSpider

 class Command(ScrapyCommand):
@@ -27,8 +28,8 @@ class Command(ScrapyCommand):
             help="use this spider")
         parser.add_option("--headers", dest="headers", action="store_true", \
             help="print response HTTP headers instead of body")
-        parser.add_option("--no-status-aware", dest="no_status_aware", action="store_true", \
-            default=False, help="do not handle status codes like redirects and print response as-is")
+        parser.add_option("--no-redirect", dest="no_redirect", action="store_true", \
+            default=False, help="do not handle HTTP 3xx status codes and print response as-is")

     def _print_headers(self, headers, prefix):
         for key, values in headers.items():
@@ -52,7 +53,11 @@ class Command(ScrapyCommand):
             raise UsageError()
         cb = lambda x: self._print_response(x, opts)
         request = Request(args[0], callback=cb, dont_filter=True)
-        if opts.no_status_aware:
+        # by default, let the framework handle redirects,
+        # i.e. command handles all codes except 3xx
+        if not opts.no_redirect:
+            request.meta['handle_httpstatus_list'] = SequenceExclude(six.moves.range(300, 400))
+        else:
             request.meta['handle_httpstatus_all'] = True

         spidercls = DefaultSpider
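The crux of the change is visible in this hunk: instead of asking the engine to deliver every status code (handle_httpstatus_all), the command now sets handle_httpstatus_list to a SequenceExclude wrapping the 3xx range, so every non-redirect code reaches the callback while RedirectMiddleware keeps following redirects. A minimal sketch of that membership logic, where the URL is only a placeholder:

    from scrapy.http import Request
    from scrapy.utils.datatypes import SequenceExclude

    statuses = SequenceExclude(range(300, 400))
    assert 200 in statuses       # delivered straight to the command's callback
    assert 404 in statuses       # errors too: the command prints them as-is
    assert 302 not in statuses   # 3xx is left to RedirectMiddleware

    request = Request('http://example.com', dont_filter=True)
    request.meta['handle_httpstatus_list'] = statuses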

scrapy/commands/shell.py

@@ -36,8 +36,8 @@ class Command(ScrapyCommand):
             help="evaluate the code in the shell, print the result and exit")
         parser.add_option("--spider", dest="spider",
             help="use this spider")
-        parser.add_option("--no-status-aware", dest="no_status_aware", action="store_true", \
-            default=False, help="do not transparently handle status codes like redirects")
+        parser.add_option("--no-redirect", dest="no_redirect", action="store_true", \
+            default=False, help="do not handle HTTP 3xx status codes and print response as-is")

     def update_vars(self, vars):
         """You can use this function to update the Scrapy objects that will be
@@ -70,7 +70,7 @@ class Command(ScrapyCommand):
         self._start_crawler_thread()

         shell = Shell(crawler, update_vars=self.update_vars, code=opts.code)
-        shell.start(url=url, handle_statuses=opts.no_status_aware)
+        shell.start(url=url, redirect=not opts.no_redirect)

     def _start_crawler_thread(self):
         t = Thread(target=self.crawler_process.start,

scrapy/shell.py

@@ -7,6 +7,7 @@ from __future__ import print_function

 import os
 import signal
+from six.moves import range
 import warnings

 from twisted.internet import reactor, threads, defer
@@ -20,6 +21,7 @@ from scrapy.item import BaseItem
 from scrapy.settings import Settings
 from scrapy.spiders import Spider
 from scrapy.utils.console import start_python_console
+from scrapy.utils.datatypes import SequenceExclude
 from scrapy.utils.misc import load_object
 from scrapy.utils.response import open_in_browser
 from scrapy.utils.conf import get_config
@@ -40,11 +42,11 @@ class Shell(object):
         self.code = code
         self.vars = {}

-    def start(self, url=None, request=None, response=None, spider=None, handle_statuses=True):
+    def start(self, url=None, request=None, response=None, spider=None, redirect=True):
         # disable accidental Ctrl-C key press from shutting down the engine
         signal.signal(signal.SIGINT, signal.SIG_IGN)
         if url:
-            self.fetch(url, spider, handle_statuses=handle_statuses)
+            self.fetch(url, spider, redirect=redirect)
         elif request:
             self.fetch(request, spider)
         elif response:
@@ -98,13 +100,15 @@ class Shell(object):
         self.spider = spider
         return spider

-    def fetch(self, request_or_url, spider=None, handle_statuses=False, **kwargs):
+    def fetch(self, request_or_url, spider=None, redirect=True, **kwargs):
         if isinstance(request_or_url, Request):
             request = request_or_url
         else:
             url = any_to_uri(request_or_url)
             request = Request(url, dont_filter=True, **kwargs)
-            if handle_statuses:
+            if redirect:
+                request.meta['handle_httpstatus_list'] = SequenceExclude(range(300, 400))
+            else:
                 request.meta['handle_httpstatus_all'] = True
         response = None
         try:
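With fetch() reworked the same way, the keyword now states intent directly. A hedged sketch of how a caller would drive it, assuming shell is an already-initialized scrapy.shell.Shell with a running crawler and the URLs are placeholders:

    shell.fetch('http://example.com/redirecting-page')                  # default: 3xx followed, final response bound
    shell.fetch('http://example.com/redirecting-page', redirect=False)  # the raw 302 response is bound instead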

scrapy/utils/datatypes.py

@@ -304,3 +304,13 @@ class LocalCache(OrderedDict):
         while len(self) >= self.limit:
             self.popitem(last=False)
         super(LocalCache, self).__setitem__(key, value)
+
+
+class SequenceExclude(object):
+    """Object to test if an item is NOT within some sequence."""
+
+    def __init__(self, seq):
+        self.seq = seq
+
+    def __contains__(self, item):
+        return item not in self.seq
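Because SequenceExclude only implements __contains__, it inverts a single membership test and accepts anything that supports the in operator, including lazy range objects. A quick illustration:

    from scrapy.utils.datatypes import SequenceExclude

    not_3xx = SequenceExclude(range(300, 400))
    print(301 in not_3xx)  # False: redirect codes are excluded
    print(200 in not_3xx)  # True
    print(500 in not_3xx)  # True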

tests/test_command_fetch.py

@@ -21,7 +21,7 @@ class FetchTest(ProcessTest, SiteTest, unittest.TestCase):

     @defer.inlineCallbacks
     def test_redirect_disabled(self):
-        _, out, err = yield self.execute(['--no-status-aware', self.url('/redirect-no-meta-refresh')])
+        _, out, err = yield self.execute(['--no-redirect', self.url('/redirect-no-meta-refresh')])
         err = err.strip()
         self.assertIn(b'downloader/response_status_count/302', err, err)
         self.assertNotIn(b'downloader/response_status_count/200', err, err)
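For symmetry, a hypothetical companion test (not part of this commit) could pin down the new default: without --no-redirect the 302 is followed, so both the 302 and the final 200 should show up in the downloader stats:

    @defer.inlineCallbacks
    def test_redirect_default(self):
        _, out, err = yield self.execute([self.url('/redirect-no-meta-refresh')])
        err = err.strip()
        self.assertIn(b'downloader/response_status_count/302', err, err)
        self.assertIn(b'downloader/response_status_count/200', err, err)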