
Let framework handle only HTTP redirects by default for fetch and shell commands

Author: Paul Tremberth, 2016-12-07 17:56:13 +01:00
parent 9aefc0a886
commit 778bed07bf
5 changed files with 30 additions and 11 deletions

scrapy/commands/fetch.py

@@ -5,6 +5,7 @@ from w3lib.url import is_url
 from scrapy.commands import ScrapyCommand
 from scrapy.http import Request
 from scrapy.exceptions import UsageError
+from scrapy.utils.datatypes import SequenceExclude
 from scrapy.utils.spider import spidercls_for_request, DefaultSpider

 class Command(ScrapyCommand):
@@ -27,8 +28,8 @@ class Command(ScrapyCommand):
             help="use this spider")
         parser.add_option("--headers", dest="headers", action="store_true", \
             help="print response HTTP headers instead of body")
-        parser.add_option("--no-status-aware", dest="no_status_aware", action="store_true", \
-            default=False, help="do not handle status codes like redirects and print response as-is")
+        parser.add_option("--no-redirect", dest="no_redirect", action="store_true", \
+            default=False, help="do not handle HTTP 3xx status codes and print response as-is")

     def _print_headers(self, headers, prefix):
         for key, values in headers.items():
@@ -52,7 +53,11 @@ class Command(ScrapyCommand):
             raise UsageError()
         cb = lambda x: self._print_response(x, opts)
         request = Request(args[0], callback=cb, dont_filter=True)
-        if opts.no_status_aware:
+        # by default, let the framework handle redirects,
+        # i.e. command handles all codes except 3xx
+        if not opts.no_redirect:
+            request.meta['handle_httpstatus_list'] = SequenceExclude(six.moves.range(300, 400))
+        else:
             request.meta['handle_httpstatus_all'] = True

         spidercls = DefaultSpider
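The crux of the change is visible in this hunk: instead of asking the engine to deliver every status code (handle_httpstatus_all), the command now sets handle_httpstatus_list to a SequenceExclude wrapping the 3xx range, so every non-redirect code reaches the callback while RedirectMiddleware keeps following redirects. A minimal sketch of that membership logic, where the URL is only a placeholder:

    from scrapy.http import Request
    from scrapy.utils.datatypes import SequenceExclude

    statuses = SequenceExclude(range(300, 400))
    assert 200 in statuses       # delivered straight to the command's callback
    assert 404 in statuses       # errors too: the command prints them as-is
    assert 302 not in statuses   # 3xx is left to RedirectMiddleware

    request = Request('http://example.com', dont_filter=True)
    request.meta['handle_httpstatus_list'] = statuses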

scrapy/commands/shell.py

@@ -36,8 +36,8 @@ class Command(ScrapyCommand):
             help="evaluate the code in the shell, print the result and exit")
         parser.add_option("--spider", dest="spider",
             help="use this spider")
-        parser.add_option("--no-status-aware", dest="no_status_aware", action="store_true", \
-            default=False, help="do not transparently handle status codes like redirects")
+        parser.add_option("--no-redirect", dest="no_redirect", action="store_true", \
+            default=False, help="do not handle HTTP 3xx status codes and print response as-is")

     def update_vars(self, vars):
         """You can use this function to update the Scrapy objects that will be
@@ -70,7 +70,7 @@ class Command(ScrapyCommand):
         self._start_crawler_thread()

         shell = Shell(crawler, update_vars=self.update_vars, code=opts.code)
-        shell.start(url=url, handle_statuses=opts.no_status_aware)
+        shell.start(url=url, redirect=not opts.no_redirect)

     def _start_crawler_thread(self):
         t = Thread(target=self.crawler_process.start,

scrapy/shell.py

@@ -7,6 +7,7 @@ from __future__ import print_function

 import os
 import signal
+from six.moves import range
 import warnings

 from twisted.internet import reactor, threads, defer
@@ -20,6 +21,7 @@ from scrapy.item import BaseItem
 from scrapy.settings import Settings
 from scrapy.spiders import Spider
 from scrapy.utils.console import start_python_console
+from scrapy.utils.datatypes import SequenceExclude
 from scrapy.utils.misc import load_object
 from scrapy.utils.response import open_in_browser
 from scrapy.utils.conf import get_config
@@ -40,11 +42,11 @@ class Shell(object):
         self.code = code
         self.vars = {}

-    def start(self, url=None, request=None, response=None, spider=None, handle_statuses=True):
+    def start(self, url=None, request=None, response=None, spider=None, redirect=True):
         # disable accidental Ctrl-C key press from shutting down the engine
         signal.signal(signal.SIGINT, signal.SIG_IGN)
         if url:
-            self.fetch(url, spider, handle_statuses=handle_statuses)
+            self.fetch(url, spider, redirect=redirect)
         elif request:
             self.fetch(request, spider)
         elif response:
@@ -98,13 +100,15 @@ class Shell(object):
         self.spider = spider
         return spider

-    def fetch(self, request_or_url, spider=None, handle_statuses=False, **kwargs):
+    def fetch(self, request_or_url, spider=None, redirect=True, **kwargs):
         if isinstance(request_or_url, Request):
             request = request_or_url
         else:
             url = any_to_uri(request_or_url)
             request = Request(url, dont_filter=True, **kwargs)
-            if handle_statuses:
+            if redirect:
+                request.meta['handle_httpstatus_list'] = SequenceExclude(range(300, 400))
+            else:
                 request.meta['handle_httpstatus_all'] = True
         response = None
         try:
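With fetch() reworked the same way, the keyword now states intent directly. A hedged sketch of how a caller would drive it, assuming shell is an already-initialized scrapy.shell.Shell with a running crawler and the URLs are placeholders:

    shell.fetch('http://example.com/redirecting-page')                  # default: 3xx followed, final response bound
    shell.fetch('http://example.com/redirecting-page', redirect=False)  # the raw 302 response is bound instead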

scrapy/utils/datatypes.py

@@ -304,3 +304,13 @@ class LocalCache(OrderedDict):
         while len(self) >= self.limit:
             self.popitem(last=False)
         super(LocalCache, self).__setitem__(key, value)
+
+
+class SequenceExclude(object):
+    """Object to test if an item is NOT within some sequence."""
+
+    def __init__(self, seq):
+        self.seq = seq
+
+    def __contains__(self, item):
+        return item not in self.seq
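Because SequenceExclude only implements __contains__, it inverts a single membership test and accepts anything that supports the in operator, including lazy range objects. A quick illustration:

    from scrapy.utils.datatypes import SequenceExclude

    not_3xx = SequenceExclude(range(300, 400))
    print(301 in not_3xx)  # False: redirect codes are excluded
    print(200 in not_3xx)  # True
    print(500 in not_3xx)  # True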

tests/test_command_fetch.py

@@ -21,7 +21,7 @@ class FetchTest(ProcessTest, SiteTest, unittest.TestCase):

     @defer.inlineCallbacks
     def test_redirect_disabled(self):
-        _, out, err = yield self.execute(['--no-status-aware', self.url('/redirect-no-meta-refresh')])
+        _, out, err = yield self.execute(['--no-redirect', self.url('/redirect-no-meta-refresh')])
         err = err.strip()
         self.assertIn(b'downloader/response_status_count/302', err, err)
         self.assertNotIn(b'downloader/response_status_count/200', err, err)
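For symmetry, a hypothetical companion test (not part of this commit) could pin down the new default: without --no-redirect the 302 is followed, so both the 302 and the final 200 should show up in the downloader stats:

    @defer.inlineCallbacks
    def test_redirect_default(self):
        _, out, err = yield self.execute([self.url('/redirect-no-meta-refresh')])
        err = err.strip()
        self.assertIn(b'downloader/response_status_count/302', err, err)
        self.assertIn(b'downloader/response_status_count/200', err, err)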