mirror of
https://github.com/scrapy/scrapy.git
synced 2025-02-22 04:33:05 +00:00
Let framework handle only HTTP redirects by default for fetch and shell commands
This commit is contained in:
parent
9aefc0a886
commit
778bed07bf
@ -5,6 +5,7 @@ from w3lib.url import is_url
|
||||
from scrapy.commands import ScrapyCommand
|
||||
from scrapy.http import Request
|
||||
from scrapy.exceptions import UsageError
|
||||
from scrapy.utils.datatypes import SequenceExclude
|
||||
from scrapy.utils.spider import spidercls_for_request, DefaultSpider
|
||||
|
||||
class Command(ScrapyCommand):
|
||||
@ -27,8 +28,8 @@ class Command(ScrapyCommand):
|
||||
help="use this spider")
|
||||
parser.add_option("--headers", dest="headers", action="store_true", \
|
||||
help="print response HTTP headers instead of body")
|
||||
parser.add_option("--no-status-aware", dest="no_status_aware", action="store_true", \
|
||||
default=False, help="do not handle status codes like redirects and print response as-is")
|
||||
parser.add_option("--no-redirect", dest="no_redirect", action="store_true", \
|
||||
default=False, help="do not handle HTTP 3xx status codes and print response as-is")
|
||||
|
||||
def _print_headers(self, headers, prefix):
|
||||
for key, values in headers.items():
|
||||
@ -52,7 +53,11 @@ class Command(ScrapyCommand):
|
||||
raise UsageError()
|
||||
cb = lambda x: self._print_response(x, opts)
|
||||
request = Request(args[0], callback=cb, dont_filter=True)
|
||||
if opts.no_status_aware:
|
||||
# by default, let the framework handle redirects,
|
||||
# i.e. command handles all codes except 3xx
|
||||
if not opts.no_redirect:
|
||||
request.meta['handle_httpstatus_list'] = SequenceExclude(six.moves.range(300, 400))
|
||||
else:
|
||||
request.meta['handle_httpstatus_all'] = True
|
||||
|
||||
spidercls = DefaultSpider
|
||||
|
@ -36,8 +36,8 @@ class Command(ScrapyCommand):
|
||||
help="evaluate the code in the shell, print the result and exit")
|
||||
parser.add_option("--spider", dest="spider",
|
||||
help="use this spider")
|
||||
parser.add_option("--no-status-aware", dest="no_status_aware", action="store_true", \
|
||||
default=False, help="do not transparently handle status codes like redirects")
|
||||
parser.add_option("--no-redirect", dest="no_redirect", action="store_true", \
|
||||
default=False, help="do not handle HTTP 3xx status codes and print response as-is")
|
||||
|
||||
def update_vars(self, vars):
|
||||
"""You can use this function to update the Scrapy objects that will be
|
||||
@ -70,7 +70,7 @@ class Command(ScrapyCommand):
|
||||
self._start_crawler_thread()
|
||||
|
||||
shell = Shell(crawler, update_vars=self.update_vars, code=opts.code)
|
||||
shell.start(url=url, handle_statuses=opts.no_status_aware)
|
||||
shell.start(url=url, redirect=not opts.no_redirect)
|
||||
|
||||
def _start_crawler_thread(self):
|
||||
t = Thread(target=self.crawler_process.start,
|
||||
|
@ -7,6 +7,7 @@ from __future__ import print_function
|
||||
|
||||
import os
|
||||
import signal
|
||||
from six.moves import range
|
||||
import warnings
|
||||
|
||||
from twisted.internet import reactor, threads, defer
|
||||
@ -20,6 +21,7 @@ from scrapy.item import BaseItem
|
||||
from scrapy.settings import Settings
|
||||
from scrapy.spiders import Spider
|
||||
from scrapy.utils.console import start_python_console
|
||||
from scrapy.utils.datatypes import SequenceExclude
|
||||
from scrapy.utils.misc import load_object
|
||||
from scrapy.utils.response import open_in_browser
|
||||
from scrapy.utils.conf import get_config
|
||||
@ -40,11 +42,11 @@ class Shell(object):
|
||||
self.code = code
|
||||
self.vars = {}
|
||||
|
||||
def start(self, url=None, request=None, response=None, spider=None, handle_statuses=True):
|
||||
def start(self, url=None, request=None, response=None, spider=None, redirect=True):
|
||||
# disable accidental Ctrl-C key press from shutting down the engine
|
||||
signal.signal(signal.SIGINT, signal.SIG_IGN)
|
||||
if url:
|
||||
self.fetch(url, spider, handle_statuses=handle_statuses)
|
||||
self.fetch(url, spider, redirect=redirect)
|
||||
elif request:
|
||||
self.fetch(request, spider)
|
||||
elif response:
|
||||
@ -98,13 +100,15 @@ class Shell(object):
|
||||
self.spider = spider
|
||||
return spider
|
||||
|
||||
def fetch(self, request_or_url, spider=None, handle_statuses=False, **kwargs):
|
||||
def fetch(self, request_or_url, spider=None, redirect=True, **kwargs):
|
||||
if isinstance(request_or_url, Request):
|
||||
request = request_or_url
|
||||
else:
|
||||
url = any_to_uri(request_or_url)
|
||||
request = Request(url, dont_filter=True, **kwargs)
|
||||
if handle_statuses:
|
||||
if redirect:
|
||||
request.meta['handle_httpstatus_list'] = SequenceExclude(range(300, 400))
|
||||
else:
|
||||
request.meta['handle_httpstatus_all'] = True
|
||||
response = None
|
||||
try:
|
||||
|
@ -304,3 +304,13 @@ class LocalCache(OrderedDict):
|
||||
while len(self) >= self.limit:
|
||||
self.popitem(last=False)
|
||||
super(LocalCache, self).__setitem__(key, value)
|
||||
|
||||
|
||||
class SequenceExclude(object):
    """Membership-test helper whose ``in`` operator is inverted.

    Wraps a sequence and answers ``item in self`` with True exactly when
    the item is absent from the wrapped sequence. Useful for settings like
    ``handle_httpstatus_list`` where "everything except this range" is wanted.
    """

    def __init__(self, seq):
        # Keep a reference to the wrapped sequence; membership is delegated
        # to it on every check, so any sequence supporting ``in`` works.
        self.seq = seq

    def __contains__(self, item):
        # Invert the wrapped sequence's membership result.
        return not (item in self.seq)
|
||||
|
@ -21,7 +21,7 @@ class FetchTest(ProcessTest, SiteTest, unittest.TestCase):
|
||||
|
||||
@defer.inlineCallbacks
|
||||
def test_redirect_disabled(self):
|
||||
_, out, err = yield self.execute(['--no-status-aware', self.url('/redirect-no-meta-refresh')])
|
||||
_, out, err = yield self.execute(['--no-redirect', self.url('/redirect-no-meta-refresh')])
|
||||
err = err.strip()
|
||||
self.assertIn(b'downloader/response_status_count/302', err, err)
|
||||
self.assertNotIn(b'downloader/response_status_count/200', err, err)
|
||||
|
Loading…
x
Reference in New Issue
Block a user