https://github.com/scrapy/scrapy.git
stop raising HttpException at download handlers and adapt download middlewares
parent c8827552b6
commit 7eaa3ed24d
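Before this change, the download handlers raised HttpException for non-2XX responses, so downloader middlewares had to recover the response from the exception inside process_exception; after it, handlers return every Response unchanged and middlewares inspect response.status inside process_response. A minimal before/after sketch of that contract (illustrative classes and method bodies only, not code from this commit):

    from scrapy.core.exceptions import HttpException

    class OldStyleMiddleware(object):
        # before: non-2XX responses arrived wrapped in an HttpException
        def process_exception(self, request, exception, spider):
            if isinstance(exception, HttpException):
                return exception.response  # recover the response from the exception

    class NewStyleMiddleware(object):
        # after: every response reaches process_response; check the status directly
        def process_response(self, request, response, spider):
            if 200 <= response.status < 300:
                return response
            # non-2XX handling (retry, redirect, raise, ...) goes here
            return response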
@@ -655,6 +655,27 @@ Default: ``Not Defined``
The name of the current project. It matches the project module name as created
by ``startproject`` command, and is only defined by project settings file.

.. setting:: REDIRECT_MAX_TIMES

REDIRECT_MAX_TIMES
------------------

Default: ``20``

Defines the maximum number of times a request can be redirected. After this
maximum, the request's response is returned as is. We used Firefox's default
value for the same task.

.. setting:: REDIRECT_MAX_METAREFRESH_DELAY

REDIRECT_MAX_METAREFRESH_DELAY
------------------------------

Default: ``100``

Some sites use meta-refresh for redirecting to a session-expired page, so we
restrict automatic redirection to a maximum delay (in seconds).

.. setting:: REQUEST_HEADER_ACCEPT

REQUEST_HEADER_ACCEPT
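As a usage sketch (a hypothetical per-project settings module, not part of this commit), both settings can be overridden like any other setting:

    # myproject/settings.py (hypothetical)
    REDIRECT_MAX_TIMES = 5                # give up after 5 redirects and return the last response as-is
    REDIRECT_MAX_METAREFRESH_DELAY = 10   # only follow meta-refresh redirects with a delay under 10 seconds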
@@ -136,7 +136,8 @@ NEWSPIDER_MODULE = ''

PRIORITIZER = 'scrapy.core.prioritizers.RandomPrioritizer'

REDIRECTMIDDLEWARE_MAX_TIMES = 20 # uses Firefox default setting
REDIRECT_MAX_METAREFRESH_DELAY = 100
REDIRECT_MAX_TIMES = 20 # uses Firefox default setting

REQUEST_HEADER_ACCEPT = 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8'
REQUEST_HEADER_ACCEPT_LANGUAGE = 'en'
@@ -146,7 +147,7 @@ REQUESTS_PER_DOMAIN = 8 # max simultaneous requests per domain

# contrib.middleware.retry.RetryMiddleware default settings
RETRY_TIMES = 2 # initial response + 2 retries = 3 requests
RETRY_HTTP_CODES = ['500', '503', '504', '400', '408', '200']
RETRY_HTTP_CODES = ['500', '503', '504', '400', '408']


ROBOTSTXT_OBEY = False
@@ -6,7 +6,6 @@ from pydispatch import dispatcher
from scrapy.core import signals
from scrapy.http import Response
from scrapy.http.cookies import CookieJar
from scrapy.core.exceptions import HttpException
from scrapy.conf import settings
from scrapy import log

@@ -44,11 +43,6 @@ class CookiesMiddleware(object):

return response

# cookies should be set on non-200 responses too
def process_exception(self, request, exception, spider):
if isinstance(exception, HttpException):
self.process_response(request, exception.response, spider)

def domain_closed(self, domain):
self.jars.pop(domain, None)

@@ -1,13 +1,17 @@
from scrapy.core.exceptions import HttpException
from scrapy.utils.response import response_status_message

class ErrorPagesMiddleware(object):
"""This middleware allows the spiders to receive error (non-200) responses,
the same way they receive normal responses"""
"""This middleware filters out responses with status codes other than 2XX
or those defined in the spider's handle_httpstatus_list attribute.

def process_exception(self, request, exception, spider):
if isinstance(exception, HttpException):
statuses = getattr(spider, 'handle_httpstatus_list', None)
httpstatus = exception.response.status
if statuses and httpstatus in statuses:
return exception.response
TODO: move this mw to spidermiddleware and remove me
"""

def process_response(self, request, response, spider):
status = response.status
if 200 <= status < 300 or status in getattr(spider, 'handle_httpstatus_list', []):
return response
else:
raise HttpException(status, None, response)

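For illustration (a hypothetical spider, not part of this commit), a spider opts in to specific error statuses through handle_httpstatus_list, so those responses are returned to its callbacks instead of being raised as HttpException here:

    class ErrorTolerantSpider(BaseSpider):   # hypothetical; the BaseSpider import path depends on the Scrapy version
        domain_name = 'example.com'
        handle_httpstatus_list = [404, 500]  # let 404 and 500 responses reach the spider callbacks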
@@ -10,12 +10,13 @@ from pydispatch import dispatcher
from scrapy.core import signals
from scrapy import log
from scrapy.http import Response, Headers
from scrapy.core.exceptions import NotConfigured, HttpException, IgnoreRequest
from scrapy.core.exceptions import NotConfigured, IgnoreRequest
from scrapy.core.downloader.responsetypes import responsetypes
from scrapy.utils.request import request_fingerprint
from scrapy.utils.http import headers_dict_to_raw, headers_raw_to_dict
from scrapy.conf import settings


class HttpCacheMiddleware(object):
def __init__(self):
if not settings['HTTPCACHE_DIR']:
@@ -33,37 +34,25 @@ class HttpCacheMiddleware(object):

key = request_fingerprint(request)
domain = spider.domain_name

try:
response = self.cache.retrieve_response(domain, key)
except:
log.msg("Corrupt cache for %s" % request.url, log.WARNING)
response = False

if response:
if not 200 <= int(response.status) < 300:
raise HttpException(response.status, None, response)
return response
elif self.ignore_missing:
raise IgnoreRequest("Ignored request not in cache: %s" % request)

def process_response(self, request, response, spider):
if not is_cacheable(request):
return response

if isinstance(response, Response) and not response.meta.get('cached'):
if is_cacheable(request):
key = request_fingerprint(request)
domain = spider.domain_name
self.cache.store(domain, key, request, response)
self.cache.store(spider.domain_name, key, request, response)

return response

def process_exception(self, request, exception, spider):
if not is_cacheable(request):
return

if isinstance(exception, HttpException) and isinstance(exception.response, Response):
key = request_fingerprint(request)
domain = spider.domain_name
self.cache.store(domain, key, request, exception.response)

def is_cacheable(request):
return request.url.scheme in ['http', 'https']

@@ -1,41 +1,33 @@
from scrapy import log
from scrapy.core.exceptions import HttpException
from scrapy.utils.url import urljoin_rfc as urljoin
from scrapy.utils.response import get_meta_refresh
from scrapy.conf import settings

# some sites use meta-refresh for redirecting to a session expired page, so we
# restrict automatic redirection to a maximum delay (in number of seconds)
META_REFRESH_MAXSEC = 100
MAX_REDIRECT_LOOP = 10

class RedirectMiddleware(object):
"""Handle redirection of requests based on response status and meta-refresh html tag"""

def __init__(self):
self.max_redirect_times = settings.getint('REDIRECTMIDDLEWARE_MAX_TIMES')

def process_exception(self, request, exception, spider):
if not isinstance(exception, HttpException):
return
self.max_metarefresh_delay = settings.getint('REDIRECT_MAX_METAREFRESH_DELAY')
self.max_redirect_times = settings.getint('REDIRECT_MAX_TIMES')

def process_response(self, request, response, spider):
domain = spider.domain_name
status = exception.status
response = exception.response

if status in [302, 303] and 'Location' in response.headers:
if response.status in [302, 303] and 'Location' in response.headers:
redirected_url = urljoin(request.url, response.headers['location'])
redirected = request.replace(url=redirected_url, method='GET', body='')
redirected.headers.pop('Content-Type', None)
redirected.headers.pop('Content-Length', None)
return self._redirect(redirected, request, spider, status)
return self._redirect(redirected, request, spider, response.status)

if status in [301, 307] and 'Location' in response.headers:
if response.status in [301, 307] and 'Location' in response.headers:
redirected_url = urljoin(request.url, response.headers['location'])
redirected = request.replace(url=redirected_url)
return self._redirect(redirected, request, spider, status)
return self._redirect(redirected, request, spider, response.status)

def process_response(self, request, response, spider):
interval, url = get_meta_refresh(response)
if url and int(interval) < META_REFRESH_MAXSEC:
if url and int(interval) < self.max_metarefresh_delay:
redirected = request.replace(url=urljoin(request.url, url))
return self._redirect(redirected, request, spider, 'meta refresh')

@@ -19,46 +19,50 @@ About HTTP errors to consider:
- You may want to remove 400 from RETRY_HTTP_CODES, if you stick to the HTTP
protocol. It's included by default because it's a common code used to
indicate server overload, which would be something we want to retry
- 200 is included by default (and shouldn't be removed) to check for partial
download errors, which means the TCP connection has broken in the middle of
a HTTP download
"""

from twisted.internet.error import TimeoutError as ServerTimeoutError, DNSLookupError, \
ConnectionRefusedError, ConnectionDone, ConnectError, \
ConnectionLost
from twisted.internet.defer import TimeoutError as UserTimeoutError
from twisted.web.client import PartialDownloadError

from scrapy import log
from scrapy.core.exceptions import HttpException
from scrapy.utils.request import request_fingerprint
from scrapy.utils.response import response_status_message
from scrapy.conf import settings

class RetryMiddleware(object):

EXCEPTIONS_TO_RETRY = (ServerTimeoutError, UserTimeoutError, DNSLookupError,
ConnectionRefusedError, ConnectionDone, ConnectError,
ConnectionLost)
ConnectionLost, PartialDownloadError)

def __init__(self):
self.max_retry_times = settings.getint('RETRY_TIMES')
self.retry_http_codes = map(int, settings.getlist('RETRY_HTTP_CODES'))

def process_response(self, request, response, spider):
if response.status in self.retry_http_codes:
reason = response_status_message(response.status)
return self._retry(request, reason, spider) or response
return response

def process_exception(self, request, exception, spider):
if isinstance(exception, self.EXCEPTIONS_TO_RETRY) \
or (isinstance(exception, HttpException) \
and (int(exception.status) in self.retry_http_codes)):
if isinstance(exception, self.EXCEPTIONS_TO_RETRY):
return self._retry(request, exception, spider)

retries = request.meta.get('retry_times', 0) + 1
def _retry(self, request, reason, spider):
retries = request.meta.get('retry_times', 0) + 1

if retries <= self.max_retry_times:
log.msg("Retrying %s (failed %d times): %s" % (request, retries, exception),
domain=spider.domain_name, level=log.DEBUG)
retryreq = request.copy()
retryreq.meta['retry_times'] = retries
retryreq.dont_filter = True
return retryreq
else:
log.msg("Discarding %s (failed %d times): %s" % (request, retries, exception),
domain=spider.domain_name, level=log.DEBUG)
if retries <= self.max_retry_times:
log.msg("Retrying %s (failed %d times): %s" % (request, retries, reason),
domain=spider.domain_name, level=log.DEBUG)
retryreq = request.copy()
retryreq.meta['retry_times'] = retries
retryreq.dont_filter = True
return retryreq
else:
log.msg("Discarding %s (failed %d times): %s" % (request, retries, reason),
domain=spider.domain_name, level=log.DEBUG)

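Following the advice in the module docstring above, a project that wants strict HTTP semantics could drop 400 from the retryable codes; a hypothetical override (not part of this commit) in a project settings module:

    # myproject/settings.py (hypothetical)
    RETRY_TIMES = 2                                  # initial attempt + 2 retries = 3 requests in total
    RETRY_HTTP_CODES = ['500', '503', '504', '408']  # 400 dropped: treat it as a genuine client error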
@@ -1,4 +1,4 @@
from scrapy.core.exceptions import HttpException, NotConfigured
from scrapy.core.exceptions import NotConfigured
from scrapy.stats import stats
from scrapy.conf import settings

@@ -18,7 +18,7 @@ class DownloaderStats(object):
def __init__(self):
if not settings.getbool('DOWNLOADER_STATS'):
raise NotConfigured


def process_request(self, request, spider):
stats.incpath('_global/downloader/request_count')
stats.incpath('%s/downloader/request_count' % spider.domain_name)
@@ -30,14 +30,12 @@ class DownloaderStats(object):
def process_response(self, request, response, spider):
self._inc_response_count(response, spider.domain_name)
return response


def process_exception(self, request, exception, spider):
ex_class = "%s.%s" % (exception.__class__.__module__, exception.__class__.__name__)
stats.incpath('_global/downloader/exception_count')
stats.incpath('%s/downloader/exception_count' % spider.domain_name)
stats.incpath('%s/downloader/exception_type_count/%s' % (spider.domain_name, ex_class))
if isinstance(exception, HttpException):
self._inc_response_count(exception.response, spider.domain_name)

def _inc_response_count(self, response, domain):
stats.incpath('_global/downloader/response_count')
@@ -16,7 +16,7 @@ except ImportError:
from scrapy import optional_features
from scrapy.core import signals
from scrapy.http import Headers
from scrapy.core.exceptions import HttpException, NotSupported
from scrapy.core.exceptions import NotSupported
from scrapy.utils.defer import defer_succeed
from scrapy.conf import settings

@@ -65,13 +65,7 @@ def create_factory(request, spider):
signals.send_catch_log(signal=signals.response_downloaded, sender='download_http', response=r, spider=spider)
return r

def _on_success(body):
response = _create_response(body)
if response.status not in (200, 201, 202):
raise HttpException(response.status, None, response)
return response

factory.deferred.addCallbacks(_on_success)
factory.deferred.addCallbacks(_create_response)
return factory

def download_http(request, spider):
@@ -3,7 +3,6 @@ from __future__ import with_statement
from unittest import TestCase

from scrapy.spider import spiders
from scrapy.core.exceptions import HttpException
from scrapy.http import Response, Request
from scrapy.contrib.downloadermiddleware.cookies import CookiesMiddleware

@@ -35,21 +34,6 @@ class CookiesMiddlewareTest(TestCase):
assert self.mw.process_request(req2, self.spider) is None
self.assertEquals(req2.headers.get('Cookie'), "C1=value1")

def test_http_exception(self):
req = Request('http://scrapytest.org/')
assert self.mw.process_request(req, self.spider) is None
assert 'Cookie' not in req.headers

headers = {'Set-Cookie': 'C1=value1; path=/'}
res = Response('http://scrapytest.org/', headers=headers)
exc = HttpException(302, 'Redirect', res)
assert self.mw.process_exception(req, exc, self.spider) is None
#assert exc.response.cookies

req2 = Request('http://scrapytest.org/sub1/')
assert self.mw.process_request(req2, self.spider) is None
self.assertEquals(req2.headers.get('Cookie'), "C1=value1")

def test_dont_merge_cookies(self):
# merge some cookies into jar
headers = {'Set-Cookie': 'C1=value1; path=/'}
@@ -1,7 +1,6 @@
import unittest

from scrapy.contrib.downloadermiddleware.redirect import RedirectMiddleware
from scrapy.core.exceptions import HttpException
from scrapy.dupefilter import dupefilter
from scrapy.spider import spiders
from scrapy.http import Request, Response, Headers
@@ -22,28 +21,24 @@ class RedirectMiddlewareTest(unittest.TestCase):
url = 'http://www.example.com/301'
url2 = 'http://www.example.com/redirected'
req = Request(url)
hdr = Headers({'Location': [url2]})
rsp = Response(url, headers=hdr)
exc = HttpException('301', None, rsp)
rsp = Response(url, headers={'Location': url2}, status=301)

req2 = self.mw.process_exception(req, exc, self.spider)
req2 = self.mw.process_response(req, rsp, self.spider)
assert isinstance(req2, Request)
self.assertEqual(req2.url, url2)

# a response with a 3XX status code but without a Location header should be ignored
del rsp.headers['Location']
assert self.mw.process_exception(req, exc, self.spider) is None
assert self.mw.process_response(req, rsp, self.spider) is rsp

def test_redirect_302(self):
url = 'http://www.example.com/302'
url2 = 'http://www.example.com/redirected2'
req = Request(url, method='POST', body='test',
headers={'Content-Type': 'text/plain', 'Content-length': '4'})
hdr = Headers({'Location': [url2]})
rsp = Response(url, headers=hdr)
exc = HttpException('302', None, rsp)
rsp = Response(url, headers={'Location': url2}, status=302)

req2 = self.mw.process_exception(req, exc, self.spider)
req2 = self.mw.process_response(req, rsp, self.spider)
assert isinstance(req2, Request)
self.assertEqual(req2.url, url2)
self.assertEqual(req2.method, 'GET')
@@ -56,7 +51,7 @@ class RedirectMiddlewareTest(unittest.TestCase):

# a response with a 3XX status code but without a Location header should be ignored
del rsp.headers['Location']
assert self.mw.process_exception(req, exc, self.spider) is None
assert self.mw.process_response(req, rsp, self.spider) is rsp

def test_meta_refresh(self):
body = """<html>
@@ -82,24 +77,24 @@ class RedirectMiddlewareTest(unittest.TestCase):
def test_max_redirect_times(self):
self.mw.max_redirect_times = 1
req = Request('http://scrapytest.org/302')
exc = HttpException('302', None, Response('http://www.scrapytest.org/302', headers={'Location': '/redirected'}))
rsp = Response('http://scrapytest.org/302', headers={'Location': '/redirected'}, status=302)

req = self.mw.process_exception(req, exc, self.spider)
req = self.mw.process_response(req, rsp, self.spider)
assert isinstance(req, Request)
assert 'redirect_times' in req.meta
self.assertEqual(req.meta['redirect_times'], 1)

req = self.mw.process_exception(req, exc, self.spider)
req = self.mw.process_response(req, rsp, self.spider)
self.assertEqual(req, None)

def test_ttl(self):
self.mw.max_redirect_times = 100
req = Request('http://scrapytest.org/302', meta={'redirect_ttl': 1})
exc = HttpException('302', None, Response('http://www.scrapytest.org/302', headers={'Location': '/redirected'}))
rsp = Response('http://www.scrapytest.org/302', headers={'Location': '/redirected'}, status=302)

req = self.mw.process_exception(req, exc, self.spider)
req = self.mw.process_response(req, rsp, self.spider)
assert isinstance(req, Request)
req = self.mw.process_exception(req, exc, self.spider)
req = self.mw.process_response(req, rsp, self.spider)
self.assertEqual(req, None)

if __name__ == "__main__":
@@ -5,7 +5,6 @@ from twisted.internet.error import TimeoutError as ServerTimeoutError, DNSLookup
ConnectionLost

from scrapy.contrib.downloadermiddleware.retry import RetryMiddleware
from scrapy.core.exceptions import HttpException
from scrapy.spider import spiders
from scrapy.http import Request, Response

@@ -17,20 +16,31 @@ class RetryTest(unittest.TestCase):
self.mw = RetryMiddleware()
self.mw.max_retry_times = 2

def test_process_exception_404(self):
req404 = Request('http://www.scrapytest.org/404')
exc404 = HttpException('404', None, Response('http://www.scrapytest.org/404', body=''))
def test_404(self):
req = Request('http://www.scrapytest.org/404')
rsp = Response('http://www.scrapytest.org/404', body='', status=404)

# dont retry 404s
req = self.mw.process_exception(req404, exc404, self.spider)
self.assertTrue(req is None)
assert self.mw.process_response(req, rsp, self.spider) is rsp

def test_process_exception_503(self):
req503 = Request('http://www.scrapytest.org/503')
exc503 = HttpException('503', None, Response('http://www.scrapytest.org/503', body=''))
self._test_retry_exception(req503, exc503)
def test_503(self):
req = Request('http://www.scrapytest.org/503')
rsp = Response('http://www.scrapytest.org/503', body='', status=503)

def test_process_exception_twistederrors(self):
# first retry
req = self.mw.process_response(req, rsp, self.spider)
assert isinstance(req, Request)
self.assertEqual(req.meta['retry_times'], 1)

# second retry
req = self.mw.process_response(req, rsp, self.spider)
assert isinstance(req, Request)
self.assertEqual(req.meta['retry_times'], 2)

# discard it
assert self.mw.process_response(req, rsp, self.spider) is rsp

def test_twistederrors(self):
for exc in (ServerTimeoutError, DNSLookupError, ConnectionRefusedError, ConnectionDone, ConnectError, ConnectionLost):
req = Request('http://www.scrapytest.org/%s' % exc.__name__)
self._test_retry_exception(req, exc())
@@ -4,6 +4,7 @@ scrapy.http.Response objects
"""

import re
from twisted.web import http
from scrapy.http.response import Response

def body_or_str(obj, unicode=True):
@@ -36,3 +37,14 @@ def get_meta_refresh(response):
match = META_REFRESH_RE.search(response.body[0:4096])
response.cache['meta_refresh_url'] = match.groups() if match else (None, None)
return response.cache['meta_refresh_url']

def response_status_message(status):
"""Return status code plus status text descriptive message

>>> response_status_message(200)
200 OK

>>> response_status_message(404)
404 Not Found
"""
return '%s %s' % (status, http.responses.get(int(status)))