
Merge pull request #201 from alexcepoi/test-fixes-mac

improve mac os compatibility
Committed by Pablo Hoffman on 2012-12-02 07:37:53 -08:00
commit 9c9a18b3a3
4 changed files with 115 additions and 85 deletions


@@ -35,7 +35,7 @@ vsftpd_log_file=/dev/null
     vsftpd_pid=$!
 fi
-find -name '*.py[co]' -delete
+find . -name '*.py[co]' -delete
 if [ $# -eq 0 ]; then
     $trial --reporter=text scrapy scrapyd
 else
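
This one-line change is presumably the Mac-specific fix in this file: GNU find falls back to the current directory when no path is given, while the BSD find shipped with OS X requires an explicit starting path, so `find -name ...` fails there. For illustration only, a rough Python equivalent of the corrected command (the helper name and script are hypothetical, not part of the patch):

    import os

    def delete_bytecode(root='.'):
        """Recursively remove *.pyc / *.pyo files, like `find . -name '*.py[co]' -delete`."""
        for dirpath, _dirnames, filenames in os.walk(root):
            for name in filenames:
                if name.endswith(('.pyc', '.pyo')):
                    os.remove(os.path.join(dirpath, name))

    if __name__ == '__main__':
        delete_bytecode('.')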


@@ -1,11 +1,16 @@
-import unittest, tempfile, shutil, time
+import time
+import tempfile
+import shutil
+import unittest
+from contextlib import contextmanager
 from scrapy.http import Response, HtmlResponse, Request
 from scrapy.spider import BaseSpider
-from scrapy.contrib.downloadermiddleware.httpcache import FilesystemCacheStorage, HttpCacheMiddleware
 from scrapy.settings import Settings
 from scrapy.exceptions import IgnoreRequest
 from scrapy.utils.test import get_crawler
+from scrapy.contrib.downloadermiddleware.httpcache import \
+    FilesystemCacheStorage, HttpCacheMiddleware


 class HttpCacheMiddlewareTest(unittest.TestCase):
@@ -16,8 +21,10 @@ class HttpCacheMiddlewareTest(unittest.TestCase):
         self.crawler = get_crawler()
         self.spider = BaseSpider('example.com')
         self.tmpdir = tempfile.mkdtemp()
-        self.request = Request('http://www.example.com', headers={'User-Agent': 'test'})
-        self.response = Response('http://www.example.com', headers={'Content-Type': 'text/html'}, body='test body', status=202)
+        self.request = Request('http://www.example.com',
+                               headers={'User-Agent': 'test'})
+        self.response = Response('http://www.example.com', headers=
+            {'Content-Type': 'text/html'}, body='test body', status=202)
         self.crawler.stats.open_spider(self.spider)

     def tearDown(self):
@@ -34,113 +41,135 @@ class HttpCacheMiddlewareTest(unittest.TestCase):
         settings.update(new_settings)
         return Settings(settings)

-    def _get_storage(self, **new_settings):
-        return self.storage_class(self._get_settings(**new_settings))
+    @contextmanager
+    def _storage(self, **new_settings):
+        settings = self._get_settings(**new_settings)
+        storage = self.storage_class(settings)
+        storage.open_spider(self.spider)
+        try:
+            yield storage
+        finally:
+            storage.close_spider(self.spider)

-    def _get_middleware(self, **new_settings):
-        mw = HttpCacheMiddleware(self._get_settings(**new_settings), self.crawler.stats)
+    @contextmanager
+    def _middleware(self, **new_settings):
+        settings = self._get_settings(**new_settings)
+        mw = HttpCacheMiddleware(settings, self.crawler.stats)
         mw.spider_opened(self.spider)
-        return mw
+        try:
+            yield mw
+        finally:
+            mw.spider_closed(self.spider)

     def test_storage(self):
-        storage = self._get_storage()
-        request2 = self.request.copy()
-        assert storage.retrieve_response(self.spider, request2) is None
-        storage.store_response(self.spider, self.request, self.response)
-        response2 = storage.retrieve_response(self.spider, request2)
-        assert isinstance(response2, HtmlResponse)  # inferred from content-type header
-        self.assertEqualResponse(self.response, response2)
-        time.sleep(2)  # wait for cache to expire
-        assert storage.retrieve_response(self.spider, request2) is None
+        with self._storage() as storage:
+            request2 = self.request.copy()
+            assert storage.retrieve_response(self.spider, request2) is None
+            storage.store_response(self.spider, self.request, self.response)
+            response2 = storage.retrieve_response(self.spider, request2)
+            assert isinstance(response2, HtmlResponse)  # content-type header
+            self.assertEqualResponse(self.response, response2)
+            time.sleep(2)  # wait for cache to expire
+            assert storage.retrieve_response(self.spider, request2) is None

     def test_storage_never_expire(self):
-        storage = self._get_storage(HTTPCACHE_EXPIRATION_SECS=0)
-        assert storage.retrieve_response(self.spider, self.request) is None
-        storage.store_response(self.spider, self.request, self.response)
-        time.sleep(0.5)  # give the chance to expire
-        assert storage.retrieve_response(self.spider, self.request)
+        with self._storage(HTTPCACHE_EXPIRATION_SECS=0) as storage:
+            assert storage.retrieve_response(self.spider, self.request) is None
+            storage.store_response(self.spider, self.request, self.response)
+            time.sleep(0.5)  # give the chance to expire
+            assert storage.retrieve_response(self.spider, self.request)

     def test_middleware(self):
-        mw = self._get_middleware()
-        assert mw.process_request(self.request, self.spider) is None
-        mw.process_response(self.request, self.response, self.spider)
-        response = mw.process_request(self.request, self.spider)
-        assert isinstance(response, HtmlResponse)
-        self.assertEqualResponse(self.response, response)
-        assert 'cached' in response.flags
+        with self._middleware() as mw:
+            assert mw.process_request(self.request, self.spider) is None
+            mw.process_response(self.request, self.response, self.spider)
+            response = mw.process_request(self.request, self.spider)
+            assert isinstance(response, HtmlResponse)
+            self.assertEqualResponse(self.response, response)
+            assert 'cached' in response.flags

     def test_different_request_response_urls(self):
-        mw = self._get_middleware()
-        req = Request('http://host.com/path')
-        res = Response('http://host2.net/test.html')
-        assert mw.process_request(req, self.spider) is None
-        mw.process_response(req, res, self.spider)
-        cached = mw.process_request(req, self.spider)
-        assert isinstance(cached, Response)
-        self.assertEqualResponse(res, cached)
-        assert 'cached' in cached.flags
+        with self._middleware() as mw:
+            req = Request('http://host.com/path')
+            res = Response('http://host2.net/test.html')
+            assert mw.process_request(req, self.spider) is None
+            mw.process_response(req, res, self.spider)
+            cached = mw.process_request(req, self.spider)
+            assert isinstance(cached, Response)
+            self.assertEqualResponse(res, cached)
+            assert 'cached' in cached.flags

     def test_middleware_ignore_missing(self):
-        mw = self._get_middleware(HTTPCACHE_IGNORE_MISSING=True)
-        self.assertRaises(IgnoreRequest, mw.process_request, self.request, self.spider)
-        mw.process_response(self.request, self.response, self.spider)
-        response = mw.process_request(self.request, self.spider)
-        assert isinstance(response, HtmlResponse)
-        self.assertEqualResponse(self.response, response)
-        assert 'cached' in response.flags
+        with self._middleware(HTTPCACHE_IGNORE_MISSING=True) as mw:
+            self.assertRaises(IgnoreRequest, mw.process_request, self.request, self.spider)
+            mw.process_response(self.request, self.response, self.spider)
+            response = mw.process_request(self.request, self.spider)
+            assert isinstance(response, HtmlResponse)
+            self.assertEqualResponse(self.response, response)
+            assert 'cached' in response.flags

     def test_middleware_ignore_schemes(self):
         # http responses are cached by default
         req, res = Request('http://test.com/'), Response('http://test.com/')
-        mw = self._get_middleware()
-        assert mw.process_request(req, self.spider) is None
-        mw.process_response(req, res, self.spider)
-        cached = mw.process_request(req, self.spider)
-        assert isinstance(cached, Response), type(cached)
-        self.assertEqualResponse(res, cached)
-        assert 'cached' in cached.flags
+        with self._middleware() as mw:
+            assert mw.process_request(req, self.spider) is None
+            mw.process_response(req, res, self.spider)
+            cached = mw.process_request(req, self.spider)
+            assert isinstance(cached, Response), type(cached)
+            self.assertEqualResponse(res, cached)
+            assert 'cached' in cached.flags

         # file response is not cached by default
         req, res = Request('file:///tmp/t.txt'), Response('file:///tmp/t.txt')
-        mw = self._get_middleware()
-        assert mw.process_request(req, self.spider) is None
-        mw.process_response(req, res, self.spider)
-        assert mw.storage.retrieve_response(self.spider, req) is None
-        assert mw.process_request(req, self.spider) is None
+        with self._middleware() as mw:
+            assert mw.process_request(req, self.spider) is None
+            mw.process_response(req, res, self.spider)
+            assert mw.storage.retrieve_response(self.spider, req) is None
+            assert mw.process_request(req, self.spider) is None

         # s3 scheme response is cached by default
         req, res = Request('s3://bucket/key'), Response('http://bucket/key')
-        mw = self._get_middleware()
-        assert mw.process_request(req, self.spider) is None
-        mw.process_response(req, res, self.spider)
-        cached = mw.process_request(req, self.spider)
-        assert isinstance(cached, Response), type(cached)
-        self.assertEqualResponse(res, cached)
-        assert 'cached' in cached.flags
+        with self._middleware() as mw:
+            assert mw.process_request(req, self.spider) is None
+            mw.process_response(req, res, self.spider)
+            cached = mw.process_request(req, self.spider)
+            assert isinstance(cached, Response), type(cached)
+            self.assertEqualResponse(res, cached)
+            assert 'cached' in cached.flags

         # ignore s3 scheme
         req, res = Request('s3://bucket/key2'), Response('http://bucket/key2')
-        mw = self._get_middleware(HTTPCACHE_IGNORE_SCHEMES=['s3'])
-        assert mw.process_request(req, self.spider) is None
-        mw.process_response(req, res, self.spider)
-        assert mw.storage.retrieve_response(self.spider, req) is None
-        assert mw.process_request(req, self.spider) is None
+        with self._middleware(HTTPCACHE_IGNORE_SCHEMES=['s3']) as mw:
+            assert mw.process_request(req, self.spider) is None
+            mw.process_response(req, res, self.spider)
+            assert mw.storage.retrieve_response(self.spider, req) is None
+            assert mw.process_request(req, self.spider) is None

     def test_middleware_ignore_http_codes(self):
         # test response is not cached
-        mw = self._get_middleware(HTTPCACHE_IGNORE_HTTP_CODES=[202])
-        assert mw.process_request(self.request, self.spider) is None
-        mw.process_response(self.request, self.response, self.spider)
-        assert mw.storage.retrieve_response(self.spider, self.request) is None
-        assert mw.process_request(self.request, self.spider) is None
+        with self._middleware(HTTPCACHE_IGNORE_HTTP_CODES=[202]) as mw:
+            assert mw.process_request(self.request, self.spider) is None
+            mw.process_response(self.request, self.response, self.spider)
+            assert mw.storage.retrieve_response(self.spider, self.request) is None
+            assert mw.process_request(self.request, self.spider) is None

         # test response is cached
-        mw = self._get_middleware(HTTPCACHE_IGNORE_HTTP_CODES=[203])
-        mw.process_response(self.request, self.response, self.spider)
-        response = mw.process_request(self.request, self.spider)
-        assert isinstance(response, HtmlResponse)
-        self.assertEqualResponse(self.response, response)
-        assert 'cached' in response.flags
+        with self._middleware(HTTPCACHE_IGNORE_HTTP_CODES=[203]) as mw:
+            mw.process_response(self.request, self.response, self.spider)
+            response = mw.process_request(self.request, self.spider)
+            assert isinstance(response, HtmlResponse)
+            self.assertEqualResponse(self.response, response)
+            assert 'cached' in response.flags

     def assertEqualResponse(self, response1, response2):
         self.assertEqual(response1.url, response2.url)
@@ -150,4 +179,3 @@ class HttpCacheMiddlewareTest(unittest.TestCase):

 if __name__ == '__main__':
     unittest.main()
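
The bulk of this file's change swaps the `_get_storage()` / `_get_middleware()` helpers for `_storage()` / `_middleware()` context managers, so every test now closes its storage and middleware (and the temporary cache directory behind them) even when an assertion fails midway; that deterministic cleanup is presumably what made the suite better behaved on Mac OS. A minimal, self-contained sketch of the pattern (the `Resource` class and `managed_resource` name are illustrative, not Scrapy APIs):

    from contextlib import contextmanager

    class Resource(object):
        """Stand-in for anything that must be opened and closed, e.g. a cache storage."""
        def open(self):
            print('opened')
        def close(self):
            print('closed')

    @contextmanager
    def managed_resource():
        resource = Resource()
        resource.open()
        try:
            yield resource      # the body of the `with` block runs here
        finally:
            resource.close()    # runs even if the block raises, e.g. a failed assert

    with managed_resource() as res:
        assert res is not None  # cleanup still happens if this assertion fails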


@@ -8,9 +8,11 @@ from twisted.trial import unittest
 try:
     from PIL import Image
-    skip = False
 except ImportError, e:
     skip = True
+else:
+    encoders = set(('jpeg_encoder', 'jpeg_decoder'))
+    skip = not encoders.issubset(set(Image.core.__dict__))


 def _mocked_download_func(request, info):
     response = request.meta.get('response')
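
The widened check skips the image tests not only when PIL is missing but also when PIL was built without the JPEG codec, which is easy to hit on OS X where libjpeg is not installed by default (that reading of the intent is an assumption; the commit message only says it improves Mac OS compatibility). A standalone probe along the same lines:

    # Hedged sketch: check whether PIL was compiled with JPEG support.
    # Image.core lists the compiled codecs; missing jpeg_encoder/jpeg_decoder
    # usually means libjpeg was unavailable when PIL was built.
    try:
        from PIL import Image
    except ImportError:
        print('PIL is not installed')
    else:
        has_jpeg = {'jpeg_encoder', 'jpeg_decoder'} <= set(Image.core.__dict__)
        print('JPEG codec available: %s' % has_jpeg)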


@@ -45,7 +45,7 @@ class FifoDiskQueue(object):
         self.chunksize = self.info['chunksize']
         self.headf = self._openchunk(self.info['head'][0], 'ab+')
         self.tailf = self._openchunk(self.info['tail'][0])
-        self.tailf.seek(self.info['tail'][2])
+        os.lseek(self.tailf.fileno(), self.info['tail'][2], os.SEEK_SET)

     def push(self, string):
         hnum, hpos = self.info['head']
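
Here the tail chunk is repositioned with os.lseek on the underlying file descriptor instead of seek() on the buffered Python file object. The commit does not spell out the failure, but os.lseek sets the kernel-level offset directly, which presumably sidesteps whatever seek/buffering quirk the queue tests were hitting on Mac OS. A tiny standalone illustration of the call (the path and contents are made up for the example):

    import os

    path = '/tmp/lseek-example'
    with open(path, 'wb') as f:
        f.write(b'0123456789')

    f = open(path, 'rb')
    # f.seek(4) would move the buffered file object; os.lseek moves the raw descriptor:
    os.lseek(f.fileno(), 4, os.SEEK_SET)
    print(os.read(f.fileno(), 3))   # reads b'456' straight from the descriptor
    f.close()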