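"""Tests for FilesPipeline and its storage backends (FSFilesStore and S3FilesStore)."""
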
import os
import random
import time
import hashlib
import warnings
from tempfile import mkdtemp
from shutil import rmtree

from six.moves.urllib.parse import urlparse
from six import BytesIO

from twisted.trial import unittest
from twisted.internet import defer

from scrapy.pipelines.files import FilesPipeline, FSFilesStore, S3FilesStore
from scrapy.item import Item, Field
from scrapy.http import Request, Response
from scrapy.settings import Settings
from scrapy.utils.python import to_bytes
from scrapy.utils.test import assert_aws_environ, get_s3_content_and_delete
from scrapy.utils.boto import is_botocore

from tests import mock
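

# Stand-in download function: returns the canned response stored in request.meta.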
def _mocked_download_func(request, info):
    response = request.meta.get('response')
    return response() if callable(response) else response
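

# Exercises FilesPipeline end to end against a temporary filesystem store.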
class FilesPipelineTestCase(unittest.TestCase):

    def setUp(self):
        self.tempdir = mkdtemp()
        self.pipeline = FilesPipeline.from_settings(Settings({'FILES_STORE': self.tempdir}))
        self.pipeline.download_func = _mocked_download_func
        self.pipeline.open_spider(None)

    def tearDown(self):
        rmtree(self.tempdir)

    def test_file_path(self):
        file_path = self.pipeline.file_path
        self.assertEqual(file_path(Request("https://dev.mydeco.com/mydeco.pdf")),
                         'full/c9b564df929f4bc635bdd19fde4f3d4847c757c5.pdf')
        self.assertEqual(file_path(Request("http://www.maddiebrown.co.uk///catalogue-items//image_54642_12175_95307.txt")),
                         'full/4ce274dd83db0368bafd7e406f382ae088e39219.txt')
        self.assertEqual(file_path(Request("https://dev.mydeco.com/two/dirs/with%20spaces%2Bsigns.doc")),
                         'full/94ccc495a17b9ac5d40e3eabf3afcb8c2c9b9e1a.doc')
        self.assertEqual(file_path(Request("http://www.dfsonline.co.uk/get_prod_image.php?img=status_0907_mdm.jpg")),
                         'full/4507be485f38b0da8a0be9eb2e1dfab8a19223f2.jpg')
        self.assertEqual(file_path(Request("http://www.dorma.co.uk/images/product_details/2532/")),
                         'full/97ee6f8a46cbbb418ea91502fd24176865cf39b2')
        self.assertEqual(file_path(Request("http://www.dorma.co.uk/images/product_details/2532")),
                         'full/244e0dd7d96a3b7b01f54eded250c9e272577aa1')
        self.assertEqual(file_path(Request("http://www.dorma.co.uk/images/product_details/2532"),
                                   response=Response("http://www.dorma.co.uk/images/product_details/2532"),
                                   info=object()),
                         'full/244e0dd7d96a3b7b01f54eded250c9e272577aa1')

    def test_fs_store(self):
        assert isinstance(self.pipeline.store, FSFilesStore)
        self.assertEqual(self.pipeline.store.basedir, self.tempdir)

        path = 'some/image/key.jpg'
        fullpath = os.path.join(self.tempdir, 'some', 'image', 'key.jpg')
        self.assertEqual(self.pipeline.store._get_filesystem_path(path), fullpath)

    @defer.inlineCallbacks
    def test_file_not_expired(self):
        item_url = "http://example.com/file.pdf"
        item = _create_item_with_files(item_url)
        patchers = [
            mock.patch.object(FilesPipeline, 'inc_stats', return_value=True),
            mock.patch.object(FSFilesStore, 'stat_file', return_value={
                'checksum': 'abc', 'last_modified': time.time()}),
            mock.patch.object(FilesPipeline, 'get_media_requests',
                              return_value=[_prepare_request_object(item_url)])
        ]
        for p in patchers:
            p.start()

        result = yield self.pipeline.process_item(item, None)
        self.assertEqual(result['files'][0]['checksum'], 'abc')

        for p in patchers:
            p.stop()

    @defer.inlineCallbacks
    def test_file_expired(self):
        item_url = "http://example.com/file2.pdf"
        item = _create_item_with_files(item_url)
        patchers = [
            mock.patch.object(FSFilesStore, 'stat_file', return_value={
                'checksum': 'abc',
                'last_modified': time.time() - (self.pipeline.expires * 60 * 60 * 24 * 2)}),
            mock.patch.object(FilesPipeline, 'get_media_requests',
                              return_value=[_prepare_request_object(item_url)]),
            mock.patch.object(FilesPipeline, 'inc_stats', return_value=True)
        ]
        for p in patchers:
            p.start()

        result = yield self.pipeline.process_item(item, None)
        self.assertNotEqual(result['files'][0]['checksum'], 'abc')

        for p in patchers:
            p.stop()
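

# Subclass overriding the deprecated file_key() hook, used by the deprecation tests below.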
class DeprecatedFilesPipeline(FilesPipeline):
    def file_key(self, url):
        media_guid = hashlib.sha1(to_bytes(url)).hexdigest()
        media_ext = os.path.splitext(url)[1]
        return 'empty/%s%s' % (media_guid, media_ext)
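

# Verifies that the deprecated file_key() API still works and emits a deprecation warning.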
class DeprecatedFilesPipelineTestCase(unittest.TestCase):
    def setUp(self):
        self.tempdir = mkdtemp()

    def init_pipeline(self, pipeline_class):
        self.pipeline = pipeline_class.from_settings(Settings({'FILES_STORE': self.tempdir}))
        self.pipeline.download_func = _mocked_download_func
        self.pipeline.open_spider(None)

    def test_default_file_key_method(self):
        self.init_pipeline(FilesPipeline)
        with warnings.catch_warnings(record=True) as w:
            warnings.simplefilter('always')
            self.assertEqual(self.pipeline.file_key("https://dev.mydeco.com/mydeco.pdf"),
                             'full/c9b564df929f4bc635bdd19fde4f3d4847c757c5.pdf')
            self.assertEqual(len(w), 1)
            self.assertTrue('file_key(url) method is deprecated' in str(w[-1].message))

    def test_overridden_file_key_method(self):
        self.init_pipeline(DeprecatedFilesPipeline)
        with warnings.catch_warnings(record=True) as w:
            warnings.simplefilter('always')
            self.assertEqual(self.pipeline.file_path(Request("https://dev.mydeco.com/mydeco.pdf")),
                             'empty/c9b564df929f4bc635bdd19fde4f3d4847c757c5.pdf')
            self.assertEqual(len(w), 1)
            self.assertTrue('file_key(url) method is deprecated' in str(w[-1].message))

    def tearDown(self):
        rmtree(self.tempdir)
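

# Verifies which item fields the pipeline reads URLs from and writes results to.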
class FilesPipelineTestCaseFields(unittest.TestCase):

    def test_item_fields_default(self):
        class TestItem(Item):
            name = Field()
            file_urls = Field()
            files = Field()

        for cls in TestItem, dict:
            url = 'http://www.example.com/files/1.txt'
            item = cls({'name': 'item1', 'file_urls': [url]})
            pipeline = FilesPipeline.from_settings(Settings({'FILES_STORE': 's3://example/files/'}))
            requests = list(pipeline.get_media_requests(item, None))
            self.assertEqual(requests[0].url, url)
            results = [(True, {'url': url})]
            pipeline.item_completed(results, item, None)
            self.assertEqual(item['files'], [results[0][1]])

    def test_item_fields_override_settings(self):
        class TestItem(Item):
            name = Field()
            files = Field()
            stored_file = Field()

        for cls in TestItem, dict:
            url = 'http://www.example.com/files/1.txt'
            item = cls({'name': 'item1', 'files': [url]})
            pipeline = FilesPipeline.from_settings(Settings({
                'FILES_STORE': 's3://example/files/',
                'FILES_URLS_FIELD': 'files',
                'FILES_RESULT_FIELD': 'stored_file'
            }))
            requests = list(pipeline.get_media_requests(item, None))
            self.assertEqual(requests[0].url, url)
            results = [(True, {'url': url})]
            pipeline.item_completed(results, item, None)
            self.assertEqual(item['stored_file'], [results[0][1]])
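

# Covers precedence between the generic FILES_* settings, class attributes and
# subclass-prefixed settings.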
class FilesPipelineTestCaseCustomSettings(unittest.TestCase):
    default_cls_settings = {
        "EXPIRES": 90,
        "FILES_URLS_FIELD": "file_urls",
        "FILES_RESULT_FIELD": "files"
    }
    file_cls_attr_settings_map = {
        ("EXPIRES", "FILES_EXPIRES", "expires"),
        ("FILES_URLS_FIELD", "FILES_URLS_FIELD", "files_urls_field"),
        ("FILES_RESULT_FIELD", "FILES_RESULT_FIELD", "files_result_field")
    }

    def setUp(self):
        self.tempdir = mkdtemp()

    def tearDown(self):
        rmtree(self.tempdir)

    def _generate_fake_settings(self, prefix=None):

        def random_string():
            return "".join([chr(random.randint(97, 123)) for _ in range(10)])

        settings = {
            "FILES_EXPIRES": random.randint(100, 1000),
            "FILES_URLS_FIELD": random_string(),
            "FILES_RESULT_FIELD": random_string(),
            "FILES_STORE": self.tempdir
        }
        if not prefix:
            return settings

        return {prefix.upper() + "_" + k if k != "FILES_STORE" else k: v
                for k, v in settings.items()}

    def _generate_fake_pipeline(self):

        class UserDefinedFilePipeline(FilesPipeline):
            EXPIRES = 1001
            FILES_URLS_FIELD = "alfa"
            FILES_RESULT_FIELD = "beta"

        return UserDefinedFilePipeline

    def test_different_settings_for_different_instances(self):
        """
        Pipeline instances created with different settings should each keep
        their own settings.
        """
        custom_settings = self._generate_fake_settings()
        another_pipeline = FilesPipeline.from_settings(Settings(custom_settings))
        one_pipeline = FilesPipeline(self.tempdir)
        for pipe_attr, settings_attr, pipe_ins_attr in self.file_cls_attr_settings_map:
            default_value = self.default_cls_settings[pipe_attr]
            self.assertEqual(getattr(one_pipeline, pipe_attr), default_value)
            custom_value = custom_settings[settings_attr]
            self.assertNotEqual(default_value, custom_value)
            self.assertEqual(getattr(another_pipeline, pipe_ins_attr), custom_value)

    def test_subclass_attributes_preserved_if_no_settings(self):
        """
        If subclasses override class attributes and no matching settings are given,
        the attribute values should be kept.
        """
        pipe_cls = self._generate_fake_pipeline()
        pipe = pipe_cls.from_settings(Settings({"FILES_STORE": self.tempdir}))
        for pipe_attr, settings_attr, pipe_ins_attr in self.file_cls_attr_settings_map:
            custom_value = getattr(pipe, pipe_ins_attr)
            self.assertNotEqual(custom_value, self.default_cls_settings[pipe_attr])
            self.assertEqual(getattr(pipe, pipe_ins_attr), getattr(pipe, pipe_attr))

    def test_subclass_attrs_preserved_custom_settings(self):
        """
        If generic FILES_* settings are defined but no subclass-prefixed settings
        are given, the generic settings should be applied.
        """
        pipeline_cls = self._generate_fake_pipeline()
        settings = self._generate_fake_settings()
        pipeline = pipeline_cls.from_settings(Settings(settings))
        for pipe_attr, settings_attr, pipe_ins_attr in self.file_cls_attr_settings_map:
            value = getattr(pipeline, pipe_ins_attr)
            setting_value = settings.get(settings_attr)
            self.assertNotEqual(value, self.default_cls_settings[pipe_attr])
            self.assertEqual(value, setting_value)

    def test_no_custom_settings_for_subclasses(self):
        """
        If there are no settings for a subclass and no subclass attributes,
        the pipeline should use the attributes of the base class.
        """
        class UserDefinedFilesPipeline(FilesPipeline):
            pass

        user_pipeline = UserDefinedFilesPipeline.from_settings(Settings({"FILES_STORE": self.tempdir}))
        for pipe_attr, settings_attr, pipe_ins_attr in self.file_cls_attr_settings_map:
            # Default values from the base class should be set on the pipeline instance.
            custom_value = self.default_cls_settings.get(pipe_attr.upper())
            self.assertEqual(getattr(user_pipeline, pipe_ins_attr), custom_value)

    def test_custom_settings_for_subclasses(self):
        """
        If there are custom settings for a subclass and NO class attributes,
        the pipeline should use the custom settings.
        """
        class UserDefinedFilesPipeline(FilesPipeline):
            pass

        prefix = UserDefinedFilesPipeline.__name__.upper()
        settings = self._generate_fake_settings(prefix=prefix)
        user_pipeline = UserDefinedFilesPipeline.from_settings(Settings(settings))
        for pipe_attr, settings_attr, pipe_inst_attr in self.file_cls_attr_settings_map:
            # Values from the prefixed settings should be set on the pipeline instance.
            custom_value = settings.get(prefix + "_" + settings_attr)
            self.assertNotEqual(custom_value, self.default_cls_settings[pipe_attr])
            self.assertEqual(getattr(user_pipeline, pipe_inst_attr), custom_value)

    def test_custom_settings_and_class_attrs_for_subclasses(self):
        """
        If there are custom settings for a subclass AND class attributes,
        the setting keys are preferred and override the attributes.
        """
        pipeline_cls = self._generate_fake_pipeline()
        prefix = pipeline_cls.__name__.upper()
        settings = self._generate_fake_settings(prefix=prefix)
        user_pipeline = pipeline_cls.from_settings(Settings(settings))
        for pipe_cls_attr, settings_attr, pipe_inst_attr in self.file_cls_attr_settings_map:
            custom_value = settings.get(prefix + "_" + settings_attr)
            self.assertNotEqual(custom_value, self.default_cls_settings[pipe_cls_attr])
            self.assertEqual(getattr(user_pipeline, pipe_inst_attr), custom_value)

    def test_cls_attrs_with_DEFAULT_prefix(self):
        class UserDefinedFilesPipeline(FilesPipeline):
            DEFAULT_FILES_RESULT_FIELD = "this"
            DEFAULT_FILES_URLS_FIELD = "that"

        pipeline = UserDefinedFilesPipeline.from_settings(Settings({"FILES_STORE": self.tempdir}))
        self.assertEqual(pipeline.files_result_field, "this")
        self.assertEqual(pipeline.files_urls_field, "that")

    def test_user_defined_subclass_default_key_names(self):
        """Test the situation where a user defines a subclass of FilesPipeline,
        but uses the attribute names of the default pipeline (without prefixing
        them with the pipeline class name).
        """
        settings = self._generate_fake_settings()

        class UserPipe(FilesPipeline):
            pass

        pipeline_cls = UserPipe.from_settings(Settings(settings))

        for pipe_attr, settings_attr, pipe_inst_attr in self.file_cls_attr_settings_map:
            expected_value = settings.get(settings_attr)
            self.assertEqual(getattr(pipeline_cls, pipe_inst_attr),
                             expected_value)
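

# Persists a file to S3 and checks the stored content and metadata; requires AWS
# credentials and S3_TEST_FILE_URI in the environment, otherwise the test is skipped.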
class TestS3FilesStore(unittest.TestCase):

    @defer.inlineCallbacks
    def test_persist(self):
        assert_aws_environ()
        uri = os.environ.get('S3_TEST_FILE_URI')
        if not uri:
            raise unittest.SkipTest("No S3 URI available for testing")
        data = b"TestS3FilesStore: \xe2\x98\x83"
        buf = BytesIO(data)
        meta = {'foo': 'bar'}
        path = ''
        store = S3FilesStore(uri)
        yield store.persist_file(
            path, buf, info=None, meta=meta,
            headers={'Content-Type': 'image/png'})

        s = yield store.stat_file(path, info=None)
        self.assertIn('last_modified', s)
        self.assertIn('checksum', s)
        self.assertEqual(s['checksum'], '3187896a9657a28163abb31667df64c8')

        u = urlparse(uri)
        content, key = get_s3_content_and_delete(
            u.hostname, u.path[1:], with_key=True)

        self.assertEqual(content, data)
        if is_botocore():
            self.assertEqual(key['Metadata'], {'foo': 'bar'})
            self.assertEqual(
                key['CacheControl'], S3FilesStore.HEADERS['Cache-Control'])
            self.assertEqual(key['ContentType'], 'image/png')
        else:
            self.assertEqual(key.metadata, {'foo': 'bar'})
            self.assertEqual(
                key.cache_control, S3FilesStore.HEADERS['Cache-Control'])
            self.assertEqual(key.content_type, 'image/png')
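

# Item definition and helpers used by the file-expiry tests above.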
class ItemWithFiles(Item):
    file_urls = Field()
    files = Field()


def _create_item_with_files(*files):
    item = ItemWithFiles()
    item['file_urls'] = files
    return item


def _prepare_request_object(item_url):
    return Request(
        item_url,
        meta={'response': Response(item_url, status=200, body=b'data')})


if __name__ == "__main__":
    unittest.main()