
imagepipeline: simplify configuration using a single setting to set up different backend stores

--HG--
extra : rebase_source : cd0d5f560dcde0003974e360282a233ba387c5be
Daniel Grana 2009-09-01 18:00:56 -03:00
parent b05fd80640
commit d6adc141de
2 changed files with 163 additions and 73 deletions
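
The single setting the commit message refers to is IMAGES_STORE, introduced in the diff below: its URI scheme selects the backend store ('' and 'file' map to FSImagesStore, 's3' to S3ImagesStore). A minimal sketch of how it might be configured in a project's settings module (the path and bucket name are made-up examples):

    # settings.py (sketch; path and bucket are hypothetical)
    IMAGES_STORE = '/var/data/images'            # local filesystem store
    # IMAGES_STORE = 'file:///var/data/images'   # equivalent, with explicit scheme
    # IMAGES_STORE = 's3://my-bucket/images/'    # Amazon S3 store (bucket + key prefix)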

View File

@@ -8,17 +8,24 @@ from __future__ import with_statement
import os
import time
import hashlib
import urlparse
import rfc822
import Image
from cStringIO import StringIO
from collections import defaultdict
from twisted.internet import defer
from scrapy.xlib.pydispatch import dispatcher
from scrapy import log
from scrapy.stats import stats
from scrapy.utils.misc import md5sum
from scrapy.core import signals
from scrapy.core.engine import scrapyengine
from scrapy.core.exceptions import DropItem, NotConfigured
from scrapy.conf import settings
from scrapy.contrib.pipeline.media import MediaPipeline
from scrapy.http import Request
from scrapy.conf import settings
class NoimagesDrop(DropItem):
@@ -28,17 +35,159 @@ class ImageException(Exception):
"""General image error exception"""
class BaseImagesPipeline(MediaPipeline):
class FSImagesStore(object):
def __init__(self, basedir):
if '://' in basedir:
basedir = basedir.split('://', 1)[1]
self.basedir = basedir
self._mkdir(self.basedir)
self.created_directories = defaultdict(set)
dispatcher.connect(self.domain_closed, signals.domain_closed)
def domain_closed(self, domain):
self.created_directories.pop(domain, None)
def persist_image(self, key, image, buf, info):
absolute_path = self._get_filesystem_path(key)
self._mkdir(os.path.dirname(absolute_path), info)
image.save(absolute_path)
def stat_image(self, key, info):
absolute_path = self._get_filesystem_path(key)
try:
last_modified = os.path.getmtime(absolute_path)
except: # FIXME: catching everything!
return {}
with open(absolute_path, 'rb') as imagefile:
checksum = md5sum(imagefile)
return {'last_modified': last_modified, 'checksum': checksum}
def _get_filesystem_path(self, key):
return os.path.join(self.basedir, key)
def _mkdir(self, dirname, domain=None):
seen = self.created_directories[domain] if domain else set()
if dirname not in seen:
if not os.path.exists(dirname):
os.makedirs(dirname)
seen.add(dirname)
class S3ImagesStore(object):
request_priority = 1000
def __init__(self, uri):
assert uri.startswith('s3://')
self.bucket, self.prefix = uri[5:].split('/', 1)
self._set_custom_spider()
def _set_custom_spider(self):
domain = settings['IMAGES_S3STORE_SPIDER']
if domain:
from scrapy.spider import spiders
self.s3_spider = spiders.fromdomain(domain)
else:
self.s3_spider = None
def stat_image(self, key, info):
def _onsuccess(response):
if response.status == 200:
checksum = response.headers['Etag'].strip('"')
last_modified = response.headers['Last-Modified']
modified_tuple = rfc822.parsedate_tz(last_modified)
modified_stamp = int(rfc822.mktime_tz(modified_tuple))
return {'checksum': checksum, 'last_modified': modified_stamp}
req = self._build_request(key, method='HEAD')
return self._download_request(req, info).addCallback(_onsuccess)
def persist_image(self, key, image, buf, info):
"""Upload image to S3 storage"""
width, height = image.size
headers = {
'Content-Type': 'image/jpeg',
'X-Amz-Acl': 'public-read',
'X-Amz-Meta-Width': str(width),
'X-Amz-Meta-Height': str(height),
'Cache-Control': 'max-age=172800',
}
buf.seek(0)
req = self._build_request(key, method='PUT', body=buf.read(), headers=headers)
return self._download_request(req, info)
def _build_request(self, key, method, body=None, headers=None):
url = 'http://%s.s3.amazonaws.com/%s%s' % (self.bucket, self.prefix, key)
return Request(url, method=method, body=body, headers=headers, \
priority=self.request_priority)
def _download_request(self, request, info):
"""This method is used for HEAD and PUT requests sent to amazon S3
It tries to use a specific spider domain for uploads, or defaults
to current domain spider.
"""
if self.s3_spider:
# need to use schedule to auto-open domain
return scrapyengine.schedule(request, self.s3_spider)
return scrapyengine.download(request, info.spider)
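As the docstring above notes, S3 HEAD/PUT requests can be routed through a dedicated spider instead of the spider currently being crawled. A sketch of the setting S3ImagesStore reads for that (the domain value is purely illustrative):

    # settings.py (sketch; the domain is a made-up example)
    IMAGES_S3STORE_SPIDER = 'images-uploads.example.com'   # spider domain used to schedule S3 upload requests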
class ImagesPipeline(MediaPipeline):
"""Abstract pipeline that implement the image downloading and thumbnail generation logic
This pipeline tries to minimize network transfers and image processing,
doing stat of the images and determining if image is new, uptodate or
expired.
`new` images are those that pipeline never processed and needs to be
downloaded from supplier site the first time.
`uptodate` images are the ones that the pipeline processed and are still
valid images.
`expired` images are those that pipeline already processed but the last
modification was made long time ago, so a reprocessing is recommended to
refresh it in case of change.
IMAGES_EXPIRES setting controls the maximun days since an image was modified
to consider it uptodate.
THUMBS is a tuple of tuples, each sub-tuple is a pair of thumb_id string
and a compatible python image library size (a tuple).
See thumbnail method at http://www.pythonware.com/library/pil/handbook/image.htm
Downloaded images are skipped if sizes aren't greater than MIN_WIDTH and
MIN_HEIGHT limit. A proper log messages will be printed.
"""
MEDIA_NAME = 'image'
MIN_WIDTH = settings.getint('IMAGES_MIN_WIDTH', 0)
MIN_HEIGHT = settings.getint('IMAGES_MIN_HEIGHT', 0)
IMAGES_EXPIRES = settings.getint('IMAGES_EXPIRES', 90)
MEDIA_NAME = 'image'
THUMBS = (
# ('50', (50, 50)),
# ('110', (110, 110)),
# ('270', (270, 270))
)
EXPIRES = settings.getint('IMAGES_EXPIRES', 90)
THUMBS = settings.getlist('IMAGES_THUMBS')
STORE_SCHEMES = {
'': FSImagesStore,
'file': FSImagesStore,
's3': S3ImagesStore,
}
def __init__(self):
store_uri = settings['IMAGES_STORE']
if not store_uri:
raise NotConfigured
self.store = self._get_store(store_uri)
super(ImagesPipeline, self).__init__()
def _get_store(self, uri):
scheme = urlparse.urlparse(uri).scheme
store_cls = self.STORE_SCHEMES[scheme]
return store_cls(uri)
def media_downloaded(self, response, request, info):
mtype = self.MEDIA_NAME
@@ -92,7 +241,7 @@ class BaseImagesPipeline(MediaPipeline):
age_seconds = time.time() - last_modified
age_days = age_seconds / 60 / 60 / 24
if age_days > self.IMAGES_EXPIRES:
if age_days > self.EXPIRES:
return # returning None force download
referer = request.headers.get('Referer')
@@ -104,15 +253,15 @@ class BaseImagesPipeline(MediaPipeline):
return {'scraped_url': request.url, 'path': key, 'checksum': checksum}
key = self.image_key(request.url)
dfd = defer.maybeDeferred(self.stat_key, key, info)
dfd = defer.maybeDeferred(self.store.stat_image, key, info)
dfd.addCallbacks(_onsuccess, lambda _:None)
dfd.addErrback(log.err, self.__class__.__name__ + '.stat_key')
dfd.addErrback(log.err, self.__class__.__name__ + '.store.stat_image')
return dfd
def image_downloaded(self, response, request, info):
first_buf = None
for key, image, buf in self.get_images(response, request, info):
self.store_image(key, image, buf, info)
self.store.persist_image(key, image, buf, info)
if first_buf is None:
first_buf = buf
first_buf.seek(0)
@@ -162,59 +311,3 @@ class BaseImagesPipeline(MediaPipeline):
def thumb_key(self, url, thumb_id):
image_guid = hashlib.sha1(url).hexdigest()
return 'thumbs/%s/%s.jpg' % (thumb_id, image_guid)
# Required overridable interface
def store_image(self, key, image, buf, info):
raise NotImplementedError
def stat_key(self, key, info):
raise NotImplementedError
class ImagesPipeline(BaseImagesPipeline):
"""Images pipeline with filesystem support as image's store backend
If IMAGES_DIR setting has a valid value, this pipeline is enabled and use
path defined at setting as dirname for storing images.
"""
class DomainInfo(BaseImagesPipeline.DomainInfo):
def __init__(self, domain):
self.created_directories = set()
super(ImagesPipeline.DomainInfo, self).__init__(domain)
def __init__(self):
if not settings['IMAGES_DIR']:
raise NotConfigured
self.BASEDIRNAME = settings['IMAGES_DIR']
self.mkdir(self.BASEDIRNAME)
super(ImagesPipeline, self).__init__()
def store_image(self, key, image, buf, info):
absolute_path = self.get_filesystem_path(key)
self.mkdir(os.path.dirname(absolute_path), info)
image.save(absolute_path)
def stat_key(self, key, info):
absolute_path = self.get_filesystem_path(key)
try:
last_modified = os.path.getmtime(absolute_path)
except:
return {}
with open(absolute_path, 'rb') as imagefile:
checksum = md5sum(imagefile)
return {'last_modified': last_modified, 'checksum': checksum}
def get_filesystem_path(self, key):
return os.path.join(self.BASEDIRNAME, key)
def mkdir(self, dirname, info=None):
already_created = info.created_directories if info else set()
if dirname not in already_created:
if not os.path.exists(dirname):
os.makedirs(dirname)
already_created.add(dirname)

View File

@@ -54,8 +54,7 @@ class S3ImagesPipeline(BaseImagesPipeline):
def s3_request(self, key, method, body=None, headers=None):
url = 'http://%s.s3.amazonaws.com/%s%s' % (self.bucket_name, self.key_prefix, key)
req = Request(url, method=method, body=body, headers=headers)
return req
return Request(url, method=method, body=body, headers=headers)
def stat_key(self, key, info):
def _onsuccess(response):
@@ -97,5 +96,3 @@ class S3ImagesPipeline(BaseImagesPipeline):
# need to use schedule to auto-open domain
return scrapyengine.schedule(request, self.AmazonS3Spider)
return self.download(request, info)