diff --git a/scrapy/contrib/pipeline/images.py b/scrapy/contrib/pipeline/images.py
index 7de993915..02eddcb65 100644
--- a/scrapy/contrib/pipeline/images.py
+++ b/scrapy/contrib/pipeline/images.py
@@ -8,17 +8,24 @@ from __future__ import with_statement
 import os
 import time
 import hashlib
+import urlparse
+import rfc822
 import Image
 from cStringIO import StringIO
+from collections import defaultdict
 
 from twisted.internet import defer
 
+from scrapy.xlib.pydispatch import dispatcher
 from scrapy import log
 from scrapy.stats import stats
 from scrapy.utils.misc import md5sum
+from scrapy.core import signals
+from scrapy.core.engine import scrapyengine
 from scrapy.core.exceptions import DropItem, NotConfigured
-from scrapy.conf import settings
 from scrapy.contrib.pipeline.media import MediaPipeline
+from scrapy.http import Request
+from scrapy.conf import settings
 
 
 class NoimagesDrop(DropItem):
@@ -28,17 +35,159 @@ class ImageException(Exception):
     """General image error exception"""
 
 
-class BaseImagesPipeline(MediaPipeline):
+class FSImagesStore(object):
+    def __init__(self, basedir):
+        if '://' in basedir:
+            basedir = basedir.split('://', 1)[1]
+        self.basedir = basedir
+        self._mkdir(self.basedir)
+        self.created_directories = defaultdict(set)
+        dispatcher.connect(self.domain_closed, signals.domain_closed)
+
+    def domain_closed(self, domain):
+        self.created_directories.pop(domain, None)
+
+    def persist_image(self, key, image, buf, info):
+        absolute_path = self._get_filesystem_path(key)
+        self._mkdir(os.path.dirname(absolute_path), info)
+        image.save(absolute_path)
+
+    def stat_image(self, key, info):
+        absolute_path = self._get_filesystem_path(key)
+        try:
+            last_modified = os.path.getmtime(absolute_path)
+        except:  # FIXME: catching everything!
+            return {}
+
+        with open(absolute_path, 'rb') as imagefile:
+            checksum = md5sum(imagefile)
+
+        return {'last_modified': last_modified, 'checksum': checksum}
+
+    def _get_filesystem_path(self, key):
+        return os.path.join(self.basedir, key)
+
+    def _mkdir(self, dirname, domain=None):
+        seen = self.created_directories[domain] if domain else set()
+        if dirname not in seen:
+            if not os.path.exists(dirname):
+                os.makedirs(dirname)
+            seen.add(dirname)
+
+
+class S3ImagesStore(object):
+
+    request_priority = 1000
+
+    def __init__(self, uri):
+        assert uri.startswith('s3://')
+        self.bucket, self.prefix = uri[5:].split('/', 1)
+        self._set_custom_spider()
+
+    def _set_custom_spider(self):
+        domain = settings['IMAGES_S3STORE_SPIDER']
+        if domain:
+            from scrapy.spider import spiders
+            self.s3_spider = spiders.fromdomain(domain)
+        else:
+            self.s3_spider = None
+
+    def stat_image(self, key, info):
+        def _onsuccess(response):
+            if response.status == 200:
+                checksum = response.headers['Etag'].strip('"')
+                last_modified = response.headers['Last-Modified']
+                modified_tuple = rfc822.parsedate_tz(last_modified)
+                modified_stamp = int(rfc822.mktime_tz(modified_tuple))
+                return {'checksum': checksum, 'last_modified': modified_stamp}
+
+        req = self._build_request(key, method='HEAD')
+        return self._download_request(req, info).addCallback(_onsuccess)
+
+    def persist_image(self, key, image, buf, info):
+        """Upload image to S3 storage"""
+        width, height = image.size
+        headers = {
+            'Content-Type': 'image/jpeg',
+            'X-Amz-Acl': 'public-read',
+            'X-Amz-Meta-Width': str(width),
+            'X-Amz-Meta-Height': str(height),
+            'Cache-Control': 'max-age=172800',
+        }
+
+        buf.seek(0)
+        req = self._build_request(key, method='PUT', body=buf.read(), headers=headers)
+        return self._download_request(req, info)
+
+    def _build_request(self, key, method, body=None, headers=None):
+        url = 'http://%s.s3.amazonaws.com/%s%s' % (self.bucket, self.prefix, key)
+        return Request(url, method=method, body=body, headers=headers, \
+            priority=self.request_priority)
+
+    def _download_request(self, request, info):
+        """This method is used for HEAD and PUT requests sent to Amazon S3.
+
+        It tries to use a specific spider domain for uploads, or defaults
+        to the current domain's spider.
+        """
+        if self.s3_spider:
+            # need to use schedule to auto-open domain
+            return scrapyengine.schedule(request, self.s3_spider)
+        return scrapyengine.download(request, info.spider)
+
+
+class ImagesPipeline(MediaPipeline):
+    """Pipeline that implements the image downloading and thumbnail generation logic.
+
+    This pipeline tries to minimize network transfers and image processing,
+    stat'ing each image to determine whether it is new, uptodate or
+    expired.
+
+    `new` images are those the pipeline has never processed and need to be
+    downloaded from the supplier site for the first time.
+
+    `uptodate` images are the ones the pipeline has already processed and
+    are still valid.
+
+    `expired` images are those the pipeline has already processed, but whose
+    last modification was made a long time ago, so reprocessing is recommended
+    to refresh them in case of change.
+
+    The IMAGES_EXPIRES setting controls the maximum days since an image was
+    modified to consider it uptodate.
+
+    THUMBS is a tuple of tuples; each sub-tuple is a pair of a thumb_id string
+    and a Python Imaging Library compatible size (a tuple).
+    See the thumbnail method at http://www.pythonware.com/library/pil/handbook/image.htm
+
+    Downloaded images are skipped if their sizes aren't greater than the
+    MIN_WIDTH and MIN_HEIGHT limits. A proper log message will be printed.
+ + """ + + MEDIA_NAME = 'image' MIN_WIDTH = settings.getint('IMAGES_MIN_WIDTH', 0) MIN_HEIGHT = settings.getint('IMAGES_MIN_HEIGHT', 0) - IMAGES_EXPIRES = settings.getint('IMAGES_EXPIRES', 90) - MEDIA_NAME = 'image' - THUMBS = ( -# ('50', (50, 50)), -# ('110', (110, 110)), -# ('270', (270, 270)) - ) + EXPIRES = settings.getint('IMAGES_EXPIRES', 90) + THUMBS = settings.getlist('IMAGES_THUMBS') + STORE_SCHEMES = { + '': FSImagesStore, + 'file': FSImagesStore, + 's3': S3ImagesStore, + } + + def __init__(self): + store_uri = settings['IMAGES_STORE'] + if not store_uri: + raise NotConfigured + self.store = self._get_store(store_uri) + super(ImagesPipeline, self).__init__() + + def _get_store(self, uri): + scheme = urlparse.urlparse(uri).scheme + store_cls = self.STORE_SCHEMES[scheme] + return store_cls(uri) def media_downloaded(self, response, request, info): mtype = self.MEDIA_NAME @@ -92,7 +241,7 @@ class BaseImagesPipeline(MediaPipeline): age_seconds = time.time() - last_modified age_days = age_seconds / 60 / 60 / 24 - if age_days > self.IMAGES_EXPIRES: + if age_days > self.EXPIRES: return # returning None force download referer = request.headers.get('Referer') @@ -104,15 +253,15 @@ class BaseImagesPipeline(MediaPipeline): return {'scraped_url': request.url, 'path': key, 'checksum': checksum} key = self.image_key(request.url) - dfd = defer.maybeDeferred(self.stat_key, key, info) + dfd = defer.maybeDeferred(self.store.stat_image, key, info) dfd.addCallbacks(_onsuccess, lambda _:None) - dfd.addErrback(log.err, self.__class__.__name__ + '.stat_key') + dfd.addErrback(log.err, self.__class__.__name__ + '.store.stat_image') return dfd def image_downloaded(self, response, request, info): first_buf = None for key, image, buf in self.get_images(response, request, info): - self.store_image(key, image, buf, info) + self.store.persist_image(key, image, buf, info) if first_buf is None: first_buf = buf first_buf.seek(0) @@ -162,59 +311,3 @@ class BaseImagesPipeline(MediaPipeline): def thumb_key(self, url, thumb_id): image_guid = hashlib.sha1(url).hexdigest() return 'thumbs/%s/%s.jpg' % (thumb_id, image_guid) - - # Required overradiable interface - def store_image(self, key, image, buf, info): - raise NotImplementedError - - def stat_key(self, key, info): - raise NotImplementedError - - -class ImagesPipeline(BaseImagesPipeline): - """Images pipeline with filesystem support as image's store backend - - If IMAGES_DIR setting has a valid value, this pipeline is enabled and use - path defined at setting as dirname for storing images. 
- - """ - - class DomainInfo(BaseImagesPipeline.DomainInfo): - def __init__(self, domain): - self.created_directories = set() - super(ImagesPipeline.DomainInfo, self).__init__(domain) - - def __init__(self): - if not settings['IMAGES_DIR']: - raise NotConfigured - - self.BASEDIRNAME = settings['IMAGES_DIR'] - self.mkdir(self.BASEDIRNAME) - super(ImagesPipeline, self).__init__() - - def store_image(self, key, image, buf, info): - absolute_path = self.get_filesystem_path(key) - self.mkdir(os.path.dirname(absolute_path), info) - image.save(absolute_path) - - def stat_key(self, key, info): - absolute_path = self.get_filesystem_path(key) - try: - last_modified = os.path.getmtime(absolute_path) - except: - return {} - - with open(absolute_path, 'rb') as imagefile: - checksum = md5sum(imagefile) - - return {'last_modified': last_modified, 'checksum': checksum} - - def get_filesystem_path(self, key): - return os.path.join(self.BASEDIRNAME, key) - - def mkdir(self, dirname, info=None): - already_created = info.created_directories if info else set() - if dirname not in already_created: - if not os.path.exists(dirname): - os.makedirs(dirname) - already_created.add(dirname) diff --git a/scrapy/contrib/pipeline/s3images.py b/scrapy/contrib/pipeline/s3images.py index a1018ee4c..ece456269 100644 --- a/scrapy/contrib/pipeline/s3images.py +++ b/scrapy/contrib/pipeline/s3images.py @@ -54,8 +54,7 @@ class S3ImagesPipeline(BaseImagesPipeline): def s3_request(self, key, method, body=None, headers=None): url = 'http://%s.s3.amazonaws.com/%s%s' % (self.bucket_name, self.key_prefix, key) - req = Request(url, method=method, body=body, headers=headers) - return req + return Request(url, method=method, body=body, headers=headers) def stat_key(self, key, info): def _onsuccess(response): @@ -97,5 +96,3 @@ class S3ImagesPipeline(BaseImagesPipeline): # need to use schedule to auto-open domain return scrapyengine.schedule(request, self.AmazonS3Spider) return self.download(request, info) - -