
imagepipeline: simplify configuration using a single setting to set up different backend stores

--HG--
extra : rebase_source : cd0d5f560dcde0003974e360282a233ba387c5be
Daniel Grana 2009-09-01 18:00:56 -03:00
parent b05fd80640
commit d6adc141de
2 changed files with 163 additions and 73 deletions
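
The single setting the commit message refers to is IMAGES_STORE, introduced in the diff below: its URI scheme selects the backend store ('' and 'file' map to FSImagesStore, 's3' to S3ImagesStore). A minimal sketch of how it might be configured in a project's settings module (the path and bucket name are made-up examples):

    # settings.py (sketch; path and bucket are hypothetical)
    IMAGES_STORE = '/var/data/images'            # local filesystem store
    # IMAGES_STORE = 'file:///var/data/images'   # equivalent, with explicit scheme
    # IMAGES_STORE = 's3://my-bucket/images/'    # Amazon S3 store (bucket + key prefix)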

View File

@@ -8,17 +8,24 @@ from __future__ import with_statement
import os
import time
import hashlib
import urlparse
import rfc822
import Image
from cStringIO import StringIO
from collections import defaultdict
from twisted.internet import defer
from scrapy.xlib.pydispatch import dispatcher
from scrapy import log
from scrapy.stats import stats
from scrapy.utils.misc import md5sum
from scrapy.core import signals
from scrapy.core.engine import scrapyengine
from scrapy.core.exceptions import DropItem, NotConfigured
from scrapy.conf import settings
from scrapy.contrib.pipeline.media import MediaPipeline
from scrapy.http import Request
from scrapy.conf import settings
class NoimagesDrop(DropItem):
@@ -28,17 +35,159 @@ class ImageException(Exception):
"""General image error exception"""
class BaseImagesPipeline(MediaPipeline):
class FSImagesStore(object):
def __init__(self, basedir):
if '://' in basedir:
basedir = basedir.split('://', 1)[1]
self.basedir = basedir
self._mkdir(self.basedir)
self.created_directories = defaultdict(set)
dispatcher.connect(self.domain_closed, signals.domain_closed)
def domain_closed(self, domain):
self.created_directories.pop(domain, None)
def persist_image(self, key, image, buf, info):
absolute_path = self._get_filesystem_path(key)
self._mkdir(os.path.dirname(absolute_path), info)
image.save(absolute_path)
def stat_image(self, key, info):
absolute_path = self._get_filesystem_path(key)
try:
last_modified = os.path.getmtime(absolute_path)
except: # FIXME: catching everything!
return {}
with open(absolute_path, 'rb') as imagefile:
checksum = md5sum(imagefile)
return {'last_modified': last_modified, 'checksum': checksum}
def _get_filesystem_path(self, key):
return os.path.join(self.basedir, key)
def _mkdir(self, dirname, domain=None):
seen = self.created_directories[domain] if domain else set()
if dirname not in seen:
if not os.path.exists(dirname):
os.makedirs(dirname)
seen.add(dirname)
class S3ImagesStore(object):
request_priority = 1000
def __init__(self, uri):
assert uri.startswith('s3://')
self.bucket, self.prefix = uri[5:].split('/', 1)
self._set_custom_spider()
def _set_custom_spider(self):
domain = settings['IMAGES_S3STORE_SPIDER']
if domain:
from scrapy.spider import spiders
self.s3_spider = spiders.fromdomain(domain)
else:
self.s3_spider = None
def stat_image(self, key, info):
def _onsuccess(response):
if response.status == 200:
checksum = response.headers['Etag'].strip('"')
last_modified = response.headers['Last-Modified']
modified_tuple = rfc822.parsedate_tz(last_modified)
modified_stamp = int(rfc822.mktime_tz(modified_tuple))
return {'checksum': checksum, 'last_modified': modified_stamp}
req = self._build_request(key, method='HEAD')
return self._download_request(req, info).addCallback(_onsuccess)
def persist_image(self, key, image, buf, info):
"""Upload image to S3 storage"""
width, height = image.size
headers = {
'Content-Type': 'image/jpeg',
'X-Amz-Acl': 'public-read',
'X-Amz-Meta-Width': str(width),
'X-Amz-Meta-Height': str(height),
'Cache-Control': 'max-age=172800',
}
buf.seek(0)
req = self._build_request(key, method='PUT', body=buf.read(), headers=headers)
return self._download_request(req, info)
def _build_request(self, key, method, body=None, headers=None):
url = 'http://%s.s3.amazonaws.com/%s%s' % (self.bucket, self.prefix, key)
return Request(url, method=method, body=body, headers=headers, \
priority=self.request_priority)
def _download_request(self, request, info):
"""This method is used for HEAD and PUT requests sent to amazon S3
It tries to use a specific spider domain for uploads, or defaults
to current domain spider.
"""
if self.s3_spider:
# need to use schedule to auto-open domain
return scrapyengine.schedule(request, self.s3_spider)
return scrapyengine.download(request, info.spider)
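As the docstring above notes, S3 HEAD/PUT requests can be routed through a dedicated spider instead of the spider currently being crawled. A sketch of the setting S3ImagesStore reads for that (the domain value is purely illustrative):

    # settings.py (sketch; the domain is a made-up example)
    IMAGES_S3STORE_SPIDER = 'images-uploads.example.com'   # spider domain used to schedule S3 upload requests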
class ImagesPipeline(MediaPipeline):
"""Abstract pipeline that implement the image downloading and thumbnail generation logic
This pipeline tries to minimize network transfers and image processing,
doing stat of the images and determining if image is new, uptodate or
expired.
`new` images are those that pipeline never processed and needs to be
downloaded from supplier site the first time.
`uptodate` images are the ones that the pipeline processed and are still
valid images.
`expired` images are those that pipeline already processed but the last
modification was made long time ago, so a reprocessing is recommended to
refresh it in case of change.
IMAGES_EXPIRES setting controls the maximun days since an image was modified
to consider it uptodate.
THUMBS is a tuple of tuples, each sub-tuple is a pair of thumb_id string
and a compatible python image library size (a tuple).
See thumbnail method at http://www.pythonware.com/library/pil/handbook/image.htm
Downloaded images are skipped if sizes aren't greater than MIN_WIDTH and
MIN_HEIGHT limit. A proper log messages will be printed.
"""
MEDIA_NAME = 'image'
MIN_WIDTH = settings.getint('IMAGES_MIN_WIDTH', 0)
MIN_HEIGHT = settings.getint('IMAGES_MIN_HEIGHT', 0)
IMAGES_EXPIRES = settings.getint('IMAGES_EXPIRES', 90)
MEDIA_NAME = 'image'
THUMBS = (
# ('50', (50, 50)),
# ('110', (110, 110)),
# ('270', (270, 270))
)
EXPIRES = settings.getint('IMAGES_EXPIRES', 90)
THUMBS = settings.getlist('IMAGES_THUMBS')
STORE_SCHEMES = {
'': FSImagesStore,
'file': FSImagesStore,
's3': S3ImagesStore,
}
def __init__(self):
store_uri = settings['IMAGES_STORE']
if not store_uri:
raise NotConfigured
self.store = self._get_store(store_uri)
super(ImagesPipeline, self).__init__()
def _get_store(self, uri):
scheme = urlparse.urlparse(uri).scheme
store_cls = self.STORE_SCHEMES[scheme]
return store_cls(uri)
def media_downloaded(self, response, request, info):
mtype = self.MEDIA_NAME
@@ -92,7 +241,7 @@ class BaseImagesPipeline(MediaPipeline):
age_seconds = time.time() - last_modified
age_days = age_seconds / 60 / 60 / 24
if age_days > self.IMAGES_EXPIRES:
if age_days > self.EXPIRES:
return # returning None force download
referer = request.headers.get('Referer')
@@ -104,15 +253,15 @@ class BaseImagesPipeline(MediaPipeline):
return {'scraped_url': request.url, 'path': key, 'checksum': checksum}
key = self.image_key(request.url)
dfd = defer.maybeDeferred(self.stat_key, key, info)
dfd = defer.maybeDeferred(self.store.stat_image, key, info)
dfd.addCallbacks(_onsuccess, lambda _:None)
dfd.addErrback(log.err, self.__class__.__name__ + '.stat_key')
dfd.addErrback(log.err, self.__class__.__name__ + '.store.stat_image')
return dfd
def image_downloaded(self, response, request, info):
first_buf = None
for key, image, buf in self.get_images(response, request, info):
self.store_image(key, image, buf, info)
self.store.persist_image(key, image, buf, info)
if first_buf is None:
first_buf = buf
first_buf.seek(0)
@@ -162,59 +311,3 @@ class BaseImagesPipeline(MediaPipeline):
def thumb_key(self, url, thumb_id):
image_guid = hashlib.sha1(url).hexdigest()
return 'thumbs/%s/%s.jpg' % (thumb_id, image_guid)
# Required overridable interface
def store_image(self, key, image, buf, info):
raise NotImplementedError
def stat_key(self, key, info):
raise NotImplementedError
class ImagesPipeline(BaseImagesPipeline):
"""Images pipeline with filesystem support as image's store backend
If IMAGES_DIR setting has a valid value, this pipeline is enabled and use
path defined at setting as dirname for storing images.
"""
class DomainInfo(BaseImagesPipeline.DomainInfo):
def __init__(self, domain):
self.created_directories = set()
super(ImagesPipeline.DomainInfo, self).__init__(domain)
def __init__(self):
if not settings['IMAGES_DIR']:
raise NotConfigured
self.BASEDIRNAME = settings['IMAGES_DIR']
self.mkdir(self.BASEDIRNAME)
super(ImagesPipeline, self).__init__()
def store_image(self, key, image, buf, info):
absolute_path = self.get_filesystem_path(key)
self.mkdir(os.path.dirname(absolute_path), info)
image.save(absolute_path)
def stat_key(self, key, info):
absolute_path = self.get_filesystem_path(key)
try:
last_modified = os.path.getmtime(absolute_path)
except:
return {}
with open(absolute_path, 'rb') as imagefile:
checksum = md5sum(imagefile)
return {'last_modified': last_modified, 'checksum': checksum}
def get_filesystem_path(self, key):
return os.path.join(self.BASEDIRNAME, key)
def mkdir(self, dirname, info=None):
already_created = info.created_directories if info else set()
if dirname not in already_created:
if not os.path.exists(dirname):
os.makedirs(dirname)
already_created.add(dirname)

View File

@@ -54,8 +54,7 @@ class S3ImagesPipeline(BaseImagesPipeline):
def s3_request(self, key, method, body=None, headers=None):
url = 'http://%s.s3.amazonaws.com/%s%s' % (self.bucket_name, self.key_prefix, key)
req = Request(url, method=method, body=body, headers=headers)
return req
return Request(url, method=method, body=body, headers=headers)
def stat_key(self, key, info):
def _onsuccess(response):
@@ -97,5 +96,3 @@ class S3ImagesPipeline(BaseImagesPipeline):
# need to use schedule to auto-open domain
return scrapyengine.schedule(request, self.AmazonS3Spider)
return self.download(request, info)