
Ported S3ImagesStore to use boto threads. This simplifies the code and makes
the following things no longer needed:

1. custom spider for S3 requests (ex. _S3AmazonAWSSpider)
2. scrapy.contrib.aws.AWSMiddleware
3. scrapy.utils.aws
This commit is contained in:
Pablo Hoffman 2010-05-26 10:29:32 -03:00
parent c8c19a8e53
commit 8c1feb7ae4
4 changed files with 28 additions and 271 deletions
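For context, the core idea of this port is Twisted's threads.deferToThread: a blocking boto call runs in the reactor's thread pool and the caller gets back a Deferred, so S3 traffic no longer has to be routed through the Scrapy downloader at all. A minimal sketch of that pattern, assuming boto is installed and the reactor is running (the helper name and arguments below are illustrative, not part of this commit):

# Sketch only: wrap a blocking boto S3 lookup in a worker thread and return a Deferred.
from twisted.internet import threads
from boto.s3.connection import S3Connection

def stat_s3_key(access_key, secret_key, bucket_name, key_name):
    def _blocking_stat():
        # plain blocking boto code -- fine here because it runs in a worker thread
        bucket = S3Connection(access_key, secret_key).get_bucket(bucket_name, validate=False)
        return bucket.get_key(key_name)
    # deferToThread returns a Deferred that fires with _blocking_stat()'s result
    return threads.deferToThread(_blocking_stat)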


@@ -1,24 +0,0 @@
"""
A downloader middleware for signing AWS requests just before they get into the
downloader. It is important to sign as close to the downloader as possible
because Amazon Web Service use timestamps for authentication.
"""
import os
from time import strftime, gmtime
from scrapy.utils.aws import sign_request
from scrapy.conf import settings
class AWSMiddleware(object):
def __init__(self):
self.access_key = settings['AWS_ACCESS_KEY_ID'] or \
os.environ.get('AWS_ACCESS_KEY_ID')
self.secret_key = settings['AWS_SECRET_ACCESS_KEY'] or \
os.environ.get('AWS_SECRET_ACCESS_KEY')
def process_request(self, request, spider):
if request.meta.get('sign_s3_request'):
request.headers['Date'] = strftime("%a, %d %b %Y %H:%M:%S GMT", gmtime())
sign_request(request, self.access_key, self.secret_key)
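Before this change, the middleware above had to be enabled explicitly and every S3 request flagged through request.meta so that process_request() would add the Date header and sign it. A rough sketch of that wiring, with an arbitrary middleware priority and an example URL (both illustrative, not taken from this commit):

# settings.py -- enable the (now removed) signing middleware
DOWNLOADER_MIDDLEWARES = {
    'scrapy.contrib.aws.AWSMiddleware': 590,  # priority value is illustrative
}

# each request to S3 carried the 'sign_s3_request' meta flag
from scrapy.http import Request
req = Request('http://mybucket.s3.amazonaws.com/images/full/abc123.jpg',
              method='HEAD', meta={'sign_s3_request': True})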


@@ -14,18 +14,15 @@ import Image
 from cStringIO import StringIO
 from collections import defaultdict
-from twisted.internet import defer
+from twisted.internet import defer, threads
 from scrapy.xlib.pydispatch import dispatcher
 from scrapy import log
 from scrapy.stats import stats
 from scrapy.utils.misc import md5sum
 from scrapy.core import signals
-from scrapy.core.engine import scrapyengine
 from scrapy.core.exceptions import DropItem, NotConfigured, IgnoreRequest
-from scrapy.spider import BaseSpider
 from scrapy.contrib.pipeline.media import MediaPipeline
-from scrapy.http import Request
 from scrapy.conf import settings
@@ -78,83 +75,47 @@ class FSImagesStore(object):
             seen.add(dirname)
 
-class _S3AmazonAWSSpider(BaseSpider):
-    """This spider is used for uploading images to Amazon S3
-
-    It is basically not a crawling spider like a normal spider is, this spider is
-    a placeholder that allows us to open a different slot in downloader and use it
-    for uploads to S3.
-
-    The use of another downloader slot for S3 images avoid the effect of normal
-    spider downloader slot to be affected by requests to a complete different
-    domain (s3.amazonaws.com).
-
-    It means that a spider that uses download_delay or alike is not going to be
-    delayed even more because it is uploading images to s3.
-    """
-    name = "s3.amazonaws.com"
-    start_urls = ['http://s3.amazonaws.com/']
-    max_concurrent_requests = 100
-
 class S3ImagesStore(object):
-    request_priority = 1000
+    AWS_ACCESS_KEY_ID = settings['AWS_ACCESS_KEY_ID']
+    AWS_SECRET_ACCESS_KEY = settings['AWS_SECRET_ACCESS_KEY']
 
     def __init__(self, uri):
         assert uri.startswith('s3://')
         self.bucket, self.prefix = uri[5:].split('/', 1)
-        self._set_custom_spider()
-
-    def _set_custom_spider(self):
-        use_custom_spider = bool(settings['IMAGES_S3STORE_SPIDER'])
-        if use_custom_spider:
-            self.s3_spider = _S3AmazonAWSSpider()
-        else:
-            self.s3_spider = None
 
     def stat_image(self, key, info):
-        def _onsuccess(response):
-            if response.status == 200:
-                checksum = response.headers['Etag'].strip('"')
-                last_modified = response.headers['Last-Modified']
-                modified_tuple = rfc822.parsedate_tz(last_modified)
-                modified_stamp = int(rfc822.mktime_tz(modified_tuple))
-                return {'checksum': checksum, 'last_modified': modified_stamp}
+        def _onsuccess(boto_key):
+            checksum = boto_key.etag.strip('"')
+            last_modified = boto_key.last_modified
+            modified_tuple = rfc822.parsedate_tz(last_modified)
+            modified_stamp = int(rfc822.mktime_tz(modified_tuple))
+            return {'checksum': checksum, 'last_modified': modified_stamp}
 
-        req = self._build_request(key, method='HEAD')
-        return self._download_request(req, info).addCallback(_onsuccess)
+        return self._get_boto_key(key).addCallback(_onsuccess)
+
+    def _get_boto_bucket(self):
+        from boto.s3.connection import S3Connection
+        c = S3Connection(self.AWS_ACCESS_KEY_ID, self.AWS_SECRET_ACCESS_KEY)
+        return c.get_bucket(self.bucket, validate=False)
+
+    def _get_boto_key(self, key):
+        b = self._get_boto_bucket()
+        key_name = '%s%s' % (self.prefix, key)
+        return threads.deferToThread(b.get_key, key_name)
 
     def persist_image(self, key, image, buf, info):
         """Upload image to S3 storage"""
         width, height = image.size
-        headers = {
-            'Content-Type': 'image/jpeg',
-            'X-Amz-Acl': 'public-read',
-            'X-Amz-Meta-Width': str(width),
-            'X-Amz-Meta-Height': str(height),
-            'Cache-Control': 'max-age=172800',
-        }
+        headers = {'Cache-Control': 'max-age=172800'}  # 2 days of cache
+        b = self._get_boto_bucket()
+        key_name = '%s%s' % (self.prefix, key)
+        k = b.new_key(key_name)
+        k.set_metadata('width', str(width))
+        k.set_metadata('height', str(height))
         buf.seek(0)
-        req = self._build_request(key, method='PUT', body=buf.read(), headers=headers)
-        return self._download_request(req, info)
-
-    def _build_request(self, key, method, body=None, headers=None):
-        url = 'http://%s.s3.amazonaws.com/%s%s' % (self.bucket, self.prefix, key)
-        return Request(url, method=method, body=body, headers=headers, \
-            meta={'sign_s3_request': True}, priority=self.request_priority)
-
-    def _download_request(self, request, info):
-        """This method is used for HEAD and PUT requests sent to amazon S3
-
-        It tries to use a specific spider domain for uploads, or defaults
-        to current domain spider.
-        """
-        if self.s3_spider:
-            # need to use schedule to auto-open domain
-            return scrapyengine.schedule(request, self.s3_spider)
-        return scrapyengine.download(request, info.spider)
+        return threads.deferToThread(k.set_contents_from_file, buf, headers, \
+            policy='public-read')
 
 class ImagesPipeline(MediaPipeline):
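As a usage note, both stat_image() and persist_image() now return Deferreds fired from boto worker threads rather than downloader responses. A rough sketch of how a caller might consume the new API (the s3:// URI, key name and callback are made up for illustration; in practice ImagesPipeline drives these calls):

# Illustrative only: the store parses the URI into bucket='mybucket', prefix='images/'
store = S3ImagesStore('s3://mybucket/images/')

def _print_stat(result):
    # result is the dict built by _onsuccess(): checksum plus last_modified stamp
    print result['checksum'], result['last_modified']

d = store.stat_image('full/abc123.jpg', info=None)  # info is no longer used by this store
d.addCallback(_print_stat)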


@@ -1,108 +0,0 @@
from unittest import TestCase, main

from scrapy.utils import aws
from scrapy.http import Request

# just some random keys. keys are provided by amazon developer guide at
# http://s3.amazonaws.com/awsdocs/S3/20060301/s3-dg-20060301.pdf
# and the tests described here are the examples from that manual
AWS_ACCESS_KEY_ID = '0PN5J17HBGZHT7JJ3X82'
AWS_SECRET_ACCESS_KEY = 'uV3F3YluFJax1cknvbcGwgjvx4QpvB+leU8dUj2o'


class ScrapyAWSTest(TestCase):

    def test_cannonical_string1(self):
        cs = aws.canonical_string('GET', '/johnsmith/photos/puppy.jpg', {
            'Host': 'johnsmith.s3.amazonaws.com',
            'Date': 'Tue, 27 Mar 2007 19:36:42 +0000',
            })
        self.assertEqual(cs, \
            '''GET\n\n\nTue, 27 Mar 2007 19:36:42 +0000\n/johnsmith/photos/puppy.jpg''')

    def test_cannonical_string2(self):
        cs = aws.canonical_string('PUT', '/johnsmith/photos/puppy.jpg', {
            'Content-Type': 'image/jpeg',
            'Host': 'johnsmith.s3.amazonaws.com',
            'Date': 'Tue, 27 Mar 2007 21:15:45 +0000',
            'Content-Length': '94328',
            })
        self.assertEqual(cs, \
            '''PUT\n\nimage/jpeg\nTue, 27 Mar 2007 21:15:45 +0000\n/johnsmith/photos/puppy.jpg''')

    def test_request_signing1(self):
        # gets an object from the johnsmith bucket.
        req = Request('http://johnsmith.s3.amazonaws.com/photos/puppy.jpg', headers={
            'Date': 'Tue, 27 Mar 2007 19:36:42 +0000',
            })
        aws.sign_request(req, AWS_ACCESS_KEY_ID, AWS_SECRET_ACCESS_KEY)
        self.assertEqual(req.headers['Authorization'], \
            'AWS 0PN5J17HBGZHT7JJ3X82:xXjDGYUmKxnwqr5KXNPGldn5LbA=')

    def test_request_signing2(self):
        # puts an object into the johnsmith bucket.
        req = Request('http://johnsmith.s3.amazonaws.com/photos/puppy.jpg', method='PUT', headers={
            'Content-Type': 'image/jpeg',
            'Date': 'Tue, 27 Mar 2007 21:15:45 +0000',
            'Content-Length': '94328',
            })
        aws.sign_request(req, AWS_ACCESS_KEY_ID, AWS_SECRET_ACCESS_KEY)
        self.assertEqual(req.headers['Authorization'], \
            'AWS 0PN5J17HBGZHT7JJ3X82:hcicpDDvL9SsO6AkvxqmIWkmOuQ=')

    def test_request_signing3(self):
        # lists the content of the johnsmith bucket.
        req = Request('http://johnsmith.s3.amazonaws.com/?prefix=photos&max-keys=50&marker=puppy', \
            method='GET', headers={
                'User-Agent': 'Mozilla/5.0',
                'Date': 'Tue, 27 Mar 2007 19:42:41 +0000',
                })
        aws.sign_request(req, AWS_ACCESS_KEY_ID, AWS_SECRET_ACCESS_KEY)
        self.assertEqual(req.headers['Authorization'], \
            'AWS 0PN5J17HBGZHT7JJ3X82:jsRt/rhG+Vtp88HrYL706QhE4w4=')

    def test_request_signing4(self):
        # fetches the access control policy sub-resource for the 'johnsmith' bucket.
        req = Request('http://johnsmith.s3.amazonaws.com/?acl', \
            method='GET', headers={
                'Date': 'Tue, 27 Mar 2007 19:44:46 +0000',
                })
        aws.sign_request(req, AWS_ACCESS_KEY_ID, AWS_SECRET_ACCESS_KEY)
        self.assertEqual(req.headers['Authorization'], \
            'AWS 0PN5J17HBGZHT7JJ3X82:thdUi9VAkzhkniLj96JIrOPGi0g=')

    def test_request_signing5(self):
        # deletes an object from the 'johnsmith' bucket using the path-style and Date alternative.
        req = Request('http://johnsmith.s3.amazonaws.com/photos/puppy.jpg', \
            method='DELETE', headers={
                'Date': 'Tue, 27 Mar 2007 21:20:27 +0000',
                'x-amz-date': 'Tue, 27 Mar 2007 21:20:26 +0000',
                })
        aws.sign_request(req, AWS_ACCESS_KEY_ID, AWS_SECRET_ACCESS_KEY)
        self.assertEqual(req.headers['Authorization'], \
            'AWS 0PN5J17HBGZHT7JJ3X82:k3nL7gH3+PadhTEVn5Ip83xlYzk=')

    def test_request_signing6(self):
        # uploads an object to a CNAME style virtual hosted bucket with metadata.
        req = Request('http://static.johnsmith.net:8080/db-backup.dat.gz', \
            method='PUT', headers={
                'User-Agent': 'curl/7.15.5',
                'Host': 'static.johnsmith.net:8080',
                'Date': 'Tue, 27 Mar 2007 21:06:08 +0000',
                'x-amz-acl': 'public-read',
                'content-type': 'application/x-download',
                'Content-MD5': '4gJE4saaMU4BqNR0kLY+lw==',
                'X-Amz-Meta-ReviewedBy': 'joe@johnsmith.net,jane@johnsmith.net',
                'X-Amz-Meta-FileChecksum': '0x02661779',
                'X-Amz-Meta-ChecksumAlgorithm': 'crc32',
                'Content-Disposition': 'attachment; filename=database.dat',
                'Content-Encoding': 'gzip',
                'Content-Length': '5913339',
                })
        aws.sign_request(req, AWS_ACCESS_KEY_ID, AWS_SECRET_ACCESS_KEY)
        self.assertEqual(req.headers['Authorization'], \
            'AWS 0PN5J17HBGZHT7JJ3X82:C0FlOtU8Ylb9KDTpZqYkZPX91iI=')


if __name__ == '__main__':
    main()


@@ -1,72 +0,0 @@
"""Helper function for working with Amazon Web Services"""
import re
import time
import hmac
import base64
import hashlib
from urlparse import urlsplit
AMAZON_HEADER_PREFIX = 'x-amz-'
# generates the aws canonical string for the given parameters
def canonical_string(method, path, headers, expires=None):
interesting_headers = {}
for key in headers:
lk = key.lower()
if lk in set(['content-md5', 'content-type', 'date']) \
or lk.startswith(AMAZON_HEADER_PREFIX):
interesting_headers[lk] = headers[key].strip()
# these keys get empty strings if they don't exist
interesting_headers.setdefault('content-type', '')
interesting_headers.setdefault('content-md5', '')
# just in case someone used this. it's not necessary in this lib.
if 'x-amz-date' in interesting_headers:
interesting_headers['date'] = ''
# if you're using expires for query string auth, then it trumps date
# (and x-amz-date)
if expires:
interesting_headers['date'] = str(expires)
sorted_header_keys = interesting_headers.keys()
sorted_header_keys.sort()
buf = "%s\n" % method
for key in sorted_header_keys:
if key.startswith(AMAZON_HEADER_PREFIX):
buf += "%s:%s\n" % (key, interesting_headers[key])
else:
buf += "%s\n" % interesting_headers[key]
# don't include anything after the first ? in the resource...
buf += "%s" % path.split('?')[0]
# ...unless there is an acl or torrent parameter
if re.search("[&?]acl($|=|&)", path):
buf += "?acl"
elif re.search("[&?]logging($|=|&)", path):
buf += "?logging"
elif re.search("[&?]torrent($|=|&)", path):
buf += "?torrent"
elif re.search("[&?]location($|=|&)", path):
buf += "?location"
return buf
def sign_request(req, accesskey, secretkey):
if 'Date' not in req.headers:
req.headers['Date'] = time.strftime("%a, %d %b %Y %H:%M:%S GMT", time.gmtime())
parsed = urlsplit(req.url)
bucket = parsed.hostname.replace('.s3.amazonaws.com','')
key = '%s?%s' % (parsed.path, parsed.query) if parsed.query else parsed.path
fqkey = '/%s%s' % (bucket, key)
c_string = canonical_string(req.method, fqkey, req.headers)
_hmac = hmac.new(secretkey, digestmod=hashlib.sha1)
_hmac.update(c_string)
b64_hmac = base64.encodestring(_hmac.digest()).strip()
req.headers['Authorization'] = "AWS %s:%s" % (accesskey, b64_hmac)
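Since scrapy.utils.aws is being dropped, here is a small standalone sketch (Python 2, like the code above) that reproduces the first AWS documentation example from the tests using the same HMAC-SHA1 plus Base64 scheme that canonical_string() and sign_request() implemented:

# Reproduce test_request_signing1 without scrapy.utils.aws
import hmac, hashlib, base64

secret = 'uV3F3YluFJax1cknvbcGwgjvx4QpvB+leU8dUj2o'
string_to_sign = ('GET\n\n\n'
                  'Tue, 27 Mar 2007 19:36:42 +0000\n'
                  '/johnsmith/photos/puppy.jpg')
signature = base64.encodestring(
    hmac.new(secret, string_to_sign, hashlib.sha1).digest()).strip()
assert signature == 'xXjDGYUmKxnwqr5KXNPGldn5LbA='
print "AWS 0PN5J17HBGZHT7JJ3X82:%s" % signature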