Mirror of https://github.com/scrapy/scrapy.git, synced 2025-02-24 06:24:00 +00:00
Ported S3ImagesStore to use boto threads. This simplifies the code and removes the need for:
1. the custom spider for S3 requests (e.g. _S3AmazonAWSSpider)
2. scrapy.contrib.aws.AWSMiddleware
3. scrapy.utils.aws
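The core of the change is to run blocking boto calls in Twisted's thread pool (twisted.internet.threads.deferToThread) instead of building signed HTTP requests and pushing them through Scrapy's downloader. A minimal sketch of that pattern, with made-up bucket/key names and credentials (this is not the shipped code):

from twisted.internet import threads
from boto.s3.connection import S3Connection

def get_key_deferred(bucket_name, key_name, access_key, secret_key):
    """Fetch a boto Key in a worker thread and return a Deferred."""
    def _blocking_get():
        # boto's API is blocking; keep all of it inside the thread.
        conn = S3Connection(access_key, secret_key)
        bucket = conn.get_bucket(bucket_name, validate=False)
        return bucket.get_key(key_name)
    return threads.deferToThread(_blocking_get)

# Usage (hypothetical values):
# d = get_key_deferred('my-bucket', 'images/full/example.jpg', AWS_KEY, AWS_SECRET)
# d.addCallback(lambda k: k.etag if k else None)

Because the boto work never touches the downloader, no downloader slot is tied up and no request signing middleware is required, which is exactly what lets the items below be dropped.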
parent c8c19a8e53
commit 8c1feb7ae4
@@ -1,24 +0,0 @@
"""
A downloader middleware for signing AWS requests just before they get into the
downloader. It is important to sign as close to the downloader as possible
because Amazon Web Services uses timestamps for authentication.
"""

import os

from time import strftime, gmtime

from scrapy.utils.aws import sign_request
from scrapy.conf import settings


class AWSMiddleware(object):

    def __init__(self):
        self.access_key = settings['AWS_ACCESS_KEY_ID'] or \
            os.environ.get('AWS_ACCESS_KEY_ID')
        self.secret_key = settings['AWS_SECRET_ACCESS_KEY'] or \
            os.environ.get('AWS_SECRET_ACCESS_KEY')

    def process_request(self, request, spider):
        if request.meta.get('sign_s3_request'):
            request.headers['Date'] = strftime("%a, %d %b %Y %H:%M:%S GMT", gmtime())
            sign_request(request, self.access_key, self.secret_key)
@@ -14,18 +14,15 @@ import Image
from cStringIO import StringIO
from collections import defaultdict

from twisted.internet import defer
from twisted.internet import defer, threads

from scrapy.xlib.pydispatch import dispatcher
from scrapy import log
from scrapy.stats import stats
from scrapy.utils.misc import md5sum
from scrapy.core import signals
from scrapy.core.engine import scrapyengine
from scrapy.core.exceptions import DropItem, NotConfigured, IgnoreRequest
from scrapy.spider import BaseSpider
from scrapy.contrib.pipeline.media import MediaPipeline
from scrapy.http import Request
from scrapy.conf import settings


@@ -78,83 +75,47 @@ class FSImagesStore(object):
            seen.add(dirname)


class _S3AmazonAWSSpider(BaseSpider):
    """This spider is used for uploading images to Amazon S3.

    It is not a crawling spider like a normal spider; it is a placeholder
    that allows us to open a separate downloader slot and use it for
    uploads to S3.

    Using a separate downloader slot for S3 images keeps the normal spider's
    slot from being affected by requests to a completely different domain
    (s3.amazonaws.com).

    This means that a spider that uses download_delay or similar is not
    delayed further just because it is uploading images to S3.
    """
    name = "s3.amazonaws.com"
    start_urls = ['http://s3.amazonaws.com/']
    max_concurrent_requests = 100


class S3ImagesStore(object):

    request_priority = 1000
    AWS_ACCESS_KEY_ID = settings['AWS_ACCESS_KEY_ID']
    AWS_SECRET_ACCESS_KEY = settings['AWS_SECRET_ACCESS_KEY']

    def __init__(self, uri):
        assert uri.startswith('s3://')
        self.bucket, self.prefix = uri[5:].split('/', 1)
        self._set_custom_spider()

    def _set_custom_spider(self):
        use_custom_spider = bool(settings['IMAGES_S3STORE_SPIDER'])
        if use_custom_spider:
            self.s3_spider = _S3AmazonAWSSpider()
        else:
            self.s3_spider = None

    def stat_image(self, key, info):
        def _onsuccess(response):
            if response.status == 200:
                checksum = response.headers['Etag'].strip('"')
                last_modified = response.headers['Last-Modified']
                modified_tuple = rfc822.parsedate_tz(last_modified)
                modified_stamp = int(rfc822.mktime_tz(modified_tuple))
                return {'checksum': checksum, 'last_modified': modified_stamp}
        def _onsuccess(boto_key):
            checksum = boto_key.etag.strip('"')
            last_modified = boto_key.last_modified
            modified_tuple = rfc822.parsedate_tz(last_modified)
            modified_stamp = int(rfc822.mktime_tz(modified_tuple))
            return {'checksum': checksum, 'last_modified': modified_stamp}

        req = self._build_request(key, method='HEAD')
        return self._download_request(req, info).addCallback(_onsuccess)
        return self._get_boto_key(key).addCallback(_onsuccess)

    def _get_boto_bucket(self):
        from boto.s3.connection import S3Connection
        c = S3Connection(self.AWS_ACCESS_KEY_ID, self.AWS_SECRET_ACCESS_KEY)
        return c.get_bucket(self.bucket, validate=False)

    def _get_boto_key(self, key):
        b = self._get_boto_bucket()
        key_name = '%s%s' % (self.prefix, key)
        return threads.deferToThread(b.get_key, key_name)

    def persist_image(self, key, image, buf, info):
        """Upload image to S3 storage"""
        width, height = image.size
        headers = {
            'Content-Type': 'image/jpeg',
            'X-Amz-Acl': 'public-read',
            'X-Amz-Meta-Width': str(width),
            'X-Amz-Meta-Height': str(height),
            'Cache-Control': 'max-age=172800',
        }

        headers = {'Cache-Control': 'max-age=172800'}  # 2 days of cache
        b = self._get_boto_bucket()
        key_name = '%s%s' % (self.prefix, key)
        k = b.new_key(key_name)
        k.set_metadata('width', str(width))
        k.set_metadata('height', str(height))
        buf.seek(0)
        req = self._build_request(key, method='PUT', body=buf.read(), headers=headers)
        return self._download_request(req, info)

    def _build_request(self, key, method, body=None, headers=None):
        url = 'http://%s.s3.amazonaws.com/%s%s' % (self.bucket, self.prefix, key)
        return Request(url, method=method, body=body, headers=headers, \
            meta={'sign_s3_request': True}, priority=self.request_priority)

    def _download_request(self, request, info):
        """This method is used for HEAD and PUT requests sent to Amazon S3.

        It tries to use a dedicated spider slot for uploads, and falls back
        to the current spider's slot.
        """
        if self.s3_spider:
            # need to use schedule to auto-open domain
            return scrapyengine.schedule(request, self.s3_spider)
        return scrapyengine.download(request, info.spider)
        return threads.deferToThread(k.set_contents_from_file, buf, headers, \
            policy='public-read')


class ImagesPipeline(MediaPipeline):

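Since the old and new lines are interleaved in the hunk above, here is the new persist_image upload path isolated into a standalone sketch, with hypothetical bucket/key/credential names (not part of this diff): create a boto key, attach width/height metadata, and upload the buffer in a worker thread with a public-read ACL.

from twisted.internet import threads
from boto.s3.connection import S3Connection

def upload_image(buf, width, height, bucket_name, key_name, access_key, secret_key):
    """Upload an in-memory JPEG buffer to S3 in a worker thread."""
    bucket = S3Connection(access_key, secret_key).get_bucket(bucket_name, validate=False)
    k = bucket.new_key(key_name)
    k.set_metadata('width', str(width))    # stored as x-amz-meta-width
    k.set_metadata('height', str(height))  # stored as x-amz-meta-height
    buf.seek(0)
    headers = {'Cache-Control': 'max-age=172800'}  # 2 days of cache
    # set_contents_from_file blocks, so defer it to the thread pool;
    # policy='public-read' takes the place of the old X-Amz-Acl header.
    return threads.deferToThread(k.set_contents_from_file, buf, headers,
                                 policy='public-read')

With the upload handled by boto directly, the removed request_priority, _build_request and _download_request machinery has nothing left to do, which is why the custom spider and signing middleware below can go as well.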
@@ -1,108 +0,0 @@
from unittest import TestCase, main

from scrapy.utils import aws
from scrapy.http import Request

# just some random keys. keys are provided by amazon developer guide at
# http://s3.amazonaws.com/awsdocs/S3/20060301/s3-dg-20060301.pdf
# and the tests described here are the examples from that manual

AWS_ACCESS_KEY_ID = '0PN5J17HBGZHT7JJ3X82'
AWS_SECRET_ACCESS_KEY = 'uV3F3YluFJax1cknvbcGwgjvx4QpvB+leU8dUj2o'


class ScrapyAWSTest(TestCase):
    def test_cannonical_string1(self):
        cs = aws.canonical_string('GET', '/johnsmith/photos/puppy.jpg', {
            'Host': 'johnsmith.s3.amazonaws.com',
            'Date': 'Tue, 27 Mar 2007 19:36:42 +0000',
            })
        self.assertEqual(cs, \
            '''GET\n\n\nTue, 27 Mar 2007 19:36:42 +0000\n/johnsmith/photos/puppy.jpg''')

    def test_cannonical_string2(self):
        cs = aws.canonical_string('PUT', '/johnsmith/photos/puppy.jpg', {
            'Content-Type': 'image/jpeg',
            'Host': 'johnsmith.s3.amazonaws.com',
            'Date': 'Tue, 27 Mar 2007 21:15:45 +0000',
            'Content-Length': '94328',
            })
        self.assertEqual(cs, \
            '''PUT\n\nimage/jpeg\nTue, 27 Mar 2007 21:15:45 +0000\n/johnsmith/photos/puppy.jpg''')

    def test_request_signing1(self):
        # gets an object from the johnsmith bucket.
        req = Request('http://johnsmith.s3.amazonaws.com/photos/puppy.jpg', headers={
            'Date': 'Tue, 27 Mar 2007 19:36:42 +0000',
            })
        aws.sign_request(req, AWS_ACCESS_KEY_ID, AWS_SECRET_ACCESS_KEY)
        self.assertEqual(req.headers['Authorization'], \
            'AWS 0PN5J17HBGZHT7JJ3X82:xXjDGYUmKxnwqr5KXNPGldn5LbA=')

    def test_request_signing2(self):
        # puts an object into the johnsmith bucket.
        req = Request('http://johnsmith.s3.amazonaws.com/photos/puppy.jpg', method='PUT', headers={
            'Content-Type': 'image/jpeg',
            'Date': 'Tue, 27 Mar 2007 21:15:45 +0000',
            'Content-Length': '94328',
            })
        aws.sign_request(req, AWS_ACCESS_KEY_ID, AWS_SECRET_ACCESS_KEY)
        self.assertEqual(req.headers['Authorization'], \
            'AWS 0PN5J17HBGZHT7JJ3X82:hcicpDDvL9SsO6AkvxqmIWkmOuQ=')

    def test_request_signing3(self):
        # lists the content of the johnsmith bucket.
        req = Request('http://johnsmith.s3.amazonaws.com/?prefix=photos&max-keys=50&marker=puppy', \
                method='GET', headers={
            'User-Agent': 'Mozilla/5.0',
            'Date': 'Tue, 27 Mar 2007 19:42:41 +0000',
            })
        aws.sign_request(req, AWS_ACCESS_KEY_ID, AWS_SECRET_ACCESS_KEY)
        self.assertEqual(req.headers['Authorization'], \
            'AWS 0PN5J17HBGZHT7JJ3X82:jsRt/rhG+Vtp88HrYL706QhE4w4=')

    def test_request_signing4(self):
        # fetches the access control policy sub-resource for the 'johnsmith' bucket.
        req = Request('http://johnsmith.s3.amazonaws.com/?acl', \
                method='GET', headers={
            'Date': 'Tue, 27 Mar 2007 19:44:46 +0000',
            })
        aws.sign_request(req, AWS_ACCESS_KEY_ID, AWS_SECRET_ACCESS_KEY)
        self.assertEqual(req.headers['Authorization'], \
            'AWS 0PN5J17HBGZHT7JJ3X82:thdUi9VAkzhkniLj96JIrOPGi0g=')

    def test_request_signing5(self):
        # deletes an object from the 'johnsmith' bucket using the path-style and Date alternative.
        req = Request('http://johnsmith.s3.amazonaws.com/photos/puppy.jpg', \
                method='DELETE', headers={
            'Date': 'Tue, 27 Mar 2007 21:20:27 +0000',
            'x-amz-date': 'Tue, 27 Mar 2007 21:20:26 +0000',
            })
        aws.sign_request(req, AWS_ACCESS_KEY_ID, AWS_SECRET_ACCESS_KEY)
        self.assertEqual(req.headers['Authorization'], \
            'AWS 0PN5J17HBGZHT7JJ3X82:k3nL7gH3+PadhTEVn5Ip83xlYzk=')

    def test_request_signing6(self):
        # uploads an object to a CNAME style virtual hosted bucket with metadata.
        req = Request('http://static.johnsmith.net:8080/db-backup.dat.gz', \
                method='PUT', headers={
            'User-Agent': 'curl/7.15.5',
            'Host': 'static.johnsmith.net:8080',
            'Date': 'Tue, 27 Mar 2007 21:06:08 +0000',
            'x-amz-acl': 'public-read',
            'content-type': 'application/x-download',
            'Content-MD5': '4gJE4saaMU4BqNR0kLY+lw==',
            'X-Amz-Meta-ReviewedBy': 'joe@johnsmith.net,jane@johnsmith.net',
            'X-Amz-Meta-FileChecksum': '0x02661779',
            'X-Amz-Meta-ChecksumAlgorithm': 'crc32',
            'Content-Disposition': 'attachment; filename=database.dat',
            'Content-Encoding': 'gzip',
            'Content-Length': '5913339',
            })
        aws.sign_request(req, AWS_ACCESS_KEY_ID, AWS_SECRET_ACCESS_KEY)
        self.assertEqual(req.headers['Authorization'], \
            'AWS 0PN5J17HBGZHT7JJ3X82:C0FlOtU8Ylb9KDTpZqYkZPX91iI=')


if __name__ == '__main__':
    main()
@@ -1,72 +0,0 @@
"""Helper functions for working with Amazon Web Services"""

import re
import time
import hmac
import base64
import hashlib
from urlparse import urlsplit

AMAZON_HEADER_PREFIX = 'x-amz-'

# generates the aws canonical string for the given parameters
def canonical_string(method, path, headers, expires=None):
    interesting_headers = {}
    for key in headers:
        lk = key.lower()
        if lk in set(['content-md5', 'content-type', 'date']) \
               or lk.startswith(AMAZON_HEADER_PREFIX):
            interesting_headers[lk] = headers[key].strip()

    # these keys get empty strings if they don't exist
    interesting_headers.setdefault('content-type', '')
    interesting_headers.setdefault('content-md5', '')

    # just in case someone used this. it's not necessary in this lib.
    if 'x-amz-date' in interesting_headers:
        interesting_headers['date'] = ''

    # if you're using expires for query string auth, then it trumps date
    # (and x-amz-date)
    if expires:
        interesting_headers['date'] = str(expires)

    sorted_header_keys = interesting_headers.keys()
    sorted_header_keys.sort()

    buf = "%s\n" % method
    for key in sorted_header_keys:
        if key.startswith(AMAZON_HEADER_PREFIX):
            buf += "%s:%s\n" % (key, interesting_headers[key])
        else:
            buf += "%s\n" % interesting_headers[key]

    # don't include anything after the first ? in the resource...
    buf += "%s" % path.split('?')[0]

    # ...unless there is an acl or torrent parameter
    if re.search("[&?]acl($|=|&)", path):
        buf += "?acl"
    elif re.search("[&?]logging($|=|&)", path):
        buf += "?logging"
    elif re.search("[&?]torrent($|=|&)", path):
        buf += "?torrent"
    elif re.search("[&?]location($|=|&)", path):
        buf += "?location"

    return buf

def sign_request(req, accesskey, secretkey):
    if 'Date' not in req.headers:
        req.headers['Date'] = time.strftime("%a, %d %b %Y %H:%M:%S GMT", time.gmtime())

    parsed = urlsplit(req.url)
    bucket = parsed.hostname.replace('.s3.amazonaws.com', '')
    key = '%s?%s' % (parsed.path, parsed.query) if parsed.query else parsed.path
    fqkey = '/%s%s' % (bucket, key)

    c_string = canonical_string(req.method, fqkey, req.headers)
    _hmac = hmac.new(secretkey, digestmod=hashlib.sha1)
    _hmac.update(c_string)
    b64_hmac = base64.encodestring(_hmac.digest()).strip()
    req.headers['Authorization'] = "AWS %s:%s" % (accesskey, b64_hmac)
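To make the removed signing logic concrete: for the first developer-guide example in the tests above (GET of photos/puppy.jpg), canonical_string produces exactly the string asserted in test_cannonical_string1, and sign_request HMAC-SHA1s it with the secret key and base64-encodes the digest. A standalone sketch in modern Python (plain hmac/base64, no Scrapy; values taken from the tests above) that reproduces the asserted Authorization value:

import base64, hmac, hashlib

secret = 'uV3F3YluFJax1cknvbcGwgjvx4QpvB+leU8dUj2o'  # developer-guide example key
canonical = 'GET\n\n\nTue, 27 Mar 2007 19:36:42 +0000\n/johnsmith/photos/puppy.jpg'
signature = base64.b64encode(
    hmac.new(secret.encode(), canonical.encode(), hashlib.sha1).digest()).decode()
# 'AWS 0PN5J17HBGZHT7JJ3X82:%s' % signature matches the header asserted in
# test_request_signing1 above ('...:xXjDGYUmKxnwqr5KXNPGldn5LbA=').

With boto computing these signatures internally, both this helper module and its tests can be deleted.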