
Ported S3ImagesStore to use boto threads. This simplifies the code and makes
the following things no longer needed:

1. custom spider for S3 requests (ex. _S3AmazonAWSSpider)
2. scrapy.contrib.aws.AWSMiddleware
3. scrapy.utils.aws
This commit is contained in:
Pablo Hoffman 2010-05-26 10:29:32 -03:00
parent c8c19a8e53
commit 8c1feb7ae4
4 changed files with 28 additions and 271 deletions
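For context, the core idea of this port is Twisted's threads.deferToThread: a blocking boto call runs in the reactor's thread pool and the caller gets back a Deferred, so S3 traffic no longer has to be routed through the Scrapy downloader at all. A minimal sketch of that pattern, assuming boto is installed and the reactor is running (the helper name and arguments below are illustrative, not part of this commit):

# Sketch only: wrap a blocking boto S3 lookup in a worker thread and return a Deferred.
from twisted.internet import threads
from boto.s3.connection import S3Connection

def stat_s3_key(access_key, secret_key, bucket_name, key_name):
    def _blocking_stat():
        # plain blocking boto code -- fine here because it runs in a worker thread
        bucket = S3Connection(access_key, secret_key).get_bucket(bucket_name, validate=False)
        return bucket.get_key(key_name)
    # deferToThread returns a Deferred that fires with _blocking_stat()'s result
    return threads.deferToThread(_blocking_stat)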


@@ -1,24 +0,0 @@
"""
A downloader middleware for signing AWS requests just before they get into the
downloader. It is important to sign as close to the downloader as possible
because Amazon Web Service use timestamps for authentication.
"""
import os
from time import strftime, gmtime
from scrapy.utils.aws import sign_request
from scrapy.conf import settings
class AWSMiddleware(object):
def __init__(self):
self.access_key = settings['AWS_ACCESS_KEY_ID'] or \
os.environ.get('AWS_ACCESS_KEY_ID')
self.secret_key = settings['AWS_SECRET_ACCESS_KEY'] or \
os.environ.get('AWS_SECRET_ACCESS_KEY')
def process_request(self, request, spider):
if request.meta.get('sign_s3_request'):
request.headers['Date'] = strftime("%a, %d %b %Y %H:%M:%S GMT", gmtime())
sign_request(request, self.access_key, self.secret_key)
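Before this change, the middleware above had to be enabled explicitly and every S3 request flagged through request.meta so that process_request() would add the Date header and sign it. A rough sketch of that wiring, with an arbitrary middleware priority and an example URL (both illustrative, not taken from this commit):

# settings.py -- enable the (now removed) signing middleware
DOWNLOADER_MIDDLEWARES = {
    'scrapy.contrib.aws.AWSMiddleware': 590,  # priority value is illustrative
}

# each request to S3 carried the 'sign_s3_request' meta flag
from scrapy.http import Request
req = Request('http://mybucket.s3.amazonaws.com/images/full/abc123.jpg',
              method='HEAD', meta={'sign_s3_request': True})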


@@ -14,18 +14,15 @@ import Image
 from cStringIO import StringIO
 from collections import defaultdict
-from twisted.internet import defer
+from twisted.internet import defer, threads
 from scrapy.xlib.pydispatch import dispatcher
 from scrapy import log
 from scrapy.stats import stats
 from scrapy.utils.misc import md5sum
 from scrapy.core import signals
-from scrapy.core.engine import scrapyengine
 from scrapy.core.exceptions import DropItem, NotConfigured, IgnoreRequest
-from scrapy.spider import BaseSpider
 from scrapy.contrib.pipeline.media import MediaPipeline
-from scrapy.http import Request
 from scrapy.conf import settings
@@ -78,83 +75,47 @@ class FSImagesStore(object):
             seen.add(dirname)
 
-class _S3AmazonAWSSpider(BaseSpider):
-    """This spider is used for uploading images to Amazon S3
-
-    It is basically not a crawling spider like a normal spider is, this spider is
-    a placeholder that allows us to open a different slot in downloader and use it
-    for uploads to S3.
-
-    The use of another downloader slot for S3 images avoid the effect of normal
-    spider downloader slot to be affected by requests to a complete different
-    domain (s3.amazonaws.com).
-
-    It means that a spider that uses download_delay or alike is not going to be
-    delayed even more because it is uploading images to s3.
-    """
-    name = "s3.amazonaws.com"
-    start_urls = ['http://s3.amazonaws.com/']
-    max_concurrent_requests = 100
-
 class S3ImagesStore(object):
-    request_priority = 1000
+    AWS_ACCESS_KEY_ID = settings['AWS_ACCESS_KEY_ID']
+    AWS_SECRET_ACCESS_KEY = settings['AWS_SECRET_ACCESS_KEY']
 
     def __init__(self, uri):
         assert uri.startswith('s3://')
         self.bucket, self.prefix = uri[5:].split('/', 1)
-        self._set_custom_spider()
-
-    def _set_custom_spider(self):
-        use_custom_spider = bool(settings['IMAGES_S3STORE_SPIDER'])
-        if use_custom_spider:
-            self.s3_spider = _S3AmazonAWSSpider()
-        else:
-            self.s3_spider = None
 
     def stat_image(self, key, info):
-        def _onsuccess(response):
-            if response.status == 200:
-                checksum = response.headers['Etag'].strip('"')
-                last_modified = response.headers['Last-Modified']
-                modified_tuple = rfc822.parsedate_tz(last_modified)
-                modified_stamp = int(rfc822.mktime_tz(modified_tuple))
-                return {'checksum': checksum, 'last_modified': modified_stamp}
+        def _onsuccess(boto_key):
+            checksum = boto_key.etag.strip('"')
+            last_modified = boto_key.last_modified
+            modified_tuple = rfc822.parsedate_tz(last_modified)
+            modified_stamp = int(rfc822.mktime_tz(modified_tuple))
+            return {'checksum': checksum, 'last_modified': modified_stamp}
 
-        req = self._build_request(key, method='HEAD')
-        return self._download_request(req, info).addCallback(_onsuccess)
+        return self._get_boto_key(key).addCallback(_onsuccess)
+
+    def _get_boto_bucket(self):
+        from boto.s3.connection import S3Connection
+        c = S3Connection(self.AWS_ACCESS_KEY_ID, self.AWS_SECRET_ACCESS_KEY)
+        return c.get_bucket(self.bucket, validate=False)
+
+    def _get_boto_key(self, key):
+        b = self._get_boto_bucket()
+        key_name = '%s%s' % (self.prefix, key)
+        return threads.deferToThread(b.get_key, key_name)
 
     def persist_image(self, key, image, buf, info):
         """Upload image to S3 storage"""
         width, height = image.size
-        headers = {
-            'Content-Type': 'image/jpeg',
-            'X-Amz-Acl': 'public-read',
-            'X-Amz-Meta-Width': str(width),
-            'X-Amz-Meta-Height': str(height),
-            'Cache-Control': 'max-age=172800',
-        }
+        headers = {'Cache-Control': 'max-age=172800'}  # 2 days of cache
+        b = self._get_boto_bucket()
+        key_name = '%s%s' % (self.prefix, key)
+        k = b.new_key(key_name)
+        k.set_metadata('width', str(width))
+        k.set_metadata('height', str(height))
         buf.seek(0)
-        req = self._build_request(key, method='PUT', body=buf.read(), headers=headers)
-        return self._download_request(req, info)
-
-    def _build_request(self, key, method, body=None, headers=None):
-        url = 'http://%s.s3.amazonaws.com/%s%s' % (self.bucket, self.prefix, key)
-        return Request(url, method=method, body=body, headers=headers, \
-            meta={'sign_s3_request': True}, priority=self.request_priority)
-
-    def _download_request(self, request, info):
-        """This method is used for HEAD and PUT requests sent to amazon S3
-
-        It tries to use a specific spider domain for uploads, or defaults
-        to current domain spider.
-        """
-        if self.s3_spider:
-            # need to use schedule to auto-open domain
-            return scrapyengine.schedule(request, self.s3_spider)
-        return scrapyengine.download(request, info.spider)
+        return threads.deferToThread(k.set_contents_from_file, buf, headers, \
+            policy='public-read')
 
 class ImagesPipeline(MediaPipeline):
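As a usage note, both stat_image() and persist_image() now return Deferreds fired from boto worker threads rather than downloader responses. A rough sketch of how a caller might consume the new API (the s3:// URI, key name and callback are made up for illustration; in practice ImagesPipeline drives these calls):

# Illustrative only: the store parses the URI into bucket='mybucket', prefix='images/'
store = S3ImagesStore('s3://mybucket/images/')

def _print_stat(result):
    # result is the dict built by _onsuccess(): checksum plus last_modified stamp
    print result['checksum'], result['last_modified']

d = store.stat_image('full/abc123.jpg', info=None)  # info is no longer used by this store
d.addCallback(_print_stat)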


@@ -1,108 +0,0 @@
from unittest import TestCase, main

from scrapy.utils import aws
from scrapy.http import Request

# just some random keys. keys are provided by amazon developer guide at
# http://s3.amazonaws.com/awsdocs/S3/20060301/s3-dg-20060301.pdf
# and the tests described here are the examples from that manual
AWS_ACCESS_KEY_ID = '0PN5J17HBGZHT7JJ3X82'
AWS_SECRET_ACCESS_KEY = 'uV3F3YluFJax1cknvbcGwgjvx4QpvB+leU8dUj2o'


class ScrapyAWSTest(TestCase):

    def test_cannonical_string1(self):
        cs = aws.canonical_string('GET', '/johnsmith/photos/puppy.jpg', {
            'Host': 'johnsmith.s3.amazonaws.com',
            'Date': 'Tue, 27 Mar 2007 19:36:42 +0000',
            })
        self.assertEqual(cs, \
            '''GET\n\n\nTue, 27 Mar 2007 19:36:42 +0000\n/johnsmith/photos/puppy.jpg''')

    def test_cannonical_string2(self):
        cs = aws.canonical_string('PUT', '/johnsmith/photos/puppy.jpg', {
            'Content-Type': 'image/jpeg',
            'Host': 'johnsmith.s3.amazonaws.com',
            'Date': 'Tue, 27 Mar 2007 21:15:45 +0000',
            'Content-Length': '94328',
            })
        self.assertEqual(cs, \
            '''PUT\n\nimage/jpeg\nTue, 27 Mar 2007 21:15:45 +0000\n/johnsmith/photos/puppy.jpg''')

    def test_request_signing1(self):
        # gets an object from the johnsmith bucket.
        req = Request('http://johnsmith.s3.amazonaws.com/photos/puppy.jpg', headers={
            'Date': 'Tue, 27 Mar 2007 19:36:42 +0000',
            })
        aws.sign_request(req, AWS_ACCESS_KEY_ID, AWS_SECRET_ACCESS_KEY)
        self.assertEqual(req.headers['Authorization'], \
            'AWS 0PN5J17HBGZHT7JJ3X82:xXjDGYUmKxnwqr5KXNPGldn5LbA=')

    def test_request_signing2(self):
        # puts an object into the johnsmith bucket.
        req = Request('http://johnsmith.s3.amazonaws.com/photos/puppy.jpg', method='PUT', headers={
            'Content-Type': 'image/jpeg',
            'Date': 'Tue, 27 Mar 2007 21:15:45 +0000',
            'Content-Length': '94328',
            })
        aws.sign_request(req, AWS_ACCESS_KEY_ID, AWS_SECRET_ACCESS_KEY)
        self.assertEqual(req.headers['Authorization'], \
            'AWS 0PN5J17HBGZHT7JJ3X82:hcicpDDvL9SsO6AkvxqmIWkmOuQ=')

    def test_request_signing3(self):
        # lists the content of the johnsmith bucket.
        req = Request('http://johnsmith.s3.amazonaws.com/?prefix=photos&max-keys=50&marker=puppy', \
            method='GET', headers={
                'User-Agent': 'Mozilla/5.0',
                'Date': 'Tue, 27 Mar 2007 19:42:41 +0000',
                })
        aws.sign_request(req, AWS_ACCESS_KEY_ID, AWS_SECRET_ACCESS_KEY)
        self.assertEqual(req.headers['Authorization'], \
            'AWS 0PN5J17HBGZHT7JJ3X82:jsRt/rhG+Vtp88HrYL706QhE4w4=')

    def test_request_signing4(self):
        # fetches the access control policy sub-resource for the 'johnsmith' bucket.
        req = Request('http://johnsmith.s3.amazonaws.com/?acl', \
            method='GET', headers={
                'Date': 'Tue, 27 Mar 2007 19:44:46 +0000',
                })
        aws.sign_request(req, AWS_ACCESS_KEY_ID, AWS_SECRET_ACCESS_KEY)
        self.assertEqual(req.headers['Authorization'], \
            'AWS 0PN5J17HBGZHT7JJ3X82:thdUi9VAkzhkniLj96JIrOPGi0g=')

    def test_request_signing5(self):
        # deletes an object from the 'johnsmith' bucket using the path-style and Date alternative.
        req = Request('http://johnsmith.s3.amazonaws.com/photos/puppy.jpg', \
            method='DELETE', headers={
                'Date': 'Tue, 27 Mar 2007 21:20:27 +0000',
                'x-amz-date': 'Tue, 27 Mar 2007 21:20:26 +0000',
                })
        aws.sign_request(req, AWS_ACCESS_KEY_ID, AWS_SECRET_ACCESS_KEY)
        self.assertEqual(req.headers['Authorization'], \
            'AWS 0PN5J17HBGZHT7JJ3X82:k3nL7gH3+PadhTEVn5Ip83xlYzk=')

    def test_request_signing6(self):
        # uploads an object to a CNAME style virtual hosted bucket with metadata.
        req = Request('http://static.johnsmith.net:8080/db-backup.dat.gz', \
            method='PUT', headers={
                'User-Agent': 'curl/7.15.5',
                'Host': 'static.johnsmith.net:8080',
                'Date': 'Tue, 27 Mar 2007 21:06:08 +0000',
                'x-amz-acl': 'public-read',
                'content-type': 'application/x-download',
                'Content-MD5': '4gJE4saaMU4BqNR0kLY+lw==',
                'X-Amz-Meta-ReviewedBy': 'joe@johnsmith.net,jane@johnsmith.net',
                'X-Amz-Meta-FileChecksum': '0x02661779',
                'X-Amz-Meta-ChecksumAlgorithm': 'crc32',
                'Content-Disposition': 'attachment; filename=database.dat',
                'Content-Encoding': 'gzip',
                'Content-Length': '5913339',
                })
        aws.sign_request(req, AWS_ACCESS_KEY_ID, AWS_SECRET_ACCESS_KEY)
        self.assertEqual(req.headers['Authorization'], \
            'AWS 0PN5J17HBGZHT7JJ3X82:C0FlOtU8Ylb9KDTpZqYkZPX91iI=')


if __name__ == '__main__':
    main()


@@ -1,72 +0,0 @@
"""Helper function for working with Amazon Web Services"""
import re
import time
import hmac
import base64
import hashlib
from urlparse import urlsplit
AMAZON_HEADER_PREFIX = 'x-amz-'
# generates the aws canonical string for the given parameters
def canonical_string(method, path, headers, expires=None):
interesting_headers = {}
for key in headers:
lk = key.lower()
if lk in set(['content-md5', 'content-type', 'date']) \
or lk.startswith(AMAZON_HEADER_PREFIX):
interesting_headers[lk] = headers[key].strip()
# these keys get empty strings if they don't exist
interesting_headers.setdefault('content-type', '')
interesting_headers.setdefault('content-md5', '')
# just in case someone used this. it's not necessary in this lib.
if 'x-amz-date' in interesting_headers:
interesting_headers['date'] = ''
# if you're using expires for query string auth, then it trumps date
# (and x-amz-date)
if expires:
interesting_headers['date'] = str(expires)
sorted_header_keys = interesting_headers.keys()
sorted_header_keys.sort()
buf = "%s\n" % method
for key in sorted_header_keys:
if key.startswith(AMAZON_HEADER_PREFIX):
buf += "%s:%s\n" % (key, interesting_headers[key])
else:
buf += "%s\n" % interesting_headers[key]
# don't include anything after the first ? in the resource...
buf += "%s" % path.split('?')[0]
# ...unless there is an acl or torrent parameter
if re.search("[&?]acl($|=|&)", path):
buf += "?acl"
elif re.search("[&?]logging($|=|&)", path):
buf += "?logging"
elif re.search("[&?]torrent($|=|&)", path):
buf += "?torrent"
elif re.search("[&?]location($|=|&)", path):
buf += "?location"
return buf
def sign_request(req, accesskey, secretkey):
if 'Date' not in req.headers:
req.headers['Date'] = time.strftime("%a, %d %b %Y %H:%M:%S GMT", time.gmtime())
parsed = urlsplit(req.url)
bucket = parsed.hostname.replace('.s3.amazonaws.com','')
key = '%s?%s' % (parsed.path, parsed.query) if parsed.query else parsed.path
fqkey = '/%s%s' % (bucket, key)
c_string = canonical_string(req.method, fqkey, req.headers)
_hmac = hmac.new(secretkey, digestmod=hashlib.sha1)
_hmac.update(c_string)
b64_hmac = base64.encodestring(_hmac.digest()).strip()
req.headers['Authorization'] = "AWS %s:%s" % (accesskey, b64_hmac)
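Since scrapy.utils.aws is being dropped, here is a small standalone sketch (Python 2, like the code above) that reproduces the first AWS documentation example from the tests using the same HMAC-SHA1 plus Base64 scheme that canonical_string() and sign_request() implemented:

# Reproduce test_request_signing1 without scrapy.utils.aws
import hmac, hashlib, base64

secret = 'uV3F3YluFJax1cknvbcGwgjvx4QpvB+leU8dUj2o'
string_to_sign = ('GET\n\n\n'
                  'Tue, 27 Mar 2007 19:36:42 +0000\n'
                  '/johnsmith/photos/puppy.jpg')
signature = base64.encodestring(
    hmac.new(secret, string_to_sign, hashlib.sha1).digest()).strip()
assert signature == 'xXjDGYUmKxnwqr5KXNPGldn5LbA='
print "AWS 0PN5J17HBGZHT7JJ3X82:%s" % signature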