Mirror of https://github.com/scrapy/scrapy.git, synced 2025-02-24 06:24:00 +00:00
Ported S3ImagesStore to use boto threads. This simplifies the code and removes the need for:
1. the custom spider for S3 requests (e.g. _S3AmazonAWSSpider)
2. scrapy.contrib.aws.AWSMiddleware
3. scrapy.utils.aws
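The core of the change is to run blocking boto calls in Twisted's thread pool (twisted.internet.threads.deferToThread) instead of building signed HTTP requests and pushing them through Scrapy's downloader. A minimal sketch of that pattern, with made-up bucket/key names and credentials (this is not the shipped code):

from twisted.internet import threads
from boto.s3.connection import S3Connection

def get_key_deferred(bucket_name, key_name, access_key, secret_key):
    """Fetch a boto Key in a worker thread and return a Deferred."""
    def _blocking_get():
        # boto's API is blocking; keep all of it inside the thread.
        conn = S3Connection(access_key, secret_key)
        bucket = conn.get_bucket(bucket_name, validate=False)
        return bucket.get_key(key_name)
    return threads.deferToThread(_blocking_get)

# Usage (hypothetical values):
# d = get_key_deferred('my-bucket', 'images/full/example.jpg', AWS_KEY, AWS_SECRET)
# d.addCallback(lambda k: k.etag if k else None)

Because the boto work never touches the downloader, no downloader slot is tied up and no request signing middleware is required, which is exactly what lets the items below be dropped.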
parent c8c19a8e53
commit 8c1feb7ae4
@@ -1,24 +0,0 @@
"""
A downloader middleware for signing AWS requests just before they get into the
downloader. It is important to sign as close to the downloader as possible
because Amazon Web Services uses timestamps for authentication.
"""

import os

from time import strftime, gmtime

from scrapy.utils.aws import sign_request
from scrapy.conf import settings


class AWSMiddleware(object):

    def __init__(self):
        self.access_key = settings['AWS_ACCESS_KEY_ID'] or \
            os.environ.get('AWS_ACCESS_KEY_ID')
        self.secret_key = settings['AWS_SECRET_ACCESS_KEY'] or \
            os.environ.get('AWS_SECRET_ACCESS_KEY')

    def process_request(self, request, spider):
        if request.meta.get('sign_s3_request'):
            request.headers['Date'] = strftime("%a, %d %b %Y %H:%M:%S GMT", gmtime())
            sign_request(request, self.access_key, self.secret_key)
@@ -14,18 +14,15 @@ import Image
from cStringIO import StringIO
from collections import defaultdict

from twisted.internet import defer
from twisted.internet import defer, threads

from scrapy.xlib.pydispatch import dispatcher
from scrapy import log
from scrapy.stats import stats
from scrapy.utils.misc import md5sum
from scrapy.core import signals
from scrapy.core.engine import scrapyengine
from scrapy.core.exceptions import DropItem, NotConfigured, IgnoreRequest
from scrapy.spider import BaseSpider
from scrapy.contrib.pipeline.media import MediaPipeline
from scrapy.http import Request
from scrapy.conf import settings


@@ -78,83 +75,47 @@ class FSImagesStore(object):
            seen.add(dirname)


class _S3AmazonAWSSpider(BaseSpider):
    """This spider is used for uploading images to Amazon S3.

    It is not a crawling spider like a normal spider; it is a placeholder
    that allows us to open a separate downloader slot and use it for
    uploads to S3.

    Using a separate downloader slot for S3 images keeps the normal spider's
    slot from being affected by requests to a completely different domain
    (s3.amazonaws.com).

    This means that a spider that uses download_delay or similar is not
    delayed further just because it is uploading images to S3.
    """
    name = "s3.amazonaws.com"
    start_urls = ['http://s3.amazonaws.com/']
    max_concurrent_requests = 100


class S3ImagesStore(object):

    request_priority = 1000
    AWS_ACCESS_KEY_ID = settings['AWS_ACCESS_KEY_ID']
    AWS_SECRET_ACCESS_KEY = settings['AWS_SECRET_ACCESS_KEY']

    def __init__(self, uri):
        assert uri.startswith('s3://')
        self.bucket, self.prefix = uri[5:].split('/', 1)
        self._set_custom_spider()

    def _set_custom_spider(self):
        use_custom_spider = bool(settings['IMAGES_S3STORE_SPIDER'])
        if use_custom_spider:
            self.s3_spider = _S3AmazonAWSSpider()
        else:
            self.s3_spider = None

    def stat_image(self, key, info):
        def _onsuccess(response):
            if response.status == 200:
                checksum = response.headers['Etag'].strip('"')
                last_modified = response.headers['Last-Modified']
                modified_tuple = rfc822.parsedate_tz(last_modified)
                modified_stamp = int(rfc822.mktime_tz(modified_tuple))
                return {'checksum': checksum, 'last_modified': modified_stamp}
        def _onsuccess(boto_key):
            checksum = boto_key.etag.strip('"')
            last_modified = boto_key.last_modified
            modified_tuple = rfc822.parsedate_tz(last_modified)
            modified_stamp = int(rfc822.mktime_tz(modified_tuple))
            return {'checksum': checksum, 'last_modified': modified_stamp}

        req = self._build_request(key, method='HEAD')
        return self._download_request(req, info).addCallback(_onsuccess)
        return self._get_boto_key(key).addCallback(_onsuccess)

    def _get_boto_bucket(self):
        from boto.s3.connection import S3Connection
        c = S3Connection(self.AWS_ACCESS_KEY_ID, self.AWS_SECRET_ACCESS_KEY)
        return c.get_bucket(self.bucket, validate=False)

    def _get_boto_key(self, key):
        b = self._get_boto_bucket()
        key_name = '%s%s' % (self.prefix, key)
        return threads.deferToThread(b.get_key, key_name)

    def persist_image(self, key, image, buf, info):
        """Upload image to S3 storage"""
        width, height = image.size
        headers = {
            'Content-Type': 'image/jpeg',
            'X-Amz-Acl': 'public-read',
            'X-Amz-Meta-Width': str(width),
            'X-Amz-Meta-Height': str(height),
            'Cache-Control': 'max-age=172800',
        }

        headers = {'Cache-Control': 'max-age=172800'}  # 2 days of cache
        b = self._get_boto_bucket()
        key_name = '%s%s' % (self.prefix, key)
        k = b.new_key(key_name)
        k.set_metadata('width', str(width))
        k.set_metadata('height', str(height))
        buf.seek(0)
        req = self._build_request(key, method='PUT', body=buf.read(), headers=headers)
        return self._download_request(req, info)

    def _build_request(self, key, method, body=None, headers=None):
        url = 'http://%s.s3.amazonaws.com/%s%s' % (self.bucket, self.prefix, key)
        return Request(url, method=method, body=body, headers=headers, \
            meta={'sign_s3_request': True}, priority=self.request_priority)

    def _download_request(self, request, info):
        """This method is used for HEAD and PUT requests sent to Amazon S3.

        It tries to use a dedicated spider slot for uploads, and falls back
        to the current spider's slot.
        """
        if self.s3_spider:
            # need to use schedule to auto-open domain
            return scrapyengine.schedule(request, self.s3_spider)
        return scrapyengine.download(request, info.spider)
        return threads.deferToThread(k.set_contents_from_file, buf, headers, \
            policy='public-read')


class ImagesPipeline(MediaPipeline):

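Since the old and new lines are interleaved in the hunk above, here is the new persist_image upload path isolated into a standalone sketch, with hypothetical bucket/key/credential names (not part of this diff): create a boto key, attach width/height metadata, and upload the buffer in a worker thread with a public-read ACL.

from twisted.internet import threads
from boto.s3.connection import S3Connection

def upload_image(buf, width, height, bucket_name, key_name, access_key, secret_key):
    """Upload an in-memory JPEG buffer to S3 in a worker thread."""
    bucket = S3Connection(access_key, secret_key).get_bucket(bucket_name, validate=False)
    k = bucket.new_key(key_name)
    k.set_metadata('width', str(width))    # stored as x-amz-meta-width
    k.set_metadata('height', str(height))  # stored as x-amz-meta-height
    buf.seek(0)
    headers = {'Cache-Control': 'max-age=172800'}  # 2 days of cache
    # set_contents_from_file blocks, so defer it to the thread pool;
    # policy='public-read' takes the place of the old X-Amz-Acl header.
    return threads.deferToThread(k.set_contents_from_file, buf, headers,
                                 policy='public-read')

With the upload handled by boto directly, the removed request_priority, _build_request and _download_request machinery has nothing left to do, which is why the custom spider and signing middleware below can go as well.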
@@ -1,108 +0,0 @@
from unittest import TestCase, main

from scrapy.utils import aws
from scrapy.http import Request

# just some random keys. keys are provided by amazon developer guide at
# http://s3.amazonaws.com/awsdocs/S3/20060301/s3-dg-20060301.pdf
# and the tests described here are the examples from that manual

AWS_ACCESS_KEY_ID = '0PN5J17HBGZHT7JJ3X82'
AWS_SECRET_ACCESS_KEY = 'uV3F3YluFJax1cknvbcGwgjvx4QpvB+leU8dUj2o'


class ScrapyAWSTest(TestCase):
    def test_cannonical_string1(self):
        cs = aws.canonical_string('GET', '/johnsmith/photos/puppy.jpg', {
            'Host': 'johnsmith.s3.amazonaws.com',
            'Date': 'Tue, 27 Mar 2007 19:36:42 +0000',
            })
        self.assertEqual(cs, \
            '''GET\n\n\nTue, 27 Mar 2007 19:36:42 +0000\n/johnsmith/photos/puppy.jpg''')

    def test_cannonical_string2(self):
        cs = aws.canonical_string('PUT', '/johnsmith/photos/puppy.jpg', {
            'Content-Type': 'image/jpeg',
            'Host': 'johnsmith.s3.amazonaws.com',
            'Date': 'Tue, 27 Mar 2007 21:15:45 +0000',
            'Content-Length': '94328',
            })
        self.assertEqual(cs, \
            '''PUT\n\nimage/jpeg\nTue, 27 Mar 2007 21:15:45 +0000\n/johnsmith/photos/puppy.jpg''')

    def test_request_signing1(self):
        # gets an object from the johnsmith bucket.
        req = Request('http://johnsmith.s3.amazonaws.com/photos/puppy.jpg', headers={
            'Date': 'Tue, 27 Mar 2007 19:36:42 +0000',
            })
        aws.sign_request(req, AWS_ACCESS_KEY_ID, AWS_SECRET_ACCESS_KEY)
        self.assertEqual(req.headers['Authorization'], \
            'AWS 0PN5J17HBGZHT7JJ3X82:xXjDGYUmKxnwqr5KXNPGldn5LbA=')

    def test_request_signing2(self):
        # puts an object into the johnsmith bucket.
        req = Request('http://johnsmith.s3.amazonaws.com/photos/puppy.jpg', method='PUT', headers={
            'Content-Type': 'image/jpeg',
            'Date': 'Tue, 27 Mar 2007 21:15:45 +0000',
            'Content-Length': '94328',
            })
        aws.sign_request(req, AWS_ACCESS_KEY_ID, AWS_SECRET_ACCESS_KEY)
        self.assertEqual(req.headers['Authorization'], \
            'AWS 0PN5J17HBGZHT7JJ3X82:hcicpDDvL9SsO6AkvxqmIWkmOuQ=')

    def test_request_signing3(self):
        # lists the content of the johnsmith bucket.
        req = Request('http://johnsmith.s3.amazonaws.com/?prefix=photos&max-keys=50&marker=puppy', \
                method='GET', headers={
            'User-Agent': 'Mozilla/5.0',
            'Date': 'Tue, 27 Mar 2007 19:42:41 +0000',
            })
        aws.sign_request(req, AWS_ACCESS_KEY_ID, AWS_SECRET_ACCESS_KEY)
        self.assertEqual(req.headers['Authorization'], \
            'AWS 0PN5J17HBGZHT7JJ3X82:jsRt/rhG+Vtp88HrYL706QhE4w4=')

    def test_request_signing4(self):
        # fetches the access control policy sub-resource for the 'johnsmith' bucket.
        req = Request('http://johnsmith.s3.amazonaws.com/?acl', \
                method='GET', headers={
            'Date': 'Tue, 27 Mar 2007 19:44:46 +0000',
            })
        aws.sign_request(req, AWS_ACCESS_KEY_ID, AWS_SECRET_ACCESS_KEY)
        self.assertEqual(req.headers['Authorization'], \
            'AWS 0PN5J17HBGZHT7JJ3X82:thdUi9VAkzhkniLj96JIrOPGi0g=')

    def test_request_signing5(self):
        # deletes an object from the 'johnsmith' bucket using the path-style and Date alternative.
        req = Request('http://johnsmith.s3.amazonaws.com/photos/puppy.jpg', \
                method='DELETE', headers={
            'Date': 'Tue, 27 Mar 2007 21:20:27 +0000',
            'x-amz-date': 'Tue, 27 Mar 2007 21:20:26 +0000',
            })
        aws.sign_request(req, AWS_ACCESS_KEY_ID, AWS_SECRET_ACCESS_KEY)
        self.assertEqual(req.headers['Authorization'], \
            'AWS 0PN5J17HBGZHT7JJ3X82:k3nL7gH3+PadhTEVn5Ip83xlYzk=')

    def test_request_signing6(self):
        # uploads an object to a CNAME style virtual hosted bucket with metadata.
        req = Request('http://static.johnsmith.net:8080/db-backup.dat.gz', \
                method='PUT', headers={
            'User-Agent': 'curl/7.15.5',
            'Host': 'static.johnsmith.net:8080',
            'Date': 'Tue, 27 Mar 2007 21:06:08 +0000',
            'x-amz-acl': 'public-read',
            'content-type': 'application/x-download',
            'Content-MD5': '4gJE4saaMU4BqNR0kLY+lw==',
            'X-Amz-Meta-ReviewedBy': 'joe@johnsmith.net,jane@johnsmith.net',
            'X-Amz-Meta-FileChecksum': '0x02661779',
            'X-Amz-Meta-ChecksumAlgorithm': 'crc32',
            'Content-Disposition': 'attachment; filename=database.dat',
            'Content-Encoding': 'gzip',
            'Content-Length': '5913339',
            })
        aws.sign_request(req, AWS_ACCESS_KEY_ID, AWS_SECRET_ACCESS_KEY)
        self.assertEqual(req.headers['Authorization'], \
            'AWS 0PN5J17HBGZHT7JJ3X82:C0FlOtU8Ylb9KDTpZqYkZPX91iI=')


if __name__ == '__main__':
    main()
@@ -1,72 +0,0 @@
"""Helper functions for working with Amazon Web Services"""

import re
import time
import hmac
import base64
import hashlib
from urlparse import urlsplit

AMAZON_HEADER_PREFIX = 'x-amz-'

# generates the aws canonical string for the given parameters
def canonical_string(method, path, headers, expires=None):
    interesting_headers = {}
    for key in headers:
        lk = key.lower()
        if lk in set(['content-md5', 'content-type', 'date']) \
               or lk.startswith(AMAZON_HEADER_PREFIX):
            interesting_headers[lk] = headers[key].strip()

    # these keys get empty strings if they don't exist
    interesting_headers.setdefault('content-type', '')
    interesting_headers.setdefault('content-md5', '')

    # just in case someone used this. it's not necessary in this lib.
    if 'x-amz-date' in interesting_headers:
        interesting_headers['date'] = ''

    # if you're using expires for query string auth, then it trumps date
    # (and x-amz-date)
    if expires:
        interesting_headers['date'] = str(expires)

    sorted_header_keys = interesting_headers.keys()
    sorted_header_keys.sort()

    buf = "%s\n" % method
    for key in sorted_header_keys:
        if key.startswith(AMAZON_HEADER_PREFIX):
            buf += "%s:%s\n" % (key, interesting_headers[key])
        else:
            buf += "%s\n" % interesting_headers[key]

    # don't include anything after the first ? in the resource...
    buf += "%s" % path.split('?')[0]

    # ...unless there is an acl or torrent parameter
    if re.search("[&?]acl($|=|&)", path):
        buf += "?acl"
    elif re.search("[&?]logging($|=|&)", path):
        buf += "?logging"
    elif re.search("[&?]torrent($|=|&)", path):
        buf += "?torrent"
    elif re.search("[&?]location($|=|&)", path):
        buf += "?location"

    return buf

def sign_request(req, accesskey, secretkey):
    if 'Date' not in req.headers:
        req.headers['Date'] = time.strftime("%a, %d %b %Y %H:%M:%S GMT", time.gmtime())

    parsed = urlsplit(req.url)
    bucket = parsed.hostname.replace('.s3.amazonaws.com', '')
    key = '%s?%s' % (parsed.path, parsed.query) if parsed.query else parsed.path
    fqkey = '/%s%s' % (bucket, key)

    c_string = canonical_string(req.method, fqkey, req.headers)
    _hmac = hmac.new(secretkey, digestmod=hashlib.sha1)
    _hmac.update(c_string)
    b64_hmac = base64.encodestring(_hmac.digest()).strip()
    req.headers['Authorization'] = "AWS %s:%s" % (accesskey, b64_hmac)
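To make the removed signing logic concrete: for the first developer-guide example in the tests above (GET of photos/puppy.jpg), canonical_string produces exactly the string asserted in test_cannonical_string1, and sign_request HMAC-SHA1s it with the secret key and base64-encodes the digest. A standalone sketch in modern Python (plain hmac/base64, no Scrapy; values taken from the tests above) that reproduces the asserted Authorization value:

import base64, hmac, hashlib

secret = 'uV3F3YluFJax1cknvbcGwgjvx4QpvB+leU8dUj2o'  # developer-guide example key
canonical = 'GET\n\n\nTue, 27 Mar 2007 19:36:42 +0000\n/johnsmith/photos/puppy.jpg'
signature = base64.b64encode(
    hmac.new(secret.encode(), canonical.encode(), hashlib.sha1).digest()).decode()
# 'AWS 0PN5J17HBGZHT7JJ3X82:%s' % signature matches the header asserted in
# test_request_signing1 above ('...:xXjDGYUmKxnwqr5KXNPGldn5LbA=').

With boto computing these signatures internally, both this helper module and its tests can be deleted.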