Mirror of https://github.com/scrapy/scrapy.git (synced 2025-02-24 15:23:44 +00:00)
Ported S3ImagesStore to use boto threads. This simplifies the code and makes the following no longer needed:
1. the custom spider for S3 requests (_S3AmazonAWSSpider)
2. scrapy.contrib.aws.AWSMiddleware
3. scrapy.utils.aws
parent c8c19a8e53
commit 8c1feb7ae4
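The core of the port is to run blocking boto calls inside Twisted's thread pool with threads.deferToThread, so S3 traffic no longer has to be routed through Scrapy's downloader at all. Below is a minimal sketch of that pattern; credentials, bucket and key names are hypothetical placeholders, not values from this commit, and it assumes boto and Twisted are installed.

# Sketch: run a blocking boto lookup in Twisted's thread pool via
# threads.deferToThread(), which returns a Deferred instead of blocking
# the reactor. Credentials, bucket and key names are placeholders.
from twisted.internet import reactor, threads
from boto.s3.connection import S3Connection

def head_key(access_key, secret_key, bucket_name, key_name):
    """Blocking boto code: connect, resolve the bucket, fetch key metadata."""
    conn = S3Connection(access_key, secret_key)
    bucket = conn.get_bucket(bucket_name, validate=False)
    return bucket.get_key(key_name)  # boto.s3.key.Key or None

def report(key):
    if key is None:
        print 'key not found'
    else:
        print 'etag=%s last_modified=%s' % (key.etag, key.last_modified)

def main():
    # deferToThread runs head_key() in a worker thread; the reactor thread
    # only ever sees the resulting Deferred and its callbacks.
    d = threads.deferToThread(head_key, 'ACCESS', 'SECRET',
                              'some-bucket', 'images/full/example.jpg')
    d.addCallback(report)
    d.addErrback(lambda failure: failure.printTraceback())
    d.addBoth(lambda _: reactor.stop())

reactor.callWhenRunning(main)
reactor.run()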
@@ -1,24 +0,0 @@
-"""
-A downloader middleware for signing AWS requests just before they get into the
-downloader. It is important to sign as close to the downloader as possible
-because Amazon Web Service use timestamps for authentication.
-"""
-
-import os
-from time import strftime, gmtime
-from scrapy.utils.aws import sign_request
-from scrapy.conf import settings
-
-
-class AWSMiddleware(object):
-
-    def __init__(self):
-        self.access_key = settings['AWS_ACCESS_KEY_ID'] or \
-            os.environ.get('AWS_ACCESS_KEY_ID')
-        self.secret_key = settings['AWS_SECRET_ACCESS_KEY'] or \
-            os.environ.get('AWS_SECRET_ACCESS_KEY')
-
-    def process_request(self, request, spider):
-        if request.meta.get('sign_s3_request'):
-            request.headers['Date'] = strftime("%a, %d %b %Y %H:%M:%S GMT", gmtime())
-            sign_request(request, self.access_key, self.secret_key)
@@ -14,18 +14,15 @@ import Image
 from cStringIO import StringIO
 from collections import defaultdict
 
-from twisted.internet import defer
+from twisted.internet import defer, threads
 
 from scrapy.xlib.pydispatch import dispatcher
 from scrapy import log
 from scrapy.stats import stats
 from scrapy.utils.misc import md5sum
 from scrapy.core import signals
-from scrapy.core.engine import scrapyengine
 from scrapy.core.exceptions import DropItem, NotConfigured, IgnoreRequest
-from scrapy.spider import BaseSpider
 from scrapy.contrib.pipeline.media import MediaPipeline
-from scrapy.http import Request
 from scrapy.conf import settings
 
 
@@ -78,83 +75,47 @@ class FSImagesStore(object):
             seen.add(dirname)
 
 
-class _S3AmazonAWSSpider(BaseSpider):
-    """This spider is used for uploading images to Amazon S3
-
-    It is basically not a crawling spider like a normal spider is, this spider is
-    a placeholder that allows us to open a different slot in downloader and use it
-    for uploads to S3.
-
-    The use of another downloader slot for S3 images avoid the effect of normal
-    spider downloader slot to be affected by requests to a complete different
-    domain (s3.amazonaws.com).
-
-    It means that a spider that uses download_delay or alike is not going to be
-    delayed even more because it is uploading images to s3.
-    """
-    name = "s3.amazonaws.com"
-    start_urls = ['http://s3.amazonaws.com/']
-    max_concurrent_requests = 100
-
-
 class S3ImagesStore(object):
 
-    request_priority = 1000
+    AWS_ACCESS_KEY_ID = settings['AWS_ACCESS_KEY_ID']
+    AWS_SECRET_ACCESS_KEY = settings['AWS_SECRET_ACCESS_KEY']
 
     def __init__(self, uri):
         assert uri.startswith('s3://')
         self.bucket, self.prefix = uri[5:].split('/', 1)
-        self._set_custom_spider()
-
-    def _set_custom_spider(self):
-        use_custom_spider = bool(settings['IMAGES_S3STORE_SPIDER'])
-        if use_custom_spider:
-            self.s3_spider = _S3AmazonAWSSpider()
-        else:
-            self.s3_spider = None
 
     def stat_image(self, key, info):
-        def _onsuccess(response):
-            if response.status == 200:
-                checksum = response.headers['Etag'].strip('"')
-                last_modified = response.headers['Last-Modified']
-                modified_tuple = rfc822.parsedate_tz(last_modified)
-                modified_stamp = int(rfc822.mktime_tz(modified_tuple))
-                return {'checksum': checksum, 'last_modified': modified_stamp}
+        def _onsuccess(boto_key):
+            checksum = boto_key.etag.strip('"')
+            last_modified = boto_key.last_modified
+            modified_tuple = rfc822.parsedate_tz(last_modified)
+            modified_stamp = int(rfc822.mktime_tz(modified_tuple))
+            return {'checksum': checksum, 'last_modified': modified_stamp}
 
-        req = self._build_request(key, method='HEAD')
-        return self._download_request(req, info).addCallback(_onsuccess)
+        return self._get_boto_key(key).addCallback(_onsuccess)
+
+    def _get_boto_bucket(self):
+        from boto.s3.connection import S3Connection
+        c = S3Connection(self.AWS_ACCESS_KEY_ID, self.AWS_SECRET_ACCESS_KEY)
+        return c.get_bucket(self.bucket, validate=False)
+
+    def _get_boto_key(self, key):
+        b = self._get_boto_bucket()
+        key_name = '%s%s' % (self.prefix, key)
+        return threads.deferToThread(b.get_key, key_name)
 
     def persist_image(self, key, image, buf, info):
         """Upload image to S3 storage"""
         width, height = image.size
-        headers = {
-            'Content-Type': 'image/jpeg',
-            'X-Amz-Acl': 'public-read',
-            'X-Amz-Meta-Width': str(width),
-            'X-Amz-Meta-Height': str(height),
-            'Cache-Control': 'max-age=172800',
-        }
+        headers = {'Cache-Control': 'max-age=172800'} # 2 days of cache
+        b = self._get_boto_bucket()
+        key_name = '%s%s' % (self.prefix, key)
+        k = b.new_key(key_name)
+        k.set_metadata('width', str(width))
+        k.set_metadata('height', str(height))
 
         buf.seek(0)
-        req = self._build_request(key, method='PUT', body=buf.read(), headers=headers)
-        return self._download_request(req, info)
-
-    def _build_request(self, key, method, body=None, headers=None):
-        url = 'http://%s.s3.amazonaws.com/%s%s' % (self.bucket, self.prefix, key)
-        return Request(url, method=method, body=body, headers=headers, \
-            meta={'sign_s3_request': True}, priority=self.request_priority)
-
-    def _download_request(self, request, info):
-        """This method is used for HEAD and PUT requests sent to amazon S3
-
-        It tries to use a specific spider domain for uploads, or defaults
-        to current domain spider.
-        """
-        if self.s3_spider:
-            # need to use schedule to auto-open domain
-            return scrapyengine.schedule(request, self.s3_spider)
-        return scrapyengine.download(request, info.spider)
+        return threads.deferToThread(k.set_contents_from_file, buf, headers, \
+            policy='public-read')
 
 
 class ImagesPipeline(MediaPipeline):
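For context, here is a hedged usage sketch of the rewritten store: persist_image() and stat_image() now return the Deferreds produced by threads.deferToThread, so callers chain callbacks instead of waiting on downloader responses. This is not code from the commit; it assumes the class lives in scrapy.contrib.pipeline.images (as in Scrapy at the time), that AWS credentials are configured in the Scrapy settings, and that the bucket name and key are placeholders.

# Usage sketch (assumptions noted above): upload one image through the
# boto-backed persist_image(), then read its metadata back via stat_image().
from cStringIO import StringIO

import Image  # PIL, imported the same way as in this module
from twisted.internet import reactor

from scrapy.contrib.pipeline.images import S3ImagesStore  # assumed module path

store = S3ImagesStore('s3://example-bucket/images/')  # bucket='example-bucket', prefix='images/'

def done(stat):
    # stat is the dict built by _onsuccess(): checksum (ETag) + last_modified
    print 'uploaded:', stat

def main():
    img = Image.new('RGB', (100, 80))
    buf = StringIO()
    img.save(buf, 'JPEG')
    d = store.persist_image('full/0a1b2c3d.jpg', img, buf, info=None)  # info is unused here
    d.addCallback(lambda _: store.stat_image('full/0a1b2c3d.jpg', info=None))
    d.addCallback(done)
    d.addErrback(lambda failure: failure.printTraceback())  # e.g. missing credentials
    d.addBoth(lambda _: reactor.stop())

reactor.callWhenRunning(main)
reactor.run()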
@@ -1,108 +0,0 @@
-from unittest import TestCase, main
-
-from scrapy.utils import aws
-from scrapy.http import Request
-
-# just some random keys. keys are provided by amazon developer guide at
-# http://s3.amazonaws.com/awsdocs/S3/20060301/s3-dg-20060301.pdf
-# and the tests described here are the examples from that manual
-
-AWS_ACCESS_KEY_ID = '0PN5J17HBGZHT7JJ3X82'
-AWS_SECRET_ACCESS_KEY = 'uV3F3YluFJax1cknvbcGwgjvx4QpvB+leU8dUj2o'
-
-
-class ScrapyAWSTest(TestCase):
-    def test_cannonical_string1(self):
-        cs = aws.canonical_string('GET', '/johnsmith/photos/puppy.jpg', {
-            'Host': 'johnsmith.s3.amazonaws.com',
-            'Date': 'Tue, 27 Mar 2007 19:36:42 +0000',
-            })
-        self.assertEqual(cs, \
-            '''GET\n\n\nTue, 27 Mar 2007 19:36:42 +0000\n/johnsmith/photos/puppy.jpg''')
-
-    def test_cannonical_string2(self):
-        cs = aws.canonical_string('PUT', '/johnsmith/photos/puppy.jpg', {
-            'Content-Type': 'image/jpeg',
-            'Host': 'johnsmith.s3.amazonaws.com',
-            'Date': 'Tue, 27 Mar 2007 21:15:45 +0000',
-            'Content-Length': '94328',
-            })
-        self.assertEqual(cs, \
-            '''PUT\n\nimage/jpeg\nTue, 27 Mar 2007 21:15:45 +0000\n/johnsmith/photos/puppy.jpg''')
-
-    def test_request_signing1(self):
-        # gets an object from the johnsmith bucket.
-        req = Request('http://johnsmith.s3.amazonaws.com/photos/puppy.jpg', headers={
-            'Date': 'Tue, 27 Mar 2007 19:36:42 +0000',
-            })
-        aws.sign_request(req, AWS_ACCESS_KEY_ID, AWS_SECRET_ACCESS_KEY)
-        self.assertEqual(req.headers['Authorization'], \
-            'AWS 0PN5J17HBGZHT7JJ3X82:xXjDGYUmKxnwqr5KXNPGldn5LbA=')
-
-    def test_request_signing2(self):
-        # puts an object into the johnsmith bucket.
-        req = Request('http://johnsmith.s3.amazonaws.com/photos/puppy.jpg', method='PUT', headers={
-            'Content-Type': 'image/jpeg',
-            'Date': 'Tue, 27 Mar 2007 21:15:45 +0000',
-            'Content-Length': '94328',
-            })
-        aws.sign_request(req, AWS_ACCESS_KEY_ID, AWS_SECRET_ACCESS_KEY)
-        self.assertEqual(req.headers['Authorization'], \
-            'AWS 0PN5J17HBGZHT7JJ3X82:hcicpDDvL9SsO6AkvxqmIWkmOuQ=')
-
-    def test_request_signing3(self):
-        # lists the content of the johnsmith bucket.
-        req = Request('http://johnsmith.s3.amazonaws.com/?prefix=photos&max-keys=50&marker=puppy', \
-                method='GET', headers={
-            'User-Agent': 'Mozilla/5.0',
-            'Date': 'Tue, 27 Mar 2007 19:42:41 +0000',
-            })
-        aws.sign_request(req, AWS_ACCESS_KEY_ID, AWS_SECRET_ACCESS_KEY)
-        self.assertEqual(req.headers['Authorization'], \
-            'AWS 0PN5J17HBGZHT7JJ3X82:jsRt/rhG+Vtp88HrYL706QhE4w4=')
-
-    def test_request_signing4(self):
-        # fetches the access control policy sub-resource for the 'johnsmith' bucket.
-        req = Request('http://johnsmith.s3.amazonaws.com/?acl', \
-                method='GET', headers={
-            'Date': 'Tue, 27 Mar 2007 19:44:46 +0000',
-            })
-        aws.sign_request(req, AWS_ACCESS_KEY_ID, AWS_SECRET_ACCESS_KEY)
-        self.assertEqual(req.headers['Authorization'], \
-            'AWS 0PN5J17HBGZHT7JJ3X82:thdUi9VAkzhkniLj96JIrOPGi0g=')
-
-    def test_request_signing5(self):
-        # deletes an object from the 'johnsmith' bucket using the path-style and Date alternative.
-        req = Request('http://johnsmith.s3.amazonaws.com/photos/puppy.jpg', \
-                method='DELETE', headers={
-            'Date': 'Tue, 27 Mar 2007 21:20:27 +0000',
-            'x-amz-date': 'Tue, 27 Mar 2007 21:20:26 +0000',
-            })
-        aws.sign_request(req, AWS_ACCESS_KEY_ID, AWS_SECRET_ACCESS_KEY)
-        self.assertEqual(req.headers['Authorization'], \
-            'AWS 0PN5J17HBGZHT7JJ3X82:k3nL7gH3+PadhTEVn5Ip83xlYzk=')
-
-    def test_request_signing6(self):
-        # uploads an object to a CNAME style virtual hosted bucket with metadata.
-        req = Request('http://static.johnsmith.net:8080/db-backup.dat.gz', \
-                method='PUT', headers={
-            'User-Agent': 'curl/7.15.5',
-            'Host': 'static.johnsmith.net:8080',
-            'Date': 'Tue, 27 Mar 2007 21:06:08 +0000',
-            'x-amz-acl': 'public-read',
-            'content-type': 'application/x-download',
-            'Content-MD5': '4gJE4saaMU4BqNR0kLY+lw==',
-            'X-Amz-Meta-ReviewedBy': 'joe@johnsmith.net,jane@johnsmith.net',
-            'X-Amz-Meta-FileChecksum': '0x02661779',
-            'X-Amz-Meta-ChecksumAlgorithm': 'crc32',
-            'Content-Disposition': 'attachment; filename=database.dat',
-            'Content-Encoding': 'gzip',
-            'Content-Length': '5913339',
-            })
-        aws.sign_request(req, AWS_ACCESS_KEY_ID, AWS_SECRET_ACCESS_KEY)
-        self.assertEqual(req.headers['Authorization'], \
-            'AWS 0PN5J17HBGZHT7JJ3X82:C0FlOtU8Ylb9KDTpZqYkZPX91iI=')
-
-
-if __name__ == '__main__':
-    main()
@@ -1,72 +0,0 @@
-"""Helper function for working with Amazon Web Services"""
-
-import re
-import time
-import hmac
-import base64
-import hashlib
-from urlparse import urlsplit
-
-AMAZON_HEADER_PREFIX = 'x-amz-'
-
-# generates the aws canonical string for the given parameters
-def canonical_string(method, path, headers, expires=None):
-    interesting_headers = {}
-    for key in headers:
-        lk = key.lower()
-        if lk in set(['content-md5', 'content-type', 'date']) \
-                or lk.startswith(AMAZON_HEADER_PREFIX):
-            interesting_headers[lk] = headers[key].strip()
-
-    # these keys get empty strings if they don't exist
-    interesting_headers.setdefault('content-type', '')
-    interesting_headers.setdefault('content-md5', '')
-
-    # just in case someone used this. it's not necessary in this lib.
-    if 'x-amz-date' in interesting_headers:
-        interesting_headers['date'] = ''
-
-    # if you're using expires for query string auth, then it trumps date
-    # (and x-amz-date)
-    if expires:
-        interesting_headers['date'] = str(expires)
-
-    sorted_header_keys = interesting_headers.keys()
-    sorted_header_keys.sort()
-
-    buf = "%s\n" % method
-    for key in sorted_header_keys:
-        if key.startswith(AMAZON_HEADER_PREFIX):
-            buf += "%s:%s\n" % (key, interesting_headers[key])
-        else:
-            buf += "%s\n" % interesting_headers[key]
-
-    # don't include anything after the first ? in the resource...
-    buf += "%s" % path.split('?')[0]
-
-    # ...unless there is an acl or torrent parameter
-    if re.search("[&?]acl($|=|&)", path):
-        buf += "?acl"
-    elif re.search("[&?]logging($|=|&)", path):
-        buf += "?logging"
-    elif re.search("[&?]torrent($|=|&)", path):
-        buf += "?torrent"
-    elif re.search("[&?]location($|=|&)", path):
-        buf += "?location"
-
-    return buf
-
-def sign_request(req, accesskey, secretkey):
-    if 'Date' not in req.headers:
-        req.headers['Date'] = time.strftime("%a, %d %b %Y %H:%M:%S GMT", time.gmtime())
-
-    parsed = urlsplit(req.url)
-    bucket = parsed.hostname.replace('.s3.amazonaws.com','')
-    key = '%s?%s' % (parsed.path, parsed.query) if parsed.query else parsed.path
-    fqkey = '/%s%s' % (bucket, key)
-
-    c_string = canonical_string(req.method, fqkey, req.headers)
-    _hmac = hmac.new(secretkey, digestmod=hashlib.sha1)
-    _hmac.update(c_string)
-    b64_hmac = base64.encodestring(_hmac.digest()).strip()
-    req.headers['Authorization'] = "AWS %s:%s" % (accesskey, b64_hmac)
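For reference, the deleted helper implemented the original S3 REST authentication scheme by hand: an HMAC-SHA1 over a canonical string, base64-encoded into the Authorization header, which boto now handles internally. Below is a standalone sketch reproducing the first example from the deleted test file, using only the standard library and the documentation keys and expected signature shown above.

# Sketch: the HMAC-SHA1 signing that scrapy.utils.aws.sign_request() performed,
# reproduced by hand for the GET example from the deleted tests. The keys and
# the expected Authorization value come from the old Amazon S3 developer guide.
import hmac
import base64
import hashlib

ACCESS_KEY = '0PN5J17HBGZHT7JJ3X82'
SECRET_KEY = 'uV3F3YluFJax1cknvbcGwgjvx4QpvB+leU8dUj2o'

# canonical_string('GET', '/johnsmith/photos/puppy.jpg', {'Date': ...}) produces:
# method, content-md5 (empty), content-type (empty), date, then the resource.
canonical = 'GET\n\n\nTue, 27 Mar 2007 19:36:42 +0000\n/johnsmith/photos/puppy.jpg'

signature = base64.encodestring(hmac.new(SECRET_KEY, canonical, hashlib.sha1).digest()).strip()
authorization = 'AWS %s:%s' % (ACCESS_KEY, signature)

assert authorization == 'AWS 0PN5J17HBGZHT7JJ3X82:xXjDGYUmKxnwqr5KXNPGldn5LbA='
print authorization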