removed SimpledbStatsCollector from scrapy code, it was moved to https://github.com/scrapinghub/scaws
parent b6b0a54d9f
commit 0e008268e1
@@ -227,65 +227,6 @@ DummyStatsCollector

Stats Collector used when stats are disabled (through the
:setting:`STATS_ENABLED` setting).

SimpledbStatsCollector
----------------------

.. module:: scrapy.contrib.statscol
   :synopsis: Additional Stats Collectors

.. class:: SimpledbStatsCollector

    A Stats collector which persists stats to `Amazon SimpleDB`_, using one
    SimpleDB item per scraping run (i.e. it keeps a history of all scraping
    runs). The data is persisted to the SimpleDB domain specified by the
    :setting:`STATS_SDB_DOMAIN` setting. The domain will be created if it
    doesn't exist.

    In addition to the existing stats keys, the following keys are added at
    persistence time:

    * ``spider``: the spider name (so you can use it later for querying stats
      for that spider)
    * ``timestamp``: the timestamp when the stats were persisted

    Both ``spider`` and ``timestamp`` are used to generate the SimpleDB item
    name, in order to avoid overwriting stats of previous scraping runs.

    As `required by SimpleDB`_, datetimes are stored in ISO 8601 format and
    numbers are zero-padded to 16 digits. Negative numbers are not currently
    supported.

    This Stats Collector requires the `boto`_ library.

.. _Amazon SimpleDB: http://aws.amazon.com/simpledb/
.. _required by SimpleDB: http://docs.amazonwebservices.com/AmazonSimpleDB/2009-04-15/DeveloperGuide/ZeroPadding.html
.. _boto: http://code.google.com/p/boto/

This Stats Collector can be configured through the following settings:

.. setting:: STATS_SDB_DOMAIN

STATS_SDB_DOMAIN
~~~~~~~~~~~~~~~~

Default: ``'scrapy_stats'``

A string containing the SimpleDB domain to use in the
:class:`SimpledbStatsCollector`.

.. setting:: STATS_SDB_ASYNC

STATS_SDB_ASYNC
~~~~~~~~~~~~~~~

Default: ``False``

If ``True``, communication with SimpleDB will be performed asynchronously. If
``False``, blocking IO will be used instead. This is the default, as using
asynchronous communication can result in the stats not being persisted if the
Scrapy engine is shut down in the middle (for example, when you run only one
spider in a process and then exit).

Stats signals
=============
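For reference, a minimal sketch of how the collector documented above would typically have been enabled in a project's ``settings.py`` before this removal. The class path is assumed from the ``scrapy.contrib.statscol`` module directive above, and the credential values are placeholders::

    # Project settings (sketch): point STATS_CLASS at the SimpleDB collector and
    # configure the settings documented above. AWS_ACCESS_KEY_ID and
    # AWS_SECRET_ACCESS_KEY are read by the collector's __init__ (see the
    # removed module below).
    STATS_CLASS = 'scrapy.contrib.statscol.SimpledbStatsCollector'  # assumed class path
    STATS_SDB_DOMAIN = 'scrapy_stats'   # SimpleDB domain; created if it doesn't exist
    STATS_SDB_ASYNC = False             # blocking IO, so stats survive engine shutdown
    AWS_ACCESS_KEY_ID = '<your access key>'
    AWS_SECRET_ACCESS_KEY = '<your secret key>'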
@@ -1,57 +0,0 @@
"""
A Stats collector for persisting stats to Amazon SimpleDB.

Requires the boto library: http://code.google.com/p/boto/
"""

from datetime import datetime

from twisted.internet import threads

from scrapy.utils.simpledb import to_sdb_value
from scrapy.statscol import StatsCollector
from scrapy import log
from scrapy.conf import settings


class SimpledbStatsCollector(StatsCollector):

    def __init__(self):
        super(SimpledbStatsCollector, self).__init__()
        self._sdbdomain = settings['STATS_SDB_DOMAIN']
        self._access_key = settings['AWS_ACCESS_KEY_ID']
        self._secret_key = settings['AWS_SECRET_ACCESS_KEY']

        self._async = settings.getbool('STATS_SDB_ASYNC')
        import boto
        self.connect_sdb = boto.connect_sdb
        self.connect_sdb(aws_access_key_id=self._access_key, aws_secret_access_key=self._secret_key).create_domain(self._sdbdomain)

    def _persist_stats(self, stats, spider=None):
        if spider is None: # only store spider-specific stats
            return
        if not self._sdbdomain:
            return
        if self._async:
            dfd = threads.deferToThread(self._persist_to_sdb, spider, stats.copy())
            dfd.addErrback(log.err, 'Error uploading stats to SimpleDB', \
                spider=spider)
        else:
            self._persist_to_sdb(spider, stats)

    def _persist_to_sdb(self, spider, stats):
        ts = self._get_timestamp(spider).isoformat()
        sdb_item_id = "%s_%s" % (spider.name, ts)
        sdb_item = dict((k, self._to_sdb_value(v, k)) for k, v in stats.iteritems())
        sdb_item['spider'] = spider.name
        sdb_item['timestamp'] = self._to_sdb_value(ts)
        self.connect_sdb(aws_access_key_id=self._access_key, aws_secret_access_key=self._secret_key).put_attributes(self._sdbdomain, sdb_item_id, sdb_item)

    def _get_timestamp(self, spider):
        return datetime.utcnow()

    def _to_sdb_value(self, obj, key=None):
        try:
            return to_sdb_value(obj)
        except TypeError:
            raise TypeError("%s unsupported type %r used in key %r" % \
                (type(self).__name__, type(obj).__name__, key))
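As the removed documentation notes, the ``spider`` and ``timestamp`` attributes make each persisted run queryable later. A hedged sketch of what reading those stats back might have looked like with boto; ``get_domain()``, ``select()`` and the query string are assumptions about the boto SimpleDB API of that era, not part of this commit::

    # Sketch only: query the persisted runs of one spider out of the SimpleDB domain.
    import boto

    conn = boto.connect_sdb(aws_access_key_id='<key>', aws_secret_access_key='<secret>')
    domain = conn.get_domain('scrapy_stats')   # the STATS_SDB_DOMAIN default

    # Each item is one scraping run; its name is "<spider>_<ISO timestamp>" as
    # built by _persist_to_sdb() above.
    for item in domain.select("select * from `scrapy_stats` where spider = 'myspider'"):
        print item.name, item.get('timestamp')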
@@ -247,9 +247,6 @@ STATS_CLASS = 'scrapy.statscol.MemoryStatsCollector'
STATS_ENABLED = True
STATS_DUMP = True

STATS_SDB_DOMAIN = 'scrapy_stats'
STATS_SDB_ASYNC = False

STATSMAILER_RCPTS = []

TEMPLATES_DIR = abspath(join(dirname(__file__), '..', 'templates'))
@@ -1,21 +0,0 @@
import unittest
from datetime import datetime

from scrapy.utils.simpledb import to_sdb_value


class SimpleddbUtilsTest(unittest.TestCase):

    def test_to_sdb_value(self):
        self.assertEqual(to_sdb_value(123), u'0000000000000123')
        self.assertEqual(to_sdb_value(123L), u'0000000000000123')
        self.assertEqual(to_sdb_value(True), u'1')
        self.assertEqual(to_sdb_value(False), u'0')
        self.assertEqual(to_sdb_value(None), u'')
        self.assertEqual(to_sdb_value(datetime(2009, 01, 01, 10, 10, 10)), \
            u'2009-01-01T10:10:10')
        self.assertEqual(to_sdb_value('test'), 'test')
        self.assertEqual(to_sdb_value(u'test'), u'test')
        self.assertRaises(TypeError, to_sdb_value, object())


if __name__ == "__main__":
    unittest.main()
@@ -1,19 +0,0 @@
"""Helper functions for Amazon SimpleDB"""

from datetime import datetime


def to_sdb_value(obj):
    """Convert the given object to proper value to store in Amazon SimpleDB"""
    if isinstance(obj, bool):
        return u'%d' % obj
    elif isinstance(obj, (int, long)):
        return "%016d" % obj
    elif isinstance(obj, datetime):
        return obj.isoformat()
    elif isinstance(obj, basestring):
        return obj
    elif obj is None:
        return u''
    else:
        raise TypeError("Unsupported Type: %s" % type(obj).__name__)
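A short note on the zero-padding in ``to_sdb_value()``: as the `required by SimpleDB`_ link in the removed docs explains, SimpleDB compares attribute values as strings, so fixed-width padding is what makes numeric comparisons and range queries behave. A quick illustration (plain Python, just to show the string-ordering effect)::

    >>> '9' > '123'                    # lexicographic comparison gives the "wrong" answer
    True
    >>> '%016d' % 9 > '%016d' % 123    # zero-padded values compare in numeric order
    False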