mirror of https://github.com/scrapy/scrapy.git synced 2025-02-24 19:24:12 +00:00

removed SimpledbStatsCollector from scrapy code; it was moved to https://github.com/scrapinghub/scaws

Pablo Hoffman 2011-07-20 10:38:16 -03:00
parent b6b0a54d9f
commit 0e008268e1
5 changed files with 0 additions and 159 deletions


@@ -227,65 +227,6 @@ DummyStatsCollector
Stats Collector used when stats are disabled (through the
:setting:`STATS_ENABLED` setting).
SimpledbStatsCollector
----------------------

.. module:: scrapy.contrib.statscol
   :synopsis: Additional Stats Collectors

.. class:: SimpledbStatsCollector

    A Stats collector which persists stats to `Amazon SimpleDB`_, using one
    SimpleDB item per scraping run (i.e. it keeps a history of all scraping
    runs).

    The data is persisted to the SimpleDB domain specified by the
    :setting:`STATS_SDB_DOMAIN` setting. The domain will be created if it
    doesn't exist.

    In addition to the existing stats keys, the following keys are added at
    persistence time:

    * ``spider``: the spider name (so you can use it later for querying stats
      for that spider)
    * ``timestamp``: the timestamp when the stats were persisted

    Both the ``spider`` and ``timestamp`` are used to generate the SimpleDB
    item name, in order to avoid overwriting stats of previous scraping runs.

    As `required by SimpleDB`_, datetimes are stored in ISO 8601 format and
    numbers are zero-padded to 16 digits. Negative numbers are not currently
    supported.

    This Stats Collector requires the `boto`_ library.

.. _Amazon SimpleDB: http://aws.amazon.com/simpledb/
.. _required by SimpleDB: http://docs.amazonwebservices.com/AmazonSimpleDB/2009-04-15/DeveloperGuide/ZeroPadding.html
.. _boto: http://code.google.com/p/boto/

This Stats Collector can be configured through the following settings:

.. setting:: STATS_SDB_DOMAIN

STATS_SDB_DOMAIN
~~~~~~~~~~~~~~~~

Default: ``'scrapy_stats'``

A string containing the SimpleDB domain to use in the
:class:`SimpledbStatsCollector`.

.. setting:: STATS_SDB_ASYNC

STATS_SDB_ASYNC
~~~~~~~~~~~~~~~

Default: ``False``

If ``True``, communication with SimpleDB will be performed asynchronously. If
``False``, blocking IO will be used instead. The latter is the default because
asynchronous communication can result in the stats not being persisted if the
Scrapy engine is shut down mid-run (for example, when you run only one
spider in a process and then exit).
Stats signals
=============
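
For context, a minimal sketch of how a project could have enabled this
collector before the removal, assuming the setting names documented above
(the credential values are placeholders)::

    # settings.py -- hypothetical project configuration
    STATS_CLASS = 'scrapy.contrib.statscol.SimpledbStatsCollector'
    STATS_SDB_DOMAIN = 'scrapy_stats'   # SimpleDB domain; created if missing
    STATS_SDB_ASYNC = False             # blocking IO, safest on engine shutdown
    AWS_ACCESS_KEY_ID = 'YOUR-ACCESS-KEY'       # boto credentials (placeholder)
    AWS_SECRET_ACCESS_KEY = 'YOUR-SECRET-KEY'   # (placeholder)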


@@ -1,57 +0,0 @@
"""
A Stats collector for persisting stats to Amazon SimpleDB.
Requires the boto library: http://code.google.com/p/boto/
"""
from datetime import datetime
from twisted.internet import threads
from scrapy.utils.simpledb import to_sdb_value
from scrapy.statscol import StatsCollector
from scrapy import log
from scrapy.conf import settings
class SimpledbStatsCollector(StatsCollector):
def __init__(self):
super(SimpledbStatsCollector, self).__init__()
self._sdbdomain = settings['STATS_SDB_DOMAIN']
self._access_key = settings['AWS_ACCESS_KEY_ID']
self._secret_key = settings['AWS_SECRET_ACCESS_KEY']
self._async = settings.getbool('STATS_SDB_ASYNC')
import boto
self.connect_sdb = boto.connect_sdb
self.connect_sdb(aws_access_key_id=self._access_key, aws_secret_access_key=self._secret_key).create_domain(self._sdbdomain)
def _persist_stats(self, stats, spider=None):
if spider is None: # only store spider-specific stats
return
if not self._sdbdomain:
return
if self._async:
dfd = threads.deferToThread(self._persist_to_sdb, spider, stats.copy())
dfd.addErrback(log.err, 'Error uploading stats to SimpleDB', \
spider=spider)
else:
self._persist_to_sdb(spider, stats)
def _persist_to_sdb(self, spider, stats):
ts = self._get_timestamp(spider).isoformat()
sdb_item_id = "%s_%s" % (spider.name, ts)
sdb_item = dict((k, self._to_sdb_value(v, k)) for k, v in stats.iteritems())
sdb_item['spider'] = spider.name
sdb_item['timestamp'] = self._to_sdb_value(ts)
self.connect_sdb(aws_access_key_id=self._access_key, aws_secret_access_key=self._secret_key).put_attributes(self._sdbdomain, sdb_item_id, sdb_item)
def _get_timestamp(self, spider):
return datetime.utcnow()
def _to_sdb_value(self, obj, key=None):
try:
return to_sdb_value(obj)
except TypeError:
raise TypeError("%s unsupported type %r used in key %r" % \
(type(self).__name__, type(obj).__name__, key))
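
The item-naming scheme in ``_persist_to_sdb`` is what preserves history: the
spider name plus the run's UTC timestamp yields a distinct SimpleDB item per
run, so earlier runs are never overwritten. A standalone sketch of the naming
(the spider name and timestamp here are illustrative)::

    from datetime import datetime

    # Hypothetical values; the real ones come from spider.name and
    # SimpledbStatsCollector._get_timestamp()
    ts = datetime(2011, 7, 20, 13, 38, 16).isoformat()
    sdb_item_id = "%s_%s" % ("example", ts)
    print(sdb_item_id)  # example_2011-07-20T13:38:16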


@@ -247,9 +247,6 @@ STATS_CLASS = 'scrapy.statscol.MemoryStatsCollector'
STATS_ENABLED = True
STATS_DUMP = True
STATS_SDB_DOMAIN = 'scrapy_stats'
STATS_SDB_ASYNC = False
STATSMAILER_RCPTS = []
TEMPLATES_DIR = abspath(join(dirname(__file__), '..', 'templates'))


@@ -1,21 +0,0 @@
import unittest
from datetime import datetime

from scrapy.utils.simpledb import to_sdb_value


class SimpleddbUtilsTest(unittest.TestCase):

    def test_to_sdb_value(self):
        self.assertEqual(to_sdb_value(123), u'0000000000000123')
        self.assertEqual(to_sdb_value(123L), u'0000000000000123')
        self.assertEqual(to_sdb_value(True), u'1')
        self.assertEqual(to_sdb_value(False), u'0')
        self.assertEqual(to_sdb_value(None), u'')
        self.assertEqual(to_sdb_value(datetime(2009, 01, 01, 10, 10, 10)), \
            u'2009-01-01T10:10:10')
        self.assertEqual(to_sdb_value('test'), 'test')
        self.assertEqual(to_sdb_value(u'test'), u'test')
        self.assertRaises(TypeError, to_sdb_value, object())


if __name__ == "__main__":
    unittest.main()


@@ -1,19 +0,0 @@
"""Helper functions for Amazon SimpleDB"""
from datetime import datetime
def to_sdb_value(obj):
"""Convert the given object to proper value to store in Amazon SimpleDB"""
if isinstance(obj, bool):
return u'%d' % obj
elif isinstance(obj, (int, long)):
return "%016d" % obj
elif isinstance(obj, datetime):
return obj.isoformat()
elif isinstance(obj, basestring):
return obj
elif obj is None:
return u''
else:
raise TypeError("Unsupported Type: %s" % type(obj).__name__)
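
The 16-digit zero-padding in ``to_sdb_value`` exists because SimpleDB compares
attribute values lexicographically, as strings (see the zero-padding guide
linked in the docs above). Note also that the ``bool`` check must precede the
``int`` check, since ``bool`` is a subclass of ``int`` in Python. A quick
standalone illustration of why unpadded numbers would break ordered queries::

    # Unpadded numbers sort incorrectly as strings; zero-padded ones sort
    # in numeric order, which is what SimpleDB comparisons need.
    assert "9" > "10"                   # lexicographic comparison: wrong order
    assert "%016d" % 9 < "%016d" % 10   # zero-padded: correct order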