From 0e008268e1ca41817d5b8e26d0832924d66a652c Mon Sep 17 00:00:00 2001 From: Pablo Hoffman Date: Wed, 20 Jul 2011 10:38:16 -0300 Subject: [PATCH] removed SimpledbStatsCollector from scrapy code, it was moved to https://github.com/scrapinghub/scaws --- docs/topics/stats.rst | 59 ----------------------------- scrapy/contrib/statscol.py | 57 ---------------------------- scrapy/settings/default_settings.py | 3 -- scrapy/tests/test_utils_simpledb.py | 21 ---------- scrapy/utils/simpledb.py | 19 ---------- 5 files changed, 159 deletions(-) delete mode 100644 scrapy/contrib/statscol.py delete mode 100644 scrapy/tests/test_utils_simpledb.py delete mode 100644 scrapy/utils/simpledb.py diff --git a/docs/topics/stats.rst b/docs/topics/stats.rst index 69b08211f..f493a2df8 100644 --- a/docs/topics/stats.rst +++ b/docs/topics/stats.rst @@ -227,65 +227,6 @@ DummyStatsCollector Stats Collector used when stats are disabled (through the :setting:`STATS_ENABLED` setting). -SimpledbStatsCollector ----------------------- - -.. module:: scrapy.contrib.statscol - :synopsis: Additional Stats Collectors - -.. class:: SimpledbStatsCollector - - A Stats collector which persists stats to `Amazon SimpleDB`_, using one - SimpleDB item per scraping run (ie. it keeps history of all scraping runs). - The data is persisted to the SimpleDB domain specified by the - :setting:`STATS_SDB_DOMAIN` setting. The domain will be created if it - doesn't exist. - - In addition to the existing stats keys, the following keys are added at - persitance time: - - * ``spider``: the spider name (so you can use it later for querying stats - for that spider) - * ``timestamp``: the timestamp when the stats were persisted - - Both the ``spider`` and ``timestamp`` are used to generate the SimpleDB - item name in order to avoid overwriting stats of previous scraping runs. - - As `required by SimpleDB`_, datetimes are stored in ISO 8601 format and - numbers are zero-padded to 16 digits. Negative numbers are not currently - supported. - - This Stats Collector requires the `boto`_ library. - -.. _Amazon SimpleDB: http://aws.amazon.com/simpledb/ -.. _required by SimpleDB: http://docs.amazonwebservices.com/AmazonSimpleDB/2009-04-15/DeveloperGuide/ZeroPadding.html -.. _boto: http://code.google.com/p/boto/ - -This Stats Collector can be configured through the following settings: - -.. setting:: STATS_SDB_DOMAIN - -STATS_SDB_DOMAIN -~~~~~~~~~~~~~~~~ - -Default: ``'scrapy_stats'`` - -A string containing the SimpleDB domain to use in the -:class:`SimpledbStatsCollector`. - -.. setting:: STATS_SDB_ASYNC - -STATS_SDB_ASYNC -~~~~~~~~~~~~~~~ - -Default: ``False`` - -If ``True``, communication with SimpleDB will be performed asynchronously. If -``False`` blocking IO will be used instead. This is the default as using -asynchronous communication can result in the stats not being persisted if the -Scrapy engine is shut down in the middle (for example, when you run only one -spider in a process and then exit). - Stats signals ============= diff --git a/scrapy/contrib/statscol.py b/scrapy/contrib/statscol.py deleted file mode 100644 index ff52f0b41..000000000 --- a/scrapy/contrib/statscol.py +++ /dev/null @@ -1,57 +0,0 @@ -""" -A Stats collector for persisting stats to Amazon SimpleDB. - -Requires the boto library: http://code.google.com/p/boto/ -""" - -from datetime import datetime - -from twisted.internet import threads - -from scrapy.utils.simpledb import to_sdb_value -from scrapy.statscol import StatsCollector -from scrapy import log -from scrapy.conf import settings - -class SimpledbStatsCollector(StatsCollector): - - def __init__(self): - super(SimpledbStatsCollector, self).__init__() - self._sdbdomain = settings['STATS_SDB_DOMAIN'] - self._access_key = settings['AWS_ACCESS_KEY_ID'] - self._secret_key = settings['AWS_SECRET_ACCESS_KEY'] - - self._async = settings.getbool('STATS_SDB_ASYNC') - import boto - self.connect_sdb = boto.connect_sdb - self.connect_sdb(aws_access_key_id=self._access_key, aws_secret_access_key=self._secret_key).create_domain(self._sdbdomain) - - def _persist_stats(self, stats, spider=None): - if spider is None: # only store spider-specific stats - return - if not self._sdbdomain: - return - if self._async: - dfd = threads.deferToThread(self._persist_to_sdb, spider, stats.copy()) - dfd.addErrback(log.err, 'Error uploading stats to SimpleDB', \ - spider=spider) - else: - self._persist_to_sdb(spider, stats) - - def _persist_to_sdb(self, spider, stats): - ts = self._get_timestamp(spider).isoformat() - sdb_item_id = "%s_%s" % (spider.name, ts) - sdb_item = dict((k, self._to_sdb_value(v, k)) for k, v in stats.iteritems()) - sdb_item['spider'] = spider.name - sdb_item['timestamp'] = self._to_sdb_value(ts) - self.connect_sdb(aws_access_key_id=self._access_key, aws_secret_access_key=self._secret_key).put_attributes(self._sdbdomain, sdb_item_id, sdb_item) - - def _get_timestamp(self, spider): - return datetime.utcnow() - - def _to_sdb_value(self, obj, key=None): - try: - return to_sdb_value(obj) - except TypeError: - raise TypeError("%s unsupported type %r used in key %r" % \ - (type(self).__name__, type(obj).__name__, key)) diff --git a/scrapy/settings/default_settings.py b/scrapy/settings/default_settings.py index 9eb8eb03b..802a6dce8 100644 --- a/scrapy/settings/default_settings.py +++ b/scrapy/settings/default_settings.py @@ -247,9 +247,6 @@ STATS_CLASS = 'scrapy.statscol.MemoryStatsCollector' STATS_ENABLED = True STATS_DUMP = True -STATS_SDB_DOMAIN = 'scrapy_stats' -STATS_SDB_ASYNC = False - STATSMAILER_RCPTS = [] TEMPLATES_DIR = abspath(join(dirname(__file__), '..', 'templates')) diff --git a/scrapy/tests/test_utils_simpledb.py b/scrapy/tests/test_utils_simpledb.py deleted file mode 100644 index 6ed118b02..000000000 --- a/scrapy/tests/test_utils_simpledb.py +++ /dev/null @@ -1,21 +0,0 @@ -import unittest -from datetime import datetime - -from scrapy.utils.simpledb import to_sdb_value - -class SimpleddbUtilsTest(unittest.TestCase): - - def test_to_sdb_value(self): - self.assertEqual(to_sdb_value(123), u'0000000000000123') - self.assertEqual(to_sdb_value(123L), u'0000000000000123') - self.assertEqual(to_sdb_value(True), u'1') - self.assertEqual(to_sdb_value(False), u'0') - self.assertEqual(to_sdb_value(None), u'') - self.assertEqual(to_sdb_value(datetime(2009, 01, 01, 10, 10, 10)), \ - u'2009-01-01T10:10:10') - self.assertEqual(to_sdb_value('test'), 'test') - self.assertEqual(to_sdb_value(u'test'), u'test') - self.assertRaises(TypeError, to_sdb_value, object()) - -if __name__ == "__main__": - unittest.main() diff --git a/scrapy/utils/simpledb.py b/scrapy/utils/simpledb.py deleted file mode 100644 index ec8e3aedb..000000000 --- a/scrapy/utils/simpledb.py +++ /dev/null @@ -1,19 +0,0 @@ -"""Helper functions for Amazon SimpleDB""" - -from datetime import datetime - -def to_sdb_value(obj): - """Convert the given object to proper value to store in Amazon SimpleDB""" - if isinstance(obj, bool): - return u'%d' % obj - elif isinstance(obj, (int, long)): - return "%016d" % obj - elif isinstance(obj, datetime): - return obj.isoformat() - elif isinstance(obj, basestring): - return obj - elif obj is None: - return u'' - else: - raise TypeError("Unsupported Type: %s" % type(obj).__name__) -