mirror of https://github.com/scrapy/scrapy.git (synced 2025-02-24 19:24:12 +00:00)
removed SimpledbStatsCollector from scrapy code; it was moved to https://github.com/scrapinghub/scaws
This commit is contained in:
parent b6b0a54d9f
commit 0e008268e1
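The collector now lives in the external scaws package linked above. For anyone still using it, a minimal migration sketch follows; note that the scaws.statscol.SimpledbStatsCollector path is an assumption, not stated anywhere in this commit, so check the scaws README for the real path:

    # settings.py after this commit (hypothetical sketch; the scaws module
    # path below is assumed, not confirmed by this commit)
    STATS_CLASS = 'scaws.statscol.SimpledbStatsCollector'
    STATS_SDB_DOMAIN = 'scrapy_stats'
    STATS_SDB_ASYNC = False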
@@ -227,65 +227,6 @@ DummyStatsCollector
     Stats Collector used when stats are disabled (through the
     :setting:`STATS_ENABLED` setting).
 
-SimpledbStatsCollector
-----------------------
-
-.. module:: scrapy.contrib.statscol
-   :synopsis: Additional Stats Collectors
-
-.. class:: SimpledbStatsCollector
-
-    A Stats collector which persists stats to `Amazon SimpleDB`_, using one
-    SimpleDB item per scraping run (ie. it keeps history of all scraping runs).
-    The data is persisted to the SimpleDB domain specified by the
-    :setting:`STATS_SDB_DOMAIN` setting. The domain will be created if it
-    doesn't exist.
-
-    In addition to the existing stats keys, the following keys are added at
-    persistence time:
-
-    * ``spider``: the spider name (so you can use it later for querying stats
-      for that spider)
-    * ``timestamp``: the timestamp when the stats were persisted
-
-    Both the ``spider`` and ``timestamp`` are used to generate the SimpleDB
-    item name in order to avoid overwriting stats of previous scraping runs.
-
-    As `required by SimpleDB`_, datetimes are stored in ISO 8601 format and
-    numbers are zero-padded to 16 digits. Negative numbers are not currently
-    supported.
-
-    This Stats Collector requires the `boto`_ library.
-
-.. _Amazon SimpleDB: http://aws.amazon.com/simpledb/
-.. _required by SimpleDB: http://docs.amazonwebservices.com/AmazonSimpleDB/2009-04-15/DeveloperGuide/ZeroPadding.html
-.. _boto: http://code.google.com/p/boto/
-
-This Stats Collector can be configured through the following settings:
-
-.. setting:: STATS_SDB_DOMAIN
-
-STATS_SDB_DOMAIN
-~~~~~~~~~~~~~~~~
-
-Default: ``'scrapy_stats'``
-
-A string containing the SimpleDB domain to use in the
-:class:`SimpledbStatsCollector`.
-
-.. setting:: STATS_SDB_ASYNC
-
-STATS_SDB_ASYNC
-~~~~~~~~~~~~~~~
-
-Default: ``False``
-
-If ``True``, communication with SimpleDB will be performed asynchronously. If
-``False``, blocking IO will be used instead. This is the default, as using
-asynchronous communication can result in the stats not being persisted if the
-Scrapy engine is shut down in the middle (for example, when you run only one
-spider in a process and then exit).
-
 
 Stats signals
 =============
 
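Taken together, the removed docs describe SimpleDB items shaped roughly like the following. This is a sketch: the ``spider`` and ``timestamp`` keys and the item-name scheme come from the docs and code in this commit, while the stats key and values are invented for illustration.

    # One SimpleDB item per scraping run, per the removed docs above.
    # The item name joins spider name and ISO 8601 timestamp so later runs
    # do not overwrite earlier ones; 'item_scraped_count' is illustrative.
    sdb_item_id = 'examplespider_2011-01-01T10:10:10'
    sdb_item = {
        'spider': 'examplespider',                # added at persistence time
        'timestamp': '2011-01-01T10:10:10',       # ISO 8601 format
        'item_scraped_count': '0000000000000123', # ints zero-padded to 16 digits
    }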
@@ -1,57 +0,0 @@
-"""
-A Stats collector for persisting stats to Amazon SimpleDB.
-
-Requires the boto library: http://code.google.com/p/boto/
-"""
-
-from datetime import datetime
-
-from twisted.internet import threads
-
-from scrapy.utils.simpledb import to_sdb_value
-from scrapy.statscol import StatsCollector
-from scrapy import log
-from scrapy.conf import settings
-
-class SimpledbStatsCollector(StatsCollector):
-
-    def __init__(self):
-        super(SimpledbStatsCollector, self).__init__()
-        self._sdbdomain = settings['STATS_SDB_DOMAIN']
-        self._access_key = settings['AWS_ACCESS_KEY_ID']
-        self._secret_key = settings['AWS_SECRET_ACCESS_KEY']
-
-        self._async = settings.getbool('STATS_SDB_ASYNC')
-        import boto
-        self.connect_sdb = boto.connect_sdb
-        self.connect_sdb(aws_access_key_id=self._access_key, aws_secret_access_key=self._secret_key).create_domain(self._sdbdomain)
-
-    def _persist_stats(self, stats, spider=None):
-        if spider is None: # only store spider-specific stats
-            return
-        if not self._sdbdomain:
-            return
-        if self._async:
-            dfd = threads.deferToThread(self._persist_to_sdb, spider, stats.copy())
-            dfd.addErrback(log.err, 'Error uploading stats to SimpleDB', \
-                spider=spider)
-        else:
-            self._persist_to_sdb(spider, stats)
-
-    def _persist_to_sdb(self, spider, stats):
-        ts = self._get_timestamp(spider).isoformat()
-        sdb_item_id = "%s_%s" % (spider.name, ts)
-        sdb_item = dict((k, self._to_sdb_value(v, k)) for k, v in stats.iteritems())
-        sdb_item['spider'] = spider.name
-        sdb_item['timestamp'] = self._to_sdb_value(ts)
-        self.connect_sdb(aws_access_key_id=self._access_key, aws_secret_access_key=self._secret_key).put_attributes(self._sdbdomain, sdb_item_id, sdb_item)
-
-    def _get_timestamp(self, spider):
-        return datetime.utcnow()
-
-    def _to_sdb_value(self, obj, key=None):
-        try:
-            return to_sdb_value(obj)
-        except TypeError:
-            raise TypeError("%s unsupported type %r used in key %r" % \
-                (type(self).__name__, type(obj).__name__, key))
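The STATS_SDB_ASYNC path above relies on Twisted's deferToThread to push the blocking boto call onto a thread pool. A minimal standalone sketch of that pattern (modern Python, names invented; not part of this commit):

    from twisted.internet import reactor, threads

    def blocking_upload(payload):
        # stands in for the blocking boto put_attributes() call
        return 'uploaded %r' % (payload,)

    def main():
        # deferToThread returns a Deferred; errors flow to the errback,
        # which is how the removed collector routed failures to log.err
        d = threads.deferToThread(blocking_upload, {'spider': 'examplespider'})
        d.addCallback(print)
        d.addErrback(lambda failure: print('upload failed:', failure))
        d.addBoth(lambda _: reactor.stop())

    reactor.callWhenRunning(main)
    reactor.run()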
@@ -247,9 +247,6 @@ STATS_CLASS = 'scrapy.statscol.MemoryStatsCollector'
 STATS_ENABLED = True
 STATS_DUMP = True
 
-STATS_SDB_DOMAIN = 'scrapy_stats'
-STATS_SDB_ASYNC = False
-
 STATSMAILER_RCPTS = []
 
 TEMPLATES_DIR = abspath(join(dirname(__file__), '..', 'templates'))
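For context, enabling the collector before this commit took roughly these settings. This is a sketch: the STATS_CLASS path follows the scrapy.contrib.statscol module directive in the removed docs, and the AWS values are placeholders.

    # settings.py before this commit (sketch)
    STATS_CLASS = 'scrapy.contrib.statscol.SimpledbStatsCollector'
    AWS_ACCESS_KEY_ID = '<your-access-key>'        # placeholder
    AWS_SECRET_ACCESS_KEY = '<your-secret-key>'    # placeholder
    STATS_SDB_DOMAIN = 'scrapy_stats'              # domain is created if missing
    STATS_SDB_ASYNC = False                        # blocking IO by default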
@@ -1,21 +0,0 @@
-import unittest
-from datetime import datetime
-
-from scrapy.utils.simpledb import to_sdb_value
-
-class SimpleddbUtilsTest(unittest.TestCase):
-
-    def test_to_sdb_value(self):
-        self.assertEqual(to_sdb_value(123), u'0000000000000123')
-        self.assertEqual(to_sdb_value(123L), u'0000000000000123')
-        self.assertEqual(to_sdb_value(True), u'1')
-        self.assertEqual(to_sdb_value(False), u'0')
-        self.assertEqual(to_sdb_value(None), u'')
-        self.assertEqual(to_sdb_value(datetime(2009, 01, 01, 10, 10, 10)), \
-            u'2009-01-01T10:10:10')
-        self.assertEqual(to_sdb_value('test'), 'test')
-        self.assertEqual(to_sdb_value(u'test'), u'test')
-        self.assertRaises(TypeError, to_sdb_value, object())
-
-if __name__ == "__main__":
-    unittest.main()
@@ -1,19 +0,0 @@
-"""Helper functions for Amazon SimpleDB"""
-
-from datetime import datetime
-
-def to_sdb_value(obj):
-    """Convert the given object to proper value to store in Amazon SimpleDB"""
-    if isinstance(obj, bool):
-        return u'%d' % obj
-    elif isinstance(obj, (int, long)):
-        return "%016d" % obj
-    elif isinstance(obj, datetime):
-        return obj.isoformat()
-    elif isinstance(obj, basestring):
-        return obj
-    elif obj is None:
-        return u''
-    else:
-        raise TypeError("Unsupported Type: %s" % type(obj).__name__)
-
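The removed helper is Python 2 only (long, basestring). A rough Python 3 equivalent, for anyone porting it (a sketch, not part of this commit):

    from datetime import datetime

    def to_sdb_value(obj):
        """Convert obj to a value suitable for storing in Amazon SimpleDB."""
        if isinstance(obj, bool):
            return u'%d' % obj      # check bool before int: bool subclasses int
        elif isinstance(obj, int):
            return u'%016d' % obj   # zero-pad to 16 digits, per SimpleDB guidance
        elif isinstance(obj, datetime):
            return obj.isoformat()  # ISO 8601
        elif isinstance(obj, str):
            return obj
        elif obj is None:
            return u''
        else:
            raise TypeError("Unsupported Type: %s" % type(obj).__name__)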