mirror of https://github.com/scrapy/scrapy.git (synced 2025-02-24 19:24:12 +00:00)
removed SimpledbStatsCollector from scrapy code; it was moved to https://github.com/scrapinghub/scaws
This commit is contained in:
parent b6b0a54d9f
commit 0e008268e1
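The collector now lives in the external scaws package linked above. For anyone still using it, a minimal migration sketch follows; note that the scaws.statscol.SimpledbStatsCollector path is an assumption, not stated anywhere in this commit, so check the scaws README for the real path:

    # settings.py after this commit (hypothetical sketch; the scaws module
    # path below is assumed, not confirmed by this commit)
    STATS_CLASS = 'scaws.statscol.SimpledbStatsCollector'
    STATS_SDB_DOMAIN = 'scrapy_stats'
    STATS_SDB_ASYNC = False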
@@ -227,65 +227,6 @@ DummyStatsCollector
     Stats Collector used when stats are disabled (through the
     :setting:`STATS_ENABLED` setting).
 
-SimpledbStatsCollector
-----------------------
-
-.. module:: scrapy.contrib.statscol
-   :synopsis: Additional Stats Collectors
-
-.. class:: SimpledbStatsCollector
-
-    A Stats collector which persists stats to `Amazon SimpleDB`_, using one
-    SimpleDB item per scraping run (ie. it keeps history of all scraping runs).
-    The data is persisted to the SimpleDB domain specified by the
-    :setting:`STATS_SDB_DOMAIN` setting. The domain will be created if it
-    doesn't exist.
-
-    In addition to the existing stats keys, the following keys are added at
-    persistence time:
-
-    * ``spider``: the spider name (so you can use it later for querying stats
-      for that spider)
-    * ``timestamp``: the timestamp when the stats were persisted
-
-    Both the ``spider`` and ``timestamp`` are used to generate the SimpleDB
-    item name in order to avoid overwriting stats of previous scraping runs.
-
-    As `required by SimpleDB`_, datetimes are stored in ISO 8601 format and
-    numbers are zero-padded to 16 digits. Negative numbers are not currently
-    supported.
-
-    This Stats Collector requires the `boto`_ library.
-
-.. _Amazon SimpleDB: http://aws.amazon.com/simpledb/
-.. _required by SimpleDB: http://docs.amazonwebservices.com/AmazonSimpleDB/2009-04-15/DeveloperGuide/ZeroPadding.html
-.. _boto: http://code.google.com/p/boto/
-
-This Stats Collector can be configured through the following settings:
-
-.. setting:: STATS_SDB_DOMAIN
-
-STATS_SDB_DOMAIN
-~~~~~~~~~~~~~~~~
-
-Default: ``'scrapy_stats'``
-
-A string containing the SimpleDB domain to use in the
-:class:`SimpledbStatsCollector`.
-
-.. setting:: STATS_SDB_ASYNC
-
-STATS_SDB_ASYNC
-~~~~~~~~~~~~~~~
-
-Default: ``False``
-
-If ``True``, communication with SimpleDB will be performed asynchronously. If
-``False``, blocking IO will be used instead. This is the default, as using
-asynchronous communication can result in the stats not being persisted if the
-Scrapy engine is shut down in the middle (for example, when you run only one
-spider in a process and then exit).
-
 
 Stats signals
 =============
 
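Taken together, the removed docs describe SimpleDB items shaped roughly like the following. This is a sketch: the ``spider`` and ``timestamp`` keys and the item-name scheme come from the docs and code in this commit, while the stats key and values are invented for illustration.

    # One SimpleDB item per scraping run, per the removed docs above.
    # The item name joins spider name and ISO 8601 timestamp so later runs
    # do not overwrite earlier ones; 'item_scraped_count' is illustrative.
    sdb_item_id = 'examplespider_2011-01-01T10:10:10'
    sdb_item = {
        'spider': 'examplespider',                # added at persistence time
        'timestamp': '2011-01-01T10:10:10',       # ISO 8601 format
        'item_scraped_count': '0000000000000123', # ints zero-padded to 16 digits
    }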
@@ -1,57 +0,0 @@
-"""
-A Stats collector for persisting stats to Amazon SimpleDB.
-
-Requires the boto library: http://code.google.com/p/boto/
-"""
-
-from datetime import datetime
-
-from twisted.internet import threads
-
-from scrapy.utils.simpledb import to_sdb_value
-from scrapy.statscol import StatsCollector
-from scrapy import log
-from scrapy.conf import settings
-
-class SimpledbStatsCollector(StatsCollector):
-
-    def __init__(self):
-        super(SimpledbStatsCollector, self).__init__()
-        self._sdbdomain = settings['STATS_SDB_DOMAIN']
-        self._access_key = settings['AWS_ACCESS_KEY_ID']
-        self._secret_key = settings['AWS_SECRET_ACCESS_KEY']
-
-        self._async = settings.getbool('STATS_SDB_ASYNC')
-        import boto
-        self.connect_sdb = boto.connect_sdb
-        self.connect_sdb(aws_access_key_id=self._access_key, aws_secret_access_key=self._secret_key).create_domain(self._sdbdomain)
-
-    def _persist_stats(self, stats, spider=None):
-        if spider is None: # only store spider-specific stats
-            return
-        if not self._sdbdomain:
-            return
-        if self._async:
-            dfd = threads.deferToThread(self._persist_to_sdb, spider, stats.copy())
-            dfd.addErrback(log.err, 'Error uploading stats to SimpleDB', \
-                spider=spider)
-        else:
-            self._persist_to_sdb(spider, stats)
-
-    def _persist_to_sdb(self, spider, stats):
-        ts = self._get_timestamp(spider).isoformat()
-        sdb_item_id = "%s_%s" % (spider.name, ts)
-        sdb_item = dict((k, self._to_sdb_value(v, k)) for k, v in stats.iteritems())
-        sdb_item['spider'] = spider.name
-        sdb_item['timestamp'] = self._to_sdb_value(ts)
-        self.connect_sdb(aws_access_key_id=self._access_key, aws_secret_access_key=self._secret_key).put_attributes(self._sdbdomain, sdb_item_id, sdb_item)
-
-    def _get_timestamp(self, spider):
-        return datetime.utcnow()
-
-    def _to_sdb_value(self, obj, key=None):
-        try:
-            return to_sdb_value(obj)
-        except TypeError:
-            raise TypeError("%s unsupported type %r used in key %r" % \
-                (type(self).__name__, type(obj).__name__, key))
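The STATS_SDB_ASYNC path above relies on Twisted's deferToThread to push the blocking boto call onto a thread pool. A minimal standalone sketch of that pattern (modern Python, names invented; not part of this commit):

    from twisted.internet import reactor, threads

    def blocking_upload(payload):
        # stands in for the blocking boto put_attributes() call
        return 'uploaded %r' % (payload,)

    def main():
        # deferToThread returns a Deferred; errors flow to the errback,
        # which is how the removed collector routed failures to log.err
        d = threads.deferToThread(blocking_upload, {'spider': 'examplespider'})
        d.addCallback(print)
        d.addErrback(lambda failure: print('upload failed:', failure))
        d.addBoth(lambda _: reactor.stop())

    reactor.callWhenRunning(main)
    reactor.run()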
@@ -247,9 +247,6 @@ STATS_CLASS = 'scrapy.statscol.MemoryStatsCollector'
 STATS_ENABLED = True
 STATS_DUMP = True
 
-STATS_SDB_DOMAIN = 'scrapy_stats'
-STATS_SDB_ASYNC = False
-
 STATSMAILER_RCPTS = []
 
 TEMPLATES_DIR = abspath(join(dirname(__file__), '..', 'templates'))
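For context, enabling the collector before this commit took roughly these settings. This is a sketch: the STATS_CLASS path follows the scrapy.contrib.statscol module directive in the removed docs, and the AWS values are placeholders.

    # settings.py before this commit (sketch)
    STATS_CLASS = 'scrapy.contrib.statscol.SimpledbStatsCollector'
    AWS_ACCESS_KEY_ID = '<your-access-key>'        # placeholder
    AWS_SECRET_ACCESS_KEY = '<your-secret-key>'    # placeholder
    STATS_SDB_DOMAIN = 'scrapy_stats'              # domain is created if missing
    STATS_SDB_ASYNC = False                        # blocking IO by default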
@@ -1,21 +0,0 @@
-import unittest
-from datetime import datetime
-
-from scrapy.utils.simpledb import to_sdb_value
-
-class SimpleddbUtilsTest(unittest.TestCase):
-
-    def test_to_sdb_value(self):
-        self.assertEqual(to_sdb_value(123), u'0000000000000123')
-        self.assertEqual(to_sdb_value(123L), u'0000000000000123')
-        self.assertEqual(to_sdb_value(True), u'1')
-        self.assertEqual(to_sdb_value(False), u'0')
-        self.assertEqual(to_sdb_value(None), u'')
-        self.assertEqual(to_sdb_value(datetime(2009, 01, 01, 10, 10, 10)), \
-            u'2009-01-01T10:10:10')
-        self.assertEqual(to_sdb_value('test'), 'test')
-        self.assertEqual(to_sdb_value(u'test'), u'test')
-        self.assertRaises(TypeError, to_sdb_value, object())
-
-if __name__ == "__main__":
-    unittest.main()
@@ -1,19 +0,0 @@
-"""Helper functions for Amazon SimpleDB"""
-
-from datetime import datetime
-
-def to_sdb_value(obj):
-    """Convert the given object to proper value to store in Amazon SimpleDB"""
-    if isinstance(obj, bool):
-        return u'%d' % obj
-    elif isinstance(obj, (int, long)):
-        return "%016d" % obj
-    elif isinstance(obj, datetime):
-        return obj.isoformat()
-    elif isinstance(obj, basestring):
-        return obj
-    elif obj is None:
-        return u''
-    else:
-        raise TypeError("Unsupported Type: %s" % type(obj).__name__)
-
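The removed helper is Python 2 only (long, basestring). A rough Python 3 equivalent, for anyone porting it (a sketch, not part of this commit):

    from datetime import datetime

    def to_sdb_value(obj):
        """Convert obj to a value suitable for storing in Amazon SimpleDB."""
        if isinstance(obj, bool):
            return u'%d' % obj      # check bool before int: bool subclasses int
        elif isinstance(obj, int):
            return u'%016d' % obj   # zero-pad to 16 digits, per SimpleDB guidance
        elif isinstance(obj, datetime):
            return obj.isoformat()  # ISO 8601
        elif isinstance(obj, str):
            return obj
        elif obj is None:
            return u''
        else:
            raise TypeError("Unsupported Type: %s" % type(obj).__name__)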