From a07c95003fcacaaf2674e3a9cdb243f88fa01f12 Mon Sep 17 00:00:00 2001
From: Daniel Grana
Date: Fri, 23 Jan 2009 03:42:05 +0000
Subject: [PATCH] duplicatesfilter: first version of configurable duplicate
 requests filtering middleware

--HG--
extra : convert_revision : svn%3Ab85faa78-f9eb-468e-a121-7cced6da292c%40763
---
 .../conf/project_template/scrapy_settings.py       |  3 +
 .../contrib/spidermiddleware/duplicatesfilter.py   | 65 +++++++++++++++++++
 2 files changed, 68 insertions(+)
 create mode 100644 scrapy/trunk/scrapy/contrib/spidermiddleware/duplicatesfilter.py

diff --git a/scrapy/trunk/scrapy/conf/project_template/scrapy_settings.py b/scrapy/trunk/scrapy/conf/project_template/scrapy_settings.py
index f885d9e42..c0e60df36 100644
--- a/scrapy/trunk/scrapy/conf/project_template/scrapy_settings.py
+++ b/scrapy/trunk/scrapy/conf/project_template/scrapy_settings.py
@@ -69,6 +69,7 @@ SPIDER_MIDDLEWARES = (
     'scrapy.contrib.spidermiddleware.urllength.UrlLengthMiddleware',
     'scrapy.contrib.spidermiddleware.depth.DepthMiddleware',
     'scrapy.contrib.spidermiddleware.urlfilter.UrlFilterMiddleware',
+    'scrapy.contrib.spidermiddleware.duplicatesfilter.DuplicatesFilterMiddleware',
     # Spider side
 )
 
@@ -96,3 +97,5 @@ URLLENGTH_LIMIT = 2083
 WS_ENABLED = 0
 
 SPIDERPROFILER_ENABLED = 0
+
+#DUPLICATESFILTER_FILTERCLASS = 'scrapy.contrib.spidermiddleware.duplicatesfilter.SimplePerDomainFilter'
diff --git a/scrapy/trunk/scrapy/contrib/spidermiddleware/duplicatesfilter.py b/scrapy/trunk/scrapy/contrib/spidermiddleware/duplicatesfilter.py
new file mode 100644
index 000000000..a421a1472
--- /dev/null
+++ b/scrapy/trunk/scrapy/contrib/spidermiddleware/duplicatesfilter.py
@@ -0,0 +1,65 @@
+"""
+DuplicatesFilterMiddleware: Filter out already visited urls
+"""
+
+from pydispatch import dispatcher
+
+from scrapy.core import signals
+from scrapy.http import Request
+from scrapy.core.exceptions import NotConfigured
+from scrapy.utils.request import request_fingerprint
+from scrapy.utils.misc import load_class
+from scrapy.conf import settings
+from scrapy import log
+
+
+class DuplicatesFilterMiddleware(object):
+    """Filter out duplicate requests to avoid visiting the same page more than once"""
+
+    def __init__(self):
+        clspath = settings.get('DUPLICATESFILTER_FILTERCLASS')
+        if not clspath:
+            raise NotConfigured
+
+        self.filter = load_class(clspath)()
+        dispatcher.connect(self.filter.open, signals.domain_open)
+        dispatcher.connect(self.filter.close, signals.domain_closed)
+
+    def process_spider_output(self, response, result, spider):
+        domain = spider.domain_name
+
+        for req in result:
+            if isinstance(req, Request):
+                added = self.filter.add(domain, req)
+                if not (added or req.dont_filter):
+                    log.msg('Skipped (already visited): %s' % req, log.DEBUG, domain=domain)
+                    continue
+            yield req
+
+
+class SimplePerDomainFilter(dict):
+    """Filter out a request if it was already seen for the same domain"""
+
+    def open(self, domain):
+        """Initialize the resources needed for filtering for this domain"""
+        self[domain] = set()
+
+    def close(self, domain):
+        """Remove the resources reserved for filtering for this domain"""
+        del self[domain]
+
+    def add(self, domain, request):
+        """Add the fingerprint of a request to the domain set if an
+        equivalent fingerprint has not been added yet. Return True if
+        the fingerprint was added and False otherwise.
+        """
+        fp = request_fingerprint(request)
+        if fp not in self[domain]:
+            self[domain].add(fp)
+            return True
+        return False
+
+    def has(self, domain, request):
+        """Check if a request was already seen for a domain"""
+        return request_fingerprint(request) in self[domain]
+