1
0
mirror of https://github.com/scrapy/scrapy.git synced 2025-02-26 17:04:07 +00:00

duplicatesfilter: first version of configurable duplicate requests filtering middleware

--HG--
extra : convert_revision : svn%3Ab85faa78-f9eb-468e-a121-7cced6da292c%40763
This commit is contained in:
Daniel Grana 2009-01-23 03:42:05 +00:00
parent 9209dbc882
commit a07c95003f
2 changed files with 68 additions and 0 deletions

View File

@ -69,6 +69,7 @@ SPIDER_MIDDLEWARES = (
'scrapy.contrib.spidermiddleware.urllength.UrlLengthMiddleware',
'scrapy.contrib.spidermiddleware.depth.DepthMiddleware',
'scrapy.contrib.spidermiddleware.urlfilter.UrlFilterMiddleware',
'scrapy.contrib.spidermiddleware.duplicatesfilter.DuplicatesFilterMiddleware',
# Spider side
)
@ -96,3 +97,5 @@ URLLENGTH_LIMIT = 2083
WS_ENABLED = 0
SPIDERPROFILER_ENABLED = 0
#DUPLICATESFILTER_FILTERCLASS = 'scrapy.contrib.spidermiddleware.duplicatesfilter.SimplePerDomainFilter'

View File

@ -0,0 +1,65 @@
"""
DuplicatesFilterMiddleware: Filter out already visited urls
"""
from pydispatch import dispatcher
from scrapy.core import signals
from scrapy.http import Request
from scrapy.core.exceptions import NotConfigured
from scrapy.utils.request import request_fingerprint
from scrapy.utils.misc import load_class
from scrapy.conf import settings
from scrapy import log
class DuplicatesFilterMiddleware(object):
    """Spider middleware that filters out duplicate requests so the same
    page is not visited more than once.

    The actual seen-set implementation is pluggable: the class named by the
    DUPLICATESFILTER_FILTERCLASS setting is instantiated, and its open/close
    methods are hooked to the domain_open/domain_closed signals so per-domain
    state is created and released with the domain's lifecycle.

    Raises NotConfigured when DUPLICATESFILTER_FILTERCLASS is unset, which
    disables the middleware.
    """

    def __init__(self):
        clspath = settings.get('DUPLICATESFILTER_FILTERCLASS')
        if not clspath:
            raise NotConfigured
        self.filter = load_class(clspath)()
        # Tie the filter's per-domain resources to the domain lifecycle.
        dispatcher.connect(self.filter.open, signals.domain_open)
        dispatcher.connect(self.filter.close, signals.domain_closed)

    def process_spider_output(self, response, result, spider):
        """Yield every item/request from *result*, dropping Request objects
        whose fingerprint was already seen for this spider's domain.

        Requests with dont_filter set are passed through untouched and are
        NOT recorded in the seen set: recording them (as the previous
        version did) would make a later, normal request for the same URL be
        skipped even though it was never actually filtered-visited.
        """
        domain = spider.domain_name
        for req in result:
            if isinstance(req, Request):
                if req.dont_filter:
                    # Explicit opt-out: pass through without recording.
                    yield req
                    continue
                if not self.filter.add(domain, req):
                    log.msg('Skipped (already visited): %s' % req, log.DEBUG, domain=domain)
                    continue
            yield req
class SimplePerDomainFilter(dict):
    """In-memory duplicates filter keeping one set of request fingerprints
    per domain.

    The instance is a dict mapping domain name -> set of fingerprints; a
    request is considered a duplicate when its fingerprint is already in
    the set for its domain.
    """

    def open(self, domain):
        """Create the (empty) fingerprint set for *domain*."""
        self[domain] = set()

    def close(self, domain):
        """Discard all fingerprints stored for *domain*."""
        del self[domain]

    def add(self, domain, request):
        """Record *request*'s fingerprint under *domain* unless an
        equivalent fingerprint was recorded before.

        Returns True when the fingerprint was newly added, False when it
        was already present (i.e. the request is a duplicate).
        """
        seen = self[domain]
        fp = request_fingerprint(request)
        if fp in seen:
            return False
        seen.add(fp)
        return True

    def has(self, domain, request):
        """Return True if an equivalent request was already seen for *domain*."""
        return request_fingerprint(request) in self[domain]