1
0
mirror of https://github.com/scrapy/scrapy.git synced 2025-02-22 06:33:12 +00:00

added UrlFilterMiddleware

--HG--
extra : convert_revision : svn%3Ab85faa78-f9eb-468e-a121-7cced6da292c%40191
This commit is contained in:
Pablo Hoffman 2008-09-01 04:16:51 +00:00
parent d7d94482a9
commit c9cafd5c43

View File

@ -0,0 +1,17 @@
"""
UrlFilterMiddleware: canonicalizes request URLs so that duplicated ones can be filtered out
"""
from scrapy.http import Request
from scrapy.utils.url import canonicalize_url
class UrlFilterMiddleware(object):
    """Middleware that rewrites the URL of each Request to its canonical form.

    A spider can opt out by defining a truthy ``urlfilter_disabled``
    attribute; when the attribute is absent it is treated as ``False``.
    """

    def process_result(self, response, result, spider):
        """Yield every element of *result*, canonicalizing Request URLs.

        ``response`` is accepted but not used here. Elements that are not
        ``Request`` instances are yielded untouched, as is everything when
        the spider sets ``urlfilter_disabled``.
        """
        filtering_off = getattr(spider, 'urlfilter_disabled', False)
        for entry in result:
            # Pass through anything that is not a Request, and everything
            # when the spider has switched the filter off.
            if not isinstance(entry, Request) or filtering_off:
                yield entry
                continue

            canonical = canonicalize_url(entry.url)
            # Leave the url alone when it is already canonical, so the
            # fingerprint is not re-calculated for nothing.
            if canonical != entry.url:
                entry.url = canonical
            yield entry