mirror of
https://github.com/scrapy/scrapy.git
synced 2025-02-22 06:33:12 +00:00
added UrlFilterMiddleware
--HG-- extra : convert_revision : svn%3Ab85faa78-f9eb-468e-a121-7cced6da292c%40191
This commit is contained in:
parent
d7d94482a9
commit
c9cafd5c43
17
scrapy/trunk/scrapy/contrib/spidermiddleware/urlfilter.py
Normal file
17
scrapy/trunk/scrapy/contrib/spidermiddleware/urlfilter.py
Normal file
@ -0,0 +1,17 @@
|
||||
"""
|
||||
UrlFilterMiddleware: canonicalizes URLs to filter out duplicated ones
|
||||
"""
|
||||
|
||||
from scrapy.http import Request
|
||||
from scrapy.utils.url import canonicalize_url
|
||||
|
||||
class UrlFilterMiddleware(object):
    """Spider middleware that rewrites request URLs into canonical form.

    A spider can opt out by exposing a truthy ``urlfilter_disabled``
    attribute; when the attribute is absent it is treated as False.
    """

    def process_result(self, response, result, spider):
        """Yield each element of *result*, canonicalizing Request URLs.

        Every element of *result* is yielded, in order. Elements that are
        ``Request`` instances get their ``url`` replaced by
        ``canonicalize_url(url)`` when that differs from the current value,
        unless the spider has disabled the filter. Other elements pass
        through untouched. *response* is not used here.
        """
        filtering_off = getattr(spider, 'urlfilter_disabled', False)
        for item in result:
            if not filtering_off and isinstance(item, Request):
                canonical = canonicalize_url(item.url)
                # Skip the assignment when nothing changed; the original
                # author's note says this avoids re-calculating the
                # request fingerprint.
                if canonical != item.url:
                    item.url = canonical
            yield item
|
Loading…
x
Reference in New Issue
Block a user