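"""Tests for scrapy.dupefilters.RFPDupeFilter: duplicate detection via
request fingerprints, on-disk persistence of seen fingerprints, and the
instantiation and logging behaviour of the dupefilter as used by the
Scheduler."""
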
import hashlib
import shutil
import tempfile
import unittest

from testfixtures import LogCapture

from scrapy.dupefilters import RFPDupeFilter
from scrapy.http import Request
from scrapy.core.scheduler import Scheduler
from scrapy.utils.python import to_bytes
from scrapy.utils.job import job_dir
from scrapy.utils.test import get_crawler
from tests.spiders import SimpleSpider


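# Helper dupefilters exercising the instantiation paths a DUPEFILTER_CLASS
# can take: from_crawler(), from_settings(), or direct instantiation.
# Each one records which path was used in its `method` attribute.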
class FromCrawlerRFPDupeFilter(RFPDupeFilter):

    @classmethod
    def from_crawler(cls, crawler):
        debug = crawler.settings.getbool('DUPEFILTER_DEBUG')
        df = cls(job_dir(crawler.settings), debug)
        df.method = 'from_crawler'
        return df


class FromSettingsRFPDupeFilter(RFPDupeFilter):

    @classmethod
    def from_settings(cls, settings):
        debug = settings.getbool('DUPEFILTER_DEBUG')
        df = cls(job_dir(settings), debug)
        df.method = 'from_settings'
        return df


class DirectDupeFilter(object):
    method = 'n/a'


class RFPDupeFilterTest(unittest.TestCase):

    def test_df_from_crawler_scheduler(self):
        settings = {'DUPEFILTER_DEBUG': True,
                    'DUPEFILTER_CLASS': __name__ + '.FromCrawlerRFPDupeFilter'}
        crawler = get_crawler(settings_dict=settings)
        scheduler = Scheduler.from_crawler(crawler)
        self.assertTrue(scheduler.df.debug)
        self.assertEqual(scheduler.df.method, 'from_crawler')

    def test_df_from_settings_scheduler(self):
        settings = {'DUPEFILTER_DEBUG': True,
                    'DUPEFILTER_CLASS': __name__ + '.FromSettingsRFPDupeFilter'}
        crawler = get_crawler(settings_dict=settings)
        scheduler = Scheduler.from_crawler(crawler)
        self.assertTrue(scheduler.df.debug)
        self.assertEqual(scheduler.df.method, 'from_settings')

    def test_df_direct_scheduler(self):
        settings = {'DUPEFILTER_CLASS': __name__ + '.DirectDupeFilter'}
        crawler = get_crawler(settings_dict=settings)
        scheduler = Scheduler.from_crawler(crawler)
        self.assertEqual(scheduler.df.method, 'n/a')

    def test_filter(self):
        dupefilter = RFPDupeFilter()
        dupefilter.open()

        r1 = Request('http://scrapytest.org/1')
        r2 = Request('http://scrapytest.org/2')
        r3 = Request('http://scrapytest.org/2')

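        # r2 and r3 share the same URL, so they produce the same
        # fingerprint: seeing r2 makes r3 a duplicate.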
        assert not dupefilter.request_seen(r1)
        assert dupefilter.request_seen(r1)

        assert not dupefilter.request_seen(r2)
        assert dupefilter.request_seen(r3)

        dupefilter.close('finished')

    def test_dupefilter_path(self):
        r1 = Request('http://scrapytest.org/1')
        r2 = Request('http://scrapytest.org/2')

        path = tempfile.mkdtemp()
        try:
            df = RFPDupeFilter(path)
            df.open()
            assert not df.request_seen(r1)
            assert df.request_seen(r1)
            df.close('finished')

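            # A second filter opened on the same path reloads the
            # fingerprints persisted by the first one, so r1 is still
            # remembered across instances.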
            df2 = RFPDupeFilter(path)
            df2.open()
            assert df2.request_seen(r1)
            assert not df2.request_seen(r2)
            assert df2.request_seen(r2)
            df2.close('finished')
        finally:
            shutil.rmtree(path)

    def test_request_fingerprint(self):
        """Test that customizing the request_fingerprint method changes
        the output of request_seen."""
        r1 = Request('http://scrapytest.org/index.html')
        r2 = Request('http://scrapytest.org/INDEX.html')

        dupefilter = RFPDupeFilter()
        dupefilter.open()

        assert not dupefilter.request_seen(r1)
        assert not dupefilter.request_seen(r2)

        dupefilter.close('finished')

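        # Lowercasing the URL before hashing makes the fingerprint
        # case-insensitive, so INDEX.html now collides with index.html.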
        class CaseInsensitiveRFPDupeFilter(RFPDupeFilter):

            def request_fingerprint(self, request):
                fp = hashlib.sha1()
                fp.update(to_bytes(request.url.lower()))
                return fp.hexdigest()

        case_insensitive_dupefilter = CaseInsensitiveRFPDupeFilter()
        case_insensitive_dupefilter.open()

        assert not case_insensitive_dupefilter.request_seen(r1)
        assert case_insensitive_dupefilter.request_seen(r2)

        case_insensitive_dupefilter.close('finished')

    def test_log(self):
        with LogCapture() as log:
            settings = {'DUPEFILTER_DEBUG': False,
                        'DUPEFILTER_CLASS': __name__ + '.FromCrawlerRFPDupeFilter'}
            crawler = get_crawler(SimpleSpider, settings_dict=settings)
            scheduler = Scheduler.from_crawler(crawler)
            spider = SimpleSpider.from_crawler(crawler)

            dupefilter = scheduler.df
            dupefilter.open()

            r1 = Request('http://scrapytest.org/index.html')
            r2 = Request('http://scrapytest.org/index.html')

            dupefilter.log(r1, spider)
            dupefilter.log(r2, spider)

            assert crawler.stats.get_value('dupefilter/filtered') == 2
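            # With DUPEFILTER_DEBUG off, only the first duplicate is
            # logged, along with a pointer to the setting.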
            log.check_present(('scrapy.dupefilters', 'DEBUG',
                ('Filtered duplicate request: <GET http://scrapytest.org/index.html>'
                 ' - no more duplicates will be shown'
                 ' (see DUPEFILTER_DEBUG to show all duplicates)')))

            dupefilter.close('finished')

    def test_log_debug(self):
        with LogCapture() as log:
            settings = {'DUPEFILTER_DEBUG': True,
                        'DUPEFILTER_CLASS': __name__ + '.FromCrawlerRFPDupeFilter'}
            crawler = get_crawler(SimpleSpider, settings_dict=settings)
            scheduler = Scheduler.from_crawler(crawler)
            spider = SimpleSpider.from_crawler(crawler)

            dupefilter = scheduler.df
            dupefilter.open()

            r1 = Request('http://scrapytest.org/index.html')
            r2 = Request('http://scrapytest.org/index.html',
                         headers={'Referer': 'http://scrapytest.org/INDEX.html'})

            dupefilter.log(r1, spider)
            dupefilter.log(r2, spider)

            assert crawler.stats.get_value('dupefilter/filtered') == 2
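            # With DUPEFILTER_DEBUG on, every duplicate is logged,
            # together with its referer.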
            log.check_present(('scrapy.dupefilters', 'DEBUG',
                ('Filtered duplicate request: <GET http://scrapytest.org/index.html>'
                 ' (referer: None)')))
            log.check_present(('scrapy.dupefilters', 'DEBUG',
                ('Filtered duplicate request: <GET http://scrapytest.org/index.html>'
                 ' (referer: http://scrapytest.org/INDEX.html)')))

            dupefilter.close('finished')