Mirror of https://github.com/scrapy/scrapy.git (synced 2025-02-24 17:44:33 +00:00)
initial version of crawl tests using a mock HTTP server (run in a separate process). This can also be used to benchmark Scrapy performance, although a script specially suited to that task would be more convenient.
parent 2a5c7ed4da
commit d0a81d369f
scrapy/tests/mockserver.py (new file, 83 lines)
@@ -0,0 +1,83 @@
import json, random, urllib
from time import time

from twisted.web.server import Site
from twisted.web.resource import Resource
from twisted.internet import reactor


_id = lambda x: x

# request attributes to serialize, mapped to the function used to encode them
_request_args = {
    "args": _id,
    "clientproto": _id,
    "requestHeaders": lambda x: x._rawHeaders,
    "responseHeaders": lambda x: x._rawHeaders,
    "method": _id,
    "path": _id,
    "uri": _id,
}


def encode_request(request):
    """Encode request into a JSON-serializable type"""
    d = {"time": time()}
    for k, func in _request_args.iteritems():
        d[k] = func(getattr(request, k))
    return d


def getarg(request, name, default=None, type=str):
    """Return a query argument coerced to `type`, or `default` if missing"""
    if name in request.args:
        return type(request.args[name][0])
    else:
        return default


class Follow(Resource):
    """Serve a page of links back to /follow, to give spiders something to crawl"""

    isLeaf = True

    def render(self, request):
        total = getarg(request, "total", 100, type=int)
        show = getarg(request, "show", 1, type=int)
        order = getarg(request, "order", "desc")
        n = getarg(request, "n", total, type=int)
        if order == "rand":
            nlist = [random.randint(1, total) for _ in range(show)]
        else:  # order == "desc"
            nlist = range(n, max(n - show, 0), -1)

        s = """<html> <head></head> <body>"""
        args = request.args.copy()
        for nl in nlist:
            args["n"] = [str(nl)]
            argstr = urllib.urlencode(args, doseq=True)
            s += "<a href='/follow?%s'>follow %d</a><br>" % (argstr, nl)
        s += """</body></html>"""
        return s


class Log(Resource):
    """Serve the accumulated request log as JSON"""

    isLeaf = True

    def __init__(self, log):
        Resource.__init__(self)
        self.log = log

    def render(self, request):
        return json.dumps(self.log)


class Root(Resource):

    def __init__(self):
        Resource.__init__(self)
        self.log = []
        self.putChild("follow", Follow())
        self.putChild("log", Log(self.log))

    def getChild(self, name, request):
        # serve the root resource for any unregistered path
        return self

    def render(self, request):
        return 'Scrapy mock HTTP server\n'


root = Root()
factory = Site(root)
reactor.listenTCP(8998, factory)
reactor.run()
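As a quick, informal sketch (not part of this commit), the server can be started on its own and poked with urllib: the /follow endpoint renders a small page of links controlled by the total, show, order and n query arguments, and /log returns the JSON-encoded request log, which stays empty in this version since nothing appends to Root.log yet.

# informal usage sketch -- start the server in another terminal first:
#   python -m scrapy.tests.mockserver
import json, urllib

html = urllib.urlopen("http://localhost:8998/follow?total=10&show=2").read()
print html  # a small page with "follow N" links

log = json.loads(urllib.urlopen("http://localhost:8998/log").read())
print log   # [] -- the log is not populated in this initial version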
scrapy/tests/test_crawl.py (new file, 60 lines)
@@ -0,0 +1,60 @@
import sys, time

from twisted.internet import defer
from twisted.trial.unittest import TestCase, SkipTest
from subprocess import Popen

from scrapy.spider import BaseSpider
from scrapy.http import Request
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from scrapy.utils.test import get_crawler


class FollowAllSpider(BaseSpider):
    """Spider that follows every link served by the mock server, recording
    the URL and timestamp of each response it parses"""

    name = 'follow'
    start_urls = ["http://localhost:8998/follow?total=10&show=5&order=rand"]
    link_extractor = SgmlLinkExtractor()

    def __init__(self):
        self.urls_visited = []
        self.times = []

    def parse(self, response):
        self.urls_visited.append(response.url)
        self.times.append(time.time())
        for link in self.link_extractor.extract_links(response):
            yield Request(link.url, callback=self.parse)


def docrawl(spider, settings=None):
    """Run a single crawl of `spider` and return the deferred fired when it finishes"""
    crawler = get_crawler(settings)
    crawler.configure()
    crawler.crawl(spider)
    return crawler.start()


class CrawlTestCase(TestCase):

    def setUp(self):
        # start the mock HTTP server in a separate process
        self.proc = Popen([sys.executable, '-m', 'scrapy.tests.mockserver'])
        time.sleep(0.2)

    def tearDown(self):
        self.proc.kill()
        self.proc.wait()
        time.sleep(0.2)

    @defer.inlineCallbacks
    def test_follow_all(self):
        spider = FollowAllSpider()
        yield docrawl(spider)
        self.assertEqual(len(spider.urls_visited), 11)  # 10 + start_url

    @defer.inlineCallbacks
    def test_delay(self):
        # FIXME: this test fails because Scrapy leaves the reactor dirty with
        # callLater calls when download delays are used. This test should be
        # enabled after this bug is fixed.
        raise SkipTest("disabled due to a reactor leak in the scrapy downloader")

        spider = FollowAllSpider()
        yield docrawl(spider)
        t = spider.times[0]
        for y in spider.times[1:]:
            self.assertTrue(y - t > 0.5, "download delay too small: %s" % (y - t))
            t = y  # compare consecutive responses
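Since the commit message mentions reusing this setup for benchmarking, one informal way to do so (a sketch only, not part of the test suite; actual figures depend on settings and hardware) is to derive a throughput estimate from the timestamps FollowAllSpider records. The test case itself should run under Twisted's trial runner, e.g. "trial scrapy.tests.test_crawl".

# informal benchmarking sketch: estimate crawl throughput from the timestamps
# recorded by FollowAllSpider (assumed helper, not part of this commit)
def pages_per_second(spider):
    duration = spider.times[-1] - spider.times[0]
    if not duration:
        return 0.0
    return len(spider.times) / float(duration)

# e.g. after `yield docrawl(spider)` in a test or script:
#   print "%d pages in %.2fs (%.1f pages/s)" % (
#       len(spider.urls_visited), spider.times[-1] - spider.times[0],
#       pages_per_second(spider))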