
initial version of crawl tests using a mock HTTP server (running in a separate process). This can also be used to benchmark Scrapy performance, although a dedicated script would be better suited for that task

Pablo Hoffman 2013-03-20 14:46:04 -03:00
parent 2a5c7ed4da
commit d0a81d369f
2 changed files with 143 additions and 0 deletions

scrapy/tests/mockserver.py
@@ -0,0 +1,83 @@
import json, random, urllib
from time import time

from twisted.web.server import Site
from twisted.web.resource import Resource
from twisted.internet import reactor

_id = lambda x: x

# request attributes to log, with the function used to turn each one
# into a JSON-serializable value
_request_args = {
    "args": _id,
    "clientproto": _id,
    "requestHeaders": lambda x: x._rawHeaders,
    "responseHeaders": lambda x: x._rawHeaders,
    "method": _id,
    "path": _id,
    "uri": _id,
}


def encode_request(request):
    """Encode request into a JSON-serializable type"""
    d = {"time": time()}
    for k, func in _request_args.iteritems():
        d[k] = func(getattr(request, k))
    return d


def getarg(request, name, default=None, type=str):
    """Return a query string argument coerced to `type`, or `default`"""
    if name in request.args:
        return type(request.args[name][0])
    else:
        return default


class Follow(Resource):
    """Page containing `show` links to other /follow pages, out of `total`"""

    isLeaf = True

    def render(self, request):
        total = getarg(request, "total", 100, type=int)
        show = getarg(request, "show", 1, type=int)
        order = getarg(request, "order", "desc")
        n = getarg(request, "n", total, type=int)
        if order == "rand":
            nlist = [random.randint(1, total) for _ in range(show)]
        else:  # order == "desc"
            nlist = range(n, max(n - show, 0), -1)
        s = """<html> <head></head> <body>"""
        args = request.args.copy()
        for nl in nlist:
            args["n"] = [str(nl)]
            argstr = urllib.urlencode(args, doseq=True)
            s += "<a href='/follow?%s'>follow %d</a><br>" % (argstr, nl)
        s += """</body></html>"""
        return s


class Log(Resource):

    isLeaf = True

    def __init__(self, log):
        Resource.__init__(self)
        self.log = log

    def render(self, request):
        return json.dumps(self.log)


class Root(Resource):

    def __init__(self):
        Resource.__init__(self)
        self.log = []
        self.putChild("follow", Follow())
        self.putChild("log", Log(self.log))

    def getChild(self, name, request):
        return self

    def render(self, request):
        return 'Scrapy mock HTTP server\n'


root = Root()
factory = Site(root)
reactor.listenTCP(8998, factory)
reactor.run()
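
As a quick sanity check, the mock server can be exercised by hand once it is running (e.g. via `python -m scrapy.tests.mockserver`); this snippet is illustrative only and not part of the commit:

# sketch: fetch one /follow page from the running mock server (Python 2)
import urllib2

url = "http://localhost:8998/follow?total=10&show=2"
html = urllib2.urlopen(url).read()
print html  # HTML with two links back to /follow, with decreasing n values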

scrapy/tests/test_crawl.py
@@ -0,0 +1,60 @@
import sys, time
from subprocess import Popen

from twisted.internet import defer
from twisted.trial.unittest import TestCase, SkipTest

from scrapy.spider import BaseSpider
from scrapy.http import Request
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from scrapy.utils.test import get_crawler


class FollowAllSpider(BaseSpider):

    name = 'follow'
    start_urls = ["http://localhost:8998/follow?total=10&show=5&order=rand"]
    link_extractor = SgmlLinkExtractor()

    def __init__(self):
        super(FollowAllSpider, self).__init__()
        self.urls_visited = []
        self.times = []

    def parse(self, response):
        self.urls_visited.append(response.url)
        self.times.append(time.time())
        for link in self.link_extractor.extract_links(response):
            yield Request(link.url, callback=self.parse)


def docrawl(spider, settings=None):
    crawler = get_crawler(settings)
    crawler.configure()
    crawler.crawl(spider)
    return crawler.start()
class CrawlTestCase(TestCase):

    def setUp(self):
        # start the mock HTTP server in a separate process and give it a
        # moment to bind its port before the tests start hitting it
        self.proc = Popen([sys.executable, '-m', 'scrapy.tests.mockserver'])
        time.sleep(0.2)

    def tearDown(self):
        self.proc.kill()
        self.proc.wait()
        time.sleep(0.2)

    @defer.inlineCallbacks
    def test_follow_all(self):
        spider = FollowAllSpider()
        yield docrawl(spider)
        self.assertEqual(len(spider.urls_visited), 11)  # 10 + start_url
    @defer.inlineCallbacks
    def test_delay(self):
        # FIXME: this test fails because Scrapy leaves the reactor dirty with
        # callLater calls when download delays are used. This test should be
        # enabled after this bug is fixed.
        raise SkipTest("disabled due to a reactor leak in the scrapy downloader")
        spider = FollowAllSpider()
        # assumed setting: DOWNLOAD_DELAY matches the 0.5s checked below
        yield docrawl(spider, {"DOWNLOAD_DELAY": 0.5})
        t = spider.times[0]
        for y in spider.times[1:]:
            self.assertTrue(y - t > 0.5, "download delay too small: %s" % (y - t))
            t = y  # compare consecutive download times
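
Since the commit message notes that this setup can double as a rough benchmark, here is a minimal sketch of how the timestamps collected by FollowAllSpider could be reduced to a throughput figure; the `throughput` helper is hypothetical and not part of this commit:

def throughput(times):
    # pages per second over the whole crawl, computed from the
    # time.time() values appended in FollowAllSpider.parse
    if len(times) < 2:
        return 0.0
    return (len(times) - 1) / (times[-1] - times[0])

# e.g. after `yield docrawl(spider)`:
#     print "%.1f pages/sec" % throughput(spider.times)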