1
0
mirror of https://github.com/scrapy/scrapy.git synced 2025-02-25 15:24:15 +00:00

removed unnecessary ResponseLibxml2 extension and moved libxml2 document caching functionality to Libxml2Document using weak references

This commit is contained in:
Pablo Hoffman 2009-08-10 20:52:43 -03:00
parent 21f2bb6797
commit d334c035c0
10 changed files with 59 additions and 119 deletions

View File

@ -23,19 +23,6 @@ Core Stats extension
Enable the collection of core statistics, provided that stats collection is
enabled (see :ref:`topics-stats`).
Response Libxml2 extension
--------------------------
.. module:: scrapy.xpath.extension
:synopsis: Libxml2 document caching for Responses
.. class:: scrapy.xpath.extension.ResponseLibxml2
Causes the :class:`~scrapy.http.Response` objects to grow a new method
(``getlibxml2doc()``) which returns a (cached) libxml2 document of their
contents. :ref:`XPath Selectors <topics-selectors>` use this extension for
better performance, so it's highly recommended not to disable it.
.. _ref-extensions-webconsole:
Web console extension
@ -284,13 +271,3 @@ Stats collector dump WC extension
.. class:: scrapy.contrib.webconsole.stats.StatsDump
Display the stats collected so far by the stats collector.
Spider stats WC extension
-------------------------
.. module:: scrapy.contrib.webconsole.spiderstats
:synopsis: Spider stats web console extension
.. class:: scrapy.contrib.webconsole.spiderstats.SpiderStats

View File

@ -390,7 +390,6 @@ Default::
[
'scrapy.stats.corestats.CoreStats',
'scrapy.xpath.extension.ResponseLibxml2',
'scrapy.management.web.WebConsole',
'scrapy.management.telnet.TelnetConsole',
'scrapy.contrib.webconsole.scheduler.SchedulerQueue',
@ -398,7 +397,6 @@ Default::
'scrapy.contrib.webconsole.spiderctl.Spiderctl',
'scrapy.contrib.webconsole.enginestatus.EngineStatus',
'scrapy.contrib.webconsole.stats.StatsDump',
'scrapy.contrib.webconsole.spiderstats.SpiderStats',
'scrapy.contrib.spider.reloader.SpiderReloader',
'scrapy.contrib.memusage.MemoryUsage',
'scrapy.contrib.memdebug.MemoryDebugger',

View File

@ -86,7 +86,6 @@ DUPEFILTER_CLASS = 'scrapy.contrib.dupefilter.RequestFingerprintDupeFilter'
EXTENSIONS = [
'scrapy.stats.corestats.CoreStats',
'scrapy.xpath.extension.ResponseLibxml2',
'scrapy.management.web.WebConsole',
'scrapy.management.telnet.TelnetConsole',
'scrapy.contrib.webconsole.scheduler.SchedulerQueue',

View File

@ -1,36 +0,0 @@
import unittest
import libxml2
from scrapy.http import TextResponse
from scrapy.utils.test import libxml2debug
class Libxml2Test(unittest.TestCase):
    # Sanity check for the installed libxml2 library itself, not Scrapy code.

    @libxml2debug
    def test_xpath(self):
        # this test will fail in version 2.6.27 but passes on 2.6.29+
        html = "<td>1<b>2</b>3</td>"
        node = libxml2.htmlParseDoc(html, 'utf-8')
        result = [str(r) for r in node.xpathEval('//text()')]
        self.assertEquals(result, ['1', '2', '3'])
        # free the C-level document explicitly; libxml2 documents are not
        # managed by Python's garbage collector
        node.freeDoc()
class ResponseLibxml2DocTest(unittest.TestCase):
    @libxml2debug
    def test_getlibxml2doc(self):
        # test to simulate '\x00' char in body of html page;
        # this method shouldn't raise a TypeError exception
        # make sure we load the libxml2 extension, which adds
        # Response.getlibxml2doc()
        from scrapy.extension import extensions
        extensions.load()
        self.body_content = 'test problematic \x00 body'
        response = TextResponse('http://example.com/catalog/product/blabla-123',
            headers={'Content-Type': 'text/plain; charset=utf-8'}, body=self.body_content)
        # only checks that no exception is raised; the document is discarded
        response.getlibxml2doc()
# Allow running this test module directly from the command line.
if __name__ == "__main__":
    unittest.main()

View File

@ -5,9 +5,10 @@ import libxml2
from scrapy.http import TextResponse, HtmlResponse, XmlResponse
from scrapy.xpath.selector import XmlXPathSelector, HtmlXPathSelector
from scrapy.xpath.document import Libxml2Document
from scrapy.utils.test import libxml2debug
class XPathTestCase(unittest.TestCase):
class XPathSelectorTestCase(unittest.TestCase):
@libxml2debug
def test_selector_simple(self):
@ -238,5 +239,44 @@ class XPathTestCase(unittest.TestCase):
u'\n ',
u'\n pff\n'])
class Libxml2DocumentTest(unittest.TestCase):
    # Tests for the Libxml2Document wrapper and its per-response caching.

    @libxml2debug
    def test_response_libxml2_caching(self):
        r1 = HtmlResponse('http://www.example.com', body='<html><head></head><body></body></html>')
        r2 = r1.copy()
        doc1 = Libxml2Document(r1)
        doc2 = Libxml2Document(r1)
        doc3 = Libxml2Document(r2)
        # make sure it's cached: the same response object yields the same
        # document instance, while a copied response (distinct object) does not
        assert doc1 is doc2
        assert doc1.xmlDoc is doc2.xmlDoc
        assert doc1 is not doc3
        assert doc1.xmlDoc is not doc3.xmlDoc
        # don't leave libxml2 documents in memory to avoid wrong libxml2 leaks reports
        del doc1, doc2, doc3

    @libxml2debug
    def test_null_char(self):
        # make sure bodies with null char ('\x00') don't raise a TypeError exception
        self.body_content = 'test problematic \x00 body'
        response = TextResponse('http://example.com/catalog/product/blabla-123',
            headers={'Content-Type': 'text/plain; charset=utf-8'}, body=self.body_content)
        Libxml2Document(response)
class Libxml2Test(unittest.TestCase):
    """Sanity checks for the installed libxml2 library itself, not Scrapy code."""

    @libxml2debug
    def test_libxml2_bug_2_6_27(self):
        # this test will fail in version 2.6.27 but passes on 2.6.29+
        html = "<td>1<b>2</b>3</td>"
        node = libxml2.htmlParseDoc(html, 'utf-8')
        result = [str(r) for r in node.xpathEval('//text()')]
        # assertEqual instead of the deprecated assertEquals alias
        self.assertEqual(result, ['1', '2', '3'])
        # free the C-level document explicitly; libxml2 documents are not
        # managed by Python's garbage collector
        node.freeDoc()
# Allow running this test module directly from the command line.
if __name__ == "__main__":
    unittest.main()

View File

@ -1,25 +0,0 @@
import unittest
from scrapy.http import HtmlResponse
from scrapy.xpath.extension import ResponseLibxml2
class ResponseLibxml2Test(unittest.TestCase):
    def setUp(self):
        # Instantiating the extension monkey-patches Response with the
        # getlibxml2doc() method exercised below.
        ResponseLibxml2()

    def test_response_libxml2_caching(self):
        r1 = HtmlResponse('http://www.example.com', body='<html><head></head><body></body></html>')
        r2 = r1.copy()
        doc1 = r1.getlibxml2doc()
        doc2 = r1.getlibxml2doc()
        doc3 = r2.getlibxml2doc()
        # make sure it's cached: same response -> same document instance;
        # a copied response is a distinct object and gets its own document
        assert doc1 is doc2
        assert doc1 is not doc3
        # don't leave libxml2 documents in memory to avoid wrong libxml2 leaks reports
        del doc1, doc2, doc3

View File

@ -6,6 +6,8 @@ import os
import libxml2
from scrapy.xpath.document import Libxml2Document
def libxml2debug(testfunction):
"""Decorator for debugging libxml2 memory leaks inside a function.

View File

@ -1,15 +1,24 @@
"""
This module contains a simple class (Libxml2Document) to wrap libxml2 documents
(xmlDoc) for proper garbage collection.
This module contains a simple class (Libxml2Document) which provides cache and
garbage collection to libxml2 documents (xmlDoc).
"""
import weakref
from scrapy.xpath.factories import xmlDoc_from_html
class Libxml2Document(object):
def __init__(self, response, factory=xmlDoc_from_html):
self.xmlDoc = factory(response)
self.xpathContext = self.xmlDoc.xpathNewContext()
# response -> {factory -> Libxml2Document} cache. Weak keys ensure that
# cache entries (and their parsed documents) are released once the
# response itself is garbage collected.
cache = weakref.WeakKeyDictionary()

def __new__(cls, response, factory=xmlDoc_from_html):
    """Return the Libxml2Document for *response*, building and caching it
    on first use — at most one document per (response, factory) pair."""
    cache = cls.cache.setdefault(response, {})
    if factory not in cache:
        obj = object.__new__(cls)
        obj.xmlDoc = factory(response)
        obj.xpathContext = obj.xmlDoc.xpathNewContext()
        cache[factory] = obj
    return cache[factory]
def __del__(self):
# we must call both cleanup functions, so we try/except all exceptions

View File

@ -1,20 +0,0 @@
"""
The ResponseLibxml2 extension causes the Response objects to grow a new method
("getlibxml2doc") which returns a (cached) libxml2 document of itself.
"""
from scrapy.http import Response
from scrapy.xpath.document import Libxml2Document
from scrapy.xpath.factories import xmlDoc_from_html
class ResponseLibxml2(object):
    """Extension that, when instantiated, monkey-patches scrapy.http.Response
    with a getlibxml2doc() method (the module-level function below)."""

    def __init__(self):
        setattr(Response, 'getlibxml2doc', getlibxml2doc)
def getlibxml2doc(response, factory=xmlDoc_from_html):
    """Return a (cached) Libxml2Document built from *response*.

    The document is created at most once per (response, factory) pair and
    stored in ``response.cache`` under a key derived from the factory name.
    """
    key = 'lx2doc_%s' % factory.__name__
    if key not in response.cache:
        response.cache[key] = Libxml2Document(response, factory=factory)
    return response.cache[key]

View File

@ -7,8 +7,8 @@ See documentation in docs/ref/selectors.rst
import libxml2
from scrapy.http import TextResponse
from scrapy.xpath.extension import Libxml2Document
from scrapy.xpath.factories import xmlDoc_from_html, xmlDoc_from_xml
from scrapy.xpath.document import Libxml2Document
from scrapy.utils.python import flatten, unicode_to_str
from scrapy.utils.misc import extract_regex
@ -19,10 +19,6 @@ class XPathSelector(object):
self.doc = parent.doc
self.xmlNode = node
elif response:
try:
# try with cached version first
self.doc = response.getlibxml2doc(factory=self._get_libxml2_doc)
except AttributeError:
self.doc = Libxml2Document(response, factory=self._get_libxml2_doc)
self.xmlNode = self.doc.xmlDoc
elif text: