
removed unnecessary ResponseLibxml2 extension and moved libxml2 document caching functionality to Libxml2Document using weak references

commit d334c035c0
parent 21f2bb6797
Pablo Hoffman 2009-08-10 20:52:43 -03:00
10 changed files with 59 additions and 119 deletions
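The new caching scheme keys cached documents on the response itself through a weakref.WeakKeyDictionary, so a cached libxml2 document lives exactly as long as its response. A minimal standalone sketch of the pattern (all names below are illustrative stand-ins, not Scrapy's API):

import weakref

class CachedDocument(object):
    # Hypothetical stand-in for Libxml2Document's cache layout: the outer
    # WeakKeyDictionary is keyed by the response, the inner dict by factory.
    cache = weakref.WeakKeyDictionary()

    def __new__(cls, key, factory=repr):
        per_key = cls.cache.setdefault(key, {})
        if factory not in per_key:
            obj = object.__new__(cls)
            obj.data = factory(key)  # "parse" once per (key, factory) pair
            per_key[factory] = obj
        return per_key[factory]

class FakeResponse(object):
    """Hypothetical stand-in for scrapy.http.Response."""

r = FakeResponse()
assert CachedDocument(r) is CachedDocument(r)  # second call hits the cache
del r  # on CPython the key is reclaimed immediately...
assert len(CachedDocument.cache) == 0  # ...taking its documents with it

Because the keys are only weakly held, there is no explicit invalidation step: the garbage collector evicts cache entries for free.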

View File

@@ -23,19 +23,6 @@ Core Stats extension
 Enable the collection of core statistics, provided the stats collection are
 enabled (see :ref:`topics-stats`).
-Response Libxml2 extension
---------------------------
-.. module:: scrapy.xpath.extension
-    :synopsis: Libxml2 document caching for Responses
-.. class:: scrapy.path.extension.ResponseLibxml2
-    Causes the :class:`~scrapy.http.Response` objects to grow a new method
-    (``getlibxml2doc()``) which returns a (cached) libxml2 document of their
-    contents. :ref:`XPath Selectors <topics-selectors>` use this extension for
-    better performance, so it's highly recommended not to disable it.
 .. _ref-extensions-webconsole:
 Web console extension
@@ -284,13 +271,3 @@ Stats collector dump WC extension
 .. class:: scrapy.contrib.webconsole.stats.StatsDump
 Display the stats collected so far by the stats collector.
-Spider stats WC extension
--------------------------
-.. module:: scrapy.contrib.webconsole.spiderstats
-    :synopsis: Spider stats web console extension
-.. class:: scrapy.contrib.webconsole.spiderstats.SpiderStats

View File

@@ -390,7 +390,6 @@ Default::
 [
     'scrapy.stats.corestats.CoreStats',
-    'scrapy.xpath.extension.ResponseLibxml2',
     'scrapy.management.web.WebConsole',
     'scrapy.management.telnet.TelnetConsole',
     'scrapy.contrib.webconsole.scheduler.SchedulerQueue',
@@ -398,7 +397,6 @@ Default::
     'scrapy.contrib.webconsole.spiderctl.Spiderctl',
     'scrapy.contrib.webconsole.enginestatus.EngineStatus',
     'scrapy.contrib.webconsole.stats.StatsDump',
-    'scrapy.contrib.webconsole.spiderstats.SpiderStats',
     'scrapy.contrib.spider.reloader.SpiderReloader',
     'scrapy.contrib.memusage.MemoryUsage',
     'scrapy.contrib.memdebug.MemoryDebugger',

View File

@@ -86,7 +86,6 @@ DUPEFILTER_CLASS = 'scrapy.contrib.dupefilter.RequestFingerprintDupeFilter'
 EXTENSIONS = [
     'scrapy.stats.corestats.CoreStats',
-    'scrapy.xpath.extension.ResponseLibxml2',
     'scrapy.management.web.WebConsole',
     'scrapy.management.telnet.TelnetConsole',
     'scrapy.contrib.webconsole.scheduler.SchedulerQueue',

View File

@@ -1,36 +0,0 @@
-import unittest
-import libxml2
-from scrapy.http import TextResponse
-from scrapy.utils.test import libxml2debug
-class Libxml2Test(unittest.TestCase):
-    @libxml2debug
-    def test_xpath(self):
-        # this test will fail in version 2.6.27 but passes on 2.6.29+
-        html = "<td>1<b>2</b>3</td>"
-        node = libxml2.htmlParseDoc(html, 'utf-8')
-        result = [str(r) for r in node.xpathEval('//text()')]
-        self.assertEquals(result, ['1', '2', '3'])
-        node.freeDoc()
-class ResponseLibxml2DocTest(unittest.TestCase):
-    @libxml2debug
-    def test_getlibxml2doc(self):
-        # test to simulate '\x00' char in body of html page;
-        # this method shouldn't raise a TypeError exception
-        # make sure we load the libxml2 extension
-        from scrapy.extension import extensions
-        extensions.load()
-        self.body_content = 'test problematic \x00 body'
-        response = TextResponse('http://example.com/catalog/product/blabla-123',
-            headers={'Content-Type': 'text/plain; charset=utf-8'}, body=self.body_content)
-        response.getlibxml2doc()
-if __name__ == "__main__":
-    unittest.main()

View File

@@ -5,9 +5,10 @@ import libxml2
 from scrapy.http import TextResponse, HtmlResponse, XmlResponse
 from scrapy.xpath.selector import XmlXPathSelector, HtmlXPathSelector
+from scrapy.xpath.document import Libxml2Document
 from scrapy.utils.test import libxml2debug
-class XPathTestCase(unittest.TestCase):
+class XPathSelectorTestCase(unittest.TestCase):
     @libxml2debug
     def test_selector_simple(self):
@@ -238,5 +239,44 @@ class XPathTestCase(unittest.TestCase):
             u'\n    ',
             u'\n    pff\n'])
+class Libxml2DocumentTest(unittest.TestCase):
+    @libxml2debug
+    def test_response_libxml2_caching(self):
+        r1 = HtmlResponse('http://www.example.com', body='<html><head></head><body></body></html>')
+        r2 = r1.copy()
+        doc1 = Libxml2Document(r1)
+        doc2 = Libxml2Document(r1)
+        doc3 = Libxml2Document(r2)
+        # make sure it's cached
+        assert doc1 is doc2
+        assert doc1.xmlDoc is doc2.xmlDoc
+        assert doc1 is not doc3
+        assert doc1.xmlDoc is not doc3.xmlDoc
+        # don't leave libxml2 documents in memory to avoid wrong libxml2 leaks reports
+        del doc1, doc2, doc3
+    @libxml2debug
+    def test_null_char(self):
+        # make sure bodies with null char ('\x00') don't raise a TypeError exception
+        self.body_content = 'test problematic \x00 body'
+        response = TextResponse('http://example.com/catalog/product/blabla-123',
+            headers={'Content-Type': 'text/plain; charset=utf-8'}, body=self.body_content)
+        Libxml2Document(response)
+class Libxml2Test(unittest.TestCase):
+    @libxml2debug
+    def test_libxml2_bug_2_6_27(self):
+        # this test will fail in version 2.6.27 but passes on 2.6.29+
+        html = "<td>1<b>2</b>3</td>"
+        node = libxml2.htmlParseDoc(html, 'utf-8')
+        result = [str(r) for r in node.xpathEval('//text()')]
+        self.assertEquals(result, ['1', '2', '3'])
+        node.freeDoc()
 if __name__ == "__main__":
     unittest.main()

View File

@@ -1,25 +0,0 @@
-import unittest
-from scrapy.http import HtmlResponse
-from scrapy.xpath.extension import ResponseLibxml2
-class ResponseLibxml2Test(unittest.TestCase):
-    def setUp(self):
-        ResponseLibxml2()
-    def test_response_libxml2_caching(self):
-        r1 = HtmlResponse('http://www.example.com', body='<html><head></head><body></body></html>')
-        r2 = r1.copy()
-        doc1 = r1.getlibxml2doc()
-        doc2 = r1.getlibxml2doc()
-        doc3 = r2.getlibxml2doc()
-        # make sure it's cached
-        assert doc1 is doc2
-        assert doc1 is not doc3
-        # don't leave libxml2 documents in memory to avoid wrong libxml2 leaks reports
-        del doc1, doc2, doc3

View File

@@ -6,6 +6,8 @@ import os
 import libxml2
+from scrapy.xpath.document import Libxml2Document
 def libxml2debug(testfunction):
     """Decorator for debugging libxml2 memory leaks inside a function.

View File

@@ -1,15 +1,24 @@
 """
-This module contains a simple class (Libxml2Document) to wrap libxml2 documents
-(xmlDoc) for proper garbage collection.
+This module contains a simple class (Libxml2Document) which provides cache and
+garbage collection to libxml2 documents (xmlDoc).
 """
+import weakref
 from scrapy.xpath.factories import xmlDoc_from_html
 class Libxml2Document(object):
-    def __init__(self, response, factory=xmlDoc_from_html):
-        self.xmlDoc = factory(response)
-        self.xpathContext = self.xmlDoc.xpathNewContext()
+    cache = weakref.WeakKeyDictionary()
+
+    def __new__(cls, response, factory=xmlDoc_from_html):
+        cache = cls.cache.setdefault(response, {})
+        if factory not in cache:
+            obj = object.__new__(cls)
+            obj.xmlDoc = factory(response)
+            obj.xpathContext = obj.xmlDoc.xpathNewContext()
+            cache[factory] = obj
+        return cache[factory]
     def __del__(self):
         # we must call both cleanup functions, so we try/except all exceptions
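The hunk cuts off inside __del__, whose comment mentions two cleanup functions; with the libxml2 Python bindings those are presumably xpathFreeContext() on the XPath context and freeDoc() on the document. A sketch of such a destructor under that assumption (illustrative, not a quote of the file's real body):

def __del__(self):
    # we must call both cleanup functions, so we try/except all exceptions
    try:
        self.xpathContext.xpathFreeContext()  # free the XPath evaluation context
    except:
        pass
    try:
        self.xmlDoc.freeDoc()  # free the underlying libxml2 document
    except:
        pass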

View File

@@ -1,20 +0,0 @@
-"""
-The ResponseLibxml2 extension causes the Response objects to grow a new method
-("getlibxml2doc") which returns a (cached) libxml2 document of itself.
-"""
-from scrapy.http import Response
-from scrapy.xpath.document import Libxml2Document
-from scrapy.xpath.factories import xmlDoc_from_html
-class ResponseLibxml2(object):
-    def __init__(self):
-        setattr(Response, 'getlibxml2doc', getlibxml2doc)
-def getlibxml2doc(response, factory=xmlDoc_from_html):
-    cachekey = 'lx2doc_%s' % factory.__name__
-    if cachekey not in response.cache:
-        lx2doc = Libxml2Document(response, factory=factory)
-        response.cache[cachekey] = lx2doc
-    return response.cache[cachekey]

View File

@@ -7,8 +7,8 @@ See documentation in docs/ref/selectors.rst
 import libxml2
 from scrapy.http import TextResponse
-from scrapy.xpath.extension import Libxml2Document
 from scrapy.xpath.factories import xmlDoc_from_html, xmlDoc_from_xml
+from scrapy.xpath.document import Libxml2Document
 from scrapy.utils.python import flatten, unicode_to_str
 from scrapy.utils.misc import extract_regex
@@ -19,11 +19,7 @@ class XPathSelector(object):
             self.doc = parent.doc
             self.xmlNode = node
         elif response:
-            try:
-                # try with cached version first
-                self.doc = response.getlibxml2doc(factory=self._get_libxml2_doc)
-            except AttributeError:
-                self.doc = Libxml2Document(response, factory=self._get_libxml2_doc)
+            self.doc = Libxml2Document(response, factory=self._get_libxml2_doc)
             self.xmlNode = self.doc.xmlDoc
         elif text:
             response = TextResponse(url=None, body=unicode_to_str(text), \
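Since the cache in Libxml2Document is keyed first by response and then by factory, the selector change above means one response can carry at most one cached document per factory function. A usage sketch built only from names visible in this diff (a sketch, not a definitive test of the post-commit API):

from scrapy.http import TextResponse
from scrapy.xpath.document import Libxml2Document
from scrapy.xpath.factories import xmlDoc_from_html, xmlDoc_from_xml

response = TextResponse('http://www.example.com', body='<root><a>1</a></root>')
# same response, different factories: two distinct cached documents
html_doc = Libxml2Document(response, factory=xmlDoc_from_html)
xml_doc = Libxml2Document(response, factory=xmlDoc_from_xml)
assert html_doc is not xml_doc
# same response, same factory: the cached instance is returned
assert Libxml2Document(response, factory=xmlDoc_from_xml) is xml_doc
# drop references so libxml2 leak reports stay clean, as the tests do
del html_doc, xml_doc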