1
0
mirror of https://github.com/scrapy/scrapy.git synced 2025-02-25 15:24:15 +00:00

removed unnecessary ResponseLibxml2 extension and moved libxml2 document caching functionality to Libxml2Document using weak references

This commit is contained in:
Pablo Hoffman 2009-08-10 20:52:43 -03:00
parent 21f2bb6797
commit d334c035c0
10 changed files with 59 additions and 119 deletions

View File

@ -23,19 +23,6 @@ Core Stats extension
Enable the collection of core statistics, provided that stats collection is
enabled (see :ref:`topics-stats`).
Response Libxml2 extension
--------------------------
.. module:: scrapy.xpath.extension
:synopsis: Libxml2 document caching for Responses
.. class:: scrapy.xpath.extension.ResponseLibxml2
Causes the :class:`~scrapy.http.Response` objects to grow a new method
(``getlibxml2doc()``) which returns a (cached) libxml2 document of their
contents. :ref:`XPath Selectors <topics-selectors>` use this extension for
better performance, so it's highly recommended not to disable it.
.. _ref-extensions-webconsole:
Web console extension
@ -284,13 +271,3 @@ Stats collector dump WC extension
.. class:: scrapy.contrib.webconsole.stats.StatsDump
Display the stats collected so far by the stats collector.
Spider stats WC extension
-------------------------
.. module:: scrapy.contrib.webconsole.spiderstats
:synopsis: Spider stats web console extension
.. class:: scrapy.contrib.webconsole.spiderstats.SpiderStats

View File

@ -390,7 +390,6 @@ Default::
[
'scrapy.stats.corestats.CoreStats',
'scrapy.xpath.extension.ResponseLibxml2',
'scrapy.management.web.WebConsole',
'scrapy.management.telnet.TelnetConsole',
'scrapy.contrib.webconsole.scheduler.SchedulerQueue',
@ -398,7 +397,6 @@ Default::
'scrapy.contrib.webconsole.spiderctl.Spiderctl',
'scrapy.contrib.webconsole.enginestatus.EngineStatus',
'scrapy.contrib.webconsole.stats.StatsDump',
'scrapy.contrib.webconsole.spiderstats.SpiderStats',
'scrapy.contrib.spider.reloader.SpiderReloader',
'scrapy.contrib.memusage.MemoryUsage',
'scrapy.contrib.memdebug.MemoryDebugger',

View File

@ -86,7 +86,6 @@ DUPEFILTER_CLASS = 'scrapy.contrib.dupefilter.RequestFingerprintDupeFilter'
EXTENSIONS = [
'scrapy.stats.corestats.CoreStats',
'scrapy.xpath.extension.ResponseLibxml2',
'scrapy.management.web.WebConsole',
'scrapy.management.telnet.TelnetConsole',
'scrapy.contrib.webconsole.scheduler.SchedulerQueue',

View File

@ -1,36 +0,0 @@
import unittest
import libxml2
from scrapy.http import TextResponse
from scrapy.utils.test import libxml2debug
class Libxml2Test(unittest.TestCase):
    # Sanity check for the installed libxml2 library itself, not Scrapy code.

    @libxml2debug
    def test_xpath(self):
        # this test will fail in version 2.6.27 but passes on 2.6.29+
        html = "<td>1<b>2</b>3</td>"
        node = libxml2.htmlParseDoc(html, 'utf-8')
        result = [str(r) for r in node.xpathEval('//text()')]
        self.assertEquals(result, ['1', '2', '3'])
        # free the C-level document explicitly; libxml2 documents are not
        # managed by Python's garbage collector
        node.freeDoc()
class ResponseLibxml2DocTest(unittest.TestCase):
    @libxml2debug
    def test_getlibxml2doc(self):
        # test to simulate '\x00' char in body of html page;
        # this method shouldn't raise a TypeError exception
        # make sure we load the libxml2 extension, which adds
        # Response.getlibxml2doc()
        from scrapy.extension import extensions
        extensions.load()
        self.body_content = 'test problematic \x00 body'
        response = TextResponse('http://example.com/catalog/product/blabla-123',
            headers={'Content-Type': 'text/plain; charset=utf-8'}, body=self.body_content)
        # only checks that no exception is raised; the document is discarded
        response.getlibxml2doc()
# Allow running this test module directly from the command line.
if __name__ == "__main__":
    unittest.main()

View File

@ -5,9 +5,10 @@ import libxml2
from scrapy.http import TextResponse, HtmlResponse, XmlResponse
from scrapy.xpath.selector import XmlXPathSelector, HtmlXPathSelector
from scrapy.xpath.document import Libxml2Document
from scrapy.utils.test import libxml2debug
class XPathTestCase(unittest.TestCase):
class XPathSelectorTestCase(unittest.TestCase):
@libxml2debug
def test_selector_simple(self):
@ -238,5 +239,44 @@ class XPathTestCase(unittest.TestCase):
u'\n ',
u'\n pff\n'])
class Libxml2DocumentTest(unittest.TestCase):
    # Tests for the Libxml2Document wrapper and its per-response caching.

    @libxml2debug
    def test_response_libxml2_caching(self):
        r1 = HtmlResponse('http://www.example.com', body='<html><head></head><body></body></html>')
        r2 = r1.copy()
        doc1 = Libxml2Document(r1)
        doc2 = Libxml2Document(r1)
        doc3 = Libxml2Document(r2)
        # make sure it's cached: the same response object yields the same
        # document instance, while a copied response (distinct object) does not
        assert doc1 is doc2
        assert doc1.xmlDoc is doc2.xmlDoc
        assert doc1 is not doc3
        assert doc1.xmlDoc is not doc3.xmlDoc
        # don't leave libxml2 documents in memory to avoid wrong libxml2 leaks reports
        del doc1, doc2, doc3

    @libxml2debug
    def test_null_char(self):
        # make sure bodies with null char ('\x00') don't raise a TypeError exception
        self.body_content = 'test problematic \x00 body'
        response = TextResponse('http://example.com/catalog/product/blabla-123',
            headers={'Content-Type': 'text/plain; charset=utf-8'}, body=self.body_content)
        Libxml2Document(response)
class Libxml2Test(unittest.TestCase):
    """Sanity checks for the installed libxml2 library itself, not Scrapy code."""

    @libxml2debug
    def test_libxml2_bug_2_6_27(self):
        # this test will fail in version 2.6.27 but passes on 2.6.29+
        html = "<td>1<b>2</b>3</td>"
        node = libxml2.htmlParseDoc(html, 'utf-8')
        result = [str(r) for r in node.xpathEval('//text()')]
        # assertEqual instead of the deprecated assertEquals alias
        self.assertEqual(result, ['1', '2', '3'])
        # free the C-level document explicitly; libxml2 documents are not
        # managed by Python's garbage collector
        node.freeDoc()
# Allow running this test module directly from the command line.
if __name__ == "__main__":
    unittest.main()

View File

@ -1,25 +0,0 @@
import unittest
from scrapy.http import HtmlResponse
from scrapy.xpath.extension import ResponseLibxml2
class ResponseLibxml2Test(unittest.TestCase):
    def setUp(self):
        # Instantiating the extension monkey-patches Response with the
        # getlibxml2doc() method exercised below.
        ResponseLibxml2()

    def test_response_libxml2_caching(self):
        r1 = HtmlResponse('http://www.example.com', body='<html><head></head><body></body></html>')
        r2 = r1.copy()
        doc1 = r1.getlibxml2doc()
        doc2 = r1.getlibxml2doc()
        doc3 = r2.getlibxml2doc()
        # make sure it's cached: same response -> same document instance;
        # a copied response is a distinct object and gets its own document
        assert doc1 is doc2
        assert doc1 is not doc3
        # don't leave libxml2 documents in memory to avoid wrong libxml2 leaks reports
        del doc1, doc2, doc3

View File

@ -6,6 +6,8 @@ import os
import libxml2
from scrapy.xpath.document import Libxml2Document
def libxml2debug(testfunction):
"""Decorator for debugging libxml2 memory leaks inside a function.

View File

@ -1,15 +1,24 @@
"""
This module contains a simple class (Libxml2Document) to wrap libxml2 documents
(xmlDoc) for proper garbage collection.
This module contains a simple class (Libxml2Document) which provides cache and
garbage collection to libxml2 documents (xmlDoc).
"""
import weakref
from scrapy.xpath.factories import xmlDoc_from_html
class Libxml2Document(object):
def __init__(self, response, factory=xmlDoc_from_html):
self.xmlDoc = factory(response)
self.xpathContext = self.xmlDoc.xpathNewContext()
# response -> {factory -> Libxml2Document} cache. Weak keys ensure that
# cache entries (and their parsed documents) are released once the
# response itself is garbage collected.
cache = weakref.WeakKeyDictionary()

def __new__(cls, response, factory=xmlDoc_from_html):
    """Return the Libxml2Document for *response*, building and caching it
    on first use — at most one document per (response, factory) pair."""
    cache = cls.cache.setdefault(response, {})
    if factory not in cache:
        obj = object.__new__(cls)
        obj.xmlDoc = factory(response)
        obj.xpathContext = obj.xmlDoc.xpathNewContext()
        cache[factory] = obj
    return cache[factory]
def __del__(self):
# we must call both cleanup functions, so we try/except all exceptions

View File

@ -1,20 +0,0 @@
"""
The ResponseLibxml2 extension causes the Response objects to grow a new method
("getlibxml2doc") which returns a (cached) libxml2 document of itself.
"""
from scrapy.http import Response
from scrapy.xpath.document import Libxml2Document
from scrapy.xpath.factories import xmlDoc_from_html
class ResponseLibxml2(object):
    """Extension that, when instantiated, monkey-patches scrapy.http.Response
    with a getlibxml2doc() method (the module-level function below)."""

    def __init__(self):
        setattr(Response, 'getlibxml2doc', getlibxml2doc)
def getlibxml2doc(response, factory=xmlDoc_from_html):
    """Return a (cached) Libxml2Document built from *response*.

    The document is created at most once per (response, factory) pair and
    stored in ``response.cache`` under a key derived from the factory name.
    """
    key = 'lx2doc_%s' % factory.__name__
    if key not in response.cache:
        response.cache[key] = Libxml2Document(response, factory=factory)
    return response.cache[key]

View File

@ -7,8 +7,8 @@ See documentation in docs/ref/selectors.rst
import libxml2
from scrapy.http import TextResponse
from scrapy.xpath.extension import Libxml2Document
from scrapy.xpath.factories import xmlDoc_from_html, xmlDoc_from_xml
from scrapy.xpath.document import Libxml2Document
from scrapy.utils.python import flatten, unicode_to_str
from scrapy.utils.misc import extract_regex
@ -19,10 +19,6 @@ class XPathSelector(object):
self.doc = parent.doc
self.xmlNode = node
elif response:
try:
# try with cached version first
self.doc = response.getlibxml2doc(factory=self._get_libxml2_doc)
except AttributeError:
self.doc = Libxml2Document(response, factory=self._get_libxml2_doc)
self.xmlNode = self.doc.xmlDoc
elif text: