mirror of
https://github.com/scrapy/scrapy.git
synced 2025-02-25 15:24:15 +00:00
removed unnecessary ResponseLibxml2 extension and moved libxml2 document caching functionality to Libxml2Document using weak references
This commit is contained in:
parent
21f2bb6797
commit
d334c035c0
@ -23,19 +23,6 @@ Core Stats extension
|
||||
Enable the collection of core statistics, provided the stats collection are
|
||||
enabled (see :ref:`topics-stats`).
|
||||
|
||||
Response Libxml2 extension
|
||||
--------------------------
|
||||
|
||||
.. module:: scrapy.xpath.extension
|
||||
:synopsis: Libxml2 document caching for Responses
|
||||
|
||||
.. class:: scrapy.path.extension.ResponseLibxml2
|
||||
|
||||
Causes the :class:`~scrapy.http.Response` objects to grow a new method
|
||||
(``getlibxml2doc()``) which returns a (cached) libxml2 document of their
|
||||
contents. :ref:`XPath Selectors <topics-selectors>` use this extension for
|
||||
better performance, so it's highly recommended not to disable it.
|
||||
|
||||
.. _ref-extensions-webconsole:
|
||||
|
||||
Web console extension
|
||||
@ -284,13 +271,3 @@ Stats collector dump WC extension
|
||||
.. class:: scrapy.contrib.webconsole.stats.StatsDump
|
||||
|
||||
Display the stats collected so far by the stats collector.
|
||||
|
||||
Spider stats WC extension
|
||||
-------------------------
|
||||
|
||||
.. module:: scrapy.contrib.webconsole.spiderstats
|
||||
:synopsis: Spider stats web console extension
|
||||
|
||||
.. class:: scrapy.contrib.webconsole.spiderstats.SpiderStats
|
||||
|
||||
|
||||
|
@ -390,7 +390,6 @@ Default::
|
||||
|
||||
[
|
||||
'scrapy.stats.corestats.CoreStats',
|
||||
'scrapy.xpath.extension.ResponseLibxml2',
|
||||
'scrapy.management.web.WebConsole',
|
||||
'scrapy.management.telnet.TelnetConsole',
|
||||
'scrapy.contrib.webconsole.scheduler.SchedulerQueue',
|
||||
@ -398,7 +397,6 @@ Default::
|
||||
'scrapy.contrib.webconsole.spiderctl.Spiderctl',
|
||||
'scrapy.contrib.webconsole.enginestatus.EngineStatus',
|
||||
'scrapy.contrib.webconsole.stats.StatsDump',
|
||||
'scrapy.contrib.webconsole.spiderstats.SpiderStats',
|
||||
'scrapy.contrib.spider.reloader.SpiderReloader',
|
||||
'scrapy.contrib.memusage.MemoryUsage',
|
||||
'scrapy.contrib.memdebug.MemoryDebugger',
|
||||
|
@ -86,7 +86,6 @@ DUPEFILTER_CLASS = 'scrapy.contrib.dupefilter.RequestFingerprintDupeFilter'
|
||||
|
||||
EXTENSIONS = [
|
||||
'scrapy.stats.corestats.CoreStats',
|
||||
'scrapy.xpath.extension.ResponseLibxml2',
|
||||
'scrapy.management.web.WebConsole',
|
||||
'scrapy.management.telnet.TelnetConsole',
|
||||
'scrapy.contrib.webconsole.scheduler.SchedulerQueue',
|
||||
|
@ -1,36 +0,0 @@
|
||||
import unittest
|
||||
|
||||
import libxml2
|
||||
|
||||
from scrapy.http import TextResponse
|
||||
from scrapy.utils.test import libxml2debug
|
||||
|
||||
class Libxml2Test(unittest.TestCase):
|
||||
|
||||
@libxml2debug
|
||||
def test_xpath(self):
|
||||
#this test will fail in version 2.6.27 but passes on 2.6.29+
|
||||
html = "<td>1<b>2</b>3</td>"
|
||||
node = libxml2.htmlParseDoc(html, 'utf-8')
|
||||
result = [str(r) for r in node.xpathEval('//text()')]
|
||||
self.assertEquals(result, ['1', '2', '3'])
|
||||
node.freeDoc()
|
||||
|
||||
class ResponseLibxml2DocTest(unittest.TestCase):
|
||||
|
||||
@libxml2debug
|
||||
def test_getlibxml2doc(self):
|
||||
# test to simulate '\x00' char in body of html page
|
||||
#this method shouldn't raise TypeError Exception
|
||||
|
||||
# make sure we load the libxml2 extension
|
||||
from scrapy.extension import extensions
|
||||
extensions.load() #
|
||||
|
||||
self.body_content = 'test problematic \x00 body'
|
||||
response = TextResponse('http://example.com/catalog/product/blabla-123',
|
||||
headers={'Content-Type': 'text/plain; charset=utf-8'}, body=self.body_content)
|
||||
response.getlibxml2doc()
|
||||
|
||||
if __name__ == "__main__":
|
||||
unittest.main()
|
@ -5,9 +5,10 @@ import libxml2
|
||||
|
||||
from scrapy.http import TextResponse, HtmlResponse, XmlResponse
|
||||
from scrapy.xpath.selector import XmlXPathSelector, HtmlXPathSelector
|
||||
from scrapy.xpath.document import Libxml2Document
|
||||
from scrapy.utils.test import libxml2debug
|
||||
|
||||
class XPathTestCase(unittest.TestCase):
|
||||
class XPathSelectorTestCase(unittest.TestCase):
|
||||
|
||||
@libxml2debug
|
||||
def test_selector_simple(self):
|
||||
@ -238,5 +239,44 @@ class XPathTestCase(unittest.TestCase):
|
||||
u'\n ',
|
||||
u'\n pff\n'])
|
||||
|
||||
class Libxml2DocumentTest(unittest.TestCase):
|
||||
|
||||
@libxml2debug
|
||||
def test_response_libxml2_caching(self):
|
||||
r1 = HtmlResponse('http://www.example.com', body='<html><head></head><body></body></html>')
|
||||
r2 = r1.copy()
|
||||
|
||||
doc1 = Libxml2Document(r1)
|
||||
doc2 = Libxml2Document(r1)
|
||||
doc3 = Libxml2Document(r2)
|
||||
|
||||
# make sure it's cached
|
||||
assert doc1 is doc2
|
||||
assert doc1.xmlDoc is doc2.xmlDoc
|
||||
assert doc1 is not doc3
|
||||
assert doc1.xmlDoc is not doc3.xmlDoc
|
||||
|
||||
# don't leave libxml2 documents in memory to avoid wrong libxml2 leaks reports
|
||||
del doc1, doc2, doc3
|
||||
|
||||
@libxml2debug
|
||||
def test_null_char(self):
|
||||
# make sure bodies with null char ('\x00') don't raise a TypeError exception
|
||||
self.body_content = 'test problematic \x00 body'
|
||||
response = TextResponse('http://example.com/catalog/product/blabla-123',
|
||||
headers={'Content-Type': 'text/plain; charset=utf-8'}, body=self.body_content)
|
||||
Libxml2Document(response)
|
||||
|
||||
class Libxml2Test(unittest.TestCase):
|
||||
|
||||
@libxml2debug
|
||||
def test_libxml2_bug_2_6_27(self):
|
||||
# this test will fail in version 2.6.27 but passes on 2.6.29+
|
||||
html = "<td>1<b>2</b>3</td>"
|
||||
node = libxml2.htmlParseDoc(html, 'utf-8')
|
||||
result = [str(r) for r in node.xpathEval('//text()')]
|
||||
self.assertEquals(result, ['1', '2', '3'])
|
||||
node.freeDoc()
|
||||
|
||||
if __name__ == "__main__":
|
||||
unittest.main()
|
||||
|
@ -1,25 +0,0 @@
|
||||
import unittest
|
||||
|
||||
from scrapy.http import HtmlResponse
|
||||
from scrapy.xpath.extension import ResponseLibxml2
|
||||
|
||||
class ResponseLibxml2Test(unittest.TestCase):
|
||||
|
||||
def setUp(self):
|
||||
ResponseLibxml2()
|
||||
|
||||
def test_response_libxml2_caching(self):
|
||||
r1 = HtmlResponse('http://www.example.com', body='<html><head></head><body></body></html>')
|
||||
r2 = r1.copy()
|
||||
|
||||
doc1 = r1.getlibxml2doc()
|
||||
doc2 = r1.getlibxml2doc()
|
||||
doc3 = r2.getlibxml2doc()
|
||||
|
||||
# make sure it's cached
|
||||
assert doc1 is doc2
|
||||
assert doc1 is not doc3
|
||||
|
||||
# don't leave libxml2 documents in memory to avoid wrong libxml2 leaks reports
|
||||
del doc1, doc2, doc3
|
||||
|
@ -6,6 +6,8 @@ import os
|
||||
|
||||
import libxml2
|
||||
|
||||
from scrapy.xpath.document import Libxml2Document
|
||||
|
||||
def libxml2debug(testfunction):
|
||||
"""Decorator for debugging libxml2 memory leaks inside a function.
|
||||
|
||||
|
@ -1,15 +1,24 @@
|
||||
"""
|
||||
This module contains a simple class (Libxml2Document) to wrap libxml2 documents
|
||||
(xmlDoc) for proper garbage collection.
|
||||
This module contains a simple class (Libxml2Document) which provides cache and
|
||||
garbage collection to libxml2 documents (xmlDoc).
|
||||
"""
|
||||
|
||||
import weakref
|
||||
|
||||
from scrapy.xpath.factories import xmlDoc_from_html
|
||||
|
||||
class Libxml2Document(object):
|
||||
|
||||
def __init__(self, response, factory=xmlDoc_from_html):
|
||||
self.xmlDoc = factory(response)
|
||||
self.xpathContext = self.xmlDoc.xpathNewContext()
|
||||
cache = weakref.WeakKeyDictionary()
|
||||
|
||||
def __new__(cls, response, factory=xmlDoc_from_html):
|
||||
cache = cls.cache.setdefault(response, {})
|
||||
if factory not in cache:
|
||||
obj = object.__new__(cls)
|
||||
obj.xmlDoc = factory(response)
|
||||
obj.xpathContext = obj.xmlDoc.xpathNewContext()
|
||||
cache[factory] = obj
|
||||
return cache[factory]
|
||||
|
||||
def __del__(self):
|
||||
# we must call both cleanup functions, so we try/except all exceptions
|
||||
|
@ -1,20 +0,0 @@
|
||||
"""
|
||||
The ResponseLibxml2 extension causes the Response objects to grow a new method
|
||||
("getlibxml2doc") which returns a (cached) libxml2 document of itself.
|
||||
"""
|
||||
|
||||
from scrapy.http import Response
|
||||
from scrapy.xpath.document import Libxml2Document
|
||||
from scrapy.xpath.factories import xmlDoc_from_html
|
||||
|
||||
class ResponseLibxml2(object):
|
||||
def __init__(self):
|
||||
setattr(Response, 'getlibxml2doc', getlibxml2doc)
|
||||
|
||||
def getlibxml2doc(response, factory=xmlDoc_from_html):
|
||||
cachekey = 'lx2doc_%s' % factory.__name__
|
||||
if cachekey not in response.cache:
|
||||
lx2doc = Libxml2Document(response, factory=factory)
|
||||
response.cache[cachekey] = lx2doc
|
||||
return response.cache[cachekey]
|
||||
|
@ -7,8 +7,8 @@ See documentation in docs/ref/selectors.rst
|
||||
import libxml2
|
||||
|
||||
from scrapy.http import TextResponse
|
||||
from scrapy.xpath.extension import Libxml2Document
|
||||
from scrapy.xpath.factories import xmlDoc_from_html, xmlDoc_from_xml
|
||||
from scrapy.xpath.document import Libxml2Document
|
||||
from scrapy.utils.python import flatten, unicode_to_str
|
||||
from scrapy.utils.misc import extract_regex
|
||||
|
||||
@ -19,10 +19,6 @@ class XPathSelector(object):
|
||||
self.doc = parent.doc
|
||||
self.xmlNode = node
|
||||
elif response:
|
||||
try:
|
||||
# try with cached version first
|
||||
self.doc = response.getlibxml2doc(factory=self._get_libxml2_doc)
|
||||
except AttributeError:
|
||||
self.doc = Libxml2Document(response, factory=self._get_libxml2_doc)
|
||||
self.xmlNode = self.doc.xmlDoc
|
||||
elif text:
|
||||
|
Loading…
x
Reference in New Issue
Block a user