mirror of
https://github.com/scrapy/scrapy.git
synced 2025-02-25 15:24:15 +00:00
removed unnecessary ResponseLibxml2 extension and moved libxml2 document caching functionality to Libxml2Document using weak references
This commit is contained in:
parent
21f2bb6797
commit
d334c035c0
@ -23,19 +23,6 @@ Core Stats extension
|
|||||||
Enable the collection of core statistics, provided the stats collection is
|
Enable the collection of core statistics, provided the stats collection is
|
||||||
enabled (see :ref:`topics-stats`).
|
enabled (see :ref:`topics-stats`).
|
||||||
|
|
||||||
Response Libxml2 extension
|
|
||||||
--------------------------
|
|
||||||
|
|
||||||
.. module:: scrapy.xpath.extension
|
|
||||||
:synopsis: Libxml2 document caching for Responses
|
|
||||||
|
|
||||||
.. class:: scrapy.xpath.extension.ResponseLibxml2
|
|
||||||
|
|
||||||
Causes the :class:`~scrapy.http.Response` objects to grow a new method
|
|
||||||
(``getlibxml2doc()``) which returns a (cached) libxml2 document of their
|
|
||||||
contents. :ref:`XPath Selectors <topics-selectors>` use this extension for
|
|
||||||
better performance, so it's highly recommended not to disable it.
|
|
||||||
|
|
||||||
.. _ref-extensions-webconsole:
|
.. _ref-extensions-webconsole:
|
||||||
|
|
||||||
Web console extension
|
Web console extension
|
||||||
@ -284,13 +271,3 @@ Stats collector dump WC extension
|
|||||||
.. class:: scrapy.contrib.webconsole.stats.StatsDump
|
.. class:: scrapy.contrib.webconsole.stats.StatsDump
|
||||||
|
|
||||||
Display the stats collected so far by the stats collector.
|
Display the stats collected so far by the stats collector.
|
||||||
|
|
||||||
Spider stats WC extension
|
|
||||||
-------------------------
|
|
||||||
|
|
||||||
.. module:: scrapy.contrib.webconsole.spiderstats
|
|
||||||
:synopsis: Spider stats web console extension
|
|
||||||
|
|
||||||
.. class:: scrapy.contrib.webconsole.spiderstats.SpiderStats
|
|
||||||
|
|
||||||
|
|
||||||
|
@ -390,7 +390,6 @@ Default::
|
|||||||
|
|
||||||
[
|
[
|
||||||
'scrapy.stats.corestats.CoreStats',
|
'scrapy.stats.corestats.CoreStats',
|
||||||
'scrapy.xpath.extension.ResponseLibxml2',
|
|
||||||
'scrapy.management.web.WebConsole',
|
'scrapy.management.web.WebConsole',
|
||||||
'scrapy.management.telnet.TelnetConsole',
|
'scrapy.management.telnet.TelnetConsole',
|
||||||
'scrapy.contrib.webconsole.scheduler.SchedulerQueue',
|
'scrapy.contrib.webconsole.scheduler.SchedulerQueue',
|
||||||
@ -398,7 +397,6 @@ Default::
|
|||||||
'scrapy.contrib.webconsole.spiderctl.Spiderctl',
|
'scrapy.contrib.webconsole.spiderctl.Spiderctl',
|
||||||
'scrapy.contrib.webconsole.enginestatus.EngineStatus',
|
'scrapy.contrib.webconsole.enginestatus.EngineStatus',
|
||||||
'scrapy.contrib.webconsole.stats.StatsDump',
|
'scrapy.contrib.webconsole.stats.StatsDump',
|
||||||
'scrapy.contrib.webconsole.spiderstats.SpiderStats',
|
|
||||||
'scrapy.contrib.spider.reloader.SpiderReloader',
|
'scrapy.contrib.spider.reloader.SpiderReloader',
|
||||||
'scrapy.contrib.memusage.MemoryUsage',
|
'scrapy.contrib.memusage.MemoryUsage',
|
||||||
'scrapy.contrib.memdebug.MemoryDebugger',
|
'scrapy.contrib.memdebug.MemoryDebugger',
|
||||||
|
@ -86,7 +86,6 @@ DUPEFILTER_CLASS = 'scrapy.contrib.dupefilter.RequestFingerprintDupeFilter'
|
|||||||
|
|
||||||
EXTENSIONS = [
|
EXTENSIONS = [
|
||||||
'scrapy.stats.corestats.CoreStats',
|
'scrapy.stats.corestats.CoreStats',
|
||||||
'scrapy.xpath.extension.ResponseLibxml2',
|
|
||||||
'scrapy.management.web.WebConsole',
|
'scrapy.management.web.WebConsole',
|
||||||
'scrapy.management.telnet.TelnetConsole',
|
'scrapy.management.telnet.TelnetConsole',
|
||||||
'scrapy.contrib.webconsole.scheduler.SchedulerQueue',
|
'scrapy.contrib.webconsole.scheduler.SchedulerQueue',
|
||||||
|
@ -1,36 +0,0 @@
|
|||||||
import unittest
|
|
||||||
|
|
||||||
import libxml2
|
|
||||||
|
|
||||||
from scrapy.http import TextResponse
|
|
||||||
from scrapy.utils.test import libxml2debug
|
|
||||||
|
|
||||||
class Libxml2Test(unittest.TestCase):
|
|
||||||
|
|
||||||
@libxml2debug
|
|
||||||
def test_xpath(self):
|
|
||||||
#this test will fail in version 2.6.27 but passes on 2.6.29+
|
|
||||||
html = "<td>1<b>2</b>3</td>"
|
|
||||||
node = libxml2.htmlParseDoc(html, 'utf-8')
|
|
||||||
result = [str(r) for r in node.xpathEval('//text()')]
|
|
||||||
self.assertEquals(result, ['1', '2', '3'])
|
|
||||||
node.freeDoc()
|
|
||||||
|
|
||||||
class ResponseLibxml2DocTest(unittest.TestCase):
|
|
||||||
|
|
||||||
@libxml2debug
|
|
||||||
def test_getlibxml2doc(self):
|
|
||||||
# test to simulate '\x00' char in body of html page
|
|
||||||
#this method shouldn't raise TypeError Exception
|
|
||||||
|
|
||||||
# make sure we load the libxml2 extension
|
|
||||||
from scrapy.extension import extensions
|
|
||||||
extensions.load() #
|
|
||||||
|
|
||||||
self.body_content = 'test problematic \x00 body'
|
|
||||||
response = TextResponse('http://example.com/catalog/product/blabla-123',
|
|
||||||
headers={'Content-Type': 'text/plain; charset=utf-8'}, body=self.body_content)
|
|
||||||
response.getlibxml2doc()
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
|
||||||
unittest.main()
|
|
@ -5,9 +5,10 @@ import libxml2
|
|||||||
|
|
||||||
from scrapy.http import TextResponse, HtmlResponse, XmlResponse
|
from scrapy.http import TextResponse, HtmlResponse, XmlResponse
|
||||||
from scrapy.xpath.selector import XmlXPathSelector, HtmlXPathSelector
|
from scrapy.xpath.selector import XmlXPathSelector, HtmlXPathSelector
|
||||||
|
from scrapy.xpath.document import Libxml2Document
|
||||||
from scrapy.utils.test import libxml2debug
|
from scrapy.utils.test import libxml2debug
|
||||||
|
|
||||||
class XPathTestCase(unittest.TestCase):
|
class XPathSelectorTestCase(unittest.TestCase):
|
||||||
|
|
||||||
@libxml2debug
|
@libxml2debug
|
||||||
def test_selector_simple(self):
|
def test_selector_simple(self):
|
||||||
@ -238,5 +239,44 @@ class XPathTestCase(unittest.TestCase):
|
|||||||
u'\n ',
|
u'\n ',
|
||||||
u'\n pff\n'])
|
u'\n pff\n'])
|
||||||
|
|
||||||
|
class Libxml2DocumentTest(unittest.TestCase):
|
||||||
|
|
||||||
|
@libxml2debug
|
||||||
|
def test_response_libxml2_caching(self):
|
||||||
|
r1 = HtmlResponse('http://www.example.com', body='<html><head></head><body></body></html>')
|
||||||
|
r2 = r1.copy()
|
||||||
|
|
||||||
|
doc1 = Libxml2Document(r1)
|
||||||
|
doc2 = Libxml2Document(r1)
|
||||||
|
doc3 = Libxml2Document(r2)
|
||||||
|
|
||||||
|
# make sure it's cached
|
||||||
|
assert doc1 is doc2
|
||||||
|
assert doc1.xmlDoc is doc2.xmlDoc
|
||||||
|
assert doc1 is not doc3
|
||||||
|
assert doc1.xmlDoc is not doc3.xmlDoc
|
||||||
|
|
||||||
|
# don't leave libxml2 documents in memory to avoid wrong libxml2 leaks reports
|
||||||
|
del doc1, doc2, doc3
|
||||||
|
|
||||||
|
@libxml2debug
|
||||||
|
def test_null_char(self):
|
||||||
|
# make sure bodies with null char ('\x00') don't raise a TypeError exception
|
||||||
|
self.body_content = 'test problematic \x00 body'
|
||||||
|
response = TextResponse('http://example.com/catalog/product/blabla-123',
|
||||||
|
headers={'Content-Type': 'text/plain; charset=utf-8'}, body=self.body_content)
|
||||||
|
Libxml2Document(response)
|
||||||
|
|
||||||
|
class Libxml2Test(unittest.TestCase):
|
||||||
|
|
||||||
|
@libxml2debug
|
||||||
|
def test_libxml2_bug_2_6_27(self):
|
||||||
|
# this test will fail in version 2.6.27 but passes on 2.6.29+
|
||||||
|
html = "<td>1<b>2</b>3</td>"
|
||||||
|
node = libxml2.htmlParseDoc(html, 'utf-8')
|
||||||
|
result = [str(r) for r in node.xpathEval('//text()')]
|
||||||
|
self.assertEquals(result, ['1', '2', '3'])
|
||||||
|
node.freeDoc()
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
unittest.main()
|
unittest.main()
|
||||||
|
@ -1,25 +0,0 @@
|
|||||||
import unittest
|
|
||||||
|
|
||||||
from scrapy.http import HtmlResponse
|
|
||||||
from scrapy.xpath.extension import ResponseLibxml2
|
|
||||||
|
|
||||||
class ResponseLibxml2Test(unittest.TestCase):
|
|
||||||
|
|
||||||
def setUp(self):
|
|
||||||
ResponseLibxml2()
|
|
||||||
|
|
||||||
def test_response_libxml2_caching(self):
|
|
||||||
r1 = HtmlResponse('http://www.example.com', body='<html><head></head><body></body></html>')
|
|
||||||
r2 = r1.copy()
|
|
||||||
|
|
||||||
doc1 = r1.getlibxml2doc()
|
|
||||||
doc2 = r1.getlibxml2doc()
|
|
||||||
doc3 = r2.getlibxml2doc()
|
|
||||||
|
|
||||||
# make sure it's cached
|
|
||||||
assert doc1 is doc2
|
|
||||||
assert doc1 is not doc3
|
|
||||||
|
|
||||||
# don't leave libxml2 documents in memory to avoid wrong libxml2 leaks reports
|
|
||||||
del doc1, doc2, doc3
|
|
||||||
|
|
@ -6,6 +6,8 @@ import os
|
|||||||
|
|
||||||
import libxml2
|
import libxml2
|
||||||
|
|
||||||
|
from scrapy.xpath.document import Libxml2Document
|
||||||
|
|
||||||
def libxml2debug(testfunction):
|
def libxml2debug(testfunction):
|
||||||
"""Decorator for debugging libxml2 memory leaks inside a function.
|
"""Decorator for debugging libxml2 memory leaks inside a function.
|
||||||
|
|
||||||
|
@ -1,15 +1,24 @@
|
|||||||
"""
|
"""
|
||||||
This module contains a simple class (Libxml2Document) to wrap libxml2 documents
|
This module contains a simple class (Libxml2Document) which provides cache and
|
||||||
(xmlDoc) for proper garbage collection.
|
garbage collection to libxml2 documents (xmlDoc).
|
||||||
"""
|
"""
|
||||||
|
|
||||||
|
import weakref
|
||||||
|
|
||||||
from scrapy.xpath.factories import xmlDoc_from_html
|
from scrapy.xpath.factories import xmlDoc_from_html
|
||||||
|
|
||||||
class Libxml2Document(object):
|
class Libxml2Document(object):
|
||||||
|
|
||||||
def __init__(self, response, factory=xmlDoc_from_html):
|
cache = weakref.WeakKeyDictionary()
|
||||||
self.xmlDoc = factory(response)
|
|
||||||
self.xpathContext = self.xmlDoc.xpathNewContext()
|
def __new__(cls, response, factory=xmlDoc_from_html):
|
||||||
|
cache = cls.cache.setdefault(response, {})
|
||||||
|
if factory not in cache:
|
||||||
|
obj = object.__new__(cls)
|
||||||
|
obj.xmlDoc = factory(response)
|
||||||
|
obj.xpathContext = obj.xmlDoc.xpathNewContext()
|
||||||
|
cache[factory] = obj
|
||||||
|
return cache[factory]
|
||||||
|
|
||||||
def __del__(self):
|
def __del__(self):
|
||||||
# we must call both cleanup functions, so we try/except all exceptions
|
# we must call both cleanup functions, so we try/except all exceptions
|
||||||
|
@ -1,20 +0,0 @@
|
|||||||
"""
|
|
||||||
The ResponseLibxml2 extension causes the Response objects to grow a new method
|
|
||||||
("getlibxml2doc") which returns a (cached) libxml2 document of itself.
|
|
||||||
"""
|
|
||||||
|
|
||||||
from scrapy.http import Response
|
|
||||||
from scrapy.xpath.document import Libxml2Document
|
|
||||||
from scrapy.xpath.factories import xmlDoc_from_html
|
|
||||||
|
|
||||||
class ResponseLibxml2(object):
|
|
||||||
def __init__(self):
|
|
||||||
setattr(Response, 'getlibxml2doc', getlibxml2doc)
|
|
||||||
|
|
||||||
def getlibxml2doc(response, factory=xmlDoc_from_html):
|
|
||||||
cachekey = 'lx2doc_%s' % factory.__name__
|
|
||||||
if cachekey not in response.cache:
|
|
||||||
lx2doc = Libxml2Document(response, factory=factory)
|
|
||||||
response.cache[cachekey] = lx2doc
|
|
||||||
return response.cache[cachekey]
|
|
||||||
|
|
@ -7,8 +7,8 @@ See documentation in docs/ref/selectors.rst
|
|||||||
import libxml2
|
import libxml2
|
||||||
|
|
||||||
from scrapy.http import TextResponse
|
from scrapy.http import TextResponse
|
||||||
from scrapy.xpath.extension import Libxml2Document
|
|
||||||
from scrapy.xpath.factories import xmlDoc_from_html, xmlDoc_from_xml
|
from scrapy.xpath.factories import xmlDoc_from_html, xmlDoc_from_xml
|
||||||
|
from scrapy.xpath.document import Libxml2Document
|
||||||
from scrapy.utils.python import flatten, unicode_to_str
|
from scrapy.utils.python import flatten, unicode_to_str
|
||||||
from scrapy.utils.misc import extract_regex
|
from scrapy.utils.misc import extract_regex
|
||||||
|
|
||||||
@ -19,11 +19,7 @@ class XPathSelector(object):
|
|||||||
self.doc = parent.doc
|
self.doc = parent.doc
|
||||||
self.xmlNode = node
|
self.xmlNode = node
|
||||||
elif response:
|
elif response:
|
||||||
try:
|
self.doc = Libxml2Document(response, factory=self._get_libxml2_doc)
|
||||||
# try with cached version first
|
|
||||||
self.doc = response.getlibxml2doc(factory=self._get_libxml2_doc)
|
|
||||||
except AttributeError:
|
|
||||||
self.doc = Libxml2Document(response, factory=self._get_libxml2_doc)
|
|
||||||
self.xmlNode = self.doc.xmlDoc
|
self.xmlNode = self.doc.xmlDoc
|
||||||
elif text:
|
elif text:
|
||||||
response = TextResponse(url=None, body=unicode_to_str(text), \
|
response = TextResponse(url=None, body=unicode_to_str(text), \
|
||||||
|
Loading…
x
Reference in New Issue
Block a user