mirror of
https://github.com/scrapy/scrapy.git
synced 2025-02-24 10:43:48 +00:00
This commit is contained in:
parent
b14dabb281
commit
66829c962f
@ -121,7 +121,7 @@ class SgmlLinkExtractor(BaseSgmlLinkExtractor):
|
||||
body = u''.join(f
|
||||
for x in self.restrict_xpaths
|
||||
for f in sel.xpath(x).extract()
|
||||
).encode(response.encoding)
|
||||
).encode(response.encoding, errors='xmlcharrefreplace')
|
||||
else:
|
||||
body = response.body
|
||||
|
||||
|
@ -236,6 +236,13 @@ class SgmlLinkExtractorTestCase(unittest.TestCase):
|
||||
self.assertEqual(lx.extract_links(response),
|
||||
[Link(url='http://example.org/about.html', text=u'About us\xa3')])
|
||||
|
||||
def test_restrict_xpaths_with_html_entities(self):
|
||||
html = '<html><body><p><a href="/&hearts;/you?c=&euro;">text</a></p></body></html>'
|
||||
response = HtmlResponse("http://example.org/somepage/index.html", body=html, encoding='iso8859-15')
|
||||
links = SgmlLinkExtractor(restrict_xpaths='//p').extract_links(response)
|
||||
self.assertEqual(links,
|
||||
[Link(url='http://example.org/%E2%99%A5/you?c=%E2%82%AC', text=u'text')])
|
||||
|
||||
def test_restrict_xpaths_concat_in_handle_data(self):
|
||||
"""html entities cause SGMLParser to call handle_data hook twice"""
|
||||
body = """<html><body><div><a href="/foo">&gt;\xbe\xa9&lt;\xb6\xab</a></body></html>"""
|
||||
|
Loading…
x
Reference in New Issue
Block a user