mirror of
https://github.com/scrapy/scrapy.git
synced 2025-02-25 05:44:03 +00:00
Fixes to the lxml link extractors API and encoding handling
This commit is contained in:
parent
97c322707a
commit
87e322e568
@@ -9,7 +9,7 @@ from lxml import etree
|
||||
import lxml.html
|
||||
|
||||
from scrapy.link import Link
|
||||
from scrapy.utils.python import unique as unique_list
|
||||
from scrapy.utils.python import unique as unique_list, str_to_unicode
|
||||
from scrapy.utils.url import safe_url_string, urljoin_rfc
|
||||
|
||||
class LxmlLinkExtractor(object):
|
||||
@@ -33,7 +33,7 @@ class LxmlLinkExtractor(object):
|
||||
for link in links:
|
||||
link.url = urljoin_rfc(base_url, link.url, response_encoding)
|
||||
link.url = safe_url_string(link.url, response_encoding)
|
||||
link.text = link.text.decode(response_encoding)
|
||||
link.text = str_to_unicode(link.text, response_encoding)
|
||||
ret.append(link)
|
||||
|
||||
return ret
|
||||
@@ -43,6 +43,10 @@ class LxmlLinkExtractor(object):
|
||||
return self._extract_links(response.body, response.url,
|
||||
response.encoding)
|
||||
|
||||
def matches(self, url):
|
||||
"""This extractor matches with any url, since it doesn't contain any patterns"""
|
||||
return True
|
||||
|
||||
|
||||
class LinkTarget(object):
|
||||
def __init__(self, scan_tag, scan_attr, process_attr):
|
||||
|
Loading…
x
Reference in New Issue
Block a user