1
0
mirror of https://github.com/scrapy/scrapy.git synced 2025-02-23 23:43:52 +00:00

LinkExtractor PY3 'unicode' type fix

This commit is contained in:
nyov 2015-07-29 15:34:27 +00:00
parent de89b1b562
commit e8205f6733
2 changed files with 4 additions and 2 deletions

View File

@@ -3,6 +3,7 @@ HTMLParser-based link extractor
""" """
import warnings import warnings
import six
from six.moves.html_parser import HTMLParser from six.moves.html_parser import HTMLParser
from six.moves.urllib.parse import urljoin from six.moves.urllib.parse import urljoin
@@ -39,7 +40,7 @@ class HtmlParserLinkExtractor(HTMLParser):
ret = [] ret = []
base_url = urljoin(response_url, self.base_url) if self.base_url else response_url base_url = urljoin(response_url, self.base_url) if self.base_url else response_url
for link in links: for link in links:
- if isinstance(link.url, unicode):
+ if isinstance(link.url, six.text_type):
link.url = link.url.encode(response_encoding) link.url = link.url.encode(response_encoding)
try: try:
link.url = urljoin(base_url, link.url) link.url = urljoin(base_url, link.url)

View File

@@ -1,6 +1,7 @@
""" """
SGMLParser-based Link extractors SGMLParser-based Link extractors
""" """
import six
from six.moves.urllib.parse import urljoin from six.moves.urllib.parse import urljoin
import warnings import warnings
from sgmllib import SGMLParser from sgmllib import SGMLParser
@@ -40,7 +41,7 @@ class BaseSgmlLinkExtractor(SGMLParser):
if base_url is None: if base_url is None:
base_url = urljoin(response_url, self.base_url) if self.base_url else response_url base_url = urljoin(response_url, self.base_url) if self.base_url else response_url
for link in self.links: for link in self.links:
- if isinstance(link.url, unicode):
+ if isinstance(link.url, six.text_type):
link.url = link.url.encode(response_encoding) link.url = link.url.encode(response_encoding)
try: try:
link.url = urljoin(base_url, link.url) link.url = urljoin(base_url, link.url)