Mirror of https://github.com/scrapy/scrapy.git (synced 2025-02-22 17:04:00 +00:00)
PY3: use six for robotparser and urlparse

parent 4f24e724a3
commit 6f7efa1d1d
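For context, a minimal sketch of the compatibility idiom this commit adopts (the example itself is not part of the commit): `six.moves.urllib.parse` resolves to the Python 2 `urlparse` module or the Python 3 `urllib.parse` module, and `six.moves.urllib.robotparser` does the same for `robotparser`, so a single import line works on both interpreters.

>>> from six.moves.urllib.parse import urlparse, urljoin
>>> urlparse("http://example.com/a/b?x=1").netloc
'example.com'
>>> urljoin("http://example.com/a/", "c")
'http://example.com/a/c'
>>> from six.moves.urllib import robotparser
>>> rp = robotparser.RobotFileParser()
>>> rp.parse(["User-agent: *", "Disallow: /private"])
>>> rp.can_fetch("mybot", "http://example.com/private")
False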
@@ -13,7 +13,7 @@ command).
 
 from __future__ import print_function
 import sys, optparse, urllib, json
-from urlparse import urljoin
+from six.moves.urllib.parse import urljoin
 
 from scrapy.utils.jsonrpc import jsonrpc_client_call, JsonRpcError
 
@@ -8,7 +8,7 @@ import time
 import urllib2
 import netrc
 import json
-from urlparse import urlparse, urljoin
+from six.moves.urllib.parse import urlparse, urljoin
 from subprocess import Popen, PIPE, check_call
 
 from w3lib.form import encode_multipart
@@ -1,7 +1,7 @@
 import base64
 from urllib import getproxies, unquote, proxy_bypass
 from urllib2 import _parse_proxy
-from urlparse import urlunparse
+from six.moves.urllib.parse import urlunparse
 
 from scrapy.utils.httpobj import urlparse_cached
 from scrapy.exceptions import NotConfigured
@@ -1,4 +1,4 @@
-from urlparse import urljoin
+from six.moves.urllib.parse import urljoin
 
 from scrapy import log
 from scrapy.http import HtmlResponse
@@ -4,7 +4,7 @@ enable this middleware and enable the ROBOTSTXT_OBEY setting.
 
 """
 
-import robotparser
+from six.moves.urllib import robotparser
 
 from scrapy import signals, log
 from scrapy.exceptions import NotConfigured, IgnoreRequest
@@ -7,7 +7,7 @@ See documentation in docs/topics/feed-exports.rst
 import sys, os, posixpath
 from tempfile import TemporaryFile
 from datetime import datetime
-from urlparse import urlparse
+from six.moves.urllib.parse import urlparse
 from ftplib import FTP
 
 from zope.interface import Interface, implements
@@ -3,7 +3,7 @@ HTMLParser-based link extractor
 """
 
 from HTMLParser import HTMLParser
-from urlparse import urljoin
+from six.moves.urllib.parse import urljoin
 
 from w3lib.url import safe_url_string
 
@@ -3,7 +3,7 @@ Link extractor based on lxml.html
 """
 
 import re
-from urlparse import urlparse, urljoin
+from six.moves.urllib.parse import urlparse, urljoin
 
 import lxml.etree as etree
 
@@ -1,5 +1,5 @@
 import re
-from urlparse import urljoin
+from six.moves.urllib.parse import urljoin
 
 from w3lib.html import remove_tags, remove_entities, replace_escape_chars
 
@@ -1,7 +1,7 @@
 """
 SGMLParser-based Link extractors
 """
-from urlparse import urljoin
+from six.moves.urllib.parse import urljoin
 import warnings
 from sgmllib import SGMLParser
 
@@ -7,7 +7,7 @@ import os
 import os.path
 import rfc822
 import time
-import urlparse
+from six.moves.urllib.parse import urlparse
 from collections import defaultdict
 from cStringIO import StringIO
 import six
@@ -167,7 +167,7 @@ class FilesPipeline(MediaPipeline):
         if os.path.isabs(uri):  # to support win32 paths like: C:\\some\dir
             scheme = 'file'
         else:
-            scheme = urlparse.urlparse(uri).scheme
+            scheme = urlparse(uri).scheme
         store_cls = self.STORE_SCHEMES[scheme]
         return store_cls(uri)
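Note the call-site change in this hunk: the old code imported the urlparse *module* and called `urlparse.urlparse(uri)`, while `six.moves.urllib.parse` exports the *function*, so the pipeline now calls `urlparse(uri)` directly. A quick illustrative check, not part of the commit (the bucket URI is made up):

>>> from six.moves.urllib.parse import urlparse
>>> urlparse('s3://bucket/path/to/key').scheme
's3'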
@@ -29,7 +29,7 @@ In case of status 200 request, response.headers will come with two keys:
 """
 
 import re
-from urlparse import urlparse
+from six.moves.urllib.parse import urlparse
 from cStringIO import StringIO
 
 from twisted.internet import reactor
|
@ -4,7 +4,7 @@ import re
|
||||
|
||||
from time import time
|
||||
from cStringIO import StringIO
|
||||
from urlparse import urldefrag
|
||||
from six.moves.urllib.parse import urldefrag
|
||||
|
||||
from zope.interface import implements
|
||||
from twisted.internet import defer, reactor, protocol
|
||||
|
@@ -1,5 +1,5 @@
 from time import time
-from urlparse import urlparse, urlunparse, urldefrag
+from six.moves.urllib.parse import urlparse, urlunparse, urldefrag
 
 from twisted.web.client import HTTPClientFactory
 from twisted.web.http import HTTPClient
@@ -5,7 +5,8 @@ This module implements the FormRequest class which is a more covenient class
 See documentation in docs/topics/request-response.rst
 """
 
-import urllib, urlparse
+import urllib
+from six.moves.urllib.parse import urljoin
 import lxml.html
 import six
 from scrapy.http.request import Request
@@ -43,7 +44,7 @@ class FormRequest(Request):
 def _get_form_url(form, url):
     if url is None:
         return form.action or form.base_url
-    return urlparse.urljoin(form.base_url, url)
+    return urljoin(form.base_url, url)
 
 def _urlencode(seq, enc):
     values = [(unicode_to_str(k, enc), unicode_to_str(v, enc))
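`_get_form_url` relies on standard relative-URL resolution against the form's base URL; a small illustration with a hypothetical form action (not from the commit):

>>> from six.moves.urllib.parse import urljoin
>>> urljoin('http://example.com/app/page.html', 'submit.php')
'http://example.com/app/submit.php'
>>> urljoin('http://example.com/app/page.html', '/submit.php')
'http://example.com/submit.php'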
@@ -3,7 +3,7 @@ Common code and definitions used by Link extractors (located in
 scrapy.contrib.linkextractor).
 """
 import re
-from urlparse import urlparse
+from six.moves.urllib.parse import urlparse
 
-from scrapy.utils.url import url_is_from_any_domain
+from scrapy.utils.url import canonicalize_url, url_is_from_any_domain, url_has_any_extension
 
@@ -1,4 +1,5 @@
-import os, urlparse
+import os
+from six.moves.urllib.parse import urlparse
 from cStringIO import StringIO
 
 from zope.interface.verify import verifyObject
@@ -84,7 +85,7 @@ class S3FeedStorageTest(unittest.TestCase):
         file = storage.open(Spider("default"))
         file.write("content")
         yield storage.store(file)
-        u = urlparse.urlparse(uri)
+        u = urlparse(uri)
         key = connect_s3().get_bucket(u.hostname, validate=False).get_key(u.path)
         self.failUnlessEqual(key.get_contents_as_string(), "content")
 
@@ -11,7 +11,8 @@ module with the ``runserver`` argument::
 """
 
 from __future__ import print_function
-import sys, os, re, urlparse
+import sys, os, re
+from six.moves.urllib.parse import urlparse
 
 from twisted.internet import reactor, defer
 from twisted.web import server, static, util
@@ -117,7 +118,7 @@ class CrawlerRun(object):
         return "http://localhost:%s%s" % (self.portno, path)
 
     def getpath(self, url):
-        u = urlparse.urlparse(url)
+        u = urlparse(url)
         return u.path
 
     def item_scraped(self, item, spider, response):
@@ -1,4 +1,4 @@
-from urlparse import urlparse
+from six.moves.urllib.parse import urlparse
 from unittest import TestCase
 
 from scrapy.http import Request, Response
@@ -1,7 +1,7 @@
 import cgi
 import unittest
 import xmlrpclib
-from urlparse import urlparse
+from six.moves.urllib.parse import urlparse
 
 from scrapy.http import Request, FormRequest, XmlRpcRequest, Headers, HtmlResponse
 
@@ -5,7 +5,7 @@ from scrapy.spider import Spider
 from scrapy.contrib.spidermiddleware.offsite import OffsiteMiddleware
 from scrapy.utils.test import get_crawler
 
-from urlparse import urlparse
+from six.moves.urllib.parse import urlparse
 
 class TestOffsiteMiddleware(TestCase):
 
@@ -1,4 +1,4 @@
-from urlparse import urlparse
+from six.moves.urllib.parse import urlparse
 import unittest
 
 
@@ -1,5 +1,5 @@
 import unittest
-import urlparse
+from six.moves.urllib.parse import urlparse
 
 from scrapy.http import Request
 from scrapy.utils.httpobj import urlparse_cached
@@ -13,7 +13,7 @@ class HttpobjUtilsTest(unittest.TestCase):
         req1a = urlparse_cached(request1)
         req1b = urlparse_cached(request1)
         req2 = urlparse_cached(request2)
-        urlp = urlparse.urlparse(url)
+        urlp = urlparse(url)
 
         assert req1a == req2
         assert req1a == urlp
@@ -1,6 +1,6 @@
 import os
 import unittest
-import urlparse
+from six.moves.urllib.parse import urlparse
 
 from scrapy.http import Response, TextResponse, HtmlResponse
 from scrapy.utils.response import response_httprepr, open_in_browser, get_meta_refresh
@@ -24,7 +24,7 @@ class ResponseUtilsTest(unittest.TestCase):
         url = "http:///www.example.com/some/page.html"
         body = "<html> <head> <title>test page</title> </head> <body>test body</body> </html>"
         def browser_open(burl):
-            path = urlparse.urlparse(burl).path
+            path = urlparse(burl).path
             if not os.path.exists(path):
                 path = burl.replace('file://', '')
             bbody = open(path).read()
@@ -3,7 +3,7 @@ from twisted.internet import defer
 Tests borrowed from the twisted.web.client tests.
 """
 import os
-from urlparse import urlparse
+from six.moves.urllib.parse import urlparse
 
 from twisted.trial import unittest
 from twisted.web import server, static, error, util
@@ -2,7 +2,7 @@
 
 import weakref
 
-from urlparse import urlparse
+from six.moves.urllib.parse import urlparse
 
 _urlparse_cache = weakref.WeakKeyDictionary()
 def urlparse_cached(request_or_response):
@@ -6,7 +6,7 @@ scrapy.http.Request objects
 from __future__ import print_function
 import hashlib
 import weakref
-from urlparse import urlunparse
+from six.moves.urllib.parse import urlunparse
 
 from twisted.internet.defer import Deferred
 from w3lib.http import basic_auth_header
@@ -1,5 +1,5 @@
 from __future__ import print_function
-import urlparse
+from six.moves.urllib.parse import urljoin
 
 from twisted.internet import reactor
 from twisted.web import server, resource, static, util
@@ -14,7 +14,7 @@ class SiteTest(object):
         self.site.stopListening()
 
     def url(self, path):
-        return urlparse.urljoin(self.baseurl, path)
+        return urljoin(self.baseurl, path)
 
 def test_site():
     r = resource.Resource()
@@ -6,7 +6,7 @@ Some of the functions that used to be imported from this module have been moved
 to the w3lib.url module. Always import those from there instead.
 """
 import posixpath
-import urlparse
+from six.moves.urllib.parse import ParseResult, urlunparse, urldefrag, urlparse
 import urllib
 import cgi
 
@@ -59,7 +59,7 @@ def canonicalize_url(url, keep_blank_values=True, keep_fragments=False,
     query = urllib.urlencode(keyvals)
     path = safe_url_string(_unquotepath(path)) or '/'
     fragment = '' if not keep_fragments else fragment
-    return urlparse.urlunparse((scheme, netloc.lower(), path, params, query, fragment))
+    return urlunparse((scheme, netloc.lower(), path, params, query, fragment))
 
 
 def _unquotepath(path):
@@ -72,8 +72,8 @@ def parse_url(url, encoding=None):
     """Return urlparsed url from the given argument (which could be an already
     parsed url)
     """
-    return url if isinstance(url, urlparse.ParseResult) else \
-        urlparse.urlparse(unicode_to_str(url, encoding))
+    return url if isinstance(url, ParseResult) else \
+        urlparse(unicode_to_str(url, encoding))
 
 
 def escape_ajax(url):
@@ -99,7 +99,7 @@ def escape_ajax(url):
     >>> escape_ajax("www.example.com/ajax.html")
     'www.example.com/ajax.html'
     """
-    defrag, frag = urlparse.urldefrag(url)
+    defrag, frag = urldefrag(url)
     if not frag.startswith('!'):
         return url
     return add_or_replace_parameter(defrag, '_escaped_fragment_', frag[1:])
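`escape_ajax` depends only on how `urldefrag` splits off the `#!` fragment, which the import swap does not change; an illustrative check, not from the commit (indexing the result keeps the output identical on Python 2 and 3, where the return types differ):

>>> from six.moves.urllib.parse import urldefrag
>>> urldefrag('http://example.com/page#!key=value')[1]
'!key=value'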