
PY3: use six for robotparser and urlparse

Felix Yan 2014-07-14 21:26:37 +08:00
parent 4f24e724a3
commit 6f7efa1d1d
29 changed files with 43 additions and 40 deletions
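
The change is mechanical: every "import urlparse" / "from urlparse import ..." becomes an import from six.moves.urllib.parse, which resolves to the urlparse module on Python 2 and to urllib.parse on Python 3, so one import line works under both interpreters. A rough sketch of what the alias provides (illustrative only, not part of this commit; the URLs are made up):

    from six.moves.urllib.parse import urljoin, urlparse, urldefrag, urlunparse

    print(urljoin('http://example.com/a/', 'b'))          # http://example.com/a/b
    print(urlparse('http://example.com/a?x=1').query)     # x=1
    print(urldefrag('http://example.com/page#frag')[0])   # http://example.com/page

    # Call sites that used the module prefix, e.g. urlparse.urlparse(url),
    # drop the prefix once the function itself is imported:
    parts = urlparse('http://example.com/some/page')
    print(urlunparse(parts))                               # http://example.com/some/page

On Python 2 these calls still go through the old urlparse module, so behaviour is unchanged; the gain is that the same files now import cleanly on Python 3.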

View File

@@ -13,7 +13,7 @@ command).
 from __future__ import print_function
 import sys, optparse, urllib, json
-from urlparse import urljoin
+from six.moves.urllib.parse import urljoin
 from scrapy.utils.jsonrpc import jsonrpc_client_call, JsonRpcError

View File

@@ -8,7 +8,7 @@ import time
 import urllib2
 import netrc
 import json
-from urlparse import urlparse, urljoin
+from six.moves.urllib.parse import urlparse, urljoin
 from subprocess import Popen, PIPE, check_call
 from w3lib.form import encode_multipart

View File

@@ -1,7 +1,7 @@
 import base64
 from urllib import getproxies, unquote, proxy_bypass
 from urllib2 import _parse_proxy
-from urlparse import urlunparse
+from six.moves.urllib.parse import urlunparse
 from scrapy.utils.httpobj import urlparse_cached
 from scrapy.exceptions import NotConfigured

View File

@@ -1,4 +1,4 @@
-from urlparse import urljoin
+from six.moves.urllib.parse import urljoin
 from scrapy import log
 from scrapy.http import HtmlResponse

View File

@@ -4,7 +4,7 @@ enable this middleware and enable the ROBOTSTXT_OBEY setting.
 """
-import robotparser
+from six.moves.urllib import robotparser
 from scrapy import signals, log
 from scrapy.exceptions import NotConfigured, IgnoreRequest
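
For the robots.txt middleware the matching alias is six.moves.urllib.robotparser, which points at the robotparser module on Python 2 and at urllib.robotparser on Python 3. A small usage sketch (illustrative only, not taken from the middleware code):

    from six.moves.urllib import robotparser

    rp = robotparser.RobotFileParser()
    rp.parse(['User-agent: *', 'Disallow: /private/'])
    print(rp.can_fetch('mybot', 'http://example.com/private/x'))  # False
    print(rp.can_fetch('mybot', 'http://example.com/public/x'))   # True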

View File

@@ -7,7 +7,7 @@ See documentation in docs/topics/feed-exports.rst
 import sys, os, posixpath
 from tempfile import TemporaryFile
 from datetime import datetime
-from urlparse import urlparse
+from six.moves.urllib.parse import urlparse
 from ftplib import FTP
 from zope.interface import Interface, implements

View File

@@ -3,7 +3,7 @@ HTMLParser-based link extractor
 """
 from HTMLParser import HTMLParser
-from urlparse import urljoin
+from six.moves.urllib.parse import urljoin
 from w3lib.url import safe_url_string

View File

@@ -3,7 +3,7 @@ Link extractor based on lxml.html
 """
 import re
-from urlparse import urlparse, urljoin
+from six.moves.urllib.parse import urlparse, urljoin
 import lxml.etree as etree

View File

@@ -1,5 +1,5 @@
 import re
-from urlparse import urljoin
+from six.moves.urllib.parse import urljoin
 from w3lib.html import remove_tags, remove_entities, replace_escape_chars

View File

@@ -1,7 +1,7 @@
 """
 SGMLParser-based Link extractors
 """
-from urlparse import urljoin
+from six.moves.urllib.parse import urljoin
 import warnings
 from sgmllib import SGMLParser

View File

@@ -7,7 +7,7 @@ import os
 import os.path
 import rfc822
 import time
-import urlparse
+from six.moves.urllib.parse import urlparse
 from collections import defaultdict
 from cStringIO import StringIO
 import six
@@ -167,7 +167,7 @@ class FilesPipeline(MediaPipeline):
         if os.path.isabs(uri): # to support win32 paths like: C:\\some\dir
             scheme = 'file'
         else:
-            scheme = urlparse.urlparse(uri).scheme
+            scheme = urlparse(uri).scheme
         store_cls = self.STORE_SCHEMES[scheme]
         return store_cls(uri)

View File

@@ -29,7 +29,7 @@ In case of status 200 request, response.headers will come with two keys:
 """
 import re
-from urlparse import urlparse
+from six.moves.urllib.parse import urlparse
 from cStringIO import StringIO
 from twisted.internet import reactor

View File

@@ -4,7 +4,7 @@ import re
 from time import time
 from cStringIO import StringIO
-from urlparse import urldefrag
+from six.moves.urllib.parse import urldefrag
 from zope.interface import implements
 from twisted.internet import defer, reactor, protocol

View File

@@ -1,5 +1,5 @@
 from time import time
-from urlparse import urlparse, urlunparse, urldefrag
+from six.moves.urllib.parse import urlparse, urlunparse, urldefrag
 from twisted.web.client import HTTPClientFactory
 from twisted.web.http import HTTPClient

View File

@@ -5,7 +5,8 @@ This module implements the FormRequest class which is a more covenient class
 See documentation in docs/topics/request-response.rst
 """
-import urllib, urlparse
+import urllib
+from six.moves.urllib.parse import urljoin
 import lxml.html
 import six
 from scrapy.http.request import Request
@@ -43,7 +44,7 @@ class FormRequest(Request):
 def _get_form_url(form, url):
     if url is None:
         return form.action or form.base_url
-    return urlparse.urljoin(form.base_url, url)
+    return urljoin(form.base_url, url)
 def _urlencode(seq, enc):
     values = [(unicode_to_str(k, enc), unicode_to_str(v, enc))

View File

@@ -3,7 +3,7 @@ Common code and definitions used by Link extractors (located in
 scrapy.contrib.linkextractor).
 """
 import re
-from urlparse import urlparse
+from six.moves.urllib.parse import urlparse
 from scrapy.utils.url import url_is_from_any_domain
 from scrapy.utils.url import canonicalize_url, url_is_from_any_domain, url_has_any_extension

View File

@@ -1,4 +1,5 @@
-import os, urlparse
+import os
+from six.moves.urllib.parse import urlparse
 from cStringIO import StringIO
 from zope.interface.verify import verifyObject
@@ -84,7 +85,7 @@ class S3FeedStorageTest(unittest.TestCase):
         file = storage.open(Spider("default"))
         file.write("content")
         yield storage.store(file)
-        u = urlparse.urlparse(uri)
+        u = urlparse(uri)
         key = connect_s3().get_bucket(u.hostname, validate=False).get_key(u.path)
         self.failUnlessEqual(key.get_contents_as_string(), "content")

View File

@@ -11,7 +11,8 @@ module with the ``runserver`` argument::
 """
 from __future__ import print_function
-import sys, os, re, urlparse
+import sys, os, re
+from six.moves.urllib.parse import urlparse
 from twisted.internet import reactor, defer
 from twisted.web import server, static, util
@@ -117,7 +118,7 @@ class CrawlerRun(object):
         return "http://localhost:%s%s" % (self.portno, path)
     def getpath(self, url):
-        u = urlparse.urlparse(url)
+        u = urlparse(url)
         return u.path
     def item_scraped(self, item, spider, response):

View File

@@ -1,4 +1,4 @@
-from urlparse import urlparse
+from six.moves.urllib.parse import urlparse
 from unittest import TestCase
 from scrapy.http import Request, Response

View File

@@ -1,7 +1,7 @@
 import cgi
 import unittest
 import xmlrpclib
-from urlparse import urlparse
+from six.moves.urllib.parse import urlparse
 from scrapy.http import Request, FormRequest, XmlRpcRequest, Headers, HtmlResponse

View File

@@ -5,7 +5,7 @@ from scrapy.spider import Spider
 from scrapy.contrib.spidermiddleware.offsite import OffsiteMiddleware
 from scrapy.utils.test import get_crawler
-from urlparse import urlparse
+from six.moves.urllib.parse import urlparse
 class TestOffsiteMiddleware(TestCase):

View File

@@ -1,4 +1,4 @@
-from urlparse import urlparse
+from six.moves.urllib.parse import urlparse
 import unittest

View File

@@ -1,5 +1,5 @@
 import unittest
-import urlparse
+from six.moves.urllib.parse import urlparse
 from scrapy.http import Request
 from scrapy.utils.httpobj import urlparse_cached
@@ -13,7 +13,7 @@ class HttpobjUtilsTest(unittest.TestCase):
         req1a = urlparse_cached(request1)
         req1b = urlparse_cached(request1)
         req2 = urlparse_cached(request2)
-        urlp = urlparse.urlparse(url)
+        urlp = urlparse(url)
         assert req1a == req2
         assert req1a == urlp

View File

@@ -1,6 +1,6 @@
 import os
 import unittest
-import urlparse
+from six.moves.urllib.parse import urlparse
 from scrapy.http import Response, TextResponse, HtmlResponse
 from scrapy.utils.response import response_httprepr, open_in_browser, get_meta_refresh
@@ -24,7+24,7 @@ class ResponseUtilsTest(unittest.TestCase):
         url = "http:///www.example.com/some/page.html"
         body = "<html> <head> <title>test page</title> </head> <body>test body</body> </html>"
         def browser_open(burl):
-            path = urlparse.urlparse(burl).path
+            path = urlparse(burl).path
             if not os.path.exists(path):
                 path = burl.replace('file://', '')
             bbody = open(path).read()

View File

@@ -3,7 +3,7 @@ from twisted.internet import defer
 Tests borrowed from the twisted.web.client tests.
 """
 import os
-from urlparse import urlparse
+from six.moves.urllib.parse import urlparse
 from twisted.trial import unittest
 from twisted.web import server, static, error, util

View File

@@ -2,7 +2,7 @@
 import weakref
-from urlparse import urlparse
+from six.moves.urllib.parse import urlparse
 _urlparse_cache = weakref.WeakKeyDictionary()
 def urlparse_cached(request_or_response):

View File

@@ -6,7 +6,7 @@ scrapy.http.Request objects
 from __future__ import print_function
 import hashlib
 import weakref
-from urlparse import urlunparse
+from six.moves.urllib.parse import urlunparse
 from twisted.internet.defer import Deferred
 from w3lib.http import basic_auth_header

View File

@@ -1,5 +1,5 @@
 from __future__ import print_function
-import urlparse
+from six.moves.urllib.parse import urljoin
 from twisted.internet import reactor
 from twisted.web import server, resource, static, util
@@ -14,7 +14,7 @@ class SiteTest(object):
         self.site.stopListening()
     def url(self, path):
-        return urlparse.urljoin(self.baseurl, path)
+        return urljoin(self.baseurl, path)
 def test_site():
     r = resource.Resource()

View File

@@ -6,7 +6,7 @@ Some of the functions that used to be imported from this module have been moved
 to the w3lib.url module. Always import those from there instead.
 """
 import posixpath
-import urlparse
+from six.moves.urllib.parse import ParseResult, urlunparse, urldefrag, urlparse
 import urllib
 import cgi
@@ -59,7 +59,7 @@ def canonicalize_url(url, keep_blank_values=True, keep_fragments=False,
     query = urllib.urlencode(keyvals)
     path = safe_url_string(_unquotepath(path)) or '/'
     fragment = '' if not keep_fragments else fragment
-    return urlparse.urlunparse((scheme, netloc.lower(), path, params, query, fragment))
+    return urlunparse((scheme, netloc.lower(), path, params, query, fragment))
 def _unquotepath(path):
@@ -72,8 +72,8 @@ def parse_url(url, encoding=None):
     """Return urlparsed url from the given argument (which could be an already
     parsed url)
     """
-    return url if isinstance(url, urlparse.ParseResult) else \
-        urlparse.urlparse(unicode_to_str(url, encoding))
+    return url if isinstance(url, ParseResult) else \
+        urlparse(unicode_to_str(url, encoding))
 def escape_ajax(url):
@@ -99,7 +99,7 @@ def escape_ajax(url):
     >>> escape_ajax("www.example.com/ajax.html")
     'www.example.com/ajax.html'
     """
-    defrag, frag = urlparse.urldefrag(url)
+    defrag, frag = urldefrag(url)
     if not frag.startswith('!'):
         return url
     return add_or_replace_parameter(defrag, '_escaped_fragment_', frag[1:])