Mirror of https://github.com/scrapy/scrapy.git (synced 2025-02-24 02:04:22 +00:00)

commit 18d303b5f1
parent fcc8d73840

    ported internal scrapy.utils imports to w3lib
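
The change is mechanical: helpers that Scrapy used to ship under scrapy.utils.url, scrapy.utils.http, scrapy.utils.markup and scrapy.utils.multipart are now imported from the standalone w3lib package (w3lib.url, w3lib.http, w3lib.html and w3lib.form), which is where they were split out to. A rough sketch of the mapping, illustrative only and not part of the commit (names such as urljoin_rfc and remove_entities existed in the w3lib of this era but were deprecated or renamed in later releases):

    # before: internal scrapy.utils modules
    # from scrapy.utils.url import is_url, safe_url_string, urljoin_rfc, file_uri_to_path
    # from scrapy.utils.http import basic_auth_header, headers_dict_to_raw, headers_raw_to_dict
    # from scrapy.utils.markup import remove_tags, remove_entities, replace_escape_chars

    # after: the standalone w3lib package
    from w3lib.url import is_url, safe_url_string, urljoin_rfc, file_uri_to_path
    from w3lib.http import basic_auth_header, headers_dict_to_raw, headers_raw_to_dict
    from w3lib.html import remove_tags, remove_entities, replace_escape_chars
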
@@ -1,10 +1,9 @@
 import sys
+from w3lib.url import is_url
 
 from scrapy import log
 from scrapy.command import ScrapyCommand
 from scrapy.conf import settings
 from scrapy.http import Request
-from scrapy.utils.url import is_url
 from scrapy.utils.conf import arglist_to_dict
 from scrapy.exceptions import UsageError
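
For context, is_url only does a cheap syntactic check for a known URL scheme (it fetches nothing); a minimal sketch of the behaviour the commands rely on, assuming the w3lib API of this era:

    from w3lib.url import is_url

    is_url("http://example.com/page")   # True: starts with a recognised scheme
    is_url("example.com/page")          # False: no scheme
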
@@ -11,10 +11,11 @@ import netrc
 from urlparse import urlparse, urljoin
 from subprocess import Popen, PIPE, check_call
 
+from w3lib.form import encode_multipart
+
 from scrapy.command import ScrapyCommand
 from scrapy.exceptions import UsageError
 from scrapy.utils.py26 import json
-from scrapy.utils.multipart import encode_multipart
 from scrapy.utils.http import basic_auth_header
 from scrapy.utils.conf import get_config, closest_scrapy_cfg
 
@@ -1,10 +1,11 @@
 import pprint
 
+from w3lib.url import is_url
+
 from scrapy import log
 from scrapy.command import ScrapyCommand
 from scrapy.http import Request
 from scrapy.spider import BaseSpider
-from scrapy.utils.url import is_url
 from scrapy.exceptions import UsageError
 
 class Command(ScrapyCommand):
@@ -1,9 +1,9 @@
+from w3lib.url import is_url
 from scrapy.command import ScrapyCommand
 from scrapy.http import Request
 from scrapy.item import BaseItem
 from scrapy.utils import display
 from scrapy.utils.spider import iterate_spider_output, create_spider_for_request
-from scrapy.utils.url import is_url
 from scrapy.exceptions import UsageError
 from scrapy import log
 
@@ -4,7 +4,7 @@ HTTP basic auth downloader middleware
 See documentation in docs/topics/downloader-middleware.rst
 """
 
-from scrapy.utils.http import basic_auth_header
+from w3lib.http import basic_auth_header
 from scrapy.utils.python import WeakKeyCache
 
 
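
For reference, basic_auth_header builds the value of an HTTP Authorization header from a username and password (base64 of "user:password"); whether it returns str or bytes depends on the w3lib version. A minimal sketch, not part of the commit:

    from w3lib.http import basic_auth_header

    # Value for an HTTP Authorization header, i.e. 'Basic ' + base64('user:pass')
    headers = {'Authorization': basic_auth_header('someuser', 'somepass')}
    # -> {'Authorization': 'Basic c29tZXVzZXI6c29tZXBhc3M='}
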
@@ -5,13 +5,14 @@ from os.path import join, exists
 from time import time
 import cPickle as pickle
 
+from w3lib.http import headers_dict_to_raw, headers_raw_to_dict
+
 from scrapy.xlib.pydispatch import dispatcher
 from scrapy import signals
 from scrapy.http import Headers
 from scrapy.exceptions import NotConfigured, IgnoreRequest
 from scrapy.core.downloader.responsetypes import responsetypes
 from scrapy.utils.request import request_fingerprint
-from scrapy.utils.http import headers_dict_to_raw, headers_raw_to_dict
 from scrapy.utils.httpobj import urlparse_cached
 from scrapy.utils.misc import load_object
 from scrapy.utils.project import data_path
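
For reference, these two w3lib.http helpers round-trip headers between a dict and the raw wire format, which is how the cache storage persists response headers; a minimal sketch, assuming the w3lib API of this era (str in, str out; newer releases work with bytes):

    from w3lib.http import headers_dict_to_raw, headers_raw_to_dict

    raw = headers_dict_to_raw({'Content-Type': 'text/html', 'Cache-Control': 'no-cache'})
    # 'Content-Type: text/html\r\nCache-Control: no-cache' (order may vary)
    headers = headers_raw_to_dict(raw)
    # values come back as lists, e.g. {'Content-Type': ['text/html'], ...}
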
@@ -1,6 +1,7 @@
+from w3lib.url import urljoin_rfc
+
 from scrapy import log
 from scrapy.http import HtmlResponse
-from scrapy.utils.url import urljoin_rfc
 from scrapy.utils.response import get_meta_refresh
 from scrapy.exceptions import IgnoreRequest
 from scrapy.conf import settings
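
For reference, urljoin_rfc resolves a possibly relative reference against a base URL, which is what the redirect and meta-refresh handling needs; a minimal sketch, assuming the w3lib API of this era (urljoin_rfc was deprecated in later w3lib releases in favour of the standard library urljoin):

    from w3lib.url import urljoin_rfc

    urljoin_rfc('http://example.com/a/b.html', '../c.html')
    # -> 'http://example.com/c.html'
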
@@ -12,14 +12,14 @@ from ftplib import FTP
 from shutil import copyfileobj
 
 from zope.interface import Interface, implements
 
 from twisted.internet import defer, threads
+from w3lib.url import file_uri_to_path
 
 from scrapy import log, signals
 from scrapy.xlib.pydispatch import dispatcher
 from scrapy.utils.ftp import ftp_makedirs_cwd
 from scrapy.exceptions import NotConfigured
 from scrapy.utils.misc import load_object
-from scrapy.utils.url import file_uri_to_path
 from scrapy.conf import settings
 
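
For reference, the two file-URI helpers convert between filesystem paths and file:// URIs, as the file feed storage and the file download handler need; a minimal sketch, not part of the commit (output shown for a POSIX path; Windows paths are encoded differently):

    from w3lib.url import path_to_file_uri, file_uri_to_path

    uri = path_to_file_uri('/tmp/export.jl')   # -> 'file:///tmp/export.jl'
    path = file_uri_to_path(uri)               # -> '/tmp/export.jl'
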
@@ -4,9 +4,10 @@ HTMLParser-based link extractor
 
 from HTMLParser import HTMLParser
 
+from w3lib.url import safe_url_string, urljoin_rfc
+
 from scrapy.link import Link
 from scrapy.utils.python import unique as unique_list
-from scrapy.utils.url import safe_url_string, urljoin_rfc
 
 class HtmlParserLinkExtractor(HTMLParser):
 
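
For reference, safe_url_string percent-escapes characters that are not safe in a URL, using the page encoding for non-ASCII text, so extracted hrefs can be turned into requests; a minimal sketch, assuming the w3lib API of this era:

    from w3lib.url import safe_url_string

    safe_url_string(u'http://example.com/price?q=10\u20ac', 'utf-8')   # second arg: page encoding
    # -> 'http://example.com/price?q=10%E2%82%AC'
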
@@ -3,9 +3,9 @@ This module implements the HtmlImageLinkExtractor for extracting
 image links only.
 """
 
-
+from w3lib.url import urljoin_rfc
 from scrapy.link import Link
-from scrapy.utils.url import canonicalize_url, urljoin_rfc
+from scrapy.utils.url import canonicalize_url
 from scrapy.utils.python import unicode_to_str, flatten
 from scrapy.selector.libxml2sel import XPathSelectorList, HtmlXPathSelector
 
@@ -7,10 +7,10 @@ because it collides with the lxml library module.
 
 from lxml import etree
 import lxml.html
+from w3lib.url import safe_url_string, urljoin_rfc
 
 from scrapy.link import Link
 from scrapy.utils.python import unique as unique_list, str_to_unicode
-from scrapy.utils.url import safe_url_string, urljoin_rfc
 
 class LxmlLinkExtractor(object):
     def __init__(self, tag="a", attr="href", process=None, unique=False):
@@ -1,7 +1,7 @@
 import re
 
-from scrapy.utils.url import urljoin_rfc
-from scrapy.utils.markup import remove_tags, remove_entities, replace_escape_chars
+from w3lib.url import urljoin_rfc
+from w3lib.html import remove_tags, remove_entities, replace_escape_chars
 
 from scrapy.link import Link
 from .sgml import SgmlLinkExtractor
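
For reference, the three w3lib.html helpers do the same cleanup the old scrapy.utils.markup functions did: strip markup, decode HTML entities, and drop escape characters. A minimal sketch, assuming the w3lib API of this era (remove_entities was later renamed replace_entities):

    from w3lib.html import remove_tags, remove_entities, replace_escape_chars

    text = remove_tags(u'<b>Price:\t 10&nbsp;&euro;</b>\n')   # -> u'Price:\t 10&nbsp;&euro;\n'
    text = remove_entities(text)                              # -> u'Price:\t 10\xa0\u20ac\n'
    text = replace_escape_chars(text)                         # removes \n, \t, \r by default
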
@@ -4,11 +4,13 @@ SGMLParser-based Link extractors
 
 import re
 
+from w3lib.url import safe_url_string, urljoin_rfc
+
 from scrapy.selector import HtmlXPathSelector
 from scrapy.link import Link
 from scrapy.utils.misc import arg_to_iter
 from scrapy.utils.python import FixedSGMLParser, unique as unique_list, str_to_unicode
-from scrapy.utils.url import safe_url_string, urljoin_rfc, canonicalize_url, url_is_from_any_domain
+from scrapy.utils.url import canonicalize_url, url_is_from_any_domain
 from scrapy.utils.response import get_base_url
 
 class BaseSgmlLinkExtractor(FixedSGMLParser):
@@ -1,9 +1,10 @@
 """Request Extractors"""
+from w3lib.url import safe_url_string, urljoin_rfc
+
 from scrapy.http import Request
 from scrapy.selector import HtmlXPathSelector
 from scrapy.utils.misc import arg_to_iter
 from scrapy.utils.python import FixedSGMLParser, str_to_unicode
-from scrapy.utils.url import safe_url_string, urljoin_rfc
 
 from itertools import ifilter
 
@@ -1,5 +1,5 @@
+from w3lib.url import file_uri_to_path
 from scrapy.core.downloader.responsetypes import responsetypes
-from scrapy.utils.url import file_uri_to_path
 from scrapy.utils.decorator import defers
 
 class FileDownloadHandler(object):
@@ -1,5 +1,5 @@
+from w3lib.http import headers_dict_to_raw
 from scrapy.utils.datatypes import CaselessDict
-from scrapy.utils.http import headers_dict_to_raw
 
 
 class Headers(CaselessDict):
@@ -7,8 +7,9 @@ See documentation in docs/topics/request-response.rst
 
 import copy
 
+from w3lib.url import safe_url_string
+
 from scrapy.http.headers import Headers
-from scrapy.utils.url import safe_url_string
 from scrapy.utils.trackref import object_ref
 from scrapy.utils.decorator import deprecated
 from scrapy.http.common import deprecated_setter
@@ -7,6 +7,7 @@ See documentation in docs/topics/shell.rst
 import signal
 
 from twisted.internet import reactor, threads
+from w3lib.url import any_to_uri
 
 from scrapy.item import BaseItem
 from scrapy.spider import BaseSpider
@@ -14,7 +15,6 @@ from scrapy.selector import XPathSelector, XmlXPathSelector, HtmlXPathSelector
 from scrapy.utils.spider import create_spider_for_request
 from scrapy.utils.misc import load_object
 from scrapy.utils.response import open_in_browser
-from scrapy.utils.url import any_to_uri
 from scrapy.utils.console import start_python_console
 from scrapy.settings import Settings
 from scrapy.http import Request, Response, HtmlResponse, XmlResponse
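
For reference, any_to_uri is what lets the shell accept either a URL or a local path: URLs pass through unchanged, while bare paths are converted to file:// URIs. A minimal sketch, assuming the w3lib API of this era:

    from w3lib.url import any_to_uri

    any_to_uri('http://example.com/index.html')   # already a URI, returned as-is
    any_to_uri('/tmp/page.html')                  # -> 'file:///tmp/page.html' (on POSIX)
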
@@ -1,13 +1,13 @@
 import os, urlparse
+from cStringIO import StringIO
 
 from zope.interface.verify import verifyObject
 from twisted.trial import unittest
 from twisted.internet import defer
-from cStringIO import StringIO
+from w3lib.url import path_to_file_uri
 
 from scrapy.spider import BaseSpider
 from scrapy.contrib.feedexport import IFeedStorage, FileFeedStorage, FTPFeedStorage, S3FeedStorage, StdoutFeedStorage
-from scrapy.utils.url import path_to_file_uri
 from scrapy.utils.test import assert_aws_environ
 
 class FeedStorageTest(unittest.TestCase):
@@ -8,6 +8,7 @@ from twisted.web import server, static, util, resource
 from twisted.web.test.test_webclient import ForeverTakingResource, \
     NoLengthResource, HostHeaderResource, \
     PayloadResource, BrokenDownloadResource
+from w3lib.url import path_to_file_uri
 
 from scrapy.core.downloader.webclient import PartialDownloadError
 from scrapy.core.downloader.handlers.file import FileDownloadHandler
@@ -15,7 +16,6 @@ from scrapy.core.downloader.handlers.http import HttpDownloadHandler
 from scrapy.core.downloader.handlers.s3 import S3DownloadHandler
 from scrapy.spider import BaseSpider
 from scrapy.http import Request
-from scrapy.utils.url import path_to_file_uri
 from scrapy import optional_features
 
 
@@ -4,8 +4,8 @@ import re
 import hashlib
 from pkgutil import iter_modules
 
+from w3lib.html import remove_entities
 from scrapy.utils.python import flatten
-from scrapy.utils.markup import remove_entities
 
 def arg_to_iter(arg):
     """Convert an argument to an iterable. The argument can be a None, single
@@ -8,9 +8,10 @@ import weakref
 from base64 import urlsafe_b64encode
 from urlparse import urlunparse
 
+from w3lib.http import basic_auth_header
+
 from scrapy.utils.url import canonicalize_url
 from scrapy.utils.httpobj import urlparse_cached
-from scrapy.utils.http import basic_auth_header
 
 
 _fingerprint_cache = weakref.WeakKeyDictionary()