1
0
mirror of https://github.com/scrapy/scrapy.git synced 2025-02-28 17:57:55 +00:00

issue GH #1550 - scrapy shell argument fixes: "example.com" requests "http://example.com"; "example" requests "file://example"; "./example.com" requests "file://example.com"

This commit is contained in:
Leonid Amirov 2015-11-02 16:08:19 +03:00
parent a41c64bfb9
commit bc9db65358

View File

@@ -5,11 +5,13 @@ See documentation in docs/topics/shell.rst
""" """
from threading import Thread from threading import Thread
import urlparse
from w3lib.url import any_to_uri from w3lib.url import any_to_uri
from scrapy.commands import ScrapyCommand from scrapy.commands import ScrapyCommand
from scrapy.shell import Shell from scrapy.shell import Shell
from scrapy.http import Request from scrapy.http import Request
from scrapy.utils.url import add_http_if_no_scheme
from scrapy.utils.spider import spidercls_for_request, DefaultSpider from scrapy.utils.spider import spidercls_for_request, DefaultSpider
@@ -43,8 +45,17 @@ class Command(ScrapyCommand):
def run(self, args, opts): def run(self, args, opts):
url = args[0] if args else None url = args[0] if args else None
if url: if url:
parts = urlparse.urlsplit(url)
if not parts.scheme:
if "." not in parts.path.split("/", 1)[0]:
url = any_to_uri(url) url = any_to_uri(url)
for pattern in ["/", "./", "../"]:
if url.startswith(pattern):
url = any_to_uri(url)
break
url = add_http_if_no_scheme(url)
spider_loader = self.crawler_process.spider_loader spider_loader = self.crawler_process.spider_loader
spidercls = DefaultSpider spidercls = DefaultSpider