Mirror of https://github.com/scrapy/scrapy.git (synced 2025-02-27 15:04:21 +00:00)

commit c75ac38b92 (parent eff09f2f78)

Improved parse command and LinkExtractor's matches method

--HG--
extra : convert_revision : svn%3Ab85faa78-f9eb-468e-a121-7cced6da292c%40456
@@ -1,46 +1,95 @@
-from scrapy.command.commands.parse_method import Command as ScrapyCommand
+from scrapy.command import ScrapyCommand
 from scrapy.fetcher import fetch
+from scrapy.http import Request
+from scrapy.item import ScrapedItem
 from scrapy.spider import spiders
+from scrapy.utils import display
 from scrapy import log
 
 class Command(ScrapyCommand):
     def syntax(self):
-        return "[options] <url>"
+        return "[options] <url> <method>"
 
     def short_desc(self):
-        return "Parse the URL and print its results"
+        return "Parse the URL with the given spider method and print the results"
 
     def add_options(self, parser):
         ScrapyCommand.add_options(self, parser)
-        parser.add_option("--identify", dest="identify", action="store_true", help="try to use identify instead of parse")
+        parser.add_option("--nolinks", dest="nolinks", action="store_true", help="don't show extracted links")
+        parser.add_option("--noitems", dest="noitems", action="store_true", help="don't show scraped items")
+        parser.add_option("--nocolour", dest="nocolour", action="store_true", help="avoid using pygments to colorize the output")
+        parser.add_option("--matches", dest="matches", action="store_true", help="avoid using pygments to colorize the output")
+
+    def pipeline_process(self, item, spider, opts):
+        return item
+
+    def run_method(self, spider, response, method, args, opts):
+        spider = spiders.fromurl(response.url)
+        if not spider:
+            log.msg('Couldnt find spider for url: %s' % response.url, level=log.ERROR)
+            return (), ()
+
+        items = []
+        links = []
+        if method:
+            method_fcn = method if callable(method) else getattr(spider, method, None)
+            if not method_fcn:
+                log.msg('Couldnt find method %s in %s spider' % (method, spider.domain_name))
+                return (), ()
+
+            result = method_fcn(response)
+            links = [i for i in result if isinstance(i, Request)]
+            items = [self.pipeline_process(i, spider, opts) for i in result if isinstance(i, ScrapedItem)]
+
+        return items, links
+
+    def print_results(self, items, links, opts):
+        display.nocolour = opts.nocolour
+        if not opts.noitems:
+            for item in items:
+                for key in item.__dict__.keys():
+                    if key.startswith('_'):
+                        item.__dict__.pop(key, None)
+            print "# Scraped Items", "-"*60
+            display.pprint(list(items))
+
+        if not opts.nolinks:
+            print "# Links", "-"*68
+            display.pprint(list(links))
 
     def run(self, args, opts):
-        if not args:
-            print "A URL is required"
-            return
+        if opts.matches:
+            url = args[0]
+            method = None
+        else:
+            if len(args) < 2:
+                print "A URL and method is required"
+                return
+            else:
+                url, method = args[:2]
 
         items = set()
         links = set()
-        responses = fetch(args)
-        for response in responses:
+        for response in fetch([url]):
             spider = spiders.fromurl(response.url)
-            if spider:
-                if opts.identify and hasattr(spider, 'identify_products'):
-                    ret_items, ret_links = ScrapyCommand.run_method(self, response, 'identify_products', args, opts)
-                else:
-                    if hasattr(spider, 'rules'):
-                        for rule in spider.rules:
-                            if rule.link_extractor.match(response.url):
-                                ret_items, ret_links = ScrapyCommand.run_method(self, response, rule.callback, args, opts)
-                                break
-                    else:
-                        ret_items, ret_links = ScrapyCommand.run_method(self, response, 'parse', args, opts)
-                items = items.union(ret_items)
-                links = links.union(ret_links)
-            else:
+            if not spider:
                 log.msg('Couldnt find spider for "%s"' % response.url)
                 continue
 
-        if items or links:
-            self.print_results(items, links, opts)
+            if method:
+                ret_items, ret_links = self.run_method(spider, response, method, args, opts)
+                items = items.union(ret_items)
+                links = links.union(ret_links)
+            else:
+                if hasattr(spider, 'rules'):
+                    for rule in spider.rules:
+                        if rule.link_extractor.matches(response.url):
+                            ret_items, ret_links = self.run_method(spider, response, rule.callback, args, opts)
+                            items = items.union(ret_items)
+                            links = links.union(ret_links)
+                else:
+                    log.msg('No rules found for spider "%s", please specify a parsing method' % spider.domain_name)
+                    continue
+
+        self.print_results(items, links, opts)
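The net effect of this hunk is that the parse command absorbs the run_method/print_results/pipeline_process machinery that previously lived in the separate parse_method command (deleted in the next hunk), and its run() gains two paths: an explicit <method> argument, or --matches, which walks the spider's rules and runs the callback of every rule whose link extractor matches the response URL. As a reading aid only, here is a small standalone sketch of that dispatch for a single response; Request, ScrapedItem, Rule and Response are minimal stand-ins for the Scrapy objects of that era, not the real classes, and the helper name parse_response is invented for illustration.

    # Standalone sketch (not part of the commit) of the dispatch performed by
    # the new run()/run_method(). The classes below are simplified stand-ins.
    class Request(object):
        def __init__(self, url):
            self.url = url

    class ScrapedItem(object):
        pass

    class Rule(object):
        def __init__(self, link_extractor, callback):
            self.link_extractor = link_extractor
            self.callback = callback

    class Response(object):
        def __init__(self, url):
            self.url = url

    def parse_response(spider, response, method=None):
        # An explicit method wins; otherwise (the --matches path) every rule
        # whose link extractor matches the response URL contributes results.
        items, links = set(), set()
        if method:
            callbacks = [method if callable(method) else getattr(spider, method, None)]
        else:
            callbacks = [rule.callback if callable(rule.callback) else getattr(spider, rule.callback, None)
                         for rule in getattr(spider, 'rules', [])
                         if rule.link_extractor.matches(response.url)]
        for callback in callbacks:
            if not callback:
                continue
            result = callback(response)
            # run_method() splits each callback's output by type
            links.update(r for r in result if isinstance(r, Request))
            items.update(r for r in result if isinstance(r, ScrapedItem))
        return items, links

With --matches no method is given, so the rules alone decide which callbacks run; the real command then prints both sets via print_results(), honouring --noitems and --nolinks.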
@@ -1,73 +0,0 @@
-from scrapy.command import ScrapyCommand
-from scrapy.fetcher import fetch
-from scrapy.http import Request
-from scrapy.item import ScrapedItem
-from scrapy.spider import spiders
-from scrapy.utils import display
-from scrapy import log
-
-
-class Command(ScrapyCommand):
-    def syntax(self):
-        return "[options] <url> <method>"
-
-    def short_desc(self):
-        return "Parse the URL with the given spider method and print the results"
-
-    def add_options(self, parser):
-        ScrapyCommand.add_options(self, parser)
-        parser.add_option("--links", dest="links", action="store_true", help="show extracted links")
-        parser.add_option("--noitems", dest="noitems", action="store_true", help="don't show scraped items")
-        parser.add_option("--nocolour", dest="nocolour", action="store_true", help="avoid using pygments to colorize the output")
-
-    def pipeline_process(self, item, opts):
-        item.set_guid(spider)
-        for key in item.__dict__.keys():
-            if key.startswith('_'):
-                item.__dict__.pop(key, None)
-        return item
-
-    def run_method(self, response, method, args, opts):
-        spider = spiders.fromurl(response.url)
-        if not spider:
-            log.msg('Couldnt find spider for url: %s' % response.url, level=log.ERROR)
-            return (), ()
-
-        items = []
-        links = []
-        if method:
-            method_fcn = method if callable(method) else getattr(spider, method, None)
-            if not method_fcn:
-                log.msg('Couldnt find method %s in spider %s' % (method, spider.__name__))
-                return (), ()
-
-            result = method_fcn(response)
-            links = [i for i in result if isinstance(i, Request)]
-            items = [self.pipeline_process(i, opts) for i in result if isinstance(i, ScrapedItem)]
-
-        return items, links
-
-    def print_results(self, items, links, opts):
-        display.nocolour = opts.nocolour
-        if not opts.noitems:
-            print "# Scraped Items", "-"*60
-            display.pprint(list(items))
-
-        if opts.links:
-            print "# Links", "-"*68
-            display.pprint(list(links))
-
-    def run(self, args, opts):
-        if len(args) < 2:
-            print "A URL and method is required"
-            return
-
-        items = set()
-        links = set()
-        url, method = args[:2]
-        for response in fetch([url]):
-            ret_items, ret_links = self.run_method(response, method, args, opts)
-            items = items.union(ret_items)
-            links = links.union(ret_links)
-
-        self.print_results(items, links, opts)
@@ -71,6 +71,11 @@ class LinkExtractor(FixedSGMLParser):
         if self.current_link and not self.current_link.text:
             self.current_link.text = data
 
+    def matches(self, url):
+        """This extractor matches with any url, since
+        it doesn't contain any patterns"""
+        return True
+
 
 class Link(object):
     """
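The base class gains matches() so the parse command's --matches loop can call rule.link_extractor.matches(response.url) on any extractor without checking its type; a plain LinkExtractor carries no allow/deny patterns, so it accepts every URL. A trivial, purely hypothetical illustration:

    # Hypothetical illustration only: a pattern-less extractor matches any URL,
    # mirroring the new base LinkExtractor.matches() above.
    class PatternlessExtractor(object):
        def matches(self, url):
            return True

    print(PatternlessExtractor().matches('http://example.com/anything'))  # True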
@@ -72,5 +72,12 @@ class RegexLinkExtractor(LinkExtractor):
 
         return links
 
-    def match(self, url):
-        return any(regex.search(url) for regex in self.allow_res) and not any(regex.search(url) for regex in self.deny_res)
+    def matches(self, url):
+        if self.allow_domains and not url_is_from_any_domain(url, self.allow_domains):
+            return False
+        if self.deny_domains and url_is_from_any_domain(url, self.deny_domains):
+            return False
+
+        allowed = [regex.search(url) for regex in self.allow_res]
+        denied = [regex.search(url) for regex in self.deny_res]
+        return any(allowed) and not any(denied)
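For RegexLinkExtractor, the renamed matches() now checks the domain whitelist/blacklist before the allow/deny regexes: a URL must come from an allowed domain (if any are set), must not come from a denied domain, must match at least one allow pattern, and must match no deny pattern. Below is a rough standalone mirror of that logic, not taken from the commit; the url_is_from_any_domain() stand-in is a simplified guess at what the real Scrapy helper does, and the patterns and URLs are made up.

    # Standalone mirror (not from the commit) of the new matches() semantics,
    # with a simplified stand-in for scrapy's url_is_from_any_domain() helper.
    import re
    from urlparse import urlparse  # Python 2, matching the era of this code

    def url_is_from_any_domain(url, domains):
        # Simplified: the real helper may treat ports and subdomains differently.
        host = urlparse(url).netloc.lower()
        return any(host == d or host.endswith('.' + d) for d in domains)

    def matches(url, allow_res, deny_res, allow_domains=(), deny_domains=()):
        if allow_domains and not url_is_from_any_domain(url, allow_domains):
            return False
        if deny_domains and url_is_from_any_domain(url, deny_domains):
            return False
        allowed = [regex.search(url) for regex in allow_res]
        denied = [regex.search(url) for regex in deny_res]
        return any(allowed) and not any(denied)

    allow = [re.compile(r'/product/')]
    deny = [re.compile(r'\.pdf$')]
    print(matches('http://example.com/product/123', allow, deny, allow_domains=['example.com']))  # True
    print(matches('http://other.org/product/123', allow, deny, allow_domains=['example.com']))    # False

Note that, as written, a URL is only accepted if at least one allow pattern matches it, since any(allowed) is False when no allow pattern produced a match.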