
Improved parse command and LinkExtractor's matches method

--HG--
extra : convert_revision : svn%3Ab85faa78-f9eb-468e-a121-7cced6da292c%40456
elpolilla 2008-12-02 12:20:55 +00:00
parent eff09f2f78
commit c75ac38b92
4 changed files with 88 additions and 100 deletions

View File

@@ -1,46 +1,95 @@
-from scrapy.command.commands.parse_method import Command as ScrapyCommand
+from scrapy.command import ScrapyCommand
 from scrapy.fetcher import fetch
+from scrapy.http import Request
+from scrapy.item import ScrapedItem
 from scrapy.spider import spiders
+from scrapy.utils import display
 from scrapy import log
 
 class Command(ScrapyCommand):
     def syntax(self):
-        return "[options] <url>"
+        return "[options] <url> <method>"
 
     def short_desc(self):
-        return "Parse the URL and print its results"
+        return "Parse the URL with the given spider method and print the results"
 
     def add_options(self, parser):
         ScrapyCommand.add_options(self, parser)
-        parser.add_option("--identify", dest="identify", action="store_true", help="try to use identify instead of parse")
+        parser.add_option("--nolinks", dest="nolinks", action="store_true", help="don't show extracted links")
+        parser.add_option("--noitems", dest="noitems", action="store_true", help="don't show scraped items")
+        parser.add_option("--nocolour", dest="nocolour", action="store_true", help="avoid using pygments to colorize the output")
+        parser.add_option("--matches", dest="matches", action="store_true", help="find the parsing method by matching the url against the spider's rules")
+
+    def pipeline_process(self, item, spider, opts):
+        return item
+
+    def run_method(self, spider, response, method, args, opts):
+        spider = spiders.fromurl(response.url)
+        if not spider:
+            log.msg('Couldnt find spider for url: %s' % response.url, level=log.ERROR)
+            return (), ()
+        items = []
+        links = []
+        if method:
+            method_fcn = method if callable(method) else getattr(spider, method, None)
+            if not method_fcn:
+                log.msg('Couldnt find method %s in %s spider' % (method, spider.domain_name))
+                return (), ()
+            result = method_fcn(response)
+            links = [i for i in result if isinstance(i, Request)]
+            items = [self.pipeline_process(i, spider, opts) for i in result if isinstance(i, ScrapedItem)]
+        return items, links
+
+    def print_results(self, items, links, opts):
+        display.nocolour = opts.nocolour
+        if not opts.noitems:
+            for item in items:
+                for key in item.__dict__.keys():
+                    if key.startswith('_'):
+                        item.__dict__.pop(key, None)
+            print "# Scraped Items", "-"*60
+            display.pprint(list(items))
+        if not opts.nolinks:
+            print "# Links", "-"*68
+            display.pprint(list(links))
 
     def run(self, args, opts):
-        if not args:
-            print "A URL is required"
-            return
+        if opts.matches:
+            url = args[0]
+            method = None
+        else:
+            if len(args) < 2:
+                print "A URL and method is required"
+                return
+            else:
+                url, method = args[:2]
+
         items = set()
         links = set()
-        responses = fetch(args)
-        for response in responses:
+        for response in fetch([url]):
             spider = spiders.fromurl(response.url)
-            if spider:
-                if opts.identify and hasattr(spider, 'identify_products'):
-                    ret_items, ret_links = ScrapyCommand.run_method(self, response, 'identify_products', args, opts)
-                else:
-                    if hasattr(spider, 'rules'):
-                        for rule in spider.rules:
-                            if rule.link_extractor.match(response.url):
-                                ret_items, ret_links = ScrapyCommand.run_method(self, response, rule.callback, args, opts)
-                                break
-                    else:
-                        ret_items, ret_links = ScrapyCommand.run_method(self, response, 'parse', args, opts)
-                items = items.union(ret_items)
-                links = links.union(ret_links)
-            else:
+            if not spider:
                 log.msg('Couldnt find spider for "%s"' % response.url)
-        if items or links:
-            self.print_results(items, links, opts)
+                continue
+            if method:
+                ret_items, ret_links = self.run_method(spider, response, method, args, opts)
+                items = items.union(ret_items)
+                links = links.union(ret_links)
+            else:
+                if hasattr(spider, 'rules'):
+                    for rule in spider.rules:
+                        if rule.link_extractor.matches(response.url):
+                            ret_items, ret_links = self.run_method(spider, response, rule.callback, args, opts)
+                            items = items.union(ret_items)
+                            links = links.union(ret_links)
+                else:
+                    log.msg('No rules found for spider "%s", please specify a parsing method' % spider.domain_name)
+                    continue
+        self.print_results(items, links, opts)
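A behavioural detail worth noting in the new run(): with --matches the parsing method is resolved from the spider's rules, and unlike the old code the loop no longer break-s after the first hit, so every rule whose extractor matches the URL gets its callback executed. Below is a minimal sketch of that dispatch, using hypothetical stand-in Rule and extractor classes rather than the real Scrapy ones:

import re

class Rule(object):
    # hypothetical stand-in for a crawling rule: a link extractor
    # paired with the name of the callback it should trigger
    def __init__(self, link_extractor, callback):
        self.link_extractor = link_extractor
        self.callback = callback

class RegexExtractor(object):
    # stand-in extractor exposing only the matches() contract
    def __init__(self, pattern):
        self.regex = re.compile(pattern)

    def matches(self, url):
        return bool(self.regex.search(url))

def callbacks_for(rules, url):
    # mirrors the new run() loop: every matching rule contributes
    # its callback (there is no break after the first hit)
    return [r.callback for r in rules if r.link_extractor.matches(url)]

rules = [Rule(RegexExtractor(r'/product/'), 'parse_product'),
         Rule(RegexExtractor(r'/category/'), 'parse_category')]

assert callbacks_for(rules, 'http://example.com/product/7') == ['parse_product']
assert callbacks_for(rules, 'http://example.com/about') == []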

View File

@ -1,73 +0,0 @@
from scrapy.command import ScrapyCommand
from scrapy.fetcher import fetch
from scrapy.http import Request
from scrapy.item import ScrapedItem
from scrapy.spider import spiders
from scrapy.utils import display
from scrapy import log
class Command(ScrapyCommand):
def syntax(self):
return "[options] <url> <method>"
def short_desc(self):
return "Parse the URL with the given spider method and print the results"
def add_options(self, parser):
ScrapyCommand.add_options(self, parser)
parser.add_option("--links", dest="links", action="store_true", help="show extracted links")
parser.add_option("--noitems", dest="noitems", action="store_true", help="don't show scraped items")
parser.add_option("--nocolour", dest="nocolour", action="store_true", help="avoid using pygments to colorize the output")
def pipeline_process(self, item, opts):
item.set_guid(spider)
for key in item.__dict__.keys():
if key.startswith('_'):
item.__dict__.pop(key, None)
return item
def run_method(self, response, method, args, opts):
spider = spiders.fromurl(response.url)
if not spider:
log.msg('Couldnt find spider for url: %s' % response.url, level=log.ERROR)
return (), ()
items = []
links = []
if method:
method_fcn = method if callable(method) else getattr(spider, method, None)
if not method_fcn:
log.msg('Couldnt find method %s in spider %s' % (method, spider.__name__))
return (), ()
result = method_fcn(response)
links = [i for i in result if isinstance(i, Request)]
items = [self.pipeline_process(i, opts) for i in result if isinstance(i, ScrapedItem)]
return items, links
def print_results(self, items, links, opts):
display.nocolour = opts.nocolour
if not opts.noitems:
print "# Scraped Items", "-"*60
display.pprint(list(items))
if opts.links:
print "# Links", "-"*68
display.pprint(list(links))
def run(self, args, opts):
if len(args) < 2:
print "A URL and method is required"
return
items = set()
links = set()
url, method = args[:2]
for response in fetch([url]):
ret_items, ret_links = self.run_method(response, method, args, opts)
items = items.union(ret_items)
links = links.union(ret_links)
self.print_results(items, links, opts)

View File

@@ -71,6 +71,11 @@ class LinkExtractor(FixedSGMLParser):
         if self.current_link and not self.current_link.text:
             self.current_link.text = data
 
+    def matches(self, url):
+        """This extractor matches with any url, since
+        it doesn't contain any patterns"""
+        return True
+
 class Link(object):
     """
View File

@@ -72,5 +72,12 @@ class RegexLinkExtractor(LinkExtractor):
         return links
 
-    def match(self, url):
-        return any(regex.search(url) for regex in self.allow_res) and not any(regex.search(url) for regex in self.deny_res)
+    def matches(self, url):
+        if self.allow_domains and not url_is_from_any_domain(url, self.allow_domains):
+            return False
+        if self.deny_domains and url_is_from_any_domain(url, self.deny_domains):
+            return False
+        allowed = [regex.search(url) for regex in self.allow_res]
+        denied = [regex.search(url) for regex in self.deny_res]
+        return any(allowed) and not any(denied)
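The renamed matches() filters in two stages: domain checks first (allow_domains, then deny_domains), regex checks second, and a URL must hit at least one allow pattern while hitting no deny pattern. A self-contained sketch of the same decision order; url_is_from_any_domain is re-implemented here in simplified form and may differ from the real helper in scrapy's utils, and matches is written as a free function for brevity:

import re
try:
    from urlparse import urlparse          # Python 2, as in the code above
except ImportError:
    from urllib.parse import urlparse      # Python 3

def url_is_from_any_domain(url, domains):
    # simplified stand-in: is the url's host a domain or a subdomain of one?
    host = urlparse(url).netloc.lower()
    return any(host == d or host.endswith('.' + d) for d in domains)

def matches(url, allow_res=(), deny_res=(), allow_domains=(), deny_domains=()):
    # same decision order as RegexLinkExtractor.matches in the diff above
    if allow_domains and not url_is_from_any_domain(url, allow_domains):
        return False
    if deny_domains and url_is_from_any_domain(url, deny_domains):
        return False
    allowed = [regex.search(url) for regex in allow_res]
    denied = [regex.search(url) for regex in deny_res]
    return any(allowed) and not any(denied)

allow = [re.compile(r'/item/\d+')]
deny = [re.compile(r'\.pdf$')]
assert matches('http://shop.example.com/item/42', allow, deny, ['example.com'])
assert not matches('http://shop.example.com/item/42.pdf', allow, deny, ['example.com'])
assert not matches('http://evil.com/item/42', allow, deny, ['example.com'])

One consequence of the final line: with an empty allow_res, any(allowed) is False, so this version matches nothing until at least one allow pattern is configured.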