Mirror of https://github.com/scrapy/scrapy.git (synced 2025-02-27 15:04:21 +00:00)

commit c75ac38b92 (parent eff09f2f78)

Improved parse command and LinkExtractor's matches method

--HG--
extra : convert_revision : svn%3Ab85faa78-f9eb-468e-a121-7cced6da292c%40456
@@ -1,46 +1,95 @@
-from scrapy.command.commands.parse_method import Command as ScrapyCommand
+from scrapy.command import ScrapyCommand
 from scrapy.fetcher import fetch
+from scrapy.http import Request
+from scrapy.item import ScrapedItem
 from scrapy.spider import spiders
+from scrapy.utils import display
 from scrapy import log
 
 class Command(ScrapyCommand):
     def syntax(self):
-        return "[options] <url>"
+        return "[options] <url> <method>"
 
     def short_desc(self):
-        return "Parse the URL and print its results"
+        return "Parse the URL with the given spider method and print the results"
 
     def add_options(self, parser):
         ScrapyCommand.add_options(self, parser)
-        parser.add_option("--identify", dest="identify", action="store_true", help="try to use identify instead of parse")
+        parser.add_option("--nolinks", dest="nolinks", action="store_true", help="don't show extracted links")
+        parser.add_option("--noitems", dest="noitems", action="store_true", help="don't show scraped items")
+        parser.add_option("--nocolour", dest="nocolour", action="store_true", help="avoid using pygments to colorize the output")
+        parser.add_option("--matches", dest="matches", action="store_true", help="avoid using pygments to colorize the output")
+
+    def pipeline_process(self, item, spider, opts):
+        return item
+
+    def run_method(self, spider, response, method, args, opts):
+        spider = spiders.fromurl(response.url)
+        if not spider:
+            log.msg('Couldnt find spider for url: %s' % response.url, level=log.ERROR)
+            return (), ()
+
+        items = []
+        links = []
+        if method:
+            method_fcn = method if callable(method) else getattr(spider, method, None)
+            if not method_fcn:
+                log.msg('Couldnt find method %s in %s spider' % (method, spider.domain_name))
+                return (), ()
+
+            result = method_fcn(response)
+            links = [i for i in result if isinstance(i, Request)]
+            items = [self.pipeline_process(i, spider, opts) for i in result if isinstance(i, ScrapedItem)]
+
+        return items, links
+
+    def print_results(self, items, links, opts):
+        display.nocolour = opts.nocolour
+        if not opts.noitems:
+            for item in items:
+                for key in item.__dict__.keys():
+                    if key.startswith('_'):
+                        item.__dict__.pop(key, None)
+            print "# Scraped Items", "-"*60
+            display.pprint(list(items))
+
+        if not opts.nolinks:
+            print "# Links", "-"*68
+            display.pprint(list(links))
 
     def run(self, args, opts):
-        if not args:
-            print "A URL is required"
-            return
+        if opts.matches:
+            url = args[0]
+            method = None
+        else:
+            if len(args) < 2:
+                print "A URL and method is required"
+                return
+            else:
+                url, method = args[:2]
 
         items = set()
         links = set()
-        responses = fetch(args)
-        for response in responses:
+        for response in fetch([url]):
             spider = spiders.fromurl(response.url)
-            if spider:
-                if opts.identify and hasattr(spider, 'identify_products'):
-                    ret_items, ret_links = ScrapyCommand.run_method(self, response, 'identify_products', args, opts)
-                else:
-                    if hasattr(spider, 'rules'):
-                        for rule in spider.rules:
-                            if rule.link_extractor.match(response.url):
-                                ret_items, ret_links = ScrapyCommand.run_method(self, response, rule.callback, args, opts)
-                                break
-                    else:
-                        ret_items, ret_links = ScrapyCommand.run_method(self, response, 'parse', args, opts)
-                items = items.union(ret_items)
-                links = links.union(ret_links)
-            else:
+            if not spider:
                 log.msg('Couldnt find spider for "%s"' % response.url)
                 continue
 
-        if items or links:
-            self.print_results(items, links, opts)
+            if method:
+                ret_items, ret_links = self.run_method(spider, response, method, args, opts)
+                items = items.union(ret_items)
+                links = links.union(ret_links)
+            else:
+                if hasattr(spider, 'rules'):
+                    for rule in spider.rules:
+                        if rule.link_extractor.matches(response.url):
+                            ret_items, ret_links = self.run_method(spider, response, rule.callback, args, opts)
+                            items = items.union(ret_items)
+                            links = links.union(ret_links)
+                else:
+                    log.msg('No rules found for spider "%s", please specify a parsing method' % spider.domain_name)
+                    continue
+
+        self.print_results(items, links, opts)
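The net effect of this hunk is that the parse command absorbs the run_method/print_results/pipeline_process machinery that previously lived in the separate parse_method command (deleted in the next hunk), and its run() gains two paths: an explicit <method> argument, or --matches, which walks the spider's rules and runs the callback of every rule whose link extractor matches the response URL. As a reading aid only, here is a small standalone sketch of that dispatch for a single response; Request, ScrapedItem, Rule and Response are minimal stand-ins for the Scrapy objects of that era, not the real classes, and the helper name parse_response is invented for illustration.

    # Standalone sketch (not part of the commit) of the dispatch performed by
    # the new run()/run_method(). The classes below are simplified stand-ins.
    class Request(object):
        def __init__(self, url):
            self.url = url

    class ScrapedItem(object):
        pass

    class Rule(object):
        def __init__(self, link_extractor, callback):
            self.link_extractor = link_extractor
            self.callback = callback

    class Response(object):
        def __init__(self, url):
            self.url = url

    def parse_response(spider, response, method=None):
        # An explicit method wins; otherwise (the --matches path) every rule
        # whose link extractor matches the response URL contributes results.
        items, links = set(), set()
        if method:
            callbacks = [method if callable(method) else getattr(spider, method, None)]
        else:
            callbacks = [rule.callback if callable(rule.callback) else getattr(spider, rule.callback, None)
                         for rule in getattr(spider, 'rules', [])
                         if rule.link_extractor.matches(response.url)]
        for callback in callbacks:
            if not callback:
                continue
            result = callback(response)
            # run_method() splits each callback's output by type
            links.update(r for r in result if isinstance(r, Request))
            items.update(r for r in result if isinstance(r, ScrapedItem))
        return items, links

With --matches no method is given, so the rules alone decide which callbacks run; the real command then prints both sets via print_results(), honouring --noitems and --nolinks.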
@@ -1,73 +0,0 @@
-from scrapy.command import ScrapyCommand
-from scrapy.fetcher import fetch
-from scrapy.http import Request
-from scrapy.item import ScrapedItem
-from scrapy.spider import spiders
-from scrapy.utils import display
-from scrapy import log
-
-
-class Command(ScrapyCommand):
-    def syntax(self):
-        return "[options] <url> <method>"
-
-    def short_desc(self):
-        return "Parse the URL with the given spider method and print the results"
-
-    def add_options(self, parser):
-        ScrapyCommand.add_options(self, parser)
-        parser.add_option("--links", dest="links", action="store_true", help="show extracted links")
-        parser.add_option("--noitems", dest="noitems", action="store_true", help="don't show scraped items")
-        parser.add_option("--nocolour", dest="nocolour", action="store_true", help="avoid using pygments to colorize the output")
-
-    def pipeline_process(self, item, opts):
-        item.set_guid(spider)
-        for key in item.__dict__.keys():
-            if key.startswith('_'):
-                item.__dict__.pop(key, None)
-        return item
-
-    def run_method(self, response, method, args, opts):
-        spider = spiders.fromurl(response.url)
-        if not spider:
-            log.msg('Couldnt find spider for url: %s' % response.url, level=log.ERROR)
-            return (), ()
-
-        items = []
-        links = []
-        if method:
-            method_fcn = method if callable(method) else getattr(spider, method, None)
-            if not method_fcn:
-                log.msg('Couldnt find method %s in spider %s' % (method, spider.__name__))
-                return (), ()
-
-            result = method_fcn(response)
-            links = [i for i in result if isinstance(i, Request)]
-            items = [self.pipeline_process(i, opts) for i in result if isinstance(i, ScrapedItem)]
-
-        return items, links
-
-    def print_results(self, items, links, opts):
-        display.nocolour = opts.nocolour
-        if not opts.noitems:
-            print "# Scraped Items", "-"*60
-            display.pprint(list(items))
-
-        if opts.links:
-            print "# Links", "-"*68
-            display.pprint(list(links))
-
-    def run(self, args, opts):
-        if len(args) < 2:
-            print "A URL and method is required"
-            return
-
-        items = set()
-        links = set()
-        url, method = args[:2]
-        for response in fetch([url]):
-            ret_items, ret_links = self.run_method(response, method, args, opts)
-            items = items.union(ret_items)
-            links = links.union(ret_links)
-
-        self.print_results(items, links, opts)
@@ -71,6 +71,11 @@ class LinkExtractor(FixedSGMLParser):
         if self.current_link and not self.current_link.text:
             self.current_link.text = data
 
+    def matches(self, url):
+        """This extractor matches with any url, since
+        it doesn't contain any patterns"""
+        return True
+
 
 class Link(object):
     """
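The base class gains matches() so the parse command's --matches loop can call rule.link_extractor.matches(response.url) on any extractor without checking its type; a plain LinkExtractor carries no allow/deny patterns, so it accepts every URL. A trivial, purely hypothetical illustration:

    # Hypothetical illustration only: a pattern-less extractor matches any URL,
    # mirroring the new base LinkExtractor.matches() above.
    class PatternlessExtractor(object):
        def matches(self, url):
            return True

    print(PatternlessExtractor().matches('http://example.com/anything'))  # True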
@@ -72,5 +72,12 @@ class RegexLinkExtractor(LinkExtractor):
 
         return links
 
-    def match(self, url):
-        return any(regex.search(url) for regex in self.allow_res) and not any(regex.search(url) for regex in self.deny_res)
+    def matches(self, url):
+        if self.allow_domains and not url_is_from_any_domain(url, self.allow_domains):
+            return False
+        if self.deny_domains and url_is_from_any_domain(url, self.deny_domains):
+            return False
+
+        allowed = [regex.search(url) for regex in self.allow_res]
+        denied = [regex.search(url) for regex in self.deny_res]
+        return any(allowed) and not any(denied)
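For RegexLinkExtractor, the renamed matches() now checks the domain whitelist/blacklist before the allow/deny regexes: a URL must come from an allowed domain (if any are set), must not come from a denied domain, must match at least one allow pattern, and must match no deny pattern. Below is a rough standalone mirror of that logic, not taken from the commit; the url_is_from_any_domain() stand-in is a simplified guess at what the real Scrapy helper does, and the patterns and URLs are made up.

    # Standalone mirror (not from the commit) of the new matches() semantics,
    # with a simplified stand-in for scrapy's url_is_from_any_domain() helper.
    import re
    from urlparse import urlparse  # Python 2, matching the era of this code

    def url_is_from_any_domain(url, domains):
        # Simplified: the real helper may treat ports and subdomains differently.
        host = urlparse(url).netloc.lower()
        return any(host == d or host.endswith('.' + d) for d in domains)

    def matches(url, allow_res, deny_res, allow_domains=(), deny_domains=()):
        if allow_domains and not url_is_from_any_domain(url, allow_domains):
            return False
        if deny_domains and url_is_from_any_domain(url, deny_domains):
            return False
        allowed = [regex.search(url) for regex in allow_res]
        denied = [regex.search(url) for regex in deny_res]
        return any(allowed) and not any(denied)

    allow = [re.compile(r'/product/')]
    deny = [re.compile(r'\.pdf$')]
    print(matches('http://example.com/product/123', allow, deny, allow_domains=['example.com']))  # True
    print(matches('http://other.org/product/123', allow, deny, allow_domains=['example.com']))    # False

Note that, as written, a URL is only accepted if at least one allow pattern matches it, since any(allowed) is False when no allow pattern produced a match.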