
Improved parse command and LinkExtractor's matches method

--HG--
extra : convert_revision : svn%3Ab85faa78-f9eb-468e-a121-7cced6da292c%40456
elpolilla 2008-12-02 12:20:55 +00:00
parent eff09f2f78
commit c75ac38b92
4 changed files with 88 additions and 100 deletions

View File

@@ -1,46 +1,95 @@
-from scrapy.command.commands.parse_method import Command as ScrapyCommand
+from scrapy.command import ScrapyCommand
 from scrapy.fetcher import fetch
+from scrapy.http import Request
+from scrapy.item import ScrapedItem
 from scrapy.spider import spiders
+from scrapy.utils import display
 from scrapy import log
 
 class Command(ScrapyCommand):
     def syntax(self):
-        return "[options] <url>"
+        return "[options] <url> <method>"
 
     def short_desc(self):
-        return "Parse the URL and print its results"
+        return "Parse the URL with the given spider method and print the results"
 
     def add_options(self, parser):
         ScrapyCommand.add_options(self, parser)
-        parser.add_option("--identify", dest="identify", action="store_true", help="try to use identify instead of parse")
+        parser.add_option("--nolinks", dest="nolinks", action="store_true", help="don't show extracted links")
+        parser.add_option("--noitems", dest="noitems", action="store_true", help="don't show scraped items")
+        parser.add_option("--nocolour", dest="nocolour", action="store_true", help="avoid using pygments to colorize the output")
+        parser.add_option("--matches", dest="matches", action="store_true", help="find the parsing method by matching the url against the spider's rules")
+
+    def pipeline_process(self, item, spider, opts):
+        return item
+
+    def run_method(self, spider, response, method, args, opts):
+        spider = spiders.fromurl(response.url)
+        if not spider:
+            log.msg('Couldnt find spider for url: %s' % response.url, level=log.ERROR)
+            return (), ()
+        items = []
+        links = []
+        if method:
+            method_fcn = method if callable(method) else getattr(spider, method, None)
+            if not method_fcn:
+                log.msg('Couldnt find method %s in %s spider' % (method, spider.domain_name))
+                return (), ()
+            result = method_fcn(response)
+            links = [i for i in result if isinstance(i, Request)]
+            items = [self.pipeline_process(i, spider, opts) for i in result if isinstance(i, ScrapedItem)]
+        return items, links
+
+    def print_results(self, items, links, opts):
+        display.nocolour = opts.nocolour
+        if not opts.noitems:
+            for item in items:
+                for key in item.__dict__.keys():
+                    if key.startswith('_'):
+                        item.__dict__.pop(key, None)
+            print "# Scraped Items", "-"*60
+            display.pprint(list(items))
+        if not opts.nolinks:
+            print "# Links", "-"*68
+            display.pprint(list(links))
 
     def run(self, args, opts):
-        if not args:
-            print "A URL is required"
-            return
+        if opts.matches:
+            url = args[0]
+            method = None
+        else:
+            if len(args) < 2:
+                print "A URL and method is required"
+                return
+            else:
+                url, method = args[:2]
+
         items = set()
         links = set()
-        responses = fetch(args)
-        for response in responses:
+        for response in fetch([url]):
             spider = spiders.fromurl(response.url)
-            if spider:
-                if opts.identify and hasattr(spider, 'identify_products'):
-                    ret_items, ret_links = ScrapyCommand.run_method(self, response, 'identify_products', args, opts)
-                else:
-                    if hasattr(spider, 'rules'):
-                        for rule in spider.rules:
-                            if rule.link_extractor.match(response.url):
-                                ret_items, ret_links = ScrapyCommand.run_method(self, response, rule.callback, args, opts)
-                                break
-                    else:
-                        ret_items, ret_links = ScrapyCommand.run_method(self, response, 'parse', args, opts)
-                items = items.union(ret_items)
-                links = links.union(ret_links)
-            else:
+            if not spider:
                 log.msg('Couldnt find spider for "%s"' % response.url)
-        if items or links:
-            self.print_results(items, links, opts)
+                continue
+            if method:
+                ret_items, ret_links = self.run_method(spider, response, method, args, opts)
+                items = items.union(ret_items)
+                links = links.union(ret_links)
+            else:
+                if hasattr(spider, 'rules'):
+                    for rule in spider.rules:
+                        if rule.link_extractor.matches(response.url):
+                            ret_items, ret_links = self.run_method(spider, response, rule.callback, args, opts)
+                            items = items.union(ret_items)
+                            links = links.union(ret_links)
+                else:
+                    log.msg('No rules found for spider "%s", please specify a parsing method' % spider.domain_name)
+                    continue
+        self.print_results(items, links, opts)
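A behavioural detail worth noting in the new run(): with --matches the parsing method is resolved from the spider's rules, and unlike the old code the loop no longer break-s after the first hit, so every rule whose extractor matches the URL gets its callback executed. Below is a minimal sketch of that dispatch, using hypothetical stand-in Rule and extractor classes rather than the real Scrapy ones:

import re

class Rule(object):
    # hypothetical stand-in for a crawling rule: a link extractor
    # paired with the name of the callback it should trigger
    def __init__(self, link_extractor, callback):
        self.link_extractor = link_extractor
        self.callback = callback

class RegexExtractor(object):
    # stand-in extractor exposing only the matches() contract
    def __init__(self, pattern):
        self.regex = re.compile(pattern)

    def matches(self, url):
        return bool(self.regex.search(url))

def callbacks_for(rules, url):
    # mirrors the new run() loop: every matching rule contributes
    # its callback (there is no break after the first hit)
    return [r.callback for r in rules if r.link_extractor.matches(url)]

rules = [Rule(RegexExtractor(r'/product/'), 'parse_product'),
         Rule(RegexExtractor(r'/category/'), 'parse_category')]

assert callbacks_for(rules, 'http://example.com/product/7') == ['parse_product']
assert callbacks_for(rules, 'http://example.com/about') == []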

View File

@ -1,73 +0,0 @@
from scrapy.command import ScrapyCommand
from scrapy.fetcher import fetch
from scrapy.http import Request
from scrapy.item import ScrapedItem
from scrapy.spider import spiders
from scrapy.utils import display
from scrapy import log
class Command(ScrapyCommand):
def syntax(self):
return "[options] <url> <method>"
def short_desc(self):
return "Parse the URL with the given spider method and print the results"
def add_options(self, parser):
ScrapyCommand.add_options(self, parser)
parser.add_option("--links", dest="links", action="store_true", help="show extracted links")
parser.add_option("--noitems", dest="noitems", action="store_true", help="don't show scraped items")
parser.add_option("--nocolour", dest="nocolour", action="store_true", help="avoid using pygments to colorize the output")
def pipeline_process(self, item, opts):
item.set_guid(spider)
for key in item.__dict__.keys():
if key.startswith('_'):
item.__dict__.pop(key, None)
return item
def run_method(self, response, method, args, opts):
spider = spiders.fromurl(response.url)
if not spider:
log.msg('Couldnt find spider for url: %s' % response.url, level=log.ERROR)
return (), ()
items = []
links = []
if method:
method_fcn = method if callable(method) else getattr(spider, method, None)
if not method_fcn:
log.msg('Couldnt find method %s in spider %s' % (method, spider.__name__))
return (), ()
result = method_fcn(response)
links = [i for i in result if isinstance(i, Request)]
items = [self.pipeline_process(i, opts) for i in result if isinstance(i, ScrapedItem)]
return items, links
def print_results(self, items, links, opts):
display.nocolour = opts.nocolour
if not opts.noitems:
print "# Scraped Items", "-"*60
display.pprint(list(items))
if opts.links:
print "# Links", "-"*68
display.pprint(list(links))
def run(self, args, opts):
if len(args) < 2:
print "A URL and method is required"
return
items = set()
links = set()
url, method = args[:2]
for response in fetch([url]):
ret_items, ret_links = self.run_method(response, method, args, opts)
items = items.union(ret_items)
links = links.union(ret_links)
self.print_results(items, links, opts)

View File

@@ -71,6 +71,11 @@ class LinkExtractor(FixedSGMLParser):
         if self.current_link and not self.current_link.text:
             self.current_link.text = data
 
+    def matches(self, url):
+        """This extractor matches with any url, since
+        it doesn't contain any patterns"""
+        return True
+
 class Link(object):
     """
View File

@@ -72,5 +72,12 @@ class RegexLinkExtractor(LinkExtractor):
         return links
 
-    def match(self, url):
-        return any(regex.search(url) for regex in self.allow_res) and not any(regex.search(url) for regex in self.deny_res)
+    def matches(self, url):
+        if self.allow_domains and not url_is_from_any_domain(url, self.allow_domains):
+            return False
+        if self.deny_domains and url_is_from_any_domain(url, self.deny_domains):
+            return False
+        allowed = [regex.search(url) for regex in self.allow_res]
+        denied = [regex.search(url) for regex in self.deny_res]
+        return any(allowed) and not any(denied)
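The renamed matches() filters in two stages: domain checks first (allow_domains, then deny_domains), regex checks second, and a URL must hit at least one allow pattern while hitting no deny pattern. A self-contained sketch of the same decision order; url_is_from_any_domain is re-implemented here in simplified form and may differ from the real helper in scrapy's utils, and matches is written as a free function for brevity:

import re
try:
    from urlparse import urlparse          # Python 2, as in the code above
except ImportError:
    from urllib.parse import urlparse      # Python 3

def url_is_from_any_domain(url, domains):
    # simplified stand-in: is the url's host a domain or a subdomain of one?
    host = urlparse(url).netloc.lower()
    return any(host == d or host.endswith('.' + d) for d in domains)

def matches(url, allow_res=(), deny_res=(), allow_domains=(), deny_domains=()):
    # same decision order as RegexLinkExtractor.matches in the diff above
    if allow_domains and not url_is_from_any_domain(url, allow_domains):
        return False
    if deny_domains and url_is_from_any_domain(url, deny_domains):
        return False
    allowed = [regex.search(url) for regex in allow_res]
    denied = [regex.search(url) for regex in deny_res]
    return any(allowed) and not any(denied)

allow = [re.compile(r'/item/\d+')]
deny = [re.compile(r'\.pdf$')]
assert matches('http://shop.example.com/item/42', allow, deny, ['example.com'])
assert not matches('http://shop.example.com/item/42.pdf', allow, deny, ['example.com'])
assert not matches('http://evil.com/item/42', allow, deny, ['example.com'])

One consequence of the final line: with an empty allow_res, any(allowed) is False, so this version matches nothing until at least one allow pattern is configured.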