mirror of https://github.com/scrapy/scrapy.git synced 2025-02-27 03:23:50 +00:00

this changeset improves the extractors' implementation:

* moved LinkExtractor.extract_links to a private method and created a wrapper, so the extractor can work with text directly
* removed fugly new_response_from_xpaths from scrapy.utils.response and replaced it with a better internal algorithm
* moved former _normalize_input from scrapy.utils.iterators to scrapy.utils.response to fill the hole
* turned the extractors' output lists into generators; this is safe because the result is always consumed in for..in constructs (see the sketch after this list)
* adapted the tests for generators (the tests should be rewritten anyway)
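
For reference, a minimal sketch of the list-to-generator change described above (illustrative names only, not the actual Scrapy code):

    def extract_stripped_list(values):
        # old style: build the whole list in memory, then return it
        ret = []
        for v in values:
            ret.append(v.strip())
        return ret

    def extract_stripped_gen(values):
        # new style: yield items one at a time
        for v in values:
            yield v.strip()

    # safe while the result is only consumed once, in a for..in loop:
    for item in extract_stripped_gen([' a ', ' b ']):
        print(item)

    # caveat: a generator is single-pass and never compares equal to a list,
    # which is why the adapted tests materialize it before asserting:
    assert list(extract_stripped_gen([' a '])) == ['a']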

--HG--
extra : convert_revision : svn%3Ab85faa78-f9eb-468e-a121-7cced6da292c%40597
samus_ 2009-01-02 08:42:36 +00:00
parent c9e48dc5ed
commit ebd5f465aa
5 changed files with 42 additions and 47 deletions


@@ -35,18 +35,20 @@ class LinkExtractor(FixedSGMLParser):
         self.scan_attr = attr if callable(attr) else lambda a: a == attr
         self.current_link = None

-    def extract_links(self, response, unique=False):
+    def _extract_links(self, response_text, response_url, unique):
         self.reset()
         self.unique = unique
-        self.feed(response.body.to_string())
+        self.feed(response_text)
         self.close()
-        base_url = self.base_url if self.base_url else response.url
-        ret = []
+        base_url = self.base_url if self.base_url else response_url
         for link in self.links:
             link.url = urljoin(base_url, link.url).strip()
-            ret.append(link)
-        return ret
+            yield link
+
+    def extract_links(self, response, unique=False):
+        # wrapper needed to allow to work directly with text
+        return self._extract_links(response.body.to_string(), response.url, unique)

     def reset(self):
         FixedSGMLParser.reset(self)
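
With extraction split out of the Response-facing wrapper, the parser can run over a raw HTML string plus a base URL. A hypothetical usage sketch against the class above (the HTML and URL are made up):

    lx = LinkExtractor()
    html = '<a href="/about.html">About</a>'
    # feed text and a base URL directly, no Response object needed:
    for link in lx._extract_links(html, 'http://example.org/', unique=False):
        print(link.url)  # -> http://example.org/about.html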


@@ -9,9 +9,8 @@ import urlparse

 from scrapy.link import LinkExtractor, Link
 from scrapy.utils.url import canonicalize_url, url_is_from_any_domain
-from scrapy.utils.response import new_response_from_xpaths
 from scrapy.utils.python import unicode_to_str
-from scrapy.xpath.selector import HtmlXPathSelector
+from scrapy.xpath import HtmlXPathSelector

 _re_type = type(re.compile("", 0))
@@ -41,7 +40,7 @@ class RegexLinkExtractor(LinkExtractor):
     If no allow/deny arguments are given, match all links.
     """

     def __init__(self, allow=(), deny=(), allow_domains=(), deny_domains=(), restrict_xpaths=(),
                  tags=('a', 'area'), attrs=('href'), canonicalize=True):
         self.allow_res = [x if isinstance(x, _re_type) else re.compile(x) for x in allow]
@@ -56,23 +55,25 @@ class RegexLinkExtractor(LinkExtractor):
     def extract_links(self, response, unique=True):
         if self.restrict_xpaths:
-            response = new_response_from_xpaths(response, self.restrict_xpaths)
-        links = LinkExtractor.extract_links(self, response, unique)
+            hxs = HtmlXPathSelector(response)
+            html_slice = ''.join(''.join(html_fragm for html_fragm in hxs.x(xpath_expr).extract()) for xpath_expr in self.restrict_xpaths)
+            links = self._extract_links(html_slice, response.url, unique)
+        else:
+            links = LinkExtractor.extract_links(self, response, unique)

-        links = [link for link in links if _is_valid_url(link.url)]
+        links = (link for link in links if _is_valid_url(link.url))

         if self.allow_res:
-            links = [link for link in links if _matches(link.url, self.allow_res)]
+            links = (link for link in links if _matches(link.url, self.allow_res))
         if self.deny_res:
-            links = [link for link in links if not _matches(link.url, self.deny_res)]
+            links = (link for link in links if not _matches(link.url, self.deny_res))
         if self.allow_domains:
-            links = [link for link in links if url_is_from_any_domain(link.url, self.allow_domains)]
+            links = (link for link in links if url_is_from_any_domain(link.url, self.allow_domains))
         if self.deny_domains:
-            links = [link for link in links if not url_is_from_any_domain(link.url, self.deny_domains)]
+            links = (link for link in links if not url_is_from_any_domain(link.url, self.deny_domains))

         if self.canonicalize:
-            for link in links:
-                link.url = canonicalize_url(link.url)
+            links = (canonicalize_url(link.url) for link in links)

         return links
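
Each reassignment above wraps links in another generator expression, so the filters form a lazy pipeline: nothing runs until the caller iterates. A small self-contained illustration of that chaining (generic data, not Scrapy code):

    urls = ['http://a.example/x', 'ftp://b.example/y', 'http://c.example/z']

    pipeline = (u for u in urls if u.startswith('http'))   # filter 1: not evaluated yet
    pipeline = (u for u in pipeline if u.endswith('z'))    # filter 2: not evaluated yet
    pipeline = (u.upper() for u in pipeline)               # transform: not evaluated yet

    print(list(pipeline))  # all three stages run only now: ['HTTP://C.EXAMPLE/Z']
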
@@ -81,7 +82,7 @@ class RegexLinkExtractor(LinkExtractor):
             return False
         if self.deny_domains and url_is_from_any_domain(url, self.deny_domains):
             return False
         allowed = [regex.search(url) for regex in self.allow_res] if self.allow_res else [True]
         denied = [regex.search(url) for regex in self.deny_res] if self.deny_res else []
         return any(allowed) and not any(denied)


@@ -17,7 +17,8 @@ class LinkExtractorTestCase(unittest.TestCase):
         response = Response("example.org", "http://example.org/somepage/index.html", body=html)
         lx = LinkExtractor()  # default: tag=a, attr=href
-        self.assertEqual(lx.extract_links(response),
+        links = [link for link in lx.extract_links(response)]
+        self.assertEqual(links,
                          [Link(url='http://example.org/somepage/item/12.html', text='Item 12'),
                           Link(url='http://example.org/about.html', text='About us'),
                           Link(url='http://example.org/othercat.html', text='Other category'),
@@ -30,7 +31,8 @@ class LinkExtractorTestCase(unittest.TestCase):
         response = Response("example.org", "http://example.org/somepage/index.html", body=html)
         lx = LinkExtractor()  # default: tag=a, attr=href
-        self.assertEqual(lx.extract_links(response),
+        links = [link for link in lx.extract_links(response)]
+        self.assertEqual(links,
                          [Link(url='http://otherdomain.com/base/item/12.html', text='Item 12')])

     def test_matches(self):
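
The tests materialize the generator because assertEqual would otherwise compare a generator object against a list and always fail. A generic illustration:

    def gen():
        yield 1
        yield 2

    assert gen() != [1, 2]        # a generator object never equals a list
    assert list(gen()) == [1, 2]  # materializing first makes the comparison meaningful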


@@ -1,18 +1,9 @@
 import re, csv

 from scrapy.xpath import XmlXPathSelector
-from scrapy.http import Response
 from scrapy import log
-from scrapy.utils.python import re_rsearch, str_to_unicode, unicode_to_str
-
-def _normalize_input(obj, unicode=True):
-    assert isinstance(obj, (Response, basestring)), "obj must be Response or basestring, not %s" % type(obj).__name__
-    if isinstance(obj, Response):
-        return obj.body.to_unicode() if unicode else obj.body.to_string()
-    elif isinstance(obj, str):
-        return obj.decode('utf-8') if unicode else obj
-    else:
-        return obj if unicode else obj.encode('utf-8')
+from scrapy.utils.python import re_rsearch, str_to_unicode
+from scrapy.utils.response import body_or_str

 def xmliter(obj, nodename):
     """Return a iterator of XPathSelector's over all nodes of a XML document,
@@ -25,7 +16,7 @@ def xmliter(obj, nodename):
     """
    HEADER_START_RE = re.compile(r'^(.*?)<\s*%s(?:\s|>)' % nodename, re.S)
    HEADER_END_RE = re.compile(r'<\s*/%s\s*>' % nodename, re.S)
-    text = _normalize_input(obj)
+    text = body_or_str(obj)

     header_start = re.search(HEADER_START_RE, text)
     header_start = header_start.group(1).strip() if header_start else ''
@@ -46,14 +37,14 @@ def csviter(obj, delimiter=None, headers=None):
     - a string encoded as utf-8

     delimiter is the character used to separate field on the given obj.
     headers is an iterable that when provided offers the keys
     for the returned dictionaries, if not the first row is used.
     """
     def _getrow(csv_r):
         return [str_to_unicode(field) for field in csv_r.next()]

-    lines = _normalize_input(obj, unicode=False).splitlines(True)
+    lines = body_or_str(obj, unicode=False).splitlines(True)

     if delimiter:
         csv_r = csv.reader(lines, delimiter=delimiter)
     else:


@@ -3,14 +3,13 @@ This module provides some useful functions for working with
 scrapy.http.Response objects
 """

-from scrapy.xpath import XPathSelector
-from scrapy.http.response import ResponseBody
+from scrapy.http.response import Response

-def new_response_from_xpaths(response, xpaths):
-    """Return a new response constructed by applying the given xpaths to the
-    original response body
-    """
-    xs = XPathSelector(response)
-    parts = [''.join([n for n in xs.x(x).extract()]) for x in xpaths]
-    new_body_content = ''.join(parts)
-    return response.replace(body=ResponseBody(content=new_body_content, declared_encoding=response.body.get_declared_encoding()))
+def body_or_str(obj, unicode=True):
+    assert isinstance(obj, (Response, basestring)), "obj must be Response or basestring, not %s" % type(obj).__name__
+    if isinstance(obj, Response):
+        return obj.body.to_unicode() if unicode else obj.body.to_string()
+    elif isinstance(obj, str):
+        return obj.decode('utf-8') if unicode else obj
+    else:
+        return obj if unicode else obj.encode('utf-8')
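
A quick usage sketch of body_or_str as defined above (Python 2 semantics, since it relies on basestring and the str/unicode split; the inputs are made up):

    print(body_or_str('caf\xc3\xa9'))              # utf-8 str -> u'caf\xe9'
    print(body_or_str(u'caf\xe9', unicode=False))  # unicode -> utf-8 encoded str
    # a Response argument returns its body decoded (unicode=True) or raw (unicode=False)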