fixes to lxml link extractors api and encoding handling

2025-02-25 05:44:03 +00:00 · 2009-11-04 12:39:58 -02:00 · 2009-11-04 12:39:58 -02:00 · 87e322e568
commit 87e322e568
parent 97c322707a
1 changed files with 6 additions and 2 deletions
--- a/scrapy/contrib/linkextractors/lxmlparser.py
+++ b/scrapy/contrib/linkextractors/lxmlparser.py
@@ -9,7 +9,7 @@ from lxml import etree
 import lxml.html

 from scrapy.link import Link
-from scrapy.utils.python import unique as unique_list
+from scrapy.utils.python import unique as unique_list, str_to_unicode
 from scrapy.utils.url import safe_url_string, urljoin_rfc

 class LxmlLinkExtractor(object):
@@ -33,7 +33,7 @@ class LxmlLinkExtractor(object):
        for link in links:
            link.url = urljoin_rfc(base_url, link.url, response_encoding)
            link.url = safe_url_string(link.url, response_encoding)
-            link.text = link.text.decode(response_encoding)
+            link.text = str_to_unicode(link.text, response_encoding)
            ret.append(link)

        return ret
@@ -43,6 +43,10 @@ class LxmlLinkExtractor(object):
        return self._extract_links(response.body, response.url, 
                                   response.encoding)

+    def matches(self, url):
+        """This extractor matches with any url, since it doesn't contain any patterns"""
+        return True
+

 class LinkTarget(object):
    def __init__(self, scan_tag, scan_attr, process_attr):