Fixed the ExtractImages adaptor, which wasn't looking for a base tag

--HG-- extra : convert_revision : svn%3Ab85faa78-f9eb-468e-a121-7cced6da292c%40414
2025-02-27 04:44:26 +00:00 · 2008-11-25 01:41:22 +00:00 · 2008-11-25 01:41:22 +00:00 · f558a49d2c
commit f558a49d2c
parent 5028146e8c
1 changed files with 14 additions and 4 deletions
--- a/scrapy/trunk/scrapy/contrib/adaptors/extraction.py
+++ b/scrapy/trunk/scrapy/contrib/adaptors/extraction.py
@@ -3,6 +3,7 @@ Adaptors related with extraction of data
 """

 import urlparse
+import re
 from scrapy.http import Response
 from scrapy.utils.python import flatten
 from scrapy.xpath.selector import XPathSelector, XPathSelectorList
@@ -51,7 +52,19 @@ class ExtractImages(object):
    Output: list of unicodes
    """
    def __init__(self, response=None, base_url=None):
-        self.base_url = response.url if response else base_url
+        BASETAG_RE = re.compile(r'<base\s+href\s*=\s*[\"\']\s*([^\"\'\s]+)\s*[\"\']', re.I)
+
+        if response:
+            match = BASETAG_RE.search(response.body.to_string()[0:4096])
+            if match:
+                self.base_url = match.group(1)
+            else:
+                self.base_url = response.url
+        else:
+            self.base_url = base_url
+
+        if not self.base_url:
+            raise AttributeError('You must specify either a response or a base_url to the ExtractImages adaptor.')

    def extract_from_xpath(self, selector):
        ret = []
@@ -76,9 +89,6 @@ class ExtractImages(object):
        return ret

    def __call__(self, locations):
-        if not self.base_url:
-            raise AttributeError('You must specify either a response or a base_url to the ExtractImages adaptor.')
-        
        if isinstance(locations, basestring):
            locations = [locations]