mirror of https://github.com/scrapy/scrapy.git synced 2025-02-27 03:23:50 +00:00

this changeset improves the extractors' implementation:

* moved LinkExtractor.extract_links to a private method and created a wrapper, so the extractor can work with text directly
* removed fugly new_response_from_xpaths from scrapy.utils.response and replaced it with a better internal algorithm
* moved former _normalize_input from scrapy.utils.iterators to scrapy.utils.response to fill the hole
* turned the extractors' output lists into generators; this is safe because the result is always consumed in for..in constructs (see the sketch after this list)
* adapted the tests for generators (the tests should be rewritten anyway)
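
For reference, a minimal sketch of the list-to-generator change described above (illustrative names only, not the actual Scrapy code):

    def extract_stripped_list(values):
        # old style: build the whole list in memory, then return it
        ret = []
        for v in values:
            ret.append(v.strip())
        return ret

    def extract_stripped_gen(values):
        # new style: yield items one at a time
        for v in values:
            yield v.strip()

    # safe while the result is only consumed once, in a for..in loop:
    for item in extract_stripped_gen([' a ', ' b ']):
        print(item)

    # caveat: a generator is single-pass and never compares equal to a list,
    # which is why the adapted tests materialize it before asserting:
    assert list(extract_stripped_gen([' a '])) == ['a']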

--HG--
extra : convert_revision : svn%3Ab85faa78-f9eb-468e-a121-7cced6da292c%40597
samus_ 2009-01-02 08:42:36 +00:00
parent c9e48dc5ed
commit ebd5f465aa
5 changed files with 42 additions and 47 deletions


@@ -35,18 +35,20 @@ class LinkExtractor(FixedSGMLParser):
         self.scan_attr = attr if callable(attr) else lambda a: a == attr
         self.current_link = None

-    def extract_links(self, response, unique=False):
+    def _extract_links(self, response_text, response_url, unique):
         self.reset()
         self.unique = unique
-        self.feed(response.body.to_string())
+        self.feed(response_text)
         self.close()
-        base_url = self.base_url if self.base_url else response.url
-        ret = []
+        base_url = self.base_url if self.base_url else response_url
         for link in self.links:
             link.url = urljoin(base_url, link.url).strip()
-            ret.append(link)
-        return ret
+            yield link
+
+    def extract_links(self, response, unique=False):
+        # wrapper needed to allow to work directly with text
+        return self._extract_links(response.body.to_string(), response.url, unique)

     def reset(self):
         FixedSGMLParser.reset(self)
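
With extraction split out of the Response-facing wrapper, the parser can run over a raw HTML string plus a base URL. A hypothetical usage sketch against the class above (the HTML and URL are made up):

    lx = LinkExtractor()
    html = '<a href="/about.html">About</a>'
    # feed text and a base URL directly, no Response object needed:
    for link in lx._extract_links(html, 'http://example.org/', unique=False):
        print(link.url)  # -> http://example.org/about.html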


@@ -9,9 +9,8 @@ import urlparse

 from scrapy.link import LinkExtractor, Link
 from scrapy.utils.url import canonicalize_url, url_is_from_any_domain
-from scrapy.utils.response import new_response_from_xpaths
 from scrapy.utils.python import unicode_to_str
-from scrapy.xpath.selector import HtmlXPathSelector
+from scrapy.xpath import HtmlXPathSelector

 _re_type = type(re.compile("", 0))
@@ -41,7 +40,7 @@ class RegexLinkExtractor(LinkExtractor):
     If no allow/deny arguments are given, match all links.
     """

     def __init__(self, allow=(), deny=(), allow_domains=(), deny_domains=(), restrict_xpaths=(),
                  tags=('a', 'area'), attrs=('href'), canonicalize=True):
         self.allow_res = [x if isinstance(x, _re_type) else re.compile(x) for x in allow]
@@ -56,23 +55,25 @@ class RegexLinkExtractor(LinkExtractor):
     def extract_links(self, response, unique=True):
         if self.restrict_xpaths:
-            response = new_response_from_xpaths(response, self.restrict_xpaths)
-        links = LinkExtractor.extract_links(self, response, unique)
+            hxs = HtmlXPathSelector(response)
+            html_slice = ''.join(''.join(html_fragm for html_fragm in hxs.x(xpath_expr).extract()) for xpath_expr in self.restrict_xpaths)
+            links = self._extract_links(html_slice, response.url, unique)
+        else:
+            links = LinkExtractor.extract_links(self, response, unique)

-        links = [link for link in links if _is_valid_url(link.url)]
+        links = (link for link in links if _is_valid_url(link.url))

         if self.allow_res:
-            links = [link for link in links if _matches(link.url, self.allow_res)]
+            links = (link for link in links if _matches(link.url, self.allow_res))
         if self.deny_res:
-            links = [link for link in links if not _matches(link.url, self.deny_res)]
+            links = (link for link in links if not _matches(link.url, self.deny_res))
         if self.allow_domains:
-            links = [link for link in links if url_is_from_any_domain(link.url, self.allow_domains)]
+            links = (link for link in links if url_is_from_any_domain(link.url, self.allow_domains))
         if self.deny_domains:
-            links = [link for link in links if not url_is_from_any_domain(link.url, self.deny_domains)]
+            links = (link for link in links if not url_is_from_any_domain(link.url, self.deny_domains))

         if self.canonicalize:
-            for link in links:
-                link.url = canonicalize_url(link.url)
+            links = (canonicalize_url(link.url) for link in links)

         return links
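
Each reassignment above wraps links in another generator expression, so the filters form a lazy pipeline: nothing runs until the caller iterates. A small self-contained illustration of that chaining (generic data, not Scrapy code):

    urls = ['http://a.example/x', 'ftp://b.example/y', 'http://c.example/z']

    pipeline = (u for u in urls if u.startswith('http'))   # filter 1: not evaluated yet
    pipeline = (u for u in pipeline if u.endswith('z'))    # filter 2: not evaluated yet
    pipeline = (u.upper() for u in pipeline)               # transform: not evaluated yet

    print(list(pipeline))  # all three stages run only now: ['HTTP://C.EXAMPLE/Z']
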
@@ -81,7 +82,7 @@ class RegexLinkExtractor(LinkExtractor):
             return False
         if self.deny_domains and url_is_from_any_domain(url, self.deny_domains):
             return False
         allowed = [regex.search(url) for regex in self.allow_res] if self.allow_res else [True]
         denied = [regex.search(url) for regex in self.deny_res] if self.deny_res else []
         return any(allowed) and not any(denied)


@@ -17,7 +17,8 @@ class LinkExtractorTestCase(unittest.TestCase):
         response = Response("example.org", "http://example.org/somepage/index.html", body=html)
         lx = LinkExtractor()  # default: tag=a, attr=href
-        self.assertEqual(lx.extract_links(response),
+        links = [link for link in lx.extract_links(response)]
+        self.assertEqual(links,
                          [Link(url='http://example.org/somepage/item/12.html', text='Item 12'),
                           Link(url='http://example.org/about.html', text='About us'),
                           Link(url='http://example.org/othercat.html', text='Other category'),
@@ -30,7 +31,8 @@ class LinkExtractorTestCase(unittest.TestCase):
         response = Response("example.org", "http://example.org/somepage/index.html", body=html)
         lx = LinkExtractor()  # default: tag=a, attr=href
-        self.assertEqual(lx.extract_links(response),
+        links = [link for link in lx.extract_links(response)]
+        self.assertEqual(links,
                          [Link(url='http://otherdomain.com/base/item/12.html', text='Item 12')])

     def test_matches(self):
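
The tests materialize the generator because assertEqual would otherwise compare a generator object against a list and always fail. A generic illustration:

    def gen():
        yield 1
        yield 2

    assert gen() != [1, 2]        # a generator object never equals a list
    assert list(gen()) == [1, 2]  # materializing first makes the comparison meaningful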


@@ -1,18 +1,9 @@
 import re, csv

 from scrapy.xpath import XmlXPathSelector
-from scrapy.http import Response
 from scrapy import log
-from scrapy.utils.python import re_rsearch, str_to_unicode, unicode_to_str
-
-def _normalize_input(obj, unicode=True):
-    assert isinstance(obj, (Response, basestring)), "obj must be Response or basestring, not %s" % type(obj).__name__
-    if isinstance(obj, Response):
-        return obj.body.to_unicode() if unicode else obj.body.to_string()
-    elif isinstance(obj, str):
-        return obj.decode('utf-8') if unicode else obj
-    else:
-        return obj if unicode else obj.encode('utf-8')
+from scrapy.utils.python import re_rsearch, str_to_unicode
+from scrapy.utils.response import body_or_str

 def xmliter(obj, nodename):
     """Return a iterator of XPathSelector's over all nodes of a XML document,
@@ -25,7 +16,7 @@ def xmliter(obj, nodename):
     """
    HEADER_START_RE = re.compile(r'^(.*?)<\s*%s(?:\s|>)' % nodename, re.S)
    HEADER_END_RE = re.compile(r'<\s*/%s\s*>' % nodename, re.S)
-    text = _normalize_input(obj)
+    text = body_or_str(obj)

     header_start = re.search(HEADER_START_RE, text)
     header_start = header_start.group(1).strip() if header_start else ''
@@ -46,14 +37,14 @@ def csviter(obj, delimiter=None, headers=None):
     - a string encoded as utf-8

     delimiter is the character used to separate field on the given obj.
     headers is an iterable that when provided offers the keys
     for the returned dictionaries, if not the first row is used.
     """
     def _getrow(csv_r):
         return [str_to_unicode(field) for field in csv_r.next()]

-    lines = _normalize_input(obj, unicode=False).splitlines(True)
+    lines = body_or_str(obj, unicode=False).splitlines(True)

     if delimiter:
         csv_r = csv.reader(lines, delimiter=delimiter)
     else:


@@ -3,14 +3,13 @@ This module provides some useful functions for working with
 scrapy.http.Response objects
 """

-from scrapy.xpath import XPathSelector
-from scrapy.http.response import ResponseBody
+from scrapy.http.response import Response

-def new_response_from_xpaths(response, xpaths):
-    """Return a new response constructed by applying the given xpaths to the
-    original response body
-    """
-    xs = XPathSelector(response)
-    parts = [''.join([n for n in xs.x(x).extract()]) for x in xpaths]
-    new_body_content = ''.join(parts)
-    return response.replace(body=ResponseBody(content=new_body_content, declared_encoding=response.body.get_declared_encoding()))
+def body_or_str(obj, unicode=True):
+    assert isinstance(obj, (Response, basestring)), "obj must be Response or basestring, not %s" % type(obj).__name__
+    if isinstance(obj, Response):
+        return obj.body.to_unicode() if unicode else obj.body.to_string()
+    elif isinstance(obj, str):
+        return obj.decode('utf-8') if unicode else obj
+    else:
+        return obj if unicode else obj.encode('utf-8')
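
A quick usage sketch of body_or_str as defined above (Python 2 semantics, since it relies on basestring and the str/unicode split; the inputs are made up):

    print(body_or_str('caf\xc3\xa9'))              # utf-8 str -> u'caf\xe9'
    print(body_or_str(u'caf\xe9', unicode=False))  # unicode -> utf-8 encoded str
    # a Response argument returns its body decoded (unicode=True) or raw (unicode=False)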