
this changeset improves the extractors' implementation:

* moved LinkExtractor.extract_links into a private method and added a public wrapper, so the extractor can also work with raw text directly
* removed the fugly new_response_from_xpaths from scrapy.utils.response and replaced it with a better internal algorithm
* moved the former _normalize_input from scrapy.utils.iterators to scrapy.utils.response (as body_or_str) to fill the gap
* turned the extractors' output lists into generators; this is safe because the result is always consumed in for..in constructs (see the sketch after this list)
* adapted the tests for generators (the tests should be rewritten anyway)
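
A minimal sketch of why the list-to-generator swap is transparent to
for..in consumers (illustrative only; these helper names are not part of
the changeset):

    # Both versions behave identically under iteration, which is the only
    # way extract_links results are consumed.
    def extract_as_list(items):
        ret = []
        for item in items:
            ret.append(item.strip())
        return ret                  # builds the whole list up front

    def extract_as_generator(items):
        for item in items:
            yield item.strip()      # produces items lazily, one at a time

    for link in extract_as_generator([' a ', ' b ']):
        print link                  # same output as the list version

The observable differences are that a generator is exhausted after one
pass and has no len(); neither is relied upon by the callers.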

--HG--
extra : convert_revision : svn%3Ab85faa78-f9eb-468e-a121-7cced6da292c%40597
samus_ 2009-01-02 08:42:36 +00:00
parent c9e48dc5ed
commit ebd5f465aa
5 changed files with 42 additions and 47 deletions

@@ -35,18 +35,20 @@ class LinkExtractor(FixedSGMLParser):
         self.scan_attr = attr if callable(attr) else lambda a: a == attr
         self.current_link = None

-    def extract_links(self, response, unique=False):
+    def _extract_links(self, response_text, response_url, unique):
         self.reset()
         self.unique = unique
-        self.feed(response.body.to_string())
+        self.feed(response_text)
         self.close()
-        base_url = self.base_url if self.base_url else response.url
-        ret = []
+        base_url = self.base_url if self.base_url else response_url
         for link in self.links:
             link.url = urljoin(base_url, link.url).strip()
-            ret.append(link)
-        return ret
+            yield link
+
+    def extract_links(self, response, unique=False):
+        # wrapper needed to allow working with text directly
+        return self._extract_links(response.body.to_string(), response.url, unique)

     def reset(self):
         FixedSGMLParser.reset(self)
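
For context, a hedged usage sketch of the new split; these call sites are
illustrative, not part of the diff:

    lx = LinkExtractor()  # default: tag=a, attr=href

    # Usual path: extract from a Response via the public wrapper.
    for link in lx.extract_links(response):
        print link.url, link.text

    # What the refactoring enables: feeding raw HTML text plus a base URL
    # straight to the private method.
    html = '<a href="/about.html">About us</a>'
    for link in lx._extract_links(html, 'http://example.org/', unique=False):
        print link.url, link.text   # http://example.org/about.html About us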

@@ -9,9 +9,8 @@ import urlparse

 from scrapy.link import LinkExtractor, Link
 from scrapy.utils.url import canonicalize_url, url_is_from_any_domain
-from scrapy.utils.response import new_response_from_xpaths
 from scrapy.utils.python import unicode_to_str
-from scrapy.xpath.selector import HtmlXPathSelector
+from scrapy.xpath import HtmlXPathSelector

 _re_type = type(re.compile("", 0))

@@ -56,23 +55,25 @@ class RegexLinkExtractor(LinkExtractor):
     def extract_links(self, response, unique=True):
         if self.restrict_xpaths:
-            response = new_response_from_xpaths(response, self.restrict_xpaths)
-        links = LinkExtractor.extract_links(self, response, unique)
+            hxs = HtmlXPathSelector(response)
+            html_slice = ''.join(''.join(html_fragm for html_fragm in hxs.x(xpath_expr).extract()) for xpath_expr in self.restrict_xpaths)
+            links = self._extract_links(html_slice, response.url, unique)
+        else:
+            links = LinkExtractor.extract_links(self, response, unique)

-        links = [link for link in links if _is_valid_url(link.url)]
+        links = (link for link in links if _is_valid_url(link.url))

         if self.allow_res:
-            links = [link for link in links if _matches(link.url, self.allow_res)]
+            links = (link for link in links if _matches(link.url, self.allow_res))
         if self.deny_res:
-            links = [link for link in links if not _matches(link.url, self.deny_res)]
+            links = (link for link in links if not _matches(link.url, self.deny_res))
         if self.allow_domains:
-            links = [link for link in links if url_is_from_any_domain(link.url, self.allow_domains)]
+            links = (link for link in links if url_is_from_any_domain(link.url, self.allow_domains))
         if self.deny_domains:
-            links = [link for link in links if not url_is_from_any_domain(link.url, self.deny_domains)]
+            links = (link for link in links if not url_is_from_any_domain(link.url, self.deny_domains))

         if self.canonicalize:
-            for link in links:
-                link.url = canonicalize_url(link.url)
+            links = (canonicalize_url(link.url) for link in links)

         return links
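
The chained generator expressions above build a lazy pipeline: each filter
wraps the previous iterator and no filtering happens until the caller
iterates. A small sketch of the pattern (the input and consumer names are
invented):

    links = iter(all_links)
    links = (l for l in links if _is_valid_url(l.url))        # nothing runs yet
    links = (l for l in links if _matches(l.url, allow_res))  # just wraps again
    for l in links:     # predicates execute here, one link at a time
        process(l)

One consequence: the pipeline is single-use, so a caller needing the links
twice must materialize them first with list(), as the adapted tests do.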

@@ -17,7 +17,8 @@ class LinkExtractorTestCase(unittest.TestCase):
         response = Response("example.org", "http://example.org/somepage/index.html", body=html)
         lx = LinkExtractor()  # default: tag=a, attr=href
-        self.assertEqual(lx.extract_links(response),
+        links = [link for link in lx.extract_links(response)]
+        self.assertEqual(links,
             [Link(url='http://example.org/somepage/item/12.html', text='Item 12'),
              Link(url='http://example.org/about.html', text='About us'),
              Link(url='http://example.org/othercat.html', text='Other category'),

@@ -30,7 +31,8 @@ class LinkExtractorTestCase(unittest.TestCase):
         response = Response("example.org", "http://example.org/somepage/index.html", body=html)
         lx = LinkExtractor()  # default: tag=a, attr=href
-        self.assertEqual(lx.extract_links(response),
+        links = [link for link in lx.extract_links(response)]
+        self.assertEqual(links,
             [Link(url='http://otherdomain.com/base/item/12.html', text='Item 12')])

     def test_matches(self):
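
The list comprehension in the adapted tests is needed because a generator
object never compares equal to a list, as a quick interpreter session shows:

    >>> (x for x in [1, 2]) == [1, 2]
    False
    >>> [x for x in (x for x in [1, 2])] == [1, 2]
    True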

@@ -1,18 +1,9 @@
 import re, csv

 from scrapy.xpath import XmlXPathSelector
-from scrapy.http import Response
 from scrapy import log
-from scrapy.utils.python import re_rsearch, str_to_unicode, unicode_to_str
-
-def _normalize_input(obj, unicode=True):
-    assert isinstance(obj, (Response, basestring)), "obj must be Response or basestring, not %s" % type(obj).__name__
-    if isinstance(obj, Response):
-        return obj.body.to_unicode() if unicode else obj.body.to_string()
-    elif isinstance(obj, str):
-        return obj.decode('utf-8') if unicode else obj
-    else:
-        return obj if unicode else obj.encode('utf-8')
+from scrapy.utils.python import re_rsearch, str_to_unicode
+from scrapy.utils.response import body_or_str

 def xmliter(obj, nodename):
     """Return an iterator of XPathSelector's over all nodes of a XML document,

@@ -25,7 +16,7 @@ def xmliter(obj, nodename):
     """
     HEADER_START_RE = re.compile(r'^(.*?)<\s*%s(?:\s|>)' % nodename, re.S)
     HEADER_END_RE = re.compile(r'<\s*/%s\s*>' % nodename, re.S)
-    text = _normalize_input(obj)
+    text = body_or_str(obj)

     header_start = re.search(HEADER_START_RE, text)
     header_start = header_start.group(1).strip() if header_start else ''

@@ -53,7 +44,7 @@ def csviter(obj, delimiter=None, headers=None):
     def _getrow(csv_r):
         return [str_to_unicode(field) for field in csv_r.next()]

-    lines = _normalize_input(obj, unicode=False).splitlines(True)
+    lines = body_or_str(obj, unicode=False).splitlines(True)

     if delimiter:
         csv_r = csv.reader(lines, delimiter=delimiter)
     else:
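
With body_or_str in place, xmliter and csviter accept either a Response or
a plain string. A hedged usage sketch (the XML payload and node name are
invented):

    from scrapy.utils.iterators import xmliter

    xml = '<products><product>one</product><product>two</product></products>'
    for node in xmliter(xml, 'product'):        # a plain str works...
        print node.x('text()').extract()

    for node in xmliter(response, 'product'):   # ...and so does a Response
        print node.x('text()').extract()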

@@ -3,14 +3,13 @@ This module provides some useful functions for working with
 scrapy.http.Response objects
 """

-from scrapy.xpath import XPathSelector
-from scrapy.http.response import ResponseBody
+from scrapy.http.response import Response

-def new_response_from_xpaths(response, xpaths):
-    """Return a new response constructed by applying the given xpaths to the
-    original response body
-    """
-    xs = XPathSelector(response)
-    parts = [''.join([n for n in xs.x(x).extract()]) for x in xpaths]
-    new_body_content = ''.join(parts)
-    return response.replace(body=ResponseBody(content=new_body_content, declared_encoding=response.body.get_declared_encoding()))
+def body_or_str(obj, unicode=True):
+    assert isinstance(obj, (Response, basestring)), "obj must be Response or basestring, not %s" % type(obj).__name__
+    if isinstance(obj, Response):
+        return obj.body.to_unicode() if unicode else obj.body.to_string()
+    elif isinstance(obj, str):
+        return obj.decode('utf-8') if unicode else obj
+    else:
+        return obj if unicode else obj.encode('utf-8')
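
A hedged summary of body_or_str's contract, written as assumed call sites
(Python 2 semantics, matching the code above):

    body_or_str(response)                   # -> response.body.to_unicode()
    body_or_str(response, unicode=False)    # -> response.body.to_string()
    body_or_str('caf\xc3\xa9')              # str: decoded as UTF-8 -> u'caf\xe9'
    body_or_str(u'caf\xe9', unicode=False)  # unicode: encoded to UTF-8 bytes
    body_or_str(123)                        # AssertionError: not a Response or basestring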