Mirror of https://github.com/scrapy/scrapy.git
This changeset improves the extractors' implementation:

* moved LinkExtractor.extract_links to a private method and created a wrapper, in order to be able to work with text directly
* removed the fugly new_response_from_xpaths from scrapy.utils.response and replaced it with a better internal algorithm
* moved the former _normalize_input from scrapy.utils.iterators to scrapy.utils.response to fill the hole
* turned the extractors' output lists into generators; this is safe because the result is always used in for..in constructs
* adapted the test for generators (the test should be rewritten anyway)

--HG--
extra : convert_revision : svn%3Ab85faa78-f9eb-468e-a121-7cced6da292c%40597
parent c9e48dc5ed
commit ebd5f465aa
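The core of the change is easiest to see outside the diffs. Below is a minimal, self-contained sketch of the pattern the commit message describes: a private generator that works on raw text, plus a thin public wrapper that adapts a response object to it. It uses modern Python and a made-up _FakeResponse stand-in; it is not the actual Scrapy/SGMLParser code, which follows in the hunks below.

# Hypothetical, simplified illustration -- not the real Scrapy classes.
import re
from urllib.parse import urljoin

class _FakeResponse:
    """Stand-in for scrapy.http.Response, just enough for this sketch."""
    def __init__(self, url, body):
        self.url = url
        self.body = body

class SimpleLinkExtractor:
    _HREF_RE = re.compile(r'href="([^"]+)"')

    def _extract_links(self, response_text, response_url, unique=False):
        # Private generator: links are produced lazily, one at a time.
        seen = set()
        for href in self._HREF_RE.findall(response_text):
            url = urljoin(response_url, href).strip()
            if unique:
                if url in seen:
                    continue
                seen.add(url)
            yield url

    def extract_links(self, response, unique=False):
        # Thin wrapper so callers keep passing response objects,
        # while _extract_links works directly on text.
        return self._extract_links(response.body, response.url, unique)

if __name__ == "__main__":
    html = '<a href="/a.html">A</a> <a href="/a.html">A again</a> <a href="b.html">B</a>'
    response = _FakeResponse("http://example.org/page/", html)
    # Consumed once in a for..in loop, as the commit message assumes.
    for link in SimpleLinkExtractor().extract_links(response, unique=True):
        print(link)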
@@ -35,18 +35,20 @@ class LinkExtractor(FixedSGMLParser):
         self.scan_attr = attr if callable(attr) else lambda a: a == attr
         self.current_link = None
 
-    def extract_links(self, response, unique=False):
+    def _extract_links(self, response_text, response_url, unique):
         self.reset()
         self.unique = unique
-        self.feed(response.body.to_string())
+        self.feed(response_text)
         self.close()
 
-        base_url = self.base_url if self.base_url else response.url
-        ret = []
+        base_url = self.base_url if self.base_url else response_url
         for link in self.links:
             link.url = urljoin(base_url, link.url).strip()
-            ret.append(link)
-        return ret
+            yield link
+
+    def extract_links(self, response, unique=False):
+        # wrapper needed to allow to work directly with text
+        return self._extract_links(response.body.to_string(), response.url, unique)
 
     def reset(self):
         FixedSGMLParser.reset(self)
@@ -9,9 +9,8 @@ import urlparse
 
 from scrapy.link import LinkExtractor, Link
 from scrapy.utils.url import canonicalize_url, url_is_from_any_domain
-from scrapy.utils.response import new_response_from_xpaths
 from scrapy.utils.python import unicode_to_str
-from scrapy.xpath.selector import HtmlXPathSelector
+from scrapy.xpath import HtmlXPathSelector
 
 _re_type = type(re.compile("", 0))
 
@@ -56,23 +55,25 @@ class RegexLinkExtractor(LinkExtractor):
 
     def extract_links(self, response, unique=True):
         if self.restrict_xpaths:
-            response = new_response_from_xpaths(response, self.restrict_xpaths)
+            hxs = HtmlXPathSelector(response)
+            html_slice = ''.join(''.join(html_fragm for html_fragm in hxs.x(xpath_expr).extract()) for xpath_expr in self.restrict_xpaths)
+            links = self._extract_links(html_slice, response.url, unique)
+        else:
+            links = LinkExtractor.extract_links(self, response, unique)
 
-        links = LinkExtractor.extract_links(self, response, unique)
-        links = [link for link in links if _is_valid_url(link.url)]
+        links = (link for link in links if _is_valid_url(link.url))
 
         if self.allow_res:
-            links = [link for link in links if _matches(link.url, self.allow_res)]
+            links = (link for link in links if _matches(link.url, self.allow_res))
         if self.deny_res:
-            links = [link for link in links if not _matches(link.url, self.deny_res)]
+            links = (link for link in links if not _matches(link.url, self.deny_res))
         if self.allow_domains:
-            links = [link for link in links if url_is_from_any_domain(link.url, self.allow_domains)]
+            links = (link for link in links if url_is_from_any_domain(link.url, self.allow_domains))
        if self.deny_domains:
-            links = [link for link in links if not url_is_from_any_domain(link.url, self.deny_domains)]
+            links = (link for link in links if not url_is_from_any_domain(link.url, self.deny_domains))
 
         if self.canonicalize:
-            for link in links:
-                link.url = canonicalize_url(link.url)
+            links = (canonicalize_url(link.url) for link in links)
 
         return links
 
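The list-to-generator rewrite above chains generator expressions so that each filter is applied lazily. A rough sketch of that pattern outside Scrapy (the filter arguments and URLs here are invented purely for illustration):

def filter_links(urls, allow_substring=None, deny_substring=None):
    # Each step wraps the previous generator; nothing is scanned yet.
    links = (u.strip() for u in urls)
    if allow_substring:
        links = (u for u in links if allow_substring in u)
    if deny_substring:
        links = (u for u in links if deny_substring not in u)
    return links  # still a lazy pipeline

candidates = ["http://example.org/cat/1", "http://example.org/about", "http://other.example/cat/2"]
for url in filter_links(candidates, allow_substring="/cat/", deny_substring="other.example"):
    print(url)  # -> http://example.org/cat/1

The trade-off is that the result can only be iterated once, which is why the commit message stresses that callers always consume it in a for..in construct.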
@@ -17,7 +17,8 @@ class LinkExtractorTestCase(unittest.TestCase):
         response = Response("example.org", "http://example.org/somepage/index.html", body=html)
 
         lx = LinkExtractor()  # default: tag=a, attr=href
-        self.assertEqual(lx.extract_links(response),
+        links = [link for link in lx.extract_links(response)]
+        self.assertEqual(links,
                          [Link(url='http://example.org/somepage/item/12.html', text='Item 12'),
                           Link(url='http://example.org/about.html', text='About us'),
                           Link(url='http://example.org/othercat.html', text='Other category'),
@@ -30,7 +31,8 @@ class LinkExtractorTestCase(unittest.TestCase):
         response = Response("example.org", "http://example.org/somepage/index.html", body=html)
 
         lx = LinkExtractor()  # default: tag=a, attr=href
-        self.assertEqual(lx.extract_links(response),
+        links = [link for link in lx.extract_links(response)]
+        self.assertEqual(links,
                          [Link(url='http://otherdomain.com/base/item/12.html', text='Item 12')])
 
     def test_matches(self):
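The test adaptation above is needed because assertEqual previously compared the extractor's return value directly, and a generator never compares equal to a list even when it yields the same items; materializing it first restores the comparison. A tiny standalone illustration (hypothetical, not Scrapy code):

def numbers():
    yield 1
    yield 2

assert numbers() != [1, 2]          # generator object vs. list: never equal
assert list(numbers()) == [1, 2]    # materialize first, then compare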
@@ -1,18 +1,9 @@
 import re, csv
 
 from scrapy.xpath import XmlXPathSelector
 from scrapy.http import Response
 from scrapy import log
-from scrapy.utils.python import re_rsearch, str_to_unicode, unicode_to_str
-
-def _normalize_input(obj, unicode=True):
-    assert isinstance(obj, (Response, basestring)), "obj must be Response or basestring, not %s" % type(obj).__name__
-    if isinstance(obj, Response):
-        return obj.body.to_unicode() if unicode else obj.body.to_string()
-    elif isinstance(obj, str):
-        return obj.decode('utf-8') if unicode else obj
-    else:
-        return obj if unicode else obj.encode('utf-8')
+from scrapy.utils.python import re_rsearch, str_to_unicode
+from scrapy.utils.response import body_or_str
 
 def xmliter(obj, nodename):
     """Return a iterator of XPathSelector's over all nodes of a XML document,
@@ -25,7 +16,7 @@ def xmliter(obj, nodename):
     """
     HEADER_START_RE = re.compile(r'^(.*?)<\s*%s(?:\s|>)' % nodename, re.S)
    HEADER_END_RE = re.compile(r'<\s*/%s\s*>' % nodename, re.S)
-    text = _normalize_input(obj)
+    text = body_or_str(obj)
 
     header_start = re.search(HEADER_START_RE, text)
     header_start = header_start.group(1).strip() if header_start else ''
@@ -53,7 +44,7 @@ def csviter(obj, delimiter=None, headers=None):
     def _getrow(csv_r):
         return [str_to_unicode(field) for field in csv_r.next()]
 
-    lines = _normalize_input(obj, unicode=False).splitlines(True)
+    lines = body_or_str(obj, unicode=False).splitlines(True)
     if delimiter:
         csv_r = csv.reader(lines, delimiter=delimiter)
     else:
@@ -3,14 +3,13 @@ This module provides some useful functions for working with
 scrapy.http.Response objects
 """
 
-from scrapy.xpath import XPathSelector
-from scrapy.http.response import ResponseBody
+from scrapy.http.response import Response
 
-def new_response_from_xpaths(response, xpaths):
-    """Return a new response constructed by applying the given xpaths to the
-    original response body
-    """
-    xs = XPathSelector(response)
-    parts = [''.join([n for n in xs.x(x).extract()]) for x in xpaths]
-    new_body_content = ''.join(parts)
-    return response.replace(body=ResponseBody(content=new_body_content, declared_encoding=response.body.get_declared_encoding()))
+def body_or_str(obj, unicode=True):
+    assert isinstance(obj, (Response, basestring)), "obj must be Response or basestring, not %s" % type(obj).__name__
+    if isinstance(obj, Response):
+        return obj.body.to_unicode() if unicode else obj.body.to_string()
+    elif isinstance(obj, str):
+        return obj.decode('utf-8') if unicode else obj
+    else:
+        return obj if unicode else obj.encode('utf-8')
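Finally, a modernized sketch of the body_or_str idea introduced above. The real helper is Python 2 and relies on Scrapy's ResponseBody API; the FakeResponse class and encoding handling below are assumptions made only for illustration. The point is the contract: accept either a response-like object or plain text, and normalize it to text or bytes as requested.

class FakeResponse:
    """Stand-in for scrapy.http.Response, just enough for this sketch."""
    def __init__(self, body, encoding="utf-8"):
        self.body = body            # raw bytes
        self.encoding = encoding

def body_or_str(obj, unicode=True):
    # Response-like object: pull out the body, decoding if text was requested.
    if isinstance(obj, FakeResponse):
        return obj.body.decode(obj.encoding) if unicode else obj.body
    # Raw bytes: decode to text on request.
    if isinstance(obj, bytes):
        return obj.decode("utf-8") if unicode else obj
    # Already text: encode to bytes on request.
    return obj if unicode else obj.encode("utf-8")

print(body_or_str(FakeResponse(b"<root/>")))      # -> <root/>
print(body_or_str("<root/>", unicode=False))      # -> b'<root/>'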