
Automated merge with ssh://hg.scrapy.org:2222/scrapy-0.12

Pablo Hoffman 2011-02-24 15:28:57 -02:00
commit cfd11df539
15 changed files with 152 additions and 28 deletions

View File

@@ -3,7 +3,7 @@
 Frequently Asked Questions
 ==========================
 
-How does Scrapy compare to BeautifulSoul or lxml?
+How does Scrapy compare to BeautifulSoup or lxml?
 -------------------------------------------------
 
 `BeautifulSoup`_ and `lxml`_ are libraries for parsing HTML and XML. Scrapy is
@@ -29,7 +29,7 @@ comparing `jinja2`_ to `Django`_.
 What Python versions does Scrapy support?
 -----------------------------------------
 
-Scrapy runs in Python 2.5, 2.6 and 2.6. But it's recommended you use Python 2.6
+Scrapy runs in Python 2.5, 2.6 and 2.7. But it's recommended you use Python 2.6
 or above, since the Python 2.5 standard library has a few bugs in their URL
 handling libraries. Some of these Python 2.5 bugs not only affect Scrapy but
 any user code, such as spiders. You can see a list of `Python 2.5 bugs that
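
On the comparison above: BeautifulSoup and lxml parse markup, while Scrapy drives the whole crawl and ships its own extraction API on top. A minimal sketch of the selector side, assuming the 0.1x-era ``HtmlXPathSelector`` and a made-up snippet of HTML::

    from scrapy.selector import HtmlXPathSelector  # Scrapy 0.1x selector API

    hxs = HtmlXPathSelector(text=u"<html><body><h1>Hello</h1></body></html>")
    print hxs.select("//h1/text()").extract()  # [u'Hello']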

View File

@@ -86,7 +86,7 @@ in your Ubuntu servers.
 So, if you plan to deploy Scrapyd on a Ubuntu server, just add the Ubuntu
 repositories as described in :ref:`topics-ubuntu` and then run::
 
-    aptitude install scrapyd-0.12
+    aptitude install scrapyd-0.13
 
 This will install Scrapyd in your Ubuntu server creating a ``scrapy`` user
 which Scrapyd will run as. It will also create some directories and files that

View File

@@ -13,7 +13,7 @@ latest bug fixes.
 
 To use the packages, just add the following line to your
 ``/etc/apt/sources.list``, and then run ``aptitude update`` and ``aptitude
-install scrapy-0.12``::
+install scrapy-0.13``::
 
     deb http://archive.scrapy.org/ubuntu DISTRO main

View File

@@ -2,8 +2,8 @@
 Scrapy - a screen scraping framework written in Python
 """
 
-version_info = (0, 12, 0)
-__version__ = "0.12.0"
+version_info = (0, 13, 0)
+__version__ = "0.13.0"
 
 import sys, os, warnings
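
Since ``version_info`` is a tuple, downstream code can gate on releases with an element-wise comparison instead of string parsing; a small sketch (the threshold is just an example)::

    import scrapy

    # tuples compare element-wise, so (0, 13, 0) >= (0, 13) holds
    if scrapy.version_info >= (0, 13):
        print "running scrapy %s" % scrapy.__version__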

View File

@@ -57,6 +57,8 @@ class Command(ScrapyCommand):
             help="list available projects on TARGET")
         parser.add_option("--egg", metavar="FILE",
             help="use the given egg, instead of building it")
+        parser.add_option("--build-egg", metavar="FILE",
+            help="only build the egg, don't deploy it")
 
     def run(self, args, opts):
         try:
@@ -75,18 +77,26 @@ class Command(ScrapyCommand):
             projects = json.loads(f.read())['projects']
             print os.linesep.join(projects)
             return
 
-        target_name = _get_target_name(args)
-        target = _get_target(target_name)
-        project = _get_project(target, opts)
-        version = _get_version(target, opts)
         tmpdir = None
-        if opts.egg:
-            _log("Using egg: %s" % opts.egg)
-            egg = opts.egg
-        else:
-            _log("Building egg of %s-%s" % (project, version))
+        if opts.build_egg: # build egg only
             egg, tmpdir = _build_egg()
-        _upload_egg(target, egg, project, version)
+            _log("Writing egg to %s" % opts.build_egg)
+            shutil.copyfile(egg, opts.build_egg)
+        else: # build egg and deploy
+            target_name = _get_target_name(args)
+            target = _get_target(target_name)
+            project = _get_project(target, opts)
+            version = _get_version(target, opts)
+            if opts.egg:
+                _log("Using egg: %s" % opts.egg)
+                egg = opts.egg
+            else:
+                _log("Building egg of %s-%s" % (project, version))
+                egg, tmpdir = _build_egg()
+            _upload_egg(target, egg, project, version)
+
        if tmpdir:
            shutil.rmtree(tmpdir)
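
With this change the command gains an offline mode: an invocation along the lines of ``scrapy deploy --build-egg out.egg`` (hypothetical, inferred from the option help) builds the egg, copies it to the given file, and skips target lookup and upload entirely. Either way, the temporary build directory is removed at the end; a minimal sketch of that build-copy-cleanup flow, with ``build_egg_sketch`` as a made-up stand-in for ``_build_egg``::

    import os, shutil, tempfile

    def build_egg_sketch():
        # hypothetical stand-in for _build_egg(); returns (egg_path, tmpdir)
        tmpdir = tempfile.mkdtemp(prefix="scrapydeploy-")
        egg = os.path.join(tmpdir, "project.egg")
        open(egg, "wb").close()  # placeholder for the real egg-building step
        return egg, tmpdir

    egg, tmpdir = build_egg_sketch()
    shutil.copyfile(egg, "out.egg")  # what --build-egg does instead of uploading
    if tmpdir:
        shutil.rmtree(tmpdir)        # the same cleanup run() performs in both modes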

View File

@@ -75,7 +75,7 @@ class BasicTypeExtractor(object):
     u'<div data-scrapy-annotate="{&quot;annotations&quot;: {&quot;content&quot;: &quot;name&quot;}}">x<b> xx</b></div>',\
     u'<div>a name<b> id-9</b></div>')
     >>> ex = BasicTypeExtractor(template.annotations[0])
-    >>> ex.extract(page, 0, 3, [LabelledRegion(*(1,2))])
+    >>> ex.extract(page, 0, 3, [LabelledRegion(1, 2)])
     [(u'name', u'a name')]
     """
@@ -395,7 +395,7 @@ class RecordExtractor(object):
             s, p, e = similar_region(page.page_tokens, self.template_tokens, \
                     i, start, sindex)
             if s > 0:
-                similar_ignored_regions.append(LabelledRegion(*(p, e)))
+                similar_ignored_regions.append(LabelledRegion(p, e))
             start = e or start
         extracted_data = first_region.extract(page, pindex, sindex, similar_ignored_regions)
         if extracted_data:

View File

@@ -132,6 +132,10 @@ def image_url(txt):
     ['http://s7d5.scene7.com/is/image/wasserstrom/165133?wid=227&hei=227&defaultImage=noimage_wasserstrom']
     """
+    imgurl = extract_image_url(txt)
+    return [safe_url_string(remove_entities(url(imgurl)))] if imgurl else None
+
+def extract_image_url(txt):
     txt = url(txt)
     imgurl = None
     if txt:
@@ -153,4 +157,4 @@ def image_url(txt):
         imgurl = urlparse.urlunparse(parsed)
     if not imgurl:
         imgurl = txt
-    return [safe_url_string(remove_entities(url(imgurl)))] if imgurl else None
+    return imgurl
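
After the split, ``extract_image_url`` returns the bare URL string (or ``None``) and ``image_url`` keeps its old contract of wrapping the result in a one-element list. A hypothetical session sketching the intended contract, assuming a plain URL input that needs no rewriting::

    >>> extract_image_url(u' http://www.example.com/images/1.jpg ')
    u'http://www.example.com/images/1.jpg'
    >>> image_url(u' http://www.example.com/images/1.jpg ')
    [u'http://www.example.com/images/1.jpg']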

View File

@@ -80,8 +80,8 @@ class HtmlTag(HtmlDataFragment):
     def __repr__(self):
         return str(self)
 
-_ATTR = "((?:[^=/>\s]|/(?!>))+)(?:\s*=(?:\s*\"(.*?)\"|\s*'(.*?)'|([^>\s]+))?)?"
-_TAG = "<(\/?)(\w+(?::\w+)?)((?:\s+" + _ATTR + ")+\s*|\s*)(\/?)>"
+_ATTR = "((?:[^=/<>\s]|/(?!>))+)(?:\s*=(?:\s*\"(.*?)\"|\s*'(.*?)'|([^>\s]+))?)?"
+_TAG = "<(\/?)(\w+(?::\w+)?)((?:\s*" + _ATTR + ")+\s*|\s*)(\/?)>?"
 _DOCTYPE = r"<!DOCTYPE.*?>"
 _SCRIPT = "(<script.*?>)(.*?)(</script.*?>)"
 _COMMENT = "(<!--.*?-->)"
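
Three relaxations are packed into these two lines: attribute names can no longer contain ``<`` (so a stray opening bracket ends the previous tag), attributes may now be separated by nothing at all (``\s+`` between attributes became ``\s*``), and the closing ``>`` is optional (``>`` became ``>?``). A quick standalone check against the kind of markup the new PAGE9 sample below exercises::

    import re

    _ATTR = "((?:[^=/<>\s]|/(?!>))+)(?:\s*=(?:\s*\"(.*?)\"|\s*'(.*?)'|([^>\s]+))?)?"
    _TAG = "<(\/?)(\w+(?::\w+)?)((?:\s*" + _ATTR + ")+\s*|\s*)(\/?)>?"

    # 'height' and 'src' are not separated by whitespace, yet the tag matches
    m = re.match(_TAG, "<img width='230' height='150'src='/images/9589.jpg' >")
    print m.group(2)  # img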

View File

@@ -2,14 +2,19 @@ from scrapy.http import Response
 from scrapy.selector import XmlXPathSelector
 
-def xmliter_lxml(obj, nodename):
+def xmliter_lxml(obj, nodename, namespace=None):
     from lxml import etree
     reader = _StreamReader(obj)
-    iterable = etree.iterparse(reader, tag=nodename, encoding=reader.encoding)
+    tag = '{%s}%s' % (namespace, nodename) if namespace else nodename
+    iterable = etree.iterparse(reader, tag=tag, encoding=reader.encoding)
+    selxpath = '//' + ('x:%s' % nodename if namespace else nodename)
     for _, node in iterable:
         nodetext = etree.tostring(node)
         node.clear()
-        yield XmlXPathSelector(text=nodetext).select('//' + nodename)[0]
+        xs = XmlXPathSelector(text=nodetext)
+        if namespace:
+            xs.register_namespace('x', namespace)
+        yield xs.select(selxpath)[0]
 
 class _StreamReader(object):
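
Usage mirrors the new test at the bottom of this commit: pass the namespace URI as the third argument and the yielded selectors come with it registered under the ``x`` prefix. A self-contained sketch (module path assumed to be ``scrapy.utils.iterators``)::

    from scrapy.http import XmlResponse
    from scrapy.utils.iterators import xmliter_lxml

    NS = 'http://base.google.com/ns/1.0'
    body = ('<?xml version="1.0" encoding="UTF-8"?>'
            '<rss version="2.0" xmlns="%s"><channel><item>'
            '<image_link>http://www.mydummycompany.com/images/item1.jpg</image_link>'
            '</item></channel></rss>' % NS)
    response = XmlResponse(url='http://mydummycompany.com', body=body)

    # without the namespace argument, the namespaced nodes simply don't match
    for node in xmliter_lxml(response, 'image_link', NS):
        print node.select('text()').extract()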

View File

@@ -18,6 +18,9 @@ class Link(object):
     def __eq__(self, other):
         return self.url == other.url and self.text == other.text
 
+    def __hash__(self):
+        return hash(self.url) ^ hash(self.text)
+
     def __repr__(self):
         return '<Link url=%r text=%r >' % (self.url, self.text)
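
Defining ``__hash__`` consistently with ``__eq__`` is what lets equal links collapse in sets and dicts; the new ``test_link.py`` below exercises exactly this. A minimal illustration::

    from scrapy.link import Link

    links = set([Link("http://www.example.com"), Link("http://www.example.com")])
    print len(links)  # 1 -- equal links hash equally, so the duplicate collapses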

View File

@@ -516,7 +516,7 @@ ANNOTATED_PAGE19 = u"""
 <div>
 <p data-scrapy-annotate="{&quot;variant&quot;: 0, &quot;annotations&quot;: {&quot;content&quot;: &quot;name&quot;}}">Product name</p>
 <p data-scrapy-annotate="{&quot;variant&quot;: 0, &quot;annotations&quot;: {&quot;content&quot;: &quot;price&quot;}}">60.00</p>
-<img data-scrapy-annotate="{&quot;variant&quot;: 0, &quot;annotations&quot;: {&quot;src&quot;: &quot;image_urls&quot;}}"src="image.jpg" />
+<img data-scrapy-annotate="{&quot;variant&quot;: 0, &quot;annotations&quot;: {&quot;src&quot;: &quot;image_urls&quot;}}" src="image.jpg" />
 <p data-scrapy-annotate="{&quot;variant&quot;: 0, &quot;required&quot;: [&quot;description&quot;], &quot;annotations&quot;: {&quot;content&quot;: &quot;description&quot;}}">description</p>
 </div>
 </body></html>

View File

@@ -137,3 +137,19 @@ class TestParseHtml(TestCase):
         parsed = list(parse_html("<IMG SRC='http://images.play.com/banners/SAM550a.jpg' align='left' / hspace=5>"))
         self.assertEqual(parsed[0].attributes, {'src': 'http://images.play.com/banners/SAM550a.jpg', \
             'align': 'left', 'hspace': '5', '/': None})
+
+    def test_no_ending_body(self):
+        """Test case when no ending body nor html elements are present"""
+        parsed = [_decode_element(d) for d in PARSED7]
+        self._test_sample(PAGE7, parsed)
+
+    def test_malformed(self):
+        """Test parsing of some malformed cases"""
+        parsed = [_decode_element(d) for d in PARSED8]
+        self._test_sample(PAGE8, parsed)
+
+    def test_malformed2(self):
+        """Test case when attributes are not separated by space (still recognizable because of quotes)"""
+        parsed = [_decode_element(d) for d in PARSED9]
+        self._test_sample(PAGE9, parsed)

View File

@@ -246,3 +246,32 @@ PARSED7 = [
     {'end': 99, 'start': 85},
 ]
+
+PAGE8 = u"""<a href="/overview.asp?id=277"><img border="0" src="/img/5200814311.jpg" title=\'Vinyl Cornice\'</a></td><table width=\'5\'>"""
+PARSED8 = [
+    {'attributes' : {u'href' : u"/overview.asp?id=277"}, 'end': 31, 'start': 0, 'tag': u'a', 'tag_type': 1},
+    {'attributes' : {u'src' : u"/img/5200814311.jpg", u'border' : u"0", u'title': u'Vinyl Cornice'}, 'end': 94, 'start': 31, 'tag': u'img', 'tag_type': 1},
+    {'attributes' : {}, 'end': 98, 'start': 94, 'tag': u'a', 'tag_type': 2},
+    {'attributes' : {}, 'end': 103, 'start': 98, 'tag': u'td', 'tag_type': 2},
+    {'attributes' : {u'width': u'5'}, 'end': 120, 'start': 103, 'tag': u'table', 'tag_type': 1}
+]
+
+PAGE9 = u"""\
+<html>\
+<body>\
+<img width='230' height='150'src='/images/9589.jpg' >\
+<a href="/product/9589">Click here</a>\
+</body>\
+</html>\
+"""
+PARSED9 = [
+    {'attributes' : {}, 'end': 6, 'start': 0, 'tag': 'html', 'tag_type': 1},
+    {'attributes' : {}, 'end': 12, 'start': 6, 'tag': 'body', 'tag_type': 1},
+    {'attributes' : {'width': '230', 'height': '150', 'src': '/images/9589.jpg'}, 'end': 65, 'start': 12, 'tag': 'img', 'tag_type': 1},
+    {'attributes' : {'href': '/product/9589'}, 'end': 89, 'start': 65, 'tag': 'a', 'tag_type': 1},
+    {'end': 99, 'start': 89},
+    {'attributes' : {}, 'end': 103, 'start': 99, 'tag': 'a', 'tag_type': 2},
+    {'attributes' : {}, 'end': 110, 'start': 103, 'tag': 'body', 'tag_type': 2},
+    {'attributes' : {}, 'end': 117, 'start': 110, 'tag': 'html', 'tag_type': 2},
+]
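
The ``start``/``end`` values are plain character offsets into the page string, so they can be checked by slicing; e.g. the img element of PAGE9::

    print PAGE9[12:65]  # <img width='230' height='150'src='/images/9589.jpg' >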

scrapy/tests/test_link.py (new file, +28 lines)
View File

@@ -0,0 +1,28 @@
+import unittest
+
+from scrapy.link import Link
+
+
+class LinkTest(unittest.TestCase):
+
+    def test_eq_and_hash(self):
+        l1 = Link("http://www.example.com")
+        l2 = Link("http://www.example.com/other")
+        l3 = Link("http://www.example.com")
+        self.assertEqual(l1, l1)
+        self.assertEqual(hash(l1), hash(l1))
+        self.assertNotEqual(l1, l2)
+        self.assertNotEqual(hash(l1), hash(l2))
+        self.assertEqual(l1, l3)
+        self.assertEqual(hash(l1), hash(l3))
+
+        l4 = Link("http://www.example.com", text="test")
+        l5 = Link("http://www.example.com", text="test2")
+        l6 = Link("http://www.example.com", text="test")
+        self.assertEqual(l4, l4)
+        self.assertEqual(hash(l4), hash(l4))
+        self.assertNotEqual(l4, l5)
+        self.assertNotEqual(hash(l4), hash(l5))
+        self.assertEqual(l4, l6)
+        self.assertEqual(hash(l4), hash(l6))

View File

@@ -29,12 +29,12 @@ class XmliterTestCase(unittest.TestCase):
         for x in self.xmliter(response, 'product'):
             attrs.append((x.select("@id").extract(), x.select("name/text()").extract(), x.select("./type/text()").extract()))
 
-        self.assertEqual(attrs,
+        self.assertEqual(attrs,
                           [(['001'], ['Name 1'], ['Type 1']), (['002'], ['Name 2'], ['Type 2'])])
 
     def test_xmliter_text(self):
         body = u"""<?xml version="1.0" encoding="UTF-8"?><products><product>one</product><product>two</product></products>"""
 
         self.assertEqual([x.select("text()").extract() for x in self.xmliter(body, 'product')],
                          [[u'one'], [u'two']])
@@ -74,7 +74,7 @@ class XmliterTestCase(unittest.TestCase):
     def test_xmliter_exception(self):
         body = u"""<?xml version="1.0" encoding="UTF-8"?><products><product>one</product><product>two</product></products>"""
 
         iter = self.xmliter(body, 'product')
-        iter.next()
+        iter.next()
@@ -97,6 +97,35 @@ class LxmlXmliterTestCase(XmliterTestCase):
     except ImportError:
         skip = "lxml not available"
 
+    def test_xmliter_iterate_namespace(self):
+        body = """\
+        <?xml version="1.0" encoding="UTF-8"?>
+        <rss version="2.0" xmlns="http://base.google.com/ns/1.0">
+            <channel>
+            <title>My Dummy Company</title>
+            <link>http://www.mydummycompany.com</link>
+            <description>This is a dummy company. We do nothing.</description>
+            <item>
+                <title>Item 1</title>
+                <description>This is item 1</description>
+                <link>http://www.mydummycompany.com/items/1</link>
+                <image_link>http://www.mydummycompany.com/images/item1.jpg</image_link>
+                <image_link>http://www.mydummycompany.com/images/item2.jpg</image_link>
+            </item>
+            </channel>
+        </rss>
+        """
+        response = XmlResponse(url='http://mydummycompany.com', body=body)
+
+        no_namespace_iter = self.xmliter(response, 'image_link')
+        self.assertEqual(len(list(no_namespace_iter)), 0)
+
+        namespace_iter = self.xmliter(response, 'image_link', 'http://base.google.com/ns/1.0')
+        node = namespace_iter.next()
+        self.assertEqual(node.select('text()').extract(), ['http://www.mydummycompany.com/images/item1.jpg'])
+        node = namespace_iter.next()
+        self.assertEqual(node.select('text()').extract(), ['http://www.mydummycompany.com/images/item2.jpg'])
+
 class UtilsCsvTestCase(unittest.TestCase):
     sample_feeds_dir = os.path.join(os.path.abspath(os.path.dirname(__file__)), 'sample_data', 'feeds')