Mirror of https://github.com/scrapy/scrapy.git

commit cfd11df539
Automated merge with ssh://hg.scrapy.org:2222/scrapy-0.12
@@ -3,7 +3,7 @@
 Frequently Asked Questions
 ==========================
 
-How does Scrapy compare to BeautifulSoul or lxml?
+How does Scrapy compare to BeautifulSoup or lxml?
 -------------------------------------------------
 
 `BeautifulSoup`_ and `lxml`_ are libraries for parsing HTML and XML. Scrapy is
@@ -29,7 +29,7 @@ comparing `jinja2`_ to `Django`_.
 What Python versions does Scrapy support?
 -----------------------------------------
 
-Scrapy runs in Python 2.5, 2.6 and 2.6. But it's recommended you use Python 2.6
+Scrapy runs in Python 2.5, 2.6 and 2.7. But it's recommended you use Python 2.6
 or above, since the Python 2.5 standard library has a few bugs in their URL
 handling libraries. Some of these Python 2.5 bugs not only affect Scrapy but
 any user code, such as spiders. You can see a list of `Python 2.5 bugs that
@@ -86,7 +86,7 @@ in your Ubuntu servers.
 So, if you plan to deploy Scrapyd on a Ubuntu server, just add the Ubuntu
 repositories as described in :ref:`topics-ubuntu` and then run::
 
-    aptitude install scrapyd-0.12
+    aptitude install scrapyd-0.13
 
 This will install Scrapyd in your Ubuntu server creating a ``scrapy`` user
 which Scrapyd will run as. It will also create some directories and files that
@@ -13,7 +13,7 @@ latest bug fixes.
 
 To use the packages, just add the following line to your
 ``/etc/apt/sources.list``, and then run ``aptitude update`` and ``aptitude
-install scrapy-0.12``::
+install scrapy-0.13``::
 
     deb http://archive.scrapy.org/ubuntu DISTRO main
 
@@ -2,8 +2,8 @@
 Scrapy - a screen scraping framework written in Python
 """
 
-version_info = (0, 12, 0)
-__version__ = "0.12.0"
+version_info = (0, 13, 0)
+__version__ = "0.13.0"
 
 import sys, os, warnings
 
@@ -57,6 +57,8 @@ class Command(ScrapyCommand):
             help="list available projects on TARGET")
         parser.add_option("--egg", metavar="FILE",
             help="use the given egg, instead of building it")
+        parser.add_option("--build-egg", metavar="FILE",
+            help="only build the egg, don't deploy it")
 
     def run(self, args, opts):
         try:
@@ -75,18 +77,26 @@ class Command(ScrapyCommand):
             projects = json.loads(f.read())['projects']
             print os.linesep.join(projects)
             return
-        target_name = _get_target_name(args)
-        target = _get_target(target_name)
-        project = _get_project(target, opts)
-        version = _get_version(target, opts)
 
         tmpdir = None
-        if opts.egg:
-            _log("Using egg: %s" % opts.egg)
-            egg = opts.egg
-        else:
-            _log("Building egg of %s-%s" % (project, version))
-            egg, tmpdir = _build_egg()
-        _upload_egg(target, egg, project, version)
+        if opts.build_egg: # build egg only
+            egg, tmpdir = _build_egg()
+            _log("Writing egg to %s" % opts.build_egg)
+            shutil.copyfile(egg, opts.build_egg)
+        else: # buld egg and deploy
+            target_name = _get_target_name(args)
+            target = _get_target(target_name)
+            project = _get_project(target, opts)
+            version = _get_version(target, opts)
+            if opts.egg:
+                _log("Using egg: %s" % opts.egg)
+                egg = opts.egg
+            else:
+                _log("Building egg of %s-%s" % (project, version))
+                egg, tmpdir = _build_egg()
+            _upload_egg(target, egg, project, version)
 
         if tmpdir:
             shutil.rmtree(tmpdir)
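
Note: the net effect of this hunk is that ``--build-egg`` short-circuits
deployment entirely, while the temporary build directory is cleaned up on both
paths. A condensed sketch of the new flow (not the committed code; helper names
as in the hunk above)::

    def run_sketch(opts, args):
        tmpdir = None
        if opts.build_egg:                        # --build-egg FILE: build only
            egg, tmpdir = _build_egg()
            shutil.copyfile(egg, opts.build_egg)  # write the egg, skip upload
        else:                                     # resolve target and deploy
            target = _get_target(_get_target_name(args))
            project = _get_project(target, opts)
            version = _get_version(target, opts)
            if opts.egg:
                egg = opts.egg                    # --egg FILE: deploy a pre-built egg
            else:
                egg, tmpdir = _build_egg()
            _upload_egg(target, egg, project, version)
        if tmpdir:
            shutil.rmtree(tmpdir)                 # remove temp build dir either way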
@@ -75,7 +75,7 @@ class BasicTypeExtractor(object):
     u'<div data-scrapy-annotate="{"annotations": {"content": "name"}}">x<b> xx</b></div>',\
     u'<div>a name<b> id-9</b></div>')
     >>> ex = BasicTypeExtractor(template.annotations[0])
-    >>> ex.extract(page, 0, 3, [LabelledRegion(*(1,2))])
+    >>> ex.extract(page, 0, 3, [LabelledRegion(1, 2)])
     [(u'name', u'a name')]
     """
@@ -395,7 +395,7 @@ class RecordExtractor(object):
                 s, p, e = similar_region(page.page_tokens, self.template_tokens, \
                         i, start, sindex)
                 if s > 0:
-                    similar_ignored_regions.append(LabelledRegion(*(p, e)))
+                    similar_ignored_regions.append(LabelledRegion(p, e))
                     start = e or start
         extracted_data = first_region.extract(page, pindex, sindex, similar_ignored_regions)
         if extracted_data:
@@ -132,6 +132,10 @@ def image_url(txt):
     ['http://s7d5.scene7.com/is/image/wasserstrom/165133?wid=227&hei=227&defaultImage=noimage_wasserstrom']
 
     """
+    imgurl = extract_image_url(txt)
+    return [safe_url_string(remove_entities(url(imgurl)))] if imgurl else None
+
+def extract_image_url(txt):
     txt = url(txt)
     imgurl = None
     if txt:
@@ -153,4 +157,4 @@ def image_url(txt):
             imgurl = urlparse.urlunparse(parsed)
     if not imgurl:
         imgurl = txt
-    return [safe_url_string(remove_entities(url(imgurl)))] if imgurl else None
+    return imgurl
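
Note: the refactor keeps ``image_url()``'s external contract (a one-element
list, or None) and moves the URL-finding logic into the new
``extract_image_url()``, which returns the bare URL string. The relationship,
read directly off the two hunks above (illustrative, not part of the commit)::

    imgurl = extract_image_url(txt)   # bare URL string, or None
    urls = image_url(txt)             # [cleaned, entity-decoded URL], or None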
@@ -80,8 +80,8 @@ class HtmlTag(HtmlDataFragment):
     def __repr__(self):
         return str(self)
 
-_ATTR = "((?:[^=/>\s]|/(?!>))+)(?:\s*=(?:\s*\"(.*?)\"|\s*'(.*?)'|([^>\s]+))?)?"
-_TAG = "<(\/?)(\w+(?::\w+)?)((?:\s+" + _ATTR + ")+\s*|\s*)(\/?)>"
+_ATTR = "((?:[^=/<>\s]|/(?!>))+)(?:\s*=(?:\s*\"(.*?)\"|\s*'(.*?)'|([^>\s]+))?)?"
+_TAG = "<(\/?)(\w+(?::\w+)?)((?:\s*" + _ATTR + ")+\s*|\s*)(\/?)>?"
 _DOCTYPE = r"<!DOCTYPE.*?>"
 _SCRIPT = "(<script.*?>)(.*?)(</script.*?>)"
 _COMMENT = "(<!--.*?-->)"
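
Note: the loosened patterns (``\s+`` relaxed to ``\s*`` between attributes, the
closing ``>`` made optional, and ``<`` excluded from attribute names) let the
parser recover attributes that are not separated by whitespace. An illustrative
snippet (not part of the commit; ``parse_html`` and the expected attributes as
exercised by PAGE9/PARSED9 in the tests below)::

    parsed = list(parse_html(u"<img width='230' height='150'src='/images/9589.jpg' >"))
    print parsed[0].attributes
    # {'width': '230', 'height': '150', 'src': '/images/9589.jpg'}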
@@ -2,14 +2,19 @@ from scrapy.http import Response
 from scrapy.selector import XmlXPathSelector
 
 
-def xmliter_lxml(obj, nodename):
+def xmliter_lxml(obj, nodename, namespace=None):
     from lxml import etree
     reader = _StreamReader(obj)
-    iterable = etree.iterparse(reader, tag=nodename, encoding=reader.encoding)
+    tag = '{%s}%s' % (namespace, nodename) if namespace else nodename
+    iterable = etree.iterparse(reader, tag=tag, encoding=reader.encoding)
+    selxpath = '//' + ('x:%s' % nodename if namespace else nodename)
     for _, node in iterable:
         nodetext = etree.tostring(node)
         node.clear()
-        yield XmlXPathSelector(text=nodetext).select('//' + nodename)[0]
+        xs = XmlXPathSelector(text=nodetext)
+        if namespace:
+            xs.register_namespace('x', namespace)
+        yield xs.select(selxpath)[0]
 
 
 class _StreamReader(object):
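
Note: with the new ``namespace`` argument, nodes living in a default XML
namespace can finally be iterated; each yielded selector has the namespace
registered under the ``x`` prefix. A usage sketch mirroring the new test
further below (not part of the commit; the import path is assumed)::

    from scrapy.http import XmlResponse
    from scrapy.utils.iterators import xmliter_lxml  # import path assumed

    body = """<?xml version="1.0" encoding="UTF-8"?>
    <rss version="2.0" xmlns="http://base.google.com/ns/1.0">
    <channel><item>
    <image_link>http://example.com/images/item1.jpg</image_link>
    </item></channel>
    </rss>"""
    response = XmlResponse(url='http://example.com', body=body)

    # without a namespace, default-namespaced nodes never match
    assert len(list(xmliter_lxml(response, 'image_link'))) == 0

    # with it, each matching node is yielded as a selector
    for node in xmliter_lxml(response, 'image_link', 'http://base.google.com/ns/1.0'):
        print node.select('text()').extract()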
@@ -18,6 +18,9 @@ class Link(object):
 
     def __eq__(self, other):
         return self.url == other.url and self.text == other.text
 
+    def __hash__(self):
+        return hash(self.url) ^ hash(self.text)
+
     def __repr__(self):
         return '<Link url=%r text=%r >' % (self.url, self.text)
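
Note: defining ``__hash__`` consistently with ``__eq__`` makes ``Link`` objects
usable in sets and as dict keys, e.g. for de-duplicating extracted links. An
illustrative snippet (not part of the commit)::

    from scrapy.link import Link

    links = [Link("http://www.example.com", text="test"),
             Link("http://www.example.com", text="test"),    # duplicate
             Link("http://www.example.com/other", text="test")]
    assert len(set(links)) == 2  # the duplicate collapses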
@@ -516,7 +516,7 @@ ANNOTATED_PAGE19 = u"""
 <div>
 <p data-scrapy-annotate="{"variant": 0, "annotations": {"content": "name"}}">Product name</p>
 <p data-scrapy-annotate="{"variant": 0, "annotations": {"content": "price"}}">60.00</p>
-<img data-scrapy-annotate="{"variant": 0, "annotations": {"src": "image_urls"}}"src="image.jpg" />
+<img data-scrapy-annotate="{"variant": 0, "annotations": {"src": "image_urls"}}" src="image.jpg" />
 <p data-scrapy-annotate="{"variant": 0, "required": ["description"], "annotations": {"content": "description"}}">description</p>
 </div>
 </body></html>
@@ -137,3 +137,19 @@ class TestParseHtml(TestCase):
         parsed = list(parse_html("<IMG SRC='http://images.play.com/banners/SAM550a.jpg' align='left' / hspace=5>"))
         self.assertEqual(parsed[0].attributes, {'src': 'http://images.play.com/banners/SAM550a.jpg', \
             'align': 'left', 'hspace': '5', '/': None})
+
+    def test_no_ending_body(self):
+        """Test case when no ending body nor html elements are present"""
+        parsed = [_decode_element(d) for d in PARSED7]
+        self._test_sample(PAGE7, parsed)
+
+    def test_malformed(self):
+        """Test parsing of some malformed cases"""
+        parsed = [_decode_element(d) for d in PARSED8]
+        self._test_sample(PAGE8, parsed)
+
+    def test_malformed2(self):
+        """Test case when attributes are not separated by space (still recognizable because of quotes)"""
+        parsed = [_decode_element(d) for d in PARSED9]
+        self._test_sample(PAGE9, parsed)
@@ -246,3 +246,32 @@ PARSED7 = [
     {'end': 99, 'start': 85},
 ]
+
+PAGE8 = u"""<a href="/overview.asp?id=277"><img border="0" src="/img/5200814311.jpg" title=\'Vinyl Cornice\'</a></td><table width=\'5\'>"""
+
+PARSED8 = [
+    {'attributes' : {u'href' : u"/overview.asp?id=277"}, 'end': 31, 'start': 0, 'tag': u'a', 'tag_type': 1},
+    {'attributes' : {u'src' : u"/img/5200814311.jpg", u'border' : u"0", u'title': u'Vinyl Cornice'}, 'end': 94, 'start': 31, 'tag': u'img', 'tag_type': 1},
+    {'attributes' : {}, 'end': 98, 'start': 94, 'tag': u'a', 'tag_type': 2},
+    {'attributes' : {}, 'end': 103, 'start': 98, 'tag': u'td', 'tag_type': 2},
+    {'attributes' : {u'width': u'5'}, 'end': 120, 'start': 103, 'tag': u'table', 'tag_type': 1}
+]
+
+PAGE9 = u"""\
+<html>\
+<body>\
+<img width='230' height='150'src='/images/9589.jpg' >\
+<a href="/product/9589">Click here</a>\
+</body>\
+</html>\
+"""
+
+PARSED9 = [
+    {'attributes' : {}, 'end': 6, 'start': 0, 'tag': 'html', 'tag_type': 1},
+    {'attributes' : {}, 'end': 12, 'start': 6, 'tag': 'body', 'tag_type': 1},
+    {'attributes' : {'width': '230', 'height': '150', 'src': '/images/9589.jpg'}, 'end': 65, 'start': 12, 'tag': 'img', 'tag_type': 1},
+    {'attributes' : {'href': '/product/9589'}, 'end': 89, 'start': 65, 'tag': 'a', 'tag_type': 1},
+    {'end': 99, 'start': 89},
+    {'attributes' : {}, 'end': 103, 'start': 99, 'tag': 'a', 'tag_type': 2},
+    {'attributes' : {}, 'end': 110, 'start': 103, 'tag': 'body', 'tag_type': 2},
+    {'attributes' : {}, 'end': 117, 'start': 110, 'tag': 'html', 'tag_type': 2},
+]
scrapy/tests/test_link.py (new file)
@@ -0,0 +1,28 @@
+import unittest
+
+from scrapy.link import Link
+
+class LinkTest(unittest.TestCase):
+
+    def test_eq_and_hash(self):
+        l1 = Link("http://www.example.com")
+        l2 = Link("http://www.example.com/other")
+        l3 = Link("http://www.example.com")
+
+        self.assertEqual(l1, l1)
+        self.assertEqual(hash(l1), hash(l1))
+        self.assertNotEqual(l1, l2)
+        self.assertNotEqual(hash(l1), hash(l2))
+        self.assertEqual(l1, l3)
+        self.assertEqual(hash(l1), hash(l3))
+
+        l4 = Link("http://www.example.com", text="test")
+        l5 = Link("http://www.example.com", text="test2")
+        l6 = Link("http://www.example.com", text="test")
+
+        self.assertEqual(l4, l4)
+        self.assertEqual(hash(l4), hash(l4))
+        self.assertNotEqual(l4, l5)
+        self.assertNotEqual(hash(l4), hash(l5))
+        self.assertEqual(l4, l6)
+        self.assertEqual(hash(l4), hash(l6))
@@ -29,12 +29,12 @@ class XmliterTestCase(unittest.TestCase):
         for x in self.xmliter(response, 'product'):
             attrs.append((x.select("@id").extract(), x.select("name/text()").extract(), x.select("./type/text()").extract()))
 
-        self.assertEqual(attrs, 
+        self.assertEqual(attrs,
                           [(['001'], ['Name 1'], ['Type 1']), (['002'], ['Name 2'], ['Type 2'])])
 
     def test_xmliter_text(self):
         body = u"""<?xml version="1.0" encoding="UTF-8"?><products><product>one</product><product>two</product></products>"""
 
         self.assertEqual([x.select("text()").extract() for x in self.xmliter(body, 'product')],
                          [[u'one'], [u'two']])
@@ -74,7 +74,7 @@ class XmliterTestCase(unittest.TestCase):
 
     def test_xmliter_exception(self):
         body = u"""<?xml version="1.0" encoding="UTF-8"?><products><product>one</product><product>two</product></products>"""
-
+
         iter = self.xmliter(body, 'product')
         iter.next()
         iter.next()
@@ -97,6 +97,35 @@ class LxmlXmliterTestCase(XmliterTestCase):
     except ImportError:
         skip = "lxml not available"
 
+    def test_xmliter_iterate_namespace(self):
+        body = """\
+<?xml version="1.0" encoding="UTF-8"?>
+<rss version="2.0" xmlns="http://base.google.com/ns/1.0">
+<channel>
+<title>My Dummy Company</title>
+<link>http://www.mydummycompany.com</link>
+<description>This is a dummy company. We do nothing.</description>
+<item>
+<title>Item 1</title>
+<description>This is item 1</description>
+<link>http://www.mydummycompany.com/items/1</link>
+<image_link>http://www.mydummycompany.com/images/item1.jpg</image_link>
+<image_link>http://www.mydummycompany.com/images/item2.jpg</image_link>
+</item>
+</channel>
+</rss>
+"""
+        response = XmlResponse(url='http://mydummycompany.com', body=body)
+
+        no_namespace_iter = self.xmliter(response, 'image_link')
+        self.assertEqual(len(list(no_namespace_iter)), 0)
+
+        namespace_iter = self.xmliter(response, 'image_link', 'http://base.google.com/ns/1.0')
+        node = namespace_iter.next()
+        self.assertEqual(node.select('text()').extract(), ['http://www.mydummycompany.com/images/item1.jpg'])
+        node = namespace_iter.next()
+        self.assertEqual(node.select('text()').extract(), ['http://www.mydummycompany.com/images/item2.jpg'])
+
 
 class UtilsCsvTestCase(unittest.TestCase):
     sample_feeds_dir = os.path.join(os.path.abspath(os.path.dirname(__file__)), 'sample_data', 'feeds')