Mirror of https://github.com/scrapy/scrapy.git

commit cfd11df539
Automated merge with ssh://hg.scrapy.org:2222/scrapy-0.12
@@ -3,7 +3,7 @@
 Frequently Asked Questions
 ==========================
 
-How does Scrapy compare to BeautifulSoul or lxml?
+How does Scrapy compare to BeautifulSoup or lxml?
 -------------------------------------------------
 
 `BeautifulSoup`_ and `lxml`_ are libraries for parsing HTML and XML. Scrapy is
@@ -29,7 +29,7 @@ comparing `jinja2`_ to `Django`_.
 What Python versions does Scrapy support?
 -----------------------------------------
 
-Scrapy runs in Python 2.5, 2.6 and 2.6. But it's recommended you use Python 2.6
+Scrapy runs in Python 2.5, 2.6 and 2.7. But it's recommended you use Python 2.6
 or above, since the Python 2.5 standard library has a few bugs in their URL
 handling libraries. Some of these Python 2.5 bugs not only affect Scrapy but
 any user code, such as spiders. You can see a list of `Python 2.5 bugs that
@@ -86,7 +86,7 @@ in your Ubuntu servers.
 So, if you plan to deploy Scrapyd on a Ubuntu server, just add the Ubuntu
 repositories as described in :ref:`topics-ubuntu` and then run::
 
-    aptitude install scrapyd-0.12
+    aptitude install scrapyd-0.13
 
 This will install Scrapyd in your Ubuntu server creating a ``scrapy`` user
 which Scrapyd will run as. It will also create some directories and files that
@@ -13,7 +13,7 @@ latest bug fixes.
 
 To use the packages, just add the following line to your
 ``/etc/apt/sources.list``, and then run ``aptitude update`` and ``aptitude
-install scrapy-0.12``::
+install scrapy-0.13``::
 
     deb http://archive.scrapy.org/ubuntu DISTRO main
 
@@ -2,8 +2,8 @@
 Scrapy - a screen scraping framework written in Python
 """
 
-version_info = (0, 12, 0)
-__version__ = "0.12.0"
+version_info = (0, 13, 0)
+__version__ = "0.13.0"
 
 import sys, os, warnings
 
@@ -57,6 +57,8 @@ class Command(ScrapyCommand):
             help="list available projects on TARGET")
         parser.add_option("--egg", metavar="FILE",
             help="use the given egg, instead of building it")
+        parser.add_option("--build-egg", metavar="FILE",
+            help="only build the egg, don't deploy it")
 
     def run(self, args, opts):
         try:
@@ -75,18 +77,26 @@ class Command(ScrapyCommand):
             projects = json.loads(f.read())['projects']
             print os.linesep.join(projects)
             return
-        target_name = _get_target_name(args)
-        target = _get_target(target_name)
-        project = _get_project(target, opts)
-        version = _get_version(target, opts)
 
         tmpdir = None
-        if opts.egg:
-            _log("Using egg: %s" % opts.egg)
-            egg = opts.egg
-        else:
-            _log("Building egg of %s-%s" % (project, version))
-            egg, tmpdir = _build_egg()
-        _upload_egg(target, egg, project, version)
+        if opts.build_egg: # build egg only
+            egg, tmpdir = _build_egg()
+            _log("Writing egg to %s" % opts.build_egg)
+            shutil.copyfile(egg, opts.build_egg)
+        else: # buld egg and deploy
+            target_name = _get_target_name(args)
+            target = _get_target(target_name)
+            project = _get_project(target, opts)
+            version = _get_version(target, opts)
+            if opts.egg:
+                _log("Using egg: %s" % opts.egg)
+                egg = opts.egg
+            else:
+                _log("Building egg of %s-%s" % (project, version))
+                egg, tmpdir = _build_egg()
+            _upload_egg(target, egg, project, version)
 
         if tmpdir:
             shutil.rmtree(tmpdir)
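
Note: the net effect of this hunk is that ``--build-egg`` short-circuits
deployment entirely, while the temporary build directory is cleaned up on both
paths. A condensed sketch of the new flow (not the committed code; helper names
as in the hunk above)::

    def run_sketch(opts, args):
        tmpdir = None
        if opts.build_egg:                        # --build-egg FILE: build only
            egg, tmpdir = _build_egg()
            shutil.copyfile(egg, opts.build_egg)  # write the egg, skip upload
        else:                                     # resolve target and deploy
            target = _get_target(_get_target_name(args))
            project = _get_project(target, opts)
            version = _get_version(target, opts)
            if opts.egg:
                egg = opts.egg                    # --egg FILE: deploy a pre-built egg
            else:
                egg, tmpdir = _build_egg()
            _upload_egg(target, egg, project, version)
        if tmpdir:
            shutil.rmtree(tmpdir)                 # remove temp build dir either way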
@@ -75,7 +75,7 @@ class BasicTypeExtractor(object):
     u'<div data-scrapy-annotate="{"annotations": {"content": "name"}}">x<b> xx</b></div>',\
     u'<div>a name<b> id-9</b></div>')
     >>> ex = BasicTypeExtractor(template.annotations[0])
-    >>> ex.extract(page, 0, 3, [LabelledRegion(*(1,2))])
+    >>> ex.extract(page, 0, 3, [LabelledRegion(1, 2)])
     [(u'name', u'a name')]
     """
@@ -395,7 +395,7 @@ class RecordExtractor(object):
                 s, p, e = similar_region(page.page_tokens, self.template_tokens, \
                         i, start, sindex)
                 if s > 0:
-                    similar_ignored_regions.append(LabelledRegion(*(p, e)))
+                    similar_ignored_regions.append(LabelledRegion(p, e))
                     start = e or start
         extracted_data = first_region.extract(page, pindex, sindex, similar_ignored_regions)
         if extracted_data:
@@ -132,6 +132,10 @@ def image_url(txt):
     ['http://s7d5.scene7.com/is/image/wasserstrom/165133?wid=227&hei=227&defaultImage=noimage_wasserstrom']
 
     """
+    imgurl = extract_image_url(txt)
+    return [safe_url_string(remove_entities(url(imgurl)))] if imgurl else None
+
+def extract_image_url(txt):
     txt = url(txt)
     imgurl = None
     if txt:
@@ -153,4 +157,4 @@ def image_url(txt):
             imgurl = urlparse.urlunparse(parsed)
     if not imgurl:
         imgurl = txt
-    return [safe_url_string(remove_entities(url(imgurl)))] if imgurl else None
+    return imgurl
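
Note: the refactor keeps ``image_url()``'s external contract (a one-element
list, or None) and moves the URL-finding logic into the new
``extract_image_url()``, which returns the bare URL string. The relationship,
read directly off the two hunks above (illustrative, not part of the commit)::

    imgurl = extract_image_url(txt)   # bare URL string, or None
    urls = image_url(txt)             # [cleaned, entity-decoded URL], or None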
@@ -80,8 +80,8 @@ class HtmlTag(HtmlDataFragment):
     def __repr__(self):
         return str(self)
 
-_ATTR = "((?:[^=/>\s]|/(?!>))+)(?:\s*=(?:\s*\"(.*?)\"|\s*'(.*?)'|([^>\s]+))?)?"
-_TAG = "<(\/?)(\w+(?::\w+)?)((?:\s+" + _ATTR + ")+\s*|\s*)(\/?)>"
+_ATTR = "((?:[^=/<>\s]|/(?!>))+)(?:\s*=(?:\s*\"(.*?)\"|\s*'(.*?)'|([^>\s]+))?)?"
+_TAG = "<(\/?)(\w+(?::\w+)?)((?:\s*" + _ATTR + ")+\s*|\s*)(\/?)>?"
 _DOCTYPE = r"<!DOCTYPE.*?>"
 _SCRIPT = "(<script.*?>)(.*?)(</script.*?>)"
 _COMMENT = "(<!--.*?-->)"
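
Note: the loosened patterns (``\s+`` relaxed to ``\s*`` between attributes, the
closing ``>`` made optional, and ``<`` excluded from attribute names) let the
parser recover attributes that are not separated by whitespace. An illustrative
snippet (not part of the commit; ``parse_html`` and the expected attributes as
exercised by PAGE9/PARSED9 in the tests below)::

    parsed = list(parse_html(u"<img width='230' height='150'src='/images/9589.jpg' >"))
    print parsed[0].attributes
    # {'width': '230', 'height': '150', 'src': '/images/9589.jpg'}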
@@ -2,14 +2,19 @@ from scrapy.http import Response
 from scrapy.selector import XmlXPathSelector
 
 
-def xmliter_lxml(obj, nodename):
+def xmliter_lxml(obj, nodename, namespace=None):
     from lxml import etree
     reader = _StreamReader(obj)
-    iterable = etree.iterparse(reader, tag=nodename, encoding=reader.encoding)
+    tag = '{%s}%s' % (namespace, nodename) if namespace else nodename
+    iterable = etree.iterparse(reader, tag=tag, encoding=reader.encoding)
+    selxpath = '//' + ('x:%s' % nodename if namespace else nodename)
     for _, node in iterable:
         nodetext = etree.tostring(node)
         node.clear()
-        yield XmlXPathSelector(text=nodetext).select('//' + nodename)[0]
+        xs = XmlXPathSelector(text=nodetext)
+        if namespace:
+            xs.register_namespace('x', namespace)
+        yield xs.select(selxpath)[0]
 
 
 class _StreamReader(object):
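
Note: with the new ``namespace`` argument, nodes living in a default XML
namespace can finally be iterated; each yielded selector has the namespace
registered under the ``x`` prefix. A usage sketch mirroring the new test
further below (not part of the commit; the import path is assumed)::

    from scrapy.http import XmlResponse
    from scrapy.utils.iterators import xmliter_lxml  # import path assumed

    body = """<?xml version="1.0" encoding="UTF-8"?>
    <rss version="2.0" xmlns="http://base.google.com/ns/1.0">
    <channel><item>
    <image_link>http://example.com/images/item1.jpg</image_link>
    </item></channel>
    </rss>"""
    response = XmlResponse(url='http://example.com', body=body)

    # without a namespace, default-namespaced nodes never match
    assert len(list(xmliter_lxml(response, 'image_link'))) == 0

    # with it, each matching node is yielded as a selector
    for node in xmliter_lxml(response, 'image_link', 'http://base.google.com/ns/1.0'):
        print node.select('text()').extract()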
@@ -18,6 +18,9 @@ class Link(object):
 
     def __eq__(self, other):
         return self.url == other.url and self.text == other.text
 
+    def __hash__(self):
+        return hash(self.url) ^ hash(self.text)
+
     def __repr__(self):
         return '<Link url=%r text=%r >' % (self.url, self.text)
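
Note: defining ``__hash__`` consistently with ``__eq__`` makes ``Link`` objects
usable in sets and as dict keys, e.g. for de-duplicating extracted links. An
illustrative snippet (not part of the commit)::

    from scrapy.link import Link

    links = [Link("http://www.example.com", text="test"),
             Link("http://www.example.com", text="test"),    # duplicate
             Link("http://www.example.com/other", text="test")]
    assert len(set(links)) == 2  # the duplicate collapses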
@@ -516,7 +516,7 @@ ANNOTATED_PAGE19 = u"""
 <div>
 <p data-scrapy-annotate="{"variant": 0, "annotations": {"content": "name"}}">Product name</p>
 <p data-scrapy-annotate="{"variant": 0, "annotations": {"content": "price"}}">60.00</p>
-<img data-scrapy-annotate="{"variant": 0, "annotations": {"src": "image_urls"}}"src="image.jpg" />
+<img data-scrapy-annotate="{"variant": 0, "annotations": {"src": "image_urls"}}" src="image.jpg" />
 <p data-scrapy-annotate="{"variant": 0, "required": ["description"], "annotations": {"content": "description"}}">description</p>
 </div>
 </body></html>
@@ -137,3 +137,19 @@ class TestParseHtml(TestCase):
         parsed = list(parse_html("<IMG SRC='http://images.play.com/banners/SAM550a.jpg' align='left' / hspace=5>"))
         self.assertEqual(parsed[0].attributes, {'src': 'http://images.play.com/banners/SAM550a.jpg', \
             'align': 'left', 'hspace': '5', '/': None})
+
+    def test_no_ending_body(self):
+        """Test case when no ending body nor html elements are present"""
+        parsed = [_decode_element(d) for d in PARSED7]
+        self._test_sample(PAGE7, parsed)
+
+    def test_malformed(self):
+        """Test parsing of some malformed cases"""
+        parsed = [_decode_element(d) for d in PARSED8]
+        self._test_sample(PAGE8, parsed)
+
+    def test_malformed2(self):
+        """Test case when attributes are not separated by space (still recognizable because of quotes)"""
+        parsed = [_decode_element(d) for d in PARSED9]
+        self._test_sample(PAGE9, parsed)
@@ -246,3 +246,32 @@ PARSED7 = [
     {'end': 99, 'start': 85},
 ]
+
+PAGE8 = u"""<a href="/overview.asp?id=277"><img border="0" src="/img/5200814311.jpg" title=\'Vinyl Cornice\'</a></td><table width=\'5\'>"""
+
+PARSED8 = [
+    {'attributes' : {u'href' : u"/overview.asp?id=277"}, 'end': 31, 'start': 0, 'tag': u'a', 'tag_type': 1},
+    {'attributes' : {u'src' : u"/img/5200814311.jpg", u'border' : u"0", u'title': u'Vinyl Cornice'}, 'end': 94, 'start': 31, 'tag': u'img', 'tag_type': 1},
+    {'attributes' : {}, 'end': 98, 'start': 94, 'tag': u'a', 'tag_type': 2},
+    {'attributes' : {}, 'end': 103, 'start': 98, 'tag': u'td', 'tag_type': 2},
+    {'attributes' : {u'width': u'5'}, 'end': 120, 'start': 103, 'tag': u'table', 'tag_type': 1}
+]
+
+PAGE9 = u"""\
+<html>\
+<body>\
+<img width='230' height='150'src='/images/9589.jpg' >\
+<a href="/product/9589">Click here</a>\
+</body>\
+</html>\
+"""
+
+PARSED9 = [
+    {'attributes' : {}, 'end': 6, 'start': 0, 'tag': 'html', 'tag_type': 1},
+    {'attributes' : {}, 'end': 12, 'start': 6, 'tag': 'body', 'tag_type': 1},
+    {'attributes' : {'width': '230', 'height': '150', 'src': '/images/9589.jpg'}, 'end': 65, 'start': 12, 'tag': 'img', 'tag_type': 1},
+    {'attributes' : {'href': '/product/9589'}, 'end': 89, 'start': 65, 'tag': 'a', 'tag_type': 1},
+    {'end': 99, 'start': 89},
+    {'attributes' : {}, 'end': 103, 'start': 99, 'tag': 'a', 'tag_type': 2},
+    {'attributes' : {}, 'end': 110, 'start': 103, 'tag': 'body', 'tag_type': 2},
+    {'attributes' : {}, 'end': 117, 'start': 110, 'tag': 'html', 'tag_type': 2},
+]
scrapy/tests/test_link.py (new file)
@@ -0,0 +1,28 @@
+import unittest
+
+from scrapy.link import Link
+
+class LinkTest(unittest.TestCase):
+
+    def test_eq_and_hash(self):
+        l1 = Link("http://www.example.com")
+        l2 = Link("http://www.example.com/other")
+        l3 = Link("http://www.example.com")
+
+        self.assertEqual(l1, l1)
+        self.assertEqual(hash(l1), hash(l1))
+        self.assertNotEqual(l1, l2)
+        self.assertNotEqual(hash(l1), hash(l2))
+        self.assertEqual(l1, l3)
+        self.assertEqual(hash(l1), hash(l3))
+
+        l4 = Link("http://www.example.com", text="test")
+        l5 = Link("http://www.example.com", text="test2")
+        l6 = Link("http://www.example.com", text="test")
+
+        self.assertEqual(l4, l4)
+        self.assertEqual(hash(l4), hash(l4))
+        self.assertNotEqual(l4, l5)
+        self.assertNotEqual(hash(l4), hash(l5))
+        self.assertEqual(l4, l6)
+        self.assertEqual(hash(l4), hash(l6))
@@ -29,12 +29,12 @@ class XmliterTestCase(unittest.TestCase):
         for x in self.xmliter(response, 'product'):
             attrs.append((x.select("@id").extract(), x.select("name/text()").extract(), x.select("./type/text()").extract()))
 
-        self.assertEqual(attrs, 
+        self.assertEqual(attrs,
                           [(['001'], ['Name 1'], ['Type 1']), (['002'], ['Name 2'], ['Type 2'])])
 
     def test_xmliter_text(self):
         body = u"""<?xml version="1.0" encoding="UTF-8"?><products><product>one</product><product>two</product></products>"""
 
         self.assertEqual([x.select("text()").extract() for x in self.xmliter(body, 'product')],
                          [[u'one'], [u'two']])
@@ -74,7 +74,7 @@ class XmliterTestCase(unittest.TestCase):
 
     def test_xmliter_exception(self):
         body = u"""<?xml version="1.0" encoding="UTF-8"?><products><product>one</product><product>two</product></products>"""
-
+
         iter = self.xmliter(body, 'product')
         iter.next()
         iter.next()
@@ -97,6 +97,35 @@ class LxmlXmliterTestCase(XmliterTestCase):
     except ImportError:
         skip = "lxml not available"
 
+    def test_xmliter_iterate_namespace(self):
+        body = """\
+<?xml version="1.0" encoding="UTF-8"?>
+<rss version="2.0" xmlns="http://base.google.com/ns/1.0">
+<channel>
+<title>My Dummy Company</title>
+<link>http://www.mydummycompany.com</link>
+<description>This is a dummy company. We do nothing.</description>
+<item>
+<title>Item 1</title>
+<description>This is item 1</description>
+<link>http://www.mydummycompany.com/items/1</link>
+<image_link>http://www.mydummycompany.com/images/item1.jpg</image_link>
+<image_link>http://www.mydummycompany.com/images/item2.jpg</image_link>
+</item>
+</channel>
+</rss>
+"""
+        response = XmlResponse(url='http://mydummycompany.com', body=body)
+
+        no_namespace_iter = self.xmliter(response, 'image_link')
+        self.assertEqual(len(list(no_namespace_iter)), 0)
+
+        namespace_iter = self.xmliter(response, 'image_link', 'http://base.google.com/ns/1.0')
+        node = namespace_iter.next()
+        self.assertEqual(node.select('text()').extract(), ['http://www.mydummycompany.com/images/item1.jpg'])
+        node = namespace_iter.next()
+        self.assertEqual(node.select('text()').extract(), ['http://www.mydummycompany.com/images/item2.jpg'])
+
 
 class UtilsCsvTestCase(unittest.TestCase):
     sample_feeds_dir = os.path.join(os.path.abspath(os.path.dirname(__file__)), 'sample_data', 'feeds')