From cd42bd7d0c8c8d9d5a4a3f38e442b5020a68116b Mon Sep 17 00:00:00 2001
From: Pablo Hoffman
Date: Sun, 2 Jan 2011 17:21:31 -0200
Subject: [PATCH 01/14] Bumped version to 0.13

---
 scrapy/__init__.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/scrapy/__init__.py b/scrapy/__init__.py
index 3d8a7a4ae..c73dae498 100644
--- a/scrapy/__init__.py
+++ b/scrapy/__init__.py
@@ -2,8 +2,8 @@
 Scrapy - a screen scraping framework written in Python
 """
 
-version_info = (0, 12, 0)
-__version__ = "0.12.0"
+version_info = (0, 13, 0)
+__version__ = "0.13.0"
 
 import sys, os, warnings

From d7f193cbeac9dba4eda7bb463b6590900bbb2998 Mon Sep 17 00:00:00 2001
From: Pablo Hoffman
Date: Sun, 2 Jan 2011 17:29:43 -0200
Subject: [PATCH 02/14] bumped version to 0.13 in documentation

---
 docs/topics/scrapyd.rst | 2 +-
 docs/topics/ubuntu.rst  | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/docs/topics/scrapyd.rst b/docs/topics/scrapyd.rst
index 6bfe8d3f7..143e983a3 100644
--- a/docs/topics/scrapyd.rst
+++ b/docs/topics/scrapyd.rst
@@ -86,7 +86,7 @@ in your Ubuntu servers.
 So, if you plan to deploy Scrapyd on a Ubuntu server, just add the Ubuntu
 repositories as described in :ref:`topics-ubuntu` and then run::
 
-    aptitude install scrapyd-0.12
+    aptitude install scrapyd-0.13
 
 This will install Scrapyd in your Ubuntu server creating a ``scrapy`` user
 which Scrapyd will run as. It will also create some directories and files that
diff --git a/docs/topics/ubuntu.rst b/docs/topics/ubuntu.rst
index 13bdd4b7b..6cd164f7b 100644
--- a/docs/topics/ubuntu.rst
+++ b/docs/topics/ubuntu.rst
@@ -13,7 +13,7 @@ latest bug fixes.
 
 To use the packages, just add the following line to your
 ``/etc/apt/sources.list``, and then run ``aptitude update`` and ``aptitude
-install scrapy-0.12``::
+install scrapy-0.13``::
 
     deb http://archive.scrapy.org/ubuntu DISTRO main

From 579463aff252c30e97ccb2045e84f4f4d69690da Mon Sep 17 00:00:00 2001
From: Pablo Hoffman
Date: Tue, 4 Jan 2011 13:57:32 -0200
Subject: [PATCH 03/14] make scrapy*-0.13 packages conflict with scrapy*-0.12
 packages

---
 debian/control | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/debian/control b/debian/control
index b4b159470..cc0d49040 100644
--- a/debian/control
+++ b/debian/control
@@ -9,7 +9,7 @@ Homepage: http://scrapy.org/
 Package: scrapy-SUFFIX
 Architecture: all
 Depends: ${python:Depends}, python-libxml2, python-twisted, python-openssl
-Conflicts: python-scrapy, scrapy, scrapy-0.11
+Conflicts: python-scrapy, scrapy, scrapy-0.11, scrapy-0.12
 Provides: python-scrapy, scrapy
 Description: Python web crawling and scraping framework
 Scrapy is a fast high-level screen scraping and web crawling framework,
@@ -20,7 +20,7 @@ Description: Python web crawling and scraping framework
 Package: scrapyd-SUFFIX
 Architecture: all
 Depends: scrapy, python-setuptools
-Conflicts: scrapyd, scrapyd-0.11
+Conflicts: scrapyd, scrapyd-0.11, scrapyd-0.12
 Provides: scrapyd
 Description: Scrapy Service
 The Scrapy service allows you to deploy your Scrapy projects by building
From 0ba9999cca6526bd1aea84eb5ce808fc840639b7 Mon Sep 17 00:00:00 2001
From: Martin Olveyra
Date: Wed, 5 Jan 2011 11:02:05 -0200
Subject: [PATCH 04/14] Handle badformed tags with no trailing >

---
 scrapy/contrib/ibl/htmlpage.py                      |  2 +-
 scrapy/tests/test_contrib_ibl/test_extraction.py    |  2 +-
 scrapy/tests/test_contrib_ibl/test_htmlpage.py      | 10 ++++++++++
 scrapy/tests/test_contrib_ibl/test_htmlpage_data.py |  9 +++++++++
 4 files changed, 21 insertions(+), 2 deletions(-)

diff --git a/scrapy/contrib/ibl/htmlpage.py b/scrapy/contrib/ibl/htmlpage.py
index 023030bdc..30c21c043 100644
--- a/scrapy/contrib/ibl/htmlpage.py
+++ b/scrapy/contrib/ibl/htmlpage.py
@@ -81,7 +81,7 @@ class HtmlTag(HtmlDataFragment):
         return str(self)
 
 _ATTR = "((?:[^=/>\s]|/(?!>))+)(?:\s*=(?:\s*\"(.*?)\"|\s*'(.*?)'|([^>\s]+))?)?"
-_TAG = "<(\/?)(\w+(?::\w+)?)((?:\s+" + _ATTR + ")+\s*|\s*)(\/?)>"
+_TAG = "<(\/?)(\w+(?::\w+)?)((?:\s+" + _ATTR + ")+\s*|\s*)(\/?)>?"
 _DOCTYPE = r"<!DOCTYPE.*?>"
 _SCRIPT = "(<script.*?>)(.*?)(</script.*?>)"
 _COMMENT = "(<!--.*?-->)"
diff --git a/scrapy/tests/test_contrib_ibl/test_extraction.py b/scrapy/tests/test_contrib_ibl/test_extraction.py
index f79c9dc78..e9ce86f7e 100644
--- a/scrapy/tests/test_contrib_ibl/test_extraction.py
+++ b/scrapy/tests/test_contrib_ibl/test_extraction.py
@@ -516,7 +516,7 @@
 ANNOTATED_PAGE19 = u"""
 [hunk body garbled in extraction: ANNOTATED_PAGE19's HTML markup was
 stripped, leaving only its visible text ("Product name", "60.00",
 "description") and a single -/+ pair changing one annotated tag]
diff --git a/scrapy/tests/test_contrib_ibl/test_htmlpage.py b/scrapy/tests/test_contrib_ibl/test_htmlpage.py
index cdba56853..b3bc91fa5 100644
--- a/scrapy/tests/test_contrib_ibl/test_htmlpage.py
+++ b/scrapy/tests/test_contrib_ibl/test_htmlpage.py
@@ -137,3 +137,13 @@ class TestParseHtml(TestCase):
         parsed = list(parse_html("<img src='http://images.play.com/banners/SAM550a.jpg' align='left' hspace='5'/ >"))
         self.assertEqual(parsed[0].attributes, {'src': 'http://images.play.com/banners/SAM550a.jpg', \
             'align': 'left', 'hspace': '5', '/': None})
+
+    def test_no_ending_body(self):
+        """Test case when no ending body nor html elements are present"""
+        parsed = [_decode_element(d) for d in PARSED7]
+        self._test_sample(PAGE7, parsed)
+
+    def test_malformed(self):
+        """Test parsing of some malformed cases"""
+        parsed = [_decode_element(d) for d in PARSED8]
+        self._test_sample(PAGE8, parsed)
diff --git a/scrapy/tests/test_contrib_ibl/test_htmlpage_data.py b/scrapy/tests/test_contrib_ibl/test_htmlpage_data.py
index 62cd6b526..39771a666 100644
--- a/scrapy/tests/test_contrib_ibl/test_htmlpage_data.py
+++ b/scrapy/tests/test_contrib_ibl/test_htmlpage_data.py
@@ -246,3 +246,12 @@ PARSED7 = [
     {'end': 99, 'start': 85},
 ]
+
+PAGE8 = u"""<a href="/overview.asp?id=277"><img src="/img/5200814311.jpg" border="0" title="Vinyl Cornice"</a></td><table width="5">"""
+
+PARSED8 = [
+    {'attributes' : {u'href' : u"/overview.asp?id=277"}, 'end': 31, 'start': 0, 'tag': u'a', 'tag_type': 1},
+    {'attributes' : {u'src' : u"/img/5200814311.jpg", u'border' : u"0", u'title': u'Vinyl Cornice'}, 'end': 94, 'start': 31, 'tag': u'img', 'tag_type': 1},
+    {'attributes' : {}, 'end': 98, 'start': 94, 'tag': u'a', 'tag_type': 2},
+    {'attributes' : {}, 'end': 103, 'start': 98, 'tag': u'td', 'tag_type': 2},
+    {'attributes' : {u'width': u'5'}, 'end': 120, 'start': 103, 'tag': u'table', 'tag_type': 1}
+]
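A quick sketch of what this patch buys in practice (not part of the patch; it
assumes only the parse_html() and HtmlTag names already used by the tests
above)::

    from scrapy.contrib.ibl.htmlpage import parse_html, HtmlTag

    # A tag that never gets its closing ">", as in PAGE8 above. With the old
    # _TAG regex this fragment was not recognised as a tag at all; with the
    # optional ">?" the tag is closed implicitly at the next "<".
    page = u'<img src="/img/5200814311.jpg" border="0" title="Vinyl Cornice"</a>'
    tags = [f for f in parse_html(page) if isinstance(f, HtmlTag)]
    print tags[0].tag, tags[0].attributes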
From ebf5ad933e718198cfa557272693fe0aa57a775f Mon Sep 17 00:00:00 2001
From: Pablo Hoffman
Date: Wed, 5 Jan 2011 11:59:19 -0200
Subject: [PATCH 05/14] fixed compatibility with python 2.5 and removed unused
 code

---
 scrapy/utils/memory.py | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/scrapy/utils/memory.py b/scrapy/utils/memory.py
index bed6821f9..3340a0a6e 100644
--- a/scrapy/utils/memory.py
+++ b/scrapy/utils/memory.py
@@ -1,3 +1,5 @@
+from __future__ import with_statement
+
 import os
 import sys
 import struct
@@ -64,7 +66,4 @@ def _vmvalue_solaris(vmkey, pid):
 
     vm_in_kB = parts[vmkey_index[vmkey]]
 
-    def kB_to_Bytes(kB):
-        return kB * 1024
-
     return vm_in_kB * 1024

From 32adbea545e4c4780f7c3545c093a7fb93433a3b Mon Sep 17 00:00:00 2001
From: Martin Olveyra
Date: Mon, 24 Jan 2011 18:40:42 -0200
Subject: [PATCH 06/14] handle case when attributes are not separated by space
 (still recognizable because of quotes)

---
 scrapy/contrib/ibl/htmlpage.py                      |  4 ++--
 .../tests/test_contrib_ibl/test_htmlpage.py         |  6 ++++++
 .../test_contrib_ibl/test_htmlpage_data.py          | 20 +++++++++++++++++++
 3 files changed, 28 insertions(+), 2 deletions(-)

diff --git a/scrapy/contrib/ibl/htmlpage.py b/scrapy/contrib/ibl/htmlpage.py
index 30c21c043..c86ec9670 100644
--- a/scrapy/contrib/ibl/htmlpage.py
+++ b/scrapy/contrib/ibl/htmlpage.py
@@ -80,8 +80,8 @@ class HtmlTag(HtmlDataFragment):
     def __repr__(self):
         return str(self)
 
-_ATTR = "((?:[^=/>\s]|/(?!>))+)(?:\s*=(?:\s*\"(.*?)\"|\s*'(.*?)'|([^>\s]+))?)?"
-_TAG = "<(\/?)(\w+(?::\w+)?)((?:\s+" + _ATTR + ")+\s*|\s*)(\/?)>?"
+_ATTR = "((?:[^=/<>\s]|/(?!>))+)(?:\s*=(?:\s*\"(.*?)\"|\s*'(.*?)'|([^>\s]+))?)?"
+_TAG = "<(\/?)(\w+(?::\w+)?)((?:\s*" + _ATTR + ")+\s*|\s*)(\/?)>?"
 _DOCTYPE = r"<!DOCTYPE.*?>"
 _SCRIPT = "(<script.*?>)(.*?)(</script.*?>)"
 _COMMENT = "(<!--.*?-->)"
diff --git a/scrapy/tests/test_contrib_ibl/test_htmlpage.py b/scrapy/tests/test_contrib_ibl/test_htmlpage.py
index b3bc91fa5..d85ae1afb 100644
--- a/scrapy/tests/test_contrib_ibl/test_htmlpage.py
+++ b/scrapy/tests/test_contrib_ibl/test_htmlpage.py
@@ -147,3 +147,9 @@ class TestParseHtml(TestCase):
         """Test parsing of some malformed cases"""
         parsed = [_decode_element(d) for d in PARSED8]
         self._test_sample(PAGE8, parsed)
+
+    def test_malformed2(self):
+        """Test case when attributes are not separated by space (still recognizable because of quotes)"""
+        parsed = [_decode_element(d) for d in PARSED9]
+        self._test_sample(PAGE9, parsed)
+
diff --git a/scrapy/tests/test_contrib_ibl/test_htmlpage_data.py b/scrapy/tests/test_contrib_ibl/test_htmlpage_data.py
index 39771a666..f54dc9f8c 100644
--- a/scrapy/tests/test_contrib_ibl/test_htmlpage_data.py
+++ b/scrapy/tests/test_contrib_ibl/test_htmlpage_data.py
@@ -255,3 +255,23 @@ PARSED8 = [
     {'attributes' : {}, 'end': 103, 'start': 98, 'tag': u'td', 'tag_type': 2},
     {'attributes' : {u'width': u'5'}, 'end': 120, 'start': 103, 'tag': u'table', 'tag_type': 1}
 ]
+
+PAGE9 = u"""\
+<html>\
+<body>\
+<img width="230"height="150" src="/images/9589.jpg" >\
+<a href="/product/9589">Click here\
+</a>\
+</body>\
+</html>"""
+
+PARSED9 = [
+    {'attributes' : {}, 'end': 6, 'start': 0, 'tag': 'html', 'tag_type': 1},
+    {'attributes' : {}, 'end': 12, 'start': 6, 'tag': 'body', 'tag_type': 1},
+    {'attributes' : {'width': '230', 'height': '150', 'src': '/images/9589.jpg'}, 'end': 65, 'start': 12, 'tag': 'img', 'tag_type': 1},
+    {'attributes' : {'href': '/product/9589'}, 'end': 89, 'start': 65, 'tag': 'a', 'tag_type': 1},
+    {'end': 99, 'start': 89},
+    {'attributes' : {}, 'end': 103, 'start': 99, 'tag': 'a', 'tag_type': 2},
+    {'attributes' : {}, 'end': 110, 'start': 103, 'tag': 'body', 'tag_type': 2},
+    {'attributes' : {}, 'end': 117, 'start': 110, 'tag': 'html', 'tag_type': 2},
+]

From c5351d2f4882e423828e747b528179d5b1486f7e Mon Sep 17 00:00:00 2001
From: Shane Evans
Date: Tue, 25 Jan 2011 19:23:50 -0200
Subject: [PATCH 07/14] add __hash__ method to Link object to be compatible
 with the __eq__ method

---
 scrapy/link.py | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/scrapy/link.py b/scrapy/link.py
index ddae07823..371a987cb 100644
--- a/scrapy/link.py
+++ b/scrapy/link.py
@@ -18,6 +18,9 @@ class Link(object):
 
     def __eq__(self, other):
         return self.url == other.url and self.text == other.text
+
+    def __hash__(self):
+        return hash(self.url) ^ hash(self.text)
 
     def __repr__(self):
         return '<Link url=%r text=%r>' % (self.url, self.text)

From 632bc27deb6e2425f25a3c909a2d80ea4c5df0c4 Mon Sep 17 00:00:00 2001
From: Pablo Hoffman
Date: Tue, 25 Jan 2011 19:51:17 -0200
Subject: [PATCH 08/14] added tests for Link object

---
 scrapy/tests/test_link.py | 28 ++++++++++++++++++++++++++++
 1 file changed, 28 insertions(+)
 create mode 100644 scrapy/tests/test_link.py

diff --git a/scrapy/tests/test_link.py b/scrapy/tests/test_link.py
new file mode 100644
index 000000000..32e0095e6
--- /dev/null
+++ b/scrapy/tests/test_link.py
@@ -0,0 +1,28 @@
+import unittest
+
+from scrapy.link import Link
+
+class LinkTest(unittest.TestCase):
+
+    def test_eq_and_hash(self):
+        l1 = Link("http://www.example.com")
+        l2 = Link("http://www.example.com/other")
+        l3 = Link("http://www.example.com")
+
+        self.assertEqual(l1, l1)
+        self.assertEqual(hash(l1), hash(l1))
+        self.assertNotEqual(l1, l2)
+        self.assertNotEqual(hash(l1), hash(l2))
+        self.assertEqual(l1, l3)
+        self.assertEqual(hash(l1), hash(l3))
+
+        l4 = Link("http://www.example.com", text="test")
+        l5 = Link("http://www.example.com", text="test2")
+        l6 = Link("http://www.example.com", text="test")
+
+        self.assertEqual(l4, l4)
+        self.assertEqual(hash(l4), hash(l4))
+        self.assertNotEqual(l4, l5)
+        self.assertNotEqual(hash(l4), hash(l5))
+        self.assertEqual(l4, l6)
+        self.assertEqual(hash(l4), hash(l6))
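Why patches 07 and 08 matter together, in one illustrative snippet (a sketch,
not part of either patch): defining __hash__ consistently with __eq__ makes
Link usable in sets and as dict keys, so equal links collapse to one entry::

    from scrapy.link import Link

    links = set([Link("http://www.example.com", text="test"),
                 Link("http://www.example.com", text="test")])
    print len(links)  # 1 -- equal url/text now implies equal hash as well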
From bfc6c3809b538ac72b5b677234ebcaacfd1190db Mon Sep 17 00:00:00 2001
From: Ismael Carnales
Date: Wed, 9 Feb 2011 16:20:48 -0200
Subject: [PATCH 09/14] Add namespace support to xmliter_lxml

---
 scrapy/contrib_exp/iterators.py      | 11 +++++++---
 scrapy/tests/test_utils_iterators.py | 32 +++++++++++++++++++++++++---
 2 files changed, 37 insertions(+), 6 deletions(-)

diff --git a/scrapy/contrib_exp/iterators.py b/scrapy/contrib_exp/iterators.py
index 0fc73e194..d42a25c3f 100644
--- a/scrapy/contrib_exp/iterators.py
+++ b/scrapy/contrib_exp/iterators.py
@@ -2,14 +2,19 @@ from scrapy.http import Response
 from scrapy.selector import XmlXPathSelector
 
 
-def xmliter_lxml(obj, nodename):
+def xmliter_lxml(obj, nodename, namespace=None):
     from lxml import etree
     reader = _StreamReader(obj)
-    iterable = etree.iterparse(reader, tag=nodename, encoding=reader.encoding)
+    tag = '{%s}%s' % (namespace, nodename) if namespace else nodename
+    iterable = etree.iterparse(reader, tag=tag, encoding=reader.encoding)
     for _, node in iterable:
         nodetext = etree.tostring(node)
         node.clear()
-        yield XmlXPathSelector(text=nodetext).select('//' + nodename)[0]
+        xs = XmlXPathSelector(text=nodetext)
+        if namespace:
+            xs.register_namespace('x', namespace)
+            nodename = 'x:%s' % nodename
+        yield xs.select('//' + nodename)[0]
 
 
 class _StreamReader(object):
diff --git a/scrapy/tests/test_utils_iterators.py b/scrapy/tests/test_utils_iterators.py
index d32d1e658..79435d615 100644
--- a/scrapy/tests/test_utils_iterators.py
+++ b/scrapy/tests/test_utils_iterators.py
@@ -29,12 +29,12 @@ class XmliterTestCase(unittest.TestCase):
         for x in self.xmliter(response, 'product'):
             attrs.append((x.select("@id").extract(), x.select("name/text()").extract(), x.select("./type/text()").extract()))
 
-        self.assertEqual(attrs, 
+        self.assertEqual(attrs,
                          [(['001'], ['Name 1'], ['Type 1']), (['002'], ['Name 2'], ['Type 2'])])
 
     def test_xmliter_text(self):
         body = u"""<?xml version="1.0" encoding="UTF-8"?><products><product>one</product><product>two</product></products>"""
-        
+
         self.assertEqual([x.select("text()").extract() for x in self.xmliter(body, 'product')],
                          [[u'one'], [u'two']])
@@ -74,7 +74,7 @@
 
     def test_xmliter_exception(self):
         body = u"""<?xml version="1.0" encoding="UTF-8"?><products><product>one</product><product>two</product></products>"""
-        
+
         iter = self.xmliter(body, 'product')
         iter.next()
         iter.next()
@@ -97,6 +97,32 @@ class LxmlXmliterTestCase(XmliterTestCase):
     except ImportError:
         skip = "lxml not available"
 
+    def test_xmliter_iterate_namespace(self):
+        body = """\
+        <?xml version="1.0" encoding="UTF-8"?>
+        <rss version="2.0" xmlns:g="http://base.google.com/ns/1.0">
+            <channel>
+            <title>My Dummy Company</title>
+            <link>http://www.mydummycompany.com</link>
+            <description>This is a dummy company. We do nothing.</description>
+            <item>
+                <title>Item 1</title>
+                <description>This is item 1</description>
+                <link>http://www.mydummycompany.com/items/1</link>
+                <g:image_link>http://www.mydummycompany.com/images/item1.jpg</g:image_link>
+            </item>
+            </channel>
+        </rss>
+        """
+        response = XmlResponse(url='http://mydummycompany.com', body=body)
+
+        no_namespace_iter = self.xmliter(response, 'image_link')
+        self.assertEqual(len(list(no_namespace_iter)), 0)
+
+        namespace_iter = self.xmliter(response, 'image_link', 'http://base.google.com/ns/1.0')
+        node = namespace_iter.next()
+        self.assertEqual(node.select('text()').extract(), ['http://www.mydummycompany.com/images/item1.jpg'])
+
 class UtilsCsvTestCase(unittest.TestCase):
     sample_feeds_dir = os.path.join(os.path.abspath(os.path.dirname(__file__)), 'sample_data', 'feeds')
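At a call site the new namespace argument reads like this (a sketch; feed_body
stands for an XML document such as the Google Base feed built in the test
above)::

    from scrapy.http import XmlResponse
    from scrapy.contrib_exp.iterators import xmliter_lxml

    response = XmlResponse(url='http://mydummycompany.com', body=feed_body)
    # Without the third argument the namespaced g:image_link nodes are never
    # matched; with it, each yielded selector already has the namespace
    # registered internally under the 'x' prefix.
    for node in xmliter_lxml(response, 'image_link', 'http://base.google.com/ns/1.0'):
        print node.select('text()').extract()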
From 9b07b0ab0a218998191ec9b8537cdf0aea0455a5 Mon Sep 17 00:00:00 2001
From: Ismael Carnales
Date: Fri, 11 Feb 2011 11:41:44 -0200
Subject: [PATCH 10/14] Fix xmliter_lxml

---
 scrapy/contrib_exp/iterators.py      | 4 ++--
 scrapy/tests/test_utils_iterators.py | 3 +++
 2 files changed, 5 insertions(+), 2 deletions(-)

diff --git a/scrapy/contrib_exp/iterators.py b/scrapy/contrib_exp/iterators.py
index d42a25c3f..0f3a8c694 100644
--- a/scrapy/contrib_exp/iterators.py
+++ b/scrapy/contrib_exp/iterators.py
@@ -7,14 +7,14 @@ def xmliter_lxml(obj, nodename, namespace=None):
     reader = _StreamReader(obj)
     tag = '{%s}%s' % (namespace, nodename) if namespace else nodename
     iterable = etree.iterparse(reader, tag=tag, encoding=reader.encoding)
+    selxpath = '//' + ('x:%s' % nodename if namespace else nodename)
     for _, node in iterable:
         nodetext = etree.tostring(node)
         node.clear()
         xs = XmlXPathSelector(text=nodetext)
         if namespace:
             xs.register_namespace('x', namespace)
-            nodename = 'x:%s' % nodename
-        yield xs.select('//' + nodename)[0]
+        yield xs.select(selxpath)[0]
 
 
 class _StreamReader(object):
diff --git a/scrapy/tests/test_utils_iterators.py b/scrapy/tests/test_utils_iterators.py
index 79435d615..06fb4effe 100644
--- a/scrapy/tests/test_utils_iterators.py
+++ b/scrapy/tests/test_utils_iterators.py
@@ -110,6 +110,7 @@
                 <description>This is item 1</description>
                 <link>http://www.mydummycompany.com/items/1</link>
                 <g:image_link>http://www.mydummycompany.com/images/item1.jpg</g:image_link>
+                <g:image_link>http://www.mydummycompany.com/images/item2.jpg</g:image_link>
             </item>
             </channel>
         </rss>
@@ -122,6 +123,8 @@
 
         namespace_iter = self.xmliter(response, 'image_link', 'http://base.google.com/ns/1.0')
         node = namespace_iter.next()
         self.assertEqual(node.select('text()').extract(), ['http://www.mydummycompany.com/images/item1.jpg'])
+        node = namespace_iter.next()
+        self.assertEqual(node.select('text()').extract(), ['http://www.mydummycompany.com/images/item2.jpg'])
 
 class UtilsCsvTestCase(unittest.TestCase):

From a1c3fa5dd827e1f842e4270cadeec35539f09e38 Mon Sep 17 00:00:00 2001
From: Shane Evans
Date: Tue, 15 Feb 2011 15:42:10 -0200
Subject: [PATCH 11/14] small refactor of image extraction

---
 scrapy/contrib/ibl/extractors.py | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/scrapy/contrib/ibl/extractors.py b/scrapy/contrib/ibl/extractors.py
index 9c460df10..858aa9d55 100644
--- a/scrapy/contrib/ibl/extractors.py
+++ b/scrapy/contrib/ibl/extractors.py
@@ -132,6 +132,10 @@ def image_url(txt):
     ['http://s7d5.scene7.com/is/image/wasserstrom/165133?wid=227&hei=227&defaultImage=noimage_wasserstrom']
 
     """
+    imgurl = extract_image_url(txt)
+    return [safe_url_string(remove_entities(url(imgurl)))] if imgurl else None
+
+def extract_image_url(txt):
     txt = url(txt)
     imgurl = None
     if txt:
@@ -153,4 +157,4 @@
         imgurl = urlparse.urlunparse(parsed)
     if not imgurl:
         imgurl = txt
-    return [safe_url_string(remove_entities(url(imgurl)))] if imgurl else None
+    return imgurl
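The refactor keeps image_url()'s list-returning contract while exposing the
bare-URL step for reuse. A sketch reusing the docstring's own example input::

    from scrapy.contrib.ibl.extractors import image_url, extract_image_url

    txt = 'http://s7d5.scene7.com/is/image/wasserstrom/165133?wid=227&hei=227&defaultImage=noimage_wasserstrom'
    print image_url(txt)          # one-element list, behaviour unchanged
    print extract_image_url(txt)  # the bare cleaned URL, or None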
From c55355642c26717265167db5acf002edf83ee7ea Mon Sep 17 00:00:00 2001
From: Daniel Grana
Date: Wed, 16 Feb 2011 08:57:42 -0200
Subject: [PATCH 12/14] fix FAQ typos reported by marlun_ at #scrapy IRC
 channel

---
 docs/faq.rst | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/docs/faq.rst b/docs/faq.rst
index 2444a3c05..5783fdcaf 100644
--- a/docs/faq.rst
+++ b/docs/faq.rst
@@ -3,7 +3,7 @@
 Frequently Asked Questions
 ==========================
 
-How does Scrapy compare to BeautifulSoul or lxml?
+How does Scrapy compare to BeautifulSoup or lxml?
 -------------------------------------------------
 
 `BeautifulSoup`_ and `lxml`_ are libraries for parsing HTML and XML. Scrapy is
@@ -29,7 +29,7 @@ comparing `jinja2`_ to `Django`_.
 What Python versions does Scrapy support?
 -----------------------------------------
 
-Scrapy runs in Python 2.5, 2.6 and 2.6. But it's recommended you use Python 2.6
+Scrapy runs in Python 2.5, 2.6 and 2.7. But it's recommended you use Python 2.6
 or above, since the Python 2.5 standard library has a few bugs in their URL
 handling libraries. Some of these Python 2.5 bugs not only affect Scrapy but
 any user code, such as spiders. You can see a list of `Python 2.5 bugs that

From fe9febe2b10d787e00c7e48c0966f27082e82d6d Mon Sep 17 00:00:00 2001
From: Pablo Hoffman
Date: Wed, 23 Feb 2011 18:10:16 -0200
Subject: [PATCH 13/14] added --build-egg option to deploy command, to build
 the egg without deploying it

---
 scrapy/commands/deploy.py | 30 ++++++++++++++++++++----------
 1 file changed, 20 insertions(+), 10 deletions(-)

diff --git a/scrapy/commands/deploy.py b/scrapy/commands/deploy.py
index 0dcf5037f..c22b9c710 100644
--- a/scrapy/commands/deploy.py
+++ b/scrapy/commands/deploy.py
@@ -57,6 +57,8 @@ class Command(ScrapyCommand):
             help="list available projects on TARGET")
         parser.add_option("--egg", metavar="FILE",
             help="use the given egg, instead of building it")
+        parser.add_option("--build-egg", metavar="FILE",
+            help="only build the egg, don't deploy it")
 
     def run(self, args, opts):
         try:
@@ -75,18 +77,26 @@ class Command(ScrapyCommand):
             projects = json.loads(f.read())['projects']
             print os.linesep.join(projects)
             return
-        target_name = _get_target_name(args)
-        target = _get_target(target_name)
-        project = _get_project(target, opts)
-        version = _get_version(target, opts)
+
+        tmpdir = None
+
-        if opts.egg:
-            _log("Using egg: %s" % opts.egg)
-            egg = opts.egg
-        else:
-            _log("Building egg of %s-%s" % (project, version))
+        if opts.build_egg: # build egg only
             egg, tmpdir = _build_egg()
-        _upload_egg(target, egg, project, version)
+            _log("Writing egg to %s" % opts.build_egg)
+            shutil.copyfile(egg, opts.build_egg)
+        else: # build egg and deploy
+            target_name = _get_target_name(args)
+            target = _get_target(target_name)
+            project = _get_project(target, opts)
+            version = _get_version(target, opts)
+            if opts.egg:
+                _log("Using egg: %s" % opts.egg)
+                egg = opts.egg
+            else:
+                _log("Building egg of %s-%s" % (project, version))
+                egg, tmpdir = _build_egg()
+            _upload_egg(target, egg, project, version)
 
         if tmpdir:
             shutil.rmtree(tmpdir)
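For the record, the new flag is invoked like this (an illustration; the
filename is arbitrary)::

    scrapy deploy --build-egg=myproject.egg

This builds the project egg locally into the given file without contacting
any deploy target, which is handy for inspecting exactly what would be
uploaded.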
From 32fa2add753306fd20da483c01532ac080488f14 Mon Sep 17 00:00:00 2001
From: Shane Evans
Date: Thu, 24 Feb 2011 14:21:23 -0200
Subject: [PATCH 14/14] style fix to ibl contrib

---
 scrapy/contrib/ibl/extraction/regionextract.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/scrapy/contrib/ibl/extraction/regionextract.py b/scrapy/contrib/ibl/extraction/regionextract.py
index db5eb2a38..c748a6a79 100644
--- a/scrapy/contrib/ibl/extraction/regionextract.py
+++ b/scrapy/contrib/ibl/extraction/regionextract.py
@@ -75,7 +75,7 @@ class BasicTypeExtractor(object):
     [doctest setup lines garbled in extraction: two HTML string literals
     whose markup was stripped, leaving only the text fragments "x xx" and
     "a name id-9"]
     >>> ex = BasicTypeExtractor(template.annotations[0])
-    >>> ex.extract(page, 0, 3, [LabelledRegion(*(1,2))])
+    >>> ex.extract(page, 0, 3, [LabelledRegion(1, 2)])
     [(u'name', u'a name')]
     """
@@ -395,7 +395,7 @@ class RecordExtractor(object):
             s, p, e = similar_region(page.page_tokens, self.template_tokens, \
                     i, start, sindex)
             if s > 0:
-                similar_ignored_regions.append(LabelledRegion(*(p, e)))
+                similar_ignored_regions.append(LabelledRegion(p, e))
             start = e or start
         extracted_data = first_region.extract(page, pindex, sindex, similar_ignored_regions)
         if extracted_data:
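The two LabelledRegion spellings are equivalent calls; the patch only drops a
redundant tuple packing/unpacking. A stand-in sketch (the namedtuple fields
are illustrative, not the real LabelledRegion definition)::

    from collections import namedtuple

    LabelledRegion = namedtuple('LabelledRegion', 'start end')

    # Unpacking a freshly built tuple is the same call as passing the
    # positional arguments directly.
    assert LabelledRegion(*(1, 2)) == LabelledRegion(1, 2)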