From 32b9f788be278ede969af4c1d1bf703a44e75dfb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Daniel=20Gra=C3=B1a?= Date: Fri, 13 Apr 2012 10:52:39 -0300 Subject: [PATCH] lxml form request cleanup. #111 * remove unused _nons function copied from lxml.html * compute clickables only if dont_click is False * reduce branch nesting in the _get_clickable function --- scrapy/http/request/form.py | 88 +++++++++++++++---------------------- 1 file changed, 36 insertions(+), 52 deletions(-) diff --git a/scrapy/http/request/form.py b/scrapy/http/request/form.py index 3be5aafac..896ce3448 100644 --- a/scrapy/http/request/form.py +++ b/scrapy/http/request/form.py @@ -12,8 +12,6 @@ from lxml import html from scrapy.http.request import Request from scrapy.utils.python import unicode_to_str -XHTML_NAMESPACE = "http://www.w3.org/1999/xhtml" - class MultipleElementsFound(Exception): pass @@ -53,13 +51,6 @@ class FormRequest(Request): url = form.action or form.base_url return cls(url, method=form.method, formdata=formdata, **kwargs) -# Copied from lxml.html to avoid relying on a non-public function -def _nons(tag): - if isinstance(tag, basestring): - if tag[0] == '{' and tag[1:len(XHTML_NAMESPACE)+1] == XHTML_NAMESPACE: - return tag.split('}')[-1] - return tag - def _urlencode(seq, enc): values = [(unicode_to_str(k, enc), unicode_to_str(v, enc)) for k, vs in seq @@ -92,13 +83,12 @@ def _get_form(hxs, formname, formnumber, response): def _get_inputs(form, formdata, dont_click, clickdata, response): inputs = [(n, v) for n, v in form.form_values() if n not in formdata] - clickables = [el for el in form.inputs if el.type == 'submit'] - # If we are allowed to click on buttons and we have clickable - # elements, we move on to see if we have any clickdata - if not dont_click and clickables: - clickable = _get_clickable(clickdata, clickables, form) - inputs.append(clickable) + if not dont_click: + clickables = [el for el in form.inputs if el.type == 'submit'] + if clickables: + clickable = 
_get_clickable(clickdata, clickables, form) + inputs.append(clickable) inputs.extend(formdata.iteritems()) return inputs @@ -109,42 +99,36 @@ def _get_clickable(clickdata, clickables, form): if the latter is given. If not, it returns the first clickable element found """ - # If clickdata is given, we compare it to the clickable elements - # to find a match - if clickdata is not None: - # We first look to see if the number is specified in - # clickdata, because that uniquely identifies the element - nr = clickdata.get('nr', None) - if nr is not None: - try: - el = list(form.inputs)[nr] - except IndexError: - pass - else: - return (el.name, el.value) - - # We didn't find it, so now we build an XPath expression - # out of the other arguments, because they can be used - # as such - else: - xpath_pred = [] - for k, v in clickdata.items(): - if k == 'coord': - v = ','.join(str(c) for c in v) - xpath_pred.append('[@%s="%s"]' % (k, v)) - - xpath_expr = '//*%s' % ''.join(xpath_pred) - el = form.xpath(xpath_expr) - if len(el) > 1: - raise MultipleElementsFound( - "Multiple elements found (%r) matching the criteria" - " in clickdata: %r" % (el, clickdata) - ) - else: - return (el[0].name, el[0].value) - - # If we don't have clickdata, we just use the first - # clickable element - else: + # If we don't have clickdata, we just use the first clickable element + if clickdata is None: el = clickables.pop(0) return (el.name, el.value) + + # If clickdata is given, we compare it to the clickable elements to find a + # match. 
We first look to see if the number is specified in clickdata, + # because that uniquely identifies the element + nr = clickdata.get('nr', None) + if nr is not None: + try: + el = list(form.inputs)[nr] + except IndexError: + pass + else: + return (el.name, el.value) + + # We didn't find it, so now we build an XPath expression out of the other + # arguments, because they can be used as such + xpath_pred = [] + for k, v in clickdata.items(): + if k == 'coord': + v = ','.join(str(c) for c in v) + xpath_pred.append('[@%s="%s"]' % (k, v)) + + xpath_expr = '//*%s' % ''.join(xpath_pred) + el = form.xpath(xpath_expr) + if len(el) > 1: + raise MultipleElementsFound("Multiple elements found (%r) " + "matching the criteria in clickdata: %r" + % (el, clickdata)) + else: + return (el[0].name, el[0].value)