Mirror of https://github.com/scrapy/scrapy.git (synced 2025-02-06 11:00:46 +00:00)

Commit b2f43d51ac: Merge remote-tracking branch 'origin/master' into asyncio-startrequests-asyncgen
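The hunks below apply one recurring change: old-style `%` formatting and `str.format()` calls are rewritten as f-strings. As a minimal, standalone illustration of that pattern (not code taken from Scrapy; the variable names are made up), all three spellings produce the same string:

    # Illustrative only.
    name, count = "quotes", 3

    old_percent = "spider %s scraped %d items" % (name, count)      # %-formatting
    old_format = "spider {} scraped {} items".format(name, count)   # str.format()
    new_fstring = f"spider {name} scraped {count} items"            # f-string (Python 3.6+)

    assert old_percent == old_format == new_fstring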
@@ -49,7 +49,7 @@ master_doc = 'index'

 # General information about the project.
 project = 'Scrapy'
-copyright = '2008–{}, Scrapy developers'.format(datetime.now().year)
+copyright = f'2008–{datetime.now().year}, Scrapy developers'

 # The version info for the project you're documenting, acts as replacement for
 # |version| and |release|, also used in various other places throughout the
@@ -101,10 +101,10 @@ This is the code for our first Spider. Save it in a file named

     def parse(self, response):
         page = response.url.split("/")[-2]
-        filename = 'quotes-%s.html' % page
+        filename = f'quotes-{page}.html'
         with open(filename, 'wb') as f:
             f.write(response.body)
-        self.log('Saved file %s' % filename)
+        self.log(f'Saved file {filename}')

 As you can see, our Spider subclasses :class:`scrapy.Spider <scrapy.spiders.Spider>`
@@ -190,7 +190,7 @@ for your spider::

     def parse(self, response):
         page = response.url.split("/")[-2]
-        filename = 'quotes-%s.html' % page
+        filename = f'quotes-{page}.html'
         with open(filename, 'wb') as f:
             f.write(response.body)
@@ -5,9 +5,9 @@ Using your browser's Developer Tools for scraping
 =================================================

 Here is a general guide on how to use your browser's Developer Tools
 to ease the scraping process. Today almost all browsers come with
 built in `Developer Tools`_ and although we will use Firefox in this
 guide, the concepts are applicable to any other browser.

 In this guide we'll introduce the basic tools to use from a browser's
 Developer Tools by scraping `quotes.toscrape.com`_.
@@ -41,16 +41,16 @@ Therefore, you should keep in mind the following things:
 Inspecting a website
 ====================

 By far the most handy feature of the Developer Tools is the `Inspector`
 feature, which allows you to inspect the underlying HTML code of
 any webpage. To demonstrate the Inspector, let's look at the
 `quotes.toscrape.com`_-site.

 On the site we have a total of ten quotes from various authors with specific
 tags, as well as the Top Ten Tags. Let's say we want to extract all the quotes
 on this page, without any meta-information about authors, tags, etc.

 Instead of viewing the whole source code for the page, we can simply right click
 on a quote and select ``Inspect Element (Q)``, which opens up the `Inspector`.
 In it you should see something like this:
@@ -97,16 +97,16 @@ Then, back to your web browser, right-click on the ``span`` tag, select
 >>> response.xpath('/html/body/div/div[2]/div[1]/div[1]/span[1]/text()').getall()
 ['“The world as we have created it is a process of our thinking. It cannot be changed without changing our thinking.”']

 Adding ``text()`` at the end we are able to extract the first quote with this
 basic selector. But this XPath is not really that clever. All it does is
 go down a desired path in the source code starting from ``html``. So let's
 see if we can refine our XPath a bit:

 If we check the `Inspector` again we'll see that directly beneath our
 expanded ``div`` tag we have nine identical ``div`` tags, each with the
 same attributes as our first. If we expand any of them, we'll see the same
 structure as with our first quote: Two ``span`` tags and one ``div`` tag. We can
 expand each ``span`` tag with the ``class="text"`` inside our ``div`` tags and
 see each quote:

 .. code-block:: html
@@ -121,7 +121,7 @@ see each quote:

 With this knowledge we can refine our XPath: Instead of a path to follow,
 we'll simply select all ``span`` tags with the ``class="text"`` by using
 the `has-class-extension`_:

 >>> response.xpath('//span[has-class("text")]/text()').getall()
@@ -130,45 +130,45 @@ the `has-class-extension`_:
 '“There are only two ways to live your life. One is as though nothing is a miracle. The other is as though everything is a miracle.”',
 ...]

 And with one simple, cleverer XPath we are able to extract all quotes from
 the page. We could have constructed a loop over our first XPath to increase
 the number of the last ``div``, but this would have been unnecessarily
 complex and by simply constructing an XPath with ``has-class("text")``
 we were able to extract all quotes in one line.

 The `Inspector` has a lot of other helpful features, such as searching in the
 source code or directly scrolling to an element you selected. Let's demonstrate
 a use case:

 Say you want to find the ``Next`` button on the page. Type ``Next`` into the
 search bar on the top right of the `Inspector`. You should get two results.
 The first is a ``li`` tag with the ``class="next"``, the second the text
 of an ``a`` tag. Right click on the ``a`` tag and select ``Scroll into View``.
 If you hover over the tag, you'll see the button highlighted. From here
 we could easily create a :ref:`Link Extractor <topics-link-extractors>` to
 follow the pagination. On a simple site such as this, there may not be
 the need to find an element visually but the ``Scroll into View`` function
 can be quite useful on complex sites.

 Note that the search bar can also be used to search for and test CSS
 selectors. For example, you could search for ``span.text`` to find
 all quote texts. Instead of a full text search, this searches for
 exactly the ``span`` tag with the ``class="text"`` in the page.

 .. _topics-network-tool:

 The Network-tool
 ================

 While scraping you may come across dynamic webpages where some parts
 of the page are loaded dynamically through multiple requests. While
 this can be quite tricky, the `Network`-tool in the Developer Tools
 greatly facilitates this task. To demonstrate the Network-tool, let's
 take a look at the page `quotes.toscrape.com/scroll`_.

 The page is quite similar to the basic `quotes.toscrape.com`_-page,
 but instead of the above-mentioned ``Next`` button, the page
 automatically loads new quotes when you scroll to the bottom. We
 could go ahead and try out different XPaths directly, but instead
 we'll check another quite useful command from the Scrapy shell:

 .. skip: next
@@ -179,9 +179,9 @@ we'll check another quite useful command from the Scrapy shell:
 (...)
 >>> view(response)

 A browser window should open with the webpage but with one
 crucial difference: Instead of the quotes we just see a greenish
 bar with the word ``Loading...``.

 .. image:: _images/network_01.png
    :width: 777
@@ -189,21 +189,21 @@ bar with the word ``Loading...``.
    :alt: Response from quotes.toscrape.com/scroll

 The ``view(response)`` command lets us view the response our
 shell or later our spider receives from the server. Here we see
 that some basic template is loaded which includes the title,
 the login-button and the footer, but the quotes are missing. This
 tells us that the quotes are being loaded from a different request
 than ``quotes.toscrape/scroll``.

 If you click on the ``Network`` tab, you will probably only see
 two entries. The first thing we do is enable persistent logs by
 clicking on ``Persist Logs``. If this option is disabled, the
 log is automatically cleared each time you navigate to a different
 page. Enabling this option is a good default, since it gives us
 control on when to clear the logs.

 If we reload the page now, you'll see the log get populated with six
 new requests.

 .. image:: _images/network_02.png
    :width: 777
@@ -212,31 +212,31 @@ new requests.

 Here we see every request that has been made when reloading the page
 and can inspect each request and its response. So let's find out
 where our quotes are coming from:

 First click on the request with the name ``scroll``. On the right
 you can now inspect the request. In ``Headers`` you'll find details
 about the request headers, such as the URL, the method, the IP-address,
 and so on. We'll ignore the other tabs and click directly on ``Response``.

 What you should see in the ``Preview`` pane is the rendered HTML-code,
 that is exactly what we saw when we called ``view(response)`` in the
 shell. Accordingly the ``type`` of the request in the log is ``html``.
 The other requests have types like ``css`` or ``js``, but what
 interests us is the one request called ``quotes?page=1`` with the
 type ``json``.

 If we click on this request, we see that the request URL is
 ``http://quotes.toscrape.com/api/quotes?page=1`` and the response
 is a JSON-object that contains our quotes. We can also right-click
 on the request and open ``Open in new tab`` to get a better overview.

 .. image:: _images/network_03.png
    :width: 777
    :height: 375
    :alt: JSON-object returned from the quotes.toscrape API

 With this response we can now easily parse the JSON-object and
 also request each page to get every quote on the site::

     import scrapy
@@ -255,17 +255,17 @@ also request each page to get every quote on the site::
                 yield {"quote": quote["text"]}
             if data["has_next"]:
                 self.page += 1
-                url = "http://quotes.toscrape.com/api/quotes?page={}".format(self.page)
+                url = f"http://quotes.toscrape.com/api/quotes?page={self.page}"
                 yield scrapy.Request(url=url, callback=self.parse)

 This spider starts at the first page of the quotes-API. With each
 response, we parse the ``response.text`` and assign it to ``data``.
 This lets us operate on the JSON-object like on a Python dictionary.
 We iterate through the ``quotes`` and print out the ``quote["text"]``.
 If the handy ``has_next`` element is ``true`` (try loading
 `quotes.toscrape.com/api/quotes?page=10`_ in your browser or a
 page-number greater than 10), we increment the ``page`` attribute
 and ``yield`` a new request, inserting the incremented page-number
 into our ``url``.
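Since only a fragment of that spider is visible in the hunk above, here is a self-contained sketch of the kind of spider the surrounding text describes, written against the quotes.toscrape.com API as characterized there (a JSON object with a ``quotes`` list and a ``has_next`` flag); the class name and spider name are illustrative, not necessarily identical to the full documentation example:

    import json

    import scrapy


    class QuotesAPISpider(scrapy.Spider):
        name = "quotes_api"  # illustrative name
        page = 1
        start_urls = ["http://quotes.toscrape.com/api/quotes?page=1"]

        def parse(self, response):
            data = json.loads(response.text)   # the endpoint returns JSON
            for quote in data["quotes"]:
                yield {"quote": quote["text"]}
            if data["has_next"]:               # stop once the API reports no further pages
                self.page += 1
                url = f"http://quotes.toscrape.com/api/quotes?page={self.page}"
                yield scrapy.Request(url=url, callback=self.parse)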
 .. _requests-from-curl:

@@ -298,7 +298,7 @@ Note that to translate a cURL command into a Scrapy request,
 you may use `curl2scrapy <https://michael-shub.github.io/curl2scrapy/>`_.

 As you can see, with a few inspections in the `Network`-tool we
 were able to easily replicate the dynamic requests of the scrolling
 functionality of the page. Crawling dynamic pages can be quite
 daunting and pages can be very complex, but it (mostly) boils down
 to identifying the correct request and replicating it in your spider.
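The ``requests-from-curl`` section referenced above is about turning a request copied from the browser ("Copy as cURL") into a Scrapy request. A short illustration of that workflow using ``scrapy.Request.from_curl``; the cURL command here is a made-up minimal example:

    import scrapy

    # A cURL command as copied from the browser's Network tool (simplified example).
    curl_command = (
        "curl 'http://quotes.toscrape.com/api/quotes?page=1' "
        "-H 'Accept: application/json'"
    )

    # Request.from_curl() parses the command into an equivalent scrapy.Request.
    request = scrapy.Request.from_curl(curl_command)
    print(request.url, request.headers.get("Accept"))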
@@ -57,7 +57,7 @@ value of one of their fields::
             adapter = ItemAdapter(item)
             year = adapter['year']
             if year not in self.year_to_exporter:
-                f = open('{}.xml'.format(year), 'wb')
+                f = open(f'{year}.xml', 'wb')
                 exporter = XmlItemExporter(f)
                 exporter.start_exporting()
                 self.year_to_exporter[year] = exporter
@@ -98,7 +98,7 @@ Example::
     import scrapy

     def serialize_price(value):
-        return '$ %s' % str(value)
+        return f'$ {str(value)}'

     class Product(scrapy.Item):
         name = scrapy.Field()
@@ -122,7 +122,7 @@ Example::

         def serialize_field(self, field, name, value):
             if field == 'price':
-                return '$ %s' % str(value)
+                return f'$ {str(value)}'
             return super(Product, self).serialize_field(field, name, value)

 .. _topics-exporters-reference:
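To make the serializer hunks above easier to follow, here is a hedged sketch of how such a ``Product`` item with a ``price`` serializer would typically be exported; the field definitions mirror the documentation example, while the output file name and sample values are arbitrary:

    import scrapy
    from scrapy.exporters import XmlItemExporter


    def serialize_price(value):
        return f'$ {str(value)}'


    class Product(scrapy.Item):
        name = scrapy.Field()
        price = scrapy.Field(serializer=serialize_price)


    item = Product(name='plasma TV', price='1999.99')
    with open('products.xml', 'wb') as f:
        exporter = XmlItemExporter(f)
        exporter.start_exporting()
        exporter.export_item(item)   # the price field is written as '$ 1999.99'
        exporter.finish_exporting()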
@@ -96,7 +96,7 @@ contain a price::
                     adapter['price'] = adapter['price'] * self.vat_factor
                 return item
             else:
-                raise DropItem("Missing price in %s" % item)
+                raise DropItem(f"Missing price in {item}")


 Write items to a JSON file
@@ -211,7 +211,7 @@ item.
             # Save screenshot to file, filename will be hash of url.
             url = adapter["url"]
             url_hash = hashlib.md5(url.encode("utf8")).hexdigest()
-            filename = "{}.png".format(url_hash)
+            filename = f"{url_hash}.png"
             with open(filename, "wb") as f:
                 f.write(response.body)

@@ -240,7 +240,7 @@ returns multiple items with the same id::
         def process_item(self, item, spider):
             adapter = ItemAdapter(item)
             if adapter['id'] in self.ids_seen:
-                raise DropItem("Duplicate item found: %r" % item)
+                raise DropItem(f"Duplicate item found: {item!r}")
             else:
                 self.ids_seen.add(adapter['id'])
                 return item
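For context, a pipeline such as the duplicates filter above only runs once it is enabled in the project settings; a minimal sketch, where the module path is a placeholder for wherever the pipeline class actually lives:

    # settings.py (hypothetical project layout)
    ITEM_PIPELINES = {
        "myproject.pipelines.DuplicatesPipeline": 300,  # lower numbers run earlier
    }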
@@ -102,7 +102,7 @@ A real example
 Let's see a concrete example of a hypothetical case of memory leaks.
 Suppose we have some spider with a line similar to this one::

-    return Request("http://www.somenastyspider.com/product.php?pid=%d" % product_id,
+    return Request(f"http://www.somenastyspider.com/product.php?pid={product_id}",
                    callback=self.parse, cb_kwargs={'referer': response})

 That line is passing a response reference inside a request which effectively
@@ -328,8 +328,9 @@ too. Here's an example:
     '<a href="image5.html">Name: My image 5 <br><img src="image5_thumb.jpg"></a>']

     >>> for index, link in enumerate(links):
-    ...     args = (index, link.xpath('@href').get(), link.xpath('img/@src').get())
-    ...     print('Link number %d points to url %r and image %r' % args)
+    ...     href_xpath = link.xpath('@href').get()
+    ...     img_xpath = link.xpath('img/@src').get()
+    ...     print(f'Link number {index} points to url {href_xpath!r} and image {img_xpath!r}')
     Link number 0 points to url 'image1.html' and image 'image1_thumb.jpg'
     Link number 1 points to url 'image2.html' and image 'image2_thumb.jpg'
     Link number 2 points to url 'image3.html' and image 'image3_thumb.jpg'
@@ -822,7 +823,7 @@ with groups of itemscopes and corresponding itemprops::
     ...     props = scope.xpath('''
     ...                 set:difference(./descendant::*/@itemprop,
     ...                     .//*[@itemscope]/*/@itemprop)''')
-    ...     print("    properties: %s" % (props.getall()))
+    ...     print(f"    properties: {props.getall()}")
     ...     print("")

     current scope: ['http://schema.org/Product']
@@ -136,7 +136,7 @@ In a spider, the settings are available through ``self.settings``::
         start_urls = ['http://example.com']

         def parse(self, response):
-            print("Existing settings: %s" % self.settings.attributes.keys())
+            print(f"Existing settings: {self.settings.attributes.keys()}")

 .. note::
     The ``settings`` attribute is set in the base Spider class after the spider
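Beyond listing the keys as in the snippet above, settings values are usually read through the typed accessors of the ``Settings`` object; a small hedged illustration, where the setting names are standard Scrapy settings but the spider itself is made up:

    import scrapy


    class SettingsDemoSpider(scrapy.Spider):
        name = "settings_demo"  # illustrative
        start_urls = ["http://example.com"]

        def parse(self, response):
            # Typed getters convert the stored value for you.
            log_enabled = self.settings.getbool("LOG_ENABLED")
            timeout = self.settings.getint("DOWNLOAD_TIMEOUT")
            self.logger.info(f"LOG_ENABLED={log_enabled}, DOWNLOAD_TIMEOUT={timeout}")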
@@ -287,7 +287,7 @@ Spiders can access arguments in their `__init__` methods::

         def __init__(self, category=None, *args, **kwargs):
             super(MySpider, self).__init__(*args, **kwargs)
-            self.start_urls = ['http://www.example.com/categories/%s' % category]
+            self.start_urls = [f'http://www.example.com/categories/{category}']
             # ...

 The default `__init__` method will take any spider arguments
@@ -300,7 +300,7 @@ The above example can also be written as follows::
         name = 'myspider'

         def start_requests(self):
-            yield scrapy.Request('http://www.example.com/categories/%s' % self.category)
+            yield scrapy.Request(f'http://www.example.com/categories/{self.category}')

 Keep in mind that spider arguments are only strings.
 The spider will not do any parsing on its own.
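Since spider arguments always arrive as strings (typically from ``scrapy crawl myspider -a category=electronics``), anything numeric or boolean has to be converted explicitly; a small sketch of that, with made-up argument names:

    import scrapy


    class MySpider(scrapy.Spider):
        name = "myspider"

        def __init__(self, category=None, limit="10", *args, **kwargs):
            super().__init__(*args, **kwargs)
            self.start_urls = [f"http://www.example.com/categories/{category}"]
            # -a limit=25 arrives as the string "25"; convert it before use.
            self.limit = int(limit)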
@@ -37,7 +37,7 @@ class Root(Resource):
         if now - self.lastmark >= 3:
             self.lastmark = now
             qps = len(self.tail) / sum(self.tail)
-            print('samplesize={0} concurrent={1} qps={2:0.2f}'.format(len(self.tail), self.concurrent, qps))
+            print(f'samplesize={len(self.tail)} concurrent={self.concurrent} qps={qps:0.2f}')

         if 'latency' in request.args:
             latency = float(request.args['latency'][0])
@@ -37,11 +37,11 @@ class QPSSpider(Spider):
     def start_requests(self):
         url = self.benchurl
         if self.latency is not None:
-            url += '?latency={0}'.format(self.latency)
+            url += f'?latency={self.latency}'

         slots = int(self.slots)
         if slots > 1:
-            urls = [url.replace('localhost', '127.0.0.%d' % (x + 1)) for x in range(slots)]
+            urls = [url.replace('localhost', f'127.0.0.{x + 1}') for x in range(slots)]
         else:
             urls = [url]
@@ -44,7 +44,7 @@ def _get_commands_from_entry_points(inproject, group='scrapy.commands'):
         if inspect.isclass(obj):
             cmds[entry_point.name] = obj()
         else:
-            raise Exception("Invalid entry point %s" % entry_point.name)
+            raise Exception(f"Invalid entry point {entry_point.name}")
     return cmds


@@ -67,11 +67,11 @@ def _pop_command_name(argv):


 def _print_header(settings, inproject):
+    version = scrapy.__version__
     if inproject:
-        print("Scrapy %s - project: %s\n" % (scrapy.__version__,
-                                             settings['BOT_NAME']))
+        print(f"Scrapy {version} - project: {settings['BOT_NAME']}\n")
     else:
-        print("Scrapy %s - no active project\n" % scrapy.__version__)
+        print(f"Scrapy {version} - no active project\n")


 def _print_commands(settings, inproject):
@@ -81,7 +81,7 @@ def _print_commands(settings, inproject):
     print("Available commands:")
     cmds = _get_commands_dict(settings, inproject)
     for cmdname, cmdclass in sorted(cmds.items()):
-        print(" %-13s %s" % (cmdname, cmdclass.short_desc()))
+        print(f" {cmdname:<13} {cmdclass.short_desc()}")
     if not inproject:
         print()
         print(" [ more ] More commands available when run from project directory")
@@ -91,7 +91,7 @@ def _print_commands(settings, inproject):

 def _print_unknown_command(settings, cmdname, inproject):
     _print_header(settings, inproject)
-    print("Unknown command: %s\n" % cmdname)
+    print(f"Unknown command: {cmdname}\n")
     print('Use "scrapy" to see available commands')


@@ -133,7 +133,7 @@ def execute(argv=None, settings=None):
         sys.exit(2)

     cmd = cmds[cmdname]
-    parser.usage = "scrapy %s %s" % (cmdname, cmd.syntax())
+    parser.usage = f"scrapy {cmdname} {cmd.syntax()}"
     parser.description = cmd.long_desc()
     settings.setdict(cmd.default_settings, priority='command')
     cmd.settings = settings
@@ -155,7 +155,7 @@ def _run_command(cmd, args, opts):

 def _run_command_profiled(cmd, args, opts):
     if opts.profile:
-        sys.stderr.write("scrapy: writing cProfile stats to %r\n" % opts.profile)
+        sys.stderr.write(f"scrapy: writing cProfile stats to {opts.profile!r}\n")
     loc = locals()
     p = cProfile.Profile()
     p.runctx('cmd.run(args, opts)', globals(), loc)
@@ -61,7 +61,7 @@ class ScrapyCommand:
         group.add_option("--logfile", metavar="FILE",
                          help="log file. if omitted stderr will be used")
         group.add_option("-L", "--loglevel", metavar="LEVEL", default=None,
-                         help="log level (default: %s)" % self.settings['LOG_LEVEL'])
+                         help=f"log level (default: {self.settings['LOG_LEVEL']})")
         group.add_option("--nolog", action="store_true",
                          help="disable logging completely")
         group.add_option("--profile", metavar="FILE", default=None,
@@ -50,7 +50,7 @@ class _BenchSpider(scrapy.Spider):

     def start_requests(self):
         qargs = {'total': self.total, 'show': self.show}
-        url = '{}?{}'.format(self.baseurl, urlencode(qargs, doseq=1))
+        url = f'{self.baseurl}?{urlencode(qargs, doseq=1)}'
         return [scrapy.Request(url, dont_filter=True)]

     def parse(self, response):
@@ -17,7 +17,7 @@ class TextTestResult(_TextTestResult):
         plural = "s" if run != 1 else ""

         writeln(self.separator2)
-        writeln("Ran %d contract%s in %.3fs" % (run, plural, stop - start))
+        writeln(f"Ran {run} contract{plural} in {stop - start:.3f}s")
         writeln()

         infos = []
@@ -25,14 +25,14 @@ class TextTestResult(_TextTestResult):
             write("FAILED")
             failed, errored = map(len, (self.failures, self.errors))
             if failed:
-                infos.append("failures=%d" % failed)
+                infos.append(f"failures={failed}")
             if errored:
-                infos.append("errors=%d" % errored)
+                infos.append(f"errors={errored}")
         else:
             write("OK")

         if infos:
-            writeln(" (%s)" % (", ".join(infos),))
+            writeln(f" ({', '.join(infos)})")
         else:
             write("\n")

@@ -85,7 +85,7 @@ class Command(ScrapyCommand):
                     continue
                 print(spider)
                 for method in sorted(methods):
-                    print(' * %s' % method)
+                    print(f' * {method}')
         else:
             start = time.time()
             self.crawler_process.start()
@@ -32,8 +32,8 @@ class Command(ScrapyCommand):
         try:
             spidercls = self.crawler_process.spider_loader.load(args[0])
         except KeyError:
-            return self._err("Spider not found: %s" % args[0])
+            return self._err(f"Spider not found: {args[0]}")

         sfile = sys.modules[spidercls.__module__].__file__
         sfile = sfile.replace('.pyc', '.py')
-        self.exitcode = os.system('%s "%s"' % (editor, sfile))
+        self.exitcode = os.system(f'{editor} "{sfile}"')
@@ -73,17 +73,18 @@ class Command(ScrapyCommand):
         if template_file:
             self._genspider(module, name, domain, opts.template, template_file)
             if opts.edit:
-                self.exitcode = os.system('scrapy edit "%s"' % name)
+                self.exitcode = os.system(f'scrapy edit "{name}"')

     def _genspider(self, module, name, domain, template_name, template_file):
         """Generate the spider module, based on the given template"""
+        capitalized_module = ''.join(s.capitalize() for s in module.split('_'))
         tvars = {
             'project_name': self.settings.get('BOT_NAME'),
             'ProjectName': string_camelcase(self.settings.get('BOT_NAME')),
             'module': module,
             'name': name,
             'domain': domain,
-            'classname': '%sSpider' % ''.join(s.capitalize() for s in module.split('_'))
+            'classname': f'{capitalized_module}Spider'
         }
         if self.settings.get('NEWSPIDER_MODULE'):
             spiders_module = import_module(self.settings['NEWSPIDER_MODULE'])
@@ -91,32 +92,32 @@ class Command(ScrapyCommand):
         else:
             spiders_module = None
             spiders_dir = "."
-        spider_file = "%s.py" % join(spiders_dir, module)
+        spider_file = f"{join(spiders_dir, module)}.py"
         shutil.copyfile(template_file, spider_file)
         render_templatefile(spider_file, **tvars)
-        print("Created spider %r using template %r "
-              % (name, template_name), end=('' if spiders_module else '\n'))
+        print(f"Created spider {name!r} using template {template_name!r} ",
+              end=('' if spiders_module else '\n'))
         if spiders_module:
-            print("in module:\n %s.%s" % (spiders_module.__name__, module))
+            print(f"in module:\n {spiders_module.__name__}.{module}")

     def _find_template(self, template):
-        template_file = join(self.templates_dir, '%s.tmpl' % template)
+        template_file = join(self.templates_dir, f'{template}.tmpl')
         if exists(template_file):
             return template_file
-        print("Unable to find template: %s\n" % template)
+        print(f"Unable to find template: {template}\n")
         print('Use "scrapy genspider --list" to see all available templates.')

     def _list_templates(self):
         print("Available templates:")
         for filename in sorted(os.listdir(self.templates_dir)):
             if filename.endswith('.tmpl'):
-                print(" %s" % splitext(filename)[0])
+                print(f" {splitext(filename)[0]}")

     def _spider_exists(self, name):
         if not self.settings.get('NEWSPIDER_MODULE'):
             # if run as a standalone command and file with same filename already exists
             if exists(name + ".py"):
-                print("%s already exists" % (abspath(name + ".py")))
+                print(f"{abspath(name + '.py')} already exists")
                 return True
             return False

@@ -126,8 +127,8 @@ class Command(ScrapyCommand):
             pass
         else:
             # if spider with same name exists
-            print("Spider %r already exists in module:" % name)
-            print(" %s" % spidercls.__module__)
+            print(f"Spider {name!r} already exists in module:")
+            print(f" {spidercls.__module__}")
             return True

         # a file with the same name exists in the target directory
@@ -135,7 +136,7 @@ class Command(ScrapyCommand):
         spiders_dir = dirname(spiders_module.__file__)
         spiders_dir_abs = abspath(spiders_dir)
         if exists(join(spiders_dir_abs, name + ".py")):
-            print("%s already exists" % (join(spiders_dir_abs, (name + ".py"))))
+            print(f"{join(spiders_dir_abs, (name + '.py'))} already exists")
             return True

         return False
@@ -96,13 +96,13 @@ class Command(BaseRunSpiderCommand):

         if opts.verbose:
             for level in range(1, self.max_level + 1):
-                print('\n>>> DEPTH LEVEL: %s <<<' % level)
+                print(f'\n>>> DEPTH LEVEL: {level} <<<')
                 if not opts.noitems:
                     self.print_items(level, colour)
                 if not opts.nolinks:
                     self.print_requests(level, colour)
         else:
-            print('\n>>> STATUS DEPTH LEVEL %s <<<' % self.max_level)
+            print(f'\n>>> STATUS DEPTH LEVEL {self.max_level} <<<')
             if not opts.noitems:
                 self.print_items(colour=colour)
             if not opts.nolinks:
@@ -12,7 +12,7 @@ def _import_file(filepath):
     dirname, file = os.path.split(abspath)
     fname, fext = os.path.splitext(file)
     if fext != '.py':
-        raise ValueError("Not a Python source file: %s" % abspath)
+        raise ValueError(f"Not a Python source file: {abspath}")
     if dirname:
         sys.path = [dirname] + sys.path
     try:
@@ -42,14 +42,14 @@ class Command(BaseRunSpiderCommand):
             raise UsageError()
         filename = args[0]
         if not os.path.exists(filename):
-            raise UsageError("File not found: %s\n" % filename)
+            raise UsageError(f"File not found: {filename}\n")
         try:
             module = _import_file(filename)
         except (ImportError, ValueError) as e:
-            raise UsageError("Unable to load %r: %s\n" % (filename, e))
+            raise UsageError(f"Unable to load {filename!r}: {e}\n")
         spclasses = list(iter_spider_classes(module))
         if not spclasses:
-            raise UsageError("No spider found in file: %s\n" % filename)
+            raise UsageError(f"No spider found in file: {filename}\n")
         spidercls = spclasses.pop()

         self.crawler_process.crawl(spidercls, **opts.spargs)
@@ -52,7 +52,7 @@ class Command(ScrapyCommand):
             print('Error: Project names must begin with a letter and contain'
                   ' only\nletters, numbers and underscores')
         elif _module_exists(project_name):
-            print('Error: Module %r already exists' % project_name)
+            print(f'Error: Module {project_name!r} already exists')
         else:
             return True
         return False
@@ -100,7 +100,7 @@ class Command(ScrapyCommand):

         if exists(join(project_dir, 'scrapy.cfg')):
             self.exitcode = 1
-            print('Error: scrapy.cfg already exists in %s' % abspath(project_dir))
+            print(f'Error: scrapy.cfg already exists in {abspath(project_dir)}')
             return

         if not self._is_valid_name(project_name):
@@ -113,11 +113,11 @@ class Command(ScrapyCommand):
             path = join(*paths)
             tplfile = join(project_dir, string.Template(path).substitute(project_name=project_name))
             render_templatefile(tplfile, project_name=project_name, ProjectName=string_camelcase(project_name))
-        print("New Scrapy project '%s', using template directory '%s', "
-              "created in:" % (project_name, self.templates_dir))
-        print(" %s\n" % abspath(project_dir))
+        print(f"New Scrapy project '{project_name}', using template directory "
+              f"'{self.templates_dir}', created in:")
+        print(f" {abspath(project_dir)}\n")
         print("You can start your first spider with:")
-        print(" cd %s" % project_dir)
+        print(f" cd {project_dir}")
         print(" scrapy genspider example example.com")

     @property
@@ -23,8 +23,7 @@ class Command(ScrapyCommand):
         if opts.verbose:
             versions = scrapy_components_versions()
             width = max(len(n) for (n, _) in versions)
-            patt = "%-{}s : %s".format(width)
             for name, version in versions:
-                print(patt % (name, version))
+                print(f"{name:<{width}} : {version}")
         else:
-            print("Scrapy %s" % scrapy.__version__)
+            print(f"Scrapy {scrapy.__version__}")
@@ -112,8 +112,8 @@ class Contract:
     request_cls = None

     def __init__(self, method, *args):
-        self.testcase_pre = _create_testcase(method, '@%s pre-hook' % self.name)
-        self.testcase_post = _create_testcase(method, '@%s post-hook' % self.name)
+        self.testcase_pre = _create_testcase(method, f'@{self.name} pre-hook')
+        self.testcase_post = _create_testcase(method, f'@{self.name} post-hook')
         self.args = args

     def add_pre_hook(self, request, results):
@@ -172,8 +172,8 @@ def _create_testcase(method, desc):

     class ContractTestCase(TestCase):
         def __str__(_self):
-            return "[%s] %s (%s)" % (spider, method.__name__, desc)
+            return f"[{spider}] {method.__name__} ({desc})"

-    name = '%s_%s' % (spider, method.__name__)
+    name = f'{spider}_{method.__name__}'
     setattr(ContractTestCase, name, lambda x: x)
     return ContractTestCase(name)
@@ -60,8 +60,7 @@ class ReturnsContract(Contract):

         if len(self.args) not in [1, 2, 3]:
             raise ValueError(
-                "Incorrect argument quantity: expected 1, 2 or 3, got %i"
-                % len(self.args)
+                f"Incorrect argument quantity: expected 1, 2 or 3, got {len(self.args)}"
             )
         self.obj_name = self.args[0] or None
         self.obj_type_verifier = self.object_type_verifiers[self.obj_name]
@@ -88,10 +87,9 @@ class ReturnsContract(Contract):
             if self.min_bound == self.max_bound:
                 expected = self.min_bound
             else:
-                expected = '%s..%s' % (self.min_bound, self.max_bound)
+                expected = f'{self.min_bound}..{self.max_bound}'

-            raise ContractFail("Returned %s %s, expected %s" %
-                               (occurrences, self.obj_name, expected))
+            raise ContractFail(f"Returned {occurrences} {self.obj_name}, expected {expected}")


 class ScrapesContract(Contract):
@@ -106,5 +104,5 @@ class ScrapesContract(Contract):
             if is_item(x):
                 missing = [arg for arg in self.args if arg not in ItemAdapter(x)]
                 if missing:
-                    missing_str = ", ".join(missing)
-                    raise ContractFail("Missing fields: %s" % missing_str)
+                    missing_fields = ", ".join(missing)
+                    raise ContractFail(f"Missing fields: {missing_fields}")
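These contract classes are what ``scrapy check`` exercises. For readers unfamiliar with them, a hedged sketch of how a spider callback declares ``@returns`` and ``@scrapes`` contracts in its docstring; the URL, selectors, and field names are illustrative:

    import scrapy


    class DemoSpider(scrapy.Spider):
        name = "demo"  # illustrative

        def parse(self, response):
            """Parse a product listing page.

            @url http://www.example.com/products
            @returns items 1 16
            @returns requests 0 0
            @scrapes name price
            """
            for product in response.css("div.product"):
                yield {
                    "name": product.css("h2::text").get(),
                    "price": product.css(".price::text").get(),
                }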
@@ -41,17 +41,17 @@ class Slot:

     def __repr__(self):
         cls_name = self.__class__.__name__
-        return "%s(concurrency=%r, delay=%0.2f, randomize_delay=%r)" % (
-            cls_name, self.concurrency, self.delay, self.randomize_delay)
+        return (f"{cls_name}(concurrency={self.concurrency!r}, "
+                f"delay={self.delay:.2f}, "
+                f"randomize_delay={self.randomize_delay!r})")

     def __str__(self):
         return (
-            "<downloader.Slot concurrency=%r delay=%0.2f randomize_delay=%r "
-            "len(active)=%d len(queue)=%d len(transferring)=%d lastseen=%s>" % (
-                self.concurrency, self.delay, self.randomize_delay,
-                len(self.active), len(self.queue), len(self.transferring),
-                datetime.fromtimestamp(self.lastseen).isoformat()
-            )
+            f"<downloader.Slot concurrency={self.concurrency!r} "
+            f"delay={self.delay:.2f} randomize_delay={self.randomize_delay!r} "
+            f"len(active)={len(self.active)} len(queue)={len(self.queue)} "
+            f"len(transferring)={len(self.transferring)} "
+            f"lastseen={datetime.fromtimestamp(self.lastseen).isoformat()}>"
         )
@@ -71,8 +71,7 @@ class DownloadHandlers:
         scheme = urlparse_cached(request).scheme
         handler = self._get_handler(scheme)
         if not handler:
-            raise NotSupported("Unsupported URL scheme '%s': %s" %
-                               (scheme, self._notconfigured[scheme]))
+            raise NotSupported(f"Unsupported URL scheme '{scheme}': {self._notconfigured[scheme]}")
         return handler.download_request(request, spider)

     @defer.inlineCallbacks
@@ -60,11 +60,11 @@ class HTTP11DownloadHandler:
                 settings=settings,
                 crawler=crawler,
             )
-            msg = """
-'%s' does not accept `method` argument (type OpenSSL.SSL method,\
-e.g. OpenSSL.SSL.SSLv23_METHOD) and/or `tls_verbose_logging` argument and/or `tls_ciphers` argument.\
-Please upgrade your context factory class to handle them or ignore them.""" % (
-                settings['DOWNLOADER_CLIENTCONTEXTFACTORY'],)
+            msg = f"""
+'{settings["DOWNLOADER_CLIENTCONTEXTFACTORY"]}' does not accept `method` \
+argument (type OpenSSL.SSL method, e.g. OpenSSL.SSL.SSLv23_METHOD) and/or \
+`tls_verbose_logging` argument and/or `tls_ciphers` argument.\
+Please upgrade your context factory class to handle them or ignore them."""
             warnings.warn(msg)
         self._default_maxsize = settings.getint('DOWNLOAD_MAXSIZE')
         self._default_warnsize = settings.getint('DOWNLOAD_WARNSIZE')
@@ -169,8 +169,9 @@ class TunnelingTCP4ClientEndpoint(TCP4ClientEndpoint):
         else:
             extra = rcvd_bytes[:32]
             self._tunnelReadyDeferred.errback(
-                TunnelError('Could not open CONNECT tunnel with proxy %s:%s [%r]' % (
-                    self._host, self._port, extra)))
+                TunnelError('Could not open CONNECT tunnel with proxy '
+                            f'{self._host}:{self._port} [{extra!r}]')
+            )

     def connectFailed(self, reason):
         """Propagates the errback to the appropriate deferred."""
@@ -371,7 +372,7 @@ class ScrapyAgent:
         if self._txresponse:
             self._txresponse._transport.stopProducing()

-        raise TimeoutError("Getting %s took longer than %s seconds." % (url, timeout))
+        raise TimeoutError(f"Getting {url} took longer than {timeout} seconds.")

     def _cb_latency(self, result, request, start_time):
         request.meta['download_latency'] = time() - start_time
@@ -56,7 +56,7 @@ class S3DownloadHandler:
             import botocore.credentials
             kw.pop('anon', None)
             if kw:
-                raise TypeError('Unexpected keyword arguments: %s' % kw)
+                raise TypeError(f'Unexpected keyword arguments: {kw}')
             if not self.anon:
                 SignerCls = botocore.auth.AUTH_TYPE_MAPS['s3']
                 self._signer = SignerCls(botocore.credentials.Credentials(
@@ -85,14 +85,14 @@ class S3DownloadHandler:
         scheme = 'https' if request.meta.get('is_secure') else 'http'
         bucket = p.hostname
         path = p.path + '?' + p.query if p.query else p.path
-        url = '%s://%s.s3.amazonaws.com%s' % (scheme, bucket, path)
+        url = f'{scheme}://{bucket}.s3.amazonaws.com{path}'
         if self.anon:
             request = request.replace(url=url)
         elif self._signer is not None:
             import botocore.awsrequest
             awsrequest = botocore.awsrequest.AWSRequest(
                 method=request.method,
-                url='%s://s3.amazonaws.com/%s%s' % (scheme, bucket, path),
+                url=f'{scheme}://s3.amazonaws.com/{bucket}{path}',
                 headers=request.headers.to_unicode_dict(),
                 data=request.body)
             self._signer.add_auth(awsrequest)
@@ -36,8 +36,9 @@ class DownloaderMiddlewareManager(MiddlewareManager):
             response = yield deferred_from_coro(method(request=request, spider=spider))
             if response is not None and not isinstance(response, (Response, Request)):
                 raise _InvalidOutput(
-                    "Middleware %s.process_request must return None, Response or Request, got %s"
-                    % (method.__self__.__class__.__name__, response.__class__.__name__)
+                    f"Middleware {method.__self__.__class__.__name__}"
+                    ".process_request must return None, Response or "
+                    f"Request, got {response.__class__.__name__}"
                 )
             if response:
                 return response
@@ -54,8 +55,9 @@ class DownloaderMiddlewareManager(MiddlewareManager):
             response = yield deferred_from_coro(method(request=request, response=response, spider=spider))
             if not isinstance(response, (Response, Request)):
                 raise _InvalidOutput(
-                    "Middleware %s.process_response must return Response or Request, got %s"
-                    % (method.__self__.__class__.__name__, type(response))
+                    f"Middleware {method.__self__.__class__.__name__}"
+                    ".process_response must return Response or Request, "
+                    f"got {type(response)}"
                 )
             if isinstance(response, Request):
                 return response
@@ -68,8 +70,9 @@ class DownloaderMiddlewareManager(MiddlewareManager):
             response = yield deferred_from_coro(method(request=request, exception=exception, spider=spider))
             if response is not None and not isinstance(response, (Response, Request)):
                 raise _InvalidOutput(
-                    "Middleware %s.process_exception must return None, Response or Request, got %s"
-                    % (method.__self__.__class__.__name__, type(response))
+                    f"Middleware {method.__self__.__class__.__name__}"
+                    ".process_exception must return None, Response or "
+                    f"Request, got {type(response)}"
                 )
             if response:
                 return response
@@ -88,8 +88,8 @@ class ScrapyHTTPPageGetter(HTTPClient):
         self.transport.stopProducing()

         self.factory.noPage(
-            defer.TimeoutError("Getting %s took longer than %s seconds."
-                               % (self.factory.url, self.factory.timeout)))
+            defer.TimeoutError(f"Getting {self.factory.url} took longer "
+                               f"than {self.factory.timeout} seconds."))


 # This class used to inherit from Twisted’s
@@ -155,7 +155,7 @@ class ScrapyHTTPClientFactory(ClientFactory):
             self.headers['Content-Length'] = 0

     def __repr__(self):
-        return "<%s: %s>" % (self.__class__.__name__, self.url)
+        return f"<{self.__class__.__name__}: {self.url}>"

     def _cancelTimeout(self, result, timeoutCall):
         if timeoutCall.active():
@@ -199,8 +199,8 @@ class ExecutionEngine:
     def _handle_downloader_output(self, response, request, spider):
         if not isinstance(response, (Request, Response, Failure)):
             raise TypeError(
-                "Incorrect type: expected Request, Response or Failure, got %s: %r"
-                % (type(response), response)
+                "Incorrect type: expected Request, Response or Failure, got "
+                f"{type(response)}: {response!r}"
             )
         # downloader middleware can return requests (for example, redirects)
         if isinstance(response, Request):
@@ -242,7 +242,7 @@ class ExecutionEngine:

     def crawl(self, request, spider):
         if spider not in self.open_spiders:
-            raise RuntimeError("Spider %r not opened when crawling: %s" % (spider.name, request))
+            raise RuntimeError(f"Spider {spider.name!r} not opened when crawling: {request}")
         self.schedule(request, spider)
         self.slot.nextcall.schedule()

@@ -267,8 +267,8 @@ class ExecutionEngine:
         def _on_success(response):
             if not isinstance(response, (Response, Request)):
                 raise TypeError(
-                    "Incorrect type: expected Response or Request, got %s: %r"
-                    % (type(response), response)
+                    "Incorrect type: expected Response or Request, got "
+                    f"{type(response)}: {response!r}"
                 )
             if isinstance(response, Response):
                 if response.request is None:
@@ -296,7 +296,7 @@ class ExecutionEngine:
     @defer.inlineCallbacks
     def open_spider(self, spider, start_requests=(), close_if_idle=True, new_queue_behavior=False):
         if not self.has_capacity():
-            raise RuntimeError("No free spider slot when opening %r" % spider.name)
+            raise RuntimeError(f"No free spider slot when opening {spider.name!r}")
         logger.info("Spider opened", extra={'spider': spider})
         nextcall = CallLaterOnce(self._next_request, spider)
         scheduler = self.scheduler_cls.from_crawler(self.crawler)
@@ -125,7 +125,7 @@ class Scraper:
         Handle the downloaded response or failure through the spider callback/errback
         """
         if not isinstance(result, (Response, Failure)):
-            raise TypeError("Incorrect type: expected Response or Failure, got %s: %r" % (type(result), result))
+            raise TypeError(f"Incorrect type: expected Response or Failure, got {type(result)}: {result!r}")
         dfd = self._scrape2(result, request, spider)  # returns spider's processed output
         dfd.addErrback(self.handle_spider_error, request, result, spider)
         dfd.addCallback(self.handle_spider_output, request, result, spider)
@@ -173,7 +173,7 @@ class Scraper:
                 spider=spider
             )
             self.crawler.stats.inc_value(
-                "spider_exceptions/%s" % _failure.value.__class__.__name__,
+                f"spider_exceptions/{_failure.value.__class__.__name__}",
                 spider=spider
             )
@@ -19,10 +19,7 @@ def _isiterable(possible_iterator):


 def _fname(f):
-    return "{}.{}".format(
-        f.__self__.__class__.__name__,
-        f.__func__.__name__
-    )
+    return f"{f.__self__.__class__.__name__}.{f.__func__.__name__}"


 class SpiderMiddlewareManager(MiddlewareManager):
@@ -51,8 +48,9 @@ class SpiderMiddlewareManager(MiddlewareManager):
             try:
                 result = method(response=response, spider=spider)
                 if result is not None:
-                    msg = "Middleware {} must return None or raise an exception, got {}"
-                    raise _InvalidOutput(msg.format(_fname(method), type(result)))
+                    msg = (f"Middleware {_fname(method)} must return None "
+                           f"or raise an exception, got {type(result)}")
+                    raise _InvalidOutput(msg)
             except _InvalidOutput:
                 raise
             except Exception:
@@ -86,8 +84,9 @@ class SpiderMiddlewareManager(MiddlewareManager):
                 elif result is None:
                     continue
                 else:
-                    msg = "Middleware {} must return None or an iterable, got {}"
-                    raise _InvalidOutput(msg.format(_fname(method), type(result)))
+                    msg = (f"Middleware {_fname(method)} must return None "
+                           f"or an iterable, got {type(result)}")
+                    raise _InvalidOutput(msg)
             return _failure

         def process_spider_output(result, start_index=0):
@@ -110,8 +109,9 @@ class SpiderMiddlewareManager(MiddlewareManager):
                 if _isiterable(result):
                     result = _evaluate_iterable(result, method_index + 1, recovered)
                 else:
-                    msg = "Middleware {} must return an iterable, got {}"
-                    raise _InvalidOutput(msg.format(_fname(method), type(result)))
+                    msg = (f"Middleware {_fname(method)} must return an "
+                           f"iterable, got {type(result)}")
+                    raise _InvalidOutput(msg)

             return MutableChain(result, recovered)
@@ -54,8 +54,8 @@ class CookiesMiddleware:
         cl = [to_unicode(c, errors='replace')
               for c in request.headers.getlist('Cookie')]
         if cl:
-            cookies = "\n".join("Cookie: {}\n".format(c) for c in cl)
-            msg = "Sending cookies to: {}\n{}".format(request, cookies)
+            cookies = "\n".join(f"Cookie: {c}\n" for c in cl)
+            msg = f"Sending cookies to: {request}\n{cookies}"
             logger.debug(msg, extra={'spider': spider})

     def _debug_set_cookie(self, response, spider):
@@ -63,8 +63,8 @@ class CookiesMiddleware:
         cl = [to_unicode(c, errors='replace')
               for c in response.headers.getlist('Set-Cookie')]
         if cl:
-            cookies = "\n".join("Set-Cookie: {}\n".format(c) for c in cl)
-            msg = "Received cookies from: {}\n{}".format(response, cookies)
+            cookies = "\n".join(f"Set-Cookie: {c}\n" for c in cl)
+            msg = f"Received cookies from: {response}\n{cookies}"
             logger.debug(msg, extra={'spider': spider})

     def _format_cookie(self, cookie, request):
@@ -90,9 +90,9 @@ class CookiesMiddleware:
                                    request, cookie)
                     decoded[key] = cookie[key].decode("latin1", errors="replace")

-        cookie_str = "{}={}".format(decoded.pop("name"), decoded.pop("value"))
+        cookie_str = f"{decoded.pop('name')}={decoded.pop('value')}"
         for key, value in decoded.items():  # path, domain
-            cookie_str += "; {}={}".format(key.capitalize(), value)
+            cookie_str += f"; {key.capitalize()}={value}"
         return cookie_str

     def _get_request_cookies(self, jar, request):
@@ -24,7 +24,7 @@ class HttpProxyMiddleware:

     def _basic_auth_header(self, username, password):
         user_pass = to_bytes(
-            '%s:%s' % (unquote(username), unquote(password)),
+            f'{unquote(username)}:{unquote(password)}',
             encoding=self.auth_encoding)
         return base64.b64encode(user_pass)
@@ -88,7 +88,7 @@ class RetryMiddleware:
             reason = global_object_name(reason.__class__)

             stats.inc_value('retry/count')
-            stats.inc_value('retry/reason_count/%s' % reason)
+            stats.inc_value(f'retry/reason_count/{reason}')
             return retryreq
         else:
             stats.inc_value('retry/max_reached')
@@ -61,7 +61,7 @@ class RobotsTxtMiddleware:

         if netloc not in self._parsers:
             self._parsers[netloc] = Deferred()
-            robotsurl = "%s://%s/robots.txt" % (url.scheme, url.netloc)
+            robotsurl = f"{url.scheme}://{url.netloc}/robots.txt"
             robotsreq = Request(
                 robotsurl,
                 priority=self.DOWNLOAD_PRIORITY,
@@ -94,7 +94,7 @@ class RobotsTxtMiddleware:

     def _parse_robots(self, response, netloc, spider):
         self.crawler.stats.inc_value('robotstxt/response_count')
-        self.crawler.stats.inc_value('robotstxt/response_status_count/{}'.format(response.status))
+        self.crawler.stats.inc_value(f'robotstxt/response_status_count/{response.status}')
         rp = self._parserimpl.from_crawler(self.crawler, response.body)
         rp_dfd = self._parsers[netloc]
         self._parsers[netloc] = rp
@@ -102,7 +102,7 @@ class RobotsTxtMiddleware:

     def _robots_error(self, failure, netloc):
         if failure.type is not IgnoreRequest:
-            key = 'robotstxt/exception_count/{}'.format(failure.type)
+            key = f'robotstxt/exception_count/{failure.type}'
             self.crawler.stats.inc_value(key)
         rp_dfd = self._parsers[netloc]
         self._parsers[netloc] = None
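This middleware only fetches and enforces robots.txt when robots.txt obedience is enabled in the project settings; a minimal sketch, where the stat names mentioned in the comment are the ones visible in the hunk above:

    # settings.py
    ROBOTSTXT_OBEY = True  # enables RobotsTxtMiddleware; disallowed URLs are then skipped

    # After a crawl, the stats collector will contain counters such as
    # 'robotstxt/response_count' and 'robotstxt/response_status_count/200'.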
@@ -17,13 +17,13 @@ class DownloaderStats:

     def process_request(self, request, spider):
         self.stats.inc_value('downloader/request_count', spider=spider)
-        self.stats.inc_value('downloader/request_method_count/%s' % request.method, spider=spider)
+        self.stats.inc_value(f'downloader/request_method_count/{request.method}', spider=spider)
         reqlen = len(request_httprepr(request))
         self.stats.inc_value('downloader/request_bytes', reqlen, spider=spider)

     def process_response(self, request, response, spider):
         self.stats.inc_value('downloader/response_count', spider=spider)
-        self.stats.inc_value('downloader/response_status_count/%s' % response.status, spider=spider)
+        self.stats.inc_value(f'downloader/response_status_count/{response.status}', spider=spider)
         reslen = len(response_httprepr(response))
         self.stats.inc_value('downloader/response_bytes', reslen, spider=spider)
         return response
@@ -31,4 +31,4 @@ class DownloaderStats:
     def process_exception(self, request, exception, spider):
         ex_class = global_object_name(exception.__class__)
         self.stats.inc_value('downloader/exception_count', spider=spider)
-        self.stats.inc_value('downloader/exception_type_count/%s' % ex_class, spider=spider)
+        self.stats.inc_value(f'downloader/exception_type_count/{ex_class}', spider=spider)
@@ -39,7 +39,7 @@ class BaseItemExporter:
         self.export_empty_fields = options.pop('export_empty_fields', False)
         self.indent = options.pop('indent', None)
         if not dont_fail and options:
-            raise TypeError("Unexpected options: %s" % ', '.join(options.keys()))
+            raise TypeError(f"Unexpected options: {', '.join(options.keys())}")

     def export_item(self, item):
         raise NotImplementedError
@@ -43,4 +43,4 @@ class CoreStats:
     def item_dropped(self, item, spider, exception):
         reason = exception.__class__.__name__
         self.stats.inc_value('item_dropped_count', spider=spider)
-        self.stats.inc_value('item_dropped_reasons_count/%s' % reason, spider=spider)
+        self.stats.inc_value(f'item_dropped_reasons_count/{reason}', spider=spider)
@@ -48,7 +48,7 @@ class StackTraceDump:
         for id_, frame in sys._current_frames().items():
             name = id2name.get(id_, '')
             dump = ''.join(traceback.format_stack(frame))
-            dumps += "# Thread: {0}({1})\n{2}\n".format(name, id_, dump)
+            dumps += f"# Thread: {name}({id_})\n{dump}\n"
         return dumps
@@ -223,7 +223,7 @@ class DbmCacheStorage:
         self.db = None

     def open_spider(self, spider):
-        dbpath = os.path.join(self.cachedir, '%s.db' % spider.name)
+        dbpath = os.path.join(self.cachedir, f'{spider.name}.db')
         self.db = self.dbmodule.open(dbpath, 'c')

         logger.debug("Using DBM cache storage in %(cachepath)s" % {'cachepath': dbpath}, extra={'spider': spider})
@@ -251,13 +251,13 @@ class DbmCacheStorage:
             'headers': dict(response.headers),
             'body': response.body,
         }
-        self.db['%s_data' % key] = pickle.dumps(data, protocol=4)
-        self.db['%s_time' % key] = str(time())
+        self.db[f'{key}_data'] = pickle.dumps(data, protocol=4)
+        self.db[f'{key}_time'] = str(time())

     def _read_data(self, spider, request):
         key = self._request_key(request)
         db = self.db
-        tkey = '%s_time' % key
+        tkey = f'{key}_time'
         if tkey not in db:
             return  # not found

@@ -265,7 +265,7 @@ class DbmCacheStorage:
         if 0 < self.expiration_secs < time() - float(ts):
             return  # expired

-        return pickle.loads(db['%s_data' % key])
+        return pickle.loads(db[f'{key}_data'])

     def _request_key(self, request):
         return request_fingerprint(request)
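The DBM storage above is one of the pluggable backends of the HTTP cache extension and is selected through settings, roughly like this (a sketch using the documented setting names):

    # settings.py
    HTTPCACHE_ENABLED = True
    HTTPCACHE_EXPIRATION_SECS = 3600  # 0 keeps cached responses forever
    HTTPCACHE_DIR = "httpcache"
    # Use the DBM-backed storage instead of the default filesystem storage.
    HTTPCACHE_STORAGE = "scrapy.extensions.httpcache.DbmCacheStorage"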
@@ -30,4 +30,4 @@ class MemoryDebugger:
         for cls, wdict in live_refs.items():
             if not wdict:
                 continue
-            self.stats.set_value('memdebug/live_refs/%s' % cls.__name__, len(wdict), spider=spider)
+            self.stats.set_value(f'memdebug/live_refs/{cls.__name__}', len(wdict), spider=spider)
@ -82,8 +82,8 @@ class MemoryUsage:
|
||||
{'memusage': mem}, extra={'crawler': self.crawler})
|
||||
if self.notify_mails:
|
||||
subj = (
|
||||
"%s terminated: memory usage exceeded %dM at %s"
|
||||
% (self.crawler.settings['BOT_NAME'], mem, socket.gethostname())
|
||||
f"{self.crawler.settings['BOT_NAME']} terminated: "
|
||||
f"memory usage exceeded {mem}M at {socket.gethostname()}"
|
||||
)
|
||||
self._send_report(self.notify_mails, subj)
|
||||
self.crawler.stats.set_value('memusage/limit_notified', 1)
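Adjacent string literals are concatenated at compile time, which is what lets the subject line above be split across two source lines; each piece that interpolates needs its own f prefix, otherwise its braces would stay literal. A small sketch with made-up values:

# Illustrative only: adjacent literals concatenate; each interpolated piece needs its own f prefix
bot, mem, host = 'scrapybot', 512, 'worker-1'
subj = (
    f"{bot} terminated: "
    f"memory usage exceeded {mem}M at {host}"
)
assert subj == "scrapybot terminated: memory usage exceeded 512M at worker-1"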
|
||||
@ -105,8 +105,8 @@ class MemoryUsage:
|
||||
{'memusage': mem}, extra={'crawler': self.crawler})
|
||||
if self.notify_mails:
|
||||
subj = (
|
||||
"%s warning: memory usage reached %dM at %s"
|
||||
% (self.crawler.settings['BOT_NAME'], mem, socket.gethostname())
|
||||
f"{self.crawler.settings['BOT_NAME']} warning: "
|
||||
f"memory usage reached {mem}M at {socket.gethostname()}"
|
||||
)
|
||||
self._send_report(self.notify_mails, subj)
|
||||
self.crawler.stats.set_value('memusage/warning_notified', 1)
|
||||
@ -115,9 +115,9 @@ class MemoryUsage:
|
||||
def _send_report(self, rcpts, subject):
|
||||
"""send notification mail with some additional useful info"""
|
||||
stats = self.crawler.stats
|
||||
s = "Memory usage at engine startup : %dM\r\n" % (stats.get_value('memusage/startup')/1024/1024)
|
||||
s += "Maximum memory usage : %dM\r\n" % (stats.get_value('memusage/max')/1024/1024)
|
||||
s += "Current memory usage : %dM\r\n" % (self.get_virtual_size()/1024/1024)
|
||||
s = f"Memory usage at engine startup : {stats.get_value('memusage/startup')/1024/1024}M\r\n"
|
||||
s += f"Maximum memory usage : {stats.get_value('memusage/max')/1024/1024}M\r\n"
|
||||
s += f"Current memory usage : {self.get_virtual_size()/1024/1024}M\r\n"
|
||||
|
||||
s += "ENGINE STATUS ------------------------------------------------------- \r\n"
|
||||
s += "\r\n"
|
||||
|
@ -24,11 +24,11 @@ class StatsMailer:
|
||||
o = cls(crawler.stats, recipients, mail)
|
||||
crawler.signals.connect(o.spider_closed, signal=signals.spider_closed)
|
||||
return o
|
||||
|
||||
|
||||
def spider_closed(self, spider):
|
||||
spider_stats = self.stats.get_stats(spider)
|
||||
body = "Global stats\n\n"
|
||||
body += "\n".join("%-50s : %s" % i for i in self.stats.get_stats().items())
|
||||
body += "\n\n%s stats\n\n" % spider.name
|
||||
body += "\n".join("%-50s : %s" % i for i in spider_stats.items())
|
||||
return self.mail.send(self.recipients, "Scrapy stats for: %s" % spider.name, body)
|
||||
body += "\n".join(f"{k:<50} : {v}" for k, v in self.stats.get_stats().items())
|
||||
body += f"\n\n{spider.name} stats\n\n"
|
||||
body += "\n".join(f"{k:<50} : {v}" for k, v in spider_stats.items())
|
||||
return self.mail.send(self.recipients, f"Scrapy stats for: {spider.name}", body)
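The old code pushed each (key, value) tuple through '%-50s : %s'; the f-string version unpacks k, v and uses the format spec :<50, the same left-justified 50-character field. Roughly, with an illustrative stat:

# Illustrative only: '%-50s' and ':<50' pad to the same left-justified field
k, v = 'downloader/request_count', 17
assert "%-50s : %s" % (k, v) == f"{k:<50} : {v}"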
|
||||
|
@ -1,6 +1,6 @@
|
||||
def obsolete_setter(setter, attrname):
|
||||
def newsetter(self, value):
|
||||
c = self.__class__.__name__
|
||||
msg = "%s.%s is not modifiable, use %s.replace() instead" % (c, attrname, c)
|
||||
msg = f"{c}.{attrname} is not modifiable, use {c}.replace() instead"
|
||||
raise AttributeError(msg)
|
||||
return newsetter
|
||||
|
@ -33,7 +33,7 @@ class Headers(CaselessDict):
|
||||
elif isinstance(x, int):
|
||||
return str(x).encode(self.encoding)
|
||||
else:
|
||||
raise TypeError('Unsupported value type: {}'.format(type(x)))
|
||||
raise TypeError(f'Unsupported value type: {type(x)}')
|
||||
|
||||
def __getitem__(self, key):
|
||||
try:
|
||||
|
@ -25,13 +25,13 @@ class Request(object_ref):
|
||||
self._set_url(url)
|
||||
self._set_body(body)
|
||||
if not isinstance(priority, int):
|
||||
raise TypeError("Request priority not an integer: %r" % priority)
|
||||
raise TypeError(f"Request priority not an integer: {priority!r}")
|
||||
self.priority = priority
|
||||
|
||||
if callback is not None and not callable(callback):
|
||||
raise TypeError('callback must be a callable, got %s' % type(callback).__name__)
|
||||
raise TypeError(f'callback must be a callable, got {type(callback).__name__}')
|
||||
if errback is not None and not callable(errback):
|
||||
raise TypeError('errback must be a callable, got %s' % type(errback).__name__)
|
||||
raise TypeError(f'errback must be a callable, got {type(errback).__name__}')
|
||||
self.callback = callback
|
||||
self.errback = errback
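The !r conversion used above applies repr() inside the replacement field, matching the old %r (likewise !s for str() and !a for ascii()). A minimal sketch:

# Illustrative only: %r and !r both render repr()
priority = "5"   # wrong type on purpose
assert ("Request priority not an integer: %r" % priority
        == f"Request priority not an integer: {priority!r}"
        == "Request priority not an integer: '5'")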
|
||||
|
||||
@ -60,13 +60,13 @@ class Request(object_ref):
|
||||
|
||||
def _set_url(self, url):
|
||||
if not isinstance(url, str):
|
||||
raise TypeError('Request url must be str or unicode, got %s:' % type(url).__name__)
|
||||
raise TypeError(f'Request url must be str or unicode, got {type(url).__name__}')
|
||||
|
||||
s = safe_url_string(url, self.encoding)
|
||||
self._url = escape_ajax(s)
|
||||
|
||||
if ('://' not in self._url) and (not self._url.startswith('data:')):
|
||||
raise ValueError('Missing scheme in request url: %s' % self._url)
|
||||
raise ValueError(f'Missing scheme in request url: {self._url}')
|
||||
|
||||
url = property(_get_url, obsolete_setter(_set_url, 'url'))
|
||||
|
||||
@ -86,7 +86,7 @@ class Request(object_ref):
|
||||
return self._encoding
|
||||
|
||||
def __str__(self):
|
||||
return "<%s %s>" % (self.method, self.url)
|
||||
return f"<{self.method} {self.url}>"
|
||||
|
||||
__repr__ = __str__
|
||||
|
||||
|
@ -80,15 +80,15 @@ def _get_form(response, formname, formid, formnumber, formxpath):
|
||||
base_url=get_base_url(response))
|
||||
forms = root.xpath('//form')
|
||||
if not forms:
|
||||
raise ValueError("No <form> element found in %s" % response)
|
||||
raise ValueError(f"No <form> element found in {response}")
|
||||
|
||||
if formname is not None:
|
||||
f = root.xpath('//form[@name="%s"]' % formname)
|
||||
f = root.xpath(f'//form[@name="{formname}"]')
|
||||
if f:
|
||||
return f[0]
|
||||
|
||||
if formid is not None:
|
||||
f = root.xpath('//form[@id="%s"]' % formid)
|
||||
f = root.xpath(f'//form[@id="{formid}"]')
|
||||
if f:
|
||||
return f[0]
|
||||
|
||||
@ -103,7 +103,7 @@ def _get_form(response, formname, formid, formnumber, formxpath):
|
||||
el = el.getparent()
|
||||
if el is None:
|
||||
break
|
||||
raise ValueError('No <form> element found with %s' % formxpath)
|
||||
raise ValueError(f'No <form> element found with {formxpath}')
|
||||
|
||||
# If we get here, it means that either formname was None
|
||||
# or invalid
|
||||
@ -111,8 +111,7 @@ def _get_form(response, formname, formid, formnumber, formxpath):
|
||||
try:
|
||||
form = forms[formnumber]
|
||||
except IndexError:
|
||||
raise IndexError("Form number %d not found in %s" %
|
||||
(formnumber, response))
|
||||
raise IndexError(f"Form number {formnumber} not found in {response}")
|
||||
else:
|
||||
return form
|
||||
|
||||
@ -205,12 +204,12 @@ def _get_clickable(clickdata, form):
|
||||
|
||||
# We didn't find it, so now we build an XPath expression out of the other
|
||||
# arguments, because they can be used as such
|
||||
xpath = './/*' + ''.join('[@%s="%s"]' % c for c in clickdata.items())
|
||||
xpath = './/*' + ''.join(f'[@{k}="{v}"]' for k, v in clickdata.items())
|
||||
el = form.xpath(xpath)
|
||||
if len(el) == 1:
|
||||
return (el[0].get('name'), el[0].get('value') or '')
|
||||
elif len(el) > 1:
|
||||
raise ValueError("Multiple elements found (%r) matching the criteria "
|
||||
"in clickdata: %r" % (el, clickdata))
|
||||
raise ValueError(f"Multiple elements found ({el!r}) matching the "
|
||||
f"criteria in clickdata: {clickdata!r}")
|
||||
else:
|
||||
raise ValueError('No clickable element matching clickdata: %r' % (clickdata,))
|
||||
raise ValueError(f'No clickable element matching clickdata: {clickdata!r}')
|
||||
|
@ -55,8 +55,8 @@ class Response(object_ref):
|
||||
if isinstance(url, str):
|
||||
self._url = url
|
||||
else:
|
||||
raise TypeError('%s url must be str, got %s:' %
|
||||
(type(self).__name__, type(url).__name__))
|
||||
raise TypeError(f'{type(self).__name__} url must be str, '
|
||||
f'got {type(url).__name__}')
|
||||
|
||||
url = property(_get_url, obsolete_setter(_set_url, 'url'))
|
||||
|
||||
@ -77,7 +77,7 @@ class Response(object_ref):
|
||||
body = property(_get_body, obsolete_setter(_set_body, 'body'))
|
||||
|
||||
def __str__(self):
|
||||
return "<%d %s>" % (self.status, self.url)
|
||||
return f"<{self.status} {self.url}>"
|
||||
|
||||
__repr__ = __str__
|
||||
|
||||
|
@ -47,8 +47,8 @@ class TextResponse(Response):
|
||||
self._body = b'' # used by encoding detection
|
||||
if isinstance(body, str):
|
||||
if self._encoding is None:
|
||||
raise TypeError('Cannot convert unicode body - %s has no encoding' %
|
||||
type(self).__name__)
|
||||
raise TypeError('Cannot convert unicode body - '
|
||||
f'{type(self).__name__} has no encoding')
|
||||
self._body = body.encode(self._encoding)
|
||||
else:
|
||||
super()._set_body(body)
|
||||
@ -92,7 +92,7 @@ class TextResponse(Response):
|
||||
# _body_inferred_encoding is called
|
||||
benc = self.encoding
|
||||
if self._cached_ubody is None:
|
||||
charset = 'charset=%s' % benc
|
||||
charset = f'charset={benc}'
|
||||
self._cached_ubody = html_to_unicode(charset, self.body)[1]
|
||||
return self._cached_ubody
|
||||
|
||||
@ -255,12 +255,11 @@ def _url_from_selector(sel):
|
||||
# e.g. ::attr(href) result
|
||||
return strip_html5_whitespace(sel.root)
|
||||
if not hasattr(sel.root, 'tag'):
|
||||
raise _InvalidSelector("Unsupported selector: %s" % sel)
|
||||
raise _InvalidSelector(f"Unsupported selector: {sel}")
|
||||
if sel.root.tag not in ('a', 'link'):
|
||||
raise _InvalidSelector("Only <a> and <link> elements are supported; got <%s>" %
|
||||
sel.root.tag)
|
||||
raise _InvalidSelector("Only <a> and <link> elements are supported; "
|
||||
f"got <{sel.root.tag}>")
|
||||
href = sel.root.get('href')
|
||||
if href is None:
|
||||
raise _InvalidSelector("<%s> element has no href attribute: %s" %
|
||||
(sel.root.tag, sel))
|
||||
raise _InvalidSelector(f"<{sel.root.tag}> element has no href attribute: {sel}")
|
||||
return strip_html5_whitespace(href)
|
||||
|
@ -96,19 +96,19 @@ class DictItem(MutableMapping, BaseItem):
|
||||
if key in self.fields:
|
||||
self._values[key] = value
|
||||
else:
|
||||
raise KeyError("%s does not support field: %s" % (self.__class__.__name__, key))
|
||||
raise KeyError(f"{self.__class__.__name__} does not support field: {key}")
|
||||
|
||||
def __delitem__(self, key):
|
||||
del self._values[key]
|
||||
|
||||
def __getattr__(self, name):
|
||||
if name in self.fields:
|
||||
raise AttributeError("Use item[%r] to get field value" % name)
|
||||
raise AttributeError(f"Use item[{name!r}] to get field value")
|
||||
raise AttributeError(name)
|
||||
|
||||
def __setattr__(self, name, value):
|
||||
if not name.startswith('_'):
|
||||
raise AttributeError("Use item[%r] = %r to set field value" % (name, value))
|
||||
raise AttributeError(f"Use item[{name!r}] = {value!r} to set field value")
|
||||
super().__setattr__(name, value)
|
||||
|
||||
def __len__(self):
|
||||
|
@ -14,7 +14,7 @@ class Link:
|
||||
def __init__(self, url, text='', fragment='', nofollow=False):
|
||||
if not isinstance(url, str):
|
||||
got = url.__class__.__name__
|
||||
raise TypeError("Link urls must be str objects, got %s" % got)
|
||||
raise TypeError(f"Link urls must be str objects, got {got}")
|
||||
self.url = url
|
||||
self.text = text
|
||||
self.fragment = fragment
|
||||
@ -33,6 +33,6 @@ class Link:
|
||||
|
||||
def __repr__(self):
|
||||
return (
|
||||
'Link(url=%r, text=%r, fragment=%r, nofollow=%r)'
|
||||
% (self.url, self.text, self.fragment, self.nofollow)
|
||||
f'Link(url={self.url!r}, text={self.text!r}, '
|
||||
f'fragment={self.fragment!r}, nofollow={self.nofollow!r})'
|
||||
)
|
||||
|
@ -54,8 +54,8 @@ class LogFormatter:
|
||||
|
||||
def crawled(self, request, response, spider):
|
||||
"""Logs a message when the crawler finds a webpage."""
|
||||
request_flags = ' %s' % str(request.flags) if request.flags else ''
|
||||
response_flags = ' %s' % str(response.flags) if response.flags else ''
|
||||
request_flags = f' {str(request.flags)}' if request.flags else ''
|
||||
response_flags = f' {str(response.flags)}' if response.flags else ''
|
||||
return {
|
||||
'level': logging.DEBUG,
|
||||
'msg': CRAWLEDMSG,
|
||||
|
@ -108,7 +108,7 @@ class S3FilesStore:
|
||||
from boto.s3.connection import S3Connection
|
||||
self.S3Connection = S3Connection
|
||||
if not uri.startswith("s3://"):
|
||||
raise ValueError("Incorrect URI scheme in %s, expected 's3'" % uri)
|
||||
raise ValueError(f"Incorrect URI scheme in {uri}, expected 's3'")
|
||||
self.bucket, self.prefix = uri[5:].split('/', 1)
|
||||
|
||||
def stat_file(self, path, info):
|
||||
@ -133,7 +133,7 @@ class S3FilesStore:
|
||||
return c.get_bucket(self.bucket, validate=False)
|
||||
|
||||
def _get_boto_key(self, path):
|
||||
key_name = '%s%s' % (self.prefix, path)
|
||||
key_name = f'{self.prefix}{path}'
|
||||
if self.is_botocore:
|
||||
return threads.deferToThread(
|
||||
self.s3_client.head_object,
|
||||
@ -145,7 +145,7 @@ class S3FilesStore:
|
||||
|
||||
def persist_file(self, path, buf, info, meta=None, headers=None):
|
||||
"""Upload file to S3 storage"""
|
||||
key_name = '%s%s' % (self.prefix, path)
|
||||
key_name = f'{self.prefix}{path}'
|
||||
buf.seek(0)
|
||||
if self.is_botocore:
|
||||
extra = self._headers_to_botocore_kwargs(self.HEADERS)
|
||||
@ -208,8 +208,7 @@ class S3FilesStore:
|
||||
try:
|
||||
kwarg = mapping[key]
|
||||
except KeyError:
|
||||
raise TypeError(
|
||||
'Header "%s" is not supported by botocore' % key)
|
||||
raise TypeError(f'Header "{key}" is not supported by botocore')
|
||||
else:
|
||||
extra[kwarg] = value
|
||||
return extra
|
||||
@ -283,7 +282,7 @@ class FTPFilesStore:
|
||||
|
||||
def __init__(self, uri):
|
||||
if not uri.startswith("ftp://"):
|
||||
raise ValueError("Incorrect URI scheme in %s, expected 'ftp'" % uri)
|
||||
raise ValueError(f"Incorrect URI scheme in {uri}, expected 'ftp'")
|
||||
u = urlparse(uri)
|
||||
self.port = u.port
|
||||
self.host = u.hostname
|
||||
@ -293,7 +292,7 @@ class FTPFilesStore:
|
||||
self.basedir = u.path.rstrip('/')
|
||||
|
||||
def persist_file(self, path, buf, info, meta=None, headers=None):
|
||||
path = '%s/%s' % (self.basedir, path)
|
||||
path = f'{self.basedir}/{path}'
|
||||
return threads.deferToThread(
|
||||
ftp_store_file, path=path, file=buf,
|
||||
host=self.host, port=self.port, username=self.username,
|
||||
@ -308,10 +307,10 @@ class FTPFilesStore:
|
||||
ftp.login(self.username, self.password)
|
||||
if self.USE_ACTIVE_MODE:
|
||||
ftp.set_pasv(False)
|
||||
file_path = "%s/%s" % (self.basedir, path)
|
||||
last_modified = float(ftp.voidcmd("MDTM %s" % file_path)[4:].strip())
|
||||
file_path = f"{self.basedir}/{path}"
|
||||
last_modified = float(ftp.voidcmd(f"MDTM {file_path}")[4:].strip())
|
||||
m = hashlib.md5()
|
||||
ftp.retrbinary('RETR %s' % file_path, m.update)
|
||||
ftp.retrbinary(f'RETR {file_path}', m.update)
|
||||
return {'last_modified': last_modified, 'checksum': m.hexdigest()}
|
||||
# The file doesn't exist
|
||||
except Exception:
|
||||
@ -515,7 +514,7 @@ class FilesPipeline(MediaPipeline):
|
||||
|
||||
def inc_stats(self, spider, status):
|
||||
spider.crawler.stats.inc_value('file_count', spider=spider)
|
||||
spider.crawler.stats.inc_value('file_status_count/%s' % status, spider=spider)
|
||||
spider.crawler.stats.inc_value(f'file_status_count/{status}', spider=spider)
|
||||
|
||||
# Overridable Interface
|
||||
def get_media_requests(self, item, info):
|
||||
@ -545,4 +544,4 @@ class FilesPipeline(MediaPipeline):
|
||||
media_type = mimetypes.guess_type(request.url)[0]
|
||||
if media_type:
|
||||
media_ext = mimetypes.guess_extension(media_type)
|
||||
return 'full/%s%s' % (media_guid, media_ext)
|
||||
return f'full/{media_guid}{media_ext}'
|
||||
|
@ -125,8 +125,9 @@ class ImagesPipeline(FilesPipeline):
|
||||
|
||||
width, height = orig_image.size
|
||||
if width < self.min_width or height < self.min_height:
|
||||
raise ImageException("Image too small (%dx%d < %dx%d)" %
|
||||
(width, height, self.min_width, self.min_height))
|
||||
raise ImageException("Image too small "
|
||||
f"({width}x{height} < "
|
||||
f"{self.min_width}x{self.min_height})")
|
||||
|
||||
image, buf = self.convert_image(orig_image)
|
||||
yield path, image, buf
|
||||
@ -168,8 +169,8 @@ class ImagesPipeline(FilesPipeline):
|
||||
|
||||
def file_path(self, request, response=None, info=None, *, item=None):
|
||||
image_guid = hashlib.sha1(to_bytes(request.url)).hexdigest()
|
||||
return 'full/%s.jpg' % (image_guid)
|
||||
return f'full/{image_guid}.jpg'
|
||||
|
||||
def thumb_path(self, request, thumb_id, response=None, info=None):
|
||||
thumb_guid = hashlib.sha1(to_bytes(request.url)).hexdigest()
|
||||
return 'thumbs/%s/%s.jpg' % (thumb_id, thumb_guid)
|
||||
return f'thumbs/{thumb_id}/{thumb_guid}.jpg'
|
||||
|
@ -61,7 +61,7 @@ class MediaPipeline:
|
||||
'MYPIPE_IMAGES'
|
||||
"""
|
||||
class_name = self.__class__.__name__
|
||||
formatted_key = "{}_{}".format(class_name.upper(), key)
|
||||
formatted_key = f"{class_name.upper()}_{key}"
|
||||
if (
|
||||
not base_class_name
|
||||
or class_name == base_class_name
|
||||
@ -151,9 +151,8 @@ class MediaPipeline:
|
||||
if 'item' not in sig.parameters:
|
||||
old_params = str(sig)[1:-1]
|
||||
new_params = old_params + ", *, item=None"
|
||||
warn('%s(self, %s) is deprecated, '
|
||||
'please use %s(self, %s)'
|
||||
% (func.__name__, old_params, func.__name__, new_params),
|
||||
warn(f'{func.__name__}(self, {old_params}) is deprecated, '
|
||||
f'please use {func.__name__}(self, {new_params})',
|
||||
ScrapyDeprecationWarning, stacklevel=2)
|
||||
self._expects_item[func.__name__] = False
|
||||
|
||||
|
@ -141,17 +141,16 @@ class DownloaderAwarePriorityQueue:
|
||||
|
||||
def __init__(self, crawler, downstream_queue_cls, key, slot_startprios=()):
|
||||
if crawler.settings.getint('CONCURRENT_REQUESTS_PER_IP') != 0:
|
||||
raise ValueError('"%s" does not support CONCURRENT_REQUESTS_PER_IP'
|
||||
% (self.__class__,))
|
||||
raise ValueError(f'"{self.__class__}" does not support CONCURRENT_REQUESTS_PER_IP')
|
||||
|
||||
if slot_startprios and not isinstance(slot_startprios, dict):
|
||||
raise ValueError("DownloaderAwarePriorityQueue accepts "
|
||||
"``slot_startprios`` as a dict; %r instance "
|
||||
"``slot_startprios`` as a dict; "
|
||||
f"{slot_startprios.__class__!r} instance "
|
||||
"is passed. Most likely, it means the state is"
|
||||
"created by an incompatible priority queue. "
|
||||
"Only a crawl started with the same priority "
|
||||
"queue class can be resumed." %
|
||||
slot_startprios.__class__)
|
||||
"queue class can be resumed.")
|
||||
|
||||
self._downloader_interface = DownloaderInterface(crawler)
|
||||
self.downstream_queue_cls = downstream_queue_cls
|
||||
|
@ -45,7 +45,7 @@ class ResponseTypes:
|
||||
elif mimetype in self.classes:
|
||||
return self.classes[mimetype]
|
||||
else:
|
||||
basetype = "%s/*" % mimetype.split('/')[0]
|
||||
basetype = f"{mimetype.split('/')[0]}/*"
|
||||
return self.classes.get(basetype, Response)
|
||||
|
||||
def from_content_type(self, content_type, content_encoding=None):
|
||||
|
@ -66,8 +66,8 @@ class Selector(_ParselSelector, object_ref):
|
||||
|
||||
def __init__(self, response=None, text=None, type=None, root=None, **kwargs):
|
||||
if response is not None and text is not None:
|
||||
raise ValueError('%s.__init__() received both response and text'
|
||||
% self.__class__.__name__)
|
||||
raise ValueError(f'{self.__class__.__name__}.__init__() received '
|
||||
'both response and text')
|
||||
|
||||
st = _st(response, type or self._default_type)
|
||||
|
||||
|
@ -52,7 +52,7 @@ class SettingsAttribute:
|
||||
self.priority = priority
|
||||
|
||||
def __str__(self):
|
||||
return "<SettingsAttribute value={self.value!r} priority={self.priority}>".format(self=self)
|
||||
return f"<SettingsAttribute value={self.value!r} priority={self.priority}>"
|
||||
|
||||
__repr__ = __str__
|
||||
|
||||
|
@ -287,7 +287,7 @@ TEMPLATES_DIR = abspath(join(dirname(__file__), '..', 'templates'))
|
||||
|
||||
URLLENGTH_LIMIT = 2083
|
||||
|
||||
USER_AGENT = 'Scrapy/%s (+https://scrapy.org)' % import_module('scrapy').__version__
|
||||
USER_AGENT = f'Scrapy/{import_module("scrapy").__version__} (+https://scrapy.org)'
|
||||
|
||||
TELNETCONSOLE_ENABLED = 1
|
||||
TELNETCONSOLE_PORT = [6023, 6073]
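The USER_AGENT line shows that a replacement field may hold any expression, including a function call, provided the inner quotes differ from the outer ones (required before Python 3.12). A sketch, assuming Scrapy is importable:

# Illustrative only: calls inside the field; inner quotes differ from the outer quotes
from importlib import import_module
ua = f'Scrapy/{import_module("scrapy").__version__} (+https://scrapy.org)'
assert ua.startswith('Scrapy/')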
|
||||
|
@ -140,7 +140,7 @@ class Shell:
|
||||
b.append(" scrapy scrapy module (contains scrapy.Request, scrapy.Selector, etc)")
|
||||
for k, v in sorted(self.vars.items()):
|
||||
if self._is_relevant(v):
|
||||
b.append(" %-10s %s" % (k, v))
|
||||
b.append(f" {k:<10} {v}")
|
||||
b.append("Useful shortcuts:")
|
||||
if self.inthread:
|
||||
b.append(" fetch(url[, redirect=True]) "
|
||||
@ -150,7 +150,7 @@ class Shell:
|
||||
b.append(" shelp() Shell help (print this help)")
|
||||
b.append(" view(response) View response in a browser")
|
||||
|
||||
return "\n".join("[s] %s" % line for line in b)
|
||||
return "\n".join(f"[s] {line}" for line in b)
|
||||
|
||||
def _is_relevant(self, value):
|
||||
return isinstance(value, self.relevant_classes) or is_item(value)
|
||||
|
@ -27,7 +27,7 @@ class SpiderLoader:
|
||||
dupes = []
|
||||
for name, locations in self._found.items():
|
||||
dupes.extend([
|
||||
" {cls} named {name!r} (in {module})".format(module=mod, cls=cls, name=name)
|
||||
f" {cls} named {name!r} (in {mod})"
|
||||
for mod, cls in locations
|
||||
if len(locations) > 1
|
||||
])
|
||||
@ -36,7 +36,7 @@ class SpiderLoader:
|
||||
dupes_string = "\n\n".join(dupes)
|
||||
warnings.warn(
|
||||
"There are several spiders with the same name:\n\n"
|
||||
"{}\n\n This can cause unexpected behavior.".format(dupes_string),
|
||||
f"{dupes_string}\n\n This can cause unexpected behavior.",
|
||||
category=UserWarning,
|
||||
)
|
||||
|
||||
@ -53,10 +53,9 @@ class SpiderLoader:
|
||||
except ImportError:
|
||||
if self.warn_only:
|
||||
warnings.warn(
|
||||
"\n{tb}Could not load spiders from module '{modname}'. "
|
||||
"See above traceback for details.".format(
|
||||
modname=name, tb=traceback.format_exc()
|
||||
),
|
||||
f"\n{traceback.format_exc()}Could not load spiders "
|
||||
f"from module '{name}'. "
|
||||
"See above traceback for details.",
|
||||
category=RuntimeWarning,
|
||||
)
|
||||
else:
|
||||
@ -75,7 +74,7 @@ class SpiderLoader:
|
||||
try:
|
||||
return self._spiders[spider_name]
|
||||
except KeyError:
|
||||
raise KeyError("Spider not found: {}".format(spider_name))
|
||||
raise KeyError(f"Spider not found: {spider_name}")
|
||||
|
||||
def find_by_request(self, request):
|
||||
"""
|
||||
|
@ -43,7 +43,7 @@ class DepthMiddleware:
|
||||
return False
|
||||
else:
|
||||
if self.verbose_stats:
|
||||
self.stats.inc_value('request_depth_count/%s' % depth,
|
||||
self.stats.inc_value(f'request_depth_count/{depth}',
|
||||
spider=spider)
|
||||
self.stats.max_value('request_depth_max', depth,
|
||||
spider=spider)
|
||||
|
@ -48,7 +48,7 @@ class HttpErrorMiddleware:
|
||||
if isinstance(exception, HttpError):
|
||||
spider.crawler.stats.inc_value('httperror/response_ignored_count')
|
||||
spider.crawler.stats.inc_value(
|
||||
'httperror/response_ignored_status_count/%s' % response.status
|
||||
f'httperror/response_ignored_status_count/{response.status}'
|
||||
)
|
||||
logger.info(
|
||||
"Ignoring response %(response)r: HTTP status code is not handled or not allowed",
|
||||
|
@ -61,15 +61,15 @@ class OffsiteMiddleware:
|
||||
continue
|
||||
elif url_pattern.match(domain):
|
||||
message = ("allowed_domains accepts only domains, not URLs. "
|
||||
"Ignoring URL entry %s in allowed_domains." % domain)
|
||||
f"Ignoring URL entry {domain} in allowed_domains.")
|
||||
warnings.warn(message, URLWarning)
|
||||
elif port_pattern.search(domain):
|
||||
message = ("allowed_domains accepts only domains without ports. "
|
||||
"Ignoring entry %s in allowed_domains." % domain)
|
||||
f"Ignoring entry {domain} in allowed_domains.")
|
||||
warnings.warn(message, PortWarning)
|
||||
else:
|
||||
domains.append(re.escape(domain))
|
||||
regex = r'^(.*\.)?(%s)$' % '|'.join(domains)
|
||||
regex = fr'^(.*\.)?({"|".join(domains)})$'
|
||||
return re.compile(regex)
|
||||
|
||||
def spider_opened(self, spider):
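The rebuilt regex uses the combined fr'' prefix: raw-string rules keep the backslashes literal while the braces still interpolate, and the double quotes inside the expression avoid the single quotes of the literal. A sketch with assumed domains:

# Illustrative only: fr'' keeps backslashes literal while still interpolating
import re
domains = [re.escape(d) for d in ('example.com', 'scrapy.org')]
pattern = fr'^(.*\.)?({"|".join(domains)})$'
assert re.match(pattern, 'docs.scrapy.org')
assert not re.match(pattern, 'scrapy.org.evil.com')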
|
||||
|
@ -278,7 +278,7 @@ def _load_policy_class(policy, warning_only=False):
|
||||
try:
|
||||
return _policy_classes[policy.lower()]
|
||||
except KeyError:
|
||||
msg = "Could not load referrer policy %r" % policy
|
||||
msg = f"Could not load referrer policy {policy!r}"
|
||||
if not warning_only:
|
||||
raise RuntimeError(msg)
|
||||
else:
|
||||
|
@ -26,7 +26,7 @@ class Spider(object_ref):
|
||||
if name is not None:
|
||||
self.name = name
|
||||
elif not getattr(self, 'name', None):
|
||||
raise ValueError("%s must have a name" % type(self).__name__)
|
||||
raise ValueError(f"{type(self).__name__} must have a name")
|
||||
self.__dict__.update(kwargs)
|
||||
if not hasattr(self, 'start_urls'):
|
||||
self.start_urls = []
|
||||
@ -67,9 +67,8 @@ class Spider(object_ref):
|
||||
warnings.warn(
|
||||
"Spider.make_requests_from_url method is deprecated; it "
|
||||
"won't be called in future Scrapy releases. Please "
|
||||
"override Spider.start_requests method instead (see %s.%s)." % (
|
||||
cls.__module__, cls.__name__
|
||||
),
|
||||
"override Spider.start_requests method instead "
|
||||
f"(see {cls.__module__}.{cls.__name__}).",
|
||||
)
|
||||
for url in self.start_urls:
|
||||
yield self.make_requests_from_url(url)
|
||||
@ -91,7 +90,7 @@ class Spider(object_ref):
|
||||
return self.parse(response, **kwargs)
|
||||
|
||||
def parse(self, response, **kwargs):
|
||||
raise NotImplementedError('{}.parse callback is not defined'.format(self.__class__.__name__))
|
||||
raise NotImplementedError(f'{self.__class__.__name__}.parse callback is not defined')
|
||||
|
||||
@classmethod
|
||||
def update_settings(cls, settings):
|
||||
@ -108,7 +107,7 @@ class Spider(object_ref):
|
||||
return closed(reason)
|
||||
|
||||
def __str__(self):
|
||||
return "<%s %r at 0x%0x>" % (type(self).__name__, self.name, id(self))
|
||||
return f"<{type(self).__name__} {self.name!r} at 0x{id(self):0x}>"
|
||||
|
||||
__repr__ = __str__
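The printf conversions map straight onto format specs here: %0x becomes {...:0x} (lowercase hex), and elsewhere in the patch %0.3f becomes {...:.3f}. For example, with illustrative numbers:

# Illustrative only: printf conversions and f-string format specs line up
n, addr = 0.12345, 140245130938448
assert "%0.3f" % n == f"{n:.3f}" == "0.123"
assert "%0x" % addr == f"{addr:0x}"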
|
||||
|
||||
|
@ -71,11 +71,11 @@ class XMLFeedSpider(Spider):
|
||||
elif self.iterator == 'xml':
|
||||
selector = Selector(response, type='xml')
|
||||
self._register_namespaces(selector)
|
||||
nodes = selector.xpath('//%s' % self.itertag)
|
||||
nodes = selector.xpath(f'//{self.itertag}')
|
||||
elif self.iterator == 'html':
|
||||
selector = Selector(response, type='html')
|
||||
self._register_namespaces(selector)
|
||||
nodes = selector.xpath('//%s' % self.itertag)
|
||||
nodes = selector.xpath(f'//{self.itertag}')
|
||||
else:
|
||||
raise NotSupported('Unsupported node iterator')
|
||||
|
||||
|
@ -21,8 +21,8 @@ class Root(Resource):
|
||||
for nl in nlist:
|
||||
args['n'] = nl
|
||||
argstr = urlencode(args, doseq=True)
|
||||
request.write("<a href='/follow?{0}'>follow {1}</a><br>"
|
||||
.format(argstr, nl).encode('utf8'))
|
||||
request.write(f"<a href='/follow?{argstr}'>follow {nl}</a><br>"
|
||||
.encode('utf8'))
|
||||
request.write(b"</body></html>")
|
||||
return b''
|
||||
|
||||
@ -39,6 +39,6 @@ if __name__ == '__main__':
|
||||
|
||||
def _print_listening():
|
||||
httpHost = httpPort.getHost()
|
||||
print("Bench server at http://{}:{}".format(httpHost.host, httpHost.port))
|
||||
print(f"Bench server at http://{httpHost.host}:{httpHost.port}")
|
||||
reactor.callWhenRunning(_print_listening)
|
||||
reactor.run()
|
||||
|
@ -17,8 +17,8 @@ def build_component_list(compdict, custom=None, convert=update_classpath):
|
||||
|
||||
def _check_components(complist):
|
||||
if len({convert(c) for c in complist}) != len(complist):
|
||||
raise ValueError('Some paths in {!r} convert to the same object, '
|
||||
'please update your settings'.format(complist))
|
||||
raise ValueError(f'Some paths in {complist!r} convert to the same object, '
|
||||
'please update your settings')
|
||||
|
||||
def _map_keys(compdict):
|
||||
if isinstance(compdict, BaseSettings):
|
||||
@ -26,9 +26,10 @@ def build_component_list(compdict, custom=None, convert=update_classpath):
|
||||
for k, v in compdict.items():
|
||||
prio = compdict.getpriority(k)
|
||||
if compbs.getpriority(convert(k)) == prio:
|
||||
raise ValueError('Some paths in {!r} convert to the same '
|
||||
raise ValueError(f'Some paths in {list(compdict.keys())!r} '
|
||||
'convert to the same '
|
||||
'object, please update your settings'
|
||||
''.format(list(compdict.keys())))
|
||||
)
|
||||
else:
|
||||
compbs.set(convert(k), v, priority=prio)
|
||||
return compbs
|
||||
@ -40,8 +41,8 @@ def build_component_list(compdict, custom=None, convert=update_classpath):
|
||||
"""Fail if a value in the components dict is not a real number or None."""
|
||||
for name, value in compdict.items():
|
||||
if value is not None and not isinstance(value, numbers.Real):
|
||||
raise ValueError('Invalid value {} for component {}, please provide '
|
||||
'a real number or None instead'.format(value, name))
|
||||
raise ValueError(f'Invalid value {value} for component {name}, '
|
||||
'please provide a real number or None instead')
|
||||
|
||||
# BEGIN Backward compatibility for old (base, custom) call signature
|
||||
if isinstance(custom, (list, tuple)):
|
||||
@ -141,12 +142,10 @@ def feed_process_params_from_cli(settings, output, output_format=None,
|
||||
def check_valid_format(output_format):
|
||||
if output_format not in valid_output_formats:
|
||||
raise UsageError(
|
||||
"Unrecognized output format '%s'. Set a supported one (%s) "
|
||||
f"Unrecognized output format '{output_format}'. "
|
||||
f"Set a supported one ({tuple(valid_output_formats)}) "
|
||||
"after a colon at the end of the output URI (i.e. -o/-O "
|
||||
"<URI>:<FORMAT>) or as a file extension." % (
|
||||
output_format,
|
||||
tuple(valid_output_formats),
|
||||
)
|
||||
"<URI>:<FORMAT>) or as a file extension."
|
||||
)
|
||||
|
||||
overwrite = False
|
||||
|
@ -9,7 +9,7 @@ from w3lib.http import basic_auth_header
|
||||
|
||||
class CurlParser(argparse.ArgumentParser):
|
||||
def error(self, message):
|
||||
error_msg = 'There was an error parsing the curl command: {}'.format(message)
|
||||
error_msg = f'There was an error parsing the curl command: {message}'
|
||||
raise ValueError(error_msg)
|
||||
|
||||
|
||||
@ -52,7 +52,7 @@ def curl_to_request_kwargs(curl_command, ignore_unknown_options=True):
|
||||
parsed_args, argv = curl_parser.parse_known_args(curl_args[1:])
|
||||
|
||||
if argv:
|
||||
msg = 'Unrecognized options: {}'.format(', '.join(argv))
|
||||
msg = f'Unrecognized options: {", ".join(argv)}'
|
||||
if ignore_unknown_options:
|
||||
warnings.warn(msg)
|
||||
else:
|
||||
|
@ -14,9 +14,9 @@ def deprecated(use_instead=None):
|
||||
def deco(func):
|
||||
@wraps(func)
|
||||
def wrapped(*args, **kwargs):
|
||||
message = "Call to deprecated function %s." % func.__name__
|
||||
message = f"Call to deprecated function {func.__name__}."
|
||||
if use_instead:
|
||||
message += " Use %s instead." % use_instead
|
||||
message += f" Use {use_instead} instead."
|
||||
warnings.warn(message, category=ScrapyDeprecationWarning, stacklevel=2)
|
||||
return func(*args, **kwargs)
|
||||
return wrapped
|
||||
|
@ -8,9 +8,8 @@ from scrapy.exceptions import ScrapyDeprecationWarning
|
||||
def attribute(obj, oldattr, newattr, version='0.12'):
|
||||
cname = obj.__class__.__name__
|
||||
warnings.warn(
|
||||
"%s.%s attribute is deprecated and will be no longer supported "
|
||||
"in Scrapy %s, use %s.%s attribute instead"
|
||||
% (cname, oldattr, version, cname, newattr),
|
||||
f"{cname}.{oldattr} attribute is deprecated and will be no longer supported "
|
||||
f"in Scrapy {version}, use {cname}.{newattr} attribute instead",
|
||||
ScrapyDeprecationWarning,
|
||||
stacklevel=3)
|
||||
|
||||
@ -116,7 +115,7 @@ def create_deprecated_class(
|
||||
# deprecated class is in jinja2 template). __module__ attribute is not
|
||||
# important enough to raise an exception as users may be unable
|
||||
# to fix inspect.stack() errors.
|
||||
warnings.warn("Error detecting parent module: %r" % e)
|
||||
warnings.warn(f"Error detecting parent module: {e!r}")
|
||||
|
||||
return deprecated_cls
|
||||
|
||||
@ -124,7 +123,7 @@ def create_deprecated_class(
|
||||
def _clspath(cls, forced=None):
|
||||
if forced is not None:
|
||||
return forced
|
||||
return '{}.{}'.format(cls.__module__, cls.__name__)
|
||||
return f'{cls.__module__}.{cls.__name__}'
|
||||
|
||||
|
||||
DEPRECATION_RULES = [
|
||||
@ -137,7 +136,7 @@ def update_classpath(path):
|
||||
for prefix, replacement in DEPRECATION_RULES:
|
||||
if isinstance(path, str) and path.startswith(prefix):
|
||||
new_path = path.replace(prefix, replacement, 1)
|
||||
warnings.warn("`{}` class is deprecated, use `{}` instead".format(path, new_path),
|
||||
warnings.warn(f"`{path}` class is deprecated, use `{new_path}` instead",
|
||||
ScrapyDeprecationWarning)
|
||||
return new_path
|
||||
return path
|
||||
|
@ -29,7 +29,7 @@ def get_engine_status(engine):
|
||||
try:
|
||||
checks += [(test, eval(test))]
|
||||
except Exception as e:
|
||||
checks += [(test, "%s (exception)" % type(e).__name__)]
|
||||
checks += [(test, f"{type(e).__name__} (exception)")]
|
||||
|
||||
return checks
|
||||
|
||||
@ -38,7 +38,7 @@ def format_engine_status(engine=None):
|
||||
checks = get_engine_status(engine)
|
||||
s = "Execution engine status\n\n"
|
||||
for test, result in checks:
|
||||
s += "%-47s : %s\n" % (test, result)
|
||||
s += f"{test:<47} : {result}\n"
|
||||
s += "\n"
|
||||
|
||||
return s
|
||||
|
@ -33,5 +33,5 @@ def ftp_store_file(
|
||||
dirname, filename = posixpath.split(path)
|
||||
ftp_makedirs_cwd(ftp, dirname)
|
||||
command = 'STOR' if overwrite else 'APPE'
|
||||
ftp.storbinary('%s %s' % (command, filename), file)
|
||||
ftp.storbinary(f'{command} {filename}', file)
|
||||
file.close()
|
||||
|
@ -22,8 +22,8 @@ def xmliter(obj, nodename):
|
||||
"""
|
||||
nodename_patt = re.escape(nodename)
|
||||
|
||||
HEADER_START_RE = re.compile(r'^(.*?)<\s*%s(?:\s|>)' % nodename_patt, re.S)
|
||||
HEADER_END_RE = re.compile(r'<\s*/%s\s*>' % nodename_patt, re.S)
|
||||
HEADER_START_RE = re.compile(fr'^(.*?)<\s*{nodename_patt}(?:\s|>)', re.S)
|
||||
HEADER_END_RE = re.compile(fr'<\s*/{nodename_patt}\s*>', re.S)
|
||||
text = _body_or_str(obj)
|
||||
|
||||
header_start = re.search(HEADER_START_RE, text)
|
||||
@ -31,7 +31,7 @@ def xmliter(obj, nodename):
|
||||
header_end = re_rsearch(HEADER_END_RE, text)
|
||||
header_end = text[header_end[1]:].strip() if header_end else ''
|
||||
|
||||
r = re.compile(r'<%(np)s[\s>].*?</%(np)s>' % {'np': nodename_patt}, re.DOTALL)
|
||||
r = re.compile(fr'<{nodename_patt}[\s>].*?</{nodename_patt}>', re.DOTALL)
|
||||
for match in r.finditer(text):
|
||||
nodetext = header_start + match.group() + header_end
|
||||
yield Selector(text=nodetext, type='xml').xpath('//' + nodename)[0]
|
||||
@ -40,9 +40,9 @@ def xmliter(obj, nodename):
|
||||
def xmliter_lxml(obj, nodename, namespace=None, prefix='x'):
|
||||
from lxml import etree
|
||||
reader = _StreamReader(obj)
|
||||
tag = '{%s}%s' % (namespace, nodename) if namespace else nodename
|
||||
tag = f'{{{namespace}}}{nodename}' if namespace else nodename
|
||||
iterable = etree.iterparse(reader, tag=tag, encoding=reader.encoding)
|
||||
selxpath = '//' + ('%s:%s' % (prefix, nodename) if namespace else nodename)
|
||||
selxpath = '//' + (f'{prefix}:{nodename}' if namespace else nodename)
|
||||
for _, node in iterable:
|
||||
nodetext = etree.tostring(node, encoding='unicode')
|
||||
node.clear()
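The tag line relies on brace escaping: doubled braces are emitted literally, so f'{{{namespace}}}{nodename}' yields lxml's Clark notation, a literal brace, the namespace value, a closing brace, then the tag name. A quick sketch with an assumed namespace:

# Illustrative only: doubled braces are literal, the inner single pair interpolates
namespace, nodename = 'http://www.w3.org/2005/Atom', 'entry'
assert f'{{{namespace}}}{nodename}' == '{http://www.w3.org/2005/Atom}entry'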
|
||||
@ -131,8 +131,7 @@ def _body_or_str(obj, unicode=True):
|
||||
if not isinstance(obj, expected_types):
|
||||
expected_types_str = " or ".join(t.__name__ for t in expected_types)
|
||||
raise TypeError(
|
||||
"Object %r must be %s, not %s"
|
||||
% (obj, expected_types_str, type(obj).__name__)
|
||||
f"Object {obj!r} must be {expected_types_str}, not {type(obj).__name__}"
|
||||
)
|
||||
if isinstance(obj, Response):
|
||||
if not unicode:
|
||||
|
@ -143,7 +143,7 @@ def log_scrapy_info(settings):
|
||||
logger.info("Scrapy %(version)s started (bot: %(bot)s)",
|
||||
{'version': scrapy.__version__, 'bot': settings['BOT_NAME']})
|
||||
versions = [
|
||||
"%s %s" % (name, version)
|
||||
f"{name} {version}"
|
||||
for name, version in scrapy_components_versions()
|
||||
if name != "Scrapy"
|
||||
]
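Note that the logger.info call above keeps its %-style placeholders and params dict, presumably because the logging module defers that formatting until a handler actually emits the record; only the eagerly built version strings move to f-strings. Roughly, with made-up values:

# Illustrative only: logging placeholders stay %-style so formatting remains lazy
import logging
logger = logging.getLogger(__name__)
logger.info("Scrapy %(version)s started (bot: %(bot)s)",
            {'version': '2.0', 'bot': 'scrapybot'})  # rendered only if the record is emitted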
|
||||
@ -187,7 +187,7 @@ class LogCounterHandler(logging.Handler):
|
||||
self.crawler = crawler
|
||||
|
||||
def emit(self, record):
|
||||
sname = 'log_count/{}'.format(record.levelname)
|
||||
sname = f'log_count/{record.levelname}'
|
||||
self.crawler.stats.inc_value(sname)
|
||||
|
||||
|
||||
|
@ -56,7 +56,7 @@ def load_object(path):
|
||||
try:
|
||||
dot = path.rindex('.')
|
||||
except ValueError:
|
||||
raise ValueError("Error loading object '%s': not a full path" % path)
|
||||
raise ValueError(f"Error loading object '{path}': not a full path")
|
||||
|
||||
module, name = path[:dot], path[dot + 1:]
|
||||
mod = import_module(module)
|
||||
@ -64,7 +64,7 @@ def load_object(path):
|
||||
try:
|
||||
obj = getattr(mod, name)
|
||||
except AttributeError:
|
||||
raise NameError("Module '%s' doesn't define any object named '%s'" % (module, name))
|
||||
raise NameError(f"Module '{module}' doesn't define any object named '{name}'")
|
||||
|
||||
return obj
|
||||
|
||||
@ -173,7 +173,7 @@ def create_instance(objcls, settings, crawler, *args, **kwargs):
|
||||
instance = objcls(*args, **kwargs)
|
||||
method_name = '__new__'
|
||||
if instance is None:
|
||||
raise TypeError("%s.%s returned None" % (objcls.__qualname__, method_name))
|
||||
raise TypeError(f"{objcls.__qualname__}.{method_name} returned None")
|
||||
return instance
|
||||
|
||||
|
||||
@ -244,9 +244,10 @@ def warn_on_generator_with_return_value(spider, callable):
|
||||
"""
|
||||
if is_generator_with_return_value(callable):
|
||||
warnings.warn(
|
||||
'The "{}.{}" method is a generator and includes a "return" statement with a '
|
||||
'value different than None. This could lead to unexpected behaviour. Please see '
|
||||
f'The "{spider.__class__.__name__}.{callable.__name__}" method is '
|
||||
'a generator and includes a "return" statement with a value '
|
||||
'different than None. This could lead to unexpected behaviour. Please see '
|
||||
'https://docs.python.org/3/reference/simple_stmts.html#the-return-statement '
|
||||
'for details about the semantics of the "return" statement within generators'
|
||||
.format(spider.__class__.__name__, callable.__name__), stacklevel=2,
|
||||
'for details about the semantics of the "return" statement within generators',
|
||||
stacklevel=2,
|
||||
)
|
||||
|
@ -20,7 +20,7 @@ def inside_project():
|
||||
try:
|
||||
import_module(scrapy_module)
|
||||
except ImportError as exc:
|
||||
warnings.warn("Cannot import scrapy settings module %s: %s" % (scrapy_module, exc))
|
||||
warnings.warn(f"Cannot import scrapy settings module {scrapy_module}: {exc}")
|
||||
else:
|
||||
return True
|
||||
return bool(closest_scrapy_cfg())
|
||||
@ -90,7 +90,7 @@ def get_project_settings():
|
||||
warnings.warn(
|
||||
'Use of environment variables prefixed with SCRAPY_ to override '
|
||||
'settings is deprecated. The following environment variables are '
|
||||
'currently defined: {}'.format(setting_envvar_list),
|
||||
f'currently defined: {setting_envvar_list}',
|
||||
ScrapyDeprecationWarning
|
||||
)
|
||||
settings.setdict(scrapy_envvars, priority='project')
|
||||
|
@ -91,7 +91,7 @@ def to_unicode(text, encoding=None, errors='strict'):
|
||||
return text
|
||||
if not isinstance(text, (bytes, str)):
|
||||
raise TypeError('to_unicode must receive a bytes or str '
|
||||
'object, got %s' % type(text).__name__)
|
||||
f'object, got {type(text).__name__}')
|
||||
if encoding is None:
|
||||
encoding = 'utf-8'
|
||||
return text.decode(encoding, errors)
|
||||
@ -104,7 +104,7 @@ def to_bytes(text, encoding=None, errors='strict'):
|
||||
return text
|
||||
if not isinstance(text, str):
|
||||
raise TypeError('to_bytes must receive a str or bytes '
|
||||
'object, got %s' % type(text).__name__)
|
||||
f'object, got {type(text).__name__}')
|
||||
if encoding is None:
|
||||
encoding = 'utf-8'
|
||||
return text.encode(encoding, errors)
|
||||
@ -174,7 +174,7 @@ def binary_is_text(data):
|
||||
does not contain unprintable control characters.
|
||||
"""
|
||||
if not isinstance(data, bytes):
|
||||
raise TypeError("data must be bytes, got '%s'" % type(data).__name__)
|
||||
raise TypeError(f"data must be bytes, got '{type(data).__name__}'")
|
||||
return all(c not in _BINARYCHARS for c in data)
|
||||
|
||||
|
||||
@ -217,7 +217,7 @@ def get_func_args(func, stripself=False):
|
||||
else:
|
||||
return get_func_args(func.__call__, True)
|
||||
else:
|
||||
raise TypeError('%s is not callable' % type(func))
|
||||
raise TypeError(f'{type(func)} is not callable')
|
||||
if stripself:
|
||||
func_args.pop(0)
|
||||
return func_args
|
||||
@ -250,7 +250,7 @@ def get_spec(func):
|
||||
elif hasattr(func, '__call__'):
|
||||
spec = _getargspec_py23(func.__call__)
|
||||
else:
|
||||
raise TypeError('%s is not callable' % type(func))
|
||||
raise TypeError(f'{type(func)} is not callable')
|
||||
|
||||
defaults = spec.defaults or []
|
||||
|
||||
@ -322,7 +322,7 @@ def global_object_name(obj):
|
||||
>>> global_object_name(Request)
|
||||
'scrapy.http.request.Request'
|
||||
"""
|
||||
return "%s.%s" % (obj.__module__, obj.__name__)
|
||||
return f"{obj.__module__}.{obj.__name__}"
|
||||
|
||||
|
||||
if hasattr(sys, "pypy_version_info"):
|
||||
|
@ -10,7 +10,7 @@ def listen_tcp(portrange, host, factory):
|
||||
"""Like reactor.listenTCP but tries different ports in a range."""
|
||||
from twisted.internet import reactor
|
||||
if len(portrange) > 2:
|
||||
raise ValueError("invalid portrange: %s" % portrange)
|
||||
raise ValueError(f"invalid portrange: {portrange}")
|
||||
if not portrange:
|
||||
return reactor.listenTCP(0, factory, interface=host)
|
||||
if not hasattr(portrange, '__iter__'):
|
||||
@ -78,9 +78,9 @@ def verify_installed_reactor(reactor_path):
|
||||
from twisted.internet import reactor
|
||||
reactor_class = load_object(reactor_path)
|
||||
if not isinstance(reactor, reactor_class):
|
||||
msg = "The installed reactor ({}.{}) does not match the requested one ({})".format(
|
||||
reactor.__module__, reactor.__class__.__name__, reactor_path
|
||||
)
|
||||
msg = ("The installed reactor "
|
||||
f"({reactor.__module__}.{reactor.__class__.__name__}) does not "
|
||||
f"match the requested one ({reactor_path})")
|
||||
raise Exception(msg)
|
||||
|
||||
|
||||
|
@ -84,7 +84,7 @@ def _find_method(obj, func):
|
||||
# https://docs.python.org/3/reference/datamodel.html
|
||||
if obj_func.__func__ is func.__func__:
|
||||
return name
|
||||
raise ValueError("Function %s is not an instance method in: %s" % (func, obj))
|
||||
raise ValueError(f"Function {func} is not an instance method in: {obj}")
|
||||
|
||||
|
||||
def _get_method(obj, name):
|
||||
@ -92,4 +92,4 @@ def _get_method(obj, name):
|
||||
try:
|
||||
return getattr(obj, name)
|
||||
except AttributeError:
|
||||
raise ValueError("Method %r not found in: %s" % (name, obj))
|
||||
raise ValueError(f"Method {name!r} not found in: {obj}")
|
||||
|
@ -39,7 +39,7 @@ def response_status_message(status):
|
||||
"""Return status code plus status text descriptive message
|
||||
"""
|
||||
message = http.RESPONSES.get(int(status), "Unknown Status")
|
||||
return '%s %s' % (status, to_unicode(message))
|
||||
return f'{status} {to_unicode(message)}'
|
||||
|
||||
|
||||
def response_httprepr(response):
|
||||
@ -69,15 +69,15 @@ def open_in_browser(response, _openfunc=webbrowser.open):
|
||||
body = response.body
|
||||
if isinstance(response, HtmlResponse):
|
||||
if b'<base' not in body:
|
||||
repl = '<head><base href="%s">' % response.url
|
||||
repl = f'<head><base href="{response.url}">'
|
||||
body = body.replace(b'<head>', to_bytes(repl))
|
||||
ext = '.html'
|
||||
elif isinstance(response, TextResponse):
|
||||
ext = '.txt'
|
||||
else:
|
||||
raise TypeError("Unsupported response type: %s" %
|
||||
response.__class__.__name__)
|
||||
raise TypeError("Unsupported response type: "
|
||||
f"{response.__class__.__name__}")
|
||||
fd, fname = tempfile.mkstemp(ext)
|
||||
os.write(fd, body)
|
||||
os.close(fd)
|
||||
return _openfunc("file://%s" % fname)
|
||||
return _openfunc(f"file://{fname}")
|
||||
|
@ -17,7 +17,7 @@ class ScrapyJSONEncoder(json.JSONEncoder):
|
||||
if isinstance(o, set):
|
||||
return list(o)
|
||||
elif isinstance(o, datetime.datetime):
|
||||
return o.strftime("%s %s" % (self.DATE_FORMAT, self.TIME_FORMAT))
|
||||
return o.strftime(f"{self.DATE_FORMAT} {self.TIME_FORMAT}")
|
||||
elif isinstance(o, datetime.date):
|
||||
return o.strftime(self.DATE_FORMAT)
|
||||
elif isinstance(o, datetime.time):
|
||||
@ -29,9 +29,9 @@ class ScrapyJSONEncoder(json.JSONEncoder):
|
||||
elif is_item(o):
|
||||
return ItemAdapter(o).asdict()
|
||||
elif isinstance(o, Request):
|
||||
return "<%s %s %s>" % (type(o).__name__, o.method, o.url)
|
||||
return f"<{type(o).__name__} {o.method} {o.url}>"
|
||||
elif isinstance(o, Response):
|
||||
return "<%s %s %s>" % (type(o).__name__, o.status, o.url)
|
||||
return f"<{type(o).__name__} {o.status} {o.url}>"
|
||||
else:
|
||||
return super().default(o)
|
||||
|
||||
|
@ -50,7 +50,7 @@ def get_temp_key_info(ssl_object):
|
||||
key_info.append(ffi_buf_to_string(cname))
|
||||
else:
|
||||
key_info.append(ffi_buf_to_string(pyOpenSSLutil.lib.OBJ_nid2sn(key_type)))
|
||||
key_info.append('%s bits' % pyOpenSSLutil.lib.EVP_PKEY_bits(temp_key))
|
||||
key_info.append(f'{pyOpenSSLutil.lib.EVP_PKEY_bits(temp_key)} bits')
|
||||
return ', '.join(key_info)
|
||||
|
||||
|
||||
@ -58,4 +58,4 @@ def get_openssl_version():
|
||||
system_openssl = OpenSSL.SSL.SSLeay_version(
|
||||
OpenSSL.SSL.SSLEAY_VERSION
|
||||
).decode('ascii', errors='replace')
|
||||
return '{} ({})'.format(OpenSSL.version.__version__, system_openssl)
|
||||
return f'{OpenSSL.version.__version__} ({system_openssl})'
|
||||
|
@ -12,10 +12,12 @@ def render_templatefile(path, **kwargs):
|
||||
content = string.Template(raw).substitute(**kwargs)
|
||||
|
||||
render_path = path[:-len('.tmpl')] if path.endswith('.tmpl') else path
|
||||
|
||||
if path.endswith('.tmpl'):
|
||||
os.rename(path, render_path)
|
||||
|
||||
with open(render_path, 'wb') as fp:
|
||||
fp.write(content.encode('utf8'))
|
||||
if path.endswith('.tmpl'):
|
||||
os.remove(path)
|
||||
|
||||
|
||||
CAMELCASE_INVALID_CHARS = re.compile(r'[^a-zA-Z\d]')
|
||||
|
@ -79,7 +79,7 @@ def get_ftp_content_and_delete(
|
||||
|
||||
def buffer_data(data):
|
||||
ftp_data.append(data)
|
||||
ftp.retrbinary('RETR %s' % path, buffer_data)
|
||||
ftp.retrbinary(f'RETR {path}', buffer_data)
|
||||
dirname, filename = split(path)
|
||||
ftp.cwd(dirname)
|
||||
ftp.delete(filename)
|
||||
|
@ -23,10 +23,10 @@ class ProcessTest:
|
||||
|
||||
def _process_finished(self, pp, cmd, check_code):
|
||||
if pp.exitcode and check_code:
|
||||
msg = "process %s exit with code %d" % (cmd, pp.exitcode)
|
||||
msg += "\n>>> stdout <<<\n%s" % pp.out
|
||||
msg = f"process {cmd} exit with code {pp.exitcode}"
|
||||
msg += f"\n>>> stdout <<<\n{pp.out}"
|
||||
msg += "\n"
|
||||
msg += "\n>>> stderr <<<\n%s" % pp.err
|
||||
msg += f"\n>>> stderr <<<\n{pp.err}"
|
||||
raise RuntimeError(msg)
|
||||
return pp.exitcode, pp.out, pp.err
|
||||
|
||||
|
@ -9,7 +9,7 @@ class SiteTest:
|
||||
from twisted.internet import reactor
|
||||
super().setUp()
|
||||
self.site = reactor.listenTCP(0, test_site(), interface="127.0.0.1")
|
||||
self.baseurl = "http://localhost:%d/" % self.site.getHost().port
|
||||
self.baseurl = f"http://localhost:{self.site.getHost().port}/"
|
||||
|
||||
def tearDown(self):
|
||||
super().tearDown()
|
||||
@ -40,5 +40,5 @@ def test_site():
|
||||
if __name__ == '__main__':
|
||||
from twisted.internet import reactor
|
||||
port = reactor.listenTCP(0, test_site(), interface="127.0.0.1")
|
||||
print("http://localhost:%d/" % port.getHost().port)
|
||||
print(f"http://localhost:{port.getHost().port}/")
|
||||
reactor.run()
|
||||
|
@ -41,9 +41,7 @@ def format_live_refs(ignore=NoneType):
|
||||
if issubclass(cls, ignore):
|
||||
continue
|
||||
oldest = min(wdict.values())
|
||||
s += "%-30s %6d oldest: %ds ago\n" % (
|
||||
cls.__name__, len(wdict), now - oldest
|
||||
)
|
||||
s += f"{cls.__name__:<30} {len(wdict):6} oldest: {int(now - oldest)}s ago\n"
|
||||
return s
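The rewrite also makes a truncation explicit: %d silently truncates a float such as now - oldest, whereas the :d format spec would raise, hence the added int() call. For example:

# Illustrative only: '%d' truncates floats, ':d' raises, so the new code calls int() itself
elapsed = 3.9
assert "%ds ago" % elapsed == "3s ago"
assert f"{int(elapsed)}s ago" == "3s ago"
# f"{elapsed:d}" would raise ValueError ("Unknown format code 'd' ...")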
|
||||
|
||||
|
||||
|
@ -22,7 +22,7 @@ def url_is_from_any_domain(url, domains):
|
||||
if not host:
|
||||
return False
|
||||
domains = [d.lower() for d in domains]
|
||||
return any((host == d) or (host.endswith('.%s' % d)) for d in domains)
|
||||
return any((host == d) or (host.endswith(f'.{d}')) for d in domains)
|
||||
|
||||
|
||||
def url_is_from_spider(url, spider):
|
||||
@ -153,7 +153,7 @@ def strip_url(url, strip_credentials=True, strip_default_port=True, origin_only=
|
||||
if (parsed_url.scheme, parsed_url.port) in (('http', 80),
|
||||
('https', 443),
|
||||
('ftp', 21)):
|
||||
netloc = netloc.replace(':{p.port}'.format(p=parsed_url), '')
|
||||
netloc = netloc.replace(f':{parsed_url.port}', '')
|
||||
return urlunparse((
|
||||
parsed_url.scheme,
|
||||
netloc,
|
||||
|
@ -38,7 +38,7 @@ class LocalhostSpider(Spider):
|
||||
if __name__ == "__main__":
|
||||
with MockServer() as mock_http_server, MockDNSServer() as mock_dns_server:
|
||||
port = urlparse(mock_http_server.http_address).port
|
||||
url = "http://not.a.real.domain:{port}/echo".format(port=port)
|
||||
url = f"http://not.a.real.domain:{port}/echo"
|
||||
|
||||
servers = [(mock_dns_server.host, mock_dns_server.port)]
|
||||
reactor.installResolver(createResolver(servers=servers))
|
||||
|
@ -73,7 +73,7 @@ class Follow(LeafResource):
|
||||
for nl in nlist:
|
||||
args[b"n"] = [to_bytes(str(nl))]
|
||||
argstr = urlencode(args, doseq=True)
|
||||
s += "<a href='/follow?%s'>follow %d</a><br>" % (argstr, nl)
|
||||
s += f"<a href='/follow?{argstr}'>follow {nl}</a><br>"
|
||||
s += """</body>"""
|
||||
request.write(to_bytes(s))
|
||||
request.finish()
|
||||
@ -91,7 +91,7 @@ class Delay(LeafResource):
|
||||
return NOT_DONE_YET
|
||||
|
||||
def _delayedRender(self, request, n):
|
||||
request.write(to_bytes("Response delayed for %0.3f seconds\n" % n))
|
||||
request.write(to_bytes(f"Response delayed for {n:.3f} seconds\n"))
|
||||
request.finish()
|
||||
|
||||
|
||||
@ -310,8 +310,8 @@ if __name__ == "__main__":
|
||||
def print_listening():
|
||||
httpHost = httpPort.getHost()
|
||||
httpsHost = httpsPort.getHost()
|
||||
httpAddress = "http://%s:%d" % (httpHost.host, httpHost.port)
|
||||
httpsAddress = "https://%s:%d" % (httpsHost.host, httpsHost.port)
|
||||
httpAddress = f'http://{httpHost.host}:{httpHost.port}'
|
||||
httpsAddress = f'https://{httpsHost.host}:{httpsHost.port}'
|
||||
print(httpAddress)
|
||||
print(httpsAddress)
|
||||
|
||||
@ -323,7 +323,7 @@ if __name__ == "__main__":
|
||||
|
||||
def print_listening():
|
||||
host = listener.getHost()
|
||||
print("%s:%s" % (host.host, host.port))
|
||||
print(f"{host.host}:{host.port}")
|
||||
|
||||
reactor.callWhenRunning(print_listening)
|
||||
reactor.run()
|
||||
|
@ -45,7 +45,7 @@ class FollowAllSpider(MetaSpider):
|
||||
self.urls_visited = []
|
||||
self.times = []
|
||||
qargs = {'total': total, 'show': show, 'order': order, 'maxlatency': maxlatency}
|
||||
url = self.mockserver.url("/follow?%s" % urlencode(qargs, doseq=1))
|
||||
url = self.mockserver.url(f"/follow?{urlencode(qargs, doseq=1)}")
|
||||
self.start_urls = [url]
|
||||
|
||||
def parse(self, response):
|
||||
@ -67,7 +67,7 @@ class DelaySpider(MetaSpider):
|
||||
|
||||
def start_requests(self):
|
||||
self.t1 = time.time()
|
||||
url = self.mockserver.url("/delay?n=%s&b=%s" % (self.n, self.b))
|
||||
url = self.mockserver.url(f"/delay?n={self.n}&b={self.b}")
|
||||
yield Request(url, callback=self.parse, errback=self.errback)
|
||||
|
||||
def parse(self, response):
|
||||
@ -177,7 +177,7 @@ class AsyncDefAsyncioGenComplexSpider(SimpleSpider):
|
||||
depth = 2
|
||||
|
||||
def _get_req(self, index, cb=None):
|
||||
return Request(self.mockserver.url("/status?n=200&request=%d" % index),
|
||||
return Request(self.mockserver.url(f"/status?n=200&request={index}"),
|
||||
meta={'index': index},
|
||||
dont_filter=True,
|
||||
callback=cb)
|
||||
@ -236,7 +236,7 @@ class YieldingRequestsSpider(FollowAllSpider):
|
||||
def start_requests(self):
|
||||
for s in range(self.number_of_start_requests):
|
||||
qargs = {'total': 10, 'seed': s}
|
||||
url = self.mockserver.url("/follow?%s") % urlencode(qargs, doseq=1)
|
||||
url = self.mockserver.url(f"/follow?{urlencode(qargs, doseq=1)}")
|
||||
yield Request(url, meta={'seed': s})
|
||||
|
||||
|
||||
@ -288,7 +288,7 @@ class DuplicateStartRequestsSpider(MockServerSpider):
|
||||
def start_requests(self):
|
||||
for i in range(0, self.distinct_urls):
|
||||
for j in range(0, self.dupe_factor):
|
||||
url = self.mockserver.url("/echo?headers=1&body=test%d" % i)
|
||||
url = self.mockserver.url(f"/echo?headers=1&body=test{i}")
|
||||
yield Request(url, dont_filter=self.dont_filter)
|
||||
|
||||
def __init__(self, url="http://localhost:8998", *args, **kwargs):
|
||||
|
@ -4,7 +4,7 @@
|
||||
class TestExtension:
|
||||
|
||||
def __init__(self, settings):
|
||||
settings.set('TEST1', "%s + %s" % (settings['TEST1'], 'started'))
|
||||
settings.set('TEST1', f"{settings['TEST1']} + started")
|
||||
|
||||
@classmethod
|
||||
def from_crawler(cls, crawler):
|
||||
|
@ -14,20 +14,20 @@ class CheckCommandTest(CommandTest):
|
||||
|
||||
def _write_contract(self, contracts, parse_def):
|
||||
with open(self.spider, 'w') as file:
|
||||
file.write("""
|
||||
file.write(f"""
|
||||
import scrapy
|
||||
|
||||
class CheckSpider(scrapy.Spider):
|
||||
name = '{0}'
|
||||
name = '{self.spider_name}'
|
||||
start_urls = ['http://example.com']
|
||||
|
||||
def parse(self, response, **cb_kwargs):
|
||||
\"\"\"
|
||||
@url http://example.com
|
||||
{1}
|
||||
{contracts}
|
||||
\"\"\"
|
||||
{2}
|
||||
""".format(self.spider_name, contracts, parse_def))
|
||||
{parse_def}
|
||||
""")
|
||||
|
||||
def _test_contract(self, contracts='', parse_def='pass'):
|
||||
self._write_contract(contracts, parse_def)
|
||||
|
@ -21,14 +21,14 @@ class ParseCommandTest(ProcessTest, SiteTest, CommandTest):
|
||||
self.spider_name = 'parse_spider'
|
||||
fname = abspath(join(self.proj_mod_path, 'spiders', 'myspider.py'))
|
||||
with open(fname, 'w') as f:
|
||||
f.write("""
|
||||
f.write(f"""
|
||||
import scrapy
|
||||
from scrapy.linkextractors import LinkExtractor
|
||||
from scrapy.spiders import CrawlSpider, Rule
|
||||
|
||||
|
||||
class MySpider(scrapy.Spider):
|
||||
name = '{0}'
|
||||
name = '{self.spider_name}'
|
||||
|
||||
def parse(self, response):
|
||||
if getattr(self, 'test_arg', None):
|
||||
@ -58,7 +58,7 @@ class MySpider(scrapy.Spider):
|
||||
self.logger.debug('It Does Not Work :(')
|
||||
|
||||
class MyGoodCrawlSpider(CrawlSpider):
|
||||
name = 'goodcrawl{0}'
|
||||
name = 'goodcrawl{self.spider_name}'
|
||||
|
||||
rules = (
|
||||
Rule(LinkExtractor(allow=r'/html'), callback='parse_item', follow=True),
|
||||
@ -74,7 +74,7 @@ class MyGoodCrawlSpider(CrawlSpider):
|
||||
|
||||
class MyBadCrawlSpider(CrawlSpider):
|
||||
'''Spider which doesn't define a parse_item callback while using it in a rule.'''
|
||||
name = 'badcrawl{0}'
|
||||
name = 'badcrawl{self.spider_name}'
|
||||
|
||||
rules = (
|
||||
Rule(LinkExtractor(allow=r'/html'), callback='parse_item', follow=True),
|
||||
@ -82,7 +82,7 @@ class MyBadCrawlSpider(CrawlSpider):
|
||||
|
||||
def parse(self, response):
|
||||
return [scrapy.Item(), dict(foo='bar')]
|
||||
""".format(self.spider_name))
|
||||
""")
|
||||
|
||||
fname = abspath(join(self.proj_mod_path, 'pipelines.py'))
|
||||
with open(fname, 'w') as f:
|
||||
@ -99,9 +99,9 @@ class MyPipeline:
|
||||
|
||||
fname = abspath(join(self.proj_mod_path, 'settings.py'))
|
||||
with open(fname, 'a') as f:
|
||||
f.write("""
|
||||
ITEM_PIPELINES = {'%s.pipelines.MyPipeline': 1}
|
||||
""" % self.project_name)
|
||||
f.write(f"""
|
||||
ITEM_PIPELINES = {{'{self.project_name}.pipelines.MyPipeline': 1}}
|
||||
""")
|
||||
|
||||
@defer.inlineCallbacks
|
||||
def test_spider_arguments(self):
|
||||
|
@ -65,8 +65,8 @@ class ShellTest(ProcessTest, SiteTest, unittest.TestCase):
|
||||
def test_fetch_redirect_follow_302(self):
|
||||
"""Test that calling ``fetch(url)`` follows HTTP redirects by default."""
|
||||
url = self.url('/redirect-no-meta-refresh')
|
||||
code = "fetch('{0}')"
|
||||
errcode, out, errout = yield self.execute(['-c', code.format(url)])
|
||||
code = f"fetch('{url}')"
|
||||
errcode, out, errout = yield self.execute(['-c', code])
|
||||
self.assertEqual(errcode, 0, out)
|
||||
assert b'Redirecting (302)' in errout
|
||||
assert b'Crawled (200)' in errout
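In the test changes the deferred str.format template disappears because an f-string is evaluated right where it is written; that is safe here since each template was filled in exactly once. A sketch with an assumed local URL:

# Illustrative only: the f-string is built immediately, no separate .format() step
url = 'http://localhost:8998/redirect-no-meta-refresh'
old = "fetch('{0}')".format(url)
new = f"fetch('{url}')"
assert old == new == "fetch('http://localhost:8998/redirect-no-meta-refresh')"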
|
||||
@ -75,23 +75,23 @@ class ShellTest(ProcessTest, SiteTest, unittest.TestCase):
|
||||
def test_fetch_redirect_not_follow_302(self):
|
||||
"""Test that calling ``fetch(url, redirect=False)`` disables automatic redirects."""
|
||||
url = self.url('/redirect-no-meta-refresh')
|
||||
code = "fetch('{0}', redirect=False)"
|
||||
errcode, out, errout = yield self.execute(['-c', code.format(url)])
|
||||
code = f"fetch('{url}', redirect=False)"
|
||||
errcode, out, errout = yield self.execute(['-c', code])
|
||||
self.assertEqual(errcode, 0, out)
|
||||
assert b'Crawled (302)' in errout
|
||||
|
||||
@defer.inlineCallbacks
|
||||
def test_request_replace(self):
|
||||
url = self.url('/text')
|
||||
code = "fetch('{0}') or fetch(response.request.replace(method='POST'))"
|
||||
errcode, out, _ = yield self.execute(['-c', code.format(url)])
|
||||
code = f"fetch('{url}') or fetch(response.request.replace(method='POST'))"
|
||||
errcode, out, _ = yield self.execute(['-c', code])
|
||||
self.assertEqual(errcode, 0, out)
|
||||
|
||||
@defer.inlineCallbacks
|
||||
def test_scrapy_import(self):
|
||||
url = self.url('/text')
|
||||
code = "fetch(scrapy.Request('{0}'))"
|
||||
errcode, out, _ = yield self.execute(['-c', code.format(url)])
|
||||
code = f"fetch(scrapy.Request('{url}'))"
|
||||
errcode, out, _ = yield self.execute(['-c', code])
|
||||
self.assertEqual(errcode, 0, out)
|
||||
|
||||
@defer.inlineCallbacks
|
||||
|
@ -16,7 +16,7 @@ class VersionTest(ProcessTest, unittest.TestCase):
|
||||
_, out, _ = yield self.execute([])
|
||||
self.assertEqual(
|
||||
out.strip().decode(encoding),
|
||||
"Scrapy %s" % scrapy.__version__,
|
||||
f"Scrapy {scrapy.__version__}",
|
||||
)
|
||||
|
||||
@defer.inlineCallbacks
|
||||
|