mirror of https://github.com/scrapy/scrapy.git synced 2025-02-06 11:00:46 +00:00

Merge remote-tracking branch 'origin/master' into asyncio-startrequests-asyncgen

Andrey Rakhmatullin 2020-08-31 18:02:54 +05:00
commit b2f43d51ac
130 changed files with 625 additions and 581 deletions
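
The change is mechanical throughout: ``%``-interpolation and ``str.format()`` calls are replaced with f-strings. As a quick reference for reading the hunks below, here is a minimal sketch of the equivalences involved (illustrative only, not taken from the diff):

# Minimal demonstration that the replacements are behaviour-preserving.
name, count, price = "parse", 13, 2.5

assert "Hello %s" % name == f"Hello {name}"
assert "Hello %r" % name == f"Hello {name!r}"                    # !r applies repr()
assert "%-13s : %s" % (name, count) == f"{name:<13} : {count}"   # %-13s -> :<13 (left-align, width 13)
assert "%0.2f" % price == f"{price:.2f}"                         # fixed-point formatting
assert "page {}".format(count) == f"page {count}"                # str.format() -> f-string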

View File

@ -49,7 +49,7 @@ master_doc = 'index'
# General information about the project.
project = 'Scrapy'
copyright = '2008–{}, Scrapy developers'.format(datetime.now().year)
copyright = f'2008–{datetime.now().year}, Scrapy developers'
# The version info for the project you're documenting, acts as replacement for
# |version| and |release|, also used in various other places throughout the

View File

@ -101,10 +101,10 @@ This is the code for our first Spider. Save it in a file named
def parse(self, response):
page = response.url.split("/")[-2]
filename = 'quotes-%s.html' % page
filename = f'quotes-{page}.html'
with open(filename, 'wb') as f:
f.write(response.body)
self.log('Saved file %s' % filename)
self.log(f'Saved file {filename}')
As you can see, our Spider subclasses :class:`scrapy.Spider <scrapy.spiders.Spider>`
@ -190,7 +190,7 @@ for your spider::
def parse(self, response):
page = response.url.split("/")[-2]
filename = 'quotes-%s.html' % page
filename = f'quotes-{page}.html'
with open(filename, 'wb') as f:
f.write(response.body)
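
For orientation, the ``parse`` callback shown in the hunks above belongs to the tutorial's ``QuotesSpider``. A self-contained sketch follows; the parts outside the visible hunks (``name``, ``start_requests``) are reconstructed from context and may differ slightly from the actual tutorial:

import scrapy


class QuotesSpider(scrapy.Spider):
    name = "quotes"

    def start_requests(self):
        urls = [
            'http://quotes.toscrape.com/page/1/',
            'http://quotes.toscrape.com/page/2/',
        ]
        for url in urls:
            yield scrapy.Request(url=url, callback=self.parse)

    def parse(self, response):
        # Derive a filename from the page number in the URL and save the body.
        page = response.url.split("/")[-2]
        filename = f'quotes-{page}.html'
        with open(filename, 'wb') as f:
            f.write(response.body)
        self.log(f'Saved file {filename}')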

View File

@ -5,9 +5,9 @@ Using your browser's Developer Tools for scraping
=================================================
Here is a general guide on how to use your browser's Developer Tools
to ease the scraping process. Today almost all browsers come with
built in `Developer Tools`_ and although we will use Firefox in this
guide, the concepts are applicable to any other browser.
In this guide we'll introduce the basic tools to use from a browser's
Developer Tools by scraping `quotes.toscrape.com`_.
@ -41,16 +41,16 @@ Therefore, you should keep in mind the following things:
Inspecting a website
====================
By far the most handy feature of the Developer Tools is the `Inspector`
feature, which allows you to inspect the underlying HTML code of
any webpage. To demonstrate the Inspector, let's look at the
`quotes.toscrape.com`_-site.
On the site we have a total of ten quotes from various authors with specific
tags, as well as the Top Ten Tags. Let's say we want to extract all the quotes
on this page, without any meta-information about authors, tags, etc.
Instead of viewing the whole source code for the page, we can simply right click
on a quote and select ``Inspect Element (Q)``, which opens up the `Inspector`.
In it you should see something like this:
@ -97,16 +97,16 @@ Then, back to your web browser, right-click on the ``span`` tag, select
>>> response.xpath('/html/body/div/div[2]/div[1]/div[1]/span[1]/text()').getall()
['“The world as we have created it is a process of our thinking. It cannot be changed without changing our thinking.”']
Adding ``text()`` at the end we are able to extract the first quote with this
basic selector. But this XPath is not really that clever. All it does is
go down a desired path in the source code starting from ``html``. So let's
see if we can refine our XPath a bit:
If we check the `Inspector` again we'll see that directly beneath our
expanded ``div`` tag we have nine identical ``div`` tags, each with the
same attributes as our first. If we expand any of them, we'll see the same
structure as with our first quote: Two ``span`` tags and one ``div`` tag. We can
expand each ``span`` tag with the ``class="text"`` inside our ``div`` tags and
see each quote:
.. code-block:: html
@ -121,7 +121,7 @@ see each quote:
With this knowledge we can refine our XPath: Instead of a path to follow,
we'll simply select all ``span`` tags with the ``class="text"`` by using
the `has-class-extension`_:
>>> response.xpath('//span[has-class("text")]/text()').getall()
@ -130,45 +130,45 @@ the `has-class-extension`_:
'“There are only two ways to live your life. One is as though nothing is a miracle. The other is as though everything is a miracle.”',
...]
And with one simple, cleverer XPath we are able to extract all quotes from
the page. We could have constructed a loop over our first XPath to increase
the number of the last ``div``, but this would have been unnecessarily
complex and by simply constructing an XPath with ``has-class("text")``
we were able to extract all quotes in one line.
The `Inspector` has a lot of other helpful features, such as searching in the
source code or directly scrolling to an element you selected. Let's demonstrate
a use case:
Say you want to find the ``Next`` button on the page. Type ``Next`` into the
search bar on the top right of the `Inspector`. You should get two results.
The first is a ``li`` tag with the ``class="next"``, the second the text
of an ``a`` tag. Right click on the ``a`` tag and select ``Scroll into View``.
If you hover over the tag, you'll see the button highlighted. From here
we could easily create a :ref:`Link Extractor <topics-link-extractors>` to
follow the pagination. On a simple site such as this, there may not be
the need to find an element visually but the ``Scroll into View`` function
can be quite useful on complex sites.
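
As a concrete illustration of the Link Extractor mentioned above, a minimal sketch that follows the pagination link inside the ``li`` tag with ``class="next"`` (the spider and callback wiring are assumptions, not part of the guide):

import scrapy
from scrapy.linkextractors import LinkExtractor


class QuotesSpider(scrapy.Spider):
    name = "quotes"
    start_urls = ["http://quotes.toscrape.com/"]

    def parse(self, response):
        ...  # extract the quotes on the current page here
        # Follow the pagination link found inside the <li class="next"> element.
        for link in LinkExtractor(restrict_css="li.next").extract_links(response):
            yield response.follow(link.url, callback=self.parse)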
Note that the search bar can also be used to search for and test CSS
selectors. For example, you could search for ``span.text`` to find
all quote texts. Instead of a full text search, this searches for
exactly the ``span`` tag with the ``class="text"`` in the page.
.. _topics-network-tool:
The Network-tool
================
While scraping you may come across dynamic webpages where some parts
of the page are loaded dynamically through multiple requests. While
this can be quite tricky, the `Network`-tool in the Developer Tools
greatly facilitates this task. To demonstrate the Network-tool, let's
take a look at the page `quotes.toscrape.com/scroll`_.
The page is quite similar to the basic `quotes.toscrape.com`_-page,
but instead of the above-mentioned ``Next`` button, the page
automatically loads new quotes when you scroll to the bottom. We
could go ahead and try out different XPaths directly, but instead
we'll check another quite useful command from the Scrapy shell:
.. skip: next
@ -179,9 +179,9 @@ we'll check another quite useful command from the Scrapy shell:
(...)
>>> view(response)
A browser window should open with the webpage but with one
crucial difference: Instead of the quotes we just see a greenish
bar with the word ``Loading...``.
.. image:: _images/network_01.png
:width: 777
@ -189,21 +189,21 @@ bar with the word ``Loading...``.
:alt: Response from quotes.toscrape.com/scroll
The ``view(response)`` command lets us view the response our
shell or later our spider receives from the server. Here we see
that some basic template is loaded which includes the title,
the login-button and the footer, but the quotes are missing. This
tells us that the quotes are being loaded from a different request
than ``quotes.toscrape/scroll``.
If you click on the ``Network`` tab, you will probably only see
two entries. The first thing we do is enable persistent logs by
clicking on ``Persist Logs``. If this option is disabled, the
log is automatically cleared each time you navigate to a different
page. Enabling this option is a good default, since it gives us
control on when to clear the logs.
If we reload the page now, you'll see the log get populated with six
new requests.
.. image:: _images/network_02.png
:width: 777
@ -212,31 +212,31 @@ new requests.
Here we see every request that has been made when reloading the page
and can inspect each request and its response. So let's find out
where our quotes are coming from:
First click on the request with the name ``scroll``. On the right
you can now inspect the request. In ``Headers`` you'll find details
about the request headers, such as the URL, the method, the IP-address,
and so on. We'll ignore the other tabs and click directly on ``Response``.
What you should see in the ``Preview`` pane is the rendered HTML-code,
that is exactly what we saw when we called ``view(response)`` in the
shell. Accordingly the ``type`` of the request in the log is ``html``.
The other requests have types like ``css`` or ``js``, but what
interests us is the one request called ``quotes?page=1`` with the
type ``json``.
If we click on this request, we see that the request URL is
``http://quotes.toscrape.com/api/quotes?page=1`` and the response
is a JSON-object that contains our quotes. We can also right-click
on the request and open ``Open in new tab`` to get a better overview.
.. image:: _images/network_03.png
:width: 777
:height: 375
:alt: JSON-object returned from the quotes.toscrape API
With this response we can now easily parse the JSON-object and
also request each page to get every quote on the site::
import scrapy
@ -255,17 +255,17 @@ also request each page to get every quote on the site::
yield {"quote": quote["text"]}
if data["has_next"]:
self.page += 1
url = "http://quotes.toscrape.com/api/quotes?page={}".format(self.page)
url = f"http://quotes.toscrape.com/api/quotes?page={self.page}"
yield scrapy.Request(url=url, callback=self.parse)
This spider starts at the first page of the quotes-API. With each
response, we parse the ``response.text`` and assign it to ``data``.
This lets us operate on the JSON-object like on a Python dictionary.
We iterate through the ``quotes`` and print out the ``quote["text"]``.
If the handy ``has_next`` element is ``true`` (try loading
`quotes.toscrape.com/api/quotes?page=10`_ in your browser or a
page-number greater than 10), we increment the ``page`` attribute
and ``yield`` a new request, inserting the incremented page-number
into our ``url``.
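
Putting the hunks above together, the spider being described looks roughly like this (a reconstruction; the class name, ``start_urls`` and other details outside the visible hunks are assumptions):

import json

import scrapy


class QuoteSpider(scrapy.Spider):
    name = 'quote'
    page = 1
    start_urls = ['http://quotes.toscrape.com/api/quotes?page=1']

    def parse(self, response):
        data = json.loads(response.text)
        # Yield one item per quote in the JSON payload.
        for quote in data["quotes"]:
            yield {"quote": quote["text"]}
        # Keep requesting pages while the API reports more data.
        if data["has_next"]:
            self.page += 1
            url = f"http://quotes.toscrape.com/api/quotes?page={self.page}"
            yield scrapy.Request(url=url, callback=self.parse)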
.. _requests-from-curl:
@ -298,7 +298,7 @@ Note that to translate a cURL command into a Scrapy request,
you may use `curl2scrapy <https://michael-shub.github.io/curl2scrapy/>`_.
As you can see, with a few inspections in the `Network`-tool we
were able to easily replicate the dynamic requests of the scrolling
functionality of the page. Crawling dynamic pages can be quite
daunting and pages can be very complex, but it (mostly) boils down
to identifying the correct request and replicating it in your spider.
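
Once the right request has been identified, one convenient way to replicate it is ``Request.from_curl()``, which builds a Scrapy request from a cURL command copied out of the Network tool. A minimal sketch (the exact cURL command is whatever your browser produces; only the URL is shown here):

from scrapy import Request

# Recreate the request identified in the Network tool from its "Copy as cURL" output.
request = Request.from_curl(
    "curl 'http://quotes.toscrape.com/api/quotes?page=1'"
)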

View File

@ -57,7 +57,7 @@ value of one of their fields::
adapter = ItemAdapter(item)
year = adapter['year']
if year not in self.year_to_exporter:
f = open('{}.xml'.format(year), 'wb')
f = open(f'{year}.xml', 'wb')
exporter = XmlItemExporter(f)
exporter.start_exporting()
self.year_to_exporter[year] = exporter
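
For context, the fragment above belongs to a pipeline that keeps one ``XmlItemExporter`` per distinct year. A minimal sketch of how such a pipeline fits together (the class name and the spider open/close hooks are assumptions, and closing of the underlying files is omitted for brevity):

from itemadapter import ItemAdapter
from scrapy.exporters import XmlItemExporter


class PerYearXmlExportPipeline:
    """Distribute items across several XML files according to their 'year' field."""

    def open_spider(self, spider):
        self.year_to_exporter = {}

    def close_spider(self, spider):
        for exporter in self.year_to_exporter.values():
            exporter.finish_exporting()

    def _exporter_for_item(self, item):
        adapter = ItemAdapter(item)
        year = adapter['year']
        if year not in self.year_to_exporter:
            f = open(f'{year}.xml', 'wb')
            exporter = XmlItemExporter(f)
            exporter.start_exporting()
            self.year_to_exporter[year] = exporter
        return self.year_to_exporter[year]

    def process_item(self, item, spider):
        exporter = self._exporter_for_item(item)
        exporter.export_item(item)
        return item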
@ -98,7 +98,7 @@ Example::
import scrapy
def serialize_price(value):
return '$ %s' % str(value)
return f'$ {str(value)}'
class Product(scrapy.Item):
name = scrapy.Field()
@ -122,7 +122,7 @@ Example::
def serialize_field(self, field, name, value):
if field == 'price':
return '$ %s' % str(value)
return f'$ {str(value)}'
return super(Product, self).serialize_field(field, name, value)
.. _topics-exporters-reference:

View File

@ -96,7 +96,7 @@ contain a price::
adapter['price'] = adapter['price'] * self.vat_factor
return item
else:
raise DropItem("Missing price in %s" % item)
raise DropItem(f"Missing price in {item}")
Write items to a JSON file
@ -211,7 +211,7 @@ item.
# Save screenshot to file, filename will be hash of url.
url = adapter["url"]
url_hash = hashlib.md5(url.encode("utf8")).hexdigest()
filename = "{}.png".format(url_hash)
filename = f"{url_hash}.png"
with open(filename, "wb") as f:
f.write(response.body)
@ -240,7 +240,7 @@ returns multiples items with the same id::
def process_item(self, item, spider):
adapter = ItemAdapter(item)
if adapter['id'] in self.ids_seen:
raise DropItem("Duplicate item found: %r" % item)
raise DropItem(f"Duplicate item found: {item!r}")
else:
self.ids_seen.add(adapter['id'])
return item
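
The duplicate-filter hunk above is part of a small pipeline; a self-contained sketch (the class name and ``__init__`` are assumptions inferred from the visible fragment):

from itemadapter import ItemAdapter
from scrapy.exceptions import DropItem


class DuplicatesPipeline:

    def __init__(self):
        # Ids of the items already seen during this crawl.
        self.ids_seen = set()

    def process_item(self, item, spider):
        adapter = ItemAdapter(item)
        if adapter['id'] in self.ids_seen:
            raise DropItem(f"Duplicate item found: {item!r}")
        self.ids_seen.add(adapter['id'])
        return item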

View File

@ -102,7 +102,7 @@ A real example
Let's see a concrete example of a hypothetical case of memory leaks.
Suppose we have some spider with a line similar to this one::
return Request("http://www.somenastyspider.com/product.php?pid=%d" % product_id,
return Request(f"http://www.somenastyspider.com/product.php?pid={product_id}",
callback=self.parse, cb_kwargs={'referer': response})
That line is passing a response reference inside a request which effectively

View File

@ -328,8 +328,9 @@ too. Here's an example:
'<a href="image5.html">Name: My image 5 <br><img src="image5_thumb.jpg"></a>']
>>> for index, link in enumerate(links):
... args = (index, link.xpath('@href').get(), link.xpath('img/@src').get())
... print('Link number %d points to url %r and image %r' % args)
... href_xpath = link.xpath('@href').get()
... img_xpath = link.xpath('img/@src').get()
... print(f'Link number {index} points to url {href_xpath!r} and image {img_xpath!r}')
Link number 0 points to url 'image1.html' and image 'image1_thumb.jpg'
Link number 1 points to url 'image2.html' and image 'image2_thumb.jpg'
Link number 2 points to url 'image3.html' and image 'image3_thumb.jpg'
@ -822,7 +823,7 @@ with groups of itemscopes and corresponding itemprops::
... props = scope.xpath('''
... set:difference(./descendant::*/@itemprop,
... .//*[@itemscope]/*/@itemprop)''')
... print(" properties: %s" % (props.getall()))
... print(f" properties: {props.getall()}")
... print("")
current scope: ['http://schema.org/Product']

View File

@ -136,7 +136,7 @@ In a spider, the settings are available through ``self.settings``::
start_urls = ['http://example.com']
def parse(self, response):
print("Existing settings: %s" % self.settings.attributes.keys())
print(f"Existing settings: {self.settings.attributes.keys()}")
.. note::
The ``settings`` attribute is set in the base Spider class after the spider

View File

@ -287,7 +287,7 @@ Spiders can access arguments in their `__init__` methods::
def __init__(self, category=None, *args, **kwargs):
super(MySpider, self).__init__(*args, **kwargs)
self.start_urls = ['http://www.example.com/categories/%s' % category]
self.start_urls = [f'http://www.example.com/categories/{category}']
# ...
The default `__init__` method will take any spider arguments
@ -300,7 +300,7 @@ The above example can also be written as follows::
name = 'myspider'
def start_requests(self):
yield scrapy.Request('http://www.example.com/categories/%s' % self.category)
yield scrapy.Request(f'http://www.example.com/categories/{self.category}')
Keep in mind that spider arguments are only strings.
The spider will not do any parsing on its own.
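For reference, an argument like ``category`` above is supplied on the command line with the ``-a`` option, for example ``scrapy crawl myspider -a category=electronics``.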

View File

@ -37,7 +37,7 @@ class Root(Resource):
if now - self.lastmark >= 3:
self.lastmark = now
qps = len(self.tail) / sum(self.tail)
print('samplesize={0} concurrent={1} qps={2:0.2f}'.format(len(self.tail), self.concurrent, qps))
print(f'samplesize={len(self.tail)} concurrent={self.concurrent} qps={qps:0.2f}')
if 'latency' in request.args:
latency = float(request.args['latency'][0])

View File

@ -37,11 +37,11 @@ class QPSSpider(Spider):
def start_requests(self):
url = self.benchurl
if self.latency is not None:
url += '?latency={0}'.format(self.latency)
url += f'?latency={self.latency}'
slots = int(self.slots)
if slots > 1:
urls = [url.replace('localhost', '127.0.0.%d' % (x + 1)) for x in range(slots)]
urls = [url.replace('localhost', f'127.0.0.{x + 1}') for x in range(slots)]
else:
urls = [url]

View File

@ -44,7 +44,7 @@ def _get_commands_from_entry_points(inproject, group='scrapy.commands'):
if inspect.isclass(obj):
cmds[entry_point.name] = obj()
else:
raise Exception("Invalid entry point %s" % entry_point.name)
raise Exception(f"Invalid entry point {entry_point.name}")
return cmds
@ -67,11 +67,11 @@ def _pop_command_name(argv):
def _print_header(settings, inproject):
version = scrapy.__version__
if inproject:
print("Scrapy %s - project: %s\n" % (scrapy.__version__,
settings['BOT_NAME']))
print(f"Scrapy {version} - project: {settings['BOT_NAME']}\n")
else:
print("Scrapy %s - no active project\n" % scrapy.__version__)
print(f"Scrapy {version} - no active project\n")
def _print_commands(settings, inproject):
@ -81,7 +81,7 @@ def _print_commands(settings, inproject):
print("Available commands:")
cmds = _get_commands_dict(settings, inproject)
for cmdname, cmdclass in sorted(cmds.items()):
print(" %-13s %s" % (cmdname, cmdclass.short_desc()))
print(f" {cmdname:<13} {cmdclass.short_desc()}")
if not inproject:
print()
print(" [ more ] More commands available when run from project directory")
@ -91,7 +91,7 @@ def _print_commands(settings, inproject):
def _print_unknown_command(settings, cmdname, inproject):
_print_header(settings, inproject)
print("Unknown command: %s\n" % cmdname)
print(f"Unknown command: {cmdname}\n")
print('Use "scrapy" to see available commands')
@ -133,7 +133,7 @@ def execute(argv=None, settings=None):
sys.exit(2)
cmd = cmds[cmdname]
parser.usage = "scrapy %s %s" % (cmdname, cmd.syntax())
parser.usage = f"scrapy {cmdname} {cmd.syntax()}"
parser.description = cmd.long_desc()
settings.setdict(cmd.default_settings, priority='command')
cmd.settings = settings
@ -155,7 +155,7 @@ def _run_command(cmd, args, opts):
def _run_command_profiled(cmd, args, opts):
if opts.profile:
sys.stderr.write("scrapy: writing cProfile stats to %r\n" % opts.profile)
sys.stderr.write(f"scrapy: writing cProfile stats to {opts.profile!r}\n")
loc = locals()
p = cProfile.Profile()
p.runctx('cmd.run(args, opts)', globals(), loc)

View File

@ -61,7 +61,7 @@ class ScrapyCommand:
group.add_option("--logfile", metavar="FILE",
help="log file. if omitted stderr will be used")
group.add_option("-L", "--loglevel", metavar="LEVEL", default=None,
help="log level (default: %s)" % self.settings['LOG_LEVEL'])
help=f"log level (default: {self.settings['LOG_LEVEL']})")
group.add_option("--nolog", action="store_true",
help="disable logging completely")
group.add_option("--profile", metavar="FILE", default=None,

View File

@ -50,7 +50,7 @@ class _BenchSpider(scrapy.Spider):
def start_requests(self):
qargs = {'total': self.total, 'show': self.show}
url = '{}?{}'.format(self.baseurl, urlencode(qargs, doseq=1))
url = f'{self.baseurl}?{urlencode(qargs, doseq=1)}'
return [scrapy.Request(url, dont_filter=True)]
def parse(self, response):

View File

@ -17,7 +17,7 @@ class TextTestResult(_TextTestResult):
plural = "s" if run != 1 else ""
writeln(self.separator2)
writeln("Ran %d contract%s in %.3fs" % (run, plural, stop - start))
writeln(f"Ran {run} contract{plural} in {stop - start:.3f}s")
writeln()
infos = []
@ -25,14 +25,14 @@ class TextTestResult(_TextTestResult):
write("FAILED")
failed, errored = map(len, (self.failures, self.errors))
if failed:
infos.append("failures=%d" % failed)
infos.append(f"failures={failed}")
if errored:
infos.append("errors=%d" % errored)
infos.append(f"errors={errored}")
else:
write("OK")
if infos:
writeln(" (%s)" % (", ".join(infos),))
writeln(f" ({', '.join(infos)})")
else:
write("\n")
@ -85,7 +85,7 @@ class Command(ScrapyCommand):
continue
print(spider)
for method in sorted(methods):
print(' * %s' % method)
print(f' * {method}')
else:
start = time.time()
self.crawler_process.start()

View File

@ -32,8 +32,8 @@ class Command(ScrapyCommand):
try:
spidercls = self.crawler_process.spider_loader.load(args[0])
except KeyError:
return self._err("Spider not found: %s" % args[0])
return self._err(f"Spider not found: {args[0]}")
sfile = sys.modules[spidercls.__module__].__file__
sfile = sfile.replace('.pyc', '.py')
self.exitcode = os.system('%s "%s"' % (editor, sfile))
self.exitcode = os.system(f'{editor} "{sfile}"')

View File

@ -73,17 +73,18 @@ class Command(ScrapyCommand):
if template_file:
self._genspider(module, name, domain, opts.template, template_file)
if opts.edit:
self.exitcode = os.system('scrapy edit "%s"' % name)
self.exitcode = os.system(f'scrapy edit "{name}"')
def _genspider(self, module, name, domain, template_name, template_file):
"""Generate the spider module, based on the given template"""
capitalized_module = ''.join(s.capitalize() for s in module.split('_'))
tvars = {
'project_name': self.settings.get('BOT_NAME'),
'ProjectName': string_camelcase(self.settings.get('BOT_NAME')),
'module': module,
'name': name,
'domain': domain,
'classname': '%sSpider' % ''.join(s.capitalize() for s in module.split('_'))
'classname': f'{capitalized_module}Spider'
}
if self.settings.get('NEWSPIDER_MODULE'):
spiders_module = import_module(self.settings['NEWSPIDER_MODULE'])
@ -91,32 +92,32 @@ class Command(ScrapyCommand):
else:
spiders_module = None
spiders_dir = "."
spider_file = "%s.py" % join(spiders_dir, module)
spider_file = f"{join(spiders_dir, module)}.py"
shutil.copyfile(template_file, spider_file)
render_templatefile(spider_file, **tvars)
print("Created spider %r using template %r "
% (name, template_name), end=('' if spiders_module else '\n'))
print(f"Created spider {name!r} using template {template_name!r} ",
end=('' if spiders_module else '\n'))
if spiders_module:
print("in module:\n %s.%s" % (spiders_module.__name__, module))
print("in module:\n {spiders_module.__name__}.{module}")
def _find_template(self, template):
template_file = join(self.templates_dir, '%s.tmpl' % template)
template_file = join(self.templates_dir, f'{template}.tmpl')
if exists(template_file):
return template_file
print("Unable to find template: %s\n" % template)
print(f"Unable to find template: {template}\n")
print('Use "scrapy genspider --list" to see all available templates.')
def _list_templates(self):
print("Available templates:")
for filename in sorted(os.listdir(self.templates_dir)):
if filename.endswith('.tmpl'):
print(" %s" % splitext(filename)[0])
print(f" {splitext(filename)[0]}")
def _spider_exists(self, name):
if not self.settings.get('NEWSPIDER_MODULE'):
# if run as a standalone command and file with same filename already exists
if exists(name + ".py"):
print("%s already exists" % (abspath(name + ".py")))
print(f"{abspath(name + '.py')} already exists")
return True
return False
@ -126,8 +127,8 @@ class Command(ScrapyCommand):
pass
else:
# if spider with same name exists
print("Spider %r already exists in module:" % name)
print(" %s" % spidercls.__module__)
print(f"Spider {name!r} already exists in module:")
print(f" {spidercls.__module__}")
return True
# a file with the same name exists in the target directory
@ -135,7 +136,7 @@ class Command(ScrapyCommand):
spiders_dir = dirname(spiders_module.__file__)
spiders_dir_abs = abspath(spiders_dir)
if exists(join(spiders_dir_abs, name + ".py")):
print("%s already exists" % (join(spiders_dir_abs, (name + ".py"))))
print(f"{join(spiders_dir_abs, (name + '.py'))} already exists")
return True
return False

View File

@ -96,13 +96,13 @@ class Command(BaseRunSpiderCommand):
if opts.verbose:
for level in range(1, self.max_level + 1):
print('\n>>> DEPTH LEVEL: %s <<<' % level)
print(f'\n>>> DEPTH LEVEL: {level} <<<')
if not opts.noitems:
self.print_items(level, colour)
if not opts.nolinks:
self.print_requests(level, colour)
else:
print('\n>>> STATUS DEPTH LEVEL %s <<<' % self.max_level)
print(f'\n>>> STATUS DEPTH LEVEL {self.max_level} <<<')
if not opts.noitems:
self.print_items(colour=colour)
if not opts.nolinks:

View File

@ -12,7 +12,7 @@ def _import_file(filepath):
dirname, file = os.path.split(abspath)
fname, fext = os.path.splitext(file)
if fext != '.py':
raise ValueError("Not a Python source file: %s" % abspath)
raise ValueError(f"Not a Python source file: {abspath}")
if dirname:
sys.path = [dirname] + sys.path
try:
@ -42,14 +42,14 @@ class Command(BaseRunSpiderCommand):
raise UsageError()
filename = args[0]
if not os.path.exists(filename):
raise UsageError("File not found: %s\n" % filename)
raise UsageError(f"File not found: {filename}\n")
try:
module = _import_file(filename)
except (ImportError, ValueError) as e:
raise UsageError("Unable to load %r: %s\n" % (filename, e))
raise UsageError(f"Unable to load {filename!r}: {e}\n")
spclasses = list(iter_spider_classes(module))
if not spclasses:
raise UsageError("No spider found in file: %s\n" % filename)
raise UsageError(f"No spider found in file: {filename}\n")
spidercls = spclasses.pop()
self.crawler_process.crawl(spidercls, **opts.spargs)

View File

@ -52,7 +52,7 @@ class Command(ScrapyCommand):
print('Error: Project names must begin with a letter and contain'
' only\nletters, numbers and underscores')
elif _module_exists(project_name):
print('Error: Module %r already exists' % project_name)
print(f'Error: Module {project_name!r} already exists')
else:
return True
return False
@ -100,7 +100,7 @@ class Command(ScrapyCommand):
if exists(join(project_dir, 'scrapy.cfg')):
self.exitcode = 1
print('Error: scrapy.cfg already exists in %s' % abspath(project_dir))
print(f'Error: scrapy.cfg already exists in {abspath(project_dir)}')
return
if not self._is_valid_name(project_name):
@ -113,11 +113,11 @@ class Command(ScrapyCommand):
path = join(*paths)
tplfile = join(project_dir, string.Template(path).substitute(project_name=project_name))
render_templatefile(tplfile, project_name=project_name, ProjectName=string_camelcase(project_name))
print("New Scrapy project '%s', using template directory '%s', "
"created in:" % (project_name, self.templates_dir))
print(" %s\n" % abspath(project_dir))
print(f"New Scrapy project '{project_name}', using template directory "
f"'{self.templates_dir}', created in:")
print(f" {abspath(project_dir)}\n")
print("You can start your first spider with:")
print(" cd %s" % project_dir)
print(f" cd {project_dir}")
print(" scrapy genspider example example.com")
@property

View File

@ -23,8 +23,7 @@ class Command(ScrapyCommand):
if opts.verbose:
versions = scrapy_components_versions()
width = max(len(n) for (n, _) in versions)
patt = "%-{}s : %s".format(width)
for name, version in versions:
print(patt % (name, version))
print(f"{name:<{width}} : {version}")
else:
print("Scrapy %s" % scrapy.__version__)
print(f"Scrapy {scrapy.__version__}")

View File

@ -112,8 +112,8 @@ class Contract:
request_cls = None
def __init__(self, method, *args):
self.testcase_pre = _create_testcase(method, '@%s pre-hook' % self.name)
self.testcase_post = _create_testcase(method, '@%s post-hook' % self.name)
self.testcase_pre = _create_testcase(method, f'@{self.name} pre-hook')
self.testcase_post = _create_testcase(method, f'@{self.name} post-hook')
self.args = args
def add_pre_hook(self, request, results):
@ -172,8 +172,8 @@ def _create_testcase(method, desc):
class ContractTestCase(TestCase):
def __str__(_self):
return "[%s] %s (%s)" % (spider, method.__name__, desc)
return f"[{spider}] {method.__name__} ({desc})"
name = '%s_%s' % (spider, method.__name__)
name = f'{spider}_{method.__name__}'
setattr(ContractTestCase, name, lambda x: x)
return ContractTestCase(name)

View File

@ -60,8 +60,7 @@ class ReturnsContract(Contract):
if len(self.args) not in [1, 2, 3]:
raise ValueError(
"Incorrect argument quantity: expected 1, 2 or 3, got %i"
% len(self.args)
f"Incorrect argument quantity: expected 1, 2 or 3, got {len(self.args)}"
)
self.obj_name = self.args[0] or None
self.obj_type_verifier = self.object_type_verifiers[self.obj_name]
@ -88,10 +87,9 @@ class ReturnsContract(Contract):
if self.min_bound == self.max_bound:
expected = self.min_bound
else:
expected = '%s..%s' % (self.min_bound, self.max_bound)
expected = f'{self.min_bound}..{self.max_bound}'
raise ContractFail("Returned %s %s, expected %s" %
(occurrences, self.obj_name, expected))
raise ContractFail(f"Returned {occurrences} {self.obj_name}, expected {expected}")
class ScrapesContract(Contract):
@ -106,5 +104,5 @@ class ScrapesContract(Contract):
if is_item(x):
missing = [arg for arg in self.args if arg not in ItemAdapter(x)]
if missing:
missing_str = ", ".join(missing)
raise ContractFail("Missing fields: %s" % missing_str)
missing_fields = ", ".join(missing)
raise ContractFail(f"Missing fields: {missing_fields}")

View File

@ -41,17 +41,17 @@ class Slot:
def __repr__(self):
cls_name = self.__class__.__name__
return "%s(concurrency=%r, delay=%0.2f, randomize_delay=%r)" % (
cls_name, self.concurrency, self.delay, self.randomize_delay)
return (f"{cls_name}(concurrency={self.concurrency!r}, "
f"delay={self.delay:.2f}, "
f"randomize_delay={self.randomize_delay!r})")
def __str__(self):
return (
"<downloader.Slot concurrency=%r delay=%0.2f randomize_delay=%r "
"len(active)=%d len(queue)=%d len(transferring)=%d lastseen=%s>" % (
self.concurrency, self.delay, self.randomize_delay,
len(self.active), len(self.queue), len(self.transferring),
datetime.fromtimestamp(self.lastseen).isoformat()
)
f"<downloader.Slot concurrency={self.concurrency!r} "
f"delay={self.delay:.2f} randomize_delay={self.randomize_delay!r} "
f"len(active)={len(self.active)} len(queue)={len(self.queue)} "
f"len(transferring)={len(self.transferring)} "
f"lastseen={datetime.fromtimestamp(self.lastseen).isoformat()}>"
)

View File

@ -71,8 +71,7 @@ class DownloadHandlers:
scheme = urlparse_cached(request).scheme
handler = self._get_handler(scheme)
if not handler:
raise NotSupported("Unsupported URL scheme '%s': %s" %
(scheme, self._notconfigured[scheme]))
raise NotSupported(f"Unsupported URL scheme '{scheme}': {self._notconfigured[scheme]}")
return handler.download_request(request, spider)
@defer.inlineCallbacks

View File

@ -60,11 +60,11 @@ class HTTP11DownloadHandler:
settings=settings,
crawler=crawler,
)
msg = """
'%s' does not accept `method` argument (type OpenSSL.SSL method,\
e.g. OpenSSL.SSL.SSLv23_METHOD) and/or `tls_verbose_logging` argument and/or `tls_ciphers` argument.\
Please upgrade your context factory class to handle them or ignore them.""" % (
settings['DOWNLOADER_CLIENTCONTEXTFACTORY'],)
msg = f"""
'{settings["DOWNLOADER_CLIENTCONTEXTFACTORY"]}' does not accept `method` \
argument (type OpenSSL.SSL method, e.g. OpenSSL.SSL.SSLv23_METHOD) and/or \
`tls_verbose_logging` argument and/or `tls_ciphers` argument.\
Please upgrade your context factory class to handle them or ignore them."""
warnings.warn(msg)
self._default_maxsize = settings.getint('DOWNLOAD_MAXSIZE')
self._default_warnsize = settings.getint('DOWNLOAD_WARNSIZE')
@ -169,8 +169,9 @@ class TunnelingTCP4ClientEndpoint(TCP4ClientEndpoint):
else:
extra = rcvd_bytes[:32]
self._tunnelReadyDeferred.errback(
TunnelError('Could not open CONNECT tunnel with proxy %s:%s [%r]' % (
self._host, self._port, extra)))
TunnelError('Could not open CONNECT tunnel with proxy '
f'{self._host}:{self._port} [{extra!r}]')
)
def connectFailed(self, reason):
"""Propagates the errback to the appropriate deferred."""
@ -371,7 +372,7 @@ class ScrapyAgent:
if self._txresponse:
self._txresponse._transport.stopProducing()
raise TimeoutError("Getting %s took longer than %s seconds." % (url, timeout))
raise TimeoutError(f"Getting {url} took longer than {timeout} seconds.")
def _cb_latency(self, result, request, start_time):
request.meta['download_latency'] = time() - start_time

View File

@ -56,7 +56,7 @@ class S3DownloadHandler:
import botocore.credentials
kw.pop('anon', None)
if kw:
raise TypeError('Unexpected keyword arguments: %s' % kw)
raise TypeError(f'Unexpected keyword arguments: {kw}')
if not self.anon:
SignerCls = botocore.auth.AUTH_TYPE_MAPS['s3']
self._signer = SignerCls(botocore.credentials.Credentials(
@ -85,14 +85,14 @@ class S3DownloadHandler:
scheme = 'https' if request.meta.get('is_secure') else 'http'
bucket = p.hostname
path = p.path + '?' + p.query if p.query else p.path
url = '%s://%s.s3.amazonaws.com%s' % (scheme, bucket, path)
url = f'{scheme}://{bucket}.s3.amazonaws.com{path}'
if self.anon:
request = request.replace(url=url)
elif self._signer is not None:
import botocore.awsrequest
awsrequest = botocore.awsrequest.AWSRequest(
method=request.method,
url='%s://s3.amazonaws.com/%s%s' % (scheme, bucket, path),
url=f'{scheme}://s3.amazonaws.com/{bucket}{path}',
headers=request.headers.to_unicode_dict(),
data=request.body)
self._signer.add_auth(awsrequest)

View File

@ -36,8 +36,9 @@ class DownloaderMiddlewareManager(MiddlewareManager):
response = yield deferred_from_coro(method(request=request, spider=spider))
if response is not None and not isinstance(response, (Response, Request)):
raise _InvalidOutput(
"Middleware %s.process_request must return None, Response or Request, got %s"
% (method.__self__.__class__.__name__, response.__class__.__name__)
f"Middleware {method.__self__.__class__.__name__}"
".process_request must return None, Response or "
f"Request, got {response.__class__.__name__}"
)
if response:
return response
@ -54,8 +55,9 @@ class DownloaderMiddlewareManager(MiddlewareManager):
response = yield deferred_from_coro(method(request=request, response=response, spider=spider))
if not isinstance(response, (Response, Request)):
raise _InvalidOutput(
"Middleware %s.process_response must return Response or Request, got %s"
% (method.__self__.__class__.__name__, type(response))
f"Middleware {method.__self__.__class__.__name__}"
".process_response must return Response or Request, "
f"got {type(response)}"
)
if isinstance(response, Request):
return response
@ -68,8 +70,9 @@ class DownloaderMiddlewareManager(MiddlewareManager):
response = yield deferred_from_coro(method(request=request, exception=exception, spider=spider))
if response is not None and not isinstance(response, (Response, Request)):
raise _InvalidOutput(
"Middleware %s.process_exception must return None, Response or Request, got %s"
% (method.__self__.__class__.__name__, type(response))
f"Middleware {method.__self__.__class__.__name__}"
".process_exception must return None, Response or "
f"Request, got {type(response)}"
)
if response:
return response

View File

@ -88,8 +88,8 @@ class ScrapyHTTPPageGetter(HTTPClient):
self.transport.stopProducing()
self.factory.noPage(
defer.TimeoutError("Getting %s took longer than %s seconds."
% (self.factory.url, self.factory.timeout)))
defer.TimeoutError(f"Getting {self.factory.url} took longer "
f"than {self.factory.timeout} seconds."))
# This class used to inherit from Twisted's
@ -155,7 +155,7 @@ class ScrapyHTTPClientFactory(ClientFactory):
self.headers['Content-Length'] = 0
def __repr__(self):
return "<%s: %s>" % (self.__class__.__name__, self.url)
return f"<{self.__class__.__name__}: {self.url}>"
def _cancelTimeout(self, result, timeoutCall):
if timeoutCall.active():

View File

@ -199,8 +199,8 @@ class ExecutionEngine:
def _handle_downloader_output(self, response, request, spider):
if not isinstance(response, (Request, Response, Failure)):
raise TypeError(
"Incorrect type: expected Request, Response or Failure, got %s: %r"
% (type(response), response)
"Incorrect type: expected Request, Response or Failure, got "
f"{type(response)}: {response!r}"
)
# downloader middleware can return requests (for example, redirects)
if isinstance(response, Request):
@ -242,7 +242,7 @@ class ExecutionEngine:
def crawl(self, request, spider):
if spider not in self.open_spiders:
raise RuntimeError("Spider %r not opened when crawling: %s" % (spider.name, request))
raise RuntimeError(f"Spider {spider.name!r} not opened when crawling: {request}")
self.schedule(request, spider)
self.slot.nextcall.schedule()
@ -267,8 +267,8 @@ class ExecutionEngine:
def _on_success(response):
if not isinstance(response, (Response, Request)):
raise TypeError(
"Incorrect type: expected Response or Request, got %s: %r"
% (type(response), response)
"Incorrect type: expected Response or Request, got "
f"{type(response)}: {response!r}"
)
if isinstance(response, Response):
if response.request is None:
@ -296,7 +296,7 @@ class ExecutionEngine:
@defer.inlineCallbacks
def open_spider(self, spider, start_requests=(), close_if_idle=True, new_queue_behavior=False):
if not self.has_capacity():
raise RuntimeError("No free spider slot when opening %r" % spider.name)
raise RuntimeError(f"No free spider slot when opening {spider.name!r}")
logger.info("Spider opened", extra={'spider': spider})
nextcall = CallLaterOnce(self._next_request, spider)
scheduler = self.scheduler_cls.from_crawler(self.crawler)

View File

@ -125,7 +125,7 @@ class Scraper:
Handle the downloaded response or failure through the spider callback/errback
"""
if not isinstance(result, (Response, Failure)):
raise TypeError("Incorrect type: expected Response or Failure, got %s: %r" % (type(result), result))
raise TypeError(f"Incorrect type: expected Response or Failure, got {type(result)}: {result!r}")
dfd = self._scrape2(result, request, spider) # returns spider's processed output
dfd.addErrback(self.handle_spider_error, request, result, spider)
dfd.addCallback(self.handle_spider_output, request, result, spider)
@ -173,7 +173,7 @@ class Scraper:
spider=spider
)
self.crawler.stats.inc_value(
"spider_exceptions/%s" % _failure.value.__class__.__name__,
f"spider_exceptions/{_failure.value.__class__.__name__}",
spider=spider
)

View File

@ -19,10 +19,7 @@ def _isiterable(possible_iterator):
def _fname(f):
return "{}.{}".format(
f.__self__.__class__.__name__,
f.__func__.__name__
)
return f"{f.__self__.__class__.__name__}.{f.__func__.__name__}"
class SpiderMiddlewareManager(MiddlewareManager):
@ -51,8 +48,9 @@ class SpiderMiddlewareManager(MiddlewareManager):
try:
result = method(response=response, spider=spider)
if result is not None:
msg = "Middleware {} must return None or raise an exception, got {}"
raise _InvalidOutput(msg.format(_fname(method), type(result)))
msg = (f"Middleware {_fname(method)} must return None "
f"or raise an exception, got {type(result)}")
raise _InvalidOutput(msg)
except _InvalidOutput:
raise
except Exception:
@ -86,8 +84,9 @@ class SpiderMiddlewareManager(MiddlewareManager):
elif result is None:
continue
else:
msg = "Middleware {} must return None or an iterable, got {}"
raise _InvalidOutput(msg.format(_fname(method), type(result)))
msg = (f"Middleware {_fname(method)} must return None "
f"or an iterable, got {type(result)}")
raise _InvalidOutput(msg)
return _failure
def process_spider_output(result, start_index=0):
@ -110,8 +109,9 @@ class SpiderMiddlewareManager(MiddlewareManager):
if _isiterable(result):
result = _evaluate_iterable(result, method_index + 1, recovered)
else:
msg = "Middleware {} must return an iterable, got {}"
raise _InvalidOutput(msg.format(_fname(method), type(result)))
msg = (f"Middleware {_fname(method)} must return an "
f"iterable, got {type(result)}")
raise _InvalidOutput(msg)
return MutableChain(result, recovered)

View File

@ -54,8 +54,8 @@ class CookiesMiddleware:
cl = [to_unicode(c, errors='replace')
for c in request.headers.getlist('Cookie')]
if cl:
cookies = "\n".join("Cookie: {}\n".format(c) for c in cl)
msg = "Sending cookies to: {}\n{}".format(request, cookies)
cookies = "\n".join(f"Cookie: {c}\n" for c in cl)
msg = f"Sending cookies to: {request}\n{cookies}"
logger.debug(msg, extra={'spider': spider})
def _debug_set_cookie(self, response, spider):
@ -63,8 +63,8 @@ class CookiesMiddleware:
cl = [to_unicode(c, errors='replace')
for c in response.headers.getlist('Set-Cookie')]
if cl:
cookies = "\n".join("Set-Cookie: {}\n".format(c) for c in cl)
msg = "Received cookies from: {}\n{}".format(response, cookies)
cookies = "\n".join(f"Set-Cookie: {c}\n" for c in cl)
msg = f"Received cookies from: {response}\n{cookies}"
logger.debug(msg, extra={'spider': spider})
def _format_cookie(self, cookie, request):
@ -90,9 +90,9 @@ class CookiesMiddleware:
request, cookie)
decoded[key] = cookie[key].decode("latin1", errors="replace")
cookie_str = "{}={}".format(decoded.pop("name"), decoded.pop("value"))
cookie_str = f"{decoded.pop('name')}={decoded.pop('value')}"
for key, value in decoded.items(): # path, domain
cookie_str += "; {}={}".format(key.capitalize(), value)
cookie_str += f"; {key.capitalize()}={value}"
return cookie_str
def _get_request_cookies(self, jar, request):

View File

@ -24,7 +24,7 @@ class HttpProxyMiddleware:
def _basic_auth_header(self, username, password):
user_pass = to_bytes(
'%s:%s' % (unquote(username), unquote(password)),
f'{unquote(username)}:{unquote(password)}',
encoding=self.auth_encoding)
return base64.b64encode(user_pass)

View File

@ -88,7 +88,7 @@ class RetryMiddleware:
reason = global_object_name(reason.__class__)
stats.inc_value('retry/count')
stats.inc_value('retry/reason_count/%s' % reason)
stats.inc_value(f'retry/reason_count/{reason}')
return retryreq
else:
stats.inc_value('retry/max_reached')

View File

@ -61,7 +61,7 @@ class RobotsTxtMiddleware:
if netloc not in self._parsers:
self._parsers[netloc] = Deferred()
robotsurl = "%s://%s/robots.txt" % (url.scheme, url.netloc)
robotsurl = f"{url.scheme}://{url.netloc}/robots.txt"
robotsreq = Request(
robotsurl,
priority=self.DOWNLOAD_PRIORITY,
@ -94,7 +94,7 @@ class RobotsTxtMiddleware:
def _parse_robots(self, response, netloc, spider):
self.crawler.stats.inc_value('robotstxt/response_count')
self.crawler.stats.inc_value('robotstxt/response_status_count/{}'.format(response.status))
self.crawler.stats.inc_value(f'robotstxt/response_status_count/{response.status}')
rp = self._parserimpl.from_crawler(self.crawler, response.body)
rp_dfd = self._parsers[netloc]
self._parsers[netloc] = rp
@ -102,7 +102,7 @@ class RobotsTxtMiddleware:
def _robots_error(self, failure, netloc):
if failure.type is not IgnoreRequest:
key = 'robotstxt/exception_count/{}'.format(failure.type)
key = f'robotstxt/exception_count/{failure.type}'
self.crawler.stats.inc_value(key)
rp_dfd = self._parsers[netloc]
self._parsers[netloc] = None

View File

@ -17,13 +17,13 @@ class DownloaderStats:
def process_request(self, request, spider):
self.stats.inc_value('downloader/request_count', spider=spider)
self.stats.inc_value('downloader/request_method_count/%s' % request.method, spider=spider)
self.stats.inc_value(f'downloader/request_method_count/{request.method}', spider=spider)
reqlen = len(request_httprepr(request))
self.stats.inc_value('downloader/request_bytes', reqlen, spider=spider)
def process_response(self, request, response, spider):
self.stats.inc_value('downloader/response_count', spider=spider)
self.stats.inc_value('downloader/response_status_count/%s' % response.status, spider=spider)
self.stats.inc_value(f'downloader/response_status_count/{response.status}', spider=spider)
reslen = len(response_httprepr(response))
self.stats.inc_value('downloader/response_bytes', reslen, spider=spider)
return response
@ -31,4 +31,4 @@ class DownloaderStats:
def process_exception(self, request, exception, spider):
ex_class = global_object_name(exception.__class__)
self.stats.inc_value('downloader/exception_count', spider=spider)
self.stats.inc_value('downloader/exception_type_count/%s' % ex_class, spider=spider)
self.stats.inc_value(f'downloader/exception_type_count/{ex_class}', spider=spider)

View File

@ -39,7 +39,7 @@ class BaseItemExporter:
self.export_empty_fields = options.pop('export_empty_fields', False)
self.indent = options.pop('indent', None)
if not dont_fail and options:
raise TypeError("Unexpected options: %s" % ', '.join(options.keys()))
raise TypeError(f"Unexpected options: {', '.join(options.keys())}")
def export_item(self, item):
raise NotImplementedError

View File

@ -43,4 +43,4 @@ class CoreStats:
def item_dropped(self, item, spider, exception):
reason = exception.__class__.__name__
self.stats.inc_value('item_dropped_count', spider=spider)
self.stats.inc_value('item_dropped_reasons_count/%s' % reason, spider=spider)
self.stats.inc_value(f'item_dropped_reasons_count/{reason}', spider=spider)

View File

@ -48,7 +48,7 @@ class StackTraceDump:
for id_, frame in sys._current_frames().items():
name = id2name.get(id_, '')
dump = ''.join(traceback.format_stack(frame))
dumps += "# Thread: {0}({1})\n{2}\n".format(name, id_, dump)
dumps += f"# Thread: {name}({id_})\n{dump}\n"
return dumps

View File

@ -223,7 +223,7 @@ class DbmCacheStorage:
self.db = None
def open_spider(self, spider):
dbpath = os.path.join(self.cachedir, '%s.db' % spider.name)
dbpath = os.path.join(self.cachedir, f'{spider.name}.db')
self.db = self.dbmodule.open(dbpath, 'c')
logger.debug("Using DBM cache storage in %(cachepath)s" % {'cachepath': dbpath}, extra={'spider': spider})
@ -251,13 +251,13 @@ class DbmCacheStorage:
'headers': dict(response.headers),
'body': response.body,
}
self.db['%s_data' % key] = pickle.dumps(data, protocol=4)
self.db['%s_time' % key] = str(time())
self.db[f'{key}_data'] = pickle.dumps(data, protocol=4)
self.db[f'{key}_time'] = str(time())
def _read_data(self, spider, request):
key = self._request_key(request)
db = self.db
tkey = '%s_time' % key
tkey = f'{key}_time'
if tkey not in db:
return # not found
@ -265,7 +265,7 @@ class DbmCacheStorage:
if 0 < self.expiration_secs < time() - float(ts):
return # expired
return pickle.loads(db['%s_data' % key])
return pickle.loads(db[f'{key}_data'])
def _request_key(self, request):
return request_fingerprint(request)

View File

@ -30,4 +30,4 @@ class MemoryDebugger:
for cls, wdict in live_refs.items():
if not wdict:
continue
self.stats.set_value('memdebug/live_refs/%s' % cls.__name__, len(wdict), spider=spider)
self.stats.set_value(f'memdebug/live_refs/{cls.__name__}', len(wdict), spider=spider)

View File

@ -82,8 +82,8 @@ class MemoryUsage:
{'memusage': mem}, extra={'crawler': self.crawler})
if self.notify_mails:
subj = (
"%s terminated: memory usage exceeded %dM at %s"
% (self.crawler.settings['BOT_NAME'], mem, socket.gethostname())
f"{self.crawler.settings['BOT_NAME']} terminated: "
f"memory usage exceeded {mem}M at {socket.gethostname()}"
)
self._send_report(self.notify_mails, subj)
self.crawler.stats.set_value('memusage/limit_notified', 1)
@ -105,8 +105,8 @@ class MemoryUsage:
{'memusage': mem}, extra={'crawler': self.crawler})
if self.notify_mails:
subj = (
"%s warning: memory usage reached %dM at %s"
% (self.crawler.settings['BOT_NAME'], mem, socket.gethostname())
f"{self.crawler.settings['BOT_NAME']} warning: "
f"memory usage reached {mem}M at {socket.gethostname()}"
)
self._send_report(self.notify_mails, subj)
self.crawler.stats.set_value('memusage/warning_notified', 1)
@ -115,9 +115,9 @@ class MemoryUsage:
def _send_report(self, rcpts, subject):
"""send notification mail with some additional useful info"""
stats = self.crawler.stats
s = "Memory usage at engine startup : %dM\r\n" % (stats.get_value('memusage/startup')/1024/1024)
s += "Maximum memory usage : %dM\r\n" % (stats.get_value('memusage/max')/1024/1024)
s += "Current memory usage : %dM\r\n" % (self.get_virtual_size()/1024/1024)
s = f"Memory usage at engine startup : {stats.get_value('memusage/startup')/1024/1024}M\r\n"
s += f"Maximum memory usage : {stats.get_value('memusage/max')/1024/1024}M\r\n"
s += f"Current memory usage : {self.get_virtual_size()/1024/1024}M\r\n"
s += "ENGINE STATUS ------------------------------------------------------- \r\n"
s += "\r\n"

View File

@ -24,11 +24,11 @@ class StatsMailer:
o = cls(crawler.stats, recipients, mail)
crawler.signals.connect(o.spider_closed, signal=signals.spider_closed)
return o
def spider_closed(self, spider):
spider_stats = self.stats.get_stats(spider)
body = "Global stats\n\n"
body += "\n".join("%-50s : %s" % i for i in self.stats.get_stats().items())
body += "\n\n%s stats\n\n" % spider.name
body += "\n".join("%-50s : %s" % i for i in spider_stats.items())
return self.mail.send(self.recipients, "Scrapy stats for: %s" % spider.name, body)
body += "\n".join(f"{k:<50} : {v}" for k, v in self.stats.get_stats().items())
body += f"\n\n{spider.name} stats\n\n"
body += "\n".join(f"{k:<50} : {v}" for k, v in spider_stats.items())
return self.mail.send(self.recipients, f"Scrapy stats for: {spider.name}", body)

View File

@ -1,6 +1,6 @@
def obsolete_setter(setter, attrname):
def newsetter(self, value):
c = self.__class__.__name__
msg = "%s.%s is not modifiable, use %s.replace() instead" % (c, attrname, c)
msg = f"{c}.{attrname} is not modifiable, use {c}.replace() instead"
raise AttributeError(msg)
return newsetter

View File

@ -33,7 +33,7 @@ class Headers(CaselessDict):
elif isinstance(x, int):
return str(x).encode(self.encoding)
else:
raise TypeError('Unsupported value type: {}'.format(type(x)))
raise TypeError(f'Unsupported value type: {type(x)}')
def __getitem__(self, key):
try:

View File

@ -25,13 +25,13 @@ class Request(object_ref):
self._set_url(url)
self._set_body(body)
if not isinstance(priority, int):
raise TypeError("Request priority not an integer: %r" % priority)
raise TypeError(f"Request priority not an integer: {priority!r}")
self.priority = priority
if callback is not None and not callable(callback):
raise TypeError('callback must be a callable, got %s' % type(callback).__name__)
raise TypeError(f'callback must be a callable, got {type(callback).__name__}')
if errback is not None and not callable(errback):
raise TypeError('errback must be a callable, got %s' % type(errback).__name__)
raise TypeError(f'errback must be a callable, got {type(errback).__name__}')
self.callback = callback
self.errback = errback
@ -60,13 +60,13 @@ class Request(object_ref):
def _set_url(self, url):
if not isinstance(url, str):
raise TypeError('Request url must be str or unicode, got %s:' % type(url).__name__)
raise TypeError(f'Request url must be str or unicode, got {type(url).__name__}')
s = safe_url_string(url, self.encoding)
self._url = escape_ajax(s)
if ('://' not in self._url) and (not self._url.startswith('data:')):
raise ValueError('Missing scheme in request url: %s' % self._url)
raise ValueError(f'Missing scheme in request url: {self._url}')
url = property(_get_url, obsolete_setter(_set_url, 'url'))
@ -86,7 +86,7 @@ class Request(object_ref):
return self._encoding
def __str__(self):
return "<%s %s>" % (self.method, self.url)
return f"<{self.method} {self.url}>"
__repr__ = __str__

View File

@ -80,15 +80,15 @@ def _get_form(response, formname, formid, formnumber, formxpath):
base_url=get_base_url(response))
forms = root.xpath('//form')
if not forms:
raise ValueError("No <form> element found in %s" % response)
raise ValueError(f"No <form> element found in {response}")
if formname is not None:
f = root.xpath('//form[@name="%s"]' % formname)
f = root.xpath(f'//form[@name="{formname}"]')
if f:
return f[0]
if formid is not None:
f = root.xpath('//form[@id="%s"]' % formid)
f = root.xpath(f'//form[@id="{formid}"]')
if f:
return f[0]
@ -103,7 +103,7 @@ def _get_form(response, formname, formid, formnumber, formxpath):
el = el.getparent()
if el is None:
break
raise ValueError('No <form> element found with %s' % formxpath)
raise ValueError(f'No <form> element found with {formxpath}')
# If we get here, it means that either formname was None
# or invalid
@ -111,8 +111,7 @@ def _get_form(response, formname, formid, formnumber, formxpath):
try:
form = forms[formnumber]
except IndexError:
raise IndexError("Form number %d not found in %s" %
(formnumber, response))
raise IndexError(f"Form number {formnumber} not found in {response}")
else:
return form
@ -205,12 +204,12 @@ def _get_clickable(clickdata, form):
# We didn't find it, so now we build an XPath expression out of the other
# arguments, because they can be used as such
xpath = './/*' + ''.join('[@%s="%s"]' % c for c in clickdata.items())
xpath = './/*' + ''.join(f'[@{k}="{v}"]' for k, v in clickdata.items())
el = form.xpath(xpath)
if len(el) == 1:
return (el[0].get('name'), el[0].get('value') or '')
elif len(el) > 1:
raise ValueError("Multiple elements found (%r) matching the criteria "
"in clickdata: %r" % (el, clickdata))
raise ValueError(f"Multiple elements found ({el!r}) matching the "
f"criteria in clickdata: {clickdata!r}")
else:
raise ValueError('No clickable element matching clickdata: %r' % (clickdata,))
raise ValueError(f'No clickable element matching clickdata: {clickdata!r}')

View File

@ -55,8 +55,8 @@ class Response(object_ref):
if isinstance(url, str):
self._url = url
else:
raise TypeError('%s url must be str, got %s:' %
(type(self).__name__, type(url).__name__))
raise TypeError(f'{type(self).__name__} url must be str, '
f'got {type(url).__name__}')
url = property(_get_url, obsolete_setter(_set_url, 'url'))
@ -77,7 +77,7 @@ class Response(object_ref):
body = property(_get_body, obsolete_setter(_set_body, 'body'))
def __str__(self):
return "<%d %s>" % (self.status, self.url)
return f"<{self.status} {self.url}>"
__repr__ = __str__

View File

@ -47,8 +47,8 @@ class TextResponse(Response):
self._body = b'' # used by encoding detection
if isinstance(body, str):
if self._encoding is None:
raise TypeError('Cannot convert unicode body - %s has no encoding' %
type(self).__name__)
raise TypeError('Cannot convert unicode body - '
f'{type(self).__name__} has no encoding')
self._body = body.encode(self._encoding)
else:
super()._set_body(body)
@ -92,7 +92,7 @@ class TextResponse(Response):
# _body_inferred_encoding is called
benc = self.encoding
if self._cached_ubody is None:
charset = 'charset=%s' % benc
charset = f'charset={benc}'
self._cached_ubody = html_to_unicode(charset, self.body)[1]
return self._cached_ubody
@ -255,12 +255,11 @@ def _url_from_selector(sel):
# e.g. ::attr(href) result
return strip_html5_whitespace(sel.root)
if not hasattr(sel.root, 'tag'):
raise _InvalidSelector("Unsupported selector: %s" % sel)
raise _InvalidSelector(f"Unsupported selector: {sel}")
if sel.root.tag not in ('a', 'link'):
raise _InvalidSelector("Only <a> and <link> elements are supported; got <%s>" %
sel.root.tag)
raise _InvalidSelector("Only <a> and <link> elements are supported; "
f"got <{sel.root.tag}>")
href = sel.root.get('href')
if href is None:
raise _InvalidSelector("<%s> element has no href attribute: %s" %
(sel.root.tag, sel))
raise _InvalidSelector(f"<{sel.root.tag}> element has no href attribute: {sel}")
return strip_html5_whitespace(href)

View File

@ -96,19 +96,19 @@ class DictItem(MutableMapping, BaseItem):
if key in self.fields:
self._values[key] = value
else:
raise KeyError("%s does not support field: %s" % (self.__class__.__name__, key))
raise KeyError(f"{self.__class__.__name__} does not support field: {key}")
def __delitem__(self, key):
del self._values[key]
def __getattr__(self, name):
if name in self.fields:
raise AttributeError("Use item[%r] to get field value" % name)
raise AttributeError(f"Use item[{name!r}] to get field value")
raise AttributeError(name)
def __setattr__(self, name, value):
if not name.startswith('_'):
raise AttributeError("Use item[%r] = %r to set field value" % (name, value))
raise AttributeError(f"Use item[{name!r}] = {value!r} to set field value")
super().__setattr__(name, value)
def __len__(self):

View File

@ -14,7 +14,7 @@ class Link:
def __init__(self, url, text='', fragment='', nofollow=False):
if not isinstance(url, str):
got = url.__class__.__name__
raise TypeError("Link urls must be str objects, got %s" % got)
raise TypeError(f"Link urls must be str objects, got {got}")
self.url = url
self.text = text
self.fragment = fragment
@ -33,6 +33,6 @@ class Link:
def __repr__(self):
return (
'Link(url=%r, text=%r, fragment=%r, nofollow=%r)'
% (self.url, self.text, self.fragment, self.nofollow)
f'Link(url={self.url!r}, text={self.text!r}, '
f'fragment={self.fragment!r}, nofollow={self.nofollow!r})'
)

View File

@ -54,8 +54,8 @@ class LogFormatter:
def crawled(self, request, response, spider):
"""Logs a message when the crawler finds a webpage."""
request_flags = ' %s' % str(request.flags) if request.flags else ''
response_flags = ' %s' % str(response.flags) if response.flags else ''
request_flags = f' {str(request.flags)}' if request.flags else ''
response_flags = f' {str(response.flags)}' if response.flags else ''
return {
'level': logging.DEBUG,
'msg': CRAWLEDMSG,

View File

@ -108,7 +108,7 @@ class S3FilesStore:
from boto.s3.connection import S3Connection
self.S3Connection = S3Connection
if not uri.startswith("s3://"):
raise ValueError("Incorrect URI scheme in %s, expected 's3'" % uri)
raise ValueError(f"Incorrect URI scheme in {uri}, expected 's3'")
self.bucket, self.prefix = uri[5:].split('/', 1)
def stat_file(self, path, info):
@ -133,7 +133,7 @@ class S3FilesStore:
return c.get_bucket(self.bucket, validate=False)
def _get_boto_key(self, path):
key_name = '%s%s' % (self.prefix, path)
key_name = f'{self.prefix}{path}'
if self.is_botocore:
return threads.deferToThread(
self.s3_client.head_object,
@ -145,7 +145,7 @@ class S3FilesStore:
def persist_file(self, path, buf, info, meta=None, headers=None):
"""Upload file to S3 storage"""
key_name = '%s%s' % (self.prefix, path)
key_name = f'{self.prefix}{path}'
buf.seek(0)
if self.is_botocore:
extra = self._headers_to_botocore_kwargs(self.HEADERS)
@ -208,8 +208,7 @@ class S3FilesStore:
try:
kwarg = mapping[key]
except KeyError:
raise TypeError(
'Header "%s" is not supported by botocore' % key)
raise TypeError(f'Header "{key}" is not supported by botocore')
else:
extra[kwarg] = value
return extra
@ -283,7 +282,7 @@ class FTPFilesStore:
def __init__(self, uri):
if not uri.startswith("ftp://"):
raise ValueError("Incorrect URI scheme in %s, expected 'ftp'" % uri)
raise ValueError(f"Incorrect URI scheme in {uri}, expected 'ftp'")
u = urlparse(uri)
self.port = u.port
self.host = u.hostname
@ -293,7 +292,7 @@ class FTPFilesStore:
self.basedir = u.path.rstrip('/')
def persist_file(self, path, buf, info, meta=None, headers=None):
path = '%s/%s' % (self.basedir, path)
path = f'{self.basedir}/{path}'
return threads.deferToThread(
ftp_store_file, path=path, file=buf,
host=self.host, port=self.port, username=self.username,
@ -308,10 +307,10 @@ class FTPFilesStore:
ftp.login(self.username, self.password)
if self.USE_ACTIVE_MODE:
ftp.set_pasv(False)
file_path = "%s/%s" % (self.basedir, path)
last_modified = float(ftp.voidcmd("MDTM %s" % file_path)[4:].strip())
file_path = f"{self.basedir}/{path}"
last_modified = float(ftp.voidcmd(f"MDTM {file_path}")[4:].strip())
m = hashlib.md5()
ftp.retrbinary('RETR %s' % file_path, m.update)
ftp.retrbinary(f'RETR {file_path}', m.update)
return {'last_modified': last_modified, 'checksum': m.hexdigest()}
# The file doesn't exist
except Exception:
@ -515,7 +514,7 @@ class FilesPipeline(MediaPipeline):
def inc_stats(self, spider, status):
spider.crawler.stats.inc_value('file_count', spider=spider)
spider.crawler.stats.inc_value('file_status_count/%s' % status, spider=spider)
spider.crawler.stats.inc_value(f'file_status_count/{status}', spider=spider)
# Overridable Interface
def get_media_requests(self, item, info):
@ -545,4 +544,4 @@ class FilesPipeline(MediaPipeline):
media_type = mimetypes.guess_type(request.url)[0]
if media_type:
media_ext = mimetypes.guess_extension(media_type)
return 'full/%s%s' % (media_guid, media_ext)
return f'full/{media_guid}{media_ext}'

View File

@ -125,8 +125,9 @@ class ImagesPipeline(FilesPipeline):
width, height = orig_image.size
if width < self.min_width or height < self.min_height:
raise ImageException("Image too small (%dx%d < %dx%d)" %
(width, height, self.min_width, self.min_height))
raise ImageException("Image too small "
f"({width}x{height} < "
f"{self.min_width}x{self.min_height})")
image, buf = self.convert_image(orig_image)
yield path, image, buf
@ -168,8 +169,8 @@ class ImagesPipeline(FilesPipeline):
def file_path(self, request, response=None, info=None, *, item=None):
image_guid = hashlib.sha1(to_bytes(request.url)).hexdigest()
return 'full/%s.jpg' % (image_guid)
return f'full/{image_guid}.jpg'
def thumb_path(self, request, thumb_id, response=None, info=None):
thumb_guid = hashlib.sha1(to_bytes(request.url)).hexdigest()
return 'thumbs/%s/%s.jpg' % (thumb_id, thumb_guid)
return f'thumbs/{thumb_id}/{thumb_guid}.jpg'

View File

@ -61,7 +61,7 @@ class MediaPipeline:
'MYPIPE_IMAGES'
"""
class_name = self.__class__.__name__
formatted_key = "{}_{}".format(class_name.upper(), key)
formatted_key = f"{class_name.upper()}_{key}"
if (
not base_class_name
or class_name == base_class_name
@ -151,9 +151,8 @@ class MediaPipeline:
if 'item' not in sig.parameters:
old_params = str(sig)[1:-1]
new_params = old_params + ", *, item=None"
warn('%s(self, %s) is deprecated, '
'please use %s(self, %s)'
% (func.__name__, old_params, func.__name__, new_params),
warn(f'{func.__name__}(self, {old_params}) is deprecated, '
f'please use {func.__name__}(self, {new_params})',
ScrapyDeprecationWarning, stacklevel=2)
self._expects_item[func.__name__] = False

View File

@ -141,17 +141,16 @@ class DownloaderAwarePriorityQueue:
def __init__(self, crawler, downstream_queue_cls, key, slot_startprios=()):
if crawler.settings.getint('CONCURRENT_REQUESTS_PER_IP') != 0:
raise ValueError('"%s" does not support CONCURRENT_REQUESTS_PER_IP'
% (self.__class__,))
raise ValueError(f'"{self.__class__}" does not support CONCURRENT_REQUESTS_PER_IP')
if slot_startprios and not isinstance(slot_startprios, dict):
raise ValueError("DownloaderAwarePriorityQueue accepts "
"``slot_startprios`` as a dict; %r instance "
"``slot_startprios`` as a dict; "
f"{slot_startprios.__class__!r} instance "
"is passed. Most likely, it means the state is"
"created by an incompatible priority queue. "
"Only a crawl started with the same priority "
"queue class can be resumed." %
slot_startprios.__class__)
"queue class can be resumed.")
self._downloader_interface = DownloaderInterface(crawler)
self.downstream_queue_cls = downstream_queue_cls

View File

@ -45,7 +45,7 @@ class ResponseTypes:
elif mimetype in self.classes:
return self.classes[mimetype]
else:
basetype = "%s/*" % mimetype.split('/')[0]
basetype = f"{mimetype.split('/')[0]}/*"
return self.classes.get(basetype, Response)
def from_content_type(self, content_type, content_encoding=None):

View File

@ -66,8 +66,8 @@ class Selector(_ParselSelector, object_ref):
def __init__(self, response=None, text=None, type=None, root=None, **kwargs):
if response is not None and text is not None:
raise ValueError('%s.__init__() received both response and text'
% self.__class__.__name__)
raise ValueError(f'{self.__class__.__name__}.__init__() received '
'both response and text')
st = _st(response, type or self._default_type)

View File

@ -52,7 +52,7 @@ class SettingsAttribute:
self.priority = priority
def __str__(self):
return "<SettingsAttribute value={self.value!r} priority={self.priority}>".format(self=self)
return f"<SettingsAttribute value={self.value!r} priority={self.priority}>"
__repr__ = __str__

View File

@ -287,7 +287,7 @@ TEMPLATES_DIR = abspath(join(dirname(__file__), '..', 'templates'))
URLLENGTH_LIMIT = 2083
USER_AGENT = 'Scrapy/%s (+https://scrapy.org)' % import_module('scrapy').__version__
USER_AGENT = f'Scrapy/{import_module("scrapy").__version__} (+https://scrapy.org)'
TELNETCONSOLE_ENABLED = 1
TELNETCONSOLE_PORT = [6023, 6073]

View File

@ -140,7 +140,7 @@ class Shell:
b.append(" scrapy scrapy module (contains scrapy.Request, scrapy.Selector, etc)")
for k, v in sorted(self.vars.items()):
if self._is_relevant(v):
b.append(" %-10s %s" % (k, v))
b.append(f" {k:<10} {v}")
b.append("Useful shortcuts:")
if self.inthread:
b.append(" fetch(url[, redirect=True]) "
@ -150,7 +150,7 @@ class Shell:
b.append(" shelp() Shell help (print this help)")
b.append(" view(response) View response in a browser")
return "\n".join("[s] %s" % line for line in b)
return "\n".join(f"[s] {line}" for line in b)
def _is_relevant(self, value):
return isinstance(value, self.relevant_classes) or is_item(value)

View File

@ -27,7 +27,7 @@ class SpiderLoader:
dupes = []
for name, locations in self._found.items():
dupes.extend([
" {cls} named {name!r} (in {module})".format(module=mod, cls=cls, name=name)
f" {cls} named {name!r} (in {mod})"
for mod, cls in locations
if len(locations) > 1
])
@ -36,7 +36,7 @@ class SpiderLoader:
dupes_string = "\n\n".join(dupes)
warnings.warn(
"There are several spiders with the same name:\n\n"
"{}\n\n This can cause unexpected behavior.".format(dupes_string),
f"{dupes_string}\n\n This can cause unexpected behavior.",
category=UserWarning,
)
@ -53,10 +53,9 @@ class SpiderLoader:
except ImportError:
if self.warn_only:
warnings.warn(
"\n{tb}Could not load spiders from module '{modname}'. "
"See above traceback for details.".format(
modname=name, tb=traceback.format_exc()
),
f"\n{traceback.format_exc()}Could not load spiders "
f"from module '{name}'. "
"See above traceback for details.",
category=RuntimeWarning,
)
else:
@ -75,7 +74,7 @@ class SpiderLoader:
try:
return self._spiders[spider_name]
except KeyError:
raise KeyError("Spider not found: {}".format(spider_name))
raise KeyError(f"Spider not found: {spider_name}")
def find_by_request(self, request):
"""

View File

@ -43,7 +43,7 @@ class DepthMiddleware:
return False
else:
if self.verbose_stats:
self.stats.inc_value('request_depth_count/%s' % depth,
self.stats.inc_value(f'request_depth_count/{depth}',
spider=spider)
self.stats.max_value('request_depth_max', depth,
spider=spider)

View File

@ -48,7 +48,7 @@ class HttpErrorMiddleware:
if isinstance(exception, HttpError):
spider.crawler.stats.inc_value('httperror/response_ignored_count')
spider.crawler.stats.inc_value(
'httperror/response_ignored_status_count/%s' % response.status
f'httperror/response_ignored_status_count/{response.status}'
)
logger.info(
"Ignoring response %(response)r: HTTP status code is not handled or not allowed",

View File

@ -61,15 +61,15 @@ class OffsiteMiddleware:
continue
elif url_pattern.match(domain):
message = ("allowed_domains accepts only domains, not URLs. "
"Ignoring URL entry %s in allowed_domains." % domain)
f"Ignoring URL entry {domain} in allowed_domains.")
warnings.warn(message, URLWarning)
elif port_pattern.search(domain):
message = ("allowed_domains accepts only domains without ports. "
"Ignoring entry %s in allowed_domains." % domain)
f"Ignoring entry {domain} in allowed_domains.")
warnings.warn(message, PortWarning)
else:
domains.append(re.escape(domain))
regex = r'^(.*\.)?(%s)$' % '|'.join(domains)
regex = fr'^(.*\.)?({"|".join(domains)})$'
return re.compile(regex)
def spider_opened(self, spider):

View File

@ -278,7 +278,7 @@ def _load_policy_class(policy, warning_only=False):
try:
return _policy_classes[policy.lower()]
except KeyError:
msg = "Could not load referrer policy %r" % policy
msg = f"Could not load referrer policy {policy!r}"
if not warning_only:
raise RuntimeError(msg)
else:

View File

@ -26,7 +26,7 @@ class Spider(object_ref):
if name is not None:
self.name = name
elif not getattr(self, 'name', None):
raise ValueError("%s must have a name" % type(self).__name__)
raise ValueError(f"{type(self).__name__} must have a name")
self.__dict__.update(kwargs)
if not hasattr(self, 'start_urls'):
self.start_urls = []
@ -67,9 +67,8 @@ class Spider(object_ref):
warnings.warn(
"Spider.make_requests_from_url method is deprecated; it "
"won't be called in future Scrapy releases. Please "
"override Spider.start_requests method instead (see %s.%s)." % (
cls.__module__, cls.__name__
),
"override Spider.start_requests method instead "
f"(see {cls.__module__}.{cls.__name__}).",
)
for url in self.start_urls:
yield self.make_requests_from_url(url)
@ -91,7 +90,7 @@ class Spider(object_ref):
return self.parse(response, **kwargs)
def parse(self, response, **kwargs):
raise NotImplementedError('{}.parse callback is not defined'.format(self.__class__.__name__))
raise NotImplementedError(f'{self.__class__.__name__}.parse callback is not defined')
@classmethod
def update_settings(cls, settings):
@ -108,7 +107,7 @@ class Spider(object_ref):
return closed(reason)
def __str__(self):
return "<%s %r at 0x%0x>" % (type(self).__name__, self.name, id(self))
return f"<{type(self).__name__} {self.name!r} at 0x{id(self):0x}>"
__repr__ = __str__

View File

@ -71,11 +71,11 @@ class XMLFeedSpider(Spider):
elif self.iterator == 'xml':
selector = Selector(response, type='xml')
self._register_namespaces(selector)
nodes = selector.xpath('//%s' % self.itertag)
nodes = selector.xpath(f'//{self.itertag}')
elif self.iterator == 'html':
selector = Selector(response, type='html')
self._register_namespaces(selector)
nodes = selector.xpath('//%s' % self.itertag)
nodes = selector.xpath(f'//{self.itertag}')
else:
raise NotSupported('Unsupported node iterator')

View File

@ -21,8 +21,8 @@ class Root(Resource):
for nl in nlist:
args['n'] = nl
argstr = urlencode(args, doseq=True)
request.write("<a href='/follow?{0}'>follow {1}</a><br>"
.format(argstr, nl).encode('utf8'))
request.write(f"<a href='/follow?{argstr}'>follow {nl}</a><br>"
.encode('utf8'))
request.write(b"</body></html>")
return b''
@ -39,6 +39,6 @@ if __name__ == '__main__':
def _print_listening():
httpHost = httpPort.getHost()
print("Bench server at http://{}:{}".format(httpHost.host, httpHost.port))
print(f"Bench server at http://{httpHost.host}:{httpHost.port}")
reactor.callWhenRunning(_print_listening)
reactor.run()

View File

@ -17,8 +17,8 @@ def build_component_list(compdict, custom=None, convert=update_classpath):
def _check_components(complist):
if len({convert(c) for c in complist}) != len(complist):
raise ValueError('Some paths in {!r} convert to the same object, '
'please update your settings'.format(complist))
raise ValueError(f'Some paths in {complist!r} convert to the same object, '
'please update your settings')
def _map_keys(compdict):
if isinstance(compdict, BaseSettings):
@ -26,9 +26,10 @@ def build_component_list(compdict, custom=None, convert=update_classpath):
for k, v in compdict.items():
prio = compdict.getpriority(k)
if compbs.getpriority(convert(k)) == prio:
raise ValueError('Some paths in {!r} convert to the same '
raise ValueError(f'Some paths in {list(compdict.keys())!r} '
'convert to the same '
'object, please update your settings'
''.format(list(compdict.keys())))
)
else:
compbs.set(convert(k), v, priority=prio)
return compbs
@ -40,8 +41,8 @@ def build_component_list(compdict, custom=None, convert=update_classpath):
"""Fail if a value in the components dict is not a real number or None."""
for name, value in compdict.items():
if value is not None and not isinstance(value, numbers.Real):
raise ValueError('Invalid value {} for component {}, please provide '
'a real number or None instead'.format(value, name))
raise ValueError(f'Invalid value {value} for component {name}, '
'please provide a real number or None instead')
# BEGIN Backward compatibility for old (base, custom) call signature
if isinstance(custom, (list, tuple)):
@ -141,12 +142,10 @@ def feed_process_params_from_cli(settings, output, output_format=None,
def check_valid_format(output_format):
if output_format not in valid_output_formats:
raise UsageError(
"Unrecognized output format '%s'. Set a supported one (%s) "
f"Unrecognized output format '{output_format}'. "
f"Set a supported one ({tuple(valid_output_formats)}) "
"after a colon at the end of the output URI (i.e. -o/-O "
"<URI>:<FORMAT>) or as a file extension." % (
output_format,
tuple(valid_output_formats),
)
"<URI>:<FORMAT>) or as a file extension."
)
overwrite = False

View File

@ -9,7 +9,7 @@ from w3lib.http import basic_auth_header
class CurlParser(argparse.ArgumentParser):
def error(self, message):
error_msg = 'There was an error parsing the curl command: {}'.format(message)
error_msg = f'There was an error parsing the curl command: {message}'
raise ValueError(error_msg)
@ -52,7 +52,7 @@ def curl_to_request_kwargs(curl_command, ignore_unknown_options=True):
parsed_args, argv = curl_parser.parse_known_args(curl_args[1:])
if argv:
msg = 'Unrecognized options: {}'.format(', '.join(argv))
msg = f'Unrecognized options: {", ".join(argv)}'
if ignore_unknown_options:
warnings.warn(msg)
else:

View File

@ -14,9 +14,9 @@ def deprecated(use_instead=None):
def deco(func):
@wraps(func)
def wrapped(*args, **kwargs):
message = "Call to deprecated function %s." % func.__name__
message = f"Call to deprecated function {func.__name__}."
if use_instead:
message += " Use %s instead." % use_instead
message += f" Use {use_instead} instead."
warnings.warn(message, category=ScrapyDeprecationWarning, stacklevel=2)
return func(*args, **kwargs)
return wrapped

View File

@ -8,9 +8,8 @@ from scrapy.exceptions import ScrapyDeprecationWarning
def attribute(obj, oldattr, newattr, version='0.12'):
cname = obj.__class__.__name__
warnings.warn(
"%s.%s attribute is deprecated and will be no longer supported "
"in Scrapy %s, use %s.%s attribute instead"
% (cname, oldattr, version, cname, newattr),
f"{cname}.{oldattr} attribute is deprecated and will be no longer supported "
f"in Scrapy {version}, use {cname}.{newattr} attribute instead",
ScrapyDeprecationWarning,
stacklevel=3)
@ -116,7 +115,7 @@ def create_deprecated_class(
# deprecated class is in jinja2 template). __module__ attribute is not
# important enough to raise an exception as users may be unable
# to fix inspect.stack() errors.
warnings.warn("Error detecting parent module: %r" % e)
warnings.warn(f"Error detecting parent module: {e!r}")
return deprecated_cls
@ -124,7 +123,7 @@ def create_deprecated_class(
def _clspath(cls, forced=None):
if forced is not None:
return forced
return '{}.{}'.format(cls.__module__, cls.__name__)
return f'{cls.__module__}.{cls.__name__}'
DEPRECATION_RULES = [
@ -137,7 +136,7 @@ def update_classpath(path):
for prefix, replacement in DEPRECATION_RULES:
if isinstance(path, str) and path.startswith(prefix):
new_path = path.replace(prefix, replacement, 1)
warnings.warn("`{}` class is deprecated, use `{}` instead".format(path, new_path),
warnings.warn(f"`{path}` class is deprecated, use `{new_path}` instead",
ScrapyDeprecationWarning)
return new_path
return path

View File

@ -29,7 +29,7 @@ def get_engine_status(engine):
try:
checks += [(test, eval(test))]
except Exception as e:
checks += [(test, "%s (exception)" % type(e).__name__)]
checks += [(test, f"{type(e).__name__} (exception)")]
return checks
@ -38,7 +38,7 @@ def format_engine_status(engine=None):
checks = get_engine_status(engine)
s = "Execution engine status\n\n"
for test, result in checks:
s += "%-47s : %s\n" % (test, result)
s += f"{test:<47} : {result}\n"
s += "\n"
return s

View File

@ -33,5 +33,5 @@ def ftp_store_file(
dirname, filename = posixpath.split(path)
ftp_makedirs_cwd(ftp, dirname)
command = 'STOR' if overwrite else 'APPE'
ftp.storbinary('%s %s' % (command, filename), file)
ftp.storbinary(f'{command} {filename}', file)
file.close()

View File

@ -22,8 +22,8 @@ def xmliter(obj, nodename):
"""
nodename_patt = re.escape(nodename)
HEADER_START_RE = re.compile(r'^(.*?)<\s*%s(?:\s|>)' % nodename_patt, re.S)
HEADER_END_RE = re.compile(r'<\s*/%s\s*>' % nodename_patt, re.S)
HEADER_START_RE = re.compile(fr'^(.*?)<\s*{nodename_patt}(?:\s|>)', re.S)
HEADER_END_RE = re.compile(fr'<\s*/{nodename_patt}\s*>', re.S)
text = _body_or_str(obj)
header_start = re.search(HEADER_START_RE, text)
@ -31,7 +31,7 @@ def xmliter(obj, nodename):
header_end = re_rsearch(HEADER_END_RE, text)
header_end = text[header_end[1]:].strip() if header_end else ''
r = re.compile(r'<%(np)s[\s>].*?</%(np)s>' % {'np': nodename_patt}, re.DOTALL)
r = re.compile(fr'<{nodename_patt}[\s>].*?</{nodename_patt}>', re.DOTALL)
for match in r.finditer(text):
nodetext = header_start + match.group() + header_end
yield Selector(text=nodetext, type='xml').xpath('//' + nodename)[0]
@ -40,9 +40,9 @@ def xmliter(obj, nodename):
def xmliter_lxml(obj, nodename, namespace=None, prefix='x'):
from lxml import etree
reader = _StreamReader(obj)
tag = '{%s}%s' % (namespace, nodename) if namespace else nodename
tag = f'{{{namespace}}}{nodename}' if namespace else nodename
iterable = etree.iterparse(reader, tag=tag, encoding=reader.encoding)
selxpath = '//' + ('%s:%s' % (prefix, nodename) if namespace else nodename)
selxpath = '//' + (f'{prefix}:{nodename}' if namespace else nodename)
for _, node in iterable:
nodetext = etree.tostring(node, encoding='unicode')
node.clear()
@ -131,8 +131,7 @@ def _body_or_str(obj, unicode=True):
if not isinstance(obj, expected_types):
expected_types_str = " or ".join(t.__name__ for t in expected_types)
raise TypeError(
"Object %r must be %s, not %s"
% (obj, expected_types_str, type(obj).__name__)
f"Object {obj!r} must be {expected_types_str}, not {type(obj).__name__}"
)
if isinstance(obj, Response):
if not unicode:

View File

@ -143,7 +143,7 @@ def log_scrapy_info(settings):
logger.info("Scrapy %(version)s started (bot: %(bot)s)",
{'version': scrapy.__version__, 'bot': settings['BOT_NAME']})
versions = [
"%s %s" % (name, version)
f"{name} {version}"
for name, version in scrapy_components_versions()
if name != "Scrapy"
]
@ -187,7 +187,7 @@ class LogCounterHandler(logging.Handler):
self.crawler = crawler
def emit(self, record):
sname = 'log_count/{}'.format(record.levelname)
sname = f'log_count/{record.levelname}'
self.crawler.stats.inc_value(sname)

View File

@ -56,7 +56,7 @@ def load_object(path):
try:
dot = path.rindex('.')
except ValueError:
raise ValueError("Error loading object '%s': not a full path" % path)
raise ValueError(f"Error loading object '{path}': not a full path")
module, name = path[:dot], path[dot + 1:]
mod = import_module(module)
@ -64,7 +64,7 @@ def load_object(path):
try:
obj = getattr(mod, name)
except AttributeError:
raise NameError("Module '%s' doesn't define any object named '%s'" % (module, name))
raise NameError(f"Module '{module}' doesn't define any object named '{name}'")
return obj
@ -173,7 +173,7 @@ def create_instance(objcls, settings, crawler, *args, **kwargs):
instance = objcls(*args, **kwargs)
method_name = '__new__'
if instance is None:
raise TypeError("%s.%s returned None" % (objcls.__qualname__, method_name))
raise TypeError(f"{objcls.__qualname__}.{method_name} returned None")
return instance
@ -244,9 +244,10 @@ def warn_on_generator_with_return_value(spider, callable):
"""
if is_generator_with_return_value(callable):
warnings.warn(
'The "{}.{}" method is a generator and includes a "return" statement with a '
'value different than None. This could lead to unexpected behaviour. Please see '
f'The "{spider.__class__.__name__}.{callable.__name__}" method is '
'a generator and includes a "return" statement with a value '
'different than None. This could lead to unexpected behaviour. Please see '
'https://docs.python.org/3/reference/simple_stmts.html#the-return-statement '
'for details about the semantics of the "return" statement within generators'
.format(spider.__class__.__name__, callable.__name__), stacklevel=2,
'for details about the semantics of the "return" statement within generators',
stacklevel=2,
)

View File

@ -20,7 +20,7 @@ def inside_project():
try:
import_module(scrapy_module)
except ImportError as exc:
warnings.warn("Cannot import scrapy settings module %s: %s" % (scrapy_module, exc))
warnings.warn(f"Cannot import scrapy settings module {scrapy_module}: {exc}")
else:
return True
return bool(closest_scrapy_cfg())
@ -90,7 +90,7 @@ def get_project_settings():
warnings.warn(
'Use of environment variables prefixed with SCRAPY_ to override '
'settings is deprecated. The following environment variables are '
'currently defined: {}'.format(setting_envvar_list),
f'currently defined: {setting_envvar_list}',
ScrapyDeprecationWarning
)
settings.setdict(scrapy_envvars, priority='project')

View File

@ -91,7 +91,7 @@ def to_unicode(text, encoding=None, errors='strict'):
return text
if not isinstance(text, (bytes, str)):
raise TypeError('to_unicode must receive a bytes or str '
'object, got %s' % type(text).__name__)
f'object, got {type(text).__name__}')
if encoding is None:
encoding = 'utf-8'
return text.decode(encoding, errors)
@ -104,7 +104,7 @@ def to_bytes(text, encoding=None, errors='strict'):
return text
if not isinstance(text, str):
raise TypeError('to_bytes must receive a str or bytes '
'object, got %s' % type(text).__name__)
f'object, got {type(text).__name__}')
if encoding is None:
encoding = 'utf-8'
return text.encode(encoding, errors)
@ -174,7 +174,7 @@ def binary_is_text(data):
does not contain unprintable control characters.
"""
if not isinstance(data, bytes):
raise TypeError("data must be bytes, got '%s'" % type(data).__name__)
raise TypeError(f"data must be bytes, got '{type(data).__name__}'")
return all(c not in _BINARYCHARS for c in data)
@ -217,7 +217,7 @@ def get_func_args(func, stripself=False):
else:
return get_func_args(func.__call__, True)
else:
raise TypeError('%s is not callable' % type(func))
raise TypeError(f'{type(func)} is not callable')
if stripself:
func_args.pop(0)
return func_args
@ -250,7 +250,7 @@ def get_spec(func):
elif hasattr(func, '__call__'):
spec = _getargspec_py23(func.__call__)
else:
raise TypeError('%s is not callable' % type(func))
raise TypeError(f'{type(func)} is not callable')
defaults = spec.defaults or []
@ -322,7 +322,7 @@ def global_object_name(obj):
>>> global_object_name(Request)
'scrapy.http.request.Request'
"""
return "%s.%s" % (obj.__module__, obj.__name__)
return f"{obj.__module__}.{obj.__name__}"
if hasattr(sys, "pypy_version_info"):

View File

@ -10,7 +10,7 @@ def listen_tcp(portrange, host, factory):
"""Like reactor.listenTCP but tries different ports in a range."""
from twisted.internet import reactor
if len(portrange) > 2:
raise ValueError("invalid portrange: %s" % portrange)
raise ValueError(f"invalid portrange: {portrange}")
if not portrange:
return reactor.listenTCP(0, factory, interface=host)
if not hasattr(portrange, '__iter__'):
@ -78,9 +78,9 @@ def verify_installed_reactor(reactor_path):
from twisted.internet import reactor
reactor_class = load_object(reactor_path)
if not isinstance(reactor, reactor_class):
msg = "The installed reactor ({}.{}) does not match the requested one ({})".format(
reactor.__module__, reactor.__class__.__name__, reactor_path
)
msg = ("The installed reactor "
f"({reactor.__module__}.{reactor.__class__.__name__}) does not "
f"match the requested one ({reactor_path})")
raise Exception(msg)

View File

@ -84,7 +84,7 @@ def _find_method(obj, func):
# https://docs.python.org/3/reference/datamodel.html
if obj_func.__func__ is func.__func__:
return name
raise ValueError("Function %s is not an instance method in: %s" % (func, obj))
raise ValueError(f"Function {func} is not an instance method in: {obj}")
def _get_method(obj, name):
@ -92,4 +92,4 @@ def _get_method(obj, name):
try:
return getattr(obj, name)
except AttributeError:
raise ValueError("Method %r not found in: %s" % (name, obj))
raise ValueError(f"Method {name!r} not found in: {obj}")

View File

@ -39,7 +39,7 @@ def response_status_message(status):
"""Return status code plus status text descriptive message
"""
message = http.RESPONSES.get(int(status), "Unknown Status")
return '%s %s' % (status, to_unicode(message))
return f'{status} {to_unicode(message)}'
def response_httprepr(response):
@ -69,15 +69,15 @@ def open_in_browser(response, _openfunc=webbrowser.open):
body = response.body
if isinstance(response, HtmlResponse):
if b'<base' not in body:
repl = '<head><base href="%s">' % response.url
repl = f'<head><base href="{response.url}">'
body = body.replace(b'<head>', to_bytes(repl))
ext = '.html'
elif isinstance(response, TextResponse):
ext = '.txt'
else:
raise TypeError("Unsupported response type: %s" %
response.__class__.__name__)
raise TypeError("Unsupported response type: "
f"{response.__class__.__name__}")
fd, fname = tempfile.mkstemp(ext)
os.write(fd, body)
os.close(fd)
return _openfunc("file://%s" % fname)
return _openfunc(f"file://{fname}")

View File

@ -17,7 +17,7 @@ class ScrapyJSONEncoder(json.JSONEncoder):
if isinstance(o, set):
return list(o)
elif isinstance(o, datetime.datetime):
return o.strftime("%s %s" % (self.DATE_FORMAT, self.TIME_FORMAT))
return o.strftime(f"{self.DATE_FORMAT} {self.TIME_FORMAT}")
elif isinstance(o, datetime.date):
return o.strftime(self.DATE_FORMAT)
elif isinstance(o, datetime.time):
@ -29,9 +29,9 @@ class ScrapyJSONEncoder(json.JSONEncoder):
elif is_item(o):
return ItemAdapter(o).asdict()
elif isinstance(o, Request):
return "<%s %s %s>" % (type(o).__name__, o.method, o.url)
return f"<{type(o).__name__} {o.method} {o.url}>"
elif isinstance(o, Response):
return "<%s %s %s>" % (type(o).__name__, o.status, o.url)
return f"<{type(o).__name__} {o.status} {o.url}>"
else:
return super().default(o)

View File

@ -50,7 +50,7 @@ def get_temp_key_info(ssl_object):
key_info.append(ffi_buf_to_string(cname))
else:
key_info.append(ffi_buf_to_string(pyOpenSSLutil.lib.OBJ_nid2sn(key_type)))
key_info.append('%s bits' % pyOpenSSLutil.lib.EVP_PKEY_bits(temp_key))
key_info.append(f'{pyOpenSSLutil.lib.EVP_PKEY_bits(temp_key)} bits')
return ', '.join(key_info)
@ -58,4 +58,4 @@ def get_openssl_version():
system_openssl = OpenSSL.SSL.SSLeay_version(
OpenSSL.SSL.SSLEAY_VERSION
).decode('ascii', errors='replace')
return '{} ({})'.format(OpenSSL.version.__version__, system_openssl)
return f'{OpenSSL.version.__version__} ({system_openssl})'

View File

@ -12,10 +12,12 @@ def render_templatefile(path, **kwargs):
content = string.Template(raw).substitute(**kwargs)
render_path = path[:-len('.tmpl')] if path.endswith('.tmpl') else path
if path.endswith('.tmpl'):
os.rename(path, render_path)
with open(render_path, 'wb') as fp:
fp.write(content.encode('utf8'))
if path.endswith('.tmpl'):
os.remove(path)
CAMELCASE_INVALID_CHARS = re.compile(r'[^a-zA-Z\d]')

View File

@ -79,7 +79,7 @@ def get_ftp_content_and_delete(
def buffer_data(data):
ftp_data.append(data)
ftp.retrbinary('RETR %s' % path, buffer_data)
ftp.retrbinary(f'RETR {path}', buffer_data)
dirname, filename = split(path)
ftp.cwd(dirname)
ftp.delete(filename)

View File

@ -23,10 +23,10 @@ class ProcessTest:
def _process_finished(self, pp, cmd, check_code):
if pp.exitcode and check_code:
msg = "process %s exit with code %d" % (cmd, pp.exitcode)
msg += "\n>>> stdout <<<\n%s" % pp.out
msg = f"process {cmd} exit with code {pp.exitcode}"
msg += f"\n>>> stdout <<<\n{pp.out}"
msg += "\n"
msg += "\n>>> stderr <<<\n%s" % pp.err
msg += f"\n>>> stderr <<<\n{pp.err}"
raise RuntimeError(msg)
return pp.exitcode, pp.out, pp.err

View File

@ -9,7 +9,7 @@ class SiteTest:
from twisted.internet import reactor
super().setUp()
self.site = reactor.listenTCP(0, test_site(), interface="127.0.0.1")
self.baseurl = "http://localhost:%d/" % self.site.getHost().port
self.baseurl = f"http://localhost:{self.site.getHost().port}/"
def tearDown(self):
super().tearDown()
@ -40,5 +40,5 @@ def test_site():
if __name__ == '__main__':
from twisted.internet import reactor
port = reactor.listenTCP(0, test_site(), interface="127.0.0.1")
print("http://localhost:%d/" % port.getHost().port)
print(f"http://localhost:{port.getHost().port}/")
reactor.run()

View File

@ -41,9 +41,7 @@ def format_live_refs(ignore=NoneType):
if issubclass(cls, ignore):
continue
oldest = min(wdict.values())
s += "%-30s %6d oldest: %ds ago\n" % (
cls.__name__, len(wdict), now - oldest
)
s += f"{cls.__name__:<30} {len(wdict):6} oldest: {int(now - oldest)}s ago\n"
return s

View File

@ -22,7 +22,7 @@ def url_is_from_any_domain(url, domains):
if not host:
return False
domains = [d.lower() for d in domains]
return any((host == d) or (host.endswith('.%s' % d)) for d in domains)
return any((host == d) or (host.endswith(f'.{d}')) for d in domains)
def url_is_from_spider(url, spider):
@ -153,7 +153,7 @@ def strip_url(url, strip_credentials=True, strip_default_port=True, origin_only=
if (parsed_url.scheme, parsed_url.port) in (('http', 80),
('https', 443),
('ftp', 21)):
netloc = netloc.replace(':{p.port}'.format(p=parsed_url), '')
netloc = netloc.replace(f':{parsed_url.port}', '')
return urlunparse((
parsed_url.scheme,
netloc,

View File

@ -38,7 +38,7 @@ class LocalhostSpider(Spider):
if __name__ == "__main__":
with MockServer() as mock_http_server, MockDNSServer() as mock_dns_server:
port = urlparse(mock_http_server.http_address).port
url = "http://not.a.real.domain:{port}/echo".format(port=port)
url = f"http://not.a.real.domain:{port}/echo"
servers = [(mock_dns_server.host, mock_dns_server.port)]
reactor.installResolver(createResolver(servers=servers))

View File

@ -73,7 +73,7 @@ class Follow(LeafResource):
for nl in nlist:
args[b"n"] = [to_bytes(str(nl))]
argstr = urlencode(args, doseq=True)
s += "<a href='/follow?%s'>follow %d</a><br>" % (argstr, nl)
s += f"<a href='/follow?{argstr}'>follow {nl}</a><br>"
s += """</body>"""
request.write(to_bytes(s))
request.finish()
@ -91,7 +91,7 @@ class Delay(LeafResource):
return NOT_DONE_YET
def _delayedRender(self, request, n):
request.write(to_bytes("Response delayed for %0.3f seconds\n" % n))
request.write(to_bytes(f"Response delayed for {n:.3f} seconds\n"))
request.finish()
@ -310,8 +310,8 @@ if __name__ == "__main__":
def print_listening():
httpHost = httpPort.getHost()
httpsHost = httpsPort.getHost()
httpAddress = "http://%s:%d" % (httpHost.host, httpHost.port)
httpsAddress = "https://%s:%d" % (httpsHost.host, httpsHost.port)
httpAddress = f'http://{httpHost.host}:{httpHost.port}'
httpsAddress = f'https://{httpsHost.host}:{httpsHost.port}'
print(httpAddress)
print(httpsAddress)
@ -323,7 +323,7 @@ if __name__ == "__main__":
def print_listening():
host = listener.getHost()
print("%s:%s" % (host.host, host.port))
print(f"{host.host}:{host.port}")
reactor.callWhenRunning(print_listening)
reactor.run()

View File

@ -45,7 +45,7 @@ class FollowAllSpider(MetaSpider):
self.urls_visited = []
self.times = []
qargs = {'total': total, 'show': show, 'order': order, 'maxlatency': maxlatency}
url = self.mockserver.url("/follow?%s" % urlencode(qargs, doseq=1))
url = self.mockserver.url(f"/follow?{urlencode(qargs, doseq=1)}")
self.start_urls = [url]
def parse(self, response):
@ -67,7 +67,7 @@ class DelaySpider(MetaSpider):
def start_requests(self):
self.t1 = time.time()
url = self.mockserver.url("/delay?n=%s&b=%s" % (self.n, self.b))
url = self.mockserver.url(f"/delay?n={self.n}&b={self.b}")
yield Request(url, callback=self.parse, errback=self.errback)
def parse(self, response):
@ -177,7 +177,7 @@ class AsyncDefAsyncioGenComplexSpider(SimpleSpider):
depth = 2
def _get_req(self, index, cb=None):
return Request(self.mockserver.url("/status?n=200&request=%d" % index),
return Request(self.mockserver.url(f"/status?n=200&request={index}"),
meta={'index': index},
dont_filter=True,
callback=cb)
@ -236,7 +236,7 @@ class YieldingRequestsSpider(FollowAllSpider):
def start_requests(self):
for s in range(self.number_of_start_requests):
qargs = {'total': 10, 'seed': s}
url = self.mockserver.url("/follow?%s") % urlencode(qargs, doseq=1)
url = self.mockserver.url(f"/follow?{urlencode(qargs, doseq=1)}")
yield Request(url, meta={'seed': s})
@ -288,7 +288,7 @@ class DuplicateStartRequestsSpider(MockServerSpider):
def start_requests(self):
for i in range(0, self.distinct_urls):
for j in range(0, self.dupe_factor):
url = self.mockserver.url("/echo?headers=1&body=test%d" % i)
url = self.mockserver.url(f"/echo?headers=1&body=test{i}")
yield Request(url, dont_filter=self.dont_filter)
def __init__(self, url="http://localhost:8998", *args, **kwargs):

View File

@ -4,7 +4,7 @@
class TestExtension:
def __init__(self, settings):
settings.set('TEST1', "%s + %s" % (settings['TEST1'], 'started'))
settings.set('TEST1', f"{settings['TEST1']} + started")
@classmethod
def from_crawler(cls, crawler):

View File

@ -14,20 +14,20 @@ class CheckCommandTest(CommandTest):
def _write_contract(self, contracts, parse_def):
with open(self.spider, 'w') as file:
file.write("""
file.write(f"""
import scrapy
class CheckSpider(scrapy.Spider):
name = '{0}'
name = '{self.spider_name}'
start_urls = ['http://example.com']
def parse(self, response, **cb_kwargs):
\"\"\"
@url http://example.com
{1}
{contracts}
\"\"\"
{2}
""".format(self.spider_name, contracts, parse_def))
{parse_def}
""")
def _test_contract(self, contracts='', parse_def='pass'):
self._write_contract(contracts, parse_def)

View File

@ -21,14 +21,14 @@ class ParseCommandTest(ProcessTest, SiteTest, CommandTest):
self.spider_name = 'parse_spider'
fname = abspath(join(self.proj_mod_path, 'spiders', 'myspider.py'))
with open(fname, 'w') as f:
f.write("""
f.write(f"""
import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
class MySpider(scrapy.Spider):
name = '{0}'
name = '{self.spider_name}'
def parse(self, response):
if getattr(self, 'test_arg', None):
@ -58,7 +58,7 @@ class MySpider(scrapy.Spider):
self.logger.debug('It Does Not Work :(')
class MyGoodCrawlSpider(CrawlSpider):
name = 'goodcrawl{0}'
name = 'goodcrawl{self.spider_name}'
rules = (
Rule(LinkExtractor(allow=r'/html'), callback='parse_item', follow=True),
@ -74,7 +74,7 @@ class MyGoodCrawlSpider(CrawlSpider):
class MyBadCrawlSpider(CrawlSpider):
'''Spider which doesn't define a parse_item callback while using it in a rule.'''
name = 'badcrawl{0}'
name = 'badcrawl{self.spider_name}'
rules = (
Rule(LinkExtractor(allow=r'/html'), callback='parse_item', follow=True),
@ -82,7 +82,7 @@ class MyBadCrawlSpider(CrawlSpider):
def parse(self, response):
return [scrapy.Item(), dict(foo='bar')]
""".format(self.spider_name))
""")
fname = abspath(join(self.proj_mod_path, 'pipelines.py'))
with open(fname, 'w') as f:
@ -99,9 +99,9 @@ class MyPipeline:
fname = abspath(join(self.proj_mod_path, 'settings.py'))
with open(fname, 'a') as f:
f.write("""
ITEM_PIPELINES = {'%s.pipelines.MyPipeline': 1}
""" % self.project_name)
f.write(f"""
ITEM_PIPELINES = {{'{self.project_name}.pipelines.MyPipeline': 1}}
""")
@defer.inlineCallbacks
def test_spider_arguments(self):

View File

@ -65,8 +65,8 @@ class ShellTest(ProcessTest, SiteTest, unittest.TestCase):
def test_fetch_redirect_follow_302(self):
"""Test that calling ``fetch(url)`` follows HTTP redirects by default."""
url = self.url('/redirect-no-meta-refresh')
code = "fetch('{0}')"
errcode, out, errout = yield self.execute(['-c', code.format(url)])
code = f"fetch('{url}')"
errcode, out, errout = yield self.execute(['-c', code])
self.assertEqual(errcode, 0, out)
assert b'Redirecting (302)' in errout
assert b'Crawled (200)' in errout
@ -75,23 +75,23 @@ class ShellTest(ProcessTest, SiteTest, unittest.TestCase):
def test_fetch_redirect_not_follow_302(self):
"""Test that calling ``fetch(url, redirect=False)`` disables automatic redirects."""
url = self.url('/redirect-no-meta-refresh')
code = "fetch('{0}', redirect=False)"
errcode, out, errout = yield self.execute(['-c', code.format(url)])
code = f"fetch('{url}', redirect=False)"
errcode, out, errout = yield self.execute(['-c', code])
self.assertEqual(errcode, 0, out)
assert b'Crawled (302)' in errout
@defer.inlineCallbacks
def test_request_replace(self):
url = self.url('/text')
code = "fetch('{0}') or fetch(response.request.replace(method='POST'))"
errcode, out, _ = yield self.execute(['-c', code.format(url)])
code = f"fetch('{url}') or fetch(response.request.replace(method='POST'))"
errcode, out, _ = yield self.execute(['-c', code])
self.assertEqual(errcode, 0, out)
@defer.inlineCallbacks
def test_scrapy_import(self):
url = self.url('/text')
code = "fetch(scrapy.Request('{0}'))"
errcode, out, _ = yield self.execute(['-c', code.format(url)])
code = f"fetch(scrapy.Request('{url}'))"
errcode, out, _ = yield self.execute(['-c', code])
self.assertEqual(errcode, 0, out)
@defer.inlineCallbacks

View File

@ -16,7 +16,7 @@ class VersionTest(ProcessTest, unittest.TestCase):
_, out, _ = yield self.execute([])
self.assertEqual(
out.strip().decode(encoding),
"Scrapy %s" % scrapy.__version__,
f"Scrapy {scrapy.__version__}",
)
@defer.inlineCallbacks

Some files were not shown because too many files have changed in this diff.