mirror of https://github.com/scrapy/scrapy.git (synced 2025-02-06 11:00:46 +00:00)
Fix typos
This commit is contained in:
parent 65d60b9692
commit 3a263280ba
@@ -1830,7 +1830,7 @@ New features
 * A new scheduler priority queue,
   ``scrapy.pqueues.DownloaderAwarePriorityQueue``, may be
   :ref:`enabled <broad-crawls-scheduler-priority-queue>` for a significant
-  scheduling improvement on crawls targetting multiple web domains, at the
+  scheduling improvement on crawls targeting multiple web domains, at the
   cost of no :setting:`CONCURRENT_REQUESTS_PER_IP` support (:issue:`3520`)

 * A new :attr:`Request.cb_kwargs <scrapy.http.Request.cb_kwargs>` attribute
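
For context, enabling the queue mentioned in this entry is a one-line settings change. A minimal sketch (only the setting value comes from the entry above; everything else is illustrative)::

    # settings.py -- broad crawl across many domains
    SCHEDULER_PRIORITY_QUEUE = "scrapy.pqueues.DownloaderAwarePriorityQueue"
    # Per the entry above, CONCURRENT_REQUESTS_PER_IP is not supported
    # with this queue.
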
@@ -2868,7 +2868,7 @@ Bug fixes
 - Fix for selected callbacks when using ``CrawlSpider`` with :command:`scrapy parse <parse>`
   (:issue:`2225`).
 - Fix for invalid JSON and XML files when spider yields no items (:issue:`872`).
-- Implement ``flush()`` fpr ``StreamLogger`` avoiding a warning in logs (:issue:`2125`).
+- Implement ``flush()`` for ``StreamLogger`` avoiding a warning in logs (:issue:`2125`).

 Refactoring
 ~~~~~~~~~~~
@@ -3731,7 +3731,7 @@ Scrapy 0.24.3 (2014-08-09)
 - adding some xpath tips to selectors docs (:commit:`2d103e0`)
 - fix tests to account for https://github.com/scrapy/w3lib/pull/23 (:commit:`f8d366a`)
 - get_func_args maximum recursion fix #728 (:commit:`81344ea`)
-- Updated input/ouput processor example according to #560. (:commit:`f7c4ea8`)
+- Updated input/output processor example according to #560. (:commit:`f7c4ea8`)
 - Fixed Python syntax in tutorial. (:commit:`db59ed9`)
 - Add test case for tunneling proxy (:commit:`f090260`)
 - Bugfix for leaking Proxy-Authorization header to remote host when using tunneling (:commit:`d8793af`)
@@ -4393,7 +4393,7 @@ Scrapyd changes
 ~~~~~~~~~~~~~~~

 - Scrapyd now uses one process per spider
-- It stores one log file per spider run, and rotate them keeping the lastest 5 logs per spider (by default)
+- It stores one log file per spider run, and rotate them keeping the latest 5 logs per spider (by default)
 - A minimal web ui was added, available at http://localhost:6800 by default
 - There is now a ``scrapy server`` command to start a Scrapyd server of the current project

@@ -4429,7 +4429,7 @@ New features and improvements
 - Added two new methods to item pipeline open_spider(), close_spider() with deferred support (#195)
 - Support for overriding default request headers per spider (#181)
 - Replaced default Spider Manager with one with similar functionality but not depending on Twisted Plugins (#186)
-- Splitted Debian package into two packages - the library and the service (#187)
+- Split Debian package into two packages - the library and the service (#187)
 - Scrapy log refactoring (#188)
 - New extension for keeping persistent spider contexts among different runs (#203)
 - Added ``dont_redirect`` request.meta key for avoiding redirects (#233)
@@ -1566,7 +1566,7 @@ If a reactor is already installed,

 :meth:`CrawlerRunner.__init__ <scrapy.crawler.CrawlerRunner.__init__>` raises
 :exc:`Exception` if the installed reactor does not match the
-:setting:`TWISTED_REACTOR` setting; therfore, having top-level
+:setting:`TWISTED_REACTOR` setting; therefore, having top-level
 :mod:`~twisted.internet.reactor` imports in project files and imported
 third-party libraries will make Scrapy raise :exc:`Exception` when
 it checks which reactor is installed.
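
The passage above is documentation prose, not code from this diff; as a hedged illustration of what it implies, the setting plus a lazy reactor import might look like this (the reactor value and the helper name are assumptions, not taken from the source)::

    # settings.py
    TWISTED_REACTOR = "twisted.internet.asyncioreactor.AsyncioSelectorReactor"

    # project code: avoid a top-level `from twisted.internet import reactor`
    def call_later(delay, func):
        # hypothetical helper; importing lazily lets Scrapy install the
        # reactor named in TWISTED_REACTOR before anything touches it
        from twisted.internet import reactor
        return reactor.callLater(delay, func)
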
@@ -1658,7 +1658,7 @@ Default: ``"Scrapy/VERSION (+https://scrapy.org)"``
 The default User-Agent to use when crawling, unless overridden. This user agent is
 also used by :class:`~scrapy.downloadermiddlewares.robotstxt.RobotsTxtMiddleware`
 if :setting:`ROBOTSTXT_USER_AGENT` setting is ``None`` and
-there is no overridding User-Agent header specified for the request.
+there is no overriding User-Agent header specified for the request.


 Settings documented elsewhere:
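
A hedged sketch of the settings this paragraph describes (the user agent string is a made-up placeholder)::

    # settings.py
    USER_AGENT = "examplebot/1.0 (+https://example.com/bot-info)"
    # With ROBOTSTXT_USER_AGENT left as None (the default), RobotsTxtMiddleware
    # matches robots.txt rules against USER_AGENT, unless a request carries its
    # own User-Agent header.
    ROBOTSTXT_USER_AGENT = None
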
@@ -99,7 +99,7 @@ Available Shortcuts
 shortcuts

 - ``fetch(url[, redirect=True])`` - fetch a new response from the given URL
-  and update all related objects accordingly. You can optionaly ask for HTTP
+  and update all related objects accordingly. You can optionally ask for HTTP
   3xx redirections to not be followed by passing ``redirect=False``

 - ``fetch(request)`` - fetch a new response from the given request and update
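
A hedged example of the ``fetch`` shortcut inside a ``scrapy shell`` session (the URLs are placeholders)::

    fetch("https://example.com/page")                      # follows 3xx redirects by default
    fetch("https://example.com/old-url", redirect=False)   # keep the 3xx response itself
    response.status, response.url                          # shortcuts updated after each fetch
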
@@ -372,7 +372,7 @@ CrawlSpider
 described below. If multiple rules match the same link, the first one
 will be used, according to the order they're defined in this attribute.

-This spider also exposes an overrideable method:
+This spider also exposes an overridable method:

 .. method:: parse_start_url(response, **kwargs)

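
A minimal sketch of overriding that method (the spider name, URLs and XPaths are invented for illustration)::

    from scrapy.spiders import CrawlSpider, Rule
    from scrapy.linkextractors import LinkExtractor

    class ExampleCrawlSpider(CrawlSpider):
        name = "example_crawl"
        start_urls = ["https://example.com/"]
        rules = (Rule(LinkExtractor(allow=r"/items/"), callback="parse_item"),)

        def parse_start_url(self, response, **kwargs):
            # responses for start URLs are not processed by the rules above,
            # so anything wanted from them is extracted here
            yield {"start_title": response.xpath("//title/text()").get()}

        def parse_item(self, response):
            yield {"url": response.url}
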
@@ -534,7 +534,7 @@ XMLFeedSpider
        itertag = 'n:url'
        # ...

-Apart from these new attributes, this spider has the following overrideable
+Apart from these new attributes, this spider has the following overridable
 methods too:

 .. method:: adapt_response(response)
@@ -13,7 +13,7 @@ There are 3 numbers in a Scrapy version: *A.B.C*
   large changes.
 * *B* is the release number. This will include many changes including features
   and things that possibly break backward compatibility, although we strive to
-  keep theses cases at a minimum.
+  keep these cases at a minimum.
 * *C* is the bugfix release number.

 Backward-incompatibilities are explicitly mentioned in the :ref:`release notes <news>`,
@@ -1,5 +1,5 @@
 """
-A spider that generate light requests to meassure QPS troughput
+A spider that generate light requests to meassure QPS throughput

 usage:

@@ -2,7 +2,7 @@
 An extension to retry failed requests that are potentially caused by temporary
 problems such as a connection timeout or HTTP 500 error.

-You can change the behaviour of this middleware by modifing the scraping settings:
+You can change the behaviour of this middleware by modifying the scraping settings:
 RETRY_TIMES - how many times to retry a failed page
 RETRY_HTTP_CODES - which HTTP response codes to retry

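
A hedged sketch of the settings named in that docstring (values are illustrative, not defaults taken from this diff)::

    # settings.py
    RETRY_ENABLED = True
    RETRY_TIMES = 3                  # retry each failed page up to three times
    RETRY_HTTP_CODES = [500, 502, 503, 504, 522, 524, 408, 429]
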
@@ -30,7 +30,7 @@ class BaseItemExporter:
         self._configure(kwargs, dont_fail=dont_fail)

     def _configure(self, options, dont_fail=False):
-        """Configure the exporter by poping options from the ``options`` dict.
+        """Configure the exporter by popping options from the ``options`` dict.
         If dont_fail is set, it won't raise an exception on unexpected options
         (useful for using with keyword arguments in subclasses ``__init__`` methods)
         """
@@ -88,7 +88,7 @@ class LxmlParserLinkExtractor:
     def _process_links(self, links):
         """ Normalize and filter extracted links

-        The subclass should override it if neccessary
+        The subclass should override it if necessary
         """
         return self._deduplicate_if_needed(links)

@@ -85,7 +85,7 @@ class S3FilesStore:
     AWS_USE_SSL = None
     AWS_VERIFY = None

-    POLICY = 'private' # Overriden from settings.FILES_STORE_S3_ACL in FilesPipeline.from_settings
+    POLICY = 'private' # Overridden from settings.FILES_STORE_S3_ACL in FilesPipeline.from_settings
     HEADERS = {
         'Cache-Control': 'max-age=172800',
     }
@@ -142,7 +142,7 @@ class S3FilesStore:
             **extra)

     def _headers_to_botocore_kwargs(self, headers):
-        """ Convert headers to botocore keyword agruments.
+        """ Convert headers to botocore keyword arguments.
         """
         # This is required while we need to support both boto and botocore.
         mapping = CaselessDict({
@@ -190,7 +190,7 @@ class GCSFilesStore:
     CACHE_CONTROL = 'max-age=172800'

     # The bucket's default object ACL will be applied to the object.
-    # Overriden from settings.FILES_STORE_GCS_ACL in FilesPipeline.from_settings.
+    # Overridden from settings.FILES_STORE_GCS_ACL in FilesPipeline.from_settings.
     POLICY = None

     def __init__(self, uri):
@@ -291,7 +291,7 @@ class FilesPipeline(MediaPipeline):
     """Abstract pipeline that implement the file downloading

     This pipeline tries to minimize network transfers and file processing,
-    doing stat of the files and determining if file is new, uptodate or
+    doing stat of the files and determining if file is new, up-to-date or
     expired.

     ``new`` files are those that pipeline never processed and needs to be
@@ -43,7 +43,7 @@ class XMLFeedSpider(Spider):
         return response

     def parse_node(self, response, selector):
-        """This method must be overriden with your custom spider functionality"""
+        """This method must be overridden with your custom spider functionality"""
         if hasattr(self, 'parse_item'): # backward compatibility
             return self.parse_item(response, selector)
         raise NotImplementedError
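
A hedged sketch of a spider overriding ``parse_node`` (the feed URL and tag names are invented)::

    from scrapy.spiders import XMLFeedSpider

    class ExampleXMLSpider(XMLFeedSpider):
        name = "example_xml"
        start_urls = ["https://example.com/feed.xml"]
        iterator = "iternodes"
        itertag = "item"

        def parse_node(self, response, selector):
            # called once per <item> node; `selector` wraps that node
            yield {
                "title": selector.xpath("title/text()").get(),
                "link": selector.xpath("link/text()").get(),
            }
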
@@ -113,7 +113,7 @@ class CSVFeedSpider(Spider):
         return response

     def parse_row(self, response, row):
-        """This method must be overriden with your custom spider functionality"""
+        """This method must be overridden with your custom spider functionality"""
         raise NotImplementedError

     def parse_rows(self, response):
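
And the CSV counterpart, a hedged sketch of overriding ``parse_row`` (the URL and column names are invented)::

    from scrapy.spiders import CSVFeedSpider

    class ExampleCSVSpider(CSVFeedSpider):
        name = "example_csv"
        start_urls = ["https://example.com/products.csv"]
        delimiter = ","
        headers = ["id", "name", "price"]

        def parse_row(self, response, row):
            # `row` is a dict keyed by the headers above
            yield {"id": row["id"], "name": row["name"], "price": row["price"]}
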
@@ -14,7 +14,7 @@ def _embed_ipython_shell(namespace={}, banner=''):
     @wraps(_embed_ipython_shell)
     def wrapper(namespace=namespace, banner=''):
         config = load_default_config()
-        # Always use .instace() to ensure _instance propagation to all parents
+        # Always use .instance() to ensure _instance propagation to all parents
         # this is needed for <TAB> completion works well for new imports
         # and clear the instance to always have the fresh env
         # on repeated breaks like with inspect_response()
@@ -41,7 +41,7 @@ class CaselessDict(dict):
         return key.lower()

     def normvalue(self, value):
-        """Method to normalize values prior to be setted"""
+        """Method to normalize values prior to be set"""
         return value

     def get(self, key, def_val=None):
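
A hedged sketch of subclassing ``CaselessDict`` and overriding ``normvalue`` (the subclass is invented for illustration)::

    from scrapy.utils.datatypes import CaselessDict

    class StrippedCaselessDict(CaselessDict):
        """Case-insensitive keys; string values are stripped before storage."""

        def normvalue(self, value):
            # normvalue() is applied to every value before it is stored
            return value.strip() if isinstance(value, str) else value

    d = StrippedCaselessDict()
    d["Content-Type"] = "  text/html  "
    print(d["content-type"])  # -> "text/html"
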
@@ -34,7 +34,7 @@ def defer_succeed(result) -> Deferred:
     """Same as twisted.internet.defer.succeed but delay calling callback until
     next reactor loop

-    It delays by 100ms so reactor has a chance to go trough readers and writers
+    It delays by 100ms so reactor has a chance to go through readers and writers
     before attending pending delayed calls, so do not set delay to zero.
     """
     from twisted.internet import reactor
@@ -48,7 +48,7 @@ def request_fingerprint(
     the fingerprint.

     For this reason, request headers are ignored by default when calculating
-    the fingeprint. If you want to include specific headers use the
+    the fingerprint. If you want to include specific headers use the
     include_headers argument, which is a list of Request headers to include.

     Also, servers usually ignore fragments in urls when handling requests,
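
A hedged usage sketch for the ``include_headers`` argument described above (the header name is illustrative)::

    from scrapy import Request
    from scrapy.utils.request import request_fingerprint

    req = Request("https://example.com/page", headers={"X-Session": "abc"})
    request_fingerprint(req)                                 # headers ignored by default
    request_fingerprint(req, include_headers=["X-Session"])  # header folded into the hash
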
@@ -78,7 +78,7 @@ def request_fingerprint(


 def request_authenticate(request: Request, username: str, password: str) -> None:
-    """Autenticate the given request (in place) using the HTTP basic access
+    """Authenticate the given request (in place) using the HTTP basic access
     authentication mechanism (RFC 2617) and the given username and password
     """
     request.headers['Authorization'] = basic_auth_header(username, password)
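
A hedged usage sketch for ``request_authenticate`` (credentials are placeholders)::

    from scrapy import Request
    from scrapy.utils.request import request_authenticate

    req = Request("https://example.com/private")
    request_authenticate(req, "user", "pass")  # sets the Authorization header in place
    # req.headers["Authorization"] is now b"Basic dXNlcjpwYXNz" (base64 of "user:pass")
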
@@ -260,7 +260,7 @@ ItemForm
    ia['width'] = x.x('//p[@class="width"]')
    ia['volume'] = x.x('//p[@class="volume"]')

-   # another example passing parametes on instance
+   # another example passing parameters on instance
    ia = NewsForm(response, encoding='utf-8')
    ia['name'] = x.x('//p[@class="name"]')

@@ -107,7 +107,7 @@ Using default_builder


 This will use default_builder as the builder for every field in the item class.
-As a reducer is not set reducers will be set based on Item Field classess.
+As a reducer is not set reducers will be set based on Item Field classes.

 Reset default_builder for a field
 ==================================
@@ -64,7 +64,7 @@ Request Processors takes requests objects and can perform any action to them,
 like filtering or modifying on the fly.

 The current ``LinkExtractor`` had integrated link processing, like
-canonicalize. Request Processors can be reutilized and applied in serie.
+canonicalize. Request Processors can be reutilized and applied in series.

 Request Generator
 -----------------
@@ -22,7 +22,7 @@ Instead, the hooks are spread over:
 * Downloader handlers (DOWNLOADER_HANDLERS)
 * Item pipelines (ITEM_PIPELINES)
 * Feed exporters and storages (FEED_EXPORTERS, FEED_STORAGES)
-* Overrideable components (DUPEFILTER_CLASS, STATS_CLASS, SCHEDULER, SPIDER_MANAGER_CLASS, ITEM_PROCESSOR, etc)
+* Overridable components (DUPEFILTER_CLASS, STATS_CLASS, SCHEDULER, SPIDER_MANAGER_CLASS, ITEM_PROCESSOR, etc)
 * Generic extensions (EXTENSIONS)
 * CLI commands (COMMANDS_MODULE)

@@ -19,7 +19,7 @@ class BaseResponseTest(unittest.TestCase):
     response_class = Response

     def test_init(self):
-        # Response requires url in the consturctor
+        # Response requires url in the constructor
         self.assertRaises(Exception, self.response_class)
         self.assertTrue(isinstance(self.response_class('http://example.com/'), self.response_class))
         self.assertRaises(TypeError, self.response_class, b"http://example.com")
@@ -392,7 +392,7 @@ class TextResponseTest(BaseResponseTest):
     def test_declared_encoding_invalid(self):
         """Check that unknown declared encodings are ignored"""
         r = self.response_class("http://www.example.com",
-                                headers={"Content-type": ["text/html; charset=UKNOWN"]},
+                                headers={"Content-type": ["text/html; charset=UNKNOWN"]},
                                 body=b"\xc2\xa3")
         self.assertEqual(r._declared_encoding(), None)
         self._assert_response_values(r, 'utf-8', "\xa3")
@@ -106,9 +106,9 @@ class CrawlTestCase(TestCase):
         """
         Downloader middleware which returns a response with an specific 'request' attribute.

-        * The spider callback should receive the overriden response.request
-        * Handlers listening to the response_received signal should receive the overriden response.request
-        * The "crawled" log message should show the overriden response.request
+        * The spider callback should receive the overridden response.request
+        * Handlers listening to the response_received signal should receive the overridden response.request
+        * The "crawled" log message should show the overridden response.request
         """
         signal_params = {}

@@ -144,7 +144,7 @@ class CrawlTestCase(TestCase):
         An exception is raised but caught by the next middleware, which
         returns a Response with a specific 'request' attribute.

-        The spider callback should receive the overriden response.request
+        The spider callback should receive the overridden response.request
         """
         url = self.mockserver.url("/status?n=200")
         runner = CrawlerRunner(settings={
@@ -23,7 +23,7 @@ class MustbeDeferredTest(unittest.TestCase):

         dfd = mustbe_deferred(_append, 1)
         dfd.addCallback(self.assertEqual, [1, 2]) # it is [1] with maybeDeferred
-        steps.append(2) # add another value, that should be catched by assertEqual
+        steps.append(2) # add another value, that should be caught by assertEqual
         return dfd

     def test_unfired_deferred(self):
@@ -37,7 +37,7 @@ class MustbeDeferredTest(unittest.TestCase):

         dfd = mustbe_deferred(_append, 1)
         dfd.addCallback(self.assertEqual, [1, 2]) # it is [1] with maybeDeferred
-        steps.append(2) # add another value, that should be caught by assertEqual
+        steps.append(2) # add another value, that should be caught by assertEqual
         return dfd

@@ -36,7 +36,7 @@ class UtilsRenderTemplateFileTestCase(unittest.TestCase):
         self.assertEqual(result.read().decode('utf8'), rendered)

         os.remove(render_path)
-        assert not os.path.exists(render_path) # Failure of test iself
+        assert not os.path.exists(render_path) # Failure of test itself


 if '__main__' == __name__: