Mirror of https://github.com/scrapy/scrapy.git, synced 2025-02-06 11:00:46 +00:00

adding black formatter to all the code

Emmanuel Rondan 2022-11-29 11:30:46 -03:00
parent 5bd27191a2
commit e211ec0aa2
308 changed files with 16487 additions and 11921 deletions
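
The changes below are mechanical: black rewrites string literals to use double quotes, reflows long calls to one argument per line within its default 88-character limit, and normalizes comment and slice spacing. As an illustration only (not part of this commit), the sketch below runs black's format_str API on a made-up snippet in the style of the code touched here; the function and key names are invented for the example.

# Illustrative sketch, not code from this repository: refnode_kwargs and its
# dict keys are invented to mirror the style of the hunks below.
import black

source = (
    "def refnode_kwargs(setting_data):\n"
    "    return dict(todocname=setting_data['docname'], targetid=setting_data['refid'], child=setting_data['setting_name'])\n"
)

# black.Mode() carries the defaults this commit appears to rely on:
# 88-character lines and double-quoted strings.
print(black.format_str(source, mode=black.Mode()))

Running black over the whole source tree (for example, black scrapy tests from the project root) produces the same kind of churn project-wide, which is presumably how this 308-file changeset was generated.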


@ -9,7 +9,7 @@ from tests.keys import generate_keys
def _py_files(folder):
return (str(p) for p in Path(folder).rglob('*.py'))
return (str(p) for p in Path(folder).rglob("*.py"))
collect_ignore = [
@ -21,16 +21,16 @@ collect_ignore = [
*_py_files("tests/CrawlerRunner"),
]
with Path('tests/ignores.txt').open(encoding="utf-8") as reader:
with Path("tests/ignores.txt").open(encoding="utf-8") as reader:
for line in reader:
file_path = line.strip()
if file_path and file_path[0] != '#':
if file_path and file_path[0] != "#":
collect_ignore.append(file_path)
if not H2_ENABLED:
collect_ignore.extend(
(
'scrapy/core/downloader/handlers/http2.py',
"scrapy/core/downloader/handlers/http2.py",
*_py_files("scrapy/core/http2"),
)
)
@ -50,7 +50,7 @@ def pytest_addoption(parser):
)
@pytest.fixture(scope='class')
@pytest.fixture(scope="class")
def reactor_pytest(request):
if not request.cls:
# doctests
@ -61,14 +61,17 @@ def reactor_pytest(request):
@pytest.fixture(autouse=True)
def only_asyncio(request, reactor_pytest):
if request.node.get_closest_marker('only_asyncio') and reactor_pytest != 'asyncio':
pytest.skip('This test is only run with --reactor=asyncio')
if request.node.get_closest_marker("only_asyncio") and reactor_pytest != "asyncio":
pytest.skip("This test is only run with --reactor=asyncio")
@pytest.fixture(autouse=True)
def only_not_asyncio(request, reactor_pytest):
if request.node.get_closest_marker('only_not_asyncio') and reactor_pytest == 'asyncio':
pytest.skip('This test is only run without --reactor=asyncio')
if (
request.node.get_closest_marker("only_not_asyncio")
and reactor_pytest == "asyncio"
):
pytest.skip("This test is only run without --reactor=asyncio")
def pytest_configure(config):


@ -11,15 +11,15 @@ class settingslist_node(nodes.General, nodes.Element):
class SettingsListDirective(Directive):
def run(self):
return [settingslist_node('')]
return [settingslist_node("")]
def is_setting_index(node):
if node.tagname == 'index' and node['entries']:
if node.tagname == "index" and node["entries"]:
# index entries for setting directives look like:
# [('pair', 'SETTING_NAME; setting', 'std:setting-SETTING_NAME', '')]
entry_type, info, refid = node['entries'][0][:3]
return entry_type == 'pair' and info.endswith('; setting')
entry_type, info, refid = node["entries"][0][:3]
return entry_type == "pair" and info.endswith("; setting")
return False
@ -30,14 +30,14 @@ def get_setting_target(node):
def get_setting_name_and_refid(node):
"""Extract setting name from directive index node"""
entry_type, info, refid = node['entries'][0][:3]
return info.replace('; setting', ''), refid
entry_type, info, refid = node["entries"][0][:3]
return info.replace("; setting", ""), refid
def collect_scrapy_settings_refs(app, doctree):
env = app.builder.env
if not hasattr(env, 'scrapy_all_settings'):
if not hasattr(env, "scrapy_all_settings"):
env.scrapy_all_settings = []
for node in doctree.traverse(is_setting_index):
@ -46,18 +46,23 @@ def collect_scrapy_settings_refs(app, doctree):
setting_name, refid = get_setting_name_and_refid(node)
env.scrapy_all_settings.append({
'docname': env.docname,
'setting_name': setting_name,
'refid': refid,
})
env.scrapy_all_settings.append(
{
"docname": env.docname,
"setting_name": setting_name,
"refid": refid,
}
)
def make_setting_element(setting_data, app, fromdocname):
refnode = make_refnode(app.builder, fromdocname,
todocname=setting_data['docname'],
targetid=setting_data['refid'],
child=nodes.Text(setting_data['setting_name']))
refnode = make_refnode(
app.builder,
fromdocname,
todocname=setting_data["docname"],
targetid=setting_data["refid"],
child=nodes.Text(setting_data["setting_name"]),
)
p = nodes.paragraph()
p += refnode
@ -71,10 +76,13 @@ def replace_settingslist_nodes(app, doctree, fromdocname):
for node in doctree.traverse(settingslist_node):
settings_list = nodes.bullet_list()
settings_list.extend([make_setting_element(d, app, fromdocname)
for d in sorted(env.scrapy_all_settings,
key=itemgetter('setting_name'))
if fromdocname != d['docname']])
settings_list.extend(
[
make_setting_element(d, app, fromdocname)
for d in sorted(env.scrapy_all_settings, key=itemgetter("setting_name"))
if fromdocname != d["docname"]
]
)
node.replace_self(settings_list)
@ -99,41 +107,41 @@ def setup(app):
rolename="reqmeta",
indextemplate="pair: %s; reqmeta",
)
app.add_role('source', source_role)
app.add_role('commit', commit_role)
app.add_role('issue', issue_role)
app.add_role('rev', rev_role)
app.add_role("source", source_role)
app.add_role("commit", commit_role)
app.add_role("issue", issue_role)
app.add_role("rev", rev_role)
app.add_node(settingslist_node)
app.add_directive('settingslist', SettingsListDirective)
app.add_directive("settingslist", SettingsListDirective)
app.connect('doctree-read', collect_scrapy_settings_refs)
app.connect('doctree-resolved', replace_settingslist_nodes)
app.connect("doctree-read", collect_scrapy_settings_refs)
app.connect("doctree-resolved", replace_settingslist_nodes)
def source_role(name, rawtext, text, lineno, inliner, options={}, content=[]):
ref = 'https://github.com/scrapy/scrapy/blob/master/' + text
ref = "https://github.com/scrapy/scrapy/blob/master/" + text
set_classes(options)
node = nodes.reference(rawtext, text, refuri=ref, **options)
return [node], []
def issue_role(name, rawtext, text, lineno, inliner, options={}, content=[]):
ref = 'https://github.com/scrapy/scrapy/issues/' + text
ref = "https://github.com/scrapy/scrapy/issues/" + text
set_classes(options)
node = nodes.reference(rawtext, 'issue ' + text, refuri=ref, **options)
node = nodes.reference(rawtext, "issue " + text, refuri=ref, **options)
return [node], []
def commit_role(name, rawtext, text, lineno, inliner, options={}, content=[]):
ref = 'https://github.com/scrapy/scrapy/commit/' + text
ref = "https://github.com/scrapy/scrapy/commit/" + text
set_classes(options)
node = nodes.reference(rawtext, 'commit ' + text, refuri=ref, **options)
node = nodes.reference(rawtext, "commit " + text, refuri=ref, **options)
return [node], []
def rev_role(name, rawtext, text, lineno, inliner, options={}, content=[]):
ref = 'http://hg.scrapy.org/scrapy/changeset/' + text
ref = "http://hg.scrapy.org/scrapy/changeset/" + text
set_classes(options)
node = nodes.reference(rawtext, 'r' + text, refuri=ref, **options)
node = nodes.reference(rawtext, "r" + text, refuri=ref, **options)
return [node], []


@ -25,30 +25,30 @@ sys.path.insert(0, str(Path(__file__).parent.parent))
# Add any Sphinx extension module names here, as strings. They can be extensions
# coming with Sphinx (named 'sphinx.ext.*') or your custom ones.
extensions = [
'hoverxref.extension',
'notfound.extension',
'scrapydocs',
'sphinx.ext.autodoc',
'sphinx.ext.coverage',
'sphinx.ext.intersphinx',
'sphinx.ext.viewcode',
"hoverxref.extension",
"notfound.extension",
"scrapydocs",
"sphinx.ext.autodoc",
"sphinx.ext.coverage",
"sphinx.ext.intersphinx",
"sphinx.ext.viewcode",
]
# Add any paths that contain templates here, relative to this directory.
templates_path = ['_templates']
templates_path = ["_templates"]
# The suffix of source filenames.
source_suffix = '.rst'
source_suffix = ".rst"
# The encoding of source files.
#source_encoding = 'utf-8'
# source_encoding = 'utf-8'
# The master toctree document.
master_doc = 'index'
master_doc = "index"
# General information about the project.
project = 'Scrapy'
copyright = f'2008{datetime.now().year}, Scrapy developers'
project = "Scrapy"
copyright = f"2008{datetime.now().year}, Scrapy developers"
# The version info for the project you're documenting, acts as replacement for
# |version| and |release|, also used in various other places throughout the
@ -57,50 +57,51 @@ copyright = f'2008{datetime.now().year}, Scrapy developers'
# The short X.Y version.
try:
import scrapy
version = '.'.join(map(str, scrapy.version_info[:2]))
version = ".".join(map(str, scrapy.version_info[:2]))
release = scrapy.__version__
except ImportError:
version = ''
release = ''
version = ""
release = ""
# The language for content autogenerated by Sphinx. Refer to documentation
# for a list of supported languages.
language = 'en'
language = "en"
# There are two options for replacing |today|: either, you set today to some
# non-false value, then it is used:
#today = ''
# today = ''
# Else, today_fmt is used as the format for a strftime call.
#today_fmt = '%B %d, %Y'
# today_fmt = '%B %d, %Y'
# List of documents that shouldn't be included in the build.
#unused_docs = []
# unused_docs = []
exclude_patterns = ['build']
exclude_patterns = ["build"]
# List of directories, relative to source directory, that shouldn't be searched
# for source files.
exclude_trees = ['.build']
exclude_trees = [".build"]
# The reST default role (used for this markup: `text`) to use for all documents.
#default_role = None
# default_role = None
# If true, '()' will be appended to :func: etc. cross-reference text.
#add_function_parentheses = True
# add_function_parentheses = True
# If true, the current module name will be prepended to all description
# unit titles (such as .. function::).
#add_module_names = True
# add_module_names = True
# If true, sectionauthor and moduleauthor directives will be shown in the
# output. They are ignored by default.
#show_authors = False
# show_authors = False
# The name of the Pygments (syntax highlighting) style to use.
pygments_style = 'sphinx'
pygments_style = "sphinx"
# List of Sphinx warnings that will not be raised
suppress_warnings = ['epub.unknown_project_files']
suppress_warnings = ["epub.unknown_project_files"]
# Options for HTML output
@ -108,17 +109,18 @@ suppress_warnings = ['epub.unknown_project_files']
# The theme to use for HTML and HTML Help pages. See the documentation for
# a list of builtin themes.
html_theme = 'sphinx_rtd_theme'
html_theme = "sphinx_rtd_theme"
# Theme options are theme-specific and customize the look and feel of a theme
# further. For a list of options available for each theme, see the
# documentation.
#html_theme_options = {}
# html_theme_options = {}
# Add any paths that contain custom themes here, relative to this directory.
# Add path to the RTD explicitly to robustify builds (otherwise might
# fail in a clean Debian build env)
import sphinx_rtd_theme
html_theme_path = [sphinx_rtd_theme.get_html_theme_path()]
# The style sheet to use for HTML and HTML Help pages. A file of that name
@ -128,44 +130,44 @@ html_theme_path = [sphinx_rtd_theme.get_html_theme_path()]
# The name for this set of Sphinx documents. If None, it defaults to
# "<project> v<release> documentation".
#html_title = None
# html_title = None
# A shorter title for the navigation bar. Default is the same as html_title.
#html_short_title = None
# html_short_title = None
# The name of an image file (relative to this directory) to place at the top
# of the sidebar.
#html_logo = None
# html_logo = None
# The name of an image file (within the static path) to use as favicon of the
# docs. This file should be a Windows icon file (.ico) being 16x16 or 32x32
# pixels large.
#html_favicon = None
# html_favicon = None
# Add any paths that contain custom static files (such as style sheets) here,
# relative to this directory. They are copied after the builtin static files,
# so a file named "default.css" will overwrite the builtin "default.css".
html_static_path = ['_static']
html_static_path = ["_static"]
# If not '', a 'Last updated on:' timestamp is inserted at every page bottom,
# using the given strftime format.
html_last_updated_fmt = '%b %d, %Y'
html_last_updated_fmt = "%b %d, %Y"
# Custom sidebar templates, maps document names to template names.
#html_sidebars = {}
# html_sidebars = {}
# Additional templates that should be rendered to pages, maps page names to
# template names.
#html_additional_pages = {}
# html_additional_pages = {}
# If false, no module index is generated.
#html_use_modindex = True
# html_use_modindex = True
# If false, no index is generated.
#html_use_index = True
# html_use_index = True
# If true, the index is split into individual pages for each letter.
#html_split_index = False
# html_split_index = False
# If true, the reST sources are included in the HTML build as _sources/<name>.
html_copy_source = True
@ -173,16 +175,16 @@ html_copy_source = True
# If true, an OpenSearch description file will be output, and all pages will
# contain a <link> tag referring to it. The value of this option must be the
# base URL from which the finished HTML is served.
#html_use_opensearch = ''
# html_use_opensearch = ''
# If nonempty, this is the file name suffix for HTML files (e.g. ".xhtml").
#html_file_suffix = ''
# html_file_suffix = ''
# Output file base name for HTML help builder.
htmlhelp_basename = 'Scrapydoc'
htmlhelp_basename = "Scrapydoc"
html_css_files = [
'custom.css',
"custom.css",
]
@ -190,34 +192,33 @@ html_css_files = [
# ------------------------
# The paper size ('letter' or 'a4').
#latex_paper_size = 'letter'
# latex_paper_size = 'letter'
# The font size ('10pt', '11pt' or '12pt').
#latex_font_size = '10pt'
# latex_font_size = '10pt'
# Grouping the document tree into LaTeX files. List of tuples
# (source start file, target name, title, author, document class [howto/manual]).
latex_documents = [
('index', 'Scrapy.tex', 'Scrapy Documentation',
'Scrapy developers', 'manual'),
("index", "Scrapy.tex", "Scrapy Documentation", "Scrapy developers", "manual"),
]
# The name of an image file (relative to this directory) to place at the top of
# the title page.
#latex_logo = None
# latex_logo = None
# For "manual" documents, if this is true, then toplevel headings are parts,
# not chapters.
#latex_use_parts = False
# latex_use_parts = False
# Additional stuff for the LaTeX preamble.
#latex_preamble = ''
# latex_preamble = ''
# Documents to append as an appendix to all manuals.
#latex_appendices = []
# latex_appendices = []
# If false, no module index is generated.
#latex_use_modindex = True
# latex_use_modindex = True
# Options for the linkcheck builder
@ -226,8 +227,9 @@ latex_documents = [
# A list of regular expressions that match URIs that should not be checked when
# doing a linkcheck build.
linkcheck_ignore = [
'http://localhost:\d+', 'http://hg.scrapy.org',
'http://directory.google.com/'
"http://localhost:\d+",
"http://hg.scrapy.org",
"http://directory.google.com/",
]
@ -237,44 +239,35 @@ coverage_ignore_pyobjects = [
# Contracts add_pre_hook and add_post_hook are not documented because
# they should be transparent to contract developers, for whom pre_hook and
# post_hook should be the actual concern.
r'\bContract\.add_(pre|post)_hook$',
r"\bContract\.add_(pre|post)_hook$",
# ContractsManager is an internal class, developers are not expected to
# interact with it directly in any way.
r'\bContractsManager\b$',
r"\bContractsManager\b$",
# For default contracts we only want to document their general purpose in
# their __init__ method, the methods they reimplement to achieve that purpose
# should be irrelevant to developers using those contracts.
r'\w+Contract\.(adjust_request_args|(pre|post)_process)$',
r"\w+Contract\.(adjust_request_args|(pre|post)_process)$",
# Methods of downloader middlewares are not documented, only the classes
# themselves, since downloader middlewares are controlled through Scrapy
# settings.
r'^scrapy\.downloadermiddlewares\.\w*?\.(\w*?Middleware|DownloaderStats)\.',
r"^scrapy\.downloadermiddlewares\.\w*?\.(\w*?Middleware|DownloaderStats)\.",
# Base classes of downloader middlewares are implementation details that
# are not meant for users.
r'^scrapy\.downloadermiddlewares\.\w*?\.Base\w*?Middleware',
r"^scrapy\.downloadermiddlewares\.\w*?\.Base\w*?Middleware",
# Private exception used by the command-line interface implementation.
r'^scrapy\.exceptions\.UsageError',
r"^scrapy\.exceptions\.UsageError",
# Methods of BaseItemExporter subclasses are only documented in
# BaseItemExporter.
r'^scrapy\.exporters\.(?!BaseItemExporter\b)\w*?\.',
r"^scrapy\.exporters\.(?!BaseItemExporter\b)\w*?\.",
# Extension behavior is only modified through settings. Methods of
# extension classes, as well as helper functions, are implementation
# details that are not documented.
r'^scrapy\.extensions\.[a-z]\w*?\.[A-Z]\w*?\.', # methods
r'^scrapy\.extensions\.[a-z]\w*?\.[a-z]', # helper functions
r"^scrapy\.extensions\.[a-z]\w*?\.[A-Z]\w*?\.", # methods
r"^scrapy\.extensions\.[a-z]\w*?\.[a-z]", # helper functions
# Never documented before, and deprecated now.
r'^scrapy\.linkextractors\.FilteringLinkExtractor$',
r"^scrapy\.linkextractors\.FilteringLinkExtractor$",
# Implementation detail of LxmlLinkExtractor
r'^scrapy\.linkextractors\.lxmlhtml\.LxmlParserLinkExtractor',
r"^scrapy\.linkextractors\.lxmlhtml\.LxmlParserLinkExtractor",
]
@ -282,18 +275,18 @@ coverage_ignore_pyobjects = [
# -------------------------------------
intersphinx_mapping = {
'attrs': ('https://www.attrs.org/en/stable/', None),
'coverage': ('https://coverage.readthedocs.io/en/stable', None),
'cryptography' : ('https://cryptography.io/en/latest/', None),
'cssselect': ('https://cssselect.readthedocs.io/en/latest', None),
'itemloaders': ('https://itemloaders.readthedocs.io/en/latest/', None),
'pytest': ('https://docs.pytest.org/en/latest', None),
'python': ('https://docs.python.org/3', None),
'sphinx': ('https://www.sphinx-doc.org/en/master', None),
'tox': ('https://tox.wiki/en/latest/', None),
'twisted': ('https://docs.twisted.org/en/stable/', None),
'twistedapi': ('https://docs.twisted.org/en/stable/api/', None),
'w3lib': ('https://w3lib.readthedocs.io/en/latest', None),
"attrs": ("https://www.attrs.org/en/stable/", None),
"coverage": ("https://coverage.readthedocs.io/en/stable", None),
"cryptography": ("https://cryptography.io/en/latest/", None),
"cssselect": ("https://cssselect.readthedocs.io/en/latest", None),
"itemloaders": ("https://itemloaders.readthedocs.io/en/latest/", None),
"pytest": ("https://docs.pytest.org/en/latest", None),
"python": ("https://docs.python.org/3", None),
"sphinx": ("https://www.sphinx-doc.org/en/master", None),
"tox": ("https://tox.wiki/en/latest/", None),
"twisted": ("https://docs.twisted.org/en/stable/", None),
"twistedapi": ("https://docs.twisted.org/en/stable/api/", None),
"w3lib": ("https://w3lib.readthedocs.io/en/latest", None),
}
intersphinx_disabled_reftypes = []
@ -313,16 +306,16 @@ hoverxref_role_types = {
"setting": "tooltip",
"signal": "tooltip",
}
hoverxref_roles = ['command', 'reqmeta', 'setting', 'signal']
hoverxref_roles = ["command", "reqmeta", "setting", "signal"]
def setup(app):
app.connect('autodoc-skip-member', maybe_skip_member)
app.connect("autodoc-skip-member", maybe_skip_member)
def maybe_skip_member(app, what, name, obj, skip, options):
if not skip:
# autodocs was generating a text "alias of" for the following members
# https://github.com/sphinx-doc/sphinx/issues/4422
return name in {'default_item_class', 'default_selector_class'}
return name in {"default_item_class", "default_selector_class"}
return skip


@ -15,20 +15,20 @@ from scrapy.http.response.html import HtmlResponse
def load_response(url: str, filename: str) -> HtmlResponse:
input_path = Path(__file__).parent / '_tests' / filename
input_path = Path(__file__).parent / "_tests" / filename
return HtmlResponse(url, body=input_path.read_bytes())
def setup(namespace):
namespace['load_response'] = load_response
namespace["load_response"] = load_response
pytest_collect_file = Sybil(
parsers=[
DocTestParser(optionflags=ELLIPSIS | NORMALIZE_WHITESPACE),
PythonCodeBlockParser(future_imports=['print_function']),
PythonCodeBlockParser(future_imports=["print_function"]),
skip,
],
pattern='*.rst',
pattern="*.rst",
setup=setup,
).pytest()


@ -25,7 +25,7 @@ def main():
_contents = None
# A regex that matches standard linkcheck output lines
line_re = re.compile(r'(.*)\:\d+\:\s\[(.*)\]\s(?:(.*)\sto\s(.*)|(.*))')
line_re = re.compile(r"(.*)\:\d+\:\s\[(.*)\]\s(?:(.*)\sto\s(.*)|(.*))")
# Read lines from the linkcheck output file
try:
@ -66,5 +66,5 @@ def main():
print("Not Understood: " + line)
if __name__ == '__main__':
if __name__ == "__main__":
main()


@ -7,7 +7,6 @@ from twisted.internet import reactor
class Root(Resource):
def __init__(self):
Resource.__init__(self)
self.concurrent = 0
@ -26,9 +25,9 @@ class Root(Resource):
delta = now - self.lasttime
# reset stats on high iter-request times caused by client restarts
if delta > 3: # seconds
if delta > 3: # seconds
self._reset_stats()
return ''
return ""
self.tail.appendleft(delta)
self.lasttime = now
@ -37,15 +36,17 @@ class Root(Resource):
if now - self.lastmark >= 3:
self.lastmark = now
qps = len(self.tail) / sum(self.tail)
print(f'samplesize={len(self.tail)} concurrent={self.concurrent} qps={qps:0.2f}')
print(
f"samplesize={len(self.tail)} concurrent={self.concurrent} qps={qps:0.2f}"
)
if 'latency' in request.args:
latency = float(request.args['latency'][0])
if "latency" in request.args:
latency = float(request.args["latency"][0])
reactor.callLater(latency, self._finish, request)
return NOT_DONE_YET
self.concurrent -= 1
return ''
return ""
def _finish(self, request):
self.concurrent -= 1


@ -13,13 +13,13 @@ from scrapy.http import Request
class QPSSpider(Spider):
name = 'qps'
benchurl = 'http://localhost:8880/'
name = "qps"
benchurl = "http://localhost:8880/"
# Max concurrency is limited by global CONCURRENT_REQUESTS setting
max_concurrent_requests = 8
# Requests per second goal
qps = None # same as: 1 / download_delay
qps = None # same as: 1 / download_delay
download_delay = None
# time in seconds to delay server responses
latency = None
@ -37,11 +37,11 @@ class QPSSpider(Spider):
def start_requests(self):
url = self.benchurl
if self.latency is not None:
url += f'?latency={self.latency}'
url += f"?latency={self.latency}"
slots = int(self.slots)
if slots > 1:
urls = [url.replace('localhost', f'127.0.0.{x + 1}') for x in range(slots)]
urls = [url.replace("localhost", f"127.0.0.{x + 1}") for x in range(slots)]
else:
urls = [url]


@ -16,14 +16,21 @@ from scrapy.item import Item, Field
__all__ = [
'__version__', 'version_info', 'twisted_version', 'Spider',
'Request', 'FormRequest', 'Selector', 'Item', 'Field',
"__version__",
"version_info",
"twisted_version",
"Spider",
"Request",
"FormRequest",
"Selector",
"Item",
"Field",
]
# Scrapy and Twisted versions
__version__ = (pkgutil.get_data(__package__, "VERSION") or b"").decode("ascii").strip()
version_info = tuple(int(v) if v.isdigit() else v for v in __version__.split('.'))
version_info = tuple(int(v) if v.isdigit() else v for v in __version__.split("."))
twisted_version = (_txv.major, _txv.minor, _txv.micro)
@ -34,7 +41,7 @@ if sys.version_info < (3, 7):
# Ignore noisy twisted deprecation warnings
warnings.filterwarnings('ignore', category=DeprecationWarning, module='twisted')
warnings.filterwarnings("ignore", category=DeprecationWarning, module="twisted")
del pkgutil


@ -1,4 +1,4 @@
from scrapy.cmdline import execute
if __name__ == '__main__':
if __name__ == "__main__":
execute()


@ -17,7 +17,7 @@ from scrapy.utils.python import garbage_collect
class ScrapyArgumentParser(argparse.ArgumentParser):
def _parse_optional(self, arg_string):
# if starts with -: it means that is a parameter not a argument
if arg_string[:2] == '-:':
if arg_string[:2] == "-:":
return None
return super()._parse_optional(arg_string)
@ -41,12 +41,12 @@ def _get_commands_from_module(module, inproject):
d = {}
for cmd in _iter_command_classes(module):
if inproject or not cmd.requires_project:
cmdname = cmd.__module__.split('.')[-1]
cmdname = cmd.__module__.split(".")[-1]
d[cmdname] = cmd()
return d
def _get_commands_from_entry_points(inproject, group='scrapy.commands'):
def _get_commands_from_entry_points(inproject, group="scrapy.commands"):
cmds = {}
for entry_point in pkg_resources.iter_entry_points(group):
obj = entry_point.load()
@ -58,9 +58,9 @@ def _get_commands_from_entry_points(inproject, group='scrapy.commands'):
def _get_commands_dict(settings, inproject):
cmds = _get_commands_from_module('scrapy.commands', inproject)
cmds = _get_commands_from_module("scrapy.commands", inproject)
cmds.update(_get_commands_from_entry_points(inproject))
cmds_module = settings['COMMANDS_MODULE']
cmds_module = settings["COMMANDS_MODULE"]
if cmds_module:
cmds.update(_get_commands_from_module(cmds_module, inproject))
return cmds
@ -69,7 +69,7 @@ def _get_commands_dict(settings, inproject):
def _pop_command_name(argv):
i = 0
for arg in argv[1:]:
if not arg.startswith('-'):
if not arg.startswith("-"):
del argv[i]
return arg
i += 1
@ -124,11 +124,11 @@ def execute(argv=None, settings=None):
settings = get_project_settings()
# set EDITOR from environment if available
try:
editor = os.environ['EDITOR']
editor = os.environ["EDITOR"]
except KeyError:
pass
else:
settings['EDITOR'] = editor
settings["EDITOR"] = editor
inproject = inside_project()
cmds = _get_commands_dict(settings, inproject)
@ -141,11 +141,13 @@ def execute(argv=None, settings=None):
sys.exit(2)
cmd = cmds[cmdname]
parser = ScrapyArgumentParser(formatter_class=ScrapyHelpFormatter,
usage=f"scrapy {cmdname} {cmd.syntax()}",
conflict_handler='resolve',
description=cmd.long_desc())
settings.setdict(cmd.default_settings, priority='command')
parser = ScrapyArgumentParser(
formatter_class=ScrapyHelpFormatter,
usage=f"scrapy {cmdname} {cmd.syntax()}",
conflict_handler="resolve",
description=cmd.long_desc(),
)
settings.setdict(cmd.default_settings, priority="command")
cmd.settings = settings
cmd.add_options(parser)
opts, args = parser.parse_known_args(args=argv[1:])
@ -168,12 +170,12 @@ def _run_command_profiled(cmd, args, opts):
sys.stderr.write(f"scrapy: writing cProfile stats to {opts.profile!r}\n")
loc = locals()
p = cProfile.Profile()
p.runctx('cmd.run(args, opts)', globals(), loc)
p.runctx("cmd.run(args, opts)", globals(), loc)
if opts.profile:
p.dump_stats(opts.profile)
if __name__ == '__main__':
if __name__ == "__main__":
try:
execute()
finally:


@ -27,7 +27,7 @@ class ScrapyCommand:
self.settings: Any = None # set in scrapy.cmdline
def set_crawler(self, crawler):
if hasattr(self, '_crawler'):
if hasattr(self, "_crawler"):
raise RuntimeError("crawler already set")
self._crawler = crawler
@ -61,41 +61,58 @@ class ScrapyCommand:
"""
Populate option parse with options available for this command
"""
group = parser.add_argument_group(title='Global Options')
group.add_argument("--logfile", metavar="FILE",
help="log file. if omitted stderr will be used")
group.add_argument("-L", "--loglevel", metavar="LEVEL", default=None,
help=f"log level (default: {self.settings['LOG_LEVEL']})")
group.add_argument("--nolog", action="store_true",
help="disable logging completely")
group.add_argument("--profile", metavar="FILE", default=None,
help="write python cProfile stats to FILE")
group.add_argument("--pidfile", metavar="FILE",
help="write process ID to FILE")
group.add_argument("-s", "--set", action="append", default=[], metavar="NAME=VALUE",
help="set/override setting (may be repeated)")
group = parser.add_argument_group(title="Global Options")
group.add_argument(
"--logfile", metavar="FILE", help="log file. if omitted stderr will be used"
)
group.add_argument(
"-L",
"--loglevel",
metavar="LEVEL",
default=None,
help=f"log level (default: {self.settings['LOG_LEVEL']})",
)
group.add_argument(
"--nolog", action="store_true", help="disable logging completely"
)
group.add_argument(
"--profile",
metavar="FILE",
default=None,
help="write python cProfile stats to FILE",
)
group.add_argument("--pidfile", metavar="FILE", help="write process ID to FILE")
group.add_argument(
"-s",
"--set",
action="append",
default=[],
metavar="NAME=VALUE",
help="set/override setting (may be repeated)",
)
group.add_argument("--pdb", action="store_true", help="enable pdb on failure")
def process_options(self, args, opts):
try:
self.settings.setdict(arglist_to_dict(opts.set),
priority='cmdline')
self.settings.setdict(arglist_to_dict(opts.set), priority="cmdline")
except ValueError:
raise UsageError("Invalid -s value, use -s NAME=VALUE", print_help=False)
if opts.logfile:
self.settings.set('LOG_ENABLED', True, priority='cmdline')
self.settings.set('LOG_FILE', opts.logfile, priority='cmdline')
self.settings.set("LOG_ENABLED", True, priority="cmdline")
self.settings.set("LOG_FILE", opts.logfile, priority="cmdline")
if opts.loglevel:
self.settings.set('LOG_ENABLED', True, priority='cmdline')
self.settings.set('LOG_LEVEL', opts.loglevel, priority='cmdline')
self.settings.set("LOG_ENABLED", True, priority="cmdline")
self.settings.set("LOG_LEVEL", opts.loglevel, priority="cmdline")
if opts.nolog:
self.settings.set('LOG_ENABLED', False, priority='cmdline')
self.settings.set("LOG_ENABLED", False, priority="cmdline")
if opts.pidfile:
Path(opts.pidfile).write_text(str(os.getpid()) + os.linesep, encoding="utf-8")
Path(opts.pidfile).write_text(
str(os.getpid()) + os.linesep, encoding="utf-8"
)
if opts.pdb:
failure.startDebugMode()
@ -111,18 +128,39 @@ class BaseRunSpiderCommand(ScrapyCommand):
"""
Common class used to share functionality between the crawl, parse and runspider commands
"""
def add_options(self, parser):
ScrapyCommand.add_options(self, parser)
parser.add_argument("-a", dest="spargs", action="append", default=[], metavar="NAME=VALUE",
help="set spider argument (may be repeated)")
parser.add_argument("-o", "--output", metavar="FILE", action="append",
help="append scraped items to the end of FILE (use - for stdout),"
" to define format set a colon at the end of the output URI (i.e. -o FILE:FORMAT)")
parser.add_argument("-O", "--overwrite-output", metavar="FILE", action="append",
help="dump scraped items into FILE, overwriting any existing file,"
" to define format set a colon at the end of the output URI (i.e. -O FILE:FORMAT)")
parser.add_argument("-t", "--output-format", metavar="FORMAT",
help="format to use for dumping items")
parser.add_argument(
"-a",
dest="spargs",
action="append",
default=[],
metavar="NAME=VALUE",
help="set spider argument (may be repeated)",
)
parser.add_argument(
"-o",
"--output",
metavar="FILE",
action="append",
help="append scraped items to the end of FILE (use - for stdout),"
" to define format set a colon at the end of the output URI (i.e. -o FILE:FORMAT)",
)
parser.add_argument(
"-O",
"--overwrite-output",
metavar="FILE",
action="append",
help="dump scraped items into FILE, overwriting any existing file,"
" to define format set a colon at the end of the output URI (i.e. -O FILE:FORMAT)",
)
parser.add_argument(
"-t",
"--output-format",
metavar="FORMAT",
help="format to use for dumping items",
)
def process_options(self, args, opts):
ScrapyCommand.process_options(self, args, opts)
@ -137,16 +175,21 @@ class BaseRunSpiderCommand(ScrapyCommand):
opts.output_format,
opts.overwrite_output,
)
self.settings.set('FEEDS', feeds, priority='cmdline')
self.settings.set("FEEDS", feeds, priority="cmdline")
class ScrapyHelpFormatter(argparse.HelpFormatter):
"""
Help Formatter for scrapy command line help messages.
"""
def __init__(self, prog, indent_increment=2, max_help_position=24, width=None):
super().__init__(prog, indent_increment=indent_increment,
max_help_position=max_help_position, width=width)
super().__init__(
prog,
indent_increment=indent_increment,
max_help_position=max_help_position,
width=width,
)
def _join_parts(self, part_strings):
parts = self.format_part_strings(part_strings)
@ -157,11 +200,13 @@ class ScrapyHelpFormatter(argparse.HelpFormatter):
Underline and title case command line help message headers.
"""
if part_strings and part_strings[0].startswith("usage: "):
part_strings[0] = "Usage\n=====\n " + part_strings[0][len('usage: '):]
headings = [i for i in range(len(part_strings)) if part_strings[i].endswith(':\n')]
part_strings[0] = "Usage\n=====\n " + part_strings[0][len("usage: ") :]
headings = [
i for i in range(len(part_strings)) if part_strings[i].endswith(":\n")
]
for index in headings[::-1]:
char = '-' if "Global Options" in part_strings[index] else '='
char = "-" if "Global Options" in part_strings[index] else "="
part_strings[index] = part_strings[index][:-2].title()
underline = ''.join(["\n", (char * len(part_strings[index])), "\n"])
underline = "".join(["\n", (char * len(part_strings[index])), "\n"])
part_strings.insert(index + 1, underline)
return part_strings


@ -11,9 +11,9 @@ from scrapy.linkextractors import LinkExtractor
class Command(ScrapyCommand):
default_settings = {
'LOG_LEVEL': 'INFO',
'LOGSTATS_INTERVAL': 1,
'CLOSESPIDER_TIMEOUT': 10,
"LOG_LEVEL": "INFO",
"LOGSTATS_INTERVAL": 1,
"CLOSESPIDER_TIMEOUT": 10,
}
def short_desc(self):
@ -26,12 +26,11 @@ class Command(ScrapyCommand):
class _BenchServer:
def __enter__(self):
from scrapy.utils.test import get_testenv
pargs = [sys.executable, '-u', '-m', 'scrapy.utils.benchserver']
self.proc = subprocess.Popen(pargs, stdout=subprocess.PIPE,
env=get_testenv())
pargs = [sys.executable, "-u", "-m", "scrapy.utils.benchserver"]
self.proc = subprocess.Popen(pargs, stdout=subprocess.PIPE, env=get_testenv())
self.proc.stdout.readline()
def __exit__(self, exc_type, exc_value, traceback):
@ -42,15 +41,16 @@ class _BenchServer:
class _BenchSpider(scrapy.Spider):
"""A spider that follows all links"""
name = 'follow'
name = "follow"
total = 10000
show = 20
baseurl = 'http://localhost:8998'
baseurl = "http://localhost:8998"
link_extractor = LinkExtractor()
def start_requests(self):
qargs = {'total': self.total, 'show': self.show}
url = f'{self.baseurl}?{urlencode(qargs, doseq=True)}'
qargs = {"total": self.total, "show": self.show}
url = f"{self.baseurl}?{urlencode(qargs, doseq=True)}"
return [scrapy.Request(url, dont_filter=True)]
def parse(self, response):


@ -39,7 +39,7 @@ class TextTestResult(_TextTestResult):
class Command(ScrapyCommand):
requires_project = True
default_settings = {'LOG_ENABLED': False}
default_settings = {"LOG_ENABLED": False}
def syntax(self):
return "[options] <spider>"
@ -49,14 +49,25 @@ class Command(ScrapyCommand):
def add_options(self, parser):
ScrapyCommand.add_options(self, parser)
parser.add_argument("-l", "--list", dest="list", action="store_true",
help="only list contracts, without checking them")
parser.add_argument("-v", "--verbose", dest="verbose", default=False, action='store_true',
help="print contract tests for all spiders")
parser.add_argument(
"-l",
"--list",
dest="list",
action="store_true",
help="only list contracts, without checking them",
)
parser.add_argument(
"-v",
"--verbose",
dest="verbose",
default=False,
action="store_true",
help="print contract tests for all spiders",
)
def run(self, args, opts):
# load contracts
contracts = build_component_list(self.settings.getwithbase('SPIDER_CONTRACTS'))
contracts = build_component_list(self.settings.getwithbase("SPIDER_CONTRACTS"))
conman = ContractsManager(load_object(c) for c in contracts)
runner = TextTestRunner(verbosity=2 if opts.verbose else 1)
result = TextTestResult(runner.stream, runner.descriptions, runner.verbosity)
@ -66,7 +77,7 @@ class Command(ScrapyCommand):
spider_loader = self.crawler_process.spider_loader
with set_environ(SCRAPY_CHECK='true'):
with set_environ(SCRAPY_CHECK="true"):
for spidername in args or spider_loader.list():
spidercls = spider_loader.load(spidername)
spidercls.start_requests = lambda s: conman.from_spider(s, result)
@ -85,7 +96,7 @@ class Command(ScrapyCommand):
continue
print(spider)
for method in sorted(methods):
print(f' * {method}')
print(f" * {method}")
else:
start = time.time()
self.crawler_process.start()


@ -16,18 +16,23 @@ class Command(BaseRunSpiderCommand):
if len(args) < 1:
raise UsageError()
elif len(args) > 1:
raise UsageError("running 'scrapy crawl' with more than one spider is not supported")
raise UsageError(
"running 'scrapy crawl' with more than one spider is not supported"
)
spname = args[0]
crawl_defer = self.crawler_process.crawl(spname, **opts.spargs)
if getattr(crawl_defer, 'result', None) is not None and issubclass(crawl_defer.result.type, Exception):
if getattr(crawl_defer, "result", None) is not None and issubclass(
crawl_defer.result.type, Exception
):
self.exitcode = 1
else:
self.crawler_process.start()
if (
self.crawler_process.bootstrap_failed
or hasattr(self.crawler_process, 'has_exception') and self.crawler_process.has_exception
or hasattr(self.crawler_process, "has_exception")
and self.crawler_process.has_exception
):
self.exitcode = 1


@ -8,7 +8,7 @@ from scrapy.exceptions import UsageError
class Command(ScrapyCommand):
requires_project = True
default_settings = {'LOG_ENABLED': False}
default_settings = {"LOG_ENABLED": False}
def syntax(self):
return "<spider>"
@ -17,8 +17,10 @@ class Command(ScrapyCommand):
return "Edit spider"
def long_desc(self):
return ("Edit a spider using the editor defined in the EDITOR environment"
" variable or else the EDITOR setting")
return (
"Edit a spider using the editor defined in the EDITOR environment"
" variable or else the EDITOR setting"
)
def _err(self, msg):
sys.stderr.write(msg + os.linesep)
@ -28,12 +30,12 @@ class Command(ScrapyCommand):
if len(args) != 1:
raise UsageError()
editor = self.settings['EDITOR']
editor = self.settings["EDITOR"]
try:
spidercls = self.crawler_process.spider_loader.load(args[0])
except KeyError:
return self._err(f"Spider not found: {args[0]}")
sfile = sys.modules[spidercls.__module__].__file__
sfile = sfile.replace('.pyc', '.py')
sfile = sfile.replace(".pyc", ".py")
self.exitcode = os.system(f'{editor} "{sfile}"')


@ -27,38 +27,51 @@ class Command(ScrapyCommand):
def add_options(self, parser):
ScrapyCommand.add_options(self, parser)
parser.add_argument("--spider", dest="spider", help="use this spider")
parser.add_argument("--headers", dest="headers", action="store_true",
help="print response HTTP headers instead of body")
parser.add_argument("--no-redirect", dest="no_redirect", action="store_true", default=False,
help="do not handle HTTP 3xx status codes and print response as-is")
parser.add_argument(
"--headers",
dest="headers",
action="store_true",
help="print response HTTP headers instead of body",
)
parser.add_argument(
"--no-redirect",
dest="no_redirect",
action="store_true",
default=False,
help="do not handle HTTP 3xx status codes and print response as-is",
)
def _print_headers(self, headers, prefix):
for key, values in headers.items():
for value in values:
self._print_bytes(prefix + b' ' + key + b': ' + value)
self._print_bytes(prefix + b" " + key + b": " + value)
def _print_response(self, response, opts):
if opts.headers:
self._print_headers(response.request.headers, b'>')
print('>')
self._print_headers(response.headers, b'<')
self._print_headers(response.request.headers, b">")
print(">")
self._print_headers(response.headers, b"<")
else:
self._print_bytes(response.body)
def _print_bytes(self, bytes_):
sys.stdout.buffer.write(bytes_ + b'\n')
sys.stdout.buffer.write(bytes_ + b"\n")
def run(self, args, opts):
if len(args) != 1 or not is_url(args[0]):
raise UsageError()
request = Request(args[0], callback=self._print_response,
cb_kwargs={"opts": opts}, dont_filter=True)
request = Request(
args[0],
callback=self._print_response,
cb_kwargs={"opts": opts},
dont_filter=True,
)
# by default, let the framework handle redirects,
# i.e. command handles all codes expect 3xx
if not opts.no_redirect:
request.meta['handle_httpstatus_list'] = SequenceExclude(range(300, 400))
request.meta["handle_httpstatus_list"] = SequenceExclude(range(300, 400))
else:
request.meta['handle_httpstatus_all'] = True
request.meta["handle_httpstatus_all"] = True
spidercls = DefaultSpider
spider_loader = self.crawler_process.spider_loader


@ -18,7 +18,7 @@ def sanitize_module_name(module_name):
with underscores and prefixing it with a letter if it doesn't start
with one
"""
module_name = module_name.replace('-', '_').replace('.', '_')
module_name = module_name.replace("-", "_").replace(".", "_")
if module_name[0] not in string.ascii_letters:
module_name = "a" + module_name
return module_name
@ -27,7 +27,7 @@ def sanitize_module_name(module_name):
def extract_domain(url):
"""Extract domain name from URL string"""
o = urlparse(url)
if o.scheme == '' and o.netloc == '':
if o.scheme == "" and o.netloc == "":
o = urlparse("//" + url.lstrip("/"))
return o.netloc
@ -35,7 +35,7 @@ def extract_domain(url):
class Command(ScrapyCommand):
requires_project = False
default_settings = {'LOG_ENABLED': False}
default_settings = {"LOG_ENABLED": False}
def syntax(self):
return "[options] <name> <domain>"
@ -45,16 +45,40 @@ class Command(ScrapyCommand):
def add_options(self, parser):
ScrapyCommand.add_options(self, parser)
parser.add_argument("-l", "--list", dest="list", action="store_true",
help="List available templates")
parser.add_argument("-e", "--edit", dest="edit", action="store_true",
help="Edit spider after creating it")
parser.add_argument("-d", "--dump", dest="dump", metavar="TEMPLATE",
help="Dump template to standard output")
parser.add_argument("-t", "--template", dest="template", default="basic",
help="Uses a custom template.")
parser.add_argument("--force", dest="force", action="store_true",
help="If the spider already exists, overwrite it with the template")
parser.add_argument(
"-l",
"--list",
dest="list",
action="store_true",
help="List available templates",
)
parser.add_argument(
"-e",
"--edit",
dest="edit",
action="store_true",
help="Edit spider after creating it",
)
parser.add_argument(
"-d",
"--dump",
dest="dump",
metavar="TEMPLATE",
help="Dump template to standard output",
)
parser.add_argument(
"-t",
"--template",
dest="template",
default="basic",
help="Uses a custom template.",
)
parser.add_argument(
"--force",
dest="force",
action="store_true",
help="If the spider already exists, overwrite it with the template",
)
def run(self, args, opts):
if opts.list:
@ -72,7 +96,7 @@ class Command(ScrapyCommand):
domain = extract_domain(url)
module = sanitize_module_name(name)
if self.settings.get('BOT_NAME') == module:
if self.settings.get("BOT_NAME") == module:
print("Cannot create a spider with the same name as your project")
return
@ -87,17 +111,17 @@ class Command(ScrapyCommand):
def _genspider(self, module, name, domain, template_name, template_file):
"""Generate the spider module, based on the given template"""
capitalized_module = ''.join(s.capitalize() for s in module.split('_'))
capitalized_module = "".join(s.capitalize() for s in module.split("_"))
tvars = {
'project_name': self.settings.get('BOT_NAME'),
'ProjectName': string_camelcase(self.settings.get('BOT_NAME')),
'module': module,
'name': name,
'domain': domain,
'classname': f'{capitalized_module}Spider'
"project_name": self.settings.get("BOT_NAME"),
"ProjectName": string_camelcase(self.settings.get("BOT_NAME")),
"module": module,
"name": name,
"domain": domain,
"classname": f"{capitalized_module}Spider",
}
if self.settings.get('NEWSPIDER_MODULE'):
spiders_module = import_module(self.settings['NEWSPIDER_MODULE'])
if self.settings.get("NEWSPIDER_MODULE"):
spiders_module = import_module(self.settings["NEWSPIDER_MODULE"])
spiders_dir = Path(spiders_module.__file__).parent.resolve()
else:
spiders_module = None
@ -105,13 +129,15 @@ class Command(ScrapyCommand):
spider_file = f"{spiders_dir / module}.py"
shutil.copyfile(template_file, spider_file)
render_templatefile(spider_file, **tvars)
print(f"Created spider {name!r} using template {template_name!r} ",
end=('' if spiders_module else '\n'))
print(
f"Created spider {name!r} using template {template_name!r} ",
end=("" if spiders_module else "\n"),
)
if spiders_module:
print(f"in module:\n {spiders_module.__name__}.{module}")
def _find_template(self, template: str) -> Optional[Path]:
template_file = Path(self.templates_dir, f'{template}.tmpl')
template_file = Path(self.templates_dir, f"{template}.tmpl")
if template_file.exists():
return template_file
print(f"Unable to find template: {template}\n")
@ -121,11 +147,11 @@ class Command(ScrapyCommand):
def _list_templates(self):
print("Available templates:")
for file in sorted(Path(self.templates_dir).iterdir()):
if file.suffix == '.tmpl':
if file.suffix == ".tmpl":
print(f" {file.stem}")
def _spider_exists(self, name: str) -> bool:
if not self.settings.get('NEWSPIDER_MODULE'):
if not self.settings.get("NEWSPIDER_MODULE"):
# if run as a standalone command and file with same filename already exists
path = Path(name + ".py")
if path.exists():
@ -148,7 +174,7 @@ class Command(ScrapyCommand):
return True
# a file with the same name exists in the target directory
spiders_module = import_module(self.settings['NEWSPIDER_MODULE'])
spiders_module = import_module(self.settings["NEWSPIDER_MODULE"])
spiders_dir = Path(cast(str, spiders_module.__file__)).parent
spiders_dir_abs = spiders_dir.resolve()
path = spiders_dir_abs / (name + ".py")
@ -160,7 +186,9 @@ class Command(ScrapyCommand):
@property
def templates_dir(self) -> str:
return str(Path(
self.settings['TEMPLATES_DIR'] or Path(scrapy.__path__[0], 'templates'),
'spiders'
))
return str(
Path(
self.settings["TEMPLATES_DIR"] or Path(scrapy.__path__[0], "templates"),
"spiders",
)
)


@ -4,7 +4,7 @@ from scrapy.commands import ScrapyCommand
class Command(ScrapyCommand):
requires_project = True
default_settings = {'LOG_ENABLED': False}
default_settings = {"LOG_ENABLED": False}
def short_desc(self):
return "List available spiders"


@ -32,28 +32,72 @@ class Command(BaseRunSpiderCommand):
def add_options(self, parser):
BaseRunSpiderCommand.add_options(self, parser)
parser.add_argument("--spider", dest="spider", default=None,
help="use this spider without looking for one")
parser.add_argument("--pipelines", action="store_true",
help="process items through pipelines")
parser.add_argument("--nolinks", dest="nolinks", action="store_true",
help="don't show links to follow (extracted requests)")
parser.add_argument("--noitems", dest="noitems", action="store_true",
help="don't show scraped items")
parser.add_argument("--nocolour", dest="nocolour", action="store_true",
help="avoid using pygments to colorize the output")
parser.add_argument("-r", "--rules", dest="rules", action="store_true",
help="use CrawlSpider rules to discover the callback")
parser.add_argument("-c", "--callback", dest="callback",
help="use this callback for parsing, instead looking for a callback")
parser.add_argument("-m", "--meta", dest="meta",
help="inject extra meta into the Request, it must be a valid raw json string")
parser.add_argument("--cbkwargs", dest="cbkwargs",
help="inject extra callback kwargs into the Request, it must be a valid raw json string")
parser.add_argument("-d", "--depth", dest="depth", type=int, default=1,
help="maximum depth for parsing requests [default: %(default)s]")
parser.add_argument("-v", "--verbose", dest="verbose", action="store_true",
help="print each depth level one by one")
parser.add_argument(
"--spider",
dest="spider",
default=None,
help="use this spider without looking for one",
)
parser.add_argument(
"--pipelines", action="store_true", help="process items through pipelines"
)
parser.add_argument(
"--nolinks",
dest="nolinks",
action="store_true",
help="don't show links to follow (extracted requests)",
)
parser.add_argument(
"--noitems",
dest="noitems",
action="store_true",
help="don't show scraped items",
)
parser.add_argument(
"--nocolour",
dest="nocolour",
action="store_true",
help="avoid using pygments to colorize the output",
)
parser.add_argument(
"-r",
"--rules",
dest="rules",
action="store_true",
help="use CrawlSpider rules to discover the callback",
)
parser.add_argument(
"-c",
"--callback",
dest="callback",
help="use this callback for parsing, instead looking for a callback",
)
parser.add_argument(
"-m",
"--meta",
dest="meta",
help="inject extra meta into the Request, it must be a valid raw json string",
)
parser.add_argument(
"--cbkwargs",
dest="cbkwargs",
help="inject extra callback kwargs into the Request, it must be a valid raw json string",
)
parser.add_argument(
"-d",
"--depth",
dest="depth",
type=int,
default=1,
help="maximum depth for parsing requests [default: %(default)s]",
)
parser.add_argument(
"-v",
"--verbose",
dest="verbose",
action="store_true",
help="print each depth level one by one",
)
@property
def max_level(self):
@ -98,13 +142,13 @@ class Command(BaseRunSpiderCommand):
if opts.verbose:
for level in range(1, self.max_level + 1):
print(f'\n>>> DEPTH LEVEL: {level} <<<')
print(f"\n>>> DEPTH LEVEL: {level} <<<")
if not opts.noitems:
self.print_items(level, colour)
if not opts.nolinks:
self.print_requests(level, colour)
else:
print(f'\n>>> STATUS DEPTH LEVEL {self.max_level} <<<')
print(f"\n>>> STATUS DEPTH LEVEL {self.max_level} <<<")
if not opts.noitems:
self.print_items(colour=colour)
if not opts.nolinks:
@ -125,14 +169,16 @@ class Command(BaseRunSpiderCommand):
return d
def get_callback_from_rules(self, spider, response):
if getattr(spider, 'rules', None):
if getattr(spider, "rules", None):
for rule in spider.rules:
if rule.link_extractor.matches(response.url):
return rule.callback or "parse"
else:
logger.error('No CrawlSpider rules found in spider %(spider)r, '
'please specify a callback to use for parsing',
{'spider': spider.name})
logger.error(
"No CrawlSpider rules found in spider %(spider)r, "
"please specify a callback to use for parsing",
{"spider": spider.name},
)
def set_spidercls(self, url, opts):
spider_loader = self.crawler_process.spider_loader
@ -140,15 +186,17 @@ class Command(BaseRunSpiderCommand):
try:
self.spidercls = spider_loader.load(opts.spider)
except KeyError:
logger.error('Unable to find spider: %(spider)s',
{'spider': opts.spider})
logger.error(
"Unable to find spider: %(spider)s", {"spider": opts.spider}
)
else:
self.spidercls = spidercls_for_request(spider_loader, Request(url))
if not self.spidercls:
logger.error('Unable to find spider for: %(url)s', {'url': url})
logger.error("Unable to find spider for: %(url)s", {"url": url})
def _start_requests(spider):
yield self.prepare_request(spider, Request(url), opts)
if self.spidercls:
self.spidercls.start_requests = _start_requests
@ -158,8 +206,7 @@ class Command(BaseRunSpiderCommand):
self.crawler_process.start()
if not self.first_response:
logger.error('No response downloaded for: %(url)s',
{'url': url})
logger.error("No response downloaded for: %(url)s", {"url": url})
def scraped_data(self, args):
items, requests, opts, depth, spider, callback = args
@ -173,8 +220,8 @@ class Command(BaseRunSpiderCommand):
scraped_data = items if opts.output else []
if depth < opts.depth:
for req in requests:
req.meta['_depth'] = depth + 1
req.meta['_callback'] = req.callback
req.meta["_depth"] = depth + 1
req.meta["_callback"] = req.callback
req.callback = callback
scraped_data += requests
@ -187,7 +234,7 @@ class Command(BaseRunSpiderCommand):
self.first_response = response
# determine real callback
cb = response.meta['_callback']
cb = response.meta["_callback"]
if not cb:
if opts.callback:
cb = opts.callback
@ -195,23 +242,27 @@ class Command(BaseRunSpiderCommand):
cb = self.get_callback_from_rules(spider, response)
if not cb:
logger.error('Cannot find a rule that matches %(url)r in spider: %(spider)s',
{'url': response.url, 'spider': spider.name})
logger.error(
"Cannot find a rule that matches %(url)r in spider: %(spider)s",
{"url": response.url, "spider": spider.name},
)
return
else:
cb = 'parse'
cb = "parse"
if not callable(cb):
cb_method = getattr(spider, cb, None)
if callable(cb_method):
cb = cb_method
else:
logger.error('Cannot find callback %(callback)r in spider: %(spider)s',
{'callback': cb, 'spider': spider.name})
logger.error(
"Cannot find callback %(callback)r in spider: %(spider)s",
{"callback": cb, "spider": spider.name},
)
return
# parse items and requests
depth = response.meta['_depth']
depth = response.meta["_depth"]
d = self.run_callback(response, cb, cb_kwargs)
d.addCallback(self._get_items_and_requests, opts, depth, spider, callback)
@ -226,8 +277,8 @@ class Command(BaseRunSpiderCommand):
if opts.cbkwargs:
request.cb_kwargs.update(opts.cbkwargs)
request.meta['_depth'] = 1
request.meta['_callback'] = request.callback
request.meta["_depth"] = 1
request.meta["_callback"] = request.callback
request.callback = callback
return request
@ -242,16 +293,22 @@ class Command(BaseRunSpiderCommand):
try:
opts.meta = json.loads(opts.meta)
except ValueError:
raise UsageError("Invalid -m/--meta value, pass a valid json string to -m or --meta. "
"Example: --meta='{\"foo\" : \"bar\"}'", print_help=False)
raise UsageError(
"Invalid -m/--meta value, pass a valid json string to -m or --meta. "
'Example: --meta=\'{"foo" : "bar"}\'',
print_help=False,
)
def process_request_cb_kwargs(self, opts):
if opts.cbkwargs:
try:
opts.cbkwargs = json.loads(opts.cbkwargs)
except ValueError:
raise UsageError("Invalid --cbkwargs value, pass a valid json string to --cbkwargs. "
"Example: --cbkwargs='{\"foo\" : \"bar\"}'", print_help=False)
raise UsageError(
"Invalid --cbkwargs value, pass a valid json string to --cbkwargs. "
'Example: --cbkwargs=\'{"foo" : "bar"}\'',
print_help=False,
)
def run(self, args, opts):
# parse arguments


@ -12,7 +12,7 @@ from scrapy.commands import BaseRunSpiderCommand
def _import_file(filepath: Union[str, PathLike]) -> ModuleType:
abspath = Path(filepath).resolve()
if abspath.suffix not in ('.py', '.pyw'):
if abspath.suffix not in (".py", ".pyw"):
raise ValueError(f"Not a Python source file: {abspath}")
dirname = str(abspath.parent)
sys.path = [dirname] + sys.path
@ -26,7 +26,7 @@ def _import_file(filepath: Union[str, PathLike]) -> ModuleType:
class Command(BaseRunSpiderCommand):
requires_project = False
default_settings = {'SPIDER_LOADER_WARN_ONLY': True}
default_settings = {"SPIDER_LOADER_WARN_ONLY": True}
def syntax(self):
return "[options] <spider_file>"


@ -7,8 +7,7 @@ from scrapy.settings import BaseSettings
class Command(ScrapyCommand):
requires_project = False
default_settings = {'LOG_ENABLED': False,
'SPIDER_LOADER_WARN_ONLY': True}
default_settings = {"LOG_ENABLED": False, "SPIDER_LOADER_WARN_ONLY": True}
def syntax(self):
return "[options]"
@ -18,16 +17,33 @@ class Command(ScrapyCommand):
def add_options(self, parser):
ScrapyCommand.add_options(self, parser)
parser.add_argument("--get", dest="get", metavar="SETTING",
help="print raw setting value")
parser.add_argument("--getbool", dest="getbool", metavar="SETTING",
help="print setting value, interpreted as a boolean")
parser.add_argument("--getint", dest="getint", metavar="SETTING",
help="print setting value, interpreted as an integer")
parser.add_argument("--getfloat", dest="getfloat", metavar="SETTING",
help="print setting value, interpreted as a float")
parser.add_argument("--getlist", dest="getlist", metavar="SETTING",
help="print setting value, interpreted as a list")
parser.add_argument(
"--get", dest="get", metavar="SETTING", help="print raw setting value"
)
parser.add_argument(
"--getbool",
dest="getbool",
metavar="SETTING",
help="print setting value, interpreted as a boolean",
)
parser.add_argument(
"--getint",
dest="getint",
metavar="SETTING",
help="print setting value, interpreted as an integer",
)
parser.add_argument(
"--getfloat",
dest="getfloat",
metavar="SETTING",
help="print setting value, interpreted as a float",
)
parser.add_argument(
"--getlist",
dest="getlist",
metavar="SETTING",
help="print setting value, interpreted as a list",
)
def run(self, args, opts):
settings = self.crawler_process.settings


@ -16,9 +16,9 @@ class Command(ScrapyCommand):
requires_project = False
default_settings = {
'KEEP_ALIVE': True,
'LOGSTATS_INTERVAL': 0,
'DUPEFILTER_CLASS': 'scrapy.dupefilters.BaseDupeFilter',
"KEEP_ALIVE": True,
"LOGSTATS_INTERVAL": 0,
"DUPEFILTER_CLASS": "scrapy.dupefilters.BaseDupeFilter",
}
def syntax(self):
@ -28,17 +28,26 @@ class Command(ScrapyCommand):
return "Interactive scraping console"
def long_desc(self):
return ("Interactive console for scraping the given url or file. "
"Use ./file.html syntax or full path for local file.")
return (
"Interactive console for scraping the given url or file. "
"Use ./file.html syntax or full path for local file."
)
def add_options(self, parser):
ScrapyCommand.add_options(self, parser)
parser.add_argument("-c", dest="code",
help="evaluate the code in the shell, print the result and exit")
parser.add_argument("--spider", dest="spider",
help="use this spider")
parser.add_argument("--no-redirect", dest="no_redirect", action="store_true", default=False,
help="do not handle HTTP 3xx status codes and print response as-is")
parser.add_argument(
"-c",
dest="code",
help="evaluate the code in the shell, print the result and exit",
)
parser.add_argument("--spider", dest="spider", help="use this spider")
parser.add_argument(
"--no-redirect",
dest="no_redirect",
action="store_true",
default=False,
help="do not handle HTTP 3xx status codes and print response as-is",
)
def update_vars(self, vars):
"""You can use this function to update the Scrapy objects that will be
@ -58,8 +67,9 @@ class Command(ScrapyCommand):
if opts.spider:
spidercls = spider_loader.load(opts.spider)
elif url:
spidercls = spidercls_for_request(spider_loader, Request(url),
spidercls, log_multiple=True)
spidercls = spidercls_for_request(
spider_loader, Request(url), spidercls, log_multiple=True
)
# The crawler is created this way since the Shell manually handles the
# crawling engine, so the set up in the crawl method won't work
@ -74,7 +84,9 @@ class Command(ScrapyCommand):
shell.start(url=url, redirect=not opts.no_redirect)
def _start_crawler_thread(self):
t = Thread(target=self.crawler_process.start,
kwargs={'stop_after_crawl': False, 'install_signal_handlers': False})
t = Thread(
target=self.crawler_process.start,
kwargs={"stop_after_crawl": False, "install_signal_handlers": False},
)
t.daemon = True
t.start()
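The update_vars() docstring above describes the hook for extending the shell namespace. A minimal sketch of a subclass overriding it; the subclass name and the extra variable are illustrative, not part of the commit:

import time

from scrapy.commands.shell import Command  # the class shown above


class ShellWithClock(Command):
    def update_vars(self, vars):
        super().update_vars(vars)
        vars["now"] = time.time  # exposed as `now` inside the shell session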

View File

@ -13,14 +13,14 @@ from scrapy.exceptions import UsageError
TEMPLATES_TO_RENDER = (
('scrapy.cfg',),
('${project_name}', 'settings.py.tmpl'),
('${project_name}', 'items.py.tmpl'),
('${project_name}', 'pipelines.py.tmpl'),
('${project_name}', 'middlewares.py.tmpl'),
("scrapy.cfg",),
("${project_name}", "settings.py.tmpl"),
("${project_name}", "items.py.tmpl"),
("${project_name}", "pipelines.py.tmpl"),
("${project_name}", "middlewares.py.tmpl"),
)
IGNORE = ignore_patterns('*.pyc', '__pycache__', '.svn')
IGNORE = ignore_patterns("*.pyc", "__pycache__", ".svn")
def _make_writable(path):
@ -31,8 +31,7 @@ def _make_writable(path):
class Command(ScrapyCommand):
requires_project = False
default_settings = {'LOG_ENABLED': False,
'SPIDER_LOADER_WARN_ONLY': True}
default_settings = {"LOG_ENABLED": False, "SPIDER_LOADER_WARN_ONLY": True}
def syntax(self):
return "<project_name> [project_dir]"
@ -45,11 +44,13 @@ class Command(ScrapyCommand):
spec = find_spec(module_name)
return spec is not None and spec.loader is not None
if not re.search(r'^[_a-zA-Z]\w*$', project_name):
print('Error: Project names must begin with a letter and contain'
' only\nletters, numbers and underscores')
if not re.search(r"^[_a-zA-Z]\w*$", project_name):
print(
"Error: Project names must begin with a letter and contain"
" only\nletters, numbers and underscores"
)
elif _module_exists(project_name):
print(f'Error: Module {project_name!r} already exists')
print(f"Error: Module {project_name!r} already exists")
else:
return True
return False
@ -96,9 +97,9 @@ class Command(ScrapyCommand):
else:
project_dir = Path(args[0])
if (project_dir / 'scrapy.cfg').exists():
if (project_dir / "scrapy.cfg").exists():
self.exitcode = 1
print(f'Error: scrapy.cfg already exists in {project_dir.resolve()}')
print(f"Error: scrapy.cfg already exists in {project_dir.resolve()}")
return
if not self._is_valid_name(project_name):
@ -106,12 +107,24 @@ class Command(ScrapyCommand):
return
self._copytree(Path(self.templates_dir), project_dir.resolve())
move(project_dir / 'module', project_dir / project_name)
move(project_dir / "module", project_dir / project_name)
for paths in TEMPLATES_TO_RENDER:
tplfile = Path(project_dir, *(string.Template(s).substitute(project_name=project_name) for s in paths))
render_templatefile(tplfile, project_name=project_name, ProjectName=string_camelcase(project_name))
print(f"New Scrapy project '{project_name}', using template directory "
f"'{self.templates_dir}', created in:")
tplfile = Path(
project_dir,
*(
string.Template(s).substitute(project_name=project_name)
for s in paths
),
)
render_templatefile(
tplfile,
project_name=project_name,
ProjectName=string_camelcase(project_name),
)
print(
f"New Scrapy project '{project_name}', using template directory "
f"'{self.templates_dir}', created in:"
)
print(f" {project_dir.resolve()}\n")
print("You can start your first spider with:")
print(f" cd {project_dir}")
@ -119,7 +132,9 @@ class Command(ScrapyCommand):
@property
def templates_dir(self) -> str:
return str(Path(
self.settings['TEMPLATES_DIR'] or Path(scrapy.__path__[0], 'templates'),
'project'
))
return str(
Path(
self.settings["TEMPLATES_DIR"] or Path(scrapy.__path__[0], "templates"),
"project",
)
)
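The reflowed tplfile expression above substitutes ${project_name} into each tuple from TEMPLATES_TO_RENDER. A small hedged illustration of that substitution with a made-up project name:

import string

paths = ("${project_name}", "settings.py.tmpl")
parts = [string.Template(s).substitute(project_name="myproject") for s in paths]
print(parts)  # ['myproject', 'settings.py.tmpl']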

View File

@ -5,8 +5,7 @@ from scrapy.utils.versions import scrapy_components_versions
class Command(ScrapyCommand):
default_settings = {'LOG_ENABLED': False,
'SPIDER_LOADER_WARN_ONLY': True}
default_settings = {"LOG_ENABLED": False, "SPIDER_LOADER_WARN_ONLY": True}
def syntax(self):
return "[-v]"
@ -16,8 +15,13 @@ class Command(ScrapyCommand):
def add_options(self, parser):
ScrapyCommand.add_options(self, parser)
parser.add_argument("--verbose", "-v", dest="verbose", action="store_true",
help="also display twisted/python/platform info (useful for bug reports)")
parser.add_argument(
"--verbose",
"-v",
dest="verbose",
action="store_true",
help="also display twisted/python/platform info (useful for bug reports)",
)
def run(self, args, opts):
if opts.verbose:

View File

@ -4,16 +4,17 @@ from scrapy.utils.response import open_in_browser
class Command(fetch.Command):
def short_desc(self):
return "Open URL in browser, as seen by Scrapy"
def long_desc(self):
return "Fetch a URL using the Scrapy downloader and show its contents in a browser"
return (
"Fetch a URL using the Scrapy downloader and show its contents in a browser"
)
def add_options(self, parser):
super().add_options(parser)
parser.add_argument('--headers', help=argparse.SUPPRESS)
parser.add_argument("--headers", help=argparse.SUPPRESS)
def _print_response(self, response, opts):
open_in_browser(response)

View File

@ -11,16 +11,17 @@ from scrapy.utils.spider import iterate_spider_output
class Contract:
""" Abstract class for contracts """
"""Abstract class for contracts"""
request_cls = None
def __init__(self, method, *args):
self.testcase_pre = _create_testcase(method, f'@{self.name} pre-hook')
self.testcase_post = _create_testcase(method, f'@{self.name} post-hook')
self.testcase_pre = _create_testcase(method, f"@{self.name} pre-hook")
self.testcase_post = _create_testcase(method, f"@{self.name} post-hook")
self.args = args
def add_pre_hook(self, request, results):
if hasattr(self, 'pre_process'):
if hasattr(self, "pre_process"):
cb = request.callback
@wraps(cb)
@ -43,7 +44,7 @@ class Contract:
return request
def add_post_hook(self, request, results):
if hasattr(self, 'post_process'):
if hasattr(self, "post_process"):
cb = request.callback
@wraps(cb)
@ -88,12 +89,12 @@ class ContractsManager:
def extract_contracts(self, method):
contracts = []
for line in method.__doc__.split('\n'):
for line in method.__doc__.split("\n"):
line = line.strip()
if line.startswith('@'):
name, args = re.match(r'@(\w+)\s*(.*)', line).groups()
args = re.split(r'\s+', args)
if line.startswith("@"):
name, args = re.match(r"@(\w+)\s*(.*)", line).groups()
args = re.split(r"\s+", args)
contracts.append(self.contracts[name](method, *args))
@ -106,7 +107,7 @@ class ContractsManager:
try:
requests.append(self.from_method(bound_method, results))
except Exception:
case = _create_testcase(bound_method, 'contract')
case = _create_testcase(bound_method, "contract")
results.addError(case, sys.exc_info())
return requests
@ -124,13 +125,13 @@ class ContractsManager:
# Don't filter requests to allow
# testing different callbacks on the same URL.
kwargs['dont_filter'] = True
kwargs['callback'] = method
kwargs["dont_filter"] = True
kwargs["callback"] = method
for contract in contracts:
kwargs = contract.adjust_request_args(kwargs)
args.remove('self')
args.remove("self")
# check if all positional arguments are defined in kwargs
if set(args).issubset(set(kwargs)):
@ -146,7 +147,7 @@ class ContractsManager:
return request
def _clean_req(self, request, method, results):
""" stop the request from returning objects and records any errors """
"""stop the request from returning objects and records any errors"""
cb = request.callback
@ -156,11 +157,11 @@ class ContractsManager:
output = cb(response, **cb_kwargs)
output = list(iterate_spider_output(output))
except Exception:
case = _create_testcase(method, 'callback')
case = _create_testcase(method, "callback")
results.addError(case, sys.exc_info())
def eb_wrapper(failure):
case = _create_testcase(method, 'errback')
case = _create_testcase(method, "errback")
exc_info = failure.type, failure.value, failure.getTracebackObject()
results.addError(case, exc_info)
@ -175,6 +176,6 @@ def _create_testcase(method, desc):
def __str__(_self):
return f"[{spider}] {method.__name__} ({desc})"
name = f'{spider}_{method.__name__}'
name = f"{spider}_{method.__name__}"
setattr(ContractTestCase, name, lambda x: x)
return ContractTestCase(name)
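extract_contracts() above recognises @name lines in a callback docstring via re.match(r"@(\w+)\s*(.*)", line). A hedged, standalone sketch of that parsing step on an illustrative docstring (the spider, URL and field names are made up; the contract names come from the default contracts in the next file):

import re


class ProductSpiderSketch:
    def parse(self, response):
        """Parse a product listing page.

        @url http://example.com/products
        @returns items 1 16
        @scrapes name price
        """


for line in ProductSpiderSketch.parse.__doc__.split("\n"):
    line = line.strip()
    if line.startswith("@"):
        name, args = re.match(r"@(\w+)\s*(.*)", line).groups()
        print(name, re.split(r"\s+", args))
# url ['http://example.com/products']
# returns ['items', '1', '16']
# scrapes ['name', 'price']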

View File

@ -9,50 +9,50 @@ from scrapy.http import Request
# contracts
class UrlContract(Contract):
""" Contract to set the url of the request (mandatory)
@url http://scrapy.org
"""Contract to set the url of the request (mandatory)
@url http://scrapy.org
"""
name = 'url'
name = "url"
def adjust_request_args(self, args):
args['url'] = self.args[0]
args["url"] = self.args[0]
return args
class CallbackKeywordArgumentsContract(Contract):
""" Contract to set the keyword arguments for the request.
The value should be a JSON-encoded dictionary, e.g.:
"""Contract to set the keyword arguments for the request.
The value should be a JSON-encoded dictionary, e.g.:
@cb_kwargs {"arg1": "some value"}
@cb_kwargs {"arg1": "some value"}
"""
name = 'cb_kwargs'
name = "cb_kwargs"
def adjust_request_args(self, args):
args['cb_kwargs'] = json.loads(' '.join(self.args))
args["cb_kwargs"] = json.loads(" ".join(self.args))
return args
class ReturnsContract(Contract):
""" Contract to check the output of a callback
"""Contract to check the output of a callback
general form:
@returns request(s)/item(s) [min=1 [max]]
general form:
@returns request(s)/item(s) [min=1 [max]]
e.g.:
@returns request
@returns request 2
@returns request 2 10
@returns request 0 10
e.g.:
@returns request
@returns request 2
@returns request 2 10
@returns request 0 10
"""
name = 'returns'
name = "returns"
object_type_verifiers = {
'request': lambda x: isinstance(x, Request),
'requests': lambda x: isinstance(x, Request),
'item': is_item,
'items': is_item,
"request": lambda x: isinstance(x, Request),
"requests": lambda x: isinstance(x, Request),
"item": is_item,
"items": is_item,
}
def __init__(self, *args, **kwargs):
@ -73,7 +73,7 @@ class ReturnsContract(Contract):
try:
self.max_bound = int(self.args[2])
except IndexError:
self.max_bound = float('inf')
self.max_bound = float("inf")
def post_process(self, output):
occurrences = 0
@ -81,23 +81,25 @@ class ReturnsContract(Contract):
if self.obj_type_verifier(x):
occurrences += 1
assertion = (self.min_bound <= occurrences <= self.max_bound)
assertion = self.min_bound <= occurrences <= self.max_bound
if not assertion:
if self.min_bound == self.max_bound:
expected = self.min_bound
else:
expected = f'{self.min_bound}..{self.max_bound}'
expected = f"{self.min_bound}..{self.max_bound}"
raise ContractFail(f"Returned {occurrences} {self.obj_name}, expected {expected}")
raise ContractFail(
f"Returned {occurrences} {self.obj_name}, expected {expected}"
)
class ScrapesContract(Contract):
""" Contract to check presence of fields in scraped items
@scrapes page_name page_body
"""Contract to check presence of fields in scraped items
@scrapes page_name page_body
"""
name = 'scrapes'
name = "scrapes"
def post_process(self, output):
for x in output:

View File

@ -41,9 +41,11 @@ class Slot:
def __repr__(self):
cls_name = self.__class__.__name__
return (f"{cls_name}(concurrency={self.concurrency!r}, "
f"delay={self.delay:.2f}, "
f"randomize_delay={self.randomize_delay!r})")
return (
f"{cls_name}(concurrency={self.concurrency!r}, "
f"delay={self.delay:.2f}, "
f"randomize_delay={self.randomize_delay!r})"
)
def __str__(self):
return (
@ -56,11 +58,11 @@ class Slot:
def _get_concurrency_delay(concurrency, spider, settings):
delay = settings.getfloat('DOWNLOAD_DELAY')
if hasattr(spider, 'download_delay'):
delay = settings.getfloat("DOWNLOAD_DELAY")
if hasattr(spider, "download_delay"):
delay = spider.download_delay
if hasattr(spider, 'max_concurrent_requests'):
if hasattr(spider, "max_concurrent_requests"):
concurrency = spider.max_concurrent_requests
return concurrency, delay
@ -68,7 +70,7 @@ def _get_concurrency_delay(concurrency, spider, settings):
class Downloader:
DOWNLOAD_SLOT = 'download_slot'
DOWNLOAD_SLOT = "download_slot"
def __init__(self, crawler):
self.settings = crawler.settings
@ -76,10 +78,10 @@ class Downloader:
self.slots = {}
self.active = set()
self.handlers = DownloadHandlers(crawler)
self.total_concurrency = self.settings.getint('CONCURRENT_REQUESTS')
self.domain_concurrency = self.settings.getint('CONCURRENT_REQUESTS_PER_DOMAIN')
self.ip_concurrency = self.settings.getint('CONCURRENT_REQUESTS_PER_IP')
self.randomize_delay = self.settings.getbool('RANDOMIZE_DOWNLOAD_DELAY')
self.total_concurrency = self.settings.getint("CONCURRENT_REQUESTS")
self.domain_concurrency = self.settings.getint("CONCURRENT_REQUESTS_PER_DOMAIN")
self.ip_concurrency = self.settings.getint("CONCURRENT_REQUESTS_PER_IP")
self.randomize_delay = self.settings.getbool("RANDOMIZE_DOWNLOAD_DELAY")
self.middleware = DownloaderMiddlewareManager.from_crawler(crawler)
self._slot_gc_loop = task.LoopingCall(self._slot_gc)
self._slot_gc_loop.start(60)
@ -99,7 +101,9 @@ class Downloader:
def _get_slot(self, request, spider):
key = self._get_slot_key(request, spider)
if key not in self.slots:
conc = self.ip_concurrency if self.ip_concurrency else self.domain_concurrency
conc = (
self.ip_concurrency if self.ip_concurrency else self.domain_concurrency
)
conc, delay = _get_concurrency_delay(conc, spider, self.settings)
self.slots[key] = Slot(conc, delay, self.randomize_delay)
@ -109,7 +113,7 @@ class Downloader:
if self.DOWNLOAD_SLOT in request.meta:
return request.meta[self.DOWNLOAD_SLOT]
key = urlparse_cached(request).hostname or ''
key = urlparse_cached(request).hostname or ""
if self.ip_concurrency:
key = dnscache.get(key, key)
@ -124,9 +128,9 @@ class Downloader:
return response
slot.active.add(request)
self.signals.send_catch_log(signal=signals.request_reached_downloader,
request=request,
spider=spider)
self.signals.send_catch_log(
signal=signals.request_reached_downloader, request=request, spider=spider
)
deferred = defer.Deferred().addBoth(_deactivate)
slot.queue.append((request, deferred))
self._process_queue(spider, slot)
@ -134,6 +138,7 @@ class Downloader:
def _process_queue(self, spider, slot):
from twisted.internet import reactor
if slot.latercall and slot.latercall.active():
return
@ -143,7 +148,9 @@ class Downloader:
if delay:
penalty = delay - now + slot.lastseen
if penalty > 0:
slot.latercall = reactor.callLater(penalty, self._process_queue, spider, slot)
slot.latercall = reactor.callLater(
penalty, self._process_queue, spider, slot
)
return
# Process enqueued requests if there are free slots to transfer for this slot
@ -166,11 +173,14 @@ class Downloader:
# 2. Notify response_downloaded listeners about the recent download
# before querying queue for next request
def _downloaded(response):
self.signals.send_catch_log(signal=signals.response_downloaded,
response=response,
request=request,
spider=spider)
self.signals.send_catch_log(
signal=signals.response_downloaded,
response=response,
request=request,
spider=spider,
)
return response
dfd.addCallback(_downloaded)
# 3. After response arrives, remove the request from transferring
@ -182,9 +192,9 @@ class Downloader:
def finish_transferring(_):
slot.transferring.remove(request)
self._process_queue(spider, slot)
self.signals.send_catch_log(signal=signals.request_left_downloader,
request=request,
spider=spider)
self.signals.send_catch_log(
signal=signals.request_left_downloader, request=request, spider=spider
)
return _
return dfd.addBoth(finish_transferring)
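For reference, the slot key and per-spider attributes touched in the hunks above are the request/spider-side knobs. A hedged sketch; the meta key and attribute names come from the code above, the values and class name are illustrative:

from scrapy import Request, Spider


class ThrottledSpiderSketch(Spider):
    name = "throttled_sketch"
    download_delay = 2.5         # read by _get_concurrency_delay()
    max_concurrent_requests = 4  # overrides the per-slot concurrency


# Route requests into an explicit download slot via the "download_slot" meta key:
req = Request("http://example.com/page", meta={"download_slot": "shared-pool"})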

View File

@ -2,13 +2,22 @@ import warnings
from OpenSSL import SSL
from twisted.internet._sslverify import _setAcceptableProtocols
from twisted.internet.ssl import optionsForClientTLS, CertificateOptions, platformTrust, AcceptableCiphers
from twisted.internet.ssl import (
optionsForClientTLS,
CertificateOptions,
platformTrust,
AcceptableCiphers,
)
from twisted.web.client import BrowserLikePolicyForHTTPS
from twisted.web.iweb import IPolicyForHTTPS
from zope.interface.declarations import implementer
from zope.interface.verify import verifyObject
from scrapy.core.downloader.tls import DEFAULT_CIPHERS, openssl_methods, ScrapyClientTLSOptions
from scrapy.core.downloader.tls import (
DEFAULT_CIPHERS,
openssl_methods,
ScrapyClientTLSOptions,
)
from scrapy.utils.misc import create_instance, load_object
@ -24,7 +33,14 @@ class ScrapyClientContextFactory(BrowserLikePolicyForHTTPS):
understand the TLSv1, TLSv1.1 and TLSv1.2 protocols.'
"""
def __init__(self, method=SSL.SSLv23_METHOD, tls_verbose_logging=False, tls_ciphers=None, *args, **kwargs):
def __init__(
self,
method=SSL.SSLv23_METHOD,
tls_verbose_logging=False,
tls_ciphers=None,
*args,
**kwargs,
):
super().__init__(*args, **kwargs)
self._ssl_method = method
self.tls_verbose_logging = tls_verbose_logging
@ -35,9 +51,15 @@ class ScrapyClientContextFactory(BrowserLikePolicyForHTTPS):
@classmethod
def from_settings(cls, settings, method=SSL.SSLv23_METHOD, *args, **kwargs):
tls_verbose_logging = settings.getbool('DOWNLOADER_CLIENT_TLS_VERBOSE_LOGGING')
tls_ciphers = settings['DOWNLOADER_CLIENT_TLS_CIPHERS']
return cls(method=method, tls_verbose_logging=tls_verbose_logging, tls_ciphers=tls_ciphers, *args, **kwargs)
tls_verbose_logging = settings.getbool("DOWNLOADER_CLIENT_TLS_VERBOSE_LOGGING")
tls_ciphers = settings["DOWNLOADER_CLIENT_TLS_CIPHERS"]
return cls(
method=method,
tls_verbose_logging=tls_verbose_logging,
tls_ciphers=tls_ciphers,
*args,
**kwargs,
)
def getCertificateOptions(self):
# setting verify=True will require you to provide CAs
@ -53,7 +75,7 @@ class ScrapyClientContextFactory(BrowserLikePolicyForHTTPS):
# not calling super().__init__
return CertificateOptions(
verify=False,
method=getattr(self, 'method', getattr(self, '_ssl_method', None)),
method=getattr(self, "method", getattr(self, "_ssl_method", None)),
fixBrokenPeers=True,
acceptableCiphers=self.tls_ciphers,
)
@ -64,8 +86,11 @@ class ScrapyClientContextFactory(BrowserLikePolicyForHTTPS):
return self.getCertificateOptions().getContext()
def creatorForNetloc(self, hostname, port):
return ScrapyClientTLSOptions(hostname.decode("ascii"), self.getContext(),
verbose_logging=self.tls_verbose_logging)
return ScrapyClientTLSOptions(
hostname.decode("ascii"),
self.getContext(),
verbose_logging=self.tls_verbose_logging,
)
@implementer(IPolicyForHTTPS)
@ -95,7 +120,7 @@ class BrowserLikeContextFactory(ScrapyClientContextFactory):
return optionsForClientTLS(
hostname=hostname.decode("ascii"),
trustRoot=platformTrust(),
extraCertificateOptions={'method': self._ssl_method},
extraCertificateOptions={"method": self._ssl_method},
)
@ -118,8 +143,8 @@ class AcceptableProtocolsContextFactory:
def load_context_factory_from_settings(settings, crawler):
ssl_method = openssl_methods[settings.get('DOWNLOADER_CLIENT_TLS_METHOD')]
context_factory_cls = load_object(settings['DOWNLOADER_CLIENTCONTEXTFACTORY'])
ssl_method = openssl_methods[settings.get("DOWNLOADER_CLIENT_TLS_METHOD")]
context_factory_cls = load_object(settings["DOWNLOADER_CLIENTCONTEXTFACTORY"])
# try method-aware context factory
try:
context_factory = create_instance(

View File

@ -15,14 +15,14 @@ logger = logging.getLogger(__name__)
class DownloadHandlers:
def __init__(self, crawler):
self._crawler = crawler
self._schemes = {} # stores acceptable schemes on instancing
self._handlers = {} # stores instanced handlers for schemes
self._notconfigured = {} # remembers failed handlers
handlers = without_none_values(
crawler.settings.getwithbase('DOWNLOAD_HANDLERS'))
crawler.settings.getwithbase("DOWNLOAD_HANDLERS")
)
for scheme, clspath in handlers.items():
self._schemes[scheme] = clspath
self._load_handler(scheme, skip_lazy=True)
@ -38,7 +38,7 @@ class DownloadHandlers:
if scheme in self._notconfigured:
return None
if scheme not in self._schemes:
self._notconfigured[scheme] = 'no handler available for that scheme'
self._notconfigured[scheme] = "no handler available for that scheme"
return None
return self._load_handler(scheme)
@ -47,7 +47,7 @@ class DownloadHandlers:
path = self._schemes[scheme]
try:
dhcls = load_object(path)
if skip_lazy and getattr(dhcls, 'lazy', True):
if skip_lazy and getattr(dhcls, "lazy", True):
return None
dh = create_instance(
objcls=dhcls,
@ -58,9 +58,12 @@ class DownloadHandlers:
self._notconfigured[scheme] = str(ex)
return None
except Exception as ex:
logger.error('Loading "%(clspath)s" for scheme "%(scheme)s"',
{"clspath": path, "scheme": scheme},
exc_info=True, extra={'crawler': self._crawler})
logger.error(
'Loading "%(clspath)s" for scheme "%(scheme)s"',
{"clspath": path, "scheme": scheme},
exc_info=True,
extra={"crawler": self._crawler},
)
self._notconfigured[scheme] = str(ex)
return None
else:
@ -71,11 +74,13 @@ class DownloadHandlers:
scheme = urlparse_cached(request).scheme
handler = self._get_handler(scheme)
if not handler:
raise NotSupported(f"Unsupported URL scheme '{scheme}': {self._notconfigured[scheme]}")
raise NotSupported(
f"Unsupported URL scheme '{scheme}': {self._notconfigured[scheme]}"
)
return handler.download_request(request, spider)
@defer.inlineCallbacks
def _close(self, *_a, **_kw):
for dh in self._handlers.values():
if hasattr(dh, 'close'):
if hasattr(dh, "close"):
yield dh.close()

View File

@ -14,9 +14,8 @@ class DataURIDownloadHandler:
respcls = responsetypes.from_mimetype(uri.media_type)
resp_kwargs = {}
if (issubclass(respcls, TextResponse)
and uri.media_type.split('/')[0] == 'text'):
charset = uri.media_type_parameters.get('charset')
resp_kwargs['encoding'] = charset
if issubclass(respcls, TextResponse) and uri.media_type.split("/")[0] == "text":
charset = uri.media_type_parameters.get("charset")
resp_kwargs["encoding"] = charset
return respcls(url=request.url, body=uri.data, **resp_kwargs)

View File

@ -71,9 +71,9 @@ class FTPDownloadHandler:
}
def __init__(self, settings):
self.default_user = settings['FTP_USER']
self.default_password = settings['FTP_PASSWORD']
self.passive_mode = settings['FTP_PASSIVE_MODE']
self.default_user = settings["FTP_USER"]
self.default_password = settings["FTP_PASSWORD"]
self.passive_mode = settings["FTP_PASSIVE_MODE"]
@classmethod
def from_crawler(cls, crawler):
@ -81,12 +81,16 @@ class FTPDownloadHandler:
def download_request(self, request, spider):
from twisted.internet import reactor
parsed_url = urlparse_cached(request)
user = request.meta.get("ftp_user", self.default_user)
password = request.meta.get("ftp_password", self.default_password)
passive_mode = 1 if bool(request.meta.get("ftp_passive",
self.passive_mode)) else 0
creator = ClientCreator(reactor, FTPClient, user, password, passive=passive_mode)
passive_mode = (
1 if bool(request.meta.get("ftp_passive", self.passive_mode)) else 0
)
creator = ClientCreator(
reactor, FTPClient, user, password, passive=passive_mode
)
dfd = creator.connectTCP(parsed_url.hostname, parsed_url.port or 21)
return dfd.addCallback(self.gotClient, request, unquote(parsed_url.path))
@ -103,7 +107,7 @@ class FTPDownloadHandler:
def _build_response(self, result, request, protocol):
self.result = result
protocol.close()
headers = {"local filename": protocol.filename or '', "size": protocol.size}
headers = {"local filename": protocol.filename or "", "size": protocol.size}
body = to_bytes(protocol.filename or protocol.body.read())
respcls = responsetypes.from_args(url=request.url, body=body)
return respcls(url=request.url, status=200, body=body, headers=headers)
@ -115,5 +119,7 @@ class FTPDownloadHandler:
if m:
ftpcode = m.group()
httpcode = self.CODE_MAPPING.get(ftpcode, self.CODE_MAPPING["default"])
return Response(url=request.url, status=httpcode, body=to_bytes(message))
return Response(
url=request.url, status=httpcode, body=to_bytes(message)
)
raise result.type(result.value)
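The ftp_user / ftp_password / ftp_passive meta keys read in download_request() above can be set per request. A hedged sketch; the host and credentials are illustrative:

from scrapy import Request

req = Request(
    "ftp://ftp.example.com/pub/file.txt",
    meta={"ftp_user": "anonymous", "ftp_password": "guest", "ftp_passive": True},
)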

View File

@ -8,8 +8,10 @@ class HTTP10DownloadHandler:
lazy = False
def __init__(self, settings, crawler=None):
self.HTTPClientFactory = load_object(settings['DOWNLOADER_HTTPCLIENTFACTORY'])
self.ClientContextFactory = load_object(settings['DOWNLOADER_CLIENTCONTEXTFACTORY'])
self.HTTPClientFactory = load_object(settings["DOWNLOADER_HTTPCLIENTFACTORY"])
self.ClientContextFactory = load_object(
settings["DOWNLOADER_CLIENTCONTEXTFACTORY"]
)
self._settings = settings
self._crawler = crawler
@ -25,8 +27,9 @@ class HTTP10DownloadHandler:
def _connect(self, factory):
from twisted.internet import reactor
host, port = to_unicode(factory.host), factory.port
if factory.scheme == b'https':
if factory.scheme == b"https":
client_context_factory = create_instance(
objcls=self.ClientContextFactory,
settings=self._settings,

View File

@ -12,7 +12,13 @@ from twisted.internet import defer, protocol, ssl
from twisted.internet.endpoints import TCP4ClientEndpoint
from twisted.internet.error import TimeoutError
from twisted.python.failure import Failure
from twisted.web.client import Agent, HTTPConnectionPool, ResponseDone, ResponseFailed, URI
from twisted.web.client import (
Agent,
HTTPConnectionPool,
ResponseDone,
ResponseFailed,
URI,
)
from twisted.web.http import _DataLoss, PotentialDataLoss
from twisted.web.http_headers import Headers as TxHeaders
from twisted.web.iweb import IBodyProducer, UNKNOWN_LENGTH
@ -36,14 +42,17 @@ class HTTP11DownloadHandler:
self._crawler = crawler
from twisted.internet import reactor
self._pool = HTTPConnectionPool(reactor, persistent=True)
self._pool.maxPersistentPerHost = settings.getint('CONCURRENT_REQUESTS_PER_DOMAIN')
self._pool.maxPersistentPerHost = settings.getint(
"CONCURRENT_REQUESTS_PER_DOMAIN"
)
self._pool._factory.noisy = False
self._contextFactory = load_context_factory_from_settings(settings, crawler)
self._default_maxsize = settings.getint('DOWNLOAD_MAXSIZE')
self._default_warnsize = settings.getint('DOWNLOAD_WARNSIZE')
self._fail_on_dataloss = settings.getbool('DOWNLOAD_FAIL_ON_DATALOSS')
self._default_maxsize = settings.getint("DOWNLOAD_MAXSIZE")
self._default_warnsize = settings.getint("DOWNLOAD_WARNSIZE")
self._fail_on_dataloss = settings.getbool("DOWNLOAD_FAIL_ON_DATALOSS")
self._disconnect_timeout = 1
@classmethod
@ -55,8 +64,8 @@ class HTTP11DownloadHandler:
agent = ScrapyAgent(
contextFactory=self._contextFactory,
pool=self._pool,
maxsize=getattr(spider, 'download_maxsize', self._default_maxsize),
warnsize=getattr(spider, 'download_warnsize', self._default_warnsize),
maxsize=getattr(spider, "download_maxsize", self._default_maxsize),
warnsize=getattr(spider, "download_warnsize", self._default_warnsize),
fail_on_dataloss=self._fail_on_dataloss,
crawler=self._crawler,
)
@ -64,6 +73,7 @@ class HTTP11DownloadHandler:
def close(self):
from twisted.internet import reactor
d = self._pool.closeCachedConnections()
# closeCachedConnections will hang on network or server issues, so
# we'll manually timeout the deferred.
@ -96,11 +106,23 @@ class TunnelingTCP4ClientEndpoint(TCP4ClientEndpoint):
with this endpoint comes from the pool and a CONNECT has already been issued
for it.
"""
_truncatedLength = 1000
_responseAnswer = r'HTTP/1\.. (?P<status>\d{3})(?P<reason>.{,' + str(_truncatedLength) + r'})'
_responseAnswer = (
r"HTTP/1\.. (?P<status>\d{3})(?P<reason>.{," + str(_truncatedLength) + r"})"
)
_responseMatcher = re.compile(_responseAnswer.encode())
def __init__(self, reactor, host, port, proxyConf, contextFactory, timeout=30, bindAddress=None):
def __init__(
self,
reactor,
host,
port,
proxyConf,
contextFactory,
timeout=30,
bindAddress=None,
):
proxyHost, proxyPort, self._proxyAuthHeader = proxyConf
super().__init__(reactor, proxyHost, proxyPort, timeout, bindAddress)
self._tunnelReadyDeferred = defer.Deferred()
@ -111,7 +133,9 @@ class TunnelingTCP4ClientEndpoint(TCP4ClientEndpoint):
def requestTunnel(self, protocol):
"""Asks the proxy to open a tunnel."""
tunnelReq = tunnel_request_data(self._tunneledHost, self._tunneledPort, self._proxyAuthHeader)
tunnelReq = tunnel_request_data(
self._tunneledHost, self._tunneledPort, self._proxyAuthHeader
)
protocol.transport.write(tunnelReq)
self._protocolDataReceived = protocol.dataReceived
protocol.dataReceived = self.processProxyResponse
@ -129,24 +153,30 @@ class TunnelingTCP4ClientEndpoint(TCP4ClientEndpoint):
# from the proxy so that we don't send those bytes to the TLS layer
#
# see https://github.com/scrapy/scrapy/issues/2491
if b'\r\n\r\n' not in self._connectBuffer:
if b"\r\n\r\n" not in self._connectBuffer:
return
self._protocol.dataReceived = self._protocolDataReceived
respm = TunnelingTCP4ClientEndpoint._responseMatcher.match(self._connectBuffer)
if respm and int(respm.group('status')) == 200:
if respm and int(respm.group("status")) == 200:
# set proper Server Name Indication extension
sslOptions = self._contextFactory.creatorForNetloc(self._tunneledHost, self._tunneledPort)
sslOptions = self._contextFactory.creatorForNetloc(
self._tunneledHost, self._tunneledPort
)
self._protocol.transport.startTLS(sslOptions, self._protocolFactory)
self._tunnelReadyDeferred.callback(self._protocol)
else:
if respm:
extra = {'status': int(respm.group('status')),
'reason': respm.group('reason').strip()}
extra = {
"status": int(respm.group("status")),
"reason": respm.group("reason").strip(),
}
else:
extra = rcvd_bytes[:self._truncatedLength]
extra = rcvd_bytes[: self._truncatedLength]
self._tunnelReadyDeferred.errback(
TunnelError('Could not open CONNECT tunnel with proxy '
f'{self._host}:{self._port} [{extra!r}]')
TunnelError(
"Could not open CONNECT tunnel with proxy "
f"{self._host}:{self._port} [{extra!r}]"
)
)
def connectFailed(self, reason):
@ -173,12 +203,12 @@ def tunnel_request_data(host, port, proxy_auth_header=None):
>>> s(tunnel_request_data(b"example.com", "8090"))
'CONNECT example.com:8090 HTTP/1.1\r\nHost: example.com:8090\r\n\r\n'
"""
host_value = to_bytes(host, encoding='ascii') + b':' + to_bytes(str(port))
tunnel_req = b'CONNECT ' + host_value + b' HTTP/1.1\r\n'
tunnel_req += b'Host: ' + host_value + b'\r\n'
host_value = to_bytes(host, encoding="ascii") + b":" + to_bytes(str(port))
tunnel_req = b"CONNECT " + host_value + b" HTTP/1.1\r\n"
tunnel_req += b"Host: " + host_value + b"\r\n"
if proxy_auth_header:
tunnel_req += b'Proxy-Authorization: ' + proxy_auth_header + b'\r\n'
tunnel_req += b'\r\n'
tunnel_req += b"Proxy-Authorization: " + proxy_auth_header + b"\r\n"
tunnel_req += b"\r\n"
return tunnel_req
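The doctest above covers the no-auth case; as a hedged complement, the proxy-auth branch produces an extra Proxy-Authorization line (the credential value is illustrative):

from scrapy.core.downloader.handlers.http11 import tunnel_request_data

raw = tunnel_request_data("example.com", 8090, proxy_auth_header=b"Basic dXNlcjpwYXNz")
assert raw == (
    b"CONNECT example.com:8090 HTTP/1.1\r\n"
    b"Host: example.com:8090\r\n"
    b"Proxy-Authorization: Basic dXNlcjpwYXNz\r\n"
    b"\r\n"
)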
@ -190,8 +220,15 @@ class TunnelingAgent(Agent):
proxy involved.
"""
def __init__(self, reactor, proxyConf, contextFactory=None,
connectTimeout=None, bindAddress=None, pool=None):
def __init__(
self,
reactor,
proxyConf,
contextFactory=None,
connectTimeout=None,
bindAddress=None,
pool=None,
):
super().__init__(reactor, contextFactory, connectTimeout, bindAddress, pool)
self._proxyConf = proxyConf
self._contextFactory = contextFactory
@ -207,7 +244,9 @@ class TunnelingAgent(Agent):
bindAddress=self._endpointFactory._bindAddress,
)
def _requestWithEndpoint(self, key, endpoint, method, parsedURI, headers, bodyProducer, requestPath):
def _requestWithEndpoint(
self, key, endpoint, method, parsedURI, headers, bodyProducer, requestPath
):
# proxy host and port are required for HTTP pool `key`
# otherwise, same remote host connection request could reuse
# a cached tunneled connection to a different proxy
@ -224,8 +263,9 @@ class TunnelingAgent(Agent):
class ScrapyProxyAgent(Agent):
def __init__(self, reactor, proxyURI, connectTimeout=None, bindAddress=None, pool=None):
def __init__(
self, reactor, proxyURI, connectTimeout=None, bindAddress=None, pool=None
):
super().__init__(
reactor=reactor,
connectTimeout=connectTimeout,
@ -257,8 +297,17 @@ class ScrapyAgent:
_ProxyAgent = ScrapyProxyAgent
_TunnelingAgent = TunnelingAgent
def __init__(self, contextFactory=None, connectTimeout=10, bindAddress=None, pool=None,
maxsize=0, warnsize=0, fail_on_dataloss=True, crawler=None):
def __init__(
self,
contextFactory=None,
connectTimeout=10,
bindAddress=None,
pool=None,
maxsize=0,
warnsize=0,
fail_on_dataloss=True,
crawler=None,
):
self._contextFactory = contextFactory
self._connectTimeout = connectTimeout
self._bindAddress = bindAddress
@ -271,14 +320,15 @@ class ScrapyAgent:
def _get_agent(self, request, timeout):
from twisted.internet import reactor
bindaddress = request.meta.get('bindaddress') or self._bindAddress
proxy = request.meta.get('proxy')
bindaddress = request.meta.get("bindaddress") or self._bindAddress
proxy = request.meta.get("proxy")
if proxy:
proxyScheme, proxyNetloc, proxyHost, proxyPort, proxyParams = _parse(proxy)
scheme = _parse(request.url)[0]
proxyHost = to_unicode(proxyHost)
if scheme == b'https':
proxyAuth = request.headers.get(b'Proxy-Authorization', None)
if scheme == b"https":
proxyAuth = request.headers.get(b"Proxy-Authorization", None)
proxyConf = (proxyHost, proxyPort, proxyAuth)
return self._TunnelingAgent(
reactor=reactor,
@ -288,11 +338,11 @@ class ScrapyAgent:
bindAddress=bindaddress,
pool=self._pool,
)
proxyScheme = proxyScheme or b'http'
proxyURI = urlunparse((proxyScheme, proxyNetloc, proxyParams, '', '', ''))
proxyScheme = proxyScheme or b"http"
proxyURI = urlunparse((proxyScheme, proxyNetloc, proxyParams, "", "", ""))
return self._ProxyAgent(
reactor=reactor,
proxyURI=to_bytes(proxyURI, encoding='ascii'),
proxyURI=to_bytes(proxyURI, encoding="ascii"),
connectTimeout=timeout,
bindAddress=bindaddress,
pool=self._pool,
@ -308,7 +358,8 @@ class ScrapyAgent:
def download_request(self, request):
from twisted.internet import reactor
timeout = request.meta.get('download_timeout') or self._connectTimeout
timeout = request.meta.get("download_timeout") or self._connectTimeout
agent = self._get_agent(request, timeout)
# request details
@ -316,13 +367,15 @@ class ScrapyAgent:
method = to_bytes(request.method)
headers = TxHeaders(request.headers)
if isinstance(agent, self._TunnelingAgent):
headers.removeHeader(b'Proxy-Authorization')
headers.removeHeader(b"Proxy-Authorization")
if request.body:
bodyproducer = _RequestBodyProducer(request.body)
else:
bodyproducer = None
start_time = time()
d = agent.request(method, to_bytes(url, encoding='ascii'), headers, bodyproducer)
d = agent.request(
method, to_bytes(url, encoding="ascii"), headers, bodyproducer
)
# set download latency
d.addCallback(self._cb_latency, request, start_time)
# response body is ready to be consumed
@ -345,14 +398,14 @@ class ScrapyAgent:
raise TimeoutError(f"Getting {url} took longer than {timeout} seconds.")
def _cb_latency(self, result, request, start_time):
request.meta['download_latency'] = time() - start_time
request.meta["download_latency"] = time() - start_time
return result
@staticmethod
def _headers_from_twisted_response(response):
headers = Headers()
if response.length != UNKNOWN_LENGTH:
headers[b'Content-Length'] = str(response.length).encode()
headers[b"Content-Length"] = str(response.length).encode()
headers.update(response.headers.getAllRawHeaders())
return headers
@ -366,8 +419,10 @@ class ScrapyAgent:
)
for handler, result in headers_received_result:
if isinstance(result, Failure) and isinstance(result.value, StopDownload):
logger.debug("Download stopped for %(request)s from signal handler %(handler)s",
{"request": request, "handler": handler.__qualname__})
logger.debug(
"Download stopped for %(request)s from signal handler %(handler)s",
{"request": request, "handler": handler.__qualname__},
)
txresponse._transport.stopProducing()
txresponse._transport.loseConnection()
return {
@ -389,15 +444,23 @@ class ScrapyAgent:
"ip_address": None,
}
maxsize = request.meta.get('download_maxsize', self._maxsize)
warnsize = request.meta.get('download_warnsize', self._warnsize)
maxsize = request.meta.get("download_maxsize", self._maxsize)
warnsize = request.meta.get("download_warnsize", self._warnsize)
expected_size = txresponse.length if txresponse.length != UNKNOWN_LENGTH else -1
fail_on_dataloss = request.meta.get('download_fail_on_dataloss', self._fail_on_dataloss)
fail_on_dataloss = request.meta.get(
"download_fail_on_dataloss", self._fail_on_dataloss
)
if maxsize and expected_size > maxsize:
warning_msg = ("Cancelling download of %(url)s: expected response "
"size (%(size)s) larger than download max size (%(maxsize)s).")
warning_args = {'url': request.url, 'size': expected_size, 'maxsize': maxsize}
warning_msg = (
"Cancelling download of %(url)s: expected response "
"size (%(size)s) larger than download max size (%(maxsize)s)."
)
warning_args = {
"url": request.url,
"size": expected_size,
"maxsize": maxsize,
}
logger.warning(warning_msg, warning_args)
@ -405,9 +468,11 @@ class ScrapyAgent:
raise defer.CancelledError(warning_msg % warning_args)
if warnsize and expected_size > warnsize:
logger.warning("Expected response size (%(size)s) larger than "
"download warn size (%(warnsize)s) in request %(request)s.",
{'size': expected_size, 'warnsize': warnsize, 'request': request})
logger.warning(
"Expected response size (%(size)s) larger than "
"download warn size (%(warnsize)s) in request %(request)s.",
{"size": expected_size, "warnsize": warnsize, "request": request},
)
def _cancel(_):
# Abort connection immediately.
@ -457,7 +522,6 @@ class ScrapyAgent:
@implementer(IBodyProducer)
class _RequestBodyProducer:
def __init__(self, body):
self.body = body
self.length = len(body)
@ -474,8 +538,16 @@ class _RequestBodyProducer:
class _ResponseReader(protocol.Protocol):
def __init__(self, finished, txresponse, request, maxsize, warnsize, fail_on_dataloss, crawler):
def __init__(
self,
finished,
txresponse,
request,
maxsize,
warnsize,
fail_on_dataloss,
crawler,
):
self._finished = finished
self._txresponse = txresponse
self._request = request
@ -491,22 +563,28 @@ class _ResponseReader(protocol.Protocol):
self._crawler = crawler
def _finish_response(self, flags=None, failure=None):
self._finished.callback({
"txresponse": self._txresponse,
"body": self._bodybuf.getvalue(),
"flags": flags,
"certificate": self._certificate,
"ip_address": self._ip_address,
"failure": failure,
})
self._finished.callback(
{
"txresponse": self._txresponse,
"body": self._bodybuf.getvalue(),
"flags": flags,
"certificate": self._certificate,
"ip_address": self._ip_address,
"failure": failure,
}
)
def connectionMade(self):
if self._certificate is None:
with suppress(AttributeError):
self._certificate = ssl.Certificate(self.transport._producer.getPeerCertificate())
self._certificate = ssl.Certificate(
self.transport._producer.getPeerCertificate()
)
if self._ip_address is None:
self._ip_address = ipaddress.ip_address(self.transport._producer.getPeer().host)
self._ip_address = ipaddress.ip_address(
self.transport._producer.getPeer().host
)
def dataReceived(self, bodyBytes):
# This may be called several times after cancel was called with buffered data.
@ -524,29 +602,40 @@ class _ResponseReader(protocol.Protocol):
)
for handler, result in bytes_received_result:
if isinstance(result, Failure) and isinstance(result.value, StopDownload):
logger.debug("Download stopped for %(request)s from signal handler %(handler)s",
{"request": self._request, "handler": handler.__qualname__})
logger.debug(
"Download stopped for %(request)s from signal handler %(handler)s",
{"request": self._request, "handler": handler.__qualname__},
)
self.transport.stopProducing()
self.transport.loseConnection()
failure = result if result.value.fail else None
self._finish_response(flags=["download_stopped"], failure=failure)
if self._maxsize and self._bytes_received > self._maxsize:
logger.warning("Received (%(bytes)s) bytes larger than download "
"max size (%(maxsize)s) in request %(request)s.",
{'bytes': self._bytes_received,
'maxsize': self._maxsize,
'request': self._request})
logger.warning(
"Received (%(bytes)s) bytes larger than download "
"max size (%(maxsize)s) in request %(request)s.",
{
"bytes": self._bytes_received,
"maxsize": self._maxsize,
"request": self._request,
},
)
# Clear buffer earlier to avoid keeping data in memory for a long time.
self._bodybuf.truncate(0)
self._finished.cancel()
if self._warnsize and self._bytes_received > self._warnsize and not self._reached_warnsize:
if (
self._warnsize
and self._bytes_received > self._warnsize
and not self._reached_warnsize
):
self._reached_warnsize = True
logger.warning("Received more bytes than download "
"warn size (%(warnsize)s) in request %(request)s.",
{'warnsize': self._warnsize,
'request': self._request})
logger.warning(
"Received more bytes than download "
"warn size (%(warnsize)s) in request %(request)s.",
{"warnsize": self._warnsize, "request": self._request},
)
def connectionLost(self, reason):
if self._finished.called:
@ -560,16 +649,20 @@ class _ResponseReader(protocol.Protocol):
self._finish_response(flags=["partial"])
return
if reason.check(ResponseFailed) and any(r.check(_DataLoss) for r in reason.value.reasons):
if reason.check(ResponseFailed) and any(
r.check(_DataLoss) for r in reason.value.reasons
):
if not self._fail_on_dataloss:
self._finish_response(flags=["dataloss"])
return
if not self._fail_on_dataloss_warned:
logger.warning("Got data loss in %s. If you want to process broken "
"responses set the setting DOWNLOAD_FAIL_ON_DATALOSS = False"
" -- This message won't be shown in further requests",
self._txresponse.request.absoluteURI.decode())
logger.warning(
"Got data loss in %s. If you want to process broken "
"responses set the setting DOWNLOAD_FAIL_ON_DATALOSS = False"
" -- This message won't be shown in further requests",
self._txresponse.request.absoluteURI.decode(),
)
self._fail_on_dataloss_warned = True
self._finished.errback(reason)
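The maxsize/warnsize/data-loss checks above honour per-request overrides through request.meta. A hedged sketch; the meta keys appear in the code above, the URL and limits are illustrative:

from scrapy import Request

req = Request(
    "http://example.com/large-download",
    meta={
        "download_maxsize": 10 * 1024 * 1024,  # cancel past this many bytes
        "download_warnsize": 1024 * 1024,      # only warn past this size
        "download_fail_on_dataloss": False,    # keep responses flagged "dataloss"
    },
)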

View File

@ -17,7 +17,9 @@ from scrapy.spiders import Spider
from scrapy.utils.python import to_bytes
H2DownloadHandlerOrSubclass = TypeVar("H2DownloadHandlerOrSubclass", bound="H2DownloadHandler")
H2DownloadHandlerOrSubclass = TypeVar(
"H2DownloadHandlerOrSubclass", bound="H2DownloadHandler"
)
class H2DownloadHandler:
@ -25,11 +27,14 @@ class H2DownloadHandler:
self._crawler = crawler
from twisted.internet import reactor
self._pool = H2ConnectionPool(reactor, settings)
self._context_factory = load_context_factory_from_settings(settings, crawler)
@classmethod
def from_crawler(cls: Type[H2DownloadHandlerOrSubclass], crawler: Crawler) -> H2DownloadHandlerOrSubclass:
def from_crawler(
cls: Type[H2DownloadHandlerOrSubclass], crawler: Crawler
) -> H2DownloadHandlerOrSubclass:
return cls(crawler.settings, crawler)
def download_request(self, request: Request, spider: Spider) -> Deferred:
@ -49,7 +54,8 @@ class ScrapyH2Agent:
_ProxyAgent = ScrapyProxyH2Agent
def __init__(
self, context_factory,
self,
context_factory,
pool: H2ConnectionPool,
connect_timeout: int = 10,
bind_address: Optional[bytes] = None,
@ -63,19 +69,22 @@ class ScrapyH2Agent:
def _get_agent(self, request: Request, timeout: Optional[float]) -> H2Agent:
from twisted.internet import reactor
bind_address = request.meta.get('bindaddress') or self._bind_address
proxy = request.meta.get('proxy')
bind_address = request.meta.get("bindaddress") or self._bind_address
proxy = request.meta.get("proxy")
if proxy:
_, _, proxy_host, proxy_port, proxy_params = _parse(proxy)
scheme = _parse(request.url)[0]
if scheme == b'https':
if scheme == b"https":
# ToDo
raise NotImplementedError('Tunneling via CONNECT method using HTTP/2.0 is not yet supported')
raise NotImplementedError(
"Tunneling via CONNECT method using HTTP/2.0 is not yet supported"
)
return self._ProxyAgent(
reactor=reactor,
context_factory=self._context_factory,
proxy_uri=URI.fromBytes(to_bytes(proxy, encoding='ascii')),
proxy_uri=URI.fromBytes(to_bytes(proxy, encoding="ascii")),
connect_timeout=timeout,
bind_address=bind_address,
pool=self._pool,
@ -91,7 +100,8 @@ class ScrapyH2Agent:
def download_request(self, request: Request, spider: Spider) -> Deferred:
from twisted.internet import reactor
timeout = request.meta.get('download_timeout') or self._connect_timeout
timeout = request.meta.get("download_timeout") or self._connect_timeout
agent = self._get_agent(request, timeout)
start_time = time()
@ -103,12 +113,16 @@ class ScrapyH2Agent:
return d
@staticmethod
def _cb_latency(response: Response, request: Request, start_time: float) -> Response:
request.meta['download_latency'] = time() - start_time
def _cb_latency(
response: Response, request: Request, start_time: float
) -> Response:
request.meta["download_latency"] = time() - start_time
return response
@staticmethod
def _cb_timeout(response: Response, request: Request, timeout: float, timeout_cl: DelayedCall) -> Response:
def _cb_timeout(
response: Response, request: Request, timeout: float, timeout_cl: DelayedCall
) -> Response:
if timeout_cl.active():
timeout_cl.cancel()
return response

View File

@ -6,40 +6,49 @@ from scrapy.utils.misc import create_instance
class S3DownloadHandler:
def __init__(self, settings, *,
crawler=None,
aws_access_key_id=None, aws_secret_access_key=None,
aws_session_token=None,
httpdownloadhandler=HTTPDownloadHandler, **kw):
def __init__(
self,
settings,
*,
crawler=None,
aws_access_key_id=None,
aws_secret_access_key=None,
aws_session_token=None,
httpdownloadhandler=HTTPDownloadHandler,
**kw,
):
if not is_botocore_available():
raise NotConfigured('missing botocore library')
raise NotConfigured("missing botocore library")
if not aws_access_key_id:
aws_access_key_id = settings['AWS_ACCESS_KEY_ID']
aws_access_key_id = settings["AWS_ACCESS_KEY_ID"]
if not aws_secret_access_key:
aws_secret_access_key = settings['AWS_SECRET_ACCESS_KEY']
aws_secret_access_key = settings["AWS_SECRET_ACCESS_KEY"]
if not aws_session_token:
aws_session_token = settings['AWS_SESSION_TOKEN']
aws_session_token = settings["AWS_SESSION_TOKEN"]
# If no credentials could be found anywhere,
# consider this an anonymous connection request by default;
# unless 'anon' was set explicitly (True/False).
anon = kw.get('anon')
anon = kw.get("anon")
if anon is None and not aws_access_key_id and not aws_secret_access_key:
kw['anon'] = True
self.anon = kw.get('anon')
kw["anon"] = True
self.anon = kw.get("anon")
self._signer = None
import botocore.auth
import botocore.credentials
kw.pop('anon', None)
kw.pop("anon", None)
if kw:
raise TypeError(f'Unexpected keyword arguments: {kw}')
raise TypeError(f"Unexpected keyword arguments: {kw}")
if not self.anon:
SignerCls = botocore.auth.AUTH_TYPE_MAPS['s3']
self._signer = SignerCls(botocore.credentials.Credentials(
aws_access_key_id, aws_secret_access_key, aws_session_token))
SignerCls = botocore.auth.AUTH_TYPE_MAPS["s3"]
self._signer = SignerCls(
botocore.credentials.Credentials(
aws_access_key_id, aws_secret_access_key, aws_session_token
)
)
_http_handler = create_instance(
objcls=httpdownloadhandler,
@ -54,20 +63,21 @@ class S3DownloadHandler:
def download_request(self, request, spider):
p = urlparse_cached(request)
scheme = 'https' if request.meta.get('is_secure') else 'http'
scheme = "https" if request.meta.get("is_secure") else "http"
bucket = p.hostname
path = p.path + '?' + p.query if p.query else p.path
url = f'{scheme}://{bucket}.s3.amazonaws.com{path}'
path = p.path + "?" + p.query if p.query else p.path
url = f"{scheme}://{bucket}.s3.amazonaws.com{path}"
if self.anon:
request = request.replace(url=url)
else:
import botocore.awsrequest
awsrequest = botocore.awsrequest.AWSRequest(
method=request.method,
url=f'{scheme}://s3.amazonaws.com/{bucket}{path}',
url=f"{scheme}://s3.amazonaws.com/{bucket}{path}",
headers=request.headers.to_unicode_dict(),
data=request.body)
data=request.body,
)
self._signer.add_auth(awsrequest)
request = request.replace(
url=url, headers=awsrequest.headers.items())
request = request.replace(url=url, headers=awsrequest.headers.items())
return self._download_http(request, spider)
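A hedged, standalone restatement of the URL rewrite performed in download_request() above; the bucket and key names are illustrative:

from urllib.parse import urlparse


def s3_to_https(url, secure=True):
    # Bucket taken from the netloc, query string preserved,
    # rewritten to the virtual-hosted-style S3 endpoint.
    p = urlparse(url)
    scheme = "https" if secure else "http"
    path = p.path + "?" + p.query if p.query else p.path
    return f"{scheme}://{p.hostname}.s3.amazonaws.com{path}"


print(s3_to_https("s3://my-bucket/data/file.json"))
# https://my-bucket.s3.amazonaws.com/data/file.json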

View File

@ -18,28 +18,31 @@ from scrapy.utils.conf import build_component_list
class DownloaderMiddlewareManager(MiddlewareManager):
component_name = 'downloader middleware'
component_name = "downloader middleware"
@classmethod
def _get_mwlist_from_settings(cls, settings):
return build_component_list(
settings.getwithbase('DOWNLOADER_MIDDLEWARES'))
return build_component_list(settings.getwithbase("DOWNLOADER_MIDDLEWARES"))
def _add_middleware(self, mw):
if hasattr(mw, 'process_request'):
self.methods['process_request'].append(mw.process_request)
if hasattr(mw, 'process_response'):
self.methods['process_response'].appendleft(mw.process_response)
if hasattr(mw, 'process_exception'):
self.methods['process_exception'].appendleft(mw.process_exception)
if hasattr(mw, "process_request"):
self.methods["process_request"].append(mw.process_request)
if hasattr(mw, "process_response"):
self.methods["process_response"].appendleft(mw.process_response)
if hasattr(mw, "process_exception"):
self.methods["process_exception"].appendleft(mw.process_exception)
def download(self, download_func: Callable, request: Request, spider: Spider):
@defer.inlineCallbacks
def process_request(request: Request):
for method in self.methods['process_request']:
for method in self.methods["process_request"]:
method = cast(Callable, method)
response = yield deferred_from_coro(method(request=request, spider=spider))
if response is not None and not isinstance(response, (Response, Request)):
response = yield deferred_from_coro(
method(request=request, spider=spider)
)
if response is not None and not isinstance(
response, (Response, Request)
):
raise _InvalidOutput(
f"Middleware {method.__qualname__} must return None, Response or "
f"Request, got {response.__class__.__name__}"
@ -55,9 +58,11 @@ class DownloaderMiddlewareManager(MiddlewareManager):
elif isinstance(response, Request):
return response
for method in self.methods['process_response']:
for method in self.methods["process_response"]:
method = cast(Callable, method)
response = yield deferred_from_coro(method(request=request, response=response, spider=spider))
response = yield deferred_from_coro(
method(request=request, response=response, spider=spider)
)
if not isinstance(response, (Response, Request)):
raise _InvalidOutput(
f"Middleware {method.__qualname__} must return Response or Request, "
@ -70,10 +75,14 @@ class DownloaderMiddlewareManager(MiddlewareManager):
@defer.inlineCallbacks
def process_exception(failure: Failure):
exception = failure.value
for method in self.methods['process_exception']:
for method in self.methods["process_exception"]:
method = cast(Callable, method)
response = yield deferred_from_coro(method(request=request, exception=exception, spider=spider))
if response is not None and not isinstance(response, (Response, Request)):
response = yield deferred_from_coro(
method(request=request, exception=exception, spider=spider)
)
if response is not None and not isinstance(
response, (Response, Request)
):
raise _InvalidOutput(
f"Middleware {method.__qualname__} must return None, Response or "
f"Request, got {type(response)}"

View File

@ -2,7 +2,11 @@ import logging
from OpenSSL import SSL
from service_identity.exceptions import CertificateError
from twisted.internet._sslverify import ClientTLSOptions, verifyHostname, VerificationError
from twisted.internet._sslverify import (
ClientTLSOptions,
verifyHostname,
VerificationError,
)
from twisted.internet.ssl import AcceptableCiphers
from scrapy.utils.ssl import x509name_to_string, get_temp_key_info
@ -10,17 +14,17 @@ from scrapy.utils.ssl import x509name_to_string, get_temp_key_info
logger = logging.getLogger(__name__)
METHOD_TLS = 'TLS'
METHOD_TLSv10 = 'TLSv1.0'
METHOD_TLSv11 = 'TLSv1.1'
METHOD_TLSv12 = 'TLSv1.2'
METHOD_TLS = "TLS"
METHOD_TLSv10 = "TLSv1.0"
METHOD_TLSv11 = "TLSv1.1"
METHOD_TLSv12 = "TLSv1.2"
openssl_methods = {
METHOD_TLS: SSL.SSLv23_METHOD, # protocol negotiation (recommended)
METHOD_TLSv10: SSL.TLSv1_METHOD, # TLS 1.0 only
METHOD_TLSv11: getattr(SSL, 'TLSv1_1_METHOD', 5), # TLS 1.1 only
METHOD_TLSv12: getattr(SSL, 'TLSv1_2_METHOD', 6), # TLS 1.2 only
METHOD_TLS: SSL.SSLv23_METHOD, # protocol negotiation (recommended)
METHOD_TLSv10: SSL.TLSv1_METHOD, # TLS 1.0 only
METHOD_TLSv11: getattr(SSL, "TLSv1_1_METHOD", 5), # TLS 1.1 only
METHOD_TLSv12: getattr(SSL, "TLSv1_2_METHOD", 6), # TLS 1.2 only
}
@ -44,32 +48,38 @@ class ScrapyClientTLSOptions(ClientTLSOptions):
connection.set_tlsext_host_name(self._hostnameBytes)
elif where & SSL.SSL_CB_HANDSHAKE_DONE:
if self.verbose_logging:
logger.debug('SSL connection to %s using protocol %s, cipher %s',
self._hostnameASCII,
connection.get_protocol_version_name(),
connection.get_cipher_name(),
)
logger.debug(
"SSL connection to %s using protocol %s, cipher %s",
self._hostnameASCII,
connection.get_protocol_version_name(),
connection.get_cipher_name(),
)
server_cert = connection.get_peer_certificate()
logger.debug('SSL connection certificate: issuer "%s", subject "%s"',
x509name_to_string(server_cert.get_issuer()),
x509name_to_string(server_cert.get_subject()),
)
logger.debug(
'SSL connection certificate: issuer "%s", subject "%s"',
x509name_to_string(server_cert.get_issuer()),
x509name_to_string(server_cert.get_subject()),
)
key_info = get_temp_key_info(connection._ssl)
if key_info:
logger.debug('SSL temp key: %s', key_info)
logger.debug("SSL temp key: %s", key_info)
try:
verifyHostname(connection, self._hostnameASCII)
except (CertificateError, VerificationError) as e:
logger.warning(
'Remote certificate is not valid for hostname "%s"; %s',
self._hostnameASCII, e)
self._hostnameASCII,
e,
)
except ValueError as e:
logger.warning(
'Ignoring error while verifying certificate '
"Ignoring error while verifying certificate "
'from host "%s" (exception: %r)',
self._hostnameASCII, e)
self._hostnameASCII,
e,
)
DEFAULT_CIPHERS = AcceptableCiphers.fromOpenSSLCipherString('DEFAULT')
DEFAULT_CIPHERS = AcceptableCiphers.fromOpenSSLCipherString("DEFAULT")
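The openssl_methods keys above are the accepted values of the DOWNLOADER_CLIENT_TLS_METHOD setting, alongside the cipher and verbose-logging settings read by the context factory earlier in this diff. A hedged settings snippet; the values are illustrative:

custom_settings = {
    "DOWNLOADER_CLIENT_TLS_METHOD": "TLSv1.2",      # one of the keys above
    "DOWNLOADER_CLIENT_TLS_CIPHERS": "DEFAULT",
    "DOWNLOADER_CLIENT_TLS_VERBOSE_LOGGING": True,  # log cipher/cert details
}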

View File

@ -15,33 +15,33 @@ from scrapy.responsetypes import responsetypes
def _parsed_url_args(parsed):
# Assume parsed is urlparse-d from Request.url,
# which was passed via safe_url_string and is ascii-only.
path = urlunparse(('', '', parsed.path or '/', parsed.params, parsed.query, ''))
path = urlunparse(("", "", parsed.path or "/", parsed.params, parsed.query, ""))
path = to_bytes(path, encoding="ascii")
host = to_bytes(parsed.hostname, encoding="ascii")
port = parsed.port
scheme = to_bytes(parsed.scheme, encoding="ascii")
netloc = to_bytes(parsed.netloc, encoding="ascii")
if port is None:
port = 443 if scheme == b'https' else 80
port = 443 if scheme == b"https" else 80
return scheme, netloc, host, port, path
def _parse(url):
""" Return tuple of (scheme, netloc, host, port, path),
"""Return tuple of (scheme, netloc, host, port, path),
all in bytes except for port which is int.
Assume url is from Request.url, which was passed via safe_url_string
and is ascii-only.
"""
url = url.strip()
if not re.match(r'^\w+://', url):
url = '//' + url
if not re.match(r"^\w+://", url):
url = "//" + url
parsed = urlparse(url)
return _parsed_url_args(parsed)
class ScrapyHTTPPageGetter(HTTPClient):
delimiter = b'\n'
delimiter = b"\n"
def connectionMade(self):
self.headers = Headers() # bucket for response headers
@ -75,8 +75,8 @@ class ScrapyHTTPPageGetter(HTTPClient):
self.factory.noPage(reason)
def handleResponse(self, response):
if self.factory.method.upper() == b'HEAD':
self.factory.page(b'')
if self.factory.method.upper() == b"HEAD":
self.factory.page(b"")
elif self.length is not None and self.length > 0:
self.factory.noPage(self._connection_lost_reason)
else:
@ -87,12 +87,15 @@ class ScrapyHTTPPageGetter(HTTPClient):
self.transport.loseConnection()
# transport cleanup needed for HTTPS connections
if self.factory.url.startswith(b'https'):
if self.factory.url.startswith(b"https"):
self.transport.stopProducing()
self.factory.noPage(
defer.TimeoutError(f"Getting {self.factory.url} took longer "
f"than {self.factory.timeout} seconds."))
defer.TimeoutError(
f"Getting {self.factory.url} took longer "
f"than {self.factory.timeout} seconds."
)
)
# This class used to inherit from Twisted's
@ -109,16 +112,24 @@ class ScrapyHTTPClientFactory(ClientFactory):
afterFoundGet = False
def _build_response(self, body, request):
request.meta['download_latency'] = self.headers_time - self.start_time
request.meta["download_latency"] = self.headers_time - self.start_time
status = int(self.status)
headers = Headers(self.response_headers)
respcls = responsetypes.from_args(headers=headers, url=self._url, body=body)
return respcls(url=self._url, status=status, headers=headers, body=body, protocol=to_unicode(self.version))
return respcls(
url=self._url,
status=status,
headers=headers,
body=body,
protocol=to_unicode(self.version),
)
def _set_connection_attributes(self, request):
parsed = urlparse_cached(request)
self.scheme, self.netloc, self.host, self.port, self.path = _parsed_url_args(parsed)
proxy = request.meta.get('proxy')
self.scheme, self.netloc, self.host, self.port, self.path = _parsed_url_args(
parsed
)
proxy = request.meta.get("proxy")
if proxy:
self.scheme, _, self.host, self.port, _ = _parse(proxy)
self.path = self.url
@ -126,12 +137,12 @@ class ScrapyHTTPClientFactory(ClientFactory):
def __init__(self, request, timeout=180):
self._url = urldefrag(request.url)[0]
# converting to bytes to comply to Twisted interface
self.url = to_bytes(self._url, encoding='ascii')
self.method = to_bytes(request.method, encoding='ascii')
self.url = to_bytes(self._url, encoding="ascii")
self.method = to_bytes(request.method, encoding="ascii")
self.body = request.body or None
self.headers = Headers(request.headers)
self.response_headers = None
self.timeout = request.meta.get('download_timeout') or timeout
self.timeout = request.meta.get("download_timeout") or timeout
self.start_time = time()
self.deferred = defer.Deferred().addCallback(self._build_response, request)
@ -146,16 +157,16 @@ class ScrapyHTTPClientFactory(ClientFactory):
self._set_connection_attributes(request)
# set Host header based on url
self.headers.setdefault('Host', self.netloc)
self.headers.setdefault("Host", self.netloc)
# set Content-Length based len of body
if self.body is not None:
self.headers['Content-Length'] = len(self.body)
self.headers["Content-Length"] = len(self.body)
# just in case a broken http/1.1 decides to keep connection alive
self.headers.setdefault("Connection", "close")
# Content-Length must be specified in POST method even with no body
elif self.method == b'POST':
self.headers['Content-Length'] = 0
elif self.method == b"POST":
self.headers["Content-Length"] = 0
def __repr__(self):
return f"<{self.__class__.__name__}: {self.url}>"
@ -171,6 +182,7 @@ class ScrapyHTTPClientFactory(ClientFactory):
p.afterFoundGet = self.afterFoundGet
if self.timeout:
from twisted.internet import reactor
timeoutCall = reactor.callLater(self.timeout, p.timeout)
self.deferred.addBoth(self._cancelTimeout, timeoutCall)
return p
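The timeout wiring above (reactor.callLater plus _cancelTimeout on the deferred) is a general Twisted pattern: schedule a delayed call and cancel it if the deferred fires first. A standalone sketch, with illustrative names that are not part of Scrapy:

from twisted.internet import reactor


def add_timeout(deferred, seconds, on_timeout):
    # Fire on_timeout after `seconds` unless the deferred completes first.
    delayed_call = reactor.callLater(seconds, on_timeout)

    def _cancel_pending(result):
        if delayed_call.active():
            delayed_call.cancel()
        return result

    return deferred.addBoth(_cancel_pending)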


@ -79,13 +79,14 @@ class ExecutionEngine:
self.running = False
self.paused = False
self.scheduler_cls = self._get_scheduler_class(crawler.settings)
downloader_cls = load_object(self.settings['DOWNLOADER'])
downloader_cls = load_object(self.settings["DOWNLOADER"])
self.downloader = downloader_cls(crawler)
self.scraper = Scraper(crawler)
self._spider_closed_callback = spider_closed_callback
def _get_scheduler_class(self, settings: BaseSettings) -> type:
from scrapy.core.scheduler import BaseScheduler
scheduler_cls = load_object(settings["SCHEDULER"])
if not issubclass(scheduler_cls, BaseScheduler):
raise TypeError(
@ -106,6 +107,7 @@ class ExecutionEngine:
def stop(self) -> Deferred:
"""Gracefully stop the execution engine"""
@inlineCallbacks
def _finish_stopping_engine(_) -> Deferred:
yield self.signals.send_catch_log_deferred(signal=signals.engine_stopped)
@ -115,7 +117,11 @@ class ExecutionEngine:
raise RuntimeError("Engine not running")
self.running = False
dfd = self.close_spider(self.spider, reason="shutdown") if self.spider is not None else succeed(None)
dfd = (
self.close_spider(self.spider, reason="shutdown")
if self.spider is not None
else succeed(None)
)
return dfd.addBoth(_finish_stopping_engine)
def close(self) -> Deferred:
@ -126,7 +132,9 @@ class ExecutionEngine:
if self.running:
return self.stop() # will also close spider and downloader
if self.spider is not None:
return self.close_spider(self.spider, reason="shutdown") # will also close downloader
return self.close_spider(
self.spider, reason="shutdown"
) # will also close downloader
return succeed(self.downloader.close())
def pause(self) -> None:
@ -144,7 +152,10 @@ class ExecutionEngine:
if self.paused:
return None
while not self._needs_backout() and self._next_request_from_scheduler() is not None:
while (
not self._needs_backout()
and self._next_request_from_scheduler() is not None
):
pass
if self.slot.start_requests is not None and not self._needs_backout():
@ -154,7 +165,11 @@ class ExecutionEngine:
self.slot.start_requests = None
except Exception:
self.slot.start_requests = None
logger.error('Error while obtaining start requests', exc_info=True, extra={'spider': self.spider})
logger.error(
"Error while obtaining start requests",
exc_info=True,
extra={"spider": self.spider},
)
else:
self.crawl(request)
@ -179,18 +194,30 @@ class ExecutionEngine:
d = self._download(request, self.spider)
d.addBoth(self._handle_downloader_output, request)
d.addErrback(lambda f: logger.info('Error while handling downloader output',
exc_info=failure_to_exc_info(f),
extra={'spider': self.spider}))
d.addErrback(
lambda f: logger.info(
"Error while handling downloader output",
exc_info=failure_to_exc_info(f),
extra={"spider": self.spider},
)
)
d.addBoth(lambda _: self.slot.remove_request(request))
d.addErrback(lambda f: logger.info('Error while removing request from slot',
exc_info=failure_to_exc_info(f),
extra={'spider': self.spider}))
d.addErrback(
lambda f: logger.info(
"Error while removing request from slot",
exc_info=failure_to_exc_info(f),
extra={"spider": self.spider},
)
)
slot = self.slot
d.addBoth(lambda _: slot.nextcall.schedule())
d.addErrback(lambda f: logger.info('Error while scheduling new request',
exc_info=failure_to_exc_info(f),
extra={'spider': self.spider}))
d.addErrback(
lambda f: logger.info(
"Error while scheduling new request",
exc_info=failure_to_exc_info(f),
extra={"spider": self.spider},
)
)
return d
def _handle_downloader_output(
@ -199,7 +226,9 @@ class ExecutionEngine:
assert self.spider is not None # typing
if not isinstance(result, (Request, Response, Failure)):
raise TypeError(f"Incorrect type: expected Request, Response or Failure, got {type(result)}: {result!r}")
raise TypeError(
f"Incorrect type: expected Request, Response or Failure, got {type(result)}: {result!r}"
)
# downloader middleware can return requests (for example, redirects)
if isinstance(result, Request):
@ -211,7 +240,7 @@ class ExecutionEngine:
lambda f: logger.error(
"Error while enqueuing downloader output",
exc_info=failure_to_exc_info(f),
extra={'spider': self.spider},
extra={"spider": self.spider},
)
)
return d
@ -244,16 +273,22 @@ class ExecutionEngine:
stacklevel=2,
)
if spider is not self.spider:
raise RuntimeError(f"The spider {spider.name!r} does not match the open spider")
raise RuntimeError(
f"The spider {spider.name!r} does not match the open spider"
)
if self.spider is None:
raise RuntimeError(f"No open spider to crawl: {request}")
self._schedule_request(request, self.spider)
self.slot.nextcall.schedule() # type: ignore[union-attr]
def _schedule_request(self, request: Request, spider: Spider) -> None:
self.signals.send_catch_log(signals.request_scheduled, request=request, spider=spider)
self.signals.send_catch_log(
signals.request_scheduled, request=request, spider=spider
)
if not self.slot.scheduler.enqueue_request(request): # type: ignore[union-attr]
self.signals.send_catch_log(signals.request_dropped, request=request, spider=spider)
self.signals.send_catch_log(
signals.request_dropped, request=request, spider=spider
)
def download(self, request: Request, spider: Optional[Spider] = None) -> Deferred:
"""Return a Deferred which fires with a Response as result, only downloader middlewares are applied"""
@ -264,10 +299,14 @@ class ExecutionEngine:
stacklevel=2,
)
if spider is not self.spider:
logger.warning("The spider '%s' does not match the open spider", spider.name)
logger.warning(
"The spider '%s' does not match the open spider", spider.name
)
if self.spider is None:
raise RuntimeError(f"No open spider to crawl: {request}")
return self._download(request, spider).addBoth(self._downloaded, request, spider)
return self._download(request, spider).addBoth(
self._downloaded, request, spider
)
def _downloaded(
self, result: Union[Response, Request], request: Request, spider: Spider
@ -286,7 +325,9 @@ class ExecutionEngine:
def _on_success(result: Union[Response, Request]) -> Union[Response, Request]:
if not isinstance(result, (Response, Request)):
raise TypeError(f"Incorrect type: expected Response or Request, got {type(result)}: {result!r}")
raise TypeError(
f"Incorrect type: expected Response or Request, got {type(result)}: {result!r}"
)
if isinstance(result, Response):
if result.request is None:
result.request = request
@ -311,13 +352,19 @@ class ExecutionEngine:
return dwld
@inlineCallbacks
def open_spider(self, spider: Spider, start_requests: Iterable = (), close_if_idle: bool = True):
def open_spider(
self, spider: Spider, start_requests: Iterable = (), close_if_idle: bool = True
):
if self.slot is not None:
raise RuntimeError(f"No free spider slot when opening {spider.name!r}")
logger.info("Spider opened", extra={'spider': spider})
logger.info("Spider opened", extra={"spider": spider})
nextcall = CallLaterOnce(self._next_request)
scheduler = create_instance(self.scheduler_cls, settings=None, crawler=self.crawler)
start_requests = yield self.scraper.spidermw.process_start_requests(start_requests, spider)
scheduler = create_instance(
self.scheduler_cls, settings=None, crawler=self.crawler
)
start_requests = yield self.scraper.spidermw.process_start_requests(
start_requests, spider
)
self.slot = Slot(start_requests, close_if_idle, nextcall, scheduler)
self.spider = spider
if hasattr(scheduler, "open"):
@ -337,7 +384,9 @@ class ExecutionEngine:
"""
assert self.spider is not None # typing
expected_ex = (DontCloseSpider, CloseSpider)
res = self.signals.send_catch_log(signals.spider_idle, spider=self.spider, dont_log=expected_ex)
res = self.signals.send_catch_log(
signals.spider_idle, spider=self.spider, dont_log=expected_ex
)
detected_ex = {
ex: x.value
for _, x in res
@ -347,7 +396,7 @@ class ExecutionEngine:
if DontCloseSpider in detected_ex:
return None
if self.spider_is_idle():
ex = detected_ex.get(CloseSpider, CloseSpider(reason='finished'))
ex = detected_ex.get(CloseSpider, CloseSpider(reason="finished"))
assert isinstance(ex, CloseSpider) # typing
self.close_spider(self.spider, reason=ex.reason)
@ -359,40 +408,55 @@ class ExecutionEngine:
if self.slot.closing is not None:
return self.slot.closing
logger.info("Closing spider (%(reason)s)", {'reason': reason}, extra={'spider': spider})
logger.info(
"Closing spider (%(reason)s)", {"reason": reason}, extra={"spider": spider}
)
dfd = self.slot.close()
def log_failure(msg: str) -> Callable:
def errback(failure: Failure) -> None:
logger.error(msg, exc_info=failure_to_exc_info(failure), extra={'spider': spider})
logger.error(
msg, exc_info=failure_to_exc_info(failure), extra={"spider": spider}
)
return errback
dfd.addBoth(lambda _: self.downloader.close())
dfd.addErrback(log_failure('Downloader close failure'))
dfd.addErrback(log_failure("Downloader close failure"))
dfd.addBoth(lambda _: self.scraper.close_spider(spider))
dfd.addErrback(log_failure('Scraper close failure'))
dfd.addErrback(log_failure("Scraper close failure"))
if hasattr(self.slot.scheduler, "close"):
dfd.addBoth(lambda _: self.slot.scheduler.close(reason))
dfd.addErrback(log_failure("Scheduler close failure"))
dfd.addBoth(lambda _: self.signals.send_catch_log_deferred(
signal=signals.spider_closed, spider=spider, reason=reason,
))
dfd.addErrback(log_failure('Error while sending spider_close signal'))
dfd.addBoth(
lambda _: self.signals.send_catch_log_deferred(
signal=signals.spider_closed,
spider=spider,
reason=reason,
)
)
dfd.addErrback(log_failure("Error while sending spider_close signal"))
dfd.addBoth(lambda _: self.crawler.stats.close_spider(spider, reason=reason))
dfd.addErrback(log_failure('Stats close failure'))
dfd.addErrback(log_failure("Stats close failure"))
dfd.addBoth(lambda _: logger.info("Spider closed (%(reason)s)", {'reason': reason}, extra={'spider': spider}))
dfd.addBoth(
lambda _: logger.info(
"Spider closed (%(reason)s)",
{"reason": reason},
extra={"spider": spider},
)
)
dfd.addBoth(lambda _: setattr(self, 'slot', None))
dfd.addErrback(log_failure('Error while unassigning slot'))
dfd.addBoth(lambda _: setattr(self, "slot", None))
dfd.addErrback(log_failure("Error while unassigning slot"))
dfd.addBoth(lambda _: setattr(self, 'spider', None))
dfd.addErrback(log_failure('Error while unassigning spider'))
dfd.addBoth(lambda _: setattr(self, "spider", None))
dfd.addErrback(log_failure("Error while unassigning spider"))
dfd.addBoth(lambda _: self._spider_closed_callback(spider))
@ -408,7 +472,11 @@ class ExecutionEngine:
return [self.spider] if self.spider is not None else []
def has_capacity(self) -> bool:
warnings.warn("ExecutionEngine.has_capacity is deprecated", ScrapyDeprecationWarning, stacklevel=2)
warnings.warn(
"ExecutionEngine.has_capacity is deprecated",
ScrapyDeprecationWarning,
stacklevel=2,
)
return not bool(self.slot)
def schedule(self, request: Request, spider: Spider) -> None:


@ -28,7 +28,9 @@ class H2ConnectionPool:
# Save all requests that arrive before the connection is established
self._pending_requests: Dict[Tuple, Deque[Deferred]] = {}
def get_connection(self, key: Tuple, uri: URI, endpoint: HostnameEndpoint) -> Deferred:
def get_connection(
self, key: Tuple, uri: URI, endpoint: HostnameEndpoint
) -> Deferred:
if key in self._pending_requests:
# Received a request while connecting to remote
# Create a deferred which will fire with the H2ClientProtocol
@ -46,7 +48,9 @@ class H2ConnectionPool:
# No connection is established for the given URI
return self._new_connection(key, uri, endpoint)
def _new_connection(self, key: Tuple, uri: URI, endpoint: HostnameEndpoint) -> Deferred:
def _new_connection(
self, key: Tuple, uri: URI, endpoint: HostnameEndpoint
) -> Deferred:
self._pending_requests[key] = deque()
conn_lost_deferred = Deferred()
@ -102,7 +106,9 @@ class H2Agent:
) -> None:
self._reactor = reactor
self._pool = pool
self._context_factory = AcceptableProtocolsContextFactory(context_factory, acceptable_protocols=[b'h2'])
self._context_factory = AcceptableProtocolsContextFactory(
context_factory, acceptable_protocols=[b"h2"]
)
self.endpoint_factory = _StandardEndpointFactory(
self._reactor, self._context_factory, connect_timeout, bind_address
)
@ -118,7 +124,7 @@ class H2Agent:
return uri.scheme, uri.host, uri.port
def request(self, request: Request, spider: Spider) -> Deferred:
uri = URI.fromBytes(bytes(request.url, encoding='utf-8'))
uri = URI.fromBytes(bytes(request.url, encoding="utf-8"))
try:
endpoint = self.get_endpoint(uri)
except SchemeNotSupported:

View File

@ -9,9 +9,15 @@ from h2.config import H2Configuration
from h2.connection import H2Connection
from h2.errors import ErrorCodes
from h2.events import (
Event, ConnectionTerminated, DataReceived, ResponseReceived,
SettingsAcknowledged, StreamEnded, StreamReset, UnknownFrameReceived,
WindowUpdated
Event,
ConnectionTerminated,
DataReceived,
ResponseReceived,
SettingsAcknowledged,
StreamEnded,
StreamReset,
UnknownFrameReceived,
WindowUpdated,
)
from h2.exceptions import FrameTooLargeError, H2Error
from twisted.internet.defer import Deferred
@ -37,7 +43,6 @@ PROTOCOL_NAME = b"h2"
class InvalidNegotiatedProtocol(H2Error):
def __init__(self, negotiated_protocol: bytes) -> None:
self.negotiated_protocol = negotiated_protocol
@ -55,11 +60,13 @@ class RemoteTerminatedConnection(H2Error):
self.terminate_event = event
def __str__(self) -> str:
return f'Received GOAWAY frame from {self.remote_ip_address!r}'
return f"Received GOAWAY frame from {self.remote_ip_address!r}"
class MethodNotAllowed405(H2Error):
def __init__(self, remote_ip_address: Optional[Union[IPv4Address, IPv6Address]]) -> None:
def __init__(
self, remote_ip_address: Optional[Union[IPv4Address, IPv6Address]]
) -> None:
self.remote_ip_address = remote_ip_address
def __str__(self) -> str:
@ -70,7 +77,9 @@ class MethodNotAllowed405(H2Error):
class H2ClientProtocol(Protocol, TimeoutMixin):
IDLE_TIMEOUT = 240
def __init__(self, uri: URI, settings: Settings, conn_lost_deferred: Deferred) -> None:
def __init__(
self, uri: URI, settings: Settings, conn_lost_deferred: Deferred
) -> None:
"""
Arguments:
uri -- URI of the base url to which HTTP/2 Connection will be made.
@ -82,7 +91,7 @@ class H2ClientProtocol(Protocol, TimeoutMixin):
"""
self._conn_lost_deferred = conn_lost_deferred
config = H2Configuration(client_side=True, header_encoding='utf-8')
config = H2Configuration(client_side=True, header_encoding="utf-8")
self.conn = H2Connection(config=config)
# ID of the next request stream
@ -105,31 +114,25 @@ class H2ClientProtocol(Protocol, TimeoutMixin):
# initialized when connection is successfully made
self.metadata: Dict = {
# Peer certificate instance
'certificate': None,
"certificate": None,
# Address of the server we are connected to which
# is updated when HTTP/2 connection is made successfully
'ip_address': None,
"ip_address": None,
# URI of the peer HTTP/2 connection is made
'uri': uri,
"uri": uri,
# Both ip_address and uri are used by the Stream before
# initiating the request to verify that the base address
# Variables taken from Project Settings
'default_download_maxsize': settings.getint('DOWNLOAD_MAXSIZE'),
'default_download_warnsize': settings.getint('DOWNLOAD_WARNSIZE'),
"default_download_maxsize": settings.getint("DOWNLOAD_MAXSIZE"),
"default_download_warnsize": settings.getint("DOWNLOAD_WARNSIZE"),
# Counter to keep track of opened streams. This counter
# is used to make sure that not more than MAX_CONCURRENT_STREAMS
# streams are opened which leads to ProtocolError
# We use simple FIFO policy to handle pending requests
'active_streams': 0,
"active_streams": 0,
# Flag to keep track if settings were acknowledged by the remote
# This ensures that we have established a HTTP/2 connection
'settings_acknowledged': False,
"settings_acknowledged": False,
}
@property
@ -138,7 +141,7 @@ class H2ClientProtocol(Protocol, TimeoutMixin):
This is used while initiating pending streams to make sure
that we initiate stream only during active HTTP/2 Connection
"""
return bool(self.transport.connected) and self.metadata['settings_acknowledged']
return bool(self.transport.connected) and self.metadata["settings_acknowledged"]
@property
def allowed_max_concurrent_streams(self) -> int:
@ -149,7 +152,7 @@ class H2ClientProtocol(Protocol, TimeoutMixin):
"""
return min(
self.conn.local_settings.max_concurrent_streams,
self.conn.remote_settings.max_concurrent_streams
self.conn.remote_settings.max_concurrent_streams,
)
def _send_pending_requests(self) -> None:
@ -159,37 +162,39 @@ class H2ClientProtocol(Protocol, TimeoutMixin):
"""
while (
self._pending_request_stream_pool
and self.metadata['active_streams'] < self.allowed_max_concurrent_streams
and self.metadata["active_streams"] < self.allowed_max_concurrent_streams
and self.h2_connected
):
self.metadata['active_streams'] += 1
self.metadata["active_streams"] += 1
stream = self._pending_request_stream_pool.popleft()
stream.initiate_request()
self._write_to_transport()
def pop_stream(self, stream_id: int) -> Stream:
"""Perform cleanup when a stream is closed
"""
"""Perform cleanup when a stream is closed"""
stream = self.streams.pop(stream_id)
self.metadata['active_streams'] -= 1
self.metadata["active_streams"] -= 1
self._send_pending_requests()
return stream
def _new_stream(self, request: Request, spider: Spider) -> Stream:
"""Instantiates a new Stream object
"""
"""Instantiates a new Stream object"""
stream = Stream(
stream_id=next(self._stream_id_generator),
request=request,
protocol=self,
download_maxsize=getattr(spider, 'download_maxsize', self.metadata['default_download_maxsize']),
download_warnsize=getattr(spider, 'download_warnsize', self.metadata['default_download_warnsize']),
download_maxsize=getattr(
spider, "download_maxsize", self.metadata["default_download_maxsize"]
),
download_warnsize=getattr(
spider, "download_warnsize", self.metadata["default_download_warnsize"]
),
)
self.streams[stream.stream_id] = stream
return stream
def _write_to_transport(self) -> None:
""" Write data to the underlying transport connection
"""Write data to the underlying transport connection
from the HTTP2 connection instance if any
"""
# Reset the idle timeout as connection is still actively sending data
@ -200,7 +205,9 @@ class H2ClientProtocol(Protocol, TimeoutMixin):
def request(self, request: Request, spider: Spider) -> Deferred:
if not isinstance(request, Request):
raise TypeError(f'Expected scrapy.http.Request, received {request.__class__.__qualname__}')
raise TypeError(
f"Expected scrapy.http.Request, received {request.__class__.__qualname__}"
)
stream = self._new_stream(request, spider)
d = stream.get_response()
@ -221,7 +228,7 @@ class H2ClientProtocol(Protocol, TimeoutMixin):
self.setTimeout(self.IDLE_TIMEOUT)
destination = self.transport.getPeer()
self.metadata['ip_address'] = ipaddress.ip_address(destination.host)
self.metadata["ip_address"] = ipaddress.ip_address(destination.host)
# Initiate H2 Connection
self.conn.initiate_connection()
@ -237,9 +244,14 @@ class H2ClientProtocol(Protocol, TimeoutMixin):
"""
Close the connection if it's not made via the expected protocol
"""
if self.transport.negotiatedProtocol is not None and self.transport.negotiatedProtocol != PROTOCOL_NAME:
if (
self.transport.negotiatedProtocol is not None
and self.transport.negotiatedProtocol != PROTOCOL_NAME
):
# we have not initiated the connection yet, no need to send a GOAWAY frame to the remote peer
self._lose_connection_with_error([InvalidNegotiatedProtocol(self.transport.negotiatedProtocol)])
self._lose_connection_with_error(
[InvalidNegotiatedProtocol(self.transport.negotiatedProtocol)]
)
def _check_received_data(self, data: bytes) -> None:
"""Checks for edge cases where the connection to remote fails
@ -248,8 +260,8 @@ class H2ClientProtocol(Protocol, TimeoutMixin):
Arguments:
data -- Data received from the remote
"""
if data.startswith(b'HTTP/2.0 405 Method Not Allowed'):
raise MethodNotAllowed405(self.metadata['ip_address'])
if data.startswith(b"HTTP/2.0 405 Method Not Allowed"):
raise MethodNotAllowed405(self.metadata["ip_address"])
def dataReceived(self, data: bytes) -> None:
# Reset the idle timeout as connection is still actively receiving data
@ -284,7 +296,7 @@ class H2ClientProtocol(Protocol, TimeoutMixin):
if (
self.conn.open_outbound_streams > 0
or self.conn.open_inbound_streams > 0
or self.metadata['active_streams'] > 0
or self.metadata["active_streams"] > 0
):
error_code = ErrorCodes.PROTOCOL_ERROR
else:
@ -292,9 +304,9 @@ class H2ClientProtocol(Protocol, TimeoutMixin):
self.conn.close_connection(error_code=error_code)
self._write_to_transport()
self._lose_connection_with_error([
TimeoutError(f"Connection was IDLE for more than {self.IDLE_TIMEOUT}s")
])
self._lose_connection_with_error(
[TimeoutError(f"Connection was IDLE for more than {self.IDLE_TIMEOUT}s")]
)
def connectionLost(self, reason: Failure = connectionDone) -> None:
"""Called by Twisted when the transport connection is lost.
@ -311,13 +323,13 @@ class H2ClientProtocol(Protocol, TimeoutMixin):
self._conn_lost_deferred.callback(self._conn_lost_errors)
for stream in self.streams.values():
if stream.metadata['request_sent']:
if stream.metadata["request_sent"]:
close_reason = StreamCloseReason.CONNECTION_LOST
else:
close_reason = StreamCloseReason.INACTIVE
stream.close(close_reason, self._conn_lost_errors, from_protocol=True)
self.metadata['active_streams'] -= len(self.streams)
self.metadata["active_streams"] -= len(self.streams)
self.streams.clear()
self._pending_request_stream_pool.clear()
self.conn.close_connection()
@ -345,13 +357,13 @@ class H2ClientProtocol(Protocol, TimeoutMixin):
elif isinstance(event, SettingsAcknowledged):
self.settings_acknowledged(event)
elif isinstance(event, UnknownFrameReceived):
logger.warning('Unknown frame received: %s', event.frame)
logger.warning("Unknown frame received: %s", event.frame)
# Event handler functions starts here
def connection_terminated(self, event: ConnectionTerminated) -> None:
self._lose_connection_with_error([
RemoteTerminatedConnection(self.metadata['ip_address'], event)
])
self._lose_connection_with_error(
[RemoteTerminatedConnection(self.metadata["ip_address"], event)]
)
def data_received(self, event: DataReceived) -> None:
try:
@ -370,14 +382,14 @@ class H2ClientProtocol(Protocol, TimeoutMixin):
stream.receive_headers(event.headers)
def settings_acknowledged(self, event: SettingsAcknowledged) -> None:
self.metadata['settings_acknowledged'] = True
self.metadata["settings_acknowledged"] = True
# Send off all the pending requests as now we have
# established a proper HTTP/2 connection
self._send_pending_requests()
# Update certificate when our HTTP/2 connection is established
self.metadata['certificate'] = Certificate(self.transport.getPeerCertificate())
self.metadata["certificate"] = Certificate(self.transport.getPeerCertificate())
def stream_ended(self, event: StreamEnded) -> None:
try:
@ -406,7 +418,9 @@ class H2ClientProtocol(Protocol, TimeoutMixin):
@implementer(IProtocolNegotiationFactory)
class H2ClientFactory(Factory):
def __init__(self, uri: URI, settings: Settings, conn_lost_deferred: Deferred) -> None:
def __init__(
self, uri: URI, settings: Settings, conn_lost_deferred: Deferred
) -> None:
self.uri = uri
self.settings = settings
self.conn_lost_deferred = conn_lost_deferred
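As a usage note, this HTTP/2 client code is only exercised when the HTTP/2 download handler is enabled; a sketch of the usual opt-in, assuming the handler path documented for Scrapy's HTTP/2 support:

# settings.py (sketch)
DOWNLOAD_HANDLERS = {
    "https": "scrapy.core.downloader.handlers.http2.H2DownloadHandler",
}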


@ -32,18 +32,19 @@ class InactiveStreamClosed(ConnectionClosed):
self.request = request
def __str__(self) -> str:
return f'InactiveStreamClosed: Connection was closed without sending the request {self.request!r}'
return f"InactiveStreamClosed: Connection was closed without sending the request {self.request!r}"
class InvalidHostname(H2Error):
def __init__(self, request: Request, expected_hostname: str, expected_netloc: str) -> None:
def __init__(
self, request: Request, expected_hostname: str, expected_netloc: str
) -> None:
self.request = request
self.expected_hostname = expected_hostname
self.expected_netloc = expected_netloc
def __str__(self) -> str:
return f'InvalidHostname: Expected {self.expected_hostname} or {self.expected_netloc} in {self.request}'
return f"InvalidHostname: Expected {self.expected_hostname} or {self.expected_netloc} in {self.request}"
class StreamCloseReason(Enum):
@ -100,28 +101,31 @@ class Stream:
self._request: Request = request
self._protocol: "H2ClientProtocol" = protocol
self._download_maxsize = self._request.meta.get('download_maxsize', download_maxsize)
self._download_warnsize = self._request.meta.get('download_warnsize', download_warnsize)
self._download_maxsize = self._request.meta.get(
"download_maxsize", download_maxsize
)
self._download_warnsize = self._request.meta.get(
"download_warnsize", download_warnsize
)
# Metadata of an HTTP/2 connection stream
# initialized when stream is instantiated
self.metadata: Dict = {
'request_content_length': 0 if self._request.body is None else len(self._request.body),
"request_content_length": 0
if self._request.body is None
else len(self._request.body),
# Flag to keep track whether the stream has initiated the request
'request_sent': False,
"request_sent": False,
# Flag to track whether we have logged about exceeding download warnsize
'reached_warnsize': False,
"reached_warnsize": False,
# Each time we send a data frame, we will decrease value by the amount send.
'remaining_content_length': 0 if self._request.body is None else len(self._request.body),
"remaining_content_length": 0
if self._request.body is None
else len(self._request.body),
# Flag to keep track whether client (self) have closed this stream
'stream_closed_local': False,
"stream_closed_local": False,
# Flag to keep track whether the server has closed the stream
'stream_closed_server': False,
"stream_closed_server": False,
}
# Private variable used to build the response
@ -130,21 +134,19 @@ class Stream:
self._response: Dict = {
# Data received frame by frame from the server is appended
# and passed to the response Deferred when completely received.
'body': BytesIO(),
"body": BytesIO(),
# The amount of data received that counts against the
# flow control window
'flow_controlled_size': 0,
"flow_controlled_size": 0,
# Headers received after sending the request
'headers': Headers({}),
"headers": Headers({}),
}
def _cancel(_) -> None:
# Close this stream as gracefully as possible
# If the associated request is initiated we reset this stream
# else we directly call close() method
if self.metadata['request_sent']:
if self.metadata["request_sent"]:
self.reset_stream(StreamCloseReason.CANCELLED)
else:
self.close(StreamCloseReason.CANCELLED)
@ -152,7 +154,7 @@ class Stream:
self._deferred_response = Deferred(_cancel)
def __repr__(self) -> str:
return f'Stream(id={self.stream_id!r})'
return f"Stream(id={self.stream_id!r})"
@property
def _log_warnsize(self) -> bool:
@ -163,14 +165,16 @@ class Stream:
True if both the above conditions hold true
False if any of the conditions is false
"""
content_length_header = int(self._response['headers'].get(b'Content-Length', -1))
content_length_header = int(
self._response["headers"].get(b"Content-Length", -1)
)
return (
self._download_warnsize
and (
self._response['flow_controlled_size'] > self._download_warnsize
self._response["flow_controlled_size"] > self._download_warnsize
or content_length_header > self._download_warnsize
)
and not self.metadata['reached_warnsize']
and not self.metadata["reached_warnsize"]
)
def get_response(self) -> Deferred:
@ -183,9 +187,10 @@ class Stream:
# Make sure that we are sending the request to the correct URL
url = urlparse(self._request.url)
return (
url.netloc == str(self._protocol.metadata['uri'].host, 'utf-8')
or url.netloc == str(self._protocol.metadata['uri'].netloc, 'utf-8')
or url.netloc == f'{self._protocol.metadata["ip_address"]}:{self._protocol.metadata["uri"].port}'
url.netloc == str(self._protocol.metadata["uri"].host, "utf-8")
or url.netloc == str(self._protocol.metadata["uri"].netloc, "utf-8")
or url.netloc
== f'{self._protocol.metadata["ip_address"]}:{self._protocol.metadata["uri"].port}'
)
def _get_request_headers(self) -> List[Tuple[str, str]]:
@ -193,7 +198,7 @@ class Stream:
path = url.path
if url.query:
path += '?' + url.query
path += "?" + url.query
# This pseudo-header field MUST NOT be empty for "http" or "https"
# URIs; "http" or "https" URIs that do not contain a path component
@ -202,40 +207,40 @@ class Stream:
# a path component; these MUST include a ":path" pseudo-header field
# with a value of '*' (refer RFC 7540 - Section 8.1.2.3)
if not path:
path = '*' if self._request.method == 'OPTIONS' else '/'
path = "*" if self._request.method == "OPTIONS" else "/"
# Make sure pseudo-headers comes before all the other headers
headers = [
(':method', self._request.method),
(':authority', url.netloc),
(":method", self._request.method),
(":authority", url.netloc),
]
# The ":scheme" and ":path" pseudo-header fields MUST
# be omitted for CONNECT method (refer RFC 7540 - Section 8.3)
if self._request.method != 'CONNECT':
if self._request.method != "CONNECT":
headers += [
(':scheme', self._protocol.metadata['uri'].scheme),
(':path', path),
(":scheme", self._protocol.metadata["uri"].scheme),
(":path", path),
]
content_length = str(len(self._request.body))
headers.append(('Content-Length', content_length))
headers.append(("Content-Length", content_length))
content_length_name = self._request.headers.normkey(b'Content-Length')
content_length_name = self._request.headers.normkey(b"Content-Length")
for name, values in self._request.headers.items():
for value in values:
value = str(value, 'utf-8')
value = str(value, "utf-8")
if name == content_length_name:
if value != content_length:
logger.warning(
'Ignoring bad Content-Length header %r of request %r, '
'sending %r instead',
"Ignoring bad Content-Length header %r of request %r, "
"sending %r instead",
value,
self._request,
content_length,
)
continue
headers.append((str(name, 'utf-8'), value))
headers.append((str(name, "utf-8"), value))
return headers
@ -243,7 +248,7 @@ class Stream:
if self.check_request_url():
headers = self._get_request_headers()
self._protocol.conn.send_headers(self.stream_id, headers, end_stream=False)
self.metadata['request_sent'] = True
self.metadata["request_sent"] = True
self.send_data()
else:
# Close this stream calling the response errback
@ -252,44 +257,53 @@ class Stream:
def send_data(self) -> None:
"""Called immediately after the headers are sent. Here we send all the
data as part of the request.
data as part of the request.
If the content length is 0 initially then we end the stream immediately and
wait for response data.
If the content length is 0 initially then we end the stream immediately and
wait for response data.
Warning: Only call this method when stream not closed from client side
and has initiated request already by sending HEADER frame. If not then
stream will raise ProtocolError (raise by h2 state machine).
"""
if self.metadata['stream_closed_local']:
Warning: Only call this method when stream not closed from client side
and has initiated request already by sending HEADER frame. If not then
stream will raise ProtocolError (raise by h2 state machine).
"""
if self.metadata["stream_closed_local"]:
raise StreamClosedError(self.stream_id)
# Firstly, check what the flow control window is for current stream.
window_size = self._protocol.conn.local_flow_control_window(stream_id=self.stream_id)
window_size = self._protocol.conn.local_flow_control_window(
stream_id=self.stream_id
)
# Next, check what the maximum frame size is.
max_frame_size = self._protocol.conn.max_outbound_frame_size
# We will send no more than the window size or the remaining file size
# of data in this call, whichever is smaller.
bytes_to_send_size = min(window_size, self.metadata['remaining_content_length'])
bytes_to_send_size = min(window_size, self.metadata["remaining_content_length"])
# We now need to send a number of data frames.
while bytes_to_send_size > 0:
chunk_size = min(bytes_to_send_size, max_frame_size)
data_chunk_start_id = self.metadata['request_content_length'] - self.metadata['remaining_content_length']
data_chunk = self._request.body[data_chunk_start_id:data_chunk_start_id + chunk_size]
data_chunk_start_id = (
self.metadata["request_content_length"]
- self.metadata["remaining_content_length"]
)
data_chunk = self._request.body[
data_chunk_start_id : data_chunk_start_id + chunk_size
]
self._protocol.conn.send_data(self.stream_id, data_chunk, end_stream=False)
bytes_to_send_size -= chunk_size
self.metadata['remaining_content_length'] -= chunk_size
self.metadata["remaining_content_length"] -= chunk_size
self.metadata['remaining_content_length'] = max(0, self.metadata['remaining_content_length'])
self.metadata["remaining_content_length"] = max(
0, self.metadata["remaining_content_length"]
)
# End the stream if no more data needs to be send
if self.metadata['remaining_content_length'] == 0:
if self.metadata["remaining_content_length"] == 0:
self._protocol.conn.end_stream(self.stream_id)
# Q. What about the rest of the data?
@ -301,62 +315,64 @@ class Stream:
blocked behind the flow control.
"""
if (
self.metadata['remaining_content_length']
and not self.metadata['stream_closed_server']
and self.metadata['request_sent']
self.metadata["remaining_content_length"]
and not self.metadata["stream_closed_server"]
and self.metadata["request_sent"]
):
self.send_data()
def receive_data(self, data: bytes, flow_controlled_length: int) -> None:
self._response['body'].write(data)
self._response['flow_controlled_size'] += flow_controlled_length
self._response["body"].write(data)
self._response["flow_controlled_size"] += flow_controlled_length
# We check maxsize here in case the Content-Length header was not received
if self._download_maxsize and self._response['flow_controlled_size'] > self._download_maxsize:
if (
self._download_maxsize
and self._response["flow_controlled_size"] > self._download_maxsize
):
self.reset_stream(StreamCloseReason.MAXSIZE_EXCEEDED)
return
if self._log_warnsize:
self.metadata['reached_warnsize'] = True
self.metadata["reached_warnsize"] = True
warning_msg = (
f'Received more ({self._response["flow_controlled_size"]}) bytes than download '
f'warn size ({self._download_warnsize}) in request {self._request}'
f"warn size ({self._download_warnsize}) in request {self._request}"
)
logger.warning(warning_msg)
# Acknowledge the data received
self._protocol.conn.acknowledge_received_data(
self._response['flow_controlled_size'],
self.stream_id
self._response["flow_controlled_size"], self.stream_id
)
def receive_headers(self, headers: List[HeaderTuple]) -> None:
for name, value in headers:
self._response['headers'][name] = value
self._response["headers"][name] = value
# Check if we exceed the allowed max data size which can be received
expected_size = int(self._response['headers'].get(b'Content-Length', -1))
expected_size = int(self._response["headers"].get(b"Content-Length", -1))
if self._download_maxsize and expected_size > self._download_maxsize:
self.reset_stream(StreamCloseReason.MAXSIZE_EXCEEDED)
return
if self._log_warnsize:
self.metadata['reached_warnsize'] = True
self.metadata["reached_warnsize"] = True
warning_msg = (
f'Expected response size ({expected_size}) larger than '
f'download warn size ({self._download_warnsize}) in request {self._request}'
f"Expected response size ({expected_size}) larger than "
f"download warn size ({self._download_warnsize}) in request {self._request}"
)
logger.warning(warning_msg)
def reset_stream(self, reason: StreamCloseReason = StreamCloseReason.RESET) -> None:
"""Close this stream by sending a RST_FRAME to the remote peer"""
if self.metadata['stream_closed_local']:
if self.metadata["stream_closed_local"]:
raise StreamClosedError(self.stream_id)
# Clear buffer earlier to avoid keeping data in memory for a long time
self._response['body'].truncate(0)
self._response["body"].truncate(0)
self.metadata['stream_closed_local'] = True
self.metadata["stream_closed_local"] = True
self._protocol.conn.reset_stream(self.stream_id, ErrorCodes.REFUSED_STREAM)
self.close(reason)
@ -366,13 +382,14 @@ class Stream:
errors: Optional[List[BaseException]] = None,
from_protocol: bool = False,
) -> None:
"""Based on the reason sent we will handle each case.
"""
if self.metadata['stream_closed_server']:
"""Based on the reason sent we will handle each case."""
if self.metadata["stream_closed_server"]:
raise StreamClosedError(self.stream_id)
if not isinstance(reason, StreamCloseReason):
raise TypeError(f'Expected StreamCloseReason, received {reason.__class__.__qualname__}')
raise TypeError(
f"Expected StreamCloseReason, received {reason.__class__.__qualname__}"
)
# Have default value of errors as an empty list as
# some cases can add a list of exceptions
@ -381,7 +398,7 @@ class Stream:
if not from_protocol:
self._protocol.pop_stream(self.stream_id)
self.metadata['stream_closed_server'] = True
self.metadata["stream_closed_server"] = True
# We do not check for Content-Length or Transfer-Encoding in response headers
# and add `partial` flag as in HTTP/1.1 as 'A request or response that includes
@ -392,13 +409,14 @@ class Stream:
# receiving DATA_FRAME's when we have received the headers (not
# having Content-Length)
if reason is StreamCloseReason.MAXSIZE_EXCEEDED:
expected_size = int(self._response['headers'].get(
b'Content-Length',
self._response['flow_controlled_size'])
expected_size = int(
self._response["headers"].get(
b"Content-Length", self._response["flow_controlled_size"]
)
)
error_msg = (
f'Cancelling download of {self._request.url}: received response '
f'size ({expected_size}) larger than download max size ({self._download_maxsize})'
f"Cancelling download of {self._request.url}: received response "
f"size ({expected_size}) larger than download max size ({self._download_maxsize})"
)
logger.error(error_msg)
self._deferred_response.errback(CancelledError(error_msg))
@ -416,16 +434,20 @@ class Stream:
# There maybe no :status in headers, we make
# HTTP Status Code: 499 - Client Closed Request
self._response['headers'][':status'] = '499'
self._response["headers"][":status"] = "499"
self._fire_response_deferred()
elif reason is StreamCloseReason.RESET:
self._deferred_response.errback(ResponseFailed([
Failure(
f'Remote peer {self._protocol.metadata["ip_address"]} sent RST_STREAM',
ProtocolError
self._deferred_response.errback(
ResponseFailed(
[
Failure(
f'Remote peer {self._protocol.metadata["ip_address"]} sent RST_STREAM',
ProtocolError,
)
]
)
]))
)
elif reason is StreamCloseReason.CONNECTION_LOST:
self._deferred_response.errback(ResponseFailed(errors))
@ -436,33 +458,35 @@ class Stream:
else:
assert reason is StreamCloseReason.INVALID_HOSTNAME
self._deferred_response.errback(InvalidHostname(
self._request,
str(self._protocol.metadata['uri'].host, 'utf-8'),
f'{self._protocol.metadata["ip_address"]}:{self._protocol.metadata["uri"].port}'
))
self._deferred_response.errback(
InvalidHostname(
self._request,
str(self._protocol.metadata["uri"].host, "utf-8"),
f'{self._protocol.metadata["ip_address"]}:{self._protocol.metadata["uri"].port}',
)
)
def _fire_response_deferred(self) -> None:
"""Builds response from the self._response dict
and fires the response deferred callback with the
generated response instance"""
body = self._response['body'].getvalue()
body = self._response["body"].getvalue()
response_cls = responsetypes.from_args(
headers=self._response['headers'],
headers=self._response["headers"],
url=self._request.url,
body=body,
)
response = response_cls(
url=self._request.url,
status=int(self._response['headers'][':status']),
headers=self._response['headers'],
status=int(self._response["headers"][":status"]),
headers=self._response["headers"],
body=body,
request=self._request,
certificate=self._protocol.metadata['certificate'],
ip_address=self._protocol.metadata['ip_address'],
protocol='h2',
certificate=self._protocol.metadata["certificate"],
ip_address=self._protocol.metadata["ip_address"],
protocol="h2",
)
self._deferred_response.callback(response)


@ -20,14 +20,18 @@ class BaseSchedulerMeta(type):
"""
Metaclass to check scheduler classes against the necessary interface
"""
def __instancecheck__(cls, instance):
return cls.__subclasscheck__(type(instance))
def __subclasscheck__(cls, subclass):
return (
hasattr(subclass, "has_pending_requests") and callable(subclass.has_pending_requests)
and hasattr(subclass, "enqueue_request") and callable(subclass.enqueue_request)
and hasattr(subclass, "next_request") and callable(subclass.next_request)
hasattr(subclass, "has_pending_requests")
and callable(subclass.has_pending_requests)
and hasattr(subclass, "enqueue_request")
and callable(subclass.enqueue_request)
and hasattr(subclass, "next_request")
and callable(subclass.next_request)
)
@ -162,6 +166,7 @@ class Scheduler(BaseScheduler):
:param crawler: The crawler object corresponding to the current crawl.
:type crawler: :class:`scrapy.crawler.Crawler`
"""
def __init__(
self,
dupefilter,
@ -187,15 +192,15 @@ class Scheduler(BaseScheduler):
"""
Factory method, initializes the scheduler with arguments taken from the crawl settings
"""
dupefilter_cls = load_object(crawler.settings['DUPEFILTER_CLASS'])
dupefilter_cls = load_object(crawler.settings["DUPEFILTER_CLASS"])
return cls(
dupefilter=create_instance(dupefilter_cls, crawler.settings, crawler),
jobdir=job_dir(crawler.settings),
dqclass=load_object(crawler.settings['SCHEDULER_DISK_QUEUE']),
mqclass=load_object(crawler.settings['SCHEDULER_MEMORY_QUEUE']),
logunser=crawler.settings.getbool('SCHEDULER_DEBUG'),
dqclass=load_object(crawler.settings["SCHEDULER_DISK_QUEUE"]),
mqclass=load_object(crawler.settings["SCHEDULER_MEMORY_QUEUE"]),
logunser=crawler.settings.getbool("SCHEDULER_DEBUG"),
stats=crawler.stats,
pqclass=load_object(crawler.settings['SCHEDULER_PRIORITY_QUEUE']),
pqclass=load_object(crawler.settings["SCHEDULER_PRIORITY_QUEUE"]),
crawler=crawler,
)
@ -239,11 +244,11 @@ class Scheduler(BaseScheduler):
return False
dqok = self._dqpush(request)
if dqok:
self.stats.inc_value('scheduler/enqueued/disk', spider=self.spider)
self.stats.inc_value("scheduler/enqueued/disk", spider=self.spider)
else:
self._mqpush(request)
self.stats.inc_value('scheduler/enqueued/memory', spider=self.spider)
self.stats.inc_value('scheduler/enqueued', spider=self.spider)
self.stats.inc_value("scheduler/enqueued/memory", spider=self.spider)
self.stats.inc_value("scheduler/enqueued", spider=self.spider)
return True
def next_request(self) -> Optional[Request]:
@ -257,13 +262,13 @@ class Scheduler(BaseScheduler):
"""
request = self.mqs.pop()
if request is not None:
self.stats.inc_value('scheduler/dequeued/memory', spider=self.spider)
self.stats.inc_value("scheduler/dequeued/memory", spider=self.spider)
else:
request = self._dqpop()
if request is not None:
self.stats.inc_value('scheduler/dequeued/disk', spider=self.spider)
self.stats.inc_value("scheduler/dequeued/disk", spider=self.spider)
if request is not None:
self.stats.inc_value('scheduler/dequeued', spider=self.spider)
self.stats.inc_value("scheduler/dequeued", spider=self.spider)
return request
def __len__(self) -> int:
@ -279,13 +284,19 @@ class Scheduler(BaseScheduler):
self.dqs.push(request)
except ValueError as e: # non serializable request
if self.logunser:
msg = ("Unable to serialize request: %(request)s - reason:"
" %(reason)s - no more unserializable requests will be"
" logged (stats being collected)")
logger.warning(msg, {'request': request, 'reason': e},
exc_info=True, extra={'spider': self.spider})
msg = (
"Unable to serialize request: %(request)s - reason:"
" %(reason)s - no more unserializable requests will be"
" logged (stats being collected)"
)
logger.warning(
msg,
{"request": request, "reason": e},
exc_info=True,
extra={"spider": self.spider},
)
self.logunser = False
self.stats.inc_value('scheduler/unserializable', spider=self.spider)
self.stats.inc_value("scheduler/unserializable", spider=self.spider)
return False
else:
return True
@ -299,43 +310,50 @@ class Scheduler(BaseScheduler):
return None
def _mq(self):
""" Create a new priority queue instance, with in-memory storage """
return create_instance(self.pqclass,
settings=None,
crawler=self.crawler,
downstream_queue_cls=self.mqclass,
key='')
"""Create a new priority queue instance, with in-memory storage"""
return create_instance(
self.pqclass,
settings=None,
crawler=self.crawler,
downstream_queue_cls=self.mqclass,
key="",
)
def _dq(self):
""" Create a new priority queue instance, with disk storage """
"""Create a new priority queue instance, with disk storage"""
state = self._read_dqs_state(self.dqdir)
q = create_instance(self.pqclass,
settings=None,
crawler=self.crawler,
downstream_queue_cls=self.dqclass,
key=self.dqdir,
startprios=state)
q = create_instance(
self.pqclass,
settings=None,
crawler=self.crawler,
downstream_queue_cls=self.dqclass,
key=self.dqdir,
startprios=state,
)
if q:
logger.info("Resuming crawl (%(queuesize)d requests scheduled)",
{'queuesize': len(q)}, extra={'spider': self.spider})
logger.info(
"Resuming crawl (%(queuesize)d requests scheduled)",
{"queuesize": len(q)},
extra={"spider": self.spider},
)
return q
def _dqdir(self, jobdir: Optional[str]) -> Optional[str]:
""" Return a folder name to keep disk queue state at """
"""Return a folder name to keep disk queue state at"""
if jobdir is not None:
dqdir = Path(jobdir, 'requests.queue')
dqdir = Path(jobdir, "requests.queue")
if not dqdir.exists():
dqdir.mkdir(parents=True)
return str(dqdir)
return None
def _read_dqs_state(self, dqdir: str) -> list:
path = Path(dqdir, 'active.json')
path = Path(dqdir, "active.json")
if not path.exists():
return []
with path.open(encoding="utf-8") as f:
return json.load(f)
def _write_dqs_state(self, dqdir: str, state: list) -> None:
with Path(dqdir, 'active.json').open('w', encoding="utf-8") as f:
with Path(dqdir, "active.json").open("w", encoding="utf-8") as f:
json.dump(state, f)
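The duck-typed check in BaseSchedulerMeta above only requires three callables, so a minimal custom scheduler can look like the following sketch (illustrative only, not part of this diff):

class SimpleMemoryScheduler:
    # Satisfies BaseSchedulerMeta: has_pending_requests, enqueue_request
    # and next_request are all present and callable.
    def __init__(self):
        self._pending = []

    def has_pending_requests(self) -> bool:
        return bool(self._pending)

    def enqueue_request(self, request) -> bool:
        self._pending.append(request)
        return True

    def next_request(self):
        return self._pending.pop(0) if self._pending else None

A production scheduler would normally also provide from_crawler, open and close, as the Scheduler class above does.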


@ -63,7 +63,9 @@ class Slot:
self.itemproc_size: int = 0
self.closing: Optional[Deferred] = None
def add_response_request(self, result: Union[Response, Failure], request: Request) -> Deferred:
def add_response_request(
self, result: Union[Response, Failure], request: Request
) -> Deferred:
deferred = Deferred()
self.queue.append((result, request, deferred))
if isinstance(result, Response):
@ -77,7 +79,9 @@ class Slot:
self.active.add(request)
return response, request, deferred
def finish_response(self, result: Union[Response, Failure], request: Request) -> None:
def finish_response(
self, result: Union[Response, Failure], request: Request
) -> None:
self.active.remove(request)
if isinstance(result, Response):
self.active_size -= max(len(result.body), self.MIN_RESPONSE_SIZE)
@ -92,13 +96,12 @@ class Slot:
class Scraper:
def __init__(self, crawler: Crawler) -> None:
self.slot: Optional[Slot] = None
self.spidermw = SpiderMiddlewareManager.from_crawler(crawler)
itemproc_cls = load_object(crawler.settings['ITEM_PROCESSOR'])
itemproc_cls = load_object(crawler.settings["ITEM_PROCESSOR"])
self.itemproc = itemproc_cls.from_crawler(crawler)
self.concurrent_items = crawler.settings.getint('CONCURRENT_ITEMS')
self.concurrent_items = crawler.settings.getint("CONCURRENT_ITEMS")
self.crawler = crawler
self.signals = crawler.signals
self.logformatter = crawler.logformatter
@ -106,7 +109,7 @@ class Scraper:
@inlineCallbacks
def open_spider(self, spider: Spider):
"""Open the given spider for scraping and allocate resources for it"""
self.slot = Slot(self.crawler.settings.getint('SCRAPER_SLOT_MAX_ACTIVE_SIZE'))
self.slot = Slot(self.crawler.settings.getint("SCRAPER_SLOT_MAX_ACTIVE_SIZE"))
yield self.itemproc.open_spider(spider)
def close_spider(self, spider: Spider) -> Deferred:
@ -127,7 +130,9 @@ class Scraper:
if self.slot.closing and self.slot.is_idle():
self.slot.closing.callback(spider)
def enqueue_scrape(self, result: Union[Response, Failure], request: Request, spider: Spider) -> Deferred:
def enqueue_scrape(
self, result: Union[Response, Failure], request: Request, spider: Spider
) -> Deferred:
if self.slot is None:
raise RuntimeError("Scraper slot not assigned")
dfd = self.slot.add_response_request(result, request)
@ -140,10 +145,13 @@ class Scraper:
dfd.addBoth(finish_scraping)
dfd.addErrback(
lambda f: logger.error('Scraper bug processing %(request)s',
{'request': request},
exc_info=failure_to_exc_info(f),
extra={'spider': spider}))
lambda f: logger.error(
"Scraper bug processing %(request)s",
{"request": request},
exc_info=failure_to_exc_info(f),
extra={"spider": spider},
)
)
self._scrape_next(spider)
return dfd
@ -153,35 +161,49 @@ class Scraper:
response, request, deferred = self.slot.next_response_request_deferred()
self._scrape(response, request, spider).chainDeferred(deferred)
def _scrape(self, result: Union[Response, Failure], request: Request, spider: Spider) -> Deferred:
def _scrape(
self, result: Union[Response, Failure], request: Request, spider: Spider
) -> Deferred:
"""
Handle the downloaded response or failure through the spider callback/errback
"""
if not isinstance(result, (Response, Failure)):
raise TypeError(f"Incorrect type: expected Response or Failure, got {type(result)}: {result!r}")
dfd = self._scrape2(result, request, spider) # returns spider's processed output
raise TypeError(
f"Incorrect type: expected Response or Failure, got {type(result)}: {result!r}"
)
dfd = self._scrape2(
result, request, spider
) # returns spider's processed output
dfd.addErrback(self.handle_spider_error, request, result, spider)
dfd.addCallback(self.handle_spider_output, request, result, spider)
return dfd
def _scrape2(self, result: Union[Response, Failure], request: Request, spider: Spider) -> Deferred:
def _scrape2(
self, result: Union[Response, Failure], request: Request, spider: Spider
) -> Deferred:
"""
Handle the different cases of request's result being a Response or a Failure
"""
if isinstance(result, Response):
return self.spidermw.scrape_response(self.call_spider, result, request, spider)
return self.spidermw.scrape_response(
self.call_spider, result, request, spider
)
# else result is a Failure
dfd = self.call_spider(result, request, spider)
return dfd.addErrback(self._log_download_errors, result, request, spider)
def call_spider(self, result: Union[Response, Failure], request: Request, spider: Spider) -> Deferred:
def call_spider(
self, result: Union[Response, Failure], request: Request, spider: Spider
) -> Deferred:
if isinstance(result, Response):
if getattr(result, "request", None) is None:
result.request = request
callback = result.request.callback or spider._parse
warn_on_generator_with_return_value(spider, callback)
dfd = defer_succeed(result)
dfd.addCallbacks(callback=callback, callbackKeywords=result.request.cb_kwargs)
dfd.addCallbacks(
callback=callback, callbackKeywords=result.request.cb_kwargs
)
else: # result is a Failure
result.request = request
warn_on_generator_with_return_value(spider, request.errback)
@ -189,45 +211,69 @@ class Scraper:
dfd.addErrback(request.errback)
return dfd.addCallback(iterate_spider_output)
def handle_spider_error(self, _failure: Failure, request: Request, response: Response, spider: Spider) -> None:
def handle_spider_error(
self, _failure: Failure, request: Request, response: Response, spider: Spider
) -> None:
exc = _failure.value
if isinstance(exc, CloseSpider):
assert self.crawler.engine is not None # typing
self.crawler.engine.close_spider(spider, exc.reason or 'cancelled')
self.crawler.engine.close_spider(spider, exc.reason or "cancelled")
return
logkws = self.logformatter.spider_error(_failure, request, response, spider)
logger.log(
*logformatter_adapter(logkws),
exc_info=failure_to_exc_info(_failure),
extra={'spider': spider}
extra={"spider": spider},
)
self.signals.send_catch_log(
signal=signals.spider_error,
failure=_failure, response=response,
spider=spider
failure=_failure,
response=response,
spider=spider,
)
self.crawler.stats.inc_value(
f"spider_exceptions/{_failure.value.__class__.__name__}",
spider=spider
f"spider_exceptions/{_failure.value.__class__.__name__}", spider=spider
)
def handle_spider_output(self, result: Union[Iterable, AsyncIterable], request: Request,
response: Response, spider: Spider) -> Deferred:
def handle_spider_output(
self,
result: Union[Iterable, AsyncIterable],
request: Request,
response: Response,
spider: Spider,
) -> Deferred:
if not result:
return defer_succeed(None)
it: Union[Generator, AsyncGenerator]
if isinstance(result, AsyncIterable):
it = aiter_errback(result, self.handle_spider_error, request, response, spider)
dfd = parallel_async(it, self.concurrent_items, self._process_spidermw_output,
request, response, spider)
it = aiter_errback(
result, self.handle_spider_error, request, response, spider
)
dfd = parallel_async(
it,
self.concurrent_items,
self._process_spidermw_output,
request,
response,
spider,
)
else:
it = iter_errback(result, self.handle_spider_error, request, response, spider)
dfd = parallel(it, self.concurrent_items, self._process_spidermw_output,
request, response, spider)
it = iter_errback(
result, self.handle_spider_error, request, response, spider
)
dfd = parallel(
it,
self.concurrent_items,
self._process_spidermw_output,
request,
response,
spider,
)
return dfd
def _process_spidermw_output(self, output: Any, request: Request, response: Response,
spider: Spider) -> Optional[Deferred]:
def _process_spidermw_output(
self, output: Any, request: Request, response: Response, spider: Spider
) -> Optional[Deferred]:
"""Process each Request/Item (given in the output parameter) returned
from the given spider
"""
@ -245,14 +291,19 @@ class Scraper:
else:
typename = type(output).__name__
logger.error(
'Spider must return request, item, or None, got %(typename)r in %(request)s',
{'request': request, 'typename': typename},
extra={'spider': spider},
"Spider must return request, item, or None, got %(typename)r in %(request)s",
{"request": request, "typename": typename},
extra={"spider": spider},
)
return None
def _log_download_errors(self, spider_failure: Failure, download_failure: Failure, request: Request,
spider: Spider) -> Union[Failure, None]:
def _log_download_errors(
self,
spider_failure: Failure,
download_failure: Failure,
request: Request,
spider: Spider,
) -> Union[Failure, None]:
"""Log and silence errors that come from the engine (typically download
errors that got propagated thru here).
@ -262,29 +313,33 @@ class Scraper:
"""
if not download_failure.check(IgnoreRequest):
if download_failure.frames:
logkws = self.logformatter.download_error(download_failure, request, spider)
logkws = self.logformatter.download_error(
download_failure, request, spider
)
logger.log(
*logformatter_adapter(logkws),
extra={'spider': spider},
extra={"spider": spider},
exc_info=failure_to_exc_info(download_failure),
)
else:
errmsg = download_failure.getErrorMessage()
if errmsg:
logkws = self.logformatter.download_error(
download_failure, request, spider, errmsg)
download_failure, request, spider, errmsg
)
logger.log(
*logformatter_adapter(logkws),
extra={'spider': spider},
extra={"spider": spider},
)
if spider_failure is not download_failure:
return spider_failure
return None
def _itemproc_finished(self, output: Any, item: Any, response: Response, spider: Spider) -> None:
"""ItemProcessor finished for the given ``item`` and returned ``output``
"""
def _itemproc_finished(
self, output: Any, item: Any, response: Response, spider: Spider
) -> None:
"""ItemProcessor finished for the given ``item`` and returned ``output``"""
assert self.slot is not None # typing
self.slot.itemproc_size -= 1
if isinstance(output, Failure):
@ -292,19 +347,30 @@ class Scraper:
if isinstance(ex, DropItem):
logkws = self.logformatter.dropped(item, ex, response, spider)
if logkws is not None:
logger.log(*logformatter_adapter(logkws), extra={'spider': spider})
logger.log(*logformatter_adapter(logkws), extra={"spider": spider})
return self.signals.send_catch_log_deferred(
signal=signals.item_dropped, item=item, response=response,
spider=spider, exception=output.value)
signal=signals.item_dropped,
item=item,
response=response,
spider=spider,
exception=output.value,
)
logkws = self.logformatter.item_error(item, ex, response, spider)
logger.log(*logformatter_adapter(logkws), extra={'spider': spider},
exc_info=failure_to_exc_info(output))
logger.log(
*logformatter_adapter(logkws),
extra={"spider": spider},
exc_info=failure_to_exc_info(output),
)
return self.signals.send_catch_log_deferred(
signal=signals.item_error, item=item, response=response,
spider=spider, failure=output)
signal=signals.item_error,
item=item,
response=response,
spider=spider,
failure=output,
)
logkws = self.logformatter.scraped(output, response, spider)
if logkws is not None:
logger.log(*logformatter_adapter(logkws), extra={'spider': spider})
logger.log(*logformatter_adapter(logkws), extra={"spider": spider})
return self.signals.send_catch_log_deferred(
signal=signals.item_scraped, item=output, response=response,
spider=spider)
signal=signals.item_scraped, item=output, response=response, spider=spider
)
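
A minimal item-pipeline sketch of the two failure paths handled in _itemproc_finished above: raising DropItem produces the "dropped" log line and the item_dropped signal, while any other exception goes through item_error. The pipeline name and fields are illustrative.

from scrapy.exceptions import DropItem

class PriceValidationPipeline:  # hypothetical pipeline
    def process_item(self, item, spider):
        if "price" not in item:
            # taken as a drop above: logformatter.dropped() + item_dropped signal
            raise DropItem(f"missing price in {item!r}")
        if item["price"] < 0:
            # any other exception takes the item_error branch instead
            raise ValueError("negative price")
        return item
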



@ -6,7 +6,17 @@ See documentation in docs/topics/spider-middleware.rst
import logging
from inspect import isasyncgenfunction, iscoroutine
from itertools import islice
from typing import Any, AsyncGenerator, AsyncIterable, Callable, Generator, Iterable, Tuple, Union, cast
from typing import (
Any,
AsyncGenerator,
AsyncIterable,
Callable,
Generator,
Iterable,
Tuple,
Union,
cast,
)
from twisted.internet.defer import Deferred, inlineCallbacks
from twisted.python.failure import Failure
@ -17,7 +27,12 @@ from scrapy.http import Response
from scrapy.middleware import MiddlewareManager
from scrapy.utils.asyncgen import as_async_generator, collect_asyncgen
from scrapy.utils.conf import build_component_list
from scrapy.utils.defer import mustbe_deferred, deferred_from_coro, deferred_f_from_coro_f, maybe_deferred_to_future
from scrapy.utils.defer import (
mustbe_deferred,
deferred_from_coro,
deferred_f_from_coro_f,
maybe_deferred_to_future,
)
from scrapy.utils.python import MutableAsyncChain, MutableChain
@ -33,7 +48,7 @@ def _isiterable(o) -> bool:
class SpiderMiddlewareManager(MiddlewareManager):
component_name = 'spider middleware'
component_name = "spider middleware"
def __init__(self, *middlewares):
super().__init__(*middlewares)
@ -41,28 +56,35 @@ class SpiderMiddlewareManager(MiddlewareManager):
@classmethod
def _get_mwlist_from_settings(cls, settings):
return build_component_list(settings.getwithbase('SPIDER_MIDDLEWARES'))
return build_component_list(settings.getwithbase("SPIDER_MIDDLEWARES"))
def _add_middleware(self, mw):
super()._add_middleware(mw)
if hasattr(mw, 'process_spider_input'):
self.methods['process_spider_input'].append(mw.process_spider_input)
if hasattr(mw, 'process_start_requests'):
self.methods['process_start_requests'].appendleft(mw.process_start_requests)
process_spider_output = self._get_async_method_pair(mw, 'process_spider_output')
self.methods['process_spider_output'].appendleft(process_spider_output)
process_spider_exception = getattr(mw, 'process_spider_exception', None)
self.methods['process_spider_exception'].appendleft(process_spider_exception)
if hasattr(mw, "process_spider_input"):
self.methods["process_spider_input"].append(mw.process_spider_input)
if hasattr(mw, "process_start_requests"):
self.methods["process_start_requests"].appendleft(mw.process_start_requests)
process_spider_output = self._get_async_method_pair(mw, "process_spider_output")
self.methods["process_spider_output"].appendleft(process_spider_output)
process_spider_exception = getattr(mw, "process_spider_exception", None)
self.methods["process_spider_exception"].appendleft(process_spider_exception)
def _process_spider_input(self, scrape_func: ScrapeFunc, response: Response, request: Request,
spider: Spider) -> Any:
for method in self.methods['process_spider_input']:
def _process_spider_input(
self,
scrape_func: ScrapeFunc,
response: Response,
request: Request,
spider: Spider,
) -> Any:
for method in self.methods["process_spider_input"]:
method = cast(Callable, method)
try:
result = method(response=response, spider=spider)
if result is not None:
msg = (f"{method.__qualname__} must return None "
f"or raise an exception, got {type(result)}")
msg = (
f"{method.__qualname__} must return None "
f"or raise an exception, got {type(result)}"
)
raise _InvalidOutput(msg)
except _InvalidOutput:
raise
@ -70,17 +92,22 @@ class SpiderMiddlewareManager(MiddlewareManager):
return scrape_func(Failure(), request, spider)
return scrape_func(response, request, spider)
def _evaluate_iterable(self, response: Response, spider: Spider, iterable: Union[Iterable, AsyncIterable],
exception_processor_index: int, recover_to: Union[MutableChain, MutableAsyncChain]
) -> Union[Generator, AsyncGenerator]:
def _evaluate_iterable(
self,
response: Response,
spider: Spider,
iterable: Union[Iterable, AsyncIterable],
exception_processor_index: int,
recover_to: Union[MutableChain, MutableAsyncChain],
) -> Union[Generator, AsyncGenerator]:
def process_sync(iterable: Iterable):
try:
for r in iterable:
yield r
except Exception as ex:
exception_result = self._process_spider_exception(response, spider, Failure(ex),
exception_processor_index)
exception_result = self._process_spider_exception(
response, spider, Failure(ex), exception_processor_index
)
if isinstance(exception_result, Failure):
raise
recover_to.extend(exception_result)
@ -90,8 +117,9 @@ class SpiderMiddlewareManager(MiddlewareManager):
async for r in iterable:
yield r
except Exception as ex:
exception_result = self._process_spider_exception(response, spider, Failure(ex),
exception_processor_index)
exception_result = self._process_spider_exception(
response, spider, Failure(ex), exception_processor_index
)
if isinstance(exception_result, Failure):
raise
recover_to.extend(exception_result)
@ -100,13 +128,20 @@ class SpiderMiddlewareManager(MiddlewareManager):
return process_async(iterable)
return process_sync(iterable)
def _process_spider_exception(self, response: Response, spider: Spider, _failure: Failure,
start_index: int = 0) -> Union[Failure, MutableChain]:
def _process_spider_exception(
self,
response: Response,
spider: Spider,
_failure: Failure,
start_index: int = 0,
) -> Union[Failure, MutableChain]:
exception = _failure.value
# don't handle _InvalidOutput exception
if isinstance(exception, _InvalidOutput):
return _failure
method_list = islice(self.methods['process_spider_exception'], start_index, None)
method_list = islice(
self.methods["process_spider_exception"], start_index, None
)
for method_index, method in enumerate(method_list, start=start_index):
if method is None:
continue
@ -115,7 +150,9 @@ class SpiderMiddlewareManager(MiddlewareManager):
if _isiterable(result):
# stop exception handling by handing control over to the
# process_spider_output chain if an iterable has been returned
dfd: Deferred = self._process_spider_output(response, spider, result, method_index + 1)
dfd: Deferred = self._process_spider_output(
response, spider, result, method_index + 1
)
# _process_spider_output() returns a Deferred only because of downgrading so this can be
# simplified when downgrading is removed.
if dfd.called:
@ -128,8 +165,10 @@ class SpiderMiddlewareManager(MiddlewareManager):
elif result is None:
continue
else:
msg = (f"{method.__qualname__} must return None "
f"or an iterable, got {type(result)}")
msg = (
f"{method.__qualname__} must return None "
f"or an iterable, got {type(result)}"
)
raise _InvalidOutput(msg)
return _failure
@ -137,9 +176,13 @@ class SpiderMiddlewareManager(MiddlewareManager):
# being available immediately which doesn't work when it's a wrapped coroutine.
# It also needs @inlineCallbacks only because of downgrading so it can be removed when downgrading is removed.
@inlineCallbacks
def _process_spider_output(self, response: Response, spider: Spider,
result: Union[Iterable, AsyncIterable], start_index: int = 0
) -> Deferred:
def _process_spider_output(
self,
response: Response,
spider: Spider,
result: Union[Iterable, AsyncIterable],
start_index: int = 0,
) -> Deferred:
# items in this iterable do not need to go through the process_spider_output
# chain, they went through it already from the process_spider_exception method
recovered: Union[MutableChain, MutableAsyncChain]
@ -156,7 +199,7 @@ class SpiderMiddlewareManager(MiddlewareManager):
# Storing methods and method tuples in the same list is weird but we should be able to roll this back
# when we drop this compatibility feature.
method_list = islice(self.methods['process_spider_output'], start_index, None)
method_list = islice(self.methods["process_spider_output"], start_index, None)
for method_index, method_pair in enumerate(method_list, start=start_index):
if method_pair is None:
continue
@ -177,24 +220,32 @@ class SpiderMiddlewareManager(MiddlewareManager):
result = as_async_generator(result)
elif need_downgrade:
if not self.downgrade_warning_done:
logger.warning(f"Async iterable passed to {method.__qualname__} "
f"was downgraded to a non-async one")
logger.warning(
f"Async iterable passed to {method.__qualname__} "
f"was downgraded to a non-async one"
)
self.downgrade_warning_done = True
assert isinstance(result, AsyncIterable)
# AsyncIterable -> Iterable
result = yield deferred_from_coro(collect_asyncgen(result))
if isinstance(recovered, AsyncIterable):
recovered_collected = yield deferred_from_coro(collect_asyncgen(recovered))
recovered_collected = yield deferred_from_coro(
collect_asyncgen(recovered)
)
recovered = MutableChain(recovered_collected)
# might fail directly if the output value is not a generator
result = method(response=response, result=result, spider=spider)
except Exception as ex:
exception_result = self._process_spider_exception(response, spider, Failure(ex), method_index + 1)
exception_result = self._process_spider_exception(
response, spider, Failure(ex), method_index + 1
)
if isinstance(exception_result, Failure):
raise
return exception_result
if _isiterable(result):
result = self._evaluate_iterable(response, spider, result, method_index + 1, recovered)
result = self._evaluate_iterable(
response, spider, result, method_index + 1, recovered
)
else:
if iscoroutine(result):
result.close() # Silence warning about not awaiting
@ -214,15 +265,18 @@ class SpiderMiddlewareManager(MiddlewareManager):
return MutableAsyncChain(result, recovered)
return MutableChain(result, recovered) # type: ignore[arg-type]
async def _process_callback_output(self, response: Response, spider: Spider, result: Union[Iterable, AsyncIterable]
) -> Union[MutableChain, MutableAsyncChain]:
async def _process_callback_output(
self, response: Response, spider: Spider, result: Union[Iterable, AsyncIterable]
) -> Union[MutableChain, MutableAsyncChain]:
recovered: Union[MutableChain, MutableAsyncChain]
if isinstance(result, AsyncIterable):
recovered = MutableAsyncChain()
else:
recovered = MutableChain()
result = self._evaluate_iterable(response, spider, result, 0, recovered)
result = await maybe_deferred_to_future(self._process_spider_output(response, spider, result))
result = await maybe_deferred_to_future(
self._process_spider_output(response, spider, result)
)
if isinstance(result, AsyncIterable):
return MutableAsyncChain(result, recovered)
if isinstance(recovered, AsyncIterable):
@ -230,41 +284,60 @@ class SpiderMiddlewareManager(MiddlewareManager):
recovered = MutableChain(recovered_collected)
return MutableChain(result, recovered) # type: ignore[arg-type]
def scrape_response(self, scrape_func: ScrapeFunc, response: Response, request: Request,
spider: Spider) -> Deferred:
async def process_callback_output(result: Union[Iterable, AsyncIterable]
) -> Union[MutableChain, MutableAsyncChain]:
def scrape_response(
self,
scrape_func: ScrapeFunc,
response: Response,
request: Request,
spider: Spider,
) -> Deferred:
async def process_callback_output(
result: Union[Iterable, AsyncIterable]
) -> Union[MutableChain, MutableAsyncChain]:
return await self._process_callback_output(response, spider, result)
def process_spider_exception(_failure: Failure) -> Union[Failure, MutableChain]:
return self._process_spider_exception(response, spider, _failure)
dfd = mustbe_deferred(self._process_spider_input, scrape_func, response, request, spider)
dfd.addCallbacks(callback=deferred_f_from_coro_f(process_callback_output), errback=process_spider_exception)
dfd = mustbe_deferred(
self._process_spider_input, scrape_func, response, request, spider
)
dfd.addCallbacks(
callback=deferred_f_from_coro_f(process_callback_output),
errback=process_spider_exception,
)
return dfd
def process_start_requests(self, start_requests, spider: Spider) -> Deferred:
return self._process_chain('process_start_requests', start_requests, spider)
return self._process_chain("process_start_requests", start_requests, spider)
# This method is only needed until _async compatibility methods are removed.
@staticmethod
def _get_async_method_pair(mw: Any, methodname: str) -> Union[None, Callable, Tuple[Callable, Callable]]:
def _get_async_method_pair(
mw: Any, methodname: str
) -> Union[None, Callable, Tuple[Callable, Callable]]:
normal_method = getattr(mw, methodname, None)
methodname_async = methodname + "_async"
async_method = getattr(mw, methodname_async, None)
if not async_method:
return normal_method
if not normal_method:
logger.error(f"Middleware {mw.__qualname__} has {methodname_async} "
f"without {methodname}, skipping this method.")
logger.error(
f"Middleware {mw.__qualname__} has {methodname_async} "
f"without {methodname}, skipping this method."
)
return None
if not isasyncgenfunction(async_method):
logger.error(f"{async_method.__qualname__} is not "
f"an async generator function, skipping this method.")
logger.error(
f"{async_method.__qualname__} is not "
f"an async generator function, skipping this method."
)
return normal_method
if isasyncgenfunction(normal_method):
logger.error(f"{normal_method.__qualname__} is an async "
f"generator function while {methodname_async} exists, "
f"skipping both methods.")
logger.error(
f"{normal_method.__qualname__} is an async "
f"generator function while {methodname_async} exists, "
f"skipping both methods."
)
return None
return normal_method, async_method


@ -49,10 +49,9 @@ logger = logging.getLogger(__name__)
class Crawler:
def __init__(self, spidercls, settings=None, init_reactor: bool = False):
if isinstance(spidercls, Spider):
raise ValueError('The spidercls argument must be a class, not an object')
raise ValueError("The spidercls argument must be a class, not an object")
if isinstance(settings, dict) or settings is None:
settings = Settings(settings)
@ -63,14 +62,15 @@ class Crawler:
self.signals = SignalManager(self)
self.stats = load_object(self.settings['STATS_CLASS'])(self)
self.stats = load_object(self.settings["STATS_CLASS"])(self)
handler = LogCounterHandler(self, level=self.settings.get('LOG_LEVEL'))
handler = LogCounterHandler(self, level=self.settings.get("LOG_LEVEL"))
logging.root.addHandler(handler)
d = dict(overridden_settings(self.settings))
logger.info("Overridden settings:\n%(settings)s",
{'settings': pprint.pformat(d)})
logger.info(
"Overridden settings:\n%(settings)s", {"settings": pprint.pformat(d)}
)
if get_scrapy_root_handler() is not None:
# scrapy root handler already installed: update it with new settings
@ -80,11 +80,11 @@ class Crawler:
self.__remove_handler = lambda: logging.root.removeHandler(handler)
self.signals.connect(self.__remove_handler, signals.engine_stopped)
lf_cls = load_object(self.settings['LOG_FORMATTER'])
lf_cls = load_object(self.settings["LOG_FORMATTER"])
self.logformatter = lf_cls.from_crawler(self)
self.request_fingerprinter: RequestFingerprinter = create_instance(
load_object(self.settings['REQUEST_FINGERPRINTER_CLASS']),
load_object(self.settings["REQUEST_FINGERPRINTER_CLASS"]),
settings=self.settings,
crawler=self,
)
@ -160,23 +160,26 @@ class CrawlerRunner:
crawlers = property(
lambda self: self._crawlers,
doc="Set of :class:`crawlers <scrapy.crawler.Crawler>` started by "
":meth:`crawl` and managed by this class."
":meth:`crawl` and managed by this class.",
)
@staticmethod
def _get_spider_loader(settings):
""" Get SpiderLoader instance from settings """
cls_path = settings.get('SPIDER_LOADER_CLASS')
"""Get SpiderLoader instance from settings"""
cls_path = settings.get("SPIDER_LOADER_CLASS")
loader_cls = load_object(cls_path)
excs = (DoesNotImplement, MultipleInvalid) if MultipleInvalid else DoesNotImplement
excs = (
(DoesNotImplement, MultipleInvalid) if MultipleInvalid else DoesNotImplement
)
try:
verifyClass(ISpiderLoader, loader_cls)
except excs:
warnings.warn(
'SPIDER_LOADER_CLASS (previously named SPIDER_MANAGER_CLASS) does '
'not fully implement scrapy.interfaces.ISpiderLoader interface. '
'Please add all missing methods to avoid unexpected runtime errors.',
category=ScrapyDeprecationWarning, stacklevel=2
"SPIDER_LOADER_CLASS (previously named SPIDER_MANAGER_CLASS) does "
"not fully implement scrapy.interfaces.ISpiderLoader interface. "
"Please add all missing methods to avoid unexpected runtime errors.",
category=ScrapyDeprecationWarning,
stacklevel=2,
)
return loader_cls.from_settings(settings.frozencopy())
@ -191,9 +194,12 @@ class CrawlerRunner:
@property
def spiders(self):
warnings.warn("CrawlerRunner.spiders attribute is renamed to "
"CrawlerRunner.spider_loader.",
category=ScrapyDeprecationWarning, stacklevel=2)
warnings.warn(
"CrawlerRunner.spiders attribute is renamed to "
"CrawlerRunner.spider_loader.",
category=ScrapyDeprecationWarning,
stacklevel=2,
)
return self.spider_loader
def crawl(self, crawler_or_spidercls, *args, **kwargs):
@ -220,8 +226,9 @@ class CrawlerRunner:
"""
if isinstance(crawler_or_spidercls, Spider):
raise ValueError(
'The crawler_or_spidercls argument cannot be a spider object, '
'it must be a spider class (or a Crawler object)')
"The crawler_or_spidercls argument cannot be a spider object, "
"it must be a spider class (or a Crawler object)"
)
crawler = self.create_crawler(crawler_or_spidercls)
return self._crawl(crawler, *args, **kwargs)
@ -233,7 +240,7 @@ class CrawlerRunner:
def _done(result):
self.crawlers.discard(crawler)
self._active.discard(d)
self.bootstrap_failed |= not getattr(crawler, 'spider', None)
self.bootstrap_failed |= not getattr(crawler, "spider", None)
return result
return d.addBoth(_done)
@ -251,8 +258,9 @@ class CrawlerRunner:
"""
if isinstance(crawler_or_spidercls, Spider):
raise ValueError(
'The crawler_or_spidercls argument cannot be a spider object, '
'it must be a spider class (or a Crawler object)')
"The crawler_or_spidercls argument cannot be a spider object, "
"it must be a spider class (or a Crawler object)"
)
if isinstance(crawler_or_spidercls, Crawler):
return crawler_or_spidercls
return self._create_crawler(crawler_or_spidercls)
@ -314,18 +322,23 @@ class CrawlerProcess(CrawlerRunner):
def _signal_shutdown(self, signum, _):
from twisted.internet import reactor
install_shutdown_handlers(self._signal_kill)
signame = signal_names[signum]
logger.info("Received %(signame)s, shutting down gracefully. Send again to force ",
{'signame': signame})
logger.info(
"Received %(signame)s, shutting down gracefully. Send again to force ",
{"signame": signame},
)
reactor.callFromThread(self._graceful_stop_reactor)
def _signal_kill(self, signum, _):
from twisted.internet import reactor
install_shutdown_handlers(signal.SIG_IGN)
signame = signal_names[signum]
logger.info('Received %(signame)s twice, forcing unclean shutdown',
{'signame': signame})
logger.info(
"Received %(signame)s twice, forcing unclean shutdown", {"signame": signame}
)
reactor.callFromThread(self._stop_reactor)
def _create_crawler(self, spidercls):
@ -351,6 +364,7 @@ class CrawlerProcess(CrawlerRunner):
handlers (default: True)
"""
from twisted.internet import reactor
if stop_after_crawl:
d = self.join()
# Don't start the reactor if the deferreds are already fired
@ -364,8 +378,8 @@ class CrawlerProcess(CrawlerRunner):
resolver = create_instance(resolver_class, self.settings, self, reactor=reactor)
resolver.install_on_reactor()
tp = reactor.getThreadPool()
tp.adjustPoolsize(maxthreads=self.settings.getint('REACTOR_THREADPOOL_MAXSIZE'))
reactor.addSystemEventTrigger('before', 'shutdown', self.stop)
tp.adjustPoolsize(maxthreads=self.settings.getint("REACTOR_THREADPOOL_MAXSIZE"))
reactor.addSystemEventTrigger("before", "shutdown", self.stop)
reactor.run(installSignalHandlers=False) # blocking call
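
A hedged usage sketch for the start() method above, run from a plain script; the spider and settings values are placeholders.

import scrapy
from scrapy.crawler import CrawlerProcess

class QuotesSpider(scrapy.Spider):  # hypothetical
    name = "quotes"
    start_urls = ["https://example.com"]

    def parse(self, response):
        yield {"url": response.url}

process = CrawlerProcess(settings={"LOG_LEVEL": "INFO"})
process.crawl(QuotesSpider)
process.start()  # blocks here until the crawl finishes (stop_after_crawl=True)
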
def _graceful_stop_reactor(self):
@ -375,6 +389,7 @@ class CrawlerProcess(CrawlerRunner):
def _stop_reactor(self, _=None):
from twisted.internet import reactor
try:
reactor.stop()
except RuntimeError: # raised if already stopped or in shutdown stage


@ -17,14 +17,14 @@ class AjaxCrawlMiddleware:
"""
def __init__(self, settings):
if not settings.getbool('AJAXCRAWL_ENABLED'):
if not settings.getbool("AJAXCRAWL_ENABLED"):
raise NotConfigured
# XXX: Google parses at least first 100k bytes; scrapy's redirect
# middleware parses first 4k. 4k turns out to be insufficient
# for this middleware, and parsing 100k could be slow.
# We use something in between (32K) by default.
self.lookup_bytes = settings.getint('AJAXCRAWL_MAXSIZE', 32768)
self.lookup_bytes = settings.getint("AJAXCRAWL_MAXSIZE", 32768)
@classmethod
def from_crawler(cls, crawler):
@ -35,23 +35,25 @@ class AjaxCrawlMiddleware:
if not isinstance(response, HtmlResponse) or response.status != 200:
return response
if request.method != 'GET':
if request.method != "GET":
# other HTTP methods are either not safe or don't have a body
return response
if 'ajax_crawlable' in request.meta: # prevent loops
if "ajax_crawlable" in request.meta: # prevent loops
return response
if not self._has_ajax_crawlable_variant(response):
return response
# scrapy already handles #! links properly
ajax_crawl_request = request.replace(url=request.url + '#!')
logger.debug("Downloading AJAX crawlable %(ajax_crawl_request)s instead of %(request)s",
{'ajax_crawl_request': ajax_crawl_request, 'request': request},
extra={'spider': spider})
ajax_crawl_request = request.replace(url=request.url + "#!")
logger.debug(
"Downloading AJAX crawlable %(ajax_crawl_request)s instead of %(request)s",
{"ajax_crawl_request": ajax_crawl_request, "request": request},
extra={"spider": spider},
)
ajax_crawl_request.meta['ajax_crawlable'] = True
ajax_crawl_request.meta["ajax_crawlable"] = True
return ajax_crawl_request
def _has_ajax_crawlable_variant(self, response):
@ -59,12 +61,14 @@ class AjaxCrawlMiddleware:
Return True if a page without hash fragment could be "AJAX crawlable"
according to https://developers.google.com/webmasters/ajax-crawling/docs/getting-started.
"""
body = response.text[:self.lookup_bytes]
body = response.text[: self.lookup_bytes]
return _has_ajaxcrawlable_meta(body)
# XXX: move it to w3lib?
_ajax_crawlable_re = re.compile(r'<meta\s+name=["\']fragment["\']\s+content=["\']!["\']/?>')
_ajax_crawlable_re = re.compile(
r'<meta\s+name=["\']fragment["\']\s+content=["\']!["\']/?>'
)
def _has_ajaxcrawlable_meta(text):
@ -82,12 +86,12 @@ def _has_ajaxcrawlable_meta(text):
# Stripping scripts and comments is slow (about 20x slower than
# just checking if a string is in text); this is a quick fail-fast
# path that should work for most pages.
if 'fragment' not in text:
if "fragment" not in text:
return False
if 'content' not in text:
if "content" not in text:
return False
text = html.remove_tags_with_content(text, ('script', 'noscript'))
text = html.remove_tags_with_content(text, ("script", "noscript"))
text = html.replace_entities(text)
text = html.remove_comments(text)
return _ajax_crawlable_re.search(text) is not None
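
A small check of the regex above against the kind of markup it targets; the HTML snippets are made up, and the import assumes the file shown here is scrapy/downloadermiddlewares/ajaxcrawl.py.

# assuming the module path of the file shown above
from scrapy.downloadermiddlewares.ajaxcrawl import _has_ajaxcrawlable_meta

crawlable = '<html><head><meta name="fragment" content="!"></head><body></body></html>'
plain = '<html><head><title>a page with no such meta tag</title></head></html>'

assert _has_ajaxcrawlable_meta(crawlable) is True
assert _has_ajaxcrawlable_meta(plain) is False
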


@ -29,14 +29,14 @@ class CookiesMiddleware:
@classmethod
def from_crawler(cls, crawler):
if not crawler.settings.getbool('COOKIES_ENABLED'):
if not crawler.settings.getbool("COOKIES_ENABLED"):
raise NotConfigured
return cls(crawler.settings.getbool('COOKIES_DEBUG'))
return cls(crawler.settings.getbool("COOKIES_DEBUG"))
def _process_cookies(self, cookies, *, jar, request):
for cookie in cookies:
cookie_domain = cookie.domain
if cookie_domain.startswith('.'):
if cookie_domain.startswith("."):
cookie_domain = cookie_domain[1:]
request_domain = urlparse_cached(request).hostname.lower()
@ -49,7 +49,7 @@ class CookiesMiddleware:
jar.set_cookie_if_ok(cookie, request)
def process_request(self, request, spider):
if request.meta.get('dont_merge_cookies', False):
if request.meta.get("dont_merge_cookies", False):
return
cookiejarkey = request.meta.get("cookiejar")
@ -58,12 +58,12 @@ class CookiesMiddleware:
self._process_cookies(cookies, jar=jar, request=request)
# set Cookie header
request.headers.pop('Cookie', None)
request.headers.pop("Cookie", None)
jar.add_cookie_header(request)
self._debug_cookie(request, spider)
def process_response(self, request, response, spider):
if request.meta.get('dont_merge_cookies', False):
if request.meta.get("dont_merge_cookies", False):
return response
# extract cookies from Set-Cookie and drop invalid/expired cookies
@ -78,21 +78,25 @@ class CookiesMiddleware:
def _debug_cookie(self, request, spider):
if self.debug:
cl = [to_unicode(c, errors='replace')
for c in request.headers.getlist('Cookie')]
cl = [
to_unicode(c, errors="replace")
for c in request.headers.getlist("Cookie")
]
if cl:
cookies = "\n".join(f"Cookie: {c}\n" for c in cl)
msg = f"Sending cookies to: {request}\n{cookies}"
logger.debug(msg, extra={'spider': spider})
logger.debug(msg, extra={"spider": spider})
def _debug_set_cookie(self, response, spider):
if self.debug:
cl = [to_unicode(c, errors='replace')
for c in response.headers.getlist('Set-Cookie')]
cl = [
to_unicode(c, errors="replace")
for c in response.headers.getlist("Set-Cookie")
]
if cl:
cookies = "\n".join(f"Set-Cookie: {c}\n" for c in cl)
msg = f"Received cookies from: {response}\n{cookies}"
logger.debug(msg, extra={'spider': spider})
logger.debug(msg, extra={"spider": spider})
def _format_cookie(self, cookie, request):
"""
@ -113,8 +117,11 @@ class CookiesMiddleware:
try:
decoded[key] = cookie[key].decode("utf8")
except UnicodeDecodeError:
logger.warning("Non UTF-8 encoded cookie found in request %s: %s",
request, cookie)
logger.warning(
"Non UTF-8 encoded cookie found in request %s: %s",
request,
cookie,
)
decoded[key] = cookie[key].decode("latin1", errors="replace")
cookie_str = f"{decoded.pop('name')}={decoded.pop('value')}"
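
For reference, a hedged sketch of the request-side cookie features this middleware consumes: per-request cookies, separate cookiejars, and opting out of merging. URLs and values are placeholders.

import scrapy

# cookies can be a dict or a list of dicts with extra fields
req1 = scrapy.Request(
    "https://example.com",
    cookies={"currency": "EUR"},
)
req2 = scrapy.Request(
    "https://example.com/account",
    cookies=[{"name": "session", "value": "abc123", "domain": "example.com", "path": "/"}],
    meta={"cookiejar": "user-1"},   # keep this user's cookies in a separate jar
)
req3 = scrapy.Request(
    "https://example.com/raw",
    meta={"dont_merge_cookies": True},  # skip this middleware entirely
)
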


@ -16,7 +16,7 @@ from scrapy.responsetypes import responsetypes
warn(
'scrapy.downloadermiddlewares.decompression is deprecated',
"scrapy.downloadermiddlewares.decompression is deprecated",
ScrapyDeprecationWarning,
stacklevel=2,
)
@ -26,15 +26,15 @@ logger = logging.getLogger(__name__)
class DecompressionMiddleware:
""" This middleware tries to recognise and extract the possibly compressed
responses that may arrive. """
"""This middleware tries to recognise and extract the possibly compressed
responses that may arrive."""
def __init__(self):
self._formats = {
'tar': self._is_tar,
'zip': self._is_zip,
'gz': self._is_gzip,
'bz2': self._is_bzip2
"tar": self._is_tar,
"zip": self._is_zip,
"gz": self._is_gzip,
"bz2": self._is_bzip2,
}
def _is_tar(self, response):
@ -86,7 +86,10 @@ class DecompressionMiddleware:
for fmt, func in self._formats.items():
new_response = func(response)
if new_response:
logger.debug('Decompressed response with format: %(responsefmt)s',
{'responsefmt': fmt}, extra={'spider': spider})
logger.debug(
"Decompressed response with format: %(responsefmt)s",
{"responsefmt": fmt},
extra={"spider": spider},
)
return new_response
return response


@ -8,13 +8,12 @@ from scrapy.utils.python import without_none_values
class DefaultHeadersMiddleware:
def __init__(self, headers):
self._headers = headers
@classmethod
def from_crawler(cls, crawler):
headers = without_none_values(crawler.settings['DEFAULT_REQUEST_HEADERS'])
headers = without_none_values(crawler.settings["DEFAULT_REQUEST_HEADERS"])
return cls(headers.items())
def process_request(self, request, spider):


@ -8,19 +8,18 @@ from scrapy import signals
class DownloadTimeoutMiddleware:
def __init__(self, timeout=180):
self._timeout = timeout
@classmethod
def from_crawler(cls, crawler):
o = cls(crawler.settings.getfloat('DOWNLOAD_TIMEOUT'))
o = cls(crawler.settings.getfloat("DOWNLOAD_TIMEOUT"))
crawler.signals.connect(o.spider_opened, signal=signals.spider_opened)
return o
def spider_opened(self, spider):
self._timeout = getattr(spider, 'download_timeout', self._timeout)
self._timeout = getattr(spider, "download_timeout", self._timeout)
def process_request(self, request, spider):
if self._timeout:
request.meta.setdefault('download_timeout', self._timeout)
request.meta.setdefault("download_timeout", self._timeout)
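
A short sketch of the two ways the timeout above is usually set; the values are arbitrary.

import scrapy

class SlowSiteSpider(scrapy.Spider):  # hypothetical
    name = "slow_site"
    download_timeout = 30  # picked up in spider_opened() above

    def start_requests(self):
        # a per-request value wins, because setdefault() is used above
        yield scrapy.Request("https://example.com/big-report",
                             meta={"download_timeout": 120})
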


@ -24,27 +24,29 @@ class HttpAuthMiddleware:
return o
def spider_opened(self, spider):
usr = getattr(spider, 'http_user', '')
pwd = getattr(spider, 'http_pass', '')
usr = getattr(spider, "http_user", "")
pwd = getattr(spider, "http_pass", "")
if usr or pwd:
self.auth = basic_auth_header(usr, pwd)
if not hasattr(spider, 'http_auth_domain'):
warnings.warn('Using HttpAuthMiddleware without http_auth_domain is deprecated and can cause security '
'problems if the spider makes requests to several different domains. http_auth_domain '
'will be set to the domain of the first request, please set it to the correct value '
'explicitly.',
category=ScrapyDeprecationWarning)
if not hasattr(spider, "http_auth_domain"):
warnings.warn(
"Using HttpAuthMiddleware without http_auth_domain is deprecated and can cause security "
"problems if the spider makes requests to several different domains. http_auth_domain "
"will be set to the domain of the first request, please set it to the correct value "
"explicitly.",
category=ScrapyDeprecationWarning,
)
self.domain_unset = True
else:
self.domain = spider.http_auth_domain
self.domain_unset = False
def process_request(self, request, spider):
auth = getattr(self, 'auth', None)
if auth and b'Authorization' not in request.headers:
auth = getattr(self, "auth", None)
if auth and b"Authorization" not in request.headers:
domain = urlparse_cached(request).hostname
if self.domain_unset:
self.domain = domain
self.domain_unset = False
if not self.domain or url_is_from_any_domain(request.url, [self.domain]):
request.headers[b'Authorization'] = auth
request.headers[b"Authorization"] = auth
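
A hedged spider sketch for the attributes read above; credentials and domain are placeholders.

import scrapy

class IntranetSpider(scrapy.Spider):  # hypothetical
    name = "intranet"
    http_user = "bob"
    http_pass = "s3cret"
    # restricts the Authorization header to one domain, avoiding the
    # deprecation warning above and accidental credential leaks
    http_auth_domain = "intranet.example.com"
    start_urls = ["https://intranet.example.com/reports"]
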


@ -29,21 +29,31 @@ HttpCacheMiddlewareTV = TypeVar("HttpCacheMiddlewareTV", bound="HttpCacheMiddlew
class HttpCacheMiddleware:
DOWNLOAD_EXCEPTIONS = (defer.TimeoutError, TimeoutError, DNSLookupError,
ConnectionRefusedError, ConnectionDone, ConnectError,
ConnectionLost, TCPTimedOutError, ResponseFailed,
IOError)
DOWNLOAD_EXCEPTIONS = (
defer.TimeoutError,
TimeoutError,
DNSLookupError,
ConnectionRefusedError,
ConnectionDone,
ConnectError,
ConnectionLost,
TCPTimedOutError,
ResponseFailed,
IOError,
)
def __init__(self, settings: Settings, stats: StatsCollector) -> None:
if not settings.getbool('HTTPCACHE_ENABLED'):
if not settings.getbool("HTTPCACHE_ENABLED"):
raise NotConfigured
self.policy = load_object(settings['HTTPCACHE_POLICY'])(settings)
self.storage = load_object(settings['HTTPCACHE_STORAGE'])(settings)
self.ignore_missing = settings.getbool('HTTPCACHE_IGNORE_MISSING')
self.policy = load_object(settings["HTTPCACHE_POLICY"])(settings)
self.storage = load_object(settings["HTTPCACHE_STORAGE"])(settings)
self.ignore_missing = settings.getbool("HTTPCACHE_IGNORE_MISSING")
self.stats = stats
@classmethod
def from_crawler(cls: Type[HttpCacheMiddlewareTV], crawler: Crawler) -> HttpCacheMiddlewareTV:
def from_crawler(
cls: Type[HttpCacheMiddlewareTV], crawler: Crawler
) -> HttpCacheMiddlewareTV:
o = cls(crawler.settings, crawler.stats)
crawler.signals.connect(o.spider_opened, signal=signals.spider_opened)
crawler.signals.connect(o.spider_closed, signal=signals.spider_closed)
@ -56,78 +66,86 @@ class HttpCacheMiddleware:
self.storage.close_spider(spider)
def process_request(self, request: Request, spider: Spider) -> Optional[Response]:
if request.meta.get('dont_cache', False):
if request.meta.get("dont_cache", False):
return None
# Skip uncacheable requests
if not self.policy.should_cache_request(request):
request.meta['_dont_cache'] = True # flag as uncacheable
request.meta["_dont_cache"] = True # flag as uncacheable
return None
# Look for cached response and check if expired
cachedresponse = self.storage.retrieve_response(spider, request)
if cachedresponse is None:
self.stats.inc_value('httpcache/miss', spider=spider)
self.stats.inc_value("httpcache/miss", spider=spider)
if self.ignore_missing:
self.stats.inc_value('httpcache/ignore', spider=spider)
self.stats.inc_value("httpcache/ignore", spider=spider)
raise IgnoreRequest(f"Ignored request not in cache: {request}")
return None # first time request
# Return cached response only if not expired
cachedresponse.flags.append('cached')
cachedresponse.flags.append("cached")
if self.policy.is_cached_response_fresh(cachedresponse, request):
self.stats.inc_value('httpcache/hit', spider=spider)
self.stats.inc_value("httpcache/hit", spider=spider)
return cachedresponse
# Keep a reference to cached response to avoid a second cache lookup on
# process_response hook
request.meta['cached_response'] = cachedresponse
request.meta["cached_response"] = cachedresponse
return None
def process_response(self, request: Request, response: Response, spider: Spider) -> Response:
if request.meta.get('dont_cache', False):
def process_response(
self, request: Request, response: Response, spider: Spider
) -> Response:
if request.meta.get("dont_cache", False):
return response
# Skip cached responses and uncacheable requests
if 'cached' in response.flags or '_dont_cache' in request.meta:
request.meta.pop('_dont_cache', None)
if "cached" in response.flags or "_dont_cache" in request.meta:
request.meta.pop("_dont_cache", None)
return response
# RFC2616 requires origin server to set Date header,
# https://www.w3.org/Protocols/rfc2616/rfc2616-sec14.html#sec14.18
if 'Date' not in response.headers:
response.headers['Date'] = formatdate(usegmt=True)
if "Date" not in response.headers:
response.headers["Date"] = formatdate(usegmt=True)
# Do not validate first-hand responses
cachedresponse = request.meta.pop('cached_response', None)
cachedresponse = request.meta.pop("cached_response", None)
if cachedresponse is None:
self.stats.inc_value('httpcache/firsthand', spider=spider)
self.stats.inc_value("httpcache/firsthand", spider=spider)
self._cache_response(spider, response, request, cachedresponse)
return response
if self.policy.is_cached_response_valid(cachedresponse, response, request):
self.stats.inc_value('httpcache/revalidate', spider=spider)
self.stats.inc_value("httpcache/revalidate", spider=spider)
return cachedresponse
self.stats.inc_value('httpcache/invalidate', spider=spider)
self.stats.inc_value("httpcache/invalidate", spider=spider)
self._cache_response(spider, response, request, cachedresponse)
return response
def process_exception(
self, request: Request, exception: Exception, spider: Spider
) -> Optional[Response]:
cachedresponse = request.meta.pop('cached_response', None)
if cachedresponse is not None and isinstance(exception, self.DOWNLOAD_EXCEPTIONS):
self.stats.inc_value('httpcache/errorrecovery', spider=spider)
cachedresponse = request.meta.pop("cached_response", None)
if cachedresponse is not None and isinstance(
exception, self.DOWNLOAD_EXCEPTIONS
):
self.stats.inc_value("httpcache/errorrecovery", spider=spider)
return cachedresponse
return None
def _cache_response(
self, spider: Spider, response: Response, request: Request, cachedresponse: Optional[Response]
self,
spider: Spider,
response: Response,
request: Request,
cachedresponse: Optional[Response],
) -> None:
if self.policy.should_cache_response(response, request):
self.stats.inc_value('httpcache/store', spider=spider)
self.stats.inc_value("httpcache/store", spider=spider)
self.storage.store_response(spider, request, response)
else:
self.stats.inc_value('httpcache/uncacheable', spider=spider)
self.stats.inc_value("httpcache/uncacheable", spider=spider)
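
A minimal configuration sketch for the cache flow above; setting values are illustrative.

# settings.py (illustrative values)
HTTPCACHE_ENABLED = True
HTTPCACHE_EXPIRATION_SECS = 3600      # used by the default (dummy) policy
HTTPCACHE_DIR = "httpcache"
HTTPCACHE_IGNORE_MISSING = False      # True raises IgnoreRequest on cache misses

# per-request opt-out, checked first in process_request()/process_response():
# Request(url, meta={"dont_cache": True})
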


@ -8,17 +8,19 @@ from scrapy.responsetypes import responsetypes
from scrapy.utils.deprecate import ScrapyDeprecationWarning
from scrapy.utils.gz import gunzip
ACCEPTED_ENCODINGS = [b'gzip', b'deflate']
ACCEPTED_ENCODINGS = [b"gzip", b"deflate"]
try:
import brotli
ACCEPTED_ENCODINGS.append(b'br')
ACCEPTED_ENCODINGS.append(b"br")
except ImportError:
pass
try:
import zstandard
ACCEPTED_ENCODINGS.append(b'zstd')
ACCEPTED_ENCODINGS.append(b"zstd")
except ImportError:
pass
@ -26,12 +28,13 @@ except ImportError:
class HttpCompressionMiddleware:
"""This middleware allows compressed (gzip, deflate) traffic to be
sent/received from web sites"""
def __init__(self, stats=None):
self.stats = stats
@classmethod
def from_crawler(cls, crawler):
if not crawler.settings.getbool('COMPRESSION_ENABLED'):
if not crawler.settings.getbool("COMPRESSION_ENABLED"):
raise NotConfigured
try:
return cls(stats=crawler.stats)
@ -47,21 +50,26 @@ class HttpCompressionMiddleware:
return result
def process_request(self, request, spider):
request.headers.setdefault('Accept-Encoding',
b", ".join(ACCEPTED_ENCODINGS))
request.headers.setdefault("Accept-Encoding", b", ".join(ACCEPTED_ENCODINGS))
def process_response(self, request, response, spider):
if request.method == 'HEAD':
if request.method == "HEAD":
return response
if isinstance(response, Response):
content_encoding = response.headers.getlist('Content-Encoding')
content_encoding = response.headers.getlist("Content-Encoding")
if content_encoding:
encoding = content_encoding.pop()
decoded_body = self._decode(response.body, encoding.lower())
if self.stats:
self.stats.inc_value('httpcompression/response_bytes', len(decoded_body), spider=spider)
self.stats.inc_value('httpcompression/response_count', spider=spider)
self.stats.inc_value(
"httpcompression/response_bytes",
len(decoded_body),
spider=spider,
)
self.stats.inc_value(
"httpcompression/response_count", spider=spider
)
respcls = responsetypes.from_args(
headers=response.headers, url=response.url, body=decoded_body
)
@ -69,18 +77,18 @@ class HttpCompressionMiddleware:
if issubclass(respcls, TextResponse):
# force recalculating the encoding until we make sure the
# responsetypes guessing is reliable
kwargs['encoding'] = None
kwargs["encoding"] = None
response = response.replace(**kwargs)
if not content_encoding:
del response.headers['Content-Encoding']
del response.headers["Content-Encoding"]
return response
def _decode(self, body, encoding):
if encoding == b'gzip' or encoding == b'x-gzip':
if encoding == b"gzip" or encoding == b"x-gzip":
body = gunzip(body)
if encoding == b'deflate':
if encoding == b"deflate":
try:
body = zlib.decompress(body)
except zlib.error:
@ -90,9 +98,9 @@ class HttpCompressionMiddleware:
# http://www.port80software.com/200ok/archive/2005/10/31/868.aspx
# http://www.gzip.org/zlib/zlib_faq.html#faq38
body = zlib.decompress(body, -15)
if encoding == b'br' and b'br' in ACCEPTED_ENCODINGS:
if encoding == b"br" and b"br" in ACCEPTED_ENCODINGS:
body = brotli.decompress(body)
if encoding == b'zstd' and b'zstd' in ACCEPTED_ENCODINGS:
if encoding == b"zstd" and b"zstd" in ACCEPTED_ENCODINGS:
# Using its streaming API since its simple API could handle only cases
# where there is content size data embedded in the frame
reader = zstandard.ZstdDecompressor().stream_reader(io.BytesIO(body))
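
A standalone sketch of the deflate fallback described in the comments above: try a zlib-wrapped stream first, then fall back to a raw deflate stream (wbits=-15). The sample data is generated locally.

import zlib

payload = b"hello deflate " * 64

# a "raw" deflate stream (no zlib header/checksum), as some servers send it
compressor = zlib.compressobj(wbits=-15)
raw_body = compressor.compress(payload) + compressor.flush()

def decode_deflate(body):
    try:
        return zlib.decompress(body)        # normal zlib-wrapped deflate
    except zlib.error:
        return zlib.decompress(body, -15)   # fall back to a raw stream

assert decode_deflate(raw_body) == payload
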


@ -8,8 +8,7 @@ from scrapy.utils.python import to_bytes
class HttpProxyMiddleware:
def __init__(self, auth_encoding='latin-1'):
def __init__(self, auth_encoding="latin-1"):
self.auth_encoding = auth_encoding
self.proxies = {}
for type_, url in getproxies().items():
@ -22,20 +21,20 @@ class HttpProxyMiddleware:
@classmethod
def from_crawler(cls, crawler):
if not crawler.settings.getbool('HTTPPROXY_ENABLED'):
if not crawler.settings.getbool("HTTPPROXY_ENABLED"):
raise NotConfigured
auth_encoding = crawler.settings.get('HTTPPROXY_AUTH_ENCODING')
auth_encoding = crawler.settings.get("HTTPPROXY_AUTH_ENCODING")
return cls(auth_encoding)
def _basic_auth_header(self, username, password):
user_pass = to_bytes(
f'{unquote(username)}:{unquote(password)}',
encoding=self.auth_encoding)
f"{unquote(username)}:{unquote(password)}", encoding=self.auth_encoding
)
return base64.b64encode(user_pass)
def _get_proxy(self, url, orig_type):
proxy_type, user, password, hostport = _parse_proxy(url)
proxy_url = urlunparse((proxy_type or orig_type, hostport, '', '', '', ''))
proxy_url = urlunparse((proxy_type or orig_type, hostport, "", "", "", ""))
if user:
creds = self._basic_auth_header(user, password)
@ -46,39 +45,36 @@ class HttpProxyMiddleware:
def process_request(self, request, spider):
creds, proxy_url = None, None
if 'proxy' in request.meta:
if request.meta['proxy'] is not None:
creds, proxy_url = self._get_proxy(request.meta['proxy'], '')
if "proxy" in request.meta:
if request.meta["proxy"] is not None:
creds, proxy_url = self._get_proxy(request.meta["proxy"], "")
elif self.proxies:
parsed = urlparse_cached(request)
scheme = parsed.scheme
if (
(
# 'no_proxy' is only supported by http schemes
scheme not in ('http', 'https')
or not proxy_bypass(parsed.hostname)
)
and scheme in self.proxies
):
# 'no_proxy' is only supported by http schemes
scheme not in ("http", "https")
or not proxy_bypass(parsed.hostname)
) and scheme in self.proxies:
creds, proxy_url = self.proxies[scheme]
self._set_proxy_and_creds(request, proxy_url, creds)
def _set_proxy_and_creds(self, request, proxy_url, creds):
if proxy_url:
request.meta['proxy'] = proxy_url
elif request.meta.get('proxy') is not None:
request.meta['proxy'] = None
request.meta["proxy"] = proxy_url
elif request.meta.get("proxy") is not None:
request.meta["proxy"] = None
if creds:
request.headers[b'Proxy-Authorization'] = b'Basic ' + creds
request.meta['_auth_proxy'] = proxy_url
elif '_auth_proxy' in request.meta:
if proxy_url != request.meta['_auth_proxy']:
if b'Proxy-Authorization' in request.headers:
del request.headers[b'Proxy-Authorization']
del request.meta['_auth_proxy']
elif b'Proxy-Authorization' in request.headers:
request.headers[b"Proxy-Authorization"] = b"Basic " + creds
request.meta["_auth_proxy"] = proxy_url
elif "_auth_proxy" in request.meta:
if proxy_url != request.meta["_auth_proxy"]:
if b"Proxy-Authorization" in request.headers:
del request.headers[b"Proxy-Authorization"]
del request.meta["_auth_proxy"]
elif b"Proxy-Authorization" in request.headers:
if proxy_url:
request.meta['_auth_proxy'] = proxy_url
request.meta["_auth_proxy"] = proxy_url
else:
del request.headers[b'Proxy-Authorization']
del request.headers[b"Proxy-Authorization"]


@ -17,57 +17,66 @@ def _build_redirect_request(source_request, *, url, **kwargs):
**kwargs,
cookies=None,
)
if 'Cookie' in redirect_request.headers:
if "Cookie" in redirect_request.headers:
source_request_netloc = urlparse_cached(source_request).netloc
redirect_request_netloc = urlparse_cached(redirect_request).netloc
if source_request_netloc != redirect_request_netloc:
del redirect_request.headers['Cookie']
del redirect_request.headers["Cookie"]
return redirect_request
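
A quick check of the cross-domain rule implemented above: the Cookie header survives a same-host redirect but is dropped when the netloc changes. URLs are hypothetical, and the import assumes the file shown here is scrapy/downloadermiddlewares/redirect.py.

import scrapy
from scrapy.downloadermiddlewares.redirect import _build_redirect_request

source = scrapy.Request("https://example.com/a", headers={"Cookie": "session=abc"})

same_host = _build_redirect_request(source, url="https://example.com/b")
cross_host = _build_redirect_request(source, url="https://other.example.org/b")

assert b"Cookie" in same_host.headers
assert b"Cookie" not in cross_host.headers
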
class BaseRedirectMiddleware:
enabled_setting = 'REDIRECT_ENABLED'
enabled_setting = "REDIRECT_ENABLED"
def __init__(self, settings):
if not settings.getbool(self.enabled_setting):
raise NotConfigured
self.max_redirect_times = settings.getint('REDIRECT_MAX_TIMES')
self.priority_adjust = settings.getint('REDIRECT_PRIORITY_ADJUST')
self.max_redirect_times = settings.getint("REDIRECT_MAX_TIMES")
self.priority_adjust = settings.getint("REDIRECT_PRIORITY_ADJUST")
@classmethod
def from_crawler(cls, crawler):
return cls(crawler.settings)
def _redirect(self, redirected, request, spider, reason):
ttl = request.meta.setdefault('redirect_ttl', self.max_redirect_times)
redirects = request.meta.get('redirect_times', 0) + 1
ttl = request.meta.setdefault("redirect_ttl", self.max_redirect_times)
redirects = request.meta.get("redirect_times", 0) + 1
if ttl and redirects <= self.max_redirect_times:
redirected.meta['redirect_times'] = redirects
redirected.meta['redirect_ttl'] = ttl - 1
redirected.meta['redirect_urls'] = request.meta.get('redirect_urls', []) + [request.url]
redirected.meta['redirect_reasons'] = request.meta.get('redirect_reasons', []) + [reason]
redirected.meta["redirect_times"] = redirects
redirected.meta["redirect_ttl"] = ttl - 1
redirected.meta["redirect_urls"] = request.meta.get("redirect_urls", []) + [
request.url
]
redirected.meta["redirect_reasons"] = request.meta.get(
"redirect_reasons", []
) + [reason]
redirected.dont_filter = request.dont_filter
redirected.priority = request.priority + self.priority_adjust
logger.debug("Redirecting (%(reason)s) to %(redirected)s from %(request)s",
{'reason': reason, 'redirected': redirected, 'request': request},
extra={'spider': spider})
logger.debug(
"Redirecting (%(reason)s) to %(redirected)s from %(request)s",
{"reason": reason, "redirected": redirected, "request": request},
extra={"spider": spider},
)
return redirected
logger.debug("Discarding %(request)s: max redirections reached",
{'request': request}, extra={'spider': spider})
logger.debug(
"Discarding %(request)s: max redirections reached",
{"request": request},
extra={"spider": spider},
)
raise IgnoreRequest("max redirections reached")
def _redirect_request_using_get(self, request, redirect_url):
redirect_request = _build_redirect_request(
request,
url=redirect_url,
method='GET',
body='',
method="GET",
body="",
)
redirect_request.headers.pop('Content-Type', None)
redirect_request.headers.pop('Content-Length', None)
redirect_request.headers.pop("Content-Type", None)
redirect_request.headers.pop("Content-Length", None)
return redirect_request
@ -79,25 +88,25 @@ class RedirectMiddleware(BaseRedirectMiddleware):
def process_response(self, request, response, spider):
if (
request.meta.get('dont_redirect', False)
or response.status in getattr(spider, 'handle_httpstatus_list', [])
or response.status in request.meta.get('handle_httpstatus_list', [])
or request.meta.get('handle_httpstatus_all', False)
request.meta.get("dont_redirect", False)
or response.status in getattr(spider, "handle_httpstatus_list", [])
or response.status in request.meta.get("handle_httpstatus_list", [])
or request.meta.get("handle_httpstatus_all", False)
):
return response
allowed_status = (301, 302, 303, 307, 308)
if 'Location' not in response.headers or response.status not in allowed_status:
if "Location" not in response.headers or response.status not in allowed_status:
return response
location = safe_url_string(response.headers['Location'])
if response.headers['Location'].startswith(b'//'):
location = safe_url_string(response.headers["Location"])
if response.headers["Location"].startswith(b"//"):
request_scheme = urlparse(request.url).scheme
location = request_scheme + '://' + location.lstrip('/')
location = request_scheme + "://" + location.lstrip("/")
redirected_url = urljoin(request.url, location)
if response.status in (301, 307, 308) or request.method == 'HEAD':
if response.status in (301, 307, 308) or request.method == "HEAD":
redirected = _build_redirect_request(request, url=redirected_url)
return self._redirect(redirected, request, spider, response.status)
@ -107,25 +116,24 @@ class RedirectMiddleware(BaseRedirectMiddleware):
class MetaRefreshMiddleware(BaseRedirectMiddleware):
enabled_setting = 'METAREFRESH_ENABLED'
enabled_setting = "METAREFRESH_ENABLED"
def __init__(self, settings):
super().__init__(settings)
self._ignore_tags = settings.getlist('METAREFRESH_IGNORE_TAGS')
self._maxdelay = settings.getint('METAREFRESH_MAXDELAY')
self._ignore_tags = settings.getlist("METAREFRESH_IGNORE_TAGS")
self._maxdelay = settings.getint("METAREFRESH_MAXDELAY")
def process_response(self, request, response, spider):
if (
request.meta.get('dont_redirect', False)
or request.method == 'HEAD'
request.meta.get("dont_redirect", False)
or request.method == "HEAD"
or not isinstance(response, HtmlResponse)
):
return response
interval, url = get_meta_refresh(response,
ignore_tags=self._ignore_tags)
interval, url = get_meta_refresh(response, ignore_tags=self._ignore_tags)
if url and interval < self._maxdelay:
redirected = self._redirect_request_using_get(request, url)
return self._redirect(redirected, request, spider, 'meta refresh')
return self._redirect(redirected, request, spider, "meta refresh")
return response


@ -39,11 +39,11 @@ def get_retry_request(
request: Request,
*,
spider: Spider,
reason: Union[str, Exception] = 'unspecified',
reason: Union[str, Exception] = "unspecified",
max_retry_times: Optional[int] = None,
priority_adjust: Optional[int] = None,
logger: Logger = retry_logger,
stats_base_key: str = 'retry',
stats_base_key: str = "retry",
):
"""
Returns a new :class:`~scrapy.Request` object to retry the specified
@ -87,22 +87,22 @@ def get_retry_request(
"""
settings = spider.crawler.settings
stats = spider.crawler.stats
retry_times = request.meta.get('retry_times', 0) + 1
retry_times = request.meta.get("retry_times", 0) + 1
if max_retry_times is None:
max_retry_times = request.meta.get('max_retry_times')
max_retry_times = request.meta.get("max_retry_times")
if max_retry_times is None:
max_retry_times = settings.getint('RETRY_TIMES')
max_retry_times = settings.getint("RETRY_TIMES")
if retry_times <= max_retry_times:
logger.debug(
"Retrying %(request)s (failed %(retry_times)d times): %(reason)s",
{'request': request, 'retry_times': retry_times, 'reason': reason},
extra={'spider': spider}
{"request": request, "retry_times": retry_times, "reason": reason},
extra={"spider": spider},
)
new_request: Request = request.copy()
new_request.meta['retry_times'] = retry_times
new_request.meta["retry_times"] = retry_times
new_request.dont_filter = True
if priority_adjust is None:
priority_adjust = settings.getint('RETRY_PRIORITY_ADJUST')
priority_adjust = settings.getint("RETRY_PRIORITY_ADJUST")
new_request.priority = request.priority + priority_adjust
if callable(reason):
@ -110,15 +110,14 @@ def get_retry_request(
if isinstance(reason, Exception):
reason = global_object_name(reason.__class__)
stats.inc_value(f'{stats_base_key}/count')
stats.inc_value(f'{stats_base_key}/reason_count/{reason}')
stats.inc_value(f"{stats_base_key}/count")
stats.inc_value(f"{stats_base_key}/reason_count/{reason}")
return new_request
stats.inc_value(f'{stats_base_key}/max_reached')
stats.inc_value(f"{stats_base_key}/max_reached")
logger.error(
"Gave up retrying %(request)s (failed %(retry_times)d times): "
"%(reason)s",
{'request': request, 'retry_times': retry_times, 'reason': reason},
extra={'spider': spider},
"Gave up retrying %(request)s (failed %(retry_times)d times): " "%(reason)s",
{"request": request, "retry_times": retry_times, "reason": reason},
extra={"spider": spider},
)
return None
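
A hedged callback sketch using the helper above to retry from spider code; the "empty listing page" condition is illustrative.

from scrapy.downloadermiddlewares.retry import get_retry_request

def parse(self, response):  # inside a Spider subclass
    if not response.css("article"):          # illustrative condition
        new_request = get_retry_request(
            response.request,
            spider=self,
            reason="empty listing page",
        )
        if new_request:                       # None once the retry limit is reached
            yield new_request
        return
    yield {"url": response.url}
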
@ -127,24 +126,35 @@ class RetryMiddleware:
# IOError is raised by the HttpCompression middleware when trying to
# decompress an empty response
EXCEPTIONS_TO_RETRY = (defer.TimeoutError, TimeoutError, DNSLookupError,
ConnectionRefusedError, ConnectionDone, ConnectError,
ConnectionLost, TCPTimedOutError, ResponseFailed,
IOError, TunnelError)
EXCEPTIONS_TO_RETRY = (
defer.TimeoutError,
TimeoutError,
DNSLookupError,
ConnectionRefusedError,
ConnectionDone,
ConnectError,
ConnectionLost,
TCPTimedOutError,
ResponseFailed,
IOError,
TunnelError,
)
def __init__(self, settings):
if not settings.getbool('RETRY_ENABLED'):
if not settings.getbool("RETRY_ENABLED"):
raise NotConfigured
self.max_retry_times = settings.getint('RETRY_TIMES')
self.retry_http_codes = set(int(x) for x in settings.getlist('RETRY_HTTP_CODES'))
self.priority_adjust = settings.getint('RETRY_PRIORITY_ADJUST')
self.max_retry_times = settings.getint("RETRY_TIMES")
self.retry_http_codes = set(
int(x) for x in settings.getlist("RETRY_HTTP_CODES")
)
self.priority_adjust = settings.getint("RETRY_PRIORITY_ADJUST")
@classmethod
def from_crawler(cls, crawler):
return cls(crawler.settings)
def process_response(self, request, response, spider):
if request.meta.get('dont_retry', False):
if request.meta.get("dont_retry", False):
return response
if response.status in self.retry_http_codes:
reason = response_status_message(response.status)
@ -152,15 +162,14 @@ class RetryMiddleware:
return response
def process_exception(self, request, exception, spider):
if (
isinstance(exception, self.EXCEPTIONS_TO_RETRY)
and not request.meta.get('dont_retry', False)
if isinstance(exception, self.EXCEPTIONS_TO_RETRY) and not request.meta.get(
"dont_retry", False
):
return self._retry(request, exception, spider)
def _retry(self, request, reason, spider):
max_retry_times = request.meta.get('max_retry_times', self.max_retry_times)
priority_adjust = request.meta.get('priority_adjust', self.priority_adjust)
max_retry_times = request.meta.get("max_retry_times", self.max_retry_times)
priority_adjust = request.meta.get("priority_adjust", self.priority_adjust)
return get_retry_request(
request,
reason=reason,


@ -20,23 +20,23 @@ class RobotsTxtMiddleware:
DOWNLOAD_PRIORITY = 1000
def __init__(self, crawler):
if not crawler.settings.getbool('ROBOTSTXT_OBEY'):
if not crawler.settings.getbool("ROBOTSTXT_OBEY"):
raise NotConfigured
self._default_useragent = crawler.settings.get('USER_AGENT', 'Scrapy')
self._robotstxt_useragent = crawler.settings.get('ROBOTSTXT_USER_AGENT', None)
self._default_useragent = crawler.settings.get("USER_AGENT", "Scrapy")
self._robotstxt_useragent = crawler.settings.get("ROBOTSTXT_USER_AGENT", None)
self.crawler = crawler
self._parsers = {}
self._parserimpl = load_object(crawler.settings.get('ROBOTSTXT_PARSER'))
self._parserimpl = load_object(crawler.settings.get("ROBOTSTXT_PARSER"))
# check if parser dependencies are met, this should throw an error otherwise.
self._parserimpl.from_crawler(self.crawler, b'')
self._parserimpl.from_crawler(self.crawler, b"")
@classmethod
def from_crawler(cls, crawler):
return cls(crawler)
def process_request(self, request, spider):
if request.meta.get('dont_obey_robotstxt'):
if request.meta.get("dont_obey_robotstxt"):
return
d = maybeDeferred(self.robot_parser, request, spider)
d.addCallback(self.process_request_2, request, spider)
@ -48,11 +48,14 @@ class RobotsTxtMiddleware:
useragent = self._robotstxt_useragent
if not useragent:
useragent = request.headers.get(b'User-Agent', self._default_useragent)
useragent = request.headers.get(b"User-Agent", self._default_useragent)
if not rp.allowed(request.url, useragent):
logger.debug("Forbidden by robots.txt: %(request)s",
{'request': request}, extra={'spider': spider})
self.crawler.stats.inc_value('robotstxt/forbidden')
logger.debug(
"Forbidden by robots.txt: %(request)s",
{"request": request},
extra={"spider": spider},
)
self.crawler.stats.inc_value("robotstxt/forbidden")
raise IgnoreRequest("Forbidden by robots.txt")
def robot_parser(self, request, spider):
@ -65,13 +68,13 @@ class RobotsTxtMiddleware:
robotsreq = Request(
robotsurl,
priority=self.DOWNLOAD_PRIORITY,
meta={'dont_obey_robotstxt': True}
meta={"dont_obey_robotstxt": True},
)
dfd = self.crawler.engine.download(robotsreq)
dfd.addCallback(self._parse_robots, netloc, spider)
dfd.addErrback(self._logerror, robotsreq, spider)
dfd.addErrback(self._robots_error, netloc)
self.crawler.stats.inc_value('robotstxt/request_count')
self.crawler.stats.inc_value("robotstxt/request_count")
if isinstance(self._parsers[netloc], Deferred):
d = Deferred()
@ -79,21 +82,26 @@ class RobotsTxtMiddleware:
def cb(result):
d.callback(result)
return result
self._parsers[netloc].addCallback(cb)
return d
return self._parsers[netloc]
def _logerror(self, failure, request, spider):
if failure.type is not IgnoreRequest:
logger.error("Error downloading %(request)s: %(f_exception)s",
{'request': request, 'f_exception': failure.value},
exc_info=failure_to_exc_info(failure),
extra={'spider': spider})
logger.error(
"Error downloading %(request)s: %(f_exception)s",
{"request": request, "f_exception": failure.value},
exc_info=failure_to_exc_info(failure),
extra={"spider": spider},
)
return failure
def _parse_robots(self, response, netloc, spider):
self.crawler.stats.inc_value('robotstxt/response_count')
self.crawler.stats.inc_value(f'robotstxt/response_status_count/{response.status}')
self.crawler.stats.inc_value("robotstxt/response_count")
self.crawler.stats.inc_value(
f"robotstxt/response_status_count/{response.status}"
)
rp = self._parserimpl.from_crawler(self.crawler, response.body)
rp_dfd = self._parsers[netloc]
self._parsers[netloc] = rp
@ -101,7 +109,7 @@ class RobotsTxtMiddleware:
def _robots_error(self, failure, netloc):
if failure.type is not IgnoreRequest:
key = f'robotstxt/exception_count/{failure.type}'
key = f"robotstxt/exception_count/{failure.type}"
self.crawler.stats.inc_value(key)
rp_dfd = self._parsers[netloc]
self._parsers[netloc] = None
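RobotsTxtMiddleware is driven entirely by settings; a short sketch of a project settings module that enables it and pins the agent string used for robots.txt matching (values are illustrative):

# settings.py (illustrative values)
ROBOTSTXT_OBEY = True  # without this the middleware raises NotConfigured
ROBOTSTXT_USER_AGENT = "example-bot"  # agent matched against robots.txt rules
USER_AGENT = "example-bot (+https://example.com)"  # fallback when the above is unset
ROBOTSTXT_PARSER = "scrapy.robotstxt.ProtegoRobotParser"  # default parser implementation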

View File

@ -11,40 +11,50 @@ def get_header_size(headers):
if isinstance(value, (list, tuple)):
for v in value:
size += len(b": ") + len(key) + len(v)
return size + len(b'\r\n') * (len(headers.keys()) - 1)
return size + len(b"\r\n") * (len(headers.keys()) - 1)
def get_status_size(response_status):
return len(to_bytes(http.RESPONSES.get(response_status, b''))) + 15
return len(to_bytes(http.RESPONSES.get(response_status, b""))) + 15
# resp.status + b"\r\n" + b"HTTP/1.1 <100-599> "
class DownloaderStats:
def __init__(self, stats):
self.stats = stats
@classmethod
def from_crawler(cls, crawler):
if not crawler.settings.getbool('DOWNLOADER_STATS'):
if not crawler.settings.getbool("DOWNLOADER_STATS"):
raise NotConfigured
return cls(crawler.stats)
def process_request(self, request, spider):
self.stats.inc_value('downloader/request_count', spider=spider)
self.stats.inc_value(f'downloader/request_method_count/{request.method}', spider=spider)
self.stats.inc_value("downloader/request_count", spider=spider)
self.stats.inc_value(
f"downloader/request_method_count/{request.method}", spider=spider
)
reqlen = len(request_httprepr(request))
self.stats.inc_value('downloader/request_bytes', reqlen, spider=spider)
self.stats.inc_value("downloader/request_bytes", reqlen, spider=spider)
def process_response(self, request, response, spider):
self.stats.inc_value('downloader/response_count', spider=spider)
self.stats.inc_value(f'downloader/response_status_count/{response.status}', spider=spider)
reslen = len(response.body) + get_header_size(response.headers) + get_status_size(response.status) + 4
self.stats.inc_value("downloader/response_count", spider=spider)
self.stats.inc_value(
f"downloader/response_status_count/{response.status}", spider=spider
)
reslen = (
len(response.body)
+ get_header_size(response.headers)
+ get_status_size(response.status)
+ 4
)
# response.body + b"\r\n" + response.headers + b"\r\n" + response.status
self.stats.inc_value('downloader/response_bytes', reslen, spider=spider)
self.stats.inc_value("downloader/response_bytes", reslen, spider=spider)
return response
def process_exception(self, request, exception, spider):
ex_class = global_object_name(exception.__class__)
self.stats.inc_value('downloader/exception_count', spider=spider)
self.stats.inc_value(f'downloader/exception_type_count/{ex_class}', spider=spider)
self.stats.inc_value("downloader/exception_count", spider=spider)
self.stats.inc_value(
f"downloader/exception_type_count/{ex_class}", spider=spider
)
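The keys written by DownloaderStats can be read back from crawler.stats; a small sketch of a spider that logs a few of them in its closed() hook (spider name and URL are placeholders):

from scrapy import Spider


class StatsAwareSpider(Spider):
    name = "stats_example"  # hypothetical spider, for illustration only
    start_urls = ["https://example.com/"]

    def parse(self, response):
        yield {"url": response.url}

    def closed(self, reason):
        stats = self.crawler.stats
        self.logger.info(
            "requests=%s responses=%s bytes=%s",
            stats.get_value("downloader/request_count"),
            stats.get_value("downloader/response_count"),
            stats.get_value("downloader/response_bytes"),
        )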

View File

@ -6,18 +6,18 @@ from scrapy import signals
class UserAgentMiddleware:
"""This middleware allows spiders to override the user_agent"""
def __init__(self, user_agent='Scrapy'):
def __init__(self, user_agent="Scrapy"):
self.user_agent = user_agent
@classmethod
def from_crawler(cls, crawler):
o = cls(crawler.settings['USER_AGENT'])
o = cls(crawler.settings["USER_AGENT"])
crawler.signals.connect(o.spider_opened, signal=signals.spider_opened)
return o
def spider_opened(self, spider):
self.user_agent = getattr(spider, 'user_agent', self.user_agent)
self.user_agent = getattr(spider, "user_agent", self.user_agent)
def process_request(self, request, spider):
if self.user_agent:
request.headers.setdefault(b'User-Agent', self.user_agent)
request.headers.setdefault(b"User-Agent", self.user_agent)
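Because spider_opened() rebinds self.user_agent from the spider, the agent string can come from settings or from a per-spider attribute; a brief sketch with illustrative values:

# settings.py
USER_AGENT = "example-crawler/1.0 (+https://example.com)"

# or per spider, which takes precedence once spider_opened() fires:
from scrapy import Spider


class PoliteSpider(Spider):
    name = "polite_example"  # hypothetical name
    user_agent = "polite-example-bot/0.1"
    start_urls = ["https://example.com/"]

    def parse(self, response):
        yield {"ua_sent": response.request.headers.get(b"User-Agent")}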

View File

@ -18,7 +18,9 @@ BaseDupeFilterTV = TypeVar("BaseDupeFilterTV", bound="BaseDupeFilter")
class BaseDupeFilter:
@classmethod
def from_settings(cls: Type[BaseDupeFilterTV], settings: BaseSettings) -> BaseDupeFilterTV:
def from_settings(
cls: Type[BaseDupeFilterTV], settings: BaseSettings
) -> BaseDupeFilterTV:
return cls()
def request_seen(self, request: Request) -> bool:
@ -55,13 +57,15 @@ class RFPDupeFilter(BaseDupeFilter):
self.debug = debug
self.logger = logging.getLogger(__name__)
if path:
self.file = Path(path, 'requests.seen').open('a+', encoding="utf-8")
self.file = Path(path, "requests.seen").open("a+", encoding="utf-8")
self.file.seek(0)
self.fingerprints.update(x.rstrip() for x in self.file)
@classmethod
def from_settings(cls: Type[RFPDupeFilterTV], settings: BaseSettings, *, fingerprinter=None) -> RFPDupeFilterTV:
debug = settings.getbool('DUPEFILTER_DEBUG')
def from_settings(
cls: Type[RFPDupeFilterTV], settings: BaseSettings, *, fingerprinter=None
) -> RFPDupeFilterTV:
debug = settings.getbool("DUPEFILTER_DEBUG")
try:
return cls(job_dir(settings), debug, fingerprinter=fingerprinter)
except TypeError:
@ -100,7 +104,7 @@ class RFPDupeFilter(BaseDupeFilter):
return True
self.fingerprints.add(fp)
if self.file:
self.file.write(fp + '\n')
self.file.write(fp + "\n")
return False
def request_fingerprint(self, request: Request) -> str:
@ -113,13 +117,15 @@ class RFPDupeFilter(BaseDupeFilter):
def log(self, request: Request, spider: Spider) -> None:
if self.debug:
msg = "Filtered duplicate request: %(request)s (referer: %(referer)s)"
args = {'request': request, 'referer': referer_str(request)}
self.logger.debug(msg, args, extra={'spider': spider})
args = {"request": request, "referer": referer_str(request)}
self.logger.debug(msg, args, extra={"spider": spider})
elif self.logdupes:
msg = ("Filtered duplicate request: %(request)s"
" - no more duplicates will be shown"
" (see DUPEFILTER_DEBUG to show all duplicates)")
self.logger.debug(msg, {'request': request}, extra={'spider': spider})
msg = (
"Filtered duplicate request: %(request)s"
" - no more duplicates will be shown"
" (see DUPEFILTER_DEBUG to show all duplicates)"
)
self.logger.debug(msg, {"request": request}, extra={"spider": spider})
self.logdupes = False
spider.crawler.stats.inc_value('dupefilter/filtered', spider=spider)
spider.crawler.stats.inc_value("dupefilter/filtered", spider=spider)
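A custom dupe filter plugs in through the DUPEFILTER_CLASS setting; a minimal sketch that subclasses RFPDupeFilter just to log every duplicate regardless of DUPEFILTER_DEBUG (the module and class names are made up for the example):

# myproject/dupefilters.py  (hypothetical module)
from scrapy.dupefilters import RFPDupeFilter


class VerboseDupeFilter(RFPDupeFilter):
    """Log every filtered duplicate instead of only the first one."""

    def log(self, request, spider):
        self.logger.debug(
            "Filtered duplicate request: %(request)s",
            {"request": request},
            extra={"spider": spider},
        )
        spider.crawler.stats.inc_value("dupefilter/filtered", spider=spider)


# settings.py
DUPEFILTER_CLASS = "myproject.dupefilters.VerboseDupeFilter"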

View File

@ -10,6 +10,7 @@ new exceptions here without documenting them there.
class NotConfigured(Exception):
"""Indicates a missing configuration situation"""
pass
@ -18,6 +19,7 @@ class _InvalidOutput(TypeError):
Indicates an invalid value has been returned by a middleware's processing method.
Internal and undocumented, it should not be raised or caught by user code.
"""
pass
@ -30,13 +32,14 @@ class IgnoreRequest(Exception):
class DontCloseSpider(Exception):
"""Request the spider not to be closed yet"""
pass
class CloseSpider(Exception):
"""Raise this from callbacks to request the spider to be closed"""
def __init__(self, reason='cancelled'):
def __init__(self, reason="cancelled"):
super().__init__()
self.reason = reason
@ -58,11 +61,13 @@ class StopDownload(Exception):
class DropItem(Exception):
"""Drop item from the item pipeline"""
pass
class NotSupported(Exception):
"""Indicates a feature or method is not supported"""
pass
@ -73,7 +78,7 @@ class UsageError(Exception):
"""To indicate a command-line usage error"""
def __init__(self, *a, **kw):
self.print_help = kw.pop('print_help', True)
self.print_help = kw.pop("print_help", True)
super().__init__(*a, **kw)
@ -81,9 +86,11 @@ class ScrapyDeprecationWarning(Warning):
"""Warning category for deprecated features, since the default
DeprecationWarning is silenced on Python 2.7+
"""
pass
class ContractFail(AssertionError):
"""Error raised in case of a failing contract"""
pass
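These exceptions are meant to be raised from user code; a short sketch of the two most common ones, DropItem in a pipeline and CloseSpider in a callback (field names and thresholds are arbitrary):

from scrapy import Spider
from scrapy.exceptions import CloseSpider, DropItem


class PriceValidationPipeline:
    def process_item(self, item, spider):
        if not item.get("price"):
            raise DropItem("missing price")  # counted under item_dropped_count
        return item


class BoundedSpider(Spider):
    name = "bounded_example"  # hypothetical spider
    start_urls = ["https://example.com/"]

    def parse(self, response):
        if response.status == 403:
            raise CloseSpider(reason="blocked")  # reason ends up in finish_reason
        yield {"url": response.url}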

View File

@ -19,13 +19,19 @@ from scrapy.utils.python import is_listlike, to_bytes, to_unicode
from scrapy.utils.serialize import ScrapyJSONEncoder
__all__ = ['BaseItemExporter', 'PprintItemExporter', 'PickleItemExporter',
'CsvItemExporter', 'XmlItemExporter', 'JsonLinesItemExporter',
'JsonItemExporter', 'MarshalItemExporter']
__all__ = [
"BaseItemExporter",
"PprintItemExporter",
"PickleItemExporter",
"CsvItemExporter",
"XmlItemExporter",
"JsonLinesItemExporter",
"JsonItemExporter",
"MarshalItemExporter",
]
class BaseItemExporter:
def __init__(self, *, dont_fail=False, **kwargs):
self._kwargs = kwargs
self._configure(kwargs, dont_fail=dont_fail)
@ -35,10 +41,10 @@ class BaseItemExporter:
If dont_fail is set, it won't raise an exception on unexpected options
(useful when using keyword arguments in subclasses' ``__init__`` methods)
"""
self.encoding = options.pop('encoding', None)
self.fields_to_export = options.pop('fields_to_export', None)
self.export_empty_fields = options.pop('export_empty_fields', False)
self.indent = options.pop('indent', None)
self.encoding = options.pop("encoding", None)
self.fields_to_export = options.pop("fields_to_export", None)
self.export_empty_fields = options.pop("export_empty_fields", False)
self.indent = options.pop("indent", None)
if not dont_fail and options:
raise TypeError(f"Unexpected options: {', '.join(options.keys())}")
@ -46,7 +52,7 @@ class BaseItemExporter:
raise NotImplementedError
def serialize_field(self, field, name, value):
serializer = field.get('serializer', lambda x: x)
serializer = field.get("serializer", lambda x: x)
return serializer(value)
def start_exporting(self):
@ -74,8 +80,7 @@ class BaseItemExporter:
field_iter = self.fields_to_export.items()
else:
field_iter = (
(x, y) for x, y in self.fields_to_export.items()
if x in item
(x, y) for x, y in self.fields_to_export.items() if x in item
)
else:
if include_empty:
@ -98,36 +103,36 @@ class BaseItemExporter:
class JsonLinesItemExporter(BaseItemExporter):
def __init__(self, file, **kwargs):
super().__init__(dont_fail=True, **kwargs)
self.file = file
self._kwargs.setdefault('ensure_ascii', not self.encoding)
self._kwargs.setdefault("ensure_ascii", not self.encoding)
self.encoder = ScrapyJSONEncoder(**self._kwargs)
def export_item(self, item):
itemdict = dict(self._get_serialized_fields(item))
data = self.encoder.encode(itemdict) + '\n'
data = self.encoder.encode(itemdict) + "\n"
self.file.write(to_bytes(data, self.encoding))
class JsonItemExporter(BaseItemExporter):
def __init__(self, file, **kwargs):
super().__init__(dont_fail=True, **kwargs)
self.file = file
# there is a small difference between the behaviour of JsonItemExporter.indent
# and ScrapyJSONEncoder.indent. ScrapyJSONEncoder.indent=None is needed to prevent
# the addition of newlines everywhere
json_indent = self.indent if self.indent is not None and self.indent > 0 else None
self._kwargs.setdefault('indent', json_indent)
self._kwargs.setdefault('ensure_ascii', not self.encoding)
json_indent = (
self.indent if self.indent is not None and self.indent > 0 else None
)
self._kwargs.setdefault("indent", json_indent)
self._kwargs.setdefault("ensure_ascii", not self.encoding)
self.encoder = ScrapyJSONEncoder(**self._kwargs)
self.first_item = True
def _beautify_newline(self):
if self.indent is not None:
self.file.write(b'\n')
self.file.write(b"\n")
def start_exporting(self):
self.file.write(b"[")
@ -141,7 +146,7 @@ class JsonItemExporter(BaseItemExporter):
if self.first_item:
self.first_item = False
else:
self.file.write(b',')
self.file.write(b",")
self._beautify_newline()
itemdict = dict(self._get_serialized_fields(item))
data = self.encoder.encode(itemdict)
@ -149,22 +154,21 @@ class JsonItemExporter(BaseItemExporter):
class XmlItemExporter(BaseItemExporter):
def __init__(self, file, **kwargs):
self.item_element = kwargs.pop('item_element', 'item')
self.root_element = kwargs.pop('root_element', 'items')
self.item_element = kwargs.pop("item_element", "item")
self.root_element = kwargs.pop("root_element", "items")
super().__init__(**kwargs)
if not self.encoding:
self.encoding = 'utf-8'
self.encoding = "utf-8"
self.xg = XMLGenerator(file, encoding=self.encoding)
def _beautify_newline(self, new_item=False):
if self.indent is not None and (self.indent > 0 or new_item):
self.xg.characters('\n')
self.xg.characters("\n")
def _beautify_indent(self, depth=1):
if self.indent:
self.xg.characters(' ' * self.indent * depth)
self.xg.characters(" " * self.indent * depth)
def start_exporting(self):
self.xg.startDocument()
@ -175,7 +179,7 @@ class XmlItemExporter(BaseItemExporter):
self._beautify_indent(depth=1)
self.xg.startElement(self.item_element, {})
self._beautify_newline()
for name, value in self._get_serialized_fields(item, default_value=''):
for name, value in self._get_serialized_fields(item, default_value=""):
self._export_xml_field(name, value, depth=2)
self._beautify_indent(depth=1)
self.xg.endElement(self.item_element)
@ -188,7 +192,7 @@ class XmlItemExporter(BaseItemExporter):
def _export_xml_field(self, name, serialized_value, depth):
self._beautify_indent(depth=depth)
self.xg.startElement(name, {})
if hasattr(serialized_value, 'items'):
if hasattr(serialized_value, "items"):
self._beautify_newline()
for subname, value in serialized_value.items():
self._export_xml_field(subname, value, depth=depth + 1)
@ -196,7 +200,7 @@ class XmlItemExporter(BaseItemExporter):
elif is_listlike(serialized_value):
self._beautify_newline()
for value in serialized_value:
self._export_xml_field('value', value, depth=depth + 1)
self._export_xml_field("value", value, depth=depth + 1)
self._beautify_indent(depth=depth)
elif isinstance(serialized_value, str):
self.xg.characters(serialized_value)
@ -207,18 +211,24 @@ class XmlItemExporter(BaseItemExporter):
class CsvItemExporter(BaseItemExporter):
def __init__(self, file, include_headers_line=True, join_multivalued=',', errors=None, **kwargs):
def __init__(
self,
file,
include_headers_line=True,
join_multivalued=",",
errors=None,
**kwargs,
):
super().__init__(dont_fail=True, **kwargs)
if not self.encoding:
self.encoding = 'utf-8'
self.encoding = "utf-8"
self.include_headers_line = include_headers_line
self.stream = io.TextIOWrapper(
file,
line_buffering=False,
write_through=True,
encoding=self.encoding,
newline='', # Windows needs this https://github.com/scrapy/scrapy/issues/3034
newline="", # Windows needs this https://github.com/scrapy/scrapy/issues/3034
errors=errors,
)
self.csv_writer = csv.writer(self.stream, **self._kwargs)
@ -226,7 +236,7 @@ class CsvItemExporter(BaseItemExporter):
self._join_multivalued = join_multivalued
def serialize_field(self, field, name, value):
serializer = field.get('serializer', self._join_if_needed)
serializer = field.get("serializer", self._join_if_needed)
return serializer(value)
def _join_if_needed(self, value):
@ -242,8 +252,7 @@ class CsvItemExporter(BaseItemExporter):
self._headers_not_written = False
self._write_headers_and_set_fields_to_export(item)
fields = self._get_serialized_fields(item, default_value='',
include_empty=True)
fields = self._get_serialized_fields(item, default_value="", include_empty=True)
values = list(self._build_row(x for _, x in fields))
self.csv_writer.writerow(values)
@ -268,7 +277,6 @@ class CsvItemExporter(BaseItemExporter):
class PickleItemExporter(BaseItemExporter):
def __init__(self, file, protocol=4, **kwargs):
super().__init__(**kwargs)
self.file = file
@ -297,14 +305,13 @@ class MarshalItemExporter(BaseItemExporter):
class PprintItemExporter(BaseItemExporter):
def __init__(self, file, **kwargs):
super().__init__(**kwargs)
self.file = file
def export_item(self, item):
itemdict = dict(self._get_serialized_fields(item))
self.file.write(to_bytes(pprint.pformat(itemdict) + '\n'))
self.file.write(to_bytes(pprint.pformat(itemdict) + "\n"))
class PythonItemExporter(BaseItemExporter):
@ -318,17 +325,18 @@ class PythonItemExporter(BaseItemExporter):
"""
def _configure(self, options, dont_fail=False):
self.binary = options.pop('binary', True)
self.binary = options.pop("binary", True)
super()._configure(options, dont_fail)
if self.binary:
warnings.warn(
"PythonItemExporter will drop support for binary export in the future",
ScrapyDeprecationWarning)
ScrapyDeprecationWarning,
)
if not self.encoding:
self.encoding = 'utf-8'
self.encoding = "utf-8"
def serialize_field(self, field, name, value):
serializer = field.get('serializer', self._serialize_value)
serializer = field.get("serializer", self._serialize_value)
return serializer(value)
def _serialize_value(self, value):

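The exporters above share the same three-call lifecycle; a minimal sketch that writes two items to a CSV file with CsvItemExporter (the file name and fields are arbitrary):

from scrapy.exporters import CsvItemExporter

with open("items.csv", "wb") as f:  # exporters expect a binary file object
    exporter = CsvItemExporter(f, fields_to_export=["name", "price"])
    exporter.start_exporting()
    exporter.export_item({"name": "widget", "price": "9.99"})
    exporter.export_item({"name": "gadget", "price": "19.99"})
    exporter.finish_exporting()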
View File

@ -9,8 +9,8 @@ from scrapy.utils.conf import build_component_list
class ExtensionManager(MiddlewareManager):
component_name = 'extension'
component_name = "extension"
@classmethod
def _get_mwlist_from_settings(cls, settings):
return build_component_list(settings.getwithbase('EXTENSIONS'))
return build_component_list(settings.getwithbase("EXTENSIONS"))
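ExtensionManager builds its list from the EXTENSIONS setting merged with EXTENSIONS_BASE; a tiny settings sketch that disables one default extension and enables a custom one (the custom path is hypothetical):

# settings.py
EXTENSIONS = {
    "scrapy.extensions.telnet.TelnetConsole": None,  # None disables a base extension
    "myproject.extensions.MyExtension": 500,  # hypothetical extension, order value 500
}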

View File

@ -11,15 +11,14 @@ from scrapy.exceptions import NotConfigured
class CloseSpider:
def __init__(self, crawler):
self.crawler = crawler
self.close_on = {
'timeout': crawler.settings.getfloat('CLOSESPIDER_TIMEOUT'),
'itemcount': crawler.settings.getint('CLOSESPIDER_ITEMCOUNT'),
'pagecount': crawler.settings.getint('CLOSESPIDER_PAGECOUNT'),
'errorcount': crawler.settings.getint('CLOSESPIDER_ERRORCOUNT'),
"timeout": crawler.settings.getfloat("CLOSESPIDER_TIMEOUT"),
"itemcount": crawler.settings.getint("CLOSESPIDER_ITEMCOUNT"),
"pagecount": crawler.settings.getint("CLOSESPIDER_PAGECOUNT"),
"errorcount": crawler.settings.getint("CLOSESPIDER_ERRORCOUNT"),
}
if not any(self.close_on.values()):
@ -27,13 +26,13 @@ class CloseSpider:
self.counter = defaultdict(int)
if self.close_on.get('errorcount'):
if self.close_on.get("errorcount"):
crawler.signals.connect(self.error_count, signal=signals.spider_error)
if self.close_on.get('pagecount'):
if self.close_on.get("pagecount"):
crawler.signals.connect(self.page_count, signal=signals.response_received)
if self.close_on.get('timeout'):
if self.close_on.get("timeout"):
crawler.signals.connect(self.spider_opened, signal=signals.spider_opened)
if self.close_on.get('itemcount'):
if self.close_on.get("itemcount"):
crawler.signals.connect(self.item_scraped, signal=signals.item_scraped)
crawler.signals.connect(self.spider_closed, signal=signals.spider_closed)
@ -42,27 +41,31 @@ class CloseSpider:
return cls(crawler)
def error_count(self, failure, response, spider):
self.counter['errorcount'] += 1
if self.counter['errorcount'] == self.close_on['errorcount']:
self.crawler.engine.close_spider(spider, 'closespider_errorcount')
self.counter["errorcount"] += 1
if self.counter["errorcount"] == self.close_on["errorcount"]:
self.crawler.engine.close_spider(spider, "closespider_errorcount")
def page_count(self, response, request, spider):
self.counter['pagecount'] += 1
if self.counter['pagecount'] == self.close_on['pagecount']:
self.crawler.engine.close_spider(spider, 'closespider_pagecount')
self.counter["pagecount"] += 1
if self.counter["pagecount"] == self.close_on["pagecount"]:
self.crawler.engine.close_spider(spider, "closespider_pagecount")
def spider_opened(self, spider):
from twisted.internet import reactor
self.task = reactor.callLater(self.close_on['timeout'],
self.crawler.engine.close_spider, spider,
reason='closespider_timeout')
self.task = reactor.callLater(
self.close_on["timeout"],
self.crawler.engine.close_spider,
spider,
reason="closespider_timeout",
)
def item_scraped(self, item, spider):
self.counter['itemcount'] += 1
if self.counter['itemcount'] == self.close_on['itemcount']:
self.crawler.engine.close_spider(spider, 'closespider_itemcount')
self.counter["itemcount"] += 1
if self.counter["itemcount"] == self.close_on["itemcount"]:
self.crawler.engine.close_spider(spider, "closespider_itemcount")
def spider_closed(self, spider):
task = getattr(self, 'task', False)
task = getattr(self, "task", False)
if task and task.active():
task.cancel()
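The CloseSpider extension activates only when at least one of its thresholds is non-zero; a short settings sketch with arbitrary numbers:

# settings.py
CLOSESPIDER_TIMEOUT = 3600      # seconds; closes with reason "closespider_timeout"
CLOSESPIDER_ITEMCOUNT = 1000    # items scraped
CLOSESPIDER_PAGECOUNT = 5000    # responses received
CLOSESPIDER_ERRORCOUNT = 10     # spider errors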

View File

@ -7,7 +7,6 @@ from scrapy import signals
class CoreStats:
def __init__(self, stats):
self.stats = stats
self.start_time = None
@ -24,23 +23,25 @@ class CoreStats:
def spider_opened(self, spider):
self.start_time = datetime.utcnow()
self.stats.set_value('start_time', self.start_time, spider=spider)
self.stats.set_value("start_time", self.start_time, spider=spider)
def spider_closed(self, spider, reason):
finish_time = datetime.utcnow()
elapsed_time = finish_time - self.start_time
elapsed_time_seconds = elapsed_time.total_seconds()
self.stats.set_value('elapsed_time_seconds', elapsed_time_seconds, spider=spider)
self.stats.set_value('finish_time', finish_time, spider=spider)
self.stats.set_value('finish_reason', reason, spider=spider)
self.stats.set_value(
"elapsed_time_seconds", elapsed_time_seconds, spider=spider
)
self.stats.set_value("finish_time", finish_time, spider=spider)
self.stats.set_value("finish_reason", reason, spider=spider)
def item_scraped(self, item, spider):
self.stats.inc_value('item_scraped_count', spider=spider)
self.stats.inc_value("item_scraped_count", spider=spider)
def response_received(self, spider):
self.stats.inc_value('response_received_count', spider=spider)
self.stats.inc_value("response_received_count", spider=spider)
def item_dropped(self, item, spider, exception):
reason = exception.__class__.__name__
self.stats.inc_value('item_dropped_count', spider=spider)
self.stats.inc_value(f'item_dropped_reasons_count/{reason}', spider=spider)
self.stats.inc_value("item_dropped_count", spider=spider)
self.stats.inc_value(f"item_dropped_reasons_count/{reason}", spider=spider)
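The values CoreStats records (start_time, finish_time, finish_reason, item counts) end up in the crawler's stats collector; a sketch of reading them after a scripted crawl, assuming a project spider called ExampleSpider:

from scrapy.crawler import CrawlerProcess

from myproject.spiders import ExampleSpider  # hypothetical spider

process = CrawlerProcess(settings={"LOG_LEVEL": "INFO"})
crawler = process.create_crawler(ExampleSpider)
process.crawl(crawler)
process.start()  # blocks until the crawl finishes

stats = crawler.stats.get_stats()
print(stats.get("finish_reason"), stats.get("elapsed_time_seconds"))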

View File

@ -18,7 +18,6 @@ logger = logging.getLogger(__name__)
class StackTraceDump:
def __init__(self, crawler=None):
self.crawler = crawler
try:
@ -34,20 +33,23 @@ class StackTraceDump:
def dump_stacktrace(self, signum, frame):
log_args = {
'stackdumps': self._thread_stacks(),
'enginestatus': format_engine_status(self.crawler.engine),
'liverefs': format_live_refs(),
"stackdumps": self._thread_stacks(),
"enginestatus": format_engine_status(self.crawler.engine),
"liverefs": format_live_refs(),
}
logger.info("Dumping stack trace and engine status\n"
"%(enginestatus)s\n%(liverefs)s\n%(stackdumps)s",
log_args, extra={'crawler': self.crawler})
logger.info(
"Dumping stack trace and engine status\n"
"%(enginestatus)s\n%(liverefs)s\n%(stackdumps)s",
log_args,
extra={"crawler": self.crawler},
)
def _thread_stacks(self):
id2name = dict((th.ident, th.name) for th in threading.enumerate())
dumps = ''
dumps = ""
for id_, frame in sys._current_frames().items():
name = id2name.get(id_, '')
dump = ''.join(traceback.format_stack(frame))
name = id2name.get(id_, "")
dump = "".join(traceback.format_stack(frame))
dumps += f"# Thread: {name}({id_})\n{dump}\n"
return dumps

View File

@ -34,15 +34,15 @@ logger = logging.getLogger(__name__)
def build_storage(builder, uri, *args, feed_options=None, preargs=(), **kwargs):
argument_names = get_func_args(builder)
if 'feed_options' in argument_names:
kwargs['feed_options'] = feed_options
if "feed_options" in argument_names:
kwargs["feed_options"] = feed_options
else:
warnings.warn(
f"{builder.__qualname__} does not support the 'feed_options' keyword argument. Add a "
"'feed_options' parameter to its signature to remove this "
"warning. This parameter will become mandatory in a future "
"version of Scrapy.",
category=ScrapyDeprecationWarning
category=ScrapyDeprecationWarning,
)
return builder(*preargs, uri, *args, **kwargs)
@ -55,6 +55,7 @@ class ItemFilter:
:param feed_options: feed specific options passed from FeedExporter
:type feed_options: dict
"""
feed_options: Optional[dict]
item_classes: Tuple
@ -62,7 +63,8 @@ class ItemFilter:
self.feed_options = feed_options
if feed_options is not None:
self.item_classes = tuple(
load_object(item_class) for item_class in feed_options.get("item_classes") or ()
load_object(item_class)
for item_class in feed_options.get("item_classes") or ()
)
else:
self.item_classes = tuple()
@ -98,13 +100,12 @@ class IFeedStorage(Interface):
@implementer(IFeedStorage)
class BlockingFeedStorage:
def open(self, spider):
path = spider.crawler.settings['FEED_TEMPDIR']
path = spider.crawler.settings["FEED_TEMPDIR"]
if path and not Path(path).is_dir():
raise OSError('Not a Directory: ' + str(path))
raise OSError("Not a Directory: " + str(path))
return NamedTemporaryFile(prefix='feed-', dir=path)
return NamedTemporaryFile(prefix="feed-", dir=path)
def store(self, file):
return threads.deferToThread(self._store_in_thread, file)
@ -115,16 +116,17 @@ class BlockingFeedStorage:
@implementer(IFeedStorage)
class StdoutFeedStorage:
def __init__(self, uri, _stdout=None, *, feed_options=None):
if not _stdout:
_stdout = sys.stdout.buffer
self._stdout = _stdout
if feed_options and feed_options.get('overwrite', False) is True:
logger.warning('Standard output (stdout) storage does not support '
'overwriting. To suppress this warning, remove the '
'overwrite option from your FEEDS setting, or set '
'it to False.')
if feed_options and feed_options.get("overwrite", False) is True:
logger.warning(
"Standard output (stdout) storage does not support "
"overwriting. To suppress this warning, remove the "
"overwrite option from your FEEDS setting, or set "
"it to False."
)
def open(self, spider):
return self._stdout
@ -135,11 +137,10 @@ class StdoutFeedStorage:
@implementer(IFeedStorage)
class FileFeedStorage:
def __init__(self, uri, *, feed_options=None):
self.path = file_uri_to_path(uri)
feed_options = feed_options or {}
self.write_mode = 'wb' if feed_options.get('overwrite', False) else 'ab'
self.write_mode = "wb" if feed_options.get("overwrite", False) else "ab"
def open(self, spider) -> IO[Any]:
dirname = Path(self.path).parent
@ -152,11 +153,19 @@ class FileFeedStorage:
class S3FeedStorage(BlockingFeedStorage):
def __init__(self, uri, access_key=None, secret_key=None, acl=None, endpoint_url=None, *,
feed_options=None, session_token=None):
def __init__(
self,
uri,
access_key=None,
secret_key=None,
acl=None,
endpoint_url=None,
*,
feed_options=None,
session_token=None,
):
if not is_botocore_available():
raise NotConfigured('missing botocore library')
raise NotConfigured("missing botocore library")
u = urlparse(uri)
self.bucketname = u.hostname
self.access_key = u.username or access_key
@ -166,41 +175,45 @@ class S3FeedStorage(BlockingFeedStorage):
self.acl = acl
self.endpoint_url = endpoint_url
import botocore.session
session = botocore.session.get_session()
self.s3_client = session.create_client(
's3', aws_access_key_id=self.access_key,
"s3",
aws_access_key_id=self.access_key,
aws_secret_access_key=self.secret_key,
aws_session_token=self.session_token,
endpoint_url=self.endpoint_url)
if feed_options and feed_options.get('overwrite', True) is False:
logger.warning('S3 does not support appending to files. To '
'suppress this warning, remove the overwrite '
'option from your FEEDS setting or set it to True.')
endpoint_url=self.endpoint_url,
)
if feed_options and feed_options.get("overwrite", True) is False:
logger.warning(
"S3 does not support appending to files. To "
"suppress this warning, remove the overwrite "
"option from your FEEDS setting or set it to True."
)
@classmethod
def from_crawler(cls, crawler, uri, *, feed_options=None):
return build_storage(
cls,
uri,
access_key=crawler.settings['AWS_ACCESS_KEY_ID'],
secret_key=crawler.settings['AWS_SECRET_ACCESS_KEY'],
session_token=crawler.settings['AWS_SESSION_TOKEN'],
acl=crawler.settings['FEED_STORAGE_S3_ACL'] or None,
endpoint_url=crawler.settings['AWS_ENDPOINT_URL'] or None,
access_key=crawler.settings["AWS_ACCESS_KEY_ID"],
secret_key=crawler.settings["AWS_SECRET_ACCESS_KEY"],
session_token=crawler.settings["AWS_SESSION_TOKEN"],
acl=crawler.settings["FEED_STORAGE_S3_ACL"] or None,
endpoint_url=crawler.settings["AWS_ENDPOINT_URL"] or None,
feed_options=feed_options,
)
def _store_in_thread(self, file):
file.seek(0)
kwargs = {'ACL': self.acl} if self.acl else {}
kwargs = {"ACL": self.acl} if self.acl else {}
self.s3_client.put_object(
Bucket=self.bucketname, Key=self.keyname, Body=file,
**kwargs)
Bucket=self.bucketname, Key=self.keyname, Body=file, **kwargs
)
file.close()
class GCSFeedStorage(BlockingFeedStorage):
def __init__(self, uri, project_id, acl):
self.project_id = project_id
self.acl = acl
@ -212,13 +225,14 @@ class GCSFeedStorage(BlockingFeedStorage):
def from_crawler(cls, crawler, uri):
return cls(
uri,
crawler.settings['GCS_PROJECT_ID'],
crawler.settings['FEED_STORAGE_GCS_ACL'] or None
crawler.settings["GCS_PROJECT_ID"],
crawler.settings["FEED_STORAGE_GCS_ACL"] or None,
)
def _store_in_thread(self, file):
file.seek(0)
from google.cloud.storage import Client
client = Client(project=self.project_id)
bucket = client.get_bucket(self.bucket_name)
blob = bucket.blob(self.blob_name)
@ -226,37 +240,51 @@ class GCSFeedStorage(BlockingFeedStorage):
class FTPFeedStorage(BlockingFeedStorage):
def __init__(self, uri, use_active_mode=False, *, feed_options=None):
u = urlparse(uri)
self.host = u.hostname
self.port = int(u.port or '21')
self.port = int(u.port or "21")
self.username = u.username
self.password = unquote(u.password or '')
self.password = unquote(u.password or "")
self.path = u.path
self.use_active_mode = use_active_mode
self.overwrite = not feed_options or feed_options.get('overwrite', True)
self.overwrite = not feed_options or feed_options.get("overwrite", True)
@classmethod
def from_crawler(cls, crawler, uri, *, feed_options=None):
return build_storage(
cls,
uri,
crawler.settings.getbool('FEED_STORAGE_FTP_ACTIVE'),
crawler.settings.getbool("FEED_STORAGE_FTP_ACTIVE"),
feed_options=feed_options,
)
def _store_in_thread(self, file):
ftp_store_file(
path=self.path, file=file, host=self.host,
port=self.port, username=self.username,
password=self.password, use_active_mode=self.use_active_mode,
path=self.path,
file=file,
host=self.host,
port=self.port,
username=self.username,
password=self.password,
use_active_mode=self.use_active_mode,
overwrite=self.overwrite,
)
class _FeedSlot:
def __init__(self, file, exporter, storage, uri, format, store_empty, batch_id, uri_template, filter):
def __init__(
self,
file,
exporter,
storage,
uri,
format,
store_empty,
batch_id,
uri_template,
filter,
):
self.file = file
self.exporter = exporter
self.storage = storage
@ -283,7 +311,6 @@ class _FeedSlot:
class FeedExporter:
@classmethod
def from_crawler(cls, crawler):
exporter = cls(crawler)
@ -299,48 +326,55 @@ class FeedExporter:
self.slots = []
self.filters = {}
if not self.settings['FEEDS'] and not self.settings['FEED_URI']:
if not self.settings["FEEDS"] and not self.settings["FEED_URI"]:
raise NotConfigured
# Begin: Backward compatibility for FEED_URI and FEED_FORMAT settings
if self.settings['FEED_URI']:
if self.settings["FEED_URI"]:
warnings.warn(
'The `FEED_URI` and `FEED_FORMAT` settings have been deprecated in favor of '
'the `FEEDS` setting. Please see the `FEEDS` setting docs for more details',
category=ScrapyDeprecationWarning, stacklevel=2,
"The `FEED_URI` and `FEED_FORMAT` settings have been deprecated in favor of "
"the `FEEDS` setting. Please see the `FEEDS` setting docs for more details",
category=ScrapyDeprecationWarning,
stacklevel=2,
)
uri = str(self.settings["FEED_URI"]) # handle pathlib.Path objects
feed_options = {"format": self.settings.get("FEED_FORMAT", "jsonlines")}
self.feeds[uri] = feed_complete_default_values_from_settings(
feed_options, self.settings
)
uri = str(self.settings['FEED_URI']) # handle pathlib.Path objects
feed_options = {'format': self.settings.get('FEED_FORMAT', 'jsonlines')}
self.feeds[uri] = feed_complete_default_values_from_settings(feed_options, self.settings)
self.filters[uri] = self._load_filter(feed_options)
# End: Backward compatibility for FEED_URI and FEED_FORMAT settings
# 'FEEDS' setting takes precedence over 'FEED_URI'
for uri, feed_options in self.settings.getdict('FEEDS').items():
for uri, feed_options in self.settings.getdict("FEEDS").items():
uri = str(uri) # handle pathlib.Path objects
self.feeds[uri] = feed_complete_default_values_from_settings(feed_options, self.settings)
self.feeds[uri] = feed_complete_default_values_from_settings(
feed_options, self.settings
)
self.filters[uri] = self._load_filter(feed_options)
self.storages = self._load_components('FEED_STORAGES')
self.exporters = self._load_components('FEED_EXPORTERS')
self.storages = self._load_components("FEED_STORAGES")
self.exporters = self._load_components("FEED_EXPORTERS")
for uri, feed_options in self.feeds.items():
if not self._storage_supported(uri, feed_options):
raise NotConfigured
if not self._settings_are_valid():
raise NotConfigured
if not self._exporter_supported(feed_options['format']):
if not self._exporter_supported(feed_options["format"]):
raise NotConfigured
def open_spider(self, spider):
for uri, feed_options in self.feeds.items():
uri_params = self._get_uri_params(spider, feed_options['uri_params'])
self.slots.append(self._start_new_batch(
batch_id=1,
uri=uri % uri_params,
feed_options=feed_options,
spider=spider,
uri_template=uri,
))
uri_params = self._get_uri_params(spider, feed_options["uri_params"])
self.slots.append(
self._start_new_batch(
batch_id=1,
uri=uri % uri_params,
feed_options=feed_options,
spider=spider,
uri_template=uri,
)
)
def close_spider(self, spider):
deferred_list = []
@ -368,16 +402,15 @@ class FeedExporter:
def _handle_store_error(self, f, logmsg, spider, slot_type):
logger.error(
"Error storing %s", logmsg,
exc_info=failure_to_exc_info(f), extra={'spider': spider}
"Error storing %s",
logmsg,
exc_info=failure_to_exc_info(f),
extra={"spider": spider},
)
self.crawler.stats.inc_value(f"feedexport/failed_count/{slot_type}")
def _handle_store_success(self, f, logmsg, spider, slot_type):
logger.info(
"Stored %s", logmsg,
extra={'spider': spider}
)
logger.info("Stored %s", logmsg, extra={"spider": spider})
self.crawler.stats.inc_value(f"feedexport/success_count/{slot_type}")
def _start_new_batch(self, batch_id, uri, feed_options, spider, uri_template):
@ -393,26 +426,28 @@ class FeedExporter:
storage = self._get_storage(uri, feed_options)
file = storage.open(spider)
if "postprocessing" in feed_options:
file = PostProcessingManager(feed_options["postprocessing"], file, feed_options)
file = PostProcessingManager(
feed_options["postprocessing"], file, feed_options
)
exporter = self._get_exporter(
file=file,
format=feed_options['format'],
fields_to_export=feed_options['fields'],
encoding=feed_options['encoding'],
indent=feed_options['indent'],
**feed_options['item_export_kwargs'],
format=feed_options["format"],
fields_to_export=feed_options["fields"],
encoding=feed_options["encoding"],
indent=feed_options["indent"],
**feed_options["item_export_kwargs"],
)
slot = _FeedSlot(
file=file,
exporter=exporter,
storage=storage,
uri=uri,
format=feed_options['format'],
store_empty=feed_options['store_empty'],
format=feed_options["format"],
store_empty=feed_options["store_empty"],
batch_id=batch_id,
uri_template=uri_template,
filter=self.filters[uri_template]
filter=self.filters[uri_template],
)
if slot.store_empty:
slot.start_exporting()
@ -422,7 +457,9 @@ class FeedExporter:
slots = []
for slot in self.slots:
if not slot.filter.accepts(item):
slots.append(slot) # if slot doesn't accept item, continue with next slot
slots.append(
slot
) # if slot doesn't accept item, continue with next slot
continue
slot.start_exporting()
@ -430,18 +467,22 @@ class FeedExporter:
slot.itemcount += 1
# create new slot for each slot with itemcount == FEED_EXPORT_BATCH_ITEM_COUNT and close the old one
if (
self.feeds[slot.uri_template]['batch_item_count']
and slot.itemcount >= self.feeds[slot.uri_template]['batch_item_count']
self.feeds[slot.uri_template]["batch_item_count"]
and slot.itemcount >= self.feeds[slot.uri_template]["batch_item_count"]
):
uri_params = self._get_uri_params(spider, self.feeds[slot.uri_template]['uri_params'], slot)
uri_params = self._get_uri_params(
spider, self.feeds[slot.uri_template]["uri_params"], slot
)
self._close_slot(slot, spider)
slots.append(self._start_new_batch(
batch_id=slot.batch_id + 1,
uri=slot.uri_template % uri_params,
feed_options=self.feeds[slot.uri_template],
spider=spider,
uri_template=slot.uri_template,
))
slots.append(
self._start_new_batch(
batch_id=slot.batch_id + 1,
uri=slot.uri_template % uri_params,
feed_options=self.feeds[slot.uri_template],
spider=spider,
uri_template=slot.uri_template,
)
)
else:
slots.append(slot)
self.slots = slots
@ -459,7 +500,7 @@ class FeedExporter:
def _exporter_supported(self, format):
if format in self.exporters:
return True
logger.error("Unknown feed format: %(format)s", {'format': format})
logger.error("Unknown feed format: %(format)s", {"format": format})
def _settings_are_valid(self):
"""
@ -467,12 +508,14 @@ class FeedExporter:
%(batch_time)s or %(batch_id)d to distinguish different files of partial output
"""
for uri_template, values in self.feeds.items():
if values['batch_item_count'] and not re.search(r'%\(batch_time\)s|%\(batch_id\)', uri_template):
if values["batch_item_count"] and not re.search(
r"%\(batch_time\)s|%\(batch_id\)", uri_template
):
logger.error(
'%%(batch_time)s or %%(batch_id)d must be in the feed URI (%s) if FEED_EXPORT_BATCH_ITEM_COUNT '
'setting or FEEDS.batch_item_count is specified and greater than 0. For more info see: '
'https://docs.scrapy.org/en/latest/topics/feed-exports.html#feed-export-batch-item-count',
uri_template
"%%(batch_time)s or %%(batch_id)d must be in the feed URI (%s) if FEED_EXPORT_BATCH_ITEM_COUNT "
"setting or FEEDS.batch_item_count is specified and greater than 0. For more info see: "
"https://docs.scrapy.org/en/latest/topics/feed-exports.html#feed-export-batch-item-count",
uri_template,
)
return False
return True
@ -484,17 +527,17 @@ class FeedExporter:
self._get_storage(uri, feed_options)
return True
except NotConfigured as e:
logger.error("Disabled feed storage scheme: %(scheme)s. "
"Reason: %(reason)s",
{'scheme': scheme, 'reason': str(e)})
logger.error(
"Disabled feed storage scheme: %(scheme)s. " "Reason: %(reason)s",
{"scheme": scheme, "reason": str(e)},
)
else:
logger.error("Unknown feed storage scheme: %(scheme)s",
{'scheme': scheme})
logger.error("Unknown feed storage scheme: %(scheme)s", {"scheme": scheme})
def _get_instance(self, objcls, *args, **kwargs):
return create_instance(
objcls, self.settings, getattr(self, 'crawler', None),
*args, **kwargs)
objcls, self.settings, getattr(self, "crawler", None), *args, **kwargs
)
def _get_exporter(self, file, format, *args, **kwargs):
return self._get_instance(self.exporters[format], file, *args, **kwargs)
@ -506,20 +549,22 @@ class FeedExporter:
do not support it, and issuing a deprecation warning instead.
"""
feedcls = self.storages[urlparse(uri).scheme]
crawler = getattr(self, 'crawler', None)
crawler = getattr(self, "crawler", None)
def build_instance(builder, *preargs):
return build_storage(builder, uri, feed_options=feed_options, preargs=preargs)
return build_storage(
builder, uri, feed_options=feed_options, preargs=preargs
)
if crawler and hasattr(feedcls, 'from_crawler'):
if crawler and hasattr(feedcls, "from_crawler"):
instance = build_instance(feedcls.from_crawler, crawler)
method_name = 'from_crawler'
elif hasattr(feedcls, 'from_settings'):
method_name = "from_crawler"
elif hasattr(feedcls, "from_settings"):
instance = build_instance(feedcls.from_settings, self.settings)
method_name = 'from_settings'
method_name = "from_settings"
else:
instance = build_instance(feedcls)
method_name = '__new__'
method_name = "__new__"
if instance is None:
raise TypeError(f"{feedcls.__qualname__}.{method_name} returned None")
return instance
@ -534,19 +579,23 @@ class FeedExporter:
for k in dir(spider):
params[k] = getattr(spider, k)
utc_now = datetime.utcnow()
params['time'] = utc_now.replace(microsecond=0).isoformat().replace(':', '-')
params['batch_time'] = utc_now.isoformat().replace(':', '-')
params['batch_id'] = slot.batch_id + 1 if slot is not None else 1
params["time"] = utc_now.replace(microsecond=0).isoformat().replace(":", "-")
params["batch_time"] = utc_now.isoformat().replace(":", "-")
params["batch_id"] = slot.batch_id + 1 if slot is not None else 1
original_params = params.copy()
uripar_function = load_object(uri_params_function) if uri_params_function else lambda params, _: params
uripar_function = (
load_object(uri_params_function)
if uri_params_function
else lambda params, _: params
)
new_params = uripar_function(params, spider)
if new_params is None or original_params != params:
warnings.warn(
'Modifying the params dictionary in-place in the function defined in '
'the FEED_URI_PARAMS setting or in the uri_params key of the FEEDS '
'setting is deprecated. The function must return a new dictionary '
'instead.',
category=ScrapyDeprecationWarning
"Modifying the params dictionary in-place in the function defined in "
"the FEED_URI_PARAMS setting or in the uri_params key of the FEEDS "
"setting is deprecated. The function must return a new dictionary "
"instead.",
category=ScrapyDeprecationWarning,
)
return new_params if new_params is not None else params
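Most of the FeedExporter behaviour above is configured through the FEEDS setting; a sketch of one feed with batching enabled, which is why the URI must contain %(batch_id)d or %(batch_time)s (path and numbers are illustrative):

# settings.py
FEEDS = {
    "exports/items-%(batch_id)d.jsonl": {
        "format": "jsonlines",
        "encoding": "utf8",
        "overwrite": True,
        "store_empty": False,
        "batch_item_count": 100,  # triggers a new batch slot every 100 items
    },
}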

View File

@ -21,10 +21,11 @@ logger = logging.getLogger(__name__)
class DummyPolicy:
def __init__(self, settings):
self.ignore_schemes = settings.getlist('HTTPCACHE_IGNORE_SCHEMES')
self.ignore_http_codes = [int(x) for x in settings.getlist('HTTPCACHE_IGNORE_HTTP_CODES')]
self.ignore_schemes = settings.getlist("HTTPCACHE_IGNORE_SCHEMES")
self.ignore_http_codes = [
int(x) for x in settings.getlist("HTTPCACHE_IGNORE_HTTP_CODES")
]
def should_cache_request(self, request):
return urlparse_cached(request).scheme not in self.ignore_schemes
@ -44,16 +45,17 @@ class RFC2616Policy:
MAXAGE = 3600 * 24 * 365 # one year
def __init__(self, settings):
self.always_store = settings.getbool('HTTPCACHE_ALWAYS_STORE')
self.ignore_schemes = settings.getlist('HTTPCACHE_IGNORE_SCHEMES')
self.always_store = settings.getbool("HTTPCACHE_ALWAYS_STORE")
self.ignore_schemes = settings.getlist("HTTPCACHE_IGNORE_SCHEMES")
self._cc_parsed = WeakKeyDictionary()
self.ignore_response_cache_controls = [
to_bytes(cc) for cc in settings.getlist('HTTPCACHE_IGNORE_RESPONSE_CACHE_CONTROLS')
to_bytes(cc)
for cc in settings.getlist("HTTPCACHE_IGNORE_RESPONSE_CACHE_CONTROLS")
]
def _parse_cachecontrol(self, r):
if r not in self._cc_parsed:
cch = r.headers.get(b'Cache-Control', b'')
cch = r.headers.get(b"Cache-Control", b"")
parsed = parse_cachecontrol(cch)
if isinstance(r, Response):
for key in self.ignore_response_cache_controls:
@ -66,7 +68,7 @@ class RFC2616Policy:
return False
cc = self._parse_cachecontrol(request)
# obey user-agent directive "Cache-Control: no-store"
if b'no-store' in cc:
if b"no-store" in cc:
return False
# Any other request is eligible for caching
return True
@ -77,7 +79,7 @@ class RFC2616Policy:
# Status code 206 is not included because the cache cannot deal with partial contents
cc = self._parse_cachecontrol(response)
# obey directive "Cache-Control: no-store"
if b'no-store' in cc:
if b"no-store" in cc:
return False
# Never cache 304 (Not Modified) responses
if response.status == 304:
@ -86,14 +88,14 @@ class RFC2616Policy:
if self.always_store:
return True
# Any hint on response expiration is good
if b'max-age' in cc or b'Expires' in response.headers:
if b"max-age" in cc or b"Expires" in response.headers:
return True
# Firefox falls back these statuses to a one-year expiration if none is set
if response.status in (300, 301, 308):
return True
# Other statuses without expiration requires at least one validator
if response.status in (200, 203, 401):
return b'Last-Modified' in response.headers or b'ETag' in response.headers
return b"Last-Modified" in response.headers or b"ETag" in response.headers
# Any other status is probably not eligible for caching
# It makes no sense to cache responses that do not contain expiration
# info and cannot be revalidated
@ -102,11 +104,13 @@ class RFC2616Policy:
def is_cached_response_fresh(self, cachedresponse, request):
cc = self._parse_cachecontrol(cachedresponse)
ccreq = self._parse_cachecontrol(request)
if b'no-cache' in cc or b'no-cache' in ccreq:
if b"no-cache" in cc or b"no-cache" in ccreq:
return False
now = time()
freshnesslifetime = self._compute_freshness_lifetime(cachedresponse, request, now)
freshnesslifetime = self._compute_freshness_lifetime(
cachedresponse, request, now
)
currentage = self._compute_current_age(cachedresponse, request, now)
reqmaxage = self._get_max_age(ccreq)
@ -116,7 +120,7 @@ class RFC2616Policy:
if currentage < freshnesslifetime:
return True
if b'max-stale' in ccreq and b'must-revalidate' not in cc:
if b"max-stale" in ccreq and b"must-revalidate" not in cc:
# From RFC2616: "Indicates that the client is willing to
# accept a response that has exceeded its expiration time.
# If max-stale is assigned a value, then the client is
@ -124,7 +128,7 @@ class RFC2616Policy:
# expiration time by no more than the specified number of
# seconds. If no value is assigned to max-stale, then the
# client is willing to accept a stale response of any age."
staleage = ccreq[b'max-stale']
staleage = ccreq[b"max-stale"]
if staleage is None:
return True
@ -143,22 +147,24 @@ class RFC2616Policy:
# as long as the old response didn't specify must-revalidate.
if response.status >= 500:
cc = self._parse_cachecontrol(cachedresponse)
if b'must-revalidate' not in cc:
if b"must-revalidate" not in cc:
return True
# Use the cached response if the server says it hasn't changed.
return response.status == 304
def _set_conditional_validators(self, request, cachedresponse):
if b'Last-Modified' in cachedresponse.headers:
request.headers[b'If-Modified-Since'] = cachedresponse.headers[b'Last-Modified']
if b"Last-Modified" in cachedresponse.headers:
request.headers[b"If-Modified-Since"] = cachedresponse.headers[
b"Last-Modified"
]
if b'ETag' in cachedresponse.headers:
request.headers[b'If-None-Match'] = cachedresponse.headers[b'ETag']
if b"ETag" in cachedresponse.headers:
request.headers[b"If-None-Match"] = cachedresponse.headers[b"ETag"]
def _get_max_age(self, cc):
try:
return max(0, int(cc[b'max-age']))
return max(0, int(cc[b"max-age"]))
except (KeyError, ValueError):
return None
@ -171,18 +177,18 @@ class RFC2616Policy:
return maxage
# Parse date header or synthesize it if none exists
date = rfc1123_to_epoch(response.headers.get(b'Date')) or now
date = rfc1123_to_epoch(response.headers.get(b"Date")) or now
# Try HTTP/1.0 Expires header
if b'Expires' in response.headers:
expires = rfc1123_to_epoch(response.headers[b'Expires'])
if b"Expires" in response.headers:
expires = rfc1123_to_epoch(response.headers[b"Expires"])
# When parsing the Expires header fails, RFC 2616 section 14.21 says we
# should treat this as an expiration time in the past.
return max(0, expires - date) if expires else 0
# Fallback to heuristic using last-modified header
# This is not in RFC but on Firefox caching implementation
lastmodified = rfc1123_to_epoch(response.headers.get(b'Last-Modified'))
lastmodified = rfc1123_to_epoch(response.headers.get(b"Last-Modified"))
if lastmodified and lastmodified <= date:
return (date - lastmodified) / 10
@ -199,13 +205,13 @@ class RFC2616Policy:
currentage = 0
# If the Date header is not set we assume it is a fast connection, and
# that the clock is in sync with the server
date = rfc1123_to_epoch(response.headers.get(b'Date')) or now
date = rfc1123_to_epoch(response.headers.get(b"Date")) or now
if now > date:
currentage = now - date
if b'Age' in response.headers:
if b"Age" in response.headers:
try:
age = int(response.headers[b'Age'])
age = int(response.headers[b"Age"])
currentage = max(currentage, age)
except ValueError:
pass
@ -214,18 +220,21 @@ class RFC2616Policy:
class DbmCacheStorage:
def __init__(self, settings):
self.cachedir = data_path(settings['HTTPCACHE_DIR'], createdir=True)
self.expiration_secs = settings.getint('HTTPCACHE_EXPIRATION_SECS')
self.dbmodule = import_module(settings['HTTPCACHE_DBM_MODULE'])
self.cachedir = data_path(settings["HTTPCACHE_DIR"], createdir=True)
self.expiration_secs = settings.getint("HTTPCACHE_EXPIRATION_SECS")
self.dbmodule = import_module(settings["HTTPCACHE_DBM_MODULE"])
self.db = None
def open_spider(self, spider: Spider):
dbpath = Path(self.cachedir, f'{spider.name}.db')
self.db = self.dbmodule.open(str(dbpath), 'c')
dbpath = Path(self.cachedir, f"{spider.name}.db")
self.db = self.dbmodule.open(str(dbpath), "c")
logger.debug("Using DBM cache storage in %(cachepath)s", {'cachepath': dbpath}, extra={'spider': spider})
logger.debug(
"Using DBM cache storage in %(cachepath)s",
{"cachepath": dbpath},
extra={"spider": spider},
)
self._fingerprinter = spider.crawler.request_fingerprinter
@ -236,10 +245,10 @@ class DbmCacheStorage:
data = self._read_data(spider, request)
if data is None:
return # not cached
url = data['url']
status = data['status']
headers = Headers(data['headers'])
body = data['body']
url = data["url"]
status = data["status"]
headers = Headers(data["headers"])
body = data["body"]
respcls = responsetypes.from_args(headers=headers, url=url, body=body)
response = respcls(url=url, headers=headers, status=status, body=body)
return response
@ -247,18 +256,18 @@ class DbmCacheStorage:
def store_response(self, spider, request, response):
key = self._fingerprinter.fingerprint(request).hex()
data = {
'status': response.status,
'url': response.url,
'headers': dict(response.headers),
'body': response.body,
"status": response.status,
"url": response.url,
"headers": dict(response.headers),
"body": response.body,
}
self.db[f'{key}_data'] = pickle.dumps(data, protocol=4)
self.db[f'{key}_time'] = str(time())
self.db[f"{key}_data"] = pickle.dumps(data, protocol=4)
self.db[f"{key}_time"] = str(time())
def _read_data(self, spider, request):
key = self._fingerprinter.fingerprint(request).hex()
db = self.db
tkey = f'{key}_time'
tkey = f"{key}_time"
if tkey not in db:
return # not found
@ -266,20 +275,22 @@ class DbmCacheStorage:
if 0 < self.expiration_secs < time() - float(ts):
return # expired
return pickle.loads(db[f'{key}_data'])
return pickle.loads(db[f"{key}_data"])
class FilesystemCacheStorage:
def __init__(self, settings):
self.cachedir = data_path(settings['HTTPCACHE_DIR'])
self.expiration_secs = settings.getint('HTTPCACHE_EXPIRATION_SECS')
self.use_gzip = settings.getbool('HTTPCACHE_GZIP')
self.cachedir = data_path(settings["HTTPCACHE_DIR"])
self.expiration_secs = settings.getint("HTTPCACHE_EXPIRATION_SECS")
self.use_gzip = settings.getbool("HTTPCACHE_GZIP")
self._open = gzip.open if self.use_gzip else open
def open_spider(self, spider: Spider):
logger.debug("Using filesystem cache storage in %(cachedir)s", {'cachedir': self.cachedir},
extra={'spider': spider})
logger.debug(
"Using filesystem cache storage in %(cachedir)s",
{"cachedir": self.cachedir},
extra={"spider": spider},
)
self._fingerprinter = spider.crawler.request_fingerprinter
@ -292,12 +303,12 @@ class FilesystemCacheStorage:
if metadata is None:
return # not cached
rpath = Path(self._get_request_path(spider, request))
with self._open(rpath / 'response_body', 'rb') as f:
with self._open(rpath / "response_body", "rb") as f:
body = f.read()
with self._open(rpath / 'response_headers', 'rb') as f:
with self._open(rpath / "response_headers", "rb") as f:
rawheaders = f.read()
url = metadata.get('response_url')
status = metadata['status']
url = metadata.get("response_url")
status = metadata["status"]
headers = Headers(headers_raw_to_dict(rawheaders))
respcls = responsetypes.from_args(headers=headers, url=url, body=body)
response = respcls(url=url, headers=headers, status=status, body=body)
@ -309,23 +320,23 @@ class FilesystemCacheStorage:
if not rpath.exists():
rpath.mkdir(parents=True)
metadata = {
'url': request.url,
'method': request.method,
'status': response.status,
'response_url': response.url,
'timestamp': time(),
"url": request.url,
"method": request.method,
"status": response.status,
"response_url": response.url,
"timestamp": time(),
}
with self._open(rpath / 'meta', 'wb') as f:
with self._open(rpath / "meta", "wb") as f:
f.write(to_bytes(repr(metadata)))
with self._open(rpath / 'pickled_meta', 'wb') as f:
with self._open(rpath / "pickled_meta", "wb") as f:
pickle.dump(metadata, f, protocol=4)
with self._open(rpath / 'response_headers', 'wb') as f:
with self._open(rpath / "response_headers", "wb") as f:
f.write(headers_dict_to_raw(response.headers))
with self._open(rpath / 'response_body', 'wb') as f:
with self._open(rpath / "response_body", "wb") as f:
f.write(response.body)
with self._open(rpath / 'request_headers', 'wb') as f:
with self._open(rpath / "request_headers", "wb") as f:
f.write(headers_dict_to_raw(request.headers))
with self._open(rpath / 'request_body', 'wb') as f:
with self._open(rpath / "request_body", "wb") as f:
f.write(request.body)
def _get_request_path(self, spider: Spider, request: Request) -> str:
@ -334,13 +345,13 @@ class FilesystemCacheStorage:
def _read_meta(self, spider: Spider, request: Request):
rpath = Path(self._get_request_path(spider, request))
metapath = rpath / 'pickled_meta'
metapath = rpath / "pickled_meta"
if not metapath.exists():
return # not found
mtime = metapath.stat().st_mtime
if 0 < self.expiration_secs < time() - mtime:
return # expired
with self._open(metapath, 'rb') as f:
with self._open(metapath, "rb") as f:
return pickle.load(f)
@ -357,8 +368,8 @@ def parse_cachecontrol(header):
"""
directives = {}
for directive in header.split(b','):
key, sep, val = directive.strip().partition(b'=')
for directive in header.split(b","):
key, sep, val = directive.strip().partition(b"=")
if key:
directives[key.lower()] = val if sep else None
return directives
@ -366,7 +377,7 @@ def parse_cachecontrol(header):
def rfc1123_to_epoch(date_str):
try:
date_str = to_unicode(date_str, encoding='ascii')
date_str = to_unicode(date_str, encoding="ascii")
return mktime_tz(parsedate_tz(date_str))
except Exception:
return None
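The cache policies and storages above are selected through settings; a brief sketch enabling the RFC 2616 policy with the filesystem backend (the expiration value is arbitrary):

# settings.py
HTTPCACHE_ENABLED = True
HTTPCACHE_POLICY = "scrapy.extensions.httpcache.RFC2616Policy"
HTTPCACHE_STORAGE = "scrapy.extensions.httpcache.FilesystemCacheStorage"
HTTPCACHE_DIR = "httpcache"          # relative to the project data dir
HTTPCACHE_EXPIRATION_SECS = 86400    # 0 keeps cached responses forever
HTTPCACHE_GZIP = True                # compress cached files, as used by _open above
HTTPCACHE_IGNORE_HTTP_CODES = [500, 502, 503]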

View File

@ -19,7 +19,7 @@ class LogStats:
@classmethod
def from_crawler(cls, crawler):
interval = crawler.settings.getfloat('LOGSTATS_INTERVAL')
interval = crawler.settings.getfloat("LOGSTATS_INTERVAL")
if not interval:
raise NotConfigured
o = cls(crawler.stats, interval)
@ -35,17 +35,23 @@ class LogStats:
self.task.start(self.interval)
def log(self, spider):
items = self.stats.get_value('item_scraped_count', 0)
pages = self.stats.get_value('response_received_count', 0)
items = self.stats.get_value("item_scraped_count", 0)
pages = self.stats.get_value("response_received_count", 0)
irate = (items - self.itemsprev) * self.multiplier
prate = (pages - self.pagesprev) * self.multiplier
self.pagesprev, self.itemsprev = pages, items
msg = ("Crawled %(pages)d pages (at %(pagerate)d pages/min), "
"scraped %(items)d items (at %(itemrate)d items/min)")
log_args = {'pages': pages, 'pagerate': prate,
'items': items, 'itemrate': irate}
logger.info(msg, log_args, extra={'spider': spider})
msg = (
"Crawled %(pages)d pages (at %(pagerate)d pages/min), "
"scraped %(items)d items (at %(itemrate)d items/min)"
)
log_args = {
"pages": pages,
"pagerate": prate,
"items": items,
"itemrate": irate,
}
logger.info(msg, log_args, extra={"spider": spider})
def spider_closed(self, spider, reason):
if self.task and self.task.running:

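LogStats is controlled by a single setting, and a falsy value disables it via NotConfigured; a one-line sketch with an arbitrary interval:

# settings.py
LOGSTATS_INTERVAL = 30.0  # log crawl rate every 30 seconds; 0 disables LogStats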
View File

@ -12,13 +12,12 @@ from scrapy.utils.trackref import live_refs
class MemoryDebugger:
def __init__(self, stats):
self.stats = stats
@classmethod
def from_crawler(cls, crawler):
if not crawler.settings.getbool('MEMDEBUG_ENABLED'):
if not crawler.settings.getbool("MEMDEBUG_ENABLED"):
raise NotConfigured
o = cls(crawler.stats)
crawler.signals.connect(o.spider_closed, signal=signals.spider_closed)
@ -26,8 +25,12 @@ class MemoryDebugger:
def spider_closed(self, spider, reason):
gc.collect()
self.stats.set_value('memdebug/gc_garbage_count', len(gc.garbage), spider=spider)
self.stats.set_value(
"memdebug/gc_garbage_count", len(gc.garbage), spider=spider
)
for cls, wdict in live_refs.items():
if not wdict:
continue
self.stats.set_value(f'memdebug/live_refs/{cls.__name__}', len(wdict), spider=spider)
self.stats.set_value(
f"memdebug/live_refs/{cls.__name__}", len(wdict), spider=spider
)

View File

@ -20,22 +20,23 @@ logger = logging.getLogger(__name__)
class MemoryUsage:
def __init__(self, crawler):
if not crawler.settings.getbool('MEMUSAGE_ENABLED'):
if not crawler.settings.getbool("MEMUSAGE_ENABLED"):
raise NotConfigured
try:
# stdlib's resource module is only available on unix platforms.
self.resource = import_module('resource')
self.resource = import_module("resource")
except ImportError:
raise NotConfigured
self.crawler = crawler
self.warned = False
self.notify_mails = crawler.settings.getlist('MEMUSAGE_NOTIFY_MAIL')
self.limit = crawler.settings.getint('MEMUSAGE_LIMIT_MB') * 1024 * 1024
self.warning = crawler.settings.getint('MEMUSAGE_WARNING_MB') * 1024 * 1024
self.check_interval = crawler.settings.getfloat('MEMUSAGE_CHECK_INTERVAL_SECONDS')
self.notify_mails = crawler.settings.getlist("MEMUSAGE_NOTIFY_MAIL")
self.limit = crawler.settings.getint("MEMUSAGE_LIMIT_MB") * 1024 * 1024
self.warning = crawler.settings.getint("MEMUSAGE_WARNING_MB") * 1024 * 1024
self.check_interval = crawler.settings.getfloat(
"MEMUSAGE_CHECK_INTERVAL_SECONDS"
)
self.mail = MailSender.from_settings(crawler.settings)
crawler.signals.connect(self.engine_started, signal=signals.engine_started)
crawler.signals.connect(self.engine_stopped, signal=signals.engine_stopped)
@ -46,13 +47,13 @@ class MemoryUsage:
def get_virtual_size(self):
size = self.resource.getrusage(self.resource.RUSAGE_SELF).ru_maxrss
if sys.platform != 'darwin':
if sys.platform != "darwin":
# on macOS ru_maxrss is in bytes, on Linux it is in KB
size *= 1024
return size
def engine_started(self):
self.crawler.stats.set_value('memusage/startup', self.get_virtual_size())
self.crawler.stats.set_value("memusage/startup", self.get_virtual_size())
self.tasks = []
tsk = task.LoopingCall(self.update)
self.tasks.append(tsk)
@ -72,45 +73,56 @@ class MemoryUsage:
tsk.stop()
def update(self):
self.crawler.stats.max_value('memusage/max', self.get_virtual_size())
self.crawler.stats.max_value("memusage/max", self.get_virtual_size())
def _check_limit(self):
peak_mem_usage = self.get_virtual_size()
if peak_mem_usage > self.limit:
self.crawler.stats.set_value('memusage/limit_reached', 1)
self.crawler.stats.set_value("memusage/limit_reached", 1)
mem = self.limit / 1024 / 1024
logger.error("Memory usage exceeded %(memusage)dMiB. Shutting down Scrapy...",
{'memusage': mem}, extra={'crawler': self.crawler})
logger.error(
"Memory usage exceeded %(memusage)dMiB. Shutting down Scrapy...",
{"memusage": mem},
extra={"crawler": self.crawler},
)
if self.notify_mails:
subj = (
f"{self.crawler.settings['BOT_NAME']} terminated: "
f"memory usage exceeded {mem}MiB at {socket.gethostname()}"
)
self._send_report(self.notify_mails, subj)
self.crawler.stats.set_value('memusage/limit_notified', 1)
self.crawler.stats.set_value("memusage/limit_notified", 1)
if self.crawler.engine.spider is not None:
self.crawler.engine.close_spider(self.crawler.engine.spider, 'memusage_exceeded')
self.crawler.engine.close_spider(
self.crawler.engine.spider, "memusage_exceeded"
)
else:
self.crawler.stop()
else:
logger.info("Peak memory usage is %(virtualsize)dMiB", {'virtualsize': peak_mem_usage / 1024 / 1024})
logger.info(
"Peak memory usage is %(virtualsize)dMiB",
{"virtualsize": peak_mem_usage / 1024 / 1024},
)
def _check_warning(self):
if self.warned: # warn only once
return
if self.get_virtual_size() > self.warning:
self.crawler.stats.set_value('memusage/warning_reached', 1)
self.crawler.stats.set_value("memusage/warning_reached", 1)
mem = self.warning / 1024 / 1024
logger.warning("Memory usage reached %(memusage)dMiB",
{'memusage': mem}, extra={'crawler': self.crawler})
logger.warning(
"Memory usage reached %(memusage)dMiB",
{"memusage": mem},
extra={"crawler": self.crawler},
)
if self.notify_mails:
subj = (
f"{self.crawler.settings['BOT_NAME']} warning: "
f"memory usage reached {mem}MiB at {socket.gethostname()}"
)
self._send_report(self.notify_mails, subj)
self.crawler.stats.set_value('memusage/warning_notified', 1)
self.crawler.stats.set_value("memusage/warning_notified", 1)
self.warned = True
def _send_report(self, rcpts, subject):
@ -120,7 +132,9 @@ class MemoryUsage:
s += f"Maximum memory usage : {stats.get_value('memusage/max')/1024/1024}M\r\n"
s += f"Current memory usage : {self.get_virtual_size()/1024/1024}M\r\n"
s += "ENGINE STATUS ------------------------------------------------------- \r\n"
s += (
"ENGINE STATUS ------------------------------------------------------- \r\n"
)
s += "\r\n"
s += pformat(get_engine_status(self.crawler.engine))
s += "\r\n"
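As a quick reference for the MemoryUsage extension above, a minimal settings sketch; the setting names are the ones read in the code, while the values and mail address are illustrative assumptions:

# Illustrative values only; setting names match those read by MemoryUsage above.
MEMUSAGE_ENABLED = True
MEMUSAGE_LIMIT_MB = 2048                    # close the spider with "memusage_exceeded" above this
MEMUSAGE_WARNING_MB = 1536                  # log and optionally mail a warning above this
MEMUSAGE_NOTIFY_MAIL = ["ops@example.com"]  # placeholder address
MEMUSAGE_CHECK_INTERVAL_SECONDS = 60.0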


@ -29,8 +29,13 @@ class GzipPlugin:
compress_level = self.feed_options.get("gzip_compresslevel", 9)
mtime = self.feed_options.get("gzip_mtime")
filename = self.feed_options.get("gzip_filename")
self.gzipfile = GzipFile(fileobj=self.file, mode="wb", compresslevel=compress_level,
mtime=mtime, filename=filename)
self.gzipfile = GzipFile(
fileobj=self.file,
mode="wb",
compresslevel=compress_level,
mtime=mtime,
filename=filename,
)
def write(self, data: bytes) -> int:
return self.gzipfile.write(data)
@ -55,7 +60,9 @@ class Bz2Plugin:
self.file = file
self.feed_options = feed_options
compress_level = self.feed_options.get("bz2_compresslevel", 9)
self.bz2file = BZ2File(filename=self.file, mode="wb", compresslevel=compress_level)
self.bz2file = BZ2File(
filename=self.file, mode="wb", compresslevel=compress_level
)
def write(self, data: bytes) -> int:
return self.bz2file.write(data)
@ -90,8 +97,14 @@ class LZMAPlugin:
check = self.feed_options.get("lzma_check", -1)
preset = self.feed_options.get("lzma_preset")
filters = self.feed_options.get("lzma_filters")
self.lzmafile = LZMAFile(filename=self.file, mode="wb", format=format,
check=check, preset=preset, filters=filters)
self.lzmafile = LZMAFile(
filename=self.file,
mode="wb",
format=format,
check=check,
preset=preset,
filters=filters,
)
def write(self, data: bytes) -> int:
return self.lzmafile.write(data)
@ -114,7 +127,9 @@ class PostProcessingManager(IOBase):
:type file: file like object
"""
def __init__(self, plugins: List[Any], file: BinaryIO, feed_options: Dict[str, Any]) -> None:
def __init__(
self, plugins: List[Any], file: BinaryIO, feed_options: Dict[str, Any]
) -> None:
self.plugins = self._load_plugins(plugins)
self.file = file
self.feed_options = feed_options
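A minimal sketch of how these post-processing plugins are typically wired into a feed through the FEEDS setting; the output path and compression level are assumptions:

# Illustrative FEEDS entry; "items.jsonl.gz" and the level are placeholders.
FEEDS = {
    "items.jsonl.gz": {
        "format": "jsonlines",
        "postprocessing": ["scrapy.extensions.postprocessing.GzipPlugin"],
        "gzip_compresslevel": 5,  # read by GzipPlugin via feed_options above
    },
}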


@ -25,16 +25,16 @@ class SpiderState:
def spider_closed(self, spider):
if self.jobdir:
with Path(self.statefn).open('wb') as f:
with Path(self.statefn).open("wb") as f:
pickle.dump(spider.state, f, protocol=4)
def spider_opened(self, spider):
if self.jobdir and Path(self.statefn).exists():
with Path(self.statefn).open('rb') as f:
with Path(self.statefn).open("rb") as f:
spider.state = pickle.load(f)
else:
spider.state = {}
@property
def statefn(self) -> str:
return str(Path(self.jobdir, 'spider.state'))
return str(Path(self.jobdir, "spider.state"))


@ -10,7 +10,6 @@ from scrapy.exceptions import NotConfigured
class StatsMailer:
def __init__(self, stats, recipients, mail):
self.stats = stats
self.recipients = recipients


@ -15,6 +15,7 @@ from twisted.internet import protocol
try:
from twisted.conch import manhole, telnet
from twisted.conch.insults import insults
TWISTED_CONCH_AVAILABLE = True
except (ImportError, SyntaxError):
_TWISTED_CONCH_TRACEBACK = traceback.format_exc()
@ -35,24 +36,26 @@ update_telnet_vars = object()
class TelnetConsole(protocol.ServerFactory):
def __init__(self, crawler):
if not crawler.settings.getbool('TELNETCONSOLE_ENABLED'):
if not crawler.settings.getbool("TELNETCONSOLE_ENABLED"):
raise NotConfigured
if not TWISTED_CONCH_AVAILABLE:
raise NotConfigured(
'TELNETCONSOLE_ENABLED setting is True but required twisted '
'modules failed to import:\n' + _TWISTED_CONCH_TRACEBACK)
"TELNETCONSOLE_ENABLED setting is True but required twisted "
"modules failed to import:\n" + _TWISTED_CONCH_TRACEBACK
)
self.crawler = crawler
self.noisy = False
self.portrange = [int(x) for x in crawler.settings.getlist('TELNETCONSOLE_PORT')]
self.host = crawler.settings['TELNETCONSOLE_HOST']
self.username = crawler.settings['TELNETCONSOLE_USERNAME']
self.password = crawler.settings['TELNETCONSOLE_PASSWORD']
self.portrange = [
int(x) for x in crawler.settings.getlist("TELNETCONSOLE_PORT")
]
self.host = crawler.settings["TELNETCONSOLE_HOST"]
self.username = crawler.settings["TELNETCONSOLE_USERNAME"]
self.password = crawler.settings["TELNETCONSOLE_PASSWORD"]
if not self.password:
self.password = binascii.hexlify(os.urandom(8)).decode('utf8')
logger.info('Telnet Password: %s', self.password)
self.password = binascii.hexlify(os.urandom(8)).decode("utf8")
logger.info("Telnet Password: %s", self.password)
self.crawler.signals.connect(self.start_listening, signals.engine_started)
self.crawler.signals.connect(self.stop_listening, signals.engine_stopped)
@ -64,9 +67,11 @@ class TelnetConsole(protocol.ServerFactory):
def start_listening(self):
self.port = listen_tcp(self.portrange, self.host, self)
h = self.port.getHost()
logger.info("Telnet console listening on %(host)s:%(port)d",
{'host': h.host, 'port': h.port},
extra={'crawler': self.crawler})
logger.info(
"Telnet console listening on %(host)s:%(port)d",
{"host": h.host, "port": h.port},
extra={"crawler": self.crawler},
)
def stop_listening(self):
self.port.stopListening()
@ -74,41 +79,37 @@ class TelnetConsole(protocol.ServerFactory):
def protocol(self):
class Portal:
"""An implementation of IPortal"""
@defers
def login(self_, credentials, mind, *interfaces):
if not (
credentials.username == self.username.encode('utf8')
and credentials.checkPassword(self.password.encode('utf8'))
credentials.username == self.username.encode("utf8")
and credentials.checkPassword(self.password.encode("utf8"))
):
raise ValueError("Invalid credentials")
protocol = telnet.TelnetBootstrapProtocol(
insults.ServerProtocol,
manhole.Manhole,
self._get_telnet_vars()
insults.ServerProtocol, manhole.Manhole, self._get_telnet_vars()
)
return (interfaces[0], protocol, lambda: None)
return telnet.TelnetTransport(
telnet.AuthenticatingTelnetProtocol,
Portal()
)
return telnet.TelnetTransport(telnet.AuthenticatingTelnetProtocol, Portal())
def _get_telnet_vars(self):
# Note: if you add entries here also update topics/telnetconsole.rst
telnet_vars = {
'engine': self.crawler.engine,
'spider': self.crawler.engine.spider,
'slot': self.crawler.engine.slot,
'crawler': self.crawler,
'extensions': self.crawler.extensions,
'stats': self.crawler.stats,
'settings': self.crawler.settings,
'est': lambda: print_engine_status(self.crawler.engine),
'p': pprint.pprint,
'prefs': print_live_refs,
'help': "This is Scrapy telnet console. For more info see: "
"https://docs.scrapy.org/en/latest/topics/telnetconsole.html",
"engine": self.crawler.engine,
"spider": self.crawler.engine.spider,
"slot": self.crawler.engine.slot,
"crawler": self.crawler,
"extensions": self.crawler.extensions,
"stats": self.crawler.stats,
"settings": self.crawler.settings,
"est": lambda: print_engine_status(self.crawler.engine),
"p": pprint.pprint,
"prefs": print_live_refs,
"help": "This is Scrapy telnet console. For more info see: "
"https://docs.scrapy.org/en/latest/topics/telnetconsole.html",
}
self.crawler.signals.send_catch_log(update_telnet_vars, telnet_vars=telnet_vars)
return telnet_vars


@ -7,16 +7,19 @@ logger = logging.getLogger(__name__)
class AutoThrottle:
def __init__(self, crawler):
self.crawler = crawler
if not crawler.settings.getbool('AUTOTHROTTLE_ENABLED'):
if not crawler.settings.getbool("AUTOTHROTTLE_ENABLED"):
raise NotConfigured
self.debug = crawler.settings.getbool("AUTOTHROTTLE_DEBUG")
self.target_concurrency = crawler.settings.getfloat("AUTOTHROTTLE_TARGET_CONCURRENCY")
self.target_concurrency = crawler.settings.getfloat(
"AUTOTHROTTLE_TARGET_CONCURRENCY"
)
crawler.signals.connect(self._spider_opened, signal=signals.spider_opened)
crawler.signals.connect(self._response_downloaded, signal=signals.response_downloaded)
crawler.signals.connect(
self._response_downloaded, signal=signals.response_downloaded
)
@classmethod
def from_crawler(cls, crawler):
@ -29,17 +32,19 @@ class AutoThrottle:
def _min_delay(self, spider):
s = self.crawler.settings
return getattr(spider, 'download_delay', s.getfloat('DOWNLOAD_DELAY'))
return getattr(spider, "download_delay", s.getfloat("DOWNLOAD_DELAY"))
def _max_delay(self, spider):
return self.crawler.settings.getfloat('AUTOTHROTTLE_MAX_DELAY')
return self.crawler.settings.getfloat("AUTOTHROTTLE_MAX_DELAY")
def _start_delay(self, spider):
return max(self.mindelay, self.crawler.settings.getfloat('AUTOTHROTTLE_START_DELAY'))
return max(
self.mindelay, self.crawler.settings.getfloat("AUTOTHROTTLE_START_DELAY")
)
def _response_downloaded(self, response, request, spider):
key, slot = self._get_slot(request, spider)
latency = request.meta.get('download_latency')
latency = request.meta.get("download_latency")
if latency is None or slot is None:
return
@ -54,15 +59,18 @@ class AutoThrottle:
"delay:%(delay)5d ms (%(delaydiff)+d) | "
"latency:%(latency)5d ms | size:%(size)6d bytes",
{
'slot': key, 'concurrency': conc,
'delay': slot.delay * 1000, 'delaydiff': diff * 1000,
'latency': latency * 1000, 'size': size,
"slot": key,
"concurrency": conc,
"delay": slot.delay * 1000,
"delaydiff": diff * 1000,
"latency": latency * 1000,
"size": size,
},
extra={'spider': spider}
extra={"spider": spider},
)
def _get_slot(self, request, spider):
key = request.meta.get('download_slot')
key = request.meta.get("download_slot")
return key, self.crawler.engine.downloader.slots.get(key)
def _adjust_delay(self, slot, latency, response):
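For reference, the settings the AutoThrottle extension above reads, sketched with illustrative values:

# Setting names are taken from the AutoThrottle code above; values are examples.
AUTOTHROTTLE_ENABLED = True
AUTOTHROTTLE_START_DELAY = 5.0          # initial delay, bounded below by DOWNLOAD_DELAY
AUTOTHROTTLE_MAX_DELAY = 60.0
AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
AUTOTHROTTLE_DEBUG = True               # log throttling stats for every received response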


@ -3,4 +3,5 @@ def obsolete_setter(setter, attrname):
c = self.__class__.__name__
msg = f"{c}.{attrname} is not modifiable, use {c}.replace() instead"
raise AttributeError(msg)
return newsetter


@ -36,7 +36,7 @@ class CookieJar:
if not IPV4_RE.search(req_host):
hosts = potential_domain_matches(req_host)
if '.' not in req_host:
if "." not in req_host:
hosts += [req_host + ".local"]
else:
hosts = [req_host]
@ -96,14 +96,14 @@ def potential_domain_matches(domain):
"""
matches = [domain]
try:
start = domain.index('.') + 1
end = domain.rindex('.')
start = domain.index(".") + 1
end = domain.rindex(".")
while start < end:
matches.append(domain[start:])
start = domain.index('.', start) + 1
start = domain.index(".", start) + 1
except ValueError:
pass
return matches + ['.' + d for d in matches]
return matches + ["." + d for d in matches]
class _DummyLock:
@ -140,7 +140,7 @@ class WrappedRequest:
HTML document, and the user had no option to approve the automatic
fetching of the image, this should be true.
"""
return self.request.meta.get('is_unverifiable', False)
return self.request.meta.get("is_unverifiable", False)
@property
def full_url(self):
@ -166,13 +166,14 @@ class WrappedRequest:
return name in self.request.headers
def get_header(self, name, default=None):
return to_unicode(self.request.headers.get(name, default),
errors='replace')
return to_unicode(self.request.headers.get(name, default), errors="replace")
def header_items(self):
return [
(to_unicode(k, errors='replace'),
[to_unicode(x, errors='replace') for x in v])
(
to_unicode(k, errors="replace"),
[to_unicode(x, errors="replace") for x in v],
)
for k, v in self.request.headers.items()
]
@ -181,7 +182,6 @@ class WrappedRequest:
class WrappedResponse:
def __init__(self, response):
self.response = response
@ -189,5 +189,6 @@ class WrappedResponse:
return self
def get_all(self, name, default=None):
return [to_unicode(v, errors='replace')
for v in self.response.headers.getlist(name)]
return [
to_unicode(v, errors="replace") for v in self.response.headers.getlist(name)
]


@ -8,7 +8,7 @@ from scrapy.utils.python import to_unicode
class Headers(CaselessDict):
"""Case insensitive http headers dictionary"""
def __init__(self, seq=None, encoding='utf-8'):
def __init__(self, seq=None, encoding="utf-8"):
self.encoding = encoding
super().__init__(seq)
@ -29,7 +29,7 @@ class Headers(CaselessDict):
value = []
elif isinstance(value, (str, bytes)):
value = [value]
elif not hasattr(value, '__iter__'):
elif not hasattr(value, "__iter__"):
value = [value]
return [self._tobytes(x) for x in value]
@ -41,7 +41,7 @@ class Headers(CaselessDict):
return x.encode(self.encoding)
if isinstance(x, int):
return str(x).encode(self.encoding)
raise TypeError(f'Unsupported value type: {type(x)}')
raise TypeError(f"Unsupported value type: {type(x)}")
def __getitem__(self, key):
try:
@ -84,13 +84,16 @@ class Headers(CaselessDict):
return headers_dict_to_raw(self)
def to_unicode_dict(self):
""" Return headers as a CaselessDict with unicode keys
"""Return headers as a CaselessDict with unicode keys
and unicode values. Multiple values are joined with ','.
"""
return CaselessDict(
(to_unicode(key, encoding=self.encoding),
to_unicode(b','.join(value), encoding=self.encoding))
for key, value in self.items())
(
to_unicode(key, encoding=self.encoding),
to_unicode(b",".join(value), encoding=self.encoding),
)
for key, value in self.items()
)
def __copy__(self):
return self.__class__(self)
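A small sketch of the multi-value behaviour described in to_unicode_dict() above; the header values are arbitrary:

from scrapy.http import Headers

h = Headers({"Accept": "text/html"})
h.appendlist("Accept", "application/json")  # Headers stores a list of values per key
print(h.to_unicode_dict()["accept"])        # -> "text/html,application/json"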


@ -27,9 +27,19 @@ class Request(object_ref):
"""
attributes: Tuple[str, ...] = (
"url", "callback", "method", "headers", "body",
"cookies", "meta", "encoding", "priority",
"dont_filter", "errback", "flags", "cb_kwargs",
"url",
"callback",
"method",
"headers",
"body",
"cookies",
"meta",
"encoding",
"priority",
"dont_filter",
"errback",
"flags",
"cb_kwargs",
)
"""A tuple of :class:`str` objects containing the name of all public
attributes of the class that are also keyword parameters of the
@ -64,9 +74,11 @@ class Request(object_ref):
self.priority = priority
if callback is not None and not callable(callback):
raise TypeError(f'callback must be a callable, got {type(callback).__name__}')
raise TypeError(
f"callback must be a callable, got {type(callback).__name__}"
)
if errback is not None and not callable(errback):
raise TypeError(f'errback must be a callable, got {type(errback).__name__}')
raise TypeError(f"errback must be a callable, got {type(errback).__name__}")
self.callback = callback
self.errback = errback
@ -101,13 +113,13 @@ class Request(object_ref):
self._url = escape_ajax(s)
if (
'://' not in self._url
and not self._url.startswith('about:')
and not self._url.startswith('data:')
"://" not in self._url
and not self._url.startswith("about:")
and not self._url.startswith("data:")
):
raise ValueError(f'Missing scheme in request url: {self._url}')
raise ValueError(f"Missing scheme in request url: {self._url}")
url = property(_get_url, obsolete_setter(_set_url, 'url'))
url = property(_get_url, obsolete_setter(_set_url, "url"))
def _get_body(self) -> bytes:
return self._body
@ -115,7 +127,7 @@ class Request(object_ref):
def _set_body(self, body: Optional[Union[str, bytes]]) -> None:
self._body = b"" if body is None else to_bytes(body, self.encoding)
body = property(_get_body, obsolete_setter(_set_body, 'body'))
body = property(_get_body, obsolete_setter(_set_body, "body"))
@property
def encoding(self) -> str:
@ -131,12 +143,15 @@ class Request(object_ref):
"""Create a new Request with the same attributes except for those given new values"""
for x in self.attributes:
kwargs.setdefault(x, getattr(self, x))
cls = kwargs.pop('cls', self.__class__)
cls = kwargs.pop("cls", self.__class__)
return cls(*args, **kwargs)
@classmethod
def from_curl(
cls: Type[RequestTypeVar], curl_command: str, ignore_unknown_options: bool = True, **kwargs
cls: Type[RequestTypeVar],
curl_command: str,
ignore_unknown_options: bool = True,
**kwargs,
) -> RequestTypeVar:
"""Create a Request object from a string containing a `cURL
<https://curl.haxx.se/>`_ command. It populates the HTTP method, the
@ -179,21 +194,25 @@ class Request(object_ref):
"""
d = {
"url": self.url, # urls are safe (safe_string_url)
"callback": _find_method(spider, self.callback) if callable(self.callback) else self.callback,
"errback": _find_method(spider, self.errback) if callable(self.errback) else self.errback,
"callback": _find_method(spider, self.callback)
if callable(self.callback)
else self.callback,
"errback": _find_method(spider, self.errback)
if callable(self.errback)
else self.errback,
"headers": dict(self.headers),
}
for attr in self.attributes:
d.setdefault(attr, getattr(self, attr))
if type(self) is not Request: # pylint: disable=unidiomatic-typecheck
d["_class"] = self.__module__ + '.' + self.__class__.__name__
d["_class"] = self.__module__ + "." + self.__class__.__name__
return d
def _find_method(obj, func):
"""Helper function for Request.to_dict"""
# Only instance methods contain ``__func__``
if obj and hasattr(func, '__func__'):
if obj and hasattr(func, "__func__"):
members = inspect.getmembers(obj, predicate=inspect.ismethod)
for name, obj_func in members:
# We need to use __func__ to access the original function object because instance
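A minimal sketch of Request.from_curl() as described in its docstring above; the URL, header and payload are placeholders:

from scrapy import Request

req = Request.from_curl(
    "curl 'https://example.com/api' -X POST -H 'Accept: application/json' -d 'q=scrapy'"
)
# req.method == "POST"; the -H header and the -d payload end up in req.headers / req.body.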


@ -24,22 +24,26 @@ FormdataType = Optional[Union[dict, List[Tuple[str, str]]]]
class FormRequest(Request):
valid_form_methods = ['GET', 'POST']
valid_form_methods = ["GET", "POST"]
def __init__(self, *args, formdata: FormdataType = None, **kwargs) -> None:
if formdata and kwargs.get('method') is None:
kwargs['method'] = 'POST'
if formdata and kwargs.get("method") is None:
kwargs["method"] = "POST"
super().__init__(*args, **kwargs)
if formdata:
items = formdata.items() if isinstance(formdata, dict) else formdata
form_query_str = _urlencode(items, self.encoding)
if self.method == 'POST':
self.headers.setdefault(b'Content-Type', b'application/x-www-form-urlencoded')
if self.method == "POST":
self.headers.setdefault(
b"Content-Type", b"application/x-www-form-urlencoded"
)
self._set_body(form_query_str)
else:
self._set_url(urlunsplit(urlsplit(self.url)._replace(query=form_query_str)))
self._set_url(
urlunsplit(urlsplit(self.url)._replace(query=form_query_str))
)
@classmethod
def from_response(
@ -55,28 +59,29 @@ class FormRequest(Request):
formcss: Optional[str] = None,
**kwargs,
) -> FormRequestTypeVar:
kwargs.setdefault('encoding', response.encoding)
kwargs.setdefault("encoding", response.encoding)
if formcss is not None:
from parsel.csstranslator import HTMLTranslator
formxpath = HTMLTranslator().css_to_xpath(formcss)
form = _get_form(response, formname, formid, formnumber, formxpath)
formdata = _get_inputs(form, formdata, dont_click, clickdata)
url = _get_form_url(form, kwargs.pop('url', None))
url = _get_form_url(form, kwargs.pop("url", None))
method = kwargs.pop('method', form.method)
method = kwargs.pop("method", form.method)
if method is not None:
method = method.upper()
if method not in cls.valid_form_methods:
method = 'GET'
method = "GET"
return cls(url=url, method=method, formdata=formdata, **kwargs)
def _get_form_url(form: FormElement, url: Optional[str]) -> str:
if url is None:
action = form.get('action')
action = form.get("action")
if action is None:
return form.base_url
return urljoin(form.base_url, strip_html5_whitespace(action))
@ -84,9 +89,11 @@ def _get_form_url(form: FormElement, url: Optional[str]) -> str:
def _urlencode(seq: Iterable, enc: str) -> str:
values = [(to_bytes(k, enc), to_bytes(v, enc))
for k, vs in seq
for v in (vs if is_listlike(vs) else [vs])]
values = [
(to_bytes(k, enc), to_bytes(v, enc))
for k, vs in seq
for v in (vs if is_listlike(vs) else [vs])
]
return urlencode(values, doseq=True)
@ -99,7 +106,7 @@ def _get_form(
) -> FormElement:
"""Find the wanted form element within the given response."""
root = create_root_node(response.text, HTMLParser, base_url=get_base_url(response))
forms = root.xpath('//form')
forms = root.xpath("//form")
if not forms:
raise ValueError(f"No <form> element found in {response}")
@ -119,12 +126,12 @@ def _get_form(
if nodes:
el = nodes[0]
while True:
if el.tag == 'form':
if el.tag == "form":
return el
el = el.getparent()
if el is None:
break
raise ValueError(f'No <form> element found with {formxpath}')
raise ValueError(f"No <form> element found with {formxpath}")
# If we get here, it means that either formname was None or invalid
if formnumber is not None:
@ -146,19 +153,21 @@ def _get_inputs(
try:
formdata_keys = dict(formdata or ()).keys()
except (ValueError, TypeError):
raise ValueError('formdata should be a dict or iterable of tuples')
raise ValueError("formdata should be a dict or iterable of tuples")
if not formdata:
formdata = []
inputs = form.xpath('descendant::textarea'
'|descendant::select'
'|descendant::input[not(@type) or @type['
' not(re:test(., "^(?:submit|image|reset)$", "i"))'
' and (../@checked or'
' not(re:test(., "^(?:checkbox|radio)$", "i")))]]',
namespaces={"re": "http://exslt.org/regular-expressions"})
inputs = form.xpath(
"descendant::textarea"
"|descendant::select"
"|descendant::input[not(@type) or @type["
' not(re:test(., "^(?:submit|image|reset)$", "i"))'
" and (../@checked or"
' not(re:test(., "^(?:checkbox|radio)$", "i")))]]',
namespaces={"re": "http://exslt.org/regular-expressions"},
)
values = [
(k, '' if v is None else v)
(k, "" if v is None else v)
for k, v in (_value(e) for e in inputs)
if k and k not in formdata_keys
]
@ -178,7 +187,7 @@ def _get_inputs(
def _value(ele: HtmlElement):
n = ele.name
v = ele.value
if ele.tag == 'select':
if ele.tag == "select":
return _select_value(ele, n, v)
return n, v
@ -193,51 +202,57 @@ def _select_value(ele: SelectElement, n: str, v: str):
if v is not None and multiple:
# This is a workround to bug in lxml fixed 2.3.1
# fix https://github.com/lxml/lxml/commit/57f49eed82068a20da3db8f1b18ae00c1bab8b12#L1L1139
selected_options = ele.xpath('.//option[@selected]')
values = [(o.get('value') or o.text or '').strip() for o in selected_options]
selected_options = ele.xpath(".//option[@selected]")
values = [(o.get("value") or o.text or "").strip() for o in selected_options]
return n, values
return n, v
def _get_clickable(clickdata: Optional[dict], form: FormElement) -> Optional[Tuple[str, str]]:
def _get_clickable(
clickdata: Optional[dict], form: FormElement
) -> Optional[Tuple[str, str]]:
"""
Returns the clickable element specified in clickdata,
if the latter is given. If not, it returns the first
clickable element found
"""
clickables = list(form.xpath(
'descendant::input[re:test(@type, "^(submit|image)$", "i")]'
'|descendant::button[not(@type) or re:test(@type, "^submit$", "i")]',
namespaces={"re": "http://exslt.org/regular-expressions"}
))
clickables = list(
form.xpath(
'descendant::input[re:test(@type, "^(submit|image)$", "i")]'
'|descendant::button[not(@type) or re:test(@type, "^submit$", "i")]',
namespaces={"re": "http://exslt.org/regular-expressions"},
)
)
if not clickables:
return None
# If we don't have clickdata, we just use the first clickable element
if clickdata is None:
el = clickables[0]
return (el.get('name'), el.get('value') or '')
return (el.get("name"), el.get("value") or "")
# If clickdata is given, we compare it to the clickable elements to find a
# match. We first look to see if the number is specified in clickdata,
# because that uniquely identifies the element
nr = clickdata.get('nr', None)
nr = clickdata.get("nr", None)
if nr is not None:
try:
el = list(form.inputs)[nr]
except IndexError:
pass
else:
return (el.get('name'), el.get('value') or '')
return (el.get("name"), el.get("value") or "")
# We didn't find it, so now we build an XPath expression out of the other
# arguments, because they can be used as such
xpath = './/*' + ''.join(f'[@{k}="{v}"]' for k, v in clickdata.items())
xpath = ".//*" + "".join(f'[@{k}="{v}"]' for k, v in clickdata.items())
el = form.xpath(xpath)
if len(el) == 1:
return (el[0].get('name'), el[0].get('value') or '')
return (el[0].get("name"), el[0].get("value") or "")
if len(el) > 1:
raise ValueError(f"Multiple elements found ({el!r}) matching the "
f"criteria in clickdata: {clickdata!r}")
raise ValueError(
f"Multiple elements found ({el!r}) matching the "
f"criteria in clickdata: {clickdata!r}"
)
else:
raise ValueError(f'No clickable element matching clickdata: {clickdata!r}')
raise ValueError(f"No clickable element matching clickdata: {clickdata!r}")


@ -20,42 +20,44 @@ class JsonRequest(Request):
def __init__(self, *args, dumps_kwargs: Optional[dict] = None, **kwargs) -> None:
dumps_kwargs = copy.deepcopy(dumps_kwargs) if dumps_kwargs is not None else {}
dumps_kwargs.setdefault('sort_keys', True)
dumps_kwargs.setdefault("sort_keys", True)
self._dumps_kwargs = dumps_kwargs
body_passed = kwargs.get('body', None) is not None
data = kwargs.pop('data', None)
body_passed = kwargs.get("body", None) is not None
data = kwargs.pop("data", None)
data_passed = data is not None
if body_passed and data_passed:
warnings.warn('Both body and data passed. data will be ignored')
warnings.warn("Both body and data passed. data will be ignored")
elif not body_passed and data_passed:
kwargs['body'] = self._dumps(data)
if 'method' not in kwargs:
kwargs['method'] = 'POST'
kwargs["body"] = self._dumps(data)
if "method" not in kwargs:
kwargs["method"] = "POST"
super().__init__(*args, **kwargs)
self.headers.setdefault('Content-Type', 'application/json')
self.headers.setdefault('Accept', 'application/json, text/javascript, */*; q=0.01')
self.headers.setdefault("Content-Type", "application/json")
self.headers.setdefault(
"Accept", "application/json, text/javascript, */*; q=0.01"
)
@property
def dumps_kwargs(self) -> dict:
return self._dumps_kwargs
def replace(self, *args, **kwargs) -> Request:
body_passed = kwargs.get('body', None) is not None
data = kwargs.pop('data', None)
body_passed = kwargs.get("body", None) is not None
data = kwargs.pop("data", None)
data_passed = data is not None
if body_passed and data_passed:
warnings.warn('Both body and data passed. data will be ignored')
warnings.warn("Both body and data passed. data will be ignored")
elif not body_passed and data_passed:
kwargs['body'] = self._dumps(data)
kwargs["body"] = self._dumps(data)
return super().replace(*args, **kwargs)
def _dumps(self, data: dict) -> str:
"""Convert to JSON """
"""Convert to JSON"""
return json.dumps(data, **self._dumps_kwargs)
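A minimal sketch of the data-to-body handling above; the URL and payload are placeholders:

from scrapy.http import JsonRequest

req = JsonRequest(url="https://example.com/api", data={"name": "value"})
# data= is serialized into the body (sort_keys=True by default), the method
# defaults to POST, and the Content-Type/Accept headers are set as shown above.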


@ -15,21 +15,20 @@ DUMPS_ARGS = get_func_args(xmlrpclib.dumps)
class XmlRpcRequest(Request):
def __init__(self, *args, encoding: Optional[str] = None, **kwargs):
if 'body' not in kwargs and 'params' in kwargs:
if "body" not in kwargs and "params" in kwargs:
kw = dict((k, kwargs.pop(k)) for k in DUMPS_ARGS if k in kwargs)
kwargs['body'] = xmlrpclib.dumps(**kw)
kwargs["body"] = xmlrpclib.dumps(**kw)
# spec defines that requests must use POST method
kwargs.setdefault('method', 'POST')
kwargs.setdefault("method", "POST")
# xmlrpc query multiples times over the same url
kwargs.setdefault('dont_filter', True)
kwargs.setdefault("dont_filter", True)
# restore encoding
if encoding is not None:
kwargs['encoding'] = encoding
kwargs["encoding"] = encoding
super().__init__(*args, **kwargs)
self.headers.setdefault('Content-Type', 'text/xml')
self.headers.setdefault("Content-Type", "text/xml")


@ -21,7 +21,15 @@ class Response(object_ref):
"""
attributes: Tuple[str, ...] = (
"url", "status", "headers", "body", "flags", "request", "certificate", "ip_address", "protocol",
"url",
"status",
"headers",
"body",
"flags",
"request",
"certificate",
"ip_address",
"protocol",
)
"""A tuple of :class:`str` objects containing the name of all public
attributes of the class that are also keyword parameters of the
@ -79,26 +87,28 @@ class Response(object_ref):
if isinstance(url, str):
self._url = url
else:
raise TypeError(f'{type(self).__name__} url must be str, '
f'got {type(url).__name__}')
raise TypeError(
f"{type(self).__name__} url must be str, " f"got {type(url).__name__}"
)
url = property(_get_url, obsolete_setter(_set_url, 'url'))
url = property(_get_url, obsolete_setter(_set_url, "url"))
def _get_body(self):
return self._body
def _set_body(self, body):
if body is None:
self._body = b''
self._body = b""
elif not isinstance(body, bytes):
raise TypeError(
"Response body must be bytes. "
"If you want to pass unicode body use TextResponse "
"or HtmlResponse.")
"or HtmlResponse."
)
else:
self._body = body
body = property(_get_body, obsolete_setter(_set_body, 'body'))
body = property(_get_body, obsolete_setter(_set_body, "body"))
def __repr__(self):
return f"<{self.status} {self.url}>"
@ -111,7 +121,7 @@ class Response(object_ref):
"""Create a new Response with the same attributes except for those given new values"""
for x in self.attributes:
kwargs.setdefault(x, getattr(self, x))
cls = kwargs.pop('cls', self.__class__)
cls = kwargs.pop("cls", self.__class__)
return cls(*args, **kwargs)
def urljoin(self, url):
@ -138,9 +148,22 @@ class Response(object_ref):
"""
raise NotSupported("Response content isn't text")
def follow(self, url, callback=None, method='GET', headers=None, body=None,
cookies=None, meta=None, encoding='utf-8', priority=0,
dont_filter=False, errback=None, cb_kwargs=None, flags=None) -> Request:
def follow(
self,
url,
callback=None,
method="GET",
headers=None,
body=None,
cookies=None,
meta=None,
encoding="utf-8",
priority=0,
dont_filter=False,
errback=None,
cb_kwargs=None,
flags=None,
) -> Request:
"""
Return a :class:`~.Request` instance to follow a link ``url``.
It accepts the same arguments as ``Request.__init__`` method,
@ -176,10 +199,22 @@ class Response(object_ref):
flags=flags,
)
def follow_all(self, urls, callback=None, method='GET', headers=None, body=None,
cookies=None, meta=None, encoding='utf-8', priority=0,
dont_filter=False, errback=None, cb_kwargs=None,
flags=None) -> Generator[Request, None, None]:
def follow_all(
self,
urls,
callback=None,
method="GET",
headers=None,
body=None,
cookies=None,
meta=None,
encoding="utf-8",
priority=0,
dont_filter=False,
errback=None,
cb_kwargs=None,
flags=None,
) -> Generator[Request, None, None]:
"""
.. versionadded:: 2.0
@ -192,7 +227,7 @@ class Response(object_ref):
method which supports selectors in addition to absolute/relative URLs
and Link objects.
"""
if not hasattr(urls, '__iter__'):
if not hasattr(urls, "__iter__"):
raise TypeError("'urls' argument must be an iterable")
return (
self.follow(


@ -30,13 +30,13 @@ _NONE = object()
class TextResponse(Response):
_DEFAULT_ENCODING = 'ascii'
_DEFAULT_ENCODING = "ascii"
_cached_decoded_json = _NONE
attributes: Tuple[str, ...] = Response.attributes + ("encoding",)
def __init__(self, *args, **kwargs):
self._encoding = kwargs.pop('encoding', None)
self._encoding = kwargs.pop("encoding", None)
self._cached_benc = None
self._cached_ubody = None
self._cached_selector = None
@ -49,11 +49,13 @@ class TextResponse(Response):
super()._set_url(url)
def _set_body(self, body):
self._body = b'' # used by encoding detection
self._body = b"" # used by encoding detection
if isinstance(body, str):
if self._encoding is None:
raise TypeError('Cannot convert unicode body - '
f'{type(self).__name__} has no encoding')
raise TypeError(
"Cannot convert unicode body - "
f"{type(self).__name__} has no encoding"
)
self._body = body.encode(self._encoding)
else:
super()._set_body(body)
@ -82,12 +84,12 @@ class TextResponse(Response):
@property
def text(self):
""" Body as unicode """
"""Body as unicode"""
# access self.encoding before _cached_ubody to make sure
# _body_inferred_encoding is called
benc = self.encoding
if self._cached_ubody is None:
charset = f'charset={benc}'
charset = f"charset={benc}"
self._cached_ubody = html_to_unicode(charset, self.body)[1]
return self._cached_ubody
@ -98,21 +100,24 @@ class TextResponse(Response):
@memoizemethod_noargs
def _headers_encoding(self):
content_type = self.headers.get(b'Content-Type', b'')
content_type = self.headers.get(b"Content-Type", b"")
return http_content_type_encoding(to_unicode(content_type))
def _body_inferred_encoding(self):
if self._cached_benc is None:
content_type = to_unicode(self.headers.get(b'Content-Type', b''))
benc, ubody = html_to_unicode(content_type, self.body,
auto_detect_fun=self._auto_detect_fun,
default_encoding=self._DEFAULT_ENCODING)
content_type = to_unicode(self.headers.get(b"Content-Type", b""))
benc, ubody = html_to_unicode(
content_type,
self.body,
auto_detect_fun=self._auto_detect_fun,
default_encoding=self._DEFAULT_ENCODING,
)
self._cached_benc = benc
self._cached_ubody = ubody
return self._cached_benc
def _auto_detect_fun(self, text):
for enc in (self._DEFAULT_ENCODING, 'utf-8', 'cp1252'):
for enc in (self._DEFAULT_ENCODING, "utf-8", "cp1252"):
try:
text.decode(enc)
except UnicodeError:
@ -130,6 +135,7 @@ class TextResponse(Response):
@property
def selector(self):
from scrapy.selector import Selector
if self._cached_selector is None:
self._cached_selector = Selector(self)
return self._cached_selector
@ -140,9 +146,22 @@ class TextResponse(Response):
def css(self, query):
return self.selector.css(query)
def follow(self, url, callback=None, method='GET', headers=None, body=None,
cookies=None, meta=None, encoding=None, priority=0,
dont_filter=False, errback=None, cb_kwargs=None, flags=None) -> Request:
def follow(
self,
url,
callback=None,
method="GET",
headers=None,
body=None,
cookies=None,
meta=None,
encoding=None,
priority=0,
dont_filter=False,
errback=None,
cb_kwargs=None,
flags=None,
) -> Request:
"""
Return a :class:`~.Request` instance to follow a link ``url``.
It accepts the same arguments as ``Request.__init__`` method,
@ -180,10 +199,24 @@ class TextResponse(Response):
flags=flags,
)
def follow_all(self, urls=None, callback=None, method='GET', headers=None, body=None,
cookies=None, meta=None, encoding=None, priority=0,
dont_filter=False, errback=None, cb_kwargs=None, flags=None,
css=None, xpath=None) -> Generator[Request, None, None]:
def follow_all(
self,
urls=None,
callback=None,
method="GET",
headers=None,
body=None,
cookies=None,
meta=None,
encoding=None,
priority=0,
dont_filter=False,
errback=None,
cb_kwargs=None,
flags=None,
css=None,
xpath=None,
) -> Generator[Request, None, None]:
"""
A generator that produces :class:`~.Request` instances to follow all
links in ``urls``. It accepts the same arguments as the :class:`~.Request`'s
@ -251,12 +284,13 @@ def _url_from_selector(sel):
if isinstance(sel.root, str):
# e.g. ::attr(href) result
return strip_html5_whitespace(sel.root)
if not hasattr(sel.root, 'tag'):
if not hasattr(sel.root, "tag"):
raise _InvalidSelector(f"Unsupported selector: {sel}")
if sel.root.tag not in ('a', 'link'):
raise _InvalidSelector("Only <a> and <link> elements are supported; "
f"got <{sel.root.tag}>")
href = sel.root.get('href')
if sel.root.tag not in ("a", "link"):
raise _InvalidSelector(
"Only <a> and <link> elements are supported; " f"got <{sel.root.tag}>"
)
href = sel.root.get("href")
if href is None:
raise _InvalidSelector(f"<{sel.root.tag}> element has no href attribute: {sel}")
return strip_html5_whitespace(href)
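A short sketch of the css shortcut that TextResponse.follow_all() adds on top of Response.follow_all(); the spider name, URL, selector and callbacks are placeholders:

import scrapy

class CatalogSpider(scrapy.Spider):
    name = "catalog_example"                      # illustrative
    start_urls = ["https://example.com/catalog"]  # illustrative

    def parse(self, response):
        # One Request per matched <a>/<link>; relative hrefs are resolved against response.url.
        yield from response.follow_all(css="a.product", callback=self.parse_product)

    def parse_product(self, response):
        yield {"url": response.url, "title": response.css("title::text").get()}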


@ -2,7 +2,6 @@ from zope.interface import Interface
class ISpiderLoader(Interface):
def from_settings(settings):
"""Return an instance of the class for the given settings"""


@ -24,11 +24,11 @@ class ItemMeta(ABCMeta):
"""
def __new__(mcs, class_name, bases, attrs):
classcell = attrs.pop('__classcell__', None)
new_bases = tuple(base._class for base in bases if hasattr(base, '_class'))
_class = super().__new__(mcs, 'x_' + class_name, new_bases, attrs)
classcell = attrs.pop("__classcell__", None)
new_bases = tuple(base._class for base in bases if hasattr(base, "_class"))
_class = super().__new__(mcs, "x_" + class_name, new_bases, attrs)
fields = getattr(_class, 'fields', {})
fields = getattr(_class, "fields", {})
new_attrs = {}
for n in dir(_class):
v = getattr(_class, n)
@ -37,10 +37,10 @@ class ItemMeta(ABCMeta):
elif n in attrs:
new_attrs[n] = attrs[n]
new_attrs['fields'] = fields
new_attrs['_class'] = _class
new_attrs["fields"] = fields
new_attrs["_class"] = _class
if classcell is not None:
new_attrs['__classcell__'] = classcell
new_attrs["__classcell__"] = classcell
return super().__new__(mcs, class_name, bases, new_attrs)
@ -93,7 +93,7 @@ class Item(MutableMapping, object_ref, metaclass=ItemMeta):
raise AttributeError(name)
def __setattr__(self, name, value):
if not name.startswith('_'):
if not name.startswith("_"):
raise AttributeError(f"Use item[{name!r}] = {value!r} to set field value")
super().__setattr__(name, value)
@ -115,6 +115,5 @@ class Item(MutableMapping, object_ref, metaclass=ItemMeta):
return self.__class__(self)
def deepcopy(self):
"""Return a :func:`~copy.deepcopy` of this item.
"""
"""Return a :func:`~copy.deepcopy` of this item."""
return deepcopy(self)


@ -24,9 +24,9 @@ class Link:
of the anchor tag.
"""
__slots__ = ['url', 'text', 'fragment', 'nofollow']
__slots__ = ["url", "text", "fragment", "nofollow"]
def __init__(self, url, text='', fragment='', nofollow=False):
def __init__(self, url, text="", fragment="", nofollow=False):
if not isinstance(url, str):
got = url.__class__.__name__
raise TypeError(f"Link urls must be str objects, got {got}")
@ -44,10 +44,12 @@ class Link:
)
def __hash__(self):
return hash(self.url) ^ hash(self.text) ^ hash(self.fragment) ^ hash(self.nofollow)
return (
hash(self.url) ^ hash(self.text) ^ hash(self.fragment) ^ hash(self.nofollow)
)
def __repr__(self):
return (
f'Link(url={self.url!r}, text={self.text!r}, '
f'fragment={self.fragment!r}, nofollow={self.nofollow!r})'
f"Link(url={self.url!r}, text={self.text!r}, "
f"fragment={self.fragment!r}, nofollow={self.nofollow!r})"
)


@ -10,25 +10,81 @@ import re
# common file extensions that are not followed if they occur in links
IGNORED_EXTENSIONS = [
# archives
'7z', '7zip', 'bz2', 'rar', 'tar', 'tar.gz', 'xz', 'zip',
"7z",
"7zip",
"bz2",
"rar",
"tar",
"tar.gz",
"xz",
"zip",
# images
'mng', 'pct', 'bmp', 'gif', 'jpg', 'jpeg', 'png', 'pst', 'psp', 'tif',
'tiff', 'ai', 'drw', 'dxf', 'eps', 'ps', 'svg', 'cdr', 'ico',
"mng",
"pct",
"bmp",
"gif",
"jpg",
"jpeg",
"png",
"pst",
"psp",
"tif",
"tiff",
"ai",
"drw",
"dxf",
"eps",
"ps",
"svg",
"cdr",
"ico",
# audio
'mp3', 'wma', 'ogg', 'wav', 'ra', 'aac', 'mid', 'au', 'aiff',
"mp3",
"wma",
"ogg",
"wav",
"ra",
"aac",
"mid",
"au",
"aiff",
# video
'3gp', 'asf', 'asx', 'avi', 'mov', 'mp4', 'mpg', 'qt', 'rm', 'swf', 'wmv',
'm4a', 'm4v', 'flv', 'webm',
"3gp",
"asf",
"asx",
"avi",
"mov",
"mp4",
"mpg",
"qt",
"rm",
"swf",
"wmv",
"m4a",
"m4v",
"flv",
"webm",
# office suites
'xls', 'xlsx', 'ppt', 'pptx', 'pps', 'doc', 'docx', 'odt', 'ods', 'odg',
'odp',
"xls",
"xlsx",
"ppt",
"pptx",
"pps",
"doc",
"docx",
"odt",
"ods",
"odg",
"odp",
# other
'css', 'pdf', 'exe', 'bin', 'rss', 'dmg', 'iso', 'apk'
"css",
"pdf",
"exe",
"bin",
"rss",
"dmg",
"iso",
"apk",
]
@ -40,7 +96,7 @@ def _matches(url, regexs):
def _is_valid_url(url):
return url.split('://', 1)[0] in {'http', 'https', 'file', 'ftp'}
return url.split("://", 1)[0] in {"http", "https", "file", "ftp"}
# Top-level imports


@ -11,8 +11,13 @@ from w3lib.html import strip_html5_whitespace
from w3lib.url import canonicalize_url, safe_url_string
from scrapy.link import Link
from scrapy.linkextractors import (IGNORED_EXTENSIONS, _is_valid_url, _matches,
_re_type, re)
from scrapy.linkextractors import (
IGNORED_EXTENSIONS,
_is_valid_url,
_matches,
_re_type,
re,
)
from scrapy.utils.misc import arg_to_iter, rel_has_nofollow
from scrapy.utils.python import unique as unique_list
from scrapy.utils.response import get_base_url
@ -26,8 +31,8 @@ _collect_string_content = etree.XPath("string()")
def _nons(tag):
if isinstance(tag, str):
if tag[0] == '{' and tag[1:len(XHTML_NAMESPACE) + 1] == XHTML_NAMESPACE:
return tag.split('}')[-1]
if tag[0] == "{" and tag[1 : len(XHTML_NAMESPACE) + 1] == XHTML_NAMESPACE:
return tag.split("}")[-1]
return tag
@ -41,14 +46,22 @@ def _canonicalize_link_url(link):
class LxmlParserLinkExtractor:
def __init__(
self, tag="a", attr="href", process=None, unique=False, strip=True, canonicalized=False
self,
tag="a",
attr="href",
process=None,
unique=False,
strip=True,
canonicalized=False,
):
self.scan_tag = tag if callable(tag) else partial(operator.eq, tag)
self.scan_attr = attr if callable(attr) else partial(operator.eq, attr)
self.process_attr = process if callable(process) else _identity
self.unique = unique
self.strip = strip
self.link_key = operator.attrgetter("url") if canonicalized else _canonicalize_link_url
self.link_key = (
operator.attrgetter("url") if canonicalized else _canonicalize_link_url
)
def _iter_links(self, document):
for el in document.iter(etree.Element):
@ -78,17 +91,22 @@ class LxmlParserLinkExtractor:
url = safe_url_string(url, encoding=response_encoding)
# to fix relative links after process_value
url = urljoin(response_url, url)
link = Link(url, _collect_string_content(el) or '',
nofollow=rel_has_nofollow(el.get('rel')))
link = Link(
url,
_collect_string_content(el) or "",
nofollow=rel_has_nofollow(el.get("rel")),
)
links.append(link)
return self._deduplicate_if_needed(links)
def extract_links(self, response):
base_url = get_base_url(response)
return self._extract_links(response.selector, response.url, response.encoding, base_url)
return self._extract_links(
response.selector, response.url, response.encoding, base_url
)
def _process_links(self, links):
""" Normalize and filter extracted links
"""Normalize and filter extracted links
The subclass should override it if necessary
"""
@ -110,8 +128,8 @@ class LxmlLinkExtractor:
allow_domains=(),
deny_domains=(),
restrict_xpaths=(),
tags=('a', 'area'),
attrs=('href',),
tags=("a", "area"),
attrs=("href",),
canonicalize=False,
unique=True,
process_value=None,
@ -127,26 +145,31 @@ class LxmlLinkExtractor:
unique=unique,
process=process_value,
strip=strip,
canonicalized=canonicalize
canonicalized=canonicalize,
)
self.allow_res = [x if isinstance(x, _re_type) else re.compile(x)
for x in arg_to_iter(allow)]
self.deny_res = [x if isinstance(x, _re_type) else re.compile(x)
for x in arg_to_iter(deny)]
self.allow_res = [
x if isinstance(x, _re_type) else re.compile(x) for x in arg_to_iter(allow)
]
self.deny_res = [
x if isinstance(x, _re_type) else re.compile(x) for x in arg_to_iter(deny)
]
self.allow_domains = set(arg_to_iter(allow_domains))
self.deny_domains = set(arg_to_iter(deny_domains))
self.restrict_xpaths = tuple(arg_to_iter(restrict_xpaths))
self.restrict_xpaths += tuple(map(self._csstranslator.css_to_xpath,
arg_to_iter(restrict_css)))
self.restrict_xpaths += tuple(
map(self._csstranslator.css_to_xpath, arg_to_iter(restrict_css))
)
if deny_extensions is None:
deny_extensions = IGNORED_EXTENSIONS
self.canonicalize = canonicalize
self.deny_extensions = {'.' + e for e in arg_to_iter(deny_extensions)}
self.restrict_text = [x if isinstance(x, _re_type) else re.compile(x)
for x in arg_to_iter(restrict_text)]
self.deny_extensions = {"." + e for e in arg_to_iter(deny_extensions)}
self.restrict_text = [
x if isinstance(x, _re_type) else re.compile(x)
for x in arg_to_iter(restrict_text)
]
def _link_allowed(self, link):
if not _is_valid_url(link.url):
@ -156,11 +179,15 @@ class LxmlLinkExtractor:
if self.deny_res and _matches(link.url, self.deny_res):
return False
parsed_url = urlparse(link.url)
if self.allow_domains and not url_is_from_any_domain(parsed_url, self.allow_domains):
if self.allow_domains and not url_is_from_any_domain(
parsed_url, self.allow_domains
):
return False
if self.deny_domains and url_is_from_any_domain(parsed_url, self.deny_domains):
return False
if self.deny_extensions and url_has_any_extension(parsed_url, self.deny_extensions):
if self.deny_extensions and url_has_any_extension(
parsed_url, self.deny_extensions
):
return False
if self.restrict_text and not _matches(link.text, self.restrict_text):
return False
@ -173,7 +200,11 @@ class LxmlLinkExtractor:
if self.deny_domains and url_is_from_any_domain(url, self.deny_domains):
return False
allowed = (regex.search(url) for regex in self.allow_res) if self.allow_res else [True]
allowed = (
(regex.search(url) for regex in self.allow_res)
if self.allow_res
else [True]
)
denied = (regex.search(url) for regex in self.deny_res) if self.deny_res else []
return any(allowed) and not any(denied)
@ -200,9 +231,7 @@ class LxmlLinkExtractor:
base_url = get_base_url(response)
if self.restrict_xpaths:
docs = [
subdoc
for x in self.restrict_xpaths
for subdoc in response.xpath(x)
subdoc for x in self.restrict_xpaths for subdoc in response.xpath(x)
]
else:
docs = [response.selector]
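For context, a minimal LinkExtractor configuration exercising the filtering options implemented above; the pattern, domain and selector are placeholders:

from scrapy.linkextractors import LinkExtractor

extractor = LinkExtractor(
    allow=r"/product/\d+",             # compiled into allow_res above
    deny_domains=["ads.example.com"],  # checked by _link_allowed above
    deny_extensions=["pdf"],           # stored as {".pdf"} above
    restrict_css=["div.listing"],      # translated to restrict_xpaths above
)
# links = extractor.extract_links(response)  # -> list of scrapy.link.Link objects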


@ -15,7 +15,7 @@ def wrap_loader_context(function, context):
"scrapy.loader.common.wrap_loader_context has moved to a new library."
"Please update your reference to itemloaders.common.wrap_loader_context",
ScrapyDeprecationWarning,
stacklevel=2
stacklevel=2,
)
return common.wrap_loader_context(function, context)


@ -8,14 +8,14 @@ from itemloaders import processors
from scrapy.utils.deprecate import create_deprecated_class
MapCompose = create_deprecated_class('MapCompose', processors.MapCompose)
MapCompose = create_deprecated_class("MapCompose", processors.MapCompose)
Compose = create_deprecated_class('Compose', processors.Compose)
Compose = create_deprecated_class("Compose", processors.Compose)
TakeFirst = create_deprecated_class('TakeFirst', processors.TakeFirst)
TakeFirst = create_deprecated_class("TakeFirst", processors.TakeFirst)
Identity = create_deprecated_class('Identity', processors.Identity)
Identity = create_deprecated_class("Identity", processors.Identity)
SelectJmes = create_deprecated_class('SelectJmes', processors.SelectJmes)
SelectJmes = create_deprecated_class("SelectJmes", processors.SelectJmes)
Join = create_deprecated_class('Join', processors.Join)
Join = create_deprecated_class("Join", processors.Join)


@ -54,20 +54,20 @@ class LogFormatter:
def crawled(self, request, response, spider):
"""Logs a message when the crawler finds a webpage."""
request_flags = f' {str(request.flags)}' if request.flags else ''
response_flags = f' {str(response.flags)}' if response.flags else ''
request_flags = f" {str(request.flags)}" if request.flags else ""
response_flags = f" {str(response.flags)}" if response.flags else ""
return {
'level': logging.DEBUG,
'msg': CRAWLEDMSG,
'args': {
'status': response.status,
'request': request,
'request_flags': request_flags,
'referer': referer_str(request),
'response_flags': response_flags,
"level": logging.DEBUG,
"msg": CRAWLEDMSG,
"args": {
"status": response.status,
"request": request,
"request_flags": request_flags,
"referer": referer_str(request),
"response_flags": response_flags,
# backward compatibility with Scrapy logformatter below 1.4 version
'flags': response_flags
}
"flags": response_flags,
},
}
def scraped(self, item, response, spider):
@ -77,23 +77,23 @@ class LogFormatter:
else:
src = response
return {
'level': logging.DEBUG,
'msg': SCRAPEDMSG,
'args': {
'src': src,
'item': item,
}
"level": logging.DEBUG,
"msg": SCRAPEDMSG,
"args": {
"src": src,
"item": item,
},
}
def dropped(self, item, exception, response, spider):
"""Logs a message when an item is dropped while it is passing through the item pipeline."""
return {
'level': logging.WARNING,
'msg': DROPPEDMSG,
'args': {
'exception': exception,
'item': item,
}
"level": logging.WARNING,
"msg": DROPPEDMSG,
"args": {
"exception": exception,
"item": item,
},
}
def item_error(self, item, exception, response, spider):
@ -103,11 +103,11 @@ class LogFormatter:
.. versionadded:: 2.0
"""
return {
'level': logging.ERROR,
'msg': ITEMERRORMSG,
'args': {
'item': item,
}
"level": logging.ERROR,
"msg": ITEMERRORMSG,
"args": {
"item": item,
},
}
def spider_error(self, failure, request, response, spider):
@ -116,12 +116,12 @@ class LogFormatter:
.. versionadded:: 2.0
"""
return {
'level': logging.ERROR,
'msg': SPIDERERRORMSG,
'args': {
'request': request,
'referer': referer_str(request),
}
"level": logging.ERROR,
"msg": SPIDERERRORMSG,
"args": {
"request": request,
"referer": referer_str(request),
},
}
def download_error(self, failure, request, spider, errmsg=None):
@ -130,16 +130,16 @@ class LogFormatter:
.. versionadded:: 2.0
"""
args = {'request': request}
args = {"request": request}
if errmsg:
msg = DOWNLOADERRORMSG_LONG
args['errmsg'] = errmsg
args["errmsg"] = errmsg
else:
msg = DOWNLOADERRORMSG_SHORT
return {
'level': logging.ERROR,
'msg': msg,
'args': args,
"level": logging.ERROR,
"msg": msg,
"args": args,
}
@classmethod
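A minimal sketch of customising the level/msg/args dicts returned by the LogFormatter methods above, for example to demote dropped-item messages to DEBUG:

import logging
from scrapy import logformatter

class QuietDropFormatter(logformatter.LogFormatter):
    def dropped(self, item, exception, response, spider):
        return {
            "level": logging.DEBUG,  # WARNING in the base class above
            "msg": logformatter.DROPPEDMSG,
            "args": {"exception": exception, "item": item},
        }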


@ -34,8 +34,15 @@ def _to_bytes_or_none(text):
class MailSender:
def __init__(
self, smtphost='localhost', mailfrom='scrapy@localhost', smtpuser=None,
smtppass=None, smtpport=25, smtptls=False, smtpssl=False, debug=False
self,
smtphost="localhost",
mailfrom="scrapy@localhost",
smtpuser=None,
smtppass=None,
smtpport=25,
smtptls=False,
smtpssl=False,
debug=False,
):
self.smtphost = smtphost
self.smtpport = smtpport
@ -49,44 +56,57 @@ class MailSender:
@classmethod
def from_settings(cls, settings):
return cls(
smtphost=settings['MAIL_HOST'],
mailfrom=settings['MAIL_FROM'],
smtpuser=settings['MAIL_USER'],
smtppass=settings['MAIL_PASS'],
smtpport=settings.getint('MAIL_PORT'),
smtptls=settings.getbool('MAIL_TLS'),
smtpssl=settings.getbool('MAIL_SSL'),
smtphost=settings["MAIL_HOST"],
mailfrom=settings["MAIL_FROM"],
smtpuser=settings["MAIL_USER"],
smtppass=settings["MAIL_PASS"],
smtpport=settings.getint("MAIL_PORT"),
smtptls=settings.getbool("MAIL_TLS"),
smtpssl=settings.getbool("MAIL_SSL"),
)
def send(self, to, subject, body, cc=None, attachs=(), mimetype='text/plain', charset=None, _callback=None):
def send(
self,
to,
subject,
body,
cc=None,
attachs=(),
mimetype="text/plain",
charset=None,
_callback=None,
):
from twisted.internet import reactor
if attachs:
msg = MIMEMultipart()
else:
msg = MIMENonMultipart(*mimetype.split('/', 1))
msg = MIMENonMultipart(*mimetype.split("/", 1))
to = list(arg_to_iter(to))
cc = list(arg_to_iter(cc))
msg['From'] = self.mailfrom
msg['To'] = COMMASPACE.join(to)
msg['Date'] = formatdate(localtime=True)
msg['Subject'] = subject
msg["From"] = self.mailfrom
msg["To"] = COMMASPACE.join(to)
msg["Date"] = formatdate(localtime=True)
msg["Subject"] = subject
rcpts = to[:]
if cc:
rcpts.extend(cc)
msg['Cc'] = COMMASPACE.join(cc)
msg["Cc"] = COMMASPACE.join(cc)
if charset:
msg.set_charset(charset)
if attachs:
msg.attach(MIMEText(body, 'plain', charset or 'us-ascii'))
msg.attach(MIMEText(body, "plain", charset or "us-ascii"))
for attach_name, mimetype, f in attachs:
part = MIMEBase(*mimetype.split('/'))
part = MIMEBase(*mimetype.split("/"))
part.set_payload(f.read())
Encoders.encode_base64(part)
part.add_header('Content-Disposition', 'attachment', filename=attach_name)
part.add_header(
"Content-Disposition", "attachment", filename=attach_name
)
msg.attach(part)
else:
msg.set_payload(body)
@ -95,50 +115,79 @@ class MailSender:
_callback(to=to, subject=subject, body=body, cc=cc, attach=attachs, msg=msg)
if self.debug:
logger.debug('Debug mail sent OK: To=%(mailto)s Cc=%(mailcc)s '
'Subject="%(mailsubject)s" Attachs=%(mailattachs)d',
{'mailto': to, 'mailcc': cc, 'mailsubject': subject,
'mailattachs': len(attachs)})
logger.debug(
"Debug mail sent OK: To=%(mailto)s Cc=%(mailcc)s "
'Subject="%(mailsubject)s" Attachs=%(mailattachs)d',
{
"mailto": to,
"mailcc": cc,
"mailsubject": subject,
"mailattachs": len(attachs),
},
)
return
dfd = self._sendmail(rcpts, msg.as_string().encode(charset or 'utf-8'))
dfd = self._sendmail(rcpts, msg.as_string().encode(charset or "utf-8"))
dfd.addCallbacks(
callback=self._sent_ok,
errback=self._sent_failed,
callbackArgs=[to, cc, subject, len(attachs)],
errbackArgs=[to, cc, subject, len(attachs)],
)
reactor.addSystemEventTrigger('before', 'shutdown', lambda: dfd)
reactor.addSystemEventTrigger("before", "shutdown", lambda: dfd)
return dfd
def _sent_ok(self, result, to, cc, subject, nattachs):
logger.info('Mail sent OK: To=%(mailto)s Cc=%(mailcc)s '
'Subject="%(mailsubject)s" Attachs=%(mailattachs)d',
{'mailto': to, 'mailcc': cc, 'mailsubject': subject,
'mailattachs': nattachs})
logger.info(
"Mail sent OK: To=%(mailto)s Cc=%(mailcc)s "
'Subject="%(mailsubject)s" Attachs=%(mailattachs)d',
{
"mailto": to,
"mailcc": cc,
"mailsubject": subject,
"mailattachs": nattachs,
},
)
def _sent_failed(self, failure, to, cc, subject, nattachs):
errstr = str(failure.value)
logger.error('Unable to send mail: To=%(mailto)s Cc=%(mailcc)s '
'Subject="%(mailsubject)s" Attachs=%(mailattachs)d'
'- %(mailerr)s',
{'mailto': to, 'mailcc': cc, 'mailsubject': subject,
'mailattachs': nattachs, 'mailerr': errstr})
logger.error(
"Unable to send mail: To=%(mailto)s Cc=%(mailcc)s "
'Subject="%(mailsubject)s" Attachs=%(mailattachs)d'
"- %(mailerr)s",
{
"mailto": to,
"mailcc": cc,
"mailsubject": subject,
"mailattachs": nattachs,
"mailerr": errstr,
},
)
def _sendmail(self, to_addrs, msg):
# Import twisted.mail here because it is not available in python3
from twisted.internet import reactor
from twisted.mail.smtp import ESMTPSenderFactory
msg = BytesIO(msg)
d = defer.Deferred()
factory = ESMTPSenderFactory(
self.smtpuser, self.smtppass, self.mailfrom, to_addrs, msg, d,
heloFallback=True, requireAuthentication=False, requireTransportSecurity=self.smtptls,
self.smtpuser,
self.smtppass,
self.mailfrom,
to_addrs,
msg,
d,
heloFallback=True,
requireAuthentication=False,
requireTransportSecurity=self.smtptls,
)
factory.noisy = False
if self.smtpssl:
reactor.connectSSL(self.smtphost, self.smtpport, factory, ssl.ClientContextFactory())
reactor.connectSSL(
self.smtphost, self.smtpport, factory, ssl.ClientContextFactory()
)
else:
reactor.connectTCP(self.smtphost, self.smtpport, factory)
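A minimal usage sketch matching the MailSender.send() signature above; it assumes a running Twisted reactor (e.g. inside an extension) and a `settings` object, and the addresses are placeholders:

from scrapy.mail import MailSender

mailer = MailSender.from_settings(settings)  # `settings` assumed to be a scrapy Settings instance
mailer.send(
    to=["someone@example.com"],
    subject="Crawl finished",
    body="See attached stats",
    cc=["team@example.com"],
)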


@ -17,13 +17,15 @@ logger = logging.getLogger(__name__)
class MiddlewareManager:
"""Base class for implementing middleware managers"""
component_name = 'foo middleware'
component_name = "foo middleware"
def __init__(self, *middlewares: Any) -> None:
self.middlewares = middlewares
# Only process_spider_output and process_spider_exception can be None.
# Only process_spider_output can be a tuple, and only until _async compatibility methods are removed.
self.methods: Dict[str, Deque[Union[None, Callable, Tuple[Callable, Callable]]]] = defaultdict(deque)
self.methods: Dict[
str, Deque[Union[None, Callable, Tuple[Callable, Callable]]]
] = defaultdict(deque)
for mw in middlewares:
self._add_middleware(mw)
@ -44,15 +46,21 @@ class MiddlewareManager:
enabled.append(clspath)
except NotConfigured as e:
if e.args:
clsname = clspath.split('.')[-1]
logger.warning("Disabled %(clsname)s: %(eargs)s",
{'clsname': clsname, 'eargs': e.args[0]},
extra={'crawler': crawler})
clsname = clspath.split(".")[-1]
logger.warning(
"Disabled %(clsname)s: %(eargs)s",
{"clsname": clsname, "eargs": e.args[0]},
extra={"crawler": crawler},
)
logger.info("Enabled %(componentname)ss:\n%(enabledlist)s",
{'componentname': cls.component_name,
'enabledlist': pprint.pformat(enabled)},
extra={'crawler': crawler})
logger.info(
"Enabled %(componentname)ss:\n%(enabledlist)s",
{
"componentname": cls.component_name,
"enabledlist": pprint.pformat(enabled),
},
extra={"crawler": crawler},
)
return cls(*middlewares)
@classmethod
@ -60,10 +68,10 @@ class MiddlewareManager:
return cls.from_settings(crawler.settings, crawler)
def _add_middleware(self, mw) -> None:
if hasattr(mw, 'open_spider'):
self.methods['open_spider'].append(mw.open_spider)
if hasattr(mw, 'close_spider'):
self.methods['close_spider'].appendleft(mw.close_spider)
if hasattr(mw, "open_spider"):
self.methods["open_spider"].append(mw.open_spider)
if hasattr(mw, "close_spider"):
self.methods["close_spider"].appendleft(mw.close_spider)
def _process_parallel(self, methodname: str, obj, *args) -> Deferred:
methods = cast(Iterable[Callable], self.methods[methodname])
@ -74,7 +82,7 @@ class MiddlewareManager:
return process_chain(methods, obj, *args)
def open_spider(self, spider: Spider) -> Deferred:
return self._process_parallel('open_spider', spider)
return self._process_parallel("open_spider", spider)
def close_spider(self, spider: Spider) -> Deferred:
return self._process_parallel('close_spider', spider)
return self._process_parallel("close_spider", spider)
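As a hypothetical illustration of what _add_middleware above collects: any object exposing the hook names is enough, and because close_spider is registered with appendleft, close hooks run in reverse registration order.

class LoggingComponent:
    # Hypothetical component: MiddlewareManager only cares about the hook names.
    def open_spider(self, spider):
        spider.logger.info("opened %s", spider.name)

    def close_spider(self, spider):
        spider.logger.info("closed %s", spider.name)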


@ -11,16 +11,18 @@ from scrapy.utils.defer import deferred_f_from_coro_f
class ItemPipelineManager(MiddlewareManager):
component_name = 'item pipeline'
component_name = "item pipeline"
@classmethod
def _get_mwlist_from_settings(cls, settings):
return build_component_list(settings.getwithbase('ITEM_PIPELINES'))
return build_component_list(settings.getwithbase("ITEM_PIPELINES"))
def _add_middleware(self, pipe):
super()._add_middleware(pipe)
if hasattr(pipe, 'process_item'):
self.methods['process_item'].append(deferred_f_from_coro_f(pipe.process_item))
if hasattr(pipe, "process_item"):
self.methods["process_item"].append(
deferred_f_from_coro_f(pipe.process_item)
)
def process_item(self, item, spider):
return self._process_chain('process_item', item, spider)
return self._process_chain("process_item", item, spider)
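A short sketch of the project-side counterpart of ItemPipelineManager above; the module path, class name and priority are invented for illustration, and process_item may equally be a coroutine since it is wrapped with deferred_f_from_coro_f.

# settings.py (hypothetical project)
ITEM_PIPELINES = {
    "myproject.pipelines.PriceToFloatPipeline": 300,
}

# myproject/pipelines.py (hypothetical)
class PriceToFloatPipeline:
    def process_item(self, item, spider):
        item["price"] = float(item["price"])
        return item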


@ -42,8 +42,8 @@ class FileException(Exception):
class FSFilesStore:
def __init__(self, basedir: str):
if '://' in basedir:
basedir = basedir.split('://', 1)[1]
if "://" in basedir:
basedir = basedir.split("://", 1)[1]
self.basedir = basedir
self._mkdir(Path(self.basedir))
self.created_directories: DefaultDict[str, Set[str]] = defaultdict(set)
@ -60,13 +60,13 @@ class FSFilesStore:
except os.error:
return {}
with absolute_path.open('rb') as f:
with absolute_path.open("rb") as f:
checksum = md5sum(f)
return {'last_modified': last_modified, 'checksum': checksum}
return {"last_modified": last_modified, "checksum": checksum}
def _get_filesystem_path(self, path: str) -> Path:
path_comps = path.split('/')
path_comps = path.split("/")
return Path(self.basedir, *path_comps)
def _mkdir(self, dirname: Path, domain: Optional[str] = None):
@ -86,49 +86,49 @@ class S3FilesStore:
AWS_USE_SSL = None
AWS_VERIFY = None
POLICY = 'private' # Overridden from settings.FILES_STORE_S3_ACL in FilesPipeline.from_settings
POLICY = "private" # Overridden from settings.FILES_STORE_S3_ACL in FilesPipeline.from_settings
HEADERS = {
'Cache-Control': 'max-age=172800',
"Cache-Control": "max-age=172800",
}
def __init__(self, uri):
if not is_botocore_available():
raise NotConfigured('missing botocore library')
raise NotConfigured("missing botocore library")
import botocore.session
session = botocore.session.get_session()
self.s3_client = session.create_client(
's3',
"s3",
aws_access_key_id=self.AWS_ACCESS_KEY_ID,
aws_secret_access_key=self.AWS_SECRET_ACCESS_KEY,
aws_session_token=self.AWS_SESSION_TOKEN,
endpoint_url=self.AWS_ENDPOINT_URL,
region_name=self.AWS_REGION_NAME,
use_ssl=self.AWS_USE_SSL,
verify=self.AWS_VERIFY
verify=self.AWS_VERIFY,
)
if not uri.startswith("s3://"):
raise ValueError(f"Incorrect URI scheme in {uri}, expected 's3'")
self.bucket, self.prefix = uri[5:].split('/', 1)
self.bucket, self.prefix = uri[5:].split("/", 1)
def stat_file(self, path, info):
def _onsuccess(boto_key):
checksum = boto_key['ETag'].strip('"')
last_modified = boto_key['LastModified']
checksum = boto_key["ETag"].strip('"')
last_modified = boto_key["LastModified"]
modified_stamp = time.mktime(last_modified.timetuple())
return {'checksum': checksum, 'last_modified': modified_stamp}
return {"checksum": checksum, "last_modified": modified_stamp}
return self._get_boto_key(path).addCallback(_onsuccess)
def _get_boto_key(self, path):
key_name = f'{self.prefix}{path}'
key_name = f"{self.prefix}{path}"
return threads.deferToThread(
self.s3_client.head_object,
Bucket=self.bucket,
Key=key_name)
self.s3_client.head_object, Bucket=self.bucket, Key=key_name
)
def persist_file(self, path, buf, info, meta=None, headers=None):
"""Upload file to S3 storage"""
key_name = f'{self.prefix}{path}'
key_name = f"{self.prefix}{path}"
buf.seek(0)
extra = self._headers_to_botocore_kwargs(self.HEADERS)
if headers:
@ -140,39 +140,41 @@ class S3FilesStore:
Body=buf,
Metadata={k: str(v) for k, v in (meta or {}).items()},
ACL=self.POLICY,
**extra)
**extra,
)
def _headers_to_botocore_kwargs(self, headers):
""" Convert headers to botocore keyword arguments.
"""
"""Convert headers to botocore keyword arguments."""
# This is required while we need to support both boto and botocore.
mapping = CaselessDict({
'Content-Type': 'ContentType',
'Cache-Control': 'CacheControl',
'Content-Disposition': 'ContentDisposition',
'Content-Encoding': 'ContentEncoding',
'Content-Language': 'ContentLanguage',
'Content-Length': 'ContentLength',
'Content-MD5': 'ContentMD5',
'Expires': 'Expires',
'X-Amz-Grant-Full-Control': 'GrantFullControl',
'X-Amz-Grant-Read': 'GrantRead',
'X-Amz-Grant-Read-ACP': 'GrantReadACP',
'X-Amz-Grant-Write-ACP': 'GrantWriteACP',
'X-Amz-Object-Lock-Legal-Hold': 'ObjectLockLegalHoldStatus',
'X-Amz-Object-Lock-Mode': 'ObjectLockMode',
'X-Amz-Object-Lock-Retain-Until-Date': 'ObjectLockRetainUntilDate',
'X-Amz-Request-Payer': 'RequestPayer',
'X-Amz-Server-Side-Encryption': 'ServerSideEncryption',
'X-Amz-Server-Side-Encryption-Aws-Kms-Key-Id': 'SSEKMSKeyId',
'X-Amz-Server-Side-Encryption-Context': 'SSEKMSEncryptionContext',
'X-Amz-Server-Side-Encryption-Customer-Algorithm': 'SSECustomerAlgorithm',
'X-Amz-Server-Side-Encryption-Customer-Key': 'SSECustomerKey',
'X-Amz-Server-Side-Encryption-Customer-Key-Md5': 'SSECustomerKeyMD5',
'X-Amz-Storage-Class': 'StorageClass',
'X-Amz-Tagging': 'Tagging',
'X-Amz-Website-Redirect-Location': 'WebsiteRedirectLocation',
})
mapping = CaselessDict(
{
"Content-Type": "ContentType",
"Cache-Control": "CacheControl",
"Content-Disposition": "ContentDisposition",
"Content-Encoding": "ContentEncoding",
"Content-Language": "ContentLanguage",
"Content-Length": "ContentLength",
"Content-MD5": "ContentMD5",
"Expires": "Expires",
"X-Amz-Grant-Full-Control": "GrantFullControl",
"X-Amz-Grant-Read": "GrantRead",
"X-Amz-Grant-Read-ACP": "GrantReadACP",
"X-Amz-Grant-Write-ACP": "GrantWriteACP",
"X-Amz-Object-Lock-Legal-Hold": "ObjectLockLegalHoldStatus",
"X-Amz-Object-Lock-Mode": "ObjectLockMode",
"X-Amz-Object-Lock-Retain-Until-Date": "ObjectLockRetainUntilDate",
"X-Amz-Request-Payer": "RequestPayer",
"X-Amz-Server-Side-Encryption": "ServerSideEncryption",
"X-Amz-Server-Side-Encryption-Aws-Kms-Key-Id": "SSEKMSKeyId",
"X-Amz-Server-Side-Encryption-Context": "SSEKMSEncryptionContext",
"X-Amz-Server-Side-Encryption-Customer-Algorithm": "SSECustomerAlgorithm",
"X-Amz-Server-Side-Encryption-Customer-Key": "SSECustomerKey",
"X-Amz-Server-Side-Encryption-Customer-Key-Md5": "SSECustomerKeyMD5",
"X-Amz-Storage-Class": "StorageClass",
"X-Amz-Tagging": "Tagging",
"X-Amz-Website-Redirect-Location": "WebsiteRedirectLocation",
}
)
extra = {}
for key, value in headers.items():
try:
@ -188,7 +190,7 @@ class GCSFilesStore:
GCS_PROJECT_ID = None
CACHE_CONTROL = 'max-age=172800'
CACHE_CONTROL = "max-age=172800"
# The bucket's default object ACL will be applied to the object.
# Overridden from settings.FILES_STORE_GCS_ACL in FilesPipeline.from_settings.
@ -196,23 +198,24 @@ class GCSFilesStore:
def __init__(self, uri):
from google.cloud import storage
client = storage.Client(project=self.GCS_PROJECT_ID)
bucket, prefix = uri[5:].split('/', 1)
bucket, prefix = uri[5:].split("/", 1)
self.bucket = client.bucket(bucket)
self.prefix = prefix
permissions = self.bucket.test_iam_permissions(
['storage.objects.get', 'storage.objects.create']
["storage.objects.get", "storage.objects.create"]
)
if 'storage.objects.get' not in permissions:
if "storage.objects.get" not in permissions:
logger.warning(
"No 'storage.objects.get' permission for GSC bucket %(bucket)s. "
"Checking if files are up to date will be impossible. Files will be downloaded every time.",
{'bucket': bucket}
{"bucket": bucket},
)
if 'storage.objects.create' not in permissions:
if "storage.objects.create" not in permissions:
logger.error(
"No 'storage.objects.create' permission for GSC bucket %(bucket)s. Saving files will be impossible!",
{'bucket': bucket}
{"bucket": bucket},
)
def stat_file(self, path, info):
@ -220,15 +223,18 @@ class GCSFilesStore:
if blob:
checksum = blob.md5_hash
last_modified = time.mktime(blob.updated.timetuple())
return {'checksum': checksum, 'last_modified': last_modified}
return {"checksum": checksum, "last_modified": last_modified}
return {}
blob_path = self._get_blob_path(path)
return threads.deferToThread(self.bucket.get_blob, blob_path).addCallback(_onsuccess)
return threads.deferToThread(self.bucket.get_blob, blob_path).addCallback(
_onsuccess
)
def _get_content_type(self, headers):
if headers and 'Content-Type' in headers:
return headers['Content-Type']
return 'application/octet-stream'
if headers and "Content-Type" in headers:
return headers["Content-Type"]
return "application/octet-stream"
def _get_blob_path(self, path):
return self.prefix + path
@ -242,7 +248,7 @@ class GCSFilesStore:
blob.upload_from_string,
data=buf.getvalue(),
content_type=self._get_content_type(headers),
predefined_acl=self.POLICY
predefined_acl=self.POLICY,
)
@ -261,14 +267,19 @@ class FTPFilesStore:
self.port = int(u.port or 21)
self.username = u.username or self.FTP_USERNAME
self.password = u.password or self.FTP_PASSWORD
self.basedir = u.path.rstrip('/')
self.basedir = u.path.rstrip("/")
def persist_file(self, path, buf, info, meta=None, headers=None):
path = f'{self.basedir}/{path}'
path = f"{self.basedir}/{path}"
return threads.deferToThread(
ftp_store_file, path=path, file=buf,
host=self.host, port=self.port, username=self.username,
password=self.password, use_active_mode=self.USE_ACTIVE_MODE
ftp_store_file,
path=path,
file=buf,
host=self.host,
port=self.port,
username=self.username,
password=self.password,
use_active_mode=self.USE_ACTIVE_MODE,
)
def stat_file(self, path, info):
@ -282,11 +293,12 @@ class FTPFilesStore:
file_path = f"{self.basedir}/{path}"
last_modified = float(ftp.voidcmd(f"MDTM {file_path}")[4:].strip())
m = hashlib.md5()
ftp.retrbinary(f'RETR {file_path}', m.update)
return {'last_modified': last_modified, 'checksum': m.hexdigest()}
ftp.retrbinary(f"RETR {file_path}", m.update)
return {"last_modified": last_modified, "checksum": m.hexdigest()}
# The file doesn't exist
except Exception:
return {}
return threads.deferToThread(_stat_file, path)
@ -312,14 +324,14 @@ class FilesPipeline(MediaPipeline):
MEDIA_NAME = "file"
EXPIRES = 90
STORE_SCHEMES = {
'': FSFilesStore,
'file': FSFilesStore,
's3': S3FilesStore,
'gs': GCSFilesStore,
'ftp': FTPFilesStore
"": FSFilesStore,
"file": FSFilesStore,
"s3": S3FilesStore,
"gs": GCSFilesStore,
"ftp": FTPFilesStore,
}
DEFAULT_FILES_URLS_FIELD = 'file_urls'
DEFAULT_FILES_RESULT_FIELD = 'files'
DEFAULT_FILES_URLS_FIELD = "file_urls"
DEFAULT_FILES_RESULT_FIELD = "files"
def __init__(self, store_uri, download_func=None, settings=None):
if not store_uri:
@ -330,52 +342,50 @@ class FilesPipeline(MediaPipeline):
cls_name = "FilesPipeline"
self.store = self._get_store(store_uri)
resolve = functools.partial(self._key_for_pipe,
base_class_name=cls_name,
settings=settings)
self.expires = settings.getint(
resolve('FILES_EXPIRES'), self.EXPIRES
resolve = functools.partial(
self._key_for_pipe, base_class_name=cls_name, settings=settings
)
self.expires = settings.getint(resolve("FILES_EXPIRES"), self.EXPIRES)
if not hasattr(self, "FILES_URLS_FIELD"):
self.FILES_URLS_FIELD = self.DEFAULT_FILES_URLS_FIELD
if not hasattr(self, "FILES_RESULT_FIELD"):
self.FILES_RESULT_FIELD = self.DEFAULT_FILES_RESULT_FIELD
self.files_urls_field = settings.get(
resolve('FILES_URLS_FIELD'), self.FILES_URLS_FIELD
resolve("FILES_URLS_FIELD"), self.FILES_URLS_FIELD
)
self.files_result_field = settings.get(
resolve('FILES_RESULT_FIELD'), self.FILES_RESULT_FIELD
resolve("FILES_RESULT_FIELD"), self.FILES_RESULT_FIELD
)
super().__init__(download_func=download_func, settings=settings)
@classmethod
def from_settings(cls, settings):
s3store = cls.STORE_SCHEMES['s3']
s3store.AWS_ACCESS_KEY_ID = settings['AWS_ACCESS_KEY_ID']
s3store.AWS_SECRET_ACCESS_KEY = settings['AWS_SECRET_ACCESS_KEY']
s3store.AWS_SESSION_TOKEN = settings['AWS_SESSION_TOKEN']
s3store.AWS_ENDPOINT_URL = settings['AWS_ENDPOINT_URL']
s3store.AWS_REGION_NAME = settings['AWS_REGION_NAME']
s3store.AWS_USE_SSL = settings['AWS_USE_SSL']
s3store.AWS_VERIFY = settings['AWS_VERIFY']
s3store.POLICY = settings['FILES_STORE_S3_ACL']
s3store = cls.STORE_SCHEMES["s3"]
s3store.AWS_ACCESS_KEY_ID = settings["AWS_ACCESS_KEY_ID"]
s3store.AWS_SECRET_ACCESS_KEY = settings["AWS_SECRET_ACCESS_KEY"]
s3store.AWS_SESSION_TOKEN = settings["AWS_SESSION_TOKEN"]
s3store.AWS_ENDPOINT_URL = settings["AWS_ENDPOINT_URL"]
s3store.AWS_REGION_NAME = settings["AWS_REGION_NAME"]
s3store.AWS_USE_SSL = settings["AWS_USE_SSL"]
s3store.AWS_VERIFY = settings["AWS_VERIFY"]
s3store.POLICY = settings["FILES_STORE_S3_ACL"]
gcs_store = cls.STORE_SCHEMES['gs']
gcs_store.GCS_PROJECT_ID = settings['GCS_PROJECT_ID']
gcs_store.POLICY = settings['FILES_STORE_GCS_ACL'] or None
gcs_store = cls.STORE_SCHEMES["gs"]
gcs_store.GCS_PROJECT_ID = settings["GCS_PROJECT_ID"]
gcs_store.POLICY = settings["FILES_STORE_GCS_ACL"] or None
ftp_store = cls.STORE_SCHEMES['ftp']
ftp_store.FTP_USERNAME = settings['FTP_USER']
ftp_store.FTP_PASSWORD = settings['FTP_PASSWORD']
ftp_store.USE_ACTIVE_MODE = settings.getbool('FEED_STORAGE_FTP_ACTIVE')
ftp_store = cls.STORE_SCHEMES["ftp"]
ftp_store.FTP_USERNAME = settings["FTP_USER"]
ftp_store.FTP_PASSWORD = settings["FTP_PASSWORD"]
ftp_store.USE_ACTIVE_MODE = settings.getbool("FEED_STORAGE_FTP_ACTIVE")
store_uri = settings['FILES_STORE']
store_uri = settings["FILES_STORE"]
return cls(store_uri, settings=settings)
def _get_store(self, uri: str):
if Path(uri).is_absolute(): # to support win32 paths like: C:\\some\dir
scheme = 'file'
scheme = "file"
else:
scheme = urlparse(uri).scheme
store_cls = self.STORE_SCHEMES[scheme]
@ -386,7 +396,7 @@ class FilesPipeline(MediaPipeline):
if not result:
return # returning None force download
last_modified = result.get('last_modified', None)
last_modified = result.get("last_modified", None)
if not last_modified:
return # returning None force download
@ -397,25 +407,30 @@ class FilesPipeline(MediaPipeline):
referer = referer_str(request)
logger.debug(
'File (uptodate): Downloaded %(medianame)s from %(request)s '
'referred in <%(referer)s>',
{'medianame': self.MEDIA_NAME, 'request': request,
'referer': referer},
extra={'spider': info.spider}
"File (uptodate): Downloaded %(medianame)s from %(request)s "
"referred in <%(referer)s>",
{"medianame": self.MEDIA_NAME, "request": request, "referer": referer},
extra={"spider": info.spider},
)
self.inc_stats(info.spider, 'uptodate')
self.inc_stats(info.spider, "uptodate")
checksum = result.get('checksum', None)
return {'url': request.url, 'path': path, 'checksum': checksum, 'status': 'uptodate'}
checksum = result.get("checksum", None)
return {
"url": request.url,
"path": path,
"checksum": checksum,
"status": "uptodate",
}
path = self.file_path(request, info=info, item=item)
dfd = defer.maybeDeferred(self.store.stat_file, path, info)
dfd.addCallbacks(_onsuccess, lambda _: None)
dfd.addErrback(
lambda f:
logger.error(self.__class__.__name__ + '.store.stat_file',
exc_info=failure_to_exc_info(f),
extra={'spider': info.spider})
lambda f: logger.error(
self.__class__.__name__ + ".store.stat_file",
exc_info=failure_to_exc_info(f),
extra={"spider": info.spider},
)
)
return dfd
@ -423,11 +438,15 @@ class FilesPipeline(MediaPipeline):
if not isinstance(failure.value, IgnoreRequest):
referer = referer_str(request)
logger.warning(
'File (unknown-error): Error downloading %(medianame)s from '
'%(request)s referred in <%(referer)s>: %(exception)s',
{'medianame': self.MEDIA_NAME, 'request': request,
'referer': referer, 'exception': failure.value},
extra={'spider': info.spider}
"File (unknown-error): Error downloading %(medianame)s from "
"%(request)s referred in <%(referer)s>: %(exception)s",
{
"medianame": self.MEDIA_NAME,
"request": request,
"referer": referer,
"exception": failure.value,
},
extra={"spider": info.spider},
)
raise FileException
@ -437,29 +456,28 @@ class FilesPipeline(MediaPipeline):
if response.status != 200:
logger.warning(
'File (code: %(status)s): Error downloading file from '
'%(request)s referred in <%(referer)s>',
{'status': response.status,
'request': request, 'referer': referer},
extra={'spider': info.spider}
"File (code: %(status)s): Error downloading file from "
"%(request)s referred in <%(referer)s>",
{"status": response.status, "request": request, "referer": referer},
extra={"spider": info.spider},
)
raise FileException('download-error')
raise FileException("download-error")
if not response.body:
logger.warning(
'File (empty-content): Empty file from %(request)s referred '
'in <%(referer)s>: no-content',
{'request': request, 'referer': referer},
extra={'spider': info.spider}
"File (empty-content): Empty file from %(request)s referred "
"in <%(referer)s>: no-content",
{"request": request, "referer": referer},
extra={"spider": info.spider},
)
raise FileException('empty-content')
raise FileException("empty-content")
status = 'cached' if 'cached' in response.flags else 'downloaded'
status = "cached" if "cached" in response.flags else "downloaded"
logger.debug(
'File (%(status)s): Downloaded file from %(request)s referred in '
'<%(referer)s>',
{'status': status, 'request': request, 'referer': referer},
extra={'spider': info.spider}
"File (%(status)s): Downloaded file from %(request)s referred in "
"<%(referer)s>",
{"status": status, "request": request, "referer": referer},
extra={"spider": info.spider},
)
self.inc_stats(info.spider, status)
@ -468,26 +486,33 @@ class FilesPipeline(MediaPipeline):
checksum = self.file_downloaded(response, request, info, item=item)
except FileException as exc:
logger.warning(
'File (error): Error processing file from %(request)s '
'referred in <%(referer)s>: %(errormsg)s',
{'request': request, 'referer': referer, 'errormsg': str(exc)},
extra={'spider': info.spider}, exc_info=True
"File (error): Error processing file from %(request)s "
"referred in <%(referer)s>: %(errormsg)s",
{"request": request, "referer": referer, "errormsg": str(exc)},
extra={"spider": info.spider},
exc_info=True,
)
raise
except Exception as exc:
logger.error(
'File (unknown-error): Error processing file from %(request)s '
'referred in <%(referer)s>',
{'request': request, 'referer': referer},
exc_info=True, extra={'spider': info.spider}
"File (unknown-error): Error processing file from %(request)s "
"referred in <%(referer)s>",
{"request": request, "referer": referer},
exc_info=True,
extra={"spider": info.spider},
)
raise FileException(str(exc))
return {'url': request.url, 'path': path, 'checksum': checksum, 'status': status}
return {
"url": request.url,
"path": path,
"checksum": checksum,
"status": status,
}
def inc_stats(self, spider, status):
spider.crawler.stats.inc_value('file_count', spider=spider)
spider.crawler.stats.inc_value(f'file_status_count/{status}', spider=spider)
spider.crawler.stats.inc_value("file_count", spider=spider)
spider.crawler.stats.inc_value(f"file_status_count/{status}", spider=spider)
# Overridable Interface
def get_media_requests(self, item, info):
@ -513,8 +538,8 @@ class FilesPipeline(MediaPipeline):
# Handles empty and wild extensions by trying to guess the
# mime type then extension or default to empty string otherwise
if media_ext not in mimetypes.types_map:
media_ext = ''
media_ext = ""
media_type = mimetypes.guess_type(request.url)[0]
if media_type:
media_ext = mimetypes.guess_extension(media_type)
return f'full/{media_guid}{media_ext}'
return f"full/{media_guid}{media_ext}"


@ -14,6 +14,7 @@ from itemadapter import ItemAdapter
from scrapy.exceptions import DropItem, NotConfigured, ScrapyDeprecationWarning
from scrapy.http import Request
from scrapy.pipelines.files import FileException, FilesPipeline
# TODO: from scrapy.pipelines.media import MediaPipeline
from scrapy.settings import Settings
from scrapy.utils.misc import md5sum
@ -24,7 +25,11 @@ class NoimagesDrop(DropItem):
"""Product with no images exception"""
def __init__(self, *args, **kwargs):
warnings.warn("The NoimagesDrop class is deprecated", category=ScrapyDeprecationWarning, stacklevel=2)
warnings.warn(
"The NoimagesDrop class is deprecated",
category=ScrapyDeprecationWarning,
stacklevel=2,
)
super().__init__(*args, **kwargs)
@ -33,11 +38,9 @@ class ImageException(FileException):
class ImagesPipeline(FilesPipeline):
"""Abstract pipeline that implement the image thumbnail generation logic
"""Abstract pipeline that implement the image thumbnail generation logic"""
"""
MEDIA_NAME = 'image'
MEDIA_NAME = "image"
# Uppercase attributes kept for backward compatibility with code that subclasses
# ImagesPipeline. They may be overridden by settings.
@ -45,16 +48,17 @@ class ImagesPipeline(FilesPipeline):
MIN_HEIGHT = 0
EXPIRES = 90
THUMBS = {}
DEFAULT_IMAGES_URLS_FIELD = 'image_urls'
DEFAULT_IMAGES_RESULT_FIELD = 'images'
DEFAULT_IMAGES_URLS_FIELD = "image_urls"
DEFAULT_IMAGES_RESULT_FIELD = "images"
def __init__(self, store_uri, download_func=None, settings=None):
try:
from PIL import Image
self._Image = Image
except ImportError:
raise NotConfigured(
'ImagesPipeline requires installing Pillow 4.0.0 or later'
"ImagesPipeline requires installing Pillow 4.0.0 or later"
)
super().__init__(store_uri, settings=settings, download_func=download_func)
@ -62,12 +66,10 @@ class ImagesPipeline(FilesPipeline):
if isinstance(settings, dict) or settings is None:
settings = Settings(settings)
resolve = functools.partial(self._key_for_pipe,
base_class_name="ImagesPipeline",
settings=settings)
self.expires = settings.getint(
resolve("IMAGES_EXPIRES"), self.EXPIRES
resolve = functools.partial(
self._key_for_pipe, base_class_name="ImagesPipeline", settings=settings
)
self.expires = settings.getint(resolve("IMAGES_EXPIRES"), self.EXPIRES)
if not hasattr(self, "IMAGES_RESULT_FIELD"):
self.IMAGES_RESULT_FIELD = self.DEFAULT_IMAGES_RESULT_FIELD
@ -75,47 +77,39 @@ class ImagesPipeline(FilesPipeline):
self.IMAGES_URLS_FIELD = self.DEFAULT_IMAGES_URLS_FIELD
self.images_urls_field = settings.get(
resolve('IMAGES_URLS_FIELD'),
self.IMAGES_URLS_FIELD
resolve("IMAGES_URLS_FIELD"), self.IMAGES_URLS_FIELD
)
self.images_result_field = settings.get(
resolve('IMAGES_RESULT_FIELD'),
self.IMAGES_RESULT_FIELD
)
self.min_width = settings.getint(
resolve('IMAGES_MIN_WIDTH'), self.MIN_WIDTH
)
self.min_height = settings.getint(
resolve('IMAGES_MIN_HEIGHT'), self.MIN_HEIGHT
)
self.thumbs = settings.get(
resolve('IMAGES_THUMBS'), self.THUMBS
resolve("IMAGES_RESULT_FIELD"), self.IMAGES_RESULT_FIELD
)
self.min_width = settings.getint(resolve("IMAGES_MIN_WIDTH"), self.MIN_WIDTH)
self.min_height = settings.getint(resolve("IMAGES_MIN_HEIGHT"), self.MIN_HEIGHT)
self.thumbs = settings.get(resolve("IMAGES_THUMBS"), self.THUMBS)
self._deprecated_convert_image = None
@classmethod
def from_settings(cls, settings):
s3store = cls.STORE_SCHEMES['s3']
s3store.AWS_ACCESS_KEY_ID = settings['AWS_ACCESS_KEY_ID']
s3store.AWS_SECRET_ACCESS_KEY = settings['AWS_SECRET_ACCESS_KEY']
s3store.AWS_SESSION_TOKEN = settings['AWS_SESSION_TOKEN']
s3store.AWS_ENDPOINT_URL = settings['AWS_ENDPOINT_URL']
s3store.AWS_REGION_NAME = settings['AWS_REGION_NAME']
s3store.AWS_USE_SSL = settings['AWS_USE_SSL']
s3store.AWS_VERIFY = settings['AWS_VERIFY']
s3store.POLICY = settings['IMAGES_STORE_S3_ACL']
s3store = cls.STORE_SCHEMES["s3"]
s3store.AWS_ACCESS_KEY_ID = settings["AWS_ACCESS_KEY_ID"]
s3store.AWS_SECRET_ACCESS_KEY = settings["AWS_SECRET_ACCESS_KEY"]
s3store.AWS_SESSION_TOKEN = settings["AWS_SESSION_TOKEN"]
s3store.AWS_ENDPOINT_URL = settings["AWS_ENDPOINT_URL"]
s3store.AWS_REGION_NAME = settings["AWS_REGION_NAME"]
s3store.AWS_USE_SSL = settings["AWS_USE_SSL"]
s3store.AWS_VERIFY = settings["AWS_VERIFY"]
s3store.POLICY = settings["IMAGES_STORE_S3_ACL"]
gcs_store = cls.STORE_SCHEMES['gs']
gcs_store.GCS_PROJECT_ID = settings['GCS_PROJECT_ID']
gcs_store.POLICY = settings['IMAGES_STORE_GCS_ACL'] or None
gcs_store = cls.STORE_SCHEMES["gs"]
gcs_store.GCS_PROJECT_ID = settings["GCS_PROJECT_ID"]
gcs_store.POLICY = settings["IMAGES_STORE_GCS_ACL"] or None
ftp_store = cls.STORE_SCHEMES['ftp']
ftp_store.FTP_USERNAME = settings['FTP_USER']
ftp_store.FTP_PASSWORD = settings['FTP_PASSWORD']
ftp_store.USE_ACTIVE_MODE = settings.getbool('FEED_STORAGE_FTP_ACTIVE')
ftp_store = cls.STORE_SCHEMES["ftp"]
ftp_store.FTP_USERNAME = settings["FTP_USER"]
ftp_store.FTP_PASSWORD = settings["FTP_PASSWORD"]
ftp_store.USE_ACTIVE_MODE = settings.getbool("FEED_STORAGE_FTP_ACTIVE")
store_uri = settings['IMAGES_STORE']
store_uri = settings["IMAGES_STORE"]
return cls(store_uri, settings=settings)
def file_downloaded(self, response, request, info, *, item=None):
@ -129,9 +123,12 @@ class ImagesPipeline(FilesPipeline):
checksum = md5sum(buf)
width, height = image.size
self.store.persist_file(
path, buf, info,
meta={'width': width, 'height': height},
headers={'Content-Type': 'image/jpeg'})
path,
buf,
info,
meta={"width": width, "height": height},
headers={"Content-Type": "image/jpeg"},
)
return checksum
def get_images(self, response, request, info, *, item=None):
@ -140,25 +137,35 @@ class ImagesPipeline(FilesPipeline):
width, height = orig_image.size
if width < self.min_width or height < self.min_height:
raise ImageException("Image too small "
f"({width}x{height} < "
f"{self.min_width}x{self.min_height})")
raise ImageException(
"Image too small "
f"({width}x{height} < "
f"{self.min_width}x{self.min_height})"
)
if self._deprecated_convert_image is None:
self._deprecated_convert_image = 'response_body' not in get_func_args(self.convert_image)
self._deprecated_convert_image = "response_body" not in get_func_args(
self.convert_image
)
if self._deprecated_convert_image:
warnings.warn(f'{self.__class__.__name__}.convert_image() method overriden in a deprecated way, '
'overriden method does not accept response_body argument.',
category=ScrapyDeprecationWarning)
warnings.warn(
f"{self.__class__.__name__}.convert_image() method overriden in a deprecated way, "
"overriden method does not accept response_body argument.",
category=ScrapyDeprecationWarning,
)
if self._deprecated_convert_image:
image, buf = self.convert_image(orig_image)
else:
image, buf = self.convert_image(orig_image, response_body=BytesIO(response.body))
image, buf = self.convert_image(
orig_image, response_body=BytesIO(response.body)
)
yield path, image, buf
for thumb_id, size in self.thumbs.items():
thumb_path = self.thumb_path(request, thumb_id, response=response, info=info, item=item)
thumb_path = self.thumb_path(
request, thumb_id, response=response, info=info, item=item
)
if self._deprecated_convert_image:
thumb_image, thumb_buf = self.convert_image(image, size)
else:
@ -167,21 +174,24 @@ class ImagesPipeline(FilesPipeline):
def convert_image(self, image, size=None, response_body=None):
if response_body is None:
warnings.warn(f'{self.__class__.__name__}.convert_image() method called in a deprecated way, '
'method called without response_body argument.',
category=ScrapyDeprecationWarning, stacklevel=2)
warnings.warn(
f"{self.__class__.__name__}.convert_image() method called in a deprecated way, "
"method called without response_body argument.",
category=ScrapyDeprecationWarning,
stacklevel=2,
)
if image.format == 'PNG' and image.mode == 'RGBA':
background = self._Image.new('RGBA', image.size, (255, 255, 255))
if image.format == "PNG" and image.mode == "RGBA":
background = self._Image.new("RGBA", image.size, (255, 255, 255))
background.paste(image, image)
image = background.convert('RGB')
elif image.mode == 'P':
image = background.convert("RGB")
elif image.mode == "P":
image = image.convert("RGBA")
background = self._Image.new('RGBA', image.size, (255, 255, 255))
background = self._Image.new("RGBA", image.size, (255, 255, 255))
background.paste(image, image)
image = background.convert('RGB')
elif image.mode != 'RGB':
image = image.convert('RGB')
image = background.convert("RGB")
elif image.mode != "RGB":
image = image.convert("RGB")
if size:
image = image.copy()
@ -193,11 +203,11 @@ class ImagesPipeline(FilesPipeline):
except AttributeError:
resampling_filter = self._Image.ANTIALIAS
image.thumbnail(size, resampling_filter)
elif response_body is not None and image.format == 'JPEG':
elif response_body is not None and image.format == "JPEG":
return image, response_body
buf = BytesIO()
image.save(buf, 'JPEG')
image.save(buf, "JPEG")
return image, buf
def get_media_requests(self, item, info):
@ -211,8 +221,8 @@ class ImagesPipeline(FilesPipeline):
def file_path(self, request, response=None, info=None, *, item=None):
image_guid = hashlib.sha1(to_bytes(request.url)).hexdigest()
return f'full/{image_guid}.jpg'
return f"full/{image_guid}.jpg"
def thumb_path(self, request, thumb_id, response=None, info=None, *, item=None):
thumb_guid = hashlib.sha1(to_bytes(request.url)).hexdigest()
return f'thumbs/{thumb_id}/{thumb_guid}.jpg'
return f"thumbs/{thumb_id}/{thumb_guid}.jpg"
